diff --git a/.gitignore b/.gitignore
index 29a8957..35fbccf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,8 +8,10 @@ pixi.lock
 magic.lock
 # mojopkg files
 *.mojopkg
-# VSCode environments
+# Editor environments
 .vscode
+.helix
+# Temporary files
 tempCodeRunnerFile.mojo
 /temp*.mojo
 # macOS environments
diff --git a/src/decimojo/biguint/arithmetics.mojo b/src/decimojo/biguint/arithmetics.mojo
index 190a450..8e03af5 100644
--- a/src/decimojo/biguint/arithmetics.mojo
+++ b/src/decimojo/biguint/arithmetics.mojo
@@ -28,6 +28,11 @@ from decimojo.biguint.biguint import BigUInt
 import decimojo.biguint.comparison
 from decimojo.rounding_mode import RoundingMode
 
+alias CUTOFF_KARATSUBA: Int = 64
+"""The cutoff number of words for using Karatsuba multiplication."""
+alias CUTOFF_BURNIKEL_ZIEGLER = 32
+"""The cutoff number of words for using Burnikel-Ziegler division."""
+
 # ===----------------------------------------------------------------------=== #
 # List of functions in this module:
 #
@@ -778,10 +783,6 @@ fn multiply(x: BigUInt, y: BigUInt) -> BigUInt:
         to the cutoff number, the school multiplication algorithm is used.
     """
 
-    # TODO: Make this a global constant
-    alias CUTOFF_KARATSUBA: Int = 64
-    """The cutoff number of words for using Karatsuba multiplication."""
-
     debug_assert[assert_mode="none"](
         len(x.words) != 0, "BigUInt is uninitialized!"
     )
@@ -858,11 +859,6 @@ fn multiply_slices(
         to use. If the number of words in either operand is less than or equal
         to the cutoff number, the school multiplication algorithm is used.
     """
-
-    # TODO: Make this a global constant
-    alias CUTOFF_KARATSUBA: Int = 64
-    """The cutoff number of words for using Karatsuba multiplication."""
-
     n_words_x_slice = bounds_x[1] - bounds_x[0]
     n_words_y_slice = bounds_y[1] - bounds_y[0]
 
@@ -925,11 +921,7 @@ fn multiply_slices_school(
 
     # The max number of words in the result is the sum of the words in the operands
     var max_result_len = n_words_x_slice + n_words_y_slice
-    var words = List[UInt32](capacity=max_result_len)
-
-    # Initialize result words with zeros
-    for _ in range(max_result_len):
-        words.append(0)
+    var words = List[UInt32](length=max_result_len, fill=0)
 
     # Perform the multiplication word by word (from least significant to most significant)
     # x = x[start_x] + x[start_x + 1] * 10^9
@@ -1174,7 +1166,6 @@ fn multiply_inplace_by_uint32(mut x: BigUInt, y: UInt32):
         x.words.append(UInt32(carry))
 
 
-@always_inline
 fn multiply_inplace_by_uint32_le_4(mut x: BigUInt, y: UInt32):
     """Multiplies in-place a BigUInt by a UInt32 value which is between 0 and 4.
 
@@ -1462,15 +1453,15 @@ fn multiply_inplace_by_power_of_billion(mut x: BigUInt, n: Int):
 # ===----------------------------------------------------------------------=== #
 
 
-fn floor_divide(x1: BigUInt, x2: BigUInt) raises -> BigUInt:
+fn floor_divide(x: BigUInt, y: BigUInt) raises -> BigUInt:
     """Returns the quotient of two BigUInt numbers, truncating toward zero.
 
     Args:
-        x1: The dividend.
-        x2: The divisor.
+        x: The dividend.
+        y: The divisor.
 
     Returns:
-        The quotient of x1 / x2, truncated toward zero.
+        The quotient of x / y, truncated toward zero.
 
     Raises:
         ValueError: If the divisor is zero.
@@ -1479,160 +1470,164 @@ fn floor_divide(x1: BigUInt, x2: BigUInt) raises -> BigUInt:
         It is equal to truncated division for positive numbers.
     """
 
-    alias CUTOFF_BURNIKEL_ZIEGLER = 64
-
-    debug_assert[assert_mode="none"](
-        len(x1.words) != 0, "BigUInt ", x1, " is uninitialized!"
-    )
     debug_assert[assert_mode="none"](
-        len(x2.words) != 0, "BigUInt ", x2, " is uninitialized!"
+        (len(x.words) != 0) and (len(y.words) != 0),
+        "biguint.arithmetics.floor_divide(): BigUInt ",
+        x,
+        " and / or ",
+        y,
+        " is uninitialized!",
     )
 
-    # CASE: x2 is single word
-    if len(x2.words) == 1:
-        # SUB-CASE: Division by zero
-        if x2.words[0] == 0:
-            raise Error("biguint.arithmetics.floor_divide(): Division by zero")
-
-        # SUB-CASE: Division by one
-        if x2.words[0] == 1:
-            return x1
-
-        # SUB-CASE: Division by two
-        if x2.words[0] == 2:
-            var result = x1
-            floor_divide_inplace_by_2(result)
-            return result^
-
-        # SUB-CASE: Single word // single word
-        if len(x1.words) == 1:
-            var result = BigUInt(List[UInt32](x1.words[0] // x2.words[0]))
-            return result^
-
-        # SUB-CASE: Divisor is single word and is power of 2
-        if (x2.words[0] & (x2.words[0] - 1)) == 0:
-            var result = x1
-            var remainder = x2.words[0]
-            while remainder > 1:
-                floor_divide_inplace_by_2(result)
-                remainder >>= 1
-            return result^
-
-        # SUB-CASE: Divisor is single word (<= 9 digits)
-        else:
-            var result = x1
-            floor_divide_inplace_by_single_word(result, x2)
-            return result^
-
-    # CASE: Divisor is double-word (<= 20 digits)
-    if len(x2.words) == 2:
-        var result = x1
-        floor_divide_inplace_by_double_words(result, x2)
-        return result^
+    # CASE: y is zero
+    if y.is_zero():
+        raise Error("biguint.arithmetics.floor_divide(): Division by zero")
 
     # CASE: Dividend is zero
-    if x1.is_zero():
+    if x.is_zero():
         debug_assert[assert_mode="none"](
-            len(x1.words) == 1, "floor_divide(): x1 has leading zero words"
+            len(x.words) == 1,
+            "biguint.arithmetics.floor_divide(): x has leading zero words",
         )
         return BigUInt()  # Return zero
 
-    var comparison_result: Int8 = x1.compare(x2)
-    # CASE: dividend < divisor
+    # CASE: x is not greater than y
+    var comparison_result: Int8 = x.compare(y)
+    # SUB-CASE: dividend < divisor
     if comparison_result < 0:
         return BigUInt()  # Return zero
-    # CASE: dividend == divisor
+    # SUB-CASE: dividend == divisor
     if comparison_result == 0:
         return BigUInt(UInt32(1))
 
-    # CASE: Divisor is 10^n
-    # First remove the last words (10^9) and then shift the rest
-    if x2.is_power_of_10():
-        var result: BigUInt
-        if len(x2.words) == 1:
-            result = x1
+    # CASE: y is single word
+    if len(y.words) == 1:
+        # SUB-CASE: Division by one
+        if y.words[0] == 1:
+            return x
+        # SUB-CASE: Single word // single word
+        if len(x.words) == 1:
+            var result = BigUInt(List[UInt32](x.words[0] // y.words[0]))
+            return result^
+        # SUB-CASE: Divisor is single word (<= 9 digits)
         else:
-            var word_shift = len(x2.words) - 1
-            # If we need to drop more words than exists, result is zero
-            if word_shift >= len(x1.words):
-                return BigUInt()
-            # Create result with the remaining words
-            words = List[UInt32]()
-            for i in range(word_shift, len(x1.words)):
-                words.append(x1.words[i])
-            result = BigUInt(words=words^)
-
-        # Get the last word of the divisor
-        var x2_word = x2.words[len(x2.words) - 1]
-        var carry = UInt32(0)
-        var power_of_carry = BigUInt.BASE // x2_word
-        for i in range(len(result.words) - 1, -1, -1):
-            var quot = result.words[i] // x2_word
-            var rem = result.words[i] % x2_word
-            result.words[i] = quot + carry * power_of_carry
-            carry = rem
-
-        result.remove_leading_empty_words()
+            return floor_divide_by_uint32(x, y.words[0])
+
+    # CASE: y is double words
+    if len(y.words) == 2:
+        # Use `floor_divide_by_uint64` as it is more efficient
+        return floor_divide_by_uint64(x, y.to_uint64_with_first_2_words())
+
+    # CASE: y is triple or quadraple words
+    if len(y.words) <= 4:
+        # Use `floor_divide_by_uint128` as it is more efficient
+        return floor_divide_by_uint128(x, y.to_uint128_with_first_4_words())
+
+    # CASE: Divisor is 10^n
+    if y.is_power_of_10():
+        var result = floor_divide_by_power_of_ten(
+            x, y.number_of_trailing_zeros()
+        )
         return result^
 
     # CASE: Division of small numbers
-    # If the number of words in a or b is small enough,
+    # If the number of words in the dividend and the divisor is small enough,
     # we can use the schoolbook division algorithm.
-    if (len(x1.words) <= CUTOFF_BURNIKEL_ZIEGLER) and (
-        len(x2.words) <= CUTOFF_BURNIKEL_ZIEGLER
+    # 2n-by-n where n is the cutoff number of words for Burnikel-Ziegler
+    if (len(x.words) <= CUTOFF_BURNIKEL_ZIEGLER * 2) and (
+        len(y.words) <= CUTOFF_BURNIKEL_ZIEGLER
     ):
         # I will normalize the divisor to improve quotient estimation
-        var ndigits_to_shift: Int  # Number of digits to shift
         # Calculate normalization factor to make leading digit of divisor
         # as large as possible
-        ndigits_to_shift = calculate_ndigits_for_normalization(x2.words[-1])
+        var ndigits_to_shift = calculate_ndigits_for_normalization(y.words[-1])
 
         if ndigits_to_shift == 0:
             # No normalization needed, just use the general division algorithm
-            return floor_divide_school(x1, x2)
+            return floor_divide_school(x, y)
         else:
             # Normalize the divisor and dividend
-            var normalized_x1 = multiply_by_power_of_ten(x1, ndigits_to_shift)
-            var normalized_x2 = multiply_by_power_of_ten(x2, ndigits_to_shift)
-            return floor_divide_school(normalized_x1, normalized_x2)
+            var normalized_x = multiply_by_power_of_ten(x, ndigits_to_shift)
+            var normalized_y = multiply_by_power_of_ten(y, ndigits_to_shift)
+            return floor_divide_school(normalized_x, normalized_y)
 
     # CASE: division of very, very large numbers
     # Use the Burnikel-Ziegler division algorithm
-    # print("Using Burnikel-Ziegler division algorithm for large numbers")
-    return floor_divide_burnikel_ziegler(
-        x1, x2, cut_off=CUTOFF_BURNIKEL_ZIEGLER
-    )
+    return floor_divide_burnikel_ziegler(x, y, cut_off=CUTOFF_BURNIKEL_ZIEGLER)
 
 
-fn floor_divide_school(dividend: BigUInt, divisor: BigUInt) raises -> BigUInt:
-    """General schoolbook division algorithm for BigInt numbers.
+# TODO: Implement a `floor_divide_slices_school()` function that
+# can be used for slices of BigUInt numbers.
+fn floor_divide_school(x: BigUInt, y: BigUInt) raises -> BigUInt:
+    """**[PRIVATE]** General schoolbook division algorithm for BigInt numbers.
 
     Args:
-        dividend: The dividend.
-        divisor: The divisor.
+        x: The dividend.
+        y: The divisor.
 
     Returns:
-        The quotient of dividend // divisor.
+        The quotient of x // y.
 
     Raises:
-        Error: If the divisor is zero.
+        Error: If the y is zero.
     """
 
-    if divisor.is_zero():
+    # Because the Burnikel-Ziegler division algorithm will fall back to this
+    # function for small numbers, we need to ensure that special cases are
+    # handled properly to improve performance.
+    # CASE: y is zero
+    if y.is_zero():
+        raise Error("biguint.arithmetics.floor_divide(): Division by zero")
+
+    # CASE: Dividend is zero
+    if x.is_zero():
         debug_assert[assert_mode="none"](
-            len(divisor.words) == 1,
-            "floor_divide_school(): leading zero words",
-        )
-        raise Error(
-            "`biguint.arithmetics.floor_divide_school()`: Division by zero"
+            len(x.words) == 1,
+            "biguint.arithmetics.floor_divide(): x has leading zero words",
         )
+        return BigUInt()  # Return zero
+
+    # CASE: x is not greater than y
+    var comparison_result: Int8 = x.compare(y)
+    # SUB-CASE: dividend < divisor
+    if comparison_result < 0:
+        return BigUInt()  # Return zero
+    # SUB-CASE: dividend == divisor
+    if comparison_result == 0:
+        return BigUInt(UInt32(1))
+
+    # CASE: y is single word
+    if len(y.words) == 1:
+        # SUB-CASE: Division by one
+        if y.words[0] == 1:
+            return x
+        # SUB-CASE: Single word // single word
+        if len(x.words) == 1:
+            var result = BigUInt(List[UInt32](x.words[0] // y.words[0]))
+            return result^
+        # SUB-CASE: Divisor is single word (<= 9 digits)
+        else:
+            return floor_divide_by_uint32(x, y.words[0])
+
+    # CASE: y is double words
+    if len(y.words) == 2:
+        # Use `floor_divide_by_uint64` as it is more efficient
+        return floor_divide_by_uint64(x, y.to_uint64_with_first_2_words())
 
+    # CASE: y is triple or quadraple words
+    if len(y.words) <= 4:
+        # Use `floor_divide_by_uint128` as it is more efficient
+        return floor_divide_by_uint128(x, y.to_uint128_with_first_4_words())
+
+    # ===----------------------------------------------=== #
+    # ALL OTHER CASES
+    # Use the schoolbook division algorithm
     # Initialize result and remainder
-    var result = BigUInt(List[UInt32](capacity=len(dividend.words)))
-    var remainder = dividend
+    var result = BigUInt(List[UInt32](capacity=len(x.words)))
+    var remainder = x
 
     # Shift and initialize
-    var n_words_diff = len(remainder.words) - len(divisor.words)
+    var n_words_diff = len(remainder.words) - len(y.words)
     # The quotient will have at most n_words_diff + 1 words
     for _ in range(n_words_diff + 1):
         result.words.append(0)
@@ -1640,15 +1635,14 @@ fn floor_divide_school(dividend: BigUInt, divisor: BigUInt) raises -> BigUInt:
     # Main division loop
     var index_of_word = n_words_diff  # Start from the most significant word
     var trial_product: BigUInt
+    var quotient: UInt32
     while index_of_word >= 0:
         # OPTIMIZATION: Better quotient estimation
-        var quotient = floor_divide_estimate_quotient(
-            remainder, divisor, index_of_word
-        )
+        quotient = floor_divide_estimate_quotient(remainder, y, index_of_word)
 
         # Calculate trial product
-        trial_product = divisor
-        multiply_inplace_by_uint32(trial_product, UInt32(quotient))
+        trial_product = y
+        multiply_inplace_by_uint32(trial_product, quotient)
         multiply_inplace_by_power_of_billion(trial_product, index_of_word)
 
         # By construction, no correction is needed
@@ -1657,17 +1651,13 @@ fn floor_divide_school(dividend: BigUInt, divisor: BigUInt) raises -> BigUInt:
         while (trial_product.compare(remainder) > 0) and (quotient > 0):
             quotient -= 1
             correction_attempts += 1
-
-            trial_product = divisor
-            multiply_inplace_by_uint32(trial_product, UInt32(quotient))
-            multiply_inplace_by_power_of_billion(trial_product, index_of_word)
-
+            trial_product -= multiply_by_power_of_billion(y, index_of_word)
             debug_assert[assert_mode="none"](
-                correction_attempts <= 3, "Too many correction attempts"
+                correction_attempts <= 2, "Too many correction attempts"
             )
 
         # Store the quotient word
-        result.words[index_of_word] = UInt32(quotient)
+        result.words[index_of_word] = quotient
         # By construction, trial_product <= remainder
         subtract_inplace_no_check(remainder, trial_product)
         index_of_word -= 1
@@ -1678,7 +1668,7 @@ fn floor_divide_school(dividend: BigUInt, divisor: BigUInt) raises -> BigUInt:
 
 fn floor_divide_estimate_quotient(
     dividend: BigUInt, divisor: BigUInt, index_of_word: Int
-) -> UInt64:
+) -> UInt32:
     """Estimates the quotient digit using 3-by-2 division.
 
     This function implements a 3-by-2 quotient estimation algorithm,
@@ -1687,7 +1677,7 @@ fn floor_divide_estimate_quotient(
 
     Args:
         dividend: The dividend BigUInt number.
-        divisor: The divisor BigUInt number.
+        divisor: The divisor BigUInt number. Should be at least 2 words.
         index_of_word: The current position in the division algorithm.
 
     Returns:
@@ -1702,134 +1692,274 @@ fn floor_divide_estimate_quotient(
     """
 
     # Extract three highest words of relevant dividend portion
-    var r2 = UInt64(0)
-    if index_of_word + len(divisor.words) < len(dividend.words):
-        r2 = UInt64(dividend.words[index_of_word + len(divisor.words)])
-
-    var r1 = UInt64(0)
-    if index_of_word + len(divisor.words) - 1 < len(dividend.words):
-        r1 = UInt64(dividend.words[index_of_word + len(divisor.words) - 1])
+    # numerator = r2r1r0
+    # Extract 3-word dividend portion using SIMD for better performance
+    var numerator: UInt128
+    var base_index = index_of_word + len(divisor.words) - 2
+
+    # Ensure we don't read beyond bounds
+    if base_index + 2 < len(dividend.words):
+        # We can safely load 3 words: r0, r1, r2
+        numerator = (
+            dividend.words.data.load[width=4](base_index).cast[DType.uint128]()
+            * SIMD[DType.uint128, 4](
+                1, 1_000_000_000, 1_000_000_000_000_000_000, 0
+            )
+        ).reduce_add()
+    elif base_index + 1 < len(dividend.words):
+        # We can safely load 2 words: r0, r1 (r2 = 0)
+        numerator = (
+            dividend.words.data.load[width=2](base_index).cast[DType.uint128]()
+            * SIMD[DType.uint128, 2](1, 1_000_000_000)
+        ).reduce_add()
+    elif base_index < len(dividend.words):
+        # We can only load 1 word: r0 (r1 = r2 = 0)
+        numerator = UInt128(dividend.words[base_index])
+    else:
+        # All words are zero
+        numerator = UInt128(0)
 
-    var r0 = UInt64(0)
-    if index_of_word + len(divisor.words) - 2 < len(dividend.words):
-        r0 = UInt64(dividend.words[index_of_word + len(divisor.words) - 2])
+    # Extract two highest words of divisor using SIMD
+    var denominator: UInt128
+    debug_assert[assert_mode="none"](
+        len(divisor.words) >= 2,
+        "biguint.arithmetics.floor_divide_estimate_quotient(): ",
+        "Divisor must have at least 2 words by design.",
+    )
+    denominator = (
+        divisor.words.data.load[width=2](len(divisor.words) - 2).cast[
+            DType.uint128
+        ]()
+        * SIMD[DType.uint128, 2](1, 1_000_000_000)
+    ).reduce_add()
 
-    # Extract two highest words of divisor
-    var d1 = UInt64(divisor.words[len(divisor.words) - 1])
-    var d0 = UInt64(0)
-    if len(divisor.words) >= 2:
-        d0 = UInt64(divisor.words[len(divisor.words) - 2])
+    # Use the SIMD-computed full dividend
+    var quotient_128 = numerator // denominator
 
-    # Special case: if divisor is single word, fall back to 2-by-1 division
-    if len(divisor.words) == 1:
-        if r2 == d1:
-            return BigUInt.BASE_MAX
-        return min(
-            (r2 * UInt64(BigUInt.BASE) + r1) // d1, UInt64(BigUInt.BASE_MAX)
-        )
+    # Convert back to UInt32
+    var quotient = UInt32(quotient_128)
 
-    # Special case: if high word of dividend equals high word of divisor
-    # The quotient is likely to be large, so we use a conservative estimate
-    if r2 == d1:
-        return UInt64(BigUInt.BASE_MAX)
+    # Ensure we don't exceed the maximum value for a single word
+    return min(quotient, BigUInt.BASE_MAX)
 
-    # 3-by-2 division using 128-bit arithmetic
-    # We need to compute: (r2 * 10^18 + r1 * 10^9 + r0) // (d1 * 10^9 + d0)
 
-    # Convert to 128-bit for high precision calculation
-    var dividend_high = UInt128(r2) * UInt128(BigUInt.BASE) + UInt128(r1)
-    var dividend_low = UInt128(r0)
-    var divisor_128 = UInt128(d1) * UInt128(BigUInt.BASE) + UInt128(d0)
+fn floor_divide_by_uint32(x: BigUInt, y: UInt32) -> BigUInt:
+    """**[PRIVATE]** Divides a BigUInt by a UInt32 divisor.
 
-    # Handle the case where we need to consider the full 3-word dividend
-    # We compute: (dividend_high * 10^9 + dividend_low) // divisor_128
-    var full_dividend = dividend_high * UInt128(BigUInt.BASE) + dividend_low
+    Args:
+        x: The BigUInt value to divide by the divisor.
+        y: The UInt32 divisor. Must be non-zero.
 
-    # Perform the division
-    var quotient_128 = full_dividend // divisor_128
+    Notes:
 
-    # Convert back to UInt64
-    var quotient = UInt64(quotient_128)
+    This function is used internally for division by single word divisors.
+    It is not intended for public use. You need to ensure that y is non-zero.
+    """
+    debug_assert[assert_mode="none"](
+        y != 0, "biguint.arithmetics.floor_divide_by_uint32(): Division by zero"
+    )
 
-    # Ensure we don't exceed the maximum value for a single word
-    return min(quotient, UInt64(BigUInt.BASE_MAX))
+    # Most significant word of the dividend
+    var dividend = UInt64(x.words[-1] // y)
+    var carry = UInt64(x.words[-1] % y)
+    var y_uint64 = UInt64(y)
+    var result: BigUInt
+    if dividend == 0:
+        result = BigUInt(unsafe_uninit_length=len(x.words) - 1)
+    else:
+        result = BigUInt(unsafe_uninit_length=len(x.words))
+        result.words[-1] = UInt32(dividend)
+
+    # Process the rest of the words
+    for i in range(len(x.words) - 2, -1, -1):
+        dividend = carry * UInt64(BigUInt.BASE) + UInt64(x.words[i])
+        result.words[i] = UInt32(dividend // y_uint64)
+        carry = dividend % y_uint64
+    return result^
 
 
-fn floor_divide_inplace_by_single_word(
-    mut x1: BigUInt, x2: BigUInt
-) raises -> None:
-    """Divides a BigUInt by a single word divisor in-place.
+fn floor_divide_inplace_by_uint32(mut x: BigUInt, y: UInt32) -> None:
+    """Divides a BigUInt by a UInt32 divisor in-place.
 
     Args:
-        x1: The BigUInt value to divide by the divisor.
-        x2: The single word divisor.
+        x: The BigUInt value to divide by the divisor.
+        y: The UInt32 divisor. Must be non-zero.
+
+    Notes:
+
+    This function is used internally for division by single word divisors.
+    It is not intended for public use. You need to ensure that y is non-zero.
     """
-    if x2.is_zero():
-        debug_assert[assert_mode="none"](
-            len(x2.words) == 1,
-            "floor_divide_inplace_by_single_word(): leading zero words",
-        )
-        raise Error(
-            "Error in `floor_divide_inplace_by_single_word`: Division by zero"
-        )
+    debug_assert[assert_mode="none"](
+        y != 0, "biguint.arithmetics.floor_divide_by_uint32(): Division by zero"
+    )
 
-    # CASE: all other situations
-    var x2_value = UInt64(x2.words[0])
-    var carry = UInt64(0)
-    for i in range(len(x1.words) - 1, -1, -1):
-        var dividend = carry * UInt64(BigUInt.BASE) + UInt64(x1.words[i])
-        x1.words[i] = UInt32(dividend // x2_value)
-        carry = dividend % x2_value
-    x1.remove_leading_empty_words()
+    # Most significant word of the dividend
+    var dividend = UInt64(x.words[-1] // y)
+    var carry = UInt64(x.words[-1] % y)
+    var y_uint64 = UInt64(y)
+    if dividend == 0:
+        x.words.shrink(len(x.words) - 1)
+    else:
+        x.words[-1] = UInt32(dividend)
 
+    # Process the rest of the words
+    for i in range(len(x.words) - 2, -1, -1):
+        dividend = carry * UInt64(BigUInt.BASE) + UInt64(x.words[i])
+        x.words[i] = UInt32(dividend // y_uint64)
+        carry = dividend % y_uint64
 
-fn floor_divide_inplace_by_double_words(
-    mut x1: BigUInt, x2: BigUInt
-) raises -> None:
-    """Divides a BigUInt by double-word divisor in-place.
 
-    Args:
-        x1: The BigUInt value to divide by the divisor.
-        x2: The double-word divisor.
+fn floor_divide_by_uint64(x: BigUInt, y: UInt64) -> BigUInt:
+    """Divides a BigUInt by UInt64.
 
-    Raises:
-        Error: If the divisor is zero.
+    Args:
+        x: The BigUInt value to divide by the divisor.
+        y: The UInt64 divisor. Must be smaller than 10^18.
     """
-    if x2.is_zero():
-        debug_assert[assert_mode="none"](
-            len(x2.words) == 1,
-            "floor_divide_inplace_by_double_words(): leading zero words",
-        )
-        raise Error(
-            "biguint.arithmetics.floor_divide_inplace_by_double_words():"
-            " Division by zero"
+    debug_assert[assert_mode="none"](
+        y != 0,
+        "biguint.arithmetics.floor_divide_inplace_by_uint64(): ",
+        "Division by zero.",
+    )
+
+    var carry = UInt128(0)
+    var y_uint128 = UInt128(y)
+    var result: BigUInt
+    if len(x.words) % 2 == 1:
+        carry = UInt128(x.words[-1])
+        result = BigUInt(unsafe_uninit_length=len(x.words) - 1)
+    else:
+        result = BigUInt(unsafe_uninit_length=len(x.words))
+
+    for i in range(len(result.words) - 1, -1, -2):
+        var dividend = (
+            carry * UInt128(1_000_000_000_000_000_000)
+            + (
+                x.words.data.load[width=2](i - 1).cast[DType.uint128]()
+                * SIMD[DType.uint128, 2](1, 1_000_000_000)
+            ).reduce_add()
         )
+        var quotient = dividend // y_uint128
+        result.words[i] = UInt32(quotient // UInt128(BigUInt.BASE))
+        result.words[i - 1] = UInt32(quotient % UInt128(BigUInt.BASE))
+        carry = dividend % y_uint128
+
+    result.remove_leading_empty_words()
+    return result^
+
 
-    # CASE: all other situations
-    var x2_value = UInt128(x2.words[1]) * UInt128(BigUInt.BASE) + UInt128(
-        x2.words[0]
+fn floor_divide_inplace_by_uint64(mut x: BigUInt, y: UInt64) -> None:
+    """Divides a BigUInt by UInt64 in-place.
+
+    Args:
+        x: The BigUInt value to divide by the divisor.
+        y: The UInt64 divisor. Must be smaller than 10^18.
+    """
+    debug_assert[assert_mode="none"](
+        y != 0,
+        "biguint.arithmetics.floor_divide_inplace_by_uint64(): ",
+        "Division by zero.",
     )
 
     var carry = UInt128(0)
-    if len(x1.words) % 2 == 1:
-        carry = UInt128(x1.words[-1])
-        x1.words.resize(len(x1.words) - 1, UInt32(0))
+    var y_uint128 = UInt128(y)
+    if len(x.words) % 2 == 1:
+        carry = UInt128(x.words[-1])
+        x.words.resize(len(x.words) - 1, UInt32(0))
 
-    for i in range(len(x1.words) - 1, -1, -2):
+    for i in range(len(x.words) - 1, -1, -2):
         var dividend = (
             carry * UInt128(1_000_000_000_000_000_000)
-            + UInt128(x1.words[i]) * UInt128(BigUInt.BASE)
-            + UInt128(x1.words[i - 1])
+            + (
+                x.words.data.load[width=2](i - 1).cast[DType.uint128]()
+                * SIMD[DType.uint128, 2](1, 1_000_000_000)
+            ).reduce_add()
         )
-        var quotient = dividend // x2_value
-        x1.words[i] = UInt32(quotient // UInt128(BigUInt.BASE))
-        x1.words[i - 1] = UInt32(quotient % UInt128(BigUInt.BASE))
-        carry = dividend % x2_value
+        var quotient = dividend // y_uint128
+        x.words[i] = UInt32(quotient // UInt128(BigUInt.BASE))
+        x.words[i - 1] = UInt32(quotient % UInt128(BigUInt.BASE))
+        carry = dividend % y_uint128
 
-    x1.remove_leading_empty_words()
+    x.remove_leading_empty_words()
     return
 
 
+fn floor_divide_by_uint128(x: BigUInt, y: UInt128) -> BigUInt:
+    """Divides a BigUInt by UInt128.
+
+    Args:
+        x: The BigUInt value to divide by the divisor.
+        y: The UInt128 divisor. Must be smaller than 10^36.
+    """
+    debug_assert[assert_mode="none"](
+        y != 0,
+        "biguint.arithmetics.floor_divide_inplace_by_uint128(): ",
+        "Division by zero.",
+    )
+
+    var carry = UInt256(0)
+    var y_uint255 = UInt256(y)
+    var result: BigUInt
+    if len(x.words) % 4 == 1:
+        carry = UInt256(x.words[-1])
+        result = BigUInt(unsafe_uninit_length=len(x.words) - 1)
+    elif len(x.words) % 4 == 2:
+        carry = UInt256(
+            (
+                x.words.data.load[width=2](len(x.words) - 2).cast[
+                    DType.uint64
+                ]()
+                * SIMD[DType.uint64, 2](1, 1_000_000_000)
+            ).reduce_add()
+        )
+        result = BigUInt(unsafe_uninit_length=len(x.words) - 2)
+    elif len(x.words) % 4 == 3:
+        carry = UInt256(
+            (
+                x.words.data.load[width=4](len(x.words) - 3).cast[
+                    DType.uint128
+                ]()
+                * SIMD[DType.uint128, 4](
+                    1, 1_000_000_000, 1_000_000_000_000_000_000, 0
+                )
+            ).reduce_add()
+        )
+        result = BigUInt(unsafe_uninit_length=len(x.words) - 3)
+    else:
+        result = BigUInt(unsafe_uninit_length=len(x.words))
+
+    for i in range(len(result.words) - 1, -1, -4):
+        var dividend = (
+            carry * UInt256(1_000_000_000_000_000_000_000_000_000_000_000_000)
+            + (
+                x.words.data.load[width=4](i - 3).cast[DType.uint256]()
+                * SIMD[DType.uint256, 4](
+                    1,
+                    1_000_000_000,
+                    1_000_000_000_000_000_000,
+                    1_000_000_000_000_000_000_000_000_000,
+                )
+            ).reduce_add()
+        )
+        var quotient = dividend // y_uint255
+        result.words[i] = UInt32(
+            quotient // UInt256(1_000_000_000_000_000_000_000_000_000)
+        )
+        quotient %= UInt256(1_000_000_000_000_000_000_000_000_000)
+        result.words[i - 1] = UInt32(
+            quotient // UInt256(1_000_000_000_000_000_000)
+        )
+        quotient %= UInt256(1_000_000_000_000_000_000)
+        result.words[i - 2] = UInt32(quotient // UInt256(1_000_000_000))
+        quotient %= UInt256(1_000_000_000)
+        result.words[i - 3] = UInt32(quotient)
+        carry = dividend % y_uint255
+
+    result.remove_leading_empty_words()
+    return result^
+
+
 fn floor_divide_inplace_by_2(mut x: BigUInt) -> None:
     """Divides a BigUInt by 2 in-place.
 
@@ -1953,10 +2083,8 @@ fn floor_divide_by_power_of_ten(x: BigUInt, n: Int) -> BigUInt:
 # cannot be applied to 10^k-based integers. For example, when normalizing the
 # divisor to let its most significant word be at least BASE//2, we cannot simply
 # shift the bits until the most significant bit is 1.
-# TODO:
-# Some optimization needs to be done in future to
-# (1) accelerate the normalization process
-# (2) avoid unnecessary memory allocations and copies
+# TODO: Some optimization needs to be done in future to
+# - avoid unnecessary memory allocations and copies
 
 
 fn floor_divide_burnikel_ziegler(
@@ -2047,8 +2175,9 @@ fn floor_divide_burnikel_ziegler(
         if normalized_a.words[-1] >= 500_000_000:
             t += 1
 
-    var z = BigUInt.from_slice(normalized_a, ((t - 2) * n, t * n))
-    var q = BigUInt()
+    var z = BigUInt(0)  # Remainder of the division
+    var q = BigUInt(0)
+    var q_i: BigUInt
 
     for i in range(t - 2, -1, -1):
         # The below function is the recursive division algorithm.
@@ -2056,18 +2185,28 @@ fn floor_divide_burnikel_ziegler(
 
         # The below function is the recursive division algorithm but works
         # with slices of the dividend and divisor.
-        var q_i, r = floor_divide_slices_two_by_one(
-            z,
-            normalized_b,
-            bounds_a=(0, len(z.words)),
-            bounds_b=(0, len(normalized_b.words)),
-            n=n,
-            cut_off=cut_off,
-        )
+        # Save the remainder in z as it will be carried over to the next
+        # iteration and we can do some inplace operations.
 
         if i == t - 2:
-            q = q_i
+            # The first iteration, we can use the slize of normalized_a
+            q, z = floor_divide_slices_two_by_one(
+                normalized_a,
+                normalized_b,
+                bounds_a=((t - 2) * n, len(normalized_a.words)),
+                bounds_b=(0, len(normalized_b.words)),
+                n=n,
+                cut_off=cut_off,
+            )
         else:
+            q_i, z = floor_divide_slices_two_by_one(
+                z,
+                normalized_b,
+                bounds_a=(0, len(z.words)),
+                bounds_b=(0, len(normalized_b.words)),
+                n=n,
+                cut_off=cut_off,
+            )
             decimojo.biguint.arithmetics.multiply_inplace_by_power_of_billion(
                 q, n
             )
@@ -2075,16 +2214,15 @@ fn floor_divide_burnikel_ziegler(
 
         if i > 0:
             decimojo.biguint.arithmetics.multiply_inplace_by_power_of_billion(
-                r, n
+                z, n
             )
             # z = r + a[(i - 1) * n : i * n]
-            z = add_slices(
-                r,
+            decimojo.biguint.arithmetics.add_inplace_by_slice(
+                z,
                 normalized_a,
-                bounds_x=(0, len(r.words)),
                 bounds_y=((i - 1) * n, i * n),
             )
-    return q
+    return q^
 
 
 fn floor_divide_two_by_one(
@@ -2108,14 +2246,15 @@ fn floor_divide_two_by_one(
     You need to ensure that n is even to continue with the algorithm.
     Otherwise, it will use the schoolbook division algorithm.
     """
+    debug_assert[assert_mode="none"](
+        b.words[-1] >= 500_000_000, "b[-1] must be at least 500_000_000"
+    )
+
     if (n & 1 == 1) or (n <= cut_off):
         var q = floor_divide_school(a, b)
         var r = a - q * b
         return (q^, r^)
 
-    if b.words[-1] < 500_000_000:
-        raise Error("b[-1] must be at least 500_000_000")
-
     else:
         var a0 = BigUInt(a.words[0 : n // 2])
         var a1 = BigUInt(a.words[n // 2 : n])
@@ -2252,24 +2391,29 @@ fn floor_divide_slices_two_by_one(
     )
 
     if (n & 1 == 1) or (n <= cut_off):
+        debug_assert[assert_mode="none"](
+            (n <= cut_off) or (n & 1 == 0),
+            "floor_divide_slices_two_by_one(): ",
+            "n must be even by design but got ",
+            n,
+        )
+        # If n is odd or less than the cutoff, use the schoolbook division
+        # algorithm.
         var a_slice = BigUInt.from_slice(a, bounds_a)
         var b_slice = BigUInt.from_slice(b, bounds_b)
         var q = floor_divide_school(a_slice, b_slice)
         # r = a_slice - q * b_slice
+        # We use inplace subtraction to avoid copying
         a_slice -= multiply_slices(q, b, (0, len(q.words)), bounds_b)
         return (q^, a_slice^)
 
-    elif bounds_a[0] + n + n // 2 >= bounds_a[1]:
-        # If a3 is empty
-        # We just need to use three-by-two division once: a2a1a0 // b1b0
-        var q, r = floor_divide_slices_three_by_two(
-            a, b, bounds_a, bounds_b, n // 2, cut_off
-        )
-        return (q^, r^)
-
-    elif a.is_zero(bounds=(bounds_a[0] + n + n // 2, bounds_a[1])):
-        # If a3 is zero
+    elif (bounds_a[0] + n + n // 2 >= bounds_a[1]) or a.is_zero(
+        bounds=(bounds_a[0] + n + n // 2, bounds_a[1])
+    ):
+        # If a3 is empty or zero
         # We just need to use three-by-two division once: a2a1a0 // b1b0
+        # Note that the condition must be short-circuited to avoid slicing
+        # an empty BigUInt.
         var q, r = floor_divide_slices_three_by_two(
             a, b, bounds_a, bounds_b, n // 2, cut_off
         )
diff --git a/src/decimojo/biguint/biguint.mojo b/src/decimojo/biguint/biguint.mojo
index d9bb0cf..2d6cdc1 100644
--- a/src/decimojo/biguint/biguint.mojo
+++ b/src/decimojo/biguint/biguint.mojo
@@ -118,9 +118,31 @@ struct BigUInt(Absable, IntableRaising, Stringable, Writable):
         self.words = List[UInt32](UInt32(0))
 
     fn __init__(out self, *, uninitialized_capacity: Int):
-        """Creates an uninitialized BigUInt with a given capacity."""
+        """Creates an uninitialized BigUInt with a given capacity.
+
+        Args:
+            uninitialized_capacity: The capacity of the BigUInt.
+                This is the number of UInt32 words that can be stored in the
+                BigUInt without reallocating memory.
+
+        Notes:
+
+        The length of the BigUInt is zero.
+        """
         self.words = List[UInt32](capacity=uninitialized_capacity)
 
+    fn __init__(out self, *, unsafe_uninit_length: Int):
+        """Creates an uninitialized BigUInt with a given length.
+
+        Args:
+            unsafe_uninit_length: The length of the BigUInt.
+
+        Notes:
+
+        The length of the BigUInt is `unsafe_uninit_length`.
+        """
+        self.words = List[UInt32](unsafe_uninit_length=unsafe_uninit_length)
+
     fn __init__(out self, owned words: List[UInt32]):
         """Initializes a BigUInt from a list of UInt32 words.
         It does not verify whether the words are within the valid range