diff --git a/benches/biguint/bench.mojo b/benches/biguint/bench.mojo
index 5c0f08b..56cbb3b 100644
--- a/benches/biguint/bench.mojo
+++ b/benches/biguint/bench.mojo
@@ -1,4 +1,5 @@
 from bench_biguint_add import main as bench_add
+from bench_biguint_subtraction import main as bench_subtraction
 from bench_biguint_multiply import main as bench_multiply
 from bench_biguint_truncate_divide import main as bench_truncate_divide
 from bench_biguint_from_string import main as bench_from_string
@@ -11,6 +12,7 @@ fn main() raises:
 This is the BigUInt Benchmarks
 =========================================
 add:         Add
+sub:         Subtract
 mul:         Multiply
 div:         Truncate divide (//)
 fromstr:     From string
@@ -22,6 +24,8 @@ q:           Exit
     var command = input("Type name of bench you want to run: ")
     if command == "add":
         bench_add()
+    elif command == "sub":
+        bench_subtraction()
     elif command == "mul":
         bench_multiply()
     elif command == "div":
diff --git a/benches/biguint/bench_biguint_subtraction.mojo b/benches/biguint/bench_biguint_subtraction.mojo
new file mode 100644
index 0000000..7125cdd
--- /dev/null
+++ b/benches/biguint/bench_biguint_subtraction.mojo
@@ -0,0 +1,296 @@
+"""
+Comprehensive benchmarks for BigUInt subtraction.
+Compares performance against Python's built-in int with diverse test cases.
+"""
+
+from decimojo.biguint.biguint import BigUInt
+import decimojo.biguint.arithmetics
+from python import Python, PythonObject
+from time import perf_counter_ns
+import time
+import os
+from collections import List
+
+
+fn open_log_file() raises -> PythonObject:
+    """
+    Creates and opens a log file with a timestamp in the filename.
+
+    Returns:
+        A file object opened for writing.
+    """
+    var python = Python.import_module("builtins")
+    var datetime = Python.import_module("datetime")
+    var pysys = Python.import_module("sys")
+    pysys.set_int_max_str_digits(1000000)
+
+    # Create logs directory if it doesn't exist
+    var log_dir = "./logs"
+    if not os.path.exists(log_dir):
+        os.makedirs(log_dir)
+
+    # Generate a timestamp for the filename
+    var timestamp = String(datetime.datetime.now().isoformat())
+    var log_filename = (
+        log_dir + "/benchmark_biguint_subtraction_" + timestamp + ".log"
+    )
+
+    print("Saving benchmark results to:", log_filename)
+    return python.open(log_filename, "w")
+
+
+fn log_print(msg: String, log_file: PythonObject) raises:
+    """
+    Prints a message to both the console and the log file.
+
+    Args:
+        msg: The message to print.
+        log_file: The file object to write to.
+    """
+    print(msg)
+    log_file.write(msg + "\n")
+    log_file.flush()  # Ensure the message is written immediately
+
+
+fn run_benchmark_subtraction(
+    name: String,
+    value1: String,
+    value2: String,
+    iterations: Int,
+    log_file: PythonObject,
+    mut speedup_factors: List[Float64],
+) raises:
+    """
+    Run a benchmark comparing Mojo BigUInt subtraction with Python int subtraction.
+
+    Args:
+        name: Name of the benchmark case.
+        value1: String representation of first operand.
+        value2: String representation of second operand.
+        iterations: Number of iterations to run.
+        log_file: File object for logging results.
+        speedup_factors: Mojo List to store speedup factors for averaging.
+    """
+    log_print("\nBenchmark:       " + name, log_file)
+    log_print("First operand:   " + value1, log_file)
+    log_print("Second operand:  " + value2, log_file)
+
+    # Set up Mojo and Python values
+    var mojo_value1 = BigUInt(value1)
+    var mojo_value2 = BigUInt(value2)
+    var py = Python.import_module("builtins")
+    var py_value1 = py.int(value1)
+    var py_value2 = py.int(value2)
+
+    # Execute the operations once to verify correctness
+    var mojo_result = mojo_value1 - mojo_value2
+    var py_result = py_value1 - py_value2
+
+    # Display results for verification
+    log_print("Mojo result:     " + String(mojo_result), log_file)
+    log_print("Python result:   " + String(py_result), log_file)
+
+    # Benchmark Mojo implementation
+    var t0 = perf_counter_ns()
+    for _ in range(iterations):
+        _ = mojo_value1 - mojo_value2
+    var mojo_time = (perf_counter_ns() - t0) / iterations
+    if mojo_time == 0:
+        mojo_time = 1  # Prevent division by zero
+
+    # Benchmark Python implementation
+    t0 = perf_counter_ns()
+    for _ in range(iterations):
+        _ = py_value1 - py_value2
+    var python_time = (perf_counter_ns() - t0) / iterations
+
+    # Calculate speedup factor
+    var speedup = python_time / mojo_time
+    speedup_factors.append(Float64(speedup))
+
+    # Print results with speedup comparison
+    log_print(
+        "Mojo subtraction:   " + String(mojo_time) + " ns per iteration",
+        log_file,
+    )
+    log_print(
+        "Python subtraction: " + String(python_time) + " ns per iteration",
+        log_file,
+    )
+    log_print("Speedup factor:  " + String(speedup), log_file)
+
+
+fn main() raises:
+    # Open log file
+    var log_file = open_log_file()
+    var datetime = Python.import_module("datetime")
+
+    # Create a Mojo List to store speedup factors for averaging later
+    var speedup_factors = List[Float64]()
+
+    # Display benchmark header with system information
+    log_print("=== DeciMojo BigUInt Subtraction Benchmark ===", log_file)
+    log_print("Time: " + String(datetime.datetime.now().isoformat()), log_file)
+
+    # Try to get system info
+    try:
+        var platform = Python.import_module("platform")
+        log_print(
+            "System: "
+            + String(platform.system())
+            + " "
+            + String(platform.release()),
+            log_file,
+        )
+        log_print("Processor: " + String(platform.processor()), log_file)
+        log_print(
+            "Python version: " + String(platform.python_version()), log_file
+        )
+    except:
+        log_print("Could not retrieve system information", log_file)
+
+    var iterations = 1000
+
+    # Define benchmark cases
+    log_print(
+        "\nRunning subtraction benchmarks with "
+        + String(iterations)
+        + " iterations each",
+        log_file,
+    )
+
+    # Case 26: Subtraction with 2 words - 1 word
+    run_benchmark_subtraction(
+        "Subtraction with 2 words - 1 word",
+        "123456789" * 2,
+        "987654321",
+        iterations,
+        log_file,
+        speedup_factors,
+    )
+
+    # Case 27: Subtraction with 4 words - 2 words
+    run_benchmark_subtraction(
+        "Subtraction with 4 words - 2 words",
+        "123456789" * 4,
+        "987654321" * 2,
+        iterations,
+        log_file,
+        speedup_factors,
+    )
+
+    # Case 28: Subtraction with 8 words - 4 words
+    run_benchmark_subtraction(
+        "Subtraction with 8 words - 4 words",
+        "123456789" * 8,
+        "987654321" * 4,
+        iterations,
+        log_file,
+        speedup_factors,
+    )
+
+    # Case 29: Subtraction with 16 words - 8 words
+    run_benchmark_subtraction(
+        "Subtraction with 16 words - 8 words",
+        "123456789" * 16,
+        "987654321" * 8,
+        iterations,
+        log_file,
+        speedup_factors,
+    )
+
+    # Case 30: Subtraction with 32 words - 16 words
+    run_benchmark_subtraction(
+        "Subtraction with 32 words - 16 words",
+        "123456789" * 32,
+        "987654321" * 16,
+        iterations,
+        log_file,
+        speedup_factors,
+    )
+
+    # Case 31: Subtraction with 64 words - 32 words
+    run_benchmark_subtraction(
+        "Subtraction with 64 words - 32 words",
+        "123456789" * 64,
+        "987654321" * 32,
+        iterations,
+        log_file,
+        speedup_factors,
+    )
+
+    # Case 32: Subtraction with 256 words - 128 words
+    run_benchmark_subtraction(
+        "Subtraction with 256 words - 128 words",
+        "123456789" * 256,
+        "987654321" * 128,
+        iterations,
+        log_file,
+        speedup_factors,
+    )
+
+    # Case 33: Subtraction with 1024 words - 512 words
+    run_benchmark_subtraction(
+        "Subtraction with 1024 words - 512 words",
+        "123456789" * 1024,
+        "987654321" * 512,
+        iterations,
+        log_file,
+        speedup_factors,
+    )
+
+    # Case 34: Subtraction with 4096 words - 2048 words
+    run_benchmark_subtraction(
+        "Subtraction with 4096 words - 2048 words",
+        "123456789" * 4096,
+        "987654321" * 2048,
+        iterations,
+        log_file,
+        speedup_factors,
+    )
+
+    # Case 35: Subtraction with 16384 words - 8192 words
+    run_benchmark_subtraction(
+        "Subtraction with 16384 words - 8192 words",
+        "123456789" * 16384,
+        "987654321" * 8192,
+        iterations,
+        log_file,
+        speedup_factors,
+    )
+
+    # Case 36: Subtraction with 32768 words - 16384 words
+    run_benchmark_subtraction(
+        "Subtraction with 32768 words - 16384 words",
+        "123456789" * 32768,
+        "987654321" * 16384,
+        iterations,
+        log_file,
+        speedup_factors,
+    )
+
+    # Calculate average speedup factor
+    var sum_speedup: Float64 = 0.0
+    for i in range(len(speedup_factors)):
+        sum_speedup += speedup_factors[i]
+    var average_speedup = sum_speedup / Float64(len(speedup_factors))
+
+    # Display summary
+    log_print("\n=== BigUInt Subtraction Benchmark Summary ===", log_file)
+    log_print("Benchmarked:      different subtraction cases", log_file)
+    log_print(
+        "Each case ran:    " + String(iterations) + " iterations", log_file
+    )
+    log_print("Average speedup:  " + String(average_speedup) + "×", log_file)
+
+    # List all speedup factors
+    log_print("\nIndividual speedup factors:", log_file)
+    for i in range(len(speedup_factors)):
+        log_print(
+            String("Case {}: {}×").format(i + 1, round(speedup_factors[i], 2)),
+            log_file,
+        )
+
+    # Close the log file
+    log_file.close()
+    print("Benchmark completed. Log file closed.")
diff --git a/src/decimojo/biguint/arithmetics.mojo b/src/decimojo/biguint/arithmetics.mojo
index 6f78929..be3dbdb 100644
--- a/src/decimojo/biguint/arithmetics.mojo
+++ b/src/decimojo/biguint/arithmetics.mojo
@@ -33,6 +33,8 @@ from decimojo.rounding_mode import RoundingMode
 # absolute(x: BigUInt) -> BigUInt
 #
 # add(x1: BigUInt, x2: BigUInt) -> BigUInt
+# add_simd(x: BigUInt, y: BigUInt) -> BigUInt
+# add_slices(x: BigUInt, y: BigUInt, start_x: Int, end_x: Int, start_y: Int, end_y: Int) -> BigUInt
 # add_inplace(x1: BigUInt, x2: BigUInt)
 # add_inplace_by_uint32(x: BigUInt, y: UInt32) -> None
 #
@@ -44,6 +46,7 @@ from decimojo.rounding_mode import RoundingMode
 # multiply(x1: BigUInt, x2: BigUInt) -> BigUInt
 # multiply_slices(x: BigUInt, y: BigUInt, start_x: Int, end_x: Int, start_y: Int, end_y: Int) -> BigUInt
 # multiply_karatsuba(x: BigUInt, y: BigUInt, start_x: Int, end_x: Int, start_y: Int, end_y: Int, cutoff_number_of_words: Int) -> BigUInt
+# multiply_inplace_by_uint32(x: BigUInt, y: UInt32) -> None
 # multiply_by_power_of_ten(x: BigUInt, n: Int) -> BigUInt
 # multiply_inplace_by_power_of_billion(mut x: BigUInt, n: Int)
 #
@@ -62,6 +65,7 @@ from decimojo.rounding_mode import RoundingMode
 # ceil_modulo(x1: BigUInt, x2: BigUInt) -> BigUInt
 # divmod(x1: BigUInt, x2: BigUInt) -> Tuple[BigUInt, BigUInt]
 #
+# normalize_carries(x: BigUInt) -> None
 # power_of_10(n: Int) -> BigUInt
 # ===----------------------------------------------------------------------=== #
 
@@ -360,6 +364,10 @@ fn add_inplace(mut x: BigUInt, y: BigUInt) -> None:
     Args:
         x: The first unsigned integer operand.
         y: The second unsigned integer operand.
+
+    Notes:
+
+    This function uses SIMD operations to add the words of the two BigUInt.
     """
 
     # Short circuit cases
@@ -417,6 +425,30 @@ fn add_inplace_by_uint32(mut x: BigUInt, y: UInt32) -> None:
 fn subtract(x: BigUInt, y: BigUInt) raises -> BigUInt:
     """Returns the difference of two unsigned integers.
 
+    Args:
+        x: The first unsigned integer (minuend).
+        y: The second unsigned integer (subtrahend).
+
+    Raises:
+        Error: If y is greater than x, resulting in an underflow.
+
+    Returns:
+        The result of subtracting y from x.
+    """
+    # I will use the SIMD approach for subtraction
+    # This speeds up the subtraction by 1.25x for large numbers.
+    return subtract_simd(x, y)
+
+    # Yuhao ZHU:
+    # Below is a school method for subtraction.
+    # You go from the least significant word to the most significant word.
+    #
+    # return subtract_school(x, y)
+
+
+fn subtract_school(x: BigUInt, y: BigUInt) raises -> BigUInt:
+    """Returns the difference of two unsigned integers using the school method.
+
     Args:
         x: The first unsigned integer (minuend).
         y: The second unsigned integer (subtrahend).
@@ -478,6 +510,81 @@ fn subtract(x: BigUInt, y: BigUInt) raises -> BigUInt:
     return result^
 
 
+fn subtract_simd(x: BigUInt, y: BigUInt) raises -> BigUInt:
+    """Returns the difference of two unsigned integers using SIMD operations.
+
+    Args:
+        x: The first unsigned integer (minuend).
+        y: The second unsigned integer (subtrahend).
+
+    Raises:
+        Error: If y is greater than x, resulting in an underflow.
+
+    Returns:
+        The result of subtracting y from x.
+
+    Notes:
+
+    I will make use of SIMD operations to subtract the words in parallel.
+    This will first subtract the words in parallel and then handle the borrows.
+    Note that there will be potential overflow in the subtraction,
+    but I will take advantage of that.
+    """
+    # If the subtrahend is zero, return the minuend
+    # Yuhao ZHU:
+    # This step is important because y can be of zero words and is longer than x.
+    # This will makes the subtraction beyond the boundary of the result number,
+    # whose length is equal to the length of x.
+    # Note that our subtraction is via SIMD, so it is directly worked on unsafe
+    # pointers.
+    if y.is_zero():
+        return x
+
+    # We need to determine which number has the larger magnitude
+    var comparison_result = x.compare(y)
+    if comparison_result == 0:
+        # |x| = |y|
+        return BigUInt()  # Return zero
+    if comparison_result < 0:
+        raise Error("biguint.arithmetics.subtract(): Underflow due to x < y")
+
+    # Now it is safe to subtract the smaller number from the larger one
+    # The result will have no more words than the first number
+    var words = List[UInt32](unsafe_uninit_length=len(x.words))
+
+    # Yuhao ZHU:
+    # We will make use of SIMD operations to subtract the words in parallel.
+    # This will first subtract the words in parallel and then handle the borrows.
+    # Note that there will be potential overflow in the subtraction,
+    # but we will take advantage of that.
+    @parameter
+    fn vector_subtract[simd_width: Int](i: Int):
+        words.data.store[width=simd_width](
+            i,
+            x.words.data.load[width=simd_width](i)
+            - y.words.data.load[width=simd_width](i),
+        )
+
+    vectorize[vector_subtract, BigUInt.VECTOR_WIDTH](len(y.words))
+
+    @parameter
+    fn vector_copy_rest[simd_width: Int](i: Int):
+        words.data.store[width=simd_width](
+            len(y.words) + i,
+            x.words.data.load[width=simd_width](len(y.words) + i),
+        )
+
+    vectorize[vector_copy_rest, BigUInt.VECTOR_WIDTH](
+        len(x.words) - len(y.words)
+    )
+
+    var result = BigUInt(words=words^)
+    normalize_borrows(result)
+    result.remove_leading_empty_words()
+
+    return result^
+
+
 fn subtract_inplace(mut x: BigUInt, y: BigUInt) raises -> None:
     """Subtracts y from x in place."""
 
@@ -497,35 +604,59 @@ fn subtract_inplace(mut x: BigUInt, y: BigUInt) raises -> None:
 
     # Now it is safe to subtract the smaller number from the larger one
     # Note that len(x.words) >= len(y.words) here
-    var borrow: UInt32 = 0  # Can either be 0 or 1
-
-    for i in range(len(y.words)):
-        if x.words[i] < borrow + y.words[i]:
-            x.words[i] += BigUInt.BASE
-            x.words[i] -= borrow + y.words[i]
-            borrow = 1  # Set borrow for the next word
-        else:
-            x.words[i] -= borrow + y.words[i]
-            borrow = 0  # No borrow for the next word
+    # Use SIMD operations to subtract the words in parallel.
+    @parameter
+    fn vector_subtract[simd_width: Int](i: Int):
+        x.words.data.store[width=simd_width](
+            i,
+            x.words.data.load[width=simd_width](i)
+            - y.words.data.load[width=simd_width](i),
+        )
 
-    # If x has more words than y, we need to handle the remaining words
+    vectorize[vector_subtract, BigUInt.VECTOR_WIDTH](len(y.words))
 
-    if borrow == 0:
-        # If there is no borrow, we can stop early
-        x.remove_leading_empty_words()
-        return
+    # Normalize borrows after subtraction
+    normalize_borrows(x)
+    x.remove_leading_empty_words()
 
-    else:
-        # At this stage, borrow can only be 0 or 1
-        for i in range(len(y.words), len(x.words)):
-            if x.words[i] >= borrow:
-                x.words[i] -= borrow
-                break  # No more borrow, we can stop early
-            else:  # x.words[i] == 0, borrow == 1
-                x.words[i] = BigUInt.BASE - borrow
+    return
 
-        x.remove_leading_empty_words()
-        return
+    # Yuhao ZHU:
+    # Below is a school method for subtraction.
+    # You go from the least significant word to the most significant word.
+    # I keep it here for reference.
+    # In case the behavior of the underflow of the SIMD subtraction changes
+    # in the future, we can fall back to this method.
+
+    # var borrow: UInt32 = 0  # Can either be 0 or 1
+
+    # for i in range(len(y.words)):
+    #     if x.words[i] < borrow + y.words[i]:
+    #         x.words[i] += BigUInt.BASE
+    #         x.words[i] -= borrow + y.words[i]
+    #         borrow = 1  # Set borrow for the next word
+    #     else:
+    #         x.words[i] -= borrow + y.words[i]
+    #         borrow = 0  # No borrow for the next word
+
+    # # If x has more words than y, we need to handle the remaining words
+
+    # if borrow == 0:
+    #     # If there is no borrow, we can stop early
+    #     x.remove_leading_empty_words()
+    #     return
+
+    # else:
+    #     # At this stage, borrow can only be 0 or 1
+    #     for i in range(len(y.words), len(x.words)):
+    #         if x.words[i] >= borrow:
+    #             x.words[i] -= borrow
+    #             break  # No more borrow, we can stop early
+    #         else:  # x.words[i] == 0, borrow == 1
+    #             x.words[i] = BigUInt.BASE - borrow
+
+    #     x.remove_leading_empty_words()
+    #     return
 
 
 fn subtract_inplace_no_check(mut x: BigUInt, y: BigUInt) -> None:
@@ -543,38 +674,23 @@ fn subtract_inplace_no_check(mut x: BigUInt, y: BigUInt) -> None:
         return
 
     # Underflow checks are skipped here, so we assume x >= y
+    # Note that len(x.words) >= len(y.words) under this assumption
 
-    # Now it is safe to subtract the smaller number from the larger one
-    # Note that len(x.words) >= len(y.words) here
-    var borrow: UInt32 = 0  # Can either be 0 or 1
-
-    for i in range(len(y.words)):
-        if x.words[i] < borrow + y.words[i]:
-            x.words[i] += BigUInt.BASE
-            x.words[i] -= borrow + y.words[i]
-            borrow = 1  # Set borrow for the next word
-        else:
-            x.words[i] -= borrow + y.words[i]
-            borrow = 0  # No borrow for the next word
+    @parameter
+    fn vector_subtract[simd_width: Int](i: Int):
+        x.words.data.store[width=simd_width](
+            i,
+            x.words.data.load[width=simd_width](i)
+            - y.words.data.load[width=simd_width](i),
+        )
 
-    # If x has more words than y, we need to handle the remaining words
+    vectorize[vector_subtract, BigUInt.VECTOR_WIDTH](len(y.words))
 
-    if borrow == 0:
-        # If there is no borrow, we can stop early
-        x.remove_leading_empty_words()
-        return
+    # Normalize borrows after subtraction
+    normalize_borrows(x)
+    x.remove_leading_empty_words()
 
-    else:
-        # At this stage, borrow can only be 0 or 1
-        for i in range(len(y.words), len(x.words)):
-            if x.words[i] >= borrow:
-                x.words[i] -= borrow
-                break  # No more borrow, we can stop early
-            else:  # x.words[i] == 0, borrow == 1
-                x.words[i] = BigUInt.BASE - borrow
-
-        x.remove_leading_empty_words()
-        return
+    return
 
 
 # ===----------------------------------------------------------------------=== #
@@ -725,8 +841,8 @@ fn multiply_slices(
 
             # The lower 9 digits (base 10^9) go into the current word
             # The upper digits become the carry for the next position
-            words[i + j] = UInt32(product % BigUInt.BASE)
-            carry = product // BigUInt.BASE
+            words[i + j] = UInt32(product % UInt64(BigUInt.BASE))
+            carry = product // UInt64(BigUInt.BASE)
 
         # If there is a carry left, add it to the next position
         if carry > 0:
@@ -1703,11 +1819,11 @@ fn normalize_carries(mut x: BigUInt):
 
     # Yuhao ZHU:
     # By construction, the words of x are in the range [0, BASE*2).
-    # Thus, the crray can only be 0 or 1.
+    # Thus, the carry can only be 0 or 1.
     var carry: UInt32 = 0
     for ref word in x.words:
         if carry == 0:
-            if word < BigUInt.BASE:
+            if word <= BigUInt.BASE_MAX:
                 pass  # carry = 0
             else:
                 word -= BigUInt.BASE
@@ -1725,6 +1841,46 @@ fn normalize_carries(mut x: BigUInt):
     return
 
 
+fn normalize_borrows(mut x: BigUInt):
+    """Normalizes the values of words into valid range by borrowing.
+    The caller should ensure that the final result is non-negative.
+    The initial values of the words should be in the range:
+    [0, BASE-1] or [3294967297, 4294967295], in other words,
+    [UInt32.MAX - 999_999_998, ..., UInt32.MAX, 0, ..., BASE-1].
+
+    Notes:
+
+    If we subtract two BigUInt numbers word-by-word, we may end up with
+    a situation where some words are **underflowed**. We can take advantage of
+    the overflowed values of the words to normalize the borrows,
+    ensuring that all words are within the valid range.
+    """
+
+    alias NEG_BASE_MAX = UInt32(3294967297)  # UInt32(0) - BigUInt.BASE_MAX
+
+    # Yuhao ZHU:
+    # By construction, the words of x are in the range [-BASE_MAX, BASE_MAX].
+    # Thus, the borrow can only be 0 or 1.
+    var borrow: UInt32 = 0
+    for ref word in x.words:
+        if borrow == 0:
+            if word <= BigUInt.BASE_MAX:  # 0 <= word <= 999_999_999
+                pass  # borrow = 0
+            else:  # word >= 3294967297, overflowed value
+                word += BigUInt.BASE
+                borrow = 1
+        else:  # borrow == 1
+            if (word >= 1) and (
+                word <= BigUInt.BASE_MAX
+            ):  # 1 <= word <= 999_999_999
+                word -= 1
+                borrow = 0
+            else:  # word >= 3294967297 or word == 0, overflowed value
+                word = (word + BigUInt.BASE) - 1
+                # borrow = 1
+    return
+
+
 fn power_of_10(n: Int) raises -> BigUInt:
     """Calculates 10^n efficiently."""
     if n < 0: