diff --git a/Directory.Packages.props b/Directory.Packages.props
index 4a456017a76..dc98d01c92b 100644
--- a/Directory.Packages.props
+++ b/Directory.Packages.props
@@ -22,10 +22,11 @@
-
+
+
diff --git a/benchmark/BDN.benchmark/Bitmap/BinaryOperations.cs b/benchmark/BDN.benchmark/Bitmap/BinaryOperations.cs
new file mode 100644
index 00000000000..a600bcb0ad4
--- /dev/null
+++ b/benchmark/BDN.benchmark/Bitmap/BinaryOperations.cs
@@ -0,0 +1,81 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+using System.Runtime.InteropServices;
+using BenchmarkDotNet.Attributes;
+using Garnet.server;
+
+namespace BDN.benchmark.Bitmap
+{
+ public unsafe class BinaryOperations
+ {
+ private const int Alignment = 64;
+
+ [ParamsSource(nameof(GetBitmapSizes))]
+ public Sizes BitmapSizes { get; set; }
+
+ [Params(BitmapOperation.XOR)]
+ public BitmapOperation Op { get; set; }
+
+ public IEnumerable<Sizes> GetBitmapSizes()
+ {
+ yield return new([1 << 21, 1 << 21]);
+ yield return new([1 << 21, (1 << 21) + 1]);
+
+ yield return new([1 << 21, 1 << 21, 1 << 21]);
+ yield return new([1 << 21, 1 << 21, (1 << 21) + 1]);
+
+ yield return new([256, 6 * 512 + 7, 512, 1024]);
+ }
+
+ private int minBitmapSize;
+ private byte** srcPtrs;
+ private byte** srcEndPtrs;
+
+ private int dstLength;
+ private byte* dstPtr;
+
+ [GlobalSetup]
+ public void GlobalSetup_Binary()
+ {
+ minBitmapSize = BitmapSizes.Values.Min();
+ srcPtrs = (byte**)NativeMemory.AllocZeroed((nuint)BitmapSizes.Values.Length, (nuint)sizeof(byte*));
+ srcEndPtrs = (byte**)NativeMemory.AllocZeroed((nuint)BitmapSizes.Values.Length, (nuint)sizeof(byte*));
+
+ for (var i = 0; i < BitmapSizes.Values.Length; i++)
+ {
+ srcPtrs[i] = (byte*)NativeMemory.AlignedAlloc((nuint)BitmapSizes.Values[i], Alignment);
+ srcEndPtrs[i] = srcPtrs[i] + BitmapSizes.Values[i];
+
+ new Random(i).NextBytes(new Span<byte>(srcPtrs[i], BitmapSizes.Values[i]));
+ }
+
+ dstLength = BitmapSizes.Values.Max();
+ dstPtr = (byte*)NativeMemory.AlignedAlloc((nuint)dstLength, Alignment);
+ }
+
+ [Benchmark]
+ public void BinaryOperation()
+ {
+ BitmapManager.InvokeBitOperationUnsafe(Op, BitmapSizes.Values.Length, srcPtrs, srcEndPtrs, dstPtr, dstLength, minBitmapSize);
+ }
+
+ [GlobalCleanup]
+ public void GlobalCleanup()
+ {
+ for (var i = 0; i < BitmapSizes.Values.Length; i++)
+ {
+ NativeMemory.AlignedFree(srcPtrs[i]);
+ }
+
+ NativeMemory.Free(srcPtrs);
+ NativeMemory.Free(srcEndPtrs);
+ NativeMemory.AlignedFree(dstPtr);
+ }
+
+ public record struct Sizes(int[] Values)
+ {
+ public override string ToString() => string.Join(", ", Values);
+ }
+ }
+}
\ No newline at end of file
diff --git a/benchmark/BDN.benchmark/Bitmap/UnaryOperations.cs b/benchmark/BDN.benchmark/Bitmap/UnaryOperations.cs
new file mode 100644
index 00000000000..c1b2fc0bcaf
--- /dev/null
+++ b/benchmark/BDN.benchmark/Bitmap/UnaryOperations.cs
@@ -0,0 +1,59 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+using System.Runtime.InteropServices;
+using BenchmarkDotNet.Attributes;
+using Garnet.server;
+
+namespace BDN.benchmark.Bitmap
+{
+ public unsafe partial class UnaryOperations
+ {
+ private const int Alignment = 64;
+
+ [ParamsSource(nameof(GetBitmapSize))]
+ public int BitmapSize { get; set; }
+
+ public IEnumerable<int> GetBitmapSize()
+ {
+ yield return 256;
+ yield return 1 << 21;
+ }
+
+ private const int Keys = 1;
+ private byte** srcPtrs;
+ private byte** srcEndPtrs;
+
+ private byte* dstPtr;
+
+ [GlobalSetup]
+ public void GlobalSetup_Unary()
+ {
+ srcPtrs = (byte**)NativeMemory.AllocZeroed(Keys, (nuint)sizeof(byte*));
+ srcEndPtrs = (byte**)NativeMemory.AllocZeroed(Keys, (nuint)sizeof(byte*));
+
+ srcPtrs[0] = (byte*)NativeMemory.AlignedAlloc((uint)BitmapSize, Alignment);
+ srcEndPtrs[0] = srcPtrs[0] + (uint)BitmapSize;
+
+ new Random(0).NextBytes(new Span<byte>(srcPtrs[0], BitmapSize));
+
+ dstPtr = (byte*)NativeMemory.AlignedAlloc((nuint)BitmapSize, Alignment);
+ }
+
+ [Benchmark]
+ public void BitOperation_NOT()
+ {
+ BitmapManager.InvokeBitOperationUnsafe(BitmapOperation.NOT, Keys, srcPtrs, srcEndPtrs, dstPtr, BitmapSize, BitmapSize);
+ }
+
+ [GlobalCleanup]
+ public void GlobalCleanup()
+ {
+ NativeMemory.AlignedFree(srcPtrs[0]);
+
+ NativeMemory.Free(srcPtrs);
+ NativeMemory.Free(srcEndPtrs);
+ NativeMemory.AlignedFree(dstPtr);
+ }
+ }
+}
\ No newline at end of file
diff --git a/libs/common/Numerics/IBinaryOperator.cs b/libs/common/Numerics/IBinaryOperator.cs
new file mode 100644
index 00000000000..2abf864851d
--- /dev/null
+++ b/libs/common/Numerics/IBinaryOperator.cs
@@ -0,0 +1,80 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+using System.Numerics;
+using System.Runtime.Intrinsics;
+
+namespace Garnet.common.Numerics
+{
+ /// Operator that takes two input values and returns a single value.
+ public interface IBinaryOperator
+ {
+ ///
+ /// Computes the binary operation of two scalar values.
+ ///
+ static abstract T Invoke<T>(T x, T y) where T : IBinaryInteger<T>;
+
+ ///
+ /// Computes the binary operation of two vectors.
+ ///
+ static abstract Vector128<byte> Invoke(Vector128<byte> x, Vector128<byte> y);
+
+ ///
+ static abstract Vector256<byte> Invoke(Vector256<byte> x, Vector256<byte> y);
+
+ ///
+ static abstract Vector512<byte> Invoke(Vector512<byte> x, Vector512<byte> y);
+ }
+
+ /// x & y
+ public readonly struct BitwiseAndOperator : IBinaryOperator
+ {
+ ///
+ public static T Invoke<T>(T x, T y) where T : IBinaryInteger<T> => x & y;
+ ///
+ public static Vector128<byte> Invoke(Vector128<byte> x, Vector128<byte> y) => x & y;
+ ///
+ public static Vector256<byte> Invoke(Vector256<byte> x, Vector256<byte> y) => x & y;
+ ///
+ public static Vector512<byte> Invoke(Vector512<byte> x, Vector512<byte> y) => x & y;
+ }
+
+ /// x | y
+ public readonly struct BitwiseOrOperator : IBinaryOperator
+ {
+ ///
+ public static T Invoke<T>(T x, T y) where T : IBinaryInteger<T> => x | y;
+ ///
+ public static Vector128<byte> Invoke(Vector128<byte> x, Vector128<byte> y) => x | y;
+ ///
+ public static Vector256<byte> Invoke(Vector256<byte> x, Vector256<byte> y) => x | y;
+ ///
+ public static Vector512<byte> Invoke(Vector512<byte> x, Vector512<byte> y) => x | y;
+ }
+
+ /// x ^ y
+ public readonly struct BitwiseXorOperator : IBinaryOperator
+ {
+ ///
+ public static T Invoke<T>(T x, T y) where T : IBinaryInteger<T> => x ^ y;
+ ///
+ public static Vector128<byte> Invoke(Vector128<byte> x, Vector128<byte> y) => x ^ y;
+ ///
+ public static Vector256<byte> Invoke(Vector256<byte> x, Vector256<byte> y) => x ^ y;
+ ///
+ public static Vector512<byte> Invoke(Vector512<byte> x, Vector512<byte> y) => x ^ y;
+ }
+
+ /// x & ~y
+ public readonly struct BitwiseAndNotOperator : IBinaryOperator
+ {
+ ///
+ public static T Invoke<T>(T x, T y) where T : IBinaryInteger<T> => x & ~y;
+ ///
+ public static Vector128<byte> Invoke(Vector128<byte> x, Vector128<byte> y) => x & ~y;
+ ///
+ public static Vector256<byte> Invoke(Vector256<byte> x, Vector256<byte> y) => x & ~y;
+ ///
+ public static Vector512<byte> Invoke(Vector512<byte> x, Vector512<byte> y) => x & ~y;
+ }
+}
\ No newline at end of file
diff --git a/libs/server/Garnet.server.csproj b/libs/server/Garnet.server.csproj
index 15939de0249..2c351e80f45 100644
--- a/libs/server/Garnet.server.csproj
+++ b/libs/server/Garnet.server.csproj
@@ -20,6 +20,7 @@
+
diff --git a/libs/server/Lua/LuaRunner.Functions.cs b/libs/server/Lua/LuaRunner.Functions.cs
index 920ee638544..e5b36a83dd8 100644
--- a/libs/server/Lua/LuaRunner.Functions.cs
+++ b/libs/server/Lua/LuaRunner.Functions.cs
@@ -2857,6 +2857,7 @@ internal int AclCheckCommand(nint luaStatePtr)
case RespCommand.BITOP_OR: state.PushConstantString(constStrs.OR); break;
case RespCommand.BITOP_XOR: state.PushConstantString(constStrs.XOR); break;
case RespCommand.BITOP_NOT: state.PushConstantString(constStrs.NOT); break;
+ case RespCommand.BITOP_DIFF: state.PushConstantString(constStrs.DIFF); break;
default: throw new InvalidOperationException($"Unexpected BITOP sub command: {subCommand}");
}
diff --git a/libs/server/Lua/LuaRunner.Strings.cs b/libs/server/Lua/LuaRunner.Strings.cs
index 6492d81d503..372900c0216 100644
--- a/libs/server/Lua/LuaRunner.Strings.cs
+++ b/libs/server/Lua/LuaRunner.Strings.cs
@@ -161,6 +161,8 @@ private readonly struct ConstantStringRegistryIndexes
internal int XOR { get; }
///
internal int NOT { get; }
+ ///
+ internal int DIFF { get; }
///
internal int KEYS { get; }
///
@@ -246,6 +248,7 @@ internal ConstantStringRegistryIndexes(ref LuaStateWrapper state)
OR = ConstantStringToRegistry(ref state, CmdStrings.LUA_OR);
XOR = ConstantStringToRegistry(ref state, CmdStrings.LUA_XOR);
NOT = ConstantStringToRegistry(ref state, CmdStrings.LUA_NOT);
+ DIFF = ConstantStringToRegistry(ref state, CmdStrings.LUA_DIFF);
KEYS = ConstantStringToRegistry(ref state, CmdStrings.LUA_KEYS);
ARGV = ConstantStringToRegistry(ref state, CmdStrings.LUA_ARGV);
}
diff --git a/libs/server/Resp/Bitmap/BitmapCommands.cs b/libs/server/Resp/Bitmap/BitmapCommands.cs
index ce50482c0a0..a15acc877c0 100644
--- a/libs/server/Resp/Bitmap/BitmapCommands.cs
+++ b/libs/server/Resp/Bitmap/BitmapCommands.cs
@@ -40,7 +40,12 @@ public enum BitmapOperation : byte
///
/// NOT
///
- NOT
+ NOT,
+
+ ///
+ /// DIFF
+ ///
+ DIFF
}
internal enum BitFieldOverflow : byte
@@ -317,6 +322,11 @@ private bool NetworkStringBitOperation(BitmapOperation bitOp, ref TG
return AbortWithErrorMessage(CmdStrings.RESP_ERR_WRONG_NUMBER_OF_ARGUMENTS);
}
+ if (bitOp == BitmapOperation.DIFF && parseState.Count < 3)
+ {
+ return AbortWithErrorMessage(CmdStrings.RESP_ERR_BITOP_DIFF_TWO_SOURCE_KEYS_REQUIRED);
+ }
+
if (parseState.Count > 64)
{
return AbortWithErrorMessage(CmdStrings.RESP_ERR_BITOP_KEY_LIMIT);
diff --git a/libs/server/Resp/Bitmap/BitmapManagerBitOp.cs b/libs/server/Resp/Bitmap/BitmapManagerBitOp.cs
index 6917e259935..88835d65624 100644
--- a/libs/server/Resp/Bitmap/BitmapManagerBitOp.cs
+++ b/libs/server/Resp/Bitmap/BitmapManagerBitOp.cs
@@ -1,650 +1,336 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
+using System;
+using System.Diagnostics;
+using System.Numerics.Tensors;
+using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
-using System.Runtime.Intrinsics.X86;
using Garnet.common;
-
+using Garnet.common.Numerics;
namespace Garnet.server
{
public unsafe partial class BitmapManager
{
///
- /// BitOp main driver.
+ /// Performs a bitwise operation across one or more source buffers and writes the result to the destination buffer.
///
- /// Output buffer to write BitOp result
- /// Output buffer length.
- /// Array of pointers to bitmaps used as input in the corresponding bitop.
- /// Array of pointers to bitmap sources.
- /// Number of source keys.
- /// Minimum size of source bitmap.
- /// Type of bitop operation being executed.
- ///
- public static bool BitOpMainUnsafeMultiKey(byte* dstPtr, int dstLen, byte** srcStartPtrs, byte** srcEndPtrs, int srcKeyCount, int minSize, byte bitop)
+ /// The bitwise operation to perform.
+ /// Number of source buffers
+ /// Array of pointers to source buffers. The array length must be greater than or equal to
+ /// Array of the buffer lengths specified in . The array length must be greater than or equal to
+ /// Destination buffer to write the result.
+ /// Destination buffer length.
+ /// The length of shortest source buffer.
+ public static void InvokeBitOperationUnsafe(BitmapOperation op, int srcCount, byte** srcPtrs, byte** srcEndPtrs, byte* dstPtr, int dstLength, int shortestSrcLength)
{
- switch (bitop)
- {
- case (byte)BitmapOperation.NOT:
- __bitop_multikey_simdX256_not(dstPtr, dstLen, srcStartPtrs[0], srcEndPtrs[0] - srcStartPtrs[0]);
- break;
- case (byte)BitmapOperation.AND:
- __bitop_multikey_simdX256_and(dstPtr, dstLen, srcStartPtrs, srcEndPtrs, srcKeyCount, minSize);
- break;
- case (byte)BitmapOperation.OR:
- __bitop_multikey_simdX256_or(dstPtr, dstLen, srcStartPtrs, srcEndPtrs, srcKeyCount, minSize);
- break;
- case (byte)BitmapOperation.XOR:
- __bitop_multikey_simdX256_xor(dstPtr, dstLen, srcStartPtrs, srcEndPtrs, srcKeyCount, minSize);
- break;
- default:
- throw new GarnetException("Unsupported BitOp command");
- }
- return true;
- }
+ Debug.Assert(op is BitmapOperation.NOT or BitmapOperation.AND or BitmapOperation.OR or BitmapOperation.XOR or BitmapOperation.DIFF);
+ Debug.Assert(srcCount > 0);
+ Debug.Assert(dstLength >= 0 && shortestSrcLength >= 0);
+ Debug.Assert(dstLength >= shortestSrcLength);
- ///
- /// Negation bitop implementation using 256-wide SIMD registers.
- ///
- /// Output buffer to write BitOp result
- /// Output buffer length.
- /// Pointer to source bitmap.
- /// Source bitmap length.
- private static void __bitop_multikey_simdX256_not(byte* dstPtr, long dstLen, byte* srcBitmap, long srcLen)
- {
- int batchSize = 8 * 32;
- long slen = srcLen;
- long stail = slen & (batchSize - 1);
-
- //iterate using srcBitmap because always dstLen >= srcLen
- byte* srcCurr = srcBitmap;
- byte* srcEnd = srcCurr + (slen - stail);
- byte* dstCurr = dstPtr;
-
- #region 8x32
- while (srcCurr < srcEnd)
- {
- Vector256 d00 = Avx.LoadVector256(srcCurr);
- Vector256 d01 = Avx.LoadVector256(srcCurr + 32);
- Vector256 d02 = Avx.LoadVector256(srcCurr + 64);
- Vector256 d03 = Avx.LoadVector256(srcCurr + 96);
- Vector256 d04 = Avx.LoadVector256(srcCurr + 128);
- Vector256 d05 = Avx.LoadVector256(srcCurr + 160);
- Vector256 d06 = Avx.LoadVector256(srcCurr + 192);
- Vector256 d07 = Avx.LoadVector256(srcCurr + 224);
-
- Avx.Store(dstCurr, Avx2.Xor(d00, Vector256.AllBitsSet));
- Avx.Store(dstCurr + 32, Avx2.Xor(d01, Vector256.AllBitsSet));
- Avx.Store(dstCurr + 64, Avx2.Xor(d02, Vector256.AllBitsSet));
- Avx.Store(dstCurr + 96, Avx2.Xor(d03, Vector256.AllBitsSet));
- Avx.Store(dstCurr + 128, Avx2.Xor(d04, Vector256.AllBitsSet));
- Avx.Store(dstCurr + 160, Avx2.Xor(d05, Vector256.AllBitsSet));
- Avx.Store(dstCurr + 192, Avx2.Xor(d06, Vector256.AllBitsSet));
- Avx.Store(dstCurr + 224, Avx2.Xor(d07, Vector256.AllBitsSet));
-
- srcCurr += batchSize;
- dstCurr += batchSize;
- }
- if (stail == 0) return;
- #endregion
-
- #region 1x32
- slen = stail;
- batchSize = 1 * 32;
- stail = slen & (batchSize - 1);
- srcEnd = srcCurr + (slen - stail);
- while (srcCurr < srcEnd)
- {
- Vector256 d00 = Avx.LoadVector256(srcCurr);
- Avx.Store(dstCurr, Avx2.Xor(d00, Vector256.AllBitsSet));
- srcCurr += batchSize;
- dstCurr += batchSize;
- }
- if (stail == 0) return;
- #endregion
-
- #region 4x8
- slen = stail;
- batchSize = 4 * 8;
- stail = slen & (batchSize - 1);
- srcEnd = srcCurr + (slen - stail);
- while (srcCurr < srcEnd)
+ if (srcCount == 1)
{
- long d00 = *(long*)(srcCurr);
- long d01 = *(long*)(srcCurr + 8);
- long d02 = *(long*)(srcCurr + 16);
- long d03 = *(long*)(srcCurr + 24);
-
- *(long*)dstCurr = ~d00;
- *(long*)(dstCurr + 8) = ~d01;
- *(long*)(dstCurr + 16) = ~d02;
- *(long*)(dstCurr + 24) = ~d03;
-
- srcCurr += batchSize;
- dstCurr += batchSize;
- }
- if (stail == 0) return;
- #endregion
-
- #region 1x8
- slen = stail;
- batchSize = 8;
- stail = slen & (batchSize - 1);
- srcEnd = srcCurr + (slen - stail);
- while (srcCurr < srcEnd)
- {
- long d00 = *(long*)(srcCurr);
+ if (op == BitmapOperation.DIFF) throw new GarnetException("BITOP DIFF operation requires at least two source bitmaps");
- *(long*)dstCurr = ~d00;
+ var srcBitmap = new ReadOnlySpan<byte>(srcPtrs[0], checked((int)(srcEndPtrs[0] - srcPtrs[0])));
+ var dstBitmap = new Span<byte>(dstPtr, dstLength);
- srcCurr += batchSize;
- dstCurr += batchSize;
+ if (op == BitmapOperation.NOT)
+ {
+ TensorPrimitives.OnesComplement(srcBitmap, dstBitmap);
+ }
+ else
+ {
+ srcBitmap.CopyTo(dstBitmap);
+ }
}
- if (stail == 0) return;
- #endregion
-
- if (stail >= 7) dstCurr[6] = (byte)(~srcCurr[6]);
- if (stail >= 6) dstCurr[5] = (byte)(~srcCurr[5]);
- if (stail >= 5) dstCurr[4] = (byte)(~srcCurr[4]);
- if (stail >= 4) dstCurr[3] = (byte)(~srcCurr[3]);
- if (stail >= 3) dstCurr[2] = (byte)(~srcCurr[2]);
- if (stail >= 2) dstCurr[1] = (byte)(~srcCurr[1]);
- if (stail >= 1) dstCurr[0] = (byte)(~srcCurr[0]);
+ // srcCount ≥ 2
+ else if (op == BitmapOperation.AND) InvokeNaryBitwiseOperation<BitwiseAndOperator>(srcCount, srcPtrs, srcEndPtrs, dstPtr, dstLength, shortestSrcLength);
+ else if (op == BitmapOperation.OR) InvokeNaryBitwiseOperation<BitwiseOrOperator>(srcCount, srcPtrs, srcEndPtrs, dstPtr, dstLength, shortestSrcLength);
+ else if (op == BitmapOperation.XOR) InvokeNaryBitwiseOperation<BitwiseXorOperator>(srcCount, srcPtrs, srcEndPtrs, dstPtr, dstLength, shortestSrcLength);
+ else if (op == BitmapOperation.DIFF) InvokeNaryBitwiseOperation<BitwiseAndNotOperator>(srcCount, srcPtrs, srcEndPtrs, dstPtr, dstLength, shortestSrcLength);
}
///
- /// AND bitop implementation using 256-wide SIMD registers.
+ /// Invokes bitwise binary operation across n-ary source bitmaps.
///
- /// Output buffer to write BitOp result
- /// Output buffer length.
- /// Pointer to start of bitmap sources.
- /// Pointer to end of bitmap sources
- /// Number of source keys.
- /// Minimum size of source bitmaps.
- private static void __bitop_multikey_simdX256_and(byte* dstPtr, int dstLen, byte** srcStartPtrs, byte** srcEndPtrs, int srcKeyCount, int minSize)
+ /// The binary operator type to compute bitwise
+ /// Number of source bitmaps.
+ /// Array of pointers to source bitmap buffers.
+ /// Array of the of pointers pointing to the end of the respective the bitmaps specified in .
+ /// Destination buffer to write the result.
+ /// Destination buffer length.
+ /// The length of shortest source buffer.
+ [SkipLocalsInit]
+ private static void InvokeNaryBitwiseOperation<TBinaryOperator>(int srcCount, byte** srcPtrs, byte** srcEndPtrs, byte* dstPtr, int dstLength, int shortestSrcLength)
+ where TBinaryOperator : struct, IBinaryOperator
{
- int batchSize = 8 * 32;
- long slen = minSize;
- long stail = slen & (batchSize - 1);
-
- byte* dstCurr = dstPtr;
- byte* dstEnd = dstCurr + (slen - stail);
+ var dstEndPtr = dstPtr + dstLength;
- #region 8x32
- while (dstCurr < dstEnd)
- {
- Vector256 d00 = Avx.LoadVector256(srcStartPtrs[0]);
- Vector256 d01 = Avx.LoadVector256(srcStartPtrs[0] + 32);
- Vector256 d02 = Avx.LoadVector256(srcStartPtrs[0] + 64);
- Vector256 d03 = Avx.LoadVector256(srcStartPtrs[0] + 96);
- Vector256 d04 = Avx.LoadVector256(srcStartPtrs[0] + 128);
- Vector256 d05 = Avx.LoadVector256(srcStartPtrs[0] + 160);
- Vector256 d06 = Avx.LoadVector256(srcStartPtrs[0] + 192);
- Vector256 d07 = Avx.LoadVector256(srcStartPtrs[0] + 224);
- srcStartPtrs[0] += batchSize;
- for (int i = 1; i < srcKeyCount; i++)
- {
- Vector256 s00 = Avx.LoadVector256(srcStartPtrs[i]);
- Vector256 s01 = Avx.LoadVector256(srcStartPtrs[i] + 32);
- Vector256 s02 = Avx.LoadVector256(srcStartPtrs[i] + 64);
- Vector256 s03 = Avx.LoadVector256(srcStartPtrs[i] + 96);
- Vector256 s04 = Avx.LoadVector256(srcStartPtrs[i] + 128);
- Vector256 s05 = Avx.LoadVector256(srcStartPtrs[i] + 160);
- Vector256 s06 = Avx.LoadVector256(srcStartPtrs[i] + 192);
- Vector256 s07 = Avx.LoadVector256(srcStartPtrs[i] + 224);
-
- d00 = Avx2.And(d00, s00);
- d01 = Avx2.And(d01, s01);
- d02 = Avx2.And(d02, s02);
- d03 = Avx2.And(d03, s03);
- d04 = Avx2.And(d04, s04);
- d05 = Avx2.And(d05, s05);
- d06 = Avx2.And(d06, s06);
- d07 = Avx2.And(d07, s07);
- srcStartPtrs[i] += batchSize;
- }
+ var remainingLength = shortestSrcLength;
+ var batchRemainder = shortestSrcLength;
+ byte* dstBatchEndPtr;
- Avx.Store(dstCurr, d00);
- Avx.Store(dstCurr + 32, d01);
- Avx.Store(dstCurr + 64, d02);
- Avx.Store(dstCurr + 96, d03);
- Avx.Store(dstCurr + 128, d04);
- Avx.Store(dstCurr + 160, d05);
- Avx.Store(dstCurr + 192, d06);
- Avx.Store(dstCurr + 224, d07);
+ // Keep the cursor of the first source buffer in local to keep processing tidy.
+ var firstSrcPtr = srcPtrs[0];
- dstCurr += batchSize;
- }
- if (stail == 0) goto fillTail;
- #endregion
-
- #region 1x32
- slen = stail;
- batchSize = 1 * 32;
- stail = slen & (batchSize - 1);
- dstEnd = dstCurr + (slen - stail);
-
- while (dstCurr < dstEnd)
+ // Copy remaining source buffer pointers so we don't increment caller's.
+ var tmpSrcPtrs = stackalloc byte*[srcCount];
+ for (var i = 0; i < srcCount; i++)
{
- Vector256 d00 = Avx.LoadVector256(srcStartPtrs[0]);
- srcStartPtrs[0] += batchSize;
- for (int i = 1; i < srcKeyCount; i++)
- {
- Vector256 s00 = Avx.LoadVector256(srcStartPtrs[i]);
- d00 = Avx2.And(d00, s00);
- srcStartPtrs[i] += batchSize;
- }
- Avx.Store(dstCurr, d00);
- dstCurr += batchSize;
+ tmpSrcPtrs[i] = srcPtrs[i];
}
- if (stail == 0) goto fillTail;
- #endregion
-
- #region scalar_4x8
- slen = stail;
- batchSize = 4 * 8;
- stail = slen & (batchSize - 1);
- dstEnd = dstCurr + (slen - stail);
- while (dstCurr < dstEnd)
+ srcPtrs = tmpSrcPtrs;
+
+ if (Vector512.IsHardwareAccelerated && Vector512<byte>.IsSupported)
{
- long d00 = *(long*)(srcStartPtrs[0]);
- long d01 = *(long*)(srcStartPtrs[0] + 8);
- long d02 = *(long*)(srcStartPtrs[0] + 16);
- long d03 = *(long*)(srcStartPtrs[0] + 24);
- srcStartPtrs[0] += batchSize;
- for (int i = 1; i < srcKeyCount; i++)
- {
- d00 &= *(long*)(srcStartPtrs[i]);
- d01 &= *(long*)(srcStartPtrs[i] + 8);
- d02 &= *(long*)(srcStartPtrs[i] + 16);
- d03 &= *(long*)(srcStartPtrs[i] + 24);
- srcStartPtrs[i] += batchSize;
- }
+ // Vectorized: 64 bytes x 8
+ batchRemainder = remainingLength & ((Vector512<byte>.Count * 8) - 1);
+ dstBatchEndPtr = dstPtr + (remainingLength - batchRemainder);
+ remainingLength = batchRemainder;
- *(long*)dstCurr = d00;
- *(long*)(dstCurr + 8) = d01;
- *(long*)(dstCurr + 16) = d02;
- *(long*)(dstCurr + 24) = d03;
- dstCurr += batchSize;
+ Vectorized512(ref firstSrcPtr, srcCount, srcPtrs, ref dstPtr, dstBatchEndPtr);
}
- if (stail == 0) goto fillTail;
- #endregion
-
- #region scalar_1x8
- slen = stail;
- batchSize = 8;
- stail = slen & (batchSize - 1);
- dstEnd = dstCurr + (slen - stail);
- while (dstCurr < dstEnd)
+ else if (Vector256.IsHardwareAccelerated && Vector256<byte>.IsSupported)
{
- long d00 = *(long*)(srcStartPtrs[0]);
- srcStartPtrs[0] += batchSize;
- for (int i = 1; i < srcKeyCount; i++)
- {
- d00 &= *(long*)(srcStartPtrs[i]);
- srcStartPtrs[i] += batchSize;
- }
- *(long*)dstCurr = d00;
- dstCurr += batchSize;
- }
- #endregion
+ // Vectorized: 32 bytes x 8
+ batchRemainder = remainingLength & ((Vector256<byte>.Count * 8) - 1);
+ dstBatchEndPtr = dstPtr + (remainingLength - batchRemainder);
+ remainingLength = batchRemainder;
- fillTail:
- #region scalar_1x1
- byte* dstMaxEnd = dstPtr + dstLen;
- int offset = 0;
- while (dstCurr < dstMaxEnd)
+ Vectorized256(ref firstSrcPtr, srcCount, srcPtrs, ref dstPtr, dstBatchEndPtr);
+ }
+ else if (Vector128.IsHardwareAccelerated && Vector128<byte>.IsSupported)
{
- byte d00;
- if (srcStartPtrs[0] + offset < srcEndPtrs[0])
- d00 = srcStartPtrs[0][offset];
- else
- {
- d00 = 0;
- goto writeBack;
- }
+ // Vectorized: 16 bytes x 8
+ batchRemainder = remainingLength & ((Vector128<byte>.Count * 8) - 1);
+ dstBatchEndPtr = dstPtr + (remainingLength - batchRemainder);
+ remainingLength = batchRemainder;
- for (int i = 1; i < srcKeyCount; i++)
- {
- if (srcStartPtrs[i] + offset < srcEndPtrs[i])
- d00 &= srcStartPtrs[i][offset];
- else
- {
- d00 = 0;
- goto writeBack;
- }
- }
- writeBack:
- *dstCurr++ = d00;
- offset++;
+ Vectorized128(ref firstSrcPtr, srcCount, srcPtrs, ref dstPtr, dstBatchEndPtr);
}
- #endregion
- }
-
- ///
- /// OR bitop implementation using 256-wide SIMD registers.
- ///
- /// Output buffer to write BitOp result
- /// Output buffer length.
- /// Pointer to start of bitmap sources.
- /// Pointer to end of bitmap sources
- /// Number of source keys.
- /// Minimum size of source bitmaps.
- private static void __bitop_multikey_simdX256_or(byte* dstPtr, int dstLen, byte** srcStartPtrs, byte** srcEndPtrs, int srcKeyCount, int minSize)
- {
- int batchSize = 8 * 32;
- long slen = minSize;
- long stail = slen & (batchSize - 1);
- byte* dstCurr = dstPtr;
- byte* dstEnd = dstCurr + (slen - stail);
+ // Scalar: 8 bytes x 4
+ batchRemainder = remainingLength & ((sizeof(ulong) * 4) - 1);
+ dstBatchEndPtr = dstPtr + (remainingLength - batchRemainder);
+ remainingLength = batchRemainder;
- #region 8x32
- while (dstCurr < dstEnd)
+ while (dstPtr < dstBatchEndPtr)
{
- Vector256 d00 = Avx.LoadVector256(srcStartPtrs[0]);
- Vector256 d01 = Avx.LoadVector256(srcStartPtrs[0] + 32);
- Vector256 d02 = Avx.LoadVector256(srcStartPtrs[0] + 64);
- Vector256 d03 = Avx.LoadVector256(srcStartPtrs[0] + 96);
- Vector256 d04 = Avx.LoadVector256(srcStartPtrs[0] + 128);
- Vector256 d05 = Avx.LoadVector256(srcStartPtrs[0] + 160);
- Vector256 d06 = Avx.LoadVector256(srcStartPtrs[0] + 192);
- Vector256 d07 = Avx.LoadVector256(srcStartPtrs[0] + 224);
- srcStartPtrs[0] += batchSize;
- for (int i = 1; i < srcKeyCount; i++)
- {
- Vector256 s00 = Avx.LoadVector256(srcStartPtrs[i]);
- Vector256 s01 = Avx.LoadVector256(srcStartPtrs[i] + 32);
- Vector256 s02 = Avx.LoadVector256(srcStartPtrs[i] + 64);
- Vector256 s03 = Avx.LoadVector256(srcStartPtrs[i] + 96);
- Vector256 s04 = Avx.LoadVector256(srcStartPtrs[i] + 128);
- Vector256 s05 = Avx.LoadVector256(srcStartPtrs[i] + 160);
- Vector256 s06 = Avx.LoadVector256(srcStartPtrs[i] + 192);
- Vector256 s07 = Avx.LoadVector256(srcStartPtrs[i] + 224);
-
- d00 = Avx2.Or(d00, s00);
- d01 = Avx2.Or(d01, s01);
- d02 = Avx2.Or(d02, s02);
- d03 = Avx2.Or(d03, s03);
- d04 = Avx2.Or(d04, s04);
- d05 = Avx2.Or(d05, s05);
- d06 = Avx2.Or(d06, s06);
- d07 = Avx2.Or(d07, s07);
- srcStartPtrs[i] += batchSize;
- }
+ var d00 = *(ulong*)(firstSrcPtr + (sizeof(ulong) * 0));
+ var d01 = *(ulong*)(firstSrcPtr + (sizeof(ulong) * 1));
+ var d02 = *(ulong*)(firstSrcPtr + (sizeof(ulong) * 2));
+ var d03 = *(ulong*)(firstSrcPtr + (sizeof(ulong) * 3));
- Avx.Store(dstCurr, d00);
- Avx.Store(dstCurr + 32, d01);
- Avx.Store(dstCurr + 64, d02);
- Avx.Store(dstCurr + 96, d03);
- Avx.Store(dstCurr + 128, d04);
- Avx.Store(dstCurr + 160, d05);
- Avx.Store(dstCurr + 192, d06);
- Avx.Store(dstCurr + 224, d07);
+ firstSrcPtr += sizeof(ulong) * 4;
- dstCurr += batchSize;
- }
- if (stail == 0) goto fillTail;
- #endregion
+ for (var i = 1; i < srcCount; i++)
+ {
+ ref var startPtr = ref srcPtrs[i];
- #region 1x32
- slen = stail;
- batchSize = 1 * 32;
- stail = slen & (batchSize - 1);
- dstEnd = dstCurr + (slen - stail);
+ d00 = TBinaryOperator.Invoke(d00, *(ulong*)(startPtr + (sizeof(ulong) * 0)));
+ d01 = TBinaryOperator.Invoke(d01, *(ulong*)(startPtr + (sizeof(ulong) * 1)));
+ d02 = TBinaryOperator.Invoke(d02, *(ulong*)(startPtr + (sizeof(ulong) * 2)));
+ d03 = TBinaryOperator.Invoke(d03, *(ulong*)(startPtr + (sizeof(ulong) * 3)));
- while (dstCurr < dstEnd)
- {
- Vector256 d00 = Avx.LoadVector256(srcStartPtrs[0]);
- srcStartPtrs[0] += batchSize;
- for (int i = 1; i < srcKeyCount; i++)
- {
- Vector256 s00 = Avx.LoadVector256(srcStartPtrs[i]);
- d00 = Avx2.Or(d00, s00);
- srcStartPtrs[i] += batchSize;
- }
- Avx.Store(dstCurr, d00);
- dstCurr += batchSize;
- }
- if (stail == 0) goto fillTail;
- #endregion
-
- #region scalar_4x8
- slen = stail;
- batchSize = 4 * 8;
- stail = slen & (batchSize - 1);
- dstEnd = dstCurr + (slen - stail);
- while (dstCurr < dstEnd)
- {
- long d00 = *(long*)(srcStartPtrs[0]);
- long d01 = *(long*)(srcStartPtrs[0] + 8);
- long d02 = *(long*)(srcStartPtrs[0] + 16);
- long d03 = *(long*)(srcStartPtrs[0] + 24);
- srcStartPtrs[0] += batchSize;
- for (int i = 1; i < srcKeyCount; i++)
- {
- d00 |= *(long*)(srcStartPtrs[i]);
- d01 |= *(long*)(srcStartPtrs[i] + 8);
- d02 |= *(long*)(srcStartPtrs[i] + 16);
- d03 |= *(long*)(srcStartPtrs[i] + 24);
- srcStartPtrs[i] += batchSize;
+ srcPtrs[i] += sizeof(ulong) * 4;
}
- *(long*)dstCurr = d00;
- *(long*)(dstCurr + 8) = d01;
- *(long*)(dstCurr + 16) = d02;
- *(long*)(dstCurr + 24) = d03;
- dstCurr += batchSize;
- }
- if (stail == 0) goto fillTail;
- #endregion
-
- #region scalar_1x8
- slen = stail;
- batchSize = 8;
- stail = slen & (batchSize - 1);
- dstEnd = dstCurr + (slen - stail);
- while (dstCurr < dstEnd)
- {
- long d00 = *(long*)(srcStartPtrs[0]);
- srcStartPtrs[0] += batchSize;
- for (int i = 1; i < srcKeyCount; i++)
- {
- d00 |= *(long*)(srcStartPtrs[i]);
- srcStartPtrs[i] += batchSize;
- }
- *(long*)dstCurr = d00;
- dstCurr += batchSize;
+ *(ulong*)(dstPtr + (sizeof(ulong) * 0)) = d00;
+ *(ulong*)(dstPtr + (sizeof(ulong) * 1)) = d01;
+ *(ulong*)(dstPtr + (sizeof(ulong) * 2)) = d02;
+ *(ulong*)(dstPtr + (sizeof(ulong) * 3)) = d03;
+
+ dstPtr += sizeof(ulong) * 4;
}
- #endregion
- fillTail:
- #region scalar_1x1
- byte* dstMaxEnd = dstPtr + dstLen;
- int offset = 0;
- while (dstCurr < dstMaxEnd)
+ // Handle the remaining tails
+ while (dstPtr < dstEndPtr)
{
byte d00 = 0;
- if (srcStartPtrs[0] + offset < srcEndPtrs[0])
+
+ if (firstSrcPtr < srcEndPtrs[0])
{
- d00 = srcStartPtrs[0][offset];
- if (d00 == 0xff) goto writeBack;
+ d00 = *firstSrcPtr;
+ firstSrcPtr++;
}
- for (int i = 1; i < srcKeyCount; i++)
+ for (var i = 1; i < srcCount; i++)
{
- if (srcStartPtrs[i] + offset < srcEndPtrs[i])
+ if (srcPtrs[i] < srcEndPtrs[i])
{
- d00 |= srcStartPtrs[i][offset];
- if (d00 == 0xff) goto writeBack;
+ d00 = TBinaryOperator.Invoke(d00, *srcPtrs[i]);
+ srcPtrs[i]++;
+ }
+ else if (typeof(TBinaryOperator) == typeof(BitwiseAndOperator))
+ {
+ d00 = 0;
}
}
- writeBack:
- *dstCurr++ = d00;
- offset++;
- }
- #endregion
- }
-
- ///
- /// XOR bitop implementation using 256-wide SIMD registers.
- ///
- /// Output buffer to write BitOp result
- /// Output buffer length.
- /// Pointer to start of bitmap sources.
- /// Pointer to end of bitmap sources
- /// Number of source keys.
- /// Minimum size of source bitmaps.
- private static void __bitop_multikey_simdX256_xor(byte* dstPtr, int dstLen, byte** srcStartPtrs, byte** srcEndPtrs, int srcKeyCount, int minSize)
- {
- int batchSize = 8 * 32;
- long slen = minSize;
- long stail = slen & (batchSize - 1);
- byte* dstCurr = dstPtr;
- byte* dstEnd = dstCurr + (slen - stail);
+ *dstPtr++ = d00;
+ }
- #region 8x32
- while (dstCurr < dstEnd)
+ static void Vectorized512(ref byte* firstPtr, int srcCount, byte** srcStartPtrs, ref byte* dstPtr, byte* dstBatchEndPtr)
{
- Vector256 d00 = Avx.LoadVector256(srcStartPtrs[0]);
- Vector256 d01 = Avx.LoadVector256(srcStartPtrs[0] + 32);
- Vector256 d02 = Avx.LoadVector256(srcStartPtrs[0] + 64);
- Vector256 d03 = Avx.LoadVector256(srcStartPtrs[0] + 96);
- Vector256 d04 = Avx.LoadVector256(srcStartPtrs[0] + 128);
- Vector256 d05 = Avx.LoadVector256(srcStartPtrs[0] + 160);
- Vector256 d06 = Avx.LoadVector256(srcStartPtrs[0] + 192);
- Vector256 d07 = Avx.LoadVector256(srcStartPtrs[0] + 224);
- srcStartPtrs[0] += batchSize;
- for (int i = 1; i < srcKeyCount; i++)
+ while (dstPtr < dstBatchEndPtr)
{
- Vector256 s00 = Avx.LoadVector256(srcStartPtrs[i]);
- Vector256 s01 = Avx.LoadVector256(srcStartPtrs[i] + 32);
- Vector256 s02 = Avx.LoadVector256(srcStartPtrs[i] + 64);
- Vector256 s03 = Avx.LoadVector256(srcStartPtrs[i] + 96);
- Vector256 s04 = Avx.LoadVector256(srcStartPtrs[i] + 128);
- Vector256 s05 = Avx.LoadVector256(srcStartPtrs[i] + 160);
- Vector256 s06 = Avx.LoadVector256(srcStartPtrs[i] + 192);
- Vector256 s07 = Avx.LoadVector256(srcStartPtrs[i] + 224);
-
- d00 = Avx2.Xor(d00, s00);
- d01 = Avx2.Xor(d01, s01);
- d02 = Avx2.Xor(d02, s02);
- d03 = Avx2.Xor(d03, s03);
- d04 = Avx2.Xor(d04, s04);
- d05 = Avx2.Xor(d05, s05);
- d06 = Avx2.Xor(d06, s06);
- d07 = Avx2.Xor(d07, s07);
- srcStartPtrs[i] += batchSize;
- }
-
- Avx.Store(dstCurr, d00);
- Avx.Store(dstCurr + 32, d01);
- Avx.Store(dstCurr + 64, d02);
- Avx.Store(dstCurr + 96, d03);
- Avx.Store(dstCurr + 128, d04);
- Avx.Store(dstCurr + 160, d05);
- Avx.Store(dstCurr + 192, d06);
- Avx.Store(dstCurr + 224, d07);
-
- dstCurr += batchSize;
- }
- #endregion
+ var d00 = Vector512.Load(firstPtr + (Vector512<byte>.Count * 0));
+ var d01 = Vector512.Load(firstPtr + (Vector512<byte>.Count * 1));
+ var d02 = Vector512.Load(firstPtr + (Vector512<byte>.Count * 2));
+ var d03 = Vector512.Load(firstPtr + (Vector512<byte>.Count * 3));
+ var d04 = Vector512.Load(firstPtr + (Vector512<byte>.Count * 4));
+ var d05 = Vector512.Load(firstPtr + (Vector512<byte>.Count * 5));
+ var d06 = Vector512.Load(firstPtr + (Vector512<byte>.Count * 6));
+ var d07 = Vector512.Load(firstPtr + (Vector512<byte>.Count * 7));
+
+ firstPtr += Vector512<byte>.Count * 8;
+
+ for (var i = 1; i < srcCount; i++)
+ {
+ ref var startPtr = ref srcStartPtrs[i];
+
+ var s00 = Vector512.Load(startPtr + (Vector512<byte>.Count * 0));
+ var s01 = Vector512.Load(startPtr + (Vector512<byte>.Count * 1));
+ var s02 = Vector512.Load(startPtr + (Vector512<byte>.Count * 2));
+ var s03 = Vector512.Load(startPtr + (Vector512<byte>.Count * 3));
+ var s04 = Vector512.Load(startPtr + (Vector512<byte>.Count * 4));
+ var s05 = Vector512.Load(startPtr + (Vector512<byte>.Count * 5));
+ var s06 = Vector512.Load(startPtr + (Vector512<byte>.Count * 6));
+ var s07 = Vector512.Load(startPtr + (Vector512<byte>.Count * 7));
+
+ d00 = TBinaryOperator.Invoke(d00, s00);
+ d01 = TBinaryOperator.Invoke(d01, s01);
+ d02 = TBinaryOperator.Invoke(d02, s02);
+ d03 = TBinaryOperator.Invoke(d03, s03);
+ d04 = TBinaryOperator.Invoke(d04, s04);
+ d05 = TBinaryOperator.Invoke(d05, s05);
+ d06 = TBinaryOperator.Invoke(d06, s06);
+ d07 = TBinaryOperator.Invoke(d07, s07);
+
+ startPtr += Vector512<byte>.Count * 8;
+ }
- #region 1x32
- slen = stail;
- batchSize = 1 * 32;
- stail = slen & (batchSize - 1);
- dstEnd = dstCurr + (slen - stail);
+ Vector512.Store(d00, dstPtr + (Vector512.Count * 0));
+ Vector512.Store(d01, dstPtr + (Vector512.Count * 1));
+ Vector512.Store(d02, dstPtr + (Vector512.Count * 2));
+ Vector512.Store(d03, dstPtr + (Vector512.Count * 3));
+ Vector512.Store(d04, dstPtr + (Vector512.Count * 4));
+ Vector512.Store(d05, dstPtr + (Vector512.Count * 5));
+ Vector512.Store(d06, dstPtr + (Vector512.Count * 6));
+ Vector512.Store(d07, dstPtr + (Vector512.Count * 7));
- while (dstCurr < dstEnd)
- {
- Vector256 d00 = Avx.LoadVector256(srcStartPtrs[0]);
- srcStartPtrs[0] += batchSize;
- for (int i = 1; i < srcKeyCount; i++)
- {
- Vector256 s00 = Avx.LoadVector256(srcStartPtrs[i]);
- d00 = Avx2.Xor(d00, s00);
- srcStartPtrs[i] += batchSize;
+ dstPtr += Vector512.Count * 8;
}
- Avx.Store(dstCurr, d00);
- dstCurr += batchSize;
}
- #endregion
-
- #region scalar_4x8
- slen = stail;
- batchSize = 4 * 8;
- stail = slen & (batchSize - 1);
- dstEnd = dstCurr + (slen - stail);
- while (dstCurr < dstEnd)
- {
- long d00 = *(long*)(srcStartPtrs[0]);
- long d01 = *(long*)(srcStartPtrs[0] + 8);
- long d02 = *(long*)(srcStartPtrs[0] + 16);
- long d03 = *(long*)(srcStartPtrs[0] + 24);
- srcStartPtrs[0] += batchSize;
- for (int i = 1; i < srcKeyCount; i++)
- {
- d00 ^= *(long*)(srcStartPtrs[i]);
- d01 ^= *(long*)(srcStartPtrs[i] + 8);
- d02 ^= *(long*)(srcStartPtrs[i] + 16);
- d03 ^= *(long*)(srcStartPtrs[i] + 24);
- srcStartPtrs[i] += batchSize;
- }
- *(long*)dstCurr = d00;
- *(long*)(dstCurr + 8) = d01;
- *(long*)(dstCurr + 16) = d02;
- *(long*)(dstCurr + 24) = d03;
- dstCurr += batchSize;
- }
- if (stail == 0) goto fillTail;
- #endregion
-
- #region scalar_1x8
- slen = stail;
- batchSize = 8;
- stail = slen & (batchSize - 1);
- dstEnd = dstCurr + (slen - stail);
- while (dstCurr < dstEnd)
+ static void Vectorized256(ref byte* firstPtr, int srcCount, byte** srcStartPtrs, ref byte* dstPtr, byte* dstBatchEndPtr)
{
- long d00 = *(long*)(srcStartPtrs[0]);
- srcStartPtrs[0] += batchSize;
- for (int i = 1; i < srcKeyCount; i++)
+ while (dstPtr < dstBatchEndPtr)
{
- d00 ^= *(long*)(srcStartPtrs[i]);
- srcStartPtrs[i] += batchSize;
+ var d00 = Vector256.Load(firstPtr + (Vector256.Count * 0));
+ var d01 = Vector256.Load(firstPtr + (Vector256.Count * 1));
+ var d02 = Vector256.Load(firstPtr + (Vector256.Count * 2));
+ var d03 = Vector256.Load(firstPtr + (Vector256.Count * 3));
+ var d04 = Vector256.Load(firstPtr + (Vector256.Count * 4));
+ var d05 = Vector256.Load(firstPtr + (Vector256.Count * 5));
+ var d06 = Vector256.Load(firstPtr + (Vector256.Count * 6));
+ var d07 = Vector256.Load(firstPtr + (Vector256.Count * 7));
+
+ firstPtr += Vector256.Count * 8;
+
+ for (var i = 1; i < srcCount; i++)
+ {
+ ref var startPtr = ref srcStartPtrs[i];
+
+ var s00 = Vector256.Load(startPtr + (Vector256.Count * 0));
+ var s01 = Vector256.Load(startPtr + (Vector256.Count * 1));
+ var s02 = Vector256.Load(startPtr + (Vector256.Count * 2));
+ var s03 = Vector256.Load(startPtr + (Vector256.Count * 3));
+ var s04 = Vector256.Load(startPtr + (Vector256.Count * 4));
+ var s05 = Vector256.Load(startPtr + (Vector256.Count * 5));
+ var s06 = Vector256.Load(startPtr + (Vector256.Count * 6));
+ var s07 = Vector256.Load(startPtr + (Vector256.Count * 7));
+
+ d00 = TBinaryOperator.Invoke(d00, s00);
+ d01 = TBinaryOperator.Invoke(d01, s01);
+ d02 = TBinaryOperator.Invoke(d02, s02);
+ d03 = TBinaryOperator.Invoke(d03, s03);
+ d04 = TBinaryOperator.Invoke(d04, s04);
+ d05 = TBinaryOperator.Invoke(d05, s05);
+ d06 = TBinaryOperator.Invoke(d06, s06);
+ d07 = TBinaryOperator.Invoke(d07, s07);
+
+ startPtr += Vector256.Count * 8;
+ }
+
+ Vector256.Store(d00, dstPtr + (Vector256.Count * 0));
+ Vector256.Store(d01, dstPtr + (Vector256.Count * 1));
+ Vector256.Store(d02, dstPtr + (Vector256.Count * 2));
+ Vector256.Store(d03, dstPtr + (Vector256.Count * 3));
+ Vector256.Store(d04, dstPtr + (Vector256.Count * 4));
+ Vector256.Store(d05, dstPtr + (Vector256.Count * 5));
+ Vector256.Store(d06, dstPtr + (Vector256.Count * 6));
+ Vector256.Store(d07, dstPtr + (Vector256.Count * 7));
+
+ dstPtr += Vector256.Count * 8;
}
- *(long*)dstCurr = d00;
- dstCurr += batchSize;
}
- #endregion
- fillTail:
- #region scalar_1x1
- byte* dstMaxEnd = dstPtr + dstLen;
- while (dstCurr < dstMaxEnd)
+ static void Vectorized128(ref byte* firstPtr, int srcCount, byte** srcStartPtrs, ref byte* dstPtr, byte* dstBatchEndPtr)
{
- byte d00 = 0;
- if (srcStartPtrs[0] < srcEndPtrs[0])
+ while (dstPtr < dstBatchEndPtr)
{
- d00 = *srcStartPtrs[0];
- srcStartPtrs[0]++;
- }
-
- for (int i = 1; i < srcKeyCount; i++)
- {
- if (srcStartPtrs[i] < srcEndPtrs[i])
+ var d00 = Vector128.Load(firstPtr + (Vector128.Count * 0));
+ var d01 = Vector128.Load(firstPtr + (Vector128.Count * 1));
+ var d02 = Vector128.Load(firstPtr + (Vector128.Count * 2));
+ var d03 = Vector128.Load(firstPtr + (Vector128.Count * 3));
+ var d04 = Vector128.Load(firstPtr + (Vector128.Count * 4));
+ var d05 = Vector128.Load(firstPtr + (Vector128.Count * 5));
+ var d06 = Vector128.Load(firstPtr + (Vector128.Count * 6));
+ var d07 = Vector128.Load(firstPtr + (Vector128.Count * 7));
+
+ firstPtr += Vector128.Count * 8;
+
+ for (var i = 1; i < srcCount; i++)
{
- d00 ^= *srcStartPtrs[i];
- srcStartPtrs[i]++;
+ ref var startPtr = ref srcStartPtrs[i];
+
+ var s00 = Vector128.Load(startPtr + (Vector128.Count * 0));
+ var s01 = Vector128.Load(startPtr + (Vector128.Count * 1));
+ var s02 = Vector128.Load(startPtr + (Vector128.Count * 2));
+ var s03 = Vector128.Load(startPtr + (Vector128.Count * 3));
+ var s04 = Vector128.Load(startPtr + (Vector128.Count * 4));
+ var s05 = Vector128.Load(startPtr + (Vector128.Count * 5));
+ var s06 = Vector128.Load(startPtr + (Vector128.Count * 6));
+ var s07 = Vector128.Load(startPtr + (Vector128.Count * 7));
+
+ d00 = TBinaryOperator.Invoke(d00, s00);
+ d01 = TBinaryOperator.Invoke(d01, s01);
+ d02 = TBinaryOperator.Invoke(d02, s02);
+ d03 = TBinaryOperator.Invoke(d03, s03);
+ d04 = TBinaryOperator.Invoke(d04, s04);
+ d05 = TBinaryOperator.Invoke(d05, s05);
+ d06 = TBinaryOperator.Invoke(d06, s06);
+ d07 = TBinaryOperator.Invoke(d07, s07);
+
+ startPtr += Vector128.Count * 8;
}
+
+ Vector128.Store(d00, dstPtr + (Vector128.Count * 0));
+ Vector128.Store(d01, dstPtr + (Vector128.Count * 1));
+ Vector128.Store(d02, dstPtr + (Vector128.Count * 2));
+ Vector128.Store(d03, dstPtr + (Vector128.Count * 3));
+ Vector128.Store(d04, dstPtr + (Vector128.Count * 4));
+ Vector128.Store(d05, dstPtr + (Vector128.Count * 5));
+ Vector128.Store(d06, dstPtr + (Vector128.Count * 6));
+ Vector128.Store(d07, dstPtr + (Vector128.Count * 7));
+
+ dstPtr += Vector128.Count * 8;
}
- *dstCurr++ = d00;
}
- #endregion
}
-
}
}
\ No newline at end of file
diff --git a/libs/server/Resp/CmdStrings.cs b/libs/server/Resp/CmdStrings.cs
index e0a4a29eb77..cd3263aa808 100644
--- a/libs/server/Resp/CmdStrings.cs
+++ b/libs/server/Resp/CmdStrings.cs
@@ -269,6 +269,7 @@ static partial class CmdStrings
public static ReadOnlySpan RESP_WRONGPASS_INVALID_USERNAME_PASSWORD => "WRONGPASS Invalid username/password combination"u8;
public static ReadOnlySpan RESP_SYNTAX_ERROR => "ERR syntax error"u8;
public static ReadOnlySpan RESP_ERR_BITOP_KEY_LIMIT => "ERR Bitop source key limit (64) exceeded"u8;
+ public static ReadOnlySpan RESP_ERR_BITOP_DIFF_TWO_SOURCE_KEYS_REQUIRED => "ERR BITOP DIFF must be called with at least two source keys."u8;
public static ReadOnlySpan RESP_ERR_COUNT_IS_NOT_POSITIVE => "ERR COUNT must be > 0"u8;
public static ReadOnlySpan RESP_ERR_COUNT_IS_OUT_OF_RANGE_N1 => "ERR count should be greater than or equal to -1."u8;
public static ReadOnlySpan RESP_ERR_MODULE_LOADED_TYPES => "ERR Unable to load types from module. Ensure that the module is compatible with the current runtime."u8;
@@ -525,6 +526,7 @@ static partial class CmdStrings
public static ReadOnlySpan LUA_OR => "OR"u8;
public static ReadOnlySpan LUA_XOR => "XOR"u8;
public static ReadOnlySpan LUA_NOT => "NOT"u8;
+ public static ReadOnlySpan LUA_DIFF => "DIFF"u8;
public static ReadOnlySpan LUA_KEYS => "KEYS"u8;
public static ReadOnlySpan LUA_ARGV => "ARGV"u8;
public static ReadOnlySpan EXPDELSCAN => "EXPDELSCAN"u8;
diff --git a/libs/server/Resp/Parser/RespCommand.cs b/libs/server/Resp/Parser/RespCommand.cs
index 9d4224d56c8..cc81121b1df 100644
--- a/libs/server/Resp/Parser/RespCommand.cs
+++ b/libs/server/Resp/Parser/RespCommand.cs
@@ -220,7 +220,8 @@ public enum RespCommand : ushort
BITOP_AND,
BITOP_OR,
BITOP_XOR,
- BITOP_NOT, // Note: Update LastWriteCommand if adding new write commands after this
+ BITOP_NOT,
+ BITOP_DIFF, // Note: Update LastWriteCommand if adding new write commands after this
// Script execution commands
EVAL,
@@ -401,7 +402,7 @@ public enum RespCommand : ushort
public static class RespCommandExtensions
{
private static readonly RespCommand[] ExpandedSET = [RespCommand.SETEXNX, RespCommand.SETEXXX, RespCommand.SETKEEPTTL, RespCommand.SETKEEPTTLXX];
- private static readonly RespCommand[] ExpandedBITOP = [RespCommand.BITOP_AND, RespCommand.BITOP_NOT, RespCommand.BITOP_OR, RespCommand.BITOP_XOR];
+ private static readonly RespCommand[] ExpandedBITOP = [RespCommand.BITOP_AND, RespCommand.BITOP_NOT, RespCommand.BITOP_OR, RespCommand.BITOP_XOR, RespCommand.BITOP_DIFF];
// Commands that are either returning static data or commands that cannot have issues from concurrent AOF interaction in another session
private static readonly RespCommand[] AofIndependentCommands = [
@@ -516,7 +517,7 @@ public static RespCommand NormalizeForACLs(this RespCommand cmd)
RespCommand.SETEXXX => RespCommand.SET,
RespCommand.SETKEEPTTL => RespCommand.SET,
RespCommand.SETKEEPTTLXX => RespCommand.SET,
- RespCommand.BITOP_AND or RespCommand.BITOP_NOT or RespCommand.BITOP_OR or RespCommand.BITOP_XOR => RespCommand.BITOP,
+ RespCommand.BITOP_AND or RespCommand.BITOP_NOT or RespCommand.BITOP_OR or RespCommand.BITOP_XOR or RespCommand.BITOP_DIFF => RespCommand.BITOP,
_ => cmd
};
}
@@ -541,7 +542,7 @@ public static ReadOnlySpan ExpandForACLs(this RespCommand cmd)
internal const RespCommand FirstWriteCommand = RespCommand.APPEND;
- internal const RespCommand LastWriteCommand = RespCommand.BITOP_NOT;
+ internal const RespCommand LastWriteCommand = RespCommand.BITOP_DIFF;
internal const RespCommand LastDataCommand = RespCommand.EVALSHA;
@@ -986,44 +987,64 @@ private RespCommand FastParseArrayCommand(ref int count, ref ReadOnlySpan
// Check for matching bit-operation
if (remainingBytes > length + 6 + 8)
{
- // TODO: AND|OR|XOR|NOT may not correctly handle mixed cases?
+ // TODO: the AND|OR|XOR|NOT|DIFF matchers compare only against all-uppercase and all-lowercase token literals, so mixed-case input (e.g. "Xor") is not matched here
- // 2-character operations
- if (*(uint*)(ptr + 11) == MemoryMarshal.Read("$2\r\n"u8))
+ var tag64 = *(ulong*)(ptr + 11);
+ var tag32 = (uint)tag64;
+
+ if (tag32 == MemoryMarshal.Read("$2\r\n"u8))
{
- if (*(ulong*)(ptr + 11) == MemoryMarshal.Read("$2\r\nOR\r\n"u8) || *(ulong*)(ptr + 11) == MemoryMarshal.Read("$2\r\nor\r\n"u8))
+ if (tag64 == MemoryMarshal.Read("$2\r\nOR\r\n"u8) || tag64 == MemoryMarshal.Read("$2\r\nor\r\n"u8))
{
- readHead += 8;
+ readHead += 8; // "$2\r\n" + "OR" + "\r\n"
count -= 1;
return RespCommand.BITOP_OR;
}
}
- // 3-character operations
- else if (remainingBytes > length + 6 + 9)
+ else if (tag32 == MemoryMarshal.Read("$3\r\n"u8) && remainingBytes > length + 6 + 9)
{
- if (*(uint*)(ptr + 11) == MemoryMarshal.Read("$3\r\n"u8))
+ // Optimistically adjust read head and count before matching AND|XOR|NOT
+ readHead += 9; // "$3\r\n" + AND|XOR|NOT + "\r\n"
+ count -= 1;
+
+ tag64 = *(ulong*)(ptr + 12);
+
+ if (tag64 == MemoryMarshal.Read("3\r\nAND\r\n"u8) || tag64 == MemoryMarshal.Read("3\r\nand\r\n"u8))
{
- // Optimistically adjust read head and count
- readHead += 9;
- count -= 1;
+ return RespCommand.BITOP_AND;
+ }
+ else if (tag64 == MemoryMarshal.Read("3\r\nXOR\r\n"u8) || tag64 == MemoryMarshal.Read("3\r\nxor\r\n"u8))
+ {
+ return RespCommand.BITOP_XOR;
+ }
+ else if (tag64 == MemoryMarshal.Read