diff --git a/Directory.Packages.props b/Directory.Packages.props
index 4a456017a76..dc98d01c92b 100644
--- a/Directory.Packages.props
+++ b/Directory.Packages.props
@@ -22,10 +22,11 @@
-
+
+
diff --git a/benchmark/BDN.benchmark/Bitmap/BinaryOperations.cs b/benchmark/BDN.benchmark/Bitmap/BinaryOperations.cs
new file mode 100644
index 00000000000..a600bcb0ad4
--- /dev/null
+++ b/benchmark/BDN.benchmark/Bitmap/BinaryOperations.cs
@@ -0,0 +1,81 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+using System.Runtime.InteropServices;
+using BenchmarkDotNet.Attributes;
+using Garnet.server;
+
+namespace BDN.benchmark.Bitmap
+{
+ public unsafe class BinaryOperations
+ {
+ private const int Alignment = 64;
+
+ [ParamsSource(nameof(GetBitmapSizes))]
+ public Sizes BitmapSizes { get; set; }
+
+ [Params(BitmapOperation.XOR)]
+ public BitmapOperation Op { get; set; }
+
+ public IEnumerable<Sizes> GetBitmapSizes()
+ {
+ yield return new([1 << 21, 1 << 21]);
+ yield return new([1 << 21, (1 << 21) + 1]);
+
+ yield return new([1 << 21, 1 << 21, 1 << 21]);
+ yield return new([1 << 21, 1 << 21, (1 << 21) + 1]);
+
+ yield return new([256, 6 * 512 + 7, 512, 1024]);
+ }
+
+ private int minBitmapSize;
+ private byte** srcPtrs;
+ private byte** srcEndPtrs;
+
+ private int dstLength;
+ private byte* dstPtr;
+
+ [GlobalSetup]
+ public void GlobalSetup_Binary()
+ {
+ minBitmapSize = BitmapSizes.Values.Min();
+ srcPtrs = (byte**)NativeMemory.AllocZeroed((nuint)BitmapSizes.Values.Length, (nuint)sizeof(byte*));
+ srcEndPtrs = (byte**)NativeMemory.AllocZeroed((nuint)BitmapSizes.Values.Length, (nuint)sizeof(byte*));
+
+ for (var i = 0; i < BitmapSizes.Values.Length; i++)
+ {
+ srcPtrs[i] = (byte*)NativeMemory.AlignedAlloc((nuint)BitmapSizes.Values[i], Alignment);
+ srcEndPtrs[i] = srcPtrs[i] + BitmapSizes.Values[i];
+
+ new Random(i).NextBytes(new Span<byte>(srcPtrs[i], BitmapSizes.Values[i]));
+ }
+
+ dstLength = BitmapSizes.Values.Max();
+ dstPtr = (byte*)NativeMemory.AlignedAlloc((nuint)dstLength, Alignment);
+ }
+
+ [Benchmark]
+ public void BinaryOperation()
+ {
+ BitmapManager.InvokeBitOperationUnsafe(Op, BitmapSizes.Values.Length, srcPtrs, srcEndPtrs, dstPtr, dstLength, minBitmapSize);
+ }
+
+ [GlobalCleanup]
+ public void GlobalCleanup()
+ {
+ for (var i = 0; i < BitmapSizes.Values.Length; i++)
+ {
+ NativeMemory.AlignedFree(srcPtrs[i]);
+ }
+
+ NativeMemory.Free(srcPtrs);
+ NativeMemory.Free(srcEndPtrs);
+ NativeMemory.AlignedFree(dstPtr);
+ }
+
+ public record struct Sizes(int[] Values)
+ {
+ public override string ToString() => string.Join(", ", Values);
+ }
+ }
+}
\ No newline at end of file
diff --git a/benchmark/BDN.benchmark/Bitmap/UnaryOperations.cs b/benchmark/BDN.benchmark/Bitmap/UnaryOperations.cs
new file mode 100644
index 00000000000..c1b2fc0bcaf
--- /dev/null
+++ b/benchmark/BDN.benchmark/Bitmap/UnaryOperations.cs
@@ -0,0 +1,59 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+using System.Runtime.InteropServices;
+using BenchmarkDotNet.Attributes;
+using Garnet.server;
+
+namespace BDN.benchmark.Bitmap
+{
+ public unsafe partial class UnaryOperations
+ {
+ private const int Alignment = 64;
+
+ [ParamsSource(nameof(GetBitmapSize))]
+ public int BitmapSize { get; set; }
+
+ public IEnumerable<int> GetBitmapSize()
+ {
+ yield return 256;
+ yield return 1 << 21;
+ }
+
+ private const int Keys = 1;
+ private byte** srcPtrs;
+ private byte** srcEndPtrs;
+
+ private byte* dstPtr;
+
+ [GlobalSetup]
+ public void GlobalSetup_Unary()
+ {
+ srcPtrs = (byte**)NativeMemory.AllocZeroed(Keys, (nuint)sizeof(byte*));
+ srcEndPtrs = (byte**)NativeMemory.AllocZeroed(Keys, (nuint)sizeof(byte*));
+
+ srcPtrs[0] = (byte*)NativeMemory.AlignedAlloc((uint)BitmapSize, Alignment);
+ srcEndPtrs[0] = srcPtrs[0] + (uint)BitmapSize;
+
+ new Random(0).NextBytes(new Span<byte>(srcPtrs[0], BitmapSize));
+
+ dstPtr = (byte*)NativeMemory.AlignedAlloc((nuint)BitmapSize, Alignment);
+ }
+
+ [Benchmark]
+ public void BitOperation_NOT()
+ {
+ BitmapManager.InvokeBitOperationUnsafe(BitmapOperation.NOT, Keys, srcPtrs, srcEndPtrs, dstPtr, BitmapSize, BitmapSize);
+ }
+
+ [GlobalCleanup]
+ public void GlobalCleanup()
+ {
+ NativeMemory.AlignedFree(srcPtrs[0]);
+
+ NativeMemory.Free(srcPtrs);
+ NativeMemory.Free(srcEndPtrs);
+ NativeMemory.AlignedFree(dstPtr);
+ }
+ }
+}
\ No newline at end of file
diff --git a/libs/common/Numerics/IBinaryOperator.cs b/libs/common/Numerics/IBinaryOperator.cs
new file mode 100644
index 00000000000..2abf864851d
--- /dev/null
+++ b/libs/common/Numerics/IBinaryOperator.cs
@@ -0,0 +1,80 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+using System.Numerics;
+using System.Runtime.Intrinsics;
+
+namespace Garnet.common.Numerics
+{
+ /// Operator that takes two input values and returns a single value.
+ public interface IBinaryOperator
+ {
+ ///
+ /// Computes the binary operation of two scalar values.
+ ///
+ static abstract T Invoke<T>(T x, T y) where T : IBinaryInteger<T>;
+
+ ///
+ /// Computes the binary operation of two vectors.
+ ///
+ static abstract Vector128<byte> Invoke(Vector128<byte> x, Vector128<byte> y);
+
+ ///
+ static abstract Vector256<byte> Invoke(Vector256<byte> x, Vector256<byte> y);
+
+ ///
+ static abstract Vector512<byte> Invoke(Vector512<byte> x, Vector512<byte> y);
+ }
+
+ /// x & y
+ public readonly struct BitwiseAndOperator : IBinaryOperator
+ {
+ ///
+ public static T Invoke<T>(T x, T y) where T : IBinaryInteger<T> => x & y;
+ ///
+ public static Vector128<byte> Invoke(Vector128<byte> x, Vector128<byte> y) => x & y;
+ ///
+ public static Vector256<byte> Invoke(Vector256<byte> x, Vector256<byte> y) => x & y;
+ ///
+ public static Vector512<byte> Invoke(Vector512<byte> x, Vector512<byte> y) => x & y;
+ }
+
+ /// x | y
+ public readonly struct BitwiseOrOperator : IBinaryOperator
+ {
+ ///
+ public static T Invoke<T>(T x, T y) where T : IBinaryInteger<T> => x | y;
+ ///
+ public static Vector128<byte> Invoke(Vector128<byte> x, Vector128<byte> y) => x | y;
+ ///
+ public static Vector256<byte> Invoke(Vector256<byte> x, Vector256<byte> y) => x | y;
+ ///
+ public static Vector512<byte> Invoke(Vector512<byte> x, Vector512<byte> y) => x | y;
+ }
+
+ /// x ^ y
+ public readonly struct BitwiseXorOperator : IBinaryOperator
+ {
+ ///
+ public static T Invoke<T>(T x, T y) where T : IBinaryInteger<T> => x ^ y;
+ ///
+ public static Vector128<byte> Invoke(Vector128<byte> x, Vector128<byte> y) => x ^ y;
+ ///
+ public static Vector256<byte> Invoke(Vector256<byte> x, Vector256<byte> y) => x ^ y;
+ ///
+ public static Vector512<byte> Invoke(Vector512<byte> x, Vector512<byte> y) => x ^ y;
+ }
+
+ /// x & ~y
+ public readonly struct BitwiseAndNotOperator : IBinaryOperator
+ {
+ ///
+ public static T Invoke<T>(T x, T y) where T : IBinaryInteger<T> => x & ~y;
+ ///
+ public static Vector128<byte> Invoke(Vector128<byte> x, Vector128<byte> y) => x & ~y;
+ ///
+ public static Vector256<byte> Invoke(Vector256<byte> x, Vector256<byte> y) => x & ~y;
+ ///
+ public static Vector512<byte> Invoke(Vector512<byte> x, Vector512<byte> y) => x & ~y;
+ }
+}
\ No newline at end of file
diff --git a/libs/server/Garnet.server.csproj b/libs/server/Garnet.server.csproj
index 15939de0249..2c351e80f45 100644
--- a/libs/server/Garnet.server.csproj
+++ b/libs/server/Garnet.server.csproj
@@ -20,6 +20,7 @@
+
diff --git a/libs/server/Lua/LuaRunner.Functions.cs b/libs/server/Lua/LuaRunner.Functions.cs
index 920ee638544..e5b36a83dd8 100644
--- a/libs/server/Lua/LuaRunner.Functions.cs
+++ b/libs/server/Lua/LuaRunner.Functions.cs
@@ -2857,6 +2857,7 @@ internal int AclCheckCommand(nint luaStatePtr)
case RespCommand.BITOP_OR: state.PushConstantString(constStrs.OR); break;
case RespCommand.BITOP_XOR: state.PushConstantString(constStrs.XOR); break;
case RespCommand.BITOP_NOT: state.PushConstantString(constStrs.NOT); break;
+ case RespCommand.BITOP_DIFF: state.PushConstantString(constStrs.DIFF); break;
default: throw new InvalidOperationException($"Unexpected BITOP sub command: {subCommand}");
}
diff --git a/libs/server/Lua/LuaRunner.Strings.cs b/libs/server/Lua/LuaRunner.Strings.cs
index 6492d81d503..372900c0216 100644
--- a/libs/server/Lua/LuaRunner.Strings.cs
+++ b/libs/server/Lua/LuaRunner.Strings.cs
@@ -161,6 +161,8 @@ private readonly struct ConstantStringRegistryIndexes
internal int XOR { get; }
///
internal int NOT { get; }
+ ///
+ internal int DIFF { get; }
///
internal int KEYS { get; }
///
@@ -246,6 +248,7 @@ internal ConstantStringRegistryIndexes(ref LuaStateWrapper state)
OR = ConstantStringToRegistry(ref state, CmdStrings.LUA_OR);
XOR = ConstantStringToRegistry(ref state, CmdStrings.LUA_XOR);
NOT = ConstantStringToRegistry(ref state, CmdStrings.LUA_NOT);
+ DIFF = ConstantStringToRegistry(ref state, CmdStrings.LUA_DIFF);
KEYS = ConstantStringToRegistry(ref state, CmdStrings.LUA_KEYS);
ARGV = ConstantStringToRegistry(ref state, CmdStrings.LUA_ARGV);
}
diff --git a/libs/server/Resp/Bitmap/BitmapCommands.cs b/libs/server/Resp/Bitmap/BitmapCommands.cs
index ce50482c0a0..a15acc877c0 100644
--- a/libs/server/Resp/Bitmap/BitmapCommands.cs
+++ b/libs/server/Resp/Bitmap/BitmapCommands.cs
@@ -40,7 +40,12 @@ public enum BitmapOperation : byte
///
/// NOT
///
- NOT
+ NOT,
+
+ ///
+ /// DIFF
+ ///
+ DIFF
}
internal enum BitFieldOverflow : byte
@@ -317,6 +322,11 @@ private bool NetworkStringBitOperation(BitmapOperation bitOp, ref TG
return AbortWithErrorMessage(CmdStrings.RESP_ERR_WRONG_NUMBER_OF_ARGUMENTS);
}
+ if (bitOp == BitmapOperation.DIFF && parseState.Count < 3)
+ {
+ return AbortWithErrorMessage(CmdStrings.RESP_ERR_BITOP_DIFF_TWO_SOURCE_KEYS_REQUIRED);
+ }
+
if (parseState.Count > 64)
{
return AbortWithErrorMessage(CmdStrings.RESP_ERR_BITOP_KEY_LIMIT);
diff --git a/libs/server/Resp/Bitmap/BitmapManagerBitOp.cs b/libs/server/Resp/Bitmap/BitmapManagerBitOp.cs
index 6917e259935..88835d65624 100644
--- a/libs/server/Resp/Bitmap/BitmapManagerBitOp.cs
+++ b/libs/server/Resp/Bitmap/BitmapManagerBitOp.cs
@@ -1,650 +1,336 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
+using System;
+using System.Diagnostics;
+using System.Numerics.Tensors;
+using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
-using System.Runtime.Intrinsics.X86;
using Garnet.common;
-
+using Garnet.common.Numerics;
namespace Garnet.server
{
public unsafe partial class BitmapManager
{
///
- /// BitOp main driver.
+ /// Performs a bitwise operation across one or more source buffers and writes the result to the destination buffer.
///
- /// Output buffer to write BitOp result
- /// Output buffer length.
- /// Array of pointers to bitmaps used as input in the corresponding bitop.
- /// Array of pointers to bitmap sources.
- /// Number of source keys.
- /// Minimum size of source bitmap.
- /// Type of bitop operation being executed.
- ///
- public static bool BitOpMainUnsafeMultiKey(byte* dstPtr, int dstLen, byte** srcStartPtrs, byte** srcEndPtrs, int srcKeyCount, int minSize, byte bitop)
+ /// The bitwise operation to perform.
+ /// Number of source buffers
+ /// Array of pointers to source buffers. The array length must be greater than or equal to
+ /// Array of the buffer lengths specified in . The array length must be greater than or equal to
+ /// Destination buffer to write the result.
+ /// Destination buffer length.
+ /// The length of shortest source buffer.
+ public static void InvokeBitOperationUnsafe(BitmapOperation op, int srcCount, byte** srcPtrs, byte** srcEndPtrs, byte* dstPtr, int dstLength, int shortestSrcLength)
{
- switch (bitop)
- {
- case (byte)BitmapOperation.NOT:
- __bitop_multikey_simdX256_not(dstPtr, dstLen, srcStartPtrs[0], srcEndPtrs[0] - srcStartPtrs[0]);
- break;
- case (byte)BitmapOperation.AND:
- __bitop_multikey_simdX256_and(dstPtr, dstLen, srcStartPtrs, srcEndPtrs, srcKeyCount, minSize);
- break;
- case (byte)BitmapOperation.OR:
- __bitop_multikey_simdX256_or(dstPtr, dstLen, srcStartPtrs, srcEndPtrs, srcKeyCount, minSize);
- break;
- case (byte)BitmapOperation.XOR:
- __bitop_multikey_simdX256_xor(dstPtr, dstLen, srcStartPtrs, srcEndPtrs, srcKeyCount, minSize);
- break;
- default:
- throw new GarnetException("Unsupported BitOp command");
- }
- return true;
- }
+ Debug.Assert(op is BitmapOperation.NOT or BitmapOperation.AND or BitmapOperation.OR or BitmapOperation.XOR or BitmapOperation.DIFF);
+ Debug.Assert(srcCount > 0);
+ Debug.Assert(dstLength >= 0 && shortestSrcLength >= 0);
+ Debug.Assert(dstLength >= shortestSrcLength);
- ///
- /// Negation bitop implementation using 256-wide SIMD registers.
- ///
- /// Output buffer to write BitOp result
- /// Output buffer length.
- /// Pointer to source bitmap.
- /// Source bitmap length.
- private static void __bitop_multikey_simdX256_not(byte* dstPtr, long dstLen, byte* srcBitmap, long srcLen)
- {
- int batchSize = 8 * 32;
- long slen = srcLen;
- long stail = slen & (batchSize - 1);
-
- //iterate using srcBitmap because always dstLen >= srcLen
- byte* srcCurr = srcBitmap;
- byte* srcEnd = srcCurr + (slen - stail);
- byte* dstCurr = dstPtr;
-
- #region 8x32
- while (srcCurr < srcEnd)
- {
- Vector256 d00 = Avx.LoadVector256(srcCurr);
- Vector256 d01 = Avx.LoadVector256(srcCurr + 32);
- Vector256 d02 = Avx.LoadVector256(srcCurr + 64);
- Vector256 d03 = Avx.LoadVector256(srcCurr + 96);
- Vector256 d04 = Avx.LoadVector256(srcCurr + 128);
- Vector256 d05 = Avx.LoadVector256(srcCurr + 160);
- Vector256 d06 = Avx.LoadVector256(srcCurr + 192);
- Vector256 d07 = Avx.LoadVector256(srcCurr + 224);
-
- Avx.Store(dstCurr, Avx2.Xor(d00, Vector256.AllBitsSet));
- Avx.Store(dstCurr + 32, Avx2.Xor(d01, Vector256.AllBitsSet));
- Avx.Store(dstCurr + 64, Avx2.Xor(d02, Vector256.AllBitsSet));
- Avx.Store(dstCurr + 96, Avx2.Xor(d03, Vector256.AllBitsSet));
- Avx.Store(dstCurr + 128, Avx2.Xor(d04, Vector256.AllBitsSet));
- Avx.Store(dstCurr + 160, Avx2.Xor(d05, Vector256.AllBitsSet));
- Avx.Store(dstCurr + 192, Avx2.Xor(d06, Vector256.AllBitsSet));
- Avx.Store(dstCurr + 224, Avx2.Xor(d07, Vector256.AllBitsSet));
-
- srcCurr += batchSize;
- dstCurr += batchSize;
- }
- if (stail == 0) return;
- #endregion
-
- #region 1x32
- slen = stail;
- batchSize = 1 * 32;
- stail = slen & (batchSize - 1);
- srcEnd = srcCurr + (slen - stail);
- while (srcCurr < srcEnd)
- {
- Vector256 d00 = Avx.LoadVector256(srcCurr);
- Avx.Store(dstCurr, Avx2.Xor(d00, Vector256.AllBitsSet));
- srcCurr += batchSize;
- dstCurr += batchSize;
- }
- if (stail == 0) return;
- #endregion
-
- #region 4x8
- slen = stail;
- batchSize = 4 * 8;
- stail = slen & (batchSize - 1);
- srcEnd = srcCurr + (slen - stail);
- while (srcCurr < srcEnd)
+ if (srcCount == 1)
{
- long d00 = *(long*)(srcCurr);
- long d01 = *(long*)(srcCurr + 8);
- long d02 = *(long*)(srcCurr + 16);
- long d03 = *(long*)(srcCurr + 24);
-
- *(long*)dstCurr = ~d00;
- *(long*)(dstCurr + 8) = ~d01;
- *(long*)(dstCurr + 16) = ~d02;
- *(long*)(dstCurr + 24) = ~d03;
-
- srcCurr += batchSize;
- dstCurr += batchSize;
- }
- if (stail == 0) return;
- #endregion
-
- #region 1x8
- slen = stail;
- batchSize = 8;
- stail = slen & (batchSize - 1);
- srcEnd = srcCurr + (slen - stail);
- while (srcCurr < srcEnd)
- {
- long d00 = *(long*)(srcCurr);
+ if (op == BitmapOperation.DIFF) throw new GarnetException("BITOP DIFF operation requires at least two source bitmaps");
- *(long*)dstCurr = ~d00;
+ var srcBitmap = new ReadOnlySpan<byte>(srcPtrs[0], checked((int)(srcEndPtrs[0] - srcPtrs[0])));
+ var dstBitmap = new Span<byte>(dstPtr, dstLength);
- srcCurr += batchSize;
- dstCurr += batchSize;
+ if (op == BitmapOperation.NOT)
+ {
+ TensorPrimitives.OnesComplement(srcBitmap, dstBitmap);
+ }
+ else
+ {
+ srcBitmap.CopyTo(dstBitmap);
+ }
}
- if (stail == 0) return;
- #endregion
-
- if (stail >= 7) dstCurr[6] = (byte)(~srcCurr[6]);
- if (stail >= 6) dstCurr[5] = (byte)(~srcCurr[5]);
- if (stail >= 5) dstCurr[4] = (byte)(~srcCurr[4]);
- if (stail >= 4) dstCurr[3] = (byte)(~srcCurr[3]);
- if (stail >= 3) dstCurr[2] = (byte)(~srcCurr[2]);
- if (stail >= 2) dstCurr[1] = (byte)(~srcCurr[1]);
- if (stail >= 1) dstCurr[0] = (byte)(~srcCurr[0]);
+ // srcCount ≥ 2
+ else if (op == BitmapOperation.AND) InvokeNaryBitwiseOperation<BitwiseAndOperator>(srcCount, srcPtrs, srcEndPtrs, dstPtr, dstLength, shortestSrcLength);
+ else if (op == BitmapOperation.OR) InvokeNaryBitwiseOperation<BitwiseOrOperator>(srcCount, srcPtrs, srcEndPtrs, dstPtr, dstLength, shortestSrcLength);
+ else if (op == BitmapOperation.XOR) InvokeNaryBitwiseOperation<BitwiseXorOperator>(srcCount, srcPtrs, srcEndPtrs, dstPtr, dstLength, shortestSrcLength);
+ else if (op == BitmapOperation.DIFF) InvokeNaryBitwiseOperation<BitwiseAndNotOperator>(srcCount, srcPtrs, srcEndPtrs, dstPtr, dstLength, shortestSrcLength);
}
///
- /// AND bitop implementation using 256-wide SIMD registers.
+ /// Invokes bitwise binary operation across n-ary source bitmaps.
///
- /// Output buffer to write BitOp result
- /// Output buffer length.
- /// Pointer to start of bitmap sources.
- /// Pointer to end of bitmap sources
- /// Number of source keys.
- /// Minimum size of source bitmaps.
- private static void __bitop_multikey_simdX256_and(byte* dstPtr, int dstLen, byte** srcStartPtrs, byte** srcEndPtrs, int srcKeyCount, int minSize)
+ /// The binary operator type to compute bitwise
+ /// Number of source bitmaps.
+ /// Array of pointers to source bitmap buffers.
+ /// Array of the of pointers pointing to the end of the respective the bitmaps specified in .
+ /// Destination buffer to write the result.
+ /// Destination buffer length.
+ /// The length of shortest source buffer.
+ [SkipLocalsInit]
+ private static void InvokeNaryBitwiseOperation<TBinaryOperator>(int srcCount, byte** srcPtrs, byte** srcEndPtrs, byte* dstPtr, int dstLength, int shortestSrcLength)
+ where TBinaryOperator : struct, IBinaryOperator
{
- int batchSize = 8 * 32;
- long slen = minSize;
- long stail = slen & (batchSize - 1);
-
- byte* dstCurr = dstPtr;
- byte* dstEnd = dstCurr + (slen - stail);
+ var dstEndPtr = dstPtr + dstLength;
- #region 8x32
- while (dstCurr < dstEnd)
- {
- Vector256 d00 = Avx.LoadVector256(srcStartPtrs[0]);
- Vector256 d01 = Avx.LoadVector256(srcStartPtrs[0] + 32);
- Vector256 d02 = Avx.LoadVector256(srcStartPtrs[0] + 64);
- Vector256 d03 = Avx.LoadVector256(srcStartPtrs[0] + 96);
- Vector256 d04 = Avx.LoadVector256(srcStartPtrs[0] + 128);
- Vector256 d05 = Avx.LoadVector256(srcStartPtrs[0] + 160);
- Vector256 d06 = Avx.LoadVector256(srcStartPtrs[0] + 192);
- Vector256 d07 = Avx.LoadVector256(srcStartPtrs[0] + 224);
- srcStartPtrs[0] += batchSize;
- for (int i = 1; i < srcKeyCount; i++)
- {
- Vector256 s00 = Avx.LoadVector256(srcStartPtrs[i]);
- Vector256 s01 = Avx.LoadVector256(srcStartPtrs[i] + 32);
- Vector256 s02 = Avx.LoadVector256(srcStartPtrs[i] + 64);
- Vector256 s03 = Avx.LoadVector256(srcStartPtrs[i] + 96);
- Vector256 s04 = Avx.LoadVector256(srcStartPtrs[i] + 128);
- Vector256 s05 = Avx.LoadVector256(srcStartPtrs[i] + 160);
- Vector256 s06 = Avx.LoadVector256(srcStartPtrs[i] + 192);
- Vector256 s07 = Avx.LoadVector256(srcStartPtrs[i] + 224);
-
- d00 = Avx2.And(d00, s00);
- d01 = Avx2.And(d01, s01);
- d02 = Avx2.And(d02, s02);
- d03 = Avx2.And(d03, s03);
- d04 = Avx2.And(d04, s04);
- d05 = Avx2.And(d05, s05);
- d06 = Avx2.And(d06, s06);
- d07 = Avx2.And(d07, s07);
- srcStartPtrs[i] += batchSize;
- }
+ var remainingLength = shortestSrcLength;
+ var batchRemainder = shortestSrcLength;
+ byte* dstBatchEndPtr;
- Avx.Store(dstCurr, d00);
- Avx.Store(dstCurr + 32, d01);
- Avx.Store(dstCurr + 64, d02);
- Avx.Store(dstCurr + 96, d03);
- Avx.Store(dstCurr + 128, d04);
- Avx.Store(dstCurr + 160, d05);
- Avx.Store(dstCurr + 192, d06);
- Avx.Store(dstCurr + 224, d07);
+ // Keep the cursor of the first source buffer in local to keep processing tidy.
+ var firstSrcPtr = srcPtrs[0];
- dstCurr += batchSize;
- }
- if (stail == 0) goto fillTail;
- #endregion
-
- #region 1x32
- slen = stail;
- batchSize = 1 * 32;
- stail = slen & (batchSize - 1);
- dstEnd = dstCurr + (slen - stail);
-
- while (dstCurr < dstEnd)
+ // Copy remaining source buffer pointers so we don't increment caller's.
+ var tmpSrcPtrs = stackalloc byte*[srcCount];
+ for (var i = 0; i < srcCount; i++)
{
- Vector256 d00 = Avx.LoadVector256(srcStartPtrs[0]);
- srcStartPtrs[0] += batchSize;
- for (int i = 1; i < srcKeyCount; i++)
- {
- Vector256 s00 = Avx.LoadVector256(srcStartPtrs[i]);
- d00 = Avx2.And(d00, s00);
- srcStartPtrs[i] += batchSize;
- }
- Avx.Store(dstCurr, d00);
- dstCurr += batchSize;
+ tmpSrcPtrs[i] = srcPtrs[i];
}
- if (stail == 0) goto fillTail;
- #endregion
-
- #region scalar_4x8
- slen = stail;
- batchSize = 4 * 8;
- stail = slen & (batchSize - 1);
- dstEnd = dstCurr + (slen - stail);
- while (dstCurr < dstEnd)
+ srcPtrs = tmpSrcPtrs;
+
+ if (Vector512.IsHardwareAccelerated && Vector512<byte>.IsSupported)
{
- long d00 = *(long*)(srcStartPtrs[0]);
- long d01 = *(long*)(srcStartPtrs[0] + 8);
- long d02 = *(long*)(srcStartPtrs[0] + 16);
- long d03 = *(long*)(srcStartPtrs[0] + 24);
- srcStartPtrs[0] += batchSize;
- for (int i = 1; i < srcKeyCount; i++)
- {
- d00 &= *(long*)(srcStartPtrs[i]);
- d01 &= *(long*)(srcStartPtrs[i] + 8);
- d02 &= *(long*)(srcStartPtrs[i] + 16);
- d03 &= *(long*)(srcStartPtrs[i] + 24);
- srcStartPtrs[i] += batchSize;
- }
+ // Vectorized: 64 bytes x 8
+ batchRemainder = remainingLength & ((Vector512<byte>.Count * 8) - 1);
+ dstBatchEndPtr = dstPtr + (remainingLength - batchRemainder);
+ remainingLength = batchRemainder;
- *(long*)dstCurr = d00;
- *(long*)(dstCurr + 8) = d01;
- *(long*)(dstCurr + 16) = d02;
- *(long*)(dstCurr + 24) = d03;
- dstCurr += batchSize;
+ Vectorized512(ref firstSrcPtr, srcCount, srcPtrs, ref dstPtr, dstBatchEndPtr);
}
- if (stail == 0) goto fillTail;
- #endregion
-
- #region scalar_1x8
- slen = stail;
- batchSize = 8;
- stail = slen & (batchSize - 1);
- dstEnd = dstCurr + (slen - stail);
- while (dstCurr < dstEnd)
+ else if (Vector256.IsHardwareAccelerated && Vector256<byte>.IsSupported)
{
- long d00 = *(long*)(srcStartPtrs[0]);
- srcStartPtrs[0] += batchSize;
- for (int i = 1; i < srcKeyCount; i++)
- {
- d00 &= *(long*)(srcStartPtrs[i]);
- srcStartPtrs[i] += batchSize;
- }
- *(long*)dstCurr = d00;
- dstCurr += batchSize;
- }
- #endregion
+ // Vectorized: 32 bytes x 8
+ batchRemainder = remainingLength & ((Vector256<byte>.Count * 8) - 1);
+ dstBatchEndPtr = dstPtr + (remainingLength - batchRemainder);
+ remainingLength = batchRemainder;
- fillTail:
- #region scalar_1x1
- byte* dstMaxEnd = dstPtr + dstLen;
- int offset = 0;
- while (dstCurr < dstMaxEnd)
+ Vectorized256(ref firstSrcPtr, srcCount, srcPtrs, ref dstPtr, dstBatchEndPtr);
+ }
+ else if (Vector128.IsHardwareAccelerated && Vector128<byte>.IsSupported)
{
- byte d00;
- if (srcStartPtrs[0] + offset < srcEndPtrs[0])
- d00 = srcStartPtrs[0][offset];
- else
- {
- d00 = 0;
- goto writeBack;
- }
+ // Vectorized: 16 bytes x 8
+ batchRemainder = remainingLength & ((Vector128<byte>.Count * 8) - 1);
+ dstBatchEndPtr = dstPtr + (remainingLength - batchRemainder);
+ remainingLength = batchRemainder;
- for (int i = 1; i < srcKeyCount; i++)
- {
- if (srcStartPtrs[i] + offset < srcEndPtrs[i])
- d00 &= srcStartPtrs[i][offset];
- else
- {
- d00 = 0;
- goto writeBack;
- }
- }
- writeBack:
- *dstCurr++ = d00;
- offset++;
+ Vectorized128(ref firstSrcPtr, srcCount, srcPtrs, ref dstPtr, dstBatchEndPtr);
}
- #endregion
- }
-
- ///
- /// OR bitop implementation using 256-wide SIMD registers.
- ///
- /// Output buffer to write BitOp result
- /// Output buffer length.
- /// Pointer to start of bitmap sources.
- /// Pointer to end of bitmap sources
- /// Number of source keys.
- /// Minimum size of source bitmaps.
- private static void __bitop_multikey_simdX256_or(byte* dstPtr, int dstLen, byte** srcStartPtrs, byte** srcEndPtrs, int srcKeyCount, int minSize)
- {
- int batchSize = 8 * 32;
- long slen = minSize;
- long stail = slen & (batchSize - 1);
- byte* dstCurr = dstPtr;
- byte* dstEnd = dstCurr + (slen - stail);
+ // Scalar: 8 bytes x 4
+ batchRemainder = remainingLength & ((sizeof(ulong) * 4) - 1);
+ dstBatchEndPtr = dstPtr + (remainingLength - batchRemainder);
+ remainingLength = batchRemainder;
- #region 8x32
- while (dstCurr < dstEnd)
+ while (dstPtr < dstBatchEndPtr)
{
- Vector256 d00 = Avx.LoadVector256(srcStartPtrs[0]);
- Vector256 d01 = Avx.LoadVector256(srcStartPtrs[0] + 32);
- Vector256 d02 = Avx.LoadVector256(srcStartPtrs[0] + 64);
- Vector256 d03 = Avx.LoadVector256(srcStartPtrs[0] + 96);
- Vector256 d04 = Avx.LoadVector256(srcStartPtrs[0] + 128);
- Vector256 d05 = Avx.LoadVector256(srcStartPtrs[0] + 160);
- Vector256 d06 = Avx.LoadVector256(srcStartPtrs[0] + 192);
- Vector256 d07 = Avx.LoadVector256(srcStartPtrs[0] + 224);
- srcStartPtrs[0] += batchSize;
- for (int i = 1; i < srcKeyCount; i++)
- {
- Vector256 s00 = Avx.LoadVector256(srcStartPtrs[i]);
- Vector256 s01 = Avx.LoadVector256(srcStartPtrs[i] + 32);
- Vector256 s02 = Avx.LoadVector256(srcStartPtrs[i] + 64);
- Vector256 s03 = Avx.LoadVector256(srcStartPtrs[i] + 96);
- Vector256 s04 = Avx.LoadVector256(srcStartPtrs[i] + 128);
- Vector256 s05 = Avx.LoadVector256(srcStartPtrs[i] + 160);
- Vector256 s06 = Avx.LoadVector256(srcStartPtrs[i] + 192);
- Vector256 s07 = Avx.LoadVector256(srcStartPtrs[i] + 224);
-
- d00 = Avx2.Or(d00, s00);
- d01 = Avx2.Or(d01, s01);
- d02 = Avx2.Or(d02, s02);
- d03 = Avx2.Or(d03, s03);
- d04 = Avx2.Or(d04, s04);
- d05 = Avx2.Or(d05, s05);
- d06 = Avx2.Or(d06, s06);
- d07 = Avx2.Or(d07, s07);
- srcStartPtrs[i] += batchSize;
- }
+ var d00 = *(ulong*)(firstSrcPtr + (sizeof(ulong) * 0));
+ var d01 = *(ulong*)(firstSrcPtr + (sizeof(ulong) * 1));
+ var d02 = *(ulong*)(firstSrcPtr + (sizeof(ulong) * 2));
+ var d03 = *(ulong*)(firstSrcPtr + (sizeof(ulong) * 3));
- Avx.Store(dstCurr, d00);
- Avx.Store(dstCurr + 32, d01);
- Avx.Store(dstCurr + 64, d02);
- Avx.Store(dstCurr + 96, d03);
- Avx.Store(dstCurr + 128, d04);
- Avx.Store(dstCurr + 160, d05);
- Avx.Store(dstCurr + 192, d06);
- Avx.Store(dstCurr + 224, d07);
+ firstSrcPtr += sizeof(ulong) * 4;
- dstCurr += batchSize;
- }
- if (stail == 0) goto fillTail;
- #endregion
+ for (var i = 1; i < srcCount; i++)
+ {
+ ref var startPtr = ref srcPtrs[i];
- #region 1x32
- slen = stail;
- batchSize = 1 * 32;
- stail = slen & (batchSize - 1);
- dstEnd = dstCurr + (slen - stail);
+ d00 = TBinaryOperator.Invoke(d00, *(ulong*)(startPtr + (sizeof(ulong) * 0)));
+ d01 = TBinaryOperator.Invoke(d01, *(ulong*)(startPtr + (sizeof(ulong) * 1)));
+ d02 = TBinaryOperator.Invoke(d02, *(ulong*)(startPtr + (sizeof(ulong) * 2)));
+ d03 = TBinaryOperator.Invoke(d03, *(ulong*)(startPtr + (sizeof(ulong) * 3)));
- while (dstCurr < dstEnd)
- {
- Vector256 d00 = Avx.LoadVector256(srcStartPtrs[0]);
- srcStartPtrs[0] += batchSize;
- for (int i = 1; i < srcKeyCount; i++)
- {
- Vector256 s00 = Avx.LoadVector256(srcStartPtrs[i]);
- d00 = Avx2.Or(d00, s00);
- srcStartPtrs[i] += batchSize;
- }
- Avx.Store(dstCurr, d00);
- dstCurr += batchSize;
- }
- if (stail == 0) goto fillTail;
- #endregion
-
- #region scalar_4x8
- slen = stail;
- batchSize = 4 * 8;
- stail = slen & (batchSize - 1);
- dstEnd = dstCurr + (slen - stail);
- while (dstCurr < dstEnd)
- {
- long d00 = *(long*)(srcStartPtrs[0]);
- long d01 = *(long*)(srcStartPtrs[0] + 8);
- long d02 = *(long*)(srcStartPtrs[0] + 16);
- long d03 = *(long*)(srcStartPtrs[0] + 24);
- srcStartPtrs[0] += batchSize;
- for (int i = 1; i < srcKeyCount; i++)
- {
- d00 |= *(long*)(srcStartPtrs[i]);
- d01 |= *(long*)(srcStartPtrs[i] + 8);
- d02 |= *(long*)(srcStartPtrs[i] + 16);
- d03 |= *(long*)(srcStartPtrs[i] + 24);
- srcStartPtrs[i] += batchSize;
+ srcPtrs[i] += sizeof(ulong) * 4;
}
- *(long*)dstCurr = d00;
- *(long*)(dstCurr + 8) = d01;
- *(long*)(dstCurr + 16) = d02;
- *(long*)(dstCurr + 24) = d03;
- dstCurr += batchSize;
- }
- if (stail == 0) goto fillTail;
- #endregion
-
- #region scalar_1x8
- slen = stail;
- batchSize = 8;
- stail = slen & (batchSize - 1);
- dstEnd = dstCurr + (slen - stail);
- while (dstCurr < dstEnd)
- {
- long d00 = *(long*)(srcStartPtrs[0]);
- srcStartPtrs[0] += batchSize;
- for (int i = 1; i < srcKeyCount; i++)
- {
- d00 |= *(long*)(srcStartPtrs[i]);
- srcStartPtrs[i] += batchSize;
- }
- *(long*)dstCurr = d00;
- dstCurr += batchSize;
+ *(ulong*)(dstPtr + (sizeof(ulong) * 0)) = d00;
+ *(ulong*)(dstPtr + (sizeof(ulong) * 1)) = d01;
+ *(ulong*)(dstPtr + (sizeof(ulong) * 2)) = d02;
+ *(ulong*)(dstPtr + (sizeof(ulong) * 3)) = d03;
+
+ dstPtr += sizeof(ulong) * 4;
}
- #endregion
- fillTail:
- #region scalar_1x1
- byte* dstMaxEnd = dstPtr + dstLen;
- int offset = 0;
- while (dstCurr < dstMaxEnd)
+ // Handle the remaining tails
+ while (dstPtr < dstEndPtr)
{
byte d00 = 0;
- if (srcStartPtrs[0] + offset < srcEndPtrs[0])
+
+ if (firstSrcPtr < srcEndPtrs[0])
{
- d00 = srcStartPtrs[0][offset];
- if (d00 == 0xff) goto writeBack;
+ d00 = *firstSrcPtr;
+ firstSrcPtr++;
}
- for (int i = 1; i < srcKeyCount; i++)
+ for (var i = 1; i < srcCount; i++)
{
- if (srcStartPtrs[i] + offset < srcEndPtrs[i])
+ if (srcPtrs[i] < srcEndPtrs[i])
{
- d00 |= srcStartPtrs[i][offset];
- if (d00 == 0xff) goto writeBack;
+ d00 = TBinaryOperator.Invoke(d00, *srcPtrs[i]);
+ srcPtrs[i]++;
+ }
+ else if (typeof(TBinaryOperator) == typeof(BitwiseAndOperator))
+ {
+ d00 = 0;
}
}
- writeBack:
- *dstCurr++ = d00;
- offset++;
- }
- #endregion
- }
-
- ///
- /// XOR bitop implementation using 256-wide SIMD registers.
- ///
- /// Output buffer to write BitOp result
- /// Output buffer length.
- /// Pointer to start of bitmap sources.
- /// Pointer to end of bitmap sources
- /// Number of source keys.
- /// Minimum size of source bitmaps.
- private static void __bitop_multikey_simdX256_xor(byte* dstPtr, int dstLen, byte** srcStartPtrs, byte** srcEndPtrs, int srcKeyCount, int minSize)
- {
- int batchSize = 8 * 32;
- long slen = minSize;
- long stail = slen & (batchSize - 1);
- byte* dstCurr = dstPtr;
- byte* dstEnd = dstCurr + (slen - stail);
+ *dstPtr++ = d00;
+ }
- #region 8x32
- while (dstCurr < dstEnd)
+ static void Vectorized512(ref byte* firstPtr, int srcCount, byte** srcStartPtrs, ref byte* dstPtr, byte* dstBatchEndPtr)
{
- Vector256 d00 = Avx.LoadVector256(srcStartPtrs[0]);
- Vector256 d01 = Avx.LoadVector256(srcStartPtrs[0] + 32);
- Vector256 d02 = Avx.LoadVector256(srcStartPtrs[0] + 64);
- Vector256 d03 = Avx.LoadVector256(srcStartPtrs[0] + 96);
- Vector256 d04 = Avx.LoadVector256(srcStartPtrs[0] + 128);
- Vector256 d05 = Avx.LoadVector256(srcStartPtrs[0] + 160);
- Vector256 d06 = Avx.LoadVector256(srcStartPtrs[0] + 192);
- Vector256 d07 = Avx.LoadVector256(srcStartPtrs[0] + 224);
- srcStartPtrs[0] += batchSize;
- for (int i = 1; i < srcKeyCount; i++)
+ while (dstPtr < dstBatchEndPtr)
{
- Vector256 s00 = Avx.LoadVector256(srcStartPtrs[i]);
- Vector256 s01 = Avx.LoadVector256(srcStartPtrs[i] + 32);
- Vector256 s02 = Avx.LoadVector256(srcStartPtrs[i] + 64);
- Vector256 s03 = Avx.LoadVector256(srcStartPtrs[i] + 96);
- Vector256 s04 = Avx.LoadVector256(srcStartPtrs[i] + 128);
- Vector256 s05 = Avx.LoadVector256(srcStartPtrs[i] + 160);
- Vector256 s06 = Avx.LoadVector256(srcStartPtrs[i] + 192);
- Vector256 s07 = Avx.LoadVector256(srcStartPtrs[i] + 224);
-
- d00 = Avx2.Xor(d00, s00);
- d01 = Avx2.Xor(d01, s01);
- d02 = Avx2.Xor(d02, s02);
- d03 = Avx2.Xor(d03, s03);
- d04 = Avx2.Xor(d04, s04);
- d05 = Avx2.Xor(d05, s05);
- d06 = Avx2.Xor(d06, s06);
- d07 = Avx2.Xor(d07, s07);
- srcStartPtrs[i] += batchSize;
- }
-
- Avx.Store(dstCurr, d00);
- Avx.Store(dstCurr + 32, d01);
- Avx.Store(dstCurr + 64, d02);
- Avx.Store(dstCurr + 96, d03);
- Avx.Store(dstCurr + 128, d04);
- Avx.Store(dstCurr + 160, d05);
- Avx.Store(dstCurr + 192, d06);
- Avx.Store(dstCurr + 224, d07);
-
- dstCurr += batchSize;
- }
- #endregion
+ var d00 = Vector512.Load(firstPtr + (Vector512<byte>.Count * 0));
+ var d01 = Vector512.Load(firstPtr + (Vector512<byte>.Count * 1));
+ var d02 = Vector512.Load(firstPtr + (Vector512<byte>.Count * 2));
+ var d03 = Vector512.Load(firstPtr + (Vector512<byte>.Count * 3));
+ var d04 = Vector512.Load(firstPtr + (Vector512<byte>.Count * 4));
+ var d05 = Vector512.Load(firstPtr + (Vector512<byte>.Count * 5));
+ var d06 = Vector512.Load(firstPtr + (Vector512<byte>.Count * 6));
+ var d07 = Vector512.Load(firstPtr + (Vector512<byte>.Count * 7));
+
+ firstPtr += Vector512<byte>.Count * 8;
+
+ for (var i = 1; i < srcCount; i++)
+ {
+ ref var startPtr = ref srcStartPtrs[i];
+
+ var s00 = Vector512.Load(startPtr + (Vector512<byte>.Count * 0));
+ var s01 = Vector512.Load(startPtr + (Vector512<byte>.Count * 1));
+ var s02 = Vector512.Load(startPtr + (Vector512<byte>.Count * 2));
+ var s03 = Vector512.Load(startPtr + (Vector512<byte>.Count * 3));
+ var s04 = Vector512.Load(startPtr + (Vector512<byte>.Count * 4));
+ var s05 = Vector512.Load(startPtr + (Vector512<byte>.Count * 5));
+ var s06 = Vector512.Load(startPtr + (Vector512<byte>.Count * 6));
+ var s07 = Vector512.Load(startPtr + (Vector512<byte>.Count * 7));
+
+ d00 = TBinaryOperator.Invoke(d00, s00);
+ d01 = TBinaryOperator.Invoke(d01, s01);
+ d02 = TBinaryOperator.Invoke(d02, s02);
+ d03 = TBinaryOperator.Invoke(d03, s03);
+ d04 = TBinaryOperator.Invoke(d04, s04);
+ d05 = TBinaryOperator.Invoke(d05, s05);
+ d06 = TBinaryOperator.Invoke(d06, s06);
+ d07 = TBinaryOperator.Invoke(d07, s07);
+
+ startPtr += Vector512<byte>.Count * 8;
+ }
- #region 1x32
- slen = stail;
- batchSize = 1 * 32;
- stail = slen & (batchSize - 1);
- dstEnd = dstCurr + (slen - stail);
+ Vector512.Store(d00, dstPtr + (Vector512.Count * 0));
+ Vector512.Store(d01, dstPtr + (Vector512.Count * 1));
+ Vector512.Store(d02, dstPtr + (Vector512.Count * 2));
+ Vector512.Store(d03, dstPtr + (Vector512.Count * 3));
+ Vector512.Store(d04, dstPtr + (Vector512.Count * 4));
+ Vector512.Store(d05, dstPtr + (Vector512.Count * 5));
+ Vector512.Store(d06, dstPtr + (Vector512.Count * 6));
+ Vector512.Store(d07, dstPtr + (Vector512.Count * 7));
- while (dstCurr < dstEnd)
- {
- Vector256 d00 = Avx.LoadVector256(srcStartPtrs[0]);
- srcStartPtrs[0] += batchSize;
- for (int i = 1; i < srcKeyCount; i++)
- {
- Vector256 s00 = Avx.LoadVector256(srcStartPtrs[i]);
- d00 = Avx2.Xor(d00, s00);
- srcStartPtrs[i] += batchSize;
+ dstPtr += Vector512.Count * 8;
}
- Avx.Store(dstCurr, d00);
- dstCurr += batchSize;
}
- #endregion
-
- #region scalar_4x8
- slen = stail;
- batchSize = 4 * 8;
- stail = slen & (batchSize - 1);
- dstEnd = dstCurr + (slen - stail);
- while (dstCurr < dstEnd)
- {
- long d00 = *(long*)(srcStartPtrs[0]);
- long d01 = *(long*)(srcStartPtrs[0] + 8);
- long d02 = *(long*)(srcStartPtrs[0] + 16);
- long d03 = *(long*)(srcStartPtrs[0] + 24);
- srcStartPtrs[0] += batchSize;
- for (int i = 1; i < srcKeyCount; i++)
- {
- d00 ^= *(long*)(srcStartPtrs[i]);
- d01 ^= *(long*)(srcStartPtrs[i] + 8);
- d02 ^= *(long*)(srcStartPtrs[i] + 16);
- d03 ^= *(long*)(srcStartPtrs[i] + 24);
- srcStartPtrs[i] += batchSize;
- }
- *(long*)dstCurr = d00;
- *(long*)(dstCurr + 8) = d01;
- *(long*)(dstCurr + 16) = d02;
- *(long*)(dstCurr + 24) = d03;
- dstCurr += batchSize;
- }
- if (stail == 0) goto fillTail;
- #endregion
-
- #region scalar_1x8
- slen = stail;
- batchSize = 8;
- stail = slen & (batchSize - 1);
- dstEnd = dstCurr + (slen - stail);
- while (dstCurr < dstEnd)
+ static void Vectorized256(ref byte* firstPtr, int srcCount, byte** srcStartPtrs, ref byte* dstPtr, byte* dstBatchEndPtr)
{
- long d00 = *(long*)(srcStartPtrs[0]);
- srcStartPtrs[0] += batchSize;
- for (int i = 1; i < srcKeyCount; i++)
+ while (dstPtr < dstBatchEndPtr)
{
- d00 ^= *(long*)(srcStartPtrs[i]);
- srcStartPtrs[i] += batchSize;
+ var d00 = Vector256.Load(firstPtr + (Vector256.Count * 0));
+ var d01 = Vector256.Load(firstPtr + (Vector256.Count * 1));
+ var d02 = Vector256.Load(firstPtr + (Vector256.Count * 2));
+ var d03 = Vector256.Load(firstPtr + (Vector256.Count * 3));
+ var d04 = Vector256.Load(firstPtr + (Vector256.Count * 4));
+ var d05 = Vector256.Load(firstPtr + (Vector256.Count * 5));
+ var d06 = Vector256.Load(firstPtr + (Vector256.Count * 6));
+ var d07 = Vector256.Load(firstPtr + (Vector256.Count * 7));
+
+ firstPtr += Vector256.Count * 8;
+
+ for (var i = 1; i < srcCount; i++)
+ {
+ ref var startPtr = ref srcStartPtrs[i];
+
+ var s00 = Vector256.Load(startPtr + (Vector256.Count * 0));
+ var s01 = Vector256.Load(startPtr + (Vector256.Count * 1));
+ var s02 = Vector256.Load(startPtr + (Vector256.Count * 2));
+ var s03 = Vector256.Load(startPtr + (Vector256.Count * 3));
+ var s04 = Vector256.Load(startPtr + (Vector256.Count * 4));
+ var s05 = Vector256.Load(startPtr + (Vector256.Count * 5));
+ var s06 = Vector256.Load(startPtr + (Vector256.Count * 6));
+ var s07 = Vector256.Load(startPtr + (Vector256.Count * 7));
+
+ d00 = TBinaryOperator.Invoke(d00, s00);
+ d01 = TBinaryOperator.Invoke(d01, s01);
+ d02 = TBinaryOperator.Invoke(d02, s02);
+ d03 = TBinaryOperator.Invoke(d03, s03);
+ d04 = TBinaryOperator.Invoke(d04, s04);
+ d05 = TBinaryOperator.Invoke(d05, s05);
+ d06 = TBinaryOperator.Invoke(d06, s06);
+ d07 = TBinaryOperator.Invoke(d07, s07);
+
+ startPtr += Vector256.Count * 8;
+ }
+
+ Vector256.Store(d00, dstPtr + (Vector256.Count * 0));
+ Vector256.Store(d01, dstPtr + (Vector256.Count * 1));
+ Vector256.Store(d02, dstPtr + (Vector256.Count * 2));
+ Vector256.Store(d03, dstPtr + (Vector256.Count * 3));
+ Vector256.Store(d04, dstPtr + (Vector256.Count * 4));
+ Vector256.Store(d05, dstPtr + (Vector256.Count * 5));
+ Vector256.Store(d06, dstPtr + (Vector256.Count * 6));
+ Vector256.Store(d07, dstPtr + (Vector256.Count * 7));
+
+ dstPtr += Vector256.Count * 8;
}
- *(long*)dstCurr = d00;
- dstCurr += batchSize;
}
- #endregion
- fillTail:
- #region scalar_1x1
- byte* dstMaxEnd = dstPtr + dstLen;
- while (dstCurr < dstMaxEnd)
+ static void Vectorized128(ref byte* firstPtr, int srcCount, byte** srcStartPtrs, ref byte* dstPtr, byte* dstBatchEndPtr)
{
- byte d00 = 0;
- if (srcStartPtrs[0] < srcEndPtrs[0])
+ while (dstPtr < dstBatchEndPtr)
{
- d00 = *srcStartPtrs[0];
- srcStartPtrs[0]++;
- }
-
- for (int i = 1; i < srcKeyCount; i++)
- {
- if (srcStartPtrs[i] < srcEndPtrs[i])
+ var d00 = Vector128.Load(firstPtr + (Vector128.Count * 0));
+ var d01 = Vector128.Load(firstPtr + (Vector128.Count * 1));
+ var d02 = Vector128.Load(firstPtr + (Vector128.Count * 2));
+ var d03 = Vector128.Load(firstPtr + (Vector128.Count * 3));
+ var d04 = Vector128.Load(firstPtr + (Vector128.Count * 4));
+ var d05 = Vector128.Load(firstPtr + (Vector128.Count * 5));
+ var d06 = Vector128.Load(firstPtr + (Vector128.Count * 6));
+ var d07 = Vector128.Load(firstPtr + (Vector128.Count * 7));
+
+ firstPtr += Vector128.Count * 8;
+
+ for (var i = 1; i < srcCount; i++)
{
- d00 ^= *srcStartPtrs[i];
- srcStartPtrs[i]++;
+ ref var startPtr = ref srcStartPtrs[i];
+
+ var s00 = Vector128.Load(startPtr + (Vector128.Count * 0));
+ var s01 = Vector128.Load(startPtr + (Vector128.Count * 1));
+ var s02 = Vector128.Load(startPtr + (Vector128.Count * 2));
+ var s03 = Vector128.Load(startPtr + (Vector128.Count * 3));
+ var s04 = Vector128.Load(startPtr + (Vector128.Count * 4));
+ var s05 = Vector128.Load(startPtr + (Vector128.Count * 5));
+ var s06 = Vector128.Load(startPtr + (Vector128.Count * 6));
+ var s07 = Vector128.Load(startPtr + (Vector128.Count * 7));
+
+ d00 = TBinaryOperator.Invoke(d00, s00);
+ d01 = TBinaryOperator.Invoke(d01, s01);
+ d02 = TBinaryOperator.Invoke(d02, s02);
+ d03 = TBinaryOperator.Invoke(d03, s03);
+ d04 = TBinaryOperator.Invoke(d04, s04);
+ d05 = TBinaryOperator.Invoke(d05, s05);
+ d06 = TBinaryOperator.Invoke(d06, s06);
+ d07 = TBinaryOperator.Invoke(d07, s07);
+
+ startPtr += Vector128.Count * 8;
}
+
+ Vector128.Store(d00, dstPtr + (Vector128.Count * 0));
+ Vector128.Store(d01, dstPtr + (Vector128.Count * 1));
+ Vector128.Store(d02, dstPtr + (Vector128.Count * 2));
+ Vector128.Store(d03, dstPtr + (Vector128.Count * 3));
+ Vector128.Store(d04, dstPtr + (Vector128.Count * 4));
+ Vector128.Store(d05, dstPtr + (Vector128.Count * 5));
+ Vector128.Store(d06, dstPtr + (Vector128.Count * 6));
+ Vector128.Store(d07, dstPtr + (Vector128.Count * 7));
+
+ dstPtr += Vector128.Count * 8;
}
- *dstCurr++ = d00;
}
- #endregion
}
-
}
}
\ No newline at end of file
diff --git a/libs/server/Resp/CmdStrings.cs b/libs/server/Resp/CmdStrings.cs
index e0a4a29eb77..cd3263aa808 100644
--- a/libs/server/Resp/CmdStrings.cs
+++ b/libs/server/Resp/CmdStrings.cs
@@ -269,6 +269,7 @@ static partial class CmdStrings
public static ReadOnlySpan RESP_WRONGPASS_INVALID_USERNAME_PASSWORD => "WRONGPASS Invalid username/password combination"u8;
public static ReadOnlySpan RESP_SYNTAX_ERROR => "ERR syntax error"u8;
public static ReadOnlySpan RESP_ERR_BITOP_KEY_LIMIT => "ERR Bitop source key limit (64) exceeded"u8;
+ public static ReadOnlySpan RESP_ERR_BITOP_DIFF_TWO_SOURCE_KEYS_REQUIRED => "ERR BITOP DIFF must be called with at least two source keys."u8;
public static ReadOnlySpan RESP_ERR_COUNT_IS_NOT_POSITIVE => "ERR COUNT must be > 0"u8;
public static ReadOnlySpan RESP_ERR_COUNT_IS_OUT_OF_RANGE_N1 => "ERR count should be greater than or equal to -1."u8;
public static ReadOnlySpan RESP_ERR_MODULE_LOADED_TYPES => "ERR Unable to load types from module. Ensure that the module is compatible with the current runtime."u8;
@@ -525,6 +526,7 @@ static partial class CmdStrings
public static ReadOnlySpan LUA_OR => "OR"u8;
public static ReadOnlySpan LUA_XOR => "XOR"u8;
public static ReadOnlySpan LUA_NOT => "NOT"u8;
+ public static ReadOnlySpan LUA_DIFF => "DIFF"u8;
public static ReadOnlySpan LUA_KEYS => "KEYS"u8;
public static ReadOnlySpan LUA_ARGV => "ARGV"u8;
public static ReadOnlySpan EXPDELSCAN => "EXPDELSCAN"u8;
diff --git a/libs/server/Resp/Parser/RespCommand.cs b/libs/server/Resp/Parser/RespCommand.cs
index 9d4224d56c8..cc81121b1df 100644
--- a/libs/server/Resp/Parser/RespCommand.cs
+++ b/libs/server/Resp/Parser/RespCommand.cs
@@ -220,7 +220,8 @@ public enum RespCommand : ushort
BITOP_AND,
BITOP_OR,
BITOP_XOR,
- BITOP_NOT, // Note: Update LastWriteCommand if adding new write commands after this
+ BITOP_NOT,
+ BITOP_DIFF, // Note: Update LastWriteCommand if adding new write commands after this
// Script execution commands
EVAL,
@@ -401,7 +402,7 @@ public enum RespCommand : ushort
public static class RespCommandExtensions
{
private static readonly RespCommand[] ExpandedSET = [RespCommand.SETEXNX, RespCommand.SETEXXX, RespCommand.SETKEEPTTL, RespCommand.SETKEEPTTLXX];
- private static readonly RespCommand[] ExpandedBITOP = [RespCommand.BITOP_AND, RespCommand.BITOP_NOT, RespCommand.BITOP_OR, RespCommand.BITOP_XOR];
+ private static readonly RespCommand[] ExpandedBITOP = [RespCommand.BITOP_AND, RespCommand.BITOP_NOT, RespCommand.BITOP_OR, RespCommand.BITOP_XOR, RespCommand.BITOP_DIFF];
// Commands that are either returning static data or commands that cannot have issues from concurrent AOF interaction in another session
private static readonly RespCommand[] AofIndependentCommands = [
@@ -516,7 +517,7 @@ public static RespCommand NormalizeForACLs(this RespCommand cmd)
RespCommand.SETEXXX => RespCommand.SET,
RespCommand.SETKEEPTTL => RespCommand.SET,
RespCommand.SETKEEPTTLXX => RespCommand.SET,
- RespCommand.BITOP_AND or RespCommand.BITOP_NOT or RespCommand.BITOP_OR or RespCommand.BITOP_XOR => RespCommand.BITOP,
+ RespCommand.BITOP_AND or RespCommand.BITOP_NOT or RespCommand.BITOP_OR or RespCommand.BITOP_XOR or RespCommand.BITOP_DIFF => RespCommand.BITOP,
_ => cmd
};
}
@@ -541,7 +542,7 @@ public static ReadOnlySpan ExpandForACLs(this RespCommand cmd)
internal const RespCommand FirstWriteCommand = RespCommand.APPEND;
- internal const RespCommand LastWriteCommand = RespCommand.BITOP_NOT;
+ internal const RespCommand LastWriteCommand = RespCommand.BITOP_DIFF;
internal const RespCommand LastDataCommand = RespCommand.EVALSHA;
@@ -986,44 +987,64 @@ private RespCommand FastParseArrayCommand(ref int count, ref ReadOnlySpan
// Check for matching bit-operation
if (remainingBytes > length + 6 + 8)
{
- // TODO: AND|OR|XOR|NOT may not correctly handle mixed cases?
+ // TODO: the AND|OR|XOR|NOT|DIFF matchers compare only against all-uppercase and all-lowercase token literals, so mixed-case input (e.g. "Xor") is not matched here
- // 2-character operations
- if (*(uint*)(ptr + 11) == MemoryMarshal.Read("$2\r\n"u8))
+ var tag64 = *(ulong*)(ptr + 11);
+ var tag32 = (uint)tag64;
+
+ if (tag32 == MemoryMarshal.Read("$2\r\n"u8))
{
- if (*(ulong*)(ptr + 11) == MemoryMarshal.Read("$2\r\nOR\r\n"u8) || *(ulong*)(ptr + 11) == MemoryMarshal.Read("$2\r\nor\r\n"u8))
+ if (tag64 == MemoryMarshal.Read("$2\r\nOR\r\n"u8) || tag64 == MemoryMarshal.Read("$2\r\nor\r\n"u8))
{
- readHead += 8;
+ readHead += 8; // "$2\r\n" + "OR" + "\r\n"
count -= 1;
return RespCommand.BITOP_OR;
}
}
- // 3-character operations
- else if (remainingBytes > length + 6 + 9)
+ else if (tag32 == MemoryMarshal.Read("$3\r\n"u8) && remainingBytes > length + 6 + 9)
{
- if (*(uint*)(ptr + 11) == MemoryMarshal.Read("$3\r\n"u8))
+ // Optimistically adjust read head and count before matching AND|XOR|NOT
+ readHead += 9; // "$3\r\n" + AND|XOR|NOT + "\r\n"
+ count -= 1;
+
+ tag64 = *(ulong*)(ptr + 12);
+
+ if (tag64 == MemoryMarshal.Read("3\r\nAND\r\n"u8) || tag64 == MemoryMarshal.Read("3\r\nand\r\n"u8))
{
- // Optimistically adjust read head and count
- readHead += 9;
- count -= 1;
+ return RespCommand.BITOP_AND;
+ }
+ else if (tag64 == MemoryMarshal.Read("3\r\nXOR\r\n"u8) || tag64 == MemoryMarshal.Read("3\r\nxor\r\n"u8))
+ {
+ return RespCommand.BITOP_XOR;
+ }
+ else if (tag64 == MemoryMarshal.Read