diff --git a/Directory.Packages.props b/Directory.Packages.props index 4a456017a76..dc98d01c92b 100644 --- a/Directory.Packages.props +++ b/Directory.Packages.props @@ -22,10 +22,11 @@ - + + diff --git a/benchmark/BDN.benchmark/Bitmap/BinaryOperations.cs b/benchmark/BDN.benchmark/Bitmap/BinaryOperations.cs new file mode 100644 index 00000000000..a600bcb0ad4 --- /dev/null +++ b/benchmark/BDN.benchmark/Bitmap/BinaryOperations.cs @@ -0,0 +1,81 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +using System.Runtime.InteropServices; +using BenchmarkDotNet.Attributes; +using Garnet.server; + +namespace BDN.benchmark.Bitmap +{ + public unsafe class BinaryOperations + { + private const int Alignment = 64; + + [ParamsSource(nameof(GetBitmapSizes))] + public Sizes BitmapSizes { get; set; } + + [Params(BitmapOperation.XOR)] + public BitmapOperation Op { get; set; } + + public IEnumerable GetBitmapSizes() + { + yield return new([1 << 21, 1 << 21]); + yield return new([1 << 21, (1 << 21) + 1]); + + yield return new([1 << 21, 1 << 21, 1 << 21]); + yield return new([1 << 21, 1 << 21, (1 << 21) + 1]); + + yield return new([256, 6 * 512 + 7, 512, 1024]); + } + + private int minBitmapSize; + private byte** srcPtrs; + private byte** srcEndPtrs; + + private int dstLength; + private byte* dstPtr; + + [GlobalSetup] + public void GlobalSetup_Binary() + { + minBitmapSize = BitmapSizes.Values.Min(); + srcPtrs = (byte**)NativeMemory.AllocZeroed((nuint)BitmapSizes.Values.Length, (nuint)sizeof(byte*)); + srcEndPtrs = (byte**)NativeMemory.AllocZeroed((nuint)BitmapSizes.Values.Length, (nuint)sizeof(byte*)); + + for (var i = 0; i < BitmapSizes.Values.Length; i++) + { + srcPtrs[i] = (byte*)NativeMemory.AlignedAlloc((nuint)BitmapSizes.Values[i], Alignment); + srcEndPtrs[i] = srcPtrs[i] + BitmapSizes.Values[i]; + + new Random(i).NextBytes(new Span(srcPtrs[i], BitmapSizes.Values[i])); + } + + dstLength = BitmapSizes.Values.Max(); + dstPtr = (byte*)NativeMemory.AlignedAlloc((nuint)dstLength, Alignment); + } + + [Benchmark] + public void BinaryOperation() + { + BitmapManager.InvokeBitOperationUnsafe(Op, BitmapSizes.Values.Length, srcPtrs, srcEndPtrs, dstPtr, dstLength, minBitmapSize); + } + + [GlobalCleanup] + public void GlobalCleanup() + { + for (var i = 0; i < BitmapSizes.Values.Length; i++) + { + NativeMemory.AlignedFree(srcPtrs[i]); + } + + NativeMemory.Free(srcPtrs); + NativeMemory.Free(srcEndPtrs); + NativeMemory.AlignedFree(dstPtr); + } + + public record struct Sizes(int[] Values) + { + public override string ToString() => string.Join(", ", Values); + } + } +} \ No newline at end of file diff --git a/benchmark/BDN.benchmark/Bitmap/UnaryOperations.cs b/benchmark/BDN.benchmark/Bitmap/UnaryOperations.cs new file mode 100644 index 00000000000..c1b2fc0bcaf --- /dev/null +++ b/benchmark/BDN.benchmark/Bitmap/UnaryOperations.cs @@ -0,0 +1,59 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +using System.Runtime.InteropServices; +using BenchmarkDotNet.Attributes; +using Garnet.server; + +namespace BDN.benchmark.Bitmap +{ + public unsafe partial class UnaryOperations + { + private const int Alignment = 64; + + [ParamsSource(nameof(GetBitmapSize))] + public int BitmapSize { get; set; } + + public IEnumerable GetBitmapSize() + { + yield return 256; + yield return 1 << 21; + } + + private const int Keys = 1; + private byte** srcPtrs; + private byte** srcEndPtrs; + + private byte* dstPtr; + + [GlobalSetup] + public void GlobalSetup_Unary() + { + srcPtrs = (byte**)NativeMemory.AllocZeroed(Keys, (nuint)sizeof(byte*)); + srcEndPtrs = (byte**)NativeMemory.AllocZeroed(Keys, (nuint)sizeof(byte*)); + + srcPtrs[0] = (byte*)NativeMemory.AlignedAlloc((uint)BitmapSize, Alignment); + srcEndPtrs[0] = srcPtrs[0] + (uint)BitmapSize; + + new Random(0).NextBytes(new Span(srcPtrs[0], BitmapSize)); + + dstPtr = (byte*)NativeMemory.AlignedAlloc((nuint)BitmapSize, Alignment); + } + + [Benchmark] + public void BitOperation_NOT() + { + BitmapManager.InvokeBitOperationUnsafe(BitmapOperation.NOT, Keys, srcPtrs, srcEndPtrs, dstPtr, BitmapSize, BitmapSize); + } + + [GlobalCleanup] + public void GlobalCleanup() + { + NativeMemory.AlignedFree(srcPtrs[0]); + + NativeMemory.Free(srcPtrs); + NativeMemory.Free(srcEndPtrs); + NativeMemory.AlignedFree(dstPtr); + } + } +} \ No newline at end of file diff --git a/libs/common/Numerics/IBinaryOperator.cs b/libs/common/Numerics/IBinaryOperator.cs new file mode 100644 index 00000000000..2abf864851d --- /dev/null +++ b/libs/common/Numerics/IBinaryOperator.cs @@ -0,0 +1,80 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +using System.Numerics; +using System.Runtime.Intrinsics; + +namespace Garnet.common.Numerics +{ + /// Operator that takes two input values and returns a single value. + public interface IBinaryOperator + { + /// + /// Computes the binary operation of two scalar values. + /// + static abstract T Invoke(T x, T y) where T : IBinaryInteger; + + /// + /// Computes the binary operation of two vectors. + /// + static abstract Vector128 Invoke(Vector128 x, Vector128 y); + + /// + static abstract Vector256 Invoke(Vector256 x, Vector256 y); + + /// + static abstract Vector512 Invoke(Vector512 x, Vector512 y); + } + + /// x & y + public readonly struct BitwiseAndOperator : IBinaryOperator + { + /// + public static T Invoke(T x, T y) where T : IBinaryInteger => x & y; + /// + public static Vector128 Invoke(Vector128 x, Vector128 y) => x & y; + /// + public static Vector256 Invoke(Vector256 x, Vector256 y) => x & y; + /// + public static Vector512 Invoke(Vector512 x, Vector512 y) => x & y; + } + + /// x | y + public readonly struct BitwiseOrOperator : IBinaryOperator + { + /// + public static T Invoke(T x, T y) where T : IBinaryInteger => x | y; + /// + public static Vector128 Invoke(Vector128 x, Vector128 y) => x | y; + /// + public static Vector256 Invoke(Vector256 x, Vector256 y) => x | y; + /// + public static Vector512 Invoke(Vector512 x, Vector512 y) => x | y; + } + + /// x ^ y + public readonly struct BitwiseXorOperator : IBinaryOperator + { + /// + public static T Invoke(T x, T y) where T : IBinaryInteger => x ^ y; + /// + public static Vector128 Invoke(Vector128 x, Vector128 y) => x ^ y; + /// + public static Vector256 Invoke(Vector256 x, Vector256 y) => x ^ y; + /// + public static Vector512 Invoke(Vector512 x, Vector512 y) => x ^ y; + } + + /// x & ~y + public readonly struct BitwiseAndNotOperator : IBinaryOperator + { + /// + public static T Invoke(T x, T y) where T : IBinaryInteger => x & ~y; + /// + public static Vector128 Invoke(Vector128 x, Vector128 y) => x & ~y; + /// + public static Vector256 Invoke(Vector256 x, Vector256 y) => x & ~y; + /// + public static Vector512 Invoke(Vector512 x, Vector512 y) => x & ~y; + } +} \ No newline at end of file diff --git a/libs/server/Garnet.server.csproj b/libs/server/Garnet.server.csproj index 15939de0249..2c351e80f45 100644 --- a/libs/server/Garnet.server.csproj +++ b/libs/server/Garnet.server.csproj @@ -20,6 +20,7 @@ + diff --git a/libs/server/Lua/LuaRunner.Functions.cs b/libs/server/Lua/LuaRunner.Functions.cs index 920ee638544..e5b36a83dd8 100644 --- a/libs/server/Lua/LuaRunner.Functions.cs +++ b/libs/server/Lua/LuaRunner.Functions.cs @@ -2857,6 +2857,7 @@ internal int AclCheckCommand(nint luaStatePtr) case RespCommand.BITOP_OR: state.PushConstantString(constStrs.OR); break; case RespCommand.BITOP_XOR: state.PushConstantString(constStrs.XOR); break; case RespCommand.BITOP_NOT: state.PushConstantString(constStrs.NOT); break; + case RespCommand.BITOP_DIFF: state.PushConstantString(constStrs.DIFF); break; default: throw new InvalidOperationException($"Unexpected BITOP sub command: {subCommand}"); } diff --git a/libs/server/Lua/LuaRunner.Strings.cs b/libs/server/Lua/LuaRunner.Strings.cs index 6492d81d503..372900c0216 100644 --- a/libs/server/Lua/LuaRunner.Strings.cs +++ b/libs/server/Lua/LuaRunner.Strings.cs @@ -161,6 +161,8 @@ private readonly struct ConstantStringRegistryIndexes internal int XOR { get; } /// internal int NOT { get; } + /// + internal int DIFF { get; } /// internal int KEYS { get; } /// @@ -246,6 +248,7 @@ internal ConstantStringRegistryIndexes(ref LuaStateWrapper state) OR = ConstantStringToRegistry(ref state, CmdStrings.LUA_OR); XOR = ConstantStringToRegistry(ref state, CmdStrings.LUA_XOR); NOT = ConstantStringToRegistry(ref state, CmdStrings.LUA_NOT); + DIFF = ConstantStringToRegistry(ref state, CmdStrings.LUA_DIFF); KEYS = ConstantStringToRegistry(ref state, CmdStrings.LUA_KEYS); ARGV = ConstantStringToRegistry(ref state, CmdStrings.LUA_ARGV); } diff --git a/libs/server/Resp/Bitmap/BitmapCommands.cs b/libs/server/Resp/Bitmap/BitmapCommands.cs index ce50482c0a0..a15acc877c0 100644 --- a/libs/server/Resp/Bitmap/BitmapCommands.cs +++ b/libs/server/Resp/Bitmap/BitmapCommands.cs @@ -40,7 +40,12 @@ public enum BitmapOperation : byte /// /// NOT /// - NOT + NOT, + + /// + /// DIFF + /// + DIFF } internal enum BitFieldOverflow : byte @@ -317,6 +322,11 @@ private bool NetworkStringBitOperation(BitmapOperation bitOp, ref TG return AbortWithErrorMessage(CmdStrings.RESP_ERR_WRONG_NUMBER_OF_ARGUMENTS); } + if (bitOp == BitmapOperation.DIFF && parseState.Count < 3) + { + return AbortWithErrorMessage(CmdStrings.RESP_ERR_BITOP_DIFF_TWO_SOURCE_KEYS_REQUIRED); + } + if (parseState.Count > 64) { return AbortWithErrorMessage(CmdStrings.RESP_ERR_BITOP_KEY_LIMIT); diff --git a/libs/server/Resp/Bitmap/BitmapManagerBitOp.cs b/libs/server/Resp/Bitmap/BitmapManagerBitOp.cs index 6917e259935..88835d65624 100644 --- a/libs/server/Resp/Bitmap/BitmapManagerBitOp.cs +++ b/libs/server/Resp/Bitmap/BitmapManagerBitOp.cs @@ -1,650 +1,336 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. +using System; +using System.Diagnostics; +using System.Numerics.Tensors; +using System.Runtime.CompilerServices; using System.Runtime.Intrinsics; -using System.Runtime.Intrinsics.X86; using Garnet.common; - +using Garnet.common.Numerics; namespace Garnet.server { public unsafe partial class BitmapManager { /// - /// BitOp main driver. + /// Performs a bitwise operation across one or more source buffers and writes the result to the destination buffer. /// - /// Output buffer to write BitOp result - /// Output buffer length. - /// Array of pointers to bitmaps used as input in the corresponding bitop. - /// Array of pointers to bitmap sources. - /// Number of source keys. - /// Minimum size of source bitmap. - /// Type of bitop operation being executed. - /// - public static bool BitOpMainUnsafeMultiKey(byte* dstPtr, int dstLen, byte** srcStartPtrs, byte** srcEndPtrs, int srcKeyCount, int minSize, byte bitop) + /// The bitwise operation to perform. + /// Number of source buffers + /// Array of pointers to source buffers. The array length must be greater than or equal to + /// Array of the buffer lengths specified in . The array length must be greater than or equal to + /// Destination buffer to write the result. + /// Destination buffer length. + /// The length of shortest source buffer. + public static void InvokeBitOperationUnsafe(BitmapOperation op, int srcCount, byte** srcPtrs, byte** srcEndPtrs, byte* dstPtr, int dstLength, int shortestSrcLength) { - switch (bitop) - { - case (byte)BitmapOperation.NOT: - __bitop_multikey_simdX256_not(dstPtr, dstLen, srcStartPtrs[0], srcEndPtrs[0] - srcStartPtrs[0]); - break; - case (byte)BitmapOperation.AND: - __bitop_multikey_simdX256_and(dstPtr, dstLen, srcStartPtrs, srcEndPtrs, srcKeyCount, minSize); - break; - case (byte)BitmapOperation.OR: - __bitop_multikey_simdX256_or(dstPtr, dstLen, srcStartPtrs, srcEndPtrs, srcKeyCount, minSize); - break; - case (byte)BitmapOperation.XOR: - __bitop_multikey_simdX256_xor(dstPtr, dstLen, srcStartPtrs, srcEndPtrs, srcKeyCount, minSize); - break; - default: - throw new GarnetException("Unsupported BitOp command"); - } - return true; - } + Debug.Assert(op is BitmapOperation.NOT or BitmapOperation.AND or BitmapOperation.OR or BitmapOperation.XOR or BitmapOperation.DIFF); + Debug.Assert(srcCount > 0); + Debug.Assert(dstLength >= 0 && shortestSrcLength >= 0); + Debug.Assert(dstLength >= shortestSrcLength); - /// - /// Negation bitop implementation using 256-wide SIMD registers. - /// - /// Output buffer to write BitOp result - /// Output buffer length. - /// Pointer to source bitmap. - /// Source bitmap length. - private static void __bitop_multikey_simdX256_not(byte* dstPtr, long dstLen, byte* srcBitmap, long srcLen) - { - int batchSize = 8 * 32; - long slen = srcLen; - long stail = slen & (batchSize - 1); - - //iterate using srcBitmap because always dstLen >= srcLen - byte* srcCurr = srcBitmap; - byte* srcEnd = srcCurr + (slen - stail); - byte* dstCurr = dstPtr; - - #region 8x32 - while (srcCurr < srcEnd) - { - Vector256 d00 = Avx.LoadVector256(srcCurr); - Vector256 d01 = Avx.LoadVector256(srcCurr + 32); - Vector256 d02 = Avx.LoadVector256(srcCurr + 64); - Vector256 d03 = Avx.LoadVector256(srcCurr + 96); - Vector256 d04 = Avx.LoadVector256(srcCurr + 128); - Vector256 d05 = Avx.LoadVector256(srcCurr + 160); - Vector256 d06 = Avx.LoadVector256(srcCurr + 192); - Vector256 d07 = Avx.LoadVector256(srcCurr + 224); - - Avx.Store(dstCurr, Avx2.Xor(d00, Vector256.AllBitsSet)); - Avx.Store(dstCurr + 32, Avx2.Xor(d01, Vector256.AllBitsSet)); - Avx.Store(dstCurr + 64, Avx2.Xor(d02, Vector256.AllBitsSet)); - Avx.Store(dstCurr + 96, Avx2.Xor(d03, Vector256.AllBitsSet)); - Avx.Store(dstCurr + 128, Avx2.Xor(d04, Vector256.AllBitsSet)); - Avx.Store(dstCurr + 160, Avx2.Xor(d05, Vector256.AllBitsSet)); - Avx.Store(dstCurr + 192, Avx2.Xor(d06, Vector256.AllBitsSet)); - Avx.Store(dstCurr + 224, Avx2.Xor(d07, Vector256.AllBitsSet)); - - srcCurr += batchSize; - dstCurr += batchSize; - } - if (stail == 0) return; - #endregion - - #region 1x32 - slen = stail; - batchSize = 1 * 32; - stail = slen & (batchSize - 1); - srcEnd = srcCurr + (slen - stail); - while (srcCurr < srcEnd) - { - Vector256 d00 = Avx.LoadVector256(srcCurr); - Avx.Store(dstCurr, Avx2.Xor(d00, Vector256.AllBitsSet)); - srcCurr += batchSize; - dstCurr += batchSize; - } - if (stail == 0) return; - #endregion - - #region 4x8 - slen = stail; - batchSize = 4 * 8; - stail = slen & (batchSize - 1); - srcEnd = srcCurr + (slen - stail); - while (srcCurr < srcEnd) + if (srcCount == 1) { - long d00 = *(long*)(srcCurr); - long d01 = *(long*)(srcCurr + 8); - long d02 = *(long*)(srcCurr + 16); - long d03 = *(long*)(srcCurr + 24); - - *(long*)dstCurr = ~d00; - *(long*)(dstCurr + 8) = ~d01; - *(long*)(dstCurr + 16) = ~d02; - *(long*)(dstCurr + 24) = ~d03; - - srcCurr += batchSize; - dstCurr += batchSize; - } - if (stail == 0) return; - #endregion - - #region 1x8 - slen = stail; - batchSize = 8; - stail = slen & (batchSize - 1); - srcEnd = srcCurr + (slen - stail); - while (srcCurr < srcEnd) - { - long d00 = *(long*)(srcCurr); + if (op == BitmapOperation.DIFF) throw new GarnetException("BITOP DIFF operation requires at least two source bitmaps"); - *(long*)dstCurr = ~d00; + var srcBitmap = new ReadOnlySpan(srcPtrs[0], checked((int)(srcEndPtrs[0] - srcPtrs[0]))); + var dstBitmap = new Span(dstPtr, dstLength); - srcCurr += batchSize; - dstCurr += batchSize; + if (op == BitmapOperation.NOT) + { + TensorPrimitives.OnesComplement(srcBitmap, dstBitmap); + } + else + { + srcBitmap.CopyTo(dstBitmap); + } } - if (stail == 0) return; - #endregion - - if (stail >= 7) dstCurr[6] = (byte)(~srcCurr[6]); - if (stail >= 6) dstCurr[5] = (byte)(~srcCurr[5]); - if (stail >= 5) dstCurr[4] = (byte)(~srcCurr[4]); - if (stail >= 4) dstCurr[3] = (byte)(~srcCurr[3]); - if (stail >= 3) dstCurr[2] = (byte)(~srcCurr[2]); - if (stail >= 2) dstCurr[1] = (byte)(~srcCurr[1]); - if (stail >= 1) dstCurr[0] = (byte)(~srcCurr[0]); + // srcCount ≥ 2 + else if (op == BitmapOperation.AND) InvokeNaryBitwiseOperation(srcCount, srcPtrs, srcEndPtrs, dstPtr, dstLength, shortestSrcLength); + else if (op == BitmapOperation.OR) InvokeNaryBitwiseOperation(srcCount, srcPtrs, srcEndPtrs, dstPtr, dstLength, shortestSrcLength); + else if (op == BitmapOperation.XOR) InvokeNaryBitwiseOperation(srcCount, srcPtrs, srcEndPtrs, dstPtr, dstLength, shortestSrcLength); + else if (op == BitmapOperation.DIFF) InvokeNaryBitwiseOperation(srcCount, srcPtrs, srcEndPtrs, dstPtr, dstLength, shortestSrcLength); } /// - /// AND bitop implementation using 256-wide SIMD registers. + /// Invokes bitwise binary operation across n-ary source bitmaps. /// - /// Output buffer to write BitOp result - /// Output buffer length. - /// Pointer to start of bitmap sources. - /// Pointer to end of bitmap sources - /// Number of source keys. - /// Minimum size of source bitmaps. - private static void __bitop_multikey_simdX256_and(byte* dstPtr, int dstLen, byte** srcStartPtrs, byte** srcEndPtrs, int srcKeyCount, int minSize) + /// The binary operator type to compute bitwise + /// Number of source bitmaps. + /// Array of pointers to source bitmap buffers. + /// Array of the of pointers pointing to the end of the respective the bitmaps specified in . + /// Destination buffer to write the result. + /// Destination buffer length. + /// The length of shortest source buffer. + [SkipLocalsInit] + private static void InvokeNaryBitwiseOperation(int srcCount, byte** srcPtrs, byte** srcEndPtrs, byte* dstPtr, int dstLength, int shortestSrcLength) + where TBinaryOperator : struct, IBinaryOperator { - int batchSize = 8 * 32; - long slen = minSize; - long stail = slen & (batchSize - 1); - - byte* dstCurr = dstPtr; - byte* dstEnd = dstCurr + (slen - stail); + var dstEndPtr = dstPtr + dstLength; - #region 8x32 - while (dstCurr < dstEnd) - { - Vector256 d00 = Avx.LoadVector256(srcStartPtrs[0]); - Vector256 d01 = Avx.LoadVector256(srcStartPtrs[0] + 32); - Vector256 d02 = Avx.LoadVector256(srcStartPtrs[0] + 64); - Vector256 d03 = Avx.LoadVector256(srcStartPtrs[0] + 96); - Vector256 d04 = Avx.LoadVector256(srcStartPtrs[0] + 128); - Vector256 d05 = Avx.LoadVector256(srcStartPtrs[0] + 160); - Vector256 d06 = Avx.LoadVector256(srcStartPtrs[0] + 192); - Vector256 d07 = Avx.LoadVector256(srcStartPtrs[0] + 224); - srcStartPtrs[0] += batchSize; - for (int i = 1; i < srcKeyCount; i++) - { - Vector256 s00 = Avx.LoadVector256(srcStartPtrs[i]); - Vector256 s01 = Avx.LoadVector256(srcStartPtrs[i] + 32); - Vector256 s02 = Avx.LoadVector256(srcStartPtrs[i] + 64); - Vector256 s03 = Avx.LoadVector256(srcStartPtrs[i] + 96); - Vector256 s04 = Avx.LoadVector256(srcStartPtrs[i] + 128); - Vector256 s05 = Avx.LoadVector256(srcStartPtrs[i] + 160); - Vector256 s06 = Avx.LoadVector256(srcStartPtrs[i] + 192); - Vector256 s07 = Avx.LoadVector256(srcStartPtrs[i] + 224); - - d00 = Avx2.And(d00, s00); - d01 = Avx2.And(d01, s01); - d02 = Avx2.And(d02, s02); - d03 = Avx2.And(d03, s03); - d04 = Avx2.And(d04, s04); - d05 = Avx2.And(d05, s05); - d06 = Avx2.And(d06, s06); - d07 = Avx2.And(d07, s07); - srcStartPtrs[i] += batchSize; - } + var remainingLength = shortestSrcLength; + var batchRemainder = shortestSrcLength; + byte* dstBatchEndPtr; - Avx.Store(dstCurr, d00); - Avx.Store(dstCurr + 32, d01); - Avx.Store(dstCurr + 64, d02); - Avx.Store(dstCurr + 96, d03); - Avx.Store(dstCurr + 128, d04); - Avx.Store(dstCurr + 160, d05); - Avx.Store(dstCurr + 192, d06); - Avx.Store(dstCurr + 224, d07); + // Keep the cursor of the first source buffer in local to keep processing tidy. + var firstSrcPtr = srcPtrs[0]; - dstCurr += batchSize; - } - if (stail == 0) goto fillTail; - #endregion - - #region 1x32 - slen = stail; - batchSize = 1 * 32; - stail = slen & (batchSize - 1); - dstEnd = dstCurr + (slen - stail); - - while (dstCurr < dstEnd) + // Copy remaining source buffer pointers so we don't increment caller's. + var tmpSrcPtrs = stackalloc byte*[srcCount]; + for (var i = 0; i < srcCount; i++) { - Vector256 d00 = Avx.LoadVector256(srcStartPtrs[0]); - srcStartPtrs[0] += batchSize; - for (int i = 1; i < srcKeyCount; i++) - { - Vector256 s00 = Avx.LoadVector256(srcStartPtrs[i]); - d00 = Avx2.And(d00, s00); - srcStartPtrs[i] += batchSize; - } - Avx.Store(dstCurr, d00); - dstCurr += batchSize; + tmpSrcPtrs[i] = srcPtrs[i]; } - if (stail == 0) goto fillTail; - #endregion - - #region scalar_4x8 - slen = stail; - batchSize = 4 * 8; - stail = slen & (batchSize - 1); - dstEnd = dstCurr + (slen - stail); - while (dstCurr < dstEnd) + srcPtrs = tmpSrcPtrs; + + if (Vector512.IsHardwareAccelerated && Vector512.IsSupported) { - long d00 = *(long*)(srcStartPtrs[0]); - long d01 = *(long*)(srcStartPtrs[0] + 8); - long d02 = *(long*)(srcStartPtrs[0] + 16); - long d03 = *(long*)(srcStartPtrs[0] + 24); - srcStartPtrs[0] += batchSize; - for (int i = 1; i < srcKeyCount; i++) - { - d00 &= *(long*)(srcStartPtrs[i]); - d01 &= *(long*)(srcStartPtrs[i] + 8); - d02 &= *(long*)(srcStartPtrs[i] + 16); - d03 &= *(long*)(srcStartPtrs[i] + 24); - srcStartPtrs[i] += batchSize; - } + // Vectorized: 64 bytes x 8 + batchRemainder = remainingLength & ((Vector512.Count * 8) - 1); + dstBatchEndPtr = dstPtr + (remainingLength - batchRemainder); + remainingLength = batchRemainder; - *(long*)dstCurr = d00; - *(long*)(dstCurr + 8) = d01; - *(long*)(dstCurr + 16) = d02; - *(long*)(dstCurr + 24) = d03; - dstCurr += batchSize; + Vectorized512(ref firstSrcPtr, srcCount, srcPtrs, ref dstPtr, dstBatchEndPtr); } - if (stail == 0) goto fillTail; - #endregion - - #region scalar_1x8 - slen = stail; - batchSize = 8; - stail = slen & (batchSize - 1); - dstEnd = dstCurr + (slen - stail); - while (dstCurr < dstEnd) + else if (Vector256.IsHardwareAccelerated && Vector256.IsSupported) { - long d00 = *(long*)(srcStartPtrs[0]); - srcStartPtrs[0] += batchSize; - for (int i = 1; i < srcKeyCount; i++) - { - d00 &= *(long*)(srcStartPtrs[i]); - srcStartPtrs[i] += batchSize; - } - *(long*)dstCurr = d00; - dstCurr += batchSize; - } - #endregion + // Vectorized: 32 bytes x 8 + batchRemainder = remainingLength & ((Vector256.Count * 8) - 1); + dstBatchEndPtr = dstPtr + (remainingLength - batchRemainder); + remainingLength = batchRemainder; - fillTail: - #region scalar_1x1 - byte* dstMaxEnd = dstPtr + dstLen; - int offset = 0; - while (dstCurr < dstMaxEnd) + Vectorized256(ref firstSrcPtr, srcCount, srcPtrs, ref dstPtr, dstBatchEndPtr); + } + else if (Vector128.IsHardwareAccelerated && Vector128.IsSupported) { - byte d00; - if (srcStartPtrs[0] + offset < srcEndPtrs[0]) - d00 = srcStartPtrs[0][offset]; - else - { - d00 = 0; - goto writeBack; - } + // Vectorized: 16 bytes x 8 + batchRemainder = remainingLength & ((Vector128.Count * 8) - 1); + dstBatchEndPtr = dstPtr + (remainingLength - batchRemainder); + remainingLength = batchRemainder; - for (int i = 1; i < srcKeyCount; i++) - { - if (srcStartPtrs[i] + offset < srcEndPtrs[i]) - d00 &= srcStartPtrs[i][offset]; - else - { - d00 = 0; - goto writeBack; - } - } - writeBack: - *dstCurr++ = d00; - offset++; + Vectorized128(ref firstSrcPtr, srcCount, srcPtrs, ref dstPtr, dstBatchEndPtr); } - #endregion - } - - /// - /// OR bitop implementation using 256-wide SIMD registers. - /// - /// Output buffer to write BitOp result - /// Output buffer length. - /// Pointer to start of bitmap sources. - /// Pointer to end of bitmap sources - /// Number of source keys. - /// Minimum size of source bitmaps. - private static void __bitop_multikey_simdX256_or(byte* dstPtr, int dstLen, byte** srcStartPtrs, byte** srcEndPtrs, int srcKeyCount, int minSize) - { - int batchSize = 8 * 32; - long slen = minSize; - long stail = slen & (batchSize - 1); - byte* dstCurr = dstPtr; - byte* dstEnd = dstCurr + (slen - stail); + // Scalar: 8 bytes x 4 + batchRemainder = remainingLength & ((sizeof(ulong) * 4) - 1); + dstBatchEndPtr = dstPtr + (remainingLength - batchRemainder); + remainingLength = batchRemainder; - #region 8x32 - while (dstCurr < dstEnd) + while (dstPtr < dstBatchEndPtr) { - Vector256 d00 = Avx.LoadVector256(srcStartPtrs[0]); - Vector256 d01 = Avx.LoadVector256(srcStartPtrs[0] + 32); - Vector256 d02 = Avx.LoadVector256(srcStartPtrs[0] + 64); - Vector256 d03 = Avx.LoadVector256(srcStartPtrs[0] + 96); - Vector256 d04 = Avx.LoadVector256(srcStartPtrs[0] + 128); - Vector256 d05 = Avx.LoadVector256(srcStartPtrs[0] + 160); - Vector256 d06 = Avx.LoadVector256(srcStartPtrs[0] + 192); - Vector256 d07 = Avx.LoadVector256(srcStartPtrs[0] + 224); - srcStartPtrs[0] += batchSize; - for (int i = 1; i < srcKeyCount; i++) - { - Vector256 s00 = Avx.LoadVector256(srcStartPtrs[i]); - Vector256 s01 = Avx.LoadVector256(srcStartPtrs[i] + 32); - Vector256 s02 = Avx.LoadVector256(srcStartPtrs[i] + 64); - Vector256 s03 = Avx.LoadVector256(srcStartPtrs[i] + 96); - Vector256 s04 = Avx.LoadVector256(srcStartPtrs[i] + 128); - Vector256 s05 = Avx.LoadVector256(srcStartPtrs[i] + 160); - Vector256 s06 = Avx.LoadVector256(srcStartPtrs[i] + 192); - Vector256 s07 = Avx.LoadVector256(srcStartPtrs[i] + 224); - - d00 = Avx2.Or(d00, s00); - d01 = Avx2.Or(d01, s01); - d02 = Avx2.Or(d02, s02); - d03 = Avx2.Or(d03, s03); - d04 = Avx2.Or(d04, s04); - d05 = Avx2.Or(d05, s05); - d06 = Avx2.Or(d06, s06); - d07 = Avx2.Or(d07, s07); - srcStartPtrs[i] += batchSize; - } + var d00 = *(ulong*)(firstSrcPtr + (sizeof(ulong) * 0)); + var d01 = *(ulong*)(firstSrcPtr + (sizeof(ulong) * 1)); + var d02 = *(ulong*)(firstSrcPtr + (sizeof(ulong) * 2)); + var d03 = *(ulong*)(firstSrcPtr + (sizeof(ulong) * 3)); - Avx.Store(dstCurr, d00); - Avx.Store(dstCurr + 32, d01); - Avx.Store(dstCurr + 64, d02); - Avx.Store(dstCurr + 96, d03); - Avx.Store(dstCurr + 128, d04); - Avx.Store(dstCurr + 160, d05); - Avx.Store(dstCurr + 192, d06); - Avx.Store(dstCurr + 224, d07); + firstSrcPtr += sizeof(ulong) * 4; - dstCurr += batchSize; - } - if (stail == 0) goto fillTail; - #endregion + for (var i = 1; i < srcCount; i++) + { + ref var startPtr = ref srcPtrs[i]; - #region 1x32 - slen = stail; - batchSize = 1 * 32; - stail = slen & (batchSize - 1); - dstEnd = dstCurr + (slen - stail); + d00 = TBinaryOperator.Invoke(d00, *(ulong*)(startPtr + (sizeof(ulong) * 0))); + d01 = TBinaryOperator.Invoke(d01, *(ulong*)(startPtr + (sizeof(ulong) * 1))); + d02 = TBinaryOperator.Invoke(d02, *(ulong*)(startPtr + (sizeof(ulong) * 2))); + d03 = TBinaryOperator.Invoke(d03, *(ulong*)(startPtr + (sizeof(ulong) * 3))); - while (dstCurr < dstEnd) - { - Vector256 d00 = Avx.LoadVector256(srcStartPtrs[0]); - srcStartPtrs[0] += batchSize; - for (int i = 1; i < srcKeyCount; i++) - { - Vector256 s00 = Avx.LoadVector256(srcStartPtrs[i]); - d00 = Avx2.Or(d00, s00); - srcStartPtrs[i] += batchSize; - } - Avx.Store(dstCurr, d00); - dstCurr += batchSize; - } - if (stail == 0) goto fillTail; - #endregion - - #region scalar_4x8 - slen = stail; - batchSize = 4 * 8; - stail = slen & (batchSize - 1); - dstEnd = dstCurr + (slen - stail); - while (dstCurr < dstEnd) - { - long d00 = *(long*)(srcStartPtrs[0]); - long d01 = *(long*)(srcStartPtrs[0] + 8); - long d02 = *(long*)(srcStartPtrs[0] + 16); - long d03 = *(long*)(srcStartPtrs[0] + 24); - srcStartPtrs[0] += batchSize; - for (int i = 1; i < srcKeyCount; i++) - { - d00 |= *(long*)(srcStartPtrs[i]); - d01 |= *(long*)(srcStartPtrs[i] + 8); - d02 |= *(long*)(srcStartPtrs[i] + 16); - d03 |= *(long*)(srcStartPtrs[i] + 24); - srcStartPtrs[i] += batchSize; + srcPtrs[i] += sizeof(ulong) * 4; } - *(long*)dstCurr = d00; - *(long*)(dstCurr + 8) = d01; - *(long*)(dstCurr + 16) = d02; - *(long*)(dstCurr + 24) = d03; - dstCurr += batchSize; - } - if (stail == 0) goto fillTail; - #endregion - - #region scalar_1x8 - slen = stail; - batchSize = 8; - stail = slen & (batchSize - 1); - dstEnd = dstCurr + (slen - stail); - while (dstCurr < dstEnd) - { - long d00 = *(long*)(srcStartPtrs[0]); - srcStartPtrs[0] += batchSize; - for (int i = 1; i < srcKeyCount; i++) - { - d00 |= *(long*)(srcStartPtrs[i]); - srcStartPtrs[i] += batchSize; - } - *(long*)dstCurr = d00; - dstCurr += batchSize; + *(ulong*)(dstPtr + (sizeof(ulong) * 0)) = d00; + *(ulong*)(dstPtr + (sizeof(ulong) * 1)) = d01; + *(ulong*)(dstPtr + (sizeof(ulong) * 2)) = d02; + *(ulong*)(dstPtr + (sizeof(ulong) * 3)) = d03; + + dstPtr += sizeof(ulong) * 4; } - #endregion - fillTail: - #region scalar_1x1 - byte* dstMaxEnd = dstPtr + dstLen; - int offset = 0; - while (dstCurr < dstMaxEnd) + // Handle the remaining tails + while (dstPtr < dstEndPtr) { byte d00 = 0; - if (srcStartPtrs[0] + offset < srcEndPtrs[0]) + + if (firstSrcPtr < srcEndPtrs[0]) { - d00 = srcStartPtrs[0][offset]; - if (d00 == 0xff) goto writeBack; + d00 = *firstSrcPtr; + firstSrcPtr++; } - for (int i = 1; i < srcKeyCount; i++) + for (var i = 1; i < srcCount; i++) { - if (srcStartPtrs[i] + offset < srcEndPtrs[i]) + if (srcPtrs[i] < srcEndPtrs[i]) { - d00 |= srcStartPtrs[i][offset]; - if (d00 == 0xff) goto writeBack; + d00 = TBinaryOperator.Invoke(d00, *srcPtrs[i]); + srcPtrs[i]++; + } + else if (typeof(TBinaryOperator) == typeof(BitwiseAndOperator)) + { + d00 = 0; } } - writeBack: - *dstCurr++ = d00; - offset++; - } - #endregion - } - - /// - /// XOR bitop implementation using 256-wide SIMD registers. - /// - /// Output buffer to write BitOp result - /// Output buffer length. - /// Pointer to start of bitmap sources. - /// Pointer to end of bitmap sources - /// Number of source keys. - /// Minimum size of source bitmaps. - private static void __bitop_multikey_simdX256_xor(byte* dstPtr, int dstLen, byte** srcStartPtrs, byte** srcEndPtrs, int srcKeyCount, int minSize) - { - int batchSize = 8 * 32; - long slen = minSize; - long stail = slen & (batchSize - 1); - byte* dstCurr = dstPtr; - byte* dstEnd = dstCurr + (slen - stail); + *dstPtr++ = d00; + } - #region 8x32 - while (dstCurr < dstEnd) + static void Vectorized512(ref byte* firstPtr, int srcCount, byte** srcStartPtrs, ref byte* dstPtr, byte* dstBatchEndPtr) { - Vector256 d00 = Avx.LoadVector256(srcStartPtrs[0]); - Vector256 d01 = Avx.LoadVector256(srcStartPtrs[0] + 32); - Vector256 d02 = Avx.LoadVector256(srcStartPtrs[0] + 64); - Vector256 d03 = Avx.LoadVector256(srcStartPtrs[0] + 96); - Vector256 d04 = Avx.LoadVector256(srcStartPtrs[0] + 128); - Vector256 d05 = Avx.LoadVector256(srcStartPtrs[0] + 160); - Vector256 d06 = Avx.LoadVector256(srcStartPtrs[0] + 192); - Vector256 d07 = Avx.LoadVector256(srcStartPtrs[0] + 224); - srcStartPtrs[0] += batchSize; - for (int i = 1; i < srcKeyCount; i++) + while (dstPtr < dstBatchEndPtr) { - Vector256 s00 = Avx.LoadVector256(srcStartPtrs[i]); - Vector256 s01 = Avx.LoadVector256(srcStartPtrs[i] + 32); - Vector256 s02 = Avx.LoadVector256(srcStartPtrs[i] + 64); - Vector256 s03 = Avx.LoadVector256(srcStartPtrs[i] + 96); - Vector256 s04 = Avx.LoadVector256(srcStartPtrs[i] + 128); - Vector256 s05 = Avx.LoadVector256(srcStartPtrs[i] + 160); - Vector256 s06 = Avx.LoadVector256(srcStartPtrs[i] + 192); - Vector256 s07 = Avx.LoadVector256(srcStartPtrs[i] + 224); - - d00 = Avx2.Xor(d00, s00); - d01 = Avx2.Xor(d01, s01); - d02 = Avx2.Xor(d02, s02); - d03 = Avx2.Xor(d03, s03); - d04 = Avx2.Xor(d04, s04); - d05 = Avx2.Xor(d05, s05); - d06 = Avx2.Xor(d06, s06); - d07 = Avx2.Xor(d07, s07); - srcStartPtrs[i] += batchSize; - } - - Avx.Store(dstCurr, d00); - Avx.Store(dstCurr + 32, d01); - Avx.Store(dstCurr + 64, d02); - Avx.Store(dstCurr + 96, d03); - Avx.Store(dstCurr + 128, d04); - Avx.Store(dstCurr + 160, d05); - Avx.Store(dstCurr + 192, d06); - Avx.Store(dstCurr + 224, d07); - - dstCurr += batchSize; - } - #endregion + var d00 = Vector512.Load(firstPtr + (Vector512.Count * 0)); + var d01 = Vector512.Load(firstPtr + (Vector512.Count * 1)); + var d02 = Vector512.Load(firstPtr + (Vector512.Count * 2)); + var d03 = Vector512.Load(firstPtr + (Vector512.Count * 3)); + var d04 = Vector512.Load(firstPtr + (Vector512.Count * 4)); + var d05 = Vector512.Load(firstPtr + (Vector512.Count * 5)); + var d06 = Vector512.Load(firstPtr + (Vector512.Count * 6)); + var d07 = Vector512.Load(firstPtr + (Vector512.Count * 7)); + + firstPtr += Vector512.Count * 8; + + for (var i = 1; i < srcCount; i++) + { + ref var startPtr = ref srcStartPtrs[i]; + + var s00 = Vector512.Load(startPtr + (Vector512.Count * 0)); + var s01 = Vector512.Load(startPtr + (Vector512.Count * 1)); + var s02 = Vector512.Load(startPtr + (Vector512.Count * 2)); + var s03 = Vector512.Load(startPtr + (Vector512.Count * 3)); + var s04 = Vector512.Load(startPtr + (Vector512.Count * 4)); + var s05 = Vector512.Load(startPtr + (Vector512.Count * 5)); + var s06 = Vector512.Load(startPtr + (Vector512.Count * 6)); + var s07 = Vector512.Load(startPtr + (Vector512.Count * 7)); + + d00 = TBinaryOperator.Invoke(d00, s00); + d01 = TBinaryOperator.Invoke(d01, s01); + d02 = TBinaryOperator.Invoke(d02, s02); + d03 = TBinaryOperator.Invoke(d03, s03); + d04 = TBinaryOperator.Invoke(d04, s04); + d05 = TBinaryOperator.Invoke(d05, s05); + d06 = TBinaryOperator.Invoke(d06, s06); + d07 = TBinaryOperator.Invoke(d07, s07); + + startPtr += Vector512.Count * 8; + } - #region 1x32 - slen = stail; - batchSize = 1 * 32; - stail = slen & (batchSize - 1); - dstEnd = dstCurr + (slen - stail); + Vector512.Store(d00, dstPtr + (Vector512.Count * 0)); + Vector512.Store(d01, dstPtr + (Vector512.Count * 1)); + Vector512.Store(d02, dstPtr + (Vector512.Count * 2)); + Vector512.Store(d03, dstPtr + (Vector512.Count * 3)); + Vector512.Store(d04, dstPtr + (Vector512.Count * 4)); + Vector512.Store(d05, dstPtr + (Vector512.Count * 5)); + Vector512.Store(d06, dstPtr + (Vector512.Count * 6)); + Vector512.Store(d07, dstPtr + (Vector512.Count * 7)); - while (dstCurr < dstEnd) - { - Vector256 d00 = Avx.LoadVector256(srcStartPtrs[0]); - srcStartPtrs[0] += batchSize; - for (int i = 1; i < srcKeyCount; i++) - { - Vector256 s00 = Avx.LoadVector256(srcStartPtrs[i]); - d00 = Avx2.Xor(d00, s00); - srcStartPtrs[i] += batchSize; + dstPtr += Vector512.Count * 8; } - Avx.Store(dstCurr, d00); - dstCurr += batchSize; } - #endregion - - #region scalar_4x8 - slen = stail; - batchSize = 4 * 8; - stail = slen & (batchSize - 1); - dstEnd = dstCurr + (slen - stail); - while (dstCurr < dstEnd) - { - long d00 = *(long*)(srcStartPtrs[0]); - long d01 = *(long*)(srcStartPtrs[0] + 8); - long d02 = *(long*)(srcStartPtrs[0] + 16); - long d03 = *(long*)(srcStartPtrs[0] + 24); - srcStartPtrs[0] += batchSize; - for (int i = 1; i < srcKeyCount; i++) - { - d00 ^= *(long*)(srcStartPtrs[i]); - d01 ^= *(long*)(srcStartPtrs[i] + 8); - d02 ^= *(long*)(srcStartPtrs[i] + 16); - d03 ^= *(long*)(srcStartPtrs[i] + 24); - srcStartPtrs[i] += batchSize; - } - *(long*)dstCurr = d00; - *(long*)(dstCurr + 8) = d01; - *(long*)(dstCurr + 16) = d02; - *(long*)(dstCurr + 24) = d03; - dstCurr += batchSize; - } - if (stail == 0) goto fillTail; - #endregion - - #region scalar_1x8 - slen = stail; - batchSize = 8; - stail = slen & (batchSize - 1); - dstEnd = dstCurr + (slen - stail); - while (dstCurr < dstEnd) + static void Vectorized256(ref byte* firstPtr, int srcCount, byte** srcStartPtrs, ref byte* dstPtr, byte* dstBatchEndPtr) { - long d00 = *(long*)(srcStartPtrs[0]); - srcStartPtrs[0] += batchSize; - for (int i = 1; i < srcKeyCount; i++) + while (dstPtr < dstBatchEndPtr) { - d00 ^= *(long*)(srcStartPtrs[i]); - srcStartPtrs[i] += batchSize; + var d00 = Vector256.Load(firstPtr + (Vector256.Count * 0)); + var d01 = Vector256.Load(firstPtr + (Vector256.Count * 1)); + var d02 = Vector256.Load(firstPtr + (Vector256.Count * 2)); + var d03 = Vector256.Load(firstPtr + (Vector256.Count * 3)); + var d04 = Vector256.Load(firstPtr + (Vector256.Count * 4)); + var d05 = Vector256.Load(firstPtr + (Vector256.Count * 5)); + var d06 = Vector256.Load(firstPtr + (Vector256.Count * 6)); + var d07 = Vector256.Load(firstPtr + (Vector256.Count * 7)); + + firstPtr += Vector256.Count * 8; + + for (var i = 1; i < srcCount; i++) + { + ref var startPtr = ref srcStartPtrs[i]; + + var s00 = Vector256.Load(startPtr + (Vector256.Count * 0)); + var s01 = Vector256.Load(startPtr + (Vector256.Count * 1)); + var s02 = Vector256.Load(startPtr + (Vector256.Count * 2)); + var s03 = Vector256.Load(startPtr + (Vector256.Count * 3)); + var s04 = Vector256.Load(startPtr + (Vector256.Count * 4)); + var s05 = Vector256.Load(startPtr + (Vector256.Count * 5)); + var s06 = Vector256.Load(startPtr + (Vector256.Count * 6)); + var s07 = Vector256.Load(startPtr + (Vector256.Count * 7)); + + d00 = TBinaryOperator.Invoke(d00, s00); + d01 = TBinaryOperator.Invoke(d01, s01); + d02 = TBinaryOperator.Invoke(d02, s02); + d03 = TBinaryOperator.Invoke(d03, s03); + d04 = TBinaryOperator.Invoke(d04, s04); + d05 = TBinaryOperator.Invoke(d05, s05); + d06 = TBinaryOperator.Invoke(d06, s06); + d07 = TBinaryOperator.Invoke(d07, s07); + + startPtr += Vector256.Count * 8; + } + + Vector256.Store(d00, dstPtr + (Vector256.Count * 0)); + Vector256.Store(d01, dstPtr + (Vector256.Count * 1)); + Vector256.Store(d02, dstPtr + (Vector256.Count * 2)); + Vector256.Store(d03, dstPtr + (Vector256.Count * 3)); + Vector256.Store(d04, dstPtr + (Vector256.Count * 4)); + Vector256.Store(d05, dstPtr + (Vector256.Count * 5)); + Vector256.Store(d06, dstPtr + (Vector256.Count * 6)); + Vector256.Store(d07, dstPtr + (Vector256.Count * 7)); + + dstPtr += Vector256.Count * 8; } - *(long*)dstCurr = d00; - dstCurr += batchSize; } - #endregion - fillTail: - #region scalar_1x1 - byte* dstMaxEnd = dstPtr + dstLen; - while (dstCurr < dstMaxEnd) + static void Vectorized128(ref byte* firstPtr, int srcCount, byte** srcStartPtrs, ref byte* dstPtr, byte* dstBatchEndPtr) { - byte d00 = 0; - if (srcStartPtrs[0] < srcEndPtrs[0]) + while (dstPtr < dstBatchEndPtr) { - d00 = *srcStartPtrs[0]; - srcStartPtrs[0]++; - } - - for (int i = 1; i < srcKeyCount; i++) - { - if (srcStartPtrs[i] < srcEndPtrs[i]) + var d00 = Vector128.Load(firstPtr + (Vector128.Count * 0)); + var d01 = Vector128.Load(firstPtr + (Vector128.Count * 1)); + var d02 = Vector128.Load(firstPtr + (Vector128.Count * 2)); + var d03 = Vector128.Load(firstPtr + (Vector128.Count * 3)); + var d04 = Vector128.Load(firstPtr + (Vector128.Count * 4)); + var d05 = Vector128.Load(firstPtr + (Vector128.Count * 5)); + var d06 = Vector128.Load(firstPtr + (Vector128.Count * 6)); + var d07 = Vector128.Load(firstPtr + (Vector128.Count * 7)); + + firstPtr += Vector128.Count * 8; + + for (var i = 1; i < srcCount; i++) { - d00 ^= *srcStartPtrs[i]; - srcStartPtrs[i]++; + ref var startPtr = ref srcStartPtrs[i]; + + var s00 = Vector128.Load(startPtr + (Vector128.Count * 0)); + var s01 = Vector128.Load(startPtr + (Vector128.Count * 1)); + var s02 = Vector128.Load(startPtr + (Vector128.Count * 2)); + var s03 = Vector128.Load(startPtr + (Vector128.Count * 3)); + var s04 = Vector128.Load(startPtr + (Vector128.Count * 4)); + var s05 = Vector128.Load(startPtr + (Vector128.Count * 5)); + var s06 = Vector128.Load(startPtr + (Vector128.Count * 6)); + var s07 = Vector128.Load(startPtr + (Vector128.Count * 7)); + + d00 = TBinaryOperator.Invoke(d00, s00); + d01 = TBinaryOperator.Invoke(d01, s01); + d02 = TBinaryOperator.Invoke(d02, s02); + d03 = TBinaryOperator.Invoke(d03, s03); + d04 = TBinaryOperator.Invoke(d04, s04); + d05 = TBinaryOperator.Invoke(d05, s05); + d06 = TBinaryOperator.Invoke(d06, s06); + d07 = TBinaryOperator.Invoke(d07, s07); + + startPtr += Vector128.Count * 8; } + + Vector128.Store(d00, dstPtr + (Vector128.Count * 0)); + Vector128.Store(d01, dstPtr + (Vector128.Count * 1)); + Vector128.Store(d02, dstPtr + (Vector128.Count * 2)); + Vector128.Store(d03, dstPtr + (Vector128.Count * 3)); + Vector128.Store(d04, dstPtr + (Vector128.Count * 4)); + Vector128.Store(d05, dstPtr + (Vector128.Count * 5)); + Vector128.Store(d06, dstPtr + (Vector128.Count * 6)); + Vector128.Store(d07, dstPtr + (Vector128.Count * 7)); + + dstPtr += Vector128.Count * 8; } - *dstCurr++ = d00; } - #endregion } - } } \ No newline at end of file diff --git a/libs/server/Resp/CmdStrings.cs b/libs/server/Resp/CmdStrings.cs index e0a4a29eb77..cd3263aa808 100644 --- a/libs/server/Resp/CmdStrings.cs +++ b/libs/server/Resp/CmdStrings.cs @@ -269,6 +269,7 @@ static partial class CmdStrings public static ReadOnlySpan RESP_WRONGPASS_INVALID_USERNAME_PASSWORD => "WRONGPASS Invalid username/password combination"u8; public static ReadOnlySpan RESP_SYNTAX_ERROR => "ERR syntax error"u8; public static ReadOnlySpan RESP_ERR_BITOP_KEY_LIMIT => "ERR Bitop source key limit (64) exceeded"u8; + public static ReadOnlySpan RESP_ERR_BITOP_DIFF_TWO_SOURCE_KEYS_REQUIRED => "ERR BITOP DIFF must be called with at least two source keys."u8; public static ReadOnlySpan RESP_ERR_COUNT_IS_NOT_POSITIVE => "ERR COUNT must be > 0"u8; public static ReadOnlySpan RESP_ERR_COUNT_IS_OUT_OF_RANGE_N1 => "ERR count should be greater than or equal to -1."u8; public static ReadOnlySpan RESP_ERR_MODULE_LOADED_TYPES => "ERR Unable to load types from module. Ensure that the module is compatible with the current runtime."u8; @@ -525,6 +526,7 @@ static partial class CmdStrings public static ReadOnlySpan LUA_OR => "OR"u8; public static ReadOnlySpan LUA_XOR => "XOR"u8; public static ReadOnlySpan LUA_NOT => "NOT"u8; + public static ReadOnlySpan LUA_DIFF => "DIFF"u8; public static ReadOnlySpan LUA_KEYS => "KEYS"u8; public static ReadOnlySpan LUA_ARGV => "ARGV"u8; public static ReadOnlySpan EXPDELSCAN => "EXPDELSCAN"u8; diff --git a/libs/server/Resp/Parser/RespCommand.cs b/libs/server/Resp/Parser/RespCommand.cs index 9d4224d56c8..cc81121b1df 100644 --- a/libs/server/Resp/Parser/RespCommand.cs +++ b/libs/server/Resp/Parser/RespCommand.cs @@ -220,7 +220,8 @@ public enum RespCommand : ushort BITOP_AND, BITOP_OR, BITOP_XOR, - BITOP_NOT, // Note: Update LastWriteCommand if adding new write commands after this + BITOP_NOT, + BITOP_DIFF, // Note: Update LastWriteCommand if adding new write commands after this // Script execution commands EVAL, @@ -401,7 +402,7 @@ public enum RespCommand : ushort public static class RespCommandExtensions { private static readonly RespCommand[] ExpandedSET = [RespCommand.SETEXNX, RespCommand.SETEXXX, RespCommand.SETKEEPTTL, RespCommand.SETKEEPTTLXX]; - private static readonly RespCommand[] ExpandedBITOP = [RespCommand.BITOP_AND, RespCommand.BITOP_NOT, RespCommand.BITOP_OR, RespCommand.BITOP_XOR]; + private static readonly RespCommand[] ExpandedBITOP = [RespCommand.BITOP_AND, RespCommand.BITOP_NOT, RespCommand.BITOP_OR, RespCommand.BITOP_XOR, RespCommand.BITOP_DIFF]; // Commands that are either returning static data or commands that cannot have issues from concurrent AOF interaction in another session private static readonly RespCommand[] AofIndependentCommands = [ @@ -516,7 +517,7 @@ public static RespCommand NormalizeForACLs(this RespCommand cmd) RespCommand.SETEXXX => RespCommand.SET, RespCommand.SETKEEPTTL => RespCommand.SET, RespCommand.SETKEEPTTLXX => RespCommand.SET, - RespCommand.BITOP_AND or RespCommand.BITOP_NOT or RespCommand.BITOP_OR or RespCommand.BITOP_XOR => RespCommand.BITOP, + RespCommand.BITOP_AND or RespCommand.BITOP_NOT or RespCommand.BITOP_OR or RespCommand.BITOP_XOR or RespCommand.BITOP_DIFF => RespCommand.BITOP, _ => cmd }; } @@ -541,7 +542,7 @@ public static ReadOnlySpan ExpandForACLs(this RespCommand cmd) internal const RespCommand FirstWriteCommand = RespCommand.APPEND; - internal const RespCommand LastWriteCommand = RespCommand.BITOP_NOT; + internal const RespCommand LastWriteCommand = RespCommand.BITOP_DIFF; internal const RespCommand LastDataCommand = RespCommand.EVALSHA; @@ -986,44 +987,64 @@ private RespCommand FastParseArrayCommand(ref int count, ref ReadOnlySpan // Check for matching bit-operation if (remainingBytes > length + 6 + 8) { - // TODO: AND|OR|XOR|NOT may not correctly handle mixed cases? + // TODO: AND|OR|XOR|NOT|DIFF may not correctly handle mixed cases? - // 2-character operations - if (*(uint*)(ptr + 11) == MemoryMarshal.Read("$2\r\n"u8)) + var tag64 = *(ulong*)(ptr + 11); + var tag32 = (uint)tag64; + + if (tag32 == MemoryMarshal.Read("$2\r\n"u8)) { - if (*(ulong*)(ptr + 11) == MemoryMarshal.Read("$2\r\nOR\r\n"u8) || *(ulong*)(ptr + 11) == MemoryMarshal.Read("$2\r\nor\r\n"u8)) + if (tag64 == MemoryMarshal.Read("$2\r\nOR\r\n"u8) || tag64 == MemoryMarshal.Read("$2\r\nor\r\n"u8)) { - readHead += 8; + readHead += 8; // "$2\r\n" + "OR" + "\r\n" count -= 1; return RespCommand.BITOP_OR; } } - // 3-character operations - else if (remainingBytes > length + 6 + 9) + else if (tag32 == MemoryMarshal.Read("$3\r\n"u8) && remainingBytes > length + 6 + 9) { - if (*(uint*)(ptr + 11) == MemoryMarshal.Read("$3\r\n"u8)) + // Optimistically adjust + readHead += 9; // "$3\r\n" + AND|XOR|NOT + "\r\n" + count -= 1; + + tag64 = *(ulong*)(ptr + 12); + + if (tag64 == MemoryMarshal.Read("3\r\nAND\r\n"u8) || tag64 == MemoryMarshal.Read("3\r\nand\r\n"u8)) { - // Optimistically adjust read head and count - readHead += 9; - count -= 1; + return RespCommand.BITOP_AND; + } + else if (tag64 == MemoryMarshal.Read("3\r\nXOR\r\n"u8) || tag64 == MemoryMarshal.Read("3\r\nxor\r\n"u8)) + { + return RespCommand.BITOP_XOR; + } + else if (tag64 == MemoryMarshal.Read("3\r\nNOT\r\n"u8) || tag64 == MemoryMarshal.Read("3\r\nnot\r\n"u8)) + { + return RespCommand.BITOP_NOT; + } - if (*(ulong*)(ptr + 12) == MemoryMarshal.Read("3\r\nAND\r\n"u8) || *(ulong*)(ptr + 12) == MemoryMarshal.Read("3\r\nand\r\n"u8)) - { - return RespCommand.BITOP_AND; - } - else if (*(ulong*)(ptr + 12) == MemoryMarshal.Read("3\r\nXOR\r\n"u8) || *(ulong*)(ptr + 12) == MemoryMarshal.Read("3\r\nxor\r\n"u8)) - { - return RespCommand.BITOP_XOR; - } - else if (*(ulong*)(ptr + 12) == MemoryMarshal.Read("3\r\nNOT\r\n"u8) || *(ulong*)(ptr + 12) == MemoryMarshal.Read("3\r\nnot\r\n"u8)) - { - return RespCommand.BITOP_NOT; - } + // Reset if no match + readHead -= 9; + count += 1; + } + else if (tag32 == MemoryMarshal.Read("$4\r\n"u8) && remainingBytes > length + 6 + 10) + { + // Optimistically adjust + readHead += 10; // "$4\r\nDIFF\r\n" + count -= 1; + + tag64 = *(ulong*)(ptr + 12); - // Reset read head and count if we didn't match operator. - readHead -= 9; - count += 1; + // Compare first 8 bytes then the trailing '\n' for "4\r\nDIFF\r\n" + if ((*(ulong*)(ptr + 12) == MemoryMarshal.Read("4\r\nDIFF\r"u8) || + *(ulong*)(ptr + 12) == MemoryMarshal.Read("4\r\ndiff\r"u8)) && + *(ptr + 20) == (byte)'\n') + { + return RespCommand.BITOP_DIFF; } + + // Reset if no match + readHead -= 10; + count += 1; } // Although we recognize BITOP, the pseudo-subcommand isn't recognized so fail early diff --git a/libs/server/Resp/RespServerSession.cs b/libs/server/Resp/RespServerSession.cs index aeb3e966c36..13225942eb6 100644 --- a/libs/server/Resp/RespServerSession.cs +++ b/libs/server/Resp/RespServerSession.cs @@ -876,6 +876,7 @@ private bool ProcessArrayCommands(RespCommand cmd, ref TGarnetApi st RespCommand.BITOP_OR => NetworkStringBitOperation(BitmapOperation.OR, ref storageApi), RespCommand.BITOP_XOR => NetworkStringBitOperation(BitmapOperation.XOR, ref storageApi), RespCommand.BITOP_NOT => NetworkStringBitOperation(BitmapOperation.NOT, ref storageApi), + RespCommand.BITOP_DIFF => NetworkStringBitOperation(BitmapOperation.DIFF, ref storageApi), RespCommand.BITFIELD => StringBitField(ref storageApi), RespCommand.BITFIELD_RO => StringBitFieldReadOnly(ref storageApi), // List Commands diff --git a/libs/server/Storage/Session/MainStore/BitmapOps.cs b/libs/server/Storage/Session/MainStore/BitmapOps.cs index 3248354be87..67777f77da2 100644 --- a/libs/server/Storage/Session/MainStore/BitmapOps.cs +++ b/libs/server/Storage/Session/MainStore/BitmapOps.cs @@ -4,7 +4,6 @@ using System; using System.Collections.Generic; using System.Diagnostics; -using System.Runtime.CompilerServices; using System.Text; using Garnet.common; using Tsavorite.core; @@ -84,8 +83,8 @@ public unsafe GarnetStatus StringBitOperation(ref RawStringInput input, BitmapOp // 8 byte start pointer // 4 byte int length - var output = stackalloc byte[12]; - var srcBitmapStartPtrs = stackalloc byte*[keyCount - 1]; + Span output = stackalloc byte[12]; + var srcBitmapPtrs = stackalloc byte*[keyCount - 1]; var srcBitmapEndPtrs = stackalloc byte*[keyCount - 1]; var createTransaction = false; @@ -113,7 +112,7 @@ public unsafe GarnetStatus StringBitOperation(ref RawStringInput input, BitmapOp { var srcKey = keys[i]; //Read srcKey - var outputBitmap = new SpanByteAndMemory(output, 12); + var outputBitmap = SpanByteAndMemory.FromPinnedSpan(output); status = ReadWithUnsafeContext(srcKey, ref input, ref outputBitmap, localHeadAddress, out bool epochChanged, ref uc); if (epochChanged) { @@ -125,48 +124,38 @@ public unsafe GarnetStatus StringBitOperation(ref RawStringInput input, BitmapOp continue; var outputBitmapPtr = outputBitmap.SpanByte.ToPointer(); - var localSrcBitmapPtr = (byte*)((IntPtr)(*(long*)outputBitmapPtr)); - var len = *(int*)(outputBitmapPtr + 8); + var localBitmapPtr = (byte*)(nuint)(*(ulong*)outputBitmapPtr); + var localBitmapLength = *(int*)(outputBitmapPtr + 8); // Keep track of pointers returned from ISessionFunctions - srcBitmapStartPtrs[keysFound] = localSrcBitmapPtr; - srcBitmapEndPtrs[keysFound] = localSrcBitmapPtr + len; + srcBitmapPtrs[keysFound] = localBitmapPtr; + srcBitmapEndPtrs[keysFound] = localBitmapPtr + localBitmapLength; keysFound++; - maxBitmapLen = Math.Max(len, maxBitmapLen); - minBitmapLen = Math.Min(len, minBitmapLen); - } - #region performBitop - // Allocate result buffers - sectorAlignedMemoryBitmap ??= new SectorAlignedMemory(bitmapBufferSize + sectorAlignedMemoryPoolAlignment, sectorAlignedMemoryPoolAlignment); - var dstBitmapPtr = sectorAlignedMemoryBitmap.GetValidPointer() + sectorAlignedMemoryPoolAlignment; - if (maxBitmapLen + sectorAlignedMemoryPoolAlignment > bitmapBufferSize) - { - do - { - bitmapBufferSize <<= 1; - } while (maxBitmapLen + sectorAlignedMemoryPoolAlignment > bitmapBufferSize); - - sectorAlignedMemoryBitmap.Dispose(); - sectorAlignedMemoryBitmap = new SectorAlignedMemory(bitmapBufferSize + sectorAlignedMemoryPoolAlignment, sectorAlignedMemoryPoolAlignment); - dstBitmapPtr = sectorAlignedMemoryBitmap.GetValidPointer() + sectorAlignedMemoryPoolAlignment; + maxBitmapLen = Math.Max(localBitmapLength, maxBitmapLen); + minBitmapLen = Math.Min(localBitmapLength, minBitmapLen); } - // Check if at least one key is found and execute bitop if (keysFound > 0) { - //1. Multi-way bitmap merge - _ = BitmapManager.BitOpMainUnsafeMultiKey(dstBitmapPtr, maxBitmapLen, srcBitmapStartPtrs, srcBitmapEndPtrs, keysFound, minBitmapLen, (byte)bitOp); - #endregion + // Allocate result buffer + if (sectorAlignedMemoryBitmap == null || maxBitmapLen > bitmapBufferSize) + { + bitmapBufferSize = Math.Max(bitmapBufferSize, maxBitmapLen); + + sectorAlignedMemoryBitmap?.Dispose(); + sectorAlignedMemoryBitmap = new SectorAlignedMemory(bitmapBufferSize, sectorAlignedMemoryPoolAlignment); + } + + var dstBitmapPtr = sectorAlignedMemoryBitmap.GetValidPointer(); + BitmapManager.InvokeBitOperationUnsafe(bitOp, keysFound, srcBitmapPtrs, srcBitmapEndPtrs, dstBitmapPtr, maxBitmapLen, minBitmapLen); if (maxBitmapLen > 0) { var dstKey = keys[0].SpanByte; - var valPtr = dstBitmapPtr; - valPtr -= sizeof(int); - *(int*)valPtr = maxBitmapLen; - status = SET(ref dstKey, ref Unsafe.AsRef(valPtr), ref uc); + var dstBitmapSpanByte = SpanByte.FromPinnedPointer(dstBitmapPtr, maxBitmapLen); + status = SET(ref dstKey, ref dstBitmapSpanByte, ref uc); } } else diff --git a/test/Garnet.test/GarnetBitmapTests.cs b/test/Garnet.test/GarnetBitmapTests.cs index 1eb0a2e1958..22a8b4147bf 100644 --- a/test/Garnet.test/GarnetBitmapTests.cs +++ b/test/Garnet.test/GarnetBitmapTests.cs @@ -3,7 +3,8 @@ using System; using System.Collections.Generic; -using System.Runtime.Intrinsics.X86; +using System.Linq; +using System.Numerics.Tensors; using Garnet.common; using Garnet.server; using NUnit.Framework; @@ -44,15 +45,26 @@ public void TearDown() TestUtils.DeleteDirectory(TestUtils.MethodTestDir); } - private long LongRandom() => ((long)this.r.Next() << 32) | (long)this.r.Next(); - - private ulong ULongRandom() + private GarnetServerTestProcess CreateServerWithEnvironmentVariables(string environment) { - ulong lsb = (ulong)(this.r.Next()); - ulong msb = (ulong)(this.r.Next()) << 32; - return (msb | lsb); + var parts = environment.Split('=', 2); + if (parts.Length == 2) + { + Dictionary envVars = []; + envVars.Add(parts[0], parts[1]); + + return new GarnetServerTestProcess(envVars); + } + else + { + return new GarnetServerTestProcess(); + } } + private long LongRandom() => r.NextInt64(long.MinValue, long.MaxValue); + + private ulong ULongRandom() => (ulong)LongRandom(); + private unsafe long ResponseToLong(byte[] response, int offset) { fixed (byte* ptr = response) @@ -271,60 +283,29 @@ public void BitmapSetGetBitTest_LTM(bool preSet) [Test, Order(6)] [Category("BITCOUNT")] - [TestCase(0, TestName = "BitmapSimpleBitCountTest(Hardware accelerated)")] - [TestCase(1, TestName = "BitmapSimpleBitCountTest(Avx2 disabled)")] - [TestCase(2, TestName = "BitmapSimpleBitCountTest(Software fallback)")] - public void BitmapSimpleBitCountTest(int acceleration) + [TestCase("DOTNET_EnableAVX2=0")] + [TestCase("DOTNET_EnableHWIntrinsic=1")] + [TestCase("DOTNET_EnableHWIntrinsic=0")] + public void BitmapSimpleBitCountTest(string environment) { - var configOptions = TestUtils.GetConfig(); + using var server = CreateServerWithEnvironmentVariables(environment); + using var redis = ConnectionMultiplexer.Connect(server.Options); - if (acceleration == 0) - { - SimpleBitCountTest(); - } - else - { - Dictionary env = []; - - if (acceleration == 1) - { - if (!Avx2.IsSupported && Ssse3.IsSupported) - Assert.Ignore("Already tested by main path"); - - env.Add("DOTNET_EnableAVX2", "0"); - } - else - { - if (!Avx2.IsSupported && !Ssse3.IsSupported) - Assert.Ignore("Already tested by main path"); - - env.Add("DOTNET_EnableHWIntrinsic", "0"); - } - - using var p = new GarnetServerTestProcess(out configOptions, env); - - SimpleBitCountTest(); - } + var db = redis.GetDatabase(0); + var maxBitmapLen = 1 << 12; + var iter = 1024; + var expectedCount = 0; + var key = "SimpleBitCountTest"; - void SimpleBitCountTest() + for (var i = 0; i < iter; i++) { - using var redis = ConnectionMultiplexer.Connect(configOptions); - var db = redis.GetDatabase(0); - var maxBitmapLen = 1 << 12; - var iter = 1024; - var expectedCount = 0; - var key = "SimpleBitCountTest"; - - for (var i = 0; i < iter; i++) - { - var offset = r.Next(1, maxBitmapLen); - var set = !db.StringSetBit(key, offset, true); - expectedCount += set ? 1 : 0; - } - - var count = db.StringBitCount(key); - ClassicAssert.AreEqual(expectedCount, count); + var offset = r.Next(1, maxBitmapLen); + var set = !db.StringSetBit(key, offset, true); + expectedCount += set ? 1 : 0; } + + var count = db.StringBitCount(key); + ClassicAssert.AreEqual(expectedCount, count); } private static int Index(long offset) => (int)(offset >> 3); @@ -801,391 +782,180 @@ public unsafe void BitmapSimpleBITPOS_PCT(int bytesPerSend) ClassicAssert.AreEqual(expectedPos, pos); } - [Test, Order(16)] - [TestCase(100)] - public unsafe void BitmapSimpleBITOP_PCT(int bytesPerSend) + private static byte[] CopyBitmap(byte[] sourceBitmap, bool invert = false) { - using var redis = ConnectionMultiplexer.Connect(TestUtils.GetConfig()); - using var lightClientRequest = TestUtils.CreateRequest(); - var db = redis.GetDatabase(0); - - int tests = 32; - string a = "a"; - string b = "b"; - string c = "c"; - string d = "d"; + var dst = new byte[sourceBitmap.Length]; + if (invert) + TensorPrimitives.OnesComplement(sourceBitmap, dst); + else + sourceBitmap.AsSpan().CopyTo(dst); - long src = 0; - long dst = 0; - byte[] data; + return dst; + } - //Test NOT - for (int i = 0; i < tests; i++) + private static void ApplyBitop(ref byte[] dst, byte[] src, Func op) + { + if (dst.Length < src.Length) { - src = LongRandom(); - data = BitConverter.GetBytes(src); - db.StringSet(a, data); - - dst = ~src; - long size = 0; - byte[] response = lightClientRequest.SendCommandChunks("BITOP NOT " + d + " " + a, bytesPerSend); - size = ResponseToLong(response, 1); - ClassicAssert.AreEqual(size, 8); - - data = db.StringGet(d); - src = BitConverter.ToInt64(data, 0); - ClassicAssert.AreEqual(dst, src); + var newDst = new byte[src.Length]; + dst.AsSpan().CopyTo(newDst); + dst = newDst; } - - //Test AND, OR, XOR - long srcA, srcB, srcC; - RedisKey[] keys = [a, b, c]; - Bitwise[] bitwiseOps = [Bitwise.And, Bitwise.Or, Bitwise.Xor]; - for (int j = 0; j < bitwiseOps.Length; j++) + for (var i = 0; i < src.Length; i++) { - for (int i = 0; i < tests; i++) - { - srcA = LongRandom(); - srcB = LongRandom(); - srcC = LongRandom(); - - data = BitConverter.GetBytes(srcA); - db.StringSet(a, data); - data = BitConverter.GetBytes(srcB); - db.StringSet(b, data); - data = BitConverter.GetBytes(srcC); - db.StringSet(c, data); - - byte[] response = null; - long size = 0; - //size = db.StringBitOperation(bitwiseOps[j], d, keys); - switch (bitwiseOps[j]) - { - case Bitwise.And: - dst = srcA & srcB & srcC; - response = lightClientRequest.SendCommandChunks("BITOP AND " + d + " " + a + " " + b + " " + c, bytesPerSend); - break; - case Bitwise.Or: - dst = srcA | srcB | srcC; - response = lightClientRequest.SendCommandChunks("BITOP OR " + d + " " + a + " " + b + " " + c, bytesPerSend); - break; - case Bitwise.Xor: - dst = srcA ^ srcB ^ srcC; - response = lightClientRequest.SendCommandChunks("BITOP XOR " + d + " " + a + " " + b + " " + c, bytesPerSend); - break; - } - - size = ResponseToLong(response, 1); - ClassicAssert.AreEqual(size, 8); - - data = db.StringGet(d); - src = BitConverter.ToInt64(data, 0); + dst[i] = op(dst[i], src[i]); + } - ClassicAssert.AreEqual(dst, src); - } + for (var i = src.Length; i < dst.Length; i++) + { + dst[i] = op(dst[i], 0); } } - [Test, Order(17)] + [Test, Order(19)] [Category("BITOP")] - public void BitmapSimpleBitOpTests() + public void BitOp_Unary_BitwiseNot( + [Values(Bitwise.Not)] Bitwise op, + [Values(1, 2, 16, 32 + 3, 128 + 32 + 3, 256 + 32 + 3, 512 + 32 + 3, 4096, 4096 + 32, 4096 + 32 + 3)] int bitmapLength) { using var redis = ConnectionMultiplexer.Connect(TestUtils.GetConfig()); var db = redis.GetDatabase(0); - int tests = 128; - string a = "a"; - string b = "b"; - string c = "c"; - string d = "d"; - - long src = 0; - long dst = 0; - byte[] data; - - //Test NOT - for (int i = 0; i < tests; i++) - { - src = LongRandom(); - data = BitConverter.GetBytes(src); - db.StringSet(a, data); - - dst = ~src; - long size = db.StringBitOperation(Bitwise.Not, d, a); - ClassicAssert.AreEqual(size, 8); - - data = db.StringGet(d); - src = BitConverter.ToInt64(data, 0); - ClassicAssert.AreEqual(dst, src); - } - - //Test AND, OR, XOR - long srcA, srcB, srcC; - RedisKey[] keys = [a, b, c]; - Bitwise[] bitwiseOps = [Bitwise.And, Bitwise.Or, Bitwise.Xor]; - for (int j = 0; j < bitwiseOps.Length; j++) - { - for (int i = 0; i < tests; i++) - { - srcA = LongRandom(); - srcB = LongRandom(); - srcC = LongRandom(); - - data = BitConverter.GetBytes(srcA); - db.StringSet(a, data); - data = BitConverter.GetBytes(srcB); - db.StringSet(b, data); - data = BitConverter.GetBytes(srcC); - db.StringSet(c, data); - - switch (bitwiseOps[j]) - { - case Bitwise.And: - dst = srcA & srcB & srcC; - break; - case Bitwise.Or: - dst = srcA | srcB | srcC; - break; - case Bitwise.Xor: - dst = srcA ^ srcB ^ srcC; - break; - } + var srcKey = "src"; + var dstKey = "dst"; - long size = db.StringBitOperation(bitwiseOps[j], d, keys); - ClassicAssert.AreEqual(size, 8); + var srcKeyBitmap = new byte[bitmapLength]; + r.NextBytes(srcKeyBitmap); + var expectedBitmap = CopyBitmap(srcKeyBitmap, invert: true); + db.StringSet(srcKey, srcKeyBitmap); - data = db.StringGet(d); - src = BitConverter.ToInt64(data, 0); + var size = db.StringBitOperation(op, dstKey, srcKey); + ClassicAssert.AreEqual(expectedBitmap.Length, size); - ClassicAssert.AreEqual(dst, src); - } - } + byte[] actualBitmap = db.StringGet(dstKey); + ClassicAssert.AreEqual(expectedBitmap.Length, actualBitmap.Length); + ClassicAssert.AreEqual(expectedBitmap, actualBitmap); } - private static void InitBitmap(ref byte[] dst, byte[] srcA, bool invert = false) + [Test] + [Category("BITOP")] + public void BitOp_Binary_SameSize( + [Values("DOTNET_EnableHWIntrinsic=1", "DOTNET_PreferredVectorBitWidth=128", "DOTNET_EnableHWIntrinsic=0")] string environment, + [Values(Bitwise.And, Bitwise.Or, Bitwise.Xor, Bitwise.Diff)] Bitwise op, + [Values(512 + 32 + 3)] int bitmapSize, + [Values(2, 3, 4)] int keys) { - dst = new byte[srcA.Length]; - if (invert) - for (int i = 0; i < srcA.Length; i++) dst[i] = (byte)~srcA[i]; - else - for (int i = 0; i < srcA.Length; i++) dst[i] = srcA[i]; + using var server = CreateServerWithEnvironmentVariables(environment); + BitOp_Binary_SameSize(server.Options, op, bitmapSize, keys); } - private static void ApplyBitop(ref byte[] dst, byte[] srcA, Func f8) + [Test] + [Category("BITOP")] + public void BitOp_Binary_SameSize( + [Values(Bitwise.And, Bitwise.Or, Bitwise.Xor, Bitwise.Diff)] Bitwise op, + [Values(1, 2, 16, 32 + 3, 128 + 32 + 3, 256 + 32 + 3, 512 + 32 + 3, 4096, 4096 + 32, 4096 + 32 + 3)] int bitmapSize, + [Values(2, 3, 4)] int keys) { - if (dst.Length < srcA.Length) - { - byte[] newDst = new byte[srcA.Length]; - Buffer.BlockCopy(dst, 0, newDst, 0, dst.Length); - dst = newDst; - } + BitOp_Binary_SameSize(TestUtils.GetConfig(), op, bitmapSize, keys); + } - for (int i = 0; i < srcA.Length; i++) + private void BitOp_Binary_SameSize( + ConfigurationOptions configOptions, + Bitwise op, + int bitmapSize, + int keys) + { + Func opFunc = op switch { - dst[i] = f8(dst[i], srcA[i]); - } + Bitwise.And => static (a, b) => (byte)(a & b), + Bitwise.Or => static (a, b) => (byte)(a | b), + Bitwise.Xor => static (a, b) => (byte)(a ^ b), + Bitwise.Diff => static (a, b) => (byte)(a & ~b), - for (int i = srcA.Length; i < dst.Length; i++) - { - dst[i] = f8(dst[i], 0); - } - } + _ => throw new NotSupportedException() + }; - [Test, Order(18)] - [Category("BITOP")] - public void BitmapSimpleVarLenBitOpTests() - { - using var redis = ConnectionMultiplexer.Connect(TestUtils.GetConfig()); + using var redis = ConnectionMultiplexer.Connect(configOptions); var db = redis.GetDatabase(0); - int tests = 32; - string a = "a"; - string b = "b"; - string c = "c"; - string d = "d"; - string x = "x"; - - RedisKey[] keys = [a, b, c, d]; - Bitwise[] bitwiseOps = [Bitwise.And, Bitwise.Or, Bitwise.Xor, Bitwise.And, Bitwise.Or, Bitwise.Xor]; + var srcKeys = new RedisKey[keys]; + var srcKeyBitmaps = new byte[keys][]; - int maxBytes = 512; - byte[] dataA = new byte[r.Next(1, maxBytes)]; - byte[] dataB = new byte[r.Next(1, maxBytes)]; - byte[] dataC = new byte[r.Next(1, maxBytes)]; - byte[] dataD = new byte[r.Next(1, maxBytes)]; - byte[] dataX = null; + var dstKey = "dst"; + var expectedBitmap = new byte[bitmapSize]; - for (int j = 0; j < bitwiseOps.Length; j++) + for (var i = 0; i < srcKeys.Length; i++) { - for (int i = 0; i < tests; i++) - { - r.NextBytes(dataA); - r.NextBytes(dataB); - r.NextBytes(dataC); - r.NextBytes(dataD); - - db.StringSet(a, dataA); - db.StringSet(b, dataB); - db.StringSet(c, dataC); - db.StringSet(d, dataD); - - Func f8 = null; - switch (bitwiseOps[j]) - { - case Bitwise.And: - f8 = (a, b) => (byte)(a & b); - break; - case Bitwise.Or: - f8 = (a, b) => (byte)(a | b); - break; - case Bitwise.Xor: - f8 = (a, b) => (byte)(a ^ b); - break; - } + srcKeyBitmaps[i] = new byte[bitmapSize]; + r.NextBytes(srcKeyBitmaps[i]); - dataX = null; - InitBitmap(ref dataX, dataA); - ApplyBitop(ref dataX, dataB, f8); - ApplyBitop(ref dataX, dataC, f8); - ApplyBitop(ref dataX, dataD, f8); + srcKeys[i] = "src" + i; + db.StringSet(srcKeys[i], srcKeyBitmaps[i]); - long size = db.StringBitOperation(bitwiseOps[j], x, keys); - ClassicAssert.AreEqual(size, dataX.Length); + if (i == 0) + srcKeyBitmaps[i].AsSpan().CopyTo(expectedBitmap); + else + ApplyBitop(ref expectedBitmap, srcKeyBitmaps[i], opFunc); + } - byte[] expectedX = db.StringGet(x); + var size = db.StringBitOperation(op, dstKey, srcKeys); + ClassicAssert.AreEqual(expectedBitmap.Length, size); - ClassicAssert.AreEqual(dataX, expectedX); - } - } + byte[] actualBitmap = db.StringGet(dstKey); + ClassicAssert.AreEqual(expectedBitmap.Length, actualBitmap.Length); + ClassicAssert.AreEqual(expectedBitmap, actualBitmap); } - private static void AssertNegatedEqual(byte[] dstVal, byte[] srcVal) - { - for (int i = 0; i < srcVal.Length; i++) - { - byte srcV = (byte)~srcVal[i]; - ClassicAssert.AreEqual(srcV, dstVal[i]); - } - } - [Test, Order(19)] + [Test, Order(20)] [Category("BITOP")] - public void BitmapBitOpNotTest() + public void BitOp_Binary_DifferentTails( + [Values(Bitwise.And, Bitwise.Or, Bitwise.Xor, Bitwise.Diff)] Bitwise op, + [Values(1, 2, 16, 32 + 3, 128 + 32 + 3, 256 + 32 + 3, 512 + 32 + 3, 4096, 4096 + 32, 4096 + 32 + 3)] int sharedLength, + [Values(new int[] { 0, 7 }, new int[] { 16, 0, 7 }, new int[] { 1, 16, 1, 32 })] int[] additionalLengths) { - using var redis = ConnectionMultiplexer.Connect(TestUtils.GetConfig()); - var db = redis.GetDatabase(0); - int tests = 32; - - string srcKey = "srcKey"; - string dstKey = "dstKey"; - - int maxBytes = 256; - byte[] srcVal = new byte[r.Next(1, maxBytes)]; - byte[] dstVal; - for (int i = 0; i < tests; i++) + Func opFunc = op switch { - r.NextBytes(srcVal); - db.StringSet(srcKey, srcVal); + Bitwise.And => static (a, b) => (byte)(a & b), + Bitwise.Or => static (a, b) => (byte)(a | b), + Bitwise.Xor => static (a, b) => (byte)(a ^ b), + Bitwise.Diff => static (a, b) => (byte)(a & ~b), - dstVal = db.StringGet(srcKey); + _ => throw new NotSupportedException() + }; - long size = db.StringBitOperation(Bitwise.Not, dstKey, srcKey); - - ClassicAssert.AreEqual(size, srcVal.Length); - dstVal = db.StringGet(dstKey); - - AssertNegatedEqual(dstVal, srcVal); - - db.KeyDelete(srcKey); - } - } - - [Test, Order(20)] - [Category("BITOP")] - public void BitmapSimpleBitOpVarLenGrowingSizeTests() - { using var redis = ConnectionMultiplexer.Connect(TestUtils.GetConfig()); var db = redis.GetDatabase(0); - int tests = 16; - string a = "a"; - string b = "b"; - string c = "c"; - string d = "d"; - string x = "x"; + var srcKeyCount = additionalLengths.Length; + var srcKeys = new RedisKey[srcKeyCount]; + var srcKeyBitmaps = new byte[srcKeyCount][]; + var srcMaxLength = sharedLength + Enumerable.Max(additionalLengths); - byte[] dataA, dataB, dataC, dataD; - byte[] dataX; - int minSize = 512; - Bitwise[] bitwiseOps = [Bitwise.And, Bitwise.Or, Bitwise.Xor, Bitwise.And, Bitwise.Or, Bitwise.Xor]; - RedisKey[] keys = [a, b, c, d]; + var dstKey = "dst"; + var expectedBitmap = new byte[srcMaxLength]; - //Test NOT - for (int i = 0; i < tests; i++) + for (var i = 0; i < srcKeys.Length; i++) { - dataA = new byte[r.Next(minSize, minSize + 32)]; - r.NextBytes(dataA); - db.StringSet(a, dataA); + srcKeyBitmaps[i] = new byte[sharedLength + additionalLengths[i]]; + r.NextBytes(srcKeyBitmaps[i]); - dataX = null; - InitBitmap(ref dataX, dataA, true); - long size = db.StringBitOperation(Bitwise.Not, x, a); - ClassicAssert.AreEqual(size, dataX.Length); + srcKeys[i] = "src" + i; + db.StringSet(srcKeys[i], srcKeyBitmaps[i]); - byte[] expectedX = db.StringGet(x); - ClassicAssert.AreEqual(dataX, expectedX); + if (i == 0) + srcKeyBitmaps[i].AsSpan().CopyTo(expectedBitmap); + else + ApplyBitop(ref expectedBitmap, srcKeyBitmaps[i], opFunc); } - //Test AND, OR, XOR - for (int j = 0; j < bitwiseOps.Length; j++) - { - for (int i = 0; i < tests; i++) - { - dataA = new byte[r.Next(minSize, minSize + 16)]; minSize = dataA.Length; - dataB = new byte[r.Next(minSize, minSize + 16)]; minSize = dataB.Length; - dataC = new byte[r.Next(minSize, minSize + 16)]; minSize = dataC.Length; - dataD = new byte[r.Next(minSize, minSize + 16)]; minSize = dataD.Length; - minSize = 17; - - r.NextBytes(dataA); - r.NextBytes(dataB); - r.NextBytes(dataC); - r.NextBytes(dataD); - - db.StringSet(a, dataA); - db.StringSet(b, dataB); - db.StringSet(c, dataC); - db.StringSet(d, dataD); - - Func f8 = null; - switch (bitwiseOps[j]) - { - case Bitwise.And: - f8 = (a, b) => (byte)(a & b); - break; - case Bitwise.Or: - f8 = (a, b) => (byte)(a | b); - break; - case Bitwise.Xor: - f8 = (a, b) => (byte)(a ^ b); - break; - } - - dataX = null; - InitBitmap(ref dataX, dataA); - ApplyBitop(ref dataX, dataB, f8); - ApplyBitop(ref dataX, dataC, f8); - ApplyBitop(ref dataX, dataD, f8); - - long size = db.StringBitOperation(bitwiseOps[j], x, keys); - ClassicAssert.AreEqual(size, dataX.Length); - byte[] expectedX = db.StringGet(x); + var size = db.StringBitOperation(op, dstKey, srcKeys); + ClassicAssert.AreEqual(expectedBitmap.Length, size); - ClassicAssert.AreEqual(expectedX.Length, dataX.Length); - ClassicAssert.AreEqual(dataX, expectedX); - } - } + byte[] actualBitmap = db.StringGet(dstKey); + ClassicAssert.AreEqual(expectedBitmap.Length, actualBitmap.Length); + ClassicAssert.AreEqual(expectedBitmap, actualBitmap); } private static long GetValueFromBitmap(ref byte[] bitmap, long offset, int bitCount, bool signed) diff --git a/test/Garnet.test/RespCommandTests.cs b/test/Garnet.test/RespCommandTests.cs index 742e737e62c..84e790a233e 100644 --- a/test/Garnet.test/RespCommandTests.cs +++ b/test/Garnet.test/RespCommandTests.cs @@ -47,6 +47,7 @@ public class RespCommandTests RespCommand.BITOP_OR, RespCommand.BITOP_XOR, RespCommand.BITOP_NOT, + RespCommand.BITOP_DIFF, RespCommand.INVALID, RespCommand.DELIFEXPIM ]; diff --git a/test/Garnet.test/TestProcess.cs b/test/Garnet.test/TestProcess.cs index 45f13b577fd..d525a45630f 100644 --- a/test/Garnet.test/TestProcess.cs +++ b/test/Garnet.test/TestProcess.cs @@ -13,13 +13,13 @@ namespace Garnet.test { internal class GarnetServerTestProcess : IDisposable { - private readonly Process p = default; + private readonly Process process = default; private readonly Stopwatch stopWatch = default; private readonly LightClientRequest lightClientRequest = default; - internal GarnetServerTestProcess(out ConfigurationOptions opts, - Dictionary env = default, - int port = 7000) + public ConfigurationOptions Options { get; } + + internal GarnetServerTestProcess(Dictionary env = default, int port = 7000) { var a = Assembly.GetAssembly(typeof(Garnet.Program)); var name = a.Location; @@ -27,15 +27,15 @@ internal GarnetServerTestProcess(out ConfigurationOptions opts, if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) { - name = name.AsSpan().Slice(0, pos).ToString() + ".exe"; + name = string.Concat(name.AsSpan(0, pos), ".exe"); } else { - name = name.AsSpan().Slice(0, pos).ToString(); + name = name.Substring(0, pos); } var endPoint = new IPEndPoint(IPAddress.Loopback, port); - opts = TestUtils.GetConfig([endPoint]); + Options = TestUtils.GetConfig([endPoint]); // We don't have to disable objects, it's done to improve startup time a bit. var psi = new ProcessStartInfo(name, ["--bind", "127.0.0.1", "--port", port.ToString(), "--enable-debug-command", "local", "--no-pubsub", "--no-obj"]) @@ -51,12 +51,12 @@ internal GarnetServerTestProcess(out ConfigurationOptions opts, psi.Environment.Add(e.Key, e.Value); } - p = Process.Start(psi); - ClassicAssert.NotNull(p); + process = Process.Start(psi); + ClassicAssert.NotNull(process); // Block until the startup message to ensure process is up. var dummy = new char[1]; - _ = p.StandardOutput.ReadBlock(dummy, 0, 1); + _ = process.StandardOutput.ReadBlock(dummy, 0, 1); // Give it a bit more time Thread.Sleep(100); @@ -73,7 +73,7 @@ public void Dispose() Console.WriteLine(stopWatch.ElapsedMilliseconds); } - if (p != default) + if (process != default) { // We want to be sure the process is down, otherwise it may conflict // with a future run. First, we'll ask nicely and then kill it. @@ -85,10 +85,10 @@ public void Dispose() } catch { } - try { p.Kill(); } + try { process.Kill(); } catch { } - p.Close(); + process.Close(); } } }