diff --git a/Makefile b/Makefile index 2aee18cc749f0..0d8314fd5d6c5 100644 --- a/Makefile +++ b/Makefile @@ -222,7 +222,7 @@ thirdparties: .PHONY: build build: config cgo thirdparties $(info [Build binary]) - $(CGO_OPTS) go build $(TAGS) $(RACE_OPT) $(GOLDFLAGS) $(DEBUG_OPT) $(GOBUILD_OPT) -o $(BIN_NAME) ./cmd/mo-service + $(CGO_OPTS) GOEXPERIMENT=simd go build $(TAGS) $(RACE_OPT) $(GOLDFLAGS) $(DEBUG_OPT) $(GOBUILD_OPT) -o $(BIN_NAME) ./cmd/mo-service # https://wiki.musl-libc.org/getting-started.html # https://musl.cc/ diff --git a/go.mod b/go.mod index 97ad6a4c2812d..bd6181ab754fa 100644 --- a/go.mod +++ b/go.mod @@ -1,7 +1,7 @@ module github.com/matrixorigin/matrixone // Minimum Go version required -go 1.25.4 +go 1.26.2 require ( github.com/BurntSushi/toml v1.2.1 diff --git a/optools/run_ut.sh b/optools/run_ut.sh index a8a8205891efe..7d8f2015291f5 100755 --- a/optools/run_ut.sh +++ b/optools/run_ut.sh @@ -102,11 +102,11 @@ function run_tests(){ if [[ $SKIP_TESTS == 'race' ]]; then logger "INF" "Run UT without race check" - CGO_CFLAGS="${CGO_CFLAGS}" CGO_LDFLAGS="${CGO_LDFLAGS}" go test -short -v -json -tags matrixone_test -p ${UT_PARALLEL} -timeout "${UT_TIMEOUT}m" $test_scope > $UT_REPORT + CGO_CFLAGS="${CGO_CFLAGS}" CGO_LDFLAGS="${CGO_LDFLAGS}" GOEXPERIMENT=simd go test -short -v -json -tags matrixone_test -p ${UT_PARALLEL} -timeout "${UT_TIMEOUT}m" $test_scope > $UT_REPORT else logger "INF" "Run UT with race check" - CGO_CFLAGS="${CGO_CFLAGS}" CGO_LDFLAGS="${CGO_LDFLAGS}" go test -short -v -json -tags matrixone_test -p ${UT_PARALLEL} -timeout "${UT_TIMEOUT}m" -race $test_scope > $UT_REPORT + CGO_CFLAGS="${CGO_CFLAGS}" CGO_LDFLAGS="${CGO_LDFLAGS}" GOEXPERIMENT=simd go test -short -v -json -tags matrixone_test -p ${UT_PARALLEL} -timeout "${UT_TIMEOUT}m" -race $test_scope > $UT_REPORT fi } diff --git a/pkg/common/simdkernels/d128_addsub.go b/pkg/common/simdkernels/d128_addsub.go new file mode 100644 index 0000000000000..d99b55cf446c1 --- /dev/null +++ b/pkg/common/simdkernels/d128_addsub.go @@ -0,0 +1,338 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package simdkernels + +import "math/bits" + +// Decimal128 add/sub on slices of uint64 with the matrixone Decimal128 +// in-memory layout (lo, hi pair per element). The slices have length 2*N +// where N is the element count; element i occupies indices 2i (lo) and +// 2i+1 (hi). Hi is interpreted as int64 for the signed-overflow predicate. +// +// Operands and result are assumed to share the same scale. +// +// Two variants per operator (mirroring D64*): +// +// *Unchecked — wraps on overflow, no detection. +// *Checked — returns the first overflowing element index, or -1 if none. +// +// The exported variables are dispatchers; their default values are the +// scalar reference implementations and may be replaced at init time on +// amd64 when AVX2 / AVX-512 are detected (see d128_addsub_simd_amd64.go). 
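+// Illustrative sketch (not part of this change): how a []types.Decimal128
+// column maps onto the []uint64 view these kernels expect, assuming the
+// two-field layout (B0_63 = lo, B64_127 = hi) referenced below; the helper
+// name addD128 is hypothetical.
+//
+//	func addD128(a, b, r []types.Decimal128) int {
+//		if len(r) == 0 {
+//			return -1 // nothing to do, no overflow
+//		}
+//		av := unsafe.Slice((*uint64)(unsafe.Pointer(&a[0])), 2*len(a))
+//		bv := unsafe.Slice((*uint64)(unsafe.Pointer(&b[0])), 2*len(b))
+//		rv := unsafe.Slice((*uint64)(unsafe.Pointer(&r[0])), 2*len(r))
+//		// Dispatches to scalar, AVX2, or AVX-512 depending on init.
+//		return D128AddChecked(av, bv, rv) // -1, or first overflow index
+//	}
+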
+ +var ( + D128AddUnchecked func(a, b, r []uint64) = scalarD128AddUnchecked + D128SubUnchecked func(a, b, r []uint64) = scalarD128SubUnchecked + D128AddChecked func(a, b, r []uint64) int = scalarD128AddChecked + D128SubChecked func(a, b, r []uint64) int = scalarD128SubChecked + + // Scalar-broadcast variants: scalar is passed as two uint64s + // (slo = low 64 bits, shi = high 64 bits), matching the + // types.Decimal128 in-memory layout (B0_63, B64_127). + D128AddScalarUnchecked func(slo, shi uint64, v, r []uint64) = scalarD128AddScalarUnchecked + D128AddScalarChecked func(slo, shi uint64, v, r []uint64) int = scalarD128AddScalarChecked + D128SubScalarUnchecked func(v []uint64, slo, shi uint64, r []uint64) = scalarD128SubScalarUnchecked + D128SubScalarChecked func(v []uint64, slo, shi uint64, r []uint64) int = scalarD128SubScalarChecked + D128ScalarSubUnchecked func(slo, shi uint64, v, r []uint64) = scalarD128ScalarSubUnchecked + D128ScalarSubChecked func(slo, shi uint64, v, r []uint64) int = scalarD128ScalarSubChecked + + // D128SumReduce sums a contiguous slice of Decimal128 values and returns + // the 128-bit total as (lo, hi). Wraps on overflow (mod 2^128). + D128SumReduce func(v []uint64) (lo, hi uint64) = scalarD128SumReduce +) + +func scalarD128AddUnchecked(a, b, r []uint64) { + n := len(r) / 2 + if len(a) < 2*n || len(b) < 2*n { + return + } + for i := 0; i < n; i++ { + j := i << 1 + lo, c := bits.Add64(a[j], b[j], 0) + hi, _ := bits.Add64(a[j+1], b[j+1], c) + r[j] = lo + r[j+1] = hi + } +} + +func scalarD128SubUnchecked(a, b, r []uint64) { + n := len(r) / 2 + if len(a) < 2*n || len(b) < 2*n { + return + } + for i := 0; i < n; i++ { + j := i << 1 + lo, br := bits.Sub64(a[j], b[j], 0) + hi, _ := bits.Sub64(a[j+1], b[j+1], br) + r[j] = lo + r[j+1] = hi + } +} + +// d128 signed overflow on add: same as 64-bit, evaluated on the high half. +// +// signX == signY && signX != signR ⇔ ((aHi^rHi) &^ (aHi^bHi)) < 0 +func scalarD128AddChecked(a, b, r []uint64) int { + n := len(r) / 2 + if len(a) < 2*n || len(b) < 2*n { + return -1 + } + first := -1 + for i := 0; i < n; i++ { + j := i << 1 + aLo, aHi := a[j], a[j+1] + bLo, bHi := b[j], b[j+1] + lo, c := bits.Add64(aLo, bLo, 0) + hi, _ := bits.Add64(aHi, bHi, c) + r[j] = lo + r[j+1] = hi + if first < 0 { + ah, bh, rh := int64(aHi), int64(bHi), int64(hi) + if (ah^rh)&^(ah^bh) < 0 { + first = i + } + } + } + return first +} + +// d128 signed overflow on sub: +// +// signX != signY && signX != signR ⇔ ((aHi^rHi) & (aHi^bHi)) < 0 +func scalarD128SubChecked(a, b, r []uint64) int { + n := len(r) / 2 + if len(a) < 2*n || len(b) < 2*n { + return -1 + } + first := -1 + for i := 0; i < n; i++ { + j := i << 1 + aLo, aHi := a[j], a[j+1] + bLo, bHi := b[j], b[j+1] + lo, br := bits.Sub64(aLo, bLo, 0) + hi, _ := bits.Sub64(aHi, bHi, br) + r[j] = lo + r[j+1] = hi + if first < 0 { + ah, bh, rh := int64(aHi), int64(bHi), int64(hi) + if (ah^rh)&(ah^bh) < 0 { + first = i + } + } + } + return first +} + +// d128FirstOverflow rescans the first end elements (each = 2 uint64) for +// the first overflow. Used by SIMD checked variants when their accumulated +// mask reports overflow but the scalar tail did not see one. 
+func d128FirstOverflow(a, b []uint64, end int, sub bool) int { + if sub { + for i := 0; i < end; i++ { + j := i << 1 + lo, br := bits.Sub64(a[j], b[j], 0) + hi, _ := bits.Sub64(a[j+1], b[j+1], br) + _ = lo + ah, bh, rh := int64(a[j+1]), int64(b[j+1]), int64(hi) + if (ah^rh)&(ah^bh) < 0 { + return i + } + } + return -1 + } + for i := 0; i < end; i++ { + j := i << 1 + lo, c := bits.Add64(a[j], b[j], 0) + hi, _ := bits.Add64(a[j+1], b[j+1], c) + _ = lo + ah, bh, rh := int64(a[j+1]), int64(b[j+1]), int64(hi) + if (ah^rh)&^(ah^bh) < 0 { + return i + } + } + return -1 +} + +// --------------------------------------------------------------------------- +// Scalar-broadcast reference implementations. +// --------------------------------------------------------------------------- + +func scalarD128AddScalarUnchecked(slo, shi uint64, v, r []uint64) { + n := len(r) / 2 + if len(v) < 2*n { + return + } + for i := 0; i < n; i++ { + j := i << 1 + lo, c := bits.Add64(slo, v[j], 0) + hi, _ := bits.Add64(shi, v[j+1], c) + r[j] = lo + r[j+1] = hi + } +} + +func scalarD128SubScalarUnchecked(v []uint64, slo, shi uint64, r []uint64) { + n := len(r) / 2 + if len(v) < 2*n { + return + } + for i := 0; i < n; i++ { + j := i << 1 + lo, br := bits.Sub64(v[j], slo, 0) + hi, _ := bits.Sub64(v[j+1], shi, br) + r[j] = lo + r[j+1] = hi + } +} + +func scalarD128ScalarSubUnchecked(slo, shi uint64, v, r []uint64) { + n := len(r) / 2 + if len(v) < 2*n { + return + } + for i := 0; i < n; i++ { + j := i << 1 + lo, br := bits.Sub64(slo, v[j], 0) + hi, _ := bits.Sub64(shi, v[j+1], br) + r[j] = lo + r[j+1] = hi + } +} + +func scalarD128AddScalarChecked(slo, shi uint64, v, r []uint64) int { + n := len(r) / 2 + if len(v) < 2*n { + return -1 + } + first := -1 + sh := int64(shi) + for i := 0; i < n; i++ { + j := i << 1 + vLo, vHi := v[j], v[j+1] + lo, c := bits.Add64(slo, vLo, 0) + hi, _ := bits.Add64(shi, vHi, c) + r[j] = lo + r[j+1] = hi + if first < 0 { + vh, rh := int64(vHi), int64(hi) + if (sh^rh)&^(sh^vh) < 0 { + first = i + } + } + } + return first +} + +func scalarD128SubScalarChecked(v []uint64, slo, shi uint64, r []uint64) int { + n := len(r) / 2 + if len(v) < 2*n { + return -1 + } + first := -1 + sh := int64(shi) + for i := 0; i < n; i++ { + j := i << 1 + vLo, vHi := v[j], v[j+1] + lo, br := bits.Sub64(vLo, slo, 0) + hi, _ := bits.Sub64(vHi, shi, br) + r[j] = lo + r[j+1] = hi + if first < 0 { + vh, rh := int64(vHi), int64(hi) + if (vh^rh)&(vh^sh) < 0 { + first = i + } + } + } + return first +} + +func scalarD128ScalarSubChecked(slo, shi uint64, v, r []uint64) int { + n := len(r) / 2 + if len(v) < 2*n { + return -1 + } + first := -1 + sh := int64(shi) + for i := 0; i < n; i++ { + j := i << 1 + vLo, vHi := v[j], v[j+1] + lo, br := bits.Sub64(slo, vLo, 0) + hi, _ := bits.Sub64(shi, vHi, br) + r[j] = lo + r[j+1] = hi + if first < 0 { + vh, rh := int64(vHi), int64(hi) + if (sh^rh)&(sh^vh) < 0 { + first = i + } + } + } + return first +} + +// d128ScalarFirstOverflow rescans first end elements for the first overflow +// in scalar-broadcast operations. kind: 0=AddScalar, 1=SubScalar (v-s), +// 2=ScalarSub (s-v). 
+func d128ScalarFirstOverflow(slo, shi uint64, v []uint64, end int, kind int) int { + sh := int64(shi) + switch kind { + case 0: + for i := 0; i < end; i++ { + j := i << 1 + vLo, vHi := v[j], v[j+1] + _, c := bits.Add64(slo, vLo, 0) + hi, _ := bits.Add64(shi, vHi, c) + vh, rh := int64(vHi), int64(hi) + if (sh^rh)&^(sh^vh) < 0 { + return i + } + } + case 1: + for i := 0; i < end; i++ { + j := i << 1 + vLo, vHi := v[j], v[j+1] + _, br := bits.Sub64(vLo, slo, 0) + hi, _ := bits.Sub64(vHi, shi, br) + vh, rh := int64(vHi), int64(hi) + if (vh^rh)&(vh^sh) < 0 { + return i + } + } + case 2: + for i := 0; i < end; i++ { + j := i << 1 + vLo, vHi := v[j], v[j+1] + _, br := bits.Sub64(slo, vLo, 0) + hi, _ := bits.Sub64(shi, vHi, br) + vh, rh := int64(vHi), int64(hi) + if (sh^rh)&(sh^vh) < 0 { + return i + } + } + } + return -1 +} + +func scalarD128SumReduce(v []uint64) (lo, hi uint64) { + n := len(v) >> 1 + for i := 0; i < n; i++ { + j := i << 1 + var c uint64 + lo, c = bits.Add64(lo, v[j], 0) + hi, _ = bits.Add64(hi, v[j+1], c) + } + return +} diff --git a/pkg/common/simdkernels/d128_addsub_simd_amd64.go b/pkg/common/simdkernels/d128_addsub_simd_amd64.go new file mode 100644 index 0000000000000..e5f2c3357d34d --- /dev/null +++ b/pkg/common/simdkernels/d128_addsub_simd_amd64.go @@ -0,0 +1,1225 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build goexperiment.simd && amd64 + +package simdkernels + +import ( + "math/bits" + "simd/archsimd" + "unsafe" + + "golang.org/x/sys/cpu" +) + +// signBit128 is used to convert unsigned int64 compares to the available +// signed Less by flipping every lane's MSB. Stored as the int64 value +// -1<<63, which has only the sign bit set (writing 1<<63 directly would +// overflow an untyped int constant). 
+const signBit128 int64 = -1 << 63 + +func init() { + switch { + case cpu.X86.HasAVX512: + D128AddUnchecked = avx512D128AddUnchecked + D128SubUnchecked = avx512D128SubUnchecked + D128AddChecked = avx512D128AddChecked + D128SubChecked = avx512D128SubChecked + D128AddScalarUnchecked = avx512D128AddScalarUnchecked + D128SubScalarUnchecked = avx512D128SubScalarUnchecked + D128ScalarSubUnchecked = avx512D128ScalarSubUnchecked + D128AddScalarChecked = avx512D128AddScalarChecked + D128SubScalarChecked = avx512D128SubScalarChecked + D128ScalarSubChecked = avx512D128ScalarSubChecked + D128SumReduce = avx512D128SumReduce + case cpu.X86.HasAVX2: + D128AddUnchecked = avx2D128AddUnchecked + D128SubUnchecked = avx2D128SubUnchecked + D128AddChecked = avx2D128AddChecked + D128SubChecked = avx2D128SubChecked + D128AddScalarUnchecked = avx2D128AddScalarUnchecked + D128SubScalarUnchecked = avx2D128SubScalarUnchecked + D128ScalarSubUnchecked = avx2D128ScalarSubUnchecked + D128AddScalarChecked = avx2D128AddScalarChecked + D128SubScalarChecked = avx2D128SubScalarChecked + D128ScalarSubChecked = avx2D128ScalarSubChecked + D128SumReduce = avx2D128SumReduce + } +} + +// --------------------------------------------------------------------------- +// AVX2 path: each Decimal128 = 2 uint64 = 16 B. We process 4 elements per +// kernel iteration (= 64 B per input). Layout in two Int64x4 loads: +// +// vec0 = [a0.lo, a0.hi, a1.lo, a1.hi] +// vec1 = [a2.lo, a2.hi, a3.lo, a3.hi] +// +// InterleaveLoGrouped/HiGrouped (VPUNPCKLQDQ/VPUNPCKHQDQ) split into: +// +// los = [a0.lo, a2.lo, a1.lo, a3.lo] (= permutation [0,2,1,3] of a*.lo) +// his = [a0.hi, a2.hi, a1.hi, a3.hi] (same permutation of a*.hi) +// +// Both operands and the result use the same permutation, so reinterleaving +// (los, his) with the same two instructions restores the original order. +// Carry: SIMD has no unsigned int64 compare; flip the MSB on both inputs +// and use signed Less. Carry mask is -1 in overflowing lanes, so +// `hi - carryMask` is `hi + 1` exactly where carry is set. +// --------------------------------------------------------------------------- + +// avx2D128AddCarry computes [los, his] = [aLo, aHi] + [bLo, bHi] (128-bit +// across lanes). aLo/bLo/aHi/bHi are already in the deinterleaved permuted +// form. Used by both Unchecked and Checked variants. +// +//go:nosplit +func avx2D128AddCarry(aLo, aHi, bLo, bHi, sb archsimd.Int64x4) (rLo, rHi archsimd.Int64x4) { + rLo = aLo.Add(bLo) + carryMask := rLo.Xor(sb).Less(aLo.Xor(sb)).ToInt64x4() + rHi = aHi.Add(bHi).Sub(carryMask) + return +} + +//go:nosplit +func avx2D128SubBorrow(aLo, aHi, bLo, bHi, sb archsimd.Int64x4) (rLo, rHi archsimd.Int64x4) { + rLo = aLo.Sub(bLo) + // Borrow iff aLo < bLo (unsigned). borrowMask is -1 per borrowing lane, + // and `hi + borrowMask` equals `hi - 1` there. 
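+	// Worked example: aLo=5, bLo=7 wraps rLo to 2^64-2; after the MSB flip
+	// the signed Less sees 5 < 7, that lane's mask becomes -1, and
+	// Add(borrowMask) lowers the high word by exactly one.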
+ borrowMask := aLo.Xor(sb).Less(bLo.Xor(sb)).ToInt64x4() + rHi = aHi.Sub(bHi).Add(borrowMask) + return +} + +func avx2D128AddUnchecked(a, b, r []uint64) { + n := len(r) / 2 + if n == 0 || len(a) < 2*n || len(b) < 2*n { + return + } + pa, pb, pr := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x4(signBit128) + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 16 + a0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off))) + a1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off+32))) + b0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off))) + b1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off+32))) + + aLo := a0.InterleaveLoGrouped(a1) + aHi := a0.InterleaveHiGrouped(a1) + bLo := b0.InterleaveLoGrouped(b1) + bHi := b0.InterleaveHiGrouped(b1) + + rLo, rHi := avx2D128AddCarry(aLo, aHi, bLo, bHi, sb) + + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[4]int64)(unsafe.Add(pr, off))) + r1.Store((*[4]int64)(unsafe.Add(pr, off+32))) + } + for ; i < n; i++ { + j := i << 1 + lo := a[j] + b[j] + var c uint64 + if lo < a[j] { + c = 1 + } + r[j] = lo + r[j+1] = a[j+1] + b[j+1] + c + } +} + +func avx2D128SubUnchecked(a, b, r []uint64) { + n := len(r) / 2 + if n == 0 || len(a) < 2*n || len(b) < 2*n { + return + } + pa, pb, pr := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x4(signBit128) + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 16 + a0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off))) + a1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off+32))) + b0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off))) + b1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off+32))) + + aLo := a0.InterleaveLoGrouped(a1) + aHi := a0.InterleaveHiGrouped(a1) + bLo := b0.InterleaveLoGrouped(b1) + bHi := b0.InterleaveHiGrouped(b1) + + rLo, rHi := avx2D128SubBorrow(aLo, aHi, bLo, bHi, sb) + + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[4]int64)(unsafe.Add(pr, off))) + r1.Store((*[4]int64)(unsafe.Add(pr, off+32))) + } + for ; i < n; i++ { + j := i << 1 + var br uint64 + if a[j] < b[j] { + br = 1 + } + r[j] = a[j] - b[j] + r[j+1] = a[j+1] - b[j+1] - br + } +} + +func avx2D128AddChecked(a, b, r []uint64) int { + n := len(r) / 2 + if n == 0 || len(a) < 2*n || len(b) < 2*n { + return -1 + } + pa, pb, pr := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x4(signBit128) + + var ofAcc archsimd.Int64x4 + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 16 + a0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off))) + a1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off+32))) + b0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off))) + b1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off+32))) + + aLo := a0.InterleaveLoGrouped(a1) + aHi := a0.InterleaveHiGrouped(a1) + bLo := b0.InterleaveLoGrouped(b1) + bHi := b0.InterleaveHiGrouped(b1) + + rLo, rHi := avx2D128AddCarry(aLo, aHi, bLo, bHi, sb) + // 128-bit add overflow predicate is the same as 64-bit, evaluated on + // the high words after carry propagation. 
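+		// A lane overflows iff sign(aHi)==sign(bHi) && sign(rHi)!=sign(aHi),
+		// i.e. ((aHi^rHi) &^ (aHi^bHi)) has its sign bit set; OR-ing that
+		// word into ofAcc means a single post-loop sign test (Less(zero))
+		// detects whether any lane of any iteration overflowed.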
+ ofAcc = ofAcc.Or(aHi.Xor(rHi).AndNot(aHi.Xor(bHi))) + + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[4]int64)(unsafe.Add(pr, off))) + r1.Store((*[4]int64)(unsafe.Add(pr, off+32))) + } + vecEnd := i + + zero := archsimd.BroadcastInt64x4(0) + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + for ; i < n; i++ { + j := i << 1 + aLo, aHi := a[j], a[j+1] + bLo, bHi := b[j], b[j+1] + lo := aLo + bLo + var c uint64 + if lo < aLo { + c = 1 + } + hi := aHi + bHi + c + r[j] = lo + r[j+1] = hi + ah, bh, rh := int64(aHi), int64(bHi), int64(hi) + if (ah^rh)&^(ah^bh) < 0 { + if vecOverflow { + return d128FirstOverflow(a, b, vecEnd, false) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d128FirstOverflow(a, b, vecEnd, false) +} + +func avx2D128SubChecked(a, b, r []uint64) int { + n := len(r) / 2 + if n == 0 || len(a) < 2*n || len(b) < 2*n { + return -1 + } + pa, pb, pr := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x4(signBit128) + + var ofAcc archsimd.Int64x4 + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 16 + a0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off))) + a1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off+32))) + b0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off))) + b1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off+32))) + + aLo := a0.InterleaveLoGrouped(a1) + aHi := a0.InterleaveHiGrouped(a1) + bLo := b0.InterleaveLoGrouped(b1) + bHi := b0.InterleaveHiGrouped(b1) + + rLo, rHi := avx2D128SubBorrow(aLo, aHi, bLo, bHi, sb) + ofAcc = ofAcc.Or(aHi.Xor(rHi).And(aHi.Xor(bHi))) + + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[4]int64)(unsafe.Add(pr, off))) + r1.Store((*[4]int64)(unsafe.Add(pr, off+32))) + } + vecEnd := i + + zero := archsimd.BroadcastInt64x4(0) + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + for ; i < n; i++ { + j := i << 1 + aLo, aHi := a[j], a[j+1] + bLo, bHi := b[j], b[j+1] + var br uint64 + if aLo < bLo { + br = 1 + } + lo := aLo - bLo + hi := aHi - bHi - br + r[j] = lo + r[j+1] = hi + ah, bh, rh := int64(aHi), int64(bHi), int64(hi) + if (ah^rh)&(ah^bh) < 0 { + if vecOverflow { + return d128FirstOverflow(a, b, vecEnd, true) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d128FirstOverflow(a, b, vecEnd, true) +} + +// --------------------------------------------------------------------------- +// AVX-512 path: Int64x8 = 8 lanes = 4 D128 elements per vector. Processes 8 +// elements per kernel iteration (two Int64x8 = 128 B per input). Same +// deinterleave / carry-propagate / reinterleave pattern as AVX2; the only +// changes are the lane width and a wider scalar tail. 
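+//
+// For reference (assuming VPUNPCKLQDQ/VPUNPCKHQDQ keep their per-128-bit-lane
+// behaviour on Int64x8), one iteration's lane picture is:
+//
+//	vec0 = [a0.lo,a0.hi, a1.lo,a1.hi, a2.lo,a2.hi, a3.lo,a3.hi]
+//	vec1 = [a4.lo,a4.hi, a5.lo,a5.hi, a6.lo,a6.hi, a7.lo,a7.hi]
+//	los  = [a0.lo,a4.lo, a1.lo,a5.lo, a2.lo,a6.lo, a3.lo,a7.lo]
+//
+// As on AVX2, re-unpacking (los, his) restores the original element order.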
+// --------------------------------------------------------------------------- + +//go:nosplit +func avx512D128AddCarry(aLo, aHi, bLo, bHi, sb archsimd.Int64x8) (rLo, rHi archsimd.Int64x8) { + rLo = aLo.Add(bLo) + carryMask := rLo.Xor(sb).Less(aLo.Xor(sb)).ToInt64x8() + rHi = aHi.Add(bHi).Sub(carryMask) + return +} + +//go:nosplit +func avx512D128SubBorrow(aLo, aHi, bLo, bHi, sb archsimd.Int64x8) (rLo, rHi archsimd.Int64x8) { + rLo = aLo.Sub(bLo) + borrowMask := aLo.Xor(sb).Less(bLo.Xor(sb)).ToInt64x8() + rHi = aHi.Sub(bHi).Add(borrowMask) + return +} + +func avx512D128AddUnchecked(a, b, r []uint64) { + n := len(r) / 2 + if n == 0 || len(a) < 2*n || len(b) < 2*n { + return + } + pa, pb, pr := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x8(signBit128) + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 16 + a0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off))) + a1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off+64))) + b0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off))) + b1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off+64))) + + aLo := a0.InterleaveLoGrouped(a1) + aHi := a0.InterleaveHiGrouped(a1) + bLo := b0.InterleaveLoGrouped(b1) + bHi := b0.InterleaveHiGrouped(b1) + + rLo, rHi := avx512D128AddCarry(aLo, aHi, bLo, bHi, sb) + + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[8]int64)(unsafe.Add(pr, off))) + r1.Store((*[8]int64)(unsafe.Add(pr, off+64))) + } + for ; i < n; i++ { + j := i << 1 + lo := a[j] + b[j] + var c uint64 + if lo < a[j] { + c = 1 + } + r[j] = lo + r[j+1] = a[j+1] + b[j+1] + c + } +} + +func avx512D128SubUnchecked(a, b, r []uint64) { + n := len(r) / 2 + if n == 0 || len(a) < 2*n || len(b) < 2*n { + return + } + pa, pb, pr := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x8(signBit128) + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 16 + a0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off))) + a1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off+64))) + b0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off))) + b1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off+64))) + + aLo := a0.InterleaveLoGrouped(a1) + aHi := a0.InterleaveHiGrouped(a1) + bLo := b0.InterleaveLoGrouped(b1) + bHi := b0.InterleaveHiGrouped(b1) + + rLo, rHi := avx512D128SubBorrow(aLo, aHi, bLo, bHi, sb) + + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[8]int64)(unsafe.Add(pr, off))) + r1.Store((*[8]int64)(unsafe.Add(pr, off+64))) + } + for ; i < n; i++ { + j := i << 1 + var br uint64 + if a[j] < b[j] { + br = 1 + } + r[j] = a[j] - b[j] + r[j+1] = a[j+1] - b[j+1] - br + } +} + +func avx512D128AddChecked(a, b, r []uint64) int { + n := len(r) / 2 + if n == 0 || len(a) < 2*n || len(b) < 2*n { + return -1 + } + pa, pb, pr := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x8(signBit128) + + var ofAcc archsimd.Int64x8 + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 16 + a0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off))) + a1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off+64))) + b0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off))) + b1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off+64))) + + aLo := a0.InterleaveLoGrouped(a1) + aHi := a0.InterleaveHiGrouped(a1) + bLo := b0.InterleaveLoGrouped(b1) + bHi := b0.InterleaveHiGrouped(b1) + + rLo, rHi := 
avx512D128AddCarry(aLo, aHi, bLo, bHi, sb) + ofAcc = ofAcc.Or(aHi.Xor(rHi).AndNot(aHi.Xor(bHi))) + + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[8]int64)(unsafe.Add(pr, off))) + r1.Store((*[8]int64)(unsafe.Add(pr, off+64))) + } + vecEnd := i + + zero := archsimd.BroadcastInt64x8(0) + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + for ; i < n; i++ { + j := i << 1 + aLo, aHi := a[j], a[j+1] + bLo, bHi := b[j], b[j+1] + lo := aLo + bLo + var c uint64 + if lo < aLo { + c = 1 + } + hi := aHi + bHi + c + r[j] = lo + r[j+1] = hi + ah, bh, rh := int64(aHi), int64(bHi), int64(hi) + if (ah^rh)&^(ah^bh) < 0 { + if vecOverflow { + return d128FirstOverflow(a, b, vecEnd, false) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d128FirstOverflow(a, b, vecEnd, false) +} + +func avx512D128SubChecked(a, b, r []uint64) int { + n := len(r) / 2 + if n == 0 || len(a) < 2*n || len(b) < 2*n { + return -1 + } + pa, pb, pr := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x8(signBit128) + + var ofAcc archsimd.Int64x8 + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 16 + a0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off))) + a1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off+64))) + b0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off))) + b1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off+64))) + + aLo := a0.InterleaveLoGrouped(a1) + aHi := a0.InterleaveHiGrouped(a1) + bLo := b0.InterleaveLoGrouped(b1) + bHi := b0.InterleaveHiGrouped(b1) + + rLo, rHi := avx512D128SubBorrow(aLo, aHi, bLo, bHi, sb) + ofAcc = ofAcc.Or(aHi.Xor(rHi).And(aHi.Xor(bHi))) + + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[8]int64)(unsafe.Add(pr, off))) + r1.Store((*[8]int64)(unsafe.Add(pr, off+64))) + } + vecEnd := i + + zero := archsimd.BroadcastInt64x8(0) + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + for ; i < n; i++ { + j := i << 1 + aLo, aHi := a[j], a[j+1] + bLo, bHi := b[j], b[j+1] + var br uint64 + if aLo < bLo { + br = 1 + } + lo := aLo - bLo + hi := aHi - bHi - br + r[j] = lo + r[j+1] = hi + ah, bh, rh := int64(aHi), int64(bHi), int64(hi) + if (ah^rh)&(ah^bh) < 0 { + if vecOverflow { + return d128FirstOverflow(a, b, vecEnd, true) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d128FirstOverflow(a, b, vecEnd, true) +} + +// --------------------------------------------------------------------------- +// AVX2 broadcast variants. Scalar (slo, shi) is broadcast once outside the +// loop; only the vector operand is loaded each iteration. Layout/permutation +// notes from the vec+vec path apply identically — the broadcast vectors are +// uniform, so deinterleaving them is a no-op (same value in every lane).
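+// (With a broadcast operand every lane already holds the same value, so it
+// is valid on either side of the permuted los/his vectors; the (bLo, bHi)
+// pair is therefore built once and reused unchanged each iteration.)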
+// --------------------------------------------------------------------------- + +func avx2D128AddScalarUnchecked(slo, shi uint64, v, r []uint64) { + n := len(r) / 2 + if n == 0 || len(v) < 2*n { + return + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x4(signBit128) + bLo := archsimd.BroadcastInt64x4(int64(slo)) + bHi := archsimd.BroadcastInt64x4(int64(shi)) + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 16 + v0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+32))) + + aLo := v0.InterleaveLoGrouped(v1) + aHi := v0.InterleaveHiGrouped(v1) + + rLo, rHi := avx2D128AddCarry(aLo, aHi, bLo, bHi, sb) + + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[4]int64)(unsafe.Add(pr, off))) + r1.Store((*[4]int64)(unsafe.Add(pr, off+32))) + } + for ; i < n; i++ { + j := i << 1 + lo := slo + v[j] + var c uint64 + if lo < slo { + c = 1 + } + r[j] = lo + r[j+1] = shi + v[j+1] + c + } +} + +func avx2D128SubScalarUnchecked(v []uint64, slo, shi uint64, r []uint64) { + n := len(r) / 2 + if n == 0 || len(v) < 2*n { + return + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x4(signBit128) + bLo := archsimd.BroadcastInt64x4(int64(slo)) + bHi := archsimd.BroadcastInt64x4(int64(shi)) + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 16 + v0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+32))) + + aLo := v0.InterleaveLoGrouped(v1) + aHi := v0.InterleaveHiGrouped(v1) + + rLo, rHi := avx2D128SubBorrow(aLo, aHi, bLo, bHi, sb) + + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[4]int64)(unsafe.Add(pr, off))) + r1.Store((*[4]int64)(unsafe.Add(pr, off+32))) + } + for ; i < n; i++ { + j := i << 1 + var br uint64 + if v[j] < slo { + br = 1 + } + r[j] = v[j] - slo + r[j+1] = v[j+1] - shi - br + } +} + +func avx2D128ScalarSubUnchecked(slo, shi uint64, v, r []uint64) { + n := len(r) / 2 + if n == 0 || len(v) < 2*n { + return + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x4(signBit128) + aLo := archsimd.BroadcastInt64x4(int64(slo)) + aHi := archsimd.BroadcastInt64x4(int64(shi)) + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 16 + v0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+32))) + + bLo := v0.InterleaveLoGrouped(v1) + bHi := v0.InterleaveHiGrouped(v1) + + rLo, rHi := avx2D128SubBorrow(aLo, aHi, bLo, bHi, sb) + + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[4]int64)(unsafe.Add(pr, off))) + r1.Store((*[4]int64)(unsafe.Add(pr, off+32))) + } + for ; i < n; i++ { + j := i << 1 + var br uint64 + if slo < v[j] { + br = 1 + } + r[j] = slo - v[j] + r[j+1] = shi - v[j+1] - br + } +} + +func avx2D128AddScalarChecked(slo, shi uint64, v, r []uint64) int { + n := len(r) / 2 + if n == 0 || len(v) < 2*n { + return -1 + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x4(signBit128) + bLo := archsimd.BroadcastInt64x4(int64(slo)) + bHi := archsimd.BroadcastInt64x4(int64(shi)) + + var ofAcc archsimd.Int64x4 + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 16 + v0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+32))) + + aLo := 
v0.InterleaveLoGrouped(v1) + aHi := v0.InterleaveHiGrouped(v1) + + rLo, rHi := avx2D128AddCarry(aLo, aHi, bLo, bHi, sb) + ofAcc = ofAcc.Or(aHi.Xor(rHi).AndNot(aHi.Xor(bHi))) + + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[4]int64)(unsafe.Add(pr, off))) + r1.Store((*[4]int64)(unsafe.Add(pr, off+32))) + } + vecEnd := i + + zero := archsimd.BroadcastInt64x4(0) + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + sh := int64(shi) + for ; i < n; i++ { + j := i << 1 + vLo, vHi := v[j], v[j+1] + lo := slo + vLo + var c uint64 + if lo < slo { + c = 1 + } + hi := shi + vHi + c + r[j] = lo + r[j+1] = hi + vh, rh := int64(vHi), int64(hi) + if (sh^rh)&^(sh^vh) < 0 { + if vecOverflow { + return d128ScalarFirstOverflow(slo, shi, v, vecEnd, 0) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d128ScalarFirstOverflow(slo, shi, v, vecEnd, 0) +} + +func avx2D128SubScalarChecked(v []uint64, slo, shi uint64, r []uint64) int { + n := len(r) / 2 + if n == 0 || len(v) < 2*n { + return -1 + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x4(signBit128) + bLo := archsimd.BroadcastInt64x4(int64(slo)) + bHi := archsimd.BroadcastInt64x4(int64(shi)) + + var ofAcc archsimd.Int64x4 + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 16 + v0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+32))) + + aLo := v0.InterleaveLoGrouped(v1) + aHi := v0.InterleaveHiGrouped(v1) + + rLo, rHi := avx2D128SubBorrow(aLo, aHi, bLo, bHi, sb) + ofAcc = ofAcc.Or(aHi.Xor(rHi).And(aHi.Xor(bHi))) + + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[4]int64)(unsafe.Add(pr, off))) + r1.Store((*[4]int64)(unsafe.Add(pr, off+32))) + } + vecEnd := i + + zero := archsimd.BroadcastInt64x4(0) + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + sh := int64(shi) + for ; i < n; i++ { + j := i << 1 + vLo, vHi := v[j], v[j+1] + var br uint64 + if vLo < slo { + br = 1 + } + lo := vLo - slo + hi := vHi - shi - br + r[j] = lo + r[j+1] = hi + vh, rh := int64(vHi), int64(hi) + if (vh^rh)&(vh^sh) < 0 { + if vecOverflow { + return d128ScalarFirstOverflow(slo, shi, v, vecEnd, 1) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d128ScalarFirstOverflow(slo, shi, v, vecEnd, 1) +} + +func avx2D128ScalarSubChecked(slo, shi uint64, v, r []uint64) int { + n := len(r) / 2 + if n == 0 || len(v) < 2*n { + return -1 + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x4(signBit128) + aLo := archsimd.BroadcastInt64x4(int64(slo)) + aHi := archsimd.BroadcastInt64x4(int64(shi)) + + var ofAcc archsimd.Int64x4 + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 16 + v0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+32))) + + bLo := v0.InterleaveLoGrouped(v1) + bHi := v0.InterleaveHiGrouped(v1) + + rLo, rHi := avx2D128SubBorrow(aLo, aHi, bLo, bHi, sb) + ofAcc = ofAcc.Or(aHi.Xor(rHi).And(aHi.Xor(bHi))) + + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[4]int64)(unsafe.Add(pr, off))) + r1.Store((*[4]int64)(unsafe.Add(pr, off+32))) + } + vecEnd := i + + zero := archsimd.BroadcastInt64x4(0) + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + sh := int64(shi) + for ; i < n; i++ { + j := i << 1 + vLo, vHi := v[j], v[j+1] + var br uint64 + if slo < vLo { + br = 1 + } + lo := slo - 
vLo + hi := shi - vHi - br + r[j] = lo + r[j+1] = hi + vh, rh := int64(vHi), int64(hi) + if (sh^rh)&(sh^vh) < 0 { + if vecOverflow { + return d128ScalarFirstOverflow(slo, shi, v, vecEnd, 2) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d128ScalarFirstOverflow(slo, shi, v, vecEnd, 2) +} + +// --------------------------------------------------------------------------- +// AVX-512 broadcast variants. Same structure as AVX2 with Int64x8 lanes +// (8 D128 elements per kernel iteration). +// --------------------------------------------------------------------------- + +func avx512D128AddScalarUnchecked(slo, shi uint64, v, r []uint64) { + n := len(r) / 2 + if n == 0 || len(v) < 2*n { + return + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x8(signBit128) + bLo := archsimd.BroadcastInt64x8(int64(slo)) + bHi := archsimd.BroadcastInt64x8(int64(shi)) + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 16 + v0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+64))) + + aLo := v0.InterleaveLoGrouped(v1) + aHi := v0.InterleaveHiGrouped(v1) + + rLo, rHi := avx512D128AddCarry(aLo, aHi, bLo, bHi, sb) + + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[8]int64)(unsafe.Add(pr, off))) + r1.Store((*[8]int64)(unsafe.Add(pr, off+64))) + } + for ; i < n; i++ { + j := i << 1 + lo := slo + v[j] + var c uint64 + if lo < slo { + c = 1 + } + r[j] = lo + r[j+1] = shi + v[j+1] + c + } +} + +func avx512D128SubScalarUnchecked(v []uint64, slo, shi uint64, r []uint64) { + n := len(r) / 2 + if n == 0 || len(v) < 2*n { + return + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x8(signBit128) + bLo := archsimd.BroadcastInt64x8(int64(slo)) + bHi := archsimd.BroadcastInt64x8(int64(shi)) + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 16 + v0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+64))) + + aLo := v0.InterleaveLoGrouped(v1) + aHi := v0.InterleaveHiGrouped(v1) + + rLo, rHi := avx512D128SubBorrow(aLo, aHi, bLo, bHi, sb) + + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[8]int64)(unsafe.Add(pr, off))) + r1.Store((*[8]int64)(unsafe.Add(pr, off+64))) + } + for ; i < n; i++ { + j := i << 1 + var br uint64 + if v[j] < slo { + br = 1 + } + r[j] = v[j] - slo + r[j+1] = v[j+1] - shi - br + } +} + +func avx512D128ScalarSubUnchecked(slo, shi uint64, v, r []uint64) { + n := len(r) / 2 + if n == 0 || len(v) < 2*n { + return + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x8(signBit128) + aLo := archsimd.BroadcastInt64x8(int64(slo)) + aHi := archsimd.BroadcastInt64x8(int64(shi)) + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 16 + v0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+64))) + + bLo := v0.InterleaveLoGrouped(v1) + bHi := v0.InterleaveHiGrouped(v1) + + rLo, rHi := avx512D128SubBorrow(aLo, aHi, bLo, bHi, sb) + + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[8]int64)(unsafe.Add(pr, off))) + r1.Store((*[8]int64)(unsafe.Add(pr, off+64))) + } + for ; i < n; i++ { + j := i << 1 + var br uint64 + if slo < v[j] { + br = 1 + } + r[j] = slo - v[j] + r[j+1] = shi - v[j+1] - br + } +} + +func avx512D128AddScalarChecked(slo, shi uint64, 
v, r []uint64) int { + n := len(r) / 2 + if n == 0 || len(v) < 2*n { + return -1 + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x8(signBit128) + bLo := archsimd.BroadcastInt64x8(int64(slo)) + bHi := archsimd.BroadcastInt64x8(int64(shi)) + + var ofAcc archsimd.Int64x8 + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 16 + v0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+64))) + + aLo := v0.InterleaveLoGrouped(v1) + aHi := v0.InterleaveHiGrouped(v1) + + rLo, rHi := avx512D128AddCarry(aLo, aHi, bLo, bHi, sb) + ofAcc = ofAcc.Or(aHi.Xor(rHi).AndNot(aHi.Xor(bHi))) + + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[8]int64)(unsafe.Add(pr, off))) + r1.Store((*[8]int64)(unsafe.Add(pr, off+64))) + } + vecEnd := i + + zero := archsimd.BroadcastInt64x8(0) + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + sh := int64(shi) + for ; i < n; i++ { + j := i << 1 + vLo, vHi := v[j], v[j+1] + lo := slo + vLo + var c uint64 + if lo < slo { + c = 1 + } + hi := shi + vHi + c + r[j] = lo + r[j+1] = hi + vh, rh := int64(vHi), int64(hi) + if (sh^rh)&^(sh^vh) < 0 { + if vecOverflow { + return d128ScalarFirstOverflow(slo, shi, v, vecEnd, 0) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d128ScalarFirstOverflow(slo, shi, v, vecEnd, 0) +} + +func avx512D128SubScalarChecked(v []uint64, slo, shi uint64, r []uint64) int { + n := len(r) / 2 + if n == 0 || len(v) < 2*n { + return -1 + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x8(signBit128) + bLo := archsimd.BroadcastInt64x8(int64(slo)) + bHi := archsimd.BroadcastInt64x8(int64(shi)) + + var ofAcc archsimd.Int64x8 + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 16 + v0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+64))) + + aLo := v0.InterleaveLoGrouped(v1) + aHi := v0.InterleaveHiGrouped(v1) + + rLo, rHi := avx512D128SubBorrow(aLo, aHi, bLo, bHi, sb) + ofAcc = ofAcc.Or(aHi.Xor(rHi).And(aHi.Xor(bHi))) + + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[8]int64)(unsafe.Add(pr, off))) + r1.Store((*[8]int64)(unsafe.Add(pr, off+64))) + } + vecEnd := i + + zero := archsimd.BroadcastInt64x8(0) + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + sh := int64(shi) + for ; i < n; i++ { + j := i << 1 + vLo, vHi := v[j], v[j+1] + var br uint64 + if vLo < slo { + br = 1 + } + lo := vLo - slo + hi := vHi - shi - br + r[j] = lo + r[j+1] = hi + vh, rh := int64(vHi), int64(hi) + if (vh^rh)&(vh^sh) < 0 { + if vecOverflow { + return d128ScalarFirstOverflow(slo, shi, v, vecEnd, 1) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d128ScalarFirstOverflow(slo, shi, v, vecEnd, 1) +} + +func avx512D128ScalarSubChecked(slo, shi uint64, v, r []uint64) int { + n := len(r) / 2 + if n == 0 || len(v) < 2*n { + return -1 + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x8(signBit128) + aLo := archsimd.BroadcastInt64x8(int64(slo)) + aHi := archsimd.BroadcastInt64x8(int64(shi)) + + var ofAcc archsimd.Int64x8 + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 16 + v0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+64))) + + bLo := v0.InterleaveLoGrouped(v1) + bHi := v0.InterleaveHiGrouped(v1) + + rLo,
rHi := avx512D128SubBorrow(aLo, aHi, bLo, bHi, sb) + ofAcc = ofAcc.Or(aHi.Xor(rHi).And(aHi.Xor(bHi))) + + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[8]int64)(unsafe.Add(pr, off))) + r1.Store((*[8]int64)(unsafe.Add(pr, off+64))) + } + vecEnd := i + + zero := archsimd.BroadcastInt64x8(0) + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + sh := int64(shi) + for ; i < n; i++ { + j := i << 1 + vLo, vHi := v[j], v[j+1] + var br uint64 + if slo < vLo { + br = 1 + } + lo := slo - vLo + hi := shi - vHi - br + r[j] = lo + r[j+1] = hi + vh, rh := int64(vHi), int64(hi) + if (sh^rh)&(sh^vh) < 0 { + if vecOverflow { + return d128ScalarFirstOverflow(slo, shi, v, vecEnd, 2) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d128ScalarFirstOverflow(slo, shi, v, vecEnd, 2) +} + +// avx2D128SumReduce sums a slice of Decimal128 values laid out as +// [lo, hi, lo, hi, ...] and returns the 128-bit total (lo, hi). +// Wraps on overflow (mod 2^128). +func avx2D128SumReduce(v []uint64) (uint64, uint64) { + n := len(v) >> 1 + if n == 0 { + return 0, 0 + } + pv := unsafe.Pointer(&v[0]) + sb := archsimd.BroadcastInt64x4(signBit128) + + zero := archsimd.BroadcastInt64x4(0) + accLo := zero + accHi := zero + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 16 + v0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+32))) + bLo := v0.InterleaveLoGrouped(v1) + bHi := v0.InterleaveHiGrouped(v1) + + newLo := accLo.Add(bLo) + // Carry mask: -1 in lanes where unsigned newLo < accLo (wrap-around). + carryMask := newLo.Xor(sb).Less(accLo.Xor(sb)).ToInt64x4() + accHi = accHi.Add(bHi).Sub(carryMask) + accLo = newLo + } + + // Horizontal reduce of 4 partial 128-bit sums. + var loBuf, hiBuf [4]int64 + accLo.Store(&loBuf) + accHi.Store(&hiBuf) + var totLo, totHi uint64 + for k := 0; k < 4; k++ { + var c uint64 + totLo, c = bits.Add64(totLo, uint64(loBuf[k]), 0) + totHi, _ = bits.Add64(totHi, uint64(hiBuf[k]), c) + } + + // Tail (n%4 elements). + for ; i < n; i++ { + j := i << 1 + var c uint64 + totLo, c = bits.Add64(totLo, v[j], 0) + totHi, _ = bits.Add64(totHi, v[j+1], c) + } + return totLo, totHi +} + +// avx512D128SumReduce: same as AVX2 but 8 elements per iteration. 
+func avx512D128SumReduce(v []uint64) (uint64, uint64) { + n := len(v) >> 1 + if n == 0 { + return 0, 0 + } + pv := unsafe.Pointer(&v[0]) + sb := archsimd.BroadcastInt64x8(signBit128) + + zero := archsimd.BroadcastInt64x8(0) + accLo := zero + accHi := zero + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 16 + v0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+64))) + bLo := v0.InterleaveLoGrouped(v1) + bHi := v0.InterleaveHiGrouped(v1) + + newLo := accLo.Add(bLo) + carryMask := newLo.Xor(sb).Less(accLo.Xor(sb)).ToInt64x8() + accHi = accHi.Add(bHi).Sub(carryMask) + accLo = newLo + } + + var loBuf, hiBuf [8]int64 + accLo.Store(&loBuf) + accHi.Store(&hiBuf) + var totLo, totHi uint64 + for k := 0; k < 8; k++ { + var c uint64 + totLo, c = bits.Add64(totLo, uint64(loBuf[k]), 0) + totHi, _ = bits.Add64(totHi, uint64(hiBuf[k]), c) + } + + for ; i < n; i++ { + j := i << 1 + var c uint64 + totLo, c = bits.Add64(totLo, v[j], 0) + totHi, _ = bits.Add64(totHi, v[j+1], c) + } + return totLo, totHi +} diff --git a/pkg/common/simdkernels/d128_addsub_test.go b/pkg/common/simdkernels/d128_addsub_test.go new file mode 100644 index 0000000000000..57d9e44f4b0f1 --- /dev/null +++ b/pkg/common/simdkernels/d128_addsub_test.go @@ -0,0 +1,745 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build goexperiment.simd && amd64 + +package simdkernels + +import ( + "math" + "math/rand/v2" + "strconv" + "testing" + + "golang.org/x/sys/cpu" +) + +// All slice lengths below count Decimal128 elements (each backed by 2 uint64). + +type d128UncheckedImpl struct { + name string + fn func(a, b, r []uint64) +} + +type d128CheckedImpl struct { + name string + fn func(a, b, r []uint64) int +} + +func d128Sizes() []int { + return []int{0, 1, 3, 4, 5, 7, 8, 9, 15, 16, 17, 31, 32, 33, 35, 63, 64, 127, 128, 1023, 4096} +} + +func makeRandD128(n int, seed uint64) []uint64 { + rng := rand.New(rand.NewPCG(seed, seed^0xDEADBEEFCAFEBABE)) + out := make([]uint64, 2*n) + for i := range out { + out[i] = rng.Uint64() + } + return out +} + +// makeRand128SmallSigned produces N elements whose high words have their +// top two bits cleared, so every value is non-negative and below 2^126; +// the sum or difference of any two such values stays strictly inside the +// signed 128-bit range, so add/sub cannot overflow.
+func makeRand128SmallSigned(n int, seed uint64) []uint64 { + out := makeRandD128(n, seed) + for i := 1; i < len(out); i += 2 { + out[i] &= 0x3FFFFFFFFFFFFFFF + } + return out +} + +func TestD128AddVariants(t *testing.T) { + impls := []d128UncheckedImpl{ + {"scalar", scalarD128AddUnchecked}, + {"avx2", avx2D128AddUnchecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d128UncheckedImpl{"avx512", avx512D128AddUnchecked}) + } + for _, n := range d128Sizes() { + a := makeRandD128(n, uint64(n)*7+1) + b := makeRandD128(n, uint64(n)*11+3) + want := make([]uint64, 2*n) + scalarD128AddUnchecked(a, b, want) + for _, impl := range impls { + got := make([]uint64, 2*n) + impl.fn(a, b, got) + for i := 0; i < 2*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d slot=%d: got %x want %x", impl.name, n, i, got[i], want[i]) + } + } + } + } +} + +func TestD128SubVariants(t *testing.T) { + impls := []d128UncheckedImpl{ + {"scalar", scalarD128SubUnchecked}, + {"avx2", avx2D128SubUnchecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d128UncheckedImpl{"avx512", avx512D128SubUnchecked}) + } + for _, n := range d128Sizes() { + a := makeRandD128(n, uint64(n)*13+5) + b := makeRandD128(n, uint64(n)*17+9) + want := make([]uint64, 2*n) + scalarD128SubUnchecked(a, b, want) + for _, impl := range impls { + got := make([]uint64, 2*n) + impl.fn(a, b, got) + for i := 0; i < 2*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d slot=%d: got %x want %x", impl.name, n, i, got[i], want[i]) + } + } + } + } +} + +func TestD128AddCheckedVariants(t *testing.T) { + impls := []d128CheckedImpl{ + {"scalar", scalarD128AddChecked}, + {"avx2", avx2D128AddChecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d128CheckedImpl{"avx512", avx512D128AddChecked}) + } + + // 1) No-overflow random inputs. + for _, n := range d128Sizes() { + a := makeRand128SmallSigned(n, uint64(n)*19+7) + b := makeRand128SmallSigned(n, uint64(n)*23+11) + want := make([]uint64, 2*n) + if got := scalarD128AddChecked(a, b, want); got != -1 { + t.Fatalf("setup: scalar overflow at %d for masked input n=%d", got, n) + } + for _, impl := range impls { + got := make([]uint64, 2*n) + if idx := impl.fn(a, b, got); idx != -1 { + t.Fatalf("%s n=%d: spurious overflow at %d", impl.name, n, idx) + } + for i := 0; i < 2*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d slot=%d: got %x want %x", impl.name, n, i, got[i], want[i]) + } + } + } + } + + // 2) Inject a single overflow (MaxInt128 + 1) at varying positions. + for _, n := range []int{4, 8, 9, 16, 17, 33, 35, 64} { + for _, pos := range []int{0, 1, 3, 4, 7, 8, n - 1} { + if pos < 0 || pos >= n { + continue + } + a := make([]uint64, 2*n) + b := make([]uint64, 2*n) + j := pos << 1 + a[j] = math.MaxUint64 + a[j+1] = uint64(math.MaxInt64) // a = MaxInt128 (positive max) + b[j] = 1 + b[j+1] = 0 // b = 1 + for _, impl := range impls { + got := make([]uint64, 2*n) + idx := impl.fn(a, b, got) + if idx != pos { + t.Fatalf("%s n=%d inject pos=%d: got idx %d", impl.name, n, pos, idx) + } + } + } + } + + // 3) Carry-propagation correctness: aLo = MaxUint64, aHi small; bLo = 1. + // Result should have lo = 0, hi = aHi+1 — no signed overflow. 
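+	// (Per element: lo wraps MaxUint64+1 to 0 with carry out; hi becomes
+	// i+1, far below MaxInt64, so the checked kernels must stay silent.)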
+ for _, n := range []int{4, 8, 16, 17, 33} { + a := make([]uint64, 2*n) + b := make([]uint64, 2*n) + for i := 0; i < n; i++ { + j := i << 1 + a[j] = math.MaxUint64 + a[j+1] = uint64(i) + b[j] = 1 + b[j+1] = 0 + } + want := make([]uint64, 2*n) + scalarD128AddChecked(a, b, want) + for _, impl := range impls { + got := make([]uint64, 2*n) + if idx := impl.fn(a, b, got); idx != -1 { + t.Fatalf("%s carry n=%d: spurious overflow at %d", impl.name, n, idx) + } + for i := 0; i < 2*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s carry n=%d slot=%d: got %x want %x", impl.name, n, i, got[i], want[i]) + } + } + } + } +} + +func TestD128SubCheckedVariants(t *testing.T) { + impls := []d128CheckedImpl{ + {"scalar", scalarD128SubChecked}, + {"avx2", avx2D128SubChecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d128CheckedImpl{"avx512", avx512D128SubChecked}) + } + + for _, n := range d128Sizes() { + a := makeRand128SmallSigned(n, uint64(n)*29+13) + b := makeRand128SmallSigned(n, uint64(n)*31+17) + want := make([]uint64, 2*n) + if got := scalarD128SubChecked(a, b, want); got != -1 { + t.Fatalf("setup: scalar overflow at %d for n=%d", got, n) + } + for _, impl := range impls { + got := make([]uint64, 2*n) + if idx := impl.fn(a, b, got); idx != -1 { + t.Fatalf("%s n=%d: spurious overflow at %d", impl.name, n, idx) + } + for i := 0; i < 2*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d slot=%d: got %x want %x", impl.name, n, i, got[i], want[i]) + } + } + } + } + + // Inject MinInt128 - 1 overflow at varying positions. + for _, n := range []int{4, 8, 9, 16, 17, 33, 35, 64} { + for _, pos := range []int{0, 1, 3, 4, 7, 8, n - 1} { + if pos < 0 || pos >= n { + continue + } + a := make([]uint64, 2*n) + b := make([]uint64, 2*n) + j := pos << 1 + a[j] = 0 + a[j+1] = 1 << 63 // a = MinInt128 + b[j] = 1 + b[j+1] = 0 // b = 1 + for _, impl := range impls { + got := make([]uint64, 2*n) + idx := impl.fn(a, b, got) + if idx != pos { + t.Fatalf("%s n=%d inject pos=%d: got idx %d", impl.name, n, pos, idx) + } + } + } + } + + // Borrow propagation: aLo = 0, aHi small; bLo = 1 ⇒ lo=Max, hi=aHi-1. 
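+	// (Per element: lo wraps 0-1 to MaxUint64 with a borrow; hi becomes
+	// (i+10)-0-1 = i+9, still positive, so no overflow may be reported.)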
+ for _, n := range []int{4, 8, 16, 17, 33} { + a := make([]uint64, 2*n) + b := make([]uint64, 2*n) + for i := 0; i < n; i++ { + j := i << 1 + a[j] = 0 + a[j+1] = uint64(i + 10) // safely positive and > 0 after borrow + b[j] = 1 + b[j+1] = 0 + } + want := make([]uint64, 2*n) + scalarD128SubChecked(a, b, want) + for _, impl := range impls { + got := make([]uint64, 2*n) + if idx := impl.fn(a, b, got); idx != -1 { + t.Fatalf("%s borrow n=%d: spurious overflow at %d", impl.name, n, idx) + } + for i := 0; i < 2*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s borrow n=%d slot=%d: got %x want %x", impl.name, n, i, got[i], want[i]) + } + } + } + } +} + +// --------------------------------------------------------------------------- +// Benchmarks +// --------------------------------------------------------------------------- + +var d128BenchSizes = []int{32, 128, 512, 2048, 8192} + +func benchD128Unchecked(b *testing.B, fn func(a, bb, r []uint64), n int) { + a := makeRandD128(n, 1) + bb := makeRandD128(n, 2) + r := make([]uint64, 2*n) + b.SetBytes(int64(n) * 16 * 3) + b.ResetTimer() + for i := 0; i < b.N; i++ { + fn(a, bb, r) + } +} + +func benchD128Checked(b *testing.B, fn func(a, bb, r []uint64) int, n int) { + a := makeRand128SmallSigned(n, 1) + bb := makeRand128SmallSigned(n, 2) + r := make([]uint64, 2*n) + b.SetBytes(int64(n) * 16 * 3) + b.ResetTimer() + for i := 0; i < b.N; i++ { + _ = fn(a, bb, r) + } +} + +func BenchmarkD128AddUnchecked(b *testing.B) { + for _, n := range d128BenchSizes { + b.Run("scalar/n="+strconv.Itoa(n), func(b *testing.B) { benchD128Unchecked(b, scalarD128AddUnchecked, n) }) + b.Run("avx2/n="+strconv.Itoa(n), func(b *testing.B) { benchD128Unchecked(b, avx2D128AddUnchecked, n) }) + if cpu.X86.HasAVX512 { + b.Run("avx512/n="+strconv.Itoa(n), func(b *testing.B) { benchD128Unchecked(b, avx512D128AddUnchecked, n) }) + } + } +} + +func BenchmarkD128SubUnchecked(b *testing.B) { + for _, n := range d128BenchSizes { + b.Run("scalar/n="+strconv.Itoa(n), func(b *testing.B) { benchD128Unchecked(b, scalarD128SubUnchecked, n) }) + b.Run("avx2/n="+strconv.Itoa(n), func(b *testing.B) { benchD128Unchecked(b, avx2D128SubUnchecked, n) }) + if cpu.X86.HasAVX512 { + b.Run("avx512/n="+strconv.Itoa(n), func(b *testing.B) { benchD128Unchecked(b, avx512D128SubUnchecked, n) }) + } + } +} + +func BenchmarkD128AddChecked(b *testing.B) { + for _, n := range d128BenchSizes { + b.Run("scalar/n="+strconv.Itoa(n), func(b *testing.B) { benchD128Checked(b, scalarD128AddChecked, n) }) + b.Run("avx2/n="+strconv.Itoa(n), func(b *testing.B) { benchD128Checked(b, avx2D128AddChecked, n) }) + if cpu.X86.HasAVX512 { + b.Run("avx512/n="+strconv.Itoa(n), func(b *testing.B) { benchD128Checked(b, avx512D128AddChecked, n) }) + } + } +} + +func BenchmarkD128SubChecked(b *testing.B) { + for _, n := range d128BenchSizes { + b.Run("scalar/n="+strconv.Itoa(n), func(b *testing.B) { benchD128Checked(b, scalarD128SubChecked, n) }) + b.Run("avx2/n="+strconv.Itoa(n), func(b *testing.B) { benchD128Checked(b, avx2D128SubChecked, n) }) + if cpu.X86.HasAVX512 { + b.Run("avx512/n="+strconv.Itoa(n), func(b *testing.B) { benchD128Checked(b, avx512D128SubChecked, n) }) + } + } +} + +// --------------------------------------------------------------------------- +// Scalar-broadcast tests. 
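+// Each operator is exercised as scalar+vector, vector-scalar, and
+// scalar-vector, matching the dispatcher set declared in d128_addsub.go.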
+// --------------------------------------------------------------------------- + +type d128ScalarVUImpl struct { + name string + fn func(slo, shi uint64, v, r []uint64) +} + +type d128ScalarVCImpl struct { + name string + fn func(slo, shi uint64, v, r []uint64) int +} + +type d128VScalarUImpl struct { + name string + fn func(v []uint64, slo, shi uint64, r []uint64) +} + +type d128VScalarCImpl struct { + name string + fn func(v []uint64, slo, shi uint64, r []uint64) int +} + +func d128Scalars() []struct{ lo, hi uint64 } { + return []struct{ lo, hi uint64 }{ + {0, 0}, + {1, 0}, + {math.MaxUint64, 0}, + {0, 1}, + {0xDEADBEEFCAFEBABE, 0x123456789ABCDEF0}, + {math.MaxUint64, uint64(math.MaxInt64)}, + {0, 1 << 63}, + } +} + +func TestD128AddScalarVariants(t *testing.T) { + impls := []d128ScalarVUImpl{ + {"scalar", scalarD128AddScalarUnchecked}, + {"avx2", avx2D128AddScalarUnchecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d128ScalarVUImpl{"avx512", avx512D128AddScalarUnchecked}) + } + for _, n := range d128Sizes() { + v := makeRandD128(n, uint64(n)*37+1) + for si, s := range d128Scalars() { + want := make([]uint64, 2*n) + scalarD128AddScalarUnchecked(s.lo, s.hi, v, want) + for _, impl := range impls { + got := make([]uint64, 2*n) + impl.fn(s.lo, s.hi, v, got) + for i := 0; i < 2*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d scalar#%d slot=%d: got %x want %x", impl.name, n, si, i, got[i], want[i]) + } + } + } + } + } +} + +func TestD128SubScalarVariants(t *testing.T) { + impls := []d128VScalarUImpl{ + {"scalar", scalarD128SubScalarUnchecked}, + {"avx2", avx2D128SubScalarUnchecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d128VScalarUImpl{"avx512", avx512D128SubScalarUnchecked}) + } + for _, n := range d128Sizes() { + v := makeRandD128(n, uint64(n)*41+3) + for si, s := range d128Scalars() { + want := make([]uint64, 2*n) + scalarD128SubScalarUnchecked(v, s.lo, s.hi, want) + for _, impl := range impls { + got := make([]uint64, 2*n) + impl.fn(v, s.lo, s.hi, got) + for i := 0; i < 2*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d scalar#%d slot=%d: got %x want %x", impl.name, n, si, i, got[i], want[i]) + } + } + } + } + } +} + +func TestD128ScalarSubVariants(t *testing.T) { + impls := []d128ScalarVUImpl{ + {"scalar", scalarD128ScalarSubUnchecked}, + {"avx2", avx2D128ScalarSubUnchecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d128ScalarVUImpl{"avx512", avx512D128ScalarSubUnchecked}) + } + for _, n := range d128Sizes() { + v := makeRandD128(n, uint64(n)*43+5) + for si, s := range d128Scalars() { + want := make([]uint64, 2*n) + scalarD128ScalarSubUnchecked(s.lo, s.hi, v, want) + for _, impl := range impls { + got := make([]uint64, 2*n) + impl.fn(s.lo, s.hi, v, got) + for i := 0; i < 2*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d scalar#%d slot=%d: got %x want %x", impl.name, n, si, i, got[i], want[i]) + } + } + } + } + } +} + +func TestD128AddScalarCheckedVariants(t *testing.T) { + impls := []d128ScalarVCImpl{ + {"scalar", scalarD128AddScalarChecked}, + {"avx2", avx2D128AddScalarChecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d128ScalarVCImpl{"avx512", avx512D128AddScalarChecked}) + } + + // 1) No-overflow: small-signed v plus small-signed scalars. 
+ smallScalars := []struct{ lo, hi uint64 }{ + {0, 0}, + {0xDEADBEEFCAFEBABE, 0x0123456789ABCDEF}, + {1, 0}, + {math.MaxUint64, 0x0FFFFFFFFFFFFFFF}, + } + for _, n := range d128Sizes() { + v := makeRand128SmallSigned(n, uint64(n)*47+7) + for si, s := range smallScalars { + want := make([]uint64, 2*n) + if got := scalarD128AddScalarChecked(s.lo, s.hi, v, want); got != -1 { + t.Fatalf("setup overflow at %d for n=%d scalar#%d", got, n, si) + } + for _, impl := range impls { + got := make([]uint64, 2*n) + if idx := impl.fn(s.lo, s.hi, v, got); idx != -1 { + t.Fatalf("%s n=%d scalar#%d: spurious overflow at %d", impl.name, n, si, idx) + } + for i := 0; i < 2*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d scalar#%d slot=%d: got %x want %x", impl.name, n, si, i, got[i], want[i]) + } + } + } + } + } + + // 2) Inject overflow: scalar = 1, v[pos] = MaxInt128. + for _, n := range []int{4, 8, 9, 16, 17, 33, 35, 64} { + for _, pos := range []int{0, 1, 3, 4, 7, 8, n - 1} { + if pos < 0 || pos >= n { + continue + } + v := make([]uint64, 2*n) + j := pos << 1 + v[j] = math.MaxUint64 + v[j+1] = uint64(math.MaxInt64) + for _, impl := range impls { + got := make([]uint64, 2*n) + idx := impl.fn(1, 0, v, got) + if idx != pos { + t.Fatalf("%s n=%d pos=%d: got idx %d", impl.name, n, pos, idx) + } + } + } + } +} + +func TestD128SubScalarCheckedVariants(t *testing.T) { + impls := []d128VScalarCImpl{ + {"scalar", scalarD128SubScalarChecked}, + {"avx2", avx2D128SubScalarChecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d128VScalarCImpl{"avx512", avx512D128SubScalarChecked}) + } + + smallScalars := []struct{ lo, hi uint64 }{ + {0, 0}, + {1, 0}, + {0xDEADBEEFCAFEBABE, 0x0123456789ABCDEF}, + {math.MaxUint64, 0x0FFFFFFFFFFFFFFF}, + } + for _, n := range d128Sizes() { + v := makeRand128SmallSigned(n, uint64(n)*53+11) + for si, s := range smallScalars { + want := make([]uint64, 2*n) + if got := scalarD128SubScalarChecked(v, s.lo, s.hi, want); got != -1 { + t.Fatalf("setup overflow at %d for n=%d scalar#%d", got, n, si) + } + for _, impl := range impls { + got := make([]uint64, 2*n) + if idx := impl.fn(v, s.lo, s.hi, got); idx != -1 { + t.Fatalf("%s n=%d scalar#%d: spurious overflow at %d", impl.name, n, si, idx) + } + for i := 0; i < 2*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d scalar#%d slot=%d: got %x want %x", impl.name, n, si, i, got[i], want[i]) + } + } + } + } + } + + // Inject overflow: v[pos] = MinInt128, scalar = 1 ⇒ MinInt128 - 1 overflows. 
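+	// (In two's complement, MinInt128 = -(2^127); subtracting 1 wraps mod
+	// 2^128 to +(2^127 - 1), flipping the sign of the high word, which is
+	// exactly what the (vh^rh)&(vh^sh) predicate detects.)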
+ for _, n := range []int{4, 8, 9, 16, 17, 33, 35, 64} { + for _, pos := range []int{0, 1, 3, 4, 7, 8, n - 1} { + if pos < 0 || pos >= n { + continue + } + v := make([]uint64, 2*n) + j := pos << 1 + v[j] = 0 + v[j+1] = 1 << 63 + for _, impl := range impls { + got := make([]uint64, 2*n) + idx := impl.fn(v, 1, 0, got) + if idx != pos { + t.Fatalf("%s n=%d pos=%d: got idx %d", impl.name, n, pos, idx) + } + } + } + } +} + +func TestD128ScalarSubCheckedVariants(t *testing.T) { + impls := []d128ScalarVCImpl{ + {"scalar", scalarD128ScalarSubChecked}, + {"avx2", avx2D128ScalarSubChecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d128ScalarVCImpl{"avx512", avx512D128ScalarSubChecked}) + } + + smallScalars := []struct{ lo, hi uint64 }{ + {0, 0}, + {1, 0}, + {0xDEADBEEFCAFEBABE, 0x0123456789ABCDEF}, + {math.MaxUint64, 0x0FFFFFFFFFFFFFFF}, + } + for _, n := range d128Sizes() { + v := makeRand128SmallSigned(n, uint64(n)*59+13) + for si, s := range smallScalars { + want := make([]uint64, 2*n) + if got := scalarD128ScalarSubChecked(s.lo, s.hi, v, want); got != -1 { + t.Fatalf("setup overflow at %d for n=%d scalar#%d", got, n, si) + } + for _, impl := range impls { + got := make([]uint64, 2*n) + if idx := impl.fn(s.lo, s.hi, v, got); idx != -1 { + t.Fatalf("%s n=%d scalar#%d: spurious overflow at %d", impl.name, n, si, idx) + } + for i := 0; i < 2*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d scalar#%d slot=%d: got %x want %x", impl.name, n, si, i, got[i], want[i]) + } + } + } + } + } + + // Inject overflow: scalar = MinInt128, v[pos] = 1 ⇒ MinInt128 - 1 overflows. + for _, n := range []int{4, 8, 9, 16, 17, 33, 35, 64} { + for _, pos := range []int{0, 1, 3, 4, 7, 8, n - 1} { + if pos < 0 || pos >= n { + continue + } + v := make([]uint64, 2*n) + j := pos << 1 + v[j] = 1 + v[j+1] = 0 + for _, impl := range impls { + got := make([]uint64, 2*n) + idx := impl.fn(0, 1<<63, v, got) + if idx != pos { + t.Fatalf("%s n=%d pos=%d: got idx %d", impl.name, n, pos, idx) + } + } + } + } +} + +func benchD128AddScalarU(b *testing.B, fn func(slo, shi uint64, v, r []uint64), n int) { + v := makeRandD128(n, 1) + r := make([]uint64, 2*n) + b.SetBytes(int64(n) * 16 * 2) + b.ResetTimer() + for i := 0; i < b.N; i++ { + fn(0xDEADBEEFCAFEBABE, 0x0123456789ABCDEF, v, r) + } +} + +func benchD128SubScalarU(b *testing.B, fn func(v []uint64, slo, shi uint64, r []uint64), n int) { + v := makeRandD128(n, 1) + r := make([]uint64, 2*n) + b.SetBytes(int64(n) * 16 * 2) + b.ResetTimer() + for i := 0; i < b.N; i++ { + fn(v, 0xDEADBEEFCAFEBABE, 0x0123456789ABCDEF, r) + } +} + +func BenchmarkD128AddScalarUnchecked(b *testing.B) { + for _, n := range d128BenchSizes { + b.Run("scalar/n="+strconv.Itoa(n), func(b *testing.B) { benchD128AddScalarU(b, scalarD128AddScalarUnchecked, n) }) + b.Run("avx2/n="+strconv.Itoa(n), func(b *testing.B) { benchD128AddScalarU(b, avx2D128AddScalarUnchecked, n) }) + if cpu.X86.HasAVX512 { + b.Run("avx512/n="+strconv.Itoa(n), func(b *testing.B) { benchD128AddScalarU(b, avx512D128AddScalarUnchecked, n) }) + } + } +} + +func BenchmarkD128SubScalarUnchecked(b *testing.B) { + for _, n := range d128BenchSizes { + b.Run("scalar/n="+strconv.Itoa(n), func(b *testing.B) { benchD128SubScalarU(b, scalarD128SubScalarUnchecked, n) }) + b.Run("avx2/n="+strconv.Itoa(n), func(b *testing.B) { benchD128SubScalarU(b, avx2D128SubScalarUnchecked, n) }) + if cpu.X86.HasAVX512 { + b.Run("avx512/n="+strconv.Itoa(n), func(b *testing.B) { benchD128SubScalarU(b, avx512D128SubScalarUnchecked, n) }) + } + } +} + +func 
BenchmarkD128ScalarSubUnchecked(b *testing.B) { + for _, n := range d128BenchSizes { + b.Run("scalar/n="+strconv.Itoa(n), func(b *testing.B) { benchD128AddScalarU(b, scalarD128ScalarSubUnchecked, n) }) + b.Run("avx2/n="+strconv.Itoa(n), func(b *testing.B) { benchD128AddScalarU(b, avx2D128ScalarSubUnchecked, n) }) + if cpu.X86.HasAVX512 { + b.Run("avx512/n="+strconv.Itoa(n), func(b *testing.B) { benchD128AddScalarU(b, avx512D128ScalarSubUnchecked, n) }) + } + } +} + +func TestD128SumReduceVariants(t *testing.T) { + impls := []struct { + name string + fn func([]uint64) (uint64, uint64) + }{ + {"scalar", scalarD128SumReduce}, + } + if cpu.X86.HasAVX2 { + impls = append(impls, struct { + name string + fn func([]uint64) (uint64, uint64) + }{"avx2", avx2D128SumReduce}) + } + if cpu.X86.HasAVX512 { + impls = append(impls, struct { + name string + fn func([]uint64) (uint64, uint64) + }{"avx512", avx512D128SumReduce}) + } + + for _, n := range d128Sizes() { + v := makeRandD128(n, uint64(n)*23+1) + var refLo, refHi uint64 + for i := 0; i < n; i++ { + j := i << 1 + c := uint64(0) + if v[j]+refLo < refLo { + c = 1 + } + refLo += v[j] + refHi += v[j+1] + c + } + for _, im := range impls { + lo, hi := im.fn(v) + if lo != refLo || hi != refHi { + t.Fatalf("%s n=%d: got (%x,%x) want (%x,%x)", im.name, n, lo, hi, refLo, refHi) + } + } + } +} + +func BenchmarkD128SumReduce(b *testing.B) { + for _, n := range d128BenchSizes { + v := makeRandD128(n, 1) + b.Run("scalar/n="+strconv.Itoa(n), func(b *testing.B) { + b.SetBytes(int64(n) * 16) + for i := 0; i < b.N; i++ { + _, _ = scalarD128SumReduce(v) + } + }) + if cpu.X86.HasAVX2 { + b.Run("avx2/n="+strconv.Itoa(n), func(b *testing.B) { + b.SetBytes(int64(n) * 16) + for i := 0; i < b.N; i++ { + _, _ = avx2D128SumReduce(v) + } + }) + } + if cpu.X86.HasAVX512 { + b.Run("avx512/n="+strconv.Itoa(n), func(b *testing.B) { + b.SetBytes(int64(n) * 16) + for i := 0; i < b.N; i++ { + _, _ = avx512D128SumReduce(v) + } + }) + } + } +} diff --git a/pkg/common/simdkernels/d128_negabs.go b/pkg/common/simdkernels/d128_negabs.go new file mode 100644 index 0000000000000..e6be2ddf29f52 --- /dev/null +++ b/pkg/common/simdkernels/d128_negabs.go @@ -0,0 +1,70 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package simdkernels + +import "math/bits" + +// Decimal128 element-wise negate / absolute value on slices of uint64 with +// the matrixone Decimal128 layout (lo, hi pair per element). The src and +// dst slices both have length 2*N. dst may alias src. +// +// Both ops use 128-bit two's complement (~x + 1). Negate is unconditional; +// Abs is conditional on the sign bit of the high word. MinInt128 wraps to +// itself, matching the scalar SQL semantics in arith_decimal_fast.go. +// +// The exported variables are dispatchers; their default values are the +// scalar reference implementations and may be replaced at init time on +// amd64 when AVX2 / AVX-512 are detected (see d128_negabs_simd_amd64.go). 
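+//
+// Illustrative usage only (names as declared below; dst may alias src):
+//
+//	D128Negate(v, v)  // in place: element i becomes -element_i (mod 2^128)
+//	D128Abs(src, dst) // dst_i = |src_i|; MinInt128 maps to itself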
+
+var (
+	D128Negate func(src, dst []uint64) = scalarD128Negate
+	D128Abs    func(src, dst []uint64) = scalarD128Abs
+)
+
+func scalarD128Negate(src, dst []uint64) {
+	n := len(dst) / 2
+	if len(src) < 2*n {
+		return
+	}
+	for i := 0; i < n; i++ {
+		j := i << 1
+		lo := ^src[j]
+		hi := ^src[j+1]
+		var c uint64
+		lo, c = bits.Add64(lo, 1, 0)
+		hi, _ = bits.Add64(hi, 0, c)
+		dst[j] = lo
+		dst[j+1] = hi
+	}
+}
+
+func scalarD128Abs(src, dst []uint64) {
+	n := len(dst) / 2
+	if len(src) < 2*n {
+		return
+	}
+	for i := 0; i < n; i++ {
+		j := i << 1
+		lo, hi := src[j], src[j+1]
+		sign := uint64(int64(hi) >> 63) // 0 or all-ones
+		lo ^= sign
+		hi ^= sign
+		var c uint64
+		lo, c = bits.Add64(lo, sign&1, 0)
+		hi, _ = bits.Add64(hi, 0, c)
+		dst[j] = lo
+		dst[j+1] = hi
+	}
+}
diff --git a/pkg/common/simdkernels/d128_negabs_simd_amd64.go b/pkg/common/simdkernels/d128_negabs_simd_amd64.go
new file mode 100644
index 0000000000000..9ff5ad8e3345b
--- /dev/null
+++ b/pkg/common/simdkernels/d128_negabs_simd_amd64.go
@@ -0,0 +1,253 @@
+// Copyright 2026 Matrix Origin
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//	http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build goexperiment.simd && amd64
+
+package simdkernels
+
+import (
+	"math/bits"
+	"simd/archsimd"
+	"unsafe"
+
+	"golang.org/x/sys/cpu"
+)
+
+// d128_negabs_simd_amd64.go: SIMD batch negate / abs for Decimal128.
+//
+// Per-element semantics: rLo:rHi = (~lo:~hi) + m, where m is 1 (Negate) or
+// the sign bit of hi (Abs). Implemented via the conditional-negate idiom:
+//
+//	mask  = -m            // all-ones if m == 1, else zero
+//	loBar = lo XOR mask   // ~lo when negating, lo otherwise
+//	hiBar = hi XOR mask
+//	rLo   = loBar - mask  // adds 1 (or 0) without an explicit branch
+//	carry = rLo wraps     // i.e., rLo < loBar unsigned ⇔ mask=-1 AND lo=0
+//	rHi   = hiBar - carry // adds 1 in carrying lanes; XOR already did ~
+//
+// Crucially rHi must NOT subtract `mask` again: the XOR has already produced
+// ~hi when negating, and the +1 for two's complement only propagates from lo
+// via the carry. Subtracting mask twice would over-add 1 on every negated
+// lane.
+//
+// 4 elements per AVX2 iteration (8 q-words = 64 B), 8 per AVX-512 iteration
+// (16 q-words = 128 B). Layout deinterleave/reinterleave reuses the same
+// VPUNPCK pattern as d128_addsub.
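+//
+// Worked trace of one negated lane (mask = all-ones), illustrative only:
+//
+//	element = 2^64  ⇒  lo = 0, hi = 1
+//	loBar = ^lo = MaxU64,  hiBar = ^hi = 0xFFFF…FFFE
+//	rLo   = loBar + 1 = 0 (wraps)  ⇒  carry lane = -1
+//	rHi   = hiBar - (-1) = 0xFFFF…FFFF  ⇒  hi:lo = -(2^64), as expected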
+ +func init() { + switch { + case cpu.X86.HasAVX512: + D128Negate = avx512D128Negate + D128Abs = avx512D128Abs + case cpu.X86.HasAVX2: + D128Negate = avx2D128Negate + D128Abs = avx2D128Abs + } +} + +// --------------------------------------------------------------------------- +// AVX2 (Int64x4) implementation +// --------------------------------------------------------------------------- + +//go:nosplit +func avx2D128NegBody(lo, hi, mask, sb archsimd.Int64x4) (rLo, rHi archsimd.Int64x4) { + loBar := lo.Xor(mask) + hiBar := hi.Xor(mask) + rLo = loBar.Sub(mask) + carry := rLo.Xor(sb).Less(loBar.Xor(sb)).ToInt64x4() + rHi = hiBar.Sub(carry) + return +} + +func avx2D128Negate(src, dst []uint64) { + n := len(dst) / 2 + if n == 0 || len(src) < 2*n { + return + } + ps, pd := unsafe.Pointer(&src[0]), unsafe.Pointer(&dst[0]) + sb := archsimd.BroadcastInt64x4(signBit128) + mask := archsimd.BroadcastInt64x4(-1) // unconditional negate + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 16 + s0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(ps, off))) + s1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(ps, off+32))) + lo := s0.InterleaveLoGrouped(s1) + hi := s0.InterleaveHiGrouped(s1) + rLo, rHi := avx2D128NegBody(lo, hi, mask, sb) + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[4]int64)(unsafe.Add(pd, off))) + r1.Store((*[4]int64)(unsafe.Add(pd, off+32))) + } + for ; i < n; i++ { + j := i << 1 + lo, c := bits.Add64(^src[j], 1, 0) + hi, _ := bits.Add64(^src[j+1], 0, c) + dst[j] = lo + dst[j+1] = hi + } +} + +func avx2D128Abs(src, dst []uint64) { + n := len(dst) / 2 + if n == 0 || len(src) < 2*n { + return + } + ps, pd := unsafe.Pointer(&src[0]), unsafe.Pointer(&dst[0]) + sb := archsimd.BroadcastInt64x4(signBit128) + zero := archsimd.BroadcastInt64x4(0) + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 16 + s0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(ps, off))) + s1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(ps, off+32))) + lo := s0.InterleaveLoGrouped(s1) + hi := s0.InterleaveHiGrouped(s1) + // mask = -1 in lanes where hi < 0 (signed), 0 elsewhere. + mask := hi.Less(zero).ToInt64x4() + rLo, rHi := avx2D128NegBody(lo, hi, mask, sb) + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[4]int64)(unsafe.Add(pd, off))) + r1.Store((*[4]int64)(unsafe.Add(pd, off+32))) + } + for ; i < n; i++ { + j := i << 1 + lo, hi := src[j], src[j+1] + sign := uint64(int64(hi) >> 63) + lo ^= sign + hi ^= sign + var c uint64 + lo, c = bits.Add64(lo, sign&1, 0) + hi, _ = bits.Add64(hi, 0, c) + dst[j] = lo + dst[j+1] = hi + } +} + +// --------------------------------------------------------------------------- +// AVX-512 (Int64x8) implementation +// --------------------------------------------------------------------------- + +//go:nosplit +func avx512D128NegBody(lo, hi, mask archsimd.Int64x8) (rLo, rHi archsimd.Int64x8) { + loBar := lo.Xor(mask) + hiBar := hi.Xor(mask) + rLo = loBar.Sub(mask) + // Native unsigned compare on AVX-512. 
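+	// (AVX2 lacks an unsigned 64-bit compare, hence the Xor(sb) bias in
+	// the Int64x4 body above; AVX-512 compares Uint64x8 lanes directly,
+	// saving two XORs per compare.)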
+ carry := rLo.AsUint64x8().Less(loBar.AsUint64x8()).ToInt64x8() + rHi = hiBar.Sub(carry) + return +} + +func avx512D128Negate(src, dst []uint64) { + n := len(dst) / 2 + if n == 0 || len(src) < 2*n { + return + } + ps, pd := unsafe.Pointer(&src[0]), unsafe.Pointer(&dst[0]) + mask := archsimd.BroadcastInt64x8(-1) + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 16 + s0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(ps, off))) + s1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(ps, off+64))) + lo := s0.InterleaveLoGrouped(s1) + hi := s0.InterleaveHiGrouped(s1) + rLo, rHi := avx512D128NegBody(lo, hi, mask) + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[8]int64)(unsafe.Add(pd, off))) + r1.Store((*[8]int64)(unsafe.Add(pd, off+64))) + } + // AVX2 path handles 4-elem chunks; reuse for 4..7 remainder. + if r := n - i; r >= 4 { + sb := archsimd.BroadcastInt64x4(signBit128) + mask4 := archsimd.BroadcastInt64x4(-1) + off := uintptr(i) * 16 + s0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(ps, off))) + s1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(ps, off+32))) + lo := s0.InterleaveLoGrouped(s1) + hi := s0.InterleaveHiGrouped(s1) + rLo, rHi := avx2D128NegBody(lo, hi, mask4, sb) + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[4]int64)(unsafe.Add(pd, off))) + r1.Store((*[4]int64)(unsafe.Add(pd, off+32))) + i += 4 + } + for ; i < n; i++ { + j := i << 1 + lo, c := bits.Add64(^src[j], 1, 0) + hi, _ := bits.Add64(^src[j+1], 0, c) + dst[j] = lo + dst[j+1] = hi + } +} + +func avx512D128Abs(src, dst []uint64) { + n := len(dst) / 2 + if n == 0 || len(src) < 2*n { + return + } + ps, pd := unsafe.Pointer(&src[0]), unsafe.Pointer(&dst[0]) + zero := archsimd.BroadcastInt64x8(0) + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 16 + s0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(ps, off))) + s1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(ps, off+64))) + lo := s0.InterleaveLoGrouped(s1) + hi := s0.InterleaveHiGrouped(s1) + mask := hi.Less(zero).ToInt64x8() + rLo, rHi := avx512D128NegBody(lo, hi, mask) + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[8]int64)(unsafe.Add(pd, off))) + r1.Store((*[8]int64)(unsafe.Add(pd, off+64))) + } + if r := n - i; r >= 4 { + sb := archsimd.BroadcastInt64x4(signBit128) + zero4 := archsimd.BroadcastInt64x4(0) + off := uintptr(i) * 16 + s0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(ps, off))) + s1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(ps, off+32))) + lo := s0.InterleaveLoGrouped(s1) + hi := s0.InterleaveHiGrouped(s1) + mask := hi.Less(zero4).ToInt64x4() + rLo, rHi := avx2D128NegBody(lo, hi, mask, sb) + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[4]int64)(unsafe.Add(pd, off))) + r1.Store((*[4]int64)(unsafe.Add(pd, off+32))) + i += 4 + } + for ; i < n; i++ { + j := i << 1 + lo, hi := src[j], src[j+1] + sign := uint64(int64(hi) >> 63) + lo ^= sign + hi ^= sign + var c uint64 + lo, c = bits.Add64(lo, sign&1, 0) + hi, _ = bits.Add64(hi, 0, c) + dst[j] = lo + dst[j+1] = hi + } +} diff --git a/pkg/common/simdkernels/d128_negabs_test.go b/pkg/common/simdkernels/d128_negabs_test.go new file mode 100644 index 0000000000000..4f2ddc1be6811 --- /dev/null +++ b/pkg/common/simdkernels/d128_negabs_test.go @@ -0,0 +1,167 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build goexperiment.simd && amd64 + +package simdkernels + +import ( + "math" + "strconv" + "testing" +) + +type d128UnaryImpl struct { + name string + fn func(src, dst []uint64) +} + +func d128NegateImpls() []d128UnaryImpl { + out := []d128UnaryImpl{{name: "scalar", fn: scalarD128Negate}} + if D128Negate != nil { + out = append(out, d128UnaryImpl{name: "dispatch", fn: D128Negate}) + } + return out +} + +func d128AbsImpls() []d128UnaryImpl { + out := []d128UnaryImpl{{name: "scalar", fn: scalarD128Abs}} + if D128Abs != nil { + out = append(out, d128UnaryImpl{name: "dispatch", fn: D128Abs}) + } + return out +} + +func d128NegAbsEdges() []uint64 { + // pairs of (lo, hi) covering boundary cases + pairs := [][2]uint64{ + {0, 0}, + {1, 0}, + {math.MaxUint64, 0}, + {0, 1}, + {0, math.MaxUint64}, + {math.MaxUint64, math.MaxUint64}, // -1 + {1, 0x8000000000000000}, // negative with lo == 1 + {0, 0x8000000000000000}, // MinInt128 (negation wraps) + {math.MaxUint64, 0x7FFFFFFFFFFFFFFF}, + {0, 0x7FFFFFFFFFFFFFFF}, + } + out := make([]uint64, 0, 2*len(pairs)) + for _, p := range pairs { + out = append(out, p[0], p[1]) + } + return out +} + +func TestD128NegateCorrectness(t *testing.T) { + impls := d128NegateImpls() + for _, n := range d128Sizes() { + src := makeRandD128(n, 0xDEC128^uint64(n)) + want := make([]uint64, 2*n) + scalarD128Negate(src, want) + for _, im := range impls { + got := make([]uint64, 2*n) + im.fn(src, got) + for i := 0; i < 2*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d idx=%d: got 0x%x want 0x%x src(lo=0x%x hi=0x%x)", + im.name, n, i, got[i], want[i], src[(i/2)*2], src[(i/2)*2+1]) + } + } + } + } +} + +func TestD128NegateEdges(t *testing.T) { + src := d128NegAbsEdges() + n := len(src) / 2 + want := make([]uint64, 2*n) + scalarD128Negate(src, want) + for _, im := range d128NegateImpls() { + got := make([]uint64, 2*n) + im.fn(src, got) + for i := 0; i < 2*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s idx=%d: got 0x%x want 0x%x src(lo=0x%x hi=0x%x)", + im.name, i, got[i], want[i], src[(i/2)*2], src[(i/2)*2+1]) + } + } + } +} + +func TestD128AbsCorrectness(t *testing.T) { + impls := d128AbsImpls() + for _, n := range d128Sizes() { + src := makeRandD128(n, 0xABA127^uint64(n)) + want := make([]uint64, 2*n) + scalarD128Abs(src, want) + for _, im := range impls { + got := make([]uint64, 2*n) + im.fn(src, got) + for i := 0; i < 2*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d idx=%d: got 0x%x want 0x%x src(lo=0x%x hi=0x%x)", + im.name, n, i, got[i], want[i], src[(i/2)*2], src[(i/2)*2+1]) + } + } + } + } +} + +func TestD128AbsEdges(t *testing.T) { + src := d128NegAbsEdges() + n := len(src) / 2 + want := make([]uint64, 2*n) + scalarD128Abs(src, want) + for _, im := range d128AbsImpls() { + got := make([]uint64, 2*n) + im.fn(src, got) + for i := 0; i < 2*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s idx=%d: got 0x%x want 0x%x src(lo=0x%x hi=0x%x)", + im.name, i, got[i], want[i], src[(i/2)*2], src[(i/2)*2+1]) + } + } + } +} + +func benchmarkD128Unary(b *testing.B, name string, fn func(src, dst []uint64), n int) { + src := makeRandD128(n, 
0xBEEF^uint64(n)) + dst := make([]uint64, 2*n) + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + fn(src, dst) + } +} + +func BenchmarkD128Negate(b *testing.B) { + for _, n := range []int{16, 64, 256, 1024, 4096} { + for _, im := range d128NegateImpls() { + b.Run(im.name+"/n="+strconv.Itoa(n), func(b *testing.B) { + benchmarkD128Unary(b, im.name, im.fn, n) + }) + } + } +} + +func BenchmarkD128Abs(b *testing.B) { + for _, n := range []int{16, 64, 256, 1024, 4096} { + for _, im := range d128AbsImpls() { + b.Run(im.name+"/n="+strconv.Itoa(n), func(b *testing.B) { + benchmarkD128Unary(b, im.name, im.fn, n) + }) + } + } +} diff --git a/pkg/common/simdkernels/d256_addsub.go b/pkg/common/simdkernels/d256_addsub.go new file mode 100644 index 0000000000000..07d9d8587dfb0 --- /dev/null +++ b/pkg/common/simdkernels/d256_addsub.go @@ -0,0 +1,320 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package simdkernels + +import "math/bits" + +// Decimal256 add/sub on uint64 slices with the matrixone Decimal256 layout +// (4 uint64 per element, lo→hi at indices 4i..4i+3). The top word (slot +// 4i+3) is interpreted as int64 for the signed-overflow predicate. +// +// API mirrors D64*/D128*: dispatcher pairs (Unchecked, Checked); the +// dispatcher defaults to the scalar reference and is replaced at init time +// on amd64 with AVX2 / AVX-512 (see d256_addsub_simd_amd64.go). 
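+//
+// Layout sketch (word order as above; hypothetical element value 2^64 + 5):
+//
+//	v[4i+0] = 5 // bits   0..63
+//	v[4i+1] = 1 // bits  64..127
+//	v[4i+2] = 0 // bits 128..191
+//	v[4i+3] = 0 // bits 192..255 (sign word tested by the checked variants)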
+ +var ( + D256AddUnchecked func(a, b, r []uint64) = scalarD256AddUnchecked + D256SubUnchecked func(a, b, r []uint64) = scalarD256SubUnchecked + D256AddChecked func(a, b, r []uint64) int = scalarD256AddChecked + D256SubChecked func(a, b, r []uint64) int = scalarD256SubChecked + + D256AddScalarUnchecked func(s0, s1, s2, s3 uint64, v, r []uint64) = scalarD256AddScalarUnchecked + D256AddScalarChecked func(s0, s1, s2, s3 uint64, v, r []uint64) int = scalarD256AddScalarChecked + D256SubScalarUnchecked func(v []uint64, s0, s1, s2, s3 uint64, r []uint64) = scalarD256SubScalarUnchecked + D256SubScalarChecked func(v []uint64, s0, s1, s2, s3 uint64, r []uint64) int = scalarD256SubScalarChecked + D256ScalarSubUnchecked func(s0, s1, s2, s3 uint64, v, r []uint64) = scalarD256ScalarSubUnchecked + D256ScalarSubChecked func(s0, s1, s2, s3 uint64, v, r []uint64) int = scalarD256ScalarSubChecked +) + +func scalarD256AddUnchecked(a, b, r []uint64) { + n := len(r) / 4 + if len(a) < 4*n || len(b) < 4*n { + return + } + for i := 0; i < n; i++ { + j := i << 2 + w0, c := bits.Add64(a[j], b[j], 0) + w1, c := bits.Add64(a[j+1], b[j+1], c) + w2, c := bits.Add64(a[j+2], b[j+2], c) + w3, _ := bits.Add64(a[j+3], b[j+3], c) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + } +} + +func scalarD256SubUnchecked(a, b, r []uint64) { + n := len(r) / 4 + if len(a) < 4*n || len(b) < 4*n { + return + } + for i := 0; i < n; i++ { + j := i << 2 + w0, br := bits.Sub64(a[j], b[j], 0) + w1, br := bits.Sub64(a[j+1], b[j+1], br) + w2, br := bits.Sub64(a[j+2], b[j+2], br) + w3, _ := bits.Sub64(a[j+3], b[j+3], br) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + } +} + +func scalarD256AddChecked(a, b, r []uint64) int { + n := len(r) / 4 + if len(a) < 4*n || len(b) < 4*n { + return -1 + } + first := -1 + for i := 0; i < n; i++ { + j := i << 2 + aHi := a[j+3] + bHi := b[j+3] + w0, c := bits.Add64(a[j], b[j], 0) + w1, c := bits.Add64(a[j+1], b[j+1], c) + w2, c := bits.Add64(a[j+2], b[j+2], c) + w3, _ := bits.Add64(aHi, bHi, c) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + if first < 0 { + ah, bh, rh := int64(aHi), int64(bHi), int64(w3) + if (ah^rh)&^(ah^bh) < 0 { + first = i + } + } + } + return first +} + +func scalarD256SubChecked(a, b, r []uint64) int { + n := len(r) / 4 + if len(a) < 4*n || len(b) < 4*n { + return -1 + } + first := -1 + for i := 0; i < n; i++ { + j := i << 2 + aHi := a[j+3] + bHi := b[j+3] + w0, br := bits.Sub64(a[j], b[j], 0) + w1, br := bits.Sub64(a[j+1], b[j+1], br) + w2, br := bits.Sub64(a[j+2], b[j+2], br) + w3, _ := bits.Sub64(aHi, bHi, br) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + if first < 0 { + ah, bh, rh := int64(aHi), int64(bHi), int64(w3) + if (ah^rh)&(ah^bh) < 0 { + first = i + } + } + } + return first +} + +// d256FirstOverflow rescans the first end elements (each = 4 uint64) for +// the first overflow. Used by SIMD checked variants when the accumulated +// mask reports overflow but the scalar tail did not see one. 
+func d256FirstOverflow(a, b []uint64, end int, sub bool) int { + if sub { + for i := 0; i < end; i++ { + j := i << 2 + _, br := bits.Sub64(a[j], b[j], 0) + _, br = bits.Sub64(a[j+1], b[j+1], br) + _, br = bits.Sub64(a[j+2], b[j+2], br) + w3, _ := bits.Sub64(a[j+3], b[j+3], br) + ah, bh, rh := int64(a[j+3]), int64(b[j+3]), int64(w3) + if (ah^rh)&(ah^bh) < 0 { + return i + } + } + return -1 + } + for i := 0; i < end; i++ { + j := i << 2 + _, c := bits.Add64(a[j], b[j], 0) + _, c = bits.Add64(a[j+1], b[j+1], c) + _, c = bits.Add64(a[j+2], b[j+2], c) + w3, _ := bits.Add64(a[j+3], b[j+3], c) + ah, bh, rh := int64(a[j+3]), int64(b[j+3]), int64(w3) + if (ah^rh)&^(ah^bh) < 0 { + return i + } + } + return -1 +} + +// --------------------------------------------------------------------------- +// Scalar-broadcast reference implementations (Decimal256 = 4 uint64/elem). +// --------------------------------------------------------------------------- + +func scalarD256AddScalarUnchecked(s0, s1, s2, s3 uint64, v, r []uint64) { + n := len(r) / 4 + if len(v) < 4*n { + return + } + for i := 0; i < n; i++ { + j := i << 2 + w0, c := bits.Add64(s0, v[j], 0) + w1, c := bits.Add64(s1, v[j+1], c) + w2, c := bits.Add64(s2, v[j+2], c) + w3, _ := bits.Add64(s3, v[j+3], c) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + } +} + +func scalarD256SubScalarUnchecked(v []uint64, s0, s1, s2, s3 uint64, r []uint64) { + n := len(r) / 4 + if len(v) < 4*n { + return + } + for i := 0; i < n; i++ { + j := i << 2 + w0, br := bits.Sub64(v[j], s0, 0) + w1, br := bits.Sub64(v[j+1], s1, br) + w2, br := bits.Sub64(v[j+2], s2, br) + w3, _ := bits.Sub64(v[j+3], s3, br) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + } +} + +func scalarD256ScalarSubUnchecked(s0, s1, s2, s3 uint64, v, r []uint64) { + n := len(r) / 4 + if len(v) < 4*n { + return + } + for i := 0; i < n; i++ { + j := i << 2 + w0, br := bits.Sub64(s0, v[j], 0) + w1, br := bits.Sub64(s1, v[j+1], br) + w2, br := bits.Sub64(s2, v[j+2], br) + w3, _ := bits.Sub64(s3, v[j+3], br) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + } +} + +func scalarD256AddScalarChecked(s0, s1, s2, s3 uint64, v, r []uint64) int { + n := len(r) / 4 + if len(v) < 4*n { + return -1 + } + first := -1 + sh := int64(s3) + for i := 0; i < n; i++ { + j := i << 2 + vHi := v[j+3] + w0, c := bits.Add64(s0, v[j], 0) + w1, c := bits.Add64(s1, v[j+1], c) + w2, c := bits.Add64(s2, v[j+2], c) + w3, _ := bits.Add64(s3, vHi, c) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + if first < 0 { + vh, rh := int64(vHi), int64(w3) + if (sh^rh)&^(sh^vh) < 0 { + first = i + } + } + } + return first +} + +func scalarD256SubScalarChecked(v []uint64, s0, s1, s2, s3 uint64, r []uint64) int { + n := len(r) / 4 + if len(v) < 4*n { + return -1 + } + first := -1 + sh := int64(s3) + for i := 0; i < n; i++ { + j := i << 2 + vHi := v[j+3] + w0, br := bits.Sub64(v[j], s0, 0) + w1, br := bits.Sub64(v[j+1], s1, br) + w2, br := bits.Sub64(v[j+2], s2, br) + w3, _ := bits.Sub64(vHi, s3, br) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + if first < 0 { + vh, rh := int64(vHi), int64(w3) + if (vh^rh)&(vh^sh) < 0 { + first = i + } + } + } + return first +} + +func scalarD256ScalarSubChecked(s0, s1, s2, s3 uint64, v, r []uint64) int { + n := len(r) / 4 + if len(v) < 4*n { + return -1 + } + first := -1 + sh := int64(s3) + for i := 0; i < n; i++ { + j := i << 2 + vHi := v[j+3] + w0, br := bits.Sub64(s0, v[j], 0) + w1, br := bits.Sub64(s1, v[j+1], br) + w2, br := bits.Sub64(s2, v[j+2], br) + w3, _ := bits.Sub64(s3, vHi, br) + r[j], r[j+1], 
r[j+2], r[j+3] = w0, w1, w2, w3 + if first < 0 { + vh, rh := int64(vHi), int64(w3) + if (sh^rh)&(sh^vh) < 0 { + first = i + } + } + } + return first +} + +// d256ScalarFirstOverflow: rescan first end elements for first overflow in +// scalar-broadcast ops. kind: 0=AddScalar, 1=SubScalar (v-s), 2=ScalarSub (s-v). +func d256ScalarFirstOverflow(s0, s1, s2, s3 uint64, v []uint64, end int, kind int) int { + sh := int64(s3) + switch kind { + case 0: + for i := 0; i < end; i++ { + j := i << 2 + _, c := bits.Add64(s0, v[j], 0) + _, c = bits.Add64(s1, v[j+1], c) + _, c = bits.Add64(s2, v[j+2], c) + w3, _ := bits.Add64(s3, v[j+3], c) + vh, rh := int64(v[j+3]), int64(w3) + if (sh^rh)&^(sh^vh) < 0 { + return i + } + } + case 1: + for i := 0; i < end; i++ { + j := i << 2 + _, br := bits.Sub64(v[j], s0, 0) + _, br = bits.Sub64(v[j+1], s1, br) + _, br = bits.Sub64(v[j+2], s2, br) + w3, _ := bits.Sub64(v[j+3], s3, br) + vh, rh := int64(v[j+3]), int64(w3) + if (vh^rh)&(vh^sh) < 0 { + return i + } + } + case 2: + for i := 0; i < end; i++ { + j := i << 2 + _, br := bits.Sub64(s0, v[j], 0) + _, br = bits.Sub64(s1, v[j+1], br) + _, br = bits.Sub64(s2, v[j+2], br) + w3, _ := bits.Sub64(s3, v[j+3], br) + vh, rh := int64(v[j+3]), int64(w3) + if (sh^rh)&(sh^vh) < 0 { + return i + } + } + } + return -1 +} diff --git a/pkg/common/simdkernels/d256_addsub_simd_amd64.go b/pkg/common/simdkernels/d256_addsub_simd_amd64.go new file mode 100644 index 0000000000000..038caaf740d5c --- /dev/null +++ b/pkg/common/simdkernels/d256_addsub_simd_amd64.go @@ -0,0 +1,1380 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build goexperiment.simd && amd64 + +package simdkernels + +import ( + "simd/archsimd" + "unsafe" + + "golang.org/x/sys/cpu" +) + +// On AVX-512 hosts we use the avx512 path (8 elements/iter); otherwise on +// AVX2 hosts we use the avx2 path (4 elements/iter); else scalar. 
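+//
+// Callers always go through the D256* dispatcher variables rather than
+// naming a path, e.g. (illustrative):
+//
+//	if idx := D256AddChecked(a, b, r); idx >= 0 {
+//		// element idx overflowed the signed 256-bit range
+//	}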
+func init() { + switch { + case cpu.X86.HasAVX512: + D256AddUnchecked = avx512D256AddUnchecked + D256SubUnchecked = avx512D256SubUnchecked + D256AddChecked = avx512D256AddChecked + D256SubChecked = avx512D256SubChecked + D256AddScalarUnchecked = avx512D256AddScalarUnchecked + D256SubScalarUnchecked = avx512D256SubScalarUnchecked + D256ScalarSubUnchecked = avx512D256ScalarSubUnchecked + D256AddScalarChecked = avx512D256AddScalarChecked + D256SubScalarChecked = avx512D256SubScalarChecked + D256ScalarSubChecked = avx512D256ScalarSubChecked + case cpu.X86.HasAVX2: + D256AddUnchecked = avx2D256AddUnchecked + D256SubUnchecked = avx2D256SubUnchecked + D256AddChecked = avx2D256AddChecked + D256SubChecked = avx2D256SubChecked + D256AddScalarUnchecked = avx2D256AddScalarUnchecked + D256SubScalarUnchecked = avx2D256SubScalarUnchecked + D256ScalarSubUnchecked = avx2D256ScalarSubUnchecked + D256AddScalarChecked = avx2D256AddScalarChecked + D256SubScalarChecked = avx2D256SubScalarChecked + D256ScalarSubChecked = avx2D256ScalarSubChecked + } +} + +// transpose4x4 turns the column-major load layout [v0=elem0, v1=elem1, +// v2=elem2, v3=elem3] (each Int64x4 = one Decimal256, lanes = words) into +// the row-major layout where Wk[i] = element_i.word_k. The operation is +// involutory: applying it again restores the original order, which we use +// to write the result back. +// +// Implemented as the standard 4×4 int64 transpose with 4 unpacks + 4 +// VPERM2I128 (Select128FromPair). All operations are AVX2. +// +//go:nosplit +func transpose4x4(v0, v1, v2, v3 archsimd.Int64x4) (W0, W1, W2, W3 archsimd.Int64x4) { + t0 := v0.InterleaveLoGrouped(v1) // [v0[0], v1[0], v0[2], v1[2]] + t1 := v0.InterleaveHiGrouped(v1) // [v0[1], v1[1], v0[3], v1[3]] + t2 := v2.InterleaveLoGrouped(v3) // [v2[0], v3[0], v2[2], v3[2]] + t3 := v2.InterleaveHiGrouped(v3) // [v2[1], v3[1], v2[3], v3[3]] + // VPERM2I128 imm encoding: arg(lo)/arg(hi) selects which 128-bit half: + // 0 = x.lo, 1 = x.hi, 2 = y.lo, 3 = y.hi. + W0 = t0.Select128FromPair(0, 2, t2) // [v0[0], v1[0], v2[0], v3[0]] + W2 = t0.Select128FromPair(1, 3, t2) // [v0[2], v1[2], v2[2], v3[2]] + W1 = t1.Select128FromPair(0, 2, t3) // [v0[1], v1[1], v2[1], v3[1]] + W3 = t1.Select128FromPair(1, 3, t3) // [v0[3], v1[3], v2[3], v3[3]] + return +} + +// addCarryStage performs one stage of the multi-word add: r = a + b + cIn, +// returning r and the carry-out vector (each lane = 0 or -1). +// +// cIn is encoded as a vector with values 0 or -1 (so subtracting it adds 0 +// or 1). cOut is the OR of two contributing wraps: +// - the bare `a + b` wrap (always possible, regardless of cIn) +// - the `(a+b) - cIn` wrap (only when cIn = -1 and a+b = MaxU64) +// +// The (s == -1) & cIn alternative formulation was tried but regressed by ~5% +// because addCarryStage already shares (s^sb) between csA and csB via CSE, +// so only one XOR is saved while an extra constant broadcast and AND are +// added. The asymmetry vs subBorrowStage reflects that addCarry's CSE is +// already optimal whereas subBorrow's was not. +// +// Sign-bit-flipped Less is used because AVX2 has no unsigned int64 compare. +// +//go:nosplit +func addCarryStage(a, b, cIn, sb archsimd.Int64x4) (r, cOut archsimd.Int64x4) { + s := a.Add(b) + csA := s.Xor(sb).Less(a.Xor(sb)).ToInt64x4() + r = s.Sub(cIn) + csB := r.Xor(sb).Less(s.Xor(sb)).ToInt64x4() + cOut = csA.Or(csB) + return +} + +// addCarryStageNoOut is addCarryStage without computing the carry-out +// (used for the topmost word in unchecked add). 
+// +//go:nosplit +func addCarryStageNoOut(a, b, cIn archsimd.Int64x4) archsimd.Int64x4 { + return a.Add(b).Sub(cIn) +} + +// subBorrowStage performs one stage of the multi-word sub: r = a - b - bIn. +// +// bIn is encoded as a vector with values 0 or -1 (so adding it subtracts 0 +// or 1). bOut is the OR of: +// - bare `a - b` borrow: a < b unsigned ⇔ (a^sb) < (b^sb) signed +// - `(a-b) + bIn` borrow: happens iff s = 0 AND bIn = -1 +// +// The second term is computed as (s == 0) & bIn, which avoids materializing +// `s^sb` and `r^sb`. Compared to the symmetric `(s^sb) < (r^sb)` form, this +// drops one XOR per stage and (more importantly) shortens the live-range +// chain so the register allocator does not spill across the carry chain. +// +//go:nosplit +func subBorrowStage(a, b, bIn, sb, zero archsimd.Int64x4) (r, bOut archsimd.Int64x4) { + s := a.Sub(b) + bsA := a.Xor(sb).Less(b.Xor(sb)).ToInt64x4() + r = s.Add(bIn) + bsB := s.Equal(zero).ToInt64x4().And(bIn) + bOut = bsA.Or(bsB) + return +} + +//go:nosplit +func subBorrowStageNoOut(a, b, bIn archsimd.Int64x4) archsimd.Int64x4 { + return a.Sub(b).Add(bIn) +} + +func avx2D256AddUnchecked(a, b, r []uint64) { + n := len(r) / 4 + if n == 0 || len(a) < 4*n || len(b) < 4*n { + return + } + pa, pb, pr := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x4(signBit128) + zero := archsimd.BroadcastInt64x4(0) + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 32 + aV0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off))) + aV1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off+32))) + aV2 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off+64))) + aV3 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off+96))) + bV0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off))) + bV1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off+32))) + bV2 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off+64))) + bV3 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off+96))) + + aW0, aW1, aW2, aW3 := transpose4x4(aV0, aV1, aV2, aV3) + bW0, bW1, bW2, bW3 := transpose4x4(bV0, bV1, bV2, bV3) + + rW0, c0 := addCarryStage(aW0, bW0, zero, sb) + rW1, c1 := addCarryStage(aW1, bW1, c0, sb) + rW2, c2 := addCarryStage(aW2, bW2, c1, sb) + rW3 := addCarryStageNoOut(aW3, bW3, c2) + + rV0, rV1, rV2, rV3 := transpose4x4(rW0, rW1, rW2, rW3) + rV0.Store((*[4]int64)(unsafe.Add(pr, off))) + rV1.Store((*[4]int64)(unsafe.Add(pr, off+32))) + rV2.Store((*[4]int64)(unsafe.Add(pr, off+64))) + rV3.Store((*[4]int64)(unsafe.Add(pr, off+96))) + } + for ; i < n; i++ { + j := i << 2 + w0, c := addU64(a[j], b[j], 0) + w1, c := addU64(a[j+1], b[j+1], c) + w2, c := addU64(a[j+2], b[j+2], c) + w3, _ := addU64(a[j+3], b[j+3], c) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + } +} + +func avx2D256SubUnchecked(a, b, r []uint64) { + n := len(r) / 4 + if n == 0 || len(a) < 4*n || len(b) < 4*n { + return + } + pa, pb, pr := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x4(signBit128) + zero := archsimd.BroadcastInt64x4(0) + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 32 + aV0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off))) + aV1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off+32))) + aV2 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off+64))) + aV3 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off+96))) + bV0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off))) + bV1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, 
off+32))) + bV2 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off+64))) + bV3 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off+96))) + + aW0, aW1, aW2, aW3 := transpose4x4(aV0, aV1, aV2, aV3) + bW0, bW1, bW2, bW3 := transpose4x4(bV0, bV1, bV2, bV3) + + rW0, b0 := subBorrowStage(aW0, bW0, zero, sb, zero) + rW1, b1 := subBorrowStage(aW1, bW1, b0, sb, zero) + rW2, b2 := subBorrowStage(aW2, bW2, b1, sb, zero) + rW3 := subBorrowStageNoOut(aW3, bW3, b2) + + rV0, rV1, rV2, rV3 := transpose4x4(rW0, rW1, rW2, rW3) + rV0.Store((*[4]int64)(unsafe.Add(pr, off))) + rV1.Store((*[4]int64)(unsafe.Add(pr, off+32))) + rV2.Store((*[4]int64)(unsafe.Add(pr, off+64))) + rV3.Store((*[4]int64)(unsafe.Add(pr, off+96))) + } + for ; i < n; i++ { + j := i << 2 + w0, br := subU64(a[j], b[j], 0) + w1, br := subU64(a[j+1], b[j+1], br) + w2, br := subU64(a[j+2], b[j+2], br) + w3, _ := subU64(a[j+3], b[j+3], br) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + } +} + +func avx2D256AddChecked(a, b, r []uint64) int { + n := len(r) / 4 + if n == 0 || len(a) < 4*n || len(b) < 4*n { + return -1 + } + pa, pb, pr := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x4(signBit128) + zero := archsimd.BroadcastInt64x4(0) + + var ofAcc archsimd.Int64x4 + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 32 + aV0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off))) + aV1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off+32))) + aV2 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off+64))) + aV3 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off+96))) + bV0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off))) + bV1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off+32))) + bV2 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off+64))) + bV3 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off+96))) + + aW0, aW1, aW2, aW3 := transpose4x4(aV0, aV1, aV2, aV3) + bW0, bW1, bW2, bW3 := transpose4x4(bV0, bV1, bV2, bV3) + + rW0, c0 := addCarryStage(aW0, bW0, zero, sb) + rW1, c1 := addCarryStage(aW1, bW1, c0, sb) + rW2, c2 := addCarryStage(aW2, bW2, c1, sb) + rW3 := addCarryStageNoOut(aW3, bW3, c2) + + ofAcc = ofAcc.Or(aW3.Xor(rW3).AndNot(aW3.Xor(bW3))) + + rV0, rV1, rV2, rV3 := transpose4x4(rW0, rW1, rW2, rW3) + rV0.Store((*[4]int64)(unsafe.Add(pr, off))) + rV1.Store((*[4]int64)(unsafe.Add(pr, off+32))) + rV2.Store((*[4]int64)(unsafe.Add(pr, off+64))) + rV3.Store((*[4]int64)(unsafe.Add(pr, off+96))) + } + vecEnd := i + + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + for ; i < n; i++ { + j := i << 2 + aHi := a[j+3] + bHi := b[j+3] + w0, c := addU64(a[j], b[j], 0) + w1, c := addU64(a[j+1], b[j+1], c) + w2, c := addU64(a[j+2], b[j+2], c) + w3, _ := addU64(aHi, bHi, c) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + ah, bh, rh := int64(aHi), int64(bHi), int64(w3) + if (ah^rh)&^(ah^bh) < 0 { + if vecOverflow { + return d256FirstOverflow(a, b, vecEnd, false) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d256FirstOverflow(a, b, vecEnd, false) +} + +func avx2D256SubChecked(a, b, r []uint64) int { + n := len(r) / 4 + if n == 0 || len(a) < 4*n || len(b) < 4*n { + return -1 + } + pa, pb, pr := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x4(signBit128) + zero := archsimd.BroadcastInt64x4(0) + + var ofAcc archsimd.Int64x4 + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 32 + aV0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off))) + aV1 := 
archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off+32))) + aV2 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off+64))) + aV3 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off+96))) + bV0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off))) + bV1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off+32))) + bV2 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off+64))) + bV3 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off+96))) + + aW0, aW1, aW2, aW3 := transpose4x4(aV0, aV1, aV2, aV3) + bW0, bW1, bW2, bW3 := transpose4x4(bV0, bV1, bV2, bV3) + + rW0, b0 := subBorrowStage(aW0, bW0, zero, sb, zero) + rW1, b1 := subBorrowStage(aW1, bW1, b0, sb, zero) + rW2, b2 := subBorrowStage(aW2, bW2, b1, sb, zero) + rW3 := subBorrowStageNoOut(aW3, bW3, b2) + + ofAcc = ofAcc.Or(aW3.Xor(rW3).And(aW3.Xor(bW3))) + + rV0, rV1, rV2, rV3 := transpose4x4(rW0, rW1, rW2, rW3) + rV0.Store((*[4]int64)(unsafe.Add(pr, off))) + rV1.Store((*[4]int64)(unsafe.Add(pr, off+32))) + rV2.Store((*[4]int64)(unsafe.Add(pr, off+64))) + rV3.Store((*[4]int64)(unsafe.Add(pr, off+96))) + } + vecEnd := i + + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + for ; i < n; i++ { + j := i << 2 + aHi := a[j+3] + bHi := b[j+3] + w0, br := subU64(a[j], b[j], 0) + w1, br := subU64(a[j+1], b[j+1], br) + w2, br := subU64(a[j+2], b[j+2], br) + w3, _ := subU64(aHi, bHi, br) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + ah, bh, rh := int64(aHi), int64(bHi), int64(w3) + if (ah^rh)&(ah^bh) < 0 { + if vecOverflow { + return d256FirstOverflow(a, b, vecEnd, true) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d256FirstOverflow(a, b, vecEnd, true) +} + +// addU64 / subU64 are local thin wrappers around math/bits to keep the +// scalar tail concise; they inline at call sites. +// +//go:nosplit +func addU64(x, y, c uint64) (uint64, uint64) { + s := x + y + c2 := uint64(0) + if s < x { + c2 = 1 + } + r := s + c + c3 := uint64(0) + if r < s { + c3 = 1 + } + return r, c2 | c3 +} + +//go:nosplit +func subU64(x, y, br uint64) (uint64, uint64) { + d := x - y + b1 := uint64(0) + if x < y { + b1 = 1 + } + r := d - br + b2 := uint64(0) + if d < br { + b2 = 1 + } + return r, b1 | b2 +} + +// --------------------------------------------------------------------------- +// AVX-512 path: each Decimal256 = 4 uint64 = 32 B. We process 8 elements per +// kernel iteration (= 256 B per input), loaded as 4 Int64x8 vectors. +// +// Layout per loaded vector (lo→hi inside each 4-word element): +// V0 = [e0w0,e0w1,e0w2,e0w3, e1w0,e1w1,e1w2,e1w3] +// V1 = [e2w0,e2w1,e2w2,e2w3, e3w0,e3w1,e3w2,e3w3] +// V2 = [e4*..] V3 = [e6*..] +// +// We transpose to per-word vectors W0..W3 each holding the same word from +// 8 elements, run the 4-stage carry/borrow chain, and inverse-transpose +// back. Both transposes are 8 ConcatPermute (VPERMI2Q) instructions +// arranged as two 4-permute stages. +// --------------------------------------------------------------------------- + +// AVX-512 transpose index vectors. They are package-scope so that +// LoadUint64x8 hoists to a single broadcast/load that the compiler can +// keep in a register across the loop body. 
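+//
+// Illustrative trace of the first table: with v0 = [e0w0..e0w3, e1w0..e1w3]
+// and v1 = [e2w0..e2w3, e3w0..e3w3] concatenated as lanes 0..15, the index
+// vector {0, 4, 8, 12, 2, 6, 10, 14} selects
+//
+//	lanes 0,4,8,12  → e0w0, e1w0, e2w0, e3w0 (low half:  word 0 of each)
+//	lanes 2,6,10,14 → e0w2, e1w2, e2w2, e3w2 (high half: word 2 of each)
+//
+// i.e. two of the four per-word rows from a single VPERMI2Q.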
+var ( + avx512D256IdxFwdW0W2 = [8]uint64{0, 4, 8, 12, 2, 6, 10, 14} + avx512D256IdxFwdW1W3 = [8]uint64{1, 5, 9, 13, 3, 7, 11, 15} + avx512D256IdxLoHalf = [8]uint64{0, 1, 2, 3, 8, 9, 10, 11} + avx512D256IdxHiHalf = [8]uint64{4, 5, 6, 7, 12, 13, 14, 15} + avx512D256IdxInvV0 = [8]uint64{0, 8, 4, 12, 1, 9, 5, 13} + avx512D256IdxInvV1 = [8]uint64{2, 10, 6, 14, 3, 11, 7, 15} +) + +// transpose8x4Forward: V0..V3 (each = 2 D256 elements in element-natural +// layout) → W0..W3 (each = one word from all 8 elements, in element order). +// +//go:nosplit +func transpose8x4Forward(v0, v1, v2, v3 archsimd.Int64x8) (w0, w1, w2, w3 archsimd.Int64x8) { + idxW02 := archsimd.LoadUint64x8(&avx512D256IdxFwdW0W2) + idxW13 := archsimd.LoadUint64x8(&avx512D256IdxFwdW1W3) + idxLo := archsimd.LoadUint64x8(&avx512D256IdxLoHalf) + idxHi := archsimd.LoadUint64x8(&avx512D256IdxHiHalf) + + // Stage 1: gather (w0,w2) and (w1,w3) within each pair-of-elements vector. + q0 := v0.ConcatPermute(v1, idxW02) // [e0w0,e1w0,e2w0,e3w0, e0w2,e1w2,e2w2,e3w2] + q1 := v0.ConcatPermute(v1, idxW13) // [e0w1,e1w1,e2w1,e3w1, e0w3,e1w3,e2w3,e3w3] + q2 := v2.ConcatPermute(v3, idxW02) // [e4w0..e7w0, e4w2..e7w2] + q3 := v2.ConcatPermute(v3, idxW13) // [e4w1..e7w1, e4w3..e7w3] + + // Stage 2: combine the lo/hi halves across pair groups. + w0 = q0.ConcatPermute(q2, idxLo) // [e0w0..e7w0] + w2 = q0.ConcatPermute(q2, idxHi) // [e0w2..e7w2] + w1 = q1.ConcatPermute(q3, idxLo) // [e0w1..e7w1] + w3 = q1.ConcatPermute(q3, idxHi) // [e0w3..e7w3] + return +} + +// transpose8x4Inverse: W0..W3 → V0..V3 (the inverse of transpose8x4Forward). +// +//go:nosplit +func transpose8x4Inverse(w0, w1, w2, w3 archsimd.Int64x8) (v0, v1, v2, v3 archsimd.Int64x8) { + idxLo := archsimd.LoadUint64x8(&avx512D256IdxLoHalf) + idxHi := archsimd.LoadUint64x8(&avx512D256IdxHiHalf) + idxV0 := archsimd.LoadUint64x8(&avx512D256IdxInvV0) + idxV1 := archsimd.LoadUint64x8(&avx512D256IdxInvV1) + + // Stage 1 (inverse of forward stage 2): split per-word vectors back into + // pair-of-elements groups. + q0 := w0.ConcatPermute(w2, idxLo) // [e0w0..e3w0, e0w2..e3w2] + q2 := w0.ConcatPermute(w2, idxHi) // [e4w0..e7w0, e4w2..e7w2] + q1 := w1.ConcatPermute(w3, idxLo) // [e0w1..e3w1, e0w3..e3w3] + q3 := w1.ConcatPermute(w3, idxHi) // [e4w1..e7w1, e4w3..e7w3] + + // Stage 2 (inverse of forward stage 1): re-interleave words back into + // element-natural layout. + v0 = q0.ConcatPermute(q1, idxV0) // [e0w0..e0w3, e1w0..e1w3] + v1 = q0.ConcatPermute(q1, idxV1) // [e2w0..e2w3, e3w0..e3w3] + v2 = q2.ConcatPermute(q3, idxV0) // [e4w0..e4w3, e5w0..e5w3] + v3 = q2.ConcatPermute(q3, idxV1) // [e6w0..e6w3, e7w0..e7w3] + return +} + +// avx512AddCarryStage and friends mirror the AVX2 helpers but on Int64x8. 
+// +//go:nosplit +func avx512AddCarryStage(a, b, cIn, sb archsimd.Int64x8) (r, cOut archsimd.Int64x8) { + s := a.Add(b) + csA := s.Xor(sb).Less(a.Xor(sb)).ToInt64x8() + r = s.Sub(cIn) + csB := r.Xor(sb).Less(s.Xor(sb)).ToInt64x8() + cOut = csA.Or(csB) + return +} + +//go:nosplit +func avx512AddCarryStageNoOut(a, b, cIn archsimd.Int64x8) archsimd.Int64x8 { + return a.Add(b).Sub(cIn) +} + +//go:nosplit +func avx512SubBorrowStage(a, b, bIn, sb, zero archsimd.Int64x8) (r, bOut archsimd.Int64x8) { + s := a.Sub(b) + bsA := a.Xor(sb).Less(b.Xor(sb)).ToInt64x8() + r = s.Add(bIn) + bsB := s.Equal(zero).ToInt64x8().And(bIn) + bOut = bsA.Or(bsB) + return +} + +//go:nosplit +func avx512SubBorrowStageNoOut(a, b, bIn archsimd.Int64x8) archsimd.Int64x8 { + return a.Sub(b).Add(bIn) +} + +func avx512D256AddUnchecked(a, b, r []uint64) { + n := len(r) / 4 + if n == 0 || len(a) < 4*n || len(b) < 4*n { + return + } + pa, pb, pr := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x8(signBit128) + zero := archsimd.BroadcastInt64x8(0) + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 32 + aV0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off))) + aV1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off+64))) + aV2 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off+128))) + aV3 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off+192))) + bV0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off))) + bV1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off+64))) + bV2 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off+128))) + bV3 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off+192))) + + aW0, aW1, aW2, aW3 := transpose8x4Forward(aV0, aV1, aV2, aV3) + bW0, bW1, bW2, bW3 := transpose8x4Forward(bV0, bV1, bV2, bV3) + + rW0, c0 := avx512AddCarryStage(aW0, bW0, zero, sb) + rW1, c1 := avx512AddCarryStage(aW1, bW1, c0, sb) + rW2, c2 := avx512AddCarryStage(aW2, bW2, c1, sb) + rW3 := avx512AddCarryStageNoOut(aW3, bW3, c2) + + rV0, rV1, rV2, rV3 := transpose8x4Inverse(rW0, rW1, rW2, rW3) + rV0.Store((*[8]int64)(unsafe.Add(pr, off))) + rV1.Store((*[8]int64)(unsafe.Add(pr, off+64))) + rV2.Store((*[8]int64)(unsafe.Add(pr, off+128))) + rV3.Store((*[8]int64)(unsafe.Add(pr, off+192))) + } + for ; i < n; i++ { + j := i << 2 + w0, c := addU64(a[j], b[j], 0) + w1, c := addU64(a[j+1], b[j+1], c) + w2, c := addU64(a[j+2], b[j+2], c) + w3, _ := addU64(a[j+3], b[j+3], c) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + } +} + +func avx512D256SubUnchecked(a, b, r []uint64) { + n := len(r) / 4 + if n == 0 || len(a) < 4*n || len(b) < 4*n { + return + } + pa, pb, pr := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x8(signBit128) + zero := archsimd.BroadcastInt64x8(0) + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 32 + aV0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off))) + aV1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off+64))) + aV2 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off+128))) + aV3 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off+192))) + bV0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off))) + bV1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off+64))) + bV2 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off+128))) + bV3 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off+192))) + + aW0, aW1, aW2, aW3 := transpose8x4Forward(aV0, aV1, aV2, aV3) + bW0, bW1, bW2, bW3 := transpose8x4Forward(bV0, bV1, bV2, bV3) + + rW0, b0 := 
avx512SubBorrowStage(aW0, bW0, zero, sb, zero)
+		rW1, b1 := avx512SubBorrowStage(aW1, bW1, b0, sb, zero)
+		rW2, b2 := avx512SubBorrowStage(aW2, bW2, b1, sb, zero)
+		rW3 := avx512SubBorrowStageNoOut(aW3, bW3, b2)
+
+		rV0, rV1, rV2, rV3 := transpose8x4Inverse(rW0, rW1, rW2, rW3)
+		rV0.Store((*[8]int64)(unsafe.Add(pr, off)))
+		rV1.Store((*[8]int64)(unsafe.Add(pr, off+64)))
+		rV2.Store((*[8]int64)(unsafe.Add(pr, off+128)))
+		rV3.Store((*[8]int64)(unsafe.Add(pr, off+192)))
+	}
+	for ; i < n; i++ {
+		j := i << 2
+		w0, br := subU64(a[j], b[j], 0)
+		w1, br := subU64(a[j+1], b[j+1], br)
+		w2, br := subU64(a[j+2], b[j+2], br)
+		w3, _ := subU64(a[j+3], b[j+3], br)
+		r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3
+	}
+}
+
+func avx512D256AddChecked(a, b, r []uint64) int {
+	n := len(r) / 4
+	if n == 0 || len(a) < 4*n || len(b) < 4*n {
+		return -1
+	}
+	pa, pb, pr := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&r[0])
+	sb := archsimd.BroadcastInt64x8(signBit128)
+	zero := archsimd.BroadcastInt64x8(0)
+
+	var ofAcc archsimd.Int64x8
+
+	i := 0
+	for ; i+8 <= n; i += 8 {
+		off := uintptr(i) * 32
+		aV0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off)))
+		aV1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off+64)))
+		aV2 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off+128)))
+		aV3 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off+192)))
+		bV0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off)))
+		bV1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off+64)))
+		bV2 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off+128)))
+		bV3 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off+192)))
+
+		aW0, aW1, aW2, aW3 := transpose8x4Forward(aV0, aV1, aV2, aV3)
+		bW0, bW1, bW2, bW3 := transpose8x4Forward(bV0, bV1, bV2, bV3)
+
+		rW0, c0 := avx512AddCarryStage(aW0, bW0, zero, sb)
+		rW1, c1 := avx512AddCarryStage(aW1, bW1, c0, sb)
+		rW2, c2 := avx512AddCarryStage(aW2, bW2, c1, sb)
+		rW3 := avx512AddCarryStageNoOut(aW3, bW3, c2)
+
+		// Same predicate as the scalar and AVX2 paths: (a^r) &^ (a^b) < 0.
+		ofAcc = ofAcc.Or(aW3.Xor(rW3).AndNot(aW3.Xor(bW3)))
+
+		rV0, rV1, rV2, rV3 := transpose8x4Inverse(rW0, rW1, rW2, rW3)
+		rV0.Store((*[8]int64)(unsafe.Add(pr, off)))
+		rV1.Store((*[8]int64)(unsafe.Add(pr, off+64)))
+		rV2.Store((*[8]int64)(unsafe.Add(pr, off+128)))
+		rV3.Store((*[8]int64)(unsafe.Add(pr, off+192)))
+	}
+	vecEnd := i
+
+	vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0
+
+	for ; i < n; i++ {
+		j := i << 2
+		aHi := a[j+3]
+		bHi := b[j+3]
+		w0, c := addU64(a[j], b[j], 0)
+		w1, c := addU64(a[j+1], b[j+1], c)
+		w2, c := addU64(a[j+2], b[j+2], c)
+		w3, _ := addU64(aHi, bHi, c)
+		r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3
+		ah, bh, rh := int64(aHi), int64(bHi), int64(w3)
+		if (ah^rh)&^(ah^bh) < 0 {
+			if vecOverflow {
+				return d256FirstOverflow(a, b, vecEnd, false)
+			}
+			return i
+		}
+	}
+	if !vecOverflow {
+		return -1
+	}
+	return d256FirstOverflow(a, b, vecEnd, false)
+}
+
+func avx512D256SubChecked(a, b, r []uint64) int {
+	n := len(r) / 4
+	if n == 0 || len(a) < 4*n || len(b) < 4*n {
+		return -1
+	}
+	pa, pb, pr := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&r[0])
+	sb := archsimd.BroadcastInt64x8(signBit128)
+	zero := archsimd.BroadcastInt64x8(0)
+
+	var ofAcc archsimd.Int64x8
+
+	i := 0
+	for ; i+8 <= n; i += 8 {
+		off := uintptr(i) * 32
+		aV0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off)))
+		aV1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off+64)))
+		aV2 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off+128)))
+		aV3 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off+192)))
+		bV0
:= archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off))) + bV1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off+64))) + bV2 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off+128))) + bV3 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off+192))) + + aW0, aW1, aW2, aW3 := transpose8x4Forward(aV0, aV1, aV2, aV3) + bW0, bW1, bW2, bW3 := transpose8x4Forward(bV0, bV1, bV2, bV3) + + rW0, b0 := avx512SubBorrowStage(aW0, bW0, zero, sb, zero) + rW1, b1 := avx512SubBorrowStage(aW1, bW1, b0, sb, zero) + rW2, b2 := avx512SubBorrowStage(aW2, bW2, b1, sb, zero) + rW3 := avx512SubBorrowStageNoOut(aW3, bW3, b2) + + ofAcc = ofAcc.Or(aW3.Xor(rW3).And(aW3.Xor(bW3))) + + rV0, rV1, rV2, rV3 := transpose8x4Inverse(rW0, rW1, rW2, rW3) + rV0.Store((*[8]int64)(unsafe.Add(pr, off))) + rV1.Store((*[8]int64)(unsafe.Add(pr, off+64))) + rV2.Store((*[8]int64)(unsafe.Add(pr, off+128))) + rV3.Store((*[8]int64)(unsafe.Add(pr, off+192))) + } + vecEnd := i + + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + for ; i < n; i++ { + j := i << 2 + aHi := a[j+3] + bHi := b[j+3] + w0, br := subU64(a[j], b[j], 0) + w1, br := subU64(a[j+1], b[j+1], br) + w2, br := subU64(a[j+2], b[j+2], br) + w3, _ := subU64(aHi, bHi, br) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + ah, bh, rh := int64(aHi), int64(bHi), int64(w3) + if (ah^rh)&(ah^bh) < 0 { + if vecOverflow { + return d256FirstOverflow(a, b, vecEnd, true) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d256FirstOverflow(a, b, vecEnd, true) +} + +// --------------------------------------------------------------------------- +// AVX2 D256 broadcast variants. The scalar's 4 words become 4 uniform Int64x4 +// vectors — no load/transpose needed for the scalar operand. +// --------------------------------------------------------------------------- + +func avx2D256AddScalarUnchecked(s0, s1, s2, s3 uint64, v, r []uint64) { + n := len(r) / 4 + if n == 0 || len(v) < 4*n { + return + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x4(signBit128) + zero := archsimd.BroadcastInt64x4(0) + bW0 := archsimd.BroadcastInt64x4(int64(s0)) + bW1 := archsimd.BroadcastInt64x4(int64(s1)) + bW2 := archsimd.BroadcastInt64x4(int64(s2)) + bW3 := archsimd.BroadcastInt64x4(int64(s3)) + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 32 + v0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+32))) + v2 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+64))) + v3 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+96))) + + aW0, aW1, aW2, aW3 := transpose4x4(v0, v1, v2, v3) + + rW0, c0 := addCarryStage(aW0, bW0, zero, sb) + rW1, c1 := addCarryStage(aW1, bW1, c0, sb) + rW2, c2 := addCarryStage(aW2, bW2, c1, sb) + rW3 := addCarryStageNoOut(aW3, bW3, c2) + + rV0, rV1, rV2, rV3 := transpose4x4(rW0, rW1, rW2, rW3) + rV0.Store((*[4]int64)(unsafe.Add(pr, off))) + rV1.Store((*[4]int64)(unsafe.Add(pr, off+32))) + rV2.Store((*[4]int64)(unsafe.Add(pr, off+64))) + rV3.Store((*[4]int64)(unsafe.Add(pr, off+96))) + } + for ; i < n; i++ { + j := i << 2 + w0, c := addU64(s0, v[j], 0) + w1, c := addU64(s1, v[j+1], c) + w2, c := addU64(s2, v[j+2], c) + w3, _ := addU64(s3, v[j+3], c) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + } +} + +func avx2D256SubScalarUnchecked(v []uint64, s0, s1, s2, s3 uint64, r []uint64) { + n := len(r) / 4 + if n == 0 || len(v) < 4*n { + return + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sb := 
archsimd.BroadcastInt64x4(signBit128) + zero := archsimd.BroadcastInt64x4(0) + bW0 := archsimd.BroadcastInt64x4(int64(s0)) + bW1 := archsimd.BroadcastInt64x4(int64(s1)) + bW2 := archsimd.BroadcastInt64x4(int64(s2)) + bW3 := archsimd.BroadcastInt64x4(int64(s3)) + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 32 + v0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+32))) + v2 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+64))) + v3 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+96))) + + aW0, aW1, aW2, aW3 := transpose4x4(v0, v1, v2, v3) + + rW0, b0 := subBorrowStage(aW0, bW0, zero, sb, zero) + rW1, b1 := subBorrowStage(aW1, bW1, b0, sb, zero) + rW2, b2 := subBorrowStage(aW2, bW2, b1, sb, zero) + rW3 := subBorrowStageNoOut(aW3, bW3, b2) + + rV0, rV1, rV2, rV3 := transpose4x4(rW0, rW1, rW2, rW3) + rV0.Store((*[4]int64)(unsafe.Add(pr, off))) + rV1.Store((*[4]int64)(unsafe.Add(pr, off+32))) + rV2.Store((*[4]int64)(unsafe.Add(pr, off+64))) + rV3.Store((*[4]int64)(unsafe.Add(pr, off+96))) + } + for ; i < n; i++ { + j := i << 2 + w0, br := subU64(v[j], s0, 0) + w1, br := subU64(v[j+1], s1, br) + w2, br := subU64(v[j+2], s2, br) + w3, _ := subU64(v[j+3], s3, br) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + } +} + +func avx2D256ScalarSubUnchecked(s0, s1, s2, s3 uint64, v, r []uint64) { + n := len(r) / 4 + if n == 0 || len(v) < 4*n { + return + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x4(signBit128) + zero := archsimd.BroadcastInt64x4(0) + aW0 := archsimd.BroadcastInt64x4(int64(s0)) + aW1 := archsimd.BroadcastInt64x4(int64(s1)) + aW2 := archsimd.BroadcastInt64x4(int64(s2)) + aW3 := archsimd.BroadcastInt64x4(int64(s3)) + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 32 + v0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+32))) + v2 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+64))) + v3 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+96))) + + bW0, bW1, bW2, bW3 := transpose4x4(v0, v1, v2, v3) + + rW0, b0 := subBorrowStage(aW0, bW0, zero, sb, zero) + rW1, b1 := subBorrowStage(aW1, bW1, b0, sb, zero) + rW2, b2 := subBorrowStage(aW2, bW2, b1, sb, zero) + rW3 := subBorrowStageNoOut(aW3, bW3, b2) + + rV0, rV1, rV2, rV3 := transpose4x4(rW0, rW1, rW2, rW3) + rV0.Store((*[4]int64)(unsafe.Add(pr, off))) + rV1.Store((*[4]int64)(unsafe.Add(pr, off+32))) + rV2.Store((*[4]int64)(unsafe.Add(pr, off+64))) + rV3.Store((*[4]int64)(unsafe.Add(pr, off+96))) + } + for ; i < n; i++ { + j := i << 2 + w0, br := subU64(s0, v[j], 0) + w1, br := subU64(s1, v[j+1], br) + w2, br := subU64(s2, v[j+2], br) + w3, _ := subU64(s3, v[j+3], br) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + } +} + +func avx2D256AddScalarChecked(s0, s1, s2, s3 uint64, v, r []uint64) int { + n := len(r) / 4 + if n == 0 || len(v) < 4*n { + return -1 + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x4(signBit128) + zero := archsimd.BroadcastInt64x4(0) + bW0 := archsimd.BroadcastInt64x4(int64(s0)) + bW1 := archsimd.BroadcastInt64x4(int64(s1)) + bW2 := archsimd.BroadcastInt64x4(int64(s2)) + bW3 := archsimd.BroadcastInt64x4(int64(s3)) + + var ofAcc archsimd.Int64x4 + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 32 + v0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+32))) + v2 := 
archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+64))) + v3 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+96))) + + aW0, aW1, aW2, aW3 := transpose4x4(v0, v1, v2, v3) + + rW0, c0 := addCarryStage(aW0, bW0, zero, sb) + rW1, c1 := addCarryStage(aW1, bW1, c0, sb) + rW2, c2 := addCarryStage(aW2, bW2, c1, sb) + rW3 := addCarryStageNoOut(aW3, bW3, c2) + ofAcc = ofAcc.Or(aW3.Xor(rW3).AndNot(aW3.Xor(bW3))) + + rV0, rV1, rV2, rV3 := transpose4x4(rW0, rW1, rW2, rW3) + rV0.Store((*[4]int64)(unsafe.Add(pr, off))) + rV1.Store((*[4]int64)(unsafe.Add(pr, off+32))) + rV2.Store((*[4]int64)(unsafe.Add(pr, off+64))) + rV3.Store((*[4]int64)(unsafe.Add(pr, off+96))) + } + vecEnd := i + + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + sh := int64(s3) + for ; i < n; i++ { + j := i << 2 + vHi := v[j+3] + w0, c := addU64(s0, v[j], 0) + w1, c := addU64(s1, v[j+1], c) + w2, c := addU64(s2, v[j+2], c) + w3, _ := addU64(s3, vHi, c) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + vh, rh := int64(vHi), int64(w3) + if (sh^rh)&^(sh^vh) < 0 { + if vecOverflow { + return d256ScalarFirstOverflow(s0, s1, s2, s3, v, vecEnd, 0) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d256ScalarFirstOverflow(s0, s1, s2, s3, v, vecEnd, 0) +} + +func avx2D256SubScalarChecked(v []uint64, s0, s1, s2, s3 uint64, r []uint64) int { + n := len(r) / 4 + if n == 0 || len(v) < 4*n { + return -1 + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x4(signBit128) + zero := archsimd.BroadcastInt64x4(0) + bW0 := archsimd.BroadcastInt64x4(int64(s0)) + bW1 := archsimd.BroadcastInt64x4(int64(s1)) + bW2 := archsimd.BroadcastInt64x4(int64(s2)) + bW3 := archsimd.BroadcastInt64x4(int64(s3)) + + var ofAcc archsimd.Int64x4 + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 32 + v0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+32))) + v2 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+64))) + v3 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+96))) + + aW0, aW1, aW2, aW3 := transpose4x4(v0, v1, v2, v3) + + rW0, b0 := subBorrowStage(aW0, bW0, zero, sb, zero) + rW1, b1 := subBorrowStage(aW1, bW1, b0, sb, zero) + rW2, b2 := subBorrowStage(aW2, bW2, b1, sb, zero) + rW3 := subBorrowStageNoOut(aW3, bW3, b2) + ofAcc = ofAcc.Or(aW3.Xor(rW3).And(aW3.Xor(bW3))) + + rV0, rV1, rV2, rV3 := transpose4x4(rW0, rW1, rW2, rW3) + rV0.Store((*[4]int64)(unsafe.Add(pr, off))) + rV1.Store((*[4]int64)(unsafe.Add(pr, off+32))) + rV2.Store((*[4]int64)(unsafe.Add(pr, off+64))) + rV3.Store((*[4]int64)(unsafe.Add(pr, off+96))) + } + vecEnd := i + + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + sh := int64(s3) + for ; i < n; i++ { + j := i << 2 + vHi := v[j+3] + w0, br := subU64(v[j], s0, 0) + w1, br := subU64(v[j+1], s1, br) + w2, br := subU64(v[j+2], s2, br) + w3, _ := subU64(vHi, s3, br) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + vh, rh := int64(vHi), int64(w3) + if (vh^rh)&(vh^sh) < 0 { + if vecOverflow { + return d256ScalarFirstOverflow(s0, s1, s2, s3, v, vecEnd, 1) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d256ScalarFirstOverflow(s0, s1, s2, s3, v, vecEnd, 1) +} + +func avx2D256ScalarSubChecked(s0, s1, s2, s3 uint64, v, r []uint64) int { + n := len(r) / 4 + if n == 0 || len(v) < 4*n { + return -1 + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x4(signBit128) + zero := archsimd.BroadcastInt64x4(0) + aW0 := 
archsimd.BroadcastInt64x4(int64(s0)) + aW1 := archsimd.BroadcastInt64x4(int64(s1)) + aW2 := archsimd.BroadcastInt64x4(int64(s2)) + aW3 := archsimd.BroadcastInt64x4(int64(s3)) + + var ofAcc archsimd.Int64x4 + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 32 + v0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+32))) + v2 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+64))) + v3 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+96))) + + bW0, bW1, bW2, bW3 := transpose4x4(v0, v1, v2, v3) + + rW0, b0 := subBorrowStage(aW0, bW0, zero, sb, zero) + rW1, b1 := subBorrowStage(aW1, bW1, b0, sb, zero) + rW2, b2 := subBorrowStage(aW2, bW2, b1, sb, zero) + rW3 := subBorrowStageNoOut(aW3, bW3, b2) + ofAcc = ofAcc.Or(aW3.Xor(rW3).And(aW3.Xor(bW3))) + + rV0, rV1, rV2, rV3 := transpose4x4(rW0, rW1, rW2, rW3) + rV0.Store((*[4]int64)(unsafe.Add(pr, off))) + rV1.Store((*[4]int64)(unsafe.Add(pr, off+32))) + rV2.Store((*[4]int64)(unsafe.Add(pr, off+64))) + rV3.Store((*[4]int64)(unsafe.Add(pr, off+96))) + } + vecEnd := i + + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + sh := int64(s3) + for ; i < n; i++ { + j := i << 2 + vHi := v[j+3] + w0, br := subU64(s0, v[j], 0) + w1, br := subU64(s1, v[j+1], br) + w2, br := subU64(s2, v[j+2], br) + w3, _ := subU64(s3, vHi, br) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + vh, rh := int64(vHi), int64(w3) + if (sh^rh)&(sh^vh) < 0 { + if vecOverflow { + return d256ScalarFirstOverflow(s0, s1, s2, s3, v, vecEnd, 2) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d256ScalarFirstOverflow(s0, s1, s2, s3, v, vecEnd, 2) +} + +// --------------------------------------------------------------------------- +// AVX-512 D256 broadcast variants. Same skeleton but Int64x8 and 8 elems/iter. 
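+// (Each Int64x8 load/store covers 64 B, so one iteration moves 8 elems × 32 B
+// = 256 B per operand stream; offsets step 0/64/128/192 within an iteration.)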
+// --------------------------------------------------------------------------- + +func avx512D256AddScalarUnchecked(s0, s1, s2, s3 uint64, v, r []uint64) { + n := len(r) / 4 + if n == 0 || len(v) < 4*n { + return + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x8(signBit128) + zero := archsimd.BroadcastInt64x8(0) + bW0 := archsimd.BroadcastInt64x8(int64(s0)) + bW1 := archsimd.BroadcastInt64x8(int64(s1)) + bW2 := archsimd.BroadcastInt64x8(int64(s2)) + bW3 := archsimd.BroadcastInt64x8(int64(s3)) + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 32 + v0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+64))) + v2 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+128))) + v3 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+192))) + + aW0, aW1, aW2, aW3 := transpose8x4Forward(v0, v1, v2, v3) + + rW0, c0 := avx512AddCarryStage(aW0, bW0, zero, sb) + rW1, c1 := avx512AddCarryStage(aW1, bW1, c0, sb) + rW2, c2 := avx512AddCarryStage(aW2, bW2, c1, sb) + rW3 := avx512AddCarryStageNoOut(aW3, bW3, c2) + + rV0, rV1, rV2, rV3 := transpose8x4Inverse(rW0, rW1, rW2, rW3) + rV0.Store((*[8]int64)(unsafe.Add(pr, off))) + rV1.Store((*[8]int64)(unsafe.Add(pr, off+64))) + rV2.Store((*[8]int64)(unsafe.Add(pr, off+128))) + rV3.Store((*[8]int64)(unsafe.Add(pr, off+192))) + } + for ; i < n; i++ { + j := i << 2 + w0, c := addU64(s0, v[j], 0) + w1, c := addU64(s1, v[j+1], c) + w2, c := addU64(s2, v[j+2], c) + w3, _ := addU64(s3, v[j+3], c) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + } +} + +func avx512D256SubScalarUnchecked(v []uint64, s0, s1, s2, s3 uint64, r []uint64) { + n := len(r) / 4 + if n == 0 || len(v) < 4*n { + return + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x8(signBit128) + zero := archsimd.BroadcastInt64x8(0) + bW0 := archsimd.BroadcastInt64x8(int64(s0)) + bW1 := archsimd.BroadcastInt64x8(int64(s1)) + bW2 := archsimd.BroadcastInt64x8(int64(s2)) + bW3 := archsimd.BroadcastInt64x8(int64(s3)) + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 32 + v0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+64))) + v2 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+128))) + v3 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+192))) + + aW0, aW1, aW2, aW3 := transpose8x4Forward(v0, v1, v2, v3) + + rW0, b0 := avx512SubBorrowStage(aW0, bW0, zero, sb, zero) + rW1, b1 := avx512SubBorrowStage(aW1, bW1, b0, sb, zero) + rW2, b2 := avx512SubBorrowStage(aW2, bW2, b1, sb, zero) + rW3 := avx512SubBorrowStageNoOut(aW3, bW3, b2) + + rV0, rV1, rV2, rV3 := transpose8x4Inverse(rW0, rW1, rW2, rW3) + rV0.Store((*[8]int64)(unsafe.Add(pr, off))) + rV1.Store((*[8]int64)(unsafe.Add(pr, off+64))) + rV2.Store((*[8]int64)(unsafe.Add(pr, off+128))) + rV3.Store((*[8]int64)(unsafe.Add(pr, off+192))) + } + for ; i < n; i++ { + j := i << 2 + w0, br := subU64(v[j], s0, 0) + w1, br := subU64(v[j+1], s1, br) + w2, br := subU64(v[j+2], s2, br) + w3, _ := subU64(v[j+3], s3, br) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + } +} + +func avx512D256ScalarSubUnchecked(s0, s1, s2, s3 uint64, v, r []uint64) { + n := len(r) / 4 + if n == 0 || len(v) < 4*n { + return + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x8(signBit128) + zero := archsimd.BroadcastInt64x8(0) + aW0 := archsimd.BroadcastInt64x8(int64(s0)) + aW1 := 
archsimd.BroadcastInt64x8(int64(s1)) + aW2 := archsimd.BroadcastInt64x8(int64(s2)) + aW3 := archsimd.BroadcastInt64x8(int64(s3)) + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 32 + v0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+64))) + v2 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+128))) + v3 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+192))) + + bW0, bW1, bW2, bW3 := transpose8x4Forward(v0, v1, v2, v3) + + rW0, b0 := avx512SubBorrowStage(aW0, bW0, zero, sb, zero) + rW1, b1 := avx512SubBorrowStage(aW1, bW1, b0, sb, zero) + rW2, b2 := avx512SubBorrowStage(aW2, bW2, b1, sb, zero) + rW3 := avx512SubBorrowStageNoOut(aW3, bW3, b2) + + rV0, rV1, rV2, rV3 := transpose8x4Inverse(rW0, rW1, rW2, rW3) + rV0.Store((*[8]int64)(unsafe.Add(pr, off))) + rV1.Store((*[8]int64)(unsafe.Add(pr, off+64))) + rV2.Store((*[8]int64)(unsafe.Add(pr, off+128))) + rV3.Store((*[8]int64)(unsafe.Add(pr, off+192))) + } + for ; i < n; i++ { + j := i << 2 + w0, br := subU64(s0, v[j], 0) + w1, br := subU64(s1, v[j+1], br) + w2, br := subU64(s2, v[j+2], br) + w3, _ := subU64(s3, v[j+3], br) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + } +} + +func avx512D256AddScalarChecked(s0, s1, s2, s3 uint64, v, r []uint64) int { + n := len(r) / 4 + if n == 0 || len(v) < 4*n { + return -1 + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x8(signBit128) + zero := archsimd.BroadcastInt64x8(0) + bW0 := archsimd.BroadcastInt64x8(int64(s0)) + bW1 := archsimd.BroadcastInt64x8(int64(s1)) + bW2 := archsimd.BroadcastInt64x8(int64(s2)) + bW3 := archsimd.BroadcastInt64x8(int64(s3)) + + var ofAcc archsimd.Int64x8 + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 32 + v0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+64))) + v2 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+128))) + v3 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+192))) + + aW0, aW1, aW2, aW3 := transpose8x4Forward(v0, v1, v2, v3) + + rW0, c0 := avx512AddCarryStage(aW0, bW0, zero, sb) + rW1, c1 := avx512AddCarryStage(aW1, bW1, c0, sb) + rW2, c2 := avx512AddCarryStage(aW2, bW2, c1, sb) + rW3 := avx512AddCarryStageNoOut(aW3, bW3, c2) + ofAcc = ofAcc.Or(aW3.Xor(bW3).AndNot(aW3.Xor(rW3))) + + rV0, rV1, rV2, rV3 := transpose8x4Inverse(rW0, rW1, rW2, rW3) + rV0.Store((*[8]int64)(unsafe.Add(pr, off))) + rV1.Store((*[8]int64)(unsafe.Add(pr, off+64))) + rV2.Store((*[8]int64)(unsafe.Add(pr, off+128))) + rV3.Store((*[8]int64)(unsafe.Add(pr, off+192))) + } + vecEnd := i + + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + sh := int64(s3) + for ; i < n; i++ { + j := i << 2 + vHi := v[j+3] + w0, c := addU64(s0, v[j], 0) + w1, c := addU64(s1, v[j+1], c) + w2, c := addU64(s2, v[j+2], c) + w3, _ := addU64(s3, vHi, c) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + vh, rh := int64(vHi), int64(w3) + if (sh^rh)&^(sh^vh) < 0 { + if vecOverflow { + return d256ScalarFirstOverflow(s0, s1, s2, s3, v, vecEnd, 0) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d256ScalarFirstOverflow(s0, s1, s2, s3, v, vecEnd, 0) +} + +func avx512D256SubScalarChecked(v []uint64, s0, s1, s2, s3 uint64, r []uint64) int { + n := len(r) / 4 + if n == 0 || len(v) < 4*n { + return -1 + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x8(signBit128) + zero := archsimd.BroadcastInt64x8(0) + bW0 := 
archsimd.BroadcastInt64x8(int64(s0)) + bW1 := archsimd.BroadcastInt64x8(int64(s1)) + bW2 := archsimd.BroadcastInt64x8(int64(s2)) + bW3 := archsimd.BroadcastInt64x8(int64(s3)) + + var ofAcc archsimd.Int64x8 + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 32 + v0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+64))) + v2 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+128))) + v3 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+192))) + + aW0, aW1, aW2, aW3 := transpose8x4Forward(v0, v1, v2, v3) + + rW0, b0 := avx512SubBorrowStage(aW0, bW0, zero, sb, zero) + rW1, b1 := avx512SubBorrowStage(aW1, bW1, b0, sb, zero) + rW2, b2 := avx512SubBorrowStage(aW2, bW2, b1, sb, zero) + rW3 := avx512SubBorrowStageNoOut(aW3, bW3, b2) + ofAcc = ofAcc.Or(aW3.Xor(rW3).And(aW3.Xor(bW3))) + + rV0, rV1, rV2, rV3 := transpose8x4Inverse(rW0, rW1, rW2, rW3) + rV0.Store((*[8]int64)(unsafe.Add(pr, off))) + rV1.Store((*[8]int64)(unsafe.Add(pr, off+64))) + rV2.Store((*[8]int64)(unsafe.Add(pr, off+128))) + rV3.Store((*[8]int64)(unsafe.Add(pr, off+192))) + } + vecEnd := i + + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + sh := int64(s3) + for ; i < n; i++ { + j := i << 2 + vHi := v[j+3] + w0, br := subU64(v[j], s0, 0) + w1, br := subU64(v[j+1], s1, br) + w2, br := subU64(v[j+2], s2, br) + w3, _ := subU64(vHi, s3, br) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + vh, rh := int64(vHi), int64(w3) + if (vh^rh)&(vh^sh) < 0 { + if vecOverflow { + return d256ScalarFirstOverflow(s0, s1, s2, s3, v, vecEnd, 1) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d256ScalarFirstOverflow(s0, s1, s2, s3, v, vecEnd, 1) +} + +func avx512D256ScalarSubChecked(s0, s1, s2, s3 uint64, v, r []uint64) int { + n := len(r) / 4 + if n == 0 || len(v) < 4*n { + return -1 + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x8(signBit128) + zero := archsimd.BroadcastInt64x8(0) + aW0 := archsimd.BroadcastInt64x8(int64(s0)) + aW1 := archsimd.BroadcastInt64x8(int64(s1)) + aW2 := archsimd.BroadcastInt64x8(int64(s2)) + aW3 := archsimd.BroadcastInt64x8(int64(s3)) + + var ofAcc archsimd.Int64x8 + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 32 + v0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+64))) + v2 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+128))) + v3 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+192))) + + bW0, bW1, bW2, bW3 := transpose8x4Forward(v0, v1, v2, v3) + + rW0, b0 := avx512SubBorrowStage(aW0, bW0, zero, sb, zero) + rW1, b1 := avx512SubBorrowStage(aW1, bW1, b0, sb, zero) + rW2, b2 := avx512SubBorrowStage(aW2, bW2, b1, sb, zero) + rW3 := avx512SubBorrowStageNoOut(aW3, bW3, b2) + ofAcc = ofAcc.Or(aW3.Xor(rW3).And(aW3.Xor(bW3))) + + rV0, rV1, rV2, rV3 := transpose8x4Inverse(rW0, rW1, rW2, rW3) + rV0.Store((*[8]int64)(unsafe.Add(pr, off))) + rV1.Store((*[8]int64)(unsafe.Add(pr, off+64))) + rV2.Store((*[8]int64)(unsafe.Add(pr, off+128))) + rV3.Store((*[8]int64)(unsafe.Add(pr, off+192))) + } + vecEnd := i + + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + sh := int64(s3) + for ; i < n; i++ { + j := i << 2 + vHi := v[j+3] + w0, br := subU64(s0, v[j], 0) + w1, br := subU64(s1, v[j+1], br) + w2, br := subU64(s2, v[j+2], br) + w3, _ := subU64(s3, vHi, br) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + vh, rh := int64(vHi), int64(w3) + if (sh^rh)&(sh^vh) < 0 { + if 
vecOverflow { + return d256ScalarFirstOverflow(s0, s1, s2, s3, v, vecEnd, 2) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d256ScalarFirstOverflow(s0, s1, s2, s3, v, vecEnd, 2) +} diff --git a/pkg/common/simdkernels/d256_addsub_test.go b/pkg/common/simdkernels/d256_addsub_test.go new file mode 100644 index 0000000000000..c10b0b1cfe05a --- /dev/null +++ b/pkg/common/simdkernels/d256_addsub_test.go @@ -0,0 +1,670 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build goexperiment.simd && amd64 + +package simdkernels + +import ( + "math" + "math/rand/v2" + "strconv" + "testing" + + "golang.org/x/sys/cpu" +) + +// All slice lengths below count Decimal256 elements (each backed by 4 uint64). + +type d256UncheckedImpl struct { + name string + fn func(a, b, r []uint64) +} + +type d256CheckedImpl struct { + name string + fn func(a, b, r []uint64) int +} + +func d256Sizes() []int { + return []int{0, 1, 2, 3, 4, 5, 7, 8, 9, 15, 16, 17, 31, 32, 33, 35, 63, 64, 127, 128, 1023, 2048} +} + +func makeRandD256(n int, seed uint64) []uint64 { + rng := rand.New(rand.NewPCG(seed, seed^0xDEADBEEFCAFEBABE)) + out := make([]uint64, 4*n) + for i := range out { + out[i] = rng.Uint64() + } + return out +} + +// makeRand256SmallSigned clears the top two bits of the high word, so the +// high word of every value stays below 2^62 and 256-bit add/sub of any two +// such values cannot overflow signed. 
+func makeRand256SmallSigned(n int, seed uint64) []uint64 { + out := makeRandD256(n, seed) + for i := 3; i < len(out); i += 4 { + out[i] &= 0x3FFFFFFFFFFFFFFF + } + return out +} + +func TestD256AddVariants(t *testing.T) { + impls := []d256UncheckedImpl{ + {"scalar", scalarD256AddUnchecked}, + {"avx2", avx2D256AddUnchecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d256UncheckedImpl{"avx512", avx512D256AddUnchecked}) + } + for _, n := range d256Sizes() { + a := makeRandD256(n, uint64(n)*7+1) + b := makeRandD256(n, uint64(n)*11+3) + want := make([]uint64, 4*n) + scalarD256AddUnchecked(a, b, want) + for _, impl := range impls { + got := make([]uint64, 4*n) + impl.fn(a, b, got) + for i := 0; i < 4*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d slot=%d: got %x want %x", impl.name, n, i, got[i], want[i]) + } + } + } + } +} + +func TestD256SubVariants(t *testing.T) { + impls := []d256UncheckedImpl{ + {"scalar", scalarD256SubUnchecked}, + {"avx2", avx2D256SubUnchecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d256UncheckedImpl{"avx512", avx512D256SubUnchecked}) + } + for _, n := range d256Sizes() { + a := makeRandD256(n, uint64(n)*13+5) + b := makeRandD256(n, uint64(n)*17+9) + want := make([]uint64, 4*n) + scalarD256SubUnchecked(a, b, want) + for _, impl := range impls { + got := make([]uint64, 4*n) + impl.fn(a, b, got) + for i := 0; i < 4*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d slot=%d: got %x want %x", impl.name, n, i, got[i], want[i]) + } + } + } + } +} + +func TestD256AddCheckedVariants(t *testing.T) { + impls := []d256CheckedImpl{ + {"scalar", scalarD256AddChecked}, + {"avx2", avx2D256AddChecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d256CheckedImpl{"avx512", avx512D256AddChecked}) + } + + // 1) No-overflow random inputs. + for _, n := range d256Sizes() { + a := makeRand256SmallSigned(n, uint64(n)*19+7) + b := makeRand256SmallSigned(n, uint64(n)*23+11) + want := make([]uint64, 4*n) + if got := scalarD256AddChecked(a, b, want); got != -1 { + t.Fatalf("setup: scalar overflow at %d for masked input n=%d", got, n) + } + for _, impl := range impls { + got := make([]uint64, 4*n) + if idx := impl.fn(a, b, got); idx != -1 { + t.Fatalf("%s n=%d: spurious overflow at %d", impl.name, n, idx) + } + for i := 0; i < 4*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d slot=%d: got %x want %x", impl.name, n, i, got[i], want[i]) + } + } + } + } + + // 2) Inject MaxInt256 + 1 overflow at varying positions. + for _, n := range []int{4, 8, 9, 16, 17, 33, 35, 64} { + for _, pos := range []int{0, 1, 3, 4, 7, 8, n - 1} { + if pos < 0 || pos >= n { + continue + } + a := make([]uint64, 4*n) + b := make([]uint64, 4*n) + j := pos << 2 + a[j] = math.MaxUint64 + a[j+1] = math.MaxUint64 + a[j+2] = math.MaxUint64 + a[j+3] = uint64(math.MaxInt64) // a = MaxInt256 + b[j] = 1 + // b = 1 + for _, impl := range impls { + got := make([]uint64, 4*n) + idx := impl.fn(a, b, got) + if idx != pos { + t.Fatalf("%s n=%d inject pos=%d: got idx %d", impl.name, n, pos, idx) + } + } + } + } + + // 3) Carry propagation cascading w0→w1→w2→w3. 
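+	// Each a[i] is (MaxUint64, MaxUint64, MaxUint64, i) and b[i] = 1, so the
+	// +1 wraps all three low words to zero and carries into the top word
+	// (i → i+1): the full carry chain fires with no signed overflow.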
+ for _, n := range []int{4, 8, 16, 17, 33} { + a := make([]uint64, 4*n) + b := make([]uint64, 4*n) + for i := 0; i < n; i++ { + j := i << 2 + a[j] = math.MaxUint64 + a[j+1] = math.MaxUint64 + a[j+2] = math.MaxUint64 + a[j+3] = uint64(i) // small positive top + b[j] = 1 + } + want := make([]uint64, 4*n) + scalarD256AddChecked(a, b, want) + for _, impl := range impls { + got := make([]uint64, 4*n) + if idx := impl.fn(a, b, got); idx != -1 { + t.Fatalf("%s carry n=%d: spurious overflow at %d", impl.name, n, idx) + } + for i := 0; i < 4*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s carry n=%d slot=%d: got %x want %x", impl.name, n, i, got[i], want[i]) + } + } + } + } +} + +func TestD256SubCheckedVariants(t *testing.T) { + impls := []d256CheckedImpl{ + {"scalar", scalarD256SubChecked}, + {"avx2", avx2D256SubChecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d256CheckedImpl{"avx512", avx512D256SubChecked}) + } + + for _, n := range d256Sizes() { + a := makeRand256SmallSigned(n, uint64(n)*29+13) + b := makeRand256SmallSigned(n, uint64(n)*31+17) + want := make([]uint64, 4*n) + if got := scalarD256SubChecked(a, b, want); got != -1 { + t.Fatalf("setup: scalar overflow at %d for n=%d", got, n) + } + for _, impl := range impls { + got := make([]uint64, 4*n) + if idx := impl.fn(a, b, got); idx != -1 { + t.Fatalf("%s n=%d: spurious overflow at %d", impl.name, n, idx) + } + for i := 0; i < 4*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d slot=%d: got %x want %x", impl.name, n, i, got[i], want[i]) + } + } + } + } + + // Inject MinInt256 - 1 overflow at varying positions. + for _, n := range []int{4, 8, 9, 16, 17, 33, 35, 64} { + for _, pos := range []int{0, 1, 3, 4, 7, 8, n - 1} { + if pos < 0 || pos >= n { + continue + } + a := make([]uint64, 4*n) + b := make([]uint64, 4*n) + j := pos << 2 + // a = MinInt256 + a[j+3] = 1 << 63 + b[j] = 1 // b = 1 + for _, impl := range impls { + got := make([]uint64, 4*n) + idx := impl.fn(a, b, got) + if idx != pos { + t.Fatalf("%s n=%d inject pos=%d: got idx %d", impl.name, n, pos, idx) + } + } + } + } + + // Borrow propagation cascading w0→w1→w2→w3. 
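+	// Here a[i] = (0, 0, 0, i+10) and b[i] = 1, so subtracting 1 borrows
+	// through all three zero low words and drops the top word to i+9; the
+	// result stays positive, so no overflow may be reported.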
+ for _, n := range []int{4, 8, 16, 17, 33} { + a := make([]uint64, 4*n) + b := make([]uint64, 4*n) + for i := 0; i < n; i++ { + j := i << 2 + a[j+3] = uint64(i + 10) // safely positive after borrow + b[j] = 1 + } + want := make([]uint64, 4*n) + scalarD256SubChecked(a, b, want) + for _, impl := range impls { + got := make([]uint64, 4*n) + if idx := impl.fn(a, b, got); idx != -1 { + t.Fatalf("%s borrow n=%d: spurious overflow at %d", impl.name, n, idx) + } + for i := 0; i < 4*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s borrow n=%d slot=%d: got %x want %x", impl.name, n, i, got[i], want[i]) + } + } + } + } +} + +// --------------------------------------------------------------------------- +// Benchmarks +// --------------------------------------------------------------------------- + +var d256BenchSizes = []int{16, 64, 256, 1024, 4096} + +func benchD256Unchecked(b *testing.B, fn func(a, bb, r []uint64), n int) { + a := makeRandD256(n, 1) + bb := makeRandD256(n, 2) + r := make([]uint64, 4*n) + b.SetBytes(int64(n) * 32 * 3) + b.ResetTimer() + for i := 0; i < b.N; i++ { + fn(a, bb, r) + } +} + +func benchD256Checked(b *testing.B, fn func(a, bb, r []uint64) int, n int) { + a := makeRand256SmallSigned(n, 1) + bb := makeRand256SmallSigned(n, 2) + r := make([]uint64, 4*n) + b.SetBytes(int64(n) * 32 * 3) + b.ResetTimer() + for i := 0; i < b.N; i++ { + _ = fn(a, bb, r) + } +} + +func BenchmarkD256AddUnchecked(b *testing.B) { + for _, n := range d256BenchSizes { + b.Run("scalar/n="+strconv.Itoa(n), func(b *testing.B) { benchD256Unchecked(b, scalarD256AddUnchecked, n) }) + b.Run("avx2/n="+strconv.Itoa(n), func(b *testing.B) { benchD256Unchecked(b, avx2D256AddUnchecked, n) }) + if cpu.X86.HasAVX512 { + b.Run("avx512/n="+strconv.Itoa(n), func(b *testing.B) { benchD256Unchecked(b, avx512D256AddUnchecked, n) }) + } + } +} + +func BenchmarkD256SubUnchecked(b *testing.B) { + for _, n := range d256BenchSizes { + b.Run("scalar/n="+strconv.Itoa(n), func(b *testing.B) { benchD256Unchecked(b, scalarD256SubUnchecked, n) }) + b.Run("avx2/n="+strconv.Itoa(n), func(b *testing.B) { benchD256Unchecked(b, avx2D256SubUnchecked, n) }) + if cpu.X86.HasAVX512 { + b.Run("avx512/n="+strconv.Itoa(n), func(b *testing.B) { benchD256Unchecked(b, avx512D256SubUnchecked, n) }) + } + } +} + +func BenchmarkD256AddChecked(b *testing.B) { + for _, n := range d256BenchSizes { + b.Run("scalar/n="+strconv.Itoa(n), func(b *testing.B) { benchD256Checked(b, scalarD256AddChecked, n) }) + b.Run("avx2/n="+strconv.Itoa(n), func(b *testing.B) { benchD256Checked(b, avx2D256AddChecked, n) }) + if cpu.X86.HasAVX512 { + b.Run("avx512/n="+strconv.Itoa(n), func(b *testing.B) { benchD256Checked(b, avx512D256AddChecked, n) }) + } + } +} + +func BenchmarkD256SubChecked(b *testing.B) { + for _, n := range d256BenchSizes { + b.Run("scalar/n="+strconv.Itoa(n), func(b *testing.B) { benchD256Checked(b, scalarD256SubChecked, n) }) + b.Run("avx2/n="+strconv.Itoa(n), func(b *testing.B) { benchD256Checked(b, avx2D256SubChecked, n) }) + if cpu.X86.HasAVX512 { + b.Run("avx512/n="+strconv.Itoa(n), func(b *testing.B) { benchD256Checked(b, avx512D256SubChecked, n) }) + } + } +} + +// --------------------------------------------------------------------------- +// Scalar-broadcast tests (D256 = 4 uint64/elem; scalar = (s0, s1, s2, s3)). 
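+// Word order matches the kernel layout: s0 is the least-significant q-word
+// and s3 the most-significant (sign-carrying) q-word of the broadcast value.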
+// --------------------------------------------------------------------------- + +type d256ScalarVUImpl struct { + name string + fn func(s0, s1, s2, s3 uint64, v, r []uint64) +} + +type d256ScalarVCImpl struct { + name string + fn func(s0, s1, s2, s3 uint64, v, r []uint64) int +} + +type d256VScalarUImpl struct { + name string + fn func(v []uint64, s0, s1, s2, s3 uint64, r []uint64) +} + +type d256VScalarCImpl struct { + name string + fn func(v []uint64, s0, s1, s2, s3 uint64, r []uint64) int +} + +func d256Scalars() []struct{ s0, s1, s2, s3 uint64 } { + return []struct{ s0, s1, s2, s3 uint64 }{ + {0, 0, 0, 0}, + {1, 0, 0, 0}, + {math.MaxUint64, 0, 0, 0}, + {0, 0, 0, 1}, + {0xDEADBEEFCAFEBABE, 0x123456789ABCDEF0, 0xFEEDFACEDEADBEEF, 0x0123456789ABCDEF}, + {math.MaxUint64, math.MaxUint64, math.MaxUint64, uint64(math.MaxInt64)}, + {0, 0, 0, 1 << 63}, + } +} + +func TestD256AddScalarVariants(t *testing.T) { + impls := []d256ScalarVUImpl{ + {"scalar", scalarD256AddScalarUnchecked}, + {"avx2", avx2D256AddScalarUnchecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d256ScalarVUImpl{"avx512", avx512D256AddScalarUnchecked}) + } + for _, n := range d256Sizes() { + v := makeRandD256(n, uint64(n)*37+1) + for si, s := range d256Scalars() { + want := make([]uint64, 4*n) + scalarD256AddScalarUnchecked(s.s0, s.s1, s.s2, s.s3, v, want) + for _, impl := range impls { + got := make([]uint64, 4*n) + impl.fn(s.s0, s.s1, s.s2, s.s3, v, got) + for i := 0; i < 4*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d scalar#%d slot=%d: got %x want %x", impl.name, n, si, i, got[i], want[i]) + } + } + } + } + } +} + +func TestD256SubScalarVariants(t *testing.T) { + impls := []d256VScalarUImpl{ + {"scalar", scalarD256SubScalarUnchecked}, + {"avx2", avx2D256SubScalarUnchecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d256VScalarUImpl{"avx512", avx512D256SubScalarUnchecked}) + } + for _, n := range d256Sizes() { + v := makeRandD256(n, uint64(n)*41+3) + for si, s := range d256Scalars() { + want := make([]uint64, 4*n) + scalarD256SubScalarUnchecked(v, s.s0, s.s1, s.s2, s.s3, want) + for _, impl := range impls { + got := make([]uint64, 4*n) + impl.fn(v, s.s0, s.s1, s.s2, s.s3, got) + for i := 0; i < 4*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d scalar#%d slot=%d: got %x want %x", impl.name, n, si, i, got[i], want[i]) + } + } + } + } + } +} + +func TestD256ScalarSubVariants(t *testing.T) { + impls := []d256ScalarVUImpl{ + {"scalar", scalarD256ScalarSubUnchecked}, + {"avx2", avx2D256ScalarSubUnchecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d256ScalarVUImpl{"avx512", avx512D256ScalarSubUnchecked}) + } + for _, n := range d256Sizes() { + v := makeRandD256(n, uint64(n)*43+5) + for si, s := range d256Scalars() { + want := make([]uint64, 4*n) + scalarD256ScalarSubUnchecked(s.s0, s.s1, s.s2, s.s3, v, want) + for _, impl := range impls { + got := make([]uint64, 4*n) + impl.fn(s.s0, s.s1, s.s2, s.s3, v, got) + for i := 0; i < 4*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d scalar#%d slot=%d: got %x want %x", impl.name, n, si, i, got[i], want[i]) + } + } + } + } + } +} + +func TestD256AddScalarCheckedVariants(t *testing.T) { + impls := []d256ScalarVCImpl{ + {"scalar", scalarD256AddScalarChecked}, + {"avx2", avx2D256AddScalarChecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d256ScalarVCImpl{"avx512", avx512D256AddScalarChecked}) + } + + smallScalars := []struct{ s0, s1, s2, s3 uint64 }{ + {0, 0, 0, 0}, + {1, 0, 0, 0}, + {0xDEADBEEFCAFEBABE, 0x123456789ABCDEF0, 
0xFEEDFACEDEADBEEF, 0x0123456789ABCDEF}, + } + for _, n := range d256Sizes() { + v := makeRand256SmallSigned(n, uint64(n)*47+7) + for si, s := range smallScalars { + want := make([]uint64, 4*n) + if got := scalarD256AddScalarChecked(s.s0, s.s1, s.s2, s.s3, v, want); got != -1 { + t.Fatalf("setup overflow at %d for n=%d scalar#%d", got, n, si) + } + for _, impl := range impls { + got := make([]uint64, 4*n) + if idx := impl.fn(s.s0, s.s1, s.s2, s.s3, v, got); idx != -1 { + t.Fatalf("%s n=%d scalar#%d: spurious overflow at %d", impl.name, n, si, idx) + } + for i := 0; i < 4*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d scalar#%d slot=%d: got %x want %x", impl.name, n, si, i, got[i], want[i]) + } + } + } + } + } + + // Inject overflow: scalar = 1, v[pos] = MaxInt256. + for _, n := range []int{4, 8, 9, 16, 17, 33, 35, 64} { + for _, pos := range []int{0, 1, 3, 4, 7, 8, n - 1} { + if pos < 0 || pos >= n { + continue + } + v := make([]uint64, 4*n) + j := pos << 2 + v[j] = math.MaxUint64 + v[j+1] = math.MaxUint64 + v[j+2] = math.MaxUint64 + v[j+3] = uint64(math.MaxInt64) + for _, impl := range impls { + got := make([]uint64, 4*n) + idx := impl.fn(1, 0, 0, 0, v, got) + if idx != pos { + t.Fatalf("%s n=%d pos=%d: got idx %d", impl.name, n, pos, idx) + } + } + } + } +} + +func TestD256SubScalarCheckedVariants(t *testing.T) { + impls := []d256VScalarCImpl{ + {"scalar", scalarD256SubScalarChecked}, + {"avx2", avx2D256SubScalarChecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d256VScalarCImpl{"avx512", avx512D256SubScalarChecked}) + } + + smallScalars := []struct{ s0, s1, s2, s3 uint64 }{ + {0, 0, 0, 0}, + {1, 0, 0, 0}, + {0xDEADBEEFCAFEBABE, 0x123456789ABCDEF0, 0xFEEDFACEDEADBEEF, 0x0123456789ABCDEF}, + } + for _, n := range d256Sizes() { + v := makeRand256SmallSigned(n, uint64(n)*53+11) + for si, s := range smallScalars { + want := make([]uint64, 4*n) + if got := scalarD256SubScalarChecked(v, s.s0, s.s1, s.s2, s.s3, want); got != -1 { + t.Fatalf("setup overflow at %d for n=%d scalar#%d", got, n, si) + } + for _, impl := range impls { + got := make([]uint64, 4*n) + if idx := impl.fn(v, s.s0, s.s1, s.s2, s.s3, got); idx != -1 { + t.Fatalf("%s n=%d scalar#%d: spurious overflow at %d", impl.name, n, si, idx) + } + for i := 0; i < 4*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d scalar#%d slot=%d: got %x want %x", impl.name, n, si, i, got[i], want[i]) + } + } + } + } + } + + // Inject: v[pos] = MinInt256, scalar = 1. 
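+	// v[pos] - 1 wraps from MinInt256 to MaxInt256: the sign bit of the top
+	// word flips from 1 to 0, which is exactly the case the (v^r)&(v^s)
+	// sign test in the checked kernels catches.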
+ for _, n := range []int{4, 8, 9, 16, 17, 33, 35, 64} { + for _, pos := range []int{0, 1, 3, 4, 7, 8, n - 1} { + if pos < 0 || pos >= n { + continue + } + v := make([]uint64, 4*n) + j := pos << 2 + v[j+3] = 1 << 63 + for _, impl := range impls { + got := make([]uint64, 4*n) + idx := impl.fn(v, 1, 0, 0, 0, got) + if idx != pos { + t.Fatalf("%s n=%d pos=%d: got idx %d", impl.name, n, pos, idx) + } + } + } + } +} + +func TestD256ScalarSubCheckedVariants(t *testing.T) { + impls := []d256ScalarVCImpl{ + {"scalar", scalarD256ScalarSubChecked}, + {"avx2", avx2D256ScalarSubChecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d256ScalarVCImpl{"avx512", avx512D256ScalarSubChecked}) + } + + smallScalars := []struct{ s0, s1, s2, s3 uint64 }{ + {0, 0, 0, 0}, + {1, 0, 0, 0}, + {0xDEADBEEFCAFEBABE, 0x123456789ABCDEF0, 0xFEEDFACEDEADBEEF, 0x0123456789ABCDEF}, + } + for _, n := range d256Sizes() { + v := makeRand256SmallSigned(n, uint64(n)*59+13) + for si, s := range smallScalars { + want := make([]uint64, 4*n) + if got := scalarD256ScalarSubChecked(s.s0, s.s1, s.s2, s.s3, v, want); got != -1 { + t.Fatalf("setup overflow at %d for n=%d scalar#%d", got, n, si) + } + for _, impl := range impls { + got := make([]uint64, 4*n) + if idx := impl.fn(s.s0, s.s1, s.s2, s.s3, v, got); idx != -1 { + t.Fatalf("%s n=%d scalar#%d: spurious overflow at %d", impl.name, n, si, idx) + } + for i := 0; i < 4*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d scalar#%d slot=%d: got %x want %x", impl.name, n, si, i, got[i], want[i]) + } + } + } + } + } + + // Inject: scalar = MinInt256, v[pos] = 1 ⇒ MinInt256-1 overflows. + for _, n := range []int{4, 8, 9, 16, 17, 33, 35, 64} { + for _, pos := range []int{0, 1, 3, 4, 7, 8, n - 1} { + if pos < 0 || pos >= n { + continue + } + v := make([]uint64, 4*n) + j := pos << 2 + v[j] = 1 + for _, impl := range impls { + got := make([]uint64, 4*n) + idx := impl.fn(0, 0, 0, 1<<63, v, got) + if idx != pos { + t.Fatalf("%s n=%d pos=%d: got idx %d", impl.name, n, pos, idx) + } + } + } + } +} + +func benchD256AddScalarU(b *testing.B, fn func(s0, s1, s2, s3 uint64, v, r []uint64), n int) { + v := makeRandD256(n, 1) + r := make([]uint64, 4*n) + b.SetBytes(int64(n) * 32 * 2) + b.ResetTimer() + for i := 0; i < b.N; i++ { + fn(0xDEADBEEFCAFEBABE, 0x123456789ABCDEF0, 0xFEEDFACEDEADBEEF, 0x0123456789ABCDEF, v, r) + } +} + +func benchD256SubScalarU(b *testing.B, fn func(v []uint64, s0, s1, s2, s3 uint64, r []uint64), n int) { + v := makeRandD256(n, 1) + r := make([]uint64, 4*n) + b.SetBytes(int64(n) * 32 * 2) + b.ResetTimer() + for i := 0; i < b.N; i++ { + fn(v, 0xDEADBEEFCAFEBABE, 0x123456789ABCDEF0, 0xFEEDFACEDEADBEEF, 0x0123456789ABCDEF, r) + } +} + +func BenchmarkD256AddScalarUnchecked(b *testing.B) { + for _, n := range d256BenchSizes { + b.Run("scalar/n="+strconv.Itoa(n), func(b *testing.B) { benchD256AddScalarU(b, scalarD256AddScalarUnchecked, n) }) + b.Run("avx2/n="+strconv.Itoa(n), func(b *testing.B) { benchD256AddScalarU(b, avx2D256AddScalarUnchecked, n) }) + if cpu.X86.HasAVX512 { + b.Run("avx512/n="+strconv.Itoa(n), func(b *testing.B) { benchD256AddScalarU(b, avx512D256AddScalarUnchecked, n) }) + } + } +} + +func BenchmarkD256SubScalarUnchecked(b *testing.B) { + for _, n := range d256BenchSizes { + b.Run("scalar/n="+strconv.Itoa(n), func(b *testing.B) { benchD256SubScalarU(b, scalarD256SubScalarUnchecked, n) }) + b.Run("avx2/n="+strconv.Itoa(n), func(b *testing.B) { benchD256SubScalarU(b, avx2D256SubScalarUnchecked, n) }) + if cpu.X86.HasAVX512 { + 
b.Run("avx512/n="+strconv.Itoa(n), func(b *testing.B) { benchD256SubScalarU(b, avx512D256SubScalarUnchecked, n) }) + } + } +} + +func BenchmarkD256ScalarSubUnchecked(b *testing.B) { + for _, n := range d256BenchSizes { + b.Run("scalar/n="+strconv.Itoa(n), func(b *testing.B) { benchD256AddScalarU(b, scalarD256ScalarSubUnchecked, n) }) + b.Run("avx2/n="+strconv.Itoa(n), func(b *testing.B) { benchD256AddScalarU(b, avx2D256ScalarSubUnchecked, n) }) + if cpu.X86.HasAVX512 { + b.Run("avx512/n="+strconv.Itoa(n), func(b *testing.B) { benchD256AddScalarU(b, avx512D256ScalarSubUnchecked, n) }) + } + } +} diff --git a/pkg/common/simdkernels/d256_negabs.go b/pkg/common/simdkernels/d256_negabs.go new file mode 100644 index 0000000000000..5e8141a39b513 --- /dev/null +++ b/pkg/common/simdkernels/d256_negabs.go @@ -0,0 +1,78 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package simdkernels + +import "math/bits" + +// Decimal256 element-wise negate / absolute value on slices of uint64 with +// the matrixone Decimal256 layout (4 uint64 per element, low to high). The +// src and dst slices both have length 4*N. dst may alias src. +// +// Both ops use 256-bit two's complement (~x + 1). Negate is unconditional; +// Abs is conditional on the sign bit of the topmost word. MinInt256 wraps to +// itself, matching the scalar SQL semantics in arith_decimal_fast.go. + +var ( + D256Negate func(src, dst []uint64) = scalarD256Negate + D256Abs func(src, dst []uint64) = scalarD256Abs +) + +func scalarD256Negate(src, dst []uint64) { + n := len(dst) / 4 + if len(src) < 4*n { + return + } + for i := 0; i < n; i++ { + j := i << 2 + w0 := ^src[j] + w1 := ^src[j+1] + w2 := ^src[j+2] + w3 := ^src[j+3] + var c uint64 + w0, c = bits.Add64(w0, 1, 0) + w1, c = bits.Add64(w1, 0, c) + w2, c = bits.Add64(w2, 0, c) + w3, _ = bits.Add64(w3, 0, c) + dst[j] = w0 + dst[j+1] = w1 + dst[j+2] = w2 + dst[j+3] = w3 + } +} + +func scalarD256Abs(src, dst []uint64) { + n := len(dst) / 4 + if len(src) < 4*n { + return + } + for i := 0; i < n; i++ { + j := i << 2 + w0, w1, w2, w3 := src[j], src[j+1], src[j+2], src[j+3] + sign := uint64(int64(w3) >> 63) // 0 or all-ones + w0 ^= sign + w1 ^= sign + w2 ^= sign + w3 ^= sign + var c uint64 + w0, c = bits.Add64(w0, sign&1, 0) + w1, c = bits.Add64(w1, 0, c) + w2, c = bits.Add64(w2, 0, c) + w3, _ = bits.Add64(w3, 0, c) + dst[j] = w0 + dst[j+1] = w1 + dst[j+2] = w2 + dst[j+3] = w3 + } +} diff --git a/pkg/common/simdkernels/d256_negabs_simd_amd64.go b/pkg/common/simdkernels/d256_negabs_simd_amd64.go new file mode 100644 index 0000000000000..4dde4374399c3 --- /dev/null +++ b/pkg/common/simdkernels/d256_negabs_simd_amd64.go @@ -0,0 +1,133 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build goexperiment.simd && amd64 + +package simdkernels + +import ( + "math/bits" + "simd/archsimd" + "unsafe" + + "golang.org/x/sys/cpu" +) + +// d256_negabs_simd_amd64.go: SIMD batch negate / abs for Decimal256. +// +// Same conditional-negate idiom as d128_negabs but with 4 stages of carry +// propagation across the 4 words of each Decimal256: +// +// mask = -m (-1 if negating this lane, 0 otherwise) +// wBar = w XOR mask +// stage 0: r0 = wBar0 - mask // mask supplies the +1 only when negating +// stage k (k>0): rk = wBark - cIn // cIn is 0 or -1 carry from stage k-1 +// carry out of any stage k: rk wraps unsigned ⇔ rk <_unsigned wBar_k +// top stage: drop cOut. +// +// Layout: each Decimal256 is 4 q-words = 32 B. Process 4 elements per AVX2 +// iter (= 16 q-words = 128 B), 8 per AVX-512 iter. Reuse transpose4x4 from +// d256_addsub_simd_amd64.go for AoS↔SoA conversion. + +func init() { + // AVX-512 D256 transpose would need a custom 8×4 ConcatPermute layout + // and an inverse for the writeback; deferred. AVX2 path runs on AVX-512 + // hosts as well (still 4 elements per iter). + if cpu.X86.HasAVX2 { + D256Negate = avx2D256Negate + D256Abs = avx2D256Abs + } +} + +// --------------------------------------------------------------------------- +// AVX2 (Int64x4) implementation +// --------------------------------------------------------------------------- + +//go:nosplit +func avx2D256NegStage(w, mask, cIn, sb archsimd.Int64x4) (r, cOut archsimd.Int64x4) { + wBar := w.Xor(mask) + r = wBar.Sub(cIn) + cOut = r.Xor(sb).Less(wBar.Xor(sb)).ToInt64x4() + return +} + +//go:nosplit +func avx2D256NegStageNoOut(w, mask, cIn archsimd.Int64x4) archsimd.Int64x4 { + return w.Xor(mask).Sub(cIn) +} + +func avx2D256NegabsCore(src, dst []uint64, abs bool) { + n := len(dst) / 4 + if n == 0 || len(src) < 4*n { + return + } + ps, pd := unsafe.Pointer(&src[0]), unsafe.Pointer(&dst[0]) + sb := archsimd.BroadcastInt64x4(signBit128) + allOnes := archsimd.BroadcastInt64x4(-1) + zero := archsimd.BroadcastInt64x4(0) + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 32 + v0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(ps, off))) + v1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(ps, off+32))) + v2 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(ps, off+64))) + v3 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(ps, off+96))) + + w0, w1, w2, w3 := transpose4x4(v0, v1, v2, v3) + var mask archsimd.Int64x4 + if abs { + mask = w3.Less(zero).ToInt64x4() + } else { + mask = allOnes + } + + r0, c0 := avx2D256NegStage(w0, mask, mask, sb) + r1, c1 := avx2D256NegStage(w1, mask, c0, sb) + r2, c2 := avx2D256NegStage(w2, mask, c1, sb) + r3 := avx2D256NegStageNoOut(w3, mask, c2) + + rv0, rv1, rv2, rv3 := transpose4x4(r0, r1, r2, r3) + rv0.Store((*[4]int64)(unsafe.Add(pd, off))) + rv1.Store((*[4]int64)(unsafe.Add(pd, off+32))) + rv2.Store((*[4]int64)(unsafe.Add(pd, off+64))) + rv3.Store((*[4]int64)(unsafe.Add(pd, off+96))) + } + for ; i < n; i++ { + j := i << 2 + w0, w1, w2, w3 := src[j], src[j+1], src[j+2], src[j+3] + var sign uint64 + if abs { + sign = uint64(int64(w3) >> 63) + } else { + sign = 
^uint64(0) + } + w0 ^= sign + w1 ^= sign + w2 ^= sign + w3 ^= sign + var c uint64 + w0, c = bits.Add64(w0, sign&1, 0) + w1, c = bits.Add64(w1, 0, c) + w2, c = bits.Add64(w2, 0, c) + w3, _ = bits.Add64(w3, 0, c) + dst[j] = w0 + dst[j+1] = w1 + dst[j+2] = w2 + dst[j+3] = w3 + } +} + +func avx2D256Negate(src, dst []uint64) { avx2D256NegabsCore(src, dst, false) } +func avx2D256Abs(src, dst []uint64) { avx2D256NegabsCore(src, dst, true) } diff --git a/pkg/common/simdkernels/d256_negabs_test.go b/pkg/common/simdkernels/d256_negabs_test.go new file mode 100644 index 0000000000000..454910156cd4e --- /dev/null +++ b/pkg/common/simdkernels/d256_negabs_test.go @@ -0,0 +1,161 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build goexperiment.simd && amd64 + +package simdkernels + +import ( + "math" + "strconv" + "testing" +) + +type d256UnaryImpl struct { + name string + fn func(src, dst []uint64) +} + +func d256NegateImpls() []d256UnaryImpl { + out := []d256UnaryImpl{{name: "scalar", fn: scalarD256Negate}} + if D256Negate != nil { + out = append(out, d256UnaryImpl{name: "dispatch", fn: D256Negate}) + } + return out +} + +func d256AbsImpls() []d256UnaryImpl { + out := []d256UnaryImpl{{name: "scalar", fn: scalarD256Abs}} + if D256Abs != nil { + out = append(out, d256UnaryImpl{name: "dispatch", fn: D256Abs}) + } + return out +} + +func d256NegAbsEdges() []uint64 { + // each row = 4 q-words (lo..hi) of one D256. 
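+	// MinInt256 must come back unchanged from both Negate and Abs (the +1
+	// ripples through all four complemented words and wraps), and the
+	// all-ones row is -1, whose negation is +1.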
+ rows := [][4]uint64{ + {0, 0, 0, 0}, + {1, 0, 0, 0}, + {0, 0, 0, 1}, + {0, 0, 0, 0x8000000000000000}, // MinInt256 + {math.MaxUint64, math.MaxUint64, math.MaxUint64, math.MaxUint64}, // -1 + {0, 0, 0, 0x7FFFFFFFFFFFFFFF}, // MaxInt256 + {1, 0, 0, 0x8000000000000000}, // negative, lo!=0 + {math.MaxUint64, 0, 0, 0x8000000000000000}, // tests carry over middle words + {0, math.MaxUint64, math.MaxUint64, 0x8000000000000000}, + {math.MaxUint64, math.MaxUint64, math.MaxUint64, 0x8000000000000000}, + } + out := make([]uint64, 0, 4*len(rows)) + for _, r := range rows { + out = append(out, r[0], r[1], r[2], r[3]) + } + return out +} + +func TestD256NegateCorrectness(t *testing.T) { + for _, n := range d256Sizes() { + src := makeRandD256(n, 0xD256^uint64(n)) + want := make([]uint64, 4*n) + scalarD256Negate(src, want) + for _, im := range d256NegateImpls() { + got := make([]uint64, 4*n) + im.fn(src, got) + for i := 0; i < 4*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d idx=%d: got 0x%x want 0x%x", im.name, n, i, got[i], want[i]) + } + } + } + } +} + +func TestD256NegateEdges(t *testing.T) { + src := d256NegAbsEdges() + n := len(src) / 4 + want := make([]uint64, 4*n) + scalarD256Negate(src, want) + for _, im := range d256NegateImpls() { + got := make([]uint64, 4*n) + im.fn(src, got) + for i := 0; i < 4*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s idx=%d: got 0x%x want 0x%x", im.name, i, got[i], want[i]) + } + } + } +} + +func TestD256AbsCorrectness(t *testing.T) { + for _, n := range d256Sizes() { + src := makeRandD256(n, 0xABA256^uint64(n)) + want := make([]uint64, 4*n) + scalarD256Abs(src, want) + for _, im := range d256AbsImpls() { + got := make([]uint64, 4*n) + im.fn(src, got) + for i := 0; i < 4*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d idx=%d: got 0x%x want 0x%x", im.name, n, i, got[i], want[i]) + } + } + } + } +} + +func TestD256AbsEdges(t *testing.T) { + src := d256NegAbsEdges() + n := len(src) / 4 + want := make([]uint64, 4*n) + scalarD256Abs(src, want) + for _, im := range d256AbsImpls() { + got := make([]uint64, 4*n) + im.fn(src, got) + for i := 0; i < 4*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s idx=%d: got 0x%x want 0x%x", im.name, i, got[i], want[i]) + } + } + } +} + +func benchmarkD256Unary(b *testing.B, fn func(src, dst []uint64), n int) { + src := makeRandD256(n, 0xBEEF^uint64(n)) + dst := make([]uint64, 4*n) + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + fn(src, dst) + } +} + +func BenchmarkD256Negate(b *testing.B) { + for _, n := range []int{16, 64, 256, 1024, 4096} { + for _, im := range d256NegateImpls() { + b.Run(im.name+"/n="+strconv.Itoa(n), func(b *testing.B) { + benchmarkD256Unary(b, im.fn, n) + }) + } + } +} + +func BenchmarkD256Abs(b *testing.B) { + for _, n := range []int{16, 64, 256, 1024, 4096} { + for _, im := range d256AbsImpls() { + b.Run(im.name+"/n="+strconv.Itoa(n), func(b *testing.B) { + benchmarkD256Unary(b, im.fn, n) + }) + } + } +} diff --git a/pkg/common/simdkernels/d64_addsub.go b/pkg/common/simdkernels/d64_addsub.go new file mode 100644 index 0000000000000..2a57d1fc58b26 --- /dev/null +++ b/pkg/common/simdkernels/d64_addsub.go @@ -0,0 +1,272 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package simdkernels + +import "math/bits" + +// Decimal64 add/sub on slices, treating elements as signed int64. +// +// Two variants per operator: +// *Unchecked — wraps on overflow, no detection. Use when caller can prove +// no overflow (e.g., max(p1,p2)+1 ≤ 18 for d_T(p,s) operands). +// *Checked — returns the index of the first element that overflows, or +// -1 if none did. Vector loop accumulates an overflow mask +// and pays for it whether or not overflow occurs. +// +// The exported variables are function-pointer dispatchers: their values are +// the scalar reference implementations by default and may be replaced at +// init time on amd64 when AVX2 is available (see d64_addsub_simd_amd64.go). + +var ( + D64AddUnchecked func(a, b, r []uint64) = scalarD64AddUnchecked + D64SubUnchecked func(a, b, r []uint64) = scalarD64SubUnchecked + D64AddChecked func(a, b, r []uint64) int = scalarD64AddChecked + D64SubChecked func(a, b, r []uint64) int = scalarD64SubChecked + + // Scalar-broadcast variants. Use when one operand is a constant / + // bound parameter / single-row literal — i.e. the SQL frontend's + // (column op constant) and (constant op column) shapes. + // + // For Add (commutative) one entry covers both sides. + // For Sub: + // D64SubScalar → v[i] - s (column - constant) + // D64ScalarSub → s - v[i] (constant - column) + D64AddScalarUnchecked func(s uint64, v, r []uint64) = scalarD64AddScalarUnchecked + D64AddScalarChecked func(s uint64, v, r []uint64) int = scalarD64AddScalarChecked + D64SubScalarUnchecked func(v []uint64, s uint64, r []uint64) = scalarD64SubScalarUnchecked + D64SubScalarChecked func(v []uint64, s uint64, r []uint64) int = scalarD64SubScalarChecked + D64ScalarSubUnchecked func(s uint64, v, r []uint64) = scalarD64ScalarSubUnchecked + D64ScalarSubChecked func(s uint64, v, r []uint64) int = scalarD64ScalarSubChecked + + // D64SumReduceToD128 sums a slice of Decimal64 values (signed) and + // returns the 128-bit signed total as (lo, hi). Wraps mod 2^128. + // Caller is responsible for ensuring the true sum fits in 128 bits + // (always true for any plausible Decimal64 batch since |val| < 10^18). 
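+	// Internally each element is sign-extended to 128 bits (high word =
+	// x>>63, i.e. 0 or all-ones) and folded in with a 64-bit carry chain;
+	// see scalarD64SumReduceToD128.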
+ D64SumReduceToD128 func(v []uint64) (lo, hi uint64) = scalarD64SumReduceToD128 +) + +func scalarD64AddUnchecked(a, b, r []uint64) { + n := len(r) + if len(a) < n || len(b) < n { + return + } + for i := 0; i < n; i++ { + r[i] = a[i] + b[i] + } +} + +func scalarD64SubUnchecked(a, b, r []uint64) { + n := len(r) + if len(a) < n || len(b) < n { + return + } + for i := 0; i < n; i++ { + r[i] = a[i] - b[i] + } +} + +func scalarD64AddChecked(a, b, r []uint64) int { + n := len(r) + if len(a) < n || len(b) < n { + return -1 + } + first := -1 + for i := 0; i < n; i++ { + ai, bi := int64(a[i]), int64(b[i]) + ri := ai + bi + r[i] = uint64(ri) + if first < 0 && (ai^ri)&^(ai^bi) < 0 { + first = i + } + } + return first +} + +func scalarD64SubChecked(a, b, r []uint64) int { + n := len(r) + if len(a) < n || len(b) < n { + return -1 + } + first := -1 + for i := 0; i < n; i++ { + ai, bi := int64(a[i]), int64(b[i]) + ri := ai - bi + r[i] = uint64(ri) + if first < 0 && (ai^ri)&(ai^bi) < 0 { + first = i + } + } + return first +} + +// d64FirstOverflow rescans [0, end) for the first overflow index. Used by +// SIMD checked variants when their accumulated mask reports overflow but +// the scalar tail did not see one (so the offender is in the vector range). +func d64FirstOverflow(a, b []uint64, end int, sub bool) int { + if sub { + for i := 0; i < end; i++ { + ai, bi := int64(a[i]), int64(b[i]) + ri := ai - bi + if (ai^ri)&(ai^bi) < 0 { + return i + } + } + return -1 + } + for i := 0; i < end; i++ { + ai, bi := int64(a[i]), int64(b[i]) + ri := ai + bi + if (ai^ri)&^(ai^bi) < 0 { + return i + } + } + return -1 +} + +// --------------------------------------------------------------------------- +// Scalar-broadcast reference implementations. +// --------------------------------------------------------------------------- + +func scalarD64AddScalarUnchecked(s uint64, v, r []uint64) { + n := len(r) + if len(v) < n { + return + } + for i := 0; i < n; i++ { + r[i] = s + v[i] + } +} + +func scalarD64SubScalarUnchecked(v []uint64, s uint64, r []uint64) { + n := len(r) + if len(v) < n { + return + } + for i := 0; i < n; i++ { + r[i] = v[i] - s + } +} + +func scalarD64ScalarSubUnchecked(s uint64, v, r []uint64) { + n := len(r) + if len(v) < n { + return + } + for i := 0; i < n; i++ { + r[i] = s - v[i] + } +} + +func scalarD64AddScalarChecked(s uint64, v, r []uint64) int { + n := len(r) + if len(v) < n { + return -1 + } + si := int64(s) + first := -1 + for i := 0; i < n; i++ { + vi := int64(v[i]) + ri := si + vi + r[i] = uint64(ri) + if first < 0 && (si^ri)&^(si^vi) < 0 { + first = i + } + } + return first +} + +func scalarD64SubScalarChecked(v []uint64, s uint64, r []uint64) int { + n := len(r) + if len(v) < n { + return -1 + } + si := int64(s) + first := -1 + for i := 0; i < n; i++ { + vi := int64(v[i]) + ri := vi - si + r[i] = uint64(ri) + if first < 0 && (vi^ri)&(vi^si) < 0 { + first = i + } + } + return first +} + +func scalarD64ScalarSubChecked(s uint64, v, r []uint64) int { + n := len(r) + if len(v) < n { + return -1 + } + si := int64(s) + first := -1 + for i := 0; i < n; i++ { + vi := int64(v[i]) + ri := si - vi + r[i] = uint64(ri) + if first < 0 && (si^ri)&(si^vi) < 0 { + first = i + } + } + return first +} + +// d64ScalarFirstOverflow rescans [0, end) for the first overflow index in +// scalar-broadcast operations. 
kind selects the operation: +// +// 0 = s + v[i] (AddScalar) +// 1 = v[i] - s (SubScalar) +// 2 = s - v[i] (ScalarSub) +func d64ScalarFirstOverflow(s uint64, v []uint64, end int, kind int) int { + si := int64(s) + switch kind { + case 0: + for i := 0; i < end; i++ { + vi := int64(v[i]) + ri := si + vi + if (si^ri)&^(si^vi) < 0 { + return i + } + } + case 1: + for i := 0; i < end; i++ { + vi := int64(v[i]) + ri := vi - si + if (vi^ri)&(vi^si) < 0 { + return i + } + } + case 2: + for i := 0; i < end; i++ { + vi := int64(v[i]) + ri := si - vi + if (si^ri)&(si^vi) < 0 { + return i + } + } + } + return -1 +} + +func scalarD64SumReduceToD128(v []uint64) (lo, hi uint64) { + for i := 0; i < len(v); i++ { + x := int64(v[i]) + sx := uint64(x >> 63) + var c uint64 + lo, c = bits.Add64(lo, uint64(x), 0) + hi, _ = bits.Add64(hi, sx, c) + } + return +} diff --git a/pkg/common/simdkernels/d64_addsub_simd_amd64.go b/pkg/common/simdkernels/d64_addsub_simd_amd64.go new file mode 100644 index 0000000000000..e9790b94a719e --- /dev/null +++ b/pkg/common/simdkernels/d64_addsub_simd_amd64.go @@ -0,0 +1,901 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build goexperiment.simd && amd64 + +package simdkernels + +import ( + "math/bits" + "simd/archsimd" + "unsafe" + + "golang.org/x/sys/cpu" +) + +// init swaps the dispatcher function pointers from the package-default +// scalar implementations to the best available SIMD variant on the host. +// +// Order of preference: AVX-512 (Int64x8, 8 lanes) > AVX2 (Int64x4, 4 lanes). +// archsimd.X86.AVX512() requires the bundled F+CD+BW+DQ+VL set, exposed via +// cpu.X86.HasAVX512. +func init() { + switch { + case cpu.X86.HasAVX512: + D64AddUnchecked = avx512D64AddUnchecked + D64SubUnchecked = avx512D64SubUnchecked + D64AddChecked = avx512D64AddChecked + D64SubChecked = avx512D64SubChecked + D64AddScalarUnchecked = avx512D64AddScalarUnchecked + D64AddScalarChecked = avx512D64AddScalarChecked + D64SubScalarUnchecked = avx512D64SubScalarUnchecked + D64SubScalarChecked = avx512D64SubScalarChecked + D64ScalarSubUnchecked = avx512D64ScalarSubUnchecked + D64ScalarSubChecked = avx512D64ScalarSubChecked + D64SumReduceToD128 = avx512D64SumReduceToD128 + case cpu.X86.HasAVX2: + D64AddUnchecked = avx2D64AddUnchecked + D64SubUnchecked = avx2D64SubUnchecked + D64AddChecked = avx2D64AddChecked + D64SubChecked = avx2D64SubChecked + D64AddScalarUnchecked = avx2D64AddScalarUnchecked + D64AddScalarChecked = avx2D64AddScalarChecked + D64SubScalarUnchecked = avx2D64SubScalarUnchecked + D64SubScalarChecked = avx2D64SubScalarChecked + D64ScalarSubUnchecked = avx2D64ScalarSubUnchecked + D64ScalarSubChecked = avx2D64ScalarSubChecked + D64SumReduceToD128 = avx2D64SumReduceToD128 + } +} + +// --------------------------------------------------------------------------- +// AVX2 path: 4-lane Int64x4 vectors. 
Main loop processes 16 elements
+// (4× Int64x4 = 128 B) per iteration; the unroll hides load latency and
+// keeps the VPADDQ ports on Zen 3 / Skylake busy. Cleanup is a 4-wide
+// loop, then a scalar tail.
+// ---------------------------------------------------------------------------
+
+func avx2D64AddUnchecked(a, b, r []uint64) {
+	n := len(r)
+	if n == 0 || len(a) < n || len(b) < n {
+		return
+	}
+	pa, pb, pr := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&r[0])
+
+	i := 0
+	for ; i+16 <= n; i += 16 {
+		off := uintptr(i) * 8
+		a0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off)))
+		a1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off+32)))
+		a2 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off+64)))
+		a3 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off+96)))
+		b0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off)))
+		b1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off+32)))
+		b2 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off+64)))
+		b3 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off+96)))
+		a0.Add(b0).Store((*[4]int64)(unsafe.Add(pr, off)))
+		a1.Add(b1).Store((*[4]int64)(unsafe.Add(pr, off+32)))
+		a2.Add(b2).Store((*[4]int64)(unsafe.Add(pr, off+64)))
+		a3.Add(b3).Store((*[4]int64)(unsafe.Add(pr, off+96)))
+	}
+	for ; i+4 <= n; i += 4 {
+		off := uintptr(i) * 8
+		av := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off)))
+		bv := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off)))
+		av.Add(bv).Store((*[4]int64)(unsafe.Add(pr, off)))
+	}
+	for ; i < n; i++ {
+		r[i] = a[i] + b[i]
+	}
+}
+
+func avx2D64SubUnchecked(a, b, r []uint64) {
+	n := len(r)
+	if n == 0 || len(a) < n || len(b) < n {
+		return
+	}
+	pa, pb, pr := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&r[0])
+
+	i := 0
+	for ; i+16 <= n; i += 16 {
+		off := uintptr(i) * 8
+		a0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off)))
+		a1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off+32)))
+		a2 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off+64)))
+		a3 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off+96)))
+		b0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off)))
+		b1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off+32)))
+		b2 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off+64)))
+		b3 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off+96)))
+		a0.Sub(b0).Store((*[4]int64)(unsafe.Add(pr, off)))
+		a1.Sub(b1).Store((*[4]int64)(unsafe.Add(pr, off+32)))
+		a2.Sub(b2).Store((*[4]int64)(unsafe.Add(pr, off+64)))
+		a3.Sub(b3).Store((*[4]int64)(unsafe.Add(pr, off+96)))
+	}
+	for ; i+4 <= n; i += 4 {
+		off := uintptr(i) * 8
+		av := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off)))
+		bv := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off)))
+		av.Sub(bv).Store((*[4]int64)(unsafe.Add(pr, off)))
+	}
+	for ; i < n; i++ {
+		r[i] = a[i] - b[i]
+	}
+}
+
+// avx2D64AddChecked accumulates per-lane signed-overflow predicates into a
+// vector OR; if any MSB is set at the end, it falls back to a scalar rescan
+// (d64FirstOverflow) to find the first offending index. For the common
+// "no overflow" case the cost is two XORs, one AndNot and one OR per 4 elems.
+//
+// Predicate: overflow iff sign(a)==sign(b) && sign(a)!=sign(r).
+//
+//	⇔ ((a^r) &^ (a^b)) < 0   (MSB set)
+func avx2D64AddChecked(a, b, r []uint64) int {
+	n := len(r)
+	if n == 0 || len(a) < n || len(b) < n {
+		return -1
+	}
+	pa, pb, pr := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&r[0])
+
+	var ofAcc archsimd.Int64x4
+
+	i := 0
+	for ; i+4 <= n; i += 4 {
+		off := uintptr(i) * 8
+		av := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off)))
+		bv := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off)))
+		rv := av.Add(bv)
+		ofAcc = ofAcc.Or(av.Xor(rv).AndNot(av.Xor(bv)))
+		rv.Store((*[4]int64)(unsafe.Add(pr, off)))
+	}
+	vecEnd := i
+
+	zero := archsimd.BroadcastInt64x4(0)
+	vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0
+
+	for ; i < n; i++ {
+		ai, bi := int64(a[i]), int64(b[i])
+		ri := ai + bi
+		r[i] = uint64(ri)
+		if (ai^ri)&^(ai^bi) < 0 {
+			if vecOverflow {
+				return d64FirstOverflow(a, b, vecEnd, false)
+			}
+			return i
+		}
+	}
+	if !vecOverflow {
+		return -1
+	}
+	return d64FirstOverflow(a, b, vecEnd, false)
+}
+
+// avx2D64SubChecked: overflow iff sign(a)!=sign(b) && sign(a)!=sign(r).
+//
+//	⇔ ((a^r) & (a^b)) < 0
+func avx2D64SubChecked(a, b, r []uint64) int {
+	n := len(r)
+	if n == 0 || len(a) < n || len(b) < n {
+		return -1
+	}
+	pa, pb, pr := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&r[0])
+
+	var ofAcc archsimd.Int64x4
+
+	i := 0
+	for ; i+4 <= n; i += 4 {
+		off := uintptr(i) * 8
+		av := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off)))
+		bv := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off)))
+		rv := av.Sub(bv)
+		ofAcc = ofAcc.Or(av.Xor(rv).And(av.Xor(bv)))
+		rv.Store((*[4]int64)(unsafe.Add(pr, off)))
+	}
+	vecEnd := i
+
+	zero := archsimd.BroadcastInt64x4(0)
+	vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0
+
+	for ; i < n; i++ {
+		ai, bi := int64(a[i]), int64(b[i])
+		ri := ai - bi
+		r[i] = uint64(ri)
+		if (ai^ri)&(ai^bi) < 0 {
+			if vecOverflow {
+				return d64FirstOverflow(a, b, vecEnd, true)
+			}
+			return i
+		}
+	}
+	if !vecOverflow {
+		return -1
+	}
+	return d64FirstOverflow(a, b, vecEnd, true)
+}
+
+// ---------------------------------------------------------------------------
+// AVX-512 path: 8-lane Int64x8 vectors (ZMM, 64 B). Main loop processes 32
+// elements per iteration (4× Int64x8 = 256 B) to keep the load and VPADDQ
+// pipes busy. Cleanup is 8-wide, then scalar tail.
+// --------------------------------------------------------------------------- + +func avx512D64AddUnchecked(a, b, r []uint64) { + n := len(r) + if n == 0 || len(a) < n || len(b) < n { + return + } + pa, pb, pr := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&r[0]) + + i := 0 + for ; i+32 <= n; i += 32 { + off := uintptr(i) * 8 + a0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off))) + a1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off+64))) + a2 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off+128))) + a3 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off+192))) + b0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off))) + b1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off+64))) + b2 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off+128))) + b3 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off+192))) + a0.Add(b0).Store((*[8]int64)(unsafe.Add(pr, off))) + a1.Add(b1).Store((*[8]int64)(unsafe.Add(pr, off+64))) + a2.Add(b2).Store((*[8]int64)(unsafe.Add(pr, off+128))) + a3.Add(b3).Store((*[8]int64)(unsafe.Add(pr, off+192))) + } + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 8 + av := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off))) + bv := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off))) + av.Add(bv).Store((*[8]int64)(unsafe.Add(pr, off))) + } + for ; i < n; i++ { + r[i] = a[i] + b[i] + } +} + +func avx512D64SubUnchecked(a, b, r []uint64) { + n := len(r) + if n == 0 || len(a) < n || len(b) < n { + return + } + pa, pb, pr := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&r[0]) + + i := 0 + for ; i+32 <= n; i += 32 { + off := uintptr(i) * 8 + a0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off))) + a1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off+64))) + a2 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off+128))) + a3 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off+192))) + b0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off))) + b1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off+64))) + b2 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off+128))) + b3 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off+192))) + a0.Sub(b0).Store((*[8]int64)(unsafe.Add(pr, off))) + a1.Sub(b1).Store((*[8]int64)(unsafe.Add(pr, off+64))) + a2.Sub(b2).Store((*[8]int64)(unsafe.Add(pr, off+128))) + a3.Sub(b3).Store((*[8]int64)(unsafe.Add(pr, off+192))) + } + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 8 + av := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off))) + bv := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off))) + av.Sub(bv).Store((*[8]int64)(unsafe.Add(pr, off))) + } + for ; i < n; i++ { + r[i] = a[i] - b[i] + } +} + +func avx512D64AddChecked(a, b, r []uint64) int { + n := len(r) + if n == 0 || len(a) < n || len(b) < n { + return -1 + } + pa, pb, pr := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&r[0]) + + var ofAcc archsimd.Int64x8 + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 8 + av := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off))) + bv := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off))) + rv := av.Add(bv) + // Int64x8.AndNot has inverted operand semantics (computes ~receiver & arg). + // We want (a^r) & ~(a^b), so use (a^b).AndNot(a^r). 
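+			// One-lane sanity check of that operand order: a=MaxInt64, b=1
+			// gives r=MinInt64 (overflow); a^b = 0x7FFF…FFFE and
+			// a^r = 0xFFFF…FFFF, so (a^b).AndNot(a^r) = ^(a^b) & (a^r)
+			// = 0x8000…0001, whose MSB is set, as required.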
+ ofAcc = ofAcc.Or(av.Xor(bv).AndNot(av.Xor(rv))) + rv.Store((*[8]int64)(unsafe.Add(pr, off))) + } + vecEnd := i + + zero := archsimd.BroadcastInt64x8(0) + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + for ; i < n; i++ { + ai, bi := int64(a[i]), int64(b[i]) + ri := ai + bi + r[i] = uint64(ri) + if (ai^ri)&^(ai^bi) < 0 { + if vecOverflow { + return d64FirstOverflow(a, b, vecEnd, false) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d64FirstOverflow(a, b, vecEnd, false) +} + +func avx512D64SubChecked(a, b, r []uint64) int { + n := len(r) + if n == 0 || len(a) < n || len(b) < n { + return -1 + } + pa, pb, pr := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&r[0]) + + var ofAcc archsimd.Int64x8 + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 8 + av := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off))) + bv := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off))) + rv := av.Sub(bv) + ofAcc = ofAcc.Or(av.Xor(rv).And(av.Xor(bv))) + rv.Store((*[8]int64)(unsafe.Add(pr, off))) + } + vecEnd := i + + zero := archsimd.BroadcastInt64x8(0) + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + for ; i < n; i++ { + ai, bi := int64(a[i]), int64(b[i]) + ri := ai - bi + r[i] = uint64(ri) + if (ai^ri)&(ai^bi) < 0 { + if vecOverflow { + return d64FirstOverflow(a, b, vecEnd, true) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d64FirstOverflow(a, b, vecEnd, true) +} + +// --------------------------------------------------------------------------- +// Scalar-broadcast SIMD variants. Pattern: broadcast the scalar into a +// vector once outside the main loop, then fuse loads of the column with +// vector add/sub against the broadcast register. +// +// AVX2: 4× Int64x4 unroll = 16 elems/iter, then 4-wide cleanup, scalar tail. +// AVX-512: 4× Int64x8 unroll = 32 elems/iter, then 8-wide cleanup, scalar tail. +// --------------------------------------------------------------------------- + +// AVX2 — Add scalar broadcast. 
+ +func avx2D64AddScalarUnchecked(s uint64, v, r []uint64) { + n := len(r) + if n == 0 || len(v) < n { + return + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sv := archsimd.BroadcastInt64x4(int64(s)) + + i := 0 + for ; i+16 <= n; i += 16 { + off := uintptr(i) * 8 + v0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+32))) + v2 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+64))) + v3 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+96))) + sv.Add(v0).Store((*[4]int64)(unsafe.Add(pr, off))) + sv.Add(v1).Store((*[4]int64)(unsafe.Add(pr, off+32))) + sv.Add(v2).Store((*[4]int64)(unsafe.Add(pr, off+64))) + sv.Add(v3).Store((*[4]int64)(unsafe.Add(pr, off+96))) + } + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 8 + vv := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off))) + sv.Add(vv).Store((*[4]int64)(unsafe.Add(pr, off))) + } + for ; i < n; i++ { + r[i] = s + v[i] + } +} + +func avx2D64SubScalarUnchecked(v []uint64, s uint64, r []uint64) { + n := len(r) + if n == 0 || len(v) < n { + return + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sv := archsimd.BroadcastInt64x4(int64(s)) + + i := 0 + for ; i+16 <= n; i += 16 { + off := uintptr(i) * 8 + v0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+32))) + v2 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+64))) + v3 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+96))) + v0.Sub(sv).Store((*[4]int64)(unsafe.Add(pr, off))) + v1.Sub(sv).Store((*[4]int64)(unsafe.Add(pr, off+32))) + v2.Sub(sv).Store((*[4]int64)(unsafe.Add(pr, off+64))) + v3.Sub(sv).Store((*[4]int64)(unsafe.Add(pr, off+96))) + } + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 8 + vv := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off))) + vv.Sub(sv).Store((*[4]int64)(unsafe.Add(pr, off))) + } + for ; i < n; i++ { + r[i] = v[i] - s + } +} + +func avx2D64ScalarSubUnchecked(s uint64, v, r []uint64) { + n := len(r) + if n == 0 || len(v) < n { + return + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sv := archsimd.BroadcastInt64x4(int64(s)) + + i := 0 + for ; i+16 <= n; i += 16 { + off := uintptr(i) * 8 + v0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+32))) + v2 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+64))) + v3 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+96))) + sv.Sub(v0).Store((*[4]int64)(unsafe.Add(pr, off))) + sv.Sub(v1).Store((*[4]int64)(unsafe.Add(pr, off+32))) + sv.Sub(v2).Store((*[4]int64)(unsafe.Add(pr, off+64))) + sv.Sub(v3).Store((*[4]int64)(unsafe.Add(pr, off+96))) + } + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 8 + vv := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off))) + sv.Sub(vv).Store((*[4]int64)(unsafe.Add(pr, off))) + } + for ; i < n; i++ { + r[i] = s - v[i] + } +} + +// Checked broadcast variants follow the same accumulate-mask pattern as the +// vector+vector ones. The overflow predicate is identical to the scalar +// reference because broadcasting the scalar gives the same per-lane test. 
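+// For instance: with s = MaxInt64 broadcast to every lane and v[i] = 1,
+// lane i computes r = MinInt64, and (s^r) &^ (s^v) = 0x8000…0001, whose
+// set MSB flags the overflow exactly as in scalarD64AddScalarChecked.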
+ +func avx2D64AddScalarChecked(s uint64, v, r []uint64) int { + n := len(r) + if n == 0 || len(v) < n { + return -1 + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sv := archsimd.BroadcastInt64x4(int64(s)) + + var ofAcc archsimd.Int64x4 + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 8 + vv := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off))) + rv := sv.Add(vv) + ofAcc = ofAcc.Or(sv.Xor(rv).AndNot(sv.Xor(vv))) + rv.Store((*[4]int64)(unsafe.Add(pr, off))) + } + vecEnd := i + + zero := archsimd.BroadcastInt64x4(0) + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + si := int64(s) + for ; i < n; i++ { + vi := int64(v[i]) + ri := si + vi + r[i] = uint64(ri) + if (si^ri)&^(si^vi) < 0 { + if vecOverflow { + return d64ScalarFirstOverflow(s, v, vecEnd, 0) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d64ScalarFirstOverflow(s, v, vecEnd, 0) +} + +func avx2D64SubScalarChecked(v []uint64, s uint64, r []uint64) int { + n := len(r) + if n == 0 || len(v) < n { + return -1 + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sv := archsimd.BroadcastInt64x4(int64(s)) + + var ofAcc archsimd.Int64x4 + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 8 + vv := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off))) + rv := vv.Sub(sv) + ofAcc = ofAcc.Or(vv.Xor(rv).And(vv.Xor(sv))) + rv.Store((*[4]int64)(unsafe.Add(pr, off))) + } + vecEnd := i + + zero := archsimd.BroadcastInt64x4(0) + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + si := int64(s) + for ; i < n; i++ { + vi := int64(v[i]) + ri := vi - si + r[i] = uint64(ri) + if (vi^ri)&(vi^si) < 0 { + if vecOverflow { + return d64ScalarFirstOverflow(s, v, vecEnd, 1) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d64ScalarFirstOverflow(s, v, vecEnd, 1) +} + +func avx2D64ScalarSubChecked(s uint64, v, r []uint64) int { + n := len(r) + if n == 0 || len(v) < n { + return -1 + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sv := archsimd.BroadcastInt64x4(int64(s)) + + var ofAcc archsimd.Int64x4 + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 8 + vv := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off))) + rv := sv.Sub(vv) + ofAcc = ofAcc.Or(sv.Xor(rv).And(sv.Xor(vv))) + rv.Store((*[4]int64)(unsafe.Add(pr, off))) + } + vecEnd := i + + zero := archsimd.BroadcastInt64x4(0) + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + si := int64(s) + for ; i < n; i++ { + vi := int64(v[i]) + ri := si - vi + r[i] = uint64(ri) + if (si^ri)&(si^vi) < 0 { + if vecOverflow { + return d64ScalarFirstOverflow(s, v, vecEnd, 2) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d64ScalarFirstOverflow(s, v, vecEnd, 2) +} + +// AVX-512 broadcast variants — Int64x8 (8 lanes), main loop 4× unrolled. 
+ +func avx512D64AddScalarUnchecked(s uint64, v, r []uint64) { + n := len(r) + if n == 0 || len(v) < n { + return + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sv := archsimd.BroadcastInt64x8(int64(s)) + + i := 0 + for ; i+32 <= n; i += 32 { + off := uintptr(i) * 8 + v0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+64))) + v2 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+128))) + v3 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+192))) + sv.Add(v0).Store((*[8]int64)(unsafe.Add(pr, off))) + sv.Add(v1).Store((*[8]int64)(unsafe.Add(pr, off+64))) + sv.Add(v2).Store((*[8]int64)(unsafe.Add(pr, off+128))) + sv.Add(v3).Store((*[8]int64)(unsafe.Add(pr, off+192))) + } + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 8 + vv := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + sv.Add(vv).Store((*[8]int64)(unsafe.Add(pr, off))) + } + for ; i < n; i++ { + r[i] = s + v[i] + } +} + +func avx512D64SubScalarUnchecked(v []uint64, s uint64, r []uint64) { + n := len(r) + if n == 0 || len(v) < n { + return + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sv := archsimd.BroadcastInt64x8(int64(s)) + + i := 0 + for ; i+32 <= n; i += 32 { + off := uintptr(i) * 8 + v0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+64))) + v2 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+128))) + v3 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+192))) + v0.Sub(sv).Store((*[8]int64)(unsafe.Add(pr, off))) + v1.Sub(sv).Store((*[8]int64)(unsafe.Add(pr, off+64))) + v2.Sub(sv).Store((*[8]int64)(unsafe.Add(pr, off+128))) + v3.Sub(sv).Store((*[8]int64)(unsafe.Add(pr, off+192))) + } + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 8 + vv := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + vv.Sub(sv).Store((*[8]int64)(unsafe.Add(pr, off))) + } + for ; i < n; i++ { + r[i] = v[i] - s + } +} + +func avx512D64ScalarSubUnchecked(s uint64, v, r []uint64) { + n := len(r) + if n == 0 || len(v) < n { + return + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sv := archsimd.BroadcastInt64x8(int64(s)) + + i := 0 + for ; i+32 <= n; i += 32 { + off := uintptr(i) * 8 + v0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+64))) + v2 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+128))) + v3 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+192))) + sv.Sub(v0).Store((*[8]int64)(unsafe.Add(pr, off))) + sv.Sub(v1).Store((*[8]int64)(unsafe.Add(pr, off+64))) + sv.Sub(v2).Store((*[8]int64)(unsafe.Add(pr, off+128))) + sv.Sub(v3).Store((*[8]int64)(unsafe.Add(pr, off+192))) + } + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 8 + vv := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + sv.Sub(vv).Store((*[8]int64)(unsafe.Add(pr, off))) + } + for ; i < n; i++ { + r[i] = s - v[i] + } +} + +func avx512D64AddScalarChecked(s uint64, v, r []uint64) int { + n := len(r) + if n == 0 || len(v) < n { + return -1 + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sv := archsimd.BroadcastInt64x8(int64(s)) + + var ofAcc archsimd.Int64x8 + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 8 + vv := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + rv := sv.Add(vv) + ofAcc = ofAcc.Or(sv.Xor(vv).AndNot(sv.Xor(rv))) + rv.Store((*[8]int64)(unsafe.Add(pr, off))) + } + vecEnd := i + + zero := archsimd.BroadcastInt64x8(0) + vecOverflow := 
uint64(ofAcc.Less(zero).ToBits()) != 0 + + si := int64(s) + for ; i < n; i++ { + vi := int64(v[i]) + ri := si + vi + r[i] = uint64(ri) + if (si^ri)&^(si^vi) < 0 { + if vecOverflow { + return d64ScalarFirstOverflow(s, v, vecEnd, 0) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d64ScalarFirstOverflow(s, v, vecEnd, 0) +} + +func avx512D64SubScalarChecked(v []uint64, s uint64, r []uint64) int { + n := len(r) + if n == 0 || len(v) < n { + return -1 + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sv := archsimd.BroadcastInt64x8(int64(s)) + + var ofAcc archsimd.Int64x8 + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 8 + vv := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + rv := vv.Sub(sv) + ofAcc = ofAcc.Or(vv.Xor(rv).And(vv.Xor(sv))) + rv.Store((*[8]int64)(unsafe.Add(pr, off))) + } + vecEnd := i + + zero := archsimd.BroadcastInt64x8(0) + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + si := int64(s) + for ; i < n; i++ { + vi := int64(v[i]) + ri := vi - si + r[i] = uint64(ri) + if (vi^ri)&(vi^si) < 0 { + if vecOverflow { + return d64ScalarFirstOverflow(s, v, vecEnd, 1) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d64ScalarFirstOverflow(s, v, vecEnd, 1) +} + +func avx512D64ScalarSubChecked(s uint64, v, r []uint64) int { + n := len(r) + if n == 0 || len(v) < n { + return -1 + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sv := archsimd.BroadcastInt64x8(int64(s)) + + var ofAcc archsimd.Int64x8 + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 8 + vv := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + rv := sv.Sub(vv) + ofAcc = ofAcc.Or(sv.Xor(rv).And(sv.Xor(vv))) + rv.Store((*[8]int64)(unsafe.Add(pr, off))) + } + vecEnd := i + + zero := archsimd.BroadcastInt64x8(0) + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + si := int64(s) + for ; i < n; i++ { + vi := int64(v[i]) + ri := si - vi + r[i] = uint64(ri) + if (si^ri)&(si^vi) < 0 { + if vecOverflow { + return d64ScalarFirstOverflow(s, v, vecEnd, 2) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d64ScalarFirstOverflow(s, v, vecEnd, 2) +} + +// avx2D64SumReduceToD128 sums signed Decimal64 values into a 128-bit total. +// Uses K=8 inner iterations per lane: per-lane partial sum stays bounded +// because |Decimal64| < 10^18 ≈ 2^59.79, so 8 such values fit in int63. +func avx2D64SumReduceToD128(v []uint64) (lo, hi uint64) { + n := len(v) + if n == 0 { + return + } + pv := unsafe.Pointer(&v[0]) + const K = 8 + const blk = 4 * K // 32 elements per outer iteration + i := 0 + for ; i+blk <= n; i += blk { + acc := archsimd.BroadcastInt64x4(0) + for k := 0; k < K; k++ { + off := uintptr(i+4*k) * 8 + x := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off))) + acc = acc.Add(x) + } + var buf [4]int64 + acc.Store(&buf) + for j := 0; j < 4; j++ { + x := buf[j] + sx := uint64(x >> 63) + var c uint64 + lo, c = bits.Add64(lo, uint64(x), 0) + hi, _ = bits.Add64(hi, sx, c) + } + } + for ; i < n; i++ { + x := int64(v[i]) + sx := uint64(x >> 63) + var c uint64 + lo, c = bits.Add64(lo, uint64(x), 0) + hi, _ = bits.Add64(hi, sx, c) + } + return +} + +// avx512D64SumReduceToD128: same chunked partial-sum strategy with 8-wide +// lanes. K=8 inner iters → 64 elements per outer iter. 
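+// The per-lane bound argument carries over unchanged: each of the 8 lanes
+// still accumulates at most K=8 values, and 8·10^18 ≈ 2^62.8 < 2^63, so a
+// lane partial sum cannot wrap int64 for in-range Decimal64 inputs.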
+func avx512D64SumReduceToD128(v []uint64) (lo, hi uint64) { + n := len(v) + if n == 0 { + return + } + pv := unsafe.Pointer(&v[0]) + const K = 8 + const blk = 8 * K + i := 0 + for ; i+blk <= n; i += blk { + acc := archsimd.BroadcastInt64x8(0) + for k := 0; k < K; k++ { + off := uintptr(i+8*k) * 8 + x := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + acc = acc.Add(x) + } + var buf [8]int64 + acc.Store(&buf) + for j := 0; j < 8; j++ { + x := buf[j] + sx := uint64(x >> 63) + var c uint64 + lo, c = bits.Add64(lo, uint64(x), 0) + hi, _ = bits.Add64(hi, sx, c) + } + } + for ; i < n; i++ { + x := int64(v[i]) + sx := uint64(x >> 63) + var c uint64 + lo, c = bits.Add64(lo, uint64(x), 0) + hi, _ = bits.Add64(hi, sx, c) + } + return +} diff --git a/pkg/common/simdkernels/d64_addsub_test.go b/pkg/common/simdkernels/d64_addsub_test.go new file mode 100644 index 0000000000000..2b31da90e7dc0 --- /dev/null +++ b/pkg/common/simdkernels/d64_addsub_test.go @@ -0,0 +1,688 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build goexperiment.simd && amd64 + +package simdkernels + +import ( + "math" + "math/rand/v2" + "strconv" + "testing" + + "golang.org/x/sys/cpu" +) + +// --------------------------------------------------------------------------- +// Correctness: compare every available impl (scalar, AVX2, AVX-512 if built) +// against the scalar reference on random + edge-case inputs at sizes that +// exercise the unrolled main loop, the 4-/8-wide cleanup, and the scalar +// tail (e.g. 35 = 32+3 or 4+...+3). +// --------------------------------------------------------------------------- + +type d64UncheckedImpl struct { + name string + fn func(a, b, r []uint64) +} + +type d64CheckedImpl struct { + name string + fn func(a, b, r []uint64) int +} + +func d64Sizes() []int { + return []int{0, 1, 3, 4, 7, 8, 15, 16, 17, 31, 32, 33, 35, 63, 64, 127, 128, 255, 256, 1023, 4096} +} + +func makeRandD64(n int, seed uint64) []uint64 { + rng := rand.New(rand.NewPCG(seed, seed^0x9E3779B97F4A7C15)) + out := make([]uint64, n) + for i := range out { + out[i] = rng.Uint64() + } + return out +} + +// edgeInputs returns a small input pair tuned to provoke add+sub overflows +// at a few specific positions (so tests cover both the "no overflow" fast +// path and the rescan slow path). 
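+// For Add the pairs overflow at indices 3, 4 and 5 (MaxInt64+1,
+// MaxInt64+MaxInt64, MinInt64+MinInt64), so the scalar reference reports
+// index 3 first; for Sub the first overflow is at index 6 (MinInt64 - 1).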
+func edgeInputs() (a, b []uint64) { + a = []uint64{ + 0, 1, math.MaxUint64, + uint64(math.MaxInt64), uint64(math.MaxInt64), + 1 << 63, 1 << 63, + 42, 100, 200, + } + b = []uint64{ + 0, math.MaxUint64, 1, + 1, uint64(math.MaxInt64), + uint64(math.MaxInt64) + 1, 1, + 58, 200, 100, + } + return +} + +func TestD64AddVariants(t *testing.T) { + impls := []d64UncheckedImpl{ + {"scalar", scalarD64AddUnchecked}, + {"avx2", avx2D64AddUnchecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d64UncheckedImpl{"avx512", avx512D64AddUnchecked}) + } + for _, n := range d64Sizes() { + a := makeRandD64(n, uint64(n)*7+1) + b := makeRandD64(n, uint64(n)*11+3) + want := make([]uint64, n) + scalarD64AddUnchecked(a, b, want) + for _, impl := range impls { + got := make([]uint64, n) + impl.fn(a, b, got) + for i := 0; i < n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d i=%d: got %x want %x", impl.name, n, i, got[i], want[i]) + } + } + } + } +} + +func TestD64SubVariants(t *testing.T) { + impls := []d64UncheckedImpl{ + {"scalar", scalarD64SubUnchecked}, + {"avx2", avx2D64SubUnchecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d64UncheckedImpl{"avx512", avx512D64SubUnchecked}) + } + for _, n := range d64Sizes() { + a := makeRandD64(n, uint64(n)*13+5) + b := makeRandD64(n, uint64(n)*17+9) + want := make([]uint64, n) + scalarD64SubUnchecked(a, b, want) + for _, impl := range impls { + got := make([]uint64, n) + impl.fn(a, b, got) + for i := 0; i < n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d i=%d: got %x want %x", impl.name, n, i, got[i], want[i]) + } + } + } + } +} + +func TestD64AddCheckedVariants(t *testing.T) { + impls := []d64CheckedImpl{ + {"scalar", scalarD64AddChecked}, + {"avx2", avx2D64AddChecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d64CheckedImpl{"avx512", avx512D64AddChecked}) + } + + // 1) Edge inputs: contains a known overflow at index 5 (MinInt64 + (MaxInt64+1)). + ea, eb := edgeInputs() + wantR := make([]uint64, len(ea)) + wantIdx := scalarD64AddChecked(ea, eb, wantR) + for _, impl := range impls { + gotR := make([]uint64, len(ea)) + gotIdx := impl.fn(ea, eb, gotR) + // First-overflow position must match scalar reference. + if gotIdx != wantIdx { + t.Fatalf("%s edge: idx got %d want %d", impl.name, gotIdx, wantIdx) + } + // Values up to (and including) the first overflow must match. + end := len(ea) + if wantIdx >= 0 { + end = wantIdx + 1 + } + for i := 0; i < end; i++ { + if gotR[i] != wantR[i] { + t.Fatalf("%s edge i=%d: got %x want %x", impl.name, i, gotR[i], wantR[i]) + } + } + } + + // 2) No-overflow random inputs: clear sign bits to avoid spurious overflows. + for _, n := range d64Sizes() { + a := makeRandD64(n, uint64(n)*19+7) + b := makeRandD64(n, uint64(n)*23+11) + for i := range a { + a[i] &= 0x3FFFFFFFFFFFFFFF + b[i] &= 0x3FFFFFFFFFFFFFFF + } + want := make([]uint64, n) + if got := scalarD64AddChecked(a, b, want); got != -1 { + t.Fatalf("setup: scalar reported overflow at %d for masked input n=%d", got, n) + } + for _, impl := range impls { + got := make([]uint64, n) + if idx := impl.fn(a, b, got); idx != -1 { + t.Fatalf("%s n=%d: spurious overflow at %d", impl.name, n, idx) + } + for i := 0; i < n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d i=%d: got %x want %x", impl.name, n, i, got[i], want[i]) + } + } + } + } + + // 3) Single overflow injected at varying positions (covers both vector + // body and scalar tail of every impl). 
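+	// The small positions land in the 4-/8-wide vector body (when pos < n),
+	// while pos = n-1 for n ∈ {17, 33, 35} falls in the scalar tail of both
+	// the AVX2 and AVX-512 variants.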
+ for _, n := range []int{8, 16, 17, 33, 35, 64} { + for _, pos := range []int{0, 1, 4, 7, 8, n - 1} { + if pos < 0 || pos >= n { + continue + } + a := make([]uint64, n) + b := make([]uint64, n) + a[pos] = uint64(math.MaxInt64) + b[pos] = 1 + for _, impl := range impls { + got := make([]uint64, n) + idx := impl.fn(a, b, got) + if idx != pos { + t.Fatalf("%s n=%d inject pos=%d: got idx %d", impl.name, n, pos, idx) + } + } + } + } +} + +func TestD64SubCheckedVariants(t *testing.T) { + impls := []d64CheckedImpl{ + {"scalar", scalarD64SubChecked}, + {"avx2", avx2D64SubChecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d64CheckedImpl{"avx512", avx512D64SubChecked}) + } + + // No-overflow random inputs: mask to 62 bits so a-b stays in range. + for _, n := range d64Sizes() { + a := makeRandD64(n, uint64(n)*29+13) + b := makeRandD64(n, uint64(n)*31+17) + for i := range a { + a[i] &= 0x3FFFFFFFFFFFFFFF + b[i] &= 0x3FFFFFFFFFFFFFFF + } + want := make([]uint64, n) + if got := scalarD64SubChecked(a, b, want); got != -1 { + t.Fatalf("setup: scalar overflow at %d for n=%d", got, n) + } + for _, impl := range impls { + got := make([]uint64, n) + if idx := impl.fn(a, b, got); idx != -1 { + t.Fatalf("%s n=%d: spurious overflow at %d", impl.name, n, idx) + } + for i := 0; i < n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d i=%d: got %x want %x", impl.name, n, i, got[i], want[i]) + } + } + } + } + + // Inject MinInt64 - 1 overflow at varying positions. + for _, n := range []int{8, 16, 17, 33, 35, 64} { + for _, pos := range []int{0, 1, 4, 7, 8, n - 1} { + if pos < 0 || pos >= n { + continue + } + a := make([]uint64, n) + b := make([]uint64, n) + a[pos] = 1 << 63 + b[pos] = 1 + for _, impl := range impls { + got := make([]uint64, n) + idx := impl.fn(a, b, got) + if idx != pos { + t.Fatalf("%s n=%d inject pos=%d: got idx %d", impl.name, n, pos, idx) + } + } + } + } +} + +// --------------------------------------------------------------------------- +// Benchmarks: per-impl × per-size. Standard go test -bench output. 
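+// The SIMD variants only compile with the experiment enabled; a typical
+// invocation (test tags assumed to match optools/run_ut.sh) is:
+//
+//	GOEXPERIMENT=simd go test -tags matrixone_test -run '^$' \
+//	    -bench 'BenchmarkD64(Add|Sub)' ./pkg/common/simdkernels/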
+// --------------------------------------------------------------------------- + +var d64BenchSizes = []int{64, 256, 1024, 4096, 16384} + +func benchD64Unchecked(b *testing.B, fn func(a, bb, r []uint64), n int) { + a := makeRandD64(n, 1) + bb := makeRandD64(n, 2) + r := make([]uint64, n) + b.SetBytes(int64(n) * 8 * 3) + b.ResetTimer() + for i := 0; i < b.N; i++ { + fn(a, bb, r) + } +} + +func benchD64Checked(b *testing.B, fn func(a, bb, r []uint64) int, n int) { + a := makeRandD64(n, 1) + bb := makeRandD64(n, 2) + for i := range a { + a[i] &= 0x3FFFFFFFFFFFFFFF + bb[i] &= 0x3FFFFFFFFFFFFFFF + } + r := make([]uint64, n) + b.SetBytes(int64(n) * 8 * 3) + b.ResetTimer() + for i := 0; i < b.N; i++ { + _ = fn(a, bb, r) + } +} + +func BenchmarkD64AddUnchecked(b *testing.B) { + for _, n := range d64BenchSizes { + b.Run("scalar/n="+strconv.Itoa(n), func(b *testing.B) { benchD64Unchecked(b, scalarD64AddUnchecked, n) }) + b.Run("avx2/n="+strconv.Itoa(n), func(b *testing.B) { benchD64Unchecked(b, avx2D64AddUnchecked, n) }) + if cpu.X86.HasAVX512 { + b.Run("avx512/n="+strconv.Itoa(n), func(b *testing.B) { benchD64Unchecked(b, avx512D64AddUnchecked, n) }) + } + } +} + +func BenchmarkD64SubUnchecked(b *testing.B) { + for _, n := range d64BenchSizes { + b.Run("scalar/n="+strconv.Itoa(n), func(b *testing.B) { benchD64Unchecked(b, scalarD64SubUnchecked, n) }) + b.Run("avx2/n="+strconv.Itoa(n), func(b *testing.B) { benchD64Unchecked(b, avx2D64SubUnchecked, n) }) + if cpu.X86.HasAVX512 { + b.Run("avx512/n="+strconv.Itoa(n), func(b *testing.B) { benchD64Unchecked(b, avx512D64SubUnchecked, n) }) + } + } +} + +func BenchmarkD64AddChecked(b *testing.B) { + for _, n := range d64BenchSizes { + b.Run("scalar/n="+strconv.Itoa(n), func(b *testing.B) { benchD64Checked(b, scalarD64AddChecked, n) }) + b.Run("avx2/n="+strconv.Itoa(n), func(b *testing.B) { benchD64Checked(b, avx2D64AddChecked, n) }) + if cpu.X86.HasAVX512 { + b.Run("avx512/n="+strconv.Itoa(n), func(b *testing.B) { benchD64Checked(b, avx512D64AddChecked, n) }) + } + } +} + +func BenchmarkD64SubChecked(b *testing.B) { + for _, n := range d64BenchSizes { + b.Run("scalar/n="+strconv.Itoa(n), func(b *testing.B) { benchD64Checked(b, scalarD64SubChecked, n) }) + b.Run("avx2/n="+strconv.Itoa(n), func(b *testing.B) { benchD64Checked(b, avx2D64SubChecked, n) }) + if cpu.X86.HasAVX512 { + b.Run("avx512/n="+strconv.Itoa(n), func(b *testing.B) { benchD64Checked(b, avx512D64SubChecked, n) }) + } + } +} + +// --------------------------------------------------------------------------- +// Scalar-broadcast tests: validate AVX2/AVX-512 broadcast variants against +// the scalar reference for both Unchecked (results) and Checked (results + +// first-overflow index) shapes. 
+// --------------------------------------------------------------------------- + +type d64ScalarUncheckedImpl struct { + name string + fn func(s uint64, v, r []uint64) +} + +type d64ScalarUncheckedImplR struct { + name string + fn func(v []uint64, s uint64, r []uint64) +} + +type d64ScalarCheckedImpl struct { + name string + fn func(s uint64, v, r []uint64) int +} + +type d64ScalarCheckedImplR struct { + name string + fn func(v []uint64, s uint64, r []uint64) int +} + +func d64ScalarSamples() []uint64 { + return []uint64{ + 0, 1, 42, + uint64(math.MaxInt64), uint64(math.MaxInt64) - 1, + 1 << 63, // MinInt64 + (1 << 63) + 1, + math.MaxUint64, + } +} + +func TestD64AddScalarVariants(t *testing.T) { + uncheckedImpls := []d64ScalarUncheckedImpl{ + {"scalar", scalarD64AddScalarUnchecked}, + {"avx2", avx2D64AddScalarUnchecked}, + } + if cpu.X86.HasAVX512 { + uncheckedImpls = append(uncheckedImpls, d64ScalarUncheckedImpl{"avx512", avx512D64AddScalarUnchecked}) + } + for _, n := range d64Sizes() { + v := makeRandD64(n, uint64(n)*31+13) + for _, s := range d64ScalarSamples() { + want := make([]uint64, n) + scalarD64AddScalarUnchecked(s, v, want) + for _, impl := range uncheckedImpls { + got := make([]uint64, n) + impl.fn(s, v, got) + for i := 0; i < n; i++ { + if got[i] != want[i] { + t.Fatalf("AddScalarUnchecked %s n=%d s=%x i=%d: got %x want %x", + impl.name, n, s, i, got[i], want[i]) + } + } + } + } + } +} + +func TestD64SubScalarVariants(t *testing.T) { + uncheckedImpls := []d64ScalarUncheckedImplR{ + {"scalar", scalarD64SubScalarUnchecked}, + {"avx2", avx2D64SubScalarUnchecked}, + } + if cpu.X86.HasAVX512 { + uncheckedImpls = append(uncheckedImpls, d64ScalarUncheckedImplR{"avx512", avx512D64SubScalarUnchecked}) + } + for _, n := range d64Sizes() { + v := makeRandD64(n, uint64(n)*37+17) + for _, s := range d64ScalarSamples() { + want := make([]uint64, n) + scalarD64SubScalarUnchecked(v, s, want) + for _, impl := range uncheckedImpls { + got := make([]uint64, n) + impl.fn(v, s, got) + for i := 0; i < n; i++ { + if got[i] != want[i] { + t.Fatalf("SubScalarUnchecked %s n=%d s=%x i=%d: got %x want %x", + impl.name, n, s, i, got[i], want[i]) + } + } + } + } + } +} + +func TestD64ScalarSubVariants(t *testing.T) { + uncheckedImpls := []d64ScalarUncheckedImpl{ + {"scalar", scalarD64ScalarSubUnchecked}, + {"avx2", avx2D64ScalarSubUnchecked}, + } + if cpu.X86.HasAVX512 { + uncheckedImpls = append(uncheckedImpls, d64ScalarUncheckedImpl{"avx512", avx512D64ScalarSubUnchecked}) + } + for _, n := range d64Sizes() { + v := makeRandD64(n, uint64(n)*41+19) + for _, s := range d64ScalarSamples() { + want := make([]uint64, n) + scalarD64ScalarSubUnchecked(s, v, want) + for _, impl := range uncheckedImpls { + got := make([]uint64, n) + impl.fn(s, v, got) + for i := 0; i < n; i++ { + if got[i] != want[i] { + t.Fatalf("ScalarSubUnchecked %s n=%d s=%x i=%d: got %x want %x", + impl.name, n, s, i, got[i], want[i]) + } + } + } + } + } +} + +func TestD64AddScalarCheckedVariants(t *testing.T) { + impls := []d64ScalarCheckedImpl{ + {"scalar", scalarD64AddScalarChecked}, + {"avx2", avx2D64AddScalarChecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d64ScalarCheckedImpl{"avx512", avx512D64AddScalarChecked}) + } + + // 1) No-overflow random (mask sign bit) inputs. 
+ for _, n := range d64Sizes() { + v := makeRandD64(n, uint64(n)*43+23) + for i := range v { + v[i] &= 0x3FFFFFFFFFFFFFFF + } + for _, s := range []uint64{0, 1, 42, 0x3FFFFFFFFFFFFFFF} { + want := make([]uint64, n) + if got := scalarD64AddScalarChecked(s, v, want); got != -1 { + t.Fatalf("setup: scalar reported overflow at %d for masked input n=%d s=%x", got, n, s) + } + for _, impl := range impls { + got := make([]uint64, n) + if idx := impl.fn(s, v, got); idx != -1 { + t.Fatalf("AddScalarChecked %s n=%d s=%x: spurious overflow at %d", impl.name, n, s, idx) + } + for i := 0; i < n; i++ { + if got[i] != want[i] { + t.Fatalf("AddScalarChecked %s n=%d s=%x i=%d: got %x want %x", + impl.name, n, s, i, got[i], want[i]) + } + } + } + } + } + + // 2) Inject a single overflow at varying positions, with s = MaxInt64. + s := uint64(math.MaxInt64) + for _, n := range []int{8, 16, 17, 33, 35, 64} { + for _, pos := range []int{0, 1, 4, 7, 8, n - 1} { + if pos < 0 || pos >= n { + continue + } + v := make([]uint64, n) + v[pos] = 1 // s + 1 = MaxInt64+1 → overflow + for _, impl := range impls { + got := make([]uint64, n) + idx := impl.fn(s, v, got) + if idx != pos { + t.Fatalf("AddScalarChecked %s n=%d inject pos=%d: got idx %d", impl.name, n, pos, idx) + } + } + } + } +} + +func TestD64SubScalarCheckedVariants(t *testing.T) { + impls := []d64ScalarCheckedImplR{ + {"scalar", scalarD64SubScalarChecked}, + {"avx2", avx2D64SubScalarChecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d64ScalarCheckedImplR{"avx512", avx512D64SubScalarChecked}) + } + + // 1) No-overflow random (mask sign bit) inputs. + for _, n := range d64Sizes() { + v := makeRandD64(n, uint64(n)*47+29) + for i := range v { + v[i] &= 0x3FFFFFFFFFFFFFFF + } + for _, s := range []uint64{0, 1, 42, 0x3FFFFFFFFFFFFFFF} { + want := make([]uint64, n) + if got := scalarD64SubScalarChecked(v, s, want); got != -1 { + t.Fatalf("setup: scalar reported overflow at %d for masked input n=%d s=%x", got, n, s) + } + for _, impl := range impls { + got := make([]uint64, n) + if idx := impl.fn(v, s, got); idx != -1 { + t.Fatalf("SubScalarChecked %s n=%d s=%x: spurious overflow at %d", impl.name, n, s, idx) + } + for i := 0; i < n; i++ { + if got[i] != want[i] { + t.Fatalf("SubScalarChecked %s n=%d s=%x i=%d: got %x want %x", + impl.name, n, s, i, got[i], want[i]) + } + } + } + } + } + + // 2) Inject overflow: v[pos] = MinInt64, s = 1 → MinInt64 - 1 → overflow. + s := uint64(1) + for _, n := range []int{8, 16, 17, 33, 35, 64} { + for _, pos := range []int{0, 1, 4, 7, 8, n - 1} { + if pos < 0 || pos >= n { + continue + } + v := make([]uint64, n) + v[pos] = 1 << 63 // MinInt64 + for _, impl := range impls { + got := make([]uint64, n) + idx := impl.fn(v, s, got) + if idx != pos { + t.Fatalf("SubScalarChecked %s n=%d inject pos=%d: got idx %d", impl.name, n, pos, idx) + } + } + } + } +} + +func TestD64ScalarSubCheckedVariants(t *testing.T) { + impls := []d64ScalarCheckedImpl{ + {"scalar", scalarD64ScalarSubChecked}, + {"avx2", avx2D64ScalarSubChecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d64ScalarCheckedImpl{"avx512", avx512D64ScalarSubChecked}) + } + + // 1) No-overflow random (mask sign bit) inputs. 
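+	// (The mask in fact clears the top two bits, so both operands lie in
+	// [0, 2^62) and s - v[i] cannot leave the int64 range.)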
+ for _, n := range d64Sizes() { + v := makeRandD64(n, uint64(n)*53+31) + for i := range v { + v[i] &= 0x3FFFFFFFFFFFFFFF + } + for _, s := range []uint64{0, 1, 42, 0x3FFFFFFFFFFFFFFF} { + want := make([]uint64, n) + if got := scalarD64ScalarSubChecked(s, v, want); got != -1 { + t.Fatalf("setup: scalar reported overflow at %d for masked input n=%d s=%x", got, n, s) + } + for _, impl := range impls { + got := make([]uint64, n) + if idx := impl.fn(s, v, got); idx != -1 { + t.Fatalf("ScalarSubChecked %s n=%d s=%x: spurious overflow at %d", impl.name, n, s, idx) + } + for i := 0; i < n; i++ { + if got[i] != want[i] { + t.Fatalf("ScalarSubChecked %s n=%d s=%x i=%d: got %x want %x", + impl.name, n, s, i, got[i], want[i]) + } + } + } + } + } + + // 2) Inject overflow: s = MinInt64, v[pos] = 1 → MinInt64 - 1 → overflow. + s := uint64(1 << 63) // MinInt64 + for _, n := range []int{8, 16, 17, 33, 35, 64} { + for _, pos := range []int{0, 1, 4, 7, 8, n - 1} { + if pos < 0 || pos >= n { + continue + } + v := make([]uint64, n) + v[pos] = 1 + for _, impl := range impls { + got := make([]uint64, n) + idx := impl.fn(s, v, got) + if idx != pos { + t.Fatalf("ScalarSubChecked %s n=%d inject pos=%d: got idx %d", impl.name, n, pos, idx) + } + } + } + } +} + +// makeRandD64Bounded returns Decimal64-range values: |x| < 10^18. +func makeRandD64Bounded(n int, seed uint64) []uint64 { + rng := rand.New(rand.NewPCG(seed, seed^0x9E3779B97F4A7C15)) + const maxAbs uint64 = 1_000_000_000_000_000_000 + out := make([]uint64, n) + for i := range out { + x := int64(rng.Uint64N(maxAbs)) + if rng.IntN(2) == 1 { + x = -x + } + out[i] = uint64(x) + } + return out +} + +func TestD64SumReduceToD128Variants(t *testing.T) { + impls := []struct { + name string + fn func([]uint64) (uint64, uint64) + }{ + {"scalar", scalarD64SumReduceToD128}, + } + if cpu.X86.HasAVX2 { + impls = append(impls, struct { + name string + fn func([]uint64) (uint64, uint64) + }{"avx2", avx2D64SumReduceToD128}) + } + if cpu.X86.HasAVX512 { + impls = append(impls, struct { + name string + fn func([]uint64) (uint64, uint64) + }{"avx512", avx512D64SumReduceToD128}) + } + + for _, n := range d64Sizes() { + v := makeRandD64Bounded(n, uint64(n)*23+1) + refLo, refHi := scalarD64SumReduceToD128(v) + for _, im := range impls { + lo, hi := im.fn(v) + if lo != refLo || hi != refHi { + t.Fatalf("%s n=%d: got (%x,%x) want (%x,%x)", im.name, n, lo, hi, refLo, refHi) + } + } + } +} + +func BenchmarkD64SumReduceToD128(b *testing.B) { + for _, n := range d64BenchSizes { + v := makeRandD64Bounded(n, 1) + b.Run("scalar/n="+strconv.Itoa(n), func(b *testing.B) { + b.SetBytes(int64(n) * 8) + for i := 0; i < b.N; i++ { + _, _ = scalarD64SumReduceToD128(v) + } + }) + if cpu.X86.HasAVX2 { + b.Run("avx2/n="+strconv.Itoa(n), func(b *testing.B) { + b.SetBytes(int64(n) * 8) + for i := 0; i < b.N; i++ { + _, _ = avx2D64SumReduceToD128(v) + } + }) + } + if cpu.X86.HasAVX512 { + b.Run("avx512/n="+strconv.Itoa(n), func(b *testing.B) { + b.SetBytes(int64(n) * 8) + for i := 0; i < b.N; i++ { + _, _ = avx512D64SumReduceToD128(v) + } + }) + } + } +} diff --git a/pkg/common/simdkernels/d64_compare.go b/pkg/common/simdkernels/d64_compare.go new file mode 100644 index 0000000000000..39622f1d8bd5c --- /dev/null +++ b/pkg/common/simdkernels/d64_compare.go @@ -0,0 +1,47 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package simdkernels + +// Decimal64 element-wise comparisons producing []bool output (1 byte per +// element; 0 = false, 1 = true). +// +// Inputs a, b are uint64-typed slices interpreted as int64 for signed +// compares (Decimal64 is two's-complement int64 with sign-bit-checked +// negativity, so signed integer ordering matches Decimal64.Less ordering). + +var ( + D64Eq func(a, b []uint64, out []bool) = scalarD64Eq + D64Lt func(a, b []uint64, out []bool) = scalarD64Lt +) + +func scalarD64Eq(a, b []uint64, out []bool) { + n := len(a) + if len(b) < n || len(out) < n { + return + } + for i := 0; i < n; i++ { + out[i] = a[i] == b[i] + } +} + +func scalarD64Lt(a, b []uint64, out []bool) { + n := len(a) + if len(b) < n || len(out) < n { + return + } + for i := 0; i < n; i++ { + out[i] = int64(a[i]) < int64(b[i]) + } +} diff --git a/pkg/common/simdkernels/d64_compare_simd_amd64.go b/pkg/common/simdkernels/d64_compare_simd_amd64.go new file mode 100644 index 0000000000000..5b47d60a4d36b --- /dev/null +++ b/pkg/common/simdkernels/d64_compare_simd_amd64.go @@ -0,0 +1,96 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build goexperiment.simd && amd64 + +package simdkernels + +import ( + "simd/archsimd" + "unsafe" + + "golang.org/x/sys/cpu" +) + +// AVX2 was tried (4-elem and 8-elem interleaved batches) but consistently +// loses ~13-24% to scalar on Zen 3 because: +// - scalar `cmp/sete/store` runs at ~1.2 cyc/elem (near retirement-width +// limit on a 4-wide OoO core); +// - AVX2 has no native qword->byte gather (VPMOVQB is AVX-512 only), +// so the post-compare bool packing on AVX2 (vector store -> 4 byte +// loads -> shift/OR -> store) adds a serial dependency chain that +// the OoO scheduler cannot parallelize away. +// Therefore AVX2 is intentionally NOT registered. Scalar stays as default +// on AVX2-only hardware. AVX-512 uses VPMOVQB (TruncateToInt8) which +// performs the byte gather natively in one instruction; this path is +// expected to win on Intel SPR / Zen 5 but cannot be benched on Zen 3. + +func init() { + if cpu.X86.HasAVX512 { + D64Eq = avx512D64Eq + D64Lt = avx512D64Lt + } +} + +// AVX-512 path: 8 D64 elements per inner step (Int64x8). Compare -> +// AND with broadcast(1) puts a 0/1 byte in the low byte of each qword. +// VPMOVQB (TruncateToInt8) then gathers the 8 low bytes into the low +// 8 bytes of an Int8x16. We extract that as a single uint64 and emit +// one 8-byte unsafe store. 
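+// Lane j's 0/1 lands in byte j of the packed word, so the little-endian
+// store (safe here: this file is amd64-only) writes lane j to out[i+j].
+// E.g. lanes (1,0,0,1,1,0,0,0) emit bytes 01 00 00 01 01 00 00 00 at &out[i].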
+ +//go:nosplit +func storeBools8(v archsimd.Int64x8, one archsimd.Int64x8, out []bool, i int) { + packed := v.And(one).TruncateToInt8().AsInt64x2().GetElem(0) + *(*uint64)(unsafe.Pointer(&out[i])) = uint64(packed) +} + +func avx512D64Eq(a, b []uint64, out []bool) { + n := len(a) + if n == 0 || len(b) < n || len(out) < n { + return + } + pa, pb := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]) + one := archsimd.BroadcastInt64x8(1) + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 8 + av := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off))) + bv := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off))) + storeBools8(av.Equal(bv).ToInt64x8(), one, out, i) + } + for ; i < n; i++ { + out[i] = a[i] == b[i] + } +} + +func avx512D64Lt(a, b []uint64, out []bool) { + n := len(a) + if n == 0 || len(b) < n || len(out) < n { + return + } + pa, pb := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]) + one := archsimd.BroadcastInt64x8(1) + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 8 + av := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off))) + bv := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off))) + storeBools8(av.Less(bv).ToInt64x8(), one, out, i) + } + for ; i < n; i++ { + out[i] = int64(a[i]) < int64(b[i]) + } +} diff --git a/pkg/common/simdkernels/d64_compare_test.go b/pkg/common/simdkernels/d64_compare_test.go new file mode 100644 index 0000000000000..3de2c2ce36e2b --- /dev/null +++ b/pkg/common/simdkernels/d64_compare_test.go @@ -0,0 +1,128 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package simdkernels + +import ( + "math" + "math/rand" + "strconv" + "testing" +) + +func makeD64Pair(n int, seed int64) ([]uint64, []uint64) { + r := rand.New(rand.NewSource(seed)) + a := make([]uint64, n) + b := make([]uint64, n) + for i := 0; i < n; i++ { + a[i] = uint64(r.Int63() - r.Int63()) + // 25% chance b shares value with a (test equality path) + if r.Intn(4) == 0 { + b[i] = a[i] + } else { + b[i] = uint64(r.Int63() - r.Int63()) + } + } + return a, b +} + +func edgeD64Pairs() (a, b []uint64) { + vals := []int64{ + math.MinInt64, math.MinInt64 + 1, -1 << 32, -1, 0, 1, 1 << 32, + math.MaxInt64 - 1, math.MaxInt64, + } + for _, x := range vals { + for _, y := range vals { + a = append(a, uint64(x)) + b = append(b, uint64(y)) + } + } + return +} + +func TestD64Eq(t *testing.T) { + cases := [][2][]uint64{} + for _, n := range []int{0, 1, 3, 4, 7, 8, 15, 16, 17, 64, 1023} { + a, b := makeD64Pair(n, int64(n)+1) + cases = append(cases, [2][]uint64{a, b}) + } + a, b := edgeD64Pairs() + cases = append(cases, [2][]uint64{a, b}) + + for ci, c := range cases { + a, b := c[0], c[1] + out := make([]bool, len(a)) + D64Eq(a, b, out) + for i := range a { + want := a[i] == b[i] + if out[i] != want { + t.Fatalf("case %d idx %d: D64Eq(%x,%x)=%v want %v", ci, i, a[i], b[i], out[i], want) + } + } + } +} + +func TestD64Lt(t *testing.T) { + cases := [][2][]uint64{} + for _, n := range []int{0, 1, 3, 4, 7, 8, 15, 16, 17, 64, 1023} { + a, b := makeD64Pair(n, int64(n)+101) + cases = append(cases, [2][]uint64{a, b}) + } + a, b := edgeD64Pairs() + cases = append(cases, [2][]uint64{a, b}) + + for ci, c := range cases { + a, b := c[0], c[1] + out := make([]bool, len(a)) + D64Lt(a, b, out) + for i := range a { + want := int64(a[i]) < int64(b[i]) + if out[i] != want { + t.Fatalf("case %d idx %d: D64Lt(%d,%d)=%v want %v", + ci, i, int64(a[i]), int64(b[i]), out[i], want) + } + } + } +} + +func benchD64Cmp(b *testing.B, fn func(a, b []uint64, out []bool), n int) { + x, y := makeD64Pair(n, 42) + out := make([]bool, n) + b.ResetTimer() + b.SetBytes(int64(n) * 16) + for i := 0; i < b.N; i++ { + fn(x, y, out) + } +} + +func BenchmarkD64Eq_Scalar(b *testing.B) { + for _, n := range []int{16, 64, 256, 1024, 4096} { + b.Run(strconv.Itoa(n), func(b *testing.B) { benchD64Cmp(b, scalarD64Eq, n) }) + } +} +func BenchmarkD64Eq_Dispatch(b *testing.B) { + for _, n := range []int{16, 64, 256, 1024, 4096} { + b.Run(strconv.Itoa(n), func(b *testing.B) { benchD64Cmp(b, D64Eq, n) }) + } +} +func BenchmarkD64Lt_Scalar(b *testing.B) { + for _, n := range []int{16, 64, 256, 1024, 4096} { + b.Run(strconv.Itoa(n), func(b *testing.B) { benchD64Cmp(b, scalarD64Lt, n) }) + } +} +func BenchmarkD64Lt_Dispatch(b *testing.B) { + for _, n := range []int{16, 64, 256, 1024, 4096} { + b.Run(strconv.Itoa(n), func(b *testing.B) { benchD64Cmp(b, D64Lt, n) }) + } +} diff --git a/pkg/common/simdkernels/d64_mul.go b/pkg/common/simdkernels/d64_mul.go new file mode 100644 index 0000000000000..8216d0bb0c381 --- /dev/null +++ b/pkg/common/simdkernels/d64_mul.go @@ -0,0 +1,133 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package simdkernels + +import "math/bits" + +// Decimal64 × Decimal64 → Decimal128 on slices. +// +// Inputs a, b are int64-typed (length n). Output r holds Decimal128 values +// stored as interleaved (lo, hi) uint64 pairs, length 2n. The kernel matches +// the semantics of d64MulInline in arith_decimal_fast.go: per element compute +// the signed 128-bit product of a[i] * b[i] via abs-mul-conditional-negate. +// +// The product of two int64s fits in 128 bits, so no overflow is possible at +// this step — the API has no Checked variant. +// +// scaleAdj is an output-scale adjustment in [-18, 0]. When non-zero the +// kernel divides the 128-bit product by 10^|scaleAdj| with half-up rounding +// (matching d128DivPow10Once). The scalar fallback reuses the same divisor +// per element; the SIMD path computes the product with SIMD and then defers +// the divide to a per-element scalar helper (the divide-by-constant primitive +// will be SIMD-vectorized in a separate task). + +var ( + D64MulNoBroadcast func(a, b, r []uint64, scaleAdj int32) = scalarD64MulNoBroadcast +) + +// pow10 mirrors types.Pow10 for 0..18. Duplicated here to keep simdkernels +// free of a dependency cycle on container/types. +var pow10Table = [19]uint64{ + 1, + 10, + 100, + 1000, + 10000, + 100000, + 1000000, + 10000000, + 100000000, + 1000000000, + 10000000000, + 100000000000, + 1000000000000, + 10000000000000, + 100000000000000, + 1000000000000000, + 10000000000000000, + 100000000000000000, + 1000000000000000000, +} + +func scalarD64MulNoBroadcast(a, b, r []uint64, scaleAdj int32) { + n := len(a) + if len(b) < n || len(r) < 2*n { + return + } + if scaleAdj == 0 { + for i := 0; i < n; i++ { + lo, hi := d64MulOne(a[i], b[i]) + r[2*i] = lo + r[2*i+1] = hi + } + return + } + d := pow10Table[-scaleAdj] + half := (d + 1) >> 1 + for i := 0; i < n; i++ { + lo, hi := d64MulOne(a[i], b[i]) + lo, hi = d128DivConst(lo, hi, d, half) + r[2*i] = lo + r[2*i+1] = hi + } +} + +// d64MulOne computes the signed 128-bit product (lo, hi) of two int64-typed +// uint64s, exactly mirroring d64MulInline in arith_decimal_fast.go. +func d64MulOne(av, bv uint64) (lo, hi uint64) { + xi, yi := int64(av), int64(bv) + mx, my := xi>>63, yi>>63 + ax, ay := uint64((xi^mx)-mx), uint64((yi^my)-my) + hi, lo = bits.Mul64(ax, ay) + nm := uint64((xi ^ yi) >> 63) // 0xFF..FF iff signs differ + lo ^= nm + hi ^= nm + var c uint64 + lo, c = bits.Add64(lo, 0, nm&1) + hi, _ = bits.Add64(hi, 0, c) + return lo, hi +} + +// d128DivConst divides a signed 128-bit value (lo, hi) by a positive +// constant d (≤ 10^18), with half-up rounding. Mirrors d128ScaleDownPow10 +// + d128DivPow10Once in arith_decimal_fast.go (sign extraction → unsigned +// divide → re-apply sign). +func d128DivConst(lo, hi, d, half uint64) (uint64, uint64) { + // Branchless abs of (lo, hi) when interpreted as int128. + // sign = top bit of hi as a 0/-1 mask. + sign := uint64(int64(hi) >> 63) + // Negate iff sign == -1: (lo, hi) := -(lo, hi). 
+ lo ^= sign + hi ^= sign + var c uint64 + lo, c = bits.Add64(lo, 0, sign&1) + hi, _ = bits.Add64(hi, 0, c) + + // Unsigned 128 ÷ 64 with half-up rounding (matches d128DivPow10Once). + var rem uint64 + hi, rem = bits.Div64(0, hi, d) + lo, rem = bits.Div64(rem, lo, d) + _, borrow := bits.Sub64(rem, half, 0) + round := 1 - borrow + lo, c = bits.Add64(lo, round, 0) + hi += c + + // Re-apply sign: if sign == -1, negate the quotient. + lo ^= sign + hi ^= sign + lo, c = bits.Add64(lo, 0, sign&1) + hi, _ = bits.Add64(hi, 0, c) + return lo, hi +} diff --git a/pkg/common/simdkernels/d64_mul_test.go b/pkg/common/simdkernels/d64_mul_test.go new file mode 100644 index 0000000000000..3c4244931cb99 --- /dev/null +++ b/pkg/common/simdkernels/d64_mul_test.go @@ -0,0 +1,226 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build goexperiment.simd && amd64 + +package simdkernels + +import ( + "math" + "math/bits" + "math/rand/v2" + "strconv" + "testing" +) + +// referenceD64Mul is the canonical scalar reference. Independent of the +// dispatcher under test (scalarD64MulNoBroadcast or any future SIMD swap-in) +// so the SIMD impls are checked against the same source-of-truth used by +// the main repo (d64MulInline + d128DivPow10Once semantics). 
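A quick spot-check of d64MulOne and d128DivConst from the kernel file above, written as a same-package test sketch (illustrative, not part of the patch). It pins the abs-mul-negate sign rule and both sides of the half-up boundary, where half = (d+1)>>1 = 50 for d = 100:

package simdkernels

import "testing"

func TestD64MulRoundingSketch(t *testing.T) {
	// Sign rule: (-3) × 5 must come back as -15 in 128 bits (hi all-ones).
	x := int64(-3)
	lo, hi := d64MulOne(uint64(x), 5)
	if int64(lo) != -15 || hi != ^uint64(0) {
		t.Fatalf("mul: got (%d, %#x)", int64(lo), hi)
	}
	// Half-up: rem 45 < 50 truncates; rem 50 >= 50 rounds away from zero.
	if q, _ := d128DivConst(12345, 0, 100, 50); q != 123 {
		t.Fatalf("div: want 123, got %d", q)
	}
	if q, _ := d128DivConst(12350, 0, 100, 50); q != 124 {
		t.Fatalf("div: want 124, got %d", q)
	}
}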
+func referenceD64Mul(a, b []uint64, scaleAdj int32) []uint64 { + n := len(a) + r := make([]uint64, 2*n) + for i := 0; i < n; i++ { + xi, yi := int64(a[i]), int64(b[i]) + mx, my := xi>>63, yi>>63 + ax, ay := uint64((xi^mx)-mx), uint64((yi^my)-my) + hi, lo := bits.Mul64(ax, ay) + nm := uint64((xi ^ yi) >> 63) + lo ^= nm + hi ^= nm + var c uint64 + lo, c = bits.Add64(lo, 0, nm&1) + hi, _ = bits.Add64(hi, 0, c) + if scaleAdj != 0 { + d := pow10Table[-scaleAdj] + half := (d + 1) >> 1 + // abs + s := uint64(int64(hi) >> 63) + lo ^= s + hi ^= s + lo, c = bits.Add64(lo, 0, s&1) + hi, _ = bits.Add64(hi, 0, c) + // divide + var rem uint64 + hi, rem = bits.Div64(0, hi, d) + lo, rem = bits.Div64(rem, lo, d) + _, borrow := bits.Sub64(rem, half, 0) + round := 1 - borrow + lo, c = bits.Add64(lo, round, 0) + hi += c + // re-sign + lo ^= s + hi ^= s + lo, c = bits.Add64(lo, 0, s&1) + hi, _ = bits.Add64(hi, 0, c) + } + r[2*i] = lo + r[2*i+1] = hi + } + return r +} + +func d64MulSizes() []int { + return []int{0, 1, 3, 4, 7, 8, 15, 16, 17, 31, 32, 33, 35, 63, 64, 127, 128, 255, 256, 1023, 4096} +} + +type d64MulImpl struct { + name string + fn func(a, b, r []uint64, scaleAdj int32) +} + +func d64MulImpls() []d64MulImpl { + out := []d64MulImpl{{"scalar", scalarD64MulNoBroadcast}} + if D64MulNoBroadcast != nil { + out = append(out, d64MulImpl{"dispatch", D64MulNoBroadcast}) + } + return out +} + +func makeRandInt64Slice(n int, seed uint64, maxAbs uint64) []uint64 { + rng := rand.New(rand.NewPCG(seed, seed^0xD1B54A32D192ED03)) + out := make([]uint64, n) + for i := range out { + v := rng.Uint64() + if maxAbs > 0 { + v %= 2 * maxAbs + out[i] = uint64(int64(v) - int64(maxAbs)) + } else { + out[i] = v + } + } + return out +} + +// u64 wraps a runtime int64→uint64 conversion. Used in test fixtures to +// avoid Go's compile-time check that rejects `uint64(int64(-N))` as a +// constant expression. 
+func u64(x int64) uint64 { return uint64(x) } + +func d64MulEdgeInputs() (a, b []uint64) { + a = []uint64{ + 0, + 1, + u64(-1), + u64(math.MaxInt64), + u64(math.MinInt64), + u64(math.MaxInt32), + u64(math.MinInt32), + 1 << 32, + 1<<32 - 1, + u64(-(1 << 32)), + u64(1e15), + u64(-1e15), + u64(1e9), + u64(-1e9), + u64(7), + u64(-7), + } + b = []uint64{ + 0, + u64(math.MaxInt64), + u64(math.MinInt64), + 1, + u64(-1), + 1 << 31, + u64(-(1 << 31)), + u64(1e9), + u64(-1e9), + 1 << 30, + u64(1e3), + u64(-1e3), + u64(1e9), + u64(-1e9), + u64(13), + u64(-13), + } + return a, b +} + +func TestD64MulCorrectness(t *testing.T) { + scaleAdjs := []int32{0, -1, -2, -4, -8, -12, -18} + impls := d64MulImpls() + + for _, sa := range scaleAdjs { + t.Run("scaleAdj="+strconv.Itoa(int(sa)), func(t *testing.T) { + for _, n := range d64MulSizes() { + a := makeRandInt64Slice(n, 0xC0FFEE^uint64(n), 1<<40) + b := makeRandInt64Slice(n, 0xBADBEEF^uint64(n), 1<<40) + want := referenceD64Mul(a, b, sa) + for _, im := range impls { + got := make([]uint64, 2*n) + im.fn(a, b, got, sa) + for i := 0; i < 2*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d sa=%d idx=%d: got 0x%x want 0x%x (a=%d b=%d)", + im.name, n, sa, i, got[i], want[i], + int64(a[i/2]), int64(b[i/2])) + return + } + } + } + } + }) + } +} + +func TestD64MulEdges(t *testing.T) { + a, b := d64MulEdgeInputs() + scaleAdjs := []int32{0, -1, -8, -18} + impls := d64MulImpls() + for _, sa := range scaleAdjs { + want := referenceD64Mul(a, b, sa) + for _, im := range impls { + got := make([]uint64, 2*len(a)) + im.fn(a, b, got, sa) + for i := 0; i < 2*len(a); i++ { + if got[i] != want[i] { + t.Fatalf("%s sa=%d idx=%d: got 0x%x want 0x%x (a=%d b=%d)", + im.name, sa, i, got[i], want[i], + int64(a[i/2]), int64(b[i/2])) + } + } + } + } +} + +// --------------------------------------------------------------------------- +// Benchmarks +// --------------------------------------------------------------------------- + +func benchD64Mul(b *testing.B, fn func(a, b, r []uint64, scaleAdj int32), n int, scaleAdj int32) { + a := makeRandInt64Slice(n, 0x1234, 1<<40) + bv := makeRandInt64Slice(n, 0x5678, 1<<40) + r := make([]uint64, 2*n) + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + fn(a, bv, r, scaleAdj) + } +} + +func BenchmarkD64Mul(b *testing.B) { + sizes := []int{16, 64, 256, 1024, 4096} + scaleAdjs := []int32{0, -8} + for _, sa := range scaleAdjs { + for _, n := range sizes { + b.Run("scalar/sa="+strconv.Itoa(int(sa))+"/n="+strconv.Itoa(n), func(b *testing.B) { + benchD64Mul(b, scalarD64MulNoBroadcast, n, sa) + }) + b.Run("dispatch/sa="+strconv.Itoa(int(sa))+"/n="+strconv.Itoa(n), func(b *testing.B) { + benchD64Mul(b, D64MulNoBroadcast, n, sa) + }) + } + } +} diff --git a/pkg/common/simdkernels/d64_scale.go b/pkg/common/simdkernels/d64_scale.go new file mode 100644 index 0000000000000..cd9c2b68b5b76 --- /dev/null +++ b/pkg/common/simdkernels/d64_scale.go @@ -0,0 +1,66 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package simdkernels + +import "math/bits" + +// Decimal64 scale-into-rs (multiply each element by a constant uint64 factor +// f = 10^scaleDiff), used by Add/Sub when input scales differ. +// +// Two variants: +// *Unchecked — caller has prescanned that every |vec[i]| ≤ MaxInt64/f, so +// a plain truncating 64-bit multiply suffices. Hot path for the +// common diff-scale add/sub case where data magnitude is small. +// *Checked — uses bits.Mul64 / 128-bit product, returns the index of the +// first overflowing element or -1. +// +// Signature matches the d64_addsub.go convention: slices are uint64 reinterpret +// of the underlying Decimal64. Sign is encoded in bit 63. + +var ( + D64ScaleUnchecked func(vec, rs []uint64, f uint64) = scalarD64ScaleUnchecked + D64ScaleChecked func(vec, rs []uint64, f uint64) int = scalarD64ScaleChecked +) + +func scalarD64ScaleUnchecked(vec, rs []uint64, f uint64) { + n := len(rs) + if len(vec) < n { + return + } + for i := 0; i < n; i++ { + signBit := vec[i] >> 63 + mask := -signBit + abs := (vec[i] ^ mask) + signBit + rs[i] = (abs*f ^ mask) + signBit + } +} + +func scalarD64ScaleChecked(vec, rs []uint64, f uint64) int { + n := len(rs) + if len(vec) < n { + return -1 + } + for i := 0; i < n; i++ { + signBit := vec[i] >> 63 + mask := -signBit + abs := (vec[i] ^ mask) + signBit + hi, lo := bits.Mul64(abs, f) + if hi|(lo>>63) != 0 { + return i + } + rs[i] = (lo ^ mask) + signBit + } + return -1 +} diff --git a/pkg/common/simdkernels/d64_scale_simd_amd64.go b/pkg/common/simdkernels/d64_scale_simd_amd64.go new file mode 100644 index 0000000000000..268510f6d59c8 --- /dev/null +++ b/pkg/common/simdkernels/d64_scale_simd_amd64.go @@ -0,0 +1,93 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build goexperiment.simd && amd64 + +package simdkernels + +import ( + "simd/archsimd" + "unsafe" + + "golang.org/x/sys/cpu" +) + +// D64Scale: per-lane signed 64-bit value × constant uint64 factor f. +// +// Algorithm (per lane, matches scalarD64ScaleUnchecked): +// +// mask = v >>arith 63 // 0 or -1 +// abs = (v XOR mask) - mask // |v| as Uint64 +// prod = abs * f // truncating low-64 product +// out = (prod XOR mask) - mask // restore sign +// +// The unchecked path assumes the caller has prescanned that abs ≤ MaxInt64/f +// for every lane, so the truncating mul cannot overflow. +// +// Only AVX-512 is enabled. AVX-512DQ provides VPMULLQ which gives one fused +// 64×64→low-64 multiply per Int64x8 lane. AVX2 lacks VPMULLQ; emulating it +// via 32×32 partial products (3 VPMULUDQ + shifts + adds per 4 lanes) is +// roughly 5× slower than scalar IMUL64 on Zen 3, so we fall back to scalar +// on AVX2-only hosts. 
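To make the Unchecked precondition concrete, here is a hedged caller sketch (scaleD64Sketch and its inline prescan are illustrative, not part of the patch): only when every |v| stays within MaxInt64/f is the truncating kernel safe; otherwise the Checked variant reports the first overflow:

package simdkernels

import "math"

// scaleD64Sketch prescans |v| <= MaxInt64/f using the same branchless abs
// as the kernels, then dispatches to the Unchecked variant; on any unsafe
// lane it falls back to Checked, which returns the first overflowing
// index or -1.
func scaleD64Sketch(vec, rs []uint64, f uint64) int {
	limit := uint64(math.MaxInt64) / f
	for _, v := range vec {
		sign := v >> 63
		mask := -sign
		if (v^mask)+sign > limit { // |v| too large for a truncating mul
			return D64ScaleChecked(vec, rs, f)
		}
	}
	D64ScaleUnchecked(vec, rs, f)
	return -1
}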
+ +func init() { + if cpu.X86.HasAVX512 { + D64ScaleUnchecked = avx512D64ScaleUnchecked + } +} + +// --------------------------------------------------------------------------- +// AVX-512 path — Int64x8.Mul (VPMULLQ from AVX-512DQ). +// One real multiply per 8 lanes. Main loop unrolls 32 lanes per iter. +// --------------------------------------------------------------------------- + +func avx512D64ScaleUnchecked(vec, rs []uint64, f uint64) { + n := len(rs) + if n == 0 || len(vec) < n { + return + } + pv, pr := unsafe.Pointer(&vec[0]), unsafe.Pointer(&rs[0]) + + fv := archsimd.BroadcastInt64x8(int64(f)) + zero := archsimd.BroadcastInt64x8(0) + + i := 0 + for ; i+32 <= n; i += 32 { + off := uintptr(i) * 8 + v0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+64))) + v2 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+128))) + v3 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+192))) + m0 := zero.Greater(v0).ToInt64x8() + m1 := zero.Greater(v1).ToInt64x8() + m2 := zero.Greater(v2).ToInt64x8() + m3 := zero.Greater(v3).ToInt64x8() + v0.Xor(m0).Sub(m0).Mul(fv).Xor(m0).Sub(m0).Store((*[8]int64)(unsafe.Add(pr, off))) + v1.Xor(m1).Sub(m1).Mul(fv).Xor(m1).Sub(m1).Store((*[8]int64)(unsafe.Add(pr, off+64))) + v2.Xor(m2).Sub(m2).Mul(fv).Xor(m2).Sub(m2).Store((*[8]int64)(unsafe.Add(pr, off+128))) + v3.Xor(m3).Sub(m3).Mul(fv).Xor(m3).Sub(m3).Store((*[8]int64)(unsafe.Add(pr, off+192))) + } + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 8 + v := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + m := zero.Greater(v).ToInt64x8() + v.Xor(m).Sub(m).Mul(fv).Xor(m).Sub(m).Store((*[8]int64)(unsafe.Add(pr, off))) + } + for ; i < n; i++ { + signBit := vec[i] >> 63 + mask := -signBit + abs := (vec[i] ^ mask) + signBit + rs[i] = (abs*f ^ mask) + signBit + } +} diff --git a/pkg/common/simdkernels/d64_scale_test.go b/pkg/common/simdkernels/d64_scale_test.go new file mode 100644 index 0000000000000..b382f5d384752 --- /dev/null +++ b/pkg/common/simdkernels/d64_scale_test.go @@ -0,0 +1,156 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build goexperiment.simd && amd64 + +package simdkernels + +import ( + "math" + "math/rand/v2" + "strconv" + "testing" +) + +// pow10 mirrors types.Pow10 up to 10^18; sufficient for D64 scale factors. 
+var pow10 = [...]uint64{ + 1, + 10, + 100, + 1000, + 10000, + 100000, + 1000000, + 10000000, + 100000000, + 1000000000, + 10000000000, + 100000000000, + 1000000000000, + 10000000000000, + 100000000000000, + 1000000000000000, + 10000000000000000, + 100000000000000000, + 1000000000000000000, +} + +func d64ScaleSizes() []int { + return []int{0, 1, 3, 4, 5, 7, 8, 15, 16, 17, 31, 32, 33, 63, 64, 127, 1024, 4096} +} + +type d64ScaleImpl struct { + name string + fn func(vec, rs []uint64, f uint64) +} + +func d64ScaleImpls() []d64ScaleImpl { + out := []d64ScaleImpl{{name: "scalar", fn: scalarD64ScaleUnchecked}} + if D64ScaleUnchecked != nil { + out = append(out, d64ScaleImpl{name: "dispatch", fn: D64ScaleUnchecked}) + } + return out +} + +// makeRandSafeD64 returns n int64 lanes, each guaranteed to satisfy +// |v| ≤ MaxInt64 / f (the unchecked-path precondition). +func makeRandSafeD64(n int, f uint64, seed uint64) []uint64 { + rng := rand.New(rand.NewPCG(seed, seed^0xDEADBEEF)) + maxAbs := uint64(math.MaxInt64) / f + out := make([]uint64, n) + for i := range out { + v := int64(rng.Uint64N(maxAbs + 1)) + if rng.Uint64()&1 == 1 { + v = -v + } + out[i] = uint64(v) + } + return out +} + +func TestD64ScaleUncheckedCorrectness(t *testing.T) { + for _, scale := range []int{1, 2, 6, 12, 18} { + f := pow10[scale] + for _, n := range d64ScaleSizes() { + vec := makeRandSafeD64(n, f, uint64(scale*1000+n)) + want := make([]uint64, n) + scalarD64ScaleUnchecked(vec, want, f) + for _, im := range d64ScaleImpls() { + got := make([]uint64, n) + im.fn(vec, got, f) + for i := 0; i < n; i++ { + if got[i] != want[i] { + t.Fatalf("%s scale=%d n=%d idx=%d: got 0x%x want 0x%x (vec=%d)", + im.name, scale, n, i, got[i], want[i], int64(vec[i])) + } + } + } + } + } +} + +func TestD64ScaleUncheckedEdges(t *testing.T) { + cases := []struct { + name string + vec []int64 + f uint64 + }{ + {"zeros", []int64{0, 0, 0, 0, 0}, 1000}, + {"ones", []int64{1, -1, 1, -1, 1, -1, 1, -1, 1}, pow10[6]}, + {"max_safe", []int64{ + int64(math.MaxInt64 / pow10[3]), -int64(math.MaxInt64 / pow10[3]), + int64(math.MaxInt64 / pow10[3]), -int64(math.MaxInt64 / pow10[3]), + }, pow10[3]}, + {"alt_signs", []int64{1, -1000, 12345, -987654321, 0, 99, -42, 100000}, pow10[2]}, + } + for _, tc := range cases { + vec := make([]uint64, len(tc.vec)) + for i, v := range tc.vec { + vec[i] = uint64(v) + } + want := make([]uint64, len(vec)) + scalarD64ScaleUnchecked(vec, want, tc.f) + for _, im := range d64ScaleImpls() { + got := make([]uint64, len(vec)) + im.fn(vec, got, tc.f) + for i := range vec { + if got[i] != want[i] { + t.Fatalf("%s/%s idx=%d: got %d want %d", tc.name, im.name, i, + int64(got[i]), int64(want[i])) + } + } + } + } +} + +func benchmarkD64Scale(b *testing.B, fn func(vec, rs []uint64, f uint64), n int, f uint64) { + vec := makeRandSafeD64(n, f, 0xC0FFEE^uint64(n)) + rs := make([]uint64, n) + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + fn(vec, rs, f) + } +} + +func BenchmarkD64ScaleUnchecked(b *testing.B) { + f := pow10[6] + for _, n := range []int{16, 64, 256, 1024, 4096} { + for _, im := range d64ScaleImpls() { + b.Run(im.name+"/n="+strconv.Itoa(n), func(b *testing.B) { + benchmarkD64Scale(b, im.fn, n, f) + }) + } + } +} diff --git a/pkg/common/simdkernels/decimal_amd64.go b/pkg/common/simdkernels/decimal_amd64.go new file mode 100644 index 0000000000000..22774d2451e3b --- /dev/null +++ b/pkg/common/simdkernels/decimal_amd64.go @@ -0,0 +1,26 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, 
Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package simdkernels + +import "unsafe" + +// Decimal64SignExtend widens n Decimal64 (uint64) values to Decimal128 via +// branchless sign extension. dst points to the Decimal128 output array, +// src points to the Decimal64 input array. +// Uses 4× loop unrolling and PREFETCHT0 to hide L2 latency on cold +// destination cache lines that would otherwise stall the store pipeline. +// +//go:noescape +func Decimal64SignExtend(dst, src unsafe.Pointer, n int) diff --git a/pkg/common/simdkernels/decimal_amd64.s b/pkg/common/simdkernels/decimal_amd64.s new file mode 100644 index 0000000000000..9db17fb647988 --- /dev/null +++ b/pkg/common/simdkernels/decimal_amd64.s @@ -0,0 +1,94 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "textflag.h" + +// func Decimal64SignExtend(dst, src unsafe.Pointer, n int) +// +// Widens n Decimal64 (8-byte) values to Decimal128 (16-byte struct {B0_63, B64_127 uint64}) +// via arithmetic right-shift sign extension. +// 4× unrolled with PREFETCHT0 on destination to hide L2 RFO latency. +TEXT ·Decimal64SignExtend(SB), NOSPLIT, $0-24 + MOVQ dst+0(FP), DI + MOVQ src+8(FP), SI + MOVQ n+16(FP), CX + + TESTQ CX, CX + JLE done + + // DX = n/4 (number of 4-element iterations) + // CX = n%4 (remainder) + MOVQ CX, DX + SHRQ $2, DX + ANDQ $3, CX + + TESTQ DX, DX + JZ remainder + +loop4: + // Prefetch destination 4 cache lines ahead (16 elements × 16 bytes = 256 bytes). + PREFETCHT0 256(DI) + // Prefetch source 2 cache lines ahead (16 elements × 8 bytes = 128 bytes). 
+ PREFETCHT0 128(SI) + + // Element 0 + MOVQ 0(SI), AX + MOVQ AX, R8 + SARQ $63, R8 + MOVQ AX, 0(DI) + MOVQ R8, 8(DI) + + // Element 1 + MOVQ 8(SI), AX + MOVQ AX, R8 + SARQ $63, R8 + MOVQ AX, 16(DI) + MOVQ R8, 24(DI) + + // Element 2 + MOVQ 16(SI), AX + MOVQ AX, R8 + SARQ $63, R8 + MOVQ AX, 32(DI) + MOVQ R8, 40(DI) + + // Element 3 + MOVQ 24(SI), AX + MOVQ AX, R8 + SARQ $63, R8 + MOVQ AX, 48(DI) + MOVQ R8, 56(DI) + + ADDQ $32, SI // 4 × 8-byte source elements + ADDQ $64, DI // 4 × 16-byte destination elements + DECQ DX + JNZ loop4 + +remainder: + TESTQ CX, CX + JZ done + +loop1: + MOVQ 0(SI), AX + MOVQ AX, R8 + SARQ $63, R8 + MOVQ AX, 0(DI) + MOVQ R8, 8(DI) + ADDQ $8, SI + ADDQ $16, DI + DECQ CX + JNZ loop1 + +done: + RET diff --git a/pkg/common/simdkernels/decimal_arm64.go b/pkg/common/simdkernels/decimal_arm64.go new file mode 100644 index 0000000000000..9ee6012ce71da --- /dev/null +++ b/pkg/common/simdkernels/decimal_arm64.go @@ -0,0 +1,28 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package simdkernels + +import "unsafe" + +// Decimal64SignExtend widens n Decimal64 (uint64) values to Decimal128 via +// branchless sign extension. Pure Go fallback on arm64. +func Decimal64SignExtend(dst, src unsafe.Pointer, n int) { + for i := 0; i < n; i++ { + val := *(*uint64)(unsafe.Add(src, i*8)) + s := uint64(int64(val) >> 63) + *(*uint64)(unsafe.Add(dst, i*16)) = val + *(*uint64)(unsafe.Add(dst, i*16+8)) = s + } +} diff --git a/pkg/common/simdkernels/decimal_generic.go b/pkg/common/simdkernels/decimal_generic.go new file mode 100644 index 0000000000000..21cb80ae9be39 --- /dev/null +++ b/pkg/common/simdkernels/decimal_generic.go @@ -0,0 +1,30 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build !amd64 && !arm64 + +package simdkernels + +import "unsafe" + +// Decimal64SignExtend widens n Decimal64 (uint64) values to Decimal128 via +// branchless sign extension. Pure Go fallback for unsupported architectures. 
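All three build variants (amd64 assembly, arm64 fallback, generic fallback) share the same contract, so a single usage sketch covers them (values hypothetical; raw uint64 buffers stand in for the Decimal64 input and the (lo, hi)-paired Decimal128 output):

package main

import (
	"fmt"
	"unsafe"

	"github.com/matrixorigin/matrixone/pkg/common/simdkernels"
)

func main() {
	neg := int64(-5)
	src := []uint64{5, uint64(neg)}   // two Decimal64 values
	dst := make([]uint64, 2*len(src)) // Decimal128 output: (lo, hi) pairs
	simdkernels.Decimal64SignExtend(
		unsafe.Pointer(&dst[0]), unsafe.Pointer(&src[0]), len(src))
	fmt.Printf("%#x\n", dst) // [0x5 0x0 0xfffffffffffffffb 0xffffffffffffffff]
}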
+func Decimal64SignExtend(dst, src unsafe.Pointer, n int) { + for i := 0; i < n; i++ { + val := *(*uint64)(unsafe.Add(src, i*8)) + s := uint64(int64(val) >> 63) + *(*uint64)(unsafe.Add(dst, i*16)) = val + *(*uint64)(unsafe.Add(dst, i*16+8)) = s + } +} diff --git a/pkg/embed/tpch_agg_bench_test.go b/pkg/embed/tpch_agg_bench_test.go new file mode 100644 index 0000000000000..20502d2a3bdae --- /dev/null +++ b/pkg/embed/tpch_agg_bench_test.go @@ -0,0 +1,185 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package embed + +import ( + "context" + "database/sql" + "fmt" + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +func TestTPCHAggBench(t *testing.T) { + RunBaseClusterTests( + func(c Cluster) { + cn0, err := c.GetCNService(0) + require.NoError(t, err) + + dsn := fmt.Sprintf("dump:111@tcp(127.0.0.1:%d)/", + cn0.GetServiceConfig().CN.Frontend.Port, + ) + + db, err := sql.Open("mysql", dsn) + require.NoError(t, err) + defer db.Close() + + ctx := context.Background() + conn, err := db.Conn(ctx) + require.NoError(t, err) + defer conn.Close() + + // Create and load TPC-H tables (SF=0.01 inline) + setupTPCH(t, ctx, conn) + + queries := []struct { + name string + sql string + }{ + {"Q1", `SELECT l_returnflag, l_linestatus, + sum(l_quantity), sum(l_extendedprice), + sum(l_extendedprice * (1 - l_discount)), + avg(l_quantity), avg(l_extendedprice), avg(l_discount), count(*) + FROM lineitem + WHERE l_shipdate <= date '1998-12-01' - interval '90' day + GROUP BY l_returnflag, l_linestatus`}, + {"Q5-like", `SELECT n_name, sum(l_extendedprice * (1 - l_discount)) as revenue + FROM lineitem, orders, customer, nation, region, supplier + WHERE c_custkey = o_custkey AND l_orderkey = o_orderkey + AND l_suppkey = s_suppkey AND c_nationkey = s_nationkey + AND s_nationkey = n_nationkey AND n_regionkey = r_regionkey + AND r_name = 'ASIA' + GROUP BY n_name ORDER BY revenue DESC`}, + {"CountStar", `SELECT count(*) FROM lineitem`}, + {"SumDecimal", `SELECT l_returnflag, sum(l_extendedprice) FROM lineitem GROUP BY l_returnflag`}, + } + + // Warmup + for _, q := range queries { + _, err := conn.ExecContext(ctx, q.sql) + require.NoError(t, err, "query %s failed", q.name) + } + + // Benchmark + const iterations = 5 + for _, q := range queries { + var total time.Duration + for i := 0; i < iterations; i++ { + start := time.Now() + _, err := conn.ExecContext(ctx, q.sql) + require.NoError(t, err) + total += time.Since(start) + } + t.Logf("%-12s avg=%v", q.name, total/iterations) + } + }, + ) +} + +func setupTPCH(t *testing.T, ctx context.Context, conn *sql.Conn) { + t.Helper() + + ddls := []string{ + `CREATE DATABASE IF NOT EXISTS tpch`, + `USE tpch`, + `DROP TABLE IF EXISTS nation`, + `DROP TABLE IF EXISTS region`, + `DROP TABLE IF EXISTS supplier`, + `DROP TABLE IF EXISTS customer`, + `DROP TABLE IF EXISTS orders`, + `DROP TABLE IF EXISTS lineitem`, + `CREATE TABLE region (r_regionkey INT, r_name CHAR(25), r_comment VARCHAR(152), PRIMARY KEY (r_regionkey))`, + 
`CREATE TABLE nation (n_nationkey INT, n_name CHAR(25), n_regionkey INT, n_comment VARCHAR(152), PRIMARY KEY (n_nationkey))`, + `CREATE TABLE supplier (s_suppkey INT, s_name CHAR(25), s_address VARCHAR(40), s_nationkey INT, s_phone CHAR(15), s_acctbal DECIMAL(15,2), s_comment VARCHAR(101), PRIMARY KEY (s_suppkey))`, + `CREATE TABLE customer (c_custkey INT, c_name VARCHAR(25), c_address VARCHAR(40), c_nationkey INT, c_phone CHAR(15), c_acctbal DECIMAL(15,2), c_mktsegment CHAR(10), c_comment VARCHAR(117), PRIMARY KEY (c_custkey))`, + `CREATE TABLE orders (o_orderkey BIGINT, o_custkey INT, o_orderstatus CHAR(1), o_totalprice DECIMAL(15,2), o_orderdate DATE, o_orderpriority CHAR(15), o_clerk CHAR(15), o_shippriority INT, o_comment VARCHAR(79), PRIMARY KEY (o_orderkey))`, + `CREATE TABLE lineitem (l_orderkey BIGINT, l_partkey INT, l_suppkey INT, l_linenumber INT, l_quantity DECIMAL(15,2), l_extendedprice DECIMAL(15,2), l_discount DECIMAL(15,2), l_tax DECIMAL(15,2), l_returnflag CHAR(1), l_linestatus CHAR(1), l_shipdate DATE, l_commitdate DATE, l_receiptdate DATE, l_shipinstruct CHAR(25), l_shipmode CHAR(10), l_comment VARCHAR(44), PRIMARY KEY (l_orderkey, l_linenumber))`, + } + for _, ddl := range ddls { + _, err := conn.ExecContext(ctx, ddl) + require.NoError(t, err, "DDL failed: %s", ddl) + } + + // Generate synthetic data (100K lineitem rows) + _, err := conn.ExecContext(ctx, `USE tpch`) + require.NoError(t, err) + + // Regions + _, err = conn.ExecContext(ctx, `INSERT INTO region VALUES (0,'AFRICA','a'),(1,'AMERICA','b'),(2,'ASIA','c'),(3,'EUROPE','d'),(4,'MIDDLE EAST','e')`) + require.NoError(t, err) + + // Nations (25) + for i := 0; i < 25; i++ { + _, err = conn.ExecContext(ctx, fmt.Sprintf(`INSERT INTO nation VALUES (%d, 'NATION_%d', %d, 'comment')`, i, i, i%5)) + require.NoError(t, err) + } + + // Suppliers (100) + for i := 1; i <= 100; i++ { + _, err = conn.ExecContext(ctx, fmt.Sprintf(`INSERT INTO supplier VALUES (%d,'Supplier#%05d','addr%d',%d,'phone%d',%.2f,'comment')`, i, i, i, i%25, i, float64(i)*10.5)) + require.NoError(t, err) + } + + // Customers (1500) + for batch := 0; batch < 15; batch++ { + vals := "" + for i := 0; i < 100; i++ { + id := batch*100 + i + 1 + if i > 0 { + vals += "," + } + vals += fmt.Sprintf("(%d,'Customer#%09d','addr%d',%d,'phone%d',%.2f,'BUILDING','comment')", id, id, id, id%25, id, float64(id)*5.5) + } + _, err = conn.ExecContext(ctx, "INSERT INTO customer VALUES "+vals) + require.NoError(t, err) + } + + // Orders (15000) + for batch := 0; batch < 150; batch++ { + vals := "" + for i := 0; i < 100; i++ { + id := batch*100 + i + 1 + if i > 0 { + vals += "," + } + vals += fmt.Sprintf("(%d,%d,'O',%.2f,'1995-01-01','1-URGENT','Clerk#000001',0,'comment')", id, (id%1500)+1, float64(id)*100.5) + } + _, err = conn.ExecContext(ctx, "INSERT INTO orders VALUES "+vals) + require.NoError(t, err) + } + + // Lineitem (100K rows in batches of 1000) + flags := []string{"A", "N", "R"} + statuses := []string{"F", "O"} + for batch := 0; batch < 100; batch++ { + vals := "" + for i := 0; i < 1000; i++ { + row := batch*1000 + i + orderkey := int64(row + 1) + linenum := 1 + if i > 0 { + vals += "," + } + vals += fmt.Sprintf("(%d,%d,%d,%d,%.2f,%.2f,%.2f,%.2f,'%s','%s','1995-06-17','1995-06-17','1995-06-17','DELIVER IN PERSON','TRUCK','comment')", + orderkey, row%200000+1, row%100+1, linenum, + float64(row%50)+1, float64(row%10000)+100.0, float64(row%11)*0.01, 0.02, + flags[row%3], statuses[row%2]) + } + _, err = conn.ExecContext(ctx, "INSERT INTO lineitem VALUES "+vals) + 
require.NoError(t, err)
+	}
+}
diff --git a/pkg/sql/colexec/aggexec/sum_decimal_fast.go b/pkg/sql/colexec/aggexec/sum_decimal_fast.go
index ba52e38a94260..788a41d2d6bb8 100644
--- a/pkg/sql/colexec/aggexec/sum_decimal_fast.go
+++ b/pkg/sql/colexec/aggexec/sum_decimal_fast.go
@@ -19,14 +19,25 @@ package aggexec

 import (
 	"slices"
+	"unsafe"

 	"github.com/matrixorigin/matrixone/pkg/common/bitmap"
 	"github.com/matrixorigin/matrixone/pkg/common/mpool"
+	"github.com/matrixorigin/matrixone/pkg/common/simdkernels"
 	"github.com/matrixorigin/matrixone/pkg/common/util"
 	"github.com/matrixorigin/matrixone/pkg/container/types"
 	"github.com/matrixorigin/matrixone/pkg/container/vector"
 )

+// sumReduceRunMin is the minimum run length (consecutive same-group, non-null
+// rows over a flat vector) at which it pays to call into the SIMD SumReduce
+// kernel instead of the scalar inner loop. Tuned by end-to-end micro-benching
+// the dispatched kernel + slice-construction path against the inline scalar
+// loop on Zen 3 (AVX2). Break-even is at n≈32 for both D128 and D64; n=64
+// gives a comfortable margin (D128 1.4×, D64 2.0×) without sacrificing many
+// short-run opportunities.
+const sumReduceRunMin = 64
+
 // ---- Decimal64 SUM/AVG ----

 type sumDecimal64FastExec struct {
@@ -88,6 +99,53 @@ func (exec *sumDecimal64FastExec) batchFill(offset int, groups []uint64, vectors
 		np = vec.GetNulls().GetBitmap()
 	}

+	// Fast path: scan for runs of the same group (no nulls, flat vec).
+	// Within a run we can SIMD-sum-reduce a contiguous Decimal64 slice and
+	// fold the 128-bit total into the state directly.
+	if !hasNull && constMask == -1 {
+		lastX := -1
+		var sums *[AggBatchSize]types.Decimal128
+		var cnts []int64
+		i := 0
+		N := len(groups)
+		for i < N {
+			grp := groups[i]
+			if grp == GroupNotMatched {
+				i++
+				continue
+			}
+			j := i + 1
+			for j < N && groups[j] == grp {
+				j++
+			}
+			runLen := j - i
+			g := grp - 1
+			x := int(g >> aggBatchSizeShift)
+			if x != lastX {
+				lastX = x
+				sums = chunkArr[types.Decimal128](exec.state[x].vecs[0])
+				cnts = vector.MustFixedColNoTypeCheck[int64](exec.state[x].vecs[1])
+			}
+			y := g & aggBatchSizeMask
+			if runLen >= sumReduceRunMin {
+				idx := i + offset
+				slo, shi := simdkernels.D64SumReduceToD128(
+					unsafe.Slice((*uint64)(unsafe.Pointer(&vals[idx])), runLen))
+				sums[y] = sums[y].Add128Unchecked(types.Decimal128{B0_63: slo, B64_127: shi})
+				cnts[y] += int64(runLen)
+			} else {
+				for k := i; k < j; k++ {
+					raw := vals[k+offset]
+					hi := uint64(int64(raw) >> 63)
+					sums[y] = sums[y].Add128Unchecked(types.Decimal128{B0_63: uint64(raw), B64_127: hi})
+				}
+				cnts[y] += int64(runLen)
+			}
+			i = j
+		}
+		return nil
+	}
+
 	const maxSlots = 255
 	var slotOf [256]uint8
 	var localSums [maxSlots]types.Decimal128
diff --git a/pkg/sql/plan/function/arith_decimal_fast.go b/pkg/sql/plan/function/arith_decimal_fast.go
index 13d1040c504a3..621eabef15327 100644
--- a/pkg/sql/plan/function/arith_decimal_fast.go
+++ b/pkg/sql/plan/function/arith_decimal_fast.go
@@ -29,14 +29,25 @@ package function

 import (
 	"math"
 	"math/bits"
+	"unsafe"

 	"github.com/matrixorigin/matrixone/pkg/common/bitmap"
 	"github.com/matrixorigin/matrixone/pkg/common/moerr"
+	"github.com/matrixorigin/matrixone/pkg/common/simdkernels"
 	"github.com/matrixorigin/matrixone/pkg/container/nulls"
 	"github.com/matrixorigin/matrixone/pkg/container/types"
 	"github.com/matrixorigin/matrixone/pkg/vm/process"
 )

+// d128AsU64 reinterprets a []Decimal128 as []uint64 (lo, hi, lo, hi, ...).
+// Decimal128 is exactly {B0_63, B64_127 uint64} with no padding.
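The no-padding claim that makes this reinterpretation safe can be spot-checked with a test-style sketch (illustrative, not part of the patch); note the returned slice aliases the Decimal128 backing memory, so writes flow both ways:

package function

import (
	"testing"

	"github.com/matrixorigin/matrixone/pkg/container/types"
)

func TestD128AsU64AliasSketch(t *testing.T) {
	d := []types.Decimal128{{B0_63: 1, B64_127: 2}}
	u := d128AsU64(d)
	if len(u) != 2 || u[0] != 1 || u[1] != 2 {
		t.Fatalf("unexpected layout: %v", u)
	}
	u[1] = 7 // aliases d[0].B64_127
	if d[0].B64_127 != 7 {
		t.Fatal("expected write-through aliasing")
	}
}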
+func d128AsU64(s []types.Decimal128) []uint64 { + if len(s) == 0 { + return nil + } + return unsafe.Slice((*uint64)(unsafe.Pointer(&s[0])), len(s)*2) +} + func operandsAt[T any](v1, v2 []T, idx int) (T, T) { // Branchless: mask is 0 when len==1 (scalar), -1 when len>1 (vector). // idx & 0 = 0 (always v[0]), idx & -1 = idx (v[idx]). @@ -140,14 +151,7 @@ func d128AddSameScale(v1, v2, rs []types.Decimal128, rsnull *nulls.Nulls) int { if len1 == len2 { if noNull { - for i := 0; i < len1; i++ { - signX := v1[i].B64_127 >> 63 - rs[i].B0_63, carry = bits.Add64(v1[i].B0_63, v2[i].B0_63, 0) - rs[i].B64_127, _ = bits.Add64(v1[i].B64_127, v2[i].B64_127, carry) - if signX == v2[i].B64_127>>63 && signX != rs[i].B64_127>>63 { - return i - } - } + return simdkernels.D128AddChecked(d128AsU64(v1), d128AsU64(v2), d128AsU64(rs)) } else { for i := 0; i < len1; i++ { if bmp.Contains(uint64(i)) { @@ -165,13 +169,7 @@ func d128AddSameScale(v1, v2, rs []types.Decimal128, rsnull *nulls.Nulls) int { a := v1[0] signA := a.B64_127 >> 63 if noNull { - for i := 0; i < len2; i++ { - rs[i].B0_63, carry = bits.Add64(a.B0_63, v2[i].B0_63, 0) - rs[i].B64_127, _ = bits.Add64(a.B64_127, v2[i].B64_127, carry) - if signA == v2[i].B64_127>>63 && signA != rs[i].B64_127>>63 { - return i - } - } + return simdkernels.D128AddScalarChecked(a.B0_63, a.B64_127, d128AsU64(v2), d128AsU64(rs)) } else { for i := 0; i < len2; i++ { if bmp.Contains(uint64(i)) { @@ -188,13 +186,7 @@ func d128AddSameScale(v1, v2, rs []types.Decimal128, rsnull *nulls.Nulls) int { b := v2[0] signB := b.B64_127 >> 63 if noNull { - for i := 0; i < len1; i++ { - rs[i].B0_63, carry = bits.Add64(v1[i].B0_63, b.B0_63, 0) - rs[i].B64_127, _ = bits.Add64(v1[i].B64_127, b.B64_127, carry) - if v1[i].B64_127>>63 == signB && v1[i].B64_127>>63 != rs[i].B64_127>>63 { - return i - } - } + return simdkernels.D128AddScalarChecked(b.B0_63, b.B64_127, d128AsU64(v1), d128AsU64(rs)) } else { for i := 0; i < len1; i++ { if bmp.Contains(uint64(i)) { @@ -291,14 +283,7 @@ func d128SubSameScale(v1, v2, rs []types.Decimal128, rsnull *nulls.Nulls) int { if len1 == len2 { if noNull { - for i := 0; i < len1; i++ { - signX := v1[i].B64_127 >> 63 - rs[i].B0_63, borrow = bits.Sub64(v1[i].B0_63, v2[i].B0_63, 0) - rs[i].B64_127, _ = bits.Sub64(v1[i].B64_127, v2[i].B64_127, borrow) - if signX != v2[i].B64_127>>63 && signX != rs[i].B64_127>>63 { - return i - } - } + return simdkernels.D128SubChecked(d128AsU64(v1), d128AsU64(v2), d128AsU64(rs)) } else { for i := 0; i < len1; i++ { if bmp.Contains(uint64(i)) { @@ -316,13 +301,7 @@ func d128SubSameScale(v1, v2, rs []types.Decimal128, rsnull *nulls.Nulls) int { a := v1[0] signA := a.B64_127 >> 63 if noNull { - for i := 0; i < len2; i++ { - rs[i].B0_63, borrow = bits.Sub64(a.B0_63, v2[i].B0_63, 0) - rs[i].B64_127, _ = bits.Sub64(a.B64_127, v2[i].B64_127, borrow) - if signA != v2[i].B64_127>>63 && signA != rs[i].B64_127>>63 { - return i - } - } + return simdkernels.D128ScalarSubChecked(a.B0_63, a.B64_127, d128AsU64(v2), d128AsU64(rs)) } else { for i := 0; i < len2; i++ { if bmp.Contains(uint64(i)) { @@ -339,14 +318,7 @@ func d128SubSameScale(v1, v2, rs []types.Decimal128, rsnull *nulls.Nulls) int { b := v2[0] signB := b.B64_127 >> 63 if noNull { - for i := 0; i < len1; i++ { - signX := v1[i].B64_127 >> 63 - rs[i].B0_63, borrow = bits.Sub64(v1[i].B0_63, b.B0_63, 0) - rs[i].B64_127, _ = bits.Sub64(v1[i].B64_127, b.B64_127, borrow) - if signX != signB && signX != rs[i].B64_127>>63 { - return i - } - } + return simdkernels.D128SubScalarChecked(d128AsU64(v1), 
b.B0_63, b.B64_127, d128AsU64(rs)) } else { for i := 0; i < len1; i++ { if bmp.Contains(uint64(i)) { diff --git a/pkg/sql/plan/function/arith_decimal_fast_test.go b/pkg/sql/plan/function/arith_decimal_fast_test.go index 19cd55b2918f3..1d3a807b1b2a7 100644 --- a/pkg/sql/plan/function/arith_decimal_fast_test.go +++ b/pkg/sql/plan/function/arith_decimal_fast_test.go @@ -1976,6 +1976,22 @@ func BenchmarkD128IntDiv_Generic(b *testing.B) { } } +func BenchmarkD128Sub_Fast(b *testing.B) { + rng := rand.New(rand.NewSource(42)) + xs := make([]types.Decimal128, benchN) + ys := make([]types.Decimal128, benchN) + rs := make([]types.Decimal128, benchN) + for i := range xs { + xs[i] = randD128(rng) + ys[i] = randD128(rng) + } + nul := nulls.NewWithSize(benchN) + b.ResetTimer() + for iter := 0; iter < b.N; iter++ { + _ = d128Sub(xs, ys, rs, 2, 2, nul) + } +} + func randD256(rng *rand.Rand) types.Decimal256 { return types.Decimal256{ B0_63: rng.Uint64(), diff --git a/pkg/sql/plan/function/func_cast.go b/pkg/sql/plan/function/func_cast.go index ca094c08c85dc..30a06121298e0 100644 --- a/pkg/sql/plan/function/func_cast.go +++ b/pkg/sql/plan/function/func_cast.go @@ -27,6 +27,7 @@ import ( "unsafe" "github.com/matrixorigin/matrixone/pkg/common/moerr" + "github.com/matrixorigin/matrixone/pkg/common/simdkernels" "github.com/matrixorigin/matrixone/pkg/common/util" "github.com/matrixorigin/matrixone/pkg/container/bytejson" "github.com/matrixorigin/matrixone/pkg/container/nulls" @@ -4309,18 +4310,9 @@ func decimal64ToDecimal128Array( } } else { if totype.Scale == fromtype.Scale { - // Fast path: direct slice write with branchless sign extension. - // BCE hints + int index eliminate bounds checks in the inner loop. + // Fast path: assembly sign-extend with prefetch for cold destination. dst := vector.MustFixedColNoTypeCheck[types.Decimal128](to.GetResultVector()) - _ = v[length-1] - _ = dst[length-1] - for i := 0; i < length; i++ { - s := int64(v[i]) >> 63 - dst[i] = types.Decimal128{ - B0_63: uint64(v[i]), - B64_127: uint64(s), - } - } + simdkernels.Decimal64SignExtend(unsafe.Pointer(&dst[0]), unsafe.Pointer(&v[0]), length) } else { for i := 0; i < length; i++ { fromdec := types.Decimal128{B0_63: uint64(v[i]), B64_127: 0} @@ -4352,15 +4344,7 @@ func decimal64ToDecimal128Array( rsVec.GetNulls().Or(srcVec.GetNulls()) dst := vector.MustFixedColNoTypeCheck[types.Decimal128](rsVec) v := vector.MustFixedColWithTypeCheck[types.Decimal64](srcVec) - _ = v[length-1] - _ = dst[length-1] - for i := 0; i < length; i++ { - s := int64(v[i]) >> 63 - dst[i] = types.Decimal128{ - B0_63: uint64(v[i]), - B64_127: uint64(s), - } - } + simdkernels.Decimal64SignExtend(unsafe.Pointer(&dst[0]), unsafe.Pointer(&v[0]), length) } else { var dft types.Decimal128 for i := 0; i < length; i++ {