diff --git a/Makefile b/Makefile index 2aee18cc749f0..0d8314fd5d6c5 100644 --- a/Makefile +++ b/Makefile @@ -222,7 +222,7 @@ thirdparties: .PHONY: build build: config cgo thirdparties $(info [Build binary]) - $(CGO_OPTS) go build $(TAGS) $(RACE_OPT) $(GOLDFLAGS) $(DEBUG_OPT) $(GOBUILD_OPT) -o $(BIN_NAME) ./cmd/mo-service + $(CGO_OPTS) GOEXPERIMENT=simd go build $(TAGS) $(RACE_OPT) $(GOLDFLAGS) $(DEBUG_OPT) $(GOBUILD_OPT) -o $(BIN_NAME) ./cmd/mo-service # https://wiki.musl-libc.org/getting-started.html # https://musl.cc/ diff --git a/go.mod b/go.mod index 97ad6a4c2812d..bd6181ab754fa 100644 --- a/go.mod +++ b/go.mod @@ -1,7 +1,7 @@ module github.com/matrixorigin/matrixone // Minimum Go version required -go 1.25.4 +go 1.26.2 require ( github.com/BurntSushi/toml v1.2.1 diff --git a/optools/run_ut.sh b/optools/run_ut.sh index a8a8205891efe..7d8f2015291f5 100755 --- a/optools/run_ut.sh +++ b/optools/run_ut.sh @@ -102,11 +102,11 @@ function run_tests(){ if [[ $SKIP_TESTS == 'race' ]]; then logger "INF" "Run UT without race check" - CGO_CFLAGS="${CGO_CFLAGS}" CGO_LDFLAGS="${CGO_LDFLAGS}" go test -short -v -json -tags matrixone_test -p ${UT_PARALLEL} -timeout "${UT_TIMEOUT}m" $test_scope > $UT_REPORT + CGO_CFLAGS="${CGO_CFLAGS}" CGO_LDFLAGS="${CGO_LDFLAGS}" GOEXPERIMENT=simd go test -short -v -json -tags matrixone_test -p ${UT_PARALLEL} -timeout "${UT_TIMEOUT}m" $test_scope > $UT_REPORT else logger "INF" "Run UT with race check" - CGO_CFLAGS="${CGO_CFLAGS}" CGO_LDFLAGS="${CGO_LDFLAGS}" go test -short -v -json -tags matrixone_test -p ${UT_PARALLEL} -timeout "${UT_TIMEOUT}m" -race $test_scope > $UT_REPORT + CGO_CFLAGS="${CGO_CFLAGS}" CGO_LDFLAGS="${CGO_LDFLAGS}" GOEXPERIMENT=simd go test -short -v -json -tags matrixone_test -p ${UT_PARALLEL} -timeout "${UT_TIMEOUT}m" -race $test_scope > $UT_REPORT fi } diff --git a/pkg/common/simdkernels/d128_addsub.go b/pkg/common/simdkernels/d128_addsub.go new file mode 100644 index 0000000000000..d99b55cf446c1 --- /dev/null +++ b/pkg/common/simdkernels/d128_addsub.go @@ -0,0 +1,338 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package simdkernels + +import "math/bits" + +// Decimal128 add/sub on slices of uint64 with the matrixone Decimal128 +// in-memory layout (lo, hi pair per element). The slices have length 2*N +// where N is the element count; element i occupies indices 2i (lo) and +// 2i+1 (hi). Hi is interpreted as int64 for the signed-overflow predicate. +// +// Operands and result are assumed to share the same scale. +// +// Two variants per operator (mirroring D64*): +// +// *Unchecked — wraps on overflow, no detection. +// *Checked — returns the first overflowing element index, or -1 if none. +// +// The exported variables are dispatchers; their default values are the +// scalar reference implementations and may be replaced at init time on +// amd64 when AVX2 / AVX-512 are detected (see d128_addsub_simd_amd64.go). 
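+// Illustrative sketch (not part of this change): how a []types.Decimal128
+// column maps onto the []uint64 view these kernels expect, assuming the
+// two-field layout (B0_63 = lo, B64_127 = hi) referenced below; the helper
+// name addD128 is hypothetical.
+//
+//	func addD128(a, b, r []types.Decimal128) int {
+//		if len(r) == 0 {
+//			return -1 // nothing to do, no overflow
+//		}
+//		av := unsafe.Slice((*uint64)(unsafe.Pointer(&a[0])), 2*len(a))
+//		bv := unsafe.Slice((*uint64)(unsafe.Pointer(&b[0])), 2*len(b))
+//		rv := unsafe.Slice((*uint64)(unsafe.Pointer(&r[0])), 2*len(r))
+//		// Dispatches to scalar, AVX2, or AVX-512 depending on init.
+//		return D128AddChecked(av, bv, rv) // -1, or first overflow index
+//	}
+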
+ +var ( + D128AddUnchecked func(a, b, r []uint64) = scalarD128AddUnchecked + D128SubUnchecked func(a, b, r []uint64) = scalarD128SubUnchecked + D128AddChecked func(a, b, r []uint64) int = scalarD128AddChecked + D128SubChecked func(a, b, r []uint64) int = scalarD128SubChecked + + // Scalar-broadcast variants: scalar is passed as two uint64s + // (slo = low 64 bits, shi = high 64 bits), matching the + // types.Decimal128 in-memory layout (B0_63, B64_127). + D128AddScalarUnchecked func(slo, shi uint64, v, r []uint64) = scalarD128AddScalarUnchecked + D128AddScalarChecked func(slo, shi uint64, v, r []uint64) int = scalarD128AddScalarChecked + D128SubScalarUnchecked func(v []uint64, slo, shi uint64, r []uint64) = scalarD128SubScalarUnchecked + D128SubScalarChecked func(v []uint64, slo, shi uint64, r []uint64) int = scalarD128SubScalarChecked + D128ScalarSubUnchecked func(slo, shi uint64, v, r []uint64) = scalarD128ScalarSubUnchecked + D128ScalarSubChecked func(slo, shi uint64, v, r []uint64) int = scalarD128ScalarSubChecked + + // D128SumReduce sums a contiguous slice of Decimal128 values and returns + // the 128-bit total as (lo, hi). Wraps on overflow (mod 2^128). + D128SumReduce func(v []uint64) (lo, hi uint64) = scalarD128SumReduce +) + +func scalarD128AddUnchecked(a, b, r []uint64) { + n := len(r) / 2 + if len(a) < 2*n || len(b) < 2*n { + return + } + for i := 0; i < n; i++ { + j := i << 1 + lo, c := bits.Add64(a[j], b[j], 0) + hi, _ := bits.Add64(a[j+1], b[j+1], c) + r[j] = lo + r[j+1] = hi + } +} + +func scalarD128SubUnchecked(a, b, r []uint64) { + n := len(r) / 2 + if len(a) < 2*n || len(b) < 2*n { + return + } + for i := 0; i < n; i++ { + j := i << 1 + lo, br := bits.Sub64(a[j], b[j], 0) + hi, _ := bits.Sub64(a[j+1], b[j+1], br) + r[j] = lo + r[j+1] = hi + } +} + +// d128 signed overflow on add: same as 64-bit, evaluated on the high half. +// +// signX == signY && signX != signR ⇔ ((aHi^rHi) &^ (aHi^bHi)) < 0 +func scalarD128AddChecked(a, b, r []uint64) int { + n := len(r) / 2 + if len(a) < 2*n || len(b) < 2*n { + return -1 + } + first := -1 + for i := 0; i < n; i++ { + j := i << 1 + aLo, aHi := a[j], a[j+1] + bLo, bHi := b[j], b[j+1] + lo, c := bits.Add64(aLo, bLo, 0) + hi, _ := bits.Add64(aHi, bHi, c) + r[j] = lo + r[j+1] = hi + if first < 0 { + ah, bh, rh := int64(aHi), int64(bHi), int64(hi) + if (ah^rh)&^(ah^bh) < 0 { + first = i + } + } + } + return first +} + +// d128 signed overflow on sub: +// +// signX != signY && signX != signR ⇔ ((aHi^rHi) & (aHi^bHi)) < 0 +func scalarD128SubChecked(a, b, r []uint64) int { + n := len(r) / 2 + if len(a) < 2*n || len(b) < 2*n { + return -1 + } + first := -1 + for i := 0; i < n; i++ { + j := i << 1 + aLo, aHi := a[j], a[j+1] + bLo, bHi := b[j], b[j+1] + lo, br := bits.Sub64(aLo, bLo, 0) + hi, _ := bits.Sub64(aHi, bHi, br) + r[j] = lo + r[j+1] = hi + if first < 0 { + ah, bh, rh := int64(aHi), int64(bHi), int64(hi) + if (ah^rh)&(ah^bh) < 0 { + first = i + } + } + } + return first +} + +// d128FirstOverflow rescans the first end elements (each = 2 uint64) for +// the first overflow. Used by SIMD checked variants when their accumulated +// mask reports overflow but the scalar tail did not see one. 
+func d128FirstOverflow(a, b []uint64, end int, sub bool) int { + if sub { + for i := 0; i < end; i++ { + j := i << 1 + lo, br := bits.Sub64(a[j], b[j], 0) + hi, _ := bits.Sub64(a[j+1], b[j+1], br) + _ = lo + ah, bh, rh := int64(a[j+1]), int64(b[j+1]), int64(hi) + if (ah^rh)&(ah^bh) < 0 { + return i + } + } + return -1 + } + for i := 0; i < end; i++ { + j := i << 1 + lo, c := bits.Add64(a[j], b[j], 0) + hi, _ := bits.Add64(a[j+1], b[j+1], c) + _ = lo + ah, bh, rh := int64(a[j+1]), int64(b[j+1]), int64(hi) + if (ah^rh)&^(ah^bh) < 0 { + return i + } + } + return -1 +} + +// --------------------------------------------------------------------------- +// Scalar-broadcast reference implementations. +// --------------------------------------------------------------------------- + +func scalarD128AddScalarUnchecked(slo, shi uint64, v, r []uint64) { + n := len(r) / 2 + if len(v) < 2*n { + return + } + for i := 0; i < n; i++ { + j := i << 1 + lo, c := bits.Add64(slo, v[j], 0) + hi, _ := bits.Add64(shi, v[j+1], c) + r[j] = lo + r[j+1] = hi + } +} + +func scalarD128SubScalarUnchecked(v []uint64, slo, shi uint64, r []uint64) { + n := len(r) / 2 + if len(v) < 2*n { + return + } + for i := 0; i < n; i++ { + j := i << 1 + lo, br := bits.Sub64(v[j], slo, 0) + hi, _ := bits.Sub64(v[j+1], shi, br) + r[j] = lo + r[j+1] = hi + } +} + +func scalarD128ScalarSubUnchecked(slo, shi uint64, v, r []uint64) { + n := len(r) / 2 + if len(v) < 2*n { + return + } + for i := 0; i < n; i++ { + j := i << 1 + lo, br := bits.Sub64(slo, v[j], 0) + hi, _ := bits.Sub64(shi, v[j+1], br) + r[j] = lo + r[j+1] = hi + } +} + +func scalarD128AddScalarChecked(slo, shi uint64, v, r []uint64) int { + n := len(r) / 2 + if len(v) < 2*n { + return -1 + } + first := -1 + sh := int64(shi) + for i := 0; i < n; i++ { + j := i << 1 + vLo, vHi := v[j], v[j+1] + lo, c := bits.Add64(slo, vLo, 0) + hi, _ := bits.Add64(shi, vHi, c) + r[j] = lo + r[j+1] = hi + if first < 0 { + vh, rh := int64(vHi), int64(hi) + if (sh^rh)&^(sh^vh) < 0 { + first = i + } + } + } + return first +} + +func scalarD128SubScalarChecked(v []uint64, slo, shi uint64, r []uint64) int { + n := len(r) / 2 + if len(v) < 2*n { + return -1 + } + first := -1 + sh := int64(shi) + for i := 0; i < n; i++ { + j := i << 1 + vLo, vHi := v[j], v[j+1] + lo, br := bits.Sub64(vLo, slo, 0) + hi, _ := bits.Sub64(vHi, shi, br) + r[j] = lo + r[j+1] = hi + if first < 0 { + vh, rh := int64(vHi), int64(hi) + if (vh^rh)&(vh^sh) < 0 { + first = i + } + } + } + return first +} + +func scalarD128ScalarSubChecked(slo, shi uint64, v, r []uint64) int { + n := len(r) / 2 + if len(v) < 2*n { + return -1 + } + first := -1 + sh := int64(shi) + for i := 0; i < n; i++ { + j := i << 1 + vLo, vHi := v[j], v[j+1] + lo, br := bits.Sub64(slo, vLo, 0) + hi, _ := bits.Sub64(shi, vHi, br) + r[j] = lo + r[j+1] = hi + if first < 0 { + vh, rh := int64(vHi), int64(hi) + if (sh^rh)&(sh^vh) < 0 { + first = i + } + } + } + return first +} + +// d128ScalarFirstOverflow rescans first end elements for the first overflow +// in scalar-broadcast operations. kind: 0=AddScalar, 1=SubScalar (v-s), +// 2=ScalarSub (s-v). 
+func d128ScalarFirstOverflow(slo, shi uint64, v []uint64, end int, kind int) int { + sh := int64(shi) + switch kind { + case 0: + for i := 0; i < end; i++ { + j := i << 1 + vLo, vHi := v[j], v[j+1] + _, c := bits.Add64(slo, vLo, 0) + hi, _ := bits.Add64(shi, vHi, c) + vh, rh := int64(vHi), int64(hi) + if (sh^rh)&^(sh^vh) < 0 { + return i + } + } + case 1: + for i := 0; i < end; i++ { + j := i << 1 + vLo, vHi := v[j], v[j+1] + _, br := bits.Sub64(vLo, slo, 0) + hi, _ := bits.Sub64(vHi, shi, br) + vh, rh := int64(vHi), int64(hi) + if (vh^rh)&(vh^sh) < 0 { + return i + } + } + case 2: + for i := 0; i < end; i++ { + j := i << 1 + vLo, vHi := v[j], v[j+1] + _, br := bits.Sub64(slo, vLo, 0) + hi, _ := bits.Sub64(shi, vHi, br) + vh, rh := int64(vHi), int64(hi) + if (sh^rh)&(sh^vh) < 0 { + return i + } + } + } + return -1 +} + +func scalarD128SumReduce(v []uint64) (lo, hi uint64) { + n := len(v) >> 1 + for i := 0; i < n; i++ { + j := i << 1 + var c uint64 + lo, c = bits.Add64(lo, v[j], 0) + hi, _ = bits.Add64(hi, v[j+1], c) + } + return +} diff --git a/pkg/common/simdkernels/d128_addsub_simd_amd64.go b/pkg/common/simdkernels/d128_addsub_simd_amd64.go new file mode 100644 index 0000000000000..e5f2c3357d34d --- /dev/null +++ b/pkg/common/simdkernels/d128_addsub_simd_amd64.go @@ -0,0 +1,1225 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build goexperiment.simd && amd64 + +package simdkernels + +import ( + "math/bits" + "simd/archsimd" + "unsafe" + + "golang.org/x/sys/cpu" +) + +// signBit128 is used to convert unsigned int64 compares to the available +// signed Less by flipping every lane's MSB. Stored as the int64 value +// -1<<63, which has only the sign bit set (writing 1<<63 directly would +// overflow an untyped int constant). 
+const signBit128 int64 = -1 << 63 + +func init() { + switch { + case cpu.X86.HasAVX512: + D128AddUnchecked = avx512D128AddUnchecked + D128SubUnchecked = avx512D128SubUnchecked + D128AddChecked = avx512D128AddChecked + D128SubChecked = avx512D128SubChecked + D128AddScalarUnchecked = avx512D128AddScalarUnchecked + D128SubScalarUnchecked = avx512D128SubScalarUnchecked + D128ScalarSubUnchecked = avx512D128ScalarSubUnchecked + D128AddScalarChecked = avx512D128AddScalarChecked + D128SubScalarChecked = avx512D128SubScalarChecked + D128ScalarSubChecked = avx512D128ScalarSubChecked + D128SumReduce = avx512D128SumReduce + case cpu.X86.HasAVX2: + D128AddUnchecked = avx2D128AddUnchecked + D128SubUnchecked = avx2D128SubUnchecked + D128AddChecked = avx2D128AddChecked + D128SubChecked = avx2D128SubChecked + D128AddScalarUnchecked = avx2D128AddScalarUnchecked + D128SubScalarUnchecked = avx2D128SubScalarUnchecked + D128ScalarSubUnchecked = avx2D128ScalarSubUnchecked + D128AddScalarChecked = avx2D128AddScalarChecked + D128SubScalarChecked = avx2D128SubScalarChecked + D128ScalarSubChecked = avx2D128ScalarSubChecked + D128SumReduce = avx2D128SumReduce + } +} + +// --------------------------------------------------------------------------- +// AVX2 path: each Decimal128 = 2 uint64 = 16 B. We process 4 elements per +// kernel iteration (= 64 B per input). Layout in two Int64x4 loads: +// +// vec0 = [a0.lo, a0.hi, a1.lo, a1.hi] +// vec1 = [a2.lo, a2.hi, a3.lo, a3.hi] +// +// InterleaveLoGrouped/HiGrouped (VPUNPCKLQDQ/VPUNPCKHQDQ) split into: +// +// los = [a0.lo, a2.lo, a1.lo, a3.lo] (= permutation [0,2,1,3] of a*.lo) +// his = [a0.hi, a2.hi, a1.hi, a3.hi] (same permutation of a*.hi) +// +// Both operands and the result use the same permutation, so reinterleaving +// (los, his) with the same two instructions restores the original order. +// Carry: SIMD has no unsigned int64 compare; flip the MSB on both inputs +// and use signed Less. Carry mask is -1 in overflowing lanes, so +// `hi - carryMask` is `hi + 1` exactly where carry is set. +// --------------------------------------------------------------------------- + +// avx2D128AddCarry computes [los, his] = [aLo, aHi] + [bLo, bHi] (128-bit +// across lanes). aLo/bLo/aHi/bHi are already in the deinterleaved permuted +// form. Used by both Unchecked and Checked variants. +// +//go:nosplit +func avx2D128AddCarry(aLo, aHi, bLo, bHi, sb archsimd.Int64x4) (rLo, rHi archsimd.Int64x4) { + rLo = aLo.Add(bLo) + carryMask := rLo.Xor(sb).Less(aLo.Xor(sb)).ToInt64x4() + rHi = aHi.Add(bHi).Sub(carryMask) + return +} + +//go:nosplit +func avx2D128SubBorrow(aLo, aHi, bLo, bHi, sb archsimd.Int64x4) (rLo, rHi archsimd.Int64x4) { + rLo = aLo.Sub(bLo) + // Borrow iff aLo < bLo (unsigned). borrowMask is -1 per borrowing lane, + // and `hi + borrowMask` equals `hi - 1` there. 
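+	// Worked example: aLo=5, bLo=7 wraps rLo to 2^64-2; after the MSB flip
+	// the signed Less sees 5 < 7, that lane's mask becomes -1, and
+	// Add(borrowMask) lowers the high word by exactly one.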
+ borrowMask := aLo.Xor(sb).Less(bLo.Xor(sb)).ToInt64x4() + rHi = aHi.Sub(bHi).Add(borrowMask) + return +} + +func avx2D128AddUnchecked(a, b, r []uint64) { + n := len(r) / 2 + if n == 0 || len(a) < 2*n || len(b) < 2*n { + return + } + pa, pb, pr := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x4(signBit128) + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 16 + a0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off))) + a1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off+32))) + b0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off))) + b1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off+32))) + + aLo := a0.InterleaveLoGrouped(a1) + aHi := a0.InterleaveHiGrouped(a1) + bLo := b0.InterleaveLoGrouped(b1) + bHi := b0.InterleaveHiGrouped(b1) + + rLo, rHi := avx2D128AddCarry(aLo, aHi, bLo, bHi, sb) + + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[4]int64)(unsafe.Add(pr, off))) + r1.Store((*[4]int64)(unsafe.Add(pr, off+32))) + } + for ; i < n; i++ { + j := i << 1 + lo := a[j] + b[j] + var c uint64 + if lo < a[j] { + c = 1 + } + r[j] = lo + r[j+1] = a[j+1] + b[j+1] + c + } +} + +func avx2D128SubUnchecked(a, b, r []uint64) { + n := len(r) / 2 + if n == 0 || len(a) < 2*n || len(b) < 2*n { + return + } + pa, pb, pr := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x4(signBit128) + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 16 + a0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off))) + a1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off+32))) + b0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off))) + b1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off+32))) + + aLo := a0.InterleaveLoGrouped(a1) + aHi := a0.InterleaveHiGrouped(a1) + bLo := b0.InterleaveLoGrouped(b1) + bHi := b0.InterleaveHiGrouped(b1) + + rLo, rHi := avx2D128SubBorrow(aLo, aHi, bLo, bHi, sb) + + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[4]int64)(unsafe.Add(pr, off))) + r1.Store((*[4]int64)(unsafe.Add(pr, off+32))) + } + for ; i < n; i++ { + j := i << 1 + var br uint64 + if a[j] < b[j] { + br = 1 + } + r[j] = a[j] - b[j] + r[j+1] = a[j+1] - b[j+1] - br + } +} + +func avx2D128AddChecked(a, b, r []uint64) int { + n := len(r) / 2 + if n == 0 || len(a) < 2*n || len(b) < 2*n { + return -1 + } + pa, pb, pr := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x4(signBit128) + + var ofAcc archsimd.Int64x4 + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 16 + a0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off))) + a1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off+32))) + b0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off))) + b1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off+32))) + + aLo := a0.InterleaveLoGrouped(a1) + aHi := a0.InterleaveHiGrouped(a1) + bLo := b0.InterleaveLoGrouped(b1) + bHi := b0.InterleaveHiGrouped(b1) + + rLo, rHi := avx2D128AddCarry(aLo, aHi, bLo, bHi, sb) + // 128-bit add overflow predicate is the same as 64-bit, evaluated on + // the high words after carry propagation. 
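+		// A lane overflows iff sign(aHi)==sign(bHi) && sign(rHi)!=sign(aHi),
+		// i.e. ((aHi^rHi) &^ (aHi^bHi)) has its sign bit set; OR-ing that
+		// word into ofAcc means a single post-loop sign test (Less(zero))
+		// detects whether any lane of any iteration overflowed.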
+ ofAcc = ofAcc.Or(aHi.Xor(rHi).AndNot(aHi.Xor(bHi))) + + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[4]int64)(unsafe.Add(pr, off))) + r1.Store((*[4]int64)(unsafe.Add(pr, off+32))) + } + vecEnd := i + + zero := archsimd.BroadcastInt64x4(0) + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + for ; i < n; i++ { + j := i << 1 + aLo, aHi := a[j], a[j+1] + bLo, bHi := b[j], b[j+1] + lo := aLo + bLo + var c uint64 + if lo < aLo { + c = 1 + } + hi := aHi + bHi + c + r[j] = lo + r[j+1] = hi + ah, bh, rh := int64(aHi), int64(bHi), int64(hi) + if (ah^rh)&^(ah^bh) < 0 { + if vecOverflow { + return d128FirstOverflow(a, b, vecEnd, false) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d128FirstOverflow(a, b, vecEnd, false) +} + +func avx2D128SubChecked(a, b, r []uint64) int { + n := len(r) / 2 + if n == 0 || len(a) < 2*n || len(b) < 2*n { + return -1 + } + pa, pb, pr := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x4(signBit128) + + var ofAcc archsimd.Int64x4 + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 16 + a0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off))) + a1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off+32))) + b0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off))) + b1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off+32))) + + aLo := a0.InterleaveLoGrouped(a1) + aHi := a0.InterleaveHiGrouped(a1) + bLo := b0.InterleaveLoGrouped(b1) + bHi := b0.InterleaveHiGrouped(b1) + + rLo, rHi := avx2D128SubBorrow(aLo, aHi, bLo, bHi, sb) + ofAcc = ofAcc.Or(aHi.Xor(rHi).And(aHi.Xor(bHi))) + + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[4]int64)(unsafe.Add(pr, off))) + r1.Store((*[4]int64)(unsafe.Add(pr, off+32))) + } + vecEnd := i + + zero := archsimd.BroadcastInt64x4(0) + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + for ; i < n; i++ { + j := i << 1 + aLo, aHi := a[j], a[j+1] + bLo, bHi := b[j], b[j+1] + var br uint64 + if aLo < bLo { + br = 1 + } + lo := aLo - bLo + hi := aHi - bHi - br + r[j] = lo + r[j+1] = hi + ah, bh, rh := int64(aHi), int64(bHi), int64(hi) + if (ah^rh)&(ah^bh) < 0 { + if vecOverflow { + return d128FirstOverflow(a, b, vecEnd, true) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d128FirstOverflow(a, b, vecEnd, true) +} + +// --------------------------------------------------------------------------- +// AVX-512 path: Int64x8 = 8 lanes = 4 D128 elements per vector. Processes 8 +// elements per kernel iteration (two Int64x8 = 128 B per input). Same +// deinterleave / carry-propagate / reinterleave pattern as AVX2; the only +// changes are the lane width and a wider scalar tail. 
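+//
+// For reference (assuming VPUNPCKLQDQ/VPUNPCKHQDQ keep their per-128-bit-lane
+// behaviour on Int64x8), one iteration's lane picture is:
+//
+//	vec0 = [a0.lo,a0.hi, a1.lo,a1.hi, a2.lo,a2.hi, a3.lo,a3.hi]
+//	vec1 = [a4.lo,a4.hi, a5.lo,a5.hi, a6.lo,a6.hi, a7.lo,a7.hi]
+//	los  = [a0.lo,a4.lo, a1.lo,a5.lo, a2.lo,a6.lo, a3.lo,a7.lo]
+//
+// As on AVX2, re-unpacking (los, his) restores the original element order.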
+// --------------------------------------------------------------------------- + +//go:nosplit +func avx512D128AddCarry(aLo, aHi, bLo, bHi, sb archsimd.Int64x8) (rLo, rHi archsimd.Int64x8) { + rLo = aLo.Add(bLo) + carryMask := rLo.Xor(sb).Less(aLo.Xor(sb)).ToInt64x8() + rHi = aHi.Add(bHi).Sub(carryMask) + return +} + +//go:nosplit +func avx512D128SubBorrow(aLo, aHi, bLo, bHi, sb archsimd.Int64x8) (rLo, rHi archsimd.Int64x8) { + rLo = aLo.Sub(bLo) + borrowMask := aLo.Xor(sb).Less(bLo.Xor(sb)).ToInt64x8() + rHi = aHi.Sub(bHi).Add(borrowMask) + return +} + +func avx512D128AddUnchecked(a, b, r []uint64) { + n := len(r) / 2 + if n == 0 || len(a) < 2*n || len(b) < 2*n { + return + } + pa, pb, pr := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x8(signBit128) + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 16 + a0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off))) + a1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off+64))) + b0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off))) + b1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off+64))) + + aLo := a0.InterleaveLoGrouped(a1) + aHi := a0.InterleaveHiGrouped(a1) + bLo := b0.InterleaveLoGrouped(b1) + bHi := b0.InterleaveHiGrouped(b1) + + rLo, rHi := avx512D128AddCarry(aLo, aHi, bLo, bHi, sb) + + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[8]int64)(unsafe.Add(pr, off))) + r1.Store((*[8]int64)(unsafe.Add(pr, off+64))) + } + for ; i < n; i++ { + j := i << 1 + lo := a[j] + b[j] + var c uint64 + if lo < a[j] { + c = 1 + } + r[j] = lo + r[j+1] = a[j+1] + b[j+1] + c + } +} + +func avx512D128SubUnchecked(a, b, r []uint64) { + n := len(r) / 2 + if n == 0 || len(a) < 2*n || len(b) < 2*n { + return + } + pa, pb, pr := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x8(signBit128) + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 16 + a0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off))) + a1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off+64))) + b0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off))) + b1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off+64))) + + aLo := a0.InterleaveLoGrouped(a1) + aHi := a0.InterleaveHiGrouped(a1) + bLo := b0.InterleaveLoGrouped(b1) + bHi := b0.InterleaveHiGrouped(b1) + + rLo, rHi := avx512D128SubBorrow(aLo, aHi, bLo, bHi, sb) + + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[8]int64)(unsafe.Add(pr, off))) + r1.Store((*[8]int64)(unsafe.Add(pr, off+64))) + } + for ; i < n; i++ { + j := i << 1 + var br uint64 + if a[j] < b[j] { + br = 1 + } + r[j] = a[j] - b[j] + r[j+1] = a[j+1] - b[j+1] - br + } +} + +func avx512D128AddChecked(a, b, r []uint64) int { + n := len(r) / 2 + if n == 0 || len(a) < 2*n || len(b) < 2*n { + return -1 + } + pa, pb, pr := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x8(signBit128) + + var ofAcc archsimd.Int64x8 + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 16 + a0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off))) + a1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off+64))) + b0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off))) + b1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off+64))) + + aLo := a0.InterleaveLoGrouped(a1) + aHi := a0.InterleaveHiGrouped(a1) + bLo := b0.InterleaveLoGrouped(b1) + bHi := b0.InterleaveHiGrouped(b1) + + rLo, rHi := 
avx512D128AddCarry(aLo, aHi, bLo, bHi, sb) + ofAcc = ofAcc.Or(aHi.Xor(rHi).AndNot(aHi.Xor(bHi))) + + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[8]int64)(unsafe.Add(pr, off))) + r1.Store((*[8]int64)(unsafe.Add(pr, off+64))) + } + vecEnd := i + + zero := archsimd.BroadcastInt64x8(0) + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + for ; i < n; i++ { + j := i << 1 + aLo, aHi := a[j], a[j+1] + bLo, bHi := b[j], b[j+1] + lo := aLo + bLo + var c uint64 + if lo < aLo { + c = 1 + } + hi := aHi + bHi + c + r[j] = lo + r[j+1] = hi + ah, bh, rh := int64(aHi), int64(bHi), int64(hi) + if (ah^rh)&^(ah^bh) < 0 { + if vecOverflow { + return d128FirstOverflow(a, b, vecEnd, false) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d128FirstOverflow(a, b, vecEnd, false) +} + +func avx512D128SubChecked(a, b, r []uint64) int { + n := len(r) / 2 + if n == 0 || len(a) < 2*n || len(b) < 2*n { + return -1 + } + pa, pb, pr := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x8(signBit128) + + var ofAcc archsimd.Int64x8 + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 16 + a0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off))) + a1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off+64))) + b0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off))) + b1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off+64))) + + aLo := a0.InterleaveLoGrouped(a1) + aHi := a0.InterleaveHiGrouped(a1) + bLo := b0.InterleaveLoGrouped(b1) + bHi := b0.InterleaveHiGrouped(b1) + + rLo, rHi := avx512D128SubBorrow(aLo, aHi, bLo, bHi, sb) + ofAcc = ofAcc.Or(aHi.Xor(rHi).And(aHi.Xor(bHi))) + + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[8]int64)(unsafe.Add(pr, off))) + r1.Store((*[8]int64)(unsafe.Add(pr, off+64))) + } + vecEnd := i + + zero := archsimd.BroadcastInt64x8(0) + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + for ; i < n; i++ { + j := i << 1 + aLo, aHi := a[j], a[j+1] + bLo, bHi := b[j], b[j+1] + var br uint64 + if aLo < bLo { + br = 1 + } + lo := aLo - bLo + hi := aHi - bHi - br + r[j] = lo + r[j+1] = hi + ah, bh, rh := int64(aHi), int64(bHi), int64(hi) + if (ah^rh)&(ah^bh) < 0 { + if vecOverflow { + return d128FirstOverflow(a, b, vecEnd, true) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d128FirstOverflow(a, b, vecEnd, true) +} + +// --------------------------------------------------------------------------- +// AVX2 broadcast variants. Scalar (slo, shi) is broadcast once outside the +// loop; only the vector operand is loaded each iteration. Layout/permutation +// notes from the vec+vec path apply identically — the broadcast vectors are +// uniform, so deinterleaving them is a no-op (same value in every lane).
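+// (With a broadcast operand every lane already holds the same value, so it
+// is valid on either side of the permuted los/his vectors; the (bLo, bHi)
+// pair is therefore built once and reused unchanged each iteration.)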
+// --------------------------------------------------------------------------- + +func avx2D128AddScalarUnchecked(slo, shi uint64, v, r []uint64) { + n := len(r) / 2 + if n == 0 || len(v) < 2*n { + return + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x4(signBit128) + bLo := archsimd.BroadcastInt64x4(int64(slo)) + bHi := archsimd.BroadcastInt64x4(int64(shi)) + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 16 + v0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+32))) + + aLo := v0.InterleaveLoGrouped(v1) + aHi := v0.InterleaveHiGrouped(v1) + + rLo, rHi := avx2D128AddCarry(aLo, aHi, bLo, bHi, sb) + + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[4]int64)(unsafe.Add(pr, off))) + r1.Store((*[4]int64)(unsafe.Add(pr, off+32))) + } + for ; i < n; i++ { + j := i << 1 + lo := slo + v[j] + var c uint64 + if lo < slo { + c = 1 + } + r[j] = lo + r[j+1] = shi + v[j+1] + c + } +} + +func avx2D128SubScalarUnchecked(v []uint64, slo, shi uint64, r []uint64) { + n := len(r) / 2 + if n == 0 || len(v) < 2*n { + return + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x4(signBit128) + bLo := archsimd.BroadcastInt64x4(int64(slo)) + bHi := archsimd.BroadcastInt64x4(int64(shi)) + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 16 + v0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+32))) + + aLo := v0.InterleaveLoGrouped(v1) + aHi := v0.InterleaveHiGrouped(v1) + + rLo, rHi := avx2D128SubBorrow(aLo, aHi, bLo, bHi, sb) + + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[4]int64)(unsafe.Add(pr, off))) + r1.Store((*[4]int64)(unsafe.Add(pr, off+32))) + } + for ; i < n; i++ { + j := i << 1 + var br uint64 + if v[j] < slo { + br = 1 + } + r[j] = v[j] - slo + r[j+1] = v[j+1] - shi - br + } +} + +func avx2D128ScalarSubUnchecked(slo, shi uint64, v, r []uint64) { + n := len(r) / 2 + if n == 0 || len(v) < 2*n { + return + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x4(signBit128) + aLo := archsimd.BroadcastInt64x4(int64(slo)) + aHi := archsimd.BroadcastInt64x4(int64(shi)) + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 16 + v0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+32))) + + bLo := v0.InterleaveLoGrouped(v1) + bHi := v0.InterleaveHiGrouped(v1) + + rLo, rHi := avx2D128SubBorrow(aLo, aHi, bLo, bHi, sb) + + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[4]int64)(unsafe.Add(pr, off))) + r1.Store((*[4]int64)(unsafe.Add(pr, off+32))) + } + for ; i < n; i++ { + j := i << 1 + var br uint64 + if slo < v[j] { + br = 1 + } + r[j] = slo - v[j] + r[j+1] = shi - v[j+1] - br + } +} + +func avx2D128AddScalarChecked(slo, shi uint64, v, r []uint64) int { + n := len(r) / 2 + if n == 0 || len(v) < 2*n { + return -1 + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x4(signBit128) + bLo := archsimd.BroadcastInt64x4(int64(slo)) + bHi := archsimd.BroadcastInt64x4(int64(shi)) + + var ofAcc archsimd.Int64x4 + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 16 + v0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+32))) + + aLo := 
v0.InterleaveLoGrouped(v1) + aHi := v0.InterleaveHiGrouped(v1) + + rLo, rHi := avx2D128AddCarry(aLo, aHi, bLo, bHi, sb) + ofAcc = ofAcc.Or(aHi.Xor(rHi).AndNot(aHi.Xor(bHi))) + + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[4]int64)(unsafe.Add(pr, off))) + r1.Store((*[4]int64)(unsafe.Add(pr, off+32))) + } + vecEnd := i + + zero := archsimd.BroadcastInt64x4(0) + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + sh := int64(shi) + for ; i < n; i++ { + j := i << 1 + vLo, vHi := v[j], v[j+1] + lo := slo + vLo + var c uint64 + if lo < slo { + c = 1 + } + hi := shi + vHi + c + r[j] = lo + r[j+1] = hi + vh, rh := int64(vHi), int64(hi) + if (sh^rh)&^(sh^vh) < 0 { + if vecOverflow { + return d128ScalarFirstOverflow(slo, shi, v, vecEnd, 0) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d128ScalarFirstOverflow(slo, shi, v, vecEnd, 0) +} + +func avx2D128SubScalarChecked(v []uint64, slo, shi uint64, r []uint64) int { + n := len(r) / 2 + if n == 0 || len(v) < 2*n { + return -1 + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x4(signBit128) + bLo := archsimd.BroadcastInt64x4(int64(slo)) + bHi := archsimd.BroadcastInt64x4(int64(shi)) + + var ofAcc archsimd.Int64x4 + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 16 + v0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+32))) + + aLo := v0.InterleaveLoGrouped(v1) + aHi := v0.InterleaveHiGrouped(v1) + + rLo, rHi := avx2D128SubBorrow(aLo, aHi, bLo, bHi, sb) + ofAcc = ofAcc.Or(aHi.Xor(rHi).And(aHi.Xor(bHi))) + + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[4]int64)(unsafe.Add(pr, off))) + r1.Store((*[4]int64)(unsafe.Add(pr, off+32))) + } + vecEnd := i + + zero := archsimd.BroadcastInt64x4(0) + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + sh := int64(shi) + for ; i < n; i++ { + j := i << 1 + vLo, vHi := v[j], v[j+1] + var br uint64 + if vLo < slo { + br = 1 + } + lo := vLo - slo + hi := vHi - shi - br + r[j] = lo + r[j+1] = hi + vh, rh := int64(vHi), int64(hi) + if (vh^rh)&(vh^sh) < 0 { + if vecOverflow { + return d128ScalarFirstOverflow(slo, shi, v, vecEnd, 1) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d128ScalarFirstOverflow(slo, shi, v, vecEnd, 1) +} + +func avx2D128ScalarSubChecked(slo, shi uint64, v, r []uint64) int { + n := len(r) / 2 + if n == 0 || len(v) < 2*n { + return -1 + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x4(signBit128) + aLo := archsimd.BroadcastInt64x4(int64(slo)) + aHi := archsimd.BroadcastInt64x4(int64(shi)) + + var ofAcc archsimd.Int64x4 + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 16 + v0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+32))) + + bLo := v0.InterleaveLoGrouped(v1) + bHi := v0.InterleaveHiGrouped(v1) + + rLo, rHi := avx2D128SubBorrow(aLo, aHi, bLo, bHi, sb) + ofAcc = ofAcc.Or(aHi.Xor(rHi).And(aHi.Xor(bHi))) + + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[4]int64)(unsafe.Add(pr, off))) + r1.Store((*[4]int64)(unsafe.Add(pr, off+32))) + } + vecEnd := i + + zero := archsimd.BroadcastInt64x4(0) + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + sh := int64(shi) + for ; i < n; i++ { + j := i << 1 + vLo, vHi := v[j], v[j+1] + var br uint64 + if slo < vLo { + br = 1 + } + lo := slo - 
vLo + hi := shi - vHi - br + r[j] = lo + r[j+1] = hi + vh, rh := int64(vHi), int64(hi) + if (sh^rh)&(sh^vh) < 0 { + if vecOverflow { + return d128ScalarFirstOverflow(slo, shi, v, vecEnd, 2) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d128ScalarFirstOverflow(slo, shi, v, vecEnd, 2) +} + +// --------------------------------------------------------------------------- +// AVX-512 broadcast variants. Same structure as AVX2 with Int64x8 lanes +// (8 D128 elements per kernel iteration). +// --------------------------------------------------------------------------- + +func avx512D128AddScalarUnchecked(slo, shi uint64, v, r []uint64) { + n := len(r) / 2 + if n == 0 || len(v) < 2*n { + return + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x8(signBit128) + bLo := archsimd.BroadcastInt64x8(int64(slo)) + bHi := archsimd.BroadcastInt64x8(int64(shi)) + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 16 + v0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+64))) + + aLo := v0.InterleaveLoGrouped(v1) + aHi := v0.InterleaveHiGrouped(v1) + + rLo, rHi := avx512D128AddCarry(aLo, aHi, bLo, bHi, sb) + + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[8]int64)(unsafe.Add(pr, off))) + r1.Store((*[8]int64)(unsafe.Add(pr, off+64))) + } + for ; i < n; i++ { + j := i << 1 + lo := slo + v[j] + var c uint64 + if lo < slo { + c = 1 + } + r[j] = lo + r[j+1] = shi + v[j+1] + c + } +} + +func avx512D128SubScalarUnchecked(v []uint64, slo, shi uint64, r []uint64) { + n := len(r) / 2 + if n == 0 || len(v) < 2*n { + return + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x8(signBit128) + bLo := archsimd.BroadcastInt64x8(int64(slo)) + bHi := archsimd.BroadcastInt64x8(int64(shi)) + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 16 + v0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+64))) + + aLo := v0.InterleaveLoGrouped(v1) + aHi := v0.InterleaveHiGrouped(v1) + + rLo, rHi := avx512D128SubBorrow(aLo, aHi, bLo, bHi, sb) + + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[8]int64)(unsafe.Add(pr, off))) + r1.Store((*[8]int64)(unsafe.Add(pr, off+64))) + } + for ; i < n; i++ { + j := i << 1 + var br uint64 + if v[j] < slo { + br = 1 + } + r[j] = v[j] - slo + r[j+1] = v[j+1] - shi - br + } +} + +func avx512D128ScalarSubUnchecked(slo, shi uint64, v, r []uint64) { + n := len(r) / 2 + if n == 0 || len(v) < 2*n { + return + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x8(signBit128) + aLo := archsimd.BroadcastInt64x8(int64(slo)) + aHi := archsimd.BroadcastInt64x8(int64(shi)) + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 16 + v0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+64))) + + bLo := v0.InterleaveLoGrouped(v1) + bHi := v0.InterleaveHiGrouped(v1) + + rLo, rHi := avx512D128SubBorrow(aLo, aHi, bLo, bHi, sb) + + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[8]int64)(unsafe.Add(pr, off))) + r1.Store((*[8]int64)(unsafe.Add(pr, off+64))) + } + for ; i < n; i++ { + j := i << 1 + var br uint64 + if slo < v[j] { + br = 1 + } + r[j] = slo - v[j] + r[j+1] = shi - v[j+1] - br + } +} + +func avx512D128AddScalarChecked(slo, shi uint64, 
v, r []uint64) int { + n := len(r) / 2 + if n == 0 || len(v) < 2*n { + return -1 + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x8(signBit128) + bLo := archsimd.BroadcastInt64x8(int64(slo)) + bHi := archsimd.BroadcastInt64x8(int64(shi)) + + var ofAcc archsimd.Int64x8 + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 16 + v0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+64))) + + aLo := v0.InterleaveLoGrouped(v1) + aHi := v0.InterleaveHiGrouped(v1) + + rLo, rHi := avx512D128AddCarry(aLo, aHi, bLo, bHi, sb) + ofAcc = ofAcc.Or(aHi.Xor(rHi).AndNot(aHi.Xor(bHi))) + + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[8]int64)(unsafe.Add(pr, off))) + r1.Store((*[8]int64)(unsafe.Add(pr, off+64))) + } + vecEnd := i + + zero := archsimd.BroadcastInt64x8(0) + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + sh := int64(shi) + for ; i < n; i++ { + j := i << 1 + vLo, vHi := v[j], v[j+1] + lo := slo + vLo + var c uint64 + if lo < slo { + c = 1 + } + hi := shi + vHi + c + r[j] = lo + r[j+1] = hi + vh, rh := int64(vHi), int64(hi) + if (sh^rh)&^(sh^vh) < 0 { + if vecOverflow { + return d128ScalarFirstOverflow(slo, shi, v, vecEnd, 0) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d128ScalarFirstOverflow(slo, shi, v, vecEnd, 0) +} + +func avx512D128SubScalarChecked(v []uint64, slo, shi uint64, r []uint64) int { + n := len(r) / 2 + if n == 0 || len(v) < 2*n { + return -1 + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x8(signBit128) + bLo := archsimd.BroadcastInt64x8(int64(slo)) + bHi := archsimd.BroadcastInt64x8(int64(shi)) + + var ofAcc archsimd.Int64x8 + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 16 + v0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+64))) + + aLo := v0.InterleaveLoGrouped(v1) + aHi := v0.InterleaveHiGrouped(v1) + + rLo, rHi := avx512D128SubBorrow(aLo, aHi, bLo, bHi, sb) + ofAcc = ofAcc.Or(aHi.Xor(rHi).And(aHi.Xor(bHi))) + + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[8]int64)(unsafe.Add(pr, off))) + r1.Store((*[8]int64)(unsafe.Add(pr, off+64))) + } + vecEnd := i + + zero := archsimd.BroadcastInt64x8(0) + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + sh := int64(shi) + for ; i < n; i++ { + j := i << 1 + vLo, vHi := v[j], v[j+1] + var br uint64 + if vLo < slo { + br = 1 + } + lo := vLo - slo + hi := vHi - shi - br + r[j] = lo + r[j+1] = hi + vh, rh := int64(vHi), int64(hi) + if (vh^rh)&(vh^sh) < 0 { + if vecOverflow { + return d128ScalarFirstOverflow(slo, shi, v, vecEnd, 1) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d128ScalarFirstOverflow(slo, shi, v, vecEnd, 1) +} + +func avx512D128ScalarSubChecked(slo, shi uint64, v, r []uint64) int { + n := len(r) / 2 + if n == 0 || len(v) < 2*n { + return -1 + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x8(signBit128) + aLo := archsimd.BroadcastInt64x8(int64(slo)) + aHi := archsimd.BroadcastInt64x8(int64(shi)) + + var ofAcc archsimd.Int64x8 + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 16 + v0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+64))) + + bLo := v0.InterleaveLoGrouped(v1) + bHi := v0.InterleaveHiGrouped(v1) + + rLo,
rHi := avx512D128SubBorrow(aLo, aHi, bLo, bHi, sb) + ofAcc = ofAcc.Or(aHi.Xor(rHi).And(aHi.Xor(bHi))) + + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[8]int64)(unsafe.Add(pr, off))) + r1.Store((*[8]int64)(unsafe.Add(pr, off+64))) + } + vecEnd := i + + zero := archsimd.BroadcastInt64x8(0) + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + sh := int64(shi) + for ; i < n; i++ { + j := i << 1 + vLo, vHi := v[j], v[j+1] + var br uint64 + if slo < vLo { + br = 1 + } + lo := slo - vLo + hi := shi - vHi - br + r[j] = lo + r[j+1] = hi + vh, rh := int64(vHi), int64(hi) + if (sh^rh)&(sh^vh) < 0 { + if vecOverflow { + return d128ScalarFirstOverflow(slo, shi, v, vecEnd, 2) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d128ScalarFirstOverflow(slo, shi, v, vecEnd, 2) +} + +// avx2D128SumReduce sums a slice of Decimal128 values laid out as +// [lo, hi, lo, hi, ...] and returns the 128-bit total (lo, hi). +// Wraps on overflow (mod 2^128). +func avx2D128SumReduce(v []uint64) (uint64, uint64) { + n := len(v) >> 1 + if n == 0 { + return 0, 0 + } + pv := unsafe.Pointer(&v[0]) + sb := archsimd.BroadcastInt64x4(signBit128) + + zero := archsimd.BroadcastInt64x4(0) + accLo := zero + accHi := zero + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 16 + v0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+32))) + bLo := v0.InterleaveLoGrouped(v1) + bHi := v0.InterleaveHiGrouped(v1) + + newLo := accLo.Add(bLo) + // Carry mask: -1 in lanes where unsigned newLo < accLo (wrap-around). + carryMask := newLo.Xor(sb).Less(accLo.Xor(sb)).ToInt64x4() + accHi = accHi.Add(bHi).Sub(carryMask) + accLo = newLo + } + + // Horizontal reduce of 4 partial 128-bit sums. + var loBuf, hiBuf [4]int64 + accLo.Store(&loBuf) + accHi.Store(&hiBuf) + var totLo, totHi uint64 + for k := 0; k < 4; k++ { + var c uint64 + totLo, c = bits.Add64(totLo, uint64(loBuf[k]), 0) + totHi, _ = bits.Add64(totHi, uint64(hiBuf[k]), c) + } + + // Tail (n%4 elements). + for ; i < n; i++ { + j := i << 1 + var c uint64 + totLo, c = bits.Add64(totLo, v[j], 0) + totHi, _ = bits.Add64(totHi, v[j+1], c) + } + return totLo, totHi +} + +// avx512D128SumReduce: same as AVX2 but 8 elements per iteration. 
+func avx512D128SumReduce(v []uint64) (uint64, uint64) { + n := len(v) >> 1 + if n == 0 { + return 0, 0 + } + pv := unsafe.Pointer(&v[0]) + sb := archsimd.BroadcastInt64x8(signBit128) + + zero := archsimd.BroadcastInt64x8(0) + accLo := zero + accHi := zero + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 16 + v0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+64))) + bLo := v0.InterleaveLoGrouped(v1) + bHi := v0.InterleaveHiGrouped(v1) + + newLo := accLo.Add(bLo) + carryMask := newLo.Xor(sb).Less(accLo.Xor(sb)).ToInt64x8() + accHi = accHi.Add(bHi).Sub(carryMask) + accLo = newLo + } + + var loBuf, hiBuf [8]int64 + accLo.Store(&loBuf) + accHi.Store(&hiBuf) + var totLo, totHi uint64 + for k := 0; k < 8; k++ { + var c uint64 + totLo, c = bits.Add64(totLo, uint64(loBuf[k]), 0) + totHi, _ = bits.Add64(totHi, uint64(hiBuf[k]), c) + } + + for ; i < n; i++ { + j := i << 1 + var c uint64 + totLo, c = bits.Add64(totLo, v[j], 0) + totHi, _ = bits.Add64(totHi, v[j+1], c) + } + return totLo, totHi +} diff --git a/pkg/common/simdkernels/d128_addsub_test.go b/pkg/common/simdkernels/d128_addsub_test.go new file mode 100644 index 0000000000000..57d9e44f4b0f1 --- /dev/null +++ b/pkg/common/simdkernels/d128_addsub_test.go @@ -0,0 +1,745 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build goexperiment.simd && amd64 + +package simdkernels + +import ( + "math" + "math/rand/v2" + "strconv" + "testing" + + "golang.org/x/sys/cpu" +) + +// All slice lengths below count Decimal128 elements (each backed by 2 uint64). + +type d128UncheckedImpl struct { + name string + fn func(a, b, r []uint64) +} + +type d128CheckedImpl struct { + name string + fn func(a, b, r []uint64) int +} + +func d128Sizes() []int { + return []int{0, 1, 3, 4, 5, 7, 8, 9, 15, 16, 17, 31, 32, 33, 35, 63, 64, 127, 128, 1023, 4096} +} + +func makeRandD128(n int, seed uint64) []uint64 { + rng := rand.New(rand.NewPCG(seed, seed^0xDEADBEEFCAFEBABE)) + out := make([]uint64, 2*n) + for i := range out { + out[i] = rng.Uint64() + } + return out +} + +// makeRand128SmallSigned produces N elements whose high words have their +// top two bits cleared, so every value is non-negative and below 2^126; +// the sum or difference of any two such values stays strictly inside the +// signed 128-bit range, so add/sub cannot overflow.
+func makeRand128SmallSigned(n int, seed uint64) []uint64 { + out := makeRandD128(n, seed) + for i := 1; i < len(out); i += 2 { + out[i] &= 0x3FFFFFFFFFFFFFFF + } + return out +} + +func TestD128AddVariants(t *testing.T) { + impls := []d128UncheckedImpl{ + {"scalar", scalarD128AddUnchecked}, + {"avx2", avx2D128AddUnchecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d128UncheckedImpl{"avx512", avx512D128AddUnchecked}) + } + for _, n := range d128Sizes() { + a := makeRandD128(n, uint64(n)*7+1) + b := makeRandD128(n, uint64(n)*11+3) + want := make([]uint64, 2*n) + scalarD128AddUnchecked(a, b, want) + for _, impl := range impls { + got := make([]uint64, 2*n) + impl.fn(a, b, got) + for i := 0; i < 2*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d slot=%d: got %x want %x", impl.name, n, i, got[i], want[i]) + } + } + } + } +} + +func TestD128SubVariants(t *testing.T) { + impls := []d128UncheckedImpl{ + {"scalar", scalarD128SubUnchecked}, + {"avx2", avx2D128SubUnchecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d128UncheckedImpl{"avx512", avx512D128SubUnchecked}) + } + for _, n := range d128Sizes() { + a := makeRandD128(n, uint64(n)*13+5) + b := makeRandD128(n, uint64(n)*17+9) + want := make([]uint64, 2*n) + scalarD128SubUnchecked(a, b, want) + for _, impl := range impls { + got := make([]uint64, 2*n) + impl.fn(a, b, got) + for i := 0; i < 2*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d slot=%d: got %x want %x", impl.name, n, i, got[i], want[i]) + } + } + } + } +} + +func TestD128AddCheckedVariants(t *testing.T) { + impls := []d128CheckedImpl{ + {"scalar", scalarD128AddChecked}, + {"avx2", avx2D128AddChecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d128CheckedImpl{"avx512", avx512D128AddChecked}) + } + + // 1) No-overflow random inputs. + for _, n := range d128Sizes() { + a := makeRand128SmallSigned(n, uint64(n)*19+7) + b := makeRand128SmallSigned(n, uint64(n)*23+11) + want := make([]uint64, 2*n) + if got := scalarD128AddChecked(a, b, want); got != -1 { + t.Fatalf("setup: scalar overflow at %d for masked input n=%d", got, n) + } + for _, impl := range impls { + got := make([]uint64, 2*n) + if idx := impl.fn(a, b, got); idx != -1 { + t.Fatalf("%s n=%d: spurious overflow at %d", impl.name, n, idx) + } + for i := 0; i < 2*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d slot=%d: got %x want %x", impl.name, n, i, got[i], want[i]) + } + } + } + } + + // 2) Inject a single overflow (MaxInt128 + 1) at varying positions. + for _, n := range []int{4, 8, 9, 16, 17, 33, 35, 64} { + for _, pos := range []int{0, 1, 3, 4, 7, 8, n - 1} { + if pos < 0 || pos >= n { + continue + } + a := make([]uint64, 2*n) + b := make([]uint64, 2*n) + j := pos << 1 + a[j] = math.MaxUint64 + a[j+1] = uint64(math.MaxInt64) // a = MaxInt128 (positive max) + b[j] = 1 + b[j+1] = 0 // b = 1 + for _, impl := range impls { + got := make([]uint64, 2*n) + idx := impl.fn(a, b, got) + if idx != pos { + t.Fatalf("%s n=%d inject pos=%d: got idx %d", impl.name, n, pos, idx) + } + } + } + } + + // 3) Carry-propagation correctness: aLo = MaxUint64, aHi small; bLo = 1. + // Result should have lo = 0, hi = aHi+1 — no signed overflow. 
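+	// (Per element: lo wraps MaxUint64+1 to 0 with carry out; hi becomes
+	// i+1, far below MaxInt64, so the checked kernels must stay silent.)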
+ for _, n := range []int{4, 8, 16, 17, 33} { + a := make([]uint64, 2*n) + b := make([]uint64, 2*n) + for i := 0; i < n; i++ { + j := i << 1 + a[j] = math.MaxUint64 + a[j+1] = uint64(i) + b[j] = 1 + b[j+1] = 0 + } + want := make([]uint64, 2*n) + scalarD128AddChecked(a, b, want) + for _, impl := range impls { + got := make([]uint64, 2*n) + if idx := impl.fn(a, b, got); idx != -1 { + t.Fatalf("%s carry n=%d: spurious overflow at %d", impl.name, n, idx) + } + for i := 0; i < 2*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s carry n=%d slot=%d: got %x want %x", impl.name, n, i, got[i], want[i]) + } + } + } + } +} + +func TestD128SubCheckedVariants(t *testing.T) { + impls := []d128CheckedImpl{ + {"scalar", scalarD128SubChecked}, + {"avx2", avx2D128SubChecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d128CheckedImpl{"avx512", avx512D128SubChecked}) + } + + for _, n := range d128Sizes() { + a := makeRand128SmallSigned(n, uint64(n)*29+13) + b := makeRand128SmallSigned(n, uint64(n)*31+17) + want := make([]uint64, 2*n) + if got := scalarD128SubChecked(a, b, want); got != -1 { + t.Fatalf("setup: scalar overflow at %d for n=%d", got, n) + } + for _, impl := range impls { + got := make([]uint64, 2*n) + if idx := impl.fn(a, b, got); idx != -1 { + t.Fatalf("%s n=%d: spurious overflow at %d", impl.name, n, idx) + } + for i := 0; i < 2*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d slot=%d: got %x want %x", impl.name, n, i, got[i], want[i]) + } + } + } + } + + // Inject MinInt128 - 1 overflow at varying positions. + for _, n := range []int{4, 8, 9, 16, 17, 33, 35, 64} { + for _, pos := range []int{0, 1, 3, 4, 7, 8, n - 1} { + if pos < 0 || pos >= n { + continue + } + a := make([]uint64, 2*n) + b := make([]uint64, 2*n) + j := pos << 1 + a[j] = 0 + a[j+1] = 1 << 63 // a = MinInt128 + b[j] = 1 + b[j+1] = 0 // b = 1 + for _, impl := range impls { + got := make([]uint64, 2*n) + idx := impl.fn(a, b, got) + if idx != pos { + t.Fatalf("%s n=%d inject pos=%d: got idx %d", impl.name, n, pos, idx) + } + } + } + } + + // Borrow propagation: aLo = 0, aHi small; bLo = 1 ⇒ lo=Max, hi=aHi-1. 
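+	// (Per element: lo wraps 0-1 to MaxUint64 with a borrow; hi becomes
+	// (i+10)-0-1 = i+9, still positive, so no overflow may be reported.)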
+ for _, n := range []int{4, 8, 16, 17, 33} { + a := make([]uint64, 2*n) + b := make([]uint64, 2*n) + for i := 0; i < n; i++ { + j := i << 1 + a[j] = 0 + a[j+1] = uint64(i + 10) // safely positive and > 0 after borrow + b[j] = 1 + b[j+1] = 0 + } + want := make([]uint64, 2*n) + scalarD128SubChecked(a, b, want) + for _, impl := range impls { + got := make([]uint64, 2*n) + if idx := impl.fn(a, b, got); idx != -1 { + t.Fatalf("%s borrow n=%d: spurious overflow at %d", impl.name, n, idx) + } + for i := 0; i < 2*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s borrow n=%d slot=%d: got %x want %x", impl.name, n, i, got[i], want[i]) + } + } + } + } +} + +// --------------------------------------------------------------------------- +// Benchmarks +// --------------------------------------------------------------------------- + +var d128BenchSizes = []int{32, 128, 512, 2048, 8192} + +func benchD128Unchecked(b *testing.B, fn func(a, bb, r []uint64), n int) { + a := makeRandD128(n, 1) + bb := makeRandD128(n, 2) + r := make([]uint64, 2*n) + b.SetBytes(int64(n) * 16 * 3) + b.ResetTimer() + for i := 0; i < b.N; i++ { + fn(a, bb, r) + } +} + +func benchD128Checked(b *testing.B, fn func(a, bb, r []uint64) int, n int) { + a := makeRand128SmallSigned(n, 1) + bb := makeRand128SmallSigned(n, 2) + r := make([]uint64, 2*n) + b.SetBytes(int64(n) * 16 * 3) + b.ResetTimer() + for i := 0; i < b.N; i++ { + _ = fn(a, bb, r) + } +} + +func BenchmarkD128AddUnchecked(b *testing.B) { + for _, n := range d128BenchSizes { + b.Run("scalar/n="+strconv.Itoa(n), func(b *testing.B) { benchD128Unchecked(b, scalarD128AddUnchecked, n) }) + b.Run("avx2/n="+strconv.Itoa(n), func(b *testing.B) { benchD128Unchecked(b, avx2D128AddUnchecked, n) }) + if cpu.X86.HasAVX512 { + b.Run("avx512/n="+strconv.Itoa(n), func(b *testing.B) { benchD128Unchecked(b, avx512D128AddUnchecked, n) }) + } + } +} + +func BenchmarkD128SubUnchecked(b *testing.B) { + for _, n := range d128BenchSizes { + b.Run("scalar/n="+strconv.Itoa(n), func(b *testing.B) { benchD128Unchecked(b, scalarD128SubUnchecked, n) }) + b.Run("avx2/n="+strconv.Itoa(n), func(b *testing.B) { benchD128Unchecked(b, avx2D128SubUnchecked, n) }) + if cpu.X86.HasAVX512 { + b.Run("avx512/n="+strconv.Itoa(n), func(b *testing.B) { benchD128Unchecked(b, avx512D128SubUnchecked, n) }) + } + } +} + +func BenchmarkD128AddChecked(b *testing.B) { + for _, n := range d128BenchSizes { + b.Run("scalar/n="+strconv.Itoa(n), func(b *testing.B) { benchD128Checked(b, scalarD128AddChecked, n) }) + b.Run("avx2/n="+strconv.Itoa(n), func(b *testing.B) { benchD128Checked(b, avx2D128AddChecked, n) }) + if cpu.X86.HasAVX512 { + b.Run("avx512/n="+strconv.Itoa(n), func(b *testing.B) { benchD128Checked(b, avx512D128AddChecked, n) }) + } + } +} + +func BenchmarkD128SubChecked(b *testing.B) { + for _, n := range d128BenchSizes { + b.Run("scalar/n="+strconv.Itoa(n), func(b *testing.B) { benchD128Checked(b, scalarD128SubChecked, n) }) + b.Run("avx2/n="+strconv.Itoa(n), func(b *testing.B) { benchD128Checked(b, avx2D128SubChecked, n) }) + if cpu.X86.HasAVX512 { + b.Run("avx512/n="+strconv.Itoa(n), func(b *testing.B) { benchD128Checked(b, avx512D128SubChecked, n) }) + } + } +} + +// --------------------------------------------------------------------------- +// Scalar-broadcast tests. 
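+// Each operator is exercised as scalar+vector, vector-scalar, and
+// scalar-vector, matching the dispatcher set declared in d128_addsub.go.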
+// --------------------------------------------------------------------------- + +type d128ScalarVUImpl struct { + name string + fn func(slo, shi uint64, v, r []uint64) +} + +type d128ScalarVCImpl struct { + name string + fn func(slo, shi uint64, v, r []uint64) int +} + +type d128VScalarUImpl struct { + name string + fn func(v []uint64, slo, shi uint64, r []uint64) +} + +type d128VScalarCImpl struct { + name string + fn func(v []uint64, slo, shi uint64, r []uint64) int +} + +func d128Scalars() []struct{ lo, hi uint64 } { + return []struct{ lo, hi uint64 }{ + {0, 0}, + {1, 0}, + {math.MaxUint64, 0}, + {0, 1}, + {0xDEADBEEFCAFEBABE, 0x123456789ABCDEF0}, + {math.MaxUint64, uint64(math.MaxInt64)}, + {0, 1 << 63}, + } +} + +func TestD128AddScalarVariants(t *testing.T) { + impls := []d128ScalarVUImpl{ + {"scalar", scalarD128AddScalarUnchecked}, + {"avx2", avx2D128AddScalarUnchecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d128ScalarVUImpl{"avx512", avx512D128AddScalarUnchecked}) + } + for _, n := range d128Sizes() { + v := makeRandD128(n, uint64(n)*37+1) + for si, s := range d128Scalars() { + want := make([]uint64, 2*n) + scalarD128AddScalarUnchecked(s.lo, s.hi, v, want) + for _, impl := range impls { + got := make([]uint64, 2*n) + impl.fn(s.lo, s.hi, v, got) + for i := 0; i < 2*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d scalar#%d slot=%d: got %x want %x", impl.name, n, si, i, got[i], want[i]) + } + } + } + } + } +} + +func TestD128SubScalarVariants(t *testing.T) { + impls := []d128VScalarUImpl{ + {"scalar", scalarD128SubScalarUnchecked}, + {"avx2", avx2D128SubScalarUnchecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d128VScalarUImpl{"avx512", avx512D128SubScalarUnchecked}) + } + for _, n := range d128Sizes() { + v := makeRandD128(n, uint64(n)*41+3) + for si, s := range d128Scalars() { + want := make([]uint64, 2*n) + scalarD128SubScalarUnchecked(v, s.lo, s.hi, want) + for _, impl := range impls { + got := make([]uint64, 2*n) + impl.fn(v, s.lo, s.hi, got) + for i := 0; i < 2*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d scalar#%d slot=%d: got %x want %x", impl.name, n, si, i, got[i], want[i]) + } + } + } + } + } +} + +func TestD128ScalarSubVariants(t *testing.T) { + impls := []d128ScalarVUImpl{ + {"scalar", scalarD128ScalarSubUnchecked}, + {"avx2", avx2D128ScalarSubUnchecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d128ScalarVUImpl{"avx512", avx512D128ScalarSubUnchecked}) + } + for _, n := range d128Sizes() { + v := makeRandD128(n, uint64(n)*43+5) + for si, s := range d128Scalars() { + want := make([]uint64, 2*n) + scalarD128ScalarSubUnchecked(s.lo, s.hi, v, want) + for _, impl := range impls { + got := make([]uint64, 2*n) + impl.fn(s.lo, s.hi, v, got) + for i := 0; i < 2*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d scalar#%d slot=%d: got %x want %x", impl.name, n, si, i, got[i], want[i]) + } + } + } + } + } +} + +func TestD128AddScalarCheckedVariants(t *testing.T) { + impls := []d128ScalarVCImpl{ + {"scalar", scalarD128AddScalarChecked}, + {"avx2", avx2D128AddScalarChecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d128ScalarVCImpl{"avx512", avx512D128AddScalarChecked}) + } + + // 1) No-overflow: small-signed v plus small-signed scalars. 
+ smallScalars := []struct{ lo, hi uint64 }{ + {0, 0}, + {0xDEADBEEFCAFEBABE, 0x0123456789ABCDEF}, + {1, 0}, + {math.MaxUint64, 0x0FFFFFFFFFFFFFFF}, + } + for _, n := range d128Sizes() { + v := makeRand128SmallSigned(n, uint64(n)*47+7) + for si, s := range smallScalars { + want := make([]uint64, 2*n) + if got := scalarD128AddScalarChecked(s.lo, s.hi, v, want); got != -1 { + t.Fatalf("setup overflow at %d for n=%d scalar#%d", got, n, si) + } + for _, impl := range impls { + got := make([]uint64, 2*n) + if idx := impl.fn(s.lo, s.hi, v, got); idx != -1 { + t.Fatalf("%s n=%d scalar#%d: spurious overflow at %d", impl.name, n, si, idx) + } + for i := 0; i < 2*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d scalar#%d slot=%d: got %x want %x", impl.name, n, si, i, got[i], want[i]) + } + } + } + } + } + + // 2) Inject overflow: scalar = 1, v[pos] = MaxInt128. + for _, n := range []int{4, 8, 9, 16, 17, 33, 35, 64} { + for _, pos := range []int{0, 1, 3, 4, 7, 8, n - 1} { + if pos < 0 || pos >= n { + continue + } + v := make([]uint64, 2*n) + j := pos << 1 + v[j] = math.MaxUint64 + v[j+1] = uint64(math.MaxInt64) + for _, impl := range impls { + got := make([]uint64, 2*n) + idx := impl.fn(1, 0, v, got) + if idx != pos { + t.Fatalf("%s n=%d pos=%d: got idx %d", impl.name, n, pos, idx) + } + } + } + } +} + +func TestD128SubScalarCheckedVariants(t *testing.T) { + impls := []d128VScalarCImpl{ + {"scalar", scalarD128SubScalarChecked}, + {"avx2", avx2D128SubScalarChecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d128VScalarCImpl{"avx512", avx512D128SubScalarChecked}) + } + + smallScalars := []struct{ lo, hi uint64 }{ + {0, 0}, + {1, 0}, + {0xDEADBEEFCAFEBABE, 0x0123456789ABCDEF}, + {math.MaxUint64, 0x0FFFFFFFFFFFFFFF}, + } + for _, n := range d128Sizes() { + v := makeRand128SmallSigned(n, uint64(n)*53+11) + for si, s := range smallScalars { + want := make([]uint64, 2*n) + if got := scalarD128SubScalarChecked(v, s.lo, s.hi, want); got != -1 { + t.Fatalf("setup overflow at %d for n=%d scalar#%d", got, n, si) + } + for _, impl := range impls { + got := make([]uint64, 2*n) + if idx := impl.fn(v, s.lo, s.hi, got); idx != -1 { + t.Fatalf("%s n=%d scalar#%d: spurious overflow at %d", impl.name, n, si, idx) + } + for i := 0; i < 2*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d scalar#%d slot=%d: got %x want %x", impl.name, n, si, i, got[i], want[i]) + } + } + } + } + } + + // Inject overflow: v[pos] = MinInt128, scalar = 1 ⇒ MinInt128 - 1 overflows. 
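+	// (In two's complement, MinInt128 = -(2^127); subtracting 1 wraps mod
+	// 2^128 to +(2^127 - 1), flipping the sign of the high word, which is
+	// exactly what the (vh^rh)&(vh^sh) predicate detects.)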
+ for _, n := range []int{4, 8, 9, 16, 17, 33, 35, 64} { + for _, pos := range []int{0, 1, 3, 4, 7, 8, n - 1} { + if pos < 0 || pos >= n { + continue + } + v := make([]uint64, 2*n) + j := pos << 1 + v[j] = 0 + v[j+1] = 1 << 63 + for _, impl := range impls { + got := make([]uint64, 2*n) + idx := impl.fn(v, 1, 0, got) + if idx != pos { + t.Fatalf("%s n=%d pos=%d: got idx %d", impl.name, n, pos, idx) + } + } + } + } +} + +func TestD128ScalarSubCheckedVariants(t *testing.T) { + impls := []d128ScalarVCImpl{ + {"scalar", scalarD128ScalarSubChecked}, + {"avx2", avx2D128ScalarSubChecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d128ScalarVCImpl{"avx512", avx512D128ScalarSubChecked}) + } + + smallScalars := []struct{ lo, hi uint64 }{ + {0, 0}, + {1, 0}, + {0xDEADBEEFCAFEBABE, 0x0123456789ABCDEF}, + {math.MaxUint64, 0x0FFFFFFFFFFFFFFF}, + } + for _, n := range d128Sizes() { + v := makeRand128SmallSigned(n, uint64(n)*59+13) + for si, s := range smallScalars { + want := make([]uint64, 2*n) + if got := scalarD128ScalarSubChecked(s.lo, s.hi, v, want); got != -1 { + t.Fatalf("setup overflow at %d for n=%d scalar#%d", got, n, si) + } + for _, impl := range impls { + got := make([]uint64, 2*n) + if idx := impl.fn(s.lo, s.hi, v, got); idx != -1 { + t.Fatalf("%s n=%d scalar#%d: spurious overflow at %d", impl.name, n, si, idx) + } + for i := 0; i < 2*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d scalar#%d slot=%d: got %x want %x", impl.name, n, si, i, got[i], want[i]) + } + } + } + } + } + + // Inject overflow: scalar = MinInt128, v[pos] = 1 ⇒ MinInt128 - 1 overflows. + for _, n := range []int{4, 8, 9, 16, 17, 33, 35, 64} { + for _, pos := range []int{0, 1, 3, 4, 7, 8, n - 1} { + if pos < 0 || pos >= n { + continue + } + v := make([]uint64, 2*n) + j := pos << 1 + v[j] = 1 + v[j+1] = 0 + for _, impl := range impls { + got := make([]uint64, 2*n) + idx := impl.fn(0, 1<<63, v, got) + if idx != pos { + t.Fatalf("%s n=%d pos=%d: got idx %d", impl.name, n, pos, idx) + } + } + } + } +} + +func benchD128AddScalarU(b *testing.B, fn func(slo, shi uint64, v, r []uint64), n int) { + v := makeRandD128(n, 1) + r := make([]uint64, 2*n) + b.SetBytes(int64(n) * 16 * 2) + b.ResetTimer() + for i := 0; i < b.N; i++ { + fn(0xDEADBEEFCAFEBABE, 0x0123456789ABCDEF, v, r) + } +} + +func benchD128SubScalarU(b *testing.B, fn func(v []uint64, slo, shi uint64, r []uint64), n int) { + v := makeRandD128(n, 1) + r := make([]uint64, 2*n) + b.SetBytes(int64(n) * 16 * 2) + b.ResetTimer() + for i := 0; i < b.N; i++ { + fn(v, 0xDEADBEEFCAFEBABE, 0x0123456789ABCDEF, r) + } +} + +func BenchmarkD128AddScalarUnchecked(b *testing.B) { + for _, n := range d128BenchSizes { + b.Run("scalar/n="+strconv.Itoa(n), func(b *testing.B) { benchD128AddScalarU(b, scalarD128AddScalarUnchecked, n) }) + b.Run("avx2/n="+strconv.Itoa(n), func(b *testing.B) { benchD128AddScalarU(b, avx2D128AddScalarUnchecked, n) }) + if cpu.X86.HasAVX512 { + b.Run("avx512/n="+strconv.Itoa(n), func(b *testing.B) { benchD128AddScalarU(b, avx512D128AddScalarUnchecked, n) }) + } + } +} + +func BenchmarkD128SubScalarUnchecked(b *testing.B) { + for _, n := range d128BenchSizes { + b.Run("scalar/n="+strconv.Itoa(n), func(b *testing.B) { benchD128SubScalarU(b, scalarD128SubScalarUnchecked, n) }) + b.Run("avx2/n="+strconv.Itoa(n), func(b *testing.B) { benchD128SubScalarU(b, avx2D128SubScalarUnchecked, n) }) + if cpu.X86.HasAVX512 { + b.Run("avx512/n="+strconv.Itoa(n), func(b *testing.B) { benchD128SubScalarU(b, avx512D128SubScalarUnchecked, n) }) + } + } +} + +func 
BenchmarkD128ScalarSubUnchecked(b *testing.B) { + for _, n := range d128BenchSizes { + b.Run("scalar/n="+strconv.Itoa(n), func(b *testing.B) { benchD128AddScalarU(b, scalarD128ScalarSubUnchecked, n) }) + b.Run("avx2/n="+strconv.Itoa(n), func(b *testing.B) { benchD128AddScalarU(b, avx2D128ScalarSubUnchecked, n) }) + if cpu.X86.HasAVX512 { + b.Run("avx512/n="+strconv.Itoa(n), func(b *testing.B) { benchD128AddScalarU(b, avx512D128ScalarSubUnchecked, n) }) + } + } +} + +func TestD128SumReduceVariants(t *testing.T) { + impls := []struct { + name string + fn func([]uint64) (uint64, uint64) + }{ + {"scalar", scalarD128SumReduce}, + } + if cpu.X86.HasAVX2 { + impls = append(impls, struct { + name string + fn func([]uint64) (uint64, uint64) + }{"avx2", avx2D128SumReduce}) + } + if cpu.X86.HasAVX512 { + impls = append(impls, struct { + name string + fn func([]uint64) (uint64, uint64) + }{"avx512", avx512D128SumReduce}) + } + + for _, n := range d128Sizes() { + v := makeRandD128(n, uint64(n)*23+1) + var refLo, refHi uint64 + for i := 0; i < n; i++ { + j := i << 1 + c := uint64(0) + if v[j]+refLo < refLo { + c = 1 + } + refLo += v[j] + refHi += v[j+1] + c + } + for _, im := range impls { + lo, hi := im.fn(v) + if lo != refLo || hi != refHi { + t.Fatalf("%s n=%d: got (%x,%x) want (%x,%x)", im.name, n, lo, hi, refLo, refHi) + } + } + } +} + +func BenchmarkD128SumReduce(b *testing.B) { + for _, n := range d128BenchSizes { + v := makeRandD128(n, 1) + b.Run("scalar/n="+strconv.Itoa(n), func(b *testing.B) { + b.SetBytes(int64(n) * 16) + for i := 0; i < b.N; i++ { + _, _ = scalarD128SumReduce(v) + } + }) + if cpu.X86.HasAVX2 { + b.Run("avx2/n="+strconv.Itoa(n), func(b *testing.B) { + b.SetBytes(int64(n) * 16) + for i := 0; i < b.N; i++ { + _, _ = avx2D128SumReduce(v) + } + }) + } + if cpu.X86.HasAVX512 { + b.Run("avx512/n="+strconv.Itoa(n), func(b *testing.B) { + b.SetBytes(int64(n) * 16) + for i := 0; i < b.N; i++ { + _, _ = avx512D128SumReduce(v) + } + }) + } + } +} diff --git a/pkg/common/simdkernels/d128_negabs.go b/pkg/common/simdkernels/d128_negabs.go new file mode 100644 index 0000000000000..e6be2ddf29f52 --- /dev/null +++ b/pkg/common/simdkernels/d128_negabs.go @@ -0,0 +1,70 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package simdkernels + +import "math/bits" + +// Decimal128 element-wise negate / absolute value on slices of uint64 with +// the matrixone Decimal128 layout (lo, hi pair per element). The src and +// dst slices both have length 2*N. dst may alias src. +// +// Both ops use 128-bit two's complement (~x + 1). Negate is unconditional; +// Abs is conditional on the sign bit of the high word. MinInt128 wraps to +// itself, matching the scalar SQL semantics in arith_decimal_fast.go. +// +// The exported variables are dispatchers; their default values are the +// scalar reference implementations and may be replaced at init time on +// amd64 when AVX2 / AVX-512 are detected (see d128_negabs_simd_amd64.go). 
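+//
+// Illustrative usage only (names as declared below; dst may alias src):
+//
+//	D128Negate(v, v)  // in place: element i becomes -element_i (mod 2^128)
+//	D128Abs(src, dst) // dst_i = |src_i|; MinInt128 maps to itself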
+
+var (
+	D128Negate func(src, dst []uint64) = scalarD128Negate
+	D128Abs    func(src, dst []uint64) = scalarD128Abs
+)
+
+func scalarD128Negate(src, dst []uint64) {
+	n := len(dst) / 2
+	if len(src) < 2*n {
+		return
+	}
+	for i := 0; i < n; i++ {
+		j := i << 1
+		lo := ^src[j]
+		hi := ^src[j+1]
+		var c uint64
+		lo, c = bits.Add64(lo, 1, 0)
+		hi, _ = bits.Add64(hi, 0, c)
+		dst[j] = lo
+		dst[j+1] = hi
+	}
+}
+
+func scalarD128Abs(src, dst []uint64) {
+	n := len(dst) / 2
+	if len(src) < 2*n {
+		return
+	}
+	for i := 0; i < n; i++ {
+		j := i << 1
+		lo, hi := src[j], src[j+1]
+		sign := uint64(int64(hi) >> 63) // 0 or all-ones
+		lo ^= sign
+		hi ^= sign
+		var c uint64
+		lo, c = bits.Add64(lo, sign&1, 0)
+		hi, _ = bits.Add64(hi, 0, c)
+		dst[j] = lo
+		dst[j+1] = hi
+	}
+}
diff --git a/pkg/common/simdkernels/d128_negabs_simd_amd64.go b/pkg/common/simdkernels/d128_negabs_simd_amd64.go
new file mode 100644
index 0000000000000..9ff5ad8e3345b
--- /dev/null
+++ b/pkg/common/simdkernels/d128_negabs_simd_amd64.go
@@ -0,0 +1,253 @@
+// Copyright 2026 Matrix Origin
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//	http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build goexperiment.simd && amd64
+
+package simdkernels
+
+import (
+	"math/bits"
+	"simd/archsimd"
+	"unsafe"
+
+	"golang.org/x/sys/cpu"
+)
+
+// d128_negabs_simd_amd64.go: SIMD batch negate / abs for Decimal128.
+//
+// Per-element semantics: rLo:rHi = (~lo:~hi) + m, where m is 1 (Negate) or
+// the sign bit of hi (Abs). Implemented via the conditional-negate idiom:
+//
+//	mask  = -m            // all-ones if m == 1, else zero
+//	loBar = lo XOR mask   // ~lo when negating, lo otherwise
+//	hiBar = hi XOR mask
+//	rLo   = loBar - mask  // adds 1 (or 0) without an explicit branch
+//	carry = rLo wraps     // i.e., rLo < loBar unsigned ⇔ mask=-1 AND lo=0
+//	rHi   = hiBar - carry // adds 1 in carrying lanes; XOR already did ~
+//
+// Crucially rHi must NOT subtract `mask` again: the XOR has already produced
+// ~hi when negating, and the +1 for two's complement only propagates from lo
+// via the carry. Subtracting mask twice would over-add 1 on every negated
+// lane.
+//
+// 4 elements per AVX2 iteration (8 q-words = 64 B), 8 per AVX-512 iteration
+// (16 q-words = 128 B). Layout deinterleave/reinterleave reuses the same
+// VPUNPCK pattern as d128_addsub.
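+//
+// Worked trace of one negated lane (mask = all-ones), illustrative only:
+//
+//	element = 2^64  ⇒  lo = 0, hi = 1
+//	loBar = ^lo = MaxU64,  hiBar = ^hi = 0xFFFF…FFFE
+//	rLo   = loBar + 1 = 0 (wraps)  ⇒  carry lane = -1
+//	rHi   = hiBar - (-1) = 0xFFFF…FFFF  ⇒  hi:lo = -(2^64), as expected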
+ +func init() { + switch { + case cpu.X86.HasAVX512: + D128Negate = avx512D128Negate + D128Abs = avx512D128Abs + case cpu.X86.HasAVX2: + D128Negate = avx2D128Negate + D128Abs = avx2D128Abs + } +} + +// --------------------------------------------------------------------------- +// AVX2 (Int64x4) implementation +// --------------------------------------------------------------------------- + +//go:nosplit +func avx2D128NegBody(lo, hi, mask, sb archsimd.Int64x4) (rLo, rHi archsimd.Int64x4) { + loBar := lo.Xor(mask) + hiBar := hi.Xor(mask) + rLo = loBar.Sub(mask) + carry := rLo.Xor(sb).Less(loBar.Xor(sb)).ToInt64x4() + rHi = hiBar.Sub(carry) + return +} + +func avx2D128Negate(src, dst []uint64) { + n := len(dst) / 2 + if n == 0 || len(src) < 2*n { + return + } + ps, pd := unsafe.Pointer(&src[0]), unsafe.Pointer(&dst[0]) + sb := archsimd.BroadcastInt64x4(signBit128) + mask := archsimd.BroadcastInt64x4(-1) // unconditional negate + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 16 + s0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(ps, off))) + s1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(ps, off+32))) + lo := s0.InterleaveLoGrouped(s1) + hi := s0.InterleaveHiGrouped(s1) + rLo, rHi := avx2D128NegBody(lo, hi, mask, sb) + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[4]int64)(unsafe.Add(pd, off))) + r1.Store((*[4]int64)(unsafe.Add(pd, off+32))) + } + for ; i < n; i++ { + j := i << 1 + lo, c := bits.Add64(^src[j], 1, 0) + hi, _ := bits.Add64(^src[j+1], 0, c) + dst[j] = lo + dst[j+1] = hi + } +} + +func avx2D128Abs(src, dst []uint64) { + n := len(dst) / 2 + if n == 0 || len(src) < 2*n { + return + } + ps, pd := unsafe.Pointer(&src[0]), unsafe.Pointer(&dst[0]) + sb := archsimd.BroadcastInt64x4(signBit128) + zero := archsimd.BroadcastInt64x4(0) + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 16 + s0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(ps, off))) + s1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(ps, off+32))) + lo := s0.InterleaveLoGrouped(s1) + hi := s0.InterleaveHiGrouped(s1) + // mask = -1 in lanes where hi < 0 (signed), 0 elsewhere. + mask := hi.Less(zero).ToInt64x4() + rLo, rHi := avx2D128NegBody(lo, hi, mask, sb) + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[4]int64)(unsafe.Add(pd, off))) + r1.Store((*[4]int64)(unsafe.Add(pd, off+32))) + } + for ; i < n; i++ { + j := i << 1 + lo, hi := src[j], src[j+1] + sign := uint64(int64(hi) >> 63) + lo ^= sign + hi ^= sign + var c uint64 + lo, c = bits.Add64(lo, sign&1, 0) + hi, _ = bits.Add64(hi, 0, c) + dst[j] = lo + dst[j+1] = hi + } +} + +// --------------------------------------------------------------------------- +// AVX-512 (Int64x8) implementation +// --------------------------------------------------------------------------- + +//go:nosplit +func avx512D128NegBody(lo, hi, mask archsimd.Int64x8) (rLo, rHi archsimd.Int64x8) { + loBar := lo.Xor(mask) + hiBar := hi.Xor(mask) + rLo = loBar.Sub(mask) + // Native unsigned compare on AVX-512. 
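+	// (AVX2 lacks an unsigned 64-bit compare, hence the Xor(sb) bias in
+	// the Int64x4 body above; AVX-512 compares Uint64x8 lanes directly,
+	// saving two XORs per compare.)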
+ carry := rLo.AsUint64x8().Less(loBar.AsUint64x8()).ToInt64x8() + rHi = hiBar.Sub(carry) + return +} + +func avx512D128Negate(src, dst []uint64) { + n := len(dst) / 2 + if n == 0 || len(src) < 2*n { + return + } + ps, pd := unsafe.Pointer(&src[0]), unsafe.Pointer(&dst[0]) + mask := archsimd.BroadcastInt64x8(-1) + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 16 + s0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(ps, off))) + s1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(ps, off+64))) + lo := s0.InterleaveLoGrouped(s1) + hi := s0.InterleaveHiGrouped(s1) + rLo, rHi := avx512D128NegBody(lo, hi, mask) + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[8]int64)(unsafe.Add(pd, off))) + r1.Store((*[8]int64)(unsafe.Add(pd, off+64))) + } + // AVX2 path handles 4-elem chunks; reuse for 4..7 remainder. + if r := n - i; r >= 4 { + sb := archsimd.BroadcastInt64x4(signBit128) + mask4 := archsimd.BroadcastInt64x4(-1) + off := uintptr(i) * 16 + s0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(ps, off))) + s1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(ps, off+32))) + lo := s0.InterleaveLoGrouped(s1) + hi := s0.InterleaveHiGrouped(s1) + rLo, rHi := avx2D128NegBody(lo, hi, mask4, sb) + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[4]int64)(unsafe.Add(pd, off))) + r1.Store((*[4]int64)(unsafe.Add(pd, off+32))) + i += 4 + } + for ; i < n; i++ { + j := i << 1 + lo, c := bits.Add64(^src[j], 1, 0) + hi, _ := bits.Add64(^src[j+1], 0, c) + dst[j] = lo + dst[j+1] = hi + } +} + +func avx512D128Abs(src, dst []uint64) { + n := len(dst) / 2 + if n == 0 || len(src) < 2*n { + return + } + ps, pd := unsafe.Pointer(&src[0]), unsafe.Pointer(&dst[0]) + zero := archsimd.BroadcastInt64x8(0) + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 16 + s0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(ps, off))) + s1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(ps, off+64))) + lo := s0.InterleaveLoGrouped(s1) + hi := s0.InterleaveHiGrouped(s1) + mask := hi.Less(zero).ToInt64x8() + rLo, rHi := avx512D128NegBody(lo, hi, mask) + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[8]int64)(unsafe.Add(pd, off))) + r1.Store((*[8]int64)(unsafe.Add(pd, off+64))) + } + if r := n - i; r >= 4 { + sb := archsimd.BroadcastInt64x4(signBit128) + zero4 := archsimd.BroadcastInt64x4(0) + off := uintptr(i) * 16 + s0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(ps, off))) + s1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(ps, off+32))) + lo := s0.InterleaveLoGrouped(s1) + hi := s0.InterleaveHiGrouped(s1) + mask := hi.Less(zero4).ToInt64x4() + rLo, rHi := avx2D128NegBody(lo, hi, mask, sb) + r0 := rLo.InterleaveLoGrouped(rHi) + r1 := rLo.InterleaveHiGrouped(rHi) + r0.Store((*[4]int64)(unsafe.Add(pd, off))) + r1.Store((*[4]int64)(unsafe.Add(pd, off+32))) + i += 4 + } + for ; i < n; i++ { + j := i << 1 + lo, hi := src[j], src[j+1] + sign := uint64(int64(hi) >> 63) + lo ^= sign + hi ^= sign + var c uint64 + lo, c = bits.Add64(lo, sign&1, 0) + hi, _ = bits.Add64(hi, 0, c) + dst[j] = lo + dst[j+1] = hi + } +} diff --git a/pkg/common/simdkernels/d128_negabs_test.go b/pkg/common/simdkernels/d128_negabs_test.go new file mode 100644 index 0000000000000..4f2ddc1be6811 --- /dev/null +++ b/pkg/common/simdkernels/d128_negabs_test.go @@ -0,0 +1,167 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build goexperiment.simd && amd64 + +package simdkernels + +import ( + "math" + "strconv" + "testing" +) + +type d128UnaryImpl struct { + name string + fn func(src, dst []uint64) +} + +func d128NegateImpls() []d128UnaryImpl { + out := []d128UnaryImpl{{name: "scalar", fn: scalarD128Negate}} + if D128Negate != nil { + out = append(out, d128UnaryImpl{name: "dispatch", fn: D128Negate}) + } + return out +} + +func d128AbsImpls() []d128UnaryImpl { + out := []d128UnaryImpl{{name: "scalar", fn: scalarD128Abs}} + if D128Abs != nil { + out = append(out, d128UnaryImpl{name: "dispatch", fn: D128Abs}) + } + return out +} + +func d128NegAbsEdges() []uint64 { + // pairs of (lo, hi) covering boundary cases + pairs := [][2]uint64{ + {0, 0}, + {1, 0}, + {math.MaxUint64, 0}, + {0, 1}, + {0, math.MaxUint64}, + {math.MaxUint64, math.MaxUint64}, // -1 + {1, 0x8000000000000000}, // negative with lo == 1 + {0, 0x8000000000000000}, // MinInt128 (negation wraps) + {math.MaxUint64, 0x7FFFFFFFFFFFFFFF}, + {0, 0x7FFFFFFFFFFFFFFF}, + } + out := make([]uint64, 0, 2*len(pairs)) + for _, p := range pairs { + out = append(out, p[0], p[1]) + } + return out +} + +func TestD128NegateCorrectness(t *testing.T) { + impls := d128NegateImpls() + for _, n := range d128Sizes() { + src := makeRandD128(n, 0xDEC128^uint64(n)) + want := make([]uint64, 2*n) + scalarD128Negate(src, want) + for _, im := range impls { + got := make([]uint64, 2*n) + im.fn(src, got) + for i := 0; i < 2*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d idx=%d: got 0x%x want 0x%x src(lo=0x%x hi=0x%x)", + im.name, n, i, got[i], want[i], src[(i/2)*2], src[(i/2)*2+1]) + } + } + } + } +} + +func TestD128NegateEdges(t *testing.T) { + src := d128NegAbsEdges() + n := len(src) / 2 + want := make([]uint64, 2*n) + scalarD128Negate(src, want) + for _, im := range d128NegateImpls() { + got := make([]uint64, 2*n) + im.fn(src, got) + for i := 0; i < 2*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s idx=%d: got 0x%x want 0x%x src(lo=0x%x hi=0x%x)", + im.name, i, got[i], want[i], src[(i/2)*2], src[(i/2)*2+1]) + } + } + } +} + +func TestD128AbsCorrectness(t *testing.T) { + impls := d128AbsImpls() + for _, n := range d128Sizes() { + src := makeRandD128(n, 0xABA127^uint64(n)) + want := make([]uint64, 2*n) + scalarD128Abs(src, want) + for _, im := range impls { + got := make([]uint64, 2*n) + im.fn(src, got) + for i := 0; i < 2*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d idx=%d: got 0x%x want 0x%x src(lo=0x%x hi=0x%x)", + im.name, n, i, got[i], want[i], src[(i/2)*2], src[(i/2)*2+1]) + } + } + } + } +} + +func TestD128AbsEdges(t *testing.T) { + src := d128NegAbsEdges() + n := len(src) / 2 + want := make([]uint64, 2*n) + scalarD128Abs(src, want) + for _, im := range d128AbsImpls() { + got := make([]uint64, 2*n) + im.fn(src, got) + for i := 0; i < 2*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s idx=%d: got 0x%x want 0x%x src(lo=0x%x hi=0x%x)", + im.name, i, got[i], want[i], src[(i/2)*2], src[(i/2)*2+1]) + } + } + } +} + +func benchmarkD128Unary(b *testing.B, name string, fn func(src, dst []uint64), n int) { + src := makeRandD128(n, 
0xBEEF^uint64(n)) + dst := make([]uint64, 2*n) + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + fn(src, dst) + } +} + +func BenchmarkD128Negate(b *testing.B) { + for _, n := range []int{16, 64, 256, 1024, 4096} { + for _, im := range d128NegateImpls() { + b.Run(im.name+"/n="+strconv.Itoa(n), func(b *testing.B) { + benchmarkD128Unary(b, im.name, im.fn, n) + }) + } + } +} + +func BenchmarkD128Abs(b *testing.B) { + for _, n := range []int{16, 64, 256, 1024, 4096} { + for _, im := range d128AbsImpls() { + b.Run(im.name+"/n="+strconv.Itoa(n), func(b *testing.B) { + benchmarkD128Unary(b, im.name, im.fn, n) + }) + } + } +} diff --git a/pkg/common/simdkernels/d256_addsub.go b/pkg/common/simdkernels/d256_addsub.go new file mode 100644 index 0000000000000..07d9d8587dfb0 --- /dev/null +++ b/pkg/common/simdkernels/d256_addsub.go @@ -0,0 +1,320 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package simdkernels + +import "math/bits" + +// Decimal256 add/sub on uint64 slices with the matrixone Decimal256 layout +// (4 uint64 per element, lo→hi at indices 4i..4i+3). The top word (slot +// 4i+3) is interpreted as int64 for the signed-overflow predicate. +// +// API mirrors D64*/D128*: dispatcher pairs (Unchecked, Checked); the +// dispatcher defaults to the scalar reference and is replaced at init time +// on amd64 with AVX2 / AVX-512 (see d256_addsub_simd_amd64.go). 
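+//
+// Layout sketch (word order as above; hypothetical element value 2^64 + 5):
+//
+//	v[4i+0] = 5 // bits   0..63
+//	v[4i+1] = 1 // bits  64..127
+//	v[4i+2] = 0 // bits 128..191
+//	v[4i+3] = 0 // bits 192..255 (sign word tested by the checked variants)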
+ +var ( + D256AddUnchecked func(a, b, r []uint64) = scalarD256AddUnchecked + D256SubUnchecked func(a, b, r []uint64) = scalarD256SubUnchecked + D256AddChecked func(a, b, r []uint64) int = scalarD256AddChecked + D256SubChecked func(a, b, r []uint64) int = scalarD256SubChecked + + D256AddScalarUnchecked func(s0, s1, s2, s3 uint64, v, r []uint64) = scalarD256AddScalarUnchecked + D256AddScalarChecked func(s0, s1, s2, s3 uint64, v, r []uint64) int = scalarD256AddScalarChecked + D256SubScalarUnchecked func(v []uint64, s0, s1, s2, s3 uint64, r []uint64) = scalarD256SubScalarUnchecked + D256SubScalarChecked func(v []uint64, s0, s1, s2, s3 uint64, r []uint64) int = scalarD256SubScalarChecked + D256ScalarSubUnchecked func(s0, s1, s2, s3 uint64, v, r []uint64) = scalarD256ScalarSubUnchecked + D256ScalarSubChecked func(s0, s1, s2, s3 uint64, v, r []uint64) int = scalarD256ScalarSubChecked +) + +func scalarD256AddUnchecked(a, b, r []uint64) { + n := len(r) / 4 + if len(a) < 4*n || len(b) < 4*n { + return + } + for i := 0; i < n; i++ { + j := i << 2 + w0, c := bits.Add64(a[j], b[j], 0) + w1, c := bits.Add64(a[j+1], b[j+1], c) + w2, c := bits.Add64(a[j+2], b[j+2], c) + w3, _ := bits.Add64(a[j+3], b[j+3], c) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + } +} + +func scalarD256SubUnchecked(a, b, r []uint64) { + n := len(r) / 4 + if len(a) < 4*n || len(b) < 4*n { + return + } + for i := 0; i < n; i++ { + j := i << 2 + w0, br := bits.Sub64(a[j], b[j], 0) + w1, br := bits.Sub64(a[j+1], b[j+1], br) + w2, br := bits.Sub64(a[j+2], b[j+2], br) + w3, _ := bits.Sub64(a[j+3], b[j+3], br) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + } +} + +func scalarD256AddChecked(a, b, r []uint64) int { + n := len(r) / 4 + if len(a) < 4*n || len(b) < 4*n { + return -1 + } + first := -1 + for i := 0; i < n; i++ { + j := i << 2 + aHi := a[j+3] + bHi := b[j+3] + w0, c := bits.Add64(a[j], b[j], 0) + w1, c := bits.Add64(a[j+1], b[j+1], c) + w2, c := bits.Add64(a[j+2], b[j+2], c) + w3, _ := bits.Add64(aHi, bHi, c) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + if first < 0 { + ah, bh, rh := int64(aHi), int64(bHi), int64(w3) + if (ah^rh)&^(ah^bh) < 0 { + first = i + } + } + } + return first +} + +func scalarD256SubChecked(a, b, r []uint64) int { + n := len(r) / 4 + if len(a) < 4*n || len(b) < 4*n { + return -1 + } + first := -1 + for i := 0; i < n; i++ { + j := i << 2 + aHi := a[j+3] + bHi := b[j+3] + w0, br := bits.Sub64(a[j], b[j], 0) + w1, br := bits.Sub64(a[j+1], b[j+1], br) + w2, br := bits.Sub64(a[j+2], b[j+2], br) + w3, _ := bits.Sub64(aHi, bHi, br) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + if first < 0 { + ah, bh, rh := int64(aHi), int64(bHi), int64(w3) + if (ah^rh)&(ah^bh) < 0 { + first = i + } + } + } + return first +} + +// d256FirstOverflow rescans the first end elements (each = 4 uint64) for +// the first overflow. Used by SIMD checked variants when the accumulated +// mask reports overflow but the scalar tail did not see one. 
+func d256FirstOverflow(a, b []uint64, end int, sub bool) int { + if sub { + for i := 0; i < end; i++ { + j := i << 2 + _, br := bits.Sub64(a[j], b[j], 0) + _, br = bits.Sub64(a[j+1], b[j+1], br) + _, br = bits.Sub64(a[j+2], b[j+2], br) + w3, _ := bits.Sub64(a[j+3], b[j+3], br) + ah, bh, rh := int64(a[j+3]), int64(b[j+3]), int64(w3) + if (ah^rh)&(ah^bh) < 0 { + return i + } + } + return -1 + } + for i := 0; i < end; i++ { + j := i << 2 + _, c := bits.Add64(a[j], b[j], 0) + _, c = bits.Add64(a[j+1], b[j+1], c) + _, c = bits.Add64(a[j+2], b[j+2], c) + w3, _ := bits.Add64(a[j+3], b[j+3], c) + ah, bh, rh := int64(a[j+3]), int64(b[j+3]), int64(w3) + if (ah^rh)&^(ah^bh) < 0 { + return i + } + } + return -1 +} + +// --------------------------------------------------------------------------- +// Scalar-broadcast reference implementations (Decimal256 = 4 uint64/elem). +// --------------------------------------------------------------------------- + +func scalarD256AddScalarUnchecked(s0, s1, s2, s3 uint64, v, r []uint64) { + n := len(r) / 4 + if len(v) < 4*n { + return + } + for i := 0; i < n; i++ { + j := i << 2 + w0, c := bits.Add64(s0, v[j], 0) + w1, c := bits.Add64(s1, v[j+1], c) + w2, c := bits.Add64(s2, v[j+2], c) + w3, _ := bits.Add64(s3, v[j+3], c) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + } +} + +func scalarD256SubScalarUnchecked(v []uint64, s0, s1, s2, s3 uint64, r []uint64) { + n := len(r) / 4 + if len(v) < 4*n { + return + } + for i := 0; i < n; i++ { + j := i << 2 + w0, br := bits.Sub64(v[j], s0, 0) + w1, br := bits.Sub64(v[j+1], s1, br) + w2, br := bits.Sub64(v[j+2], s2, br) + w3, _ := bits.Sub64(v[j+3], s3, br) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + } +} + +func scalarD256ScalarSubUnchecked(s0, s1, s2, s3 uint64, v, r []uint64) { + n := len(r) / 4 + if len(v) < 4*n { + return + } + for i := 0; i < n; i++ { + j := i << 2 + w0, br := bits.Sub64(s0, v[j], 0) + w1, br := bits.Sub64(s1, v[j+1], br) + w2, br := bits.Sub64(s2, v[j+2], br) + w3, _ := bits.Sub64(s3, v[j+3], br) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + } +} + +func scalarD256AddScalarChecked(s0, s1, s2, s3 uint64, v, r []uint64) int { + n := len(r) / 4 + if len(v) < 4*n { + return -1 + } + first := -1 + sh := int64(s3) + for i := 0; i < n; i++ { + j := i << 2 + vHi := v[j+3] + w0, c := bits.Add64(s0, v[j], 0) + w1, c := bits.Add64(s1, v[j+1], c) + w2, c := bits.Add64(s2, v[j+2], c) + w3, _ := bits.Add64(s3, vHi, c) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + if first < 0 { + vh, rh := int64(vHi), int64(w3) + if (sh^rh)&^(sh^vh) < 0 { + first = i + } + } + } + return first +} + +func scalarD256SubScalarChecked(v []uint64, s0, s1, s2, s3 uint64, r []uint64) int { + n := len(r) / 4 + if len(v) < 4*n { + return -1 + } + first := -1 + sh := int64(s3) + for i := 0; i < n; i++ { + j := i << 2 + vHi := v[j+3] + w0, br := bits.Sub64(v[j], s0, 0) + w1, br := bits.Sub64(v[j+1], s1, br) + w2, br := bits.Sub64(v[j+2], s2, br) + w3, _ := bits.Sub64(vHi, s3, br) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + if first < 0 { + vh, rh := int64(vHi), int64(w3) + if (vh^rh)&(vh^sh) < 0 { + first = i + } + } + } + return first +} + +func scalarD256ScalarSubChecked(s0, s1, s2, s3 uint64, v, r []uint64) int { + n := len(r) / 4 + if len(v) < 4*n { + return -1 + } + first := -1 + sh := int64(s3) + for i := 0; i < n; i++ { + j := i << 2 + vHi := v[j+3] + w0, br := bits.Sub64(s0, v[j], 0) + w1, br := bits.Sub64(s1, v[j+1], br) + w2, br := bits.Sub64(s2, v[j+2], br) + w3, _ := bits.Sub64(s3, vHi, br) + r[j], r[j+1], 
r[j+2], r[j+3] = w0, w1, w2, w3 + if first < 0 { + vh, rh := int64(vHi), int64(w3) + if (sh^rh)&(sh^vh) < 0 { + first = i + } + } + } + return first +} + +// d256ScalarFirstOverflow: rescan first end elements for first overflow in +// scalar-broadcast ops. kind: 0=AddScalar, 1=SubScalar (v-s), 2=ScalarSub (s-v). +func d256ScalarFirstOverflow(s0, s1, s2, s3 uint64, v []uint64, end int, kind int) int { + sh := int64(s3) + switch kind { + case 0: + for i := 0; i < end; i++ { + j := i << 2 + _, c := bits.Add64(s0, v[j], 0) + _, c = bits.Add64(s1, v[j+1], c) + _, c = bits.Add64(s2, v[j+2], c) + w3, _ := bits.Add64(s3, v[j+3], c) + vh, rh := int64(v[j+3]), int64(w3) + if (sh^rh)&^(sh^vh) < 0 { + return i + } + } + case 1: + for i := 0; i < end; i++ { + j := i << 2 + _, br := bits.Sub64(v[j], s0, 0) + _, br = bits.Sub64(v[j+1], s1, br) + _, br = bits.Sub64(v[j+2], s2, br) + w3, _ := bits.Sub64(v[j+3], s3, br) + vh, rh := int64(v[j+3]), int64(w3) + if (vh^rh)&(vh^sh) < 0 { + return i + } + } + case 2: + for i := 0; i < end; i++ { + j := i << 2 + _, br := bits.Sub64(s0, v[j], 0) + _, br = bits.Sub64(s1, v[j+1], br) + _, br = bits.Sub64(s2, v[j+2], br) + w3, _ := bits.Sub64(s3, v[j+3], br) + vh, rh := int64(v[j+3]), int64(w3) + if (sh^rh)&(sh^vh) < 0 { + return i + } + } + } + return -1 +} diff --git a/pkg/common/simdkernels/d256_addsub_simd_amd64.go b/pkg/common/simdkernels/d256_addsub_simd_amd64.go new file mode 100644 index 0000000000000..038caaf740d5c --- /dev/null +++ b/pkg/common/simdkernels/d256_addsub_simd_amd64.go @@ -0,0 +1,1380 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build goexperiment.simd && amd64 + +package simdkernels + +import ( + "simd/archsimd" + "unsafe" + + "golang.org/x/sys/cpu" +) + +// On AVX-512 hosts we use the avx512 path (8 elements/iter); otherwise on +// AVX2 hosts we use the avx2 path (4 elements/iter); else scalar. 
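+//
+// Callers always go through the D256* dispatcher variables rather than
+// naming a path, e.g. (illustrative):
+//
+//	if idx := D256AddChecked(a, b, r); idx >= 0 {
+//		// element idx overflowed the signed 256-bit range
+//	}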
+func init() { + switch { + case cpu.X86.HasAVX512: + D256AddUnchecked = avx512D256AddUnchecked + D256SubUnchecked = avx512D256SubUnchecked + D256AddChecked = avx512D256AddChecked + D256SubChecked = avx512D256SubChecked + D256AddScalarUnchecked = avx512D256AddScalarUnchecked + D256SubScalarUnchecked = avx512D256SubScalarUnchecked + D256ScalarSubUnchecked = avx512D256ScalarSubUnchecked + D256AddScalarChecked = avx512D256AddScalarChecked + D256SubScalarChecked = avx512D256SubScalarChecked + D256ScalarSubChecked = avx512D256ScalarSubChecked + case cpu.X86.HasAVX2: + D256AddUnchecked = avx2D256AddUnchecked + D256SubUnchecked = avx2D256SubUnchecked + D256AddChecked = avx2D256AddChecked + D256SubChecked = avx2D256SubChecked + D256AddScalarUnchecked = avx2D256AddScalarUnchecked + D256SubScalarUnchecked = avx2D256SubScalarUnchecked + D256ScalarSubUnchecked = avx2D256ScalarSubUnchecked + D256AddScalarChecked = avx2D256AddScalarChecked + D256SubScalarChecked = avx2D256SubScalarChecked + D256ScalarSubChecked = avx2D256ScalarSubChecked + } +} + +// transpose4x4 turns the column-major load layout [v0=elem0, v1=elem1, +// v2=elem2, v3=elem3] (each Int64x4 = one Decimal256, lanes = words) into +// the row-major layout where Wk[i] = element_i.word_k. The operation is +// involutory: applying it again restores the original order, which we use +// to write the result back. +// +// Implemented as the standard 4×4 int64 transpose with 4 unpacks + 4 +// VPERM2I128 (Select128FromPair). All operations are AVX2. +// +//go:nosplit +func transpose4x4(v0, v1, v2, v3 archsimd.Int64x4) (W0, W1, W2, W3 archsimd.Int64x4) { + t0 := v0.InterleaveLoGrouped(v1) // [v0[0], v1[0], v0[2], v1[2]] + t1 := v0.InterleaveHiGrouped(v1) // [v0[1], v1[1], v0[3], v1[3]] + t2 := v2.InterleaveLoGrouped(v3) // [v2[0], v3[0], v2[2], v3[2]] + t3 := v2.InterleaveHiGrouped(v3) // [v2[1], v3[1], v2[3], v3[3]] + // VPERM2I128 imm encoding: arg(lo)/arg(hi) selects which 128-bit half: + // 0 = x.lo, 1 = x.hi, 2 = y.lo, 3 = y.hi. + W0 = t0.Select128FromPair(0, 2, t2) // [v0[0], v1[0], v2[0], v3[0]] + W2 = t0.Select128FromPair(1, 3, t2) // [v0[2], v1[2], v2[2], v3[2]] + W1 = t1.Select128FromPair(0, 2, t3) // [v0[1], v1[1], v2[1], v3[1]] + W3 = t1.Select128FromPair(1, 3, t3) // [v0[3], v1[3], v2[3], v3[3]] + return +} + +// addCarryStage performs one stage of the multi-word add: r = a + b + cIn, +// returning r and the carry-out vector (each lane = 0 or -1). +// +// cIn is encoded as a vector with values 0 or -1 (so subtracting it adds 0 +// or 1). cOut is the OR of two contributing wraps: +// - the bare `a + b` wrap (always possible, regardless of cIn) +// - the `(a+b) - cIn` wrap (only when cIn = -1 and a+b = MaxU64) +// +// The (s == -1) & cIn alternative formulation was tried but regressed by ~5% +// because addCarryStage already shares (s^sb) between csA and csB via CSE, +// so only one XOR is saved while an extra constant broadcast and AND are +// added. The asymmetry vs subBorrowStage reflects that addCarry's CSE is +// already optimal whereas subBorrow's was not. +// +// Sign-bit-flipped Less is used because AVX2 has no unsigned int64 compare. +// +//go:nosplit +func addCarryStage(a, b, cIn, sb archsimd.Int64x4) (r, cOut archsimd.Int64x4) { + s := a.Add(b) + csA := s.Xor(sb).Less(a.Xor(sb)).ToInt64x4() + r = s.Sub(cIn) + csB := r.Xor(sb).Less(s.Xor(sb)).ToInt64x4() + cOut = csA.Or(csB) + return +} + +// addCarryStageNoOut is addCarryStage without computing the carry-out +// (used for the topmost word in unchecked add). 
+// +//go:nosplit +func addCarryStageNoOut(a, b, cIn archsimd.Int64x4) archsimd.Int64x4 { + return a.Add(b).Sub(cIn) +} + +// subBorrowStage performs one stage of the multi-word sub: r = a - b - bIn. +// +// bIn is encoded as a vector with values 0 or -1 (so adding it subtracts 0 +// or 1). bOut is the OR of: +// - bare `a - b` borrow: a < b unsigned ⇔ (a^sb) < (b^sb) signed +// - `(a-b) + bIn` borrow: happens iff s = 0 AND bIn = -1 +// +// The second term is computed as (s == 0) & bIn, which avoids materializing +// `s^sb` and `r^sb`. Compared to the symmetric `(s^sb) < (r^sb)` form, this +// drops one XOR per stage and (more importantly) shortens the live-range +// chain so the register allocator does not spill across the carry chain. +// +//go:nosplit +func subBorrowStage(a, b, bIn, sb, zero archsimd.Int64x4) (r, bOut archsimd.Int64x4) { + s := a.Sub(b) + bsA := a.Xor(sb).Less(b.Xor(sb)).ToInt64x4() + r = s.Add(bIn) + bsB := s.Equal(zero).ToInt64x4().And(bIn) + bOut = bsA.Or(bsB) + return +} + +//go:nosplit +func subBorrowStageNoOut(a, b, bIn archsimd.Int64x4) archsimd.Int64x4 { + return a.Sub(b).Add(bIn) +} + +func avx2D256AddUnchecked(a, b, r []uint64) { + n := len(r) / 4 + if n == 0 || len(a) < 4*n || len(b) < 4*n { + return + } + pa, pb, pr := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x4(signBit128) + zero := archsimd.BroadcastInt64x4(0) + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 32 + aV0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off))) + aV1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off+32))) + aV2 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off+64))) + aV3 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off+96))) + bV0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off))) + bV1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off+32))) + bV2 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off+64))) + bV3 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off+96))) + + aW0, aW1, aW2, aW3 := transpose4x4(aV0, aV1, aV2, aV3) + bW0, bW1, bW2, bW3 := transpose4x4(bV0, bV1, bV2, bV3) + + rW0, c0 := addCarryStage(aW0, bW0, zero, sb) + rW1, c1 := addCarryStage(aW1, bW1, c0, sb) + rW2, c2 := addCarryStage(aW2, bW2, c1, sb) + rW3 := addCarryStageNoOut(aW3, bW3, c2) + + rV0, rV1, rV2, rV3 := transpose4x4(rW0, rW1, rW2, rW3) + rV0.Store((*[4]int64)(unsafe.Add(pr, off))) + rV1.Store((*[4]int64)(unsafe.Add(pr, off+32))) + rV2.Store((*[4]int64)(unsafe.Add(pr, off+64))) + rV3.Store((*[4]int64)(unsafe.Add(pr, off+96))) + } + for ; i < n; i++ { + j := i << 2 + w0, c := addU64(a[j], b[j], 0) + w1, c := addU64(a[j+1], b[j+1], c) + w2, c := addU64(a[j+2], b[j+2], c) + w3, _ := addU64(a[j+3], b[j+3], c) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + } +} + +func avx2D256SubUnchecked(a, b, r []uint64) { + n := len(r) / 4 + if n == 0 || len(a) < 4*n || len(b) < 4*n { + return + } + pa, pb, pr := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x4(signBit128) + zero := archsimd.BroadcastInt64x4(0) + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 32 + aV0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off))) + aV1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off+32))) + aV2 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off+64))) + aV3 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off+96))) + bV0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off))) + bV1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, 
off+32))) + bV2 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off+64))) + bV3 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off+96))) + + aW0, aW1, aW2, aW3 := transpose4x4(aV0, aV1, aV2, aV3) + bW0, bW1, bW2, bW3 := transpose4x4(bV0, bV1, bV2, bV3) + + rW0, b0 := subBorrowStage(aW0, bW0, zero, sb, zero) + rW1, b1 := subBorrowStage(aW1, bW1, b0, sb, zero) + rW2, b2 := subBorrowStage(aW2, bW2, b1, sb, zero) + rW3 := subBorrowStageNoOut(aW3, bW3, b2) + + rV0, rV1, rV2, rV3 := transpose4x4(rW0, rW1, rW2, rW3) + rV0.Store((*[4]int64)(unsafe.Add(pr, off))) + rV1.Store((*[4]int64)(unsafe.Add(pr, off+32))) + rV2.Store((*[4]int64)(unsafe.Add(pr, off+64))) + rV3.Store((*[4]int64)(unsafe.Add(pr, off+96))) + } + for ; i < n; i++ { + j := i << 2 + w0, br := subU64(a[j], b[j], 0) + w1, br := subU64(a[j+1], b[j+1], br) + w2, br := subU64(a[j+2], b[j+2], br) + w3, _ := subU64(a[j+3], b[j+3], br) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + } +} + +func avx2D256AddChecked(a, b, r []uint64) int { + n := len(r) / 4 + if n == 0 || len(a) < 4*n || len(b) < 4*n { + return -1 + } + pa, pb, pr := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x4(signBit128) + zero := archsimd.BroadcastInt64x4(0) + + var ofAcc archsimd.Int64x4 + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 32 + aV0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off))) + aV1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off+32))) + aV2 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off+64))) + aV3 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off+96))) + bV0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off))) + bV1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off+32))) + bV2 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off+64))) + bV3 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off+96))) + + aW0, aW1, aW2, aW3 := transpose4x4(aV0, aV1, aV2, aV3) + bW0, bW1, bW2, bW3 := transpose4x4(bV0, bV1, bV2, bV3) + + rW0, c0 := addCarryStage(aW0, bW0, zero, sb) + rW1, c1 := addCarryStage(aW1, bW1, c0, sb) + rW2, c2 := addCarryStage(aW2, bW2, c1, sb) + rW3 := addCarryStageNoOut(aW3, bW3, c2) + + ofAcc = ofAcc.Or(aW3.Xor(rW3).AndNot(aW3.Xor(bW3))) + + rV0, rV1, rV2, rV3 := transpose4x4(rW0, rW1, rW2, rW3) + rV0.Store((*[4]int64)(unsafe.Add(pr, off))) + rV1.Store((*[4]int64)(unsafe.Add(pr, off+32))) + rV2.Store((*[4]int64)(unsafe.Add(pr, off+64))) + rV3.Store((*[4]int64)(unsafe.Add(pr, off+96))) + } + vecEnd := i + + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + for ; i < n; i++ { + j := i << 2 + aHi := a[j+3] + bHi := b[j+3] + w0, c := addU64(a[j], b[j], 0) + w1, c := addU64(a[j+1], b[j+1], c) + w2, c := addU64(a[j+2], b[j+2], c) + w3, _ := addU64(aHi, bHi, c) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + ah, bh, rh := int64(aHi), int64(bHi), int64(w3) + if (ah^rh)&^(ah^bh) < 0 { + if vecOverflow { + return d256FirstOverflow(a, b, vecEnd, false) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d256FirstOverflow(a, b, vecEnd, false) +} + +func avx2D256SubChecked(a, b, r []uint64) int { + n := len(r) / 4 + if n == 0 || len(a) < 4*n || len(b) < 4*n { + return -1 + } + pa, pb, pr := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x4(signBit128) + zero := archsimd.BroadcastInt64x4(0) + + var ofAcc archsimd.Int64x4 + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 32 + aV0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off))) + aV1 := 
archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off+32))) + aV2 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off+64))) + aV3 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off+96))) + bV0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off))) + bV1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off+32))) + bV2 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off+64))) + bV3 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off+96))) + + aW0, aW1, aW2, aW3 := transpose4x4(aV0, aV1, aV2, aV3) + bW0, bW1, bW2, bW3 := transpose4x4(bV0, bV1, bV2, bV3) + + rW0, b0 := subBorrowStage(aW0, bW0, zero, sb, zero) + rW1, b1 := subBorrowStage(aW1, bW1, b0, sb, zero) + rW2, b2 := subBorrowStage(aW2, bW2, b1, sb, zero) + rW3 := subBorrowStageNoOut(aW3, bW3, b2) + + ofAcc = ofAcc.Or(aW3.Xor(rW3).And(aW3.Xor(bW3))) + + rV0, rV1, rV2, rV3 := transpose4x4(rW0, rW1, rW2, rW3) + rV0.Store((*[4]int64)(unsafe.Add(pr, off))) + rV1.Store((*[4]int64)(unsafe.Add(pr, off+32))) + rV2.Store((*[4]int64)(unsafe.Add(pr, off+64))) + rV3.Store((*[4]int64)(unsafe.Add(pr, off+96))) + } + vecEnd := i + + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + for ; i < n; i++ { + j := i << 2 + aHi := a[j+3] + bHi := b[j+3] + w0, br := subU64(a[j], b[j], 0) + w1, br := subU64(a[j+1], b[j+1], br) + w2, br := subU64(a[j+2], b[j+2], br) + w3, _ := subU64(aHi, bHi, br) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + ah, bh, rh := int64(aHi), int64(bHi), int64(w3) + if (ah^rh)&(ah^bh) < 0 { + if vecOverflow { + return d256FirstOverflow(a, b, vecEnd, true) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d256FirstOverflow(a, b, vecEnd, true) +} + +// addU64 / subU64 are local thin wrappers around math/bits to keep the +// scalar tail concise; they inline at call sites. +// +//go:nosplit +func addU64(x, y, c uint64) (uint64, uint64) { + s := x + y + c2 := uint64(0) + if s < x { + c2 = 1 + } + r := s + c + c3 := uint64(0) + if r < s { + c3 = 1 + } + return r, c2 | c3 +} + +//go:nosplit +func subU64(x, y, br uint64) (uint64, uint64) { + d := x - y + b1 := uint64(0) + if x < y { + b1 = 1 + } + r := d - br + b2 := uint64(0) + if d < br { + b2 = 1 + } + return r, b1 | b2 +} + +// --------------------------------------------------------------------------- +// AVX-512 path: each Decimal256 = 4 uint64 = 32 B. We process 8 elements per +// kernel iteration (= 256 B per input), loaded as 4 Int64x8 vectors. +// +// Layout per loaded vector (lo→hi inside each 4-word element): +// V0 = [e0w0,e0w1,e0w2,e0w3, e1w0,e1w1,e1w2,e1w3] +// V1 = [e2w0,e2w1,e2w2,e2w3, e3w0,e3w1,e3w2,e3w3] +// V2 = [e4*..] V3 = [e6*..] +// +// We transpose to per-word vectors W0..W3 each holding the same word from +// 8 elements, run the 4-stage carry/borrow chain, and inverse-transpose +// back. Both transposes are 8 ConcatPermute (VPERMI2Q) instructions +// arranged as two 4-permute stages. +// --------------------------------------------------------------------------- + +// AVX-512 transpose index vectors. They are package-scope so that +// LoadUint64x8 hoists to a single broadcast/load that the compiler can +// keep in a register across the loop body. 
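+//
+// Illustrative trace of the first table: with v0 = [e0w0..e0w3, e1w0..e1w3]
+// and v1 = [e2w0..e2w3, e3w0..e3w3] concatenated as lanes 0..15, the index
+// vector {0, 4, 8, 12, 2, 6, 10, 14} selects
+//
+//	lanes 0,4,8,12  → e0w0, e1w0, e2w0, e3w0 (low half:  word 0 of each)
+//	lanes 2,6,10,14 → e0w2, e1w2, e2w2, e3w2 (high half: word 2 of each)
+//
+// i.e. two of the four per-word rows from a single VPERMI2Q.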
+var ( + avx512D256IdxFwdW0W2 = [8]uint64{0, 4, 8, 12, 2, 6, 10, 14} + avx512D256IdxFwdW1W3 = [8]uint64{1, 5, 9, 13, 3, 7, 11, 15} + avx512D256IdxLoHalf = [8]uint64{0, 1, 2, 3, 8, 9, 10, 11} + avx512D256IdxHiHalf = [8]uint64{4, 5, 6, 7, 12, 13, 14, 15} + avx512D256IdxInvV0 = [8]uint64{0, 8, 4, 12, 1, 9, 5, 13} + avx512D256IdxInvV1 = [8]uint64{2, 10, 6, 14, 3, 11, 7, 15} +) + +// transpose8x4Forward: V0..V3 (each = 2 D256 elements in element-natural +// layout) → W0..W3 (each = one word from all 8 elements, in element order). +// +//go:nosplit +func transpose8x4Forward(v0, v1, v2, v3 archsimd.Int64x8) (w0, w1, w2, w3 archsimd.Int64x8) { + idxW02 := archsimd.LoadUint64x8(&avx512D256IdxFwdW0W2) + idxW13 := archsimd.LoadUint64x8(&avx512D256IdxFwdW1W3) + idxLo := archsimd.LoadUint64x8(&avx512D256IdxLoHalf) + idxHi := archsimd.LoadUint64x8(&avx512D256IdxHiHalf) + + // Stage 1: gather (w0,w2) and (w1,w3) within each pair-of-elements vector. + q0 := v0.ConcatPermute(v1, idxW02) // [e0w0,e1w0,e2w0,e3w0, e0w2,e1w2,e2w2,e3w2] + q1 := v0.ConcatPermute(v1, idxW13) // [e0w1,e1w1,e2w1,e3w1, e0w3,e1w3,e2w3,e3w3] + q2 := v2.ConcatPermute(v3, idxW02) // [e4w0..e7w0, e4w2..e7w2] + q3 := v2.ConcatPermute(v3, idxW13) // [e4w1..e7w1, e4w3..e7w3] + + // Stage 2: combine the lo/hi halves across pair groups. + w0 = q0.ConcatPermute(q2, idxLo) // [e0w0..e7w0] + w2 = q0.ConcatPermute(q2, idxHi) // [e0w2..e7w2] + w1 = q1.ConcatPermute(q3, idxLo) // [e0w1..e7w1] + w3 = q1.ConcatPermute(q3, idxHi) // [e0w3..e7w3] + return +} + +// transpose8x4Inverse: W0..W3 → V0..V3 (the inverse of transpose8x4Forward). +// +//go:nosplit +func transpose8x4Inverse(w0, w1, w2, w3 archsimd.Int64x8) (v0, v1, v2, v3 archsimd.Int64x8) { + idxLo := archsimd.LoadUint64x8(&avx512D256IdxLoHalf) + idxHi := archsimd.LoadUint64x8(&avx512D256IdxHiHalf) + idxV0 := archsimd.LoadUint64x8(&avx512D256IdxInvV0) + idxV1 := archsimd.LoadUint64x8(&avx512D256IdxInvV1) + + // Stage 1 (inverse of forward stage 2): split per-word vectors back into + // pair-of-elements groups. + q0 := w0.ConcatPermute(w2, idxLo) // [e0w0..e3w0, e0w2..e3w2] + q2 := w0.ConcatPermute(w2, idxHi) // [e4w0..e7w0, e4w2..e7w2] + q1 := w1.ConcatPermute(w3, idxLo) // [e0w1..e3w1, e0w3..e3w3] + q3 := w1.ConcatPermute(w3, idxHi) // [e4w1..e7w1, e4w3..e7w3] + + // Stage 2 (inverse of forward stage 1): re-interleave words back into + // element-natural layout. + v0 = q0.ConcatPermute(q1, idxV0) // [e0w0..e0w3, e1w0..e1w3] + v1 = q0.ConcatPermute(q1, idxV1) // [e2w0..e2w3, e3w0..e3w3] + v2 = q2.ConcatPermute(q3, idxV0) // [e4w0..e4w3, e5w0..e5w3] + v3 = q2.ConcatPermute(q3, idxV1) // [e6w0..e6w3, e7w0..e7w3] + return +} + +// avx512AddCarryStage and friends mirror the AVX2 helpers but on Int64x8. 
+// +//go:nosplit +func avx512AddCarryStage(a, b, cIn, sb archsimd.Int64x8) (r, cOut archsimd.Int64x8) { + s := a.Add(b) + csA := s.Xor(sb).Less(a.Xor(sb)).ToInt64x8() + r = s.Sub(cIn) + csB := r.Xor(sb).Less(s.Xor(sb)).ToInt64x8() + cOut = csA.Or(csB) + return +} + +//go:nosplit +func avx512AddCarryStageNoOut(a, b, cIn archsimd.Int64x8) archsimd.Int64x8 { + return a.Add(b).Sub(cIn) +} + +//go:nosplit +func avx512SubBorrowStage(a, b, bIn, sb, zero archsimd.Int64x8) (r, bOut archsimd.Int64x8) { + s := a.Sub(b) + bsA := a.Xor(sb).Less(b.Xor(sb)).ToInt64x8() + r = s.Add(bIn) + bsB := s.Equal(zero).ToInt64x8().And(bIn) + bOut = bsA.Or(bsB) + return +} + +//go:nosplit +func avx512SubBorrowStageNoOut(a, b, bIn archsimd.Int64x8) archsimd.Int64x8 { + return a.Sub(b).Add(bIn) +} + +func avx512D256AddUnchecked(a, b, r []uint64) { + n := len(r) / 4 + if n == 0 || len(a) < 4*n || len(b) < 4*n { + return + } + pa, pb, pr := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x8(signBit128) + zero := archsimd.BroadcastInt64x8(0) + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 32 + aV0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off))) + aV1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off+64))) + aV2 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off+128))) + aV3 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off+192))) + bV0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off))) + bV1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off+64))) + bV2 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off+128))) + bV3 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off+192))) + + aW0, aW1, aW2, aW3 := transpose8x4Forward(aV0, aV1, aV2, aV3) + bW0, bW1, bW2, bW3 := transpose8x4Forward(bV0, bV1, bV2, bV3) + + rW0, c0 := avx512AddCarryStage(aW0, bW0, zero, sb) + rW1, c1 := avx512AddCarryStage(aW1, bW1, c0, sb) + rW2, c2 := avx512AddCarryStage(aW2, bW2, c1, sb) + rW3 := avx512AddCarryStageNoOut(aW3, bW3, c2) + + rV0, rV1, rV2, rV3 := transpose8x4Inverse(rW0, rW1, rW2, rW3) + rV0.Store((*[8]int64)(unsafe.Add(pr, off))) + rV1.Store((*[8]int64)(unsafe.Add(pr, off+64))) + rV2.Store((*[8]int64)(unsafe.Add(pr, off+128))) + rV3.Store((*[8]int64)(unsafe.Add(pr, off+192))) + } + for ; i < n; i++ { + j := i << 2 + w0, c := addU64(a[j], b[j], 0) + w1, c := addU64(a[j+1], b[j+1], c) + w2, c := addU64(a[j+2], b[j+2], c) + w3, _ := addU64(a[j+3], b[j+3], c) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + } +} + +func avx512D256SubUnchecked(a, b, r []uint64) { + n := len(r) / 4 + if n == 0 || len(a) < 4*n || len(b) < 4*n { + return + } + pa, pb, pr := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x8(signBit128) + zero := archsimd.BroadcastInt64x8(0) + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 32 + aV0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off))) + aV1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off+64))) + aV2 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off+128))) + aV3 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off+192))) + bV0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off))) + bV1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off+64))) + bV2 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off+128))) + bV3 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off+192))) + + aW0, aW1, aW2, aW3 := transpose8x4Forward(aV0, aV1, aV2, aV3) + bW0, bW1, bW2, bW3 := transpose8x4Forward(bV0, bV1, bV2, bV3) + + rW0, b0 := 
avx512SubBorrowStage(aW0, bW0, zero, sb, zero)
+		rW1, b1 := avx512SubBorrowStage(aW1, bW1, b0, sb, zero)
+		rW2, b2 := avx512SubBorrowStage(aW2, bW2, b1, sb, zero)
+		rW3 := avx512SubBorrowStageNoOut(aW3, bW3, b2)
+
+		rV0, rV1, rV2, rV3 := transpose8x4Inverse(rW0, rW1, rW2, rW3)
+		rV0.Store((*[8]int64)(unsafe.Add(pr, off)))
+		rV1.Store((*[8]int64)(unsafe.Add(pr, off+64)))
+		rV2.Store((*[8]int64)(unsafe.Add(pr, off+128)))
+		rV3.Store((*[8]int64)(unsafe.Add(pr, off+192)))
+	}
+	for ; i < n; i++ {
+		j := i << 2
+		w0, br := subU64(a[j], b[j], 0)
+		w1, br := subU64(a[j+1], b[j+1], br)
+		w2, br := subU64(a[j+2], b[j+2], br)
+		w3, _ := subU64(a[j+3], b[j+3], br)
+		r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3
+	}
+}
+
+func avx512D256AddChecked(a, b, r []uint64) int {
+	n := len(r) / 4
+	if n == 0 || len(a) < 4*n || len(b) < 4*n {
+		return -1
+	}
+	pa, pb, pr := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&r[0])
+	sb := archsimd.BroadcastInt64x8(signBit128)
+	zero := archsimd.BroadcastInt64x8(0)
+
+	var ofAcc archsimd.Int64x8
+
+	i := 0
+	for ; i+8 <= n; i += 8 {
+		off := uintptr(i) * 32
+		aV0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off)))
+		aV1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off+64)))
+		aV2 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off+128)))
+		aV3 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off+192)))
+		bV0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off)))
+		bV1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off+64)))
+		bV2 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off+128)))
+		bV3 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off+192)))
+
+		aW0, aW1, aW2, aW3 := transpose8x4Forward(aV0, aV1, aV2, aV3)
+		bW0, bW1, bW2, bW3 := transpose8x4Forward(bV0, bV1, bV2, bV3)
+
+		rW0, c0 := avx512AddCarryStage(aW0, bW0, zero, sb)
+		rW1, c1 := avx512AddCarryStage(aW1, bW1, c0, sb)
+		rW2, c2 := avx512AddCarryStage(aW2, bW2, c1, sb)
+		rW3 := avx512AddCarryStageNoOut(aW3, bW3, c2)
+
+		// Same predicate as the scalar and AVX2 paths: (a^r) &^ (a^b) < 0.
+		ofAcc = ofAcc.Or(aW3.Xor(rW3).AndNot(aW3.Xor(bW3)))
+
+		rV0, rV1, rV2, rV3 := transpose8x4Inverse(rW0, rW1, rW2, rW3)
+		rV0.Store((*[8]int64)(unsafe.Add(pr, off)))
+		rV1.Store((*[8]int64)(unsafe.Add(pr, off+64)))
+		rV2.Store((*[8]int64)(unsafe.Add(pr, off+128)))
+		rV3.Store((*[8]int64)(unsafe.Add(pr, off+192)))
+	}
+	vecEnd := i
+
+	vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0
+
+	for ; i < n; i++ {
+		j := i << 2
+		aHi := a[j+3]
+		bHi := b[j+3]
+		w0, c := addU64(a[j], b[j], 0)
+		w1, c := addU64(a[j+1], b[j+1], c)
+		w2, c := addU64(a[j+2], b[j+2], c)
+		w3, _ := addU64(aHi, bHi, c)
+		r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3
+		ah, bh, rh := int64(aHi), int64(bHi), int64(w3)
+		if (ah^rh)&^(ah^bh) < 0 {
+			if vecOverflow {
+				return d256FirstOverflow(a, b, vecEnd, false)
+			}
+			return i
+		}
+	}
+	if !vecOverflow {
+		return -1
+	}
+	return d256FirstOverflow(a, b, vecEnd, false)
+}
+
+func avx512D256SubChecked(a, b, r []uint64) int {
+	n := len(r) / 4
+	if n == 0 || len(a) < 4*n || len(b) < 4*n {
+		return -1
+	}
+	pa, pb, pr := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&r[0])
+	sb := archsimd.BroadcastInt64x8(signBit128)
+	zero := archsimd.BroadcastInt64x8(0)
+
+	var ofAcc archsimd.Int64x8
+
+	i := 0
+	for ; i+8 <= n; i += 8 {
+		off := uintptr(i) * 32
+		aV0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off)))
+		aV1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off+64)))
+		aV2 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off+128)))
+		aV3 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off+192)))
+		bV0
:= archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off))) + bV1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off+64))) + bV2 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off+128))) + bV3 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off+192))) + + aW0, aW1, aW2, aW3 := transpose8x4Forward(aV0, aV1, aV2, aV3) + bW0, bW1, bW2, bW3 := transpose8x4Forward(bV0, bV1, bV2, bV3) + + rW0, b0 := avx512SubBorrowStage(aW0, bW0, zero, sb, zero) + rW1, b1 := avx512SubBorrowStage(aW1, bW1, b0, sb, zero) + rW2, b2 := avx512SubBorrowStage(aW2, bW2, b1, sb, zero) + rW3 := avx512SubBorrowStageNoOut(aW3, bW3, b2) + + ofAcc = ofAcc.Or(aW3.Xor(rW3).And(aW3.Xor(bW3))) + + rV0, rV1, rV2, rV3 := transpose8x4Inverse(rW0, rW1, rW2, rW3) + rV0.Store((*[8]int64)(unsafe.Add(pr, off))) + rV1.Store((*[8]int64)(unsafe.Add(pr, off+64))) + rV2.Store((*[8]int64)(unsafe.Add(pr, off+128))) + rV3.Store((*[8]int64)(unsafe.Add(pr, off+192))) + } + vecEnd := i + + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + for ; i < n; i++ { + j := i << 2 + aHi := a[j+3] + bHi := b[j+3] + w0, br := subU64(a[j], b[j], 0) + w1, br := subU64(a[j+1], b[j+1], br) + w2, br := subU64(a[j+2], b[j+2], br) + w3, _ := subU64(aHi, bHi, br) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + ah, bh, rh := int64(aHi), int64(bHi), int64(w3) + if (ah^rh)&(ah^bh) < 0 { + if vecOverflow { + return d256FirstOverflow(a, b, vecEnd, true) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d256FirstOverflow(a, b, vecEnd, true) +} + +// --------------------------------------------------------------------------- +// AVX2 D256 broadcast variants. The scalar's 4 words become 4 uniform Int64x4 +// vectors — no load/transpose needed for the scalar operand. +// --------------------------------------------------------------------------- + +func avx2D256AddScalarUnchecked(s0, s1, s2, s3 uint64, v, r []uint64) { + n := len(r) / 4 + if n == 0 || len(v) < 4*n { + return + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x4(signBit128) + zero := archsimd.BroadcastInt64x4(0) + bW0 := archsimd.BroadcastInt64x4(int64(s0)) + bW1 := archsimd.BroadcastInt64x4(int64(s1)) + bW2 := archsimd.BroadcastInt64x4(int64(s2)) + bW3 := archsimd.BroadcastInt64x4(int64(s3)) + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 32 + v0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+32))) + v2 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+64))) + v3 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+96))) + + aW0, aW1, aW2, aW3 := transpose4x4(v0, v1, v2, v3) + + rW0, c0 := addCarryStage(aW0, bW0, zero, sb) + rW1, c1 := addCarryStage(aW1, bW1, c0, sb) + rW2, c2 := addCarryStage(aW2, bW2, c1, sb) + rW3 := addCarryStageNoOut(aW3, bW3, c2) + + rV0, rV1, rV2, rV3 := transpose4x4(rW0, rW1, rW2, rW3) + rV0.Store((*[4]int64)(unsafe.Add(pr, off))) + rV1.Store((*[4]int64)(unsafe.Add(pr, off+32))) + rV2.Store((*[4]int64)(unsafe.Add(pr, off+64))) + rV3.Store((*[4]int64)(unsafe.Add(pr, off+96))) + } + for ; i < n; i++ { + j := i << 2 + w0, c := addU64(s0, v[j], 0) + w1, c := addU64(s1, v[j+1], c) + w2, c := addU64(s2, v[j+2], c) + w3, _ := addU64(s3, v[j+3], c) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + } +} + +func avx2D256SubScalarUnchecked(v []uint64, s0, s1, s2, s3 uint64, r []uint64) { + n := len(r) / 4 + if n == 0 || len(v) < 4*n { + return + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sb := 
archsimd.BroadcastInt64x4(signBit128) + zero := archsimd.BroadcastInt64x4(0) + bW0 := archsimd.BroadcastInt64x4(int64(s0)) + bW1 := archsimd.BroadcastInt64x4(int64(s1)) + bW2 := archsimd.BroadcastInt64x4(int64(s2)) + bW3 := archsimd.BroadcastInt64x4(int64(s3)) + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 32 + v0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+32))) + v2 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+64))) + v3 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+96))) + + aW0, aW1, aW2, aW3 := transpose4x4(v0, v1, v2, v3) + + rW0, b0 := subBorrowStage(aW0, bW0, zero, sb, zero) + rW1, b1 := subBorrowStage(aW1, bW1, b0, sb, zero) + rW2, b2 := subBorrowStage(aW2, bW2, b1, sb, zero) + rW3 := subBorrowStageNoOut(aW3, bW3, b2) + + rV0, rV1, rV2, rV3 := transpose4x4(rW0, rW1, rW2, rW3) + rV0.Store((*[4]int64)(unsafe.Add(pr, off))) + rV1.Store((*[4]int64)(unsafe.Add(pr, off+32))) + rV2.Store((*[4]int64)(unsafe.Add(pr, off+64))) + rV3.Store((*[4]int64)(unsafe.Add(pr, off+96))) + } + for ; i < n; i++ { + j := i << 2 + w0, br := subU64(v[j], s0, 0) + w1, br := subU64(v[j+1], s1, br) + w2, br := subU64(v[j+2], s2, br) + w3, _ := subU64(v[j+3], s3, br) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + } +} + +func avx2D256ScalarSubUnchecked(s0, s1, s2, s3 uint64, v, r []uint64) { + n := len(r) / 4 + if n == 0 || len(v) < 4*n { + return + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x4(signBit128) + zero := archsimd.BroadcastInt64x4(0) + aW0 := archsimd.BroadcastInt64x4(int64(s0)) + aW1 := archsimd.BroadcastInt64x4(int64(s1)) + aW2 := archsimd.BroadcastInt64x4(int64(s2)) + aW3 := archsimd.BroadcastInt64x4(int64(s3)) + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 32 + v0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+32))) + v2 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+64))) + v3 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+96))) + + bW0, bW1, bW2, bW3 := transpose4x4(v0, v1, v2, v3) + + rW0, b0 := subBorrowStage(aW0, bW0, zero, sb, zero) + rW1, b1 := subBorrowStage(aW1, bW1, b0, sb, zero) + rW2, b2 := subBorrowStage(aW2, bW2, b1, sb, zero) + rW3 := subBorrowStageNoOut(aW3, bW3, b2) + + rV0, rV1, rV2, rV3 := transpose4x4(rW0, rW1, rW2, rW3) + rV0.Store((*[4]int64)(unsafe.Add(pr, off))) + rV1.Store((*[4]int64)(unsafe.Add(pr, off+32))) + rV2.Store((*[4]int64)(unsafe.Add(pr, off+64))) + rV3.Store((*[4]int64)(unsafe.Add(pr, off+96))) + } + for ; i < n; i++ { + j := i << 2 + w0, br := subU64(s0, v[j], 0) + w1, br := subU64(s1, v[j+1], br) + w2, br := subU64(s2, v[j+2], br) + w3, _ := subU64(s3, v[j+3], br) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + } +} + +func avx2D256AddScalarChecked(s0, s1, s2, s3 uint64, v, r []uint64) int { + n := len(r) / 4 + if n == 0 || len(v) < 4*n { + return -1 + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x4(signBit128) + zero := archsimd.BroadcastInt64x4(0) + bW0 := archsimd.BroadcastInt64x4(int64(s0)) + bW1 := archsimd.BroadcastInt64x4(int64(s1)) + bW2 := archsimd.BroadcastInt64x4(int64(s2)) + bW3 := archsimd.BroadcastInt64x4(int64(s3)) + + var ofAcc archsimd.Int64x4 + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 32 + v0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+32))) + v2 := 
archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+64))) + v3 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+96))) + + aW0, aW1, aW2, aW3 := transpose4x4(v0, v1, v2, v3) + + rW0, c0 := addCarryStage(aW0, bW0, zero, sb) + rW1, c1 := addCarryStage(aW1, bW1, c0, sb) + rW2, c2 := addCarryStage(aW2, bW2, c1, sb) + rW3 := addCarryStageNoOut(aW3, bW3, c2) + ofAcc = ofAcc.Or(aW3.Xor(rW3).AndNot(aW3.Xor(bW3))) + + rV0, rV1, rV2, rV3 := transpose4x4(rW0, rW1, rW2, rW3) + rV0.Store((*[4]int64)(unsafe.Add(pr, off))) + rV1.Store((*[4]int64)(unsafe.Add(pr, off+32))) + rV2.Store((*[4]int64)(unsafe.Add(pr, off+64))) + rV3.Store((*[4]int64)(unsafe.Add(pr, off+96))) + } + vecEnd := i + + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + sh := int64(s3) + for ; i < n; i++ { + j := i << 2 + vHi := v[j+3] + w0, c := addU64(s0, v[j], 0) + w1, c := addU64(s1, v[j+1], c) + w2, c := addU64(s2, v[j+2], c) + w3, _ := addU64(s3, vHi, c) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + vh, rh := int64(vHi), int64(w3) + if (sh^rh)&^(sh^vh) < 0 { + if vecOverflow { + return d256ScalarFirstOverflow(s0, s1, s2, s3, v, vecEnd, 0) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d256ScalarFirstOverflow(s0, s1, s2, s3, v, vecEnd, 0) +} + +func avx2D256SubScalarChecked(v []uint64, s0, s1, s2, s3 uint64, r []uint64) int { + n := len(r) / 4 + if n == 0 || len(v) < 4*n { + return -1 + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x4(signBit128) + zero := archsimd.BroadcastInt64x4(0) + bW0 := archsimd.BroadcastInt64x4(int64(s0)) + bW1 := archsimd.BroadcastInt64x4(int64(s1)) + bW2 := archsimd.BroadcastInt64x4(int64(s2)) + bW3 := archsimd.BroadcastInt64x4(int64(s3)) + + var ofAcc archsimd.Int64x4 + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 32 + v0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+32))) + v2 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+64))) + v3 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+96))) + + aW0, aW1, aW2, aW3 := transpose4x4(v0, v1, v2, v3) + + rW0, b0 := subBorrowStage(aW0, bW0, zero, sb, zero) + rW1, b1 := subBorrowStage(aW1, bW1, b0, sb, zero) + rW2, b2 := subBorrowStage(aW2, bW2, b1, sb, zero) + rW3 := subBorrowStageNoOut(aW3, bW3, b2) + ofAcc = ofAcc.Or(aW3.Xor(rW3).And(aW3.Xor(bW3))) + + rV0, rV1, rV2, rV3 := transpose4x4(rW0, rW1, rW2, rW3) + rV0.Store((*[4]int64)(unsafe.Add(pr, off))) + rV1.Store((*[4]int64)(unsafe.Add(pr, off+32))) + rV2.Store((*[4]int64)(unsafe.Add(pr, off+64))) + rV3.Store((*[4]int64)(unsafe.Add(pr, off+96))) + } + vecEnd := i + + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + sh := int64(s3) + for ; i < n; i++ { + j := i << 2 + vHi := v[j+3] + w0, br := subU64(v[j], s0, 0) + w1, br := subU64(v[j+1], s1, br) + w2, br := subU64(v[j+2], s2, br) + w3, _ := subU64(vHi, s3, br) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + vh, rh := int64(vHi), int64(w3) + if (vh^rh)&(vh^sh) < 0 { + if vecOverflow { + return d256ScalarFirstOverflow(s0, s1, s2, s3, v, vecEnd, 1) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d256ScalarFirstOverflow(s0, s1, s2, s3, v, vecEnd, 1) +} + +func avx2D256ScalarSubChecked(s0, s1, s2, s3 uint64, v, r []uint64) int { + n := len(r) / 4 + if n == 0 || len(v) < 4*n { + return -1 + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x4(signBit128) + zero := archsimd.BroadcastInt64x4(0) + aW0 := 
archsimd.BroadcastInt64x4(int64(s0)) + aW1 := archsimd.BroadcastInt64x4(int64(s1)) + aW2 := archsimd.BroadcastInt64x4(int64(s2)) + aW3 := archsimd.BroadcastInt64x4(int64(s3)) + + var ofAcc archsimd.Int64x4 + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 32 + v0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+32))) + v2 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+64))) + v3 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+96))) + + bW0, bW1, bW2, bW3 := transpose4x4(v0, v1, v2, v3) + + rW0, b0 := subBorrowStage(aW0, bW0, zero, sb, zero) + rW1, b1 := subBorrowStage(aW1, bW1, b0, sb, zero) + rW2, b2 := subBorrowStage(aW2, bW2, b1, sb, zero) + rW3 := subBorrowStageNoOut(aW3, bW3, b2) + ofAcc = ofAcc.Or(aW3.Xor(rW3).And(aW3.Xor(bW3))) + + rV0, rV1, rV2, rV3 := transpose4x4(rW0, rW1, rW2, rW3) + rV0.Store((*[4]int64)(unsafe.Add(pr, off))) + rV1.Store((*[4]int64)(unsafe.Add(pr, off+32))) + rV2.Store((*[4]int64)(unsafe.Add(pr, off+64))) + rV3.Store((*[4]int64)(unsafe.Add(pr, off+96))) + } + vecEnd := i + + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + sh := int64(s3) + for ; i < n; i++ { + j := i << 2 + vHi := v[j+3] + w0, br := subU64(s0, v[j], 0) + w1, br := subU64(s1, v[j+1], br) + w2, br := subU64(s2, v[j+2], br) + w3, _ := subU64(s3, vHi, br) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + vh, rh := int64(vHi), int64(w3) + if (sh^rh)&(sh^vh) < 0 { + if vecOverflow { + return d256ScalarFirstOverflow(s0, s1, s2, s3, v, vecEnd, 2) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d256ScalarFirstOverflow(s0, s1, s2, s3, v, vecEnd, 2) +} + +// --------------------------------------------------------------------------- +// AVX-512 D256 broadcast variants. Same skeleton but Int64x8 and 8 elems/iter. 
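+// (Each Int64x8 load/store covers 64 B, so one iteration moves 8 elems × 32 B
+// = 256 B per operand stream; offsets step 0/64/128/192 within an iteration.)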
+// --------------------------------------------------------------------------- + +func avx512D256AddScalarUnchecked(s0, s1, s2, s3 uint64, v, r []uint64) { + n := len(r) / 4 + if n == 0 || len(v) < 4*n { + return + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x8(signBit128) + zero := archsimd.BroadcastInt64x8(0) + bW0 := archsimd.BroadcastInt64x8(int64(s0)) + bW1 := archsimd.BroadcastInt64x8(int64(s1)) + bW2 := archsimd.BroadcastInt64x8(int64(s2)) + bW3 := archsimd.BroadcastInt64x8(int64(s3)) + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 32 + v0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+64))) + v2 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+128))) + v3 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+192))) + + aW0, aW1, aW2, aW3 := transpose8x4Forward(v0, v1, v2, v3) + + rW0, c0 := avx512AddCarryStage(aW0, bW0, zero, sb) + rW1, c1 := avx512AddCarryStage(aW1, bW1, c0, sb) + rW2, c2 := avx512AddCarryStage(aW2, bW2, c1, sb) + rW3 := avx512AddCarryStageNoOut(aW3, bW3, c2) + + rV0, rV1, rV2, rV3 := transpose8x4Inverse(rW0, rW1, rW2, rW3) + rV0.Store((*[8]int64)(unsafe.Add(pr, off))) + rV1.Store((*[8]int64)(unsafe.Add(pr, off+64))) + rV2.Store((*[8]int64)(unsafe.Add(pr, off+128))) + rV3.Store((*[8]int64)(unsafe.Add(pr, off+192))) + } + for ; i < n; i++ { + j := i << 2 + w0, c := addU64(s0, v[j], 0) + w1, c := addU64(s1, v[j+1], c) + w2, c := addU64(s2, v[j+2], c) + w3, _ := addU64(s3, v[j+3], c) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + } +} + +func avx512D256SubScalarUnchecked(v []uint64, s0, s1, s2, s3 uint64, r []uint64) { + n := len(r) / 4 + if n == 0 || len(v) < 4*n { + return + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x8(signBit128) + zero := archsimd.BroadcastInt64x8(0) + bW0 := archsimd.BroadcastInt64x8(int64(s0)) + bW1 := archsimd.BroadcastInt64x8(int64(s1)) + bW2 := archsimd.BroadcastInt64x8(int64(s2)) + bW3 := archsimd.BroadcastInt64x8(int64(s3)) + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 32 + v0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+64))) + v2 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+128))) + v3 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+192))) + + aW0, aW1, aW2, aW3 := transpose8x4Forward(v0, v1, v2, v3) + + rW0, b0 := avx512SubBorrowStage(aW0, bW0, zero, sb, zero) + rW1, b1 := avx512SubBorrowStage(aW1, bW1, b0, sb, zero) + rW2, b2 := avx512SubBorrowStage(aW2, bW2, b1, sb, zero) + rW3 := avx512SubBorrowStageNoOut(aW3, bW3, b2) + + rV0, rV1, rV2, rV3 := transpose8x4Inverse(rW0, rW1, rW2, rW3) + rV0.Store((*[8]int64)(unsafe.Add(pr, off))) + rV1.Store((*[8]int64)(unsafe.Add(pr, off+64))) + rV2.Store((*[8]int64)(unsafe.Add(pr, off+128))) + rV3.Store((*[8]int64)(unsafe.Add(pr, off+192))) + } + for ; i < n; i++ { + j := i << 2 + w0, br := subU64(v[j], s0, 0) + w1, br := subU64(v[j+1], s1, br) + w2, br := subU64(v[j+2], s2, br) + w3, _ := subU64(v[j+3], s3, br) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + } +} + +func avx512D256ScalarSubUnchecked(s0, s1, s2, s3 uint64, v, r []uint64) { + n := len(r) / 4 + if n == 0 || len(v) < 4*n { + return + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x8(signBit128) + zero := archsimd.BroadcastInt64x8(0) + aW0 := archsimd.BroadcastInt64x8(int64(s0)) + aW1 := 
archsimd.BroadcastInt64x8(int64(s1)) + aW2 := archsimd.BroadcastInt64x8(int64(s2)) + aW3 := archsimd.BroadcastInt64x8(int64(s3)) + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 32 + v0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+64))) + v2 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+128))) + v3 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+192))) + + bW0, bW1, bW2, bW3 := transpose8x4Forward(v0, v1, v2, v3) + + rW0, b0 := avx512SubBorrowStage(aW0, bW0, zero, sb, zero) + rW1, b1 := avx512SubBorrowStage(aW1, bW1, b0, sb, zero) + rW2, b2 := avx512SubBorrowStage(aW2, bW2, b1, sb, zero) + rW3 := avx512SubBorrowStageNoOut(aW3, bW3, b2) + + rV0, rV1, rV2, rV3 := transpose8x4Inverse(rW0, rW1, rW2, rW3) + rV0.Store((*[8]int64)(unsafe.Add(pr, off))) + rV1.Store((*[8]int64)(unsafe.Add(pr, off+64))) + rV2.Store((*[8]int64)(unsafe.Add(pr, off+128))) + rV3.Store((*[8]int64)(unsafe.Add(pr, off+192))) + } + for ; i < n; i++ { + j := i << 2 + w0, br := subU64(s0, v[j], 0) + w1, br := subU64(s1, v[j+1], br) + w2, br := subU64(s2, v[j+2], br) + w3, _ := subU64(s3, v[j+3], br) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + } +} + +func avx512D256AddScalarChecked(s0, s1, s2, s3 uint64, v, r []uint64) int { + n := len(r) / 4 + if n == 0 || len(v) < 4*n { + return -1 + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x8(signBit128) + zero := archsimd.BroadcastInt64x8(0) + bW0 := archsimd.BroadcastInt64x8(int64(s0)) + bW1 := archsimd.BroadcastInt64x8(int64(s1)) + bW2 := archsimd.BroadcastInt64x8(int64(s2)) + bW3 := archsimd.BroadcastInt64x8(int64(s3)) + + var ofAcc archsimd.Int64x8 + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 32 + v0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+64))) + v2 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+128))) + v3 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+192))) + + aW0, aW1, aW2, aW3 := transpose8x4Forward(v0, v1, v2, v3) + + rW0, c0 := avx512AddCarryStage(aW0, bW0, zero, sb) + rW1, c1 := avx512AddCarryStage(aW1, bW1, c0, sb) + rW2, c2 := avx512AddCarryStage(aW2, bW2, c1, sb) + rW3 := avx512AddCarryStageNoOut(aW3, bW3, c2) + ofAcc = ofAcc.Or(aW3.Xor(bW3).AndNot(aW3.Xor(rW3))) + + rV0, rV1, rV2, rV3 := transpose8x4Inverse(rW0, rW1, rW2, rW3) + rV0.Store((*[8]int64)(unsafe.Add(pr, off))) + rV1.Store((*[8]int64)(unsafe.Add(pr, off+64))) + rV2.Store((*[8]int64)(unsafe.Add(pr, off+128))) + rV3.Store((*[8]int64)(unsafe.Add(pr, off+192))) + } + vecEnd := i + + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + sh := int64(s3) + for ; i < n; i++ { + j := i << 2 + vHi := v[j+3] + w0, c := addU64(s0, v[j], 0) + w1, c := addU64(s1, v[j+1], c) + w2, c := addU64(s2, v[j+2], c) + w3, _ := addU64(s3, vHi, c) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + vh, rh := int64(vHi), int64(w3) + if (sh^rh)&^(sh^vh) < 0 { + if vecOverflow { + return d256ScalarFirstOverflow(s0, s1, s2, s3, v, vecEnd, 0) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d256ScalarFirstOverflow(s0, s1, s2, s3, v, vecEnd, 0) +} + +func avx512D256SubScalarChecked(v []uint64, s0, s1, s2, s3 uint64, r []uint64) int { + n := len(r) / 4 + if n == 0 || len(v) < 4*n { + return -1 + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x8(signBit128) + zero := archsimd.BroadcastInt64x8(0) + bW0 := 
archsimd.BroadcastInt64x8(int64(s0)) + bW1 := archsimd.BroadcastInt64x8(int64(s1)) + bW2 := archsimd.BroadcastInt64x8(int64(s2)) + bW3 := archsimd.BroadcastInt64x8(int64(s3)) + + var ofAcc archsimd.Int64x8 + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 32 + v0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+64))) + v2 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+128))) + v3 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+192))) + + aW0, aW1, aW2, aW3 := transpose8x4Forward(v0, v1, v2, v3) + + rW0, b0 := avx512SubBorrowStage(aW0, bW0, zero, sb, zero) + rW1, b1 := avx512SubBorrowStage(aW1, bW1, b0, sb, zero) + rW2, b2 := avx512SubBorrowStage(aW2, bW2, b1, sb, zero) + rW3 := avx512SubBorrowStageNoOut(aW3, bW3, b2) + ofAcc = ofAcc.Or(aW3.Xor(rW3).And(aW3.Xor(bW3))) + + rV0, rV1, rV2, rV3 := transpose8x4Inverse(rW0, rW1, rW2, rW3) + rV0.Store((*[8]int64)(unsafe.Add(pr, off))) + rV1.Store((*[8]int64)(unsafe.Add(pr, off+64))) + rV2.Store((*[8]int64)(unsafe.Add(pr, off+128))) + rV3.Store((*[8]int64)(unsafe.Add(pr, off+192))) + } + vecEnd := i + + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + sh := int64(s3) + for ; i < n; i++ { + j := i << 2 + vHi := v[j+3] + w0, br := subU64(v[j], s0, 0) + w1, br := subU64(v[j+1], s1, br) + w2, br := subU64(v[j+2], s2, br) + w3, _ := subU64(vHi, s3, br) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + vh, rh := int64(vHi), int64(w3) + if (vh^rh)&(vh^sh) < 0 { + if vecOverflow { + return d256ScalarFirstOverflow(s0, s1, s2, s3, v, vecEnd, 1) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d256ScalarFirstOverflow(s0, s1, s2, s3, v, vecEnd, 1) +} + +func avx512D256ScalarSubChecked(s0, s1, s2, s3 uint64, v, r []uint64) int { + n := len(r) / 4 + if n == 0 || len(v) < 4*n { + return -1 + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sb := archsimd.BroadcastInt64x8(signBit128) + zero := archsimd.BroadcastInt64x8(0) + aW0 := archsimd.BroadcastInt64x8(int64(s0)) + aW1 := archsimd.BroadcastInt64x8(int64(s1)) + aW2 := archsimd.BroadcastInt64x8(int64(s2)) + aW3 := archsimd.BroadcastInt64x8(int64(s3)) + + var ofAcc archsimd.Int64x8 + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 32 + v0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+64))) + v2 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+128))) + v3 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+192))) + + bW0, bW1, bW2, bW3 := transpose8x4Forward(v0, v1, v2, v3) + + rW0, b0 := avx512SubBorrowStage(aW0, bW0, zero, sb, zero) + rW1, b1 := avx512SubBorrowStage(aW1, bW1, b0, sb, zero) + rW2, b2 := avx512SubBorrowStage(aW2, bW2, b1, sb, zero) + rW3 := avx512SubBorrowStageNoOut(aW3, bW3, b2) + ofAcc = ofAcc.Or(aW3.Xor(rW3).And(aW3.Xor(bW3))) + + rV0, rV1, rV2, rV3 := transpose8x4Inverse(rW0, rW1, rW2, rW3) + rV0.Store((*[8]int64)(unsafe.Add(pr, off))) + rV1.Store((*[8]int64)(unsafe.Add(pr, off+64))) + rV2.Store((*[8]int64)(unsafe.Add(pr, off+128))) + rV3.Store((*[8]int64)(unsafe.Add(pr, off+192))) + } + vecEnd := i + + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + sh := int64(s3) + for ; i < n; i++ { + j := i << 2 + vHi := v[j+3] + w0, br := subU64(s0, v[j], 0) + w1, br := subU64(s1, v[j+1], br) + w2, br := subU64(s2, v[j+2], br) + w3, _ := subU64(s3, vHi, br) + r[j], r[j+1], r[j+2], r[j+3] = w0, w1, w2, w3 + vh, rh := int64(vHi), int64(w3) + if (sh^rh)&(sh^vh) < 0 { + if 
vecOverflow { + return d256ScalarFirstOverflow(s0, s1, s2, s3, v, vecEnd, 2) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d256ScalarFirstOverflow(s0, s1, s2, s3, v, vecEnd, 2) +} diff --git a/pkg/common/simdkernels/d256_addsub_test.go b/pkg/common/simdkernels/d256_addsub_test.go new file mode 100644 index 0000000000000..c10b0b1cfe05a --- /dev/null +++ b/pkg/common/simdkernels/d256_addsub_test.go @@ -0,0 +1,670 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build goexperiment.simd && amd64 + +package simdkernels + +import ( + "math" + "math/rand/v2" + "strconv" + "testing" + + "golang.org/x/sys/cpu" +) + +// All slice lengths below count Decimal256 elements (each backed by 4 uint64). + +type d256UncheckedImpl struct { + name string + fn func(a, b, r []uint64) +} + +type d256CheckedImpl struct { + name string + fn func(a, b, r []uint64) int +} + +func d256Sizes() []int { + return []int{0, 1, 2, 3, 4, 5, 7, 8, 9, 15, 16, 17, 31, 32, 33, 35, 63, 64, 127, 128, 1023, 2048} +} + +func makeRandD256(n int, seed uint64) []uint64 { + rng := rand.New(rand.NewPCG(seed, seed^0xDEADBEEFCAFEBABE)) + out := make([]uint64, 4*n) + for i := range out { + out[i] = rng.Uint64() + } + return out +} + +// makeRand256SmallSigned clears the top two bits of the high word, so the +// high word of every value stays below 2^62 and 256-bit add/sub of any two +// such values cannot overflow signed. 
+func makeRand256SmallSigned(n int, seed uint64) []uint64 { + out := makeRandD256(n, seed) + for i := 3; i < len(out); i += 4 { + out[i] &= 0x3FFFFFFFFFFFFFFF + } + return out +} + +func TestD256AddVariants(t *testing.T) { + impls := []d256UncheckedImpl{ + {"scalar", scalarD256AddUnchecked}, + {"avx2", avx2D256AddUnchecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d256UncheckedImpl{"avx512", avx512D256AddUnchecked}) + } + for _, n := range d256Sizes() { + a := makeRandD256(n, uint64(n)*7+1) + b := makeRandD256(n, uint64(n)*11+3) + want := make([]uint64, 4*n) + scalarD256AddUnchecked(a, b, want) + for _, impl := range impls { + got := make([]uint64, 4*n) + impl.fn(a, b, got) + for i := 0; i < 4*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d slot=%d: got %x want %x", impl.name, n, i, got[i], want[i]) + } + } + } + } +} + +func TestD256SubVariants(t *testing.T) { + impls := []d256UncheckedImpl{ + {"scalar", scalarD256SubUnchecked}, + {"avx2", avx2D256SubUnchecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d256UncheckedImpl{"avx512", avx512D256SubUnchecked}) + } + for _, n := range d256Sizes() { + a := makeRandD256(n, uint64(n)*13+5) + b := makeRandD256(n, uint64(n)*17+9) + want := make([]uint64, 4*n) + scalarD256SubUnchecked(a, b, want) + for _, impl := range impls { + got := make([]uint64, 4*n) + impl.fn(a, b, got) + for i := 0; i < 4*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d slot=%d: got %x want %x", impl.name, n, i, got[i], want[i]) + } + } + } + } +} + +func TestD256AddCheckedVariants(t *testing.T) { + impls := []d256CheckedImpl{ + {"scalar", scalarD256AddChecked}, + {"avx2", avx2D256AddChecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d256CheckedImpl{"avx512", avx512D256AddChecked}) + } + + // 1) No-overflow random inputs. + for _, n := range d256Sizes() { + a := makeRand256SmallSigned(n, uint64(n)*19+7) + b := makeRand256SmallSigned(n, uint64(n)*23+11) + want := make([]uint64, 4*n) + if got := scalarD256AddChecked(a, b, want); got != -1 { + t.Fatalf("setup: scalar overflow at %d for masked input n=%d", got, n) + } + for _, impl := range impls { + got := make([]uint64, 4*n) + if idx := impl.fn(a, b, got); idx != -1 { + t.Fatalf("%s n=%d: spurious overflow at %d", impl.name, n, idx) + } + for i := 0; i < 4*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d slot=%d: got %x want %x", impl.name, n, i, got[i], want[i]) + } + } + } + } + + // 2) Inject MaxInt256 + 1 overflow at varying positions. + for _, n := range []int{4, 8, 9, 16, 17, 33, 35, 64} { + for _, pos := range []int{0, 1, 3, 4, 7, 8, n - 1} { + if pos < 0 || pos >= n { + continue + } + a := make([]uint64, 4*n) + b := make([]uint64, 4*n) + j := pos << 2 + a[j] = math.MaxUint64 + a[j+1] = math.MaxUint64 + a[j+2] = math.MaxUint64 + a[j+3] = uint64(math.MaxInt64) // a = MaxInt256 + b[j] = 1 + // b = 1 + for _, impl := range impls { + got := make([]uint64, 4*n) + idx := impl.fn(a, b, got) + if idx != pos { + t.Fatalf("%s n=%d inject pos=%d: got idx %d", impl.name, n, pos, idx) + } + } + } + } + + // 3) Carry propagation cascading w0→w1→w2→w3. 
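+	// Each a[i] is (MaxUint64, MaxUint64, MaxUint64, i) and b[i] = 1, so the
+	// +1 wraps all three low words to zero and carries into the top word
+	// (i → i+1): the full carry chain fires with no signed overflow.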
+ for _, n := range []int{4, 8, 16, 17, 33} { + a := make([]uint64, 4*n) + b := make([]uint64, 4*n) + for i := 0; i < n; i++ { + j := i << 2 + a[j] = math.MaxUint64 + a[j+1] = math.MaxUint64 + a[j+2] = math.MaxUint64 + a[j+3] = uint64(i) // small positive top + b[j] = 1 + } + want := make([]uint64, 4*n) + scalarD256AddChecked(a, b, want) + for _, impl := range impls { + got := make([]uint64, 4*n) + if idx := impl.fn(a, b, got); idx != -1 { + t.Fatalf("%s carry n=%d: spurious overflow at %d", impl.name, n, idx) + } + for i := 0; i < 4*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s carry n=%d slot=%d: got %x want %x", impl.name, n, i, got[i], want[i]) + } + } + } + } +} + +func TestD256SubCheckedVariants(t *testing.T) { + impls := []d256CheckedImpl{ + {"scalar", scalarD256SubChecked}, + {"avx2", avx2D256SubChecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d256CheckedImpl{"avx512", avx512D256SubChecked}) + } + + for _, n := range d256Sizes() { + a := makeRand256SmallSigned(n, uint64(n)*29+13) + b := makeRand256SmallSigned(n, uint64(n)*31+17) + want := make([]uint64, 4*n) + if got := scalarD256SubChecked(a, b, want); got != -1 { + t.Fatalf("setup: scalar overflow at %d for n=%d", got, n) + } + for _, impl := range impls { + got := make([]uint64, 4*n) + if idx := impl.fn(a, b, got); idx != -1 { + t.Fatalf("%s n=%d: spurious overflow at %d", impl.name, n, idx) + } + for i := 0; i < 4*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d slot=%d: got %x want %x", impl.name, n, i, got[i], want[i]) + } + } + } + } + + // Inject MinInt256 - 1 overflow at varying positions. + for _, n := range []int{4, 8, 9, 16, 17, 33, 35, 64} { + for _, pos := range []int{0, 1, 3, 4, 7, 8, n - 1} { + if pos < 0 || pos >= n { + continue + } + a := make([]uint64, 4*n) + b := make([]uint64, 4*n) + j := pos << 2 + // a = MinInt256 + a[j+3] = 1 << 63 + b[j] = 1 // b = 1 + for _, impl := range impls { + got := make([]uint64, 4*n) + idx := impl.fn(a, b, got) + if idx != pos { + t.Fatalf("%s n=%d inject pos=%d: got idx %d", impl.name, n, pos, idx) + } + } + } + } + + // Borrow propagation cascading w0→w1→w2→w3. 
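+	// Here a[i] = (0, 0, 0, i+10) and b[i] = 1, so subtracting 1 borrows
+	// through all three zero low words and drops the top word to i+9; the
+	// result stays positive, so no overflow may be reported.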
+ for _, n := range []int{4, 8, 16, 17, 33} { + a := make([]uint64, 4*n) + b := make([]uint64, 4*n) + for i := 0; i < n; i++ { + j := i << 2 + a[j+3] = uint64(i + 10) // safely positive after borrow + b[j] = 1 + } + want := make([]uint64, 4*n) + scalarD256SubChecked(a, b, want) + for _, impl := range impls { + got := make([]uint64, 4*n) + if idx := impl.fn(a, b, got); idx != -1 { + t.Fatalf("%s borrow n=%d: spurious overflow at %d", impl.name, n, idx) + } + for i := 0; i < 4*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s borrow n=%d slot=%d: got %x want %x", impl.name, n, i, got[i], want[i]) + } + } + } + } +} + +// --------------------------------------------------------------------------- +// Benchmarks +// --------------------------------------------------------------------------- + +var d256BenchSizes = []int{16, 64, 256, 1024, 4096} + +func benchD256Unchecked(b *testing.B, fn func(a, bb, r []uint64), n int) { + a := makeRandD256(n, 1) + bb := makeRandD256(n, 2) + r := make([]uint64, 4*n) + b.SetBytes(int64(n) * 32 * 3) + b.ResetTimer() + for i := 0; i < b.N; i++ { + fn(a, bb, r) + } +} + +func benchD256Checked(b *testing.B, fn func(a, bb, r []uint64) int, n int) { + a := makeRand256SmallSigned(n, 1) + bb := makeRand256SmallSigned(n, 2) + r := make([]uint64, 4*n) + b.SetBytes(int64(n) * 32 * 3) + b.ResetTimer() + for i := 0; i < b.N; i++ { + _ = fn(a, bb, r) + } +} + +func BenchmarkD256AddUnchecked(b *testing.B) { + for _, n := range d256BenchSizes { + b.Run("scalar/n="+strconv.Itoa(n), func(b *testing.B) { benchD256Unchecked(b, scalarD256AddUnchecked, n) }) + b.Run("avx2/n="+strconv.Itoa(n), func(b *testing.B) { benchD256Unchecked(b, avx2D256AddUnchecked, n) }) + if cpu.X86.HasAVX512 { + b.Run("avx512/n="+strconv.Itoa(n), func(b *testing.B) { benchD256Unchecked(b, avx512D256AddUnchecked, n) }) + } + } +} + +func BenchmarkD256SubUnchecked(b *testing.B) { + for _, n := range d256BenchSizes { + b.Run("scalar/n="+strconv.Itoa(n), func(b *testing.B) { benchD256Unchecked(b, scalarD256SubUnchecked, n) }) + b.Run("avx2/n="+strconv.Itoa(n), func(b *testing.B) { benchD256Unchecked(b, avx2D256SubUnchecked, n) }) + if cpu.X86.HasAVX512 { + b.Run("avx512/n="+strconv.Itoa(n), func(b *testing.B) { benchD256Unchecked(b, avx512D256SubUnchecked, n) }) + } + } +} + +func BenchmarkD256AddChecked(b *testing.B) { + for _, n := range d256BenchSizes { + b.Run("scalar/n="+strconv.Itoa(n), func(b *testing.B) { benchD256Checked(b, scalarD256AddChecked, n) }) + b.Run("avx2/n="+strconv.Itoa(n), func(b *testing.B) { benchD256Checked(b, avx2D256AddChecked, n) }) + if cpu.X86.HasAVX512 { + b.Run("avx512/n="+strconv.Itoa(n), func(b *testing.B) { benchD256Checked(b, avx512D256AddChecked, n) }) + } + } +} + +func BenchmarkD256SubChecked(b *testing.B) { + for _, n := range d256BenchSizes { + b.Run("scalar/n="+strconv.Itoa(n), func(b *testing.B) { benchD256Checked(b, scalarD256SubChecked, n) }) + b.Run("avx2/n="+strconv.Itoa(n), func(b *testing.B) { benchD256Checked(b, avx2D256SubChecked, n) }) + if cpu.X86.HasAVX512 { + b.Run("avx512/n="+strconv.Itoa(n), func(b *testing.B) { benchD256Checked(b, avx512D256SubChecked, n) }) + } + } +} + +// --------------------------------------------------------------------------- +// Scalar-broadcast tests (D256 = 4 uint64/elem; scalar = (s0, s1, s2, s3)). 
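+// Word order matches the kernel layout: s0 is the least-significant q-word
+// and s3 the most-significant (sign-carrying) q-word of the broadcast value.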
+// --------------------------------------------------------------------------- + +type d256ScalarVUImpl struct { + name string + fn func(s0, s1, s2, s3 uint64, v, r []uint64) +} + +type d256ScalarVCImpl struct { + name string + fn func(s0, s1, s2, s3 uint64, v, r []uint64) int +} + +type d256VScalarUImpl struct { + name string + fn func(v []uint64, s0, s1, s2, s3 uint64, r []uint64) +} + +type d256VScalarCImpl struct { + name string + fn func(v []uint64, s0, s1, s2, s3 uint64, r []uint64) int +} + +func d256Scalars() []struct{ s0, s1, s2, s3 uint64 } { + return []struct{ s0, s1, s2, s3 uint64 }{ + {0, 0, 0, 0}, + {1, 0, 0, 0}, + {math.MaxUint64, 0, 0, 0}, + {0, 0, 0, 1}, + {0xDEADBEEFCAFEBABE, 0x123456789ABCDEF0, 0xFEEDFACEDEADBEEF, 0x0123456789ABCDEF}, + {math.MaxUint64, math.MaxUint64, math.MaxUint64, uint64(math.MaxInt64)}, + {0, 0, 0, 1 << 63}, + } +} + +func TestD256AddScalarVariants(t *testing.T) { + impls := []d256ScalarVUImpl{ + {"scalar", scalarD256AddScalarUnchecked}, + {"avx2", avx2D256AddScalarUnchecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d256ScalarVUImpl{"avx512", avx512D256AddScalarUnchecked}) + } + for _, n := range d256Sizes() { + v := makeRandD256(n, uint64(n)*37+1) + for si, s := range d256Scalars() { + want := make([]uint64, 4*n) + scalarD256AddScalarUnchecked(s.s0, s.s1, s.s2, s.s3, v, want) + for _, impl := range impls { + got := make([]uint64, 4*n) + impl.fn(s.s0, s.s1, s.s2, s.s3, v, got) + for i := 0; i < 4*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d scalar#%d slot=%d: got %x want %x", impl.name, n, si, i, got[i], want[i]) + } + } + } + } + } +} + +func TestD256SubScalarVariants(t *testing.T) { + impls := []d256VScalarUImpl{ + {"scalar", scalarD256SubScalarUnchecked}, + {"avx2", avx2D256SubScalarUnchecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d256VScalarUImpl{"avx512", avx512D256SubScalarUnchecked}) + } + for _, n := range d256Sizes() { + v := makeRandD256(n, uint64(n)*41+3) + for si, s := range d256Scalars() { + want := make([]uint64, 4*n) + scalarD256SubScalarUnchecked(v, s.s0, s.s1, s.s2, s.s3, want) + for _, impl := range impls { + got := make([]uint64, 4*n) + impl.fn(v, s.s0, s.s1, s.s2, s.s3, got) + for i := 0; i < 4*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d scalar#%d slot=%d: got %x want %x", impl.name, n, si, i, got[i], want[i]) + } + } + } + } + } +} + +func TestD256ScalarSubVariants(t *testing.T) { + impls := []d256ScalarVUImpl{ + {"scalar", scalarD256ScalarSubUnchecked}, + {"avx2", avx2D256ScalarSubUnchecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d256ScalarVUImpl{"avx512", avx512D256ScalarSubUnchecked}) + } + for _, n := range d256Sizes() { + v := makeRandD256(n, uint64(n)*43+5) + for si, s := range d256Scalars() { + want := make([]uint64, 4*n) + scalarD256ScalarSubUnchecked(s.s0, s.s1, s.s2, s.s3, v, want) + for _, impl := range impls { + got := make([]uint64, 4*n) + impl.fn(s.s0, s.s1, s.s2, s.s3, v, got) + for i := 0; i < 4*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d scalar#%d slot=%d: got %x want %x", impl.name, n, si, i, got[i], want[i]) + } + } + } + } + } +} + +func TestD256AddScalarCheckedVariants(t *testing.T) { + impls := []d256ScalarVCImpl{ + {"scalar", scalarD256AddScalarChecked}, + {"avx2", avx2D256AddScalarChecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d256ScalarVCImpl{"avx512", avx512D256AddScalarChecked}) + } + + smallScalars := []struct{ s0, s1, s2, s3 uint64 }{ + {0, 0, 0, 0}, + {1, 0, 0, 0}, + {0xDEADBEEFCAFEBABE, 0x123456789ABCDEF0, 
0xFEEDFACEDEADBEEF, 0x0123456789ABCDEF}, + } + for _, n := range d256Sizes() { + v := makeRand256SmallSigned(n, uint64(n)*47+7) + for si, s := range smallScalars { + want := make([]uint64, 4*n) + if got := scalarD256AddScalarChecked(s.s0, s.s1, s.s2, s.s3, v, want); got != -1 { + t.Fatalf("setup overflow at %d for n=%d scalar#%d", got, n, si) + } + for _, impl := range impls { + got := make([]uint64, 4*n) + if idx := impl.fn(s.s0, s.s1, s.s2, s.s3, v, got); idx != -1 { + t.Fatalf("%s n=%d scalar#%d: spurious overflow at %d", impl.name, n, si, idx) + } + for i := 0; i < 4*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d scalar#%d slot=%d: got %x want %x", impl.name, n, si, i, got[i], want[i]) + } + } + } + } + } + + // Inject overflow: scalar = 1, v[pos] = MaxInt256. + for _, n := range []int{4, 8, 9, 16, 17, 33, 35, 64} { + for _, pos := range []int{0, 1, 3, 4, 7, 8, n - 1} { + if pos < 0 || pos >= n { + continue + } + v := make([]uint64, 4*n) + j := pos << 2 + v[j] = math.MaxUint64 + v[j+1] = math.MaxUint64 + v[j+2] = math.MaxUint64 + v[j+3] = uint64(math.MaxInt64) + for _, impl := range impls { + got := make([]uint64, 4*n) + idx := impl.fn(1, 0, 0, 0, v, got) + if idx != pos { + t.Fatalf("%s n=%d pos=%d: got idx %d", impl.name, n, pos, idx) + } + } + } + } +} + +func TestD256SubScalarCheckedVariants(t *testing.T) { + impls := []d256VScalarCImpl{ + {"scalar", scalarD256SubScalarChecked}, + {"avx2", avx2D256SubScalarChecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d256VScalarCImpl{"avx512", avx512D256SubScalarChecked}) + } + + smallScalars := []struct{ s0, s1, s2, s3 uint64 }{ + {0, 0, 0, 0}, + {1, 0, 0, 0}, + {0xDEADBEEFCAFEBABE, 0x123456789ABCDEF0, 0xFEEDFACEDEADBEEF, 0x0123456789ABCDEF}, + } + for _, n := range d256Sizes() { + v := makeRand256SmallSigned(n, uint64(n)*53+11) + for si, s := range smallScalars { + want := make([]uint64, 4*n) + if got := scalarD256SubScalarChecked(v, s.s0, s.s1, s.s2, s.s3, want); got != -1 { + t.Fatalf("setup overflow at %d for n=%d scalar#%d", got, n, si) + } + for _, impl := range impls { + got := make([]uint64, 4*n) + if idx := impl.fn(v, s.s0, s.s1, s.s2, s.s3, got); idx != -1 { + t.Fatalf("%s n=%d scalar#%d: spurious overflow at %d", impl.name, n, si, idx) + } + for i := 0; i < 4*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d scalar#%d slot=%d: got %x want %x", impl.name, n, si, i, got[i], want[i]) + } + } + } + } + } + + // Inject: v[pos] = MinInt256, scalar = 1. 
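+	// v[pos] - 1 wraps from MinInt256 to MaxInt256: the sign bit of the top
+	// word flips from 1 to 0, which is exactly the case the (v^r)&(v^s)
+	// sign test in the checked kernels catches.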
+ for _, n := range []int{4, 8, 9, 16, 17, 33, 35, 64} { + for _, pos := range []int{0, 1, 3, 4, 7, 8, n - 1} { + if pos < 0 || pos >= n { + continue + } + v := make([]uint64, 4*n) + j := pos << 2 + v[j+3] = 1 << 63 + for _, impl := range impls { + got := make([]uint64, 4*n) + idx := impl.fn(v, 1, 0, 0, 0, got) + if idx != pos { + t.Fatalf("%s n=%d pos=%d: got idx %d", impl.name, n, pos, idx) + } + } + } + } +} + +func TestD256ScalarSubCheckedVariants(t *testing.T) { + impls := []d256ScalarVCImpl{ + {"scalar", scalarD256ScalarSubChecked}, + {"avx2", avx2D256ScalarSubChecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d256ScalarVCImpl{"avx512", avx512D256ScalarSubChecked}) + } + + smallScalars := []struct{ s0, s1, s2, s3 uint64 }{ + {0, 0, 0, 0}, + {1, 0, 0, 0}, + {0xDEADBEEFCAFEBABE, 0x123456789ABCDEF0, 0xFEEDFACEDEADBEEF, 0x0123456789ABCDEF}, + } + for _, n := range d256Sizes() { + v := makeRand256SmallSigned(n, uint64(n)*59+13) + for si, s := range smallScalars { + want := make([]uint64, 4*n) + if got := scalarD256ScalarSubChecked(s.s0, s.s1, s.s2, s.s3, v, want); got != -1 { + t.Fatalf("setup overflow at %d for n=%d scalar#%d", got, n, si) + } + for _, impl := range impls { + got := make([]uint64, 4*n) + if idx := impl.fn(s.s0, s.s1, s.s2, s.s3, v, got); idx != -1 { + t.Fatalf("%s n=%d scalar#%d: spurious overflow at %d", impl.name, n, si, idx) + } + for i := 0; i < 4*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d scalar#%d slot=%d: got %x want %x", impl.name, n, si, i, got[i], want[i]) + } + } + } + } + } + + // Inject: scalar = MinInt256, v[pos] = 1 ⇒ MinInt256-1 overflows. + for _, n := range []int{4, 8, 9, 16, 17, 33, 35, 64} { + for _, pos := range []int{0, 1, 3, 4, 7, 8, n - 1} { + if pos < 0 || pos >= n { + continue + } + v := make([]uint64, 4*n) + j := pos << 2 + v[j] = 1 + for _, impl := range impls { + got := make([]uint64, 4*n) + idx := impl.fn(0, 0, 0, 1<<63, v, got) + if idx != pos { + t.Fatalf("%s n=%d pos=%d: got idx %d", impl.name, n, pos, idx) + } + } + } + } +} + +func benchD256AddScalarU(b *testing.B, fn func(s0, s1, s2, s3 uint64, v, r []uint64), n int) { + v := makeRandD256(n, 1) + r := make([]uint64, 4*n) + b.SetBytes(int64(n) * 32 * 2) + b.ResetTimer() + for i := 0; i < b.N; i++ { + fn(0xDEADBEEFCAFEBABE, 0x123456789ABCDEF0, 0xFEEDFACEDEADBEEF, 0x0123456789ABCDEF, v, r) + } +} + +func benchD256SubScalarU(b *testing.B, fn func(v []uint64, s0, s1, s2, s3 uint64, r []uint64), n int) { + v := makeRandD256(n, 1) + r := make([]uint64, 4*n) + b.SetBytes(int64(n) * 32 * 2) + b.ResetTimer() + for i := 0; i < b.N; i++ { + fn(v, 0xDEADBEEFCAFEBABE, 0x123456789ABCDEF0, 0xFEEDFACEDEADBEEF, 0x0123456789ABCDEF, r) + } +} + +func BenchmarkD256AddScalarUnchecked(b *testing.B) { + for _, n := range d256BenchSizes { + b.Run("scalar/n="+strconv.Itoa(n), func(b *testing.B) { benchD256AddScalarU(b, scalarD256AddScalarUnchecked, n) }) + b.Run("avx2/n="+strconv.Itoa(n), func(b *testing.B) { benchD256AddScalarU(b, avx2D256AddScalarUnchecked, n) }) + if cpu.X86.HasAVX512 { + b.Run("avx512/n="+strconv.Itoa(n), func(b *testing.B) { benchD256AddScalarU(b, avx512D256AddScalarUnchecked, n) }) + } + } +} + +func BenchmarkD256SubScalarUnchecked(b *testing.B) { + for _, n := range d256BenchSizes { + b.Run("scalar/n="+strconv.Itoa(n), func(b *testing.B) { benchD256SubScalarU(b, scalarD256SubScalarUnchecked, n) }) + b.Run("avx2/n="+strconv.Itoa(n), func(b *testing.B) { benchD256SubScalarU(b, avx2D256SubScalarUnchecked, n) }) + if cpu.X86.HasAVX512 { + 
b.Run("avx512/n="+strconv.Itoa(n), func(b *testing.B) { benchD256SubScalarU(b, avx512D256SubScalarUnchecked, n) }) + } + } +} + +func BenchmarkD256ScalarSubUnchecked(b *testing.B) { + for _, n := range d256BenchSizes { + b.Run("scalar/n="+strconv.Itoa(n), func(b *testing.B) { benchD256AddScalarU(b, scalarD256ScalarSubUnchecked, n) }) + b.Run("avx2/n="+strconv.Itoa(n), func(b *testing.B) { benchD256AddScalarU(b, avx2D256ScalarSubUnchecked, n) }) + if cpu.X86.HasAVX512 { + b.Run("avx512/n="+strconv.Itoa(n), func(b *testing.B) { benchD256AddScalarU(b, avx512D256ScalarSubUnchecked, n) }) + } + } +} diff --git a/pkg/common/simdkernels/d256_negabs.go b/pkg/common/simdkernels/d256_negabs.go new file mode 100644 index 0000000000000..5e8141a39b513 --- /dev/null +++ b/pkg/common/simdkernels/d256_negabs.go @@ -0,0 +1,78 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package simdkernels + +import "math/bits" + +// Decimal256 element-wise negate / absolute value on slices of uint64 with +// the matrixone Decimal256 layout (4 uint64 per element, low to high). The +// src and dst slices both have length 4*N. dst may alias src. +// +// Both ops use 256-bit two's complement (~x + 1). Negate is unconditional; +// Abs is conditional on the sign bit of the topmost word. MinInt256 wraps to +// itself, matching the scalar SQL semantics in arith_decimal_fast.go. + +var ( + D256Negate func(src, dst []uint64) = scalarD256Negate + D256Abs func(src, dst []uint64) = scalarD256Abs +) + +func scalarD256Negate(src, dst []uint64) { + n := len(dst) / 4 + if len(src) < 4*n { + return + } + for i := 0; i < n; i++ { + j := i << 2 + w0 := ^src[j] + w1 := ^src[j+1] + w2 := ^src[j+2] + w3 := ^src[j+3] + var c uint64 + w0, c = bits.Add64(w0, 1, 0) + w1, c = bits.Add64(w1, 0, c) + w2, c = bits.Add64(w2, 0, c) + w3, _ = bits.Add64(w3, 0, c) + dst[j] = w0 + dst[j+1] = w1 + dst[j+2] = w2 + dst[j+3] = w3 + } +} + +func scalarD256Abs(src, dst []uint64) { + n := len(dst) / 4 + if len(src) < 4*n { + return + } + for i := 0; i < n; i++ { + j := i << 2 + w0, w1, w2, w3 := src[j], src[j+1], src[j+2], src[j+3] + sign := uint64(int64(w3) >> 63) // 0 or all-ones + w0 ^= sign + w1 ^= sign + w2 ^= sign + w3 ^= sign + var c uint64 + w0, c = bits.Add64(w0, sign&1, 0) + w1, c = bits.Add64(w1, 0, c) + w2, c = bits.Add64(w2, 0, c) + w3, _ = bits.Add64(w3, 0, c) + dst[j] = w0 + dst[j+1] = w1 + dst[j+2] = w2 + dst[j+3] = w3 + } +} diff --git a/pkg/common/simdkernels/d256_negabs_simd_amd64.go b/pkg/common/simdkernels/d256_negabs_simd_amd64.go new file mode 100644 index 0000000000000..4dde4374399c3 --- /dev/null +++ b/pkg/common/simdkernels/d256_negabs_simd_amd64.go @@ -0,0 +1,133 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build goexperiment.simd && amd64 + +package simdkernels + +import ( + "math/bits" + "simd/archsimd" + "unsafe" + + "golang.org/x/sys/cpu" +) + +// d256_negabs_simd_amd64.go: SIMD batch negate / abs for Decimal256. +// +// Same conditional-negate idiom as d128_negabs but with 4 stages of carry +// propagation across the 4 words of each Decimal256: +// +// mask = -m (-1 if negating this lane, 0 otherwise) +// wBar = w XOR mask +// stage 0: r0 = wBar0 - mask // mask supplies the +1 only when negating +// stage k (k>0): rk = wBark - cIn // cIn is 0 or -1 carry from stage k-1 +// carry out of any stage k: rk wraps unsigned ⇔ rk <_unsigned wBar_k +// top stage: drop cOut. +// +// Layout: each Decimal256 is 4 q-words = 32 B. Process 4 elements per AVX2 +// iter (= 16 q-words = 128 B), 8 per AVX-512 iter. Reuse transpose4x4 from +// d256_addsub_simd_amd64.go for AoS↔SoA conversion. + +func init() { + // AVX-512 D256 transpose would need a custom 8×4 ConcatPermute layout + // and an inverse for the writeback; deferred. AVX2 path runs on AVX-512 + // hosts as well (still 4 elements per iter). + if cpu.X86.HasAVX2 { + D256Negate = avx2D256Negate + D256Abs = avx2D256Abs + } +} + +// --------------------------------------------------------------------------- +// AVX2 (Int64x4) implementation +// --------------------------------------------------------------------------- + +//go:nosplit +func avx2D256NegStage(w, mask, cIn, sb archsimd.Int64x4) (r, cOut archsimd.Int64x4) { + wBar := w.Xor(mask) + r = wBar.Sub(cIn) + cOut = r.Xor(sb).Less(wBar.Xor(sb)).ToInt64x4() + return +} + +//go:nosplit +func avx2D256NegStageNoOut(w, mask, cIn archsimd.Int64x4) archsimd.Int64x4 { + return w.Xor(mask).Sub(cIn) +} + +func avx2D256NegabsCore(src, dst []uint64, abs bool) { + n := len(dst) / 4 + if n == 0 || len(src) < 4*n { + return + } + ps, pd := unsafe.Pointer(&src[0]), unsafe.Pointer(&dst[0]) + sb := archsimd.BroadcastInt64x4(signBit128) + allOnes := archsimd.BroadcastInt64x4(-1) + zero := archsimd.BroadcastInt64x4(0) + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 32 + v0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(ps, off))) + v1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(ps, off+32))) + v2 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(ps, off+64))) + v3 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(ps, off+96))) + + w0, w1, w2, w3 := transpose4x4(v0, v1, v2, v3) + var mask archsimd.Int64x4 + if abs { + mask = w3.Less(zero).ToInt64x4() + } else { + mask = allOnes + } + + r0, c0 := avx2D256NegStage(w0, mask, mask, sb) + r1, c1 := avx2D256NegStage(w1, mask, c0, sb) + r2, c2 := avx2D256NegStage(w2, mask, c1, sb) + r3 := avx2D256NegStageNoOut(w3, mask, c2) + + rv0, rv1, rv2, rv3 := transpose4x4(r0, r1, r2, r3) + rv0.Store((*[4]int64)(unsafe.Add(pd, off))) + rv1.Store((*[4]int64)(unsafe.Add(pd, off+32))) + rv2.Store((*[4]int64)(unsafe.Add(pd, off+64))) + rv3.Store((*[4]int64)(unsafe.Add(pd, off+96))) + } + for ; i < n; i++ { + j := i << 2 + w0, w1, w2, w3 := src[j], src[j+1], src[j+2], src[j+3] + var sign uint64 + if abs { + sign = uint64(int64(w3) >> 63) + } else { + sign = 
^uint64(0) + } + w0 ^= sign + w1 ^= sign + w2 ^= sign + w3 ^= sign + var c uint64 + w0, c = bits.Add64(w0, sign&1, 0) + w1, c = bits.Add64(w1, 0, c) + w2, c = bits.Add64(w2, 0, c) + w3, _ = bits.Add64(w3, 0, c) + dst[j] = w0 + dst[j+1] = w1 + dst[j+2] = w2 + dst[j+3] = w3 + } +} + +func avx2D256Negate(src, dst []uint64) { avx2D256NegabsCore(src, dst, false) } +func avx2D256Abs(src, dst []uint64) { avx2D256NegabsCore(src, dst, true) } diff --git a/pkg/common/simdkernels/d256_negabs_test.go b/pkg/common/simdkernels/d256_negabs_test.go new file mode 100644 index 0000000000000..454910156cd4e --- /dev/null +++ b/pkg/common/simdkernels/d256_negabs_test.go @@ -0,0 +1,161 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build goexperiment.simd && amd64 + +package simdkernels + +import ( + "math" + "strconv" + "testing" +) + +type d256UnaryImpl struct { + name string + fn func(src, dst []uint64) +} + +func d256NegateImpls() []d256UnaryImpl { + out := []d256UnaryImpl{{name: "scalar", fn: scalarD256Negate}} + if D256Negate != nil { + out = append(out, d256UnaryImpl{name: "dispatch", fn: D256Negate}) + } + return out +} + +func d256AbsImpls() []d256UnaryImpl { + out := []d256UnaryImpl{{name: "scalar", fn: scalarD256Abs}} + if D256Abs != nil { + out = append(out, d256UnaryImpl{name: "dispatch", fn: D256Abs}) + } + return out +} + +func d256NegAbsEdges() []uint64 { + // each row = 4 q-words (lo..hi) of one D256. 
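+	// MinInt256 must come back unchanged from both Negate and Abs (the +1
+	// ripples through all four complemented words and wraps), and the
+	// all-ones row is -1, whose negation is +1.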
+ rows := [][4]uint64{ + {0, 0, 0, 0}, + {1, 0, 0, 0}, + {0, 0, 0, 1}, + {0, 0, 0, 0x8000000000000000}, // MinInt256 + {math.MaxUint64, math.MaxUint64, math.MaxUint64, math.MaxUint64}, // -1 + {0, 0, 0, 0x7FFFFFFFFFFFFFFF}, // MaxInt256 + {1, 0, 0, 0x8000000000000000}, // negative, lo!=0 + {math.MaxUint64, 0, 0, 0x8000000000000000}, // tests carry over middle words + {0, math.MaxUint64, math.MaxUint64, 0x8000000000000000}, + {math.MaxUint64, math.MaxUint64, math.MaxUint64, 0x8000000000000000}, + } + out := make([]uint64, 0, 4*len(rows)) + for _, r := range rows { + out = append(out, r[0], r[1], r[2], r[3]) + } + return out +} + +func TestD256NegateCorrectness(t *testing.T) { + for _, n := range d256Sizes() { + src := makeRandD256(n, 0xD256^uint64(n)) + want := make([]uint64, 4*n) + scalarD256Negate(src, want) + for _, im := range d256NegateImpls() { + got := make([]uint64, 4*n) + im.fn(src, got) + for i := 0; i < 4*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d idx=%d: got 0x%x want 0x%x", im.name, n, i, got[i], want[i]) + } + } + } + } +} + +func TestD256NegateEdges(t *testing.T) { + src := d256NegAbsEdges() + n := len(src) / 4 + want := make([]uint64, 4*n) + scalarD256Negate(src, want) + for _, im := range d256NegateImpls() { + got := make([]uint64, 4*n) + im.fn(src, got) + for i := 0; i < 4*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s idx=%d: got 0x%x want 0x%x", im.name, i, got[i], want[i]) + } + } + } +} + +func TestD256AbsCorrectness(t *testing.T) { + for _, n := range d256Sizes() { + src := makeRandD256(n, 0xABA256^uint64(n)) + want := make([]uint64, 4*n) + scalarD256Abs(src, want) + for _, im := range d256AbsImpls() { + got := make([]uint64, 4*n) + im.fn(src, got) + for i := 0; i < 4*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d idx=%d: got 0x%x want 0x%x", im.name, n, i, got[i], want[i]) + } + } + } + } +} + +func TestD256AbsEdges(t *testing.T) { + src := d256NegAbsEdges() + n := len(src) / 4 + want := make([]uint64, 4*n) + scalarD256Abs(src, want) + for _, im := range d256AbsImpls() { + got := make([]uint64, 4*n) + im.fn(src, got) + for i := 0; i < 4*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s idx=%d: got 0x%x want 0x%x", im.name, i, got[i], want[i]) + } + } + } +} + +func benchmarkD256Unary(b *testing.B, fn func(src, dst []uint64), n int) { + src := makeRandD256(n, 0xBEEF^uint64(n)) + dst := make([]uint64, 4*n) + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + fn(src, dst) + } +} + +func BenchmarkD256Negate(b *testing.B) { + for _, n := range []int{16, 64, 256, 1024, 4096} { + for _, im := range d256NegateImpls() { + b.Run(im.name+"/n="+strconv.Itoa(n), func(b *testing.B) { + benchmarkD256Unary(b, im.fn, n) + }) + } + } +} + +func BenchmarkD256Abs(b *testing.B) { + for _, n := range []int{16, 64, 256, 1024, 4096} { + for _, im := range d256AbsImpls() { + b.Run(im.name+"/n="+strconv.Itoa(n), func(b *testing.B) { + benchmarkD256Unary(b, im.fn, n) + }) + } + } +} diff --git a/pkg/common/simdkernels/d64_addsub.go b/pkg/common/simdkernels/d64_addsub.go new file mode 100644 index 0000000000000..2a57d1fc58b26 --- /dev/null +++ b/pkg/common/simdkernels/d64_addsub.go @@ -0,0 +1,272 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package simdkernels + +import "math/bits" + +// Decimal64 add/sub on slices, treating elements as signed int64. +// +// Two variants per operator: +// *Unchecked — wraps on overflow, no detection. Use when caller can prove +// no overflow (e.g., max(p1,p2)+1 ≤ 18 for d_T(p,s) operands). +// *Checked — returns the index of the first element that overflows, or +// -1 if none did. Vector loop accumulates an overflow mask +// and pays for it whether or not overflow occurs. +// +// The exported variables are function-pointer dispatchers: their values are +// the scalar reference implementations by default and may be replaced at +// init time on amd64 when AVX2 is available (see d64_addsub_simd_amd64.go). + +var ( + D64AddUnchecked func(a, b, r []uint64) = scalarD64AddUnchecked + D64SubUnchecked func(a, b, r []uint64) = scalarD64SubUnchecked + D64AddChecked func(a, b, r []uint64) int = scalarD64AddChecked + D64SubChecked func(a, b, r []uint64) int = scalarD64SubChecked + + // Scalar-broadcast variants. Use when one operand is a constant / + // bound parameter / single-row literal — i.e. the SQL frontend's + // (column op constant) and (constant op column) shapes. + // + // For Add (commutative) one entry covers both sides. + // For Sub: + // D64SubScalar → v[i] - s (column - constant) + // D64ScalarSub → s - v[i] (constant - column) + D64AddScalarUnchecked func(s uint64, v, r []uint64) = scalarD64AddScalarUnchecked + D64AddScalarChecked func(s uint64, v, r []uint64) int = scalarD64AddScalarChecked + D64SubScalarUnchecked func(v []uint64, s uint64, r []uint64) = scalarD64SubScalarUnchecked + D64SubScalarChecked func(v []uint64, s uint64, r []uint64) int = scalarD64SubScalarChecked + D64ScalarSubUnchecked func(s uint64, v, r []uint64) = scalarD64ScalarSubUnchecked + D64ScalarSubChecked func(s uint64, v, r []uint64) int = scalarD64ScalarSubChecked + + // D64SumReduceToD128 sums a slice of Decimal64 values (signed) and + // returns the 128-bit signed total as (lo, hi). Wraps mod 2^128. + // Caller is responsible for ensuring the true sum fits in 128 bits + // (always true for any plausible Decimal64 batch since |val| < 10^18). 
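+	// Internally each element is sign-extended to 128 bits (high word =
+	// x>>63, i.e. 0 or all-ones) and folded in with a 64-bit carry chain;
+	// see scalarD64SumReduceToD128.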
+ D64SumReduceToD128 func(v []uint64) (lo, hi uint64) = scalarD64SumReduceToD128 +) + +func scalarD64AddUnchecked(a, b, r []uint64) { + n := len(r) + if len(a) < n || len(b) < n { + return + } + for i := 0; i < n; i++ { + r[i] = a[i] + b[i] + } +} + +func scalarD64SubUnchecked(a, b, r []uint64) { + n := len(r) + if len(a) < n || len(b) < n { + return + } + for i := 0; i < n; i++ { + r[i] = a[i] - b[i] + } +} + +func scalarD64AddChecked(a, b, r []uint64) int { + n := len(r) + if len(a) < n || len(b) < n { + return -1 + } + first := -1 + for i := 0; i < n; i++ { + ai, bi := int64(a[i]), int64(b[i]) + ri := ai + bi + r[i] = uint64(ri) + if first < 0 && (ai^ri)&^(ai^bi) < 0 { + first = i + } + } + return first +} + +func scalarD64SubChecked(a, b, r []uint64) int { + n := len(r) + if len(a) < n || len(b) < n { + return -1 + } + first := -1 + for i := 0; i < n; i++ { + ai, bi := int64(a[i]), int64(b[i]) + ri := ai - bi + r[i] = uint64(ri) + if first < 0 && (ai^ri)&(ai^bi) < 0 { + first = i + } + } + return first +} + +// d64FirstOverflow rescans [0, end) for the first overflow index. Used by +// SIMD checked variants when their accumulated mask reports overflow but +// the scalar tail did not see one (so the offender is in the vector range). +func d64FirstOverflow(a, b []uint64, end int, sub bool) int { + if sub { + for i := 0; i < end; i++ { + ai, bi := int64(a[i]), int64(b[i]) + ri := ai - bi + if (ai^ri)&(ai^bi) < 0 { + return i + } + } + return -1 + } + for i := 0; i < end; i++ { + ai, bi := int64(a[i]), int64(b[i]) + ri := ai + bi + if (ai^ri)&^(ai^bi) < 0 { + return i + } + } + return -1 +} + +// --------------------------------------------------------------------------- +// Scalar-broadcast reference implementations. +// --------------------------------------------------------------------------- + +func scalarD64AddScalarUnchecked(s uint64, v, r []uint64) { + n := len(r) + if len(v) < n { + return + } + for i := 0; i < n; i++ { + r[i] = s + v[i] + } +} + +func scalarD64SubScalarUnchecked(v []uint64, s uint64, r []uint64) { + n := len(r) + if len(v) < n { + return + } + for i := 0; i < n; i++ { + r[i] = v[i] - s + } +} + +func scalarD64ScalarSubUnchecked(s uint64, v, r []uint64) { + n := len(r) + if len(v) < n { + return + } + for i := 0; i < n; i++ { + r[i] = s - v[i] + } +} + +func scalarD64AddScalarChecked(s uint64, v, r []uint64) int { + n := len(r) + if len(v) < n { + return -1 + } + si := int64(s) + first := -1 + for i := 0; i < n; i++ { + vi := int64(v[i]) + ri := si + vi + r[i] = uint64(ri) + if first < 0 && (si^ri)&^(si^vi) < 0 { + first = i + } + } + return first +} + +func scalarD64SubScalarChecked(v []uint64, s uint64, r []uint64) int { + n := len(r) + if len(v) < n { + return -1 + } + si := int64(s) + first := -1 + for i := 0; i < n; i++ { + vi := int64(v[i]) + ri := vi - si + r[i] = uint64(ri) + if first < 0 && (vi^ri)&(vi^si) < 0 { + first = i + } + } + return first +} + +func scalarD64ScalarSubChecked(s uint64, v, r []uint64) int { + n := len(r) + if len(v) < n { + return -1 + } + si := int64(s) + first := -1 + for i := 0; i < n; i++ { + vi := int64(v[i]) + ri := si - vi + r[i] = uint64(ri) + if first < 0 && (si^ri)&(si^vi) < 0 { + first = i + } + } + return first +} + +// d64ScalarFirstOverflow rescans [0, end) for the first overflow index in +// scalar-broadcast operations. 
kind selects the operation: +// +// 0 = s + v[i] (AddScalar) +// 1 = v[i] - s (SubScalar) +// 2 = s - v[i] (ScalarSub) +func d64ScalarFirstOverflow(s uint64, v []uint64, end int, kind int) int { + si := int64(s) + switch kind { + case 0: + for i := 0; i < end; i++ { + vi := int64(v[i]) + ri := si + vi + if (si^ri)&^(si^vi) < 0 { + return i + } + } + case 1: + for i := 0; i < end; i++ { + vi := int64(v[i]) + ri := vi - si + if (vi^ri)&(vi^si) < 0 { + return i + } + } + case 2: + for i := 0; i < end; i++ { + vi := int64(v[i]) + ri := si - vi + if (si^ri)&(si^vi) < 0 { + return i + } + } + } + return -1 +} + +func scalarD64SumReduceToD128(v []uint64) (lo, hi uint64) { + for i := 0; i < len(v); i++ { + x := int64(v[i]) + sx := uint64(x >> 63) + var c uint64 + lo, c = bits.Add64(lo, uint64(x), 0) + hi, _ = bits.Add64(hi, sx, c) + } + return +} diff --git a/pkg/common/simdkernels/d64_addsub_simd_amd64.go b/pkg/common/simdkernels/d64_addsub_simd_amd64.go new file mode 100644 index 0000000000000..e9790b94a719e --- /dev/null +++ b/pkg/common/simdkernels/d64_addsub_simd_amd64.go @@ -0,0 +1,901 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build goexperiment.simd && amd64 + +package simdkernels + +import ( + "math/bits" + "simd/archsimd" + "unsafe" + + "golang.org/x/sys/cpu" +) + +// init swaps the dispatcher function pointers from the package-default +// scalar implementations to the best available SIMD variant on the host. +// +// Order of preference: AVX-512 (Int64x8, 8 lanes) > AVX2 (Int64x4, 4 lanes). +// archsimd.X86.AVX512() requires the bundled F+CD+BW+DQ+VL set, exposed via +// cpu.X86.HasAVX512. +func init() { + switch { + case cpu.X86.HasAVX512: + D64AddUnchecked = avx512D64AddUnchecked + D64SubUnchecked = avx512D64SubUnchecked + D64AddChecked = avx512D64AddChecked + D64SubChecked = avx512D64SubChecked + D64AddScalarUnchecked = avx512D64AddScalarUnchecked + D64AddScalarChecked = avx512D64AddScalarChecked + D64SubScalarUnchecked = avx512D64SubScalarUnchecked + D64SubScalarChecked = avx512D64SubScalarChecked + D64ScalarSubUnchecked = avx512D64ScalarSubUnchecked + D64ScalarSubChecked = avx512D64ScalarSubChecked + D64SumReduceToD128 = avx512D64SumReduceToD128 + case cpu.X86.HasAVX2: + D64AddUnchecked = avx2D64AddUnchecked + D64SubUnchecked = avx2D64SubUnchecked + D64AddChecked = avx2D64AddChecked + D64SubChecked = avx2D64SubChecked + D64AddScalarUnchecked = avx2D64AddScalarUnchecked + D64AddScalarChecked = avx2D64AddScalarChecked + D64SubScalarUnchecked = avx2D64SubScalarUnchecked + D64SubScalarChecked = avx2D64SubScalarChecked + D64ScalarSubUnchecked = avx2D64ScalarSubUnchecked + D64ScalarSubChecked = avx2D64ScalarSubChecked + D64SumReduceToD128 = avx2D64SumReduceToD128 + } +} + +// --------------------------------------------------------------------------- +// AVX2 path: 4-lane Int64x4 vectors. 
Main loop processes 16 elements
+// (4× Int64x4 = 128 B) per iteration; the unroll hides load latency and
+// keeps the VPADDQ ports on Zen 3 / Skylake busy. Cleanup is a 4-wide
+// loop, then a scalar tail.
+// ---------------------------------------------------------------------------
+
+func avx2D64AddUnchecked(a, b, r []uint64) {
+	n := len(r)
+	if n == 0 || len(a) < n || len(b) < n {
+		return
+	}
+	pa, pb, pr := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&r[0])
+
+	i := 0
+	for ; i+16 <= n; i += 16 {
+		off := uintptr(i) * 8
+		a0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off)))
+		a1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off+32)))
+		a2 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off+64)))
+		a3 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off+96)))
+		b0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off)))
+		b1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off+32)))
+		b2 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off+64)))
+		b3 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off+96)))
+		a0.Add(b0).Store((*[4]int64)(unsafe.Add(pr, off)))
+		a1.Add(b1).Store((*[4]int64)(unsafe.Add(pr, off+32)))
+		a2.Add(b2).Store((*[4]int64)(unsafe.Add(pr, off+64)))
+		a3.Add(b3).Store((*[4]int64)(unsafe.Add(pr, off+96)))
+	}
+	for ; i+4 <= n; i += 4 {
+		off := uintptr(i) * 8
+		av := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off)))
+		bv := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off)))
+		av.Add(bv).Store((*[4]int64)(unsafe.Add(pr, off)))
+	}
+	for ; i < n; i++ {
+		r[i] = a[i] + b[i]
+	}
+}
+
+func avx2D64SubUnchecked(a, b, r []uint64) {
+	n := len(r)
+	if n == 0 || len(a) < n || len(b) < n {
+		return
+	}
+	pa, pb, pr := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&r[0])
+
+	i := 0
+	for ; i+16 <= n; i += 16 {
+		off := uintptr(i) * 8
+		a0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off)))
+		a1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off+32)))
+		a2 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off+64)))
+		a3 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off+96)))
+		b0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off)))
+		b1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off+32)))
+		b2 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off+64)))
+		b3 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off+96)))
+		a0.Sub(b0).Store((*[4]int64)(unsafe.Add(pr, off)))
+		a1.Sub(b1).Store((*[4]int64)(unsafe.Add(pr, off+32)))
+		a2.Sub(b2).Store((*[4]int64)(unsafe.Add(pr, off+64)))
+		a3.Sub(b3).Store((*[4]int64)(unsafe.Add(pr, off+96)))
+	}
+	for ; i+4 <= n; i += 4 {
+		off := uintptr(i) * 8
+		av := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off)))
+		bv := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off)))
+		av.Sub(bv).Store((*[4]int64)(unsafe.Add(pr, off)))
+	}
+	for ; i < n; i++ {
+		r[i] = a[i] - b[i]
+	}
+}
+
+// avx2D64AddChecked accumulates per-lane signed-overflow predicates into a
+// vector OR; if any MSB is set at the end, it falls back to a scalar rescan
+// (d64FirstOverflow) to find the first offending index. For the common
+// "no overflow" case the cost is two XORs, one AndNot and one OR per 4 elems.
+//
+// Predicate: overflow iff sign(a)==sign(b) && sign(a)!=sign(r).
+//
+//	⇔ ((a^r) &^ (a^b)) < 0   (MSB set)
+func avx2D64AddChecked(a, b, r []uint64) int {
+	n := len(r)
+	if n == 0 || len(a) < n || len(b) < n {
+		return -1
+	}
+	pa, pb, pr := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&r[0])
+
+	var ofAcc archsimd.Int64x4
+
+	i := 0
+	for ; i+4 <= n; i += 4 {
+		off := uintptr(i) * 8
+		av := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off)))
+		bv := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off)))
+		rv := av.Add(bv)
+		ofAcc = ofAcc.Or(av.Xor(rv).AndNot(av.Xor(bv)))
+		rv.Store((*[4]int64)(unsafe.Add(pr, off)))
+	}
+	vecEnd := i
+
+	zero := archsimd.BroadcastInt64x4(0)
+	vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0
+
+	for ; i < n; i++ {
+		ai, bi := int64(a[i]), int64(b[i])
+		ri := ai + bi
+		r[i] = uint64(ri)
+		if (ai^ri)&^(ai^bi) < 0 {
+			if vecOverflow {
+				return d64FirstOverflow(a, b, vecEnd, false)
+			}
+			return i
+		}
+	}
+	if !vecOverflow {
+		return -1
+	}
+	return d64FirstOverflow(a, b, vecEnd, false)
+}
+
+// avx2D64SubChecked: overflow iff sign(a)!=sign(b) && sign(a)!=sign(r).
+//
+//	⇔ ((a^r) & (a^b)) < 0
+func avx2D64SubChecked(a, b, r []uint64) int {
+	n := len(r)
+	if n == 0 || len(a) < n || len(b) < n {
+		return -1
+	}
+	pa, pb, pr := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&r[0])
+
+	var ofAcc archsimd.Int64x4
+
+	i := 0
+	for ; i+4 <= n; i += 4 {
+		off := uintptr(i) * 8
+		av := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pa, off)))
+		bv := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pb, off)))
+		rv := av.Sub(bv)
+		ofAcc = ofAcc.Or(av.Xor(rv).And(av.Xor(bv)))
+		rv.Store((*[4]int64)(unsafe.Add(pr, off)))
+	}
+	vecEnd := i
+
+	zero := archsimd.BroadcastInt64x4(0)
+	vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0
+
+	for ; i < n; i++ {
+		ai, bi := int64(a[i]), int64(b[i])
+		ri := ai - bi
+		r[i] = uint64(ri)
+		if (ai^ri)&(ai^bi) < 0 {
+			if vecOverflow {
+				return d64FirstOverflow(a, b, vecEnd, true)
+			}
+			return i
+		}
+	}
+	if !vecOverflow {
+		return -1
+	}
+	return d64FirstOverflow(a, b, vecEnd, true)
+}
+
+// ---------------------------------------------------------------------------
+// AVX-512 path: 8-lane Int64x8 vectors (ZMM, 64 B). Main loop processes 32
+// elements per iteration (4× Int64x8 = 256 B) to keep the load and VPADDQ
+// pipes busy. Cleanup is 8-wide, then scalar tail.
+// --------------------------------------------------------------------------- + +func avx512D64AddUnchecked(a, b, r []uint64) { + n := len(r) + if n == 0 || len(a) < n || len(b) < n { + return + } + pa, pb, pr := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&r[0]) + + i := 0 + for ; i+32 <= n; i += 32 { + off := uintptr(i) * 8 + a0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off))) + a1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off+64))) + a2 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off+128))) + a3 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off+192))) + b0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off))) + b1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off+64))) + b2 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off+128))) + b3 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off+192))) + a0.Add(b0).Store((*[8]int64)(unsafe.Add(pr, off))) + a1.Add(b1).Store((*[8]int64)(unsafe.Add(pr, off+64))) + a2.Add(b2).Store((*[8]int64)(unsafe.Add(pr, off+128))) + a3.Add(b3).Store((*[8]int64)(unsafe.Add(pr, off+192))) + } + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 8 + av := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off))) + bv := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off))) + av.Add(bv).Store((*[8]int64)(unsafe.Add(pr, off))) + } + for ; i < n; i++ { + r[i] = a[i] + b[i] + } +} + +func avx512D64SubUnchecked(a, b, r []uint64) { + n := len(r) + if n == 0 || len(a) < n || len(b) < n { + return + } + pa, pb, pr := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&r[0]) + + i := 0 + for ; i+32 <= n; i += 32 { + off := uintptr(i) * 8 + a0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off))) + a1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off+64))) + a2 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off+128))) + a3 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off+192))) + b0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off))) + b1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off+64))) + b2 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off+128))) + b3 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off+192))) + a0.Sub(b0).Store((*[8]int64)(unsafe.Add(pr, off))) + a1.Sub(b1).Store((*[8]int64)(unsafe.Add(pr, off+64))) + a2.Sub(b2).Store((*[8]int64)(unsafe.Add(pr, off+128))) + a3.Sub(b3).Store((*[8]int64)(unsafe.Add(pr, off+192))) + } + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 8 + av := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off))) + bv := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off))) + av.Sub(bv).Store((*[8]int64)(unsafe.Add(pr, off))) + } + for ; i < n; i++ { + r[i] = a[i] - b[i] + } +} + +func avx512D64AddChecked(a, b, r []uint64) int { + n := len(r) + if n == 0 || len(a) < n || len(b) < n { + return -1 + } + pa, pb, pr := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&r[0]) + + var ofAcc archsimd.Int64x8 + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 8 + av := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off))) + bv := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off))) + rv := av.Add(bv) + // Int64x8.AndNot has inverted operand semantics (computes ~receiver & arg). + // We want (a^r) & ~(a^b), so use (a^b).AndNot(a^r). 
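+			// One-lane sanity check of that operand order: a=MaxInt64, b=1
+			// gives r=MinInt64 (overflow); a^b = 0x7FFF…FFFE and
+			// a^r = 0xFFFF…FFFF, so (a^b).AndNot(a^r) = ^(a^b) & (a^r)
+			// = 0x8000…0001, whose MSB is set, as required.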
+ ofAcc = ofAcc.Or(av.Xor(bv).AndNot(av.Xor(rv))) + rv.Store((*[8]int64)(unsafe.Add(pr, off))) + } + vecEnd := i + + zero := archsimd.BroadcastInt64x8(0) + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + for ; i < n; i++ { + ai, bi := int64(a[i]), int64(b[i]) + ri := ai + bi + r[i] = uint64(ri) + if (ai^ri)&^(ai^bi) < 0 { + if vecOverflow { + return d64FirstOverflow(a, b, vecEnd, false) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d64FirstOverflow(a, b, vecEnd, false) +} + +func avx512D64SubChecked(a, b, r []uint64) int { + n := len(r) + if n == 0 || len(a) < n || len(b) < n { + return -1 + } + pa, pb, pr := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]), unsafe.Pointer(&r[0]) + + var ofAcc archsimd.Int64x8 + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 8 + av := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off))) + bv := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off))) + rv := av.Sub(bv) + ofAcc = ofAcc.Or(av.Xor(rv).And(av.Xor(bv))) + rv.Store((*[8]int64)(unsafe.Add(pr, off))) + } + vecEnd := i + + zero := archsimd.BroadcastInt64x8(0) + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + for ; i < n; i++ { + ai, bi := int64(a[i]), int64(b[i]) + ri := ai - bi + r[i] = uint64(ri) + if (ai^ri)&(ai^bi) < 0 { + if vecOverflow { + return d64FirstOverflow(a, b, vecEnd, true) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d64FirstOverflow(a, b, vecEnd, true) +} + +// --------------------------------------------------------------------------- +// Scalar-broadcast SIMD variants. Pattern: broadcast the scalar into a +// vector once outside the main loop, then fuse loads of the column with +// vector add/sub against the broadcast register. +// +// AVX2: 4× Int64x4 unroll = 16 elems/iter, then 4-wide cleanup, scalar tail. +// AVX-512: 4× Int64x8 unroll = 32 elems/iter, then 8-wide cleanup, scalar tail. +// --------------------------------------------------------------------------- + +// AVX2 — Add scalar broadcast. 
+ +func avx2D64AddScalarUnchecked(s uint64, v, r []uint64) { + n := len(r) + if n == 0 || len(v) < n { + return + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sv := archsimd.BroadcastInt64x4(int64(s)) + + i := 0 + for ; i+16 <= n; i += 16 { + off := uintptr(i) * 8 + v0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+32))) + v2 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+64))) + v3 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+96))) + sv.Add(v0).Store((*[4]int64)(unsafe.Add(pr, off))) + sv.Add(v1).Store((*[4]int64)(unsafe.Add(pr, off+32))) + sv.Add(v2).Store((*[4]int64)(unsafe.Add(pr, off+64))) + sv.Add(v3).Store((*[4]int64)(unsafe.Add(pr, off+96))) + } + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 8 + vv := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off))) + sv.Add(vv).Store((*[4]int64)(unsafe.Add(pr, off))) + } + for ; i < n; i++ { + r[i] = s + v[i] + } +} + +func avx2D64SubScalarUnchecked(v []uint64, s uint64, r []uint64) { + n := len(r) + if n == 0 || len(v) < n { + return + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sv := archsimd.BroadcastInt64x4(int64(s)) + + i := 0 + for ; i+16 <= n; i += 16 { + off := uintptr(i) * 8 + v0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+32))) + v2 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+64))) + v3 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+96))) + v0.Sub(sv).Store((*[4]int64)(unsafe.Add(pr, off))) + v1.Sub(sv).Store((*[4]int64)(unsafe.Add(pr, off+32))) + v2.Sub(sv).Store((*[4]int64)(unsafe.Add(pr, off+64))) + v3.Sub(sv).Store((*[4]int64)(unsafe.Add(pr, off+96))) + } + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 8 + vv := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off))) + vv.Sub(sv).Store((*[4]int64)(unsafe.Add(pr, off))) + } + for ; i < n; i++ { + r[i] = v[i] - s + } +} + +func avx2D64ScalarSubUnchecked(s uint64, v, r []uint64) { + n := len(r) + if n == 0 || len(v) < n { + return + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sv := archsimd.BroadcastInt64x4(int64(s)) + + i := 0 + for ; i+16 <= n; i += 16 { + off := uintptr(i) * 8 + v0 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+32))) + v2 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+64))) + v3 := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off+96))) + sv.Sub(v0).Store((*[4]int64)(unsafe.Add(pr, off))) + sv.Sub(v1).Store((*[4]int64)(unsafe.Add(pr, off+32))) + sv.Sub(v2).Store((*[4]int64)(unsafe.Add(pr, off+64))) + sv.Sub(v3).Store((*[4]int64)(unsafe.Add(pr, off+96))) + } + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 8 + vv := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off))) + sv.Sub(vv).Store((*[4]int64)(unsafe.Add(pr, off))) + } + for ; i < n; i++ { + r[i] = s - v[i] + } +} + +// Checked broadcast variants follow the same accumulate-mask pattern as the +// vector+vector ones. The overflow predicate is identical to the scalar +// reference because broadcasting the scalar gives the same per-lane test. 
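+// For instance: with s = MaxInt64 broadcast to every lane and v[i] = 1,
+// lane i computes r = MinInt64, and (s^r) &^ (s^v) = 0x8000…0001, whose
+// set MSB flags the overflow exactly as in scalarD64AddScalarChecked.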
+ +func avx2D64AddScalarChecked(s uint64, v, r []uint64) int { + n := len(r) + if n == 0 || len(v) < n { + return -1 + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sv := archsimd.BroadcastInt64x4(int64(s)) + + var ofAcc archsimd.Int64x4 + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 8 + vv := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off))) + rv := sv.Add(vv) + ofAcc = ofAcc.Or(sv.Xor(rv).AndNot(sv.Xor(vv))) + rv.Store((*[4]int64)(unsafe.Add(pr, off))) + } + vecEnd := i + + zero := archsimd.BroadcastInt64x4(0) + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + si := int64(s) + for ; i < n; i++ { + vi := int64(v[i]) + ri := si + vi + r[i] = uint64(ri) + if (si^ri)&^(si^vi) < 0 { + if vecOverflow { + return d64ScalarFirstOverflow(s, v, vecEnd, 0) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d64ScalarFirstOverflow(s, v, vecEnd, 0) +} + +func avx2D64SubScalarChecked(v []uint64, s uint64, r []uint64) int { + n := len(r) + if n == 0 || len(v) < n { + return -1 + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sv := archsimd.BroadcastInt64x4(int64(s)) + + var ofAcc archsimd.Int64x4 + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 8 + vv := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off))) + rv := vv.Sub(sv) + ofAcc = ofAcc.Or(vv.Xor(rv).And(vv.Xor(sv))) + rv.Store((*[4]int64)(unsafe.Add(pr, off))) + } + vecEnd := i + + zero := archsimd.BroadcastInt64x4(0) + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + si := int64(s) + for ; i < n; i++ { + vi := int64(v[i]) + ri := vi - si + r[i] = uint64(ri) + if (vi^ri)&(vi^si) < 0 { + if vecOverflow { + return d64ScalarFirstOverflow(s, v, vecEnd, 1) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d64ScalarFirstOverflow(s, v, vecEnd, 1) +} + +func avx2D64ScalarSubChecked(s uint64, v, r []uint64) int { + n := len(r) + if n == 0 || len(v) < n { + return -1 + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sv := archsimd.BroadcastInt64x4(int64(s)) + + var ofAcc archsimd.Int64x4 + + i := 0 + for ; i+4 <= n; i += 4 { + off := uintptr(i) * 8 + vv := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off))) + rv := sv.Sub(vv) + ofAcc = ofAcc.Or(sv.Xor(rv).And(sv.Xor(vv))) + rv.Store((*[4]int64)(unsafe.Add(pr, off))) + } + vecEnd := i + + zero := archsimd.BroadcastInt64x4(0) + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + si := int64(s) + for ; i < n; i++ { + vi := int64(v[i]) + ri := si - vi + r[i] = uint64(ri) + if (si^ri)&(si^vi) < 0 { + if vecOverflow { + return d64ScalarFirstOverflow(s, v, vecEnd, 2) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d64ScalarFirstOverflow(s, v, vecEnd, 2) +} + +// AVX-512 broadcast variants — Int64x8 (8 lanes), main loop 4× unrolled. 
+ +func avx512D64AddScalarUnchecked(s uint64, v, r []uint64) { + n := len(r) + if n == 0 || len(v) < n { + return + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sv := archsimd.BroadcastInt64x8(int64(s)) + + i := 0 + for ; i+32 <= n; i += 32 { + off := uintptr(i) * 8 + v0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+64))) + v2 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+128))) + v3 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+192))) + sv.Add(v0).Store((*[8]int64)(unsafe.Add(pr, off))) + sv.Add(v1).Store((*[8]int64)(unsafe.Add(pr, off+64))) + sv.Add(v2).Store((*[8]int64)(unsafe.Add(pr, off+128))) + sv.Add(v3).Store((*[8]int64)(unsafe.Add(pr, off+192))) + } + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 8 + vv := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + sv.Add(vv).Store((*[8]int64)(unsafe.Add(pr, off))) + } + for ; i < n; i++ { + r[i] = s + v[i] + } +} + +func avx512D64SubScalarUnchecked(v []uint64, s uint64, r []uint64) { + n := len(r) + if n == 0 || len(v) < n { + return + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sv := archsimd.BroadcastInt64x8(int64(s)) + + i := 0 + for ; i+32 <= n; i += 32 { + off := uintptr(i) * 8 + v0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+64))) + v2 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+128))) + v3 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+192))) + v0.Sub(sv).Store((*[8]int64)(unsafe.Add(pr, off))) + v1.Sub(sv).Store((*[8]int64)(unsafe.Add(pr, off+64))) + v2.Sub(sv).Store((*[8]int64)(unsafe.Add(pr, off+128))) + v3.Sub(sv).Store((*[8]int64)(unsafe.Add(pr, off+192))) + } + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 8 + vv := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + vv.Sub(sv).Store((*[8]int64)(unsafe.Add(pr, off))) + } + for ; i < n; i++ { + r[i] = v[i] - s + } +} + +func avx512D64ScalarSubUnchecked(s uint64, v, r []uint64) { + n := len(r) + if n == 0 || len(v) < n { + return + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sv := archsimd.BroadcastInt64x8(int64(s)) + + i := 0 + for ; i+32 <= n; i += 32 { + off := uintptr(i) * 8 + v0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+64))) + v2 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+128))) + v3 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+192))) + sv.Sub(v0).Store((*[8]int64)(unsafe.Add(pr, off))) + sv.Sub(v1).Store((*[8]int64)(unsafe.Add(pr, off+64))) + sv.Sub(v2).Store((*[8]int64)(unsafe.Add(pr, off+128))) + sv.Sub(v3).Store((*[8]int64)(unsafe.Add(pr, off+192))) + } + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 8 + vv := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + sv.Sub(vv).Store((*[8]int64)(unsafe.Add(pr, off))) + } + for ; i < n; i++ { + r[i] = s - v[i] + } +} + +func avx512D64AddScalarChecked(s uint64, v, r []uint64) int { + n := len(r) + if n == 0 || len(v) < n { + return -1 + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sv := archsimd.BroadcastInt64x8(int64(s)) + + var ofAcc archsimd.Int64x8 + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 8 + vv := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + rv := sv.Add(vv) + ofAcc = ofAcc.Or(sv.Xor(vv).AndNot(sv.Xor(rv))) + rv.Store((*[8]int64)(unsafe.Add(pr, off))) + } + vecEnd := i + + zero := archsimd.BroadcastInt64x8(0) + vecOverflow := 
uint64(ofAcc.Less(zero).ToBits()) != 0 + + si := int64(s) + for ; i < n; i++ { + vi := int64(v[i]) + ri := si + vi + r[i] = uint64(ri) + if (si^ri)&^(si^vi) < 0 { + if vecOverflow { + return d64ScalarFirstOverflow(s, v, vecEnd, 0) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d64ScalarFirstOverflow(s, v, vecEnd, 0) +} + +func avx512D64SubScalarChecked(v []uint64, s uint64, r []uint64) int { + n := len(r) + if n == 0 || len(v) < n { + return -1 + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sv := archsimd.BroadcastInt64x8(int64(s)) + + var ofAcc archsimd.Int64x8 + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 8 + vv := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + rv := vv.Sub(sv) + ofAcc = ofAcc.Or(vv.Xor(rv).And(vv.Xor(sv))) + rv.Store((*[8]int64)(unsafe.Add(pr, off))) + } + vecEnd := i + + zero := archsimd.BroadcastInt64x8(0) + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + si := int64(s) + for ; i < n; i++ { + vi := int64(v[i]) + ri := vi - si + r[i] = uint64(ri) + if (vi^ri)&(vi^si) < 0 { + if vecOverflow { + return d64ScalarFirstOverflow(s, v, vecEnd, 1) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d64ScalarFirstOverflow(s, v, vecEnd, 1) +} + +func avx512D64ScalarSubChecked(s uint64, v, r []uint64) int { + n := len(r) + if n == 0 || len(v) < n { + return -1 + } + pv, pr := unsafe.Pointer(&v[0]), unsafe.Pointer(&r[0]) + sv := archsimd.BroadcastInt64x8(int64(s)) + + var ofAcc archsimd.Int64x8 + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 8 + vv := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + rv := sv.Sub(vv) + ofAcc = ofAcc.Or(sv.Xor(rv).And(sv.Xor(vv))) + rv.Store((*[8]int64)(unsafe.Add(pr, off))) + } + vecEnd := i + + zero := archsimd.BroadcastInt64x8(0) + vecOverflow := uint64(ofAcc.Less(zero).ToBits()) != 0 + + si := int64(s) + for ; i < n; i++ { + vi := int64(v[i]) + ri := si - vi + r[i] = uint64(ri) + if (si^ri)&(si^vi) < 0 { + if vecOverflow { + return d64ScalarFirstOverflow(s, v, vecEnd, 2) + } + return i + } + } + if !vecOverflow { + return -1 + } + return d64ScalarFirstOverflow(s, v, vecEnd, 2) +} + +// avx2D64SumReduceToD128 sums signed Decimal64 values into a 128-bit total. +// Uses K=8 inner iterations per lane: per-lane partial sum stays bounded +// because |Decimal64| < 10^18 ≈ 2^59.79, so 8 such values fit in int63. +func avx2D64SumReduceToD128(v []uint64) (lo, hi uint64) { + n := len(v) + if n == 0 { + return + } + pv := unsafe.Pointer(&v[0]) + const K = 8 + const blk = 4 * K // 32 elements per outer iteration + i := 0 + for ; i+blk <= n; i += blk { + acc := archsimd.BroadcastInt64x4(0) + for k := 0; k < K; k++ { + off := uintptr(i+4*k) * 8 + x := archsimd.LoadInt64x4((*[4]int64)(unsafe.Add(pv, off))) + acc = acc.Add(x) + } + var buf [4]int64 + acc.Store(&buf) + for j := 0; j < 4; j++ { + x := buf[j] + sx := uint64(x >> 63) + var c uint64 + lo, c = bits.Add64(lo, uint64(x), 0) + hi, _ = bits.Add64(hi, sx, c) + } + } + for ; i < n; i++ { + x := int64(v[i]) + sx := uint64(x >> 63) + var c uint64 + lo, c = bits.Add64(lo, uint64(x), 0) + hi, _ = bits.Add64(hi, sx, c) + } + return +} + +// avx512D64SumReduceToD128: same chunked partial-sum strategy with 8-wide +// lanes. K=8 inner iters → 64 elements per outer iter. 
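+// The per-lane bound argument carries over unchanged: each of the 8 lanes
+// still accumulates at most K=8 values, and 8·10^18 ≈ 2^62.8 < 2^63, so a
+// lane partial sum cannot wrap int64 for in-range Decimal64 inputs.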
+func avx512D64SumReduceToD128(v []uint64) (lo, hi uint64) { + n := len(v) + if n == 0 { + return + } + pv := unsafe.Pointer(&v[0]) + const K = 8 + const blk = 8 * K + i := 0 + for ; i+blk <= n; i += blk { + acc := archsimd.BroadcastInt64x8(0) + for k := 0; k < K; k++ { + off := uintptr(i+8*k) * 8 + x := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + acc = acc.Add(x) + } + var buf [8]int64 + acc.Store(&buf) + for j := 0; j < 8; j++ { + x := buf[j] + sx := uint64(x >> 63) + var c uint64 + lo, c = bits.Add64(lo, uint64(x), 0) + hi, _ = bits.Add64(hi, sx, c) + } + } + for ; i < n; i++ { + x := int64(v[i]) + sx := uint64(x >> 63) + var c uint64 + lo, c = bits.Add64(lo, uint64(x), 0) + hi, _ = bits.Add64(hi, sx, c) + } + return +} diff --git a/pkg/common/simdkernels/d64_addsub_test.go b/pkg/common/simdkernels/d64_addsub_test.go new file mode 100644 index 0000000000000..2b31da90e7dc0 --- /dev/null +++ b/pkg/common/simdkernels/d64_addsub_test.go @@ -0,0 +1,688 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build goexperiment.simd && amd64 + +package simdkernels + +import ( + "math" + "math/rand/v2" + "strconv" + "testing" + + "golang.org/x/sys/cpu" +) + +// --------------------------------------------------------------------------- +// Correctness: compare every available impl (scalar, AVX2, AVX-512 if built) +// against the scalar reference on random + edge-case inputs at sizes that +// exercise the unrolled main loop, the 4-/8-wide cleanup, and the scalar +// tail (e.g. 35 = 32+3 or 4+...+3). +// --------------------------------------------------------------------------- + +type d64UncheckedImpl struct { + name string + fn func(a, b, r []uint64) +} + +type d64CheckedImpl struct { + name string + fn func(a, b, r []uint64) int +} + +func d64Sizes() []int { + return []int{0, 1, 3, 4, 7, 8, 15, 16, 17, 31, 32, 33, 35, 63, 64, 127, 128, 255, 256, 1023, 4096} +} + +func makeRandD64(n int, seed uint64) []uint64 { + rng := rand.New(rand.NewPCG(seed, seed^0x9E3779B97F4A7C15)) + out := make([]uint64, n) + for i := range out { + out[i] = rng.Uint64() + } + return out +} + +// edgeInputs returns a small input pair tuned to provoke add+sub overflows +// at a few specific positions (so tests cover both the "no overflow" fast +// path and the rescan slow path). 
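+// For Add the pairs overflow at indices 3, 4 and 5 (MaxInt64+1,
+// MaxInt64+MaxInt64, MinInt64+MinInt64), so the scalar reference reports
+// index 3 first; for Sub the first overflow is at index 6 (MinInt64 - 1).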
+func edgeInputs() (a, b []uint64) { + a = []uint64{ + 0, 1, math.MaxUint64, + uint64(math.MaxInt64), uint64(math.MaxInt64), + 1 << 63, 1 << 63, + 42, 100, 200, + } + b = []uint64{ + 0, math.MaxUint64, 1, + 1, uint64(math.MaxInt64), + uint64(math.MaxInt64) + 1, 1, + 58, 200, 100, + } + return +} + +func TestD64AddVariants(t *testing.T) { + impls := []d64UncheckedImpl{ + {"scalar", scalarD64AddUnchecked}, + {"avx2", avx2D64AddUnchecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d64UncheckedImpl{"avx512", avx512D64AddUnchecked}) + } + for _, n := range d64Sizes() { + a := makeRandD64(n, uint64(n)*7+1) + b := makeRandD64(n, uint64(n)*11+3) + want := make([]uint64, n) + scalarD64AddUnchecked(a, b, want) + for _, impl := range impls { + got := make([]uint64, n) + impl.fn(a, b, got) + for i := 0; i < n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d i=%d: got %x want %x", impl.name, n, i, got[i], want[i]) + } + } + } + } +} + +func TestD64SubVariants(t *testing.T) { + impls := []d64UncheckedImpl{ + {"scalar", scalarD64SubUnchecked}, + {"avx2", avx2D64SubUnchecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d64UncheckedImpl{"avx512", avx512D64SubUnchecked}) + } + for _, n := range d64Sizes() { + a := makeRandD64(n, uint64(n)*13+5) + b := makeRandD64(n, uint64(n)*17+9) + want := make([]uint64, n) + scalarD64SubUnchecked(a, b, want) + for _, impl := range impls { + got := make([]uint64, n) + impl.fn(a, b, got) + for i := 0; i < n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d i=%d: got %x want %x", impl.name, n, i, got[i], want[i]) + } + } + } + } +} + +func TestD64AddCheckedVariants(t *testing.T) { + impls := []d64CheckedImpl{ + {"scalar", scalarD64AddChecked}, + {"avx2", avx2D64AddChecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d64CheckedImpl{"avx512", avx512D64AddChecked}) + } + + // 1) Edge inputs: contains a known overflow at index 5 (MinInt64 + (MaxInt64+1)). + ea, eb := edgeInputs() + wantR := make([]uint64, len(ea)) + wantIdx := scalarD64AddChecked(ea, eb, wantR) + for _, impl := range impls { + gotR := make([]uint64, len(ea)) + gotIdx := impl.fn(ea, eb, gotR) + // First-overflow position must match scalar reference. + if gotIdx != wantIdx { + t.Fatalf("%s edge: idx got %d want %d", impl.name, gotIdx, wantIdx) + } + // Values up to (and including) the first overflow must match. + end := len(ea) + if wantIdx >= 0 { + end = wantIdx + 1 + } + for i := 0; i < end; i++ { + if gotR[i] != wantR[i] { + t.Fatalf("%s edge i=%d: got %x want %x", impl.name, i, gotR[i], wantR[i]) + } + } + } + + // 2) No-overflow random inputs: clear sign bits to avoid spurious overflows. + for _, n := range d64Sizes() { + a := makeRandD64(n, uint64(n)*19+7) + b := makeRandD64(n, uint64(n)*23+11) + for i := range a { + a[i] &= 0x3FFFFFFFFFFFFFFF + b[i] &= 0x3FFFFFFFFFFFFFFF + } + want := make([]uint64, n) + if got := scalarD64AddChecked(a, b, want); got != -1 { + t.Fatalf("setup: scalar reported overflow at %d for masked input n=%d", got, n) + } + for _, impl := range impls { + got := make([]uint64, n) + if idx := impl.fn(a, b, got); idx != -1 { + t.Fatalf("%s n=%d: spurious overflow at %d", impl.name, n, idx) + } + for i := 0; i < n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d i=%d: got %x want %x", impl.name, n, i, got[i], want[i]) + } + } + } + } + + // 3) Single overflow injected at varying positions (covers both vector + // body and scalar tail of every impl). 
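+	// The small positions land in the 4-/8-wide vector body (when pos < n),
+	// while pos = n-1 for n ∈ {17, 33, 35} falls in the scalar tail of both
+	// the AVX2 and AVX-512 variants.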
+ for _, n := range []int{8, 16, 17, 33, 35, 64} { + for _, pos := range []int{0, 1, 4, 7, 8, n - 1} { + if pos < 0 || pos >= n { + continue + } + a := make([]uint64, n) + b := make([]uint64, n) + a[pos] = uint64(math.MaxInt64) + b[pos] = 1 + for _, impl := range impls { + got := make([]uint64, n) + idx := impl.fn(a, b, got) + if idx != pos { + t.Fatalf("%s n=%d inject pos=%d: got idx %d", impl.name, n, pos, idx) + } + } + } + } +} + +func TestD64SubCheckedVariants(t *testing.T) { + impls := []d64CheckedImpl{ + {"scalar", scalarD64SubChecked}, + {"avx2", avx2D64SubChecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d64CheckedImpl{"avx512", avx512D64SubChecked}) + } + + // No-overflow random inputs: mask to 62 bits so a-b stays in range. + for _, n := range d64Sizes() { + a := makeRandD64(n, uint64(n)*29+13) + b := makeRandD64(n, uint64(n)*31+17) + for i := range a { + a[i] &= 0x3FFFFFFFFFFFFFFF + b[i] &= 0x3FFFFFFFFFFFFFFF + } + want := make([]uint64, n) + if got := scalarD64SubChecked(a, b, want); got != -1 { + t.Fatalf("setup: scalar overflow at %d for n=%d", got, n) + } + for _, impl := range impls { + got := make([]uint64, n) + if idx := impl.fn(a, b, got); idx != -1 { + t.Fatalf("%s n=%d: spurious overflow at %d", impl.name, n, idx) + } + for i := 0; i < n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d i=%d: got %x want %x", impl.name, n, i, got[i], want[i]) + } + } + } + } + + // Inject MinInt64 - 1 overflow at varying positions. + for _, n := range []int{8, 16, 17, 33, 35, 64} { + for _, pos := range []int{0, 1, 4, 7, 8, n - 1} { + if pos < 0 || pos >= n { + continue + } + a := make([]uint64, n) + b := make([]uint64, n) + a[pos] = 1 << 63 + b[pos] = 1 + for _, impl := range impls { + got := make([]uint64, n) + idx := impl.fn(a, b, got) + if idx != pos { + t.Fatalf("%s n=%d inject pos=%d: got idx %d", impl.name, n, pos, idx) + } + } + } + } +} + +// --------------------------------------------------------------------------- +// Benchmarks: per-impl × per-size. Standard go test -bench output. 
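+// The SIMD variants only compile with the experiment enabled; a typical
+// invocation (test tags assumed to match optools/run_ut.sh) is:
+//
+//	GOEXPERIMENT=simd go test -tags matrixone_test -run '^$' \
+//	    -bench 'BenchmarkD64(Add|Sub)' ./pkg/common/simdkernels/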
+// --------------------------------------------------------------------------- + +var d64BenchSizes = []int{64, 256, 1024, 4096, 16384} + +func benchD64Unchecked(b *testing.B, fn func(a, bb, r []uint64), n int) { + a := makeRandD64(n, 1) + bb := makeRandD64(n, 2) + r := make([]uint64, n) + b.SetBytes(int64(n) * 8 * 3) + b.ResetTimer() + for i := 0; i < b.N; i++ { + fn(a, bb, r) + } +} + +func benchD64Checked(b *testing.B, fn func(a, bb, r []uint64) int, n int) { + a := makeRandD64(n, 1) + bb := makeRandD64(n, 2) + for i := range a { + a[i] &= 0x3FFFFFFFFFFFFFFF + bb[i] &= 0x3FFFFFFFFFFFFFFF + } + r := make([]uint64, n) + b.SetBytes(int64(n) * 8 * 3) + b.ResetTimer() + for i := 0; i < b.N; i++ { + _ = fn(a, bb, r) + } +} + +func BenchmarkD64AddUnchecked(b *testing.B) { + for _, n := range d64BenchSizes { + b.Run("scalar/n="+strconv.Itoa(n), func(b *testing.B) { benchD64Unchecked(b, scalarD64AddUnchecked, n) }) + b.Run("avx2/n="+strconv.Itoa(n), func(b *testing.B) { benchD64Unchecked(b, avx2D64AddUnchecked, n) }) + if cpu.X86.HasAVX512 { + b.Run("avx512/n="+strconv.Itoa(n), func(b *testing.B) { benchD64Unchecked(b, avx512D64AddUnchecked, n) }) + } + } +} + +func BenchmarkD64SubUnchecked(b *testing.B) { + for _, n := range d64BenchSizes { + b.Run("scalar/n="+strconv.Itoa(n), func(b *testing.B) { benchD64Unchecked(b, scalarD64SubUnchecked, n) }) + b.Run("avx2/n="+strconv.Itoa(n), func(b *testing.B) { benchD64Unchecked(b, avx2D64SubUnchecked, n) }) + if cpu.X86.HasAVX512 { + b.Run("avx512/n="+strconv.Itoa(n), func(b *testing.B) { benchD64Unchecked(b, avx512D64SubUnchecked, n) }) + } + } +} + +func BenchmarkD64AddChecked(b *testing.B) { + for _, n := range d64BenchSizes { + b.Run("scalar/n="+strconv.Itoa(n), func(b *testing.B) { benchD64Checked(b, scalarD64AddChecked, n) }) + b.Run("avx2/n="+strconv.Itoa(n), func(b *testing.B) { benchD64Checked(b, avx2D64AddChecked, n) }) + if cpu.X86.HasAVX512 { + b.Run("avx512/n="+strconv.Itoa(n), func(b *testing.B) { benchD64Checked(b, avx512D64AddChecked, n) }) + } + } +} + +func BenchmarkD64SubChecked(b *testing.B) { + for _, n := range d64BenchSizes { + b.Run("scalar/n="+strconv.Itoa(n), func(b *testing.B) { benchD64Checked(b, scalarD64SubChecked, n) }) + b.Run("avx2/n="+strconv.Itoa(n), func(b *testing.B) { benchD64Checked(b, avx2D64SubChecked, n) }) + if cpu.X86.HasAVX512 { + b.Run("avx512/n="+strconv.Itoa(n), func(b *testing.B) { benchD64Checked(b, avx512D64SubChecked, n) }) + } + } +} + +// --------------------------------------------------------------------------- +// Scalar-broadcast tests: validate AVX2/AVX-512 broadcast variants against +// the scalar reference for both Unchecked (results) and Checked (results + +// first-overflow index) shapes. 
+// --------------------------------------------------------------------------- + +type d64ScalarUncheckedImpl struct { + name string + fn func(s uint64, v, r []uint64) +} + +type d64ScalarUncheckedImplR struct { + name string + fn func(v []uint64, s uint64, r []uint64) +} + +type d64ScalarCheckedImpl struct { + name string + fn func(s uint64, v, r []uint64) int +} + +type d64ScalarCheckedImplR struct { + name string + fn func(v []uint64, s uint64, r []uint64) int +} + +func d64ScalarSamples() []uint64 { + return []uint64{ + 0, 1, 42, + uint64(math.MaxInt64), uint64(math.MaxInt64) - 1, + 1 << 63, // MinInt64 + (1 << 63) + 1, + math.MaxUint64, + } +} + +func TestD64AddScalarVariants(t *testing.T) { + uncheckedImpls := []d64ScalarUncheckedImpl{ + {"scalar", scalarD64AddScalarUnchecked}, + {"avx2", avx2D64AddScalarUnchecked}, + } + if cpu.X86.HasAVX512 { + uncheckedImpls = append(uncheckedImpls, d64ScalarUncheckedImpl{"avx512", avx512D64AddScalarUnchecked}) + } + for _, n := range d64Sizes() { + v := makeRandD64(n, uint64(n)*31+13) + for _, s := range d64ScalarSamples() { + want := make([]uint64, n) + scalarD64AddScalarUnchecked(s, v, want) + for _, impl := range uncheckedImpls { + got := make([]uint64, n) + impl.fn(s, v, got) + for i := 0; i < n; i++ { + if got[i] != want[i] { + t.Fatalf("AddScalarUnchecked %s n=%d s=%x i=%d: got %x want %x", + impl.name, n, s, i, got[i], want[i]) + } + } + } + } + } +} + +func TestD64SubScalarVariants(t *testing.T) { + uncheckedImpls := []d64ScalarUncheckedImplR{ + {"scalar", scalarD64SubScalarUnchecked}, + {"avx2", avx2D64SubScalarUnchecked}, + } + if cpu.X86.HasAVX512 { + uncheckedImpls = append(uncheckedImpls, d64ScalarUncheckedImplR{"avx512", avx512D64SubScalarUnchecked}) + } + for _, n := range d64Sizes() { + v := makeRandD64(n, uint64(n)*37+17) + for _, s := range d64ScalarSamples() { + want := make([]uint64, n) + scalarD64SubScalarUnchecked(v, s, want) + for _, impl := range uncheckedImpls { + got := make([]uint64, n) + impl.fn(v, s, got) + for i := 0; i < n; i++ { + if got[i] != want[i] { + t.Fatalf("SubScalarUnchecked %s n=%d s=%x i=%d: got %x want %x", + impl.name, n, s, i, got[i], want[i]) + } + } + } + } + } +} + +func TestD64ScalarSubVariants(t *testing.T) { + uncheckedImpls := []d64ScalarUncheckedImpl{ + {"scalar", scalarD64ScalarSubUnchecked}, + {"avx2", avx2D64ScalarSubUnchecked}, + } + if cpu.X86.HasAVX512 { + uncheckedImpls = append(uncheckedImpls, d64ScalarUncheckedImpl{"avx512", avx512D64ScalarSubUnchecked}) + } + for _, n := range d64Sizes() { + v := makeRandD64(n, uint64(n)*41+19) + for _, s := range d64ScalarSamples() { + want := make([]uint64, n) + scalarD64ScalarSubUnchecked(s, v, want) + for _, impl := range uncheckedImpls { + got := make([]uint64, n) + impl.fn(s, v, got) + for i := 0; i < n; i++ { + if got[i] != want[i] { + t.Fatalf("ScalarSubUnchecked %s n=%d s=%x i=%d: got %x want %x", + impl.name, n, s, i, got[i], want[i]) + } + } + } + } + } +} + +func TestD64AddScalarCheckedVariants(t *testing.T) { + impls := []d64ScalarCheckedImpl{ + {"scalar", scalarD64AddScalarChecked}, + {"avx2", avx2D64AddScalarChecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d64ScalarCheckedImpl{"avx512", avx512D64AddScalarChecked}) + } + + // 1) No-overflow random (mask sign bit) inputs. 
+ for _, n := range d64Sizes() { + v := makeRandD64(n, uint64(n)*43+23) + for i := range v { + v[i] &= 0x3FFFFFFFFFFFFFFF + } + for _, s := range []uint64{0, 1, 42, 0x3FFFFFFFFFFFFFFF} { + want := make([]uint64, n) + if got := scalarD64AddScalarChecked(s, v, want); got != -1 { + t.Fatalf("setup: scalar reported overflow at %d for masked input n=%d s=%x", got, n, s) + } + for _, impl := range impls { + got := make([]uint64, n) + if idx := impl.fn(s, v, got); idx != -1 { + t.Fatalf("AddScalarChecked %s n=%d s=%x: spurious overflow at %d", impl.name, n, s, idx) + } + for i := 0; i < n; i++ { + if got[i] != want[i] { + t.Fatalf("AddScalarChecked %s n=%d s=%x i=%d: got %x want %x", + impl.name, n, s, i, got[i], want[i]) + } + } + } + } + } + + // 2) Inject a single overflow at varying positions, with s = MaxInt64. + s := uint64(math.MaxInt64) + for _, n := range []int{8, 16, 17, 33, 35, 64} { + for _, pos := range []int{0, 1, 4, 7, 8, n - 1} { + if pos < 0 || pos >= n { + continue + } + v := make([]uint64, n) + v[pos] = 1 // s + 1 = MaxInt64+1 → overflow + for _, impl := range impls { + got := make([]uint64, n) + idx := impl.fn(s, v, got) + if idx != pos { + t.Fatalf("AddScalarChecked %s n=%d inject pos=%d: got idx %d", impl.name, n, pos, idx) + } + } + } + } +} + +func TestD64SubScalarCheckedVariants(t *testing.T) { + impls := []d64ScalarCheckedImplR{ + {"scalar", scalarD64SubScalarChecked}, + {"avx2", avx2D64SubScalarChecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d64ScalarCheckedImplR{"avx512", avx512D64SubScalarChecked}) + } + + // 1) No-overflow random (mask sign bit) inputs. + for _, n := range d64Sizes() { + v := makeRandD64(n, uint64(n)*47+29) + for i := range v { + v[i] &= 0x3FFFFFFFFFFFFFFF + } + for _, s := range []uint64{0, 1, 42, 0x3FFFFFFFFFFFFFFF} { + want := make([]uint64, n) + if got := scalarD64SubScalarChecked(v, s, want); got != -1 { + t.Fatalf("setup: scalar reported overflow at %d for masked input n=%d s=%x", got, n, s) + } + for _, impl := range impls { + got := make([]uint64, n) + if idx := impl.fn(v, s, got); idx != -1 { + t.Fatalf("SubScalarChecked %s n=%d s=%x: spurious overflow at %d", impl.name, n, s, idx) + } + for i := 0; i < n; i++ { + if got[i] != want[i] { + t.Fatalf("SubScalarChecked %s n=%d s=%x i=%d: got %x want %x", + impl.name, n, s, i, got[i], want[i]) + } + } + } + } + } + + // 2) Inject overflow: v[pos] = MinInt64, s = 1 → MinInt64 - 1 → overflow. + s := uint64(1) + for _, n := range []int{8, 16, 17, 33, 35, 64} { + for _, pos := range []int{0, 1, 4, 7, 8, n - 1} { + if pos < 0 || pos >= n { + continue + } + v := make([]uint64, n) + v[pos] = 1 << 63 // MinInt64 + for _, impl := range impls { + got := make([]uint64, n) + idx := impl.fn(v, s, got) + if idx != pos { + t.Fatalf("SubScalarChecked %s n=%d inject pos=%d: got idx %d", impl.name, n, pos, idx) + } + } + } + } +} + +func TestD64ScalarSubCheckedVariants(t *testing.T) { + impls := []d64ScalarCheckedImpl{ + {"scalar", scalarD64ScalarSubChecked}, + {"avx2", avx2D64ScalarSubChecked}, + } + if cpu.X86.HasAVX512 { + impls = append(impls, d64ScalarCheckedImpl{"avx512", avx512D64ScalarSubChecked}) + } + + // 1) No-overflow random (mask sign bit) inputs. 
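+	// (The mask in fact clears the top two bits, so both operands lie in
+	// [0, 2^62) and s - v[i] cannot leave the int64 range.)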
+ for _, n := range d64Sizes() { + v := makeRandD64(n, uint64(n)*53+31) + for i := range v { + v[i] &= 0x3FFFFFFFFFFFFFFF + } + for _, s := range []uint64{0, 1, 42, 0x3FFFFFFFFFFFFFFF} { + want := make([]uint64, n) + if got := scalarD64ScalarSubChecked(s, v, want); got != -1 { + t.Fatalf("setup: scalar reported overflow at %d for masked input n=%d s=%x", got, n, s) + } + for _, impl := range impls { + got := make([]uint64, n) + if idx := impl.fn(s, v, got); idx != -1 { + t.Fatalf("ScalarSubChecked %s n=%d s=%x: spurious overflow at %d", impl.name, n, s, idx) + } + for i := 0; i < n; i++ { + if got[i] != want[i] { + t.Fatalf("ScalarSubChecked %s n=%d s=%x i=%d: got %x want %x", + impl.name, n, s, i, got[i], want[i]) + } + } + } + } + } + + // 2) Inject overflow: s = MinInt64, v[pos] = 1 → MinInt64 - 1 → overflow. + s := uint64(1 << 63) // MinInt64 + for _, n := range []int{8, 16, 17, 33, 35, 64} { + for _, pos := range []int{0, 1, 4, 7, 8, n - 1} { + if pos < 0 || pos >= n { + continue + } + v := make([]uint64, n) + v[pos] = 1 + for _, impl := range impls { + got := make([]uint64, n) + idx := impl.fn(s, v, got) + if idx != pos { + t.Fatalf("ScalarSubChecked %s n=%d inject pos=%d: got idx %d", impl.name, n, pos, idx) + } + } + } + } +} + +// makeRandD64Bounded returns Decimal64-range values: |x| < 10^18. +func makeRandD64Bounded(n int, seed uint64) []uint64 { + rng := rand.New(rand.NewPCG(seed, seed^0x9E3779B97F4A7C15)) + const maxAbs uint64 = 1_000_000_000_000_000_000 + out := make([]uint64, n) + for i := range out { + x := int64(rng.Uint64N(maxAbs)) + if rng.IntN(2) == 1 { + x = -x + } + out[i] = uint64(x) + } + return out +} + +func TestD64SumReduceToD128Variants(t *testing.T) { + impls := []struct { + name string + fn func([]uint64) (uint64, uint64) + }{ + {"scalar", scalarD64SumReduceToD128}, + } + if cpu.X86.HasAVX2 { + impls = append(impls, struct { + name string + fn func([]uint64) (uint64, uint64) + }{"avx2", avx2D64SumReduceToD128}) + } + if cpu.X86.HasAVX512 { + impls = append(impls, struct { + name string + fn func([]uint64) (uint64, uint64) + }{"avx512", avx512D64SumReduceToD128}) + } + + for _, n := range d64Sizes() { + v := makeRandD64Bounded(n, uint64(n)*23+1) + refLo, refHi := scalarD64SumReduceToD128(v) + for _, im := range impls { + lo, hi := im.fn(v) + if lo != refLo || hi != refHi { + t.Fatalf("%s n=%d: got (%x,%x) want (%x,%x)", im.name, n, lo, hi, refLo, refHi) + } + } + } +} + +func BenchmarkD64SumReduceToD128(b *testing.B) { + for _, n := range d64BenchSizes { + v := makeRandD64Bounded(n, 1) + b.Run("scalar/n="+strconv.Itoa(n), func(b *testing.B) { + b.SetBytes(int64(n) * 8) + for i := 0; i < b.N; i++ { + _, _ = scalarD64SumReduceToD128(v) + } + }) + if cpu.X86.HasAVX2 { + b.Run("avx2/n="+strconv.Itoa(n), func(b *testing.B) { + b.SetBytes(int64(n) * 8) + for i := 0; i < b.N; i++ { + _, _ = avx2D64SumReduceToD128(v) + } + }) + } + if cpu.X86.HasAVX512 { + b.Run("avx512/n="+strconv.Itoa(n), func(b *testing.B) { + b.SetBytes(int64(n) * 8) + for i := 0; i < b.N; i++ { + _, _ = avx512D64SumReduceToD128(v) + } + }) + } + } +} diff --git a/pkg/common/simdkernels/d64_compare.go b/pkg/common/simdkernels/d64_compare.go new file mode 100644 index 0000000000000..39622f1d8bd5c --- /dev/null +++ b/pkg/common/simdkernels/d64_compare.go @@ -0,0 +1,47 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package simdkernels + +// Decimal64 element-wise comparisons producing []bool output (1 byte per +// element; 0 = false, 1 = true). +// +// Inputs a, b are uint64-typed slices interpreted as int64 for signed +// compares (Decimal64 is two's-complement int64 with sign-bit-checked +// negativity, so signed integer ordering matches Decimal64.Less ordering). + +var ( + D64Eq func(a, b []uint64, out []bool) = scalarD64Eq + D64Lt func(a, b []uint64, out []bool) = scalarD64Lt +) + +func scalarD64Eq(a, b []uint64, out []bool) { + n := len(a) + if len(b) < n || len(out) < n { + return + } + for i := 0; i < n; i++ { + out[i] = a[i] == b[i] + } +} + +func scalarD64Lt(a, b []uint64, out []bool) { + n := len(a) + if len(b) < n || len(out) < n { + return + } + for i := 0; i < n; i++ { + out[i] = int64(a[i]) < int64(b[i]) + } +} diff --git a/pkg/common/simdkernels/d64_compare_simd_amd64.go b/pkg/common/simdkernels/d64_compare_simd_amd64.go new file mode 100644 index 0000000000000..5b47d60a4d36b --- /dev/null +++ b/pkg/common/simdkernels/d64_compare_simd_amd64.go @@ -0,0 +1,96 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build goexperiment.simd && amd64 + +package simdkernels + +import ( + "simd/archsimd" + "unsafe" + + "golang.org/x/sys/cpu" +) + +// AVX2 was tried (4-elem and 8-elem interleaved batches) but consistently +// loses ~13-24% to scalar on Zen 3 because: +// - scalar `cmp/sete/store` runs at ~1.2 cyc/elem (near retirement-width +// limit on a 4-wide OoO core); +// - AVX2 has no native qword->byte gather (VPMOVQB is AVX-512 only), +// so the post-compare bool packing on AVX2 (vector store -> 4 byte +// loads -> shift/OR -> store) adds a serial dependency chain that +// the OoO scheduler cannot parallelize away. +// Therefore AVX2 is intentionally NOT registered. Scalar stays as default +// on AVX2-only hardware. AVX-512 uses VPMOVQB (TruncateToInt8) which +// performs the byte gather natively in one instruction; this path is +// expected to win on Intel SPR / Zen 5 but cannot be benched on Zen 3. + +func init() { + if cpu.X86.HasAVX512 { + D64Eq = avx512D64Eq + D64Lt = avx512D64Lt + } +} + +// AVX-512 path: 8 D64 elements per inner step (Int64x8). Compare -> +// AND with broadcast(1) puts a 0/1 byte in the low byte of each qword. +// VPMOVQB (TruncateToInt8) then gathers the 8 low bytes into the low +// 8 bytes of an Int8x16. We extract that as a single uint64 and emit +// one 8-byte unsafe store. 
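+// Lane j's 0/1 lands in byte j of the packed word, so the little-endian
+// store (safe here: this file is amd64-only) writes lane j to out[i+j].
+// E.g. lanes (1,0,0,1,1,0,0,0) emit bytes 01 00 00 01 01 00 00 00 at &out[i].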
+ +//go:nosplit +func storeBools8(v archsimd.Int64x8, one archsimd.Int64x8, out []bool, i int) { + packed := v.And(one).TruncateToInt8().AsInt64x2().GetElem(0) + *(*uint64)(unsafe.Pointer(&out[i])) = uint64(packed) +} + +func avx512D64Eq(a, b []uint64, out []bool) { + n := len(a) + if n == 0 || len(b) < n || len(out) < n { + return + } + pa, pb := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]) + one := archsimd.BroadcastInt64x8(1) + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 8 + av := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off))) + bv := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off))) + storeBools8(av.Equal(bv).ToInt64x8(), one, out, i) + } + for ; i < n; i++ { + out[i] = a[i] == b[i] + } +} + +func avx512D64Lt(a, b []uint64, out []bool) { + n := len(a) + if n == 0 || len(b) < n || len(out) < n { + return + } + pa, pb := unsafe.Pointer(&a[0]), unsafe.Pointer(&b[0]) + one := archsimd.BroadcastInt64x8(1) + + i := 0 + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 8 + av := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pa, off))) + bv := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pb, off))) + storeBools8(av.Less(bv).ToInt64x8(), one, out, i) + } + for ; i < n; i++ { + out[i] = int64(a[i]) < int64(b[i]) + } +} diff --git a/pkg/common/simdkernels/d64_compare_test.go b/pkg/common/simdkernels/d64_compare_test.go new file mode 100644 index 0000000000000..3de2c2ce36e2b --- /dev/null +++ b/pkg/common/simdkernels/d64_compare_test.go @@ -0,0 +1,128 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package simdkernels + +import ( + "math" + "math/rand" + "strconv" + "testing" +) + +func makeD64Pair(n int, seed int64) ([]uint64, []uint64) { + r := rand.New(rand.NewSource(seed)) + a := make([]uint64, n) + b := make([]uint64, n) + for i := 0; i < n; i++ { + a[i] = uint64(r.Int63() - r.Int63()) + // 25% chance b shares value with a (test equality path) + if r.Intn(4) == 0 { + b[i] = a[i] + } else { + b[i] = uint64(r.Int63() - r.Int63()) + } + } + return a, b +} + +func edgeD64Pairs() (a, b []uint64) { + vals := []int64{ + math.MinInt64, math.MinInt64 + 1, -1 << 32, -1, 0, 1, 1 << 32, + math.MaxInt64 - 1, math.MaxInt64, + } + for _, x := range vals { + for _, y := range vals { + a = append(a, uint64(x)) + b = append(b, uint64(y)) + } + } + return +} + +func TestD64Eq(t *testing.T) { + cases := [][2][]uint64{} + for _, n := range []int{0, 1, 3, 4, 7, 8, 15, 16, 17, 64, 1023} { + a, b := makeD64Pair(n, int64(n)+1) + cases = append(cases, [2][]uint64{a, b}) + } + a, b := edgeD64Pairs() + cases = append(cases, [2][]uint64{a, b}) + + for ci, c := range cases { + a, b := c[0], c[1] + out := make([]bool, len(a)) + D64Eq(a, b, out) + for i := range a { + want := a[i] == b[i] + if out[i] != want { + t.Fatalf("case %d idx %d: D64Eq(%x,%x)=%v want %v", ci, i, a[i], b[i], out[i], want) + } + } + } +} + +func TestD64Lt(t *testing.T) { + cases := [][2][]uint64{} + for _, n := range []int{0, 1, 3, 4, 7, 8, 15, 16, 17, 64, 1023} { + a, b := makeD64Pair(n, int64(n)+101) + cases = append(cases, [2][]uint64{a, b}) + } + a, b := edgeD64Pairs() + cases = append(cases, [2][]uint64{a, b}) + + for ci, c := range cases { + a, b := c[0], c[1] + out := make([]bool, len(a)) + D64Lt(a, b, out) + for i := range a { + want := int64(a[i]) < int64(b[i]) + if out[i] != want { + t.Fatalf("case %d idx %d: D64Lt(%d,%d)=%v want %v", + ci, i, int64(a[i]), int64(b[i]), out[i], want) + } + } + } +} + +func benchD64Cmp(b *testing.B, fn func(a, b []uint64, out []bool), n int) { + x, y := makeD64Pair(n, 42) + out := make([]bool, n) + b.ResetTimer() + b.SetBytes(int64(n) * 16) + for i := 0; i < b.N; i++ { + fn(x, y, out) + } +} + +func BenchmarkD64Eq_Scalar(b *testing.B) { + for _, n := range []int{16, 64, 256, 1024, 4096} { + b.Run(strconv.Itoa(n), func(b *testing.B) { benchD64Cmp(b, scalarD64Eq, n) }) + } +} +func BenchmarkD64Eq_Dispatch(b *testing.B) { + for _, n := range []int{16, 64, 256, 1024, 4096} { + b.Run(strconv.Itoa(n), func(b *testing.B) { benchD64Cmp(b, D64Eq, n) }) + } +} +func BenchmarkD64Lt_Scalar(b *testing.B) { + for _, n := range []int{16, 64, 256, 1024, 4096} { + b.Run(strconv.Itoa(n), func(b *testing.B) { benchD64Cmp(b, scalarD64Lt, n) }) + } +} +func BenchmarkD64Lt_Dispatch(b *testing.B) { + for _, n := range []int{16, 64, 256, 1024, 4096} { + b.Run(strconv.Itoa(n), func(b *testing.B) { benchD64Cmp(b, D64Lt, n) }) + } +} diff --git a/pkg/common/simdkernels/d64_mul.go b/pkg/common/simdkernels/d64_mul.go new file mode 100644 index 0000000000000..8216d0bb0c381 --- /dev/null +++ b/pkg/common/simdkernels/d64_mul.go @@ -0,0 +1,133 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package simdkernels + +import "math/bits" + +// Decimal64 × Decimal64 → Decimal128 on slices. +// +// Inputs a, b are int64-typed (length n). Output r holds Decimal128 values +// stored as interleaved (lo, hi) uint64 pairs, length 2n. The kernel matches +// the semantics of d64MulInline in arith_decimal_fast.go: per element compute +// the signed 128-bit product of a[i] * b[i] via abs-mul-conditional-negate. +// +// The product of two int64s fits in 128 bits, so no overflow is possible at +// this step — the API has no Checked variant. +// +// scaleAdj is an output-scale adjustment in [-18, 0]. When non-zero the +// kernel divides the 128-bit product by 10^|scaleAdj| with half-up rounding +// (matching d128DivPow10Once). The scalar fallback reuses the same divisor +// per element; the SIMD path computes the product with SIMD and then defers +// the divide to a per-element scalar helper (the divide-by-constant primitive +// will be SIMD-vectorized in a separate task). + +var ( + D64MulNoBroadcast func(a, b, r []uint64, scaleAdj int32) = scalarD64MulNoBroadcast +) + +// pow10 mirrors types.Pow10 for 0..18. Duplicated here to keep simdkernels +// free of a dependency cycle on container/types. +var pow10Table = [19]uint64{ + 1, + 10, + 100, + 1000, + 10000, + 100000, + 1000000, + 10000000, + 100000000, + 1000000000, + 10000000000, + 100000000000, + 1000000000000, + 10000000000000, + 100000000000000, + 1000000000000000, + 10000000000000000, + 100000000000000000, + 1000000000000000000, +} + +func scalarD64MulNoBroadcast(a, b, r []uint64, scaleAdj int32) { + n := len(a) + if len(b) < n || len(r) < 2*n { + return + } + if scaleAdj == 0 { + for i := 0; i < n; i++ { + lo, hi := d64MulOne(a[i], b[i]) + r[2*i] = lo + r[2*i+1] = hi + } + return + } + d := pow10Table[-scaleAdj] + half := (d + 1) >> 1 + for i := 0; i < n; i++ { + lo, hi := d64MulOne(a[i], b[i]) + lo, hi = d128DivConst(lo, hi, d, half) + r[2*i] = lo + r[2*i+1] = hi + } +} + +// d64MulOne computes the signed 128-bit product (lo, hi) of two int64-typed +// uint64s, exactly mirroring d64MulInline in arith_decimal_fast.go. +func d64MulOne(av, bv uint64) (lo, hi uint64) { + xi, yi := int64(av), int64(bv) + mx, my := xi>>63, yi>>63 + ax, ay := uint64((xi^mx)-mx), uint64((yi^my)-my) + hi, lo = bits.Mul64(ax, ay) + nm := uint64((xi ^ yi) >> 63) // 0xFF..FF iff signs differ + lo ^= nm + hi ^= nm + var c uint64 + lo, c = bits.Add64(lo, 0, nm&1) + hi, _ = bits.Add64(hi, 0, c) + return lo, hi +} + +// d128DivConst divides a signed 128-bit value (lo, hi) by a positive +// constant d (≤ 10^18), with half-up rounding. Mirrors d128ScaleDownPow10 +// + d128DivPow10Once in arith_decimal_fast.go (sign extraction → unsigned +// divide → re-apply sign). +func d128DivConst(lo, hi, d, half uint64) (uint64, uint64) { + // Branchless abs of (lo, hi) when interpreted as int128. + // sign = top bit of hi as a 0/-1 mask. + sign := uint64(int64(hi) >> 63) + // Negate iff sign == -1: (lo, hi) := -(lo, hi). 
+ lo ^= sign + hi ^= sign + var c uint64 + lo, c = bits.Add64(lo, 0, sign&1) + hi, _ = bits.Add64(hi, 0, c) + + // Unsigned 128 ÷ 64 with half-up rounding (matches d128DivPow10Once). + var rem uint64 + hi, rem = bits.Div64(0, hi, d) + lo, rem = bits.Div64(rem, lo, d) + _, borrow := bits.Sub64(rem, half, 0) + round := 1 - borrow + lo, c = bits.Add64(lo, round, 0) + hi += c + + // Re-apply sign: if sign == -1, negate the quotient. + lo ^= sign + hi ^= sign + lo, c = bits.Add64(lo, 0, sign&1) + hi, _ = bits.Add64(hi, 0, c) + return lo, hi +} diff --git a/pkg/common/simdkernels/d64_mul_test.go b/pkg/common/simdkernels/d64_mul_test.go new file mode 100644 index 0000000000000..3c4244931cb99 --- /dev/null +++ b/pkg/common/simdkernels/d64_mul_test.go @@ -0,0 +1,226 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build goexperiment.simd && amd64 + +package simdkernels + +import ( + "math" + "math/bits" + "math/rand/v2" + "strconv" + "testing" +) + +// referenceD64Mul is the canonical scalar reference. Independent of the +// dispatcher under test (scalarD64MulNoBroadcast or any future SIMD swap-in) +// so the SIMD impls are checked against the same source-of-truth used by +// the main repo (d64MulInline + d128DivPow10Once semantics). 
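A quick spot-check of d64MulOne and d128DivConst from the kernel file above, written as a same-package test sketch (illustrative, not part of the patch). It pins the abs-mul-negate sign rule and both sides of the half-up boundary, where half = (d+1)>>1 = 50 for d = 100:

package simdkernels

import "testing"

func TestD64MulRoundingSketch(t *testing.T) {
	// Sign rule: (-3) × 5 must come back as -15 in 128 bits (hi all-ones).
	x := int64(-3)
	lo, hi := d64MulOne(uint64(x), 5)
	if int64(lo) != -15 || hi != ^uint64(0) {
		t.Fatalf("mul: got (%d, %#x)", int64(lo), hi)
	}
	// Half-up: rem 45 < 50 truncates; rem 50 >= 50 rounds away from zero.
	if q, _ := d128DivConst(12345, 0, 100, 50); q != 123 {
		t.Fatalf("div: want 123, got %d", q)
	}
	if q, _ := d128DivConst(12350, 0, 100, 50); q != 124 {
		t.Fatalf("div: want 124, got %d", q)
	}
}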
+func referenceD64Mul(a, b []uint64, scaleAdj int32) []uint64 { + n := len(a) + r := make([]uint64, 2*n) + for i := 0; i < n; i++ { + xi, yi := int64(a[i]), int64(b[i]) + mx, my := xi>>63, yi>>63 + ax, ay := uint64((xi^mx)-mx), uint64((yi^my)-my) + hi, lo := bits.Mul64(ax, ay) + nm := uint64((xi ^ yi) >> 63) + lo ^= nm + hi ^= nm + var c uint64 + lo, c = bits.Add64(lo, 0, nm&1) + hi, _ = bits.Add64(hi, 0, c) + if scaleAdj != 0 { + d := pow10Table[-scaleAdj] + half := (d + 1) >> 1 + // abs + s := uint64(int64(hi) >> 63) + lo ^= s + hi ^= s + lo, c = bits.Add64(lo, 0, s&1) + hi, _ = bits.Add64(hi, 0, c) + // divide + var rem uint64 + hi, rem = bits.Div64(0, hi, d) + lo, rem = bits.Div64(rem, lo, d) + _, borrow := bits.Sub64(rem, half, 0) + round := 1 - borrow + lo, c = bits.Add64(lo, round, 0) + hi += c + // re-sign + lo ^= s + hi ^= s + lo, c = bits.Add64(lo, 0, s&1) + hi, _ = bits.Add64(hi, 0, c) + } + r[2*i] = lo + r[2*i+1] = hi + } + return r +} + +func d64MulSizes() []int { + return []int{0, 1, 3, 4, 7, 8, 15, 16, 17, 31, 32, 33, 35, 63, 64, 127, 128, 255, 256, 1023, 4096} +} + +type d64MulImpl struct { + name string + fn func(a, b, r []uint64, scaleAdj int32) +} + +func d64MulImpls() []d64MulImpl { + out := []d64MulImpl{{"scalar", scalarD64MulNoBroadcast}} + if D64MulNoBroadcast != nil { + out = append(out, d64MulImpl{"dispatch", D64MulNoBroadcast}) + } + return out +} + +func makeRandInt64Slice(n int, seed uint64, maxAbs uint64) []uint64 { + rng := rand.New(rand.NewPCG(seed, seed^0xD1B54A32D192ED03)) + out := make([]uint64, n) + for i := range out { + v := rng.Uint64() + if maxAbs > 0 { + v %= 2 * maxAbs + out[i] = uint64(int64(v) - int64(maxAbs)) + } else { + out[i] = v + } + } + return out +} + +// u64 wraps a runtime int64→uint64 conversion. Used in test fixtures to +// avoid Go's compile-time check that rejects `uint64(int64(-N))` as a +// constant expression. 
+func u64(x int64) uint64 { return uint64(x) } + +func d64MulEdgeInputs() (a, b []uint64) { + a = []uint64{ + 0, + 1, + u64(-1), + u64(math.MaxInt64), + u64(math.MinInt64), + u64(math.MaxInt32), + u64(math.MinInt32), + 1 << 32, + 1<<32 - 1, + u64(-(1 << 32)), + u64(1e15), + u64(-1e15), + u64(1e9), + u64(-1e9), + u64(7), + u64(-7), + } + b = []uint64{ + 0, + u64(math.MaxInt64), + u64(math.MinInt64), + 1, + u64(-1), + 1 << 31, + u64(-(1 << 31)), + u64(1e9), + u64(-1e9), + 1 << 30, + u64(1e3), + u64(-1e3), + u64(1e9), + u64(-1e9), + u64(13), + u64(-13), + } + return a, b +} + +func TestD64MulCorrectness(t *testing.T) { + scaleAdjs := []int32{0, -1, -2, -4, -8, -12, -18} + impls := d64MulImpls() + + for _, sa := range scaleAdjs { + t.Run("scaleAdj="+strconv.Itoa(int(sa)), func(t *testing.T) { + for _, n := range d64MulSizes() { + a := makeRandInt64Slice(n, 0xC0FFEE^uint64(n), 1<<40) + b := makeRandInt64Slice(n, 0xBADBEEF^uint64(n), 1<<40) + want := referenceD64Mul(a, b, sa) + for _, im := range impls { + got := make([]uint64, 2*n) + im.fn(a, b, got, sa) + for i := 0; i < 2*n; i++ { + if got[i] != want[i] { + t.Fatalf("%s n=%d sa=%d idx=%d: got 0x%x want 0x%x (a=%d b=%d)", + im.name, n, sa, i, got[i], want[i], + int64(a[i/2]), int64(b[i/2])) + return + } + } + } + } + }) + } +} + +func TestD64MulEdges(t *testing.T) { + a, b := d64MulEdgeInputs() + scaleAdjs := []int32{0, -1, -8, -18} + impls := d64MulImpls() + for _, sa := range scaleAdjs { + want := referenceD64Mul(a, b, sa) + for _, im := range impls { + got := make([]uint64, 2*len(a)) + im.fn(a, b, got, sa) + for i := 0; i < 2*len(a); i++ { + if got[i] != want[i] { + t.Fatalf("%s sa=%d idx=%d: got 0x%x want 0x%x (a=%d b=%d)", + im.name, sa, i, got[i], want[i], + int64(a[i/2]), int64(b[i/2])) + } + } + } + } +} + +// --------------------------------------------------------------------------- +// Benchmarks +// --------------------------------------------------------------------------- + +func benchD64Mul(b *testing.B, fn func(a, b, r []uint64, scaleAdj int32), n int, scaleAdj int32) { + a := makeRandInt64Slice(n, 0x1234, 1<<40) + bv := makeRandInt64Slice(n, 0x5678, 1<<40) + r := make([]uint64, 2*n) + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + fn(a, bv, r, scaleAdj) + } +} + +func BenchmarkD64Mul(b *testing.B) { + sizes := []int{16, 64, 256, 1024, 4096} + scaleAdjs := []int32{0, -8} + for _, sa := range scaleAdjs { + for _, n := range sizes { + b.Run("scalar/sa="+strconv.Itoa(int(sa))+"/n="+strconv.Itoa(n), func(b *testing.B) { + benchD64Mul(b, scalarD64MulNoBroadcast, n, sa) + }) + b.Run("dispatch/sa="+strconv.Itoa(int(sa))+"/n="+strconv.Itoa(n), func(b *testing.B) { + benchD64Mul(b, D64MulNoBroadcast, n, sa) + }) + } + } +} diff --git a/pkg/common/simdkernels/d64_scale.go b/pkg/common/simdkernels/d64_scale.go new file mode 100644 index 0000000000000..cd9c2b68b5b76 --- /dev/null +++ b/pkg/common/simdkernels/d64_scale.go @@ -0,0 +1,66 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package simdkernels + +import "math/bits" + +// Decimal64 scale-into-rs (multiply each element by a constant uint64 factor +// f = 10^scaleDiff), used by Add/Sub when input scales differ. +// +// Two variants: +// *Unchecked — caller has prescanned that every |vec[i]| ≤ MaxInt64/f, so +// a plain truncating 64-bit multiply suffices. Hot path for the +// common diff-scale add/sub case where data magnitude is small. +// *Checked — uses bits.Mul64 / 128-bit product, returns the index of the +// first overflowing element or -1. +// +// Signature matches the d64_addsub.go convention: slices are uint64 reinterpret +// of the underlying Decimal64. Sign is encoded in bit 63. + +var ( + D64ScaleUnchecked func(vec, rs []uint64, f uint64) = scalarD64ScaleUnchecked + D64ScaleChecked func(vec, rs []uint64, f uint64) int = scalarD64ScaleChecked +) + +func scalarD64ScaleUnchecked(vec, rs []uint64, f uint64) { + n := len(rs) + if len(vec) < n { + return + } + for i := 0; i < n; i++ { + signBit := vec[i] >> 63 + mask := -signBit + abs := (vec[i] ^ mask) + signBit + rs[i] = (abs*f ^ mask) + signBit + } +} + +func scalarD64ScaleChecked(vec, rs []uint64, f uint64) int { + n := len(rs) + if len(vec) < n { + return -1 + } + for i := 0; i < n; i++ { + signBit := vec[i] >> 63 + mask := -signBit + abs := (vec[i] ^ mask) + signBit + hi, lo := bits.Mul64(abs, f) + if hi|(lo>>63) != 0 { + return i + } + rs[i] = (lo ^ mask) + signBit + } + return -1 +} diff --git a/pkg/common/simdkernels/d64_scale_simd_amd64.go b/pkg/common/simdkernels/d64_scale_simd_amd64.go new file mode 100644 index 0000000000000..268510f6d59c8 --- /dev/null +++ b/pkg/common/simdkernels/d64_scale_simd_amd64.go @@ -0,0 +1,93 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build goexperiment.simd && amd64 + +package simdkernels + +import ( + "simd/archsimd" + "unsafe" + + "golang.org/x/sys/cpu" +) + +// D64Scale: per-lane signed 64-bit value × constant uint64 factor f. +// +// Algorithm (per lane, matches scalarD64ScaleUnchecked): +// +// mask = v >>arith 63 // 0 or -1 +// abs = (v XOR mask) - mask // |v| as Uint64 +// prod = abs * f // truncating low-64 product +// out = (prod XOR mask) - mask // restore sign +// +// The unchecked path assumes the caller has prescanned that abs ≤ MaxInt64/f +// for every lane, so the truncating mul cannot overflow. +// +// Only AVX-512 is enabled. AVX-512DQ provides VPMULLQ which gives one fused +// 64×64→low-64 multiply per Int64x8 lane. AVX2 lacks VPMULLQ; emulating it +// via 32×32 partial products (3 VPMULUDQ + shifts + adds per 4 lanes) is +// roughly 5× slower than scalar IMUL64 on Zen 3, so we fall back to scalar +// on AVX2-only hosts. 
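To make the Unchecked precondition concrete, here is a hedged caller sketch (scaleD64Sketch and its inline prescan are illustrative, not part of the patch): only when every |v| stays within MaxInt64/f is the truncating kernel safe; otherwise the Checked variant reports the first overflow:

package simdkernels

import "math"

// scaleD64Sketch prescans |v| <= MaxInt64/f using the same branchless abs
// as the kernels, then dispatches to the Unchecked variant; on any unsafe
// lane it falls back to Checked, which returns the first overflowing
// index or -1.
func scaleD64Sketch(vec, rs []uint64, f uint64) int {
	limit := uint64(math.MaxInt64) / f
	for _, v := range vec {
		sign := v >> 63
		mask := -sign
		if (v^mask)+sign > limit { // |v| too large for a truncating mul
			return D64ScaleChecked(vec, rs, f)
		}
	}
	D64ScaleUnchecked(vec, rs, f)
	return -1
}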
+ +func init() { + if cpu.X86.HasAVX512 { + D64ScaleUnchecked = avx512D64ScaleUnchecked + } +} + +// --------------------------------------------------------------------------- +// AVX-512 path — Int64x8.Mul (VPMULLQ from AVX-512DQ). +// One real multiply per 8 lanes. Main loop unrolls 32 lanes per iter. +// --------------------------------------------------------------------------- + +func avx512D64ScaleUnchecked(vec, rs []uint64, f uint64) { + n := len(rs) + if n == 0 || len(vec) < n { + return + } + pv, pr := unsafe.Pointer(&vec[0]), unsafe.Pointer(&rs[0]) + + fv := archsimd.BroadcastInt64x8(int64(f)) + zero := archsimd.BroadcastInt64x8(0) + + i := 0 + for ; i+32 <= n; i += 32 { + off := uintptr(i) * 8 + v0 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + v1 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+64))) + v2 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+128))) + v3 := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off+192))) + m0 := zero.Greater(v0).ToInt64x8() + m1 := zero.Greater(v1).ToInt64x8() + m2 := zero.Greater(v2).ToInt64x8() + m3 := zero.Greater(v3).ToInt64x8() + v0.Xor(m0).Sub(m0).Mul(fv).Xor(m0).Sub(m0).Store((*[8]int64)(unsafe.Add(pr, off))) + v1.Xor(m1).Sub(m1).Mul(fv).Xor(m1).Sub(m1).Store((*[8]int64)(unsafe.Add(pr, off+64))) + v2.Xor(m2).Sub(m2).Mul(fv).Xor(m2).Sub(m2).Store((*[8]int64)(unsafe.Add(pr, off+128))) + v3.Xor(m3).Sub(m3).Mul(fv).Xor(m3).Sub(m3).Store((*[8]int64)(unsafe.Add(pr, off+192))) + } + for ; i+8 <= n; i += 8 { + off := uintptr(i) * 8 + v := archsimd.LoadInt64x8((*[8]int64)(unsafe.Add(pv, off))) + m := zero.Greater(v).ToInt64x8() + v.Xor(m).Sub(m).Mul(fv).Xor(m).Sub(m).Store((*[8]int64)(unsafe.Add(pr, off))) + } + for ; i < n; i++ { + signBit := vec[i] >> 63 + mask := -signBit + abs := (vec[i] ^ mask) + signBit + rs[i] = (abs*f ^ mask) + signBit + } +} diff --git a/pkg/common/simdkernels/d64_scale_test.go b/pkg/common/simdkernels/d64_scale_test.go new file mode 100644 index 0000000000000..b382f5d384752 --- /dev/null +++ b/pkg/common/simdkernels/d64_scale_test.go @@ -0,0 +1,156 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build goexperiment.simd && amd64 + +package simdkernels + +import ( + "math" + "math/rand/v2" + "strconv" + "testing" +) + +// pow10 mirrors types.Pow10 up to 10^18; sufficient for D64 scale factors. 
+var pow10 = [...]uint64{ + 1, + 10, + 100, + 1000, + 10000, + 100000, + 1000000, + 10000000, + 100000000, + 1000000000, + 10000000000, + 100000000000, + 1000000000000, + 10000000000000, + 100000000000000, + 1000000000000000, + 10000000000000000, + 100000000000000000, + 1000000000000000000, +} + +func d64ScaleSizes() []int { + return []int{0, 1, 3, 4, 5, 7, 8, 15, 16, 17, 31, 32, 33, 63, 64, 127, 1024, 4096} +} + +type d64ScaleImpl struct { + name string + fn func(vec, rs []uint64, f uint64) +} + +func d64ScaleImpls() []d64ScaleImpl { + out := []d64ScaleImpl{{name: "scalar", fn: scalarD64ScaleUnchecked}} + if D64ScaleUnchecked != nil { + out = append(out, d64ScaleImpl{name: "dispatch", fn: D64ScaleUnchecked}) + } + return out +} + +// makeRandSafeD64 returns n int64 lanes, each guaranteed to satisfy +// |v| ≤ MaxInt64 / f (the unchecked-path precondition). +func makeRandSafeD64(n int, f uint64, seed uint64) []uint64 { + rng := rand.New(rand.NewPCG(seed, seed^0xDEADBEEF)) + maxAbs := uint64(math.MaxInt64) / f + out := make([]uint64, n) + for i := range out { + v := int64(rng.Uint64N(maxAbs + 1)) + if rng.Uint64()&1 == 1 { + v = -v + } + out[i] = uint64(v) + } + return out +} + +func TestD64ScaleUncheckedCorrectness(t *testing.T) { + for _, scale := range []int{1, 2, 6, 12, 18} { + f := pow10[scale] + for _, n := range d64ScaleSizes() { + vec := makeRandSafeD64(n, f, uint64(scale*1000+n)) + want := make([]uint64, n) + scalarD64ScaleUnchecked(vec, want, f) + for _, im := range d64ScaleImpls() { + got := make([]uint64, n) + im.fn(vec, got, f) + for i := 0; i < n; i++ { + if got[i] != want[i] { + t.Fatalf("%s scale=%d n=%d idx=%d: got 0x%x want 0x%x (vec=%d)", + im.name, scale, n, i, got[i], want[i], int64(vec[i])) + } + } + } + } + } +} + +func TestD64ScaleUncheckedEdges(t *testing.T) { + cases := []struct { + name string + vec []int64 + f uint64 + }{ + {"zeros", []int64{0, 0, 0, 0, 0}, 1000}, + {"ones", []int64{1, -1, 1, -1, 1, -1, 1, -1, 1}, pow10[6]}, + {"max_safe", []int64{ + int64(math.MaxInt64 / pow10[3]), -int64(math.MaxInt64 / pow10[3]), + int64(math.MaxInt64 / pow10[3]), -int64(math.MaxInt64 / pow10[3]), + }, pow10[3]}, + {"alt_signs", []int64{1, -1000, 12345, -987654321, 0, 99, -42, 100000}, pow10[2]}, + } + for _, tc := range cases { + vec := make([]uint64, len(tc.vec)) + for i, v := range tc.vec { + vec[i] = uint64(v) + } + want := make([]uint64, len(vec)) + scalarD64ScaleUnchecked(vec, want, tc.f) + for _, im := range d64ScaleImpls() { + got := make([]uint64, len(vec)) + im.fn(vec, got, tc.f) + for i := range vec { + if got[i] != want[i] { + t.Fatalf("%s/%s idx=%d: got %d want %d", tc.name, im.name, i, + int64(got[i]), int64(want[i])) + } + } + } + } +} + +func benchmarkD64Scale(b *testing.B, fn func(vec, rs []uint64, f uint64), n int, f uint64) { + vec := makeRandSafeD64(n, f, 0xC0FFEE^uint64(n)) + rs := make([]uint64, n) + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + fn(vec, rs, f) + } +} + +func BenchmarkD64ScaleUnchecked(b *testing.B) { + f := pow10[6] + for _, n := range []int{16, 64, 256, 1024, 4096} { + for _, im := range d64ScaleImpls() { + b.Run(im.name+"/n="+strconv.Itoa(n), func(b *testing.B) { + benchmarkD64Scale(b, im.fn, n, f) + }) + } + } +} diff --git a/pkg/common/simdkernels/decimal_amd64.go b/pkg/common/simdkernels/decimal_amd64.go new file mode 100644 index 0000000000000..22774d2451e3b --- /dev/null +++ b/pkg/common/simdkernels/decimal_amd64.go @@ -0,0 +1,26 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, 
Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package simdkernels + +import "unsafe" + +// Decimal64SignExtend widens n Decimal64 (uint64) values to Decimal128 via +// branchless sign extension. dst points to the Decimal128 output array, +// src points to the Decimal64 input array. +// Uses 4× loop unrolling and PREFETCHT0 to hide L2 latency on cold +// destination cache lines that would otherwise stall the store pipeline. +// +//go:noescape +func Decimal64SignExtend(dst, src unsafe.Pointer, n int) diff --git a/pkg/common/simdkernels/decimal_amd64.s b/pkg/common/simdkernels/decimal_amd64.s new file mode 100644 index 0000000000000..9db17fb647988 --- /dev/null +++ b/pkg/common/simdkernels/decimal_amd64.s @@ -0,0 +1,94 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "textflag.h" + +// func Decimal64SignExtend(dst, src unsafe.Pointer, n int) +// +// Widens n Decimal64 (8-byte) values to Decimal128 (16-byte struct {B0_63, B64_127 uint64}) +// via arithmetic right-shift sign extension. +// 4× unrolled with PREFETCHT0 on destination to hide L2 RFO latency. +TEXT ·Decimal64SignExtend(SB), NOSPLIT, $0-24 + MOVQ dst+0(FP), DI + MOVQ src+8(FP), SI + MOVQ n+16(FP), CX + + TESTQ CX, CX + JLE done + + // DX = n/4 (number of 4-element iterations) + // CX = n%4 (remainder) + MOVQ CX, DX + SHRQ $2, DX + ANDQ $3, CX + + TESTQ DX, DX + JZ remainder + +loop4: + // Prefetch destination 4 cache lines ahead (16 elements × 16 bytes = 256 bytes). + PREFETCHT0 256(DI) + // Prefetch source 2 cache lines ahead (16 elements × 8 bytes = 128 bytes). 
+ PREFETCHT0 128(SI) + + // Element 0 + MOVQ 0(SI), AX + MOVQ AX, R8 + SARQ $63, R8 + MOVQ AX, 0(DI) + MOVQ R8, 8(DI) + + // Element 1 + MOVQ 8(SI), AX + MOVQ AX, R8 + SARQ $63, R8 + MOVQ AX, 16(DI) + MOVQ R8, 24(DI) + + // Element 2 + MOVQ 16(SI), AX + MOVQ AX, R8 + SARQ $63, R8 + MOVQ AX, 32(DI) + MOVQ R8, 40(DI) + + // Element 3 + MOVQ 24(SI), AX + MOVQ AX, R8 + SARQ $63, R8 + MOVQ AX, 48(DI) + MOVQ R8, 56(DI) + + ADDQ $32, SI // 4 × 8-byte source elements + ADDQ $64, DI // 4 × 16-byte destination elements + DECQ DX + JNZ loop4 + +remainder: + TESTQ CX, CX + JZ done + +loop1: + MOVQ 0(SI), AX + MOVQ AX, R8 + SARQ $63, R8 + MOVQ AX, 0(DI) + MOVQ R8, 8(DI) + ADDQ $8, SI + ADDQ $16, DI + DECQ CX + JNZ loop1 + +done: + RET diff --git a/pkg/common/simdkernels/decimal_arm64.go b/pkg/common/simdkernels/decimal_arm64.go new file mode 100644 index 0000000000000..9ee6012ce71da --- /dev/null +++ b/pkg/common/simdkernels/decimal_arm64.go @@ -0,0 +1,28 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package simdkernels + +import "unsafe" + +// Decimal64SignExtend widens n Decimal64 (uint64) values to Decimal128 via +// branchless sign extension. Pure Go fallback on arm64. +func Decimal64SignExtend(dst, src unsafe.Pointer, n int) { + for i := 0; i < n; i++ { + val := *(*uint64)(unsafe.Add(src, i*8)) + s := uint64(int64(val) >> 63) + *(*uint64)(unsafe.Add(dst, i*16)) = val + *(*uint64)(unsafe.Add(dst, i*16+8)) = s + } +} diff --git a/pkg/common/simdkernels/decimal_generic.go b/pkg/common/simdkernels/decimal_generic.go new file mode 100644 index 0000000000000..21cb80ae9be39 --- /dev/null +++ b/pkg/common/simdkernels/decimal_generic.go @@ -0,0 +1,30 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build !amd64 && !arm64 + +package simdkernels + +import "unsafe" + +// Decimal64SignExtend widens n Decimal64 (uint64) values to Decimal128 via +// branchless sign extension. Pure Go fallback for unsupported architectures. 
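All three build variants (amd64 assembly, arm64 fallback, generic fallback) share the same contract, so a single usage sketch covers them (values hypothetical; raw uint64 buffers stand in for the Decimal64 input and the (lo, hi)-paired Decimal128 output):

package main

import (
	"fmt"
	"unsafe"

	"github.com/matrixorigin/matrixone/pkg/common/simdkernels"
)

func main() {
	neg := int64(-5)
	src := []uint64{5, uint64(neg)}   // two Decimal64 values
	dst := make([]uint64, 2*len(src)) // Decimal128 output: (lo, hi) pairs
	simdkernels.Decimal64SignExtend(
		unsafe.Pointer(&dst[0]), unsafe.Pointer(&src[0]), len(src))
	fmt.Printf("%#x\n", dst) // [0x5 0x0 0xfffffffffffffffb 0xffffffffffffffff]
}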
+func Decimal64SignExtend(dst, src unsafe.Pointer, n int) { + for i := 0; i < n; i++ { + val := *(*uint64)(unsafe.Add(src, i*8)) + s := uint64(int64(val) >> 63) + *(*uint64)(unsafe.Add(dst, i*16)) = val + *(*uint64)(unsafe.Add(dst, i*16+8)) = s + } +} diff --git a/pkg/embed/tpch_agg_bench_test.go b/pkg/embed/tpch_agg_bench_test.go new file mode 100644 index 0000000000000..20502d2a3bdae --- /dev/null +++ b/pkg/embed/tpch_agg_bench_test.go @@ -0,0 +1,185 @@ +// Copyright 2026 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package embed + +import ( + "context" + "database/sql" + "fmt" + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +func TestTPCHAggBench(t *testing.T) { + RunBaseClusterTests( + func(c Cluster) { + cn0, err := c.GetCNService(0) + require.NoError(t, err) + + dsn := fmt.Sprintf("dump:111@tcp(127.0.0.1:%d)/", + cn0.GetServiceConfig().CN.Frontend.Port, + ) + + db, err := sql.Open("mysql", dsn) + require.NoError(t, err) + defer db.Close() + + ctx := context.Background() + conn, err := db.Conn(ctx) + require.NoError(t, err) + defer conn.Close() + + // Create and load TPC-H tables (SF=0.01 inline) + setupTPCH(t, ctx, conn) + + queries := []struct { + name string + sql string + }{ + {"Q1", `SELECT l_returnflag, l_linestatus, + sum(l_quantity), sum(l_extendedprice), + sum(l_extendedprice * (1 - l_discount)), + avg(l_quantity), avg(l_extendedprice), avg(l_discount), count(*) + FROM lineitem + WHERE l_shipdate <= date '1998-12-01' - interval '90' day + GROUP BY l_returnflag, l_linestatus`}, + {"Q5-like", `SELECT n_name, sum(l_extendedprice * (1 - l_discount)) as revenue + FROM lineitem, orders, customer, nation, region, supplier + WHERE c_custkey = o_custkey AND l_orderkey = o_orderkey + AND l_suppkey = s_suppkey AND c_nationkey = s_nationkey + AND s_nationkey = n_nationkey AND n_regionkey = r_regionkey + AND r_name = 'ASIA' + GROUP BY n_name ORDER BY revenue DESC`}, + {"CountStar", `SELECT count(*) FROM lineitem`}, + {"SumDecimal", `SELECT l_returnflag, sum(l_extendedprice) FROM lineitem GROUP BY l_returnflag`}, + } + + // Warmup + for _, q := range queries { + _, err := conn.ExecContext(ctx, q.sql) + require.NoError(t, err, "query %s failed", q.name) + } + + // Benchmark + const iterations = 5 + for _, q := range queries { + var total time.Duration + for i := 0; i < iterations; i++ { + start := time.Now() + _, err := conn.ExecContext(ctx, q.sql) + require.NoError(t, err) + total += time.Since(start) + } + t.Logf("%-12s avg=%v", q.name, total/iterations) + } + }, + ) +} + +func setupTPCH(t *testing.T, ctx context.Context, conn *sql.Conn) { + t.Helper() + + ddls := []string{ + `CREATE DATABASE IF NOT EXISTS tpch`, + `USE tpch`, + `DROP TABLE IF EXISTS nation`, + `DROP TABLE IF EXISTS region`, + `DROP TABLE IF EXISTS supplier`, + `DROP TABLE IF EXISTS customer`, + `DROP TABLE IF EXISTS orders`, + `DROP TABLE IF EXISTS lineitem`, + `CREATE TABLE region (r_regionkey INT, r_name CHAR(25), r_comment VARCHAR(152), PRIMARY KEY (r_regionkey))`, + 
`CREATE TABLE nation (n_nationkey INT, n_name CHAR(25), n_regionkey INT, n_comment VARCHAR(152), PRIMARY KEY (n_nationkey))`, + `CREATE TABLE supplier (s_suppkey INT, s_name CHAR(25), s_address VARCHAR(40), s_nationkey INT, s_phone CHAR(15), s_acctbal DECIMAL(15,2), s_comment VARCHAR(101), PRIMARY KEY (s_suppkey))`, + `CREATE TABLE customer (c_custkey INT, c_name VARCHAR(25), c_address VARCHAR(40), c_nationkey INT, c_phone CHAR(15), c_acctbal DECIMAL(15,2), c_mktsegment CHAR(10), c_comment VARCHAR(117), PRIMARY KEY (c_custkey))`, + `CREATE TABLE orders (o_orderkey BIGINT, o_custkey INT, o_orderstatus CHAR(1), o_totalprice DECIMAL(15,2), o_orderdate DATE, o_orderpriority CHAR(15), o_clerk CHAR(15), o_shippriority INT, o_comment VARCHAR(79), PRIMARY KEY (o_orderkey))`, + `CREATE TABLE lineitem (l_orderkey BIGINT, l_partkey INT, l_suppkey INT, l_linenumber INT, l_quantity DECIMAL(15,2), l_extendedprice DECIMAL(15,2), l_discount DECIMAL(15,2), l_tax DECIMAL(15,2), l_returnflag CHAR(1), l_linestatus CHAR(1), l_shipdate DATE, l_commitdate DATE, l_receiptdate DATE, l_shipinstruct CHAR(25), l_shipmode CHAR(10), l_comment VARCHAR(44), PRIMARY KEY (l_orderkey, l_linenumber))`, + } + for _, ddl := range ddls { + _, err := conn.ExecContext(ctx, ddl) + require.NoError(t, err, "DDL failed: %s", ddl) + } + + // Generate synthetic data (100K lineitem rows) + _, err := conn.ExecContext(ctx, `USE tpch`) + require.NoError(t, err) + + // Regions + _, err = conn.ExecContext(ctx, `INSERT INTO region VALUES (0,'AFRICA','a'),(1,'AMERICA','b'),(2,'ASIA','c'),(3,'EUROPE','d'),(4,'MIDDLE EAST','e')`) + require.NoError(t, err) + + // Nations (25) + for i := 0; i < 25; i++ { + _, err = conn.ExecContext(ctx, fmt.Sprintf(`INSERT INTO nation VALUES (%d, 'NATION_%d', %d, 'comment')`, i, i, i%5)) + require.NoError(t, err) + } + + // Suppliers (100) + for i := 1; i <= 100; i++ { + _, err = conn.ExecContext(ctx, fmt.Sprintf(`INSERT INTO supplier VALUES (%d,'Supplier#%05d','addr%d',%d,'phone%d',%.2f,'comment')`, i, i, i, i%25, i, float64(i)*10.5)) + require.NoError(t, err) + } + + // Customers (1500) + for batch := 0; batch < 15; batch++ { + vals := "" + for i := 0; i < 100; i++ { + id := batch*100 + i + 1 + if i > 0 { + vals += "," + } + vals += fmt.Sprintf("(%d,'Customer#%09d','addr%d',%d,'phone%d',%.2f,'BUILDING','comment')", id, id, id, id%25, id, float64(id)*5.5) + } + _, err = conn.ExecContext(ctx, "INSERT INTO customer VALUES "+vals) + require.NoError(t, err) + } + + // Orders (15000) + for batch := 0; batch < 150; batch++ { + vals := "" + for i := 0; i < 100; i++ { + id := batch*100 + i + 1 + if i > 0 { + vals += "," + } + vals += fmt.Sprintf("(%d,%d,'O',%.2f,'1995-01-01','1-URGENT','Clerk#000001',0,'comment')", id, (id%1500)+1, float64(id)*100.5) + } + _, err = conn.ExecContext(ctx, "INSERT INTO orders VALUES "+vals) + require.NoError(t, err) + } + + // Lineitem (100K rows in batches of 1000) + flags := []string{"A", "N", "R"} + statuses := []string{"F", "O"} + for batch := 0; batch < 100; batch++ { + vals := "" + for i := 0; i < 1000; i++ { + row := batch*1000 + i + orderkey := int64(row + 1) + linenum := 1 + if i > 0 { + vals += "," + } + vals += fmt.Sprintf("(%d,%d,%d,%d,%.2f,%.2f,%.2f,%.2f,'%s','%s','1995-06-17','1995-06-17','1995-06-17','DELIVER IN PERSON','TRUCK','comment')", + orderkey, row%200000+1, row%100+1, linenum, + float64(row%50)+1, float64(row%10000)+100.0, float64(row%11)*0.01, 0.02, + flags[row%3], statuses[row%2]) + } + _, err = conn.ExecContext(ctx, "INSERT INTO lineitem VALUES "+vals) + 
require.NoError(t, err)
+	}
+}
diff --git a/pkg/sql/colexec/aggexec/sum_decimal_fast.go b/pkg/sql/colexec/aggexec/sum_decimal_fast.go
index ba52e38a94260..788a41d2d6bb8 100644
--- a/pkg/sql/colexec/aggexec/sum_decimal_fast.go
+++ b/pkg/sql/colexec/aggexec/sum_decimal_fast.go
@@ -19,14 +19,25 @@ package aggexec

 import (
 	"slices"
+	"unsafe"

 	"github.com/matrixorigin/matrixone/pkg/common/bitmap"
 	"github.com/matrixorigin/matrixone/pkg/common/mpool"
+	"github.com/matrixorigin/matrixone/pkg/common/simdkernels"
 	"github.com/matrixorigin/matrixone/pkg/common/util"
 	"github.com/matrixorigin/matrixone/pkg/container/types"
 	"github.com/matrixorigin/matrixone/pkg/container/vector"
 )

+// sumReduceRunMin is the minimum run length (consecutive same-group, non-null
+// rows over a flat vector) at which it pays to call into the SIMD SumReduce
+// kernel instead of the scalar inner loop. Tuned by end-to-end micro-benching
+// the dispatched kernel + slice-construction path against the inline scalar
+// loop on Zen 3 (AVX2). Break-even is at n≈32 for both D128 and D64; n=64
+// gives a comfortable margin (D128 1.4×, D64 2.0×) without sacrificing many
+// short-run opportunities.
+const sumReduceRunMin = 64
+
 // ---- Decimal64 SUM/AVG ----

 type sumDecimal64FastExec struct {
@@ -88,6 +99,53 @@ func (exec *sumDecimal64FastExec) batchFill(offset int, groups []uint64, vectors
 		np = vec.GetNulls().GetBitmap()
 	}

+	// Fast path: scan for runs of the same group (no nulls, flat vec).
+	// Within a run we can SIMD-sum-reduce a contiguous Decimal64 slice and
+	// fold the 128-bit total into the state directly.
+	if !hasNull && constMask == -1 {
+		lastX := -1
+		var sums *[AggBatchSize]types.Decimal128
+		var cnts []int64
+		i := 0
+		N := len(groups)
+		for i < N {
+			grp := groups[i]
+			if grp == GroupNotMatched {
+				i++
+				continue
+			}
+			j := i + 1
+			for j < N && groups[j] == grp {
+				j++
+			}
+			runLen := j - i
+			g := grp - 1
+			x := int(g >> aggBatchSizeShift)
+			if x != lastX {
+				lastX = x
+				sums = chunkArr[types.Decimal128](exec.state[x].vecs[0])
+				cnts = vector.MustFixedColNoTypeCheck[int64](exec.state[x].vecs[1])
+			}
+			y := g & aggBatchSizeMask
+			if runLen >= sumReduceRunMin {
+				idx := i + offset
+				slo, shi := simdkernels.D64SumReduceToD128(
+					unsafe.Slice((*uint64)(unsafe.Pointer(&vals[idx])), runLen))
+				sums[y] = sums[y].Add128Unchecked(types.Decimal128{B0_63: slo, B64_127: shi})
+				cnts[y] += int64(runLen)
+			} else {
+				for k := i; k < j; k++ {
+					raw := vals[k+offset]
+					hi := uint64(int64(raw) >> 63)
+					sums[y] = sums[y].Add128Unchecked(types.Decimal128{B0_63: uint64(raw), B64_127: hi})
+				}
+				cnts[y] += int64(runLen)
+			}
+			i = j
+		}
+		return nil
+	}
+
 	const maxSlots = 255
 	var slotOf [256]uint8
 	var localSums [maxSlots]types.Decimal128
diff --git a/pkg/sql/plan/function/arith_decimal_fast.go b/pkg/sql/plan/function/arith_decimal_fast.go
index 13d1040c504a3..621eabef15327 100644
--- a/pkg/sql/plan/function/arith_decimal_fast.go
+++ b/pkg/sql/plan/function/arith_decimal_fast.go
@@ -29,14 +29,25 @@ package function

 import (
 	"math"
 	"math/bits"
+	"unsafe"

 	"github.com/matrixorigin/matrixone/pkg/common/bitmap"
 	"github.com/matrixorigin/matrixone/pkg/common/moerr"
+	"github.com/matrixorigin/matrixone/pkg/common/simdkernels"
 	"github.com/matrixorigin/matrixone/pkg/container/nulls"
 	"github.com/matrixorigin/matrixone/pkg/container/types"
 	"github.com/matrixorigin/matrixone/pkg/vm/process"
 )

+// d128AsU64 reinterprets a []Decimal128 as []uint64 (lo, hi, lo, hi, ...).
+// Decimal128 is exactly {B0_63, B64_127 uint64} with no padding.
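The no-padding claim that makes this reinterpretation safe can be spot-checked with a test-style sketch (illustrative, not part of the patch); note the returned slice aliases the Decimal128 backing memory, so writes flow both ways:

package function

import (
	"testing"

	"github.com/matrixorigin/matrixone/pkg/container/types"
)

func TestD128AsU64AliasSketch(t *testing.T) {
	d := []types.Decimal128{{B0_63: 1, B64_127: 2}}
	u := d128AsU64(d)
	if len(u) != 2 || u[0] != 1 || u[1] != 2 {
		t.Fatalf("unexpected layout: %v", u)
	}
	u[1] = 7 // aliases d[0].B64_127
	if d[0].B64_127 != 7 {
		t.Fatal("expected write-through aliasing")
	}
}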
+func d128AsU64(s []types.Decimal128) []uint64 { + if len(s) == 0 { + return nil + } + return unsafe.Slice((*uint64)(unsafe.Pointer(&s[0])), len(s)*2) +} + func operandsAt[T any](v1, v2 []T, idx int) (T, T) { // Branchless: mask is 0 when len==1 (scalar), -1 when len>1 (vector). // idx & 0 = 0 (always v[0]), idx & -1 = idx (v[idx]). @@ -140,14 +151,7 @@ func d128AddSameScale(v1, v2, rs []types.Decimal128, rsnull *nulls.Nulls) int { if len1 == len2 { if noNull { - for i := 0; i < len1; i++ { - signX := v1[i].B64_127 >> 63 - rs[i].B0_63, carry = bits.Add64(v1[i].B0_63, v2[i].B0_63, 0) - rs[i].B64_127, _ = bits.Add64(v1[i].B64_127, v2[i].B64_127, carry) - if signX == v2[i].B64_127>>63 && signX != rs[i].B64_127>>63 { - return i - } - } + return simdkernels.D128AddChecked(d128AsU64(v1), d128AsU64(v2), d128AsU64(rs)) } else { for i := 0; i < len1; i++ { if bmp.Contains(uint64(i)) { @@ -165,13 +169,7 @@ func d128AddSameScale(v1, v2, rs []types.Decimal128, rsnull *nulls.Nulls) int { a := v1[0] signA := a.B64_127 >> 63 if noNull { - for i := 0; i < len2; i++ { - rs[i].B0_63, carry = bits.Add64(a.B0_63, v2[i].B0_63, 0) - rs[i].B64_127, _ = bits.Add64(a.B64_127, v2[i].B64_127, carry) - if signA == v2[i].B64_127>>63 && signA != rs[i].B64_127>>63 { - return i - } - } + return simdkernels.D128AddScalarChecked(a.B0_63, a.B64_127, d128AsU64(v2), d128AsU64(rs)) } else { for i := 0; i < len2; i++ { if bmp.Contains(uint64(i)) { @@ -188,13 +186,7 @@ func d128AddSameScale(v1, v2, rs []types.Decimal128, rsnull *nulls.Nulls) int { b := v2[0] signB := b.B64_127 >> 63 if noNull { - for i := 0; i < len1; i++ { - rs[i].B0_63, carry = bits.Add64(v1[i].B0_63, b.B0_63, 0) - rs[i].B64_127, _ = bits.Add64(v1[i].B64_127, b.B64_127, carry) - if v1[i].B64_127>>63 == signB && v1[i].B64_127>>63 != rs[i].B64_127>>63 { - return i - } - } + return simdkernels.D128AddScalarChecked(b.B0_63, b.B64_127, d128AsU64(v1), d128AsU64(rs)) } else { for i := 0; i < len1; i++ { if bmp.Contains(uint64(i)) { @@ -291,14 +283,7 @@ func d128SubSameScale(v1, v2, rs []types.Decimal128, rsnull *nulls.Nulls) int { if len1 == len2 { if noNull { - for i := 0; i < len1; i++ { - signX := v1[i].B64_127 >> 63 - rs[i].B0_63, borrow = bits.Sub64(v1[i].B0_63, v2[i].B0_63, 0) - rs[i].B64_127, _ = bits.Sub64(v1[i].B64_127, v2[i].B64_127, borrow) - if signX != v2[i].B64_127>>63 && signX != rs[i].B64_127>>63 { - return i - } - } + return simdkernels.D128SubChecked(d128AsU64(v1), d128AsU64(v2), d128AsU64(rs)) } else { for i := 0; i < len1; i++ { if bmp.Contains(uint64(i)) { @@ -316,13 +301,7 @@ func d128SubSameScale(v1, v2, rs []types.Decimal128, rsnull *nulls.Nulls) int { a := v1[0] signA := a.B64_127 >> 63 if noNull { - for i := 0; i < len2; i++ { - rs[i].B0_63, borrow = bits.Sub64(a.B0_63, v2[i].B0_63, 0) - rs[i].B64_127, _ = bits.Sub64(a.B64_127, v2[i].B64_127, borrow) - if signA != v2[i].B64_127>>63 && signA != rs[i].B64_127>>63 { - return i - } - } + return simdkernels.D128ScalarSubChecked(a.B0_63, a.B64_127, d128AsU64(v2), d128AsU64(rs)) } else { for i := 0; i < len2; i++ { if bmp.Contains(uint64(i)) { @@ -339,14 +318,7 @@ func d128SubSameScale(v1, v2, rs []types.Decimal128, rsnull *nulls.Nulls) int { b := v2[0] signB := b.B64_127 >> 63 if noNull { - for i := 0; i < len1; i++ { - signX := v1[i].B64_127 >> 63 - rs[i].B0_63, borrow = bits.Sub64(v1[i].B0_63, b.B0_63, 0) - rs[i].B64_127, _ = bits.Sub64(v1[i].B64_127, b.B64_127, borrow) - if signX != signB && signX != rs[i].B64_127>>63 { - return i - } - } + return simdkernels.D128SubScalarChecked(d128AsU64(v1), 
b.B0_63, b.B64_127, d128AsU64(rs)) } else { for i := 0; i < len1; i++ { if bmp.Contains(uint64(i)) { diff --git a/pkg/sql/plan/function/arith_decimal_fast_test.go b/pkg/sql/plan/function/arith_decimal_fast_test.go index 19cd55b2918f3..1d3a807b1b2a7 100644 --- a/pkg/sql/plan/function/arith_decimal_fast_test.go +++ b/pkg/sql/plan/function/arith_decimal_fast_test.go @@ -1976,6 +1976,22 @@ func BenchmarkD128IntDiv_Generic(b *testing.B) { } } +func BenchmarkD128Sub_Fast(b *testing.B) { + rng := rand.New(rand.NewSource(42)) + xs := make([]types.Decimal128, benchN) + ys := make([]types.Decimal128, benchN) + rs := make([]types.Decimal128, benchN) + for i := range xs { + xs[i] = randD128(rng) + ys[i] = randD128(rng) + } + nul := nulls.NewWithSize(benchN) + b.ResetTimer() + for iter := 0; iter < b.N; iter++ { + _ = d128Sub(xs, ys, rs, 2, 2, nul) + } +} + func randD256(rng *rand.Rand) types.Decimal256 { return types.Decimal256{ B0_63: rng.Uint64(), diff --git a/pkg/sql/plan/function/func_cast.go b/pkg/sql/plan/function/func_cast.go index ca094c08c85dc..30a06121298e0 100644 --- a/pkg/sql/plan/function/func_cast.go +++ b/pkg/sql/plan/function/func_cast.go @@ -27,6 +27,7 @@ import ( "unsafe" "github.com/matrixorigin/matrixone/pkg/common/moerr" + "github.com/matrixorigin/matrixone/pkg/common/simdkernels" "github.com/matrixorigin/matrixone/pkg/common/util" "github.com/matrixorigin/matrixone/pkg/container/bytejson" "github.com/matrixorigin/matrixone/pkg/container/nulls" @@ -4309,18 +4310,9 @@ func decimal64ToDecimal128Array( } } else { if totype.Scale == fromtype.Scale { - // Fast path: direct slice write with branchless sign extension. - // BCE hints + int index eliminate bounds checks in the inner loop. + // Fast path: assembly sign-extend with prefetch for cold destination. dst := vector.MustFixedColNoTypeCheck[types.Decimal128](to.GetResultVector()) - _ = v[length-1] - _ = dst[length-1] - for i := 0; i < length; i++ { - s := int64(v[i]) >> 63 - dst[i] = types.Decimal128{ - B0_63: uint64(v[i]), - B64_127: uint64(s), - } - } + simdkernels.Decimal64SignExtend(unsafe.Pointer(&dst[0]), unsafe.Pointer(&v[0]), length) } else { for i := 0; i < length; i++ { fromdec := types.Decimal128{B0_63: uint64(v[i]), B64_127: 0} @@ -4352,15 +4344,7 @@ func decimal64ToDecimal128Array( rsVec.GetNulls().Or(srcVec.GetNulls()) dst := vector.MustFixedColNoTypeCheck[types.Decimal128](rsVec) v := vector.MustFixedColWithTypeCheck[types.Decimal64](srcVec) - _ = v[length-1] - _ = dst[length-1] - for i := 0; i < length; i++ { - s := int64(v[i]) >> 63 - dst[i] = types.Decimal128{ - B0_63: uint64(v[i]), - B64_127: uint64(s), - } - } + simdkernels.Decimal64SignExtend(unsafe.Pointer(&dst[0]), unsafe.Pointer(&v[0]), length) } else { var dft types.Decimal128 for i := 0; i < length; i++ {