From 216a96066869736f2e1edb0a5a1cbf5a516474ad Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Thu, 30 May 2024 08:18:41 +0200 Subject: [PATCH 01/64] Allow compilation of AVX2 on x86 --- hwy/detect_targets.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/hwy/detect_targets.h b/hwy/detect_targets.h index e40f78a92d..90e2573ba8 100644 --- a/hwy/detect_targets.h +++ b/hwy/detect_targets.h @@ -166,7 +166,12 @@ // 32-bit may fail to compile AVX2/3. #if HWY_ARCH_X86_32 +// GCC-13 is ok with AVX2: +#if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL >= 1300) +#define HWY_BROKEN_32BIT (HWY_AVX3 | (HWY_AVX3 - 1)) +#else #define HWY_BROKEN_32BIT (HWY_AVX2 | (HWY_AVX2 - 1)) +#endif #else #define HWY_BROKEN_32BIT 0 #endif From 620fec7ff80a630119eaf5e6bb689aa5e873ea15 Mon Sep 17 00:00:00 2001 From: John Platts Date: Thu, 10 Oct 2024 21:18:25 -0500 Subject: [PATCH 02/64] Added MinMagnitude and MaxMagnitude ops --- g3doc/quick_reference.md | 18 +++++++ hwy/ops/generic_ops-inl.h | 57 +++++++++++++++++++++ hwy/tests/minmax_test.cc | 101 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 176 insertions(+) diff --git a/g3doc/quick_reference.md b/g3doc/quick_reference.md index 8220e9b718..059acef4d8 100644 --- a/g3doc/quick_reference.md +++ b/g3doc/quick_reference.md @@ -678,6 +678,24 @@ is qNaN, and NaN if both are. * V **Max**(V a, V b): returns `max(a[i], b[i])`. +* V **MinMagnitude**(V a, V b): returns the number with the + smaller magnitude if `a[i]` and `b[i]` are both non-NaN values. + + If `a[i]` and `b[i]` are both non-NaN, `MinMagnitude(a, b)` returns + `(|a[i]| < |b[i]| || (|a[i]| == |b[i]| && a[i] < b[i])) ? a[i] : b[i]`. + + Otherwise, the results of `MinMagnitude(a, b)` are implementation-defined + if `a[i]` is NaN or `b[i]` is NaN. + +* V **MaxMagnitude**(V a, V b): returns the number with the + larger magnitude if `a[i]` and `b[i]` are both non-NaN values. + + If `a[i]` and `b[i]` are both non-NaN, `MaxMagnitude(a, b)` returns + `(|a[i]| < |b[i]| || (|a[i]| == |b[i]| && a[i] < b[i])) ? b[i] : a[i]`. + + Otherwise, the results of `MaxMagnitude(a, b)` are implementation-defined + if `a[i]` is NaN or `b[i]` is NaN. 
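For reference, the non-NaN behavior of both ops can be expressed as the following scalar sketch (an illustration of the formulas above, not part of the Highway API; `double` stands in for any supported lane type):

#include <cmath>

// Scalar reference for the non-NaN case of MinMagnitude/MaxMagnitude.
double ScalarMinMagnitude(double a, double b) {
  const double abs_a = std::fabs(a), abs_b = std::fabs(b);
  // Ties in magnitude are broken by the signed comparison, so
  // ScalarMinMagnitude(-2.0, 2.0) == -2.0.
  return (abs_a < abs_b || (abs_a == abs_b && a < b)) ? a : b;
}

double ScalarMaxMagnitude(double a, double b) {
  const double abs_a = std::fabs(a), abs_b = std::fabs(b);
  // By the same tie rule, ScalarMaxMagnitude(-2.0, 2.0) == 2.0.
  return (abs_a < abs_b || (abs_a == abs_b && a < b)) ? b : a;
}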
+ All other ops in this section are only available if `HWY_TARGET != HWY_SCALAR`: * `V`: `u64` \ diff --git a/hwy/ops/generic_ops-inl.h b/hwy/ops/generic_ops-inl.h index 99b518d99c..7447fcb6cc 100644 --- a/hwy/ops/generic_ops-inl.h +++ b/hwy/ops/generic_ops-inl.h @@ -488,6 +488,63 @@ HWY_API V InterleaveEven(V a, V b) { } #endif +// ------------------------------ MinMagnitude/MaxMagnitude + +#if (defined(HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE) == defined(HWY_TARGET_TOGGLE)) +#ifdef HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE +#undef HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE +#else +#define HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE +#endif + +template +HWY_API V MinMagnitude(V a, V b) { + const auto abs_a = Abs(a); + const auto abs_b = Abs(b); + return IfThenElse(Lt(abs_a, abs_b), a, + Min(IfThenElse(Eq(abs_a, abs_b), a, b), b)); +} + +template +HWY_API V MaxMagnitude(V a, V b) { + const auto abs_a = Abs(a); + const auto abs_b = Abs(b); + return IfThenElse(Lt(abs_a, abs_b), b, + Max(IfThenElse(Eq(abs_a, abs_b), b, a), a)); +} + +#endif // HWY_NATIVE_FLOAT_MIN_MAX_MAGNITUDE + +template +HWY_API V MinMagnitude(V a, V b) { + const DFromV d; + const RebindToUnsigned du; + const auto abs_a = BitCast(du, Abs(a)); + const auto abs_b = BitCast(du, Abs(b)); + return IfThenElse(RebindMask(d, Lt(abs_a, abs_b)), a, + Min(IfThenElse(RebindMask(d, Eq(abs_a, abs_b)), a, b), b)); +} + +template +HWY_API V MaxMagnitude(V a, V b) { + const DFromV d; + const RebindToUnsigned du; + const auto abs_a = BitCast(du, Abs(a)); + const auto abs_b = BitCast(du, Abs(b)); + return IfThenElse(RebindMask(d, Lt(abs_a, abs_b)), b, + Max(IfThenElse(RebindMask(d, Eq(abs_a, abs_b)), b, a), a)); +} + +template +HWY_API V MinMagnitude(V a, V b) { + return Min(a, b); +} + +template +HWY_API V MaxMagnitude(V a, V b) { + return Max(a, b); +} + // ------------------------------ AddSub template , 1)> diff --git a/hwy/tests/minmax_test.cc b/hwy/tests/minmax_test.cc index 3ef116d30d..1a08d56aee 100644 --- a/hwy/tests/minmax_test.cc +++ b/hwy/tests/minmax_test.cc @@ -257,6 +257,106 @@ HWY_NOINLINE void TestAllMinMax128Upper() { ForGEVectors<128, TestMinMax128Upper>()(uint64_t()); } +struct TestMinMaxMagnitude { + template + static constexpr MakeSigned MaxPosIotaVal(hwy::FloatTag /*type_tag*/) { + return static_cast>(MantissaMask() + 1); + } + template + static constexpr MakeSigned MaxPosIotaVal(hwy::NonFloatTag /*type_tag*/) { + return static_cast>(((LimitsMax>()) >> 1) + 1); + } + + template + HWY_NOINLINE static void VerifyMinMaxMagnitude( + D d, const TFromD* HWY_RESTRICT in1_lanes, + const TFromD* HWY_RESTRICT in2_lanes, const int line) { + using T = TFromD; + using TAbs = If() || IsSpecialFloat(), T, MakeUnsigned>; + + const char* file = __FILE__; + const size_t N = Lanes(d); + auto expected_min_mag = AllocateAligned(N); + auto expected_max_mag = AllocateAligned(N); + HWY_ASSERT(expected_min_mag && expected_max_mag); + + for (size_t i = 0; i < N; i++) { + const T val1 = in1_lanes[i]; + const T val2 = in2_lanes[i]; + const TAbs abs_val1 = static_cast(ScalarAbs(val1)); + const TAbs abs_val2 = static_cast(ScalarAbs(val2)); + if (abs_val1 < abs_val2 || (abs_val1 == abs_val2 && val1 < val2)) { + expected_min_mag[i] = val1; + expected_max_mag[i] = val2; + } else { + expected_min_mag[i] = val2; + expected_max_mag[i] = val1; + } + } + + const auto in1 = Load(d, in1_lanes); + const auto in2 = Load(d, in2_lanes); + AssertVecEqual(d, expected_min_mag.get(), MinMagnitude(in1, in2), file, + line); + AssertVecEqual(d, expected_min_mag.get(), MinMagnitude(in2, in1), file, 
+ line); + AssertVecEqual(d, expected_max_mag.get(), MaxMagnitude(in1, in2), file, + line); + AssertVecEqual(d, expected_max_mag.get(), MaxMagnitude(in2, in1), file, + line); + } + + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + using TI = MakeSigned; + using TU = MakeUnsigned; + constexpr TI kMaxPosIotaVal = MaxPosIotaVal(hwy::IsFloatTag()); + static_assert(kMaxPosIotaVal > 0, "kMaxPosIotaVal > 0 must be true"); + + constexpr size_t kPositiveIotaMask = static_cast( + static_cast(kMaxPosIotaVal - 1) & (HWY_MAX_LANES_D(D) - 1)); + + const size_t N = Lanes(d); + auto in1_lanes = AllocateAligned(N); + auto in2_lanes = AllocateAligned(N); + auto in3_lanes = AllocateAligned(N); + auto in4_lanes = AllocateAligned(N); + HWY_ASSERT(in1_lanes && in2_lanes && in3_lanes && in4_lanes); + + for (size_t i = 0; i < N; i++) { + const TI x1 = static_cast((i & kPositiveIotaMask) + 1); + const TI x2 = static_cast(kMaxPosIotaVal - x1); + const TI x3 = static_cast(-x1); + const TI x4 = static_cast(-x2); + + in1_lanes[i] = ConvertScalarTo(x1); + in2_lanes[i] = ConvertScalarTo(x2); + in3_lanes[i] = ConvertScalarTo(x3); + in4_lanes[i] = ConvertScalarTo(x4); + } + + VerifyMinMaxMagnitude(d, in1_lanes.get(), in2_lanes.get(), __LINE__); + VerifyMinMaxMagnitude(d, in1_lanes.get(), in3_lanes.get(), __LINE__); + VerifyMinMaxMagnitude(d, in1_lanes.get(), in4_lanes.get(), __LINE__); + VerifyMinMaxMagnitude(d, in2_lanes.get(), in3_lanes.get(), __LINE__); + VerifyMinMaxMagnitude(d, in2_lanes.get(), in4_lanes.get(), __LINE__); + VerifyMinMaxMagnitude(d, in3_lanes.get(), in4_lanes.get(), __LINE__); + + in2_lanes[0] = HighestValue(); + in4_lanes[0] = LowestValue(); + + VerifyMinMaxMagnitude(d, in1_lanes.get(), in2_lanes.get(), __LINE__); + VerifyMinMaxMagnitude(d, in1_lanes.get(), in4_lanes.get(), __LINE__); + VerifyMinMaxMagnitude(d, in2_lanes.get(), in3_lanes.get(), __LINE__); + VerifyMinMaxMagnitude(d, in2_lanes.get(), in4_lanes.get(), __LINE__); + VerifyMinMaxMagnitude(d, in3_lanes.get(), in4_lanes.get(), __LINE__); + } +}; + +HWY_NOINLINE void TestAllMinMaxMagnitude() { + ForAllTypes(ForPartialVectors()); +} + // NOLINTNEXTLINE(google-readability-namespace-comments) } // namespace HWY_NAMESPACE } // namespace hwy @@ -269,6 +369,7 @@ HWY_BEFORE_TEST(HwyMinMaxTest); HWY_EXPORT_AND_TEST_P(HwyMinMaxTest, TestAllMinMax); HWY_EXPORT_AND_TEST_P(HwyMinMaxTest, TestAllMinMax128); HWY_EXPORT_AND_TEST_P(HwyMinMaxTest, TestAllMinMax128Upper); +HWY_EXPORT_AND_TEST_P(HwyMinMaxTest, TestAllMinMaxMagnitude); HWY_AFTER_TEST(); } // namespace hwy From 57a6c9bad2bdd92932c781b2621449a9b970d256 Mon Sep 17 00:00:00 2001 From: Sirui Lu Date: Wed, 23 Oct 2024 13:07:12 -0700 Subject: [PATCH 03/64] add Get/Set for vectors and use them to implement Concat* operators --- hwy/ops/rvv-inl.h | 176 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 164 insertions(+), 12 deletions(-) diff --git a/hwy/ops/rvv-inl.h b/hwy/ops/rvv-inl.h index f65153294f..62ac160f06 100644 --- a/hwy/ops/rvv-inl.h +++ b/hwy/ops/rvv-inl.h @@ -127,6 +127,26 @@ namespace detail { // for code folding X_MACRO(BASE, CHAR, 64, __, 32, m4, m8, m2, 2, /*MLEN=*/16, NAME, OP) \ X_MACRO(BASE, CHAR, 64, __, 32, m8, __, m4, 3, /*MLEN=*/8, NAME, OP) +#define HWY_RVV_FOREACH_08_GET_SET(X_MACRO, BASE, CHAR, NAME, OP) \ + X_MACRO(BASE, CHAR, 8, 16, __, m2, m4, m1, 1, /*MLEN=*/4, NAME, OP) \ + X_MACRO(BASE, CHAR, 8, 16, __, m4, m8, m2, 2, /*MLEN=*/2, NAME, OP) \ + X_MACRO(BASE, CHAR, 8, 16, __, m8, __, m4, 3, /*MLEN=*/1, NAME, OP) + +#define 
HWY_RVV_FOREACH_16_GET_SET(X_MACRO, BASE, CHAR, NAME, OP) \ + X_MACRO(BASE, CHAR, 16, 32, 8, m2, m4, m1, 1, /*MLEN=*/8, NAME, OP) \ + X_MACRO(BASE, CHAR, 16, 32, 8, m4, m8, m2, 2, /*MLEN=*/4, NAME, OP) \ + X_MACRO(BASE, CHAR, 16, 32, 8, m8, __, m4, 3, /*MLEN=*/2, NAME, OP) + +#define HWY_RVV_FOREACH_32_GET_SET(X_MACRO, BASE, CHAR, NAME, OP) \ + X_MACRO(BASE, CHAR, 32, 64, 16, m2, m4, m1, 1, /*MLEN=*/16, NAME, OP) \ + X_MACRO(BASE, CHAR, 32, 64, 16, m4, m8, m2, 2, /*MLEN=*/8, NAME, OP) \ + X_MACRO(BASE, CHAR, 32, 64, 16, m8, __, m4, 3, /*MLEN=*/4, NAME, OP) + +#define HWY_RVV_FOREACH_64_GET_SET(X_MACRO, BASE, CHAR, NAME, OP) \ + X_MACRO(BASE, CHAR, 64, __, 32, m2, m4, m1, 1, /*MLEN=*/32, NAME, OP) \ + X_MACRO(BASE, CHAR, 64, __, 32, m4, m8, m2, 2, /*MLEN=*/16, NAME, OP) \ + X_MACRO(BASE, CHAR, 64, __, 32, m8, __, m4, 3, /*MLEN=*/8, NAME, OP) + // LMULS = _DEMOTE: can demote from SEW*LMUL to SEWH*LMULH. #define HWY_RVV_FOREACH_08_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \ X_MACRO(BASE, CHAR, 8, 16, __, mf4, mf2, mf8, -2, /*MLEN=*/32, NAME, OP) \ @@ -275,6 +295,35 @@ namespace detail { // for code folding HWY_RVV_FOREACH_64_LE2(X_MACRO, BASE, CHAR, NAME, OP) \ HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP) +// GET/SET + VIRT +#define HWY_RVV_FOREACH_08_GET_SET_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \ + X_MACRO(BASE, CHAR, 8, 16, __, mf4, mf2, mf8, -2, /*MLEN=*/32, NAME, OP) \ + X_MACRO(BASE, CHAR, 8, 16, __, mf2, m1, mf4, -1, /*MLEN=*/16, NAME, OP) \ + X_MACRO(BASE, CHAR, 8, 16, __, m1, m2, mf2, 0, /*MLEN=*/8, NAME, OP) + +#define HWY_RVV_FOREACH_16_GET_SET_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \ + X_MACRO(BASE, CHAR, 16, 32, 8, mf2, m1, mf4, -1, /*MLEN=*/32, NAME, OP) \ + X_MACRO(BASE, CHAR, 16, 32, 8, m1, m2, mf2, 0, /*MLEN=*/16, NAME, OP) + +#define HWY_RVV_FOREACH_32_GET_SET_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \ + X_MACRO(BASE, CHAR, 32, 64, 16, m1, m2, mf2, 0, /*MLEN=*/32, NAME, OP) + +#define HWY_RVV_FOREACH_64_GET_SET_VIRT(X_MACRO, BASE, CHAR, NAME, OP) + +// For the smallest LMUL for each SEW, similar to the LowerHalf operator, we +// provide the Get and Set operator that returns the same vector type. 
+#define HWY_RVV_FOREACH_08_GET_SET_SMALLEST(X_MACRO, BASE, CHAR, NAME, OP) \ + X_MACRO(BASE, CHAR, 8, 16, __, mf8, mf4, __, -3, /*MLEN=*/64, NAME, OP) + +#define HWY_RVV_FOREACH_16_GET_SET_SMALLEST(X_MACRO, BASE, CHAR, NAME, OP) \ + X_MACRO(BASE, CHAR, 16, 32, 8, mf4, mf2, mf8, -2, /*MLEN=*/64, NAME, OP) + +#define HWY_RVV_FOREACH_32_GET_SET_SMALLEST(X_MACRO, BASE, CHAR, NAME, OP) \ + X_MACRO(BASE, CHAR, 32, 64, 16, mf2, m1, mf4, -1, /*MLEN=*/64, NAME, OP) + +#define HWY_RVV_FOREACH_64_GET_SET_SMALLEST(X_MACRO, BASE, CHAR, NAME, OP) \ + X_MACRO(BASE, CHAR, 64, __, 32, m1, m2, mf2, 0, /*MLEN=*/64, NAME, OP) + // EXT + VIRT #define HWY_RVV_FOREACH_08_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \ HWY_RVV_FOREACH_08_EXT(X_MACRO, BASE, CHAR, NAME, OP) \ @@ -3123,6 +3172,91 @@ HWY_RVV_FOREACH(HWY_RVV_SLIDE_DOWN, SlideDown, slidedown, _ALL) #undef HWY_RVV_SLIDE_UP #undef HWY_RVV_SLIDE_DOWN +#define HWY_RVV_GET(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ + MLEN, NAME, OP) \ + template \ + HWY_API HWY_RVV_V(BASE, SEW, LMULH) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \ + return __riscv_v##OP##_v_##CHAR##SEW##LMUL##_##CHAR##SEW##LMULH( \ + v, kIndex); /* no AVL */ \ + } +#define HWY_RVV_GET_VIRT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ + SHIFT, MLEN, NAME, OP) \ + template \ + HWY_API HWY_RVV_V(BASE, SEW, LMULH) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \ + if constexpr (kIndex == 0) { \ + return Trunc(v); \ + } else { \ + static_assert(kIndex == 1); \ + return Trunc(SlideDown( \ + v, Lanes(HWY_RVV_D(BASE, SEW, HWY_LANES(HWY_RVV_T(BASE, SEW)), \ + SHIFT - 1){}))); \ + } \ + } +#define HWY_RVV_GET_SMALLEST(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ + SHIFT, MLEN, NAME, OP) \ + template \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \ + if constexpr (kIndex == 0) { \ + return v; \ + } else { \ + static_assert(kIndex == 1); \ + return SlideDown( \ + v, Lanes(HWY_RVV_D(BASE, SEW, HWY_LANES(HWY_RVV_T(BASE, SEW)), \ + SHIFT){}) / \ + 2); \ + } \ + } +HWY_RVV_FOREACH(HWY_RVV_GET, Get, get, _GET_SET) +HWY_RVV_FOREACH(HWY_RVV_GET_VIRT, Get, get, _GET_SET_VIRT) +HWY_RVV_FOREACH(HWY_RVV_GET_SMALLEST, Get, get, _GET_SET_SMALLEST) +#undef HWY_RVV_GET +#undef HWY_RVV_GET_VIRT +#undef HWY_RVV_GET_SMALLEST + +#define HWY_RVV_SET(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ + MLEN, NAME, OP) \ + template \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_V(BASE, SEW, LMUL) dest, HWY_RVV_V(BASE, SEW, LMULH) v) { \ + return __riscv_v##OP##_v_##CHAR##SEW##LMULH##_##CHAR##SEW##LMUL( \ + dest, kIndex, v); /* no AVL */ \ + } +#define HWY_RVV_SET_VIRT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ + SHIFT, MLEN, NAME, OP) \ + template \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_V(BASE, SEW, LMUL) dest, HWY_RVV_V(BASE, SEW, LMULH) v) { \ + auto d = HWY_RVV_D(BASE, SEW, HWY_LANES(HWY_RVV_T(BASE, SEW)), SHIFT){}; \ + auto df2 = \ + HWY_RVV_D(BASE, SEW, HWY_LANES(HWY_RVV_T(BASE, SEW)), SHIFT - 1){}; \ + if constexpr (kIndex == 0) { \ + return __riscv_vmv_v_v_##CHAR##SEW##LMUL##_tu(dest, Ext(d, v), \ + Lanes(df2)); \ + } else { \ + static_assert(kIndex == 1); \ + return SlideUp(dest, Ext(d, v), Lanes(df2)); \ + } \ + } +#define HWY_RVV_SET_SMALLEST(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ + SHIFT, MLEN, NAME, OP) \ + template \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_V(BASE, SEW, LMUL) dest, HWY_RVV_V(BASE, SEW, LMUL) v) { \ + auto d = HWY_RVV_D(BASE, SEW, HWY_LANES(HWY_RVV_T(BASE, SEW)), SHIFT){}; \ + if constexpr (kIndex 
== 0) { \ + return __riscv_vmv_v_v_##CHAR##SEW##LMUL##_tu(dest, v, Lanes(d) / 2); \ + } else { \ + static_assert(kIndex == 1); \ + return SlideUp(dest, v, Lanes(d) / 2); \ + } \ + } +HWY_RVV_FOREACH(HWY_RVV_SET, Set, set, _GET_SET) +HWY_RVV_FOREACH(HWY_RVV_SET_VIRT, Set, set, _GET_SET_VIRT) +HWY_RVV_FOREACH(HWY_RVV_SET_SMALLEST, Set, set, _GET_SET_SMALLEST) +#undef HWY_RVV_SET +#undef HWY_RVV_SET_VIRT +#undef HWY_RVV_SET_SMALLEST + } // namespace detail // ------------------------------ SlideUpLanes @@ -3144,29 +3278,47 @@ HWY_API VFromD SlideDownLanes(D d, VFromD v, size_t amt) { // ------------------------------ ConcatUpperLower template -HWY_API V ConcatUpperLower(D d, const V hi, const V lo) { - const size_t half = Lanes(d) / 2; - const V hi_down = detail::SlideDown(hi, half); - return detail::SlideUp(lo, hi_down, half); +HWY_API V ConcatUpperLower(D, const V hi, const V lo) { + const auto lo_lower = detail::Get<0>(lo); + return detail::Set<0>(hi, lo_lower); } // ------------------------------ ConcatLowerLower template -HWY_API V ConcatLowerLower(D d, const V hi, const V lo) { - return detail::SlideUp(lo, hi, Lanes(d) / 2); +HWY_API V ConcatLowerLower(D, const V hi, const V lo) { + const auto hi_lower = detail::Get<0>(hi); + return detail::Set<1>(lo, hi_lower); } // ------------------------------ ConcatUpperUpper template -HWY_API V ConcatUpperUpper(D d, const V hi, const V lo) { - const size_t half = Lanes(d) / 2; - const V hi_down = detail::SlideDown(hi, half); - const V lo_down = detail::SlideDown(lo, half); - return detail::SlideUp(lo_down, hi_down, half); +HWY_API V ConcatUpperUpper(D, const V hi, const V lo) { + const auto lo_upper = detail::Get<1>(lo); + return detail::Set<0>(hi, lo_upper); } // ------------------------------ ConcatLowerUpper -template +namespace detail { + +// Only getting a full register is a no-op. +template +constexpr bool IsGetNoOp(D d) { + return d.Pow2() >= 0; +} + +} // namespace detail + +template ())>* = nullptr> +HWY_API V ConcatLowerUpper(D d, const V hi, const V lo) { + const auto lo_upper = detail::Get<1>(lo); + const auto hi_lower = detail::Get<0>(hi); + const auto undef = Undefined(d); + return detail::Set<1>(detail::Set<0>(undef, lo_upper), hi_lower); +} + +template ())>* = nullptr> HWY_API V ConcatLowerUpper(D d, const V hi, const V lo) { const size_t half = Lanes(d) / 2; const V lo_down = detail::SlideDown(lo, half); From bea0da5b72555f6a9f2bfb120d10d1eece2e7a47 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 19 Nov 2024 13:24:32 +0000 Subject: [PATCH 04/64] Bump step-security/harden-runner from 2.10.1 to 2.10.2 Bumps [step-security/harden-runner](https://github.com/step-security/harden-runner) from 2.10.1 to 2.10.2. - [Release notes](https://github.com/step-security/harden-runner/releases) - [Commits](https://github.com/step-security/harden-runner/compare/91182cccc01eb5e619899d80e4e971d6181294a7...0080882f6c36860b6ba35c610c98ce87d4e2f26f) --- updated-dependencies: - dependency-name: step-security/harden-runner dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- .github/workflows/build_test.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build_test.yml b/.github/workflows/build_test.yml index 368b36a93c..ec62eaa44e 100644 --- a/.github/workflows/build_test.yml +++ b/.github/workflows/build_test.yml @@ -135,7 +135,7 @@ jobs: steps: - name: Harden Runner - uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1 + uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 with: egress-policy: audit # cannot be block - runner does git checkout @@ -230,7 +230,7 @@ jobs: steps: - name: Harden Runner - uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1 + uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 with: egress-policy: audit # cannot be block - runner does git checkout @@ -313,7 +313,7 @@ jobs: steps: - name: Harden Runner - uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1 + uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 with: egress-policy: audit # cannot be block - runner does git checkout @@ -334,7 +334,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Harden Runner - uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1 + uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 with: egress-policy: audit # cannot be block - runner does git checkout From 3da3328256e73b9e4140a486066a4b3a28df41ca Mon Sep 17 00:00:00 2001 From: Chris Sidebottom Date: Tue, 19 Nov 2024 18:24:08 +0000 Subject: [PATCH 05/64] Revert to previous logic for GatherIndexN --- hwy/ops/generic_ops-inl.h | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/hwy/ops/generic_ops-inl.h b/hwy/ops/generic_ops-inl.h index 99b518d99c..766c6c9d2e 100644 --- a/hwy/ops/generic_ops-inl.h +++ b/hwy/ops/generic_ops-inl.h @@ -2760,15 +2760,7 @@ template > HWY_API VFromD GatherIndexN(D d, const T* HWY_RESTRICT base, VFromD> index, const size_t max_lanes_to_load) { - const RebindToSigned di; - using TI = TFromD; - static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); - - VFromD v = Zero(d); - for (size_t i = 0; i < HWY_MIN(MaxLanes(d), max_lanes_to_load); ++i) { - v = InsertLane(v, i, base[ExtractLane(index, i)]); - } - return v; + return GatherIndexNOr(Zero(d), d, base, index, max_lanes_to_load); } template > @@ -2780,8 +2772,9 @@ HWY_API VFromD GatherIndexNOr(VFromD no, D d, const T* HWY_RESTRICT base, static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); VFromD v = no; - for (size_t i = 0; i < HWY_MIN(MaxLanes(d), max_lanes_to_load); ++i) { - v = InsertLane(v, i, base[ExtractLane(index, i)]); + for (size_t i = 0; i < MaxLanes(d); ++i) { + if (i < max_lanes_to_load) + v = InsertLane(v, i, base[ExtractLane(index, i)]); } return v; } From 3cb3a91a2ae74f9b5940779c6e7cf3ffc49475e3 Mon Sep 17 00:00:00 2001 From: Jan Wassenberg Date: Wed, 20 Nov 2024 05:32:46 -0800 Subject: [PATCH 06/64] detect cache parameters PiperOrigin-RevId: 698357330 --- hwy/contrib/thread_pool/topology.cc | 608 ++++++++++++++++++++--- hwy/contrib/thread_pool/topology.h | 33 ++ hwy/contrib/thread_pool/topology_test.cc | 45 +- 3 files changed, 611 insertions(+), 75 deletions(-) diff --git a/hwy/contrib/thread_pool/topology.cc b/hwy/contrib/thread_pool/topology.cc index cb34c2700e..0cca9bdcd8 100644 --- 
a/hwy/contrib/thread_pool/topology.cc +++ b/hwy/contrib/thread_pool/topology.cc @@ -20,11 +20,19 @@ #include #include // strchr +#include #include +#include #include #include "hwy/detect_compiler_arch.h" // HWY_OS_WIN +#if HWY_OS_APPLE +#include + +#include "hwy/aligned_allocator.h" // HWY_ALIGNMENT +#endif + #if HWY_OS_WIN #ifndef NOMINMAX #define NOMINMAX @@ -32,6 +40,9 @@ #ifndef WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN #endif +#ifndef _WIN32_WINNT +#define _WIN32_WINNT 0x0601 // Windows 7 / Server 2008 +#endif #include #endif // HWY_OS_WIN @@ -381,27 +392,38 @@ class Remapper { size_t num_ = 0; }; -// Stores the global cluster/core values separately for each package so we can -// return per-package arrays. +struct PackageSizes { + size_t num_clusters; + size_t num_cores; +}; + +// For internal use by `DetectPackages`. struct PerPackage { Remapper clusters; Remapper cores; + // We rely on this zero-init and increment it below. uint8_t smt_per_core[kMaxLogicalProcessors] = {0}; }; -// Initializes `lps` and returns a PerPackage vector (empty on failure). -std::vector DetectPackages(std::vector& lps) { - std::vector empty; +// Initializes `lps` and returns a PackageSizes vector (empty on failure) +// indicating the number of clusters and cores per package. +std::vector DetectPackages(std::vector& lps) { + std::vector empty; Remapper packages; for (size_t lp = 0; lp < lps.size(); ++lp) { if (!packages(kPackage, lp, &lps[lp].package)) return empty; } std::vector per_package(packages.Num()); + HWY_ASSERT(!per_package.empty()); for (size_t lp = 0; lp < lps.size(); ++lp) { PerPackage& pp = per_package[lps[lp].package]; - if (!pp.clusters(kCluster, lp, &lps[lp].cluster)) return empty; + // Not a failure: some CPUs lack a (shared) L3 cache. + if (!pp.clusters(kCluster, lp, &lps[lp].cluster)) { + lps[lp].cluster = 0; + } + if (!pp.cores(kCore, lp, &lps[lp].core)) return empty; // SMT ID is how many LP we have already seen assigned to the same core. @@ -410,7 +432,79 @@ std::vector DetectPackages(std::vector& lps) { HWY_ASSERT(lps[lp].smt < 16); } - return per_package; + std::vector package_sizes(per_package.size()); + for (size_t p = 0; p < package_sizes.size(); ++p) { + // Was zero if the package has no shared L3, see above. + package_sizes[p].num_clusters = HWY_MAX(1, per_package[p].clusters.Num()); + package_sizes[p].num_cores = per_package[p].cores.Num(); + HWY_ASSERT(package_sizes[p].num_cores != 0); + } + return package_sizes; +} + +std::vector ExpandList(const char* list, size_t list_end, + size_t max_lp) { + std::vector expanded; + constexpr size_t kNotFound = ~size_t{0}; + size_t pos = 0; + + // Returns first `found_pos >= pos` where `list[found_pos] == c`, or + // `kNotFound`. + const auto find = [list, list_end, &pos](char c) -> size_t { + const char* found_ptr = strchr(list + pos, c); + if (found_ptr == nullptr) return kNotFound; + const size_t found_pos = static_cast(found_ptr - list); + HWY_ASSERT(found_pos < list_end && list[found_pos] == c); + return found_pos; + }; + + // Reads LP number and advances `pos`. `end` is for verifying we did not + // read past a known terminator, or the end of string. + const auto parse_lp = [list, list_end, &pos, max_lp](size_t end) -> size_t { + end = HWY_MIN(end, list_end); + size_t lp; + HWY_ASSERT(ParseDigits(list, end, pos, &lp)); + HWY_IF_CONSTEXPR(HWY_ARCH_RISCV) { + // On RISC-V, both TotalLogicalProcessors and GetThreadAffinity may + // under-report the count, hence clamp. 
+ lp = HWY_MIN(lp, max_lp); + } + HWY_ASSERT(lp <= max_lp); + HWY_ASSERT(pos <= end); + return lp; + }; + + // Parse all [first-]last separated by commas. + for (;;) { + // Single number or first of range: ends with dash, comma, or end. + const size_t lp_range_first = parse_lp(HWY_MIN(find('-'), find(','))); + + if (list[pos] == '-') { // range + ++pos; // skip dash + // Last of range ends with comma or end. + const size_t lp_range_last = parse_lp(find(',')); + + expanded.reserve(expanded.size() + lp_range_last - lp_range_first + 1); + for (size_t lp = lp_range_first; lp <= lp_range_last; ++lp) { + expanded.push_back(lp); + } + } else { // single number + expanded.push_back(lp_range_first); + } + + // Done if reached end of string. + if (pos == list_end || list[pos] == '\0' || list[pos] == '\n') { + break; + } + // Comma means at least one more term is coming. + if (list[pos] == ',') { + ++pos; + continue; + } + HWY_ABORT("Unexpected character at %zu in %s\n", pos, list); + } // for pos + + return expanded; } // Sets LP.node for all `lps`. @@ -421,67 +515,12 @@ void SetNodes(std::vector& lps) { char buf200[200]; const size_t bytes_read = ReadSysfs(kNode, node, buf200); if (bytes_read == 0) break; - - constexpr size_t kNotFound = ~size_t{0}; - size_t pos = 0; - - // Returns first `found_pos >= pos` where `buf200[found_pos] == c`, or - // `kNotFound`. - const auto find = [buf200, &pos](char c) -> size_t { - const char* found_ptr = strchr(buf200 + pos, c); - if (found_ptr == nullptr) return kNotFound; - HWY_ASSERT(found_ptr >= buf200); - const size_t found_pos = static_cast(found_ptr - buf200); - HWY_ASSERT(found_pos >= pos && buf200[found_pos] == c); - return found_pos; - }; - - // Reads LP number and advances `pos`. `end` is for verifying we did not - // read past a known terminator, or the end of string. - const auto parse_lp = [buf200, bytes_read, &pos, - &lps](size_t end) -> size_t { - end = HWY_MIN(end, bytes_read); - size_t lp; - HWY_ASSERT(ParseDigits(buf200, end, pos, &lp)); - HWY_IF_CONSTEXPR(HWY_ARCH_RISCV) { - // On RISC-V, both TotalLogicalProcessors and GetThreadAffinity may - // under-report the count, hence clamp. - lp = HWY_MIN(lp, lps.size() - 1); - } - HWY_ASSERT(lp < lps.size()); - HWY_ASSERT(pos <= end); - return lp; - }; - - // Parse all [first-]last separated by commas. - for (;;) { - // Single number or first of range: ends with dash, comma, or end. - const size_t lp_range_first = parse_lp(HWY_MIN(find('-'), find(','))); - - if (buf200[pos] == '-') { // range - ++pos; // skip dash - // Last of range ends with comma or end. - const size_t lp_range_last = parse_lp(find(',')); - - for (size_t lp = lp_range_first; lp <= lp_range_last; ++lp) { - lps[lp].node = static_cast(node); - } - } else { // single number - lps[lp_range_first].node = static_cast(node); - } - - // Done if reached end of string. - if (pos == bytes_read || buf200[pos] == '\0' || buf200[pos] == '\n') { - break; - } - // Comma means at least one more term is coming. 
- if (buf200[pos] == ',') { - ++pos; - continue; - } - HWY_ABORT("Unexpected character at %zu in %s\n", pos, buf200); - } // for pos - } // for node + const std::vector list = + ExpandList(buf200, bytes_read, lps.size() - 1); + for (size_t lp : list) { + lps[lp].node = static_cast(node); + } + } } } // namespace @@ -490,16 +529,16 @@ void SetNodes(std::vector& lps) { HWY_CONTRIB_DLLEXPORT Topology::Topology() { #if HWY_OS_LINUX lps.resize(TotalLogicalProcessors()); - const std::vector& per_package = DetectPackages(lps); - if (per_package.empty()) return; + const std::vector& package_sizes = DetectPackages(lps); + if (package_sizes.empty()) return; SetNodes(lps); // Allocate per-package/cluster/core vectors. This indicates to callers that // detection succeeded. - packages.resize(per_package.size()); + packages.resize(package_sizes.size()); for (size_t p = 0; p < packages.size(); ++p) { - packages[p].clusters.resize(per_package[p].clusters.Num()); - packages[p].cores.resize(per_package[p].cores.Num()); + packages[p].clusters.resize(package_sizes[p].num_clusters); + packages[p].cores.resize(package_sizes[p].num_cores); } // Populate the per-cluster/core sets of LP. @@ -527,4 +566,427 @@ HWY_CONTRIB_DLLEXPORT Topology::Topology() { #endif } +// ------------------------------ Cache detection + +using Caches = std::array; + +// Returns `whole / part`, with a check that `part` evenly divides `whole`, +// which implies the result is exact. +static HWY_MAYBE_UNUSED size_t DivByFactor(size_t whole, size_t part) { + HWY_ASSERT(part != 0); + const size_t div = whole / part; + const size_t mul = div * part; + if (mul != whole) { + HWY_ABORT("%zu / %zu = %zu; *%zu = %zu\n", whole, part, div, part, mul); + } + return div; +} + +// We assume homogeneous caches across all clusters because some OS APIs return +// a single value for a class of CPUs. + +#if HWY_OS_LINUX +std::string ReadString(const char* name, size_t index) { + // First CPU is usually a P core. + const std::string path("/sys/devices/system/cpu/cpu0/cache/index%zu/"); + char buf200[200]; + size_t end = ReadSysfs((path + name).c_str(), index, buf200); + // Remove trailing newline/null to simplify string comparison. + for (; end != 0; --end) { + if (buf200[end - 1] != '\0' && buf200[end - 1] != '\n') break; + } + return std::string(buf200, buf200 + end); +} + +template +bool WriteSysfs(const char* name, size_t index, T* out) { + const std::string str = ReadString(name, index); + // Do not call `ParseNumberWithOptionalSuffix` because it acts on the + // K suffix in "size", but we actually want KiB. + size_t pos = 0; + size_t val; + if (!ParseDigits(str.c_str(), str.length(), pos, &val)) return false; + HWY_ASSERT(pos <= str.length()); + *out = static_cast(val); + return true; +} + +// Reading from sysfs is preferred because sysconf returns L3 associativity = 0 +// on some CPUs, and does not indicate sharing across cores. +// https://www.kernel.org/doc/Documentation/ABI/testing/sysfs-devices-system-cpu +static bool InitCachesSysfs(Caches& caches) { + // For computing shared cache sizes. + std::vector lps(TotalLogicalProcessors()); + const std::vector package_sizes = DetectPackages(lps); + // `package_sizes` is only used to check that `lps` were filled. 
+ if (package_sizes.empty()) { + fprintf(stderr, "WARN: no packages, shared cache sizes may be incorrect\n"); + return false; + } + + for (size_t i = 0;; ++i) { + const std::string type = ReadString("type", i); + if (type.empty()) break; // done, no more entries + if (type != "Data" && type != "Unified") continue; + uint32_t level; + if (!WriteSysfs("level", i, &level)) continue; + if (level != 1 && level != 2 && level != 3) continue; + Cache& c = caches[level]; + + // Check before overwriting any fields. + if (c.size_kib != 0) { + fprintf(stderr, "WARN: ignoring another L%u, first size %u\n", level, + c.size_kib); + continue; + } + + const bool ok = WriteSysfs("size", i, &c.size_kib) && + WriteSysfs("ways_of_associativity", i, &c.associativity) && + WriteSysfs("number_of_sets", i, &c.sets); + if (HWY_UNLIKELY(!ok)) { + fprintf(stderr, "WARN: skipping partially-detected L%u, error %d\n", + level, errno); + c = Cache(); + continue; + } + + // Compute line size *before* adjusting the size for sharing. Note that + // `coherency_line_size` exists, but we are not sure that is the line size. + const size_t bytes = static_cast(c.size_kib) * 1024; + const size_t lines = c.associativity * c.sets; + c.bytes_per_line = static_cast(DivByFactor(bytes, lines)); + + // Divide by number of *cores* sharing the cache. + const std::string shared_str = ReadString("shared_cpu_list", i); + if (HWY_UNLIKELY(shared_str.empty())) { + fprintf(stderr, "WARN: no shared_cpu_list for L%u %s\n", level, + type.c_str()); + c.cores_sharing = 1; + } else { + const std::vector shared_lps = + ExpandList(shared_str.c_str(), shared_str.length(), lps.size() - 1); + size_t num_cores = 0; + for (size_t lp : shared_lps) { + if (HWY_LIKELY(lp < lps.size())) { + num_cores += lps[lp].smt == 0; + } else { + fprintf(stderr, "WARN: out of bounds lp %zu of %zu from %s\n", lp, + lps.size(), shared_str.c_str()); + } + } + if (num_cores == 0) { + fprintf(stderr, "WARN: no cores sharing L%u %s, setting to 1\n", level, + type.c_str()); + num_cores = 1; + } + c.cores_sharing = static_cast(num_cores); + // There exist CPUs for which L3 is not evenly divisible by `num_cores`, + // hence do not use `DivByFactor`. It is safer to round down. + c.size_kib = static_cast(c.size_kib / num_cores); + c.sets = static_cast(c.sets / num_cores); + } + } + + // Require L1 and L2 cache. + if (HWY_UNLIKELY(caches[1].size_kib == 0 || caches[2].size_kib == 0)) { + fprintf(stderr, "WARN: sysfs detected L1=%u L2=%u, err %x\n", + caches[1].size_kib, caches[2].size_kib, errno); + return false; + } + + // L3 is optional; if not found, its size is already zero from static init. + return true; +} + +#endif // HWY_OS_LINUX + +#if HWY_OS_WIN +using SLPI = SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX; + +template +bool ForEachSLPI(LOGICAL_PROCESSOR_RELATIONSHIP rel, Func&& func) { + // Get required buffer size. + DWORD buf_bytes = 0; + HWY_ASSERT(!GetLogicalProcessorInformationEx(rel, nullptr, &buf_bytes)); + HWY_ASSERT(GetLastError() == ERROR_INSUFFICIENT_BUFFER); + // Note: `buf_bytes` may be less than `sizeof(SLPI)`, which has padding. + uint8_t* buf = static_cast(malloc(buf_bytes)); + HWY_ASSERT(buf); + + // Fill the buffer. + SLPI* info = reinterpret_cast(buf); + if (HWY_UNLIKELY(!GetLogicalProcessorInformationEx(rel, info, &buf_bytes))) { + free(buf); + return false; + } + + // Iterate over each SLPI. `sizeof(SLPI)` is unreliable, see above. 
+ uint8_t* pos = buf; + while (pos < buf + buf_bytes) { + info = reinterpret_cast(pos); + HWY_ASSERT(info->Relationship == rel); + func(*info); + pos += info->Size; + } + if (pos != buf + buf_bytes) { + fprintf( + stderr, + "WARN: unexpected pos %p, end %p, buf_bytes %lu, sizeof(SLPI) %zu\n", + pos, buf + buf_bytes, buf_bytes, sizeof(SLPI)); + } + + free(buf); + return true; +} + +static size_t NumBits(size_t num_groups, const GROUP_AFFINITY* affinity) { + size_t total_bits = 0; + for (size_t i = 0; i < num_groups; ++i) { + size_t bits = 0; + hwy::CopyBytes(&affinity[i].Mask, &bits); + total_bits += hwy::PopCount(bits); + } + return total_bits; +} + +static size_t MaxLogicalPerCore() { + size_t max_logical = 0; + ForEachSLPI(RelationProcessorCore, [&max_logical](const SLPI& info) { + const PROCESSOR_RELATIONSHIP& p = info.Processor; + max_logical = HWY_MAX(max_logical, NumBits(p.GroupCount, p.GroupMask)); + }); + HWY_ASSERT(max_logical != 0); + return max_logical; +} + +static bool InitCachesWin(Caches& caches) { + const size_t max_logical_per_core = MaxLogicalPerCore(); + + ForEachSLPI(RelationCache, [max_logical_per_core, &caches](const SLPI& info) { + const CACHE_RELATIONSHIP& cr = info.Cache; + if (cr.Type != CacheUnified && cr.Type != CacheData) return; + if (1 <= cr.Level && cr.Level <= 3) { + Cache& c = caches[cr.Level]; + HWY_ASSERT(c.size_kib == 0); // not set yet + c.size_kib = static_cast(DivByFactor(cr.CacheSize, 1024)); + c.bytes_per_line = static_cast(cr.LineSize); + c.associativity = (cr.Associativity == CACHE_FULLY_ASSOCIATIVE) + ? Cache::kMaxAssociativity + : cr.Associativity; + + // How many cores share this cache? + size_t shared_with = NumBits(cr.GroupCount, cr.GroupMasks); + // Divide out hyperthreads. This core may have fewer than + // `max_logical_per_core`, hence round up. + shared_with = DivCeil(shared_with, max_logical_per_core); + if (shared_with == 0) { + fprintf(stderr, "WARN: no cores sharing L%u, setting to 1\n", cr.Level); + shared_with = 1; + } + + // Update `size_kib` to *per-core* portion. + // There exist CPUs for which L3 is not evenly divisible by `shared_with`, + // hence do not use `DivByFactor`. It is safer to round down. + c.size_kib = static_cast(c.size_kib / shared_with); + c.cores_sharing = static_cast(shared_with); + } + }); + + // Require L1 and L2 cache. + if (HWY_UNLIKELY(caches[1].size_kib == 0 || caches[2].size_kib == 0)) { + fprintf(stderr, "WARN: Windows detected L1=%u, L2=%u, err %lx\n", + caches[1].size_kib, caches[2].size_kib, GetLastError()); + return false; + } + + // L3 is optional; if not found, its size is already zero from static init. + return true; +} +#endif // HWY_OS_WIN + +#if HWY_OS_APPLE +// Returns whether sysctlbyname() succeeded; if so, writes `val / div` to +// `out`, otherwise sets `err`. +template +bool Sysctl(const char* name, size_t div, int& err, T* out) { + size_t val = 0; + size_t size = sizeof(val); + // Last two arguments are for updating the value, which we do not want. + const int ret = sysctlbyname(name, &val, &size, nullptr, 0); + if (HWY_UNLIKELY(ret != 0)) { + // Do not print warnings because some `name` are expected to fail. + err = ret; + return false; + } + *out = static_cast(DivByFactor(val, div)); + return true; +} + +static bool InitCachesApple(Caches& caches) { + int err = 0; + Cache& L1 = caches[1]; + Cache& L2 = caches[2]; + Cache& L3 = caches[3]; + + // Total L1 and L2 size can be reliably queried, but prefer perflevel0 + // (P-cores) because hw.l1dcachesize etc. 
are documented to describe the + // "least performant core". + bool ok = Sysctl("hw.perflevel0.l1dcachesize", 1024, err, &L1.size_kib) || + Sysctl("hw.l1dcachesize", 1024, err, &L1.size_kib); + ok &= Sysctl("hw.perflevel0.l2cachesize", 1024, err, &L2.size_kib) || + Sysctl("hw.l2cachesize", 1024, err, &L2.size_kib); + if (HWY_UNLIKELY(!ok)) { + fprintf(stderr, "WARN: Apple cache detection failed, error %d\n", err); + return false; + } + L1.cores_sharing = 1; + if (Sysctl("hw.perflevel0.cpusperl2", 1, err, &L2.cores_sharing)) { + L2.size_kib = DivByFactor(L2.size_kib, L2.cores_sharing); + } else { + L2.cores_sharing = 1; + } + + // Other properties are not always reported. Set `associativity` and + // `bytes_per_line` based on known models. + char brand[128] = {0}; + size_t size = sizeof(brand); + if (!sysctlbyname("machdep.cpu.brand_string", brand, &size, nullptr, 0)) { + if (!strncmp(brand, "Apple ", 6)) { + // Unexpected, but we will continue check the string suffixes. + fprintf(stderr, "WARN: unexpected Apple brand %s\n", brand); + } + + if (brand[6] == 'M') { + // https://dougallj.github.io/applecpu/firestorm.html, + // https://www.7-cpu.com/cpu/Apple_M1.html: + L1.bytes_per_line = 64; + L1.associativity = 8; + L2.bytes_per_line = 128; + if (brand[7] == '1') { // M1 + L2.associativity = 12; + } else if ('2' <= brand[7] && brand[7] <= '4') { // M2/M3, maybe also M4 + L2.associativity = 16; + } else { + L2.associativity = 0; // Unknown, set below via sysctl. + } + + // Although Wikipedia lists SLC sizes per model, we do not know how it is + // partitioned/allocated, so do not treat it as a reliable L3. + } // M* + } // brand string + + // This sysctl does not distinguish between L1 and L2 line sizes, so only use + // it if we have not already set `bytes_per_line` above. + uint16_t bytes_per_line; + if (!Sysctl("hw.cachelinesize", 1, err, &bytes_per_line)) { + bytes_per_line = static_cast(HWY_ALIGNMENT); // guess + } + for (size_t level = 1; level <= 3; ++level) { + if (caches[level].bytes_per_line == 0) { + caches[level].bytes_per_line = bytes_per_line; + } + } + + // Fill in associativity if not already set. Unfortunately this is only + // reported on x86, not on M*. + if (L1.associativity == 0 && !Sysctl("machdep.cpu.cache.L1_associativity", 1, + err, &L1.associativity)) { + L1.associativity = 8; // guess + } + if (L2.associativity == 0 && !Sysctl("machdep.cpu.cache.L2_associativity", 1, + err, &L2.associativity)) { + L2.associativity = 12; // guess + } + // There is no L3_associativity. + if (L3.associativity == 0) { + L3.associativity = 12; // guess + } + + // Now attempt to query L3. Although this sysctl is documented, M3 does not + // report an L3 cache. + if (L3.size_kib == 0 && + (Sysctl("hw.perflevel0.l3cachesize", 1024, err, &L3.size_kib) || + Sysctl("hw.l3cachesize", 1024, err, &L3.size_kib))) { + if (Sysctl("hw.perflevel0.cpusperl3", 1, err, &L3.cores_sharing)) { + L3.size_kib = DivByFactor(L3.size_kib, L3.cores_sharing); + } else { + L3.cores_sharing = 1; + } + } + // If no L3 cache, reset all fields for consistency. + if (L3.size_kib == 0) { + L3 = Cache(); + } + + // Are there other useful sysctls? hw.cacheconfig appears to be how many + // cores share the memory and caches, though this is not documented, and + // duplicates information in hw.perflevel0.cpusperl*. + + return true; +} + +#endif // HWY_OS_APPLE + +// Most APIs do not set the `sets` field, so compute it from the size and +// associativity, and if a value is already set, ensure it matches. 
+static HWY_MAYBE_UNUSED void ComputeSets(Cache& c) { + // If there is no such cache, avoid division by zero. + if (HWY_UNLIKELY(c.size_kib == 0)) { + c.sets = 0; + return; + } + const size_t bytes = static_cast(c.size_kib) * 1024; + // `size_kib` may have been rounded down, hence `lines` and `sets` are not + // necessarily evenly divisible, so round down instead of `DivByFactor`. + const size_t lines = bytes / c.bytes_per_line; + const size_t sets = lines / c.associativity; + + if (c.sets == 0) { + c.sets = static_cast(sets); + } else { + if (c.sets != sets) { + HWY_ABORT("Inconsistent cache sets %u != %zu\n", c.sets, sets); + } + } +} + +static const Cache* InitDataCaches() { + alignas(64) static Caches caches; + + // On failure, return immediately because InitCaches*() already warn. +#if HWY_OS_LINUX + if (HWY_UNLIKELY(!InitCachesSysfs(caches))) return nullptr; +#elif HWY_OS_WIN + if (HWY_UNLIKELY(!InitCachesWin(caches))) return nullptr; +#elif HWY_OS_APPLE + if (HWY_UNLIKELY(!InitCachesApple(caches))) return nullptr; +#else + fprintf(stderr, "Cache detection not implemented for this platform.\n"); + (void)caches; + return nullptr; +#define HWY_NO_CACHE_DETECTION +#endif + + // Prevents "code not reached" warnings on WASM. +#ifndef HWY_NO_CACHE_DETECTION + for (size_t level = 1; level <= 3; ++level) { + ComputeSets(caches[level]); + } + + // Heuristic to ignore SLCs such as on Ampere Altra, which should not be + // treated as a reliable L3 because of their cache inclusion policy. + // On Apple M*, these are not even reported as an L3. + if (caches[3].cores_sharing >= 16 && caches[3].size_kib <= 512) { + caches[3] = Cache(); + } + + return &caches[0]; +#endif // HWY_NO_CACHE_DETECTION +} + +HWY_CONTRIB_DLLEXPORT const Cache* DataCaches() { + static const Cache* caches = InitDataCaches(); + return caches; +} + } // namespace hwy diff --git a/hwy/contrib/thread_pool/topology.h b/hwy/contrib/thread_pool/topology.h index 499b013c92..84780a8745 100644 --- a/hwy/contrib/thread_pool/topology.h +++ b/hwy/contrib/thread_pool/topology.h @@ -103,6 +103,39 @@ struct Topology { std::vector lps; // size() == TotalLogicalProcessors(). }; +#pragma pack(push, 1) +// Cache parameters. Note the overlap with `HWY_ALIGNMENT`, which is intended +// but not guaranteed to be an upper bound for L1/L2 line sizes, and +// `Topology::Cluster::private_kib/shared_kib`, which are intended but not +// guaranteed to be the L2/L3 sizes. Getting the exact parameters, including the +// ways of associativity, can be useful for modeling cache conflicts. +// +// Uses packed fields so the array of `Cache` fits in a typical cache line. +struct Cache { + // Arbitrary upper bound for sanity checking. + static constexpr uint16_t kMaxAssociativity = 128; + + // Zero if the level does not exist; *per-core* portion for shared caches. + uint32_t size_kib = 0; + // Also per-core portion, computed as number of lines / associativity. + uint32_t sets = 0; + uint16_t bytes_per_line = 0; + uint16_t associativity = 0; // number of ways + uint16_t cores_sharing = 0; // usually 1 for L1 + uint16_t reserved = 0; +}; +static_assert(sizeof(Cache) == 16, "Unexpected size"); +#pragma pack(pop) + +// Returns null if unknown, otherwise pointer to an array of `Cache` instances, +// where entry 0 is reserved, entry 1 describes the L1 data cache, entry 2 +// describes the (possibly unified or shared) L2, and entry 3 describes the L3 +// if its `size_kib != 0`. 
+// +// Initializes on-demand, which has some overhead for thread safety, hence +// callers should cache the result. +HWY_CONTRIB_DLLEXPORT const Cache* DataCaches(); + } // namespace hwy #endif // HIGHWAY_HWY_CONTRIB_THREAD_POOL_TOPOLOGY_H_ diff --git a/hwy/contrib/thread_pool/topology_test.cc b/hwy/contrib/thread_pool/topology_test.cc index e1f4409f3a..38d5202f3f 100644 --- a/hwy/contrib/thread_pool/topology_test.cc +++ b/hwy/contrib/thread_pool/topology_test.cc @@ -61,8 +61,7 @@ TEST(TopologyTest, TestTopology) { size_t lps_by_cluster = 0; size_t lps_by_core = 0; LogicalProcessorSet all_lps; - for (size_t p = 0; p < topology.packages.size(); ++p) { - const Topology::Package& pkg = topology.packages[p]; + for (const Topology::Package& pkg : topology.packages) { HWY_ASSERT(!pkg.clusters.empty()); HWY_ASSERT(!pkg.cores.empty()); HWY_ASSERT(pkg.clusters.size() <= pkg.cores.size()); @@ -83,6 +82,48 @@ TEST(TopologyTest, TestTopology) { HWY_ASSERT(all_lps.Count() == topology.lps.size()); } +void PrintCache(const Cache& c, size_t level) { + fprintf(stderr, + "L%zu: size %u KiB, line size %u, assoc %u, sets %u, cores %u\n", + level, c.size_kib, c.bytes_per_line, c.associativity, c.sets, + c.cores_sharing); +} + +static void CheckCache(const Cache& c, size_t level) { + // L1-L2 must exist, L3 is not guaranteed. + if (level == 3 && c.size_kib == 0) { + HWY_ASSERT(c.associativity == 0 && c.bytes_per_line == 0 && c.sets == 0); + return; + } + + // size and thus sets are not necessarily powers of two. + HWY_ASSERT(c.size_kib != 0); + HWY_ASSERT(c.sets != 0); + + // Intel Skylake has non-pow2 L3 associativity, and Apple L2 also, so we can + // only check loose bounds. + HWY_ASSERT(c.associativity >= 2); + HWY_ASSERT(c.associativity <= Cache::kMaxAssociativity); + + // line sizes are always powers of two because CPUs partition addresses into + // line offsets (the lower bits), set, and tag. 
+ const auto is_pow2 = [](uint32_t x) { return x != 0 && (x & (x - 1)) == 0; }; + HWY_ASSERT(is_pow2(c.bytes_per_line)); + HWY_ASSERT(32 <= c.bytes_per_line && c.bytes_per_line <= 1024); + + HWY_ASSERT(c.cores_sharing != 0); + HWY_ASSERT(c.cores_sharing <= TotalLogicalProcessors()); +} + +TEST(TopologyTest, TestCaches) { + const Cache* caches = DataCaches(); + if (!caches) return; + for (size_t level = 1; level <= 3; ++level) { + PrintCache(caches[level], level); + CheckCache(caches[level], level); + } +} + } // namespace } // namespace hwy From 8a0602d40f9e6ed29a74e9fe8138d32739f19a6a Mon Sep 17 00:00:00 2001 From: Jan Wassenberg Date: Wed, 20 Nov 2024 09:22:39 -0800 Subject: [PATCH 07/64] replace non-test/trace fprintf with new hwy::Warn/HWY_WARN PiperOrigin-RevId: 698416280 --- hwy/abort.cc | 50 ++++++++++++++++++-- hwy/abort.h | 34 ++++++++++---- hwy/abort_test.cc | 21 +++++++++ hwy/base.h | 6 +++ hwy/contrib/bit_pack/bit_pack_test.cc | 2 +- hwy/contrib/math/math_test.cc | 3 +- hwy/contrib/sort/algo-inl.h | 5 +- hwy/contrib/sort/bench_sort.cc | 3 +- hwy/contrib/sort/sort_test.cc | 1 - hwy/contrib/sort/vqsort-inl.h | 10 ++-- hwy/contrib/thread_pool/topology.cc | 67 ++++++++++++--------------- hwy/examples/benchmark.cc | 11 ++--- hwy/nanobenchmark.cc | 24 +++++----- hwy/nanobenchmark.h | 20 +------- hwy/nanobenchmark_test.cc | 2 +- hwy/targets.cc | 11 ++--- hwy/tests/tuple_test.cc | 3 +- 17 files changed, 163 insertions(+), 110 deletions(-) diff --git a/hwy/abort.cc b/hwy/abort.cc index a40ee59e6e..a67819bbd3 100644 --- a/hwy/abort.cc +++ b/hwy/abort.cc @@ -9,6 +9,7 @@ #include #include +#include #include #include "hwy/base.h" @@ -20,21 +21,62 @@ namespace hwy { namespace { + +std::atomic& AtomicWarnFunc() { + static std::atomic func; + return func; +} + +std::atomic& AtomicAbortFunc() { + static std::atomic func; + return func; +} + std::string GetBaseName(std::string const& file_name) { auto last_slash = file_name.find_last_of("/\\"); return file_name.substr(last_slash + 1); } + } // namespace +// Returning a reference is unfortunately incompatible with `std::atomic`, which +// is required to safely implement `SetWarnFunc`. As a workaround, we store a +// copy here, update it when called, and return a reference to the copy. This +// has the added benefit of protecting the actual pointer from modification. +HWY_DLLEXPORT WarnFunc& GetWarnFunc() { + static WarnFunc func; + func = AtomicWarnFunc().load(); + return func; +} + HWY_DLLEXPORT AbortFunc& GetAbortFunc() { static AbortFunc func; + func = AtomicAbortFunc().load(); return func; } +HWY_DLLEXPORT WarnFunc SetWarnFunc(WarnFunc func) { + return AtomicWarnFunc().exchange(func); +} + HWY_DLLEXPORT AbortFunc SetAbortFunc(AbortFunc func) { - const AbortFunc prev = GetAbortFunc(); - GetAbortFunc() = func; - return prev; + return AtomicAbortFunc().exchange(func); +} + +HWY_DLLEXPORT void HWY_FORMAT(3, 4) + Warn(const char* file, int line, const char* format, ...) 
{ + char buf[800]; + va_list args; + va_start(args, format); + vsnprintf(buf, sizeof(buf), format, args); + va_end(args); + + WarnFunc handler = AtomicWarnFunc().load(); + if (handler != nullptr) { + handler(file, line, buf); + } else { + fprintf(stderr, "Warn at %s:%d: %s\n", GetBaseName(file).data(), line, buf); + } } HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4) @@ -45,7 +87,7 @@ HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4) vsnprintf(buf, sizeof(buf), format, args); va_end(args); - AbortFunc handler = GetAbortFunc(); + AbortFunc handler = AtomicAbortFunc().load(); if (handler != nullptr) { handler(file, line, buf); } else { diff --git a/hwy/abort.h b/hwy/abort.h index b49d4685d0..afa68fb46f 100644 --- a/hwy/abort.h +++ b/hwy/abort.h @@ -9,20 +9,36 @@ namespace hwy { -// Interface for custom abort handler -typedef void (*AbortFunc)(const char* file, int line, - const char* formatted_err); +// Interfaces for custom Warn/Abort handlers. +typedef void (*WarnFunc)(const char* file, int line, const char* message); -// Retrieve current abort handler -// Returns null if no abort handler registered, indicating Highway should print and abort +typedef void (*AbortFunc)(const char* file, int line, const char* message); + +// Returns current Warn() handler, or nullptr if no handler was yet registered, +// indicating Highway should print to stderr. +// DEPRECATED because this is thread-hostile and prone to misuse (modifying the +// underlying pointer through the reference). +HWY_DLLEXPORT WarnFunc& GetWarnFunc(); + +// Returns current Abort() handler, or nullptr if no handler was yet registered, +// indicating Highway should print to stderr and abort. +// DEPRECATED because this is thread-hostile and prone to misuse (modifying the +// underlying pointer through the reference). HWY_DLLEXPORT AbortFunc& GetAbortFunc(); -// Sets a new abort handler and returns the previous abort handler -// If this handler does not do the aborting itself Highway will use its own abort mechanism -// which allows this to be used to customize the handling of the error itself. -// Returns null if no previous abort handler registered +// Sets a new Warn() handler and returns the previous handler, which is nullptr +// if no previous handler was registered, and should otherwise be called from +// the new handler. Thread-safe. +HWY_DLLEXPORT WarnFunc SetWarnFunc(WarnFunc func); + +// Sets a new Abort() handler and returns the previous handler, which is nullptr +// if no previous handler was registered, and should otherwise be called from +// the new handler. If all handlers return, then Highway will terminate the app. +// Thread-safe. HWY_DLLEXPORT AbortFunc SetAbortFunc(AbortFunc func); +// Abort()/Warn() and HWY_ABORT/HWY_WARN are declared in base.h. + } // namespace hwy #endif // HIGHWAY_HWY_ABORT_H_ diff --git a/hwy/abort_test.cc b/hwy/abort_test.cc index 804ae06ace..a3c81ab896 100644 --- a/hwy/abort_test.cc +++ b/hwy/abort_test.cc @@ -15,7 +15,28 @@ namespace hwy { namespace { +TEST(AbortTest, WarnOverrideChain) { + WarnFunc FirstHandler = [](const char* file, int line, + const char* formatted_err) -> void { + fprintf(stderr, "%s from %d of %s", formatted_err, line, file); + }; + WarnFunc SecondHandler = [](const char* file, int line, + const char* formatted_err) -> void { + fprintf(stderr, "%s from %d of %s", formatted_err, line, file); + }; + + // Do not check that the first SetWarnFunc returns nullptr, because it is + // not guaranteed to be the first call - other TEST may come first. 
+ (void)SetWarnFunc(FirstHandler); + HWY_ASSERT(GetWarnFunc() == FirstHandler); + HWY_ASSERT(SetWarnFunc(SecondHandler) == FirstHandler); + HWY_ASSERT(GetWarnFunc() == SecondHandler); + HWY_ASSERT(SetWarnFunc(nullptr) == SecondHandler); + HWY_ASSERT(GetWarnFunc() == nullptr); +} + #ifdef GTEST_HAS_DEATH_TEST + std::string GetBaseName(std::string const& file_name) { auto last_slash = file_name.find_last_of("/\\"); return file_name.substr(last_slash + 1); diff --git a/hwy/base.h b/hwy/base.h index 1f40be19c3..f2dc87c0c5 100644 --- a/hwy/base.h +++ b/hwy/base.h @@ -251,6 +251,12 @@ namespace hwy { // 4 instances of a given literal value, useful as input to LoadDup128. #define HWY_REP4(literal) literal, literal, literal, literal +HWY_DLLEXPORT void HWY_FORMAT(3, 4) + Warn(const char* file, int line, const char* format, ...); + +#define HWY_WARN(format, ...) \ + ::hwy::Warn(__FILE__, __LINE__, format, ##__VA_ARGS__) + HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4) Abort(const char* file, int line, const char* format, ...); diff --git a/hwy/contrib/bit_pack/bit_pack_test.cc b/hwy/contrib/bit_pack/bit_pack_test.cc index 2a660b4750..d6a3bcfa87 100644 --- a/hwy/contrib/bit_pack/bit_pack_test.cc +++ b/hwy/contrib/bit_pack/bit_pack_test.cc @@ -133,7 +133,7 @@ struct TestPack { }, inputs, kNumInputs, results, p); if (num_results != kNumInputs) { - fprintf(stderr, "MeasureClosure failed.\n"); + HWY_WARN("MeasureClosure failed.\n"); return; } // Print throughput for pack+unpack round trip diff --git a/hwy/contrib/math/math_test.cc b/hwy/contrib/math/math_test.cc index 6ac36a6724..ef9eec3dbc 100644 --- a/hwy/contrib/math/math_test.cc +++ b/hwy/contrib/math/math_test.cc @@ -75,8 +75,7 @@ HWY_NOINLINE void TestMath(const char* name, T (*fx1)(T), static bool once = true; if (once) { once = false; - fprintf(stderr, - "Skipping math_test due to GCC issue with excess precision.\n"); + HWY_WARN("Skipping math_test due to GCC issue with excess precision.\n"); } return; } diff --git a/hwy/contrib/sort/algo-inl.h b/hwy/contrib/sort/algo-inl.h index 9087f10886..abbe25c7a7 100644 --- a/hwy/contrib/sort/algo-inl.h +++ b/hwy/contrib/sort/algo-inl.h @@ -554,8 +554,7 @@ void Run(Algo algo, KeyType* inout, size_t num_keys, SharedState& shared, case Algo::kVXSort: { #if (VXSORT_AVX3 && HWY_TARGET != HWY_AVX3) || \ (!VXSORT_AVX3 && HWY_TARGET != HWY_AVX2) - fprintf(stderr, "Do not call for target %s\n", - hwy::TargetName(HWY_TARGET)); + HWY_WARN("Do not call for target %s\n", hwy::TargetName(HWY_TARGET)); return; #else #if VXSORT_AVX3 @@ -566,7 +565,7 @@ void Run(Algo algo, KeyType* inout, size_t num_keys, SharedState& shared, if (kAscending) { return vx.sort(inout, inout + num_keys - 1); } else { - fprintf(stderr, "Skipping VX - does not support descending order\n"); + HWY_WARN("Skipping VX - does not support descending order\n"); return; } #endif // enabled for this target diff --git a/hwy/contrib/sort/bench_sort.cc b/hwy/contrib/sort/bench_sort.cc index 8bcbec6672..e53ee2708d 100644 --- a/hwy/contrib/sort/bench_sort.cc +++ b/hwy/contrib/sort/bench_sort.cc @@ -80,8 +80,7 @@ HWY_NOINLINE void BenchAllColdSort() { char cpu100[100]; if (!platform::HaveTimerStop(cpu100)) { - fprintf(stderr, "CPU '%s' does not support RDTSCP, skipping benchmark.\n", - cpu100); + HWY_WARN("CPU '%s' does not support RDTSCP, skipping benchmark.\n", cpu100); return; } diff --git a/hwy/contrib/sort/sort_test.cc b/hwy/contrib/sort/sort_test.cc index 2d9f825a99..e891418d8d 100644 --- a/hwy/contrib/sort/sort_test.cc +++ 
b/hwy/contrib/sort/sort_test.cc @@ -95,7 +95,6 @@ void TestAllSortIota() { if (hwy::HaveFloat64()) { TestSortIota(pool); } - fprintf(stderr, "Iota OK\n"); #endif } diff --git a/hwy/contrib/sort/vqsort-inl.h b/hwy/contrib/sort/vqsort-inl.h index fa849bb392..3d0fa0f76a 100644 --- a/hwy/contrib/sort/vqsort-inl.h +++ b/hwy/contrib/sort/vqsort-inl.h @@ -1917,8 +1917,8 @@ HWY_INLINE bool HandleSpecialCases(D d, Traits st, T* HWY_RESTRICT keys, const bool huge_vec = kPotentiallyHuge && (2 * N > base_case_num); if (partial_128 || huge_vec) { if (VQSORT_PRINT >= 1) { - fprintf(stderr, "WARNING: using slow HeapSort: partial %d huge %d\n", - partial_128, huge_vec); + HWY_WARN("using slow HeapSort: partial %d huge %d\n", partial_128, + huge_vec); } HeapSort(st, keys, num); return true; @@ -1998,7 +1998,7 @@ void Sort(D d, Traits st, T* HWY_RESTRICT keys, const size_t num, (void)d; (void)buf; if (VQSORT_PRINT >= 1) { - fprintf(stderr, "WARNING: using slow HeapSort because vqsort disabled\n"); + HWY_WARN("using slow HeapSort because vqsort disabled\n"); } detail::HeapSort(st, keys, num); #endif // VQSORT_ENABLED @@ -2043,7 +2043,7 @@ void PartialSort(D d, Traits st, T* HWY_RESTRICT keys, size_t num, size_t k, (void)d; (void)buf; if (VQSORT_PRINT >= 1) { - fprintf(stderr, "WARNING: using slow HeapSort because vqsort disabled\n"); + HWY_WARN("using slow HeapSort because vqsort disabled\n"); } detail::HeapPartialSort(st, keys, num, k); #endif // VQSORT_ENABLED @@ -2084,7 +2084,7 @@ void Select(D d, Traits st, T* HWY_RESTRICT keys, const size_t num, (void)d; (void)buf; if (VQSORT_PRINT >= 1) { - fprintf(stderr, "WARNING: using slow HeapSort because vqsort disabled\n"); + HWY_WARN("using slow HeapSort because vqsort disabled\n"); } detail::HeapSelect(st, keys, num, k); #endif // VQSORT_ENABLED diff --git a/hwy/contrib/thread_pool/topology.cc b/hwy/contrib/thread_pool/topology.cc index 0cca9bdcd8..2183fd30b8 100644 --- a/hwy/contrib/thread_pool/topology.cc +++ b/hwy/contrib/thread_pool/topology.cc @@ -95,8 +95,7 @@ HWY_CONTRIB_DLLEXPORT size_t TotalLogicalProcessors() { // upper bound. const long ret = sysconf(_SC_NPROCESSORS_CONF); // NOLINT(runtime/int) if (ret < 0) { - fprintf(stderr, "Unexpected value of _SC_NPROCESSORS_CONF: %d\n", - static_cast(ret)); + HWY_WARN("Unexpected _SC_NPROCESSORS_CONF = %d\n", static_cast(ret)); } else { lp = static_cast(ret); } @@ -104,13 +103,12 @@ HWY_CONTRIB_DLLEXPORT size_t TotalLogicalProcessors() { if (HWY_UNLIKELY(lp == 0)) { // Failed to detect. HWY_IF_CONSTEXPR(HWY_IS_DEBUG_BUILD) { - fprintf(stderr, - "Unknown TotalLogicalProcessors, assuming 1. " - "HWY_OS_: WIN=%d LINUX=%d APPLE=%d;\n" - "HWY_ARCH_: WASM=%d X86=%d PPC=%d ARM=%d RISCV=%d S390X=%d\n", - HWY_OS_WIN, HWY_OS_LINUX, HWY_OS_APPLE, HWY_ARCH_WASM, - HWY_ARCH_X86, HWY_ARCH_PPC, HWY_ARCH_ARM, HWY_ARCH_RISCV, - HWY_ARCH_S390X); + HWY_WARN( + "Unknown TotalLogicalProcessors, assuming 1. " + "HWY_OS_: WIN=%d LINUX=%d APPLE=%d;\n" + "HWY_ARCH_: WASM=%d X86=%d PPC=%d ARM=%d RISCV=%d S390X=%d\n", + HWY_OS_WIN, HWY_OS_LINUX, HWY_OS_APPLE, HWY_ARCH_WASM, HWY_ARCH_X86, + HWY_ARCH_PPC, HWY_ARCH_ARM, HWY_ARCH_RISCV, HWY_ARCH_S390X); } return 1; } @@ -118,8 +116,8 @@ HWY_CONTRIB_DLLEXPORT size_t TotalLogicalProcessors() { // Warn that we are clamping. 
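// Usage sketch for the HWY_WARN macro introduced in base.h above: printf-style
// arguments are forwarded to hwy::Warn(), which calls the handler installed
// via SetWarnFunc() or falls back to stderr. Variable names here are made up.
//   if (bytes_read != expected_bytes) {
//     HWY_WARN("short read: %zu of %zu bytes\n", bytes_read, expected_bytes);
//   }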
if (HWY_UNLIKELY(lp > kMaxLogicalProcessors)) { HWY_IF_CONSTEXPR(HWY_IS_DEBUG_BUILD) { - fprintf(stderr, "OS reports %zu processors but clamping to %zu\n", lp, - kMaxLogicalProcessors); + HWY_WARN("OS reports %zu processors but clamping to %zu\n", lp, + kMaxLogicalProcessors); } lp = kMaxLogicalProcessors; } @@ -254,7 +252,7 @@ class File { if (errno == EINTR) continue; // signal: retry if (errno == ENOENT) return; // not found, give up if (HWY_IS_DEBUG_BUILD) { - fprintf(stderr, "Unexpected error opening %s: %d\n", path, errno); + HWY_WARN("Unexpected error opening %s: %d\n", path, errno); } return; // unknown error, give up } @@ -267,7 +265,7 @@ class File { if (ret == 0) break; // success if (errno == EINTR) continue; // signal: retry if (HWY_IS_DEBUG_BUILD) { - fprintf(stderr, "Unexpected error closing file: %d\n", errno); + HWY_WARN("Unexpected error closing file: %d\n", errno); } return; // unknown error, ignore } @@ -288,7 +286,7 @@ class File { if (bytes_read == -1) { if (errno == EINTR) continue; // signal: retry if (HWY_IS_DEBUG_BUILD) { - fprintf(stderr, "Unexpected error reading file: %d\n", errno); + HWY_WARN("Unexpected error reading file: %d\n", errno); } return 0; } @@ -620,7 +618,7 @@ static bool InitCachesSysfs(Caches& caches) { const std::vector package_sizes = DetectPackages(lps); // `package_sizes` is only used to check that `lps` were filled. if (package_sizes.empty()) { - fprintf(stderr, "WARN: no packages, shared cache sizes may be incorrect\n"); + HWY_WARN("no packages, shared cache sizes may be incorrect\n"); return false; } @@ -635,8 +633,7 @@ static bool InitCachesSysfs(Caches& caches) { // Check before overwriting any fields. if (c.size_kib != 0) { - fprintf(stderr, "WARN: ignoring another L%u, first size %u\n", level, - c.size_kib); + HWY_WARN("ignoring another L%u, first size %u\n", level, c.size_kib); continue; } @@ -644,8 +641,7 @@ static bool InitCachesSysfs(Caches& caches) { WriteSysfs("ways_of_associativity", i, &c.associativity) && WriteSysfs("number_of_sets", i, &c.sets); if (HWY_UNLIKELY(!ok)) { - fprintf(stderr, "WARN: skipping partially-detected L%u, error %d\n", - level, errno); + HWY_WARN("skipping partially-detected L%u, error %d\n", level, errno); c = Cache(); continue; } @@ -659,8 +655,7 @@ static bool InitCachesSysfs(Caches& caches) { // Divide by number of *cores* sharing the cache. const std::string shared_str = ReadString("shared_cpu_list", i); if (HWY_UNLIKELY(shared_str.empty())) { - fprintf(stderr, "WARN: no shared_cpu_list for L%u %s\n", level, - type.c_str()); + HWY_WARN("no shared_cpu_list for L%u %s\n", level, type.c_str()); c.cores_sharing = 1; } else { const std::vector shared_lps = @@ -670,13 +665,13 @@ static bool InitCachesSysfs(Caches& caches) { if (HWY_LIKELY(lp < lps.size())) { num_cores += lps[lp].smt == 0; } else { - fprintf(stderr, "WARN: out of bounds lp %zu of %zu from %s\n", lp, - lps.size(), shared_str.c_str()); + HWY_WARN("out of bounds lp %zu of %zu from %s\n", lp, lps.size(), + shared_str.c_str()); } } if (num_cores == 0) { - fprintf(stderr, "WARN: no cores sharing L%u %s, setting to 1\n", level, - type.c_str()); + HWY_WARN("no cores sharing L%u %s, setting to 1\n", level, + type.c_str()); num_cores = 1; } c.cores_sharing = static_cast(num_cores); @@ -689,8 +684,8 @@ static bool InitCachesSysfs(Caches& caches) { // Require L1 and L2 cache. 
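// Worked example for the shared_cpu_list handling above, with illustrative
// numbers only: "0-3" on a part with 2-way SMT expands to lps {0, 1, 2, 3},
// of which two have smt == 0, so cores_sharing becomes 2 and the cache is
// treated as shared by two physical cores.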
if (HWY_UNLIKELY(caches[1].size_kib == 0 || caches[2].size_kib == 0)) { - fprintf(stderr, "WARN: sysfs detected L1=%u L2=%u, err %x\n", - caches[1].size_kib, caches[2].size_kib, errno); + HWY_WARN("sysfs detected L1=%u L2=%u, err %x\n", caches[1].size_kib, + caches[2].size_kib, errno); return false; } @@ -729,10 +724,8 @@ bool ForEachSLPI(LOGICAL_PROCESSOR_RELATIONSHIP rel, Func&& func) { pos += info->Size; } if (pos != buf + buf_bytes) { - fprintf( - stderr, - "WARN: unexpected pos %p, end %p, buf_bytes %lu, sizeof(SLPI) %zu\n", - pos, buf + buf_bytes, buf_bytes, sizeof(SLPI)); + HWY_WARN("unexpected pos %p, end %p, buf_bytes %lu, sizeof(SLPI) %zu\n", + pos, buf + buf_bytes, buf_bytes, sizeof(SLPI)); } free(buf); @@ -780,7 +773,7 @@ static bool InitCachesWin(Caches& caches) { // `max_logical_per_core`, hence round up. shared_with = DivCeil(shared_with, max_logical_per_core); if (shared_with == 0) { - fprintf(stderr, "WARN: no cores sharing L%u, setting to 1\n", cr.Level); + HWY_WARN("no cores sharing L%u, setting to 1\n", cr.Level); shared_with = 1; } @@ -794,8 +787,8 @@ static bool InitCachesWin(Caches& caches) { // Require L1 and L2 cache. if (HWY_UNLIKELY(caches[1].size_kib == 0 || caches[2].size_kib == 0)) { - fprintf(stderr, "WARN: Windows detected L1=%u, L2=%u, err %lx\n", - caches[1].size_kib, caches[2].size_kib, GetLastError()); + HWY_WARN("Windows detected L1=%u, L2=%u, err %lx\n", caches[1].size_kib, + caches[2].size_kib, GetLastError()); return false; } @@ -836,7 +829,7 @@ static bool InitCachesApple(Caches& caches) { ok &= Sysctl("hw.perflevel0.l2cachesize", 1024, err, &L2.size_kib) || Sysctl("hw.l2cachesize", 1024, err, &L2.size_kib); if (HWY_UNLIKELY(!ok)) { - fprintf(stderr, "WARN: Apple cache detection failed, error %d\n", err); + HWY_WARN("Apple cache detection failed, error %d\n", err); return false; } L1.cores_sharing = 1; @@ -853,7 +846,7 @@ static bool InitCachesApple(Caches& caches) { if (!sysctlbyname("machdep.cpu.brand_string", brand, &size, nullptr, 0)) { if (!strncmp(brand, "Apple ", 6)) { // Unexpected, but we will continue check the string suffixes. 
- fprintf(stderr, "WARN: unexpected Apple brand %s\n", brand); + HWY_WARN("unexpected Apple brand %s\n", brand); } if (brand[6] == 'M') { @@ -961,7 +954,7 @@ static const Cache* InitDataCaches() { #elif HWY_OS_APPLE if (HWY_UNLIKELY(!InitCachesApple(caches))) return nullptr; #else - fprintf(stderr, "Cache detection not implemented for this platform.\n"); + HWY_WARN("Cache detection not implemented for this platform.\n"); (void)caches; return nullptr; #define HWY_NO_CACHE_DETECTION diff --git a/hwy/examples/benchmark.cc b/hwy/examples/benchmark.cc index 98aae6819b..2f48e52b0e 100644 --- a/hwy/examples/benchmark.cc +++ b/hwy/examples/benchmark.cc @@ -75,7 +75,7 @@ void RunBenchmark(const char* caption) { [&benchmark](const FuncInput input) { return benchmark(input); }, inputs, kNumInputs, results, p); if (num_results != kNumInputs) { - fprintf(stderr, "MeasureClosure failed.\n"); + HWY_WARN("MeasureClosure failed.\n"); } benchmark.Verify(num_items); @@ -147,17 +147,14 @@ class BenchmarkDot : public TwoArray { } void Verify(size_t num_items) { if (dot_ == -1.0f) { - fprintf(stderr, "Dot: must call Verify after benchmark"); - abort(); + HWY_ABORT("Dot: must call Verify after benchmark"); } const float expected = std::inner_product(a_.get(), a_.get() + num_items, b_, 0.0f); const float rel_err = std::abs(expected - dot_) / expected; if (rel_err > 1.1E-6f) { - fprintf(stderr, "Dot: expected %e actual %e (%e)\n", expected, dot_, - rel_err); - abort(); + HWY_ABORT("Dot: expected %e actual %e (%e)\n", expected, dot_, rel_err); } } @@ -214,7 +211,7 @@ struct BenchmarkDelta : public TwoArray { const float expected = (i == 0) ? a_[0] : a_[i] - a_[i - 1]; const float err = std::abs(expected - b_[i]); if (err > 1E-6f) { - fprintf(stderr, "Delta: expected %e, actual %e\n", expected, b_[i]); + HWY_WARN("Delta: expected %e, actual %e\n", expected, b_[i]); } } } diff --git a/hwy/nanobenchmark.cc b/hwy/nanobenchmark.cc index ea5549f3d1..0dec0bc469 100644 --- a/hwy/nanobenchmark.cc +++ b/hwy/nanobenchmark.cc @@ -24,6 +24,7 @@ #include #include +#include "hwy/base.h" #include "hwy/robust_statistics.h" #include "hwy/timer-inl.h" #include "hwy/timer.h" @@ -76,7 +77,9 @@ timer::Ticks SampleUntilStable(const double max_rel_mad, double* rel_mad, // For "few" (depends also on the variance) samples, Median is safer. est = robust_statistics::Median(samples.data(), samples.size()); } - NANOBENCHMARK_CHECK(est != 0); + if (est == 0) { + HWY_WARN("estimated duration is 0\n"); + } // Median absolute deviation (mad) is a robust measure of 'variability'. const timer::Ticks abs_mad = robust_statistics::MedianAbsoluteDeviation( @@ -194,9 +197,9 @@ void FillSubset(const InputVec& full, const FuncInput input_to_skip, (*subset)[idx_subset++] = next; } } - NANOBENCHMARK_CHECK(idx_subset == subset->size()); - NANOBENCHMARK_CHECK(idx_omit == omit.size()); - NANOBENCHMARK_CHECK(occurrence == count - 1); + HWY_DASSERT(idx_subset == subset->size()); + HWY_DASSERT(idx_omit == omit.size()); + HWY_DASSERT(occurrence == count - 1); } // Returns total ticks elapsed for all inputs. 
@@ -239,12 +242,11 @@ HWY_DLLEXPORT int Unpredictable1() { return timer::Start() != ~0ULL; } HWY_DLLEXPORT size_t Measure(const Func func, const uint8_t* arg, const FuncInput* inputs, const size_t num_inputs, Result* results, const Params& p) { - NANOBENCHMARK_CHECK(num_inputs != 0); + HWY_DASSERT(num_inputs != 0); char cpu100[100]; if (!platform::HaveTimerStop(cpu100)) { - fprintf(stderr, "CPU '%s' does not support RDTSCP, skipping benchmark.\n", - cpu100); + HWY_WARN("CPU '%s' does not support RDTSCP, skipping benchmark.\n", cpu100); return 0; } @@ -262,8 +264,8 @@ HWY_DLLEXPORT size_t Measure(const Func func, const uint8_t* arg, const timer::Ticks overhead = Overhead(arg, &full, p); const timer::Ticks overhead_skip = Overhead(arg, &subset, p); if (overhead < overhead_skip) { - fprintf(stderr, "Measurement failed: overhead %d < %d\n", - static_cast(overhead), static_cast(overhead_skip)); + HWY_WARN("Measurement failed: overhead %d < %d\n", + static_cast(overhead), static_cast(overhead_skip)); return 0; } @@ -282,8 +284,8 @@ HWY_DLLEXPORT size_t Measure(const Func func, const uint8_t* arg, TotalDuration(func, arg, &subset, p, &max_rel_mad); if (total < total_skip) { - fprintf(stderr, "Measurement failed: total %f < %f\n", - static_cast(total), static_cast(total_skip)); + HWY_WARN("Measurement failed: total %f < %f\n", + static_cast(total), static_cast(total_skip)); return 0; } diff --git a/hwy/nanobenchmark.h b/hwy/nanobenchmark.h index 46bfc4b0a8..eefe6fb7e0 100644 --- a/hwy/nanobenchmark.h +++ b/hwy/nanobenchmark.h @@ -49,25 +49,7 @@ #include #include "hwy/highway_export.h" -#include "hwy/timer.h" - -// Enables sanity checks that verify correct operation at the cost of -// longer benchmark runs. -#ifndef NANOBENCHMARK_ENABLE_CHECKS -#define NANOBENCHMARK_ENABLE_CHECKS 0 -#endif - -#define NANOBENCHMARK_CHECK_ALWAYS(condition) \ - while (!(condition)) { \ - fprintf(stderr, "Nanobenchmark check failed at line %d\n", __LINE__); \ - abort(); \ - } - -#if NANOBENCHMARK_ENABLE_CHECKS -#define NANOBENCHMARK_CHECK(condition) NANOBENCHMARK_CHECK_ALWAYS(condition) -#else -#define NANOBENCHMARK_CHECK(condition) -#endif +#include "hwy/timer.h" // IWYU pragma: export namespace hwy { diff --git a/hwy/nanobenchmark_test.cc b/hwy/nanobenchmark_test.cc index 9ec004910d..6b3a8f97ef 100644 --- a/hwy/nanobenchmark_test.cc +++ b/hwy/nanobenchmark_test.cc @@ -72,7 +72,7 @@ void MeasureRandom(const FuncInput (&inputs)[N]) { p.verbose = false; const size_t num_results = Measure(&Random, nullptr, inputs, N, results, p); for (size_t i = 0; i < num_results; ++i) { - NANOBENCHMARK_CHECK(results[i].variability > 1E-3); + HWY_ASSERT(results[i].variability > 1E-3); } } diff --git a/hwy/targets.cc b/hwy/targets.cc index b246f1fdd6..b6c2419ba2 100644 --- a/hwy/targets.cc +++ b/hwy/targets.cc @@ -707,12 +707,11 @@ int64_t DetectTargets() { if ((bits & HWY_ENABLED_BASELINE) != HWY_ENABLED_BASELINE) { const uint64_t bits_u = static_cast(bits); const uint64_t enabled = static_cast(HWY_ENABLED_BASELINE); - fprintf(stderr, - "WARNING: CPU supports 0x%08x%08x, software requires 0x%08x%08x\n", - static_cast(bits_u >> 32), - static_cast(bits_u & 0xFFFFFFFF), - static_cast(enabled >> 32), - static_cast(enabled & 0xFFFFFFFF)); + HWY_WARN("CPU supports 0x%08x%08x, software requires 0x%08x%08x\n", + static_cast(bits_u >> 32), + static_cast(bits_u & 0xFFFFFFFF), + static_cast(enabled >> 32), + static_cast(enabled & 0xFFFFFFFF)); } return bits; diff --git a/hwy/tests/tuple_test.cc b/hwy/tests/tuple_test.cc index 
60fe14e9c6..6e7ddfa24b 100644 --- a/hwy/tests/tuple_test.cc +++ b/hwy/tests/tuple_test.cc @@ -72,8 +72,7 @@ struct TestCreateAndSet { HWY_ASSERT_VEC_EQ(d, v0, Get4<3>(t4)); #else (void)d; - fprintf(stderr, "Warning: tuples are disabled for target %s\n", - hwy::TargetName(HWY_TARGET)); + HWY_WARN("Tuples disabled for target %s\n", hwy::TargetName(HWY_TARGET)); #endif // HWY_HAVE_TUPLE } }; From 89b26786b46845ca19fa535e72e10835ad7f8110 Mon Sep 17 00:00:00 2001 From: Jan Wassenberg Date: Thu, 21 Nov 2024 01:12:34 -0800 Subject: [PATCH 08/64] topology fixes for M3 PiperOrigin-RevId: 698685120 --- hwy/contrib/thread_pool/topology.cc | 80 +++++++++++++++++------------ 1 file changed, 46 insertions(+), 34 deletions(-) diff --git a/hwy/contrib/thread_pool/topology.cc b/hwy/contrib/thread_pool/topology.cc index 2183fd30b8..49d9b67bd0 100644 --- a/hwy/contrib/thread_pool/topology.cc +++ b/hwy/contrib/thread_pool/topology.cc @@ -25,7 +25,7 @@ #include #include -#include "hwy/detect_compiler_arch.h" // HWY_OS_WIN +#include "hwy/base.h" // HWY_OS_WIN, HWY_WARN #if HWY_OS_APPLE #include @@ -68,8 +68,6 @@ #include #endif -#include "hwy/base.h" - namespace hwy { HWY_CONTRIB_DLLEXPORT bool HaveThreadingSupport() { @@ -80,6 +78,39 @@ HWY_CONTRIB_DLLEXPORT bool HaveThreadingSupport() { #endif } +// Returns `whole / part`, with a check that `part` evenly divides `whole`, +// which implies the result is exact. +static HWY_MAYBE_UNUSED size_t DivByFactor(size_t whole, size_t part) { + HWY_ASSERT(part != 0); + const size_t div = whole / part; + const size_t mul = div * part; + if (mul != whole) { + HWY_ABORT("%zu / %zu = %zu; *%zu = %zu\n", whole, part, div, part, mul); + } + return div; +} + +#if HWY_OS_APPLE + +// Returns whether sysctlbyname() succeeded; if so, writes `val / div` to +// `out`, otherwise sets `err`. +template +bool Sysctl(const char* name, size_t div, int& err, T* out) { + size_t val = 0; + size_t size = sizeof(val); + // Last two arguments are for updating the value, which we do not want. + const int ret = sysctlbyname(name, &val, &size, nullptr, 0); + if (HWY_UNLIKELY(ret != 0)) { + // Do not print warnings because some `name` are expected to fail. + err = ret; + return false; + } + *out = static_cast(DivByFactor(val, div)); + return true; +} + +#endif // HWY_OS_APPLE + HWY_CONTRIB_DLLEXPORT size_t TotalLogicalProcessors() { size_t lp = 0; #if HWY_ARCH_WASM @@ -99,6 +130,11 @@ HWY_CONTRIB_DLLEXPORT size_t TotalLogicalProcessors() { } else { lp = static_cast(ret); } +#elif HWY_OS_APPLE + int err; + if (!Sysctl("hw.logicalcpu", 1, err, &lp)) { + lp = 0; + } #endif if (HWY_UNLIKELY(lp == 0)) { // Failed to detect. @@ -568,18 +604,6 @@ HWY_CONTRIB_DLLEXPORT Topology::Topology() { using Caches = std::array; -// Returns `whole / part`, with a check that `part` evenly divides `whole`, -// which implies the result is exact. -static HWY_MAYBE_UNUSED size_t DivByFactor(size_t whole, size_t part) { - HWY_ASSERT(part != 0); - const size_t div = whole / part; - const size_t mul = div * part; - if (mul != whole) { - HWY_ABORT("%zu / %zu = %zu; *%zu = %zu\n", whole, part, div, part, mul); - } - return div; -} - // We assume homogeneous caches across all clusters because some OS APIs return // a single value for a class of CPUs. @@ -798,22 +822,6 @@ static bool InitCachesWin(Caches& caches) { #endif // HWY_OS_WIN #if HWY_OS_APPLE -// Returns whether sysctlbyname() succeeded; if so, writes `val / div` to -// `out`, otherwise sets `err`. 
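// Usage sketch for the Sysctl helper documented above; `l2_kib` is a
// hypothetical variable, and div = 1024 converts the byte count reported by
// the OS into KiB.
//   int err = 0;
//   size_t l2_kib = 0;
//   if (Sysctl("hw.l2cachesize", 1024, err, &l2_kib)) {
//     // l2_kib now holds the L2 size in KiB.
//   } else {
//     HWY_WARN("hw.l2cachesize failed, error %d\n", err);
//   }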
-template -bool Sysctl(const char* name, size_t div, int& err, T* out) { - size_t val = 0; - size_t size = sizeof(val); - // Last two arguments are for updating the value, which we do not want. - const int ret = sysctlbyname(name, &val, &size, nullptr, 0); - if (HWY_UNLIKELY(ret != 0)) { - // Do not print warnings because some `name` are expected to fail. - err = ret; - return false; - } - *out = static_cast(DivByFactor(val, div)); - return true; -} static bool InitCachesApple(Caches& caches) { int err = 0; @@ -834,7 +842,9 @@ static bool InitCachesApple(Caches& caches) { } L1.cores_sharing = 1; if (Sysctl("hw.perflevel0.cpusperl2", 1, err, &L2.cores_sharing)) { - L2.size_kib = DivByFactor(L2.size_kib, L2.cores_sharing); + // There exist CPUs for which L2 is not evenly divisible by `cores_sharing`, + // hence do not use `DivByFactor`. It is safer to round down. + L2.size_kib /= L2.cores_sharing; } else { L2.cores_sharing = 1; } @@ -844,7 +854,7 @@ static bool InitCachesApple(Caches& caches) { char brand[128] = {0}; size_t size = sizeof(brand); if (!sysctlbyname("machdep.cpu.brand_string", brand, &size, nullptr, 0)) { - if (!strncmp(brand, "Apple ", 6)) { + if (strncmp(brand, "Apple ", 6) != 0) { // Unexpected, but we will continue check the string suffixes. HWY_WARN("unexpected Apple brand %s\n", brand); } @@ -900,8 +910,10 @@ static bool InitCachesApple(Caches& caches) { if (L3.size_kib == 0 && (Sysctl("hw.perflevel0.l3cachesize", 1024, err, &L3.size_kib) || Sysctl("hw.l3cachesize", 1024, err, &L3.size_kib))) { + // There exist CPUs for which L3 is not evenly divisible by `cores_sharing`, + // hence do not use `DivByFactor`. It is safer to round down. if (Sysctl("hw.perflevel0.cpusperl3", 1, err, &L3.cores_sharing)) { - L3.size_kib = DivByFactor(L3.size_kib, L3.cores_sharing); + L3.size_kib /= L3.cores_sharing; } else { L3.cores_sharing = 1; } From d22ccd0524311aeb8dd8769268e497f8bdb80a07 Mon Sep 17 00:00:00 2001 From: Jan Wassenberg Date: Mon, 25 Nov 2024 02:35:19 -0800 Subject: [PATCH 09/64] Add topology support for Windows and Apple Also reduce #if in mainline by factoring into helper functions, share code between Linux and FreeBSD Use anon namespace instead of static PiperOrigin-RevId: 699906189 --- hwy/contrib/thread_pool/topology.cc | 605 ++++++++++++++++------- hwy/contrib/thread_pool/topology_test.cc | 4 + 2 files changed, 419 insertions(+), 190 deletions(-) diff --git a/hwy/contrib/thread_pool/topology.cc b/hwy/contrib/thread_pool/topology.cc index 49d9b67bd0..60ffbff829 100644 --- a/hwy/contrib/thread_pool/topology.cc +++ b/hwy/contrib/thread_pool/topology.cc @@ -55,14 +55,16 @@ #include #include #include +#include #include #include // sysconf #endif // HWY_OS_LINUX || HWY_OS_FREEBSD #if HWY_OS_FREEBSD -// must come after sys/types.h. -#include // CPU_SET -#endif // HWY_OS_FREEBSD +#include +// After param.h / types.h. +#include +#endif // HWY_OS_FREEBSD #if HWY_ARCH_WASM #include @@ -78,9 +80,11 @@ HWY_CONTRIB_DLLEXPORT bool HaveThreadingSupport() { #endif } +namespace { + // Returns `whole / part`, with a check that `part` evenly divides `whole`, // which implies the result is exact. 
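// Worked example of the contract documented above, with illustrative numbers:
// DivByFactor(12288, 4) returns exactly 3072 because 3072 * 4 == 12288,
// whereas DivByFactor(100, 3) would HWY_ABORT because 33 * 3 != 100.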
-static HWY_MAYBE_UNUSED size_t DivByFactor(size_t whole, size_t part) { +HWY_MAYBE_UNUSED size_t DivByFactor(size_t whole, size_t part) { HWY_ASSERT(part != 0); const size_t div = whole / part; const size_t mul = div * part; @@ -90,7 +94,85 @@ static HWY_MAYBE_UNUSED size_t DivByFactor(size_t whole, size_t part) { return div; } -#if HWY_OS_APPLE +#if HWY_OS_WIN + +using SLPI = SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX; + +template +bool ForEachSLPI(LOGICAL_PROCESSOR_RELATIONSHIP rel, Func&& func) { + // Get required buffer size. + DWORD buf_bytes = 0; + HWY_ASSERT(!GetLogicalProcessorInformationEx(rel, nullptr, &buf_bytes)); + // Observed when `rel` is not supported: + if (HWY_UNLIKELY(buf_bytes == 0 && GetLastError() == ERROR_GEN_FAILURE)) { + if (rel != RelationNumaNodeEx && rel != RelationProcessorDie) { + HWY_WARN("Unexpected err %lx for GLPI relationship %d\n", GetLastError(), + static_cast(rel)); + } + return false; + } + HWY_ASSERT(GetLastError() == ERROR_INSUFFICIENT_BUFFER); + // Note: `buf_bytes` may be less than `sizeof(SLPI)`, which has padding. + uint8_t* buf = static_cast(malloc(buf_bytes)); + HWY_ASSERT(buf); + + // Fill the buffer. + SLPI* info = reinterpret_cast(buf); + if (HWY_UNLIKELY(!GetLogicalProcessorInformationEx(rel, info, &buf_bytes))) { + free(buf); + return false; + } + + // Iterate over each SLPI. `sizeof(SLPI)` is unreliable, see above. + uint8_t* pos = buf; + while (pos < buf + buf_bytes) { + info = reinterpret_cast(pos); + HWY_ASSERT(rel == RelationAll || info->Relationship == rel); + func(*info); + pos += info->Size; + } + if (pos != buf + buf_bytes) { + HWY_WARN("unexpected pos %p, end %p, buf_bytes %lu, sizeof(SLPI) %zu\n", + pos, buf + buf_bytes, buf_bytes, sizeof(SLPI)); + } + + free(buf); + return true; +} + +size_t NumBits(size_t num_groups, const GROUP_AFFINITY* affinity) { + size_t total_bits = 0; + for (size_t i = 0; i < num_groups; ++i) { + size_t bits = 0; + hwy::CopyBytes(&affinity[i].Mask, &bits); + total_bits += hwy::PopCount(bits); + } + return total_bits; +} + +// Calls `func(lp, lps)` for each index `lp` in the set, after ensuring that +// `lp < lps.size()`. `line` is for debugging via Warn(). +template +void ForeachBit(size_t num_groups, const GROUP_AFFINITY* affinity, + std::vector& lps, int line, const Func& func) { + for (size_t group = 0; group < num_groups; ++group) { + size_t bits = 0; + hwy::CopyBytes(&affinity[group].Mask, &bits); + while (bits != 0) { + size_t lp = Num0BitsBelowLS1Bit_Nonzero64(bits); + bits &= bits - 1; // clear LSB + if (HWY_UNLIKELY(lp >= lps.size())) { + Warn(__FILE__, __LINE__, + "Clamping lp %zu to lps.size() %zu, groups %zu\n", lp, lps.size(), + num_groups); + lp = lps.size() - 1; + } + func(lp, lps); + } + } +} + +#elif HWY_OS_APPLE // Returns whether sysctlbyname() succeeded; if so, writes `val / div` to // `out`, otherwise sets `err`. @@ -109,18 +191,22 @@ bool Sysctl(const char* name, size_t div, int& err, T* out) { return true; } -#endif // HWY_OS_APPLE +#endif // HWY_OS_* + +} // namespace HWY_CONTRIB_DLLEXPORT size_t TotalLogicalProcessors() { - size_t lp = 0; + size_t total_lps = 0; #if HWY_ARCH_WASM const int num_cores = emscripten_num_logical_cores(); - if (num_cores > 0) lp = static_cast(num_cores); + if (num_cores > 0) total_lps = static_cast(num_cores); #elif HWY_OS_WIN - SYSTEM_INFO sysinfo; - GetSystemInfo(&sysinfo); // always succeeds - // WARNING: this is only for the current group, hence limited to 64. 
- lp = static_cast(sysinfo.dwNumberOfProcessors); + // If there are multiple groups, this should return them all, rather than + // just the first 64, but VMs report less. + (void)ForEachSLPI(RelationProcessorCore, [&total_lps](const SLPI& info) { + const PROCESSOR_RELATIONSHIP& p = info.Processor; + total_lps += NumBits(p.GroupCount, p.GroupMask); + }); #elif HWY_OS_LINUX // Use configured, not "online" (_SC_NPROCESSORS_ONLN), because we want an // upper bound. @@ -128,16 +214,17 @@ HWY_CONTRIB_DLLEXPORT size_t TotalLogicalProcessors() { if (ret < 0) { HWY_WARN("Unexpected _SC_NPROCESSORS_CONF = %d\n", static_cast(ret)); } else { - lp = static_cast(ret); + total_lps = static_cast(ret); } #elif HWY_OS_APPLE int err; - if (!Sysctl("hw.logicalcpu", 1, err, &lp)) { - lp = 0; + // Only report P processors. + if (!Sysctl("hw.perflevel0.logicalcpu", 1, err, &total_lps)) { + total_lps = 0; } #endif - if (HWY_UNLIKELY(lp == 0)) { // Failed to detect. + if (HWY_UNLIKELY(total_lps == 0)) { // Failed to detect. HWY_IF_CONSTEXPR(HWY_IS_DEBUG_BUILD) { HWY_WARN( "Unknown TotalLogicalProcessors, assuming 1. " @@ -150,20 +237,78 @@ HWY_CONTRIB_DLLEXPORT size_t TotalLogicalProcessors() { } // Warn that we are clamping. - if (HWY_UNLIKELY(lp > kMaxLogicalProcessors)) { + if (HWY_UNLIKELY(total_lps > kMaxLogicalProcessors)) { HWY_IF_CONSTEXPR(HWY_IS_DEBUG_BUILD) { - HWY_WARN("OS reports %zu processors but clamping to %zu\n", lp, + HWY_WARN("OS reports %zu processors but clamping to %zu\n", total_lps, kMaxLogicalProcessors); } - lp = kMaxLogicalProcessors; + total_lps = kMaxLogicalProcessors; } - return lp; + return total_lps; } +// ------------------------------ Affinity + +#if HWY_OS_LINUX || HWY_OS_FREEBSD + +#if HWY_OS_LINUX +using CpuSet = cpu_set_t; +#else +using CpuSet = cpuset_t; +#endif + +// Helper functions reduce the number of #if in GetThreadAffinity. +int GetAffinity(CpuSet* set) { + // To specify the current thread, pass 0 on Linux/Android and -1 on FreeBSD. #ifdef __ANDROID__ -#include + return syscall(__NR_sched_getaffinity, 0, sizeof(CpuSet), set); +#elif HWY_OS_FREEBSD + return cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, sizeof(CpuSet), + set); +#else // normal Linux + return sched_getaffinity(0, sizeof(CpuSet), set); +#endif +} + +int SetAffinity(CpuSet* set) { + // To specify the current thread, pass 0 on Linux/Android and -1 on FreeBSD. 
+#ifdef __ANDROID__ + return syscall(__NR_sched_setaffinity, 0, sizeof(CpuSet), set); +#elif HWY_OS_FREEBSD + return cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, sizeof(CpuSet), + set); +#else // normal Linux + return sched_setaffinity(0, sizeof(CpuSet), set); +#endif +} + +bool IsSet(size_t lp, const CpuSet* set) { +#if HWY_COMPILER_GCC_ACTUAL + // Workaround for GCC compiler warning with CPU_ISSET macro + HWY_DIAGNOSTICS(push) + HWY_DIAGNOSTICS_OFF(disable : 4305 4309, ignored "-Wsign-conversion") +#endif + const int is_set = CPU_ISSET(static_cast(lp), set); +#if HWY_COMPILER_GCC_ACTUAL + HWY_DIAGNOSTICS(pop) #endif + return is_set != 0; +} + +void Set(size_t lp, CpuSet* set) { +#if HWY_COMPILER_GCC_ACTUAL + // Workaround for GCC compiler warning with CPU_SET macro + HWY_DIAGNOSTICS(push) + HWY_DIAGNOSTICS_OFF(disable : 4305 4309, ignored "-Wsign-conversion") +#endif + CPU_SET(static_cast(lp), set); +#if HWY_COMPILER_GCC_ACTUAL + HWY_DIAGNOSTICS(pop) +#endif +} + +#endif // HWY_OS_LINUX || HWY_OS_FREEBSD HWY_CONTRIB_DLLEXPORT bool GetThreadAffinity(LogicalProcessorSet& lps) { #if HWY_OS_WIN @@ -175,53 +320,18 @@ HWY_CONTRIB_DLLEXPORT bool GetThreadAffinity(LogicalProcessorSet& lps) { lps = LogicalProcessorSet(); // clear all lps.SetNonzeroBitsFrom64(prev); return true; -#elif HWY_OS_LINUX - cpu_set_t set; +#elif HWY_OS_LINUX || HWY_OS_FREEBSD + CpuSet set; CPU_ZERO(&set); - const pid_t pid = 0; // current thread -#ifdef __ANDROID__ - const int err = syscall(__NR_sched_getaffinity, pid, sizeof(cpu_set_t), &set); -#else - const int err = sched_getaffinity(pid, sizeof(cpu_set_t), &set); -#endif // __ANDROID__ + const int err = GetAffinity(&set); if (err != 0) return false; for (size_t lp = 0; lp < kMaxLogicalProcessors; ++lp) { -#if HWY_COMPILER_GCC_ACTUAL - // Workaround for GCC compiler warning with CPU_ISSET macro - HWY_DIAGNOSTICS(push) - HWY_DIAGNOSTICS_OFF(disable : 4305 4309, ignored "-Wsign-conversion") -#endif - if (CPU_ISSET(static_cast(lp), &set)) { - lps.Set(lp); - } -#if HWY_COMPILER_GCC_ACTUAL - HWY_DIAGNOSTICS(pop) -#endif - } - return true; -#elif HWY_OS_FREEBSD - cpuset_t set; - CPU_ZERO(&set); - const pid_t pid = getpid(); // current thread - const int err = cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, pid, - sizeof(cpuset_t), &set); - if (err != 0) return false; - for (size_t lp = 0; lp < kMaxLogicalProcessors; ++lp) { -#if HWY_COMPILER_GCC_ACTUAL - // Workaround for GCC compiler warning with CPU_ISSET macro - HWY_DIAGNOSTICS(push) - HWY_DIAGNOSTICS_OFF(disable : 4305 4309, ignored "-Wsign-conversion") -#endif - if (CPU_ISSET(static_cast(lp), &set)) { - lps.Set(lp); - } -#if HWY_COMPILER_GCC_ACTUAL - HWY_DIAGNOSTICS(pop) -#endif + if (IsSet(lp, &set)) lps.Set(lp); } return true; #else - // Do not even set lp=0 to force callers to handle this case. + // For HWY_OS_APPLE, affinity is not supported. Do not even set lp=0 to force + // callers to handle this case. 
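// Caller-side sketch (hypothetical application code) of the API implemented
// here: pin the current thread to logical processor 0 and report failure.
//   hwy::LogicalProcessorSet lps;  // default-constructed: empty set
//   lps.Set(0);
//   if (!hwy::SetThreadAffinity(lps)) {
//     HWY_WARN("Thread affinity not supported on this platform\n");
//   }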
(void)lps; return false; #endif @@ -232,41 +342,11 @@ HWY_CONTRIB_DLLEXPORT bool SetThreadAffinity(const LogicalProcessorSet& lps) { const HANDLE hThread = GetCurrentThread(); const DWORD_PTR prev = SetThreadAffinityMask(hThread, lps.Get64()); return prev != 0; -#elif HWY_OS_LINUX - cpu_set_t set; +#elif HWY_OS_LINUX || HWY_OS_FREEBSD + CpuSet set; CPU_ZERO(&set); -#if HWY_COMPILER_GCC_ACTUAL - // Workaround for GCC compiler warning with CPU_SET macro - HWY_DIAGNOSTICS(push) - HWY_DIAGNOSTICS_OFF(disable : 4305 4309, ignored "-Wsign-conversion") -#endif - lps.Foreach([&set](size_t lp) { CPU_SET(static_cast(lp), &set); }); -#if HWY_COMPILER_GCC_ACTUAL - HWY_DIAGNOSTICS(pop) -#endif - const pid_t pid = 0; // current thread -#ifdef __ANDROID__ - const int err = syscall(__NR_sched_setaffinity, pid, sizeof(cpu_set_t), &set); -#else - const int err = sched_setaffinity(pid, sizeof(cpu_set_t), &set); -#endif // __ANDROID__ - if (err != 0) return false; - return true; -#elif HWY_OS_FREEBSD - cpuset_t set; - CPU_ZERO(&set); -#if HWY_COMPILER_GCC_ACTUAL - // Workaround for GCC compiler warning with CPU_SET macro - HWY_DIAGNOSTICS(push) - HWY_DIAGNOSTICS_OFF(disable : 4305 4309, ignored "-Wsign-conversion") -#endif - lps.Foreach([&set](size_t lp) { CPU_SET(static_cast(lp), &set); }); -#if HWY_COMPILER_GCC_ACTUAL - HWY_DIAGNOSTICS(pop) -#endif - const pid_t pid = getpid(); // current thread - const int err = cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, pid, - sizeof(cpuset_t), &set); + lps.Foreach([&set](size_t lp) { Set(lp, &set); }); + const int err = SetAffinity(&set); if (err != 0) return false; return true; #else @@ -276,9 +356,15 @@ HWY_CONTRIB_DLLEXPORT bool SetThreadAffinity(const LogicalProcessorSet& lps) { #endif } -#if HWY_OS_LINUX namespace { +struct PackageSizes { + size_t num_clusters; + size_t num_cores; +}; + +#if HWY_OS_LINUX + class File { public: explicit File(const char* path) { @@ -426,11 +512,6 @@ class Remapper { size_t num_ = 0; }; -struct PackageSizes { - size_t num_clusters; - size_t num_cores; -}; - // For internal use by `DetectPackages`. struct PerPackage { Remapper clusters; @@ -557,11 +638,221 @@ void SetNodes(std::vector& lps) { } } +void SetClusterCacheSizes(std::vector& packages) { + for (size_t ip = 0; ip < packages.size(); ++ip) { + Topology::Package& p = packages[ip]; + for (size_t ic = 0; ic < p.clusters.size(); ++ic) { + Topology::Cluster& c = p.clusters[ic]; + const size_t lp = c.lps.First(); + size_t bytes; + if (ReadNumberWithOptionalSuffix(kL2Size, lp, &bytes)) { + c.private_kib = bytes >> 10; + } + if (ReadNumberWithOptionalSuffix(kL3Size, lp, &bytes)) { + c.shared_kib = bytes >> 10; + } + } + } +} + +#elif HWY_OS_WIN + +// Also sets LP.core and LP.smt. +size_t MaxLpsPerCore(std::vector& lps) { + size_t max_lps_per_core = 0; + size_t core_idx = 0; + (void)ForEachSLPI(RelationProcessorCore, [&max_lps_per_core, &core_idx, + &lps](const SLPI& info) { + const PROCESSOR_RELATIONSHIP& p = info.Processor; + const size_t lps_per_core = NumBits(p.GroupCount, p.GroupMask); + max_lps_per_core = HWY_MAX(max_lps_per_core, lps_per_core); + + size_t smt = 0; + ForeachBit(p.GroupCount, p.GroupMask, lps, __LINE__, + [core_idx, &smt](size_t lp, std::vector& lps) { + lps[lp].core = core_idx; + lps[lp].smt = smt++; + }); + ++core_idx; + }); + HWY_ASSERT(max_lps_per_core != 0); + return max_lps_per_core; +} + +// Interprets cluster (tyically a shared L3 cache) as a "processor die". Also +// sets LP.cluster. 
+size_t MaxCoresPerCluster(const size_t max_lps_per_core, + std::vector& lps) { + size_t max_cores_per_cluster = 0; + size_t cluster_idx = 0; + // Shared between `foreach_die` and `foreach_l3`. + const auto foreach_cluster = [&](size_t num_groups, + const GROUP_AFFINITY* groups) { + const size_t lps_per_cluster = NumBits(num_groups, groups); + // `max_lps_per_core` is an upper bound, hence round up. It is not an error + // if there is only one core per cluster - can happen for L3. + const size_t cores_per_cluster = DivCeil(lps_per_cluster, max_lps_per_core); + max_cores_per_cluster = HWY_MAX(max_cores_per_cluster, cores_per_cluster); + + ForeachBit(num_groups, groups, lps, __LINE__, + [cluster_idx](size_t lp, std::vector& lps) { + lps[lp].cluster = cluster_idx; + }); + ++cluster_idx; + }; + + // Passes group bits to `foreach_cluster`, depending on relationship type. + const auto foreach_die = [&foreach_cluster](const SLPI& info) { + const PROCESSOR_RELATIONSHIP& p = info.Processor; + foreach_cluster(p.GroupCount, p.GroupMask); + }; + const auto foreach_l3 = [&foreach_cluster](const SLPI& info) { + const CACHE_RELATIONSHIP& cr = info.Cache; + if (cr.Type != CacheUnified && cr.Type != CacheData) return; + if (cr.Level != 3) return; + foreach_cluster(cr.GroupCount, cr.GroupMasks); + }; + + if (!ForEachSLPI(RelationProcessorDie, foreach_die)) { + // Has been observed to fail; also check for shared L3 caches. + (void)ForEachSLPI(RelationCache, foreach_l3); + } + if (max_cores_per_cluster == 0) { + HWY_WARN("All clusters empty, assuming 1 core each\n"); + max_cores_per_cluster = 1; + } + return max_cores_per_cluster; +} + +// Initializes `lps` and returns a `PackageSizes` vector (empty on failure) +// indicating the number of clusters and cores per package. +std::vector DetectPackages(std::vector& lps) { + const size_t max_lps_per_core = MaxLpsPerCore(lps); + const size_t max_cores_per_cluster = + MaxCoresPerCluster(max_lps_per_core, lps); + + std::vector packages; + size_t package_idx = 0; + (void)ForEachSLPI(RelationProcessorPackage, [&](const SLPI& info) { + const PROCESSOR_RELATIONSHIP& p = info.Processor; + const size_t lps_per_package = NumBits(p.GroupCount, p.GroupMask); + PackageSizes ps; // avoid designated initializers for MSVC + ps.num_clusters = max_cores_per_cluster; + // `max_lps_per_core` is an upper bound, hence round up. + ps.num_cores = DivCeil(lps_per_package, max_lps_per_core); + packages.push_back(ps); + + ForeachBit(p.GroupCount, p.GroupMask, lps, __LINE__, + [package_idx](size_t lp, std::vector& lps) { + lps[lp].package = package_idx; + }); + ++package_idx; + }); + + return packages; +} + +// Sets LP.node for all `lps`. +void SetNodes(std::vector& lps) { + // Zero-initialize all nodes in case the below fails. + for (size_t lp = 0; lp < lps.size(); ++lp) { + lps[lp].node = 0; + } + + // We want the full NUMA nodes, but Windows Server 2022 truncates the results + // of `RelationNumaNode` to a single 64-LP group. To get the old, unlimited + // behavior without using the new `RelationNumaNodeEx` symbol, use the old + // `RelationAll` and filter the SLPI we want. + (void)ForEachSLPI(RelationAll, [&](const SLPI& info) { + if (info.Relationship != RelationNumaNode) return; + const NUMA_NODE_RELATIONSHIP& nn = info.NumaNode; + // This field was previously reserved/zero. There is at least one group. 
+ const size_t num_groups = HWY_MAX(1, nn.GroupCount); + const uint8_t node = static_cast(nn.NodeNumber); + ForeachBit(num_groups, nn.GroupMasks, lps, __LINE__, + [node](size_t lp, std::vector& lps) { + lps[lp].node = node; + }); + }); +} + +#elif HWY_OS_APPLE + +// Initializes `lps` and returns a `PackageSizes` vector (empty on failure) +// indicating the number of clusters and cores per package. +std::vector DetectPackages(std::vector& lps) { + int err; + + size_t total_cores = 0; + if (!Sysctl("hw.perflevel0.physicalcpu", 1, err, &total_cores)) { + HWY_WARN("Error %d detecting total_cores, assuming one per LP\n", err); + total_cores = lps.size(); + } + + if (lps.size() % total_cores != 0) { + HWY_WARN("LPs %zu not a multiple of total_cores %zu\n", lps.size(), + total_cores); + } + const size_t lp_per_core = DivCeil(lps.size(), total_cores); + + size_t cores_per_cluster = 0; + if (!Sysctl("hw.perflevel0.cpusperl2", 1, err, &cores_per_cluster)) { + HWY_WARN("Error %d detecting cores_per_cluster\n", err); + cores_per_cluster = HWY_MIN(4, total_cores); + } + + if (total_cores % cores_per_cluster != 0) { + HWY_WARN("total_cores %zu not a multiple of cores_per_cluster %zu\n", + total_cores, cores_per_cluster); + } + + for (size_t lp = 0; lp < lps.size(); ++lp) { + lps[lp].package = 0; // single package + lps[lp].core = static_cast(lp / lp_per_core); + lps[lp].smt = static_cast(lp % lp_per_core); + lps[lp].cluster = static_cast(lp / cores_per_cluster); + } + + PackageSizes ps; + ps.num_clusters = DivCeil(total_cores, cores_per_cluster); + ps.num_cores = total_cores; + return std::vector{ps}; +} + +// Sets LP.node for all `lps`. +void SetNodes(std::vector& lps) { + for (size_t lp = 0; lp < lps.size(); ++lp) { + lps[lp].node = 0; // no NUMA + } +} + +#endif // HWY_OS_* + +#if HWY_OS_WIN || HWY_OS_APPLE + +void SetClusterCacheSizes(std::vector& packages) { + // Assumes clusters are homogeneous. Otherwise, we would have to scan + // `RelationCache` again and find the corresponding package_idx. + const Cache* caches = DataCaches(); + const size_t private_kib = caches ? caches[2].size_kib : 0; + const size_t shared_kib = caches ? 
caches[3].size_kib : 0; + + for (size_t ip = 0; ip < packages.size(); ++ip) { + Topology::Package& p = packages[ip]; + for (size_t ic = 0; ic < p.clusters.size(); ++ic) { + Topology::Cluster& c = p.clusters[ic]; + c.private_kib = private_kib; + c.shared_kib = shared_kib; + } + } +} + +#endif // HWY_OS_WIN || HWY_OS_APPLE + } // namespace -#endif // HWY_OS_LINUX HWY_CONTRIB_DLLEXPORT Topology::Topology() { -#if HWY_OS_LINUX +#if HWY_OS_LINUX || HWY_OS_WIN || HWY_OS_APPLE lps.resize(TotalLogicalProcessors()); const std::vector& package_sizes = DetectPackages(lps); if (package_sizes.empty()) return; @@ -582,26 +873,14 @@ HWY_CONTRIB_DLLEXPORT Topology::Topology() { p.cores[lps[lp].core].lps.Set(lp); } - // Detect cache sizes (only once per cluster) - for (size_t ip = 0; ip < packages.size(); ++ip) { - Package& p = packages[ip]; - for (size_t ic = 0; ic < p.clusters.size(); ++ic) { - Cluster& c = p.clusters[ic]; - const size_t lp = c.lps.First(); - size_t bytes; - if (ReadNumberWithOptionalSuffix(kL2Size, lp, &bytes)) { - c.private_kib = bytes >> 10; - } - if (ReadNumberWithOptionalSuffix(kL3Size, lp, &bytes)) { - c.shared_kib = bytes >> 10; - } - } - } -#endif + SetClusterCacheSizes(packages); +#endif // HWY_OS_* } // ------------------------------ Cache detection +namespace { + using Caches = std::array; // We assume homogeneous caches across all clusters because some OS APIs return @@ -636,7 +915,7 @@ bool WriteSysfs(const char* name, size_t index, T* out) { // Reading from sysfs is preferred because sysconf returns L3 associativity = 0 // on some CPUs, and does not indicate sharing across cores. // https://www.kernel.org/doc/Documentation/ABI/testing/sysfs-devices-system-cpu -static bool InitCachesSysfs(Caches& caches) { +bool InitCachesSysfs(Caches& caches) { // For computing shared cache sizes. std::vector lps(TotalLogicalProcessors()); const std::vector package_sizes = DetectPackages(lps); @@ -717,69 +996,14 @@ static bool InitCachesSysfs(Caches& caches) { return true; } -#endif // HWY_OS_LINUX - -#if HWY_OS_WIN -using SLPI = SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX; - -template -bool ForEachSLPI(LOGICAL_PROCESSOR_RELATIONSHIP rel, Func&& func) { - // Get required buffer size. - DWORD buf_bytes = 0; - HWY_ASSERT(!GetLogicalProcessorInformationEx(rel, nullptr, &buf_bytes)); - HWY_ASSERT(GetLastError() == ERROR_INSUFFICIENT_BUFFER); - // Note: `buf_bytes` may be less than `sizeof(SLPI)`, which has padding. - uint8_t* buf = static_cast(malloc(buf_bytes)); - HWY_ASSERT(buf); - - // Fill the buffer. - SLPI* info = reinterpret_cast(buf); - if (HWY_UNLIKELY(!GetLogicalProcessorInformationEx(rel, info, &buf_bytes))) { - free(buf); - return false; - } - - // Iterate over each SLPI. `sizeof(SLPI)` is unreliable, see above. 
- uint8_t* pos = buf; - while (pos < buf + buf_bytes) { - info = reinterpret_cast(pos); - HWY_ASSERT(info->Relationship == rel); - func(*info); - pos += info->Size; - } - if (pos != buf + buf_bytes) { - HWY_WARN("unexpected pos %p, end %p, buf_bytes %lu, sizeof(SLPI) %zu\n", - pos, buf + buf_bytes, buf_bytes, sizeof(SLPI)); - } - - free(buf); - return true; -} - -static size_t NumBits(size_t num_groups, const GROUP_AFFINITY* affinity) { - size_t total_bits = 0; - for (size_t i = 0; i < num_groups; ++i) { - size_t bits = 0; - hwy::CopyBytes(&affinity[i].Mask, &bits); - total_bits += hwy::PopCount(bits); - } - return total_bits; -} - -static size_t MaxLogicalPerCore() { - size_t max_logical = 0; - ForEachSLPI(RelationProcessorCore, [&max_logical](const SLPI& info) { - const PROCESSOR_RELATIONSHIP& p = info.Processor; - max_logical = HWY_MAX(max_logical, NumBits(p.GroupCount, p.GroupMask)); - }); - HWY_ASSERT(max_logical != 0); - return max_logical; -} +#elif HWY_OS_WIN -static bool InitCachesWin(Caches& caches) { - const size_t max_logical_per_core = MaxLogicalPerCore(); +bool InitCachesWin(Caches& caches) { + std::vector lps(TotalLogicalProcessors()); + const size_t max_lps_per_core = MaxLpsPerCore(lps); - ForEachSLPI(RelationCache, [max_logical_per_core, &caches](const SLPI& info) { + (void)ForEachSLPI(RelationCache, [max_lps_per_core, + &caches](const SLPI& info) { const CACHE_RELATIONSHIP& cr = info.Cache; if (cr.Type != CacheUnified && cr.Type != CacheData) return; if (1 <= cr.Level && cr.Level <= 3) { @@ -794,8 +1018,8 @@ static bool InitCachesWin(Caches& caches) { // How many cores share this cache? size_t shared_with = NumBits(cr.GroupCount, cr.GroupMasks); // Divide out hyperthreads. This core may have fewer than - // `max_logical_per_core`, hence round up. - shared_with = DivCeil(shared_with, max_logical_per_core); + // `max_lps_per_core`, hence round up. + shared_with = DivCeil(shared_with, max_lps_per_core); if (shared_with == 0) { HWY_WARN("no cores sharing L%u, setting to 1\n", cr.Level); shared_with = 1; @@ -819,11 +1043,10 @@ static bool InitCachesWin(Caches& caches) { // L3 is optional; if not found, its size is already zero from static init. return true; } -#endif // HWY_OS_WIN -#if HWY_OS_APPLE +#elif HWY_OS_APPLE -static bool InitCachesApple(Caches& caches) { +bool InitCachesApple(Caches& caches) { int err = 0; Cache& L1 = caches[1]; Cache& L2 = caches[2]; @@ -930,11 +1153,11 @@ static bool InitCachesApple(Caches& caches) { return true; } -#endif // HWY_OS_APPLE +#endif // HWY_OS_* // Most APIs do not set the `sets` field, so compute it from the size and // associativity, and if a value is already set, ensure it matches. -static HWY_MAYBE_UNUSED void ComputeSets(Cache& c) { +HWY_MAYBE_UNUSED void ComputeSets(Cache& c) { // If there is no such cache, avoid division by zero. if (HWY_UNLIKELY(c.size_kib == 0)) { c.sets = 0; @@ -955,7 +1178,7 @@ static HWY_MAYBE_UNUSED void ComputeSets(Cache& c) { } } -static const Cache* InitDataCaches() { +const Cache* InitDataCaches() { alignas(64) static Caches caches; // On failure, return immediately because InitCaches*() already warn. 
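// The relationship ComputeSets() relies on, written out with field names as
// assumed from the Cache struct: a cache of size_kib KiB with W-way
// associativity and L bytes per line has
//   sets = (size_kib * 1024) / (W * L)
// For example, a 1024 KiB, 8-way cache with 64-byte lines has
// 1024 * 1024 / (8 * 64) = 2048 sets.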
@@ -989,6 +1212,8 @@ static const Cache* InitDataCaches() { #endif // HWY_NO_CACHE_DETECTION } +} // namespace + HWY_CONTRIB_DLLEXPORT const Cache* DataCaches() { static const Cache* caches = InitDataCaches(); return caches; diff --git a/hwy/contrib/thread_pool/topology_test.cc b/hwy/contrib/thread_pool/topology_test.cc index 38d5202f3f..dfa2b73017 100644 --- a/hwy/contrib/thread_pool/topology_test.cc +++ b/hwy/contrib/thread_pool/topology_test.cc @@ -48,6 +48,10 @@ TEST(TopologyTest, TestTopology) { Topology topology; if (topology.packages.empty()) return; + fprintf(stderr, "Topology: %zuP %zuX %zuC\n", topology.packages.size(), + topology.packages[0].clusters.size(), + topology.packages[0].clusters[0].lps.Count()); + HWY_ASSERT(!topology.lps.empty()); LogicalProcessorSet nodes; for (size_t lp = 0; lp < topology.lps.size(); ++lp) { From 7e01a07edfd2e000a74bfee3591c06a4773743c1 Mon Sep 17 00:00:00 2001 From: Jan Wassenberg Date: Mon, 25 Nov 2024 04:38:31 -0800 Subject: [PATCH 10/64] add LSX/LASX targets. Refs #2386 PiperOrigin-RevId: 699936076 --- g3doc/impl_details.md | 5 ++ hwy/detect_compiler_arch.h | 21 ++++++- hwy/detect_targets.h | 42 ++++++++++++-- hwy/foreach_target.h | 24 ++++++++ hwy/highway.h | 112 +++++++++++++++++++++--------------- hwy/ops/loongarch_lsx-inl.h | 16 ++++++ hwy/ops/set_macros-inl.h | 33 +++++++++++ hwy/targets.cc | 22 +++++++ hwy/targets.h | 15 +++++ hwy/targets_test.cc | 7 +++ 10 files changed, 244 insertions(+), 53 deletions(-) create mode 100644 hwy/ops/loongarch_lsx-inl.h diff --git a/g3doc/impl_details.md b/g3doc/impl_details.md index 3479d6fd09..8b5db6a0eb 100644 --- a/g3doc/impl_details.md +++ b/g3doc/impl_details.md @@ -249,6 +249,11 @@ For ZVector targets `HWY_Z14`, `HWY_Z15`, `HWY_Z16`, there is the (requires IBMid login), plus a [searchable reference](https://www.ibm.com/docs/en/zos/2.5.0?topic=topics-using-vector-programming-support). +For LoongArch, there is a +[list of intrinsics](https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/integer_computation/) +and +[ISA reference](https://loongson.github.io/LoongArch-Documentation/LoongArch-Vol1-EN.html). + ## Why scalar target There can be various reasons to avoid using vector intrinsics: diff --git a/hwy/detect_compiler_arch.h b/hwy/detect_compiler_arch.h index 94b49cb728..9d4d56b0a0 100644 --- a/hwy/detect_compiler_arch.h +++ b/hwy/detect_compiler_arch.h @@ -303,10 +303,29 @@ #define HWY_ARCH_S390X 0 #endif +#if defined(__loongarch64__) || defined(__loongarch64) || \ + (defined(__loongarch_grlen) && __loongarch_grlen == 64) +#define HWY_ARCH_LOONGARCH_64 1 +#else +#define HWY_ARCH_LOONGARCH_64 0 +#endif + +#if defined(__loongarch__) && !HWY_ARCH_LOONGARCH_64 +#define HWY_ARCH_LOONGARCH_32 1 +#else +#define HWY_ARCH_LOONGARCH_32 0 +#endif + +#if HWY_ARCH_LOONGARCH_64 || HWY_ARCH_LOONGARCH_32 +#define HWY_ARCH_LOONGARCH 1 +#else +#define HWY_ARCH_LOONGARCH 0 +#endif + // It is an error to detect multiple architectures at the same time, but OK to // detect none of the above. 
#if (HWY_ARCH_X86 + HWY_ARCH_PPC + HWY_ARCH_ARM + HWY_ARCH_ARM_OLD + \ - HWY_ARCH_WASM + HWY_ARCH_RISCV + HWY_ARCH_S390X) > 1 + HWY_ARCH_WASM + HWY_ARCH_RISCV + HWY_ARCH_S390X + HWY_ARCH_LOONGARCH) > 1 #error "Must not detect more than one architecture" #endif diff --git a/hwy/detect_targets.h b/hwy/detect_targets.h index d5f3ab07ae..df0d9c9ef4 100644 --- a/hwy/detect_targets.h +++ b/hwy/detect_targets.h @@ -107,8 +107,14 @@ // Bit 38 reserved #define HWY_HIGHEST_TARGET_BIT_RVV 38 -// --------------------------- Future expansion: 4 targets -// Bits 39..42 reserved +// --------------------------- LoongArch: 3 targets (+ one fallback) +// Bits 39 reserved (1 target) +#define HWY_LASX (1LL << 40) +#define HWY_LSX (1LL << 41) +#define HWY_HIGHEST_TARGET_BIT_LOONGARCH 41 + +// --------------------------- Future expansion: 1 target +// Bits 42 reserved // --------------------------- IBM Power/ZSeries: 9 targets (+ one fallback) // Bits 43..46 reserved (4 targets) @@ -278,6 +284,15 @@ #define HWY_BROKEN_RVV 0 #endif +// HWY_LSX/HWY_LASX require GCC 14 or Clang 18. +#if HWY_ARCH_LOONGARCH && \ + ((HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1800) || \ + (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1400)) +#define HWY_BROKEN_LOONGARCH (HWY_LSX | HWY_LASX) +#else +#define HWY_BROKEN_LOONGARCH 0 +#endif + // Allow the user to override this without any guarantee of success. #ifndef HWY_BROKEN_TARGETS @@ -286,7 +301,7 @@ HWY_BROKEN_AVX3_DL_ZEN4 | HWY_BROKEN_AVX3_SPR | \ HWY_BROKEN_ARM7_BIG_ENDIAN | HWY_BROKEN_ARM7_WITHOUT_VFP4 | \ HWY_BROKEN_NEON_BF16 | HWY_BROKEN_SVE | HWY_BROKEN_PPC10 | \ - HWY_BROKEN_PPC_32BIT | HWY_BROKEN_RVV) + HWY_BROKEN_PPC_32BIT | HWY_BROKEN_RVV | HWY_BROKEN_LOONGARCH) #endif // HWY_BROKEN_TARGETS @@ -559,6 +574,14 @@ #define HWY_BASELINE_RVV 0 #endif +#if HWY_ARCH_LOONGARCH && defined(__loongarch_sx) && defined(__loongarch_asx) +#define HWY_BASELINE_LOONGARCH (HWY_LSX | HWY_LASX) +#elif HWY_ARCH_LOONGARCH && defined(__loongarch_sx) +#define HWY_BASELINE_LOONGARCH (HWY_LSX) +#else +#define HWY_BASELINE_LOONGARCH 0 +#endif + // Allow the user to override this without any guarantee of success. #ifndef HWY_BASELINE_TARGETS #define HWY_BASELINE_TARGETS \ @@ -568,7 +591,7 @@ HWY_BASELINE_NEON | HWY_BASELINE_SSE2 | HWY_BASELINE_SSSE3 | \ HWY_BASELINE_SSE4 | HWY_BASELINE_AVX2 | HWY_BASELINE_AVX3 | \ HWY_BASELINE_AVX3_DL | HWY_BASELINE_AVX3_ZEN4 | HWY_BASELINE_AVX3_SPR | \ - HWY_BASELINE_RVV) + HWY_BASELINE_RVV | HWY_BASELINE_LOONGARCH) #endif // HWY_BASELINE_TARGETS //------------------------------------------------------------------------------ @@ -724,6 +747,12 @@ #define HWY_ATTAINABLE_RISCV HWY_BASELINE_RVV #endif +#if HWY_ARCH_LOONGARCH && HWY_HAVE_RUNTIME_DISPATCH +#define HWY_ATTAINABLE_LOONGARCH (HWY_LSX | HWY_LASX) +#else +#define HWY_ATTAINABLE_LOONGARCH HWY_BASELINE_LOONGARCH +#endif + #ifndef HWY_ATTAINABLE_TARGETS_X86 // allow override #if HWY_COMPILER_MSVC && defined(HWY_SLOW_MSVC) // Fewer targets for faster builds. @@ -738,7 +767,7 @@ #endif // HWY_ATTAINABLE_TARGETS_X86 // Attainable means enabled and the compiler allows intrinsics (even when not -// allowed to autovectorize). Used in 3 and 4. +// allowed to auto-vectorize). Used in 3 and 4. 
#if HWY_ARCH_X86 #define HWY_ATTAINABLE_TARGETS HWY_ATTAINABLE_TARGETS_X86 #elif HWY_ARCH_ARM @@ -754,6 +783,9 @@ #elif HWY_ARCH_RISCV #define HWY_ATTAINABLE_TARGETS \ HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_ATTAINABLE_RISCV) +#elif HWY_ARCH_LOONGARCH +#define HWY_ATTAINABLE_TARGETS \ + HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_ATTAINABLE_LOONGARCH) #else #define HWY_ATTAINABLE_TARGETS (HWY_ENABLED_BASELINE) #endif // HWY_ARCH_* diff --git a/hwy/foreach_target.h b/hwy/foreach_target.h index 5219aee28d..66bd8a4f4c 100644 --- a/hwy/foreach_target.h +++ b/hwy/foreach_target.h @@ -319,6 +319,30 @@ #endif #endif +// ------------------------------ HWY_ARCH_LOONGARCH + +#if (HWY_TARGETS & HWY_LSX) && (HWY_STATIC_TARGET != HWY_LSX) +#undef HWY_TARGET +#define HWY_TARGET HWY_LSX +#include HWY_TARGET_INCLUDE +#ifdef HWY_TARGET_TOGGLE +#undef HWY_TARGET_TOGGLE +#else +#define HWY_TARGET_TOGGLE +#endif +#endif + +#if (HWY_TARGETS & HWY_LASX) && (HWY_STATIC_TARGET != HWY_LASX) +#undef HWY_TARGET +#define HWY_TARGET HWY_LASX +#include HWY_TARGET_INCLUDE +#ifdef HWY_TARGET_TOGGLE +#undef HWY_TARGET_TOGGLE +#else +#define HWY_TARGET_TOGGLE +#endif +#endif + // ------------------------------ Scalar #if (HWY_TARGETS & HWY_EMU128) && (HWY_STATIC_TARGET != HWY_EMU128) diff --git a/hwy/highway.h b/hwy/highway.h index 48359ea4d7..2fa2eb0ea1 100644 --- a/hwy/highway.h +++ b/hwy/highway.h @@ -76,12 +76,26 @@ namespace hwy { #define HWY_STATIC_DISPATCH(FUNC_NAME) N_SCALAR::FUNC_NAME #elif HWY_STATIC_TARGET == HWY_EMU128 #define HWY_STATIC_DISPATCH(FUNC_NAME) N_EMU128::FUNC_NAME -#elif HWY_STATIC_TARGET == HWY_RVV -#define HWY_STATIC_DISPATCH(FUNC_NAME) N_RVV::FUNC_NAME -#elif HWY_STATIC_TARGET == HWY_WASM_EMU256 -#define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM_EMU256::FUNC_NAME #elif HWY_STATIC_TARGET == HWY_WASM #define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM::FUNC_NAME +#elif HWY_STATIC_TARGET == HWY_WASM_EMU256 +#define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM_EMU256::FUNC_NAME +#elif HWY_STATIC_TARGET == HWY_Z14 +#define HWY_STATIC_DISPATCH(FUNC_NAME) N_Z14::FUNC_NAME +#elif HWY_STATIC_TARGET == HWY_Z15 +#define HWY_STATIC_DISPATCH(FUNC_NAME) N_Z15::FUNC_NAME +#elif HWY_STATIC_TARGET == HWY_PPC8 +#define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC8::FUNC_NAME +#elif HWY_STATIC_TARGET == HWY_PPC9 +#define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC9::FUNC_NAME +#elif HWY_STATIC_TARGET == HWY_PPC10 +#define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC10::FUNC_NAME +#elif HWY_STATIC_TARGET == HWY_LSX +#define HWY_STATIC_DISPATCH(FUNC_NAME) N_LSX::FUNC_NAME +#elif HWY_STATIC_TARGET == HWY_LASX +#define HWY_STATIC_DISPATCH(FUNC_NAME) N_LASX::FUNC_NAME +#elif HWY_STATIC_TARGET == HWY_RVV +#define HWY_STATIC_DISPATCH(FUNC_NAME) N_RVV::FUNC_NAME #elif HWY_STATIC_TARGET == HWY_NEON_WITHOUT_AES #define HWY_STATIC_DISPATCH(FUNC_NAME) N_NEON_WITHOUT_AES::FUNC_NAME #elif HWY_STATIC_TARGET == HWY_NEON @@ -96,16 +110,6 @@ namespace hwy { #define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE_256::FUNC_NAME #elif HWY_STATIC_TARGET == HWY_SVE2_128 #define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE2_128::FUNC_NAME -#elif HWY_STATIC_TARGET == HWY_PPC8 -#define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC8::FUNC_NAME -#elif HWY_STATIC_TARGET == HWY_PPC9 -#define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC9::FUNC_NAME -#elif HWY_STATIC_TARGET == HWY_PPC10 -#define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC10::FUNC_NAME -#elif HWY_STATIC_TARGET == HWY_Z14 -#define HWY_STATIC_DISPATCH(FUNC_NAME) N_Z14::FUNC_NAME -#elif HWY_STATIC_TARGET == HWY_Z15 -#define HWY_STATIC_DISPATCH(FUNC_NAME) N_Z15::FUNC_NAME 
#elif HWY_STATIC_TARGET == HWY_SSE2 #define HWY_STATIC_DISPATCH(FUNC_NAME) N_SSE2::FUNC_NAME #elif HWY_STATIC_TARGET == HWY_SSSE3 @@ -136,16 +140,58 @@ namespace hwy { #define HWY_CHOOSE_FALLBACK(FUNC_NAME) &HWY_STATIC_DISPATCH(FUNC_NAME) #endif +#if HWY_TARGETS & HWY_WASM +#define HWY_CHOOSE_WASM(FUNC_NAME) &N_WASM::FUNC_NAME +#else +#define HWY_CHOOSE_WASM(FUNC_NAME) nullptr +#endif + #if HWY_TARGETS & HWY_WASM_EMU256 #define HWY_CHOOSE_WASM_EMU256(FUNC_NAME) &N_WASM_EMU256::FUNC_NAME #else #define HWY_CHOOSE_WASM_EMU256(FUNC_NAME) nullptr #endif -#if HWY_TARGETS & HWY_WASM -#define HWY_CHOOSE_WASM(FUNC_NAME) &N_WASM::FUNC_NAME +#if HWY_TARGETS & HWY_Z14 +#define HWY_CHOOSE_Z14(FUNC_NAME) &N_Z14::FUNC_NAME #else -#define HWY_CHOOSE_WASM(FUNC_NAME) nullptr +#define HWY_CHOOSE_Z14(FUNC_NAME) nullptr +#endif + +#if HWY_TARGETS & HWY_Z15 +#define HWY_CHOOSE_Z15(FUNC_NAME) &N_Z15::FUNC_NAME +#else +#define HWY_CHOOSE_Z15(FUNC_NAME) nullptr +#endif + +#if HWY_TARGETS & HWY_PPC8 +#define HWY_CHOOSE_PPC8(FUNC_NAME) &N_PPC8::FUNC_NAME +#else +#define HWY_CHOOSE_PPC8(FUNC_NAME) nullptr +#endif + +#if HWY_TARGETS & HWY_PPC9 +#define HWY_CHOOSE_PPC9(FUNC_NAME) &N_PPC9::FUNC_NAME +#else +#define HWY_CHOOSE_PPC9(FUNC_NAME) nullptr +#endif + +#if HWY_TARGETS & HWY_LSX +#define HWY_CHOOSE_LSX(FUNC_NAME) &N_LSX::FUNC_NAME +#else +#define HWY_CHOOSE_LSX(FUNC_NAME) nullptr +#endif + +#if HWY_TARGETS & HWY_LASX +#define HWY_CHOOSE_LASX(FUNC_NAME) &N_LASX::FUNC_NAME +#else +#define HWY_CHOOSE_LASX(FUNC_NAME) nullptr +#endif + +#if HWY_TARGETS & HWY_PPC10 +#define HWY_CHOOSE_PPC10(FUNC_NAME) &N_PPC10::FUNC_NAME +#else +#define HWY_CHOOSE_PPC10(FUNC_NAME) nullptr #endif #if HWY_TARGETS & HWY_RVV @@ -196,36 +242,6 @@ namespace hwy { #define HWY_CHOOSE_SVE2_128(FUNC_NAME) nullptr #endif -#if HWY_TARGETS & HWY_PPC8 -#define HWY_CHOOSE_PPC8(FUNC_NAME) &N_PPC8::FUNC_NAME -#else -#define HWY_CHOOSE_PPC8(FUNC_NAME) nullptr -#endif - -#if HWY_TARGETS & HWY_PPC9 -#define HWY_CHOOSE_PPC9(FUNC_NAME) &N_PPC9::FUNC_NAME -#else -#define HWY_CHOOSE_PPC9(FUNC_NAME) nullptr -#endif - -#if HWY_TARGETS & HWY_PPC10 -#define HWY_CHOOSE_PPC10(FUNC_NAME) &N_PPC10::FUNC_NAME -#else -#define HWY_CHOOSE_PPC10(FUNC_NAME) nullptr -#endif - -#if HWY_TARGETS & HWY_Z14 -#define HWY_CHOOSE_Z14(FUNC_NAME) &N_Z14::FUNC_NAME -#else -#define HWY_CHOOSE_Z14(FUNC_NAME) nullptr -#endif - -#if HWY_TARGETS & HWY_Z15 -#define HWY_CHOOSE_Z15(FUNC_NAME) &N_Z15::FUNC_NAME -#else -#define HWY_CHOOSE_Z15(FUNC_NAME) nullptr -#endif - #if HWY_TARGETS & HWY_SSE2 #define HWY_CHOOSE_SSE2(FUNC_NAME) &N_SSE2::FUNC_NAME #else @@ -598,6 +614,8 @@ struct AddExport { #include "hwy/ops/emu128-inl.h" #elif HWY_TARGET == HWY_SCALAR #include "hwy/ops/scalar-inl.h" +#elif HWY_TARGET == HWY_LSX || HWY_TARGET == HWY_LASX +#include "hwy/ops/loongarch_lsx-inl.h" #else #pragma message("HWY_TARGET does not match any known target") #endif // HWY_TARGET diff --git a/hwy/ops/loongarch_lsx-inl.h b/hwy/ops/loongarch_lsx-inl.h new file mode 100644 index 0000000000..035e38b978 --- /dev/null +++ b/hwy/ops/loongarch_lsx-inl.h @@ -0,0 +1,16 @@ +// Copyright 2024 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// TODO: fill \ No newline at end of file diff --git a/hwy/ops/set_macros-inl.h b/hwy/ops/set_macros-inl.h index f955f93630..731614ffd4 100644 --- a/hwy/ops/set_macros-inl.h +++ b/hwy/ops/set_macros-inl.h @@ -635,6 +635,39 @@ // HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op. #endif +//----------------------------------------------------------------------------- +// LSX/LASX +#elif HWY_TARGET == HWY_LSX || HWY_TARGET == HWY_LASX + +#if HWY_TARGET == HWY_LSX +#define HWY_ALIGN alignas(16) +#define HWY_MAX_BYTES 16 +#else +#define HWY_ALIGN alignas(32) +#define HWY_MAX_BYTES 32 +#endif + +#define HWY_LANES(T) (HWY_MAX_BYTES / sizeof(T)) + +// TODO: check flag values +#define HWY_HAVE_SCALABLE 0 +#define HWY_HAVE_INTEGER64 1 +#define HWY_HAVE_FLOAT16 1 +#define HWY_HAVE_FLOAT64 1 +#define HWY_MEM_OPS_MIGHT_FAULT 0 +#define HWY_NATIVE_FMA 1 +#define HWY_NATIVE_DOT_BF16 0 +#define HWY_CAP_GE256 0 +#define HWY_CAP_GE512 0 + +#if HWY_TARGET == HWY_LSX +#define HWY_NAMESPACE N_LSX +#else +#define HWY_NAMESPACE N_LASX +#endif + +// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op. + +//----------------------------------------------------------------------------- // EMU128 #elif HWY_TARGET == HWY_EMU128 diff --git a/hwy/targets.cc b/hwy/targets.cc index b6c2419ba2..9923405ed2 100644 --- a/hwy/targets.cc +++ b/hwy/targets.cc @@ -675,6 +675,26 @@ int64_t DetectTargets() { return bits; } } // namespace rvv +#elif HWY_ARCH_LOONGARCH && HWY_HAVE_RUNTIME_DISPATCH +namespace loongarch { + +#ifndef LA_HWCAP_LSX +#define LA_HWCAP_LSX (1u << 4) +#endif +#ifndef LA_HWCAP_LASX +#define LA_HWCAP_LASX (1u << 5) +#endif + +using CapBits = unsigned long; // NOLINT + +int64_t DetectTargets() { + int64_t bits = 0; + const CapBits hw = getauxval(AT_HWCAP); + if (hw & LA_HWCAP_LSX) bits |= HWY_LSX; + if (hw & LA_HWCAP_LASX) bits |= HWY_LASX; + return bits; +} +} // namespace loongarch #endif // HWY_ARCH_* // Returns targets supported by the CPU, independently of DisableTargets. @@ -695,6 +715,8 @@ int64_t DetectTargets() { bits |= s390x::DetectTargets(); #elif HWY_ARCH_RISCV && HWY_HAVE_RUNTIME_DISPATCH bits |= rvv::DetectTargets(); +#elif HWY_ARCH_LOONGARCH && HWY_HAVE_RUNTIME_DISPATCH + bits |= loongarch::DetectTargets(); #else // TODO(janwas): detect support for WASM.
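Note (not part of the patch): the LoongArch runtime detection added above reduces to testing two HWCAP bits from the Linux auxiliary vector. A minimal standalone sketch of the same probe, assuming a Linux system with `<sys/auxv.h>` and using the LA_HWCAP bit positions defined in the targets.cc hunk above:

```cpp
#include <sys/auxv.h>  // getauxval, AT_HWCAP (Linux-only)
#include <cstdio>

int main() {
  const unsigned long hw = getauxval(AT_HWCAP);
  // Bit positions match LA_HWCAP_LSX (1u << 4) and LA_HWCAP_LASX (1u << 5)
  // from the hunk above.
  std::printf("LSX supported:  %lu\n", (hw >> 4) & 1u);
  std::printf("LASX supported: %lu\n", (hw >> 5) & 1u);
  return 0;
}
```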
diff --git a/hwy/targets.h b/hwy/targets.h index b3573dd194..faafc7a7eb 100644 --- a/hwy/targets.h +++ b/hwy/targets.h @@ -148,6 +148,13 @@ static inline HWY_MAYBE_UNUSED const char* TargetName(int64_t target) { return "RVV"; #endif +#if HWY_ARCH_LOONGARCH + case HWY_LSX: + return "LSX"; + case HWY_LASX: + return "LASX"; +#endif + case HWY_EMU128: return "EMU128"; case HWY_SCALAR: @@ -284,6 +291,14 @@ static inline HWY_MAYBE_UNUSED const char* TargetName(int64_t target) { HWY_CHOOSE_WASM(func_name), /* WASM */ \ nullptr /* reserved */ +#elif HWY_ARCH_LOONGARCH +#define HWY_MAX_DYNAMIC_TARGETS 3 +#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_LOONGARCH +#define HWY_CHOOSE_TARGET_LIST(func_name) \ + nullptr, /* reserved */ \ + HWY_CHOOSE_LASX(func_name), /* LASX */ \ + HWY_CHOOSE_LSX(func_name) /* LSX */ + #else // Unknown architecture, will use HWY_SCALAR without dynamic dispatch, though // still creating single-entry tables in HWY_EXPORT to ensure portability. diff --git a/hwy/targets_test.cc b/hwy/targets_test.cc index c0cb1f85f9..724296165d 100644 --- a/hwy/targets_test.cc +++ b/hwy/targets_test.cc @@ -64,6 +64,9 @@ DECLARE_FUNCTION(WASM_EMU256) DECLARE_FUNCTION(RVV) +DECLARE_FUNCTION(LASX) +DECLARE_FUNCTION(LSX) + DECLARE_FUNCTION(SCALAR) DECLARE_FUNCTION(EMU128) @@ -134,6 +137,10 @@ void CheckFakeFunction() { CallFunctionForTarget(HWY_WASM_EMU256, __LINE__); CallFunctionForTarget(HWY_RVV, __LINE__); + + CallFunctionForTarget(HWY_LASX, __LINE__); + CallFunctionForTarget(HWY_LSX, __LINE__); + // The tables only have space for either HWY_SCALAR or HWY_EMU128; the former // is opt-in only. #if defined(HWY_COMPILE_ONLY_SCALAR) || HWY_BROKEN_EMU128 From 02253c8ed75713a108dcb7cd12478ed83f2edf73 Mon Sep 17 00:00:00 2001 From: Zequan Wu Date: Tue, 26 Nov 2024 10:26:10 -0800 Subject: [PATCH 11/64] Remove VQSORT_SKIP workaround for compiler bug The test passes with the most recent clang. PiperOrigin-RevId: 700385250 --- hwy/contrib/sort/sort_test.cc | 11 +---------- hwy/contrib/sort/sort_unit_test.cc | 11 +---------- 2 files changed, 2 insertions(+), 20 deletions(-) diff --git a/hwy/contrib/sort/sort_test.cc b/hwy/contrib/sort/sort_test.cc index e891418d8d..7771f07b21 100644 --- a/hwy/contrib/sort/sort_test.cc +++ b/hwy/contrib/sort/sort_test.cc @@ -38,15 +38,6 @@ #include "hwy/print-inl.h" #include "hwy/tests/test_util-inl.h" -// TODO(b/314758657): Compiler bug causes incorrect results on SSE2/S-SSE3. -#undef VQSORT_SKIP -#if !defined(VQSORT_DO_NOT_SKIP) && HWY_COMPILER_CLANG && HWY_ARCH_X86 && \ - HWY_TARGET >= HWY_SSSE3 -#define VQSORT_SKIP 1 -#else -#define VQSORT_SKIP 0 -#endif - HWY_BEFORE_NAMESPACE(); namespace hwy { namespace HWY_NAMESPACE { @@ -124,7 +115,7 @@ void TestAnySort(const std::vector& algos, size_t num_lanes) { HWY_ASSERT(aligned); for (Algo algo : algos) { - if (IsVQ(algo) && (!VQSORT_ENABLED || VQSORT_SKIP)) continue; + if (IsVQ(algo) && !VQSORT_ENABLED) continue; for (Dist dist : AllDist()) { for (size_t misalign : diff --git a/hwy/contrib/sort/sort_unit_test.cc b/hwy/contrib/sort/sort_unit_test.cc index f7611cf5f8..56bb17e88d 100644 --- a/hwy/contrib/sort/sort_unit_test.cc +++ b/hwy/contrib/sort/sort_unit_test.cc @@ -37,15 +37,6 @@ #include "hwy/print-inl.h" #include "hwy/tests/test_util-inl.h" -// TODO(b/314758657): Compiler bug causes incorrect results on SSE2/S-SSE3. 
-#undef VQSORT_SKIP -#if !defined(VQSORT_DO_NOT_SKIP) && HWY_COMPILER_CLANG && HWY_ARCH_X86 && \ - HWY_TARGET >= HWY_SSSE3 -#define VQSORT_SKIP 1 -#else -#define VQSORT_SKIP 0 -#endif - HWY_BEFORE_NAMESPACE(); namespace hwy { namespace HWY_NAMESPACE { @@ -310,7 +301,7 @@ static HWY_NOINLINE void TestBaseCase() { HWY_NOINLINE void TestAllBaseCase() { // Workaround for stack overflow on MSVC debug. -#if defined(_MSC_VER) || VQSORT_SKIP +#if defined(_MSC_VER) return; #endif From 68b0fdebffb14f3b8473fed1c33ce368efc431e7 Mon Sep 17 00:00:00 2001 From: Jan Wassenberg Date: Wed, 27 Nov 2024 07:54:53 -0800 Subject: [PATCH 12/64] Add BitsFromMask, promoting from detail::. Also split mask_test into mask_set_test, remove unused overload in scalar, modernize overloads (SFINAE instead of type tags). PiperOrigin-RevId: 700701299 --- BUILD | 1 + CMakeLists.txt | 1 + g3doc/quick_reference.md | 39 +++-- hwy/ops/arm_neon-inl.h | 91 ++++++----- hwy/ops/emu128-inl.h | 9 ++ hwy/ops/generic_ops-inl.h | 25 +-- hwy/ops/ppc_vsx-inl.h | 85 +++++----- hwy/ops/scalar-inl.h | 12 +- hwy/ops/wasm_128-inl.h | 137 ++++++++-------- hwy/ops/wasm_256-inl.h | 7 + hwy/ops/x86_128-inl.h | 73 ++++----- hwy/ops/x86_256-inl.h | 56 +++---- hwy/ops/x86_512-inl.h | 80 ++++----- hwy/tests/mask_set_test.cc | 317 ++++++++++++++++++++++++++++++++++++ hwy/tests/mask_test.cc | 323 +++++-------------------------------- 15 files changed, 663 insertions(+), 593 deletions(-) create mode 100644 hwy/tests/mask_set_test.cc diff --git a/BUILD b/BUILD index 114eef8a02..e74a2770b8 100644 --- a/BUILD +++ b/BUILD @@ -513,6 +513,7 @@ HWY_TESTS = [ ("hwy/tests/", "mask_combine_test"), ("hwy/tests/", "mask_convert_test"), ("hwy/tests/", "mask_mem_test"), + ("hwy/tests/", "mask_set_test"), ("hwy/tests/", "mask_slide_test"), ("hwy/tests/", "mask_test"), ("hwy/tests/", "masked_arithmetic_test"), diff --git a/CMakeLists.txt b/CMakeLists.txt index 9cf044cbc9..04f1fa4b6b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -746,6 +746,7 @@ set(HWY_TEST_FILES hwy/tests/mask_combine_test.cc hwy/tests/mask_convert_test.cc hwy/tests/mask_mem_test.cc + hwy/tests/mask_set_test.cc hwy/tests/mask_slide_test.cc hwy/tests/mask_test.cc hwy/tests/masked_arithmetic_test.cc diff --git a/g3doc/quick_reference.md b/g3doc/quick_reference.md index 8220e9b718..4cdbb57d72 100644 --- a/g3doc/quick_reference.md +++ b/g3doc/quick_reference.md @@ -1153,6 +1153,12 @@ encoding depends on the platform). * V **VecFromMask**(D, M m): returns 0 in lane `i` if `m[i] == false`, otherwise all bits set. +* uint64_t **BitsFromMask**(M m): returns bits `b` such that + `(b >> i) & 1` indicates whether `m[i]` was set, and any remaining bits in + the `uint64_t` are zero. This is only available if `!HWY_HAVE_SCALABLE && + HWY_MAX_BYTES <= 64`, because 512-bit vectors are the longest for which + there are no more than 64 lanes and thus mask bits. + * size_t **StoreMaskBits**(D, M m, uint8_t* p): stores a bit array indicating whether `m[i]` is true, in ascending order of `i`, filling the bits of each byte from least to most significant, then proceeding to the @@ -1163,11 +1169,11 @@ encoding depends on the platform). Mask<DFrom> m): Promotes `m` to a mask with a lane type of `TFromD`, `DFrom` is `Rebind`. - `PromoteMaskTo(d_to, d_from, m)` is equivalent to - `MaskFromVec(BitCast(d_to, PromoteTo(di_to, BitCast(di_from, - VecFromMask(d_from, m)))))`, where `di_from` is `RebindToSigned()` - and `di_from` is `RebindToSigned()`, but - `PromoteMaskTo(d_to, d_from, m)` is more efficient on some targets. 
+ `PromoteMaskTo(d_to, d_from, m)` is equivalent to `MaskFromVec(BitCast(d_to, + PromoteTo(di_to, BitCast(di_from, VecFromMask(d_from, m)))))`, where + `di_from` is `RebindToSigned()` and `di_from` is + `RebindToSigned()`, but `PromoteMaskTo(d_to, d_from, m)` is more + efficient on some targets. PromoteMaskTo requires that `sizeof(TFromD) < sizeof(TFromD)` be true. @@ -1176,11 +1182,11 @@ encoding depends on the platform). Mask<DFrom> m): Demotes `m` to a mask with a lane type of `TFromD`, `DFrom` is `Rebind`. - `DemoteMaskTo(d_to, d_from, m)` is equivalent to - `MaskFromVec(BitCast(d_to, DemoteTo(di_to, BitCast(di_from, - VecFromMask(d_from, m)))))`, where `di_from` is `RebindToSigned()` - and `di_from` is `RebindToSigned()`, but - `DemoteMaskTo(d_to, d_from, m)` is more efficient on some targets. + `DemoteMaskTo(d_to, d_from, m)` is equivalent to `MaskFromVec(BitCast(d_to, + DemoteTo(di_to, BitCast(di_from, VecFromMask(d_from, m)))))`, where + `di_from` is `RebindToSigned()` and `di_from` is + `RebindToSigned()`, but `DemoteMaskTo(d_to, d_from, m)` is more + efficient on some targets. DemoteMaskTo requires that `sizeof(TFromD) > sizeof(TFromD)` be true. @@ -1189,16 +1195,15 @@ encoding depends on the platform). whose `LowerHalf` is the first argument and whose `UpperHalf` is the second argument; `M2` is `Mask>`; `DTo` is `Repartition`. - OrderedDemote2MasksTo requires that - `sizeof(TFromD) == sizeof(TFromD) * 2` be true. + OrderedDemote2MasksTo requires that `sizeof(TFromD) == + sizeof(TFromD) * 2` be true. `OrderedDemote2MasksTo(d_to, d_from, a, b)` is equivalent to `MaskFromVec(BitCast(d_to, OrderedDemote2To(di_to, va, vb)))`, where `va` is - `BitCast(di_from, MaskFromVec(d_from, a))`, `vb` is - `BitCast(di_from, MaskFromVec(d_from, b))`, `di_to` is - `RebindToSigned()`, and `di_from` is `RebindToSigned()`, but - `OrderedDemote2MasksTo(d_to, d_from, a, b)` is more efficient on some - targets. + `BitCast(di_from, MaskFromVec(d_from, a))`, `vb` is `BitCast(di_from, + MaskFromVec(d_from, b))`, `di_to` is `RebindToSigned()`, and `di_from` + is `RebindToSigned()`, but `OrderedDemote2MasksTo(d_to, d_from, a, + b)` is more efficient on some targets. OrderedDemote2MasksTo is only available if `HWY_TARGET != HWY_SCALAR` is true. diff --git a/hwy/ops/arm_neon-inl.h b/hwy/ops/arm_neon-inl.h index 37294205a7..8740b714db 100644 --- a/hwy/ops/arm_neon-inl.h +++ b/hwy/ops/arm_neon-inl.h @@ -21,6 +21,7 @@ // Arm NEON intrinsics are documented at: // https://developer.arm.com/architectures/instruction-sets/intrinsics/#f:@navigationhierarchiessimdisa=[Neon] +#include "hwy/base.h" #include "hwy/ops/shared-inl.h" HWY_DIAGNOSTICS(push) @@ -8921,8 +8922,16 @@ HWY_INLINE uint64_t NibblesFromMask(D d, MFromD mask) { return nib & ((1ull << (d.MaxBytes() * 4)) - 1); } -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128 mask) { +// Returns the lowest N for the BitsFromMask result. +template +constexpr uint64_t OnlyActive(uint64_t bits) { + return ((N * sizeof(T)) >= 8) ? 
bits : (bits & ((1ull << N) - 1)); +} + +} // namespace detail + +template +HWY_API uint64_t BitsFromMask(Mask128 mask) { alignas(16) static constexpr uint8_t kSliceLanes[16] = { 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80, 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80, }; @@ -8945,8 +8954,8 @@ HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128 mask) { #endif } -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128 mask) { +template +HWY_API uint64_t BitsFromMask(Mask128 mask) { // Upper lanes of partial loads are undefined. OnlyActive will fix this if // we load all kSliceLanes so the upper lanes do not pollute the valid bits. alignas(8) static constexpr uint8_t kSliceLanes[8] = {1, 2, 4, 8, @@ -8957,17 +8966,17 @@ HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128 mask) { const Vec128 values = BitCast(du, VecFromMask(d, mask)) & slice; #if HWY_ARCH_ARM_A64 - return vaddv_u8(values.raw); + return detail::OnlyActive(vaddv_u8(values.raw)); #else const uint16x4_t x2 = vpaddl_u8(values.raw); const uint32x2_t x4 = vpaddl_u16(x2); const uint64x1_t x8 = vpaddl_u32(x4); - return vget_lane_u64(x8, 0); + return detail::OnlyActive(vget_lane_u64(x8, 0)); #endif } -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, Mask128 mask) { +template +HWY_API uint64_t BitsFromMask(Mask128 mask) { alignas(16) static constexpr uint16_t kSliceLanes[8] = { 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80}; const Full128 d; @@ -8975,16 +8984,17 @@ HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, Mask128 mask) { const Vec128 values = BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes); #if HWY_ARCH_ARM_A64 - return vaddvq_u16(values.raw); + return detail::OnlyActive(vaddvq_u16(values.raw)); #else const uint32x4_t x2 = vpaddlq_u16(values.raw); const uint64x2_t x4 = vpaddlq_u32(x2); - return vgetq_lane_u64(x4, 0) + vgetq_lane_u64(x4, 1); + return detail::OnlyActive(vgetq_lane_u64(x4, 0) + + vgetq_lane_u64(x4, 1)); #endif } -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, Mask128 mask) { +template +HWY_API uint64_t BitsFromMask(Mask128 mask) { // Upper lanes of partial loads are undefined. OnlyActive will fix this if // we load all kSliceLanes so the upper lanes do not pollute the valid bits. 
alignas(8) static constexpr uint16_t kSliceLanes[4] = {1, 2, 4, 8}; @@ -8993,31 +9003,32 @@ HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, Mask128 mask) { const Vec128 slice(Load(Full64(), kSliceLanes).raw); const Vec128 values = BitCast(du, VecFromMask(d, mask)) & slice; #if HWY_ARCH_ARM_A64 - return vaddv_u16(values.raw); + return detail::OnlyActive(vaddv_u16(values.raw)); #else const uint32x2_t x2 = vpaddl_u16(values.raw); const uint64x1_t x4 = vpaddl_u32(x2); - return vget_lane_u64(x4, 0); + return detail::OnlyActive(vget_lane_u64(x4, 0)); #endif } -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128 mask) { +template +HWY_API uint64_t BitsFromMask(Mask128 mask) { alignas(16) static constexpr uint32_t kSliceLanes[4] = {1, 2, 4, 8}; const Full128 d; const Full128 du; const Vec128 values = BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes); #if HWY_ARCH_ARM_A64 - return vaddvq_u32(values.raw); + return detail::OnlyActive(vaddvq_u32(values.raw)); #else const uint64x2_t x2 = vpaddlq_u32(values.raw); - return vgetq_lane_u64(x2, 0) + vgetq_lane_u64(x2, 1); + return detail::OnlyActive(vgetq_lane_u64(x2, 0) + + vgetq_lane_u64(x2, 1)); #endif } -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128 mask) { +template +HWY_API uint64_t BitsFromMask(Mask128 mask) { // Upper lanes of partial loads are undefined. OnlyActive will fix this if // we load all kSliceLanes so the upper lanes do not pollute the valid bits. alignas(8) static constexpr uint32_t kSliceLanes[2] = {1, 2}; @@ -9026,45 +9037,37 @@ HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128 mask) { const Vec128 slice(Load(Full64(), kSliceLanes).raw); const Vec128 values = BitCast(du, VecFromMask(d, mask)) & slice; #if HWY_ARCH_ARM_A64 - return vaddv_u32(values.raw); + return detail::OnlyActive(vaddv_u32(values.raw)); #else const uint64x1_t x2 = vpaddl_u32(values.raw); - return vget_lane_u64(x2, 0); + return detail::OnlyActive(vget_lane_u64(x2, 0)); #endif } -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128 m) { +template +HWY_API uint64_t BitsFromMask(Mask128 m) { alignas(16) static constexpr uint64_t kSliceLanes[2] = {1, 2}; const Full128 d; const Full128 du; const Vec128 values = BitCast(du, VecFromMask(d, m)) & Load(du, kSliceLanes); #if HWY_ARCH_ARM_A64 - return vaddvq_u64(values.raw); + return detail::OnlyActive(vaddvq_u64(values.raw)); #else - return vgetq_lane_u64(values.raw, 0) + vgetq_lane_u64(values.raw, 1); + return detail::OnlyActive(vgetq_lane_u64(values.raw, 0) + + vgetq_lane_u64(values.raw, 1)); #endif } -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128 m) { +template +HWY_API uint64_t BitsFromMask(Mask128 m) { const Full64 d; const Full64 du; const Vec64 values = BitCast(du, VecFromMask(d, m)) & Set(du, 1); return vget_lane_u64(values.raw, 0); } -// Returns the lowest N for the BitsFromMask result. -template -constexpr uint64_t OnlyActive(uint64_t bits) { - return ((N * sizeof(T)) >= 8) ? bits : (bits & ((1ull << N) - 1)); -} - -template -HWY_INLINE uint64_t BitsFromMask(Mask128 mask) { - return OnlyActive(BitsFromMask(hwy::SizeTag(), mask)); -} +namespace detail { // Returns number of lanes whose mask is set. // @@ -9184,7 +9187,7 @@ HWY_API intptr_t FindLastTrue(D d, MFromD mask) { // `p` points to at least 8 writable bytes. 
template HWY_API size_t StoreMaskBits(D d, MFromD mask, uint8_t* bits) { - const uint64_t mask_bits = detail::BitsFromMask(mask); + const uint64_t mask_bits = BitsFromMask(mask); const size_t kNumBytes = (d.MaxLanes() + 7) / 8; CopyBytes(&mask_bits, bits); return kNumBytes; @@ -9672,7 +9675,7 @@ HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { // General case, 2 or 4 byte lanes template HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { - return detail::Compress(v, detail::BitsFromMask(mask)); + return detail::Compress(v, BitsFromMask(mask)); } // Single lane: no-op @@ -9699,9 +9702,9 @@ HWY_API Vec128 CompressNot(Vec128 v, Mask128 mask) { // For partial vectors, we cannot pull the Not() into the table because // BitsFromMask clears the upper bits. if (N < 16 / sizeof(T)) { - return detail::Compress(v, detail::BitsFromMask(Not(mask))); + return detail::Compress(v, BitsFromMask(Not(mask))); } - return detail::CompressNot(v, detail::BitsFromMask(mask)); + return detail::CompressNot(v, BitsFromMask(mask)); } // ------------------------------ CompressBlocksNot @@ -9729,7 +9732,7 @@ HWY_INLINE Vec128 CompressBits(Vec128 v, template HWY_API size_t CompressStore(VFromD v, MFromD mask, D d, TFromD* HWY_RESTRICT unaligned) { - const uint64_t mask_bits = detail::BitsFromMask(mask); + const uint64_t mask_bits = BitsFromMask(mask); StoreU(detail::Compress(v, mask_bits), d, unaligned); return PopCount(mask_bits); } @@ -9739,7 +9742,7 @@ template HWY_API size_t CompressBlendedStore(VFromD v, MFromD m, D d, TFromD* HWY_RESTRICT unaligned) { const RebindToUnsigned du; // so we can support fp16/bf16 - const uint64_t mask_bits = detail::BitsFromMask(m); + const uint64_t mask_bits = BitsFromMask(m); const size_t count = PopCount(mask_bits); const MFromD store_mask = RebindMask(d, FirstN(du, count)); const VFromD compressed = diff --git a/hwy/ops/emu128-inl.h b/hwy/ops/emu128-inl.h index 5c5ed98799..d19f7cc168 100644 --- a/hwy/ops/emu128-inl.h +++ b/hwy/ops/emu128-inl.h @@ -386,6 +386,15 @@ VFromD VecFromMask(D /* tag */, MFromD mask) { return v; } +template +uint64_t BitsFromMask(Mask128 mask) { + uint64_t bits = 0; + for (size_t i = 0; i < N; ++i) { + bits |= mask.bits[i] ? 
(1ull << i) : 0; + } + return bits; +} + template HWY_API MFromD FirstN(D d, size_t n) { MFromD m; diff --git a/hwy/ops/generic_ops-inl.h b/hwy/ops/generic_ops-inl.h index 766c6c9d2e..940c2db957 100644 --- a/hwy/ops/generic_ops-inl.h +++ b/hwy/ops/generic_ops-inl.h @@ -5599,13 +5599,6 @@ HWY_API V CompressNot(V v, M mask) { namespace detail { -#if HWY_IDE -template -HWY_INLINE uint64_t BitsFromMask(M /* mask */) { - return 0; -} -#endif // HWY_IDE - template HWY_INLINE Vec128 IndicesForExpandFromBits(uint64_t mask_bits) { static_assert(N <= 8, "Should only be called for half-vectors"); @@ -5879,7 +5872,7 @@ template HWY_API Vec128 Expand(Vec128 v, Mask128 mask) { const DFromV d; - const uint64_t mask_bits = detail::BitsFromMask(mask); + const uint64_t mask_bits = BitsFromMask(mask); const Vec128 indices = detail::IndicesForExpandFromBits(mask_bits); return BitCast(d, TableLookupBytesOr0(v, indices)); @@ -5893,7 +5886,7 @@ HWY_API Vec128 Expand(Vec128 v, Mask128 mask) { const Half duh; const Vec128 vu = BitCast(du, v); - const uint64_t mask_bits = detail::BitsFromMask(mask); + const uint64_t mask_bits = BitsFromMask(mask); const uint64_t maskL = mask_bits & 0xFF; const uint64_t maskH = mask_bits >> 8; @@ -5925,7 +5918,7 @@ HWY_API Vec128 Expand(Vec128 v, Mask128 mask) { const RebindToUnsigned du; const Rebind du8; - const uint64_t mask_bits = detail::BitsFromMask(mask); + const uint64_t mask_bits = BitsFromMask(mask); // Storing as 8-bit reduces table size from 4 KiB to 2 KiB. We cannot apply // the nibble trick used below because not all indices fit within one lane. @@ -6207,7 +6200,7 @@ HWY_API Vec128 Expand(Vec128 v, Mask128 mask) { const DFromV d; const RebindToUnsigned du; - const uint64_t mask_bits = detail::BitsFromMask(mask); + const uint64_t mask_bits = BitsFromMask(mask); alignas(16) static constexpr uint32_t packed_array[16] = { // PrintExpand64x4Nibble - same for 32x4. @@ -7365,6 +7358,16 @@ HWY_API auto Le(V a, V b) -> decltype(a == b) { #undef HWY_GENERIC_IF_EMULATED_D +// TODO: remove once callers are updated. +#if !HWY_HAVE_SCALABLE && HWY_MAX_BYTES <= 64 +namespace detail { +template +uint64_t BitsFromMask(M m) { + return hwy::HWY_NAMESPACE::BitsFromMask(m); +} +} // namespace detail +#endif // !HWY_HAVE_SCALABLE && HWY_MAX_BYTES <= 64 + // NOLINTNEXTLINE(google-readability-namespace-comments) } // namespace HWY_NAMESPACE } // namespace hwy diff --git a/hwy/ops/ppc_vsx-inl.h b/hwy/ops/ppc_vsx-inl.h index d216c54853..f8884bb7fc 100644 --- a/hwy/ops/ppc_vsx-inl.h +++ b/hwy/ops/ppc_vsx-inl.h @@ -5222,6 +5222,12 @@ HWY_API MFromD Dup128MaskFromMaskBits(D d, unsigned mask_bits) { namespace detail { +// Returns the lowest N of the mask bits. +template +constexpr uint64_t OnlyActive(uint64_t mask_bits) { + return ((N * sizeof(T)) == 16) ? 
mask_bits : mask_bits & ((1ull << N) - 1); +} + #if !HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN // fallback for missing vec_extractm template @@ -5242,23 +5248,27 @@ HWY_INLINE uint64_t ExtractSignBits(Vec128 sign_bits, #endif // !HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128 mask) { +} // namespace detail + +template +HWY_API uint64_t BitsFromMask(Mask128 mask) { const DFromM d; const Repartition du8; const VFromD sign_bits = BitCast(du8, VecFromMask(d, mask)); #if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN - return static_cast(vec_extractm(sign_bits.raw)); + return detail::OnlyActive( + static_cast(vec_extractm(sign_bits.raw))); #else // Z14, Z15, PPC8, PPC9, or big-endian PPC10 const __vector unsigned char kBitShuffle = {120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0}; - return ExtractSignBits(sign_bits, kBitShuffle); + return detail::OnlyActive( + detail::ExtractSignBits(sign_bits, kBitShuffle)); #endif // HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN } -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, Mask128 mask) { +template +HWY_API uint64_t BitsFromMask(Mask128 mask) { const DFromM d; const RebindToUnsigned du; @@ -5266,7 +5276,8 @@ HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, Mask128 mask) { const VFromD sign_bits = BitCast(du8, VecFromMask(d, mask)); #if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN - return static_cast(vec_extractm(BitCast(du, sign_bits).raw)); + return detail::OnlyActive( + static_cast(vec_extractm(BitCast(du, sign_bits).raw))); #else // Z14, Z15, PPC8, PPC9, or big-endian PPC10 (void)du; #if HWY_IS_LITTLE_ENDIAN @@ -5276,12 +5287,13 @@ HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, Mask128 mask) { const __vector unsigned char kBitShuffle = { 128, 128, 128, 128, 128, 128, 128, 128, 112, 96, 80, 64, 48, 32, 16, 0}; #endif - return ExtractSignBits(sign_bits, kBitShuffle); + return detail::OnlyActive( + detail::ExtractSignBits(sign_bits, kBitShuffle)); #endif // HWY_PPC_HAVE_10 } -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128 mask) { +template +HWY_API uint64_t BitsFromMask(Mask128 mask) { const DFromM d; const RebindToUnsigned du; @@ -5289,7 +5301,8 @@ HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128 mask) { const VFromD sign_bits = BitCast(du8, VecFromMask(d, mask)); #if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN - return static_cast(vec_extractm(BitCast(du, sign_bits).raw)); + return detail::OnlyActive( + static_cast(vec_extractm(BitCast(du, sign_bits).raw))); #else // Z14, Z15, PPC8, PPC9, or big-endian PPC10 (void)du; #if HWY_IS_LITTLE_ENDIAN @@ -5301,12 +5314,13 @@ HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128 mask) { 128, 128, 128, 128, 128, 128, 96, 64, 32, 0}; #endif - return ExtractSignBits(sign_bits, kBitShuffle); + return detail::OnlyActive( + detail::ExtractSignBits(sign_bits, kBitShuffle)); #endif // HWY_PPC_HAVE_10 } -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128 mask) { +template +HWY_API uint64_t BitsFromMask(Mask128 mask) { const DFromM d; const RebindToUnsigned du; @@ -5314,7 +5328,8 @@ HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128 mask) { const VFromD sign_bits = BitCast(du8, VecFromMask(d, mask)); #if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN - return static_cast(vec_extractm(BitCast(du, sign_bits).raw)); + return detail::OnlyActive( + static_cast(vec_extractm(BitCast(du, sign_bits).raw))); #else // Z14, Z15, PPC8, PPC9, or 
big-endian PPC10 (void)du; #if HWY_IS_LITTLE_ENDIAN @@ -5326,35 +5341,23 @@ HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128 mask) { 128, 128, 128, 128, 128, 128, 128, 128, 64, 0}; #endif - return ExtractSignBits(sign_bits, kBitShuffle); + return detail::OnlyActive( + detail::ExtractSignBits(sign_bits, kBitShuffle)); #endif // HWY_PPC_HAVE_10 } -// Returns the lowest N of the mask bits. -template -constexpr uint64_t OnlyActive(uint64_t mask_bits) { - return ((N * sizeof(T)) == 16) ? mask_bits : mask_bits & ((1ull << N) - 1); -} - -template -HWY_INLINE uint64_t BitsFromMask(Mask128 mask) { - return OnlyActive(BitsFromMask(hwy::SizeTag(), mask)); -} - -} // namespace detail - // `p` points to at least 8 writable bytes. template HWY_API size_t StoreMaskBits(D /*d*/, MFromD mask, uint8_t* bits) { // For vectors with 8 or fewer lanes, simply cast the result of BitsFromMask // to an uint8_t and store the result in bits[0]. - bits[0] = static_cast(detail::BitsFromMask(mask)); + bits[0] = static_cast(BitsFromMask(mask)); return sizeof(uint8_t); } template HWY_API size_t StoreMaskBits(D /*d*/, MFromD mask, uint8_t* bits) { - const auto mask_bits = detail::BitsFromMask(mask); + const auto mask_bits = BitsFromMask(mask); // First convert mask_bits to a uint16_t as we only want to store // the lower 16 bits of mask_bits as there are 16 lanes in mask. @@ -5420,7 +5423,7 @@ HWY_API bool AllTrue(D d, MFromD mask) { template HWY_API size_t CountTrue(D /* tag */, MFromD mask) { - return PopCount(detail::BitsFromMask(mask)); + return PopCount(BitsFromMask(mask)); } #if HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN) @@ -5468,7 +5471,7 @@ HWY_API size_t FindKnownFirstTrue(D d, MFromD mask) { } #endif // HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN) (void)d; - return Num0BitsBelowLS1Bit_Nonzero64(detail::BitsFromMask(mask)); + return Num0BitsBelowLS1Bit_Nonzero64(BitsFromMask(mask)); } template > @@ -5484,7 +5487,7 @@ HWY_API intptr_t FindFirstTrue(D d, MFromD mask) { } #endif // HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN) (void)d; - const uint64_t mask_bits = detail::BitsFromMask(mask); + const uint64_t mask_bits = BitsFromMask(mask); return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero64(mask_bits)) : -1; } @@ -5500,7 +5503,7 @@ HWY_API size_t FindKnownLastTrue(D d, MFromD mask) { } #endif // HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN) (void)d; - return 63 - Num0BitsAboveMS1Bit_Nonzero64(detail::BitsFromMask(mask)); + return 63 - Num0BitsAboveMS1Bit_Nonzero64(BitsFromMask(mask)); } template > @@ -5516,7 +5519,7 @@ HWY_API intptr_t FindLastTrue(D d, MFromD mask) { } #endif // HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN) (void)d; - const uint64_t mask_bits = detail::BitsFromMask(mask); + const uint64_t mask_bits = BitsFromMask(mask); return mask_bits ? intptr_t(63 - Num0BitsAboveMS1Bit_Nonzero64(mask_bits)) : -1; } @@ -6012,7 +6015,7 @@ HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { // General case, 2 or 4 bytes template HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { - return detail::CompressBits(v, detail::BitsFromMask(mask)); + return detail::CompressBits(v, BitsFromMask(mask)); } // ------------------------------ CompressNot @@ -6051,9 +6054,9 @@ HWY_API Vec128 CompressNot(Vec128 v, Mask128 mask) { // For partial vectors, we cannot pull the Not() into the table because // BitsFromMask clears the upper bits. 
if (N < 16 / sizeof(T)) { - return detail::CompressBits(v, detail::BitsFromMask(Not(mask))); + return detail::CompressBits(v, BitsFromMask(Not(mask))); } - return detail::CompressNotBits(v, detail::BitsFromMask(mask)); + return detail::CompressNotBits(v, BitsFromMask(mask)); } // ------------------------------ CompressBlocksNot @@ -6103,7 +6106,7 @@ HWY_API size_t CompressStore(VFromD v, MFromD m, D d, TFromD* HWY_RESTRICT unaligned) { const RebindToUnsigned du; - const uint64_t mask_bits = detail::BitsFromMask(m); + const uint64_t mask_bits = BitsFromMask(m); HWY_DASSERT(mask_bits < (1ull << MaxLanes(d))); const size_t count = PopCount(mask_bits); @@ -6130,7 +6133,7 @@ HWY_API size_t CompressBlendedStore(VFromD v, MFromD m, D d, TFromD* HWY_RESTRICT unaligned) { const RebindToUnsigned du; - const uint64_t mask_bits = detail::BitsFromMask(m); + const uint64_t mask_bits = BitsFromMask(m); HWY_DASSERT(mask_bits < (1ull << MaxLanes(d))); const size_t count = PopCount(mask_bits); diff --git a/hwy/ops/scalar-inl.h b/hwy/ops/scalar-inl.h index a64faf9106..283e55261e 100644 --- a/hwy/ops/scalar-inl.h +++ b/hwy/ops/scalar-inl.h @@ -288,13 +288,6 @@ HWY_API Mask1 MaskFromVec(const Vec1 v) { template using MFromD = decltype(MaskFromVec(VFromD())); -template -Vec1 VecFromMask(const Mask1 mask) { - Vec1 v; - CopySameSize(&mask, &v); - return v; -} - template > Vec1 VecFromMask(D /* tag */, const Mask1 mask) { Vec1 v; @@ -302,6 +295,11 @@ Vec1 VecFromMask(D /* tag */, const Mask1 mask) { return v; } +template +uint64_t BitsFromMask(Mask1 mask) { + return mask.bits ? 1 : 0; +} + template > HWY_API Mask1 FirstN(D /*tag*/, size_t n) { return Mask1::FromBool(n != 0); diff --git a/hwy/ops/wasm_128-inl.h b/hwy/ops/wasm_128-inl.h index 39471d5239..cf526cc283 100644 --- a/hwy/ops/wasm_128-inl.h +++ b/hwy/ops/wasm_128-inl.h @@ -4946,75 +4946,97 @@ HWY_API MFromD Dup128MaskFromMaskBits(D d, unsigned mask_bits) { namespace detail { -// Full -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, - const Mask128 mask) { +// Returns the lowest N bits for the BitsFromMask result. +template +constexpr uint64_t OnlyActive(uint64_t bits) { + return ((N * sizeof(T)) == 16) ? bits : bits & ((1ull << N) - 1); +} + +} // namespace detail + +template +HWY_API uint64_t BitsFromMask(const Mask128 mask) { alignas(16) uint64_t lanes[2]; wasm_v128_store(lanes, mask.raw); constexpr uint64_t kMagic = 0x103070F1F3F80ULL; const uint64_t lo = ((lanes[0] * kMagic) >> 56); const uint64_t hi = ((lanes[1] * kMagic) >> 48) & 0xFF00; - return (hi + lo); + return hi + lo; // exactly 16 bits, no OnlyActive required } -// 64-bit -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, - const Mask128 mask) { +template +HWY_API uint64_t BitsFromMask(const Mask128 mask) { constexpr uint64_t kMagic = 0x103070F1F3F80ULL; - return (static_cast(wasm_i64x2_extract_lane(mask.raw, 0)) * - kMagic) >> - 56; + const uint64_t bytes = + static_cast(wasm_i64x2_extract_lane(mask.raw, 0)); + return (bytes * kMagic) >> 56; // exactly 8 bits, no OnlyActive required } // 32-bit or less: need masking -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, - const Mask128 mask) { +template +HWY_API uint64_t BitsFromMask(const Mask128 mask) { uint64_t bytes = static_cast(wasm_i64x2_extract_lane(mask.raw, 0)); // Clear potentially undefined bytes. 
bytes &= (1ULL << (N * 8)) - 1; constexpr uint64_t kMagic = 0x103070F1F3F80ULL; - return (bytes * kMagic) >> 56; + return detail::OnlyActive((bytes * kMagic) >> 56); } -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, - const Mask128 mask) { +template +HWY_API uint64_t BitsFromMask(const Mask128 mask) { // Remove useless lower half of each u16 while preserving the sign bit. const __i16x8 zero = wasm_i16x8_splat(0); const Mask128 mask8{wasm_i8x16_narrow_i16x8(mask.raw, zero)}; - return BitsFromMask(hwy::SizeTag<1>(), mask8); + return detail::OnlyActive(BitsFromMask(mask8)); } -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, - const Mask128 mask) { +template +HWY_API uint64_t BitsFromMask(const Mask128 mask) { const __i32x4 mask_i = static_cast<__i32x4>(mask.raw); const __i32x4 slice = wasm_i32x4_make(1, 2, 4, 8); const __i32x4 sliced_mask = wasm_v128_and(mask_i, slice); alignas(16) uint32_t lanes[4]; wasm_v128_store(lanes, sliced_mask); - return lanes[0] | lanes[1] | lanes[2] | lanes[3]; + return detail::OnlyActive(lanes[0] | lanes[1] | lanes[2] | lanes[3]); } -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, - const Mask128 mask) { +template +HWY_API uint64_t BitsFromMask(const Mask128 mask) { const __i64x2 mask_i = static_cast<__i64x2>(mask.raw); const __i64x2 slice = wasm_i64x2_make(1, 2); const __i64x2 sliced_mask = wasm_v128_and(mask_i, slice); alignas(16) uint64_t lanes[2]; wasm_v128_store(lanes, sliced_mask); - return lanes[0] | lanes[1]; + return detail::OnlyActive(lanes[0] | lanes[1]); } -// Returns the lowest N bits for the BitsFromMask result. -template -constexpr uint64_t OnlyActive(uint64_t bits) { - return ((N * sizeof(T)) == 16) ? bits : bits & ((1ull << N) - 1); +namespace detail { + +template +HWY_INLINE size_t CountTrue(hwy::SizeTag<1> /*tag*/, const Mask128 m) { + return PopCount(BitsFromMask(m)); +} + +template +HWY_INLINE size_t CountTrue(hwy::SizeTag<2> /*tag*/, const Mask128 m) { + return PopCount(BitsFromMask(m)); +} + +template +HWY_INLINE size_t CountTrue(hwy::SizeTag<4> /*tag*/, const Mask128 m) { + const __i32x4 var_shift = wasm_i32x4_make(1, 2, 4, 8); + const __i32x4 shifted_bits = wasm_v128_and(m.raw, var_shift); + alignas(16) uint64_t lanes[2]; + wasm_v128_store(lanes, shifted_bits); + return PopCount(lanes[0] | lanes[1]); +} + +template +HWY_INLINE size_t CountTrue(hwy::SizeTag<8> /*tag*/, const Mask128 m) { + alignas(16) int64_t lanes[2]; + wasm_v128_store(lanes, m.raw); + return static_cast(-(lanes[0] + lanes[1])); } // Returns 0xFF for bytes with index >= N, otherwise 0. 
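Aside (not part of the patch): the WASM u8 `BitsFromMask` overloads above gather one bit per 0x00/0xFF mask byte by multiplying with 0x103070F1F3F80 and keeping the top byte of the product; the per-byte contributions land on distinct bits and never carry into the top byte. A portable scalar sketch that mirrors and exhaustively checks the trick (the function name here is illustrative, not a Highway API):

```cpp
#include <cassert>
#include <cstdint>

// Packs 8 mask bytes (each 0x00 or 0xFF) into 8 bits, one per byte, as the
// WASM u8 BitsFromMask above does after storing the lanes.
static uint64_t PackMaskBytes(uint64_t bytes) {
  constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
  return (bytes * kMagic) >> 56;  // top byte collects one bit per 0xFF byte
}

int main() {
  // Exhaustively verify all 256 combinations of set/clear bytes.
  for (uint32_t bits = 0; bits < 256; ++bits) {
    uint64_t bytes = 0;
    for (int i = 0; i < 8; ++i) {
      if (bits & (1u << i)) bytes |= 0xFFull << (8 * i);
    }
    assert(PackMaskBytes(bytes) == bits);
  }
  return 0;
}
```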
@@ -5047,43 +5069,12 @@ constexpr __i8x16 BytesAbove() { : wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1); } -template -HWY_INLINE uint64_t BitsFromMask(const Mask128 mask) { - return OnlyActive(BitsFromMask(hwy::SizeTag(), mask)); -} - -template -HWY_INLINE size_t CountTrue(hwy::SizeTag<1> tag, const Mask128 m) { - return PopCount(BitsFromMask(tag, m)); -} - -template -HWY_INLINE size_t CountTrue(hwy::SizeTag<2> tag, const Mask128 m) { - return PopCount(BitsFromMask(tag, m)); -} - -template -HWY_INLINE size_t CountTrue(hwy::SizeTag<4> /*tag*/, const Mask128 m) { - const __i32x4 var_shift = wasm_i32x4_make(1, 2, 4, 8); - const __i32x4 shifted_bits = wasm_v128_and(m.raw, var_shift); - alignas(16) uint64_t lanes[2]; - wasm_v128_store(lanes, shifted_bits); - return PopCount(lanes[0] | lanes[1]); -} - -template -HWY_INLINE size_t CountTrue(hwy::SizeTag<8> /*tag*/, const Mask128 m) { - alignas(16) int64_t lanes[2]; - wasm_v128_store(lanes, m.raw); - return static_cast(-(lanes[0] + lanes[1])); -} - } // namespace detail // `p` points to at least 8 writable bytes. template HWY_API size_t StoreMaskBits(D d, const MFromD mask, uint8_t* bits) { - const uint64_t mask_bits = detail::BitsFromMask(mask); + const uint64_t mask_bits = BitsFromMask(mask); const size_t kNumBytes = (d.MaxLanes() + 7) / 8; CopyBytes(&mask_bits, bits); return kNumBytes; @@ -5154,25 +5145,25 @@ HWY_API bool AllTrue(D d, const MFromD m) { template HWY_API size_t FindKnownFirstTrue(D /* tag */, const MFromD mask) { - const uint32_t bits = static_cast(detail::BitsFromMask(mask)); + const uint32_t bits = static_cast(BitsFromMask(mask)); return Num0BitsBelowLS1Bit_Nonzero32(bits); } template HWY_API intptr_t FindFirstTrue(D /* tag */, const MFromD mask) { - const uint32_t bits = static_cast(detail::BitsFromMask(mask)); + const uint32_t bits = static_cast(BitsFromMask(mask)); return bits ? static_cast(Num0BitsBelowLS1Bit_Nonzero32(bits)) : -1; } template HWY_API size_t FindKnownLastTrue(D /* tag */, const MFromD mask) { - const uint32_t bits = static_cast(detail::BitsFromMask(mask)); + const uint32_t bits = static_cast(BitsFromMask(mask)); return 31 - Num0BitsAboveMS1Bit_Nonzero32(bits); } template HWY_API intptr_t FindLastTrue(D /* tag */, const MFromD mask) { - const uint32_t bits = static_cast(detail::BitsFromMask(mask)); + const uint32_t bits = static_cast(BitsFromMask(mask)); return bits ? (31 - static_cast(Num0BitsAboveMS1Bit_Nonzero32(bits))) : -1; @@ -5618,7 +5609,7 @@ HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { // General case, 2 or 4 byte lanes template HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { - return detail::Compress(v, detail::BitsFromMask(mask)); + return detail::Compress(v, BitsFromMask(mask)); } // Single lane: no-op @@ -5645,9 +5636,9 @@ HWY_API Vec128 CompressNot(Vec128 v, Mask128 mask) { // For partial vectors, we cannot pull the Not() into the table because // BitsFromMask clears the upper bits. 
if (N < 16 / sizeof(T)) { - return detail::Compress(v, detail::BitsFromMask(Not(mask))); + return detail::Compress(v, BitsFromMask(Not(mask))); } - return detail::CompressNot(v, detail::BitsFromMask(mask)); + return detail::CompressNot(v, BitsFromMask(mask)); } // ------------------------------ CompressBlocksNot @@ -5674,7 +5665,7 @@ HWY_API Vec128 CompressBits(Vec128 v, template HWY_API size_t CompressStore(VFromD v, MFromD mask, D d, TFromD* HWY_RESTRICT unaligned) { - const uint64_t mask_bits = detail::BitsFromMask(mask); + const uint64_t mask_bits = BitsFromMask(mask); const auto c = detail::Compress(v, mask_bits); StoreU(c, d, unaligned); return PopCount(mask_bits); @@ -5685,7 +5676,7 @@ template HWY_API size_t CompressBlendedStore(VFromD v, MFromD m, D d, TFromD* HWY_RESTRICT unaligned) { const RebindToUnsigned du; // so we can support fp16/bf16 - const uint64_t mask_bits = detail::BitsFromMask(m); + const uint64_t mask_bits = BitsFromMask(m); const size_t count = PopCount(mask_bits); const VFromD compressed = detail::Compress(BitCast(du, v), mask_bits); diff --git a/hwy/ops/wasm_256-inl.h b/hwy/ops/wasm_256-inl.h index aab7105e36..b3b3d50537 100644 --- a/hwy/ops/wasm_256-inl.h +++ b/hwy/ops/wasm_256-inl.h @@ -657,6 +657,13 @@ HWY_API Vec256 VecFromMask(D d, Mask256 m) { return v; } +template +HWY_API uint64_t BitsFromMask(Mask256 m) { + const uint64_t lo = BitsFromMask(m.m0); + const uint64_t hi = BitsFromMask(m.m1); + return (hi << (16 / sizeof(T))) | lo; +} + // mask ? yes : no template HWY_API Vec256 IfThenElse(Mask256 mask, Vec256 yes, Vec256 no) { diff --git a/hwy/ops/x86_128-inl.h b/hwy/ops/x86_128-inl.h index a863cd10c6..22aa9644c2 100644 --- a/hwy/ops/x86_128-inl.h +++ b/hwy/ops/x86_128-inl.h @@ -196,15 +196,12 @@ constexpr uint64_t OnlyActive(uint64_t mask_bits) { } // namespace detail #if HWY_TARGET <= HWY_AVX3 -namespace detail { -// Used by Expand() emulation, which is required for both AVX3 and AVX2. template HWY_INLINE uint64_t BitsFromMask(const Mask128 mask) { - return OnlyActive(mask.raw); + return detail::OnlyActive(mask.raw); } -} // namespace detail #endif // HWY_TARGET <= HWY_AVX3 template @@ -12600,7 +12597,7 @@ HWY_API size_t CompressBlendedStore(VFromD v, MFromD m, D d, #else // AVX2 or below -// ------------------------------ StoreMaskBits +// ------------------------------ BitsFromMask namespace detail { @@ -12608,50 +12605,48 @@ constexpr HWY_INLINE uint64_t U64FromInt(int mask_bits) { return static_cast(static_cast(mask_bits)); } -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, - const Mask128 mask) { +} // namespace detail + +template +HWY_API uint64_t BitsFromMask(const Mask128 mask) { const Simd d; const auto sign_bits = BitCast(d, VecFromMask(d, mask)).raw; - return U64FromInt(_mm_movemask_epi8(sign_bits)); + return detail::OnlyActive( + detail::U64FromInt(_mm_movemask_epi8(sign_bits))); } -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, - const Mask128 mask) { +template +HWY_API uint64_t BitsFromMask(const Mask128 mask) { // Remove useless lower half of each u16 while preserving the sign bit. 
const auto sign_bits = _mm_packs_epi16(mask.raw, _mm_setzero_si128()); - return U64FromInt(_mm_movemask_epi8(sign_bits)); + return detail::OnlyActive( + detail::U64FromInt(_mm_movemask_epi8(sign_bits))); } -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128 mask) { +template +HWY_API uint64_t BitsFromMask(Mask128 mask) { const Simd d; const Simd df; const auto sign_bits = BitCast(df, VecFromMask(d, mask)); - return U64FromInt(_mm_movemask_ps(sign_bits.raw)); + return detail::OnlyActive( + detail::U64FromInt(_mm_movemask_ps(sign_bits.raw))); } -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128 mask) { +template +HWY_API uint64_t BitsFromMask(Mask128 mask) { const Simd d; const Simd df; const auto sign_bits = BitCast(df, VecFromMask(d, mask)); - return U64FromInt(_mm_movemask_pd(sign_bits.raw)); -} - -template -HWY_INLINE uint64_t BitsFromMask(const Mask128 mask) { - return OnlyActive(BitsFromMask(hwy::SizeTag(), mask)); + return detail::OnlyActive( + detail::U64FromInt(_mm_movemask_pd(sign_bits.raw))); } -} // namespace detail - +// ------------------------------ StoreMaskBits // `p` points to at least 8 writable bytes. template HWY_API size_t StoreMaskBits(D d, MFromD mask, uint8_t* bits) { constexpr size_t kNumBytes = (MaxLanes(d) + 7) / 8; - const uint64_t mask_bits = detail::BitsFromMask(mask); + const uint64_t mask_bits = BitsFromMask(mask); CopyBytes(&mask_bits, bits); return kNumBytes; } @@ -12661,41 +12656,41 @@ HWY_API size_t StoreMaskBits(D d, MFromD mask, uint8_t* bits) { template HWY_API bool AllFalse(D /* tag */, MFromD mask) { // Cheaper than PTEST, which is 2 uop / 3L. - return detail::BitsFromMask(mask) == 0; + return BitsFromMask(mask) == 0; } template HWY_API bool AllTrue(D d, MFromD mask) { constexpr uint64_t kAllBits = (1ull << MaxLanes(d)) - 1; - return detail::BitsFromMask(mask) == kAllBits; + return BitsFromMask(mask) == kAllBits; } template HWY_API size_t CountTrue(D /* tag */, MFromD mask) { - return PopCount(detail::BitsFromMask(mask)); + return PopCount(BitsFromMask(mask)); } template HWY_API size_t FindKnownFirstTrue(D /* tag */, MFromD mask) { return Num0BitsBelowLS1Bit_Nonzero32( - static_cast(detail::BitsFromMask(mask))); + static_cast(BitsFromMask(mask))); } template HWY_API intptr_t FindFirstTrue(D /* tag */, MFromD mask) { - const uint32_t mask_bits = static_cast(detail::BitsFromMask(mask)); + const uint32_t mask_bits = static_cast(BitsFromMask(mask)); return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask_bits)) : -1; } template HWY_API size_t FindKnownLastTrue(D /* tag */, MFromD mask) { return 31 - Num0BitsAboveMS1Bit_Nonzero32( - static_cast(detail::BitsFromMask(mask))); + static_cast(BitsFromMask(mask))); } template HWY_API intptr_t FindLastTrue(D /* tag */, MFromD mask) { - const uint32_t mask_bits = static_cast(detail::BitsFromMask(mask)); + const uint32_t mask_bits = static_cast(BitsFromMask(mask)); return mask_bits ? intptr_t(31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits)) : -1; } @@ -13137,7 +13132,7 @@ HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { // General case, 2 or 4 bytes template HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { - return detail::CompressBits(v, detail::BitsFromMask(mask)); + return detail::CompressBits(v, BitsFromMask(mask)); } // ------------------------------ CompressNot @@ -13165,9 +13160,9 @@ HWY_API Vec128 CompressNot(Vec128 v, Mask128 mask) { // For partial vectors, we cannot pull the Not() into the table because // BitsFromMask clears the upper bits. 
if (N < 16 / sizeof(T)) { - return detail::CompressBits(v, detail::BitsFromMask(Not(mask))); + return detail::CompressBits(v, BitsFromMask(Not(mask))); } - return detail::CompressNotBits(v, detail::BitsFromMask(mask)); + return detail::CompressNotBits(v, BitsFromMask(mask)); } // ------------------------------ CompressBlocksNot @@ -13196,7 +13191,7 @@ HWY_API size_t CompressStore(VFromD v, MFromD m, D d, TFromD* HWY_RESTRICT unaligned) { const RebindToUnsigned du; - const uint64_t mask_bits = detail::BitsFromMask(m); + const uint64_t mask_bits = BitsFromMask(m); HWY_DASSERT(mask_bits < (1ull << MaxLanes(d))); const size_t count = PopCount(mask_bits); @@ -13213,7 +13208,7 @@ HWY_API size_t CompressBlendedStore(VFromD v, MFromD m, D d, TFromD* HWY_RESTRICT unaligned) { const RebindToUnsigned du; - const uint64_t mask_bits = detail::BitsFromMask(m); + const uint64_t mask_bits = BitsFromMask(m); HWY_DASSERT(mask_bits < (1ull << MaxLanes(d))); const size_t count = PopCount(mask_bits); diff --git a/hwy/ops/x86_256-inl.h b/hwy/ops/x86_256-inl.h index df09c052c7..be51787ce3 100644 --- a/hwy/ops/x86_256-inl.h +++ b/hwy/ops/x86_256-inl.h @@ -165,15 +165,12 @@ struct Mask256 { #endif // AVX2 #if HWY_TARGET <= HWY_AVX3 -namespace detail { -// Used by Expand() emulation, which is required for both AVX3 and AVX2. template HWY_INLINE uint64_t BitsFromMask(const Mask256 mask) { return mask.raw; } -} // namespace detail #endif // HWY_TARGET <= HWY_AVX3 template @@ -7732,12 +7729,10 @@ HWY_API MFromD LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) { return detail::LoadMaskBits256>(mask_bits); } -// ------------------------------ StoreMaskBits - -namespace detail { +// ------------------------------ BitsFromMask template -HWY_INLINE uint64_t BitsFromMask(const Mask256 mask) { +HWY_API uint64_t BitsFromMask(const Mask256 mask) { const Full256 d; const Full256 d8; const auto sign_bits = BitCast(d8, VecFromMask(d, mask)).raw; @@ -7746,7 +7741,7 @@ HWY_INLINE uint64_t BitsFromMask(const Mask256 mask) { } template -HWY_INLINE uint64_t BitsFromMask(const Mask256 mask) { +HWY_API uint64_t BitsFromMask(const Mask256 mask) { #if !defined(HWY_DISABLE_BMI2_FMA) && !defined(HWY_DISABLE_PEXT_ON_AVX2) const Full256 d; const Full256 d8; @@ -7768,7 +7763,7 @@ HWY_INLINE uint64_t BitsFromMask(const Mask256 mask) { } template -HWY_INLINE uint64_t BitsFromMask(const Mask256 mask) { +HWY_API uint64_t BitsFromMask(const Mask256 mask) { const Full256 d; const Full256 df; const auto sign_bits = BitCast(df, VecFromMask(d, mask)).raw; @@ -7776,22 +7771,21 @@ HWY_INLINE uint64_t BitsFromMask(const Mask256 mask) { } template -HWY_INLINE uint64_t BitsFromMask(const Mask256 mask) { +HWY_API uint64_t BitsFromMask(const Mask256 mask) { const Full256 d; const Full256 df; const auto sign_bits = BitCast(df, VecFromMask(d, mask)).raw; return static_cast(_mm256_movemask_pd(sign_bits)); } -} // namespace detail - +// ------------------------------ StoreMaskBits // `p` points to at least 8 writable bytes. 
template HWY_API size_t StoreMaskBits(D d, MFromD mask, uint8_t* bits) { constexpr size_t N = Lanes(d); constexpr size_t kNumBytes = (N + 7) / 8; - const uint64_t mask_bits = detail::BitsFromMask(mask); + const uint64_t mask_bits = BitsFromMask(mask); CopyBytes(&mask_bits, bits); return kNumBytes; } @@ -7804,59 +7798,59 @@ template HWY_API bool AllFalse(D d, MFromD mask) { const Repartition d8; const Mask256 mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask))); - return detail::BitsFromMask(mask8) == 0; + return BitsFromMask(mask8) == 0; } template HWY_API bool AllFalse(D /* tag */, MFromD mask) { // Cheaper than PTEST, which is 2 uop / 3L. - return detail::BitsFromMask(mask) == 0; + return BitsFromMask(mask) == 0; } template HWY_API bool AllTrue(D d, MFromD mask) { const Repartition d8; const Mask256 mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask))); - return detail::BitsFromMask(mask8) == (1ull << 32) - 1; + return BitsFromMask(mask8) == (1ull << 32) - 1; } template HWY_API bool AllTrue(D d, MFromD mask) { constexpr uint64_t kAllBits = (1ull << Lanes(d)) - 1; - return detail::BitsFromMask(mask) == kAllBits; + return BitsFromMask(mask) == kAllBits; } template HWY_API size_t CountTrue(D d, MFromD mask) { const Repartition d8; const Mask256 mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask))); - return PopCount(detail::BitsFromMask(mask8)) >> 1; + return PopCount(BitsFromMask(mask8)) >> 1; } template HWY_API size_t CountTrue(D /* tag */, MFromD mask) { - return PopCount(detail::BitsFromMask(mask)); + return PopCount(BitsFromMask(mask)); } template HWY_API size_t FindKnownFirstTrue(D /* tag */, MFromD mask) { - const uint32_t mask_bits = static_cast(detail::BitsFromMask(mask)); + const uint32_t mask_bits = static_cast(BitsFromMask(mask)); return Num0BitsBelowLS1Bit_Nonzero32(mask_bits); } template HWY_API intptr_t FindFirstTrue(D /* tag */, MFromD mask) { - const uint32_t mask_bits = static_cast(detail::BitsFromMask(mask)); + const uint32_t mask_bits = static_cast(BitsFromMask(mask)); return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask_bits)) : -1; } template HWY_API size_t FindKnownLastTrue(D /* tag */, MFromD mask) { - const uint32_t mask_bits = static_cast(detail::BitsFromMask(mask)); + const uint32_t mask_bits = static_cast(BitsFromMask(mask)); return 31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits); } template HWY_API intptr_t FindLastTrue(D /* tag */, MFromD mask) { - const uint32_t mask_bits = static_cast(detail::BitsFromMask(mask)); + const uint32_t mask_bits = static_cast(BitsFromMask(mask)); return mask_bits ? 
intptr_t(31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits)) : -1; } @@ -8109,12 +8103,12 @@ HWY_INLINE Vec256 CompressNot(Vec256 v, const uint64_t mask_bits) { template HWY_API Vec256 Compress(Vec256 v, Mask256 m) { - return detail::Compress(v, detail::BitsFromMask(m)); + return detail::Compress(v, BitsFromMask(m)); } template HWY_API Vec256 CompressNot(Vec256 v, Mask256 m) { - return detail::CompressNot(v, detail::BitsFromMask(m)); + return detail::CompressNot(v, BitsFromMask(m)); } HWY_API Vec256 CompressBlocksNot(Vec256 v, @@ -8142,7 +8136,7 @@ HWY_API Vec256 CompressBits(Vec256 v, const uint8_t* HWY_RESTRICT bits) { template HWY_API size_t CompressStore(VFromD v, MFromD m, D d, TFromD* HWY_RESTRICT unaligned) { - const uint64_t mask_bits = detail::BitsFromMask(m); + const uint64_t mask_bits = BitsFromMask(m); const size_t count = PopCount(mask_bits); StoreU(detail::Compress(v, mask_bits), d, unaligned); detail::MaybeUnpoison(unaligned, count); @@ -8153,7 +8147,7 @@ template HWY_API size_t CompressBlendedStore(VFromD v, MFromD m, D d, TFromD* HWY_RESTRICT unaligned) { - const uint64_t mask_bits = detail::BitsFromMask(m); + const uint64_t mask_bits = BitsFromMask(m); const size_t count = PopCount(mask_bits); const RebindToUnsigned du; @@ -8180,7 +8174,7 @@ HWY_API size_t CompressBlendedStore(VFromD v, MFromD m, D d, template HWY_API size_t CompressBlendedStore(VFromD v, MFromD m, D d, TFromD* HWY_RESTRICT unaligned) { - const uint64_t mask_bits = detail::BitsFromMask(m); + const uint64_t mask_bits = BitsFromMask(m); const size_t count = PopCount(mask_bits); const VFromD compressed = detail::Compress(v, mask_bits); @@ -8297,7 +8291,7 @@ HWY_API Vec256 Expand(Vec256 v, Mask256 mask) { // LUTs are infeasible for so many mask combinations, so Combine two // half-vector Expand. const Half dh; - const uint64_t mask_bits = detail::BitsFromMask(mask); + const uint64_t mask_bits = BitsFromMask(mask); constexpr size_t N = 32 / sizeof(T); const size_t countL = PopCount(mask_bits & ((1 << (N / 2)) - 1)); const Mask128 maskL = MaskFromVec(LowerHalf(VecFromMask(d, mask))); @@ -8351,7 +8345,7 @@ HWY_API Vec256 Expand(Vec256 v, Mask256 mask) { return BitCast(d, detail::NativeExpand(BitCast(du, v), mu)); #else const RebindToUnsigned du; - const uint64_t mask_bits = detail::BitsFromMask(mask); + const uint64_t mask_bits = BitsFromMask(mask); alignas(16) constexpr uint32_t packed_array[256] = { // PrintExpand32x8Nibble. @@ -8420,7 +8414,7 @@ HWY_API Vec256 Expand(Vec256 v, Mask256 mask) { return BitCast(d, detail::NativeExpand(BitCast(du, v), mu)); #else const RebindToUnsigned du; - const uint64_t mask_bits = detail::BitsFromMask(mask); + const uint64_t mask_bits = BitsFromMask(mask); alignas(16) constexpr uint64_t packed_array[16] = { // PrintExpand64x4Nibble. diff --git a/hwy/ops/x86_512-inl.h b/hwy/ops/x86_512-inl.h index c906b2e32c..929e188664 100644 --- a/hwy/ops/x86_512-inl.h +++ b/hwy/ops/x86_512-inl.h @@ -2640,74 +2640,60 @@ HWY_API Mask512 operator<=(Vec512 a, Vec512 b) { // ------------------------------ Mask -namespace detail { - template -HWY_INLINE Mask512 MaskFromVec(hwy::SizeTag<1> /*tag*/, Vec512 v) { +HWY_API uint64_t BitsFromMask(const Mask512 mask) { + // OnlyActive is not required because we have at least 8 mask bits. 
+ return mask.raw; +} + +template +HWY_API Mask512 MaskFromVec(Vec512 v) { return Mask512{_mm512_movepi8_mask(v.raw)}; } -template -HWY_INLINE Mask512 MaskFromVec(hwy::SizeTag<2> /*tag*/, Vec512 v) { +template +HWY_API Mask512 MaskFromVec(Vec512 v) { return Mask512{_mm512_movepi16_mask(v.raw)}; } -template -HWY_INLINE Mask512 MaskFromVec(hwy::SizeTag<4> /*tag*/, Vec512 v) { +template +HWY_API Mask512 MaskFromVec(Vec512 v) { return Mask512{_mm512_movepi32_mask(v.raw)}; } -template -HWY_INLINE Mask512 MaskFromVec(hwy::SizeTag<8> /*tag*/, Vec512 v) { - return Mask512{_mm512_movepi64_mask(v.raw)}; -} - -} // namespace detail - -template +template HWY_API Mask512 MaskFromVec(Vec512 v) { - return detail::MaskFromVec(hwy::SizeTag(), v); + return Mask512{_mm512_movepi64_mask(v.raw)}; } -template +template HWY_API Mask512 MaskFromVec(Vec512 v) { const RebindToSigned> di; return Mask512{MaskFromVec(BitCast(di, v)).raw}; } -HWY_API Vec512 VecFromMask(Mask512 v) { - return Vec512{_mm512_movm_epi8(v.raw)}; -} -HWY_API Vec512 VecFromMask(Mask512 v) { - return Vec512{_mm512_movm_epi8(v.raw)}; -} - -HWY_API Vec512 VecFromMask(Mask512 v) { - return Vec512{_mm512_movm_epi16(v.raw)}; +template +HWY_API Vec512 VecFromMask(Mask512 m) { + return Vec512{_mm512_movm_epi8(m.raw)}; } -HWY_API Vec512 VecFromMask(Mask512 v) { - return Vec512{_mm512_movm_epi16(v.raw)}; +template +HWY_API Vec512 VecFromMask(Mask512 m) { + return Vec512{_mm512_movm_epi16(m.raw)}; } #if HWY_HAVE_FLOAT16 -HWY_API Vec512 VecFromMask(Mask512 v) { - return Vec512{_mm512_castsi512_ph(_mm512_movm_epi16(v.raw))}; +HWY_API Vec512 VecFromMask(Mask512 m) { + return Vec512{_mm512_castsi512_ph(_mm512_movm_epi16(m.raw))}; } #endif // HWY_HAVE_FLOAT16 - -HWY_API Vec512 VecFromMask(Mask512 v) { - return Vec512{_mm512_movm_epi32(v.raw)}; -} -HWY_API Vec512 VecFromMask(Mask512 v) { - return Vec512{_mm512_movm_epi32(v.raw)}; -} -HWY_API Vec512 VecFromMask(Mask512 v) { - return Vec512{_mm512_castsi512_ps(_mm512_movm_epi32(v.raw))}; -} - -HWY_API Vec512 VecFromMask(Mask512 v) { - return Vec512{_mm512_movm_epi64(v.raw)}; +template +HWY_API Vec512 VecFromMask(Mask512 m) { + return Vec512{_mm512_movm_epi32(m.raw)}; } -HWY_API Vec512 VecFromMask(Mask512 v) { - return Vec512{_mm512_movm_epi64(v.raw)}; +template +HWY_API Vec512 VecFromMask(Mask512 m) { + return Vec512{_mm512_movm_epi64(m.raw)}; } -HWY_API Vec512 VecFromMask(Mask512 v) { - return Vec512{_mm512_castsi512_pd(_mm512_movm_epi64(v.raw))}; +template +HWY_API Vec512 VecFromMask(Mask512 m) { + const Full512 d; + const Full512> di; + return BitCast(d, VecFromMask(RebindMask(di, m))); } // ------------------------------ Mask logical diff --git a/hwy/tests/mask_set_test.cc b/hwy/tests/mask_set_test.cc new file mode 100644 index 0000000000..85d8fd66f2 --- /dev/null +++ b/hwy/tests/mask_set_test.cc @@ -0,0 +1,317 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "tests/mask_set_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/tests/test_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { +namespace { + +struct TestMaskFalse { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { +#if HWY_HAVE_SCALABLE || HWY_TARGET_IS_SVE || HWY_TARGET == HWY_SCALAR + // For RVV, SVE and SCALAR, use the underlying native vector. + const DFromV> d2; +#else + // Other targets are strongly-typed, but we can safely ResizeBitCast to the + // native vector. All targets have at least 128-bit vectors, but NEON also + // supports 64-bit vectors. + constexpr size_t kMinD2Lanes = (HWY_TARGET_IS_NEON ? 8 : 16) / sizeof(T); + const FixedTag d2; +#endif + static_assert(d2.MaxBytes() >= d.MaxBytes(), + "d2.MaxBytes() >= d.MaxBytes() should be true"); + using V2 = Vec; + + // Various ways of checking that false masks are false. + HWY_ASSERT(AllFalse(d, MaskFalse(d))); + HWY_ASSERT_EQ(0, CountTrue(d, MaskFalse(d))); + HWY_ASSERT_VEC_EQ(d, Zero(d), VecFromMask(d, MaskFalse(d))); + +#if HWY_HAVE_SCALABLE || HWY_TARGET_IS_SVE + // For these targets, we can treat the result as if it were a vector of type + // `V2`. On SVE, vectors are always full (not fractional) and caps are only + // enforced by Highway ops. On RVV, LMUL must match but caps can also be + // ignored. For safety, MaskFalse also sets lanes >= `Lanes(d)` to false, + // and we verify that here. + HWY_ASSERT(AllFalse(d2, MaskFalse(d))); + HWY_ASSERT_EQ(0, CountTrue(d2, MaskFalse(d))); + HWY_ASSERT_VEC_EQ(d2, Zero(d2), VecFromMask(d2, MaskFalse(d))); +#endif + + // All targets support, and strongly-typed (non-scalable) targets require, + // ResizeBitCast before we compare to the 'native' underlying vector size. + const V2 actual2 = ResizeBitCast(d2, VecFromMask(d, MaskFalse(d))); + HWY_ASSERT_VEC_EQ(d2, Zero(d2), actual2); + } +}; + +HWY_NOINLINE void TestAllMaskFalse() { + ForAllTypes(ForPartialVectors()); +} + +struct TestFirstN { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + auto bool_lanes = AllocateAligned(N); + HWY_ASSERT(bool_lanes); + + using TN = SignedFromSize; + const size_t max_len = static_cast(LimitsMax()); + + const Vec k1 = Set(d, ConvertScalarTo(1)); + + const size_t max_lanes = HWY_MIN(2 * N, AdjustedReps(512)); + for (size_t len = 0; len <= HWY_MIN(max_lanes, max_len); ++len) { + // Loop instead of Iota+Lt to avoid wraparound for 8-bit T. + for (size_t i = 0; i < N; ++i) { + bool_lanes[i] = ConvertScalarTo(i < len ? 1 : 0); + } + const Mask expected = Eq(Load(d, bool_lanes.get()), k1); + HWY_ASSERT_MASK_EQ(d, expected, FirstN(d, len)); + } + + // Also ensure huge values yield all-true (unless the vector is actually + // larger than max_len). + for (size_t i = 0; i < N; ++i) { + bool_lanes[i] = ConvertScalarTo(i < max_len ? 
1 : 0); + } + const Mask expected = Eq(Load(d, bool_lanes.get()), k1); + HWY_ASSERT_MASK_EQ(d, expected, FirstN(d, max_len)); + } +}; + +HWY_NOINLINE void TestAllFirstN() { + ForAllTypes(ForPartialVectors()); +} + +struct TestSetBeforeFirst { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + using TI = MakeSigned; // For mask > 0 comparison + const Rebind di; + const size_t N = Lanes(di); + auto bool_lanes = AllocateAligned(N); + HWY_ASSERT(bool_lanes); + memset(bool_lanes.get(), 0, N * sizeof(TI)); + + // For all combinations of zero/nonzero state of subset of lanes: + const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(6))); + for (size_t code = 0; code < (1ull << max_lanes); ++code) { + for (size_t i = 0; i < max_lanes; ++i) { + bool_lanes[i] = (code & (1ull << i)) ? TI(1) : TI(0); + } + + const auto m = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di))); + + const size_t first_set_lane_idx = + (code != 0) + ? Num0BitsBelowLS1Bit_Nonzero64(static_cast(code)) + : N; + const auto expected_mask = FirstN(d, first_set_lane_idx); + + HWY_ASSERT_MASK_EQ(d, expected_mask, SetBeforeFirst(m)); + } + } +}; + +HWY_NOINLINE void TestAllSetBeforeFirst() { + ForAllTypes(ForPartialVectors()); +} + +struct TestSetAtOrBeforeFirst { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + using TI = MakeSigned; // For mask > 0 comparison + const Rebind di; + const size_t N = Lanes(di); + auto bool_lanes = AllocateAligned(N); + HWY_ASSERT(bool_lanes); + memset(bool_lanes.get(), 0, N * sizeof(TI)); + + // For all combinations of zero/nonzero state of subset of lanes: + const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(6))); + for (size_t code = 0; code < (1ull << max_lanes); ++code) { + for (size_t i = 0; i < max_lanes; ++i) { + bool_lanes[i] = (code & (1ull << i)) ? TI(1) : TI(0); + } + + const auto m = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di))); + + const size_t idx_after_first_set_lane = + (code != 0) + ? (Num0BitsBelowLS1Bit_Nonzero64(static_cast(code)) + 1) + : N; + const auto expected_mask = FirstN(d, idx_after_first_set_lane); + + HWY_ASSERT_MASK_EQ(d, expected_mask, SetAtOrBeforeFirst(m)); + } + } +}; + +HWY_NOINLINE void TestAllSetAtOrBeforeFirst() { + ForAllTypes(ForPartialVectors()); +} + +struct TestSetOnlyFirst { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + using TI = MakeSigned; // For mask > 0 comparison + const Rebind di; + const size_t N = Lanes(di); + auto bool_lanes = AllocateAligned(N); + HWY_ASSERT(bool_lanes); + memset(bool_lanes.get(), 0, N * sizeof(TI)); + auto expected_lanes = AllocateAligned(N); + HWY_ASSERT(expected_lanes); + + // For all combinations of zero/nonzero state of subset of lanes: + const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(6))); + for (size_t code = 0; code < (1ull << max_lanes); ++code) { + for (size_t i = 0; i < max_lanes; ++i) { + bool_lanes[i] = (code & (1ull << i)) ? 
TI(1) : TI(0); + } + + memset(expected_lanes.get(), 0, N * sizeof(TI)); + if (code != 0) { + const size_t idx_of_first_lane = + Num0BitsBelowLS1Bit_Nonzero64(static_cast(code)); + expected_lanes[idx_of_first_lane] = TI(1); + } + + const auto m = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di))); + const auto expected_mask = + RebindMask(d, Gt(Load(di, expected_lanes.get()), Zero(di))); + + HWY_ASSERT_MASK_EQ(d, expected_mask, SetOnlyFirst(m)); + } + } +}; + +HWY_NOINLINE void TestAllSetOnlyFirst() { + ForAllTypes(ForPartialVectors()); +} + +struct TestSetAtOrAfterFirst { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + using TI = MakeSigned; // For mask > 0 comparison + const Rebind di; + const size_t N = Lanes(di); + auto bool_lanes = AllocateAligned(N); + HWY_ASSERT(bool_lanes); + memset(bool_lanes.get(), 0, N * sizeof(TI)); + + // For all combinations of zero/nonzero state of subset of lanes: + const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(6))); + for (size_t code = 0; code < (1ull << max_lanes); ++code) { + for (size_t i = 0; i < max_lanes; ++i) { + bool_lanes[i] = (code & (1ull << i)) ? TI(1) : TI(0); + } + + const auto m = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di))); + + const size_t first_set_lane_idx = + (code != 0) + ? Num0BitsBelowLS1Bit_Nonzero64(static_cast(code)) + : N; + const auto expected_at_or_after_first_mask = + Not(FirstN(d, first_set_lane_idx)); + const auto actual_at_or_after_first_mask = SetAtOrAfterFirst(m); + + HWY_ASSERT_MASK_EQ(d, expected_at_or_after_first_mask, + actual_at_or_after_first_mask); + HWY_ASSERT_MASK_EQ( + d, SetOnlyFirst(m), + And(actual_at_or_after_first_mask, SetAtOrBeforeFirst(m))); + HWY_ASSERT_MASK_EQ(d, m, And(m, actual_at_or_after_first_mask)); + HWY_ASSERT( + AllTrue(d, Xor(actual_at_or_after_first_mask, SetBeforeFirst(m)))); + } + } +}; + +HWY_NOINLINE void TestAllSetAtOrAfterFirst() { + ForAllTypes(ForPartialVectors()); +} + +struct TestDup128MaskFromMaskBits { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + using TI = MakeSigned; // For mask > 0 comparison + const Rebind di; + const size_t N = Lanes(di); + constexpr size_t kLanesPer16ByteBlock = 16 / sizeof(T); + + auto expected = AllocateAligned(N); + HWY_ASSERT(expected); + + // For all combinations of zero/nonzero state of subset of lanes: + constexpr size_t kMaxLanesToCheckPerBlk = + HWY_MIN(HWY_MAX_LANES_D(D), HWY_MIN(kLanesPer16ByteBlock, 10)); + const size_t max_lanes = HWY_MIN(N, kMaxLanesToCheckPerBlk); + + for (unsigned code = 0; code < (1u << max_lanes); ++code) { + for (size_t i = 0; i < N; i++) { + expected[i] = static_cast( + -static_cast((code >> (i & (kLanesPer16ByteBlock - 1))) & 1)); + } + + const auto expected_mask = + MaskFromVec(BitCast(d, LoadDup128(di, expected.get()))); + + const auto m = Dup128MaskFromMaskBits(d, code); + HWY_ASSERT_VEC_EQ(di, expected.get(), VecFromMask(di, RebindMask(di, m))); + HWY_ASSERT_MASK_EQ(d, expected_mask, m); + } + } +}; + +HWY_NOINLINE void TestAllDup128MaskFromMaskBits() { + ForAllTypes(ForPartialVectors()); +} + +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_BEFORE_TEST(HwyMaskSetTest); +HWY_EXPORT_AND_TEST_P(HwyMaskSetTest, TestAllMaskFalse); +HWY_EXPORT_AND_TEST_P(HwyMaskSetTest, TestAllFirstN); +HWY_EXPORT_AND_TEST_P(HwyMaskSetTest, TestAllSetBeforeFirst); +HWY_EXPORT_AND_TEST_P(HwyMaskSetTest, 
TestAllSetAtOrBeforeFirst); +HWY_EXPORT_AND_TEST_P(HwyMaskSetTest, TestAllSetOnlyFirst); +HWY_EXPORT_AND_TEST_P(HwyMaskSetTest, TestAllSetAtOrAfterFirst); +HWY_EXPORT_AND_TEST_P(HwyMaskSetTest, TestAllDup128MaskFromMaskBits); +HWY_AFTER_TEST(); +} // namespace +} // namespace hwy +HWY_TEST_MAIN(); +#endif // HWY_ONCE diff --git a/hwy/tests/mask_test.cc b/hwy/tests/mask_test.cc index 3ad55f5ced..a3a4e564b1 100644 --- a/hwy/tests/mask_test.cc +++ b/hwy/tests/mask_test.cc @@ -15,7 +15,7 @@ #include #include -#include // memcmp +#include // memset #undef HWY_TARGET_INCLUDE #define HWY_TARGET_INCLUDE "tests/mask_test.cc" @@ -28,52 +28,7 @@ namespace hwy { namespace HWY_NAMESPACE { namespace { -// All types. -struct TestMaskFalse { - template - HWY_NOINLINE void operator()(T /*unused*/, D d) { -#if HWY_HAVE_SCALABLE || HWY_TARGET_IS_SVE || HWY_TARGET == HWY_SCALAR - // For RVV, SVE and SCALAR, use the underlying native vector. - const DFromV> d2; -#else - // Other targets are strongly-typed, but we can safely ResizeBitCast to the - // native vector. All targets have at least 128-bit vectors, but NEON also - // supports 64-bit vectors. - constexpr size_t kMinD2Lanes = (HWY_TARGET_IS_NEON ? 8 : 16) / sizeof(T); - const FixedTag d2; -#endif - static_assert(d2.MaxBytes() >= d.MaxBytes(), - "d2.MaxBytes() >= d.MaxBytes() should be true"); - using V2 = Vec; - - // Various ways of checking that false masks are false. - HWY_ASSERT(AllFalse(d, MaskFalse(d))); - HWY_ASSERT_EQ(0, CountTrue(d, MaskFalse(d))); - HWY_ASSERT_VEC_EQ(d, Zero(d), VecFromMask(d, MaskFalse(d))); - -#if HWY_HAVE_SCALABLE || HWY_TARGET_IS_SVE - // For these targets, we can treat the result as if it were a vector of type - // `V2`. On SVE, vectors are always full (not fractional) and caps are only - // enforced by Highway ops. On RVV, LMUL must match but caps can also be - // ignored. For safety, MaskFalse also sets lanes >= `Lanes(d)` to false, - // and we verify that here. - HWY_ASSERT(AllFalse(d2, MaskFalse(d))); - HWY_ASSERT_EQ(0, CountTrue(d2, MaskFalse(d))); - HWY_ASSERT_VEC_EQ(d2, Zero(d2), VecFromMask(d2, MaskFalse(d))); -#endif - - // All targets support, and strongly-typed (non-scalable) targets require, - // ResizeBitCast before we compare to the 'native' underlying vector size. - const V2 actual2 = ResizeBitCast(d2, VecFromMask(d, MaskFalse(d))); - HWY_ASSERT_VEC_EQ(d2, Zero(d2), actual2); - } -}; - -HWY_NOINLINE void TestAllMaskFalse() { - ForAllTypes(ForPartialVectors()); -} - -struct TestFromVec { +struct TestMaskFromVec { template HWY_NOINLINE void operator()(T /*unused*/, D d) { const size_t N = Lanes(d); @@ -81,86 +36,80 @@ struct TestFromVec { HWY_ASSERT(lanes); memset(lanes.get(), 0, N * sizeof(T)); - const auto actual_false = MaskFromVec(Load(d, lanes.get())); + const Mask actual_false = MaskFromVec(Load(d, lanes.get())); HWY_ASSERT_MASK_EQ(d, MaskFalse(d), actual_false); memset(lanes.get(), 0xFF, N * sizeof(T)); - const auto actual_true = MaskFromVec(Load(d, lanes.get())); + const Mask actual_true = MaskFromVec(Load(d, lanes.get())); HWY_ASSERT_MASK_EQ(d, MaskTrue(d), actual_true); } }; -HWY_NOINLINE void TestAllFromVec() { - ForAllTypes(ForPartialVectors()); +HWY_NOINLINE void TestAllMaskFromVec() { + ForAllTypes(ForPartialVectors()); } -struct TestFirstN { +// Round trip, using MaskFromVec. 
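// VecFromMask must yield all-bits-set lanes for true and zero lanes for
// false, so MaskFromVec(VecFromMask(d, m)) is expected to reproduce m.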
+struct TestVecFromMask { template HWY_NOINLINE void operator()(T /*unused*/, D d) { - const size_t N = Lanes(d); - auto bool_lanes = AllocateAligned(N); - HWY_ASSERT(bool_lanes); - - using TN = SignedFromSize; - const size_t max_len = static_cast(LimitsMax()); + RandomState rng; - const Vec k1 = Set(d, ConvertScalarTo(1)); + using TI = MakeSigned; // For mask > 0 comparison + const Rebind di; + const size_t N = Lanes(d); + auto lanes = AllocateAligned(N); + HWY_ASSERT(lanes); - const size_t max_lanes = HWY_MIN(2 * N, AdjustedReps(512)); - for (size_t len = 0; len <= HWY_MIN(max_lanes, max_len); ++len) { - // Loop instead of Iota+Lt to avoid wraparound for 8-bit T. + // Each lane should have a chance of having mask=true. + for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { for (size_t i = 0; i < N; ++i) { - bool_lanes[i] = ConvertScalarTo(i < len ? 1 : 0); + lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0); } - const Mask expected = Eq(Load(d, bool_lanes.get()), k1); - HWY_ASSERT_MASK_EQ(d, expected, FirstN(d, len)); - } - // Also ensure huge values yield all-true (unless the vector is actually - // larger than max_len). - for (size_t i = 0; i < N; ++i) { - bool_lanes[i] = ConvertScalarTo(i < max_len ? 1 : 0); + const Mask mask = RebindMask(d, Gt(Load(di, lanes.get()), Zero(di))); + HWY_ASSERT_MASK_EQ(d, mask, MaskFromVec(VecFromMask(d, mask))); } - const Mask expected = Eq(Load(d, bool_lanes.get()), k1); - HWY_ASSERT_MASK_EQ(d, expected, FirstN(d, max_len)); } }; -HWY_NOINLINE void TestAllFirstN() { - ForAllTypes(ForPartialVectors()); +HWY_NOINLINE void TestAllVecFromMask() { + ForAllTypes(ForPartialVectors()); } -struct TestMaskVec { +struct TestBitsFromMask { template HWY_NOINLINE void operator()(T /*unused*/, D d) { +#if HWY_HAVE_SCALABLE || HWY_MAX_BYTES > 64 + (void)d; +#else RandomState rng; using TI = MakeSigned; // For mask > 0 comparison const Rebind di; const size_t N = Lanes(d); - auto bool_lanes = AllocateAligned(N); - HWY_ASSERT(bool_lanes); + HWY_ASSERT(N <= 64); // non-scalable targets have at most 512 bits. + auto lanes = AllocateAligned(N); + HWY_ASSERT(lanes); // Each lane should have a chance of having mask=true. for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { + uint64_t expected_bits = 0; for (size_t i = 0; i < N; ++i) { - bool_lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0); + lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0); + expected_bits |= lanes[i] ? 
(1ull << i) : 0; } - const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di))); - HWY_ASSERT_MASK_EQ(d, mask, MaskFromVec(VecFromMask(d, mask))); + const Mask mask = RebindMask(d, Gt(Load(di, lanes.get()), Zero(di))); + const uint64_t actual_bits = BitsFromMask(mask); + HWY_ASSERT_EQ(expected_bits, actual_bits); } +#endif // HWY_HAVE_SCALABLE || HWY_MAX_BYTES > 64 } }; -HWY_NOINLINE void TestAllMaskVec() { - const ForPartialVectors test; - - test(uint16_t()); - test(int16_t()); - // TODO(janwas): float16_t - cannot compare yet - - ForUIF3264(test); +HWY_NOINLINE void TestAllBitsFromMask() { + ForAllTypes(ForPartialVectors()); } struct TestAllTrueFalse { @@ -361,192 +310,6 @@ HWY_NOINLINE void TestAllLogicalMask() { ForAllTypes(ForPartialVectors()); } -struct TestSetBeforeFirst { - template - HWY_NOINLINE void operator()(T /*unused*/, D d) { - using TI = MakeSigned; // For mask > 0 comparison - const Rebind di; - const size_t N = Lanes(di); - auto bool_lanes = AllocateAligned(N); - HWY_ASSERT(bool_lanes); - memset(bool_lanes.get(), 0, N * sizeof(TI)); - - // For all combinations of zero/nonzero state of subset of lanes: - const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(6))); - for (size_t code = 0; code < (1ull << max_lanes); ++code) { - for (size_t i = 0; i < max_lanes; ++i) { - bool_lanes[i] = (code & (1ull << i)) ? TI(1) : TI(0); - } - - const auto m = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di))); - - const size_t first_set_lane_idx = - (code != 0) - ? Num0BitsBelowLS1Bit_Nonzero64(static_cast(code)) - : N; - const auto expected_mask = FirstN(d, first_set_lane_idx); - - HWY_ASSERT_MASK_EQ(d, expected_mask, SetBeforeFirst(m)); - } - } -}; - -HWY_NOINLINE void TestAllSetBeforeFirst() { - ForAllTypes(ForPartialVectors()); -} - -struct TestSetAtOrBeforeFirst { - template - HWY_NOINLINE void operator()(T /*unused*/, D d) { - using TI = MakeSigned; // For mask > 0 comparison - const Rebind di; - const size_t N = Lanes(di); - auto bool_lanes = AllocateAligned(N); - HWY_ASSERT(bool_lanes); - memset(bool_lanes.get(), 0, N * sizeof(TI)); - - // For all combinations of zero/nonzero state of subset of lanes: - const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(6))); - for (size_t code = 0; code < (1ull << max_lanes); ++code) { - for (size_t i = 0; i < max_lanes; ++i) { - bool_lanes[i] = (code & (1ull << i)) ? TI(1) : TI(0); - } - - const auto m = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di))); - - const size_t idx_after_first_set_lane = - (code != 0) - ? (Num0BitsBelowLS1Bit_Nonzero64(static_cast(code)) + 1) - : N; - const auto expected_mask = FirstN(d, idx_after_first_set_lane); - - HWY_ASSERT_MASK_EQ(d, expected_mask, SetAtOrBeforeFirst(m)); - } - } -}; - -HWY_NOINLINE void TestAllSetAtOrBeforeFirst() { - ForAllTypes(ForPartialVectors()); -} - -struct TestSetOnlyFirst { - template - HWY_NOINLINE void operator()(T /*unused*/, D d) { - using TI = MakeSigned; // For mask > 0 comparison - const Rebind di; - const size_t N = Lanes(di); - auto bool_lanes = AllocateAligned(N); - HWY_ASSERT(bool_lanes); - memset(bool_lanes.get(), 0, N * sizeof(TI)); - auto expected_lanes = AllocateAligned(N); - HWY_ASSERT(expected_lanes); - - // For all combinations of zero/nonzero state of subset of lanes: - const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(6))); - for (size_t code = 0; code < (1ull << max_lanes); ++code) { - for (size_t i = 0; i < max_lanes; ++i) { - bool_lanes[i] = (code & (1ull << i)) ? 
TI(1) : TI(0); - } - - memset(expected_lanes.get(), 0, N * sizeof(TI)); - if (code != 0) { - const size_t idx_of_first_lane = - Num0BitsBelowLS1Bit_Nonzero64(static_cast(code)); - expected_lanes[idx_of_first_lane] = TI(1); - } - - const auto m = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di))); - const auto expected_mask = - RebindMask(d, Gt(Load(di, expected_lanes.get()), Zero(di))); - - HWY_ASSERT_MASK_EQ(d, expected_mask, SetOnlyFirst(m)); - } - } -}; - -HWY_NOINLINE void TestAllSetOnlyFirst() { - ForAllTypes(ForPartialVectors()); -} - -struct TestSetAtOrAfterFirst { - template - HWY_NOINLINE void operator()(T /*unused*/, D d) { - using TI = MakeSigned; // For mask > 0 comparison - const Rebind di; - const size_t N = Lanes(di); - auto bool_lanes = AllocateAligned(N); - HWY_ASSERT(bool_lanes); - memset(bool_lanes.get(), 0, N * sizeof(TI)); - - // For all combinations of zero/nonzero state of subset of lanes: - const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(6))); - for (size_t code = 0; code < (1ull << max_lanes); ++code) { - for (size_t i = 0; i < max_lanes; ++i) { - bool_lanes[i] = (code & (1ull << i)) ? TI(1) : TI(0); - } - - const auto m = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di))); - - const size_t first_set_lane_idx = - (code != 0) - ? Num0BitsBelowLS1Bit_Nonzero64(static_cast(code)) - : N; - const auto expected_at_or_after_first_mask = - Not(FirstN(d, first_set_lane_idx)); - const auto actual_at_or_after_first_mask = SetAtOrAfterFirst(m); - - HWY_ASSERT_MASK_EQ(d, expected_at_or_after_first_mask, - actual_at_or_after_first_mask); - HWY_ASSERT_MASK_EQ( - d, SetOnlyFirst(m), - And(actual_at_or_after_first_mask, SetAtOrBeforeFirst(m))); - HWY_ASSERT_MASK_EQ(d, m, And(m, actual_at_or_after_first_mask)); - HWY_ASSERT( - AllTrue(d, Xor(actual_at_or_after_first_mask, SetBeforeFirst(m)))); - } - } -}; - -HWY_NOINLINE void TestAllSetAtOrAfterFirst() { - ForAllTypes(ForPartialVectors()); -} - -struct TestDup128MaskFromMaskBits { - template - HWY_NOINLINE void operator()(T /*unused*/, D d) { - using TI = MakeSigned; // For mask > 0 comparison - const Rebind di; - const size_t N = Lanes(di); - constexpr size_t kLanesPer16ByteBlock = 16 / sizeof(T); - - auto expected = AllocateAligned(N); - HWY_ASSERT(expected); - - // For all combinations of zero/nonzero state of subset of lanes: - constexpr size_t kMaxLanesToCheckPerBlk = - HWY_MIN(HWY_MAX_LANES_D(D), HWY_MIN(kLanesPer16ByteBlock, 10)); - const size_t max_lanes = HWY_MIN(N, kMaxLanesToCheckPerBlk); - - for (unsigned code = 0; code < (1u << max_lanes); ++code) { - for (size_t i = 0; i < N; i++) { - expected[i] = static_cast( - -static_cast((code >> (i & (kLanesPer16ByteBlock - 1))) & 1)); - } - - const auto expected_mask = - MaskFromVec(BitCast(d, LoadDup128(di, expected.get()))); - - const auto m = Dup128MaskFromMaskBits(d, code); - HWY_ASSERT_VEC_EQ(di, expected.get(), VecFromMask(di, RebindMask(di, m))); - HWY_ASSERT_MASK_EQ(d, expected_mask, m); - } - } -}; - -HWY_NOINLINE void TestAllDup128MaskFromMaskBits() { - ForAllTypes(ForPartialVectors()); -} - } // namespace // NOLINTNEXTLINE(google-readability-namespace-comments) } // namespace HWY_NAMESPACE @@ -557,20 +320,14 @@ HWY_AFTER_NAMESPACE(); namespace hwy { namespace { HWY_BEFORE_TEST(HwyMaskTest); -HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllMaskFalse); -HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllFromVec); -HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllFirstN); -HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllMaskVec); +HWY_EXPORT_AND_TEST_P(HwyMaskTest, 
TestAllMaskFromVec); +HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllVecFromMask); +HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllBitsFromMask); HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllAllTrueFalse); HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllCountTrue); HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllFindFirstTrue); HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllFindLastTrue); HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllLogicalMask); -HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllSetBeforeFirst); -HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllSetAtOrBeforeFirst); -HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllSetOnlyFirst); -HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllSetAtOrAfterFirst); -HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllDup128MaskFromMaskBits); HWY_AFTER_TEST(); } // namespace } // namespace hwy From bcf564e55e5cc46b8090c9bb9723a8a8ef8ecc97 Mon Sep 17 00:00:00 2001 From: Jan Wassenberg Date: Thu, 28 Nov 2024 01:09:22 -0800 Subject: [PATCH 13/64] Add BitsFromMask, promoting from detail::. Also split mask_test into mask_set_test, remove unused overload in scalar, modernize overloads (SFINAE instead of type tags). PiperOrigin-RevId: 700938851 --- BUILD | 1 - CMakeLists.txt | 1 - g3doc/quick_reference.md | 39 ++--- hwy/ops/arm_neon-inl.h | 91 +++++------ hwy/ops/emu128-inl.h | 9 -- hwy/ops/generic_ops-inl.h | 25 ++- hwy/ops/ppc_vsx-inl.h | 85 +++++----- hwy/ops/scalar-inl.h | 12 +- hwy/ops/wasm_128-inl.h | 137 ++++++++-------- hwy/ops/wasm_256-inl.h | 7 - hwy/ops/x86_128-inl.h | 73 +++++---- hwy/ops/x86_256-inl.h | 56 ++++--- hwy/ops/x86_512-inl.h | 80 +++++---- hwy/tests/mask_set_test.cc | 317 ------------------------------------ hwy/tests/mask_test.cc | 323 ++++++++++++++++++++++++++++++++----- 15 files changed, 593 insertions(+), 663 deletions(-) delete mode 100644 hwy/tests/mask_set_test.cc diff --git a/BUILD b/BUILD index e74a2770b8..114eef8a02 100644 --- a/BUILD +++ b/BUILD @@ -513,7 +513,6 @@ HWY_TESTS = [ ("hwy/tests/", "mask_combine_test"), ("hwy/tests/", "mask_convert_test"), ("hwy/tests/", "mask_mem_test"), - ("hwy/tests/", "mask_set_test"), ("hwy/tests/", "mask_slide_test"), ("hwy/tests/", "mask_test"), ("hwy/tests/", "masked_arithmetic_test"), diff --git a/CMakeLists.txt b/CMakeLists.txt index 04f1fa4b6b..9cf044cbc9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -746,7 +746,6 @@ set(HWY_TEST_FILES hwy/tests/mask_combine_test.cc hwy/tests/mask_convert_test.cc hwy/tests/mask_mem_test.cc - hwy/tests/mask_set_test.cc hwy/tests/mask_slide_test.cc hwy/tests/mask_test.cc hwy/tests/masked_arithmetic_test.cc diff --git a/g3doc/quick_reference.md b/g3doc/quick_reference.md index 4cdbb57d72..8220e9b718 100644 --- a/g3doc/quick_reference.md +++ b/g3doc/quick_reference.md @@ -1153,12 +1153,6 @@ encoding depends on the platform). * V **VecFromMask**(D, M m): returns 0 in lane `i` if `m[i] == false`, otherwise all bits set. -* uint64_t **BitsFromMask**(M m): returns bits `b` such that - `(b >> i) & 1` indicates whether `m[i]` was set, and any remaining bits in - the `uint64_t` are zero. This is only available if `!HWY_HAVE_SCALABLE && - HWY_MAX_BYTES <= 64`, because 512-bit vectors are the longest for which - there are no more than 64 lanes and thus mask bits. - * size_t **StoreMaskBits**(D, M m, uint8_t* p): stores a bit array indicating whether `m[i]` is true, in ascending order of `i`, filling the bits of each byte from least to most significant, then proceeding to the @@ -1169,11 +1163,11 @@ encoding depends on the platform). 
Mask<DFrom> m): Promotes `m` to a mask with a lane type of `TFromD`, `DFrom` is `Rebind`. - `PromoteMaskTo(d_to, d_from, m)` is equivalent to `MaskFromVec(BitCast(d_to, - PromoteTo(di_to, BitCast(di_from, VecFromMask(d_from, m)))))`, where - `di_from` is `RebindToSigned()` and `di_from` is - `RebindToSigned()`, but `PromoteMaskTo(d_to, d_from, m)` is more - efficient on some targets. + `PromoteMaskTo(d_to, d_from, m)` is equivalent to + `MaskFromVec(BitCast(d_to, PromoteTo(di_to, BitCast(di_from, + VecFromMask(d_from, m)))))`, where `di_from` is `RebindToSigned()` + and `di_from` is `RebindToSigned()`, but + `PromoteMaskTo(d_to, d_from, m)` is more efficient on some targets. PromoteMaskTo requires that `sizeof(TFromD) < sizeof(TFromD)` be true. @@ -1182,11 +1176,11 @@ encoding depends on the platform). Mask<DFrom> m): Demotes `m` to a mask with a lane type of `TFromD`, `DFrom` is `Rebind`. - `DemoteMaskTo(d_to, d_from, m)` is equivalent to `MaskFromVec(BitCast(d_to, - DemoteTo(di_to, BitCast(di_from, VecFromMask(d_from, m)))))`, where - `di_from` is `RebindToSigned()` and `di_from` is - `RebindToSigned()`, but `DemoteMaskTo(d_to, d_from, m)` is more - efficient on some targets. + `DemoteMaskTo(d_to, d_from, m)` is equivalent to + `MaskFromVec(BitCast(d_to, DemoteTo(di_to, BitCast(di_from, + VecFromMask(d_from, m)))))`, where `di_from` is `RebindToSigned()` + and `di_from` is `RebindToSigned()`, but + `DemoteMaskTo(d_to, d_from, m)` is more efficient on some targets. DemoteMaskTo requires that `sizeof(TFromD) > sizeof(TFromD)` be true. @@ -1195,15 +1189,16 @@ encoding depends on the platform). whose `LowerHalf` is the first argument and whose `UpperHalf` is the second argument; `M2` is `Mask>`; `DTo` is `Repartition`. - OrderedDemote2MasksTo requires that `sizeof(TFromD) == - sizeof(TFromD) * 2` be true. + OrderedDemote2MasksTo requires that + `sizeof(TFromD) == sizeof(TFromD) * 2` be true. `OrderedDemote2MasksTo(d_to, d_from, a, b)` is equivalent to `MaskFromVec(BitCast(d_to, OrderedDemote2To(di_to, va, vb)))`, where `va` is - `BitCast(di_from, MaskFromVec(d_from, a))`, `vb` is `BitCast(di_from, - MaskFromVec(d_from, b))`, `di_to` is `RebindToSigned()`, and `di_from` - is `RebindToSigned()`, but `OrderedDemote2MasksTo(d_to, d_from, a, - b)` is more efficient on some targets. + `BitCast(di_from, MaskFromVec(d_from, a))`, `vb` is + `BitCast(di_from, MaskFromVec(d_from, b))`, `di_to` is + `RebindToSigned()`, and `di_from` is `RebindToSigned()`, but + `OrderedDemote2MasksTo(d_to, d_from, a, b)` is more efficient on some + targets. OrderedDemote2MasksTo is only available if `HWY_TARGET != HWY_SCALAR` is true. diff --git a/hwy/ops/arm_neon-inl.h b/hwy/ops/arm_neon-inl.h index 8740b714db..37294205a7 100644 --- a/hwy/ops/arm_neon-inl.h +++ b/hwy/ops/arm_neon-inl.h @@ -21,7 +21,6 @@ // Arm NEON intrinsics are documented at: // https://developer.arm.com/architectures/instruction-sets/intrinsics/#f:@navigationhierarchiessimdisa=[Neon] -#include "hwy/base.h" #include "hwy/ops/shared-inl.h" HWY_DIAGNOSTICS(push) @@ -8922,16 +8921,8 @@ HWY_INLINE uint64_t NibblesFromMask(D d, MFromD mask) { return nib & ((1ull << (d.MaxBytes() * 4)) - 1); } -// Returns the lowest N for the BitsFromMask result. -template -constexpr uint64_t OnlyActive(uint64_t bits) { - return ((N * sizeof(T)) >= 8) ? 
bits : (bits & ((1ull << N) - 1)); -} - -} // namespace detail - -template -HWY_API uint64_t BitsFromMask(Mask128 mask) { +template +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128 mask) { alignas(16) static constexpr uint8_t kSliceLanes[16] = { 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80, 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80, }; @@ -8954,8 +8945,8 @@ HWY_API uint64_t BitsFromMask(Mask128 mask) { #endif } -template -HWY_API uint64_t BitsFromMask(Mask128 mask) { +template +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128 mask) { // Upper lanes of partial loads are undefined. OnlyActive will fix this if // we load all kSliceLanes so the upper lanes do not pollute the valid bits. alignas(8) static constexpr uint8_t kSliceLanes[8] = {1, 2, 4, 8, @@ -8966,17 +8957,17 @@ HWY_API uint64_t BitsFromMask(Mask128 mask) { const Vec128 values = BitCast(du, VecFromMask(d, mask)) & slice; #if HWY_ARCH_ARM_A64 - return detail::OnlyActive(vaddv_u8(values.raw)); + return vaddv_u8(values.raw); #else const uint16x4_t x2 = vpaddl_u8(values.raw); const uint32x2_t x4 = vpaddl_u16(x2); const uint64x1_t x8 = vpaddl_u32(x4); - return detail::OnlyActive(vget_lane_u64(x8, 0)); + return vget_lane_u64(x8, 0); #endif } -template -HWY_API uint64_t BitsFromMask(Mask128 mask) { +template +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, Mask128 mask) { alignas(16) static constexpr uint16_t kSliceLanes[8] = { 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80}; const Full128 d; @@ -8984,17 +8975,16 @@ HWY_API uint64_t BitsFromMask(Mask128 mask) { const Vec128 values = BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes); #if HWY_ARCH_ARM_A64 - return detail::OnlyActive(vaddvq_u16(values.raw)); + return vaddvq_u16(values.raw); #else const uint32x4_t x2 = vpaddlq_u16(values.raw); const uint64x2_t x4 = vpaddlq_u32(x2); - return detail::OnlyActive(vgetq_lane_u64(x4, 0) + - vgetq_lane_u64(x4, 1)); + return vgetq_lane_u64(x4, 0) + vgetq_lane_u64(x4, 1); #endif } -template -HWY_API uint64_t BitsFromMask(Mask128 mask) { +template +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, Mask128 mask) { // Upper lanes of partial loads are undefined. OnlyActive will fix this if // we load all kSliceLanes so the upper lanes do not pollute the valid bits. 
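// Each entry of kSliceLanes is a distinct power of two, so ANDing with the
// all-ones/all-zero mask lanes and then summing across lanes (vaddv on A64,
// or the pairwise-add chain below) yields the packed per-lane bits.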
alignas(8) static constexpr uint16_t kSliceLanes[4] = {1, 2, 4, 8}; @@ -9003,32 +8993,31 @@ HWY_API uint64_t BitsFromMask(Mask128 mask) { const Vec128 slice(Load(Full64(), kSliceLanes).raw); const Vec128 values = BitCast(du, VecFromMask(d, mask)) & slice; #if HWY_ARCH_ARM_A64 - return detail::OnlyActive(vaddv_u16(values.raw)); + return vaddv_u16(values.raw); #else const uint32x2_t x2 = vpaddl_u16(values.raw); const uint64x1_t x4 = vpaddl_u32(x2); - return detail::OnlyActive(vget_lane_u64(x4, 0)); + return vget_lane_u64(x4, 0); #endif } -template -HWY_API uint64_t BitsFromMask(Mask128 mask) { +template +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128 mask) { alignas(16) static constexpr uint32_t kSliceLanes[4] = {1, 2, 4, 8}; const Full128 d; const Full128 du; const Vec128 values = BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes); #if HWY_ARCH_ARM_A64 - return detail::OnlyActive(vaddvq_u32(values.raw)); + return vaddvq_u32(values.raw); #else const uint64x2_t x2 = vpaddlq_u32(values.raw); - return detail::OnlyActive(vgetq_lane_u64(x2, 0) + - vgetq_lane_u64(x2, 1)); + return vgetq_lane_u64(x2, 0) + vgetq_lane_u64(x2, 1); #endif } -template -HWY_API uint64_t BitsFromMask(Mask128 mask) { +template +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128 mask) { // Upper lanes of partial loads are undefined. OnlyActive will fix this if // we load all kSliceLanes so the upper lanes do not pollute the valid bits. alignas(8) static constexpr uint32_t kSliceLanes[2] = {1, 2}; @@ -9037,37 +9026,45 @@ HWY_API uint64_t BitsFromMask(Mask128 mask) { const Vec128 slice(Load(Full64(), kSliceLanes).raw); const Vec128 values = BitCast(du, VecFromMask(d, mask)) & slice; #if HWY_ARCH_ARM_A64 - return detail::OnlyActive(vaddv_u32(values.raw)); + return vaddv_u32(values.raw); #else const uint64x1_t x2 = vpaddl_u32(values.raw); - return detail::OnlyActive(vget_lane_u64(x2, 0)); + return vget_lane_u64(x2, 0); #endif } -template -HWY_API uint64_t BitsFromMask(Mask128 m) { +template +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128 m) { alignas(16) static constexpr uint64_t kSliceLanes[2] = {1, 2}; const Full128 d; const Full128 du; const Vec128 values = BitCast(du, VecFromMask(d, m)) & Load(du, kSliceLanes); #if HWY_ARCH_ARM_A64 - return detail::OnlyActive(vaddvq_u64(values.raw)); + return vaddvq_u64(values.raw); #else - return detail::OnlyActive(vgetq_lane_u64(values.raw, 0) + - vgetq_lane_u64(values.raw, 1)); + return vgetq_lane_u64(values.raw, 0) + vgetq_lane_u64(values.raw, 1); #endif } -template -HWY_API uint64_t BitsFromMask(Mask128 m) { +template +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128 m) { const Full64 d; const Full64 du; const Vec64 values = BitCast(du, VecFromMask(d, m)) & Set(du, 1); return vget_lane_u64(values.raw, 0); } -namespace detail { +// Returns the lowest N for the BitsFromMask result. +template +constexpr uint64_t OnlyActive(uint64_t bits) { + return ((N * sizeof(T)) >= 8) ? bits : (bits & ((1ull << N) - 1)); +} + +template +HWY_INLINE uint64_t BitsFromMask(Mask128 mask) { + return OnlyActive(BitsFromMask(hwy::SizeTag(), mask)); +} // Returns number of lanes whose mask is set. // @@ -9187,7 +9184,7 @@ HWY_API intptr_t FindLastTrue(D d, MFromD mask) { // `p` points to at least 8 writable bytes. 
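// Usage sketch, assuming a caller-provided tag d and two vectors of the same
// type; PackLtBits is illustrative only, not a Highway API.
template <class D>
size_t PackLtBits(D d, VFromD<D> a, VFromD<D> b, uint8_t* HWY_RESTRICT bits) {
  // One bit per lane of (a[i] < b[i]); lane 0 lands in the least-significant
  // bit of bits[0], and the return value is the number of bytes written.
  return StoreMaskBits(d, Lt(a, b), bits);
}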
template HWY_API size_t StoreMaskBits(D d, MFromD mask, uint8_t* bits) { - const uint64_t mask_bits = BitsFromMask(mask); + const uint64_t mask_bits = detail::BitsFromMask(mask); const size_t kNumBytes = (d.MaxLanes() + 7) / 8; CopyBytes(&mask_bits, bits); return kNumBytes; @@ -9675,7 +9672,7 @@ HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { // General case, 2 or 4 byte lanes template HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { - return detail::Compress(v, BitsFromMask(mask)); + return detail::Compress(v, detail::BitsFromMask(mask)); } // Single lane: no-op @@ -9702,9 +9699,9 @@ HWY_API Vec128 CompressNot(Vec128 v, Mask128 mask) { // For partial vectors, we cannot pull the Not() into the table because // BitsFromMask clears the upper bits. if (N < 16 / sizeof(T)) { - return detail::Compress(v, BitsFromMask(Not(mask))); + return detail::Compress(v, detail::BitsFromMask(Not(mask))); } - return detail::CompressNot(v, BitsFromMask(mask)); + return detail::CompressNot(v, detail::BitsFromMask(mask)); } // ------------------------------ CompressBlocksNot @@ -9732,7 +9729,7 @@ HWY_INLINE Vec128 CompressBits(Vec128 v, template HWY_API size_t CompressStore(VFromD v, MFromD mask, D d, TFromD* HWY_RESTRICT unaligned) { - const uint64_t mask_bits = BitsFromMask(mask); + const uint64_t mask_bits = detail::BitsFromMask(mask); StoreU(detail::Compress(v, mask_bits), d, unaligned); return PopCount(mask_bits); } @@ -9742,7 +9739,7 @@ template HWY_API size_t CompressBlendedStore(VFromD v, MFromD m, D d, TFromD* HWY_RESTRICT unaligned) { const RebindToUnsigned du; // so we can support fp16/bf16 - const uint64_t mask_bits = BitsFromMask(m); + const uint64_t mask_bits = detail::BitsFromMask(m); const size_t count = PopCount(mask_bits); const MFromD store_mask = RebindMask(d, FirstN(du, count)); const VFromD compressed = diff --git a/hwy/ops/emu128-inl.h b/hwy/ops/emu128-inl.h index d19f7cc168..5c5ed98799 100644 --- a/hwy/ops/emu128-inl.h +++ b/hwy/ops/emu128-inl.h @@ -386,15 +386,6 @@ VFromD VecFromMask(D /* tag */, MFromD mask) { return v; } -template -uint64_t BitsFromMask(Mask128 mask) { - uint64_t bits = 0; - for (size_t i = 0; i < N; ++i) { - bits |= mask.bits[i] ? 
(1ull << i) : 0; - } - return bits; -} - template HWY_API MFromD FirstN(D d, size_t n) { MFromD m; diff --git a/hwy/ops/generic_ops-inl.h b/hwy/ops/generic_ops-inl.h index 940c2db957..766c6c9d2e 100644 --- a/hwy/ops/generic_ops-inl.h +++ b/hwy/ops/generic_ops-inl.h @@ -5599,6 +5599,13 @@ HWY_API V CompressNot(V v, M mask) { namespace detail { +#if HWY_IDE +template +HWY_INLINE uint64_t BitsFromMask(M /* mask */) { + return 0; +} +#endif // HWY_IDE + template HWY_INLINE Vec128 IndicesForExpandFromBits(uint64_t mask_bits) { static_assert(N <= 8, "Should only be called for half-vectors"); @@ -5872,7 +5879,7 @@ template HWY_API Vec128 Expand(Vec128 v, Mask128 mask) { const DFromV d; - const uint64_t mask_bits = BitsFromMask(mask); + const uint64_t mask_bits = detail::BitsFromMask(mask); const Vec128 indices = detail::IndicesForExpandFromBits(mask_bits); return BitCast(d, TableLookupBytesOr0(v, indices)); @@ -5886,7 +5893,7 @@ HWY_API Vec128 Expand(Vec128 v, Mask128 mask) { const Half duh; const Vec128 vu = BitCast(du, v); - const uint64_t mask_bits = BitsFromMask(mask); + const uint64_t mask_bits = detail::BitsFromMask(mask); const uint64_t maskL = mask_bits & 0xFF; const uint64_t maskH = mask_bits >> 8; @@ -5918,7 +5925,7 @@ HWY_API Vec128 Expand(Vec128 v, Mask128 mask) { const RebindToUnsigned du; const Rebind du8; - const uint64_t mask_bits = BitsFromMask(mask); + const uint64_t mask_bits = detail::BitsFromMask(mask); // Storing as 8-bit reduces table size from 4 KiB to 2 KiB. We cannot apply // the nibble trick used below because not all indices fit within one lane. @@ -6200,7 +6207,7 @@ HWY_API Vec128 Expand(Vec128 v, Mask128 mask) { const DFromV d; const RebindToUnsigned du; - const uint64_t mask_bits = BitsFromMask(mask); + const uint64_t mask_bits = detail::BitsFromMask(mask); alignas(16) static constexpr uint32_t packed_array[16] = { // PrintExpand64x4Nibble - same for 32x4. @@ -7358,16 +7365,6 @@ HWY_API auto Le(V a, V b) -> decltype(a == b) { #undef HWY_GENERIC_IF_EMULATED_D -// TODO: remove once callers are updated. -#if !HWY_HAVE_SCALABLE && HWY_MAX_BYTES <= 64 -namespace detail { -template -uint64_t BitsFromMask(M m) { - return hwy::HWY_NAMESPACE::BitsFromMask(m); -} -} // namespace detail -#endif // !HWY_HAVE_SCALABLE && HWY_MAX_BYTES <= 64 - // NOLINTNEXTLINE(google-readability-namespace-comments) } // namespace HWY_NAMESPACE } // namespace hwy diff --git a/hwy/ops/ppc_vsx-inl.h b/hwy/ops/ppc_vsx-inl.h index f8884bb7fc..d216c54853 100644 --- a/hwy/ops/ppc_vsx-inl.h +++ b/hwy/ops/ppc_vsx-inl.h @@ -5222,12 +5222,6 @@ HWY_API MFromD Dup128MaskFromMaskBits(D d, unsigned mask_bits) { namespace detail { -// Returns the lowest N of the mask bits. -template -constexpr uint64_t OnlyActive(uint64_t mask_bits) { - return ((N * sizeof(T)) == 16) ? 
mask_bits : mask_bits & ((1ull << N) - 1); -} - #if !HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN // fallback for missing vec_extractm template @@ -5248,27 +5242,23 @@ HWY_INLINE uint64_t ExtractSignBits(Vec128 sign_bits, #endif // !HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN -} // namespace detail - -template -HWY_API uint64_t BitsFromMask(Mask128 mask) { +template +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128 mask) { const DFromM d; const Repartition du8; const VFromD sign_bits = BitCast(du8, VecFromMask(d, mask)); #if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN - return detail::OnlyActive( - static_cast(vec_extractm(sign_bits.raw))); + return static_cast(vec_extractm(sign_bits.raw)); #else // Z14, Z15, PPC8, PPC9, or big-endian PPC10 const __vector unsigned char kBitShuffle = {120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0}; - return detail::OnlyActive( - detail::ExtractSignBits(sign_bits, kBitShuffle)); + return ExtractSignBits(sign_bits, kBitShuffle); #endif // HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN } -template -HWY_API uint64_t BitsFromMask(Mask128 mask) { +template +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, Mask128 mask) { const DFromM d; const RebindToUnsigned du; @@ -5276,8 +5266,7 @@ HWY_API uint64_t BitsFromMask(Mask128 mask) { const VFromD sign_bits = BitCast(du8, VecFromMask(d, mask)); #if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN - return detail::OnlyActive( - static_cast(vec_extractm(BitCast(du, sign_bits).raw))); + return static_cast(vec_extractm(BitCast(du, sign_bits).raw)); #else // Z14, Z15, PPC8, PPC9, or big-endian PPC10 (void)du; #if HWY_IS_LITTLE_ENDIAN @@ -5287,13 +5276,12 @@ HWY_API uint64_t BitsFromMask(Mask128 mask) { const __vector unsigned char kBitShuffle = { 128, 128, 128, 128, 128, 128, 128, 128, 112, 96, 80, 64, 48, 32, 16, 0}; #endif - return detail::OnlyActive( - detail::ExtractSignBits(sign_bits, kBitShuffle)); + return ExtractSignBits(sign_bits, kBitShuffle); #endif // HWY_PPC_HAVE_10 } -template -HWY_API uint64_t BitsFromMask(Mask128 mask) { +template +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128 mask) { const DFromM d; const RebindToUnsigned du; @@ -5301,8 +5289,7 @@ HWY_API uint64_t BitsFromMask(Mask128 mask) { const VFromD sign_bits = BitCast(du8, VecFromMask(d, mask)); #if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN - return detail::OnlyActive( - static_cast(vec_extractm(BitCast(du, sign_bits).raw))); + return static_cast(vec_extractm(BitCast(du, sign_bits).raw)); #else // Z14, Z15, PPC8, PPC9, or big-endian PPC10 (void)du; #if HWY_IS_LITTLE_ENDIAN @@ -5314,13 +5301,12 @@ HWY_API uint64_t BitsFromMask(Mask128 mask) { 128, 128, 128, 128, 128, 128, 96, 64, 32, 0}; #endif - return detail::OnlyActive( - detail::ExtractSignBits(sign_bits, kBitShuffle)); + return ExtractSignBits(sign_bits, kBitShuffle); #endif // HWY_PPC_HAVE_10 } -template -HWY_API uint64_t BitsFromMask(Mask128 mask) { +template +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128 mask) { const DFromM d; const RebindToUnsigned du; @@ -5328,8 +5314,7 @@ HWY_API uint64_t BitsFromMask(Mask128 mask) { const VFromD sign_bits = BitCast(du8, VecFromMask(d, mask)); #if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN - return detail::OnlyActive( - static_cast(vec_extractm(BitCast(du, sign_bits).raw))); + return static_cast(vec_extractm(BitCast(du, sign_bits).raw)); #else // Z14, Z15, PPC8, PPC9, or big-endian PPC10 (void)du; #if HWY_IS_LITTLE_ENDIAN @@ -5341,23 +5326,35 @@ HWY_API uint64_t BitsFromMask(Mask128 mask) { 128, 128, 128, 128, 
128, 128, 128, 128, 64, 0}; #endif - return detail::OnlyActive( - detail::ExtractSignBits(sign_bits, kBitShuffle)); + return ExtractSignBits(sign_bits, kBitShuffle); #endif // HWY_PPC_HAVE_10 } +// Returns the lowest N of the mask bits. +template +constexpr uint64_t OnlyActive(uint64_t mask_bits) { + return ((N * sizeof(T)) == 16) ? mask_bits : mask_bits & ((1ull << N) - 1); +} + +template +HWY_INLINE uint64_t BitsFromMask(Mask128 mask) { + return OnlyActive(BitsFromMask(hwy::SizeTag(), mask)); +} + +} // namespace detail + // `p` points to at least 8 writable bytes. template HWY_API size_t StoreMaskBits(D /*d*/, MFromD mask, uint8_t* bits) { // For vectors with 8 or fewer lanes, simply cast the result of BitsFromMask // to an uint8_t and store the result in bits[0]. - bits[0] = static_cast(BitsFromMask(mask)); + bits[0] = static_cast(detail::BitsFromMask(mask)); return sizeof(uint8_t); } template HWY_API size_t StoreMaskBits(D /*d*/, MFromD mask, uint8_t* bits) { - const auto mask_bits = BitsFromMask(mask); + const auto mask_bits = detail::BitsFromMask(mask); // First convert mask_bits to a uint16_t as we only want to store // the lower 16 bits of mask_bits as there are 16 lanes in mask. @@ -5423,7 +5420,7 @@ HWY_API bool AllTrue(D d, MFromD mask) { template HWY_API size_t CountTrue(D /* tag */, MFromD mask) { - return PopCount(BitsFromMask(mask)); + return PopCount(detail::BitsFromMask(mask)); } #if HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN) @@ -5471,7 +5468,7 @@ HWY_API size_t FindKnownFirstTrue(D d, MFromD mask) { } #endif // HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN) (void)d; - return Num0BitsBelowLS1Bit_Nonzero64(BitsFromMask(mask)); + return Num0BitsBelowLS1Bit_Nonzero64(detail::BitsFromMask(mask)); } template > @@ -5487,7 +5484,7 @@ HWY_API intptr_t FindFirstTrue(D d, MFromD mask) { } #endif // HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN) (void)d; - const uint64_t mask_bits = BitsFromMask(mask); + const uint64_t mask_bits = detail::BitsFromMask(mask); return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero64(mask_bits)) : -1; } @@ -5503,7 +5500,7 @@ HWY_API size_t FindKnownLastTrue(D d, MFromD mask) { } #endif // HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN) (void)d; - return 63 - Num0BitsAboveMS1Bit_Nonzero64(BitsFromMask(mask)); + return 63 - Num0BitsAboveMS1Bit_Nonzero64(detail::BitsFromMask(mask)); } template > @@ -5519,7 +5516,7 @@ HWY_API intptr_t FindLastTrue(D d, MFromD mask) { } #endif // HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN) (void)d; - const uint64_t mask_bits = BitsFromMask(mask); + const uint64_t mask_bits = detail::BitsFromMask(mask); return mask_bits ? intptr_t(63 - Num0BitsAboveMS1Bit_Nonzero64(mask_bits)) : -1; } @@ -6015,7 +6012,7 @@ HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { // General case, 2 or 4 bytes template HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { - return detail::CompressBits(v, BitsFromMask(mask)); + return detail::CompressBits(v, detail::BitsFromMask(mask)); } // ------------------------------ CompressNot @@ -6054,9 +6051,9 @@ HWY_API Vec128 CompressNot(Vec128 v, Mask128 mask) { // For partial vectors, we cannot pull the Not() into the table because // BitsFromMask clears the upper bits. 
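// BitsFromMask zeroes the bits for lanes >= N, so negating the packed bits
// would wrongly mark those nonexistent lanes as selected; instead, negate the
// mask itself and take the regular Compress path.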
if (N < 16 / sizeof(T)) { - return detail::CompressBits(v, BitsFromMask(Not(mask))); + return detail::CompressBits(v, detail::BitsFromMask(Not(mask))); } - return detail::CompressNotBits(v, BitsFromMask(mask)); + return detail::CompressNotBits(v, detail::BitsFromMask(mask)); } // ------------------------------ CompressBlocksNot @@ -6106,7 +6103,7 @@ HWY_API size_t CompressStore(VFromD v, MFromD m, D d, TFromD* HWY_RESTRICT unaligned) { const RebindToUnsigned du; - const uint64_t mask_bits = BitsFromMask(m); + const uint64_t mask_bits = detail::BitsFromMask(m); HWY_DASSERT(mask_bits < (1ull << MaxLanes(d))); const size_t count = PopCount(mask_bits); @@ -6133,7 +6130,7 @@ HWY_API size_t CompressBlendedStore(VFromD v, MFromD m, D d, TFromD* HWY_RESTRICT unaligned) { const RebindToUnsigned du; - const uint64_t mask_bits = BitsFromMask(m); + const uint64_t mask_bits = detail::BitsFromMask(m); HWY_DASSERT(mask_bits < (1ull << MaxLanes(d))); const size_t count = PopCount(mask_bits); diff --git a/hwy/ops/scalar-inl.h b/hwy/ops/scalar-inl.h index 283e55261e..a64faf9106 100644 --- a/hwy/ops/scalar-inl.h +++ b/hwy/ops/scalar-inl.h @@ -288,16 +288,18 @@ HWY_API Mask1 MaskFromVec(const Vec1 v) { template using MFromD = decltype(MaskFromVec(VFromD())); -template > -Vec1 VecFromMask(D /* tag */, const Mask1 mask) { +template +Vec1 VecFromMask(const Mask1 mask) { Vec1 v; CopySameSize(&mask, &v); return v; } -template -uint64_t BitsFromMask(Mask1 mask) { - return mask.bits ? 1 : 0; +template > +Vec1 VecFromMask(D /* tag */, const Mask1 mask) { + Vec1 v; + CopySameSize(&mask, &v); + return v; } template > diff --git a/hwy/ops/wasm_128-inl.h b/hwy/ops/wasm_128-inl.h index cf526cc283..39471d5239 100644 --- a/hwy/ops/wasm_128-inl.h +++ b/hwy/ops/wasm_128-inl.h @@ -4946,97 +4946,75 @@ HWY_API MFromD Dup128MaskFromMaskBits(D d, unsigned mask_bits) { namespace detail { -// Returns the lowest N bits for the BitsFromMask result. -template -constexpr uint64_t OnlyActive(uint64_t bits) { - return ((N * sizeof(T)) == 16) ? bits : bits & ((1ull << N) - 1); -} - -} // namespace detail - -template -HWY_API uint64_t BitsFromMask(const Mask128 mask) { +// Full +template +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, + const Mask128 mask) { alignas(16) uint64_t lanes[2]; wasm_v128_store(lanes, mask.raw); constexpr uint64_t kMagic = 0x103070F1F3F80ULL; const uint64_t lo = ((lanes[0] * kMagic) >> 56); const uint64_t hi = ((lanes[1] * kMagic) >> 48) & 0xFF00; - return hi + lo; // exactly 16 bits, no OnlyActive required + return (hi + lo); } -template -HWY_API uint64_t BitsFromMask(const Mask128 mask) { +// 64-bit +template +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, + const Mask128 mask) { constexpr uint64_t kMagic = 0x103070F1F3F80ULL; - const uint64_t bytes = - static_cast(wasm_i64x2_extract_lane(mask.raw, 0)); - return (bytes * kMagic) >> 56; // exactly 8 bits, no OnlyActive required + return (static_cast(wasm_i64x2_extract_lane(mask.raw, 0)) * + kMagic) >> + 56; } // 32-bit or less: need masking -template -HWY_API uint64_t BitsFromMask(const Mask128 mask) { +template +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, + const Mask128 mask) { uint64_t bytes = static_cast(wasm_i64x2_extract_lane(mask.raw, 0)); // Clear potentially undefined bytes. 
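// Only the low N bytes of the extracted lane are defined; after masking them
// off, the multiply-shift by kMagic below gathers the top bit of each
// remaining 0x00/0xFF byte into the low bits of the result.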
bytes &= (1ULL << (N * 8)) - 1; constexpr uint64_t kMagic = 0x103070F1F3F80ULL; - return detail::OnlyActive((bytes * kMagic) >> 56); + return (bytes * kMagic) >> 56; } -template -HWY_API uint64_t BitsFromMask(const Mask128 mask) { +template +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, + const Mask128 mask) { // Remove useless lower half of each u16 while preserving the sign bit. const __i16x8 zero = wasm_i16x8_splat(0); const Mask128 mask8{wasm_i8x16_narrow_i16x8(mask.raw, zero)}; - return detail::OnlyActive(BitsFromMask(mask8)); + return BitsFromMask(hwy::SizeTag<1>(), mask8); } -template -HWY_API uint64_t BitsFromMask(const Mask128 mask) { +template +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, + const Mask128 mask) { const __i32x4 mask_i = static_cast<__i32x4>(mask.raw); const __i32x4 slice = wasm_i32x4_make(1, 2, 4, 8); const __i32x4 sliced_mask = wasm_v128_and(mask_i, slice); alignas(16) uint32_t lanes[4]; wasm_v128_store(lanes, sliced_mask); - return detail::OnlyActive(lanes[0] | lanes[1] | lanes[2] | lanes[3]); + return lanes[0] | lanes[1] | lanes[2] | lanes[3]; } -template -HWY_API uint64_t BitsFromMask(const Mask128 mask) { +template +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, + const Mask128 mask) { const __i64x2 mask_i = static_cast<__i64x2>(mask.raw); const __i64x2 slice = wasm_i64x2_make(1, 2); const __i64x2 sliced_mask = wasm_v128_and(mask_i, slice); alignas(16) uint64_t lanes[2]; wasm_v128_store(lanes, sliced_mask); - return detail::OnlyActive(lanes[0] | lanes[1]); -} - -namespace detail { - -template -HWY_INLINE size_t CountTrue(hwy::SizeTag<1> /*tag*/, const Mask128 m) { - return PopCount(BitsFromMask(m)); -} - -template -HWY_INLINE size_t CountTrue(hwy::SizeTag<2> /*tag*/, const Mask128 m) { - return PopCount(BitsFromMask(m)); -} - -template -HWY_INLINE size_t CountTrue(hwy::SizeTag<4> /*tag*/, const Mask128 m) { - const __i32x4 var_shift = wasm_i32x4_make(1, 2, 4, 8); - const __i32x4 shifted_bits = wasm_v128_and(m.raw, var_shift); - alignas(16) uint64_t lanes[2]; - wasm_v128_store(lanes, shifted_bits); - return PopCount(lanes[0] | lanes[1]); + return lanes[0] | lanes[1]; } -template -HWY_INLINE size_t CountTrue(hwy::SizeTag<8> /*tag*/, const Mask128 m) { - alignas(16) int64_t lanes[2]; - wasm_v128_store(lanes, m.raw); - return static_cast(-(lanes[0] + lanes[1])); +// Returns the lowest N bits for the BitsFromMask result. +template +constexpr uint64_t OnlyActive(uint64_t bits) { + return ((N * sizeof(T)) == 16) ? bits : bits & ((1ull << N) - 1); } // Returns 0xFF for bytes with index >= N, otherwise 0. 
@@ -5069,12 +5047,43 @@ constexpr __i8x16 BytesAbove() { : wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1); } +template +HWY_INLINE uint64_t BitsFromMask(const Mask128 mask) { + return OnlyActive(BitsFromMask(hwy::SizeTag(), mask)); +} + +template +HWY_INLINE size_t CountTrue(hwy::SizeTag<1> tag, const Mask128 m) { + return PopCount(BitsFromMask(tag, m)); +} + +template +HWY_INLINE size_t CountTrue(hwy::SizeTag<2> tag, const Mask128 m) { + return PopCount(BitsFromMask(tag, m)); +} + +template +HWY_INLINE size_t CountTrue(hwy::SizeTag<4> /*tag*/, const Mask128 m) { + const __i32x4 var_shift = wasm_i32x4_make(1, 2, 4, 8); + const __i32x4 shifted_bits = wasm_v128_and(m.raw, var_shift); + alignas(16) uint64_t lanes[2]; + wasm_v128_store(lanes, shifted_bits); + return PopCount(lanes[0] | lanes[1]); +} + +template +HWY_INLINE size_t CountTrue(hwy::SizeTag<8> /*tag*/, const Mask128 m) { + alignas(16) int64_t lanes[2]; + wasm_v128_store(lanes, m.raw); + return static_cast(-(lanes[0] + lanes[1])); +} + } // namespace detail // `p` points to at least 8 writable bytes. template HWY_API size_t StoreMaskBits(D d, const MFromD mask, uint8_t* bits) { - const uint64_t mask_bits = BitsFromMask(mask); + const uint64_t mask_bits = detail::BitsFromMask(mask); const size_t kNumBytes = (d.MaxLanes() + 7) / 8; CopyBytes(&mask_bits, bits); return kNumBytes; @@ -5145,25 +5154,25 @@ HWY_API bool AllTrue(D d, const MFromD m) { template HWY_API size_t FindKnownFirstTrue(D /* tag */, const MFromD mask) { - const uint32_t bits = static_cast(BitsFromMask(mask)); + const uint32_t bits = static_cast(detail::BitsFromMask(mask)); return Num0BitsBelowLS1Bit_Nonzero32(bits); } template HWY_API intptr_t FindFirstTrue(D /* tag */, const MFromD mask) { - const uint32_t bits = static_cast(BitsFromMask(mask)); + const uint32_t bits = static_cast(detail::BitsFromMask(mask)); return bits ? static_cast(Num0BitsBelowLS1Bit_Nonzero32(bits)) : -1; } template HWY_API size_t FindKnownLastTrue(D /* tag */, const MFromD mask) { - const uint32_t bits = static_cast(BitsFromMask(mask)); + const uint32_t bits = static_cast(detail::BitsFromMask(mask)); return 31 - Num0BitsAboveMS1Bit_Nonzero32(bits); } template HWY_API intptr_t FindLastTrue(D /* tag */, const MFromD mask) { - const uint32_t bits = static_cast(BitsFromMask(mask)); + const uint32_t bits = static_cast(detail::BitsFromMask(mask)); return bits ? (31 - static_cast(Num0BitsAboveMS1Bit_Nonzero32(bits))) : -1; @@ -5609,7 +5618,7 @@ HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { // General case, 2 or 4 byte lanes template HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { - return detail::Compress(v, BitsFromMask(mask)); + return detail::Compress(v, detail::BitsFromMask(mask)); } // Single lane: no-op @@ -5636,9 +5645,9 @@ HWY_API Vec128 CompressNot(Vec128 v, Mask128 mask) { // For partial vectors, we cannot pull the Not() into the table because // BitsFromMask clears the upper bits. 
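// CompressNot(v, m) must match Compress(v, Not(m)); for partial vectors this
// is done literally, by negating the mask before packing its bits.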
if (N < 16 / sizeof(T)) { - return detail::Compress(v, BitsFromMask(Not(mask))); + return detail::Compress(v, detail::BitsFromMask(Not(mask))); } - return detail::CompressNot(v, BitsFromMask(mask)); + return detail::CompressNot(v, detail::BitsFromMask(mask)); } // ------------------------------ CompressBlocksNot @@ -5665,7 +5674,7 @@ HWY_API Vec128 CompressBits(Vec128 v, template HWY_API size_t CompressStore(VFromD v, MFromD mask, D d, TFromD* HWY_RESTRICT unaligned) { - const uint64_t mask_bits = BitsFromMask(mask); + const uint64_t mask_bits = detail::BitsFromMask(mask); const auto c = detail::Compress(v, mask_bits); StoreU(c, d, unaligned); return PopCount(mask_bits); @@ -5676,7 +5685,7 @@ template HWY_API size_t CompressBlendedStore(VFromD v, MFromD m, D d, TFromD* HWY_RESTRICT unaligned) { const RebindToUnsigned du; // so we can support fp16/bf16 - const uint64_t mask_bits = BitsFromMask(m); + const uint64_t mask_bits = detail::BitsFromMask(m); const size_t count = PopCount(mask_bits); const VFromD compressed = detail::Compress(BitCast(du, v), mask_bits); diff --git a/hwy/ops/wasm_256-inl.h b/hwy/ops/wasm_256-inl.h index b3b3d50537..aab7105e36 100644 --- a/hwy/ops/wasm_256-inl.h +++ b/hwy/ops/wasm_256-inl.h @@ -657,13 +657,6 @@ HWY_API Vec256 VecFromMask(D d, Mask256 m) { return v; } -template -HWY_API uint64_t BitsFromMask(Mask256 m) { - const uint64_t lo = BitsFromMask(m.m0); - const uint64_t hi = BitsFromMask(m.m1); - return (hi << (16 / sizeof(T))) | lo; -} - // mask ? yes : no template HWY_API Vec256 IfThenElse(Mask256 mask, Vec256 yes, Vec256 no) { diff --git a/hwy/ops/x86_128-inl.h b/hwy/ops/x86_128-inl.h index 22aa9644c2..a863cd10c6 100644 --- a/hwy/ops/x86_128-inl.h +++ b/hwy/ops/x86_128-inl.h @@ -196,12 +196,15 @@ constexpr uint64_t OnlyActive(uint64_t mask_bits) { } // namespace detail #if HWY_TARGET <= HWY_AVX3 +namespace detail { +// Used by Expand() emulation, which is required for both AVX3 and AVX2. template HWY_INLINE uint64_t BitsFromMask(const Mask128 mask) { - return detail::OnlyActive(mask.raw); + return OnlyActive(mask.raw); } +} // namespace detail #endif // HWY_TARGET <= HWY_AVX3 template @@ -12597,7 +12600,7 @@ HWY_API size_t CompressBlendedStore(VFromD v, MFromD m, D d, #else // AVX2 or below -// ------------------------------ BitsFromMask +// ------------------------------ StoreMaskBits namespace detail { @@ -12605,48 +12608,50 @@ constexpr HWY_INLINE uint64_t U64FromInt(int mask_bits) { return static_cast(static_cast(mask_bits)); } -} // namespace detail - -template -HWY_API uint64_t BitsFromMask(const Mask128 mask) { +template +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, + const Mask128 mask) { const Simd d; const auto sign_bits = BitCast(d, VecFromMask(d, mask)).raw; - return detail::OnlyActive( - detail::U64FromInt(_mm_movemask_epi8(sign_bits))); + return U64FromInt(_mm_movemask_epi8(sign_bits)); } -template -HWY_API uint64_t BitsFromMask(const Mask128 mask) { +template +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, + const Mask128 mask) { // Remove useless lower half of each u16 while preserving the sign bit. 
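// packs_epi16 saturates, so 0xFFFF lanes become 0xFF and zero lanes stay
// 0x00; movemask_epi8 on the packed result then yields one bit per u16 lane.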
const auto sign_bits = _mm_packs_epi16(mask.raw, _mm_setzero_si128()); - return detail::OnlyActive( - detail::U64FromInt(_mm_movemask_epi8(sign_bits))); + return U64FromInt(_mm_movemask_epi8(sign_bits)); } -template -HWY_API uint64_t BitsFromMask(Mask128 mask) { +template +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128 mask) { const Simd d; const Simd df; const auto sign_bits = BitCast(df, VecFromMask(d, mask)); - return detail::OnlyActive( - detail::U64FromInt(_mm_movemask_ps(sign_bits.raw))); + return U64FromInt(_mm_movemask_ps(sign_bits.raw)); } -template -HWY_API uint64_t BitsFromMask(Mask128 mask) { +template +HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128 mask) { const Simd d; const Simd df; const auto sign_bits = BitCast(df, VecFromMask(d, mask)); - return detail::OnlyActive( - detail::U64FromInt(_mm_movemask_pd(sign_bits.raw))); + return U64FromInt(_mm_movemask_pd(sign_bits.raw)); } -// ------------------------------ StoreMaskBits +template +HWY_INLINE uint64_t BitsFromMask(const Mask128 mask) { + return OnlyActive(BitsFromMask(hwy::SizeTag(), mask)); +} + +} // namespace detail + // `p` points to at least 8 writable bytes. template HWY_API size_t StoreMaskBits(D d, MFromD mask, uint8_t* bits) { constexpr size_t kNumBytes = (MaxLanes(d) + 7) / 8; - const uint64_t mask_bits = BitsFromMask(mask); + const uint64_t mask_bits = detail::BitsFromMask(mask); CopyBytes(&mask_bits, bits); return kNumBytes; } @@ -12656,41 +12661,41 @@ HWY_API size_t StoreMaskBits(D d, MFromD mask, uint8_t* bits) { template HWY_API bool AllFalse(D /* tag */, MFromD mask) { // Cheaper than PTEST, which is 2 uop / 3L. - return BitsFromMask(mask) == 0; + return detail::BitsFromMask(mask) == 0; } template HWY_API bool AllTrue(D d, MFromD mask) { constexpr uint64_t kAllBits = (1ull << MaxLanes(d)) - 1; - return BitsFromMask(mask) == kAllBits; + return detail::BitsFromMask(mask) == kAllBits; } template HWY_API size_t CountTrue(D /* tag */, MFromD mask) { - return PopCount(BitsFromMask(mask)); + return PopCount(detail::BitsFromMask(mask)); } template HWY_API size_t FindKnownFirstTrue(D /* tag */, MFromD mask) { return Num0BitsBelowLS1Bit_Nonzero32( - static_cast(BitsFromMask(mask))); + static_cast(detail::BitsFromMask(mask))); } template HWY_API intptr_t FindFirstTrue(D /* tag */, MFromD mask) { - const uint32_t mask_bits = static_cast(BitsFromMask(mask)); + const uint32_t mask_bits = static_cast(detail::BitsFromMask(mask)); return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask_bits)) : -1; } template HWY_API size_t FindKnownLastTrue(D /* tag */, MFromD mask) { return 31 - Num0BitsAboveMS1Bit_Nonzero32( - static_cast(BitsFromMask(mask))); + static_cast(detail::BitsFromMask(mask))); } template HWY_API intptr_t FindLastTrue(D /* tag */, MFromD mask) { - const uint32_t mask_bits = static_cast(BitsFromMask(mask)); + const uint32_t mask_bits = static_cast(detail::BitsFromMask(mask)); return mask_bits ? intptr_t(31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits)) : -1; } @@ -13132,7 +13137,7 @@ HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { // General case, 2 or 4 bytes template HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { - return detail::CompressBits(v, BitsFromMask(mask)); + return detail::CompressBits(v, detail::BitsFromMask(mask)); } // ------------------------------ CompressNot @@ -13160,9 +13165,9 @@ HWY_API Vec128 CompressNot(Vec128 v, Mask128 mask) { // For partial vectors, we cannot pull the Not() into the table because // BitsFromMask clears the upper bits. 
if (N < 16 / sizeof(T)) { - return detail::CompressBits(v, BitsFromMask(Not(mask))); + return detail::CompressBits(v, detail::BitsFromMask(Not(mask))); } - return detail::CompressNotBits(v, BitsFromMask(mask)); + return detail::CompressNotBits(v, detail::BitsFromMask(mask)); } // ------------------------------ CompressBlocksNot @@ -13191,7 +13196,7 @@ HWY_API size_t CompressStore(VFromD v, MFromD m, D d, TFromD* HWY_RESTRICT unaligned) { const RebindToUnsigned du; - const uint64_t mask_bits = BitsFromMask(m); + const uint64_t mask_bits = detail::BitsFromMask(m); HWY_DASSERT(mask_bits < (1ull << MaxLanes(d))); const size_t count = PopCount(mask_bits); @@ -13208,7 +13213,7 @@ HWY_API size_t CompressBlendedStore(VFromD v, MFromD m, D d, TFromD* HWY_RESTRICT unaligned) { const RebindToUnsigned du; - const uint64_t mask_bits = BitsFromMask(m); + const uint64_t mask_bits = detail::BitsFromMask(m); HWY_DASSERT(mask_bits < (1ull << MaxLanes(d))); const size_t count = PopCount(mask_bits); diff --git a/hwy/ops/x86_256-inl.h b/hwy/ops/x86_256-inl.h index be51787ce3..df09c052c7 100644 --- a/hwy/ops/x86_256-inl.h +++ b/hwy/ops/x86_256-inl.h @@ -165,12 +165,15 @@ struct Mask256 { #endif // AVX2 #if HWY_TARGET <= HWY_AVX3 +namespace detail { +// Used by Expand() emulation, which is required for both AVX3 and AVX2. template HWY_INLINE uint64_t BitsFromMask(const Mask256 mask) { return mask.raw; } +} // namespace detail #endif // HWY_TARGET <= HWY_AVX3 template @@ -7729,10 +7732,12 @@ HWY_API MFromD LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) { return detail::LoadMaskBits256>(mask_bits); } -// ------------------------------ BitsFromMask +// ------------------------------ StoreMaskBits + +namespace detail { template -HWY_API uint64_t BitsFromMask(const Mask256 mask) { +HWY_INLINE uint64_t BitsFromMask(const Mask256 mask) { const Full256 d; const Full256 d8; const auto sign_bits = BitCast(d8, VecFromMask(d, mask)).raw; @@ -7741,7 +7746,7 @@ HWY_API uint64_t BitsFromMask(const Mask256 mask) { } template -HWY_API uint64_t BitsFromMask(const Mask256 mask) { +HWY_INLINE uint64_t BitsFromMask(const Mask256 mask) { #if !defined(HWY_DISABLE_BMI2_FMA) && !defined(HWY_DISABLE_PEXT_ON_AVX2) const Full256 d; const Full256 d8; @@ -7763,7 +7768,7 @@ HWY_API uint64_t BitsFromMask(const Mask256 mask) { } template -HWY_API uint64_t BitsFromMask(const Mask256 mask) { +HWY_INLINE uint64_t BitsFromMask(const Mask256 mask) { const Full256 d; const Full256 df; const auto sign_bits = BitCast(df, VecFromMask(d, mask)).raw; @@ -7771,21 +7776,22 @@ HWY_API uint64_t BitsFromMask(const Mask256 mask) { } template -HWY_API uint64_t BitsFromMask(const Mask256 mask) { +HWY_INLINE uint64_t BitsFromMask(const Mask256 mask) { const Full256 d; const Full256 df; const auto sign_bits = BitCast(df, VecFromMask(d, mask)).raw; return static_cast(_mm256_movemask_pd(sign_bits)); } -// ------------------------------ StoreMaskBits +} // namespace detail + // `p` points to at least 8 writable bytes. 
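// Aside: a scalar model (not Highway code; assumes little-endian byte order,
// as on x86) of what StoreMaskBits below produces: (N + 7) / 8 bytes with
// lane i's mask bit at bit position i, lane 0 in the least-significant bit of
// byte 0.
#include <cstddef>
#include <cstdint>
#include <cstring>
inline size_t StoreMaskBitsDemo(uint64_t mask_bits, size_t num_lanes,
                                uint8_t* bits) {
  const size_t num_bytes = (num_lanes + 7) / 8;
  std::memcpy(bits, &mask_bits, num_bytes);  // little-endian copy of low bytes
  return num_bytes;
}
// Example: 10 lanes with lanes 0, 3 and 9 true -> 2 bytes: 0x09, 0x02.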
template HWY_API size_t StoreMaskBits(D d, MFromD mask, uint8_t* bits) { constexpr size_t N = Lanes(d); constexpr size_t kNumBytes = (N + 7) / 8; - const uint64_t mask_bits = BitsFromMask(mask); + const uint64_t mask_bits = detail::BitsFromMask(mask); CopyBytes(&mask_bits, bits); return kNumBytes; } @@ -7798,59 +7804,59 @@ template HWY_API bool AllFalse(D d, MFromD mask) { const Repartition d8; const Mask256 mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask))); - return BitsFromMask(mask8) == 0; + return detail::BitsFromMask(mask8) == 0; } template HWY_API bool AllFalse(D /* tag */, MFromD mask) { // Cheaper than PTEST, which is 2 uop / 3L. - return BitsFromMask(mask) == 0; + return detail::BitsFromMask(mask) == 0; } template HWY_API bool AllTrue(D d, MFromD mask) { const Repartition d8; const Mask256 mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask))); - return BitsFromMask(mask8) == (1ull << 32) - 1; + return detail::BitsFromMask(mask8) == (1ull << 32) - 1; } template HWY_API bool AllTrue(D d, MFromD mask) { constexpr uint64_t kAllBits = (1ull << Lanes(d)) - 1; - return BitsFromMask(mask) == kAllBits; + return detail::BitsFromMask(mask) == kAllBits; } template HWY_API size_t CountTrue(D d, MFromD mask) { const Repartition d8; const Mask256 mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask))); - return PopCount(BitsFromMask(mask8)) >> 1; + return PopCount(detail::BitsFromMask(mask8)) >> 1; } template HWY_API size_t CountTrue(D /* tag */, MFromD mask) { - return PopCount(BitsFromMask(mask)); + return PopCount(detail::BitsFromMask(mask)); } template HWY_API size_t FindKnownFirstTrue(D /* tag */, MFromD mask) { - const uint32_t mask_bits = static_cast(BitsFromMask(mask)); + const uint32_t mask_bits = static_cast(detail::BitsFromMask(mask)); return Num0BitsBelowLS1Bit_Nonzero32(mask_bits); } template HWY_API intptr_t FindFirstTrue(D /* tag */, MFromD mask) { - const uint32_t mask_bits = static_cast(BitsFromMask(mask)); + const uint32_t mask_bits = static_cast(detail::BitsFromMask(mask)); return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask_bits)) : -1; } template HWY_API size_t FindKnownLastTrue(D /* tag */, MFromD mask) { - const uint32_t mask_bits = static_cast(BitsFromMask(mask)); + const uint32_t mask_bits = static_cast(detail::BitsFromMask(mask)); return 31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits); } template HWY_API intptr_t FindLastTrue(D /* tag */, MFromD mask) { - const uint32_t mask_bits = static_cast(BitsFromMask(mask)); + const uint32_t mask_bits = static_cast(detail::BitsFromMask(mask)); return mask_bits ? 
intptr_t(31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits)) : -1; } @@ -8103,12 +8109,12 @@ HWY_INLINE Vec256 CompressNot(Vec256 v, const uint64_t mask_bits) { template HWY_API Vec256 Compress(Vec256 v, Mask256 m) { - return detail::Compress(v, BitsFromMask(m)); + return detail::Compress(v, detail::BitsFromMask(m)); } template HWY_API Vec256 CompressNot(Vec256 v, Mask256 m) { - return detail::CompressNot(v, BitsFromMask(m)); + return detail::CompressNot(v, detail::BitsFromMask(m)); } HWY_API Vec256 CompressBlocksNot(Vec256 v, @@ -8136,7 +8142,7 @@ HWY_API Vec256 CompressBits(Vec256 v, const uint8_t* HWY_RESTRICT bits) { template HWY_API size_t CompressStore(VFromD v, MFromD m, D d, TFromD* HWY_RESTRICT unaligned) { - const uint64_t mask_bits = BitsFromMask(m); + const uint64_t mask_bits = detail::BitsFromMask(m); const size_t count = PopCount(mask_bits); StoreU(detail::Compress(v, mask_bits), d, unaligned); detail::MaybeUnpoison(unaligned, count); @@ -8147,7 +8153,7 @@ template HWY_API size_t CompressBlendedStore(VFromD v, MFromD m, D d, TFromD* HWY_RESTRICT unaligned) { - const uint64_t mask_bits = BitsFromMask(m); + const uint64_t mask_bits = detail::BitsFromMask(m); const size_t count = PopCount(mask_bits); const RebindToUnsigned du; @@ -8174,7 +8180,7 @@ HWY_API size_t CompressBlendedStore(VFromD v, MFromD m, D d, template HWY_API size_t CompressBlendedStore(VFromD v, MFromD m, D d, TFromD* HWY_RESTRICT unaligned) { - const uint64_t mask_bits = BitsFromMask(m); + const uint64_t mask_bits = detail::BitsFromMask(m); const size_t count = PopCount(mask_bits); const VFromD compressed = detail::Compress(v, mask_bits); @@ -8291,7 +8297,7 @@ HWY_API Vec256 Expand(Vec256 v, Mask256 mask) { // LUTs are infeasible for so many mask combinations, so Combine two // half-vector Expand. const Half dh; - const uint64_t mask_bits = BitsFromMask(mask); + const uint64_t mask_bits = detail::BitsFromMask(mask); constexpr size_t N = 32 / sizeof(T); const size_t countL = PopCount(mask_bits & ((1 << (N / 2)) - 1)); const Mask128 maskL = MaskFromVec(LowerHalf(VecFromMask(d, mask))); @@ -8345,7 +8351,7 @@ HWY_API Vec256 Expand(Vec256 v, Mask256 mask) { return BitCast(d, detail::NativeExpand(BitCast(du, v), mu)); #else const RebindToUnsigned du; - const uint64_t mask_bits = BitsFromMask(mask); + const uint64_t mask_bits = detail::BitsFromMask(mask); alignas(16) constexpr uint32_t packed_array[256] = { // PrintExpand32x8Nibble. @@ -8414,7 +8420,7 @@ HWY_API Vec256 Expand(Vec256 v, Mask256 mask) { return BitCast(d, detail::NativeExpand(BitCast(du, v), mu)); #else const RebindToUnsigned du; - const uint64_t mask_bits = BitsFromMask(mask); + const uint64_t mask_bits = detail::BitsFromMask(mask); alignas(16) constexpr uint64_t packed_array[16] = { // PrintExpand64x4Nibble. diff --git a/hwy/ops/x86_512-inl.h b/hwy/ops/x86_512-inl.h index 929e188664..c906b2e32c 100644 --- a/hwy/ops/x86_512-inl.h +++ b/hwy/ops/x86_512-inl.h @@ -2640,60 +2640,74 @@ HWY_API Mask512 operator<=(Vec512 a, Vec512 b) { // ------------------------------ Mask -template -HWY_API uint64_t BitsFromMask(const Mask512 mask) { - // OnlyActive is not required because we have at least 8 mask bits. 
- return mask.raw; -} +namespace detail { -template -HWY_API Mask512 MaskFromVec(Vec512 v) { +template +HWY_INLINE Mask512 MaskFromVec(hwy::SizeTag<1> /*tag*/, Vec512 v) { return Mask512{_mm512_movepi8_mask(v.raw)}; } -template -HWY_API Mask512 MaskFromVec(Vec512 v) { +template +HWY_INLINE Mask512 MaskFromVec(hwy::SizeTag<2> /*tag*/, Vec512 v) { return Mask512{_mm512_movepi16_mask(v.raw)}; } -template -HWY_API Mask512 MaskFromVec(Vec512 v) { +template +HWY_INLINE Mask512 MaskFromVec(hwy::SizeTag<4> /*tag*/, Vec512 v) { return Mask512{_mm512_movepi32_mask(v.raw)}; } -template -HWY_API Mask512 MaskFromVec(Vec512 v) { +template +HWY_INLINE Mask512 MaskFromVec(hwy::SizeTag<8> /*tag*/, Vec512 v) { return Mask512{_mm512_movepi64_mask(v.raw)}; } -template + +} // namespace detail + +template +HWY_API Mask512 MaskFromVec(Vec512 v) { + return detail::MaskFromVec(hwy::SizeTag(), v); +} +template HWY_API Mask512 MaskFromVec(Vec512 v) { const RebindToSigned> di; return Mask512{MaskFromVec(BitCast(di, v)).raw}; } -template -HWY_API Vec512 VecFromMask(Mask512 m) { - return Vec512{_mm512_movm_epi8(m.raw)}; +HWY_API Vec512 VecFromMask(Mask512 v) { + return Vec512{_mm512_movm_epi8(v.raw)}; } -template -HWY_API Vec512 VecFromMask(Mask512 m) { - return Vec512{_mm512_movm_epi16(m.raw)}; +HWY_API Vec512 VecFromMask(Mask512 v) { + return Vec512{_mm512_movm_epi8(v.raw)}; +} + +HWY_API Vec512 VecFromMask(Mask512 v) { + return Vec512{_mm512_movm_epi16(v.raw)}; +} +HWY_API Vec512 VecFromMask(Mask512 v) { + return Vec512{_mm512_movm_epi16(v.raw)}; } #if HWY_HAVE_FLOAT16 -HWY_API Vec512 VecFromMask(Mask512 m) { - return Vec512{_mm512_castsi512_ph(_mm512_movm_epi16(m.raw))}; +HWY_API Vec512 VecFromMask(Mask512 v) { + return Vec512{_mm512_castsi512_ph(_mm512_movm_epi16(v.raw))}; } #endif // HWY_HAVE_FLOAT16 -template -HWY_API Vec512 VecFromMask(Mask512 m) { - return Vec512{_mm512_movm_epi32(m.raw)}; + +HWY_API Vec512 VecFromMask(Mask512 v) { + return Vec512{_mm512_movm_epi32(v.raw)}; } -template -HWY_API Vec512 VecFromMask(Mask512 m) { - return Vec512{_mm512_movm_epi64(m.raw)}; +HWY_API Vec512 VecFromMask(Mask512 v) { + return Vec512{_mm512_movm_epi32(v.raw)}; } -template -HWY_API Vec512 VecFromMask(Mask512 m) { - const Full512 d; - const Full512> di; - return BitCast(d, VecFromMask(RebindMask(di, m))); +HWY_API Vec512 VecFromMask(Mask512 v) { + return Vec512{_mm512_castsi512_ps(_mm512_movm_epi32(v.raw))}; +} + +HWY_API Vec512 VecFromMask(Mask512 v) { + return Vec512{_mm512_movm_epi64(v.raw)}; +} +HWY_API Vec512 VecFromMask(Mask512 v) { + return Vec512{_mm512_movm_epi64(v.raw)}; +} +HWY_API Vec512 VecFromMask(Mask512 v) { + return Vec512{_mm512_castsi512_pd(_mm512_movm_epi64(v.raw))}; } // ------------------------------ Mask logical diff --git a/hwy/tests/mask_set_test.cc b/hwy/tests/mask_set_test.cc deleted file mode 100644 index 85d8fd66f2..0000000000 --- a/hwy/tests/mask_set_test.cc +++ /dev/null @@ -1,317 +0,0 @@ -// Copyright 2019 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include - -#undef HWY_TARGET_INCLUDE -#define HWY_TARGET_INCLUDE "tests/mask_set_test.cc" -#include "hwy/foreach_target.h" // IWYU pragma: keep -#include "hwy/highway.h" -#include "hwy/tests/test_util-inl.h" - -HWY_BEFORE_NAMESPACE(); -namespace hwy { -namespace HWY_NAMESPACE { -namespace { - -struct TestMaskFalse { - template - HWY_NOINLINE void operator()(T /*unused*/, D d) { -#if HWY_HAVE_SCALABLE || HWY_TARGET_IS_SVE || HWY_TARGET == HWY_SCALAR - // For RVV, SVE and SCALAR, use the underlying native vector. - const DFromV> d2; -#else - // Other targets are strongly-typed, but we can safely ResizeBitCast to the - // native vector. All targets have at least 128-bit vectors, but NEON also - // supports 64-bit vectors. - constexpr size_t kMinD2Lanes = (HWY_TARGET_IS_NEON ? 8 : 16) / sizeof(T); - const FixedTag d2; -#endif - static_assert(d2.MaxBytes() >= d.MaxBytes(), - "d2.MaxBytes() >= d.MaxBytes() should be true"); - using V2 = Vec; - - // Various ways of checking that false masks are false. - HWY_ASSERT(AllFalse(d, MaskFalse(d))); - HWY_ASSERT_EQ(0, CountTrue(d, MaskFalse(d))); - HWY_ASSERT_VEC_EQ(d, Zero(d), VecFromMask(d, MaskFalse(d))); - -#if HWY_HAVE_SCALABLE || HWY_TARGET_IS_SVE - // For these targets, we can treat the result as if it were a vector of type - // `V2`. On SVE, vectors are always full (not fractional) and caps are only - // enforced by Highway ops. On RVV, LMUL must match but caps can also be - // ignored. For safety, MaskFalse also sets lanes >= `Lanes(d)` to false, - // and we verify that here. - HWY_ASSERT(AllFalse(d2, MaskFalse(d))); - HWY_ASSERT_EQ(0, CountTrue(d2, MaskFalse(d))); - HWY_ASSERT_VEC_EQ(d2, Zero(d2), VecFromMask(d2, MaskFalse(d))); -#endif - - // All targets support, and strongly-typed (non-scalable) targets require, - // ResizeBitCast before we compare to the 'native' underlying vector size. - const V2 actual2 = ResizeBitCast(d2, VecFromMask(d, MaskFalse(d))); - HWY_ASSERT_VEC_EQ(d2, Zero(d2), actual2); - } -}; - -HWY_NOINLINE void TestAllMaskFalse() { - ForAllTypes(ForPartialVectors()); -} - -struct TestFirstN { - template - HWY_NOINLINE void operator()(T /*unused*/, D d) { - const size_t N = Lanes(d); - auto bool_lanes = AllocateAligned(N); - HWY_ASSERT(bool_lanes); - - using TN = SignedFromSize; - const size_t max_len = static_cast(LimitsMax()); - - const Vec k1 = Set(d, ConvertScalarTo(1)); - - const size_t max_lanes = HWY_MIN(2 * N, AdjustedReps(512)); - for (size_t len = 0; len <= HWY_MIN(max_lanes, max_len); ++len) { - // Loop instead of Iota+Lt to avoid wraparound for 8-bit T. - for (size_t i = 0; i < N; ++i) { - bool_lanes[i] = ConvertScalarTo(i < len ? 1 : 0); - } - const Mask expected = Eq(Load(d, bool_lanes.get()), k1); - HWY_ASSERT_MASK_EQ(d, expected, FirstN(d, len)); - } - - // Also ensure huge values yield all-true (unless the vector is actually - // larger than max_len). - for (size_t i = 0; i < N; ++i) { - bool_lanes[i] = ConvertScalarTo(i < max_len ? 
1 : 0); - } - const Mask expected = Eq(Load(d, bool_lanes.get()), k1); - HWY_ASSERT_MASK_EQ(d, expected, FirstN(d, max_len)); - } -}; - -HWY_NOINLINE void TestAllFirstN() { - ForAllTypes(ForPartialVectors()); -} - -struct TestSetBeforeFirst { - template - HWY_NOINLINE void operator()(T /*unused*/, D d) { - using TI = MakeSigned; // For mask > 0 comparison - const Rebind di; - const size_t N = Lanes(di); - auto bool_lanes = AllocateAligned(N); - HWY_ASSERT(bool_lanes); - memset(bool_lanes.get(), 0, N * sizeof(TI)); - - // For all combinations of zero/nonzero state of subset of lanes: - const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(6))); - for (size_t code = 0; code < (1ull << max_lanes); ++code) { - for (size_t i = 0; i < max_lanes; ++i) { - bool_lanes[i] = (code & (1ull << i)) ? TI(1) : TI(0); - } - - const auto m = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di))); - - const size_t first_set_lane_idx = - (code != 0) - ? Num0BitsBelowLS1Bit_Nonzero64(static_cast(code)) - : N; - const auto expected_mask = FirstN(d, first_set_lane_idx); - - HWY_ASSERT_MASK_EQ(d, expected_mask, SetBeforeFirst(m)); - } - } -}; - -HWY_NOINLINE void TestAllSetBeforeFirst() { - ForAllTypes(ForPartialVectors()); -} - -struct TestSetAtOrBeforeFirst { - template - HWY_NOINLINE void operator()(T /*unused*/, D d) { - using TI = MakeSigned; // For mask > 0 comparison - const Rebind di; - const size_t N = Lanes(di); - auto bool_lanes = AllocateAligned(N); - HWY_ASSERT(bool_lanes); - memset(bool_lanes.get(), 0, N * sizeof(TI)); - - // For all combinations of zero/nonzero state of subset of lanes: - const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(6))); - for (size_t code = 0; code < (1ull << max_lanes); ++code) { - for (size_t i = 0; i < max_lanes; ++i) { - bool_lanes[i] = (code & (1ull << i)) ? TI(1) : TI(0); - } - - const auto m = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di))); - - const size_t idx_after_first_set_lane = - (code != 0) - ? (Num0BitsBelowLS1Bit_Nonzero64(static_cast(code)) + 1) - : N; - const auto expected_mask = FirstN(d, idx_after_first_set_lane); - - HWY_ASSERT_MASK_EQ(d, expected_mask, SetAtOrBeforeFirst(m)); - } - } -}; - -HWY_NOINLINE void TestAllSetAtOrBeforeFirst() { - ForAllTypes(ForPartialVectors()); -} - -struct TestSetOnlyFirst { - template - HWY_NOINLINE void operator()(T /*unused*/, D d) { - using TI = MakeSigned; // For mask > 0 comparison - const Rebind di; - const size_t N = Lanes(di); - auto bool_lanes = AllocateAligned(N); - HWY_ASSERT(bool_lanes); - memset(bool_lanes.get(), 0, N * sizeof(TI)); - auto expected_lanes = AllocateAligned(N); - HWY_ASSERT(expected_lanes); - - // For all combinations of zero/nonzero state of subset of lanes: - const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(6))); - for (size_t code = 0; code < (1ull << max_lanes); ++code) { - for (size_t i = 0; i < max_lanes; ++i) { - bool_lanes[i] = (code & (1ull << i)) ? 
TI(1) : TI(0); - } - - memset(expected_lanes.get(), 0, N * sizeof(TI)); - if (code != 0) { - const size_t idx_of_first_lane = - Num0BitsBelowLS1Bit_Nonzero64(static_cast(code)); - expected_lanes[idx_of_first_lane] = TI(1); - } - - const auto m = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di))); - const auto expected_mask = - RebindMask(d, Gt(Load(di, expected_lanes.get()), Zero(di))); - - HWY_ASSERT_MASK_EQ(d, expected_mask, SetOnlyFirst(m)); - } - } -}; - -HWY_NOINLINE void TestAllSetOnlyFirst() { - ForAllTypes(ForPartialVectors()); -} - -struct TestSetAtOrAfterFirst { - template - HWY_NOINLINE void operator()(T /*unused*/, D d) { - using TI = MakeSigned; // For mask > 0 comparison - const Rebind di; - const size_t N = Lanes(di); - auto bool_lanes = AllocateAligned(N); - HWY_ASSERT(bool_lanes); - memset(bool_lanes.get(), 0, N * sizeof(TI)); - - // For all combinations of zero/nonzero state of subset of lanes: - const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(6))); - for (size_t code = 0; code < (1ull << max_lanes); ++code) { - for (size_t i = 0; i < max_lanes; ++i) { - bool_lanes[i] = (code & (1ull << i)) ? TI(1) : TI(0); - } - - const auto m = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di))); - - const size_t first_set_lane_idx = - (code != 0) - ? Num0BitsBelowLS1Bit_Nonzero64(static_cast(code)) - : N; - const auto expected_at_or_after_first_mask = - Not(FirstN(d, first_set_lane_idx)); - const auto actual_at_or_after_first_mask = SetAtOrAfterFirst(m); - - HWY_ASSERT_MASK_EQ(d, expected_at_or_after_first_mask, - actual_at_or_after_first_mask); - HWY_ASSERT_MASK_EQ( - d, SetOnlyFirst(m), - And(actual_at_or_after_first_mask, SetAtOrBeforeFirst(m))); - HWY_ASSERT_MASK_EQ(d, m, And(m, actual_at_or_after_first_mask)); - HWY_ASSERT( - AllTrue(d, Xor(actual_at_or_after_first_mask, SetBeforeFirst(m)))); - } - } -}; - -HWY_NOINLINE void TestAllSetAtOrAfterFirst() { - ForAllTypes(ForPartialVectors()); -} - -struct TestDup128MaskFromMaskBits { - template - HWY_NOINLINE void operator()(T /*unused*/, D d) { - using TI = MakeSigned; // For mask > 0 comparison - const Rebind di; - const size_t N = Lanes(di); - constexpr size_t kLanesPer16ByteBlock = 16 / sizeof(T); - - auto expected = AllocateAligned(N); - HWY_ASSERT(expected); - - // For all combinations of zero/nonzero state of subset of lanes: - constexpr size_t kMaxLanesToCheckPerBlk = - HWY_MIN(HWY_MAX_LANES_D(D), HWY_MIN(kLanesPer16ByteBlock, 10)); - const size_t max_lanes = HWY_MIN(N, kMaxLanesToCheckPerBlk); - - for (unsigned code = 0; code < (1u << max_lanes); ++code) { - for (size_t i = 0; i < N; i++) { - expected[i] = static_cast( - -static_cast((code >> (i & (kLanesPer16ByteBlock - 1))) & 1)); - } - - const auto expected_mask = - MaskFromVec(BitCast(d, LoadDup128(di, expected.get()))); - - const auto m = Dup128MaskFromMaskBits(d, code); - HWY_ASSERT_VEC_EQ(di, expected.get(), VecFromMask(di, RebindMask(di, m))); - HWY_ASSERT_MASK_EQ(d, expected_mask, m); - } - } -}; - -HWY_NOINLINE void TestAllDup128MaskFromMaskBits() { - ForAllTypes(ForPartialVectors()); -} - -} // namespace -// NOLINTNEXTLINE(google-readability-namespace-comments) -} // namespace HWY_NAMESPACE -} // namespace hwy -HWY_AFTER_NAMESPACE(); - -#if HWY_ONCE -namespace hwy { -namespace { -HWY_BEFORE_TEST(HwyMaskSetTest); -HWY_EXPORT_AND_TEST_P(HwyMaskSetTest, TestAllMaskFalse); -HWY_EXPORT_AND_TEST_P(HwyMaskSetTest, TestAllFirstN); -HWY_EXPORT_AND_TEST_P(HwyMaskSetTest, TestAllSetBeforeFirst); -HWY_EXPORT_AND_TEST_P(HwyMaskSetTest, 
TestAllSetAtOrBeforeFirst); -HWY_EXPORT_AND_TEST_P(HwyMaskSetTest, TestAllSetOnlyFirst); -HWY_EXPORT_AND_TEST_P(HwyMaskSetTest, TestAllSetAtOrAfterFirst); -HWY_EXPORT_AND_TEST_P(HwyMaskSetTest, TestAllDup128MaskFromMaskBits); -HWY_AFTER_TEST(); -} // namespace -} // namespace hwy -HWY_TEST_MAIN(); -#endif // HWY_ONCE diff --git a/hwy/tests/mask_test.cc b/hwy/tests/mask_test.cc index a3a4e564b1..3ad55f5ced 100644 --- a/hwy/tests/mask_test.cc +++ b/hwy/tests/mask_test.cc @@ -15,7 +15,7 @@ #include #include -#include // memset +#include // memcmp #undef HWY_TARGET_INCLUDE #define HWY_TARGET_INCLUDE "tests/mask_test.cc" @@ -28,7 +28,52 @@ namespace hwy { namespace HWY_NAMESPACE { namespace { -struct TestMaskFromVec { +// All types. +struct TestMaskFalse { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { +#if HWY_HAVE_SCALABLE || HWY_TARGET_IS_SVE || HWY_TARGET == HWY_SCALAR + // For RVV, SVE and SCALAR, use the underlying native vector. + const DFromV> d2; +#else + // Other targets are strongly-typed, but we can safely ResizeBitCast to the + // native vector. All targets have at least 128-bit vectors, but NEON also + // supports 64-bit vectors. + constexpr size_t kMinD2Lanes = (HWY_TARGET_IS_NEON ? 8 : 16) / sizeof(T); + const FixedTag d2; +#endif + static_assert(d2.MaxBytes() >= d.MaxBytes(), + "d2.MaxBytes() >= d.MaxBytes() should be true"); + using V2 = Vec; + + // Various ways of checking that false masks are false. + HWY_ASSERT(AllFalse(d, MaskFalse(d))); + HWY_ASSERT_EQ(0, CountTrue(d, MaskFalse(d))); + HWY_ASSERT_VEC_EQ(d, Zero(d), VecFromMask(d, MaskFalse(d))); + +#if HWY_HAVE_SCALABLE || HWY_TARGET_IS_SVE + // For these targets, we can treat the result as if it were a vector of type + // `V2`. On SVE, vectors are always full (not fractional) and caps are only + // enforced by Highway ops. On RVV, LMUL must match but caps can also be + // ignored. For safety, MaskFalse also sets lanes >= `Lanes(d)` to false, + // and we verify that here. + HWY_ASSERT(AllFalse(d2, MaskFalse(d))); + HWY_ASSERT_EQ(0, CountTrue(d2, MaskFalse(d))); + HWY_ASSERT_VEC_EQ(d2, Zero(d2), VecFromMask(d2, MaskFalse(d))); +#endif + + // All targets support, and strongly-typed (non-scalable) targets require, + // ResizeBitCast before we compare to the 'native' underlying vector size. + const V2 actual2 = ResizeBitCast(d2, VecFromMask(d, MaskFalse(d))); + HWY_ASSERT_VEC_EQ(d2, Zero(d2), actual2); + } +}; + +HWY_NOINLINE void TestAllMaskFalse() { + ForAllTypes(ForPartialVectors()); +} + +struct TestFromVec { template HWY_NOINLINE void operator()(T /*unused*/, D d) { const size_t N = Lanes(d); @@ -36,80 +81,86 @@ struct TestMaskFromVec { HWY_ASSERT(lanes); memset(lanes.get(), 0, N * sizeof(T)); - const Mask actual_false = MaskFromVec(Load(d, lanes.get())); + const auto actual_false = MaskFromVec(Load(d, lanes.get())); HWY_ASSERT_MASK_EQ(d, MaskFalse(d), actual_false); memset(lanes.get(), 0xFF, N * sizeof(T)); - const Mask actual_true = MaskFromVec(Load(d, lanes.get())); + const auto actual_true = MaskFromVec(Load(d, lanes.get())); HWY_ASSERT_MASK_EQ(d, MaskTrue(d), actual_true); } }; -HWY_NOINLINE void TestAllMaskFromVec() { - ForAllTypes(ForPartialVectors()); +HWY_NOINLINE void TestAllFromVec() { + ForAllTypes(ForPartialVectors()); } -// Round trip, using MaskFromVec. 
-struct TestVecFromMask { +struct TestFirstN { template HWY_NOINLINE void operator()(T /*unused*/, D d) { - RandomState rng; - - using TI = MakeSigned; // For mask > 0 comparison - const Rebind di; const size_t N = Lanes(d); - auto lanes = AllocateAligned(N); - HWY_ASSERT(lanes); + auto bool_lanes = AllocateAligned(N); + HWY_ASSERT(bool_lanes); - // Each lane should have a chance of having mask=true. - for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { + using TN = SignedFromSize; + const size_t max_len = static_cast(LimitsMax()); + + const Vec k1 = Set(d, ConvertScalarTo(1)); + + const size_t max_lanes = HWY_MIN(2 * N, AdjustedReps(512)); + for (size_t len = 0; len <= HWY_MIN(max_lanes, max_len); ++len) { + // Loop instead of Iota+Lt to avoid wraparound for 8-bit T. for (size_t i = 0; i < N; ++i) { - lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0); + bool_lanes[i] = ConvertScalarTo(i < len ? 1 : 0); } + const Mask expected = Eq(Load(d, bool_lanes.get()), k1); + HWY_ASSERT_MASK_EQ(d, expected, FirstN(d, len)); + } - const Mask mask = RebindMask(d, Gt(Load(di, lanes.get()), Zero(di))); - HWY_ASSERT_MASK_EQ(d, mask, MaskFromVec(VecFromMask(d, mask))); + // Also ensure huge values yield all-true (unless the vector is actually + // larger than max_len). + for (size_t i = 0; i < N; ++i) { + bool_lanes[i] = ConvertScalarTo(i < max_len ? 1 : 0); } + const Mask expected = Eq(Load(d, bool_lanes.get()), k1); + HWY_ASSERT_MASK_EQ(d, expected, FirstN(d, max_len)); } }; -HWY_NOINLINE void TestAllVecFromMask() { - ForAllTypes(ForPartialVectors()); +HWY_NOINLINE void TestAllFirstN() { + ForAllTypes(ForPartialVectors()); } -struct TestBitsFromMask { +struct TestMaskVec { template HWY_NOINLINE void operator()(T /*unused*/, D d) { -#if HWY_HAVE_SCALABLE || HWY_MAX_BYTES > 64 - (void)d; -#else RandomState rng; using TI = MakeSigned; // For mask > 0 comparison const Rebind di; const size_t N = Lanes(d); - HWY_ASSERT(N <= 64); // non-scalable targets have at most 512 bits. - auto lanes = AllocateAligned(N); - HWY_ASSERT(lanes); + auto bool_lanes = AllocateAligned(N); + HWY_ASSERT(bool_lanes); // Each lane should have a chance of having mask=true. for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { - uint64_t expected_bits = 0; for (size_t i = 0; i < N; ++i) { - lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0); - expected_bits |= lanes[i] ? (1ull << i) : 0; + bool_lanes[i] = (Random32(&rng) & 1024) ? 
TI(1) : TI(0); } - const Mask mask = RebindMask(d, Gt(Load(di, lanes.get()), Zero(di))); - const uint64_t actual_bits = BitsFromMask(mask); - HWY_ASSERT_EQ(expected_bits, actual_bits); + const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di))); + HWY_ASSERT_MASK_EQ(d, mask, MaskFromVec(VecFromMask(d, mask))); } -#endif // HWY_HAVE_SCALABLE || HWY_MAX_BYTES > 64 } }; -HWY_NOINLINE void TestAllBitsFromMask() { - ForAllTypes(ForPartialVectors()); +HWY_NOINLINE void TestAllMaskVec() { + const ForPartialVectors test; + + test(uint16_t()); + test(int16_t()); + // TODO(janwas): float16_t - cannot compare yet + + ForUIF3264(test); } struct TestAllTrueFalse { @@ -310,6 +361,192 @@ HWY_NOINLINE void TestAllLogicalMask() { ForAllTypes(ForPartialVectors()); } +struct TestSetBeforeFirst { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + using TI = MakeSigned; // For mask > 0 comparison + const Rebind di; + const size_t N = Lanes(di); + auto bool_lanes = AllocateAligned(N); + HWY_ASSERT(bool_lanes); + memset(bool_lanes.get(), 0, N * sizeof(TI)); + + // For all combinations of zero/nonzero state of subset of lanes: + const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(6))); + for (size_t code = 0; code < (1ull << max_lanes); ++code) { + for (size_t i = 0; i < max_lanes; ++i) { + bool_lanes[i] = (code & (1ull << i)) ? TI(1) : TI(0); + } + + const auto m = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di))); + + const size_t first_set_lane_idx = + (code != 0) + ? Num0BitsBelowLS1Bit_Nonzero64(static_cast(code)) + : N; + const auto expected_mask = FirstN(d, first_set_lane_idx); + + HWY_ASSERT_MASK_EQ(d, expected_mask, SetBeforeFirst(m)); + } + } +}; + +HWY_NOINLINE void TestAllSetBeforeFirst() { + ForAllTypes(ForPartialVectors()); +} + +struct TestSetAtOrBeforeFirst { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + using TI = MakeSigned; // For mask > 0 comparison + const Rebind di; + const size_t N = Lanes(di); + auto bool_lanes = AllocateAligned(N); + HWY_ASSERT(bool_lanes); + memset(bool_lanes.get(), 0, N * sizeof(TI)); + + // For all combinations of zero/nonzero state of subset of lanes: + const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(6))); + for (size_t code = 0; code < (1ull << max_lanes); ++code) { + for (size_t i = 0; i < max_lanes; ++i) { + bool_lanes[i] = (code & (1ull << i)) ? TI(1) : TI(0); + } + + const auto m = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di))); + + const size_t idx_after_first_set_lane = + (code != 0) + ? (Num0BitsBelowLS1Bit_Nonzero64(static_cast(code)) + 1) + : N; + const auto expected_mask = FirstN(d, idx_after_first_set_lane); + + HWY_ASSERT_MASK_EQ(d, expected_mask, SetAtOrBeforeFirst(m)); + } + } +}; + +HWY_NOINLINE void TestAllSetAtOrBeforeFirst() { + ForAllTypes(ForPartialVectors()); +} + +struct TestSetOnlyFirst { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + using TI = MakeSigned; // For mask > 0 comparison + const Rebind di; + const size_t N = Lanes(di); + auto bool_lanes = AllocateAligned(N); + HWY_ASSERT(bool_lanes); + memset(bool_lanes.get(), 0, N * sizeof(TI)); + auto expected_lanes = AllocateAligned(N); + HWY_ASSERT(expected_lanes); + + // For all combinations of zero/nonzero state of subset of lanes: + const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(6))); + for (size_t code = 0; code < (1ull << max_lanes); ++code) { + for (size_t i = 0; i < max_lanes; ++i) { + bool_lanes[i] = (code & (1ull << i)) ? 
TI(1) : TI(0); + } + + memset(expected_lanes.get(), 0, N * sizeof(TI)); + if (code != 0) { + const size_t idx_of_first_lane = + Num0BitsBelowLS1Bit_Nonzero64(static_cast(code)); + expected_lanes[idx_of_first_lane] = TI(1); + } + + const auto m = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di))); + const auto expected_mask = + RebindMask(d, Gt(Load(di, expected_lanes.get()), Zero(di))); + + HWY_ASSERT_MASK_EQ(d, expected_mask, SetOnlyFirst(m)); + } + } +}; + +HWY_NOINLINE void TestAllSetOnlyFirst() { + ForAllTypes(ForPartialVectors()); +} + +struct TestSetAtOrAfterFirst { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + using TI = MakeSigned; // For mask > 0 comparison + const Rebind di; + const size_t N = Lanes(di); + auto bool_lanes = AllocateAligned(N); + HWY_ASSERT(bool_lanes); + memset(bool_lanes.get(), 0, N * sizeof(TI)); + + // For all combinations of zero/nonzero state of subset of lanes: + const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(6))); + for (size_t code = 0; code < (1ull << max_lanes); ++code) { + for (size_t i = 0; i < max_lanes; ++i) { + bool_lanes[i] = (code & (1ull << i)) ? TI(1) : TI(0); + } + + const auto m = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di))); + + const size_t first_set_lane_idx = + (code != 0) + ? Num0BitsBelowLS1Bit_Nonzero64(static_cast(code)) + : N; + const auto expected_at_or_after_first_mask = + Not(FirstN(d, first_set_lane_idx)); + const auto actual_at_or_after_first_mask = SetAtOrAfterFirst(m); + + HWY_ASSERT_MASK_EQ(d, expected_at_or_after_first_mask, + actual_at_or_after_first_mask); + HWY_ASSERT_MASK_EQ( + d, SetOnlyFirst(m), + And(actual_at_or_after_first_mask, SetAtOrBeforeFirst(m))); + HWY_ASSERT_MASK_EQ(d, m, And(m, actual_at_or_after_first_mask)); + HWY_ASSERT( + AllTrue(d, Xor(actual_at_or_after_first_mask, SetBeforeFirst(m)))); + } + } +}; + +HWY_NOINLINE void TestAllSetAtOrAfterFirst() { + ForAllTypes(ForPartialVectors()); +} + +struct TestDup128MaskFromMaskBits { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + using TI = MakeSigned; // For mask > 0 comparison + const Rebind di; + const size_t N = Lanes(di); + constexpr size_t kLanesPer16ByteBlock = 16 / sizeof(T); + + auto expected = AllocateAligned(N); + HWY_ASSERT(expected); + + // For all combinations of zero/nonzero state of subset of lanes: + constexpr size_t kMaxLanesToCheckPerBlk = + HWY_MIN(HWY_MAX_LANES_D(D), HWY_MIN(kLanesPer16ByteBlock, 10)); + const size_t max_lanes = HWY_MIN(N, kMaxLanesToCheckPerBlk); + + for (unsigned code = 0; code < (1u << max_lanes); ++code) { + for (size_t i = 0; i < N; i++) { + expected[i] = static_cast( + -static_cast((code >> (i & (kLanesPer16ByteBlock - 1))) & 1)); + } + + const auto expected_mask = + MaskFromVec(BitCast(d, LoadDup128(di, expected.get()))); + + const auto m = Dup128MaskFromMaskBits(d, code); + HWY_ASSERT_VEC_EQ(di, expected.get(), VecFromMask(di, RebindMask(di, m))); + HWY_ASSERT_MASK_EQ(d, expected_mask, m); + } + } +}; + +HWY_NOINLINE void TestAllDup128MaskFromMaskBits() { + ForAllTypes(ForPartialVectors()); +} + } // namespace // NOLINTNEXTLINE(google-readability-namespace-comments) } // namespace HWY_NAMESPACE @@ -320,14 +557,20 @@ HWY_AFTER_NAMESPACE(); namespace hwy { namespace { HWY_BEFORE_TEST(HwyMaskTest); -HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllMaskFromVec); -HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllVecFromMask); -HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllBitsFromMask); +HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllMaskFalse); 
+HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllFromVec); +HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllFirstN); +HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllMaskVec); HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllAllTrueFalse); HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllCountTrue); HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllFindFirstTrue); HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllFindLastTrue); HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllLogicalMask); +HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllSetBeforeFirst); +HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllSetAtOrBeforeFirst); +HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllSetOnlyFirst); +HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllSetAtOrAfterFirst); +HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllDup128MaskFromMaskBits); HWY_AFTER_TEST(); } // namespace } // namespace hwy From 9b39ef27cc5f954cbfc0b7ce148a9eb5dd7a1371 Mon Sep 17 00:00:00 2001 From: John Platts Date: Sun, 1 Dec 2024 10:39:29 -0600 Subject: [PATCH 14/64] Added AVX10_2 and AVX10_2_512 targets --- BUILD | 1 + CMakeLists.txt | 1 + hwy.gni | 1 + hwy/contrib/unroller/unroller_test.cc | 2 +- hwy/detect_targets.h | 39 +- hwy/foreach_target.h | 22 + hwy/highway.h | 23 +- hwy/ops/set_macros-inl.h | 81 ++- hwy/ops/x86_128-inl.h | 20 + hwy/ops/x86_256-inl.h | 144 +++++ hwy/ops/x86_512-inl.h | 745 ++------------------------ hwy/ops/x86_avx3-inl.h | 497 +++++++++++++++++ hwy/targets.cc | 39 +- hwy/targets.h | 36 +- hwy/targets_test.cc | 2 + 15 files changed, 921 insertions(+), 732 deletions(-) create mode 100644 hwy/ops/x86_avx3-inl.h diff --git a/BUILD b/BUILD index 114eef8a02..cea4582d56 100644 --- a/BUILD +++ b/BUILD @@ -199,6 +199,7 @@ cc_library( "hwy/ops/x86_128-inl.h", "hwy/ops/x86_256-inl.h", "hwy/ops/x86_512-inl.h", + "hwy/ops/x86_avx3-inl.h", # Select avoids recompiling native arch if only non-native changed ] + select({ ":compiler_emscripten": [ diff --git a/CMakeLists.txt b/CMakeLists.txt index 9cf044cbc9..5a7fc8cb24 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -219,6 +219,7 @@ set(HWY_SOURCES hwy/ops/x86_128-inl.h hwy/ops/x86_256-inl.h hwy/ops/x86_512-inl.h + hwy/ops/x86_avx3-inl.h hwy/per_target.h hwy/print-inl.h hwy/print.h diff --git a/hwy.gni b/hwy.gni index b48aa44310..fcf3e83824 100644 --- a/hwy.gni +++ b/hwy.gni @@ -32,6 +32,7 @@ hwy_public = [ "$_hwy/ops/x86_128-inl.h", "$_hwy/ops/x86_256-inl.h", "$_hwy/ops/x86_512-inl.h", + "$_hwy/ops/x86_avx3-inl.h", ] hwy_sources = [ diff --git a/hwy/contrib/unroller/unroller_test.cc b/hwy/contrib/unroller/unroller_test.cc index 2e5c04cd0e..7a13825dda 100644 --- a/hwy/contrib/unroller/unroller_test.cc +++ b/hwy/contrib/unroller/unroller_test.cc @@ -376,7 +376,7 @@ struct TestDot { AccumulateUnit accfn; T dot_via_mul_acc; Unroller(accfn, y, &dot_via_mul_acc, static_cast(num)); - const double tolerance = 48.0 * + const double tolerance = 120.0 * ConvertScalarTo(hwy::Epsilon()) * ScalarAbs(expected_dot); HWY_ASSERT(ScalarAbs(expected_dot - dot_via_mul_acc) < tolerance); diff --git a/hwy/detect_targets.h b/hwy/detect_targets.h index df0d9c9ef4..8b70c5b4b8 100644 --- a/hwy/detect_targets.h +++ b/hwy/detect_targets.h @@ -59,9 +59,10 @@ // left-shifting 2^62), but still do not use bit 63 because it is the sign bit. 
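// Aside (an assumption inferred from the comparisons used elsewhere in this
// patch, e.g. HWY_TARGET <= HWY_AVX3): each target is a distinct bit in a
// 64-bit set and better targets use lower bit values, so the best enabled
// target is the lowest set bit.
#include <cstdint>
inline int64_t BestTargetDemo(int64_t targets) {
  return targets & (~targets + 1);  // isolate lowest (i.e. best) set bit
}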
// --------------------------- x86: 15 targets (+ one fallback) -// Bits 0..3 reserved (4 targets) +// Bits 0..2 reserved (3 targets) +#define HWY_AVX10_2_512 (1LL << 3) // AVX10.2 with 512-bit vectors #define HWY_AVX3_SPR (1LL << 4) -// Bit 5 reserved (likely AVX10.2 with 256-bit vectors) +#define HWY_AVX10_2 (1LL << 5) // AVX10.2 with 256-bit vectors // Currently HWY_AVX3_DL plus AVX512BF16 and a special case for CompressStore // (10x as fast). // We may later also use VPCONFLICT. @@ -534,7 +535,10 @@ // Require everything in AVX2 plus AVX-512 flags (also set by MSVC) #if HWY_BASELINE_AVX2 != 0 && defined(__AVX512F__) && defined(__AVX512BW__) && \ - defined(__AVX512DQ__) && defined(__AVX512VL__) + defined(__AVX512DQ__) && defined(__AVX512VL__) && \ + ((!HWY_COMPILER_GCC_ACTUAL && !HWY_COMPILER_CLANG) || \ + HWY_COMPILER_GCC_ACTUAL < 1400 || HWY_COMPILER_CLANG < 1800 || \ + defined(__EVEX512__)) #define HWY_BASELINE_AVX3 HWY_AVX3 #else #define HWY_BASELINE_AVX3 0 @@ -559,6 +563,12 @@ #define HWY_BASELINE_AVX3_ZEN4 0 #endif +#if HWY_BASELINE_AVX2 != 0 && defined(__AVX10_2__) +#define HWY_BASELINE_AVX10_2 HWY_AVX10_2 +#else +#define HWY_BASELINE_AVX10_2 0 +#endif + #if HWY_BASELINE_AVX3_DL != 0 && defined(__AVX512BF16__) && \ defined(__AVX512FP16__) #define HWY_BASELINE_AVX3_SPR HWY_AVX3_SPR @@ -566,6 +576,12 @@ #define HWY_BASELINE_AVX3_SPR 0 #endif +#if HWY_BASELINE_AVX3_SPR != 0 && defined(__AVX10_2_512__) +#define HWY_BASELINE_AVX10_2_512 HWY_AVX10_2_512 +#else +#define HWY_BASELINE_AVX10_2_512 0 +#endif + // RVV requires intrinsics 0.11 or later, see #1156. #if HWY_ARCH_RISCV && defined(__riscv_v_intrinsic) && \ __riscv_v_intrinsic >= 11000 @@ -584,14 +600,15 @@ // Allow the user to override this without any guarantee of success. #ifndef HWY_BASELINE_TARGETS -#define HWY_BASELINE_TARGETS \ - (HWY_BASELINE_SCALAR | HWY_BASELINE_WASM | HWY_BASELINE_PPC8 | \ - HWY_BASELINE_PPC9 | HWY_BASELINE_PPC10 | HWY_BASELINE_Z14 | \ - HWY_BASELINE_Z15 | HWY_BASELINE_SVE2 | HWY_BASELINE_SVE | \ - HWY_BASELINE_NEON | HWY_BASELINE_SSE2 | HWY_BASELINE_SSSE3 | \ - HWY_BASELINE_SSE4 | HWY_BASELINE_AVX2 | HWY_BASELINE_AVX3 | \ - HWY_BASELINE_AVX3_DL | HWY_BASELINE_AVX3_ZEN4 | HWY_BASELINE_AVX3_SPR | \ - HWY_BASELINE_RVV | HWY_BASELINE_LOONGARCH) +#define HWY_BASELINE_TARGETS \ + (HWY_BASELINE_SCALAR | HWY_BASELINE_WASM | HWY_BASELINE_PPC8 | \ + HWY_BASELINE_PPC9 | HWY_BASELINE_PPC10 | HWY_BASELINE_Z14 | \ + HWY_BASELINE_Z15 | HWY_BASELINE_SVE2 | HWY_BASELINE_SVE | \ + HWY_BASELINE_NEON | HWY_BASELINE_SSE2 | HWY_BASELINE_SSSE3 | \ + HWY_BASELINE_SSE4 | HWY_BASELINE_AVX2 | HWY_BASELINE_AVX3 | \ + HWY_BASELINE_AVX3_DL | HWY_BASELINE_AVX3_ZEN4 | HWY_BASELINE_AVX10_2 | \ + HWY_BASELINE_AVX3_SPR | HWY_BASELINE_AVX10_2_512 | HWY_BASELINE_RVV | \ + HWY_BASELINE_LOONGARCH) #endif // HWY_BASELINE_TARGETS //------------------------------------------------------------------------------ diff --git a/hwy/foreach_target.h b/hwy/foreach_target.h index 66bd8a4f4c..19bc478c5a 100644 --- a/hwy/foreach_target.h +++ b/hwy/foreach_target.h @@ -143,6 +143,28 @@ #endif #endif +#if (HWY_TARGETS & HWY_AVX10_2) && (HWY_STATIC_TARGET != HWY_AVX10_2) +#undef HWY_TARGET +#define HWY_TARGET HWY_AVX10_2 +#include HWY_TARGET_INCLUDE +#ifdef HWY_TARGET_TOGGLE +#undef HWY_TARGET_TOGGLE +#else +#define HWY_TARGET_TOGGLE +#endif +#endif + +#if (HWY_TARGETS & HWY_AVX10_2_512) && (HWY_STATIC_TARGET != HWY_AVX10_2_512) +#undef HWY_TARGET +#define HWY_TARGET HWY_AVX10_2_512 +#include HWY_TARGET_INCLUDE +#ifdef HWY_TARGET_TOGGLE +#undef 
HWY_TARGET_TOGGLE +#else +#define HWY_TARGET_TOGGLE +#endif +#endif + // ------------------------------ HWY_ARCH_ARM #if (HWY_TARGETS & HWY_NEON_WITHOUT_AES) && \ diff --git a/hwy/highway.h b/hwy/highway.h index 2fa2eb0ea1..fe91e23589 100644 --- a/hwy/highway.h +++ b/hwy/highway.h @@ -124,8 +124,12 @@ namespace hwy { #define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3_DL::FUNC_NAME #elif HWY_STATIC_TARGET == HWY_AVX3_ZEN4 #define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3_ZEN4::FUNC_NAME +#elif HWY_STATIC_TARGET == HWY_AVX10_2 +#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX10_2::FUNC_NAME #elif HWY_STATIC_TARGET == HWY_AVX3_SPR #define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3_SPR::FUNC_NAME +#elif HWY_STATIC_TARGET == HWY_AVX10_2_512 +#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX10_2_512::FUNC_NAME #endif // HWY_CHOOSE_*(FUNC_NAME) expands to the function pointer for that target or @@ -284,12 +288,24 @@ namespace hwy { #define HWY_CHOOSE_AVX3_ZEN4(FUNC_NAME) nullptr #endif +#if HWY_TARGETS & HWY_AVX10_2 +#define HWY_CHOOSE_AVX10_2(FUNC_NAME) &N_AVX10_2::FUNC_NAME +#else +#define HWY_CHOOSE_AVX10_2(FUNC_NAME) nullptr +#endif + #if HWY_TARGETS & HWY_AVX3_SPR #define HWY_CHOOSE_AVX3_SPR(FUNC_NAME) &N_AVX3_SPR::FUNC_NAME #else #define HWY_CHOOSE_AVX3_SPR(FUNC_NAME) nullptr #endif +#if HWY_TARGETS & HWY_AVX10_2_512 +#define HWY_CHOOSE_AVX10_2_512(FUNC_NAME) &N_AVX10_2_512::FUNC_NAME +#else +#define HWY_CHOOSE_AVX10_2_512(FUNC_NAME) nullptr +#endif + // MSVC 2017 workaround: the non-type template parameter to ChooseAndCall // apparently cannot be an array. Use a function pointer instead, which has the // disadvantage that we call the static (not best) target on the first call to @@ -594,9 +610,10 @@ struct AddExport { #include "hwy/ops/x86_128-inl.h" #elif HWY_TARGET == HWY_AVX2 #include "hwy/ops/x86_256-inl.h" -#elif HWY_TARGET == HWY_AVX3 || HWY_TARGET == HWY_AVX3_DL || \ - HWY_TARGET == HWY_AVX3_ZEN4 || HWY_TARGET == HWY_AVX3_SPR -#include "hwy/ops/x86_512-inl.h" +#elif HWY_TARGET == HWY_AVX3 || HWY_TARGET == HWY_AVX3_DL || \ + HWY_TARGET == HWY_AVX3_ZEN4 || HWY_TARGET == HWY_AVX10_2 || \ + HWY_TARGET == HWY_AVX3_SPR || HWY_TARGET == HWY_AVX10_2_512 +#include "hwy/ops/x86_avx3-inl.h" #elif HWY_TARGET == HWY_Z14 || HWY_TARGET == HWY_Z15 || \ (HWY_TARGET & HWY_ALL_PPC) #include "hwy/ops/ppc_vsx-inl.h" diff --git a/hwy/ops/set_macros-inl.h b/hwy/ops/set_macros-inl.h index 731614ffd4..1d80bf213c 100644 --- a/hwy/ops/set_macros-inl.h +++ b/hwy/ops/set_macros-inl.h @@ -68,6 +68,13 @@ #define HWY_TARGET_IS_PPC 0 #endif +#undef HWY_TARGET_IS_AVX10_2 +#if HWY_TARGET == HWY_AVX10_2 || HWY_TARGET == HWY_AVX10_2_512 +#define HWY_TARGET_IS_AVX10_2 1 +#else +#define HWY_TARGET_IS_AVX10_2 0 +#endif + // Supported on all targets except RVV (requires GCC 14 or upcoming Clang) #if HWY_TARGET == HWY_RVV && \ ((HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1400) || \ @@ -133,13 +140,27 @@ // Include previous targets, which are the half-vectors of the next target. 
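// Aside: the defines below rely on adjacent string literals being merged
// during translation, so each target string is the previous target's string
// plus the newly required ISA features. Hypothetical names, not the real
// macros:
#define DEMO_TARGET_STR_SSE4 "sse2,ssse3,sse4.1"
#define DEMO_TARGET_STR_AVX2 DEMO_TARGET_STR_SSE4 ",avx,avx2"
// DEMO_TARGET_STR_AVX2 expands to "sse2,ssse3,sse4.1,avx,avx2".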
#define HWY_TARGET_STR_AVX2 \ HWY_TARGET_STR_SSE4 ",avx,avx2" HWY_TARGET_STR_BMI2_FMA HWY_TARGET_STR_F16C -#define HWY_TARGET_STR_AVX3 \ - HWY_TARGET_STR_AVX2 ",avx512f,avx512cd,avx512vl,avx512dq,avx512bw" -#define HWY_TARGET_STR_AVX3_DL \ - HWY_TARGET_STR_AVX3 \ + +#if HWY_COMPILER_GCC_ACTUAL >= 1400 || HWY_COMPILER_CLANG >= 1800 +#define HWY_TARGET_STR_AVX3_VL512 ",evex512" +#else +#define HWY_TARGET_STR_AVX3_VL512 +#endif + +#define HWY_TARGET_STR_AVX3_256 \ + HWY_TARGET_STR_AVX2 \ + ",avx512f,avx512cd,avx512vl,avx512dq,avx512bw" HWY_TARGET_STR_AVX3_VL512 + +#define HWY_TARGET_STR_AVX3 HWY_TARGET_STR_AVX3_256 HWY_TARGET_STR_AVX3_VL512 + +#define HWY_TARGET_STR_AVX3_DL_256 \ + HWY_TARGET_STR_AVX3_256 \ ",vpclmulqdq,avx512vbmi,avx512vbmi2,vaes,avx512vnni,avx512bitalg," \ "avx512vpopcntdq,gfni" +#define HWY_TARGET_STR_AVX3_DL \ + HWY_TARGET_STR_AVX3_DL_256 HWY_TARGET_STR_AVX3_VL512 + // Force-disable for compilers that do not properly support avx512bf16. #if !defined(HWY_AVX3_DISABLE_AVX512BF16) && \ (HWY_COMPILER_CLANGCL || \ @@ -149,12 +170,28 @@ #endif #if !defined(HWY_AVX3_DISABLE_AVX512BF16) -#define HWY_TARGET_STR_AVX3_ZEN4 HWY_TARGET_STR_AVX3_DL ",avx512bf16" +#define HWY_TARGET_STR_AVX3_ZEN4_256 HWY_TARGET_STR_AVX3_DL ",avx512bf16" #else -#define HWY_TARGET_STR_AVX3_ZEN4 HWY_TARGET_STR_AVX3_DL +#define HWY_TARGET_STR_AVX3_ZEN4_256 HWY_TARGET_STR_AVX3_DL #endif -#define HWY_TARGET_STR_AVX3_SPR HWY_TARGET_STR_AVX3_ZEN4 ",avx512fp16" +#define HWY_TARGET_STR_AVX3_ZEN4 \ + HWY_TARGET_STR_AVX3_ZEN4_256 HWY_TARGET_STR_AVX3_VL512 + +#define HWY_TARGET_STR_AVX3_SPR_256 HWY_TARGET_STR_AVX3_ZEN4 ",avx512fp16" + +#define HWY_TARGET_STR_AVX3_SPR \ + HWY_TARGET_STR_AVX3_SPR_256 HWY_TARGET_STR_AVX3_VL512 + +#if HWY_COMPILER_GCC_ACTUAL >= 1500 || HWY_COMPILER_CLANG >= 2000 +#define HWY_TARGET_STR_AVX10_2 \ + HWY_TARGET_STR_AVX3_SPR_256 ",no-evex512,avx10.2-256" +#define HWY_TARGET_STR_AVX10_2_512 \ + HWY_TARGET_STR_AVX3_SPR ",avx10.2-256,avx10.2-512" +#else +#define HWY_TARGET_STR_AVX10_2 HWY_TARGET_STR_AVX3_SPR_256 ",no-evex512" +#define HWY_TARGET_STR_AVX10_2_512 HWY_TARGET_STR_AVX3_SPR +#endif #if defined(HWY_DISABLE_PPC8_CRYPTO) #define HWY_TARGET_STR_PPC8_CRYPTO "" @@ -277,17 +314,24 @@ #define HWY_TARGET_STR HWY_TARGET_STR_AVX2 //----------------------------------------------------------------------------- -// AVX3[_DL] -#elif HWY_TARGET == HWY_AVX3 || HWY_TARGET == HWY_AVX3_DL || \ - HWY_TARGET == HWY_AVX3_ZEN4 || HWY_TARGET == HWY_AVX3_SPR +// AVX3[_DL]/AVX10 +#elif HWY_TARGET == HWY_AVX3 || HWY_TARGET == HWY_AVX3_DL || \ + HWY_TARGET == HWY_AVX3_ZEN4 || HWY_TARGET == HWY_AVX3_SPR || \ + HWY_TARGET == HWY_AVX10_2 || HWY_TARGET == HWY_AVX10_2_512 +#if HWY_TARGET == HWY_AVX10_2 +#define HWY_ALIGN alignas(32) +#define HWY_MAX_BYTES 32 +#define HWY_LANES(T) (32 / sizeof(T)) +#else #define HWY_ALIGN alignas(64) #define HWY_MAX_BYTES 64 #define HWY_LANES(T) (64 / sizeof(T)) +#endif #define HWY_HAVE_SCALABLE 0 #define HWY_HAVE_INTEGER64 1 -#if HWY_TARGET == HWY_AVX3_SPR && \ +#if HWY_TARGET <= HWY_AVX10_2 && \ (HWY_COMPILER_GCC_ACTUAL || HWY_COMPILER_CLANG >= 1901) && \ HWY_HAVE_SCALAR_F16_TYPE #define HWY_HAVE_FLOAT16 1 @@ -303,7 +347,12 @@ #define HWY_NATIVE_DOT_BF16 0 #endif #define HWY_CAP_GE256 1 + +#if HWY_MAX_BYTES >= 64 #define HWY_CAP_GE512 1 +#else +#define HWY_CAP_GE512 0 +#endif #if HWY_TARGET == HWY_AVX3 @@ -325,6 +374,16 @@ #define HWY_NAMESPACE N_AVX3_SPR #define HWY_TARGET_STR HWY_TARGET_STR_AVX3_SPR +#elif HWY_TARGET == HWY_AVX10_2 + +#define HWY_NAMESPACE N_AVX10_2 +#define 
HWY_TARGET_STR HWY_TARGET_STR_AVX10_2 + +#elif HWY_TARGET == HWY_AVX10_2_512 + +#define HWY_NAMESPACE N_AVX10_2_512 +#define HWY_TARGET_STR HWY_TARGET_STR_AVX10_2_512 + #else #error "Logic error" #endif // HWY_TARGET diff --git a/hwy/ops/x86_128-inl.h b/hwy/ops/x86_128-inl.h index a863cd10c6..dbe41cedce 100644 --- a/hwy/ops/x86_128-inl.h +++ b/hwy/ops/x86_128-inl.h @@ -4408,6 +4408,26 @@ HWY_API Vec128 operator*(const Vec128 a, return BitCast(d, BitCast(du, a) * BitCast(du, b)); } +#if HWY_TARGET <= HWY_AVX3 +// Per-target flag to prevent generic_ops-inl.h from defining 64-bit operator*. +#ifdef HWY_NATIVE_MUL_64 +#undef HWY_NATIVE_MUL_64 +#else +#define HWY_NATIVE_MUL_64 +#endif + +template +HWY_API Vec128 operator*(Vec128 a, + Vec128 b) { + return Vec128{_mm_mullo_epi64(a.raw, b.raw)}; +} +template +HWY_API Vec128 operator*(Vec128 a, + Vec128 b) { + return Vec128{_mm_mullo_epi64(a.raw, b.raw)}; +} +#endif + // ------------------------------ RotateRight (ShiftRight, Or) // U8 RotateRight implementation on AVX3_DL is now in x86_512-inl.h as U8 diff --git a/hwy/ops/x86_256-inl.h b/hwy/ops/x86_256-inl.h index df09c052c7..ebaea59de3 100644 --- a/hwy/ops/x86_256-inl.h +++ b/hwy/ops/x86_256-inl.h @@ -46,6 +46,36 @@ HWY_DIAGNOSTICS_OFF(disable : 4701 4703 6001 26494, #include #include #include + +#if HWY_TARGET <= HWY_AVX10_2 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +// Must come after avx512fintrin, else will not define 512-bit intrinsics. +#include +#include +#include +#include +#include + +#endif // HWY_TARGET <= HWY_AVX10_2 + +// clang-format on #endif // HWY_COMPILER_CLANGCL // For half-width vectors. Already includes base.h. @@ -2146,6 +2176,11 @@ HWY_API Vec256 operator*(Vec256 a, Vec256 b) { HWY_API Vec256 operator*(Vec256 a, Vec256 b) { return Vec256{_mm256_mullo_epi32(a.raw, b.raw)}; } +#if HWY_TARGET <= HWY_AVX3 +HWY_API Vec256 operator*(Vec256 a, Vec256 b) { + return Vec256{_mm256_mullo_epi64(a.raw, b.raw)}; +} +#endif // Signed HWY_API Vec256 operator*(Vec256 a, Vec256 b) { @@ -2154,6 +2189,11 @@ HWY_API Vec256 operator*(Vec256 a, Vec256 b) { HWY_API Vec256 operator*(Vec256 a, Vec256 b) { return Vec256{_mm256_mullo_epi32(a.raw, b.raw)}; } +#if HWY_TARGET <= HWY_AVX3 +HWY_API Vec256 operator*(Vec256 a, Vec256 b) { + return Vec256{_mm256_mullo_epi64(a.raw, b.raw)}; +} +#endif // Returns the upper 16 bits of a * b in each lane. 
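// Aside: the same upper-16-bits operation on a single unsigned lane, as plain
// C++ (not Highway code):
#include <cstdint>
inline uint16_t MulHigh16Demo(uint16_t a, uint16_t b) {
  return static_cast<uint16_t>((static_cast<uint32_t>(a) * b) >> 16);
}
// Example: MulHigh16Demo(50000, 40000) == 30517, i.e. 2000000000 >> 16.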
HWY_API Vec256 MulHigh(Vec256 a, Vec256 b) { @@ -6698,6 +6738,31 @@ HWY_API VFromD OrderedDemote2To(D d, V a, V b) { _MM_SHUFFLE(3, 1, 2, 0))}; } +#if HWY_TARGET <= HWY_AVX3 +template +HWY_API VFromD ReorderDemote2To(D dn, VFromD> a, + VFromD> b) { + const Half dnh; + return Combine(dn, DemoteTo(dnh, b), DemoteTo(dnh, a)); +} + +template +HWY_API VFromD ReorderDemote2To(D dn, VFromD> a, + VFromD> b) { + const Half dnh; + return Combine(dn, DemoteTo(dnh, b), DemoteTo(dnh, a)); +} + +template ), + HWY_IF_V_SIZE_GT_D(D, 16), class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), + HWY_IF_T_SIZE_V(V, sizeof(TFromD) * 2), + HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV) * 2), + HWY_IF_T_SIZE_V(V, 8)> +HWY_API VFromD OrderedDemote2To(D d, V a, V b) { + return ReorderDemote2To(d, a, b); +} +#endif + template HWY_API VFromD DemoteTo(D /* tag */, Vec256 v) { return VFromD{_mm256_cvtpd_ps(v.raw)}; @@ -8731,6 +8796,85 @@ template ), HWY_IF_V_SIZE_V(V, 32)> HWY_API V LeadingZeroCount(V v) { return V{_mm256_lzcnt_epi64(v.raw)}; } + +namespace detail { + +template , HWY_MAX_BYTES / 4)> +static HWY_INLINE HWY_MAYBE_UNUSED V Lzcnt32ForU8OrU16OrU32(V v) { + const DFromV d; + const Rebind di32; + const Rebind du32; + + const auto v_lz_count = LeadingZeroCount(PromoteTo(du32, v)); + return DemoteTo(d, BitCast(di32, v_lz_count)); +} + +template +static HWY_INLINE HWY_MAYBE_UNUSED V Lzcnt32ForU8OrU16OrU32(V v) { + return LeadingZeroCount(v); +} + +template , HWY_MAX_BYTES / 4)> +static HWY_INLINE HWY_MAYBE_UNUSED V Lzcnt32ForU8OrU16OrU32(V v) { + const DFromV d; + const RepartitionToWide dw; + const RebindToSigned dw_i; + + const auto lo_v_lz_count = Lzcnt32ForU8OrU16OrU32(PromoteLowerTo(dw, v)); + const auto hi_v_lz_count = Lzcnt32ForU8OrU16OrU32(PromoteUpperTo(dw, v)); + return OrderedDemote2To(d, BitCast(dw_i, lo_v_lz_count), + BitCast(dw_i, hi_v_lz_count)); +} + +} // namespace detail + +template +HWY_API V LeadingZeroCount(V v) { + const DFromV d; + const RebindToUnsigned du; + using TU = TFromD; + + constexpr TU kNumOfBitsInT{sizeof(TU) * 8}; + const auto v_lzcnt32 = detail::Lzcnt32ForU8OrU16OrU32(BitCast(du, v)); + return BitCast(d, Min(v_lzcnt32 - Set(du, TU{32 - kNumOfBitsInT}), + Set(du, TU{kNumOfBitsInT}))); +} + +template +HWY_API V HighestSetBitIndex(V v) { + const DFromV d; + const RebindToUnsigned du; + using TU = TFromD; + return BitCast( + d, Set(du, TU{31}) - detail::Lzcnt32ForU8OrU16OrU32(BitCast(du, v))); +} + +template +HWY_API V HighestSetBitIndex(V v) { + const DFromV d; + using T = TFromD; + return BitCast(d, Set(d, T{sizeof(T) * 8 - 1}) - LeadingZeroCount(v)); +} + +template +HWY_API V TrailingZeroCount(V v) { + const DFromV d; + const RebindToSigned di; + using T = TFromD; + + const auto vi = BitCast(di, v); + const auto lowest_bit = BitCast(d, And(vi, Neg(vi))); + constexpr T kNumOfBitsInT{sizeof(T) * 8}; + const auto bit_idx = HighestSetBitIndex(lowest_bit); + return IfThenElse(MaskFromVec(bit_idx), Set(d, kNumOfBitsInT), bit_idx); +} #endif // HWY_TARGET <= HWY_AVX3 // NOLINTNEXTLINE(google-readability-namespace-comments) diff --git a/hwy/ops/x86_512-inl.h b/hwy/ops/x86_512-inl.h index c906b2e32c..bcb930cac9 100644 --- a/hwy/ops/x86_512-inl.h +++ b/hwy/ops/x86_512-inl.h @@ -1338,20 +1338,7 @@ HWY_API Vec512 ShiftLeft(const Vec512 v) { return Vec512{_mm512_slli_epi64(v.raw, kBits)}; } -#if HWY_TARGET <= HWY_AVX3_DL - -// Generic for all vector lengths. Must be defined after all GaloisAffine. 
-template -HWY_API V ShiftLeft(const V v) { - const Repartition> du64; - if (kBits == 0) return v; - if (kBits == 1) return v + v; - constexpr uint64_t kMatrix = (0x0102040810204080ULL >> kBits) & - (0x0101010101010101ULL * (0xFF >> kBits)); - return detail::GaloisAffine(v, Set(du64, kMatrix)); -} - -#else // HWY_TARGET > HWY_AVX3_DL +#if HWY_TARGET > HWY_AVX3_DL template HWY_API Vec512 ShiftLeft(const Vec512 v) { @@ -1397,33 +1384,7 @@ HWY_API Vec512 ShiftRight(const Vec512 v) { return Vec512{_mm512_srai_epi64(v.raw, kBits)}; } -#if HWY_TARGET <= HWY_AVX3_DL - -// Generic for all vector lengths. Must be defined after all GaloisAffine. -template )> -HWY_API V ShiftRight(const V v) { - const Repartition> du64; - if (kBits == 0) return v; - constexpr uint64_t kMatrix = - (0x0102040810204080ULL << kBits) & - (0x0101010101010101ULL * ((0xFF << kBits) & 0xFF)); - return detail::GaloisAffine(v, Set(du64, kMatrix)); -} - -// Generic for all vector lengths. Must be defined after all GaloisAffine. -template )> -HWY_API V ShiftRight(const V v) { - const Repartition> du64; - if (kBits == 0) return v; - constexpr uint64_t kShift = - (0x0102040810204080ULL << kBits) & - (0x0101010101010101ULL * ((0xFF << kBits) & 0xFF)); - constexpr uint64_t kSign = - kBits == 0 ? 0 : (0x8080808080808080ULL >> (64 - (8 * kBits))); - return detail::GaloisAffine(v, Set(du64, kShift | kSign)); -} - -#else // HWY_TARGET > HWY_AVX3_DL +#if HWY_TARGET > HWY_AVX3_DL template HWY_API Vec512 ShiftRight(const Vec512 v) { @@ -1446,26 +1407,7 @@ HWY_API Vec512 ShiftRight(const Vec512 v) { // ------------------------------ RotateRight -#if HWY_TARGET <= HWY_AVX3_DL -// U8 RotateRight is generic for all vector lengths on AVX3_DL -template )> -HWY_API V RotateRight(V v) { - static_assert(0 <= kBits && kBits < 8, "Invalid shift count"); - - const Repartition> du64; - if (kBits == 0) return v; - - constexpr uint64_t kShrMatrix = - (0x0102040810204080ULL << kBits) & - (0x0101010101010101ULL * ((0xFF << kBits) & 0xFF)); - constexpr int kShlBits = (-kBits) & 7; - constexpr uint64_t kShlMatrix = (0x0102040810204080ULL >> kShlBits) & - (0x0101010101010101ULL * (0xFF >> kShlBits)); - constexpr uint64_t kMatrix = kShrMatrix | kShlMatrix; - - return detail::GaloisAffine(v, Set(du64, kMatrix)); -} -#else // HWY_TARGET > HWY_AVX3_DL +#if HWY_TARGET > HWY_AVX3_DL template HWY_API Vec512 RotateRight(const Vec512 v) { static_assert(0 <= kBits && kBits < 8, "Invalid shift count"); @@ -1473,7 +1415,7 @@ HWY_API Vec512 RotateRight(const Vec512 v) { // AVX3 does not support 8-bit. return Or(ShiftRight(v), ShiftLeft(v)); } -#endif // HWY_TARGET <= HWY_AVX3_DL +#endif // HWY_TARGET > HWY_AVX3_DL template HWY_API Vec512 RotateRight(const Vec512 v) { @@ -1784,13 +1726,6 @@ HWY_API Vec512 Max(Vec512 a, Vec512 b) { // ------------------------------ Integer multiplication -// Per-target flag to prevent generic_ops-inl.h from defining 64-bit operator*. 
-#ifdef HWY_NATIVE_MUL_64 -#undef HWY_NATIVE_MUL_64 -#else -#define HWY_NATIVE_MUL_64 -#endif - // Unsigned HWY_API Vec512 operator*(Vec512 a, Vec512 b) { return Vec512{_mm512_mullo_epi16(a.raw, b.raw)}; @@ -1801,14 +1736,6 @@ HWY_API Vec512 operator*(Vec512 a, Vec512 b) { HWY_API Vec512 operator*(Vec512 a, Vec512 b) { return Vec512{_mm512_mullo_epi64(a.raw, b.raw)}; } -HWY_API Vec256 operator*(Vec256 a, Vec256 b) { - return Vec256{_mm256_mullo_epi64(a.raw, b.raw)}; -} -template -HWY_API Vec128 operator*(Vec128 a, - Vec128 b) { - return Vec128{_mm_mullo_epi64(a.raw, b.raw)}; -} // Signed HWY_API Vec512 operator*(Vec512 a, Vec512 b) { @@ -1820,14 +1747,7 @@ HWY_API Vec512 operator*(Vec512 a, Vec512 b) { HWY_API Vec512 operator*(Vec512 a, Vec512 b) { return Vec512{_mm512_mullo_epi64(a.raw, b.raw)}; } -HWY_API Vec256 operator*(Vec256 a, Vec256 b) { - return Vec256{_mm256_mullo_epi64(a.raw, b.raw)}; -} -template -HWY_API Vec128 operator*(Vec128 a, - Vec128 b) { - return Vec128{_mm_mullo_epi64(a.raw, b.raw)}; -} + // Returns the upper 16 bits of a * b in each lane. HWY_API Vec512 MulHigh(Vec512 a, Vec512 b) { return Vec512{_mm512_mulhi_epu16(a.raw, b.raw)}; @@ -5845,19 +5765,6 @@ HWY_API VFromD ReorderDemote2To(D dn, Vec512 a, BitCast(di16, Min(b, max_i16))); } -template -HWY_API VFromD ReorderDemote2To(D dn, Vec512 a, Vec512 b) { - const Half dnh; - return Combine(dn, DemoteTo(dnh, b), DemoteTo(dnh, a)); -} - -template -HWY_API VFromD ReorderDemote2To(D dn, Vec512 a, - Vec512 b) { - const Half dnh; - return Combine(dn, DemoteTo(dnh, b), DemoteTo(dnh, a)); -} - template ), HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), HWY_IF_T_SIZE_V(V, sizeof(TFromD) * 2), @@ -5870,15 +5777,6 @@ HWY_API VFromD OrderedDemote2To(D d, V a, V b) { SetTableIndices(du64, kIdx))); } -template ), - HWY_IF_V_SIZE_GT_D(D, 16), class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), - HWY_IF_T_SIZE_V(V, sizeof(TFromD) * 2), - HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV) * 2), - HWY_IF_T_SIZE_V(V, 8)> -HWY_API VFromD OrderedDemote2To(D d, V a, V b) { - return ReorderDemote2To(d, a, b); -} - template HWY_API VFromD DemoteTo(D /* tag */, Vec512 v) { return VFromD{_mm512_cvtpd_ps(v.raw)}; @@ -6932,385 +6830,6 @@ HWY_API intptr_t FindLastTrue(D d, MFromD mask) { // ------------------------------ Compress -#ifndef HWY_X86_SLOW_COMPRESS_STORE // allow override -// Slow on Zen4 and SPR, faster if we emulate via Compress(). -#if HWY_TARGET == HWY_AVX3_ZEN4 || HWY_TARGET == HWY_AVX3_SPR -#define HWY_X86_SLOW_COMPRESS_STORE 1 -#else -#define HWY_X86_SLOW_COMPRESS_STORE 0 -#endif -#endif // HWY_X86_SLOW_COMPRESS_STORE - -// Always implement 8-bit here even if we lack VBMI2 because we can do better -// than generic_ops (8 at a time) via the native 32-bit compress (16 at a time). 
-#ifdef HWY_NATIVE_COMPRESS8 -#undef HWY_NATIVE_COMPRESS8 -#else -#define HWY_NATIVE_COMPRESS8 -#endif - -namespace detail { - -#if HWY_TARGET <= HWY_AVX3_DL // VBMI2 -template -HWY_INLINE Vec128 NativeCompress(const Vec128 v, - const Mask128 mask) { - return Vec128{_mm_maskz_compress_epi8(mask.raw, v.raw)}; -} -HWY_INLINE Vec256 NativeCompress(const Vec256 v, - const Mask256 mask) { - return Vec256{_mm256_maskz_compress_epi8(mask.raw, v.raw)}; -} -HWY_INLINE Vec512 NativeCompress(const Vec512 v, - const Mask512 mask) { - return Vec512{_mm512_maskz_compress_epi8(mask.raw, v.raw)}; -} - -template -HWY_INLINE Vec128 NativeCompress(const Vec128 v, - const Mask128 mask) { - return Vec128{_mm_maskz_compress_epi16(mask.raw, v.raw)}; -} -HWY_INLINE Vec256 NativeCompress(const Vec256 v, - const Mask256 mask) { - return Vec256{_mm256_maskz_compress_epi16(mask.raw, v.raw)}; -} -HWY_INLINE Vec512 NativeCompress(const Vec512 v, - const Mask512 mask) { - return Vec512{_mm512_maskz_compress_epi16(mask.raw, v.raw)}; -} - -// Do not even define these to prevent accidental usage. -#if !HWY_X86_SLOW_COMPRESS_STORE - -template -HWY_INLINE void NativeCompressStore(Vec128 v, - Mask128 mask, - uint8_t* HWY_RESTRICT unaligned) { - _mm_mask_compressstoreu_epi8(unaligned, mask.raw, v.raw); -} -HWY_INLINE void NativeCompressStore(Vec256 v, Mask256 mask, - uint8_t* HWY_RESTRICT unaligned) { - _mm256_mask_compressstoreu_epi8(unaligned, mask.raw, v.raw); -} -HWY_INLINE void NativeCompressStore(Vec512 v, Mask512 mask, - uint8_t* HWY_RESTRICT unaligned) { - _mm512_mask_compressstoreu_epi8(unaligned, mask.raw, v.raw); -} - -template -HWY_INLINE void NativeCompressStore(Vec128 v, - Mask128 mask, - uint16_t* HWY_RESTRICT unaligned) { - _mm_mask_compressstoreu_epi16(unaligned, mask.raw, v.raw); -} -HWY_INLINE void NativeCompressStore(Vec256 v, Mask256 mask, - uint16_t* HWY_RESTRICT unaligned) { - _mm256_mask_compressstoreu_epi16(unaligned, mask.raw, v.raw); -} -HWY_INLINE void NativeCompressStore(Vec512 v, Mask512 mask, - uint16_t* HWY_RESTRICT unaligned) { - _mm512_mask_compressstoreu_epi16(unaligned, mask.raw, v.raw); -} - -#endif // HWY_X86_SLOW_COMPRESS_STORE - -HWY_INLINE Vec512 NativeExpand(Vec512 v, - Mask512 mask) { - return Vec512{_mm512_maskz_expand_epi8(mask.raw, v.raw)}; -} - -HWY_INLINE Vec512 NativeExpand(Vec512 v, - Mask512 mask) { - return Vec512{_mm512_maskz_expand_epi16(mask.raw, v.raw)}; -} - -template -HWY_INLINE VFromD NativeLoadExpand(Mask512 mask, D /* d */, - const uint8_t* HWY_RESTRICT unaligned) { - return VFromD{_mm512_maskz_expandloadu_epi8(mask.raw, unaligned)}; -} - -template -HWY_INLINE VFromD NativeLoadExpand(Mask512 mask, D /* d */, - const uint16_t* HWY_RESTRICT unaligned) { - return VFromD{_mm512_maskz_expandloadu_epi16(mask.raw, unaligned)}; -} - -#endif // HWY_TARGET <= HWY_AVX3_DL - -template -HWY_INLINE Vec128 NativeCompress(Vec128 v, - Mask128 mask) { - return Vec128{_mm_maskz_compress_epi32(mask.raw, v.raw)}; -} -HWY_INLINE Vec256 NativeCompress(Vec256 v, - Mask256 mask) { - return Vec256{_mm256_maskz_compress_epi32(mask.raw, v.raw)}; -} -HWY_INLINE Vec512 NativeCompress(Vec512 v, - Mask512 mask) { - return Vec512{_mm512_maskz_compress_epi32(mask.raw, v.raw)}; -} -// We use table-based compress for 64-bit lanes, see CompressIsPartition. - -// Do not even define these to prevent accidental usage. 
-#if !HWY_X86_SLOW_COMPRESS_STORE - -template -HWY_INLINE void NativeCompressStore(Vec128 v, - Mask128 mask, - uint32_t* HWY_RESTRICT unaligned) { - _mm_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw); -} -HWY_INLINE void NativeCompressStore(Vec256 v, Mask256 mask, - uint32_t* HWY_RESTRICT unaligned) { - _mm256_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw); -} -HWY_INLINE void NativeCompressStore(Vec512 v, Mask512 mask, - uint32_t* HWY_RESTRICT unaligned) { - _mm512_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw); -} - -template -HWY_INLINE void NativeCompressStore(Vec128 v, - Mask128 mask, - uint64_t* HWY_RESTRICT unaligned) { - _mm_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw); -} -HWY_INLINE void NativeCompressStore(Vec256 v, Mask256 mask, - uint64_t* HWY_RESTRICT unaligned) { - _mm256_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw); -} -HWY_INLINE void NativeCompressStore(Vec512 v, Mask512 mask, - uint64_t* HWY_RESTRICT unaligned) { - _mm512_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw); -} - -template -HWY_INLINE void NativeCompressStore(Vec128 v, Mask128 mask, - float* HWY_RESTRICT unaligned) { - _mm_mask_compressstoreu_ps(unaligned, mask.raw, v.raw); -} -HWY_INLINE void NativeCompressStore(Vec256 v, Mask256 mask, - float* HWY_RESTRICT unaligned) { - _mm256_mask_compressstoreu_ps(unaligned, mask.raw, v.raw); -} -HWY_INLINE void NativeCompressStore(Vec512 v, Mask512 mask, - float* HWY_RESTRICT unaligned) { - _mm512_mask_compressstoreu_ps(unaligned, mask.raw, v.raw); -} - -template -HWY_INLINE void NativeCompressStore(Vec128 v, - Mask128 mask, - double* HWY_RESTRICT unaligned) { - _mm_mask_compressstoreu_pd(unaligned, mask.raw, v.raw); -} -HWY_INLINE void NativeCompressStore(Vec256 v, Mask256 mask, - double* HWY_RESTRICT unaligned) { - _mm256_mask_compressstoreu_pd(unaligned, mask.raw, v.raw); -} -HWY_INLINE void NativeCompressStore(Vec512 v, Mask512 mask, - double* HWY_RESTRICT unaligned) { - _mm512_mask_compressstoreu_pd(unaligned, mask.raw, v.raw); -} - -#endif // HWY_X86_SLOW_COMPRESS_STORE - -HWY_INLINE Vec512 NativeExpand(Vec512 v, - Mask512 mask) { - return Vec512{_mm512_maskz_expand_epi32(mask.raw, v.raw)}; -} - -HWY_INLINE Vec512 NativeExpand(Vec512 v, - Mask512 mask) { - return Vec512{_mm512_maskz_expand_epi64(mask.raw, v.raw)}; -} - -template -HWY_INLINE VFromD NativeLoadExpand(Mask512 mask, D /* d */, - const uint32_t* HWY_RESTRICT unaligned) { - return VFromD{_mm512_maskz_expandloadu_epi32(mask.raw, unaligned)}; -} - -template -HWY_INLINE VFromD NativeLoadExpand(Mask512 mask, D /* d */, - const uint64_t* HWY_RESTRICT unaligned) { - return VFromD{_mm512_maskz_expandloadu_epi64(mask.raw, unaligned)}; -} - -// For u8x16 and <= u16x16 we can avoid store+load for Compress because there is -// only a single compressed vector (u32x16). Other EmuCompress are implemented -// after the EmuCompressStore they build upon. -template -HWY_INLINE Vec128 EmuCompress(Vec128 v, - Mask128 mask) { - const DFromV d; - const Rebind d32; - const VFromD v0 = PromoteTo(d32, v); - - const uint64_t mask_bits{mask.raw}; - // Mask type is __mmask16 if v is full 128, else __mmask8. - using M32 = MFromD; - const M32 m0{static_cast(mask_bits)}; - return TruncateTo(d, Compress(v0, m0)); -} - -template -HWY_INLINE Vec128 EmuCompress(Vec128 v, - Mask128 mask) { - const DFromV d; - const Rebind di32; - const RebindToUnsigned du32; - const MFromD mask32{static_cast<__mmask8>(mask.raw)}; - // DemoteTo is 2 ops, but likely lower latency than TruncateTo on SKX. 
- // Only i32 -> u16 is supported, whereas NativeCompress expects u32. - const VFromD v32 = BitCast(du32, PromoteTo(di32, v)); - return DemoteTo(d, BitCast(di32, NativeCompress(v32, mask32))); -} - -HWY_INLINE Vec256 EmuCompress(Vec256 v, - Mask256 mask) { - const DFromV d; - const Rebind di32; - const RebindToUnsigned du32; - const Mask512 mask32{static_cast<__mmask16>(mask.raw)}; - const Vec512 v32 = BitCast(du32, PromoteTo(di32, v)); - return DemoteTo(d, BitCast(di32, NativeCompress(v32, mask32))); -} - -// See above - small-vector EmuCompressStore are implemented via EmuCompress. -template -HWY_INLINE void EmuCompressStore(VFromD v, MFromD mask, D d, - TFromD* HWY_RESTRICT unaligned) { - StoreU(EmuCompress(v, mask), d, unaligned); -} - -template -HWY_INLINE void EmuCompressStore(VFromD v, MFromD mask, D d, - TFromD* HWY_RESTRICT unaligned) { - StoreU(EmuCompress(v, mask), d, unaligned); -} - -// Main emulation logic for wider vector, starting with EmuCompressStore because -// it is most convenient to merge pieces using memory (concatenating vectors at -// byte offsets is difficult). -template -HWY_INLINE void EmuCompressStore(VFromD v, MFromD mask, D d, - TFromD* HWY_RESTRICT unaligned) { - const uint64_t mask_bits{mask.raw}; - const Half dh; - const Rebind d32; - const Vec512 v0 = PromoteTo(d32, LowerHalf(v)); - const Vec512 v1 = PromoteTo(d32, UpperHalf(dh, v)); - const Mask512 m0{static_cast<__mmask16>(mask_bits & 0xFFFFu)}; - const Mask512 m1{static_cast<__mmask16>(mask_bits >> 16)}; - const Vec128 c0 = TruncateTo(dh, NativeCompress(v0, m0)); - const Vec128 c1 = TruncateTo(dh, NativeCompress(v1, m1)); - uint8_t* HWY_RESTRICT pos = unaligned; - StoreU(c0, dh, pos); - StoreU(c1, dh, pos + CountTrue(d32, m0)); -} - -template -HWY_INLINE void EmuCompressStore(VFromD v, MFromD mask, D d, - TFromD* HWY_RESTRICT unaligned) { - const uint64_t mask_bits{mask.raw}; - const Half> dq; - const Rebind d32; - alignas(64) uint8_t lanes[64]; - Store(v, d, lanes); - const Vec512 v0 = PromoteTo(d32, LowerHalf(LowerHalf(v))); - const Vec512 v1 = PromoteTo(d32, Load(dq, lanes + 16)); - const Vec512 v2 = PromoteTo(d32, Load(dq, lanes + 32)); - const Vec512 v3 = PromoteTo(d32, Load(dq, lanes + 48)); - const Mask512 m0{static_cast<__mmask16>(mask_bits & 0xFFFFu)}; - const Mask512 m1{ - static_cast((mask_bits >> 16) & 0xFFFFu)}; - const Mask512 m2{ - static_cast((mask_bits >> 32) & 0xFFFFu)}; - const Mask512 m3{static_cast<__mmask16>(mask_bits >> 48)}; - const Vec128 c0 = TruncateTo(dq, NativeCompress(v0, m0)); - const Vec128 c1 = TruncateTo(dq, NativeCompress(v1, m1)); - const Vec128 c2 = TruncateTo(dq, NativeCompress(v2, m2)); - const Vec128 c3 = TruncateTo(dq, NativeCompress(v3, m3)); - uint8_t* HWY_RESTRICT pos = unaligned; - StoreU(c0, dq, pos); - pos += CountTrue(d32, m0); - StoreU(c1, dq, pos); - pos += CountTrue(d32, m1); - StoreU(c2, dq, pos); - pos += CountTrue(d32, m2); - StoreU(c3, dq, pos); -} - -template -HWY_INLINE void EmuCompressStore(VFromD v, MFromD mask, D d, - TFromD* HWY_RESTRICT unaligned) { - const Repartition di32; - const RebindToUnsigned du32; - const Half dh; - const Vec512 promoted0 = - BitCast(du32, PromoteTo(di32, LowerHalf(dh, v))); - const Vec512 promoted1 = - BitCast(du32, PromoteTo(di32, UpperHalf(dh, v))); - - const uint64_t mask_bits{mask.raw}; - const uint64_t maskL = mask_bits & 0xFFFF; - const uint64_t maskH = mask_bits >> 16; - const Mask512 mask0{static_cast<__mmask16>(maskL)}; - const Mask512 mask1{static_cast<__mmask16>(maskH)}; - const Vec512 compressed0 = 
NativeCompress(promoted0, mask0); - const Vec512 compressed1 = NativeCompress(promoted1, mask1); - - const Vec256 demoted0 = DemoteTo(dh, BitCast(di32, compressed0)); - const Vec256 demoted1 = DemoteTo(dh, BitCast(di32, compressed1)); - - // Store 256-bit halves - StoreU(demoted0, dh, unaligned); - StoreU(demoted1, dh, unaligned + PopCount(maskL)); -} - -// Finally, the remaining EmuCompress for wide vectors, using EmuCompressStore. -template // 1 or 2 bytes -HWY_INLINE Vec512 EmuCompress(Vec512 v, Mask512 mask) { - const DFromV d; - alignas(64) T buf[2 * Lanes(d)]; - EmuCompressStore(v, mask, d, buf); - return Load(d, buf); -} - -HWY_INLINE Vec256 EmuCompress(Vec256 v, - const Mask256 mask) { - const DFromV d; - alignas(32) uint8_t buf[2 * 32 / sizeof(uint8_t)]; - EmuCompressStore(v, mask, d, buf); - return Load(d, buf); -} - -} // namespace detail - -template -HWY_API V Compress(V v, const M mask) { - const DFromV d; - const RebindToUnsigned du; - const auto mu = RebindMask(du, mask); -#if HWY_TARGET <= HWY_AVX3_DL // VBMI2 - return BitCast(d, detail::NativeCompress(BitCast(du, v), mu)); -#else - return BitCast(d, detail::EmuCompress(BitCast(du, v), mu)); -#endif -} - -template -HWY_API V Compress(V v, const M mask) { - const DFromV d; - const RebindToUnsigned du; - const auto mu = RebindMask(du, mask); - return BitCast(d, detail::NativeCompress(BitCast(du, v), mu)); -} - template HWY_API Vec512 Compress(Vec512 v, Mask512 mask) { // See CompressIsPartition. u64 is faster than u32. @@ -7375,6 +6894,56 @@ HWY_API Vec512 Compress(Vec512 v, Mask512 mask) { // ------------------------------ Expand +namespace detail { + +#if HWY_TARGET <= HWY_AVX3_DL // VBMI2 +HWY_INLINE Vec512 NativeExpand(Vec512 v, + Mask512 mask) { + return Vec512{_mm512_maskz_expand_epi8(mask.raw, v.raw)}; +} + +HWY_INLINE Vec512 NativeExpand(Vec512 v, + Mask512 mask) { + return Vec512{_mm512_maskz_expand_epi16(mask.raw, v.raw)}; +} + +template +HWY_INLINE VFromD NativeLoadExpand(Mask512 mask, D /* d */, + const uint8_t* HWY_RESTRICT unaligned) { + return VFromD{_mm512_maskz_expandloadu_epi8(mask.raw, unaligned)}; +} + +template +HWY_INLINE VFromD NativeLoadExpand(Mask512 mask, D /* d */, + const uint16_t* HWY_RESTRICT unaligned) { + return VFromD{_mm512_maskz_expandloadu_epi16(mask.raw, unaligned)}; +} +#endif // HWY_TARGET <= HWY_AVX3_DL + +HWY_INLINE Vec512 NativeExpand(Vec512 v, + Mask512 mask) { + return Vec512{_mm512_maskz_expand_epi32(mask.raw, v.raw)}; +} + +HWY_INLINE Vec512 NativeExpand(Vec512 v, + Mask512 mask) { + return Vec512{_mm512_maskz_expand_epi64(mask.raw, v.raw)}; +} + +template +HWY_INLINE VFromD NativeLoadExpand(Mask512 mask, D /* d */, + const uint32_t* HWY_RESTRICT unaligned) { + return VFromD{_mm512_maskz_expandloadu_epi32(mask.raw, unaligned)}; +} + +template +HWY_INLINE VFromD NativeLoadExpand(Mask512 mask, D /* d */, + const uint64_t* HWY_RESTRICT unaligned) { + return VFromD{_mm512_maskz_expandloadu_epi64(mask.raw, unaligned)}; +} + +} // namespace detail + template HWY_API Vec512 Expand(Vec512 v, const Mask512 mask) { const Full512 d; @@ -7489,11 +7058,6 @@ HWY_API VFromD LoadExpand(MFromD mask, D d, // ------------------------------ CompressNot -template -HWY_API V CompressNot(V v, const M mask) { - return Compress(v, Not(mask)); -} - template HWY_API Vec512 CompressNot(Vec512 v, Mask512 mask) { // See CompressIsPartition. u64 is faster than u32. @@ -7556,102 +7120,6 @@ HWY_API Vec512 CompressNot(Vec512 v, Mask512 mask) { return TableLookupLanes(v, indices); } -// uint64_t lanes. 
Only implement for 256 and 512-bit vectors because this is a -// no-op for 128-bit. -template , 16)> -HWY_API V CompressBlocksNot(V v, M mask) { - return CompressNot(v, mask); -} - -// ------------------------------ CompressBits -template -HWY_API V CompressBits(V v, const uint8_t* HWY_RESTRICT bits) { - return Compress(v, LoadMaskBits(DFromV(), bits)); -} - -// ------------------------------ CompressStore - -// Generic for all vector lengths. - -template -HWY_API size_t CompressStore(VFromD v, MFromD mask, D d, - TFromD* HWY_RESTRICT unaligned) { -#if HWY_X86_SLOW_COMPRESS_STORE - StoreU(Compress(v, mask), d, unaligned); -#else - const RebindToUnsigned du; - const auto mu = RebindMask(du, mask); - auto pu = reinterpret_cast * HWY_RESTRICT>(unaligned); - -#if HWY_TARGET <= HWY_AVX3_DL // VBMI2 - detail::NativeCompressStore(BitCast(du, v), mu, pu); -#else - detail::EmuCompressStore(BitCast(du, v), mu, du, pu); -#endif -#endif // HWY_X86_SLOW_COMPRESS_STORE - const size_t count = CountTrue(d, mask); - detail::MaybeUnpoison(unaligned, count); - return count; -} - -template -HWY_API size_t CompressStore(VFromD v, MFromD mask, D d, - TFromD* HWY_RESTRICT unaligned) { -#if HWY_X86_SLOW_COMPRESS_STORE - StoreU(Compress(v, mask), d, unaligned); -#else - const RebindToUnsigned du; - const auto mu = RebindMask(du, mask); - using TU = TFromD; - TU* HWY_RESTRICT pu = reinterpret_cast(unaligned); - detail::NativeCompressStore(BitCast(du, v), mu, pu); -#endif // HWY_X86_SLOW_COMPRESS_STORE - const size_t count = CountTrue(d, mask); - detail::MaybeUnpoison(unaligned, count); - return count; -} - -// Additional overloads to avoid casting to uint32_t (delay?). -template -HWY_API size_t CompressStore(VFromD v, MFromD mask, D d, - TFromD* HWY_RESTRICT unaligned) { -#if HWY_X86_SLOW_COMPRESS_STORE - StoreU(Compress(v, mask), d, unaligned); -#else - (void)d; - detail::NativeCompressStore(v, mask, unaligned); -#endif // HWY_X86_SLOW_COMPRESS_STORE - const size_t count = PopCount(uint64_t{mask.raw}); - detail::MaybeUnpoison(unaligned, count); - return count; -} - -// ------------------------------ CompressBlendedStore -template -HWY_API size_t CompressBlendedStore(VFromD v, MFromD m, D d, - TFromD* HWY_RESTRICT unaligned) { - // Native CompressStore already does the blending at no extra cost (latency - // 11, rthroughput 2 - same as compress plus store). - if (HWY_TARGET == HWY_AVX3_DL || - (!HWY_X86_SLOW_COMPRESS_STORE && sizeof(TFromD) > 2)) { - return CompressStore(v, m, d, unaligned); - } else { - const size_t count = CountTrue(d, m); - BlendedStore(Compress(v, m), FirstN(d, count), d, unaligned); - detail::MaybeUnpoison(unaligned, count); - return count; - } -} - -// ------------------------------ CompressBitsStore -// Generic for all vector lengths. -template -HWY_API size_t CompressBitsStore(VFromD v, const uint8_t* HWY_RESTRICT bits, - D d, TFromD* HWY_RESTRICT unaligned) { - return CompressStore(v, LoadMaskBits(d, bits), d, unaligned); -} - // ------------------------------ LoadInterleaved4 // Actually implemented in generic_ops, we just overload LoadTransposedBlocks4. 
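
The CompressStore / CompressBlendedStore / CompressBitsStore overloads removed in the hunk above are re-added, generalized over vector length, in the new hwy/ops/x86_avx3-inl.h later in this patch, so their user-facing contract is unchanged. As a reminder of that contract, a minimal C++ usage sketch follows (static dispatch; CopyNonZero and its parameters are illustrative names, while ScalableTag, Lanes, LoadU, Ne, Zero and CompressBlendedStore are existing Highway ops):

#include <cstddef>
#include <cstdint>

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Writes the non-zero elements of `in` contiguously to `out` and returns how
// many elements were written. Assumes `out` has room for at least `n` values.
size_t CopyNonZero(const int32_t* HWY_RESTRICT in, size_t n,
                   int32_t* HWY_RESTRICT out) {
  const hn::ScalableTag<int32_t> d;
  const size_t N = hn::Lanes(d);
  size_t written = 0;
  size_t i = 0;
  for (; i + N <= n; i += N) {
    const auto v = hn::LoadU(d, in + i);
    const auto keep = hn::Ne(v, hn::Zero(d));  // mask of lanes to retain
    // Packs the selected lanes and writes only CountTrue(d, keep) of them.
    written += hn::CompressBlendedStore(v, keep, d, out + written);
  }
  for (; i < n; ++i) {  // scalar tail for the remaining < N elements
    if (in[i] != 0) out[written++] = in[i];
  }
  return written;
}

CompressBlendedStore is used here rather than CompressStore because it writes only the packed elements, whereas CompressStore may also overwrite padding lanes up to Lanes(d) past them, which matters when `out` is sized exactly `n`.
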
@@ -8087,7 +7555,7 @@ HWY_API V BitShuffle(V v, VI idx) { } #endif // HWY_TARGET <= HWY_AVX3_DL -// -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex +// -------------------- LeadingZeroCount template ), HWY_IF_V_SIZE_V(V, 64)> HWY_API V LeadingZeroCount(V v) { @@ -8099,107 +7567,6 @@ HWY_API V LeadingZeroCount(V v) { return V{_mm512_lzcnt_epi64(v.raw)}; } -namespace detail { - -template , 16)> -HWY_INLINE V Lzcnt32ForU8OrU16(V v) { - const DFromV d; - const Rebind di32; - const Rebind du32; - - const auto v_lz_count = LeadingZeroCount(PromoteTo(du32, v)); - return DemoteTo(d, BitCast(di32, v_lz_count)); -} - -template , 32)> -HWY_INLINE VFromD>> Lzcnt32ForU8OrU16AsU16(V v) { - const DFromV d; - const Half dh; - const Rebind di32; - const Rebind du32; - const Rebind du16; - - const auto lo_v_lz_count = - LeadingZeroCount(PromoteTo(du32, LowerHalf(dh, v))); - const auto hi_v_lz_count = - LeadingZeroCount(PromoteTo(du32, UpperHalf(dh, v))); - return OrderedDemote2To(du16, BitCast(di32, lo_v_lz_count), - BitCast(di32, hi_v_lz_count)); -} - -HWY_INLINE Vec256 Lzcnt32ForU8OrU16(Vec256 v) { - const DFromV d; - const Rebind di16; - return DemoteTo(d, BitCast(di16, Lzcnt32ForU8OrU16AsU16(v))); -} - -HWY_INLINE Vec512 Lzcnt32ForU8OrU16(Vec512 v) { - const DFromV d; - const Half dh; - const Rebind di16; - - const auto lo_half = LowerHalf(dh, v); - const auto hi_half = UpperHalf(dh, v); - - const auto lo_v_lz_count = BitCast(di16, Lzcnt32ForU8OrU16AsU16(lo_half)); - const auto hi_v_lz_count = BitCast(di16, Lzcnt32ForU8OrU16AsU16(hi_half)); - return OrderedDemote2To(d, lo_v_lz_count, hi_v_lz_count); -} - -HWY_INLINE Vec512 Lzcnt32ForU8OrU16(Vec512 v) { - return Lzcnt32ForU8OrU16AsU16(v); -} - -} // namespace detail - -template -HWY_API V LeadingZeroCount(V v) { - const DFromV d; - const RebindToUnsigned du; - using TU = TFromD; - - constexpr TU kNumOfBitsInT{sizeof(TU) * 8}; - const auto v_lzcnt32 = detail::Lzcnt32ForU8OrU16(BitCast(du, v)); - return BitCast(d, Min(v_lzcnt32 - Set(du, TU{32 - kNumOfBitsInT}), - Set(du, TU{kNumOfBitsInT}))); -} - -template -HWY_API V HighestSetBitIndex(V v) { - const DFromV d; - const RebindToUnsigned du; - using TU = TFromD; - return BitCast(d, - Set(du, TU{31}) - detail::Lzcnt32ForU8OrU16(BitCast(du, v))); -} - -template -HWY_API V HighestSetBitIndex(V v) { - const DFromV d; - using T = TFromD; - return BitCast(d, Set(d, T{sizeof(T) * 8 - 1}) - LeadingZeroCount(v)); -} - -template -HWY_API V TrailingZeroCount(V v) { - const DFromV d; - const RebindToSigned di; - using T = TFromD; - - const auto vi = BitCast(di, v); - const auto lowest_bit = BitCast(d, And(vi, Neg(vi))); - constexpr T kNumOfBitsInT{sizeof(T) * 8}; - const auto bit_idx = HighestSetBitIndex(lowest_bit); - return IfThenElse(MaskFromVec(bit_idx), Set(d, kNumOfBitsInT), bit_idx); -} - // NOLINTNEXTLINE(google-readability-namespace-comments) } // namespace HWY_NAMESPACE } // namespace hwy diff --git a/hwy/ops/x86_avx3-inl.h b/hwy/ops/x86_avx3-inl.h new file mode 100644 index 0000000000..cc3e9f467a --- /dev/null +++ b/hwy/ops/x86_avx3-inl.h @@ -0,0 +1,497 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// External include guard in highway.h - see comment there. + +#if HWY_TARGET == HWY_AVX10_2 +// For AVX10 targets that only support 256-bit or smaller vectors. Already +// includes base.h and shared-inl.h. +#include "hwy/ops/x86_256-inl.h" +#else +// For AVX3/AVX10 targets that support 512-byte vectors. Already includes base.h +// and shared-inl.h. +#include "hwy/ops/x86_512-inl.h" +#endif + +// AVX3/AVX10 ops that have dependencies on ops defined in x86_512-inl.h if +// HWY_MAX_BYTES >= 64 is true are defined below + +// Avoid uninitialized warnings in GCC's avx512fintrin.h - see +// https://github.com/google/highway/issues/710) +HWY_DIAGNOSTICS(push) +#if HWY_COMPILER_GCC_ACTUAL +HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized") +HWY_DIAGNOSTICS_OFF(disable : 4701 4703 6001 26494, + ignored "-Wmaybe-uninitialized") +#endif + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +#if HWY_TARGET <= HWY_AVX3_DL + +// ------------------------------ ShiftLeft + +// Generic for all vector lengths. Must be defined after all GaloisAffine. +template +HWY_API V ShiftLeft(const V v) { + const Repartition> du64; + if (kBits == 0) return v; + if (kBits == 1) return v + v; + constexpr uint64_t kMatrix = (0x0102040810204080ULL >> kBits) & + (0x0101010101010101ULL * (0xFF >> kBits)); + return detail::GaloisAffine(v, Set(du64, kMatrix)); +} + +// ------------------------------ ShiftRight + +// Generic for all vector lengths. Must be defined after all GaloisAffine. +template )> +HWY_API V ShiftRight(const V v) { + const Repartition> du64; + if (kBits == 0) return v; + constexpr uint64_t kMatrix = + (0x0102040810204080ULL << kBits) & + (0x0101010101010101ULL * ((0xFF << kBits) & 0xFF)); + return detail::GaloisAffine(v, Set(du64, kMatrix)); +} + +// Generic for all vector lengths. Must be defined after all GaloisAffine. +template )> +HWY_API V ShiftRight(const V v) { + const Repartition> du64; + if (kBits == 0) return v; + constexpr uint64_t kShift = + (0x0102040810204080ULL << kBits) & + (0x0101010101010101ULL * ((0xFF << kBits) & 0xFF)); + constexpr uint64_t kSign = + kBits == 0 ? 
0 : (0x8080808080808080ULL >> (64 - (8 * kBits))); + return detail::GaloisAffine(v, Set(du64, kShift | kSign)); +} + +// ------------------------------ RotateRight + +// U8 RotateRight is generic for all vector lengths on AVX3_DL +template )> +HWY_API V RotateRight(V v) { + static_assert(0 <= kBits && kBits < 8, "Invalid shift count"); + + const Repartition> du64; + if (kBits == 0) return v; + + constexpr uint64_t kShrMatrix = + (0x0102040810204080ULL << kBits) & + (0x0101010101010101ULL * ((0xFF << kBits) & 0xFF)); + constexpr int kShlBits = (-kBits) & 7; + constexpr uint64_t kShlMatrix = (0x0102040810204080ULL >> kShlBits) & + (0x0101010101010101ULL * (0xFF >> kShlBits)); + constexpr uint64_t kMatrix = kShrMatrix | kShlMatrix; + + return detail::GaloisAffine(v, Set(du64, kMatrix)); +} + +#endif // HWY_TARGET <= HWY_AVX3_DL + +// ------------------------------ Compress + +#ifndef HWY_X86_SLOW_COMPRESS_STORE // allow override +// Slow on Zen4 and SPR, faster if we emulate via Compress(). +#if HWY_TARGET == HWY_AVX3_ZEN4 || HWY_TARGET == HWY_AVX3_SPR +#define HWY_X86_SLOW_COMPRESS_STORE 1 +#else +#define HWY_X86_SLOW_COMPRESS_STORE 0 +#endif +#endif // HWY_X86_SLOW_COMPRESS_STORE + +// Always implement 8-bit here even if we lack VBMI2 because we can do better +// than generic_ops (8 at a time) via the native 32-bit compress (16 at a time). +#ifdef HWY_NATIVE_COMPRESS8 +#undef HWY_NATIVE_COMPRESS8 +#else +#define HWY_NATIVE_COMPRESS8 +#endif + +namespace detail { + +#if HWY_TARGET <= HWY_AVX3_DL // VBMI2 +template +HWY_INLINE Vec128 NativeCompress(const Vec128 v, + const Mask128 mask) { + return Vec128{_mm_maskz_compress_epi8(mask.raw, v.raw)}; +} +HWY_INLINE Vec256 NativeCompress(const Vec256 v, + const Mask256 mask) { + return Vec256{_mm256_maskz_compress_epi8(mask.raw, v.raw)}; +} +#if HWY_MAX_BYTES >= 64 +HWY_INLINE Vec512 NativeCompress(const Vec512 v, + const Mask512 mask) { + return Vec512{_mm512_maskz_compress_epi8(mask.raw, v.raw)}; +} +#endif + +template +HWY_INLINE Vec128 NativeCompress(const Vec128 v, + const Mask128 mask) { + return Vec128{_mm_maskz_compress_epi16(mask.raw, v.raw)}; +} +HWY_INLINE Vec256 NativeCompress(const Vec256 v, + const Mask256 mask) { + return Vec256{_mm256_maskz_compress_epi16(mask.raw, v.raw)}; +} +#if HWY_MAX_BYTES >= 64 +HWY_INLINE Vec512 NativeCompress(const Vec512 v, + const Mask512 mask) { + return Vec512{_mm512_maskz_compress_epi16(mask.raw, v.raw)}; +} +#endif + +// Do not even define these to prevent accidental usage. 
+#if !HWY_X86_SLOW_COMPRESS_STORE + +template +HWY_INLINE void NativeCompressStore(Vec128 v, + Mask128 mask, + uint8_t* HWY_RESTRICT unaligned) { + _mm_mask_compressstoreu_epi8(unaligned, mask.raw, v.raw); +} +HWY_INLINE void NativeCompressStore(Vec256 v, Mask256 mask, + uint8_t* HWY_RESTRICT unaligned) { + _mm256_mask_compressstoreu_epi8(unaligned, mask.raw, v.raw); +} +#if HWY_MAX_BYTES >= 64 +HWY_INLINE void NativeCompressStore(Vec512 v, Mask512 mask, + uint8_t* HWY_RESTRICT unaligned) { + _mm512_mask_compressstoreu_epi8(unaligned, mask.raw, v.raw); +} +#endif + +template +HWY_INLINE void NativeCompressStore(Vec128 v, + Mask128 mask, + uint16_t* HWY_RESTRICT unaligned) { + _mm_mask_compressstoreu_epi16(unaligned, mask.raw, v.raw); +} +HWY_INLINE void NativeCompressStore(Vec256 v, Mask256 mask, + uint16_t* HWY_RESTRICT unaligned) { + _mm256_mask_compressstoreu_epi16(unaligned, mask.raw, v.raw); +} +#if HWY_MAX_BYTES >= 64 +HWY_INLINE void NativeCompressStore(Vec512 v, Mask512 mask, + uint16_t* HWY_RESTRICT unaligned) { + _mm512_mask_compressstoreu_epi16(unaligned, mask.raw, v.raw); +} +#endif // HWY_MAX_BYTES >= 64 + +#endif // HWY_X86_SLOW_COMPRESS_STORE + +#endif // HWY_TARGET <= HWY_AVX3_DL + +template +HWY_INLINE Vec128 NativeCompress(Vec128 v, + Mask128 mask) { + return Vec128{_mm_maskz_compress_epi32(mask.raw, v.raw)}; +} +HWY_INLINE Vec256 NativeCompress(Vec256 v, + Mask256 mask) { + return Vec256{_mm256_maskz_compress_epi32(mask.raw, v.raw)}; +} + +#if HWY_MAX_BYTES >= 64 +HWY_INLINE Vec512 NativeCompress(Vec512 v, + Mask512 mask) { + return Vec512{_mm512_maskz_compress_epi32(mask.raw, v.raw)}; +} +#endif +// We use table-based compress for 64-bit lanes, see CompressIsPartition. + +// Do not even define these to prevent accidental usage. +#if !HWY_X86_SLOW_COMPRESS_STORE + +template +HWY_INLINE void NativeCompressStore(Vec128 v, + Mask128 mask, + uint32_t* HWY_RESTRICT unaligned) { + _mm_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw); +} +HWY_INLINE void NativeCompressStore(Vec256 v, Mask256 mask, + uint32_t* HWY_RESTRICT unaligned) { + _mm256_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw); +} +#if HWY_MAX_BYTES >= 64 +HWY_INLINE void NativeCompressStore(Vec512 v, Mask512 mask, + uint32_t* HWY_RESTRICT unaligned) { + _mm512_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw); +} +#endif + +template +HWY_INLINE void NativeCompressStore(Vec128 v, + Mask128 mask, + uint64_t* HWY_RESTRICT unaligned) { + _mm_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw); +} +HWY_INLINE void NativeCompressStore(Vec256 v, Mask256 mask, + uint64_t* HWY_RESTRICT unaligned) { + _mm256_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw); +} +#if HWY_MAX_BYTES >= 64 +HWY_INLINE void NativeCompressStore(Vec512 v, Mask512 mask, + uint64_t* HWY_RESTRICT unaligned) { + _mm512_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw); +} +#endif + +template +HWY_INLINE void NativeCompressStore(Vec128 v, Mask128 mask, + float* HWY_RESTRICT unaligned) { + _mm_mask_compressstoreu_ps(unaligned, mask.raw, v.raw); +} +HWY_INLINE void NativeCompressStore(Vec256 v, Mask256 mask, + float* HWY_RESTRICT unaligned) { + _mm256_mask_compressstoreu_ps(unaligned, mask.raw, v.raw); +} +#if HWY_MAX_BYTES >= 64 +HWY_INLINE void NativeCompressStore(Vec512 v, Mask512 mask, + float* HWY_RESTRICT unaligned) { + _mm512_mask_compressstoreu_ps(unaligned, mask.raw, v.raw); +} +#endif + +template +HWY_INLINE void NativeCompressStore(Vec128 v, + Mask128 mask, + double* HWY_RESTRICT unaligned) { + 
_mm_mask_compressstoreu_pd(unaligned, mask.raw, v.raw); +} +HWY_INLINE void NativeCompressStore(Vec256 v, Mask256 mask, + double* HWY_RESTRICT unaligned) { + _mm256_mask_compressstoreu_pd(unaligned, mask.raw, v.raw); +} +#if HWY_MAX_BYTES >= 64 +HWY_INLINE void NativeCompressStore(Vec512 v, Mask512 mask, + double* HWY_RESTRICT unaligned) { + _mm512_mask_compressstoreu_pd(unaligned, mask.raw, v.raw); +} +#endif + +#endif // HWY_X86_SLOW_COMPRESS_STORE + +// For u8x16 and <= u16x16 we can avoid store+load for Compress because there is +// only a single compressed vector (u32x16). Other EmuCompress are implemented +// after the EmuCompressStore they build upon. +template ), + HWY_IF_LANES_LE_D(DFromV, HWY_MAX_BYTES / 4)> +static HWY_INLINE HWY_MAYBE_UNUSED V EmuCompress(V v, MFromD> mask) { + const DFromV d; + const Rebind d32; + const VFromD v0 = PromoteTo(d32, v); + + using M32 = MFromD; + const M32 m0 = PromoteMaskTo(d32, d, mask); + return TruncateTo(d, Compress(v0, m0)); +} + +template ), + HWY_IF_LANES_LE_D(DFromV, HWY_MAX_BYTES / 4)> +static HWY_INLINE HWY_MAYBE_UNUSED V EmuCompress(V v, MFromD> mask) { + const DFromV d; + const Rebind di32; + const RebindToUnsigned du32; + + const MFromD mask32 = PromoteMaskTo(du32, d, mask); + // DemoteTo is 2 ops, but likely lower latency than TruncateTo on SKX. + // Only i32 -> u16 is supported, whereas NativeCompress expects u32. + const VFromD v32 = PromoteTo(du32, v); + return DemoteTo(d, BitCast(di32, NativeCompress(v32, mask32))); +} + +// See above - small-vector EmuCompressStore are implemented via EmuCompress. +template +static HWY_INLINE HWY_MAYBE_UNUSED void EmuCompressStore( + VFromD v, MFromD mask, D d, TFromD* HWY_RESTRICT unaligned) { + StoreU(EmuCompress(v, mask), d, unaligned); +} + +// Main emulation logic for wider vector, starting with EmuCompressStore because +// it is most convenient to merge pieces using memory (concatenating vectors at +// byte offsets is difficult). +template +static HWY_INLINE HWY_MAYBE_UNUSED void EmuCompressStore( + VFromD v, MFromD mask, D d, TFromD* HWY_RESTRICT unaligned) { + const Half dh; + + const MFromD m0 = LowerHalfOfMask(dh, mask); + const MFromD m1 = UpperHalfOfMask(dh, mask); + + const VFromD v0 = LowerHalf(dh, v); + const VFromD v1 = UpperHalf(dh, v); + + EmuCompressStore(v0, m0, dh, unaligned); + EmuCompressStore(v1, m1, dh, unaligned + CountTrue(dh, m0)); +} + +// Finally, the remaining EmuCompress for wide vectors, using EmuCompressStore. +template , HWY_MAX_BYTES / 4)> +static HWY_INLINE HWY_MAYBE_UNUSED V EmuCompress(V v, MFromD> mask) { + using D = DFromV; + using T = TFromD; + const D d; + + alignas(HWY_MAX_LANES_D(D) * sizeof(T)) T buf[2 * HWY_MAX_LANES_D(D)]; + EmuCompressStore(v, mask, d, buf); + return Load(d, buf); +} + +} // namespace detail + +template +HWY_API V Compress(V v, const M mask) { + const DFromV d; + const RebindToUnsigned du; + const auto mu = RebindMask(du, mask); +#if HWY_TARGET <= HWY_AVX3_DL // VBMI2 + return BitCast(d, detail::NativeCompress(BitCast(du, v), mu)); +#else + return BitCast(d, detail::EmuCompress(BitCast(du, v), mu)); +#endif +} + +template +HWY_API V Compress(V v, const M mask) { + const DFromV d; + const RebindToUnsigned du; + const auto mu = RebindMask(du, mask); + return BitCast(d, detail::NativeCompress(BitCast(du, v), mu)); +} + +// ------------------------------ CompressNot + +template +HWY_API V CompressNot(V v, const M mask) { + return Compress(v, Not(mask)); +} + +// uint64_t lanes. 
Only implement for 256 and 512-bit vectors because this is a +// no-op for 128-bit. +template , 16)> +HWY_API V CompressBlocksNot(V v, M mask) { + return CompressNot(v, mask); +} + +// ------------------------------ CompressBits +template +HWY_API V CompressBits(V v, const uint8_t* HWY_RESTRICT bits) { + return Compress(v, LoadMaskBits(DFromV(), bits)); +} + +// ------------------------------ CompressStore + +// Generic for all vector lengths. + +template +HWY_API size_t CompressStore(VFromD v, MFromD mask, D d, + TFromD* HWY_RESTRICT unaligned) { +#if HWY_X86_SLOW_COMPRESS_STORE + StoreU(Compress(v, mask), d, unaligned); +#else + const RebindToUnsigned du; + const auto mu = RebindMask(du, mask); + auto pu = reinterpret_cast * HWY_RESTRICT>(unaligned); + +#if HWY_TARGET <= HWY_AVX3_DL // VBMI2 + detail::NativeCompressStore(BitCast(du, v), mu, pu); +#else + detail::EmuCompressStore(BitCast(du, v), mu, du, pu); +#endif +#endif // HWY_X86_SLOW_COMPRESS_STORE + const size_t count = CountTrue(d, mask); + detail::MaybeUnpoison(unaligned, count); + return count; +} + +template +HWY_API size_t CompressStore(VFromD v, MFromD mask, D d, + TFromD* HWY_RESTRICT unaligned) { +#if HWY_X86_SLOW_COMPRESS_STORE + StoreU(Compress(v, mask), d, unaligned); +#else + const RebindToUnsigned du; + const auto mu = RebindMask(du, mask); + using TU = TFromD; + TU* HWY_RESTRICT pu = reinterpret_cast(unaligned); + detail::NativeCompressStore(BitCast(du, v), mu, pu); +#endif // HWY_X86_SLOW_COMPRESS_STORE + const size_t count = CountTrue(d, mask); + detail::MaybeUnpoison(unaligned, count); + return count; +} + +// Additional overloads to avoid casting to uint32_t (delay?). +template +HWY_API size_t CompressStore(VFromD v, MFromD mask, D d, + TFromD* HWY_RESTRICT unaligned) { +#if HWY_X86_SLOW_COMPRESS_STORE + StoreU(Compress(v, mask), d, unaligned); +#else + (void)d; + detail::NativeCompressStore(v, mask, unaligned); +#endif // HWY_X86_SLOW_COMPRESS_STORE + const size_t count = PopCount(uint64_t{mask.raw}); + detail::MaybeUnpoison(unaligned, count); + return count; +} + +// ------------------------------ CompressBlendedStore +template +HWY_API size_t CompressBlendedStore(VFromD v, MFromD m, D d, + TFromD* HWY_RESTRICT unaligned) { + // Native CompressStore already does the blending at no extra cost (latency + // 11, rthroughput 2 - same as compress plus store). + if (HWY_TARGET == HWY_AVX3_DL || + (!HWY_X86_SLOW_COMPRESS_STORE && sizeof(TFromD) > 2)) { + return CompressStore(v, m, d, unaligned); + } else { + const size_t count = CountTrue(d, m); + BlendedStore(Compress(v, m), FirstN(d, count), d, unaligned); + detail::MaybeUnpoison(unaligned, count); + return count; + } +} + +// ------------------------------ CompressBitsStore +// Generic for all vector lengths. +template +HWY_API size_t CompressBitsStore(VFromD v, const uint8_t* HWY_RESTRICT bits, + D d, TFromD* HWY_RESTRICT unaligned) { + return CompressStore(v, LoadMaskBits(d, bits), d, unaligned); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +// Note that the GCC warnings are not suppressed if we only wrap the *intrin.h - +// the warning seems to be issued at the call site of intrinsics, i.e. our code. 
+HWY_DIAGNOSTICS(pop) diff --git a/hwy/targets.cc b/hwy/targets.cc index 9923405ed2..fed6582fc4 100644 --- a/hwy/targets.cc +++ b/hwy/targets.cc @@ -215,6 +215,9 @@ enum class FeatureIndex : uint32_t { kBITALG, kGFNI, + kAVX10, + kAPX, + kSentinel }; static_assert(static_cast(FeatureIndex::kSentinel) < 64, @@ -275,6 +278,8 @@ uint64_t FlagsFromCPUID() { Cpuid(7, 1, abcd); flags |= IsBitSet(abcd[0], 5) ? Bit(FeatureIndex::kAVX512BF16) : 0; + flags |= IsBitSet(abcd[3], 19) ? Bit(FeatureIndex::kAVX10) : 0; + flags |= IsBitSet(abcd[3], 21) ? Bit(FeatureIndex::kAPX) : 0; } return flags; @@ -330,6 +335,11 @@ constexpr uint64_t kGroupAVX3_ZEN4 = constexpr uint64_t kGroupAVX3_SPR = Bit(FeatureIndex::kAVX512FP16) | kGroupAVX3_ZEN4; +constexpr uint64_t kGroupAVX10 = + Bit(FeatureIndex::kAVX10) | Bit(FeatureIndex::kAPX) | + Bit(FeatureIndex::kVPCLMULQDQ) | Bit(FeatureIndex::kVAES) | + Bit(FeatureIndex::kGFNI) | kGroupAVX2; + int64_t DetectTargets() { int64_t bits = 0; // return value of supported targets. HWY_IF_CONSTEXPR(HWY_ARCH_X86_64) { @@ -362,6 +372,34 @@ int64_t DetectTargets() { } } + uint32_t abcd[4]; + + if ((flags & kGroupAVX10) == kGroupAVX10) { + Cpuid(0x24, 0, abcd); + + // AVX10 version is in lower 8 bits of abcd[1] + const uint32_t avx10_ver = abcd[1] & 0xFFu; + + // 512-bit vectors are supported if avx10_ver >= 1 is true and bit 18 of + // abcd[1] is set + const bool has_avx10_with_512bit_vectors = + (avx10_ver >= 1) && IsBitSet(abcd[1], 18); + + if (has_avx10_with_512bit_vectors) { + // AVX10.1 or later with support for 512-bit vectors implies support for + // the AVX3/AVX3_DL/AVX3_SPR targets + bits |= (HWY_AVX3_SPR | HWY_AVX3_DL | HWY_AVX3); + } + + if (avx10_ver >= 2) { + // AVX10.2 is supported if avx10_ver >= 2 is true + bits |= HWY_AVX10_2; + if (has_avx10_with_512bit_vectors) { + bits |= HWY_AVX10_2_512; + } + } + } + // Clear AVX2/AVX3 bits if the CPU or OS does not support XSAVE - otherwise, // YMM/ZMM registers are not preserved across context switches. @@ -380,7 +418,6 @@ int64_t DetectTargets() { // - UnixWare 7 Release 7.1.1 or later // - Solaris 9 4/04 or later - uint32_t abcd[4]; Cpuid(1, 0, abcd); const bool has_xsave = IsBitSet(abcd[2], 26); const bool has_osxsave = IsBitSet(abcd[2], 27); diff --git a/hwy/targets.h b/hwy/targets.h index faafc7a7eb..da90376a70 100644 --- a/hwy/targets.h +++ b/hwy/targets.h @@ -99,8 +99,12 @@ static inline HWY_MAYBE_UNUSED const char* TargetName(int64_t target) { return "AVX3_DL"; case HWY_AVX3_ZEN4: return "AVX3_ZEN4"; + case HWY_AVX10_2: + return "AVX10_2"; case HWY_AVX3_SPR: return "AVX3_SPR"; + case HWY_AVX10_2_512: + return "AVX10_2_512"; #endif #if HWY_ARCH_ARM @@ -208,22 +212,22 @@ static inline HWY_MAYBE_UNUSED const char* TargetName(int64_t target) { // HWY_MAX_DYNAMIC_TARGETS) bit. This list must contain exactly // HWY_MAX_DYNAMIC_TARGETS elements and does not include SCALAR. The first entry // corresponds to the best target. Don't include a "," at the end of the list. -#define HWY_CHOOSE_TARGET_LIST(func_name) \ - nullptr, /* reserved */ \ - nullptr, /* reserved */ \ - nullptr, /* reserved */ \ - nullptr, /* reserved */ \ - HWY_CHOOSE_AVX3_SPR(func_name), /* AVX3_SPR */ \ - nullptr, /* reserved */ \ - HWY_CHOOSE_AVX3_ZEN4(func_name), /* AVX3_ZEN4 */ \ - HWY_CHOOSE_AVX3_DL(func_name), /* AVX3_DL */ \ - HWY_CHOOSE_AVX3(func_name), /* AVX3 */ \ - HWY_CHOOSE_AVX2(func_name), /* AVX2 */ \ - nullptr, /* AVX */ \ - HWY_CHOOSE_SSE4(func_name), /* SSE4 */ \ - HWY_CHOOSE_SSSE3(func_name), /* SSSE3 */ \ - nullptr, /* reserved - SSE3? 
*/ \ - HWY_CHOOSE_SSE2(func_name) /* SSE2 */ +#define HWY_CHOOSE_TARGET_LIST(func_name) \ + nullptr, /* reserved */ \ + nullptr, /* reserved */ \ + nullptr, /* reserved */ \ + HWY_CHOOSE_AVX10_2_512(func_name), /* AVX10_2_512 */ \ + HWY_CHOOSE_AVX3_SPR(func_name), /* AVX3_SPR */ \ + HWY_CHOOSE_AVX10_2(func_name), /* reserved */ \ + HWY_CHOOSE_AVX3_ZEN4(func_name), /* AVX3_ZEN4 */ \ + HWY_CHOOSE_AVX3_DL(func_name), /* AVX3_DL */ \ + HWY_CHOOSE_AVX3(func_name), /* AVX3 */ \ + HWY_CHOOSE_AVX2(func_name), /* AVX2 */ \ + nullptr, /* AVX */ \ + HWY_CHOOSE_SSE4(func_name), /* SSE4 */ \ + HWY_CHOOSE_SSSE3(func_name), /* SSSE3 */ \ + nullptr, /* reserved - SSE3? */ \ + HWY_CHOOSE_SSE2(func_name) /* SSE2 */ #elif HWY_ARCH_ARM // See HWY_ARCH_X86 above for details. diff --git a/hwy/targets_test.cc b/hwy/targets_test.cc index 724296165d..f8b2ad9bb8 100644 --- a/hwy/targets_test.cc +++ b/hwy/targets_test.cc @@ -35,7 +35,9 @@ namespace { } \ } +DECLARE_FUNCTION(AVX10_2_512) DECLARE_FUNCTION(AVX3_SPR) +DECLARE_FUNCTION(AVX10_2) DECLARE_FUNCTION(AVX3_ZEN4) DECLARE_FUNCTION(AVX3_DL) DECLARE_FUNCTION(AVX3) From 62c0a79cb0249a05249f6c9c68e57369c2516488 Mon Sep 17 00:00:00 2001 From: Jan Wassenberg Date: Mon, 2 Dec 2024 04:25:47 -0800 Subject: [PATCH 15/64] (v2 of) Add BitsFromMask, promoting from detail::. Provide DFromM on all targets except SVE/RVV. Also split mask_test into mask_set_test, remove unused overload in scalar, modernize overloads (SFINAE instead of type tags). arm_sve required moving some sections earlier before their first usage. PiperOrigin-RevId: 701919058 --- BUILD | 1 + CMakeLists.txt | 1 + g3doc/quick_reference.md | 39 +-- hwy/ops/arm_neon-inl.h | 129 +++++---- hwy/ops/arm_sve-inl.h | 567 ++++++++++++++++++++----------------- hwy/ops/emu128-inl.h | 16 ++ hwy/ops/generic_ops-inl.h | 27 +- hwy/ops/ppc_vsx-inl.h | 98 +++---- hwy/ops/scalar-inl.h | 21 +- hwy/ops/wasm_128-inl.h | 147 +++++----- hwy/ops/wasm_256-inl.h | 11 + hwy/ops/x86_128-inl.h | 176 ++++++------ hwy/ops/x86_256-inl.h | 144 +++++----- hwy/ops/x86_512-inl.h | 80 +++--- hwy/tests/mask_set_test.cc | 317 +++++++++++++++++++++ hwy/tests/mask_test.cc | 328 +++------------------ 16 files changed, 1122 insertions(+), 980 deletions(-) create mode 100644 hwy/tests/mask_set_test.cc diff --git a/BUILD b/BUILD index cea4582d56..ee379363cc 100644 --- a/BUILD +++ b/BUILD @@ -514,6 +514,7 @@ HWY_TESTS = [ ("hwy/tests/", "mask_combine_test"), ("hwy/tests/", "mask_convert_test"), ("hwy/tests/", "mask_mem_test"), + ("hwy/tests/", "mask_set_test"), ("hwy/tests/", "mask_slide_test"), ("hwy/tests/", "mask_test"), ("hwy/tests/", "masked_arithmetic_test"), diff --git a/CMakeLists.txt b/CMakeLists.txt index 5a7fc8cb24..30196faa78 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -747,6 +747,7 @@ set(HWY_TEST_FILES hwy/tests/mask_combine_test.cc hwy/tests/mask_convert_test.cc hwy/tests/mask_mem_test.cc + hwy/tests/mask_set_test.cc hwy/tests/mask_slide_test.cc hwy/tests/mask_test.cc hwy/tests/masked_arithmetic_test.cc diff --git a/g3doc/quick_reference.md b/g3doc/quick_reference.md index 8220e9b718..9fd11acacd 100644 --- a/g3doc/quick_reference.md +++ b/g3doc/quick_reference.md @@ -1153,6 +1153,12 @@ encoding depends on the platform). * V **VecFromMask**(D, M m): returns 0 in lane `i` if `m[i] == false`, otherwise all bits set. +* uint64_t **BitsFromMask**(D, M m): returns bits `b` such that + `(b >> i) & 1` indicates whether `m[i]` was set, and any remaining bits in + the `uint64_t` are zero. 
This is only available if `HWY_MAX_BYTES <= 64`, + because 512-bit vectors are the longest for which there are no more than 64 + lanes and thus mask bits. + * size_t **StoreMaskBits**(D, M m, uint8_t* p): stores a bit array indicating whether `m[i]` is true, in ascending order of `i`, filling the bits of each byte from least to most significant, then proceeding to the @@ -1163,11 +1169,11 @@ encoding depends on the platform). Mask<DFrom> m): Promotes `m` to a mask with a lane type of `TFromD`, `DFrom` is `Rebind`. - `PromoteMaskTo(d_to, d_from, m)` is equivalent to - `MaskFromVec(BitCast(d_to, PromoteTo(di_to, BitCast(di_from, - VecFromMask(d_from, m)))))`, where `di_from` is `RebindToSigned()` - and `di_from` is `RebindToSigned()`, but - `PromoteMaskTo(d_to, d_from, m)` is more efficient on some targets. + `PromoteMaskTo(d_to, d_from, m)` is equivalent to `MaskFromVec(BitCast(d_to, + PromoteTo(di_to, BitCast(di_from, VecFromMask(d_from, m)))))`, where + `di_from` is `RebindToSigned()` and `di_from` is + `RebindToSigned()`, but `PromoteMaskTo(d_to, d_from, m)` is more + efficient on some targets. PromoteMaskTo requires that `sizeof(TFromD) < sizeof(TFromD)` be true. @@ -1176,11 +1182,11 @@ encoding depends on the platform). Mask<DFrom> m): Demotes `m` to a mask with a lane type of `TFromD`, `DFrom` is `Rebind`. - `DemoteMaskTo(d_to, d_from, m)` is equivalent to - `MaskFromVec(BitCast(d_to, DemoteTo(di_to, BitCast(di_from, - VecFromMask(d_from, m)))))`, where `di_from` is `RebindToSigned()` - and `di_from` is `RebindToSigned()`, but - `DemoteMaskTo(d_to, d_from, m)` is more efficient on some targets. + `DemoteMaskTo(d_to, d_from, m)` is equivalent to `MaskFromVec(BitCast(d_to, + DemoteTo(di_to, BitCast(di_from, VecFromMask(d_from, m)))))`, where + `di_from` is `RebindToSigned()` and `di_from` is + `RebindToSigned()`, but `DemoteMaskTo(d_to, d_from, m)` is more + efficient on some targets. DemoteMaskTo requires that `sizeof(TFromD) > sizeof(TFromD)` be true. @@ -1189,16 +1195,15 @@ encoding depends on the platform). whose `LowerHalf` is the first argument and whose `UpperHalf` is the second argument; `M2` is `Mask>`; `DTo` is `Repartition`. - OrderedDemote2MasksTo requires that - `sizeof(TFromD) == sizeof(TFromD) * 2` be true. + OrderedDemote2MasksTo requires that `sizeof(TFromD) == + sizeof(TFromD) * 2` be true. `OrderedDemote2MasksTo(d_to, d_from, a, b)` is equivalent to `MaskFromVec(BitCast(d_to, OrderedDemote2To(di_to, va, vb)))`, where `va` is - `BitCast(di_from, MaskFromVec(d_from, a))`, `vb` is - `BitCast(di_from, MaskFromVec(d_from, b))`, `di_to` is - `RebindToSigned()`, and `di_from` is `RebindToSigned()`, but - `OrderedDemote2MasksTo(d_to, d_from, a, b)` is more efficient on some - targets. + `BitCast(di_from, MaskFromVec(d_from, a))`, `vb` is `BitCast(di_from, + MaskFromVec(d_from, b))`, `di_to` is `RebindToSigned()`, and `di_from` + is `RebindToSigned()`, but `OrderedDemote2MasksTo(d_to, d_from, a, + b)` is more efficient on some targets. OrderedDemote2MasksTo is only available if `HWY_TARGET != HWY_SCALAR` is true. 
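
A minimal sketch of how the BitsFromMask op documented at the top of this section can be consumed, assuming a target where `HWY_MAX_BYTES <= 64` and static dispatch. IndicesAbove and its parameters are illustrative names; BitsFromMask, Gt, LoadU, Set, CountTrue and hwy::Num0BitsBelowLS1Bit_Nonzero64 are existing Highway APIs.

#include <cstddef>
#include <cstdint>

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Appends the indices (relative to `base`) of lanes where p[base + i] exceeds
// `threshold`. Illustrates the documented bit layout: bit i of the returned
// uint64_t corresponds to lane i, and all higher bits are zero.
// Assumes at least Lanes(d) readable floats at p + base.
size_t IndicesAbove(const float* HWY_RESTRICT p, size_t base, float threshold,
                    size_t* HWY_RESTRICT out) {
  const hn::ScalableTag<float> d;
  const auto v = hn::LoadU(d, p + base);
  const auto m = hn::Gt(v, hn::Set(d, threshold));
  uint64_t bits = hn::BitsFromMask(d, m);  // lane i -> bit i; upper bits zero
  size_t num = 0;
  while (bits != 0) {
    const size_t lane = hwy::Num0BitsBelowLS1Bit_Nonzero64(bits);
    out[num++] = base + lane;
    bits &= bits - 1;  // clear the lowest set bit
  }
  return num;  // equals hn::CountTrue(d, m)
}

PopCount of the returned bits equals CountTrue(d, m); when only the first or last set lane is needed, FindFirstTrue / FindLastTrue remain the more direct ops.
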
diff --git a/hwy/ops/arm_neon-inl.h b/hwy/ops/arm_neon-inl.h index 37294205a7..c8b9f7312b 100644 --- a/hwy/ops/arm_neon-inl.h +++ b/hwy/ops/arm_neon-inl.h @@ -21,6 +21,7 @@ // Arm NEON intrinsics are documented at: // https://developer.arm.com/architectures/instruction-sets/intrinsics/#f:@navigationhierarchiessimdisa=[Neon] +#include "hwy/base.h" #include "hwy/ops/shared-inl.h" HWY_DIAGNOSTICS(push) @@ -8921,14 +8922,22 @@ HWY_INLINE uint64_t NibblesFromMask(D d, MFromD mask) { return nib & ((1ull << (d.MaxBytes() * 4)) - 1); } -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128 mask) { +// Returns the lowest N for the BitsFromMask result. +template +constexpr uint64_t OnlyActive(D d, uint64_t bits) { + return (d.MaxBytes() >= 8) ? bits : (bits & ((1ull << d.MaxLanes()) - 1)); +} + +} // namespace detail + +template +HWY_API uint64_t BitsFromMask(D d, MFromD mask) { alignas(16) static constexpr uint8_t kSliceLanes[16] = { 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80, 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80, }; - const Full128 du; + const RebindToUnsigned du; const Vec128 values = - BitCast(du, VecFromMask(Full128(), mask)) & Load(du, kSliceLanes); + BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes); #if HWY_ARCH_ARM_A64 // Can't vaddv - we need two separate bytes (16 bits). @@ -8945,126 +8954,114 @@ HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128 mask) { #endif } -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128 mask) { +template +HWY_API uint64_t BitsFromMask(D d, MFromD mask) { // Upper lanes of partial loads are undefined. OnlyActive will fix this if // we load all kSliceLanes so the upper lanes do not pollute the valid bits. alignas(8) static constexpr uint8_t kSliceLanes[8] = {1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80}; - const DFromM d; const RebindToUnsigned du; - const Vec128 slice(Load(Full64(), kSliceLanes).raw); - const Vec128 values = BitCast(du, VecFromMask(d, mask)) & slice; + using VU = VFromD; + const VU slice(Load(Full64(), kSliceLanes).raw); + const VU values = BitCast(du, VecFromMask(d, mask)) & slice; #if HWY_ARCH_ARM_A64 - return vaddv_u8(values.raw); + return detail::OnlyActive(d, vaddv_u8(values.raw)); #else const uint16x4_t x2 = vpaddl_u8(values.raw); const uint32x2_t x4 = vpaddl_u16(x2); const uint64x1_t x8 = vpaddl_u32(x4); - return vget_lane_u64(x8, 0); + return detail::OnlyActive(d, vget_lane_u64(x8, 0)); #endif } -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, Mask128 mask) { +template +HWY_API uint64_t BitsFromMask(D d, MFromD mask) { alignas(16) static constexpr uint16_t kSliceLanes[8] = { 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80}; - const Full128 d; - const Full128 du; + const RebindToUnsigned du; const Vec128 values = BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes); #if HWY_ARCH_ARM_A64 - return vaddvq_u16(values.raw); + return detail::OnlyActive(d, vaddvq_u16(values.raw)); #else const uint32x4_t x2 = vpaddlq_u16(values.raw); const uint64x2_t x4 = vpaddlq_u32(x2); - return vgetq_lane_u64(x4, 0) + vgetq_lane_u64(x4, 1); + return detail::OnlyActive(d, vgetq_lane_u64(x4, 0) + vgetq_lane_u64(x4, 1)); #endif } -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, Mask128 mask) { +template +HWY_API uint64_t BitsFromMask(D d, MFromD mask) { // Upper lanes of partial loads are undefined. OnlyActive will fix this if // we load all kSliceLanes so the upper lanes do not pollute the valid bits. 
alignas(8) static constexpr uint16_t kSliceLanes[4] = {1, 2, 4, 8}; - const DFromM d; const RebindToUnsigned du; - const Vec128 slice(Load(Full64(), kSliceLanes).raw); - const Vec128 values = BitCast(du, VecFromMask(d, mask)) & slice; + using VU = VFromD; + const VU slice(Load(Full64(), kSliceLanes).raw); + const VU values = BitCast(du, VecFromMask(d, mask)) & slice; #if HWY_ARCH_ARM_A64 - return vaddv_u16(values.raw); + return detail::OnlyActive(d, vaddv_u16(values.raw)); #else const uint32x2_t x2 = vpaddl_u16(values.raw); const uint64x1_t x4 = vpaddl_u32(x2); - return vget_lane_u64(x4, 0); + return detail::OnlyActive(d, vget_lane_u64(x4, 0)); #endif } -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128 mask) { +template +HWY_API uint64_t BitsFromMask(D d, MFromD mask) { alignas(16) static constexpr uint32_t kSliceLanes[4] = {1, 2, 4, 8}; - const Full128 d; - const Full128 du; + const RebindToUnsigned du; const Vec128 values = BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes); #if HWY_ARCH_ARM_A64 - return vaddvq_u32(values.raw); + return detail::OnlyActive(d, vaddvq_u32(values.raw)); #else const uint64x2_t x2 = vpaddlq_u32(values.raw); - return vgetq_lane_u64(x2, 0) + vgetq_lane_u64(x2, 1); + return detail::OnlyActive(d, vgetq_lane_u64(x2, 0) + vgetq_lane_u64(x2, 1)); #endif } -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128 mask) { +template +HWY_API uint64_t BitsFromMask(D d, MFromD mask) { // Upper lanes of partial loads are undefined. OnlyActive will fix this if // we load all kSliceLanes so the upper lanes do not pollute the valid bits. alignas(8) static constexpr uint32_t kSliceLanes[2] = {1, 2}; - const DFromM d; const RebindToUnsigned du; - const Vec128 slice(Load(Full64(), kSliceLanes).raw); - const Vec128 values = BitCast(du, VecFromMask(d, mask)) & slice; + using VU = VFromD; + const VU slice(Load(Full64(), kSliceLanes).raw); + const VU values = BitCast(du, VecFromMask(d, mask)) & slice; #if HWY_ARCH_ARM_A64 - return vaddv_u32(values.raw); + return detail::OnlyActive(d, vaddv_u32(values.raw)); #else const uint64x1_t x2 = vpaddl_u32(values.raw); - return vget_lane_u64(x2, 0); + return detail::OnlyActive(d, vget_lane_u64(x2, 0)); #endif } -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128 m) { +template +HWY_API uint64_t BitsFromMask(D d, MFromD mask) { alignas(16) static constexpr uint64_t kSliceLanes[2] = {1, 2}; - const Full128 d; - const Full128 du; + const RebindToUnsigned du; const Vec128 values = - BitCast(du, VecFromMask(d, m)) & Load(du, kSliceLanes); + BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes); #if HWY_ARCH_ARM_A64 - return vaddvq_u64(values.raw); + return detail::OnlyActive(d, vaddvq_u64(values.raw)); #else - return vgetq_lane_u64(values.raw, 0) + vgetq_lane_u64(values.raw, 1); + return detail::OnlyActive( + d, vgetq_lane_u64(values.raw, 0) + vgetq_lane_u64(values.raw, 1)); #endif } -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128 m) { - const Full64 d; - const Full64 du; - const Vec64 values = BitCast(du, VecFromMask(d, m)) & Set(du, 1); +template +HWY_API uint64_t BitsFromMask(D d, MFromD mask) { + const RebindToUnsigned du; + const Vec64 values = BitCast(du, VecFromMask(d, mask)) & Set(du, 1); return vget_lane_u64(values.raw, 0); } -// Returns the lowest N for the BitsFromMask result. -template -constexpr uint64_t OnlyActive(uint64_t bits) { - return ((N * sizeof(T)) >= 8) ? 
bits : (bits & ((1ull << N) - 1)); -} - -template -HWY_INLINE uint64_t BitsFromMask(Mask128 mask) { - return OnlyActive(BitsFromMask(hwy::SizeTag(), mask)); -} +namespace detail { // Returns number of lanes whose mask is set. // @@ -9184,7 +9181,7 @@ HWY_API intptr_t FindLastTrue(D d, MFromD mask) { // `p` points to at least 8 writable bytes. template HWY_API size_t StoreMaskBits(D d, MFromD mask, uint8_t* bits) { - const uint64_t mask_bits = detail::BitsFromMask(mask); + const uint64_t mask_bits = BitsFromMask(d, mask); const size_t kNumBytes = (d.MaxLanes() + 7) / 8; CopyBytes(&mask_bits, bits); return kNumBytes; @@ -9672,7 +9669,8 @@ HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { // General case, 2 or 4 byte lanes template HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { - return detail::Compress(v, detail::BitsFromMask(mask)); + const DFromV d; + return detail::Compress(v, BitsFromMask(d, mask)); } // Single lane: no-op @@ -9696,12 +9694,13 @@ HWY_API Vec128 CompressNot(Vec128 v, Mask128 mask) { // General case, 2 or 4 byte lanes template HWY_API Vec128 CompressNot(Vec128 v, Mask128 mask) { + const DFromV d; // For partial vectors, we cannot pull the Not() into the table because // BitsFromMask clears the upper bits. if (N < 16 / sizeof(T)) { - return detail::Compress(v, detail::BitsFromMask(Not(mask))); + return detail::Compress(v, BitsFromMask(d, Not(mask))); } - return detail::CompressNot(v, detail::BitsFromMask(mask)); + return detail::CompressNot(v, BitsFromMask(d, mask)); } // ------------------------------ CompressBlocksNot @@ -9729,7 +9728,7 @@ HWY_INLINE Vec128 CompressBits(Vec128 v, template HWY_API size_t CompressStore(VFromD v, MFromD mask, D d, TFromD* HWY_RESTRICT unaligned) { - const uint64_t mask_bits = detail::BitsFromMask(mask); + const uint64_t mask_bits = BitsFromMask(d, mask); StoreU(detail::Compress(v, mask_bits), d, unaligned); return PopCount(mask_bits); } @@ -9739,7 +9738,7 @@ template HWY_API size_t CompressBlendedStore(VFromD v, MFromD m, D d, TFromD* HWY_RESTRICT unaligned) { const RebindToUnsigned du; // so we can support fp16/bf16 - const uint64_t mask_bits = detail::BitsFromMask(m); + const uint64_t mask_bits = BitsFromMask(d, m); const size_t count = PopCount(mask_bits); const MFromD store_mask = RebindMask(d, FirstN(du, count)); const VFromD compressed = diff --git a/hwy/ops/arm_sve-inl.h b/hwy/ops/arm_sve-inl.h index 2dde1479de..502440ebef 100644 --- a/hwy/ops/arm_sve-inl.h +++ b/hwy/ops/arm_sve-inl.h @@ -1099,7 +1099,7 @@ HWY_API V RotateRight(const V v) { } #endif -// ------------------------------ Shl/r +// ------------------------------ Shl, Shr #define HWY_SVE_SHIFT(BASE, CHAR, BITS, HALF, NAME, OP) \ HWY_API HWY_SVE_V(BASE, BITS) \ @@ -1584,6 +1584,93 @@ HWY_API V MaskedSatSubOr(V no, M m, V a, V b) { } #endif +// ================================================== REDUCE + +#ifdef HWY_NATIVE_REDUCE_SCALAR +#undef HWY_NATIVE_REDUCE_SCALAR +#else +#define HWY_NATIVE_REDUCE_SCALAR +#endif + +// These return T, suitable for ReduceSum. +namespace detail { +#define HWY_SVE_REDUCE_ADD(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_T(BASE, BITS) NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) v) { \ + /* The intrinsic returns [u]int64_t; truncate to T so we can broadcast. 
*/ \ + using T = HWY_SVE_T(BASE, BITS); \ + using TU = MakeUnsigned; \ + constexpr uint64_t kMask = LimitsMax(); \ + return static_cast(static_cast( \ + static_cast(sv##OP##_##CHAR##BITS(pg, v)) & kMask)); \ + } + +#define HWY_SVE_REDUCE(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_T(BASE, BITS) NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) v) { \ + return sv##OP##_##CHAR##BITS(pg, v); \ + } + +HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE_ADD, SumOfLanesM, addv) +HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, SumOfLanesM, addv) + +HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MinOfLanesM, minv) +HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MaxOfLanesM, maxv) +// NaN if all are +HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MinOfLanesM, minnmv) +HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MaxOfLanesM, maxnmv) + +#undef HWY_SVE_REDUCE +#undef HWY_SVE_REDUCE_ADD +} // namespace detail + +// detail::SumOfLanesM, detail::MinOfLanesM, and detail::MaxOfLanesM is more +// efficient for N=4 I8/U8 reductions on SVE than the default implementations +// of the N=4 I8/U8 ReduceSum/ReduceMin/ReduceMax operations in +// generic_ops-inl.h +#undef HWY_IF_REDUCE_D +#define HWY_IF_REDUCE_D(D) hwy::EnableIf* = nullptr + +#ifdef HWY_NATIVE_REDUCE_SUM_4_UI8 +#undef HWY_NATIVE_REDUCE_SUM_4_UI8 +#else +#define HWY_NATIVE_REDUCE_SUM_4_UI8 +#endif + +#ifdef HWY_NATIVE_REDUCE_MINMAX_4_UI8 +#undef HWY_NATIVE_REDUCE_MINMAX_4_UI8 +#else +#define HWY_NATIVE_REDUCE_MINMAX_4_UI8 +#endif + +template +HWY_API TFromD ReduceSum(D d, VFromD v) { + return detail::SumOfLanesM(detail::MakeMask(d), v); +} + +template +HWY_API TFromD ReduceMin(D d, VFromD v) { + return detail::MinOfLanesM(detail::MakeMask(d), v); +} + +template +HWY_API TFromD ReduceMax(D d, VFromD v) { + return detail::MaxOfLanesM(detail::MakeMask(d), v); +} + +// ------------------------------ SumOfLanes + +template +HWY_API VFromD SumOfLanes(D d, VFromD v) { + return Set(d, ReduceSum(d, v)); +} +template +HWY_API VFromD MinOfLanes(D d, VFromD v) { + return Set(d, ReduceMin(d, v)); +} +template +HWY_API VFromD MaxOfLanes(D d, VFromD v) { + return Set(d, ReduceMax(d, v)); +} + // ================================================== COMPARE // mask = f(vector, vector) @@ -1643,6 +1730,133 @@ HWY_API svbool_t TestBit(const V a, const V bit) { return detail::NeN(And(a, bit), 0); } +// ================================================== LANE ACCESS + +// ------------------------------ GetLane + +namespace detail { +#define HWY_SVE_GET_LANE(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_INLINE HWY_SVE_T(BASE, BITS) \ + NAME(HWY_SVE_V(BASE, BITS) v, svbool_t mask) { \ + return sv##OP##_##CHAR##BITS(mask, v); \ + } + +HWY_SVE_FOREACH(HWY_SVE_GET_LANE, GetLaneM, lasta) +HWY_SVE_FOREACH(HWY_SVE_GET_LANE, ExtractLastMatchingLaneM, lastb) +#undef HWY_SVE_GET_LANE +} // namespace detail + +template +HWY_API TFromV GetLane(V v) { + return detail::GetLaneM(v, detail::PFalse()); +} + +// ------------------------------ ExtractLane +template +HWY_API TFromV ExtractLane(V v, size_t i) { + return detail::GetLaneM(v, FirstN(DFromV(), i)); +} + +// ------------------------------ InsertLane (IfThenElse, EqN) +template +HWY_API V InsertLane(const V v, size_t i, T t) { + static_assert(sizeof(TFromV) == sizeof(T), "Lane size mismatch"); + const DFromV d; + const RebindToSigned di; + using TI = TFromD; + const svbool_t is_i = detail::EqN(Iota(di, 0), static_cast(i)); + return IfThenElse(RebindMask(d, is_i), + Set(d, hwy::ConvertScalarTo>(t)), v); +} + +// ================================================== SWIZZLE + +// ------------------------------ 
ConcatEven/ConcatOdd + +// WARNING: the upper half of these needs fixing up (uzp1/uzp2 use the +// full vector length, not rounded down to a power of two as we require). +namespace detail { + +#define HWY_SVE_CONCAT_EVERY_SECOND(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_INLINE HWY_SVE_V(BASE, BITS) \ + NAME(HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo) { \ + return sv##OP##_##CHAR##BITS(lo, hi); \ + } +HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenFull, uzp1) +HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddFull, uzp2) +#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC +HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenFull, + uzp1) +HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddFull, + uzp2) +#endif // HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC +#if defined(__ARM_FEATURE_SVE_MATMUL_FP64) +HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenBlocks, uzp1q) +HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddBlocks, uzp2q) +#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC +HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CONCAT_EVERY_SECOND, + ConcatEvenBlocks, uzp1q) +HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddBlocks, + uzp2q) +#endif // HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC +#endif // defined(__ARM_FEATURE_SVE_MATMUL_FP64) +#undef HWY_SVE_CONCAT_EVERY_SECOND + +// Used to slide up / shift whole register left; mask indicates which range +// to take from lo, and the rest is filled from hi starting at its lowest. +#define HWY_SVE_SPLICE(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_V(BASE, BITS) NAME( \ + HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo, svbool_t mask) { \ + return sv##OP##_##CHAR##BITS(mask, lo, hi); \ + } +HWY_SVE_FOREACH(HWY_SVE_SPLICE, Splice, splice) +#if HWY_SVE_HAVE_BF16_FEATURE +HWY_SVE_FOREACH_BF16(HWY_SVE_SPLICE, Splice, splice) +#else +template )> +HWY_INLINE V Splice(V hi, V lo, svbool_t mask) { + const DFromV d; + const RebindToUnsigned du; + return BitCast(d, Splice(BitCast(du, hi), BitCast(du, lo), mask)); +} +#endif // HWY_SVE_HAVE_BF16_FEATURE +#undef HWY_SVE_SPLICE + +} // namespace detail + +template +HWY_API VFromD ConcatOdd(D d, VFromD hi, VFromD lo) { +#if HWY_SVE_IS_POW2 + if (detail::IsFull(d)) return detail::ConcatOddFull(hi, lo); +#endif + const VFromD hi_odd = detail::ConcatOddFull(hi, hi); + const VFromD lo_odd = detail::ConcatOddFull(lo, lo); + return detail::Splice(hi_odd, lo_odd, FirstN(d, Lanes(d) / 2)); +} + +template +HWY_API VFromD ConcatEven(D d, VFromD hi, VFromD lo) { +#if HWY_SVE_IS_POW2 + if (detail::IsFull(d)) return detail::ConcatEvenFull(hi, lo); +#endif + const VFromD hi_odd = detail::ConcatEvenFull(hi, hi); + const VFromD lo_odd = detail::ConcatEvenFull(lo, lo); + return detail::Splice(hi_odd, lo_odd, FirstN(d, Lanes(d) / 2)); +} + +HWY_API svuint8_t U8FromU32(const svuint32_t v) { + const DFromV du32; + const RepartitionToNarrow du16; + const RepartitionToNarrow du8; + + const svuint16_t cast16 = BitCast(du16, v); + const svuint16_t x2 = svuzp1_u16(cast16, cast16); + const svuint8_t cast8 = BitCast(du8, x2); + return svuzp1_u8(cast8, cast8); +} + +// ================================================== MASK + // ------------------------------ MaskFromVec (Ne) template HWY_API svbool_t MaskFromVec(const V v) { @@ -1659,6 +1873,87 @@ HWY_API VFromD VecFromMask(const D d, svbool_t mask) { return BitCast(d, IfThenElseZero(mask, Set(di, -1))); } +// ------------------------------ BitsFromMask 
(AndN, Shl, ReduceSum, GetLane +// ConcatEvenFull, U8FromU32) + +namespace detail { + +// For each mask lane (governing lane type T), store 1 or 0 in BYTE lanes. +template +HWY_INLINE svuint8_t BoolFromMask(svbool_t m) { + return svdup_n_u8_z(m, 1); +} +template +HWY_INLINE svuint8_t BoolFromMask(svbool_t m) { + const ScalableTag d8; + const svuint8_t b16 = BitCast(d8, svdup_n_u16_z(m, 1)); + return detail::ConcatEvenFull(b16, b16); // lower half +} +template +HWY_INLINE svuint8_t BoolFromMask(svbool_t m) { + return U8FromU32(svdup_n_u32_z(m, 1)); +} +template +HWY_INLINE svuint8_t BoolFromMask(svbool_t m) { + const ScalableTag d32; + const svuint32_t b64 = BitCast(d32, svdup_n_u64_z(m, 1)); + return U8FromU32(detail::ConcatEvenFull(b64, b64)); // lower half +} + +// Compacts groups of 8 u8 into 8 contiguous bits in a 64-bit lane. +HWY_INLINE svuint64_t BitsFromBool(svuint8_t x) { + const ScalableTag d8; + const ScalableTag d16; + const ScalableTag d32; + const ScalableTag d64; + // TODO(janwas): could use SVE2 BDEP, but it's optional. + x = Or(x, BitCast(d8, ShiftRight<7>(BitCast(d16, x)))); + x = Or(x, BitCast(d8, ShiftRight<14>(BitCast(d32, x)))); + x = Or(x, BitCast(d8, ShiftRight<28>(BitCast(d64, x)))); + return BitCast(d64, x); +} + +} // namespace detail + +// BitsFromMask is required if `HWY_MAX_BYTES <= 64`, which is true for the +// fixed-size SVE targets. +#if HWY_TARGET == HWY_SVE2_128 || HWY_TARGET == HWY_SVE_256 +template +HWY_API uint64_t BitsFromMask(D d, svbool_t mask) { + const Repartition du64; + svuint64_t bits_in_u64 = detail::BitsFromBool(detail::BoolFromMask(mask)); + + constexpr size_t N = Lanes(d); + static_assert(N < 64, "SVE2_128 and SVE_256 are only 128 or 256 bits"); + const uint64_t valid = (1ull << N) - 1; + if constexpr (N <= 8) { + // Upper bits are undefined even if N == 8, hence mask. + return GetLane(bits_in_u64) & valid; + } + + // Up to 8 of the least-significant bits of each u64 lane are valid. + bits_in_u64 = detail::AndN(bits_in_u64, 0xFF); + + // 128-bit vector: only two u64, so avoid ReduceSum. + if constexpr (HWY_TARGET == HWY_SVE2_128) { + alignas(16) uint64_t lanes[2]; + Store(bits_in_u64, du64, lanes); + // lanes[0] is always valid because we know N > 8, but lanes[1] might + // not be - we may mask it out below. + const uint64_t result = lanes[0] + (lanes[1] << 8); + // 8-bit lanes, no further masking + if constexpr (N == 16) return result; + return result & valid; + } + + // Shift the 8-bit groups into place in each u64 lane. 
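// Only HWY_SVE_256 reaches this point (the 128-bit case returned above), so
// there are up to four u64 lanes, each carrying at most 8 valid mask bits in
// its low byte. Shifting lane i left by 8 * i and summing packs them into one
// contiguous bitfield; the sum equals a bitwise OR because the shifted byte
// fields do not overlap, and the final "& valid" clears any bits at or above N.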
+ alignas(32) uint64_t kShifts[4] = {0 * 8, 1 * 8, 2 * 8, 3 * 8}; + bits_in_u64 = Shl(bits_in_u64, Load(du64, kShifts)); + return ReduceSum(du64, bits_in_u64) & valid; +} + +#endif // HWY_TARGET == HWY_SVE2_128 || HWY_TARGET == HWY_SVE_256 + // ------------------------------ IsNegative (Lt) #ifdef HWY_NATIVE_IS_NEGATIVE #undef HWY_NATIVE_IS_NEGATIVE @@ -2368,17 +2663,6 @@ HWY_API svuint8_t DemoteTo(Simd dn, const svint32_t v) { return svuzp1_u8(x2, x2); } -HWY_API svuint8_t U8FromU32(const svuint32_t v) { - const DFromV du32; - const RepartitionToNarrow du16; - const RepartitionToNarrow du8; - - const svuint16_t cast16 = BitCast(du16, v); - const svuint16_t x2 = svuzp1_u16(cast16, cast16); - const svuint8_t cast8 = BitCast(du8, x2); - return svuzp1_u8(cast8, cast8); -} - template HWY_API svuint8_t DemoteTo(Simd dn, const svuint16_t v) { #if HWY_SVE_HAVE_2 @@ -2631,79 +2915,6 @@ HWY_API VFromD DemoteTo(D dn, V v) { return BitCast(dn, TruncateTo(dn_u, detail::SaturateU>(v))); } -// ------------------------------ ConcatEven/ConcatOdd - -// WARNING: the upper half of these needs fixing up (uzp1/uzp2 use the -// full vector length, not rounded down to a power of two as we require). -namespace detail { - -#define HWY_SVE_CONCAT_EVERY_SECOND(BASE, CHAR, BITS, HALF, NAME, OP) \ - HWY_INLINE HWY_SVE_V(BASE, BITS) \ - NAME(HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo) { \ - return sv##OP##_##CHAR##BITS(lo, hi); \ - } -HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenFull, uzp1) -HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddFull, uzp2) -#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC -HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenFull, - uzp1) -HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddFull, - uzp2) -#endif // HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC -#if defined(__ARM_FEATURE_SVE_MATMUL_FP64) -HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenBlocks, uzp1q) -HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddBlocks, uzp2q) -#if HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC -HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CONCAT_EVERY_SECOND, - ConcatEvenBlocks, uzp1q) -HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddBlocks, - uzp2q) -#endif // HWY_SVE_HAVE_BF16_FEATURE || HWY_SVE_HAVE_BF16_VEC -#endif // defined(__ARM_FEATURE_SVE_MATMUL_FP64) -#undef HWY_SVE_CONCAT_EVERY_SECOND - -// Used to slide up / shift whole register left; mask indicates which range -// to take from lo, and the rest is filled from hi starting at its lowest. 
-#define HWY_SVE_SPLICE(BASE, CHAR, BITS, HALF, NAME, OP) \ - HWY_API HWY_SVE_V(BASE, BITS) NAME( \ - HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo, svbool_t mask) { \ - return sv##OP##_##CHAR##BITS(mask, lo, hi); \ - } -HWY_SVE_FOREACH(HWY_SVE_SPLICE, Splice, splice) -#if HWY_SVE_HAVE_BF16_FEATURE -HWY_SVE_FOREACH_BF16(HWY_SVE_SPLICE, Splice, splice) -#else -template )> -HWY_INLINE V Splice(V hi, V lo, svbool_t mask) { - const DFromV d; - const RebindToUnsigned du; - return BitCast(d, Splice(BitCast(du, hi), BitCast(du, lo), mask)); -} -#endif // HWY_SVE_HAVE_BF16_FEATURE -#undef HWY_SVE_SPLICE - -} // namespace detail - -template -HWY_API VFromD ConcatOdd(D d, VFromD hi, VFromD lo) { -#if HWY_SVE_IS_POW2 - if (detail::IsFull(d)) return detail::ConcatOddFull(hi, lo); -#endif - const VFromD hi_odd = detail::ConcatOddFull(hi, hi); - const VFromD lo_odd = detail::ConcatOddFull(lo, lo); - return detail::Splice(hi_odd, lo_odd, FirstN(d, Lanes(d) / 2)); -} - -template -HWY_API VFromD ConcatEven(D d, VFromD hi, VFromD lo) { -#if HWY_SVE_IS_POW2 - if (detail::IsFull(d)) return detail::ConcatEvenFull(hi, lo); -#endif - const VFromD hi_odd = detail::ConcatEvenFull(hi, hi); - const VFromD lo_odd = detail::ConcatEvenFull(lo, lo); - return detail::Splice(hi_odd, lo_odd, FirstN(d, Lanes(d) / 2)); -} - // ------------------------------ PromoteEvenTo/PromoteOddTo // Signed to signed PromoteEvenTo: 1 instruction instead of 2 in generic-inl.h. @@ -3216,132 +3427,8 @@ HWY_API V UpperHalf(const DH dh, const V v) { #endif } -// ================================================== REDUCE - -#ifdef HWY_NATIVE_REDUCE_SCALAR -#undef HWY_NATIVE_REDUCE_SCALAR -#else -#define HWY_NATIVE_REDUCE_SCALAR -#endif - -// These return T, suitable for ReduceSum. -namespace detail { -#define HWY_SVE_REDUCE_ADD(BASE, CHAR, BITS, HALF, NAME, OP) \ - HWY_API HWY_SVE_T(BASE, BITS) NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) v) { \ - /* The intrinsic returns [u]int64_t; truncate to T so we can broadcast. 
*/ \ - using T = HWY_SVE_T(BASE, BITS); \ - using TU = MakeUnsigned; \ - constexpr uint64_t kMask = LimitsMax(); \ - return static_cast(static_cast( \ - static_cast(sv##OP##_##CHAR##BITS(pg, v)) & kMask)); \ - } - -#define HWY_SVE_REDUCE(BASE, CHAR, BITS, HALF, NAME, OP) \ - HWY_API HWY_SVE_T(BASE, BITS) NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) v) { \ - return sv##OP##_##CHAR##BITS(pg, v); \ - } - -HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE_ADD, SumOfLanesM, addv) -HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, SumOfLanesM, addv) - -HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MinOfLanesM, minv) -HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MaxOfLanesM, maxv) -// NaN if all are -HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MinOfLanesM, minnmv) -HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MaxOfLanesM, maxnmv) - -#undef HWY_SVE_REDUCE -#undef HWY_SVE_REDUCE_ADD -} // namespace detail - -// detail::SumOfLanesM, detail::MinOfLanesM, and detail::MaxOfLanesM is more -// efficient for N=4 I8/U8 reductions on SVE than the default implementations -// of the N=4 I8/U8 ReduceSum/ReduceMin/ReduceMax operations in -// generic_ops-inl.h -#undef HWY_IF_REDUCE_D -#define HWY_IF_REDUCE_D(D) hwy::EnableIf* = nullptr - -#ifdef HWY_NATIVE_REDUCE_SUM_4_UI8 -#undef HWY_NATIVE_REDUCE_SUM_4_UI8 -#else -#define HWY_NATIVE_REDUCE_SUM_4_UI8 -#endif - -#ifdef HWY_NATIVE_REDUCE_MINMAX_4_UI8 -#undef HWY_NATIVE_REDUCE_MINMAX_4_UI8 -#else -#define HWY_NATIVE_REDUCE_MINMAX_4_UI8 -#endif - -template -HWY_API TFromD ReduceSum(D d, VFromD v) { - return detail::SumOfLanesM(detail::MakeMask(d), v); -} - -template -HWY_API TFromD ReduceMin(D d, VFromD v) { - return detail::MinOfLanesM(detail::MakeMask(d), v); -} - -template -HWY_API TFromD ReduceMax(D d, VFromD v) { - return detail::MaxOfLanesM(detail::MakeMask(d), v); -} - -// ------------------------------ SumOfLanes - -template -HWY_API VFromD SumOfLanes(D d, VFromD v) { - return Set(d, ReduceSum(d, v)); -} -template -HWY_API VFromD MinOfLanes(D d, VFromD v) { - return Set(d, ReduceMin(d, v)); -} -template -HWY_API VFromD MaxOfLanes(D d, VFromD v) { - return Set(d, ReduceMax(d, v)); -} - // ================================================== SWIZZLE -// ------------------------------ GetLane - -namespace detail { -#define HWY_SVE_GET_LANE(BASE, CHAR, BITS, HALF, NAME, OP) \ - HWY_INLINE HWY_SVE_T(BASE, BITS) \ - NAME(HWY_SVE_V(BASE, BITS) v, svbool_t mask) { \ - return sv##OP##_##CHAR##BITS(mask, v); \ - } - -HWY_SVE_FOREACH(HWY_SVE_GET_LANE, GetLaneM, lasta) -HWY_SVE_FOREACH(HWY_SVE_GET_LANE, ExtractLastMatchingLaneM, lastb) -#undef HWY_SVE_GET_LANE -} // namespace detail - -template -HWY_API TFromV GetLane(V v) { - return detail::GetLaneM(v, detail::PFalse()); -} - -// ------------------------------ ExtractLane -template -HWY_API TFromV ExtractLane(V v, size_t i) { - return detail::GetLaneM(v, FirstN(DFromV(), i)); -} - -// ------------------------------ InsertLane (IfThenElse) -template -HWY_API V InsertLane(const V v, size_t i, T t) { - static_assert(sizeof(TFromV) == sizeof(T), "Lane size mismatch"); - const DFromV d; - const RebindToSigned di; - using TI = TFromD; - const svbool_t is_i = detail::EqN(Iota(di, 0), static_cast(i)); - return IfThenElse(RebindMask(d, is_i), - Set(d, hwy::ConvertScalarTo>(t)), v); -} - // ------------------------------ DupEven namespace detail { @@ -4929,57 +5016,30 @@ HWY_API MFromD Dup128MaskFromMaskBits(D d, unsigned mask_bits) { return TestBit(BitCast(du, bytes), bit); } -// ------------------------------ StoreMaskBits - -namespace detail { - -// For each mask lane (governing lane type T), store 1 or 
0 in BYTE lanes. -template -HWY_INLINE svuint8_t BoolFromMask(svbool_t m) { - return svdup_n_u8_z(m, 1); -} -template -HWY_INLINE svuint8_t BoolFromMask(svbool_t m) { - const ScalableTag d8; - const svuint8_t b16 = BitCast(d8, svdup_n_u16_z(m, 1)); - return detail::ConcatEvenFull(b16, b16); // lower half -} -template -HWY_INLINE svuint8_t BoolFromMask(svbool_t m) { - return U8FromU32(svdup_n_u32_z(m, 1)); -} -template -HWY_INLINE svuint8_t BoolFromMask(svbool_t m) { - const ScalableTag d32; - const svuint32_t b64 = BitCast(d32, svdup_n_u64_z(m, 1)); - return U8FromU32(detail::ConcatEvenFull(b64, b64)); // lower half -} - -// Compacts groups of 8 u8 into 8 contiguous bits in a 64-bit lane. -HWY_INLINE svuint64_t BitsFromBool(svuint8_t x) { - const ScalableTag d8; - const ScalableTag d16; - const ScalableTag d32; - const ScalableTag d64; - // TODO(janwas): could use SVE2 BDEP, but it's optional. - x = Or(x, BitCast(d8, ShiftRight<7>(BitCast(d16, x)))); - x = Or(x, BitCast(d8, ShiftRight<14>(BitCast(d32, x)))); - x = Or(x, BitCast(d8, ShiftRight<28>(BitCast(d64, x)))); - return BitCast(d64, x); -} - -} // namespace detail +// ------------------------------ StoreMaskBits (BitsFromMask) // `p` points to at least 8 writable bytes. -// TODO(janwas): specialize for HWY_SVE_256 // TODO(janwas): with SVE2.1, use PMOV to store to vector, then StoreU template HWY_API size_t StoreMaskBits(D d, svbool_t m, uint8_t* bits) { - svuint64_t bits_in_u64 = - detail::BitsFromBool(detail::BoolFromMask>(m)); +#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128 + constexpr size_t N = Lanes(d); + const uint64_t bits64 = BitsFromMask(d, m); + if constexpr (N < 8) { + // BitsFromMask guarantees upper bits are zero, hence no masking. + bits[0] = static_cast(bits64); + } else { + static_assert(N % 8 == 0, "N is pow2 >= 8, hence divisible"); + static_assert(HWY_IS_LITTLE_ENDIAN, ""); + hwy::CopyBytes(&bits64, bits); + } + constexpr size_t num_bytes = hwy::DivCeil(N, size_t{8}); + return num_bytes; +#else + svuint64_t bits_in_u64 = detail::BitsFromBool(detail::BoolFromMask(m)); const size_t num_bits = Lanes(d); - const size_t num_bytes = (num_bits + 8 - 1) / 8; // Round up, see below + const size_t num_bytes = hwy::DivCeil(num_bits, size_t{8}); // Truncate each u64 to 8 bits and store to u8. svst1b_u64(FirstN(ScalableTag(), num_bytes), bits, bits_in_u64); @@ -4993,6 +5053,7 @@ HWY_API size_t StoreMaskBits(D d, svbool_t m, uint8_t* bits) { // Else: we wrote full bytes because num_bits is a power of two >= 8. return num_bytes; +#endif // HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128 } // ------------------------------ CompressBits (LoadMaskBits) diff --git a/hwy/ops/emu128-inl.h b/hwy/ops/emu128-inl.h index 5c5ed98799..7a361de011 100644 --- a/hwy/ops/emu128-inl.h +++ b/hwy/ops/emu128-inl.h @@ -78,6 +78,10 @@ struct Vec128 { template struct Mask128 { using Raw = hwy::MakeUnsigned; + + using PrivateT = T; // only for DFromM + static constexpr size_t kPrivateN = N; // only for DFromM + static HWY_INLINE Raw FromBool(bool b) { return b ? static_cast(~Raw{0}) : 0; } @@ -89,6 +93,9 @@ struct Mask128 { template using DFromV = Simd; +template +using DFromM = Simd; + template using TFromV = typename V::PrivateT; @@ -386,6 +393,15 @@ VFromD VecFromMask(D /* tag */, MFromD mask) { return v; } +template +uint64_t BitsFromMask(D d, MFromD mask) { + uint64_t bits = 0; + for (size_t i = 0; i < Lanes(d); ++i) { + bits |= mask.bits[i] ? 
(1ull << i) : 0; + } + return bits; +} + template HWY_API MFromD FirstN(D d, size_t n) { MFromD m; diff --git a/hwy/ops/generic_ops-inl.h b/hwy/ops/generic_ops-inl.h index 766c6c9d2e..bcbf03b695 100644 --- a/hwy/ops/generic_ops-inl.h +++ b/hwy/ops/generic_ops-inl.h @@ -5599,13 +5599,6 @@ HWY_API V CompressNot(V v, M mask) { namespace detail { -#if HWY_IDE -template -HWY_INLINE uint64_t BitsFromMask(M /* mask */) { - return 0; -} -#endif // HWY_IDE - template HWY_INLINE Vec128 IndicesForExpandFromBits(uint64_t mask_bits) { static_assert(N <= 8, "Should only be called for half-vectors"); @@ -5879,7 +5872,7 @@ template HWY_API Vec128 Expand(Vec128 v, Mask128 mask) { const DFromV d; - const uint64_t mask_bits = detail::BitsFromMask(mask); + const uint64_t mask_bits = BitsFromMask(d, mask); const Vec128 indices = detail::IndicesForExpandFromBits(mask_bits); return BitCast(d, TableLookupBytesOr0(v, indices)); @@ -5893,7 +5886,7 @@ HWY_API Vec128 Expand(Vec128 v, Mask128 mask) { const Half duh; const Vec128 vu = BitCast(du, v); - const uint64_t mask_bits = detail::BitsFromMask(mask); + const uint64_t mask_bits = BitsFromMask(d, mask); const uint64_t maskL = mask_bits & 0xFF; const uint64_t maskH = mask_bits >> 8; @@ -5925,7 +5918,7 @@ HWY_API Vec128 Expand(Vec128 v, Mask128 mask) { const RebindToUnsigned du; const Rebind du8; - const uint64_t mask_bits = detail::BitsFromMask(mask); + const uint64_t mask_bits = BitsFromMask(d, mask); // Storing as 8-bit reduces table size from 4 KiB to 2 KiB. We cannot apply // the nibble trick used below because not all indices fit within one lane. @@ -6207,7 +6200,7 @@ HWY_API Vec128 Expand(Vec128 v, Mask128 mask) { const DFromV d; const RebindToUnsigned du; - const uint64_t mask_bits = detail::BitsFromMask(mask); + const uint64_t mask_bits = BitsFromMask(d, mask); alignas(16) static constexpr uint32_t packed_array[16] = { // PrintExpand64x4Nibble - same for 32x4. @@ -7365,6 +7358,18 @@ HWY_API auto Le(V a, V b) -> decltype(a == b) { #undef HWY_GENERIC_IF_EMULATED_D +// TODO: remove once callers are updated. +// SVE and RVV do not support DFromM because their masks are loosely typed. +#if HWY_MAX_BYTES <= 64 && !HWY_TARGET_IS_SVE && HWY_TARGET != HWY_RVV +namespace detail { +template +uint64_t BitsFromMask(M m) { + const DFromM d; + return ::hwy::HWY_NAMESPACE::BitsFromMask(d, m); +} +} // namespace detail +#endif // !HWY_HAVE_SCALABLE && HWY_MAX_BYTES <= 64 + // NOLINTNEXTLINE(google-readability-namespace-comments) } // namespace HWY_NAMESPACE } // namespace hwy diff --git a/hwy/ops/ppc_vsx-inl.h b/hwy/ops/ppc_vsx-inl.h index d216c54853..3c285c863a 100644 --- a/hwy/ops/ppc_vsx-inl.h +++ b/hwy/ops/ppc_vsx-inl.h @@ -5222,6 +5222,13 @@ HWY_API MFromD Dup128MaskFromMaskBits(D d, unsigned mask_bits) { namespace detail { +// Returns the lowest N of the mask bits. +template +constexpr uint64_t OnlyActive(D d, uint64_t mask_bits) { + return (d.MaxBytes() == 16) ? 
mask_bits + : mask_bits & ((1ull << d.MaxLanes()) - 1); +} + #if !HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN // fallback for missing vec_extractm template @@ -5242,31 +5249,33 @@ HWY_INLINE uint64_t ExtractSignBits(Vec128 sign_bits, #endif // !HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128 mask) { - const DFromM d; +} // namespace detail + +template +HWY_API uint64_t BitsFromMask(D d, MFromD mask) { const Repartition du8; const VFromD sign_bits = BitCast(du8, VecFromMask(d, mask)); #if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN - return static_cast(vec_extractm(sign_bits.raw)); + return detail::OnlyActive(d, + static_cast(vec_extractm(sign_bits.raw))); #else // Z14, Z15, PPC8, PPC9, or big-endian PPC10 const __vector unsigned char kBitShuffle = {120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0}; - return ExtractSignBits(sign_bits, kBitShuffle); + return detail::OnlyActive(d, detail::ExtractSignBits(sign_bits, kBitShuffle)); #endif // HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN } -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, Mask128 mask) { - const DFromM d; +template +HWY_API uint64_t BitsFromMask(D d, MFromD mask) { const RebindToUnsigned du; const Repartition du8; const VFromD sign_bits = BitCast(du8, VecFromMask(d, mask)); #if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN - return static_cast(vec_extractm(BitCast(du, sign_bits).raw)); + return detail::OnlyActive( + d, static_cast(vec_extractm(BitCast(du, sign_bits).raw))); #else // Z14, Z15, PPC8, PPC9, or big-endian PPC10 (void)du; #if HWY_IS_LITTLE_ENDIAN @@ -5276,20 +5285,20 @@ HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, Mask128 mask) { const __vector unsigned char kBitShuffle = { 128, 128, 128, 128, 128, 128, 128, 128, 112, 96, 80, 64, 48, 32, 16, 0}; #endif - return ExtractSignBits(sign_bits, kBitShuffle); + return detail::OnlyActive(d, detail::ExtractSignBits(sign_bits, kBitShuffle)); #endif // HWY_PPC_HAVE_10 } -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128 mask) { - const DFromM d; +template +HWY_API uint64_t BitsFromMask(D d, MFromD mask) { const RebindToUnsigned du; const Repartition du8; const VFromD sign_bits = BitCast(du8, VecFromMask(d, mask)); #if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN - return static_cast(vec_extractm(BitCast(du, sign_bits).raw)); + return detail::OnlyActive( + d, static_cast(vec_extractm(BitCast(du, sign_bits).raw))); #else // Z14, Z15, PPC8, PPC9, or big-endian PPC10 (void)du; #if HWY_IS_LITTLE_ENDIAN @@ -5301,20 +5310,20 @@ HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128 mask) { 128, 128, 128, 128, 128, 128, 96, 64, 32, 0}; #endif - return ExtractSignBits(sign_bits, kBitShuffle); + return detail::OnlyActive(d, detail::ExtractSignBits(sign_bits, kBitShuffle)); #endif // HWY_PPC_HAVE_10 } -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128 mask) { - const DFromM d; +template +HWY_API uint64_t BitsFromMask(D d, MFromD mask) { const RebindToUnsigned du; const Repartition du8; const VFromD sign_bits = BitCast(du8, VecFromMask(d, mask)); #if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN - return static_cast(vec_extractm(BitCast(du, sign_bits).raw)); + return detail::OnlyActive( + d, static_cast(vec_extractm(BitCast(du, sign_bits).raw))); #else // Z14, Z15, PPC8, PPC9, or big-endian PPC10 (void)du; #if HWY_IS_LITTLE_ENDIAN @@ -5326,35 +5335,22 @@ HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128 mask) { 128, 128, 128, 128, 128, 
128, 128, 128, 64, 0}; #endif - return ExtractSignBits(sign_bits, kBitShuffle); + return detail::OnlyActive(d, detail::ExtractSignBits(sign_bits, kBitShuffle)); #endif // HWY_PPC_HAVE_10 } -// Returns the lowest N of the mask bits. -template -constexpr uint64_t OnlyActive(uint64_t mask_bits) { - return ((N * sizeof(T)) == 16) ? mask_bits : mask_bits & ((1ull << N) - 1); -} - -template -HWY_INLINE uint64_t BitsFromMask(Mask128 mask) { - return OnlyActive(BitsFromMask(hwy::SizeTag(), mask)); -} - -} // namespace detail - // `p` points to at least 8 writable bytes. template -HWY_API size_t StoreMaskBits(D /*d*/, MFromD mask, uint8_t* bits) { +HWY_API size_t StoreMaskBits(D d, MFromD mask, uint8_t* bits) { // For vectors with 8 or fewer lanes, simply cast the result of BitsFromMask // to an uint8_t and store the result in bits[0]. - bits[0] = static_cast(detail::BitsFromMask(mask)); + bits[0] = static_cast(BitsFromMask(d, mask)); return sizeof(uint8_t); } template -HWY_API size_t StoreMaskBits(D /*d*/, MFromD mask, uint8_t* bits) { - const auto mask_bits = detail::BitsFromMask(mask); +HWY_API size_t StoreMaskBits(D d, MFromD mask, uint8_t* bits) { + const auto mask_bits = BitsFromMask(d, mask); // First convert mask_bits to a uint16_t as we only want to store // the lower 16 bits of mask_bits as there are 16 lanes in mask. @@ -5419,8 +5415,8 @@ HWY_API bool AllTrue(D d, MFromD mask) { } template -HWY_API size_t CountTrue(D /* tag */, MFromD mask) { - return PopCount(detail::BitsFromMask(mask)); +HWY_API size_t CountTrue(D d, MFromD mask) { + return PopCount(BitsFromMask(d, mask)); } #if HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN) @@ -5467,8 +5463,7 @@ HWY_API size_t FindKnownFirstTrue(D d, MFromD mask) { return detail::VsxCntlzLsbb(bytes) / sizeof(T); } #endif // HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN) - (void)d; - return Num0BitsBelowLS1Bit_Nonzero64(detail::BitsFromMask(mask)); + return Num0BitsBelowLS1Bit_Nonzero64(BitsFromMask(d, mask)); } template > @@ -5483,8 +5478,7 @@ HWY_API intptr_t FindFirstTrue(D d, MFromD mask) { return idx == kN ? -1 : static_cast(idx); } #endif // HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN) - (void)d; - const uint64_t mask_bits = detail::BitsFromMask(mask); + const uint64_t mask_bits = BitsFromMask(d, mask); return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero64(mask_bits)) : -1; } @@ -5499,8 +5493,7 @@ HWY_API size_t FindKnownLastTrue(D d, MFromD mask) { return 16 / sizeof(T) - 1 - idx; } #endif // HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN) - (void)d; - return 63 - Num0BitsAboveMS1Bit_Nonzero64(detail::BitsFromMask(mask)); + return 63 - Num0BitsAboveMS1Bit_Nonzero64(BitsFromMask(d, mask)); } template > @@ -5515,8 +5508,7 @@ HWY_API intptr_t FindLastTrue(D d, MFromD mask) { return idx == kN ? -1 : static_cast(kN - 1 - idx); } #endif // HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN) - (void)d; - const uint64_t mask_bits = detail::BitsFromMask(mask); + const uint64_t mask_bits = BitsFromMask(d, mask); return mask_bits ? 
intptr_t(63 - Num0BitsAboveMS1Bit_Nonzero64(mask_bits)) : -1; } @@ -6012,7 +6004,8 @@ HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { // General case, 2 or 4 bytes template HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { - return detail::CompressBits(v, detail::BitsFromMask(mask)); + const DFromV d; + return detail::CompressBits(v, BitsFromMask(d, mask)); } // ------------------------------ CompressNot @@ -6048,12 +6041,13 @@ HWY_API Vec128 CompressNot(Vec128 v, Mask128 mask) { // General case, 2 or 4 bytes template HWY_API Vec128 CompressNot(Vec128 v, Mask128 mask) { + const DFromV d; // For partial vectors, we cannot pull the Not() into the table because // BitsFromMask clears the upper bits. if (N < 16 / sizeof(T)) { - return detail::CompressBits(v, detail::BitsFromMask(Not(mask))); + return detail::CompressBits(v, BitsFromMask(d, Not(mask))); } - return detail::CompressNotBits(v, detail::BitsFromMask(mask)); + return detail::CompressNotBits(v, BitsFromMask(d, mask)); } // ------------------------------ CompressBlocksNot @@ -6103,7 +6097,7 @@ HWY_API size_t CompressStore(VFromD v, MFromD m, D d, TFromD* HWY_RESTRICT unaligned) { const RebindToUnsigned du; - const uint64_t mask_bits = detail::BitsFromMask(m); + const uint64_t mask_bits = BitsFromMask(d, m); HWY_DASSERT(mask_bits < (1ull << MaxLanes(d))); const size_t count = PopCount(mask_bits); @@ -6130,7 +6124,7 @@ HWY_API size_t CompressBlendedStore(VFromD v, MFromD m, D d, TFromD* HWY_RESTRICT unaligned) { const RebindToUnsigned du; - const uint64_t mask_bits = detail::BitsFromMask(m); + const uint64_t mask_bits = BitsFromMask(d, m); HWY_DASSERT(mask_bits < (1ull << MaxLanes(d))); const size_t count = PopCount(mask_bits); diff --git a/hwy/ops/scalar-inl.h b/hwy/ops/scalar-inl.h index a64faf9106..f7c56c1a7e 100644 --- a/hwy/ops/scalar-inl.h +++ b/hwy/ops/scalar-inl.h @@ -72,10 +72,12 @@ struct Vec1 { // 0 or FF..FF, same size as Vec1. template -class Mask1 { +struct Mask1 { using Raw = hwy::MakeUnsigned; - public: + using PrivateT = T; // only for DFromM + static constexpr size_t kPrivateN = 1; // only for DFromM + static HWY_INLINE Mask1 FromBool(bool b) { Mask1 mask; mask.bits = b ? static_cast(~Raw{0}) : 0; @@ -88,6 +90,9 @@ class Mask1 { template using DFromV = Simd; +template +using DFromM = Simd; + template using TFromV = typename V::PrivateT; @@ -288,13 +293,6 @@ HWY_API Mask1 MaskFromVec(const Vec1 v) { template using MFromD = decltype(MaskFromVec(VFromD())); -template -Vec1 VecFromMask(const Mask1 mask) { - Vec1 v; - CopySameSize(&mask, &v); - return v; -} - template > Vec1 VecFromMask(D /* tag */, const Mask1 mask) { Vec1 v; @@ -302,6 +300,11 @@ Vec1 VecFromMask(D /* tag */, const Mask1 mask) { return v; } +template +uint64_t BitsFromMask(D, MFromD mask) { + return mask.bits ? 1 : 0; +} + template > HWY_API Mask1 FirstN(D /*tag*/, size_t n) { return Mask1::FromBool(n != 0); diff --git a/hwy/ops/wasm_128-inl.h b/hwy/ops/wasm_128-inl.h index 39471d5239..d34a90dbc9 100644 --- a/hwy/ops/wasm_128-inl.h +++ b/hwy/ops/wasm_128-inl.h @@ -4946,76 +4946,74 @@ HWY_API MFromD Dup128MaskFromMaskBits(D d, unsigned mask_bits) { namespace detail { -// Full -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, - const Mask128 mask) { +// Returns the lowest N bits for the BitsFromMask result. +template +constexpr uint64_t OnlyActive(D d, uint64_t bits) { + return (d.MaxBytes() == 16) ? 
bits : bits & ((1ull << d.MaxLanes()) - 1); +} + +} // namespace detail + +template +HWY_API uint64_t BitsFromMask(D /*d*/, const MFromD mask) { alignas(16) uint64_t lanes[2]; wasm_v128_store(lanes, mask.raw); constexpr uint64_t kMagic = 0x103070F1F3F80ULL; const uint64_t lo = ((lanes[0] * kMagic) >> 56); const uint64_t hi = ((lanes[1] * kMagic) >> 48) & 0xFF00; - return (hi + lo); + return hi + lo; // exactly 16 bits, no OnlyActive required } -// 64-bit -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, - const Mask128 mask) { +template +HWY_API uint64_t BitsFromMask(D /*d*/, const MFromD mask) { constexpr uint64_t kMagic = 0x103070F1F3F80ULL; - return (static_cast(wasm_i64x2_extract_lane(mask.raw, 0)) * - kMagic) >> - 56; + const uint64_t bytes = + static_cast(wasm_i64x2_extract_lane(mask.raw, 0)); + return (bytes * kMagic) >> 56; // exactly 8 bits, no OnlyActive required } // 32-bit or less: need masking -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, - const Mask128 mask) { +template +HWY_API uint64_t BitsFromMask(D d, const MFromD mask) { uint64_t bytes = static_cast(wasm_i64x2_extract_lane(mask.raw, 0)); // Clear potentially undefined bytes. - bytes &= (1ULL << (N * 8)) - 1; + bytes &= (1ULL << (Lanes(d) * 8)) - 1; constexpr uint64_t kMagic = 0x103070F1F3F80ULL; - return (bytes * kMagic) >> 56; + return detail::OnlyActive(d, (bytes * kMagic) >> 56); } -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, - const Mask128 mask) { +template +HWY_API uint64_t BitsFromMask(D d, const MFromD mask) { // Remove useless lower half of each u16 while preserving the sign bit. + const Rebind d8; + using M8 = MFromD; const __i16x8 zero = wasm_i16x8_splat(0); - const Mask128 mask8{wasm_i8x16_narrow_i16x8(mask.raw, zero)}; - return BitsFromMask(hwy::SizeTag<1>(), mask8); + const M8 mask8{wasm_i8x16_narrow_i16x8(mask.raw, zero)}; + return detail::OnlyActive(d8, BitsFromMask(d8, mask8)); } -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, - const Mask128 mask) { +template +HWY_API uint64_t BitsFromMask(D d, const MFromD mask) { const __i32x4 mask_i = static_cast<__i32x4>(mask.raw); const __i32x4 slice = wasm_i32x4_make(1, 2, 4, 8); const __i32x4 sliced_mask = wasm_v128_and(mask_i, slice); alignas(16) uint32_t lanes[4]; wasm_v128_store(lanes, sliced_mask); - return lanes[0] | lanes[1] | lanes[2] | lanes[3]; + return detail::OnlyActive(d, lanes[0] | lanes[1] | lanes[2] | lanes[3]); } -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, - const Mask128 mask) { +template +HWY_API uint64_t BitsFromMask(D d, const MFromD mask) { const __i64x2 mask_i = static_cast<__i64x2>(mask.raw); const __i64x2 slice = wasm_i64x2_make(1, 2); const __i64x2 sliced_mask = wasm_v128_and(mask_i, slice); alignas(16) uint64_t lanes[2]; wasm_v128_store(lanes, sliced_mask); - return lanes[0] | lanes[1]; + return detail::OnlyActive(d, lanes[0] | lanes[1]); } -// Returns the lowest N bits for the BitsFromMask result. -template -constexpr uint64_t OnlyActive(uint64_t bits) { - return ((N * sizeof(T)) == 16) ? bits : bits & ((1ull << N) - 1); -} +namespace detail { // Returns 0xFF for bytes with index >= N, otherwise 0. 
template @@ -5047,53 +5045,40 @@ constexpr __i8x16 BytesAbove() { : wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1); } -template -HWY_INLINE uint64_t BitsFromMask(const Mask128 mask) { - return OnlyActive(BitsFromMask(hwy::SizeTag(), mask)); -} +} // namespace detail -template -HWY_INLINE size_t CountTrue(hwy::SizeTag<1> tag, const Mask128 m) { - return PopCount(BitsFromMask(tag, m)); +// `p` points to at least 8 writable bytes. +template +HWY_API size_t StoreMaskBits(D d, const MFromD mask, uint8_t* bits) { + const uint64_t mask_bits = BitsFromMask(d, mask); + const size_t kNumBytes = (d.MaxLanes() + 7) / 8; + CopyBytes(&mask_bits, bits); + return kNumBytes; } -template -HWY_INLINE size_t CountTrue(hwy::SizeTag<2> tag, const Mask128 m) { - return PopCount(BitsFromMask(tag, m)); +template +HWY_API size_t CountTrue(D d, const MFromD m) { + return PopCount(BitsFromMask(d, m)); } - -template -HWY_INLINE size_t CountTrue(hwy::SizeTag<4> /*tag*/, const Mask128 m) { +template +HWY_API size_t CountTrue(D d, const MFromD m) { + return PopCount(BitsFromMask(d, m)); +} +template +HWY_API size_t CountTrue(D d, const MFromD m) { const __i32x4 var_shift = wasm_i32x4_make(1, 2, 4, 8); const __i32x4 shifted_bits = wasm_v128_and(m.raw, var_shift); alignas(16) uint64_t lanes[2]; wasm_v128_store(lanes, shifted_bits); return PopCount(lanes[0] | lanes[1]); } - -template -HWY_INLINE size_t CountTrue(hwy::SizeTag<8> /*tag*/, const Mask128 m) { +template +HWY_API size_t CountTrue(D d, const MFromD m) { alignas(16) int64_t lanes[2]; wasm_v128_store(lanes, m.raw); return static_cast(-(lanes[0] + lanes[1])); } -} // namespace detail - -// `p` points to at least 8 writable bytes. -template -HWY_API size_t StoreMaskBits(D d, const MFromD mask, uint8_t* bits) { - const uint64_t mask_bits = detail::BitsFromMask(mask); - const size_t kNumBytes = (d.MaxLanes() + 7) / 8; - CopyBytes(&mask_bits, bits); - return kNumBytes; -} - -template -HWY_API size_t CountTrue(D /* tag */, const MFromD m) { - return detail::CountTrue(hwy::SizeTag)>(), m); -} - // Partial template , HWY_IF_V_SIZE_LE_D(D, 8)> HWY_API size_t CountTrue(D d, MFromD m) { @@ -5153,26 +5138,26 @@ HWY_API bool AllTrue(D d, const MFromD m) { } template -HWY_API size_t FindKnownFirstTrue(D /* tag */, const MFromD mask) { - const uint32_t bits = static_cast(detail::BitsFromMask(mask)); +HWY_API size_t FindKnownFirstTrue(D d, const MFromD mask) { + const uint32_t bits = static_cast(BitsFromMask(d, mask)); return Num0BitsBelowLS1Bit_Nonzero32(bits); } template -HWY_API intptr_t FindFirstTrue(D /* tag */, const MFromD mask) { - const uint32_t bits = static_cast(detail::BitsFromMask(mask)); +HWY_API intptr_t FindFirstTrue(D d, const MFromD mask) { + const uint32_t bits = static_cast(BitsFromMask(d, mask)); return bits ? static_cast(Num0BitsBelowLS1Bit_Nonzero32(bits)) : -1; } template -HWY_API size_t FindKnownLastTrue(D /* tag */, const MFromD mask) { - const uint32_t bits = static_cast(detail::BitsFromMask(mask)); +HWY_API size_t FindKnownLastTrue(D d, const MFromD mask) { + const uint32_t bits = static_cast(BitsFromMask(d, mask)); return 31 - Num0BitsAboveMS1Bit_Nonzero32(bits); } template -HWY_API intptr_t FindLastTrue(D /* tag */, const MFromD mask) { - const uint32_t bits = static_cast(detail::BitsFromMask(mask)); +HWY_API intptr_t FindLastTrue(D d, const MFromD mask) { + const uint32_t bits = static_cast(BitsFromMask(d, mask)); return bits ? 
(31 - static_cast(Num0BitsAboveMS1Bit_Nonzero32(bits))) : -1; @@ -5618,7 +5603,8 @@ HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { // General case, 2 or 4 byte lanes template HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { - return detail::Compress(v, detail::BitsFromMask(mask)); + const DFromV d; + return detail::Compress(v, BitsFromMask(d, mask)); } // Single lane: no-op @@ -5642,12 +5628,13 @@ HWY_API Vec128 CompressNot(Vec128 v, Mask128 mask) { // General case, 2 or 4 byte lanes template HWY_API Vec128 CompressNot(Vec128 v, Mask128 mask) { + const DFromV d; // For partial vectors, we cannot pull the Not() into the table because // BitsFromMask clears the upper bits. if (N < 16 / sizeof(T)) { - return detail::Compress(v, detail::BitsFromMask(Not(mask))); + return detail::Compress(v, BitsFromMask(d, Not(mask))); } - return detail::CompressNot(v, detail::BitsFromMask(mask)); + return detail::CompressNot(v, BitsFromMask(d, mask)); } // ------------------------------ CompressBlocksNot @@ -5674,7 +5661,7 @@ HWY_API Vec128 CompressBits(Vec128 v, template HWY_API size_t CompressStore(VFromD v, MFromD mask, D d, TFromD* HWY_RESTRICT unaligned) { - const uint64_t mask_bits = detail::BitsFromMask(mask); + const uint64_t mask_bits = BitsFromMask(d, mask); const auto c = detail::Compress(v, mask_bits); StoreU(c, d, unaligned); return PopCount(mask_bits); @@ -5685,7 +5672,7 @@ template HWY_API size_t CompressBlendedStore(VFromD v, MFromD m, D d, TFromD* HWY_RESTRICT unaligned) { const RebindToUnsigned du; // so we can support fp16/bf16 - const uint64_t mask_bits = detail::BitsFromMask(m); + const uint64_t mask_bits = BitsFromMask(d, m); const size_t count = PopCount(mask_bits); const VFromD compressed = detail::Compress(BitCast(du, v), mask_bits); diff --git a/hwy/ops/wasm_256-inl.h b/hwy/ops/wasm_256-inl.h index aab7105e36..03ef747d05 100644 --- a/hwy/ops/wasm_256-inl.h +++ b/hwy/ops/wasm_256-inl.h @@ -62,6 +62,9 @@ class Vec256 { template struct Mask256 { + using PrivateT = T; // only for DFromM + static constexpr size_t kPrivateN = 32 / sizeof(T); // only for DFromM + Mask128 m0; Mask128 m1; }; @@ -657,6 +660,14 @@ HWY_API Vec256 VecFromMask(D d, Mask256 m) { return v; } +template +HWY_API uint64_t BitsFromMask(D d, MFromD m) { + const Half dh; + const uint64_t lo = BitsFromMask(dh, m.m0); + const uint64_t hi = BitsFromMask(dh, m.m1); + return (hi << Lanes(dh)) | lo; +} + // mask ? 
yes : no template HWY_API Vec256 IfThenElse(Mask256 mask, Vec256 yes, Vec256 no) { diff --git a/hwy/ops/x86_128-inl.h b/hwy/ops/x86_128-inl.h index dbe41cedce..ee40181594 100644 --- a/hwy/ops/x86_128-inl.h +++ b/hwy/ops/x86_128-inl.h @@ -138,78 +138,66 @@ using Vec32 = Vec128; template using Vec16 = Vec128; -#if HWY_TARGET <= HWY_AVX3 - namespace detail { +#if HWY_TARGET <= HWY_AVX3 + // Template arg: sizeof(lane type) template -struct RawMask128 {}; +struct RawMask128T {}; template <> -struct RawMask128<1> { +struct RawMask128T<1> { using type = __mmask16; }; template <> -struct RawMask128<2> { +struct RawMask128T<2> { using type = __mmask8; }; template <> -struct RawMask128<4> { +struct RawMask128T<4> { using type = __mmask8; }; template <> -struct RawMask128<8> { +struct RawMask128T<8> { using type = __mmask8; }; -} // namespace detail +template +using RawMask128 = typename RawMask128T::type; -template -struct Mask128 { - using Raw = typename detail::RawMask128::type; +#else // AVX2 or earlier - static Mask128 FromBits(uint64_t mask_bits) { - return Mask128{static_cast(mask_bits)}; - } +template +using RawMask128 = typename Raw128::type; - Raw raw; -}; +#endif // HWY_TARGET <= HWY_AVX3 -#else // AVX2 or below +} // namespace detail -// FF..FF or 0. template struct Mask128 { - typename detail::Raw128::type raw; -}; - -#endif // AVX2 or below + using Raw = typename detail::RawMask128; -namespace detail { - -// Returns the lowest N of the _mm_movemask* bits. -template -constexpr uint64_t OnlyActive(uint64_t mask_bits) { - return ((N * sizeof(T)) == 16) ? mask_bits : mask_bits & ((1ull << N) - 1); -} - -} // namespace detail + using PrivateT = T; // only for DFromM + static constexpr size_t kPrivateN = N; // only for DFromM #if HWY_TARGET <= HWY_AVX3 -namespace detail { - -// Used by Expand() emulation, which is required for both AVX3 and AVX2. -template -HWY_INLINE uint64_t BitsFromMask(const Mask128 mask) { - return OnlyActive(mask.raw); -} + static Mask128 FromBits(uint64_t mask_bits) { + return Mask128{static_cast(mask_bits)}; + } +#else +// Lanes are either FF..FF or 0. +#endif -} // namespace detail -#endif // HWY_TARGET <= HWY_AVX3 + Raw raw; +}; template using DFromV = Simd; +template +using DFromM = Simd; + template using TFromV = typename V::PrivateT; @@ -12446,8 +12434,27 @@ struct CompressIsPartition { #endif }; +namespace detail { + +// Returns `mask_bits` (from movemask) with the upper bits cleared, if there +// are 8 or fewer valid bits. +template +constexpr uint64_t OnlyActive(D d, uint64_t mask_bits) { + return (d.MaxBytes() >= 16) ? mask_bits + : mask_bits & ((1ull << d.MaxLanes()) - 1); +} + +} // namespace detail + #if HWY_TARGET <= HWY_AVX3 +// ------------------------------ BitsFromMask (MFromD, OnlyActive) +// Generic for all vector lengths. +template +HWY_INLINE uint64_t BitsFromMask(D d, MFromD mask) { + return detail::OnlyActive(d, mask.raw); +} + // ------------------------------ StoreMaskBits // `p` points to at least 8 writable bytes. 
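For context, a minimal usage sketch of the D-tagged BitsFromMask overloads added in this patch. Illustrative only: ForEachNegativeLane is a hypothetical helper, it assumes static dispatch to the baseline target, and the op is only defined on targets with bounded vectors (the generic_ops change notes BitsFromMask is required when HWY_MAX_BYTES <= 64).

    #include <stddef.h>
    #include <stdint.h>

    #include "hwy/highway.h"

    namespace hn = hwy::HWY_NAMESPACE;

    // Calls func(i) for every lane index i where v[i] < 0 (signed/float lanes).
    template <class D, class V, class Func>
    void ForEachNegativeLane(D d, V v, const Func& func) {
      // Bit i of `bits` corresponds to lane i; bits at or above Lanes(d) are 0.
      uint64_t bits = hn::BitsFromMask(d, hn::Lt(v, hn::Zero(d)));
      while (bits != 0) {
        func(hwy::Num0BitsBelowLS1Bit_Nonzero64(bits));  // lowest set lane index
        bits &= bits - 1;  // clear the lowest set bit
      }
    }

On AVX3 this boils down to reading the mask register (the OnlyActive call above), whereas the emu128 and scalar definitions in this patch fall back to a per-lane loop.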
@@ -12620,7 +12627,7 @@ HWY_API size_t CompressBlendedStore(VFromD v, MFromD m, D d, #else // AVX2 or below -// ------------------------------ StoreMaskBits +// ------------------------------ BitsFromMask namespace detail { @@ -12628,50 +12635,45 @@ constexpr HWY_INLINE uint64_t U64FromInt(int mask_bits) { return static_cast(static_cast(mask_bits)); } -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, - const Mask128 mask) { - const Simd d; +} // namespace detail + +template +HWY_API uint64_t BitsFromMask(D d, MFromD mask) { const auto sign_bits = BitCast(d, VecFromMask(d, mask)).raw; - return U64FromInt(_mm_movemask_epi8(sign_bits)); + return detail::OnlyActive(d, + detail::U64FromInt(_mm_movemask_epi8(sign_bits))); } -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, - const Mask128 mask) { +template +HWY_API uint64_t BitsFromMask(D d, MFromD mask) { // Remove useless lower half of each u16 while preserving the sign bit. const auto sign_bits = _mm_packs_epi16(mask.raw, _mm_setzero_si128()); - return U64FromInt(_mm_movemask_epi8(sign_bits)); + return detail::OnlyActive(d, + detail::U64FromInt(_mm_movemask_epi8(sign_bits))); } -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128 mask) { - const Simd d; - const Simd df; +template +HWY_API uint64_t BitsFromMask(D d, MFromD mask) { + const RebindToFloat df; const auto sign_bits = BitCast(df, VecFromMask(d, mask)); - return U64FromInt(_mm_movemask_ps(sign_bits.raw)); + return detail::OnlyActive(d, + detail::U64FromInt(_mm_movemask_ps(sign_bits.raw))); } -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128 mask) { - const Simd d; - const Simd df; +template +HWY_API uint64_t BitsFromMask(D d, MFromD mask) { + const RebindToFloat df; const auto sign_bits = BitCast(df, VecFromMask(d, mask)); - return U64FromInt(_mm_movemask_pd(sign_bits.raw)); + return detail::OnlyActive(d, + detail::U64FromInt(_mm_movemask_pd(sign_bits.raw))); } -template -HWY_INLINE uint64_t BitsFromMask(const Mask128 mask) { - return OnlyActive(BitsFromMask(hwy::SizeTag(), mask)); -} - -} // namespace detail - +// ------------------------------ StoreMaskBits // `p` points to at least 8 writable bytes. template HWY_API size_t StoreMaskBits(D d, MFromD mask, uint8_t* bits) { constexpr size_t kNumBytes = (MaxLanes(d) + 7) / 8; - const uint64_t mask_bits = detail::BitsFromMask(mask); + const uint64_t mask_bits = BitsFromMask(d, mask); CopyBytes(&mask_bits, bits); return kNumBytes; } @@ -12679,43 +12681,43 @@ HWY_API size_t StoreMaskBits(D d, MFromD mask, uint8_t* bits) { // ------------------------------ Mask testing template -HWY_API bool AllFalse(D /* tag */, MFromD mask) { +HWY_API bool AllFalse(D d, MFromD mask) { // Cheaper than PTEST, which is 2 uop / 3L. 
- return detail::BitsFromMask(mask) == 0; + return BitsFromMask(d, mask) == 0; } template HWY_API bool AllTrue(D d, MFromD mask) { constexpr uint64_t kAllBits = (1ull << MaxLanes(d)) - 1; - return detail::BitsFromMask(mask) == kAllBits; + return BitsFromMask(d, mask) == kAllBits; } template -HWY_API size_t CountTrue(D /* tag */, MFromD mask) { - return PopCount(detail::BitsFromMask(mask)); +HWY_API size_t CountTrue(D d, MFromD mask) { + return PopCount(BitsFromMask(d, mask)); } template -HWY_API size_t FindKnownFirstTrue(D /* tag */, MFromD mask) { +HWY_API size_t FindKnownFirstTrue(D d, MFromD mask) { return Num0BitsBelowLS1Bit_Nonzero32( - static_cast(detail::BitsFromMask(mask))); + static_cast(BitsFromMask(d, mask))); } template -HWY_API intptr_t FindFirstTrue(D /* tag */, MFromD mask) { - const uint32_t mask_bits = static_cast(detail::BitsFromMask(mask)); +HWY_API intptr_t FindFirstTrue(D d, MFromD mask) { + const uint32_t mask_bits = static_cast(BitsFromMask(d, mask)); return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask_bits)) : -1; } template -HWY_API size_t FindKnownLastTrue(D /* tag */, MFromD mask) { +HWY_API size_t FindKnownLastTrue(D d, MFromD mask) { return 31 - Num0BitsAboveMS1Bit_Nonzero32( - static_cast(detail::BitsFromMask(mask))); + static_cast(BitsFromMask(d, mask))); } template -HWY_API intptr_t FindLastTrue(D /* tag */, MFromD mask) { - const uint32_t mask_bits = static_cast(detail::BitsFromMask(mask)); +HWY_API intptr_t FindLastTrue(D d, MFromD mask) { + const uint32_t mask_bits = static_cast(BitsFromMask(d, mask)); return mask_bits ? intptr_t(31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits)) : -1; } @@ -13157,7 +13159,8 @@ HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { // General case, 2 or 4 bytes template HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { - return detail::CompressBits(v, detail::BitsFromMask(mask)); + const DFromV d; + return detail::CompressBits(v, BitsFromMask(d, mask)); } // ------------------------------ CompressNot @@ -13182,12 +13185,13 @@ HWY_API Vec128 CompressNot(Vec128 v, Mask128 mask) { template HWY_API Vec128 CompressNot(Vec128 v, Mask128 mask) { + const DFromV d; // For partial vectors, we cannot pull the Not() into the table because // BitsFromMask clears the upper bits. 
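// CompressNotBits keeps lanes whose bit is 0, so the upper bits that
// BitsFromMask cleared for a partial vector would wrongly be treated as
// selected lanes. Taking Not(mask) first (and letting BitsFromMask clear the
// upper bits again) lets the regular CompressBits table handle this case.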
if (N < 16 / sizeof(T)) { - return detail::CompressBits(v, detail::BitsFromMask(Not(mask))); + return detail::CompressBits(v, BitsFromMask(d, Not(mask))); } - return detail::CompressNotBits(v, detail::BitsFromMask(mask)); + return detail::CompressNotBits(v, BitsFromMask(d, mask)); } // ------------------------------ CompressBlocksNot @@ -13216,7 +13220,7 @@ HWY_API size_t CompressStore(VFromD v, MFromD m, D d, TFromD* HWY_RESTRICT unaligned) { const RebindToUnsigned du; - const uint64_t mask_bits = detail::BitsFromMask(m); + const uint64_t mask_bits = BitsFromMask(d, m); HWY_DASSERT(mask_bits < (1ull << MaxLanes(d))); const size_t count = PopCount(mask_bits); @@ -13233,7 +13237,7 @@ HWY_API size_t CompressBlendedStore(VFromD v, MFromD m, D d, TFromD* HWY_RESTRICT unaligned) { const RebindToUnsigned du; - const uint64_t mask_bits = detail::BitsFromMask(m); + const uint64_t mask_bits = BitsFromMask(d, m); HWY_DASSERT(mask_bits < (1ull << MaxLanes(d))); const size_t count = PopCount(mask_bits); diff --git a/hwy/ops/x86_256-inl.h b/hwy/ops/x86_256-inl.h index ebaea59de3..59e93bfe5b 100644 --- a/hwy/ops/x86_256-inl.h +++ b/hwy/ops/x86_256-inl.h @@ -147,65 +147,60 @@ class Vec256 { Raw raw; }; -#if HWY_TARGET <= HWY_AVX3 - namespace detail { +#if HWY_TARGET <= HWY_AVX3 + // Template arg: sizeof(lane type) template -struct RawMask256 {}; +struct RawMask256T {}; template <> -struct RawMask256<1> { +struct RawMask256T<1> { using type = __mmask32; }; template <> -struct RawMask256<2> { +struct RawMask256T<2> { using type = __mmask16; }; template <> -struct RawMask256<4> { +struct RawMask256T<4> { using type = __mmask8; }; template <> -struct RawMask256<8> { +struct RawMask256T<8> { using type = __mmask8; }; -} // namespace detail - template -struct Mask256 { - using Raw = typename detail::RawMask256::type; +using RawMask256 = typename RawMask256T::type; - static Mask256 FromBits(uint64_t mask_bits) { - return Mask256{static_cast(mask_bits)}; - } +#else // AVX2 or earlier - Raw raw; -}; +template +using RawMask256 = typename Raw256::type; -#else // AVX2 +#endif // HWY_TARGET <= HWY_AVX3 + +} // namespace detail -// FF..FF or 0. template struct Mask256 { - typename detail::Raw256::type raw; -}; + using Raw = typename detail::RawMask256; -#endif // AVX2 + using PrivateT = T; // only for DFromM + static constexpr size_t kPrivateN = 32 / sizeof(T); // only for DFromM #if HWY_TARGET <= HWY_AVX3 -namespace detail { - -// Used by Expand() emulation, which is required for both AVX3 and AVX2. -template -HWY_INLINE uint64_t BitsFromMask(const Mask256 mask) { - return mask.raw; -} - -} // namespace detail + static Mask256 FromBits(uint64_t mask_bits) { + return Mask256{static_cast(mask_bits)}; + } +#else +// Lanes are either FF..FF or 0. #endif // HWY_TARGET <= HWY_AVX3 + Raw raw; +}; + template using Full256 = Simd; @@ -7797,26 +7792,22 @@ HWY_API MFromD LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) { return detail::LoadMaskBits256>(mask_bits); } -// ------------------------------ StoreMaskBits - -namespace detail { +// ------------------------------ BitsFromMask -template -HWY_INLINE uint64_t BitsFromMask(const Mask256 mask) { - const Full256 d; - const Full256 d8; +template +HWY_API uint64_t BitsFromMask(D d, MFromD mask) { + const RebindToUnsigned d8; const auto sign_bits = BitCast(d8, VecFromMask(d, mask)).raw; // Prevent sign-extension of 32-bit masks because the intrinsic returns int. 
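// The cast below goes through a 32-bit unsigned value before widening to
// uint64_t: _mm256_movemask_epi8 returns int, and with all 32 sign bits set
// the result is negative, so casting it directly to uint64_t would smear ones
// into the upper 32 bits.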
return static_cast(_mm256_movemask_epi8(sign_bits)); } -template -HWY_INLINE uint64_t BitsFromMask(const Mask256 mask) { +template +HWY_API uint64_t BitsFromMask(D d, MFromD mask) { #if !defined(HWY_DISABLE_BMI2_FMA) && !defined(HWY_DISABLE_PEXT_ON_AVX2) - const Full256 d; - const Full256 d8; + const Repartition d8; const Mask256 mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask))); - const uint64_t sign_bits8 = BitsFromMask(mask8); + const uint64_t sign_bits8 = BitsFromMask(d8, mask8); // Skip the bits from the lower byte of each u16 (better not to use the // same packs_epi16 as SSE4, because that requires an extra swizzle here). return _pext_u32(static_cast(sign_bits8), 0xAAAAAAAAu); @@ -7832,31 +7823,28 @@ HWY_INLINE uint64_t BitsFromMask(const Mask256 mask) { #endif // HWY_ARCH_X86_64 } -template -HWY_INLINE uint64_t BitsFromMask(const Mask256 mask) { - const Full256 d; - const Full256 df; +template +HWY_API uint64_t BitsFromMask(D d, MFromD mask) { + const RebindToFloat df; const auto sign_bits = BitCast(df, VecFromMask(d, mask)).raw; return static_cast(_mm256_movemask_ps(sign_bits)); } -template -HWY_INLINE uint64_t BitsFromMask(const Mask256 mask) { - const Full256 d; - const Full256 df; +template +HWY_API uint64_t BitsFromMask(D d, MFromD mask) { + const RebindToFloat df; const auto sign_bits = BitCast(df, VecFromMask(d, mask)).raw; return static_cast(_mm256_movemask_pd(sign_bits)); } -} // namespace detail - +// ------------------------------ StoreMaskBits // `p` points to at least 8 writable bytes. template HWY_API size_t StoreMaskBits(D d, MFromD mask, uint8_t* bits) { constexpr size_t N = Lanes(d); constexpr size_t kNumBytes = (N + 7) / 8; - const uint64_t mask_bits = detail::BitsFromMask(mask); + const uint64_t mask_bits = BitsFromMask(d, mask); CopyBytes(&mask_bits, bits); return kNumBytes; } @@ -7869,59 +7857,59 @@ template HWY_API bool AllFalse(D d, MFromD mask) { const Repartition d8; const Mask256 mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask))); - return detail::BitsFromMask(mask8) == 0; + return BitsFromMask(d8, mask8) == 0; } template -HWY_API bool AllFalse(D /* tag */, MFromD mask) { +HWY_API bool AllFalse(D d, MFromD mask) { // Cheaper than PTEST, which is 2 uop / 3L. 
- return detail::BitsFromMask(mask) == 0; + return BitsFromMask(d, mask) == 0; } template HWY_API bool AllTrue(D d, MFromD mask) { const Repartition d8; const Mask256 mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask))); - return detail::BitsFromMask(mask8) == (1ull << 32) - 1; + return BitsFromMask(d8, mask8) == (1ull << 32) - 1; } template HWY_API bool AllTrue(D d, MFromD mask) { constexpr uint64_t kAllBits = (1ull << Lanes(d)) - 1; - return detail::BitsFromMask(mask) == kAllBits; + return BitsFromMask(d, mask) == kAllBits; } template HWY_API size_t CountTrue(D d, MFromD mask) { const Repartition d8; const Mask256 mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask))); - return PopCount(detail::BitsFromMask(mask8)) >> 1; + return PopCount(BitsFromMask(d8, mask8)) >> 1; } template -HWY_API size_t CountTrue(D /* tag */, MFromD mask) { - return PopCount(detail::BitsFromMask(mask)); +HWY_API size_t CountTrue(D d, MFromD mask) { + return PopCount(BitsFromMask(d, mask)); } template -HWY_API size_t FindKnownFirstTrue(D /* tag */, MFromD mask) { - const uint32_t mask_bits = static_cast(detail::BitsFromMask(mask)); +HWY_API size_t FindKnownFirstTrue(D d, MFromD mask) { + const uint32_t mask_bits = static_cast(BitsFromMask(d, mask)); return Num0BitsBelowLS1Bit_Nonzero32(mask_bits); } template -HWY_API intptr_t FindFirstTrue(D /* tag */, MFromD mask) { - const uint32_t mask_bits = static_cast(detail::BitsFromMask(mask)); +HWY_API intptr_t FindFirstTrue(D d, MFromD mask) { + const uint32_t mask_bits = static_cast(BitsFromMask(d, mask)); return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask_bits)) : -1; } template -HWY_API size_t FindKnownLastTrue(D /* tag */, MFromD mask) { - const uint32_t mask_bits = static_cast(detail::BitsFromMask(mask)); +HWY_API size_t FindKnownLastTrue(D d, MFromD mask) { + const uint32_t mask_bits = static_cast(BitsFromMask(d, mask)); return 31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits); } template -HWY_API intptr_t FindLastTrue(D /* tag */, MFromD mask) { - const uint32_t mask_bits = static_cast(detail::BitsFromMask(mask)); +HWY_API intptr_t FindLastTrue(D d, MFromD mask) { + const uint32_t mask_bits = static_cast(BitsFromMask(d, mask)); return mask_bits ? 
intptr_t(31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits)) : -1; } @@ -8174,12 +8162,14 @@ HWY_INLINE Vec256 CompressNot(Vec256 v, const uint64_t mask_bits) { template HWY_API Vec256 Compress(Vec256 v, Mask256 m) { - return detail::Compress(v, detail::BitsFromMask(m)); + const DFromV d; + return detail::Compress(v, BitsFromMask(d, m)); } template HWY_API Vec256 CompressNot(Vec256 v, Mask256 m) { - return detail::CompressNot(v, detail::BitsFromMask(m)); + const DFromV d; + return detail::CompressNot(v, BitsFromMask(d, m)); } HWY_API Vec256 CompressBlocksNot(Vec256 v, @@ -8207,7 +8197,7 @@ HWY_API Vec256 CompressBits(Vec256 v, const uint8_t* HWY_RESTRICT bits) { template HWY_API size_t CompressStore(VFromD v, MFromD m, D d, TFromD* HWY_RESTRICT unaligned) { - const uint64_t mask_bits = detail::BitsFromMask(m); + const uint64_t mask_bits = BitsFromMask(d, m); const size_t count = PopCount(mask_bits); StoreU(detail::Compress(v, mask_bits), d, unaligned); detail::MaybeUnpoison(unaligned, count); @@ -8218,7 +8208,7 @@ template HWY_API size_t CompressBlendedStore(VFromD v, MFromD m, D d, TFromD* HWY_RESTRICT unaligned) { - const uint64_t mask_bits = detail::BitsFromMask(m); + const uint64_t mask_bits = BitsFromMask(d, m); const size_t count = PopCount(mask_bits); const RebindToUnsigned du; @@ -8245,7 +8235,7 @@ HWY_API size_t CompressBlendedStore(VFromD v, MFromD m, D d, template HWY_API size_t CompressBlendedStore(VFromD v, MFromD m, D d, TFromD* HWY_RESTRICT unaligned) { - const uint64_t mask_bits = detail::BitsFromMask(m); + const uint64_t mask_bits = BitsFromMask(d, m); const size_t count = PopCount(mask_bits); const VFromD compressed = detail::Compress(v, mask_bits); @@ -8362,7 +8352,7 @@ HWY_API Vec256 Expand(Vec256 v, Mask256 mask) { // LUTs are infeasible for so many mask combinations, so Combine two // half-vector Expand. const Half dh; - const uint64_t mask_bits = detail::BitsFromMask(mask); + const uint64_t mask_bits = BitsFromMask(d, mask); constexpr size_t N = 32 / sizeof(T); const size_t countL = PopCount(mask_bits & ((1 << (N / 2)) - 1)); const Mask128 maskL = MaskFromVec(LowerHalf(VecFromMask(d, mask))); @@ -8416,7 +8406,7 @@ HWY_API Vec256 Expand(Vec256 v, Mask256 mask) { return BitCast(d, detail::NativeExpand(BitCast(du, v), mu)); #else const RebindToUnsigned du; - const uint64_t mask_bits = detail::BitsFromMask(mask); + const uint64_t mask_bits = BitsFromMask(d, mask); alignas(16) constexpr uint32_t packed_array[256] = { // PrintExpand32x8Nibble. @@ -8485,7 +8475,7 @@ HWY_API Vec256 Expand(Vec256 v, Mask256 mask) { return BitCast(d, detail::NativeExpand(BitCast(du, v), mu)); #else const RebindToUnsigned du; - const uint64_t mask_bits = detail::BitsFromMask(mask); + const uint64_t mask_bits = BitsFromMask(d, mask); alignas(16) constexpr uint64_t packed_array[16] = { // PrintExpand64x4Nibble. 
diff --git a/hwy/ops/x86_512-inl.h b/hwy/ops/x86_512-inl.h index bcb930cac9..3f366aebe1 100644 --- a/hwy/ops/x86_512-inl.h +++ b/hwy/ops/x86_512-inl.h @@ -172,6 +172,10 @@ class Vec512 { template struct Mask512 { using Raw = typename detail::RawMask512::type; + + using PrivateT = T; // only for DFromM + static constexpr size_t kPrivateN = 64 / sizeof(T); // only for DFromM + Raw raw; }; @@ -2560,74 +2564,54 @@ HWY_API Mask512 operator<=(Vec512 a, Vec512 b) { // ------------------------------ Mask -namespace detail { - -template -HWY_INLINE Mask512 MaskFromVec(hwy::SizeTag<1> /*tag*/, Vec512 v) { +template +HWY_API Mask512 MaskFromVec(Vec512 v) { return Mask512{_mm512_movepi8_mask(v.raw)}; } -template -HWY_INLINE Mask512 MaskFromVec(hwy::SizeTag<2> /*tag*/, Vec512 v) { +template +HWY_API Mask512 MaskFromVec(Vec512 v) { return Mask512{_mm512_movepi16_mask(v.raw)}; } -template -HWY_INLINE Mask512 MaskFromVec(hwy::SizeTag<4> /*tag*/, Vec512 v) { +template +HWY_API Mask512 MaskFromVec(Vec512 v) { return Mask512{_mm512_movepi32_mask(v.raw)}; } -template -HWY_INLINE Mask512 MaskFromVec(hwy::SizeTag<8> /*tag*/, Vec512 v) { - return Mask512{_mm512_movepi64_mask(v.raw)}; -} - -} // namespace detail - -template +template HWY_API Mask512 MaskFromVec(Vec512 v) { - return detail::MaskFromVec(hwy::SizeTag(), v); + return Mask512{_mm512_movepi64_mask(v.raw)}; } -template +template HWY_API Mask512 MaskFromVec(Vec512 v) { const RebindToSigned> di; return Mask512{MaskFromVec(BitCast(di, v)).raw}; } -HWY_API Vec512 VecFromMask(Mask512 v) { - return Vec512{_mm512_movm_epi8(v.raw)}; -} -HWY_API Vec512 VecFromMask(Mask512 v) { - return Vec512{_mm512_movm_epi8(v.raw)}; -} - -HWY_API Vec512 VecFromMask(Mask512 v) { - return Vec512{_mm512_movm_epi16(v.raw)}; +template +HWY_API Vec512 VecFromMask(Mask512 m) { + return Vec512{_mm512_movm_epi8(m.raw)}; } -HWY_API Vec512 VecFromMask(Mask512 v) { - return Vec512{_mm512_movm_epi16(v.raw)}; +template +HWY_API Vec512 VecFromMask(Mask512 m) { + return Vec512{_mm512_movm_epi16(m.raw)}; } #if HWY_HAVE_FLOAT16 -HWY_API Vec512 VecFromMask(Mask512 v) { - return Vec512{_mm512_castsi512_ph(_mm512_movm_epi16(v.raw))}; +HWY_API Vec512 VecFromMask(Mask512 m) { + return Vec512{_mm512_castsi512_ph(_mm512_movm_epi16(m.raw))}; } #endif // HWY_HAVE_FLOAT16 - -HWY_API Vec512 VecFromMask(Mask512 v) { - return Vec512{_mm512_movm_epi32(v.raw)}; -} -HWY_API Vec512 VecFromMask(Mask512 v) { - return Vec512{_mm512_movm_epi32(v.raw)}; -} -HWY_API Vec512 VecFromMask(Mask512 v) { - return Vec512{_mm512_castsi512_ps(_mm512_movm_epi32(v.raw))}; -} - -HWY_API Vec512 VecFromMask(Mask512 v) { - return Vec512{_mm512_movm_epi64(v.raw)}; +template +HWY_API Vec512 VecFromMask(Mask512 m) { + return Vec512{_mm512_movm_epi32(m.raw)}; } -HWY_API Vec512 VecFromMask(Mask512 v) { - return Vec512{_mm512_movm_epi64(v.raw)}; +template +HWY_API Vec512 VecFromMask(Mask512 m) { + return Vec512{_mm512_movm_epi64(m.raw)}; } -HWY_API Vec512 VecFromMask(Mask512 v) { - return Vec512{_mm512_castsi512_pd(_mm512_movm_epi64(v.raw))}; +template +HWY_API Vec512 VecFromMask(Mask512 m) { + const Full512 d; + const Full512> di; + return BitCast(d, VecFromMask(RebindMask(di, m))); } // ------------------------------ Mask logical diff --git a/hwy/tests/mask_set_test.cc b/hwy/tests/mask_set_test.cc new file mode 100644 index 0000000000..85d8fd66f2 --- /dev/null +++ b/hwy/tests/mask_set_test.cc @@ -0,0 +1,317 @@ +// Copyright 2019 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 
(the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "tests/mask_set_test.cc" +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/tests/test_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { +namespace { + +struct TestMaskFalse { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { +#if HWY_HAVE_SCALABLE || HWY_TARGET_IS_SVE || HWY_TARGET == HWY_SCALAR + // For RVV, SVE and SCALAR, use the underlying native vector. + const DFromV> d2; +#else + // Other targets are strongly-typed, but we can safely ResizeBitCast to the + // native vector. All targets have at least 128-bit vectors, but NEON also + // supports 64-bit vectors. + constexpr size_t kMinD2Lanes = (HWY_TARGET_IS_NEON ? 8 : 16) / sizeof(T); + const FixedTag d2; +#endif + static_assert(d2.MaxBytes() >= d.MaxBytes(), + "d2.MaxBytes() >= d.MaxBytes() should be true"); + using V2 = Vec; + + // Various ways of checking that false masks are false. + HWY_ASSERT(AllFalse(d, MaskFalse(d))); + HWY_ASSERT_EQ(0, CountTrue(d, MaskFalse(d))); + HWY_ASSERT_VEC_EQ(d, Zero(d), VecFromMask(d, MaskFalse(d))); + +#if HWY_HAVE_SCALABLE || HWY_TARGET_IS_SVE + // For these targets, we can treat the result as if it were a vector of type + // `V2`. On SVE, vectors are always full (not fractional) and caps are only + // enforced by Highway ops. On RVV, LMUL must match but caps can also be + // ignored. For safety, MaskFalse also sets lanes >= `Lanes(d)` to false, + // and we verify that here. + HWY_ASSERT(AllFalse(d2, MaskFalse(d))); + HWY_ASSERT_EQ(0, CountTrue(d2, MaskFalse(d))); + HWY_ASSERT_VEC_EQ(d2, Zero(d2), VecFromMask(d2, MaskFalse(d))); +#endif + + // All targets support, and strongly-typed (non-scalable) targets require, + // ResizeBitCast before we compare to the 'native' underlying vector size. + const V2 actual2 = ResizeBitCast(d2, VecFromMask(d, MaskFalse(d))); + HWY_ASSERT_VEC_EQ(d2, Zero(d2), actual2); + } +}; + +HWY_NOINLINE void TestAllMaskFalse() { + ForAllTypes(ForPartialVectors()); +} + +struct TestFirstN { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + auto bool_lanes = AllocateAligned(N); + HWY_ASSERT(bool_lanes); + + using TN = SignedFromSize; + const size_t max_len = static_cast(LimitsMax()); + + const Vec k1 = Set(d, ConvertScalarTo(1)); + + const size_t max_lanes = HWY_MIN(2 * N, AdjustedReps(512)); + for (size_t len = 0; len <= HWY_MIN(max_lanes, max_len); ++len) { + // Loop instead of Iota+Lt to avoid wraparound for 8-bit T. + for (size_t i = 0; i < N; ++i) { + bool_lanes[i] = ConvertScalarTo(i < len ? 1 : 0); + } + const Mask expected = Eq(Load(d, bool_lanes.get()), k1); + HWY_ASSERT_MASK_EQ(d, expected, FirstN(d, len)); + } + + // Also ensure huge values yield all-true (unless the vector is actually + // larger than max_len). + for (size_t i = 0; i < N; ++i) { + bool_lanes[i] = ConvertScalarTo(i < max_len ? 
1 : 0); + } + const Mask expected = Eq(Load(d, bool_lanes.get()), k1); + HWY_ASSERT_MASK_EQ(d, expected, FirstN(d, max_len)); + } +}; + +HWY_NOINLINE void TestAllFirstN() { + ForAllTypes(ForPartialVectors()); +} + +struct TestSetBeforeFirst { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + using TI = MakeSigned; // For mask > 0 comparison + const Rebind di; + const size_t N = Lanes(di); + auto bool_lanes = AllocateAligned(N); + HWY_ASSERT(bool_lanes); + memset(bool_lanes.get(), 0, N * sizeof(TI)); + + // For all combinations of zero/nonzero state of subset of lanes: + const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(6))); + for (size_t code = 0; code < (1ull << max_lanes); ++code) { + for (size_t i = 0; i < max_lanes; ++i) { + bool_lanes[i] = (code & (1ull << i)) ? TI(1) : TI(0); + } + + const auto m = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di))); + + const size_t first_set_lane_idx = + (code != 0) + ? Num0BitsBelowLS1Bit_Nonzero64(static_cast(code)) + : N; + const auto expected_mask = FirstN(d, first_set_lane_idx); + + HWY_ASSERT_MASK_EQ(d, expected_mask, SetBeforeFirst(m)); + } + } +}; + +HWY_NOINLINE void TestAllSetBeforeFirst() { + ForAllTypes(ForPartialVectors()); +} + +struct TestSetAtOrBeforeFirst { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + using TI = MakeSigned; // For mask > 0 comparison + const Rebind di; + const size_t N = Lanes(di); + auto bool_lanes = AllocateAligned(N); + HWY_ASSERT(bool_lanes); + memset(bool_lanes.get(), 0, N * sizeof(TI)); + + // For all combinations of zero/nonzero state of subset of lanes: + const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(6))); + for (size_t code = 0; code < (1ull << max_lanes); ++code) { + for (size_t i = 0; i < max_lanes; ++i) { + bool_lanes[i] = (code & (1ull << i)) ? TI(1) : TI(0); + } + + const auto m = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di))); + + const size_t idx_after_first_set_lane = + (code != 0) + ? (Num0BitsBelowLS1Bit_Nonzero64(static_cast(code)) + 1) + : N; + const auto expected_mask = FirstN(d, idx_after_first_set_lane); + + HWY_ASSERT_MASK_EQ(d, expected_mask, SetAtOrBeforeFirst(m)); + } + } +}; + +HWY_NOINLINE void TestAllSetAtOrBeforeFirst() { + ForAllTypes(ForPartialVectors()); +} + +struct TestSetOnlyFirst { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + using TI = MakeSigned; // For mask > 0 comparison + const Rebind di; + const size_t N = Lanes(di); + auto bool_lanes = AllocateAligned(N); + HWY_ASSERT(bool_lanes); + memset(bool_lanes.get(), 0, N * sizeof(TI)); + auto expected_lanes = AllocateAligned(N); + HWY_ASSERT(expected_lanes); + + // For all combinations of zero/nonzero state of subset of lanes: + const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(6))); + for (size_t code = 0; code < (1ull << max_lanes); ++code) { + for (size_t i = 0; i < max_lanes; ++i) { + bool_lanes[i] = (code & (1ull << i)) ? 
TI(1) : TI(0); + } + + memset(expected_lanes.get(), 0, N * sizeof(TI)); + if (code != 0) { + const size_t idx_of_first_lane = + Num0BitsBelowLS1Bit_Nonzero64(static_cast(code)); + expected_lanes[idx_of_first_lane] = TI(1); + } + + const auto m = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di))); + const auto expected_mask = + RebindMask(d, Gt(Load(di, expected_lanes.get()), Zero(di))); + + HWY_ASSERT_MASK_EQ(d, expected_mask, SetOnlyFirst(m)); + } + } +}; + +HWY_NOINLINE void TestAllSetOnlyFirst() { + ForAllTypes(ForPartialVectors()); +} + +struct TestSetAtOrAfterFirst { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + using TI = MakeSigned; // For mask > 0 comparison + const Rebind di; + const size_t N = Lanes(di); + auto bool_lanes = AllocateAligned(N); + HWY_ASSERT(bool_lanes); + memset(bool_lanes.get(), 0, N * sizeof(TI)); + + // For all combinations of zero/nonzero state of subset of lanes: + const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(6))); + for (size_t code = 0; code < (1ull << max_lanes); ++code) { + for (size_t i = 0; i < max_lanes; ++i) { + bool_lanes[i] = (code & (1ull << i)) ? TI(1) : TI(0); + } + + const auto m = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di))); + + const size_t first_set_lane_idx = + (code != 0) + ? Num0BitsBelowLS1Bit_Nonzero64(static_cast(code)) + : N; + const auto expected_at_or_after_first_mask = + Not(FirstN(d, first_set_lane_idx)); + const auto actual_at_or_after_first_mask = SetAtOrAfterFirst(m); + + HWY_ASSERT_MASK_EQ(d, expected_at_or_after_first_mask, + actual_at_or_after_first_mask); + HWY_ASSERT_MASK_EQ( + d, SetOnlyFirst(m), + And(actual_at_or_after_first_mask, SetAtOrBeforeFirst(m))); + HWY_ASSERT_MASK_EQ(d, m, And(m, actual_at_or_after_first_mask)); + HWY_ASSERT( + AllTrue(d, Xor(actual_at_or_after_first_mask, SetBeforeFirst(m)))); + } + } +}; + +HWY_NOINLINE void TestAllSetAtOrAfterFirst() { + ForAllTypes(ForPartialVectors()); +} + +struct TestDup128MaskFromMaskBits { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + using TI = MakeSigned; // For mask > 0 comparison + const Rebind di; + const size_t N = Lanes(di); + constexpr size_t kLanesPer16ByteBlock = 16 / sizeof(T); + + auto expected = AllocateAligned(N); + HWY_ASSERT(expected); + + // For all combinations of zero/nonzero state of subset of lanes: + constexpr size_t kMaxLanesToCheckPerBlk = + HWY_MIN(HWY_MAX_LANES_D(D), HWY_MIN(kLanesPer16ByteBlock, 10)); + const size_t max_lanes = HWY_MIN(N, kMaxLanesToCheckPerBlk); + + for (unsigned code = 0; code < (1u << max_lanes); ++code) { + for (size_t i = 0; i < N; i++) { + expected[i] = static_cast( + -static_cast((code >> (i & (kLanesPer16ByteBlock - 1))) & 1)); + } + + const auto expected_mask = + MaskFromVec(BitCast(d, LoadDup128(di, expected.get()))); + + const auto m = Dup128MaskFromMaskBits(d, code); + HWY_ASSERT_VEC_EQ(di, expected.get(), VecFromMask(di, RebindMask(di, m))); + HWY_ASSERT_MASK_EQ(d, expected_mask, m); + } + } +}; + +HWY_NOINLINE void TestAllDup128MaskFromMaskBits() { + ForAllTypes(ForPartialVectors()); +} + +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_BEFORE_TEST(HwyMaskSetTest); +HWY_EXPORT_AND_TEST_P(HwyMaskSetTest, TestAllMaskFalse); +HWY_EXPORT_AND_TEST_P(HwyMaskSetTest, TestAllFirstN); +HWY_EXPORT_AND_TEST_P(HwyMaskSetTest, TestAllSetBeforeFirst); +HWY_EXPORT_AND_TEST_P(HwyMaskSetTest, 
TestAllSetAtOrBeforeFirst); +HWY_EXPORT_AND_TEST_P(HwyMaskSetTest, TestAllSetOnlyFirst); +HWY_EXPORT_AND_TEST_P(HwyMaskSetTest, TestAllSetAtOrAfterFirst); +HWY_EXPORT_AND_TEST_P(HwyMaskSetTest, TestAllDup128MaskFromMaskBits); +HWY_AFTER_TEST(); +} // namespace +} // namespace hwy +HWY_TEST_MAIN(); +#endif // HWY_ONCE diff --git a/hwy/tests/mask_test.cc b/hwy/tests/mask_test.cc index 3ad55f5ced..afd564b46e 100644 --- a/hwy/tests/mask_test.cc +++ b/hwy/tests/mask_test.cc @@ -15,7 +15,7 @@ #include #include -#include // memcmp +#include // memset #undef HWY_TARGET_INCLUDE #define HWY_TARGET_INCLUDE "tests/mask_test.cc" @@ -28,52 +28,7 @@ namespace hwy { namespace HWY_NAMESPACE { namespace { -// All types. -struct TestMaskFalse { - template - HWY_NOINLINE void operator()(T /*unused*/, D d) { -#if HWY_HAVE_SCALABLE || HWY_TARGET_IS_SVE || HWY_TARGET == HWY_SCALAR - // For RVV, SVE and SCALAR, use the underlying native vector. - const DFromV> d2; -#else - // Other targets are strongly-typed, but we can safely ResizeBitCast to the - // native vector. All targets have at least 128-bit vectors, but NEON also - // supports 64-bit vectors. - constexpr size_t kMinD2Lanes = (HWY_TARGET_IS_NEON ? 8 : 16) / sizeof(T); - const FixedTag d2; -#endif - static_assert(d2.MaxBytes() >= d.MaxBytes(), - "d2.MaxBytes() >= d.MaxBytes() should be true"); - using V2 = Vec; - - // Various ways of checking that false masks are false. - HWY_ASSERT(AllFalse(d, MaskFalse(d))); - HWY_ASSERT_EQ(0, CountTrue(d, MaskFalse(d))); - HWY_ASSERT_VEC_EQ(d, Zero(d), VecFromMask(d, MaskFalse(d))); - -#if HWY_HAVE_SCALABLE || HWY_TARGET_IS_SVE - // For these targets, we can treat the result as if it were a vector of type - // `V2`. On SVE, vectors are always full (not fractional) and caps are only - // enforced by Highway ops. On RVV, LMUL must match but caps can also be - // ignored. For safety, MaskFalse also sets lanes >= `Lanes(d)` to false, - // and we verify that here. - HWY_ASSERT(AllFalse(d2, MaskFalse(d))); - HWY_ASSERT_EQ(0, CountTrue(d2, MaskFalse(d))); - HWY_ASSERT_VEC_EQ(d2, Zero(d2), VecFromMask(d2, MaskFalse(d))); -#endif - - // All targets support, and strongly-typed (non-scalable) targets require, - // ResizeBitCast before we compare to the 'native' underlying vector size. - const V2 actual2 = ResizeBitCast(d2, VecFromMask(d, MaskFalse(d))); - HWY_ASSERT_VEC_EQ(d2, Zero(d2), actual2); - } -}; - -HWY_NOINLINE void TestAllMaskFalse() { - ForAllTypes(ForPartialVectors()); -} - -struct TestFromVec { +struct TestMaskFromVec { template HWY_NOINLINE void operator()(T /*unused*/, D d) { const size_t N = Lanes(d); @@ -81,86 +36,87 @@ struct TestFromVec { HWY_ASSERT(lanes); memset(lanes.get(), 0, N * sizeof(T)); - const auto actual_false = MaskFromVec(Load(d, lanes.get())); + const Mask actual_false = MaskFromVec(Load(d, lanes.get())); HWY_ASSERT_MASK_EQ(d, MaskFalse(d), actual_false); memset(lanes.get(), 0xFF, N * sizeof(T)); - const auto actual_true = MaskFromVec(Load(d, lanes.get())); + const Mask actual_true = MaskFromVec(Load(d, lanes.get())); HWY_ASSERT_MASK_EQ(d, MaskTrue(d), actual_true); } }; -HWY_NOINLINE void TestAllFromVec() { - ForAllTypes(ForPartialVectors()); +HWY_NOINLINE void TestAllMaskFromVec() { + ForAllTypes(ForPartialVectors()); } -struct TestFirstN { +// Round trip, using MaskFromVec. 
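+// (The test below verifies MaskFromVec(VecFromMask(d, m)) == m for randomly
+// generated masks m.)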
+struct TestVecFromMask { template HWY_NOINLINE void operator()(T /*unused*/, D d) { - const size_t N = Lanes(d); - auto bool_lanes = AllocateAligned(N); - HWY_ASSERT(bool_lanes); + RandomState rng; - using TN = SignedFromSize; - const size_t max_len = static_cast(LimitsMax()); + using M = Mask; // == MFromD +// Ensure DFromM works on all targets except `SVE` and `RVV`, whose built-in +// mask types are not strongly typed. +#if !HWY_TARGET_IS_SVE && HWY_TARGET != HWY_RVV + static_assert(hwy::IsSame, D>(), ""); +#endif - const Vec k1 = Set(d, ConvertScalarTo(1)); + using TI = MakeSigned; // For mask > 0 comparison + const Rebind di; + const size_t N = Lanes(d); + auto lanes = AllocateAligned(N); + HWY_ASSERT(lanes); - const size_t max_lanes = HWY_MIN(2 * N, AdjustedReps(512)); - for (size_t len = 0; len <= HWY_MIN(max_lanes, max_len); ++len) { - // Loop instead of Iota+Lt to avoid wraparound for 8-bit T. + // Each lane should have a chance of having mask=true. + for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { for (size_t i = 0; i < N; ++i) { - bool_lanes[i] = ConvertScalarTo(i < len ? 1 : 0); + lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0); } - const Mask expected = Eq(Load(d, bool_lanes.get()), k1); - HWY_ASSERT_MASK_EQ(d, expected, FirstN(d, len)); - } - // Also ensure huge values yield all-true (unless the vector is actually - // larger than max_len). - for (size_t i = 0; i < N; ++i) { - bool_lanes[i] = ConvertScalarTo(i < max_len ? 1 : 0); + const M mask = RebindMask(d, Gt(Load(di, lanes.get()), Zero(di))); + HWY_ASSERT_MASK_EQ(d, mask, MaskFromVec(VecFromMask(d, mask))); } - const Mask expected = Eq(Load(d, bool_lanes.get()), k1); - HWY_ASSERT_MASK_EQ(d, expected, FirstN(d, max_len)); } }; -HWY_NOINLINE void TestAllFirstN() { - ForAllTypes(ForPartialVectors()); +HWY_NOINLINE void TestAllVecFromMask() { + ForAllTypes(ForPartialVectors()); } -struct TestMaskVec { +struct TestBitsFromMask { template HWY_NOINLINE void operator()(T /*unused*/, D d) { +#if HWY_MAX_BYTES > 64 + (void)d; +#else RandomState rng; using TI = MakeSigned; // For mask > 0 comparison const Rebind di; const size_t N = Lanes(d); - auto bool_lanes = AllocateAligned(N); - HWY_ASSERT(bool_lanes); + HWY_ASSERT(N <= 64); // non-scalable targets have at most 512 bits. + auto lanes = AllocateAligned(N); + HWY_ASSERT(lanes); // Each lane should have a chance of having mask=true. for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { + uint64_t expected_bits = 0; for (size_t i = 0; i < N; ++i) { - bool_lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0); + lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0); + expected_bits |= lanes[i] ? 
(1ull << i) : 0; } - const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di))); - HWY_ASSERT_MASK_EQ(d, mask, MaskFromVec(VecFromMask(d, mask))); + const Mask mask = RebindMask(d, Gt(Load(di, lanes.get()), Zero(di))); + const uint64_t actual_bits = BitsFromMask(d, mask); + HWY_ASSERT_EQ(expected_bits, actual_bits); } +#endif // HWY_MAX_BYTES > 64 } }; -HWY_NOINLINE void TestAllMaskVec() { - const ForPartialVectors test; - - test(uint16_t()); - test(int16_t()); - // TODO(janwas): float16_t - cannot compare yet - - ForUIF3264(test); +HWY_NOINLINE void TestAllBitsFromMask() { + ForAllTypes(ForPartialVectors()); } struct TestAllTrueFalse { @@ -361,192 +317,6 @@ HWY_NOINLINE void TestAllLogicalMask() { ForAllTypes(ForPartialVectors()); } -struct TestSetBeforeFirst { - template - HWY_NOINLINE void operator()(T /*unused*/, D d) { - using TI = MakeSigned; // For mask > 0 comparison - const Rebind di; - const size_t N = Lanes(di); - auto bool_lanes = AllocateAligned(N); - HWY_ASSERT(bool_lanes); - memset(bool_lanes.get(), 0, N * sizeof(TI)); - - // For all combinations of zero/nonzero state of subset of lanes: - const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(6))); - for (size_t code = 0; code < (1ull << max_lanes); ++code) { - for (size_t i = 0; i < max_lanes; ++i) { - bool_lanes[i] = (code & (1ull << i)) ? TI(1) : TI(0); - } - - const auto m = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di))); - - const size_t first_set_lane_idx = - (code != 0) - ? Num0BitsBelowLS1Bit_Nonzero64(static_cast(code)) - : N; - const auto expected_mask = FirstN(d, first_set_lane_idx); - - HWY_ASSERT_MASK_EQ(d, expected_mask, SetBeforeFirst(m)); - } - } -}; - -HWY_NOINLINE void TestAllSetBeforeFirst() { - ForAllTypes(ForPartialVectors()); -} - -struct TestSetAtOrBeforeFirst { - template - HWY_NOINLINE void operator()(T /*unused*/, D d) { - using TI = MakeSigned; // For mask > 0 comparison - const Rebind di; - const size_t N = Lanes(di); - auto bool_lanes = AllocateAligned(N); - HWY_ASSERT(bool_lanes); - memset(bool_lanes.get(), 0, N * sizeof(TI)); - - // For all combinations of zero/nonzero state of subset of lanes: - const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(6))); - for (size_t code = 0; code < (1ull << max_lanes); ++code) { - for (size_t i = 0; i < max_lanes; ++i) { - bool_lanes[i] = (code & (1ull << i)) ? TI(1) : TI(0); - } - - const auto m = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di))); - - const size_t idx_after_first_set_lane = - (code != 0) - ? (Num0BitsBelowLS1Bit_Nonzero64(static_cast(code)) + 1) - : N; - const auto expected_mask = FirstN(d, idx_after_first_set_lane); - - HWY_ASSERT_MASK_EQ(d, expected_mask, SetAtOrBeforeFirst(m)); - } - } -}; - -HWY_NOINLINE void TestAllSetAtOrBeforeFirst() { - ForAllTypes(ForPartialVectors()); -} - -struct TestSetOnlyFirst { - template - HWY_NOINLINE void operator()(T /*unused*/, D d) { - using TI = MakeSigned; // For mask > 0 comparison - const Rebind di; - const size_t N = Lanes(di); - auto bool_lanes = AllocateAligned(N); - HWY_ASSERT(bool_lanes); - memset(bool_lanes.get(), 0, N * sizeof(TI)); - auto expected_lanes = AllocateAligned(N); - HWY_ASSERT(expected_lanes); - - // For all combinations of zero/nonzero state of subset of lanes: - const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(6))); - for (size_t code = 0; code < (1ull << max_lanes); ++code) { - for (size_t i = 0; i < max_lanes; ++i) { - bool_lanes[i] = (code & (1ull << i)) ? 
TI(1) : TI(0); - } - - memset(expected_lanes.get(), 0, N * sizeof(TI)); - if (code != 0) { - const size_t idx_of_first_lane = - Num0BitsBelowLS1Bit_Nonzero64(static_cast(code)); - expected_lanes[idx_of_first_lane] = TI(1); - } - - const auto m = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di))); - const auto expected_mask = - RebindMask(d, Gt(Load(di, expected_lanes.get()), Zero(di))); - - HWY_ASSERT_MASK_EQ(d, expected_mask, SetOnlyFirst(m)); - } - } -}; - -HWY_NOINLINE void TestAllSetOnlyFirst() { - ForAllTypes(ForPartialVectors()); -} - -struct TestSetAtOrAfterFirst { - template - HWY_NOINLINE void operator()(T /*unused*/, D d) { - using TI = MakeSigned; // For mask > 0 comparison - const Rebind di; - const size_t N = Lanes(di); - auto bool_lanes = AllocateAligned(N); - HWY_ASSERT(bool_lanes); - memset(bool_lanes.get(), 0, N * sizeof(TI)); - - // For all combinations of zero/nonzero state of subset of lanes: - const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(6))); - for (size_t code = 0; code < (1ull << max_lanes); ++code) { - for (size_t i = 0; i < max_lanes; ++i) { - bool_lanes[i] = (code & (1ull << i)) ? TI(1) : TI(0); - } - - const auto m = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di))); - - const size_t first_set_lane_idx = - (code != 0) - ? Num0BitsBelowLS1Bit_Nonzero64(static_cast(code)) - : N; - const auto expected_at_or_after_first_mask = - Not(FirstN(d, first_set_lane_idx)); - const auto actual_at_or_after_first_mask = SetAtOrAfterFirst(m); - - HWY_ASSERT_MASK_EQ(d, expected_at_or_after_first_mask, - actual_at_or_after_first_mask); - HWY_ASSERT_MASK_EQ( - d, SetOnlyFirst(m), - And(actual_at_or_after_first_mask, SetAtOrBeforeFirst(m))); - HWY_ASSERT_MASK_EQ(d, m, And(m, actual_at_or_after_first_mask)); - HWY_ASSERT( - AllTrue(d, Xor(actual_at_or_after_first_mask, SetBeforeFirst(m)))); - } - } -}; - -HWY_NOINLINE void TestAllSetAtOrAfterFirst() { - ForAllTypes(ForPartialVectors()); -} - -struct TestDup128MaskFromMaskBits { - template - HWY_NOINLINE void operator()(T /*unused*/, D d) { - using TI = MakeSigned; // For mask > 0 comparison - const Rebind di; - const size_t N = Lanes(di); - constexpr size_t kLanesPer16ByteBlock = 16 / sizeof(T); - - auto expected = AllocateAligned(N); - HWY_ASSERT(expected); - - // For all combinations of zero/nonzero state of subset of lanes: - constexpr size_t kMaxLanesToCheckPerBlk = - HWY_MIN(HWY_MAX_LANES_D(D), HWY_MIN(kLanesPer16ByteBlock, 10)); - const size_t max_lanes = HWY_MIN(N, kMaxLanesToCheckPerBlk); - - for (unsigned code = 0; code < (1u << max_lanes); ++code) { - for (size_t i = 0; i < N; i++) { - expected[i] = static_cast( - -static_cast((code >> (i & (kLanesPer16ByteBlock - 1))) & 1)); - } - - const auto expected_mask = - MaskFromVec(BitCast(d, LoadDup128(di, expected.get()))); - - const auto m = Dup128MaskFromMaskBits(d, code); - HWY_ASSERT_VEC_EQ(di, expected.get(), VecFromMask(di, RebindMask(di, m))); - HWY_ASSERT_MASK_EQ(d, expected_mask, m); - } - } -}; - -HWY_NOINLINE void TestAllDup128MaskFromMaskBits() { - ForAllTypes(ForPartialVectors()); -} - } // namespace // NOLINTNEXTLINE(google-readability-namespace-comments) } // namespace HWY_NAMESPACE @@ -557,20 +327,14 @@ HWY_AFTER_NAMESPACE(); namespace hwy { namespace { HWY_BEFORE_TEST(HwyMaskTest); -HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllMaskFalse); -HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllFromVec); -HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllFirstN); -HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllMaskVec); +HWY_EXPORT_AND_TEST_P(HwyMaskTest, 
TestAllMaskFromVec); +HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllVecFromMask); +HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllBitsFromMask); HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllAllTrueFalse); HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllCountTrue); HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllFindFirstTrue); HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllFindLastTrue); HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllLogicalMask); -HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllSetBeforeFirst); -HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllSetAtOrBeforeFirst); -HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllSetOnlyFirst); -HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllSetAtOrAfterFirst); -HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllDup128MaskFromMaskBits); HWY_AFTER_TEST(); } // namespace } // namespace hwy From 07396f9fd60f94b17650992cb7443ba5d48147cc Mon Sep 17 00:00:00 2001 From: John Platts Date: Mon, 2 Dec 2024 11:45:57 -0600 Subject: [PATCH 16/64] Fix for RVV CMake detection if cross-compiling with Clang --- CMakeLists.txt | 54 +++++++++++++++++++++++++------------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 30196faa78..115d0ab915 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -59,33 +59,6 @@ if(CHECK_PIE_SUPPORTED) endif() endif() -if (CMAKE_CXX_COMPILER_ARCHITECTURE_ID MATCHES "RISCV32|RISCV64|RISCV128" OR CMAKE_SYSTEM_PROCESSOR MATCHES "riscv32|riscv64|riscv128") - include(CheckCSourceCompiles) - check_c_source_compiles(" - #if __riscv_xlen == 64 - int main() { return 0; } - #else - #error Not RISCV-64 - #endif - " IS_RISCV_XLEN_64) - - check_c_source_compiles(" - #if __riscv_xlen == 32 - int main() { return 0; } - #else - #error Not RISCV-32 - #endif - " IS_RISCV_XLEN_32) - - if(IS_RISCV_XLEN_32) - set(RISCV_XLEN 32) - elseif(IS_RISCV_XLEN_64) - set(RISCV_XLEN 64) - else() - message(WARNING "Unable to determine RISC-V XLEN") - endif() -endif() - include(GNUInstallDirs) if (NOT CMAKE_BUILD_TYPE) @@ -163,6 +136,33 @@ check_cxx_source_compiles( HWY_RISCV ) +if (HWY_RISCV OR CMAKE_CXX_COMPILER_ARCHITECTURE_ID MATCHES "RISCV32|RISCV64|RISCV128" OR CMAKE_SYSTEM_PROCESSOR MATCHES "riscv32|riscv64|riscv128") + include(CheckCSourceCompiles) + check_c_source_compiles(" + #if __riscv_xlen == 64 + int main() { return 0; } + #else + #error Not RISCV-64 + #endif + " IS_RISCV_XLEN_64) + + check_c_source_compiles(" + #if __riscv_xlen == 32 + int main() { return 0; } + #else + #error Not RISCV-32 + #endif + " IS_RISCV_XLEN_32) + + if(IS_RISCV_XLEN_32) + set(RISCV_XLEN 32) + elseif(IS_RISCV_XLEN_64) + set(RISCV_XLEN 64) + else() + message(WARNING "Unable to determine RISC-V XLEN") + endif() +endif() + if (HWY_ENABLE_CONTRIB) # Glob all the traits so we don't need to modify this file when adding # additional special cases. 
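As context for the BitsFromMask changes earlier in this series: the op is now a public API that takes a descriptor tag, instead of a detail:: helper that deduced the type from the mask. The following is a minimal usage sketch, not part of any patch here; the wrapper name is illustrative, it assumes compilation per-target inside HWY_NAMESPACE after including highway.h, and BitsFromMask is only exercised on targets with vectors of at most 64 bytes (the same assumption as the new TestBitsFromMask).

    namespace hn = hwy::HWY_NAMESPACE;

    // Counts set mask lanes by extracting the lane bit pattern, then
    // popcounting the resulting uint64_t.
    template <class D>
    size_t CountTrueViaBits(D d, hn::MFromD<D> mask) {
      return hwy::PopCount(hn::BitsFromMask(d, mask));
    }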
From 914cb69c58c814e70d757ddc4d158e7c27ee43e8 Mon Sep 17 00:00:00 2001 From: John Platts Date: Mon, 2 Dec 2024 13:20:09 -0600 Subject: [PATCH 17/64] Made changes to RVV Concat, Combine, ZeroExtendVector, and UpperHalf ops --- hwy/ops/rvv-inl.h | 116 ++++++++++++++++++++++++++-------------------- 1 file changed, 65 insertions(+), 51 deletions(-) diff --git a/hwy/ops/rvv-inl.h b/hwy/ops/rvv-inl.h index 62ac160f06..3df8536fde 100644 --- a/hwy/ops/rvv-inl.h +++ b/hwy/ops/rvv-inl.h @@ -3183,10 +3183,9 @@ HWY_RVV_FOREACH(HWY_RVV_SLIDE_DOWN, SlideDown, slidedown, _ALL) SHIFT, MLEN, NAME, OP) \ template \ HWY_API HWY_RVV_V(BASE, SEW, LMULH) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \ - if constexpr (kIndex == 0) { \ - return Trunc(v); \ - } else { \ - static_assert(kIndex == 1); \ + static_assert(kIndex == 0 || kIndex == 1, "kIndex must be 0 or 1"); \ + HWY_IF_CONSTEXPR(kIndex == 0) { return Trunc(v); } \ + else { \ return Trunc(SlideDown( \ v, Lanes(HWY_RVV_D(BASE, SEW, HWY_LANES(HWY_RVV_T(BASE, SEW)), \ SHIFT - 1){}))); \ @@ -3196,10 +3195,9 @@ HWY_RVV_FOREACH(HWY_RVV_SLIDE_DOWN, SlideDown, slidedown, _ALL) SHIFT, MLEN, NAME, OP) \ template \ HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \ - if constexpr (kIndex == 0) { \ - return v; \ - } else { \ - static_assert(kIndex == 1); \ + static_assert(kIndex == 0 || kIndex == 1, "kIndex must be 0 or 1"); \ + HWY_IF_CONSTEXPR(kIndex == 0) { return v; } \ + else { \ return SlideDown( \ v, Lanes(HWY_RVV_D(BASE, SEW, HWY_LANES(HWY_RVV_T(BASE, SEW)), \ SHIFT){}) / \ @@ -3213,6 +3211,23 @@ HWY_RVV_FOREACH(HWY_RVV_GET_SMALLEST, Get, get, _GET_SET_SMALLEST) #undef HWY_RVV_GET_VIRT #undef HWY_RVV_GET_SMALLEST +template +static HWY_INLINE HWY_MAYBE_UNUSED VFromD>> +Get(D d, VFromD v) { + static_assert(kIndex == 0 || kIndex == 1, "kIndex must be 0 or 1"); + + const AdjustSimdTagToMinVecPow2> dh; + HWY_IF_CONSTEXPR(kIndex == 0 || detail::IsFull(d)) { + (void)dh; + return Get(v); + } + else { + const size_t slide_down_amt = + (dh.Pow2() < DFromV().Pow2()) ? 
Lanes(dh) : (Lanes(d) / 2); + return ResizeBitCast(dh, SlideDown(v, slide_down_amt)); + } +} + #define HWY_RVV_SET(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ MLEN, NAME, OP) \ template \ @@ -3226,14 +3241,15 @@ HWY_RVV_FOREACH(HWY_RVV_GET_SMALLEST, Get, get, _GET_SET_SMALLEST) template \ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ NAME(HWY_RVV_V(BASE, SEW, LMUL) dest, HWY_RVV_V(BASE, SEW, LMULH) v) { \ + static_assert(kIndex == 0 || kIndex == 1, "kIndex must be 0 or 1"); \ auto d = HWY_RVV_D(BASE, SEW, HWY_LANES(HWY_RVV_T(BASE, SEW)), SHIFT){}; \ auto df2 = \ HWY_RVV_D(BASE, SEW, HWY_LANES(HWY_RVV_T(BASE, SEW)), SHIFT - 1){}; \ - if constexpr (kIndex == 0) { \ + HWY_IF_CONSTEXPR(kIndex == 0) { \ return __riscv_vmv_v_v_##CHAR##SEW##LMUL##_tu(dest, Ext(d, v), \ Lanes(df2)); \ - } else { \ - static_assert(kIndex == 1); \ + } \ + else { \ return SlideUp(dest, Ext(d, v), Lanes(df2)); \ } \ } @@ -3242,11 +3258,12 @@ HWY_RVV_FOREACH(HWY_RVV_GET_SMALLEST, Get, get, _GET_SET_SMALLEST) template \ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ NAME(HWY_RVV_V(BASE, SEW, LMUL) dest, HWY_RVV_V(BASE, SEW, LMUL) v) { \ + static_assert(kIndex == 0 || kIndex == 1, "kIndex must be 0 or 1"); \ auto d = HWY_RVV_D(BASE, SEW, HWY_LANES(HWY_RVV_T(BASE, SEW)), SHIFT){}; \ - if constexpr (kIndex == 0) { \ + HWY_IF_CONSTEXPR(kIndex == 0) { \ return __riscv_vmv_v_v_##CHAR##SEW##LMUL##_tu(dest, v, Lanes(d) / 2); \ - } else { \ - static_assert(kIndex == 1); \ + } \ + else { \ return SlideUp(dest, v, Lanes(d) / 2); \ } \ } @@ -3257,6 +3274,23 @@ HWY_RVV_FOREACH(HWY_RVV_SET_SMALLEST, Set, set, _GET_SET_SMALLEST) #undef HWY_RVV_SET_VIRT #undef HWY_RVV_SET_SMALLEST +template +static HWY_INLINE HWY_MAYBE_UNUSED VFromD Set( + D d, VFromD dest, VFromD>> v) { + static_assert(kIndex == 0 || kIndex == 1, "kIndex must be 0 or 1"); + + const AdjustSimdTagToMinVecPow2> dh; + HWY_IF_CONSTEXPR(kIndex == 0 || detail::IsFull(d)) { + (void)dh; + return Set(dest, v); + } + else { + const size_t slide_up_amt = + (dh.Pow2() < DFromV().Pow2()) ? Lanes(dh) : (Lanes(d) / 2); + return SlideUp(dest, ResizeBitCast(d, v), slide_up_amt); + } +} + } // namespace detail // ------------------------------ SlideUpLanes @@ -3278,58 +3312,37 @@ HWY_API VFromD SlideDownLanes(D d, VFromD v, size_t amt) { // ------------------------------ ConcatUpperLower template -HWY_API V ConcatUpperLower(D, const V hi, const V lo) { - const auto lo_lower = detail::Get<0>(lo); - return detail::Set<0>(hi, lo_lower); +HWY_API V ConcatUpperLower(D d, const V hi, const V lo) { + const auto lo_lower = detail::Get<0>(d, lo); + return detail::Set<0>(d, hi, lo_lower); } // ------------------------------ ConcatLowerLower template -HWY_API V ConcatLowerLower(D, const V hi, const V lo) { - const auto hi_lower = detail::Get<0>(hi); - return detail::Set<1>(lo, hi_lower); +HWY_API V ConcatLowerLower(D d, const V hi, const V lo) { + const auto hi_lower = detail::Get<0>(d, hi); + return detail::Set<1>(d, lo, hi_lower); } // ------------------------------ ConcatUpperUpper template -HWY_API V ConcatUpperUpper(D, const V hi, const V lo) { - const auto lo_upper = detail::Get<1>(lo); - return detail::Set<0>(hi, lo_upper); +HWY_API V ConcatUpperUpper(D d, const V hi, const V lo) { + const auto lo_upper = detail::Get<1>(d, lo); + return detail::Set<0>(d, hi, lo_upper); } // ------------------------------ ConcatLowerUpper -namespace detail { - -// Only getting a full register is a no-op. 
-template -constexpr bool IsGetNoOp(D d) { - return d.Pow2() >= 0; -} - -} // namespace detail - -template ())>* = nullptr> -HWY_API V ConcatLowerUpper(D d, const V hi, const V lo) { - const auto lo_upper = detail::Get<1>(lo); - const auto hi_lower = detail::Get<0>(hi); - const auto undef = Undefined(d); - return detail::Set<1>(detail::Set<0>(undef, lo_upper), hi_lower); -} - -template ())>* = nullptr> +template HWY_API V ConcatLowerUpper(D d, const V hi, const V lo) { - const size_t half = Lanes(d) / 2; - const V lo_down = detail::SlideDown(lo, half); - return detail::SlideUp(lo_down, hi, half); + const auto lo_upper = detail::Get<1>(d, lo); + const auto hi_lower = detail::Get<0>(d, hi); + return detail::Set<1>(d, ResizeBitCast(d, lo_upper), hi_lower); } // ------------------------------ Combine template HWY_API VFromD Combine(D2 d2, const V hi, const V lo) { - return detail::SlideUp(detail::Ext(d2, lo), detail::Ext(d2, hi), - Lanes(d2) / 2); + return detail::Set<1>(d2, ResizeBitCast(d2, lo), hi); } // ------------------------------ ZeroExtendVector @@ -3376,8 +3389,9 @@ HWY_API VFromD>> LowerHalf(const V v) { } template -HWY_API VFromD UpperHalf(const DH d2, const VFromD> v) { - return LowerHalf(d2, detail::SlideDown(v, Lanes(d2))); +HWY_API VFromD UpperHalf(const DH /*d2*/, const VFromD> v) { + const Twice d; + return detail::Get<1>(d, v); } // ================================================== SWIZZLE From 80839b54827ef570614bf7a7ad23083ac16064c5 Mon Sep 17 00:00:00 2001 From: John Platts Date: Mon, 2 Dec 2024 15:49:20 -0600 Subject: [PATCH 18/64] Enable tuples on RVV with Clang 17 or later --- hwy/ops/set_macros-inl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hwy/ops/set_macros-inl.h b/hwy/ops/set_macros-inl.h index 1d80bf213c..447442827b 100644 --- a/hwy/ops/set_macros-inl.h +++ b/hwy/ops/set_macros-inl.h @@ -78,7 +78,7 @@ // Supported on all targets except RVV (requires GCC 14 or upcoming Clang) #if HWY_TARGET == HWY_RVV && \ ((HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1400) || \ - (HWY_COMPILER_CLANG)) + (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1700)) #define HWY_HAVE_TUPLE 0 #else #define HWY_HAVE_TUPLE 1 From fccc82d15d861a0fb54084ae6cbfe561ec43ffba Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 6 Dec 2024 13:16:42 +0000 Subject: [PATCH 19/64] Bump actions/cache from 4.0.2 to 4.2.0 Bumps [actions/cache](https://github.com/actions/cache) from 4.0.2 to 4.2.0. - [Release notes](https://github.com/actions/cache/releases) - [Changelog](https://github.com/actions/cache/blob/main/RELEASES.md) - [Commits](https://github.com/actions/cache/compare/0c45773b623bea8c8e75f6c82b208c3cf94ea4f9...1bd1e32a3bdc45362d1e726936510720a7c30a57) --- updated-dependencies: - dependency-name: actions/cache dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- .github/workflows/build_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_test.yml b/.github/workflows/build_test.yml index ec62eaa44e..862f8e8f08 100644 --- a/.github/workflows/build_test.yml +++ b/.github/workflows/build_test.yml @@ -342,7 +342,7 @@ jobs: - uses: bazelbuild/setup-bazelisk@b39c379c82683a5f25d34f0d062761f62693e0b2 # v3.0.0 - - uses: actions/cache@0c45773b623bea8c8e75f6c82b208c3cf94ea4f9 # v4.0.2 + - uses: actions/cache@1bd1e32a3bdc45362d1e726936510720a7c30a57 # v4.2.0 with: path: ~/.cache/bazel key: bazel-${{ runner.os }} From 83b81ab64aeb34acbc55626bb3e9c08638471277 Mon Sep 17 00:00:00 2001 From: Jan Wassenberg Date: Wed, 11 Dec 2024 09:28:26 -0800 Subject: [PATCH 20/64] add perf_counters PiperOrigin-RevId: 705134221 --- BUILD | 15 ++ CMakeLists.txt | 2 + hwy/perf_counters.cc | 372 ++++++++++++++++++++++++++++++++++++++ hwy/perf_counters.h | 156 ++++++++++++++++ hwy/perf_counters_test.cc | 158 ++++++++++++++++ 5 files changed, 703 insertions(+) create mode 100644 hwy/perf_counters.cc create mode 100644 hwy/perf_counters.h create mode 100644 hwy/perf_counters_test.cc diff --git a/BUILD b/BUILD index ee379363cc..1c12d6ab52 100644 --- a/BUILD +++ b/BUILD @@ -256,6 +256,19 @@ cc_library( ], ) +cc_library( + name = "perf_counters", + srcs = ["hwy/perf_counters.cc"], + hdrs = ["hwy/perf_counters.h"], + compatible_with = [], + copts = COPTS, + deps = [ + ":bit_set", + ":hwy", + ":nanobenchmark", + ], +) + cc_library( name = "profiler", hdrs = [ @@ -486,6 +499,7 @@ HWY_TESTS = [ ("hwy/", "bit_set_test"), ("hwy/", "highway_test"), ("hwy/", "nanobenchmark_test"), + ("hwy/", "perf_counters_test"), ("hwy/", "targets_test"), ("hwy/tests/", "arithmetic_test"), ("hwy/tests/", "bit_permute_test"), @@ -564,6 +578,7 @@ HWY_TEST_DEPS = [ ":math", ":matvec", ":nanobenchmark", + ":perf_counters", ":random", ":skeleton", ":thread_pool", diff --git a/CMakeLists.txt b/CMakeLists.txt index 115d0ab915..a1598a8a0a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -236,6 +236,7 @@ if (NOT HWY_CMAKE_HEADER_ONLY) hwy/aligned_allocator.cc hwy/nanobenchmark.cc hwy/per_target.cc + hwy/perf_counters.cc hwy/print.cc hwy/targets.cc hwy/timer.cc @@ -718,6 +719,7 @@ set(HWY_TEST_FILES hwy/bit_set_test.cc hwy/highway_test.cc hwy/nanobenchmark_test.cc + hwy/perf_counters_test.cc hwy/targets_test.cc hwy/examples/skeleton_test.cc hwy/tests/arithmetic_test.cc diff --git a/hwy/perf_counters.cc b/hwy/perf_counters.cc new file mode 100644 index 0000000000..bdd3dc27b8 --- /dev/null +++ b/hwy/perf_counters.cc @@ -0,0 +1,372 @@ +// Copyright 2024 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
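+// Implementation overview (descriptive summary of the code below): counters
+// are opened via perf_event_open and bin-packed into groups so that each
+// group can be read with a single syscall; when the kernel multiplexes groups
+// onto limited counter hardware, readings are extrapolated via
+// time_enabled / time_running.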
+ +#include "hwy/perf_counters.h" + +#include "hwy/detect_compiler_arch.h" // HWY_OS_LINUX + +#if HWY_OS_LINUX || HWY_IDE +#include +#include // open +#include +#include +#include +#include +#include // strcmp +#include +#include +#include // O_RDONLY +#include +#include +#include + +#include +#include + +#include "hwy/base.h" // HWY_ASSERT +#include "hwy/bit_set.h" +#include "hwy/timer.h" + +#endif // HWY_OS_LINUX || HWY_IDE + +namespace hwy { +namespace platform { + +#if HWY_OS_LINUX || HWY_IDE + +namespace { + +bool PerfCountersSupported() { + // This is the documented way. + struct stat s; + return stat("/proc/sys/kernel/perf_event_paranoid", &s) == 0; +} + +// If we detect Linux < 6.9 and AMD EPYC, use cycles instead of ref-cycles +// because the latter is not supported and returns 0, see +// https://lwn.net/Articles/967791/. +uint64_t RefCyclesOrCycles() { + const uint32_t ref_cycles = PERF_COUNT_HW_REF_CPU_CYCLES; + + utsname buf; + if (uname(&buf) != 0) return ref_cycles; + if (std::string(buf.sysname) != "Linux") return ref_cycles; + int major, minor; + if (sscanf(buf.release, "%d.%d", &major, &minor) != 2) return ref_cycles; + if (major > 6 || (major == 6 && minor >= 9)) return ref_cycles; + + // AMD Zen4 CPU + char cpu100[100]; + if (!GetCpuString(cpu100)) return ref_cycles; + if (std::string(cpu100).rfind("AMD EPYC", 0) != 0) return ref_cycles; + + return PERF_COUNT_HW_CPU_CYCLES; +} + +struct CounterConfig { // for perf_event_open + uint64_t config; + uint32_t type; + PerfCounters::Counter c; +}; + +std::vector AllCounterConfigs() { + constexpr uint32_t kHW = PERF_TYPE_HARDWARE; + constexpr uint32_t kSW = PERF_TYPE_SOFTWARE; + constexpr uint32_t kC = PERF_TYPE_HW_CACHE; + constexpr uint64_t kL3 = PERF_COUNT_HW_CACHE_LL; + constexpr uint64_t kLoad = uint64_t{PERF_COUNT_HW_CACHE_OP_READ} << 8; + constexpr uint64_t kStore = uint64_t{PERF_COUNT_HW_CACHE_OP_WRITE} << 8; + constexpr uint64_t kAcc = uint64_t{PERF_COUNT_HW_CACHE_RESULT_ACCESS} << 16; + + // Order is important for bin-packing event groups. x86 can only handle two + // LLC-related events per group, so spread them out and arrange SW events + // such that do not start a new group. This list of counters may change. + return {{RefCyclesOrCycles(), kHW, PerfCounters::kRefCycles}, + {PERF_COUNT_HW_INSTRUCTIONS, kHW, PerfCounters::kInstructions}, + {PERF_COUNT_SW_PAGE_FAULTS, kSW, PerfCounters::kPageFaults}, + {kL3 | kLoad | kAcc, kC, PerfCounters::kL3Loads}, + {kL3 | kStore | kAcc, kC, PerfCounters::kL3Stores}, + {PERF_COUNT_HW_BRANCH_INSTRUCTIONS, kHW, PerfCounters::kBranches}, + {PERF_COUNT_HW_BRANCH_MISSES, kHW, PerfCounters::kBranchMispredicts}, + // Second group: + {PERF_COUNT_HW_BUS_CYCLES, kHW, PerfCounters::kBusCycles}, + {PERF_COUNT_SW_CPU_MIGRATIONS, kSW, PerfCounters::kMigrations}, + {PERF_COUNT_HW_CACHE_REFERENCES, kHW, PerfCounters::kCacheRefs}, + {PERF_COUNT_HW_CACHE_MISSES, kHW, PerfCounters::kCacheMisses}}; +} + +size_t& PackedIdx(PerfCounters::Counter c) { + static size_t packed_idx[64]; + return packed_idx[static_cast(c)]; +} + +class PMU { + static perf_event_attr MakeAttr(const CounterConfig& cc) { + perf_event_attr attr = {}; + attr.type = cc.type; + attr.size = sizeof(attr); + attr.config = cc.config; + // We request more counters than the HW may support. If so, they are + // multiplexed and only active for a fraction of the runtime. Recording the + // times lets us extrapolate. GROUP enables a single syscall to reduce the + // cost of reading. 
+ attr.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | + PERF_FORMAT_TOTAL_TIME_RUNNING | PERF_FORMAT_GROUP; + // Do not set inherit=1 because that conflicts with PERF_FORMAT_GROUP. + // Do not set disable=1, so that perf_event_open verifies all events in the + // group can be scheduled together. + attr.exclude_kernel = 1; // required if perf_event_paranoid == 1 + attr.exclude_hv = 1; // = hypervisor + return attr; + } + + static int SysPerfEventOpen(const CounterConfig& cc, int leader_fd) { + perf_event_attr attr = MakeAttr(cc); + const int pid = 0; // current process (cannot also be -1) + const int cpu = -1; // any CPU + // Retry if interrupted by signals; this actually happens (b/64774091). + for (int retry = 0; retry < 10; ++retry) { + const int flags = 0; + const int fd = static_cast( + syscall(__NR_perf_event_open, &attr, pid, cpu, leader_fd, flags)); + if (!(fd == -1 && errno == EINTR)) return fd; + } + HWY_WARN("perf_event_open retries were insufficient."); + return -1; + } + + // Reads from `fd`; recovers from interruptions before/during the read. + static bool ReadBytes(int fd, ssize_t size, void* to) { + uint8_t* bytes = reinterpret_cast(to); + ssize_t pos = 0; + for (int retry = 0; retry < 10; ++retry) { + const ssize_t bytes_read = + read(fd, bytes + pos, static_cast(size - pos)); + if (HWY_UNLIKELY(bytes_read <= 0)) { + if (errno == EINTR) continue; + HWY_WARN("perf read() failed, errno %d.", errno); + return false; + } + pos += bytes_read; + HWY_ASSERT(pos <= size); + if (HWY_LIKELY(pos == size)) return true; // success + } + HWY_WARN("perf read() wanted %d bytes, got %d.", static_cast(size), + static_cast(pos)); + return false; + } + + // Array size in Buf; this is another upper bound on group size. It should be + // loose because it only wastes a bit of stack space, whereas an unnecessary + // extra group decreases coverage. Most HW supports 4-8 counters per group. + static constexpr size_t kMaxEventsPerGroup = PerfCounters::kCapacity; + +#pragma pack(push, 1) + struct Buf { + uint64_t num_events; + uint64_t time_enabled; + uint64_t time_running; + uint64_t values[kMaxEventsPerGroup]; + }; +#pragma pack(pop) + + // Returns false on error, otherwise sets `extrapolate` and `values`. + static bool ReadAndExtrapolate(int fd, size_t num_events, double& extrapolate, + double* HWY_RESTRICT values) { + Buf buf; + const ssize_t want_bytes = // size of var-len `Buf` + static_cast(24 + num_events * sizeof(uint64_t)); + if (HWY_UNLIKELY(!ReadBytes(fd, want_bytes, &buf))) return false; + + HWY_DASSERT(num_events == buf.num_events); + HWY_DASSERT(buf.time_running <= buf.time_enabled); + // If the group was not yet scheduled, we must avoid division by zero. + // In case counters were previously running and not reset, their current + // values may be nonzero. Returning zero could be interpreted as counters + // running backwards, so we instead treat this as a failure and mark the + // counters as invalid. + if (HWY_UNLIKELY(buf.time_running == 0)) return false; + + // Extrapolate each value. + extrapolate = static_cast(buf.time_enabled) / + static_cast(buf.time_running); + for (size_t i = 0; i < buf.num_events; ++i) { + values[i] = static_cast(buf.values[i]) * extrapolate; + } + return true; + } + + public: + bool Init() { + // Allow callers who do not know about each other to each call `Init`. + // If this already succeeded, we're done; if not, we will try again. 
+ if (HWY_UNLIKELY(!fds_.empty())) return true; + if (HWY_UNLIKELY(!PerfCountersSupported())) { + HWY_WARN( + "This Linux does not support perf counters. The program will" + "continue, but counters will return zero."); + return false; + } + + groups_.push_back(Group()); + fds_.reserve(PerfCounters::kCapacity); + + for (const CounterConfig& config : AllCounterConfigs()) { + // If the group is limited by our buffer size, add a new one. + if (HWY_UNLIKELY(groups_.back().num_events == kMaxEventsPerGroup)) { + groups_.push_back(Group()); + } + + int fd = SysPerfEventOpen(config, groups_.back().leader_fd); + // Retry in case the group is limited by HW capacity. Do not check + // errno because it is too inconsistent (ENOSPC, EINVAL, others?). + if (HWY_UNLIKELY(fd < 0)) { + fd = SysPerfEventOpen(config, /*leader_fd=*/-1); + if (fd >= 0 && groups_.back().num_events != 0) { + groups_.push_back(Group()); + } + } + + if (HWY_UNLIKELY(fd < 0)) { + HWY_WARN("perf_event_open %d errno %d for counter %s.", fd, errno, + PerfCounters::Name(config.c)); + } else { + // Add to group and set as leader if empty. + if (groups_.back().leader_fd == -1) { + groups_.back().leader_fd = fd; + + // Ensure the leader is not a SW event, because adding an HW + // event to a group with only SW events is slow, and starting + // with SW may trigger a bug, see + // https://lore.kernel.org/lkml/tip-a1150c202207cc8501bebc45b63c264f91959260@git.kernel.org/ + if (HWY_UNLIKELY(config.type == PERF_TYPE_SOFTWARE)) { + HWY_WARN("SW event %s should not be leader.", + PerfCounters::Name(config.c)); + } + } + + PackedIdx(config.c) = fds_.size(); + groups_.back().num_events += 1; + valid_.Set(static_cast(config.c)); + fds_.push_back(fd); + } + } + + // If no counters are available, remove the empty group. + if (HWY_UNLIKELY(fds_.empty())) { + HWY_ASSERT(groups_.size() == 1); + HWY_ASSERT(groups_.back().num_events == 0); + HWY_ASSERT(groups_.back().leader_fd == -1); + groups_.clear(); + } + + size_t num_valid = 0; + for (const Group& group : groups_) { + num_valid += group.num_events; + // All groups have a leader and are not empty. + HWY_ASSERT(group.leader_fd >= 0); + HWY_ASSERT(0 != group.num_events && + group.num_events <= kMaxEventsPerGroup); + } + // Total `num_events` matches `fds_` and `Valid()`. + HWY_ASSERT(num_valid == fds_.size()); + HWY_ASSERT(num_valid == valid_.Count()); + HWY_ASSERT(num_valid <= PerfCounters::kCapacity); + + if (num_valid) { + StopAllAndReset(); + return true; + } else { + HWY_WARN("No valid counters found."); + return true; + } + } + + bool StartAll() { + if (HWY_UNLIKELY(fds_.empty())) return false; + HWY_ASSERT(prctl(PR_TASK_PERF_EVENTS_ENABLE) == 0); + return true; + } + + void StopAllAndReset() { + HWY_ASSERT(prctl(PR_TASK_PERF_EVENTS_DISABLE) == 0); + for (int fd : fds_) { + HWY_ASSERT(ioctl(fd, PERF_EVENT_IOC_RESET, 0) == 0); + } + } + + // Returns false on error, otherwise sets `valid`, `max_extrapolate`, and + // `values`. + bool Read(BitSet64& valid, double& max_extrapolate, double* values) { + if (HWY_UNLIKELY(!valid_.Any())) return false; + + // Read all counters into buffer in the order in which they were opened. 
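+ // Each group contributes `num_events` consecutive values here; Init()
+ // recorded each counter's slot via PackedIdx, which IndexForCounter()
+ // later returns for lookups in Get().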
+ max_extrapolate = 1.0; + double* pos = values; + for (const Group& group : groups_) { + double extrapolate; + if (HWY_UNLIKELY(!ReadAndExtrapolate(group.leader_fd, group.num_events, + extrapolate, pos))) { + return false; + } + max_extrapolate = HWY_MAX(max_extrapolate, extrapolate); + pos += group.num_events; + } + + valid = valid_; + HWY_DASSERT(pos == values + valid.Count()); + return true; + } + + private: + std::vector fds_; // one per valid_ + BitSet64 valid_; + + struct Group { + size_t num_events = 0; + int leader_fd = -1; + }; + std::vector groups_; +}; + +// Monostate, see header. +PMU& GetPMU() { + static PMU pmu; + return pmu; +} + +} // namespace + +bool PerfCounters::Init() { return GetPMU().Init(); } +bool PerfCounters::StartAll() { return GetPMU().StartAll(); } +void PerfCounters::StopAllAndReset() { GetPMU().StopAllAndReset(); } +PerfCounters::PerfCounters() { + if (HWY_UNLIKELY(!GetPMU().Read(valid_, max_extrapolate_, values_))) { + valid_ = BitSet64(); + max_extrapolate_ = 0.0; + hwy::ZeroBytes(values_, sizeof(values_)); + } +} +size_t PerfCounters::IndexForCounter(Counter c) { return PackedIdx(c); } +#else +bool PerfCounters::Init() { return false; } +bool PerfCounters::StartAll() { return false; } +void PerfCounters::StopAllAndReset() {} +PerfCounters::PerfCounters() : max_extrapolate_(1.0), values_{0.0} {} +size_t PerfCounters::IndexForCounter(Counter) { return 0; } +#endif // HWY_OS_LINUX || HWY_IDE + +} // namespace platform +} // namespace hwy diff --git a/hwy/perf_counters.h b/hwy/perf_counters.h new file mode 100644 index 0000000000..ee68419ec9 --- /dev/null +++ b/hwy/perf_counters.h @@ -0,0 +1,156 @@ +// Copyright 2024 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HIGHWAY_HWY_PERF_COUNTERS_H_ +#define HIGHWAY_HWY_PERF_COUNTERS_H_ + +// Reads OS/CPU performance counters. + +#include + +#include "hwy/base.h" // HWY_ABORT +#include "hwy/bit_set.h" + +namespace hwy { +namespace platform { + +// Avoid padding in case callers such as profiler.h store many instances. +#pragma pack(push, 1) +// Provides access to CPU/OS performance counters. Each instance has space for +// multiple counter values; which counters these are may change in future. +// Although counters are per-CPU, Linux accesses them via a syscall, hence we +// use the monostate pattern to avoid callers having to pass around a pointer. +// Note that this is not thread-safe, so the static member functions should only +// be called from the main thread. +class PerfCounters { + public: + // Chosen such that this class occupies one or two cache lines. + static constexpr size_t kCapacity = 14; + + // Bit indices used to identify counters. The ordering is arbitrary. Some of + // these counters may be 'removed' in the sense of not being visited by + // `Foreach`, but their enumerators will remain. New counters may be appended. 
+ enum Counter { + kRefCycles = 0, + kInstructions, + kBranches, + kBranchMispredicts, + kBusCycles, + kCacheRefs, + kCacheMisses, + kL3Loads, + kL3Stores, + kPageFaults, // SW + kMigrations // SW + }; // BitSet64 requires these values to be less than 64. + + // Strings for user-facing messages, not used in the implementation. + static inline const char* Name(Counter c) { + switch (c) { + case kRefCycles: + return "ref_cycles"; + case kInstructions: + return "instructions"; + case kBranches: + return "branches"; + case kBranchMispredicts: + return "branch_mispredicts"; + case kBusCycles: + return "bus_cycles"; + case kCacheRefs: + return "cache_refs"; + case kCacheMisses: + return "cache_misses"; + case kL3Loads: + return "l3_load"; + case kL3Stores: + return "l3_store"; + case kPageFaults: + return "page_fault"; + case kMigrations: + return "migration"; + default: + HWY_ABORT("Bug: unknown counter %d", c); + } + } + + // Returns false if counters are unavailable. Must be called at least once + // before `StartAll`; it is separate to reduce the overhead of repeatedly + // stopping/starting counters. + static bool Init(); + + // Returns false if counters are unavailable, otherwise starts them. Note that + // they default to stopped. Unless this is called, the values read may be 0. + static bool StartAll(); + + // Stops and zeros all counters. This is not necessary if users subtract the + // previous counter values, but can increase precision because floating-point + // has more precision near zero. + static void StopAllAndReset(); + + // Reads the current (extrapolated, in case of multiplexing) counter values. + PerfCounters(); + + // Returns whether any counters were successfully read. + bool AnyValid() const { return valid_.Any(); } + + // Returns whether the given counter was successfully read. + bool IsValid(Counter c) const { + const size_t bit_idx = static_cast(c); + return valid_.Get(bit_idx); + } + + // Returns the maximum extrapolation factor for any counter, which is the + // total time between `StartAll` and now or the last `StopAllAndReset`, + // divided by the time that the counter was actually running. This + // approximates the number of counter groups that the CPU multiplexes onto the + // actual counter hardware. It is only meaningful if AnyValid(). + double MaxExtrapolate() const { return max_extrapolate_; } + + // Returns the value of the given counter, or zero if it is not valid. + double Get(Counter c) const { + return IsValid(c) ? values_[IndexForCounter(c)] : 0.0; + } + + // For each valid counter in increasing numerical order, calls `visitor` with + // the value and `Counter`. + template + void Foreach(const Visitor& visitor) { + valid_.Foreach([&](size_t bit_idx) { + const Counter c = static_cast(bit_idx); + visitor(values_[IndexForCounter(c)], c); + }); + } + + private: + // Index within `values_` for a given counter. + static size_t IndexForCounter(Counter c); + + BitSet64 valid_; + double max_extrapolate_; + // Floating-point because these are extrapolated (multiplexing). It would be + // nice for this to fit in one cache line to reduce the cost of reading + // counters in profiler.h, but some of the values are too large for float and + // we want more than 8 counters. Ensure all values are sums, not ratios, so + // that profiler.h can add/subtract them. These are contiguous in memory, in + // the order that counters were initialized. 
+ double values_[kCapacity]; +}; +#pragma pack(pop) + +} // namespace platform +} // namespace hwy + +#endif // HIGHWAY_HWY_PERF_COUNTERS_H_ diff --git a/hwy/perf_counters_test.cc b/hwy/perf_counters_test.cc new file mode 100644 index 0000000000..6137fb5a7e --- /dev/null +++ b/hwy/perf_counters_test.cc @@ -0,0 +1,158 @@ +// Copyright 2024 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/perf_counters.h" + +#include +#include +#include + +#include + +#include "hwy/nanobenchmark.h" // Unpredictable1 +#include "hwy/tests/hwy_gtest.h" +#include "hwy/tests/test_util-inl.h" +#include "hwy/timer-inl.h" +#include "hwy/timer.h" + +#if !HWY_OS_WIN +#include // usleep +#endif + +namespace hwy { +namespace { + +using ::hwy::platform::PerfCounters; + +void ReadAndPrint(uint64_t r, double* values) { + char cpu100[100]; + const bool have_stop = hwy::platform::HaveTimerStop(cpu100); + const uint64_t t0 = HWY_STATIC_DISPATCH(timer::Start()); + + PerfCounters counters; + const uint64_t t1 = have_stop ? HWY_STATIC_DISPATCH(timer::Stop()) + : HWY_STATIC_DISPATCH(timer::Start()); + const double elapsed_ns = + static_cast(t1 - t0) * 1E9 / platform::InvariantTicksPerSecond(); + fprintf(stderr, "r: %d, any valid %d extrapolate %f, overhead %.1f ns\n", + static_cast(r), counters.AnyValid(), counters.MaxExtrapolate(), + elapsed_ns); + + if (counters.AnyValid()) { + HWY_ASSERT(counters.MaxExtrapolate() >= 1.0); + } + + counters.Foreach([&counters, values](double val, PerfCounters::Counter c) { + HWY_ASSERT(counters.IsValid(c)); + fprintf(stderr, "%-20s: %.3E\n", PerfCounters::Name(c), val); + values[static_cast(c)] = val; + }); + PerfCounters::StopAllAndReset(); +} + +// Ensures a memory-intensive workload has high memory-related counters. +TEST(PerfCountersTest, TestMem) { + RandomState rng; + if (!PerfCounters::Init() || !PerfCounters::StartAll()) { + HWY_WARN("Perf counters unavailable, skipping test\n"); + return; + } + // Force L3 cache misses (loads). + std::vector big_array(128 * 1024 * 1024); + for (uint64_t& x : big_array) { + x = rng() & static_cast(hwy::Unpredictable1()); + } + const uint64_t r = big_array[rng() & 0xFFFF]; + + double values[64] = {0.0}; + ReadAndPrint(r, values); + + // Note that counters might not be available, and values differ considerably + // for debug/sanitizer builds. 
+ HWY_ASSERT(values[PerfCounters::kRefCycles] == 0.0 || + values[PerfCounters::kRefCycles] > 1E8); // 470M..9B + HWY_ASSERT(values[PerfCounters::kInstructions] == 0.0 || + values[PerfCounters::kInstructions] > 1E5); // 1.5M..10B + HWY_ASSERT(values[PerfCounters::kPageFaults] == 0.0 || + values[PerfCounters::kPageFaults] > 1); // 4..500K + HWY_ASSERT(values[PerfCounters::kBranches] == 0.0 || + values[PerfCounters::kBranches] > 1E6); + HWY_ASSERT(values[PerfCounters::kBranchMispredicts] < 1E7); // 273K..1M + + HWY_ASSERT(values[PerfCounters::kL3Loads] == 0.0 || + values[PerfCounters::kL3Loads] > 1000.0); // ~90K + HWY_ASSERT(values[PerfCounters::kL3Stores] == 0.0 || + values[PerfCounters::kL3Stores] > 1E3); // 9K..5M + + HWY_ASSERT(values[PerfCounters::kCacheRefs] == 0.0 || + values[PerfCounters::kCacheRefs] > 1E4); // 75K..66M + HWY_ASSERT(values[PerfCounters::kCacheMisses] == 0.0 || + values[PerfCounters::kCacheMisses] > 10.0); // 13..51M + HWY_ASSERT(values[PerfCounters::kBusCycles] == 0.0 || + values[PerfCounters::kBusCycles] > 1E7); // 82M +} + +// Ensures a branch-heavy workload has high branch-related counters and not +// too high memory-related counters. +TEST(PerfCountersTest, RunBranches) { + RandomState rng; + if (!PerfCounters::Init() || !PerfCounters::StartAll()) { + HWY_WARN("Perf counters unavailable, skipping test\n"); + return; + } + + // Branch-heavy, non-constexpr calculation so we see changes to counters. + const size_t iters = + static_cast(hwy::Unpredictable1()) * 100000 + (rng() & 1); + uint64_t r = rng(); + for (size_t i = 0; i < iters; ++i) { + if (PopCount(rng()) < 36) { + r += rng() & 0xFF; + } else { + // Entirely different operation to ensure there is a branch. + r >>= 1; + } +#if !HWY_OS_WIN + // Ensure test runs long enough for counter multiplexing to happen. + usleep(100); // NOLINT(runtime/sleep) +#endif + } + + double values[64] = {0.0}; + ReadAndPrint(r, values); + + // Note that counters might not be available, and values differ considerably + // for debug/sanitizer builds. + HWY_ASSERT(values[PerfCounters::kRefCycles] == 0.0 || + values[PerfCounters::kRefCycles] > 1E3); // 13K..18M + HWY_ASSERT(values[PerfCounters::kInstructions] == 0.0 || + values[PerfCounters::kInstructions] > 100.0); // 900..2M + HWY_ASSERT(values[PerfCounters::kBranches] == 0.0 || + values[PerfCounters::kBranches] > 100.0); // 1K..273K + HWY_ASSERT(values[PerfCounters::kBranchMispredicts] == 0 || + values[PerfCounters::kBranchMispredicts] > 10.0); // 65..5K + + HWY_ASSERT(values[PerfCounters::kL3Loads] < 1E7); // 174K..1M + HWY_ASSERT(values[PerfCounters::kL3Stores] < 1E6); // 44K..128K + HWY_ASSERT(values[PerfCounters::kCacheRefs] < 1E8); // 5M..27M + HWY_ASSERT(values[PerfCounters::kCacheMisses] < 1E8); // 500K..10M + HWY_ASSERT(values[PerfCounters::kBusCycles] < 1E10); // 1M..3B + HWY_ASSERT(values[PerfCounters::kPageFaults] < 100.0); // 0..12 +} + +} // namespace +} // namespace hwy + +HWY_TEST_MAIN(); From a076ade735e13b0dc9a20d9a63723d180151748d Mon Sep 17 00:00:00 2001 From: Jan Wassenberg Date: Thu, 12 Dec 2024 11:16:56 -0800 Subject: [PATCH 21/64] fix topology detection for some CPUs being offline (e.g. 
SMT off) PiperOrigin-RevId: 705569288 --- hwy/contrib/thread_pool/topology.cc | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/hwy/contrib/thread_pool/topology.cc b/hwy/contrib/thread_pool/topology.cc index 60ffbff829..8ab031db85 100644 --- a/hwy/contrib/thread_pool/topology.cc +++ b/hwy/contrib/thread_pool/topology.cc @@ -208,9 +208,9 @@ HWY_CONTRIB_DLLEXPORT size_t TotalLogicalProcessors() { total_lps += NumBits(p.GroupCount, p.GroupMask); }); #elif HWY_OS_LINUX - // Use configured, not "online" (_SC_NPROCESSORS_ONLN), because we want an - // upper bound. - const long ret = sysconf(_SC_NPROCESSORS_CONF); // NOLINT(runtime/int) + // Only check "online" because sysfs entries such as topology are missing for + // offline CPUs, which will cause `DetectPackages` to fail. + const long ret = sysconf(_SC_NPROCESSORS_ONLN); // NOLINT(runtime/int) if (ret < 0) { HWY_WARN("Unexpected _SC_NPROCESSORS_CONF = %d\n", static_cast(ret)); } else { @@ -527,7 +527,10 @@ std::vector DetectPackages(std::vector& lps) { Remapper packages; for (size_t lp = 0; lp < lps.size(); ++lp) { - if (!packages(kPackage, lp, &lps[lp].package)) return empty; + if (!packages(kPackage, lp, &lps[lp].package)) { + HWY_WARN("Failed to read sysfs package for LP %zu\n", lp); + return empty; + } } std::vector per_package(packages.Num()); HWY_ASSERT(!per_package.empty()); @@ -539,7 +542,10 @@ std::vector DetectPackages(std::vector& lps) { lps[lp].cluster = 0; } - if (!pp.cores(kCore, lp, &lps[lp].core)) return empty; + if (!pp.cores(kCore, lp, &lps[lp].core)) { + HWY_WARN("Failed to read sysfs core for LP %zu\n", lp); + return empty; + } // SMT ID is how many LP we have already seen assigned to the same core. HWY_ASSERT(lps[lp].core < kMaxLogicalProcessors); From eb4dc592c553ad90a2c2121e38fdb309f32a3494 Mon Sep 17 00:00:00 2001 From: John Platts Date: Fri, 13 Dec 2024 11:16:48 -0600 Subject: [PATCH 22/64] Fixes for ZSeries with GCC 9 or earlier or Clang 18 or earlier --- hwy/detect_targets.h | 27 ++++++++++++---- hwy/ops/ppc_vsx-inl.h | 71 +++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 89 insertions(+), 9 deletions(-) diff --git a/hwy/detect_targets.h b/hwy/detect_targets.h index 8b70c5b4b8..3f682178d8 100644 --- a/hwy/detect_targets.h +++ b/hwy/detect_targets.h @@ -294,15 +294,30 @@ #define HWY_BROKEN_LOONGARCH 0 #endif +#if HWY_ARCH_S390X +#if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1900 +// Clang 18 and earlier have bugs with some ZVector intrinsics +#define HWY_BROKEN_Z14 (HWY_Z14 | HWY_Z15) +#elif HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 900 +// Z15 target requires GCC 9 or later +#define HWY_BROKEN_Z14 (HWY_Z15) +#else +#define HWY_BROKEN_Z14 0 +#endif +#else // !HWY_ARCH_S390X +#define HWY_BROKEN_Z14 0 +#endif // HWY_ARCH_S390X + // Allow the user to override this without any guarantee of success. 
#ifndef HWY_BROKEN_TARGETS -#define HWY_BROKEN_TARGETS \ - (HWY_BROKEN_CLANG6 | HWY_BROKEN_32BIT | HWY_BROKEN_MSVC | \ - HWY_BROKEN_AVX3_DL_ZEN4 | HWY_BROKEN_AVX3_SPR | \ - HWY_BROKEN_ARM7_BIG_ENDIAN | HWY_BROKEN_ARM7_WITHOUT_VFP4 | \ - HWY_BROKEN_NEON_BF16 | HWY_BROKEN_SVE | HWY_BROKEN_PPC10 | \ - HWY_BROKEN_PPC_32BIT | HWY_BROKEN_RVV | HWY_BROKEN_LOONGARCH) +#define HWY_BROKEN_TARGETS \ + (HWY_BROKEN_CLANG6 | HWY_BROKEN_32BIT | HWY_BROKEN_MSVC | \ + HWY_BROKEN_AVX3_DL_ZEN4 | HWY_BROKEN_AVX3_SPR | \ + HWY_BROKEN_ARM7_BIG_ENDIAN | HWY_BROKEN_ARM7_WITHOUT_VFP4 | \ + HWY_BROKEN_NEON_BF16 | HWY_BROKEN_SVE | HWY_BROKEN_PPC10 | \ + HWY_BROKEN_PPC_32BIT | HWY_BROKEN_RVV | HWY_BROKEN_LOONGARCH | \ + HWY_BROKEN_Z14) #endif // HWY_BROKEN_TARGETS diff --git a/hwy/ops/ppc_vsx-inl.h b/hwy/ops/ppc_vsx-inl.h index 3c285c863a..86d6d98c39 100644 --- a/hwy/ops/ppc_vsx-inl.h +++ b/hwy/ops/ppc_vsx-inl.h @@ -878,10 +878,47 @@ HWY_API VI TableLookupBytesOr0(const V bytes, const VI from) { } // ------------------------------ Reverse +#if HWY_S390X_HAVE_Z14 && HWY_COMPILER_GCC_ACTUAL && \ + HWY_COMPILER_GCC_ACTUAL < 900 +// Workaround for missing vec_reve on Z14 with GCC 8 or earlier +template , HWY_IF_LANES_GT_D(D, 1), + HWY_IF_T_SIZE_D(D, 1)> +HWY_API Vec128 Reverse(D d, Vec128 v) { + const Repartition du8; + return TableLookupBytes( + v, BitCast(d, Dup128VecFromValues(du8, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, + 5, 4, 3, 2, 1, 0))); +} + +template , HWY_IF_LANES_GT_D(D, 1), + HWY_IF_T_SIZE_D(D, 2)> +HWY_API Vec128 Reverse(D d, Vec128 v) { + const Repartition du8; + return TableLookupBytes( + v, BitCast(d, Dup128VecFromValues(du8, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, + 4, 5, 2, 3, 0, 1))); +} + +template , HWY_IF_LANES_GT_D(D, 1), + HWY_IF_T_SIZE_D(D, 4)> +HWY_API Vec128 Reverse(D d, Vec128 v) { + const Repartition du8; + return TableLookupBytes( + v, BitCast(d, Dup128VecFromValues(du8, 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, + 6, 7, 0, 1, 2, 3))); +} + +template , HWY_IF_LANES_GT_D(D, 1), + HWY_IF_T_SIZE_D(D, 8)> +HWY_API Vec128 Reverse(D /* tag */, Vec128 v) { + return Vec128{vec_sld(v.raw, v.raw, 8)}; +} +#else template , HWY_IF_LANES_GT_D(D, 1)> HWY_API Vec128 Reverse(D /* tag */, Vec128 v) { return Vec128{vec_reve(v.raw)}; } +#endif // ------------------------------ Shuffles (Reverse) @@ -2543,8 +2580,10 @@ HWY_API Vec32 Reverse(D d, Vec32 v) { // ------------------------------- ReverseLaneBytes -#if (HWY_PPC_HAVE_9 || HWY_S390X_HAVE_Z14) && \ - (HWY_COMPILER_GCC_ACTUAL >= 710 || HWY_COMPILER_CLANG >= 400) +#if (HWY_PPC_HAVE_9 || HWY_S390X_HAVE_Z14) && \ + ((!HWY_S390X_HAVE_Z14 && HWY_COMPILER_GCC_ACTUAL >= 710) || \ + (HWY_S390X_HAVE_Z14 && HWY_COMPILER_GCC_ACTUAL >= 900) || \ + HWY_COMPILER_CLANG >= 400) // Per-target flag to prevent generic_ops-inl.h defining 8-bit ReverseLaneBytes. 
#ifdef HWY_NATIVE_REVERSE_LANE_BYTES @@ -3651,6 +3690,10 @@ HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { const __vector float raw_v = InterleaveLower(v, v).raw; #if HWY_IS_LITTLE_ENDIAN return VFromD{vec_doubleo(raw_v)}; +#elif HWY_S390X_HAVE_Z14 && HWY_COMPILER_GCC_ACTUAL && \ + HWY_COMPILER_GCC_ACTUAL < 1000 + // Workaround for compiler errors with GCC 9 or earlier on Z14 + return VFromD{__builtin_s390_vflls(raw_v)}; #else return VFromD{vec_doublee(raw_v)}; #endif @@ -3788,6 +3831,10 @@ HWY_API VFromD PromoteUpperTo(D /*tag*/, Vec128 v) { const __vector float raw_v = InterleaveUpper(Full128(), v, v).raw; #if HWY_IS_LITTLE_ENDIAN return VFromD{vec_doubleo(raw_v)}; +#elif HWY_S390X_HAVE_Z14 && HWY_COMPILER_GCC_ACTUAL && \ + HWY_COMPILER_GCC_ACTUAL < 1000 + // Workaround for compiler error with GCC 9 or earlier on Z14 + return VFromD{__builtin_s390_vflls(raw_v)}; #else return VFromD{vec_doublee(raw_v)}; #endif @@ -4409,12 +4456,22 @@ HWY_API VFromD OrderedDemote2To(D d, V a, V b) { template HWY_API Vec32 DemoteTo(D /* tag */, Vec64 v) { +#if HWY_S390X_HAVE_Z14 && HWY_COMPILER_GCC_ACTUAL && \ + HWY_COMPILER_GCC_ACTUAL < 1000 + // Workaround for compiler error with GCC 9 or earlier on Z14 + return Vec32{__builtin_s390_vflrd(v.raw, 0, 0)}; +#else return Vec32{vec_floate(v.raw)}; +#endif } template HWY_API Vec64 DemoteTo(D d, Vec128 v) { -#if HWY_S390X_HAVE_Z14 || HWY_IS_LITTLE_ENDIAN +#if HWY_S390X_HAVE_Z14 && HWY_COMPILER_GCC_ACTUAL && \ + HWY_COMPILER_GCC_ACTUAL < 1000 + // Workaround for compiler error with GCC 9 or earlier on Z14 + const Vec128 f64_to_f32{__builtin_s390_vflrd(v.raw, 0, 0)}; +#elif HWY_S390X_HAVE_Z14 || HWY_IS_LITTLE_ENDIAN const Vec128 f64_to_f32{vec_floate(v.raw)}; #else const Vec128 f64_to_f32{vec_floato(v.raw)}; @@ -4599,8 +4656,16 @@ template ConvertTo(D df32, Vec128 v) { const RepartitionToWide df64; +#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1000 + // Workaround for compiler error with GCC 9 or earlier on Z14 + const VFromD vf32_lo{ + __builtin_s390_vflrd(PromoteLowerTo(df64, v).raw, 0, 0)}; + const VFromD vf32_hi{ + __builtin_s390_vflrd(PromoteUpperTo(df64, v).raw, 0, 0)}; +#else const VFromD vf32_lo{vec_floate(PromoteLowerTo(df64, v).raw)}; const VFromD vf32_hi{vec_floate(PromoteUpperTo(df64, v).raw)}; +#endif return ConcatEven(df32, vf32_hi, vf32_lo); } #else // Z15 or PPC From 49674e10270b7061e25e95f1688ab2e7f9eba412 Mon Sep 17 00:00:00 2001 From: Jan Wassenberg Date: Mon, 16 Dec 2024 10:53:16 -0800 Subject: [PATCH 23/64] update thresholds for test failure PiperOrigin-RevId: 706764322 --- hwy/perf_counters_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hwy/perf_counters_test.cc b/hwy/perf_counters_test.cc index 6137fb5a7e..28b2225507 100644 --- a/hwy/perf_counters_test.cc +++ b/hwy/perf_counters_test.cc @@ -148,7 +148,7 @@ TEST(PerfCountersTest, RunBranches) { HWY_ASSERT(values[PerfCounters::kL3Stores] < 1E6); // 44K..128K HWY_ASSERT(values[PerfCounters::kCacheRefs] < 1E8); // 5M..27M HWY_ASSERT(values[PerfCounters::kCacheMisses] < 1E8); // 500K..10M - HWY_ASSERT(values[PerfCounters::kBusCycles] < 1E10); // 1M..3B + HWY_ASSERT(values[PerfCounters::kBusCycles] < 1E11); // 1M..10B HWY_ASSERT(values[PerfCounters::kPageFaults] < 100.0); // 0..12 } From 5cde138f2eb5adc2c48b3965ade527276dade891 Mon Sep 17 00:00:00 2001 From: Jan Wassenberg Date: Mon, 16 Dec 2024 11:58:44 -0800 Subject: [PATCH 24/64] fix build in case building for loongarch already (not yet supported) PiperOrigin-RevId: 706786560 --- hwy/detect_targets.h | 
7 +++++++ 1 file changed, 7 insertions(+) diff --git a/hwy/detect_targets.h b/hwy/detect_targets.h index 3f682178d8..8c48293c12 100644 --- a/hwy/detect_targets.h +++ b/hwy/detect_targets.h @@ -647,6 +647,13 @@ //------------------------------------------------------------------------------ // Choose targets for dynamic dispatch according to one of four policies +// TODO: remove once HWY_LSX is actually supported +#if HWY_ARCH_LOONGARCH +#undef HWY_COMPILE_ONLY_STATIC +#undef HWY_COMPILE_ONLY_EMU128 +#define HWY_COMPILE_ONLY_SCALAR +#endif + #if 1 < (defined(HWY_COMPILE_ONLY_SCALAR) + defined(HWY_COMPILE_ONLY_EMU128) + \ defined(HWY_COMPILE_ONLY_STATIC)) #error "Can only define one of HWY_COMPILE_ONLY_{SCALAR|EMU128|STATIC} - bug?" From 09f8b6eaeb5338287fe88c16ba3b9124d5dab7b9 Mon Sep 17 00:00:00 2001 From: Eugene Ostroukhov Date: Wed, 18 Dec 2024 20:13:28 -0800 Subject: [PATCH 25/64] Make tests runnable with Bazel8 Add an alias for googletest in the MODULE.bazel. Also, added Bazel dirs into .gitignore. --- .gitignore | 6 ++++++ MODULE.bazel | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 2e264881af..34e5ded557 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,9 @@ +build +bazel-bin +bazel-highway +bazel-out +bazel-testlogs +MODULE.bazel.lock docs/g3doc/* docs/html/* docs/md/* diff --git a/MODULE.bazel b/MODULE.bazel index 1cc76ad269..e222c18c25 100644 --- a/MODULE.bazel +++ b/MODULE.bazel @@ -4,7 +4,7 @@ module( ) bazel_dep(name = "bazel_skylib", version = "1.6.1") -bazel_dep(name = "googletest", version = "1.15.2") +bazel_dep(name = "googletest", version = "1.15.2", repo_name = "com_google_googletest") bazel_dep(name = "rules_cc", version = "0.0.9") bazel_dep(name = "rules_license", version = "0.0.7") bazel_dep(name = "platforms", version = "0.0.10") From e8b3825418ba0ffe82a546cf1c8a948837dd7539 Mon Sep 17 00:00:00 2001 From: Eugene Ostroukhov Date: Wed, 18 Dec 2024 20:29:59 -0800 Subject: [PATCH 26/64] Unroller: allow const input --- hwy/contrib/unroller/unroller-inl.h | 29 +++++++++++++++------------ hwy/contrib/unroller/unroller_test.cc | 7 ++++--- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/hwy/contrib/unroller/unroller-inl.h b/hwy/contrib/unroller/unroller-inl.h index 4ed8c25c07..e5d9661b65 100644 --- a/hwy/contrib/unroller/unroller-inl.h +++ b/hwy/contrib/unroller/unroller-inl.h @@ -63,11 +63,11 @@ struct UnrollerUnit { Y_VEC YInitImpl() { return hn::Zero(d_out); } - X_VEC Load(const ptrdiff_t idx, IN_T* from) { + X_VEC Load(const ptrdiff_t idx, const IN_T* from) { return me()->LoadImpl(idx, from); } - X_VEC LoadImpl(const ptrdiff_t idx, IN_T* from) { + X_VEC LoadImpl(const ptrdiff_t idx, const IN_T* from) { return hn::LoadU(d_in, from + idx); } @@ -77,11 +77,13 @@ struct UnrollerUnit { // | o | o | o | x | x | x | x | x | // example places = -3 // | x | x | x | x | x | o | o | o | - X_VEC MaskLoad(const ptrdiff_t idx, IN_T* from, const ptrdiff_t places) { + X_VEC MaskLoad(const ptrdiff_t idx, const IN_T* from, + const ptrdiff_t places) { return me()->MaskLoadImpl(idx, from, places); } - X_VEC MaskLoadImpl(const ptrdiff_t idx, IN_T* from, const ptrdiff_t places) { + X_VEC MaskLoadImpl(const ptrdiff_t idx, const IN_T* from, + const ptrdiff_t places) { auto mask = hn::FirstN(d_in, static_cast(places)); auto maskneg = hn::Not(hn::FirstN( d_in, @@ -181,19 +183,19 @@ struct UnrollerUnit2D { Y_VEC YInitImpl() { return hn::Zero(d_out); } - X0_VEC Load0(const ptrdiff_t idx, IN0_T* from) { + X0_VEC Load0(const ptrdiff_t 
idx, const IN0_T* from) { return me()->Load0Impl(idx, from); } - X0_VEC Load0Impl(const ptrdiff_t idx, IN0_T* from) { + X0_VEC Load0Impl(const ptrdiff_t idx, const IN0_T* from) { return hn::LoadU(d_in0, from + idx); } - X1_VEC Load1(const ptrdiff_t idx, IN1_T* from) { + X1_VEC Load1(const ptrdiff_t idx, const IN1_T* from) { return me()->Load1Impl(idx, from); } - X1_VEC Load1Impl(const ptrdiff_t idx, IN1_T* from) { + X1_VEC Load1Impl(const ptrdiff_t idx, const IN1_T* from) { return hn::LoadU(d_in1, from + idx); } @@ -203,11 +205,12 @@ struct UnrollerUnit2D { // | o | o | o | x | x | x | x | x | // example places = -3 // | x | x | x | x | x | o | o | o | - X0_VEC MaskLoad0(const ptrdiff_t idx, IN0_T* from, const ptrdiff_t places) { + X0_VEC MaskLoad0(const ptrdiff_t idx, const IN0_T* from, + const ptrdiff_t places) { return me()->MaskLoad0Impl(idx, from, places); } - X0_VEC MaskLoad0Impl(const ptrdiff_t idx, IN0_T* from, + X0_VEC MaskLoad0Impl(const ptrdiff_t idx, const IN0_T* from, const ptrdiff_t places) { auto mask = hn::FirstN(d_in0, static_cast(places)); auto maskneg = hn::Not(hn::FirstN( @@ -218,12 +221,12 @@ struct UnrollerUnit2D { return hn::MaskedLoad(mask, d_in0, from + idx); } - hn::Vec MaskLoad1(const ptrdiff_t idx, IN1_T* from, + hn::Vec MaskLoad1(const ptrdiff_t idx, const IN1_T* from, const ptrdiff_t places) { return me()->MaskLoad1Impl(idx, from, places); } - hn::Vec MaskLoad1Impl(const ptrdiff_t idx, IN1_T* from, + hn::Vec MaskLoad1Impl(const ptrdiff_t idx, const IN1_T* from, const ptrdiff_t places) { auto mask = hn::FirstN(d_in1, static_cast(places)); auto maskneg = hn::Not(hn::FirstN( @@ -284,7 +287,7 @@ struct UnrollerUnit2D { }; template -inline void Unroller(FUNC& f, IN_T* HWY_RESTRICT x, OUT_T* HWY_RESTRICT y, +inline void Unroller(FUNC& f, const IN_T* HWY_RESTRICT x, OUT_T* HWY_RESTRICT y, const ptrdiff_t n) { auto xx = f.X0Init(); auto yy = f.YInit(); diff --git a/hwy/contrib/unroller/unroller_test.cc b/hwy/contrib/unroller/unroller_test.cc index 7a13825dda..2879eb2aa6 100644 --- a/hwy/contrib/unroller/unroller_test.cc +++ b/hwy/contrib/unroller/unroller_test.cc @@ -148,7 +148,7 @@ struct FindUnit : UnrollerUnit, T, MakeSigned> { hn::Vec YInitImpl() { return hn::Set(di, TI{-1}); } - hn::Vec MaskLoadImpl(const ptrdiff_t idx, T* from, + hn::Vec MaskLoadImpl(const ptrdiff_t idx, const T* from, const ptrdiff_t places) { auto mask = hn::FirstN(d, static_cast(places)); auto maskneg = hn::Not(hn::FirstN( @@ -236,7 +236,7 @@ struct MinUnit : UnrollerUnit, T, T> { hn::Vec YInitImpl() { return hn::Set(d, HighestValue()); } - hn::Vec MaskLoadImpl(const ptrdiff_t idx, T* from, + hn::Vec MaskLoadImpl(const ptrdiff_t idx, const T* from, const ptrdiff_t places) { auto mask = hn::FirstN(d, static_cast(places)); auto maskneg = hn::Not(hn::FirstN( @@ -452,7 +452,8 @@ struct TestFind { FindUnit cvtfn(ConvertScalarTo(num - 1)); MakeSigned idx = 0; - Unroller(cvtfn, a, &idx, static_cast(num)); + Unroller(cvtfn, const_cast(a), &idx, + static_cast(num)); HWY_ASSERT(static_cast>(idx) < num); HWY_ASSERT(a[idx] == ConvertScalarTo(num - 1)); From 6e6a4295ea0d52fd000061a74feb2b19330a5fd3 Mon Sep 17 00:00:00 2001 From: Yevgen Ostroukhov Date: Thu, 19 Dec 2024 09:41:03 -0800 Subject: [PATCH 27/64] Make the intention behind test more explicit --- hwy/contrib/unroller/unroller_test.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/hwy/contrib/unroller/unroller_test.cc b/hwy/contrib/unroller/unroller_test.cc index 2879eb2aa6..50f26719db 100644 --- 
a/hwy/contrib/unroller/unroller_test.cc +++ b/hwy/contrib/unroller/unroller_test.cc @@ -452,8 +452,9 @@ struct TestFind { FindUnit cvtfn(ConvertScalarTo(num - 1)); MakeSigned idx = 0; - Unroller(cvtfn, const_cast(a), &idx, - static_cast(num)); + // Explicitly test input can be const + const T* const_a = a; + Unroller(cvtfn, const_a, &idx, static_cast(num)); HWY_ASSERT(static_cast>(idx) < num); HWY_ASSERT(a[idx] == ConvertScalarTo(num - 1)); From 9aa447ec5eb79af5ab9e583517c4b2e046a0cd09 Mon Sep 17 00:00:00 2001 From: Jan Wassenberg Date: Fri, 20 Dec 2024 06:35:00 -0800 Subject: [PATCH 28/64] update test thresholds PiperOrigin-RevId: 708306606 --- hwy/perf_counters_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hwy/perf_counters_test.cc b/hwy/perf_counters_test.cc index 28b2225507..d81710d236 100644 --- a/hwy/perf_counters_test.cc +++ b/hwy/perf_counters_test.cc @@ -99,7 +99,7 @@ TEST(PerfCountersTest, TestMem) { HWY_ASSERT(values[PerfCounters::kCacheRefs] == 0.0 || values[PerfCounters::kCacheRefs] > 1E4); // 75K..66M HWY_ASSERT(values[PerfCounters::kCacheMisses] == 0.0 || - values[PerfCounters::kCacheMisses] > 10.0); // 13..51M + values[PerfCounters::kCacheMisses] > 1.0); // 10..51M HWY_ASSERT(values[PerfCounters::kBusCycles] == 0.0 || values[PerfCounters::kBusCycles] > 1E7); // 82M } From e892ab40a9e1b6d7ca392a3255212d632cc1ff7b Mon Sep 17 00:00:00 2001 From: John Platts Date: Tue, 31 Dec 2024 14:17:15 -0600 Subject: [PATCH 29/64] Fixes to RVV Concat/Combine ops --- hwy/ops/rvv-inl.h | 124 +++++++++++++++++++++++++++++----------------- 1 file changed, 79 insertions(+), 45 deletions(-) diff --git a/hwy/ops/rvv-inl.h b/hwy/ops/rvv-inl.h index bcd850d60f..31232f7ee9 100644 --- a/hwy/ops/rvv-inl.h +++ b/hwy/ops/rvv-inl.h @@ -3228,67 +3228,101 @@ Get(D d, VFromD v) { } } -#define HWY_RVV_SET(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ - MLEN, NAME, OP) \ - template \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ - NAME(HWY_RVV_V(BASE, SEW, LMUL) dest, HWY_RVV_V(BASE, SEW, LMULH) v) { \ - return __riscv_v##OP##_v_##CHAR##SEW##LMULH##_##CHAR##SEW##LMUL( \ - dest, kIndex, v); /* no AVL */ \ +#define HWY_RVV_PARTIAL_VEC_SET_HALF(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \ + LMULH, SHIFT, MLEN, NAME, OP) \ + template \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_V(BASE, SEW, LMUL) dest, HWY_RVV_V(BASE, SEW, LMULH) v, \ + size_t half_N) { \ + static_assert(kIndex == 0 || kIndex == 1, "kIndex must be 0 or 1"); \ + const DFromV d; \ + HWY_IF_CONSTEXPR(kIndex == 0) { \ + return __riscv_v##OP##_v_v_##CHAR##SEW##LMUL##_tu(dest, Ext(d, v), \ + half_N); \ + } \ + else { \ + return SlideUp(dest, Ext(d, v), half_N); \ + } \ } -#define HWY_RVV_SET_VIRT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ - SHIFT, MLEN, NAME, OP) \ - template \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ - NAME(HWY_RVV_V(BASE, SEW, LMUL) dest, HWY_RVV_V(BASE, SEW, LMULH) v) { \ - static_assert(kIndex == 0 || kIndex == 1, "kIndex must be 0 or 1"); \ - auto d = HWY_RVV_D(BASE, SEW, HWY_LANES(HWY_RVV_T(BASE, SEW)), SHIFT){}; \ - auto df2 = \ - HWY_RVV_D(BASE, SEW, HWY_LANES(HWY_RVV_T(BASE, SEW)), SHIFT - 1){}; \ - HWY_IF_CONSTEXPR(kIndex == 0) { \ - return __riscv_vmv_v_v_##CHAR##SEW##LMUL##_tu(dest, Ext(d, v), \ - Lanes(df2)); \ - } \ - else { \ - return SlideUp(dest, Ext(d, v), Lanes(df2)); \ - } \ +#define HWY_RVV_PARTIAL_VEC_SET_HALF_SMALLEST( \ + BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP) \ + template \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + 
NAME(HWY_RVV_V(BASE, SEW, LMUL) dest, HWY_RVV_V(BASE, SEW, LMUL) v, \ + size_t half_N) { \ + static_assert(kIndex == 0 || kIndex == 1, "kIndex must be 0 or 1"); \ + HWY_IF_CONSTEXPR(kIndex == 0) { \ + return __riscv_v##OP##_v_v_##CHAR##SEW##LMUL##_tu(dest, v, half_N); \ + } \ + else { \ + return SlideUp(dest, v, half_N); \ + } \ } -#define HWY_RVV_SET_SMALLEST(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ - SHIFT, MLEN, NAME, OP) \ - template \ +HWY_RVV_FOREACH(HWY_RVV_PARTIAL_VEC_SET_HALF, PartialVecSetHalf, mv, _GET_SET) +HWY_RVV_FOREACH(HWY_RVV_PARTIAL_VEC_SET_HALF, PartialVecSetHalf, mv, + _GET_SET_VIRT) +HWY_RVV_FOREACH(HWY_RVV_PARTIAL_VEC_SET_HALF_SMALLEST, PartialVecSetHalf, mv, + _GET_SET_SMALLEST) +#undef HWY_RVV_PARTIAL_VEC_SET_HALF +#undef HWY_RVV_PARTIAL_VEC_SET_HALF_SMALLEST + +#define HWY_RVV_SET(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ + MLEN, NAME, OP) \ + template \ HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ - NAME(HWY_RVV_V(BASE, SEW, LMUL) dest, HWY_RVV_V(BASE, SEW, LMUL) v) { \ - static_assert(kIndex == 0 || kIndex == 1, "kIndex must be 0 or 1"); \ - auto d = HWY_RVV_D(BASE, SEW, HWY_LANES(HWY_RVV_T(BASE, SEW)), SHIFT){}; \ - HWY_IF_CONSTEXPR(kIndex == 0) { \ - return __riscv_vmv_v_v_##CHAR##SEW##LMUL##_tu(dest, v, Lanes(d) / 2); \ + NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(BASE, SEW, LMUL) dest, \ + HWY_RVV_V(BASE, SEW, LMULH) v) { \ + HWY_IF_CONSTEXPR(detail::IsFull(d)) { \ + return __riscv_v##OP##_v_##CHAR##SEW##LMULH##_##CHAR##SEW##LMUL( \ + dest, kIndex, v); /* no AVL */ \ } \ else { \ - return SlideUp(dest, v, Lanes(d) / 2); \ + const Half dh; \ + return PartialVecSetHalf(dest, v, Lanes(dh)); \ } \ } +#define HWY_RVV_SET_VIRT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ + SHIFT, MLEN, NAME, OP) \ + template \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(BASE, SEW, LMUL) dest, \ + HWY_RVV_V(BASE, SEW, LMULH) v) { \ + const Half dh; \ + return PartialVecSetHalf(dest, v, Lanes(dh)); \ + } +#define HWY_RVV_SET_SMALLEST(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ + SHIFT, MLEN, NAME, OP) \ + template \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(BASE, SEW, LMUL) dest, \ + HWY_RVV_V(BASE, SEW, LMUL) v) { \ + return PartialVecSetHalf(dest, v, Lanes(d) / 2); \ + } +#define HWY_RVV_SET_SMALLEST_VIRT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \ + LMULH, SHIFT, MLEN, NAME, OP) \ + template \ + HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ + NAME(HWY_RVV_D(BASE, SEW, N, SHIFT - 1) d, \ + HWY_RVV_V(BASE, SEW, LMUL) dest, HWY_RVV_V(BASE, SEW, LMUL) v) { \ + return PartialVecSetHalf(dest, v, Lanes(d) / 2); \ + } HWY_RVV_FOREACH(HWY_RVV_SET, Set, set, _GET_SET) HWY_RVV_FOREACH(HWY_RVV_SET_VIRT, Set, set, _GET_SET_VIRT) HWY_RVV_FOREACH(HWY_RVV_SET_SMALLEST, Set, set, _GET_SET_SMALLEST) +HWY_RVV_FOREACH_UI163264(HWY_RVV_SET_SMALLEST_VIRT, Set, set, _GET_SET_SMALLEST) +HWY_RVV_FOREACH_F(HWY_RVV_SET_SMALLEST_VIRT, Set, set, _GET_SET_SMALLEST) #undef HWY_RVV_SET #undef HWY_RVV_SET_VIRT #undef HWY_RVV_SET_SMALLEST +#undef HWY_RVV_SET_SMALLEST_VIRT -template +template static HWY_INLINE HWY_MAYBE_UNUSED VFromD Set( D d, VFromD dest, VFromD>> v) { - static_assert(kIndex == 0 || kIndex == 1, "kIndex must be 0 or 1"); - - const AdjustSimdTagToMinVecPow2> dh; - HWY_IF_CONSTEXPR(kIndex == 0 || detail::IsFull(d)) { - (void)dh; - return Set(dest, v); - } - else { - const size_t slide_up_amt = - (dh.Pow2() < DFromV().Pow2()) ? 
Lanes(dh) : (Lanes(d) / 2); - return SlideUp(dest, ResizeBitCast(d, v), slide_up_amt); - } + const RebindToUnsigned du; + return BitCast( + d, Set(du, BitCast(du, dest), + BitCast(RebindToUnsigned>(), v))); } } // namespace detail From 3a28dcb2951fe44bcc46bd5743905490962e15db Mon Sep 17 00:00:00 2001 From: Chris Sidebottom Date: Thu, 2 Jan 2025 18:00:17 +0000 Subject: [PATCH 30/64] Add VQSORT_COMPILER_COMPATIBLE, split from VQSORT_ENABLED This allows us to check whether our compiler is compatible with VQSort during build of our dynamic dispatch sources --- hwy/contrib/sort/shared-inl.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/hwy/contrib/sort/shared-inl.h b/hwy/contrib/sort/shared-inl.h index f63072ad67..8559c30d93 100644 --- a/hwy/contrib/sort/shared-inl.h +++ b/hwy/contrib/sort/shared-inl.h @@ -1,5 +1,7 @@ // Copyright 2021 Google LLC +// Copyright 2025 Arm Limited and/or its affiliates // SPDX-License-Identifier: Apache-2.0 +// SPDX-License-Identifier: BSD-3-Clause // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -128,10 +130,17 @@ static_assert(SortConstants::MaxBufBytes<2>(64) <= 1664, "Unexpectedly high"); // due to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97696. Armv8 Clang // hwasan/msan/tsan/asan also fail to build SVE (b/335157772). #undef VQSORT_ENABLED -#if (HWY_TARGET == HWY_SCALAR) || \ - (HWY_COMPILER_MSVC && !HWY_IS_DEBUG_BUILD) || \ +#undef VQSORT_COMPILER_COMPATIBLE + +#if (HWY_COMPILER_MSVC && !HWY_IS_DEBUG_BUILD) || \ (HWY_ARCH_ARM_V7 && HWY_IS_DEBUG_BUILD) || \ (HWY_ARCH_ARM_A64 && HWY_COMPILER_GCC_ACTUAL && HWY_IS_ASAN) +#define VQSORT_COMPILER_COMPATIBLE 0 +#else +#define VQSORT_COMPILER_COMPATIBLE 1 +#endif + +#if (HWY_TARGET == HWY_SCALAR) || !VQSORT_COMPILER_COMPATIBLE #define VQSORT_ENABLED 0 #else #define VQSORT_ENABLED 1 From fdfce1f77c41013140b6db86d246426283aa015b Mon Sep 17 00:00:00 2001 From: Evgenii Kliuchnikov Date: Tue, 7 Jan 2025 02:45:49 -0800 Subject: [PATCH 31/64] fix warnings "unused parameter 'd'" PiperOrigin-RevId: 712837245 --- hwy/ops/wasm_128-inl.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hwy/ops/wasm_128-inl.h b/hwy/ops/wasm_128-inl.h index d34a90dbc9..97d44bbce0 100644 --- a/hwy/ops/wasm_128-inl.h +++ b/hwy/ops/wasm_128-inl.h @@ -4984,7 +4984,7 @@ HWY_API uint64_t BitsFromMask(D d, const MFromD mask) { } template -HWY_API uint64_t BitsFromMask(D d, const MFromD mask) { +HWY_API uint64_t BitsFromMask(D /*d*/, const MFromD mask) { // Remove useless lower half of each u16 while preserving the sign bit. 
const Rebind d8; using M8 = MFromD; @@ -5065,7 +5065,7 @@ HWY_API size_t CountTrue(D d, const MFromD m) { return PopCount(BitsFromMask(d, m)); } template -HWY_API size_t CountTrue(D d, const MFromD m) { +HWY_API size_t CountTrue(D /*d*/, const MFromD m) { const __i32x4 var_shift = wasm_i32x4_make(1, 2, 4, 8); const __i32x4 shifted_bits = wasm_v128_and(m.raw, var_shift); alignas(16) uint64_t lanes[2]; @@ -5073,7 +5073,7 @@ HWY_API size_t CountTrue(D d, const MFromD m) { return PopCount(lanes[0] | lanes[1]); } template -HWY_API size_t CountTrue(D d, const MFromD m) { +HWY_API size_t CountTrue(D /*d*/, const MFromD m) { alignas(16) int64_t lanes[2]; wasm_v128_store(lanes, m.raw); return static_cast(-(lanes[0] + lanes[1])); From fecd46577ce06d5b4c4432b6ad7bb08c3b5085d4 Mon Sep 17 00:00:00 2001 From: Jan Wassenberg Date: Tue, 7 Jan 2025 07:56:19 -0800 Subject: [PATCH 32/64] No longer require highway.h for profiler.h hn::timer is still available via timer-inl.h but deprecated. PiperOrigin-RevId: 712911289 --- hwy/contrib/sort/bench_sort.cc | 2 - hwy/nanobenchmark.cc | 3 - hwy/perf_counters_test.cc | 6 +- hwy/profiler.h | 25 ++--- hwy/timer-inl.h | 170 ++------------------------------- hwy/timer.cc | 3 - hwy/timer.h | 170 ++++++++++++++++++++++++++++++++- 7 files changed, 187 insertions(+), 192 deletions(-) diff --git a/hwy/contrib/sort/bench_sort.cc b/hwy/contrib/sort/bench_sort.cc index e53ee2708d..6afc1917e0 100644 --- a/hwy/contrib/sort/bench_sort.cc +++ b/hwy/contrib/sort/bench_sort.cc @@ -31,10 +31,8 @@ #include "hwy/contrib/sort/traits-inl.h" #include "hwy/contrib/sort/traits128-inl.h" #include "hwy/tests/test_util-inl.h" -#include "hwy/timer-inl.h" #include "hwy/nanobenchmark.h" #include "hwy/timer.h" -#include "hwy/per_target.h" // clang-format on #if HWY_OS_LINUX diff --git a/hwy/nanobenchmark.cc b/hwy/nanobenchmark.cc index 0dec0bc469..49aa807aba 100644 --- a/hwy/nanobenchmark.cc +++ b/hwy/nanobenchmark.cc @@ -26,13 +26,10 @@ #include "hwy/base.h" #include "hwy/robust_statistics.h" -#include "hwy/timer-inl.h" #include "hwy/timer.h" namespace hwy { namespace { -namespace timer = hwy::HWY_NAMESPACE::timer; - static const timer::Ticks timer_resolution = platform::TimerResolution(); // Estimates the expected value of "lambda" values with a variable number of diff --git a/hwy/perf_counters_test.cc b/hwy/perf_counters_test.cc index d81710d236..bd574d9ff7 100644 --- a/hwy/perf_counters_test.cc +++ b/hwy/perf_counters_test.cc @@ -24,7 +24,6 @@ #include "hwy/nanobenchmark.h" // Unpredictable1 #include "hwy/tests/hwy_gtest.h" #include "hwy/tests/test_util-inl.h" -#include "hwy/timer-inl.h" #include "hwy/timer.h" #if !HWY_OS_WIN @@ -39,11 +38,10 @@ using ::hwy::platform::PerfCounters; void ReadAndPrint(uint64_t r, double* values) { char cpu100[100]; const bool have_stop = hwy::platform::HaveTimerStop(cpu100); - const uint64_t t0 = HWY_STATIC_DISPATCH(timer::Start()); + const uint64_t t0 = timer::Start(); PerfCounters counters; - const uint64_t t1 = have_stop ? HWY_STATIC_DISPATCH(timer::Stop()) - : HWY_STATIC_DISPATCH(timer::Start()); + const uint64_t t1 = have_stop ? 
timer::Stop() : timer::Start(); const double elapsed_ns = static_cast(t1 - t0) * 1E9 / platform::InvariantTicksPerSecond(); fprintf(stderr, "r: %d, any valid %d extrapolate %f, overhead %.1f ns\n", diff --git a/hwy/profiler.h b/hwy/profiler.h index 467ac0c4bb..9a9978f640 100644 --- a/hwy/profiler.h +++ b/hwy/profiler.h @@ -61,7 +61,6 @@ #include "hwy/cache_control.h" // FlushStream // #include "hwy/contrib/sort/vqsort.h" #include "hwy/robust_statistics.h" -#include "hwy/timer-inl.h" #include "hwy/timer.h" #define PROFILER_PRINT_OVERHEAD 0 @@ -228,8 +227,7 @@ class Results { // Draw all required information from the packets, which can be discarded // afterwards. Called whenever this thread's storage is full. void AnalyzePackets(const Packet* packets, const size_t num_packets) { - namespace hn = HWY_NAMESPACE; - const uint64_t t0 = hn::timer::Start(); + const uint64_t t0 = timer::Start(); for (size_t i = 0; i < num_packets; ++i) { const Packet p = packets[i]; @@ -260,15 +258,14 @@ class Results { } } - const uint64_t t1 = hn::timer::Stop(); + const uint64_t t1 = timer::Stop(); analyze_elapsed_ += t1 - t0; } // Incorporates results from another thread. Call after all threads have // exited any zones. void Assimilate(Results& other) { - namespace hn = HWY_NAMESPACE; - const uint64_t t0 = hn::timer::Start(); + const uint64_t t0 = timer::Start(); HWY_DASSERT(depth_ == 0); HWY_DASSERT(other.depth_ == 0); @@ -277,14 +274,13 @@ class Results { UpdateOrAdd(zone.BiasedOffset(), zone.NumCalls(), zone.Duration()); } other.num_zones_ = 0; - const uint64_t t1 = hn::timer::Stop(); + const uint64_t t1 = timer::Stop(); analyze_elapsed_ += t1 - t0 + other.analyze_elapsed_; } // Single-threaded. void Print() { - namespace hn = HWY_NAMESPACE; - const uint64_t t0 = hn::timer::Start(); + const uint64_t t0 = timer::Start(); MergeDuplicates(); // Sort by decreasing total (self) cost. @@ -307,7 +303,7 @@ class Results { } num_zones_ = 0; - const uint64_t t1 = hn::timer::Stop(); + const uint64_t t1 = timer::Stop(); analyze_elapsed_ += t1 - t0; printf("Total analysis [s]: %f\n", static_cast(analyze_elapsed_) * inv_freq); @@ -550,13 +546,13 @@ class Zone { // (Capture timestamp ASAP, not inside WriteEntry.) HWY_FENCE; - const uint64_t timestamp = HWY_NAMESPACE::timer::Start(); + const uint64_t timestamp = timer::Start(); thread_specific->WriteEntry(name, timestamp); } HWY_NOINLINE ~Zone() { HWY_FENCE; - const uint64_t timestamp = HWY_NAMESPACE::timer::Stop(); + const uint64_t timestamp = timer::Stop(); StaticThreadSpecific()->WriteExit(timestamp); HWY_FENCE; } @@ -597,7 +593,6 @@ class Zone { #define PROFILER_PRINT_RESULTS hwy::Zone::PrintResults inline void ThreadSpecific::ComputeOverhead() { - namespace hn = HWY_NAMESPACE; // Delay after capturing timestamps before/after the actual zone runs. Even // with frequency throttling disabled, this has a multimodal distribution, // including 32, 34, 48, 52, 59, 62. @@ -643,12 +638,12 @@ inline void ThreadSpecific::ComputeOverhead() { // Analysis time should not be included => must fit within buffer. 
HWY_DASSERT(kReps * 2 < max_packets_); std::atomic_thread_fence(std::memory_order_seq_cst); - const uint64_t t0 = hn::timer::Start(); + const uint64_t t0 = timer::Start(); for (size_t i = 0; i < kReps; ++i) { PROFILER_ZONE("Dummy"); } FlushStream(); - const uint64_t t1 = hn::timer::Stop(); + const uint64_t t1 = timer::Stop(); HWY_DASSERT(num_packets_ + buffer_size_ == kReps * 2); buffer_size_ = 0; num_packets_ = 0; diff --git a/hwy/timer-inl.h b/hwy/timer-inl.h index 9e98e6d00c..87c98df02c 100644 --- a/hwy/timer-inl.h +++ b/hwy/timer-inl.h @@ -13,11 +13,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -// High-resolution and high-precision timer +// DEPRECATED, use timer.h instead. + +#include "hwy/timer.h" -// Per-target include guard -// NOTE: this file could/should be a normal header, but user code may reference -// hn::timer, and defining that here requires highway.h. #if defined(HIGHWAY_HWY_TIMER_INL_H_) == defined(HWY_TARGET_TOGGLE) #ifdef HIGHWAY_HWY_TIMER_INL_H_ #undef HIGHWAY_HWY_TIMER_INL_H_ @@ -27,170 +26,17 @@ #include "hwy/highway.h" -#if defined(_WIN32) || defined(_WIN64) -#ifndef NOMINMAX -#define NOMINMAX -#endif // NOMINMAX -#include -#endif - -#if defined(__APPLE__) -#include -#include -#endif - -#if defined(__HAIKU__) -#include -#endif - -#if HWY_ARCH_PPC && defined(__GLIBC__) && defined(__powerpc64__) -#include // NOLINT __ppc_get_timebase_freq -#endif - -#if HWY_ARCH_X86 && HWY_COMPILER_MSVC -#include -#endif - -#include -#include // clock_gettime - HWY_BEFORE_NAMESPACE(); namespace hwy { namespace HWY_NAMESPACE { namespace timer { -// Ticks := platform-specific timer values (CPU cycles on x86). Must be -// unsigned to guarantee wraparound on overflow. -using Ticks = uint64_t; +// Deprecated aliases so that old code still compiles. Prefer to use +// `hwy::timer::*` from timer.h because that does not require highway.h. +using Ticks = hwy::timer::Ticks; -// Start/Stop return absolute timestamps and must be placed immediately before -// and after the region to measure. We provide separate Start/Stop functions -// because they use different fences. -// -// Background: RDTSC is not 'serializing'; earlier instructions may complete -// after it, and/or later instructions may complete before it. 'Fences' ensure -// regions' elapsed times are independent of such reordering. The only -// documented unprivileged serializing instruction is CPUID, which acts as a -// full fence (no reordering across it in either direction). Unfortunately -// the latency of CPUID varies wildly (perhaps made worse by not initializing -// its EAX input). Because it cannot reliably be deducted from the region's -// elapsed time, it must not be included in the region to measure (i.e. -// between the two RDTSC). -// -// The newer RDTSCP is sometimes described as serializing, but it actually -// only serves as a half-fence with release semantics. Although all -// instructions in the region will complete before the final timestamp is -// captured, subsequent instructions may leak into the region and increase the -// elapsed time. Inserting another fence after the final RDTSCP would prevent -// such reordering without affecting the measured region. -// -// Fortunately, such a fence exists. The LFENCE instruction is only documented -// to delay later loads until earlier loads are visible. 
However, Intel's -// reference manual says it acts as a full fence (waiting until all earlier -// instructions have completed, and delaying later instructions until it -// completes). AMD assigns the same behavior to MFENCE. -// -// We need a fence before the initial RDTSC to prevent earlier instructions -// from leaking into the region, and arguably another after RDTSC to avoid -// region instructions from completing before the timestamp is recorded. -// When surrounded by fences, the additional RDTSCP half-fence provides no -// benefit, so the initial timestamp can be recorded via RDTSC, which has -// lower overhead than RDTSCP because it does not read TSC_AUX. In summary, -// we define Start = LFENCE/RDTSC/LFENCE; Stop = RDTSCP/LFENCE. -// -// Using Start+Start leads to higher variance and overhead than Stop+Stop. -// However, Stop+Stop includes an LFENCE in the region measurements, which -// adds a delay dependent on earlier loads. The combination of Start+Stop -// is faster than Start+Start and more consistent than Stop+Stop because -// the first LFENCE already delayed subsequent loads before the measured -// region. This combination seems not to have been considered in prior work: -// http://akaros.cs.berkeley.edu/lxr/akaros/kern/arch/x86/rdtsc_test.c -// -// Note: performance counters can measure 'exact' instructions-retired or -// (unhalted) cycle counts. The RDPMC instruction is not serializing and also -// requires fences. Unfortunately, it is not accessible on all OSes and we -// prefer to avoid kernel-mode drivers. Performance counters are also affected -// by several under/over-count errata, so we use the TSC instead. - -// Returns a 64-bit timestamp in unit of 'ticks'; to convert to seconds, -// divide by InvariantTicksPerSecond. -inline Ticks Start() { - Ticks t; -#if HWY_ARCH_PPC && defined(__GLIBC__) && defined(__powerpc64__) - asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268)); -#elif HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC - // pmccntr_el0 is privileged but cntvct_el0 is accessible in Linux and QEMU. - asm volatile("mrs %0, cntvct_el0" : "=r"(t)); -#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC - _ReadWriteBarrier(); - _mm_lfence(); - _ReadWriteBarrier(); - t = __rdtsc(); - _ReadWriteBarrier(); - _mm_lfence(); - _ReadWriteBarrier(); -#elif HWY_ARCH_X86_64 - asm volatile( - "lfence\n\t" - "rdtsc\n\t" - "shl $32, %%rdx\n\t" - "or %%rdx, %0\n\t" - "lfence" - : "=a"(t) - : - // "memory" avoids reordering. rdx = TSC >> 32. - // "cc" = flags modified by SHL. - : "rdx", "memory", "cc"); -#elif HWY_ARCH_RISCV - asm volatile("fence; rdtime %0" : "=r"(t)); -#elif defined(_WIN32) || defined(_WIN64) - LARGE_INTEGER counter; - (void)QueryPerformanceCounter(&counter); - t = counter.QuadPart; -#elif defined(__APPLE__) - t = mach_absolute_time(); -#elif defined(__HAIKU__) - t = system_time_nsecs(); // since boot -#else // POSIX - timespec ts; - clock_gettime(CLOCK_MONOTONIC, &ts); - t = static_cast(ts.tv_sec * 1000000000LL + ts.tv_nsec); -#endif - return t; -} - -// WARNING: on x86, caller must check HasRDTSCP before using this! -inline Ticks Stop() { - uint64_t t; -#if HWY_ARCH_PPC && defined(__GLIBC__) && defined(__powerpc64__) - asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268)); -#elif HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC - // pmccntr_el0 is privileged but cntvct_el0 is accessible in Linux and QEMU. 
- asm volatile("mrs %0, cntvct_el0" : "=r"(t)); -#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC - _ReadWriteBarrier(); - unsigned aux; - t = __rdtscp(&aux); - _ReadWriteBarrier(); - _mm_lfence(); - _ReadWriteBarrier(); -#elif HWY_ARCH_X86_64 - // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx). - asm volatile( - "rdtscp\n\t" - "shl $32, %%rdx\n\t" - "or %%rdx, %0\n\t" - "lfence" - : "=a"(t) - : - // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32. - // "cc" = flags modified by SHL. - : "rcx", "rdx", "memory", "cc"); -#else - t = Start(); -#endif - return t; -} +inline Ticks Start() { return hwy::timer::Start(); } +inline Ticks Stop() { return hwy::timer::Stop(); } } // namespace timer diff --git a/hwy/timer.cc b/hwy/timer.cc index 4b7f241550..acbbb1f836 100644 --- a/hwy/timer.cc +++ b/hwy/timer.cc @@ -22,15 +22,12 @@ #include "hwy/base.h" #include "hwy/robust_statistics.h" -#include "hwy/timer-inl.h" #if HWY_ARCH_X86 && !HWY_COMPILER_MSVC #include // NOLINT #endif namespace hwy { -namespace timer = hwy::HWY_NAMESPACE::timer; - namespace platform { namespace { diff --git a/hwy/timer.h b/hwy/timer.h index 7ac0588f26..f56ab7c7ea 100644 --- a/hwy/timer.h +++ b/hwy/timer.h @@ -17,11 +17,36 @@ #define HIGHWAY_HWY_TIMER_H_ // Platform-specific timer functions. Provides Now() and functions for -// interpreting and converting the timer-inl.h Ticks. +// interpreting and converting Ticks. #include +#include // clock_gettime -#include "hwy/highway_export.h" +#include "hwy/base.h" + +#if defined(_WIN32) || defined(_WIN64) +#ifndef NOMINMAX +#define NOMINMAX +#endif // NOMINMAX +#include +#endif + +#if defined(__APPLE__) +#include +#include +#endif + +#if defined(__HAIKU__) +#include +#endif + +#if HWY_ARCH_PPC && defined(__GLIBC__) && defined(__powerpc64__) +#include // NOLINT __ppc_get_timebase_freq +#endif + +#if HWY_ARCH_X86 && HWY_COMPILER_MSVC +#include +#endif namespace hwy { namespace platform { @@ -32,7 +57,7 @@ namespace platform { // Uses InvariantTicksPerSecond and the baseline version of timer::Start(). HWY_DLLEXPORT double Now(); -// Functions for use with timer-inl.h: +// Functions related to `Ticks` below. // Returns whether it is safe to call timer::Stop without executing an illegal // instruction; if false, fills cpu100 (a pointer to a 100 character buffer) @@ -65,6 +90,145 @@ static inline double SecondsSince(const Timestamp& t0) { return t1.t - t0.t; } +// Low-level Start/Stop functions, previously in timer-inl.h. + +namespace timer { + +// Ticks := platform-specific timer values (CPU cycles on x86). Must be +// unsigned to guarantee wraparound on overflow. +using Ticks = uint64_t; + +// Start/Stop return absolute timestamps and must be placed immediately before +// and after the region to measure. We provide separate Start/Stop functions +// because they use different fences. +// +// Background: RDTSC is not 'serializing'; earlier instructions may complete +// after it, and/or later instructions may complete before it. 'Fences' ensure +// regions' elapsed times are independent of such reordering. The only +// documented unprivileged serializing instruction is CPUID, which acts as a +// full fence (no reordering across it in either direction). Unfortunately +// the latency of CPUID varies wildly (perhaps made worse by not initializing +// its EAX input). Because it cannot reliably be deducted from the region's +// elapsed time, it must not be included in the region to measure (i.e. +// between the two RDTSC). 
+// +// The newer RDTSCP is sometimes described as serializing, but it actually +// only serves as a half-fence with release semantics. Although all +// instructions in the region will complete before the final timestamp is +// captured, subsequent instructions may leak into the region and increase the +// elapsed time. Inserting another fence after the final `RDTSCP` would prevent +// such reordering without affecting the measured region. +// +// Fortunately, such a fence exists. The LFENCE instruction is only documented +// to delay later loads until earlier loads are visible. However, Intel's +// reference manual says it acts as a full fence (waiting until all earlier +// instructions have completed, and delaying later instructions until it +// completes). AMD assigns the same behavior to MFENCE. +// +// We need a fence before the initial RDTSC to prevent earlier instructions +// from leaking into the region, and arguably another after RDTSC to avoid +// region instructions from completing before the timestamp is recorded. +// When surrounded by fences, the additional `RDTSCP` half-fence provides no +// benefit, so the initial timestamp can be recorded via RDTSC, which has +// lower overhead than `RDTSCP` because it does not read TSC_AUX. In summary, +// we define Start = LFENCE/RDTSC/LFENCE; Stop = RDTSCP/LFENCE. +// +// Using Start+Start leads to higher variance and overhead than Stop+Stop. +// However, Stop+Stop includes an LFENCE in the region measurements, which +// adds a delay dependent on earlier loads. The combination of Start+Stop +// is faster than Start+Start and more consistent than Stop+Stop because +// the first LFENCE already delayed subsequent loads before the measured +// region. This combination seems not to have been considered in prior work: +// http://akaros.cs.berkeley.edu/lxr/akaros/kern/arch/x86/rdtsc_test.c +// +// Note: performance counters can measure 'exact' instructions-retired or +// (unhalted) cycle counts. The RDPMC instruction is not serializing and also +// requires fences. Unfortunately, it is not accessible on all OSes and we +// prefer to avoid kernel-mode drivers. Performance counters are also affected +// by several under/over-count errata, so we use the TSC instead. + +// Returns a 64-bit timestamp in unit of 'ticks'; to convert to seconds, +// divide by InvariantTicksPerSecond. +static HWY_INLINE Ticks Start() { + Ticks t; +#if HWY_ARCH_PPC && defined(__GLIBC__) && defined(__powerpc64__) + asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268)); +#elif HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC + // pmccntr_el0 is privileged but cntvct_el0 is accessible in Linux and QEMU. + asm volatile("mrs %0, cntvct_el0" : "=r"(t)); +#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC + _ReadWriteBarrier(); + _mm_lfence(); + _ReadWriteBarrier(); + t = __rdtsc(); + _ReadWriteBarrier(); + _mm_lfence(); + _ReadWriteBarrier(); +#elif HWY_ARCH_X86_64 + asm volatile( + "lfence\n\t" + "rdtsc\n\t" + "shl $32, %%rdx\n\t" + "or %%rdx, %0\n\t" + "lfence" + : "=a"(t) + : + // "memory" avoids reordering. rdx = TSC >> 32. + // "cc" = flags modified by SHL. 
+ : "rdx", "memory", "cc"); +#elif HWY_ARCH_RISCV + asm volatile("fence; rdtime %0" : "=r"(t)); +#elif defined(_WIN32) || defined(_WIN64) + LARGE_INTEGER counter; + (void)QueryPerformanceCounter(&counter); + t = counter.QuadPart; +#elif defined(__APPLE__) + t = mach_absolute_time(); +#elif defined(__HAIKU__) + t = system_time_nsecs(); // since boot +#else // POSIX + timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + t = static_cast(ts.tv_sec * 1000000000LL + ts.tv_nsec); +#endif + return t; +} + +// WARNING: on x86, caller must check `HaveTimerStop()` before using this! +static HWY_INLINE Ticks Stop() { + uint64_t t; +#if HWY_ARCH_PPC && defined(__GLIBC__) && defined(__powerpc64__) + asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268)); +#elif HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC + // pmccntr_el0 is privileged but cntvct_el0 is accessible in Linux and QEMU. + asm volatile("mrs %0, cntvct_el0" : "=r"(t)); +#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC + _ReadWriteBarrier(); + unsigned aux; + t = __rdtscp(&aux); + _ReadWriteBarrier(); + _mm_lfence(); + _ReadWriteBarrier(); +#elif HWY_ARCH_X86_64 + // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx). + asm volatile( + "rdtscp\n\t" + "shl $32, %%rdx\n\t" + "or %%rdx, %0\n\t" + "lfence" + : "=a"(t) + : + // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32. + // "cc" = flags modified by SHL. + : "rcx", "rdx", "memory", "cc"); +#else + t = Start(); +#endif + return t; +} + +} // namespace timer + } // namespace hwy #endif // HIGHWAY_HWY_TIMER_H_ From c8c3f5eadab4a998a33fd86374a588672a3d658b Mon Sep 17 00:00:00 2001 From: Jan Wassenberg Date: Thu, 9 Jan 2025 10:13:58 -0800 Subject: [PATCH 33/64] update thresholds to account for a possible L4. Thanks @miladfarca, fixes 2435 PiperOrigin-RevId: 713713061 --- hwy/perf_counters_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hwy/perf_counters_test.cc b/hwy/perf_counters_test.cc index bd574d9ff7..03178f79e3 100644 --- a/hwy/perf_counters_test.cc +++ b/hwy/perf_counters_test.cc @@ -90,9 +90,9 @@ TEST(PerfCountersTest, TestMem) { HWY_ASSERT(values[PerfCounters::kBranchMispredicts] < 1E7); // 273K..1M HWY_ASSERT(values[PerfCounters::kL3Loads] == 0.0 || - values[PerfCounters::kL3Loads] > 1000.0); // ~90K + values[PerfCounters::kL3Loads] > 10.0); // ~90K, 50 with L4 HWY_ASSERT(values[PerfCounters::kL3Stores] == 0.0 || - values[PerfCounters::kL3Stores] > 1E3); // 9K..5M + values[PerfCounters::kL3Stores] > 10.0); // 9K..5M HWY_ASSERT(values[PerfCounters::kCacheRefs] == 0.0 || values[PerfCounters::kCacheRefs] > 1E4); // 75K..66M From bcf015564da9e8b730aa66627dd683ed60022699 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 10 Jan 2025 13:38:44 +0000 Subject: [PATCH 34/64] Bump step-security/harden-runner from 2.10.2 to 2.10.3 Bumps [step-security/harden-runner](https://github.com/step-security/harden-runner) from 2.10.2 to 2.10.3. - [Release notes](https://github.com/step-security/harden-runner/releases) - [Commits](https://github.com/step-security/harden-runner/compare/0080882f6c36860b6ba35c610c98ce87d4e2f26f...c95a14d0e5bab51a9f56296a4eb0e416910cd350) --- updated-dependencies: - dependency-name: step-security/harden-runner dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- .github/workflows/build_test.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build_test.yml b/.github/workflows/build_test.yml index 862f8e8f08..d5eff94eeb 100644 --- a/.github/workflows/build_test.yml +++ b/.github/workflows/build_test.yml @@ -135,7 +135,7 @@ jobs: steps: - name: Harden Runner - uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 + uses: step-security/harden-runner@c95a14d0e5bab51a9f56296a4eb0e416910cd350 # v2.10.3 with: egress-policy: audit # cannot be block - runner does git checkout @@ -230,7 +230,7 @@ jobs: steps: - name: Harden Runner - uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 + uses: step-security/harden-runner@c95a14d0e5bab51a9f56296a4eb0e416910cd350 # v2.10.3 with: egress-policy: audit # cannot be block - runner does git checkout @@ -313,7 +313,7 @@ jobs: steps: - name: Harden Runner - uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 + uses: step-security/harden-runner@c95a14d0e5bab51a9f56296a4eb0e416910cd350 # v2.10.3 with: egress-policy: audit # cannot be block - runner does git checkout @@ -334,7 +334,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Harden Runner - uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2 + uses: step-security/harden-runner@c95a14d0e5bab51a9f56296a4eb0e416910cd350 # v2.10.3 with: egress-policy: audit # cannot be block - runner does git checkout From 4a0a5b5f24e17913a47e6d765cb74680d5f550da Mon Sep 17 00:00:00 2001 From: Jan Wassenberg Date: Mon, 13 Jan 2025 15:20:33 -0800 Subject: [PATCH 35/64] fix emu128 reduction with infinities. thanks @yohanchatelain, fixes #2434 PiperOrigin-RevId: 715126311 --- hwy/ops/emu128-inl.h | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/hwy/ops/emu128-inl.h b/hwy/ops/emu128-inl.h index 7a361de011..a530a8f9f6 100644 --- a/hwy/ops/emu128-inl.h +++ b/hwy/ops/emu128-inl.h @@ -2916,9 +2916,30 @@ HWY_API T ReduceSum(D d, VFromD v) { } return sum; } + +namespace detail { +template , HWY_IF_FLOAT_OR_SPECIAL(T)> +T InitReduceMin(D d) { + return GetLane(Inf(d)); +} +template , HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)> +T InitReduceMin(D d) { + return HighestValue(); +} + +template , HWY_IF_FLOAT_OR_SPECIAL(T)> +T InitReduceMax(D d) { + return -GetLane(Inf(d)); +} +template , HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)> +T InitReduceMax(D d) { + return LowestValue(); +} +} // namespace detail + template , HWY_IF_REDUCE_D(D)> HWY_API T ReduceMin(D d, VFromD v) { - T min = HighestValue(); + T min = detail::InitReduceMin(d); for (size_t i = 0; i < MaxLanes(d); ++i) { min = HWY_MIN(min, v.raw[i]); } @@ -2926,7 +2947,7 @@ HWY_API T ReduceMin(D d, VFromD v) { } template , HWY_IF_REDUCE_D(D)> HWY_API T ReduceMax(D d, VFromD v) { - T max = LowestValue(); + T max = detail::InitReduceMax(d); for (size_t i = 0; i < MaxLanes(d); ++i) { max = HWY_MAX(max, v.raw[i]); } From cd56bbcf1868ddf59ce789ec381aae5f0d25dd7c Mon Sep 17 00:00:00 2001 From: Zhouyu Qian Date: Mon, 13 Jan 2025 09:49:18 -0500 Subject: [PATCH 36/64] Defer the call to get timer resolution until needed We have a use case where the highway library is linked into a shared object. Currently whenever it is dlopen'ed, the static initializer runs including running the `platform::TimerResolution`. 
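(Editorial aside: a minimal sketch of the eager pattern described above versus
a lazily-initialized replacement. The names are hypothetical, not the actual
Highway code.)

    #include <cstdint>

    uint64_t MeasureResolution() { return 1; }  // placeholder for TimerResolution()

    // Eager: runs during dynamic initialization, i.e. at load/dlopen time.
    // static const uint64_t kResolution = MeasureResolution();

    // Lazy: initialized on first call only; thread-safe since C++11.
    const uint64_t& GetResolution() {
      static const uint64_t resolution = MeasureResolution();
      return resolution;
    }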
This has been observed in profiling and raised some questions, namely it shouldn't be necessary to initialize the benchmarking code if we are using the highway library but not to benchmark anything. Furthermore, doing so is against many companies' C++ guidelines, making this a blocker for inclusion into companies' code bases. This commit changes this variable from a namespace-scope static variable into a static local variable. It may have a little bit more overhead but it should be negligible. --- hwy/nanobenchmark.cc | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/hwy/nanobenchmark.cc b/hwy/nanobenchmark.cc index 49aa807aba..0a885d2d09 100644 --- a/hwy/nanobenchmark.cc +++ b/hwy/nanobenchmark.cc @@ -30,7 +30,10 @@ namespace hwy { namespace { -static const timer::Ticks timer_resolution = platform::TimerResolution(); +const timer::Ticks& GetTimerResolution() { + static const timer::Ticks timer_resolution = platform::TimerResolution(); + return timer_resolution; +} // Estimates the expected value of "lambda" values with a variable number of // samples until the variability "rel_mad" is less than "max_rel_mad". @@ -56,7 +59,7 @@ timer::Ticks SampleUntilStable(const double max_rel_mad, double* rel_mad, // Percentage is too strict for tiny differences, so also allow a small // absolute "median absolute deviation". - const timer::Ticks max_abs_mad = (timer_resolution + 99) / 100; + const timer::Ticks max_abs_mad = (GetTimerResolution() + 99) / 100; *rel_mad = 0.0; // ensure initialized for (size_t eval = 0; eval < p.max_evals; ++eval, samples_per_eval *= 2) { @@ -122,7 +125,7 @@ size_t NumSkip(const Func func, const uint8_t* arg, const InputVec& unique, const timer::Ticks total = SampleUntilStable( p.target_rel_mad, &rel_mad, p, [func, arg, input]() { PreventElision(func(arg, input)); }); - min_duration = HWY_MIN(min_duration, total - timer_resolution); + min_duration = HWY_MIN(min_duration, total - GetTimerResolution()); } // Number of repetitions required to reach the target resolution. @@ -134,7 +137,7 @@ size_t NumSkip(const Func func, const uint8_t* arg, const InputVec& unique, : static_cast((max_skip + min_duration - 1) / min_duration); if (p.verbose) { printf("res=%d max_skip=%d min_dur=%d num_skip=%d\n", - static_cast(timer_resolution), static_cast(max_skip), + static_cast(GetTimerResolution()), static_cast(max_skip), static_cast(min_duration), static_cast(num_skip)); } return num_skip; From 87848c46fd1339a4c63df88870ce29cf1ea8298d Mon Sep 17 00:00:00 2001 From: Jan Wassenberg Date: Tue, 14 Jan 2025 15:38:56 -0800 Subject: [PATCH 37/64] add warning if TSC is not invariant PiperOrigin-RevId: 715556352 --- hwy/timer.cc | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/hwy/timer.cc b/hwy/timer.cc index acbbb1f836..da3103b4fb 100644 --- a/hwy/timer.cc +++ b/hwy/timer.cc @@ -89,7 +89,13 @@ void Cpuid(const uint32_t level, const uint32_t count, bool HasRDTSCP() { uint32_t abcd[4]; Cpuid(0x80000001U, 0, abcd); // Extended feature flags - return (abcd[3] & (1u << 27)) != 0; // RDTSCP + if ((abcd[3] & (1u << 27)) == 0) return false; // RDTSCP + + Cpuid(0x80000007U, 0, abcd); + if ((abcd[3] & (1u << 8)) == 0) { + HWY_WARN("TSC not constant/invariant, may vary frequency or jump."); + } + return true; } #endif // HWY_ARCH_X86 From fdf177dcccfea047c24b6dc0ebbe263f5fddcd2f Mon Sep 17 00:00:00 2001 From: Highway Date: Fri, 17 Jan 2025 07:35:17 -0800 Subject: [PATCH 38/64] hwy-contrib/thread_pool: Replace size check assert with skip. 
PiperOrigin-RevId: 716663095 --- hwy/contrib/thread_pool/topology.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hwy/contrib/thread_pool/topology.cc b/hwy/contrib/thread_pool/topology.cc index 8ab031db85..9a00a6fad2 100644 --- a/hwy/contrib/thread_pool/topology.cc +++ b/hwy/contrib/thread_pool/topology.cc @@ -1014,7 +1014,9 @@ bool InitCachesWin(Caches& caches) { if (cr.Type != CacheUnified && cr.Type != CacheData) return; if (1 <= cr.Level && cr.Level <= 3) { Cache& c = caches[cr.Level]; - HWY_ASSERT(c.size_kib == 0); // not set yet + // If the size is non-zero then we (probably) have already detected this + // cache and can skip the CR. + if (c.size_kib > 0) return; c.size_kib = static_cast(DivByFactor(cr.CacheSize, 1024)); c.bytes_per_line = static_cast(cr.LineSize); c.associativity = (cr.Associativity == CACHE_FULLY_ASSOCIATIVE) From a811732d06d2e920dba2b1d7dfe28c7dca2d7a7f Mon Sep 17 00:00:00 2001 From: Jan Wassenberg Date: Fri, 17 Jan 2025 09:28:10 -0800 Subject: [PATCH 39/64] add HWY_UNREACHABLE and add documentation for related macros PiperOrigin-RevId: 716696598 --- g3doc/quick_reference.md | 13 +++++++++++++ hwy/base.h | 6 ++++++ hwy/base_test.cc | 9 +++++++++ 3 files changed, 28 insertions(+) diff --git a/g3doc/quick_reference.md b/g3doc/quick_reference.md index 7bf7c0b932..e3a338d69a 100644 --- a/g3doc/quick_reference.md +++ b/g3doc/quick_reference.md @@ -2476,6 +2476,19 @@ Ops in this section are only available if `HWY_TARGET != HWY_SCALAR`: * `HWY_ALIGN_MAX`: as `HWY_ALIGN`, but independent of `HWY_TARGET` and may be used outside `HWY_NAMESPACE`. +* `HWY_RESTRICT`: use after a pointer, e.g. `T* HWY_RESTRICT p`, to indicate + the pointer is not aliased, i.e. it is the only way to access the data. This + may improve code generation by preventing unnecessary reloads. + +* `HWY_LIKELY`: use `if (HWY_LIKELY(condition))` to signal to the compiler + that `condition` is likely to be true. This may improve performance by + influencing the layout of the generated code. + +* `HWY_UNLIKELY`: like `HWY_LIKELY`, but for conditions likely to be false. + +* `HWY_UNREACHABLE;`: signals to the compiler that control will never reach + this point, which may improve code generation. + ## Advanced macros Beware that these macros describe the current target being compiled. 
Imagine a diff --git a/hwy/base.h b/hwy/base.h index f2dc87c0c5..0219503f4c 100644 --- a/hwy/base.h +++ b/hwy/base.h @@ -101,6 +101,7 @@ #define HWY_NORETURN __declspec(noreturn) #define HWY_LIKELY(expr) (expr) #define HWY_UNLIKELY(expr) (expr) +#define HWY_UNREACHABLE __assume(false) #define HWY_PRAGMA(tokens) __pragma(tokens) #define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(warning(tokens)) #define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(msc) @@ -128,6 +129,11 @@ #define HWY_NORETURN __attribute__((noreturn)) #define HWY_LIKELY(expr) __builtin_expect(!!(expr), 1) #define HWY_UNLIKELY(expr) __builtin_expect(!!(expr), 0) +#if HWY_COMPILER_GCC || __has_builtin(__builtin_unreachable) +#define HWY_UNREACHABLE __builtin_unreachable() +#else +#define HWY_UNREACHABLE +#endif #define HWY_PRAGMA(tokens) _Pragma(#tokens) #define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(GCC diagnostic tokens) #define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(gcc) diff --git a/hwy/base_test.cc b/hwy/base_test.cc index a22da3aa64..b1eb953b79 100644 --- a/hwy/base_test.cc +++ b/hwy/base_test.cc @@ -17,6 +17,8 @@ #include +#include "hwy/nanobenchmark.h" + #undef HWY_TARGET_INCLUDE #define HWY_TARGET_INCLUDE "base_test.cc" #include "hwy/foreach_target.h" // IWYU pragma: keep @@ -28,6 +30,12 @@ namespace hwy { namespace HWY_NAMESPACE { namespace { +HWY_NOINLINE void TestUnreachable() { + if (!hwy::Unpredictable1()) { + HWY_UNREACHABLE; + } +} + HWY_NOINLINE void TestAllLimits() { HWY_ASSERT_EQ(uint8_t{0}, LimitsMin()); HWY_ASSERT_EQ(uint16_t{0}, LimitsMin()); @@ -848,6 +856,7 @@ HWY_AFTER_NAMESPACE(); namespace hwy { namespace { HWY_BEFORE_TEST(BaseTest); +HWY_EXPORT_AND_TEST_P(BaseTest, TestUnreachable); HWY_EXPORT_AND_TEST_P(BaseTest, TestAllLimits); HWY_EXPORT_AND_TEST_P(BaseTest, TestAllLowestHighest); HWY_EXPORT_AND_TEST_P(BaseTest, TestAllType); From dcc0ca1cd4245ecff9e5ba50818e47d5e2ccf699 Mon Sep 17 00:00:00 2001 From: John Platts Date: Fri, 17 Jan 2025 12:16:49 -0600 Subject: [PATCH 40/64] Fix for GCC 15 compiler error on PPC8/PPC9/PPC10 --- hwy/ops/ppc_vsx-inl.h | 167 ++++++++++++++++++++++++++---------------- 1 file changed, 103 insertions(+), 64 deletions(-) diff --git a/hwy/ops/ppc_vsx-inl.h b/hwy/ops/ppc_vsx-inl.h index 86d6d98c39..3564ae0b17 100644 --- a/hwy/ops/ppc_vsx-inl.h +++ b/hwy/ops/ppc_vsx-inl.h @@ -3744,16 +3744,73 @@ static HWY_INLINE V VsxF2INormalizeSrcVals(V v) { #endif } +template +static HWY_INLINE HWY_MAYBE_UNUSED VFromD>> +VsxXvcvspsxds(VF32 vf32) { + using VI64 = VFromD>>; +#if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1500) || \ + HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds) + // Use __builtin_vsx_xvcvspsxds if it is available (which is the case with + // GCC 4.8 through GCC 14 or Clang 13 or later on PPC8/PPC9/PPC10) + return VI64{__builtin_vsx_xvcvspsxds(vf32.raw)}; +#elif HWY_COMPILER_GCC_ACTUAL >= 1500 && HWY_IS_LITTLE_ENDIAN + // On little-endian PPC8/PPC9/PPC10 with GCC 15 or later, use the F32->I64 + // vec_signedo intrinsic as the __builtin_vsx_xvcvspsxds intrinsic has been + // removed from GCC in GCC 15 + return VI64{vec_signedo(vf32.raw)}; +#elif HWY_COMPILER_GCC_ACTUAL >= 1500 && HWY_IS_BIG_ENDIAN + // On big-endian PPC8/PPC9/PPC10 with GCC 15 or later, use the F32->I64 + // vec_signede intrinsic as the __builtin_vsx_xvcvspsxds intrinsic has been + // removed from GCC in GCC 15 + return VI64{vec_signede(vf32.raw)}; +#else + // Inline assembly fallback for older versions of Clang that do not have the + // __builtin_vsx_xvcvspsxds intrinsic + __vector signed long long 
raw_result; + __asm__("xvcvspsxds %x0, %x1" : "=wa"(raw_result) : "wa"(vf32.raw) :); + return VI64{raw_result}; +#endif +} + +template +static HWY_INLINE HWY_MAYBE_UNUSED VFromD>> +VsxXvcvspuxds(VF32 vf32) { + using VU64 = VFromD>>; +#if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1500) || \ + HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds) + // Use __builtin_vsx_xvcvspuxds if it is available (which is the case with + // GCC 4.8 through GCC 14 or Clang 13 or later on PPC8/PPC9/PPC10) + return VU64{reinterpret_cast<__vector unsigned long long>( + __builtin_vsx_xvcvspuxds(vf32.raw))}; +#elif HWY_COMPILER_GCC_ACTUAL >= 1500 && HWY_IS_LITTLE_ENDIAN + // On little-endian PPC8/PPC9/PPC10 with GCC 15 or later, use the F32->U64 + // vec_unsignedo intrinsic as the __builtin_vsx_xvcvspuxds intrinsic has been + // removed from GCC in GCC 15 + return VU64{vec_unsignedo(vf32.raw)}; +#elif HWY_COMPILER_GCC_ACTUAL >= 1500 && HWY_IS_BIG_ENDIAN + // On big-endian PPC8/PPC9/PPC10 with GCC 15 or later, use the F32->U64 + // vec_unsignedo intrinsic as the __builtin_vsx_xvcvspuxds intrinsic has been + // removed from GCC in GCC 15 + return VU64{vec_unsignede(vf32.raw)}; +#else + // Inline assembly fallback for older versions of Clang that do not have the + // __builtin_vsx_xvcvspuxds intrinsic + __vector unsigned long long raw_result; + __asm__("xvcvspuxds %x0, %x1" : "=wa"(raw_result) : "wa"(vf32.raw) :); + return VU64{raw_result}; +#endif +} + } // namespace detail #endif // !HWY_S390X_HAVE_Z14 template HWY_API VFromD PromoteTo(D di64, VFromD> v) { -#if !HWY_S390X_HAVE_Z14 && \ - (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds)) - const __vector float raw_v = - detail::VsxF2INormalizeSrcVals(InterleaveLower(v, v)).raw; - return VFromD{__builtin_vsx_xvcvspsxds(raw_v)}; +#if !HWY_S390X_HAVE_Z14 + const Repartition dt_f32; + const auto vt_f32 = ResizeBitCast(dt_f32, v); + return detail::VsxXvcvspsxds( + detail::VsxF2INormalizeSrcVals(InterleaveLower(vt_f32, vt_f32))); #else const RebindToFloat df64; return ConvertTo(di64, PromoteTo(df64, v)); @@ -3762,12 +3819,11 @@ HWY_API VFromD PromoteTo(D di64, VFromD> v) { template HWY_API VFromD PromoteTo(D du64, VFromD> v) { -#if !HWY_S390X_HAVE_Z14 && \ - (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds)) - const __vector float raw_v = - detail::VsxF2INormalizeSrcVals(InterleaveLower(v, v)).raw; - return VFromD{reinterpret_cast<__vector unsigned long long>( - __builtin_vsx_xvcvspuxds(raw_v))}; +#if !HWY_S390X_HAVE_Z14 + const Repartition dt_f32; + const auto vt_f32 = ResizeBitCast(dt_f32, v); + return detail::VsxXvcvspuxds( + detail::VsxF2INormalizeSrcVals(InterleaveLower(vt_f32, vt_f32))); #else const RebindToFloat df64; return ConvertTo(du64, PromoteTo(df64, v)); @@ -3876,12 +3932,10 @@ HWY_API VFromD PromoteUpperTo(D df64, Vec128 v) { template HWY_API VFromD PromoteUpperTo(D di64, Vec128 v) { -#if !HWY_S390X_HAVE_Z14 && \ - (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds)) - const __vector float raw_v = - detail::VsxF2INormalizeSrcVals(InterleaveUpper(Full128(), v, v)) - .raw; - return VFromD{__builtin_vsx_xvcvspsxds(raw_v)}; +#if !HWY_S390X_HAVE_Z14 + (void)di64; + return detail::VsxXvcvspsxds( + detail::VsxF2INormalizeSrcVals(InterleaveUpper(Full128(), v, v))); #else const RebindToFloat df64; return ConvertTo(di64, PromoteUpperTo(df64, v)); @@ -3890,13 +3944,10 @@ HWY_API VFromD PromoteUpperTo(D di64, Vec128 v) { template HWY_API VFromD PromoteUpperTo(D du64, Vec128 v) { -#if !HWY_S390X_HAVE_Z14 && \ 
- (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds)) - const __vector float raw_v = - detail::VsxF2INormalizeSrcVals(InterleaveUpper(Full128(), v, v)) - .raw; - return VFromD{reinterpret_cast<__vector unsigned long long>( - __builtin_vsx_xvcvspuxds(raw_v))}; +#if !HWY_S390X_HAVE_Z14 + (void)du64; + return detail::VsxXvcvspuxds( + detail::VsxF2INormalizeSrcVals(InterleaveUpper(Full128(), v, v))); #else const RebindToFloat df64; return ConvertTo(du64, PromoteUpperTo(df64, v)); @@ -3984,20 +4035,18 @@ HWY_INLINE VFromD PromoteEvenTo(hwy::SignedTag /*to_type_tag*/, hwy::SizeTag<8> /*to_lane_size_tag*/, hwy::FloatTag /*from_type_tag*/, D d_to, V v) { -#if !HWY_S390X_HAVE_Z14 && \ - (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds)) +#if !HWY_S390X_HAVE_Z14 (void)d_to; const auto normalized_v = detail::VsxF2INormalizeSrcVals(v); #if HWY_IS_LITTLE_ENDIAN - // __builtin_vsx_xvcvspsxds expects the source values to be in the odd lanes - // on little-endian PPC, and the vec_sld operation below will shift the even + // VsxXvcvspsxds expects the source values to be in the odd lanes on + // little-endian PPC, and the Shuffle2103 operation below will shift the even // lanes of normalized_v into the odd lanes. - return VFromD{ - __builtin_vsx_xvcvspsxds(vec_sld(normalized_v.raw, normalized_v.raw, 4))}; + return VsxXvcvspsxds(Shuffle2103(normalized_v)); #else - // __builtin_vsx_xvcvspsxds expects the source values to be in the even lanes - // on big-endian PPC. - return VFromD{__builtin_vsx_xvcvspsxds(normalized_v.raw)}; + // VsxXvcvspsxds expects the source values to be in the even lanes on + // big-endian PPC. + return VsxXvcvspsxds(normalized_v); #endif #else const RebindToFloat df64; @@ -4012,22 +4061,18 @@ HWY_INLINE VFromD PromoteEvenTo(hwy::UnsignedTag /*to_type_tag*/, hwy::SizeTag<8> /*to_lane_size_tag*/, hwy::FloatTag /*from_type_tag*/, D d_to, V v) { -#if !HWY_S390X_HAVE_Z14 && \ - (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds)) +#if !HWY_S390X_HAVE_Z14 (void)d_to; const auto normalized_v = detail::VsxF2INormalizeSrcVals(v); #if HWY_IS_LITTLE_ENDIAN - // __builtin_vsx_xvcvspuxds expects the source values to be in the odd lanes - // on little-endian PPC, and the vec_sld operation below will shift the even - // lanes of normalized_v into the odd lanes. - return VFromD{ - reinterpret_cast<__vector unsigned long long>(__builtin_vsx_xvcvspuxds( - vec_sld(normalized_v.raw, normalized_v.raw, 4)))}; + // VsxXvcvspuxds expects the source values to be in the odd lanes + // on little-endian PPC, and the Shuffle2103 operation below will shift the + // even lanes of normalized_v into the odd lanes. + return VsxXvcvspuxds(Shuffle2103(normalized_v)); #else - // __builtin_vsx_xvcvspuxds expects the source values to be in the even lanes + // VsxXvcvspuxds expects the source values to be in the even lanes // on big-endian PPC. 
- return VFromD{reinterpret_cast<__vector unsigned long long>( - __builtin_vsx_xvcvspuxds(normalized_v.raw))}; + return VsxXvcvspuxds(normalized_v); #endif #else const RebindToFloat df64; @@ -4069,20 +4114,18 @@ HWY_INLINE VFromD PromoteOddTo(hwy::SignedTag /*to_type_tag*/, hwy::SizeTag<8> /*to_lane_size_tag*/, hwy::FloatTag /*from_type_tag*/, D d_to, V v) { -#if !HWY_S390X_HAVE_Z14 && \ - (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds)) +#if !HWY_S390X_HAVE_Z14 (void)d_to; const auto normalized_v = detail::VsxF2INormalizeSrcVals(v); #if HWY_IS_LITTLE_ENDIAN - // __builtin_vsx_xvcvspsxds expects the source values to be in the odd lanes + // VsxXvcvspsxds expects the source values to be in the odd lanes // on little-endian PPC - return VFromD{__builtin_vsx_xvcvspsxds(normalized_v.raw)}; + return VsxXvcvspsxds(normalized_v); #else - // __builtin_vsx_xvcvspsxds expects the source values to be in the even lanes - // on big-endian PPC, and the vec_sld operation below will shift the odd lanes - // of normalized_v into the even lanes. - return VFromD{ - __builtin_vsx_xvcvspsxds(vec_sld(normalized_v.raw, normalized_v.raw, 4))}; + // VsxXvcvspsxds expects the source values to be in the even lanes + // on big-endian PPC, and the Shuffle0321 operation below will shift the odd + // lanes of normalized_v into the even lanes. + return VsxXvcvspsxds(Shuffle0321(normalized_v)); #endif #else const RebindToFloat df64; @@ -4097,22 +4140,18 @@ HWY_INLINE VFromD PromoteOddTo(hwy::UnsignedTag /*to_type_tag*/, hwy::SizeTag<8> /*to_lane_size_tag*/, hwy::FloatTag /*from_type_tag*/, D d_to, V v) { -#if !HWY_S390X_HAVE_Z14 && \ - (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds)) +#if !HWY_S390X_HAVE_Z14 (void)d_to; const auto normalized_v = detail::VsxF2INormalizeSrcVals(v); #if HWY_IS_LITTLE_ENDIAN - // __builtin_vsx_xvcvspuxds expects the source values to be in the odd lanes + // VsxXvcvspuxds expects the source values to be in the odd lanes // on little-endian PPC - return VFromD{reinterpret_cast<__vector unsigned long long>( - __builtin_vsx_xvcvspuxds(normalized_v.raw))}; + return VsxXvcvspuxds(normalized_v); #else - // __builtin_vsx_xvcvspuxds expects the source values to be in the even lanes - // on big-endian PPC, and the vec_sld operation below will shift the odd lanes - // of normalized_v into the even lanes. - return VFromD{ - reinterpret_cast<__vector unsigned long long>(__builtin_vsx_xvcvspuxds( - vec_sld(normalized_v.raw, normalized_v.raw, 4)))}; + // VsxXvcvspuxds expects the source values to be in the even lanes + // on big-endian PPC, and the Shuffle0321 operation below will shift the odd + // lanes of normalized_v into the even lanes. 
+ return VsxXvcvspuxds(Shuffle0321(normalized_v)); #endif #else const RebindToFloat df64; From 070bc1fa16e646100a3551c5db72af0ccaa950b1 Mon Sep 17 00:00:00 2001 From: John Platts Date: Fri, 17 Jan 2025 13:53:22 -0600 Subject: [PATCH 41/64] Added PositiveInfOrHighestValue and NegativeInfOrLowestValue --- hwy/base.h | 39 +++++++++++++++++++++++++++++++++++++++ hwy/base_test.cc | 16 ++++++++++++++++ hwy/ops/emu128-inl.h | 24 ++---------------------- 3 files changed, 57 insertions(+), 22 deletions(-) diff --git a/hwy/base.h b/hwy/base.h index f2dc87c0c5..ceeea25c72 100644 --- a/hwy/base.h +++ b/hwy/base.h @@ -2414,6 +2414,45 @@ constexpr MakeSigned MaxExponentField() { return (MakeSigned{1} << ExponentBits()) - 1; } +namespace detail { + +template +static HWY_INLINE HWY_MAYBE_UNUSED HWY_BITCASTSCALAR_CONSTEXPR T +NegativeInfOrLowestValue(hwy::FloatTag /* tag */) { + return BitCastScalar( + static_cast>(SignMask() | ExponentMask())); +} + +template +static HWY_INLINE HWY_MAYBE_UNUSED HWY_BITCASTSCALAR_CONSTEXPR T +NegativeInfOrLowestValue(hwy::NonFloatTag /* tag */) { + return LowestValue(); +} + +template +static HWY_INLINE HWY_MAYBE_UNUSED HWY_BITCASTSCALAR_CONSTEXPR T +PositiveInfOrHighestValue(hwy::FloatTag /* tag */) { + return BitCastScalar(ExponentMask()); +} + +template +static HWY_INLINE HWY_MAYBE_UNUSED HWY_BITCASTSCALAR_CONSTEXPR T +PositiveInfOrHighestValue(hwy::NonFloatTag /* tag */) { + return HighestValue(); +} + +} // namespace detail + +template +HWY_API HWY_BITCASTSCALAR_CONSTEXPR T NegativeInfOrLowestValue() { + return detail::NegativeInfOrLowestValue(IsFloatTag()); +} + +template +HWY_API HWY_BITCASTSCALAR_CONSTEXPR T PositiveInfOrHighestValue() { + return detail::PositiveInfOrHighestValue(IsFloatTag()); +} + //------------------------------------------------------------------------------ // Additional F16/BF16 operators diff --git a/hwy/base_test.cc b/hwy/base_test.cc index a22da3aa64..64c4b76609 100644 --- a/hwy/base_test.cc +++ b/hwy/base_test.cc @@ -101,6 +101,22 @@ struct TestLowestHighest { if (!IsSpecialFloat()) { HWY_ASSERT_EQ(std::numeric_limits::lowest(), LowestValue()); HWY_ASSERT_EQ(std::numeric_limits::max(), HighestValue()); + + if (IsFloat()) { + HWY_ASSERT(ScalarSignBit(NegativeInfOrLowestValue())); + HWY_ASSERT(!ScalarIsFinite(NegativeInfOrLowestValue())); + HWY_ASSERT(!ScalarSignBit(PositiveInfOrHighestValue())); + HWY_ASSERT(!ScalarIsFinite(PositiveInfOrHighestValue())); + HWY_ASSERT(NegativeInfOrLowestValue() < + std::numeric_limits::lowest()); + HWY_ASSERT(PositiveInfOrHighestValue() > + std::numeric_limits::max()); + } else { + HWY_ASSERT_EQ(std::numeric_limits::lowest(), + NegativeInfOrLowestValue()); + HWY_ASSERT_EQ(std::numeric_limits::max(), + PositiveInfOrHighestValue()); + } } } }; diff --git a/hwy/ops/emu128-inl.h b/hwy/ops/emu128-inl.h index a530a8f9f6..84e1704d64 100644 --- a/hwy/ops/emu128-inl.h +++ b/hwy/ops/emu128-inl.h @@ -2917,29 +2917,9 @@ HWY_API T ReduceSum(D d, VFromD v) { return sum; } -namespace detail { -template , HWY_IF_FLOAT_OR_SPECIAL(T)> -T InitReduceMin(D d) { - return GetLane(Inf(d)); -} -template , HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)> -T InitReduceMin(D d) { - return HighestValue(); -} - -template , HWY_IF_FLOAT_OR_SPECIAL(T)> -T InitReduceMax(D d) { - return -GetLane(Inf(d)); -} -template , HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)> -T InitReduceMax(D d) { - return LowestValue(); -} -} // namespace detail - template , HWY_IF_REDUCE_D(D)> HWY_API T ReduceMin(D d, VFromD v) { - T min = detail::InitReduceMin(d); + T min = 
PositiveInfOrHighestValue(); for (size_t i = 0; i < MaxLanes(d); ++i) { min = HWY_MIN(min, v.raw[i]); } @@ -2947,7 +2927,7 @@ HWY_API T ReduceMin(D d, VFromD v) { } template , HWY_IF_REDUCE_D(D)> HWY_API T ReduceMax(D d, VFromD v) { - T max = detail::InitReduceMax(d); + T max = NegativeInfOrLowestValue(); for (size_t i = 0; i < MaxLanes(d); ++i) { max = HWY_MAX(max, v.raw[i]); } From 758ec705cf1a5fb122115440ffa82c48d36ad9cb Mon Sep 17 00:00:00 2001 From: John Platts Date: Fri, 17 Jan 2025 14:31:31 -0600 Subject: [PATCH 42/64] Fix for compiler error with GCC 9 or earlier --- hwy/base.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hwy/base.h b/hwy/base.h index 0219503f4c..2706487911 100644 --- a/hwy/base.h +++ b/hwy/base.h @@ -129,7 +129,7 @@ #define HWY_NORETURN __attribute__((noreturn)) #define HWY_LIKELY(expr) __builtin_expect(!!(expr), 1) #define HWY_UNLIKELY(expr) __builtin_expect(!!(expr), 0) -#if HWY_COMPILER_GCC || __has_builtin(__builtin_unreachable) +#if HWY_COMPILER_GCC || HWY_HAS_BUILTIN(__builtin_unreachable) #define HWY_UNREACHABLE __builtin_unreachable() #else #define HWY_UNREACHABLE From cf6a122f6ae2955f8e5cb60d3029e28870205875 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 20 Jan 2025 13:08:49 +0000 Subject: [PATCH 43/64] Bump step-security/harden-runner from 2.10.3 to 2.10.4 Bumps [step-security/harden-runner](https://github.com/step-security/harden-runner) from 2.10.3 to 2.10.4. - [Release notes](https://github.com/step-security/harden-runner/releases) - [Commits](https://github.com/step-security/harden-runner/compare/c95a14d0e5bab51a9f56296a4eb0e416910cd350...cb605e52c26070c328afc4562f0b4ada7618a84e) --- updated-dependencies: - dependency-name: step-security/harden-runner dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- .github/workflows/build_test.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build_test.yml b/.github/workflows/build_test.yml index d5eff94eeb..3d3b7a791a 100644 --- a/.github/workflows/build_test.yml +++ b/.github/workflows/build_test.yml @@ -135,7 +135,7 @@ jobs: steps: - name: Harden Runner - uses: step-security/harden-runner@c95a14d0e5bab51a9f56296a4eb0e416910cd350 # v2.10.3 + uses: step-security/harden-runner@cb605e52c26070c328afc4562f0b4ada7618a84e # v2.10.4 with: egress-policy: audit # cannot be block - runner does git checkout @@ -230,7 +230,7 @@ jobs: steps: - name: Harden Runner - uses: step-security/harden-runner@c95a14d0e5bab51a9f56296a4eb0e416910cd350 # v2.10.3 + uses: step-security/harden-runner@cb605e52c26070c328afc4562f0b4ada7618a84e # v2.10.4 with: egress-policy: audit # cannot be block - runner does git checkout @@ -313,7 +313,7 @@ jobs: steps: - name: Harden Runner - uses: step-security/harden-runner@c95a14d0e5bab51a9f56296a4eb0e416910cd350 # v2.10.3 + uses: step-security/harden-runner@cb605e52c26070c328afc4562f0b4ada7618a84e # v2.10.4 with: egress-policy: audit # cannot be block - runner does git checkout @@ -334,7 +334,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Harden Runner - uses: step-security/harden-runner@c95a14d0e5bab51a9f56296a4eb0e416910cd350 # v2.10.3 + uses: step-security/harden-runner@cb605e52c26070c328afc4562f0b4ada7618a84e # v2.10.4 with: egress-policy: audit # cannot be block - runner does git checkout From fc384ee6c654d905ac2481b7bd06486ecde49cc0 Mon Sep 17 00:00:00 2001 From: Jan Wassenberg Date: Mon, 20 Jan 2025 11:48:14 -0800 Subject: [PATCH 44/64] warning fix (unused param) PiperOrigin-RevId: 717590393 --- hwy/ops/emu128-inl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hwy/ops/emu128-inl.h b/hwy/ops/emu128-inl.h index a530a8f9f6..2a8611e393 100644 --- a/hwy/ops/emu128-inl.h +++ b/hwy/ops/emu128-inl.h @@ -2923,7 +2923,7 @@ T InitReduceMin(D d) { return GetLane(Inf(d)); } template , HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)> -T InitReduceMin(D d) { +T InitReduceMin(D) { return HighestValue(); } @@ -2932,7 +2932,7 @@ T InitReduceMax(D d) { return -GetLane(Inf(d)); } template , HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)> -T InitReduceMax(D d) { +T InitReduceMax(D) { return LowestValue(); } } // namespace detail From 21a6bb018f5dc579818d88b209272cd765130ad9 Mon Sep 17 00:00:00 2001 From: scuzqy Date: Tue, 21 Jan 2025 02:41:21 -0800 Subject: [PATCH 45/64] Copybara import of the project: -- fa07d18e450347578407bd91002a2578075a7592 by scuzqy : Resolve TODO in aligned_allocator And updated corresponding test. AllocateAligned was designed to take POD types only. COPYBARA_INTEGRATE_REVIEW=https://github.com/google/highway/pull/2298 from scuzqy:AlignedAllocatorPOD fa07d18e450347578407bd91002a2578075a7592 PiperOrigin-RevId: 717822922 --- hwy/aligned_allocator.h | 5 ++++- hwy/aligned_allocator_test.cc | 17 ++--------------- 2 files changed, 6 insertions(+), 16 deletions(-) diff --git a/hwy/aligned_allocator.h b/hwy/aligned_allocator.h index e738c8be65..d2f4e8479d 100644 --- a/hwy/aligned_allocator.h +++ b/hwy/aligned_allocator.h @@ -232,7 +232,6 @@ class AlignedFreer { template void operator()(T* aligned_pointer) const { - // TODO(deymo): assert that we are using a POD type T. 
FreeAlignedBytes(aligned_pointer, free_, opaque_ptr_); } @@ -251,6 +250,10 @@ using AlignedFreeUniquePtr = std::unique_ptr; template AlignedFreeUniquePtr AllocateAligned(const size_t items, AllocPtr alloc, FreePtr free, void* opaque) { + static_assert(std::is_trivially_copyable::value, + "AllocateAligned: requires trivially copyable T"); + static_assert(std::is_trivially_destructible::value, + "AllocateAligned: requires trivially destructible T"); return AlignedFreeUniquePtr( detail::AllocateAlignedItems(items, alloc, opaque), AlignedFreer(free, opaque)); diff --git a/hwy/aligned_allocator_test.cc b/hwy/aligned_allocator_test.cc index e35a8dc2ad..15a850039c 100644 --- a/hwy/aligned_allocator_test.cc +++ b/hwy/aligned_allocator_test.cc @@ -146,8 +146,8 @@ TEST(AlignedAllocatorTest, TestEmptyAlignedUniquePtr) { } TEST(AlignedAllocatorTest, TestEmptyAlignedFreeUniquePtr) { - AlignedFreeUniquePtr> ptr(nullptr, AlignedFreer()); - AlignedFreeUniquePtr[]> arr(nullptr, AlignedFreer()); + AlignedFreeUniquePtr> ptr(nullptr, AlignedFreer()); + AlignedFreeUniquePtr[]> arr(nullptr, AlignedFreer()); } TEST(AlignedAllocatorTest, TestCustomAlloc) { @@ -229,19 +229,6 @@ TEST(AlignedAllocatorTest, TestAllocMultipleInt) { HWY_ASSERT(ret != size_t{0}); } -TEST(AlignedAllocatorTest, TestAllocateAlignedObjectWithoutDestructor) { - int counter = 0; - { - // This doesn't call the constructor. - auto obj = AllocateAligned>(1); - HWY_ASSERT(obj); - obj[0].counter_ = &counter; - } - // Destroying the unique_ptr shouldn't have called the destructor of the - // SampleObject<24>. - HWY_ASSERT_EQ(0, counter); -} - TEST(AlignedAllocatorTest, TestMakeUniqueAlignedArrayWithCustomAlloc) { FakeAllocator fake_alloc; int counter = 0; From 0b696633f9ad89497dd5532b55eaa01625ad71ca Mon Sep 17 00:00:00 2001 From: Jan Wassenberg Date: Wed, 22 Jan 2025 09:36:45 -0800 Subject: [PATCH 46/64] fix incompatibility with Windows macro, fixes #2450, thanks @scuzqy PiperOrigin-RevId: 718419557 --- hwy/contrib/sort/vqsort.cc | 6 ++++++ hwy/contrib/thread_pool/futex.h | 6 ++++++ hwy/tests/memory_test.cc | 10 ++++++++-- hwy/timer.h | 3 +++ 4 files changed, 23 insertions(+), 2 deletions(-) diff --git a/hwy/contrib/sort/vqsort.cc b/hwy/contrib/sort/vqsort.cc index b0abe49341..e90dbf6b4d 100644 --- a/hwy/contrib/sort/vqsort.cc +++ b/hwy/contrib/sort/vqsort.cc @@ -72,6 +72,12 @@ #if VQSORT_SECURE_SEED == 1 #include #elif VQSORT_SECURE_SEED == 2 +#ifndef NOMINMAX +#define NOMINMAX +#endif // NOMINMAX +#ifndef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#endif // WIN32_LEAN_AND_MEAN #include #if HWY_COMPILER_MSVC || HWY_COMPILER_CLANGCL #pragma comment(lib, "advapi32.lib") diff --git a/hwy/contrib/thread_pool/futex.h b/hwy/contrib/thread_pool/futex.h index cd4159484f..0f9a277555 100644 --- a/hwy/contrib/thread_pool/futex.h +++ b/hwy/contrib/thread_pool/futex.h @@ -65,6 +65,12 @@ int __ulock_wake(uint32_t op, void* address, uint64_t zero); #elif HWY_OS_WIN && !defined(HWY_DISABLE_FUTEX) // WakeByAddressAll requires Windows 8, so add an opt-out. +#ifndef NOMINMAX +#define NOMINMAX +#endif // NOMINMAX +#ifndef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#endif // WIN32_LEAN_AND_MEAN #include #pragma comment(lib, "synchronization.lib") diff --git a/hwy/tests/memory_test.cc b/hwy/tests/memory_test.cc index 8b698f3308..09c05e9469 100644 --- a/hwy/tests/memory_test.cc +++ b/hwy/tests/memory_test.cc @@ -19,9 +19,15 @@ // detected. Must come before Highway headers. 
#include "hwy/base.h" #include "hwy/tests/test_util.h" -#if defined(_WIN32) || defined(_WIN64) +#if HWY_OS_WIN +#ifndef NOMINMAX +#define NOMINMAX +#endif // NOMINMAX +#ifndef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#endif // WIN32_LEAN_AND_MEAN #include -#endif +#endif // HWY_OS_WIN #undef HWY_TARGET_INCLUDE #define HWY_TARGET_INCLUDE "tests/memory_test.cc" diff --git a/hwy/timer.h b/hwy/timer.h index f56ab7c7ea..16a4702912 100644 --- a/hwy/timer.h +++ b/hwy/timer.h @@ -28,6 +28,9 @@ #ifndef NOMINMAX #define NOMINMAX #endif // NOMINMAX +#ifndef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#endif // WIN32_LEAN_AND_MEAN #include #endif From 9c8e963de84a56fb70b106628d059913dafdcddb Mon Sep 17 00:00:00 2001 From: Jan Wassenberg Date: Tue, 28 Jan 2025 05:17:40 -0800 Subject: [PATCH 47/64] no longer require opt-in for AVX3_DL PiperOrigin-RevId: 720535765 --- README.md | 9 ++++----- g3doc/design_philosophy.md | 3 +-- g3doc/quick_reference.md | 7 ++----- hwy/detect_targets.h | 28 +++++++++------------------- 4 files changed, 16 insertions(+), 31 deletions(-) diff --git a/README.md b/README.md index f6a0b8ad0d..c16c6ee73b 100644 --- a/README.md +++ b/README.md @@ -161,11 +161,10 @@ Highway supports 24 targets, listed in alphabetical order of platform: - `SSE4` (~Nehalem, also includes AES + CLMUL). - `AVX2` (~Haswell, also includes BMI2 + F16 + FMA) - `AVX3` (~Skylake, AVX-512F/BW/CD/DQ/VL) - - `AVX3_DL` (~Icelake, includes BitAlg + CLMUL + GFNI + VAES + VBMI + - VBMI2 + VNNI + VPOPCNT; requires opt-in by defining `HWY_WANT_AVX3_DL` - unless compiling for static dispatch), - - `AVX3_ZEN4` (like AVX3_DL but optimized for AMD Zen4; requires opt-in by - defining `HWY_WANT_AVX3_ZEN4` if compiling for static dispatch, but + - `AVX3_DL` (~Icelake, includes `BitAlg` + `CLMUL` + `GFNI` + `VAES` + + `VBMI` + `VBMI2` + `VNNI` + `VPOPCNT`), + - `AVX3_ZEN4` (AVX3_DL plus BF16, optimized for AMD Zen4; requires opt-in + by defining `HWY_WANT_AVX3_ZEN4` if compiling for static dispatch, but enabled by default for runtime dispatch), - `AVX3_SPR` (~Sapphire Rapids, includes AVX-512FP16) diff --git a/g3doc/design_philosophy.md b/g3doc/design_philosophy.md index c541b3a701..17b8d53b44 100644 --- a/g3doc/design_philosophy.md +++ b/g3doc/design_philosophy.md @@ -67,8 +67,7 @@ * Not every CPU need be supported. To reduce code size and compile time, we group x86 targets into clusters. In particular, SSE3 instructions are only used/available if S-SSE3 is also available, and AVX only if AVX2 is also - supported. Code generation for AVX3_DL also requires opting-in by defining - HWY_WANT_AVX3_DL. + supported. * Access to platform-specific intrinsics is necessary for acceptance in performance-critical projects. We provide conversions to and from intrinsics diff --git a/g3doc/quick_reference.md b/g3doc/quick_reference.md index e3a338d69a..af08fd14cf 100644 --- a/g3doc/quick_reference.md +++ b/g3doc/quick_reference.md @@ -304,8 +304,8 @@ Store(v, d2, ptr); // Use d2, NOT DFromV() ## Targets Let `Target` denote an instruction set, one of `SCALAR/EMU128`, `RVV`, -`SSE2/SSSE3/SSE4/AVX2/AVX3/AVX3_DL/AVX3_ZEN4/AVX3_SPR` (x86), -`PPC8/PPC9/PPC10/Z14/Z15` (POWER), `WASM/WASM_EMU256` (WebAssembly), +`SSE2/SSSE3/SSE4/AVX2/AVX3/AVX3_DL/AVX3_ZEN4/AVX3_SPR` (x86), `PPC8/PPC9/PPC10` +(POWER), `Z14/Z15` (IBM Z), `WASM/WASM_EMU256` (WebAssembly), `NEON_WITHOUT_AES/NEON/NEON_BF16/SVE/SVE2/SVE_256/SVE2_128` (Arm). 
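As an illustrative aside (not part of this patch): the targets listed above can
be inspected at runtime. The sketch below assumes `SupportedTargets()` and
`TargetName()` from `hwy/targets.h`, as declared in current releases.

```c++
#include <stdio.h>

#include "hwy/targets.h"  // SupportedTargets, TargetName, HWY_STATIC_TARGET

// Lower target bits denote newer instruction sets, so the best target the
// current CPU supports is the lowest set bit of the SupportedTargets() mask.
void ReportTargets() {
  printf("static target  : %s\n", hwy::TargetName(HWY_STATIC_TARGET));
  const int64_t bits = hwy::SupportedTargets();
  printf("best at runtime: %s\n", hwy::TargetName(bits & (-bits)));
}
```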
Note that x86 CPUs are segmented into dozens of feature flags and capabilities, @@ -349,9 +349,6 @@ instructions (implying the target CPU must support them). if they are not marked as available by the compiler. On MSVC, the only ways to enable SSSE3 and SSE4 are defining these, or enabling AVX. -* `HWY_WANT_AVX3_DL`: opt-in for dynamic dispatch to `HWY_AVX3_DL`. This is - unnecessary if the baseline already includes AVX3_DL. - You can detect and influence the set of supported targets: * `TargetName(t)` returns a string literal identifying the single target `t`, diff --git a/hwy/detect_targets.h b/hwy/detect_targets.h index abd2ad8af7..0d277d0d92 100644 --- a/hwy/detect_targets.h +++ b/hwy/detect_targets.h @@ -63,15 +63,14 @@ #define HWY_AVX10_2_512 (1LL << 3) // AVX10.2 with 512-bit vectors #define HWY_AVX3_SPR (1LL << 4) #define HWY_AVX10_2 (1LL << 5) // AVX10.2 with 256-bit vectors -// Currently HWY_AVX3_DL plus AVX512BF16 and a special case for CompressStore -// (10x as fast). -// We may later also use VPCONFLICT. +// Currently `HWY_AVX3_DL` plus `AVX512BF16` and a special case for +// `CompressStore` (10x as fast, still useful on Zen5). We may later also use +// `VPCONFLICT`. Note that `VP2INTERSECT` is available in Zen5. #define HWY_AVX3_ZEN4 (1LL << 6) // see HWY_WANT_AVX3_ZEN4 below -// Currently satisfiable by Ice Lake (VNNI, VPCLMULQDQ, VPOPCNTDQ, VBMI, VBMI2, -// VAES, BITALG, GFNI). Later to be added: BF16 (Cooper Lake). VP2INTERSECT is -// only in Tiger Lake? -#define HWY_AVX3_DL (1LL << 7) // see HWY_WANT_AVX3_DL below +// Currently satisfiable by Ice Lake (`VNNI`, `VPCLMULQDQ`, `VPOPCNTDQ`, +// `VBMI`, `VBMI2`, `VAES`, `BITALG`, `GFNI`). +#define HWY_AVX3_DL (1LL << 7) #define HWY_AVX3 (1LL << 8) // HWY_AVX2 plus AVX-512F/BW/CD/DQ/VL #define HWY_AVX2 (1LL << 9) // HWY_SSE4 plus BMI2 + F16 + FMA // Bit 10: reserved @@ -726,15 +725,6 @@ #endif #endif // HWY_HAVE_RUNTIME_DISPATCH -// AVX3_DL is not widely available yet. To reduce code size and compile time, -// only include it in the set of attainable targets (for dynamic dispatch) if -// the user opts in, OR it is in the baseline (we check whether enabled below). 
-#if defined(HWY_WANT_AVX3_DL) || (HWY_BASELINE_TARGETS & HWY_AVX3_DL) -#define HWY_ATTAINABLE_AVX3_DL (HWY_AVX3_DL) -#else -#define HWY_ATTAINABLE_AVX3_DL 0 -#endif - #if HWY_ARCH_ARM_A64 && HWY_HAVE_RUNTIME_DISPATCH #define HWY_ATTAINABLE_NEON HWY_ALL_NEON #elif HWY_ARCH_ARM // static dispatch, or HWY_ARCH_ARM_V7 @@ -803,9 +793,9 @@ #define HWY_ATTAINABLE_TARGETS_X86 \ HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_STATIC_TARGET | HWY_AVX2) #else // !HWY_COMPILER_MSVC -#define HWY_ATTAINABLE_TARGETS_X86 \ - HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_SSE2 | HWY_SSSE3 | HWY_SSE4 | \ - HWY_AVX2 | HWY_AVX3 | HWY_ATTAINABLE_AVX3_DL | HWY_AVX3_ZEN4 | \ +#define HWY_ATTAINABLE_TARGETS_X86 \ + HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_SSE2 | HWY_SSSE3 | HWY_SSE4 | \ + HWY_AVX2 | HWY_AVX3 | HWY_AVX3_DL | HWY_AVX3_ZEN4 | \ HWY_AVX3_SPR) #endif // !HWY_COMPILER_MSVC #endif // HWY_ATTAINABLE_TARGETS_X86 From 960f74de6c485eec2dba173b0a4c8d84e7ac86a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jadrian=20Sardi=C3=B1as?= Date: Tue, 28 Jan 2025 07:07:39 -0800 Subject: [PATCH 48/64] no longer require opt-in for AVX3_DL PiperOrigin-RevId: 720561925 --- README.md | 9 +++++---- g3doc/design_philosophy.md | 3 ++- g3doc/quick_reference.md | 7 +++++-- hwy/detect_targets.h | 28 +++++++++++++++++++--------- 4 files changed, 31 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index c16c6ee73b..f6a0b8ad0d 100644 --- a/README.md +++ b/README.md @@ -161,10 +161,11 @@ Highway supports 24 targets, listed in alphabetical order of platform: - `SSE4` (~Nehalem, also includes AES + CLMUL). - `AVX2` (~Haswell, also includes BMI2 + F16 + FMA) - `AVX3` (~Skylake, AVX-512F/BW/CD/DQ/VL) - - `AVX3_DL` (~Icelake, includes `BitAlg` + `CLMUL` + `GFNI` + `VAES` + - `VBMI` + `VBMI2` + `VNNI` + `VPOPCNT`), - - `AVX3_ZEN4` (AVX3_DL plus BF16, optimized for AMD Zen4; requires opt-in - by defining `HWY_WANT_AVX3_ZEN4` if compiling for static dispatch, but + - `AVX3_DL` (~Icelake, includes BitAlg + CLMUL + GFNI + VAES + VBMI + + VBMI2 + VNNI + VPOPCNT; requires opt-in by defining `HWY_WANT_AVX3_DL` + unless compiling for static dispatch), + - `AVX3_ZEN4` (like AVX3_DL but optimized for AMD Zen4; requires opt-in by + defining `HWY_WANT_AVX3_ZEN4` if compiling for static dispatch, but enabled by default for runtime dispatch), - `AVX3_SPR` (~Sapphire Rapids, includes AVX-512FP16) diff --git a/g3doc/design_philosophy.md b/g3doc/design_philosophy.md index 17b8d53b44..c541b3a701 100644 --- a/g3doc/design_philosophy.md +++ b/g3doc/design_philosophy.md @@ -67,7 +67,8 @@ * Not every CPU need be supported. To reduce code size and compile time, we group x86 targets into clusters. In particular, SSE3 instructions are only used/available if S-SSE3 is also available, and AVX only if AVX2 is also - supported. + supported. Code generation for AVX3_DL also requires opting-in by defining + HWY_WANT_AVX3_DL. * Access to platform-specific intrinsics is necessary for acceptance in performance-critical projects. 
We provide conversions to and from intrinsics diff --git a/g3doc/quick_reference.md b/g3doc/quick_reference.md index af08fd14cf..e3a338d69a 100644 --- a/g3doc/quick_reference.md +++ b/g3doc/quick_reference.md @@ -304,8 +304,8 @@ Store(v, d2, ptr); // Use d2, NOT DFromV() ## Targets Let `Target` denote an instruction set, one of `SCALAR/EMU128`, `RVV`, -`SSE2/SSSE3/SSE4/AVX2/AVX3/AVX3_DL/AVX3_ZEN4/AVX3_SPR` (x86), `PPC8/PPC9/PPC10` -(POWER), `Z14/Z15` (IBM Z), `WASM/WASM_EMU256` (WebAssembly), +`SSE2/SSSE3/SSE4/AVX2/AVX3/AVX3_DL/AVX3_ZEN4/AVX3_SPR` (x86), +`PPC8/PPC9/PPC10/Z14/Z15` (POWER), `WASM/WASM_EMU256` (WebAssembly), `NEON_WITHOUT_AES/NEON/NEON_BF16/SVE/SVE2/SVE_256/SVE2_128` (Arm). Note that x86 CPUs are segmented into dozens of feature flags and capabilities, @@ -349,6 +349,9 @@ instructions (implying the target CPU must support them). if they are not marked as available by the compiler. On MSVC, the only ways to enable SSSE3 and SSE4 are defining these, or enabling AVX. +* `HWY_WANT_AVX3_DL`: opt-in for dynamic dispatch to `HWY_AVX3_DL`. This is + unnecessary if the baseline already includes AVX3_DL. + You can detect and influence the set of supported targets: * `TargetName(t)` returns a string literal identifying the single target `t`, diff --git a/hwy/detect_targets.h b/hwy/detect_targets.h index 0d277d0d92..abd2ad8af7 100644 --- a/hwy/detect_targets.h +++ b/hwy/detect_targets.h @@ -63,14 +63,15 @@ #define HWY_AVX10_2_512 (1LL << 3) // AVX10.2 with 512-bit vectors #define HWY_AVX3_SPR (1LL << 4) #define HWY_AVX10_2 (1LL << 5) // AVX10.2 with 256-bit vectors -// Currently `HWY_AVX3_DL` plus `AVX512BF16` and a special case for -// `CompressStore` (10x as fast, still useful on Zen5). We may later also use -// `VPCONFLICT`. Note that `VP2INTERSECT` is available in Zen5. +// Currently HWY_AVX3_DL plus AVX512BF16 and a special case for CompressStore +// (10x as fast). +// We may later also use VPCONFLICT. #define HWY_AVX3_ZEN4 (1LL << 6) // see HWY_WANT_AVX3_ZEN4 below -// Currently satisfiable by Ice Lake (`VNNI`, `VPCLMULQDQ`, `VPOPCNTDQ`, -// `VBMI`, `VBMI2`, `VAES`, `BITALG`, `GFNI`). -#define HWY_AVX3_DL (1LL << 7) +// Currently satisfiable by Ice Lake (VNNI, VPCLMULQDQ, VPOPCNTDQ, VBMI, VBMI2, +// VAES, BITALG, GFNI). Later to be added: BF16 (Cooper Lake). VP2INTERSECT is +// only in Tiger Lake? +#define HWY_AVX3_DL (1LL << 7) // see HWY_WANT_AVX3_DL below #define HWY_AVX3 (1LL << 8) // HWY_AVX2 plus AVX-512F/BW/CD/DQ/VL #define HWY_AVX2 (1LL << 9) // HWY_SSE4 plus BMI2 + F16 + FMA // Bit 10: reserved @@ -725,6 +726,15 @@ #endif #endif // HWY_HAVE_RUNTIME_DISPATCH +// AVX3_DL is not widely available yet. To reduce code size and compile time, +// only include it in the set of attainable targets (for dynamic dispatch) if +// the user opts in, OR it is in the baseline (we check whether enabled below). 
+#if defined(HWY_WANT_AVX3_DL) || (HWY_BASELINE_TARGETS & HWY_AVX3_DL) +#define HWY_ATTAINABLE_AVX3_DL (HWY_AVX3_DL) +#else +#define HWY_ATTAINABLE_AVX3_DL 0 +#endif + #if HWY_ARCH_ARM_A64 && HWY_HAVE_RUNTIME_DISPATCH #define HWY_ATTAINABLE_NEON HWY_ALL_NEON #elif HWY_ARCH_ARM // static dispatch, or HWY_ARCH_ARM_V7 @@ -793,9 +803,9 @@ #define HWY_ATTAINABLE_TARGETS_X86 \ HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_STATIC_TARGET | HWY_AVX2) #else // !HWY_COMPILER_MSVC -#define HWY_ATTAINABLE_TARGETS_X86 \ - HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_SSE2 | HWY_SSSE3 | HWY_SSE4 | \ - HWY_AVX2 | HWY_AVX3 | HWY_AVX3_DL | HWY_AVX3_ZEN4 | \ +#define HWY_ATTAINABLE_TARGETS_X86 \ + HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_SSE2 | HWY_SSSE3 | HWY_SSE4 | \ + HWY_AVX2 | HWY_AVX3 | HWY_ATTAINABLE_AVX3_DL | HWY_AVX3_ZEN4 | \ HWY_AVX3_SPR) #endif // !HWY_COMPILER_MSVC #endif // HWY_ATTAINABLE_TARGETS_X86 From 9e30869855f1a4b29d3118191f96c37b788147e4 Mon Sep 17 00:00:00 2001 From: Mohammad Azim Khan Date: Sun, 10 Nov 2024 18:42:40 +0000 Subject: [PATCH 49/64] Promote and round operations --- g3doc/quick_reference.md | 32 +++++ hwy/ops/arm_sve-inl.h | 33 +++++ hwy/ops/generic_ops-inl.h | 96 +++++++++++++++ hwy/tests/convert_test.cc | 253 ++++++++++++++++++++++++++++++++++++++ hwy/tests/demote_test.cc | 204 ++++++++++++++++++++++++++++++ 5 files changed, 618 insertions(+) diff --git a/g3doc/quick_reference.md b/g3doc/quick_reference.md index e3a338d69a..3fac0ff972 100644 --- a/g3doc/quick_reference.md +++ b/g3doc/quick_reference.md @@ -1819,6 +1819,17 @@ obtain the `D` that describes the return type. Vec<D> **DemoteTo**(D, V v): narrows float to half (for bf16, it is unspecified whether this truncates or rounds). +* `V`,`D`: (`f64,i32`), (`f32,f16`) \ + Vec<D> **DemoteCeilTo**(D, V v): Demotes a floating point + number to half-sized integral type with ceiling rounding. + +* `V`,`D`: (`f64,i32`), (`f32,f16`) \ + Vec<D> **DemoteFloorTo**(D, V v): Demotes a floating + point number to half-sized integral type with floor rounding. + +* Vec<D> **MaskedDemoteToOrZero**(M m, D d, V v): returns `v[i]` + demoted to `D` where m is active and returns zero otherwise. + #### Single vector promotion These functions promote a half vector to a full vector. To obtain halves, use @@ -1845,6 +1856,27 @@ These functions promote a half vector to a full vector. To obtain halves, use integer. Returns an implementation-defined value if the input exceeds the destination range. +* `V`: `f`, `D`:`{u,i,f}`\ + Vec<D> **PromoteCeilTo**(D, V part): rounds `part[i]` + up and converts the rounded value to a signed or unsigned integer. + Returns an implementation-defined value if the input exceeds the + destination range. + +* `V`: `f`, `D`:`{u,i,f}`\ + Vec<D> **PromoteFloorTo**(D, V part): rounds `part[i]` + down and converts the rounded value to a signed or unsigned integer. + Returns an implementation-defined value if the input exceeds the + destination range. + +* `V`: `f`, `D`:`{u,i,f}`\ + Vec<D> **PromoteToNearestInt **(D, V part): rounds + `part[i]` towards the nearest integer, with ties to even, and converts the + rounded value to a signed or unsigned integer. Returns an + implementation-defined value if the input exceeds the destination range. + +* Vec<D> **MaskedPromoteToOrZero**(M m, D d, V v): returns `v[i]` + widened to `D` where m is active and returns zero otherwise. 
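A minimal usage sketch of the promotions above (illustrative only; assumes the
usual static-dispatch setup and a target with 64-bit lanes):

```c++
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;  // static dispatch, for brevity

// Rounds each f32 toward +inf and widens to i64; also shows the masked form.
void RoundedWiden(const float* HWY_RESTRICT in, int64_t* HWY_RESTRICT ceil_out,
                  int64_t* HWY_RESTRICT masked_out) {
  const hn::ScalableTag<int64_t> di64;         // N lanes of i64
  const hn::Rebind<float, decltype(di64)> df;  // N lanes of f32 (half vector)
  const auto vf = hn::Load(df, in);            // reads Lanes(di64) floats
  // Out-of-range inputs are implementation-defined, as documented above.
  hn::Store(hn::PromoteCeilTo(di64, vf), di64, ceil_out);
  // Widens without rounding; all lanes beyond the first two become zero.
  hn::Store(hn::MaskedPromoteToOrZero(hn::FirstN(di64, 2), di64, vf), di64,
            masked_out);
}
```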
+ The following may be more convenient or efficient than also calling `LowerHalf` / `UpperHalf`: diff --git a/hwy/ops/arm_sve-inl.h b/hwy/ops/arm_sve-inl.h index 502440ebef..eb041a2d39 100644 --- a/hwy/ops/arm_sve-inl.h +++ b/hwy/ops/arm_sve-inl.h @@ -3074,6 +3074,39 @@ HWY_API VFromD DemoteToNearestInt(DI32 di32, return DemoteTo(di32, Round(v)); } +// ------------------------------ DemoteTo (Alternate Rounding) +#ifdef HWY_NATIVE_DEMOTE_CEIL_TO +#undef HWY_NATIVE_DEMOTE_CEIL_TO +#else +#define HWY_NATIVE_DEMOTE_CEIL_TO +#endif + +template +HWY_API VFromD DemoteCeilTo(DI32 di32, VFromD> v) { + return DemoteTo(di32, Ceil(v)); +} + +template +HWY_API VFromD DemoteCeilTo(D16 d16, VFromD> v) { + return DemoteTo(d16, Ceil(v)); +} + +#ifdef HWY_NATIVE_DEMOTE_FLOOR_TO +#undef HWY_NATIVE_DEMOTE_FLOOR_TO +#else +#define HWY_NATIVE_DEMOTE_FLOOR_TO +#endif + +template +HWY_API VFromD DemoteFloorTo(DI32 di32, VFromD> v) { + return DemoteTo(di32, Floor(v)); +} + +template +HWY_API VFromD DemoteFloorTo(D16 d16, VFromD> v) { + return DemoteTo(d16, Floor(v)); +} + // ------------------------------ Iota (Add, ConvertTo) #define HWY_SVE_IOTA(BASE, CHAR, BITS, HALF, NAME, OP) \ diff --git a/hwy/ops/generic_ops-inl.h b/hwy/ops/generic_ops-inl.h index cae00560dc..3c0406f943 100644 --- a/hwy/ops/generic_ops-inl.h +++ b/hwy/ops/generic_ops-inl.h @@ -3250,6 +3250,19 @@ HWY_API VFromD DemoteTo(D df16, VFromD> v) { #endif // HWY_NATIVE_F16C +// ------------------------------ PromoteTo F16->I +#if HWY_HAVE_FLOAT16 || HWY_IDE +template +HWY_API VFromD PromoteTo(D d, VFromD> v) { + return ConvertTo(d, PromoteTo(Rebind(), v)); +} + +template +HWY_API VFromD PromoteTo(D d, VFromD> v) { + return PromoteTo(d, PromoteTo(Rebind(), v)); +} +#endif + // ------------------------------ F64->F16 DemoteTo #if (defined(HWY_NATIVE_DEMOTE_F64_TO_F16) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_DEMOTE_F64_TO_F16 @@ -3383,6 +3396,53 @@ HWY_API VFromD ReorderDemote2To(D dbf16, VFromD> a, #endif // HWY_NATIVE_DEMOTE_F32_TO_BF16 +// ------------------------------ DemoteTo (Alternate Rounding) +#if (defined(HWY_NATIVE_DEMOTE_CEIL_TO) == defined(HWY_TARGET_TOGGLE)) +#ifdef HWY_NATIVE_DEMOTE_CEIL_TO +#undef HWY_NATIVE_DEMOTE_CEIL_TO +#else +#define HWY_NATIVE_DEMOTE_CEIL_TO +#endif + +#if HWY_HAVE_FLOAT64 +template +HWY_API VFromD DemoteCeilTo(D32 d32, VFromD> v) { + return DemoteTo(d32, Ceil(v)); +} +#endif // HWY_HAVE_FLOAT64 + +#if HWY_HAVE_FLOAT16 +template +HWY_API VFromD DemoteCeilTo(D16 d16, VFromD> v) { + return DemoteTo(d16, Ceil(v)); +} +#endif // HWY_HAVE_FLOAT16 + +#endif // HWY_NATIVE_DEMOTE_CEIL_TO + +#if (defined(HWY_NATIVE_DEMOTE_FLOOR_TO) == defined(HWY_TARGET_TOGGLE)) +#ifdef HWY_NATIVE_DEMOTE_FLOOR_TO +#undef HWY_NATIVE_DEMOTE_FLOOR_TO +#else +#define HWY_NATIVE_DEMOTE_FLOOR_TO +#endif + +#if HWY_HAVE_FLOAT64 +template +HWY_API VFromD DemoteFloorTo(D32 d32, VFromD> v) { + return DemoteTo(d32, Floor(v)); +} +#endif // HWY_HAVE_FLOAT64 + +#if HWY_HAVE_FLOAT16 +template +HWY_API VFromD DemoteFloorTo(D16 d16, VFromD> v) { + return DemoteTo(d16, Floor(v)); +} +#endif // HWY_HAVE_FLOAT16 + +#endif // HWY_NATIVE_DEMOTE_FLOOR_TO + // ------------------------------ PromoteInRangeTo #if (defined(HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO) == \ defined(HWY_TARGET_TOGGLE)) @@ -3514,6 +3574,24 @@ HWY_API VFromD PromoteInRangeOddTo(D d, V v) { } #endif // HWY_TARGET != HWY_SCALAR +// ------------------------------ PromoteCeilTo +template +HWY_API Vec PromoteCeilTo(DTo d, V v) { + return PromoteTo(d, Ceil(v)); +} + +// 
------------------------------ PromoteFloorTo +template +HWY_API Vec PromoteFloorTo(DTo d, V v) { + return PromoteTo(d, Floor(v)); +} + +// ------------------------------ PromoteToNearestInt +template +HWY_API Vec PromoteToNearestInt(DTo d, V v) { + return PromoteTo(d, Round(v)); +} + // ------------------------------ SumsOf2 #if HWY_TARGET != HWY_SCALAR || HWY_IDE @@ -4410,6 +4488,24 @@ HWY_API V MulAddSub(V mul, V x, V sub_or_add) { return MulAdd(mul, x, add); } +// ------------------------------ MaskedPromoteToOrZero +template +HWY_API VFromD MaskedPromoteToOrZero(M m, D d, V v) { + return IfThenElseZero(m, PromoteTo(d, v)); +} + +// ------------------------------ MaskedDemoteToOrZero +template +HWY_API VFromD MaskedDemoteToOrZero(M m, D d, V v) { + return IfThenElseZero(m, DemoteTo(d, v)); +} + +// ------------------------------ MaskedConvertToOrZero +template +HWY_API VFromD MaskedConvertToOrZero(M m, D d, V v) { + return IfThenElseZero(m, ConvertTo(d, v)); +} + // ------------------------------ Integer division #if (defined(HWY_NATIVE_INT_DIV) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_INT_DIV diff --git a/hwy/tests/convert_test.cc b/hwy/tests/convert_test.cc index f9faac6bfb..4eac790e95 100644 --- a/hwy/tests/convert_test.cc +++ b/hwy/tests/convert_test.cc @@ -146,6 +146,151 @@ HWY_NOINLINE void TestAllPromoteTo() { #endif } +template +struct TestPromoteRoundTo { + template + HWY_NOINLINE void operator()(T /*unused*/, D from_d) { + static_assert(sizeof(T) < sizeof(ToT), "Input type must be narrower"); + const Rebind to_d; + + const size_t N = Lanes(from_d); + auto from = AllocateAligned(N); + auto expected_ceil = AllocateAligned(N); + auto expected_floor = AllocateAligned(N); + auto expected_nearest_int = AllocateAligned(N); + HWY_ASSERT(from && expected_ceil && expected_floor && expected_nearest_int); + + RandomState rng; + for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { + for (size_t i = 0; i < N; ++i) { + const uint64_t bits = rng(); + CopyBytes(&bits, &from[i]); // not same size + expected_ceil[i] = + ConvertScalarTo(std::ceil(static_cast(from[i]))); + expected_floor[i] = + ConvertScalarTo(std::floor(static_cast(from[i]))); + expected_nearest_int[i] = + ConvertScalarTo(std::nearbyint(static_cast(from[i]))); + } + + auto input = Load(from_d, from.get()); + auto output_ceil = PromoteCeilTo(to_d, input); + auto output_floor = PromoteFloorTo(to_d, input); + auto output_nearest_int = PromoteToNearestInt(to_d, input); + + HWY_ASSERT_VEC_EQ(to_d, expected_ceil.get(), output_ceil); + HWY_ASSERT_VEC_EQ(to_d, expected_floor.get(), output_floor); + HWY_ASSERT_VEC_EQ(to_d, expected_nearest_int.get(), output_nearest_int); + } + } +}; + +HWY_NOINLINE void TestAllPromoteRoundTo() { +#if HWY_HAVE_FLOAT16 + const ForPromoteVectors, 1> to_i32div2; + to_i32div2(hwy::float16_t()); + + const ForPromoteVectors, 1> to_f32div2; + to_f32div2(hwy::float16_t()); +#endif // HWY_HAVE_FLOAT16 + +#if HWY_HAVE_FLOAT64 + const ForPromoteVectors, 1> to_f64div2; + to_f64div2(float()); +#endif // HWY_HAVE_FLOAT64 +} + +template +struct TestMaskedPromoteToOrZero { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + static_assert(sizeof(T) < sizeof(ToT), "Input type must be narrower"); + const Rebind to_d; + + const size_t N = Lanes(d); + auto expected = AllocateAligned(N); + auto bool_lanes = AllocateAligned(N); + HWY_ASSERT(expected && bool_lanes); + + const auto v1 = Iota(d, 5); + + RandomState rng; + + for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { + for (size_t i = 0; i < N; 
++i) { + bool_lanes[i] = (Random32(&rng) & 1024) ? T(1) : T(0); + + if (bool_lanes[i]) { + expected[i] = ConvertScalarTo(i + 5); + } else { + expected[i] = ConvertScalarTo(0); + } + } + + const auto mask_i = Load(to_d, bool_lanes.get()); + const auto mask = RebindMask(to_d, Gt(mask_i, Zero(to_d))); + + HWY_ASSERT_VEC_EQ(to_d, expected.get(), + MaskedPromoteToOrZero(mask, to_d, v1)); + } + } +}; + +HWY_NOINLINE void TestAllMaskedPromoteToOrZero() { + const ForPromoteVectors, 1> to_u16div2; + to_u16div2(uint8_t()); + + const ForPromoteVectors, 2> to_u32div4; + to_u32div4(uint8_t()); + + const ForPromoteVectors, 1> to_u32div2; + to_u32div2(uint16_t()); + + const ForPromoteVectors, 1> to_i16div2; + to_i16div2(uint8_t()); + to_i16div2(int8_t()); + + const ForPromoteVectors, 1> to_i32div2; + to_i32div2(uint16_t()); + to_i32div2(int16_t()); + + const ForPromoteVectors, 2> to_i32div4; + to_i32div4(uint8_t()); + to_i32div4(int8_t()); + + // Must test f16/bf16 separately because we can only load/store/convert them. + +#if HWY_HAVE_INTEGER64 + const ForPromoteVectors, 1> to_u64div2; + to_u64div2(uint32_t()); + + const ForPromoteVectors, 1> to_i64div2; + to_i64div2(int32_t()); + to_i64div2(uint32_t()); + + const ForPromoteVectors, 2> to_u64div4; + to_u64div4(uint16_t()); + + const ForPromoteVectors, 2> to_i64div4; + to_i64div4(int16_t()); + to_i64div4(uint16_t()); + + const ForPromoteVectors, 3> to_u64div8; + to_u64div8(uint8_t()); + + const ForPromoteVectors, 3> to_i64div8; + to_i64div8(int8_t()); + to_i64div8(uint8_t()); +#endif + +#if HWY_HAVE_FLOAT64 + const ForPromoteVectors, 1> to_f64div2; + to_f64div2(int32_t()); + to_f64div2(uint32_t()); + to_f64div2(float()); +#endif +} + template struct TestPromoteUpperLowerTo { template @@ -718,6 +863,111 @@ HWY_NOINLINE void TestAllIntFromFloat() { ForFloatTypes(ForPartialVectors()); } +struct TestMaskedIntFromFloat { + template + HWY_NOINLINE void operator()(TF /*unused*/, const DF df) { + using TI = MakeSigned; + const Rebind di; + const size_t N = Lanes(df); + auto expected = AllocateAligned(N); + auto bool_lanes = AllocateAligned(N); + HWY_ASSERT(expected && bool_lanes); + + RandomState rng; + for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { + for (size_t i = 0; i < N; ++i) { + bool_lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0); + } + const auto mask_i = Load(di, bool_lanes.get()); + const auto mask = RebindMask(di, Gt(mask_i, Zero(di))); + + // This requires a test different to that in TestMaskedFloatFromInt and + // TestMaskedFloatFromUint, due to differences in saturation handling + // between ConvertTo() and static_cast<> + HWY_ASSERT_VEC_EQ(di, IfThenElseZero(mask, Set(di, 1)), + MaskedConvertToOrZero(mask, di, Set(df, 1))); + } + } +}; + +struct TestMaskedFloatFromInt { + template + HWY_NOINLINE void operator()(TF /*unused*/, const DF df) { + using TI = MakeSigned; + const RebindToSigned di; + const size_t N = Lanes(df); + auto from = AllocateAligned(N); + auto expected = AllocateAligned(N); + auto bool_lanes = AllocateAligned(N); + HWY_ASSERT(from && expected && bool_lanes); + + RandomState rng; + for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { + for (size_t i = 0; i < N; ++i) { + const uint64_t bits = rng(); + CopyBytes(&bits, &from[i]); // not same size + + bool_lanes[i] = (Random32(&rng) & 1024) ? 
TF(1) : TF(0); + if (bool_lanes[i]) { + expected[i] = ConvertScalarTo(from[i]); + } else { + expected[i] = ConvertScalarTo(0); + } + } + const auto mask_i = Load(df, bool_lanes.get()); + const auto mask = RebindMask(df, Gt(mask_i, Zero(df))); + + const auto v1 = Load(di, from.get()); + + // Float from int + HWY_ASSERT_VEC_EQ(df, expected.get(), + MaskedConvertToOrZero(mask, df, v1)); + } + } +}; + +struct TestMaskedFloatFromUint { + template + HWY_NOINLINE void operator()(TF /*unused*/, const DF df) { + using TI = MakeUnsigned; + const RebindToUnsigned di; + const size_t N = Lanes(df); + auto from = AllocateAligned(N); + auto expected = AllocateAligned(N); + auto bool_lanes = AllocateAligned(N); + HWY_ASSERT(from && expected && bool_lanes); + + RandomState rng; + for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { + for (size_t i = 0; i < N; ++i) { + const uint64_t bits = rng(); + CopyBytes(&bits, &from[i]); // not same size + + bool_lanes[i] = (Random32(&rng) & 1024) ? TF(1) : TF(0); + if (bool_lanes[i]) { + expected[i] = ConvertScalarTo(from[i]); + } else { + expected[i] = ConvertScalarTo(0); + } + } + const auto mask_i = Load(df, bool_lanes.get()); + const auto mask = RebindMask(df, Gt(mask_i, Zero(df))); + + const auto v1 = Load(di, from.get()); + + // Float from int + HWY_ASSERT_VEC_EQ(df, expected.get(), + MaskedConvertToOrZero(mask, df, v1)); + } + } +}; + +HWY_NOINLINE void TestAllMaskedConvertToOrZero() { + ForFloatTypes(ForPartialVectors()); + ForFloatTypes(ForPartialVectors()); + ForFloatTypes(ForPartialVectors()); +} + class TestUintFromFloat { template static HWY_NOINLINE void TestPowers(TF /*unused*/, const DF df) { @@ -1451,6 +1701,8 @@ namespace { HWY_BEFORE_TEST(HwyConvertTest); HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllRebind); HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllPromoteTo); +HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllPromoteRoundTo); +HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllMaskedPromoteToOrZero); HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllPromoteUpperLowerTo); HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllPromoteOddEvenTo); HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllF16); @@ -1458,6 +1710,7 @@ HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllF16FromF64); HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllBF16); HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllConvertU8); HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllIntFromFloat); +HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllMaskedConvertToOrZero); HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllUintFromFloat); HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllFloatFromInt); HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllFloatFromUint); diff --git a/hwy/tests/demote_test.cc b/hwy/tests/demote_test.cc index 1deff1d7c5..e40a90842d 100644 --- a/hwy/tests/demote_test.cc +++ b/hwy/tests/demote_test.cc @@ -143,6 +143,107 @@ HWY_NOINLINE void TestAllDemoteToMixed() { #endif } +template +struct TestMaskedDemoteToOrZeroInt { + template + HWY_NOINLINE void operator()(T /*unused*/, D from_d) { + static_assert(!IsFloat(), "Use TestDemoteToFloat for float output"); + static_assert(sizeof(T) > sizeof(ToT), "Input type must be wider"); + const Rebind to_d; + + const size_t N = Lanes(from_d); + auto from = AllocateAligned(N); + auto expected = AllocateAligned(N); + auto bool_lanes = AllocateAligned(N); + HWY_ASSERT(from && expected && bool_lanes); + // Narrower range in the wider type, for clamping before we cast + const T min = ConvertScalarTo(IsSigned() ? 
LimitsMin() + : static_cast(0)); + const T max = LimitsMax(); + RandomState rng; + for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { + for (size_t i = 0; i < N; ++i) { + const uint64_t bits = rng(); + CopyBytes(&bits, &from[i]); // not same size + + bool_lanes[i] = (Random32(&rng) & 1024) ? ToT(1) : ToT(0); + if (bool_lanes[i]) { + expected[i] = static_cast(HWY_MIN(HWY_MAX(min, from[i]), max)); + + } else { + expected[i] = ConvertScalarTo(0); + } + } + + const auto mask_i = Load(to_d, bool_lanes.get()); + const auto mask = RebindMask(to_d, Gt(mask_i, Zero(to_d))); + + const auto v1 = Load(from_d, from.get()); + + HWY_ASSERT_VEC_EQ(to_d, expected.get(), + MaskedDemoteToOrZero(mask, to_d, v1)); + } + } +}; + +HWY_NOINLINE void TestAllMaskedDemoteToOrZeroInt() { + const ForDemoteVectors> from_i16_to_u8; + from_i16_to_u8(int16_t()); + from_i16_to_u8(uint16_t()); + + const ForDemoteVectors> from_i16_to_i8; + from_i16_to_i8(int16_t()); + from_i16_to_i8(uint16_t()); + + const ForDemoteVectors, 2> + from_i32_to_u8; + from_i32_to_u8(int32_t()); + from_i32_to_u8(uint32_t()); + + const ForDemoteVectors, 2> from_i32_to_i8; + from_i32_to_i8(int32_t()); + from_i32_to_i8(uint32_t()); + +#if HWY_HAVE_INTEGER64 + const ForDemoteVectors, 3> + from_i64_to_u8; + from_i64_to_u8(int64_t()); + from_i64_to_u8(uint64_t()); + + const ForDemoteVectors, 3> from_i64_to_i8; + from_i64_to_i8(int64_t()); + from_i64_to_i8(uint64_t()); +#endif + + const ForDemoteVectors> from_i32_to_u16; + from_i32_to_u16(int32_t()); + from_i32_to_u16(uint32_t()); + + const ForDemoteVectors> from_i32_to_i16; + from_i32_to_i16(int32_t()); + from_i32_to_i16(uint32_t()); + +#if HWY_HAVE_INTEGER64 + const ForDemoteVectors, 2> + from_i64_to_u16; + from_i64_to_u16(int64_t()); + from_i64_to_u16(uint64_t()); + + const ForDemoteVectors, 2> + from_i64_to_i16; + from_i64_to_i16(int64_t()); + from_i64_to_i16(uint64_t()); + + const ForDemoteVectors> from_i64_to_u32; + from_i64_to_u32(int64_t()); + from_i64_to_u32(uint64_t()); + + const ForDemoteVectors> from_i64_to_i32; + from_i64_to_i32(int64_t()); + from_i64_to_i32(uint64_t()); +#endif +} + template struct TestDemoteToFloat { template @@ -467,6 +568,107 @@ AlignedFreeUniquePtr ReorderBF16TestCases(D d, size_t& padded) { return in; } +template +struct TestDemoteRoundFloatToFloat { + template + HWY_NOINLINE void operator()(T /*unused*/, D from_d) { + static_assert(sizeof(T) > sizeof(ToT), "Input type must be wider"); + const Rebind to_d; + + const size_t N = Lanes(from_d); + auto from = AllocateAligned(N); + auto expected_ceil = AllocateAligned(N); + auto expected_floor = AllocateAligned(N); + HWY_ASSERT(from && expected_ceil && expected_floor); + + RandomState rng; + for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) { + for (size_t i = 0; i < N; ++i) { + const uint64_t bits = rng(); + CopyBytes(&bits, &from[i]); // not same size + expected_ceil[i] = static_cast(std::ceil(from[i])); + expected_floor[i] = static_cast(std::floor(from[i])); + } + const auto in = Load(from_d, from.get()); + HWY_ASSERT_VEC_EQ(to_d, expected_ceil.get(), DemoteCeilTo(to_d, in)); + HWY_ASSERT_VEC_EQ(to_d, expected_floor.get(), DemoteFloorTo(to_d, in)); + } + } +}; + +template +struct TestDemoteRoundFloatToInt { + template + HWY_NOINLINE void operator()(T /*unused*/, D from_d) { + static_assert(!IsFloat(), "Use TestDemoteToFloat for float output"); + static_assert(sizeof(T) > sizeof(ToT), "Input type must be wider"); + const Rebind to_d; + + const size_t N = Lanes(from_d); + auto from = AllocateAligned(N); + auto 
from_ceil = AllocateAligned(N); + auto from_floor = AllocateAligned(N); + auto expected_ceil = AllocateAligned(N); + auto expected_floor = AllocateAligned(N); + HWY_ASSERT(from && from_ceil && from_floor && expected_ceil && + expected_floor); + + // Narrower range in the wider type, for clamping before we cast + const T min = ConvertScalarTo(IsSigned() ? LimitsMin() + : static_cast(0)); + const T max = LimitsMax(); + + RandomState rng; + for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) { + for (size_t i = 0; i < N; ++i) { + const uint64_t bits = rng(); + CopyBytes(&bits, &from[i]); // not same size + expected_ceil[i] = + static_cast(std::ceil((HWY_MIN(HWY_MAX(min, from[i]), max)))); + expected_floor[i] = + static_cast(std::floor((HWY_MIN(HWY_MAX(min, from[i]), max)))); + } + const auto in = Load(from_d, from.get()); + HWY_ASSERT_VEC_EQ(to_d, expected_ceil.get(), DemoteCeilTo(to_d, in)); + HWY_ASSERT_VEC_EQ(to_d, expected_floor.get(), DemoteFloorTo(to_d, in)); + } + + for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) { + for (size_t i = 0; i < N; ++i) { + const uint64_t bits = rng(); + CopyBytes(&bits, &expected_ceil[i]); // not same size + CopyBytes(&bits, &expected_floor[i]); // not same size + + if (!IsSigned() && IsSigned()) { + expected_ceil[i] &= static_cast(std::ceil((max))); + expected_floor[i] &= static_cast(std::floor((max))); + } + + from_ceil[i] = ConvertScalarTo(expected_ceil[i]); + from_floor[i] = ConvertScalarTo(expected_floor[i]); + } + + const auto in_ceil = Load(from_d, from_ceil.get()); + const auto in_floor = Load(from_d, from_floor.get()); + HWY_ASSERT_VEC_EQ(to_d, expected_ceil.get(), DemoteCeilTo(to_d, in_ceil)); + HWY_ASSERT_VEC_EQ(to_d, expected_floor.get(), + DemoteFloorTo(to_d, in_floor)); + } + } +}; + +HWY_NOINLINE void TestAllDemoteRoundTo() { +#if HWY_HAVE_FLOAT64 + const ForDemoteVectors> to_i32; + to_i32(double()); +#endif + +#if HWY_HAVE_FLOAT16 + const ForDemoteVectors> to_f16; + to_f16(float()); +#endif +} + class TestReorderDemote2To { // In-place N^2 selection sort to avoid dependencies void Sort(float* p, size_t count) { @@ -834,10 +1036,12 @@ namespace { #if !HWY_IS_MSAN HWY_BEFORE_TEST(HwyDemoteTest); HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteToInt); +HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllMaskedDemoteToOrZeroInt); HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteToMixed); HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteToFloat); HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteUI64ToFloat); HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteToBF16); +HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteRoundTo); HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllReorderDemote2To); HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllOrderedDemote2To); HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllI32F64); From ade9ee97947b336275f9f852abeb6a6e5db1bcd0 Mon Sep 17 00:00:00 2001 From: Mohammad Azim Khan Date: Fri, 22 Nov 2024 17:59:21 +0000 Subject: [PATCH 50/64] Add quick reference for MaskedConvertToOrZero --- g3doc/quick_reference.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/g3doc/quick_reference.md b/g3doc/quick_reference.md index 3fac0ff972..efc687796b 100644 --- a/g3doc/quick_reference.md +++ b/g3doc/quick_reference.md @@ -1779,6 +1779,9 @@ All functions except `Stream` are defined in cache_control.h. `DemoteToNearestInt(d, v)` is more efficient on some targets, including x86 and RVV. +* Vec<D> **MaskedConvertToOrZero**(M m, D d, V v): returns `v[i]` + converted to `D` where m is active and returns zero otherwise. 
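A brief usage sketch of the op documented above; it assumes only the generic `IfThenElseZero(m, ConvertTo(d, v))` fallback added in this series, and the helper name and mask construction are illustrative rather than part of the patch:

// Illustrative: convert strictly-positive float lanes to int32, zeroing the rest.
template <class DF>
Vec<RebindToSigned<DF>> PositiveToInt(DF df, Vec<DF> v) {
  const RebindToSigned<DF> di;
  // Build the mask in the destination domain expected by the op.
  const auto keep = RebindMask(di, Gt(v, Zero(df)));
  return MaskedConvertToOrZero(keep, di, v);  // 0 where keep[i] is false
}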
+ #### Single vector demotion These functions demote a full vector (or parts thereof) into a vector of half From 5c9c45b6ccc33f7a307324335cb1e1421bc71920 Mon Sep 17 00:00:00 2001 From: Mohammad Azim Khan Date: Fri, 22 Nov 2024 18:42:04 +0000 Subject: [PATCH 51/64] MaskedConvertToOrZero implementation for Arm SVE --- hwy/ops/arm_sve-inl.h | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/hwy/ops/arm_sve-inl.h b/hwy/ops/arm_sve-inl.h index eb041a2d39..de544ba485 100644 --- a/hwy/ops/arm_sve-inl.h +++ b/hwy/ops/arm_sve-inl.h @@ -3060,6 +3060,41 @@ HWY_API svfloat32_t DemoteTo(Simd d, const svuint64_t v) { HWY_SVE_FOREACH_F(HWY_SVE_CONVERT, ConvertTo, cvt) #undef HWY_SVE_CONVERT +// ------------------------------ MaskedConvertToOrZero F + +#define HWY_SVE_MASKED_CONVERT_TO_OR_ZERO(BASE, CHAR, BITS, HALF, NAME, OP) \ + /* Float from signed */ \ + template \ + HWY_API HWY_SVE_V(BASE, BITS) \ + NAME(svbool_t m, HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \ + HWY_SVE_V(int, BITS) v) { \ + return sv##OP##_##CHAR##BITS##_s##BITS##_z(m, v); \ + } \ + /* Float from unsigned */ \ + template \ + HWY_API HWY_SVE_V(BASE, BITS) \ + NAME(svbool_t m, HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \ + HWY_SVE_V(uint, BITS) v) { \ + return sv##OP##_##CHAR##BITS##_u##BITS##_z(m, v); \ + } \ + /* Signed from float, rounding toward zero */ \ + template \ + HWY_API HWY_SVE_V(int, BITS) \ + NAME(svbool_t m, HWY_SVE_D(int, BITS, N, kPow2) /* d */, \ + HWY_SVE_V(BASE, BITS) v) { \ + return sv##OP##_s##BITS##_##CHAR##BITS##_z(m, v); \ + } \ + /* Unsigned from float, rounding toward zero */ \ + template \ + HWY_API HWY_SVE_V(uint, BITS) \ + NAME(svbool_t m, HWY_SVE_D(uint, BITS, N, kPow2) /* d */, \ + HWY_SVE_V(BASE, BITS) v) { \ + return sv##OP##_u##BITS##_##CHAR##BITS##_z(m, v); \ + } + +HWY_SVE_FOREACH_F(HWY_SVE_MASKED_CONVERT_TO_OR_ZERO, MaskedConvertToOrZero, cvt) +#undef HWY_SVE_MASKED_CONVERT_TO_OR_ZERO + // ------------------------------ NearestInt (Round, ConvertTo) template >> HWY_API VFromD NearestInt(VF v) { From 7d9079b4616cb0ab3a208e0dfd62d7c0fef76d9e Mon Sep 17 00:00:00 2001 From: Will Barber Date: Fri, 24 Jan 2025 14:02:38 +0000 Subject: [PATCH 52/64] Fix review comments Remove OrZero suffixes for consistency Drop duplication of generic implementation --- g3doc/quick_reference.md | 6 +++--- hwy/ops/arm_sve-inl.h | 37 ++------------------------------ hwy/ops/generic_ops-inl.h | 12 +++++------ hwy/tests/convert_test.cc | 44 +++++++++++++++++++-------------------- hwy/tests/demote_test.cc | 32 ++++++++++++++-------------- 5 files changed, 49 insertions(+), 82 deletions(-) diff --git a/g3doc/quick_reference.md b/g3doc/quick_reference.md index efc687796b..5eeee364c4 100644 --- a/g3doc/quick_reference.md +++ b/g3doc/quick_reference.md @@ -1779,7 +1779,7 @@ All functions except `Stream` are defined in cache_control.h. `DemoteToNearestInt(d, v)` is more efficient on some targets, including x86 and RVV. -* Vec<D> **MaskedConvertToOrZero**(M m, D d, V v): returns `v[i]` +* Vec<D> **MaskedConvertTo**(M m, D d, V v): returns `v[i]` converted to `D` where m is active and returns zero otherwise. #### Single vector demotion @@ -1830,7 +1830,7 @@ obtain the `D` that describes the return type. Vec<D> **DemoteFloorTo**(D, V v): Demotes a floating point number to half-sized integral type with floor rounding. -* Vec<D> **MaskedDemoteToOrZero**(M m, D d, V v): returns `v[i]` +* Vec<D> **MaskedDemoteTo**(M m, D d, V v): returns `v[i]` demoted to `D` where m is active and returns zero otherwise. 
#### Single vector promotion @@ -1877,7 +1877,7 @@ These functions promote a half vector to a full vector. To obtain halves, use rounded value to a signed or unsigned integer. Returns an implementation-defined value if the input exceeds the destination range. -* Vec<D> **MaskedPromoteToOrZero**(M m, D d, V v): returns `v[i]` +* Vec<D> **MaskedPromoteTo**(M m, D d, V v): returns `v[i]` widened to `D` where m is active and returns zero otherwise. The following may be more convenient or efficient than also calling `LowerHalf` diff --git a/hwy/ops/arm_sve-inl.h b/hwy/ops/arm_sve-inl.h index de544ba485..a17cc418d9 100644 --- a/hwy/ops/arm_sve-inl.h +++ b/hwy/ops/arm_sve-inl.h @@ -3060,7 +3060,7 @@ HWY_API svfloat32_t DemoteTo(Simd d, const svuint64_t v) { HWY_SVE_FOREACH_F(HWY_SVE_CONVERT, ConvertTo, cvt) #undef HWY_SVE_CONVERT -// ------------------------------ MaskedConvertToOrZero F +// ------------------------------ MaskedConvertTo F #define HWY_SVE_MASKED_CONVERT_TO_OR_ZERO(BASE, CHAR, BITS, HALF, NAME, OP) \ /* Float from signed */ \ @@ -3092,7 +3092,7 @@ HWY_SVE_FOREACH_F(HWY_SVE_CONVERT, ConvertTo, cvt) return sv##OP##_u##BITS##_##CHAR##BITS##_z(m, v); \ } -HWY_SVE_FOREACH_F(HWY_SVE_MASKED_CONVERT_TO_OR_ZERO, MaskedConvertToOrZero, cvt) +HWY_SVE_FOREACH_F(HWY_SVE_MASKED_CONVERT_TO_OR_ZERO, MaskedConvertTo, cvt) #undef HWY_SVE_MASKED_CONVERT_TO_OR_ZERO // ------------------------------ NearestInt (Round, ConvertTo) @@ -3109,39 +3109,6 @@ HWY_API VFromD DemoteToNearestInt(DI32 di32, return DemoteTo(di32, Round(v)); } -// ------------------------------ DemoteTo (Alternate Rounding) -#ifdef HWY_NATIVE_DEMOTE_CEIL_TO -#undef HWY_NATIVE_DEMOTE_CEIL_TO -#else -#define HWY_NATIVE_DEMOTE_CEIL_TO -#endif - -template -HWY_API VFromD DemoteCeilTo(DI32 di32, VFromD> v) { - return DemoteTo(di32, Ceil(v)); -} - -template -HWY_API VFromD DemoteCeilTo(D16 d16, VFromD> v) { - return DemoteTo(d16, Ceil(v)); -} - -#ifdef HWY_NATIVE_DEMOTE_FLOOR_TO -#undef HWY_NATIVE_DEMOTE_FLOOR_TO -#else -#define HWY_NATIVE_DEMOTE_FLOOR_TO -#endif - -template -HWY_API VFromD DemoteFloorTo(DI32 di32, VFromD> v) { - return DemoteTo(di32, Floor(v)); -} - -template -HWY_API VFromD DemoteFloorTo(D16 d16, VFromD> v) { - return DemoteTo(d16, Floor(v)); -} - // ------------------------------ Iota (Add, ConvertTo) #define HWY_SVE_IOTA(BASE, CHAR, BITS, HALF, NAME, OP) \ diff --git a/hwy/ops/generic_ops-inl.h b/hwy/ops/generic_ops-inl.h index 3c0406f943..a0be523e25 100644 --- a/hwy/ops/generic_ops-inl.h +++ b/hwy/ops/generic_ops-inl.h @@ -4488,21 +4488,21 @@ HWY_API V MulAddSub(V mul, V x, V sub_or_add) { return MulAdd(mul, x, add); } -// ------------------------------ MaskedPromoteToOrZero +// ------------------------------ MaskedPromoteTo template -HWY_API VFromD MaskedPromoteToOrZero(M m, D d, V v) { +HWY_API VFromD MaskedPromoteTo(M m, D d, V v) { return IfThenElseZero(m, PromoteTo(d, v)); } -// ------------------------------ MaskedDemoteToOrZero +// ------------------------------ MaskedDemoteTo template -HWY_API VFromD MaskedDemoteToOrZero(M m, D d, V v) { +HWY_API VFromD MaskedDemoteTo(M m, D d, V v) { return IfThenElseZero(m, DemoteTo(d, v)); } -// ------------------------------ MaskedConvertToOrZero +// ------------------------------ MaskedConvertTo template -HWY_API VFromD MaskedConvertToOrZero(M m, D d, V v) { +HWY_API VFromD MaskedConvertTo(M m, D d, V v) { return IfThenElseZero(m, ConvertTo(d, v)); } diff --git a/hwy/tests/convert_test.cc b/hwy/tests/convert_test.cc index 4eac790e95..3a31b82ecd 100644 --- 
a/hwy/tests/convert_test.cc +++ b/hwy/tests/convert_test.cc @@ -201,7 +201,7 @@ HWY_NOINLINE void TestAllPromoteRoundTo() { } template -struct TestMaskedPromoteToOrZero { +struct TestMaskedPromoteTo { template HWY_NOINLINE void operator()(T /*unused*/, D d) { static_assert(sizeof(T) < sizeof(ToT), "Input type must be narrower"); @@ -231,60 +231,60 @@ struct TestMaskedPromoteToOrZero { const auto mask = RebindMask(to_d, Gt(mask_i, Zero(to_d))); HWY_ASSERT_VEC_EQ(to_d, expected.get(), - MaskedPromoteToOrZero(mask, to_d, v1)); + MaskedPromoteTo(mask, to_d, v1)); } } }; -HWY_NOINLINE void TestAllMaskedPromoteToOrZero() { - const ForPromoteVectors, 1> to_u16div2; +HWY_NOINLINE void TestAllMaskedPromoteTo() { + const ForPromoteVectors, 1> to_u16div2; to_u16div2(uint8_t()); - const ForPromoteVectors, 2> to_u32div4; + const ForPromoteVectors, 2> to_u32div4; to_u32div4(uint8_t()); - const ForPromoteVectors, 1> to_u32div2; + const ForPromoteVectors, 1> to_u32div2; to_u32div2(uint16_t()); - const ForPromoteVectors, 1> to_i16div2; + const ForPromoteVectors, 1> to_i16div2; to_i16div2(uint8_t()); to_i16div2(int8_t()); - const ForPromoteVectors, 1> to_i32div2; + const ForPromoteVectors, 1> to_i32div2; to_i32div2(uint16_t()); to_i32div2(int16_t()); - const ForPromoteVectors, 2> to_i32div4; + const ForPromoteVectors, 2> to_i32div4; to_i32div4(uint8_t()); to_i32div4(int8_t()); // Must test f16/bf16 separately because we can only load/store/convert them. #if HWY_HAVE_INTEGER64 - const ForPromoteVectors, 1> to_u64div2; + const ForPromoteVectors, 1> to_u64div2; to_u64div2(uint32_t()); - const ForPromoteVectors, 1> to_i64div2; + const ForPromoteVectors, 1> to_i64div2; to_i64div2(int32_t()); to_i64div2(uint32_t()); - const ForPromoteVectors, 2> to_u64div4; + const ForPromoteVectors, 2> to_u64div4; to_u64div4(uint16_t()); - const ForPromoteVectors, 2> to_i64div4; + const ForPromoteVectors, 2> to_i64div4; to_i64div4(int16_t()); to_i64div4(uint16_t()); - const ForPromoteVectors, 3> to_u64div8; + const ForPromoteVectors, 3> to_u64div8; to_u64div8(uint8_t()); - const ForPromoteVectors, 3> to_i64div8; + const ForPromoteVectors, 3> to_i64div8; to_i64div8(int8_t()); to_i64div8(uint8_t()); #endif #if HWY_HAVE_FLOAT64 - const ForPromoteVectors, 1> to_f64div2; + const ForPromoteVectors, 1> to_f64div2; to_f64div2(int32_t()); to_f64div2(uint32_t()); to_f64div2(float()); @@ -885,7 +885,7 @@ struct TestMaskedIntFromFloat { // TestMaskedFloatFromUint, due to differences in saturation handling // between ConvertTo() and static_cast<> HWY_ASSERT_VEC_EQ(di, IfThenElseZero(mask, Set(di, 1)), - MaskedConvertToOrZero(mask, di, Set(df, 1))); + MaskedConvertTo(mask, di, Set(df, 1))); } } }; @@ -921,7 +921,7 @@ struct TestMaskedFloatFromInt { // Float from int HWY_ASSERT_VEC_EQ(df, expected.get(), - MaskedConvertToOrZero(mask, df, v1)); + MaskedConvertTo(mask, df, v1)); } } }; @@ -957,12 +957,12 @@ struct TestMaskedFloatFromUint { // Float from int HWY_ASSERT_VEC_EQ(df, expected.get(), - MaskedConvertToOrZero(mask, df, v1)); + MaskedConvertTo(mask, df, v1)); } } }; -HWY_NOINLINE void TestAllMaskedConvertToOrZero() { +HWY_NOINLINE void TestAllMaskedConvertTo() { ForFloatTypes(ForPartialVectors()); ForFloatTypes(ForPartialVectors()); ForFloatTypes(ForPartialVectors()); @@ -1702,7 +1702,7 @@ HWY_BEFORE_TEST(HwyConvertTest); HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllRebind); HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllPromoteTo); HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllPromoteRoundTo); -HWY_EXPORT_AND_TEST_P(HwyConvertTest, 
TestAllMaskedPromoteToOrZero); +HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllMaskedPromoteTo); HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllPromoteUpperLowerTo); HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllPromoteOddEvenTo); HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllF16); @@ -1710,7 +1710,7 @@ HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllF16FromF64); HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllBF16); HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllConvertU8); HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllIntFromFloat); -HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllMaskedConvertToOrZero); +HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllMaskedConvertTo); HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllUintFromFloat); HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllFloatFromInt); HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllFloatFromUint); diff --git a/hwy/tests/demote_test.cc b/hwy/tests/demote_test.cc index e40a90842d..1cfe581c0a 100644 --- a/hwy/tests/demote_test.cc +++ b/hwy/tests/demote_test.cc @@ -144,7 +144,7 @@ HWY_NOINLINE void TestAllDemoteToMixed() { } template -struct TestMaskedDemoteToOrZeroInt { +struct TestMaskedDemoteToInt { template HWY_NOINLINE void operator()(T /*unused*/, D from_d) { static_assert(!IsFloat(), "Use TestDemoteToFloat for float output"); @@ -181,64 +181,64 @@ struct TestMaskedDemoteToOrZeroInt { const auto v1 = Load(from_d, from.get()); HWY_ASSERT_VEC_EQ(to_d, expected.get(), - MaskedDemoteToOrZero(mask, to_d, v1)); + MaskedDemoteTo(mask, to_d, v1)); } } }; -HWY_NOINLINE void TestAllMaskedDemoteToOrZeroInt() { - const ForDemoteVectors> from_i16_to_u8; +HWY_NOINLINE void TestAllMaskedDemoteToInt() { + const ForDemoteVectors> from_i16_to_u8; from_i16_to_u8(int16_t()); from_i16_to_u8(uint16_t()); - const ForDemoteVectors> from_i16_to_i8; + const ForDemoteVectors> from_i16_to_i8; from_i16_to_i8(int16_t()); from_i16_to_i8(uint16_t()); - const ForDemoteVectors, 2> + const ForDemoteVectors, 2> from_i32_to_u8; from_i32_to_u8(int32_t()); from_i32_to_u8(uint32_t()); - const ForDemoteVectors, 2> from_i32_to_i8; + const ForDemoteVectors, 2> from_i32_to_i8; from_i32_to_i8(int32_t()); from_i32_to_i8(uint32_t()); #if HWY_HAVE_INTEGER64 - const ForDemoteVectors, 3> + const ForDemoteVectors, 3> from_i64_to_u8; from_i64_to_u8(int64_t()); from_i64_to_u8(uint64_t()); - const ForDemoteVectors, 3> from_i64_to_i8; + const ForDemoteVectors, 3> from_i64_to_i8; from_i64_to_i8(int64_t()); from_i64_to_i8(uint64_t()); #endif - const ForDemoteVectors> from_i32_to_u16; + const ForDemoteVectors> from_i32_to_u16; from_i32_to_u16(int32_t()); from_i32_to_u16(uint32_t()); - const ForDemoteVectors> from_i32_to_i16; + const ForDemoteVectors> from_i32_to_i16; from_i32_to_i16(int32_t()); from_i32_to_i16(uint32_t()); #if HWY_HAVE_INTEGER64 - const ForDemoteVectors, 2> + const ForDemoteVectors, 2> from_i64_to_u16; from_i64_to_u16(int64_t()); from_i64_to_u16(uint64_t()); - const ForDemoteVectors, 2> + const ForDemoteVectors, 2> from_i64_to_i16; from_i64_to_i16(int64_t()); from_i64_to_i16(uint64_t()); - const ForDemoteVectors> from_i64_to_u32; + const ForDemoteVectors> from_i64_to_u32; from_i64_to_u32(int64_t()); from_i64_to_u32(uint64_t()); - const ForDemoteVectors> from_i64_to_i32; + const ForDemoteVectors> from_i64_to_i32; from_i64_to_i32(int64_t()); from_i64_to_i32(uint64_t()); #endif @@ -1036,7 +1036,7 @@ namespace { #if !HWY_IS_MSAN HWY_BEFORE_TEST(HwyDemoteTest); HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteToInt); -HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllMaskedDemoteToOrZeroInt); 
+HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllMaskedDemoteToInt); HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteToMixed); HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteToFloat); HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteUI64ToFloat); From bb045cc341ef129e1c8e17eedbb7d790ed9c03fc Mon Sep 17 00:00:00 2001 From: Mohammad Azim Khan Date: Fri, 15 Nov 2024 22:35:08 +0000 Subject: [PATCH 53/64] Float operations SqrtLower, MulSubAdd, GetExponent etc. --- g3doc/quick_reference.md | 27 +++++++ hwy/ops/arm_sve-inl.h | 60 ++++++++++++++++ hwy/ops/generic_ops-inl.h | 77 ++++++++++++++++++++ hwy/tests/float_test.cc | 146 +++++++++++++++++++++++++++++++++++++- hwy/tests/fma_test.cc | 36 ++++++++++ 5 files changed, 345 insertions(+), 1 deletion(-) diff --git a/g3doc/quick_reference.md b/g3doc/quick_reference.md index e3a338d69a..6b23188586 100644 --- a/g3doc/quick_reference.md +++ b/g3doc/quick_reference.md @@ -657,6 +657,10 @@ from left to right, of the arguments passed to `Create{2-4}`. * `V`: `{f}` \ V **Sqrt**(V a): returns `sqrt(a[i])`. +* `V`: `{f}` \ + V **SqrtLower**(V a): returns `sqrt(a[0])` in lowest lane and + `a[i]` elsewhere. + * `V`: `{f}` \ V **ApproximateReciprocalSqrt**(V a): returns an approximation of `1.0 / sqrt(a[i])`. `sqrt(a) ~= ApproximateReciprocalSqrt(a) * a`. x86 @@ -666,6 +670,10 @@ from left to right, of the arguments passed to `Create{2-4}`. V **ApproximateReciprocal**(V a): returns an approximation of `1.0 / a[i]`. +* `V`: `{f}` \ + V **GetExponent**(V v): returns the exponent of `v[i]` as a floating point value. + Essentially calculates `floor(log2(x))`. + #### Min/Max **Note**: Min/Max corner cases are target-specific and may change. If either @@ -864,6 +872,10 @@ variants are somewhat slower on Arm, and unavailable for integer inputs; if the c))` or `MulAddSub(a, b, OddEven(c, Neg(c))`, but `MulSub(a, b, c)` is more efficient on some targets (including AVX2/AVX3). +* V **MulSubAdd**(V a, V b, V c): returns `a[i] * b[i] + c[i]` in + the even lanes and `a[i] * b[i] - c[i]` in the odd lanes. Essentially, + MulAddSub with `c[i]` negated. + * `V`: `bf16`, `D`: `RepartitionToWide>`, `VW`: `Vec` \ VW **MulEvenAdd**(D d, V a, V b, VW c): equivalent to and potentially more efficient than `MulAdd(PromoteEvenTo(d, a), @@ -905,6 +917,21 @@ not a concern, these are equivalent to, and potentially more efficient than, b[i]` saturated to the minimum/maximum representable value, or `no[i]` if `m[i]` is false. +#### Zero masked arithmetic + +All ops in this section return `0` for `mask=false` lanes. These are equivalent +to, and potentially more efficient than, `IfThenElseZero(m, Add(a, b));` etc. + +* `V`: `{f}` \ + V **MaskedSqrtOrZero**(M m, V a): returns `sqrt(a[i])` where + m is true, and zero otherwise. +* `V`: `{f}` \ + V **MaskedApproximateReciprocalSqrtOrZero**(M m, V a): returns + the result of ApproximateReciprocalSqrt where m is true and zero otherwise. +* `V`: `{f}` \ + V **MaskedApproximateReciprocalOrZero**(M m, V a): returns the + result of ApproximateReciprocal where m is true and zero otherwise. + #### Shifts **Note**: Counts not in `[0, sizeof(T)*8)` yield implementation-defined results. 
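Before the implementation diffs that follow, a rough sketch of the per-lane behaviour of two of the newly documented ops; the lane values and the function name are illustrative, not taken from the patch:

// Illustrative only; call with a float descriptor such as ScalableTag<float>.
template <class DF>
void ExponentAndMulSubAddSketch(DF d) {
  const auto v = Set(d, 6.0f);    // |6| lies in [4, 8)
  const auto e = GetExponent(v);  // 2.0f in every lane: floor(log2(6)) == 2
  const auto r = MulSubAdd(Set(d, 3.0f), Set(d, 3.0f), Set(d, 1.0f));
  // r is 10.0f in even lanes (9 + 1) and 8.0f in odd lanes (9 - 1).
  (void)e;
  (void)r;
}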
diff --git a/hwy/ops/arm_sve-inl.h b/hwy/ops/arm_sve-inl.h index 502440ebef..772c91e08d 100644 --- a/hwy/ops/arm_sve-inl.h +++ b/hwy/ops/arm_sve-inl.h @@ -219,6 +219,15 @@ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SPECIALIZE, _, _) HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \ return sv##OP##_##CHAR##BITS(v); \ } +#define HWY_SVE_RETV_ARGMV_M(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_V(BASE, BITS) \ + NAME(svbool_t m, HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \ + return sv##OP##_##CHAR##BITS##_m(b, m, a); \ + } +#define HWY_SVE_RETV_ARGMV_Z(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_V(BASE, BITS) NAME(svbool_t m, HWY_SVE_V(BASE, BITS) a) { \ + return sv##OP##_##CHAR##BITS##_z(m, a); \ + } // vector = f(vector, scalar), e.g. detail::AddN #define HWY_SVE_RETV_ARGPVN(BASE, CHAR, BITS, HALF, NAME, OP) \ @@ -1234,6 +1243,29 @@ HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGV, ApproximateReciprocal, recpe) // ------------------------------ Sqrt HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Sqrt, sqrt) +// ------------------------------ MaskedSqrt +namespace detail { +HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGMV_M, MaskedSqrt, sqrt) +} + +// ------------------------------ SqrtLower +#ifdef HWY_NATIVE_SQRT_LOWER +#undef HWY_NATIVE_SQRT_LOWER +#else +#define HWY_NATIVE_SQRT_LOWER +#endif + +#define HWY_SVE_SQRT_LOWER(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) a) { \ + return detail::MaskedSqrt(svptrue_pat_b##BITS(SV_VL1), a, a); \ + } + +HWY_SVE_FOREACH_F(HWY_SVE_SQRT_LOWER, SqrtLower, _) +#undef HWY_SVE_SQRT_LOWER + +// ------------------------------ MaskedSqrtOrZero +HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGMV_Z, MaskedSqrtOrZero, sqrt) + // ------------------------------ ApproximateReciprocalSqrt #ifdef HWY_NATIVE_F64_APPROX_RSQRT #undef HWY_NATIVE_F64_APPROX_RSQRT @@ -3094,6 +3126,34 @@ HWY_API VFromD Iota(const D d, T2 first) { ConvertScalarTo>(first)); } +// ------------------------------ GetExponent + +#if HWY_SVE_HAVE_2 || HWY_IDE +#ifdef HWY_NATIVE_GET_EXPONENT +#undef HWY_NATIVE_GET_EXPONENT +#else +#define HWY_NATIVE_GET_EXPONENT +#endif + +namespace detail { +#define HWY_SVE_GET_EXP(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_V(int, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \ + return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v); \ + } +HWY_SVE_FOREACH_F(HWY_SVE_GET_EXP, GetExponent, logb) +#undef HWY_SVE_GET_EXP +} // namespace detail + +template +HWY_API V GetExponent(V v) { + const DFromV d; + const RebindToSigned di; + const VFromD exponent_int = detail::GetExponent(v); + // convert integer to original type + return ConvertTo(d, exponent_int); +} +#endif // HWY_SVE_HAVE_2 + // ------------------------------ InterleaveLower template diff --git a/hwy/ops/generic_ops-inl.h b/hwy/ops/generic_ops-inl.h index cae00560dc..3b2cb185a8 100644 --- a/hwy/ops/generic_ops-inl.h +++ b/hwy/ops/generic_ops-inl.h @@ -1175,6 +1175,34 @@ HWY_API V MulByFloorPow2(V v, V exp) { #endif // HWY_NATIVE_MUL_BY_POW2 +// ------------------------------ GetExponent + +#if (defined(HWY_NATIVE_GET_EXPONENT) == defined(HWY_TARGET_TOGGLE)) +#ifdef HWY_NATIVE_GET_EXPONENT +#undef HWY_NATIVE_GET_EXPONENT +#else +#define HWY_NATIVE_GET_EXPONENT +#endif + +template +HWY_API V GetExponent(V v) { + const DFromV d; + using T = TFromV; + const RebindToUnsigned du; + const RebindToSigned di; + + constexpr uint8_t mantissa_bits = MantissaBits(); + const auto exponent_offset = Set(di, MaxExponentField() >> 1); + + // extract exponent bits as integer + 
const auto encoded_exponent = ShiftRight(BitCast(du, Abs(v))); + const auto exponent_int = Sub(BitCast(di, encoded_exponent), exponent_offset); + + // convert integer to original type + return ConvertTo(d, exponent_int); +} + +#endif // HWY_NATIVE_GET_EXPONENT // ------------------------------ LoadInterleaved2 #if HWY_IDE || \ @@ -4409,6 +4437,19 @@ HWY_API V MulAddSub(V mul, V x, V sub_or_add) { OddEven(sub_or_add, BitCast(d, Neg(BitCast(d_negate, sub_or_add)))); return MulAdd(mul, x, add); } +// ------------------------------ MulSubAdd + +template +HWY_API V MulSubAdd(V mul, V x, V sub_or_add) { + using D = DFromV; + using T = TFromD; + using TNegate = If(), MakeSigned, T>; + + const D d; + const Rebind d_negate; + + return MulAddSub(mul, x, BitCast(d, Neg(BitCast(d_negate, sub_or_add)))); +} // ------------------------------ Integer division #if (defined(HWY_NATIVE_INT_DIV) == defined(HWY_TARGET_TOGGLE)) @@ -5234,6 +5275,30 @@ HWY_API VFromD SatWidenMulAccumFixedPoint(DI32 di32, #endif // HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT +// ------------------------------ SqrtLower +#if (defined(HWY_NATIVE_SQRT_LOWER) == defined(HWY_TARGET_TOGGLE)) +#ifdef HWY_NATIVE_SQRT_LOWER +#undef HWY_NATIVE_SQRT_LOWER +#else +#define HWY_NATIVE_SQRT_LOWER +#endif + +template +HWY_API V SqrtLower(V a) { + const DFromV d; + const auto first_mask = FirstN(d, 1); + return IfThenElse(first_mask, Sqrt(a), a); +} + +#undef HWY_SVE_SQRT_LOWER +#endif // HWY_NATIVE_SQRT_LOWER + +// ------------------------------ MaskedSqrtOrZero +template +HWY_API V MaskedSqrtOrZero(M m, V v) { + return IfThenElseZero(m, Sqrt(v)); +} + // ------------------------------ SumOfMulQuadAccumulate #if (defined(HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE) == \ @@ -5418,6 +5483,12 @@ HWY_API V ApproximateReciprocal(V v) { #endif // HWY_NATIVE_F64_APPROX_RECIP +// ------------------------------ MaskedApproximateReciprocalOrZero +template +HWY_API V MaskedApproximateReciprocalOrZero(M m, V v) { + return IfThenElseZero(m, ApproximateReciprocal(v)); +} + // ------------------------------ F64 ApproximateReciprocalSqrt #if (defined(HWY_NATIVE_F64_APPROX_RSQRT) == defined(HWY_TARGET_TOGGLE)) @@ -5443,6 +5514,12 @@ HWY_API V ApproximateReciprocalSqrt(V v) { #endif // HWY_NATIVE_F64_APPROX_RSQRT +// ------------------------------ MaskedApproximateReciprocalSqrtOrZero +template +HWY_API V MaskedApproximateReciprocalSqrtOrZero(M m, V v) { + return IfThenElseZero(m, ApproximateReciprocalSqrt(v)); +} + // ------------------------------ Compress* #if (defined(HWY_NATIVE_COMPRESS8) == defined(HWY_TARGET_TOGGLE)) diff --git a/hwy/tests/float_test.cc b/hwy/tests/float_test.cc index 997332cf1b..5a03bfcedc 100644 --- a/hwy/tests/float_test.cc +++ b/hwy/tests/float_test.cc @@ -17,7 +17,7 @@ #include -#include // std::ceil, std::floor +#include // std::ceil, std::floor, std::log2 #include "hwy/base.h" @@ -166,6 +166,52 @@ HWY_NOINLINE void TestAllApproximateReciprocal() { ForFloatTypes(ForPartialVectors()); } +struct TestMaskedApproximateReciprocal { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const MFromD first_three = FirstN(d, 3); + const auto v = Iota(d, -2); + const auto nonzero = + IfThenElse(Eq(v, Zero(d)), Set(d, ConvertScalarTo(1)), v); + const size_t N = Lanes(d); + auto input = AllocateAligned(N); + auto actual = AllocateAligned(N); + HWY_ASSERT(input && actual); + + Store(nonzero, d, input.get()); + Store(MaskedApproximateReciprocalOrZero(first_three, nonzero), d, actual.get()); + + double max_l1 = 0.0; + double worst_expected = 
0.0; + double worst_actual = 0.0; + double expected; + for (size_t i = 0; i < N; ++i) { + if (i < 3) { + expected = 1.0 / input[i]; + } else { + expected = 0.0; + } + const double l1 = ScalarAbs(expected - actual[i]); + if (l1 > max_l1) { + max_l1 = l1; + worst_expected = expected; + worst_actual = actual[i]; + } + } + const double abs_worst_expected = ScalarAbs(worst_expected); + if (abs_worst_expected > 1E-5) { + const double max_rel = max_l1 / abs_worst_expected; + fprintf(stderr, "max l1 %f rel %f (%f vs %f)\n", max_l1, max_rel, + worst_expected, worst_actual); + HWY_ASSERT(max_rel < 0.004); + } + } +}; + +HWY_NOINLINE void TestAllMaskedApproximateReciprocal() { + ForFloatTypes(ForPartialVectors()); +} + struct TestSquareRoot { template HWY_NOINLINE void operator()(T /*unused*/, D d) { @@ -178,6 +224,47 @@ HWY_NOINLINE void TestAllSquareRoot() { ForFloatTypes(ForPartialVectors()); } +struct TestSqrtLower { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto vi = Iota(d, 4); + + const size_t N = Lanes(d); + auto expected = AllocateAligned(N); + + for (size_t i = 0; i < N; ++i) { + if (i == 0) { + expected[i] = ConvertScalarTo(2); // sqrt(4) + } else { + expected[i] = ConvertScalarTo(i + 4); + } + } + + HWY_ASSERT_VEC_EQ(d, expected.get(), SqrtLower(vi)); + } +}; + +HWY_NOINLINE void TestAllSqrtLower() { + ForFloatTypes(ForPartialVectors()); +} + +struct TestMaskedSqrt { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto v0 = Zero(d); + const auto vi = Iota(d, 4); + + const MFromD first_four = FirstN(d, 4); + const auto expected = IfThenElse(first_four, Sqrt(vi), v0); + + HWY_ASSERT_VEC_EQ(d, expected, MaskedSqrtOrZero(first_four, vi)); + } +}; + +HWY_NOINLINE void TestAllMaskedSqrt() { + ForFloatTypes(ForPartialVectors()); +} + struct TestReciprocalSquareRoot { template HWY_NOINLINE void operator()(T /*unused*/, D d) { @@ -202,6 +289,35 @@ HWY_NOINLINE void TestAllReciprocalSquareRoot() { ForFloatTypes(ForPartialVectors()); } +struct TestMaskedReciprocalSquareRoot { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const Vec v = Set(d, ConvertScalarTo(123.0f)); + const MFromD first_three = FirstN(d, 3); + const size_t N = Lanes(d); + auto lanes = AllocateAligned(N); + HWY_ASSERT(lanes); + Store(MaskedApproximateReciprocalSqrtOrZero(first_three, v), d, + lanes.get()); + for (size_t i = 0; i < N; ++i) { + T expected_val = i < 3 ? 
ConvertScalarTo(1 / std::sqrt(123.0f)) + : ConvertScalarTo(0); + T err = + ConvertScalarTo(ConvertScalarTo(lanes[i]) - expected_val); + if (err < ConvertScalarTo(0)) err = -err; + if (static_cast(err) >= 4E-4) { + HWY_ABORT("Lane %d (%d): actual %f err %f\n", static_cast(i), + static_cast(N), static_cast(lanes[i]), + static_cast(err)); + } + } + } +}; + +HWY_NOINLINE void TestAllMaskedReciprocalSquareRoot() { + ForFloatTypes(ForPartialVectors()); +} + template AlignedFreeUniquePtr RoundTestCases(T /*unused*/, D d, size_t& padded) { const T eps = Epsilon(); @@ -506,6 +622,29 @@ HWY_NOINLINE void TestAllAbsDiff() { ForFloatTypes(ForPartialVectors()); } +// Test GetExponent +struct TestGetExponent { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + + auto v = Iota(d, 1); + + auto expected = AllocateAligned(N); + HWY_ASSERT(expected); + + for (size_t i = 0; i < N; ++i) { + auto test_val = (float)(i + 1); + expected[i] = ConvertScalarTo(std::floor(std::log2(test_val))); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), GetExponent(v)); + } +}; + +HWY_NOINLINE void TestAllGetExponent() { + ForFloatTypes(ForPartialVectors()); +} + } // namespace // NOLINTNEXTLINE(google-readability-namespace-comments) } // namespace HWY_NAMESPACE @@ -521,7 +660,11 @@ HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllF32FromF16); HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllDiv); HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllApproximateReciprocal); HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllSquareRoot); +HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllSqrtLower); +HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllMaskedSqrt); HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllReciprocalSquareRoot); +HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllMaskedReciprocalSquareRoot); +HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllMaskedApproximateReciprocal); HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllRound); HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllNearestInt); HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllDemoteToNearestInt); @@ -529,6 +672,7 @@ HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllTrunc); HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllCeil); HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllFloor); HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllAbsDiff); +HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllGetExponent); HWY_AFTER_TEST(); } // namespace } // namespace hwy diff --git a/hwy/tests/fma_test.cc b/hwy/tests/fma_test.cc index 2217270854..0ef76f455e 100644 --- a/hwy/tests/fma_test.cc +++ b/hwy/tests/fma_test.cc @@ -166,6 +166,41 @@ HWY_NOINLINE void TestAllMulAddSub() { ForAllTypes(ForPartialVectors()); } +struct TestMulSubAdd { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const Vec k0 = Zero(d); + const Vec v1 = Iota(d, 1); + const Vec v2 = Iota(d, 2); + + // Unlike RebindToSigned, we want to leave floating-point unchanged. + // This allows Neg for unsigned types. + const Rebind(), T, MakeSigned>, D> dif; + const Vec neg_v2 = BitCast(d, Neg(BitCast(dif, v2))); + + const size_t N = Lanes(d); + auto expected = AllocateAligned(N); + HWY_ASSERT(expected); + + HWY_ASSERT_VEC_EQ(d, k0, MulSubAdd(k0, k0, k0)); + + const auto v2_negated_if_odd = OddEven(neg_v2, v2); + HWY_ASSERT_VEC_EQ(d, v2_negated_if_odd, MulSubAdd(k0, v1, v2)); + HWY_ASSERT_VEC_EQ(d, v2_negated_if_odd, MulSubAdd(v1, k0, v2)); + + for (size_t i = 0; i < N; ++i) { + expected[i] = + ConvertScalarTo(((i & 1) == 0) ? 
((i + 2) * (i + 2) + (i + 1)) + : ((i + 2) * (i + 2) - (i + 1))); + } + HWY_ASSERT_VEC_EQ(d, expected.get(), MulSubAdd(v2, v2, v1)); + } +}; + +HWY_NOINLINE void TestAllMulSubAdd() { + ForAllTypes(ForPartialVectors()); +} + } // namespace // NOLINTNEXTLINE(google-readability-namespace-comments) } // namespace HWY_NAMESPACE @@ -179,6 +214,7 @@ HWY_BEFORE_TEST(HwyFmaTest); HWY_EXPORT_AND_TEST_P(HwyFmaTest, TestAllMulAdd); HWY_EXPORT_AND_TEST_P(HwyFmaTest, TestAllMulSub); HWY_EXPORT_AND_TEST_P(HwyFmaTest, TestAllMulAddSub); +HWY_EXPORT_AND_TEST_P(HwyFmaTest, TestAllMulSubAdd); HWY_AFTER_TEST(); } // namespace } // namespace hwy From e3c6c3bcf8e2279b0fe33cbc36cb1b2d9744e368 Mon Sep 17 00:00:00 2001 From: Will Barber Date: Fri, 24 Jan 2025 15:05:46 +0000 Subject: [PATCH 54/64] Fix review comments Remove OrZero suffixes for consistency Convert SqrtLower into MaskedSqrtOr Add TODO comments about GetExponent to x86_512 and ppc_vsx --- g3doc/quick_reference.md | 13 ++++++------- hwy/ops/arm_sve-inl.h | 37 +++++++++++++++---------------------- hwy/ops/generic_ops-inl.h | 38 +++++++++++++++++--------------------- hwy/ops/ppc_vsx-inl.h | 3 +++ hwy/ops/x86_512-inl.h | 2 ++ hwy/tests/float_test.cc | 34 ++++++---------------------------- 6 files changed, 49 insertions(+), 78 deletions(-) diff --git a/g3doc/quick_reference.md b/g3doc/quick_reference.md index 6b23188586..f232e1dcd9 100644 --- a/g3doc/quick_reference.md +++ b/g3doc/quick_reference.md @@ -657,10 +657,6 @@ from left to right, of the arguments passed to `Create{2-4}`. * `V`: `{f}` \ V **Sqrt**(V a): returns `sqrt(a[i])`. -* `V`: `{f}` \ - V **SqrtLower**(V a): returns `sqrt(a[0])` in lowest lane and - `a[i]` elsewhere. - * `V`: `{f}` \ V **ApproximateReciprocalSqrt**(V a): returns an approximation of `1.0 / sqrt(a[i])`. `sqrt(a) ~= ApproximateReciprocalSqrt(a) * a`. x86 @@ -893,6 +889,9 @@ exceptions for those lanes if that is supported by the ISA. When exceptions are not a concern, these are equivalent to, and potentially more efficient than, `IfThenElse(m, Add(a, b), no);` etc. +* `V`: `{f}` \ + V **MaskedSqrtOr**(V no, M m, V a): returns `sqrt(a[i])` or + `no[i]` if `m[i]` is false. * V **MaskedMinOr**(V no, M m, V a, V b): returns `Min(a, b)[i]` or `no[i]` if `m[i]` is false. * V **MaskedMaxOr**(V no, M m, V a, V b): returns `Max(a, b)[i]` @@ -923,13 +922,13 @@ All ops in this section return `0` for `mask=false` lanes. These are equivalent to, and potentially more efficient than, `IfThenElseZero(m, Add(a, b));` etc. * `V`: `{f}` \ - V **MaskedSqrtOrZero**(M m, V a): returns `sqrt(a[i])` where + V **MaskedSqrt**(M m, V a): returns `sqrt(a[i])` where m is true, and zero otherwise. * `V`: `{f}` \ - V **MaskedApproximateReciprocalSqrtOrZero**(M m, V a): returns + V **MaskedApproximateReciprocalSqrt**(M m, V a): returns the result of ApproximateReciprocalSqrt where m is true and zero otherwise. * `V`: `{f}` \ - V **MaskedApproximateReciprocalOrZero**(M m, V a): returns the + V **MaskedApproximateReciprocal**(M m, V a): returns the result of ApproximateReciprocal where m is true and zero otherwise. 
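A short sketch contrasting the zero-masked op with its `Or` variant as renamed above; the mask and input below are illustrative:

// Illustrative only: square root of the first half of the lanes.
template <class DF>  // e.g. ScalableTag<float>
void MaskedSqrtSketch(DF d) {
  const auto v = Iota(d, 1);                // 1, 2, 3, ...
  const auto m = FirstN(d, Lanes(d) / 2);
  const auto zeroed = MaskedSqrt(m, v);     // sqrt(v[i]) where m, else 0
  const auto kept = MaskedSqrtOr(v, m, v);  // sqrt(v[i]) where m, else v[i]
  (void)zeroed;
  (void)kept;
}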
#### Shifts diff --git a/hwy/ops/arm_sve-inl.h b/hwy/ops/arm_sve-inl.h index 772c91e08d..21adff59bc 100644 --- a/hwy/ops/arm_sve-inl.h +++ b/hwy/ops/arm_sve-inl.h @@ -219,10 +219,9 @@ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SPECIALIZE, _, _) HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \ return sv##OP##_##CHAR##BITS(v); \ } -#define HWY_SVE_RETV_ARGMV_M(BASE, CHAR, BITS, HALF, NAME, OP) \ - HWY_API HWY_SVE_V(BASE, BITS) \ - NAME(svbool_t m, HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \ - return sv##OP##_##CHAR##BITS##_m(b, m, a); \ +#define HWY_SVE_RETV_ARGMV(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_V(BASE, BITS) NAME(svbool_t m, HWY_SVE_V(BASE, BITS) v) { \ + return sv##OP##_##CHAR##BITS##_x(m, v); \ } #define HWY_SVE_RETV_ARGMV_Z(BASE, CHAR, BITS, HALF, NAME, OP) \ HWY_API HWY_SVE_V(BASE, BITS) NAME(svbool_t m, HWY_SVE_V(BASE, BITS) a) { \ @@ -1244,27 +1243,13 @@ HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGV, ApproximateReciprocal, recpe) HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Sqrt, sqrt) // ------------------------------ MaskedSqrt -namespace detail { -HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGMV_M, MaskedSqrt, sqrt) -} - -// ------------------------------ SqrtLower -#ifdef HWY_NATIVE_SQRT_LOWER -#undef HWY_NATIVE_SQRT_LOWER +#ifdef HWY_NATIVE_MASKED_SQRT +#undef HWY_NATIVE_MASKED_SQRT #else -#define HWY_NATIVE_SQRT_LOWER +#define HWY_NATIVE_MASKED_SQRT #endif -#define HWY_SVE_SQRT_LOWER(BASE, CHAR, BITS, HALF, NAME, OP) \ - HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) a) { \ - return detail::MaskedSqrt(svptrue_pat_b##BITS(SV_VL1), a, a); \ - } - -HWY_SVE_FOREACH_F(HWY_SVE_SQRT_LOWER, SqrtLower, _) -#undef HWY_SVE_SQRT_LOWER - -// ------------------------------ MaskedSqrtOrZero -HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGMV_Z, MaskedSqrtOrZero, sqrt) +HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGMV_Z, MaskedSqrt, sqrt) // ------------------------------ ApproximateReciprocalSqrt #ifdef HWY_NATIVE_F64_APPROX_RSQRT @@ -1553,6 +1538,7 @@ HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVV, MaskedMul, mul) HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGMVV, MaskedDiv, div) HWY_SVE_FOREACH_UI32(HWY_SVE_RETV_ARGMVV, MaskedDiv, div) HWY_SVE_FOREACH_UI64(HWY_SVE_RETV_ARGMVV, MaskedDiv, div) +HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGMV, MaskedSqrt, sqrt) #if HWY_SVE_HAVE_2 HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGMVV, MaskedSatAdd, qadd) HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGMVV, MaskedSatSub, qsub) @@ -1616,6 +1602,11 @@ HWY_API V MaskedSatSubOr(V no, M m, V a, V b) { } #endif +template +HWY_API V MaskedSqrtOr(V no, M m, V v) { + return IfThenElse(m, detail::MaskedSqrt(m, v), no); +} + // ================================================== REDUCE #ifdef HWY_NATIVE_REDUCE_SCALAR @@ -6412,6 +6403,8 @@ HWY_API V HighestSetBitIndex(V v) { #undef HWY_SVE_IF_NOT_EMULATED_D #undef HWY_SVE_PTRUE #undef HWY_SVE_RETV_ARGMVV +#undef HWY_SVE_RETV_ARGMV_Z +#undef HWY_SVE_RETV_ARGMV #undef HWY_SVE_RETV_ARGPV #undef HWY_SVE_RETV_ARGPVN #undef HWY_SVE_RETV_ARGPVV diff --git a/hwy/ops/generic_ops-inl.h b/hwy/ops/generic_ops-inl.h index 3b2cb185a8..1f3749ae68 100644 --- a/hwy/ops/generic_ops-inl.h +++ b/hwy/ops/generic_ops-inl.h @@ -5275,29 +5275,25 @@ HWY_API VFromD SatWidenMulAccumFixedPoint(DI32 di32, #endif // HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT -// ------------------------------ SqrtLower -#if (defined(HWY_NATIVE_SQRT_LOWER) == defined(HWY_TARGET_TOGGLE)) -#ifdef HWY_NATIVE_SQRT_LOWER -#undef HWY_NATIVE_SQRT_LOWER +// ------------------------------ MaskedSqrt + +#if (defined(HWY_NATIVE_MASKED_SQRT) == defined(HWY_TARGET_TOGGLE)) + 
+#ifdef HWY_NATIVE_MASKED_SQRT +#undef HWY_NATIVE_MASKED_SQRT #else -#define HWY_NATIVE_SQRT_LOWER +#define HWY_NATIVE_MASKED_SQRT #endif - -template -HWY_API V SqrtLower(V a) { - const DFromV d; - const auto first_mask = FirstN(d, 1); - return IfThenElse(first_mask, Sqrt(a), a); +template +HWY_API V MaskedSqrt(M m, V v) { + return IfThenElseZero(m, Sqrt(v)); } -#undef HWY_SVE_SQRT_LOWER -#endif // HWY_NATIVE_SQRT_LOWER - -// ------------------------------ MaskedSqrtOrZero template -HWY_API V MaskedSqrtOrZero(M m, V v) { - return IfThenElseZero(m, Sqrt(v)); +HWY_API V MaskedSqrtOr(V no, M m, V v) { + return IfThenElse(m, Sqrt(v), no); } +#endif // ------------------------------ SumOfMulQuadAccumulate @@ -5483,9 +5479,9 @@ HWY_API V ApproximateReciprocal(V v) { #endif // HWY_NATIVE_F64_APPROX_RECIP -// ------------------------------ MaskedApproximateReciprocalOrZero +// ------------------------------ MaskedApproximateReciprocal template -HWY_API V MaskedApproximateReciprocalOrZero(M m, V v) { +HWY_API V MaskedApproximateReciprocal(M m, V v) { return IfThenElseZero(m, ApproximateReciprocal(v)); } @@ -5514,9 +5510,9 @@ HWY_API V ApproximateReciprocalSqrt(V v) { #endif // HWY_NATIVE_F64_APPROX_RSQRT -// ------------------------------ MaskedApproximateReciprocalSqrtOrZero +// ------------------------------ MaskedApproximateReciprocalSqrt template -HWY_API V MaskedApproximateReciprocalSqrtOrZero(M m, V v) { +HWY_API V MaskedApproximateReciprocalSqrt(M m, V v) { return IfThenElseZero(m, ApproximateReciprocalSqrt(v)); } diff --git a/hwy/ops/ppc_vsx-inl.h b/hwy/ops/ppc_vsx-inl.h index 3564ae0b17..5d2ccc3814 100644 --- a/hwy/ops/ppc_vsx-inl.h +++ b/hwy/ops/ppc_vsx-inl.h @@ -1939,6 +1939,9 @@ HWY_API Vec128 ApproximateReciprocal(Vec128 v) { #endif } +// TODO: Implement GetExponent using vec_extract_exp (which returns the biased +// exponent) followed by a subtraction by MaxExponentField() >> 1 + // ------------------------------ Floating-point square root #if HWY_S390X_HAVE_Z14 diff --git a/hwy/ops/x86_512-inl.h b/hwy/ops/x86_512-inl.h index 3f366aebe1..32ae7c2d80 100644 --- a/hwy/ops/x86_512-inl.h +++ b/hwy/ops/x86_512-inl.h @@ -1842,6 +1842,8 @@ HWY_API Vec512 ApproximateReciprocal(Vec512 v) { return Vec512{_mm512_rcp14_pd(v.raw)}; } +// TODO: Implement GetExponent using _mm_getexp_ps/_mm_getexp_pd/_mm_getexp_ph + // ------------------------------ MaskedMinOr template diff --git a/hwy/tests/float_test.cc b/hwy/tests/float_test.cc index 5a03bfcedc..29429fb2cd 100644 --- a/hwy/tests/float_test.cc +++ b/hwy/tests/float_test.cc @@ -179,7 +179,7 @@ struct TestMaskedApproximateReciprocal { HWY_ASSERT(input && actual); Store(nonzero, d, input.get()); - Store(MaskedApproximateReciprocalOrZero(first_three, nonzero), d, actual.get()); + Store(MaskedApproximateReciprocal(first_three, nonzero), d, actual.get()); double max_l1 = 0.0; double worst_expected = 0.0; @@ -224,40 +224,19 @@ HWY_NOINLINE void TestAllSquareRoot() { ForFloatTypes(ForPartialVectors()); } -struct TestSqrtLower { - template - HWY_NOINLINE void operator()(T /*unused*/, D d) { - const auto vi = Iota(d, 4); - - const size_t N = Lanes(d); - auto expected = AllocateAligned(N); - - for (size_t i = 0; i < N; ++i) { - if (i == 0) { - expected[i] = ConvertScalarTo(2); // sqrt(4) - } else { - expected[i] = ConvertScalarTo(i + 4); - } - } - - HWY_ASSERT_VEC_EQ(d, expected.get(), SqrtLower(vi)); - } -}; - -HWY_NOINLINE void TestAllSqrtLower() { - ForFloatTypes(ForPartialVectors()); -} - struct TestMaskedSqrt { template HWY_NOINLINE void operator()(T 
/*unused*/, D d) { const auto v0 = Zero(d); const auto vi = Iota(d, 4); + const auto v2 = Iota(d, 5); const MFromD first_four = FirstN(d, 4); const auto expected = IfThenElse(first_four, Sqrt(vi), v0); + const auto masked_expected = IfThenElse(first_four, Sqrt(vi), v2); - HWY_ASSERT_VEC_EQ(d, expected, MaskedSqrtOrZero(first_four, vi)); + HWY_ASSERT_VEC_EQ(d, expected, MaskedSqrt(first_four, vi)); + HWY_ASSERT_VEC_EQ(d, masked_expected, MaskedSqrtOr(v2, first_four, vi)); } }; @@ -297,7 +276,7 @@ struct TestMaskedReciprocalSquareRoot { const size_t N = Lanes(d); auto lanes = AllocateAligned(N); HWY_ASSERT(lanes); - Store(MaskedApproximateReciprocalSqrtOrZero(first_three, v), d, + Store(MaskedApproximateReciprocalSqrt(first_three, v), d, lanes.get()); for (size_t i = 0; i < N; ++i) { T expected_val = i < 3 ? ConvertScalarTo(1 / std::sqrt(123.0f)) @@ -660,7 +639,6 @@ HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllF32FromF16); HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllDiv); HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllApproximateReciprocal); HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllSquareRoot); -HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllSqrtLower); HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllMaskedSqrt); HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllReciprocalSquareRoot); HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllMaskedReciprocalSquareRoot); From 83f183d6c415a049378b5ecc491b4a149785c6e9 Mon Sep 17 00:00:00 2001 From: Will Barber Date: Tue, 28 Jan 2025 17:01:47 +0000 Subject: [PATCH 55/64] Remove new ops that only have a generic implementation --- g3doc/quick_reference.md | 32 ------ hwy/ops/generic_ops-inl.h | 90 ----------------- hwy/tests/convert_test.cc | 147 --------------------------- hwy/tests/demote_test.cc | 204 -------------------------------------- 4 files changed, 473 deletions(-) diff --git a/g3doc/quick_reference.md b/g3doc/quick_reference.md index 5eeee364c4..127e74a9b0 100644 --- a/g3doc/quick_reference.md +++ b/g3doc/quick_reference.md @@ -1822,17 +1822,6 @@ obtain the `D` that describes the return type. Vec<D> **DemoteTo**(D, V v): narrows float to half (for bf16, it is unspecified whether this truncates or rounds). -* `V`,`D`: (`f64,i32`), (`f32,f16`) \ - Vec<D> **DemoteCeilTo**(D, V v): Demotes a floating point - number to half-sized integral type with ceiling rounding. - -* `V`,`D`: (`f64,i32`), (`f32,f16`) \ - Vec<D> **DemoteFloorTo**(D, V v): Demotes a floating - point number to half-sized integral type with floor rounding. - -* Vec<D> **MaskedDemoteTo**(M m, D d, V v): returns `v[i]` - demoted to `D` where m is active and returns zero otherwise. - #### Single vector promotion These functions promote a half vector to a full vector. To obtain halves, use @@ -1859,27 +1848,6 @@ These functions promote a half vector to a full vector. To obtain halves, use integer. Returns an implementation-defined value if the input exceeds the destination range. -* `V`: `f`, `D`:`{u,i,f}`\ - Vec<D> **PromoteCeilTo**(D, V part): rounds `part[i]` - up and converts the rounded value to a signed or unsigned integer. - Returns an implementation-defined value if the input exceeds the - destination range. - -* `V`: `f`, `D`:`{u,i,f}`\ - Vec<D> **PromoteFloorTo**(D, V part): rounds `part[i]` - down and converts the rounded value to a signed or unsigned integer. - Returns an implementation-defined value if the input exceeds the - destination range. 
- -* `V`: `f`, `D`:`{u,i,f}`\ - Vec<D> **PromoteToNearestInt **(D, V part): rounds - `part[i]` towards the nearest integer, with ties to even, and converts the - rounded value to a signed or unsigned integer. Returns an - implementation-defined value if the input exceeds the destination range. - -* Vec<D> **MaskedPromoteTo**(M m, D d, V v): returns `v[i]` - widened to `D` where m is active and returns zero otherwise. - The following may be more convenient or efficient than also calling `LowerHalf` / `UpperHalf`: diff --git a/hwy/ops/generic_ops-inl.h b/hwy/ops/generic_ops-inl.h index a0be523e25..e23902fa00 100644 --- a/hwy/ops/generic_ops-inl.h +++ b/hwy/ops/generic_ops-inl.h @@ -3250,19 +3250,6 @@ HWY_API VFromD DemoteTo(D df16, VFromD> v) { #endif // HWY_NATIVE_F16C -// ------------------------------ PromoteTo F16->I -#if HWY_HAVE_FLOAT16 || HWY_IDE -template -HWY_API VFromD PromoteTo(D d, VFromD> v) { - return ConvertTo(d, PromoteTo(Rebind(), v)); -} - -template -HWY_API VFromD PromoteTo(D d, VFromD> v) { - return PromoteTo(d, PromoteTo(Rebind(), v)); -} -#endif - // ------------------------------ F64->F16 DemoteTo #if (defined(HWY_NATIVE_DEMOTE_F64_TO_F16) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_DEMOTE_F64_TO_F16 @@ -3396,53 +3383,6 @@ HWY_API VFromD ReorderDemote2To(D dbf16, VFromD> a, #endif // HWY_NATIVE_DEMOTE_F32_TO_BF16 -// ------------------------------ DemoteTo (Alternate Rounding) -#if (defined(HWY_NATIVE_DEMOTE_CEIL_TO) == defined(HWY_TARGET_TOGGLE)) -#ifdef HWY_NATIVE_DEMOTE_CEIL_TO -#undef HWY_NATIVE_DEMOTE_CEIL_TO -#else -#define HWY_NATIVE_DEMOTE_CEIL_TO -#endif - -#if HWY_HAVE_FLOAT64 -template -HWY_API VFromD DemoteCeilTo(D32 d32, VFromD> v) { - return DemoteTo(d32, Ceil(v)); -} -#endif // HWY_HAVE_FLOAT64 - -#if HWY_HAVE_FLOAT16 -template -HWY_API VFromD DemoteCeilTo(D16 d16, VFromD> v) { - return DemoteTo(d16, Ceil(v)); -} -#endif // HWY_HAVE_FLOAT16 - -#endif // HWY_NATIVE_DEMOTE_CEIL_TO - -#if (defined(HWY_NATIVE_DEMOTE_FLOOR_TO) == defined(HWY_TARGET_TOGGLE)) -#ifdef HWY_NATIVE_DEMOTE_FLOOR_TO -#undef HWY_NATIVE_DEMOTE_FLOOR_TO -#else -#define HWY_NATIVE_DEMOTE_FLOOR_TO -#endif - -#if HWY_HAVE_FLOAT64 -template -HWY_API VFromD DemoteFloorTo(D32 d32, VFromD> v) { - return DemoteTo(d32, Floor(v)); -} -#endif // HWY_HAVE_FLOAT64 - -#if HWY_HAVE_FLOAT16 -template -HWY_API VFromD DemoteFloorTo(D16 d16, VFromD> v) { - return DemoteTo(d16, Floor(v)); -} -#endif // HWY_HAVE_FLOAT16 - -#endif // HWY_NATIVE_DEMOTE_FLOOR_TO - // ------------------------------ PromoteInRangeTo #if (defined(HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO) == \ defined(HWY_TARGET_TOGGLE)) @@ -3574,24 +3514,6 @@ HWY_API VFromD PromoteInRangeOddTo(D d, V v) { } #endif // HWY_TARGET != HWY_SCALAR -// ------------------------------ PromoteCeilTo -template -HWY_API Vec PromoteCeilTo(DTo d, V v) { - return PromoteTo(d, Ceil(v)); -} - -// ------------------------------ PromoteFloorTo -template -HWY_API Vec PromoteFloorTo(DTo d, V v) { - return PromoteTo(d, Floor(v)); -} - -// ------------------------------ PromoteToNearestInt -template -HWY_API Vec PromoteToNearestInt(DTo d, V v) { - return PromoteTo(d, Round(v)); -} - // ------------------------------ SumsOf2 #if HWY_TARGET != HWY_SCALAR || HWY_IDE @@ -4488,18 +4410,6 @@ HWY_API V MulAddSub(V mul, V x, V sub_or_add) { return MulAdd(mul, x, add); } -// ------------------------------ MaskedPromoteTo -template -HWY_API VFromD MaskedPromoteTo(M m, D d, V v) { - return IfThenElseZero(m, PromoteTo(d, v)); -} - -// ------------------------------ 
MaskedDemoteTo -template -HWY_API VFromD MaskedDemoteTo(M m, D d, V v) { - return IfThenElseZero(m, DemoteTo(d, v)); -} - // ------------------------------ MaskedConvertTo template HWY_API VFromD MaskedConvertTo(M m, D d, V v) { diff --git a/hwy/tests/convert_test.cc b/hwy/tests/convert_test.cc index 3a31b82ecd..5ab7181b4a 100644 --- a/hwy/tests/convert_test.cc +++ b/hwy/tests/convert_test.cc @@ -146,151 +146,6 @@ HWY_NOINLINE void TestAllPromoteTo() { #endif } -template -struct TestPromoteRoundTo { - template - HWY_NOINLINE void operator()(T /*unused*/, D from_d) { - static_assert(sizeof(T) < sizeof(ToT), "Input type must be narrower"); - const Rebind to_d; - - const size_t N = Lanes(from_d); - auto from = AllocateAligned(N); - auto expected_ceil = AllocateAligned(N); - auto expected_floor = AllocateAligned(N); - auto expected_nearest_int = AllocateAligned(N); - HWY_ASSERT(from && expected_ceil && expected_floor && expected_nearest_int); - - RandomState rng; - for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { - for (size_t i = 0; i < N; ++i) { - const uint64_t bits = rng(); - CopyBytes(&bits, &from[i]); // not same size - expected_ceil[i] = - ConvertScalarTo(std::ceil(static_cast(from[i]))); - expected_floor[i] = - ConvertScalarTo(std::floor(static_cast(from[i]))); - expected_nearest_int[i] = - ConvertScalarTo(std::nearbyint(static_cast(from[i]))); - } - - auto input = Load(from_d, from.get()); - auto output_ceil = PromoteCeilTo(to_d, input); - auto output_floor = PromoteFloorTo(to_d, input); - auto output_nearest_int = PromoteToNearestInt(to_d, input); - - HWY_ASSERT_VEC_EQ(to_d, expected_ceil.get(), output_ceil); - HWY_ASSERT_VEC_EQ(to_d, expected_floor.get(), output_floor); - HWY_ASSERT_VEC_EQ(to_d, expected_nearest_int.get(), output_nearest_int); - } - } -}; - -HWY_NOINLINE void TestAllPromoteRoundTo() { -#if HWY_HAVE_FLOAT16 - const ForPromoteVectors, 1> to_i32div2; - to_i32div2(hwy::float16_t()); - - const ForPromoteVectors, 1> to_f32div2; - to_f32div2(hwy::float16_t()); -#endif // HWY_HAVE_FLOAT16 - -#if HWY_HAVE_FLOAT64 - const ForPromoteVectors, 1> to_f64div2; - to_f64div2(float()); -#endif // HWY_HAVE_FLOAT64 -} - -template -struct TestMaskedPromoteTo { - template - HWY_NOINLINE void operator()(T /*unused*/, D d) { - static_assert(sizeof(T) < sizeof(ToT), "Input type must be narrower"); - const Rebind to_d; - - const size_t N = Lanes(d); - auto expected = AllocateAligned(N); - auto bool_lanes = AllocateAligned(N); - HWY_ASSERT(expected && bool_lanes); - - const auto v1 = Iota(d, 5); - - RandomState rng; - - for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { - for (size_t i = 0; i < N; ++i) { - bool_lanes[i] = (Random32(&rng) & 1024) ? 
T(1) : T(0); - - if (bool_lanes[i]) { - expected[i] = ConvertScalarTo(i + 5); - } else { - expected[i] = ConvertScalarTo(0); - } - } - - const auto mask_i = Load(to_d, bool_lanes.get()); - const auto mask = RebindMask(to_d, Gt(mask_i, Zero(to_d))); - - HWY_ASSERT_VEC_EQ(to_d, expected.get(), - MaskedPromoteTo(mask, to_d, v1)); - } - } -}; - -HWY_NOINLINE void TestAllMaskedPromoteTo() { - const ForPromoteVectors, 1> to_u16div2; - to_u16div2(uint8_t()); - - const ForPromoteVectors, 2> to_u32div4; - to_u32div4(uint8_t()); - - const ForPromoteVectors, 1> to_u32div2; - to_u32div2(uint16_t()); - - const ForPromoteVectors, 1> to_i16div2; - to_i16div2(uint8_t()); - to_i16div2(int8_t()); - - const ForPromoteVectors, 1> to_i32div2; - to_i32div2(uint16_t()); - to_i32div2(int16_t()); - - const ForPromoteVectors, 2> to_i32div4; - to_i32div4(uint8_t()); - to_i32div4(int8_t()); - - // Must test f16/bf16 separately because we can only load/store/convert them. - -#if HWY_HAVE_INTEGER64 - const ForPromoteVectors, 1> to_u64div2; - to_u64div2(uint32_t()); - - const ForPromoteVectors, 1> to_i64div2; - to_i64div2(int32_t()); - to_i64div2(uint32_t()); - - const ForPromoteVectors, 2> to_u64div4; - to_u64div4(uint16_t()); - - const ForPromoteVectors, 2> to_i64div4; - to_i64div4(int16_t()); - to_i64div4(uint16_t()); - - const ForPromoteVectors, 3> to_u64div8; - to_u64div8(uint8_t()); - - const ForPromoteVectors, 3> to_i64div8; - to_i64div8(int8_t()); - to_i64div8(uint8_t()); -#endif - -#if HWY_HAVE_FLOAT64 - const ForPromoteVectors, 1> to_f64div2; - to_f64div2(int32_t()); - to_f64div2(uint32_t()); - to_f64div2(float()); -#endif -} - template struct TestPromoteUpperLowerTo { template @@ -1701,8 +1556,6 @@ namespace { HWY_BEFORE_TEST(HwyConvertTest); HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllRebind); HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllPromoteTo); -HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllPromoteRoundTo); -HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllMaskedPromoteTo); HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllPromoteUpperLowerTo); HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllPromoteOddEvenTo); HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllF16); diff --git a/hwy/tests/demote_test.cc b/hwy/tests/demote_test.cc index 1cfe581c0a..1deff1d7c5 100644 --- a/hwy/tests/demote_test.cc +++ b/hwy/tests/demote_test.cc @@ -143,107 +143,6 @@ HWY_NOINLINE void TestAllDemoteToMixed() { #endif } -template -struct TestMaskedDemoteToInt { - template - HWY_NOINLINE void operator()(T /*unused*/, D from_d) { - static_assert(!IsFloat(), "Use TestDemoteToFloat for float output"); - static_assert(sizeof(T) > sizeof(ToT), "Input type must be wider"); - const Rebind to_d; - - const size_t N = Lanes(from_d); - auto from = AllocateAligned(N); - auto expected = AllocateAligned(N); - auto bool_lanes = AllocateAligned(N); - HWY_ASSERT(from && expected && bool_lanes); - // Narrower range in the wider type, for clamping before we cast - const T min = ConvertScalarTo(IsSigned() ? LimitsMin() - : static_cast(0)); - const T max = LimitsMax(); - RandomState rng; - for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { - for (size_t i = 0; i < N; ++i) { - const uint64_t bits = rng(); - CopyBytes(&bits, &from[i]); // not same size - - bool_lanes[i] = (Random32(&rng) & 1024) ? 
ToT(1) : ToT(0); - if (bool_lanes[i]) { - expected[i] = static_cast(HWY_MIN(HWY_MAX(min, from[i]), max)); - - } else { - expected[i] = ConvertScalarTo(0); - } - } - - const auto mask_i = Load(to_d, bool_lanes.get()); - const auto mask = RebindMask(to_d, Gt(mask_i, Zero(to_d))); - - const auto v1 = Load(from_d, from.get()); - - HWY_ASSERT_VEC_EQ(to_d, expected.get(), - MaskedDemoteTo(mask, to_d, v1)); - } - } -}; - -HWY_NOINLINE void TestAllMaskedDemoteToInt() { - const ForDemoteVectors> from_i16_to_u8; - from_i16_to_u8(int16_t()); - from_i16_to_u8(uint16_t()); - - const ForDemoteVectors> from_i16_to_i8; - from_i16_to_i8(int16_t()); - from_i16_to_i8(uint16_t()); - - const ForDemoteVectors, 2> - from_i32_to_u8; - from_i32_to_u8(int32_t()); - from_i32_to_u8(uint32_t()); - - const ForDemoteVectors, 2> from_i32_to_i8; - from_i32_to_i8(int32_t()); - from_i32_to_i8(uint32_t()); - -#if HWY_HAVE_INTEGER64 - const ForDemoteVectors, 3> - from_i64_to_u8; - from_i64_to_u8(int64_t()); - from_i64_to_u8(uint64_t()); - - const ForDemoteVectors, 3> from_i64_to_i8; - from_i64_to_i8(int64_t()); - from_i64_to_i8(uint64_t()); -#endif - - const ForDemoteVectors> from_i32_to_u16; - from_i32_to_u16(int32_t()); - from_i32_to_u16(uint32_t()); - - const ForDemoteVectors> from_i32_to_i16; - from_i32_to_i16(int32_t()); - from_i32_to_i16(uint32_t()); - -#if HWY_HAVE_INTEGER64 - const ForDemoteVectors, 2> - from_i64_to_u16; - from_i64_to_u16(int64_t()); - from_i64_to_u16(uint64_t()); - - const ForDemoteVectors, 2> - from_i64_to_i16; - from_i64_to_i16(int64_t()); - from_i64_to_i16(uint64_t()); - - const ForDemoteVectors> from_i64_to_u32; - from_i64_to_u32(int64_t()); - from_i64_to_u32(uint64_t()); - - const ForDemoteVectors> from_i64_to_i32; - from_i64_to_i32(int64_t()); - from_i64_to_i32(uint64_t()); -#endif -} - template struct TestDemoteToFloat { template @@ -568,107 +467,6 @@ AlignedFreeUniquePtr ReorderBF16TestCases(D d, size_t& padded) { return in; } -template -struct TestDemoteRoundFloatToFloat { - template - HWY_NOINLINE void operator()(T /*unused*/, D from_d) { - static_assert(sizeof(T) > sizeof(ToT), "Input type must be wider"); - const Rebind to_d; - - const size_t N = Lanes(from_d); - auto from = AllocateAligned(N); - auto expected_ceil = AllocateAligned(N); - auto expected_floor = AllocateAligned(N); - HWY_ASSERT(from && expected_ceil && expected_floor); - - RandomState rng; - for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) { - for (size_t i = 0; i < N; ++i) { - const uint64_t bits = rng(); - CopyBytes(&bits, &from[i]); // not same size - expected_ceil[i] = static_cast(std::ceil(from[i])); - expected_floor[i] = static_cast(std::floor(from[i])); - } - const auto in = Load(from_d, from.get()); - HWY_ASSERT_VEC_EQ(to_d, expected_ceil.get(), DemoteCeilTo(to_d, in)); - HWY_ASSERT_VEC_EQ(to_d, expected_floor.get(), DemoteFloorTo(to_d, in)); - } - } -}; - -template -struct TestDemoteRoundFloatToInt { - template - HWY_NOINLINE void operator()(T /*unused*/, D from_d) { - static_assert(!IsFloat(), "Use TestDemoteToFloat for float output"); - static_assert(sizeof(T) > sizeof(ToT), "Input type must be wider"); - const Rebind to_d; - - const size_t N = Lanes(from_d); - auto from = AllocateAligned(N); - auto from_ceil = AllocateAligned(N); - auto from_floor = AllocateAligned(N); - auto expected_ceil = AllocateAligned(N); - auto expected_floor = AllocateAligned(N); - HWY_ASSERT(from && from_ceil && from_floor && expected_ceil && - expected_floor); - - // Narrower range in the wider type, for clamping before we cast - 
const T min = ConvertScalarTo(IsSigned() ? LimitsMin() - : static_cast(0)); - const T max = LimitsMax(); - - RandomState rng; - for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) { - for (size_t i = 0; i < N; ++i) { - const uint64_t bits = rng(); - CopyBytes(&bits, &from[i]); // not same size - expected_ceil[i] = - static_cast(std::ceil((HWY_MIN(HWY_MAX(min, from[i]), max)))); - expected_floor[i] = - static_cast(std::floor((HWY_MIN(HWY_MAX(min, from[i]), max)))); - } - const auto in = Load(from_d, from.get()); - HWY_ASSERT_VEC_EQ(to_d, expected_ceil.get(), DemoteCeilTo(to_d, in)); - HWY_ASSERT_VEC_EQ(to_d, expected_floor.get(), DemoteFloorTo(to_d, in)); - } - - for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) { - for (size_t i = 0; i < N; ++i) { - const uint64_t bits = rng(); - CopyBytes(&bits, &expected_ceil[i]); // not same size - CopyBytes(&bits, &expected_floor[i]); // not same size - - if (!IsSigned() && IsSigned()) { - expected_ceil[i] &= static_cast(std::ceil((max))); - expected_floor[i] &= static_cast(std::floor((max))); - } - - from_ceil[i] = ConvertScalarTo(expected_ceil[i]); - from_floor[i] = ConvertScalarTo(expected_floor[i]); - } - - const auto in_ceil = Load(from_d, from_ceil.get()); - const auto in_floor = Load(from_d, from_floor.get()); - HWY_ASSERT_VEC_EQ(to_d, expected_ceil.get(), DemoteCeilTo(to_d, in_ceil)); - HWY_ASSERT_VEC_EQ(to_d, expected_floor.get(), - DemoteFloorTo(to_d, in_floor)); - } - } -}; - -HWY_NOINLINE void TestAllDemoteRoundTo() { -#if HWY_HAVE_FLOAT64 - const ForDemoteVectors> to_i32; - to_i32(double()); -#endif - -#if HWY_HAVE_FLOAT16 - const ForDemoteVectors> to_f16; - to_f16(float()); -#endif -} - class TestReorderDemote2To { // In-place N^2 selection sort to avoid dependencies void Sort(float* p, size_t count) { @@ -1036,12 +834,10 @@ namespace { #if !HWY_IS_MSAN HWY_BEFORE_TEST(HwyDemoteTest); HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteToInt); -HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllMaskedDemoteToInt); HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteToMixed); HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteToFloat); HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteUI64ToFloat); HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteToBF16); -HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteRoundTo); HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllReorderDemote2To); HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllOrderedDemote2To); HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllI32F64); From acd4f0926f4c50974e2c6bf4b0c50c149adb0421 Mon Sep 17 00:00:00 2001 From: Will Barber Date: Tue, 28 Jan 2025 17:44:39 +0000 Subject: [PATCH 56/64] Fix bool_lanes typing --- hwy/tests/convert_test.cc | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/hwy/tests/convert_test.cc b/hwy/tests/convert_test.cc index 5ab7181b4a..8688017faa 100644 --- a/hwy/tests/convert_test.cc +++ b/hwy/tests/convert_test.cc @@ -753,7 +753,7 @@ struct TestMaskedFloatFromInt { const size_t N = Lanes(df); auto from = AllocateAligned(N); auto expected = AllocateAligned(N); - auto bool_lanes = AllocateAligned(N); + auto bool_lanes = AllocateAligned(N); HWY_ASSERT(from && expected && bool_lanes); RandomState rng; @@ -762,15 +762,15 @@ struct TestMaskedFloatFromInt { const uint64_t bits = rng(); CopyBytes(&bits, &from[i]); // not same size - bool_lanes[i] = (Random32(&rng) & 1024) ? TF(1) : TF(0); + bool_lanes[i] = (Random32(&rng) & 1024) ? 
TI(1) : TI(0); if (bool_lanes[i]) { expected[i] = ConvertScalarTo(from[i]); } else { expected[i] = ConvertScalarTo(0); } } - const auto mask_i = Load(df, bool_lanes.get()); - const auto mask = RebindMask(df, Gt(mask_i, Zero(df))); + const auto mask_i = Load(di, bool_lanes.get()); + const auto mask = RebindMask(df, Gt(mask_i, Zero(di))); const auto v1 = Load(di, from.get()); @@ -789,7 +789,7 @@ struct TestMaskedFloatFromUint { const size_t N = Lanes(df); auto from = AllocateAligned(N); auto expected = AllocateAligned(N); - auto bool_lanes = AllocateAligned(N); + auto bool_lanes = AllocateAligned(N); HWY_ASSERT(from && expected && bool_lanes); RandomState rng; @@ -798,15 +798,15 @@ struct TestMaskedFloatFromUint { const uint64_t bits = rng(); CopyBytes(&bits, &from[i]); // not same size - bool_lanes[i] = (Random32(&rng) & 1024) ? TF(1) : TF(0); + bool_lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0); if (bool_lanes[i]) { expected[i] = ConvertScalarTo(from[i]); } else { expected[i] = ConvertScalarTo(0); } } - const auto mask_i = Load(df, bool_lanes.get()); - const auto mask = RebindMask(df, Gt(mask_i, Zero(df))); + const auto mask_i = Load(di, bool_lanes.get()); + const auto mask = RebindMask(df, Gt(mask_i, Zero(di))); const auto v1 = Load(di, from.get()); From cdd64d284d3490fa28c07e0944431fee23c4f78e Mon Sep 17 00:00:00 2001 From: Mohammad Azim Khan Date: Sat, 16 Nov 2024 07:28:54 +0000 Subject: [PATCH 57/64] MulRound, MulLower and MulAddLower ops --- g3doc/quick_reference.md | 10 ++++ hwy/ops/arm_sve-inl.h | 71 +++++++++++++++++++++++++++++ hwy/ops/generic_ops-inl.h | 39 ++++++++++++++++ hwy/tests/masked_arithmetic_test.cc | 62 +++++++++++++++++++++++++ hwy/tests/mul_test.cc | 28 ++++++++++++ 5 files changed, 210 insertions(+) diff --git a/g3doc/quick_reference.md b/g3doc/quick_reference.md index 86ceb89f38..83034e26d2 100644 --- a/g3doc/quick_reference.md +++ b/g3doc/quick_reference.md @@ -726,6 +726,10 @@ All other ops in this section are only available if `HWY_TARGET != HWY_SCALAR`: truncating it to the lower half for integer inputs. Currently unavailable on SVE/RVV; use the equivalent `Mul` instead. +* `V`: `f` + V **MulRound**(V a, V b): Multiplies `a[i]` by `b[i]` and rounds + the result to the nearest int with ties going to even. + * `V`: `f`, `VI`: `Vec>>` \ V **MulByPow2**(V a, VI b): Multiplies `a[i]` by `2^b[i]`. @@ -756,6 +760,9 @@ All other ops in this section are only available if `HWY_TARGET != HWY_SCALAR`: V **MulHigh**(V a, V b): returns the upper half of `a[i] * b[i]` in each lane. +* V **MulLower**(V a, V b): returns `a[0] * b[0]` in the + first lane and `a[i]` otherwise. + * `V`: `i16` \ V **MulFixedPoint15**(V a, V b): returns the result of multiplying two Q1.15 fixed-point numbers. This corresponds to doubling the @@ -882,6 +889,9 @@ variants are somewhat slower on Arm, and unavailable for integer inputs; if the potentially more efficient than `MulAdd(PromoteOddTo(d, a), PromoteOddTo(d, b), c)`. +* V **MulAddLower**(V a, V b, V c): returns `a[0] * b[0] + c[0]` + and `a[i]` in all other lanes. 
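
As an aside, a minimal usage sketch for the `MulRound` op added above, assuming single-target static dispatch; the function name, the array arguments, and the requirement that `size` be a multiple of `Lanes(d)` are illustrative assumptions rather than part of this patch:

```c++
// Sketch only: rounds each product a[i] * b[i] to the nearest integer,
// with ties to even (e.g. 6.75 * 3.33 -> 22, 0.5 * 1 -> 0).
#include <stddef.h>

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

void MulRoundArray(const float* HWY_RESTRICT a, const float* HWY_RESTRICT b,
                   size_t size, float* HWY_RESTRICT out) {
  const hn::ScalableTag<float> d;  // widest available float vector
  for (size_t i = 0; i < size; i += hn::Lanes(d)) {  // size % Lanes(d) == 0
    const auto va = hn::LoadU(d, a + i);
    const auto vb = hn::LoadU(d, b + i);
    // MulRound(a, b) is Round(Mul(a, b)) per the generic implementation.
    hn::StoreU(hn::MulRound(va, vb), d, out + i);
  }
}
```
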
+ #### Masked arithmetic All ops in this section return `no` for `mask=false` lanes, and suppress any diff --git a/hwy/ops/arm_sve-inl.h b/hwy/ops/arm_sve-inl.h index 822d26d579..95eb16e0a3 100644 --- a/hwy/ops/arm_sve-inl.h +++ b/hwy/ops/arm_sve-inl.h @@ -260,6 +260,11 @@ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SPECIALIZE, _, _) NAME(svbool_t m, HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \ return sv##OP##_##CHAR##BITS##_x(m, a, b); \ } +#define HWY_SVE_RETV_ARGMVV_M(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_V(BASE, BITS) \ + NAME(svbool_t m, HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \ + return sv##OP##_##CHAR##BITS##_m(m, a, b); \ + } #define HWY_SVE_RETV_ARGVVV(BASE, CHAR, BITS, HALF, NAME, OP) \ HWY_API HWY_SVE_V(BASE, BITS) \ @@ -268,6 +273,13 @@ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SPECIALIZE, _, _) return sv##OP##_##CHAR##BITS(a, b, c); \ } +#define HWY_SVE_RETV_ARGMVVV(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_V(BASE, BITS) \ + NAME(svbool_t m, HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b, \ + HWY_SVE_V(BASE, BITS) c) { \ + return sv##OP##_##CHAR##BITS##_m(m, a, b, c); \ + } + // ------------------------------ Lanes namespace detail { @@ -1289,6 +1301,31 @@ HWY_SVE_FOREACH_F(HWY_SVE_FMA, NegMulSub, nmad) #undef HWY_SVE_FMA +// ------------------------------ MaskedMulAdd +namespace detail { +HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVVV, MaskedMulAdd, mad) +} + +// ------------------------------ MulAddLower +#if (defined(HWY_NATIVE_MUL_ADD_LOWER) == defined(HWY_TARGET_TOGGLE)) +#ifdef HWY_NATIVE_MUL_ADD_LOWER +#undef HWY_NATIVE_MUL_ADD_LOWER +#else +#define HWY_NATIVE_MUL_ADD_LOWER +#endif + +#define HWY_SVE_MUL_ADD_LOWER(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_V(BASE, BITS) \ + NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b, \ + HWY_SVE_V(BASE, BITS) c) { \ + return detail::MaskedMulAdd(svptrue_pat_b##BITS(SV_VL1), a, b, c); \ + } + +HWY_SVE_FOREACH(HWY_SVE_MUL_ADD_LOWER, MulAddLower, _) +#undef HWY_SVE_MUL_ADD_LOWER + +#endif // HWY_NATIVE_MUL_ADD_LOWER + // ------------------------------ Round etc. 
HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Round, rintn) @@ -1602,6 +1639,26 @@ HWY_API V MaskedSatSubOr(V no, M m, V a, V b) { } #endif +// ------------------------------ MaskedMul_M +namespace detail { +HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVV_M, MaskedMul_M, mul); +} + +// ------------------------------ MulLower +#ifdef HWY_NATIVE_MUL_LOWER +#undef HWY_NATIVE_MUL_LOWER +#else +#define HWY_NATIVE_MUL_LOWER +#endif + +#define HWY_SVE_MUL_LOWER(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_V(BASE, BITS) \ + NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \ + return detail::MaskedMul_M(svptrue_pat_b##BITS(SV_VL1), a, b); \ + } + +HWY_SVE_FOREACH(HWY_SVE_MUL_LOWER, MulLower, _) + template HWY_API V MaskedSqrtOr(V no, M m, V v) { return IfThenElse(m, detail::MaskedSqrt(m, v), no); @@ -2101,6 +2158,18 @@ HWY_API svbool_t IsFinite(const V v) { return RebindMask(d, detail::LtN(exp, hwy::MaxExponentField())); } +// ------------------------------ MulByPow2/MulByFloorPow2 + +#define HWY_SVE_MUL_BY_POW2(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_V(BASE, BITS) \ + NAME(HWY_SVE_V(BASE, BITS) v, HWY_SVE_V(int, BITS) exp) { \ + return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, exp); \ + } + +HWY_SVE_FOREACH_F(HWY_SVE_MUL_BY_POW2, MulByPow2, scale) + +#undef HWY_SVE_MUL_BY_POW2 + // ================================================== MEMORY // ------------------------------ LoadU/MaskedLoad/LoadDup128/StoreU/Stream @@ -6446,7 +6515,9 @@ HWY_API V HighestSetBitIndex(V v) { #undef HWY_SVE_RETV_ARGV #undef HWY_SVE_RETV_ARGVN #undef HWY_SVE_RETV_ARGVV +#undef HWY_SVE_RETV_ARGMVV_M #undef HWY_SVE_RETV_ARGVVV +#undef HWY_SVE_RETV_ARGMVVV #undef HWY_SVE_T #undef HWY_SVE_UNDEFINED #undef HWY_SVE_V diff --git a/hwy/ops/generic_ops-inl.h b/hwy/ops/generic_ops-inl.h index 645c9bf9d9..d56adf7408 100644 --- a/hwy/ops/generic_ops-inl.h +++ b/hwy/ops/generic_ops-inl.h @@ -577,6 +577,22 @@ HWY_API V AddSub(V a, V b) { return Add(a, negated_even_b); } +#if (defined(HWY_NATIVE_MUL_LOWER) == defined(HWY_TARGET_TOGGLE)) +#ifdef HWY_NATIVE_MUL_LOWER +#undef HWY_NATIVE_MUL_LOWER +#else +#define HWY_NATIVE_MUL_LOWER +#endif + +template +HWY_API V MulLower(V a, V b) { + const DFromV d; + const auto first_mask = FirstN(d, 1); + return MaskedMulOr(a, first_mask, a, b); +} + +#endif // HWY_NATIVE_MUL_LOWER + // ------------------------------ MaskedAddOr etc. 
#if (defined(HWY_NATIVE_MASKED_ARITH) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_MASKED_ARITH @@ -4352,6 +4368,12 @@ HWY_API V operator*(V x, V y) { #endif // HWY_NATIVE_MUL_64 +// ------------------------------ MulRound +template +HWY_API V MulRound(V a, V b) { + return Round(Mul(a, b)); +} + // ------------------------------ MulAdd / NegMulAdd #if (defined(HWY_NATIVE_INT_FMA) == defined(HWY_TARGET_TOGGLE)) @@ -4383,6 +4405,23 @@ HWY_API V MulSub(V mul, V x, V sub) { } #endif // HWY_NATIVE_INT_FMA +// ------------------------------ MulAddLower +#if (defined(HWY_NATIVE_MUL_ADD_LOWER) == defined(HWY_TARGET_TOGGLE)) +#ifdef HWY_NATIVE_MUL_ADD_LOWER +#undef HWY_NATIVE_MUL_ADD_LOWER +#else +#define HWY_NATIVE_MUL_ADD_LOWER +#endif + +template +HWY_API V MulAddLower(const V a, const V b, const V c) { + const DFromV d; + const MFromD> LowerMask = FirstN(d, 1); + return IfThenElse(LowerMask, MulAdd(a, b, c), a); +} + +#endif // HWY_NATIVE_MUL_ADD_LOWER + // ------------------------------ Integer MulSub / NegMulSub #if (defined(HWY_NATIVE_INT_FMSUB) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_INT_FMSUB diff --git a/hwy/tests/masked_arithmetic_test.cc b/hwy/tests/masked_arithmetic_test.cc index 2f1491b52d..910a942730 100644 --- a/hwy/tests/masked_arithmetic_test.cc +++ b/hwy/tests/masked_arithmetic_test.cc @@ -379,6 +379,66 @@ HWY_NOINLINE void TestAllFloatExceptions() { ForFloatTypes(ForPartialVectors()); } +struct TestMulLower { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const auto v0 = Zero(d); + + HWY_ASSERT_VEC_EQ(d, v0, MulLower(v0, v0)); + + const auto v2 = Iota(d, 2); + const auto v3 = Iota(d, 3); + + const size_t N = Lanes(d); + auto expected = AllocateAligned(N); + + for (size_t i = 0; i < N; ++i) { + if (i == 0) { + expected[i] = ConvertScalarTo(2 * 3); + } else { + expected[i] = ConvertScalarTo(i + 2); + } + } + + HWY_ASSERT_VEC_EQ(d, expected.get(), MulLower(v2, v3)); + } +}; + +HWY_NOINLINE void TestAllMulLower() { + ForAllTypes(ForPartialVectors()); +} + +struct TestMulAddLower { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const Vec v0 = Zero(d); + + // Test all zeros + HWY_ASSERT_VEC_EQ(d, v0, MulAddLower(v0, v0, v0)); + + // Test upper lanes of a being passed through + const Vec v1 = Iota(d, 1); + const Vec v2 = Iota(d, 2); + const Vec v3 = Iota(d, 3); + + const size_t N = Lanes(d); + auto expected = AllocateAligned(N); + + for (size_t i = 0; i < N; ++i) { + if (i == 0) { + expected[i] = ConvertScalarTo(5); + } else { + expected[i] = static_cast(i + 1); + } + } + + HWY_ASSERT_VEC_EQ(d, expected.get(), MulAddLower(v1, v2, v3)); + } +}; + +HWY_NOINLINE void TestAllTestMulAddLower() { + ForAllTypes(ForPartialVectors()); +} } // namespace // NOLINTNEXTLINE(google-readability-namespace-comments) } // namespace HWY_NAMESPACE @@ -394,6 +454,8 @@ HWY_EXPORT_AND_TEST_P(HwyMaskedArithmeticTest, TestAllSatAddSub); HWY_EXPORT_AND_TEST_P(HwyMaskedArithmeticTest, TestAllDiv); HWY_EXPORT_AND_TEST_P(HwyMaskedArithmeticTest, TestAllIntegerDivMod); HWY_EXPORT_AND_TEST_P(HwyMaskedArithmeticTest, TestAllFloatExceptions); +HWY_EXPORT_AND_TEST_P(HwyMaskedArithmeticTest, TestAllMulLower); +HWY_EXPORT_AND_TEST_P(HwyMaskedArithmeticTest, TestAllTestMulAddLower); HWY_AFTER_TEST(); } // namespace } // namespace hwy diff --git a/hwy/tests/mul_test.cc b/hwy/tests/mul_test.cc index 13307f41c0..2f8ab178a1 100644 --- a/hwy/tests/mul_test.cc +++ b/hwy/tests/mul_test.cc @@ -424,6 +424,33 @@ HWY_NOINLINE void TestAllMulOdd() { // uint64_t MulOdd is already tested 
in TestMulEvenOdd64 } +struct TestMulRound { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const Vec v0 = Zero(d); + + // Test that we correctly get all zeros + HWY_ASSERT_VEC_EQ(d, v0, MulRound(v0, v0)); + + // Test that we round to closest even in case of tie + const Vec v_half = Set(d, ConvertScalarTo(0.5f)); + const Vec v_1 = Set(d, ConvertScalarTo(1)); + + HWY_ASSERT_VEC_EQ(d, v0, MulRound(v_half, v_1)); + + // Test arbitrary multiplication + const Vec v_2 = Set(d, ConvertScalarTo(6.75)); + const Vec v_3 = Set(d, ConvertScalarTo(3.33)); + const Vec expected = Set(d, ConvertScalarTo(22)); + + HWY_ASSERT_VEC_EQ(d, expected, MulRound(v_2, v_3)); + } +}; + +HWY_NOINLINE void TestAllMulRound() { + ForFloatTypes(ForPartialVectors()); +} + } // namespace // NOLINTNEXTLINE(google-readability-namespace-comments) } // namespace HWY_NAMESPACE @@ -439,6 +466,7 @@ HWY_EXPORT_AND_TEST_P(HwyMulTest, TestAllMulHigh); HWY_EXPORT_AND_TEST_P(HwyMulTest, TestAllMulFixedPoint15); HWY_EXPORT_AND_TEST_P(HwyMulTest, TestAllMulEven); HWY_EXPORT_AND_TEST_P(HwyMulTest, TestAllMulOdd); +HWY_EXPORT_AND_TEST_P(HwyMulTest, TestAllMulRound); HWY_AFTER_TEST(); } // namespace } // namespace hwy From c2646953c9a4a152bb960a0404d980229bcc821d Mon Sep 17 00:00:00 2001 From: Will Barber Date: Tue, 28 Jan 2025 12:58:01 +0000 Subject: [PATCH 58/64] Fix review comments Remove MulLower Use MaskedMulOr instead Replace MulAddLower with MaskedMulAddOr --- g3doc/quick_reference.md | 8 +--- hwy/ops/arm_sve-inl.h | 51 +++++--------------- hwy/ops/generic_ops-inl.h | 36 ++++---------- hwy/tests/masked_arithmetic_test.cc | 74 ++++++++++++----------------- 4 files changed, 54 insertions(+), 115 deletions(-) diff --git a/g3doc/quick_reference.md b/g3doc/quick_reference.md index 83034e26d2..8213f4ac0b 100644 --- a/g3doc/quick_reference.md +++ b/g3doc/quick_reference.md @@ -760,9 +760,6 @@ All other ops in this section are only available if `HWY_TARGET != HWY_SCALAR`: V **MulHigh**(V a, V b): returns the upper half of `a[i] * b[i]` in each lane. -* V **MulLower**(V a, V b): returns `a[0] * b[0]` in the - first lane and `a[i]` otherwise. - * `V`: `i16` \ V **MulFixedPoint15**(V a, V b): returns the result of multiplying two Q1.15 fixed-point numbers. This corresponds to doubling the @@ -889,9 +886,6 @@ variants are somewhat slower on Arm, and unavailable for integer inputs; if the potentially more efficient than `MulAdd(PromoteOddTo(d, a), PromoteOddTo(d, b), c)`. -* V **MulAddLower**(V a, V b, V c): returns `a[0] * b[0] + c[0]` - and `a[i]` in all other lanes. - #### Masked arithmetic All ops in this section return `no` for `mask=false` lanes, and suppress any @@ -925,6 +919,8 @@ not a concern, these are equivalent to, and potentially more efficient than, V **MaskedSatSubOr**(V no, M m, V a, V b): returns `a[i] + b[i]` saturated to the minimum/maximum representable value, or `no[i]` if `m[i]` is false. +* V **MaskedMulAddOr**(V no, M m, V mul, V x, V add): returns + `mul[i] * x[i] + add[i]` or `no[i]` if `m[i]` is false. 
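
For illustration, a sketch of how `MaskedMulAddOr` might be used to update an accumulator only in selected lanes. Apart from the op itself, everything here (function name, the `x[i] > 0` predicate, the assumption that `size` is a multiple of `Lanes(d)`) is a hypothetical example, not part of this patch:

```c++
// Sketch only: acc[i] += mul[i] * x[i] where x[i] > 0; other lanes of acc
// are left unchanged by passing them as the `no` argument.
#include <stddef.h>

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

void MaskedFma(const float* HWY_RESTRICT mul, const float* HWY_RESTRICT x,
               size_t size, float* HWY_RESTRICT acc) {
  const hn::ScalableTag<float> d;
  for (size_t i = 0; i < size; i += hn::Lanes(d)) {  // size % Lanes(d) == 0
    const auto vmul = hn::LoadU(d, mul + i);
    const auto vx = hn::LoadU(d, x + i);
    const auto vacc = hn::LoadU(d, acc + i);
    const auto m = hn::Gt(vx, hn::Zero(d));  // update only where x[i] > 0
    // MaskedMulAddOr(no, m, mul, x, add): mul*x+add where m, else no.
    hn::StoreU(hn::MaskedMulAddOr(vacc, m, vmul, vx, vacc), d, acc + i);
  }
}
```
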
#### Zero masked arithmetic diff --git a/hwy/ops/arm_sve-inl.h b/hwy/ops/arm_sve-inl.h index 95eb16e0a3..6942e90d68 100644 --- a/hwy/ops/arm_sve-inl.h +++ b/hwy/ops/arm_sve-inl.h @@ -277,7 +277,7 @@ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SPECIALIZE, _, _) HWY_API HWY_SVE_V(BASE, BITS) \ NAME(svbool_t m, HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b, \ HWY_SVE_V(BASE, BITS) c) { \ - return sv##OP##_##CHAR##BITS##_m(m, a, b, c); \ + return sv##OP##_##CHAR##BITS##_x(m, a, b, c); \ } // ------------------------------ Lanes @@ -1301,31 +1301,6 @@ HWY_SVE_FOREACH_F(HWY_SVE_FMA, NegMulSub, nmad) #undef HWY_SVE_FMA -// ------------------------------ MaskedMulAdd -namespace detail { -HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVVV, MaskedMulAdd, mad) -} - -// ------------------------------ MulAddLower -#if (defined(HWY_NATIVE_MUL_ADD_LOWER) == defined(HWY_TARGET_TOGGLE)) -#ifdef HWY_NATIVE_MUL_ADD_LOWER -#undef HWY_NATIVE_MUL_ADD_LOWER -#else -#define HWY_NATIVE_MUL_ADD_LOWER -#endif - -#define HWY_SVE_MUL_ADD_LOWER(BASE, CHAR, BITS, HALF, NAME, OP) \ - HWY_API HWY_SVE_V(BASE, BITS) \ - NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b, \ - HWY_SVE_V(BASE, BITS) c) { \ - return detail::MaskedMulAdd(svptrue_pat_b##BITS(SV_VL1), a, b, c); \ - } - -HWY_SVE_FOREACH(HWY_SVE_MUL_ADD_LOWER, MulAddLower, _) -#undef HWY_SVE_MUL_ADD_LOWER - -#endif // HWY_NATIVE_MUL_ADD_LOWER - // ------------------------------ Round etc. HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Round, rintn) @@ -1639,25 +1614,23 @@ HWY_API V MaskedSatSubOr(V no, M m, V a, V b) { } #endif -// ------------------------------ MaskedMul_M +// ------------------------------ MaskedMulAddOr namespace detail { -HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVV_M, MaskedMul_M, mul); +HWY_SVE_FOREACH(HWY_SVE_RETV_ARGMVVV, MaskedMulAdd, mad) } -// ------------------------------ MulLower -#ifdef HWY_NATIVE_MUL_LOWER -#undef HWY_NATIVE_MUL_LOWER +// Per-target flag to prevent generic_ops-inl.h from defining int +// MaskedMulAddOr. +#ifdef HWY_NATIVE_MASKED_INT_FMA +#undef HWY_NATIVE_MASKED_INT_FMA #else -#define HWY_NATIVE_MUL_LOWER +#define HWY_NATIVE_MASKED_INT_FMA #endif -#define HWY_SVE_MUL_LOWER(BASE, CHAR, BITS, HALF, NAME, OP) \ - HWY_API HWY_SVE_V(BASE, BITS) \ - NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \ - return detail::MaskedMul_M(svptrue_pat_b##BITS(SV_VL1), a, b); \ - } - -HWY_SVE_FOREACH(HWY_SVE_MUL_LOWER, MulLower, _) +template +HWY_API V MaskedMulAddOr(V no, M m, V mul, V x, V add) { + return IfThenElse(m, detail::MaskedMulAdd(m, mul, x, add), no); +} template HWY_API V MaskedSqrtOr(V no, M m, V v) { diff --git a/hwy/ops/generic_ops-inl.h b/hwy/ops/generic_ops-inl.h index d56adf7408..05ff70e0fb 100644 --- a/hwy/ops/generic_ops-inl.h +++ b/hwy/ops/generic_ops-inl.h @@ -577,22 +577,6 @@ HWY_API V AddSub(V a, V b) { return Add(a, negated_even_b); } -#if (defined(HWY_NATIVE_MUL_LOWER) == defined(HWY_TARGET_TOGGLE)) -#ifdef HWY_NATIVE_MUL_LOWER -#undef HWY_NATIVE_MUL_LOWER -#else -#define HWY_NATIVE_MUL_LOWER -#endif - -template -HWY_API V MulLower(V a, V b) { - const DFromV d; - const auto first_mask = FirstN(d, 1); - return MaskedMulOr(a, first_mask, a, b); -} - -#endif // HWY_NATIVE_MUL_LOWER - // ------------------------------ MaskedAddOr etc. 
#if (defined(HWY_NATIVE_MASKED_ARITH) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_MASKED_ARITH @@ -4405,22 +4389,20 @@ HWY_API V MulSub(V mul, V x, V sub) { } #endif // HWY_NATIVE_INT_FMA -// ------------------------------ MulAddLower -#if (defined(HWY_NATIVE_MUL_ADD_LOWER) == defined(HWY_TARGET_TOGGLE)) -#ifdef HWY_NATIVE_MUL_ADD_LOWER -#undef HWY_NATIVE_MUL_ADD_LOWER +// ------------------------------ MaskedMulAddOr +#if (defined(HWY_NATIVE_MASKED_INT_FMA) == defined(HWY_TARGET_TOGGLE)) +#ifdef HWY_NATIVE_MASKED_INT_FMA +#undef HWY_NATIVE_MASKED_INT_FMA #else -#define HWY_NATIVE_MUL_ADD_LOWER +#define HWY_NATIVE_MASKED_INT_FMA #endif -template -HWY_API V MulAddLower(const V a, const V b, const V c) { - const DFromV d; - const MFromD> LowerMask = FirstN(d, 1); - return IfThenElse(LowerMask, MulAdd(a, b, c), a); +template +HWY_API V MaskedMulAddOr(V no, M m, V mul, V x, V add) { + return IfThenElse(m, MulAdd(mul, x, add), no); } -#endif // HWY_NATIVE_MUL_ADD_LOWER +#endif // HWY_NATIVE_MASKED_INT_FMA // ------------------------------ Integer MulSub / NegMulSub #if (defined(HWY_NATIVE_INT_FMSUB) == defined(HWY_TARGET_TOGGLE)) diff --git a/hwy/tests/masked_arithmetic_test.cc b/hwy/tests/masked_arithmetic_test.cc index 910a942730..4debd17561 100644 --- a/hwy/tests/masked_arithmetic_test.cc +++ b/hwy/tests/masked_arithmetic_test.cc @@ -379,65 +379,54 @@ HWY_NOINLINE void TestAllFloatExceptions() { ForFloatTypes(ForPartialVectors()); } -struct TestMulLower { +struct TestMaskedMulAdd { template HWY_NOINLINE void operator()(T /*unused*/, D d) { - const auto v0 = Zero(d); - - HWY_ASSERT_VEC_EQ(d, v0, MulLower(v0, v0)); - - const auto v2 = Iota(d, 2); - const auto v3 = Iota(d, 3); + RandomState rng; + const Vec k0 = Zero(d); + const Vec v1 = Iota(d, 1); + const Vec v2 = Iota(d, 2); + using TI = MakeSigned; // For mask > 0 comparison + const Rebind di; + using VI = Vec; const size_t N = Lanes(d); + auto bool_lanes = AllocateAligned(N); auto expected = AllocateAligned(N); + HWY_ASSERT(bool_lanes && expected); + HWY_ASSERT_VEC_EQ(d, k0, MaskedMulAddOr(v1, MaskTrue(d), k0, k0, k0)); + HWY_ASSERT_VEC_EQ(d, v2, MaskedMulAddOr(v1, MaskTrue(d), k0, v1, v2)); + HWY_ASSERT_VEC_EQ(d, v2, MaskedMulAddOr(v1, MaskTrue(d), v1, k0, v2)); + HWY_ASSERT_VEC_EQ(d, v1, MaskedMulAddOr(v1, MaskFalse(d), k0, k0, k0)); + HWY_ASSERT_VEC_EQ(d, v1, MaskedMulAddOr(v1, MaskFalse(d), k0, v1, v2)); + HWY_ASSERT_VEC_EQ(d, v1, MaskedMulAddOr(v1, MaskFalse(d), v1, k0, v2)); for (size_t i = 0; i < N; ++i) { - if (i == 0) { - expected[i] = ConvertScalarTo(2 * 3); + bool_lanes[i] = (Random32(&rng) & 1024) ? 
TI(1) : TI(0); + if (bool_lanes[i]) { + expected[i] = ConvertScalarTo((i + 1) * (i + 2)); } else { - expected[i] = ConvertScalarTo(i + 2); + expected[i] = ConvertScalarTo(i + 1); } } - - HWY_ASSERT_VEC_EQ(d, expected.get(), MulLower(v2, v3)); - } -}; - -HWY_NOINLINE void TestAllMulLower() { - ForAllTypes(ForPartialVectors()); -} - -struct TestMulAddLower { - template - HWY_NOINLINE void operator()(T /*unused*/, D d) { - const Vec v0 = Zero(d); - - // Test all zeros - HWY_ASSERT_VEC_EQ(d, v0, MulAddLower(v0, v0, v0)); - - // Test upper lanes of a being passed through - const Vec v1 = Iota(d, 1); - const Vec v2 = Iota(d, 2); - const Vec v3 = Iota(d, 3); - - const size_t N = Lanes(d); - auto expected = AllocateAligned(N); + const VI mask_i = Load(di, bool_lanes.get()); + const Mask mask = RebindMask(d, Gt(mask_i, Zero(di))); + HWY_ASSERT_VEC_EQ(d, expected.get(), MaskedMulAddOr(v1, mask, v2, v1, k0)); + HWY_ASSERT_VEC_EQ(d, expected.get(), MaskedMulAddOr(v1, mask, v1, v2, k0)); for (size_t i = 0; i < N; ++i) { - if (i == 0) { - expected[i] = ConvertScalarTo(5); + if (bool_lanes[i]) { + expected[i] = ConvertScalarTo((i + 2) * (i + 2) + (i + 1)); } else { - expected[i] = static_cast(i + 1); + expected[i] = ConvertScalarTo(i + 2); } } - - HWY_ASSERT_VEC_EQ(d, expected.get(), MulAddLower(v1, v2, v3)); + HWY_ASSERT_VEC_EQ(d, expected.get(), MaskedMulAddOr(v2, mask, v2, v2, v1)); } }; -HWY_NOINLINE void TestAllTestMulAddLower() { - ForAllTypes(ForPartialVectors()); +HWY_NOINLINE void TestAllMaskedMulAdd() { + ForAllTypes(ForPartialVectors()); } } // namespace // NOLINTNEXTLINE(google-readability-namespace-comments) @@ -454,8 +443,7 @@ HWY_EXPORT_AND_TEST_P(HwyMaskedArithmeticTest, TestAllSatAddSub); HWY_EXPORT_AND_TEST_P(HwyMaskedArithmeticTest, TestAllDiv); HWY_EXPORT_AND_TEST_P(HwyMaskedArithmeticTest, TestAllIntegerDivMod); HWY_EXPORT_AND_TEST_P(HwyMaskedArithmeticTest, TestAllFloatExceptions); -HWY_EXPORT_AND_TEST_P(HwyMaskedArithmeticTest, TestAllMulLower); -HWY_EXPORT_AND_TEST_P(HwyMaskedArithmeticTest, TestAllTestMulAddLower); +HWY_EXPORT_AND_TEST_P(HwyMaskedArithmeticTest, TestAllMaskedMulAdd); HWY_AFTER_TEST(); } // namespace } // namespace hwy From ec5b0aa7f5dde15f535f557fbfa3b0dfeea2e78e Mon Sep 17 00:00:00 2001 From: Will Barber Date: Wed, 29 Jan 2025 14:11:14 +0000 Subject: [PATCH 59/64] Remove redundant macro --- hwy/ops/arm_sve-inl.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/hwy/ops/arm_sve-inl.h b/hwy/ops/arm_sve-inl.h index 6942e90d68..4c4a37e5d4 100644 --- a/hwy/ops/arm_sve-inl.h +++ b/hwy/ops/arm_sve-inl.h @@ -260,11 +260,6 @@ HWY_SVE_FOREACH_BF16_UNCONDITIONAL(HWY_SPECIALIZE, _, _) NAME(svbool_t m, HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \ return sv##OP##_##CHAR##BITS##_x(m, a, b); \ } -#define HWY_SVE_RETV_ARGMVV_M(BASE, CHAR, BITS, HALF, NAME, OP) \ - HWY_API HWY_SVE_V(BASE, BITS) \ - NAME(svbool_t m, HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \ - return sv##OP##_##CHAR##BITS##_m(m, a, b); \ - } #define HWY_SVE_RETV_ARGVVV(BASE, CHAR, BITS, HALF, NAME, OP) \ HWY_API HWY_SVE_V(BASE, BITS) \ @@ -6488,7 +6483,6 @@ HWY_API V HighestSetBitIndex(V v) { #undef HWY_SVE_RETV_ARGV #undef HWY_SVE_RETV_ARGVN #undef HWY_SVE_RETV_ARGVV -#undef HWY_SVE_RETV_ARGMVV_M #undef HWY_SVE_RETV_ARGVVV #undef HWY_SVE_RETV_ARGMVVV #undef HWY_SVE_T From ecb2f360bbaf787e0529e2c5180cd8c4941e27ad Mon Sep 17 00:00:00 2001 From: Mohammad Azim Khan Date: Fri, 15 Nov 2024 16:03:14 +0000 Subject: [PATCH 60/64] Load/Store, masked set and counting operations --- 
g3doc/quick_reference.md | 28 +++++++++++ hwy/ops/arm_sve-inl.h | 80 ++++++++++++++++++++++++++++++++ hwy/ops/generic_ops-inl.h | 97 +++++++++++++++++++++++++++++++++++++++ hwy/tests/count_test.cc | 43 +++++++++++++++++ hwy/tests/logical_test.cc | 28 +++++++++++ hwy/tests/mask_test.cc | 64 ++++++++++++++++++++++++++ hwy/tests/memory_test.cc | 81 ++++++++++++++++++++++++++++++++ 7 files changed, 421 insertions(+) diff --git a/g3doc/quick_reference.md b/g3doc/quick_reference.md index 8213f4ac0b..62f8e9a537 100644 --- a/g3doc/quick_reference.md +++ b/g3doc/quick_reference.md @@ -429,6 +429,10 @@ for comparisons, for example `Lt` instead of `operator<`. the result, with `t0` in the least-significant (lowest-indexed) lane of each 128-bit block and `tK` in the most-significant (highest-indexed) lane of each 128-bit block: `{t0, t1, ..., tK}` +* V **SetOr**(V no, M m, T a): returns N-lane vector with lane + `i` equal to `a` if `m[i]` is true else `no[i]`. +* V **SetOrZero**(D d, M m, T a): returns N-lane vector with lane + `i` equal to `a` if `m[i]` is true else 0. ### Getting/setting lanes @@ -1065,6 +1069,10 @@ Per-lane variable shifts (slow if SSSE3/SSE4, or 16-bit, or Shr i64 on AVX2): leading zeros in each lane. For any lanes where ```a[i]``` is zero, ```sizeof(TFromV) * 8``` is returned in the corresponding result lanes. +* `V`: `{u,i}` \ + V **MaskedLeadingZeroCountOrZero**(M m, `V a): returns the + result of LeadingZeroCount where `m[i]` is true, and zero otherwise. + * `V`: `{u,i}` \ V **TrailingZeroCount**(V a): returns the number of trailing zeros in each lane. For any lanes where ```a[i]``` is zero, @@ -1079,6 +1087,12 @@ Per-lane variable shifts (slow if SSSE3/SSE4, or 16-bit, or Shr i64 on AVX2): ```HighestValue>>()``` is returned in the corresponding result lanes. +* bool **AllOnes**(D, V v): returns whether all bits in `v[i]` + are set. + +* bool **AllZeros**(D, V v): returns whether all bits in `v[i]` + are clear. + The following operate on individual bits within each lane. Note that the non-operator functions (`And` instead of `&`) must be used for floating-point types, and on SVE/RVV. @@ -1563,6 +1577,9 @@ aligned memory at indices which are not a multiple of the vector length): * Vec<D> **LoadU**(D, const T* p): returns `p[i]`. +* Vec<D> **MaskedLoadU**(D, M m, const T* p): returns `p[i]` + where mask is true and returns zero otherwise. + * Vec<D> **LoadDup128**(D, const T* p): returns one 128-bit block loaded from `p` and broadcasted into all 128-bit block\[s\]. This may be faster than broadcasting single values, and is more convenient than @@ -1593,6 +1610,10 @@ aligned memory at indices which are not a multiple of the vector length): lanes from `p` to the first (lowest-index) lanes of the result vector and fills the remaining lanes with `no`. Like LoadN, this does not fault. +* Vec<D> **LoadHigher**(D d, V v, T* p): Loads `Lanes(d)/2` lanes from + `p` into the upper lanes of the result vector and the lower half of `v` into + the lower lanes. + #### Store * void **Store**(Vec<D> v, D, T* aligned): copies `v[i]` @@ -1632,6 +1653,13 @@ aligned memory at indices which are not a multiple of the vector length): StoreN does not modify any memory past `p + HWY_MIN(Lanes(d), max_lanes_to_store) - 1`. +* void **StoreTruncated**(Vec<DFrom> v, DFrom d, To* HWY_RESTRICT + p): Truncates elements of `v` to type `To` and stores on `p`. It is + similar to performing `TruncateTo` followed by `StoreN` where + `max_lanes_to_store` is `Lanes(d)`. 
+ + StoreTruncated does not modify any memory past `p + Lanes(d) - 1`. + #### Interleaved * void **LoadInterleaved2**(D, const T* p, Vec<D>& v0, diff --git a/hwy/ops/arm_sve-inl.h b/hwy/ops/arm_sve-inl.h index 4c4a37e5d4..e571400b37 100644 --- a/hwy/ops/arm_sve-inl.h +++ b/hwy/ops/arm_sve-inl.h @@ -427,6 +427,27 @@ using VFromD = decltype(Set(D(), TFromD())); using VBF16 = VFromD>; +// ------------------------------ SetOr/SetOrZero + +#define HWY_SVE_SET_OR(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) inactive, \ + svbool_t m, HWY_SVE_T(BASE, BITS) op) { \ + return sv##OP##_##CHAR##BITS##_m(inactive, m, op); \ + } + +HWY_SVE_FOREACH(HWY_SVE_SET_OR, SetOr, dup_n) +#undef HWY_SVE_SET_OR + +#define HWY_SVE_SET_OR_ZERO(BASE, CHAR, BITS, HALF, NAME, OP) \ + template \ + HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \ + svbool_t m, HWY_SVE_T(BASE, BITS) op) { \ + return sv##OP##_##CHAR##BITS##_z(m, op); \ + } + +HWY_SVE_FOREACH(HWY_SVE_SET_OR_ZERO, SetOrZero, dup_n) +#undef HWY_SVE_SET_OR_ZERO + // ------------------------------ Zero template @@ -2211,6 +2232,18 @@ HWY_API void BlendedStore(VFromD v, MFromD m, D d, #undef HWY_SVE_MEM +#define HWY_SVE_MASKED_MEM(BASE, CHAR, BITS, HALF, NAME, OP) \ + template \ + HWY_API HWY_SVE_V(BASE, BITS) \ + MaskedLoadU(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, svbool_t m, \ + const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \ + return svld1_##CHAR##BITS(m, detail::NativeLanePointer(p)); \ + } + +HWY_SVE_FOREACH(HWY_SVE_MASKED_MEM, _, _) + +#undef HWY_SVE_MASKED_MEM + #if HWY_TARGET != HWY_SVE2_128 namespace detail { #define HWY_SVE_LOAD_DUP128(BASE, CHAR, BITS, HALF, NAME, OP) \ @@ -2257,6 +2290,37 @@ HWY_API VFromD LoadDup128(D d, const TFromD* HWY_RESTRICT p) { #endif // HWY_TARGET != HWY_SVE2_128 +// Truncate to smaller size and store +#ifdef HWY_NATIVE_STORE_TRUNCATED +#undef HWY_NATIVE_STORE_TRUNCATED +#else +#define HWY_NATIVE_STORE_TRUNCATED +#endif + +#define HWY_SVE_STORE_TRUNCATED(BASE, CHAR, BITS, HALF, NAME, OP, TO_BITS) \ + template \ + HWY_API void NAME(HWY_SVE_V(BASE, BITS) v, \ + const HWY_SVE_D(BASE, BITS, N, kPow2) d, \ + HWY_SVE_T(BASE, TO_BITS) * HWY_RESTRICT p) { \ + sv##OP##_##CHAR##BITS(detail::PTrue(d), detail::NativeLanePointer(p), v); \ + } + +#define HWY_SVE_STORE_TRUNCATED_BYTE(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_SVE_STORE_TRUNCATED(BASE, CHAR, BITS, HALF, NAME, OP, 8) +#define HWY_SVE_STORE_TRUNCATED_HALF(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_SVE_STORE_TRUNCATED(BASE, CHAR, BITS, HALF, NAME, OP, 16) +#define HWY_SVE_STORE_TRUNCATED_WORD(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_SVE_STORE_TRUNCATED(BASE, CHAR, BITS, HALF, NAME, OP, 32) + +HWY_SVE_FOREACH_UI16(HWY_SVE_STORE_TRUNCATED_BYTE, StoreTruncated, st1b) +HWY_SVE_FOREACH_UI32(HWY_SVE_STORE_TRUNCATED_BYTE, StoreTruncated, st1b) +HWY_SVE_FOREACH_UI64(HWY_SVE_STORE_TRUNCATED_BYTE, StoreTruncated, st1b) +HWY_SVE_FOREACH_UI32(HWY_SVE_STORE_TRUNCATED_HALF, StoreTruncated, st1h) +HWY_SVE_FOREACH_UI64(HWY_SVE_STORE_TRUNCATED_HALF, StoreTruncated, st1h) +HWY_SVE_FOREACH_UI64(HWY_SVE_STORE_TRUNCATED_WORD, StoreTruncated, st1w) + +#undef HWY_SVE_STORE_TRUNCATED + // ------------------------------ Load/Store // SVE only requires lane alignment, not natural alignment of the entire @@ -6442,6 +6506,22 @@ HWY_API V HighestSetBitIndex(V v) { return BitCast(d, Sub(Set(d, T{sizeof(T) * 8 - 1}), LeadingZeroCount(v))); } +#ifdef HWY_NATIVE_MASKED_LEADING_ZERO_COUNT +#undef 
HWY_NATIVE_MASKED_LEADING_ZERO_COUNT +#else +#define HWY_NATIVE_MASKED_LEADING_ZERO_COUNT +#endif + +#define HWY_SVE_MASKED_LEADING_ZERO_COUNT(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_V(BASE, BITS) NAME(svbool_t m, HWY_SVE_V(BASE, BITS) v) { \ + const DFromV d; \ + return BitCast(d, sv##OP##_##CHAR##BITS##_z(m, v)); \ + } + +HWY_SVE_FOREACH_UI(HWY_SVE_MASKED_LEADING_ZERO_COUNT, + MaskedLeadingZeroCountOrZero, clz) +#undef HWY_SVE_LEADING_ZERO_COUNT + // ================================================== END MACROS #undef HWY_SVE_ALL_PTRUE #undef HWY_SVE_D diff --git a/hwy/ops/generic_ops-inl.h b/hwy/ops/generic_ops-inl.h index 05ff70e0fb..9837835ec2 100644 --- a/hwy/ops/generic_ops-inl.h +++ b/hwy/ops/generic_ops-inl.h @@ -97,6 +97,21 @@ HWY_API Vec Inf(D d) { return BitCast(d, Set(du, max_x2 >> 1)); } +// ------------------------------ SetOr/SetOrZero + +template , typename D = DFromV, + typename M = MFromD> +HWY_API V SetOr(V no, M m, T a) { + D d; + return IfThenElse(m, Set(d, a), no); +} + +template , typename M = MFromD, + typename T = TFromD> +HWY_API V SetOrZero(D d, M m, T a) { + return IfThenElseZero(m, Set(d, a)); +} + // ------------------------------ ZeroExtendResizeBitCast // The implementation of detail::ZeroExtendResizeBitCast for the HWY_EMU128 @@ -336,6 +351,20 @@ HWY_API Mask DemoteMaskTo(DTo d_to, DFrom d_from, Mask m) { #endif // HWY_NATIVE_DEMOTE_MASK_TO +// ------------------------------ LoadHigher +#if (defined(HWY_NATIVE_LOAD_HIGHER) == defined(HWY_TARGET_TOGGLE)) +#ifdef HWY_NATIVE_LOAD_HIGHER +#undef HWY_NATIVE_LOAD_HIGHER +#else +#define HWY_NATIVE_LOAD_HIGHER +#endif +template (), HWY_IF_LANES_GT_D(D, 1)> +HWY_API V LoadHigher(D d, V a, T* p) { + const V b = LoadU(d, p); + return ConcatLowerLower(d, b, a); +} +#endif // HWY_NATIVE_LOAD_HIGHER + // ------------------------------ CombineMasks #if (defined(HWY_NATIVE_COMBINE_MASKS) == defined(HWY_TARGET_TOGGLE)) @@ -1175,6 +1204,12 @@ HWY_API V MulByFloorPow2(V v, V exp) { #endif // HWY_NATIVE_MUL_BY_POW2 +// ------------------------------ MaskedLoadU +template +HWY_API VFromD MaskedLoadU(D d, M m, + const TFromD* HWY_RESTRICT unaligned) { + return IfThenElseZero(m, LoadU(d, unaligned)); +} // ------------------------------ GetExponent #if (defined(HWY_NATIVE_GET_EXPONENT) == defined(HWY_TARGET_TOGGLE)) @@ -2659,6 +2694,25 @@ HWY_API void StoreN(VFromD v, D d, T* HWY_RESTRICT p, #endif // (defined(HWY_NATIVE_STORE_N) == defined(HWY_TARGET_TOGGLE)) +// ------------------------------ StoreTruncated +#if (defined(HWY_NATIVE_STORE_TRUNCATED) == defined(HWY_TARGET_TOGGLE)) +#ifdef HWY_NATIVE_STORE_TRUNCATED +#undef HWY_NATIVE_STORE_TRUNCATED +#else +#define HWY_NATIVE_STORE_TRUNCATED +#endif + +template , + HWY_IF_T_SIZE_GT_D(DFrom, sizeof(To)), + HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(VFromD)> +HWY_API void StoreTruncated(VFromD v, const DFrom d, + To* HWY_RESTRICT p) { + DTo dsmall; + StoreN(TruncateTo(dsmall, v), dsmall, p, Lanes(d)); +} + +#endif // (defined(HWY_NATIVE_STORE_TRUNCATED) == defined(HWY_TARGET_TOGGLE)) + // ------------------------------ Scatter #if (defined(HWY_NATIVE_SCATTER) == defined(HWY_TARGET_TOGGLE)) @@ -3886,6 +3940,21 @@ HWY_API V TrailingZeroCount(V v) { } #endif // HWY_NATIVE_LEADING_ZERO_COUNT +// ------------------------------ MaskedLeadingZeroCountOrZero +#if (defined(HWY_NATIVE_MASKED_LEADING_ZERO_COUNT) == \ + defined(HWY_TARGET_TOGGLE)) +#ifdef HWY_NATIVE_MASKED_LEADING_ZERO_COUNT +#undef HWY_NATIVE_MASKED_LEADING_ZERO_COUNT +#else +#define HWY_NATIVE_MASKED_LEADING_ZERO_COUNT 
+#endif + +template +HWY_API V MaskedLeadingZeroCountOrZero(M m, V v) { + return IfThenElseZero(m, LeadingZeroCount(v)); +} +#endif // HWY_NATIVE_MASKED_LEADING_ZERO_COUNT + // ------------------------------ AESRound // Cannot implement on scalar: need at least 16 bytes for TableLookupBytes. @@ -7442,6 +7511,34 @@ HWY_API V BitShuffle(V v, VI idx) { #endif // HWY_NATIVE_BITSHUFFLE +// ------------------------------ AllOnes/AllZeros +#if (defined(HWY_NATIVE_ALLONES) == defined(HWY_TARGET_TOGGLE)) +#ifdef HWY_NATIVE_ALLONES +#undef HWY_NATIVE_ALLONES +#else +#define HWY_NATIVE_ALLONES +#endif + +template +HWY_API bool AllOnes(V a) { + DFromV d; + return AllTrue(d, Eq(Not(a), Zero(d))); +} +#endif // HWY_NATIVE_ALLONES + +#if (defined(HWY_NATIVE_ALLZEROS) == defined(HWY_TARGET_TOGGLE)) +#ifdef HWY_NATIVE_ALLZEROS +#undef HWY_NATIVE_ALLZEROS +#else +#define HWY_NATIVE_ALLZEROS +#endif + +template +HWY_API bool AllZeros(V a) { + DFromV d; + return AllTrue(d, Eq(a, Zero(d))); +} +#endif // HWY_NATIVE_ALLZEROS // ================================================== Operator wrapper // SVE* and RVV currently cannot define operators and have already defined diff --git a/hwy/tests/count_test.cc b/hwy/tests/count_test.cc index cc2d841122..40939d949c 100644 --- a/hwy/tests/count_test.cc +++ b/hwy/tests/count_test.cc @@ -132,6 +132,48 @@ HWY_NOINLINE void TestAllLeadingZeroCount() { ForIntegerTypes(ForPartialVectors()); } +struct TestMaskedLeadingZeroCount { + template + HWY_ATTR_NO_MSAN HWY_NOINLINE void operator()(T /*unused*/, D d) { + RandomState rng; + using TU = MakeUnsigned; + const RebindToUnsigned du; + size_t N = Lanes(d); + const MFromD first_3 = FirstN(d, 3); + auto data = AllocateAligned(N); + auto lzcnt = AllocateAligned(N); + HWY_ASSERT(data && lzcnt); + + constexpr T kNumOfBitsInT = static_cast(sizeof(T) * 8); + for (size_t j = 0; j < N; j++) { + if (j < 3) { + lzcnt[j] = static_cast(kNumOfBitsInT - 2); + } else { + lzcnt[j] = static_cast(0); + } + } + HWY_ASSERT_VEC_EQ( + d, lzcnt.get(), + MaskedLeadingZeroCountOrZero(first_3, Set(d, static_cast(2)))); + + for (size_t j = 0; j < N; j++) { + if (j < 3) { + lzcnt[j] = static_cast(1); + } else { + lzcnt[j] = static_cast(0); + } + } + HWY_ASSERT_VEC_EQ( + d, lzcnt.get(), + MaskedLeadingZeroCountOrZero( + first_3, BitCast(d, Set(du, TU{1} << (kNumOfBitsInT - 2))))); + } +}; + +HWY_NOINLINE void TestAllMaskedLeadingZeroCount() { + ForIntegerTypes(ForPartialVectors()); +} + template static HWY_INLINE T TrailingZeroCountOfValue(T val) { @@ -303,6 +345,7 @@ namespace { HWY_BEFORE_TEST(HwyCountTest); HWY_EXPORT_AND_TEST_P(HwyCountTest, TestAllPopulationCount); HWY_EXPORT_AND_TEST_P(HwyCountTest, TestAllLeadingZeroCount); +HWY_EXPORT_AND_TEST_P(HwyCountTest, TestAllMaskedLeadingZeroCount); HWY_EXPORT_AND_TEST_P(HwyCountTest, TestAllTrailingZeroCount); HWY_EXPORT_AND_TEST_P(HwyCountTest, TestAllHighestSetBitIndex); HWY_AFTER_TEST(); diff --git a/hwy/tests/logical_test.cc b/hwy/tests/logical_test.cc index ecd7589c9e..6036594413 100644 --- a/hwy/tests/logical_test.cc +++ b/hwy/tests/logical_test.cc @@ -146,6 +146,32 @@ HWY_NOINLINE void TestAllTestBit() { ForIntegerTypes(ForPartialVectors()); } +struct TestAllOnes { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + auto v0s = Zero(d); + HWY_ASSERT(AllZeros(v0s)); + auto v1s = Not(v0s); + HWY_ASSERT(AllOnes(v1s)); + const size_t kNumBits = sizeof(T) * 8; + for (size_t i = 0; i < kNumBits; ++i) { + const Vec bit1 = Set(d, static_cast(1ull << i)); + const Vec bit2 = Set(d, static_cast(1ull 
<< ((i + 1) % kNumBits))); + const Vec bits12 = Or(bit1, bit2); + HWY_ASSERT(!AllOnes(bit1)); + HWY_ASSERT(!AllZeros(bit1)); + HWY_ASSERT(!AllOnes(bit2)); + HWY_ASSERT(!AllZeros(bit2)); + HWY_ASSERT(!AllOnes(bits12)); + HWY_ASSERT(!AllZeros(bits12)); + } + } +}; + +HWY_NOINLINE void TestAllAllOnes() { + ForIntegerTypes(ForPartialVectors()); +} + } // namespace // NOLINTNEXTLINE(google-readability-namespace-comments) } // namespace HWY_NAMESPACE @@ -159,6 +185,8 @@ HWY_BEFORE_TEST(HwyLogicalTest); HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllNot); HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogical); HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllTestBit); +HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllAllOnes); + HWY_AFTER_TEST(); } // namespace } // namespace hwy diff --git a/hwy/tests/mask_test.cc b/hwy/tests/mask_test.cc index afd564b46e..af65378084 100644 --- a/hwy/tests/mask_test.cc +++ b/hwy/tests/mask_test.cc @@ -317,6 +317,68 @@ HWY_NOINLINE void TestAllLogicalMask() { ForAllTypes(ForPartialVectors()); } +struct TestSetOr { + template + void testWithMask(D d, MFromD m) { + TFromD a = 1; + auto yes = Set(d, a); + auto no = Set(d, 2); + auto expected = IfThenElse(m, yes, no); + auto actual = SetOr(no, m, a); + HWY_ASSERT_VEC_EQ(d, expected, actual); + } + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + // All False + testWithMask(d, MaskFalse(d)); + auto N = Lanes(d); + // All True + testWithMask(d, FirstN(d, N)); + // Lower half + testWithMask(d, FirstN(d, N / 2)); + // Upper half + testWithMask(d, Not(FirstN(d, N / 2))); + // Interleaved + testWithMask(d, + MaskFromVec(InterleaveLower(Zero(d), Set(d, (TFromD)-1)))); + } +}; + +HWY_NOINLINE void TestAllSetOr() { + ForAllTypes(ForShrinkableVectors()); +} + +struct TestSetOrZero { + template + void testWithMask(D d, MFromD m) { + TFromD a = 1; + auto yes = Set(d, a); + auto no = Zero(d); + auto expected = IfThenElse(m, yes, no); + auto actual = SetOrZero(d, m, a); + HWY_ASSERT_VEC_EQ(d, expected, actual); + } + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + // All False + testWithMask(d, MaskFalse(d)); + auto N = Lanes(d); + // All True + testWithMask(d, FirstN(d, N)); + // Lower half + testWithMask(d, FirstN(d, N / 2)); + // Upper half + testWithMask(d, Not(FirstN(d, N / 2))); + // Interleaved + testWithMask(d, + MaskFromVec(InterleaveLower(Zero(d), Set(d, (TFromD)-1)))); + } +}; + +HWY_NOINLINE void TestAllSetOrZero() { + ForAllTypes(ForShrinkableVectors()); +} + } // namespace // NOLINTNEXTLINE(google-readability-namespace-comments) } // namespace HWY_NAMESPACE @@ -335,6 +397,8 @@ HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllCountTrue); HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllFindFirstTrue); HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllFindLastTrue); HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllLogicalMask); +HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllSetOr); +HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllSetOrZero); HWY_AFTER_TEST(); } // namespace } // namespace hwy diff --git a/hwy/tests/memory_test.cc b/hwy/tests/memory_test.cc index 09c05e9469..5cc9f184e0 100644 --- a/hwy/tests/memory_test.cc +++ b/hwy/tests/memory_test.cc @@ -73,6 +73,18 @@ struct TestLoadStore { HWY_ASSERT_EQ(i + 2, lanes3[i]); } + // Unaligned masked load + const MFromD first_3 = FirstN(d, 3); + const VFromD vu2 = MaskedLoadU(d, first_3, &lanes[1]); + Store(vu2, d, lanes3.get()); + for (size_t i = 0; i < N; ++i) { + if (i < 3) { + HWY_ASSERT_EQ(i + 2, lanes3[i]); + } else { + HWY_ASSERT_EQ(0, lanes3[i]); + } + } + // Unaligned store 
StoreU(lo2, d, &lanes2[N / 2]); size_t i = 0; @@ -565,6 +577,73 @@ HWY_NOINLINE void TestAllStoreN() { ForAllTypesAndSpecial(ForPartialVectors()); } +template +constexpr bool IsSupportedTruncation() { + return (sizeof(To) < sizeof(From) && Rebind().Pow2() >= -3 && + Rebind().Pow2() + 4 >= static_cast(CeilLog2(sizeof(To)))); +} + +struct TestStoreTruncated { + template ()>* = nullptr> + HWY_NOINLINE void testTo(From, To, const D) { + // do nothing + } + + template ()>* = nullptr> + HWY_NOINLINE void testTo(From, To, const D d) { + constexpr uint32_t base = 0xFA578D00; + const Vec src = Iota(d, base & hwy::LimitsMax()); + const Rebind dTo; + const Vec v_expected = + Iota(dTo, base & hwy::LimitsMax()); + const size_t NFrom = Lanes(d); + auto expected = AllocateAligned(NFrom); + StoreN(v_expected, dTo, expected.get(), NFrom); + auto actual = AllocateAligned(NFrom); + StoreTruncated(src, d, actual.get()); + HWY_ASSERT_ARRAY_EQ(expected.get(), actual.get(), NFrom); + } + + template + HWY_NOINLINE void operator()(T from, const D d) { + testTo(from, uint8_t(), d); + testTo(from, uint16_t(), d); + testTo(from, uint32_t(), d); + } +}; + +HWY_NOINLINE void TestAllStoreTruncated() { + ForU163264(ForPartialVectors()); +} + +struct TestLoadHigher { + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + const size_t N = Lanes(d); + const Vec a = Set(d, 1); + + // Generate a generic vector, then extract the pointer to the first entry + AlignedFreeUniquePtr pa = AllocateAligned(N); + std::fill(pa.get(), pa.get() + N, 20.0); + T* pointer = pa.get(); + + const Vec b = Set(d, 20); + const Vec expected_output_lanes = ConcatLowerLower(d, b, a); + + HWY_ASSERT_VEC_EQ(d, expected_output_lanes, LoadHigher(d, a, pointer)); + } + template + HWY_NOINLINE void operator()(T /*unused*/, D d) { + (void)d; + } +}; + +HWY_NOINLINE void TestAllLoadHigher() { + ForAllTypes(ForPartialVectors()); +} + } // namespace // NOLINTNEXTLINE(google-readability-namespace-comments) } // namespace HWY_NAMESPACE @@ -585,6 +664,8 @@ HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllCache); HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadN); HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadNOr); HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStoreN); +HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStoreTruncated); +HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadHigher); HWY_AFTER_TEST(); } // namespace } // namespace hwy From 6b90d904eaa8fc3e7360ed8174b38e9358681472 Mon Sep 17 00:00:00 2001 From: Will Barber Date: Thu, 30 Jan 2025 11:44:13 +0000 Subject: [PATCH 61/64] Fix review comments Rename SetOr* ops for consistency Rename AllOnes/AllZeros to AllBits1/0 Remove MaskedLoadU, this is covered by MaskedLoad Rename LowerHigher to InsertIntoUpper Rework StoreTruncated, rename to TruncateStore Rename macro arg Avoid full-length load in LoadHigher Optimise AllBits1 --- g3doc/quick_reference.md | 26 +++++++++------------- hwy/ops/arm_sve-inl.h | 44 ++++++++++++++------------------------ hwy/ops/generic_ops-inl.h | 45 +++++++++++++++++---------------------- hwy/tests/logical_test.cc | 24 ++++++++++----------- hwy/tests/mask_test.cc | 20 ++++++++--------- hwy/tests/memory_test.cc | 32 +++++++++------------------- 6 files changed, 78 insertions(+), 113 deletions(-) diff --git a/g3doc/quick_reference.md b/g3doc/quick_reference.md index 62f8e9a537..deb4bc6e21 100644 --- a/g3doc/quick_reference.md +++ b/g3doc/quick_reference.md @@ -429,9 +429,9 @@ for comparisons, for example `Lt` instead of `operator<`. 
the result, with `t0` in the least-significant (lowest-indexed) lane of each 128-bit block and `tK` in the most-significant (highest-indexed) lane of each 128-bit block: `{t0, t1, ..., tK}` -* V **SetOr**(V no, M m, T a): returns N-lane vector with lane +* V **MaskedSetOr**(V no, M m, T a): returns N-lane vector with lane `i` equal to `a` if `m[i]` is true else `no[i]`. -* V **SetOrZero**(D d, M m, T a): returns N-lane vector with lane +* V **MaskedSet**(D d, M m, T a): returns N-lane vector with lane `i` equal to `a` if `m[i]` is true else 0. ### Getting/setting lanes @@ -1087,10 +1087,10 @@ Per-lane variable shifts (slow if SSSE3/SSE4, or 16-bit, or Shr i64 on AVX2): ```HighestValue>>()``` is returned in the corresponding result lanes. -* bool **AllOnes**(D, V v): returns whether all bits in `v[i]` +* bool **AllBits1**(D, V v): returns whether all bits in `v[i]` are set. -* bool **AllZeros**(D, V v): returns whether all bits in `v[i]` +* bool **AllBits0**(D, V v): returns whether all bits in `v[i]` are clear. The following operate on individual bits within each lane. Note that the @@ -1577,9 +1577,6 @@ aligned memory at indices which are not a multiple of the vector length): * Vec<D> **LoadU**(D, const T* p): returns `p[i]`. -* Vec<D> **MaskedLoadU**(D, M m, const T* p): returns `p[i]` - where mask is true and returns zero otherwise. - * Vec<D> **LoadDup128**(D, const T* p): returns one 128-bit block loaded from `p` and broadcasted into all 128-bit block\[s\]. This may be faster than broadcasting single values, and is more convenient than @@ -1610,9 +1607,9 @@ aligned memory at indices which are not a multiple of the vector length): lanes from `p` to the first (lowest-index) lanes of the result vector and fills the remaining lanes with `no`. Like LoadN, this does not fault. -* Vec<D> **LoadHigher**(D d, V v, T* p): Loads `Lanes(d)/2` lanes from - `p` into the upper lanes of the result vector and the lower half of `v` into - the lower lanes. +* Vec<D> **InsertIntoUpper**(D d, T* p, V v): Loads `Lanes(d)/2` + lanes from `p` into the upper lanes of the result vector and the lower half + of `v` into the lower lanes. #### Store @@ -1653,12 +1650,9 @@ aligned memory at indices which are not a multiple of the vector length): StoreN does not modify any memory past `p + HWY_MIN(Lanes(d), max_lanes_to_store) - 1`. -* void **StoreTruncated**(Vec<DFrom> v, DFrom d, To* HWY_RESTRICT - p): Truncates elements of `v` to type `To` and stores on `p`. It is - similar to performing `TruncateTo` followed by `StoreN` where - `max_lanes_to_store` is `Lanes(d)`. - - StoreTruncated does not modify any memory past `p + Lanes(d) - 1`. +* void **TruncateStore**(Vec<D> v, D d, T* HWY_RESTRICT p): + Truncates elements of `v` to type `T` and stores on `p`. It is similar to + performing `TruncateTo` followed by `StoreU`. 
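Editor's usage sketch, not part of the patch: a minimal illustration of the renamed `TruncateStore` op described in the bullet above. It assumes the usual `namespace hn = hwy::HWY_NAMESPACE;` alias inside a HWY_NAMESPACE translation unit; the buffer names and values are hypothetical.

```c++
// Editor's sketch (not part of the patch): narrowing u32 lanes to u8 bytes
// with TruncateStore. The destination must hold Lanes(d) narrow elements.
const hn::ScalableTag<uint32_t> d;
const size_t N = hn::Lanes(d);
auto in = hwy::AllocateAligned<uint32_t>(N);
auto out = hwy::AllocateAligned<uint8_t>(N);
for (size_t i = 0; i < N; ++i) in[i] = static_cast<uint32_t>(i + 250);
const auto v = hn::Load(d, in.get());
// Behaves like TruncateTo into a Rebind<uint8_t, decltype(d)> followed by
// StoreU, i.e. out[i] == static_cast<uint8_t>(in[i]).
hn::TruncateStore(v, d, out.get());
```

The generic fallback added in this patch implements exactly that `TruncateTo` + `StoreU` sequence, while the SVE path maps to the truncating `st1b`/`st1h`/`st1w` stores.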
#### Interleaved diff --git a/hwy/ops/arm_sve-inl.h b/hwy/ops/arm_sve-inl.h index e571400b37..3df91f7c85 100644 --- a/hwy/ops/arm_sve-inl.h +++ b/hwy/ops/arm_sve-inl.h @@ -427,26 +427,26 @@ using VFromD = decltype(Set(D(), TFromD())); using VBF16 = VFromD>; -// ------------------------------ SetOr/SetOrZero +// ------------------------------ MaskedSetOr/MaskedSet -#define HWY_SVE_SET_OR(BASE, CHAR, BITS, HALF, NAME, OP) \ - HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) inactive, \ - svbool_t m, HWY_SVE_T(BASE, BITS) op) { \ - return sv##OP##_##CHAR##BITS##_m(inactive, m, op); \ +#define HWY_SVE_MASKED_SET_OR(BASE, CHAR, BITS, HALF, NAME, OP) \ + HWY_API HWY_SVE_V(BASE, BITS) \ + NAME(HWY_SVE_V(BASE, BITS) no, svbool_t m, HWY_SVE_T(BASE, BITS) op) { \ + return sv##OP##_##CHAR##BITS##_m(no, m, op); \ } -HWY_SVE_FOREACH(HWY_SVE_SET_OR, SetOr, dup_n) -#undef HWY_SVE_SET_OR +HWY_SVE_FOREACH(HWY_SVE_MASKED_SET_OR, MaskedSetOr, dup_n) +#undef HWY_SVE_MASKED_SET_OR -#define HWY_SVE_SET_OR_ZERO(BASE, CHAR, BITS, HALF, NAME, OP) \ +#define HWY_SVE_MASKED_SET(BASE, CHAR, BITS, HALF, NAME, OP) \ template \ HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \ svbool_t m, HWY_SVE_T(BASE, BITS) op) { \ return sv##OP##_##CHAR##BITS##_z(m, op); \ } -HWY_SVE_FOREACH(HWY_SVE_SET_OR_ZERO, SetOrZero, dup_n) -#undef HWY_SVE_SET_OR_ZERO +HWY_SVE_FOREACH(HWY_SVE_MASKED_SET, MaskedSet, dup_n) +#undef HWY_SVE_MASKED_SET // ------------------------------ Zero @@ -2232,18 +2232,6 @@ HWY_API void BlendedStore(VFromD v, MFromD m, D d, #undef HWY_SVE_MEM -#define HWY_SVE_MASKED_MEM(BASE, CHAR, BITS, HALF, NAME, OP) \ - template \ - HWY_API HWY_SVE_V(BASE, BITS) \ - MaskedLoadU(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, svbool_t m, \ - const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \ - return svld1_##CHAR##BITS(m, detail::NativeLanePointer(p)); \ - } - -HWY_SVE_FOREACH(HWY_SVE_MASKED_MEM, _, _) - -#undef HWY_SVE_MASKED_MEM - #if HWY_TARGET != HWY_SVE2_128 namespace detail { #define HWY_SVE_LOAD_DUP128(BASE, CHAR, BITS, HALF, NAME, OP) \ @@ -2312,12 +2300,12 @@ HWY_API VFromD LoadDup128(D d, const TFromD* HWY_RESTRICT p) { #define HWY_SVE_STORE_TRUNCATED_WORD(BASE, CHAR, BITS, HALF, NAME, OP) \ HWY_SVE_STORE_TRUNCATED(BASE, CHAR, BITS, HALF, NAME, OP, 32) -HWY_SVE_FOREACH_UI16(HWY_SVE_STORE_TRUNCATED_BYTE, StoreTruncated, st1b) -HWY_SVE_FOREACH_UI32(HWY_SVE_STORE_TRUNCATED_BYTE, StoreTruncated, st1b) -HWY_SVE_FOREACH_UI64(HWY_SVE_STORE_TRUNCATED_BYTE, StoreTruncated, st1b) -HWY_SVE_FOREACH_UI32(HWY_SVE_STORE_TRUNCATED_HALF, StoreTruncated, st1h) -HWY_SVE_FOREACH_UI64(HWY_SVE_STORE_TRUNCATED_HALF, StoreTruncated, st1h) -HWY_SVE_FOREACH_UI64(HWY_SVE_STORE_TRUNCATED_WORD, StoreTruncated, st1w) +HWY_SVE_FOREACH_UI16(HWY_SVE_STORE_TRUNCATED_BYTE, TruncateStore, st1b) +HWY_SVE_FOREACH_UI32(HWY_SVE_STORE_TRUNCATED_BYTE, TruncateStore, st1b) +HWY_SVE_FOREACH_UI64(HWY_SVE_STORE_TRUNCATED_BYTE, TruncateStore, st1b) +HWY_SVE_FOREACH_UI32(HWY_SVE_STORE_TRUNCATED_HALF, TruncateStore, st1h) +HWY_SVE_FOREACH_UI64(HWY_SVE_STORE_TRUNCATED_HALF, TruncateStore, st1h) +HWY_SVE_FOREACH_UI64(HWY_SVE_STORE_TRUNCATED_WORD, TruncateStore, st1w) #undef HWY_SVE_STORE_TRUNCATED diff --git a/hwy/ops/generic_ops-inl.h b/hwy/ops/generic_ops-inl.h index 9837835ec2..061ec6aad2 100644 --- a/hwy/ops/generic_ops-inl.h +++ b/hwy/ops/generic_ops-inl.h @@ -97,18 +97,18 @@ HWY_API Vec Inf(D d) { return BitCast(d, Set(du, max_x2 >> 1)); } -// ------------------------------ SetOr/SetOrZero +// ------------------------------ 
MaskedSetOr/MaskedSet template , typename D = DFromV, typename M = MFromD> -HWY_API V SetOr(V no, M m, T a) { +HWY_API V MaskedSetOr(V no, M m, T a) { D d; return IfThenElse(m, Set(d, a), no); } template , typename M = MFromD, typename T = TFromD> -HWY_API V SetOrZero(D d, M m, T a) { +HWY_API V MaskedSet(D d, M m, T a) { return IfThenElseZero(m, Set(d, a)); } @@ -351,7 +351,7 @@ HWY_API Mask DemoteMaskTo(DTo d_to, DFrom d_from, Mask m) { #endif // HWY_NATIVE_DEMOTE_MASK_TO -// ------------------------------ LoadHigher +// ------------------------------ InsertIntoUpper #if (defined(HWY_NATIVE_LOAD_HIGHER) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_LOAD_HIGHER #undef HWY_NATIVE_LOAD_HIGHER @@ -359,9 +359,10 @@ HWY_API Mask DemoteMaskTo(DTo d_to, DFrom d_from, Mask m) { #define HWY_NATIVE_LOAD_HIGHER #endif template (), HWY_IF_LANES_GT_D(D, 1)> -HWY_API V LoadHigher(D d, V a, T* p) { - const V b = LoadU(d, p); - return ConcatLowerLower(d, b, a); +HWY_API V InsertIntoUpper(D d, T* p, V a) { + Half dh; + const VFromD b = LoadU(dh, p); + return Combine(d, b, LowerHalf(a)); } #endif // HWY_NATIVE_LOAD_HIGHER @@ -1204,12 +1205,6 @@ HWY_API V MulByFloorPow2(V v, V exp) { #endif // HWY_NATIVE_MUL_BY_POW2 -// ------------------------------ MaskedLoadU -template -HWY_API VFromD MaskedLoadU(D d, M m, - const TFromD* HWY_RESTRICT unaligned) { - return IfThenElseZero(m, LoadU(d, unaligned)); -} // ------------------------------ GetExponent #if (defined(HWY_NATIVE_GET_EXPONENT) == defined(HWY_TARGET_TOGGLE)) @@ -2694,7 +2689,7 @@ HWY_API void StoreN(VFromD v, D d, T* HWY_RESTRICT p, #endif // (defined(HWY_NATIVE_STORE_N) == defined(HWY_TARGET_TOGGLE)) -// ------------------------------ StoreTruncated +// ------------------------------ TruncateStore #if (defined(HWY_NATIVE_STORE_TRUNCATED) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_STORE_TRUNCATED #undef HWY_NATIVE_STORE_TRUNCATED @@ -2702,13 +2697,12 @@ HWY_API void StoreN(VFromD v, D d, T* HWY_RESTRICT p, #define HWY_NATIVE_STORE_TRUNCATED #endif -template , - HWY_IF_T_SIZE_GT_D(DFrom, sizeof(To)), - HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(VFromD)> -HWY_API void StoreTruncated(VFromD v, const DFrom d, - To* HWY_RESTRICT p) { +template +HWY_API void TruncateStore(VFromD v, const D /*d*/, T* HWY_RESTRICT p) { + using DTo = Rebind; DTo dsmall; - StoreN(TruncateTo(dsmall, v), dsmall, p, Lanes(d)); + StoreU(TruncateTo(dsmall, v), dsmall, p); } #endif // (defined(HWY_NATIVE_STORE_TRUNCATED) == defined(HWY_TARGET_TOGGLE)) @@ -7511,7 +7505,7 @@ HWY_API V BitShuffle(V v, VI idx) { #endif // HWY_NATIVE_BITSHUFFLE -// ------------------------------ AllOnes/AllZeros +// ------------------------------ AllBits1/AllBits0 #if (defined(HWY_NATIVE_ALLONES) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_ALLONES #undef HWY_NATIVE_ALLONES @@ -7520,9 +7514,10 @@ HWY_API V BitShuffle(V v, VI idx) { #endif template -HWY_API bool AllOnes(V a) { - DFromV d; - return AllTrue(d, Eq(Not(a), Zero(d))); +HWY_API bool AllBits1(V a) { + const RebindToUnsigned> du; + using TU = TFromD; + return AllTrue(du, Eq(BitCast(du, a), Set(du, hwy::HighestValue()))); } #endif // HWY_NATIVE_ALLONES @@ -7534,7 +7529,7 @@ HWY_API bool AllOnes(V a) { #endif template -HWY_API bool AllZeros(V a) { +HWY_API bool AllBits0(V a) { DFromV d; return AllTrue(d, Eq(a, Zero(d))); } diff --git a/hwy/tests/logical_test.cc b/hwy/tests/logical_test.cc index 6036594413..31882ec9eb 100644 --- a/hwy/tests/logical_test.cc +++ b/hwy/tests/logical_test.cc @@ -146,30 +146,30 @@ HWY_NOINLINE void TestAllTestBit() { 
ForIntegerTypes(ForPartialVectors()); } -struct TestAllOnes { +struct TestAllBits { template HWY_NOINLINE void operator()(T /*unused*/, D d) { auto v0s = Zero(d); - HWY_ASSERT(AllZeros(v0s)); + HWY_ASSERT(AllBits0(v0s)); auto v1s = Not(v0s); - HWY_ASSERT(AllOnes(v1s)); + HWY_ASSERT(AllBits1(v1s)); const size_t kNumBits = sizeof(T) * 8; for (size_t i = 0; i < kNumBits; ++i) { const Vec bit1 = Set(d, static_cast(1ull << i)); const Vec bit2 = Set(d, static_cast(1ull << ((i + 1) % kNumBits))); const Vec bits12 = Or(bit1, bit2); - HWY_ASSERT(!AllOnes(bit1)); - HWY_ASSERT(!AllZeros(bit1)); - HWY_ASSERT(!AllOnes(bit2)); - HWY_ASSERT(!AllZeros(bit2)); - HWY_ASSERT(!AllOnes(bits12)); - HWY_ASSERT(!AllZeros(bits12)); + HWY_ASSERT(!AllBits1(bit1)); + HWY_ASSERT(!AllBits0(bit1)); + HWY_ASSERT(!AllBits1(bit2)); + HWY_ASSERT(!AllBits0(bit2)); + HWY_ASSERT(!AllBits1(bits12)); + HWY_ASSERT(!AllBits0(bits12)); } } }; -HWY_NOINLINE void TestAllAllOnes() { - ForIntegerTypes(ForPartialVectors()); +HWY_NOINLINE void TestAllAllBits() { + ForIntegerTypes(ForPartialVectors()); } } // namespace @@ -185,7 +185,7 @@ HWY_BEFORE_TEST(HwyLogicalTest); HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllNot); HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogical); HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllTestBit); -HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllAllOnes); +HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllAllBits); HWY_AFTER_TEST(); } // namespace diff --git a/hwy/tests/mask_test.cc b/hwy/tests/mask_test.cc index af65378084..e73a790c37 100644 --- a/hwy/tests/mask_test.cc +++ b/hwy/tests/mask_test.cc @@ -317,14 +317,14 @@ HWY_NOINLINE void TestAllLogicalMask() { ForAllTypes(ForPartialVectors()); } -struct TestSetOr { +struct TestMaskedSetOr { template void testWithMask(D d, MFromD m) { TFromD a = 1; auto yes = Set(d, a); auto no = Set(d, 2); auto expected = IfThenElse(m, yes, no); - auto actual = SetOr(no, m, a); + auto actual = MaskedSetOr(no, m, a); HWY_ASSERT_VEC_EQ(d, expected, actual); } template @@ -344,18 +344,18 @@ struct TestSetOr { } }; -HWY_NOINLINE void TestAllSetOr() { - ForAllTypes(ForShrinkableVectors()); +HWY_NOINLINE void TestAllMaskedSetOr() { + ForAllTypes(ForShrinkableVectors()); } -struct TestSetOrZero { +struct TestMaskedSet { template void testWithMask(D d, MFromD m) { TFromD a = 1; auto yes = Set(d, a); auto no = Zero(d); auto expected = IfThenElse(m, yes, no); - auto actual = SetOrZero(d, m, a); + auto actual = MaskedSet(d, m, a); HWY_ASSERT_VEC_EQ(d, expected, actual); } template @@ -375,8 +375,8 @@ struct TestSetOrZero { } }; -HWY_NOINLINE void TestAllSetOrZero() { - ForAllTypes(ForShrinkableVectors()); +HWY_NOINLINE void TestAllMaskedSet() { + ForAllTypes(ForShrinkableVectors()); } } // namespace @@ -397,8 +397,8 @@ HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllCountTrue); HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllFindFirstTrue); HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllFindLastTrue); HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllLogicalMask); -HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllSetOr); -HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllSetOrZero); +HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllMaskedSetOr); +HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllMaskedSet); HWY_AFTER_TEST(); } // namespace } // namespace hwy diff --git a/hwy/tests/memory_test.cc b/hwy/tests/memory_test.cc index 5cc9f184e0..6ecf775d52 100644 --- a/hwy/tests/memory_test.cc +++ b/hwy/tests/memory_test.cc @@ -73,18 +73,6 @@ struct TestLoadStore { HWY_ASSERT_EQ(i + 2, lanes3[i]); } - // Unaligned masked load - const MFromD first_3 = 
FirstN(d, 3); - const VFromD vu2 = MaskedLoadU(d, first_3, &lanes[1]); - Store(vu2, d, lanes3.get()); - for (size_t i = 0; i < N; ++i) { - if (i < 3) { - HWY_ASSERT_EQ(i + 2, lanes3[i]); - } else { - HWY_ASSERT_EQ(0, lanes3[i]); - } - } - // Unaligned store StoreU(lo2, d, &lanes2[N / 2]); size_t i = 0; @@ -583,7 +571,7 @@ constexpr bool IsSupportedTruncation() { Rebind().Pow2() + 4 >= static_cast(CeilLog2(sizeof(To)))); } -struct TestStoreTruncated { +struct TestTruncateStore { template ()>* = nullptr> HWY_NOINLINE void testTo(From, To, const D) { @@ -602,7 +590,7 @@ struct TestStoreTruncated { auto expected = AllocateAligned(NFrom); StoreN(v_expected, dTo, expected.get(), NFrom); auto actual = AllocateAligned(NFrom); - StoreTruncated(src, d, actual.get()); + TruncateStore(src, d, actual.get()); HWY_ASSERT_ARRAY_EQ(expected.get(), actual.get(), NFrom); } @@ -614,11 +602,11 @@ struct TestStoreTruncated { } }; -HWY_NOINLINE void TestAllStoreTruncated() { - ForU163264(ForPartialVectors()); +HWY_NOINLINE void TestAllTruncateStore() { + ForU163264(ForPartialVectors()); } -struct TestLoadHigher { +struct TestInsertIntoUpper { template HWY_NOINLINE void operator()(T /*unused*/, D d) { const size_t N = Lanes(d); @@ -632,7 +620,7 @@ struct TestLoadHigher { const Vec b = Set(d, 20); const Vec expected_output_lanes = ConcatLowerLower(d, b, a); - HWY_ASSERT_VEC_EQ(d, expected_output_lanes, LoadHigher(d, a, pointer)); + HWY_ASSERT_VEC_EQ(d, expected_output_lanes, InsertIntoUpper(d, pointer, a)); } template HWY_NOINLINE void operator()(T /*unused*/, D d) { @@ -640,8 +628,8 @@ struct TestLoadHigher { } }; -HWY_NOINLINE void TestAllLoadHigher() { - ForAllTypes(ForPartialVectors()); +HWY_NOINLINE void TestAllInsertIntoUpper() { + ForAllTypes(ForPartialVectors()); } } // namespace @@ -664,8 +652,8 @@ HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllCache); HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadN); HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadNOr); HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStoreN); -HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStoreTruncated); -HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadHigher); +HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllTruncateStore); +HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllInsertIntoUpper); HWY_AFTER_TEST(); } // namespace } // namespace hwy From 4ef10e1a7444b08643c32f9a9032d33be554d7dd Mon Sep 17 00:00:00 2001 From: Will Barber Date: Thu, 30 Jan 2025 15:55:58 +0000 Subject: [PATCH 62/64] Improve handling float_16 in TestInsertIntoUpper --- hwy/tests/memory_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hwy/tests/memory_test.cc b/hwy/tests/memory_test.cc index 6ecf775d52..b64cc5f398 100644 --- a/hwy/tests/memory_test.cc +++ b/hwy/tests/memory_test.cc @@ -611,13 +611,13 @@ struct TestInsertIntoUpper { HWY_NOINLINE void operator()(T /*unused*/, D d) { const size_t N = Lanes(d); const Vec a = Set(d, 1); + const Vec b = Set(d, 20); // Generate a generic vector, then extract the pointer to the first entry AlignedFreeUniquePtr pa = AllocateAligned(N); - std::fill(pa.get(), pa.get() + N, 20.0); + StoreU(b, d, pa.get()); T* pointer = pa.get(); - const Vec b = Set(d, 20); const Vec expected_output_lanes = ConcatLowerLower(d, b, a); HWY_ASSERT_VEC_EQ(d, expected_output_lanes, InsertIntoUpper(d, pointer, a)); From a74a04d34e0aeb2b9a5229f1df471c82385dcdc6 Mon Sep 17 00:00:00 2001 From: Will Barber Date: Thu, 30 Jan 2025 16:52:15 +0000 Subject: [PATCH 63/64] Remove OrZero suffix --- g3doc/quick_reference.md | 2 +- 
hwy/ops/arm_sve-inl.h | 4 ++-- hwy/ops/generic_ops-inl.h | 4 ++-- hwy/tests/count_test.cc | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/g3doc/quick_reference.md b/g3doc/quick_reference.md index deb4bc6e21..1ecbd9051f 100644 --- a/g3doc/quick_reference.md +++ b/g3doc/quick_reference.md @@ -1070,7 +1070,7 @@ Per-lane variable shifts (slow if SSSE3/SSE4, or 16-bit, or Shr i64 on AVX2): ```sizeof(TFromV) * 8``` is returned in the corresponding result lanes. * `V`: `{u,i}` \ - V **MaskedLeadingZeroCountOrZero**(M m, `V a): returns the + V **MaskedLeadingZeroCount**(M m, V a): returns the result of LeadingZeroCount where `m[i]` is true, and zero otherwise. * `V`: `{u,i}` \ diff --git a/hwy/ops/arm_sve-inl.h b/hwy/ops/arm_sve-inl.h index 3df91f7c85..c0f626d94c 100644 --- a/hwy/ops/arm_sve-inl.h +++ b/hwy/ops/arm_sve-inl.h @@ -6506,8 +6506,8 @@ HWY_API V HighestSetBitIndex(V v) { return BitCast(d, sv##OP##_##CHAR##BITS##_z(m, v)); \ } -HWY_SVE_FOREACH_UI(HWY_SVE_MASKED_LEADING_ZERO_COUNT, - MaskedLeadingZeroCountOrZero, clz) +HWY_SVE_FOREACH_UI(HWY_SVE_MASKED_LEADING_ZERO_COUNT, MaskedLeadingZeroCount, + clz) #undef HWY_SVE_LEADING_ZERO_COUNT // ================================================== END MACROS diff --git a/hwy/ops/generic_ops-inl.h b/hwy/ops/generic_ops-inl.h index 061ec6aad2..836f38db77 100644 --- a/hwy/ops/generic_ops-inl.h +++ b/hwy/ops/generic_ops-inl.h @@ -3934,7 +3934,7 @@ HWY_API V TrailingZeroCount(V v) { } #endif // HWY_NATIVE_LEADING_ZERO_COUNT -// ------------------------------ MaskedLeadingZeroCountOrZero +// ------------------------------ MaskedLeadingZeroCount #if (defined(HWY_NATIVE_MASKED_LEADING_ZERO_COUNT) == \ defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_MASKED_LEADING_ZERO_COUNT @@ -3944,7 +3944,7 @@ HWY_API V TrailingZeroCount(V v) { #endif template -HWY_API V MaskedLeadingZeroCountOrZero(M m, V v) { +HWY_API V MaskedLeadingZeroCount(M m, V v) { return IfThenElseZero(m, LeadingZeroCount(v)); } #endif // HWY_NATIVE_MASKED_LEADING_ZERO_COUNT diff --git a/hwy/tests/count_test.cc b/hwy/tests/count_test.cc index 40939d949c..02a5451bc3 100644 --- a/hwy/tests/count_test.cc +++ b/hwy/tests/count_test.cc @@ -154,7 +154,7 @@ struct TestMaskedLeadingZeroCount { } HWY_ASSERT_VEC_EQ( d, lzcnt.get(), - MaskedLeadingZeroCountOrZero(first_3, Set(d, static_cast(2)))); + MaskedLeadingZeroCount(first_3, Set(d, static_cast(2)))); for (size_t j = 0; j < N; j++) { if (j < 3) { @@ -165,7 +165,7 @@ struct TestMaskedLeadingZeroCount { } HWY_ASSERT_VEC_EQ( d, lzcnt.get(), - MaskedLeadingZeroCountOrZero( + MaskedLeadingZeroCount( first_3, BitCast(d, Set(du, TU{1} << (kNumOfBitsInT - 2))))); } }; From 6fe29f9b41a838cf5f927648909bbb35b579df2c Mon Sep 17 00:00:00 2001 From: Will Barber Date: Thu, 30 Jan 2025 16:53:02 +0000 Subject: [PATCH 64/64] Clarify AllBits0/1 --- g3doc/quick_reference.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/g3doc/quick_reference.md b/g3doc/quick_reference.md index 1ecbd9051f..1a17b910b5 100644 --- a/g3doc/quick_reference.md +++ b/g3doc/quick_reference.md @@ -1087,11 +1087,9 @@ Per-lane variable shifts (slow if SSSE3/SSE4, or 16-bit, or Shr i64 on AVX2): ```HighestValue>>()``` is returned in the corresponding result lanes. -* bool **AllBits1**(D, V v): returns whether all bits in `v[i]` - are set. +* bool **AllBits1**(D, V v): returns whether all bits are set. -* bool **AllBits0**(D, V v): returns whether all bits in `v[i]` - are clear. +* bool **AllBits0**(D, V v): returns whether all bits are clear. 
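Editor's usage sketch, not part of the patch: the reworded `AllBits1`/`AllBits0` reductions, following the call form used in `logical_test.cc` (vector argument only); the tag and constants below are hypothetical and assume `namespace hn = hwy::HWY_NAMESPACE;`.

```c++
// Editor's sketch (not part of the patch): AllBits1/AllBits0 reduce over the
// entire vector and return a single bool, unlike the per-lane bit ops below.
const hn::ScalableTag<uint32_t> d;
const auto zeros = hn::Zero(d);                     // every bit clear
const auto ones = hn::Not(zeros);                   // every bit set
const auto one_bit = hn::Set(d, uint32_t{1} << 7);  // a single bit per lane
HWY_ASSERT(hn::AllBits0(zeros));
HWY_ASSERT(hn::AllBits1(ones));
HWY_ASSERT(!hn::AllBits1(one_bit) && !hn::AllBits0(one_bit));
```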
The following operate on individual bits within each lane. Note that the non-operator functions (`And` instead of `&`) must be used for floating-point