|
| 1 | +/// |
| 2 | +/// @file Sieve_count_simd.hpp |
| 3 | +/// @brief Highly optimized code to count the number of 1 bits in |
| 4 | +/// the sieve array using SIMD instructions. |
| 5 | +/// |
| 6 | +/// In-depth description of this algorithm: |
| 7 | +/// https://github.com/kimwalisch/primecount/blob/master/doc/Hard-Special-Leaves.pdf |
| 8 | +/// |
| 9 | +/// Copyright (C) 2026 Kim Walisch, <kim.walisch@gmail.com> |
| 10 | +/// |
| 11 | +/// This file is distributed under the BSD License. See the COPYING |
| 12 | +/// file in the top level directory. |
| 13 | +/// |
| 14 | + |
| 15 | +#ifndef SIEVE_COUNT_SIMD_HPP |
| 16 | +#define SIEVE_COUNT_SIMD_HPP |
| 17 | + |
| 18 | +#include <Sieve.hpp> |
| 19 | +#include <Sieve_arrays.hpp> |
| 20 | +#include <macros.hpp> |
| 21 | +#include <popcnt.hpp> |
| 22 | + |
| 23 | +#include <stdint.h> |
| 24 | + |
| 25 | +#if defined(ENABLE_ARM_SVE) || \ |
| 26 | + defined(ENABLE_MULTIARCH_ARM_SVE) |
| 27 | + #include <arm_sve.h> |
| 28 | +#elif defined(ENABLE_AVX512_VPOPCNT) || \ |
| 29 | + defined(ENABLE_MULTIARCH_AVX512_VPOPCNT) |
| 30 | + #include <immintrin.h> |
| 31 | +#endif |
| 32 | + |
| 33 | +/// POPCNT64 ///////////////////////////////////////////////////////// |
| 34 | + |
/// Count the 1 bits of the sieve array that lie inside [start, stop]
/// using the scalar popcnt64() function.
///
/// This macro expands to a sequence of statements, not an expression.
/// It declares the locals start_idx, stop_idx, m1, m2, sieve64,
/// start_bits, stop_bits and cnt in the enclosing scope; the result
/// is left in `cnt`. It expects ASSERT(), CONDITIONAL_MOVE(),
/// popcnt64(), segment_size(), sieve_, unset_smaller[] and
/// unset_larger[] to be visible at the point of use.
///
/// Both parameters are parenthesized in the expansion so that compound
/// arguments (e.g. `low + 1`) keep their meaning. They are still
/// evaluated more than once and must therefore be free of side effects.
#define SIEVE_COUNT_POPCNT64(start, stop) \
  ASSERT((start) <= (stop)); \
  ASSERT((stop) - (start) < segment_size()); \
  /* Number n is looked up in the 64-bit word at index n / 240. */ \
  uint64_t start_idx = (start) / 240; \
  uint64_t stop_idx = (stop) / 240; \
  /* Masks applied to the first and to the last word of the range. */ \
  uint64_t m1 = unset_smaller[(start) % 240]; \
  uint64_t m2 = unset_larger[(stop) % 240]; \
  \
  /* Branchfree bitmask calculation: */ \
  /* if (start_idx == stop_idx) m1 = m1 & m2; */ \
  /* if (start_idx == stop_idx) m2 = 0; */ \
  /* If both ends share one word, that word is counted once via m1. */ \
  CONDITIONAL_MOVE(start_idx == stop_idx, m1, m1 & m2); \
  CONDITIONAL_MOVE(start_idx == stop_idx, m2, 0); \
  \
  const uint64_t* sieve64 = (const uint64_t*) sieve_.data(); \
  uint64_t start_bits = sieve64[start_idx] & m1; \
  uint64_t stop_bits = sieve64[stop_idx] & m2; \
  uint64_t cnt = popcnt64(start_bits); \
  cnt += popcnt64(stop_bits); \
  \
  /* Words strictly between the first and the last one are unmasked. */ \
  for (uint64_t i = start_idx + 1; i < stop_idx; i++) \
    cnt += popcnt64(sieve64[i]);
| 58 | + |
| 59 | +/// AVX512 /////////////////////////////////////////////////////////// |
| 60 | + |
/// Count the 1 bits of the sieve array that lie inside [start, stop]
/// using AVX512 (64-bit vector popcount).
///
/// This macro expands to a sequence of statements, not an expression.
/// It declares the locals start_idx, stop_idx, m1, m2, sieve64,
/// start_bits, stop_bits, vec, vcnt, i, mask and cnt in the enclosing
/// scope; the result is left in `cnt`. It expects ASSERT(),
/// CONDITIONAL_MOVE(), segment_size(), sieve_, unset_smaller[] and
/// unset_larger[] to be visible at the point of use.
///
/// Both parameters are parenthesized in the expansion so that compound
/// arguments (e.g. `low + 1`) keep their meaning. They are still
/// evaluated more than once and must therefore be free of side effects.
#define SIEVE_COUNT_AVX512(start, stop) \
  ASSERT((start) <= (stop)); \
  ASSERT((stop) - (start) < segment_size()); \
  /* Number n is looked up in the 64-bit word at index n / 240. */ \
  uint64_t start_idx = (start) / 240; \
  uint64_t stop_idx = (stop) / 240; \
  /* Masks applied to the first and to the last word of the range. */ \
  uint64_t m1 = unset_smaller[(start) % 240]; \
  uint64_t m2 = unset_larger[(stop) % 240]; \
  \
  /* Branchfree bitmask calculation: */ \
  /* if (start_idx == stop_idx) m1 = m1 & m2; */ \
  /* if (start_idx == stop_idx) m2 = 0; */ \
  /* If both ends share one word, that word is counted once via m1. */ \
  CONDITIONAL_MOVE(start_idx == stop_idx, m1, m1 & m2); \
  CONDITIONAL_MOVE(start_idx == stop_idx, m2, 0); \
  \
  const uint64_t* sieve64 = (const uint64_t*) sieve_.data(); \
  uint64_t start_bits = sieve64[start_idx] & m1; \
  uint64_t stop_bits = sieve64[stop_idx] & m2; \
  /* Seed the accumulator with the two masked boundary words. */ \
  __m512i vec = _mm512_set_epi64(0, 0, 0, 0, 0, 0, stop_bits, start_bits); \
  __m512i vcnt = _mm512_popcnt_epi64(vec); \
  uint64_t i = start_idx + 1; \
  \
  /* Compute this for loop using AVX512. */ \
  /* for (i = start_idx + 1; i < stop_idx; i++) */ \
  /*   cnt += popcnt64(sieve64[i]); */ \
  for (; i + 8 < stop_idx; i += 8) \
  { \
    vec = _mm512_loadu_epi64(&sieve64[i]); \
    vec = _mm512_popcnt_epi64(vec); \
    vcnt = _mm512_add_epi64(vcnt, vec); \
  } \
  /* Tail: here i + 8 >= stop_idx and i <= stop_idx + 1, hence the */ \
  /* shift amount is in [0, 9]. The mask keeps the lowest */ \
  /* (stop_idx - i) lanes and is 0 if no interior word remains; */ \
  /* lanes whose mask bit is 0 are zeroed by the maskz load. */ \
  __mmask8 mask = (__mmask8) (0xff >> (i + 8 - stop_idx)); \
  vec = _mm512_maskz_loadu_epi64(mask, &sieve64[i]); \
  vec = _mm512_popcnt_epi64(vec); \
  vcnt = _mm512_add_epi64(vcnt, vec); \
  uint64_t cnt = _mm512_reduce_add_epi64(vcnt);
| 97 | + |
| 98 | +/// ARM SVE ////////////////////////////////////////////////////////// |
| 99 | + |
/// Count the 1 bits of the sieve array that lie inside [start, stop]
/// using ARM SVE (vector length agnostic).
///
/// This macro expands to a sequence of statements, not an expression.
/// It declares the locals start_idx, stop_idx, m1, m2, sieve64,
/// start_bits, stop_bits, vec, vcnt, i, pg and cnt in the enclosing
/// scope; the result is left in `cnt`. It expects ASSERT(),
/// CONDITIONAL_MOVE(), segment_size(), sieve_, unset_smaller[] and
/// unset_larger[] to be visible at the point of use.
///
/// Both parameters are parenthesized in the expansion so that compound
/// arguments (e.g. `low + 1`) keep their meaning. They are still
/// evaluated more than once and must therefore be free of side effects.
#define SIEVE_COUNT_ARM_SVE(start, stop) \
  ASSERT((start) <= (stop)); \
  ASSERT((stop) - (start) < segment_size()); \
  /* Number n is looked up in the 64-bit word at index n / 240. */ \
  uint64_t start_idx = (start) / 240; \
  uint64_t stop_idx = (stop) / 240; \
  /* Masks applied to the first and to the last word of the range. */ \
  uint64_t m1 = unset_smaller[(start) % 240]; \
  uint64_t m2 = unset_larger[(stop) % 240]; \
  \
  /* Branchfree bitmask calculation: */ \
  /* if (start_idx == stop_idx) m1 = m1 & m2; */ \
  /* if (start_idx == stop_idx) m2 = 0; */ \
  /* If both ends share one word, that word is counted once via m1. */ \
  CONDITIONAL_MOVE(start_idx == stop_idx, m1, m1 & m2); \
  CONDITIONAL_MOVE(start_idx == stop_idx, m2, 0); \
  \
  const uint64_t* sieve64 = (const uint64_t*) sieve_.data(); \
  uint64_t start_bits = sieve64[start_idx] & m1; \
  uint64_t stop_bits = sieve64[stop_idx] & m2; \
  /* Two 64-bit lanes are needed to hold both boundary words. */ \
  ASSERT(svcntd() >= 2); \
  /* svinsr shifts the broadcast vector up by one lane and inserts */ \
  /* stop_bits into lane 0, so lane 0 = stop_bits, lane 1 = start_bits. */ \
  svuint64_t vec = svinsr_n_u64(svdup_u64(start_bits), stop_bits); \
  /* Popcount lanes 0 and 1 only, all other lanes are zeroed. */ \
  svuint64_t vcnt = svcnt_u64_z(svwhilelt_b64(0, 2), vec); \
  uint64_t i = start_idx + 1; \
  \
  /* Compute this for loop using ARM SVE. */ \
  /* for (i = start_idx + 1; i < stop_idx; i++) */ \
  /*   cnt += popcnt64(sieve64[i]); */ \
  for (; i + svcntd() < stop_idx; i += svcntd()) \
  { \
    vec = svld1_u64(svptrue_b64(), &sieve64[i]); \
    vec = svcnt_u64_x(svptrue_b64(), vec); \
    vcnt = svadd_u64_x(svptrue_b64(), vcnt, vec); \
  } \
  /* Tail: pg enables only the lanes k with i + k < stop_idx (none */ \
  /* if i >= stop_idx). Inactive lanes contribute 0 to the sum. */ \
  svbool_t pg = svwhilelt_b64(i, stop_idx); \
  vec = svld1_u64(pg, &sieve64[i]); \
  vec = svcnt_u64_z(pg, vec); \
  vcnt = svadd_u64_x(svptrue_b64(), vcnt, vec); \
  uint64_t cnt = svaddv_u64(svptrue_b64(), vcnt);
| 137 | + |
| 138 | +#endif |
0 commit comments