Skip to content

Commit 1aa0d36

Browse files
committed
Refactor SIMD code
1 parent 01dfba5 commit 1aa0d36

3 files changed

Lines changed: 154 additions & 204 deletions

File tree

src/Sieve_count_simd.hpp

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
///
2+
/// @file Sieve_count_simd.hpp
3+
/// @brief Highly optimized code to count the number of 1 bits in
4+
/// the sieve array using SIMD instructions.
5+
///
6+
/// In-depth description of this algorithm:
7+
/// https://github.com/kimwalisch/primecount/blob/master/doc/Hard-Special-Leaves.pdf
8+
///
9+
/// Copyright (C) 2026 Kim Walisch, <kim.walisch@gmail.com>
10+
///
11+
/// This file is distributed under the BSD License. See the COPYING
12+
/// file in the top level directory.
13+
///
14+
15+
#ifndef SIEVE_COUNT_SIMD_HPP
16+
#define SIEVE_COUNT_SIMD_HPP
17+
18+
#include <Sieve.hpp>
19+
#include <Sieve_arrays.hpp>
20+
#include <macros.hpp>
21+
#include <popcnt.hpp>
22+
23+
#include <stdint.h>
24+
25+
#if defined(ENABLE_ARM_SVE) || \
26+
defined(ENABLE_MULTIARCH_ARM_SVE)
27+
#include <arm_sve.h>
28+
#elif defined(ENABLE_AVX512_VPOPCNT) || \
29+
defined(ENABLE_MULTIARCH_AVX512_VPOPCNT)
30+
#include <immintrin.h>
31+
#endif
32+
33+
/// POPCNT64 /////////////////////////////////////////////////////////
34+
35+
/// Count 1 bits inside [start, stop] using POPCNT64
36+
#define SIEVE_COUNT_POPCNT64(start, stop) \
37+
ASSERT(start <= stop); \
38+
ASSERT(stop - start < segment_size()); \
39+
uint64_t start_idx = start / 240; \
40+
uint64_t stop_idx = stop / 240; \
41+
uint64_t m1 = unset_smaller[start % 240]; \
42+
uint64_t m2 = unset_larger[stop % 240]; \
43+
\
44+
/* Branchfree bitmask calculation: */ \
45+
/* if (start_idx == stop_idx) m1 = m1 & m2; */ \
46+
/* if (start_idx == stop_idx) m2 = 0; */ \
47+
CONDITIONAL_MOVE(start_idx == stop_idx, m1, m1 & m2); \
48+
CONDITIONAL_MOVE(start_idx == stop_idx, m2, 0); \
49+
\
50+
const uint64_t* sieve64 = (const uint64_t*) sieve_.data(); \
51+
uint64_t start_bits = sieve64[start_idx] & m1; \
52+
uint64_t stop_bits = sieve64[stop_idx] & m2; \
53+
uint64_t cnt = popcnt64(start_bits); \
54+
cnt += popcnt64(stop_bits); \
55+
\
56+
for (uint64_t i = start_idx + 1; i < stop_idx; i++) \
57+
cnt += popcnt64(sieve64[i]);
58+
59+
/// AVX512 ///////////////////////////////////////////////////////////
60+
61+
/// Count 1 bits inside [start, stop] using AVX512
62+
#define SIEVE_COUNT_AVX512(start, stop) \
63+
ASSERT(start <= stop); \
64+
ASSERT(stop - start < segment_size()); \
65+
uint64_t start_idx = start / 240; \
66+
uint64_t stop_idx = stop / 240; \
67+
uint64_t m1 = unset_smaller[start % 240]; \
68+
uint64_t m2 = unset_larger[stop % 240]; \
69+
\
70+
/* Branchfree bitmask calculation: */ \
71+
/* if (start_idx == stop_idx) m1 = m1 & m2; */ \
72+
/* if (start_idx == stop_idx) m2 = 0; */ \
73+
CONDITIONAL_MOVE(start_idx == stop_idx, m1, m1 & m2); \
74+
CONDITIONAL_MOVE(start_idx == stop_idx, m2, 0); \
75+
\
76+
const uint64_t* sieve64 = (const uint64_t*) sieve_.data(); \
77+
uint64_t start_bits = sieve64[start_idx] & m1; \
78+
uint64_t stop_bits = sieve64[stop_idx] & m2; \
79+
__m512i vec = _mm512_set_epi64(0, 0, 0, 0, 0, 0, stop_bits, start_bits); \
80+
__m512i vcnt = _mm512_popcnt_epi64(vec); \
81+
uint64_t i = start_idx + 1; \
82+
\
83+
/* Compute this for loop using AVX512. */ \
84+
/* for (i = start_idx + 1; i < stop_idx; i++) */ \
85+
/* cnt += popcnt64(sieve64[i]); */ \
86+
for (; i + 8 < stop_idx; i += 8) \
87+
{ \
88+
vec = _mm512_loadu_epi64(&sieve64[i]); \
89+
vec = _mm512_popcnt_epi64(vec); \
90+
vcnt = _mm512_add_epi64(vcnt, vec); \
91+
} \
92+
__mmask8 mask = (__mmask8) (0xff >> (i + 8 - stop_idx)); \
93+
vec = _mm512_maskz_loadu_epi64(mask, &sieve64[i]); \
94+
vec = _mm512_popcnt_epi64(vec); \
95+
vcnt = _mm512_add_epi64(vcnt, vec); \
96+
uint64_t cnt = _mm512_reduce_add_epi64(vcnt);
97+
98+
/// ARM SVE //////////////////////////////////////////////////////////
99+
100+
/// Count 1 bits inside [start, stop] using ARM SVE
101+
#define SIEVE_COUNT_ARM_SVE(start, stop) \
102+
ASSERT(start <= stop); \
103+
ASSERT(stop - start < segment_size()); \
104+
uint64_t start_idx = start / 240; \
105+
uint64_t stop_idx = stop / 240; \
106+
uint64_t m1 = unset_smaller[start % 240]; \
107+
uint64_t m2 = unset_larger[stop % 240]; \
108+
\
109+
/* Branchfree bitmask calculation: */ \
110+
/* if (start_idx == stop_idx) m1 = m1 & m2; */ \
111+
/* if (start_idx == stop_idx) m2 = 0; */ \
112+
CONDITIONAL_MOVE(start_idx == stop_idx, m1, m1 & m2); \
113+
CONDITIONAL_MOVE(start_idx == stop_idx, m2, 0); \
114+
\
115+
const uint64_t* sieve64 = (const uint64_t*) sieve_.data(); \
116+
uint64_t start_bits = sieve64[start_idx] & m1; \
117+
uint64_t stop_bits = sieve64[stop_idx] & m2; \
118+
ASSERT(svcntd() >= 2); \
119+
svuint64_t vec = svinsr_n_u64(svdup_u64(start_bits), stop_bits); \
120+
svuint64_t vcnt = svcnt_u64_z(svwhilelt_b64(0, 2), vec); \
121+
uint64_t i = start_idx + 1; \
122+
\
123+
/* Compute this for loop using ARM SVE. */ \
124+
/* for (i = start_idx + 1; i < stop_idx; i++) */ \
125+
/* cnt += popcnt64(sieve64[i]); */ \
126+
for (; i + svcntd() < stop_idx; i += svcntd()) \
127+
{ \
128+
vec = svld1_u64(svptrue_b64(), &sieve64[i]); \
129+
vec = svcnt_u64_x(svptrue_b64(), vec); \
130+
vcnt = svadd_u64_x(svptrue_b64(), vcnt, vec); \
131+
} \
132+
svbool_t pg = svwhilelt_b64(i, stop_idx); \
133+
vec = svld1_u64(pg, &sieve64[i]); \
134+
vec = svcnt_u64_z(pg, vec); \
135+
vcnt = svadd_u64_x(svptrue_b64(), vcnt, vec); \
136+
uint64_t cnt = svaddv_u64(svptrue_b64(), vcnt);
137+
138+
#endif

src/Sieve_count_start_stop.hpp

Lines changed: 8 additions & 100 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
/// In-depth description of this algorithm:
1717
/// https://github.com/kimwalisch/primecount/blob/master/doc/Hard-Special-Leaves.pdf
1818
///
19-
/// Copyright (C) 2025 Kim Walisch, <kim.walisch@gmail.com>
19+
/// Copyright (C) 2026 Kim Walisch, <kim.walisch@gmail.com>
2020
///
2121
/// This file is distributed under the BSD License. See the COPYING
2222
/// file in the top level directory.
@@ -26,21 +26,16 @@
2626
#define SIEVE_COUNT_START_STOP_HPP
2727

2828
#include <Sieve.hpp>
29+
#include <Sieve_count_simd.hpp>
2930
#include <cpu_arch_macros.hpp>
3031
#include <macros.hpp>
3132
#include <popcnt.hpp>
3233

3334
#include <stdint.h>
3435

35-
#if defined(ENABLE_ARM_SVE)
36-
#include <arm_sve.h>
37-
#elif defined(ENABLE_AVX512_VPOPCNT)
38-
#include <immintrin.h>
39-
#elif defined(ENABLE_MULTIARCH_ARM_SVE)
40-
#include <arm_sve.h>
36+
#if defined(ENABLE_MULTIARCH_ARM_SVE)
4137
#include <cpu_supports_arm_sve.hpp>
4238
#elif defined(ENABLE_MULTIARCH_AVX512_VPOPCNT)
43-
#include <immintrin.h>
4439
#include <cpu_supports_avx512_vpopcnt.hpp>
4540
#endif
4641

@@ -124,27 +119,7 @@ uint64_t Sieve::count_popcnt64(uint64_t start, uint64_t stop) const
124119
if (start > stop)
125120
return 0;
126121

127-
ASSERT(stop - start < segment_size());
128-
uint64_t start_idx = start / 240;
129-
uint64_t stop_idx = stop / 240;
130-
uint64_t m1 = unset_smaller[start % 240];
131-
uint64_t m2 = unset_larger[stop % 240];
132-
133-
// Branchfree bitmask calculation:
134-
// if (start_idx == stop_idx) m1 = m1 & m2;
135-
// if (start_idx == stop_idx) m2 = 0;
136-
CONDITIONAL_MOVE(start_idx == stop_idx, m1, m1 & m2);
137-
CONDITIONAL_MOVE(start_idx == stop_idx, m2, 0);
138-
139-
const uint64_t* sieve64 = (const uint64_t*) sieve_.data();
140-
uint64_t start_bits = sieve64[start_idx] & m1;
141-
uint64_t stop_bits = sieve64[stop_idx] & m2;
142-
uint64_t cnt = popcnt64(start_bits);
143-
cnt += popcnt64(stop_bits);
144-
145-
for (uint64_t i = start_idx + 1; i < stop_idx; i++)
146-
cnt += popcnt64(sieve64[i]);
147-
122+
SIEVE_COUNT_POPCNT64(start, stop);
148123
return cnt;
149124
}
150125

@@ -166,41 +141,8 @@ uint64_t Sieve::count_avx512(uint64_t start, uint64_t stop) const
166141
if (start > stop)
167142
return 0;
168143

169-
ASSERT(stop - start < segment_size());
170-
uint64_t start_idx = start / 240;
171-
uint64_t stop_idx = stop / 240;
172-
uint64_t m1 = unset_smaller[start % 240];
173-
uint64_t m2 = unset_larger[stop % 240];
174-
175-
// Branchfree bitmask calculation:
176-
// if (start_idx == stop_idx) m1 = m1 & m2;
177-
// if (start_idx == stop_idx) m2 = 0;
178-
CONDITIONAL_MOVE(start_idx == stop_idx, m1, m1 & m2);
179-
CONDITIONAL_MOVE(start_idx == stop_idx, m2, 0);
180-
181-
const uint64_t* sieve64 = (const uint64_t*) sieve_.data();
182-
uint64_t start_bits = sieve64[start_idx] & m1;
183-
uint64_t stop_bits = sieve64[stop_idx] & m2;
184-
__m512i vec = _mm512_set_epi64(0, 0, 0, 0, 0, 0, stop_bits, start_bits);
185-
__m512i vcnt = _mm512_popcnt_epi64(vec);
186-
uint64_t i = start_idx + 1;
187-
188-
// Compute this for loop using AVX512.
189-
// for (i = start_idx + 1; i < stop_idx; i++)
190-
// cnt += popcnt64(sieve64[i]);
191-
192-
for (; i + 8 < stop_idx; i += 8)
193-
{
194-
vec = _mm512_loadu_epi64(&sieve64[i]);
195-
vec = _mm512_popcnt_epi64(vec);
196-
vcnt = _mm512_add_epi64(vcnt, vec);
197-
}
198-
199-
__mmask8 mask = (__mmask8) (0xff >> (i + 8 - stop_idx));
200-
vec = _mm512_maskz_loadu_epi64(mask, &sieve64[i]);
201-
vec = _mm512_popcnt_epi64(vec);
202-
vcnt = _mm512_add_epi64(vcnt, vec);
203-
return _mm512_reduce_add_epi64(vcnt);
144+
SIEVE_COUNT_AVX512(start, stop);
145+
return cnt;
204146
}
205147

206148
#elif defined(ENABLE_ARM_SVE) || \
@@ -219,42 +161,8 @@ uint64_t Sieve::count_arm_sve(uint64_t start, uint64_t stop) const
219161
if (start > stop)
220162
return 0;
221163

222-
ASSERT(stop - start < segment_size());
223-
uint64_t start_idx = start / 240;
224-
uint64_t stop_idx = stop / 240;
225-
uint64_t m1 = unset_smaller[start % 240];
226-
uint64_t m2 = unset_larger[stop % 240];
227-
228-
// Branchfree bitmask calculation:
229-
// if (start_idx == stop_idx) m1 = m1 & m2;
230-
// if (start_idx == stop_idx) m2 = 0;
231-
CONDITIONAL_MOVE(start_idx == stop_idx, m1, m1 & m2);
232-
CONDITIONAL_MOVE(start_idx == stop_idx, m2, 0);
233-
234-
const uint64_t* sieve64 = (const uint64_t*) sieve_.data();
235-
uint64_t start_bits = sieve64[start_idx] & m1;
236-
uint64_t stop_bits = sieve64[stop_idx] & m2;
237-
ASSERT(svcntd() >= 2);
238-
svuint64_t vec = svinsr_n_u64(svdup_u64(start_bits), stop_bits);
239-
svuint64_t vcnt = svcnt_u64_z(svwhilelt_b64(0, 2), vec);
240-
uint64_t i = start_idx + 1;
241-
242-
// Compute this for loop using ARM SVE.
243-
// for (i = start_idx + 1; i < stop_idx; i++)
244-
// cnt += popcnt64(sieve64[i]);
245-
246-
for (; i + svcntd() < stop_idx; i += svcntd())
247-
{
248-
vec = svld1_u64(svptrue_b64(), &sieve64[i]);
249-
vec = svcnt_u64_x(svptrue_b64(), vec);
250-
vcnt = svadd_u64_x(svptrue_b64(), vcnt, vec);
251-
}
252-
253-
svbool_t pg = svwhilelt_b64(i, stop_idx);
254-
vec = svld1_u64(pg, &sieve64[i]);
255-
vec = svcnt_u64_z(pg, vec);
256-
vcnt = svadd_u64_x(svptrue_b64(), vcnt, vec);
257-
return svaddv_u64(svptrue_b64(), vcnt);
164+
SIEVE_COUNT_ARM_SVE(start, stop);
165+
return cnt;
258166
}
259167

260168
#endif

0 commit comments

Comments
 (0)