Skip to content

Commit 7deb986

Browse files
authored
Refactor SIMD code (#102)
1 parent 01dfba5 commit 7deb986

3 files changed

Lines changed: 152 additions & 204 deletions

File tree

src/Sieve_count_simd.hpp

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
///
2+
/// @file Sieve_count_simd.hpp
3+
/// @brief Highly optimized code to count the number of 1 bits in
4+
/// the sieve array using SIMD instructions.
5+
///
6+
/// In-depth description of this algorithm:
7+
/// https://github.com/kimwalisch/primecount/blob/master/doc/Hard-Special-Leaves.pdf
8+
///
9+
/// Copyright (C) 2026 Kim Walisch, <kim.walisch@gmail.com>
10+
///
11+
/// This file is distributed under the BSD License. See the COPYING
12+
/// file in the top level directory.
13+
///
14+
15+
#ifndef SIEVE_COUNT_SIMD_HPP
16+
#define SIEVE_COUNT_SIMD_HPP
17+
18+
#include <macros.hpp>
19+
#include <popcnt.hpp>
20+
21+
#include <stdint.h>
22+
23+
#if defined(ENABLE_ARM_SVE) || \
24+
defined(ENABLE_MULTIARCH_ARM_SVE)
25+
#include <arm_sve.h>
26+
#elif defined(ENABLE_AVX512_VPOPCNT) || \
27+
defined(ENABLE_MULTIARCH_AVX512_VPOPCNT)
28+
#include <immintrin.h>
29+
#endif
30+
31+
/// POPCNT64 /////////////////////////////////////////////////////////
32+
33+
/// Count 1 bits inside [start, stop] using POPCNT64
34+
#define SIEVE_COUNT_POPCNT64(start, stop) \
35+
ASSERT(start <= stop); \
36+
ASSERT(stop - start < segment_size()); \
37+
uint64_t start_idx = start / 240; \
38+
uint64_t stop_idx = stop / 240; \
39+
uint64_t m1 = unset_smaller[start % 240]; \
40+
uint64_t m2 = unset_larger[stop % 240]; \
41+
\
42+
/* Branchfree bitmask calculation: */ \
43+
/* if (start_idx == stop_idx) m1 = m1 & m2; */ \
44+
/* if (start_idx == stop_idx) m2 = 0; */ \
45+
CONDITIONAL_MOVE(start_idx == stop_idx, m1, m1 & m2); \
46+
CONDITIONAL_MOVE(start_idx == stop_idx, m2, 0); \
47+
\
48+
const uint64_t* sieve64 = (const uint64_t*) sieve_.data(); \
49+
uint64_t start_bits = sieve64[start_idx] & m1; \
50+
uint64_t stop_bits = sieve64[stop_idx] & m2; \
51+
uint64_t cnt = popcnt64(start_bits); \
52+
cnt += popcnt64(stop_bits); \
53+
\
54+
for (uint64_t i = start_idx + 1; i < stop_idx; i++) \
55+
cnt += popcnt64(sieve64[i]);
56+
57+
/// AVX512 ///////////////////////////////////////////////////////////
58+
59+
/// Count 1 bits inside [start, stop] using AVX512
60+
#define SIEVE_COUNT_AVX512(start, stop) \
61+
ASSERT(start <= stop); \
62+
ASSERT(stop - start < segment_size()); \
63+
uint64_t start_idx = start / 240; \
64+
uint64_t stop_idx = stop / 240; \
65+
uint64_t m1 = unset_smaller[start % 240]; \
66+
uint64_t m2 = unset_larger[stop % 240]; \
67+
\
68+
/* Branchfree bitmask calculation: */ \
69+
/* if (start_idx == stop_idx) m1 = m1 & m2; */ \
70+
/* if (start_idx == stop_idx) m2 = 0; */ \
71+
CONDITIONAL_MOVE(start_idx == stop_idx, m1, m1 & m2); \
72+
CONDITIONAL_MOVE(start_idx == stop_idx, m2, 0); \
73+
\
74+
const uint64_t* sieve64 = (const uint64_t*) sieve_.data(); \
75+
uint64_t start_bits = sieve64[start_idx] & m1; \
76+
uint64_t stop_bits = sieve64[stop_idx] & m2; \
77+
__m512i vec = _mm512_set_epi64(0, 0, 0, 0, 0, 0, stop_bits, start_bits); \
78+
__m512i vcnt = _mm512_popcnt_epi64(vec); \
79+
uint64_t i = start_idx + 1; \
80+
\
81+
/* Compute this for loop using AVX512. */ \
82+
/* for (i = start_idx + 1; i < stop_idx; i++) */ \
83+
/* cnt += popcnt64(sieve64[i]); */ \
84+
for (; i + 8 < stop_idx; i += 8) \
85+
{ \
86+
vec = _mm512_loadu_epi64(&sieve64[i]); \
87+
vec = _mm512_popcnt_epi64(vec); \
88+
vcnt = _mm512_add_epi64(vcnt, vec); \
89+
} \
90+
__mmask8 mask = (__mmask8) (0xff >> (i + 8 - stop_idx)); \
91+
vec = _mm512_maskz_loadu_epi64(mask, &sieve64[i]); \
92+
vec = _mm512_popcnt_epi64(vec); \
93+
vcnt = _mm512_add_epi64(vcnt, vec); \
94+
uint64_t cnt = _mm512_reduce_add_epi64(vcnt);
95+
96+
/// ARM SVE //////////////////////////////////////////////////////////
97+
98+
/// Count 1 bits inside [start, stop] using ARM SVE
99+
#define SIEVE_COUNT_ARM_SVE(start, stop) \
100+
ASSERT(start <= stop); \
101+
ASSERT(stop - start < segment_size()); \
102+
uint64_t start_idx = start / 240; \
103+
uint64_t stop_idx = stop / 240; \
104+
uint64_t m1 = unset_smaller[start % 240]; \
105+
uint64_t m2 = unset_larger[stop % 240]; \
106+
\
107+
/* Branchfree bitmask calculation: */ \
108+
/* if (start_idx == stop_idx) m1 = m1 & m2; */ \
109+
/* if (start_idx == stop_idx) m2 = 0; */ \
110+
CONDITIONAL_MOVE(start_idx == stop_idx, m1, m1 & m2); \
111+
CONDITIONAL_MOVE(start_idx == stop_idx, m2, 0); \
112+
\
113+
const uint64_t* sieve64 = (const uint64_t*) sieve_.data(); \
114+
uint64_t start_bits = sieve64[start_idx] & m1; \
115+
uint64_t stop_bits = sieve64[stop_idx] & m2; \
116+
ASSERT(svcntd() >= 2); \
117+
svuint64_t vec = svinsr_n_u64(svdup_u64(start_bits), stop_bits); \
118+
svuint64_t vcnt = svcnt_u64_z(svwhilelt_b64(0, 2), vec); \
119+
uint64_t i = start_idx + 1; \
120+
\
121+
/* Compute this for loop using ARM SVE. */ \
122+
/* for (i = start_idx + 1; i < stop_idx; i++) */ \
123+
/* cnt += popcnt64(sieve64[i]); */ \
124+
for (; i + svcntd() < stop_idx; i += svcntd()) \
125+
{ \
126+
vec = svld1_u64(svptrue_b64(), &sieve64[i]); \
127+
vec = svcnt_u64_x(svptrue_b64(), vec); \
128+
vcnt = svadd_u64_x(svptrue_b64(), vcnt, vec); \
129+
} \
130+
svbool_t pg = svwhilelt_b64(i, stop_idx); \
131+
vec = svld1_u64(pg, &sieve64[i]); \
132+
vec = svcnt_u64_z(pg, vec); \
133+
vcnt = svadd_u64_x(svptrue_b64(), vcnt, vec); \
134+
uint64_t cnt = svaddv_u64(svptrue_b64(), vcnt);
135+
136+
#endif

src/Sieve_count_start_stop.hpp

Lines changed: 8 additions & 100 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
/// In-depth description of this algorithm:
1717
/// https://github.com/kimwalisch/primecount/blob/master/doc/Hard-Special-Leaves.pdf
1818
///
19-
/// Copyright (C) 2025 Kim Walisch, <kim.walisch@gmail.com>
19+
/// Copyright (C) 2026 Kim Walisch, <kim.walisch@gmail.com>
2020
///
2121
/// This file is distributed under the BSD License. See the COPYING
2222
/// file in the top level directory.
@@ -26,21 +26,16 @@
2626
#define SIEVE_COUNT_START_STOP_HPP
2727

2828
#include <Sieve.hpp>
29+
#include <Sieve_count_simd.hpp>
2930
#include <cpu_arch_macros.hpp>
3031
#include <macros.hpp>
3132
#include <popcnt.hpp>
3233

3334
#include <stdint.h>
3435

35-
#if defined(ENABLE_ARM_SVE)
36-
#include <arm_sve.h>
37-
#elif defined(ENABLE_AVX512_VPOPCNT)
38-
#include <immintrin.h>
39-
#elif defined(ENABLE_MULTIARCH_ARM_SVE)
40-
#include <arm_sve.h>
36+
#if defined(ENABLE_MULTIARCH_ARM_SVE)
4137
#include <cpu_supports_arm_sve.hpp>
4238
#elif defined(ENABLE_MULTIARCH_AVX512_VPOPCNT)
43-
#include <immintrin.h>
4439
#include <cpu_supports_avx512_vpopcnt.hpp>
4540
#endif
4641

@@ -124,27 +119,7 @@ uint64_t Sieve::count_popcnt64(uint64_t start, uint64_t stop) const
124119
if (start > stop)
125120
return 0;
126121

127-
ASSERT(stop - start < segment_size());
128-
uint64_t start_idx = start / 240;
129-
uint64_t stop_idx = stop / 240;
130-
uint64_t m1 = unset_smaller[start % 240];
131-
uint64_t m2 = unset_larger[stop % 240];
132-
133-
// Branchfree bitmask calculation:
134-
// if (start_idx == stop_idx) m1 = m1 & m2;
135-
// if (start_idx == stop_idx) m2 = 0;
136-
CONDITIONAL_MOVE(start_idx == stop_idx, m1, m1 & m2);
137-
CONDITIONAL_MOVE(start_idx == stop_idx, m2, 0);
138-
139-
const uint64_t* sieve64 = (const uint64_t*) sieve_.data();
140-
uint64_t start_bits = sieve64[start_idx] & m1;
141-
uint64_t stop_bits = sieve64[stop_idx] & m2;
142-
uint64_t cnt = popcnt64(start_bits);
143-
cnt += popcnt64(stop_bits);
144-
145-
for (uint64_t i = start_idx + 1; i < stop_idx; i++)
146-
cnt += popcnt64(sieve64[i]);
147-
122+
SIEVE_COUNT_POPCNT64(start, stop);
148123
return cnt;
149124
}
150125

@@ -166,41 +141,8 @@ uint64_t Sieve::count_avx512(uint64_t start, uint64_t stop) const
166141
if (start > stop)
167142
return 0;
168143

169-
ASSERT(stop - start < segment_size());
170-
uint64_t start_idx = start / 240;
171-
uint64_t stop_idx = stop / 240;
172-
uint64_t m1 = unset_smaller[start % 240];
173-
uint64_t m2 = unset_larger[stop % 240];
174-
175-
// Branchfree bitmask calculation:
176-
// if (start_idx == stop_idx) m1 = m1 & m2;
177-
// if (start_idx == stop_idx) m2 = 0;
178-
CONDITIONAL_MOVE(start_idx == stop_idx, m1, m1 & m2);
179-
CONDITIONAL_MOVE(start_idx == stop_idx, m2, 0);
180-
181-
const uint64_t* sieve64 = (const uint64_t*) sieve_.data();
182-
uint64_t start_bits = sieve64[start_idx] & m1;
183-
uint64_t stop_bits = sieve64[stop_idx] & m2;
184-
__m512i vec = _mm512_set_epi64(0, 0, 0, 0, 0, 0, stop_bits, start_bits);
185-
__m512i vcnt = _mm512_popcnt_epi64(vec);
186-
uint64_t i = start_idx + 1;
187-
188-
// Compute this for loop using AVX512.
189-
// for (i = start_idx + 1; i < stop_idx; i++)
190-
// cnt += popcnt64(sieve64[i]);
191-
192-
for (; i + 8 < stop_idx; i += 8)
193-
{
194-
vec = _mm512_loadu_epi64(&sieve64[i]);
195-
vec = _mm512_popcnt_epi64(vec);
196-
vcnt = _mm512_add_epi64(vcnt, vec);
197-
}
198-
199-
__mmask8 mask = (__mmask8) (0xff >> (i + 8 - stop_idx));
200-
vec = _mm512_maskz_loadu_epi64(mask, &sieve64[i]);
201-
vec = _mm512_popcnt_epi64(vec);
202-
vcnt = _mm512_add_epi64(vcnt, vec);
203-
return _mm512_reduce_add_epi64(vcnt);
144+
SIEVE_COUNT_AVX512(start, stop);
145+
return cnt;
204146
}
205147

206148
#elif defined(ENABLE_ARM_SVE) || \
@@ -219,42 +161,8 @@ uint64_t Sieve::count_arm_sve(uint64_t start, uint64_t stop) const
219161
if (start > stop)
220162
return 0;
221163

222-
ASSERT(stop - start < segment_size());
223-
uint64_t start_idx = start / 240;
224-
uint64_t stop_idx = stop / 240;
225-
uint64_t m1 = unset_smaller[start % 240];
226-
uint64_t m2 = unset_larger[stop % 240];
227-
228-
// Branchfree bitmask calculation:
229-
// if (start_idx == stop_idx) m1 = m1 & m2;
230-
// if (start_idx == stop_idx) m2 = 0;
231-
CONDITIONAL_MOVE(start_idx == stop_idx, m1, m1 & m2);
232-
CONDITIONAL_MOVE(start_idx == stop_idx, m2, 0);
233-
234-
const uint64_t* sieve64 = (const uint64_t*) sieve_.data();
235-
uint64_t start_bits = sieve64[start_idx] & m1;
236-
uint64_t stop_bits = sieve64[stop_idx] & m2;
237-
ASSERT(svcntd() >= 2);
238-
svuint64_t vec = svinsr_n_u64(svdup_u64(start_bits), stop_bits);
239-
svuint64_t vcnt = svcnt_u64_z(svwhilelt_b64(0, 2), vec);
240-
uint64_t i = start_idx + 1;
241-
242-
// Compute this for loop using ARM SVE.
243-
// for (i = start_idx + 1; i < stop_idx; i++)
244-
// cnt += popcnt64(sieve64[i]);
245-
246-
for (; i + svcntd() < stop_idx; i += svcntd())
247-
{
248-
vec = svld1_u64(svptrue_b64(), &sieve64[i]);
249-
vec = svcnt_u64_x(svptrue_b64(), vec);
250-
vcnt = svadd_u64_x(svptrue_b64(), vcnt, vec);
251-
}
252-
253-
svbool_t pg = svwhilelt_b64(i, stop_idx);
254-
vec = svld1_u64(pg, &sieve64[i]);
255-
vec = svcnt_u64_z(pg, vec);
256-
vcnt = svadd_u64_x(svptrue_b64(), vcnt, vec);
257-
return svaddv_u64(svptrue_b64(), vcnt);
164+
SIEVE_COUNT_ARM_SVE(start, stop);
165+
return cnt;
258166
}
259167

260168
#endif

0 commit comments

Comments
 (0)