From af7700e124db7b9aeb7c80d32d258adaad44875e Mon Sep 17 00:00:00 2001 From: Kim Walisch Date: Wed, 19 Jun 2024 11:19:26 +0200 Subject: [PATCH] Tune AVX512 code --- include/Sieve.hpp | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/include/Sieve.hpp b/include/Sieve.hpp index 2fc9486d..df52396a 100644 --- a/include/Sieve.hpp +++ b/include/Sieve.hpp @@ -208,14 +208,18 @@ class Sieve // Compute this for loop using AVX512. // for (i = start_idx + 1; i < stop_idx; i++) // cnt += popcnt64(sieve64[i]); - do { - __mmask8 mask = (i + 8 < stop_idx) ? 0xff : (__mmask8) _bzhi_u64(0xff, stop_idx - i); - __m512i vec = _mm512_maskz_loadu_epi64(mask , &sieve64[i]); + // + for (; i + 8 < stop_idx; i += 8) + { + __m512i vec = _mm512_loadu_epi64(&sieve64[i]); vec = _mm512_popcnt_epi64(vec); vcnt = _mm512_add_epi64(vcnt, vec); - i += 8; } - while (i < stop_idx); + + __mmask8 mask = (__mmask8) _bzhi_u64(0xff, stop_idx - i); + __m512i vec = _mm512_maskz_loadu_epi64(mask , &sieve64[i]); + vec = _mm512_popcnt_epi64(vec); + vcnt = _mm512_add_epi64(vcnt, vec); cnt += _mm512_reduce_add_epi64(vcnt); cnt += popcnt64(sieve64[stop_idx] & m2);