diff --git a/cmake/multiarch_avx512_vpopcnt.cmake b/cmake/multiarch_avx512_vpopcnt.cmake index 9666d1db..199ddfe0 100644 --- a/cmake/multiarch_avx512_vpopcnt.cmake +++ b/cmake/multiarch_avx512_vpopcnt.cmake @@ -11,13 +11,12 @@ set(CMAKE_REQUIRED_INCLUDES "${PROJECT_SOURCE_DIR}") check_cxx_source_compiles(" // GCC/Clang function multiversioning for AVX512 is not needed if - // the user compiles with -mavx512f -mavx512vpopcntdq -mbmi2. + // the user compiles with -mavx512f -mavx512vpopcntdq. // GCC/Clang function multiversioning generally causes a minor // overhead, hence we disable it if it is not needed. #if defined(__AVX512F__) && \ - defined(__AVX512VPOPCNTDQ__) && \ - defined(__BMI2__) - Error: AVX512 BMI2 multiarch not needed! + defined(__AVX512VPOPCNTDQ__) + Error: AVX512 multiarch not needed! #endif #include @@ -27,8 +26,8 @@ check_cxx_source_compiles(" class Sieve { public: uint64_t count_default(uint64_t* array, uint64_t stop_idx); - __attribute__ ((target (\"avx512f,avx512vpopcntdq,bmi2\"))) - uint64_t count_avx512_bmi2(uint64_t* array, uint64_t stop_idx); + __attribute__ ((target (\"avx512f,avx512vpopcntdq\"))) + uint64_t count_avx512(uint64_t* array, uint64_t stop_idx); }; uint64_t Sieve::count_default(uint64_t* array, uint64_t stop_idx) @@ -39,8 +38,8 @@ check_cxx_source_compiles(" return res; } - __attribute__ ((target (\"avx512f,avx512vpopcntdq,bmi2\"))) - uint64_t Sieve::count_avx512_bmi2(uint64_t* array, uint64_t stop_idx) + __attribute__ ((target (\"avx512f,avx512vpopcntdq\"))) + uint64_t Sieve::count_avx512(uint64_t* array, uint64_t stop_idx) { uint64_t i = 0; __m512i vcnt = _mm512_setzero_si512(); @@ -52,7 +51,7 @@ check_cxx_source_compiles(" vcnt = _mm512_add_epi64(vcnt, vec); } - __mmask8 mask = (__mmask8) _bzhi_u64(0xff, stop_idx - i); + __mmask8 mask = (__mmask8) (0xff >> (i + 8 - stop_idx)); __m512i vec = _mm512_maskz_loadu_epi64(mask , &array[i]); vec = _mm512_popcnt_epi64(vec); vcnt = _mm512_add_epi64(vcnt, vec); @@ -65,8 +64,8 @@ check_cxx_source_compiles(" uint64_t cnt = 0; Sieve sieve; - if (primecount::has_cpuid_avx512_bmi2()) - cnt = sieve.count_avx512_bmi2(&array[0], 10); + if (primecount::has_cpuid_avx512_vpopcnt()) + cnt = sieve.count_avx512(&array[0], 10); else cnt = sieve.count_default(&array[0], 10); @@ -75,7 +74,7 @@ check_cxx_source_compiles(" " multiarch_avx512_vpopcnt) if(multiarch_avx512_vpopcnt) - list(APPEND PRIMECOUNT_COMPILE_DEFINITIONS "ENABLE_MULTIARCH_AVX512_BMI2") + list(APPEND PRIMECOUNT_COMPILE_DEFINITIONS "ENABLE_MULTIARCH_AVX512_VPOPCNT") endif() cmake_pop_check_state() diff --git a/include/Sieve.hpp b/include/Sieve.hpp index 51b4685f..780f53d4 100644 --- a/include/Sieve.hpp +++ b/include/Sieve.hpp @@ -50,19 +50,17 @@ #elif defined(__AVX512F__) && \ defined(__AVX512VPOPCNTDQ__) && \ - defined(__BMI2__) && \ - !defined(__i386__) /* misses _bzhi_u64() */ && \ __has_include() #include - #define ENABLE_AVX512_BMI2 + #define ENABLE_AVX512_VPOPCNT #elif defined(ENABLE_MULTIARCH_ARM_SVE) #include #include #define ENABLE_DEFAULT -#elif defined(ENABLE_MULTIARCH_AVX512_BMI2) - #include +#elif defined(ENABLE_MULTIARCH_AVX512_VPOPCNT) + #include #include #define ENABLE_DEFAULT #else @@ -129,12 +127,12 @@ class Sieve { #if defined(ENABLE_ARM_SVE) return count_arm_sve(start, stop); - #elif defined(ENABLE_AVX512_BMI2) - return count_avx512_bmi2(start, stop); + #elif defined(ENABLE_AVX512_VPOPCNT) + return count_avx512(start, stop); #elif defined(ENABLE_MULTIARCH_ARM_SVE) return cpu_supports_sve ? count_arm_sve(start, stop) : count_default(start, stop); - #elif defined(ENABLE_MULTIARCH_AVX512_BMI2) - return cpu_supports_avx512_bmi2 ? count_avx512_bmi2(start, stop) : count_default(start, stop); + #elif defined(ENABLE_MULTIARCH_AVX512_VPOPCNT) + return cpu_supports_avx512_vpopcnt ? count_avx512(start, stop) : count_default(start, stop); #else return count_default(start, stop); #endif @@ -180,18 +178,18 @@ class Sieve #endif -#if defined(ENABLE_AVX512_BMI2) || \ - defined(ENABLE_MULTIARCH_AVX512_BMI2) +#if defined(ENABLE_AVX512_VPOPCNT) || \ + defined(ENABLE_MULTIARCH_AVX512_VPOPCNT) /// Count 1 bits inside [start, stop]. /// The distance [start, stop] is small here < sqrt(segment_size), /// hence we simply count the number of unsieved elements /// by linearly iterating over the sieve array. /// - #if defined(ENABLE_MULTIARCH_AVX512_BMI2) - __attribute__ ((target ("avx512f,avx512vpopcntdq,bmi2"))) + #if defined(ENABLE_MULTIARCH_AVX512_VPOPCNT) + __attribute__ ((target ("avx512f,avx512vpopcntdq"))) #endif - uint64_t count_avx512_bmi2(uint64_t start, uint64_t stop) const + uint64_t count_avx512(uint64_t start, uint64_t stop) const { if (start > stop) return 0; @@ -203,33 +201,34 @@ class Sieve uint64_t m2 = unset_larger[stop % 240]; const uint64_t* sieve64 = (const uint64_t*) sieve_.data(); - if (start_idx == stop_idx) - return popcnt64(sieve64[start_idx] & m1 & m2); - else + // Branchfree bitmask calculation: + // m1 = (start_idx != stop_idx) ? m1 : m1 & m2; + m1 = (m1 * (start_idx != stop_idx)) | ((m1 & m2) * (start_idx == stop_idx)); + // m2 = (start_idx != stop_idx) ? m2 : 0; + m2 *= (start_idx != stop_idx); + + uint64_t i = start_idx + 1; + uint64_t start_bits = sieve64[start_idx] & m1; + uint64_t stop_bits = sieve64[stop_idx] & m2; + __m512i vec = _mm512_set_epi64(0, 0, 0, 0, 0, 0, stop_bits, start_bits); + __m512i vcnt = _mm512_popcnt_epi64(vec); + + // Compute this for loop using AVX512. + // for (i = start_idx + 1; i < stop_idx; i++) + // cnt += popcnt64(sieve64[i]); + // + for (; i + 8 < stop_idx; i += 8) { - uint64_t i = start_idx + 1; - uint64_t start_bits = sieve64[start_idx] & m1; - uint64_t stop_bits = sieve64[stop_idx] & m2; - __m512i vec = _mm512_set_epi64(0, 0, 0, 0, 0, 0, stop_bits, start_bits); - __m512i vcnt = _mm512_popcnt_epi64(vec); - - // Compute this for loop using AVX512. - // for (i = start_idx + 1; i < stop_idx; i++) - // cnt += popcnt64(sieve64[i]); - // - for (; i + 8 < stop_idx; i += 8) - { - vec = _mm512_loadu_epi64(&sieve64[i]); - vec = _mm512_popcnt_epi64(vec); - vcnt = _mm512_add_epi64(vcnt, vec); - } - - __mmask8 mask = (__mmask8) _bzhi_u64(0xff, stop_idx - i); - vec = _mm512_maskz_loadu_epi64(mask, &sieve64[i]); + vec = _mm512_loadu_epi64(&sieve64[i]); vec = _mm512_popcnt_epi64(vec); vcnt = _mm512_add_epi64(vcnt, vec); - return _mm512_reduce_add_epi64(vcnt); } + + __mmask8 mask = (__mmask8) (0xff >> (i + 8 - stop_idx)); + vec = _mm512_maskz_loadu_epi64(mask, &sieve64[i]); + vec = _mm512_popcnt_epi64(vec); + vcnt = _mm512_add_epi64(vcnt, vec); + return _mm512_reduce_add_epi64(vcnt); } #elif defined(ENABLE_ARM_SVE) || \ diff --git a/include/cpu_supports_avx512_bmi2.hpp b/include/cpu_supports_avx512_vpopcnt.hpp similarity index 59% rename from include/cpu_supports_avx512_bmi2.hpp rename to include/cpu_supports_avx512_vpopcnt.hpp index 6e0494c6..e56a7221 100644 --- a/include/cpu_supports_avx512_bmi2.hpp +++ b/include/cpu_supports_avx512_vpopcnt.hpp @@ -1,5 +1,5 @@ /// -/// @file cpu_supports_avx512_bmi2.hpp +/// @file cpu_supports_avx512_vpopcnt.hpp /// @brief Detect if the x86 CPU supports AVX512 and BMI2. /// /// Copyright (C) 2024 Kim Walisch, @@ -8,19 +8,19 @@ /// file in the top level directory. /// -#ifndef CPU_SUPPORTS_AVX512_BMI2_HPP -#define CPU_SUPPORTS_AVX512_BMI2_HPP +#ifndef CPU_SUPPORTS_AVX512_VPOPCNT_HPP +#define CPU_SUPPORTS_AVX512_VPOPCNT_HPP namespace primecount { -bool has_cpuid_avx512_bmi2(); +bool has_cpuid_avx512_vpopcnt(); } // namespace namespace { /// Initialized at startup -const bool cpu_supports_avx512_bmi2 = primecount::has_cpuid_avx512_bmi2(); +const bool cpu_supports_avx512_vpopcnt = primecount::has_cpuid_avx512_vpopcnt(); } // namespace diff --git a/scripts/build_clang_multiarch_win_x64.bat b/scripts/build_clang_multiarch_win_x64.bat index 0a0524df..dfa62cf6 100644 --- a/scripts/build_clang_multiarch_win_x64.bat +++ b/scripts/build_clang_multiarch_win_x64.bat @@ -10,6 +10,6 @@ :: include all MMX, SSE, POPCNT, BMI, BMI2, AVX, AVX and AVX512 headers. del /Q ..\src\deleglise-rivat\S2_easy.cpp ..\src\gourdon\AC.cpp -clang++ -I../include -I../lib/primesieve/include -O3 -mpopcnt -fopenmp -Wall -Wextra -pedantic -DNDEBUG -DENABLE_INT128_OPENMP_PATCH -DENABLE_MULTIARCH_AVX512_VBMI2 -DENABLE_MULTIARCH_AVX512_BMI2 ../lib/primesieve/src/*.cpp ../lib/primesieve/src/x86/*.cpp ../src/*.cpp ../src/x86/*.cpp ../src/lmo/*.cpp ../src/deleglise-rivat/*.cpp ../src/gourdon/*.cpp ../src/app/*.cpp -o primecount.exe "C:\Program Files\LLVM\lib\clang\18\lib\windows\clang_rt.builtins-x86_64.lib" +clang++ -I../include -I../lib/primesieve/include -O3 -mpopcnt -fopenmp -Wall -Wextra -pedantic -DNDEBUG -DENABLE_INT128_OPENMP_PATCH -DENABLE_MULTIARCH_AVX512_VBMI2 -DENABLE_MULTIARCH_AVX512_VPOPCNT ../lib/primesieve/src/*.cpp ../lib/primesieve/src/x86/*.cpp ../src/*.cpp ../src/x86/*.cpp ../src/lmo/*.cpp ../src/deleglise-rivat/*.cpp ../src/gourdon/*.cpp ../src/app/*.cpp -o primecount.exe "C:\Program Files\LLVM\lib\clang\18\lib\windows\clang_rt.builtins-x86_64.lib" git checkout ..\src\deleglise-rivat git checkout ..\src\gourdon diff --git a/src/x86/cpuid.cpp b/src/x86/cpuid.cpp index e8556ffd..1d7064d7 100644 --- a/src/x86/cpuid.cpp +++ b/src/x86/cpuid.cpp @@ -19,7 +19,6 @@ // https://en.wikipedia.org/wiki/CPUID // %ebx bit flags -#define bit_BMI2 (1 << 8) #define bit_AVX512F (1 << 16) // %ecx bit flags @@ -91,7 +90,7 @@ bool has_cpuid_popcnt() return (abcd[2] & bit_POPCNT) == bit_POPCNT; } -bool has_cpuid_avx512_bmi2() +bool has_cpuid_avx512_vpopcnt() { int abcd[4]; @@ -117,9 +116,6 @@ bool has_cpuid_avx512_bmi2() run_cpuid(7, 0, abcd); - if ((abcd[1] & bit_BMI2) != bit_BMI2) - return false; - // AVX512F, AVX512VPOPCNTDQ return ((abcd[1] & bit_AVX512F) == bit_AVX512F && (abcd[2] & bit_AVX512_VPOPCNTDQ) == bit_AVX512_VPOPCNTDQ);