From dcc42887b39d7955b2e8a642ab9c29b4321c45d9 Mon Sep 17 00:00:00 2001 From: Kim Walisch Date: Wed, 19 Jun 2024 12:08:07 +0200 Subject: [PATCH] Simplify AVX512 code --- ChangeLog | 2 + cmake/multiarch_avx512_vpopcnt.cmake | 40 ++++++++++--------- include/Sieve.hpp | 31 +++++++------- ...i2.hpp => cpu_supports_avx512_vpopcnt.hpp} | 16 +++----- scripts/build_clang_multiarch_win_x64.bat | 2 +- 5 files changed, 45 insertions(+), 46 deletions(-) rename include/{cpu_supports_avx512_bmi2.hpp => cpu_supports_avx512_vpopcnt.hpp} (80%) diff --git a/ChangeLog b/ChangeLog index d5fb33ab..613635c1 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,6 +1,8 @@ Changes in primecount-7.14, 2024-06-17 * int128_t.hpp: Rename namespace port to pstd (portable std namespace). +* Sieve.hpp: Simplify AVX512 code. +* multiarch_avx512_vpopcnt.cmake: Simplify AVX512 code. Changes in primecount-7.13, 2024-04-15 diff --git a/cmake/multiarch_avx512_vpopcnt.cmake b/cmake/multiarch_avx512_vpopcnt.cmake index 6ef61cfb..cb59f9ef 100644 --- a/cmake/multiarch_avx512_vpopcnt.cmake +++ b/cmake/multiarch_avx512_vpopcnt.cmake @@ -11,24 +11,23 @@ set(CMAKE_REQUIRED_INCLUDES "${PROJECT_SOURCE_DIR}/include") check_cxx_source_compiles(" // GCC/Clang function multiversioning for AVX512 is not needed if - // the user compiles with -mavx512f -mavx512vpopcntdq -mbmi2. + // the user compiles with -mavx512f -mavx512vpopcntdq. // GCC/Clang function multiversioning generally causes a minor // overhead, hence we disable it if it is not needed. #if defined(__AVX512F__) && \ - defined(__AVX512VPOPCNTDQ__) && \ - defined(__BMI2__) - Error: AVX512 BMI2 multiarch not needed! + defined(__AVX512VPOPCNTDQ__) + Error: AVX512 VPOPCNT multiarch not needed! #endif - #include + #include #include #include class Sieve { public: uint64_t count_default(uint64_t* array, uint64_t stop_idx); - __attribute__ ((target (\"avx512f,avx512vpopcntdq,bmi2\"))) - uint64_t count_avx512_bmi2(uint64_t* array, uint64_t stop_idx); + __attribute__ ((target (\"avx512f,avx512vpopcntdq\"))) + uint64_t count_avx512_vpopcnt(uint64_t* array, uint64_t stop_idx); }; uint64_t Sieve::count_default(uint64_t* array, uint64_t stop_idx) @@ -39,20 +38,23 @@ check_cxx_source_compiles(" return res; } - __attribute__ ((target (\"avx512f,avx512vpopcntdq,bmi2\"))) - uint64_t Sieve::count_avx512_bmi2(uint64_t* array, uint64_t stop_idx) + __attribute__ ((target (\"avx512f,avx512vpopcntdq\"))) + uint64_t Sieve::count_avx512_vpopcnt(uint64_t* array, uint64_t stop_idx) { uint64_t i = 0; __m512i vcnt = _mm512_setzero_si512(); - do + + for (; i + 8 < stop_idx; i += 8) { - __mmask8 mask = (i + 8 < stop_idx) ? 0xff : (__mmask8) _bzhi_u64(0xff, stop_idx - i); - __m512i vec = _mm512_maskz_loadu_epi64(mask , &array[i]); - vec = _mm512_popcnt_epi64(vec); - vcnt = _mm512_add_epi64(vcnt, vec); - i += 8; + __m512i vec = _mm512_loadu_epi64(&array[i]); + vec = _mm512_popcnt_epi64(vec); + vcnt = _mm512_add_epi64(vcnt, vec); } - while (i < stop_idx); + + __mmask8 mask = 0xff >> (stop_idx - i); + __m512i vec = _mm512_maskz_loadu_epi64(mask , &array[i]); + vec = _mm512_popcnt_epi64(vec); + vcnt = _mm512_add_epi64(vcnt, vec); return _mm512_reduce_add_epi64(vcnt); } @@ -62,8 +64,8 @@ check_cxx_source_compiles(" uint64_t cnt = 0; Sieve sieve; - if (cpu_supports_avx512_bmi2) - cnt = sieve.count_avx512_bmi2(&array[0], 10); + if (cpu_supports_avx512_vpopcnt) + cnt = sieve.count_avx512_vpopcnt(&array[0], 10); else cnt = sieve.count_default(&array[0], 10); @@ -72,7 +74,7 @@ check_cxx_source_compiles(" " multiarch_avx512_vpopcnt) if(multiarch_avx512_vpopcnt) - set(ENABLE_MULTIARCH "ENABLE_MULTIARCH_AVX512_BMI2") + set(ENABLE_MULTIARCH "ENABLE_MULTIARCH_AVX512_VPOPCNT") endif() cmake_pop_check_state() diff --git a/include/Sieve.hpp b/include/Sieve.hpp index df52396a..1caef8cd 100644 --- a/include/Sieve.hpp +++ b/include/Sieve.hpp @@ -49,19 +49,18 @@ #define ENABLE_ARM_SVE #elif defined(__AVX512F__) && \ - defined(__AVX512VPOPCNTDQ__) && \ - defined(__BMI2__) && \ - !defined(__i386__) /* misses _bzhi_u64() */ && \ + defined(__AVX512VPOPCNTDQ__) __has_include() #include - #define ENABLE_AVX512_BMI2 + #define ENABLE_AVX512_VPOPCNT #elif defined(ENABLE_MULTIARCH_ARM_SVE) #include #include #define ENABLE_DEFAULT -#elif defined(ENABLE_MULTIARCH_AVX512_BMI2) - #include + +#elif defined(ENABLE_MULTIARCH_AVX512_VPOPCNT) + #include #include #define ENABLE_DEFAULT #else @@ -113,12 +112,12 @@ class Sieve { #if defined(ENABLE_ARM_SVE) return count_arm_sve(start, stop); - #elif defined(ENABLE_AVX512_BMI2) - return count_avx512_bmi2(start, stop); + #elif defined(ENABLE_AVX512_VPOPCNT) + return count_avx512_vpopcnt(start, stop); #elif defined(ENABLE_MULTIARCH_ARM_SVE) return cpu_supports_sve ? count_arm_sve(start, stop) : count_default(start, stop); - #elif defined(ENABLE_MULTIARCH_AVX512_BMI2) - return cpu_supports_avx512_bmi2 ? count_avx512_bmi2(start, stop) : count_default(start, stop); + #elif defined(ENABLE_MULTIARCH_AVX512_VPOPCNT) + return cpu_supports_avx512_vpopcnt ? count_avx512_vpopcnt(start, stop) : count_default(start, stop); #else return count_default(start, stop); #endif @@ -174,18 +173,18 @@ class Sieve #endif -#if defined(ENABLE_AVX512_BMI2) || \ - defined(ENABLE_MULTIARCH_AVX512_BMI2) +#if defined(ENABLE_AVX512_VPOPCNT) || \ + defined(ENABLE_MULTIARCH_AVX512_VPOPCNT) /// Count 1 bits inside [start, stop]. /// The distance [start, stop] is small here < sqrt(segment_size), /// hence we simply count the number of unsieved elements /// by linearly iterating over the sieve array. /// - #if defined(ENABLE_MULTIARCH_AVX512_BMI2) - __attribute__ ((target ("avx512f,avx512vpopcntdq,bmi2"))) + #if defined(ENABLE_MULTIARCH_AVX512_VPOPCNT) + __attribute__ ((target ("avx512f,avx512vpopcntdq"))) #endif - uint64_t count_avx512_bmi2(uint64_t start, uint64_t stop) const + uint64_t count_avx512_vpopcnt(uint64_t start, uint64_t stop) const { if (start > stop) return 0; @@ -216,7 +215,7 @@ class Sieve vcnt = _mm512_add_epi64(vcnt, vec); } - __mmask8 mask = (__mmask8) _bzhi_u64(0xff, stop_idx - i); + __mmask8 mask = 0xff >> (stop_idx - i); __m512i vec = _mm512_maskz_loadu_epi64(mask , &sieve64[i]); vec = _mm512_popcnt_epi64(vec); vcnt = _mm512_add_epi64(vcnt, vec); diff --git a/include/cpu_supports_avx512_bmi2.hpp b/include/cpu_supports_avx512_vpopcnt.hpp similarity index 80% rename from include/cpu_supports_avx512_bmi2.hpp rename to include/cpu_supports_avx512_vpopcnt.hpp index 9b812d3c..50c31d3e 100644 --- a/include/cpu_supports_avx512_bmi2.hpp +++ b/include/cpu_supports_avx512_vpopcnt.hpp @@ -1,6 +1,6 @@ /// -/// @file cpu_supports_avx512_bmi2.hpp -/// @brief Detect if the x86 CPU supports AVX512 and BMI2. +/// @file cpu_supports_avx512_vpopcnt.hpp +/// @brief Detect if the x86 CPU supports AVX512 VPOPCNT. /// /// Copyright (C) 2024 Kim Walisch, /// @@ -8,8 +8,8 @@ /// file in the top level directory. /// -#ifndef CPU_SUPPORTS_AVX512_BMI2_HPP -#define CPU_SUPPORTS_AVX512_BMI2_HPP +#ifndef CPU_SUPPORTS_AVX512_VPOPCNT_HPP +#define CPU_SUPPORTS_AVX512_VPOPCNT_HPP #include @@ -18,7 +18,6 @@ #endif // %ebx bit flags -#define bit_BMI2 (1 << 8) #define bit_AVX512F (1 << 16) // %ecx bit flags @@ -45,7 +44,7 @@ inline int get_xcr0() return xcr0; } -inline bool run_cpuid_avx512_bmi2() +inline bool run_cpuid_avx512_vpopcnt() { int abcd[4]; @@ -72,16 +71,13 @@ inline bool run_cpuid_avx512_bmi2() run_cpuid(7, 0, abcd); - if ((abcd[1] & bit_BMI2) != bit_BMI2) - return false; - // AVX512F, AVX512VPOPCNTDQ return ((abcd[1] & bit_AVX512F) == bit_AVX512F && (abcd[2] & bit_AVX512_VPOPCNTDQ) == bit_AVX512_VPOPCNTDQ); } /// Initialized at startup -bool cpu_supports_avx512_bmi2 = run_cpuid_avx512_bmi2(); +bool cpu_supports_avx512_vpopcnt = run_cpuid_avx512_vpopcnt(); } // namespace diff --git a/scripts/build_clang_multiarch_win_x64.bat b/scripts/build_clang_multiarch_win_x64.bat index 4f33f73e..65b071fa 100644 --- a/scripts/build_clang_multiarch_win_x64.bat +++ b/scripts/build_clang_multiarch_win_x64.bat @@ -10,6 +10,6 @@ :: include all MMX, SSE, POPCNT, BMI, BMI2, AVX, AVX and AVX512 headers. del /Q ..\src\deleglise-rivat\S2_easy.cpp ..\src\gourdon\AC.cpp -clang++ -I../include -I../lib/primesieve/include -O3 -mpopcnt -fopenmp -Wall -Wextra -pedantic -DNDEBUG -DENABLE_INT128_OPENMP_PATCH -DENABLE_MULTIARCH_AVX512 -DENABLE_MULTIARCH_AVX512_BMI2 ../lib/primesieve/src/*.cpp ../src/*.cpp ../src/lmo/*.cpp ../src/deleglise-rivat/*.cpp ../src/gourdon/*.cpp ../src/app/*.cpp -o primecount.exe "C:\Program Files\LLVM\lib\clang\18\lib\windows\clang_rt.builtins-x86_64.lib" +clang++ -I../include -I../lib/primesieve/include -O3 -mpopcnt -fopenmp -Wall -Wextra -pedantic -DNDEBUG -DENABLE_INT128_OPENMP_PATCH -DENABLE_MULTIARCH_AVX512 -DENABLE_MULTIARCH_AVX512_VPOPCNT ../lib/primesieve/src/*.cpp ../src/*.cpp ../src/lmo/*.cpp ../src/deleglise-rivat/*.cpp ../src/gourdon/*.cpp ../src/app/*.cpp -o primecount.exe "C:\Program Files\LLVM\lib\clang\18\lib\windows\clang_rt.builtins-x86_64.lib" git checkout ..\src\deleglise-rivat git checkout ..\src\gourdon