Skip to content

Commit

Permalink
Simplify AVX512 code
Browse files Browse the repository at this point in the history
  • Loading branch information
kimwalisch committed Jun 19, 2024
1 parent af7700e commit dcc4288
Show file tree
Hide file tree
Showing 5 changed files with 45 additions and 46 deletions.
2 changes: 2 additions & 0 deletions ChangeLog
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
Changes in primecount-7.14, 2024-06-17

* int128_t.hpp: Rename namespace port to pstd (portable std namespace).
* Sieve.hpp: Simplify AVX512 code.
* multiarch_avx512_vpopcnt.cmake: Simplify AVX512 code.

Changes in primecount-7.13, 2024-04-15

Expand Down
40 changes: 21 additions & 19 deletions cmake/multiarch_avx512_vpopcnt.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -11,24 +11,23 @@ set(CMAKE_REQUIRED_INCLUDES "${PROJECT_SOURCE_DIR}/include")

check_cxx_source_compiles("
// GCC/Clang function multiversioning for AVX512 is not needed if
// the user compiles with -mavx512f -mavx512vpopcntdq -mbmi2.
// the user compiles with -mavx512f -mavx512vpopcntdq.
// GCC/Clang function multiversioning generally causes a minor
// overhead, hence we disable it if it is not needed.
#if defined(__AVX512F__) && \
defined(__AVX512VPOPCNTDQ__) && \
defined(__BMI2__)
Error: AVX512 BMI2 multiarch not needed!
defined(__AVX512VPOPCNTDQ__)
Error: AVX512 VPOPCNT multiarch not needed!
#endif
#include <cpu_supports_avx512_bmi2.hpp>
#include <cpu_supports_avx512_vpopcnt.hpp>
#include <immintrin.h>
#include <stdint.h>
class Sieve {
public:
uint64_t count_default(uint64_t* array, uint64_t stop_idx);
__attribute__ ((target (\"avx512f,avx512vpopcntdq,bmi2\")))
uint64_t count_avx512_bmi2(uint64_t* array, uint64_t stop_idx);
__attribute__ ((target (\"avx512f,avx512vpopcntdq\")))
uint64_t count_avx512_vpopcnt(uint64_t* array, uint64_t stop_idx);
};
uint64_t Sieve::count_default(uint64_t* array, uint64_t stop_idx)
Expand All @@ -39,20 +38,23 @@ check_cxx_source_compiles("
return res;
}
__attribute__ ((target (\"avx512f,avx512vpopcntdq,bmi2\")))
uint64_t Sieve::count_avx512_bmi2(uint64_t* array, uint64_t stop_idx)
__attribute__ ((target (\"avx512f,avx512vpopcntdq\")))
uint64_t Sieve::count_avx512_vpopcnt(uint64_t* array, uint64_t stop_idx)
{
uint64_t i = 0;
__m512i vcnt = _mm512_setzero_si512();
do
for (; i + 8 < stop_idx; i += 8)
{
__mmask8 mask = (i + 8 < stop_idx) ? 0xff : (__mmask8) _bzhi_u64(0xff, stop_idx - i);
__m512i vec = _mm512_maskz_loadu_epi64(mask , &array[i]);
vec = _mm512_popcnt_epi64(vec);
vcnt = _mm512_add_epi64(vcnt, vec);
i += 8;
__m512i vec = _mm512_loadu_epi64(&array[i]);
vec = _mm512_popcnt_epi64(vec);
vcnt = _mm512_add_epi64(vcnt, vec);
}
while (i < stop_idx);
__mmask8 mask = 0xff >> (stop_idx - i);
__m512i vec = _mm512_maskz_loadu_epi64(mask , &array[i]);
vec = _mm512_popcnt_epi64(vec);
vcnt = _mm512_add_epi64(vcnt, vec);
return _mm512_reduce_add_epi64(vcnt);
}
Expand All @@ -62,8 +64,8 @@ check_cxx_source_compiles("
uint64_t cnt = 0;
Sieve sieve;
if (cpu_supports_avx512_bmi2)
cnt = sieve.count_avx512_bmi2(&array[0], 10);
if (cpu_supports_avx512_vpopcnt)
cnt = sieve.count_avx512_vpopcnt(&array[0], 10);
else
cnt = sieve.count_default(&array[0], 10);
Expand All @@ -72,7 +74,7 @@ check_cxx_source_compiles("
" multiarch_avx512_vpopcnt)

if(multiarch_avx512_vpopcnt)
set(ENABLE_MULTIARCH "ENABLE_MULTIARCH_AVX512_BMI2")
set(ENABLE_MULTIARCH "ENABLE_MULTIARCH_AVX512_VPOPCNT")
endif()

cmake_pop_check_state()
31 changes: 15 additions & 16 deletions include/Sieve.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,19 +49,18 @@
#define ENABLE_ARM_SVE

#elif defined(__AVX512F__) && \
defined(__AVX512VPOPCNTDQ__) && \
defined(__BMI2__) && \
!defined(__i386__) /* misses _bzhi_u64() */ && \
defined(__AVX512VPOPCNTDQ__)
__has_include(<immintrin.h>)
#include <immintrin.h>
#define ENABLE_AVX512_BMI2
#define ENABLE_AVX512_VPOPCNT

#elif defined(ENABLE_MULTIARCH_ARM_SVE)
#include <cpu_supports_arm_sve.hpp>
#include <arm_sve.h>
#define ENABLE_DEFAULT
#elif defined(ENABLE_MULTIARCH_AVX512_BMI2)
#include <cpu_supports_avx512_bmi2.hpp>

#elif defined(ENABLE_MULTIARCH_AVX512_VPOPCNT)
#include <cpu_supports_avx512_vpopcnt.hpp>
#include <immintrin.h>
#define ENABLE_DEFAULT
#else
Expand Down Expand Up @@ -113,12 +112,12 @@ class Sieve
{
#if defined(ENABLE_ARM_SVE)
return count_arm_sve(start, stop);
#elif defined(ENABLE_AVX512_BMI2)
return count_avx512_bmi2(start, stop);
#elif defined(ENABLE_AVX512_VPOPCNT)
return count_avx512_vpopcnt(start, stop);
#elif defined(ENABLE_MULTIARCH_ARM_SVE)
return cpu_supports_sve ? count_arm_sve(start, stop) : count_default(start, stop);
#elif defined(ENABLE_MULTIARCH_AVX512_BMI2)
return cpu_supports_avx512_bmi2 ? count_avx512_bmi2(start, stop) : count_default(start, stop);
#elif defined(ENABLE_MULTIARCH_AVX512_VPOPCNT)
return cpu_supports_avx512_vpopcnt ? count_avx512_vpopcnt(start, stop) : count_default(start, stop);
#else
return count_default(start, stop);
#endif
Expand Down Expand Up @@ -174,18 +173,18 @@ class Sieve

#endif

#if defined(ENABLE_AVX512_BMI2) || \
defined(ENABLE_MULTIARCH_AVX512_BMI2)
#if defined(ENABLE_AVX512_VPOPCNT) || \
defined(ENABLE_MULTIARCH_AVX512_VPOPCNT)

/// Count 1 bits inside [start, stop].
/// The distance [start, stop] is small here < sqrt(segment_size),
/// hence we simply count the number of unsieved elements
/// by linearly iterating over the sieve array.
///
#if defined(ENABLE_MULTIARCH_AVX512_BMI2)
__attribute__ ((target ("avx512f,avx512vpopcntdq,bmi2")))
#if defined(ENABLE_MULTIARCH_AVX512_VPOPCNT)
__attribute__ ((target ("avx512f,avx512vpopcntdq")))
#endif
uint64_t count_avx512_bmi2(uint64_t start, uint64_t stop) const
uint64_t count_avx512_vpopcnt(uint64_t start, uint64_t stop) const
{
if (start > stop)
return 0;
Expand Down Expand Up @@ -216,7 +215,7 @@ class Sieve
vcnt = _mm512_add_epi64(vcnt, vec);
}

__mmask8 mask = (__mmask8) _bzhi_u64(0xff, stop_idx - i);
__mmask8 mask = 0xff >> (stop_idx - i);
__m512i vec = _mm512_maskz_loadu_epi64(mask , &sieve64[i]);
vec = _mm512_popcnt_epi64(vec);
vcnt = _mm512_add_epi64(vcnt, vec);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
///
/// @file cpu_supports_avx512_bmi2.hpp
/// @brief Detect if the x86 CPU supports AVX512 and BMI2.
/// @file cpu_supports_avx512_vpopcnt.hpp
/// @brief Detect if the x86 CPU supports AVX512 VPOPCNT.
///
/// Copyright (C) 2024 Kim Walisch, <[email protected]>
///
/// This file is distributed under the BSD License. See the COPYING
/// file in the top level directory.
///

#ifndef CPU_SUPPORTS_AVX512_BMI2_HPP
#define CPU_SUPPORTS_AVX512_BMI2_HPP
#ifndef CPU_SUPPORTS_AVX512_VPOPCNT_HPP
#define CPU_SUPPORTS_AVX512_VPOPCNT_HPP

#include <cpuid.hpp>

Expand All @@ -18,7 +18,6 @@
#endif

// %ebx bit flags
#define bit_BMI2 (1 << 8)
#define bit_AVX512F (1 << 16)

// %ecx bit flags
Expand All @@ -45,7 +44,7 @@ inline int get_xcr0()
return xcr0;
}

inline bool run_cpuid_avx512_bmi2()
inline bool run_cpuid_avx512_vpopcnt()
{
int abcd[4];

Expand All @@ -72,16 +71,13 @@ inline bool run_cpuid_avx512_bmi2()

run_cpuid(7, 0, abcd);

if ((abcd[1] & bit_BMI2) != bit_BMI2)
return false;

// AVX512F, AVX512VPOPCNTDQ
return ((abcd[1] & bit_AVX512F) == bit_AVX512F &&
(abcd[2] & bit_AVX512_VPOPCNTDQ) == bit_AVX512_VPOPCNTDQ);
}

/// Initialized at startup
bool cpu_supports_avx512_bmi2 = run_cpuid_avx512_bmi2();
bool cpu_supports_avx512_vpopcnt = run_cpuid_avx512_vpopcnt();

} // namespace

Expand Down
2 changes: 1 addition & 1 deletion scripts/build_clang_multiarch_win_x64.bat
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,6 @@
:: include all MMX, SSE, POPCNT, BMI, BMI2, AVX, AVX and AVX512 headers.

del /Q ..\src\deleglise-rivat\S2_easy.cpp ..\src\gourdon\AC.cpp
clang++ -I../include -I../lib/primesieve/include -O3 -mpopcnt -fopenmp -Wall -Wextra -pedantic -DNDEBUG -DENABLE_INT128_OPENMP_PATCH -DENABLE_MULTIARCH_AVX512 -DENABLE_MULTIARCH_AVX512_BMI2 ../lib/primesieve/src/*.cpp ../src/*.cpp ../src/lmo/*.cpp ../src/deleglise-rivat/*.cpp ../src/gourdon/*.cpp ../src/app/*.cpp -o primecount.exe "C:\Program Files\LLVM\lib\clang\18\lib\windows\clang_rt.builtins-x86_64.lib"
clang++ -I../include -I../lib/primesieve/include -O3 -mpopcnt -fopenmp -Wall -Wextra -pedantic -DNDEBUG -DENABLE_INT128_OPENMP_PATCH -DENABLE_MULTIARCH_AVX512 -DENABLE_MULTIARCH_AVX512_VPOPCNT ../lib/primesieve/src/*.cpp ../src/*.cpp ../src/lmo/*.cpp ../src/deleglise-rivat/*.cpp ../src/gourdon/*.cpp ../src/app/*.cpp -o primecount.exe "C:\Program Files\LLVM\lib\clang\18\lib\windows\clang_rt.builtins-x86_64.lib"
git checkout ..\src\deleglise-rivat
git checkout ..\src\gourdon

0 comments on commit dcc4288

Please sign in to comment.