-
-
Notifications
You must be signed in to change notification settings - Fork 4
FIX: Adding build-patch to disable Scalar FMA on SLEEF #62
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
a33bb6f
efba625
f574019
bb4dfc0
01e2006
1281abf
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,3 @@ | ||
| option('disable_fma', type: 'boolean', value: false, | ||
| description: 'Disable FMA (Fused Multiply-Add) code paths' + | ||
| 'Set to true when building for older CPUs like Sandy Bridge that lack FMA support.') |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,100 @@ | ||
| diff --git a/CMakeLists.txt b/CMakeLists.txt | ||
| index 1234567..abcdefg 100644 | ||
| --- a/CMakeLists.txt | ||
| +++ b/CMakeLists.txt | ||
| @@ -90,6 +90,10 @@ option(SLEEF_ENFORCE_CUDA "Build fails if CUDA is not supported" OFF) | ||
| option(SLEEF_DISABLE_OPENMP "Disable OPENMP" OFF) | ||
| option(SLEEF_ENFORCE_OPENMP "Build fails if OPENMP is not supported by the compiler" OFF) | ||
|
|
||
| +# Option to disable PURECFMA scalar code path on x86 for x86-64-v2 compatibility | ||
| +# When ON, PURECFMA scalar dispatch is disabled (useful for Sandy Bridge support) | ||
| +# This can be set dynamically by the build system based on target CPU detection | ||
| +option(SLEEF_DISABLE_PURECFMA_SCALAR "Disable PURECFMA scalar code path (for x86-64-v2 compatibility)" OFF) | ||
| # | ||
|
|
||
| if ((NOT "${CMAKE_C_COMPILER_ID}" STREQUAL "${CMAKE_CXX_COMPILER_ID}") OR | ||
| diff --git a/Configure.cmake b/Configure.cmake | ||
| index e23f577..f1a2b3c 100644 | ||
| --- a/Configure.cmake | ||
| +++ b/Configure.cmake | ||
| @@ -193,7 +193,12 @@ endif() | ||
| if(SLEEF_TARGET_PROCESSOR MATCHES "(x86|AMD64|amd64|^i.86$)") | ||
| set(SLEEF_ARCH_X86 ON CACHE INTERNAL "True for x86 architecture.") | ||
|
|
||
| - set(CLANG_FLAGS_ENABLE_PURECFMA_SCALAR "-mavx2;-mfma") | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Unfortunately my cmake is pretty terrible: is there a way to set these flags for x86_64-v3 or newer?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I was reading and it seems querying CPUIDs is a way to conditionally set the flags, also not just flags but the entire dispatch mechanism.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK, thanks! It'd be nice to enable better optimizations when people build for themselves but not critical. |
||
| + # Only set PURECFMA_SCALAR flags if not explicitly disabled | ||
| + if(NOT SLEEF_DISABLE_PURECFMA_SCALAR) | ||
| + set(CLANG_FLAGS_ENABLE_PURECFMA_SCALAR "-mavx2;-mfma") | ||
| + else() | ||
| + message(STATUS "PURECFMA_SCALAR disabled for x86-64-v2 compatibility") | ||
| + endif() | ||
| elseif(SLEEF_TARGET_PROCESSOR MATCHES "aarch64|arm64") | ||
| set(SLEEF_ARCH_AARCH64 ON CACHE INTERNAL "True for Aarch64 architecture.") | ||
| # Aarch64 requires support for advsimdfma4 | ||
| @@ -220,7 +225,12 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64") | ||
| endif() | ||
|
|
||
| set(COMPILER_SUPPORTS_PUREC_SCALAR 1) | ||
| -set(COMPILER_SUPPORTS_PURECFMA_SCALAR 1) | ||
| +# Conditionally enable PURECFMA_SCALAR based on option | ||
| +if(SLEEF_DISABLE_PURECFMA_SCALAR) | ||
| + set(COMPILER_SUPPORTS_PURECFMA_SCALAR 0) | ||
| +else() | ||
| + set(COMPILER_SUPPORTS_PURECFMA_SCALAR 1) | ||
| +endif() | ||
|
|
||
| # Compiler feature detection | ||
|
|
||
| diff --git a/src/quad/CMakeLists.txt b/src/quad/CMakeLists.txt | ||
| index 8e4e261..cc55002 100644 | ||
| --- a/src/quad/CMakeLists.txt | ||
| +++ b/src/quad/CMakeLists.txt | ||
| @@ -397,9 +397,17 @@ set_target_properties(qmkdisp PROPERTIES ${COMMON_TARGET_PROPERTIES}) | ||
|
|
||
| # Target qdispscalar.c | ||
|
|
||
| +# Set scalar dispatch backends based on PURECFMA support | ||
| +# When SLEEF_DISABLE_PURECFMA_SCALAR is ON, use purec for both slots | ||
| +if(COMPILER_SUPPORTS_PURECFMA_SCALAR) | ||
| + set(SCALAR_DISPATCH_BACKENDS "purec" "purecfma") | ||
| +else() | ||
| + set(SCALAR_DISPATCH_BACKENDS "purec" "purec") | ||
| +endif() | ||
| + | ||
| add_custom_command( | ||
| OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/qdispscalar.c.body | ||
| - COMMAND $<TARGET_FILE:qmkdisp> 1 Sleef_quad double int32_t int64_t uint64_t purec purecfma > ${CMAKE_CURRENT_BINARY_DIR}/qdispscalar.c.body | ||
| + COMMAND $<TARGET_FILE:qmkdisp> 1 Sleef_quad double int32_t int64_t uint64_t ${SCALAR_DISPATCH_BACKENDS} > ${CMAKE_CURRENT_BINARY_DIR}/qdispscalar.c.body | ||
| DEPENDS qmkdisp | ||
| ) | ||
| sleef_concat_files( | ||
| @@ -420,6 +428,11 @@ target_compile_definitions(qdispscalar_obj PRIVATE ${COMMON_TARGET_DEFINITIONS}) | ||
| target_include_directories(qdispscalar_obj PRIVATE ${sleef_BINARY_DIR}/include) | ||
| add_dependencies(qdispscalar_obj qdispscalar.c_generated qrenamedspscalar.h_generated | ||
| sleefquad_headers ${TARGET_LIBSLEEF} ${TARGET_HEADERS}) | ||
| +# Define ENABLE_PURECFMA when PURECFMA is supported, so qdispscalar.c.org | ||
| +# can conditionally include the tryFMA() function and SUBST_IF_EXT1 macro | ||
| +if(COMPILER_SUPPORTS_PURECFMA_SCALAR) | ||
| + target_compile_definitions(qdispscalar_obj PRIVATE ENABLE_PURECFMA=1) | ||
| +endif() | ||
| target_sources(sleefquad PRIVATE $<TARGET_OBJECTS:qdispscalar_obj>) | ||
|
|
||
| # Target qdispsse2_obj | ||
| diff --git a/src/quad/qdispscalar.c.org b/src/quad/qdispscalar.c.org | ||
| index c4c1292..48f309c 100644 | ||
| --- a/src/quad/qdispscalar.c.org | ||
| +++ b/src/quad/qdispscalar.c.org | ||
| @@ -15,10 +15,14 @@ | ||
|
|
||
| #include "qdispatcher.h" | ||
|
|
||
| +#ifdef ENABLE_PURECFMA | ||
| NOEXPORT Sleef_quad sleef_cpuid_QUADFMA_0; | ||
| static void tryFMA() { sleef_cpuid_QUADFMA_0 = Sleef_sinq1_u10purecfma(sleef_cpuid_QUADFMA_0); } | ||
|
|
||
| #define SUBST_IF_EXT1(funcExt1) if (cpuSupportsExt(tryFMA)) p = funcExt1; | ||
| +#else | ||
| +#define SUBST_IF_EXT1(funcExt1) | ||
| +#endif | ||
|
|
||
| // | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,3 @@ | ||
| option('disable_fma', type: 'boolean', value: false, | ||
| description: 'Force disable FMA (Fused Multiply-Add) code paths. ' + | ||
| 'Use this when targeting x86_64-v2 CPUs (like Sandy Bridge) that lack FMA support.') |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5877,4 +5877,56 @@ def test_logical_reduce_on_non_quad_arrays(): | |
| with standard NumPy operations like np.logical_or.reduce(np.arange(10.)). | ||
| """ | ||
| result = np.logical_or.reduce(np.arange(10.)) | ||
| assert result == True | ||
| assert result == True | ||
|
|
||
|
|
||
| def test_sleef_purecfma_symbols(): | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's a test that justs prints the available
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Which seems to work as from SandyBridge processor logs (fma is disabled) the output has nothing tests/test_quaddtype.py::test_sleef_purecfma_symbols PASSEDbut on haswell (fma enabled) output shows some tests/test_quaddtype.py::test_logical_reduce_on_non_quad_arrays PASSED
tests/test_quaddtype.py::test_sleef_purecfma_symbols
✓ Found 67 PURECFMA symbols (FMA optimizations enabled)
Sample symbols:
000000000014cd90 t sleef_acoshq1_u10purecfma
000000000013eab0 t sleef_acosq1_u10purecfma
0000000000129540 t sleef_addq1_u05purecfma
000000000014a150 t sleef_asinhq1_u10purecfma
000000000013d3b0 t sleef_asinq1_u10purecfma
... and 62 more
PASSED |
||
| """Test that SLEEF PURECFMA symbols are present in the compiled module. | ||
|
|
||
| PURECFMA provides optimized scalar code paths using FMA instructions. | ||
| This test verifies the module was built with FMA support enabled. | ||
| On systems without FMA (e.g., x86-64-v2/Sandy Bridge), the build should | ||
| automatically disable PURECFMA, and this test should be skipped. | ||
| """ | ||
| import subprocess | ||
| import shutil | ||
| import pathlib | ||
|
|
||
| # Skip if nm is not available | ||
| nm_path = shutil.which('nm') | ||
| if nm_path is None: | ||
| pytest.skip("nm command not available") | ||
|
|
||
| # Get the path to the compiled shared library (.so file) | ||
| module_dir = pathlib.Path(numpy_quaddtype.__file__).parent | ||
| so_files = list(module_dir.glob('_quaddtype_main*.so')) | ||
|
|
||
| if not so_files: | ||
| pytest.skip("Could not find _quaddtype_main shared library") | ||
|
|
||
| module_path = str(so_files[0]) | ||
|
|
||
| try: | ||
| result = subprocess.run( | ||
| ['nm', module_path], | ||
| capture_output=True, | ||
| text=True, | ||
| timeout=30 | ||
| ) | ||
| except subprocess.TimeoutExpired: | ||
| pytest.skip("nm command timed out") | ||
| except FileNotFoundError: | ||
| pytest.skip("nm command not found") | ||
|
|
||
| purecfma_symbols = [ | ||
| line for line in result.stdout.lower().splitlines() | ||
| if 'purecfma' in line | ||
| ] | ||
|
|
||
| if purecfma_symbols: | ||
| print(f"\n✓ Found {len(purecfma_symbols)} PURECFMA symbols (FMA optimizations enabled)") | ||
| print(" Sample symbols:") | ||
| for sym in purecfma_symbols[:5]: | ||
| print(f" {sym}") | ||
| if len(purecfma_symbols) > 5: | ||
| print(f" ... and {len(purecfma_symbols) - 5} more") | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we still need this?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yeah will be easy to test the option on x86 machines