Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions .github/workflows/test_old_cpu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,13 @@ jobs:
env:
LDFLAGS: "-fopenmp"
run: |
pip install . --no-build-isolation -v
# For Sandy Bridge (x86-64-v2), we need to disable FMA code paths
# since FMA instructions are not available on that microarchitecture
if [ "${{ matrix.cpu[0] }}" = "snb" ]; then
pip install . --no-build-isolation -v -Csetup-args=-Ddisable_fma=true
else
pip install . --no-build-isolation -v
fi

- name: Test import on ${{ matrix.cpu[1] }}
run: |
Expand All @@ -69,4 +75,4 @@ jobs:
- name: Run tests on ${{ matrix.cpu[1] }}
run: |
pip install pytest mpmath
sde -${{ matrix.cpu[0] }} -- python -m pytest tests/ -v --tb=short
sde -${{ matrix.cpu[0] }} -- python -m pytest tests/ -v --tb=short -v -s
11 changes: 7 additions & 4 deletions meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,13 @@ qblas_dep = dependency('qblas', fallback: ['qblas', 'qblas_dep'])
# Try to find SLEEF system-wide first, fall back to subproject if not found
# Required SLEEF version (must match sleef.wrap revision)
required_sleef_version = '3.9.0'
sleef_dep = dependency('sleef', version: '>=' + required_sleef_version, fallback: ['sleef', 'sleef_dep'], required: false)
# Don't use fallback here - we need to call subproject() explicitly later with disable_fma option
sleef_dep = dependency('sleef', version: '>=' + required_sleef_version, required: false)

use_system_sleef = false
fallback_reason = ''

if sleef_dep.found() and sleef_dep.type_name() != 'internal' and sleef_dep.version().startswith(required_sleef_version)
if sleef_dep.found() and sleef_dep.version().startswith(required_sleef_version)
# SLEEF found system-wide - verify quad-precision support
cpp = meson.get_compiler('cpp')
sleefquad_lib = cpp.find_library('sleefquad', required: false)
Expand Down Expand Up @@ -68,7 +69,9 @@ endif
if use_system_sleef
message('Using system-wide SLEEF installation with quad-precision support')
else
sleef_subproj = subproject('sleef')
# Pass disable_fma option to sleef subproject for x86-64-v2 compatibility
message('SLEEF FMA disable option: ' + get_option('disable_fma').to_string())
sleef_subproj = subproject('sleef', default_options: ['disable_fma=' + get_option('disable_fma').to_string()])
sleef_dep = sleef_subproj.get_variable('sleef_dep')
sleefquad_dep = sleef_subproj.get_variable('sleefquad_dep')
warning(fallback_reason)
Expand Down Expand Up @@ -197,4 +200,4 @@ py.extension_module('_quaddtype_main',
install: true,
subdir: 'numpy_quaddtype',
include_directories: [includes, build_includes, pythoncapi_includes],
)
)
3 changes: 3 additions & 0 deletions meson.options
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Project-level boolean switch (default: false). The top-level meson.build
# forwards its value to the bundled SLEEF subproject's own `disable_fma` option.
# The two description literals are joined with '+', so the first one must end
# with ". " to keep the sentences separated in the generated help text.
option('disable_fma', type: 'boolean', value: false,
       description: 'Disable FMA (Fused Multiply-Add) code paths. ' +
                    'Set to true when building for older CPUs like Sandy Bridge that lack FMA support.')
1 change: 1 addition & 0 deletions reinstall.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ rm -rf .mesonpy-*

python -m pip uninstall -y numpy_quaddtype
python -m pip install . -vv 2>&1 | tee build_log.txt
# pip install . --no-build-isolation -v -Csetup-args=-Ddisable_fma=true 2>&1 | tee build_log.txt
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we still need this?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it makes it easy to test the option on x86 machines.


# for debugging and TSAN builds, comment the above line and uncomment all below:
# export CFLAGS="-fsanitize=thread -g -O0"
Expand Down
100 changes: 100 additions & 0 deletions subprojects/packagefiles/sleef/fix-purecfma-scalar-x86.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1234567..abcdefg 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -90,6 +90,10 @@ option(SLEEF_ENFORCE_CUDA "Build fails if CUDA is not supported" OFF)
option(SLEEF_DISABLE_OPENMP "Disable OPENMP" OFF)
option(SLEEF_ENFORCE_OPENMP "Build fails if OPENMP is not supported by the compiler" OFF)

+# Option to disable PURECFMA scalar code path on x86 for x86-64-v2 compatibility
+# When ON, PURECFMA scalar dispatch is disabled (useful for Sandy Bridge support)
+# This can be set dynamically by the build system based on target CPU detection
+option(SLEEF_DISABLE_PURECFMA_SCALAR "Disable PURECFMA scalar code path (for x86-64-v2 compatibility)" OFF)
#

if ((NOT "${CMAKE_C_COMPILER_ID}" STREQUAL "${CMAKE_CXX_COMPILER_ID}") OR
diff --git a/Configure.cmake b/Configure.cmake
index e23f577..f1a2b3c 100644
--- a/Configure.cmake
+++ b/Configure.cmake
@@ -193,7 +193,12 @@ endif()
if(SLEEF_TARGET_PROCESSOR MATCHES "(x86|AMD64|amd64|^i.86$)")
set(SLEEF_ARCH_X86 ON CACHE INTERNAL "True for x86 architecture.")

- set(CLANG_FLAGS_ENABLE_PURECFMA_SCALAR "-mavx2;-mfma")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unfortunately my cmake is pretty terrible: is there a way to set these flags for x86_64-v3 or newer?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From what I have read, querying CPUID seems to be a way to conditionally set not just the flags but the entire dispatch mechanism.
I'll see whether I can get this right, since this unconditional SIMD dispatching is very tightly integrated with SLEEF's FMA code generation.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, thanks! It'd be nice to enable better optimizations when people build for themselves but not critical.

+ # Only set PURECFMA_SCALAR flags if not explicitly disabled
+ if(NOT SLEEF_DISABLE_PURECFMA_SCALAR)
+ set(CLANG_FLAGS_ENABLE_PURECFMA_SCALAR "-mavx2;-mfma")
+ else()
+ message(STATUS "PURECFMA_SCALAR disabled for x86-64-v2 compatibility")
+ endif()
elseif(SLEEF_TARGET_PROCESSOR MATCHES "aarch64|arm64")
set(SLEEF_ARCH_AARCH64 ON CACHE INTERNAL "True for Aarch64 architecture.")
# Aarch64 requires support for advsimdfma4
@@ -220,7 +225,12 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64")
endif()

set(COMPILER_SUPPORTS_PUREC_SCALAR 1)
-set(COMPILER_SUPPORTS_PURECFMA_SCALAR 1)
+# Conditionally enable PURECFMA_SCALAR based on option
+if(SLEEF_DISABLE_PURECFMA_SCALAR)
+ set(COMPILER_SUPPORTS_PURECFMA_SCALAR 0)
+else()
+ set(COMPILER_SUPPORTS_PURECFMA_SCALAR 1)
+endif()

# Compiler feature detection

diff --git a/src/quad/CMakeLists.txt b/src/quad/CMakeLists.txt
index 8e4e261..cc55002 100644
--- a/src/quad/CMakeLists.txt
+++ b/src/quad/CMakeLists.txt
@@ -397,9 +397,17 @@ set_target_properties(qmkdisp PROPERTIES ${COMMON_TARGET_PROPERTIES})

# Target qdispscalar.c

+# Set scalar dispatch backends based on PURECFMA support
+# When SLEEF_DISABLE_PURECFMA_SCALAR is ON, use purec for both slots
+if(COMPILER_SUPPORTS_PURECFMA_SCALAR)
+ set(SCALAR_DISPATCH_BACKENDS "purec" "purecfma")
+else()
+ set(SCALAR_DISPATCH_BACKENDS "purec" "purec")
+endif()
+
add_custom_command(
OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/qdispscalar.c.body
- COMMAND $<TARGET_FILE:qmkdisp> 1 Sleef_quad double int32_t int64_t uint64_t purec purecfma > ${CMAKE_CURRENT_BINARY_DIR}/qdispscalar.c.body
+ COMMAND $<TARGET_FILE:qmkdisp> 1 Sleef_quad double int32_t int64_t uint64_t ${SCALAR_DISPATCH_BACKENDS} > ${CMAKE_CURRENT_BINARY_DIR}/qdispscalar.c.body
DEPENDS qmkdisp
)
sleef_concat_files(
@@ -420,6 +428,11 @@ target_compile_definitions(qdispscalar_obj PRIVATE ${COMMON_TARGET_DEFINITIONS})
target_include_directories(qdispscalar_obj PRIVATE ${sleef_BINARY_DIR}/include)
add_dependencies(qdispscalar_obj qdispscalar.c_generated qrenamedspscalar.h_generated
sleefquad_headers ${TARGET_LIBSLEEF} ${TARGET_HEADERS})
+# Define ENABLE_PURECFMA when PURECFMA is supported, so qdispscalar.c.org
+# can conditionally include the tryFMA() function and SUBST_IF_EXT1 macro
+if(COMPILER_SUPPORTS_PURECFMA_SCALAR)
+ target_compile_definitions(qdispscalar_obj PRIVATE ENABLE_PURECFMA=1)
+endif()
target_sources(sleefquad PRIVATE $<TARGET_OBJECTS:qdispscalar_obj>)

# Target qdispsse2_obj
diff --git a/src/quad/qdispscalar.c.org b/src/quad/qdispscalar.c.org
index c4c1292..48f309c 100644
--- a/src/quad/qdispscalar.c.org
+++ b/src/quad/qdispscalar.c.org
@@ -15,10 +15,14 @@

#include "qdispatcher.h"

+#ifdef ENABLE_PURECFMA
NOEXPORT Sleef_quad sleef_cpuid_QUADFMA_0;
static void tryFMA() { sleef_cpuid_QUADFMA_0 = Sleef_sinq1_u10purecfma(sleef_cpuid_QUADFMA_0); }

#define SUBST_IF_EXT1(funcExt1) if (cpuSupportsExt(tryFMA)) p = funcExt1;
+#else
+#define SUBST_IF_EXT1(funcExt1)
+#endif

//
34 changes: 33 additions & 1 deletion subprojects/packagefiles/sleef/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ project('sleef')

cmake = find_program('cmake')
ninja = find_program('ninja', 'make', required: false)
cc = meson.get_compiler('c')

sleef_build_dir = 'sleef_build'
sleef_install_dir = 'sleef_install'
Expand All @@ -19,8 +20,39 @@ endif
# For building sleef with TSan, delete the sleef subproject and follow the README instructions to build sleef externally.
# Enable SIMD extensions that are OFF by default but required by qblas (will change in future)
sleef_simd_flags = []
sleef_purecfma_flag = []

# Check for force-disable FMA option (for cross-compilation or emulation scenarios)
force_disable_fma = get_option('disable_fma')

# Decide whether -DSLEEF_DISABLE_PURECFMA_SCALAR=ON is passed to the SLEEF CMake
# configure step. Only x86/x86_64 hosts are handled here; on every other CPU
# family sleef_simd_flags and sleef_purecfma_flag keep their empty defaults.
if host_machine.cpu_family() == 'x86_64' or host_machine.cpu_family() == 'x86'
sleef_simd_flags = ['-DSLEEF_ENABLE_SSE2=ON']

if force_disable_fma
# User explicitly requested no FMA
message('FMA explicitly disabled via option - disabling PURECFMA scalar for x86-64-v2 compatibility')
sleef_purecfma_flag = ['-DSLEEF_DISABLE_PURECFMA_SCALAR=ON']
else
# Auto-detect FMA support at configure time
# NOTE(review): cc.compiles() with '-mfma' only shows that the compiler accepts
# the flag and the _mm_fmadd_ps intrinsic; nothing is executed, so this
# presumably does not tell us whether the target CPU has FMA - confirm.
has_fma = cc.compiles('''
#include <immintrin.h>
int main(void) {
__m128 a = _mm_set1_ps(1.0f);
__m128 b = _mm_set1_ps(2.0f);
__m128 c = _mm_set1_ps(3.0f);
__m128 r = _mm_fmadd_ps(a, b, c);
(void)r;
return 0;
}
''', args: ['-mfma'], name: 'FMA instruction support')

# When the probe fails, fall back to the same CMake flag used for the
# explicit opt-out above; when it succeeds, no extra flag is added.
if not has_fma
message('FMA not supported - disabling PURECFMA scalar code path')
sleef_purecfma_flag = ['-DSLEEF_DISABLE_PURECFMA_SCALAR=ON']
else
message('FMA supported - enabling PURECFMA scalar code path')
endif
endif
endif

sleef_configure = run_command([
Expand All @@ -35,7 +67,7 @@ sleef_configure = run_command([
'-DSLEEF_ENABLE_TLFLOAT=OFF', # this is only used for testing in SLEEF, not runtime
'-DCMAKE_POSITION_INDEPENDENT_CODE=ON',
'-DCMAKE_INSTALL_PREFIX=' + meson.current_build_dir() / sleef_install_dir
] + sleef_simd_flags, check: false, capture: true)
] + sleef_simd_flags + sleef_purecfma_flag, check: false, capture: true)

if sleef_configure.returncode() != 0
error('SLEEF CMake configuration failed: ' + sleef_configure.stderr())
Expand Down
3 changes: 3 additions & 0 deletions subprojects/packagefiles/sleef/meson.options
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Subproject-level boolean switch (default: false). When true on an x86/x86_64
# host, the subproject's meson.build skips its compiler probe and passes
# -DSLEEF_DISABLE_PURECFMA_SCALAR=ON to the SLEEF CMake configure step.
option('disable_fma', type: 'boolean', value: false,
description: 'Force disable FMA (Fused Multiply-Add) code paths. ' +
'Use this when targeting x86_64-v2 CPUs (like Sandy Bridge) that lack FMA support.')
1 change: 1 addition & 0 deletions subprojects/sleef.wrap
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ directory=sleef
url=https://github.com/shibatch/sleef.git
revision=3.9.0
patch_directory=sleef
diff_files=sleef/fix-purecfma-scalar-x86.patch

[provide]
sleef = sleef_dep
Expand Down
54 changes: 53 additions & 1 deletion tests/test_quaddtype.py
Original file line number Diff line number Diff line change
Expand Up @@ -5877,4 +5877,56 @@ def test_logical_reduce_on_non_quad_arrays():
with standard NumPy operations like np.logical_or.reduce(np.arange(10.)).
"""
result = np.logical_or.reduce(np.arange(10.))
assert result == True
assert result == True


def test_sleef_purecfma_symbols():
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's a test that just prints the available purecfma functions inside the quaddtype build (if FMA is supported); otherwise it prints nothing.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems to work: in the Sandy Bridge processor logs (FMA is disabled) the output shows nothing

tests/test_quaddtype.py::test_sleef_purecfma_symbols PASSED

but on haswell (fma enabled) output shows some

tests/test_quaddtype.py::test_logical_reduce_on_non_quad_arrays PASSED
tests/test_quaddtype.py::test_sleef_purecfma_symbols 
✓ Found 67 PURECFMA symbols (FMA optimizations enabled)
  Sample symbols:
    000000000014cd90 t sleef_acoshq1_u10purecfma
    000000000013eab0 t sleef_acosq1_u10purecfma
    0000000000129540 t sleef_addq1_u05purecfma
    000000000014a150 t sleef_asinhq1_u10purecfma
    000000000013d3b0 t sleef_asinq1_u10purecfma
    ... and 62 more
PASSED

"""Test that SLEEF PURECFMA symbols are present in the compiled module.

PURECFMA provides optimized scalar code paths using FMA instructions.
This test verifies the module was built with FMA support enabled.
On systems without FMA (e.g., x86-64-v2/Sandy Bridge), the build should
automatically disable PURECFMA, and this test should be skipped.
"""
import subprocess
import shutil
import pathlib

# Skip if nm is not available
nm_path = shutil.which('nm')
if nm_path is None:
pytest.skip("nm command not available")

# Get the path to the compiled shared library (.so file)
module_dir = pathlib.Path(numpy_quaddtype.__file__).parent
so_files = list(module_dir.glob('_quaddtype_main*.so'))

if not so_files:
pytest.skip("Could not find _quaddtype_main shared library")

module_path = str(so_files[0])

try:
result = subprocess.run(
['nm', module_path],
capture_output=True,
text=True,
timeout=30
)
except subprocess.TimeoutExpired:
pytest.skip("nm command timed out")
except FileNotFoundError:
pytest.skip("nm command not found")

purecfma_symbols = [
line for line in result.stdout.lower().splitlines()
if 'purecfma' in line
]

if purecfma_symbols:
print(f"\n✓ Found {len(purecfma_symbols)} PURECFMA symbols (FMA optimizations enabled)")
print(" Sample symbols:")
for sym in purecfma_symbols[:5]:
print(f" {sym}")
if len(purecfma_symbols) > 5:
print(f" ... and {len(purecfma_symbols) - 5} more")