diff --git a/.github/workflows/test_old_cpu.yml b/.github/workflows/test_old_cpu.yml index 0282d8f..ca75cc4 100644 --- a/.github/workflows/test_old_cpu.yml +++ b/.github/workflows/test_old_cpu.yml @@ -51,7 +51,13 @@ jobs: env: LDFLAGS: "-fopenmp" run: | - pip install . --no-build-isolation -v + # For Sandy Bridge (x86-64-v2), we need to disable FMA code paths + # since FMA instructions are not available on that microarchitecture + if [ "${{ matrix.cpu[0] }}" = "snb" ]; then + pip install . --no-build-isolation -v -Csetup-args=-Ddisable_fma=true + else + pip install . --no-build-isolation -v + fi - name: Test import on ${{ matrix.cpu[1] }} run: | @@ -69,4 +75,4 @@ jobs: - name: Run tests on ${{ matrix.cpu[1] }} run: | pip install pytest mpmath - sde -${{ matrix.cpu[0] }} -- python -m pytest tests/ -v --tb=short + sde -${{ matrix.cpu[0] }} -- python -m pytest tests/ -v --tb=short -v -s \ No newline at end of file diff --git a/README.md b/README.md index 1b86d22..151fc1a 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,8 @@ A cross-platform Quad (128-bit) float Data-Type for NumPy. - [Installation from source](#installation-from-source) - [Linux/Unix/macOS](#linuxunixmacos) - [Windows](#windows) + - [Build Options](#build-options) + - [Disabling FMA (Fused Multiply-Add)](#disabling-fma-fused-multiply-add) - [Building with ThreadSanitizer (TSan)](#building-with-threadsanitizer-tsan) - [Building the documentation](#building-the-documentation) - [Serving the documentation](#serving-the-documentation) @@ -138,6 +140,23 @@ python -m pytest tests 8. **Architecture**: The instructions are for x64. For x86 builds, change `-A x64` to `-A Win32`. +## Build Options + +### Disabling FMA (Fused Multiply-Add) + +On older x86-64 CPUs without FMA support (e.g., Sandy Bridge / x86_64-v2), SLEEF's `PURECFMA` scalar code path will cause illegal instruction errors. By default, FMA support is auto-detected at build time, but you can explicitly disable it: + +```bash +pip install . 
-Csetup-args=-Ddisable_fma=true +``` + +This is a workaround for a [SLEEF issue](https://github.com/shibatch/sleef/issues/707) where `PURECFMA` scalar functions are unconditionally compiled with FMA instructions even on systems that don't support them. + +**When to use this option:** +- Building on or for x86_64-v2 (Sandy Bridge era) CPUs +- Cross-compiling for older x86_64 targets +- Running in emulators/VMs that don't expose FMA capability + ## Building with ThreadSanitizer (TSan) This is a development feature to help detect threading issues. To build `numpy-quaddtype` with TSan enabled, follow these steps: diff --git a/meson.build b/meson.build index af0d62e..5101f24 100644 --- a/meson.build +++ b/meson.build @@ -18,12 +18,13 @@ qblas_dep = dependency('qblas', fallback: ['qblas', 'qblas_dep']) # Try to find SLEEF system-wide first, fall back to subproject if not found # Required SLEEF version (must match sleef.wrap revision) required_sleef_version = '3.9.0' -sleef_dep = dependency('sleef', version: '>=' + required_sleef_version, fallback: ['sleef', 'sleef_dep'], required: false) +# Don't use fallback here - we need to call subproject() explicitly later with disable_fma option +sleef_dep = dependency('sleef', version: '>=' + required_sleef_version, required: false) use_system_sleef = false fallback_reason = '' -if sleef_dep.found() and sleef_dep.type_name() != 'internal' and sleef_dep.version().startswith(required_sleef_version) +if sleef_dep.found() and sleef_dep.version().startswith(required_sleef_version) # SLEEF found system-wide - verify quad-precision support cpp = meson.get_compiler('cpp') sleefquad_lib = cpp.find_library('sleefquad', required: false) @@ -68,7 +69,9 @@ endif if use_system_sleef message('Using system-wide SLEEF installation with quad-precision support') else - sleef_subproj = subproject('sleef') + # Pass disable_fma option to sleef subproject for x86-64-v2 compatibility + message('SLEEF FMA disable option: ' + 
get_option('disable_fma').to_string()) + sleef_subproj = subproject('sleef', default_options: ['disable_fma=' + get_option('disable_fma').to_string()]) sleef_dep = sleef_subproj.get_variable('sleef_dep') sleefquad_dep = sleef_subproj.get_variable('sleefquad_dep') warning(fallback_reason) @@ -197,4 +200,4 @@ py.extension_module('_quaddtype_main', install: true, subdir: 'numpy_quaddtype', include_directories: [includes, build_includes, pythoncapi_includes], -) +) \ No newline at end of file diff --git a/meson.options b/meson.options new file mode 100644 index 0000000..d871c14 --- /dev/null +++ b/meson.options @@ -0,0 +1,3 @@ +option('disable_fma', type: 'boolean', value: false, + description: 'Disable FMA (Fused Multiply-Add) code paths. ' + + 'Set to true when building for older CPUs like Sandy Bridge that lack FMA support.') \ No newline at end of file diff --git a/reinstall.sh b/reinstall.sh index f228191..a833d08 100755 --- a/reinstall.sh +++ b/reinstall.sh @@ -10,6 +10,7 @@ rm -rf .mesonpy-* python -m pip uninstall -y numpy_quaddtype python -m pip install . -vv 2>&1 | tee build_log.txt +# pip install . 
--no-build-isolation -v -Csetup-args=-Ddisable_fma=true 2>&1 | tee build_log.txt # for debugging and TSAN builds, comment the above line and uncomment all below: # export CFLAGS="-fsanitize=thread -g -O0" diff --git a/subprojects/packagefiles/sleef/fix-purecfma-scalar-x86.patch b/subprojects/packagefiles/sleef/fix-purecfma-scalar-x86.patch new file mode 100644 index 0000000..9396cfe --- /dev/null +++ b/subprojects/packagefiles/sleef/fix-purecfma-scalar-x86.patch @@ -0,0 +1,100 @@ +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 1234567..abcdefg 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -90,6 +90,10 @@ option(SLEEF_ENFORCE_CUDA "Build fails if CUDA is not supported" OFF) + option(SLEEF_DISABLE_OPENMP "Disable OPENMP" OFF) + option(SLEEF_ENFORCE_OPENMP "Build fails if OPENMP is not supported by the compiler" OFF) + ++# Option to disable PURECFMA scalar code path on x86 for x86-64-v2 compatibility ++# When ON, PURECFMA scalar dispatch is disabled (useful for Sandy Bridge support) ++# This can be set dynamically by the build system based on target CPU detection ++option(SLEEF_DISABLE_PURECFMA_SCALAR "Disable PURECFMA scalar code path (for x86-64-v2 compatibility)" OFF) + # + + if ((NOT "${CMAKE_C_COMPILER_ID}" STREQUAL "${CMAKE_CXX_COMPILER_ID}") OR +diff --git a/Configure.cmake b/Configure.cmake +index e23f577..f1a2b3c 100644 +--- a/Configure.cmake ++++ b/Configure.cmake +@@ -193,7 +193,12 @@ endif() + if(SLEEF_TARGET_PROCESSOR MATCHES "(x86|AMD64|amd64|^i.86$)") + set(SLEEF_ARCH_X86 ON CACHE INTERNAL "True for x86 architecture.") + +- set(CLANG_FLAGS_ENABLE_PURECFMA_SCALAR "-mavx2;-mfma") ++ # Only set PURECFMA_SCALAR flags if not explicitly disabled ++ if(NOT SLEEF_DISABLE_PURECFMA_SCALAR) ++ set(CLANG_FLAGS_ENABLE_PURECFMA_SCALAR "-mavx2;-mfma") ++ else() ++ message(STATUS "PURECFMA_SCALAR disabled for x86-64-v2 compatibility") ++ endif() + elseif(SLEEF_TARGET_PROCESSOR MATCHES "aarch64|arm64") + set(SLEEF_ARCH_AARCH64 ON CACHE INTERNAL "True for 
Aarch64 architecture.") + # Aarch64 requires support for advsimdfma4 +@@ -220,7 +225,12 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64") + endif() + + set(COMPILER_SUPPORTS_PUREC_SCALAR 1) +-set(COMPILER_SUPPORTS_PURECFMA_SCALAR 1) ++# Conditionally enable PURECFMA_SCALAR based on option ++if(SLEEF_DISABLE_PURECFMA_SCALAR) ++ set(COMPILER_SUPPORTS_PURECFMA_SCALAR 0) ++else() ++ set(COMPILER_SUPPORTS_PURECFMA_SCALAR 1) ++endif() + + # Compiler feature detection + +diff --git a/src/quad/CMakeLists.txt b/src/quad/CMakeLists.txt +index 8e4e261..cc55002 100644 +--- a/src/quad/CMakeLists.txt ++++ b/src/quad/CMakeLists.txt +@@ -397,9 +397,17 @@ set_target_properties(qmkdisp PROPERTIES ${COMMON_TARGET_PROPERTIES}) + + # Target qdispscalar.c + ++# Set scalar dispatch backends based on PURECFMA support ++# When SLEEF_DISABLE_PURECFMA_SCALAR is ON, use purec for both slots ++if(COMPILER_SUPPORTS_PURECFMA_SCALAR) ++ set(SCALAR_DISPATCH_BACKENDS "purec" "purecfma") ++else() ++ set(SCALAR_DISPATCH_BACKENDS "purec" "purec") ++endif() ++ + add_custom_command( + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/qdispscalar.c.body +- COMMAND $<TARGET_FILE:qmkdisp> 1 Sleef_quad double int32_t int64_t uint64_t purec purecfma > ${CMAKE_CURRENT_BINARY_DIR}/qdispscalar.c.body ++ COMMAND $<TARGET_FILE:qmkdisp> 1 Sleef_quad double int32_t int64_t uint64_t ${SCALAR_DISPATCH_BACKENDS} > ${CMAKE_CURRENT_BINARY_DIR}/qdispscalar.c.body + DEPENDS qmkdisp + ) + sleef_concat_files( +@@ -420,6 +428,11 @@ target_compile_definitions(qdispscalar_obj PRIVATE ${COMMON_TARGET_DEFINITIONS}) + target_include_directories(qdispscalar_obj PRIVATE ${sleef_BINARY_DIR}/include) + add_dependencies(qdispscalar_obj qdispscalar.c_generated qrenamedspscalar.h_generated + sleefquad_headers ${TARGET_LIBSLEEF} ${TARGET_HEADERS}) ++# Define ENABLE_PURECFMA when PURECFMA is supported, so qdispscalar.c.org ++# can conditionally include the tryFMA() function and SUBST_IF_EXT1 macro ++if(COMPILER_SUPPORTS_PURECFMA_SCALAR) ++ target_compile_definitions(qdispscalar_obj PRIVATE 
ENABLE_PURECFMA=1) ++endif() + target_sources(sleefquad PRIVATE $<TARGET_OBJECTS:qdispscalar_obj>) + + # Target qdispsse2_obj +diff --git a/src/quad/qdispscalar.c.org b/src/quad/qdispscalar.c.org +index c4c1292..48f309c 100644 +--- a/src/quad/qdispscalar.c.org ++++ b/src/quad/qdispscalar.c.org +@@ -15,10 +15,14 @@ + + #include "qdispatcher.h" + ++#ifdef ENABLE_PURECFMA + NOEXPORT Sleef_quad sleef_cpuid_QUADFMA_0; + static void tryFMA() { sleef_cpuid_QUADFMA_0 = Sleef_sinq1_u10purecfma(sleef_cpuid_QUADFMA_0); } + + #define SUBST_IF_EXT1(funcExt1) if (cpuSupportsExt(tryFMA)) p = funcExt1; ++#else ++#define SUBST_IF_EXT1(funcExt1) ++#endif + + // \ No newline at end of file diff --git a/subprojects/packagefiles/sleef/meson.build b/subprojects/packagefiles/sleef/meson.build index 524a550..70f4774 100644 --- a/subprojects/packagefiles/sleef/meson.build +++ b/subprojects/packagefiles/sleef/meson.build @@ -2,6 +2,7 @@ project('sleef') cmake = find_program('cmake') ninja = find_program('ninja', 'make', required: false) +cc = meson.get_compiler('c') sleef_build_dir = 'sleef_build' sleef_install_dir = 'sleef_install' @@ -19,8 +20,41 @@ endif # For building sleef with TSan, delete the sleef subproject and follow the README instructions to build sleef externally. 
# Enable SIMD extensions that are OFF by default but required by qblas (will change in future) sleef_simd_flags = [] +sleef_purecfma_flag = [] + +# Check for force-disable FMA option (for cross-compilation or emulation scenarios) +force_disable_fma = get_option('disable_fma') + if host_machine.cpu_family() == 'x86_64' or host_machine.cpu_family() == 'x86' sleef_simd_flags = ['-DSLEEF_ENABLE_SSE2=ON'] + + if force_disable_fma + # User explicitly requested no FMA + message('FMA explicitly disabled via option - disabling PURECFMA scalar for x86-64-v2 compatibility') + sleef_purecfma_flag = ['-DSLEEF_DISABLE_PURECFMA_SCALAR=ON'] + else + # Auto-detect FMA support at configure time by actually running FMA code + fma_test_result = cc.run(''' + #include <immintrin.h> + int main(void) { + __m128 a = _mm_set1_ps(1.0f); + __m128 b = _mm_set1_ps(2.0f); + __m128 c = _mm_set1_ps(3.0f); + __m128 r = _mm_fmadd_ps(a, b, c); + (void)r; + return 0; + } + ''', args: ['-mfma'], name: 'FMA instruction runtime support') + + has_fma = fma_test_result.compiled() and fma_test_result.returncode() == 0 + + if not has_fma + message('FMA not supported at runtime - disabling PURECFMA scalar code path') + sleef_purecfma_flag = ['-DSLEEF_DISABLE_PURECFMA_SCALAR=ON'] + else + message('FMA supported - enabling PURECFMA scalar code path') + endif + endif endif sleef_configure = run_command([ @@ -35,7 +69,7 @@ sleef_configure = run_command([ '-DSLEEF_ENABLE_TLFLOAT=OFF', # this is only used for testing in SLEEF, not runtime '-DCMAKE_POSITION_INDEPENDENT_CODE=ON', '-DCMAKE_INSTALL_PREFIX=' + meson.current_build_dir() / sleef_install_dir -] + sleef_simd_flags, check: false, capture: true) +] + sleef_simd_flags + sleef_purecfma_flag, check: false, capture: true) if sleef_configure.returncode() != 0 error('SLEEF CMake configuration failed: ' + sleef_configure.stderr()) diff --git a/subprojects/packagefiles/sleef/meson.options b/subprojects/packagefiles/sleef/meson.options new file mode 100644 index 0000000..f636b22 
--- /dev/null +++ b/subprojects/packagefiles/sleef/meson.options @@ -0,0 +1,3 @@ +option('disable_fma', type: 'boolean', value: false, + description: 'Force disable FMA (Fused Multiply-Add) code paths. ' + + 'Use this when targeting x86_64-v2 CPUs (like Sandy Bridge) that lack FMA support.') \ No newline at end of file diff --git a/subprojects/sleef.wrap b/subprojects/sleef.wrap index 920f61e..c2dd5ad 100644 --- a/subprojects/sleef.wrap +++ b/subprojects/sleef.wrap @@ -3,6 +3,7 @@ directory=sleef url=https://github.com/shibatch/sleef.git revision=3.9.0 patch_directory=sleef +diff_files=sleef/fix-purecfma-scalar-x86.patch [provide] sleef = sleef_dep diff --git a/tests/test_quaddtype.py b/tests/test_quaddtype.py index 6736c86..f62f636 100644 --- a/tests/test_quaddtype.py +++ b/tests/test_quaddtype.py @@ -5877,4 +5877,63 @@ def test_logical_reduce_on_non_quad_arrays(): with standard NumPy operations like np.logical_or.reduce(np.arange(10.)). """ result = np.logical_or.reduce(np.arange(10.)) - assert result == True \ No newline at end of file + assert result == True + + +def test_sleef_purecfma_symbols(): + """Test that SLEEF PURECFMA symbols are present in the compiled module. + + PURECFMA provides optimized scalar code paths using FMA instructions. + This test verifies the module was built with FMA support enabled. + On systems without FMA (e.g., x86-64-v2/Sandy Bridge), the build should + automatically disable PURECFMA, and this test should be skipped. 
+ """ + import subprocess + import shutil + import pathlib + + # Skip if nm is not available + nm_path = shutil.which('nm') + if nm_path is None: + pytest.skip("nm command not available") + + # Get the path to the compiled shared library (.so file) + module_dir = pathlib.Path(numpy_quaddtype.__file__).parent + so_files = list(module_dir.glob('_quaddtype_main*.so')) + + if not so_files: + pytest.skip("Could not find _quaddtype_main shared library") + + module_path = str(so_files[0]) + + try: + result = subprocess.run( + [nm_path, module_path], + capture_output=True, + text=True, + timeout=30 + ) + except subprocess.TimeoutExpired: + pytest.skip("nm command timed out") + except FileNotFoundError: + pytest.skip("nm command not found") + + # A non-zero exit status means nm could not read the library + if result.returncode != 0: + pytest.skip(f"nm failed with exit code {result.returncode}") + + purecfma_symbols = [ + line for line in result.stdout.lower().splitlines() + if 'purecfma' in line + ] + + # No PURECFMA symbols means the build disabled the FMA code path + if not purecfma_symbols: + pytest.skip("No PURECFMA symbols found (FMA code paths disabled at build time)") + + print(f"\n✓ Found {len(purecfma_symbols)} PURECFMA symbols (FMA optimizations enabled)") + print(" Sample symbols:") + for sym in purecfma_symbols[:5]: + print(f" {sym}") + if len(purecfma_symbols) > 5: + print(f" ... and {len(purecfma_symbols) - 5} more") \ No newline at end of file