diff --git a/.github/workflows/test-paddle.yaml b/.github/workflows/test-paddle.yaml index c34c00d..d225939 100644 --- a/.github/workflows/test-paddle.yaml +++ b/.github/workflows/test-paddle.yaml @@ -48,7 +48,8 @@ jobs: fi pip install torch==${torch_ver} --index-url https://download.pytorch.org/whl/cpu # transformers requires torch pip install paddlepaddle==3.0.0 - pip install pytest pytest-cov setuptools_scm safetensors transformers==${tf_ver} numpy==${npy_ver} + # TOFIX: safetensors version (0.7.0 had a bug around fp8 in Dec 5 2025) + pip install pytest pytest-cov setuptools_scm safetensors==0.6.2 transformers==${tf_ver} numpy==${npy_ver} - name: Build Package run: | pip install . diff --git a/fastsafetensors/copier/gds.py b/fastsafetensors/copier/gds.py index 9fde2af..6888758 100644 --- a/fastsafetensors/copier/gds.py +++ b/fastsafetensors/copier/gds.py @@ -172,12 +172,24 @@ def new_gds_file_copier( raise Exception( "[FAIL] GPU runtime library (libcudart.so or libamdhip64.so) does not exist" ) - if not fstcpp.is_cufile_found() and not nogds: - warnings.warn( - "libcufile.so does not exist but nogds is False. use nogds=True", - UserWarning, + if device_is_not_cpu and not nogds: + gds_supported = fstcpp.is_gds_supported( + device.index if device.index is not None else 0 ) - nogds = True + if gds_supported < 0: + raise Exception(f"is_gds_supported({device.index}) failed") + if not fstcpp.is_cufile_found(): + warnings.warn( + "libcufile.so does not exist but nogds is False. use nogds=True", + UserWarning, + ) + nogds = True + elif gds_supported == 0: + warnings.warn( + "GDS is not supported in this platform but nogds is False. use nogds=True", + UserWarning, + ) + nogds = True if nogds: nogds_reader = fstcpp.nogds_file_reader( diff --git a/fastsafetensors/cpp.pyi b/fastsafetensors/cpp.pyi index 40fcb67..6154ccf 100644 --- a/fastsafetensors/cpp.pyi +++ b/fastsafetensors/cpp.pyi @@ -46,6 +46,7 @@ def is_cufile_found() -> bool: ... def cufile_version() -> int: ... def get_alignment_size() -> int: ... def set_debug_log(debug_log: bool) -> None: ... +def is_gds_supported(deviceId: int) -> int: ... def init_gds() -> int: ... def close_gds() -> int: ... def get_device_pci_bus(deviceId: int) -> str: ... diff --git a/fastsafetensors/cpp/ext.cpp b/fastsafetensors/cpp/ext.cpp index ec89199..76a2630 100644 --- a/fastsafetensors/cpp/ext.cpp +++ b/fastsafetensors/cpp/ext.cpp @@ -148,7 +148,13 @@ static void load_nvidia_functions() { mydlsym(&cuda_fns.cudaDeviceGetPCIBusId, handle_cudart, "cudaDeviceGetPCIBusId"); mydlsym(&cuda_fns.cudaDeviceMalloc, handle_cudart, "cudaMalloc"); mydlsym(&cuda_fns.cudaDeviceFree, handle_cudart, "cudaFree"); - bool success = cuda_fns.cudaMemcpy && cuda_fns.cudaDeviceSynchronize && cuda_fns.cudaHostAlloc && cuda_fns.cudaFreeHost && cuda_fns.cudaDeviceGetPCIBusId && cuda_fns.cudaDeviceMalloc && cuda_fns.cudaDeviceFree; + mydlsym(&cuda_fns.cudaDriverGetVersion, handle_cudart, "cudaDriverGetVersion"); + mydlsym(&cuda_fns.cudaDeviceGetAttribute, handle_cudart, "cudaDeviceGetAttribute"); + bool success = cuda_fns.cudaMemcpy && cuda_fns.cudaDeviceSynchronize; + success = success && cuda_fns.cudaHostAlloc && cuda_fns.cudaFreeHost; + success = success && cuda_fns.cudaDeviceGetPCIBusId && cuda_fns.cudaDeviceMalloc; + success = success && cuda_fns.cudaDeviceFree && cuda_fns.cudaDriverGetVersion; + success = success && cuda_fns.cudaDeviceGetAttribute; if (!success) { cuda_found = false; if (init_log) { @@ -159,6 +165,8 @@ static void load_nvidia_functions() { } } dlclose(handle_cudart); + } else if (init_log) { + fprintf(stderr, "[DEBUG] %s is not installed. fallback\n", cudartLib); } if (!cuda_found) { cuda_fns.cudaMemcpy = cpu_cudaMemcpy; @@ -291,6 +299,32 @@ void init_gil_release_from_env() { } } +int is_gds_supported(int deviceId) +{ +#ifndef USE_ROCM + int gdr_support = 1; + int driverVersion = 0; + cudaError_t err; + + err = cuda_fns.cudaDriverGetVersion(&driverVersion); + if (err != cudaSuccess) { + std::fprintf(stderr, "is_gds_supported: cudaDriverGetVersion failed, deviceId=%d, err=%d\n", deviceId, err); + return -1; + } + + if (driverVersion > 11030) { + err = cuda_fns.cudaDeviceGetAttribute(&gdr_support, cudaDevAttrGPUDirectRDMASupported, deviceId); + if (err != cudaSuccess) { + std::fprintf(stderr, "is_gds_supported: cudaDeviceGetAttribute failed, deviceId=%d, err=%d\n", deviceId, err); + return -1; + } + } + return gdr_support; +#endif + // ROCm does not have GDS + return 0; +} + int init_gds() { CUfileError_t err; @@ -787,6 +821,7 @@ PYBIND11_MODULE(__MOD_NAME__, m) m.def("cufile_version", &cufile_version); m.def("set_debug_log", &set_debug_log); m.def("get_alignment_size", &get_alignment_size); + m.def("is_gds_supported", &is_gds_supported); m.def("init_gds", &init_gds); m.def("close_gds", &close_gds); m.def("get_device_pci_bus", &get_device_pci_bus); diff --git a/fastsafetensors/cpp/ext.hpp b/fastsafetensors/cpp/ext.hpp index 762fa0c..fa26c1e 100644 --- a/fastsafetensors/cpp/ext.hpp +++ b/fastsafetensors/cpp/ext.hpp @@ -39,6 +39,7 @@ typedef struct CUfileError { CUfileOpError err; } CUfileError_t; // Define minimal CUDA/HIP types for both platforms to avoid compile-time dependencies // We load all GPU functions dynamically at runtime via dlopen() typedef enum cudaError { cudaSuccess = 0, cudaErrorMemoryAllocation = 2 } cudaError_t; +enum cudaDeviceAttr {cudaDevAttrGPUDirectRDMASupported = 116}; // Platform-specific enum values - CUDA and HIP have different values for HostToDevice #ifdef USE_ROCM enum cudaMemcpyKind { cudaMemcpyHostToDevice=1, cudaMemcpyDefault = 4 }; @@ -212,6 +213,8 @@ typedef struct ext_funcs { cudaError_t (*cudaDeviceMalloc)(void **, size_t); cudaError_t (*cudaDeviceFree)(void *); int (*numa_run_on_node)(int); + cudaError_t (*cudaDriverGetVersion)(int *); + cudaError_t (*cudaDeviceGetAttribute)(int *, enum cudaDeviceAttr, int); } ext_funcs_t; typedef struct cpp_metrics {