foundation-model-stack · takeshi-yoshimura · Jun 23, 2025 · May 30, 2025 · May 30, 2025 · Jun 6, 2025
diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml
@@ -0,0 +1,102 @@
+name: Build wheel and sdist
+
+on:
+  workflow_dispatch:
+  release:
+    types: [published]
+
+jobs:
+  build-wheels:
+    name: Build wheel for ${{ matrix.python-version }}
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["cp39", "cp310", "cp311", "cp312", "cp313"]  # CPython tags, one job each
+
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v4
+
+    - name: Set up Python
+      uses: actions/setup-python@v5
+      with:
+        python-version: "3.11"  # interpreter that runs cibuildwheel itself, same for every matrix entry
+
+    - name: Install cibuildwheel
+      run: python -m pip install cibuildwheel setuptools pybind11
+
+    - name: Build wheels with cibuildwheel
+      run: cibuildwheel --output-dir wheelhouse
+      env:
+        CIBW_ARCHS: "x86_64"
+        CIBW_BUILD: "${{ matrix.python-version }}-manylinux_x86_64"  # one build identifier per job
+        CIBW_SKIP: "*-musllinux_* *-win32 *-manylinux_i686"
+        CIBW_TEST_SKIP: "*"
+
+    - name: Upload wheel artifact
+      uses: actions/upload-artifact@v4
+      with:
+        name: wheels-${{ matrix.python-version }}  # distinct artifact name per matrix entry
+        path: wheelhouse/*.whl
+
+  build-sdist:
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Set up Python
+      uses: actions/setup-python@v5
+      with:
+        python-version: "3.11"
+
+    - name: Install build
+      run: pip install build
+
+    - name: Build sdist
+      run: python -m build --sdist --outdir dist/
+
+    - name: Upload sdist artifact
+      uses: actions/upload-artifact@v4
+      with:
+        name: sdist  # consumed by the upload job
+        path: dist/*.tar.gz
+
+  upload:
+    needs: [build-wheels, build-sdist]
+    runs-on: ubuntu-latest
+
+    steps:
+    # No `name` is given, so download-artifact@v4 fetches every artifact of
+    # this run (all wheels-* artifacts and sdist), each into its own
+    # dist/<artifact-name>/ subdirectory; a second sdist download is redundant.
+    - name: Download all wheels and the sdist
+      uses: actions/download-artifact@v4
+      with:
+        path: dist
+
+    # Copy the files out of the per-artifact subdirectories into one flat
+    # directory so they can be re-uploaded together as a single artifact.
+    - name: Flatten all artifacts
+      run: |
+        mkdir final_dist
+        find dist -name '*.whl' -exec cp {} final_dist/ \;
+        find dist -name '*.tar.gz' -exec cp {} final_dist/ \;
+
+    - name: Upload all artifact
+      uses: actions/upload-artifact@v4
+      with:
+        name: final_dist
+        path: final_dist
+
+#    - name: Set up Python
+#      uses: actions/setup-python@v5
+#      with:
+#        python-version: "3.11"
+#    - name: Publish to PyPI
+#      run: |
+#        python -m pip install twine
+#        twine upload --non-interactive final_dist/*
+#      env:
+#        TWINE_USERNAME: __token__
+#        TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
diff --git a/.github/workflows/test-paddle.yaml b/.github/workflows/test-paddle.yaml
@@ -32,9 +32,23 @@ jobs:
       - name: Install Python dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install torch==2.5.1 --index-url https://download.pytorch.org/whl/cpu
+          # Default versions; specific Python versions override them below.
+          tf_ver=4.52
+          npy_ver=2.2
+          torch_ver=2.7
+          case "${{ matrix.python-version }}" in
+            "3.9")
+              npy_ver=1.26
+              tf_ver=4.40
+              torch_ver=2.1
+              ;;
+            "3.10") torch_ver=2.3 ;;
+            "3.11") torch_ver=2.5 ;;
+            "3.12") torch_ver=2.6 ;;
+          esac
+          pip install torch==${torch_ver} --index-url https://download.pytorch.org/whl/cpu # transformers requires torch
           pip install paddlepaddle==3.0.0
-          pip install pytest pytest-cov setuptools_scm safetensors transformers==4.52
+          pip install pytest pytest-cov setuptools_scm safetensors transformers==${tf_ver} numpy==${npy_ver}
       - name: Build Package
         run: |
           pip install .
@@ -43,12 +57,11 @@ jobs:
           cd tests
           LIBDIR=`python3 -c "import os; os.chdir('/tmp'); import fastsafetensors; print(os.path.dirname(fastsafetensors.__file__))"`
           mkdir -p /tmp/pytest-log
+          export TEST_FASTSAFETENSORS_FRAMEWORK=paddle
           COVERAGE_FILE=.coverage_0 pytest -s --cov=${LIBDIR} test_fastsafetensors.py > /tmp/pytest-log/0.log 2>&1
-          COVERAGE_FILE=.coverage_1 CUDA_VISIBLE_DEVICES="" pytest -s --cov=${LIBDIR} test_fastsafetensors.py > /tmp/pytest-log/1.log 2>&1
-          COVERAGE_FILE=.coverage_2 torchrun --nnodes=2 --master_addr=0.0.0.0 --master_port=1234 --node_rank=0 --no-python pytest -s --cov=${LIBDIR} test_multi.py > /tmp/pytest-log/2.log 2>&1 &
-          COVERAGE_FILE=.coverage_3 torchrun --nnodes=2 --master_addr=0.0.0.0 --master_port=1234 --node_rank=1 --no-python pytest -s --cov=${LIBDIR} test_multi.py > /tmp/pytest-log/3.log 2>&1
-          python -m paddle.distributed.launch --nproc_per_node 2 run_distributed_paddle_test.py -s --cov=${LIBDIR} test_multi_paddle.py
-          coverage combine .coverage_0 .coverage_1 .coverage_2 .coverage_3 .coverage_4 .coverage_5
+          COVERAGE_FILE=.coverage_1 WORLD_SIZE=2 python3 -m paddle.distributed.launch --nnodes 2 --master 127.0.0.1:1234 --rank 0 test_multi.py --cov=${LIBDIR} -s test_multi.py > /tmp/pytest-log/1.log 2>&1 &
+          COVERAGE_FILE=.coverage_2 WORLD_SIZE=2 python3 -m paddle.distributed.launch --nnodes 2 --master 127.0.0.1:1234 --rank 1 test_multi.py --cov=${LIBDIR} -s test_multi.py > /tmp/pytest-log/2.log 2>&1
+          wait && coverage combine .coverage_*  # rank 0 runs in the background; let it exit before merging coverage data
           coverage html
           mv htmlcov /tmp/pytest-log
       - name: upload pytest log

diff --git a/.github/workflows/test-torch.yaml b/.github/workflows/test-torch.yaml
@@ -56,11 +56,14 @@ jobs:
           cd tests
           LIBDIR=`python3 -c "import os; os.chdir('/tmp'); import fastsafetensors; print(os.path.dirname(fastsafetensors.__file__))"`
           mkdir -p /tmp/pytest-log
+          export TEST_FASTSAFETENSORS_FRAMEWORK=pytorch
           COVERAGE_FILE=.coverage_0 pytest -s --cov=${LIBDIR} test_fastsafetensors.py > /tmp/pytest-log/0.log 2>&1
-          COVERAGE_FILE=.coverage_1 CUDA_VISIBLE_DEVICES="" pytest -s --cov=${LIBDIR} test_fastsafetensors.py > /tmp/pytest-log/1.log 2>&1
-          COVERAGE_FILE=.coverage_2 torchrun --nnodes=2 --master_addr=0.0.0.0 --master_port=1234 --node_rank=0 --no-python pytest -s --cov=${LIBDIR} test_multi.py > /tmp/pytest-log/2.log 2>&1 &
-          COVERAGE_FILE=.coverage_3 torchrun --nnodes=2 --master_addr=0.0.0.0 --master_port=1234 --node_rank=1 --no-python pytest -s --cov=${LIBDIR} test_multi.py > /tmp/pytest-log/3.log 2>&1
-          coverage combine .coverage_0 .coverage_1 .coverage_2 .coverage_3
+          COVERAGE_FILE=.coverage_1 torchrun --nnodes=1 --master_addr=0.0.0.0 --master_port=1234 --node_rank=0 test_multi.py --cov=${LIBDIR} -s test_multi.py > /tmp/pytest-log/1.log 2>&1
+          COVERAGE_FILE=.coverage_2 torchrun --nnodes=4 --master_addr=0.0.0.0 --master_port=1234 --node_rank=0 test_multi.py --cov=${LIBDIR} -s test_multi.py > /tmp/pytest-log/2.log 2>&1 &
+          COVERAGE_FILE=.coverage_3 torchrun --nnodes=4 --master_addr=0.0.0.0 --master_port=1234 --node_rank=1 test_multi.py --cov=${LIBDIR} -s test_multi.py > /tmp/pytest-log/3.log 2>&1 &
+          COVERAGE_FILE=.coverage_4 torchrun --nnodes=4 --master_addr=0.0.0.0 --master_port=1234 --node_rank=2 test_multi.py --cov=${LIBDIR} -s test_multi.py > /tmp/pytest-log/4.log 2>&1 &
+          COVERAGE_FILE=.coverage_5 torchrun --nnodes=4 --master_addr=0.0.0.0 --master_port=1234 --node_rank=3 test_multi.py --cov=${LIBDIR} -s test_multi.py > /tmp/pytest-log/5.log 2>&1
+          wait && coverage combine .coverage_*  # ranks 0-2 run in the background; let them exit before merging coverage data
           coverage html
           mv htmlcov /tmp/pytest-log
       - name: Upload Pytest log

diff --git a/Makefile b/Makefile
@@ -6,24 +6,44 @@ CONCMD := docker
 ifdef PODMAN
 	CONCMD = podman
 endif
-FST_DIR := $(shell python3 -c "import os; os.chdir('/tmp'); import fastsafetensors; print(os.path.dirname(fastsafetensors.__file__))")
 
 .PHONY: install
 install:
 	pip install . --no-cache-dir --no-build-isolation
 
-.PHONY: unittest
+.PHONY: unittest unittest-parallel unittest-paddle unittest-paddle-gpu htmlcov
+
+FST_DIR := $(shell python3 -c "import os; os.chdir('/tmp'); import fastsafetensors; print(os.path.dirname(fastsafetensors.__file__))")
+
 unittest:
-	COVERAGE_FILE=.coverage_0 pytest -s --cov=$(FST_DIR) tests/test_fastsafetensors.py
-	COVERAGE_FILE=.coverage_1 CUDA_VISIBLE_DEVICES="" pytest -s --cov=$(FST_DIR) tests/test_fastsafetensors.py
-	COVERAGE_FILE=.coverage_2 torchrun --nnodes=2 --master_addr=0.0.0.0 --master_port=1234 --node_rank=0 --no-python pytest -s --cov=${FST_DIR} tests/test_multi.py > /tmp/2.log 2>&1 &
-	COVERAGE_FILE=.coverage_3 torchrun --nnodes=2 --master_addr=0.0.0.0 --master_port=1234 --node_rank=1 --no-python pytest -s --cov=${FST_DIR} tests/test_multi.py > /tmp/3.log 2>&1
-	coverage combine .coverage_0 .coverage_1 .coverage_2 .coverage_3
-	coverage html
+	@FST_DIR=$(FST_DIR); \
+	TEST_FASTSAFETENSORS_FRAMEWORK=torch COVERAGE_FILE=.coverage_0 pytest -s --cov=$(FST_DIR) tests/test_fastsafetensors.py && \
+	TEST_FASTSAFETENSORS_FRAMEWORK=torch COVERAGE_FILE=.coverage_1 CUDA_VISIBLE_DEVICES="" pytest -s --cov=$(FST_DIR) tests/test_fastsafetensors.py && \
+	TEST_FASTSAFETENSORS_FRAMEWORK=torch COVERAGE_FILE=.coverage_2 pytest -s --cov=$(FST_DIR) -s tests/test_vllm.py
+
+# 4-node run: ranks 0-2 in the background, rank 3 in the foreground; the 1-node run starts once all four have exited.
+unittest-parallel:
+	TEST_FASTSAFETENSORS_FRAMEWORK=torch COVERAGE_FILE=.coverage_3 torchrun --nnodes=4 --master_addr=0.0.0.0 --master_port=1234 --node_rank=0 tests/test_multi.py --cov=$(FST_DIR) -s tests/test_multi.py > /tmp/3.log 2>&1 & \
+	TEST_FASTSAFETENSORS_FRAMEWORK=torch COVERAGE_FILE=.coverage_4 torchrun --nnodes=4 --master_addr=0.0.0.0 --master_port=1234 --node_rank=1 tests/test_multi.py --cov=$(FST_DIR) -s tests/test_multi.py > /tmp/4.log 2>&1 & \
+	TEST_FASTSAFETENSORS_FRAMEWORK=torch COVERAGE_FILE=.coverage_5 torchrun --nnodes=4 --master_addr=0.0.0.0 --master_port=1234 --node_rank=2 tests/test_multi.py --cov=$(FST_DIR) -s tests/test_multi.py > /tmp/5.log 2>&1 & \
+	TEST_FASTSAFETENSORS_FRAMEWORK=torch COVERAGE_FILE=.coverage_6 torchrun --nnodes=4 --master_addr=0.0.0.0 --master_port=1234 --node_rank=3 tests/test_multi.py --cov=$(FST_DIR) -s tests/test_multi.py > /tmp/6.log 2>&1 && \
+	wait && \
+	TEST_FASTSAFETENSORS_FRAMEWORK=torch COVERAGE_FILE=.coverage_7 torchrun --nnodes=1 --master_addr=0.0.0.0 --master_port=1234 --node_rank=0 tests/test_multi.py --cov=$(FST_DIR) -s tests/test_multi.py > /tmp/7.log 2>&1
 
-.PHONY: integrationtest
-integrationtest:
-	cd tests && COVERAGE_FILE=.coverage pytest -s test_vllm.py
+# The braces keep the backgrounded rank 0 separate from the preceding single-process test; rank 1 runs in the foreground.
+unittest-paddle:
+	@FST_DIR=$(FST_DIR); \
+	TEST_FASTSAFETENSORS_FRAMEWORK=paddle COVERAGE_FILE=.coverage_8 CUDA_VISIBLE_DEVICES="" pytest -s --cov=$(FST_DIR) tests/test_fastsafetensors.py && \
+	{ TEST_FASTSAFETENSORS_FRAMEWORK=paddle COVERAGE_FILE=.coverage_9 CUDA_VISIBLE_DEVICES="" WORLD_SIZE=2 python3 -m paddle.distributed.launch --nnodes 2 --master 127.0.0.1:1234 --rank 0 tests/test_multi.py --cov=$(FST_DIR) -s tests/test_multi.py > /tmp/9.log 2>&1 & \
+	TEST_FASTSAFETENSORS_FRAMEWORK=paddle COVERAGE_FILE=.coverage_10 CUDA_VISIBLE_DEVICES="" WORLD_SIZE=2 python3 -m paddle.distributed.launch --nnodes 2 --master 127.0.0.1:1234 --rank 1 tests/test_multi.py --cov=$(FST_DIR) -s tests/test_multi.py > /tmp/10.log 2>&1 && wait; }
+
+unittest-paddle-gpu:
+	@FST_DIR=$(FST_DIR); \
+	TEST_FASTSAFETENSORS_FRAMEWORK=paddle COVERAGE_FILE=.coverage_11 pytest -s --cov=$(FST_DIR) tests/test_fastsafetensors.py
+
+htmlcov:  # merge every .coverage_* data file, then render the HTML report
+	coverage combine .coverage_*
+	coverage html
 
 .PHONY: builder
 builder: Dockerfile.build
@@ -45,7 +65,20 @@ upload:
 	python3 -m twine upload -u __token__ dist/fastsafetensors-$(shell grep version pyproject.toml | sed -e 's/version = "\([0-9.]\+\)"/\1/g')*
 
 perf/dist:
-	cd perf && python3 -m build
+	cd perf && pip install .
+
+.PHONY: format
+format:  # rewrite the sources in place with black and isort
+	black .
+	isort .
+
+.PHONY: lint
+lint:  # report problems only, never rewrite files; formatting fixes belong to the format target
+	black --check .
+	isort --check-only .
+	flake8 . --select=E9,F63,F7,F82
+	mypy . --ignore-missing-imports
 
+.PHONY: clean
 clean:
 	rm -rf dist build fastsafetensors.egg-info
diff --git a/examples/paddle_case/a_paddle.safetensors → examples/a_paddle.safetensors b/examples/paddle_case/a_paddle.safetensors → examples/a_paddle.safetensors
diff --git a/examples/paddle_case/b_paddle.safetensors → examples/b_paddle.safetensors b/examples/paddle_case/b_paddle.safetensors → examples/b_paddle.safetensors
diff --git a/examples/extract_keys.py b/examples/extract_keys.py
@@ -1,32 +1,38 @@
-import sys
 import os
-import torch
+import sys
+from typing import Dict, List
+
+from safetensors.torch import load_file
+
 from fastsafetensors import SafeTensorsFileLoader, SingleGroup
-from safetensors import safe_open
 
 if __name__ == "__main__":
     if len(sys.argv) != 2:
         print("specify a directory containing safetensors files")
         sys.exit(1)
-    loader = SafeTensorsFileLoader(SingleGroup(), torch.device("cpu"), nogds=True)
+    loader = SafeTensorsFileLoader(SingleGroup(), device="cpu", nogds=True)
     input_file_or_dir = sys.argv[1]
-    src_files = {0: []}
+    src_files: Dict[int, List[str]] = {0: []}  # every input file is listed under key 0
     orig_keys = {}
     if os.path.isdir(input_file_or_dir):
         for dir, _, files in os.walk(input_file_or_dir):
-                for filename in files:
-                    if filename.endswith(".safetensors"):
-                        src_files[0].append(f"{dir}/{filename}")
-    elif os.path.exists(input_file_or_dir) and input_file_or_dir.endswith(".safetensors"):
-         src_files[0].append(input_file_or_dir)
-         with safe_open(input_file_or_dir, framework="pytorch") as f:
-              for key in f.keys():
-                   orig_keys[key] = f.get_tensor(key)
+            for filename in files:
+                if filename.endswith(".safetensors"):
+                    src_files[0].append(f"{dir}/{filename}")
+    elif os.path.exists(input_file_or_dir) and input_file_or_dir.endswith(
+        ".safetensors"
+    ):
+        src_files[0].append(input_file_or_dir)
+        orig_keys = load_file(input_file_or_dir)  # reference tensors, loaded only in single-file mode
     loader.add_filenames(src_files)
     fb = loader.copy_files_to_device()
     if len(orig_keys) > 0:
         for key in loader.get_keys():
-            print(f"\"{key}\",{loader.get_shape(key)},{loader.frames[key].data_offsets},{fb.get_tensor(key).dtype},{orig_keys[key].dtype}")
+            print(  # columns: key, shape, data offsets, loaded dtype, reference dtype
+                f'"{key}",{loader.get_shape(key)},{loader.frames[key].data_offsets},{fb.get_tensor(key).dtype},{orig_keys[key].dtype}'
+            )
     else:
         for key in loader.get_keys():
-            print(f"\"{key}\",{loader.get_shape(key)},{loader.frames[key].data_offsets},{fb.get_tensor(key).dtype}")
+            print(  # columns: key, shape, data offsets, loaded dtype
+                f'"{key}",{loader.get_shape(key)},{loader.frames[key].data_offsets},{fb.get_tensor(key).dtype}'
+            )