From 4bd5fe2547d72f5c0fc538a00ae73d26eb930158 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 28 Aug 2025 15:18:31 +0000 Subject: [PATCH 01/26] build: Minor tweeks for wheel build MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- deep_gemm/__init__.py | 34 +++++++---- setup.py | 130 ++++++++++++++++++++++++++---------------- 2 files changed, 105 insertions(+), 59 deletions(-) diff --git a/deep_gemm/__init__.py b/deep_gemm/__init__.py index 169e2e6b..7ffa8131 100644 --- a/deep_gemm/__init__.py +++ b/deep_gemm/__init__.py @@ -5,6 +5,7 @@ try: # noinspection PyUnresolvedReferences from .envs import persistent_envs + for key, value in persistent_envs.items(): if key not in os.environ: os.environ[key] = value @@ -23,19 +24,23 @@ # Kernels from deep_gemm_cpp import ( # FP8 GEMMs - fp8_gemm_nt, fp8_gemm_nn, - fp8_gemm_tn, fp8_gemm_tt, + fp8_gemm_nt, + fp8_gemm_nn, + fp8_gemm_tn, + fp8_gemm_tt, m_grouped_fp8_gemm_nt_contiguous, m_grouped_fp8_gemm_nn_contiguous, m_grouped_fp8_gemm_nt_masked, k_grouped_fp8_gemm_tn_contiguous, # BF16 GEMMs - bf16_gemm_nt, bf16_gemm_nn, - bf16_gemm_tn, bf16_gemm_tt, + bf16_gemm_nt, + bf16_gemm_nn, + bf16_gemm_tn, + bf16_gemm_tt, m_grouped_bf16_gemm_nt_contiguous, m_grouped_bf16_gemm_nt_masked, # Layout kernels - transform_sf_into_required_layout + transform_sf_into_required_layout, ) # Some alias for legacy supports @@ -53,15 +58,19 @@ def _find_cuda_home() -> str: # TODO: reuse PyTorch API later # For some PyTorch versions, the original `_find_cuda_home` will initialize CUDA, which is incompatible with process forks - cuda_home = os.environ.get('CUDA_HOME') or os.environ.get('CUDA_PATH') + cuda_home = os.environ.get("CUDA_HOME") or os.environ.get("CUDA_PATH") if cuda_home is None: # noinspection PyBroadException try: - with open(os.devnull, 'w') as devnull: - nvcc = subprocess.check_output(['which', 'nvcc'], stderr=devnull).decode().rstrip('\r\n') + with open(os.devnull, "w") as devnull: + nvcc = ( + subprocess.check_output(["which", "nvcc"], stderr=devnull) + .decode() + .rstrip("\r\n") + ) cuda_home = os.path.dirname(os.path.dirname(nvcc)) except Exception: - cuda_home = '/usr/local/cuda' + cuda_home = "/usr/local/cuda" if not os.path.exists(cuda_home): cuda_home = None assert cuda_home is not None @@ -69,6 +78,9 @@ def _find_cuda_home() -> str: deep_gemm_cpp.init( - os.path.dirname(os.path.abspath(__file__)), # Library root directory path - _find_cuda_home() # CUDA home + os.path.dirname(os.path.abspath(__file__)), # Library root directory path + _find_cuda_home(), # CUDA home ) + + +__version__ = "2.0.0" diff --git a/setup.py b/setup.py index 1f29ad04..fdac0207 100644 --- a/setup.py +++ b/setup.py @@ -1,36 +1,45 @@ import os import setuptools import shutil -import subprocess import torch +import re +import ast from setuptools import find_packages from setuptools.command.build_py import build_py from torch.utils.cpp_extension import CUDAExtension, CUDA_HOME +from pathlib import Path +import subprocess + +SKIP_CUDA_BUILD = os.getenv("DEEP_GEMM_SKIP_CUDA_BUILD", "FALSE") == "TRUE" + current_dir = os.path.dirname(os.path.realpath(__file__)) -cxx_flags = ['-std=c++17', '-O3', '-fPIC', '-Wno-psabi', '-Wno-deprecated-declarations', - f'-D_GLIBCXX_USE_CXX11_ABI={int(torch.compiled_with_cxx11_abi())}'] -sources = ['csrc/python_api.cpp'] -build_include_dirs = [ - f'{CUDA_HOME}/include', - f'{CUDA_HOME}/include/cccl', - 'deep_gemm/include', - 'third-party/cutlass/include', - 'third-party/fmt/include', +cxx_flags = [ + "-std=c++17", + "-O3", + "-fPIC", + "-Wno-psabi", + "-Wno-deprecated-declarations", + f"-D_GLIBCXX_USE_CXX11_ABI={int(torch.compiled_with_cxx11_abi())}", ] -build_libraries = ['cuda', 'cudart', 'nvrtc'] -build_library_dirs = [ - f'{CUDA_HOME}/lib64', - f'{CUDA_HOME}/lib64/stubs' +sources = ["csrc/python_api.cpp"] +build_include_dirs = [ + f"{CUDA_HOME}/include", + f"{CUDA_HOME}/include/cccl", + "deep_gemm/include", + "third-party/cutlass/include", + "third-party/fmt/include", ] +build_libraries = ["cuda", "cudart", "nvrtc"] +build_library_dirs = [f"{CUDA_HOME}/lib64", f"{CUDA_HOME}/lib64/stubs"] third_party_include_dirs = [ - 'third-party/cutlass/include/cute', - 'third-party/cutlass/include/cutlass', + "third-party/cutlass/include/cute", + "third-party/cutlass/include/cutlass", ] # Use runtime API -if int(os.environ.get('DG_JIT_USE_RUNTIME_API', '0')): - cxx_flags.append('-DDG_JIT_USE_RUNTIME_API') +if int(os.environ.get("DG_JIT_USE_RUNTIME_API", "0")): + cxx_flags.append("-DDG_JIT_USE_RUNTIME_API") class CustomBuildPy(build_py): @@ -45,22 +54,30 @@ def run(self): build_py.run(self) def generate_default_envs(self): - code = '# Pre-installed environment variables\n' - code += 'persistent_envs = dict()\n' - for name in ('DG_JIT_CACHE_DIR', 'DG_JIT_PRINT_COMPILER_COMMAND', 'DG_JIT_CPP_STANDARD'): - code += f"persistent_envs['{name}'] = '{os.environ[name]}'\n" if name in os.environ else '' + code = "# Pre-installed environment variables\n" + code += "persistent_envs = dict()\n" + for name in ( + "DG_JIT_CACHE_DIR", + "DG_JIT_PRINT_COMPILER_COMMAND", + "DG_JIT_CPP_STANDARD", + ): + code += ( + f"persistent_envs['{name}'] = '{os.environ[name]}'\n" + if name in os.environ + else "" + ) - with open(os.path.join(self.build_lib, 'deep_gemm', 'envs.py'), 'w') as f: + with open(os.path.join(self.build_lib, "deep_gemm", "envs.py"), "w") as f: f.write(code) def prepare_includes(self): # Create temporary build directory instead of modifying package directory - build_include_dir = os.path.join(self.build_lib, 'deep_gemm/include') + build_include_dir = os.path.join(self.build_lib, "deep_gemm/include") os.makedirs(build_include_dir, exist_ok=True) # Copy third-party includes to the build directory for d in third_party_include_dirs: - dirname = d.split('/')[-1] + dirname = d.split("/")[-1] src_dir = os.path.join(current_dir, d) dst_dir = os.path.join(build_include_dir, dirname) @@ -72,36 +89,53 @@ def prepare_includes(self): shutil.copytree(src_dir, dst_dir) -if __name__ == '__main__': - # noinspection PyBroadException - try: - cmd = ['git', 'rev-parse', '--short', 'HEAD'] - revision = '+' + subprocess.check_output(cmd).decode('ascii').rstrip() - except: - revision = '' +if not SKIP_CUDA_BUILD: + ext_modules = [ + CUDAExtension( + name="deep_gemm_cpp", + sources=sources, + include_dirs=build_include_dirs, + ) + ] +else: + ext_modules = [] + + +NO_LOCAL_VERSION = os.getenv("DEEP_GEMM_NO_LOCAL_VERSION", "FALSE") == "TRUE" + + +def get_package_version(): + with open(Path(current_dir) / "deep_gemm" / "__init__.py", "r") as f: + version_match = re.search(r"^__version__\s*=\s*(.*)$", f.read(), re.MULTILINE) + public_version = ast.literal_eval(version_match.group(1)) + revision = "" + + if not NO_LOCAL_VERSION: + try: + cmd = ["git", "rev-parse", "--short", "HEAD"] + revision = "+" + subprocess.check_output(cmd).decode("ascii").rstrip() + except: + revision = "" + + return f"{public_version}{revision}" + +if __name__ == "__main__": # noinspection PyTypeChecker setuptools.setup( - name='deep_gemm', - version='2.0.0' + revision, - packages=find_packages('.'), + name="deep_gemm", + version=get_package_version(), + packages=find_packages("."), package_data={ - 'deep_gemm': [ - 'include/deep_gemm/**/*', - 'include/cute/**/*', - 'include/cutlass/**/*', + "deep_gemm": [ + "include/deep_gemm/**/*", + "include/cute/**/*", + "include/cutlass/**/*", ] }, - ext_modules=[ - CUDAExtension(name='deep_gemm_cpp', - sources=sources, - include_dirs=build_include_dirs, - libraries=build_libraries, - library_dirs=build_library_dirs, - extra_compile_args=cxx_flags) - ], + ext_modules=ext_modules, zip_safe=False, cmdclass={ - 'build_py': CustomBuildPy, + "build_py": CustomBuildPy, }, ) From 5d743c15d35bb11bf425220fd7caa9c88e489ba0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 28 Aug 2025 15:18:40 +0000 Subject: [PATCH 02/26] ci: Workflows for wheel build MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/workflows/_build.yml | 221 ++++++++++++++++++++++++++++++++++ .github/workflows/build.yml | 67 +++++++++++ .github/workflows/publish.yml | 93 ++++++++++++++ 3 files changed, 381 insertions(+) create mode 100644 .github/workflows/_build.yml create mode 100644 .github/workflows/build.yml create mode 100644 .github/workflows/publish.yml diff --git a/.github/workflows/_build.yml b/.github/workflows/_build.yml new file mode 100644 index 00000000..6a9a1469 --- /dev/null +++ b/.github/workflows/_build.yml @@ -0,0 +1,221 @@ +name: ~Build wheel template + +on: + workflow_call: + inputs: + runs-on: + description: "The runner to use for the build" + required: true + type: string + python-version: + description: "The Python version to use for the build" + required: true + type: string + cuda-version: + description: "The CUDA version to use for the build" + required: true + type: string + torch-version: + description: "The PyTorch version to use for the build" + required: true + type: string + cxx11_abi: + description: "The C++11 ABI to use for the build" + required: true + type: string + upload-to-release: + description: "Upload wheel to this release" + required: false + type: boolean + default: false + release-version: + description: "Upload wheel to this release" + required: false + type: string + +defaults: + run: + shell: bash -x -e -u -o pipefail {0} + +jobs: + build-wheel: + runs-on: ${{ inputs.runs-on }} + name: Build wheel (${{ inputs.release-version }}-${{ inputs.python-version }}-${{ inputs.cuda-version }}-${{ inputs.torch-version }}-${{ inputs.cxx11_abi }}) + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + ref: ${{ inputs.release-version }} + submodules: recursive + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ inputs.python-version }} + + - name: Set CUDA and PyTorch versions + run: | + echo "MATRIX_CUDA_VERSION=$(echo ${{ inputs.cuda-version }} | awk -F \. {'print $1 $2'})" >> $GITHUB_ENV + echo "MATRIX_TORCH_VERSION=$(echo ${{ inputs.torch-version }} | awk -F \. {'print $1 "." $2'})" >> $GITHUB_ENV + echo "WHEEL_CUDA_VERSION=$(echo ${{ inputs.cuda-version }} | awk -F \. {'print $1'})" >> $GITHUB_ENV + echo "MATRIX_PYTHON_VERSION=$(echo ${{ inputs.python-version }} | awk -F \. {'print $1 $2'})" >> $GITHUB_ENV + + - name: Free up disk space + if: ${{ runner.os == 'Linux' }} + # https://github.com/easimon/maximize-build-space/blob/master/action.yml + # https://github.com/easimon/maximize-build-space/tree/test-report + run: | + sudo rm -rf /usr/share/dotnet + sudo rm -rf /opt/ghc + sudo rm -rf /opt/hostedtoolcache/CodeQL + + - name: Set up swap space + if: runner.os == 'Linux' + uses: pierotofy/set-swap-space@v1.0 + with: + swap-size-gb: 10 + + - name: Install CUDA ${{ inputs.cuda-version }} + if: ${{ inputs.cuda-version != 'cpu' }} + uses: Jimver/cuda-toolkit@v0.2.26 + id: cuda-toolkit + with: + cuda: ${{ inputs.cuda-version }} + linux-local-args: '["--toolkit"]' + # default method is "local", and we're hitting some error with caching for CUDA 11.8 and 12.1 + # method: ${{ (inputs.cuda-version == '11.8.0' || inputs.cuda-version == '12.1.0') && 'network' || 'local' }} + method: "network" + + - name: Install additional CUDA libraries + run: | + CUDA_VERSION=$(echo ${{ inputs.cuda-version }} | awk -F \. {'print $1 "-" $2'}) + sudo apt-get update + sudo apt-get install -y libcusparse-$CUDA_VERSION libcusolver-$CUDA_VERSION + sudo apt-get clean + + - name: Install PyTorch ${{ inputs.torch-version }}+cu${{ inputs.cuda-version }} + run: | + pip install --upgrade pip + # With python 3.13 and torch 2.5.1, unless we update typing-extensions, we get error + # AttributeError: attribute '__default__' of 'typing.ParamSpec' objects is not writable + pip install typing-extensions==4.12.2 + # We want to figure out the CUDA version to download pytorch + # e.g. we can have system CUDA version being 11.7 but if torch==1.12 then we need to download the wheel from cu116 + # see https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix + # This code is ugly, maybe there's a better way to do this. + export TORCH_CUDA_VERSION=$(python -c "from os import environ as env; \ + minv = {'2.4': 118, '2.5': 118, '2.6': 118, '2.7': 118, '2.8': 126}[env['MATRIX_TORCH_VERSION']]; \ + maxv = {'2.4': 124, '2.5': 124, '2.6': 126, '2.7': 128, '2.8': 129}[env['MATRIX_TORCH_VERSION']]; \ + print(minv if int(env['MATRIX_CUDA_VERSION']) < 120 else maxv)" \ + ) + if [[ ${{ inputs.torch-version }} == *"dev"* ]]; then + # pip install --no-cache-dir --pre torch==${{ inputs.torch-version }} --index-url https://download.pytorch.org/whl/nightly/cu${TORCH_CUDA_VERSION} + # Can't use --no-deps because we need cudnn etc. + # Hard-coding this version of pytorch-triton for torch 2.6.0.dev20241001 + pip install jinja2 + pip install https://download.pytorch.org/whl/nightly/pytorch_triton-3.1.0%2Bcf34004b8a-cp${MATRIX_PYTHON_VERSION}-cp${MATRIX_PYTHON_VERSION}-linux_x86_64.whl + pip install --no-cache-dir --pre https://download.pytorch.org/whl/nightly/cu${TORCH_CUDA_VERSION}/torch-${{ inputs.torch-version }}%2Bcu${TORCH_CUDA_VERSION}-cp${MATRIX_PYTHON_VERSION}-cp${MATRIX_PYTHON_VERSION}-linux_x86_64.whl + else + pip install --no-cache-dir torch==${{ inputs.torch-version }} --index-url https://download.pytorch.org/whl/cu${TORCH_CUDA_VERSION} + fi + nvcc --version + python --version + python -c "import torch; print('PyTorch:', torch.__version__)" + python -c "import torch; print('CUDA:', torch.version.cuda)" + python -c "from torch.utils import cpp_extension; print (cpp_extension.CUDA_HOME)" + + - name: Restore build cache + uses: actions/cache/restore@v4 + with: + path: build.tar + key: build-${{ inputs.release-version }}-${{ inputs.python-version }}-${{ inputs.cuda-version }}-${{ inputs.torch-version }}-${{ inputs.cxx11_abi }}-${{ github.run_number }}-${{ github.run_attempt }} + restore-keys: | + build-${{ inputs.release-version }}-${{ inputs.python-version }}-${{ inputs.cuda-version }}-${{ inputs.torch-version }}-${{ inputs.cxx11_abi }}- + + - name: Unpack build cache + run: | + echo ::group::Adjust timestamps + sudo find / -exec touch -t 197001010000 {} + || true + echo ::endgroup:: + + if [ -f build.tar ]; then + find . -mindepth 1 -maxdepth 1 ! -name 'build.tar' -exec rm -rf {} + + tar -xpvf build.tar -C . + else + echo "No build.tar found, skipping" + fi + + ls -al ./ + ls -al build/ || true + ls -al csrc/ || true + + - name: Build wheel + id: build_wheel + run: | + # We want setuptools >= 49.6.0 otherwise we can't compile the extension if system CUDA version is 11.7 and pytorch cuda version is 11.6 + # https://github.com/pytorch/pytorch/blob/664058fa83f1d8eede5d66418abff6e20bd76ca8/torch/utils/cpp_extension.py#L810 + # However this still fails so I'm using a newer version of setuptools + pip install setuptools==75.8.0 + pip install ninja packaging wheel + export PATH=/usr/local/nvidia/bin:/usr/local/nvidia/lib64:$PATH + export LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/cuda/lib64:$LD_LIBRARY_PATH + # Limit MAX_JOBS otherwise the github runner goes OOM + # nvcc 11.8 can compile with 2 jobs, but nvcc 12.3 goes OOM + + export MAX_JOBS=$([ "$MATRIX_CUDA_VERSION" == "129" ] && echo 1 || echo 2) + export NVCC_THREADS=2 + export TORCH_CUDA_ARCH_LIST="7.0 7.2 7.5 8.0 8.6 8.7 9.0+PTX" + + # 5h timeout since GH allows max 6h and we want some buffer + EXIT_CODE=0 + timeout 5h python setup.py bdist_wheel --dist-dir=dist || EXIT_CODE=$? + + if [ $EXIT_CODE -eq 0 ]; then + tmpname=cu${WHEEL_CUDA_VERSION}torch${MATRIX_TORCH_VERSION}cxx11abi${{ inputs.cxx11_abi }} + wheel_name=$(ls dist/*whl | xargs -n 1 basename | sed "s/-/+$tmpname-/2") + ls dist/*whl |xargs -I {} mv {} dist/${wheel_name} + echo "wheel_name=${wheel_name}" >> $GITHUB_ENV + fi + + # Store exit code in GitHub env for later steps + echo "build_exit_code=$EXIT_CODE" | tee -a "$GITHUB_OUTPUT" + + # Do not fail the job if timeout killed the build + exit $EXIT_CODE + + - name: Log build logs after timeout + if: always() && steps.build_wheel.outputs.build_exit_code == 124 + run: | + ls -al ./ + tar -cvf build.tar . --atime-preserve=replace + + - name: Save build cache timeout + if: always() && steps.build_wheel.outputs.build_exit_code == 124 + uses: actions/cache/save@v4 + with: + key: build-${{ inputs.release-version }}-${{ inputs.python-version }}-${{ inputs.cuda-version }}-${{ inputs.torch-version }}-${{ inputs.cxx11_abi }}-${{ github.run_number }}-${{ github.run_attempt }} + path: build.tar + + - name: Log Built Wheels + run: | + ls dist + + - name: Get Release with tag + id: get_current_release + uses: joutvhu/get-release@v1 + with: + tag_name: ${{ inputs.release-version }} + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Upload Release Asset + id: upload_release_asset + if: inputs.upload-to-release + uses: actions/upload-release-asset@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + upload_url: ${{ steps.get_current_release.outputs.upload_url }} + asset_path: ./dist/${{env.wheel_name}} + asset_name: ${{env.wheel_name}} + asset_content_type: application/* diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 00000000..fbf33c9f --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,67 @@ +name: Build wheels + +on: + workflow_call: + inputs: + runs-on: + description: "The runner to use for the build" + required: true + type: string + default: ubuntu-22.04 + python-version: + description: "The Python version to use for the build" + required: true + type: string + cuda-version: + description: "The CUDA version to use for the build" + required: true + type: string + torch-version: + description: "The PyTorch version to use for the build" + required: true + type: string + cxx11_abi: + description: "Enable torch flag C++11 ABI (TRUE/FALSE)" + required: true + type: string + upload-to-release: + description: "Upload wheel to this release" + required: false + type: boolean + default: false + release-version: + description: "Upload wheel to this release" + required: false + type: string + push: + +jobs: + build-wheels: + uses: ./.github/workflows/_build.yml + strategy: + fail-fast: false + matrix: + # Using ubuntu-22.04 instead of 24.04 for more compatibility (glibc). Ideally we'd use the + # manylinux docker image, but I haven't figured out how to install CUDA on manylinux. + os: [ubuntu-22.04] + python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] + torch-version: ["2.4.0", "2.5.1", "2.6.0", "2.7.1", "2.8.0"] + cuda-version: ["12.9.1"] + # We need separate wheels that either uses C++11 ABI (-D_GLIBCXX_USE_CXX11_ABI) or not. + # Pytorch wheels currently don't use it, but nvcr images have Pytorch compiled with C++11 ABI. + # Without this we get import error (undefined symbol: _ZN3c105ErrorC2ENS_14SourceLocationESs) + # when building without C++11 ABI and using it on nvcr images. + cxx11_abi: ["FALSE", "TRUE"] + exclude: + # see https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix + # Pytorch < 2.5 does not support Python 3.13 + - torch-version: "2.4.0" + python-version: "3.13" + with: + runs-on: ${{ inputs.runs-on || matrix.os }} + python-version: ${{ inputs.python-version || matrix.python-version }} + cuda-version: ${{ inputs.cuda-version || matrix.cuda-version }} + torch-version: ${{ inputs.torch-version || matrix.torch-version }} + cxx11_abi: ${{ inputs.cxx11_abi || matrix.cxx11_abi }} + upload-to-release: ${{ inputs.upload-to-release || matrix.upload-to-release }} + release-version: ${{ inputs.release-version || matrix.release-version }} diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 00000000..f0c93eae --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,93 @@ +# This workflow will: +# - Create a new Github release +# - Build wheels for supported architectures +# - Deploy the wheels to the Github release +# - Release the static code to PyPi +# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries + +name: Build wheels and deploy + +on: + create: + tags: + - v* + +jobs: + setup_release: + name: Create Release + runs-on: ubuntu-latest + outputs: + release-version: ${{ steps.extract_branch.outputs.branch }} + steps: + - name: Get the tag version + id: extract_branch + run: echo ::set-output name=branch::${GITHUB_REF#refs/tags/} + shell: bash + - name: Create Release + id: create_release + uses: actions/create-release@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + tag_name: ${{ steps.extract_branch.outputs.branch }} + release_name: ${{ steps.extract_branch.outputs.branch }} + + build_wheels: + name: Build Wheel + needs: setup_release + strategy: + fail-fast: false + matrix: + # Using ubuntu-22.04 instead of 24.04 for more compatibility (glibc). Ideally we'd use the + # manylinux docker image, but I haven't figured out how to install CUDA on manylinux. + os: [ubuntu-22.04] + python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] + torch-version: ["2.4.0", "2.5.1", "2.6.0", "2.7.1", "2.8.0"] + cuda-version: ["12.9.1"] + # We need separate wheels that either uses C++11 ABI (-D_GLIBCXX_USE_CXX11_ABI) or not. + # Pytorch wheels currently don't use it, but nvcr images have Pytorch compiled with C++11 ABI. + # Without this we get import error (undefined symbol: _ZN3c105ErrorC2ENS_14SourceLocationESs) + # when building without C++11 ABI and using it on nvcr images. + cxx11_abi: ["FALSE", "TRUE"] + exclude: + # see https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix + # Pytorch < 2.5 does not support Python 3.13 + - torch-version: "2.4.0" + python-version: "3.13" + uses: ./.github/workflows/_build.yml + with: + runs-on: ${{ matrix.os }} + python-version: ${{ matrix.python-version }} + cuda-version: ${{ matrix.cuda-version }} + torch-version: ${{ matrix.torch-version }} + cxx11_abi: ${{ matrix.cxx11_abi }} + release-version: ${{ needs.setup_release.outputs.release-version }} + upload-to-release: true + + publish_package: + name: Publish package + needs: [build_wheels] + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.10" + - name: Install dependencies + run: | + pip install ninja packaging wheel twine + # Install latest setuptools with support for pypi metadata 2.2 (improved compat w/ uv) + pip install setuptools==75.8.0 + # We don't want to download anything CUDA-related here + pip install torch --index-url https://download.pytorch.org/whl/cpu + - name: Build core package + env: + GROUPED_GEMM_SKIP_CUDA_BUILD: "TRUE" + run: | + python setup.py sdist --dist-dir=dist + - name: Deploy + env: + TWINE_USERNAME: "__token__" + TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} + run: | + python -m twine upload dist/* From 30cf950e231cda2760f8c74a86c68a5ce1333d1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 28 Aug 2025 15:19:58 +0000 Subject: [PATCH 03/26] fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/workflows/build.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index fbf33c9f..d1d26e86 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -63,5 +63,5 @@ jobs: cuda-version: ${{ inputs.cuda-version || matrix.cuda-version }} torch-version: ${{ inputs.torch-version || matrix.torch-version }} cxx11_abi: ${{ inputs.cxx11_abi || matrix.cxx11_abi }} - upload-to-release: ${{ inputs.upload-to-release || matrix.upload-to-release }} - release-version: ${{ inputs.release-version || matrix.release-version }} + upload-to-release: ${{ inputs.upload-to-release || false }} + release-version: ${{ inputs.release-version || v2.0.0 }} From 9b1e96085f987e25ff51c76744105a9d0d41cf19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 28 Aug 2025 15:20:23 +0000 Subject: [PATCH 04/26] fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index d1d26e86..3bd828f9 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -64,4 +64,4 @@ jobs: torch-version: ${{ inputs.torch-version || matrix.torch-version }} cxx11_abi: ${{ inputs.cxx11_abi || matrix.cxx11_abi }} upload-to-release: ${{ inputs.upload-to-release || false }} - release-version: ${{ inputs.release-version || v2.0.0 }} + release-version: ${{ inputs.release-version || 'v2.0.0' }} From 9288860c20b4252269c3a025b9f1bf449713f26e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 28 Aug 2025 15:57:03 +0000 Subject: [PATCH 05/26] build: Add CachedWheel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/workflows/publish.yml | 3 +- setup.py | 122 +++++++++++++++++++++++++++++----- 2 files changed, 107 insertions(+), 18 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index f0c93eae..db847e83 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -82,7 +82,8 @@ jobs: pip install torch --index-url https://download.pytorch.org/whl/cpu - name: Build core package env: - GROUPED_GEMM_SKIP_CUDA_BUILD: "TRUE" + DEEP_GEMM_NO_LOCAL_VERSION: "TRUE" + DEEP_GEMM_SKIP_CUDA_BUILD: "TRUE" run: | python setup.py sdist --dist-dir=dist - name: Deploy diff --git a/setup.py b/setup.py index fdac0207..a4d9ae3f 100644 --- a/setup.py +++ b/setup.py @@ -1,3 +1,4 @@ +from calendar import c import os import setuptools import shutil @@ -9,11 +10,84 @@ from torch.utils.cpp_extension import CUDAExtension, CUDA_HOME from pathlib import Path import subprocess +import sys +import platform +from packaging.version import parse +from wheel.bdist_wheel import bdist_wheel as _bdist_wheel +import urllib SKIP_CUDA_BUILD = os.getenv("DEEP_GEMM_SKIP_CUDA_BUILD", "FALSE") == "TRUE" +NO_LOCAL_VERSION = os.getenv("DEEP_GEMM_NO_LOCAL_VERSION", "FALSE") == "TRUE" +FORCE_BUILD = os.getenv("DEEP_GEMM_FORCE_BUILD", "FALSE") == "TRUE" - +BASE_WHEEL_URL = ( + "https://github.com/DeepSeek-AI/DeepGEMM/releases/download/{tag_name}/{wheel_name}" +) +PACKAGE_NAME = "deep_gemm" current_dir = os.path.dirname(os.path.realpath(__file__)) + + +def get_package_version(): + with open(Path(current_dir) / "deep_gemm" / "__init__.py", "r") as f: + version_match = re.search(r"^__version__\s*=\s*(.*)$", f.read(), re.MULTILINE) + public_version = ast.literal_eval(version_match.group(1)) + revision = "" + + if not NO_LOCAL_VERSION: + try: + cmd = ["git", "rev-parse", "--short", "HEAD"] + revision = "+" + subprocess.check_output(cmd).decode("ascii").rstrip() + except: + revision = "" + + return f"{public_version}{revision}" + + +def get_platform(): + """ + Returns the platform name as used in wheel filenames. + """ + if sys.platform.startswith("linux"): + return f"linux_{platform.uname().machine}" + elif sys.platform == "darwin": + mac_version = ".".join(platform.mac_ver()[0].split(".")[:2]) + return f"macosx_{mac_version}_x86_64" + elif sys.platform == "win32": + return "win_amd64" + else: + raise ValueError("Unsupported platform: {}".format(sys.platform)) + + +def get_wheel_url(): + torch_version_raw = parse(torch.__version__) + python_version = f"cp{sys.version_info.major}{sys.version_info.minor}" + platform_name = get_platform() + grouped_gemm_version = get_package_version() + torch_version = f"{torch_version_raw.major}.{torch_version_raw.minor}" + cxx11_abi = str(torch._C._GLIBCXX_USE_CXX11_ABI).upper() + + # Determine the version numbers that will be used to determine the correct wheel + # We're using the CUDA version used to build torch, not the one currently installed + # _, cuda_version_raw = get_cuda_bare_metal_version(CUDA_HOME) + torch_cuda_version = parse(torch.version.cuda) + # For CUDA 11, we only compile for CUDA 11.8, and for CUDA 12 we only compile for CUDA 12.3 + # to save CI time. Minor versions should be compatible. + torch_cuda_version = ( + parse("11.8") if torch_cuda_version.major == 11 else parse("12.3") + ) + # cuda_version = f"{cuda_version_raw.major}{cuda_version_raw.minor}" + cuda_version = f"{torch_cuda_version.major}" + + # Determine wheel URL based on CUDA version, torch version, python version and OS + wheel_filename = f"{PACKAGE_NAME}-{grouped_gemm_version}+cu{cuda_version}torch{torch_version}cxx11abi{cxx11_abi}-{python_version}-{python_version}-{platform_name}.whl" + + wheel_url = BASE_WHEEL_URL.format( + tag_name=f"v{grouped_gemm_version}", wheel_name=wheel_filename + ) + + return wheel_url, wheel_filename + + cxx_flags = [ "-std=c++17", "-O3", @@ -101,23 +175,39 @@ def prepare_includes(self): ext_modules = [] -NO_LOCAL_VERSION = os.getenv("DEEP_GEMM_NO_LOCAL_VERSION", "FALSE") == "TRUE" +class CachedWheelsCommand(_bdist_wheel): + """ + The CachedWheelsCommand plugs into the default bdist wheel, which is ran by pip when it cannot + find an existing wheel (which is currently the case for all grouped gemm installs). We use + the environment parameters to detect whether there is already a pre-built version of a compatible + wheel available and short-circuits the standard full build pipeline. + """ + def run(self): + if FORCE_BUILD: + return super().run() -def get_package_version(): - with open(Path(current_dir) / "deep_gemm" / "__init__.py", "r") as f: - version_match = re.search(r"^__version__\s*=\s*(.*)$", f.read(), re.MULTILINE) - public_version = ast.literal_eval(version_match.group(1)) - revision = "" - - if not NO_LOCAL_VERSION: + wheel_url, wheel_filename = get_wheel_url() + print("Guessing wheel URL: ", wheel_url) try: - cmd = ["git", "rev-parse", "--short", "HEAD"] - revision = "+" + subprocess.check_output(cmd).decode("ascii").rstrip() - except: - revision = "" + urllib.request.urlretrieve(wheel_url, wheel_filename) - return f"{public_version}{revision}" + # Make the archive + # Lifted from the root wheel processing command + # https://github.com/pypa/wheel/blob/cf71108ff9f6ffc36978069acb28824b44ae028e/src/wheel/bdist_wheel.py#LL381C9-L381C85 + if not os.path.exists(self.dist_dir): + os.makedirs(self.dist_dir) + + impl_tag, abi_tag, plat_tag = self.get_tag() + archive_basename = f"{self.wheel_dist_name}-{impl_tag}-{abi_tag}-{plat_tag}" + + wheel_path = os.path.join(self.dist_dir, archive_basename + ".whl") + print("Raw wheel path", wheel_path) + os.rename(wheel_filename, wheel_path) + except (urllib.error.HTTPError, urllib.error.URLError): + print("Precompiled wheel not found. Building from source...") + # If the wheel could not be downloaded, build from source + super().run() if __name__ == "__main__": @@ -135,7 +225,5 @@ def get_package_version(): }, ext_modules=ext_modules, zip_safe=False, - cmdclass={ - "build_py": CustomBuildPy, - }, + cmdclass={"build_py": CustomBuildPy, "bdist_wheel": CachedWheelsCommand}, ) From 07eab72c2f927e4b72ac080096f578315529b695 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Mon, 1 Sep 2025 09:14:08 +0000 Subject: [PATCH 06/26] add version to init MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- deep_gemm/__init__.py | 49 ++++++++++++++++++------------------------- 1 file changed, 20 insertions(+), 29 deletions(-) diff --git a/deep_gemm/__init__.py b/deep_gemm/__init__.py index 7ffa8131..3edf6459 100644 --- a/deep_gemm/__init__.py +++ b/deep_gemm/__init__.py @@ -5,7 +5,6 @@ try: # noinspection PyUnresolvedReferences from .envs import persistent_envs - for key, value in persistent_envs.items(): if key not in os.environ: os.environ[key] = value @@ -14,31 +13,29 @@ # Configs import deep_gemm_cpp -from deep_gemm_cpp import ( - set_num_sms, - get_num_sms, - set_tc_util, - get_tc_util, -) # Kernels from deep_gemm_cpp import ( + bf16_gemm_nn, + # BF16 GEMMs + bf16_gemm_nt, + bf16_gemm_tn, + bf16_gemm_tt, + fp8_gemm_nn, # FP8 GEMMs fp8_gemm_nt, - fp8_gemm_nn, fp8_gemm_tn, fp8_gemm_tt, - m_grouped_fp8_gemm_nt_contiguous, - m_grouped_fp8_gemm_nn_contiguous, - m_grouped_fp8_gemm_nt_masked, + get_num_sms, + get_tc_util, k_grouped_fp8_gemm_tn_contiguous, - # BF16 GEMMs - bf16_gemm_nt, - bf16_gemm_nn, - bf16_gemm_tn, - bf16_gemm_tt, m_grouped_bf16_gemm_nt_contiguous, m_grouped_bf16_gemm_nt_masked, + m_grouped_fp8_gemm_nn_contiguous, + m_grouped_fp8_gemm_nt_contiguous, + m_grouped_fp8_gemm_nt_masked, + set_num_sms, + set_tc_util, # Layout kernels transform_sf_into_required_layout, ) @@ -49,8 +46,7 @@ bf16_m_grouped_gemm_nt_masked = m_grouped_bf16_gemm_nt_masked # Some utils -from . import testing -from . import utils +from . import testing, utils from .utils import * @@ -58,19 +54,15 @@ def _find_cuda_home() -> str: # TODO: reuse PyTorch API later # For some PyTorch versions, the original `_find_cuda_home` will initialize CUDA, which is incompatible with process forks - cuda_home = os.environ.get("CUDA_HOME") or os.environ.get("CUDA_PATH") + cuda_home = os.environ.get('CUDA_HOME') or os.environ.get('CUDA_PATH') if cuda_home is None: # noinspection PyBroadException try: - with open(os.devnull, "w") as devnull: - nvcc = ( - subprocess.check_output(["which", "nvcc"], stderr=devnull) - .decode() - .rstrip("\r\n") - ) + with open(os.devnull, 'w') as devnull: + nvcc = subprocess.check_output(['which', 'nvcc'], stderr=devnull).decode().rstrip('\r\n') cuda_home = os.path.dirname(os.path.dirname(nvcc)) except Exception: - cuda_home = "/usr/local/cuda" + cuda_home = '/usr/local/cuda' if not os.path.exists(cuda_home): cuda_home = None assert cuda_home is not None @@ -78,9 +70,8 @@ def _find_cuda_home() -> str: deep_gemm_cpp.init( - os.path.dirname(os.path.abspath(__file__)), # Library root directory path - _find_cuda_home(), # CUDA home + os.path.dirname(os.path.abspath(__file__)), # Library root directory path + _find_cuda_home() # CUDA home ) - __version__ = "2.0.0" From 442afc63767ac95147b693b58ba1aa67c1778adc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Mon, 1 Sep 2025 09:16:30 +0000 Subject: [PATCH 07/26] revert MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- deep_gemm/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/deep_gemm/__init__.py b/deep_gemm/__init__.py index 3edf6459..588b781d 100644 --- a/deep_gemm/__init__.py +++ b/deep_gemm/__init__.py @@ -46,7 +46,8 @@ bf16_m_grouped_gemm_nt_masked = m_grouped_bf16_gemm_nt_masked # Some utils -from . import testing, utils +from . import testing +from . import utils from .utils import * From a0768a61b2a91295e0f516d034d2720ba679a467 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Mon, 1 Sep 2025 09:17:58 +0000 Subject: [PATCH 08/26] revert MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- setup.py | 217 +++++++++++++------------------------------------------ 1 file changed, 52 insertions(+), 165 deletions(-) diff --git a/setup.py b/setup.py index a4d9ae3f..ab0d0f3f 100644 --- a/setup.py +++ b/setup.py @@ -1,20 +1,11 @@ -from calendar import c import os import setuptools import shutil +import subprocess import torch -import re -import ast from setuptools import find_packages from setuptools.command.build_py import build_py from torch.utils.cpp_extension import CUDAExtension, CUDA_HOME -from pathlib import Path -import subprocess -import sys -import platform -from packaging.version import parse -from wheel.bdist_wheel import bdist_wheel as _bdist_wheel -import urllib SKIP_CUDA_BUILD = os.getenv("DEEP_GEMM_SKIP_CUDA_BUILD", "FALSE") == "TRUE" NO_LOCAL_VERSION = os.getenv("DEEP_GEMM_NO_LOCAL_VERSION", "FALSE") == "TRUE" @@ -24,96 +15,31 @@ "https://github.com/DeepSeek-AI/DeepGEMM/releases/download/{tag_name}/{wheel_name}" ) PACKAGE_NAME = "deep_gemm" -current_dir = os.path.dirname(os.path.realpath(__file__)) - - -def get_package_version(): - with open(Path(current_dir) / "deep_gemm" / "__init__.py", "r") as f: - version_match = re.search(r"^__version__\s*=\s*(.*)$", f.read(), re.MULTILINE) - public_version = ast.literal_eval(version_match.group(1)) - revision = "" - - if not NO_LOCAL_VERSION: - try: - cmd = ["git", "rev-parse", "--short", "HEAD"] - revision = "+" + subprocess.check_output(cmd).decode("ascii").rstrip() - except: - revision = "" - - return f"{public_version}{revision}" - - -def get_platform(): - """ - Returns the platform name as used in wheel filenames. - """ - if sys.platform.startswith("linux"): - return f"linux_{platform.uname().machine}" - elif sys.platform == "darwin": - mac_version = ".".join(platform.mac_ver()[0].split(".")[:2]) - return f"macosx_{mac_version}_x86_64" - elif sys.platform == "win32": - return "win_amd64" - else: - raise ValueError("Unsupported platform: {}".format(sys.platform)) - - -def get_wheel_url(): - torch_version_raw = parse(torch.__version__) - python_version = f"cp{sys.version_info.major}{sys.version_info.minor}" - platform_name = get_platform() - grouped_gemm_version = get_package_version() - torch_version = f"{torch_version_raw.major}.{torch_version_raw.minor}" - cxx11_abi = str(torch._C._GLIBCXX_USE_CXX11_ABI).upper() - - # Determine the version numbers that will be used to determine the correct wheel - # We're using the CUDA version used to build torch, not the one currently installed - # _, cuda_version_raw = get_cuda_bare_metal_version(CUDA_HOME) - torch_cuda_version = parse(torch.version.cuda) - # For CUDA 11, we only compile for CUDA 11.8, and for CUDA 12 we only compile for CUDA 12.3 - # to save CI time. Minor versions should be compatible. - torch_cuda_version = ( - parse("11.8") if torch_cuda_version.major == 11 else parse("12.3") - ) - # cuda_version = f"{cuda_version_raw.major}{cuda_version_raw.minor}" - cuda_version = f"{torch_cuda_version.major}" - - # Determine wheel URL based on CUDA version, torch version, python version and OS - wheel_filename = f"{PACKAGE_NAME}-{grouped_gemm_version}+cu{cuda_version}torch{torch_version}cxx11abi{cxx11_abi}-{python_version}-{python_version}-{platform_name}.whl" - wheel_url = BASE_WHEEL_URL.format( - tag_name=f"v{grouped_gemm_version}", wheel_name=wheel_filename - ) - - return wheel_url, wheel_filename - - -cxx_flags = [ - "-std=c++17", - "-O3", - "-fPIC", - "-Wno-psabi", - "-Wno-deprecated-declarations", - f"-D_GLIBCXX_USE_CXX11_ABI={int(torch.compiled_with_cxx11_abi())}", -] -sources = ["csrc/python_api.cpp"] +current_dir = os.path.dirname(os.path.realpath(__file__)) +cxx_flags = ['-std=c++17', '-O3', '-fPIC', '-Wno-psabi', '-Wno-deprecated-declarations', + f'-D_GLIBCXX_USE_CXX11_ABI={int(torch.compiled_with_cxx11_abi())}'] +sources = ['csrc/python_api.cpp'] build_include_dirs = [ - f"{CUDA_HOME}/include", - f"{CUDA_HOME}/include/cccl", - "deep_gemm/include", - "third-party/cutlass/include", - "third-party/fmt/include", + f'{CUDA_HOME}/include', + f'{CUDA_HOME}/include/cccl', + 'deep_gemm/include', + 'third-party/cutlass/include', + 'third-party/fmt/include', +] +build_libraries = ['cuda', 'cudart', 'nvrtc'] +build_library_dirs = [ + f'{CUDA_HOME}/lib64', + f'{CUDA_HOME}/lib64/stubs' ] -build_libraries = ["cuda", "cudart", "nvrtc"] -build_library_dirs = [f"{CUDA_HOME}/lib64", f"{CUDA_HOME}/lib64/stubs"] third_party_include_dirs = [ - "third-party/cutlass/include/cute", - "third-party/cutlass/include/cutlass", + 'third-party/cutlass/include/cute', + 'third-party/cutlass/include/cutlass', ] # Use runtime API -if int(os.environ.get("DG_JIT_USE_RUNTIME_API", "0")): - cxx_flags.append("-DDG_JIT_USE_RUNTIME_API") +if int(os.environ.get('DG_JIT_USE_RUNTIME_API', '0')): + cxx_flags.append('-DDG_JIT_USE_RUNTIME_API') class CustomBuildPy(build_py): @@ -128,30 +54,22 @@ def run(self): build_py.run(self) def generate_default_envs(self): - code = "# Pre-installed environment variables\n" - code += "persistent_envs = dict()\n" - for name in ( - "DG_JIT_CACHE_DIR", - "DG_JIT_PRINT_COMPILER_COMMAND", - "DG_JIT_CPP_STANDARD", - ): - code += ( - f"persistent_envs['{name}'] = '{os.environ[name]}'\n" - if name in os.environ - else "" - ) - - with open(os.path.join(self.build_lib, "deep_gemm", "envs.py"), "w") as f: + code = '# Pre-installed environment variables\n' + code += 'persistent_envs = dict()\n' + for name in ('DG_JIT_CACHE_DIR', 'DG_JIT_PRINT_COMPILER_COMMAND', 'DG_JIT_CPP_STANDARD'): + code += f"persistent_envs['{name}'] = '{os.environ[name]}'\n" if name in os.environ else '' + + with open(os.path.join(self.build_lib, 'deep_gemm', 'envs.py'), 'w') as f: f.write(code) def prepare_includes(self): # Create temporary build directory instead of modifying package directory - build_include_dir = os.path.join(self.build_lib, "deep_gemm/include") + build_include_dir = os.path.join(self.build_lib, 'deep_gemm/include') os.makedirs(build_include_dir, exist_ok=True) # Copy third-party includes to the build directory for d in third_party_include_dirs: - dirname = d.split("/")[-1] + dirname = d.split('/')[-1] src_dir = os.path.join(current_dir, d) dst_dir = os.path.join(build_include_dir, dirname) @@ -163,67 +81,36 @@ def prepare_includes(self): shutil.copytree(src_dir, dst_dir) -if not SKIP_CUDA_BUILD: - ext_modules = [ - CUDAExtension( - name="deep_gemm_cpp", - sources=sources, - include_dirs=build_include_dirs, - ) - ] -else: - ext_modules = [] - - -class CachedWheelsCommand(_bdist_wheel): - """ - The CachedWheelsCommand plugs into the default bdist wheel, which is ran by pip when it cannot - find an existing wheel (which is currently the case for all grouped gemm installs). We use - the environment parameters to detect whether there is already a pre-built version of a compatible - wheel available and short-circuits the standard full build pipeline. - """ - - def run(self): - if FORCE_BUILD: - return super().run() - - wheel_url, wheel_filename = get_wheel_url() - print("Guessing wheel URL: ", wheel_url) - try: - urllib.request.urlretrieve(wheel_url, wheel_filename) - - # Make the archive - # Lifted from the root wheel processing command - # https://github.com/pypa/wheel/blob/cf71108ff9f6ffc36978069acb28824b44ae028e/src/wheel/bdist_wheel.py#LL381C9-L381C85 - if not os.path.exists(self.dist_dir): - os.makedirs(self.dist_dir) - - impl_tag, abi_tag, plat_tag = self.get_tag() - archive_basename = f"{self.wheel_dist_name}-{impl_tag}-{abi_tag}-{plat_tag}" +if __name__ == '__main__': + # noinspection PyBroadException + try: + cmd = ['git', 'rev-parse', '--short', 'HEAD'] + revision = '+' + subprocess.check_output(cmd).decode('ascii').rstrip() + except: + revision = '' - wheel_path = os.path.join(self.dist_dir, archive_basename + ".whl") - print("Raw wheel path", wheel_path) - os.rename(wheel_filename, wheel_path) - except (urllib.error.HTTPError, urllib.error.URLError): - print("Precompiled wheel not found. Building from source...") - # If the wheel could not be downloaded, build from source - super().run() - - -if __name__ == "__main__": # noinspection PyTypeChecker setuptools.setup( - name="deep_gemm", - version=get_package_version(), - packages=find_packages("."), + name='deep_gemm', + version='2.0.0' + revision, + packages=find_packages('.'), package_data={ - "deep_gemm": [ - "include/deep_gemm/**/*", - "include/cute/**/*", - "include/cutlass/**/*", + 'deep_gemm': [ + 'include/deep_gemm/**/*', + 'include/cute/**/*', + 'include/cutlass/**/*', ] }, - ext_modules=ext_modules, + ext_modules=[ + CUDAExtension(name='deep_gemm_cpp', + sources=sources, + include_dirs=build_include_dirs, + libraries=build_libraries, + library_dirs=build_library_dirs, + extra_compile_args=cxx_flags) + ], zip_safe=False, - cmdclass={"build_py": CustomBuildPy, "bdist_wheel": CachedWheelsCommand}, + cmdclass={ + 'build_py': CustomBuildPy, + }, ) From a16fdc02c58a7a6e193ba907b64335af64bfd594 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Mon, 1 Sep 2025 09:18:46 +0000 Subject: [PATCH 09/26] revert MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- deep_gemm/__init__.py | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/deep_gemm/__init__.py b/deep_gemm/__init__.py index 588b781d..169e2e6b 100644 --- a/deep_gemm/__init__.py +++ b/deep_gemm/__init__.py @@ -13,31 +13,29 @@ # Configs import deep_gemm_cpp +from deep_gemm_cpp import ( + set_num_sms, + get_num_sms, + set_tc_util, + get_tc_util, +) # Kernels from deep_gemm_cpp import ( - bf16_gemm_nn, - # BF16 GEMMs - bf16_gemm_nt, - bf16_gemm_tn, - bf16_gemm_tt, - fp8_gemm_nn, # FP8 GEMMs - fp8_gemm_nt, - fp8_gemm_tn, - fp8_gemm_tt, - get_num_sms, - get_tc_util, + fp8_gemm_nt, fp8_gemm_nn, + fp8_gemm_tn, fp8_gemm_tt, + m_grouped_fp8_gemm_nt_contiguous, + m_grouped_fp8_gemm_nn_contiguous, + m_grouped_fp8_gemm_nt_masked, k_grouped_fp8_gemm_tn_contiguous, + # BF16 GEMMs + bf16_gemm_nt, bf16_gemm_nn, + bf16_gemm_tn, bf16_gemm_tt, m_grouped_bf16_gemm_nt_contiguous, m_grouped_bf16_gemm_nt_masked, - m_grouped_fp8_gemm_nn_contiguous, - m_grouped_fp8_gemm_nt_contiguous, - m_grouped_fp8_gemm_nt_masked, - set_num_sms, - set_tc_util, # Layout kernels - transform_sf_into_required_layout, + transform_sf_into_required_layout ) # Some alias for legacy supports @@ -74,5 +72,3 @@ def _find_cuda_home() -> str: os.path.dirname(os.path.abspath(__file__)), # Library root directory path _find_cuda_home() # CUDA home ) - -__version__ = "2.0.0" From 3692017e2ba78bc1b25389fe0ef861308593c85d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Mon, 1 Sep 2025 09:18:52 +0000 Subject: [PATCH 10/26] v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- deep_gemm/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/deep_gemm/__init__.py b/deep_gemm/__init__.py index 169e2e6b..d26eafa9 100644 --- a/deep_gemm/__init__.py +++ b/deep_gemm/__init__.py @@ -72,3 +72,5 @@ def _find_cuda_home() -> str: os.path.dirname(os.path.abspath(__file__)), # Library root directory path _find_cuda_home() # CUDA home ) + +__version__ = "2.0.0" From 8442cf69b938184fa3d88b828b76551abab1edf6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Mon, 1 Sep 2025 09:22:17 +0000 Subject: [PATCH 11/26] update MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- setup.py | 113 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 112 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index ab0d0f3f..5862f278 100644 --- a/setup.py +++ b/setup.py @@ -1,11 +1,19 @@ +import ast import os +import re import setuptools import shutil import subprocess +import sys +import urllib import torch +import platform from setuptools import find_packages from setuptools.command.build_py import build_py from torch.utils.cpp_extension import CUDAExtension, CUDA_HOME +from pathlib import Path +from packaging import version as parse +from wheel.bdist_wheel import bdist_wheel as _bdist_wheel SKIP_CUDA_BUILD = os.getenv("DEEP_GEMM_SKIP_CUDA_BUILD", "FALSE") == "TRUE" NO_LOCAL_VERSION = os.getenv("DEEP_GEMM_NO_LOCAL_VERSION", "FALSE") == "TRUE" @@ -41,6 +49,63 @@ if int(os.environ.get('DG_JIT_USE_RUNTIME_API', '0')): cxx_flags.append('-DDG_JIT_USE_RUNTIME_API') +def get_package_version(): + with open(Path(current_dir) / "deep_gemm" / "__init__.py", "r") as f: + version_match = re.search(r"^__version__\s*=\s*(.*)$", f.read(), re.MULTILINE) + public_version = ast.literal_eval(version_match.group(1)) + revision = "" + + if not NO_LOCAL_VERSION: + try: + cmd = ["git", "rev-parse", "--short", "HEAD"] + revision = "+" + subprocess.check_output(cmd).decode("ascii").rstrip() + except: + revision = "" + + return f"{public_version}{revision}" + +def get_platform(): + """ + Returns the platform name as used in wheel filenames. + """ + if sys.platform.startswith("linux"): + return f"linux_{platform.uname().machine}" + elif sys.platform == "darwin": + mac_version = ".".join(platform.mac_ver()[0].split(".")[:2]) + return f"macosx_{mac_version}_x86_64" + elif sys.platform == "win32": + return "win_amd64" + else: + raise ValueError("Unsupported platform: {}".format(sys.platform)) + +def get_wheel_url(): + torch_version_raw = parse(torch.__version__) + python_version = f"cp{sys.version_info.major}{sys.version_info.minor}" + platform_name = get_platform() + grouped_gemm_version = get_package_version() + torch_version = f"{torch_version_raw.major}.{torch_version_raw.minor}" + cxx11_abi = str(torch._C._GLIBCXX_USE_CXX11_ABI).upper() + + # Determine the version numbers that will be used to determine the correct wheel + # We're using the CUDA version used to build torch, not the one currently installed + # _, cuda_version_raw = get_cuda_bare_metal_version(CUDA_HOME) + torch_cuda_version = parse(torch.version.cuda) + # For CUDA 11, we only compile for CUDA 11.8, and for CUDA 12 we only compile for CUDA 12.3 + # to save CI time. Minor versions should be compatible. + torch_cuda_version = ( + parse("11.8") if torch_cuda_version.major == 11 else parse("12.3") + ) + # cuda_version = f"{cuda_version_raw.major}{cuda_version_raw.minor}" + cuda_version = f"{torch_cuda_version.major}" + + # Determine wheel URL based on CUDA version, torch version, python version and OS + wheel_filename = f"{PACKAGE_NAME}-{grouped_gemm_version}+cu{cuda_version}torch{torch_version}cxx11abi{cxx11_abi}-{python_version}-{python_version}-{platform_name}.whl" + + wheel_url = BASE_WHEEL_URL.format( + tag_name=f"v{grouped_gemm_version}", wheel_name=wheel_filename + ) + + return wheel_url, wheel_filename class CustomBuildPy(build_py): def run(self): @@ -80,6 +145,51 @@ def prepare_includes(self): # Copy the directory shutil.copytree(src_dir, dst_dir) +if not SKIP_CUDA_BUILD: + ext_modules = [ + CUDAExtension( + name="deep_gemm_cpp", + sources=sources, + include_dirs=build_include_dirs, + ) + ] +else: + ext_modules = [] + +class CachedWheelsCommand(_bdist_wheel): + """ + The CachedWheelsCommand plugs into the default bdist wheel, which is ran by pip when it cannot + find an existing wheel (which is currently the case for all grouped gemm installs). We use + the environment parameters to detect whether there is already a pre-built version of a compatible + wheel available and short-circuits the standard full build pipeline. + """ + + def run(self): + if FORCE_BUILD: + return super().run() + + wheel_url, wheel_filename = get_wheel_url() + print("Guessing wheel URL: ", wheel_url) + try: + urllib.request.urlretrieve(wheel_url, wheel_filename) + + # Make the archive + # Lifted from the root wheel processing command + # https://github.com/pypa/wheel/blob/cf71108ff9f6ffc36978069acb28824b44ae028e/src/wheel/bdist_wheel.py#LL381C9-L381C85 + if not os.path.exists(self.dist_dir): + os.makedirs(self.dist_dir) + + impl_tag, abi_tag, plat_tag = self.get_tag() + archive_basename = f"{self.wheel_dist_name}-{impl_tag}-{abi_tag}-{plat_tag}" + + wheel_path = os.path.join(self.dist_dir, archive_basename + ".whl") + print("Raw wheel path", wheel_path) + os.rename(wheel_filename, wheel_path) + except (urllib.error.HTTPError, urllib.error.URLError): + print("Precompiled wheel not found. Building from source...") + # If the wheel could not be downloaded, build from source + super().run() + if __name__ == '__main__': # noinspection PyBroadException @@ -92,7 +202,7 @@ def prepare_includes(self): # noinspection PyTypeChecker setuptools.setup( name='deep_gemm', - version='2.0.0' + revision, + version=get_package_version(), packages=find_packages('.'), package_data={ 'deep_gemm': [ @@ -112,5 +222,6 @@ def prepare_includes(self): zip_safe=False, cmdclass={ 'build_py': CustomBuildPy, + 'bdist_wheel': CachedWheelsCommand, }, ) From 11c0127f7160136e61e6cc58166cca7a478c45e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Mon, 1 Sep 2025 09:23:16 +0000 Subject: [PATCH 12/26] test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 3bd828f9..6de67db7 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -64,4 +64,4 @@ jobs: torch-version: ${{ inputs.torch-version || matrix.torch-version }} cxx11_abi: ${{ inputs.cxx11_abi || matrix.cxx11_abi }} upload-to-release: ${{ inputs.upload-to-release || false }} - release-version: ${{ inputs.release-version || 'v2.0.0' }} + release-version: ${{ inputs.release-version || github.sha }} From 163cb16be75279e076abba129cdfb73ff68c033d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Mon, 1 Sep 2025 09:36:34 +0000 Subject: [PATCH 13/26] from packaging.version import parse MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 5862f278..ab65357e 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ from setuptools.command.build_py import build_py from torch.utils.cpp_extension import CUDAExtension, CUDA_HOME from pathlib import Path -from packaging import version as parse +from packaging.version import parse from wheel.bdist_wheel import bdist_wheel as _bdist_wheel SKIP_CUDA_BUILD = os.getenv("DEEP_GEMM_SKIP_CUDA_BUILD", "FALSE") == "TRUE" From bec3874426ee29319a1ad68249115ff0b1473426 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Mon, 1 Sep 2025 09:52:46 +0000 Subject: [PATCH 14/26] local version MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/workflows/_build.yml | 6 ++++++ .github/workflows/build.yml | 6 ++++++ .github/workflows/publish.yml | 1 + .vscode/settings.json | 39 +++++++++++++++++++++++++++++++++++ 4 files changed, 52 insertions(+) create mode 100644 .vscode/settings.json diff --git a/.github/workflows/_build.yml b/.github/workflows/_build.yml index 6a9a1469..594430df 100644 --- a/.github/workflows/_build.yml +++ b/.github/workflows/_build.yml @@ -32,6 +32,11 @@ on: description: "Upload wheel to this release" required: false type: string + use-local-version: + description: "Use local version" + required: false + type: boolean + default: false defaults: run: @@ -165,6 +170,7 @@ jobs: export MAX_JOBS=$([ "$MATRIX_CUDA_VERSION" == "129" ] && echo 1 || echo 2) export NVCC_THREADS=2 export TORCH_CUDA_ARCH_LIST="7.0 7.2 7.5 8.0 8.6 8.7 9.0+PTX" + export DEEP_GEMM_NO_LOCAL_VERSION=${{ inputs.use-local-version && 'FALSE' || 'TRUE' }} # 5h timeout since GH allows max 6h and we want some buffer EXIT_CODE=0 diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 6de67db7..9b5d0a05 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -33,6 +33,11 @@ on: description: "Upload wheel to this release" required: false type: string + use-local-version: + description: "Use local version" + required: false + type: boolean + default: false push: jobs: @@ -65,3 +70,4 @@ jobs: cxx11_abi: ${{ inputs.cxx11_abi || matrix.cxx11_abi }} upload-to-release: ${{ inputs.upload-to-release || false }} release-version: ${{ inputs.release-version || github.sha }} + use-local-version: ${{ inputs.use-local-version || false }} diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index db847e83..c47aabdc 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -63,6 +63,7 @@ jobs: cxx11_abi: ${{ matrix.cxx11_abi }} release-version: ${{ needs.setup_release.outputs.release-version }} upload-to-release: true + use-local-version: false publish_package: name: Publish package diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 00000000..11b52b6f --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,39 @@ +{ + "python.defaultInterpreterPath": "/home/okoenig/Documents/Repositories/github.com/NVIDIA-NeMo/NeMo-Export-Deploy/.venv/bin/python", + "editor.formatOnSave": false, + "git.enableCommitSigning": true, + "cSpell.words": [ + "argjson", + "cicd", + "CICD", + "CUDA", + "elif", + "garay", + "Gotchapython", + "mode", + "nemo", + "Nemo", + "NEMO", + "nemoci", + "pipefail", + "PYTHONUNBUFFERED", + "subteam", + "timerange", + "TRTLLM" + ], + "[python]": { + "editor.formatOnSave": false, + "editor.defaultFormatter": "charliermarsh.ruff", + "editor.codeActionsOnSave": { + "source.organizeImports.ruff": "false" + } + }, + "[dockerfile]": { + "editor.formatOnSave": false, + "editor.defaultFormatter": "ms-azuretools.vscode-containers" + }, + "flake8.enabled": false, + "pylint.enabled": false, + "python.languageServer": "Pylance", + "isort.enabled": false +} \ No newline at end of file From 163f54dfe723f3c8ab058c0dce496bdb9b6ed368 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Mon, 1 Sep 2025 10:02:16 +0000 Subject: [PATCH 15/26] remove file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .vscode/settings.json | 39 --------------------------------------- 1 file changed, 39 deletions(-) delete mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index 11b52b6f..00000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,39 +0,0 @@ -{ - "python.defaultInterpreterPath": "/home/okoenig/Documents/Repositories/github.com/NVIDIA-NeMo/NeMo-Export-Deploy/.venv/bin/python", - "editor.formatOnSave": false, - "git.enableCommitSigning": true, - "cSpell.words": [ - "argjson", - "cicd", - "CICD", - "CUDA", - "elif", - "garay", - "Gotchapython", - "mode", - "nemo", - "Nemo", - "NEMO", - "nemoci", - "pipefail", - "PYTHONUNBUFFERED", - "subteam", - "timerange", - "TRTLLM" - ], - "[python]": { - "editor.formatOnSave": false, - "editor.defaultFormatter": "charliermarsh.ruff", - "editor.codeActionsOnSave": { - "source.organizeImports.ruff": "false" - } - }, - "[dockerfile]": { - "editor.formatOnSave": false, - "editor.defaultFormatter": "ms-azuretools.vscode-containers" - }, - "flake8.enabled": false, - "pylint.enabled": false, - "python.languageServer": "Pylance", - "isort.enabled": false -} \ No newline at end of file From bd86ac7948056baea089514c49bbacb79e5ff828 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Mon, 1 Sep 2025 10:03:14 +0000 Subject: [PATCH 16/26] revert MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/workflows/build.yml | 38 +++++++++---------------------------- 1 file changed, 9 insertions(+), 29 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 9b5d0a05..ee250aa4 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1,7 +1,7 @@ name: Build wheels on: - workflow_call: + workflow_dispatch: inputs: runs-on: description: "The runner to use for the build" @@ -38,36 +38,16 @@ on: required: false type: boolean default: false - push: jobs: build-wheels: uses: ./.github/workflows/_build.yml - strategy: - fail-fast: false - matrix: - # Using ubuntu-22.04 instead of 24.04 for more compatibility (glibc). Ideally we'd use the - # manylinux docker image, but I haven't figured out how to install CUDA on manylinux. - os: [ubuntu-22.04] - python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] - torch-version: ["2.4.0", "2.5.1", "2.6.0", "2.7.1", "2.8.0"] - cuda-version: ["12.9.1"] - # We need separate wheels that either uses C++11 ABI (-D_GLIBCXX_USE_CXX11_ABI) or not. - # Pytorch wheels currently don't use it, but nvcr images have Pytorch compiled with C++11 ABI. - # Without this we get import error (undefined symbol: _ZN3c105ErrorC2ENS_14SourceLocationESs) - # when building without C++11 ABI and using it on nvcr images. - cxx11_abi: ["FALSE", "TRUE"] - exclude: - # see https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix - # Pytorch < 2.5 does not support Python 3.13 - - torch-version: "2.4.0" - python-version: "3.13" with: - runs-on: ${{ inputs.runs-on || matrix.os }} - python-version: ${{ inputs.python-version || matrix.python-version }} - cuda-version: ${{ inputs.cuda-version || matrix.cuda-version }} - torch-version: ${{ inputs.torch-version || matrix.torch-version }} - cxx11_abi: ${{ inputs.cxx11_abi || matrix.cxx11_abi }} - upload-to-release: ${{ inputs.upload-to-release || false }} - release-version: ${{ inputs.release-version || github.sha }} - use-local-version: ${{ inputs.use-local-version || false }} + runs-on: ${{ inputs.runs-on }} + python-version: ${{ inputs.python-version }} + cuda-version: ${{ inputs.cuda-version }} + torch-version: ${{ inputs.torch-version }} + cxx11_abi: ${{ inputs.cxx11_abi }} + upload-to-release: ${{ inputs.upload-to-release }} + release-version: ${{ inputs.release-version }} + use-local-version: ${{ inputs.use-local-version }} From bffc64b296a1eb311f4f21aff8164c16abbfaca3 Mon Sep 17 00:00:00 2001 From: Chenggang Zhao Date: Tue, 2 Sep 2025 14:38:36 +0800 Subject: [PATCH 17/26] Updates and lint --- .github/workflows/_build.yml | 2 +- .github/workflows/publish.yml | 6 +- deep_gemm/__init__.py | 2 +- setup.py | 146 ++++++++++++---------------------- 4 files changed, 56 insertions(+), 100 deletions(-) diff --git a/.github/workflows/_build.yml b/.github/workflows/_build.yml index 594430df..18b608d0 100644 --- a/.github/workflows/_build.yml +++ b/.github/workflows/_build.yml @@ -170,7 +170,7 @@ jobs: export MAX_JOBS=$([ "$MATRIX_CUDA_VERSION" == "129" ] && echo 1 || echo 2) export NVCC_THREADS=2 export TORCH_CUDA_ARCH_LIST="7.0 7.2 7.5 8.0 8.6 8.7 9.0+PTX" - export DEEP_GEMM_NO_LOCAL_VERSION=${{ inputs.use-local-version && 'FALSE' || 'TRUE' }} + export DG_NO_LOCAL_VERSION=${{ inputs.use-local-version && '0' || '1' }} # 5h timeout since GH allows max 6h and we want some buffer EXIT_CODE=0 diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index c47aabdc..bfbc868b 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -41,7 +41,7 @@ jobs: # Using ubuntu-22.04 instead of 24.04 for more compatibility (glibc). Ideally we'd use the # manylinux docker image, but I haven't figured out how to install CUDA on manylinux. os: [ubuntu-22.04] - python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"] torch-version: ["2.4.0", "2.5.1", "2.6.0", "2.7.1", "2.8.0"] cuda-version: ["12.9.1"] # We need separate wheels that either uses C++11 ABI (-D_GLIBCXX_USE_CXX11_ABI) or not. @@ -83,8 +83,8 @@ jobs: pip install torch --index-url https://download.pytorch.org/whl/cpu - name: Build core package env: - DEEP_GEMM_NO_LOCAL_VERSION: "TRUE" - DEEP_GEMM_SKIP_CUDA_BUILD: "TRUE" + DG_NO_LOCAL_VERSION: "1" + DG_SKIP_CUDA_BUILD: "1" run: | python setup.py sdist --dist-dir=dist - name: Deploy diff --git a/deep_gemm/__init__.py b/deep_gemm/__init__.py index d26eafa9..00ec9cfb 100644 --- a/deep_gemm/__init__.py +++ b/deep_gemm/__init__.py @@ -73,4 +73,4 @@ def _find_cuda_home() -> str: _find_cuda_home() # CUDA home ) -__version__ = "2.0.0" +__version__ = '2.0.0' diff --git a/setup.py b/setup.py index ab65357e..99f56180 100644 --- a/setup.py +++ b/setup.py @@ -1,32 +1,31 @@ import ast import os import re -import setuptools import shutil +import setuptools import subprocess import sys -import urllib import torch import platform +import urllib +import urllib.error +import urllib.request from setuptools import find_packages from setuptools.command.build_py import build_py -from torch.utils.cpp_extension import CUDAExtension, CUDA_HOME -from pathlib import Path from packaging.version import parse +from pathlib import Path +from torch.utils.cpp_extension import CUDAExtension, CUDA_HOME from wheel.bdist_wheel import bdist_wheel as _bdist_wheel -SKIP_CUDA_BUILD = os.getenv("DEEP_GEMM_SKIP_CUDA_BUILD", "FALSE") == "TRUE" -NO_LOCAL_VERSION = os.getenv("DEEP_GEMM_NO_LOCAL_VERSION", "FALSE") == "TRUE" -FORCE_BUILD = os.getenv("DEEP_GEMM_FORCE_BUILD", "FALSE") == "TRUE" -BASE_WHEEL_URL = ( - "https://github.com/DeepSeek-AI/DeepGEMM/releases/download/{tag_name}/{wheel_name}" -) -PACKAGE_NAME = "deep_gemm" - -current_dir = os.path.dirname(os.path.realpath(__file__)) +# Compiler flags cxx_flags = ['-std=c++17', '-O3', '-fPIC', '-Wno-psabi', '-Wno-deprecated-declarations', f'-D_GLIBCXX_USE_CXX11_ABI={int(torch.compiled_with_cxx11_abi())}'] +if int(os.environ.get('DG_JIT_USE_RUNTIME_API', '0')): + cxx_flags.append('-DDG_JIT_USE_RUNTIME_API') + +# Sources +current_dir = os.path.dirname(os.path.realpath(__file__)) sources = ['csrc/python_api.cpp'] build_include_dirs = [ f'{CUDA_HOME}/include', @@ -45,67 +44,60 @@ 'third-party/cutlass/include/cutlass', ] -# Use runtime API -if int(os.environ.get('DG_JIT_USE_RUNTIME_API', '0')): - cxx_flags.append('-DDG_JIT_USE_RUNTIME_API') +# Release +base_wheel_url = 'https://github.com/DeepSeek-AI/DeepGEMM/releases/download/{tag_name}/{wheel_name}' + def get_package_version(): - with open(Path(current_dir) / "deep_gemm" / "__init__.py", "r") as f: - version_match = re.search(r"^__version__\s*=\s*(.*)$", f.read(), re.MULTILINE) + with open(Path(current_dir) / 'deep_gemm' / '__init__.py', 'r') as f: + version_match = re.search(r'^__version__\s*=\s*(.*)$', f.read(), re.MULTILINE) public_version = ast.literal_eval(version_match.group(1)) - revision = "" + revision = '' - if not NO_LOCAL_VERSION: + if int(os.getenv('DG_NO_LOCAL_VERSION', '0')) == 0: + # noinspection PyBroadException try: - cmd = ["git", "rev-parse", "--short", "HEAD"] - revision = "+" + subprocess.check_output(cmd).decode("ascii").rstrip() + cmd = ['git', 'rev-parse', '--short', 'HEAD'] + revision = '+' + subprocess.check_output(cmd).decode('ascii').rstrip() except: - revision = "" + revision = '' + return f'{public_version}{revision}' - return f"{public_version}{revision}" def get_platform(): - """ - Returns the platform name as used in wheel filenames. - """ - if sys.platform.startswith("linux"): - return f"linux_{platform.uname().machine}" - elif sys.platform == "darwin": - mac_version = ".".join(platform.mac_ver()[0].split(".")[:2]) - return f"macosx_{mac_version}_x86_64" - elif sys.platform == "win32": - return "win_amd64" + if sys.platform.startswith('linux'): + return f'linux_{platform.uname().machine}' else: - raise ValueError("Unsupported platform: {}".format(sys.platform)) + raise ValueError('Unsupported platform: {}'.format(sys.platform)) + def get_wheel_url(): - torch_version_raw = parse(torch.__version__) - python_version = f"cp{sys.version_info.major}{sys.version_info.minor}" + torch_version = parse(torch.__version__) + torch_version = f'{torch_version.major}.{torch_version.minor}' + python_version = f'cp{sys.version_info.major}{sys.version_info.minor}' platform_name = get_platform() - grouped_gemm_version = get_package_version() - torch_version = f"{torch_version_raw.major}.{torch_version_raw.minor}" + deep_gemm_version = get_package_version() cxx11_abi = str(torch._C._GLIBCXX_USE_CXX11_ABI).upper() # Determine the version numbers that will be used to determine the correct wheel # We're using the CUDA version used to build torch, not the one currently installed - # _, cuda_version_raw = get_cuda_bare_metal_version(CUDA_HOME) - torch_cuda_version = parse(torch.version.cuda) - # For CUDA 11, we only compile for CUDA 11.8, and for CUDA 12 we only compile for CUDA 12.3 - # to save CI time. Minor versions should be compatible. - torch_cuda_version = ( - parse("11.8") if torch_cuda_version.major == 11 else parse("12.3") - ) - # cuda_version = f"{cuda_version_raw.major}{cuda_version_raw.minor}" - cuda_version = f"{torch_cuda_version.major}" + cuda_version = parse(torch.version.cuda) + cuda_version = f'{cuda_version.major}' # Determine wheel URL based on CUDA version, torch version, python version and OS - wheel_filename = f"{PACKAGE_NAME}-{grouped_gemm_version}+cu{cuda_version}torch{torch_version}cxx11abi{cxx11_abi}-{python_version}-{python_version}-{platform_name}.whl" + wheel_filename = f'deep_gemm-{deep_gemm_version}+cu{cuda_version}torch{torch_version}cxx11abi{cxx11_abi}-{python_version}-{python_version}-{platform_name}.whl' + wheel_url = base_wheel_url.format(tag_name=f'v{deep_gemm_version}', wheel_name=wheel_filename) + return wheel_url, wheel_filename - wheel_url = BASE_WHEEL_URL.format( - tag_name=f"v{grouped_gemm_version}", wheel_name=wheel_filename - ) - return wheel_url, wheel_filename +def get_ext_modules(): + if os.getenv('DG_SKIP_CUDA_BUILD', '0') != 0: + return [] + + return [CUDAExtension(name='deep_gemm_cpp', + sources=sources, + include_dirs=build_include_dirs,)] + class CustomBuildPy(build_py): def run(self): @@ -145,60 +137,31 @@ def prepare_includes(self): # Copy the directory shutil.copytree(src_dir, dst_dir) -if not SKIP_CUDA_BUILD: - ext_modules = [ - CUDAExtension( - name="deep_gemm_cpp", - sources=sources, - include_dirs=build_include_dirs, - ) - ] -else: - ext_modules = [] class CachedWheelsCommand(_bdist_wheel): - """ - The CachedWheelsCommand plugs into the default bdist wheel, which is ran by pip when it cannot - find an existing wheel (which is currently the case for all grouped gemm installs). We use - the environment parameters to detect whether there is already a pre-built version of a compatible - wheel available and short-circuits the standard full build pipeline. - """ - def run(self): - if FORCE_BUILD: + if int(os.getenv('DG_FORCE_BUILD', '0')) != 0: return super().run() wheel_url, wheel_filename = get_wheel_url() - print("Guessing wheel URL: ", wheel_url) + print(f'Try to download wheel from URL: {wheel_url}') try: urllib.request.urlretrieve(wheel_url, wheel_filename) # Make the archive - # Lifted from the root wheel processing command - # https://github.com/pypa/wheel/blob/cf71108ff9f6ffc36978069acb28824b44ae028e/src/wheel/bdist_wheel.py#LL381C9-L381C85 if not os.path.exists(self.dist_dir): os.makedirs(self.dist_dir) - impl_tag, abi_tag, plat_tag = self.get_tag() - archive_basename = f"{self.wheel_dist_name}-{impl_tag}-{abi_tag}-{plat_tag}" - - wheel_path = os.path.join(self.dist_dir, archive_basename + ".whl") - print("Raw wheel path", wheel_path) + archive_basename = f'{self.wheel_dist_name}-{impl_tag}-{abi_tag}-{plat_tag}' + wheel_path = os.path.join(self.dist_dir, archive_basename + '.whl') os.rename(wheel_filename, wheel_path) except (urllib.error.HTTPError, urllib.error.URLError): - print("Precompiled wheel not found. Building from source...") + print('Precompiled wheel not found. Building from source...') # If the wheel could not be downloaded, build from source super().run() if __name__ == '__main__': - # noinspection PyBroadException - try: - cmd = ['git', 'rev-parse', '--short', 'HEAD'] - revision = '+' + subprocess.check_output(cmd).decode('ascii').rstrip() - except: - revision = '' - # noinspection PyTypeChecker setuptools.setup( name='deep_gemm', @@ -211,14 +174,7 @@ def run(self): 'include/cutlass/**/*', ] }, - ext_modules=[ - CUDAExtension(name='deep_gemm_cpp', - sources=sources, - include_dirs=build_include_dirs, - libraries=build_libraries, - library_dirs=build_library_dirs, - extra_compile_args=cxx_flags) - ], + ext_modules=get_ext_modules(), zip_safe=False, cmdclass={ 'build_py': CustomBuildPy, From ba2cb6ba147c30dc00b8a7c467bd70fb957974b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Fri, 26 Sep 2025 12:54:31 +0000 Subject: [PATCH 18/26] revert missing cudaextension args MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- setup.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 99f56180..cb555bcf 100644 --- a/setup.py +++ b/setup.py @@ -96,7 +96,10 @@ def get_ext_modules(): return [CUDAExtension(name='deep_gemm_cpp', sources=sources, - include_dirs=build_include_dirs,)] + include_dirs=build_include_dirs, + libraries=build_libraries, + library_dirs=build_library_dirs, + extra_compile_args=cxx_flags)] class CustomBuildPy(build_py): From 805405863a29793b20760978fb89c3052c2e17d6 Mon Sep 17 00:00:00 2001 From: Chenggang Zhao Date: Sun, 28 Sep 2025 10:03:04 +0800 Subject: [PATCH 19/26] Add timeout --- setup.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index cb555bcf..7c2cd199 100644 --- a/setup.py +++ b/setup.py @@ -77,7 +77,7 @@ def get_wheel_url(): python_version = f'cp{sys.version_info.major}{sys.version_info.minor}' platform_name = get_platform() deep_gemm_version = get_package_version() - cxx11_abi = str(torch._C._GLIBCXX_USE_CXX11_ABI).upper() + cxx11_abi = int(torch._C._GLIBCXX_USE_CXX11_ABI) # Determine the version numbers that will be used to determine the correct wheel # We're using the CUDA version used to build torch, not the one currently installed @@ -85,7 +85,7 @@ def get_wheel_url(): cuda_version = f'{cuda_version.major}' # Determine wheel URL based on CUDA version, torch version, python version and OS - wheel_filename = f'deep_gemm-{deep_gemm_version}+cu{cuda_version}torch{torch_version}cxx11abi{cxx11_abi}-{python_version}-{python_version}-{platform_name}.whl' + wheel_filename = f'deep_gemm-{deep_gemm_version}+cu{cuda_version}-torch{torch_version}-cxx11abi{cxx11_abi}-{python_version}-{platform_name}.whl' wheel_url = base_wheel_url.format(tag_name=f'v{deep_gemm_version}', wheel_name=wheel_filename) return wheel_url, wheel_filename @@ -149,7 +149,10 @@ def run(self): wheel_url, wheel_filename = get_wheel_url() print(f'Try to download wheel from URL: {wheel_url}') try: - urllib.request.urlretrieve(wheel_url, wheel_filename) + with urllib.request.urlopen(wheel_url, timeout=1) as response: + with open(wheel_filename, 'wb') as out_file: + data = response.read() + out_file.write(data) # Make the archive if not os.path.exists(self.dist_dir): From 6be763bf75310eb937513b9b2b717adf13350d91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sun, 28 Sep 2025 09:19:27 +0000 Subject: [PATCH 20/26] fix DG settings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- setup.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 7c2cd199..183a9aab 100644 --- a/setup.py +++ b/setup.py @@ -18,6 +18,9 @@ from wheel.bdist_wheel import bdist_wheel as _bdist_wheel +DG_SKIP_CUDA_BUILD = int(os.getenv('DG_SKIP_CUDA_BUILD', '0')) == 1 +DG_FORCE_BUILD = int(os.getenv('DG_FORCE_BUILD', '0')) == 1 + # Compiler flags cxx_flags = ['-std=c++17', '-O3', '-fPIC', '-Wno-psabi', '-Wno-deprecated-declarations', f'-D_GLIBCXX_USE_CXX11_ABI={int(torch.compiled_with_cxx11_abi())}'] @@ -91,7 +94,7 @@ def get_wheel_url(): def get_ext_modules(): - if os.getenv('DG_SKIP_CUDA_BUILD', '0') != 0: + if DG_SKIP_CUDA_BUILD: return [] return [CUDAExtension(name='deep_gemm_cpp', @@ -143,7 +146,7 @@ def prepare_includes(self): class CachedWheelsCommand(_bdist_wheel): def run(self): - if int(os.getenv('DG_FORCE_BUILD', '0')) != 0: + if DG_FORCE_BUILD: return super().run() wheel_url, wheel_filename = get_wheel_url() From 89a63e988da0b418493b2be5322fe0cb93044b12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sun, 28 Sep 2025 09:27:40 +0000 Subject: [PATCH 21/26] DG_USE_LOCAL_VERSION MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/workflows/_build.yml | 2 +- .github/workflows/publish.yml | 2 +- setup.py | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/_build.yml b/.github/workflows/_build.yml index 18b608d0..cff80136 100644 --- a/.github/workflows/_build.yml +++ b/.github/workflows/_build.yml @@ -170,7 +170,7 @@ jobs: export MAX_JOBS=$([ "$MATRIX_CUDA_VERSION" == "129" ] && echo 1 || echo 2) export NVCC_THREADS=2 export TORCH_CUDA_ARCH_LIST="7.0 7.2 7.5 8.0 8.6 8.7 9.0+PTX" - export DG_NO_LOCAL_VERSION=${{ inputs.use-local-version && '0' || '1' }} + export DG_USE_LOCAL_VERSION=${{ inputs.use-local-version && '1' || '0' }} # 5h timeout since GH allows max 6h and we want some buffer EXIT_CODE=0 diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index bfbc868b..a7b3e6b8 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -83,7 +83,7 @@ jobs: pip install torch --index-url https://download.pytorch.org/whl/cpu - name: Build core package env: - DG_NO_LOCAL_VERSION: "1" + DG_USE_LOCAL_VERSION: "0" DG_SKIP_CUDA_BUILD: "1" run: | python setup.py sdist --dist-dir=dist diff --git a/setup.py b/setup.py index 183a9aab..61ed18a9 100644 --- a/setup.py +++ b/setup.py @@ -20,6 +20,7 @@ DG_SKIP_CUDA_BUILD = int(os.getenv('DG_SKIP_CUDA_BUILD', '0')) == 1 DG_FORCE_BUILD = int(os.getenv('DG_FORCE_BUILD', '0')) == 1 +DG_USE_LOCAL_VERSION = int(os.getenv('DG_USE_LOCAL_VERSION', '1')) == 1 # Compiler flags cxx_flags = ['-std=c++17', '-O3', '-fPIC', '-Wno-psabi', '-Wno-deprecated-declarations', @@ -57,7 +58,7 @@ def get_package_version(): public_version = ast.literal_eval(version_match.group(1)) revision = '' - if int(os.getenv('DG_NO_LOCAL_VERSION', '0')) == 0: + if DG_USE_LOCAL_VERSION: # noinspection PyBroadException try: cmd = ['git', 'rev-parse', '--short', 'HEAD'] From e30725e68fa94aaaca0aeb880b8f47cf48afdc8b Mon Sep 17 00:00:00 2001 From: Chenggang Zhao Date: Fri, 10 Oct 2025 18:02:29 +0800 Subject: [PATCH 22/26] Update version --- deep_gemm/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deep_gemm/__init__.py b/deep_gemm/__init__.py index d9054b68..a4633ae6 100644 --- a/deep_gemm/__init__.py +++ b/deep_gemm/__init__.py @@ -84,4 +84,4 @@ def _find_cuda_home() -> str: _find_cuda_home() # CUDA home ) -__version__ = '2.0.0' +__version__ = '2.1.1' From e9674db8d93c7b8997ec9c86fb5fd2d91382f583 Mon Sep 17 00:00:00 2001 From: Chenggang Zhao Date: Fri, 10 Oct 2025 18:14:38 +0800 Subject: [PATCH 23/26] Detect local changes --- setup.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 61ed18a9..ebd3f728 100644 --- a/setup.py +++ b/setup.py @@ -56,15 +56,21 @@ def get_package_version(): with open(Path(current_dir) / 'deep_gemm' / '__init__.py', 'r') as f: version_match = re.search(r'^__version__\s*=\s*(.*)$', f.read(), re.MULTILINE) public_version = ast.literal_eval(version_match.group(1)) - revision = '' + revision = '' if DG_USE_LOCAL_VERSION: # noinspection PyBroadException try: + status_cmd = ['git', 'status', '--porcelain'] + status_output = subprocess.check_output(status_cmd).decode('ascii').strip() + if status_output: + print(f'Warning: Git working directory is not clean. Uncommitted changes:\n{status_output}') + assert False, 'Git working directory is not clean' + cmd = ['git', 'rev-parse', '--short', 'HEAD'] revision = '+' + subprocess.check_output(cmd).decode('ascii').rstrip() except: - revision = '' + revision = '+local' return f'{public_version}{revision}' From 28c26b9c4bfc0c64289627ffc15cb49ec8122813 Mon Sep 17 00:00:00 2001 From: Chenggang Zhao Date: Fri, 10 Oct 2025 18:21:01 +0800 Subject: [PATCH 24/26] Minor fix --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index ebd3f728..aaada624 100644 --- a/setup.py +++ b/setup.py @@ -153,7 +153,7 @@ def prepare_includes(self): class CachedWheelsCommand(_bdist_wheel): def run(self): - if DG_FORCE_BUILD: + if DG_FORCE_BUILD or DG_USE_LOCAL_VERSION: return super().run() wheel_url, wheel_filename = get_wheel_url() From c90ccc5b95273926104bef3c755b51f81dda4cbb Mon Sep 17 00:00:00 2001 From: Chenggang Zhao Date: Fri, 10 Oct 2025 18:21:45 +0800 Subject: [PATCH 25/26] Revert CUTLASS --- third-party/cutlass | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third-party/cutlass b/third-party/cutlass index a49a78ff..f3fde583 160000 --- a/third-party/cutlass +++ b/third-party/cutlass @@ -1 +1 @@ -Subproject commit a49a78ffefc86a87160dfe0ccc3a3a2d1622c918 +Subproject commit f3fde58372d33e9a5650ba7b80fc48b3b49d40c8 From f69cd3b51e6dc02b56d129530427fe270d9c1d76 Mon Sep 17 00:00:00 2001 From: Chenggang Zhao Date: Fri, 10 Oct 2025 18:23:15 +0800 Subject: [PATCH 26/26] Unify options --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index aaada624..01a72ebb 100644 --- a/setup.py +++ b/setup.py @@ -21,11 +21,12 @@ DG_SKIP_CUDA_BUILD = int(os.getenv('DG_SKIP_CUDA_BUILD', '0')) == 1 DG_FORCE_BUILD = int(os.getenv('DG_FORCE_BUILD', '0')) == 1 DG_USE_LOCAL_VERSION = int(os.getenv('DG_USE_LOCAL_VERSION', '1')) == 1 +DG_JIT_USE_RUNTIME_API = int(os.environ.get('DG_JIT_USE_RUNTIME_API', '0')) == 1 # Compiler flags cxx_flags = ['-std=c++17', '-O3', '-fPIC', '-Wno-psabi', '-Wno-deprecated-declarations', f'-D_GLIBCXX_USE_CXX11_ABI={int(torch.compiled_with_cxx11_abi())}'] -if int(os.environ.get('DG_JIT_USE_RUNTIME_API', '0')): +if DG_JIT_USE_RUNTIME_API: cxx_flags.append('-DDG_JIT_USE_RUNTIME_API') # Sources