diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 59cb0a9..89156f8 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -73,6 +73,12 @@ jobs: with: python-version: ${{ matrix.python-version }} + - name: Set CUDA and PyTorch versions + run: | + echo "MATRIX_CUDA_VERSION=$(echo ${{ matrix.cuda-version }} | awk -F \. {'print $1 $2'})" >> $GITHUB_ENV + echo "MATRIX_TORCH_VERSION=$(echo ${{ matrix.pytorch-version }} | awk -F \. {'print $1 "." $2'})" >> $GITHUB_ENV + echo "MATRIX_PYTHON_VERSION=$(echo ${{ matrix.python-version }} | awk -F \. {'print $1 $2'})" >> $GITHUB_ENV + - name: Install CUDA ${{ matrix.cuda-version }} run: | bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }} diff --git a/.github/workflows/scripts/build.sh b/.github/workflows/scripts/build.sh index 908bd83..c491b92 100644 --- a/.github/workflows/scripts/build.sh +++ b/.github/workflows/scripts/build.sh @@ -16,4 +16,9 @@ export MAX_JOBS=1 # Make sure release wheels are built for the following architectures export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX" # Build -$python_executable setup.py $3 --dist-dir=dist +if [ "$3" = sdist ]; +then +MINFERENCE_SKIP_CUDA_BUILD="TRUE" $python_executable setup.py $3 --dist-dir=dist +else +MINFERENCE_LOCAL_VERSION=cu${MATRIX_CUDA_VERSION}torch${MATRIX_TORCH_VERSION} MINFERENCE_FORCE_BUILD="TRUE" $python_executable setup.py $3 --dist-dir=dist +fi diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml index b491516..7cc737f 100644 --- a/.github/workflows/unittest.yml +++ b/.github/workflows/unittest.yml @@ -1,7 +1,10 @@ name: Unit Test # see: https://help.github.com/en/actions/reference/events-that-trigger-workflows -on: [] # Trigger the workflow on pull request or merge +on: + push: + branches: + - 'test/**' # pull_request: # merge_group: # types: [checks_requested] diff --git a/.gitignore b/.gitignore index 24a0420..16033cd 100644 --- a/.gitignore +++ b/.gitignore @@ -414,3 +414,4 @@ __pycache__ build/ *.egg-info/ *.so +dist diff --git a/README.md b/README.md index 26209ca..ed3233b 100644 --- a/README.md +++ b/README.md @@ -8,14 +8,14 @@
| Project Page | - Paper | + Paper | HF Demo |
https://github.com/microsoft/MInference/assets/30883354/52613efc-738f-4081-8367-7123c81d6b19 ## News -- 📃 [24/07/03] Due to an issue with arXiv, the PDF is currently unavailable there. You can find the paper at this [link](https://github.com/microsoft/MInference/blob/main/papers/MInference1_Arxiv.pdf).. +- 📃 [24/07/03] Due to an issue with arXiv, the PDF is currently unavailable there. You can find the paper at this [link](https://export.arxiv.org/pdf/2407.02490). - 🧩 [24/07/03] We will present **MInference 1.0** at the _**Microsoft Booth**_ and _**ES-FoMo**_ at ICML'24. See you in Vienna! ## TL;DR diff --git a/minference/configs/__init__.py b/minference/configs/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/minference/modules/__init__.py b/minference/modules/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/minference/ops/__init__.py b/minference/ops/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/minference/version.py b/minference/version.py index b6245f9..d8a5922 100644 --- a/minference/version.py +++ b/minference/version.py @@ -5,7 +5,7 @@ _MINOR = "1" # On master and in a nightly release the patch should be one ahead of the last # released build. -_PATCH = "0" +_PATCH = "1" # This is mainly for nightly builds which have the suffix ".dev$DATE". See # https://semver.org/#is-v123-a-semantic-version for the semantics. _SUFFIX = "" diff --git a/setup.py b/setup.py index e5147c5..a312a21 100644 --- a/setup.py +++ b/setup.py @@ -8,6 +8,7 @@ from packaging.version import Version, parse from setuptools import find_packages, setup from torch.utils.cpp_extension import CUDA_HOME, BuildExtension, CUDAExtension +from wheel.bdist_wheel import bdist_wheel as _bdist_wheel # PEP0440 compatible formatted version, see: # https://www.python.org/dev/peps/pep-0440/ @@ -46,52 +47,117 @@ ] DEV_REQUIRES = INSTALL_REQUIRES + QUANLITY_REQUIRES -MAIN_CUDA_VERSION = "12.1" +# ninja build does not work unless include_dirs are abs path +this_dir = os.path.dirname(os.path.abspath(__file__)) -def _is_cuda() -> bool: - return torch.version.cuda is not None +PACKAGE_NAME = "minference" +BASE_WHEEL_URL = ( + "https://github.com/microsoft/MInference/releases/download/{tag_name}/{wheel_name}" +) -def get_nvcc_cuda_version() -> Version: - """Get the CUDA version from nvcc. +# FORCE_BUILD: Force a fresh build locally, instead of attempting to find prebuilt wheels +# SKIP_CUDA_BUILD: Intended to allow CI to use a simple `python setup.py sdist` run to copy over raw files, without any cuda compilation +FORCE_BUILD = os.getenv("MINFERENCE_FORCE_BUILD", "FALSE") == "TRUE" +SKIP_CUDA_BUILD = os.getenv("MINFERENCE_SKIP_CUDA_BUILD", "FALSE") == "TRUE" +# For CI, we want the option to build with C++11 ABI since the nvcr images use C++11 ABI +FORCE_CXX11_ABI = os.getenv("MINFERENCE_FORCE_CXX11_ABI", "FALSE") == "TRUE" + + +def check_if_cuda_home_none(global_option: str) -> None: + if CUDA_HOME is not None: + return + # warn instead of error because user could be downloading prebuilt wheels, so nvcc won't be necessary + # in that case. + warnings.warn( + f"{global_option} was requested, but nvcc was not found. Are you sure your environment has nvcc available? " + "If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, " + "only images whose names contain 'devel' will provide nvcc." + ) - Adapted from https://github.com/NVIDIA/apex/blob/8b7a1ff183741dd8f9b87e7bafd04cfde99cea28/setup.py - """ - assert CUDA_HOME is not None, "CUDA_HOME is not set" - nvcc_output = subprocess.check_output( - [CUDA_HOME + "/bin/nvcc", "-V"], universal_newlines=True + +cmdclass = {} +ext_modules = [] + +if not SKIP_CUDA_BUILD: + print("\n\ntorch.__version__ = {}\n\n".format(torch.__version__)) + TORCH_MAJOR = int(torch.__version__.split(".")[0]) + TORCH_MINOR = int(torch.__version__.split(".")[1]) + + # Check, if ATen/CUDAGeneratorImpl.h is found, otherwise use ATen/cuda/CUDAGeneratorImpl.h + # See https://github.com/pytorch/pytorch/pull/70650 + generator_flag = [] + torch_dir = torch.__path__[0] + if os.path.exists( + os.path.join(torch_dir, "include", "ATen", "CUDAGeneratorImpl.h") + ): + generator_flag = ["-DOLD_GENERATOR_PATH"] + + check_if_cuda_home_none("minference") + + # HACK: The compiler flag -D_GLIBCXX_USE_CXX11_ABI is set to be the same as + # torch._C._GLIBCXX_USE_CXX11_ABI + # https://github.com/pytorch/pytorch/blob/8472c24e3b5b60150096486616d98b7bea01500b/torch/utils/cpp_extension.py#L920 + if FORCE_CXX11_ABI: + torch._C._GLIBCXX_USE_CXX11_ABI = True + ext_modules.append( + CUDAExtension( + name="minference.cuda", + sources=[ + os.path.join("csrc", "kernels.cpp"), + os.path.join("csrc", "vertical_slash_index.cu"), + ], + extra_compile_args=["-std=c++17", "-O3"], + ) ) - output = nvcc_output.split() - release_idx = output.index("release") + 1 - nvcc_cuda_version = parse(output[release_idx].split(",")[0]) - return nvcc_cuda_version def get_minference_version() -> str: version = VERSION["VERSION"] - if _is_cuda(): - cuda_version = str(get_nvcc_cuda_version()) - if cuda_version != MAIN_CUDA_VERSION: - cuda_version_str = cuda_version.replace(".", "")[:3] - version += f"+cu{cuda_version_str}" + local_version = os.environ.get("MINFERENCE_LOCAL_VERSION") + if local_version: + return f"{version}+{local_version}" else: - raise RuntimeError("Unknown runtime environment") + return str(version) - return version +class CachedWheelsCommand(_bdist_wheel): + """ + The CachedWheelsCommand plugs into the default bdist wheel, which is ran by pip when it cannot + find an existing wheel (which is currently the case for all flash attention installs). We use + the environment parameters to detect whether there is already a pre-built version of a compatible + wheel available and short-circuits the standard full build pipeline. + """ + + def run(self): + return super().run() + + +class NinjaBuildExtension(BuildExtension): + def __init__(self, *args, **kwargs) -> None: + # do not override env MAX_JOBS if already exists + if not os.environ.get("MAX_JOBS"): + import psutil + + # calculate the maximum allowed NUM_JOBS based on cores + max_num_jobs_cores = max(1, os.cpu_count() // 2) + + # calculate the maximum allowed NUM_JOBS based on free memory + free_memory_gb = psutil.virtual_memory().available / ( + 1024**3 + ) # free memory in GB + max_num_jobs_memory = int( + free_memory_gb / 9 + ) # each JOB peak memory cost is ~8-9GB when threads = 4 + + # pick lower value of jobs based on cores vs memory metric to minimize oom and swap usage during compilation + max_jobs = max(1, min(max_num_jobs_cores, max_num_jobs_memory)) + os.environ["MAX_JOBS"] = str(max_jobs) + + super().__init__(*args, **kwargs) -ext_modules = [ - CUDAExtension( - name="minference.cuda", - sources=[ - os.path.join("csrc", "kernels.cpp"), - os.path.join("csrc", "vertical_slash_index.cu"), - ], - extra_compile_args=["-std=c++17", "-O3"], - ) -] setup( name="minference", @@ -110,7 +176,6 @@ def get_minference_version() -> str: "Programming Language :: Python :: 3", "Topic :: Scientific/Engineering :: Artificial Intelligence", ], - package_dir={"": "."}, packages=find_packages( exclude=( "csrc", @@ -136,5 +201,9 @@ def get_minference_version() -> str: python_requires=">=3.8.0", zip_safe=False, ext_modules=ext_modules, - cmdclass={"build_ext": BuildExtension}, + cmdclass={"bdist_wheel": CachedWheelsCommand, "build_ext": NinjaBuildExtension} + if ext_modules + else { + "bdist_wheel": CachedWheelsCommand, + }, )