diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 59cb0a9..89156f8 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -73,6 +73,12 @@ jobs:
         with:
             python-version: ${{ matrix.python-version }}
 
+      - name: Set CUDA and PyTorch versions  # also exports the Python version
+        run: |  # strips the dots: CUDA/Python -> "<major><minor>", PyTorch -> "<major>.<minor>"
+          echo "MATRIX_CUDA_VERSION=$(echo ${{ matrix.cuda-version }} | awk -F. '{print $1 $2}')" >> "$GITHUB_ENV"
+          echo "MATRIX_TORCH_VERSION=$(echo ${{ matrix.pytorch-version }} | awk -F. '{print $1 "." $2}')" >> "$GITHUB_ENV"
+          echo "MATRIX_PYTHON_VERSION=$(echo ${{ matrix.python-version }} | awk -F. '{print $1 $2}')" >> "$GITHUB_ENV"
+
       - name: Install CUDA ${{ matrix.cuda-version }}
         run: |
           bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }}
diff --git a/.github/workflows/scripts/build.sh b/.github/workflows/scripts/build.sh
index 908bd83..c491b92 100644
--- a/.github/workflows/scripts/build.sh
+++ b/.github/workflows/scripts/build.sh
@@ -16,4 +16,9 @@ export MAX_JOBS=1
 # Make sure release wheels are built for the following architectures
 export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
 # Build
-$python_executable setup.py $3 --dist-dir=dist
+if [ "$3" = "sdist" ]  # source distribution: copy raw files only, no CUDA compilation
+then
+    MINFERENCE_SKIP_CUDA_BUILD="TRUE" $python_executable setup.py $3 --dist-dir=dist
+else  # any other command: force a local build tagged with the CUDA/PyTorch versions
+    MINFERENCE_LOCAL_VERSION=cu${MATRIX_CUDA_VERSION}torch${MATRIX_TORCH_VERSION} MINFERENCE_FORCE_BUILD="TRUE" $python_executable setup.py $3 --dist-dir=dist
+fi
diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml
index b491516..7cc737f 100644
--- a/.github/workflows/unittest.yml
+++ b/.github/workflows/unittest.yml
@@ -1,7 +1,10 @@
 name: Unit Test
 
 # see: https://help.github.com/en/actions/reference/events-that-trigger-workflows
-on: [] # Trigger the workflow on pull request or merge
+on:
+  push:
+    branches:
+      - 'test/**'  # only pushes to branches matching test/** trigger this workflow
   # pull_request:
   # merge_group:
   #   types: [checks_requested]
diff --git a/.gitignore b/.gitignore
index 24a0420..16033cd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -414,3 +414,4 @@ __pycache__
 build/
 *.egg-info/
 *.so
+dist
diff --git a/README.md b/README.md
index 26209ca..ed3233b 100644
--- a/README.md
+++ b/README.md
@@ -8,14 +8,14 @@
 
 <p align="center">
     | <a href="https://aka.ms/MInference"><b>Project Page</b></a> |
-    <a href="https://arxiv.org/abs/2407.02490"><b>Paper</b></a> |
+    <a href="https://export.arxiv.org/pdf/2407.02490"><b>Paper</b></a> |
     <a href="https://huggingface.co/spaces/microsoft/MInference"><b>HF Demo</b></a> |
 </p>
 
 https://github.com/microsoft/MInference/assets/30883354/52613efc-738f-4081-8367-7123c81d6b19
 
 ## News
-- 📃 [24/07/03] Due to an issue with arXiv, the PDF is currently unavailable there. You can find the paper at this [link](https://github.com/microsoft/MInference/blob/main/papers/MInference1_Arxiv.pdf)..
+- 📃 [24/07/03] Due to an issue with arXiv, the PDF is currently unavailable on the main arXiv site. In the meantime, you can find the paper at this [link](https://export.arxiv.org/pdf/2407.02490).
 - 🧩 [24/07/03] We will present **MInference 1.0** at the _**Microsoft Booth**_ and _**ES-FoMo**_ at ICML'24. See you in Vienna!
 
 ## TL;DR
diff --git a/minference/configs/__init__.py b/minference/configs/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/minference/modules/__init__.py b/minference/modules/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/minference/ops/__init__.py b/minference/ops/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/minference/version.py b/minference/version.py
index b6245f9..d8a5922 100644
--- a/minference/version.py
+++ b/minference/version.py
@@ -5,7 +5,7 @@
 _MINOR = "1"
 # On master and in a nightly release the patch should be one ahead of the last
 # released build.
-_PATCH = "0"
+_PATCH = "1"
 # This is mainly for nightly builds which have the suffix ".dev$DATE". See
 # https://semver.org/#is-v123-a-semantic-version for the semantics.
 _SUFFIX = ""
diff --git a/setup.py b/setup.py
index e5147c5..a312a21 100644
--- a/setup.py
+++ b/setup.py
@@ -8,6 +8,7 @@
 from packaging.version import Version, parse
 from setuptools import find_packages, setup
 from torch.utils.cpp_extension import CUDA_HOME, BuildExtension, CUDAExtension
+from wheel.bdist_wheel import bdist_wheel as _bdist_wheel
 
 # PEP0440 compatible formatted version, see:
 # https://www.python.org/dev/peps/pep-0440/
@@ -46,52 +47,117 @@
 ]
 DEV_REQUIRES = INSTALL_REQUIRES + QUANLITY_REQUIRES
 
-MAIN_CUDA_VERSION = "12.1"
 
+# ninja build does not work unless include_dirs are abs path
+this_dir = os.path.dirname(os.path.abspath(__file__))
 
-def _is_cuda() -> bool:
-    return torch.version.cuda is not None
+PACKAGE_NAME = "minference"
 
+BASE_WHEEL_URL = (
+    "https://github.com/microsoft/MInference/releases/download/{tag_name}/{wheel_name}"
+)
 
-def get_nvcc_cuda_version() -> Version:
-    """Get the CUDA version from nvcc.
+# FORCE_BUILD: Force a fresh build locally, instead of attempting to find prebuilt wheels
+# SKIP_CUDA_BUILD: Intended to allow CI to use a simple `python setup.py sdist` run to copy over raw files, without any cuda compilation
+FORCE_BUILD = os.getenv("MINFERENCE_FORCE_BUILD", "FALSE") == "TRUE"  # NOTE(review): parsed but not read anywhere in the visible changes
+SKIP_CUDA_BUILD = os.getenv("MINFERENCE_SKIP_CUDA_BUILD", "FALSE") == "TRUE"  # each flag is enabled only by the exact string "TRUE"
+# For CI, we want the option to build with C++11 ABI since the nvcr images use C++11 ABI
+FORCE_CXX11_ABI = os.getenv("MINFERENCE_FORCE_CXX11_ABI", "FALSE") == "TRUE"
+
+
+def check_if_cuda_home_none(global_option: str) -> None:
+    """Emit a warning naming ``global_option`` when ``CUDA_HOME`` is ``None``; otherwise do nothing."""
+    # Only a warning, not an error: a user who is downloading prebuilt wheels does not need nvcc.
+    if CUDA_HOME is None:
+        nvcc_missing_msg = (
+            f"{global_option} was requested, but nvcc was not found.  Are you sure your environment has nvcc available?  "
+            "If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, "
+            "only images whose names contain 'devel' will provide nvcc."
+        )
+        warnings.warn(nvcc_missing_msg)
 
-    Adapted from https://github.com/NVIDIA/apex/blob/8b7a1ff183741dd8f9b87e7bafd04cfde99cea28/setup.py
-    """
-    assert CUDA_HOME is not None, "CUDA_HOME is not set"
-    nvcc_output = subprocess.check_output(
-        [CUDA_HOME + "/bin/nvcc", "-V"], universal_newlines=True
+
+cmdclass = {}
+ext_modules = []
+
+if not SKIP_CUDA_BUILD:
+    print("\n\ntorch.__version__  = {}\n\n".format(torch.__version__))
+    TORCH_MAJOR = int(torch.__version__.split(".")[0])  # NOTE(review): TORCH_MAJOR/TORCH_MINOR are not read in the visible changes
+    TORCH_MINOR = int(torch.__version__.split(".")[1])
+
+    # Check, if ATen/CUDAGeneratorImpl.h is found, otherwise use ATen/cuda/CUDAGeneratorImpl.h
+    # See https://github.com/pytorch/pytorch/pull/70650
+    generator_flag = []  # NOTE(review): computed here but not passed to the CUDAExtension below
+    torch_dir = torch.__path__[0]
+    if os.path.exists(
+        os.path.join(torch_dir, "include", "ATen", "CUDAGeneratorImpl.h")
+    ):
+        generator_flag = ["-DOLD_GENERATOR_PATH"]
+
+    check_if_cuda_home_none("minference")  # warns (does not fail) when CUDA_HOME is None
+
+    # HACK: The compiler flag -D_GLIBCXX_USE_CXX11_ABI is set to be the same as
+    # torch._C._GLIBCXX_USE_CXX11_ABI
+    # https://github.com/pytorch/pytorch/blob/8472c24e3b5b60150096486616d98b7bea01500b/torch/utils/cpp_extension.py#L920
+    if FORCE_CXX11_ABI:
+        torch._C._GLIBCXX_USE_CXX11_ABI = True
+    ext_modules.append(
+        CUDAExtension(
+            name="minference.cuda",
+            sources=[
+                os.path.join("csrc", "kernels.cpp"),
+                os.path.join("csrc", "vertical_slash_index.cu"),
+            ],
+            extra_compile_args=["-std=c++17", "-O3"],
+        )
     )
-    output = nvcc_output.split()
-    release_idx = output.index("release") + 1
-    nvcc_cuda_version = parse(output[release_idx].split(",")[0])
-    return nvcc_cuda_version
 
 
 def get_minference_version() -> str:
     version = VERSION["VERSION"]
 
-    if _is_cuda():
-        cuda_version = str(get_nvcc_cuda_version())
-        if cuda_version != MAIN_CUDA_VERSION:
-            cuda_version_str = cuda_version.replace(".", "")[:3]
-            version += f"+cu{cuda_version_str}"
+    local_version = os.environ.get("MINFERENCE_LOCAL_VERSION")  # optional suffix appended after "+", e.g. the cu...torch... tag set in build.sh
+    if local_version:
+        return f"{version}+{local_version}"
     else:
-        raise RuntimeError("Unknown runtime environment")
+        return str(version)
 
-    return version
 
+class CachedWheelsCommand(_bdist_wheel):
+    """
+    The CachedWheelsCommand plugs into the default bdist_wheel command, which is run by pip when it
+    cannot find an existing wheel for MInference. It is meant as the hook for detecting an already
+    pre-built compatible wheel and short-circuiting the standard full build pipeline.
+    NOTE(review): ``run`` currently just delegates to the default bdist_wheel implementation.
+    """
+
+    def run(self):
+        return super().run()
+
+
+class NinjaBuildExtension(BuildExtension):
+    """BuildExtension that sets a default ``MAX_JOBS`` from CPU cores and free memory when it is unset."""
+
+    def __init__(self, *args, **kwargs) -> None:
+        # do not override env MAX_JOBS if already exists
+        if not os.environ.get("MAX_JOBS"):
+            import psutil
+
+            # calculate the maximum allowed NUM_JOBS based on cores;
+            # os.cpu_count() may return None, so fall back to a single core
+            max_num_jobs_cores = max(1, (os.cpu_count() or 1) // 2)
+
+            # calculate the maximum allowed NUM_JOBS based on free memory (in GB);
+            # each JOB peak memory cost is ~8-9GB when threads = 4
+            free_memory_gb = psutil.virtual_memory().available / (1024**3)
+            max_num_jobs_memory = int(free_memory_gb / 9)
+
+            # pick lower value of jobs based on cores vs memory metric to minimize oom and swap usage during compilation
+            max_jobs = max(1, min(max_num_jobs_cores, max_num_jobs_memory))
+            os.environ["MAX_JOBS"] = str(max_jobs)
+
+        super().__init__(*args, **kwargs)
 
-ext_modules = [
-    CUDAExtension(
-        name="minference.cuda",
-        sources=[
-            os.path.join("csrc", "kernels.cpp"),
-            os.path.join("csrc", "vertical_slash_index.cu"),
-        ],
-        extra_compile_args=["-std=c++17", "-O3"],
-    )
-]
 
 setup(
     name="minference",
@@ -110,7 +176,6 @@ def get_minference_version() -> str:
         "Programming Language :: Python :: 3",
         "Topic :: Scientific/Engineering :: Artificial Intelligence",
     ],
-    package_dir={"": "."},
     packages=find_packages(
         exclude=(
             "csrc",
@@ -136,5 +201,9 @@ def get_minference_version() -> str:
     python_requires=">=3.8.0",
     zip_safe=False,
     ext_modules=ext_modules,
-    cmdclass={"build_ext": BuildExtension},
+    cmdclass={"bdist_wheel": CachedWheelsCommand, "build_ext": NinjaBuildExtension}
+    if ext_modules
+    else {
+        "bdist_wheel": CachedWheelsCommand,
+    },
 )