fixed fused_rope naming in JIT + added readme for amd support (#1224)
R0n12 committed May 21, 2024
1 parent 2746d43 commit 1d55708
Showing 3 changed files with 14 additions and 7 deletions.
11 changes: 10 additions & 1 deletion README.md
@@ -96,7 +96,6 @@ To install the remaining basic dependencies, run:
pip install -r requirements/requirements.txt
pip install -r requirements/requirements-wandb.txt # optional, if logging using WandB
pip install -r requirements/requirements-tensorboard.txt # optional, if logging via tensorboard
python ./megatron/fused_kernels/setup.py install # optional, if using fused kernels
```

from the repository root.
@@ -106,6 +105,16 @@ from the repository root.
</aside>

### Fused Kernels
We now support AMD GPUs (MI100, MI250X) through JIT compilation of the fused kernels. Fused kernels are built and loaded on demand; to avoid waiting at job launch, you can also pre-build them manually:

```python
# from a Python shell launched at the repository root:
from megatron.fused_kernels import load
load()
```
This automatically adapts the build process to the detected GPU vendor (AMD or NVIDIA) without platform-specific code changes. To test the fused kernels with `pytest`, run `pytest tests/model/test_fused_kernels.py`.
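
The vendor-agnostic behaviour comes from PyTorch's JIT extension builder, which invokes nvcc on NVIDIA and hipcc (hipifying the `.cu` sources) on ROCm builds. A minimal sketch of this pattern, with an assumed helper name and illustrative flags rather than the exact `megatron.fused_kernels` internals:

```python
# Illustrative sketch only: torch.utils.cpp_extension.load() picks the right
# toolchain (nvcc or hipcc) for the installed PyTorch build, so the caller
# needs no vendor-specific branches.
import torch
from torch.utils import cpp_extension

def jit_build_example(name, sources, build_dir):
    extra_cuda_flags = ["-O3"]
    if torch.version.hip is not None:
        # Hypothetical ROCm-specific flag, e.g. targeting MI250X (gfx90a).
        extra_cuda_flags.append("--offload-arch=gfx90a")
    return cpp_extension.load(
        name=name,
        sources=sources,
        extra_cuda_cflags=extra_cuda_flags,
        build_directory=build_dir,
        verbose=True,
    )
```

Since the compiled extension is cached in the build directory, later launches reuse the artifact instead of rebuilding it.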

### Flash Attention

To use [Flash-Attention](https://github.com/HazyResearch/flash-attention), install the additional dependencies in `./requirements/requirements-flashattention.txt` and set the attention type in your configuration accordingly (see [configs](./configs/)). This can provide significant speed-ups over regular attention on certain GPU architectures, including Ampere GPUs (such as A100s); see the repository for more details.
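
As a quick sanity check before enabling it, you can verify that the `flash_attn` package installed by that requirements file is importable; the config fragment in the trailing comment is only an assumed neox-style `attention_config` entry, so consult [configs](./configs/) for the exact key and accepted values.

```python
# Hypothetical pre-flight check, not part of the repository.
import importlib.util

if importlib.util.find_spec("flash_attn") is None:
    raise RuntimeError(
        "flash-attn not found; run "
        "`pip install -r requirements/requirements-flashattention.txt` first"
    )

# Assumed config fragment (check ./configs/ for the real syntax):
# "attention_config": [[["flash"], 24]]   # flash attention for all 24 layers
```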
6 changes: 3 additions & 3 deletions megatron/fused_kernels/__init__.py
@@ -135,8 +135,8 @@ def _cpp_extention_load_helper(
srcpath / "fused_rotary_positional_embedding.cpp",
srcpath / "fused_rotary_positional_embedding_cuda.cu",
]
fused_rotary_positional_embedding_cuda = _cpp_extention_load_helper(
"fused_rotary_positional_embedding_cuda",
fused_rotary_positional_embedding = _cpp_extention_load_helper(
"fused_rotary_positional_embedding",
sources,
extra_cuda_flags,
extra_include_paths,
@@ -174,7 +174,7 @@ def load_fused_kernels():
print(e)
print("=" * 100)
print(
f"ERROR: Fused kernels configured but not properly installed. Please run `pip install {str(srcpath)}` to install them"
f"ERROR: Fused kernels configured but not properly installed. Please run `from megatron.fused_kernels import load` and then `load()` to load them correctly"
)
print("=" * 100)
exit()
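
The rename above matters because the `name` passed to the JIT builder is the module name the compiled extension is imported under elsewhere in the code. A rough sketch of that coupling, with an assumed helper signature (only `torch.utils.cpp_extension.load` is a real API here):

```python
# Sketch only; the real helper lives in megatron/fused_kernels/__init__.py.
from torch.utils import cpp_extension

def _load_helper_sketch(name, sources, extra_cuda_flags, extra_include_paths, build_dir):
    # The extension becomes importable under `name`, so it must match what
    # the Python-side rotary-embedding wrapper imports.
    return cpp_extension.load(
        name=name,
        sources=sources,
        extra_cuda_cflags=["-O3"] + extra_cuda_flags,
        extra_include_paths=extra_include_paths,
        build_directory=build_dir,
        verbose=True,
    )

# Wrapper side (assumed): `import fused_rotary_positional_embedding` would fail
# if the module had been built as "fused_rotary_positional_embedding_cuda",
# which is the mismatch this commit fixes.
```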
4 changes: 1 addition & 3 deletions tests/model/test_fused_kernels.py
@@ -30,9 +30,7 @@
)


@pytest.mark.xfail(
reason="ModuleNotFoundError: No module named 'scaled_masked_softmax_cuda'"
)
@pytest.mark.xfail(reason="SystemExit: None")
def test_load_fused_kernels():
load()
try:
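
The updated `xfail` reason follows from the error path shown in the previous file: the failure branch calls `exit()`, which raises `SystemExit`, so pytest now records `SystemExit: None` instead of a `ModuleNotFoundError` for the old pre-compiled `scaled_masked_softmax_cuda` module. A small illustration of that expectation (test name is hypothetical, and it assumes an environment where the kernels are not built):

```python
import pytest

from megatron.fused_kernels import load

# The failure path in megatron.fused_kernels calls exit(), which raises
# SystemExit, hence the new xfail reason "SystemExit: None".
@pytest.mark.xfail(reason="SystemExit: None")
def test_load_fused_kernels_sketch():
    load()
```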
