Hotfix(MInference): fix the configs in pip (#14)

Co-authored-by: Yucheng Li <[email protected]>
Co-authored-by: Chengruidong Zhang <[email protected]>
microsoft · Jul 5, 2024 · 1c2bf70 · 1c2bf70
1 parent 00666fb
commit 1c2bf70
Show file tree

Hide file tree

Showing 3 changed files with 9 additions and 2 deletions.
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -1,2 +1,4 @@
 recursive-include csrc *.cu
 recursive-include csrc *.cpp
+
+recursive-include minference *.json
diff --git a/README.md b/README.md
@@ -80,6 +80,7 @@ pipe(prompt, max_length=10)
 ```
 
 for vLLM,
+> For now, please use vllm==0.4.1
 
 ```diff
 from vllm import LLM, SamplingParams

diff --git a/minference/modules/minference_forward.py b/minference/modules/minference_forward.py
@@ -4,13 +4,17 @@
 import inspect
 import json
 import os
+import warnings
 from importlib import import_module
 
 from transformers.models.llama.modeling_llama import *
 from transformers.utils.import_utils import _is_package_available
 
 if _is_package_available("vllm"):
-    from vllm.attention.backends.flash_attn import *
+    try:
+        from vllm.attention.backends.flash_attn import *
+    except:
+        warnings.warn("Only support 'vllm==0.4.1'. Please update your vllm version.")
 
 from ..ops.block_sparse_flash_attention import block_sparse_attention
 from ..ops.pit_sparse_flash_attention_v2 import vertical_slash_sparse_attention
@@ -768,7 +772,7 @@ def forward(
         key: torch.Tensor,
         value: torch.Tensor,
         kv_cache: torch.Tensor,
-        attn_metadata: AttentionMetadata[FlashAttentionMetadata],
+        attn_metadata,
         kv_scale: float,
         layer_idx: int,
     ) -> torch.Tensor: