Empty file added backend/README.md
Empty file.
Binary file added backend/__pycache__/llm.cpython-310.pyc
Binary file not shown.
121 changes: 121 additions & 0 deletions backend/llm.py
@@ -0,0 +1,121 @@
import os

from modal import Image, Secret, Stub, enter, gpu, method

MODEL_DIR = "/model"
BASE_MODEL = "mistralai/Mistral-7B-Instruct-v0.1"


def download_model_to_folder():
    from huggingface_hub import snapshot_download
    from transformers.utils import move_cache

    os.makedirs(MODEL_DIR, exist_ok=True)

    snapshot_download(
        BASE_MODEL,
        local_dir=MODEL_DIR,
        ignore_patterns=["*.pt", "*.bin"],  # Using safetensors
    )
    move_cache()

image = (
    Image.from_registry(
        "nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10"
    )
    .pip_install(
        "vllm==0.2.5",
        "huggingface_hub==0.19.4",
        "hf-transfer==0.1.4",
        "torch==2.1.2",
    )
    # Use the barebones hf-transfer package for maximum download speeds. No progress bar, but expect 700MB/s.
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
    .run_function(
        download_model_to_folder,
        # TODO: load in secret
        secrets=[Secret.from_name("huggingface-secret")],
        timeout=60 * 20,
    )
)


stub = Stub("example-vllm-inference", image=image)




GPU_CONFIG = gpu.A100(count=1) # 40GB A100 by default


@stub.cls(gpu=GPU_CONFIG, secrets=[Secret.from_name("huggingface-secret")])
class Model:
    @enter()
    def load_model(self):
        from vllm import LLM

        if GPU_CONFIG.count > 1:
            # Patch issue from https://github.com/vllm-project/vllm/issues/1116
            import ray

            ray.shutdown()
            ray.init(num_gpus=GPU_CONFIG.count)

        # Load the model. Tip: MPT models may require `trust_remote_code=True`.
        self.llm = LLM(MODEL_DIR, tensor_parallel_size=GPU_CONFIG.count)
        self.template = """<s>[INST] <<SYS>>
{system}
<</SYS>>

{user} [/INST] """

    @method()
    def generate(self, user_questions):
        import time

        from vllm import SamplingParams

        prompts = [
            self.template.format(system="", user=q) for q in user_questions
        ]

        sampling_params = SamplingParams(
            temperature=0.75,
            top_p=1,
            max_tokens=256,
            presence_penalty=1.15,
        )
        start = time.monotonic_ns()
        result = self.llm.generate(prompts, sampling_params)
        duration_s = (time.monotonic_ns() - start) / 1e9
        num_tokens = 0

        COLOR = {
            "HEADER": "\033[95m",
            "BLUE": "\033[94m",
            "GREEN": "\033[92m",
            "RED": "\033[91m",
            "ENDC": "\033[0m",
        }

        for output in result:
            num_tokens += len(output.outputs[0].token_ids)
            print(
                f"{COLOR['HEADER']}{COLOR['GREEN']}{output.prompt}",
                f"\n{COLOR['BLUE']}{output.outputs[0].text}",
                "\n\n",
                sep=COLOR["ENDC"],
            )
            time.sleep(0.01)
        print(
            f"{COLOR['HEADER']}{COLOR['GREEN']}Generated {num_tokens} tokens from {BASE_MODEL} in {duration_s:.1f} seconds, throughput = {num_tokens / duration_s:.0f} tokens/second on {GPU_CONFIG}.{COLOR['ENDC']}"
        )


@stub.local_entrypoint()
def main():
    model = Model()
    questions = [
        "Implement a Python function to compute the Fibonacci numbers.",
    ]
    model.generate.remote(questions)
14 changes: 14 additions & 0 deletions backend/main.py
@@ -0,0 +1,14 @@
import modal

stub = modal.Stub("example-get-started")


@stub.function()
def square(x):
    print("This code is running on a remote worker!")
    return x**2


@stub.local_entrypoint()
def main():
    print("the square is", square.remote(42))
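
Usage note: both stubs are typically driven through the Modal CLI, e.g. `modal run backend/main.py` or `modal run backend/llm.py` (paths assume the layout added in this PR). `modal run` executes the function decorated with `@stub.local_entrypoint()` locally and dispatches the `.remote(...)` calls to Modal's cloud workers.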