diff --git a/.buildkite/run-neuron-test.sh b/.buildkite/run-neuron-test.sh
new file mode 100644
index 0000000000000..8ba03b78e8dbf
--- /dev/null
+++ b/.buildkite/run-neuron-test.sh
@@ -0,0 +1,37 @@
+# This script builds the Neuron docker image and runs the API server inside the container.
+# It serves as a sanity check for compilation and basic model usage.
+set -e
+
+# Try building the docker image
+aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
+docker build -t neuron -f Dockerfile.neuron .
+
+# Setup cleanup
+remove_docker_container() { docker rm -f neuron || true; }
+trap remove_docker_container EXIT
+remove_docker_container
+
+# Run the image
+docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \
+       --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 &
+
+# Wait for the server to start; fail the job if it never becomes healthy
+wait_for_server_to_start() {
+    timeout=300
+    counter=0
+
+    while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do
+        sleep 1
+        counter=$((counter + 1))
+        if [ "$counter" -ge "$timeout" ]; then
+            echo "Timeout after $timeout seconds"
+            return 1
+        fi
+    done
+}
+wait_for_server_to_start
+
+# Test a simple prompt (--fail turns an HTTP error status into a non-zero exit)
+curl --fail -X POST -H "Content-Type: application/json" \
+     localhost:8000/generate \
+     -d '{"prompt": "San Francisco is a"}'
diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2
index 0e1acc9777d4b..fb1086db77823 100644
--- a/.buildkite/test-template.j2
+++ b/.buildkite/test-template.j2
@@ -21,6 +21,11 @@ steps:
       queue: amd
     command: bash .buildkite/run-amd-test.sh
 
+  - label: "Neuron Test"
+    agents:
+      queue: neuron
+    command: bash .buildkite/run-neuron-test.sh
+
   - label: "CPU Test"
     command: bash .buildkite/run-cpu-test.sh
 
diff --git a/Dockerfile.neuron b/Dockerfile.neuron
new file mode 100644
index 0000000000000..fe42b4ef393f1
--- /dev/null
+++ b/Dockerfile.neuron
@@ -0,0 +1,38 @@
+# default base image
+ARG BASE_IMAGE="763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-inference-neuronx:2.1.1-neuronx-py310-sdk2.17.0-ubuntu20.04"
+
+FROM $BASE_IMAGE
+
+# An ARG declared before FROM is not visible inside the build stage; re-declare it.
+ARG BASE_IMAGE
+RUN echo "Base image is $BASE_IMAGE"
+
+# Install some basic utilities
+RUN apt-get update && apt-get install python3 python3-pip -y
+
+### Mount Point ###
+# When launching the container, mount the code directory to /app
+ARG APP_MOUNT=/app
+VOLUME [ ${APP_MOUNT} ]
+WORKDIR ${APP_MOUNT}
+
+RUN python3 -m pip install --upgrade pip
+RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
+RUN python3 -m pip install sentencepiece transformers==4.36.2 -U
+RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
+RUN python3 -m pip install --pre neuronx-cc==2.12.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
+
+COPY ./vllm /app/vllm/vllm
+COPY ./setup.py /app/vllm/setup.py
+COPY ./requirements-common.txt /app/vllm/requirements-common.txt
+COPY ./requirements-neuron.txt /app/vllm/requirements-neuron.txt
+
+RUN cd /app/vllm \
+    && python3 -m pip install -U -r requirements-neuron.txt
+
+ENV VLLM_BUILD_WITH_NEURON=1
+RUN cd /app/vllm \
+    && pip install -e . \
+    && cd ..
+
+CMD ["/bin/bash"]
diff --git a/setup.py b/setup.py
index 19a9150ad2e64..4b672e1af8494 100644
--- a/setup.py
+++ b/setup.py
@@ -204,7 +204,10 @@ def _is_neuron() -> bool:
         subprocess.run(["neuron-ls"], capture_output=True, check=True)
     except (FileNotFoundError, PermissionError, subprocess.CalledProcessError):
         torch_neuronx_installed = False
-    return torch_neuronx_installed
+    # VLLM_BUILD_WITH_NEURON forces a Neuron build when the neuron-ls probe
+    # fails (e.g. during docker build); "", "0" and "false" leave it off.
+    return torch_neuronx_installed or os.environ.get(
+        "VLLM_BUILD_WITH_NEURON", "0").lower() not in ("", "0", "false")
 
 
 def _is_cpu() -> bool:
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index c3020d2b38db0..c436ece83f65a 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -335,8 +335,8 @@ def from_engine_args(
         engine_config = engine_args.create_engine_config()
 
         if engine_config.device_config.device_type == "neuron":
-            raise NotImplementedError("Neuron is not supported for "
-                                      "async engine yet.")
+            from vllm.executor.neuron_executor import NeuronExecutorAsync
+            executor_class = NeuronExecutorAsync
         elif engine_config.parallel_config.worker_use_ray:
             initialize_ray_cluster(engine_config.parallel_config)
             from vllm.executor.ray_gpu_executor import RayGPUExecutorAsync
diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py
index 7cc187e297c9f..5a137d1bdcb3b 100644
--- a/vllm/executor/neuron_executor.py
+++ b/vllm/executor/neuron_executor.py
@@ -1,9 +1,10 @@
 from typing import Dict, List, Set, Tuple
 
-from vllm.executor.executor_base import ExecutorBase
+from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
+from vllm.utils import make_async
 
 logger = init_logger(__name__)
 
@@ -73,3 +74,25 @@ def check_health(self) -> None:
         # NeuronExecutor will always be healthy as long as
         # it's running.
         return
+
+
+class NeuronExecutorAsync(NeuronExecutor, ExecutorAsyncBase):
+    """Async variant of NeuronExecutor, selected by the async LLM engine."""
+
+    async def execute_model_async(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        blocks_to_swap_in: Dict[int, int],
+        blocks_to_swap_out: Dict[int, int],
+        blocks_to_copy: Dict[int, List[int]],
+    ) -> SamplerOutput:
+        # Only the sequence metadata is passed to the driver worker; the
+        # blocks_to_* arguments are not forwarded.
+        output = await make_async(self.driver_worker.execute_model)(
+            seq_group_metadata_list=seq_group_metadata_list, )
+        return output
+
+    async def check_health_async(self) -> None:
+        # NeuronExecutor will always be healthy as long as
+        # it's running.
+        return