[CI/CD] add neuron docker and ci test scripts (#3571)
parent 87fa80c91f
commit cd2f63fb36
.buildkite/run-neuron-test.sh (new file, 37 lines)
@@ -0,0 +1,37 @@
# This script builds the Neuron docker image and runs the API server inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -e

# Try building the docker image
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
docker build -t neuron -f Dockerfile.neuron .

# Setup cleanup
remove_docker_container() { docker rm -f neuron || true; }
trap remove_docker_container EXIT
remove_docker_container

# Run the image
docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \
    --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 &

# Wait for the server to start
wait_for_server_to_start() {
    timeout=300
    counter=0

    while [ "$(curl -s -o /dev/null -w '%{http_code}' localhost:8000/health)" != "200" ]; do
        sleep 1
        counter=$((counter + 1))
        if [ $counter -ge $timeout ]; then
            echo "Timeout after $timeout seconds"
            break
        fi
    done
}
wait_for_server_to_start

# Test a simple prompt
curl -X POST -H "Content-Type: application/json" \
    localhost:8000/generate \
    -d '{"prompt": "San Francisco is a"}'
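Note: the readiness loop in this script maps directly to a small Python equivalent. A minimal sketch, assuming the server exposes GET /health on port 8000 as above (the function name and the use of urllib here are illustrative, not part of the commit):

import time
import urllib.error
import urllib.request


def wait_for_server_to_start(url: str = "http://localhost:8000/health",
                             timeout: int = 300) -> bool:
    # Poll the health endpoint once per second until it answers 200
    # or the timeout elapses, mirroring the shell loop above.
    for _ in range(timeout):
        try:
            if urllib.request.urlopen(url, timeout=5).status == 200:
                return True
        except (urllib.error.URLError, OSError):
            pass  # server not up yet; keep polling
        time.sleep(1)
    return False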
@@ -21,6 +21,11 @@ steps:
     queue: amd
   command: bash .buildkite/run-amd-test.sh

+- label: "Neuron Test"
+  agents:
+    queue: neuron
+  command: bash .buildkite/run-neuron-test.sh
+
 - label: "CPU Test"
   command: bash .buildkite/run-cpu-test.sh

Dockerfile.neuron (new file, 36 lines)
@@ -0,0 +1,36 @@
# default base image
ARG BASE_IMAGE="763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-inference-neuronx:2.1.1-neuronx-py310-sdk2.17.0-ubuntu20.04"

FROM $BASE_IMAGE

RUN echo "Base image is $BASE_IMAGE"

# Install some basic utilities
RUN apt-get update && apt-get install python3 python3-pip -y

### Mount Point ###
# When launching the container, mount the code directory to /app
ARG APP_MOUNT=/app
VOLUME [ ${APP_MOUNT} ]
WORKDIR ${APP_MOUNT}

RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
RUN python3 -m pip install sentencepiece transformers==4.36.2 -U
RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install --pre neuronx-cc==2.12.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U

COPY ./vllm /app/vllm/vllm
COPY ./setup.py /app/vllm/setup.py
COPY ./requirements-common.txt /app/vllm/requirements-common.txt
COPY ./requirements-neuron.txt /app/vllm/requirements-neuron.txt

RUN cd /app/vllm \
    && python3 -m pip install -U -r requirements-neuron.txt

ENV VLLM_BUILD_WITH_NEURON 1
RUN cd /app/vllm \
    && pip install -e . \
    && cd ..

CMD ["/bin/bash"]
setup.py
@@ -204,7 +204,8 @@ def _is_neuron() -> bool:
         subprocess.run(["neuron-ls"], capture_output=True, check=True)
     except (FileNotFoundError, PermissionError, subprocess.CalledProcessError):
         torch_neuronx_installed = False
-    return torch_neuronx_installed
+    return torch_neuronx_installed or os.environ.get("VLLM_BUILD_WITH_NEURON",
+                                                     False)


 def _is_cpu() -> bool:
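Note: os.environ.get returns a string, so any non-empty value of VLLM_BUILD_WITH_NEURON (even "0") makes this check truthy; the Dockerfile above sets it to 1. A sketch of the whole function after the change, with the initial flag assignment inferred from context rather than shown in the hunk:

import os
import subprocess


def _is_neuron() -> bool:
    torch_neuronx_installed = True  # inferred initializer, not in the hunk
    try:
        # neuron-ls is only present on hosts with the Neuron driver.
        subprocess.run(["neuron-ls"], capture_output=True, check=True)
    except (FileNotFoundError, PermissionError, subprocess.CalledProcessError):
        torch_neuronx_installed = False
    # The env var lets the Docker build opt in even when no Neuron
    # device is visible at image-build time.
    return torch_neuronx_installed or os.environ.get("VLLM_BUILD_WITH_NEURON",
                                                     False)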
@@ -335,8 +335,8 @@ class AsyncLLMEngine:
         engine_config = engine_args.create_engine_config()

         if engine_config.device_config.device_type == "neuron":
-            raise NotImplementedError("Neuron is not supported for "
-                                      "async engine yet.")
+            from vllm.executor.neuron_executor import NeuronExecutorAsync
+            executor_class = NeuronExecutorAsync
         elif engine_config.parallel_config.worker_use_ray:
             initialize_ray_cluster(engine_config.parallel_config)
             from vllm.executor.ray_gpu_executor import RayGPUExecutorAsync
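Note: with this change the async engine path no longer raises for Neuron. A hedged usage sketch, assuming the AsyncEngineArgs / from_engine_args API of this vLLM version and reusing the model and flags from the CI script above:

from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine

# Mirrors the flags passed to the API server in run-neuron-test.sh.
engine_args = AsyncEngineArgs(
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    max_num_seqs=8,
    max_model_len=128,
    block_size=128,
    device="neuron",
    tensor_parallel_size=2,
)
engine = AsyncLLMEngine.from_engine_args(engine_args)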
vllm/executor/neuron_executor.py
@@ -1,9 +1,10 @@
 from typing import Dict, List, Set, Tuple

-from vllm.executor.executor_base import ExecutorBase
+from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
+from vllm.utils import make_async

 logger = init_logger(__name__)
@@ -73,3 +74,22 @@ class NeuronExecutor(ExecutorBase):
         # NeuronExecutor will always be healthy as long as
         # it's running.
         return
+
+
+class NeuronExecutorAsync(NeuronExecutor, ExecutorAsyncBase):
+
+    async def execute_model_async(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        blocks_to_swap_in: Dict[int, int],
+        blocks_to_swap_out: Dict[int, int],
+        blocks_to_copy: Dict[int, List[int]],
+    ) -> SamplerOutput:
+        output = await make_async(self.driver_worker.execute_model)(
+            seq_group_metadata_list=seq_group_metadata_list, )
+        return output
+
+    async def check_health_async(self) -> None:
+        # NeuronExecutor will always be healthy as long as
+        # it's running.
+        return
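Note: make_async is what keeps execute_model_async from blocking the event loop; it pushes the synchronous execute_model call onto a thread pool. A minimal sketch of such a wrapper (vllm.utils.make_async is the real helper; this reimplementation is illustrative):

import asyncio
import functools
from typing import Awaitable, Callable, TypeVar

T = TypeVar("T")


def make_async(func: Callable[..., T]) -> Callable[..., Awaitable[T]]:
    # Wrap a blocking callable so awaiting it runs in the loop's
    # default thread pool instead of stalling the event loop.
    async def _async_wrapper(*args, **kwargs) -> T:
        loop = asyncio.get_running_loop()
        p_func = functools.partial(func, *args, **kwargs)
        return await loop.run_in_executor(None, p_func)

    return _async_wrapper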
|
Loading…
x
Reference in New Issue
Block a user