[CI/CD] add neuron docker and ci test scripts (#3571)
parent 87fa80c91f
commit cd2f63fb36
.buildkite/run-neuron-test.sh (new file, 37 lines)
@@ -0,0 +1,37 @@
# This script builds the Neuron docker image and runs the API server inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -e

# Try building the docker image
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
docker build -t neuron -f Dockerfile.neuron .

# Setup cleanup
remove_docker_container() { docker rm -f neuron || true; }
trap remove_docker_container EXIT
remove_docker_container

# Run the image
docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \
    --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 &

# Wait for the server to start
wait_for_server_to_start() {
    timeout=300
    counter=0

    while [ "$(curl -s -o /dev/null -w '%{http_code}' localhost:8000/health)" != "200" ]; do
        sleep 1
        counter=$((counter + 1))
        if [ $counter -ge $timeout ]; then
            echo "Timeout after $timeout seconds"
            break
        fi
    done
}
wait_for_server_to_start

# Test a simple prompt
curl -X POST -H "Content-Type: application/json" \
    localhost:8000/generate \
    -d '{"prompt": "San Francisco is a"}'
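Note: the readiness loop in this script maps directly to a small Python equivalent. A minimal sketch, assuming the server exposes GET /health on port 8000 as above (the function name and the use of urllib here are illustrative, not part of the commit):

import time
import urllib.error
import urllib.request


def wait_for_server_to_start(url: str = "http://localhost:8000/health",
                             timeout: int = 300) -> bool:
    # Poll the health endpoint once per second until it answers 200
    # or the timeout elapses, mirroring the shell loop above.
    for _ in range(timeout):
        try:
            if urllib.request.urlopen(url, timeout=5).status == 200:
                return True
        except (urllib.error.URLError, OSError):
            pass  # server not up yet; keep polling
        time.sleep(1)
    return False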
@@ -21,6 +21,11 @@ steps:
     queue: amd
   command: bash .buildkite/run-amd-test.sh

+- label: "Neuron Test"
+  agents:
+    queue: neuron
+  command: bash .buildkite/run-neuron-test.sh
+
 - label: "CPU Test"
   command: bash .buildkite/run-cpu-test.sh

Dockerfile.neuron (new file, 36 lines)
@@ -0,0 +1,36 @@
# default base image
ARG BASE_IMAGE="763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-inference-neuronx:2.1.1-neuronx-py310-sdk2.17.0-ubuntu20.04"

FROM $BASE_IMAGE

RUN echo "Base image is $BASE_IMAGE"

# Install some basic utilities
RUN apt-get update && apt-get install python3 python3-pip -y

### Mount Point ###
# When launching the container, mount the code directory to /app
ARG APP_MOUNT=/app
VOLUME [ ${APP_MOUNT} ]
WORKDIR ${APP_MOUNT}

RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
RUN python3 -m pip install sentencepiece transformers==4.36.2 -U
RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install --pre neuronx-cc==2.12.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U

COPY ./vllm /app/vllm/vllm
COPY ./setup.py /app/vllm/setup.py
COPY ./requirements-common.txt /app/vllm/requirements-common.txt
COPY ./requirements-neuron.txt /app/vllm/requirements-neuron.txt

RUN cd /app/vllm \
    && python3 -m pip install -U -r requirements-neuron.txt

ENV VLLM_BUILD_WITH_NEURON 1
RUN cd /app/vllm \
    && pip install -e . \
    && cd ..

CMD ["/bin/bash"]
setup.py
@@ -204,7 +204,8 @@ def _is_neuron() -> bool:
         subprocess.run(["neuron-ls"], capture_output=True, check=True)
     except (FileNotFoundError, PermissionError, subprocess.CalledProcessError):
         torch_neuronx_installed = False
-    return torch_neuronx_installed
+    return torch_neuronx_installed or os.environ.get("VLLM_BUILD_WITH_NEURON",
+                                                     False)


 def _is_cpu() -> bool:
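Note: os.environ.get returns a string, so any non-empty value of VLLM_BUILD_WITH_NEURON (even "0") makes this check truthy; the Dockerfile above sets it to 1. A sketch of the whole function after the change, with the initial flag assignment inferred from context rather than shown in the hunk:

import os
import subprocess


def _is_neuron() -> bool:
    torch_neuronx_installed = True  # inferred initializer, not in the hunk
    try:
        # neuron-ls is only present on hosts with the Neuron driver.
        subprocess.run(["neuron-ls"], capture_output=True, check=True)
    except (FileNotFoundError, PermissionError, subprocess.CalledProcessError):
        torch_neuronx_installed = False
    # The env var lets the Docker build opt in even when no Neuron
    # device is visible at image-build time.
    return torch_neuronx_installed or os.environ.get("VLLM_BUILD_WITH_NEURON",
                                                     False)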
@@ -335,8 +335,8 @@ class AsyncLLMEngine:
         engine_config = engine_args.create_engine_config()

         if engine_config.device_config.device_type == "neuron":
-            raise NotImplementedError("Neuron is not supported for "
-                                      "async engine yet.")
+            from vllm.executor.neuron_executor import NeuronExecutorAsync
+            executor_class = NeuronExecutorAsync
         elif engine_config.parallel_config.worker_use_ray:
             initialize_ray_cluster(engine_config.parallel_config)
             from vllm.executor.ray_gpu_executor import RayGPUExecutorAsync
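Note: with this change the async engine path no longer raises for Neuron. A hedged usage sketch, assuming the AsyncEngineArgs / from_engine_args API of this vLLM version and reusing the model and flags from the CI script above:

from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine

# Mirrors the flags passed to the API server in run-neuron-test.sh.
engine_args = AsyncEngineArgs(
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    max_num_seqs=8,
    max_model_len=128,
    block_size=128,
    device="neuron",
    tensor_parallel_size=2,
)
engine = AsyncLLMEngine.from_engine_args(engine_args)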
vllm/executor/neuron_executor.py
@@ -1,9 +1,10 @@
 from typing import Dict, List, Set, Tuple

-from vllm.executor.executor_base import ExecutorBase
+from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
+from vllm.utils import make_async

 logger = init_logger(__name__)
@@ -73,3 +74,22 @@ class NeuronExecutor(ExecutorBase):
         # NeuronExecutor will always be healthy as long as
         # it's running.
         return
+
+
+class NeuronExecutorAsync(NeuronExecutor, ExecutorAsyncBase):
+
+    async def execute_model_async(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        blocks_to_swap_in: Dict[int, int],
+        blocks_to_swap_out: Dict[int, int],
+        blocks_to_copy: Dict[int, List[int]],
+    ) -> SamplerOutput:
+        output = await make_async(self.driver_worker.execute_model)(
+            seq_group_metadata_list=seq_group_metadata_list, )
+        return output
+
+    async def check_health_async(self) -> None:
+        # NeuronExecutor will always be healthy as long as
+        # it's running.
+        return
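Note: make_async is what keeps execute_model_async from blocking the event loop; it pushes the synchronous execute_model call onto a thread pool. A minimal sketch of such a wrapper (vllm.utils.make_async is the real helper; this reimplementation is illustrative):

import asyncio
import functools
from typing import Awaitable, Callable, TypeVar

T = TypeVar("T")


def make_async(func: Callable[..., T]) -> Callable[..., Awaitable[T]]:
    # Wrap a blocking callable so awaiting it runs in the loop's
    # default thread pool instead of stalling the event loop.
    async def _async_wrapper(*args, **kwargs) -> T:
        loop = asyncio.get_running_loop()
        p_func = functools.partial(func, *args, **kwargs)
        return await loop.run_in_executor(None, p_func)

    return _async_wrapper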
|
Loading…
x
Reference in New Issue
Block a user