[CI] Fix neuron CI and run offline tests (#11779)

Signed-off-by: Liangfu Chen <liangfc@amazon.com>
This commit is contained in:
Liangfu Chen 2025-01-06 21:36:10 -08:00 committed by GitHub
parent 0f3f3c86ec
commit 898cdf033e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 35 additions and 37 deletions

View File

@ -3,6 +3,18 @@
# This script builds the Neuron docker image and runs the API server inside the container. # This script builds the Neuron docker image and runs the API server inside the container.
# It serves as a sanity check for compilation and basic model usage. # It serves as a sanity check for compilation and basic model usage.
set -e set -e
set -v
image_name="neuron/vllm-ci"
container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
HF_CACHE="$(realpath ~)/huggingface"
mkdir -p "${HF_CACHE}"
HF_MOUNT="/root/.cache/huggingface"
NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
mkdir -p "${NEURON_COMPILE_CACHE_URL}"
NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"
# Try building the docker image # Try building the docker image
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
@ -13,41 +25,30 @@ if [ -f /tmp/neuron-docker-build-timestamp ]; then
last_build=$(cat /tmp/neuron-docker-build-timestamp) last_build=$(cat /tmp/neuron-docker-build-timestamp)
current_time=$(date +%s) current_time=$(date +%s)
if [ $((current_time - last_build)) -gt 86400 ]; then if [ $((current_time - last_build)) -gt 86400 ]; then
docker image prune -f
docker system prune -f docker system prune -f
rm -rf "${HF_MOUNT:?}/*"
rm -rf "${NEURON_COMPILE_CACHE_MOUNT:?}/*"
echo "$current_time" > /tmp/neuron-docker-build-timestamp echo "$current_time" > /tmp/neuron-docker-build-timestamp
fi fi
else else
date "+%s" > /tmp/neuron-docker-build-timestamp date "+%s" > /tmp/neuron-docker-build-timestamp
fi fi
docker build -t neuron -f Dockerfile.neuron . docker build -t "${image_name}" -f Dockerfile.neuron .
# Setup cleanup # Setup cleanup
remove_docker_container() { docker rm -f neuron || true; } remove_docker_container() {
docker image rm -f "${image_name}" || true;
}
trap remove_docker_container EXIT trap remove_docker_container EXIT
remove_docker_container
# Run the image # Run the image
docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \ docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
--model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 & -v "${HF_CACHE}:${HF_MOUNT}" \
-e "HF_HOME=${HF_MOUNT}" \
# Wait for the server to start -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
wait_for_server_to_start() { -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
timeout=300 --name "${container_name}" \
counter=0 ${image_name} \
/bin/bash -c "python3 /workspace/vllm/examples/offline_inference_neuron.py"
while [ "$(curl -s -o /dev/null -w '%{http_code}' localhost:8000/health)" != "200" ]; do
sleep 1
counter=$((counter + 1))
if [ $counter -ge $timeout ]; then
echo "Timeout after $timeout seconds"
break
fi
done
}
wait_for_server_to_start
# Test a simple prompt
curl -X POST -H "Content-Type: application/json" \
localhost:8000/generate \
-d '{"prompt": "San Francisco is a"}'

View File

@ -15,8 +15,8 @@ RUN apt-get update && \
ffmpeg libsm6 libxext6 libgl1 ffmpeg libsm6 libxext6 libgl1
### Mount Point ### ### Mount Point ###
# When launching the container, mount the code directory to /app # When launching the container, mount the code directory to /workspace
ARG APP_MOUNT=/app ARG APP_MOUNT=/workspace
VOLUME [ ${APP_MOUNT} ] VOLUME [ ${APP_MOUNT} ]
WORKDIR ${APP_MOUNT}/vllm WORKDIR ${APP_MOUNT}/vllm
@ -25,6 +25,7 @@ RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
RUN python3 -m pip install sentencepiece transformers==4.45.2 -U RUN python3 -m pip install sentencepiece transformers==4.45.2 -U
RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install neuronx-cc==2.16.345.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U RUN python3 -m pip install neuronx-cc==2.16.345.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install pytest
COPY . . COPY . .
ARG GIT_REPO_CHECK=0 ARG GIT_REPO_CHECK=0
@ -42,4 +43,7 @@ RUN --mount=type=bind,source=.git,target=.git \
# install development dependencies (for testing) # install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils RUN python3 -m pip install -e tests/vllm_test_utils
# overwrite entrypoint to run bash script
RUN echo "import subprocess; import sys; subprocess.check_call(sys.argv[1:])" > /usr/local/bin/dockerd-entrypoint.py
CMD ["/bin/bash"] CMD ["/bin/bash"]

View File

@ -1,12 +1,5 @@
import os
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
# creates XLA hlo graphs for all the context length buckets.
os.environ['NEURON_CONTEXT_LENGTH_BUCKETS'] = "128,512,1024,2048"
# creates XLA hlo graphs for all the token gen buckets.
os.environ['NEURON_TOKEN_GEN_BUCKETS'] = "128,512,1024,2048"
# Sample prompts. # Sample prompts.
prompts = [ prompts = [
"Hello, my name is", "Hello, my name is",
@ -26,8 +19,8 @@ llm = LLM(
# Currently, this is a known limitation in continuous batching support # Currently, this is a known limitation in continuous batching support
# in transformers-neuronx. # in transformers-neuronx.
# TODO(liangfu): Support paged-attention in transformers-neuronx. # TODO(liangfu): Support paged-attention in transformers-neuronx.
max_model_len=2048, max_model_len=1024,
block_size=2048, block_size=1024,
# The device can be automatically detected when AWS Neuron SDK is installed. # The device can be automatically detected when AWS Neuron SDK is installed.
# The device argument can be either unspecified for automated detection, # The device argument can be either unspecified for automated detection,
# or explicitly assigned. # or explicitly assigned.