[CI] Fix neuron CI and run offline tests (#11779)
Signed-off-by: Liangfu Chen <liangfc@amazon.com>

parent 0f3f3c86ec
commit 898cdf033e

.buildkite/run-neuron-test.sh
@@ -3,6 +3,18 @@
 # This script build the Neuron docker image and run the API server inside the container.
 # It serves a sanity check for compilation and basic model usage.
 set -e
+set -v
+
+image_name="neuron/vllm-ci"
+container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
+
+HF_CACHE="$(realpath ~)/huggingface"
+mkdir -p "${HF_CACHE}"
+HF_MOUNT="/root/.cache/huggingface"
+
+NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
+mkdir -p "${NEURON_COMPILE_CACHE_URL}"
+NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"
 
 # Try building the docker image
 aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
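
Note: the new variables wire two host-side caches into the CI container: the Hugging Face cache and the Neuron compile cache. Because both live under the runner's home directory, model downloads and compiled XLA artifacts survive container teardown. A minimal sketch of the same wiring, assuming the image was built as neuron/vllm-ci per the script (the ls is just a probe):

# Probe the mounted compile cache; the Neuron toolchain reads its cache
# location from NEURON_COMPILE_CACHE_URL, so a warm cache skips recompilation.
docker run --rm \
    -v "$(realpath ~)/neuron_compile_cache:/root/.cache/neuron_compile_cache" \
    -e "NEURON_COMPILE_CACHE_URL=/root/.cache/neuron_compile_cache" \
    neuron/vllm-ci \
    /bin/bash -c "ls /root/.cache/neuron_compile_cache"
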
@@ -13,41 +25,30 @@ if [ -f /tmp/neuron-docker-build-timestamp ]; then
     last_build=$(cat /tmp/neuron-docker-build-timestamp)
     current_time=$(date +%s)
     if [ $((current_time - last_build)) -gt 86400 ]; then
+        docker image prune -f
         docker system prune -f
+        rm -rf "${HF_MOUNT:?}/*"
+        rm -rf "${NEURON_COMPILE_CACHE_MOUNT:?}/*"
         echo "$current_time" > /tmp/neuron-docker-build-timestamp
     fi
 else
     date "+%s" > /tmp/neuron-docker-build-timestamp
 fi
 
-docker build -t neuron -f Dockerfile.neuron .
+docker build -t "${image_name}" -f Dockerfile.neuron .
 
 # Setup cleanup
-remove_docker_container() { docker rm -f neuron || true; }
+remove_docker_container() {
+    docker image rm -f "${image_name}" || true;
+}
 trap remove_docker_container EXIT
-remove_docker_container
 
 # Run the image
-docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \
-    --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 &
-
-# Wait for the server to start
-wait_for_server_to_start() {
-    timeout=300
-    counter=0
-
-    while [ "$(curl -s -o /dev/null -w '%{http_code}' localhost:8000/health)" != "200" ]; do
-        sleep 1
-        counter=$((counter + 1))
-        if [ $counter -ge $timeout ]; then
-            echo "Timeout after $timeout seconds"
-            break
-        fi
-    done
-}
-wait_for_server_to_start
-
-# Test a simple prompt
-curl -X POST -H "Content-Type: application/json" \
-    localhost:8000/generate \
-    -d '{"prompt": "San Francisco is a"}'
+docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
+       -v "${HF_CACHE}:${HF_MOUNT}" \
+       -e "HF_HOME=${HF_MOUNT}" \
+       -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
+       -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
+       --name "${container_name}" \
+       ${image_name} \
+       /bin/bash -c "python3 /workspace/vllm/examples/offline_inference_neuron.py"
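
Note: this hunk swaps the API-server smoke test (background server, /health polling, a curl against /generate) for a single in-container run of the offline inference example, and extends the once-a-day cleanup to prune dangling images and empty both caches. The throttling pattern it relies on is worth isolating; a standalone sketch with an illustrative stamp path:

# Run an expensive cleanup at most once per 86400 seconds, keyed on a
# timestamp file (the path here is illustrative, not the CI's).
stamp=/tmp/example-cleanup-timestamp
now=$(date +%s)
if [ ! -f "$stamp" ] || [ $((now - $(cat "$stamp"))) -gt 86400 ]; then
    docker system prune -f
    echo "$now" > "$stamp"
fi
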
Dockerfile.neuron
@@ -15,8 +15,8 @@ RUN apt-get update && \
     ffmpeg libsm6 libxext6 libgl1
 
 ### Mount Point ###
-# When launching the container, mount the code directory to /app
-ARG APP_MOUNT=/app
+# When launching the container, mount the code directory to /workspace
+ARG APP_MOUNT=/workspace
 VOLUME [ ${APP_MOUNT} ]
 WORKDIR ${APP_MOUNT}/vllm
 
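
Note: moving APP_MOUNT from /app to /workspace keeps the image in step with the CI script above, which invokes /workspace/vllm/examples/offline_inference_neuron.py. A hypothetical one-liner to confirm the new layout:

# The sources should now sit under /workspace/vllm, not /app/vllm.
docker run --rm neuron/vllm-ci /bin/bash -c "ls /workspace/vllm/examples"
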
@@ -25,6 +25,7 @@ RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
 RUN python3 -m pip install sentencepiece transformers==4.45.2 -U
 RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
 RUN python3 -m pip install neuronx-cc==2.16.345.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
+RUN python3 -m pip install pytest
 
 COPY . .
 ARG GIT_REPO_CHECK=0
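
Note: baking pytest into the image makes in-container test runs possible without an extra install step. An illustrative availability check (the CI currently runs the offline example rather than pytest):

# Confirm the test runner exists inside the image.
docker run --rm neuron/vllm-ci /bin/bash -c "python3 -m pytest --version"
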
@@ -42,4 +43,7 @@ RUN --mount=type=bind,source=.git,target=.git \
 # install development dependencies (for testing)
 RUN python3 -m pip install -e tests/vllm_test_utils
 
+# overwrite entrypoint to run bash script
+RUN echo "import subprocess; import sys; subprocess.check_call(sys.argv[1:])" > /usr/local/bin/dockerd-entrypoint.py
+
 CMD ["/bin/bash"]
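
Note: the base image's entrypoint (assumed here to invoke /usr/local/bin/dockerd-entrypoint.py, as AWS Deep Learning Containers do) is overwritten with a pass-through that simply check_calls its arguments. That is what lets the CI script place an arbitrary command after the image name. A hypothetical smoke test:

# Anything after the image name is executed verbatim by the new entrypoint.
docker run --rm neuron/vllm-ci echo "entrypoint passes arguments through"
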
examples/offline_inference_neuron.py
@@ -1,12 +1,5 @@
-import os
-
 from vllm import LLM, SamplingParams
 
-# creates XLA hlo graphs for all the context length buckets.
-os.environ['NEURON_CONTEXT_LENGTH_BUCKETS'] = "128,512,1024,2048"
-# creates XLA hlo graphs for all the token gen buckets.
-os.environ['NEURON_TOKEN_GEN_BUCKETS'] = "128,512,1024,2048"
-
 # Sample prompts.
 prompts = [
     "Hello, my name is",
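
Note: the NEURON_CONTEXT_LENGTH_BUCKETS and NEURON_TOKEN_GEN_BUCKETS settings leave the example; if specific bucket sizes are still wanted, they can presumably be supplied from the environment instead. A sketch reusing the removed values:

# Values copied from the removed lines; treat the invocation as illustrative.
NEURON_CONTEXT_LENGTH_BUCKETS="128,512,1024,2048" \
NEURON_TOKEN_GEN_BUCKETS="128,512,1024,2048" \
python3 examples/offline_inference_neuron.py
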
@@ -26,8 +19,8 @@ llm = LLM(
     # Currently, this is a known limitation in continuous batching support
     # in transformers-neuronx.
     # TODO(liangfu): Support paged-attention in transformers-neuronx.
-    max_model_len=2048,
-    block_size=2048,
+    max_model_len=1024,
+    block_size=1024,
     # The device can be automatically detected when AWS Neuron SDK is installed.
     # The device argument can be either unspecified for automated detection,
     # or explicitly assigned.
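
Note: per the comments above, continuous batching in transformers-neuronx requires block_size to equal max_model_len, so the two values must move together; halving both to 1024 shrinks the compiled graphs for CI. A minimal sketch of the surrounding constructor call with the new values (model name taken from the old CI command; other arguments elided, Neuron hardware assumed):

python3 - <<'PY'
from vllm import LLM
# Sketch only: block_size must equal max_model_len with transformers-neuronx.
llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
          max_model_len=1024,
          block_size=1024,
          device="neuron")
PY
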