54 lines
1.8 KiB
Bash
54 lines
1.8 KiB
Bash
#!/bin/bash

# Builds the Neuron docker image and runs the API server inside the container.
# Serves as a sanity check for compilation and basic model usage.

# Abort the script as soon as any command exits with a non-zero status.
set -o errexit

# Try building the docker image.
# First log docker in to the AWS ECR registry, feeding the password obtained
# from the AWS CLI through stdin so it never appears on a command line.
# NOTE(review): presumably this registry hosts the base image used by
# Dockerfile.neuron -- confirm.
ecr_region=us-west-2
ecr_registry=763104351884.dkr.ecr.${ecr_region}.amazonaws.com
aws ecr get-login-password --region "$ecr_region" \
  | docker login --username AWS --password-stdin "$ecr_registry"

# Prune old images and containers to save disk space, but only once a day,
# by using a timestamp file in /tmp.
#######################################
# Runs `docker system prune -f` when the last recorded prune is older than
# the allowed age, then records the current time.
# Arguments:
#   $1 - timestamp file (default: /tmp/neuron-docker-build-timestamp)
#   $2 - maximum age in seconds before pruning again (default: 86400)
# Behavior:
#   - No timestamp file: only records the current time (no prune).
#   - Empty or non-numeric timestamp (e.g. a write that failed on a full
#     disk): treated as stale, so the script prunes and rewrites the file
#     instead of tripping over an arithmetic error on every later run.
#######################################
prune_docker_once_a_day() {
  local stamp_file=${1:-/tmp/neuron-docker-build-timestamp}
  local max_age=${2:-86400}
  local now
  local last_prune=""

  now=$(date +%s)

  if [[ ! -f "$stamp_file" ]]; then
    # First run on this machine: start the clock without pruning.
    printf '%s\n' "$now" > "$stamp_file"
    return 0
  fi

  # `read` returns non-zero on an empty file or a missing trailing newline;
  # that is handled by the validation below, so do not let it trip `set -e`.
  IFS= read -r last_prune < "$stamp_file" || true

  # 10# forces base 10 so a value with leading zeros is not parsed as octal.
  if [[ ! "$last_prune" =~ ^[0-9]+$ ]] || (( now - 10#$last_prune > max_age )); then
    docker system prune -f
    printf '%s\n' "$now" > "$stamp_file"
  fi
}

prune_docker_once_a_day

# Build the image described by Dockerfile.neuron, using the current directory
# as build context, and tag it "neuron".
docker build \
  --tag neuron \
  --file Dockerfile.neuron \
  .

# Setup cleanup
# Force-remove the container named "neuron". A failed removal (for example
# when no such container exists) is deliberately ignored so this never
# aborts the script under errexit.
remove_docker_container() {
  docker rm -f neuron || :
}

# Remove the container whenever the script exits, on any exit path...
trap remove_docker_container EXIT

# ...and once right now, to clear out a container left by a previous run.
remove_docker_container

# Run the image
# Start the vLLM API server in a container named "neuron", exposing the two
# Neuron devices and sharing the host network. The trailing `&` puts the
# container in the background so the script can go on to poll the server.
api_server_args=(
  --model TinyLlama/TinyLlama-1.1B-Chat-v1.0
  --max-num-seqs 8
  --max-model-len 128
  --block-size 128
  --device neuron
  --tensor-parallel-size 2
)
docker run \
  --device=/dev/neuron0 \
  --device=/dev/neuron1 \
  --network host \
  --name neuron \
  neuron python3 -m vllm.entrypoints.api_server "${api_server_args[@]}" &

# Wait for the server to start
#######################################
# Polls the health endpoint until it answers with HTTP 200, sleeping one
# second between attempts.
# Arguments:
#   $1 - maximum number of attempts, roughly seconds (default: 300)
#   $2 - health URL to poll (default: localhost:8000/health)
# Outputs:
#   Writes a timeout message to stderr when the limit is reached.
# Returns:
#   0 once the endpoint returns 200; 1 on timeout, so that callers (and
#   `set -e`) stop instead of testing a server that never came up.
#######################################
wait_for_server_to_start() {
  local timeout=${1:-300}
  local health_url=${2:-localhost:8000/health}
  local counter=0

  # curl prints only the HTTP status code; a refused connection yields "000".
  while [ "$(curl -s -o /dev/null -w '%{http_code}' "$health_url")" != "200" ]; do
    sleep 1
    counter=$((counter + 1))
    if [ "$counter" -ge "$timeout" ]; then
      echo "Timeout after $timeout seconds" >&2
      return 1
    fi
  done

  return 0
}

# Block until the API server reports healthy.
wait_for_server_to_start

# Test a simple prompt
# --fail makes curl exit non-zero on an HTTP error response (>= 400), so a
# server that is up but cannot generate fails this sanity check instead of
# the script finishing with status 0.
curl --fail -X POST -H "Content-Type: application/json" \
  localhost:8000/generate \
  -d '{"prompt": "San Francisco is a"}'