[CI/Build] Adding functionality to reset the node's GPUs before processing. (#4213)
This commit is contained in:
parent
fbf152d976
commit
7ee82bef1e
@ -5,6 +5,19 @@ set -ex
|
|||||||
# Print ROCm version
|
# Print ROCm version
|
||||||
rocminfo
|
rocminfo
|
||||||
|
|
||||||
|
|
||||||
|
# Reset the node's GPUs before processing.
# Writing "reset" to the state file requests the reset; the file is then
# polled every 3 seconds until it contains "clean".
# NOTE(review): presumably a node-side agent watches this file and performs
# the actual reset - confirm against the CI host setup.
echo "reset" > /opt/amdgpu/etc/gpu_state

# Bound the wait so that a reset which never completes fails the job with a
# clear message instead of hanging it indefinitely.
# Override the limit (in seconds) with GPU_RESET_TIMEOUT_SECS.
gpu_reset_timeout_secs="${GPU_RESET_TIMEOUT_SECS:-600}"
gpu_reset_waited_secs=0
while true; do
  sleep 3
  gpu_reset_waited_secs=$((gpu_reset_waited_secs + 3))
  if grep -q clean /opt/amdgpu/etc/gpu_state; then
    echo "GPUs state is \"clean\""
    break
  fi
  if [ "${gpu_reset_waited_secs}" -ge "${gpu_reset_timeout_secs}" ]; then
    echo "Timed out after ${gpu_reset_waited_secs}s waiting for GPUs state \"clean\"" >&2
    exit 1
  fi
done
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Try building the docker image
|
# Try building the docker image
|
||||||
docker build -t rocm -f Dockerfile.rocm .
|
docker build -t rocm -f Dockerfile.rocm .
|
||||||
|
|
||||||
@ -14,7 +27,8 @@ trap remove_docker_container EXIT
|
|||||||
remove_docker_container
|
remove_docker_container
|
||||||
|
|
||||||
# Run the image
|
# Run the image
|
||||||
docker run --device /dev/kfd --device /dev/dri --network host --name rocm rocm python3 -m vllm.entrypoints.api_server &
|
export HIP_VISIBLE_DEVICES=1
|
||||||
|
docker run --device /dev/kfd --device /dev/dri --network host -e HIP_VISIBLE_DEVICES --name rocm rocm python3 -m vllm.entrypoints.api_server &
|
||||||
|
|
||||||
# Wait for the server to start
|
# Wait for the server to start
|
||||||
wait_for_server_to_start() {
|
wait_for_server_to_start() {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user