2024-11-07 13:17:29 -05:00
|
|
|
#!/bin/bash
|
|
|
|
|
2024-05-16 22:58:25 -05:00
|
|
|
# This script runs tests inside the corresponding ROCm docker container.

# Fail a pipeline when any stage fails (e.g. the `docker run ... |& while read`
# pipelines below should report docker's status, not the log filter's).
# NOTE(review): `-e` is not set — individual command failures do not abort the
# script; presumably intentional so per-shard exit codes can be collected.
set -o pipefail
|
2024-03-18 12:33:47 -07:00
|
|
|
|
|
|
|
# Wait for the GPUs to report a clean state before doing anything else
|
2024-07-12 11:42:24 -05:00
|
|
|
echo "--- Confirming Clean Initial State"

# Poll every 3 seconds until the host-side reset agent marks the GPUs clean
# in /opt/amdgpu/etc/gpu_state. (Sleep first, then check, as before.)
until sleep 3 && grep -q clean /opt/amdgpu/etc/gpu_state; do
  :
done
echo 'GPUs state is "clean"'
|
|
|
|
|
2024-05-02 14:29:07 -05:00
|
|
|
echo "--- ROCm info"

# Dump ROCm platform/GPU details into the build log for debugging.
rocminfo
|
|
|
|
|
2024-05-29 22:27:39 -05:00
|
|
|
# cleanup older docker images
#######################################
# Prune Docker images/volumes when the filesystem holding Docker's root
# directory is more than ${threshold}% full.
# Globals:   none (all variables are function-local)
# Outputs:   progress messages on stdout, errors on stderr
# Returns:   exits the whole script with 1 if Docker's root dir is unknown
#######################################
cleanup_docker() {
  local docker_root disk_usage
  local -r threshold=70

  # Ask the daemon where image/volume data actually lives.
  docker_root=$(docker info -f '{{.DockerRootDir}}')
  if [ -z "$docker_root" ]; then
    # Diagnostics go to stderr; exiting here is deliberate — without a root
    # dir we cannot judge disk usage and the host may be unhealthy.
    echo "Failed to determine Docker root directory." >&2
    exit 1
  fi
  echo "Docker root directory: $docker_root"

  # Check disk usage of the filesystem where Docker's root directory is
  # located; extract the Use% column and strip the trailing '%'.
  disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')

  if [ "$disk_usage" -gt "$threshold" ]; then
    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
    # Remove dangling images (those that are not tagged and not used by any container)
    docker image prune -f
    # Remove unused volumes / force the system prune for old images as well.
    docker volume prune -f && docker system prune --force --filter "until=72h" --all
    echo "Docker images and volumes cleanup completed."
  else
    echo "Disk usage is below $threshold%. No cleanup needed."
  fi
}
|
|
|
|
|
|
|
|
# Call the cleanup docker function: reclaim disk space from stale
# images/volumes before pulling the (large) per-commit test image below.
cleanup_docker
|
|
|
|
|
2024-05-02 14:29:07 -05:00
|
|
|
echo "--- Resetting GPUs"

# Request a GPU reset from the host-side agent by writing to the state file,
# then poll every 3 seconds until it reports a clean state again.
echo "reset" > /opt/amdgpu/etc/gpu_state

until sleep 3 && grep -q clean /opt/amdgpu/etc/gpu_state; do
  :
done
echo 'GPUs state is "clean"'
|
|
|
|
|
2024-07-12 00:26:26 -04:00
|
|
|
echo "--- Pulling container"

# One image per commit; container name gets a random 10-char suffix so
# concurrent jobs on the same host never collide.
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"

docker pull "${image_name}"

remove_docker_container() {
  # Best-effort cleanup on exit: remove the container, or failing that the
  # image; never let the trap itself fail the build.
  docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
  # BUG FIX: the sharded path below names its containers
  # "${container_name}_<gpu>", which the line above never matched. Reap any
  # that survived (they are started with --rm, but removal can be missed if
  # the daemon or the job is interrupted).
  docker ps -aq --filter "name=${container_name}_" | xargs -r docker rm -f || true
}
trap remove_docker_container EXIT
|
2024-04-25 11:37:20 -05:00
|
|
|
|
2024-05-02 14:29:07 -05:00
|
|
|
echo "--- Running container"

# Host-side HuggingFace cache, mounted into the container so model downloads
# persist across runs on this agent.
HF_CACHE="$(realpath ~)/huggingface"
mkdir -p "${HF_CACHE}"
HF_MOUNT="/root/.cache/huggingface"

# Join all script arguments into one command string; it is later handed to
# `bash -c` inside the container. Fixed `commands=$@` -> `commands=$*`:
# assigning $@ to a scalar is ambiguous (ShellCheck SC2124); $* joins the
# positional parameters with spaces, which is the behavior relied on here.
commands=$*
echo "Commands:$commands"
|
|
|
|
#ignore certain kernels tests
# If the pipeline command runs the kernels suite, append pytest --ignore
# flags for tests that do not run on this ROCm setup.
# NOTE: the backslash-newlines are inside a double-quoted string, so the
# flags collapse into the single-line $commands passed to `bash -c`.
if [[ $commands == *" kernels "* ]]; then
commands="${commands} \
--ignore=kernels/test_attention_selector.py \
--ignore=kernels/test_blocksparse_attention.py \
--ignore=kernels/test_causal_conv1d.py \
--ignore=kernels/test_cutlass.py \
--ignore=kernels/test_encoder_decoder_attn.py \
--ignore=kernels/test_flash_attn.py \
--ignore=kernels/test_flashinfer.py \
--ignore=kernels/test_int8_quant.py \
--ignore=kernels/test_machete_gemm.py \
--ignore=kernels/test_mamba_ssm.py \
--ignore=kernels/test_marlin_gemm.py \
--ignore=kernels/test_moe.py \
--ignore=kernels/test_prefix_prefill.py \
--ignore=kernels/test_rand.py \
--ignore=kernels/test_sampler.py \
--ignore=kernels/test_cascade_flash_attn.py \
--ignore=kernels/test_mamba_mixer2.py \
--ignore=kernels/test_aqlm.py \
--ignore=kernels/test_machete_mm.py \
--ignore=kernels/test_mha_attn.py \
--ignore=kernels/test_block_fp8.py \
--ignore=kernels/test_permute_cols.py"
fi
|
|
|
|
|
2025-03-14 14:18:13 -05:00
|
|
|
#ignore certain Entrypoints/openai tests
# Replace the " entrypoints/openai " token in the command with the same token
# plus --ignore flags for tests excluded on ROCm. The pattern substitution
# (and its surrounding spaces) must match the pipeline command exactly.
if [[ $commands == *" entrypoints/openai "* ]]; then
commands=${commands//" entrypoints/openai "/" entrypoints/openai \
--ignore=entrypoints/openai/test_audio.py \
--ignore=entrypoints/openai/test_shutdown.py \
--ignore=entrypoints/openai/test_completion.py \
--ignore=entrypoints/openai/test_sleep.py \
--ignore=entrypoints/openai/test_models.py \
--ignore=entrypoints/openai/test_lora_adapters.py \
--ignore=entrypoints/openai/test_return_tokens_as_ids.py \
--ignore=entrypoints/openai/test_root_path.py \
--ignore=entrypoints/openai/test_tokenization.py \
--ignore=entrypoints/openai/test_prompt_validation.py "}
fi
|
|
|
|
|
2025-03-14 14:18:13 -05:00
|
|
|
#ignore certain Entrypoints/llm tests
# Same substitution technique as the entrypoints/openai block above: splice
# --ignore flags into the command right after the " entrypoints/llm " token.
if [[ $commands == *" entrypoints/llm "* ]]; then
commands=${commands//" entrypoints/llm "/" entrypoints/llm \
--ignore=entrypoints/llm/test_chat.py \
--ignore=entrypoints/llm/test_accuracy.py \
--ignore=entrypoints/llm/test_init.py \
--ignore=entrypoints/llm/test_generate_multiple_loras.py \
--ignore=entrypoints/llm/test_prompt_validation.py "}
fi
|
|
|
|
|
2025-04-03 13:05:17 -05:00
|
|
|
#Obsolete currently
|
|
|
|
##ignore certain Entrypoints/llm tests
|
|
|
|
#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
|
|
|
|
# commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
|
|
|
|
#fi
|
|
|
|
|
2025-03-14 14:18:13 -05:00
|
|
|
# --ignore=entrypoints/openai/test_encoder_decoder.py \
|
|
|
|
# --ignore=entrypoints/openai/test_embedding.py \
|
|
|
|
# --ignore=entrypoints/openai/test_oot_registration.py
|
|
|
|
# --ignore=entrypoints/openai/test_accuracy.py \
|
|
|
|
# --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13
|
|
|
|
|
|
|
|
|
2024-09-04 14:57:54 -04:00
|
|
|
PARALLEL_JOB_COUNT=8
# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
if [[ $commands == *"--shard-id="* ]]; then
# assign job count as the number of shards used
# NOTE(review): this fills in the empty "--num-shards= " placeholder that the
# pipeline command is expected to carry — confirm callers pass it valueless.
commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
# assign shard-id for each shard
commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
echo "Shard ${GPU} commands:$commands_gpu"
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
# BUILDKITE_AGENT_META_DATA_RENDER_DEVICES is deliberately left unquoted so
# it word-splits; presumably it expands to multiple
# "--device /dev/dri/renderD*" tokens — confirm against the agent config.
docker run \
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
--network=host \
--shm-size=16gb \
--rm \
-e HIP_VISIBLE_DEVICES="${GPU}" \
-e HF_TOKEN \
-e AWS_ACCESS_KEY_ID \
-e AWS_SECRET_ACCESS_KEY \
-v "${HF_CACHE}:${HF_MOUNT}" \
-e "HF_HOME=${HF_MOUNT}" \
--name "${container_name}_${GPU}" \
"${image_name}" \
/bin/bash -c "${commands_gpu}" \
|& while read -r line; do echo ">>Shard $GPU: $line"; done &
# $! is the PID of the backgrounded pipeline; with `set -o pipefail` (set at
# the top of this script) waiting on it reports docker run's failure rather
# than the log-prefixing filter's status.
PIDS+=($!)
done
#wait for all processes to finish and collect exit codes
for pid in "${PIDS[@]}"; do
wait "${pid}"
STATUS+=($?)
done
# Propagate the first non-zero shard status as the script's exit code.
for st in "${STATUS[@]}"; do
if [[ ${st} -ne 0 ]]; then
echo "One of the processes failed with $st"
exit "${st}"
fi
done
else
# Single-GPU path: run the whole command string on GPU 0 in one container.
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
docker run \
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
--network=host \
--shm-size=16gb \
--rm \
-e HIP_VISIBLE_DEVICES=0 \
-e HF_TOKEN \
-e AWS_ACCESS_KEY_ID \
-e AWS_SECRET_ACCESS_KEY \
-v "${HF_CACHE}:${HF_MOUNT}" \
-e "HF_HOME=${HF_MOUNT}" \
--name "${container_name}" \
"${image_name}" \
/bin/bash -c "${commands}"
fi
|