[CI/Build][ROCm] Enabling LoRA tests on ROCm (#7369)

Co-authored-by: Simon Mo <simon.mo@hey.com>
alexeykondrat 2024-09-04 14:57:54 -04:00 committed by GitHub
parent 2ad2e5608e
commit d1dec64243
4 changed files with 64 additions and 14 deletions

.buildkite/run-amd-test.sh (Normal file → Executable file, 47 changed lines)

@@ -1,5 +1,5 @@
# This script runs tests inside the corresponding ROCm docker container.
set -ex
set -o pipefail
# Print ROCm version
echo "--- Confirming Clean Initial State"
@@ -70,16 +70,51 @@ HF_CACHE="$(realpath ~)/huggingface"
mkdir -p ${HF_CACHE}
HF_MOUNT="/root/.cache/huggingface"
docker run \
commands=$@
PARALLEL_JOB_COUNT=8
# If the command contains the shard flag, run all shards in parallel because the host has 8 GPUs.
if [[ $commands == *"--shard-id="* ]]; then
for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
# Replace the shard arguments so each GPU runs its own shard
commands=${@//"--shard-id= "/"--shard-id=${GPU} "}
commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
docker run \
--device /dev/kfd --device /dev/dri \
--network host \
--shm-size=16gb \
--rm \
-e HIP_VISIBLE_DEVICES=0 \
-e HIP_VISIBLE_DEVICES=${GPU} \
-e HF_TOKEN \
-v ${HF_CACHE}:${HF_MOUNT} \
-e HF_HOME=${HF_MOUNT} \
--name ${container_name} \
--name ${container_name}_${GPU} \
${image_name} \
/bin/bash -c "${@}"
/bin/bash -c "${commands}" \
|& while read -r line; do echo ">>Shard $GPU: $line"; done &
PIDS+=($!)
done
# Wait for all processes to finish and collect exit codes
for pid in ${PIDS[@]}; do
wait ${pid}
STATUS+=($?)
done
for st in ${STATUS[@]}; do
if [[ ${st} -ne 0 ]]; then
echo "One of the processes failed with $st"
exit ${st}
fi
done
else
docker run \
--device /dev/kfd --device /dev/dri \
--network host \
--shm-size=16gb \
--rm \
-e HIP_VISIBLE_DEVICES=0 \
-e HF_TOKEN \
-v ${HF_CACHE}:${HF_MOUNT} \
-e HF_HOME=${HF_MOUNT} \
--name ${container_name} \
${image_name} \
/bin/bash -c "${commands}"
fi

.buildkite/test-pipeline.yaml

@@ -218,9 +218,9 @@ steps:
- pytest -v -s spec_decode
- label: LoRA Test %N # 30min each
mirror_hardwares: [amd]
source_file_dependencies:
- vllm/lora
- csrc/punica
- tests/lora
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
parallelism: 4
@@ -360,7 +360,6 @@ steps:
num_gpus: 4
source_file_dependencies:
- vllm/lora
- csrc/punica
- tests/lora/test_long_context
commands:
# FIXIT: find out which code initializes CUDA before running the test
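In the LoRA Test entry above, the --shard-id and --num-shards flags belong to the pytest-shard plugin, and Buildkite fills them in from BUILDKITE_PARALLEL_JOB and BUILDKITE_PARALLEL_JOB_COUNT (here parallelism: 4). As a rough sketch of this kind of deterministic test splitting, the modulo scheme below is an assumption for illustration only, not pytest-shard's actual implementation:

from typing import List

def select_shard(tests: List[str], shard_id: int, num_shards: int) -> List[str]:
    # Keep every num_shards-th test, offset by shard_id, so the union of all
    # shards covers the suite exactly once with no overlap.
    return [t for i, t in enumerate(sorted(tests)) if i % num_shards == shard_id]

# With parallelism: 4, Buildkite launches four jobs with shard ids 0..3.
suite = [f"tests/lora/test_{n}.py" for n in ("gemma", "layers", "lora_manager", "quant_model")]
print(select_shard(suite, shard_id=1, num_shards=4))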

tests/lora/test_gemma.py

@@ -1,7 +1,10 @@
from typing import List
import pytest
import vllm
from vllm.lora.request import LoRARequest
from vllm.utils import is_hip
MODEL_PATH = "google/gemma-7b"
@@ -28,6 +31,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
return generated_texts
@pytest.mark.xfail(is_hip(), reason="There can be output mismatch on ROCm")
def test_gemma_lora(gemma_lora_files):
llm = vllm.LLM(MODEL_PATH,
max_model_len=1024,

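The new is_hip import in test_gemma.py feeds the conditional xfail marker shown above. A minimal, self-contained sketch of the same pattern, where the is_hip stub is a stand-in assumption for vllm.utils.is_hip:

import pytest

def is_hip() -> bool:
    # Stand-in for vllm.utils.is_hip(); returns True when running on ROCm/HIP.
    return False

@pytest.mark.xfail(is_hip(), reason="There can be output mismatch on ROCm")
def test_example():
    # On ROCm a failure is reported as XFAIL rather than FAIL, and an
    # unexpected pass shows up as XPASS, so the shard still gives signal.
    assert True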
tests/lora/test_quant_model.py

@@ -7,6 +7,7 @@ import pytest
import vllm
from vllm.lora.request import LoRARequest
from vllm.utils import is_hip
from .conftest import cleanup
@@ -17,12 +18,23 @@ class ModelWithQuantization:
quantization: str
MODELS: List[ModelWithQuantization] = [
ModelWithQuantization(model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",
quantization="AWQ"),
ModelWithQuantization(model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
quantization="GPTQ"),
]
MODELS: List[ModelWithQuantization]
# AWQ quantization is currently not supported on ROCm.
if is_hip():
MODELS = [
ModelWithQuantization(
model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
quantization="GPTQ"),
]
else:
MODELS = [
ModelWithQuantization(
model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",
quantization="AWQ"),
ModelWithQuantization(
model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
quantization="GPTQ"),
]
def do_sample(llm: vllm.LLM,
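test_quant_model.py now selects MODELS per platform: GPTQ only on ROCm, AWQ and GPTQ elsewhere. The sketch below shows how such a platform-dependent list typically drives pytest parametrization; the is_hip stub and the parametrize decorator are assumptions for illustration and may not match the exact wiring in test_quant_model.py:

from dataclasses import dataclass
from typing import List

import pytest

def is_hip() -> bool:
    # Stand-in assumption for vllm.utils.is_hip().
    return False

@dataclass
class ModelWithQuantization:
    model_path: str
    quantization: str

# Mirror of the diff above: AWQ is left out on ROCm.
MODELS: List[ModelWithQuantization] = (
    [ModelWithQuantization("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", "GPTQ")]
    if is_hip() else
    [ModelWithQuantization("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", "AWQ"),
     ModelWithQuantization("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", "GPTQ")]
)

@pytest.mark.parametrize("model", MODELS)
def test_quantization_is_supported(model: ModelWithQuantization):
    # Each entry in MODELS becomes its own collected test case.
    assert model.quantization in ("AWQ", "GPTQ")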