[V1] TPU - Fix CI/CD runner (#14974)
This commit is contained in:
parent
e41e160263
commit
18551e820c
@ -1,25 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -e
|
||||
|
||||
# Build the docker image.
|
||||
docker build -f Dockerfile.tpu -t vllm-tpu .
|
||||
|
||||
# Set up cleanup.
|
||||
remove_docker_container() { docker rm -f tpu-test || true; }
|
||||
trap remove_docker_container EXIT
|
||||
# Remove the container that might not be cleaned up in the previous run.
|
||||
remove_docker_container
|
||||
|
||||
# For HF_TOKEN.
|
||||
source /etc/environment
|
||||
# Run a simple end-to-end example.
|
||||
docker run --privileged --net host --shm-size=16G -it \
|
||||
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
|
||||
vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
|
||||
&& python3 -m pip install pytest \
|
||||
&& python3 -m pip install lm_eval[api]==0.4.4 \
|
||||
&& pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
|
||||
&& python3 /workspace/vllm/tests/tpu/test_compilation.py \
|
||||
&& python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
|
||||
&& python3 /workspace/vllm/examples/offline_inference/tpu.py"
|
@ -15,13 +15,22 @@ remove_docker_container
|
||||
source /etc/environment
|
||||
# Run a simple end-to-end example.
|
||||
docker run --privileged --net host --shm-size=16G -it \
|
||||
-e "HF_TOKEN=$HF_TOKEN" -e "VLLM_USE_V1=1" --name tpu-test \
|
||||
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
|
||||
vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
|
||||
&& python3 -m pip install pytest \
|
||||
&& python3 -m pip install lm_eval[api]==0.4.4 \
|
||||
&& pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
|
||||
&& pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \
|
||||
&& pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \
|
||||
&& python3 /workspace/vllm/tests/tpu/test_compilation.py \
|
||||
&& python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
|
||||
&& python3 /workspace/vllm/examples/offline_inference/tpu.py"
|
||||
&& echo TEST_1 \
|
||||
&& VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \
|
||||
&& echo TEST_2 \
|
||||
&& VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \
|
||||
&& echo TEST_3 \
|
||||
&& VLLM_USE_V1=1 pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
|
||||
&& echo TEST_4 \
|
||||
&& VLLM_USE_V1=1 python3 /workspace/vllm/examples/offline_inference/tpu.py" \
|
||||
&& echo TEST_5 \
|
||||
&& VLLM_USE_V1=1 python3 /workspace/vllm/tests/tpu/test_compilation.py \
|
||||
|
||||
|
||||
# TODO: Fix these tests
|
||||
# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
|
||||
|
||||
|
@ -34,7 +34,9 @@ with depyf.prepare_debug(temp_dir):
|
||||
|
||||
# disable custom dispatcher, let Dynamo takes over
|
||||
# all the control
|
||||
llm = LLM(model="google/gemma-2b",
|
||||
llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct",
|
||||
max_model_len=512,
|
||||
max_num_seqs=64,
|
||||
enforce_eager=True,
|
||||
compilation_config={"level": CompilationLevel.DYNAMO_AS_IS})
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
@ -44,38 +46,51 @@ with depyf.prepare_debug(temp_dir):
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
assert generated_text.startswith(answer)
|
||||
|
||||
compiled_code = sorted(
|
||||
compiled_codes = sorted(
|
||||
glob.glob(os.path.join(temp_dir, "__transformed_code*.py")))
|
||||
|
||||
# we should only trigger Dynamo compilation three times:
|
||||
# one for the profiling phase without kv cache
|
||||
# one for the prefill phase with symbolic shapes
|
||||
# one for the decode phase with symbolic shapes
|
||||
for i, compiled_code in enumerate(compiled_codes):
|
||||
print("{} file: {}".format(i + 1, compiled_code))
|
||||
|
||||
# We should only trigger Dynamo compilation 4 times:
|
||||
# 1. forward pass (symbolic)
|
||||
# 2. compute_logits (symbolic)
|
||||
# 3. forward pass (shape 16)
|
||||
# 4. forward pass (shape 32)
|
||||
# and later calls should not trigger Dynamo compilation again.
|
||||
# NOTE: it might still trigger XLA compilation.
|
||||
# NOTE: It might still trigger XLA compilation.
|
||||
|
||||
# check we have three compiled code
|
||||
# this is the assumption when we use the custom dispatcher
|
||||
assert len(compiled_code) == 3
|
||||
# Check we have 4 compiled codes
|
||||
assert len(compiled_codes) == 4
|
||||
|
||||
# check all the compilations are as expected
|
||||
compiled_fn = sorted(
|
||||
kv_cache_prefix = "kv_cache"
|
||||
attn_prefix = "ragged_paged_attention"
|
||||
|
||||
# Check all the compilations are as expected
|
||||
compiled_fns = sorted(
|
||||
glob.glob(os.path.join(temp_dir, "__compiled_fn*Captured*.py")))
|
||||
|
||||
# the first compilation is the profiling phase,
|
||||
# it should not have any kv cache
|
||||
with open(compiled_fn[0]) as f:
|
||||
content = f.read()
|
||||
assert "kv_caches" not in content
|
||||
for i, compiled_fn in enumerate(compiled_fns):
|
||||
print("{} file: {}".format(i + 1, compiled_fn))
|
||||
|
||||
# the second compilation is the prefill phase,
|
||||
# it should have kv cache and the flash_attention op
|
||||
with open(compiled_fn[1]) as f:
|
||||
# The first compilation is symbolic, so it should not have any kv_caches
|
||||
with open(compiled_fns[0]) as f:
|
||||
content = f.read()
|
||||
assert "kv_caches" in content and "torch.ops.xla.flash_attention" in content
|
||||
assert kv_cache_prefix not in content
|
||||
|
||||
# the third compilation is the decode phase,
|
||||
# it should have kv cache and the paged_attention op
|
||||
with open(compiled_fn[2]) as f:
|
||||
# The second compilation is symbolic, so it should not have any kv_caches
|
||||
with open(compiled_fns[1]) as f:
|
||||
content = f.read()
|
||||
assert "kv_caches" in content and "torch.ops.xla.paged_attention" in content
|
||||
assert kv_cache_prefix not in content
|
||||
|
||||
# The third compilation is shape 16, so it should have kv_caches and the
|
||||
# ragged_paged_attention
|
||||
with open(compiled_fns[2]) as f:
|
||||
content = f.read()
|
||||
assert (kv_cache_prefix in content and attn_prefix in content)
|
||||
|
||||
# The forth compilation is shape 32, so it should have kv_caches and the
|
||||
# ragged_paged_attention
|
||||
with open(compiled_fns[3]) as f:
|
||||
content = f.read()
|
||||
assert (kv_cache_prefix in content and attn_prefix in content)
|
||||
|
@ -14,12 +14,17 @@ from ..utils import compare_two_settings
|
||||
def test_custom_dispatcher(monkeypatch: pytest.MonkeyPatch):
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_RPC_TIMEOUT", "30000")
|
||||
compare_two_settings(
|
||||
"google/gemma-2b",
|
||||
arg1=[
|
||||
"--enforce-eager",
|
||||
f"-O{CompilationLevel.DYNAMO_ONCE}",
|
||||
],
|
||||
arg2=["--enforce-eager", f"-O{CompilationLevel.DYNAMO_AS_IS}"],
|
||||
env1={},
|
||||
env2={})
|
||||
compare_two_settings("Qwen/Qwen2.5-1.5B-Instruct",
|
||||
arg1=[
|
||||
"--max-model-len=256",
|
||||
"--max-num-seqs=32",
|
||||
"--enforce-eager",
|
||||
f"-O{CompilationLevel.DYNAMO_ONCE}",
|
||||
],
|
||||
arg2=[
|
||||
"--max-model-len=256", "--max-num-seqs=32",
|
||||
"--enforce-eager",
|
||||
f"-O{CompilationLevel.DYNAMO_AS_IS}"
|
||||
],
|
||||
env1={},
|
||||
env2={})
|
||||
|
Loading…
x
Reference in New Issue
Block a user