[Misc] Replace os environ to monkeypatch in test suite (#14516)
Signed-off-by: sibi <85477603+t-sibiraj@users.noreply.github.com>
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Aaron Pham <contact@aarnphm.xyz>
parent 1e799b7ec1
commit a73e183e36
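The change applied across all of the hunks below follows one pattern: tests stop writing to `os.environ` directly (which leaks settings into whatever test runs next in the same process) and instead accept pytest's `monkeypatch` fixture, setting environment variables inside a `monkeypatch.context()` block so every change is undone when the block exits. A minimal sketch of the before/after shape (the test names and env value here are illustrative, not taken from the diff):

import os

import pytest


# Old style: the variable stays set after the test finishes and can
# silently affect later tests in the same process.
def test_backend_old_style():
    os.environ["VLLM_ATTENTION_BACKEND"] = "FLASH_ATTN"
    ...


# New style: monkeypatch.context() restores the previous value (or absence)
# of the variable when the block exits, even if the test fails partway.
def test_backend_new_style(monkeypatch: pytest.MonkeyPatch):
    with monkeypatch.context() as m:
        m.setenv("VLLM_ATTENTION_BACKEND", "FLASH_ATTN")
        ...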
@@ -522,7 +522,7 @@ steps:
 # TODO: investigate and fix
 # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
 - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
-- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py
+- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py

 - label: Plugin Tests (2 GPUs) # 40min
 working_dir: "/vllm-workspace/tests"
@@ -47,6 +47,7 @@ def test_vllm_gc_ed():
 @pytest.mark.parametrize("max_tokens", [5])
 @pytest.mark.parametrize("enforce_eager", [False])
 def test_models(
+monkeypatch: pytest.MonkeyPatch,
 hf_runner,
 model: str,
 backend: str,
@@ -63,31 +64,33 @@ def test_models(
 pytest.skip(
 f"{backend} does not support gemma2 with full context length.")

-os.environ["VLLM_ATTENTION_BACKEND"] = backend
+with monkeypatch.context() as m:
+m.setenv("VLLM_ATTENTION_BACKEND", backend)

 # 5042 tokens for gemma2
 # gemma2 has alternating sliding window size of 4096
 # we need a prompt with more than 4096 tokens to test the sliding window
 prompt = "The following numbers of the sequence " + ", ".join(
 str(i) for i in range(1024)) + " are:"
 example_prompts = [prompt]

 with hf_runner(model, dtype=dtype) as hf_model:
 hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

 with VllmRunner(model,
 max_model_len=8192,
 dtype=dtype,
 enforce_eager=enforce_eager,
 gpu_memory_utilization=0.7) as vllm_model:
-vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+vllm_outputs = vllm_model.generate_greedy(example_prompts,
+max_tokens)

 check_outputs_equal(
 outputs_0_lst=hf_outputs,
 outputs_1_lst=vllm_outputs,
 name_0="hf",
 name_1="vllm",
 )


 @multi_gpu_test(num_gpus=2)
@@ -104,6 +107,7 @@ def test_models(
 ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
 ])
 def test_models_distributed(
+monkeypatch: pytest.MonkeyPatch,
 hf_runner,
 vllm_runner,
 example_prompts,
@@ -116,34 +120,41 @@ def test_models_distributed(
 if test_suite != TARGET_TEST_SUITE:
 pytest.skip(f"Skip test for {test_suite}")

-if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa
-# test Ray Compiled Graph
-os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
-os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
+with monkeypatch.context() as monkeypatch_context:
+if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa
+# test Ray Compiled Graph
+monkeypatch_context.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
+monkeypatch_context.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")

 if attention_backend:
-os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend
+monkeypatch_context.setenv(
+"VLLM_ATTENTION_BACKEND",
+attention_backend,
+)

 dtype = "half"
 max_tokens = 5

 # NOTE: take care of the order. run vLLM first, and then run HF.
 # vLLM needs a fresh new process without cuda initialization.
 # if we run HF first, the cuda initialization will be done and it
-# will hurt multiprocessing backend with fork method (the default method).
-with vllm_runner(model,
-dtype=dtype,
-tensor_parallel_size=2,
-distributed_executor_backend=distributed_executor_backend
-) as vllm_model:
-vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+# will hurt multiprocessing backend with fork method
+# (the default method).
+with vllm_runner(
+model,
+dtype=dtype,
+tensor_parallel_size=2,
+distributed_executor_backend=distributed_executor_backend,
+) as vllm_model:
+vllm_outputs = vllm_model.generate_greedy(example_prompts,
+max_tokens)

 with hf_runner(model, dtype=dtype) as hf_model:
 hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

 check_outputs_equal(
 outputs_0_lst=hf_outputs,
 outputs_1_lst=vllm_outputs,
 name_0="hf",
 name_1="vllm",
 )
@@ -7,16 +7,22 @@ prefill requests are chunked.

 Run `pytest tests/models/test_chunked_prefill.py`.
 """
-import os
+from __future__ import annotations

+from typing import TYPE_CHECKING

 import pytest

-from tests.kernels.utils import override_backend_env_variable
 from vllm.platforms import current_platform
+from vllm.utils import STR_BACKEND_ENV_VAR

 from ..models.utils import check_logprobs_close, check_outputs_equal
 from ..utils import multi_gpu_test

+if TYPE_CHECKING:
+from .conftest import HfRunner, VllmRunner

 MODELS = [
 "facebook/opt-125m",
 "meta-llama/Llama-3.2-1B-Instruct",
@@ -24,12 +30,14 @@ MODELS = [


 @pytest.fixture(scope="function", autouse=True)
-def use_v0_only(monkeypatch):
+def use_v0_only(monkeypatch: pytest.MonkeyPatch):
 """
 Since this module is V0 only, set VLLM_USE_V1=0 for
 all tests in the file.
 """
-monkeypatch.setenv('VLLM_USE_V1', '0')
+with monkeypatch.context() as m:
+m.setenv('VLLM_USE_V1', '0')
+yield


 @pytest.mark.parametrize("model", MODELS)
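The `use_v0_only` fixture above is the fixture-level version of the same pattern: because it is `autouse`, every test in the module runs inside the `with monkeypatch.context()` block, and the `yield` marks where the tests execute before the variable is restored. A sketch of that shape, following the hunk above (the surrounding module contents are assumed):

import pytest


@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch: pytest.MonkeyPatch):
    """Force V0 (VLLM_USE_V1=0) for every test in this module."""
    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "0")
        yield  # the test body runs here; the variable is restored afterwards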
@@ -42,8 +50,8 @@ def use_v0_only(monkeypatch):
 @pytest.mark.parametrize("tensor_parallel_size", [1])
 @pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
 def test_models(
-hf_runner,
-vllm_runner,
+hf_runner: HfRunner,
+vllm_runner: VllmRunner,
 example_prompts,
 model: str,
 dtype: str,
@@ -52,37 +60,39 @@ def test_models(
 enforce_eager: bool,
 tensor_parallel_size: int,
 attention_backend: str,
-monkeypatch,
+monkeypatch: pytest.MonkeyPatch,
 ) -> None:
 """
 Checks exact match decode between huggingface model and vllm runner with
 chunked prefill.
 """
-override_backend_env_variable(monkeypatch, attention_backend)
+with monkeypatch.context() as m:
+m.setenv(STR_BACKEND_ENV_VAR, attention_backend)

 max_num_seqs = chunked_prefill_token_size
 max_num_batched_tokens = chunked_prefill_token_size

 with hf_runner(model, dtype=dtype) as hf_model:
 hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

 with vllm_runner(
 model,
 dtype=dtype,
 max_num_batched_tokens=max_num_batched_tokens,
 enable_chunked_prefill=True,
 tensor_parallel_size=tensor_parallel_size,
 enforce_eager=enforce_eager,
 max_num_seqs=max_num_seqs,
 ) as vllm_model:
-vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+vllm_outputs = vllm_model.generate_greedy(example_prompts,
+max_tokens)

 check_outputs_equal(
 outputs_0_lst=hf_outputs,
 outputs_1_lst=vllm_outputs,
 name_0="hf",
 name_1="vllm",
 )


 @multi_gpu_test(num_gpus=2)
@@ -90,57 +100,61 @@ def test_models(
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
 def test_models_distributed(
-hf_runner,
-vllm_runner,
+hf_runner: HfRunner,
+vllm_runner: VllmRunner,
 example_prompts,
 model: str,
 distributed_executor_backend: str,
 attention_backend: str,
-monkeypatch,
+monkeypatch: pytest.MonkeyPatch,
 ) -> None:
-override_backend_env_variable(monkeypatch, attention_backend)
-
-if (model == "meta-llama/Llama-3.2-1B-Instruct"
-and distributed_executor_backend == "ray"):
-# test Ray Compiled Graph
-os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
-os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
-
-dtype = "half"
-max_tokens = 5
-chunked_prefill_token_size = 16
-
-# Add a chunked prefill config.
-max_num_seqs = min(chunked_prefill_token_size, 256)
-assert chunked_prefill_token_size != -1
-enable_chunked_prefill = True
-max_num_batched_tokens = chunked_prefill_token_size
-
-# NOTE: take care of the order. run vLLM first, and then run HF.
-# vLLM needs a fresh new process without cuda initialization.
-# if we run HF first, the cuda initialization will be done and it
-# will hurt multiprocessing backend with fork method (the default method).
-
-with vllm_runner(
-model,
-dtype=dtype,
-tensor_parallel_size=2,
-max_num_seqs=max_num_seqs,
-enable_chunked_prefill=enable_chunked_prefill,
-max_num_batched_tokens=max_num_batched_tokens,
-distributed_executor_backend=distributed_executor_backend,
-) as vllm_model:
-vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-
-with hf_runner(model, dtype=dtype) as hf_model:
-hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-
-check_outputs_equal(
-outputs_0_lst=hf_outputs,
-outputs_1_lst=vllm_outputs,
-name_0="hf",
-name_1="vllm",
-)
+with monkeypatch.context() as m:
+m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
+if (model == "meta-llama/Llama-3.2-1B-Instruct"
+and distributed_executor_backend == "ray"):
+# test Ray Compiled Graph
+m.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
+m.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")
+
+dtype = "half"
+max_tokens = 5
+chunked_prefill_token_size = 16
+
+# Add a chunked prefill config.
+max_num_seqs = min(chunked_prefill_token_size, 256)
+assert chunked_prefill_token_size != -1
+enable_chunked_prefill = True
+max_num_batched_tokens = chunked_prefill_token_size
+
+# NOTE: take care of the order. run vLLM first, and then run HF.
+# vLLM needs a fresh new process without cuda initialization.
+# if we run HF first, the cuda initialization will be done and it
+# will hurt multiprocessing backend with
+# fork method (the default method).
+
+with vllm_runner(
+model,
+dtype=dtype,
+tensor_parallel_size=2,
+max_num_seqs=max_num_seqs,
+enable_chunked_prefill=enable_chunked_prefill,
+max_num_batched_tokens=max_num_batched_tokens,
+distributed_executor_backend=distributed_executor_backend,
+) as vllm_model:
+vllm_outputs = vllm_model.generate_greedy(
+example_prompts,
+max_tokens,
+)
+
+with hf_runner(model, dtype=dtype) as hf_model:
+hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+
+check_outputs_equal(
+outputs_0_lst=hf_outputs,
+outputs_1_lst=vllm_outputs,
+name_0="hf",
+name_1="vllm",
+)


 @pytest.mark.parametrize(
@@ -158,7 +172,7 @@ def test_models_distributed(
 # the async postprocessor
 @pytest.mark.parametrize("disable_async_output_proc", [True])
 def test_models_with_fp8_kv_cache(
-vllm_runner,
+vllm_runner: VllmRunner,
 example_prompts,
 kv_cache_dtype: str,
 model: str,
@@ -218,7 +232,7 @@ def test_models_with_fp8_kv_cache(
 @pytest.mark.parametrize("tensor_parallel_size", [1])
 @pytest.mark.parametrize("dtype", ["half"])
 def test_with_prefix_caching(
-vllm_runner,
+vllm_runner: VllmRunner,
 max_tokens: int,
 enforce_eager: bool,
 chunk_size: int,
@@ -254,8 +268,10 @@ def test_with_prefix_caching(
 ) as vllm_model:
 outputs[enable] = []
 for prompt in full_prompts:
-outputs[enable] += vllm_model.generate_greedy([prompt],
-max_tokens)
+outputs[enable] += vllm_model.generate_greedy(
+[prompt],
+max_tokens,
+)

 check_outputs_equal(
 outputs_0_lst=outputs[False],
@@ -274,8 +290,8 @@ def test_with_prefix_caching(
 @pytest.mark.cpu_model
 @pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
 def test_models_cpu(
-hf_runner,
-vllm_runner,
+hf_runner: HfRunner,
+vllm_runner: VllmRunner,
 example_prompts,
 model: str,
 dtype: str,
@@ -283,7 +299,7 @@ def test_models_cpu(
 chunked_prefill_token_size: int,
 enforce_eager: bool,
 attention_backend: str,
-monkeypatch,
+monkeypatch: pytest.MonkeyPatch,
 ) -> None:
 test_models(
 hf_runner,
@@ -307,7 +323,7 @@ def test_models_cpu(
 @pytest.mark.cpu_model
 @pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
 def test_with_prefix_caching_cpu(
-vllm_runner,
+vllm_runner: VllmRunner,
 max_tokens: int,
 enforce_eager: bool,
 chunk_size: int,
@@ -123,40 +123,38 @@ def test_cumem_with_cudagraph():
 # sleep mode with pytorch checkpoint
 ("facebook/opt-125m", False),
 ])
-def test_end_to_end(model: str, use_v1: bool):
-import os
-os.environ["VLLM_USE_V1"] = "1" if use_v1 else "0"
+def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
+with monkeypatch.context() as m:
+m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
 free, total = torch.cuda.mem_get_info()
 used_bytes_baseline = total - free # in case other process is running
 llm = LLM(model, enable_sleep_mode=True)
 prompt = "How are you?"
 sampling_params = SamplingParams(temperature=0, max_tokens=10)
 output = llm.generate(prompt, sampling_params)

 # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage,
 # which is difficult to measure in the test. therefore, we only
 # test sleep level 1 here.
 llm.sleep(level=1)

 free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info()
 used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline
 # now the memory usage is mostly cudagraph memory pool,
 # and it should be less than the model weights (1B model, 2GiB weights)

 # NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size)
 # is captured but cannot be releasesd from PyTorch due to a known bug,
 # therefore high memory usage after `llm.sleep` is called is expected.
 # FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode
 # in V1.
 if use_v1:
 assert used_bytes < 7 * GiB_bytes
 else:
 assert used_bytes < 2 * GiB_bytes

 llm.wake_up()
 output2 = llm.generate(prompt, sampling_params)

 # cmp output
 assert output[0].outputs[0].text == output2[0].outputs[0].text
-
-del os.environ["VLLM_USE_V1"]
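A side effect of the `test_end_to_end` change above: the old manual cleanup (`del os.environ["VLLM_USE_V1"]`) disappears, because the context manager restores the variable to its previous state automatically. A small self-contained check of that behaviour (a hypothetical test, not part of the diff):

import os

import pytest


def test_env_is_restored(monkeypatch: pytest.MonkeyPatch):
    before = os.environ.get("VLLM_USE_V1")
    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")
        assert os.environ["VLLM_USE_V1"] == "1"
    # No manual `del os.environ[...]` is needed: the previous value
    # (or its absence) is back once the block exits.
    assert os.environ.get("VLLM_USE_V1") == before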
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations

 import dataclasses
-from typing import Optional

 import pytest

@@ -22,75 +22,76 @@ class TestSetting:
 fullgraph: bool


-# representative settings for testing
-test_settings = [
-# basic llama model
-TestSetting(
-model="meta-llama/Llama-3.2-1B-Instruct",
-model_args=[],
-pp_size=2,
-tp_size=2,
-attn_backend="FLASHINFER",
-method="generate",
-fullgraph=True,
-),
-# llama model with quantization
-TestSetting(
-model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
-model_args=["--quantization", "gptq"],
-pp_size=1,
-tp_size=1,
-attn_backend="FLASH_ATTN",
-method="generate",
-fullgraph=True,
-),
-# MoE model
-TestSetting(
-model="ibm/PowerMoE-3b",
-model_args=[],
-pp_size=1,
-tp_size=2,
-attn_backend="FLASH_ATTN",
-method="generate",
-fullgraph=True,
-),
-# embedding model
-TestSetting(
-model="BAAI/bge-multilingual-gemma2",
-model_args=["--task", "embed"],
-pp_size=1,
-tp_size=1,
-attn_backend="FLASH_ATTN",
-method="encode",
-fullgraph=True,
-),
-# encoder-based embedding model (BERT)
-TestSetting(
-model="BAAI/bge-base-en-v1.5",
-model_args=["--task", "embed"],
-pp_size=1,
-tp_size=1,
-attn_backend="XFORMERS",
-method="encode",
-fullgraph=True,
-),
-# vision language model
-TestSetting(
-model="microsoft/Phi-3.5-vision-instruct",
-model_args=["--trust-remote-code", "--max-model-len", "2048"],
-pp_size=2,
-tp_size=1,
-attn_backend="FLASH_ATTN",
-method="generate_with_image",
-fullgraph=False,
-),
-]
-
-
 # we cannot afford testing the full Catesian product
 # of all models and all levels
-@pytest.mark.parametrize("test_setting", test_settings)
-def test_compile_correctness(test_setting: TestSetting):
+@pytest.mark.parametrize(
+"test_setting",
+[
+# basic llama model
+TestSetting(
+model="meta-llama/Llama-3.2-1B-Instruct",
+model_args=[],
+pp_size=2,
+tp_size=2,
+attn_backend="FLASHINFER",
+method="generate",
+fullgraph=True,
+),
+# llama model with quantization
+TestSetting(
+model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
+model_args=["--quantization", "gptq"],
+pp_size=1,
+tp_size=1,
+attn_backend="FLASH_ATTN",
+method="generate",
+fullgraph=True,
+),
+# MoE model
+TestSetting(
+model="ibm/PowerMoE-3b",
+model_args=[],
+pp_size=1,
+tp_size=2,
+attn_backend="FLASH_ATTN",
+method="generate",
+fullgraph=True,
+),
+# embedding model
+TestSetting(
+model="BAAI/bge-multilingual-gemma2",
+model_args=["--task", "embed"],
+pp_size=1,
+tp_size=1,
+attn_backend="FLASH_ATTN",
+method="encode",
+fullgraph=True,
+),
+# encoder-based embedding model (BERT)
+TestSetting(
+model="BAAI/bge-base-en-v1.5",
+model_args=["--task", "embed"],
+pp_size=1,
+tp_size=1,
+attn_backend="XFORMERS",
+method="encode",
+fullgraph=True,
+),
+# vision language model
+TestSetting(
+model="microsoft/Phi-3.5-vision-instruct",
+model_args=["--trust-remote-code", "--max-model-len", "2048"],
+pp_size=2,
+tp_size=1,
+attn_backend="FLASH_ATTN",
+method="generate_with_image",
+fullgraph=False,
+),
+])
+def test_compile_correctness(
+monkeypatch: pytest.MonkeyPatch,
+test_setting: TestSetting,
+):
 # this test is run under multiple suits, with different GPUs.
 # make sure we only run the test with correct CUDA devices.
 # don't use "<", as it will duplicate the tests.
@@ -103,41 +104,45 @@ def test_compile_correctness(test_setting: TestSetting):
 fullgraph = test_setting.fullgraph
 if cuda_device_count_stateless() != pp_size * tp_size:
 pytest.skip("Not correct CUDA devices for the test.")
-import os
-os.environ["VLLM_ATTENTION_BACKEND"] = attn_backend
-final_args = ["--enforce-eager"] + model_args + ["-pp", str(pp_size)] + \
-["-tp", str(tp_size)]
-
-all_args: list[list[str]] = []
-all_envs: list[Optional[dict[str, str]]] = []
-
-for level in [
-CompilationLevel.NO_COMPILATION,
-CompilationLevel.PIECEWISE,
-]:
-all_args.append(final_args + [f"-O{level}"])
-all_envs.append({})
-
-# inductor will change the output, so we only compare if the output
-# is close, not exactly the same.
-compare_all_settings(
-model,
-all_args,
-all_envs,
-method=method if method != "generate" else "generate_close")
-all_envs.clear()
-all_args.clear()
-
-for level in [
-CompilationLevel.NO_COMPILATION,
-CompilationLevel.DYNAMO_AS_IS,
-CompilationLevel.DYNAMO_ONCE,
-]:
-all_args.append(final_args + [f"-O{level}"])
-all_envs.append({})
-if level != CompilationLevel.DYNAMO_ONCE and not fullgraph:
-# "DYNAMO_ONCE" will always use fullgraph
-all_envs[-1][
-"VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0" # type: ignore
-
-compare_all_settings(model, all_args * 3, all_envs, method=method)
+
+with monkeypatch.context() as m:
+m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
+final_args = [
+"--enforce-eager", *model_args, "-pp",
+str(pp_size), "-tp",
+str(tp_size)
+]
+
+all_args: list[list[str]] = []
+all_envs: list[dict[str, str] | None] = []
+
+for level in [
+CompilationLevel.NO_COMPILATION,
+CompilationLevel.PIECEWISE,
+]:
+all_args.append(final_args + [f"-O{level}"])
+all_envs.append({})
+
+# inductor will change the output, so we only compare if the output
+# is close, not exactly the same.
+compare_all_settings(
+model,
+all_args,
+all_envs,
+method=method if method != "generate" else "generate_close")
+all_envs.clear()
+all_args.clear()
+
+for level in [
+CompilationLevel.NO_COMPILATION,
+CompilationLevel.DYNAMO_AS_IS,
+CompilationLevel.DYNAMO_ONCE,
+]:
+all_args.append(final_args + [f"-O{level}"])
+all_envs.append({})
+if level != CompilationLevel.DYNAMO_ONCE and not fullgraph:
+# "DYNAMO_ONCE" will always use fullgraph
+all_envs[-1][
+"VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0" # type: ignore
+
+compare_all_settings(model, all_args * 3, all_envs, method=method)
@@ -1,22 +1,115 @@
 # SPDX-License-Identifier: Apache-2.0

-import pytest
+from __future__ import annotations
+
+from typing import Any
+
+import pytest
+import torch
+
+from tests.quantization.utils import is_quant_method_supported
+from vllm import LLM, SamplingParams
 from vllm.config import CompilationLevel
+from vllm.platforms import current_platform

 from ..utils import fork_new_process_for_each_test
-from .utils import TEST_MODELS, check_full_graph_support


-@pytest.mark.parametrize("model_info", TEST_MODELS)
+@pytest.fixture(params=None, name="model_info")
+def models_list_fixture(request):
+TEST_MODELS: list[tuple[str, dict[str, Any]]] = [
+("facebook/opt-125m", {}),
+("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
+"dtype": torch.float16,
+"quantization": "compressed-tensors"
+}),
+("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", {
+"dtype": torch.float16,
+"quantization": "compressed-tensors"
+}),
+("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {
+"quantization": "compressed-tensors"
+}),
+("meta-llama/Llama-3.2-1B-Instruct", {}),
+]
+
+if is_quant_method_supported("aqlm"):
+TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", {
+"quantization": "aqlm"
+}))
+
+# TODO: figure out why this fails.
+if False and is_quant_method_supported("gguf"): # noqa: SIM223
+TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {
+"quantization": "gguf"
+}))
+
+if is_quant_method_supported("gptq"):
+TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {
+"quantization": "gptq"
+}))
+
+if is_quant_method_supported("gptq_marlin"):
+TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", {
+"quantization": "gptq_marlin"
+}))
+
+if is_quant_method_supported("gptq_marlin_24"):
+TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", {
+"quantization": "gptq_marlin_24"
+}))
+
+if is_quant_method_supported("marlin"):
+TEST_MODELS.append(
+("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", {
+"quantization": "marlin"
+}))
+
+if not current_platform.is_rocm() and is_quant_method_supported("awq"):
+TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {
+"quantization": "AWQ"
+}))
+
+return TEST_MODELS
+
+
 @pytest.mark.parametrize(
 "optimization_level",
-[CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE])
+[CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE],
+)
+@pytest.mark.parametrize("model_info", "", indirect=True)
 @fork_new_process_for_each_test
-def test_full_graph(model_info, optimization_level):
-model = model_info[0]
-model_kwargs = model_info[1]
-check_full_graph_support(model,
-model_kwargs,
-optimization_level,
-tp_size=1)
+def test_full_graph(
+monkeypatch: pytest.MonkeyPatch,
+model_info: tuple[str, dict[str, Any]],
+optimization_level: int,
+):
+model, model_kwargs = model_info
+
+with monkeypatch.context() as m:
+# make sure these models can be captured in full graph mode
+m.setenv("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1")
+print(f"MODEL={model}")
+
+prompts = [
+"Hello, my name is",
+"The president of the United States is",
+"The capital of France is",
+"The future of AI is",
+]
+sampling_params = SamplingParams(temperature=0)
+llm = LLM(
+model=model,
+enforce_eager=True,
+tensor_parallel_size=1,
+disable_custom_all_reduce=True,
+compilation_config=optimization_level,
+**model_kwargs,
+)
+outputs = llm.generate(prompts, sampling_params)
+
+# Print the outputs.
+for output in outputs:
+prompt = output.prompt
+generated_text = output.outputs[0].text
+print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
@@ -1,93 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-import os
-
-import torch
-
-from tests.quantization.utils import is_quant_method_supported
-from vllm import LLM, SamplingParams
-from vllm.platforms import current_platform
-
-TEST_MODELS = [
-("facebook/opt-125m", {}),
-("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
-"dtype": torch.float16,
-"quantization": "compressed-tensors"
-}),
-("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", {
-"dtype": torch.float16,
-"quantization": "compressed-tensors"
-}),
-("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {
-"quantization": "compressed-tensors"
-}),
-("meta-llama/Llama-3.2-1B-Instruct", {}),
-]
-
-if is_quant_method_supported("aqlm"):
-TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", {
-"quantization": "aqlm"
-}))
-
-# TODO: figure out why this fails.
-if False and is_quant_method_supported("gguf"): # noqa: SIM223
-TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {
-"quantization": "gguf"
-}))
-
-if is_quant_method_supported("gptq"):
-TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {
-"quantization": "gptq"
-}))
-
-if is_quant_method_supported("gptq_marlin"):
-TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", {
-"quantization": "gptq_marlin"
-}))
-
-if is_quant_method_supported("gptq_marlin_24"):
-TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", {
-"quantization": "gptq_marlin_24"
-}))
-
-if is_quant_method_supported("marlin"):
-TEST_MODELS.append(("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", {
-"quantization": "marlin"
-}))
-
-if not current_platform.is_rocm() and is_quant_method_supported("awq"):
-TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {
-"quantization": "AWQ"
-}))
-
-
-def check_full_graph_support(model,
-model_kwargs,
-optimization_level,
-tp_size=1):
-# make sure these models can be captured in full graph mode
-os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1"
-
-print(f"MODEL={model}")
-
-prompts = [
-"Hello, my name is",
-"The president of the United States is",
-"The capital of France is",
-"The future of AI is",
-]
-sampling_params = SamplingParams(temperature=0)
-llm = LLM(model=model,
-enforce_eager=True,
-tensor_parallel_size=tp_size,
-disable_custom_all_reduce=True,
-compilation_config=optimization_level,
-**model_kwargs)
-
-outputs = llm.generate(prompts, sampling_params)
-
-# Print the outputs.
-for output in outputs:
-prompt = output.prompt
-generated_text = output.outputs[0].text
-print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
@@ -3,7 +3,10 @@

 Run `pytest tests/distributed/test_comm_ops.py`.
 """
-import os
+from __future__ import annotations

+from typing import Any, Callable

 import pytest
 import ray
@@ -17,12 +20,18 @@ from ..utils import init_test_distributed_environment, multi_process_parallel


 @ray.remote(num_gpus=1, max_calls=1)
-def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int,
-distributed_init_port: str):
+def all_reduce_test_worker(
+monkeypatch: pytest.MonkeyPatch,
+tp_size: int,
+pp_size: int,
+rank: int,
+distributed_init_port: str,
+):
 # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
 # so that each worker can see all the GPUs
 # they will be able to set the device to the correct GPU
-os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)

 device = torch.device(f"cuda:{rank}")
 torch.cuda.set_device(device)
 init_test_distributed_environment(tp_size, pp_size, rank,
@@ -39,12 +48,17 @@ def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int,


 @ray.remote(num_gpus=1, max_calls=1)
-def all_gather_test_worker(tp_size: int, pp_size: int, rank: int,
-distributed_init_port: str):
+def all_gather_test_worker(
+monkeypatch: pytest.MonkeyPatch,
+tp_size: int,
+pp_size: int,
+rank: int,
+distributed_init_port: str,
+):
 # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
 # so that each worker can see all the GPUs
 # they will be able to set the device to the correct GPU
-os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
 device = torch.device(f"cuda:{rank}")
 torch.cuda.set_device(device)
 init_test_distributed_environment(tp_size, pp_size, rank,
@@ -67,12 +81,17 @@ def all_gather_test_worker(tp_size: int, pp_size: int, rank: int,


 @ray.remote(num_gpus=1, max_calls=1)
-def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
-distributed_init_port: str):
+def broadcast_tensor_dict_test_worker(
+monkeypatch: pytest.MonkeyPatch,
+tp_size: int,
+pp_size: int,
+rank: int,
+distributed_init_port: str,
+):
 # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
 # so that each worker can see all the GPUs
 # they will be able to set the device to the correct GPU
-os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
 device = torch.device(f"cuda:{rank}")
 torch.cuda.set_device(device)
 init_test_distributed_environment(tp_size, pp_size, rank,
@@ -106,9 +125,14 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,


 @ray.remote(num_gpus=1, max_calls=1)
-def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
-distributed_init_port: str):
-os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+def send_recv_tensor_dict_test_worker(
+monkeypatch: pytest.MonkeyPatch,
+tp_size: int,
+pp_size: int,
+rank: int,
+distributed_init_port: str,
+):
+monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
 device = torch.device(f"cuda:{rank}")
 torch.cuda.set_device(device)
 init_test_distributed_environment(tp_size, pp_size, rank,
@@ -146,9 +170,14 @@ def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,


 @ray.remote(num_gpus=1, max_calls=1)
-def send_recv_test_worker(tp_size: int, pp_size: int, rank: int,
-distributed_init_port: str):
-os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+def send_recv_test_worker(
+monkeypatch: pytest.MonkeyPatch,
+tp_size: int,
+pp_size: int,
+rank: int,
+distributed_init_port: str,
+):
+monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
 device = torch.device(f"cuda:{rank}")
 torch.cuda.set_device(device)
 init_test_distributed_environment(tp_size, pp_size, rank,
@@ -174,8 +203,12 @@ def send_recv_test_worker(tp_size: int, pp_size: int, rank: int,
 all_reduce_test_worker, all_gather_test_worker,
 broadcast_tensor_dict_test_worker
 ])
-def test_multi_process_tensor_parallel(tp_size, test_target):
-multi_process_parallel(tp_size, 1, test_target)
+def test_multi_process_tensor_parallel(
+monkeypatch: pytest.MonkeyPatch,
+tp_size: int,
+test_target: Callable[..., Any],
+):
+multi_process_parallel(monkeypatch, tp_size, 1, test_target)


 @pytest.mark.skipif(torch.cuda.device_count() < 2,
@@ -183,8 +216,12 @@ def test_multi_process_tensor_parallel(tp_size, test_target):
 @pytest.mark.parametrize("pp_size", [2])
 @pytest.mark.parametrize(
 "test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker])
-def test_multi_process_pipeline_parallel(pp_size, test_target):
-multi_process_parallel(1, pp_size, test_target)
+def test_multi_process_pipeline_parallel(
+monkeypatch: pytest.MonkeyPatch,
+pp_size: int,
+test_target: Callable[..., Any],
+):
+multi_process_parallel(monkeypatch, 1, pp_size, test_target)


 @pytest.mark.skipif(torch.cuda.device_count() < 4,
@@ -197,5 +234,9 @@ def test_multi_process_pipeline_parallel(pp_size, test_target):
 broadcast_tensor_dict_test_worker
 ])
 def test_multi_process_tensor_parallel_pipeline_parallel(
-tp_size, pp_size, test_target):
-multi_process_parallel(tp_size, pp_size, test_target)
+tp_size: int,
+pp_size: int,
+test_target: Callable[..., Any],
+monkeypatch: pytest.MonkeyPatch,
+):
+multi_process_parallel(monkeypatch, tp_size, pp_size, test_target)
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
import os
|
|
||||||
import random
|
import random
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
@@ -23,95 +22,115 @@ for i, v in enumerate(test_sizes):


 @ray.remote(num_gpus=1, max_calls=1)
-def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
-os.environ.pop("CUDA_VISIBLE_DEVICES", None)
-device = torch.device(f"cuda:{rank}")
-torch.cuda.set_device(device)
-init_test_distributed_environment(tp_size, pp_size, rank,
-distributed_init_port)
-ensure_model_parallel_initialized(tp_size, pp_size)
-group = get_tensor_model_parallel_group().device_group
+def graph_allreduce(
+monkeypatch: pytest.MonkeyPatch,
+tp_size,
+pp_size,
+rank,
+distributed_init_port,
+):
+with monkeypatch.context() as m:
+m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
+device = torch.device(f"cuda:{rank}")
+torch.cuda.set_device(device)
+init_test_distributed_environment(tp_size, pp_size, rank,
+distributed_init_port)
+ensure_model_parallel_initialized(tp_size, pp_size)
+group = get_tensor_model_parallel_group().device_group

 # A small all_reduce for warmup.
 # this is needed because device communicators might be created lazily
 # (e.g. NCCL). This will ensure that the communicator is initialized
 # before any communication happens, so that this group can be used for
 # graph capture immediately.
 data = torch.zeros(1)
 data = data.to(device=device)
 torch.distributed.all_reduce(data, group=group)
 torch.cuda.synchronize()
 del data

 # we use the first group to communicate once
 # and the second group to communicate twice
 # and so on
 # this is used to demonstrate that each group can
 # communicate independently
 num_communication = rank // tp_size + 1

 for sz in test_sizes:
 for dtype in [torch.float32, torch.float16, torch.bfloat16]:
 with graph_capture(device=device) as graph_capture_context:
 # use integers so result matches NCCL exactly
 inp1 = torch.randint(1,
 16, (sz, ),
 dtype=dtype,
 device=torch.cuda.current_device())
 inp2 = torch.randint(1,
 16, (sz, ),
 dtype=dtype,
 device=torch.cuda.current_device())
 torch.cuda.synchronize()
 graph = torch.cuda.CUDAGraph()
 with torch.cuda.graph(graph,
 stream=graph_capture_context.stream):
 for i in range(num_communication):
 out1 = tensor_model_parallel_all_reduce(inp1)
 # the input buffer is immediately modified to test
 # synchronization
 dist.all_reduce(inp1, group=group)
 out2 = tensor_model_parallel_all_reduce(inp2)
 dist.all_reduce(inp2, group=group)
 graph.replay()
 torch.testing.assert_close(out1, inp1)
 torch.testing.assert_close(out2, inp2)


 @ray.remote(num_gpus=1, max_calls=1)
-def eager_allreduce(tp_size, pp_size, rank, distributed_init_port):
-os.environ.pop("CUDA_VISIBLE_DEVICES", None)
-device = torch.device(f"cuda:{rank}")
-torch.cuda.set_device(device)
-init_test_distributed_environment(tp_size, pp_size, rank,
-distributed_init_port)
+def eager_allreduce(
+monkeypatch: pytest.MonkeyPatch,
+tp_size,
+pp_size,
+rank,
+distributed_init_port,
+):
+with monkeypatch.context() as m:
+m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
+device = torch.device(f"cuda:{rank}")
+torch.cuda.set_device(device)
+init_test_distributed_environment(tp_size, pp_size, rank,
+distributed_init_port)

 # we use the first group to communicate once
 # and the second group to communicate twice
 # and so on
 # this is used to demonstrate that each group can
 # communicate independently
 num_communication = rank // tp_size + 1
 sz = 1024
 fa = get_tp_group().ca_comm
 inp = torch.ones(sz, dtype=torch.float32, device=device)
 out = inp
 for _ in range(num_communication):
 out = fa.all_reduce(out, registered=False)
 torch.testing.assert_close(out, inp * (tp_size**num_communication))

 inp = torch.ones(sz * 4, dtype=torch.bfloat16, device=device)
 out = inp
 for _ in range(num_communication):
 out = fa.all_reduce(out, registered=False)
 torch.testing.assert_close(out, inp * (tp_size**num_communication))


 @pytest.mark.parametrize("tp_size", [2])
 @pytest.mark.parametrize("pipeline_parallel_size", [1, 2])
 @pytest.mark.parametrize("test_target", [eager_allreduce, graph_allreduce])
-def test_custom_allreduce(tp_size, pipeline_parallel_size, test_target):
+def test_custom_allreduce(
+monkeypatch: pytest.MonkeyPatch,
+tp_size,
+pipeline_parallel_size,
+test_target,
+):
 world_size = tp_size * pipeline_parallel_size
 if world_size > torch.cuda.device_count():
 pytest.skip("Not enough GPUs to run the test.")
-multi_process_parallel(tp_size, pipeline_parallel_size, test_target)
+multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size,
+test_target)
@ -7,33 +7,35 @@ import pytest
|
|||||||
from vllm.distributed.utils import get_pp_indices
|
from vllm.distributed.utils import get_pp_indices
|
||||||
|
|
||||||
|
|
||||||
def test_custom_layer_partition():
|
def test_custom_layer_partition(monkeypatch: pytest.MonkeyPatch):
|
||||||
|
|
||||||
def _verify(partition_str, num_layers, pp_size, goldens):
|
with monkeypatch.context() as m:
|
||||||
bak = os.environ.get("VLLM_PP_LAYER_PARTITION", None)
|
|
||||||
os.environ["VLLM_PP_LAYER_PARTITION"] = partition_str
|
|
||||||
for pp_rank, golden in enumerate(goldens):
|
|
||||||
assert get_pp_indices(num_layers, pp_rank, pp_size) == golden
|
|
||||||
if bak is not None:
|
|
||||||
os.environ["VLLM_PP_LAYER_PARTITION"] = bak
|
|
||||||
|
|
||||||
# Even partition
|
def _verify(partition_str, num_layers, pp_size, goldens):
|
||||||
_verify("5,5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
|
bak = os.environ.get("VLLM_PP_LAYER_PARTITION", None)
|
||||||
# Balanced partition
|
m.setenv("VLLM_PP_LAYER_PARTITION", partition_str)
|
||||||
_verify("4,6,6,4", 20, 4, [(0, 4), (4, 10), (10, 16), (16, 20)])
|
for pp_rank, golden in enumerate(goldens):
|
||||||
# Put remainder somewhere
|
assert get_pp_indices(num_layers, pp_rank, pp_size) == golden
|
||||||
_verify("5,6,5,6", 22, 4, [(0, 5), (5, 11), (11, 16), (16, 22)])
|
if bak is not None:
|
||||||
# Invalid partition strings
|
m.setenv("VLLM_PP_LAYER_PARTITION", bak)
|
||||||
with pytest.raises(ValueError):
|
|
||||||
_verify("5,5,5,5,", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
|
# Even partition
|
||||||
with pytest.raises(ValueError):
|
_verify("5,5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
|
||||||
_verify("5,5,5,a", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
|
# Balanced partition
|
||||||
# Wrong number of partitions
|
_verify("4,6,6,4", 20, 4, [(0, 4), (4, 10), (10, 16), (16, 20)])
|
||||||
with pytest.raises(ValueError):
|
# Put remainder somewhere
|
||||||
_verify("5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
|
_verify("5,6,5,6", 22, 4, [(0, 5), (5, 11), (11, 16), (16, 22)])
|
||||||
# Wrong number of layers
|
# Invalid partition strings
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
_verify("5,5,5,5", 21, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
|
_verify("5,5,5,5,", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
_verify("5,5,5,a", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
|
||||||
|
# Wrong number of partitions
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
_verify("5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
|
||||||
|
# Wrong number of layers
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
_verify("5,5,5,5", 21, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
|
||||||
|
|
||||||
|
|
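Worth noting about the `test_custom_layer_partition` rewrite above: the old `_verify` helper backed up and restored `VLLM_PP_LAYER_PARTITION` by hand, and that restore could be skipped if an assertion raised. With a context-managed monkeypatch the cleanup also runs when an exception escapes through `pytest.raises`. A sketch of that property, with an illustrative stand-in for the real helper:

```python
import os

import pytest


def _set_partition(m: pytest.MonkeyPatch, value: str) -> None:
    # Illustrative stand-in for the real verification helper.
    m.setenv("VLLM_PP_LAYER_PARTITION", value)
    if not all(part.isdigit() for part in value.split(",")):
        raise ValueError(f"invalid partition string: {value!r}")


def test_restore_on_error(monkeypatch: pytest.MonkeyPatch):
    before = os.environ.get("VLLM_PP_LAYER_PARTITION")
    with monkeypatch.context() as m:
        with pytest.raises(ValueError):
            _set_partition(m, "5,5,5,a")
    # Even though the helper raised, the previous value was restored.
    assert os.environ.get("VLLM_PP_LAYER_PARTITION") == before
```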
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@ -55,6 +57,10 @@ def test_custom_layer_partition():
|
|||||||
(5, 3, 1, (2, 4)),
|
(5, 3, 1, (2, 4)),
|
||||||
(5, 3, 2, (4, 5)),
|
(5, 3, 2, (4, 5)),
|
||||||
])
|
])
|
||||||
def test_uneven_auto_partition(num_hidden_layers: int, pp_size: int,
|
def test_uneven_auto_partition(
|
||||||
pp_rank: int, indices: tuple[int, int]):
|
num_hidden_layers: int,
|
||||||
|
pp_size: int,
|
||||||
|
pp_rank: int,
|
||||||
|
indices: tuple[int, int],
|
||||||
|
):
|
||||||
assert indices == get_pp_indices(num_hidden_layers, pp_rank, pp_size)
|
assert indices == get_pp_indices(num_hidden_layers, pp_rank, pp_size)
|
||||||
|
@ -1,11 +1,15 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
import os
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from ..utils import compare_two_settings, fork_new_process_for_each_test
|
from ..utils import compare_two_settings, fork_new_process_for_each_test
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from typing_extensions import LiteralString
|
||||||
|
|
||||||
|
|
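The import hunk above adds `from __future__ import annotations` plus a `TYPE_CHECKING` guard so that `typing_extensions.LiteralString` stays a checker-only dependency; at runtime the annotation is just a string. A small sketch of the same arrangement (the test body is illustrative):

```python
from __future__ import annotations

import os
from typing import TYPE_CHECKING

import pytest

if TYPE_CHECKING:
    # Resolved by type checkers only; never imported at runtime.
    from typing_extensions import LiteralString


@pytest.mark.parametrize("attn_backend", ["FLASHINFER"])
def test_backend_env(monkeypatch: pytest.MonkeyPatch,
                     attn_backend: LiteralString) -> None:
    with monkeypatch.context() as m:
        m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
        assert os.environ["VLLM_ATTENTION_BACKEND"] == attn_backend
```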
||||||
@pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [
|
@pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [
|
||||||
(2, "JackFram/llama-160m"),
|
(2, "JackFram/llama-160m"),
|
||||||
@ -15,18 +19,24 @@ from ..utils import compare_two_settings, fork_new_process_for_each_test
|
|||||||
"FLASHINFER",
|
"FLASHINFER",
|
||||||
])
|
])
|
||||||
@fork_new_process_for_each_test
|
@fork_new_process_for_each_test
|
||||||
def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND):
|
def test_pp_cudagraph(
|
||||||
cudagraph_args = [
|
monkeypatch: pytest.MonkeyPatch,
|
||||||
# use half precision for speed and memory savings in CI environment
|
PP_SIZE: int,
|
||||||
"--dtype",
|
MODEL_NAME: str,
|
||||||
"float16",
|
ATTN_BACKEND: LiteralString,
|
||||||
"--pipeline-parallel-size",
|
):
|
||||||
str(PP_SIZE),
|
with monkeypatch.context() as m:
|
||||||
"--distributed-executor-backend",
|
cudagraph_args = [
|
||||||
"mp",
|
# use half precision for speed and memory savings in CI environment
|
||||||
]
|
"--dtype",
|
||||||
os.environ["VLLM_ATTENTION_BACKEND"] = ATTN_BACKEND
|
"float16",
|
||||||
|
"--pipeline-parallel-size",
|
||||||
|
str(PP_SIZE),
|
||||||
|
"--distributed-executor-backend",
|
||||||
|
"mp",
|
||||||
|
]
|
||||||
|
m.setenv("VLLM_ATTENTION_BACKEND", ATTN_BACKEND)
|
||||||
|
|
||||||
eager_args = cudagraph_args + ["--enforce-eager"]
|
eager_args = cudagraph_args + ["--enforce-eager"]
|
||||||
|
|
||||||
compare_two_settings(MODEL_NAME, eager_args, cudagraph_args)
|
compare_two_settings(MODEL_NAME, eager_args, cudagraph_args)
|
||||||
|
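Because `monkeypatch.setenv` writes through to `os.environ`, variables set this way are still inherited by worker processes launched afterwards, which matters for tests like `test_pp_cudagraph` that spin up a multiprocessing executor. A sketch of that, with an illustrative backend value and child command:

```python
import subprocess
import sys

import pytest


def test_child_process_sees_env(monkeypatch: pytest.MonkeyPatch):
    with monkeypatch.context() as m:
        # Hypothetical backend value, for illustration only.
        m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER")

        # Children spawned while the context is active inherit the variable.
        out = subprocess.check_output([
            sys.executable, "-c",
            "import os; print(os.environ['VLLM_ATTENTION_BACKEND'])"
        ], text=True)

    assert out.strip() == "FLASHINFER"
```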
@ -49,7 +49,7 @@ TPU_TP_TEST_STR = "" #"tensor_parallel_size=4"
|
|||||||
@pytest.mark.skipif(not current_platform.is_cuda()
|
@pytest.mark.skipif(not current_platform.is_cuda()
|
||||||
and not current_platform.is_tpu(),
|
and not current_platform.is_tpu(),
|
||||||
reason="V1 is currently only supported on CUDA and TPU")
|
reason="V1 is currently only supported on CUDA and TPU")
|
||||||
def test_lm_eval_accuracy_v1_engine(monkeypatch):
|
def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch):
|
||||||
"""Run with the V1 Engine."""
|
"""Run with the V1 Engine."""
|
||||||
|
|
||||||
with monkeypatch.context() as m:
|
with monkeypatch.context() as m:
|
||||||
@ -67,7 +67,7 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch):
|
|||||||
run_test(more_args)
|
run_test(more_args)
|
||||||
|
|
||||||
|
|
||||||
def test_lm_eval_accuracy_v0_engine(monkeypatch):
|
def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch):
|
||||||
"""Run with the V0 Engine."""
|
"""Run with the V0 Engine."""
|
||||||
|
|
||||||
with monkeypatch.context() as m:
|
with monkeypatch.context() as m:
|
||||||
|
@ -53,32 +53,37 @@ def cache_models():
|
|||||||
|
|
||||||
@pytest.mark.skip_global_cleanup
|
@pytest.mark.skip_global_cleanup
|
||||||
@pytest.mark.usefixtures("cache_models")
|
@pytest.mark.usefixtures("cache_models")
|
||||||
def test_offline_mode(monkeypatch):
|
def test_offline_mode(monkeypatch: pytest.MonkeyPatch):
|
||||||
# Set HF to offline mode and ensure we can still construct an LLM
|
# Set HF to offline mode and ensure we can still construct an LLM
|
||||||
try:
|
with monkeypatch.context() as m:
|
||||||
monkeypatch.setenv("HF_HUB_OFFLINE", "1")
|
try:
|
||||||
monkeypatch.setenv("VLLM_NO_USAGE_STATS", "1")
|
m.setenv("HF_HUB_OFFLINE", "1")
|
||||||
|
m.setenv("VLLM_NO_USAGE_STATS", "1")
|
||||||
|
|
||||||
def disable_connect(*args, **kwargs):
|
def disable_connect(*args, **kwargs):
|
||||||
raise RuntimeError("No http calls allowed")
|
raise RuntimeError("No http calls allowed")
|
||||||
|
|
||||||
monkeypatch.setattr(urllib3.connection.HTTPConnection, "connect",
|
m.setattr(
|
||||||
disable_connect)
|
urllib3.connection.HTTPConnection,
|
||||||
monkeypatch.setattr(urllib3.connection.HTTPSConnection, "connect",
|
"connect",
|
||||||
disable_connect)
|
disable_connect,
|
||||||
|
)
|
||||||
|
m.setattr(
|
||||||
|
urllib3.connection.HTTPSConnection,
|
||||||
|
"connect",
|
||||||
|
disable_connect,
|
||||||
|
)
|
||||||
|
|
||||||
# Need to re-import huggingface_hub and friends to setup offline mode
|
# Need to re-import huggingface_hub
|
||||||
_re_import_modules()
|
# and friends to setup offline mode
|
||||||
# Cached model files should be used in offline mode
|
_re_import_modules()
|
||||||
for model_config in MODEL_CONFIGS:
|
# Cached model files should be used in offline mode
|
||||||
LLM(**model_config)
|
for model_config in MODEL_CONFIGS:
|
||||||
finally:
|
LLM(**model_config)
|
||||||
# Reset the environment after the test
|
finally:
|
||||||
# NB: Assuming tests are run in online mode
|
# Reset the environment after the test
|
||||||
monkeypatch.delenv("HF_HUB_OFFLINE")
|
# NB: Assuming tests are run in online mode
|
||||||
monkeypatch.delenv("VLLM_NO_USAGE_STATS")
|
_re_import_modules()
|
||||||
_re_import_modules()
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
def _re_import_modules():
|
def _re_import_modules():
|
||||||
|
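The offline-mode test above pairs the env vars with `m.setattr` so that any accidental HTTP connection fails loudly rather than silently going online. The same idea in isolation, assuming `urllib3` is installed as it is in that test:

```python
import pytest
import urllib3


def _disable_connect(*args, **kwargs):
    raise RuntimeError("No http calls allowed")


def test_network_is_blocked(monkeypatch: pytest.MonkeyPatch):
    with monkeypatch.context() as m:
        # Any attempt to open a connection now raises instead of
        # reaching out to the network; undone when the context exits.
        m.setattr(urllib3.connection.HTTPConnection, "connect",
                  _disable_connect)
        m.setattr(urllib3.connection.HTTPSConnection, "connect",
                  _disable_connect)

        conn = urllib3.connection.HTTPConnection(host="localhost", port=80)
        with pytest.raises(RuntimeError, match="No http calls allowed"):
            conn.connect()
```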
@ -70,7 +70,7 @@ def run_test(more_args):
|
|||||||
@pytest.mark.skipif(not current_platform.is_cuda()
|
@pytest.mark.skipif(not current_platform.is_cuda()
|
||||||
and not current_platform.is_tpu(),
|
and not current_platform.is_tpu(),
|
||||||
reason="V1 currently only supported on CUDA and TPU")
|
reason="V1 currently only supported on CUDA and TPU")
|
||||||
def test_lm_eval_accuracy_v1_engine(monkeypatch):
|
def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch):
|
||||||
"""Run with the V1 Engine."""
|
"""Run with the V1 Engine."""
|
||||||
|
|
||||||
with monkeypatch.context() as m:
|
with monkeypatch.context() as m:
|
||||||
@ -85,7 +85,8 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch):
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("more_args", MORE_ARGS_LIST)
|
@pytest.mark.parametrize("more_args", MORE_ARGS_LIST)
|
||||||
def test_lm_eval_accuracy_v0_engine(monkeypatch, more_args):
|
def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch,
|
||||||
|
more_args):
|
||||||
"""Run with the V0 Engine."""
|
"""Run with the V0 Engine."""
|
||||||
|
|
||||||
with monkeypatch.context() as m:
|
with monkeypatch.context() as m:
|
||||||
|
@ -5,13 +5,12 @@ from unittest.mock import Mock, patch
|
|||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from tests.kernels.utils import override_backend_env_variable
|
|
||||||
from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend
|
from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend
|
||||||
from vllm.platforms.cpu import CpuPlatform
|
from vllm.platforms.cpu import CpuPlatform
|
||||||
from vllm.platforms.cuda import CudaPlatform
|
from vllm.platforms.cuda import CudaPlatform
|
||||||
from vllm.platforms.openvino import OpenVinoPlatform
|
from vllm.platforms.openvino import OpenVinoPlatform
|
||||||
from vllm.platforms.rocm import RocmPlatform
|
from vllm.platforms.rocm import RocmPlatform
|
||||||
from vllm.utils import STR_FLASH_ATTN_VAL, STR_INVALID_VAL
|
from vllm.utils import STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL, STR_INVALID_VAL
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(autouse=True)
|
@pytest.fixture(autouse=True)
|
||||||
@ -25,87 +24,111 @@ def clear_cache():
|
|||||||
"name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER", "OPENVINO"])
|
"name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER", "OPENVINO"])
|
||||||
@pytest.mark.parametrize("use_v1", [True, False])
|
@pytest.mark.parametrize("use_v1", [True, False])
|
||||||
@pytest.mark.parametrize("device", ["cpu", "openvino", "hip", "cuda"])
|
@pytest.mark.parametrize("device", ["cpu", "openvino", "hip", "cuda"])
|
||||||
def test_env(name: str, use_v1: bool, device: str, monkeypatch):
|
def test_env(
|
||||||
|
name: str,
|
||||||
|
use_v1: bool,
|
||||||
|
device: str,
|
||||||
|
monkeypatch: pytest.MonkeyPatch,
|
||||||
|
):
|
||||||
"""Test that the attention selector can be set via environment variable.
|
"""Test that the attention selector can be set via environment variable.
|
||||||
Note that we do not test FlashAttn because it is the default backend.
|
Note that we do not test FlashAttn because it is the default backend.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
monkeypatch.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
|
with monkeypatch.context() as m:
|
||||||
override_backend_env_variable(monkeypatch, name)
|
m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
|
||||||
|
m.setenv(STR_BACKEND_ENV_VAR, name)
|
||||||
|
|
||||||
if device == "cpu":
|
if device == "cpu":
|
||||||
with patch("vllm.attention.selector.current_platform", CpuPlatform()):
|
|
||||||
backend = get_attn_backend(16, torch.float16, torch.float16, 16,
|
|
||||||
False)
|
|
||||||
assert backend.get_name() == "TORCH_SDPA"
|
|
||||||
elif device == "hip":
|
|
||||||
with patch("vllm.attention.selector.current_platform", RocmPlatform()):
|
|
||||||
backend = get_attn_backend(16, torch.float16, torch.float16, 16,
|
|
||||||
False)
|
|
||||||
EXPECTED = "ROCM_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH"
|
|
||||||
assert backend.get_name() == EXPECTED
|
|
||||||
elif device == "openvino":
|
|
||||||
with patch("vllm.attention.selector.current_platform",
|
|
||||||
OpenVinoPlatform()), patch.dict('sys.modules',
|
|
||||||
{'openvino': Mock()}):
|
|
||||||
backend = get_attn_backend(16, torch.float16, torch.float16, 16,
|
|
||||||
False)
|
|
||||||
assert backend.get_name() == "OPENVINO"
|
|
||||||
else:
|
|
||||||
if name in ["XFORMERS", "FLASHINFER"]:
|
|
||||||
with patch("vllm.attention.selector.current_platform",
|
with patch("vllm.attention.selector.current_platform",
|
||||||
CudaPlatform()):
|
CpuPlatform()):
|
||||||
backend = get_attn_backend(16, torch.float16, torch.float16,
|
backend = get_attn_backend(16, torch.float16, torch.float16,
|
||||||
16, False)
|
16, False)
|
||||||
EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else name
|
assert backend.get_name() == "TORCH_SDPA"
|
||||||
|
elif device == "hip":
|
||||||
|
with patch("vllm.attention.selector.current_platform",
|
||||||
|
RocmPlatform()):
|
||||||
|
backend = get_attn_backend(16, torch.float16, torch.float16,
|
||||||
|
16, False)
|
||||||
|
EXPECTED = "ROCM_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH"
|
||||||
assert backend.get_name() == EXPECTED
|
assert backend.get_name() == EXPECTED
|
||||||
|
elif device == "openvino":
|
||||||
|
with patch("vllm.attention.selector.current_platform",
|
||||||
|
OpenVinoPlatform()), patch.dict('sys.modules',
|
||||||
|
{'openvino': Mock()}):
|
||||||
|
backend = get_attn_backend(16, torch.float16, torch.float16,
|
||||||
|
16, False)
|
||||||
|
assert backend.get_name() == "OPENVINO"
|
||||||
|
else:
|
||||||
|
if name in ["XFORMERS", "FLASHINFER"]:
|
||||||
|
with patch("vllm.attention.selector.current_platform",
|
||||||
|
CudaPlatform()):
|
||||||
|
backend = get_attn_backend(16, torch.float16,
|
||||||
|
torch.float16, 16, False)
|
||||||
|
EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else name
|
||||||
|
assert backend.get_name() == EXPECTED
|
||||||
|
|
||||||
|
|
||||||
def test_flash_attn(monkeypatch):
|
def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
|
||||||
"""Test FlashAttn validation."""
|
"""Test FlashAttn validation."""
|
||||||
# TODO: When testing for v1, pipe in `use_v1` as an argument to
|
# TODO: When testing for v1, pipe in `use_v1` as an argument to
|
||||||
# get_attn_backend
|
# get_attn_backend
|
||||||
|
|
||||||
override_backend_env_variable(monkeypatch, STR_FLASH_ATTN_VAL)
|
with monkeypatch.context() as m:
|
||||||
|
m.setenv(STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL)
|
||||||
|
|
||||||
# Unsupported CUDA arch
|
# Unsupported CUDA arch
|
||||||
with patch("torch.cuda.get_device_capability", return_value=(7, 5)):
|
monkeypatch.setattr(torch.cuda, "get_device_capability", lambda:
|
||||||
|
(7, 5))
|
||||||
backend = get_attn_backend(16, torch.float16, None, 16, False)
|
backend = get_attn_backend(16, torch.float16, None, 16, False)
|
||||||
assert backend.get_name() != STR_FLASH_ATTN_VAL
|
assert backend.get_name() != STR_FLASH_ATTN_VAL
|
||||||
|
|
||||||
# Unsupported data type
|
# Reset the monkeypatch for subsequent tests
|
||||||
backend = get_attn_backend(16, torch.float8_e4m3fn, None, 16, False)
|
monkeypatch.undo()
|
||||||
assert backend.get_name() != STR_FLASH_ATTN_VAL
|
|
||||||
|
|
||||||
# Unsupported kv cache data type
|
# Unsupported data type
|
||||||
backend = get_attn_backend(16, torch.float16, "fp8", 16, False)
|
backend = get_attn_backend(16, torch.float8_e4m3fn, None, 16, False)
|
||||||
assert backend.get_name() != STR_FLASH_ATTN_VAL
|
assert backend.get_name() != STR_FLASH_ATTN_VAL
|
||||||
|
|
||||||
# Unsupported block size
|
# Unsupported kv cache data type
|
||||||
backend = get_attn_backend(16, torch.float16, None, 8, False)
|
backend = get_attn_backend(16, torch.float16, "fp8", 16, False)
|
||||||
assert backend.get_name() != STR_FLASH_ATTN_VAL
|
assert backend.get_name() != STR_FLASH_ATTN_VAL
|
||||||
|
|
||||||
# flash-attn is not installed
|
# Unsupported block size
|
||||||
with patch.dict('sys.modules', {'vllm_flash_attn': None}):
|
backend = get_attn_backend(16, torch.float16, None, 8, False)
|
||||||
|
assert backend.get_name() != STR_FLASH_ATTN_VAL
|
||||||
|
|
||||||
|
# flash-attn is not installed
|
||||||
|
import sys
|
||||||
|
original_module = sys.modules.get('vllm_flash_attn')
|
||||||
|
monkeypatch.setitem(sys.modules, 'vllm_flash_attn', None)
|
||||||
backend = get_attn_backend(16, torch.float16, None, 16, False)
|
backend = get_attn_backend(16, torch.float16, None, 16, False)
|
||||||
assert backend.get_name() != STR_FLASH_ATTN_VAL
|
assert backend.get_name() != STR_FLASH_ATTN_VAL
|
||||||
|
|
||||||
# Unsupported head size
|
# Restore the original module if it existed
|
||||||
backend = get_attn_backend(17, torch.float16, None, 16, False)
|
if original_module is not None:
|
||||||
assert backend.get_name() != STR_FLASH_ATTN_VAL
|
monkeypatch.setitem(sys.modules, 'vllm_flash_attn',
|
||||||
|
original_module)
|
||||||
|
else:
|
||||||
|
monkeypatch.delitem(sys.modules, 'vllm_flash_attn', raising=False)
|
||||||
|
|
||||||
# Attention-free models should bypass env and use PlaceholderAttention
|
# Unsupported head size
|
||||||
backend = get_attn_backend(16, torch.float16, torch.float16, 16, True)
|
backend = get_attn_backend(17, torch.float16, None, 16, False)
|
||||||
assert backend.get_name() != STR_FLASH_ATTN_VAL
|
assert backend.get_name() != STR_FLASH_ATTN_VAL
|
||||||
|
|
||||||
|
# Attention-free models should bypass env and use PlaceholderAttention
|
||||||
|
backend = get_attn_backend(16, torch.float16, torch.float16, 16, True)
|
||||||
|
assert backend.get_name() != STR_FLASH_ATTN_VAL
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("use_v1", [True, False])
|
@pytest.mark.parametrize("use_v1", [True, False])
|
||||||
def test_invalid_env(use_v1: bool, monkeypatch):
|
def test_invalid_env(use_v1: bool, monkeypatch: pytest.MonkeyPatch):
|
||||||
"""Ignore the invalid env variable if it is set."""
|
|
||||||
monkeypatch.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
|
|
||||||
override_backend_env_variable(monkeypatch, STR_INVALID_VAL)
|
|
||||||
|
|
||||||
with patch("vllm.attention.selector.current_platform", CudaPlatform()):
|
with monkeypatch.context() as m, patch(
|
||||||
|
"vllm.attention.selector.current_platform", CudaPlatform()):
|
||||||
|
m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
|
||||||
|
m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL)
|
||||||
|
|
||||||
|
# Test with head size 32
|
||||||
backend = get_attn_backend(32, torch.float16, None, 16, False)
|
backend = get_attn_backend(32, torch.float16, None, 16, False)
|
||||||
EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else "FLASH_ATTN"
|
EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else "FLASH_ATTN"
|
||||||
assert backend.get_name() == EXPECTED
|
assert backend.get_name() == EXPECTED
|
||||||
|
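The rewritten `test_flash_attn` above simulates a missing `vllm_flash_attn` package with `monkeypatch.setitem(sys.modules, ...)`: a `None` entry in `sys.modules` makes any later import of that name raise `ImportError`, and monkeypatch removes the entry again on undo. A minimal sketch with a hypothetical module name:

```python
import sys

import pytest


def test_simulate_missing_module(monkeypatch: pytest.MonkeyPatch):
    with monkeypatch.context() as m:
        # A None entry in sys.modules makes the import machinery raise
        # ImportError for this (hypothetical) package name.
        m.setitem(sys.modules, "some_optional_accelerator", None)

        with pytest.raises(ImportError):
            import some_optional_accelerator  # noqa: F401

    # The sys.modules entry is removed again when the context exits.
    assert "some_optional_accelerator" not in sys.modules
```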
@ -1,7 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
import os
|
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
@ -11,36 +9,38 @@ from vllm import _custom_ops as ops # noqa: F401
|
|||||||
|
|
||||||
@pytest.mark.skipif(not hasattr(torch.ops._C, "awq_dequantize"),
|
@pytest.mark.skipif(not hasattr(torch.ops._C, "awq_dequantize"),
|
||||||
reason="AWQ is not supported on this GPU type.")
|
reason="AWQ is not supported on this GPU type.")
|
||||||
def test_awq_dequantize_opcheck():
|
def test_awq_dequantize_opcheck(monkeypatch: pytest.MonkeyPatch):
|
||||||
os.environ["VLLM_USE_TRITON_AWQ"] = "0"
|
with monkeypatch.context() as m:
|
||||||
qweight = torch.randint(-2000000000,
|
m.setenv("VLLM_USE_TRITON_AWQ", "0")
|
||||||
2000000000, (8192, 256),
|
qweight = torch.randint(-2000000000,
|
||||||
device='cuda',
|
2000000000, (8192, 256),
|
||||||
dtype=torch.int32)
|
device='cuda',
|
||||||
scales = torch.rand((64, 2048), device='cuda', dtype=torch.float16)
|
dtype=torch.int32)
|
||||||
zeros = torch.empty((64, 256), device='cuda', dtype=torch.int32)
|
scales = torch.rand((64, 2048), device='cuda', dtype=torch.float16)
|
||||||
split_k_iters = 0
|
zeros = torch.empty((64, 256), device='cuda', dtype=torch.int32)
|
||||||
thx = 0
|
split_k_iters = 0
|
||||||
thy = 0
|
thx = 0
|
||||||
opcheck(torch.ops._C.awq_dequantize,
|
thy = 0
|
||||||
(qweight, scales, zeros, split_k_iters, thx, thy))
|
opcheck(torch.ops._C.awq_dequantize,
|
||||||
|
(qweight, scales, zeros, split_k_iters, thx, thy))
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skip(reason="Not working; needs investigation.")
|
@pytest.mark.skip(reason="Not working; needs investigation.")
|
||||||
@pytest.mark.skipif(not hasattr(torch.ops._C, "awq_gemm"),
|
@pytest.mark.skipif(not hasattr(torch.ops._C, "awq_gemm"),
|
||||||
reason="AWQ is not supported on this GPU type.")
|
reason="AWQ is not supported on this GPU type.")
|
||||||
def test_awq_gemm_opcheck():
|
def test_awq_gemm_opcheck(monkeypatch: pytest.MonkeyPatch):
|
||||||
os.environ["VLLM_USE_TRITON_AWQ"] = "0"
|
with monkeypatch.context() as m:
|
||||||
input = torch.rand((2, 8192), device='cuda', dtype=torch.float16)
|
m.setenv("VLLM_USE_TRITON_AWQ", "0")
|
||||||
qweight = torch.randint(-2000000000,
|
input = torch.rand((2, 8192), device='cuda', dtype=torch.float16)
|
||||||
2000000000, (8192, 256),
|
qweight = torch.randint(-2000000000,
|
||||||
device='cuda',
|
2000000000, (8192, 256),
|
||||||
dtype=torch.int32)
|
device='cuda',
|
||||||
scales = torch.randint(-2000000000,
|
dtype=torch.int32)
|
||||||
2000000000, (64, 256),
|
scales = torch.randint(-2000000000,
|
||||||
device='cuda',
|
2000000000, (64, 256),
|
||||||
dtype=torch.int32)
|
device='cuda',
|
||||||
qzeros = torch.empty((64, 2048), device='cuda', dtype=torch.float16)
|
dtype=torch.int32)
|
||||||
split_k_iters = 8
|
qzeros = torch.empty((64, 2048), device='cuda', dtype=torch.float16)
|
||||||
opcheck(torch.ops._C.awq_gemm,
|
split_k_iters = 8
|
||||||
(input, qweight, qzeros, scales, split_k_iters))
|
opcheck(torch.ops._C.awq_gemm,
|
||||||
|
(input, qweight, qzeros, scales, split_k_iters))
|
||||||
|
@ -1,13 +1,11 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
from unittest.mock import patch
|
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from tests.kernels.utils import override_backend_env_variable
|
|
||||||
from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend
|
from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend
|
||||||
from vllm.platforms.rocm import RocmPlatform
|
from vllm.platforms.rocm import RocmPlatform
|
||||||
|
from vllm.utils import STR_BACKEND_ENV_VAR
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(autouse=True)
|
@pytest.fixture(autouse=True)
|
||||||
@ -17,15 +15,19 @@ def clear_cache():
|
|||||||
_cached_get_attn_backend.cache_clear()
|
_cached_get_attn_backend.cache_clear()
|
||||||
|
|
||||||
|
|
||||||
def test_selector(monkeypatch):
|
def test_selector(monkeypatch: pytest.MonkeyPatch):
|
||||||
"""Test that the attention selector for ROCm.
|
with monkeypatch.context() as m:
|
||||||
"""
|
m.setenv(STR_BACKEND_ENV_VAR, "ROCM_FLASH")
|
||||||
override_backend_env_variable(monkeypatch, "ROCM_FLASH")
|
|
||||||
|
|
||||||
with patch("vllm.attention.selector.current_platform", RocmPlatform()):
|
# Set the current platform to ROCm using monkeypatch
|
||||||
|
monkeypatch.setattr("vllm.attention.selector.current_platform",
|
||||||
|
RocmPlatform())
|
||||||
|
|
||||||
|
# Test standard ROCm attention
|
||||||
backend = get_attn_backend(16, torch.float16, torch.float16, 16, False)
|
backend = get_attn_backend(16, torch.float16, torch.float16, 16, False)
|
||||||
assert (backend.get_name() == "ROCM_FLASH"
|
assert (backend.get_name() == "ROCM_FLASH"
|
||||||
or backend.get_name() == "ROCM_ATTN_VLLM_V1")
|
or backend.get_name() == "ROCM_ATTN_VLLM_V1")
|
||||||
|
|
||||||
# mla test for deepseek related
|
# mla test for deepseek related
|
||||||
backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False,
|
backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False,
|
||||||
False, True)
|
False, True)
|
||||||
|
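In the ROCm selector test above, `unittest.mock.patch` is swapped for `monkeypatch.setattr` with a dotted-string target, which patches an attribute in another module and undoes it automatically. A self-contained sketch of the same mechanism, patching a stdlib attribute instead of the vLLM-internal `vllm.attention.selector.current_platform`:

```python
import platform

import pytest


def test_patch_by_dotted_path(monkeypatch: pytest.MonkeyPatch):
    with monkeypatch.context() as m:
        # The dotted string is resolved to (module, attribute) and the
        # attribute is replaced for the duration of the context.
        m.setattr("platform.system", lambda: "ROCmLikePlatform")
        assert platform.system() == "ROCmLikePlatform"

    # Outside the context the real implementation is back.
    assert platform.system() != "ROCmLikePlatform"
```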
@ -12,11 +12,10 @@ import pytest
|
|||||||
from tests.kernels.utils import override_backend_env_variable
|
from tests.kernels.utils import override_backend_env_variable
|
||||||
from tests.quantization.utils import is_quant_method_supported
|
from tests.quantization.utils import is_quant_method_supported
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
from vllm.utils import STR_BACKEND_ENV_VAR
|
||||||
|
|
||||||
from ...utils import check_logprobs_close
|
from ...utils import check_logprobs_close
|
||||||
|
|
||||||
os.environ["TOKENIZERS_PARALLELISM"] = "true"
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.quant_model
|
@pytest.mark.quant_model
|
||||||
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
|
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
|
||||||
@ -55,45 +54,47 @@ def test_models(
|
|||||||
backend: str,
|
backend: str,
|
||||||
tensor_parallel_size: int,
|
tensor_parallel_size: int,
|
||||||
disable_async_output_proc: bool,
|
disable_async_output_proc: bool,
|
||||||
monkeypatch,
|
monkeypatch: pytest.MonkeyPatch,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""
|
"""
|
||||||
Only checks log probs match to cover the discrepancy in
|
Only checks log probs match to cover the discrepancy in
|
||||||
numerical sensitive kernels.
|
numerical sensitive kernels.
|
||||||
"""
|
"""
|
||||||
override_backend_env_variable(monkeypatch, backend)
|
with monkeypatch.context() as m:
|
||||||
|
m.setenv("TOKENIZERS_PARALLELISM", 'true')
|
||||||
|
m.setenv(STR_BACKEND_ENV_VAR, backend)
|
||||||
|
|
||||||
MAX_MODEL_LEN = 1024
|
MAX_MODEL_LEN = 1024
|
||||||
NUM_LOG_PROBS = 8
|
NUM_LOG_PROBS = 8
|
||||||
|
|
||||||
with vllm_runner(
|
with vllm_runner(
|
||||||
base_model,
|
base_model,
|
||||||
max_model_len=MAX_MODEL_LEN,
|
max_model_len=MAX_MODEL_LEN,
|
||||||
tensor_parallel_size=tensor_parallel_size,
|
tensor_parallel_size=tensor_parallel_size,
|
||||||
enforce_eager=enforce_eager,
|
enforce_eager=enforce_eager,
|
||||||
kv_cache_dtype="auto",
|
kv_cache_dtype="auto",
|
||||||
disable_async_output_proc=disable_async_output_proc,
|
disable_async_output_proc=disable_async_output_proc,
|
||||||
) as vllm_model:
|
) as vllm_model:
|
||||||
baseline_outputs = vllm_model.generate_greedy_logprobs(
|
baseline_outputs = vllm_model.generate_greedy_logprobs(
|
||||||
example_prompts, max_tokens, NUM_LOG_PROBS)
|
example_prompts, max_tokens, NUM_LOG_PROBS)
|
||||||
|
|
||||||
with vllm_runner(
|
with vllm_runner(
|
||||||
test_model,
|
test_model,
|
||||||
max_model_len=MAX_MODEL_LEN,
|
max_model_len=MAX_MODEL_LEN,
|
||||||
tensor_parallel_size=tensor_parallel_size,
|
tensor_parallel_size=tensor_parallel_size,
|
||||||
enforce_eager=enforce_eager,
|
enforce_eager=enforce_eager,
|
||||||
kv_cache_dtype=kv_cache_dtype,
|
kv_cache_dtype=kv_cache_dtype,
|
||||||
disable_async_output_proc=disable_async_output_proc,
|
disable_async_output_proc=disable_async_output_proc,
|
||||||
) as vllm_model:
|
) as vllm_model:
|
||||||
test_outputs = vllm_model.generate_greedy_logprobs(
|
test_outputs = vllm_model.generate_greedy_logprobs(
|
||||||
example_prompts, max_tokens, NUM_LOG_PROBS)
|
example_prompts, max_tokens, NUM_LOG_PROBS)
|
||||||
|
|
||||||
check_logprobs_close(
|
check_logprobs_close(
|
||||||
outputs_0_lst=baseline_outputs,
|
outputs_0_lst=baseline_outputs,
|
||||||
outputs_1_lst=test_outputs,
|
outputs_1_lst=test_outputs,
|
||||||
name_0="fp16_kv_cache",
|
name_0="fp16_kv_cache",
|
||||||
name_1="fp8_kv_cache",
|
name_1="fp8_kv_cache",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.cpu_model
|
@pytest.mark.cpu_model
|
||||||
@ -119,38 +120,41 @@ def test_cpu_models(
|
|||||||
test_model: str,
|
test_model: str,
|
||||||
max_tokens: int,
|
max_tokens: int,
|
||||||
disable_async_output_proc: bool,
|
disable_async_output_proc: bool,
|
||||||
|
monkeypatch: pytest.MonkeyPatch,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""
|
"""
|
||||||
Only checks log probs match to cover the discrepancy in
|
Only checks log probs match to cover the discrepancy in
|
||||||
numerical sensitive kernels.
|
numerical sensitive kernels.
|
||||||
"""
|
"""
|
||||||
|
with monkeypatch.context() as m:
|
||||||
|
m.setenv("TOKENIZERS_PARALLELISM", 'true')
|
||||||
|
|
||||||
MAX_MODEL_LEN = 1024
|
MAX_MODEL_LEN = 1024
|
||||||
NUM_LOG_PROBS = 8
|
NUM_LOG_PROBS = 8
|
||||||
|
|
||||||
with vllm_runner(
|
with vllm_runner(
|
||||||
base_model,
|
base_model,
|
||||||
max_model_len=MAX_MODEL_LEN,
|
max_model_len=MAX_MODEL_LEN,
|
||||||
dtype="bfloat16",
|
dtype="bfloat16",
|
||||||
kv_cache_dtype="auto",
|
kv_cache_dtype="auto",
|
||||||
disable_async_output_proc=disable_async_output_proc,
|
disable_async_output_proc=disable_async_output_proc,
|
||||||
) as vllm_model:
|
) as vllm_model:
|
||||||
baseline_outputs = vllm_model.generate_greedy_logprobs(
|
baseline_outputs = vllm_model.generate_greedy_logprobs(
|
||||||
example_prompts, max_tokens, NUM_LOG_PROBS)
|
example_prompts, max_tokens, NUM_LOG_PROBS)
|
||||||
|
|
||||||
with vllm_runner(
|
with vllm_runner(
|
||||||
test_model,
|
test_model,
|
||||||
max_model_len=MAX_MODEL_LEN,
|
max_model_len=MAX_MODEL_LEN,
|
||||||
dtype="bfloat16",
|
dtype="bfloat16",
|
||||||
kv_cache_dtype=kv_cache_dtype,
|
kv_cache_dtype=kv_cache_dtype,
|
||||||
disable_async_output_proc=disable_async_output_proc,
|
disable_async_output_proc=disable_async_output_proc,
|
||||||
) as vllm_model:
|
) as vllm_model:
|
||||||
test_outputs = vllm_model.generate_greedy_logprobs(
|
test_outputs = vllm_model.generate_greedy_logprobs(
|
||||||
example_prompts, max_tokens, NUM_LOG_PROBS)
|
example_prompts, max_tokens, NUM_LOG_PROBS)
|
||||||
|
|
||||||
check_logprobs_close(
|
check_logprobs_close(
|
||||||
outputs_0_lst=baseline_outputs,
|
outputs_0_lst=baseline_outputs,
|
||||||
outputs_1_lst=test_outputs,
|
outputs_1_lst=test_outputs,
|
||||||
name_0="bf16_kv_cache",
|
name_0="bf16_kv_cache",
|
||||||
name_1="fp8_kv_cache",
|
name_1="fp8_kv_cache",
|
||||||
)
|
)
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
import importlib.util
|
import importlib.util
|
||||||
import math
|
import math
|
||||||
@ -11,6 +12,7 @@ from scipy.spatial.distance import cosine
|
|||||||
|
|
||||||
import vllm
|
import vllm
|
||||||
import vllm.config
|
import vllm.config
|
||||||
|
from vllm.utils import STR_BACKEND_ENV_VAR
|
||||||
|
|
||||||
from ....utils import RemoteOpenAIServer
|
from ....utils import RemoteOpenAIServer
|
||||||
|
|
||||||
@ -29,36 +31,34 @@ def _arr(arr):
|
|||||||
return array("i", arr)
|
return array("i", arr)
|
||||||
|
|
||||||
|
|
||||||
def test_find_array(monkeypatch):
|
def test_find_array(monkeypatch: pytest.MonkeyPatch):
|
||||||
# GritLM embedding implementation is only supported by XFormers backend.
|
# GritLM embedding implementation is only supported by XFormers backend.
|
||||||
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS")
|
with monkeypatch.context() as m:
|
||||||
|
m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")
|
||||||
|
|
||||||
from vllm.model_executor.models.gritlm import GritLMPooler
|
from vllm.model_executor.models.gritlm import GritLMPooler
|
||||||
|
|
||||||
# Create an LLM object to get the model config.
|
# Create an LLM object to get the model config.
|
||||||
llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN)
|
llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN)
|
||||||
pooler = GritLMPooler(model_config=llm.llm_engine.model_config)
|
pooler = GritLMPooler(model_config=llm.llm_engine.model_config)
|
||||||
|
|
||||||
arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
|
arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
|
||||||
|
|
||||||
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3
|
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3
|
||||||
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3
|
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3
|
||||||
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1
|
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1
|
||||||
assert pooler._find_array(arr, _arr([3, 5]), start_idx=0) == -1
|
assert pooler._find_array(arr, _arr([3, 5]), start_idx=0) == -1
|
||||||
|
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
pooler._find_array(arr, _arr([3, 4, 5]), start_idx=-1)
|
pooler._find_array(arr, _arr([3, 4, 5]), start_idx=-1)
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="module")
|
@pytest.fixture(scope="module")
|
||||||
def server_embedding():
|
def server_embedding():
|
||||||
# GritLM embedding implementation is only supported by XFormers backend.
|
# GritLM embedding implementation is only supported by XFormers backend.
|
||||||
with pytest.MonkeyPatch.context() as mp:
|
args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)]
|
||||||
mp.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS")
|
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||||
|
yield remote_server
|
||||||
args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)]
|
|
||||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
|
||||||
yield remote_server
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="module")
|
@pytest.fixture(scope="module")
|
||||||
@ -69,9 +69,12 @@ def server_generate():
|
|||||||
|
|
||||||
|
|
||||||
@pytest_asyncio.fixture
|
@pytest_asyncio.fixture
|
||||||
async def client_embedding(server_embedding: RemoteOpenAIServer):
|
async def client_embedding(monkeypatch: pytest.MonkeyPatch,
|
||||||
async with server_embedding.get_async_client() as async_client:
|
server_embedding: RemoteOpenAIServer):
|
||||||
yield async_client
|
with monkeypatch.context() as m:
|
||||||
|
m.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS")
|
||||||
|
async with server_embedding.get_async_client() as async_client:
|
||||||
|
yield async_client
|
||||||
|
|
||||||
|
|
||||||
@pytest_asyncio.fixture
|
@pytest_asyncio.fixture
|
||||||
@ -80,14 +83,20 @@ async def client_generate(server_generate: RemoteOpenAIServer):
|
|||||||
yield async_client
|
yield async_client
|
||||||
|
|
||||||
|
|
||||||
def run_llm_encode(llm: vllm.LLM, queries: list[str],
|
def run_llm_encode(
|
||||||
instruction: str) -> list[float]:
|
llm: vllm.LLM,
|
||||||
|
queries: list[str],
|
||||||
|
instruction: str,
|
||||||
|
) -> list[float]:
|
||||||
outputs = llm.encode([instruction + q for q in queries], )
|
outputs = llm.encode([instruction + q for q in queries], )
|
||||||
return [output.outputs.embedding for output in outputs]
|
return [output.outputs.embedding for output in outputs]
|
||||||
|
|
||||||
|
|
||||||
async def run_client_embeddings(client: vllm.LLM, queries: list[str],
|
async def run_client_embeddings(
|
||||||
instruction: str) -> list[float]:
|
client: vllm.LLM,
|
||||||
|
queries: list[str],
|
||||||
|
instruction: str,
|
||||||
|
) -> list[float]:
|
||||||
outputs = await client.embeddings.create(
|
outputs = await client.embeddings.create(
|
||||||
model=MODEL_NAME,
|
model=MODEL_NAME,
|
||||||
input=[instruction + q for q in queries],
|
input=[instruction + q for q in queries],
|
||||||
@ -106,7 +115,7 @@ def get_test_data():
|
|||||||
README.md in https://github.com/ContextualAI/gritlm
|
README.md in https://github.com/ContextualAI/gritlm
|
||||||
"""
|
"""
|
||||||
q_instruction = gritlm_instruction(
|
q_instruction = gritlm_instruction(
|
||||||
"Given a scientific paper title, retrieve the paper's abstract")
|
"Given a scientific paper title, retrieve the paper's abstract", )
|
||||||
queries = [
|
queries = [
|
||||||
"Bitcoin: A Peer-to-Peer Electronic Cash System",
|
"Bitcoin: A Peer-to-Peer Electronic Cash System",
|
||||||
"Generative Representational Instruction Tuning",
|
"Generative Representational Instruction Tuning",
|
||||||
@ -136,31 +145,32 @@ def validate_embed_output(q_rep: list[float], d_rep: list[float]):
|
|||||||
assert math.isclose(cosine_sim_q1_d1, 0.532, abs_tol=0.001)
|
assert math.isclose(cosine_sim_q1_d1, 0.532, abs_tol=0.001)
|
||||||
|
|
||||||
|
|
||||||
def test_gritlm_offline_embedding(monkeypatch):
|
def test_gritlm_offline_embedding(monkeypatch: pytest.MonkeyPatch):
|
||||||
# GritLM embedding implementation is only supported by XFormers backend.
|
# GritLM embedding implementation is only supported by XFormers backend.
|
||||||
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS")
|
with monkeypatch.context() as m:
|
||||||
|
m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")
|
||||||
|
|
||||||
queries, q_instruction, documents, d_instruction = get_test_data()
|
queries, q_instruction, documents, d_instruction = get_test_data()
|
||||||
|
|
||||||
llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN)
|
llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN)
|
||||||
|
|
||||||
d_rep = run_llm_encode(
|
d_rep = run_llm_encode(
|
||||||
llm,
|
llm,
|
||||||
documents,
|
documents,
|
||||||
d_instruction,
|
d_instruction,
|
||||||
)
|
)
|
||||||
q_rep = run_llm_encode(
|
q_rep = run_llm_encode(
|
||||||
llm,
|
llm,
|
||||||
queries,
|
queries,
|
||||||
q_instruction,
|
q_instruction,
|
||||||
)
|
)
|
||||||
|
|
||||||
validate_embed_output(q_rep, d_rep)
|
validate_embed_output(q_rep, d_rep)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_gritlm_api_server_embedding(
|
async def test_gritlm_api_server_embedding(
|
||||||
client_embedding: openai.AsyncOpenAI):
|
client_embedding: openai.AsyncOpenAI, ):
|
||||||
queries, q_instruction, documents, d_instruction = get_test_data()
|
queries, q_instruction, documents, d_instruction = get_test_data()
|
||||||
|
|
||||||
d_rep = await run_client_embeddings(
|
d_rep = await run_client_embeddings(
|
||||||
|
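The gritlm changes above also show the reverse situation: the module-scoped `server_embedding` fixture previously used `pytest.MonkeyPatch.context()` directly, because the `monkeypatch` fixture is function-scoped and cannot be requested from a module-scoped fixture. A sketch of that pattern with a placeholder server object:

```python
import os

import pytest


@pytest.fixture(scope="module")
def embedding_server():
    # The function-scoped `monkeypatch` fixture cannot be used here,
    # but the context-manager form works at any scope.
    with pytest.MonkeyPatch.context() as mp:
        mp.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS")
        # Placeholder for starting a real server with the env applied.
        server = {"backend": os.environ["VLLM_ATTENTION_BACKEND"]}
        yield server
    # The variable is restored once the last test in the module finishes.


def test_uses_server(embedding_server):
    assert embedding_server["backend"] == "XFORMERS"
```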
@ -1,7 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
import os
|
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
@ -11,76 +9,92 @@ from ..utils import fork_new_process_for_each_test
|
|||||||
|
|
||||||
|
|
||||||
@fork_new_process_for_each_test
|
@fork_new_process_for_each_test
|
||||||
def test_plugin(dummy_opt_path, monkeypatch):
|
def test_plugin(
|
||||||
|
monkeypatch: pytest.MonkeyPatch,
|
||||||
|
dummy_opt_path: str,
|
||||||
|
):
|
||||||
# V1 shuts down rather than raising an error here.
|
# V1 shuts down rather than raising an error here.
|
||||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
with monkeypatch.context() as m:
|
||||||
os.environ["VLLM_PLUGINS"] = ""
|
m.setenv("VLLM_USE_V1", "0")
|
||||||
with pytest.raises(Exception) as excinfo:
|
m.setenv("VLLM_PLUGINS", "")
|
||||||
LLM(model=dummy_opt_path, load_format="dummy")
|
|
||||||
error_msg = "has no vLLM implementation and " \
|
with pytest.raises(Exception) as excinfo:
|
||||||
"the Transformers implementation is not compatible with vLLM"
|
LLM(model=dummy_opt_path, load_format="dummy")
|
||||||
assert (error_msg in str(excinfo.value))
|
error_msg = "has no vLLM implementation and the Transformers implementation is not compatible with vLLM" # noqa: E501
|
||||||
|
assert (error_msg in str(excinfo.value))
|
||||||
|
|
||||||
|
|
||||||
@fork_new_process_for_each_test
|
@fork_new_process_for_each_test
|
||||||
def test_oot_registration_text_generation(dummy_opt_path):
|
def test_oot_registration_text_generation(
|
||||||
os.environ["VLLM_PLUGINS"] = "register_dummy_model"
|
monkeypatch: pytest.MonkeyPatch,
|
||||||
prompts = ["Hello, my name is", "The text does not matter"]
|
dummy_opt_path: str,
|
||||||
sampling_params = SamplingParams(temperature=0)
|
):
|
||||||
llm = LLM(model=dummy_opt_path, load_format="dummy")
|
with monkeypatch.context() as m:
|
||||||
first_token = llm.get_tokenizer().decode(0)
|
m.setenv("VLLM_PLUGINS", "register_dummy_model")
|
||||||
outputs = llm.generate(prompts, sampling_params)
|
prompts = ["Hello, my name is", "The text does not matter"]
|
||||||
|
sampling_params = SamplingParams(temperature=0)
|
||||||
|
llm = LLM(model=dummy_opt_path, load_format="dummy")
|
||||||
|
first_token = llm.get_tokenizer().decode(0)
|
||||||
|
outputs = llm.generate(prompts, sampling_params)
|
||||||
|
|
||||||
for output in outputs:
|
for output in outputs:
|
||||||
generated_text = output.outputs[0].text
|
generated_text = output.outputs[0].text
|
||||||
# make sure only the first token is generated
|
# make sure only the first token is generated
|
||||||
rest = generated_text.replace(first_token, "")
|
rest = generated_text.replace(first_token, "")
|
||||||
assert rest == ""
|
assert rest == ""
|
||||||
|
|
||||||
|
|
||||||
@fork_new_process_for_each_test
|
@fork_new_process_for_each_test
|
||||||
def test_oot_registration_embedding(dummy_gemma2_embedding_path):
|
def test_oot_registration_embedding(
|
||||||
os.environ["VLLM_PLUGINS"] = "register_dummy_model"
|
monkeypatch: pytest.MonkeyPatch,
|
||||||
prompts = ["Hello, my name is", "The text does not matter"]
|
dummy_gemma2_embedding_path: str,
|
||||||
llm = LLM(model=dummy_gemma2_embedding_path, load_format="dummy")
|
):
|
||||||
outputs = llm.embed(prompts)
|
with monkeypatch.context() as m:
|
||||||
|
m.setenv("VLLM_PLUGINS", "register_dummy_model")
|
||||||
|
prompts = ["Hello, my name is", "The text does not matter"]
|
||||||
|
llm = LLM(model=dummy_gemma2_embedding_path, load_format="dummy")
|
||||||
|
outputs = llm.embed(prompts)
|
||||||
|
|
||||||
for output in outputs:
|
for output in outputs:
|
||||||
assert all(v == 0 for v in output.outputs.embedding)
|
assert all(v == 0 for v in output.outputs.embedding)
|
||||||
|
|
||||||
|
|
||||||
image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
|
image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
|
||||||
|
|
||||||
|
|
||||||
@fork_new_process_for_each_test
|
@fork_new_process_for_each_test
|
||||||
def test_oot_registration_multimodal(dummy_llava_path, monkeypatch):
|
def test_oot_registration_multimodal(
|
||||||
os.environ["VLLM_PLUGINS"] = "register_dummy_model"
|
monkeypatch: pytest.MonkeyPatch,
|
||||||
prompts = [{
|
dummy_llava_path: str,
|
||||||
"prompt": "What's in the image?<image>",
|
):
|
||||||
"multi_modal_data": {
|
with monkeypatch.context() as m:
|
||||||
"image": image
|
m.setenv("VLLM_PLUGINS", "register_dummy_model")
|
||||||
},
|
prompts = [{
|
||||||
}, {
|
"prompt": "What's in the image?<image>",
|
||||||
"prompt": "Describe the image<image>",
|
"multi_modal_data": {
|
||||||
"multi_modal_data": {
|
"image": image
|
||||||
"image": image
|
},
|
||||||
},
|
}, {
|
||||||
}]
|
"prompt": "Describe the image<image>",
|
||||||
|
"multi_modal_data": {
|
||||||
|
"image": image
|
||||||
|
},
|
||||||
|
}]
|
||||||
|
|
||||||
sampling_params = SamplingParams(temperature=0)
|
sampling_params = SamplingParams(temperature=0)
|
||||||
llm = LLM(model=dummy_llava_path,
|
llm = LLM(model=dummy_llava_path,
|
||||||
load_format="dummy",
|
load_format="dummy",
|
||||||
max_num_seqs=1,
|
max_num_seqs=1,
|
||||||
trust_remote_code=True,
|
trust_remote_code=True,
|
||||||
gpu_memory_utilization=0.98,
|
gpu_memory_utilization=0.98,
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
enforce_eager=True,
|
enforce_eager=True,
|
||||||
limit_mm_per_prompt={"image": 1})
|
limit_mm_per_prompt={"image": 1})
|
||||||
first_token = llm.get_tokenizer().decode(0)
|
first_token = llm.get_tokenizer().decode(0)
|
||||||
outputs = llm.generate(prompts, sampling_params)
|
outputs = llm.generate(prompts, sampling_params)
|
||||||
|
|
||||||
for output in outputs:
|
for output in outputs:
|
||||||
generated_text = output.outputs[0].text
|
generated_text = output.outputs[0].text
|
||||||
# make sure only the first token is generated
|
# make sure only the first token is generated
|
||||||
rest = generated_text.replace(first_token, "")
|
rest = generated_text.replace(first_token, "")
|
||||||
assert rest == ""
|
assert rest == ""
|
||||||
|
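One detail from the plugin tests above: `m.setenv("VLLM_PLUGINS", "")` is not the same as removing the variable. An empty string still counts as "set", which the test uses to disable plugin loading, whereas `delenv` makes the variable disappear entirely. A small sketch of the distinction:

```python
import os

import pytest


def test_setenv_empty_vs_delenv(monkeypatch: pytest.MonkeyPatch):
    with monkeypatch.context() as m:
        m.setenv("VLLM_PLUGINS", "")
        # Present but empty: an explicit "load nothing" signal.
        assert os.environ.get("VLLM_PLUGINS") == ""

        m.delenv("VLLM_PLUGINS", raising=False)
        # Absent: the code falls back to its default plugin discovery.
        assert "VLLM_PLUGINS" not in os.environ
```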
@ -235,25 +235,28 @@ async def test_bad_request(tmp_socket):
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_mp_crash_detection(monkeypatch):
|
async def test_mp_crash_detection(monkeypatch: pytest.MonkeyPatch):
|
||||||
|
with monkeypatch.context() as m:
|
||||||
|
|
||||||
parser = FlexibleArgumentParser(description="vLLM's remote OpenAI server.")
|
parser = FlexibleArgumentParser(
|
||||||
parser = make_arg_parser(parser)
|
description="vLLM's remote OpenAI server.")
|
||||||
args = parser.parse_args([])
|
parser = make_arg_parser(parser)
|
||||||
|
args = parser.parse_args([])
|
||||||
|
|
||||||
# When LLMEngine is loaded, it will crash.
|
# When LLMEngine is loaded, it will crash.
|
||||||
def mock_init():
|
def mock_init():
|
||||||
raise ValueError
|
raise ValueError
|
||||||
|
|
||||||
monkeypatch.setattr(LLMEngine, "__init__", mock_init)
|
m.setattr(LLMEngine, "__init__", mock_init)
|
||||||
|
|
||||||
start = time.perf_counter()
|
start = time.perf_counter()
|
||||||
async with build_async_engine_client(args):
|
async with build_async_engine_client(args):
|
||||||
pass
|
pass
|
||||||
end = time.perf_counter()
|
end = time.perf_counter()
|
||||||
|
|
||||||
assert end - start < 60, ("Expected vLLM to gracefully shutdown in <60s "
|
assert end - start < 60, (
|
||||||
"if there is an error in the startup.")
|
"Expected vLLM to gracefully shutdown in <60s "
|
||||||
|
"if there is an error in the startup.")
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
|
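In the crash-detection test above, `m.setattr(LLMEngine, "__init__", mock_init)` replaces a constructor so that engine startup fails deterministically. The same idea on a toy class, with the replacement written to accept the usual `self` argument:

```python
import pytest


class Engine:
    """Toy stand-in for an engine class; illustration only."""

    def __init__(self, model: str) -> None:
        self.model = model


def test_forced_constructor_failure(monkeypatch: pytest.MonkeyPatch):
    def failing_init(self, *args, **kwargs):
        raise ValueError("simulated startup crash")

    with monkeypatch.context() as m:
        m.setattr(Engine, "__init__", failing_init)

        with pytest.raises(ValueError, match="simulated startup crash"):
            Engine("facebook/opt-125m")

    # Outside the context the original constructor works again.
    assert Engine("facebook/opt-125m").model == "facebook/opt-125m"
```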
@ -5,7 +5,7 @@ from typing import Optional
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from tests.kernels.utils import override_backend_env_variable
|
from vllm.utils import STR_BACKEND_ENV_VAR
|
||||||
|
|
||||||
from ..models.utils import check_logprobs_close
|
from ..models.utils import check_logprobs_close
|
||||||
from ..utils import (completions_with_server_args, get_client_text_generations,
|
from ..utils import (completions_with_server_args, get_client_text_generations,
|
||||||
@ -52,7 +52,7 @@ async def test_multi_step(
|
|||||||
num_logprobs: Optional[int],
|
num_logprobs: Optional[int],
|
||||||
attention_backend: str,
|
attention_backend: str,
|
||||||
enable_chunked_prefill: bool,
|
enable_chunked_prefill: bool,
|
||||||
monkeypatch,
|
monkeypatch: pytest.MonkeyPatch,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Test vLLM engine with multi-step scheduling in an OpenAI-protocol
|
"""Test vLLM engine with multi-step scheduling in an OpenAI-protocol
|
||||||
client/server environment.
|
client/server environment.
|
||||||
@ -82,67 +82,70 @@ async def test_multi_step(
|
|||||||
pytest.skip("Multi-step with Chunked-Prefill only supports"
|
pytest.skip("Multi-step with Chunked-Prefill only supports"
|
||||||
"PP=1 and FLASH_ATTN backend")
|
"PP=1 and FLASH_ATTN backend")
|
||||||
|
|
||||||
override_backend_env_variable(monkeypatch, attention_backend)
|
with monkeypatch.context() as m:
|
||||||
|
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
|
||||||
|
|
||||||
prompts = example_prompts
|
prompts = example_prompts
|
||||||
if len(prompts) < num_prompts:
|
if len(prompts) < num_prompts:
|
||||||
prompts = prompts * ((num_prompts // len(prompts)) + 1)
|
prompts = prompts * ((num_prompts // len(prompts)) + 1)
|
||||||
prompts = prompts[:num_prompts]
|
prompts = prompts[:num_prompts]
|
||||||
assert len(prompts) == num_prompts
|
assert len(prompts) == num_prompts
|
||||||
|
|
||||||
server_args = DEFAULT_SERVER_ARGS + ["--enforce-eager"]
|
server_args = DEFAULT_SERVER_ARGS + ["--enforce-eager"]
|
||||||
ms_server_args = DEFAULT_SERVER_ARGS + \
|
ms_server_args = DEFAULT_SERVER_ARGS + \
|
||||||
["--num-scheduler-steps", f"{num_scheduler_steps}"]
|
["--num-scheduler-steps", f"{num_scheduler_steps}"]
|
||||||
|
|
||||||
if not is_async:
|
if not is_async:
|
||||||
ms_server_args += ["--disable-async-output-proc"]
|
ms_server_args += ["--disable-async-output-proc"]
|
||||||
|
|
||||||
if eager_mode:
|
if eager_mode:
|
||||||
ms_server_args.append("--enforce-eager")
|
ms_server_args.append("--enforce-eager")
|
||||||
|
|
||||||
if enable_chunked_prefill:
|
if enable_chunked_prefill:
|
||||||
ms_server_args.append("--enable-chunked-prefill")
|
ms_server_args.append("--enable-chunked-prefill")
|
||||||
|
|
||||||
distributed_args = [
|
distributed_args = [
|
||||||
"--tensor-parallel-size",
|
"--tensor-parallel-size",
|
||||||
str(tp_size),
|
str(tp_size),
|
||||||
"--pipeline-parallel-size",
|
"--pipeline-parallel-size",
|
||||||
str(pp_size),
|
str(pp_size),
|
||||||
]
|
]
|
||||||
|
|
||||||
# Spin up client/server & issue completion API requests.
|
# Spin up client/server & issue completion API requests.
|
||||||
# Default `max_wait_seconds` is 240, but it was empirically
|
# Default `max_wait_seconds` is 240, but it was empirically
|
||||||
# raised 5x to 1200 *just for this test* due to
|
# raised 5x to 1200 *just for this test* due to
|
||||||
# observed timeouts in GHA CI
|
# observed timeouts in GHA CI
|
||||||
ref_completions = await completions_with_server_args(
|
ref_completions = await completions_with_server_args(
|
||||||
prompts,
|
prompts,
|
||||||
model,
|
model,
|
||||||
server_args + distributed_args,
|
server_args + distributed_args,
|
||||||
num_logprobs,
|
num_logprobs,
|
||||||
max_wait_seconds=5 * 240)
|
max_wait_seconds=5 * 240)
|
||||||
test_completions = await completions_with_server_args(
|
test_completions = await completions_with_server_args(
|
||||||
prompts,
|
prompts,
|
||||||
model,
|
model,
|
||||||
ms_server_args + distributed_args,
|
ms_server_args + distributed_args,
|
||||||
num_logprobs,
|
num_logprobs,
|
||||||
max_wait_seconds=5 * 240)
|
max_wait_seconds=5 * 240)
|
||||||
|
|
||||||
# Assert multi-step scheduling produces identical tokens
|
# Assert multi-step scheduling produces identical tokens
|
||||||
# to single-step scheduling.
|
# to single-step scheduling.
|
||||||
ref_generations = get_client_text_generations(ref_completions)
|
ref_generations = get_client_text_generations(ref_completions)
|
||||||
test_generations = get_client_text_generations(test_completions)
|
test_generations = get_client_text_generations(test_completions)
|
||||||
assert ref_generations == test_generations
|
assert ref_generations == test_generations
|
||||||
|
|
||||||
# Assert multi-step scheduling produces nearly-identical logprobs
|
# Assert multi-step scheduling produces nearly-identical logprobs
|
||||||
# to single-step scheduling.
|
# to single-step scheduling.
|
||||||
ref_text_logprobs = get_client_text_logprob_generations(ref_completions)
|
ref_text_logprobs = get_client_text_logprob_generations(
|
||||||
test_text_logprobs = get_client_text_logprob_generations(test_completions)
|
ref_completions)
|
||||||
check_logprobs_close(
|
test_text_logprobs = get_client_text_logprob_generations(
|
||||||
outputs_0_lst=ref_text_logprobs,
|
test_completions)
|
||||||
outputs_1_lst=test_text_logprobs,
|
check_logprobs_close(
|
||||||
name_0="hf",
|
outputs_0_lst=ref_text_logprobs,
|
||||||
name_1="vllm",
|
outputs_1_lst=test_text_logprobs,
|
||||||
)
|
name_0="hf",
|
||||||
|
name_1="vllm",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
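Several files in this commit, including the multi-step test above, also replace the `override_backend_env_variable(monkeypatch, backend)` helper with a direct `m.setenv(STR_BACKEND_ENV_VAR, backend)`. Judging from the gritlm hunk earlier, `STR_BACKEND_ENV_VAR` names the `VLLM_ATTENTION_BACKEND` variable, so the two spellings in the sketch below should be equivalent; the constant's value here is inferred from the diff rather than restated from the source:

```python
import os

import pytest

# Inferred from the surrounding diff; in vLLM this constant is exposed
# from vllm.utils as STR_BACKEND_ENV_VAR.
STR_BACKEND_ENV_VAR = "VLLM_ATTENTION_BACKEND"


@pytest.mark.parametrize("attention_backend", ["FLASH_ATTN"])
def test_backend_env_spellings(monkeypatch: pytest.MonkeyPatch,
                               attention_backend: str):
    with monkeypatch.context() as m:
        m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
        assert os.environ["VLLM_ATTENTION_BACKEND"] == attention_backend
```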
||||||
@pytest.mark.parametrize(("tp_size, pp_size"), [
|
@pytest.mark.parametrize(("tp_size, pp_size"), [
|
||||||
@ -152,7 +155,7 @@ async def test_multi_step(
|
|||||||
async def test_multi_step_pp_smoke(
|
async def test_multi_step_pp_smoke(
|
||||||
tp_size: int,
|
tp_size: int,
|
||||||
pp_size: int,
|
pp_size: int,
|
||||||
monkeypatch,
|
monkeypatch: pytest.MonkeyPatch,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""
|
"""
|
||||||
Smoke test for the vLLM engine with multi-step scheduling in an
|
Smoke test for the vLLM engine with multi-step scheduling in an
|
||||||
@ -174,54 +177,55 @@ async def test_multi_step_pp_smoke(
|
|||||||
attention_backend = "FLASH_ATTN"
|
attention_backend = "FLASH_ATTN"
|
||||||
max_num_seqs = 3
|
max_num_seqs = 3
|
||||||
|
|
||||||
override_backend_env_variable(monkeypatch, attention_backend)
|
with monkeypatch.context() as m:
|
||||||
|
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
|
||||||
|
|
||||||
# Prompt from the ShareGPT dataset
|
# Prompt from the ShareGPT dataset
|
||||||
prompts = [
|
prompts = [
|
||||||
"in the jtbd context whats a push?", # codespell:ignore
|
"in the jtbd context whats a push?", # codespell:ignore
|
||||||
"in the jtbd context whats a push?", # codespell:ignore
|
"in the jtbd context whats a push?", # codespell:ignore
|
||||||
"in the jtbd context whats a push?", # codespell:ignore
|
"in the jtbd context whats a push?", # codespell:ignore
|
||||||
"in the jtbd context whats a push?", # codespell:ignore
|
"in the jtbd context whats a push?", # codespell:ignore
|
||||||
]
|
]
|
||||||
# Use varying max_tokens to introduce scheduling randomness.
|
# Use varying max_tokens to introduce scheduling randomness.
|
||||||
max_tokens = [10 * i for i in range(1, len(prompts) + 1)]
|
max_tokens = [10 * i for i in range(1, len(prompts) + 1)]
|
||||||
assert len(prompts) == len(max_tokens)
|
assert len(prompts) == len(max_tokens)
|
||||||
|
|
||||||
test_args = [
|
test_args = [
|
||||||
"--tensor-parallel-size",
|
"--tensor-parallel-size",
|
||||||
str(tp_size), "--pipeline-parallel-size",
|
str(tp_size), "--pipeline-parallel-size",
|
||||||
str(pp_size), "--max-num-seqs",
|
str(pp_size), "--max-num-seqs",
|
||||||
str(max_num_seqs)
|
str(max_num_seqs)
|
||||||
]
|
]
|
||||||
|
|
||||||
server_args = DEFAULT_SERVER_ARGS + test_args
|
server_args = DEFAULT_SERVER_ARGS + test_args
|
||||||
ms_server_args = DEFAULT_SERVER_ARGS + \
|
ms_server_args = DEFAULT_SERVER_ARGS + \
|
||||||
["--num-scheduler-steps", f"{num_scheduler_steps}"] + \
|
["--num-scheduler-steps", f"{num_scheduler_steps}"] + \
|
||||||
test_args
|
test_args
|
||||||
|
|
||||||
# Spin up client/server & issue completion API requests.
|
# Spin up client/server & issue completion API requests.
|
||||||
# Default `max_wait_seconds` is 240, but it was empirically
|
# Default `max_wait_seconds` is 240, but it was empirically
|
||||||
# raised 3x to 720 *just for this test* due to
|
# raised 3x to 720 *just for this test* due to
|
||||||
# observed timeouts in GHA CI
|
# observed timeouts in GHA CI
|
||||||
ref_completions = await completions_with_server_args(
|
ref_completions = await completions_with_server_args(
|
||||||
prompts=prompts,
|
prompts=prompts,
|
||||||
model_name=model,
|
model_name=model,
|
||||||
server_cli_args=server_args,
|
server_cli_args=server_args,
|
||||||
num_logprobs=None,
|
num_logprobs=None,
|
||||||
max_wait_seconds=5 * 240,
|
max_wait_seconds=5 * 240,
|
||||||
max_tokens=max_tokens)
|
max_tokens=max_tokens)
|
||||||
|
|
||||||
test_completions = await completions_with_server_args(
|
test_completions = await completions_with_server_args(
|
||||||
prompts=prompts,
|
prompts=prompts,
|
||||||
model_name=model,
|
model_name=model,
|
||||||
server_cli_args=ms_server_args,
|
server_cli_args=ms_server_args,
|
||||||
num_logprobs=None,
|
num_logprobs=None,
|
||||||
max_wait_seconds=5 * 240,
|
max_wait_seconds=5 * 240,
|
||||||
max_tokens=max_tokens)
|
max_tokens=max_tokens)
|
||||||
|
|
||||||
# Assert multi-step scheduling produces identical tokens
|
# Assert multi-step scheduling produces identical tokens
|
||||||
# to single-step scheduling.
|
# to single-step scheduling.
|
||||||
ref_generations = get_client_text_generations(ref_completions)
|
ref_generations = get_client_text_generations(ref_completions)
|
||||||
test_generations = get_client_text_generations(test_completions)
|
test_generations = get_client_text_generations(test_completions)
|
||||||
|
|
||||||
assert ref_generations == test_generations
|
assert ref_generations == test_generations
|
||||||
|
@ -7,7 +7,7 @@ from typing import Optional
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from tests.kernels.utils import override_backend_env_variable
|
from vllm.utils import STR_BACKEND_ENV_VAR
|
||||||
|
|
||||||
from ..models.utils import check_logprobs_close, check_outputs_equal
|
from ..models.utils import check_logprobs_close, check_outputs_equal
|
||||||
|
|
||||||
@ -42,7 +42,7 @@ def test_multi_step_llm(
|
|||||||
num_prompts: int,
|
num_prompts: int,
|
||||||
num_logprobs: Optional[int],
|
num_logprobs: Optional[int],
|
||||||
attention_backend: str,
|
attention_backend: str,
|
||||||
monkeypatch,
|
monkeypatch: pytest.MonkeyPatch,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Test vLLM engine with multi-step scheduling via sync LLM Engine.
|
"""Test vLLM engine with multi-step scheduling via sync LLM Engine.
|
||||||
|
|
||||||
@ -70,48 +70,49 @@ def test_multi_step_llm(
|
|||||||
num_logprobs: corresponds to the `logprobs` argument to the OpenAI
|
num_logprobs: corresponds to the `logprobs` argument to the OpenAI
|
||||||
completions endpoint; `None` -> 1 logprob returned.
|
completions endpoint; `None` -> 1 logprob returned.
|
||||||
"""
|
"""
|
||||||
override_backend_env_variable(monkeypatch, attention_backend)
|
with monkeypatch.context() as m:
|
||||||
|
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
|
||||||
|
|
||||||
prompts = example_prompts
|
prompts = example_prompts
|
||||||
if len(prompts) < num_prompts:
|
if len(prompts) < num_prompts:
|
||||||
prompts = prompts * ((num_prompts // len(prompts)) + 1)
|
prompts = prompts * ((num_prompts // len(prompts)) + 1)
|
||||||
prompts = prompts[:num_prompts]
|
prompts = prompts[:num_prompts]
|
||||||
assert len(prompts) == num_prompts
|
assert len(prompts) == num_prompts
|
||||||
|
|
||||||
with vllm_runner(
|
with vllm_runner(
|
||||||
model,
|
model,
|
||||||
dtype=dtype,
|
dtype=dtype,
|
||||||
enforce_eager=enforce_eager,
|
enforce_eager=enforce_eager,
|
||||||
gpu_memory_utilization=0.7,
|
gpu_memory_utilization=0.7,
|
||||||
tensor_parallel_size=tp_size,
|
tensor_parallel_size=tp_size,
|
||||||
enable_chunked_prefill=enable_chunked_prefill,
|
enable_chunked_prefill=enable_chunked_prefill,
|
||||||
num_scheduler_steps=num_scheduler_steps,
|
num_scheduler_steps=num_scheduler_steps,
|
||||||
) as vllm_model:
|
) as vllm_model:
|
||||||
vllm_outputs = (vllm_model.generate_greedy(prompts, max_tokens)
|
vllm_outputs = (vllm_model.generate_greedy(prompts, max_tokens)
|
||||||
if num_logprobs is None else
|
if num_logprobs is None else
|
||||||
vllm_model.generate_greedy_logprobs(
|
vllm_model.generate_greedy_logprobs(
|
||||||
prompts, max_tokens, num_logprobs))
|
prompts, max_tokens, num_logprobs))
|
||||||
|
|
||||||
with hf_runner(model, dtype=dtype) as hf_model:
|
with hf_runner(model, dtype=dtype) as hf_model:
|
||||||
hf_outputs = (hf_model.generate_greedy(prompts, max_tokens)
|
hf_outputs = (hf_model.generate_greedy(prompts, max_tokens)
|
||||||
if num_logprobs is None else
|
if num_logprobs is None else
|
||||||
hf_model.generate_greedy_logprobs_limit(
|
hf_model.generate_greedy_logprobs_limit(
|
||||||
prompts, max_tokens, num_logprobs))
|
prompts, max_tokens, num_logprobs))
|
||||||
|
|
||||||
if num_logprobs is None:
|
if num_logprobs is None:
|
||||||
check_outputs_equal(
|
check_outputs_equal(
|
||||||
outputs_0_lst=hf_outputs,
|
outputs_0_lst=hf_outputs,
|
||||||
outputs_1_lst=vllm_outputs,
|
outputs_1_lst=vllm_outputs,
|
||||||
name_0="hf",
|
name_0="hf",
|
||||||
name_1="vllm",
|
name_1="vllm",
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
check_logprobs_close(
|
check_logprobs_close(
|
||||||
outputs_0_lst=hf_outputs,
|
outputs_0_lst=hf_outputs,
|
||||||
outputs_1_lst=vllm_outputs,
|
outputs_1_lst=vllm_outputs,
|
||||||
name_0="hf",
|
name_0="hf",
|
||||||
name_1="vllm",
|
name_1="vllm",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
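All of the hunks above apply the same conversion: a direct write to os.environ (or the override_backend_env_variable helper) becomes a pytest.MonkeyPatch context, and the test body is indented under it so the variable is restored when the block exits. A condensed sketch of the pattern, using a placeholder helper and environment variable rather than real vLLM names:

import os

import pytest


def run_backend_test() -> None:
    # Placeholder for the real test body; it only reads the variable.
    assert os.environ.get("SOME_BACKEND_VAR") == "FLASH_ATTN"


def test_backend_old_style():
    # Old pattern: the assignment leaks into every test that runs later.
    os.environ["SOME_BACKEND_VAR"] = "FLASH_ATTN"
    run_backend_test()


def test_backend_new_style(monkeypatch: pytest.MonkeyPatch):
    # New pattern: the variable is restored when the context exits,
    # even if the test fails partway through.
    with monkeypatch.context() as m:
        m.setenv("SOME_BACKEND_VAR", "FLASH_ATTN")
        run_backend_test()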
@pytest.mark.parametrize("model", MODELS)
|
@pytest.mark.parametrize("model", MODELS)
|
||||||
@ -136,7 +137,7 @@ def test_multi_step_llm_w_prompt_logprobs(
|
|||||||
num_logprobs: Optional[int],
|
num_logprobs: Optional[int],
|
||||||
num_prompt_logprobs: Optional[int],
|
num_prompt_logprobs: Optional[int],
|
||||||
attention_backend: str,
|
attention_backend: str,
|
||||||
monkeypatch,
|
monkeypatch: pytest.MonkeyPatch,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Test prompt logprobs with multi-step scheduling via sync LLM Engine.
|
"""Test prompt logprobs with multi-step scheduling via sync LLM Engine.
|
||||||
|
|
||||||
@ -166,47 +167,48 @@ def test_multi_step_llm_w_prompt_logprobs(
|
|||||||
note that this argument is not supported by the
|
note that this argument is not supported by the
|
||||||
OpenAI completions endpoint.
|
OpenAI completions endpoint.
|
||||||
"""
|
"""
|
||||||
override_backend_env_variable(monkeypatch, attention_backend)
|
with monkeypatch.context() as m:
|
||||||
|
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
|
||||||
|
|
||||||
prompts = example_prompts
|
prompts = example_prompts
|
||||||
if len(prompts) < num_prompts:
|
if len(prompts) < num_prompts:
|
||||||
prompts = prompts * ((num_prompts // len(prompts)) + 1)
|
prompts = prompts * ((num_prompts // len(prompts)) + 1)
|
||||||
prompts = prompts[:num_prompts]
|
prompts = prompts[:num_prompts]
|
||||||
assert len(prompts) == num_prompts
|
assert len(prompts) == num_prompts
|
||||||
|
|
||||||
with vllm_runner(
|
with vllm_runner(
|
||||||
model,
|
model,
|
||||||
dtype=dtype,
|
dtype=dtype,
|
||||||
enforce_eager=enforce_eager,
|
enforce_eager=enforce_eager,
|
||||||
gpu_memory_utilization=0.7,
|
gpu_memory_utilization=0.7,
|
||||||
tensor_parallel_size=tp_size,
|
tensor_parallel_size=tp_size,
|
||||||
num_scheduler_steps=num_scheduler_steps,
|
num_scheduler_steps=num_scheduler_steps,
|
||||||
) as vllm_model:
|
) as vllm_model:
|
||||||
vllm_outputs = vllm_model.generate_greedy_logprobs(
|
vllm_outputs = vllm_model.generate_greedy_logprobs(
|
||||||
prompts,
|
prompts,
|
||||||
max_tokens,
|
max_tokens,
|
||||||
num_logprobs,
|
num_logprobs,
|
||||||
num_prompt_logprobs=num_prompt_logprobs)
|
num_prompt_logprobs=num_prompt_logprobs)
|
||||||
|
|
||||||
with vllm_runner(
|
with vllm_runner(
|
||||||
model,
|
model,
|
||||||
dtype=dtype,
|
dtype=dtype,
|
||||||
enforce_eager=enforce_eager,
|
enforce_eager=enforce_eager,
|
||||||
gpu_memory_utilization=0.7,
|
gpu_memory_utilization=0.7,
|
||||||
tensor_parallel_size=tp_size,
|
tensor_parallel_size=tp_size,
|
||||||
) as vllm_model:
|
) as vllm_model:
|
||||||
single_step_vllm_outputs = vllm_model.generate_greedy_logprobs(
|
single_step_vllm_outputs = vllm_model.generate_greedy_logprobs(
|
||||||
prompts,
|
prompts,
|
||||||
max_tokens,
|
max_tokens,
|
||||||
num_logprobs,
|
num_logprobs,
|
||||||
num_prompt_logprobs=num_prompt_logprobs)
|
num_prompt_logprobs=num_prompt_logprobs)
|
||||||
|
|
||||||
check_logprobs_close(
|
check_logprobs_close(
|
||||||
outputs_0_lst=single_step_vllm_outputs,
|
outputs_0_lst=single_step_vllm_outputs,
|
||||||
outputs_1_lst=vllm_outputs,
|
outputs_1_lst=vllm_outputs,
|
||||||
name_0="hf",
|
name_0="hf",
|
||||||
name_1="vllm",
|
name_1="vllm",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model", MODELS)
|
@pytest.mark.parametrize("model", MODELS)
|
||||||
@ -230,7 +232,7 @@ def test_multi_step_llm_chunked_prefill_prefix_cache(
|
|||||||
num_prompts: int,
|
num_prompts: int,
|
||||||
num_logprobs: Optional[int],
|
num_logprobs: Optional[int],
|
||||||
attention_backend: str,
|
attention_backend: str,
|
||||||
monkeypatch,
|
monkeypatch: pytest.MonkeyPatch,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Test vLLM engine with multi-step+"single-step chunked prefill"+APC.
|
"""Test vLLM engine with multi-step+"single-step chunked prefill"+APC.
|
||||||
|
|
||||||
@ -293,77 +295,78 @@ def test_multi_step_llm_chunked_prefill_prefix_cache(
|
|||||||
#
|
#
|
||||||
# The incorrect scheduling behavior - if it occurs - will cause an exception
|
# The incorrect scheduling behavior - if it occurs - will cause an exception
|
||||||
# in the model runner resulting from `do_sample=False`.
|
# in the model runner resulting from `do_sample=False`.
|
||||||
override_backend_env_variable(monkeypatch, attention_backend)
|
with monkeypatch.context() as m:
|
||||||
|
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
|
||||||
|
|
||||||
assert len(example_prompts) >= 2
|
assert len(example_prompts) >= 2
|
||||||
challenge_prompts = copy.deepcopy(example_prompts)
|
challenge_prompts = copy.deepcopy(example_prompts)
|
||||||
challenge_prompts[0] = ('vLLM is a high-throughput and memory-efficient '
|
challenge_prompts[0] = (
|
||||||
'inference and serving engine for LLMs.\n'
|
'vLLM is a high-throughput and memory-efficient '
|
||||||
) # 24 tok
|
'inference and serving engine for LLMs.\n') # 24 tok
|
||||||
challenge_prompts[1] = (
|
challenge_prompts[1] = (
|
||||||
'Briefly describe the major milestones in the '
|
'Briefly describe the major milestones in the '
|
||||||
'development of artificial intelligence from 1950 to 2020.\n'
|
'development of artificial intelligence from 1950 to 2020.\n'
|
||||||
) # 30 tok
|
) # 30 tok
|
||||||
|
|
||||||
# If necessary, adjust the length of `challenge_prompts` to match
|
# If necessary, adjust the length of `challenge_prompts` to match
|
||||||
# `num_prompts`
|
# `num_prompts`
|
||||||
if len(challenge_prompts) < num_prompts:
|
if len(challenge_prompts) < num_prompts:
|
||||||
challenge_prompts = (challenge_prompts *
|
challenge_prompts = (challenge_prompts *
|
||||||
((num_prompts // len(challenge_prompts)) + 1))
|
((num_prompts // len(challenge_prompts)) + 1))
|
||||||
challenge_prompts = challenge_prompts[:num_prompts]
|
challenge_prompts = challenge_prompts[:num_prompts]
|
||||||
assert len(challenge_prompts) == num_prompts
|
assert len(challenge_prompts) == num_prompts
|
||||||
|
|
||||||
# Single-step scheduler baseline
|
# Single-step scheduler baseline
|
||||||
with vllm_runner(
|
with vllm_runner(
|
||||||
model,
|
model,
|
||||||
dtype=dtype,
|
dtype=dtype,
|
||||||
enforce_eager=enforce_eager,
|
enforce_eager=enforce_eager,
|
||||||
gpu_memory_utilization=0.7,
|
gpu_memory_utilization=0.7,
|
||||||
tensor_parallel_size=tp_size,
|
tensor_parallel_size=tp_size,
|
||||||
num_scheduler_steps=num_scheduler_steps,
|
num_scheduler_steps=num_scheduler_steps,
|
||||||
max_model_len=48,
|
max_model_len=48,
|
||||||
max_num_batched_tokens=48,
|
max_num_batched_tokens=48,
|
||||||
max_num_seqs=4,
|
max_num_seqs=4,
|
||||||
block_size=16,
|
block_size=16,
|
||||||
) as vllm_model:
|
) as vllm_model:
|
||||||
outputs_baseline = (vllm_model.generate_greedy(
|
outputs_baseline = (
|
||||||
challenge_prompts, max_tokens) if num_logprobs is None else
|
vllm_model.generate_greedy(challenge_prompts, max_tokens) if
|
||||||
vllm_model.generate_greedy_logprobs(
|
num_logprobs is None else vllm_model.generate_greedy_logprobs(
|
||||||
challenge_prompts, max_tokens, num_logprobs))
|
challenge_prompts, max_tokens, num_logprobs))
|
||||||
|
|
||||||
# multi-step+"single-step chunked prefill"+APC
|
# multi-step+"single-step chunked prefill"+APC
|
||||||
with vllm_runner(
|
with vllm_runner(
|
||||||
model,
|
model,
|
||||||
dtype=dtype,
|
dtype=dtype,
|
||||||
enforce_eager=enforce_eager,
|
enforce_eager=enforce_eager,
|
||||||
gpu_memory_utilization=0.7,
|
gpu_memory_utilization=0.7,
|
||||||
tensor_parallel_size=tp_size,
|
tensor_parallel_size=tp_size,
|
||||||
enable_chunked_prefill=True,
|
enable_chunked_prefill=True,
|
||||||
enable_prefix_caching=True,
|
enable_prefix_caching=True,
|
||||||
num_scheduler_steps=num_scheduler_steps,
|
num_scheduler_steps=num_scheduler_steps,
|
||||||
max_model_len=48,
|
max_model_len=48,
|
||||||
max_num_batched_tokens=48,
|
max_num_batched_tokens=48,
|
||||||
max_num_seqs=4,
|
max_num_seqs=4,
|
||||||
block_size=16,
|
block_size=16,
|
||||||
) as vllm_model:
|
) as vllm_model:
|
||||||
outputs_w_features = (vllm_model.generate_greedy(
|
outputs_w_features = (
|
||||||
challenge_prompts, max_tokens) if num_logprobs is None else
|
vllm_model.generate_greedy(challenge_prompts, max_tokens) if
|
||||||
vllm_model.generate_greedy_logprobs(
|
num_logprobs is None else vllm_model.generate_greedy_logprobs(
|
||||||
challenge_prompts, max_tokens, num_logprobs))
|
challenge_prompts, max_tokens, num_logprobs))
|
||||||
|
|
||||||
if num_logprobs is None:
|
if num_logprobs is None:
|
||||||
# No-logprobs test
|
# No-logprobs test
|
||||||
check_outputs_equal(
|
check_outputs_equal(
|
||||||
outputs_0_lst=outputs_baseline,
|
outputs_0_lst=outputs_baseline,
|
||||||
outputs_1_lst=outputs_w_features,
|
outputs_1_lst=outputs_w_features,
|
||||||
name_0="multi-step",
|
name_0="multi-step",
|
||||||
name_1="multi-step+features",
|
name_1="multi-step+features",
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
# Yes-logprobs test
|
# Yes-logprobs test
|
||||||
check_logprobs_close(
|
check_logprobs_close(
|
||||||
outputs_0_lst=outputs_baseline,
|
outputs_0_lst=outputs_baseline,
|
||||||
outputs_1_lst=outputs_w_features,
|
outputs_1_lst=outputs_w_features,
|
||||||
name_0="multi-step",
|
name_0="multi-step",
|
||||||
name_1="multi-step+features",
|
name_1="multi-step+features",
|
||||||
)
|
)
|
||||||
|
@ -1,5 +1,4 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
import os
|
|
||||||
|
|
||||||
import neuronxcc.nki.language as nl
|
import neuronxcc.nki.language as nl
|
||||||
import pytest
|
import pytest
|
||||||
@ -99,6 +98,7 @@ def ref_block_tables_transform(
|
|||||||
)
|
)
|
||||||
@torch.inference_mode()
|
@torch.inference_mode()
|
||||||
def test_load_and_transform_block_tables(
|
def test_load_and_transform_block_tables(
|
||||||
|
monkeypatch: pytest.MonkeyPatch,
|
||||||
num_tiles,
|
num_tiles,
|
||||||
num_blocks_per_tile,
|
num_blocks_per_tile,
|
||||||
q_head_per_kv_head,
|
q_head_per_kv_head,
|
||||||
@ -108,46 +108,46 @@ def test_load_and_transform_block_tables(
|
|||||||
|
|
||||||
device = xm.xla_device()
|
device = xm.xla_device()
|
||||||
|
|
||||||
compiler_flags = [
|
compiler_flags_str = " ".join([
|
||||||
"-O1",
|
"-O1",
|
||||||
"--retry_failed_compilation",
|
"--retry_failed_compilation",
|
||||||
]
|
])
|
||||||
compiler_flags_str = " ".join(compiler_flags)
|
with monkeypatch.context() as m:
|
||||||
os.environ["NEURON_CC_FLAGS"] = compiler_flags_str
|
m.setenv("NEURON_CC_FLAGS", compiler_flags_str)
|
||||||
|
|
||||||
torch.manual_seed(10000)
|
torch.manual_seed(10000)
|
||||||
torch.set_printoptions(sci_mode=False)
|
torch.set_printoptions(sci_mode=False)
|
||||||
|
|
||||||
# On Neuron, we need B_P_SIZE = 128 blocks to make DMA efficient
|
# On Neuron, we need B_P_SIZE = 128 blocks to make DMA efficient
|
||||||
B_P_SIZE = 128
|
B_P_SIZE = 128
|
||||||
if num_blocks_per_tile < B_P_SIZE:
|
if num_blocks_per_tile < B_P_SIZE:
|
||||||
assert B_P_SIZE % num_blocks_per_tile == 0
|
assert B_P_SIZE % num_blocks_per_tile == 0
|
||||||
block_size_tiling_factor = B_P_SIZE // num_blocks_per_tile
|
block_size_tiling_factor = B_P_SIZE // num_blocks_per_tile
|
||||||
else:
|
else:
|
||||||
block_size_tiling_factor = 1
|
block_size_tiling_factor = 1
|
||||||
max_num_blocks = 100000
|
max_num_blocks = 100000
|
||||||
block_tables = torch.randint(
|
block_tables = torch.randint(
|
||||||
0,
|
0,
|
||||||
max_num_blocks,
|
max_num_blocks,
|
||||||
(num_tiles * num_blocks_per_tile, ),
|
(num_tiles * num_blocks_per_tile, ),
|
||||||
dtype=torch.int32,
|
dtype=torch.int32,
|
||||||
)
|
)
|
||||||
nki_out = nki.jit(nki_load_and_transform_block_tables)[1, 1](
|
nki_out = nki.jit(nki_load_and_transform_block_tables)[1, 1](
|
||||||
block_tables.to(device=device),
|
block_tables.to(device=device),
|
||||||
num_tiles,
|
num_tiles,
|
||||||
num_blocks_per_tile,
|
num_blocks_per_tile,
|
||||||
q_head_per_kv_head,
|
q_head_per_kv_head,
|
||||||
head_id,
|
head_id,
|
||||||
block_size_tiling_factor,
|
block_size_tiling_factor,
|
||||||
).cpu()
|
).cpu()
|
||||||
ref_out = ref_block_tables_transform(
|
ref_out = ref_block_tables_transform(
|
||||||
block_tables,
|
block_tables,
|
||||||
num_tiles,
|
num_tiles,
|
||||||
num_blocks_per_tile,
|
num_blocks_per_tile,
|
||||||
q_head_per_kv_head,
|
q_head_per_kv_head,
|
||||||
head_id,
|
head_id,
|
||||||
block_size_tiling_factor,
|
block_size_tiling_factor,
|
||||||
)
|
)
|
||||||
assert (nki_out.shape == ref_out.shape
|
assert (nki_out.shape == ref_out.shape
|
||||||
), f"{nki_out.shape=} != {ref_out.shape=}"
|
), f"{nki_out.shape=} != {ref_out.shape=}"
|
||||||
assert torch.all(nki_out == ref_out)
|
assert torch.all(nki_out == ref_out)
|
||||||
|
@ -320,6 +320,7 @@ def get_active_block_tables(block_tables, query_lens, seq_lens, block_size,
|
|||||||
])
|
])
|
||||||
@torch.inference_mode()
|
@torch.inference_mode()
|
||||||
def test_contexted_kv_attention(
|
def test_contexted_kv_attention(
|
||||||
|
monkeypatch: pytest.MonkeyPatch,
|
||||||
prefill_batch_size: int,
|
prefill_batch_size: int,
|
||||||
decode_batch_size: int,
|
decode_batch_size: int,
|
||||||
num_heads: int,
|
num_heads: int,
|
||||||
@ -329,7 +330,6 @@ def test_contexted_kv_attention(
|
|||||||
large_tile_size,
|
large_tile_size,
|
||||||
mixed_precision: bool,
|
mixed_precision: bool,
|
||||||
) -> None:
|
) -> None:
|
||||||
import os
|
|
||||||
|
|
||||||
import torch_xla.core.xla_model as xm
|
import torch_xla.core.xla_model as xm
|
||||||
|
|
||||||
@ -340,174 +340,178 @@ def test_contexted_kv_attention(
|
|||||||
|
|
||||||
device = xm.xla_device()
|
device = xm.xla_device()
|
||||||
|
|
||||||
compiler_flags = [
|
compiler_flags_str = " ".join([
|
||||||
"-O1",
|
"-O1",
|
||||||
"--retry_failed_compilation",
|
"--retry_failed_compilation",
|
||||||
]
|
])
|
||||||
compiler_flags_str = " ".join(compiler_flags)
|
with monkeypatch.context() as m:
|
||||||
os.environ["NEURON_CC_FLAGS"] = compiler_flags_str
|
m.setenv("NEURON_CC_FLAGS", compiler_flags_str)
|
||||||
|
|
||||||
torch.manual_seed(0)
|
torch.manual_seed(0)
|
||||||
torch.set_printoptions(sci_mode=False)
|
torch.set_printoptions(sci_mode=False)
|
||||||
torch.set_default_device("cpu")
|
torch.set_default_device("cpu")
|
||||||
dtype = torch.float32
|
dtype = torch.float32
|
||||||
|
|
||||||
min_ctx_len = 32
|
min_ctx_len = 32
|
||||||
max_ctx_len = 1024
|
max_ctx_len = 1024
|
||||||
min_query_len = 16
|
min_query_len = 16
|
||||||
max_query_len = 512
|
max_query_len = 512
|
||||||
num_kv_heads = num_heads // num_queries_per_kv
|
num_kv_heads = num_heads // num_queries_per_kv
|
||||||
(
|
(
|
||||||
query,
|
query,
|
||||||
k_active,
|
k_active,
|
||||||
v_active,
|
v_active,
|
||||||
k_cache,
|
k_cache,
|
||||||
v_cache,
|
v_cache,
|
||||||
block_table,
|
block_table,
|
||||||
key,
|
key,
|
||||||
value,
|
value,
|
||||||
query_lens,
|
query_lens,
|
||||||
seq_lens,
|
seq_lens,
|
||||||
) = sample_inputs(
|
) = sample_inputs(
|
||||||
prefill_batch_size=prefill_batch_size,
|
prefill_batch_size=prefill_batch_size,
|
||||||
decode_batch_size=decode_batch_size,
|
decode_batch_size=decode_batch_size,
|
||||||
min_query_len=min_query_len,
|
min_query_len=min_query_len,
|
||||||
max_query_len=max_query_len,
|
max_query_len=max_query_len,
|
||||||
min_ctx_len=min_ctx_len,
|
min_ctx_len=min_ctx_len,
|
||||||
max_ctx_len=max_ctx_len,
|
max_ctx_len=max_ctx_len,
|
||||||
block_size=block_size,
|
block_size=block_size,
|
||||||
num_heads=num_heads,
|
num_heads=num_heads,
|
||||||
num_kv_heads=num_kv_heads,
|
num_kv_heads=num_kv_heads,
|
||||||
head_size=head_size,
|
head_size=head_size,
|
||||||
dtype=dtype,
|
dtype=dtype,
|
||||||
)
|
)
|
||||||
|
|
||||||
output_ref = ref_context_attention(
|
output_ref = ref_context_attention(
|
||||||
query,
|
query,
|
||||||
key,
|
key,
|
||||||
value,
|
value,
|
||||||
query_lens,
|
query_lens,
|
||||||
seq_lens,
|
seq_lens,
|
||||||
head_size,
|
head_size,
|
||||||
num_queries_per_kv,
|
num_queries_per_kv,
|
||||||
return_max_reduce=False,
|
return_max_reduce=False,
|
||||||
)
|
)
|
||||||
|
|
||||||
# build neuron program
|
# build neuron program
|
||||||
B_P_SIZE = 128
|
B_P_SIZE = 128
|
||||||
assert (large_tile_size >= B_P_SIZE
|
assert (large_tile_size >= B_P_SIZE
|
||||||
), f"Expect {large_tile_size=} to be larger than {B_P_SIZE=}"
|
), f"Expect {large_tile_size=} to be larger than {B_P_SIZE=}"
|
||||||
|
|
||||||
def ceil_div(a, b):
|
def ceil_div(a, b):
|
||||||
return (a + b - 1) // b
|
return (a + b - 1) // b
|
||||||
|
|
||||||
def pad_to_multiple(a, b):
|
def pad_to_multiple(a, b):
|
||||||
return ceil_div(a, b) * b
|
return ceil_div(a, b) * b
|
||||||
|
|
||||||
def pad_to_next_power_of_2(a):
|
def pad_to_next_power_of_2(a):
|
||||||
assert a > 0
|
assert a > 0
|
||||||
return 2**int(a - 1).bit_length()
|
return 2**int(a - 1).bit_length()
|
||||||
|
|
||||||
# calculate input shapes
|
# calculate input shapes
|
||||||
max_num_queries = pad_to_next_power_of_2(sum(query_lens))
|
max_num_queries = pad_to_next_power_of_2(sum(query_lens))
|
||||||
context_lens = torch.tensor(seq_lens) - torch.tensor(query_lens)
|
context_lens = torch.tensor(seq_lens) - torch.tensor(query_lens)
|
||||||
num_active_blocks = ceil_div(context_lens, block_size).sum().item()
|
num_active_blocks = ceil_div(context_lens, block_size).sum().item()
|
||||||
num_active_blocks = pad_to_multiple(num_active_blocks,
|
num_active_blocks = pad_to_multiple(num_active_blocks,
|
||||||
large_tile_size // block_size)
|
large_tile_size // block_size)
|
||||||
context_kv_len = num_active_blocks * block_size
|
context_kv_len = num_active_blocks * block_size
|
||||||
assert (context_kv_len %
|
assert (
|
||||||
|
context_kv_len %
|
||||||
large_tile_size == 0), f"invalid context_kv_len={context_kv_len}"
|
large_tile_size == 0), f"invalid context_kv_len={context_kv_len}"
|
||||||
|
|
||||||
# pad QKV tensors
|
# pad QKV tensors
|
||||||
pad_dims = (
|
pad_dims = (
|
||||||
0,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
0,
|
|
||||||
max_num_queries - query.shape[0],
|
|
||||||
)
|
|
||||||
query = F.pad(query, pad_dims, "constant", 0)
|
|
||||||
k = F.pad(k_active, pad_dims, "constant", 0)
|
|
||||||
v = F.pad(v_active, pad_dims, "constant", 0)
|
|
||||||
|
|
||||||
# permute QKV tensors
|
|
||||||
# query: (1, n_heads, d, seq_q)
|
|
||||||
# key: (1, n_kv_heads, d, seq_k)
|
|
||||||
# value: (1, n_kv_heads, seq_v, d)
|
|
||||||
query = query.unsqueeze(0).permute(0, 2, 3, 1).contiguous()
|
|
||||||
k = k.unsqueeze(0).permute(0, 2, 3, 1).contiguous()
|
|
||||||
v = v.unsqueeze(0).permute(0, 2, 1, 3).contiguous()
|
|
||||||
k_cache = k_cache.permute(0, 2, 1, 3).contiguous()
|
|
||||||
v_cache = v_cache.permute(0, 2, 1, 3).contiguous()
|
|
||||||
|
|
||||||
# transform block table
|
|
||||||
active_block_table = get_active_block_tables(
|
|
||||||
block_table.cpu(),
|
|
||||||
torch.tensor(query_lens).cpu(),
|
|
||||||
torch.tensor(seq_lens).cpu(),
|
|
||||||
block_size,
|
|
||||||
num_active_blocks,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Build attention masks
|
|
||||||
prior_mask, active_mask = (
|
|
||||||
BlockDiagonalCausalFromBottomRightMask.from_seqlens(
|
|
||||||
query_lens, seq_lens, block_size=block_size))
|
|
||||||
prior_mask_padded = F.pad(
|
|
||||||
prior_mask,
|
|
||||||
(
|
|
||||||
0,
|
0,
|
||||||
context_kv_len - prior_mask.shape[1],
|
|
||||||
0,
|
0,
|
||||||
max_num_queries - prior_mask.shape[0],
|
|
||||||
),
|
|
||||||
"constant",
|
|
||||||
0,
|
|
||||||
).bool()
|
|
||||||
active_mask_padded = F.pad(
|
|
||||||
active_mask,
|
|
||||||
(
|
|
||||||
0,
|
0,
|
||||||
max_num_queries - active_mask.shape[1],
|
|
||||||
0,
|
0,
|
||||||
max_num_queries - active_mask.shape[0],
|
0,
|
||||||
),
|
max_num_queries - query.shape[0],
|
||||||
"constant",
|
)
|
||||||
0,
|
query = F.pad(query, pad_dims, "constant", 0)
|
||||||
).bool()
|
k = F.pad(k_active, pad_dims, "constant", 0)
|
||||||
attn_mask = torch.concat([prior_mask_padded, active_mask_padded], dim=1)
|
v = F.pad(v_active, pad_dims, "constant", 0)
|
||||||
|
|
||||||
attn_mask = reorder_context_mask(attn_mask, large_tile_size, block_size)
|
# permute QKV tensors
|
||||||
|
# query: (1, n_heads, d, seq_q)
|
||||||
|
# key: (1, n_kv_heads, d, seq_k)
|
||||||
|
# value: (1, n_kv_heads, seq_v, d)
|
||||||
|
query = query.unsqueeze(0).permute(0, 2, 3, 1).contiguous()
|
||||||
|
k = k.unsqueeze(0).permute(0, 2, 3, 1).contiguous()
|
||||||
|
v = v.unsqueeze(0).permute(0, 2, 1, 3).contiguous()
|
||||||
|
k_cache = k_cache.permute(0, 2, 1, 3).contiguous()
|
||||||
|
v_cache = v_cache.permute(0, 2, 1, 3).contiguous()
|
||||||
|
|
||||||
input_args = (
|
# transform block table
|
||||||
query.to(device=device),
|
active_block_table = get_active_block_tables(
|
||||||
k.to(device=device),
|
block_table.cpu(),
|
||||||
v.to(device=device),
|
torch.tensor(query_lens).cpu(),
|
||||||
k_cache.to(device=device),
|
torch.tensor(seq_lens).cpu(),
|
||||||
v_cache.to(device=device),
|
block_size,
|
||||||
active_block_table.to(device=device),
|
num_active_blocks,
|
||||||
attn_mask.to(device=device),
|
)
|
||||||
)
|
|
||||||
input_kwargs = dict(
|
|
||||||
n_kv_head=num_kv_heads,
|
|
||||||
head_size=head_size,
|
|
||||||
mixed_precision=mixed_precision,
|
|
||||||
LARGE_TILE_SZ=large_tile_size,
|
|
||||||
)
|
|
||||||
|
|
||||||
output_nki = flash_attn_varlen_nkifunc(*input_args, **input_kwargs)
|
# Build attention masks
|
||||||
|
prior_mask, active_mask = (
|
||||||
|
BlockDiagonalCausalFromBottomRightMask.from_seqlens(
|
||||||
|
query_lens, seq_lens, block_size=block_size))
|
||||||
|
prior_mask_padded = F.pad(
|
||||||
|
prior_mask,
|
||||||
|
(
|
||||||
|
0,
|
||||||
|
context_kv_len - prior_mask.shape[1],
|
||||||
|
0,
|
||||||
|
max_num_queries - prior_mask.shape[0],
|
||||||
|
),
|
||||||
|
"constant",
|
||||||
|
0,
|
||||||
|
).bool()
|
||||||
|
active_mask_padded = F.pad(
|
||||||
|
active_mask,
|
||||||
|
(
|
||||||
|
0,
|
||||||
|
max_num_queries - active_mask.shape[1],
|
||||||
|
0,
|
||||||
|
max_num_queries - active_mask.shape[0],
|
||||||
|
),
|
||||||
|
"constant",
|
||||||
|
0,
|
||||||
|
).bool()
|
||||||
|
attn_mask = torch.concat([prior_mask_padded, active_mask_padded],
|
||||||
|
dim=1)
|
||||||
|
|
||||||
num_actual_tokens = sum(query_lens)
|
attn_mask = reorder_context_mask(attn_mask, large_tile_size,
|
||||||
# - o: shape (bs, n_heads, seq_q, d) -> (bs, seq_q, n_heads, d)
|
block_size)
|
||||||
output_nki = output_nki.cpu().permute(0, 2, 1, 3)
|
|
||||||
output_nki = output_nki[0, :num_actual_tokens, :, :]
|
|
||||||
output_ref_padded = F.pad(
|
|
||||||
output_ref,
|
|
||||||
(0, 0, 0, 0, 0, 0, 0, max_num_queries - output_ref.shape[0]),
|
|
||||||
"constant",
|
|
||||||
0,
|
|
||||||
)
|
|
||||||
output_ref = output_ref_padded.transpose(0, 1)[0, :num_actual_tokens, :, :]
|
|
||||||
|
|
||||||
torch.testing.assert_close(output_nki, output_ref, atol=1e-2, rtol=0)
|
input_args = (
|
||||||
|
query.to(device=device),
|
||||||
|
k.to(device=device),
|
||||||
|
v.to(device=device),
|
||||||
|
k_cache.to(device=device),
|
||||||
|
v_cache.to(device=device),
|
||||||
|
active_block_table.to(device=device),
|
||||||
|
attn_mask.to(device=device),
|
||||||
|
)
|
||||||
|
input_kwargs = dict(
|
||||||
|
n_kv_head=num_kv_heads,
|
||||||
|
head_size=head_size,
|
||||||
|
mixed_precision=mixed_precision,
|
||||||
|
LARGE_TILE_SZ=large_tile_size,
|
||||||
|
)
|
||||||
|
|
||||||
|
output_nki = flash_attn_varlen_nkifunc(*input_args, **input_kwargs)
|
||||||
|
|
||||||
|
num_actual_tokens = sum(query_lens)
|
||||||
|
# - o: shape (bs, n_heads, seq_q, d) -> (bs, seq_q, n_heads, d)
|
||||||
|
output_nki = output_nki.cpu().permute(0, 2, 1, 3)
|
||||||
|
output_nki = output_nki[0, :num_actual_tokens, :, :]
|
||||||
|
output_ref_padded = F.pad(
|
||||||
|
output_ref,
|
||||||
|
(0, 0, 0, 0, 0, 0, 0, max_num_queries - output_ref.shape[0]),
|
||||||
|
"constant",
|
||||||
|
0,
|
||||||
|
)
|
||||||
|
output_ref = output_ref_padded.transpose(
|
||||||
|
0, 1)[0, :num_actual_tokens, :, :]
|
||||||
|
|
||||||
|
torch.testing.assert_close(output_nki, output_ref, atol=1e-2, rtol=0)
|
||||||
|
@ -1,10 +1,10 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from tests.kernels.utils import override_backend_env_variable
|
|
||||||
from vllm.attention.selector import get_attn_backend
|
from vllm.attention.selector import get_attn_backend
|
||||||
from vllm.utils import STR_INVALID_VAL
|
from vllm.utils import STR_BACKEND_ENV_VAR, STR_INVALID_VAL
|
||||||
|
|
||||||
|
|
||||||
def test_platform_plugins():
|
def test_platform_plugins():
|
||||||
@ -25,8 +25,9 @@ def test_platform_plugins():
|
|||||||
f" is loaded. The first import:\n{_init_trace}")
|
f" is loaded. The first import:\n{_init_trace}")
|
||||||
|
|
||||||
|
|
||||||
def test_oot_attention_backend(monkeypatch):
|
def test_oot_attention_backend(monkeypatch: pytest.MonkeyPatch):
|
||||||
# ignore the backend env variable if it is set
|
# ignore the backend env variable if it is set
|
||||||
override_backend_env_variable(monkeypatch, STR_INVALID_VAL)
|
with monkeypatch.context() as m:
|
||||||
backend = get_attn_backend(16, torch.float16, torch.float16, 16, False)
|
m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL)
|
||||||
assert backend.get_name() == "Dummy_Backend"
|
backend = get_attn_backend(16, torch.float16, torch.float16, 16, False)
|
||||||
|
assert backend.get_name() == "Dummy_Backend"
|
||||||
|
@ -22,43 +22,47 @@ class DummyV1Scheduler(V1Scheduler):
|
|||||||
raise Exception("Exception raised by DummyV1Scheduler")
|
raise Exception("Exception raised by DummyV1Scheduler")
|
||||||
|
|
||||||
|
|
||||||
def test_scheduler_plugins_v0(monkeypatch):
|
def test_scheduler_plugins_v0(monkeypatch: pytest.MonkeyPatch):
|
||||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
with monkeypatch.context() as m:
|
||||||
with pytest.raises(Exception) as exception_info:
|
m.setenv("VLLM_USE_V1", "0")
|
||||||
|
with pytest.raises(Exception) as exception_info:
|
||||||
|
|
||||||
engine_args = EngineArgs(
|
engine_args = EngineArgs(
|
||||||
model="facebook/opt-125m",
|
model="facebook/opt-125m",
|
||||||
enforce_eager=True, # reduce test time
|
enforce_eager=True, # reduce test time
|
||||||
scheduler_cls=DummyV0Scheduler,
|
scheduler_cls=DummyV0Scheduler,
|
||||||
)
|
)
|
||||||
|
|
||||||
engine = LLMEngine.from_engine_args(engine_args=engine_args)
|
engine = LLMEngine.from_engine_args(engine_args=engine_args)
|
||||||
|
|
||||||
sampling_params = SamplingParams(max_tokens=1)
|
sampling_params = SamplingParams(max_tokens=1)
|
||||||
engine.add_request("0", "foo", sampling_params)
|
engine.add_request("0", "foo", sampling_params)
|
||||||
engine.step()
|
engine.step()
|
||||||
|
|
||||||
assert str(exception_info.value) == "Exception raised by DummyV0Scheduler"
|
assert str(
|
||||||
|
exception_info.value) == "Exception raised by DummyV0Scheduler"
|
||||||
|
|
||||||
|
|
||||||
def test_scheduler_plugins_v1(monkeypatch):
|
def test_scheduler_plugins_v1(monkeypatch: pytest.MonkeyPatch):
|
||||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
with monkeypatch.context() as m:
|
||||||
# Explicitly turn off engine multiprocessing so that the scheduler runs in
|
m.setenv("VLLM_USE_V1", "1")
|
||||||
# this process
|
# Explicitly turn off engine multiprocessing so
|
||||||
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
|
# that the scheduler runs in this process
|
||||||
|
m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
|
||||||
|
|
||||||
with pytest.raises(Exception) as exception_info:
|
with pytest.raises(Exception) as exception_info:
|
||||||
|
|
||||||
engine_args = EngineArgs(
|
engine_args = EngineArgs(
|
||||||
model="facebook/opt-125m",
|
model="facebook/opt-125m",
|
||||||
enforce_eager=True, # reduce test time
|
enforce_eager=True, # reduce test time
|
||||||
scheduler_cls=DummyV1Scheduler,
|
scheduler_cls=DummyV1Scheduler,
|
||||||
)
|
)
|
||||||
|
|
||||||
engine = V1LLMEngine.from_engine_args(engine_args=engine_args)
|
engine = V1LLMEngine.from_engine_args(engine_args=engine_args)
|
||||||
|
|
||||||
sampling_params = SamplingParams(max_tokens=1)
|
sampling_params = SamplingParams(max_tokens=1)
|
||||||
engine.add_request("0", "foo", sampling_params)
|
engine.add_request("0", "foo", sampling_params)
|
||||||
engine.step()
|
engine.step()
|
||||||
|
|
||||||
assert str(exception_info.value) == "Exception raised by DummyV1Scheduler"
|
assert str(
|
||||||
|
exception_info.value) == "Exception raised by DummyV1Scheduler"
|
||||||
|
@ -4,25 +4,29 @@
|
|||||||
Run `pytest tests/prefix_caching/test_prefix_caching.py`.
|
Run `pytest tests/prefix_caching/test_prefix_caching.py`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from tests.conftest import VllmRunner
|
from tests.conftest import VllmRunner
|
||||||
from tests.core.utils import SchedulerProxy, create_dummy_prompt
|
from tests.core.utils import SchedulerProxy, create_dummy_prompt
|
||||||
from tests.kernels.utils import override_backend_env_variable
|
|
||||||
from vllm import SamplingParams, TokensPrompt
|
from vllm import SamplingParams, TokensPrompt
|
||||||
from vllm.core.scheduler import Scheduler
|
from vllm.core.scheduler import Scheduler
|
||||||
from vllm.engine.llm_engine import LLMEngine
|
from vllm.engine.llm_engine import LLMEngine
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
from vllm.utils import STR_BACKEND_ENV_VAR
|
||||||
|
|
||||||
from ..models.utils import check_outputs_equal
|
from ..models.utils import check_outputs_equal
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="function", autouse=True)
|
@pytest.fixture(scope="function", autouse=True)
|
||||||
def use_v0_only(monkeypatch):
|
def use_v0_only(monkeypatch: pytest.MonkeyPatch):
|
||||||
"""
|
"""
|
||||||
This module relies on V0 internals, so set VLLM_USE_V1=0.
|
This module relies on V0 internals, so set VLLM_USE_V1=0.
|
||||||
"""
|
"""
|
||||||
monkeypatch.setenv('VLLM_USE_V1', '0')
|
with monkeypatch.context() as m:
|
||||||
|
m.setenv('VLLM_USE_V1', '0')
|
||||||
|
yield
|
||||||
|
|
||||||
|
|
||||||
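For autouse fixtures such as use_v0_only in the hunk above, the yield has to sit inside the monkeypatch.context() block; if the fixture returned instead, the context would close before the test body ran and VLLM_USE_V1 would already be restored. A small sketch of that shape (the test body is illustrative, not taken from the suite):

import os

import pytest


@pytest.fixture(autouse=True)
def use_v0_only(monkeypatch: pytest.MonkeyPatch):
    # Yield inside the context so the patch stays active for the whole test.
    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "0")
        yield


def test_runs_under_v0():
    # The autouse fixture above guarantees V0 is selected here.
    assert os.environ["VLLM_USE_V1"] == "0"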
MODELS = [
|
MODELS = [
|
||||||
@ -56,7 +60,7 @@ def test_mixed_requests(
|
|||||||
cached_position: int,
|
cached_position: int,
|
||||||
enable_chunked_prefill: bool,
|
enable_chunked_prefill: bool,
|
||||||
block_size: int,
|
block_size: int,
|
||||||
monkeypatch,
|
monkeypatch: pytest.MonkeyPatch,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""
|
"""
|
||||||
Test the case when some sequences have the prefix cache hit
|
Test the case when some sequences have the prefix cache hit
|
||||||
@ -67,72 +71,77 @@ def test_mixed_requests(
|
|||||||
pytest.skip("Flashinfer does not support ROCm/HIP.")
|
pytest.skip("Flashinfer does not support ROCm/HIP.")
|
||||||
if backend == "XFORMERS" and current_platform.is_rocm():
|
if backend == "XFORMERS" and current_platform.is_rocm():
|
||||||
pytest.skip("Xformers does not support ROCm/HIP.")
|
pytest.skip("Xformers does not support ROCm/HIP.")
|
||||||
override_backend_env_variable(monkeypatch, backend)
|
with monkeypatch.context() as m:
|
||||||
|
m.setenv(STR_BACKEND_ENV_VAR, backend)
|
||||||
|
|
||||||
with hf_runner(model, dtype=dtype) as hf_model:
|
with hf_runner(model, dtype=dtype) as hf_model:
|
||||||
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
|
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
|
||||||
|
|
||||||
cached_prompt = example_prompts[cached_position]
|
cached_prompt = example_prompts[cached_position]
|
||||||
with vllm_runner(
|
with vllm_runner(
|
||||||
model,
|
model,
|
||||||
dtype=dtype,
|
dtype=dtype,
|
||||||
enable_prefix_caching=True,
|
enable_prefix_caching=True,
|
||||||
enable_chunked_prefill=enable_chunked_prefill,
|
enable_chunked_prefill=enable_chunked_prefill,
|
||||||
block_size=block_size,
|
block_size=block_size,
|
||||||
) as vllm_model:
|
) as vllm_model:
|
||||||
# Run the first prompt so the cache is populated
|
# Run the first prompt so the cache is populated
|
||||||
vllm_outputs = vllm_model.generate_greedy([cached_prompt], max_tokens)
|
vllm_outputs = vllm_model.generate_greedy([cached_prompt],
|
||||||
|
max_tokens)
|
||||||
|
|
||||||
# Run all the prompts
|
# Run all the prompts
|
||||||
greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
|
greedy_params = SamplingParams(temperature=0.0,
|
||||||
req_outputs = vllm_model.model.generate(example_prompts, greedy_params)
|
max_tokens=max_tokens)
|
||||||
|
req_outputs = vllm_model.model.generate(example_prompts,
|
||||||
|
greedy_params)
|
||||||
|
|
||||||
# Verify number of cached tokens
|
# Verify number of cached tokens
|
||||||
for i in range(len(req_outputs)):
|
for i in range(len(req_outputs)):
|
||||||
if i == cached_position:
|
if i == cached_position:
|
||||||
expected_num_cached_tokens = (
|
expected_num_cached_tokens = (
|
||||||
len(req_outputs[i].prompt_token_ids) //
|
len(req_outputs[i].prompt_token_ids) //
|
||||||
block_size) * block_size
|
block_size) * block_size
|
||||||
else:
|
else:
|
||||||
expected_num_cached_tokens = 0
|
expected_num_cached_tokens = 0
|
||||||
assert (
|
assert (req_outputs[i].num_cached_tokens ==
|
||||||
req_outputs[i].num_cached_tokens == expected_num_cached_tokens)
|
expected_num_cached_tokens)
|
||||||
|
|
||||||
vllm_outputs = [(
|
vllm_outputs = [(
|
||||||
output.prompt_token_ids + list(output.outputs[0].token_ids),
|
output.prompt_token_ids + list(output.outputs[0].token_ids),
|
||||||
output.prompt + output.outputs[0].text,
|
output.prompt + output.outputs[0].text,
|
||||||
) for output in req_outputs]
|
) for output in req_outputs]
|
||||||
|
|
||||||
check_outputs_equal(
|
check_outputs_equal(
|
||||||
outputs_0_lst=hf_outputs,
|
outputs_0_lst=hf_outputs,
|
||||||
outputs_1_lst=vllm_outputs,
|
outputs_1_lst=vllm_outputs,
|
||||||
name_0="hf",
|
name_0="hf",
|
||||||
name_1="vllm",
|
name_1="vllm",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"])
|
@pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"])
|
||||||
def test_unstable_prompt_sequence(
|
def test_unstable_prompt_sequence(
|
||||||
vllm_runner,
|
vllm_runner,
|
||||||
backend: str,
|
backend: str,
|
||||||
monkeypatch,
|
monkeypatch: pytest.MonkeyPatch,
|
||||||
) -> None:
|
) -> None:
|
||||||
|
|
||||||
if backend == "FLASHINFER" and current_platform.is_rocm():
|
if backend == "FLASHINFER" and current_platform.is_rocm():
|
||||||
pytest.skip("Flashinfer does not support ROCm/HIP.")
|
pytest.skip("Flashinfer does not support ROCm/HIP.")
|
||||||
if backend == "XFORMERS" and current_platform.is_rocm():
|
if backend == "XFORMERS" and current_platform.is_rocm():
|
||||||
pytest.skip("Xformers does not support ROCm/HIP.")
|
pytest.skip("Xformers does not support ROCm/HIP.")
|
||||||
override_backend_env_variable(monkeypatch, backend)
|
with monkeypatch.context() as m:
|
||||||
|
m.setenv(STR_BACKEND_ENV_VAR, backend)
|
||||||
|
|
||||||
with vllm_runner(
|
with vllm_runner(
|
||||||
"Qwen/Qwen2.5-0.5B-Instruct",
|
"Qwen/Qwen2.5-0.5B-Instruct",
|
||||||
enable_chunked_prefill=True,
|
enable_chunked_prefill=True,
|
||||||
enable_prefix_caching=True,
|
enable_prefix_caching=True,
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
) as vllm_model:
|
) as vllm_model:
|
||||||
for prompt in UNSTABLE_PROMPT_SEQUENCE:
|
for prompt in UNSTABLE_PROMPT_SEQUENCE:
|
||||||
vllm_model.generate(TokensPrompt(prompt_token_ids=prompt),
|
vllm_model.generate(TokensPrompt(prompt_token_ids=prompt),
|
||||||
SamplingParams(max_tokens=1))
|
SamplingParams(max_tokens=1))
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model", MODELS)
|
@pytest.mark.parametrize("model", MODELS)
|
||||||
|
@ -56,12 +56,11 @@ def test_gc():
|
|||||||
assert allocated < 50 * 1024 * 1024
|
assert allocated < 50 * 1024 * 1024
|
||||||
|
|
||||||
|
|
||||||
def test_model_from_modelscope(monkeypatch):
|
def test_model_from_modelscope(monkeypatch: pytest.MonkeyPatch):
|
||||||
# model: https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary
|
# model: https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary
|
||||||
MODELSCOPE_MODEL_NAME = "qwen/Qwen1.5-0.5B-Chat"
|
with monkeypatch.context() as m:
|
||||||
monkeypatch.setenv("VLLM_USE_MODELSCOPE", "True")
|
m.setenv("VLLM_USE_MODELSCOPE", "True")
|
||||||
try:
|
llm = LLM(model="qwen/Qwen1.5-0.5B-Chat")
|
||||||
llm = LLM(model=MODELSCOPE_MODEL_NAME)
|
|
||||||
|
|
||||||
prompts = [
|
prompts = [
|
||||||
"Hello, my name is",
|
"Hello, my name is",
|
||||||
@ -73,10 +72,3 @@ def test_model_from_modelscope(monkeypatch):
|
|||||||
|
|
||||||
outputs = llm.generate(prompts, sampling_params)
|
outputs = llm.generate(prompts, sampling_params)
|
||||||
assert len(outputs) == 4
|
assert len(outputs) == 4
|
||||||
finally:
|
|
||||||
monkeypatch.delenv("VLLM_USE_MODELSCOPE", raising=False)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
import pytest
|
|
||||||
pytest.main([__file__])
|
|
||||||
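The ModelScope hunk above also drops the manual try/finally cleanup: a monkeypatch context records the prior state of each variable and undoes the change on exit, so the explicit delenv is redundant. A rough, self-contained illustration of the two styles (the variable name matches the diff; the assertions are illustrative):

import os

import pytest


def test_modelscope_old_style():
    # Old pattern: manual bookkeeping with try/finally.
    os.environ["VLLM_USE_MODELSCOPE"] = "True"
    try:
        assert os.environ["VLLM_USE_MODELSCOPE"] == "True"
    finally:
        os.environ.pop("VLLM_USE_MODELSCOPE", None)


def test_modelscope_new_style(monkeypatch: pytest.MonkeyPatch):
    # New pattern: monkeypatch undoes the setenv automatically.
    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_MODELSCOPE", "True")
        assert os.environ["VLLM_USE_MODELSCOPE"] == "True"
    # Restored here (removed again, assuming it was unset beforehand).
    assert "VLLM_USE_MODELSCOPE" not in os.environ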
|
@ -1,7 +1,7 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# ruff: noqa
|
||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
import os
|
|
||||||
import socket
|
import socket
|
||||||
from collections.abc import AsyncIterator
|
from collections.abc import AsyncIterator
|
||||||
from unittest.mock import patch
|
from unittest.mock import patch
|
||||||
@ -112,16 +112,16 @@ def test_deprecate_kwargs_additional_message():
|
|||||||
dummy(old_arg=1)
|
dummy(old_arg=1)
|
||||||
|
|
||||||
|
|
||||||
def test_get_open_port():
|
def test_get_open_port(monkeypatch: pytest.MonkeyPatch):
|
||||||
os.environ["VLLM_PORT"] = "5678"
|
with monkeypatch.context() as m:
|
||||||
# make sure we can get multiple ports, even if the env var is set
|
m.setenv("VLLM_PORT", "5678")
|
||||||
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s1:
|
# make sure we can get multiple ports, even if the env var is set
|
||||||
s1.bind(("localhost", get_open_port()))
|
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s1:
|
||||||
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s2:
|
s1.bind(("localhost", get_open_port()))
|
||||||
s2.bind(("localhost", get_open_port()))
|
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s2:
|
||||||
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s3:
|
s2.bind(("localhost", get_open_port()))
|
||||||
s3.bind(("localhost", get_open_port()))
|
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s3:
|
||||||
os.environ.pop("VLLM_PORT")
|
s3.bind(("localhost", get_open_port()))
|
||||||
|
|
||||||
|
|
||||||
# Tests for FlexibleArgumentParser
|
# Tests for FlexibleArgumentParser
|
||||||
@ -366,31 +366,32 @@ def test_bind_kv_cache_non_attention():
|
|||||||
assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[1]
|
assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[1]
|
||||||
|
|
||||||
|
|
||||||
def test_bind_kv_cache_encoder_decoder(monkeypatch):
|
def test_bind_kv_cache_encoder_decoder(monkeypatch: pytest.MonkeyPatch):
|
||||||
# V1 TESTS: ENCODER_DECODER is not supported on V1 yet.
|
# V1 TESTS: ENCODER_DECODER is not supported on V1 yet.
|
||||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
with monkeypatch.context() as m:
|
||||||
|
m.setenv("VLLM_USE_V1", "0")
|
||||||
|
|
||||||
from vllm.attention import Attention, AttentionType
|
from vllm.attention import Attention, AttentionType
|
||||||
|
|
||||||
# example from bart
|
# example from bart
|
||||||
ctx = {
|
ctx = {
|
||||||
'encoder.layers.0.self_attn.attn':
|
'encoder.layers.0.self_attn.attn':
|
||||||
Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER),
|
Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER),
|
||||||
'decoder.layers.0.encoder_attn.attn':
|
'decoder.layers.0.encoder_attn.attn':
|
||||||
Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER_DECODER),
|
Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER_DECODER),
|
||||||
'decoder.layers.0.self_attn.attn':
|
'decoder.layers.0.self_attn.attn':
|
||||||
Attention(32, 128, 0.1, attn_type=AttentionType.DECODER),
|
Attention(32, 128, 0.1, attn_type=AttentionType.DECODER),
|
||||||
}
|
}
|
||||||
|
|
||||||
kv_cache = [
|
kv_cache = [
|
||||||
torch.zeros((1, )),
|
torch.zeros((1, )),
|
||||||
]
|
]
|
||||||
encoder_kv_cache = ctx['encoder.layers.0.self_attn.attn'].kv_cache
|
encoder_kv_cache = ctx['encoder.layers.0.self_attn.attn'].kv_cache
|
||||||
|
|
||||||
bind_kv_cache(ctx, [kv_cache])
|
bind_kv_cache(ctx, [kv_cache])
|
||||||
assert ctx['encoder.layers.0.self_attn.attn'].kv_cache is encoder_kv_cache
|
assert ctx['encoder.layers.0.self_attn.attn'].kv_cache is encoder_kv_cache
|
||||||
assert ctx['decoder.layers.0.encoder_attn.attn'].kv_cache[0] is kv_cache[0]
|
assert ctx['decoder.layers.0.encoder_attn.attn'].kv_cache[0] is kv_cache[0]
|
||||||
assert ctx['decoder.layers.0.self_attn.attn'].kv_cache[0] is kv_cache[0]
|
assert ctx['decoder.layers.0.self_attn.attn'].kv_cache[0] is kv_cache[0]
|
||||||
|
|
||||||
|
|
||||||
def test_bind_kv_cache_pp():
|
def test_bind_kv_cache_pp():
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
import os
|
import pytest
|
||||||
|
|
||||||
from vllm.config import CompilationLevel
|
from vllm.config import CompilationLevel
|
||||||
|
|
||||||
@ -9,16 +9,17 @@ from ..utils import compare_two_settings
|
|||||||
# --enforce-eager on TPU causes graph compilation
|
# --enforce-eager on TPU causes graph compilation
|
||||||
# this times out the default Health Check in the MQLLMEngine,
|
# this times out the default Health Check in the MQLLMEngine,
|
||||||
# so we set the timeout here to 30s
|
# so we set the timeout here to 30s
|
||||||
os.environ["VLLM_RPC_TIMEOUT"] = "30000"
|
|
||||||
|
|
||||||
|
|
||||||
def test_custom_dispatcher():
|
def test_custom_dispatcher(monkeypatch: pytest.MonkeyPatch):
|
||||||
compare_two_settings(
|
with monkeypatch.context() as m:
|
||||||
"google/gemma-2b",
|
m.setenv("VLLM_RPC_TIMEOUT", "30000")
|
||||||
arg1=[
|
compare_two_settings(
|
||||||
"--enforce-eager",
|
"google/gemma-2b",
|
||||||
f"-O{CompilationLevel.DYNAMO_ONCE}",
|
arg1=[
|
||||||
],
|
"--enforce-eager",
|
||||||
arg2=["--enforce-eager", f"-O{CompilationLevel.DYNAMO_AS_IS}"],
|
f"-O{CompilationLevel.DYNAMO_ONCE}",
|
||||||
env1={},
|
],
|
||||||
env2={})
|
arg2=["--enforce-eager", f"-O{CompilationLevel.DYNAMO_AS_IS}"],
|
||||||
|
env1={},
|
||||||
|
env2={})
|
||||||
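The TPU hunk above removes a module-level os.environ["VLLM_RPC_TIMEOUT"] assignment and scopes it to the test shown instead. A sketch of that shape, with compare_two_settings stubbed out as a placeholder rather than the real helper:

import pytest


def compare_two_settings(model: str) -> None:
    # Stand-in for the real helper; only the env handling matters here.
    ...


def test_custom_dispatcher(monkeypatch: pytest.MonkeyPatch):
    with monkeypatch.context() as m:
        # Previously set at import time for the whole module; now scoped
        # to this test and restored as soon as the block exits.
        m.setenv("VLLM_RPC_TIMEOUT", "30000")
        compare_two_settings("google/gemma-2b")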
|
@ -1,10 +1,12 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# ruff: noqa
|
||||||
|
# type: ignore
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
import os
|
|
||||||
import threading
|
import threading
|
||||||
from collections.abc import Iterable
|
from collections.abc import Iterable
|
||||||
from concurrent import futures
|
from concurrent import futures
|
||||||
from typing import Callable, Literal
|
from typing import Callable, Generator, Literal
|
||||||
|
|
||||||
import grpc
|
import grpc
|
||||||
import pytest
|
import pytest
|
||||||
@ -21,12 +23,14 @@ from vllm.tracing import SpanAttributes
|
|||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="function", autouse=True)
|
@pytest.fixture(scope="function", autouse=True)
|
||||||
def use_v0_only(monkeypatch):
|
def use_v0_only(monkeypatch: pytest.MonkeyPatch):
|
||||||
"""
|
"""
|
||||||
Since this module is V0 only, set VLLM_USE_V1=0 for
|
Since this module is V0 only, set VLLM_USE_V1=0 for
|
||||||
all tests in the module.
|
all tests in the module.
|
||||||
"""
|
"""
|
||||||
monkeypatch.setenv('VLLM_USE_V1', '0')
|
with monkeypatch.context() as m:
|
||||||
|
m.setenv('VLLM_USE_V1', '0')
|
||||||
|
yield
|
||||||
|
|
||||||
|
|
||||||
FAKE_TRACE_SERVER_ADDRESS = "localhost:4317"
|
FAKE_TRACE_SERVER_ADDRESS = "localhost:4317"
|
||||||
@ -67,7 +71,7 @@ class FakeTraceService(TraceServiceServicer):
|
|||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def trace_service():
|
def trace_service() -> Generator[FakeTraceService, None, None]:
|
||||||
"""Fixture to set up a fake gRPC trace service"""
|
"""Fixture to set up a fake gRPC trace service"""
|
||||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=1))
|
server = grpc.server(futures.ThreadPoolExecutor(max_workers=1))
|
||||||
service = FakeTraceService()
|
service = FakeTraceService()
|
||||||
@ -80,136 +84,153 @@ def trace_service():
|
|||||||
server.stop(None)
|
server.stop(None)
|
||||||
|
|
||||||
|
|
||||||
def test_traces(trace_service):
|
def test_traces(
|
||||||
os.environ[OTEL_EXPORTER_OTLP_TRACES_INSECURE] = "true"
|
monkeypatch: pytest.MonkeyPatch,
|
||||||
|
trace_service: FakeTraceService,
|
||||||
|
):
|
||||||
|
with monkeypatch.context() as m:
|
||||||
|
m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true")
|
||||||
|
|
||||||
sampling_params = SamplingParams(temperature=0.01,
|
sampling_params = SamplingParams(
|
||||||
top_p=0.1,
|
temperature=0.01,
|
||||||
max_tokens=256)
|
top_p=0.1,
|
||||||
model = "facebook/opt-125m"
|
max_tokens=256,
|
||||||
llm = LLM(
|
)
|
||||||
model=model,
|
model = "facebook/opt-125m"
|
||||||
otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
|
llm = LLM(
|
||||||
)
|
model=model,
|
||||||
prompts = ["This is a short prompt"]
|
otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
|
||||||
outputs = llm.generate(prompts, sampling_params=sampling_params)
|
)
|
||||||
|
prompts = ["This is a short prompt"]
|
||||||
|
outputs = llm.generate(prompts, sampling_params=sampling_params)
|
||||||
|
|
||||||
timeout = 5
|
timeout = 5
|
||||||
if not trace_service.evt.wait(timeout):
|
if not trace_service.evt.wait(timeout):
|
||||||
raise TimeoutError(
|
raise TimeoutError(
|
||||||
f"The fake trace service didn't receive a trace within "
|
f"The fake trace service didn't receive a trace within "
|
||||||
f"the {timeout} seconds timeout")
|
f"the {timeout} seconds timeout")
|
||||||
|
|
||||||
request = trace_service.request
|
request = trace_service.request
|
||||||
assert len(request.resource_spans) == 1, (
|
assert len(request.resource_spans) == 1, (
|
||||||
f"Expected 1 resource span, "
|
f"Expected 1 resource span, "
|
||||||
f"but got {len(request.resource_spans)}")
|
f"but got {len(request.resource_spans)}")
|
||||||
assert len(request.resource_spans[0].scope_spans) == 1, (
|
assert len(request.resource_spans[0].scope_spans) == 1, (
|
||||||
f"Expected 1 scope span, "
|
f"Expected 1 scope span, "
|
||||||
f"but got {len(request.resource_spans[0].scope_spans)}")
|
f"but got {len(request.resource_spans[0].scope_spans)}")
|
||||||
assert len(request.resource_spans[0].scope_spans[0].spans) == 1, (
|
assert len(request.resource_spans[0].scope_spans[0].spans) == 1, (
|
||||||
f"Expected 1 span, "
|
f"Expected 1 span, "
|
||||||
f"but got {len(request.resource_spans[0].scope_spans[0].spans)}")
|
f"but got {len(request.resource_spans[0].scope_spans[0].spans)}")
|
||||||
|
|
||||||
attributes = decode_attributes(
|
attributes = decode_attributes(
|
||||||
request.resource_spans[0].scope_spans[0].spans[0].attributes)
|
request.resource_spans[0].scope_spans[0].spans[0].attributes)
|
||||||
assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
|
assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
|
||||||
assert attributes.get(
|
assert attributes.get(
|
||||||
SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id
|
SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id
|
||||||
assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE
|
assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE
|
||||||
) == sampling_params.temperature
|
) == sampling_params.temperature
|
||||||
assert attributes.get(
|
assert attributes.get(
|
||||||
SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p
|
SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p
|
||||||
assert attributes.get(
|
assert attributes.get(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS
|
||||||
SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS) == sampling_params.max_tokens
|
) == sampling_params.max_tokens
|
||||||
assert attributes.get(SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n
|
assert attributes.get(
|
||||||
assert attributes.get(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len(
|
SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n
|
||||||
outputs[0].prompt_token_ids)
|
assert attributes.get(
|
||||||
completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs)
|
SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len(
|
||||||
assert attributes.get(
|
outputs[0].prompt_token_ids)
|
||||||
SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens
|
completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs)
|
||||||
metrics = outputs[0].metrics
|
assert attributes.get(
|
||||||
assert attributes.get(
|
SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens
|
||||||
SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue
|
metrics = outputs[0].metrics
|
||||||
ttft = metrics.first_token_time - metrics.arrival_time
|
assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE
|
||||||
assert attributes.get(
|
) == metrics.time_in_queue
|
||||||
SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
|
ttft = metrics.first_token_time - metrics.arrival_time
|
||||||
e2e_time = metrics.finished_time - metrics.arrival_time
|
assert attributes.get(
|
||||||
assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time
|
SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
|
||||||
assert metrics.scheduler_time > 0
|
e2e_time = metrics.finished_time - metrics.arrival_time
|
||||||
assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER
|
assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time
|
||||||
) == metrics.scheduler_time
|
assert metrics.scheduler_time > 0
|
||||||
# Model forward and model execute should be None, since detailed tracing is
|
assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER
|
||||||
# not enabled.
|
) == metrics.scheduler_time
|
||||||
assert metrics.model_forward_time is None
|
# Model forward and model execute should be None, since detailed tracing is
|
||||||
assert metrics.model_execute_time is None
|
# not enabled.
|
||||||
|
assert metrics.model_forward_time is None
|
||||||
|
assert metrics.model_execute_time is None
|
||||||
|
|
||||||
|
|
||||||
def test_traces_with_detailed_steps(trace_service):
|
def test_traces_with_detailed_steps(
|
||||||
os.environ[OTEL_EXPORTER_OTLP_TRACES_INSECURE] = "true"
|
monkeypatch: pytest.MonkeyPatch,
|
||||||
|
trace_service: FakeTraceService,
|
||||||
|
):
|
||||||
|
with monkeypatch.context() as m:
|
||||||
|
m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true")
|
||||||
|
|
||||||
sampling_params = SamplingParams(temperature=0.01,
|
sampling_params = SamplingParams(
|
||||||
top_p=0.1,
|
temperature=0.01,
|
||||||
max_tokens=256)
|
top_p=0.1,
|
||||||
model = "facebook/opt-125m"
|
max_tokens=256,
|
||||||
llm = LLM(
|
)
|
||||||
model=model,
|
model = "facebook/opt-125m"
|
||||||
otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
|
llm = LLM(
|
||||||
collect_detailed_traces="all",
|
model=model,
|
||||||
)
|
otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
|
||||||
prompts = ["This is a short prompt"]
|
collect_detailed_traces="all",
|
||||||
outputs = llm.generate(prompts, sampling_params=sampling_params)
|
)
|
||||||
|
prompts = ["This is a short prompt"]
|
||||||
|
outputs = llm.generate(prompts, sampling_params=sampling_params)
|
||||||
|
|
||||||
timeout = 5
|
timeout = 5
|
||||||
if not trace_service.evt.wait(timeout):
|
if not trace_service.evt.wait(timeout):
|
||||||
raise TimeoutError(
|
raise TimeoutError(
|
||||||
f"The fake trace service didn't receive a trace within "
|
f"The fake trace service didn't receive a trace within "
|
||||||
f"the {timeout} seconds timeout")
|
f"the {timeout} seconds timeout")
|
||||||
|
|
||||||
request = trace_service.request
|
request = trace_service.request
|
||||||
assert len(request.resource_spans) == 1, (
|
assert len(request.resource_spans) == 1, (
|
||||||
f"Expected 1 resource span, "
|
f"Expected 1 resource span, "
|
||||||
f"but got {len(request.resource_spans)}")
|
f"but got {len(request.resource_spans)}")
|
||||||
assert len(request.resource_spans[0].scope_spans) == 1, (
|
assert len(request.resource_spans[0].scope_spans) == 1, (
|
||||||
f"Expected 1 scope span, "
|
f"Expected 1 scope span, "
|
||||||
f"but got {len(request.resource_spans[0].scope_spans)}")
|
f"but got {len(request.resource_spans[0].scope_spans)}")
|
||||||
assert len(request.resource_spans[0].scope_spans[0].spans) == 1, (
|
assert len(request.resource_spans[0].scope_spans[0].spans) == 1, (
|
||||||
f"Expected 1 span, "
|
f"Expected 1 span, "
|
||||||
f"but got {len(request.resource_spans[0].scope_spans[0].spans)}")
|
f"but got {len(request.resource_spans[0].scope_spans[0].spans)}")
|
||||||
|
|
||||||
attributes = decode_attributes(
|
attributes = decode_attributes(
|
||||||
request.resource_spans[0].scope_spans[0].spans[0].attributes)
|
request.resource_spans[0].scope_spans[0].spans[0].attributes)
|
||||||
assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
|
assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
|
||||||
assert attributes.get(
|
assert attributes.get(
|
||||||
SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id
|
SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id
|
||||||
assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE
|
assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE
|
||||||
) == sampling_params.temperature
|
) == sampling_params.temperature
|
||||||
assert attributes.get(
|
assert attributes.get(
|
||||||
SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p
|
SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p
|
||||||
assert attributes.get(
|
assert attributes.get(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS
|
||||||
SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS) == sampling_params.max_tokens
|
) == sampling_params.max_tokens
|
||||||
assert attributes.get(SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n
|
assert attributes.get(
|
||||||
assert attributes.get(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len(
|
SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n
|
||||||
outputs[0].prompt_token_ids)
|
assert attributes.get(
|
||||||
completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs)
|
SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len(
|
||||||
assert attributes.get(
|
outputs[0].prompt_token_ids)
|
||||||
SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens
|
completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs)
|
||||||
metrics = outputs[0].metrics
|
assert attributes.get(
|
||||||
assert attributes.get(
|
SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens
|
||||||
SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue
|
metrics = outputs[0].metrics
|
||||||
ttft = metrics.first_token_time - metrics.arrival_time
|
assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE
|
||||||
assert attributes.get(
|
) == metrics.time_in_queue
|
||||||
SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
|
ttft = metrics.first_token_time - metrics.arrival_time
|
||||||
e2e_time = metrics.finished_time - metrics.arrival_time
|
assert attributes.get(
|
||||||
assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time
|
SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
|
||||||
assert metrics.scheduler_time > 0
|
e2e_time = metrics.finished_time - metrics.arrival_time
|
||||||
assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER
|
assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time
|
||||||
) == metrics.scheduler_time
|
assert metrics.scheduler_time > 0
|
||||||
assert metrics.model_forward_time > 0
|
assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER
|
||||||
assert attributes.get(
|
) == metrics.scheduler_time
|
||||||
SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD) == pytest.approx(
|
assert metrics.model_forward_time > 0
|
||||||
metrics.model_forward_time / 1000)
|
assert attributes.get(
|
||||||
assert metrics.model_execute_time > 0
|
SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD
|
||||||
assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE
|
) == pytest.approx(metrics.model_forward_time / 1000)
|
||||||
) == metrics.model_execute_time
|
assert metrics.model_execute_time > 0
|
||||||
assert metrics.model_forward_time < 1000 * metrics.model_execute_time
|
assert attributes.get(
|
||||||
|
SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE
|
||||||
|
) == metrics.model_execute_time
|
||||||
|
assert metrics.model_forward_time < 1000 * metrics.model_execute_time
|
||||||
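The hunks above apply the same pattern as the rest of this commit: environment variables are set through a scoped monkeypatch context instead of assigning to os.environ directly, so the override is rolled back automatically. A minimal sketch of the idea, assuming the variable is not already set in the surrounding environment (the env name is taken from the test above and used here as a plain string):

import os

import pytest


def test_env_change_is_scoped(monkeypatch: pytest.MonkeyPatch):
    # Inside the context the override is visible to the code under test.
    with monkeypatch.context() as m:
        m.setenv("OTEL_EXPORTER_OTLP_TRACES_INSECURE", "true")
        assert os.environ["OTEL_EXPORTER_OTLP_TRACES_INSECURE"] == "true"
    # On exit the previous state is restored; a bare os.environ[...] write
    # would leak into later tests in the same process.
    assert "OTEL_EXPORTER_OTLP_TRACES_INSECURE" not in os.environ

Because the monkeypatch fixture also undoes changes at test teardown, the explicit context() is mostly about making the scope of the override obvious within a longer test.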
|
@ -566,6 +566,7 @@ def init_test_distributed_environment(
|
|||||||
|
|
||||||
|
|
||||||
def multi_process_parallel(
|
def multi_process_parallel(
|
||||||
|
monkeypatch: pytest.MonkeyPatch,
|
||||||
tp_size: int,
|
tp_size: int,
|
||||||
pp_size: int,
|
pp_size: int,
|
||||||
test_target: Any,
|
test_target: Any,
|
||||||
@ -582,7 +583,13 @@ def multi_process_parallel(
|
|||||||
refs = []
|
refs = []
|
||||||
for rank in range(tp_size * pp_size):
|
for rank in range(tp_size * pp_size):
|
||||||
refs.append(
|
refs.append(
|
||||||
test_target.remote(tp_size, pp_size, rank, distributed_init_port))
|
test_target.remote(
|
||||||
|
monkeypatch,
|
||||||
|
tp_size,
|
||||||
|
pp_size,
|
||||||
|
rank,
|
||||||
|
distributed_init_port,
|
||||||
|
), )
|
||||||
ray.get(refs)
|
ray.get(refs)
|
||||||
|
|
||||||
ray.shutdown()
|
ray.shutdown()
|
||||||
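Here the test-level MonkeyPatch instance becomes an explicit argument that multi_process_parallel forwards to each Ray worker, instead of the workers mutating os.environ on their own. A simplified, Ray-free sketch of that call shape (run_worker and FAKE_RANK are illustrative names, not part of the real suite):

import os

import pytest


def run_worker(monkeypatch: pytest.MonkeyPatch, rank: int) -> str:
    # The helper receives the caller's MonkeyPatch, mirroring how
    # test_target.remote(...) now takes monkeypatch as its first argument.
    with monkeypatch.context() as m:
        m.setenv("FAKE_RANK", str(rank))
        return os.environ["FAKE_RANK"]


def test_forwards_monkeypatch(monkeypatch: pytest.MonkeyPatch):
    assert [run_worker(monkeypatch, r) for r in range(2)] == ["0", "1"]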
|
@ -1,5 +1,8 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
import random
|
import random
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
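This hunk adds from __future__ import annotations so the typed signatures introduced below (for example list[list[dict[str, Any]]]) remain importable on older Python versions, where built-in generics cannot be subscripted in annotations without postponed evaluation. A small self-contained sketch:

from __future__ import annotations

from typing import Any


def count_turns(prompts: list[list[dict[str, Any]]]) -> int:
    # With postponed evaluation the annotation is stored as a string,
    # so this module imports cleanly even on Python 3.8.
    return sum(len(conversation) for conversation in prompts)


print(count_turns([[{"role": "user", "content": "hi"}]]))  # prints 1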
@ -50,8 +53,12 @@ def model_name():
|
|||||||
return "meta-llama/Meta-Llama-3-8B-Instruct"
|
return "meta-llama/Meta-Llama-3-8B-Instruct"
|
||||||
|
|
||||||
|
|
||||||
def test_ngram_correctness(monkeypatch, test_prompts, sampling_config,
|
def test_ngram_correctness(
|
||||||
model_name):
|
monkeypatch: pytest.MonkeyPatch,
|
||||||
|
test_prompts: list[list[dict[str, Any]]],
|
||||||
|
sampling_config: SamplingParams,
|
||||||
|
model_name: str,
|
||||||
|
):
|
||||||
'''
|
'''
|
||||||
The outputs of an original LLM and a speculative LLM
|
The outputs of an original LLM and a speculative LLM
|
||||||
should be the same when using ngram speculative decoding.
|
should be the same when using ngram speculative decoding.
|
||||||
|
@ -80,9 +80,11 @@ async def generate(engine: AsyncLLM,
|
|||||||
[(TEXT_ENGINE_ARGS, TEXT_PROMPT),
|
[(TEXT_ENGINE_ARGS, TEXT_PROMPT),
|
||||||
(VISION_ENGINE_ARGS, VISION_PROMPT)])
|
(VISION_ENGINE_ARGS, VISION_PROMPT)])
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_load(monkeypatch, output_kind: RequestOutputKind,
|
async def test_load(
|
||||||
engine_args_and_prompt: tuple[AsyncEngineArgs,
|
monkeypatch: pytest.MonkeyPatch,
|
||||||
PromptType]):
|
output_kind: RequestOutputKind,
|
||||||
|
engine_args_and_prompt: tuple[AsyncEngineArgs, PromptType],
|
||||||
|
):
|
||||||
# TODO(rickyx): Remove monkeypatch once we have a better way to test V1
|
# TODO(rickyx): Remove monkeypatch once we have a better way to test V1
|
||||||
# so that in the future when we switch, we don't have to change all the
|
# so that in the future when we switch, we don't have to change all the
|
||||||
# tests.
|
# tests.
|
||||||
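test_load and test_abort stay parametrized over (engine_args, prompt) tuples, which lets one test body cover both the text and vision configurations. A toy sketch of that parametrization style, with placeholder case values:

from __future__ import annotations

import pytest

CASES = [
    ({"model": "text-model"}, "Hello"),
    ({"model": "vision-model"}, "Describe this image"),
]


@pytest.mark.parametrize("engine_args_and_prompt", CASES)
def test_unpacks_case(engine_args_and_prompt: tuple[dict, str]):
    engine_args, prompt = engine_args_and_prompt
    assert "model" in engine_args and isinstance(prompt, str)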
@ -126,7 +128,8 @@ async def test_load(monkeypatch, output_kind: RequestOutputKind,
|
|||||||
[(TEXT_ENGINE_ARGS, TEXT_PROMPT),
|
[(TEXT_ENGINE_ARGS, TEXT_PROMPT),
|
||||||
(VISION_ENGINE_ARGS, VISION_PROMPT)])
|
(VISION_ENGINE_ARGS, VISION_PROMPT)])
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_abort(monkeypatch, output_kind: RequestOutputKind,
|
async def test_abort(monkeypatch: pytest.MonkeyPatch,
|
||||||
|
output_kind: RequestOutputKind,
|
||||||
engine_args_and_prompt: tuple[AsyncEngineArgs,
|
engine_args_and_prompt: tuple[AsyncEngineArgs,
|
||||||
PromptType]):
|
PromptType]):
|
||||||
|
|
||||||
|
@ -45,7 +45,7 @@ def make_request() -> EngineCoreRequest:
|
|||||||
|
|
||||||
|
|
||||||
@fork_new_process_for_each_test
|
@fork_new_process_for_each_test
|
||||||
def test_engine_core(monkeypatch):
|
def test_engine_core(monkeypatch: pytest.MonkeyPatch):
|
||||||
|
|
||||||
with monkeypatch.context() as m:
|
with monkeypatch.context() as m:
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
m.setenv("VLLM_USE_V1", "1")
|
||||||
@ -159,7 +159,7 @@ def test_engine_core(monkeypatch):
|
|||||||
|
|
||||||
|
|
||||||
@fork_new_process_for_each_test
|
@fork_new_process_for_each_test
|
||||||
def test_engine_core_advanced_sampling(monkeypatch):
|
def test_engine_core_advanced_sampling(monkeypatch: pytest.MonkeyPatch):
|
||||||
"""
|
"""
|
||||||
A basic end-to-end test to verify that the engine functions correctly
|
A basic end-to-end test to verify that the engine functions correctly
|
||||||
when additional sampling parameters, such as top_p, min_tokens, and
|
when additional sampling parameters, such as top_p, min_tokens, and
|
||||||
@ -209,7 +209,7 @@ def test_engine_core_advanced_sampling(monkeypatch):
|
|||||||
|
|
||||||
|
|
||||||
@fork_new_process_for_each_test
|
@fork_new_process_for_each_test
|
||||||
def test_engine_core_concurrent_batches(monkeypatch):
|
def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
|
||||||
"""
|
"""
|
||||||
Test that the engine can handle multiple concurrent batches.
|
Test that the engine can handle multiple concurrent batches.
|
||||||
"""
|
"""
|
||||||
|
@ -90,7 +90,8 @@ def echo(self, msg: str, err_msg: Optional[str] = None) -> str:
|
|||||||
|
|
||||||
@fork_new_process_for_each_test
|
@fork_new_process_for_each_test
|
||||||
@pytest.mark.parametrize("multiprocessing_mode", [True, False])
|
@pytest.mark.parametrize("multiprocessing_mode", [True, False])
|
||||||
def test_engine_core_client(monkeypatch, multiprocessing_mode: bool):
|
def test_engine_core_client(monkeypatch: pytest.MonkeyPatch,
|
||||||
|
multiprocessing_mode: bool):
|
||||||
|
|
||||||
with monkeypatch.context() as m:
|
with monkeypatch.context() as m:
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
m.setenv("VLLM_USE_V1", "1")
|
||||||
@ -175,7 +176,7 @@ def test_engine_core_client(monkeypatch, multiprocessing_mode: bool):
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio(loop_scope="function")
|
@pytest.mark.asyncio(loop_scope="function")
|
||||||
async def test_engine_core_client_asyncio(monkeypatch):
|
async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch):
|
||||||
|
|
||||||
with monkeypatch.context() as m:
|
with monkeypatch.context() as m:
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
m.setenv("VLLM_USE_V1", "1")
|
||||||
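Several of these engine-core tests wrap a single setenv call in monkeypatch.context(). Plain monkeypatch.setenv would also be rolled back at test teardown, so the context form is mainly about making the scope of the override explicit and keeping the V1 tests consistent. Both variants behave the same at the test boundary, as this sketch shows (FAKE_FLAG is an illustrative name assumed to be unset beforehand):

import os

import pytest


def test_plain_setenv(monkeypatch: pytest.MonkeyPatch):
    # Rolled back automatically when the test finishes.
    monkeypatch.setenv("FAKE_FLAG", "1")
    assert os.environ["FAKE_FLAG"] == "1"


def test_scoped_setenv(monkeypatch: pytest.MonkeyPatch):
    # Rolled back as soon as the with-block exits, before the test ends.
    with monkeypatch.context() as m:
        m.setenv("FAKE_FLAG", "1")
        assert os.environ["FAKE_FLAG"] == "1"
    assert "FAKE_FLAG" not in os.environ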
|
@ -255,12 +255,10 @@ def _run_and_validate(
|
|||||||
[NONE, SAMPLE, PROMPT, SAMPLE_PROMPT])
|
[NONE, SAMPLE, PROMPT, SAMPLE_PROMPT])
|
||||||
@pytest.mark.parametrize("temperature", [0.0, 2.0])
|
@pytest.mark.parametrize("temperature", [0.0, 2.0])
|
||||||
def test_get_logprobs_and_prompt_logprobs(
|
def test_get_logprobs_and_prompt_logprobs(
|
||||||
hf_model,
|
hf_model, vllm_model,
|
||||||
vllm_model,
|
batch_logprobs_composition: BatchLogprobsComposition,
|
||||||
batch_logprobs_composition: BatchLogprobsComposition,
|
temperature: float, example_prompts: list[str],
|
||||||
temperature: float,
|
monkeypatch: pytest.MonkeyPatch) -> None:
|
||||||
example_prompts,
|
|
||||||
) -> None:
|
|
||||||
"""Test V1 Engine logprobs & prompt logprobs
|
"""Test V1 Engine logprobs & prompt logprobs
|
||||||
|
|
||||||
Exercise a variety of combinations of `logprobs` and `prompt_logprobs`
|
Exercise a variety of combinations of `logprobs` and `prompt_logprobs`
|
||||||
@ -287,128 +285,140 @@ def test_get_logprobs_and_prompt_logprobs(
|
|||||||
temperature: "temperature" sampling parameter
|
temperature: "temperature" sampling parameter
|
||||||
example_prompts: example prompt fixture
|
example_prompts: example prompt fixture
|
||||||
"""
|
"""
|
||||||
do_apc = vllm_model.model.llm_engine.cache_config.enable_prefix_caching
|
with monkeypatch.context() as m:
|
||||||
if do_apc and (temperature < 2.0
|
m.setenv("VLLM_USE_V1", "1")
|
||||||
or batch_logprobs_composition != SAMPLE_PROMPT):
|
do_apc = vllm_model.model.llm_engine.cache_config.enable_prefix_caching
|
||||||
# Skip some test-cases to save time.
|
if do_apc and (temperature < 2.0
|
||||||
pytest.skip()
|
or batch_logprobs_composition != SAMPLE_PROMPT):
|
||||||
test_prompts = example_prompts
|
# Skip some test-cases to save time.
|
||||||
|
pytest.skip()
|
||||||
|
test_prompts = example_prompts
|
||||||
|
|
||||||
max_tokens = 5
|
max_tokens = 5
|
||||||
hf_outputs = hf_model.generate_greedy(
|
hf_outputs = hf_model.generate_greedy(
|
||||||
test_prompts,
|
test_prompts,
|
||||||
max_tokens=max_tokens,
|
|
||||||
)
|
|
||||||
hf_logprobs = hf_model.generate_greedy_logprobs(
|
|
||||||
test_prompts,
|
|
||||||
max_tokens=max_tokens,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Batch has mixed sample params
|
|
||||||
# (different logprobs/prompt logprobs combos)
|
|
||||||
logprob_prompt_logprob_list = get_test_batch(batch_logprobs_composition)
|
|
||||||
|
|
||||||
# Ensure that each test prompt has a logprob config for testing
|
|
||||||
logprob_prompt_logprob_list = _repeat_logprob_config(
|
|
||||||
test_prompts, logprob_prompt_logprob_list)
|
|
||||||
# Generate SamplingParams
|
|
||||||
vllm_sampling_params = [
|
|
||||||
SamplingParams(max_tokens=max_tokens,
|
|
||||||
logprobs=num_lp,
|
|
||||||
prompt_logprobs=num_plp,
|
|
||||||
temperature=temperature,
|
|
||||||
seed=1984)
|
|
||||||
for num_lp, num_plp in logprob_prompt_logprob_list
|
|
||||||
]
|
|
||||||
for _ in range(2 if do_apc else 1):
|
|
||||||
_run_and_validate(
|
|
||||||
vllm_model=vllm_model,
|
|
||||||
test_prompts=test_prompts,
|
|
||||||
vllm_sampling_params=vllm_sampling_params,
|
|
||||||
hf_logprobs=hf_logprobs,
|
|
||||||
hf_outputs=hf_outputs,
|
|
||||||
logprob_prompt_logprob_list=logprob_prompt_logprob_list,
|
|
||||||
temperature=temperature,
|
|
||||||
max_tokens=max_tokens,
|
max_tokens=max_tokens,
|
||||||
do_apc=do_apc)
|
)
|
||||||
|
hf_logprobs = hf_model.generate_greedy_logprobs(
|
||||||
|
test_prompts,
|
||||||
|
max_tokens=max_tokens,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Batch has mixed sample params
|
||||||
|
# (different logprobs/prompt logprobs combos)
|
||||||
|
logprob_prompt_logprob_list = get_test_batch(
|
||||||
|
batch_logprobs_composition)
|
||||||
|
|
||||||
|
# Ensure that each test prompt has a logprob config for testing
|
||||||
|
logprob_prompt_logprob_list = _repeat_logprob_config(
|
||||||
|
test_prompts, logprob_prompt_logprob_list)
|
||||||
|
# Generate SamplingParams
|
||||||
|
vllm_sampling_params = [
|
||||||
|
SamplingParams(max_tokens=max_tokens,
|
||||||
|
logprobs=num_lp,
|
||||||
|
prompt_logprobs=num_plp,
|
||||||
|
temperature=temperature,
|
||||||
|
seed=1984)
|
||||||
|
for num_lp, num_plp in logprob_prompt_logprob_list
|
||||||
|
]
|
||||||
|
for _ in range(2 if do_apc else 1):
|
||||||
|
_run_and_validate(
|
||||||
|
vllm_model=vllm_model,
|
||||||
|
test_prompts=test_prompts,
|
||||||
|
vllm_sampling_params=vllm_sampling_params,
|
||||||
|
hf_logprobs=hf_logprobs,
|
||||||
|
hf_outputs=hf_outputs,
|
||||||
|
logprob_prompt_logprob_list=logprob_prompt_logprob_list,
|
||||||
|
temperature=temperature,
|
||||||
|
max_tokens=max_tokens,
|
||||||
|
do_apc=do_apc)
|
||||||
|
|
||||||
|
|
||||||
def test_max_logprobs():
|
def test_max_logprobs(monkeypatch: pytest.MonkeyPatch):
|
||||||
"""vLLM v1 engine should fail a request with `logprobs > max_logprobs`
|
"""vLLM v1 engine should fail a request with `logprobs > max_logprobs`
|
||||||
|
|
||||||
Should also fail for `prompt_logprobs > max_logprobs`
|
Should also fail for `prompt_logprobs > max_logprobs`
|
||||||
|
|
||||||
APC should not matter as this test checks basic request validation.
|
APC should not matter as this test checks basic request validation.
|
||||||
|
|
||||||
Args:
|
|
||||||
monkeypatch
|
|
||||||
"""
|
"""
|
||||||
|
with monkeypatch.context() as m:
|
||||||
|
m.setenv("VLLM_USE_V1", "1")
|
||||||
|
|
||||||
runner = VllmRunner("facebook/opt-125m",
|
runner = VllmRunner("facebook/opt-125m",
|
||||||
max_logprobs=1,
|
max_logprobs=1,
|
||||||
enable_prefix_caching=False,
|
enable_prefix_caching=False,
|
||||||
max_model_len=256)
|
max_model_len=256)
|
||||||
vllm_sampling_params = SamplingParams(logprobs=1)
|
vllm_sampling_params = SamplingParams(logprobs=1)
|
||||||
# should pass
|
# should pass
|
||||||
runner.generate(["Hello world"], sampling_params=vllm_sampling_params)
|
runner.generate(["Hello world"], sampling_params=vllm_sampling_params)
|
||||||
|
|
||||||
bad_sampling_params = SamplingParams(logprobs=2)
|
bad_sampling_params = SamplingParams(logprobs=2)
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
runner.generate(["Hello world"], sampling_params=bad_sampling_params)
|
runner.generate(["Hello world"],
|
||||||
|
sampling_params=bad_sampling_params)
|
||||||
|
|
||||||
|
|
||||||
def test_none_logprobs(vllm_model, example_prompts):
|
def test_none_logprobs(vllm_model, example_prompts,
|
||||||
|
monkeypatch: pytest.MonkeyPatch):
|
||||||
"""Engine should return `logprobs` and `prompt_logprobs` as `None`
|
"""Engine should return `logprobs` and `prompt_logprobs` as `None`
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
vllm_model: vLLM model fixture
|
vllm_model: vLLM model fixture
|
||||||
example_prompts: list of example prompts (test fixture)
|
example_prompts: list of example prompts (test fixture)
|
||||||
"""
|
"""
|
||||||
max_tokens = 5
|
with monkeypatch.context() as m:
|
||||||
|
m.setenv("VLLM_USE_V1", "1")
|
||||||
|
max_tokens = 5
|
||||||
|
|
||||||
sampling_params_logprobs_none = SamplingParams(max_tokens=max_tokens,
|
sampling_params_logprobs_none = SamplingParams(
|
||||||
logprobs=None,
|
max_tokens=max_tokens,
|
||||||
prompt_logprobs=None,
|
logprobs=None,
|
||||||
temperature=0.0)
|
prompt_logprobs=None,
|
||||||
results_logprobs_none = vllm_model.model.generate(
|
temperature=0.0,
|
||||||
example_prompts, sampling_params=sampling_params_logprobs_none)
|
)
|
||||||
|
results_logprobs_none = vllm_model.model.generate(
|
||||||
|
example_prompts,
|
||||||
|
sampling_params=sampling_params_logprobs_none,
|
||||||
|
)
|
||||||
|
|
||||||
for i in range(len(results_logprobs_none)):
|
for i in range(len(results_logprobs_none)):
|
||||||
# Check sample logprobs are None
|
# Check sample logprobs are None
|
||||||
assert results_logprobs_none[i].outputs[0].logprobs is None
|
assert results_logprobs_none[i].outputs[0].logprobs is None
|
||||||
assert results_logprobs_none[i].outputs[0].cumulative_logprob is None
|
assert results_logprobs_none[i].outputs[
|
||||||
# Check prompt logprobs are None
|
0].cumulative_logprob is None
|
||||||
assert results_logprobs_none[i].prompt_logprobs is None
|
# Check prompt logprobs are None
|
||||||
|
assert results_logprobs_none[i].prompt_logprobs is None
|
||||||
|
|
||||||
|
|
||||||
def test_zero_logprobs(vllm_model, example_prompts):
|
def test_zero_logprobs(vllm_model, example_prompts,
|
||||||
|
monkeypatch: pytest.MonkeyPatch):
|
||||||
"""Engine should return sampled token and prompt token logprobs
|
"""Engine should return sampled token and prompt token logprobs
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
vllm_model: vLLM model fixture
|
vllm_model: vLLM model fixture
|
||||||
example_prompts: list of example prompts (test fixture)
|
example_prompts: list of example prompts (test fixture)
|
||||||
"""
|
"""
|
||||||
max_tokens = 5
|
with monkeypatch.context() as m:
|
||||||
|
m.setenv("VLLM_USE_V1", "1")
|
||||||
|
max_tokens = 5
|
||||||
|
|
||||||
sampling_params_logprobs_zero = SamplingParams(max_tokens=max_tokens,
|
sampling_params_logprobs_zero = SamplingParams(max_tokens=max_tokens,
|
||||||
logprobs=0,
|
logprobs=0,
|
||||||
prompt_logprobs=0,
|
prompt_logprobs=0,
|
||||||
temperature=0.0)
|
temperature=0.0)
|
||||||
results_logprobs_zero = vllm_model.model.generate(
|
results_logprobs_zero = vllm_model.model.generate(
|
||||||
example_prompts, sampling_params=sampling_params_logprobs_zero)
|
example_prompts, sampling_params=sampling_params_logprobs_zero)
|
||||||
|
|
||||||
for i in range(len(results_logprobs_zero)):
|
for i in range(len(results_logprobs_zero)):
|
||||||
# Check that there is one sample logprob dict for each
|
# Check that there is one sample logprob dict for each
|
||||||
# sample token
|
# sample token
|
||||||
logprobs = results_logprobs_zero[i].outputs[0].logprobs
|
logprobs = results_logprobs_zero[i].outputs[0].logprobs
|
||||||
prompt_logprobs = results_logprobs_zero[i].prompt_logprobs
|
prompt_logprobs = results_logprobs_zero[i].prompt_logprobs
|
||||||
sampled_token_ids = results_logprobs_zero[i].outputs[0].token_ids
|
sampled_token_ids = results_logprobs_zero[i].outputs[0].token_ids
|
||||||
prompt_token_ids = results_logprobs_zero[i].prompt_token_ids
|
prompt_token_ids = results_logprobs_zero[i].prompt_token_ids
|
||||||
assert logprobs is not None
|
assert logprobs is not None
|
||||||
assert len(sampled_token_ids) == len(logprobs)
|
assert len(sampled_token_ids) == len(logprobs)
|
||||||
assert results_logprobs_zero[i].outputs[
|
assert results_logprobs_zero[i].outputs[
|
||||||
0].cumulative_logprob is not None
|
0].cumulative_logprob is not None
|
||||||
# Check that there is one prompt logprob dict for each
|
# Check that there is one prompt logprob dict for each
|
||||||
# prompt token
|
# prompt token
|
||||||
assert prompt_logprobs is not None
|
assert prompt_logprobs is not None
|
||||||
assert len(prompt_token_ids) == len(prompt_logprobs)
|
assert len(prompt_token_ids) == len(prompt_logprobs)
|
||||||
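The max_logprobs case above is pure request validation: asking for more logprobs than the engine was configured to keep should raise ValueError before any generation happens. A condensed sketch of that behaviour, assuming a GPU environment comparable to the one the suite runs on:

import pytest
from vllm import LLM, SamplingParams


def test_logprobs_over_limit():
    llm = LLM(model="facebook/opt-125m", max_logprobs=1, max_model_len=256)
    # A request at the configured limit is accepted...
    llm.generate(["Hello world"], SamplingParams(logprobs=1))
    # ...while exceeding it is rejected up front.
    with pytest.raises(ValueError):
        llm.generate(["Hello world"], SamplingParams(logprobs=2))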
|
@ -3,11 +3,16 @@
|
|||||||
|
|
||||||
Run `pytest tests/v1/tpu/test_basic.py`.
|
Run `pytest tests/v1/tpu/test_basic.py`.
|
||||||
"""
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
from ...conftest import VllmRunner
|
if TYPE_CHECKING:
|
||||||
|
from tests.conftest import VllmRunner
|
||||||
|
|
||||||
MODELS = [
|
MODELS = [
|
||||||
# "Qwen/Qwen2-7B-Instruct",
|
# "Qwen/Qwen2-7B-Instruct",
|
||||||
@ -28,7 +33,8 @@ TENSOR_PARALLEL_SIZES = [1]
|
|||||||
@pytest.mark.parametrize("enforce_eager", [True])
|
@pytest.mark.parametrize("enforce_eager", [True])
|
||||||
@pytest.mark.parametrize("tensor_parallel_size", TENSOR_PARALLEL_SIZES)
|
@pytest.mark.parametrize("tensor_parallel_size", TENSOR_PARALLEL_SIZES)
|
||||||
def test_models(
|
def test_models(
|
||||||
monkeypatch,
|
vllm_runner: type[VllmRunner],
|
||||||
|
monkeypatch: pytest.MonkeyPatch,
|
||||||
model: str,
|
model: str,
|
||||||
max_tokens: int,
|
max_tokens: int,
|
||||||
enforce_eager: bool,
|
enforce_eager: bool,
|
||||||
@ -41,7 +47,7 @@ def test_models(
|
|||||||
with monkeypatch.context() as m:
|
with monkeypatch.context() as m:
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
m.setenv("VLLM_USE_V1", "1")
|
||||||
|
|
||||||
with VllmRunner(
|
with vllm_runner(
|
||||||
model,
|
model,
|
||||||
max_model_len=8192,
|
max_model_len=8192,
|
||||||
enforce_eager=enforce_eager,
|
enforce_eager=enforce_eager,
|
||||||
@ -50,5 +56,5 @@ def test_models(
|
|||||||
tensor_parallel_size=tensor_parallel_size) as vllm_model:
|
tensor_parallel_size=tensor_parallel_size) as vllm_model:
|
||||||
vllm_outputs = vllm_model.generate_greedy(example_prompts,
|
vllm_outputs = vllm_model.generate_greedy(example_prompts,
|
||||||
max_tokens)
|
max_tokens)
|
||||||
output = vllm_outputs[0][1]
|
output = vllm_outputs[0][1]
|
||||||
assert "1024" in output
|
assert "1024" in output
|
||||||