"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
|
|
|
|
The second test will hang if more than one test is run per command, so we need
|
|
|
|
to run the tests one by one. The solution is to pass arguments (model name) by
|
|
|
|
environment variables.
|
|
|
|
|
|
|
|
Run:
|
|
|
|
```sh
|
|
|
|
TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf \
|
|
|
|
    pytest -s -v test_multimodal_broadcast.py
TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct \
    pytest -s -v test_multimodal_broadcast.py
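# Optionally, override the distributed executor backend as well; the test
# reads DISTRIBUTED_EXECUTOR_BACKEND below ("ray" and "mp" are the vLLM
# backends this is expected to select):
DISTRIBUTED_EXECUTOR_BACKEND=ray \
TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf \
    pytest -s -v test_multimodal_broadcast.py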
```
"""
import os

import pytest

from vllm.utils import cuda_device_count_stateless

# The model under test is chosen at import time via TEST_DIST_MODEL (see the
# module docstring), so each pytest invocation covers exactly one model.
model = os.environ["TEST_DIST_MODEL"]

# Reuse the model list and `run_test` harness from the single-GPU model tests
# so the distributed run performs the same HF-vs-vLLM comparison.
if model.startswith("llava-hf/llava"):
    from ..models.test_llava import models, run_test
elif model.startswith("microsoft/Phi-3-vision"):
    from ..models.test_phi3v import models, run_test
else:
    raise NotImplementedError(f"Unsupported model: {model}")


@pytest.mark.parametrize("tensor_parallel_size", [2])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(hf_runner, vllm_runner, image_assets,
                tensor_parallel_size: int, dtype: str, max_tokens: int,
                num_logprobs: int) -> None:
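    # NOTE: launch this test once per TEST_DIST_MODEL value; running both
    # models in a single pytest command may hang (see the module docstring).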
    # Skip (rather than fail) when the host does not have enough GPUs;
    # cuda_device_count_stateless avoids torch's cached device count going
    # stale if CUDA_VISIBLE_DEVICES changes.
    if cuda_device_count_stateless() < tensor_parallel_size:
        pytest.skip(
            f"Need at least {tensor_parallel_size} GPUs to run the test.")

    # Optionally pick the distributed executor backend (e.g. "ray" or "mp");
    # when unset, vLLM falls back to its default choice.
    distributed_executor_backend = os.getenv("DISTRIBUTED_EXECUTOR_BACKEND")

    # Delegate to the shared harness, which compares HF and vLLM greedy
    # outputs (with logprobs) for the selected model.
    run_test(
        hf_runner,
        vllm_runner,
        image_assets,
        model=models[0],
        size_factors=[1.0],
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tensor_parallel_size=tensor_parallel_size,
        distributed_executor_backend=distributed_executor_backend,
    )