vllm/tests/distributed/test_basic_distributed_correctness.py

"""Compare the outputs of HF and distributed vLLM when using greedy sampling.

vLLM will allocate all the available GPU memory, so the tests need to be run
one at a time. The model name is therefore passed in through an environment
variable.

Run:
```sh
TEST_DIST_MODEL=facebook/opt-125m pytest \
    test_basic_distributed_correctness.py
TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest \
    test_basic_distributed_correctness.py
```
"""
import os

import pytest
import torch
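
# The model under test is passed in through the TEST_DIST_MODEL environment
# variable so that each model runs in a separate pytest invocation.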
MODELS = [
    os.environ["TEST_DIST_MODEL"],
]
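
# Name of the environment variable used to select vLLM's attention backend.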
VLLM_ATTENTION_BACKEND = "VLLM_ATTENTION_BACKEND"


@pytest.mark.skipif(torch.cuda.device_count() < 2,
                    reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [5])
def test_models(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
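    # Force eager execution when the FlashInfer attention backend is selected
    # via the VLLM_ATTENTION_BACKEND environment variable.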
    enforce_eager = False
    backend_by_env_var = os.getenv(VLLM_ATTENTION_BACKEND)
    if backend_by_env_var == "FLASHINFER":
        enforce_eager = True
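
    # Generate reference completions with the HuggingFace model first, then
    # delete it to free GPU memory before loading the model into vLLM.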
    hf_model = hf_runner(model, dtype=dtype)
    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
    del hf_model
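
    # Run the same prompts through vLLM with tensor parallelism across the
    # two GPUs, then release the model.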
    vllm_model = vllm_runner(model,
                             dtype=dtype,
                             tensor_parallel_size=2,
                             enforce_eager=enforce_eager)
    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
    del vllm_model
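
    # With greedy sampling both implementations should produce identical
    # strings and token IDs for every prompt.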
    for i in range(len(example_prompts)):
        hf_output_ids, hf_output_str = hf_outputs[i]
        vllm_output_ids, vllm_output_str = vllm_outputs[i]
        assert hf_output_str == vllm_output_str, (
            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
        assert hf_output_ids == vllm_output_ids, (
            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")