vllm/tests/distributed/test_chunked_prefill_distributed.py

"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
vLLM will allocate all the available memory, so we need to run the tests one
by one. The solution is to pass arguments (model name) by environment
variables.

Run:
```sh
TEST_DIST_MODEL=facebook/opt-125m pytest \
    test_chunked_prefill_distributed.py
TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf \
    test_chunked_prefill_distributed.py
```
"""
import os

import pytest
import torch

MODELS = [
    os.environ["TEST_DIST_MODEL"],
]
DISTRIBUTED_EXECUTOR_BACKEND = "DISTRIBUTED_EXECUTOR_BACKEND"


@pytest.mark.skipif(torch.cuda.device_count() < 2,
                    reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("chunked_prefill_token_size", [16])
def test_models(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
    chunked_prefill_token_size: int,
) -> None:
    distributed_executor_backend = os.getenv(DISTRIBUTED_EXECUTOR_BACKEND)

    # Add a chunked prefill config.
    max_num_seqs = min(chunked_prefill_token_size, 256)
    assert chunked_prefill_token_size != -1
    enable_chunked_prefill = True
    max_num_batched_tokens = chunked_prefill_token_size

    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

    vllm_model = vllm_runner(
        model,
        dtype=dtype,
        tensor_parallel_size=2,
        max_num_seqs=max_num_seqs,
        enable_chunked_prefill=enable_chunked_prefill,
        max_num_batched_tokens=max_num_batched_tokens,
        distributed_executor_backend=distributed_executor_backend,
    )
    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
    del vllm_model

    for i in range(len(example_prompts)):
        hf_output_ids, hf_output_str = hf_outputs[i]
        vllm_output_ids, vllm_output_str = vllm_outputs[i]
        assert hf_output_str == vllm_output_str, (
            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
        assert hf_output_ids == vllm_output_ids, (
            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
[Core][5/N] Fully working chunked prefill e2e (#3884) 2024-04-11 09:56:48 +09:00			`"""Compare the outputs of HF and distributed vLLM when using greedy sampling.`
			`vLLM will allocate all the available memory, so we need to run the tests one`
			`by one. The solution is to pass arguments (model name) by environment`
			`variables.`

			`Run:`
			```sh
			`TEST_DIST_MODEL=facebook/opt-125m pytest \`
			`test_chunked_prefill_distributed.py`
			`TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf \`
			`test_chunked_prefill_distributed.py`
			```
			`"""`
			`import os`

			`import pytest`
			`import torch`

			`MODELS = [`
			`os.environ["TEST_DIST_MODEL"],`
			`]`
[Core] Add MultiprocessingGPUExecutor (#4539) Co-authored-by: SAHIL SUNEJA <suneja@us.ibm.com> 2024-05-14 10:38:59 -07:00			`DISTRIBUTED_EXECUTOR_BACKEND = "DISTRIBUTED_EXECUTOR_BACKEND"`
[Core][5/N] Fully working chunked prefill e2e (#3884) 2024-04-11 09:56:48 +09:00

			`@pytest.mark.skipif(torch.cuda.device_count() < 2,`
			`reason="Need at least 2 GPUs to run the test.")`
			`@pytest.mark.parametrize("model", MODELS)`
			`@pytest.mark.parametrize("dtype", ["half"])`
			`@pytest.mark.parametrize("max_tokens", [5])`
			`@pytest.mark.parametrize("chunked_prefill_token_size", [16])`
			`def test_models(`
			`hf_runner,`
			`vllm_runner,`
			`example_prompts,`
			`model: str,`
			`dtype: str,`
			`max_tokens: int,`
			`chunked_prefill_token_size: int,`
			`) -> None:`
[Core] Add MultiprocessingGPUExecutor (#4539) Co-authored-by: SAHIL SUNEJA <suneja@us.ibm.com> 2024-05-14 10:38:59 -07:00			`distributed_executor_backend = os.getenv(DISTRIBUTED_EXECUTOR_BACKEND)`

[Core][5/N] Fully working chunked prefill e2e (#3884) 2024-04-11 09:56:48 +09:00			`# Add a chunked prefill config.`
			`max_num_seqs = min(chunked_prefill_token_size, 256)`
			`assert chunked_prefill_token_size != -1`
			`enable_chunked_prefill = True`
			`max_num_batched_tokens = chunked_prefill_token_size`

[CI/Test] improve robustness of test (hf_runner) (#5347) [CI/Test] improve robustness of test by replacing del with context manager (hf_runner) (#5347) 2024-06-07 22:31:32 -07:00			`with hf_runner(model, dtype=dtype) as hf_model:`
			`hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)`
[Core][5/N] Fully working chunked prefill e2e (#3884) 2024-04-11 09:56:48 +09:00
			`vllm_model = vllm_runner(`
			`model,`
			`dtype=dtype,`
			`tensor_parallel_size=2,`
			`max_num_seqs=max_num_seqs,`
			`enable_chunked_prefill=enable_chunked_prefill,`
			`max_num_batched_tokens=max_num_batched_tokens,`
[Core] Add MultiprocessingGPUExecutor (#4539) Co-authored-by: SAHIL SUNEJA <suneja@us.ibm.com> 2024-05-14 10:38:59 -07:00			`distributed_executor_backend=distributed_executor_backend,`
[Core][5/N] Fully working chunked prefill e2e (#3884) 2024-04-11 09:56:48 +09:00			`)`
			`vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)`
			`del vllm_model`

			`for i in range(len(example_prompts)):`
			`hf_output_ids, hf_output_str = hf_outputs[i]`
			`vllm_output_ids, vllm_output_str = vllm_outputs[i]`
			`assert hf_output_str == vllm_output_str, (`
			`f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")`
			`assert hf_output_ids == vllm_output_ids, (`
			`f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")`