vllm/tests/distributed/test_chunked_prefill_distributed.py

"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
vLLM will allocate all the available memory, so we need to run the tests one
by one. The solution is to pass arguments (model name) by environment
variables.

Run:
```sh
TEST_DIST_MODEL=facebook/opt-125m pytest \
    test_chunked_prefill_distributed.py
TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest \
    test_chunked_prefill_distributed.py
```
"""
import os

import pytest

from vllm.utils import cuda_device_count_stateless

from ..models.utils import check_outputs_equal

# The model under test is selected through the TEST_DIST_MODEL environment
# variable; the lookup raises KeyError at import time when it is unset.
MODELS = [os.environ["TEST_DIST_MODEL"]]

# Name of the environment variable consulted to choose the distributed
# executor backend (its value is read with os.getenv, so it may be unset).
DISTRIBUTED_EXECUTOR_BACKEND = "DISTRIBUTED_EXECUTOR_BACKEND"


@pytest.mark.skipif(cuda_device_count_stateless() < 2,
                    reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("chunked_prefill_token_size", [16])
def test_models(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
    chunked_prefill_token_size: int,
) -> None:
    """Check greedy outputs of 2-way tensor-parallel vLLM against HF.

    vLLM is started with chunked prefill enabled and a per-step token
    budget of ``chunked_prefill_token_size``; its greedy generations for
    ``example_prompts`` are then compared with those of the HF runner.

    Args:
        hf_runner: Fixture used as a context manager yielding the HF model.
        vllm_runner: Fixture used as a context manager yielding the vLLM
            model.
        example_prompts: Prompts passed to both runners.
        model: Model name (taken from ``MODELS``).
        dtype: Dtype string forwarded to both runners.
        max_tokens: Number of tokens to generate per prompt.
        chunked_prefill_token_size: Value used for
            ``max_num_batched_tokens`` and, capped at 256, for
            ``max_num_seqs``.
    """
    # Chunked prefill is always on here, so -1 is not an acceptable budget.
    assert chunked_prefill_token_size != -1

    # Collect every engine option in one place; the executor backend comes
    # from the environment and is None when the variable is not set.
    engine_kwargs = dict(
        dtype=dtype,
        tensor_parallel_size=2,
        max_num_seqs=min(chunked_prefill_token_size, 256),
        enable_chunked_prefill=True,
        max_num_batched_tokens=chunked_prefill_token_size,
        distributed_executor_backend=os.getenv(DISTRIBUTED_EXECUTOR_BACKEND),
    )

    # NOTE: ordering matters -- vLLM must run before HF. vLLM needs a fresh
    # process in which CUDA has not been initialized yet; running HF first
    # would initialize CUDA and break the multiprocessing backend when it
    # uses the fork start method (the default).
    with vllm_runner(model, **engine_kwargs) as engine:
        generated_by_vllm = engine.generate_greedy(example_prompts,
                                                   max_tokens)

    with hf_runner(model, dtype=dtype) as reference:
        generated_by_hf = reference.generate_greedy(example_prompts,
                                                    max_tokens)

    check_outputs_equal(
        outputs_0_lst=generated_by_hf,
        outputs_1_lst=generated_by_vllm,
        name_0="hf",
        name_1="vllm",
    )
[Core][5/N] Fully working chunked prefill e2e (#3884) 2024-04-11 09:56:48 +09:00			`"""Compare the outputs of HF and distributed vLLM when using greedy sampling.`
			`vLLM will allocate all the available memory, so we need to run the tests one`
			`by one. The solution is to pass arguments (model name) by environment`
			`variables.`

			`Run:`
			```sh
			`TEST_DIST_MODEL=facebook/opt-125m pytest \`
			`test_chunked_prefill_distributed.py`
			`TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf \`
			`test_chunked_prefill_distributed.py`
			```
			`"""`
			`import os`

			`import pytest`
[ci][distributed] fix device count call [ci][distributed] fix some cuda init that makes it necessary to use spawn (#5991) 2024-06-30 01:06:13 -07:00
			`from vllm.utils import cuda_device_count_stateless`
[Core][5/N] Fully working chunked prefill e2e (#3884) 2024-04-11 09:56:48 +09:00
[CI/Build] Reuse code for checking output consistency (#5988) 2024-06-30 11:44:25 +08:00			`from ..models.utils import check_outputs_equal`

[Core][5/N] Fully working chunked prefill e2e (#3884) 2024-04-11 09:56:48 +09:00			`MODELS = [`
			`os.environ["TEST_DIST_MODEL"],`
			`]`
[Core] Add MultiprocessingGPUExecutor (#4539) Co-authored-by: SAHIL SUNEJA <suneja@us.ibm.com> 2024-05-14 10:38:59 -07:00			`DISTRIBUTED_EXECUTOR_BACKEND = "DISTRIBUTED_EXECUTOR_BACKEND"`
[Core][5/N] Fully working chunked prefill e2e (#3884) 2024-04-11 09:56:48 +09:00

[ci][distributed] fix device count call [ci][distributed] fix some cuda init that makes it necessary to use spawn (#5991) 2024-06-30 01:06:13 -07:00			`@pytest.mark.skipif(cuda_device_count_stateless() < 2,`
[Core][5/N] Fully working chunked prefill e2e (#3884) 2024-04-11 09:56:48 +09:00			`reason="Need at least 2 GPUs to run the test.")`
			`@pytest.mark.parametrize("model", MODELS)`
			`@pytest.mark.parametrize("dtype", ["half"])`
			`@pytest.mark.parametrize("max_tokens", [5])`
			`@pytest.mark.parametrize("chunked_prefill_token_size", [16])`
			`def test_models(`
			`hf_runner,`
			`vllm_runner,`
			`example_prompts,`
			`model: str,`
			`dtype: str,`
			`max_tokens: int,`
			`chunked_prefill_token_size: int,`
			`) -> None:`
[Core] Add MultiprocessingGPUExecutor (#4539) Co-authored-by: SAHIL SUNEJA <suneja@us.ibm.com> 2024-05-14 10:38:59 -07:00			`distributed_executor_backend = os.getenv(DISTRIBUTED_EXECUTOR_BACKEND)`

[Core][5/N] Fully working chunked prefill e2e (#3884) 2024-04-11 09:56:48 +09:00			`# Add a chunked prefill config.`
			`max_num_seqs = min(chunked_prefill_token_size, 256)`
			`assert chunked_prefill_token_size != -1`
			`enable_chunked_prefill = True`
			`max_num_batched_tokens = chunked_prefill_token_size`

[ci][distributed] fix device count call [ci][distributed] fix some cuda init that makes it necessary to use spawn (#5991) 2024-06-30 01:06:13 -07:00			`# NOTE: take care of the order. run vLLM first, and then run HF.`
			`# vLLM needs a fresh new process without cuda initialization.`
			`# if we run HF first, the cuda initialization will be done and it`
			`# will hurt multiprocessing backend with fork method (the default method).`
[Core][5/N] Fully working chunked prefill e2e (#3884) 2024-04-11 09:56:48 +09:00
[CI/Test] improve robustness of test (vllm_runner) (#5357) [CI/Test] improve robustness of test by replacing del with context manager (vllm_runner) (#5357) 2024-06-08 01:59:20 -07:00			`with vllm_runner(`
			`model,`
			`dtype=dtype,`
			`tensor_parallel_size=2,`
			`max_num_seqs=max_num_seqs,`
			`enable_chunked_prefill=enable_chunked_prefill,`
			`max_num_batched_tokens=max_num_batched_tokens,`
			`distributed_executor_backend=distributed_executor_backend,`
			`) as vllm_model:`
			`vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)`
[Core][5/N] Fully working chunked prefill e2e (#3884) 2024-04-11 09:56:48 +09:00
[ci][distributed] fix device count call [ci][distributed] fix some cuda init that makes it necessary to use spawn (#5991) 2024-06-30 01:06:13 -07:00			`with hf_runner(model, dtype=dtype) as hf_model:`
			`hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)`

[CI/Build] Reuse code for checking output consistency (#5988) 2024-06-30 11:44:25 +08:00			`check_outputs_equal(`
			`outputs_0_lst=hf_outputs,`
			`outputs_1_lst=vllm_outputs,`
			`name_0="hf",`
			`name_1="vllm",`
			`)`