vllm/tests/compile/test_full_graph_multi_gpu.py

import pytest

from vllm.compilation.backends import vllm_backend
from vllm.utils import cuda_device_count_stateless

from ..utils import fork_new_process_for_each_test
from .utils import TEST_MODELS_SMOKE, check_full_graph_support


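# Each (model, model_kwargs) entry in TEST_MODELS_SMOKE is exercised with
# tensor parallelism across two GPUs, once with the "eager" backend and once
# with vllm_backend. fork_new_process_for_each_test runs each parametrization
# in a freshly forked process, keeping CUDA state isolated between cases.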
@pytest.mark.parametrize("model_info", TEST_MODELS_SMOKE)
@pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize("backend", ["eager", vllm_backend])
@fork_new_process_for_each_test
def test_full_graph_multi_gpu(model_info, tp_size, backend):
    model, model_kwargs = model_info

    # Skip the test if there are not enough CUDA devices.
    if cuda_device_count_stateless() < tp_size:
        pytest.skip("Not enough CUDA devices for the test.")

    # Verify that the model runs with full-graph compilation under the given
    # backend and tensor-parallel size.
    check_full_graph_support(model, model_kwargs, backend, tp_size=tp_size)
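
# A typical invocation from a vLLM checkout on a machine with at least two
# GPUs (illustrative; the exact path depends on where the tests live in your
# tree):
#     pytest tests/compile/test_full_graph_multi_gpu.py -v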