vllm/tests/distributed/test_pipeline_parallel.py

"""
WARNING: This test runs in both single-node (4 GPUs) and multi-node
 (2 nodes with 2 GPUs each) modes. If the test only uses 2 GPUs, it is
 important to set the distributed backend to "mp" to avoid Ray scheduling
 all workers in a node other than the head node, which can cause the test
 to fail.
"""
import os

import pytest

from vllm.logger import init_logger

from ..utils import compare_two_settings, fork_new_process_for_each_test

# Module-level logger; the test below uses it to record soft failures of the
# Ray ADAG configuration instead of failing the test.
logger = init_logger("test_pipeline_parallel")

# True only when the VLLM_MULTI_NODE environment variable is exactly "1"
# (defaults to "0"). The test below skips the "mp" backend cases when set.
VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"


@pytest.mark.parametrize(("TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, "
                          "MODEL_NAME, DIST_BACKEND"),
                         [
                             (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "mp"),
                             (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
                             (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
                             (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "mp"),
                             (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
                             (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
                             (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray"),
                             (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
                             (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
                             (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray"),
                         ])
@fork_new_process_for_each_test
def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME,
                    DIST_BACKEND):
    """Compare a pipeline-parallel setting with a tensor-parallel baseline.

    Two command-line argument lists are built for the same model and handed
    to ``compare_two_settings``: one with pipeline parallelism enabled and
    one using tensor parallelism only (always on the "mp" backend).

    Args:
        TP_SIZE: Tensor-parallel size of the pipeline-parallel setting.
        PP_SIZE: Pipeline-parallel size of the pipeline-parallel setting.
        EAGER_MODE: If truthy, ``--enforce-eager`` is added to both settings.
        CHUNKED_PREFILL: If truthy, ``--enable-chunked-prefill`` is added to
            both settings.
        MODEL_NAME: Model identifier forwarded to ``compare_two_settings``.
        DIST_BACKEND: Distributed executor backend of the pipeline-parallel
            setting ("mp" or "ray").

    The "mp" cases are skipped when ``VLLM_MULTI_NODE`` is set. Failures of
    the Ray ADAG configuration are logged rather than raised.
    """
    if VLLM_MULTI_NODE and DIST_BACKEND == "mp":
        pytest.skip("Skipping multi-node pipeline parallel test for "
                    "multiprocessing distributed backend")

    # Both settings must run with the same dtype. Otherwise a difference
    # between them could come from numeric precision (float16 vs. bfloat16)
    # rather than from the parallelism strategy under test.
    dtype = "float16"

    pp_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        dtype,
        "--pipeline-parallel-size",
        str(PP_SIZE),
        "--tensor-parallel-size",
        str(TP_SIZE),
        "--distributed-executor-backend",
        DIST_BACKEND,
    ]

    # compare without pipeline parallelism
    # NOTE: use mp backend for TP
    # PP tests might involve multiple nodes, and ray might
    #  schedule all workers in a node other than the head node,
    #  which can cause the test to fail.
    tp_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        dtype,
        "--tensor-parallel-size",
        str(max(TP_SIZE, 2)),  # We only use 2 GPUs in the CI.
        "--distributed-executor-backend",
        "mp",
    ]

    # Optional flags are always applied to both settings so that the
    # parallelism layout stays the only intended difference.
    if CHUNKED_PREFILL:
        pp_args.append("--enable-chunked-prefill")
        tp_args.append("--enable-chunked-prefill")
    if EAGER_MODE:
        pp_args.append("--enforce-eager")
        tp_args.append("--enforce-eager")

    pp_env = None
    if (DIST_BACKEND == "ray" and TP_SIZE == 2 and PP_SIZE == 2
            and CHUNKED_PREFILL):
        # Test Ray ADAG for a subset of the tests
        pp_env = {
            "VLLM_USE_RAY_COMPILED_DAG": "1",
            "VLLM_USE_RAY_SPMD_WORKER": "1",
            "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
        }

    try:
        compare_two_settings(MODEL_NAME, pp_args, tp_args, pp_env)
    except Exception:
        if pp_env is None:
            # Regular configurations must pass; propagate the failure.
            raise
        # Ray ADAG tests are flaky, so we don't want to fail the test.
        # The traceback is still logged so the failure stays visible.
        logger.exception("Ray ADAG tests failed")
[ci][distributed] fix flaky tests (#6806) 2024-07-25 17:44:09 -07:00			`"""`
			`WARNING: This test runs in both single-node (4 GPUs) and multi-node`
			`(2 node with 2 GPUs each) modes. If the test only uses 2 GPUs, it is`
			`important to set the distributed backend to "mp" to avoid Ray scheduling`
			`all workers in a node other than the head node, which can cause the test`
			`to fail.`
			`"""`
[Core] Multiprocessing Pipeline Parallel support (#6130) Co-authored-by: Murali Andoorveedu <muralidhar.andoorveedu@centml.ai> 2024-07-18 19:15:52 -07:00			`import os`

[Core] Pipeline Parallel Support (#4412) Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai> 2024-07-02 10:58:08 -07:00			`import pytest`

[ci/test] rearrange tests and make adag test soft fail (#7572) 2024-08-15 19:39:04 -07:00			`from vllm.logger import init_logger`

[ci][distributed] try to fix pp test (#7054) 2024-08-01 22:03:12 -07:00			`from ..utils import compare_two_settings, fork_new_process_for_each_test`
[Core] Pipeline Parallel Support (#4412) Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai> 2024-07-02 10:58:08 -07:00
[ci/test] rearrange tests and make adag test soft fail (#7572) 2024-08-15 19:39:04 -07:00			`logger = init_logger("test_pipeline_parallel")`

[Core] Multiprocessing Pipeline Parallel support (#6130) Co-authored-by: Murali Andoorveedu <muralidhar.andoorveedu@centml.ai> 2024-07-18 19:15:52 -07:00			`VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"`

[Core] Pipeline Parallel Support (#4412) Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai> 2024-07-02 10:58:08 -07:00
[ci][distributed] disable ray dag tests (#7099) 2024-08-02 22:32:04 -07:00			`@pytest.mark.parametrize(("TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, "`
			`"MODEL_NAME, DIST_BACKEND"),`
			`[`
			`(2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "mp"),`
			`(2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"),`
			`(1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"),`
			`(1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "mp"),`
			`(1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"),`
[ci/test] rearrange tests and make adag test soft fail (#7572) 2024-08-15 19:39:04 -07:00			`(1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),`
			`(1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray"),`
			`(1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),`
			`(2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),`
			`(2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray"),`
[ci][distributed] disable ray dag tests (#7099) 2024-08-02 22:32:04 -07:00			`])`
[ci/test] rearrange tests and make adag test soft fail (#7572) 2024-08-15 19:39:04 -07:00			`@fork_new_process_for_each_test`
[Core] Multiprocessing Pipeline Parallel support (#6130) Co-authored-by: Murali Andoorveedu <muralidhar.andoorveedu@centml.ai> 2024-07-18 19:15:52 -07:00			`def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME,`
[ci][distributed] disable ray dag tests (#7099) 2024-08-02 22:32:04 -07:00			`DIST_BACKEND):`
[Core] Multiprocessing Pipeline Parallel support (#6130) Co-authored-by: Murali Andoorveedu <muralidhar.andoorveedu@centml.ai> 2024-07-18 19:15:52 -07:00			`if VLLM_MULTI_NODE and DIST_BACKEND == "mp":`
			`pytest.skip("Skipping multi-node pipeline parallel test for "`
			`"multiprocessing distributed backend")`
[Model] Pipeline parallel support for Mixtral (#6516) 2024-07-17 19:26:04 -07:00
[ci][distributed] add pipeline parallel correctness test (#6410) 2024-07-16 15:44:22 -07:00			`pp_args = [`
[Core] Pipeline Parallel Support (#4412) Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai> 2024-07-02 10:58:08 -07:00			`# use half precision for speed and memory savings in CI environment`
			`"--dtype",`
[Core] Multiprocessing Pipeline Parallel support (#6130) Co-authored-by: Murali Andoorveedu <muralidhar.andoorveedu@centml.ai> 2024-07-18 19:15:52 -07:00			`"float16",`
[Core] Pipeline Parallel Support (#4412) Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai> 2024-07-02 10:58:08 -07:00			`"--pipeline-parallel-size",`
			`str(PP_SIZE),`
			`"--tensor-parallel-size",`
			`str(TP_SIZE),`
			`"--distributed-executor-backend",`
[Core] Multiprocessing Pipeline Parallel support (#6130) Co-authored-by: Murali Andoorveedu <muralidhar.andoorveedu@centml.ai> 2024-07-18 19:15:52 -07:00			`DIST_BACKEND,`
[Core] Pipeline Parallel Support (#4412) Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai> 2024-07-02 10:58:08 -07:00			`]`
[ci][distributed] add pipeline parallel correctness test (#6410) 2024-07-16 15:44:22 -07:00
			`# compare without pipeline parallelism`
			`# NOTE: use mp backend for TP`
			`# PP tests might involve multiple nodes, and ray might`
			`# schedule all workers in a node other than the head node,`
			`# which can cause the test to fail.`
			`tp_args = [`
			`# use half precision for speed and memory savings in CI environment`
			`"--dtype",`
			`"bfloat16",`
			`"--tensor-parallel-size",`
[Model] Pipeline parallel support for Mixtral (#6516) 2024-07-17 19:26:04 -07:00			`str(max(TP_SIZE, 2)), # We only use 2 GPUs in the CI.`
[ci][distributed] add pipeline parallel correctness test (#6410) 2024-07-16 15:44:22 -07:00			`"--distributed-executor-backend",`
			`"mp",`
			`]`
[Core] Pipeline Parallel Support (#4412) Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai> 2024-07-02 10:58:08 -07:00			`if CHUNKED_PREFILL:`
[ci][distributed] add pipeline parallel correctness test (#6410) 2024-07-16 15:44:22 -07:00			`pp_args.append("--enable-chunked-prefill")`
			`tp_args.append("--enable-chunked-prefill")`
[Core] Pipeline Parallel Support (#4412) Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai> 2024-07-02 10:58:08 -07:00			`if EAGER_MODE:`
[ci][distributed] add pipeline parallel correctness test (#6410) 2024-07-16 15:44:22 -07:00			`pp_args.append("--enforce-eager")`
			`tp_args.append("--enforce-eager")`
[Core] Pipeline parallel with Ray ADAG (#6837) Support pipeline-parallelism with Ray accelerated DAG. Signed-off-by: Rui Qiao <ruisearch42@gmail.com> 2024-08-02 13:55:40 -07:00			`pp_env = None`
[Core] Shut down aDAG workers with clean async llm engine exit (#7224) Signed-off-by: Rui Qiao <ruisearch42@gmail.com> 2024-08-12 17:57:16 -07:00			`if (DIST_BACKEND == "ray" and TP_SIZE == 2 and PP_SIZE == 2`
			`and CHUNKED_PREFILL):`
			`# Test Ray ADAG for a subset of the tests`
[Core] Pipeline parallel with Ray ADAG (#6837) Support pipeline-parallelism with Ray accelerated DAG. Signed-off-by: Rui Qiao <ruisearch42@gmail.com> 2024-08-02 13:55:40 -07:00			`pp_env = {`
			`"VLLM_USE_RAY_COMPILED_DAG": "1",`
			`"VLLM_USE_RAY_SPMD_WORKER": "1",`
[Core] Shut down aDAG workers with clean async llm engine exit (#7224) Signed-off-by: Rui Qiao <ruisearch42@gmail.com> 2024-08-12 17:57:16 -07:00			`"VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",`
[Core] Pipeline parallel with Ray ADAG (#6837) Support pipeline-parallelism with Ray accelerated DAG. Signed-off-by: Rui Qiao <ruisearch42@gmail.com> 2024-08-02 13:55:40 -07:00			`}`
[ci][distributed] add pipeline parallel correctness test (#6410) 2024-07-16 15:44:22 -07:00
[ci/test] rearrange tests and make adag test soft fail (#7572) 2024-08-15 19:39:04 -07:00			`try:`
			`compare_two_settings(MODEL_NAME, pp_args, tp_args, pp_env)`
			`except Exception:`
			`if pp_env is None:`
			`raise`
			`else:`
			`# Ray ADAG tests are flaky, so we don't want to fail the test`
			`logger.exception("Ray ADAG tests failed")`