"""
|
|
|
|
WARNING: This test runs in both single-node (4 GPUs) and multi-node
|
|
|
|
(2 node with 2 GPUs each) modes. If the test only uses 2 GPUs, it is
|
|
|
|
important to set the distributed backend to "mp" to avoid Ray scheduling
|
|
|
|
all workers in a node other than the head node, which can cause the test
|
|
|
|
to fail.
|
|
|
|
"""
import os

import pytest

from ..utils import compare_two_settings, fork_new_process_for_each_test

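# Set VLLM_MULTI_NODE=1 in the environment to run the multi-node variants
# described in the module docstring; everything else runs single-node.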
VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
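

# Each case below is (TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL,
# MODEL_NAME, DIST_BACKEND, USE_RAY_ADAG, USE_RAY_ADAG_NCCL): the same
# TP/PP splits are exercised with the ray backend (plain, with ADAG, and
# with ADAG + NCCL channels) and then with the mp backend.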
@pytest.mark.parametrize(
    ("TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, "
     "MODEL_NAME, DIST_BACKEND, USE_RAY_ADAG, USE_RAY_ADAG_NCCL"), [
        (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray", False, False),
        (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray", False, False),
        (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray", False, False),
        (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray", False, False),
        (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray", False, False),
        (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray", True, False),
        (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray", True, False),
        (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray", True, False),
        (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray", True, False),
        (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray", True, False),
        (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray", True, True),
        (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray", True, True),
        (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray", True, True),
        (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray", True, True),
        (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray", True, True),
        (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "mp", False, False),
        (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp", False, False),
        (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp", False, False),
        (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "mp", False, False),
        (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp", False, False),
    ])
def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME,
                    DIST_BACKEND, USE_RAY_ADAG, USE_RAY_ADAG_NCCL):
    if VLLM_MULTI_NODE and DIST_BACKEND == "mp":
        pytest.skip("Skipping multi-node pipeline parallel test for "
                    "multiprocessing distributed backend")

    pp_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "float16",
        "--pipeline-parallel-size",
        str(PP_SIZE),
        "--tensor-parallel-size",
        str(TP_SIZE),
        "--distributed-executor-backend",
        DIST_BACKEND,
    ]

    # Compare against a baseline without pipeline parallelism.
    # NOTE: use the mp backend for TP. PP tests might involve multiple
    # nodes, and Ray might schedule all workers on a node other than the
    # head node, which can cause the test to fail.
    tp_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--tensor-parallel-size",
        str(max(TP_SIZE, 2)),  # We only use 2 GPUs in the CI.
        "--distributed-executor-backend",
        "mp",
    ]
    if CHUNKED_PREFILL:
        pp_args.append("--enable-chunked-prefill")
        tp_args.append("--enable-chunked-prefill")
    if EAGER_MODE:
        pp_args.append("--enforce-eager")
        tp_args.append("--enforce-eager")
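    # The feature flags are mirrored into both arg lists so that chunked
    # prefill and eager mode match between the PP run and the TP baseline.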
    pp_env = None
    if USE_RAY_ADAG:
        assert DIST_BACKEND == "ray", (
            "Ray ADAG is only supported with the Ray distributed backend")
        pp_env = {
            "VLLM_USE_RAY_COMPILED_DAG": "1",
            "VLLM_USE_RAY_SPMD_WORKER": "1",
            "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL":
            str(int(USE_RAY_ADAG_NCCL)),
        }
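
    # compare_two_settings is assumed to launch the model once with pp_args
    # (with pp_env applied to that run) and once with tp_args, then assert
    # that both configurations produce matching outputs; see ..utils for the
    # exact contract.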
    compare_two_settings(MODEL_NAME, pp_args, tp_args, pp_env)


@pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [
    (2, "JackFram/llama-160m"),
])
@pytest.mark.parametrize("ATTN_BACKEND", [
    "FLASH_ATTN",
    "FLASHINFER",
])
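# Each case runs in a freshly forked process so that the mutation of
# VLLM_ATTENTION_BACKEND below (and any initialized CUDA state) does not
# leak across parametrized runs.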
@fork_new_process_for_each_test
def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND):
    cudagraph_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "float16",
        "--pipeline-parallel-size",
        str(PP_SIZE),
        "--distributed-executor-backend",
        "mp",
    ]
    os.environ["VLLM_ATTENTION_BACKEND"] = ATTN_BACKEND
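    # The attention backend must be set in the environment before the engine
    # starts; the assumption is that the server processes spawned by
    # compare_two_settings inherit it and select the backend at startup.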

    eager_args = cudagraph_args + ["--enforce-eager"]

    compare_two_settings(MODEL_NAME, eager_args, cudagraph_args)
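

# A minimal sketch of how to run these tests locally (the file path and GPU
# layout are assumptions, not part of this module):
#
#   # single-node mode, 4 GPUs on one machine
#   pytest -v -s tests/distributed/test_pipeline_parallel.py
#
#   # multi-node mode, 2 nodes x 2 GPUs, with a Ray cluster already started
#   VLLM_MULTI_NODE=1 pytest -v -s tests/distributed/test_pipeline_parallel.py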