vllm/tests/distributed/test_pipeline_parallel.py

"""
WARNING: This test runs in both single-node (4 GPUs) and multi-node
 (2 nodes with 2 GPUs each) modes. If the test only uses 2 GPUs, it is
 important to set the distributed backend to "mp" to avoid Ray scheduling
 all workers in a node other than the head node, which can cause the test
 to fail.
"""
import os
from dataclasses import dataclass
from typing import List, NamedTuple, Optional

import pytest

from vllm.logger import init_logger

from ..utils import compare_two_settings, fork_new_process_for_each_test

# Module-level logger; test_compare_tp uses it to record (rather than raise)
# failures of the Ray ADAG runs.
logger = init_logger("test_pipeline_parallel")

# True only when the environment variable VLLM_MULTI_NODE is exactly "1";
# an unset variable falls back to "0" and therefore yields False.
VLLM_MULTI_NODE = os.environ.get("VLLM_MULTI_NODE", "0") == "1"


class ParallelSetup(NamedTuple):
    """One parallelism configuration exercised by ``test_compare_tp``.

    The test unpacks this tuple positionally, so the field order matters.
    """
    # Passed to the server as --tensor-parallel-size.
    tp_size: int
    # Passed to the server as --pipeline-parallel-size (PP run only).
    pp_size: int
    # When true, --enforce-eager is added to the server arguments.
    eager_mode: bool
    # When true, --enable-chunked-prefill is added to the server arguments.
    chunked_prefill: bool


@dataclass
class PPTestSettings:
    """Per-model description of which pipeline-parallel runs to test.

    Every combination of an entry in ``parallel_setups`` with an entry in
    ``distributed_backends`` becomes one parametrized test case (see
    ``iter_params``).
    """
    parallel_setups: List[ParallelSetup]
    distributed_backends: List[str]
    trust_remote_code: bool
    tokenizer_mode: Optional[str]

    @staticmethod
    def detailed(
        *,
        tp_base: int = 1,
        pp_base: int = 2,
        trust_remote_code: bool = False,
        tokenizer_mode: Optional[str] = None,
    ):
        """Build the thorough settings: five setups on both "mp" and "ray".

        The setups cover the base sizes plus doubled PP and doubled TP,
        alternating eager mode and chunked prefill.
        """
        grid = (
            # (tp_size, pp_size, eager_mode, chunked_prefill)
            (tp_base, pp_base, False, False),
            (tp_base, 2 * pp_base, False, True),
            (tp_base, 2 * pp_base, True, False),
            (2 * tp_base, pp_base, False, True),
            (2 * tp_base, pp_base, True, False),
        )
        setups = [
            ParallelSetup(tp_size=tp,
                          pp_size=pp,
                          eager_mode=eager,
                          chunked_prefill=chunked)
            for tp, pp, eager, chunked in grid
        ]
        return PPTestSettings(
            parallel_setups=setups,
            distributed_backends=["mp", "ray"],
            trust_remote_code=trust_remote_code,
            tokenizer_mode=tokenizer_mode,
        )

    @staticmethod
    def fast(
        *,
        tp_base: int = 1,
        pp_base: int = 2,
        trust_remote_code: bool = False,
        tokenizer_mode: Optional[str] = None,
    ):
        """Build the minimal settings: one eager-mode setup on "mp" only."""
        only_setup = ParallelSetup(tp_size=tp_base,
                                   pp_size=pp_base,
                                   eager_mode=True,
                                   chunked_prefill=False)
        return PPTestSettings(
            parallel_setups=[only_setup],
            distributed_backends=["mp"],
            trust_remote_code=trust_remote_code,
            tokenizer_mode=tokenizer_mode,
        )

    def iter_params(self, model_name: str):
        """Yield one parameter tuple per (parallel setup, backend) pair.

        Each tuple is ``(model_name, parallel_setup, distributed_backend,
        trust_remote_code, tokenizer_mode)``; setups form the outer loop and
        backends the inner loop.
        """
        yield from ((model_name, setup, backend, self.trust_remote_code,
                     self.tokenizer_mode)
                    for setup in self.parallel_setups
                    for backend in self.distributed_backends)


# yapf: disable
# Text-generation models: maps model name -> PPTestSettings.
# Commented-out entries are kept on purpose, each with the reason it is
# currently disabled noted on the line above it.
GENERATION_MODEL_SETTINGS = {
    # [DETAILED TESTS]
    "meta-llama/Meta-Llama-3-8B": PPTestSettings.detailed(),
    # [FAST TESTS]
    # Uses Llama
    # "BAAI/AquilaChat-7B": PPTestSettings.fast(),
    # TODO: Test on larger GPU
    # "Snowflake/snowflake-arctic-instruct": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
    "baichuan-inc/Baichuan-7B": PPTestSettings.fast(trust_remote_code=True),
    "baichuan-inc/Baichuan2-13B-Chat": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
    "bigscience/bloomz-1b1": PPTestSettings.fast(),
    "THUDM/chatglm3-6b": PPTestSettings.fast(trust_remote_code=True),
    "CohereForAI/c4ai-command-r-v01": PPTestSettings.fast(tp_base=2, trust_remote_code=True),  # noqa: E501
    # TODO: Test on larger GPU
    # "databricks/dbrx-instruct": PPTestSettings.fast(),
    "Deci/DeciLM-7B-instruct": PPTestSettings.fast(trust_remote_code=True),
    "deepseek-ai/deepseek-llm-7b-chat": PPTestSettings.fast(),
    "deepseek-ai/DeepSeek-V2-Lite-Chat": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
    "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct": PPTestSettings.fast(),
    "tiiuae/falcon-7b": PPTestSettings.fast(),
    "google/gemma-2b": PPTestSettings.fast(),
    "google/gemma-2-9b": PPTestSettings.fast(),
    "gpt2": PPTestSettings.fast(),
    "bigcode/starcoder": PPTestSettings.fast(),
    "EleutherAI/gpt-j-6b": PPTestSettings.fast(),
    "EleutherAI/pythia-12b": PPTestSettings.fast(),
    "ibm/PowerLM-3b": PPTestSettings.fast(),
    "ibm/PowerMoE-3b": PPTestSettings.fast(),
    # Uses Llama
    # "internlm/internlm-chat-7b": PPTestSettings.fast(),
    "internlm/internlm2-chat-7b": PPTestSettings.fast(trust_remote_code=True),
    "core42/jais-13b-chat": PPTestSettings.fast(),
    # TODO: Implement PP
    # "ai21labs/AI21-Jamba-1.5-Mini": PPTestSettings.fast(),
    "openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(trust_remote_code=True),
    "openbmb/MiniCPM3-4B": PPTestSettings.fast(trust_remote_code=True),
    # Uses Llama
    # "mistralai/Mistral-7B-Instruct-v0.1": PPTestSettings.fast(),
    "mistralai/Mixtral-8x7B-Instruct-v0.1": PPTestSettings.fast(tp_base=4),
    "mosaicml/mpt-7b": PPTestSettings.fast(),
    "nvidia/Minitron-8B-Base": PPTestSettings.fast(),
    "allenai/OLMoE-1B-7B-0924-Instruct": PPTestSettings.fast(),
    "allenai/OLMo-1B-hf": PPTestSettings.fast(),
    "facebook/opt-iml-max-1.3b": PPTestSettings.fast(),
    "OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(trust_remote_code=True),
    "microsoft/phi-2": PPTestSettings.fast(),
    "microsoft/Phi-3-mini-4k-instruct": PPTestSettings.fast(),
    "microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
    # FIXME: https://github.com/vllm-project/vllm/issues/8553
    # "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
    "adept/persimmon-8b-chat": PPTestSettings.fast(),
    "Qwen/Qwen-7B-Chat": PPTestSettings.fast(trust_remote_code=True),
    "Qwen/Qwen2-beta-7B-Chat": PPTestSettings.fast(),
    "Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(),
    "stabilityai/stablelm-3b-4e1t": PPTestSettings.fast(),
    "bigcode/starcoder2-3b": PPTestSettings.fast(),
    "upstage/solar-pro-preview-instruct": PPTestSettings.fast(tp_base=2),
    # FIXME: Cannot load tokenizer in latest transformers version
    # "xverse/XVERSE-7B-Chat": PPTestSettings.fast(trust_remote_code=True),
}

# Embedding models: maps model name -> PPTestSettings.
# Currently empty; the only candidate entry is commented out below.
EMBEDDING_MODEL_SETTINGS = {  # type: ignore[var-annotated]
    # [FAST TESTS]
    # Uses Llama
    # "intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(),
}

# Multimodal models: maps model name -> PPTestSettings.
# All enabled entries use the fast (single setup, "mp" backend) settings.
MULTIMODAL_MODEL_SETTINGS = {
    # [FAST TESTS]
    "Salesforce/blip2-opt-2.7b": PPTestSettings.fast(),
    "facebook/chameleon-7b": PPTestSettings.fast(),
    "adept/fuyu-8b": PPTestSettings.fast(),
    "OpenGVLab/InternVL2-1B": PPTestSettings.fast(trust_remote_code=True),
    "llava-hf/llava-1.5-7b-hf": PPTestSettings.fast(),
    "llava-hf/llava-v1.6-mistral-7b-hf": PPTestSettings.fast(),
    "llava-hf/LLaVA-NeXT-Video-7B-hf": PPTestSettings.fast(),
    "llava-hf/llava-onevision-qwen2-0.5b-ov-hf": PPTestSettings.fast(),
    "openbmb/MiniCPM-Llama3-V-2_5": PPTestSettings.fast(trust_remote_code=True),
    # TODO: Implement PP
    # "meta-llama/Llama-3.2-11B-Vision-Instruct": PPTestSettings.fast(),
    "microsoft/Phi-3-vision-128k-instruct": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
    "mistralai/Pixtral-12B-2409": PPTestSettings.fast(tp_base=2, tokenizer_mode="mistral"),  # noqa: E501
    "Qwen/Qwen-VL-Chat": PPTestSettings.fast(trust_remote_code=True),
    "Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(),
    "fixie-ai/ultravox-v0_3": PPTestSettings.fast(),
}

# Conditional-generation models: maps model name -> PPTestSettings.
# Currently empty; the only candidate entry is commented out below.
CONDITIONAL_GENERATION_MODEL_SETTINGS = {  # type: ignore[var-annotated]
    # [FAST TESTS]
    # TODO: Implement PP
    # "facebook/bart-base": PPTestSettings.fast(),
}
# yapf: enable

# Union of all per-category tables; test_compare_tp is parametrized from
# this mapping. On duplicate model names, later tables override earlier ones.
MODEL_SETTINGS = {
    **GENERATION_MODEL_SETTINGS,
    **EMBEDDING_MODEL_SETTINGS,
    **MULTIMODAL_MODEL_SETTINGS,
    # Empty for now, but merged so that entries enabled in that table are
    # picked up by the test instead of being silently ignored.
    **CONDITIONAL_GENERATION_MODEL_SETTINGS,
}

# Only models listed here are actually parametrized into test_compare_tp;
# entries of MODEL_SETTINGS that are absent from this list are filtered out.
# You can update this on your local machine to run specific tests
TEST_MODELS = [
    "meta-llama/Meta-Llama-3-8B",
    "facebook/chameleon-7b",
    "OpenGVLab/InternVL2-1B",
    "microsoft/Phi-3-vision-128k-instruct",
    "mistralai/Pixtral-12B-2409",
    "fixie-ai/ultravox-v0_3",
]


@pytest.mark.parametrize(
    ("model_name", "parallel_setup", "distributed_backend",
     "trust_remote_code", "tokenizer_mode"),
    [
        params for model_name, settings in MODEL_SETTINGS.items()
        if model_name in TEST_MODELS
        for params in settings.iter_params(model_name)
    ],
)
@fork_new_process_for_each_test
def test_compare_tp(model_name: str, parallel_setup: ParallelSetup,
                    distributed_backend: str, trust_remote_code: bool,
                    tokenizer_mode: Optional[str], num_gpus_available):
    """Compare a pipeline-parallel run against a tensor-parallel-only run.

    Two argument lists are built for the same model: one with
    ``--pipeline-parallel-size`` on the requested backend, and a baseline
    without pipeline parallelism on the "mp" backend. Both are handed to
    ``compare_two_settings``.

    Args:
        model_name: Model to launch.
        parallel_setup: TP/PP sizes plus eager-mode and chunked-prefill flags.
        distributed_backend: Executor backend used for the PP run.
        trust_remote_code: Whether to pass ``--trust-remote-code``.
        tokenizer_mode: Optional value for ``--tokenizer-mode``.
        num_gpus_available: Fixture defined outside this file; presumably the
            number of GPUs visible on the local node (TODO confirm).

    A failure of a Ray ADAG run is logged rather than raised; any other
    failure propagates.
    """
    tp_size, pp_size, eager_mode, chunked_prefill = parallel_setup

    # The TP-only baseline always runs with the "mp" backend, so at least
    # tp_size GPUs are needed regardless of the mode.
    if num_gpus_available < tp_size:
        pytest.skip(f"Need at least {tp_size} GPUs to run the test")
    # The PP run is launched with both tp_size and pp_size, i.e. it needs
    # tp_size * pp_size GPUs in total. In single-node mode all of them must be
    # local, so skip instead of failing at launch when there are too few.
    # In multi-node mode the GPUs may be spread over several nodes, which this
    # count cannot verify, so the stricter check is not applied there.
    if not VLLM_MULTI_NODE and num_gpus_available < tp_size * pp_size:
        pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs "
                    "to run the test")
    if VLLM_MULTI_NODE and distributed_backend == "mp":
        pytest.skip("Skipping multi-node pipeline parallel test for "
                    "multiprocessing distributed backend")

    common_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "float16",
        "--max-model-len",
        "2048",
        "--max-num-seqs",
        "8",
    ]
    if chunked_prefill:
        common_args.append("--enable-chunked-prefill")
    if eager_mode:
        common_args.append("--enforce-eager")
    if trust_remote_code:
        common_args.append("--trust-remote-code")
    if tokenizer_mode:
        common_args.extend(["--tokenizer-mode", tokenizer_mode])

    if (distributed_backend == "ray" and tp_size == 2 and pp_size == 2
            and chunked_prefill):
        # Test Ray ADAG for a subset of the tests
        pp_env = {
            "VLLM_USE_RAY_COMPILED_DAG": "1",
            "VLLM_USE_RAY_SPMD_WORKER": "1",
            "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
        }
        # Temporary. Currently when zeromq + SPMD is used, it does not properly
        # terminate because of aDAG issue.
        # NOTE: appended to common_args before pp_args/tp_args are built, so
        # both runs receive this flag.
        common_args.append("--disable-frontend-multiprocessing")
    else:
        pp_env = None

    pp_args = [
        *common_args,
        "--pipeline-parallel-size",
        str(pp_size),
        "--tensor-parallel-size",
        str(tp_size),
        "--distributed-executor-backend",
        distributed_backend,
    ]

    # compare without pipeline parallelism
    # NOTE: use mp backend for TP
    # PP tests might involve multiple nodes, and ray might
    #  schedule all workers in a node other than the head node,
    #  which can cause the test to fail.
    tp_args = [
        *common_args,
        "--tensor-parallel-size",
        str(tp_size),
        "--distributed-executor-backend",
        "mp",
    ]

    try:
        compare_two_settings(model_name, pp_args, tp_args, pp_env)
    except Exception:
        # pp_env is only set for the Ray ADAG subset; every other
        # configuration must fail loudly.
        if pp_env is None:
            raise
        # Ray ADAG tests are flaky, so we don't want to fail the test
        logger.exception("Ray ADAG tests failed")
[ci][distributed] fix flaky tests (#6806) 2024-07-25 17:44:09 -07:00			`"""`
			`WARNING: This test runs in both single-node (4 GPUs) and multi-node`
			`(2 node with 2 GPUs each) modes. If the test only uses 2 GPUs, it is`
			`important to set the distributed backend to "mp" to avoid Ray scheduling`
			`all workers in a node other than the head node, which can cause the test`
			`to fail.`
			`"""`
[Core] Multiprocessing Pipeline Parallel support (#6130) Co-authored-by: Murali Andoorveedu <muralidhar.andoorveedu@centml.ai> 2024-07-18 19:15:52 -07:00			`import os`
[Models] Add remaining model PP support (#7168) Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai> Signed-off-by: Murali Andoorveedu <muralidhar.andoorveedu@centml.ai> Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk> 2024-10-03 19:56:58 -07:00			`from dataclasses import dataclass`
			`from typing import List, NamedTuple, Optional`
[Core] Multiprocessing Pipeline Parallel support (#6130) Co-authored-by: Murali Andoorveedu <muralidhar.andoorveedu@centml.ai> 2024-07-18 19:15:52 -07:00
[Core] Pipeline Parallel Support (#4412) Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai> 2024-07-02 10:58:08 -07:00			`import pytest`

[ci/test] rearrange tests and make adag test soft fail (#7572) 2024-08-15 19:39:04 -07:00			`from vllm.logger import init_logger`

[ci][distributed] try to fix pp test (#7054) 2024-08-01 22:03:12 -07:00			`from ..utils import compare_two_settings, fork_new_process_for_each_test`
[Core] Pipeline Parallel Support (#4412) Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai> 2024-07-02 10:58:08 -07:00
[ci/test] rearrange tests and make adag test soft fail (#7572) 2024-08-15 19:39:04 -07:00			`logger = init_logger("test_pipeline_parallel")`

[Core] Multiprocessing Pipeline Parallel support (#6130) Co-authored-by: Murali Andoorveedu <muralidhar.andoorveedu@centml.ai> 2024-07-18 19:15:52 -07:00			`VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"`

[Core] Pipeline Parallel Support (#4412) Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai> 2024-07-02 10:58:08 -07:00
[Models] Add remaining model PP support (#7168) Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai> Signed-off-by: Murali Andoorveedu <muralidhar.andoorveedu@centml.ai> Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk> 2024-10-03 19:56:58 -07:00			`class ParallelSetup(NamedTuple):`
			`tp_size: int`
			`pp_size: int`
			`eager_mode: bool`
			`chunked_prefill: bool`


			`@dataclass`
			`class PPTestSettings:`
			`parallel_setups: List[ParallelSetup]`
			`distributed_backends: List[str]`
			`trust_remote_code: bool`
			`tokenizer_mode: Optional[str]`

			`@staticmethod`
			`def detailed(`
			`*,`
			`tp_base: int = 1,`
			`pp_base: int = 2,`
			`trust_remote_code: bool = False,`
			`tokenizer_mode: Optional[str] = None,`
			`):`
			`return PPTestSettings(`
			`parallel_setups=[`
			`ParallelSetup(tp_size=tp_base,`
			`pp_size=pp_base,`
			`eager_mode=False,`
			`chunked_prefill=False),`
			`ParallelSetup(tp_size=tp_base,`
			`pp_size=2 * pp_base,`
			`eager_mode=False,`
			`chunked_prefill=True),`
			`ParallelSetup(tp_size=tp_base,`
			`pp_size=2 * pp_base,`
			`eager_mode=True,`
			`chunked_prefill=False),`
			`ParallelSetup(tp_size=2 * tp_base,`
			`pp_size=pp_base,`
			`eager_mode=False,`
			`chunked_prefill=True),`
			`ParallelSetup(tp_size=2 * tp_base,`
			`pp_size=pp_base,`
			`eager_mode=True,`
			`chunked_prefill=False),`
			`],`
			`distributed_backends=["mp", "ray"],`
			`trust_remote_code=trust_remote_code,`
			`tokenizer_mode=tokenizer_mode,`
			`)`

			`@staticmethod`
			`def fast(`
			`*,`
			`tp_base: int = 1,`
			`pp_base: int = 2,`
			`trust_remote_code: bool = False,`
			`tokenizer_mode: Optional[str] = None,`
			`):`
			`return PPTestSettings(`
			`parallel_setups=[`
			`ParallelSetup(tp_size=tp_base,`
			`pp_size=pp_base,`
			`eager_mode=True,`
			`chunked_prefill=False),`
			`],`
			`distributed_backends=["mp"],`
			`trust_remote_code=trust_remote_code,`
			`tokenizer_mode=tokenizer_mode,`
			`)`

			`def iter_params(self, model_name: str):`
			`for parallel_setup in self.parallel_setups:`
			`for distributed_backend in self.distributed_backends:`
			`yield (model_name, parallel_setup, distributed_backend,`
			`self.trust_remote_code, self.tokenizer_mode)`


			`# yapf: disable`
			`GENERATION_MODEL_SETTINGS = {`
			`# [DETAILED TESTS]`
			`"meta-llama/Meta-Llama-3-8B": PPTestSettings.detailed(),`
			`# [FAST TESTS]`
			`# Uses Llama`
			`# "BAAI/AquilaChat-7B": PPTestSettings.fast(),`
			`# TODO: Test on larger GPU`
			`# "Snowflake/snowflake-arctic-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501`
			`"baichuan-inc/Baichuan-7B": PPTestSettings.fast(trust_remote_code=True),`
			`"baichuan-inc/Baichuan2-13B-Chat": PPTestSettings.fast(trust_remote_code=True), # noqa: E501`
			`"bigscience/bloomz-1b1": PPTestSettings.fast(),`
			`"THUDM/chatglm3-6b": PPTestSettings.fast(trust_remote_code=True),`
			`"CohereForAI/c4ai-command-r-v01": PPTestSettings.fast(tp_base=2, trust_remote_code=True), # noqa: E501`
			`# TODO: Test on larger GPU`
			`# "databricks/dbrx-instruct": PPTestSettings.fast(),`
			`"Deci/DeciLM-7B-instruct": PPTestSettings.fast(trust_remote_code=True),`
			`"deepseek-ai/deepseek-llm-7b-chat": PPTestSettings.fast(),`
			`"deepseek-ai/DeepSeek-V2-Lite-Chat": PPTestSettings.fast(trust_remote_code=True), # noqa: E501`
			`"LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct": PPTestSettings.fast(),`
			`"tiiuae/falcon-7b": PPTestSettings.fast(),`
			`"google/gemma-2b": PPTestSettings.fast(),`
			`"google/gemma-2-9b": PPTestSettings.fast(),`
			`"gpt2": PPTestSettings.fast(),`
			`"bigcode/starcoder": PPTestSettings.fast(),`
			`"EleutherAI/gpt-j-6b": PPTestSettings.fast(),`
			`"EleutherAI/pythia-12b": PPTestSettings.fast(),`
			`"ibm/PowerLM-3b": PPTestSettings.fast(),`
			`"ibm/PowerMoE-3b": PPTestSettings.fast(),`
			`# Uses Llama`
			`# "internlm/internlm-chat-7b": PPTestSettings.fast(),`
			`"internlm/internlm2-chat-7b": PPTestSettings.fast(trust_remote_code=True),`
			`"core42/jais-13b-chat": PPTestSettings.fast(),`
			`# TODO: Implement PP`
			`# "ai21labs/AI21-Jamba-1.5-Mini": PPTestSettings.fast(),`
			`"openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(trust_remote_code=True),`
			`"openbmb/MiniCPM3-4B": PPTestSettings.fast(trust_remote_code=True),`
			`# Uses Llama`
			`# "mistralai/Mistral-7B-Instruct-v0.1": PPTestSettings.fast(),`
			`"mistralai/Mixtral-8x7B-Instruct-v0.1": PPTestSettings.fast(tp_base=4),`
			`"mosaicml/mpt-7b": PPTestSettings.fast(),`
			`"nvidia/Minitron-8B-Base": PPTestSettings.fast(),`
			`"allenai/OLMoE-1B-7B-0924-Instruct": PPTestSettings.fast(),`
			`"allenai/OLMo-1B-hf": PPTestSettings.fast(),`
			`"facebook/opt-iml-max-1.3b": PPTestSettings.fast(),`
			`"OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(trust_remote_code=True),`
			`"microsoft/phi-2": PPTestSettings.fast(),`
			`"microsoft/Phi-3-mini-4k-instruct": PPTestSettings.fast(),`
			`"microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501`
			`# FIXME: https://github.com/vllm-project/vllm/issues/8553`
			`# "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501`
			`"adept/persimmon-8b-chat": PPTestSettings.fast(),`
			`"Qwen/Qwen-7B-Chat": PPTestSettings.fast(trust_remote_code=True),`
			`"Qwen/Qwen2-beta-7B-Chat": PPTestSettings.fast(),`
			`"Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(),`
			`"stabilityai/stablelm-3b-4e1t": PPTestSettings.fast(),`
			`"bigcode/starcoder2-3b": PPTestSettings.fast(),`
			`"upstage/solar-pro-preview-instruct": PPTestSettings.fast(tp_base=2),`
			`# FIXME: Cannot load tokenizer in latest transformers version`
			`# "xverse/XVERSE-7B-Chat": PPTestSettings.fast(trust_remote_code=True),`
			`}`

			`EMBEDDING_MODEL_SETTINGS = { # type: ignore[var-annotated]`
			`# [FAST TESTS]`
			`# Uses Llama`
			`# "intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(),`
			`}`

			`MULTIMODAL_MODEL_SETTINGS = {`
			`# [FAST TESTS]`
			`"Salesforce/blip2-opt-2.7b": PPTestSettings.fast(),`
			`"facebook/chameleon-7b": PPTestSettings.fast(),`
			`"adept/fuyu-8b": PPTestSettings.fast(),`
			`"OpenGVLab/InternVL2-1B": PPTestSettings.fast(trust_remote_code=True),`
			`"llava-hf/llava-1.5-7b-hf": PPTestSettings.fast(),`
			`"llava-hf/llava-v1.6-mistral-7b-hf": PPTestSettings.fast(),`
			`"llava-hf/LLaVA-NeXT-Video-7B-hf": PPTestSettings.fast(),`
			`"llava-hf/llava-onevision-qwen2-0.5b-ov-hf": PPTestSettings.fast(),`
			`"openbmb/MiniCPM-Llama3-V-2_5": PPTestSettings.fast(trust_remote_code=True),`
			`# TODO: Implement PP`
			`# "meta-llama/Llama-3.2-11B-Vision-Instruct": PPTestSettings.fast(),`
			`"microsoft/Phi-3-vision-128k-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501`
			`"mistralai/Pixtral-12B-2409": PPTestSettings.fast(tp_base=2, tokenizer_mode="mistral"), # noqa: E501`
			`"Qwen/Qwen-VL-Chat": PPTestSettings.fast(trust_remote_code=True),`
			`"Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(),`
			`"fixie-ai/ultravox-v0_3": PPTestSettings.fast(),`
			`}`

			`CONDITIONAL_GENERATION_MODEL_SETTINGS = { # type: ignore[var-annotated]`
			`# [FAST TESTS]`
			`# TODO: Implement PP`
			`# "facebook/bart-base": PPTestSettings.fast(),`
			`}`
			`# yapf: enable`

			`MODEL_SETTINGS = {`
			`**GENERATION_MODEL_SETTINGS,`
			`**EMBEDDING_MODEL_SETTINGS,`
			`**MULTIMODAL_MODEL_SETTINGS,`
			`}`

			`# You can update this on your local machine to run specific tests`
			`TEST_MODELS = [`
			`"meta-llama/Meta-Llama-3-8B",`
			`"facebook/chameleon-7b",`
			`"OpenGVLab/InternVL2-1B",`
			`"microsoft/Phi-3-vision-128k-instruct",`
			`"mistralai/Pixtral-12B-2409",`
			`"fixie-ai/ultravox-v0_3",`
			`]`


Inclusion of InternVLChatModel In PP_SUPPORTED_MODELS(Pipeline Parallelism) (#7860) 2024-09-05 17:03:37 +05:30			`@pytest.mark.parametrize(`
[Models] Add remaining model PP support (#7168) Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai> Signed-off-by: Murali Andoorveedu <muralidhar.andoorveedu@centml.ai> Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk> 2024-10-03 19:56:58 -07:00			`("model_name", "parallel_setup", "distributed_backend",`
			`"trust_remote_code", "tokenizer_mode"),`
Inclusion of InternVLChatModel In PP_SUPPORTED_MODELS(Pipeline Parallelism) (#7860) 2024-09-05 17:03:37 +05:30			`[`
[Models] Add remaining model PP support (#7168) Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai> Signed-off-by: Murali Andoorveedu <muralidhar.andoorveedu@centml.ai> Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk> 2024-10-03 19:56:58 -07:00			`params for model_name, settings in MODEL_SETTINGS.items()`
			`for params in settings.iter_params(model_name)`
			`if model_name in TEST_MODELS`
Inclusion of InternVLChatModel In PP_SUPPORTED_MODELS(Pipeline Parallelism) (#7860) 2024-09-05 17:03:37 +05:30			`],`
			`)`
[ci/test] rearrange tests and make adag test soft fail (#7572) 2024-08-15 19:39:04 -07:00			`@fork_new_process_for_each_test`
[Models] Add remaining model PP support (#7168) Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai> Signed-off-by: Murali Andoorveedu <muralidhar.andoorveedu@centml.ai> Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk> 2024-10-03 19:56:58 -07:00			`def test_compare_tp(model_name: str, parallel_setup: ParallelSetup,`
			`distributed_backend: str, trust_remote_code: bool,`
			`tokenizer_mode: Optional[str], num_gpus_available):`
			`tp_size, pp_size, eager_mode, chunked_prefill = parallel_setup`

			`if num_gpus_available < tp_size:`
			`pytest.skip(f"Need at least {tp_size} GPUs to run the test")`
			`if VLLM_MULTI_NODE and distributed_backend == "mp":`
[Core] Multiprocessing Pipeline Parallel support (#6130) Co-authored-by: Murali Andoorveedu <muralidhar.andoorveedu@centml.ai> 2024-07-18 19:15:52 -07:00			`pytest.skip("Skipping multi-node pipeline parallel test for "`
			`"multiprocessing distributed backend")`
[Model] Pipeline parallel support for Mixtral (#6516) 2024-07-17 19:26:04 -07:00
[Models] Add remaining model PP support (#7168) Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai> Signed-off-by: Murali Andoorveedu <muralidhar.andoorveedu@centml.ai> Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk> 2024-10-03 19:56:58 -07:00			`common_args = [`
[Core] Pipeline Parallel Support (#4412) Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai> 2024-07-02 10:58:08 -07:00			`# use half precision for speed and memory savings in CI environment`
			`"--dtype",`
[Core] Multiprocessing Pipeline Parallel support (#6130) Co-authored-by: Murali Andoorveedu <muralidhar.andoorveedu@centml.ai> 2024-07-18 19:15:52 -07:00			`"float16",`
[Bugfix] Fix InternVL2 vision embeddings process with pipeline parallel (#8299) 2024-09-11 10:11:01 +08:00			`"--max-model-len",`
[Models] Add remaining model PP support (#7168) Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai> Signed-off-by: Murali Andoorveedu <muralidhar.andoorveedu@centml.ai> Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk> 2024-10-03 19:56:58 -07:00			`"2048",`
			`"--max-num-seqs",`
			`"8",`
			`]`
			`if chunked_prefill:`
			`common_args.append("--enable-chunked-prefill")`
			`if eager_mode:`
			`common_args.append("--enforce-eager")`
			`if trust_remote_code:`
			`common_args.append("--trust-remote-code")`
			`if tokenizer_mode:`
			`common_args.extend(["--tokenizer-mode", tokenizer_mode])`

			`if (distributed_backend == "ray" and tp_size == 2 and pp_size == 2`
			`and chunked_prefill):`
			`# Test Ray ADAG for a subset of the tests`
			`pp_env = {`
			`"VLLM_USE_RAY_COMPILED_DAG": "1",`
			`"VLLM_USE_RAY_SPMD_WORKER": "1",`
			`"VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",`
			`}`
			`# Temporary. Currently when zeromq + SPMD is used, it does not properly`
			`# terminate because of aDAG issue.`
			`common_args.append("--disable-frontend-multiprocessing")`
			`else:`
			`pp_env = None`

			`pp_args = [`
			`*common_args,`
[Core] Pipeline Parallel Support (#4412) Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai> 2024-07-02 10:58:08 -07:00			`"--pipeline-parallel-size",`
[Models] Add remaining model PP support (#7168) Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai> Signed-off-by: Murali Andoorveedu <muralidhar.andoorveedu@centml.ai> Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk> 2024-10-03 19:56:58 -07:00			`str(pp_size),`
[Core] Pipeline Parallel Support (#4412) Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai> 2024-07-02 10:58:08 -07:00			`"--tensor-parallel-size",`
[Models] Add remaining model PP support (#7168) Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai> Signed-off-by: Murali Andoorveedu <muralidhar.andoorveedu@centml.ai> Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk> 2024-10-03 19:56:58 -07:00			`str(tp_size),`
[Core] Pipeline Parallel Support (#4412) Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai> 2024-07-02 10:58:08 -07:00			`"--distributed-executor-backend",`
[Models] Add remaining model PP support (#7168) Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai> Signed-off-by: Murali Andoorveedu <muralidhar.andoorveedu@centml.ai> Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk> 2024-10-03 19:56:58 -07:00			`distributed_backend,`
[Core] Pipeline Parallel Support (#4412) Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai> 2024-07-02 10:58:08 -07:00			`]`
[ci][distributed] add pipeline parallel correctness test (#6410) 2024-07-16 15:44:22 -07:00
			`# compare without pipeline parallelism`
			`# NOTE: use mp backend for TP`
			`# PP tests might involve multiple nodes, and ray might`
			`# schedule all workers in a node other than the head node,`
			`# which can cause the test to fail.`
			`tp_args = [`
[Models] Add remaining model PP support (#7168) Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai> Signed-off-by: Murali Andoorveedu <muralidhar.andoorveedu@centml.ai> Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk> 2024-10-03 19:56:58 -07:00			`*common_args,`
[ci][distributed] add pipeline parallel correctness test (#6410) 2024-07-16 15:44:22 -07:00			`"--tensor-parallel-size",`
[Models] Add remaining model PP support (#7168) Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai> Signed-off-by: Murali Andoorveedu <muralidhar.andoorveedu@centml.ai> Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk> 2024-10-03 19:56:58 -07:00			`str(tp_size),`
[ci][distributed] add pipeline parallel correctness test (#6410) 2024-07-16 15:44:22 -07:00			`"--distributed-executor-backend",`
			`"mp",`
			`]`

[ci/test] rearrange tests and make adag test soft fail (#7572) 2024-08-15 19:39:04 -07:00			`try:`
[Models] Add remaining model PP support (#7168) Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai> Signed-off-by: Murali Andoorveedu <muralidhar.andoorveedu@centml.ai> Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk> 2024-10-03 19:56:58 -07:00			`compare_two_settings(model_name, pp_args, tp_args, pp_env)`
[ci/test] rearrange tests and make adag test soft fail (#7572) 2024-08-15 19:39:04 -07:00			`except Exception:`
			`if pp_env is None:`
			`raise`
			`else:`
			`# Ray ADAG tests are flaky, so we don't want to fail the test`
			`logger.exception("Ray ADAG tests failed")`