vllm/tests/spec_decode/e2e/test_integration_dist_tp2.py

"""Tests which cover integration of the speculative decoding framework with
tensor parallelism.
"""

from typing import Optional

import pytest
import torch

from vllm.platforms import current_platform

from .conftest import run_equality_correctness_test_tp
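
# NOTE: each *_llm_kwargs parametrization below is a list of CLI-style engine
# arguments; run_equality_correctness_test_tp (see conftest) is expected to
# combine them and launch matching baseline and speculative runs with them.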


@pytest.mark.skipif(torch.cuda.device_count() < 2,
                    reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize(
    "common_llm_kwargs",
    [[
        # Skip cuda graph recording for fast test.
        "--enforce-eager",
        "--tensor-parallel-size",
        "2"
    ]])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [[]])
@pytest.mark.parametrize("baseline_llm_kwargs", [[]])
@pytest.mark.parametrize("test_llm_kwargs", [
[
"--speculative-model",
"JackFram/llama-68m",
"--num-speculative-tokens",
"3",
],
[
"--speculative-model",
"[ngram]",
"--num-speculative-tokens",
"5",
"--ngram-prompt-lookup-max",
"3",
],
])
@pytest.mark.parametrize("batch_size", [2])
@pytest.mark.parametrize(
    "output_len",
    [
        # Use smaller output len for fast test.
        32,
    ])
@pytest.mark.parametrize("seed", [1])
def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs,
                              baseline_llm_kwargs, test_llm_kwargs,
                              batch_size: int, output_len: int, seed: int):
    """Verify greedy equality when tensor parallelism is used.
    """
    if current_platform.is_rocm():
        pytest.skip("hip is not well-supported yet")
    run_equality_correctness_test_tp("JackFram/llama-68m",
                                     common_llm_kwargs,
                                     per_test_common_llm_kwargs,
                                     baseline_llm_kwargs,
                                     test_llm_kwargs,
                                     batch_size,
                                     output_len,
                                     seed,
                                     temperature=0.0)


@pytest.mark.skipif(torch.cuda.device_count() < 2,
                    reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize(
    "common_llm_kwargs",
    [[
        # Skip cuda graph recording for fast test.
        "--enforce-eager",
        "--tensor-parallel-size",
        "2",

        # precision
        "--dtype",
        "bfloat16",
    ]])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [[]])
@pytest.mark.parametrize("baseline_llm_kwargs", [[]])
@pytest.mark.parametrize("model, test_llm_kwargs",
[("JackFram/llama-68m", [
"--speculative-model",
"JackFram/llama-68m",
"--num_speculative-tokens",
"5",
"--speculative-draft-tensor-parallel-size",
"1",
]),
("ibm-granite/granite-3b-code-instruct", [
"--speculative-model",
"ibm-granite/granite-3b-code-instruct",
"--num_speculative-tokens",
"5",
"--speculative-draft-tensor-parallel-size",
"1",
])])
@pytest.mark.parametrize("batch_size", [2])
@pytest.mark.parametrize("seed", [1])
def test_draft_model_tp_lt_target_model_tp2(model, common_llm_kwargs,
                                            per_test_common_llm_kwargs,
                                            baseline_llm_kwargs,
                                            test_llm_kwargs, batch_size: int,
                                            seed: int):
    """Verify spec decode works well when the draft model uses a smaller
    tensor-parallel size than the target model.
    """
    run_equality_correctness_test_tp(model,
                                     common_llm_kwargs,
                                     per_test_common_llm_kwargs,
                                     baseline_llm_kwargs,
                                     test_llm_kwargs,
                                     batch_size,
                                     max_output_len=32,
                                     seed=seed,
                                     temperature=0.0)


@pytest.mark.skipif(torch.cuda.device_count() < 2,
                    reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize(
    "common_llm_kwargs",
    [[
        # Skip cuda graph recording for fast test.
        "--enforce-eager",
        "--tensor-parallel-size",
        "2",

        # precision
        "--dtype",
        "bfloat16",
    ]])
@pytest.mark.parametrize(
    "per_test_common_llm_kwargs",
    [["--enable-chunked-prefill", "False"],
     [
         "--enable-chunked-prefill", "True", "--max-num-batched-tokens", "4",
         "--max-num-seqs", "4"
     ]])
@pytest.mark.parametrize("baseline_llm_kwargs", [[]])
@pytest.mark.parametrize("model, test_llm_kwargs",
[("JackFram/llama-68m", [
"--speculative-model",
"JackFram/llama-68m",
"--num_speculative-tokens",
"3",
]),
("JackFram/llama-68m", [
"--speculative-model",
"JackFram/llama-68m",
"--num_speculative-tokens",
"3",
"--speculative-draft-tensor-parallel-size",
"1",
])])
@pytest.mark.parametrize("logprobs", [None, 2])
@pytest.mark.parametrize("batch_size", [2])
@pytest.mark.parametrize("seed", [1])
def test_spec_decode_chunked_prefill_tp2(model, common_llm_kwargs,
                                         per_test_common_llm_kwargs,
                                         baseline_llm_kwargs, test_llm_kwargs,
                                         logprobs: Optional[int],
                                         batch_size: int, seed: int):
    """Verify spec decode works well with chunked prefill, both when the
    draft model uses the same TP size as the target model and when it uses
    a smaller one.
    """
    if logprobs:
        test_llm_kwargs.extend(
            ["--disable-logprobs-during-spec-decoding", "False"])
    run_equality_correctness_test_tp(model,
                                     common_llm_kwargs,
                                     per_test_common_llm_kwargs,
                                     baseline_llm_kwargs,
                                     test_llm_kwargs,
                                     batch_size,
                                     max_output_len=32,
                                     seed=seed,
                                     temperature=0.0,
                                     logprobs=logprobs)