# vllm/tests/spec_decode/e2e/test_seed.py

import pytest

from .conftest import run_equality_correctness_test

# Main model; passed to the runner as `model_name`.
MAIN_MODEL = "JackFram/llama-68m"

# Speculative model; passed to the runner as `speculative_model`.
SPEC_MODEL = "JackFram/llama-160m"


@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        # Reference the module-level constants so the model IDs are
        # defined in exactly one place.
        "model_name": MAIN_MODEL,

        # Skip cuda graph recording for fast test.
        "enforce_eager": True,

        # speculative model
        "speculative_model": SPEC_MODEL,

        # num speculative tokens
        "num_speculative_tokens": 3,
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{"seed": 1}])
@pytest.mark.parametrize("test_llm_kwargs", [{"seed": 5}])
@pytest.mark.parametrize("batch_size", [1, 8, 32])
@pytest.mark.parametrize("temperature", [0.1, 1.0])
@pytest.mark.parametrize(
    "output_len",
    [
        # Use smaller output len for fast test.
        20,
    ])
def test_seeded_consistency(vllm_runner, common_llm_kwargs,
                            per_test_common_llm_kwargs, baseline_llm_kwargs,
                            test_llm_kwargs, batch_size: int,
                            temperature: float, output_len: int):
    """Verify outputs are consistent across multiple runs with same seed.

    The baseline and test configurations are given different ``seed``
    values through ``baseline_llm_kwargs`` and ``test_llm_kwargs``. The
    equality check is run twice with otherwise identical arguments:

    1. with ``disable_seed=False`` -- the check must pass;
    2. with ``disable_seed=True`` -- the check must raise
       ``AssertionError``.

    NOTE(review): what exactly is compared, and how ``disable_seed`` is
    applied, is defined by ``run_equality_correctness_test`` in conftest;
    going by the inline comment below, disabling it drops the per-request
    seeds.
    """
    # Everything except `disable_seed` is shared by both runs; building the
    # arguments once guarantees the two runs differ only in that flag.
    run_args = (
        vllm_runner,
        common_llm_kwargs,
        per_test_common_llm_kwargs,
        baseline_llm_kwargs,
        test_llm_kwargs,
        batch_size,
    )
    run_kwargs = {
        "max_output_len": output_len,
        "temperature": temperature,
    }

    run_equality_correctness_test(*run_args,
                                  **run_kwargs,
                                  disable_seed=False)

    # Ensure this same test does fail if we _don't_ include per-request seeds
    with pytest.raises(AssertionError):
        run_equality_correctness_test(*run_args,
                                      **run_kwargs,
                                      disable_seed=True)
[Bugfix] Make spec. decode respect per-request seed. (#6034) Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com> Co-authored-by: Nick Hill <nickhill@us.ibm.com> 2024-07-19 04:22:08 +02:00			`import pytest`

			`from .conftest import run_equality_correctness_test`

[Speculative Decoding] Test refactor (#8317) Co-authored-by: youkaichao <youkaichao@126.com> 2024-09-11 14:07:34 -07:00			`# main model`
			`MAIN_MODEL = "JackFram/llama-68m"`

			`# speculative model`
			`SPEC_MODEL = "JackFram/llama-160m"`

[Bugfix] Make spec. decode respect per-request seed. (#6034) Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com> Co-authored-by: Nick Hill <nickhill@us.ibm.com> 2024-07-19 04:22:08 +02:00
			`@pytest.mark.parametrize(`
			`"common_llm_kwargs",`
			`[{`
[Speculative Decoding] Test refactor (#8317) Co-authored-by: youkaichao <youkaichao@126.com> 2024-09-11 14:07:34 -07:00			`"model_name": "JackFram/llama-68m",`
[Bugfix] Make spec. decode respect per-request seed. (#6034) Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com> Co-authored-by: Nick Hill <nickhill@us.ibm.com> 2024-07-19 04:22:08 +02:00
			`# Skip cuda graph recording for fast test.`
			`"enforce_eager": True,`

			`# speculative model`
			`"speculative_model": "JackFram/llama-160m",`

			`# num speculative tokens`
			`"num_speculative_tokens": 3,`
			`}])`
			`@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])`
[Bugfix] Fix speculative decode seeded test (#6743) 2024-07-24 08:58:31 -07:00			`@pytest.mark.parametrize("baseline_llm_kwargs", [{"seed": 1}])`
			`@pytest.mark.parametrize("test_llm_kwargs", [{"seed": 5}])`
[Bugfix] Make spec. decode respect per-request seed. (#6034) Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com> Co-authored-by: Nick Hill <nickhill@us.ibm.com> 2024-07-19 04:22:08 +02:00			`@pytest.mark.parametrize("batch_size", [1, 8, 32])`
			`@pytest.mark.parametrize("temperature", [0.1, 1.0])`
			`@pytest.mark.parametrize(`
			`"output_len",`
			`[`
			`# Use smaller output len for fast test.`
[BugFix] Fix use of per-request seed with pipeline parallel (#6698) 2024-07-30 10:40:08 -07:00			`20,`
[Bugfix] Make spec. decode respect per-request seed. (#6034) Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com> Co-authored-by: Nick Hill <nickhill@us.ibm.com> 2024-07-19 04:22:08 +02:00			`])`
[Speculative Decoding] Test refactor (#8317) Co-authored-by: youkaichao <youkaichao@126.com> 2024-09-11 14:07:34 -07:00			`def test_seeded_consistency(vllm_runner, common_llm_kwargs,`
			`per_test_common_llm_kwargs, baseline_llm_kwargs,`
			`test_llm_kwargs, batch_size: int,`
			`temperature: float, output_len: int):`
[Bugfix] Make spec. decode respect per-request seed. (#6034) Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com> Co-authored-by: Nick Hill <nickhill@us.ibm.com> 2024-07-19 04:22:08 +02:00			`"""Verify outputs are consistent across multiple runs with same seed`
			`"""`
[Speculative Decoding] Test refactor (#8317) Co-authored-by: youkaichao <youkaichao@126.com> 2024-09-11 14:07:34 -07:00			`run_equality_correctness_test(`
			`vllm_runner,`
			`common_llm_kwargs,`
			`per_test_common_llm_kwargs,`
			`baseline_llm_kwargs,`
			`test_llm_kwargs,`
			`batch_size,`
			`max_output_len=output_len,`
			`temperature=temperature,`
			`disable_seed=False,`
			`)`
[Bugfix] Fix speculative decode seeded test (#6743) 2024-07-24 08:58:31 -07:00
			`# Ensure this same test does fail if we _don't_ include per-request seeds`
			`with pytest.raises(AssertionError):`
[Speculative Decoding] Test refactor (#8317) Co-authored-by: youkaichao <youkaichao@126.com> 2024-09-11 14:07:34 -07:00			`run_equality_correctness_test(`
			`vllm_runner,`
			`common_llm_kwargs,`
			`per_test_common_llm_kwargs,`
			`baseline_llm_kwargs,`
			`test_llm_kwargs,`
			`batch_size,`
			`max_output_len=output_len,`
			`temperature=temperature,`
			`disable_seed=True,`
			`)`