# SPDX-License-Identifier: Apache-2.0
"""Tests which cover integration of the speculative decoding framework with
other features, e.g. cuda graphs.
"""

import pytest

from .conftest import run_equality_correctness_test

MAIN_MODEL = "JackFram/llama-68m"
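
# NOTE: `run_equality_correctness_test` (from the local conftest) runs the
# same prompts through a baseline LLM and a test LLM built from the merged
# kwargs and, as its name and usage here suggest, asserts that their greedy
# (temperature=0.0) outputs match.
#
# For orientation, the first test below corresponds roughly to the following
# standalone vLLM usage (a sketch only, not executed as part of the suite):
#
#     from vllm import LLM, SamplingParams
#
#     llm = LLM(model=MAIN_MODEL,
#               enforce_eager=False,
#               speculative_config={
#                   "model": "JackFram/llama-68m",
#                   "num_speculative_tokens": 5,
#               })
#     outputs = llm.generate(["Hello, my name is"],
#                            SamplingParams(temperature=0.0, max_tokens=32))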


@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        # Verify equality when cuda graphs allowed.
        "enforce_eager": False,
        "model_name": "JackFram/llama-68m",
    }])
@pytest.mark.parametrize(
    "per_test_common_llm_kwargs",
    [
        {
            # Identical models.
            "speculative_config": {
                "model": "JackFram/llama-68m",
                "num_speculative_tokens": 5,
            },
        },
    ])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [{}])
@pytest.mark.parametrize("batch_size", [8])
@pytest.mark.parametrize("output_len", [32])
@pytest.mark.parametrize("seed", [1])
def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
                                per_test_common_llm_kwargs,
                                baseline_llm_kwargs, test_llm_kwargs,
                                batch_size: int, output_len: int, seed: int):
    """Verify spec decode equality when cuda graphs are enabled.
    """
    run_equality_correctness_test(vllm_runner,
                                  common_llm_kwargs,
                                  per_test_common_llm_kwargs,
                                  baseline_llm_kwargs,
                                  test_llm_kwargs,
                                  batch_size,
                                  max_output_len=output_len,
                                  seed=seed,
                                  temperature=0.0)


@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        "model_name": "JackFram/llama-160m",

        # Skip cuda graph recording for fast test.
        "enforce_eager": True,
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize(
    "test_llm_kwargs",
    [
        # Explicitly specify draft model quantization
        {
            "speculative_config": {
                "model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit",
                "num_speculative_tokens": 5,
                "quantization": "gptq",
            },
        },
        # Explicitly specify GPTQ-based draft model to use marlin quantization
        {
            "speculative_config": {
                "model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit",
                "num_speculative_tokens": 5,
                "quantization": "marlin",
            },
        },
        # Do not explicitly specify draft model quantization
        {
            "speculative_config": {
                "model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit",
                "num_speculative_tokens": 5,
                "quantization": None,
            },
        },
    ])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("batch_size", [2])
@pytest.mark.parametrize("seed", [1])
def test_speculative_model_quantization_config(vllm_runner, common_llm_kwargs,
                                               per_test_common_llm_kwargs,
                                               baseline_llm_kwargs,
                                               test_llm_kwargs,
                                               batch_size: int, seed: int):
    """Verify spec decode works well with draft model quantization configs.
    """
    run_equality_correctness_test(vllm_runner,
                                  common_llm_kwargs,
                                  per_test_common_llm_kwargs,
                                  baseline_llm_kwargs,
                                  test_llm_kwargs,
                                  batch_size,
                                  max_output_len=32,
                                  seed=seed,
                                  temperature=0.0)


@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        "model_name": MAIN_MODEL,

        # Skip cuda graph recording for fast test.
        "enforce_eager": True,
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [{
    "speculative_config": {
        "model": "JackFram/llama-68m",
        "num_speculative_tokens": 3,
        "disable_mqa_scorer": True,
    },
}])
@pytest.mark.parametrize("batch_size", [1, 5])
@pytest.mark.parametrize(
    "output_len",
    [
        # Use smaller output len for fast test.
        32,
    ])
@pytest.mark.parametrize("seed", [1])
def test_mqa_scorer(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
                    baseline_llm_kwargs, test_llm_kwargs, batch_size: int,
                    output_len: int, seed: int):
    """Verify that speculative decoding generates the same output
    with batch expansion scorer and mqa scorer.
    """
    run_equality_correctness_test(vllm_runner,
                                  common_llm_kwargs,
                                  per_test_common_llm_kwargs,
                                  baseline_llm_kwargs,
                                  test_llm_kwargs,
                                  batch_size,
                                  max_output_len=output_len,
                                  seed=seed,
                                  temperature=0.0)