2025-02-02 14:58:18 -05:00
|
|
|
# SPDX-License-Identifier: Apache-2.0
|
2024-11-11 18:05:38 -05:00
|
|
|
"""
|
|
|
|
This file tests the accuracy of the vLLM server via LMEval.
|
|
|
|
It uses local-completions, which interacts with vLLM
|
|
|
|
through the OAI API with N concurrent connections.
|
|
|
|
This simulates real-world usage of the API and makes
|
|
|
|
sure that the zmq frontend mp RPC message passing and
|
|
|
|
AsyncLLMEngine are working correctly.
|
|
|
|
"""
|
|
|
|
|
|
|
|
import lm_eval
|
|
|
|
import pytest
|
|
|
|
|
|
|
|
from vllm.platforms import current_platform
|
|
|
|
|
2025-04-03 14:23:28 -07:00
|
|
|
# Models evaluated by the accuracy tests below; each entry must have a
# matching key in EXPECTED_VALUES.
MODEL_NAMES = [
    "Qwen/Qwen2-1.5B-Instruct",
    "google/gemma-3-1b-it",
]
# Number of concurrent client connections (see module docstring).
# NOTE(review): not referenced anywhere in this chunk — presumably used by
# the local-completions setup elsewhere, or left over; verify.
NUM_CONCURRENT = 500
# lm-eval task name, also used as the key into results["results"].
TASK = "gsm8k"
# Metric key in the form "<metric>,<filter>" for the gsm8k task.
FILTER = "exact_match,strict-match"
# Absolute tolerance around the expected accuracy value.
RTOL = 0.03
# Expected gsm8k accuracy (exact-match, strict filter) per model.
EXPECTED_VALUES = {
    "Qwen/Qwen2-1.5B-Instruct": 0.58,
    "google/gemma-3-1b-it": 0.25,
}
|
2024-11-11 18:05:38 -05:00
|
|
|
|
|
|
|
|
2025-04-03 14:23:28 -07:00
|
|
|
def run_test(model_name, more_args=None):
    """Run the end to end accuracy test.

    Args:
        model_name: HF model identifier to evaluate; must be a key of
            ``EXPECTED_VALUES``.
        more_args: Optional extra ``key=value`` pairs (comma separated)
            appended to the lm-eval ``model_args`` string.

    Raises:
        AssertionError: If the model has no expected value, or the measured
            accuracy is not strictly within ``RTOL`` of the expected one.
    """
    model_args = f"pretrained={model_name},max_model_len=4096"
    if more_args is not None:
        model_args = f"{model_args},{more_args}"

    results = lm_eval.simple_evaluate(
        model="vllm",
        model_args=model_args,
        # Use the TASK constant (was hard-coded "gsm8k", which could
        # silently diverge from the results lookup below if TASK changed).
        tasks=TASK,
        batch_size="auto",
    )

    measured_value = results["results"][TASK][FILTER]
    assert model_name in EXPECTED_VALUES, (
        f"Cannot find the expected value for the model {model_name=}")
    expected_value = EXPECTED_VALUES[model_name]
    # Equivalent to the original two-sided check:
    # measured - RTOL < expected and measured + RTOL > expected.
    assert abs(measured_value - expected_value) < RTOL, (
        f"Expected: {expected_value} | Measured: {measured_value}")
|
2024-11-11 18:05:38 -05:00
|
|
|
|
|
|
|
|
2025-03-08 08:19:38 -05:00
|
|
|
# TODO: [AlexM] Fix it with new CI/CD tests
# Extra tensor-parallel args for the TPU V1 run. Currently disabled (empty
# string), which makes the TP-append branch in the V1 test a no-op.
TPU_TP_TEST_STR = ""  # "tensor_parallel_size=4"
|
|
|
|
|
|
|
|
|
2025-02-14 03:21:53 -05:00
|
|
|
@pytest.mark.skipif(not current_platform.is_cuda()
                    and not current_platform.is_tpu(),
                    reason="V1 is currently only supported on CUDA and TPU")
@pytest.mark.parametrize("model", MODEL_NAMES)
def test_lm_eval_accuracy_v1_engine(model, monkeypatch: pytest.MonkeyPatch):
    """Run with the V1 Engine."""
    with monkeypatch.context() as ctx:
        ctx.setenv("VLLM_USE_V1", "1")

        extra_args = None
        if current_platform.is_tpu():
            # Limit compilation time for TPU V1
            extra_args = "max_model_len=2048,max_num_seqs=64"

            # Add TP test (if provided)
            if TPU_TP_TEST_STR:
                extra_args = f"{extra_args},{TPU_TP_TEST_STR}"

        run_test(model, extra_args)
|
2024-11-11 18:05:38 -05:00
|
|
|
|
|
|
|
|
2025-03-17 11:35:57 +08:00
|
|
|
def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch):
    """Run with the V0 Engine."""
    with monkeypatch.context() as ctx:
        # Force the legacy V0 engine for this evaluation.
        ctx.setenv("VLLM_USE_V1", "0")
        run_test("Qwen/Qwen2-1.5B-Instruct")
|