# SPDX-License-Identifier: Apache-2.0
"""
|
|
|
|
This file test accuracy of the vLLM server via LMEval.
|
|
|
|
It uses local-completions, which interacts with vLLM
|
|
|
|
through the OAI API with N concurrent connections.
|
|
|
|
This simulates real work usage of the API and makes
|
|
|
|
sure that the zmq frontend mp RPC message passing and
|
|
|
|
AsyncLLMEngine are working correctly.
|
|
|
|
"""
import lm_eval
import pytest

from vllm.platforms import current_platform

from ....utils import RemoteOpenAIServer

MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"
NUM_CONCURRENT = 500
TASK = "gsm8k"
FILTER = "exact_match,strict-match"
RTOL = 0.03
EXPECTED_VALUE = 0.54
DEFAULT_ARGS = ["--max-model-len", "4096", "--disable-log-requests"]
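# FILTER is the key read from LMEval's results dict below: the "exact_match"
# metric computed under gsm8k's "strict-match" answer filter.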
MORE_ARGS_LIST = [
    [],  # Default
    ["--enable-chunked-prefill"],  # Chunked
    ["--num-scheduler-steps", "8"],  # MS
    ["--num-scheduler-steps", "8", "--multi-step-stream-outputs"]  # MS+Stream
]
MAX_WAIT_SECONDS = None

if current_platform.is_tpu():
    MORE_ARGS_LIST = [
        [],  # Default
        # ["--num-scheduler-steps", "8"],  # Multi-step << currently fails
    ]
    MAX_WAIT_SECONDS = 600


def run_test(more_args):
    """Run the end-to-end accuracy test."""

    args = list(DEFAULT_ARGS)
    args.extend(more_args)
    print(f"Running with: {args}")

    with RemoteOpenAIServer(
            MODEL_NAME, args,
            max_wait_seconds=MAX_WAIT_SECONDS) as remote_server:
        url = f"{remote_server.url_for('v1')}/completions"

        model_args = (
            f"model={MODEL_NAME},"
            f"base_url={url},"
            f"num_concurrent={NUM_CONCURRENT},tokenized_requests=False")

        results = lm_eval.simple_evaluate(
            model="local-completions",
            model_args=model_args,
            tasks=TASK,
        )

        measured_value = results["results"][TASK][FILTER]
        assert (measured_value - RTOL < EXPECTED_VALUE
                and measured_value + RTOL > EXPECTED_VALUE
                ), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"


@pytest.mark.skipif(not current_platform.is_cuda()
                    and not current_platform.is_tpu(),
                    reason="V1 currently only supported on CUDA and TPU")
def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch):
    """Run with the V1 Engine."""

    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")
        more_args = []

        # Limit compilation time for V1
        if current_platform.is_tpu():
            more_args = ["--max-num-seqs", "64"]

        run_test(more_args)


@pytest.mark.parametrize("more_args", MORE_ARGS_LIST)
def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch,
                                    more_args):
    """Run with the V0 Engine."""

    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "0")
        run_test(more_args)