2025-02-02 14:58:18 -05:00
|
|
|
# SPDX-License-Identifier: Apache-2.0
|
|
|
|
|
2025-03-17 11:35:57 +08:00
|
|
|
import pytest
|
2024-09-18 09:56:58 -04:00
|
|
|
|
2024-11-16 18:02:14 -08:00
|
|
|
from vllm.config import CompilationLevel
|
2024-10-10 12:39:36 -07:00
|
|
|
|
2024-08-28 16:10:12 -07:00
|
|
|
from ..utils import compare_two_settings
|
|
|
|
|
2024-09-18 09:56:58 -04:00
|
|
|
# --enforce-eager on TPU causes graph compilation
|
|
|
|
# which times out the default health check in the MQLLMEngine,
|
|
|
|
# so we set the timeout here to 30s
|
|
|
|
|
2024-08-28 16:10:12 -07:00
|
|
|
|
2025-03-17 11:35:57 +08:00
|
|
|
def test_custom_dispatcher(monkeypatch: pytest.MonkeyPatch):
    """Call ``compare_two_settings`` with two ``-O`` compilation levels.

    Both argument lists share the same model-length, sequence-count and
    ``--enforce-eager`` flags; they differ only in the ``-O`` flag, which is
    built from ``CompilationLevel.DYNAMO_ONCE`` for the first setting and
    ``CompilationLevel.DYNAMO_AS_IS`` for the second.

    ``VLLM_RPC_TIMEOUT`` is set to ``"30000"`` for the duration of the call
    (the module-level comment describes this as a 30s timeout) and is
    restored when the monkeypatch context exits.
    """
    # Flags that are identical for both settings under comparison.
    shared_flags = [
        "--max-model-len=256",
        "--max-num-seqs=32",
        "--enforce-eager",
    ]

    with monkeypatch.context() as patched_env:
        patched_env.setenv("VLLM_RPC_TIMEOUT", "30000")

        # Only the compilation-level flag differs between the two runs.
        dynamo_once_args = [
            *shared_flags,
            f"-O{CompilationLevel.DYNAMO_ONCE}",
        ]
        dynamo_as_is_args = [
            *shared_flags,
            f"-O{CompilationLevel.DYNAMO_AS_IS}",
        ]

        compare_two_settings(
            "Qwen/Qwen2.5-1.5B-Instruct",
            arg1=dynamo_once_args,
            arg2=dynamo_as_is_args,
            env1={},
            env2={},
        )
|