
Co-authored-by: Nick Hill <nickhill@us.ibm.com> Co-authored-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com> Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Co-authored-by: Simon Mo <simon.mo@hey.com>
17 lines
524 B
Python
17 lines
524 B
Python
import os
|
|
|
|
from ..utils import compare_two_settings
|
|
|
|
# Running with --enforce-eager on TPU triggers graph compilation, which
# takes long enough to time out the default health check in the
# MQLLMEngine, so the RPC timeout is raised to 30s here.
# NOTE(review): the value is presumably in milliseconds (30000 == 30s);
# confirm against the VLLM_RPC_TIMEOUT definition.
os.environ["VLLM_RPC_TIMEOUT"] = "30000"
|
|
|
|
|
|
def test_custom_dispatcher():
    """Compare two settings for google/gemma-2b that differ only in env.

    Both settings pass the same ``--enforce-eager`` argument. The first
    sets ``VLLM_DYNAMO_USE_CUSTOM_DISPATCHER`` to ``"0"``; the second
    supplies no environment overrides.
    """
    # NOTE(review): presumably compare_two_settings checks that both
    # settings produce matching results, so this guards against the custom
    # dispatcher changing outputs -- confirm against ..utils.
    compare_two_settings(
        "google/gemma-2b",
        arg1=["--enforce-eager"],
        arg2=["--enforce-eager"],
        env1={"VLLM_DYNAMO_USE_CUSTOM_DISPATCHER": "0"},
        env2={},
    )
|