[V1] Set structured output backend to auto by default (#15724)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
parent 0c54fc7273, commit 9665313c39
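
Context for the diff below: with V1 now defaulting the structured-output backend to "auto", a client no longer needs to pass guided_decoding_backend through extra_body, which is exactly what the test changes remove. A minimal usage sketch, not part of this commit (the base_url, api_key, prompt, and choices are illustrative assumptions; the model name is the one the tests use):

import asyncio

import openai


async def main():
    # Assumes a vLLM OpenAI-compatible server is already running locally.
    client = openai.AsyncOpenAI(base_url="http://localhost:8000/v1",
                                api_key="EMPTY")
    chat_completion = await client.chat.completions.create(
        model="HuggingFaceH4/zephyr-7b-beta",
        messages=[{"role": "user", "content": "Pick a color."}],
        max_completion_tokens=10,
        # No guided_decoding_backend needed; the server-side default is "auto".
        extra_body=dict(guided_choice=["red", "green", "blue"]))
    print(chat_completion.choices[0].message.content)


asyncio.run(main())
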
@@ -20,8 +20,6 @@ from .test_completion import zephyr_lora_files # noqa: F401
 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
 
-GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
-
 
 @pytest.fixture(scope="module")
 def monkeypatch_module():
@@ -487,20 +485,9 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
     assert last_completion_tokens == 10
 
 
-# NOTE: Not sure why, but when I place this after `test_guided_regex_chat`
-# (i.e. using the same ordering as in the Completions API tests), the test
-# will fail on the second `guided_decoding_backend` even when I swap their order
-# (ref: https://github.com/vllm-project/vllm/pull/5526#issuecomment-2173772256)
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
 async def test_guided_choice_chat(client: openai.AsyncOpenAI,
-                                  is_v1_server: bool,
-                                  guided_decoding_backend: str,
                                   sample_guided_choice):
-
-    if is_v1_server and guided_decoding_backend != 'xgrammar':
-        pytest.skip("Only xgrammar backend is supported with V1")
-
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -515,8 +502,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
         messages=messages,
         max_completion_tokens=10,
         temperature=0.7,
-        extra_body=dict(guided_choice=sample_guided_choice,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(guided_choice=sample_guided_choice))
     choice1 = chat_completion.choices[0].message.content
     assert choice1 in sample_guided_choice
 
@@ -530,22 +516,16 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
         messages=messages,
         max_completion_tokens=10,
         temperature=0.7,
-        extra_body=dict(guided_choice=sample_guided_choice,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(guided_choice=sample_guided_choice))
     choice2 = chat_completion.choices[0].message.content
     assert choice2 in sample_guided_choice
     assert choice1 != choice2
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
-async def test_guided_json_chat(client: openai.AsyncOpenAI, is_v1_server: bool,
-                                guided_decoding_backend: str,
+async def test_guided_json_chat(client: openai.AsyncOpenAI,
                                 sample_json_schema):
 
-    if is_v1_server:
-        pytest.skip("sample_json_schema has features unsupported in V1")
-
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -560,8 +540,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, is_v1_server: bool,
         model=MODEL_NAME,
         messages=messages,
         max_completion_tokens=1000,
-        extra_body=dict(guided_json=sample_json_schema,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(guided_json=sample_json_schema))
     message = chat_completion.choices[0].message
     assert message.content is not None
     json1 = json.loads(message.content)
@@ -578,8 +557,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, is_v1_server: bool,
         model=MODEL_NAME,
         messages=messages,
         max_completion_tokens=1000,
-        extra_body=dict(guided_json=sample_json_schema,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(guided_json=sample_json_schema))
     message = chat_completion.choices[0].message
     assert message.content is not None
     json2 = json.loads(message.content)
@@ -589,13 +567,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, is_v1_server: bool,
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
-async def test_guided_regex_chat(client: openai.AsyncOpenAI,
-                                 is_v1_server: bool,
-                                 guided_decoding_backend: str, sample_regex):
-
-    if is_v1_server and guided_decoding_backend != 'xgrammar':
-        pytest.skip("Only xgrammar backend is supported with V1")
+async def test_guided_regex_chat(client: openai.AsyncOpenAI, sample_regex):
 
     messages = [{
         "role": "system",
@@ -610,8 +582,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI,
         model=MODEL_NAME,
         messages=messages,
         max_completion_tokens=20,
-        extra_body=dict(guided_regex=sample_regex,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(guided_regex=sample_regex))
     ip1 = chat_completion.choices[0].message.content
     assert ip1 is not None
     assert re.fullmatch(sample_regex, ip1) is not None
@@ -622,8 +593,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI,
         model=MODEL_NAME,
         messages=messages,
         max_completion_tokens=20,
-        extra_body=dict(guided_regex=sample_regex,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(guided_regex=sample_regex))
     ip2 = chat_completion.choices[0].message.content
     assert ip2 is not None
     assert re.fullmatch(sample_regex, ip2) is not None
@@ -652,15 +622,9 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI):
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
 async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
-                                           is_v1_server: bool,
-                                           guided_decoding_backend: str,
                                            sample_guided_choice):
 
-    if is_v1_server and guided_decoding_backend != 'xgrammar':
-        pytest.skip("Only xgrammar backend is supported with V1")
-
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -676,8 +640,7 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
         max_completion_tokens=10,
         logprobs=True,
         top_logprobs=5,
-        extra_body=dict(guided_choice=sample_guided_choice,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(guided_choice=sample_guided_choice))
 
     assert chat_completion.choices[0].logprobs is not None
     assert chat_completion.choices[0].logprobs.content is not None
@@ -689,14 +652,7 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
-async def test_named_tool_use(client: openai.AsyncOpenAI, is_v1_server: bool,
-                              guided_decoding_backend: str,
-                              sample_json_schema):
-
-    if is_v1_server:
-        pytest.skip("sample_json_schema has features unsupported on V1")
-
+async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema):
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -728,7 +684,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, is_v1_server: bool,
                 "name": "dummy_function_name"
             }
         },
-        extra_body=dict(guided_decoding_backend=guided_decoding_backend))
+    )
     message = chat_completion.choices[0].message
     assert len(message.content) == 0
     json_string = message.tool_calls[0].function.arguments
@@ -763,7 +719,6 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, is_v1_server: bool,
                 "name": "dummy_function_name"
             }
         },
-        extra_body=dict(guided_decoding_backend=guided_decoding_backend),
         stream=True)
 
     output = []
@@ -888,7 +843,6 @@ async def test_required_tool_use(client: openai.AsyncOpenAI,
         model=model_name,
         tools=tools,
         tool_choice="required",
-        extra_body=dict(guided_decoding_backend="outlines"),
     )
 
     assert chat_completion.choices[0].message.tool_calls is not None
@@ -900,7 +854,6 @@ async def test_required_tool_use(client: openai.AsyncOpenAI,
         model=model_name,
         tools=tools,
         tool_choice="required",
-        extra_body=dict(guided_decoding_backend="outlines"),
         stream=True,
     )
 
@@ -914,12 +867,7 @@ async def test_required_tool_use(client: openai.AsyncOpenAI,
 
 @pytest.mark.asyncio
 async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI,
-                                                  is_v1_server: bool,
                                                   sample_json_schema):
-
-    if is_v1_server:
-        pytest.skip("sample_json_schema has features unsupported on V1")
-
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"

@@ -2976,7 +2976,7 @@ class DecodingConfig:
 
     # Which guided decoding algo to use.
    # 'outlines' / 'lm-format-enforcer' / 'xgrammar'
-    guided_decoding_backend: str = 'xgrammar'
+    guided_decoding_backend: str = "auto" if envs.VLLM_USE_V1 else "xgrammar"
 
     reasoning_backend: Optional[str] = None
 
@@ -3001,7 +3001,7 @@ class DecodingConfig:
 
     def __post_init__(self):
         v0_valid_guided_backends = [
-            'outlines', 'lm-format-enforcer', 'xgrammar'
+            'outlines', 'lm-format-enforcer', 'xgrammar', 'auto'
         ]
         v1_valid_guided_backends = ['xgrammar', 'guidance', 'auto']
 

@@ -182,7 +182,7 @@ class EngineArgs:
     enable_chunked_prefill: Optional[bool] = None
     disable_chunked_mm_input: bool = False
 
-    guided_decoding_backend: str = 'xgrammar'
+    guided_decoding_backend: str = DecodingConfig.guided_decoding_backend
     logits_processor_pattern: Optional[str] = None
 
     speculative_config: Optional[Dict[str, Any]] = None
@@ -407,13 +407,13 @@ class EngineArgs:
         parser.add_argument(
             '--guided-decoding-backend',
             type=str,
-            default='xgrammar',
+            default=DecodingConfig.guided_decoding_backend,
             help='Which engine will be used for guided decoding'
             ' (JSON schema / regex etc) by default. Currently support '
             'https://github.com/mlc-ai/xgrammar and '
             'https://github.com/guidance-ai/llguidance.'
             'Valid backend values are "xgrammar", "guidance", and "auto". '
-            'With "auto", we will make opinionated choices based on request'
+            'With "auto", we will make opinionated choices based on request '
             'contents and what the backend libraries currently support, so '
             'the behavior is subject to change in each release.')
         parser.add_argument(
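
The same default flows through EngineArgs and the --guided-decoding-backend flag above, so offline use picks it up as well. A rough sketch of pinning the backend instead of relying on "auto" (assumptions: the LLM constructor forwards engine keyword arguments such as guided_decoding_backend, and the model, prompt, and choices are placeholders):

from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams

# Pin the backend explicitly instead of relying on the new "auto" default.
llm = LLM(model="HuggingFaceH4/zephyr-7b-beta",
          guided_decoding_backend="xgrammar")

params = SamplingParams(
    max_tokens=10,
    guided_decoding=GuidedDecodingParams(choice=["red", "green", "blue"]))
outputs = llm.generate(["Pick a color."], params)
print(outputs[0].outputs[0].text)
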
@@ -33,6 +33,12 @@ def maybe_backend_fallback(
         logger.warning("%s Falling back to use %s instead.", message, fallback)
         guided_params.backend = fallback
 
+    # `auto` was added for V1 to explicitly declare a mode that has fallbacks
+    # in place. If that is specified with V0, treat it as `xgrammar`, as we have
+    # fallbacks enabled for that and it is the V0 default.
+    if guided_params.backend == "auto":
+        guided_params.backend = "xgrammar"
+
     # lm-format-enforce doesn't support grammar, fallback to xgrammar
     if guided_params.backend_name == "lm-format-enforcer":
         if guided_params.grammar is not None:
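
The block added above maps a V0 request that asks for "auto" onto "xgrammar" before the existing fallback chain runs. A simplified, standalone restatement of that step (not the vLLM source; the dataclass is a stand-in for GuidedDecodingParams):

from dataclasses import dataclass
from typing import Optional


@dataclass
class FakeGuidedParams:
    # Stand-in for vllm.sampling_params.GuidedDecodingParams.
    backend: Optional[str] = "auto"


def resolve_backend_v0(params: FakeGuidedParams) -> FakeGuidedParams:
    # "auto" is a V1 notion; V0 treats it as its own default, xgrammar,
    # which already has fallback handling wired up.
    if params.backend == "auto":
        params.backend = "xgrammar"
    return params


print(resolve_backend_v0(FakeGuidedParams()).backend)  # xgrammar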