# SPDX-License-Identifier: Apache-2.0
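"""Tests for the OpenAI-compatible tokenize and detokenize endpoints, run
against both the base model and a LoRA adapter with added tokens."""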

import pytest
import pytest_asyncio
import requests

from vllm.transformers_utils.tokenizer import get_tokenizer

from ...utils import RemoteOpenAIServer
from .test_completion import zephyr_lora_added_tokens_files  # noqa: F401
from .test_completion import zephyr_lora_files  # noqa: F401

# any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"


@pytest.fixture(scope="module")
def server(zephyr_lora_added_tokens_files: str):  # noqa: F811
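    """Spin up an OpenAI-compatible server with a LoRA adapter registered."""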
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--enforce-eager",
        "--max-num-seqs",
        "128",
        # lora config
        "--enable-lora",
        "--lora-modules",
        f"zephyr-lora2={zephyr_lora_added_tokens_files}",
        "--max-lora-rank",
        "64",
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server


@pytest.fixture(scope="module")
def tokenizer_name(model_name: str,
                   zephyr_lora_added_tokens_files: str):  # noqa: F811
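    """Pick the LoRA tokenizer for zephyr-lora2, else the base model's."""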
    return zephyr_lora_added_tokens_files if (
        model_name == "zephyr-lora2") else model_name


@pytest_asyncio.fixture
async def client(server):
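    """An async OpenAI client connected to the running server."""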
    async with server.get_async_client() as async_client:
        yield async_client


@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name,tokenizer_name",
    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
    indirect=["tokenizer_name"],
)
async def test_tokenize_completions(
    server: RemoteOpenAIServer,
    model_name: str,
    tokenizer_name: str,
):
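    """The tokenize endpoint should match the local tokenizer's encoding of
    a plain prompt, with and without special tokens."""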
    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
                              tokenizer_mode="fast")

    for add_special in [False, True]:
        prompt = "vllm1 This is a test prompt."
        tokens = tokenizer.encode(prompt, add_special_tokens=add_special)

        response = requests.post(server.url_for("tokenize"),
                                 json={
                                     "add_special_tokens": add_special,
                                     "model": model_name,
                                     "prompt": prompt
                                 })
        response.raise_for_status()

        assert response.json() == {
            "tokens": tokens,
            "count": len(tokens),
            "max_model_len": 8192
        }


@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name,tokenizer_name",
    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
    indirect=["tokenizer_name"],
)
async def test_tokenize_chat(
    server: RemoteOpenAIServer,
    model_name: str,
    tokenizer_name: str,
):
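    """The tokenize endpoint should match the locally applied chat template
    across generation-prompt, continuation, and special-token options."""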
    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
                              tokenizer_mode="fast")

    for add_generation in [False, True]:
        for add_special in [False, True]:
            conversation = [{
                "role": "user",
                "content": "Hi there!"
            }, {
                "role": "assistant",
                "content": "Nice to meet you!"
            }, {
                "role": "user",
                "content": "Can I ask a question? vllm1"
            }]
            for continue_final in [False, True]:
                if add_generation and continue_final:
                    continue
                if continue_final:
                    conversation.append({
                        "role": "assistant",
                        "content": "Sure,"
                    })

                prompt = tokenizer.apply_chat_template(
                    add_generation_prompt=add_generation,
                    continue_final_message=continue_final,
                    conversation=conversation,
                    tokenize=False)
                tokens = tokenizer.encode(prompt,
                                          add_special_tokens=add_special)

                response = requests.post(server.url_for("tokenize"),
                                         json={
                                             "add_generation_prompt":
                                             add_generation,
                                             "continue_final_message":
                                             continue_final,
                                             "add_special_tokens": add_special,
                                             "messages": conversation,
                                             "model": model_name
                                         })
                response.raise_for_status()

                assert response.json() == {
                    "tokens": tokens,
                    "count": len(tokens),
                    "max_model_len": 8192
                }


@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name,tokenizer_name",
    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
    indirect=["tokenizer_name"],
)
async def test_detokenize(
    server: RemoteOpenAIServer,
    model_name: str,
    tokenizer_name: str,
):
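    """The detokenize endpoint should reconstruct the original prompt."""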
    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
                              tokenizer_mode="fast")

    prompt = "This is a test prompt. vllm1"
    tokens = tokenizer.encode(prompt, add_special_tokens=False)

    response = requests.post(server.url_for("detokenize"),
                             json={
                                 "model": model_name,
                                 "tokens": tokens
                             })
    response.raise_for_status()

    assert response.json() == {"prompt": prompt}