2024-04-27 19:30:08 +08:00
|
|
|
import openai # use the official client for correctness check
|
|
|
|
import pytest
|
2024-08-26 21:33:17 -07:00
|
|
|
import pytest_asyncio
|
2024-04-27 19:30:08 +08:00
|
|
|
|
2024-08-07 17:12:05 +08:00
|
|
|
from ..utils import VLLM_PATH, RemoteOpenAIServer
|
2024-05-13 22:50:09 +08:00
|
|
|
|
2024-04-27 19:30:08 +08:00
|
|
|
# any model with a chat template should work here
MODEL_NAME = "facebook/opt-125m"

# ChatML template shipped in the repo's examples directory; fail fast at
# import time if it has moved so the whole module errors out clearly.
chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
assert chatml_jinja_path.exists()
|
2024-04-27 19:30:08 +08:00
|
|
|
|
|
|
|
|
2024-05-13 22:50:09 +08:00
|
|
|
@pytest.fixture(scope="module")
def server():
    """Launch a module-scoped vLLM OpenAI-compatible server using the
    Ray-backed engine, and yield the running server handle."""
    cli_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "float16",
        "--max-model-len",
        "2048",
        "--enforce-eager",
        "--engine-use-ray",
        "--chat-template",
        str(chatml_jinja_path),
    ]

    # `--engine-use-ray` is deprecated; without this opt-in env var the
    # server refuses to launch and the fixture would error out.
    with RemoteOpenAIServer(
            MODEL_NAME, cli_args,
            env_dict={"VLLM_ALLOW_ENGINE_USE_RAY": "1"}) as remote_server:
        yield remote_server
|
2024-04-27 19:30:08 +08:00
|
|
|
|
|
|
|
|
2024-08-26 21:33:17 -07:00
|
|
|
@pytest_asyncio.fixture
async def client(server):
    """Yield an async OpenAI client bound to the test server."""
    async with server.get_async_client() as c:
        yield c
|
2024-04-27 19:30:08 +08:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_check_models(client: openai.AsyncOpenAI):
    """The models endpoint lists the served model under the expected id."""
    listing = await client.models.list()
    available = listing.data

    assert available[0].id == MODEL_NAME
    # every listed entry should be rooted at the served model
    for entry in available:
        assert entry.root == MODEL_NAME
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_single_completion(client: openai.AsyncOpenAI):
    """Greedy completion works for both a string prompt and a token-ID
    prompt, and reports exact token usage."""
    # string prompt
    result = await client.completions.create(
        model=MODEL_NAME,
        prompt="Hello, my name is",
        max_tokens=5,
        temperature=0.0,
    )

    assert result.id is not None
    assert len(result.choices) == 1

    choice = result.choices[0]
    assert len(choice.text) >= 5
    # max_tokens was hit, so the finish reason must be "length"
    assert choice.finish_reason == "length"
    expected_usage = openai.types.CompletionUsage(completion_tokens=5,
                                                  prompt_tokens=6,
                                                  total_tokens=11)
    assert result.usage == expected_usage

    # test using token IDs
    result = await client.completions.create(
        model=MODEL_NAME,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
    )
    assert len(result.choices[0].text) >= 5
|
2024-04-27 19:30:08 +08:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_single_chat_session(client: openai.AsyncOpenAI):
    """Chat completion works for a single turn (with logprobs) and for a
    follow-up turn in the same conversation."""
    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {"role": "user", "content": "what is 1+1?"},
    ]

    # first turn: also exercise the logprobs code path
    response = await client.chat.completions.create(model=MODEL_NAME,
                                                    messages=messages,
                                                    max_tokens=10,
                                                    logprobs=True,
                                                    top_logprobs=5)
    assert response.id is not None
    assert len(response.choices) == 1

    first_choice = response.choices[0]
    # max_tokens was hit, so the finish reason must be "length"
    assert first_choice.finish_reason == "length"
    assert response.usage == openai.types.CompletionUsage(
        completion_tokens=10, prompt_tokens=55, total_tokens=65)

    reply = first_choice.message
    assert reply.role == "assistant"
    assert reply.content is not None and len(reply.content) >= 10
    messages.append({"role": "assistant", "content": reply.content})

    # second turn: continue the same dialogue
    messages.append({"role": "user", "content": "express your result in json"})
    response = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_tokens=10,
    )
    followup = response.choices[0].message
    # NOTE(review): `len(...) >= 0` is vacuously true for any string, so the
    # effective check here is only that content is not None.
    assert followup.content is not None and len(followup.content) >= 0
|