2024-04-27 19:30:08 +08:00
|
|
|
import openai # use the official client for correctness check
|
|
|
|
import pytest
|
2024-08-26 21:33:17 -07:00
|
|
|
import pytest_asyncio
|
2024-04-27 19:30:08 +08:00
|
|
|
|
2024-08-07 17:12:05 +08:00
|
|
|
from ..utils import VLLM_PATH, RemoteOpenAIServer
|
2024-05-13 22:50:09 +08:00
|
|
|
|
2024-04-27 19:30:08 +08:00
|
|
|
# any model with a chat template should work here
MODEL_NAME = "facebook/opt-125m"

# ChatML template shipped in the repo's examples directory; fail fast at
# import time if it has moved so the whole module errors out clearly.
chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
assert chatml_jinja_path.exists()
|
2024-04-27 19:30:08 +08:00
|
|
|
|
|
|
|
|
2024-05-13 22:50:09 +08:00
|
|
|
@pytest.fixture(scope="module")
def server():
    """Launch a module-scoped vLLM OpenAI-compatible server using the
    Ray-backed engine, and yield the running server handle."""
    cli_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "float16",
        "--max-model-len",
        "2048",
        "--enforce-eager",
        "--engine-use-ray",
        "--chat-template",
        str(chatml_jinja_path),
    ]

    # `--engine-use-ray` is deprecated; without this opt-in env var the
    # server refuses to launch and the fixture would error out.
    with RemoteOpenAIServer(
            MODEL_NAME, cli_args,
            env_dict={"VLLM_ALLOW_ENGINE_USE_RAY": "1"}) as remote_server:
        yield remote_server
|
2024-04-27 19:30:08 +08:00
|
|
|
|
|
|
|
|
2024-08-26 21:33:17 -07:00
|
|
|
@pytest_asyncio.fixture
async def client(server):
    """Yield an async OpenAI client bound to the test server."""
    async with server.get_async_client() as c:
        yield c
|
2024-04-27 19:30:08 +08:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_check_models(client: openai.AsyncOpenAI):
    """The models endpoint lists the served model under the expected id."""
    listing = await client.models.list()
    available = listing.data

    assert available[0].id == MODEL_NAME
    # every listed entry should be rooted at the served model
    for entry in available:
        assert entry.root == MODEL_NAME
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_single_completion(client: openai.AsyncOpenAI):
    """Greedy completion works for both a string prompt and a token-ID
    prompt, and reports exact token usage."""
    # string prompt
    result = await client.completions.create(
        model=MODEL_NAME,
        prompt="Hello, my name is",
        max_tokens=5,
        temperature=0.0,
    )

    assert result.id is not None
    assert len(result.choices) == 1

    choice = result.choices[0]
    assert len(choice.text) >= 5
    # max_tokens was hit, so the finish reason must be "length"
    assert choice.finish_reason == "length"
    expected_usage = openai.types.CompletionUsage(completion_tokens=5,
                                                  prompt_tokens=6,
                                                  total_tokens=11)
    assert result.usage == expected_usage

    # test using token IDs
    result = await client.completions.create(
        model=MODEL_NAME,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
    )
    assert len(result.choices[0].text) >= 5
|
2024-04-27 19:30:08 +08:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_single_chat_session(client: openai.AsyncOpenAI):
    """Chat completion works for a single turn (with logprobs) and for a
    follow-up turn in the same conversation."""
    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {"role": "user", "content": "what is 1+1?"},
    ]

    # first turn: also exercise the logprobs code path
    response = await client.chat.completions.create(model=MODEL_NAME,
                                                    messages=messages,
                                                    max_tokens=10,
                                                    logprobs=True,
                                                    top_logprobs=5)
    assert response.id is not None
    assert len(response.choices) == 1

    first_choice = response.choices[0]
    # max_tokens was hit, so the finish reason must be "length"
    assert first_choice.finish_reason == "length"
    assert response.usage == openai.types.CompletionUsage(
        completion_tokens=10, prompt_tokens=55, total_tokens=65)

    reply = first_choice.message
    assert reply.role == "assistant"
    assert reply.content is not None and len(reply.content) >= 10
    messages.append({"role": "assistant", "content": reply.content})

    # second turn: continue the same dialogue
    messages.append({"role": "user", "content": "express your result in json"})
    response = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_tokens=10,
    )
    followup = response.choices[0].message
    # NOTE(review): `len(...) >= 0` is vacuously true for any string, so the
    # effective check here is only that content is not None.
    assert followup.content is not None and len(followup.content) >= 0
|