vllm/tests/entrypoints/openai/test_basic.py

from http import HTTPStatus
from typing import List

import pytest
import pytest_asyncio
import requests

from vllm.version import __version__ as VLLM_VERSION

from ...utils import RemoteOpenAIServer

MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"


@pytest.fixture(scope='module')
def server_args(request: pytest.FixtureRequest) -> List[str]:
    """ Provide extra arguments to the server via indirect parametrization

    Usage:

    >>> @pytest.mark.parametrize(
    >>>     "server_args",
    >>>     [
    >>>         ["--disable-frontend-multiprocessing"],
    >>>         [
    >>>             "--model=NousResearch/Hermes-3-Llama-3.1-70B",
    >>>             "--enable-auto-tool-choice",
    >>>         ],
    >>>     ],
    >>>     indirect=True,
    >>> )
    >>> def test_foo(server, client):
    >>>     ...

    This will run `test_foo` twice with servers with:
    - `--disable-frontend-multiprocessing`
    - `--model=NousResearch/Hermes-3-Llama-3.1-70B --enable-auto-tool-choice`.

    """
    if not hasattr(request, "param"):
        return []

    val = request.param

    if isinstance(val, str):
        return [val]

    return request.param


@pytest.fixture(scope="module")
def server(server_args):
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--enforce-eager",
        "--max-num-seqs",
        "128",
        *server_args,
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server


@pytest_asyncio.fixture
async def client(server):
    async with server.get_async_client() as async_client:
        yield async_client


@pytest.mark.parametrize(
    "server_args",
    [
        pytest.param([], id="default-frontend-multiprocessing"),
        pytest.param(["--disable-frontend-multiprocessing"],
                     id="disable-frontend-multiprocessing")
    ],
    indirect=True,
)
@pytest.mark.asyncio
async def test_show_version(server: RemoteOpenAIServer):
    response = requests.get(server.url_for("version"))
    response.raise_for_status()

    assert response.json() == {"version": VLLM_VERSION}


@pytest.mark.parametrize(
    "server_args",
    [
        pytest.param([], id="default-frontend-multiprocessing"),
        pytest.param(["--disable-frontend-multiprocessing"],
                     id="disable-frontend-multiprocessing")
    ],
    indirect=True,
)
@pytest.mark.asyncio
async def test_check_health(server: RemoteOpenAIServer):
    response = requests.get(server.url_for("health"))

    assert response.status_code == HTTPStatus.OK
[Bugfix][Frontend] Fix missing `/metrics` endpoint (#6463) 2024-07-19 11:55:13 +08:00			`from http import HTTPStatus`
[Bugfix] fix OpenAI API server startup with --disable-frontend-multiprocessing (#8537) 2024-10-08 18:38:40 +02:00			`from typing import List`
[Bugfix][Frontend] Fix missing `/metrics` endpoint (#6463) 2024-07-19 11:55:13 +08:00
			`import pytest`
[Tests] Disable retries and use context manager for openai client (#7565) 2024-08-26 21:33:17 -07:00			`import pytest_asyncio`
[Bugfix][Frontend] Fix missing `/metrics` endpoint (#6463) 2024-07-19 11:55:13 +08:00			`import requests`

			`from vllm.version import __version__ as VLLM_VERSION`

			`from ...utils import RemoteOpenAIServer`

			`MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"`


[Bugfix] fix OpenAI API server startup with --disable-frontend-multiprocessing (#8537) 2024-10-08 18:38:40 +02:00			`@pytest.fixture(scope='module')`
			`def server_args(request: pytest.FixtureRequest) -> List[str]:`
			`""" Provide extra arguments to the server via indirect parametrization`

			`Usage:`

			`>>> @pytest.mark.parametrize(`
			`>>> "server_args",`
			`>>> [`
			`>>> ["--disable-frontend-multiprocessing"],`
			`>>> [`
			`>>> "--model=NousResearch/Hermes-3-Llama-3.1-70B",`
			`>>> "--enable-auto-tool-choice",`
			`>>> ],`
			`>>> ],`
			`>>> indirect=True,`
			`>>> )`
			`>>> def test_foo(server, client):`
			`>>> ...`

			This will run `test_foo` twice with servers with:
			- `--disable-frontend-multiprocessing`
			- `--model=NousResearch/Hermes-3-Llama-3.1-70B --enable-auto-tool-choice`.

			`"""`
			`if not hasattr(request, "param"):`
			`return []`

			`val = request.param`

			`if isinstance(val, str):`
			`return [val]`

			`return request.param`


[Bugfix][Frontend] Fix missing `/metrics` endpoint (#6463) 2024-07-19 11:55:13 +08:00			`@pytest.fixture(scope="module")`
[Bugfix] fix OpenAI API server startup with --disable-frontend-multiprocessing (#8537) 2024-10-08 18:38:40 +02:00			`def server(server_args):`
[Bugfix][Frontend] Fix missing `/metrics` endpoint (#6463) 2024-07-19 11:55:13 +08:00			`args = [`
			`# use half precision for speed and memory savings in CI environment`
			`"--dtype",`
			`"bfloat16",`
			`"--max-model-len",`
			`"8192",`
			`"--enforce-eager",`
			`"--max-num-seqs",`
			`"128",`
[Bugfix] fix OpenAI API server startup with --disable-frontend-multiprocessing (#8537) 2024-10-08 18:38:40 +02:00			`*server_args,`
[Bugfix][Frontend] Fix missing `/metrics` endpoint (#6463) 2024-07-19 11:55:13 +08:00			`]`

			`with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:`
			`yield remote_server`


[Tests] Disable retries and use context manager for openai client (#7565) 2024-08-26 21:33:17 -07:00			`@pytest_asyncio.fixture`
			`async def client(server):`
			`async with server.get_async_client() as async_client:`
			`yield async_client`
[Bugfix][Frontend] Fix missing `/metrics` endpoint (#6463) 2024-07-19 11:55:13 +08:00

[Bugfix] fix OpenAI API server startup with --disable-frontend-multiprocessing (#8537) 2024-10-08 18:38:40 +02:00			`@pytest.mark.parametrize(`
			`"server_args",`
			`[`
			`pytest.param([], id="default-frontend-multiprocessing"),`
			`pytest.param(["--disable-frontend-multiprocessing"],`
			`id="disable-frontend-multiprocessing")`
			`],`
			`indirect=True,`
			`)`
[Bugfix][Frontend] Fix missing `/metrics` endpoint (#6463) 2024-07-19 11:55:13 +08:00			`@pytest.mark.asyncio`
[Frontend] Chat-based Embeddings API (#9759) 2024-11-01 16:13:35 +08:00			`async def test_show_version(server: RemoteOpenAIServer):`
			`response = requests.get(server.url_for("version"))`
[Bugfix][Frontend] Fix missing `/metrics` endpoint (#6463) 2024-07-19 11:55:13 +08:00			`response.raise_for_status()`

			`assert response.json() == {"version": VLLM_VERSION}`


[Bugfix] fix OpenAI API server startup with --disable-frontend-multiprocessing (#8537) 2024-10-08 18:38:40 +02:00			`@pytest.mark.parametrize(`
			`"server_args",`
			`[`
			`pytest.param([], id="default-frontend-multiprocessing"),`
			`pytest.param(["--disable-frontend-multiprocessing"],`
			`id="disable-frontend-multiprocessing")`
			`],`
			`indirect=True,`
			`)`
[Bugfix][Frontend] Fix missing `/metrics` endpoint (#6463) 2024-07-19 11:55:13 +08:00			`@pytest.mark.asyncio`
[Frontend] Chat-based Embeddings API (#9759) 2024-11-01 16:13:35 +08:00			`async def test_check_health(server: RemoteOpenAIServer):`
			`response = requests.get(server.url_for("health"))`
[Bugfix][Frontend] Fix missing `/metrics` endpoint (#6463) 2024-07-19 11:55:13 +08:00
			`assert response.status_code == HTTPStatus.OK`