# vllm/tests/entrypoints/openai/test_shutdown.py

import json
import os

import openai
import pytest

from ...utils import RemoteOpenAIServer

# Model identifier handed to RemoteOpenAIServer when the test server is started.
MODEL_NAME = "meta-llama/Llama-3.2-1B"


@pytest.mark.asyncio
async def test_shutdown_on_engine_failure(tmp_path):
    """Verify the API server process exits after the engine fails.

    A LoRA adapter directory containing only junk files is registered with
    the server under the name ``bad-adapter``. A completion request against
    that adapter must raise a connection or internal-server error, after
    which the server process must terminate within 8 seconds.
    """
    # A deliberately broken adapter is what crashes the engine.
    # (This test will fail when that bug is fixed)
    adapter_path = tmp_path / "bad_adapter"
    os.mkdir(adapter_path)
    bogus_config = adapter_path / "adapter_model_config.json"
    bogus_config.write_text(json.dumps({"not": "real"}))
    bogus_weights = adapter_path / "adapter_model.safetensors"
    bogus_weights.write_bytes(b"this is fake")

    # dtype, max-len etc. are chosen so that this can run in CI.
    engine_flags = ["--dtype", "bfloat16"]
    engine_flags += ["--max-model-len", "8192"]
    engine_flags += ["--enforce-eager"]
    engine_flags += ["--max-num-seqs", "128"]
    lora_flags = [
        "--enable-lora",
        "--lora-modules",
        f"bad-adapter={adapter_path}",
    ]
    server_args = [*engine_flags, *lora_flags]

    with RemoteOpenAIServer(MODEL_NAME, server_args) as remote_server:
        async with remote_server.get_async_client() as client:
            crash_errors = (
                openai.APIConnectionError,
                openai.InternalServerError,
            )
            with pytest.raises(crash_errors):
                # Requesting the broken adapter crashes the engine.
                await client.completions.create(
                    model="bad-adapter",
                    prompt="Hello, my name is",
                )

            # The server should now shut down on its own; wait() raises if
            # the process is still alive once the timeout elapses.
            exit_status = remote_server.proc.wait(timeout=8)
            assert exit_status is not None