# SPDX-License-Identifier: Apache-2.0

import os
import subprocess
import sys
import time
from multiprocessing import Pool
from pathlib import Path

import pytest
import requests


def _query_server(prompt: str, max_tokens: int = 5) -> dict:
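    # Query the local server's /generate endpoint. Temperature 0 keeps the
    # output deterministic, and ignore_eos lets generation run for the full
    # max_tokens budget instead of stopping at the EOS token.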
    response = requests.post("http://localhost:8000/generate",
                             json={
                                 "prompt": prompt,
                                 "max_tokens": max_tokens,
                                 "temperature": 0,
                                 "ignore_eos": True
                             })
    response.raise_for_status()
    return response.json()


def _query_server_long(prompt: str) -> dict:
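    # Longer generations keep requests in flight long enough for the
    # cancellation test below to abort them mid-request.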
    return _query_server(prompt, max_tokens=500)


@pytest.fixture
def api_server(tokenizer_pool_size: int, distributed_executor_backend: str):
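    # Launch the standalone API server (api_server_async_engine.py) in a
    # subprocess, yield while the test runs, then terminate the server.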
|
2023-09-07 13:43:45 -07:00
|
|
|
script_path = Path(__file__).parent.joinpath(
|
|
|
|
"api_server_async_engine.py").absolute()
|
2024-04-16 14:24:53 +09:00
|
|
|
commands = [
|
2025-01-25 03:45:20 +08:00
|
|
|
sys.executable,
|
|
|
|
"-u",
|
|
|
|
str(script_path),
|
|
|
|
"--model",
|
|
|
|
"facebook/opt-125m",
|
|
|
|
"--host",
|
|
|
|
"127.0.0.1",
|
|
|
|
"--tokenizer-pool-size",
|
|
|
|
str(tokenizer_pool_size),
|
|
|
|
"--distributed-executor-backend",
|
|
|
|
distributed_executor_backend,
|
2024-04-16 14:24:53 +09:00
|
|
|
]
|
2024-08-14 13:44:27 -03:00
|
|
|
|
2025-03-15 01:02:20 -04:00
|
|
|
# API Server Test Requires V0.
|
|
|
|
my_env = os.environ.copy()
|
|
|
|
my_env["VLLM_USE_V1"] = "0"
|
|
|
|
uvicorn_process = subprocess.Popen(commands, env=my_env)
|
2023-09-07 13:43:45 -07:00
|
|
|
yield
|
|
|
|
uvicorn_process.terminate()
|
|
|
|
|
|
|
|
|
2024-03-15 16:37:01 -07:00
|
|
|
@pytest.mark.parametrize("tokenizer_pool_size", [0, 2])
@pytest.mark.parametrize("distributed_executor_backend", ["mp", "ray"])
def test_api_server(api_server, tokenizer_pool_size: int,
                    distributed_executor_backend: str):
    """
    Run the API server and test it.

    We run both the server and requests in separate processes.

    We test that the server can handle incoming requests, including
    multiple requests at the same time, and that it can handle requests
    being cancelled without crashing.
    """
    with Pool(32) as pool:
        # Wait until the server is ready
        prompts = ["warm up"] * 1
        result = None
        while not result:
            try:
                for r in pool.map(_query_server, prompts):
                    result = r
                    break
            except requests.exceptions.ConnectionError:
                time.sleep(1)

        # Actual tests start here
        # Try with 1 prompt
        for result in pool.map(_query_server, prompts):
            assert result

        num_aborted_requests = requests.get(
            "http://localhost:8000/stats").json()["num_aborted_requests"]
        assert num_aborted_requests == 0

        # Try with 100 prompts
        prompts = ["test prompt"] * 100
        for result in pool.map(_query_server, prompts):
            assert result
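
    # Terminating the worker pool drops the in-flight HTTP connections, so the
    # server should record these requests as aborted rather than crash.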
    with Pool(32) as pool:
        # Cancel requests
        prompts = ["canceled requests"] * 100
        pool.map_async(_query_server_long, prompts)
        time.sleep(0.01)
        pool.terminate()
        pool.join()

        # check cancellation stats
        # give it some time to update the stats
        time.sleep(1)

        num_aborted_requests = requests.get(
            "http://localhost:8000/stats").json()["num_aborted_requests"]
        assert num_aborted_requests > 0

    # check that server still runs after cancellations
    with Pool(32) as pool:
        # Try with 100 prompts
        prompts = ["test prompt after canceled"] * 100
        for result in pool.map(_query_server, prompts):
            assert result