Revert "[Frontend] Factor out code for running uvicorn" (#7012)
Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
commit 7eb0cb4a14 (parent a0dce9383a)
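This revert removes the `vllm.server.serve_http` helper introduced by #7012 and inlines uvicorn startup back into the two API server entrypoints. The underlying pattern is the same on both sides of the diff: drive a `uvicorn.Server` from the surrounding asyncio loop and cancel it on SIGINT/SIGTERM so shutdown stays graceful. A condensed, self-contained sketch of that pattern follows; the `serve_with_graceful_shutdown` name and the `/health` route are illustrative, not taken from the diff.

import asyncio
import signal

import uvicorn
from fastapi import FastAPI

app = FastAPI()


@app.get("/health")
async def health() -> dict:
    # Trivial endpoint so the sketch has something to serve.
    return {"status": "ok"}


async def serve_with_graceful_shutdown(app: FastAPI, **uvicorn_kwargs) -> None:
    # Build a uvicorn Server directly instead of calling uvicorn.run(), so the
    # surrounding event loop stays under the caller's control.
    config = uvicorn.Config(app, **uvicorn_kwargs)
    server = uvicorn.Server(config)

    loop = asyncio.get_running_loop()
    server_task = loop.create_task(server.serve())

    def signal_handler() -> None:
        # Cancel the serve() task ourselves so cleanup below still runs.
        server_task.cancel()

    loop.add_signal_handler(signal.SIGINT, signal_handler)
    loop.add_signal_handler(signal.SIGTERM, signal_handler)

    try:
        await server_task
    except asyncio.CancelledError:
        await server.shutdown()


if __name__ == "__main__":
    asyncio.run(serve_with_graceful_shutdown(app, host="0.0.0.0", port=8000))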
@@ -60,7 +60,6 @@ files = [
     "vllm/logging",
     "vllm/multimodal",
     "vllm/platforms",
-    "vllm/server",
     "vllm/transformers_utils",
     "vllm/triton_utils",
     "vllm/usage",
@@ -5,12 +5,12 @@ For production use, we recommend using our OpenAI compatible server.
 We are also not going to accept PRs modifying this file, please
 change `vllm/entrypoints/openai/api_server.py` instead.
 """
-import asyncio
+
 import json
 import ssl
-from argparse import Namespace
-from typing import Any, AsyncGenerator, Optional
+from typing import AsyncGenerator
 
+import uvicorn
 from fastapi import FastAPI, Request
 from fastapi.responses import JSONResponse, Response, StreamingResponse
 
@@ -18,10 +18,8 @@ from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.logger import init_logger
 from vllm.sampling_params import SamplingParams
-from vllm.server import serve_http
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import FlexibleArgumentParser, random_uuid
-from vllm.version import __version__ as VLLM_VERSION
 
 logger = init_logger("vllm.entrypoints.api_server")
 
@@ -83,50 +81,6 @@ async def generate(request: Request) -> Response:
     return JSONResponse(ret)
 
 
-def build_app(args: Namespace) -> FastAPI:
-    global app
-
-    app.root_path = args.root_path
-    return app
-
-
-async def init_app(
-    args: Namespace,
-    llm_engine: Optional[AsyncLLMEngine] = None,
-) -> FastAPI:
-    app = build_app(args)
-
-    global engine
-
-    engine_args = AsyncEngineArgs.from_cli_args(args)
-    engine = (llm_engine
-              if llm_engine is not None else AsyncLLMEngine.from_engine_args(
-                  engine_args, usage_context=UsageContext.API_SERVER))
-
-    return app
-
-
-async def run_server(args: Namespace,
-                     llm_engine: Optional[AsyncLLMEngine] = None,
-                     **uvicorn_kwargs: Any) -> None:
-    logger.info("vLLM API server version %s", VLLM_VERSION)
-    logger.info("args: %s", args)
-
-    app = await init_app(args, llm_engine)
-    await serve_http(
-        app,
-        host=args.host,
-        port=args.port,
-        log_level=args.log_level,
-        timeout_keep_alive=TIMEOUT_KEEP_ALIVE,
-        ssl_keyfile=args.ssl_keyfile,
-        ssl_certfile=args.ssl_certfile,
-        ssl_ca_certs=args.ssl_ca_certs,
-        ssl_cert_reqs=args.ssl_cert_reqs,
-        **uvicorn_kwargs,
-    )
-
-
 if __name__ == "__main__":
     parser = FlexibleArgumentParser()
     parser.add_argument("--host", type=str, default=None)
@@ -151,5 +105,25 @@ if __name__ == "__main__":
     parser.add_argument("--log-level", type=str, default="debug")
     parser = AsyncEngineArgs.add_cli_args(parser)
     args = parser.parse_args()
-
-    asyncio.run(run_server(args))
+    engine_args = AsyncEngineArgs.from_cli_args(args)
+    engine = AsyncLLMEngine.from_engine_args(
+        engine_args, usage_context=UsageContext.API_SERVER)
+
+    app.root_path = args.root_path
+
+    logger.info("Available routes are:")
+    for route in app.routes:
+        if not hasattr(route, 'methods'):
+            continue
+        methods = ', '.join(route.methods)
+        logger.info("Route: %s, Methods: %s", route.path, methods)
+
+    uvicorn.run(app,
+                host=args.host,
+                port=args.port,
+                log_level=args.log_level,
+                timeout_keep_alive=TIMEOUT_KEEP_ALIVE,
+                ssl_keyfile=args.ssl_keyfile,
+                ssl_certfile=args.ssl_certfile,
+                ssl_ca_certs=args.ssl_ca_certs,
+                ssl_cert_reqs=args.ssl_cert_reqs)
@@ -2,12 +2,14 @@ import asyncio
 import importlib
 import inspect
 import re
-from argparse import Namespace
+import signal
 from contextlib import asynccontextmanager
 from http import HTTPStatus
-from typing import Any, Optional, Set
+from typing import Optional, Set
 
-from fastapi import APIRouter, FastAPI, Request
+import fastapi
+import uvicorn
+from fastapi import APIRouter, Request
 from fastapi.exceptions import RequestValidationError
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse, Response, StreamingResponse
@@ -36,7 +38,6 @@ from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
 from vllm.entrypoints.openai.serving_tokenization import (
     OpenAIServingTokenization)
 from vllm.logger import init_logger
-from vllm.server import serve_http
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import FlexibleArgumentParser
 from vllm.version import __version__ as VLLM_VERSION
@@ -56,7 +57,7 @@ _running_tasks: Set[asyncio.Task] = set()
 
 
 @asynccontextmanager
-async def lifespan(app: FastAPI):
+async def lifespan(app: fastapi.FastAPI):
 
     async def _force_log():
         while True:
@@ -74,7 +75,7 @@ async def lifespan(app: FastAPI):
 router = APIRouter()
 
 
-def mount_metrics(app: FastAPI):
+def mount_metrics(app: fastapi.FastAPI):
     # Add prometheus asgi middleware to route /metrics requests
     metrics_route = Mount("/metrics", make_asgi_app())
     # Workaround for 307 Redirect for /metrics
@@ -164,8 +165,8 @@ async def create_embedding(request: EmbeddingRequest, raw_request: Request):
         return JSONResponse(content=generator.model_dump())
 
 
-def build_app(args: Namespace) -> FastAPI:
-    app = FastAPI(lifespan=lifespan)
+def build_app(args):
+    app = fastapi.FastAPI(lifespan=lifespan)
     app.include_router(router)
     app.root_path = args.root_path
 
@@ -213,8 +214,11 @@ def build_app(args: Namespace) -> FastAPI:
     return app
 
 
-async def init_app(args: Namespace,
-                   llm_engine: Optional[AsyncLLMEngine] = None) -> FastAPI:
+async def build_server(
+    args,
+    llm_engine: Optional[AsyncLLMEngine] = None,
+    **uvicorn_kwargs,
+) -> uvicorn.Server:
     app = build_app(args)
 
     if args.served_model_name is not None:
@@ -277,17 +281,14 @@ async def init_app(args: Namespace,
     )
     app.root_path = args.root_path
 
-    return app
+    logger.info("Available routes are:")
+    for route in app.routes:
+        if not hasattr(route, 'methods'):
+            continue
+        methods = ', '.join(route.methods)
+        logger.info("Route: %s, Methods: %s", route.path, methods)
 
-
-async def run_server(args: Namespace,
-                     llm_engine: Optional[AsyncLLMEngine] = None,
-                     **uvicorn_kwargs: Any) -> None:
-    logger.info("vLLM API server version %s", VLLM_VERSION)
-    logger.info("args: %s", args)
-
-    app = await init_app(args, llm_engine)
-    await serve_http(
+    config = uvicorn.Config(
         app,
         host=args.host,
         port=args.port,
@@ -300,6 +301,36 @@ async def run_server(args: Namespace,
         **uvicorn_kwargs,
     )
 
+    return uvicorn.Server(config)
+
+
+async def run_server(args, llm_engine=None, **uvicorn_kwargs) -> None:
+    logger.info("vLLM API server version %s", VLLM_VERSION)
+    logger.info("args: %s", args)
+
+    server = await build_server(
+        args,
+        llm_engine,
+        **uvicorn_kwargs,
+    )
+
+    loop = asyncio.get_running_loop()
+
+    server_task = loop.create_task(server.serve())
+
+    def signal_handler() -> None:
+        # prevents the uvicorn signal handler to exit early
+        server_task.cancel()
+
+    loop.add_signal_handler(signal.SIGINT, signal_handler)
+    loop.add_signal_handler(signal.SIGTERM, signal_handler)
+
+    try:
+        await server_task
+    except asyncio.CancelledError:
+        print("Gracefully stopping http server")
+        await server.shutdown()
+
+
 
 if __name__ == "__main__":
     # NOTE(simon):
@@ -308,5 +339,4 @@ if __name__ == "__main__":
         description="vLLM OpenAI-Compatible RESTful API server.")
     parser = make_arg_parser(parser)
     args = parser.parse_args()
-
     asyncio.run(run_server(args))
@@ -1,3 +0,0 @@
-from .launch import serve_http
-
-__all__ = ["serve_http"]
@@ -1,42 +0,0 @@
-import asyncio
-import signal
-from typing import Any
-
-import uvicorn
-from fastapi import FastAPI
-
-from vllm.logger import init_logger
-
-logger = init_logger(__name__)
-
-
-async def serve_http(app: FastAPI, **uvicorn_kwargs: Any) -> None:
-    logger.info("Available routes are:")
-    for route in app.routes:
-        methods = getattr(route, "methods", None)
-        path = getattr(route, "path", None)
-
-        if methods is None or path is None:
-            continue
-
-        logger.info("Route: %s, Methods: %s", path, ', '.join(methods))
-
-    config = uvicorn.Config(app, **uvicorn_kwargs)
-    server = uvicorn.Server(config)
-
-    loop = asyncio.get_running_loop()
-
-    server_task = loop.create_task(server.serve())
-
-    def signal_handler() -> None:
-        # prevents the uvicorn signal handler to exit early
-        server_task.cancel()
-
-    loop.add_signal_handler(signal.SIGINT, signal_handler)
-    loop.add_signal_handler(signal.SIGTERM, signal_handler)
-
-    try:
-        await server_task
-    except asyncio.CancelledError:
-        logger.info("Gracefully stopping http server")
-        await server.shutdown()
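After the revert, the OpenAI-compatible entrypoint is still driven through `run_server`, as its `__main__` block above shows. A minimal programmatic launch sketch, assuming `make_arg_parser` lives in `vllm.entrypoints.openai.cli_args` (its import is not shown in this diff):

import asyncio

from vllm.entrypoints.openai.api_server import run_server
# Assumed import path; the diff above only shows the make_arg_parser() call site.
from vllm.entrypoints.openai.cli_args import make_arg_parser
from vllm.utils import FlexibleArgumentParser

parser = make_arg_parser(
    FlexibleArgumentParser(description="vLLM OpenAI-Compatible RESTful API server."))
# Engine/model flags (e.g. --model) are taken from the command line as usual.
args = parser.parse_args()

# run_server() builds the FastAPI app, starts a uvicorn.Server, and returns once
# SIGINT/SIGTERM cancels the serve task.
asyncio.run(run_server(args))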