[Doc][CI/Build] Update docs and tests to use vllm serve (#6431)

Author: Cyrus Leung
Date:   2024-07-17 15:43:21 +08:00 (committed by GitHub)
parent  a19e8d3726
commit  5bf35a91e4
GPG Key ID: B5690EEEBB952194 (no known key found for this signature in database)
23 changed files with 155 additions and 175 deletions


@@ -73,16 +73,13 @@ Start the server:

 .. code-block:: console

-    $ python -m vllm.entrypoints.openai.api_server \
-    $     --model facebook/opt-125m
+    $ vllm serve facebook/opt-125m

 By default, the server uses a predefined chat template stored in the tokenizer. You can override this template by using the ``--chat-template`` argument:

 .. code-block:: console

-    $ python -m vllm.entrypoints.openai.api_server \
-    $     --model facebook/opt-125m \
-    $     --chat-template ./examples/template_chatml.jinja
+    $ vllm serve facebook/opt-125m --chat-template ./examples/template_chatml.jinja

 This server can be queried in the same format as OpenAI API. For example, list the models:
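For reference, the model listing mentioned in that last context line is a plain HTTP GET against the OpenAI-compatible endpoint. A minimal sketch, assuming the server above is running on vLLM's default `http://localhost:8000`:

```python
# Minimal sketch: list the models exposed by a local `vllm serve` instance.
# Assumes the default host/port (localhost:8000) used in the quickstart above.
import requests

resp = requests.get("http://localhost:8000/v1/models")
resp.raise_for_status()
for model in resp.json()["data"]:
    print(model["id"])
```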


@@ -114,7 +114,7 @@ Just add the following lines in your code:

     from your_code import YourModelForCausalLM
     ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM)

-If you are running api server with `python -m vllm.entrypoints.openai.api_server args`, you can wrap the entrypoint with the following code:
+If you are running api server with :code:`vllm serve <args>`, you can wrap the entrypoint with the following code:

 .. code-block:: python

@@ -124,4 +124,4 @@ If you are running api server with `python -m vllm.entrypoints.openai.api_server

     import runpy
     runpy.run_module('vllm.entrypoints.openai.api_server', run_name='__main__')

-Save the above code in a file and run it with `python your_file.py args`.
+Save the above code in a file and run it with :code:`python your_file.py <args>`.
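Putting the two snippets from that doc together, a hypothetical wrapper script (`serve_custom_model.py` is an illustrative name; `your_code` is the placeholder module used by the docs) looks roughly like this:

```python
# Hypothetical wrapper (serve_custom_model.py): register an out-of-tree model,
# then hand control to the OpenAI-compatible entrypoint so that
# `python serve_custom_model.py <args>` behaves like `vllm serve <args>`.
import runpy

from vllm import ModelRegistry
from your_code import YourModelForCausalLM  # placeholder from the docs

ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM)
runpy.run_module('vllm.entrypoints.openai.api_server', run_name='__main__')
```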


@@ -8,7 +8,7 @@ Below, you can find an explanation of every engine argument for vLLM:

 .. argparse::
     :module: vllm.engine.arg_utils
     :func: _engine_args_parser
-    :prog: -m vllm.entrypoints.openai.api_server
+    :prog: vllm serve
     :nodefaultconst:

 Async Engine Arguments

@@ -19,5 +19,5 @@ Below are the additional arguments related to the asynchronous engine:

 .. argparse::
     :module: vllm.engine.arg_utils
     :func: _async_engine_args_parser
-    :prog: -m vllm.entrypoints.openai.api_server
+    :prog: vllm serve
     :nodefaultconst:


@@ -61,8 +61,7 @@ LoRA adapted models can also be served with the Open-AI compatible vLLM server.

 .. code-block:: bash

-    python -m vllm.entrypoints.openai.api_server \
-        --model meta-llama/Llama-2-7b-hf \
+    vllm serve meta-llama/Llama-2-7b-hf \
         --enable-lora \
         --lora-modules sql-lora=$HOME/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/
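Once the server above is up, the adapter can be requested under the name given in `--lora-modules`. A minimal sketch using the OpenAI Python client; the base URL and dummy API key are assumptions matching vLLM's defaults, not part of this diff:

```python
# Sketch: query the LoRA adapter registered as "sql-lora" on the server above.
# Assumes the server listens on localhost:8000 and no real API key is required.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
completion = client.completions.create(
    model="sql-lora",              # adapter name from --lora-modules
    prompt="San Francisco is a",   # toy prompt for illustration
    max_tokens=32,
)
print(completion.choices[0].text)
```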


@@ -94,9 +94,7 @@ Below is an example on how to launch the same ``llava-hf/llava-1.5-7b-hf`` with

 .. code-block:: bash

-    python -m vllm.entrypoints.openai.api_server \
-        --model llava-hf/llava-1.5-7b-hf \
-        --chat-template template_llava.jinja
+    vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja

 .. important::
     We have removed all vision language related CLI args in the ``0.5.1`` release. **This is a breaking change**, so please update your code to follow
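After launching the command above, image inputs go through the standard chat-completions API. A sketch, assuming the server is on localhost:8000; the image URL is a placeholder, not part of the diff:

```python
# Sketch: send an image plus a text question to the llava server started above.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
chat = client.chat.completions.create(
    model="llava-hf/llava-1.5-7b-hf",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "What is in this image?"},
            {"type": "image_url",
             "image_url": {"url": "https://example.com/duck.jpg"}},  # placeholder
        ],
    }],
)
print(chat.choices[0].message.content)
```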


@@ -40,7 +40,7 @@ Next, to provision a VM instance with LLM of your choice(`NousResearch/Llama-2-7

   gpu: 24GB
 commands:
   - pip install vllm
-  - python -m vllm.entrypoints.openai.api_server --model $MODEL --port 8000
+  - vllm serve $MODEL --port 8000
 model:
   format: openai
   type: chat


@@ -35,16 +35,14 @@ To run multi-GPU serving, pass in the :code:`--tensor-parallel-size` argument wh

 .. code-block:: console

-    $ python -m vllm.entrypoints.openai.api_server \
-    $     --model facebook/opt-13b \
+    $ vllm serve facebook/opt-13b \
     $     --tensor-parallel-size 4

 You can also additionally specify :code:`--pipeline-parallel-size` to enable pipeline parallelism. For example, to run API server on 8 GPUs with pipeline parallelism and tensor parallelism:

 .. code-block:: console

-    $ python -m vllm.entrypoints.openai.api_server \
-    $     --model gpt2 \
+    $ vllm serve gpt2 \
     $     --tensor-parallel-size 4 \
     $     --pipeline-parallel-size 2 \
     $     --distributed-executor-backend ray


@@ -4,7 +4,7 @@ vLLM provides an HTTP server that implements OpenAI's [Completions](https://plat

 You can start the server using Python, or using [Docker](deploying_with_docker.rst):
 ```bash
-python -m vllm.entrypoints.openai.api_server --model NousResearch/Meta-Llama-3-8B-Instruct --dtype auto --api-key token-abc123
+vllm serve NousResearch/Meta-Llama-3-8B-Instruct --dtype auto --api-key token-abc123
 ```

 To call the server, you can use the official OpenAI Python client library, or any other HTTP client.
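As an example of such a client call: the command above sets `--api-key token-abc123`, so requests must carry that key. A minimal sketch with the official OpenAI client, assuming the default localhost:8000 address:

```python
# Sketch: call the server started above; the API key must match --api-key.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="token-abc123")
completion = client.chat.completions.create(
    model="NousResearch/Meta-Llama-3-8B-Instruct",
    messages=[{"role": "user", "content": "Hello!"}],
)
print(completion.choices[0].message.content)
```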
@@ -97,9 +97,7 @@ template, or the template in string form. Without a chat template, the server wi
 and all chat requests will error.

 ```bash
-python -m vllm.entrypoints.openai.api_server \
-  --model ... \
-  --chat-template ./path-to-chat-template.jinja
+vllm serve <model> --chat-template ./path-to-chat-template.jinja
 ```

 vLLM community provides a set of chat templates for popular models. You can find them in the examples
@@ -110,7 +108,7 @@ directory [here](https://github.com/vllm-project/vllm/tree/main/examples/)

 ```{argparse}
 :module: vllm.entrypoints.openai.cli_args
 :func: create_parser_for_docs
-:prog: -m vllm.entrypoints.openai.api_server
+:prog: vllm serve
 ```

 ## Tool calling in the chat completion API


@@ -1,8 +1,7 @@
-"""Example Python client for vllm.entrypoints.api_server
+"""Example Python client for `vllm.entrypoints.api_server`

 NOTE: The API server is used only for demonstration and simple performance
 benchmarks. It is not intended for production use.
-For production use, we recommend vllm.entrypoints.openai.api_server
-and the OpenAI client API
+For production use, we recommend `vllm serve` and the OpenAI client API.
 """

 import argparse


@@ -95,9 +95,7 @@ to the path of the custom logging configuration JSON file:

 ```bash
 VLLM_LOGGING_CONFIG_PATH=/path/to/logging_config.json \
-    python3 -m vllm.entrypoints.openai.api_server \
-    --max-model-len 2048 \
-    --model mistralai/Mistral-7B-v0.1
+    vllm serve mistralai/Mistral-7B-v0.1 --max-model-len 2048
 ```

@@ -152,9 +150,7 @@ to the path of the custom logging configuration JSON file:

 ```bash
 VLLM_LOGGING_CONFIG_PATH=/path/to/logging_config.json \
-    python3 -m vllm.entrypoints.openai.api_server \
-    --max-model-len 2048 \
-    --model mistralai/Mistral-7B-v0.1
+    vllm serve mistralai/Mistral-7B-v0.1 --max-model-len 2048
 ```

@@ -167,9 +163,7 @@ loggers.

 ```bash
 VLLM_CONFIGURE_LOGGING=0 \
-    python3 -m vllm.entrypoints.openai.api_server \
-    --max-model-len 2048 \
-    --model mistralai/Mistral-7B-v0.1
+    vllm serve mistralai/Mistral-7B-v0.1 --max-model-len 2048
 ```
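The file referenced by `VLLM_LOGGING_CONFIG_PATH` uses Python's standard `logging.config.dictConfig` schema. A sketch of generating such a file; the exact handler and formatter layout here is an illustrative assumption, not part of this diff:

```python
# Sketch: write a minimal logging_config.json for VLLM_LOGGING_CONFIG_PATH.
# Assumption: the standard dictConfig schema applies; adjust levels/handlers.
import json

config = {
    "version": 1,
    "disable_existing_loggers": False,
    "formatters": {
        "plain": {"format": "%(asctime)s %(levelname)s %(name)s: %(message)s"},
    },
    "handlers": {
        "console": {
            "class": "logging.StreamHandler",
            "formatter": "plain",
            "level": "INFO",
            "stream": "ext://sys.stdout",
        },
    },
    "loggers": {
        "vllm": {"handlers": ["console"], "level": "INFO", "propagate": False},
    },
}

with open("logging_config.json", "w") as f:
    json.dump(config, f, indent=2)
```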


@@ -1,9 +1,7 @@
 """An example showing how to use vLLM to serve VLMs.

 Launch the vLLM server with the following command:
-python -m vllm.entrypoints.openai.api_server \
-    --model llava-hf/llava-1.5-7b-hf \
-    --chat-template template_llava.jinja
+vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja
 """

 import base64


@@ -36,7 +36,7 @@

 ```
 export OTEL_SERVICE_NAME="vllm-server"
 export OTEL_EXPORTER_OTLP_TRACES_INSECURE=true
-python -m vllm.entrypoints.openai.api_server --model="facebook/opt-125m" --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"
+vllm serve facebook/opt-125m --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"
 ```

 1. In a new shell, send requests with trace context from a dummy client

@@ -62,7 +62,7 @@ By default, `grpc` is used. To set `http/protobuf` as the protocol, configure th

 ```
 export OTEL_EXPORTER_OTLP_TRACES_PROTOCOL=http/protobuf
 export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://$JAEGER_IP:4318/v1/traces
-python -m vllm.entrypoints.openai.api_server --model="facebook/opt-125m" --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"
+vllm serve facebook/opt-125m --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"
 ```

 ## Instrumentation of FastAPI

@@ -74,7 +74,7 @@ OpenTelemetry allows automatic instrumentation of FastAPI.

 1. Run vLLM with `opentelemetry-instrument`

 ```
-opentelemetry-instrument python -m vllm.entrypoints.openai.api_server --model="facebook/opt-125m"
+opentelemetry-instrument vllm serve facebook/opt-125m
 ```

 1. Send a request to vLLM and find its trace in Jaeger. It should contain spans from FastAPI.


@@ -10,8 +10,7 @@ Install:

 Prometheus metric logging is enabled by default in the OpenAI-compatible server. Launch via the entrypoint:

 ```bash
-python3 -m vllm.entrypoints.openai.api_server \
-    --model mistralai/Mistral-7B-v0.1 \
+vllm serve mistralai/Mistral-7B-v0.1 \
     --max-model-len 2048 \
     --disable-log-requests
 ```
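The metrics are exposed in Prometheus text format on the server's `/metrics` endpoint. A quick check, assuming the default localhost:8000 address and the `vllm:` metric prefix used by the server:

```python
# Sketch: scrape the Prometheus metrics endpoint of the server started above.
import requests

metrics = requests.get("http://localhost:8000/metrics").text
for line in metrics.splitlines():
    if line.startswith("vllm:"):   # vLLM-specific gauges and counters
        print(line)
```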


@@ -9,17 +9,17 @@ MODEL_NAME = "facebook/opt-125m"

 @pytest.fixture(scope="module")
 def server():
-    with RemoteOpenAIServer([
-        "--model",
-        MODEL_NAME,
+    args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
         "float16",
         "--max-model-len",
         "2048",
         "--enforce-eager",
         "--engine-use-ray"
-    ]) as remote_server:
+    ]
+
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
         yield remote_server


@@ -15,8 +15,6 @@ from ..utils import RemoteOpenAIServer
 ])
 def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME):
     pp_args = [
-        "--model",
-        MODEL_NAME,
         # use half precision for speed and memory savings in CI environment
         "--dtype",
         "bfloat16",

@@ -34,8 +32,6 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME):
     # schedule all workers in a node other than the head node,
     # which can cause the test to fail.
     tp_args = [
-        "--model",
-        MODEL_NAME,
         # use half precision for speed and memory savings in CI environment
         "--dtype",
         "bfloat16",

@@ -53,7 +49,7 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME):
     results = []
     for args in [pp_args, tp_args]:
-        with RemoteOpenAIServer(args) as server:
+        with RemoteOpenAIServer(MODEL_NAME, args) as server:
             client = server.get_client()

             # test models list


@@ -27,27 +27,27 @@ def zephyr_lora_files():

 @pytest.fixture(scope="module")
 def server(zephyr_lora_files):
-    with RemoteOpenAIServer([
-        "--model",
-        MODEL_NAME,
+    args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
         "bfloat16",
         "--max-model-len",
         "8192",
         "--enforce-eager",
         # lora config below
         "--enable-lora",
         "--lora-modules",
         f"zephyr-lora={zephyr_lora_files}",
         f"zephyr-lora2={zephyr_lora_files}",
         "--max-lora-rank",
         "64",
         "--max-cpu-loras",
         "2",
         "--max-num-seqs",
         "128",
-    ]) as remote_server:
+    ]
+
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
         yield remote_server


@@ -37,36 +37,36 @@ def zephyr_pa_files():

 @pytest.fixture(scope="module")
 def server(zephyr_lora_files, zephyr_pa_files):
-    with RemoteOpenAIServer([
-        "--model",
-        MODEL_NAME,
+    args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
         "bfloat16",
         "--max-model-len",
         "8192",
         "--max-num-seqs",
         "128",
         "--enforce-eager",
         # lora config
         "--enable-lora",
         "--lora-modules",
         f"zephyr-lora={zephyr_lora_files}",
         f"zephyr-lora2={zephyr_lora_files}",
         "--max-lora-rank",
         "64",
         "--max-cpu-loras",
         "2",
         # pa config
         "--enable-prompt-adapter",
         "--prompt-adapters",
         f"zephyr-pa={zephyr_pa_files}",
         f"zephyr-pa2={zephyr_pa_files}",
         "--max-prompt-adapters",
         "2",
         "--max-prompt-adapter-token",
         "128",
-    ]) as remote_server:
+    ]
+
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
         yield remote_server


@@ -11,17 +11,17 @@ EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct"

 @pytest.fixture(scope="module")
 def embedding_server():
-    with RemoteOpenAIServer([
-        "--model",
-        EMBEDDING_MODEL_NAME,
+    args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
         "bfloat16",
         "--enforce-eager",
         "--max-model-len",
         "8192",
         "--enforce-eager",
-    ]) as remote_server:
+    ]
+
+    with RemoteOpenAIServer(EMBEDDING_MODEL_NAME, args) as remote_server:
         yield remote_server


@@ -19,27 +19,27 @@ def zephyr_lora_files():

 @pytest.fixture(scope="module")
 def server(zephyr_lora_files):
-    with RemoteOpenAIServer([
-        "--model",
-        MODEL_NAME,
+    args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
         "bfloat16",
         "--max-model-len",
         "8192",
         "--enforce-eager",
         # lora config below
         "--enable-lora",
         "--lora-modules",
         f"zephyr-lora={zephyr_lora_files}",
         f"zephyr-lora2={zephyr_lora_files}",
         "--max-lora-rank",
         "64",
         "--max-cpu-loras",
         "2",
         "--max-num-seqs",
         "128",
-    ]) as remote_server:
+    ]
+
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
         yield remote_server


@@ -12,18 +12,18 @@ MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"

 @pytest.fixture(scope="module")
 def server():
-    with RemoteOpenAIServer([
-        "--model",
-        MODEL_NAME,
+    args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
         "bfloat16",
         "--max-model-len",
         "8192",
         "--enforce-eager",
         "--max-num-seqs",
         "128",
-    ]) as remote_server:
+    ]
+
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
         yield remote_server


@@ -23,17 +23,17 @@ TEST_IMAGE_URLS = [

 @pytest.fixture(scope="module")
 def server():
-    with RemoteOpenAIServer([
-        "--model",
-        MODEL_NAME,
+    args = [
         "--dtype",
         "bfloat16",
         "--max-model-len",
         "4096",
         "--enforce-eager",
         "--chat-template",
         str(LLAVA_CHAT_TEMPLATE),
-    ]) as remote_server:
+    ]
+
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
         yield remote_server


@@ -214,12 +214,12 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):

     ## Start OpenAI API server
     openai_args = [
-        "--model", model_ref, "--dtype", "float16", "--load-format",
+        "--dtype", "float16", "--load-format",
         "tensorizer", "--model-loader-extra-config",
         json.dumps(model_loader_extra_config),
     ]

-    with RemoteOpenAIServer(openai_args) as server:
+    with RemoteOpenAIServer(model_ref, openai_args) as server:
         print("Server ready.")

         client = server.get_client()


@@ -49,7 +49,13 @@ class RemoteOpenAIServer:
     DUMMY_API_KEY = "token-abc123"  # vLLM's OpenAI server does not need API key
     MAX_SERVER_START_WAIT_S = 600  # wait for server to start for 60 seconds

-    def __init__(self, cli_args: List[str], *, auto_port: bool = True) -> None:
+    def __init__(
+        self,
+        model: str,
+        cli_args: List[str],
+        *,
+        auto_port: bool = True,
+    ) -> None:
         if auto_port:
             if "-p" in cli_args or "--port" in cli_args:
                 raise ValueError("You have manually specified the port"

@@ -68,12 +74,10 @@ class RemoteOpenAIServer:
         # the current process might initialize cuda,
         # to be safe, we should use spawn method
         env['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
-        self.proc = subprocess.Popen(
-            [sys.executable, "-m", "vllm.entrypoints.openai.api_server"] +
-            cli_args,
-            env=env,
-            stdout=sys.stdout,
-            stderr=sys.stderr)
+        self.proc = subprocess.Popen(["vllm", "serve"] + [model] + cli_args,
+                                     env=env,
+                                     stdout=sys.stdout,
+                                     stderr=sys.stderr)
         self._wait_for_server(url=self.url_for("health"),
                               timeout=self.MAX_SERVER_START_WAIT_S)
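With the constructor change above, call sites pass the model positionally and drop `--model` from the CLI list. A sketch of the resulting usage pattern, assuming `RemoteOpenAIServer` is imported from the test utils module shown here:

```python
# Sketch of the new call-site pattern: the model name is the first positional
# argument, and the remaining flags no longer include "--model".
args = ["--dtype", "float16", "--max-model-len", "2048"]

with RemoteOpenAIServer("facebook/opt-125m", args) as server:
    client = server.get_client()   # OpenAI client pointed at the local server
    print([m.id for m in client.models.list().data])
```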