[Doc][CI/Build] Update docs and tests to use vllm serve (#6431)
This commit is contained in:
parent a19e8d3726
commit 5bf35a91e4
@@ -73,16 +73,13 @@ Start the server:

 .. code-block:: console

-    $ python -m vllm.entrypoints.openai.api_server \
-    $     --model facebook/opt-125m
+    $ vllm serve facebook/opt-125m

 By default, the server uses a predefined chat template stored in the tokenizer. You can override this template by using the ``--chat-template`` argument:

 .. code-block:: console

-    $ python -m vllm.entrypoints.openai.api_server \
-    $     --model facebook/opt-125m \
-    $     --chat-template ./examples/template_chatml.jinja
+    $ vllm serve facebook/opt-125m --chat-template ./examples/template_chatml.jinja

 This server can be queried in the same format as OpenAI API. For example, list the models:
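Note (illustration, not part of the diff): once the server above is running, it can also be queried from Python with the official `openai` client. This sketch assumes the default host and port used by `vllm serve` (`localhost:8000`).

```python
from openai import OpenAI

# vLLM does not require a real key unless --api-key is passed; any placeholder works.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# The served model id mirrors the positional argument given to `vllm serve`.
for model in client.models.list():
    print(model.id)
```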
@@ -114,7 +114,7 @@ Just add the following lines in your code:

     from your_code import YourModelForCausalLM
     ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM)

-If you are running api server with `python -m vllm.entrypoints.openai.api_server args`, you can wrap the entrypoint with the following code:
+If you are running api server with :code:`vllm serve <args>`, you can wrap the entrypoint with the following code:

 .. code-block:: python
@@ -124,4 +124,4 @@ If you are running api server with `python -m vllm.entrypoints.openai.api_server

     import runpy
     runpy.run_module('vllm.entrypoints.openai.api_server', run_name='__main__')

-Save the above code in a file and run it with `python your_file.py args`.
+Save the above code in a file and run it with :code:`python your_file.py <args>`.
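Note (illustration, not part of the diff): putting the pieces from the two hunks above together, the wrapper file could look like the following sketch. `your_code` and `YourModelForCausalLM` are the placeholder names used in the docs.

```python
import runpy

from vllm import ModelRegistry
from your_code import YourModelForCausalLM  # placeholder out-of-tree model

# Make the custom architecture visible to vLLM before the server starts.
ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM)

if __name__ == "__main__":
    # Re-enter the OpenAI-compatible server so that
    # `python your_file.py <args>` behaves like `vllm serve <args>`.
    runpy.run_module('vllm.entrypoints.openai.api_server', run_name='__main__')
```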
@@ -8,7 +8,7 @@ Below, you can find an explanation of every engine argument for vLLM:
 .. argparse::
     :module: vllm.engine.arg_utils
     :func: _engine_args_parser
-    :prog: -m vllm.entrypoints.openai.api_server
+    :prog: vllm serve
     :nodefaultconst:

 Async Engine Arguments
@@ -19,5 +19,5 @@ Below are the additional arguments related to the asynchronous engine:
 .. argparse::
     :module: vllm.engine.arg_utils
     :func: _async_engine_args_parser
-    :prog: -m vllm.entrypoints.openai.api_server
+    :prog: vllm serve
     :nodefaultconst:
@@ -61,8 +61,7 @@ LoRA adapted models can also be served with the Open-AI compatible vLLM server.

 .. code-block:: bash

-    python -m vllm.entrypoints.openai.api_server \
-        --model meta-llama/Llama-2-7b-hf \
+    vllm serve meta-llama/Llama-2-7b-hf \
         --enable-lora \
         --lora-modules sql-lora=$HOME/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/

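Note (illustration, not part of the diff): with the server above running, the adapter can be requested by the name given in `--lora-modules`. A hedged sketch using the `openai` client:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

completion = client.completions.create(
    model="sql-lora",               # the LoRA adapter name registered above
    prompt="SELECT count(*) FROM",  # toy prompt, for illustration only
    max_tokens=32,
)
print(completion.choices[0].text)
```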
@@ -94,9 +94,7 @@ Below is an example on how to launch the same ``llava-hf/llava-1.5-7b-hf`` with

 .. code-block:: bash

-    python -m vllm.entrypoints.openai.api_server \
-        --model llava-hf/llava-1.5-7b-hf \
-        --chat-template template_llava.jinja
+    vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja

 .. important::
     We have removed all vision language related CLI args in the ``0.5.1`` release. **This is a breaking change**, so please update your code to follow
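Note (illustration, not part of the diff): the llava server started above accepts OpenAI-style multimodal chat messages. A sketch with a placeholder image URL:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

chat = client.chat.completions.create(
    model="llava-hf/llava-1.5-7b-hf",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "What is shown in this image?"},
            # placeholder URL; any reachable image works
            {"type": "image_url", "image_url": {"url": "https://example.com/duck.jpg"}},
        ],
    }],
    max_tokens=64,
)
print(chat.choices[0].message.content)
```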
@@ -40,7 +40,7 @@ Next, to provision a VM instance with LLM of your choice(`NousResearch/Llama-2-7
   gpu: 24GB
 commands:
   - pip install vllm
-  - python -m vllm.entrypoints.openai.api_server --model $MODEL --port 8000
+  - vllm serve $MODEL --port 8000
 model:
   format: openai
   type: chat
@@ -35,16 +35,14 @@ To run multi-GPU serving, pass in the :code:`--tensor-parallel-size` argument wh

 .. code-block:: console

-    $ python -m vllm.entrypoints.openai.api_server \
-    $     --model facebook/opt-13b \
+    $ vllm serve facebook/opt-13b \
     $     --tensor-parallel-size 4

 You can also additionally specify :code:`--pipeline-parallel-size` to enable pipeline parallelism. For example, to run API server on 8 GPUs with pipeline parallelism and tensor parallelism:

 .. code-block:: console

-    $ python -m vllm.entrypoints.openai.api_server \
-    $     --model gpt2 \
+    $ vllm serve gpt2 \
     $     --tensor-parallel-size 4 \
     $     --pipeline-parallel-size 2 \
     $     --distributed-executor-backend ray
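Note (illustration, not part of the diff): the same parallelism setting applies to offline inference; a minimal sketch, assuming 4 local GPUs:

```python
from vllm import LLM

# tensor_parallel_size mirrors the --tensor-parallel-size flag of `vllm serve`.
llm = LLM(model="facebook/opt-13b", tensor_parallel_size=4)
print(llm.generate("San Francisco is a")[0].outputs[0].text)
```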
@@ -4,7 +4,7 @@ vLLM provides an HTTP server that implements OpenAI's [Completions](https://plat

 You can start the server using Python, or using [Docker](deploying_with_docker.rst):
 ```bash
-python -m vllm.entrypoints.openai.api_server --model NousResearch/Meta-Llama-3-8B-Instruct --dtype auto --api-key token-abc123
+vllm serve NousResearch/Meta-Llama-3-8B-Instruct --dtype auto --api-key token-abc123
 ```

 To call the server, you can use the official OpenAI Python client library, or any other HTTP client.
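Note (illustration, not part of the diff): a short sketch of calling the server above with the official OpenAI Python client; the `api_key` must match the `--api-key` passed to `vllm serve`.

```python
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="token-abc123",
)

completion = client.chat.completions.create(
    model="NousResearch/Meta-Llama-3-8B-Instruct",
    messages=[{"role": "user", "content": "Hello!"}],
)
print(completion.choices[0].message.content)
```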
@@ -97,9 +97,7 @@ template, or the template in string form. Without a chat template, the server wi
 and all chat requests will error.

 ```bash
-python -m vllm.entrypoints.openai.api_server \
-  --model ... \
-  --chat-template ./path-to-chat-template.jinja
+vllm serve <model> --chat-template ./path-to-chat-template.jinja
 ```

 vLLM community provides a set of chat templates for popular models. You can find them in the examples
@@ -110,7 +108,7 @@ directory [here](https://github.com/vllm-project/vllm/tree/main/examples/)
 ```{argparse}
 :module: vllm.entrypoints.openai.cli_args
 :func: create_parser_for_docs
-:prog: -m vllm.entrypoints.openai.api_server
+:prog: vllm serve
 ```

 ## Tool calling in the chat completion API
@@ -1,8 +1,7 @@
-"""Example Python client for vllm.entrypoints.api_server
+"""Example Python client for `vllm.entrypoints.api_server`
 NOTE: The API server is used only for demonstration and simple performance
 benchmarks. It is not intended for production use.
-For production use, we recommend vllm.entrypoints.openai.api_server
-and the OpenAI client API
+For production use, we recommend `vllm serve` and the OpenAI client API.
 """

 import argparse
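Note (illustration, not part of the diff): the demo client in this file essentially does the following; the `/generate` route and its JSON fields belong to the demo `vllm.entrypoints.api_server`, not to the OpenAI-compatible server.

```python
import requests

response = requests.post(
    "http://localhost:8000/generate",
    json={"prompt": "San Francisco is a", "max_tokens": 16},
)
# The demo server returns the generated continuations under "text".
print(response.json()["text"])
```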
@@ -95,9 +95,7 @@ to the path of the custom logging configuration JSON file:

 ```bash
 VLLM_LOGGING_CONFIG_PATH=/path/to/logging_config.json \
-    python3 -m vllm.entrypoints.openai.api_server \
-    --max-model-len 2048 \
-    --model mistralai/Mistral-7B-v0.1
+    vllm serve mistralai/Mistral-7B-v0.1 --max-model-len 2048
 ```

@@ -152,9 +150,7 @@ to the path of the custom logging configuration JSON file:

 ```bash
 VLLM_LOGGING_CONFIG_PATH=/path/to/logging_config.json \
-    python3 -m vllm.entrypoints.openai.api_server \
-    --max-model-len 2048 \
-    --model mistralai/Mistral-7B-v0.1
+    vllm serve mistralai/Mistral-7B-v0.1 --max-model-len 2048
 ```

@@ -167,9 +163,7 @@ loggers.

 ```bash
 VLLM_CONFIGURE_LOGGING=0 \
-    python3 -m vllm.entrypoints.openai.api_server \
-    --max-model-len 2048 \
-    --model mistralai/Mistral-7B-v0.1
+    vllm serve mistralai/Mistral-7B-v0.1 --max-model-len 2048
 ```

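Note (illustration, not part of the diff): with `VLLM_CONFIGURE_LOGGING=0`, vLLM leaves its loggers unconfigured, so an application embedding vLLM can attach its own handlers. A minimal sketch (the format string is illustrative only):

```python
import logging

# Attach a custom handler to the root "vllm" logger.
vllm_logger = logging.getLogger("vllm")
vllm_logger.setLevel(logging.DEBUG)

handler = logging.StreamHandler()
handler.setFormatter(
    logging.Formatter("%(asctime)s %(levelname)s %(name)s: %(message)s"))
vllm_logger.addHandler(handler)
```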
@@ -1,9 +1,7 @@
 """An example showing how to use vLLM to serve VLMs.

 Launch the vLLM server with the following command:
-python -m vllm.entrypoints.openai.api_server \
-    --model llava-hf/llava-1.5-7b-hf \
-    --chat-template template_llava.jinja
+vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja
 """
 import base64
@@ -36,7 +36,7 @@
 ```
 export OTEL_SERVICE_NAME="vllm-server"
 export OTEL_EXPORTER_OTLP_TRACES_INSECURE=true
-python -m vllm.entrypoints.openai.api_server --model="facebook/opt-125m" --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"
+vllm serve facebook/opt-125m --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"
 ```

 1. In a new shell, send requests with trace context from a dummy client
@@ -62,7 +62,7 @@ By default, `grpc` is used. To set `http/protobuf` as the protocol, configure th
 ```
 export OTEL_EXPORTER_OTLP_TRACES_PROTOCOL=http/protobuf
 export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://$JAEGER_IP:4318/v1/traces
-python -m vllm.entrypoints.openai.api_server --model="facebook/opt-125m" --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"
+vllm serve facebook/opt-125m --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"
 ```

 ## Instrumentation of FastAPI
@@ -74,7 +74,7 @@ OpenTelemetry allows automatic instrumentation of FastAPI.

 1. Run vLLM with `opentelemetry-instrument`
 ```
-opentelemetry-instrument python -m vllm.entrypoints.openai.api_server --model="facebook/opt-125m"
+opentelemetry-instrument vllm serve facebook/opt-125m
 ```

 1. Send a request to vLLM and find its trace in Jaeger. It should contain spans from FastAPI.
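Note (illustration, not part of the diff): a hedged sketch of a dummy client that propagates trace context to the server, assuming the `opentelemetry-api` and `opentelemetry-sdk` packages are installed:

```python
import requests
from opentelemetry import trace
from opentelemetry.propagate import inject
from opentelemetry.sdk.trace import TracerProvider

trace.set_tracer_provider(TracerProvider())
tracer = trace.get_tracer("dummy-client")

with tracer.start_as_current_span("client-request"):
    headers = {}
    inject(headers)  # adds the W3C traceparent header for the current span
    resp = requests.post(
        "http://localhost:8000/v1/completions",
        headers=headers,
        json={"model": "facebook/opt-125m", "prompt": "Hi", "max_tokens": 8},
    )
    print(resp.status_code)
```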
@@ -10,8 +10,7 @@ Install:

 Prometheus metric logging is enabled by default in the OpenAI-compatible server. Launch via the entrypoint:
 ```bash
-python3 -m vllm.entrypoints.openai.api_server \
-    --model mistralai/Mistral-7B-v0.1 \
+vllm serve mistralai/Mistral-7B-v0.1 \
     --max-model-len 2048 \
     --disable-log-requests
 ```
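Note (illustration, not part of the diff): the server launched above exposes Prometheus metrics on the same port. A quick check from Python:

```python
import requests

# vLLM metric names are prefixed with "vllm:".
metrics = requests.get("http://localhost:8000/metrics").text
for line in metrics.splitlines():
    if line.startswith("vllm:"):
        print(line)
```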
@@ -9,9 +9,7 @@ MODEL_NAME = "facebook/opt-125m"

 @pytest.fixture(scope="module")
 def server():
-    with RemoteOpenAIServer([
-        "--model",
-        MODEL_NAME,
+    args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
         "float16",
@@ -19,7 +17,9 @@ def server():
         "2048",
         "--enforce-eager",
         "--engine-use-ray"
-    ]) as remote_server:
+    ]
+
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
         yield remote_server

@@ -15,8 +15,6 @@ from ..utils import RemoteOpenAIServer
 ])
 def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME):
     pp_args = [
-        "--model",
-        MODEL_NAME,
         # use half precision for speed and memory savings in CI environment
         "--dtype",
         "bfloat16",
@@ -34,8 +32,6 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME):
     # schedule all workers in a node other than the head node,
     # which can cause the test to fail.
     tp_args = [
-        "--model",
-        MODEL_NAME,
         # use half precision for speed and memory savings in CI environment
         "--dtype",
         "bfloat16",
@@ -53,7 +49,7 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME):

     results = []
     for args in [pp_args, tp_args]:
-        with RemoteOpenAIServer(args) as server:
+        with RemoteOpenAIServer(MODEL_NAME, args) as server:
             client = server.get_client()

             # test models list
@@ -27,9 +27,7 @@ def zephyr_lora_files():

 @pytest.fixture(scope="module")
 def server(zephyr_lora_files):
-    with RemoteOpenAIServer([
-        "--model",
-        MODEL_NAME,
+    args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
         "bfloat16",
@@ -47,7 +45,9 @@ def server(zephyr_lora_files):
         "2",
         "--max-num-seqs",
         "128",
-    ]) as remote_server:
+    ]
+
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
         yield remote_server

@@ -37,9 +37,7 @@ def zephyr_pa_files():

 @pytest.fixture(scope="module")
 def server(zephyr_lora_files, zephyr_pa_files):
-    with RemoteOpenAIServer([
-        "--model",
-        MODEL_NAME,
+    args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
         "bfloat16",
@@ -66,7 +64,9 @@ def server(zephyr_lora_files, zephyr_pa_files):
         "2",
         "--max-prompt-adapter-token",
         "128",
-    ]) as remote_server:
+    ]
+
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
         yield remote_server

@@ -11,9 +11,7 @@ EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct"

 @pytest.fixture(scope="module")
 def embedding_server():
-    with RemoteOpenAIServer([
-        "--model",
-        EMBEDDING_MODEL_NAME,
+    args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
         "bfloat16",
@@ -21,7 +19,9 @@ def embedding_server():
         "--max-model-len",
         "8192",
         "--enforce-eager",
-    ]) as remote_server:
+    ]
+
+    with RemoteOpenAIServer(EMBEDDING_MODEL_NAME, args) as remote_server:
         yield remote_server

@@ -19,9 +19,7 @@ def zephyr_lora_files():

 @pytest.fixture(scope="module")
 def server(zephyr_lora_files):
-    with RemoteOpenAIServer([
-        "--model",
-        MODEL_NAME,
+    args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
         "bfloat16",
@@ -39,7 +37,9 @@ def server(zephyr_lora_files):
         "2",
         "--max-num-seqs",
         "128",
-    ]) as remote_server:
+    ]
+
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
         yield remote_server

@@ -12,9 +12,7 @@ MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"

 @pytest.fixture(scope="module")
 def server():
-    with RemoteOpenAIServer([
-        "--model",
-        MODEL_NAME,
+    args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
         "bfloat16",
@@ -23,7 +21,9 @@ def server():
         "--enforce-eager",
         "--max-num-seqs",
         "128",
-    ]) as remote_server:
+    ]
+
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
         yield remote_server

@@ -23,9 +23,7 @@ TEST_IMAGE_URLS = [

 @pytest.fixture(scope="module")
 def server():
-    with RemoteOpenAIServer([
-        "--model",
-        MODEL_NAME,
+    args = [
         "--dtype",
         "bfloat16",
         "--max-model-len",
|
|||||||
"--enforce-eager",
|
"--enforce-eager",
|
||||||
"--chat-template",
|
"--chat-template",
|
||||||
str(LLAVA_CHAT_TEMPLATE),
|
str(LLAVA_CHAT_TEMPLATE),
|
||||||
]) as remote_server:
|
]
|
||||||
|
|
||||||
|
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||||
yield remote_server
|
yield remote_server
|
||||||
|
|
||||||
|
|
||||||
|
@@ -214,12 +214,12 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):

     ## Start OpenAI API server
     openai_args = [
-        "--model", model_ref, "--dtype", "float16", "--load-format",
+        "--dtype", "float16", "--load-format",
         "tensorizer", "--model-loader-extra-config",
         json.dumps(model_loader_extra_config),
     ]

-    with RemoteOpenAIServer(openai_args) as server:
+    with RemoteOpenAIServer(model_ref, openai_args) as server:
         print("Server ready.")

         client = server.get_client()
@@ -49,7 +49,13 @@ class RemoteOpenAIServer:
     DUMMY_API_KEY = "token-abc123"  # vLLM's OpenAI server does not need API key
     MAX_SERVER_START_WAIT_S = 600  # wait for server to start for 60 seconds

-    def __init__(self, cli_args: List[str], *, auto_port: bool = True) -> None:
+    def __init__(
+        self,
+        model: str,
+        cli_args: List[str],
+        *,
+        auto_port: bool = True,
+    ) -> None:
         if auto_port:
             if "-p" in cli_args or "--port" in cli_args:
                 raise ValueError("You have manually specified the port"
@@ -68,9 +74,7 @@ class RemoteOpenAIServer:
         # the current process might initialize cuda,
         # to be safe, we should use spawn method
         env['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
-        self.proc = subprocess.Popen(
-            [sys.executable, "-m", "vllm.entrypoints.openai.api_server"] +
-            cli_args,
-            env=env,
-            stdout=sys.stdout,
-            stderr=sys.stderr)
+        self.proc = subprocess.Popen(["vllm", "serve"] + [model] + cli_args,
+                                     env=env,
+                                     stdout=sys.stdout,
+                                     stderr=sys.stderr)
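Note (illustration, not part of the diff): after this change, test fixtures pass the model as the first positional argument of `RemoteOpenAIServer` and keep only the remaining CLI flags in `args`. A minimal sketch of the updated pattern:

```python
import pytest

from ..utils import RemoteOpenAIServer

MODEL_NAME = "facebook/opt-125m"


@pytest.fixture(scope="module")
def server():
    args = ["--dtype", "float16", "--max-model-len", "2048", "--enforce-eager"]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server


def test_models_list(server):
    client = server.get_client()
    served = [model.id for model in client.models.list()]
    assert MODEL_NAME in served
```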
|
Loading…
x
Reference in New Issue
Block a user