[Doc] Move examples into categories (#11840)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Harry Mellor 2025-01-08 13:09:53 +00:00 committed by GitHub
parent 2a0596bc48
commit aba8d6ee00
116 changed files with 153 additions and 124 deletions


@ -30,7 +30,7 @@ function cpu_tests() {
# offline inference
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
set -e
python3 examples/offline_inference.py"
python3 examples/offline_inference/offline_inference.py"
# Run basic model test
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "


@ -24,5 +24,5 @@ remove_docker_container
# Run the image and test offline inference
docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
python3 examples/offline_inference.py
python3 examples/offline_inference/offline_inference.py
'


@ -13,4 +13,4 @@ trap remove_docker_container EXIT
remove_docker_container
# Run the image and launch offline inference
docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference.py
docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/offline_inference.py


@ -51,4 +51,4 @@ docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
-e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
--name "${container_name}" \
${image_name} \
/bin/bash -c "python3 /workspace/vllm/examples/offline_inference_neuron.py"
/bin/bash -c "python3 /workspace/vllm/examples/offline_inference/offline_inference_neuron.py"


@ -13,4 +13,4 @@ trap remove_docker_container EXIT
remove_docker_container
# Run the image and launch offline inference
docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference.py
docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/offline_inference.py


@ -14,4 +14,4 @@ remove_docker_container
# For HF_TOKEN.
source /etc/environment
# Run a simple end-to-end example.
docker run --privileged --net host --shm-size=16G -it -e "HF_TOKEN=$HF_TOKEN" --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py"
docker run --privileged --net host --shm-size=16G -it -e "HF_TOKEN=$HF_TOKEN" --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference/offline_inference_tpu.py"


@ -14,6 +14,6 @@ remove_docker_container
# Run the image and test offline inference/tensor parallel
docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c '
python3 examples/offline_inference.py
python3 examples/offline_inference_cli.py -tp 2
python3 examples/offline_inference/offline_inference.py
python3 examples/offline_inference/offline_inference_cli.py -tp 2
'


@ -187,19 +187,19 @@ steps:
- examples/
commands:
- pip install tensorizer # for tensorizer test
- python3 offline_inference.py
- python3 cpu_offload.py
- python3 offline_inference_chat.py
- python3 offline_inference_with_prefix.py
- python3 llm_engine_example.py
- python3 offline_inference_vision_language.py
- python3 offline_inference_vision_language_multi_image.py
- python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference_encoder_decoder.py
- python3 offline_inference_classification.py
- python3 offline_inference_embedding.py
- python3 offline_inference_scoring.py
- python3 offline_profile.py --model facebook/opt-125m run_num_steps --num-steps 2
- python3 offline_inference/offline_inference.py
- python3 offline_inference/cpu_offload.py
- python3 offline_inference/offline_inference_chat.py
- python3 offline_inference/offline_inference_with_prefix.py
- python3 offline_inference/llm_engine_example.py
- python3 offline_inference/offline_inference_vision_language.py
- python3 offline_inference/offline_inference_vision_language_multi_image.py
- python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/offline_inference_encoder_decoder.py
- python3 offline_inference/offline_inference_classification.py
- python3 offline_inference/offline_inference_embedding.py
- python3 offline_inference/offline_inference_scoring.py
- python3 offline_inference/offline_profile.py --model facebook/opt-125m run_num_steps --num-steps 2
- label: Prefix Caching Test # 9min
mirror_hardwares: [amd]


@ -27,7 +27,7 @@ jobs:
version: v3.10.1
- name: Run chart-testing (lint)
run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs examples/chart-helm --charts examples/chart-helm
run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs examples/online_serving/chart-helm --charts examples/online_serving/chart-helm
- name: Setup minio
run: |
@ -64,7 +64,7 @@ jobs:
run: |
export AWS_ACCESS_KEY_ID=minioadmin
export AWS_SECRET_ACCESS_KEY=minioadmin
helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/chart-helm -f examples/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
- name: curl test
run: |


@ -250,7 +250,7 @@ ENV VLLM_USAGE_SOURCE production-docker-image
# define sagemaker first, so it is not default from `docker build`
FROM vllm-openai-base AS vllm-sagemaker
COPY examples/sagemaker-entrypoint.sh .
COPY examples/online_serving/sagemaker-entrypoint.sh .
RUN chmod +x sagemaker-entrypoint.sh
ENTRYPOINT ["./sagemaker-entrypoint.sh"]


@ -26,7 +26,7 @@ Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the serve
### Offline Inference
Refer to <gh-file:examples/offline_inference_with_profiler.py> for an example.
Refer to <gh-file:examples/offline_inference/offline_inference_with_profiler.py> for an example.
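In outline, that example enables vLLM's built-in torch profiler hooks; the sketch below is an illustrative approximation (the model name and trace directory are placeholder assumptions, not taken from the example file):

```python
import os

# The profiler hooks only activate when this directory is set before vLLM starts
# (placeholder path; any writable directory works).
os.environ.setdefault("VLLM_TORCH_PROFILER_DIR", "/tmp/vllm_profile")

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")  # placeholder model
llm.start_profile()                   # begin collecting torch profiler traces
llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16))
llm.stop_profile()                    # traces are written to VLLM_TORCH_PROFILER_DIR
```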
### OpenAI Server


@ -61,7 +61,7 @@ run: |
echo 'Starting gradio server...'
git clone https://github.com/vllm-project/vllm.git || true
python vllm/examples/gradio_openai_chatbot_webserver.py \
python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \
-m $MODEL_NAME \
--port 8811 \
--model-url http://localhost:8081/v1 \
@ -321,7 +321,7 @@ run: |
echo 'Starting gradio server...'
git clone https://github.com/vllm-project/vllm.git || true
python vllm/examples/gradio_openai_chatbot_webserver.py \
python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \
-m $MODEL_NAME \
--port 8811 \
--model-url http://$ENDPOINT/v1 \


@ -21,7 +21,7 @@ Disaggregated prefill DOES NOT improve throughput.
## Usage example
Please refer to `examples/disaggregated_prefill.sh` for the example usage of disaggregated prefilling.
Please refer to `examples/online_serving/disaggregated_prefill.sh` for the example usage of disaggregated prefilling.
## Benchmarks


@ -47,7 +47,7 @@ outputs = llm.generate(
)
```
Check out <gh-file:examples/multilora_inference.py> for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options.
Check out <gh-file:examples/offline_inference/multilora_inference.py> for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options.
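For a single adapter with the synchronous `LLM` entrypoint, the pattern looks roughly like this hedged sketch (the base model, adapter name, ID and path are illustrative assumptions):

```python
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

# enable_lora must be set so the engine reserves capacity for adapter weights
llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_lora=True)

sampling_params = SamplingParams(temperature=0.0, max_tokens=64)
outputs = llm.generate(
    "Write a SQL query that lists all users:",
    sampling_params,
    # LoRARequest takes (name, integer ID, local path) -- placeholder values here
    lora_request=LoRARequest("sql_adapter", 1, "/path/to/sql_lora_adapter"),
)
print(outputs[0].outputs[0].text)
```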
## Serving LoRA Adapters


@ -47,7 +47,7 @@ print(f'Model is quantized and saved at "{quant_path}"')
To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command:
```console
$ python examples/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq
$ python examples/offline_inference/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq
```
AWQ models are also supported directly through the LLM entrypoint:
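In rough terms, that entrypoint usage looks like the following sketch (the prompt and sampling values are illustrative, not taken from the docs snippet):

```python
from vllm import LLM, SamplingParams

# quantization="awq" tells vLLM to load the pre-quantized AWQ checkpoint
llm = LLM(model="TheBloke/Llama-2-7b-Chat-AWQ", quantization="awq")

sampling_params = SamplingParams(temperature=0.8, top_p=0.95)  # illustrative values
outputs = llm.generate(["Hello, my name is"], sampling_params)
print(outputs[0].outputs[0].text)
```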


@ -28,7 +28,7 @@ Here is an example of how to enable this feature:
```python
# two float8_e4m3fn kv cache scaling factor files are provided under tests/fp8_kv, please refer to
# https://github.com/vllm-project/vllm/blob/main/examples/fp8/README.md to generate kv_cache_scales.json of your own.
# https://github.com/vllm-project/vllm/blob/main/examples/other/fp8/README.md to generate kv_cache_scales.json of your own.
from vllm import LLM, SamplingParams
sampling_params = SamplingParams(temperature=1.3, top_p=0.8)
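# A hedged sketch of how this snippet typically continues; the model name and
# kv_cache_scales.json path below are illustrative assumptions.
llm = LLM(model="meta-llama/Llama-2-7b-chat-hf",
          kv_cache_dtype="fp8",
          quantization_param_path="./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json")
out = llm.generate("London is the capital of", sampling_params)
print(out[0].outputs[0].text)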


@ -131,7 +131,7 @@ completion = client.chat.completions.create(
print(completion.choices[0].message.content)
```
Full example: <gh-file:examples/openai_chat_completion_structured_outputs.py>
Full example: <gh-file:examples/online_serving/openai_chat_completion_structured_outputs.py>
## Experimental Automatic Parsing (OpenAI API)
@ -257,4 +257,4 @@ outputs = llm.generate(
print(outputs[0].outputs[0].text)
```
Full example: <gh-file:examples/offline_inference_structured_outputs.py>
Full example: <gh-file:examples/offline_inference/offline_inference_structured_outputs.py>
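The offline flow shown above relies on guided decoding; a minimal self-contained sketch (the model and choice labels are illustrative assumptions) looks roughly like this:

```python
from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams

# Constrain generation to one of two labels via guided decoding
guided = GuidedDecodingParams(choice=["Positive", "Negative"])
sampling_params = SamplingParams(guided_decoding=guided)

llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct")  # illustrative model
outputs = llm.generate(
    "Classify this sentiment: vLLM is wonderful!",
    sampling_params=sampling_params,
)
print(outputs[0].outputs[0].text)
```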


@ -12,6 +12,7 @@ EXAMPLE_DOC_DIR = ROOT_DIR / "docs/source/getting_started/examples"
def fix_case(text: str) -> str:
subs = {
"api": "API",
"Cli": "CLI",
"cpu": "CPU",
"llm": "LLM",
"tpu": "TPU",
@ -58,7 +59,7 @@ class Index:
content = f"# {self.title}\n\n{self.description}\n\n"
content += "```{toctree}\n"
content += f":caption: {self.caption}\n:maxdepth: {self.maxdepth}\n"
content += "\n".join(sorted(self.documents)) + "\n```\n"
content += "\n".join(self.documents) + "\n```\n"
return content
@ -131,11 +132,14 @@ class Example:
ROOT_DIR)
content = f"Source <gh-file:{self.path.relative_to(ROOT_DIR)}>.\n\n"
if self.main_file.suffix == ".py":
content += f"# {self.title}\n\n"
include = "include" if self.main_file.suffix == ".md" else \
"literalinclude"
content += f":::{{{include}}} {make_relative(self.main_file)}\n:::\n\n"
if include == "literalinclude":
content += f"# {self.title}\n\n"
content += f":::{{{include}}} {make_relative(self.main_file)}\n"
if include == "literalinclude":
content += f":language: {self.main_file.suffix[1:]}\n"
content += ":::\n\n"
if not self.other_files:
return content
@ -163,14 +167,16 @@ def generate_examples():
description=
"A collection of examples demonstrating usage of vLLM.\nAll documented examples are autogenerated using <gh-file:docs/source/generate_examples.py> from examples found in <gh-file:examples>.", # noqa: E501
caption="Examples",
maxdepth=1) # TODO change to 2 when examples start being categorised
maxdepth=2)
# Category indices stored in reverse order because they are inserted into
# examples_index.documents at index 0 in order
category_indices = {
"offline_inference":
"other":
Index(
path=EXAMPLE_DOC_DIR / "examples_offline_inference_index.md",
title="Offline Inference",
path=EXAMPLE_DOC_DIR / "examples_other_index.md",
title="Other",
description=
"Offline inference examples demonstrate how to use vLLM in an offline setting, where the model is queried for predictions in batches.", # noqa: E501
"Other examples that don't strongly fit into the online or offline serving categories.", # noqa: E501
caption="Examples",
),
"online_serving":
@ -181,31 +187,30 @@ def generate_examples():
"Online serving examples demonstrate how to use vLLM in an online setting, where the model is queried for predictions in real-time.", # noqa: E501
caption="Examples",
),
"other":
"offline_inference":
Index(
path=EXAMPLE_DOC_DIR / "examples_other_index.md",
title="Other",
path=EXAMPLE_DOC_DIR / "examples_offline_inference_index.md",
title="Offline Inference",
description=
"Other examples that don't strongly fit into the online or offline serving categories.", # noqa: E501
"Offline inference examples demonstrate how to use vLLM in an offline setting, where the model is queried for predictions in batches.", # noqa: E501
caption="Examples",
),
}
examples = []
glob_patterns = ["*.py", "*.md", "*.sh"]
# Find categorised examples
for category in category_indices:
category_dir = EXAMPLE_DIR / category
py = category_dir.glob("*.py")
md = category_dir.glob("*.md")
for path in itertools.chain(py, md):
globs = [category_dir.glob(pattern) for pattern in glob_patterns]
for path in itertools.chain(*globs):
examples.append(Example(path, category))
# Find examples in subdirectories
for path in category_dir.glob("*/*.md"):
examples.append(Example(path.parent, category))
# Find uncategorised examples
py = EXAMPLE_DIR.glob("*.py")
md = EXAMPLE_DIR.glob("*.md")
for path in itertools.chain(py, md):
globs = [EXAMPLE_DIR.glob(pattern) for pattern in glob_patterns]
for path in itertools.chain(*globs):
examples.append(Example(path))
# Find examples in subdirectories
for path in EXAMPLE_DIR.glob("*/*.md"):
@ -215,7 +220,7 @@ def generate_examples():
examples.append(Example(path.parent))
# Generate the example documentation
for example in examples:
for example in sorted(examples, key=lambda e: e.path.stem):
doc_path = EXAMPLE_DOC_DIR / f"{example.path.stem}.md"
with open(doc_path, "w+") as f:
f.write(example.generate())


@ -95,7 +95,7 @@ $ VLLM_TARGET_DEVICE=cpu python setup.py install
$ sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library
$ find / -name *libtcmalloc* # find the dynamic link library path
$ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD
$ python examples/offline_inference.py # run vLLM
$ python examples/offline_inference/offline_inference.py # run vLLM
```
- When using the online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserving CPU 30 and 31 for the framework and using CPU 0-29 for OpenMP:
@ -132,7 +132,7 @@ CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ
# On this platform, it is recommended to only bind OpenMP threads on logical CPU cores 0-7 or 8-15
$ export VLLM_CPU_OMP_THREADS_BIND=0-7
$ python examples/offline_inference.py
$ python examples/offline_inference/offline_inference.py
```
- If using the vLLM CPU backend on a multi-socket machine with NUMA, be sure to set the CPU cores using `VLLM_CPU_OMP_THREADS_BIND` to avoid cross-NUMA-node memory access.


@ -71,4 +71,4 @@ $ --pipeline-parallel-size=2 \
$ -tp=8
```
By default, a ray instance will be launched automatically if no existing one is detected in system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the <gh-file:examples/run_cluster.sh> helper script.
By default, a ray instance will be launched automatically if no existing one is detected in system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the <gh-file:examples/online_serving/run_cluster.sh> helper script.


@ -31,7 +31,7 @@ For non-CUDA platforms, please refer [here](#installation-index) for specific in
## Offline Batched Inference
With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: <gh-file:examples/offline_inference.py>
With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: <gh-file:examples/offline_inference/offline_inference.py>
The first line of this example imports the classes {class}`~vllm.LLM` and {class}`~vllm.SamplingParams`:
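In outline, the example boils down to something like the following sketch (the prompts, model and sampling values shown here are illustrative):

```python
from vllm import LLM, SamplingParams

prompts = ["Hello, my name is", "The capital of France is"]  # illustrative prompts
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

llm = LLM(model="facebook/opt-125m")  # illustrative model
outputs = llm.generate(prompts, sampling_params)

for output in outputs:
    print(f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}")
```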
@ -133,7 +133,7 @@ completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct",
print("Completion result:", completion)
```
A more detailed client example can be found here: <gh-file:examples/openai_completion_client.py>
A more detailed client example can be found here: <gh-file:examples/online_serving/openai_completion_client.py>
### OpenAI Chat Completions API with vLLM


@ -24,7 +24,7 @@ To isolate the model downloading and loading issue, you can use the `--load-form
## Model is too large
If the model is too large to fit in a single GPU, you might want to [consider tensor parallelism](#distributed-serving) to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using <gh-file:examples/save_sharded_state.py>. The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism.
If the model is too large to fit in a single GPU, you might want to [consider tensor parallelism](#distributed-serving) to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using <gh-file:examples/offline_inference/save_sharded_state.py>. The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism.
## Enable more logging


@ -9,7 +9,7 @@ shorter Pod startup times and CPU memory usage. Tensor encryption is also suppor
For more information on CoreWeave's Tensorizer, please refer to
[CoreWeave's Tensorizer documentation](https://github.com/coreweave/tensorizer). For more information on serializing a vLLM model, as well a general usage guide to using Tensorizer with vLLM, see
the [vLLM example script](https://docs.vllm.ai/en/stable/getting_started/examples/tensorize_vllm_model.html).
the [vLLM example script](https://docs.vllm.ai/en/stable/getting_started/examples/offline_inference/tensorize_vllm_model.html).
```{note}
Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`.


@ -46,7 +46,7 @@ for output in outputs:
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
A code example can be found here: <gh-file:examples/offline_inference.py>
A code example can be found here: <gh-file:examples/offline_inference/offline_inference.py>
### `LLM.beam_search`
@ -103,7 +103,7 @@ for output in outputs:
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
A code example can be found here: <gh-file:examples/offline_inference_chat.py>
A code example can be found here: <gh-file:examples/offline_inference/offline_inference_chat.py>
If the model doesn't have a chat template or you want to specify another one,
you can explicitly pass a chat template:


@ -65,7 +65,7 @@ embeds = output.outputs.embedding
print(f"Embeddings: {embeds!r} (size={len(embeds)})")
```
A code example can be found here: <gh-file:examples/offline_inference_embedding.py>
A code example can be found here: <gh-file:examples/offline_inference/offline_inference_embedding.py>
### `LLM.classify`
@ -80,7 +80,7 @@ probs = output.outputs.probs
print(f"Class Probabilities: {probs!r} (size={len(probs)})")
```
A code example can be found here: <gh-file:examples/offline_inference_classification.py>
A code example can be found here: <gh-file:examples/offline_inference/offline_inference_classification.py>
### `LLM.score`
@ -102,7 +102,7 @@ score = output.outputs.score
print(f"Score: {score}")
```
A code example can be found here: <gh-file:examples/offline_inference_scoring.py>
A code example can be found here: <gh-file:examples/offline_inference/offline_inference_scoring.py>
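The scoring call that produces `output` above follows a cross-encoder pattern along these lines (a hedged sketch; the model name and sentence pair are illustrative assumptions):

```python
from vllm import LLM

# task="score" selects the cross-encoder / scoring code path
llm = LLM(model="BAAI/bge-reranker-v2-m3", task="score")

(output,) = llm.score(
    "What is the capital of France?",
    "The capital of France is Paris.",
)
print(f"Score: {output.outputs.score}")
```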
## Online Inference


@ -51,7 +51,7 @@ $ --pipeline-parallel-size 2
If a single node does not have enough GPUs to hold the model, you can run the model using multiple nodes. It is important to make sure the execution environment is the same on all nodes, including the model path and the Python environment. The recommended way is to use docker images to ensure the same environment, and to hide the heterogeneity of the host machines by mapping them into the same docker configuration.
The first step, is to start containers and organize them into a cluster. We have provided the helper script <gh-file:examples/run_cluster.sh> to start the cluster. Please note, this script launches docker without administrative privileges that would be required to access GPU performance counters when running profiling and tracing tools. For that purpose, the script can have `CAP_SYS_ADMIN` to the docker container by using the `--cap-add` option in the docker run command.
The first step, is to start containers and organize them into a cluster. We have provided the helper script <gh-file:examples/online_serving/run_cluster.sh> to start the cluster. Please note, this script launches docker without administrative privileges that would be required to access GPU performance counters when running profiling and tracing tools. For that purpose, the script can have `CAP_SYS_ADMIN` to the docker container by using the `--cap-add` option in the docker run command.
Pick a node as the head node, and run the following command:


@ -60,7 +60,7 @@ for o in outputs:
print(generated_text)
```
Full example: <gh-file:examples/offline_inference_vision_language.py>
Full example: <gh-file:examples/offline_inference/offline_inference_vision_language.py>
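The generation loop above is driven by a multimodal prompt of roughly this shape (the model, prompt template and image source are illustrative assumptions):

```python
from vllm import LLM
from PIL import Image

llm = LLM(model="llava-hf/llava-1.5-7b-hf")  # illustrative vision-language model

# The prompt must follow the model's own image-placeholder / chat format
prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
image = Image.open("example.jpg")  # placeholder local image

outputs = llm.generate({
    "prompt": prompt,
    "multi_modal_data": {"image": image},
})

for o in outputs:
    print(o.outputs[0].text)
```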
To substitute multiple images inside the same text prompt, you can pass in a list of images instead:
@ -91,7 +91,7 @@ for o in outputs:
print(generated_text)
```
Full example: <gh-file:examples/offline_inference_vision_language_multi_image.py>
Full example: <gh-file:examples/offline_inference/offline_inference_vision_language_multi_image.py>
Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos:
@ -125,13 +125,13 @@ for o in outputs:
You can pass a list of NumPy arrays directly to the `'video'` field of the multi-modal dictionary
instead of using multi-image input.
Full example: <gh-file:examples/offline_inference_vision_language.py>
Full example: <gh-file:examples/offline_inference/offline_inference_vision_language.py>
### Audio
You can pass a tuple `(array, sampling_rate)` to the `'audio'` field of the multi-modal dictionary.
Full example: <gh-file:examples/offline_inference_audio_language.py>
Full example: <gh-file:examples/offline_inference/offline_inference_audio_language.py>
### Embedding
@ -271,7 +271,7 @@ chat_response = client.chat.completions.create(
print("Chat completion output:", chat_response.choices[0].message.content)
```
Full example: <gh-file:examples/openai_chat_completion_client_for_multimodal.py>
Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
```{tip}
Loading from local file paths is also supported on vLLM: You can specify the allowed local media path via `--allowed-local-media-path` when launching the API server/engine,
@ -342,7 +342,7 @@ result = chat_completion_from_url.choices[0].message.content
print("Chat completion output from image url:", result)
```
Full example: <gh-file:examples/openai_chat_completion_client_for_multimodal.py>
Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
````{note}
By default, the timeout for fetching videos through HTTP URL is `30` seconds.
@ -445,7 +445,7 @@ result = chat_completion_from_url.choices[0].message.content
print("Chat completion output from audio url:", result)
```
Full example: <gh-file:examples/openai_chat_completion_client_for_multimodal.py>
Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
````{note}
By default, the timeout for fetching audios through HTTP URL is `10` seconds.
@ -529,4 +529,4 @@ Also important, `MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of th
example below for details.
```
Full example: <gh-file:examples/openai_chat_embedding_client_for_multimodal.py>
Full example: <gh-file:examples/online_serving/openai_chat_embedding_client_for_multimodal.py>


@ -191,7 +191,7 @@ The order of priorities is `command line > config file values > defaults`.
Our Completions API is compatible with [OpenAI's Completions API](https://platform.openai.com/docs/api-reference/completions);
you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it.
Code example: <gh-file:examples/openai_completion_client.py>
Code example: <gh-file:examples/online_serving/openai_completion_client.py>
#### Extra parameters
@ -222,7 +222,7 @@ We support both [Vision](https://platform.openai.com/docs/guides/vision)- and
see our [Multimodal Inputs](#multimodal-inputs) guide for more information.
- *Note: `image_url.detail` parameter is not supported.*
Code example: <gh-file:examples/openai_chat_completion_client.py>
Code example: <gh-file:examples/online_serving/openai_chat_completion_client.py>
#### Extra parameters
@ -255,7 +255,7 @@ which will be treated as a single prompt to the model.
This enables multi-modal inputs to be passed to embedding models, see [this page](#multimodal-inputs) for details.
```
Code example: <gh-file:examples/openai_embedding_client.py>
Code example: <gh-file:examples/online_serving/openai_embedding_client.py>
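Because this endpoint mirrors OpenAI's Embeddings API, the official client can be used directly; a minimal hedged sketch (server URL, model and inputs are illustrative assumptions):

```python
from openai import OpenAI

# Point the official OpenAI client at a locally running vLLM server
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

responses = client.embeddings.create(
    model="intfloat/e5-mistral-7b-instruct",  # illustrative embedding model
    input=["Hello my name is", "vLLM serves embedding models too"],
)
for data in responses.data:
    print(len(data.embedding))  # dimensionality of each returned vector
```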
#### Extra parameters
@ -299,7 +299,7 @@ Our Pooling API encodes input prompts using a [pooling model](../models/pooling_
The input format is the same as [Embeddings API](#embeddings-api), but the output data can contain an arbitrary nested list, not just a 1-D list of floats.
Code example: <gh-file:examples/openai_pooling_client.py>
Code example: <gh-file:examples/online_serving/openai_pooling_client.py>
(score-api)=
### Score API
@ -309,7 +309,7 @@ Usually, the score for a sentence pair refers to the similarity between two sent
You can find the documentation for these kinds of models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html).
Code example: <gh-file:examples/openai_cross_encoder_score.py>
Code example: <gh-file:examples/online_serving/openai_cross_encoder_score.py>
#### Single inference


@ -3,7 +3,8 @@ Demonstrate prompting of text-to-text
encoder/decoder models, specifically Florence-2
'''
# TODO(Isotr0py):
# Move to offline_inference_vision_language.py after porting vision backbone
# Move to offline_inference/offline_inference_vision_language.py
# after porting vision backbone
from vllm import LLM, SamplingParams
dtype = "float"


@ -8,7 +8,7 @@ This is a guide to performing batch inference using the OpenAI batch file format
The OpenAI batch file format consists of a series of json objects on new lines.
[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/openai_example_batch.jsonl)
[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl)
Each line represents a separate request. See the [OpenAI package reference](https://platform.openai.com/docs/api-reference/batch/requestInput) for more details.
@ -31,13 +31,13 @@ We currently only support `/v1/chat/completions` and `/v1/embeddings` endpoints
To follow along with this example, you can download the example batch, or create your own batch file in your working directory.
```
wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl
wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl
```
Once you've created your batch file it should look like this
```
$ cat openai_example_batch.jsonl
$ cat offline_inference/offline_inference_openai/openai_example_batch.jsonl
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
```
@ -49,7 +49,7 @@ The batch running tool is designed to be used from the command line.
You can run the batch with the following command, which will write its results to a file called `results.jsonl`
```
python -m vllm.entrypoints.openai.run_batch -i openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
python -m vllm.entrypoints.openai.run_batch -i offline_inference/offline_inference_openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
```
### Step 3: Check your results
@ -66,10 +66,10 @@ $ cat results.jsonl
The batch runner supports remote input and output urls that are accessible via http/https.
For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl`, you can run
For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl`, you can run
```
python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
```
## Example 3: Integrating with AWS S3
@ -90,13 +90,13 @@ To integrate with cloud blob storage, we recommend using presigned urls.
To follow along with this example, you can download the example batch, or create your own batch file in your working directory.
```
wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl
wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl
```
Once you've created your batch file it should look like this
```
$ cat openai_example_batch.jsonl
$ cat offline_inference/offline_inference_openai/openai_example_batch.jsonl
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
```
@ -104,7 +104,7 @@ $ cat openai_example_batch.jsonl
Now upload your batch file to your S3 bucket.
```
aws s3 cp openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl
aws s3 cp offline_inference/offline_inference_openai/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl
```
### Step 2: Generate your presigned urls
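A hedged sketch of this step using `boto3` (the bucket and input key match the placeholders above; the output key is an additional placeholder):

```python
import boto3

s3 = boto3.client("s3")

# URL the batch runner will read the input file from
input_url = s3.generate_presigned_url(
    ClientMethod="get_object",
    Params={"Bucket": "MY_BUCKET", "Key": "MY_INPUT_FILE.jsonl"},
    ExpiresIn=3600,
)
# URL the batch runner will write the results to
output_url = s3.generate_presigned_url(
    ClientMethod="put_object",
    Params={"Bucket": "MY_BUCKET", "Key": "MY_OUTPUT_FILE.jsonl"},
    ExpiresIn=3600,
)
print(f"{input_url=}")
print(f"{output_url=}")
```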


@ -363,7 +363,7 @@ Profile a model
example:
```
python examples/offline_profile.py \\
python examples/offline_inference/offline_profile.py \\
--model neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 --batch-size 4 \\
--prompt-len 512 --max-num-batched-tokens 8196 --json Llama31-8b-FP8 \\
--enforce-eager run_num_steps -n 2


@ -0,0 +1,21 @@
# Helm Charts
This directory contains a Helm chart for deploying the vllm application. The chart includes configurations for deployment, autoscaling, resource management, and more.
## Files
- Chart.yaml: Defines the chart metadata including name, version, and maintainers.
- ct.yaml: Configuration for chart testing.
- lintconf.yaml: Linting rules for YAML files.
- values.schema.json: JSON schema for validating values.yaml.
- values.yaml: Default values for the Helm chart.
- templates/_helpers.tpl: Helper templates for defining common configurations.
- templates/configmap.yaml: Template for creating ConfigMaps.
- templates/custom-objects.yaml: Template for custom Kubernetes objects.
- templates/deployment.yaml: Template for creating Deployments.
- templates/hpa.yaml: Template for Horizontal Pod Autoscaler.
- templates/job.yaml: Template for Kubernetes Jobs.
- templates/poddisruptionbudget.yaml: Template for Pod Disruption Budget.
- templates/pvc.yaml: Template for Persistent Volume Claims.
- templates/secrets.yaml: Template for Kubernetes Secrets.
- templates/service.yaml: Template for creating Services.

Some files were not shown because too many files have changed in this diff.