[Doc] Rename offline inference examples (#11927)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Harry Mellor 2025-01-10 15:50:29 +00:00 committed by GitHub
parent 20410b2fda
commit 482cdc494e
46 changed files with 46 additions and 46 deletions


@@ -30,7 +30,7 @@ function cpu_tests() {
 # offline inference
 docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
 set -e
-python3 examples/offline_inference/offline_inference.py"
+python3 examples/offline_inference/basic.py"
 # Run basic model test
 docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "


@@ -24,5 +24,5 @@ remove_docker_container
 # Run the image and test offline inference
 docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
-python3 examples/offline_inference/offline_inference.py
+python3 examples/offline_inference/basic.py
 '


@@ -13,4 +13,4 @@ trap remove_docker_container EXIT
 remove_docker_container
 # Run the image and launch offline inference
-docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/offline_inference.py
+docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic.py


@@ -51,4 +51,4 @@ docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
 -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
 --name "${container_name}" \
 ${image_name} \
-/bin/bash -c "python3 /workspace/vllm/examples/offline_inference/offline_inference_neuron.py"
+/bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py"


@@ -13,4 +13,4 @@ trap remove_docker_container EXIT
 remove_docker_container
 # Run the image and launch offline inference
-docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/offline_inference.py
+docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic.py


@@ -23,4 +23,4 @@ docker run --privileged --net host --shm-size=16G -it \
 && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
 && python3 /workspace/vllm/tests/tpu/test_compilation.py \
 && python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
-&& python3 /workspace/vllm/examples/offline_inference/offline_inference_tpu.py"
+&& python3 /workspace/vllm/examples/offline_inference/tpu.py"


@@ -14,6 +14,6 @@ remove_docker_container
 # Run the image and test offline inference/tensor parallel
 docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c '
-python3 examples/offline_inference/offline_inference.py
+python3 examples/offline_inference/basic.py
-python3 examples/offline_inference/offline_inference_cli.py -tp 2
+python3 examples/offline_inference/cli.py -tp 2
 '


@@ -187,19 +187,19 @@ steps:
   - examples/
   commands:
   - pip install tensorizer # for tensorizer test
-  - python3 offline_inference/offline_inference.py
+  - python3 offline_inference/basic.py
   - python3 offline_inference/cpu_offload.py
-  - python3 offline_inference/offline_inference_chat.py
+  - python3 offline_inference/chat.py
-  - python3 offline_inference/offline_inference_with_prefix.py
+  - python3 offline_inference/prefix_caching.py
   - python3 offline_inference/llm_engine_example.py
-  - python3 offline_inference/offline_inference_vision_language.py
+  - python3 offline_inference/vision_language.py
-  - python3 offline_inference/offline_inference_vision_language_multi_image.py
+  - python3 offline_inference/vision_language_multi_image.py
   - python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-  - python3 offline_inference/offline_inference_encoder_decoder.py
+  - python3 offline_inference/encoder_decoder.py
-  - python3 offline_inference/offline_inference_classification.py
+  - python3 offline_inference/classification.py
-  - python3 offline_inference/offline_inference_embedding.py
+  - python3 offline_inference/embedding.py
-  - python3 offline_inference/offline_inference_scoring.py
+  - python3 offline_inference/scoring.py
-  - python3 offline_inference/offline_profile.py --model facebook/opt-125m run_num_steps --num-steps 2
+  - python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
 - label: Prefix Caching Test # 9min
   mirror_hardwares: [amd]


@@ -26,7 +26,7 @@ Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the serve
 ### Offline Inference
-Refer to <gh-file:examples/offline_inference/offline_inference_with_profiler.py> for an example.
+Refer to <gh-file:examples/offline_inference/simple_profiling.py> for an example.
 ### OpenAI Server


@@ -257,4 +257,4 @@ outputs = llm.generate(
 print(outputs[0].outputs[0].text)
 ```
-Full example: <gh-file:examples/offline_inference/offline_inference_structured_outputs.py>
+Full example: <gh-file:examples/offline_inference/structured_outputs.py>
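For readers skimming this diff, the renamed structured_outputs.py covers guided decoding. A minimal sketch of that pattern, assuming the `GuidedDecodingParams` helper and the model name shown here (both are illustrative, not taken from this commit):

```python
from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams  # assumed import path

# Constrain generation to a fixed set of choices.
guided = GuidedDecodingParams(choice=["Positive", "Negative"])
params = SamplingParams(guided_decoding=guided)

llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct")  # illustrative model
outputs = llm.generate("Classify this sentiment: vLLM is wonderful!", params)
print(outputs[0].outputs[0].text)
```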


@@ -95,7 +95,7 @@ $ VLLM_TARGET_DEVICE=cpu python setup.py install
 $ sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library
 $ find / -name *libtcmalloc* # find the dynamic link library path
 $ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD
-$ python examples/offline_inference/offline_inference.py # run vLLM
+$ python examples/offline_inference/basic.py # run vLLM
 ```
 - When using the online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserving CPU 30 and 31 for the framework and using CPU 0-29 for OpenMP:
@@ -132,7 +132,7 @@ CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ
 # On this platform, it is recommend to only bind openMP threads on logical CPU cores 0-7 or 8-15
 $ export VLLM_CPU_OMP_THREADS_BIND=0-7
-$ python examples/offline_inference/offline_inference.py
+$ python examples/offline_inference/basic.py
 ```
 - If using vLLM CPU backend on a multi-socket machine with NUMA, be aware to set CPU cores using `VLLM_CPU_OMP_THREADS_BIND` to avoid cross NUMA node memory access.


@@ -40,7 +40,7 @@ For non-CUDA platforms, please refer [here](#installation-index) for specific in
 ## Offline Batched Inference
-With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: <gh-file:examples/offline_inference/offline_inference.py>
+With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: <gh-file:examples/offline_inference/basic.py>
 The first line of this example imports the classes {class}`~vllm.LLM` and {class}`~vllm.SamplingParams`:
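For orientation, the renamed basic.py is essentially the standard offline-inference pattern the quickstart describes. A minimal sketch (prompts and model are illustrative):

```python
from vllm import LLM, SamplingParams

prompts = [
    "Hello, my name is",
    "The capital of France is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Load the model and run batched generation over all prompts.
llm = LLM(model="facebook/opt-125m")
outputs = llm.generate(prompts, sampling_params)

for output in outputs:
    print(f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}")
```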


@@ -46,7 +46,7 @@ for output in outputs:
 print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
 ```
-A code example can be found here: <gh-file:examples/offline_inference/offline_inference.py>
+A code example can be found here: <gh-file:examples/offline_inference/basic.py>
 ### `LLM.beam_search`
@@ -103,7 +103,7 @@ for output in outputs:
 print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
 ```
-A code example can be found here: <gh-file:examples/offline_inference/offline_inference_chat.py>
+A code example can be found here: <gh-file:examples/offline_inference/chat.py>
 If the model doesn't have a chat template or you want to specify another one,
 you can explicitly pass a chat template:
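As a quick reference for the renamed chat.py, a minimal `LLM.chat` sketch (the model and messages are illustrative; a custom template can be supplied via the `chat_template` argument mentioned above):

```python
from vllm import LLM, SamplingParams

llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")
conversation = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write a haiku about GPUs."},
]
# chat() applies the model's chat template before generating.
outputs = llm.chat(conversation, sampling_params=SamplingParams(temperature=0.5))

for output in outputs:
    print(output.outputs[0].text)
```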


@@ -88,7 +88,7 @@ embeds = output.outputs.embedding
 print(f"Embeddings: {embeds!r} (size={len(embeds)})")
 ```
-A code example can be found here: <gh-file:examples/offline_inference/offline_inference_embedding.py>
+A code example can be found here: <gh-file:examples/offline_inference/embedding.py>
 ### `LLM.classify`
@@ -103,7 +103,7 @@ probs = output.outputs.probs
 print(f"Class Probabilities: {probs!r} (size={len(probs)})")
 ```
-A code example can be found here: <gh-file:examples/offline_inference/offline_inference_classification.py>
+A code example can be found here: <gh-file:examples/offline_inference/classification.py>
 ### `LLM.score`
@@ -125,7 +125,7 @@ score = output.outputs.score
 print(f"Score: {score}")
 ```
-A code example can be found here: <gh-file:examples/offline_inference/offline_inference_scoring.py>
+A code example can be found here: <gh-file:examples/offline_inference/scoring.py>
 ## Online Serving
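For the renamed pooling examples (embedding.py, scoring.py), the call pattern is roughly as follows. This is a sketch: the model names are illustrative and the `task` argument is assumed to select the pooling runner.

```python
from vllm import LLM

# Embedding: one vector per prompt.
embed_llm = LLM(model="intfloat/e5-mistral-7b-instruct", task="embed")
(output,) = embed_llm.embed("Hello, my name is")
print(f"Embedding size: {len(output.outputs.embedding)}")

# Cross-encoder scoring: similarity between a query and a document.
score_llm = LLM(model="BAAI/bge-reranker-v2-m3", task="score")
(output,) = score_llm.score("What is the capital of France?",
                            "The capital of France is Paris.")
print(f"Score: {output.outputs.score}")
```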


@@ -60,7 +60,7 @@ for o in outputs:
 print(generated_text)
 ```
-Full example: <gh-file:examples/offline_inference/offline_inference_vision_language.py>
+Full example: <gh-file:examples/offline_inference/vision_language.py>
 To substitute multiple images inside the same text prompt, you can pass in a list of images instead:
@@ -91,7 +91,7 @@ for o in outputs:
 print(generated_text)
 ```
-Full example: <gh-file:examples/offline_inference/offline_inference_vision_language_multi_image.py>
+Full example: <gh-file:examples/offline_inference/vision_language_multi_image.py>
 Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos:
@@ -125,13 +125,13 @@ for o in outputs:
 You can pass a list of NumPy arrays directly to the `'video'` field of the multi-modal dictionary
 instead of using multi-image input.
-Full example: <gh-file:examples/offline_inference/offline_inference_vision_language.py>
+Full example: <gh-file:examples/offline_inference/vision_language.py>
 ### Audio
 You can pass a tuple `(array, sampling_rate)` to the `'audio'` field of the multi-modal dictionary.
-Full example: <gh-file:examples/offline_inference/offline_inference_audio_language.py>
+Full example: <gh-file:examples/offline_inference/audio_language.py>
 ### Embedding
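For context, the single-image path in the renamed vision_language.py follows this shape. A sketch only: the model, prompt format, and image file are illustrative, and each model expects its own image placeholder tokens.

```python
from PIL import Image

from vllm import LLM

llm = LLM(model="llava-hf/llava-1.5-7b-hf")
image = Image.open("example.jpg")  # any RGB image

# The prompt and the image are passed together in one multi-modal request.
outputs = llm.generate({
    "prompt": "USER: <image>\nWhat is shown in this image?\nASSISTANT:",
    "multi_modal_data": {"image": image},
})
print(outputs[0].outputs[0].text)
```

For multi-image or video input, the `'image'` / `'video'` entries take a list instead of a single item, as described above.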


@@ -3,7 +3,7 @@ Demonstrate prompting of text-to-text
 encoder/decoder models, specifically Florence-2
 '''
 # TODO(Isotr0py):
-# Move to offline_inference/offline_inference_vision_language.py
+# Move to offline_inference/vision_language.py
 # after porting vision backbone
 from vllm import LLM, SamplingParams


@@ -8,7 +8,7 @@ This is a guide to performing batch inference using the OpenAI batch file format
 The OpenAI batch file format consists of a series of json objects on new lines.
-[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl)
+[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference/openai/openai_example_batch.jsonl)
 Each line represents a separate request. See the [OpenAI package reference](https://platform.openai.com/docs/api-reference/batch/requestInput) for more details.
@@ -31,13 +31,13 @@ We currently only support `/v1/chat/completions` and `/v1/embeddings` endpoints
 To follow along with this example, you can download the example batch, or create your own batch file in your working directory.
 ```
-wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl
+wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl
 ```
 Once you've created your batch file it should look like this
 ```
-$ cat offline_inference/offline_inference_openai/openai_example_batch.jsonl
+$ cat offline_inference/openai/openai_example_batch.jsonl
 {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
 {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
 ```
@@ -49,7 +49,7 @@ The batch running tool is designed to be used from the command line.
 You can run the batch with the following command, which will write its results to a file called `results.jsonl`
 ```
-python -m vllm.entrypoints.openai.run_batch -i offline_inference/offline_inference_openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
+python -m vllm.entrypoints.openai.run_batch -i offline_inference/openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
 ```
 ### Step 3: Check your results
@@ -66,10 +66,10 @@ $ cat results.jsonl
 The batch runner supports remote input and output urls that are accessible via http/https.
-For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl`, you can run
+For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl`, you can run
 ```
-python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
+python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
 ```
 ## Example 3: Integrating with AWS S3
@@ -90,13 +90,13 @@ To integrate with cloud blob storage, we recommend using presigned urls.
 To follow along with this example, you can download the example batch, or create your own batch file in your working directory.
 ```
-wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl
+wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl
 ```
 Once you've created your batch file it should look like this
 ```
-$ cat offline_inference/offline_inference_openai/openai_example_batch.jsonl
+$ cat offline_inference/openai/openai_example_batch.jsonl
 {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
 {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
 ```
@@ -104,7 +104,7 @@ $ cat offline_inference/offline_inference_openai/openai_example_batch.jsonl
 Now upload your batch file to your S3 bucket.
 ```
-aws s3 cp offline_inference/offline_inference_openai/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl
+aws s3 cp offline_inference/openai/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl
 ```
 ### Step 2: Generate your presigned urls
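Step 2 typically comes down to a couple of boto3 calls. A sketch, with the bucket and object keys as placeholders matching `MY_BUCKET`/`MY_INPUT_FILE.jsonl` above (the output key name is an assumption for illustration):

```python
import boto3

s3 = boto3.client("s3")

# URL the batch runner will read the input file from.
input_url = s3.generate_presigned_url(
    "get_object",
    Params={"Bucket": "MY_BUCKET", "Key": "MY_INPUT_FILE.jsonl"},
    ExpiresIn=3600,
)
# URL the batch runner will write the results to.
output_url = s3.generate_presigned_url(
    "put_object",
    Params={"Bucket": "MY_BUCKET", "Key": "MY_OUTPUT_FILE.jsonl"},
    ExpiresIn=3600,
)
print(input_url)
print(output_url)
```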


@@ -363,7 +363,7 @@ Profile a model
 example:
 ```
-python examples/offline_inference/offline_profile.py \\
+python examples/offline_inference/profiling.py \\
 --model neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 --batch-size 4 \\
 --prompt-len 512 --max-num-batched-tokens 8196 --json Llama31-8b-FP8 \\
 --enforce-eager run_num_steps -n 2


@@ -5,7 +5,7 @@ def test_platform_plugins():
 import os
 example_file = os.path.join(
 os.path.dirname(os.path.dirname(os.path.dirname(current_file))),
-"examples", "offline_inference/offline_inference.py")
+"examples", "offline_inference/basic.py")
 runpy.run_path(example_file)
 # check if the plugin is loaded correctly


@@ -31,7 +31,7 @@ if __name__ == "__main__":
 type=str,
 required=True,
 help="json trace file output by "
-"examples/offline_inference/offline_profile.py")
+"examples/offline_inference/profiling.py")
 parser.add_argument("--phase",
 type=str,
 required=True,


@@ -538,7 +538,7 @@ if __name__ == "__main__":
 type=str,
 required=True,
 help="json trace file output by \
-examples/offline_inference/offline_profile.py")
+examples/offline_inference/profiling.py")
 parser.add_argument("--output-directory",
 type=str,
 required=False,