[Doc] Rename offline inference examples (#11927)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent 20410b2fda
commit 482cdc494e
@@ -30,7 +30,7 @@ function cpu_tests() {
 # offline inference
 docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
 set -e
-python3 examples/offline_inference/offline_inference.py"
+python3 examples/offline_inference/basic.py"
 
 # Run basic model test
 docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
@@ -24,5 +24,5 @@ remove_docker_container
 
 # Run the image and test offline inference
 docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
-python3 examples/offline_inference/offline_inference.py
+python3 examples/offline_inference/basic.py
 '
@@ -13,4 +13,4 @@ trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image and launch offline inference
-docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/offline_inference.py
+docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic.py
@@ -51,4 +51,4 @@ docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
 -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
 --name "${container_name}" \
 ${image_name} \
-/bin/bash -c "python3 /workspace/vllm/examples/offline_inference/offline_inference_neuron.py"
+/bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py"
@@ -13,4 +13,4 @@ trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image and launch offline inference
-docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/offline_inference.py
+docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic.py
@@ -23,4 +23,4 @@ docker run --privileged --net host --shm-size=16G -it \
 && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
 && python3 /workspace/vllm/tests/tpu/test_compilation.py \
 && python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
-&& python3 /workspace/vllm/examples/offline_inference/offline_inference_tpu.py"
+&& python3 /workspace/vllm/examples/offline_inference/tpu.py"
@@ -14,6 +14,6 @@ remove_docker_container
 
 # Run the image and test offline inference/tensor parallel
 docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c '
-python3 examples/offline_inference/offline_inference.py
-python3 examples/offline_inference/offline_inference_cli.py -tp 2
+python3 examples/offline_inference/basic.py
+python3 examples/offline_inference/cli.py -tp 2
 '
@@ -187,19 +187,19 @@ steps:
 - examples/
 commands:
 - pip install tensorizer # for tensorizer test
-- python3 offline_inference/offline_inference.py
+- python3 offline_inference/basic.py
 - python3 offline_inference/cpu_offload.py
-- python3 offline_inference/offline_inference_chat.py
-- python3 offline_inference/offline_inference_with_prefix.py
+- python3 offline_inference/chat.py
+- python3 offline_inference/prefix_caching.py
 - python3 offline_inference/llm_engine_example.py
-- python3 offline_inference/offline_inference_vision_language.py
-- python3 offline_inference/offline_inference_vision_language_multi_image.py
+- python3 offline_inference/vision_language.py
+- python3 offline_inference/vision_language_multi_image.py
 - python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-- python3 offline_inference/offline_inference_encoder_decoder.py
-- python3 offline_inference/offline_inference_classification.py
-- python3 offline_inference/offline_inference_embedding.py
-- python3 offline_inference/offline_inference_scoring.py
-- python3 offline_inference/offline_profile.py --model facebook/opt-125m run_num_steps --num-steps 2
+- python3 offline_inference/encoder_decoder.py
+- python3 offline_inference/classification.py
+- python3 offline_inference/embedding.py
+- python3 offline_inference/scoring.py
+- python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
 
 - label: Prefix Caching Test # 9min
 mirror_hardwares: [amd]
@@ -26,7 +26,7 @@ Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the serve
 
 ### Offline Inference
 
-Refer to <gh-file:examples/offline_inference/offline_inference_with_profiler.py> for an example.
+Refer to <gh-file:examples/offline_inference/simple_profiling.py> for an example.
 
 ### OpenAI Server
 
@@ -257,4 +257,4 @@ outputs = llm.generate(
 print(outputs[0].outputs[0].text)
 ```
 
-Full example: <gh-file:examples/offline_inference/offline_inference_structured_outputs.py>
+Full example: <gh-file:examples/offline_inference/structured_outputs.py>
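The hunk above comes from the structured outputs guide, which wraps constrained generation in `SamplingParams`. A minimal sketch of that pattern is below; the `GuidedDecodingParams(choice=...)` constraint and the model name are illustrative assumptions, not the exact contents of `structured_outputs.py`.

```python
from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams

# Constrain generation to a fixed set of answers (illustrative constraint).
guided = GuidedDecodingParams(choice=["Positive", "Negative"])
sampling_params = SamplingParams(guided_decoding=guided)

llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct")  # assumed model for illustration
outputs = llm.generate(
    prompts="Classify this sentiment: vLLM is wonderful!",
    sampling_params=sampling_params,
)
print(outputs[0].outputs[0].text)
```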
@@ -95,7 +95,7 @@ $ VLLM_TARGET_DEVICE=cpu python setup.py install
 $ sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library
 $ find / -name *libtcmalloc* # find the dynamic link library path
 $ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD
-$ python examples/offline_inference/offline_inference.py # run vLLM
+$ python examples/offline_inference/basic.py # run vLLM
 ```
 
 - When using the online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserving CPU 30 and 31 for the framework and using CPU 0-29 for OpenMP:
@@ -132,7 +132,7 @@ CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ
 
 # On this platform, it is recommend to only bind openMP threads on logical CPU cores 0-7 or 8-15
 $ export VLLM_CPU_OMP_THREADS_BIND=0-7
-$ python examples/offline_inference/offline_inference.py
+$ python examples/offline_inference/basic.py
 ```
 
 - If using vLLM CPU backend on a multi-socket machine with NUMA, be aware to set CPU cores using `VLLM_CPU_OMP_THREADS_BIND` to avoid cross NUMA node memory access.
@@ -40,7 +40,7 @@ For non-CUDA platforms, please refer [here](#installation-index) for specific in
 
 ## Offline Batched Inference
 
-With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: <gh-file:examples/offline_inference/offline_inference.py>
+With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: <gh-file:examples/offline_inference/basic.py>
 
 The first line of this example imports the classes {class}`~vllm.LLM` and {class}`~vllm.SamplingParams`:
 
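For reference, the offline batched inference flow that `basic.py` demonstrates follows the pattern below; the prompts and the `facebook/opt-125m` model are placeholders rather than the exact values used in the example script.

```python
from vllm import LLM, SamplingParams

prompts = [
    "Hello, my name is",
    "The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

llm = LLM(model="facebook/opt-125m")  # placeholder model
outputs = llm.generate(prompts, sampling_params)

for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```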
@@ -46,7 +46,7 @@ for output in outputs:
 print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
 ```
 
-A code example can be found here: <gh-file:examples/offline_inference/offline_inference.py>
+A code example can be found here: <gh-file:examples/offline_inference/basic.py>
 
 ### `LLM.beam_search`
 
@@ -103,7 +103,7 @@ for output in outputs:
 print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
 ```
 
-A code example can be found here: <gh-file:examples/offline_inference/offline_inference_chat.py>
+A code example can be found here: <gh-file:examples/offline_inference/chat.py>
 
 If the model doesn't have a chat template or you want to specify another one,
 you can explicitly pass a chat template:
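The `chat.py` example referenced above is built around `LLM.chat`, which takes OpenAI-style messages. A minimal sketch with an illustrative instruct model; the optional `chat_template` argument is where a custom template can be passed explicitly.

```python
from vllm import LLM, SamplingParams

llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")  # illustrative model
conversation = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write a haiku about GPUs."},
]

# chat_template=... can be supplied here if the model ships without one.
outputs = llm.chat(conversation, SamplingParams(temperature=0.5))
print(outputs[0].outputs[0].text)
```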
@@ -88,7 +88,7 @@ embeds = output.outputs.embedding
 print(f"Embeddings: {embeds!r} (size={len(embeds)})")
 ```
 
-A code example can be found here: <gh-file:examples/offline_inference/offline_inference_embedding.py>
+A code example can be found here: <gh-file:examples/offline_inference/embedding.py>
 
 ### `LLM.classify`
 
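The surrounding section documents the pooling API that `embedding.py` exercises. A rough sketch under the assumption that the model is loaded with the embedding task; the model name and task value are illustrative and may differ between vLLM versions.

```python
from vllm import LLM

# Model and task value are assumptions for illustration.
llm = LLM(model="intfloat/e5-mistral-7b-instruct", task="embed")

(output,) = llm.embed("Hello, my name is")
embeds = output.outputs.embedding
print(f"Embeddings: {embeds!r} (size={len(embeds)})")
```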
@@ -103,7 +103,7 @@ probs = output.outputs.probs
 print(f"Class Probabilities: {probs!r} (size={len(probs)})")
 ```
 
-A code example can be found here: <gh-file:examples/offline_inference/offline_inference_classification.py>
+A code example can be found here: <gh-file:examples/offline_inference/classification.py>
 
 ### `LLM.score`
 
@@ -125,7 +125,7 @@ score = output.outputs.score
 print(f"Score: {score}")
 ```
 
-A code example can be found here: <gh-file:examples/offline_inference/offline_inference_scoring.py>
+A code example can be found here: <gh-file:examples/offline_inference/scoring.py>
 
 ## Online Serving
 
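Similarly, the `scoring.py` example is built on `LLM.score`, which computes a relevance score for a pair of texts with a cross-encoder. A hedged sketch; the reranker model and task value are assumptions.

```python
from vllm import LLM

# Cross-encoder model and task value are assumptions for illustration.
llm = LLM(model="BAAI/bge-reranker-v2-m3", task="score")

(output,) = llm.score("What is the capital of France?",
                      "The capital of France is Paris.")
score = output.outputs.score
print(f"Score: {score}")
```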
@@ -60,7 +60,7 @@ for o in outputs:
 print(generated_text)
 ```
 
-Full example: <gh-file:examples/offline_inference/offline_inference_vision_language.py>
+Full example: <gh-file:examples/offline_inference/vision_language.py>
 
 To substitute multiple images inside the same text prompt, you can pass in a list of images instead:
 
@@ -91,7 +91,7 @@ for o in outputs:
 print(generated_text)
 ```
 
-Full example: <gh-file:examples/offline_inference/offline_inference_vision_language_multi_image.py>
+Full example: <gh-file:examples/offline_inference/vision_language_multi_image.py>
 
 Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos:
 
@@ -125,13 +125,13 @@ for o in outputs:
 You can pass a list of NumPy arrays directly to the `'video'` field of the multi-modal dictionary
 instead of using multi-image input.
 
-Full example: <gh-file:examples/offline_inference/offline_inference_vision_language.py>
+Full example: <gh-file:examples/offline_inference/vision_language.py>
 
 ### Audio
 
 You can pass a tuple `(array, sampling_rate)` to the `'audio'` field of the multi-modal dictionary.
 
-Full example: <gh-file:examples/offline_inference/offline_inference_audio_language.py>
+Full example: <gh-file:examples/offline_inference/audio_language.py>
 
 ### Embedding
 
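The multimodal sections above all share the same input shape: the prompt is paired with a `multi_modal_data` dictionary whose keys (`'image'`, `'video'`, `'audio'`) carry the media. A minimal image sketch, with the LLaVA model and prompt format as illustrative assumptions.

```python
from vllm import LLM
from PIL import Image

# Model and prompt format are assumptions for illustration.
llm = LLM(model="llava-hf/llava-1.5-7b-hf")
image = Image.open("example.jpg")  # any RGB image

outputs = llm.generate({
    "prompt": "USER: <image>\nWhat is shown in this image?\nASSISTANT:",
    "multi_modal_data": {"image": image},
})
print(outputs[0].outputs[0].text)
```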
@@ -3,7 +3,7 @@ Demonstrate prompting of text-to-text
 encoder/decoder models, specifically Florence-2
 '''
 # TODO(Isotr0py):
-# Move to offline_inference/offline_inference_vision_language.py
+# Move to offline_inference/vision_language.py
 # after porting vision backbone
 from vllm import LLM, SamplingParams
 
@@ -8,7 +8,7 @@ This is a guide to performing batch inference using the OpenAI batch file format
 
 The OpenAI batch file format consists of a series of json objects on new lines.
 
-[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl)
+[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference/openai/openai_example_batch.jsonl)
 
 Each line represents a separate request. See the [OpenAI package reference](https://platform.openai.com/docs/api-reference/batch/requestInput) for more details.
 
@@ -31,13 +31,13 @@ We currently only support `/v1/chat/completions` and `/v1/embeddings` endpoints
 To follow along with this example, you can download the example batch, or create your own batch file in your working directory.
 
 ```
-wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl
+wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl
 ```
 
 Once you've created your batch file it should look like this
 
 ```
-$ cat offline_inference/offline_inference_openai/openai_example_batch.jsonl
+$ cat offline_inference/openai/openai_example_batch.jsonl
 {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
 {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
 ```
@@ -49,7 +49,7 @@ The batch running tool is designed to be used from the command line.
 You can run the batch with the following command, which will write its results to a file called `results.jsonl`
 
 ```
-python -m vllm.entrypoints.openai.run_batch -i offline_inference/offline_inference_openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
+python -m vllm.entrypoints.openai.run_batch -i offline_inference/openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
 ```
 
 ### Step 3: Check your results
@@ -66,10 +66,10 @@ $ cat results.jsonl
 
 The batch runner supports remote input and output urls that are accessible via http/https.
 
-For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl`, you can run
+For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl`, you can run
 
 ```
-python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
+python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
 ```
 
 ## Example 3: Integrating with AWS S3
@@ -90,13 +90,13 @@ To integrate with cloud blob storage, we recommend using presigned urls.
 To follow along with this example, you can download the example batch, or create your own batch file in your working directory.
 
 ```
-wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl
+wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl
 ```
 
 Once you've created your batch file it should look like this
 
 ```
-$ cat offline_inference/offline_inference_openai/openai_example_batch.jsonl
+$ cat offline_inference/openai/openai_example_batch.jsonl
 {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
 {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
 ```
@@ -104,7 +104,7 @@ $ cat offline_inference/offline_inference_openai/openai_example_batch.jsonl
 Now upload your batch file to your S3 bucket.
 
 ```
-aws s3 cp offline_inference/offline_inference_openai/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl
+aws s3 cp offline_inference/openai/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl
 ```
 
 ### Step 2: Generate your presigned urls
@@ -363,7 +363,7 @@ Profile a model
 
 example:
 ```
-python examples/offline_inference/offline_profile.py \\
+python examples/offline_inference/profiling.py \\
 --model neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 --batch-size 4 \\
 --prompt-len 512 --max-num-batched-tokens 8196 --json Llama31-8b-FP8 \\
 --enforce-eager run_num_steps -n 2
@@ -5,7 +5,7 @@ def test_platform_plugins():
 import os
 example_file = os.path.join(
 os.path.dirname(os.path.dirname(os.path.dirname(current_file))),
-"examples", "offline_inference/offline_inference.py")
+"examples", "offline_inference/basic.py")
 runpy.run_path(example_file)
 
 # check if the plugin is loaded correctly
@@ -31,7 +31,7 @@ if __name__ == "__main__":
 type=str,
 required=True,
 help="json trace file output by "
-"examples/offline_inference/offline_profile.py")
+"examples/offline_inference/profiling.py")
 parser.add_argument("--phase",
 type=str,
 required=True,
@@ -538,7 +538,7 @@ if __name__ == "__main__":
 type=str,
 required=True,
 help="json trace file output by \
-examples/offline_inference/offline_profile.py")
+examples/offline_inference/profiling.py")
 parser.add_argument("--output-directory",
 type=str,
 required=False,