[Doc] Rename offline inference examples (#11927)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent 20410b2fda
commit 482cdc494e
@@ -30,7 +30,7 @@ function cpu_tests() {
 # offline inference
 docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
 set -e
-python3 examples/offline_inference/offline_inference.py"
+python3 examples/offline_inference/basic.py"
 
 # Run basic model test
 docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
@@ -24,5 +24,5 @@ remove_docker_container
 
 # Run the image and test offline inference
 docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
-python3 examples/offline_inference/offline_inference.py
+python3 examples/offline_inference/basic.py
 '
@@ -13,4 +13,4 @@ trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image and launch offline inference
-docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/offline_inference.py
+docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic.py
@@ -51,4 +51,4 @@ docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
 -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
 --name "${container_name}" \
 ${image_name} \
-/bin/bash -c "python3 /workspace/vllm/examples/offline_inference/offline_inference_neuron.py"
+/bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py"
@@ -13,4 +13,4 @@ trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image and launch offline inference
-docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/offline_inference.py
+docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic.py
@@ -23,4 +23,4 @@ docker run --privileged --net host --shm-size=16G -it \
 && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
 && python3 /workspace/vllm/tests/tpu/test_compilation.py \
 && python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
-&& python3 /workspace/vllm/examples/offline_inference/offline_inference_tpu.py"
+&& python3 /workspace/vllm/examples/offline_inference/tpu.py"
@@ -14,6 +14,6 @@ remove_docker_container
 
 # Run the image and test offline inference/tensor parallel
 docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c '
-python3 examples/offline_inference/offline_inference.py
-python3 examples/offline_inference/offline_inference_cli.py -tp 2
+python3 examples/offline_inference/basic.py
+python3 examples/offline_inference/cli.py -tp 2
 '
@@ -187,19 +187,19 @@ steps:
 - examples/
 commands:
 - pip install tensorizer # for tensorizer test
-- python3 offline_inference/offline_inference.py
+- python3 offline_inference/basic.py
 - python3 offline_inference/cpu_offload.py
-- python3 offline_inference/offline_inference_chat.py
-- python3 offline_inference/offline_inference_with_prefix.py
+- python3 offline_inference/chat.py
+- python3 offline_inference/prefix_caching.py
 - python3 offline_inference/llm_engine_example.py
-- python3 offline_inference/offline_inference_vision_language.py
-- python3 offline_inference/offline_inference_vision_language_multi_image.py
+- python3 offline_inference/vision_language.py
+- python3 offline_inference/vision_language_multi_image.py
 - python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-- python3 offline_inference/offline_inference_encoder_decoder.py
-- python3 offline_inference/offline_inference_classification.py
-- python3 offline_inference/offline_inference_embedding.py
-- python3 offline_inference/offline_inference_scoring.py
-- python3 offline_inference/offline_profile.py --model facebook/opt-125m run_num_steps --num-steps 2
+- python3 offline_inference/encoder_decoder.py
+- python3 offline_inference/classification.py
+- python3 offline_inference/embedding.py
+- python3 offline_inference/scoring.py
+- python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
 
 - label: Prefix Caching Test # 9min
 mirror_hardwares: [amd]
@@ -26,7 +26,7 @@ Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the serve
 
 ### Offline Inference
 
-Refer to <gh-file:examples/offline_inference/offline_inference_with_profiler.py> for an example.
+Refer to <gh-file:examples/offline_inference/simple_profiling.py> for an example.
 
 ### OpenAI Server
 
@@ -257,4 +257,4 @@ outputs = llm.generate(
 print(outputs[0].outputs[0].text)
 ```
 
-Full example: <gh-file:examples/offline_inference/offline_inference_structured_outputs.py>
+Full example: <gh-file:examples/offline_inference/structured_outputs.py>
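The hunk above comes from the structured outputs guide, which wraps constrained generation in `SamplingParams`. A minimal sketch of that pattern is below; the `GuidedDecodingParams(choice=...)` constraint and the model name are illustrative assumptions, not the exact contents of `structured_outputs.py`.

```python
from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams

# Constrain generation to a fixed set of answers (illustrative constraint).
guided = GuidedDecodingParams(choice=["Positive", "Negative"])
sampling_params = SamplingParams(guided_decoding=guided)

llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct")  # assumed model for illustration
outputs = llm.generate(
    prompts="Classify this sentiment: vLLM is wonderful!",
    sampling_params=sampling_params,
)
print(outputs[0].outputs[0].text)
```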
@@ -95,7 +95,7 @@ $ VLLM_TARGET_DEVICE=cpu python setup.py install
 $ sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library
 $ find / -name *libtcmalloc* # find the dynamic link library path
 $ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD
-$ python examples/offline_inference/offline_inference.py # run vLLM
+$ python examples/offline_inference/basic.py # run vLLM
 ```
 
 - When using the online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserving CPU 30 and 31 for the framework and using CPU 0-29 for OpenMP:
@@ -132,7 +132,7 @@ CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ
 
 # On this platform, it is recommend to only bind openMP threads on logical CPU cores 0-7 or 8-15
 $ export VLLM_CPU_OMP_THREADS_BIND=0-7
-$ python examples/offline_inference/offline_inference.py
+$ python examples/offline_inference/basic.py
 ```
 
 - If using vLLM CPU backend on a multi-socket machine with NUMA, be aware to set CPU cores using `VLLM_CPU_OMP_THREADS_BIND` to avoid cross NUMA node memory access.
@@ -40,7 +40,7 @@ For non-CUDA platforms, please refer [here](#installation-index) for specific in
 
 ## Offline Batched Inference
 
-With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: <gh-file:examples/offline_inference/offline_inference.py>
+With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: <gh-file:examples/offline_inference/basic.py>
 
 The first line of this example imports the classes {class}`~vllm.LLM` and {class}`~vllm.SamplingParams`:
 
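For reference, the offline batched inference flow that `basic.py` demonstrates follows the pattern below; the prompts and the `facebook/opt-125m` model are placeholders rather than the exact values used in the example script.

```python
from vllm import LLM, SamplingParams

prompts = [
    "Hello, my name is",
    "The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

llm = LLM(model="facebook/opt-125m")  # placeholder model
outputs = llm.generate(prompts, sampling_params)

for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```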
@@ -46,7 +46,7 @@ for output in outputs:
 print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
 ```
 
-A code example can be found here: <gh-file:examples/offline_inference/offline_inference.py>
+A code example can be found here: <gh-file:examples/offline_inference/basic.py>
 
 ### `LLM.beam_search`
 
@@ -103,7 +103,7 @@ for output in outputs:
 print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
 ```
 
-A code example can be found here: <gh-file:examples/offline_inference/offline_inference_chat.py>
+A code example can be found here: <gh-file:examples/offline_inference/chat.py>
 
 If the model doesn't have a chat template or you want to specify another one,
 you can explicitly pass a chat template:
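The `chat.py` example referenced above is built around `LLM.chat`, which takes OpenAI-style messages. A minimal sketch with an illustrative instruct model; the optional `chat_template` argument is where a custom template can be passed explicitly.

```python
from vllm import LLM, SamplingParams

llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")  # illustrative model
conversation = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write a haiku about GPUs."},
]

# chat_template=... can be supplied here if the model ships without one.
outputs = llm.chat(conversation, SamplingParams(temperature=0.5))
print(outputs[0].outputs[0].text)
```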
@@ -88,7 +88,7 @@ embeds = output.outputs.embedding
 print(f"Embeddings: {embeds!r} (size={len(embeds)})")
 ```
 
-A code example can be found here: <gh-file:examples/offline_inference/offline_inference_embedding.py>
+A code example can be found here: <gh-file:examples/offline_inference/embedding.py>
 
 ### `LLM.classify`
 
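The surrounding section documents the pooling API that `embedding.py` exercises. A rough sketch under the assumption that the model is loaded with the embedding task; the model name and task value are illustrative and may differ between vLLM versions.

```python
from vllm import LLM

# Model and task value are assumptions for illustration.
llm = LLM(model="intfloat/e5-mistral-7b-instruct", task="embed")

(output,) = llm.embed("Hello, my name is")
embeds = output.outputs.embedding
print(f"Embeddings: {embeds!r} (size={len(embeds)})")
```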
@@ -103,7 +103,7 @@ probs = output.outputs.probs
 print(f"Class Probabilities: {probs!r} (size={len(probs)})")
 ```
 
-A code example can be found here: <gh-file:examples/offline_inference/offline_inference_classification.py>
+A code example can be found here: <gh-file:examples/offline_inference/classification.py>
 
 ### `LLM.score`
 
@@ -125,7 +125,7 @@ score = output.outputs.score
 print(f"Score: {score}")
 ```
 
-A code example can be found here: <gh-file:examples/offline_inference/offline_inference_scoring.py>
+A code example can be found here: <gh-file:examples/offline_inference/scoring.py>
 
 ## Online Serving
 
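Similarly, the `scoring.py` example is built on `LLM.score`, which computes a relevance score for a pair of texts with a cross-encoder. A hedged sketch; the reranker model and task value are assumptions.

```python
from vllm import LLM

# Cross-encoder model and task value are assumptions for illustration.
llm = LLM(model="BAAI/bge-reranker-v2-m3", task="score")

(output,) = llm.score("What is the capital of France?",
                      "The capital of France is Paris.")
score = output.outputs.score
print(f"Score: {score}")
```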
@@ -60,7 +60,7 @@ for o in outputs:
 print(generated_text)
 ```
 
-Full example: <gh-file:examples/offline_inference/offline_inference_vision_language.py>
+Full example: <gh-file:examples/offline_inference/vision_language.py>
 
 To substitute multiple images inside the same text prompt, you can pass in a list of images instead:
 
@@ -91,7 +91,7 @@ for o in outputs:
 print(generated_text)
 ```
 
-Full example: <gh-file:examples/offline_inference/offline_inference_vision_language_multi_image.py>
+Full example: <gh-file:examples/offline_inference/vision_language_multi_image.py>
 
 Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos:
 
@@ -125,13 +125,13 @@ for o in outputs:
 You can pass a list of NumPy arrays directly to the `'video'` field of the multi-modal dictionary
 instead of using multi-image input.
 
-Full example: <gh-file:examples/offline_inference/offline_inference_vision_language.py>
+Full example: <gh-file:examples/offline_inference/vision_language.py>
 
 ### Audio
 
 You can pass a tuple `(array, sampling_rate)` to the `'audio'` field of the multi-modal dictionary.
 
-Full example: <gh-file:examples/offline_inference/offline_inference_audio_language.py>
+Full example: <gh-file:examples/offline_inference/audio_language.py>
 
 ### Embedding
 
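The multimodal sections above all share the same input shape: the prompt is paired with a `multi_modal_data` dictionary whose keys (`'image'`, `'video'`, `'audio'`) carry the media. A minimal image sketch, with the LLaVA model and prompt format as illustrative assumptions.

```python
from vllm import LLM
from PIL import Image

# Model and prompt format are assumptions for illustration.
llm = LLM(model="llava-hf/llava-1.5-7b-hf")
image = Image.open("example.jpg")  # any RGB image

outputs = llm.generate({
    "prompt": "USER: <image>\nWhat is shown in this image?\nASSISTANT:",
    "multi_modal_data": {"image": image},
})
print(outputs[0].outputs[0].text)
```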
@@ -3,7 +3,7 @@ Demonstrate prompting of text-to-text
 encoder/decoder models, specifically Florence-2
 '''
 # TODO(Isotr0py):
-# Move to offline_inference/offline_inference_vision_language.py
+# Move to offline_inference/vision_language.py
 # after porting vision backbone
 from vllm import LLM, SamplingParams
 
@@ -8,7 +8,7 @@ This is a guide to performing batch inference using the OpenAI batch file format
 
 The OpenAI batch file format consists of a series of json objects on new lines.
 
-[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl)
+[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference/openai/openai_example_batch.jsonl)
 
 Each line represents a separate request. See the [OpenAI package reference](https://platform.openai.com/docs/api-reference/batch/requestInput) for more details.
 
@@ -31,13 +31,13 @@ We currently only support `/v1/chat/completions` and `/v1/embeddings` endpoints
 To follow along with this example, you can download the example batch, or create your own batch file in your working directory.
 
 ```
-wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl
+wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl
 ```
 
 Once you've created your batch file it should look like this
 
 ```
-$ cat offline_inference/offline_inference_openai/openai_example_batch.jsonl
+$ cat offline_inference/openai/openai_example_batch.jsonl
 {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
 {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
 ```
@@ -49,7 +49,7 @@ The batch running tool is designed to be used from the command line.
 You can run the batch with the following command, which will write its results to a file called `results.jsonl`
 
 ```
-python -m vllm.entrypoints.openai.run_batch -i offline_inference/offline_inference_openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
+python -m vllm.entrypoints.openai.run_batch -i offline_inference/openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
 ```
 
 ### Step 3: Check your results
@@ -66,10 +66,10 @@ $ cat results.jsonl
 
 The batch runner supports remote input and output urls that are accessible via http/https.
 
-For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl`, you can run
+For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl`, you can run
 
 ```
-python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
+python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
 ```
 
 ## Example 3: Integrating with AWS S3
@@ -90,13 +90,13 @@ To integrate with cloud blob storage, we recommend using presigned urls.
 To follow along with this example, you can download the example batch, or create your own batch file in your working directory.
 
 ```
-wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl
+wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl
 ```
 
 Once you've created your batch file it should look like this
 
 ```
-$ cat offline_inference/offline_inference_openai/openai_example_batch.jsonl
+$ cat offline_inference/openai/openai_example_batch.jsonl
 {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
 {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
 ```
@@ -104,7 +104,7 @@ $ cat offline_inference/offline_inference_openai/openai_example_batch.jsonl
 Now upload your batch file to your S3 bucket.
 
 ```
-aws s3 cp offline_inference/offline_inference_openai/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl
+aws s3 cp offline_inference/openai/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl
 ```
 
 ### Step 2: Generate your presigned urls
@@ -363,7 +363,7 @@ Profile a model
 
 example:
 ```
-python examples/offline_inference/offline_profile.py \\
+python examples/offline_inference/profiling.py \\
 --model neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 --batch-size 4 \\
 --prompt-len 512 --max-num-batched-tokens 8196 --json Llama31-8b-FP8 \\
 --enforce-eager run_num_steps -n 2
@@ -5,7 +5,7 @@ def test_platform_plugins():
 import os
 example_file = os.path.join(
 os.path.dirname(os.path.dirname(os.path.dirname(current_file))),
-"examples", "offline_inference/offline_inference.py")
+"examples", "offline_inference/basic.py")
 runpy.run_path(example_file)
 
 # check if the plugin is loaded correctly
@@ -31,7 +31,7 @@ if __name__ == "__main__":
 type=str,
 required=True,
 help="json trace file output by "
-"examples/offline_inference/offline_profile.py")
+"examples/offline_inference/profiling.py")
 parser.add_argument("--phase",
 type=str,
 required=True,
@@ -538,7 +538,7 @@ if __name__ == "__main__":
 type=str,
 required=True,
 help="json trace file output by \
-examples/offline_inference/offline_profile.py")
+examples/offline_inference/profiling.py")
 parser.add_argument("--output-directory",
 type=str,
 required=False,