Merge similar examples in offline_inference into single basic example (#12737)

parent b69692a2d8
commit 992e5c3d34
@@ -30,7 +30,7 @@ function cpu_tests() {
   # offline inference
   docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
     set -e
-    python3 examples/offline_inference/basic.py"
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
 
   # Run basic model test
   docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
@@ -24,5 +24,5 @@ remove_docker_container
 
 # Run the image and test offline inference
 docker run -e HF_TOKEN -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
-    python3 examples/offline_inference/cli.py --model meta-llama/Llama-3.2-1B
+    python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
 '
@@ -20,5 +20,5 @@ trap remove_docker_container_and_exit EXIT
 remove_docker_container
 
 # Run the image and launch offline inference
-docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic.py
+docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
 EXITCODE=$?
@@ -13,4 +13,4 @@ trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image and launch offline inference
-docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic.py
+docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic/generate.py --model facebook/opt-125m
@@ -14,6 +14,6 @@ remove_docker_container
 
 # Run the image and test offline inference/tensor parallel
 docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c '
-  python3 examples/offline_inference/basic.py
-  python3 examples/offline_inference/cli.py -tp 2
+  python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
+  python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
 '
@@ -215,18 +215,18 @@ steps:
   - examples/
   commands:
     - pip install tensorizer # for tensorizer test
-    - python3 offline_inference/basic.py
-    - python3 offline_inference/cpu_offload.py
-    - python3 offline_inference/chat.py
+    - python3 offline_inference/basic/generate.py --model facebook/opt-125m
+    - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
+    - python3 offline_inference/basic/chat.py
     - python3 offline_inference/prefix_caching.py
     - python3 offline_inference/llm_engine_example.py
     - python3 offline_inference/vision_language.py
     - python3 offline_inference/vision_language_multi_image.py
    - python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
     - python3 offline_inference/encoder_decoder.py
-    - python3 offline_inference/classification.py
-    - python3 offline_inference/embedding.py
-    - python3 offline_inference/scoring.py
+    - python3 offline_inference/basic/classify.py
+    - python3 offline_inference/basic/embed.py
+    - python3 offline_inference/basic/score.py
     - python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
 
 - label: Prefix Caching Test # 9min
@@ -147,7 +147,7 @@ class Example:
             return content
 
         content += "## Example materials\n\n"
-        for file in self.other_files:
+        for file in sorted(self.other_files):
            include = "include" if file.suffix == ".md" else "literalinclude"
            content += f":::{{admonition}} {file.relative_to(self.path)}\n"
            content += ":class: dropdown\n\n"
@@ -194,7 +194,7 @@ def generate_examples():
         path=EXAMPLE_DOC_DIR / "examples_offline_inference_index.md",
         title="Offline Inference",
         description=
-        "Offline inference examples demonstrate how to use vLLM in an offline setting, where the model is queried for predictions in batches.",  # noqa: E501
+        "Offline inference examples demonstrate how to use vLLM in an offline setting, where the model is queried for predictions in batches. We recommend starting with <project:basic.md>.",  # noqa: E501
         caption="Examples",
     ),
 }
@@ -170,7 +170,7 @@ vLLM CPU backend supports the following vLLM features:
 sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library
 find / -name *libtcmalloc* # find the dynamic link library path
 export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD
-python examples/offline_inference/basic.py # run vLLM
+python examples/offline_inference/basic/basic.py # run vLLM
 ```
 
 - When using the online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserving CPU 30 and 31 for the framework and using CPU 0-29 for OpenMP:
@@ -207,7 +207,7 @@ CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ
 
 # On this platform, it is recommend to only bind openMP threads on logical CPU cores 0-7 or 8-15
 $ export VLLM_CPU_OMP_THREADS_BIND=0-7
-$ python examples/offline_inference/basic.py
+$ python examples/offline_inference/basic/basic.py
 ```
 
 - If using vLLM CPU backend on a multi-socket machine with NUMA, be aware to set CPU cores using `VLLM_CPU_OMP_THREADS_BIND` to avoid cross NUMA node memory access.
@@ -40,7 +40,7 @@ For non-CUDA platforms, please refer [here](#installation-index) for specific in
 
 ## Offline Batched Inference
 
-With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: <gh-file:examples/offline_inference/basic.py>
+With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: <gh-file:examples/offline_inference/basic/basic.py>
 
 The first line of this example imports the classes {class}`~vllm.LLM` and {class}`~vllm.SamplingParams`:
 
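For reference, the import this quickstart paragraph points at, and the pattern all of the relocated basic examples follow, looks roughly like this (a minimal sketch; the prompt and sampling values are illustrative only):

```python
from vllm import LLM, SamplingParams

# Illustrative prompt and sampling values; anything accepted by
# SamplingParams works the same way.
prompts = ["Hello, my name is"]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

llm = LLM(model="facebook/opt-125m")
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    print(f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}")
```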
@@ -46,7 +46,7 @@ for output in outputs:
     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
 ```
 
-A code example can be found here: <gh-file:examples/offline_inference/basic.py>
+A code example can be found here: <gh-file:examples/offline_inference/basic/basic.py>
 
 ### `LLM.beam_search`
 
@@ -103,7 +103,7 @@ for output in outputs:
     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
 ```
 
-A code example can be found here: <gh-file:examples/offline_inference/chat.py>
+A code example can be found here: <gh-file:examples/offline_inference/basic/chat.py>
 
 If the model doesn't have a chat template or you want to specify another one,
 you can explicitly pass a chat template:
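A minimal sketch of passing an explicit chat template, mirroring what the new `basic/chat.py` does behind its `--chat-template-path` flag (the model name is just the script's default and `template.jinja` is a placeholder path, not a file shipped with vLLM):

```python
from vllm import LLM, SamplingParams

llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct")
sampling_params = SamplingParams(temperature=0.5)

conversation = [
    {"role": "system", "content": "You are a helpful assistant"},
    {"role": "user", "content": "Hello"},
]

# "template.jinja" is a placeholder path to a custom Jinja chat template.
with open("template.jinja") as f:
    chat_template = f.read()

outputs = llm.chat(
    conversation,
    sampling_params,
    use_tqdm=False,
    chat_template=chat_template,
)
print(outputs[0].outputs[0].text)
```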
@@ -88,7 +88,7 @@ embeds = output.outputs.embedding
 print(f"Embeddings: {embeds!r} (size={len(embeds)})")
 ```
 
-A code example can be found here: <gh-file:examples/offline_inference/embedding.py>
+A code example can be found here: <gh-file:examples/offline_inference/basic/embed.py>
 
 ### `LLM.classify`
 
@@ -103,7 +103,7 @@ probs = output.outputs.probs
 print(f"Class Probabilities: {probs!r} (size={len(probs)})")
 ```
 
-A code example can be found here: <gh-file:examples/offline_inference/classification.py>
+A code example can be found here: <gh-file:examples/offline_inference/basic/classify.py>
 
 ### `LLM.score`
 
@@ -125,7 +125,7 @@ score = output.outputs.score
 print(f"Score: {score}")
 ```
 
-A code example can be found here: <gh-file:examples/offline_inference/scoring.py>
+A code example can be found here: <gh-file:examples/offline_inference/basic/score.py>
 
 ## Online Serving
 
@@ -1,47 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-from vllm import LLM, SamplingParams
-from vllm.utils import FlexibleArgumentParser
-
-
-def main():
-
-    parser = FlexibleArgumentParser(description='AQLM examples')
-
-    parser.add_argument('--model',
-                        '-m',
-                        type=str,
-                        default=None,
-                        help='model path, as for HF')
-    parser.add_argument('--choice',
-                        '-c',
-                        type=int,
-                        default=0,
-                        help='known good models by index, [0-4]')
-    parser.add_argument('--tensor-parallel-size',
-                        '-t',
-                        type=int,
-                        default=1,
-                        help='tensor parallel size')
-
-    args = parser.parse_args()
-
-    models = [
-        "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf",
-        "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-2x8-hf",
-        "ISTA-DASLab/Llama-2-13b-AQLM-2Bit-1x16-hf",
-        "ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf",
-        "BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf",
-    ]
-
-    model = LLM(args.model if args.model is not None else models[args.choice],
-                tensor_parallel_size=args.tensor_parallel_size)
-
-    sampling_params = SamplingParams(max_tokens=100, temperature=0)
-    outputs = model.generate("Hello my name is",
-                             sampling_params=sampling_params)
-    print(outputs[0].outputs[0].text)
-
-
-if __name__ == '__main__':
-    main()
@@ -1,28 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-from vllm import LLM, SamplingParams
-
-# Sample prompts.
-prompts = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
-]
-# Create a sampling params object.
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-
-# Create an LLM.
-llm = LLM(model="snowflake/snowflake-arctic-instruct",
-          quantization="deepspeedfp",
-          tensor_parallel_size=8,
-          trust_remote_code=True)
-# Generate texts from the prompts. The output is a list of RequestOutput objects
-# that contain the prompt, generated text, and other information.
-
-outputs = llm.generate(prompts, sampling_params)
-# Print the outputs.
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
examples/offline_inference/basic/README.md (new file, 94 lines)
@@ -0,0 +1,94 @@
+# Basic
+
+The `LLM` class provides the primary Python interface for doing offline inference, which is interacting with a model without using a separate model inference server.
+
+## Usage
+
+The first script in this example shows the most basic usage of vLLM. If you are new to Python and vLLM, you should start here.
+
+```bash
+python examples/offline_inference/basic/basic.py
+```
+
+The rest of the scripts include an [argument parser](https://docs.python.org/3/library/argparse.html), which you can use to pass any arguments that are compatible with [`LLM`](https://docs.vllm.ai/en/latest/api/offline_inference/llm.html). Try running the script with `--help` for a list of all available arguments.
+
+```bash
+python examples/offline_inference/basic/classify.py
+```
+
+```bash
+python examples/offline_inference/basic/embed.py
+```
+
+```bash
+python examples/offline_inference/basic/score.py
+```
+
+The chat and generate scripts also accept the [sampling parameters](https://docs.vllm.ai/en/latest/api/inference_params.html#sampling-parameters): `max_tokens`, `temperature`, `top_p` and `top_k`.
+
+```bash
+python examples/offline_inference/basic/chat.py
+```
+
+```bash
+python examples/offline_inference/basic/generate.py
+```
+
+## Features
+
+In the scripts that support passing arguments, you can experiment with the following features.
+
+### Default generation config
+
+The `--generation-config` argument specifies where the generation config will be loaded from when calling `LLM.get_default_sampling_params()`. If set to ‘auto’, the generation config will be loaded from the model path. If set to a folder path, the generation config will be loaded from the specified folder path. If it is not provided, vLLM defaults will be used.
+
+> If max_new_tokens is specified in the generation config, then it sets a server-wide limit on the number of output tokens for all requests.
+
+Try it yourself with the following argument:
+
+```bash
+--generation-config auto
+```
+
+### Quantization
+
+#### AQLM
+
+vLLM supports models that are quantized using AQLM.
+
+Try one yourself by passing one of the following models to the `--model` argument:
+
+- `ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf`
+- `ISTA-DASLab/Llama-2-7b-AQLM-2Bit-2x8-hf`
+- `ISTA-DASLab/Llama-2-13b-AQLM-2Bit-1x16-hf`
+- `ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf`
+- `BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf`
+
+> Some of these models are likely to be too large for a single GPU. You can split them across multiple GPUs by setting `--tensor-parallel-size` to the number of required GPUs.
+
+#### GGUF
+
+vLLM supports models that are quantized using GGUF.
+
+Try one yourself by downloading a GGUF quantised model and using the following arguments:
+
+```python
+from huggingface_hub import hf_hub_download
+
+repo_id = "bartowski/Phi-3-medium-4k-instruct-GGUF"
+filename = "Phi-3-medium-4k-instruct-IQ2_M.gguf"
+print(hf_hub_download(repo_id, filename=filename))
+```
+
+```bash
+--model {local-path-printed-above} --tokenizer microsoft/Phi-3-medium-4k-instruct
+```
+
+### CPU offload
+
+The `--cpu-offload-gb` argument can be seen as a virtual way to increase the GPU memory size. For example, if you have one 24 GB GPU and set this to 10, you can virtually think of it as a 34 GB GPU. Then you can load a 13B model with BF16 weights, which requires at least 26 GB of GPU memory. Note that this requires a fast CPU-GPU interconnect, as part of the model is loaded from CPU memory to GPU memory on the fly in each model forward pass.
+
+Try it yourself with the following arguments:
+
+```bash
+--model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
+```
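To make the `--generation-config` behaviour described above concrete, here is a minimal sketch of the underlying API calls; the model name is taken from one of the deleted examples further down in this diff and is illustrative only:

```python
from vllm import LLM

# Load the model's own generation config instead of vLLM's built-in defaults.
llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct", generation_config="auto")

# This is the call that --generation-config feeds into.
sampling_params = llm.get_default_sampling_params()
# Individual fields can still be overridden afterwards.
sampling_params.temperature = 0.5

outputs = llm.generate(["Hello, my name is"], sampling_params)
print(outputs[0].outputs[0].text)
```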
examples/offline_inference/basic/chat.py (new file, 98 lines)
@@ -0,0 +1,98 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from vllm import LLM, EngineArgs
+from vllm.utils import FlexibleArgumentParser
+
+
+def main(args: dict):
+    # Pop arguments not used by LLM
+    max_tokens = args.pop("max_tokens")
+    temperature = args.pop("temperature")
+    top_p = args.pop("top_p")
+    top_k = args.pop("top_k")
+    chat_template_path = args.pop("chat_template_path")
+
+    # Create an LLM
+    llm = LLM(**args)
+
+    # Create sampling params object
+    sampling_params = llm.get_default_sampling_params()
+    if max_tokens is not None:
+        sampling_params.max_tokens = max_tokens
+    if temperature is not None:
+        sampling_params.temperature = temperature
+    if top_p is not None:
+        sampling_params.top_p = top_p
+    if top_k is not None:
+        sampling_params.top_k = top_k
+
+    def print_outputs(outputs):
+        for output in outputs:
+            prompt = output.prompt
+            generated_text = output.outputs[0].text
+            print(f"Prompt: {prompt!r}")
+            print(f"Generated text: {generated_text!r}")
+        print("-" * 80)
+
+    print("=" * 80)
+
+    # In this script, we demonstrate how to pass input to the chat method:
+    conversation = [
+        {
+            "role": "system",
+            "content": "You are a helpful assistant"
+        },
+        {
+            "role": "user",
+            "content": "Hello"
+        },
+        {
+            "role": "assistant",
+            "content": "Hello! How can I assist you today?"
+        },
+        {
+            "role": "user",
+            "content":
+            "Write an essay about the importance of higher education.",
+        },
+    ]
+    outputs = llm.chat(conversation, sampling_params, use_tqdm=False)
+    print_outputs(outputs)
+
+    # You can run batch inference with llm.chat API
+    conversations = [conversation for _ in range(10)]
+
+    # We turn on tqdm progress bar to verify it's indeed running batch inference
+    outputs = llm.chat(conversations, sampling_params, use_tqdm=True)
+    print_outputs(outputs)
+
+    # A chat template can be optionally supplied.
+    # If not, the model will use its default chat template.
+    if chat_template_path is not None:
+        with open(chat_template_path) as f:
+            chat_template = f.read()
+
+        outputs = llm.chat(
+            conversations,
+            sampling_params,
+            use_tqdm=False,
+            chat_template=chat_template,
+        )
+
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser()
+    # Add engine args
+    engine_group = parser.add_argument_group("Engine arguments")
+    EngineArgs.add_cli_args(engine_group)
+    engine_group.set_defaults(model="meta-llama/Llama-3.2-1B-Instruct")
+    # Add sampling params
+    sampling_group = parser.add_argument_group("Sampling parameters")
+    sampling_group.add_argument("--max-tokens", type=int)
+    sampling_group.add_argument("--temperature", type=float)
+    sampling_group.add_argument("--top-p", type=float)
+    sampling_group.add_argument("--top-k", type=int)
+    # Add example params
+    parser.add_argument("--chat-template-path", type=str)
+    args: dict = vars(parser.parse_args())
+    main(args)
examples/offline_inference/basic/classify.py (new file, 42 lines)
@@ -0,0 +1,42 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from argparse import Namespace
+
+from vllm import LLM, EngineArgs
+from vllm.utils import FlexibleArgumentParser
+
+
+def main(args: Namespace):
+    # Sample prompts.
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+
+    # Create an LLM.
+    # You should pass task="classify" for classification models
+    model = LLM(**vars(args))
+
+    # Generate logits. The output is a list of ClassificationRequestOutputs.
+    outputs = model.classify(prompts)
+
+    # Print the outputs.
+    for prompt, output in zip(prompts, outputs):
+        probs = output.outputs.probs
+        probs_trimmed = ((str(probs[:16])[:-1] +
+                          ", ...]") if len(probs) > 16 else probs)
+        print(f"Prompt: {prompt!r} | "
+              f"Class Probabilities: {probs_trimmed} (size={len(probs)})")
+
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser()
+    parser = EngineArgs.add_cli_args(parser)
+    # Set example specific arguments
+    parser.set_defaults(model="jason9693/Qwen2.5-1.5B-apeach",
+                        task="classify",
+                        enforce_eager=True)
+    args = parser.parse_args()
+    main(args)
examples/offline_inference/basic/embed.py (new file, 42 lines)
@@ -0,0 +1,42 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from argparse import Namespace
+
+from vllm import LLM, EngineArgs
+from vllm.utils import FlexibleArgumentParser
+
+
+def main(args: Namespace):
+    # Sample prompts.
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+
+    # Create an LLM.
+    # You should pass task="embed" for embedding models
+    model = LLM(**vars(args))
+
+    # Generate embedding. The output is a list of EmbeddingRequestOutputs.
+    outputs = model.embed(prompts)
+
+    # Print the outputs.
+    for prompt, output in zip(prompts, outputs):
+        embeds = output.outputs.embedding
+        embeds_trimmed = ((str(embeds[:16])[:-1] +
+                           ", ...]") if len(embeds) > 16 else embeds)
+        print(f"Prompt: {prompt!r} | "
+              f"Embeddings: {embeds_trimmed} (size={len(embeds)})")
+
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser()
+    parser = EngineArgs.add_cli_args(parser)
+    # Set example specific arguments
+    parser.set_defaults(model="intfloat/e5-mistral-7b-instruct",
+                        task="embed",
+                        enforce_eager=True)
+    args = parser.parse_args()
+    main(args)
examples/offline_inference/basic/generate.py (new file, 57 lines)
@@ -0,0 +1,57 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from vllm import LLM, EngineArgs
+from vllm.utils import FlexibleArgumentParser
+
+
+def main(args: dict):
+    # Pop arguments not used by LLM
+    max_tokens = args.pop("max_tokens")
+    temperature = args.pop("temperature")
+    top_p = args.pop("top_p")
+    top_k = args.pop("top_k")
+
+    # Create an LLM
+    llm = LLM(**args)
+
+    # Create a sampling params object
+    sampling_params = llm.get_default_sampling_params()
+    if max_tokens is not None:
+        sampling_params.max_tokens = max_tokens
+    if temperature is not None:
+        sampling_params.temperature = temperature
+    if top_p is not None:
+        sampling_params.top_p = top_p
+    if top_k is not None:
+        sampling_params.top_k = top_k
+
+    # Generate texts from the prompts. The output is a list of RequestOutput
+    # objects that contain the prompt, generated text, and other information.
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+    outputs = llm.generate(prompts, sampling_params)
+    # Print the outputs.
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser()
+    # Add engine args
+    engine_group = parser.add_argument_group("Engine arguments")
+    EngineArgs.add_cli_args(engine_group)
+    engine_group.set_defaults(model="meta-llama/Llama-3.2-1B-Instruct")
+    # Add sampling params
+    sampling_group = parser.add_argument_group("Sampling parameters")
+    sampling_group.add_argument("--max-tokens", type=int)
+    sampling_group.add_argument("--temperature", type=float)
+    sampling_group.add_argument("--top-p", type=float)
+    sampling_group.add_argument("--top-k", type=int)
+    args: dict = vars(parser.parse_args())
+    main(args)
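The `--cpu-offload-gb` flag that the test pipeline now passes to `generate.py` maps onto the `cpu_offload_gb` engine argument. For context, a minimal sketch of the equivalent direct API usage, with values mirroring the deleted CPU-offload example further down in this diff:

```python
from vllm import LLM, SamplingParams

# Offload 10 GB of weights to CPU memory so a 13B BF16 model can run on a 24 GB GPU.
llm = LLM(model="meta-llama/Llama-2-13b-chat-hf", cpu_offload_gb=10)

sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
outputs = llm.generate(["Hello, my name is"], sampling_params)
print(outputs[0].outputs[0].text)
```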
examples/offline_inference/basic/score.py (new file, 38 lines)
@@ -0,0 +1,38 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from argparse import Namespace
+
+from vllm import LLM, EngineArgs
+from vllm.utils import FlexibleArgumentParser
+
+
+def main(args: Namespace):
+    # Sample prompts.
+    text_1 = "What is the capital of France?"
+    texts_2 = [
+        "The capital of Brazil is Brasilia.",
+        "The capital of France is Paris.",
+    ]
+
+    # Create an LLM.
+    # You should pass task="score" for cross-encoder models
+    model = LLM(**vars(args))
+
+    # Generate scores. The output is a list of ScoringRequestOutputs.
+    outputs = model.score(text_1, texts_2)
+
+    # Print the outputs.
+    for text_2, output in zip(texts_2, outputs):
+        score = output.outputs.score
+        print(f"Pair: {[text_1, text_2]!r} | Score: {score}")
+
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser()
+    parser = EngineArgs.add_cli_args(parser)
+    # Set example specific arguments
+    parser.set_defaults(model="BAAI/bge-reranker-v2-m3",
+                        task="score",
+                        enforce_eager=True)
+    args = parser.parse_args()
+    main(args)
@@ -1,32 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-from vllm import LLM
-
-# Sample prompts.
-prompts = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
-]
-
-# Create an LLM with built-in default generation config.
-# The generation config is set to None by default to keep
-# the behavior consistent with the previous version.
-# If you want to use the default generation config from the model,
-# you should set the generation_config to "auto".
-llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct", generation_config="auto")
-
-# Load the default sampling parameters from the model.
-sampling_params = llm.get_default_sampling_params()
-# Modify the sampling parameters if needed.
-sampling_params.temperature = 0.5
-
-# Generate texts from the prompts. The output is a list of RequestOutput objects
-# that contain the prompt, generated text, and other information.
-outputs = llm.generate(prompts, sampling_params)
-# Print the outputs.
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
@@ -1,82 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-from vllm import LLM, SamplingParams
-
-llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")
-sampling_params = SamplingParams(temperature=0.5)
-
-
-def print_outputs(outputs):
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-    print("-" * 80)
-
-
-print("=" * 80)
-
-# In this script, we demonstrate how to pass input to the chat method:
-
-conversation = [
-    {
-        "role": "system",
-        "content": "You are a helpful assistant"
-    },
-    {
-        "role": "user",
-        "content": "Hello"
-    },
-    {
-        "role": "assistant",
-        "content": "Hello! How can I assist you today?"
-    },
-    {
-        "role": "user",
-        "content": "Write an essay about the importance of higher education.",
-    },
-]
-outputs = llm.chat(conversation,
-                   sampling_params=sampling_params,
-                   use_tqdm=False)
-print_outputs(outputs)
-
-# You can run batch inference with llm.chat API
-conversation = [
-    {
-        "role": "system",
-        "content": "You are a helpful assistant"
-    },
-    {
-        "role": "user",
-        "content": "Hello"
-    },
-    {
-        "role": "assistant",
-        "content": "Hello! How can I assist you today?"
-    },
-    {
-        "role": "user",
-        "content": "Write an essay about the importance of higher education.",
-    },
-]
-conversations = [conversation for _ in range(10)]
-
-# We turn on tqdm progress bar to verify it's indeed running batch inference
-outputs = llm.chat(messages=conversations,
-                   sampling_params=sampling_params,
-                   use_tqdm=True)
-print_outputs(outputs)
-
-# A chat template can be optionally supplied.
-# If not, the model will use its default chat template.
-
-# with open('template_falcon_180b.jinja', "r") as f:
-#     chat_template = f.read()
-
-# outputs = llm.chat(
-#     conversations,
-#     sampling_params=sampling_params,
-#     use_tqdm=False,
-#     chat_template=chat_template,
-# )
@@ -1,30 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-from vllm import LLM
-
-# Sample prompts.
-prompts = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
-]
-
-# Create an LLM.
-# You should pass task="classify" for classification models
-model = LLM(
-    model="jason9693/Qwen2.5-1.5B-apeach",
-    task="classify",
-    enforce_eager=True,
-)
-
-# Generate logits. The output is a list of ClassificationRequestOutputs.
-outputs = model.classify(prompts)
-
-# Print the outputs.
-for prompt, output in zip(prompts, outputs):
-    probs = output.outputs.probs
-    probs_trimmed = ((str(probs[:16])[:-1] +
-                      ", ...]") if len(probs) > 16 else probs)
-    print(f"Prompt: {prompt!r} | "
-          f"Class Probabilities: {probs_trimmed} (size={len(probs)})")
@@ -1,82 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-from dataclasses import asdict
-
-from vllm import LLM, SamplingParams
-from vllm.engine.arg_utils import EngineArgs
-from vllm.utils import FlexibleArgumentParser
-
-
-def get_prompts(num_prompts: int):
-    # The default sample prompts.
-    prompts = [
-        "Hello, my name is",
-        "The president of the United States is",
-        "The capital of France is",
-        "The future of AI is",
-    ]
-
-    if num_prompts != len(prompts):
-        prompts = (prompts * ((num_prompts // len(prompts)) + 1))[:num_prompts]
-
-    return prompts
-
-
-def main(args):
-    # Create prompts
-    prompts = get_prompts(args.num_prompts)
-
-    # Create a sampling params object.
-    sampling_params = SamplingParams(n=args.n,
-                                     temperature=args.temperature,
-                                     top_p=args.top_p,
-                                     top_k=args.top_k,
-                                     max_tokens=args.max_tokens)
-
-    # Create an LLM.
-    # The default model is 'facebook/opt-125m'
-    engine_args = EngineArgs.from_cli_args(args)
-    llm = LLM(**asdict(engine_args))
-
-    # Generate texts from the prompts.
-    # The output is a list of RequestOutput objects
-    # that contain the prompt, generated text, and other information.
-    outputs = llm.generate(prompts, sampling_params)
-    # Print the outputs.
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-
-
-if __name__ == '__main__':
-    parser = FlexibleArgumentParser()
-    parser = EngineArgs.add_cli_args(parser)
-    group = parser.add_argument_group("SamplingParams options")
-    group.add_argument("--num-prompts",
-                       type=int,
-                       default=4,
-                       help="Number of prompts used for inference")
-    group.add_argument("--max-tokens",
-                       type=int,
-                       default=16,
-                       help="Generated output length for sampling")
-    group.add_argument('--n',
-                       type=int,
-                       default=1,
-                       help='Number of generated sequences per prompt')
-    group.add_argument('--temperature',
-                       type=float,
-                       default=0.8,
-                       help='Temperature for text generation')
-    group.add_argument('--top-p',
-                       type=float,
-                       default=0.95,
-                       help='top_p for text generation')
-    group.add_argument('--top-k',
-                       type=int,
-                       default=-1,
-                       help='top_k for text generation')
-
-    args = parser.parse_args()
-    main(args)
@@ -1,24 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-from vllm import LLM, SamplingParams
-
-# Sample prompts.
-prompts = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
-]
-# Create a sampling params object.
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-
-# Create an LLM.
-llm = LLM(model="meta-llama/Llama-2-13b-chat-hf", cpu_offload_gb=10)
-# Generate texts from the prompts. The output is a list of RequestOutput objects
-# that contain the prompt, generated text, and other information.
-outputs = llm.generate(prompts, sampling_params)
-# Print the outputs.
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
@@ -1,30 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-from vllm import LLM
-
-# Sample prompts.
-prompts = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
-]
-
-# Create an LLM.
-# You should pass task="embed" for embedding models
-model = LLM(
-    model="intfloat/e5-mistral-7b-instruct",
-    task="embed",
-    enforce_eager=True,
-)
-
-# Generate embedding. The output is a list of EmbeddingRequestOutputs.
-outputs = model.embed(prompts)
-
-# Print the outputs.
-for prompt, output in zip(prompts, outputs):
-    embeds = output.outputs.embedding
-    embeds_trimmed = ((str(embeds[:16])[:-1] +
-                       ", ...]") if len(embeds) > 16 else embeds)
-    print(f"Prompt: {prompt!r} | "
-          f"Embeddings: {embeds_trimmed} (size={len(embeds)})")
@@ -1,34 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-from huggingface_hub import hf_hub_download
-
-from vllm import LLM, SamplingParams
-
-
-def run_gguf_inference(model_path, tokenizer):
-    # Sample prompts.
-    prompts = [
-        "How many helicopters can a human eat in one sitting?",
-        "What's the future of AI?",
-    ]
-    prompts = [[{"role": "user", "content": prompt}] for prompt in prompts]
-    # Create a sampling params object.
-    sampling_params = SamplingParams(temperature=0, max_tokens=128)
-
-    # Create an LLM.
-    llm = LLM(model=model_path, tokenizer=tokenizer)
-
-    outputs = llm.chat(prompts, sampling_params)
-    # Print the outputs.
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-
-
-if __name__ == "__main__":
-    repo_id = "bartowski/Phi-3-medium-4k-instruct-GGUF"
-    filename = "Phi-3-medium-4k-instruct-IQ2_M.gguf"
-    tokenizer = "microsoft/Phi-3-medium-4k-instruct"
-    model = hf_hub_download(repo_id, filename=filename)
-    run_gguf_inference(model, tokenizer)
@@ -1,25 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-from vllm import LLM
-
-# Sample prompts.
-text_1 = "What is the capital of France?"
-texts_2 = [
-    "The capital of Brazil is Brasilia.", "The capital of France is Paris."
-]
-
-# Create an LLM.
-# You should pass task="score" for cross-encoder models
-model = LLM(
-    model="BAAI/bge-reranker-v2-m3",
-    task="score",
-    enforce_eager=True,
-)
-
-# Generate scores. The output is a list of ScoringRequestOutputs.
-outputs = model.score(text_1, texts_2)
-
-# Print the outputs.
-for text_2, output in zip(texts_2, outputs):
-    score = output.outputs.score
-    print(f"Pair: {[text_1, text_2]!r} | Score: {score}")
@@ -14,7 +14,7 @@ def test_platform_plugins():
     import os
     example_file = os.path.join(
         os.path.dirname(os.path.dirname(os.path.dirname(current_file))),
-        "examples", "offline_inference/basic.py")
+        "examples", "offline_inference/basic/basic.py")
     runpy.run_path(example_file)
 
     # check if the plugin is loaded correctly