Merge similar examples in offline_inference into single basic example (#12737)

This commit is contained in:
Harry Mellor 2025-02-20 12:53:51 +00:00 committed by GitHub
parent b69692a2d8
commit 992e5c3d34
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
29 changed files with 394 additions and 437 deletions

View File

@ -30,7 +30,7 @@ function cpu_tests() {
# offline inference
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
set -e
python3 examples/offline_inference/basic.py"
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
# Run basic model test
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "

View File

@ -24,5 +24,5 @@ remove_docker_container
# Run the image and test offline inference
docker run -e HF_TOKEN -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
python3 examples/offline_inference/cli.py --model meta-llama/Llama-3.2-1B
python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
'

View File

@ -20,5 +20,5 @@ trap remove_docker_container_and_exit EXIT
remove_docker_container
# Run the image and launch offline inference
docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic.py
docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
EXITCODE=$?

View File

@ -13,4 +13,4 @@ trap remove_docker_container EXIT
remove_docker_container
# Run the image and launch offline inference
docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic.py
docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic/generate.py --model facebook/opt-125m

View File

@ -14,6 +14,6 @@ remove_docker_container
# Run the image and test offline inference/tensor parallel
docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c '
python3 examples/offline_inference/basic.py
python3 examples/offline_inference/cli.py -tp 2
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
'

View File

@ -215,18 +215,18 @@ steps:
- examples/
commands:
- pip install tensorizer # for tensorizer test
- python3 offline_inference/basic.py
- python3 offline_inference/cpu_offload.py
- python3 offline_inference/chat.py
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
- python3 offline_inference/basic/chat.py
- python3 offline_inference/prefix_caching.py
- python3 offline_inference/llm_engine_example.py
- python3 offline_inference/vision_language.py
- python3 offline_inference/vision_language_multi_image.py
- python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/encoder_decoder.py
- python3 offline_inference/classification.py
- python3 offline_inference/embedding.py
- python3 offline_inference/scoring.py
- python3 offline_inference/basic/classify.py
- python3 offline_inference/basic/embed.py
- python3 offline_inference/basic/score.py
- python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
- label: Prefix Caching Test # 9min

View File

@ -147,7 +147,7 @@ class Example:
return content
content += "## Example materials\n\n"
for file in self.other_files:
for file in sorted(self.other_files):
include = "include" if file.suffix == ".md" else "literalinclude"
content += f":::{{admonition}} {file.relative_to(self.path)}\n"
content += ":class: dropdown\n\n"
@ -194,7 +194,7 @@ def generate_examples():
path=EXAMPLE_DOC_DIR / "examples_offline_inference_index.md",
title="Offline Inference",
description=
"Offline inference examples demonstrate how to use vLLM in an offline setting, where the model is queried for predictions in batches.", # noqa: E501
"Offline inference examples demonstrate how to use vLLM in an offline setting, where the model is queried for predictions in batches. We recommend starting with <project:basic.md>.", # noqa: E501
caption="Examples",
),
}

View File

@ -170,7 +170,7 @@ vLLM CPU backend supports the following vLLM features:
sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library
find / -name *libtcmalloc* # find the dynamic link library path
export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD
python examples/offline_inference/basic.py # run vLLM
python examples/offline_inference/basic/basic.py # run vLLM
```
- When using the online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserving CPU 30 and 31 for the framework and using CPU 0-29 for OpenMP:
@ -207,7 +207,7 @@ CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ
# On this platform, it is recommend to only bind openMP threads on logical CPU cores 0-7 or 8-15
$ export VLLM_CPU_OMP_THREADS_BIND=0-7
$ python examples/offline_inference/basic.py
$ python examples/offline_inference/basic/basic.py
```
- If using vLLM CPU backend on a multi-socket machine with NUMA, be aware to set CPU cores using `VLLM_CPU_OMP_THREADS_BIND` to avoid cross NUMA node memory access.

View File

@ -40,7 +40,7 @@ For non-CUDA platforms, please refer [here](#installation-index) for specific in
## Offline Batched Inference
With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: <gh-file:examples/offline_inference/basic.py>
With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: <gh-file:examples/offline_inference/basic/basic.py>
The first line of this example imports the classes {class}`~vllm.LLM` and {class}`~vllm.SamplingParams`:

View File

@ -46,7 +46,7 @@ for output in outputs:
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
A code example can be found here: <gh-file:examples/offline_inference/basic.py>
A code example can be found here: <gh-file:examples/offline_inference/basic/basic.py>
### `LLM.beam_search`
@ -103,7 +103,7 @@ for output in outputs:
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
A code example can be found here: <gh-file:examples/offline_inference/chat.py>
A code example can be found here: <gh-file:examples/offline_inference/basic/chat.py>
If the model doesn't have a chat template or you want to specify another one,
you can explicitly pass a chat template:

View File

@ -88,7 +88,7 @@ embeds = output.outputs.embedding
print(f"Embeddings: {embeds!r} (size={len(embeds)})")
```
A code example can be found here: <gh-file:examples/offline_inference/embedding.py>
A code example can be found here: <gh-file:examples/offline_inference/basic/embed.py>
### `LLM.classify`
@ -103,7 +103,7 @@ probs = output.outputs.probs
print(f"Class Probabilities: {probs!r} (size={len(probs)})")
```
A code example can be found here: <gh-file:examples/offline_inference/classification.py>
A code example can be found here: <gh-file:examples/offline_inference/basic/classify.py>
### `LLM.score`
@ -125,7 +125,7 @@ score = output.outputs.score
print(f"Score: {score}")
```
A code example can be found here: <gh-file:examples/offline_inference/scoring.py>
A code example can be found here: <gh-file:examples/offline_inference/basic/score.py>
## Online Serving

View File

@ -1,47 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
from vllm import LLM, SamplingParams
from vllm.utils import FlexibleArgumentParser
def main():
parser = FlexibleArgumentParser(description='AQLM examples')
parser.add_argument('--model',
'-m',
type=str,
default=None,
help='model path, as for HF')
parser.add_argument('--choice',
'-c',
type=int,
default=0,
help='known good models by index, [0-4]')
parser.add_argument('--tensor-parallel-size',
'-t',
type=int,
default=1,
help='tensor parallel size')
args = parser.parse_args()
models = [
"ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf",
"ISTA-DASLab/Llama-2-7b-AQLM-2Bit-2x8-hf",
"ISTA-DASLab/Llama-2-13b-AQLM-2Bit-1x16-hf",
"ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf",
"BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf",
]
model = LLM(args.model if args.model is not None else models[args.choice],
tensor_parallel_size=args.tensor_parallel_size)
sampling_params = SamplingParams(max_tokens=100, temperature=0)
outputs = model.generate("Hello my name is",
sampling_params=sampling_params)
print(outputs[0].outputs[0].text)
if __name__ == '__main__':
main()

View File

@ -1,28 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
from vllm import LLM, SamplingParams
# Sample prompts.
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Create an LLM.
llm = LLM(model="snowflake/snowflake-arctic-instruct",
quantization="deepspeedfp",
tensor_parallel_size=8,
trust_remote_code=True)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

View File

@ -0,0 +1,94 @@
# Basic
The `LLM` class provides the primary Python interface for doing offline inference, which is interacting with a model without using a separate model inference server.
## Usage
The first script in this example shows the most basic usage of vLLM. If you are new to Python and vLLM, you should start here.
```bash
python examples/offline_inference/basic/basic.py
```
The rest of the scripts include an [argument parser](https://docs.python.org/3/library/argparse.html), which you can use to pass any arguments that are compatible with [`LLM`](https://docs.vllm.ai/en/latest/api/offline_inference/llm.html). Try running the script with `--help` for a list of all available arguments.
```bash
python examples/offline_inference/basic/classify.py
```
```bash
python examples/offline_inference/basic/embed.py
```
```bash
python examples/offline_inference/basic/score.py
```
The chat and generate scripts also accept the [sampling parameters](https://docs.vllm.ai/en/latest/api/inference_params.html#sampling-parameters): `max_tokens`, `temperature`, `top_p` and `top_k`.
```bash
python examples/offline_inference/basic/chat.py
```
```bash
python examples/offline_inference/basic/generate.py
```
## Features
In the scripts that support passing arguments, you can experiment with the following features.
### Default generation config
The `--generation-config` argument specifies where the generation config will be loaded from when calling `LLM.get_default_sampling_params()`. If set to auto, the generation config will be loaded from model path. If set to a folder path, the generation config will be loaded from the specified folder path. If it is not provided, vLLM defaults will be used.
> If max_new_tokens is specified in generation config, then it sets a server-wide limit on the number of output tokens for all requests.
Try it yourself with the following argument:
```bash
--generation-config auto
```
### Quantization
#### AQLM
vLLM supports models that are quantized using AQLM.
Try one yourself by passing one of the following models to the `--model` argument:
- `ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf`
- `ISTA-DASLab/Llama-2-7b-AQLM-2Bit-2x8-hf`
- `ISTA-DASLab/Llama-2-13b-AQLM-2Bit-1x16-hf`
- `ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf`
- `BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf`
> Some of these models are likely to be too large for a single GPU. You can split them across multiple GPUs by setting `--tensor-parallel-size` to the number of required GPUs.
#### GGUF
vLLM supports models that are quantized using GGUF.
Try one yourself by downloading a GUFF quantised model and using the following arguments:
```python
from huggingface_hub import hf_hub_download
repo_id = "bartowski/Phi-3-medium-4k-instruct-GGUF"
filename = "Phi-3-medium-4k-instruct-IQ2_M.gguf"
print(hf_hub_download(repo_id, filename=filename))
```
```bash
--model {local-path-printed-above} --tokenizer microsoft/Phi-3-medium-4k-instruct
```
### CPU offload
The `--cpu-offload-gb` argument can be seen as a virtual way to increase the GPU memory size. For example, if you have one 24 GB GPU and set this to 10, virtually you can think of it as a 34 GB GPU. Then you can load a 13B model with BF16 weight, which requires at least 26GB GPU memory. Note that this requires fast CPU-GPU interconnect, as part of the model is loaded from CPU memory to GPU memory on the fly in each model forward pass.
Try it yourself with the following arguments:
```bash
--model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
```

View File

@ -0,0 +1,98 @@
# SPDX-License-Identifier: Apache-2.0
from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser
def main(args: dict):
# Pop arguments not used by LLM
max_tokens = args.pop("max_tokens")
temperature = args.pop("temperature")
top_p = args.pop("top_p")
top_k = args.pop("top_k")
chat_template_path = args.pop("chat_template_path")
# Create an LLM
llm = LLM(**args)
# Create sampling params object
sampling_params = llm.get_default_sampling_params()
if max_tokens is not None:
sampling_params.max_tokens = max_tokens
if temperature is not None:
sampling_params.temperature = temperature
if top_p is not None:
sampling_params.top_p = top_p
if top_k is not None:
sampling_params.top_k = top_k
def print_outputs(outputs):
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}")
print(f"Generated text: {generated_text!r}")
print("-" * 80)
print("=" * 80)
# In this script, we demonstrate how to pass input to the chat method:
conversation = [
{
"role": "system",
"content": "You are a helpful assistant"
},
{
"role": "user",
"content": "Hello"
},
{
"role": "assistant",
"content": "Hello! How can I assist you today?"
},
{
"role": "user",
"content":
"Write an essay about the importance of higher education.",
},
]
outputs = llm.chat(conversation, sampling_params, use_tqdm=False)
print_outputs(outputs)
# You can run batch inference with llm.chat API
conversations = [conversation for _ in range(10)]
# We turn on tqdm progress bar to verify it's indeed running batch inference
outputs = llm.chat(conversations, sampling_params, use_tqdm=True)
print_outputs(outputs)
# A chat template can be optionally supplied.
# If not, the model will use its default chat template.
if chat_template_path is not None:
with open(chat_template_path) as f:
chat_template = f.read()
outputs = llm.chat(
conversations,
sampling_params,
use_tqdm=False,
chat_template=chat_template,
)
if __name__ == "__main__":
parser = FlexibleArgumentParser()
# Add engine args
engine_group = parser.add_argument_group("Engine arguments")
EngineArgs.add_cli_args(engine_group)
engine_group.set_defaults(model="meta-llama/Llama-3.2-1B-Instruct")
# Add sampling params
sampling_group = parser.add_argument_group("Sampling parameters")
sampling_group.add_argument("--max-tokens", type=int)
sampling_group.add_argument("--temperature", type=float)
sampling_group.add_argument("--top-p", type=float)
sampling_group.add_argument("--top-k", type=int)
# Add example params
parser.add_argument("--chat-template-path", type=str)
args: dict = vars(parser.parse_args())
main(args)

View File

@ -0,0 +1,42 @@
# SPDX-License-Identifier: Apache-2.0
from argparse import Namespace
from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser
def main(args: Namespace):
# Sample prompts.
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create an LLM.
# You should pass task="classify" for classification models
model = LLM(**vars(args))
# Generate logits. The output is a list of ClassificationRequestOutputs.
outputs = model.classify(prompts)
# Print the outputs.
for prompt, output in zip(prompts, outputs):
probs = output.outputs.probs
probs_trimmed = ((str(probs[:16])[:-1] +
", ...]") if len(probs) > 16 else probs)
print(f"Prompt: {prompt!r} | "
f"Class Probabilities: {probs_trimmed} (size={len(probs)})")
if __name__ == "__main__":
parser = FlexibleArgumentParser()
parser = EngineArgs.add_cli_args(parser)
# Set example specific arguments
parser.set_defaults(model="jason9693/Qwen2.5-1.5B-apeach",
task="classify",
enforce_eager=True)
args = parser.parse_args()
main(args)

View File

@ -0,0 +1,42 @@
# SPDX-License-Identifier: Apache-2.0
from argparse import Namespace
from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser
def main(args: Namespace):
# Sample prompts.
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create an LLM.
# You should pass task="embed" for embedding models
model = LLM(**vars(args))
# Generate embedding. The output is a list of EmbeddingRequestOutputs.
outputs = model.embed(prompts)
# Print the outputs.
for prompt, output in zip(prompts, outputs):
embeds = output.outputs.embedding
embeds_trimmed = ((str(embeds[:16])[:-1] +
", ...]") if len(embeds) > 16 else embeds)
print(f"Prompt: {prompt!r} | "
f"Embeddings: {embeds_trimmed} (size={len(embeds)})")
if __name__ == "__main__":
parser = FlexibleArgumentParser()
parser = EngineArgs.add_cli_args(parser)
# Set example specific arguments
parser.set_defaults(model="intfloat/e5-mistral-7b-instruct",
task="embed",
enforce_eager=True)
args = parser.parse_args()
main(args)

View File

@ -0,0 +1,57 @@
# SPDX-License-Identifier: Apache-2.0
from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser
def main(args: dict):
# Pop arguments not used by LLM
max_tokens = args.pop("max_tokens")
temperature = args.pop("temperature")
top_p = args.pop("top_p")
top_k = args.pop("top_k")
# Create an LLM
llm = LLM(**args)
# Create a sampling params object
sampling_params = llm.get_default_sampling_params()
if max_tokens is not None:
sampling_params.max_tokens = max_tokens
if temperature is not None:
sampling_params.temperature = temperature
if top_p is not None:
sampling_params.top_p = top_p
if top_k is not None:
sampling_params.top_k = top_k
# Generate texts from the prompts. The output is a list of RequestOutput
# objects that contain the prompt, generated text, and other information.
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
if __name__ == "__main__":
parser = FlexibleArgumentParser()
# Add engine args
engine_group = parser.add_argument_group("Engine arguments")
EngineArgs.add_cli_args(engine_group)
engine_group.set_defaults(model="meta-llama/Llama-3.2-1B-Instruct")
# Add sampling params
sampling_group = parser.add_argument_group("Sampling parameters")
sampling_group.add_argument("--max-tokens", type=int)
sampling_group.add_argument("--temperature", type=float)
sampling_group.add_argument("--top-p", type=float)
sampling_group.add_argument("--top-k", type=int)
args: dict = vars(parser.parse_args())
main(args)

View File

@ -0,0 +1,38 @@
# SPDX-License-Identifier: Apache-2.0
from argparse import Namespace
from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser
def main(args: Namespace):
# Sample prompts.
text_1 = "What is the capital of France?"
texts_2 = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
# Create an LLM.
# You should pass task="score" for cross-encoder models
model = LLM(**vars(args))
# Generate scores. The output is a list of ScoringRequestOutputs.
outputs = model.score(text_1, texts_2)
# Print the outputs.
for text_2, output in zip(texts_2, outputs):
score = output.outputs.score
print(f"Pair: {[text_1, text_2]!r} | Score: {score}")
if __name__ == "__main__":
parser = FlexibleArgumentParser()
parser = EngineArgs.add_cli_args(parser)
# Set example specific arguments
parser.set_defaults(model="BAAI/bge-reranker-v2-m3",
task="score",
enforce_eager=True)
args = parser.parse_args()
main(args)

View File

@ -1,32 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
from vllm import LLM
# Sample prompts.
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create an LLM with built-in default generation config.
# The generation config is set to None by default to keep
# the behavior consistent with the previous version.
# If you want to use the default generation config from the model,
# you should set the generation_config to "auto".
llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct", generation_config="auto")
# Load the default sampling parameters from the model.
sampling_params = llm.get_default_sampling_params()
# Modify the sampling parameters if needed.
sampling_params.temperature = 0.5
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

View File

@ -1,82 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
from vllm import LLM, SamplingParams
llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")
sampling_params = SamplingParams(temperature=0.5)
def print_outputs(outputs):
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
print("-" * 80)
print("=" * 80)
# In this script, we demonstrate how to pass input to the chat method:
conversation = [
{
"role": "system",
"content": "You are a helpful assistant"
},
{
"role": "user",
"content": "Hello"
},
{
"role": "assistant",
"content": "Hello! How can I assist you today?"
},
{
"role": "user",
"content": "Write an essay about the importance of higher education.",
},
]
outputs = llm.chat(conversation,
sampling_params=sampling_params,
use_tqdm=False)
print_outputs(outputs)
# You can run batch inference with llm.chat API
conversation = [
{
"role": "system",
"content": "You are a helpful assistant"
},
{
"role": "user",
"content": "Hello"
},
{
"role": "assistant",
"content": "Hello! How can I assist you today?"
},
{
"role": "user",
"content": "Write an essay about the importance of higher education.",
},
]
conversations = [conversation for _ in range(10)]
# We turn on tqdm progress bar to verify it's indeed running batch inference
outputs = llm.chat(messages=conversations,
sampling_params=sampling_params,
use_tqdm=True)
print_outputs(outputs)
# A chat template can be optionally supplied.
# If not, the model will use its default chat template.
# with open('template_falcon_180b.jinja', "r") as f:
# chat_template = f.read()
# outputs = llm.chat(
# conversations,
# sampling_params=sampling_params,
# use_tqdm=False,
# chat_template=chat_template,
# )

View File

@ -1,30 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
from vllm import LLM
# Sample prompts.
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create an LLM.
# You should pass task="classify" for classification models
model = LLM(
model="jason9693/Qwen2.5-1.5B-apeach",
task="classify",
enforce_eager=True,
)
# Generate logits. The output is a list of ClassificationRequestOutputs.
outputs = model.classify(prompts)
# Print the outputs.
for prompt, output in zip(prompts, outputs):
probs = output.outputs.probs
probs_trimmed = ((str(probs[:16])[:-1] +
", ...]") if len(probs) > 16 else probs)
print(f"Prompt: {prompt!r} | "
f"Class Probabilities: {probs_trimmed} (size={len(probs)})")

View File

@ -1,82 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
from dataclasses import asdict
from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.utils import FlexibleArgumentParser
def get_prompts(num_prompts: int):
# The default sample prompts.
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
if num_prompts != len(prompts):
prompts = (prompts * ((num_prompts // len(prompts)) + 1))[:num_prompts]
return prompts
def main(args):
# Create prompts
prompts = get_prompts(args.num_prompts)
# Create a sampling params object.
sampling_params = SamplingParams(n=args.n,
temperature=args.temperature,
top_p=args.top_p,
top_k=args.top_k,
max_tokens=args.max_tokens)
# Create an LLM.
# The default model is 'facebook/opt-125m'
engine_args = EngineArgs.from_cli_args(args)
llm = LLM(**asdict(engine_args))
# Generate texts from the prompts.
# The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
if __name__ == '__main__':
parser = FlexibleArgumentParser()
parser = EngineArgs.add_cli_args(parser)
group = parser.add_argument_group("SamplingParams options")
group.add_argument("--num-prompts",
type=int,
default=4,
help="Number of prompts used for inference")
group.add_argument("--max-tokens",
type=int,
default=16,
help="Generated output length for sampling")
group.add_argument('--n',
type=int,
default=1,
help='Number of generated sequences per prompt')
group.add_argument('--temperature',
type=float,
default=0.8,
help='Temperature for text generation')
group.add_argument('--top-p',
type=float,
default=0.95,
help='top_p for text generation')
group.add_argument('--top-k',
type=int,
default=-1,
help='top_k for text generation')
args = parser.parse_args()
main(args)

View File

@ -1,24 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
from vllm import LLM, SamplingParams
# Sample prompts.
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Create an LLM.
llm = LLM(model="meta-llama/Llama-2-13b-chat-hf", cpu_offload_gb=10)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

View File

@ -1,30 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
from vllm import LLM
# Sample prompts.
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create an LLM.
# You should pass task="embed" for embedding models
model = LLM(
model="intfloat/e5-mistral-7b-instruct",
task="embed",
enforce_eager=True,
)
# Generate embedding. The output is a list of EmbeddingRequestOutputs.
outputs = model.embed(prompts)
# Print the outputs.
for prompt, output in zip(prompts, outputs):
embeds = output.outputs.embedding
embeds_trimmed = ((str(embeds[:16])[:-1] +
", ...]") if len(embeds) > 16 else embeds)
print(f"Prompt: {prompt!r} | "
f"Embeddings: {embeds_trimmed} (size={len(embeds)})")

View File

@ -1,34 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
from huggingface_hub import hf_hub_download
from vllm import LLM, SamplingParams
def run_gguf_inference(model_path, tokenizer):
# Sample prompts.
prompts = [
"How many helicopters can a human eat in one sitting?",
"What's the future of AI?",
]
prompts = [[{"role": "user", "content": prompt}] for prompt in prompts]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0, max_tokens=128)
# Create an LLM.
llm = LLM(model=model_path, tokenizer=tokenizer)
outputs = llm.chat(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
if __name__ == "__main__":
repo_id = "bartowski/Phi-3-medium-4k-instruct-GGUF"
filename = "Phi-3-medium-4k-instruct-IQ2_M.gguf"
tokenizer = "microsoft/Phi-3-medium-4k-instruct"
model = hf_hub_download(repo_id, filename=filename)
run_gguf_inference(model, tokenizer)

View File

@ -1,25 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
from vllm import LLM
# Sample prompts.
text_1 = "What is the capital of France?"
texts_2 = [
"The capital of Brazil is Brasilia.", "The capital of France is Paris."
]
# Create an LLM.
# You should pass task="score" for cross-encoder models
model = LLM(
model="BAAI/bge-reranker-v2-m3",
task="score",
enforce_eager=True,
)
# Generate scores. The output is a list of ScoringRequestOutputs.
outputs = model.score(text_1, texts_2)
# Print the outputs.
for text_2, output in zip(texts_2, outputs):
score = output.outputs.score
print(f"Pair: {[text_1, text_2]!r} | Score: {score}")

View File

@ -14,7 +14,7 @@ def test_platform_plugins():
import os
example_file = os.path.join(
os.path.dirname(os.path.dirname(os.path.dirname(current_file))),
"examples", "offline_inference/basic.py")
"examples", "offline_inference/basic/basic.py")
runpy.run_path(example_file)
# check if the plugin is loaded correctly