Merge similar examples in offline_inference into single basic example (#12737)

parent b69692a2d8
commit 992e5c3d34
@@ -30,7 +30,7 @@ function cpu_tests() {
# offline inference
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
set -e
python3 examples/offline_inference/basic.py"
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"

# Run basic model test
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
@@ -24,5 +24,5 @@ remove_docker_container

# Run the image and test offline inference
docker run -e HF_TOKEN -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
python3 examples/offline_inference/cli.py --model meta-llama/Llama-3.2-1B
python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
'
@@ -20,5 +20,5 @@ trap remove_docker_container_and_exit EXIT
remove_docker_container

# Run the image and launch offline inference
docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic.py
docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
EXITCODE=$?
@@ -13,4 +13,4 @@ trap remove_docker_container EXIT
remove_docker_container

# Run the image and launch offline inference
docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic.py
docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic/generate.py --model facebook/opt-125m
@@ -14,6 +14,6 @@ remove_docker_container

# Run the image and test offline inference/tensor parallel
docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c '
python3 examples/offline_inference/basic.py
python3 examples/offline_inference/cli.py -tp 2
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
'
@@ -215,18 +215,18 @@ steps:
- examples/
commands:
- pip install tensorizer # for tensorizer test
- python3 offline_inference/basic.py
- python3 offline_inference/cpu_offload.py
- python3 offline_inference/chat.py
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
- python3 offline_inference/basic/chat.py
- python3 offline_inference/prefix_caching.py
- python3 offline_inference/llm_engine_example.py
- python3 offline_inference/vision_language.py
- python3 offline_inference/vision_language_multi_image.py
- python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/encoder_decoder.py
- python3 offline_inference/classification.py
- python3 offline_inference/embedding.py
- python3 offline_inference/scoring.py
- python3 offline_inference/basic/classify.py
- python3 offline_inference/basic/embed.py
- python3 offline_inference/basic/score.py
- python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2

- label: Prefix Caching Test # 9min
@@ -147,7 +147,7 @@ class Example:
            return content

        content += "## Example materials\n\n"
        for file in self.other_files:
        for file in sorted(self.other_files):
            include = "include" if file.suffix == ".md" else "literalinclude"
            content += f":::{{admonition}} {file.relative_to(self.path)}\n"
            content += ":class: dropdown\n\n"
@@ -194,7 +194,7 @@ def generate_examples():
        path=EXAMPLE_DOC_DIR / "examples_offline_inference_index.md",
        title="Offline Inference",
        description=
        "Offline inference examples demonstrate how to use vLLM in an offline setting, where the model is queried for predictions in batches.",  # noqa: E501
        "Offline inference examples demonstrate how to use vLLM in an offline setting, where the model is queried for predictions in batches. We recommend starting with <project:basic.md>.",  # noqa: E501
        caption="Examples",
    ),
}
@@ -170,7 +170,7 @@ vLLM CPU backend supports the following vLLM features:
sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library
find / -name *libtcmalloc* # find the dynamic link library path
export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD
python examples/offline_inference/basic.py # run vLLM
python examples/offline_inference/basic/basic.py # run vLLM
```

- When using the online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserving CPU 30 and 31 for the framework and using CPU 0-29 for OpenMP:

@@ -207,7 +207,7 @@ CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ

# On this platform, it is recommend to only bind openMP threads on logical CPU cores 0-7 or 8-15
$ export VLLM_CPU_OMP_THREADS_BIND=0-7
$ python examples/offline_inference/basic.py
$ python examples/offline_inference/basic/basic.py
```

- If using vLLM CPU backend on a multi-socket machine with NUMA, be aware to set CPU cores using `VLLM_CPU_OMP_THREADS_BIND` to avoid cross NUMA node memory access.
@@ -40,7 +40,7 @@ For non-CUDA platforms, please refer [here](#installation-index) for specific in

## Offline Batched Inference

With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: <gh-file:examples/offline_inference/basic.py>
With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: <gh-file:examples/offline_inference/basic/basic.py>

The first line of this example imports the classes {class}`~vllm.LLM` and {class}`~vllm.SamplingParams`:
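For reference, that import (it appears verbatim at the top of the basic examples elsewhere in this diff) is simply:

```python
from vllm import LLM, SamplingParams
```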
@@ -46,7 +46,7 @@ for output in outputs:
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```

A code example can be found here: <gh-file:examples/offline_inference/basic.py>
A code example can be found here: <gh-file:examples/offline_inference/basic/basic.py>

### `LLM.beam_search`
@@ -103,7 +103,7 @@ for output in outputs:
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```

A code example can be found here: <gh-file:examples/offline_inference/chat.py>
A code example can be found here: <gh-file:examples/offline_inference/basic/chat.py>

If the model doesn't have a chat template or you want to specify another one,
you can explicitly pass a chat template:
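A sketch of that call, modelled on the new `basic/chat.py` added later in this diff (the template path here is hypothetical):

```python
# Hypothetical path to a Jinja chat template file.
with open("template.jinja") as f:
    chat_template = f.read()

outputs = llm.chat(
    conversation,
    sampling_params,
    use_tqdm=False,
    chat_template=chat_template,
)
```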
@@ -88,7 +88,7 @@ embeds = output.outputs.embedding
print(f"Embeddings: {embeds!r} (size={len(embeds)})")
```

A code example can be found here: <gh-file:examples/offline_inference/embedding.py>
A code example can be found here: <gh-file:examples/offline_inference/basic/embed.py>

### `LLM.classify`
@@ -103,7 +103,7 @@ probs = output.outputs.probs
print(f"Class Probabilities: {probs!r} (size={len(probs)})")
```

A code example can be found here: <gh-file:examples/offline_inference/classification.py>
A code example can be found here: <gh-file:examples/offline_inference/basic/classify.py>

### `LLM.score`
@@ -125,7 +125,7 @@ score = output.outputs.score
print(f"Score: {score}")
```

A code example can be found here: <gh-file:examples/offline_inference/scoring.py>
A code example can be found here: <gh-file:examples/offline_inference/basic/score.py>

## Online Serving
Deleted file (47 lines):
@@ -1,47 +0,0 @@
# SPDX-License-Identifier: Apache-2.0

from vllm import LLM, SamplingParams
from vllm.utils import FlexibleArgumentParser


def main():

    parser = FlexibleArgumentParser(description='AQLM examples')

    parser.add_argument('--model',
                        '-m',
                        type=str,
                        default=None,
                        help='model path, as for HF')
    parser.add_argument('--choice',
                        '-c',
                        type=int,
                        default=0,
                        help='known good models by index, [0-4]')
    parser.add_argument('--tensor-parallel-size',
                        '-t',
                        type=int,
                        default=1,
                        help='tensor parallel size')

    args = parser.parse_args()

    models = [
        "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf",
        "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-2x8-hf",
        "ISTA-DASLab/Llama-2-13b-AQLM-2Bit-1x16-hf",
        "ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf",
        "BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf",
    ]

    model = LLM(args.model if args.model is not None else models[args.choice],
                tensor_parallel_size=args.tensor_parallel_size)

    sampling_params = SamplingParams(max_tokens=100, temperature=0)
    outputs = model.generate("Hello my name is",
                             sampling_params=sampling_params)
    print(outputs[0].outputs[0].text)


if __name__ == '__main__':
    main()
Deleted file (28 lines):
@@ -1,28 +0,0 @@
# SPDX-License-Identifier: Apache-2.0

from vllm import LLM, SamplingParams

# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Create an LLM.
llm = LLM(model="snowflake/snowflake-arctic-instruct",
          quantization="deepspeedfp",
          tensor_parallel_size=8,
          trust_remote_code=True)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.

outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
examples/offline_inference/basic/README.md (new file, 94 lines)
@@ -0,0 +1,94 @@
# Basic

The `LLM` class provides the primary Python interface for doing offline inference, which is interacting with a model without using a separate model inference server.

## Usage

The first script in this example shows the most basic usage of vLLM. If you are new to Python and vLLM, you should start here.

```bash
python examples/offline_inference/basic/basic.py
```

The rest of the scripts include an [argument parser](https://docs.python.org/3/library/argparse.html), which you can use to pass any arguments that are compatible with [`LLM`](https://docs.vllm.ai/en/latest/api/offline_inference/llm.html). Try running the script with `--help` for a list of all available arguments.

```bash
python examples/offline_inference/basic/classify.py
```

```bash
python examples/offline_inference/basic/embed.py
```

```bash
python examples/offline_inference/basic/score.py
```

The chat and generate scripts also accept the [sampling parameters](https://docs.vllm.ai/en/latest/api/inference_params.html#sampling-parameters): `max_tokens`, `temperature`, `top_p` and `top_k`.

```bash
python examples/offline_inference/basic/chat.py
```

```bash
python examples/offline_inference/basic/generate.py
```
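For instance, the sampling options can be combined with any engine argument in a single invocation; the values below are illustrative rather than taken from the scripts:

```bash
python examples/offline_inference/basic/generate.py --model facebook/opt-125m --max-tokens 32 --temperature 0.8 --top-p 0.95
```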
## Features

In the scripts that support passing arguments, you can experiment with the following features.

### Default generation config

The `--generation-config` argument specifies where the generation config will be loaded from when calling `LLM.get_default_sampling_params()`. If set to `auto`, the generation config will be loaded from the model path. If set to a folder path, the generation config will be loaded from the specified folder path. If it is not provided, vLLM defaults will be used.

> If `max_new_tokens` is specified in the generation config, it sets a server-wide limit on the number of output tokens for all requests.

Try it yourself with the following argument:

```bash
--generation-config auto
```
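Under the hood, the scripts call `LLM.get_default_sampling_params()` after building the `LLM`. A minimal standalone sketch of the same flow, modelled on the removed default-generation-config example further down in this diff, looks like:

```python
from vllm import LLM

# "auto" loads the generation config shipped with the model.
llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct", generation_config="auto")

# Sampling parameters seeded from the model's generation config.
sampling_params = llm.get_default_sampling_params()
# Individual fields can still be overridden afterwards.
sampling_params.temperature = 0.5

outputs = llm.generate(["Hello, my name is"], sampling_params)
print(outputs[0].outputs[0].text)
```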
### Quantization

#### AQLM

vLLM supports models that are quantized using AQLM.

Try one yourself by passing one of the following models to the `--model` argument:

- `ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf`
- `ISTA-DASLab/Llama-2-7b-AQLM-2Bit-2x8-hf`
- `ISTA-DASLab/Llama-2-13b-AQLM-2Bit-1x16-hf`
- `ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf`
- `BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf`

> Some of these models are likely to be too large for a single GPU. You can split them across multiple GPUs by setting `--tensor-parallel-size` to the number of required GPUs.
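For example, the smallest model in the list can be run with the generate script; the command below is assembled from the arguments described above:

```bash
python examples/offline_inference/basic/generate.py --model BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf
```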
#### GGUF

vLLM supports models that are quantized using GGUF.

Try one yourself by downloading a GGUF quantised model and using the following arguments:

```python
from huggingface_hub import hf_hub_download

repo_id = "bartowski/Phi-3-medium-4k-instruct-GGUF"
filename = "Phi-3-medium-4k-instruct-IQ2_M.gguf"
print(hf_hub_download(repo_id, filename=filename))
```

```bash
--model {local-path-printed-above} --tokenizer microsoft/Phi-3-medium-4k-instruct
```
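Putting it together with one of the scripts, substituting the local path printed by the snippet above:

```bash
python examples/offline_inference/basic/generate.py --model {local-path-printed-above} --tokenizer microsoft/Phi-3-medium-4k-instruct
```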
### CPU offload

The `--cpu-offload-gb` argument can be seen as a virtual way to increase the GPU memory size. For example, if you have one 24 GB GPU and set this to 10, you can virtually think of it as a 34 GB GPU. You can then load a 13B model with BF16 weights, which requires at least 26 GB of GPU memory. Note that this requires a fast CPU-GPU interconnect, as part of the model is loaded from CPU memory to GPU memory on the fly in each model forward pass.

Try it yourself with the following arguments:

```bash
--model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
```
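The full command with the generate script, mirroring the CI pipeline change earlier in this diff, is:

```bash
python examples/offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
```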
examples/offline_inference/basic/chat.py (new file, 98 lines)
@@ -0,0 +1,98 @@
# SPDX-License-Identifier: Apache-2.0

from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser


def main(args: dict):
    # Pop arguments not used by LLM
    max_tokens = args.pop("max_tokens")
    temperature = args.pop("temperature")
    top_p = args.pop("top_p")
    top_k = args.pop("top_k")
    chat_template_path = args.pop("chat_template_path")

    # Create an LLM
    llm = LLM(**args)

    # Create sampling params object
    sampling_params = llm.get_default_sampling_params()
    if max_tokens is not None:
        sampling_params.max_tokens = max_tokens
    if temperature is not None:
        sampling_params.temperature = temperature
    if top_p is not None:
        sampling_params.top_p = top_p
    if top_k is not None:
        sampling_params.top_k = top_k

    def print_outputs(outputs):
        for output in outputs:
            prompt = output.prompt
            generated_text = output.outputs[0].text
            print(f"Prompt: {prompt!r}")
            print(f"Generated text: {generated_text!r}")
        print("-" * 80)

    print("=" * 80)

    # In this script, we demonstrate how to pass input to the chat method:
    conversation = [
        {
            "role": "system",
            "content": "You are a helpful assistant"
        },
        {
            "role": "user",
            "content": "Hello"
        },
        {
            "role": "assistant",
            "content": "Hello! How can I assist you today?"
        },
        {
            "role": "user",
            "content":
            "Write an essay about the importance of higher education.",
        },
    ]
    outputs = llm.chat(conversation, sampling_params, use_tqdm=False)
    print_outputs(outputs)

    # You can run batch inference with llm.chat API
    conversations = [conversation for _ in range(10)]

    # We turn on tqdm progress bar to verify it's indeed running batch inference
    outputs = llm.chat(conversations, sampling_params, use_tqdm=True)
    print_outputs(outputs)

    # A chat template can be optionally supplied.
    # If not, the model will use its default chat template.
    if chat_template_path is not None:
        with open(chat_template_path) as f:
            chat_template = f.read()

        outputs = llm.chat(
            conversations,
            sampling_params,
            use_tqdm=False,
            chat_template=chat_template,
        )


if __name__ == "__main__":
    parser = FlexibleArgumentParser()
    # Add engine args
    engine_group = parser.add_argument_group("Engine arguments")
    EngineArgs.add_cli_args(engine_group)
    engine_group.set_defaults(model="meta-llama/Llama-3.2-1B-Instruct")
    # Add sampling params
    sampling_group = parser.add_argument_group("Sampling parameters")
    sampling_group.add_argument("--max-tokens", type=int)
    sampling_group.add_argument("--temperature", type=float)
    sampling_group.add_argument("--top-p", type=float)
    sampling_group.add_argument("--top-k", type=int)
    # Add example params
    parser.add_argument("--chat-template-path", type=str)
    args: dict = vars(parser.parse_args())
    main(args)
examples/offline_inference/basic/classify.py (new file, 42 lines)
@@ -0,0 +1,42 @@
# SPDX-License-Identifier: Apache-2.0

from argparse import Namespace

from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser


def main(args: Namespace):
    # Sample prompts.
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]

    # Create an LLM.
    # You should pass task="classify" for classification models
    model = LLM(**vars(args))

    # Generate logits. The output is a list of ClassificationRequestOutputs.
    outputs = model.classify(prompts)

    # Print the outputs.
    for prompt, output in zip(prompts, outputs):
        probs = output.outputs.probs
        probs_trimmed = ((str(probs[:16])[:-1] +
                          ", ...]") if len(probs) > 16 else probs)
        print(f"Prompt: {prompt!r} | "
              f"Class Probabilities: {probs_trimmed} (size={len(probs)})")


if __name__ == "__main__":
    parser = FlexibleArgumentParser()
    parser = EngineArgs.add_cli_args(parser)
    # Set example specific arguments
    parser.set_defaults(model="jason9693/Qwen2.5-1.5B-apeach",
                        task="classify",
                        enforce_eager=True)
    args = parser.parse_args()
    main(args)
examples/offline_inference/basic/embed.py (new file, 42 lines)
@@ -0,0 +1,42 @@
# SPDX-License-Identifier: Apache-2.0

from argparse import Namespace

from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser


def main(args: Namespace):
    # Sample prompts.
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]

    # Create an LLM.
    # You should pass task="embed" for embedding models
    model = LLM(**vars(args))

    # Generate embedding. The output is a list of EmbeddingRequestOutputs.
    outputs = model.embed(prompts)

    # Print the outputs.
    for prompt, output in zip(prompts, outputs):
        embeds = output.outputs.embedding
        embeds_trimmed = ((str(embeds[:16])[:-1] +
                           ", ...]") if len(embeds) > 16 else embeds)
        print(f"Prompt: {prompt!r} | "
              f"Embeddings: {embeds_trimmed} (size={len(embeds)})")


if __name__ == "__main__":
    parser = FlexibleArgumentParser()
    parser = EngineArgs.add_cli_args(parser)
    # Set example specific arguments
    parser.set_defaults(model="intfloat/e5-mistral-7b-instruct",
                        task="embed",
                        enforce_eager=True)
    args = parser.parse_args()
    main(args)
examples/offline_inference/basic/generate.py (new file, 57 lines)
@@ -0,0 +1,57 @@
# SPDX-License-Identifier: Apache-2.0

from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser


def main(args: dict):
    # Pop arguments not used by LLM
    max_tokens = args.pop("max_tokens")
    temperature = args.pop("temperature")
    top_p = args.pop("top_p")
    top_k = args.pop("top_k")

    # Create an LLM
    llm = LLM(**args)

    # Create a sampling params object
    sampling_params = llm.get_default_sampling_params()
    if max_tokens is not None:
        sampling_params.max_tokens = max_tokens
    if temperature is not None:
        sampling_params.temperature = temperature
    if top_p is not None:
        sampling_params.top_p = top_p
    if top_k is not None:
        sampling_params.top_k = top_k

    # Generate texts from the prompts. The output is a list of RequestOutput
    # objects that contain the prompt, generated text, and other information.
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    outputs = llm.generate(prompts, sampling_params)
    # Print the outputs.
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")


if __name__ == "__main__":
    parser = FlexibleArgumentParser()
    # Add engine args
    engine_group = parser.add_argument_group("Engine arguments")
    EngineArgs.add_cli_args(engine_group)
    engine_group.set_defaults(model="meta-llama/Llama-3.2-1B-Instruct")
    # Add sampling params
    sampling_group = parser.add_argument_group("Sampling parameters")
    sampling_group.add_argument("--max-tokens", type=int)
    sampling_group.add_argument("--temperature", type=float)
    sampling_group.add_argument("--top-p", type=float)
    sampling_group.add_argument("--top-k", type=int)
    args: dict = vars(parser.parse_args())
    main(args)
examples/offline_inference/basic/score.py (new file, 38 lines)
@@ -0,0 +1,38 @@
# SPDX-License-Identifier: Apache-2.0

from argparse import Namespace

from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser


def main(args: Namespace):
    # Sample prompts.
    text_1 = "What is the capital of France?"
    texts_2 = [
        "The capital of Brazil is Brasilia.",
        "The capital of France is Paris.",
    ]

    # Create an LLM.
    # You should pass task="score" for cross-encoder models
    model = LLM(**vars(args))

    # Generate scores. The output is a list of ScoringRequestOutputs.
    outputs = model.score(text_1, texts_2)

    # Print the outputs.
    for text_2, output in zip(texts_2, outputs):
        score = output.outputs.score
        print(f"Pair: {[text_1, text_2]!r} | Score: {score}")


if __name__ == "__main__":
    parser = FlexibleArgumentParser()
    parser = EngineArgs.add_cli_args(parser)
    # Set example specific arguments
    parser.set_defaults(model="BAAI/bge-reranker-v2-m3",
                        task="score",
                        enforce_eager=True)
    args = parser.parse_args()
    main(args)
Deleted file (32 lines):
@@ -1,32 +0,0 @@
# SPDX-License-Identifier: Apache-2.0

from vllm import LLM

# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]

# Create an LLM with built-in default generation config.
# The generation config is set to None by default to keep
# the behavior consistent with the previous version.
# If you want to use the default generation config from the model,
# you should set the generation_config to "auto".
llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct", generation_config="auto")

# Load the default sampling parameters from the model.
sampling_params = llm.get_default_sampling_params()
# Modify the sampling parameters if needed.
sampling_params.temperature = 0.5

# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
Deleted file (82 lines):
@@ -1,82 +0,0 @@
# SPDX-License-Identifier: Apache-2.0

from vllm import LLM, SamplingParams

llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")
sampling_params = SamplingParams(temperature=0.5)


def print_outputs(outputs):
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    print("-" * 80)


print("=" * 80)

# In this script, we demonstrate how to pass input to the chat method:

conversation = [
    {
        "role": "system",
        "content": "You are a helpful assistant"
    },
    {
        "role": "user",
        "content": "Hello"
    },
    {
        "role": "assistant",
        "content": "Hello! How can I assist you today?"
    },
    {
        "role": "user",
        "content": "Write an essay about the importance of higher education.",
    },
]
outputs = llm.chat(conversation,
                   sampling_params=sampling_params,
                   use_tqdm=False)
print_outputs(outputs)

# You can run batch inference with llm.chat API
conversation = [
    {
        "role": "system",
        "content": "You are a helpful assistant"
    },
    {
        "role": "user",
        "content": "Hello"
    },
    {
        "role": "assistant",
        "content": "Hello! How can I assist you today?"
    },
    {
        "role": "user",
        "content": "Write an essay about the importance of higher education.",
    },
]
conversations = [conversation for _ in range(10)]

# We turn on tqdm progress bar to verify it's indeed running batch inference
outputs = llm.chat(messages=conversations,
                   sampling_params=sampling_params,
                   use_tqdm=True)
print_outputs(outputs)

# A chat template can be optionally supplied.
# If not, the model will use its default chat template.

# with open('template_falcon_180b.jinja', "r") as f:
#     chat_template = f.read()

# outputs = llm.chat(
#     conversations,
#     sampling_params=sampling_params,
#     use_tqdm=False,
#     chat_template=chat_template,
# )
Deleted file (30 lines):
@@ -1,30 +0,0 @@
# SPDX-License-Identifier: Apache-2.0

from vllm import LLM

# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]

# Create an LLM.
# You should pass task="classify" for classification models
model = LLM(
    model="jason9693/Qwen2.5-1.5B-apeach",
    task="classify",
    enforce_eager=True,
)

# Generate logits. The output is a list of ClassificationRequestOutputs.
outputs = model.classify(prompts)

# Print the outputs.
for prompt, output in zip(prompts, outputs):
    probs = output.outputs.probs
    probs_trimmed = ((str(probs[:16])[:-1] +
                      ", ...]") if len(probs) > 16 else probs)
    print(f"Prompt: {prompt!r} | "
          f"Class Probabilities: {probs_trimmed} (size={len(probs)})")
Deleted file (82 lines):
@@ -1,82 +0,0 @@
# SPDX-License-Identifier: Apache-2.0

from dataclasses import asdict

from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.utils import FlexibleArgumentParser


def get_prompts(num_prompts: int):
    # The default sample prompts.
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]

    if num_prompts != len(prompts):
        prompts = (prompts * ((num_prompts // len(prompts)) + 1))[:num_prompts]

    return prompts


def main(args):
    # Create prompts
    prompts = get_prompts(args.num_prompts)

    # Create a sampling params object.
    sampling_params = SamplingParams(n=args.n,
                                     temperature=args.temperature,
                                     top_p=args.top_p,
                                     top_k=args.top_k,
                                     max_tokens=args.max_tokens)

    # Create an LLM.
    # The default model is 'facebook/opt-125m'
    engine_args = EngineArgs.from_cli_args(args)
    llm = LLM(**asdict(engine_args))

    # Generate texts from the prompts.
    # The output is a list of RequestOutput objects
    # that contain the prompt, generated text, and other information.
    outputs = llm.generate(prompts, sampling_params)
    # Print the outputs.
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")


if __name__ == '__main__':
    parser = FlexibleArgumentParser()
    parser = EngineArgs.add_cli_args(parser)
    group = parser.add_argument_group("SamplingParams options")
    group.add_argument("--num-prompts",
                       type=int,
                       default=4,
                       help="Number of prompts used for inference")
    group.add_argument("--max-tokens",
                       type=int,
                       default=16,
                       help="Generated output length for sampling")
    group.add_argument('--n',
                       type=int,
                       default=1,
                       help='Number of generated sequences per prompt')
    group.add_argument('--temperature',
                       type=float,
                       default=0.8,
                       help='Temperature for text generation')
    group.add_argument('--top-p',
                       type=float,
                       default=0.95,
                       help='top_p for text generation')
    group.add_argument('--top-k',
                       type=int,
                       default=-1,
                       help='top_k for text generation')

    args = parser.parse_args()
    main(args)
Deleted file (24 lines):
@@ -1,24 +0,0 @@
# SPDX-License-Identifier: Apache-2.0

from vllm import LLM, SamplingParams

# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Create an LLM.
llm = LLM(model="meta-llama/Llama-2-13b-chat-hf", cpu_offload_gb=10)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
Deleted file (30 lines):
@@ -1,30 +0,0 @@
# SPDX-License-Identifier: Apache-2.0

from vllm import LLM

# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]

# Create an LLM.
# You should pass task="embed" for embedding models
model = LLM(
    model="intfloat/e5-mistral-7b-instruct",
    task="embed",
    enforce_eager=True,
)

# Generate embedding. The output is a list of EmbeddingRequestOutputs.
outputs = model.embed(prompts)

# Print the outputs.
for prompt, output in zip(prompts, outputs):
    embeds = output.outputs.embedding
    embeds_trimmed = ((str(embeds[:16])[:-1] +
                       ", ...]") if len(embeds) > 16 else embeds)
    print(f"Prompt: {prompt!r} | "
          f"Embeddings: {embeds_trimmed} (size={len(embeds)})")
Deleted file (34 lines):
@@ -1,34 +0,0 @@
# SPDX-License-Identifier: Apache-2.0

from huggingface_hub import hf_hub_download

from vllm import LLM, SamplingParams


def run_gguf_inference(model_path, tokenizer):
    # Sample prompts.
    prompts = [
        "How many helicopters can a human eat in one sitting?",
        "What's the future of AI?",
    ]
    prompts = [[{"role": "user", "content": prompt}] for prompt in prompts]
    # Create a sampling params object.
    sampling_params = SamplingParams(temperature=0, max_tokens=128)

    # Create an LLM.
    llm = LLM(model=model_path, tokenizer=tokenizer)

    outputs = llm.chat(prompts, sampling_params)
    # Print the outputs.
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")


if __name__ == "__main__":
    repo_id = "bartowski/Phi-3-medium-4k-instruct-GGUF"
    filename = "Phi-3-medium-4k-instruct-IQ2_M.gguf"
    tokenizer = "microsoft/Phi-3-medium-4k-instruct"
    model = hf_hub_download(repo_id, filename=filename)
    run_gguf_inference(model, tokenizer)
Deleted file (25 lines):
@@ -1,25 +0,0 @@
# SPDX-License-Identifier: Apache-2.0

from vllm import LLM

# Sample prompts.
text_1 = "What is the capital of France?"
texts_2 = [
    "The capital of Brazil is Brasilia.", "The capital of France is Paris."
]

# Create an LLM.
# You should pass task="score" for cross-encoder models
model = LLM(
    model="BAAI/bge-reranker-v2-m3",
    task="score",
    enforce_eager=True,
)

# Generate scores. The output is a list of ScoringRequestOutputs.
outputs = model.score(text_1, texts_2)

# Print the outputs.
for text_2, output in zip(texts_2, outputs):
    score = output.outputs.score
    print(f"Pair: {[text_1, text_2]!r} | Score: {score}")
@@ -14,7 +14,7 @@ def test_platform_plugins():
    import os
    example_file = os.path.join(
        os.path.dirname(os.path.dirname(os.path.dirname(current_file))),
        "examples", "offline_inference/basic.py")
        "examples", "offline_inference/basic/basic.py")
    runpy.run_path(example_file)

    # check if the plugin is loaded correctly