Merge similar examples in offline_inference into single basic example (#12737)

parent b69692a2d8
commit 992e5c3d34
@@ -30,7 +30,7 @@ function cpu_tests() {
   # offline inference
   docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
     set -e
-    python3 examples/offline_inference/basic.py"
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
 
   # Run basic model test
   docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
@@ -24,5 +24,5 @@ remove_docker_container
 
 # Run the image and test offline inference
 docker run -e HF_TOKEN -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
-    python3 examples/offline_inference/cli.py --model meta-llama/Llama-3.2-1B
+    python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
 '
@@ -20,5 +20,5 @@ trap remove_docker_container_and_exit EXIT
 remove_docker_container
 
 # Run the image and launch offline inference
-docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic.py
+docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
 EXITCODE=$?
@@ -13,4 +13,4 @@ trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image and launch offline inference
-docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic.py
+docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic/generate.py --model facebook/opt-125m
@@ -14,6 +14,6 @@ remove_docker_container
 
 # Run the image and test offline inference/tensor parallel
 docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c '
-  python3 examples/offline_inference/basic.py
-  python3 examples/offline_inference/cli.py -tp 2
+  python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
+  python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
 '
@@ -215,18 +215,18 @@ steps:
   - examples/
   commands:
     - pip install tensorizer # for tensorizer test
-    - python3 offline_inference/basic.py
-    - python3 offline_inference/cpu_offload.py
-    - python3 offline_inference/chat.py
+    - python3 offline_inference/basic/generate.py --model facebook/opt-125m
+    - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
+    - python3 offline_inference/basic/chat.py
     - python3 offline_inference/prefix_caching.py
     - python3 offline_inference/llm_engine_example.py
     - python3 offline_inference/vision_language.py
     - python3 offline_inference/vision_language_multi_image.py
    - python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
     - python3 offline_inference/encoder_decoder.py
-    - python3 offline_inference/classification.py
-    - python3 offline_inference/embedding.py
-    - python3 offline_inference/scoring.py
+    - python3 offline_inference/basic/classify.py
+    - python3 offline_inference/basic/embed.py
+    - python3 offline_inference/basic/score.py
     - python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
 
 - label: Prefix Caching Test # 9min
@@ -147,7 +147,7 @@ class Example:
             return content
 
         content += "## Example materials\n\n"
-        for file in self.other_files:
+        for file in sorted(self.other_files):
            include = "include" if file.suffix == ".md" else "literalinclude"
            content += f":::{{admonition}} {file.relative_to(self.path)}\n"
            content += ":class: dropdown\n\n"
@@ -194,7 +194,7 @@ def generate_examples():
         path=EXAMPLE_DOC_DIR / "examples_offline_inference_index.md",
         title="Offline Inference",
         description=
-        "Offline inference examples demonstrate how to use vLLM in an offline setting, where the model is queried for predictions in batches.",  # noqa: E501
+        "Offline inference examples demonstrate how to use vLLM in an offline setting, where the model is queried for predictions in batches. We recommend starting with <project:basic.md>.",  # noqa: E501
         caption="Examples",
     ),
 }
@@ -170,7 +170,7 @@ vLLM CPU backend supports the following vLLM features:
 sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library
 find / -name *libtcmalloc* # find the dynamic link library path
 export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD
-python examples/offline_inference/basic.py # run vLLM
+python examples/offline_inference/basic/basic.py # run vLLM
 ```
 
 - When using the online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserving CPU 30 and 31 for the framework and using CPU 0-29 for OpenMP:
@@ -207,7 +207,7 @@ CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ
 
 # On this platform, it is recommend to only bind openMP threads on logical CPU cores 0-7 or 8-15
 $ export VLLM_CPU_OMP_THREADS_BIND=0-7
-$ python examples/offline_inference/basic.py
+$ python examples/offline_inference/basic/basic.py
 ```
 
 - If using vLLM CPU backend on a multi-socket machine with NUMA, be aware to set CPU cores using `VLLM_CPU_OMP_THREADS_BIND` to avoid cross NUMA node memory access.
@@ -40,7 +40,7 @@ For non-CUDA platforms, please refer [here](#installation-index) for specific in
 
 ## Offline Batched Inference
 
-With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: <gh-file:examples/offline_inference/basic.py>
+With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: <gh-file:examples/offline_inference/basic/basic.py>
 
 The first line of this example imports the classes {class}`~vllm.LLM` and {class}`~vllm.SamplingParams`:
 
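For reference, the import this quickstart paragraph points at, and the pattern all of the relocated basic examples follow, looks roughly like this (a minimal sketch; the prompt and sampling values are illustrative only):

```python
from vllm import LLM, SamplingParams

# Illustrative prompt and sampling values; anything accepted by
# SamplingParams works the same way.
prompts = ["Hello, my name is"]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

llm = LLM(model="facebook/opt-125m")
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    print(f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}")
```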
@@ -46,7 +46,7 @@ for output in outputs:
     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
 ```
 
-A code example can be found here: <gh-file:examples/offline_inference/basic.py>
+A code example can be found here: <gh-file:examples/offline_inference/basic/basic.py>
 
 ### `LLM.beam_search`
 
@@ -103,7 +103,7 @@ for output in outputs:
     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
 ```
 
-A code example can be found here: <gh-file:examples/offline_inference/chat.py>
+A code example can be found here: <gh-file:examples/offline_inference/basic/chat.py>
 
 If the model doesn't have a chat template or you want to specify another one,
 you can explicitly pass a chat template:
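A minimal sketch of passing an explicit chat template, mirroring what the new `basic/chat.py` does behind its `--chat-template-path` flag (the model name is just the script's default and `template.jinja` is a placeholder path, not a file shipped with vLLM):

```python
from vllm import LLM, SamplingParams

llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct")
sampling_params = SamplingParams(temperature=0.5)

conversation = [
    {"role": "system", "content": "You are a helpful assistant"},
    {"role": "user", "content": "Hello"},
]

# "template.jinja" is a placeholder path to a custom Jinja chat template.
with open("template.jinja") as f:
    chat_template = f.read()

outputs = llm.chat(
    conversation,
    sampling_params,
    use_tqdm=False,
    chat_template=chat_template,
)
print(outputs[0].outputs[0].text)
```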
@@ -88,7 +88,7 @@ embeds = output.outputs.embedding
 print(f"Embeddings: {embeds!r} (size={len(embeds)})")
 ```
 
-A code example can be found here: <gh-file:examples/offline_inference/embedding.py>
+A code example can be found here: <gh-file:examples/offline_inference/basic/embed.py>
 
 ### `LLM.classify`
 
@@ -103,7 +103,7 @@ probs = output.outputs.probs
 print(f"Class Probabilities: {probs!r} (size={len(probs)})")
 ```
 
-A code example can be found here: <gh-file:examples/offline_inference/classification.py>
+A code example can be found here: <gh-file:examples/offline_inference/basic/classify.py>
 
 ### `LLM.score`
 
@@ -125,7 +125,7 @@ score = output.outputs.score
 print(f"Score: {score}")
 ```
 
-A code example can be found here: <gh-file:examples/offline_inference/scoring.py>
+A code example can be found here: <gh-file:examples/offline_inference/basic/score.py>
 
 ## Online Serving
 
@@ -1,47 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-from vllm import LLM, SamplingParams
-from vllm.utils import FlexibleArgumentParser
-
-
-def main():
-
-    parser = FlexibleArgumentParser(description='AQLM examples')
-
-    parser.add_argument('--model',
-                        '-m',
-                        type=str,
-                        default=None,
-                        help='model path, as for HF')
-    parser.add_argument('--choice',
-                        '-c',
-                        type=int,
-                        default=0,
-                        help='known good models by index, [0-4]')
-    parser.add_argument('--tensor-parallel-size',
-                        '-t',
-                        type=int,
-                        default=1,
-                        help='tensor parallel size')
-
-    args = parser.parse_args()
-
-    models = [
-        "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf",
-        "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-2x8-hf",
-        "ISTA-DASLab/Llama-2-13b-AQLM-2Bit-1x16-hf",
-        "ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf",
-        "BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf",
-    ]
-
-    model = LLM(args.model if args.model is not None else models[args.choice],
-                tensor_parallel_size=args.tensor_parallel_size)
-
-    sampling_params = SamplingParams(max_tokens=100, temperature=0)
-    outputs = model.generate("Hello my name is",
-                             sampling_params=sampling_params)
-    print(outputs[0].outputs[0].text)
-
-
-if __name__ == '__main__':
-    main()
@@ -1,28 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-from vllm import LLM, SamplingParams
-
-# Sample prompts.
-prompts = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
-]
-# Create a sampling params object.
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-
-# Create an LLM.
-llm = LLM(model="snowflake/snowflake-arctic-instruct",
-          quantization="deepspeedfp",
-          tensor_parallel_size=8,
-          trust_remote_code=True)
-# Generate texts from the prompts. The output is a list of RequestOutput objects
-# that contain the prompt, generated text, and other information.
-
-outputs = llm.generate(prompts, sampling_params)
-# Print the outputs.
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
examples/offline_inference/basic/README.md (new file, 94 lines)
@@ -0,0 +1,94 @@
+# Basic
+
+The `LLM` class provides the primary Python interface for doing offline inference, which is interacting with a model without using a separate model inference server.
+
+## Usage
+
+The first script in this example shows the most basic usage of vLLM. If you are new to Python and vLLM, you should start here.
+
+```bash
+python examples/offline_inference/basic/basic.py
+```
+
+The rest of the scripts include an [argument parser](https://docs.python.org/3/library/argparse.html), which you can use to pass any arguments that are compatible with [`LLM`](https://docs.vllm.ai/en/latest/api/offline_inference/llm.html). Try running the script with `--help` for a list of all available arguments.
+
+```bash
+python examples/offline_inference/basic/classify.py
+```
+
+```bash
+python examples/offline_inference/basic/embed.py
+```
+
+```bash
+python examples/offline_inference/basic/score.py
+```
+
+The chat and generate scripts also accept the [sampling parameters](https://docs.vllm.ai/en/latest/api/inference_params.html#sampling-parameters): `max_tokens`, `temperature`, `top_p` and `top_k`.
+
+```bash
+python examples/offline_inference/basic/chat.py
+```
+
+```bash
+python examples/offline_inference/basic/generate.py
+```
+
+## Features
+
+In the scripts that support passing arguments, you can experiment with the following features.
+
+### Default generation config
+
+The `--generation-config` argument specifies where the generation config will be loaded from when calling `LLM.get_default_sampling_params()`. If set to ‘auto’, the generation config will be loaded from the model path. If set to a folder path, the generation config will be loaded from the specified folder path. If it is not provided, vLLM defaults will be used.
+
+> If max_new_tokens is specified in the generation config, then it sets a server-wide limit on the number of output tokens for all requests.
+
+Try it yourself with the following argument:
+
+```bash
+--generation-config auto
+```
+
+### Quantization
+
+#### AQLM
+
+vLLM supports models that are quantized using AQLM.
+
+Try one yourself by passing one of the following models to the `--model` argument:
+
+- `ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf`
+- `ISTA-DASLab/Llama-2-7b-AQLM-2Bit-2x8-hf`
+- `ISTA-DASLab/Llama-2-13b-AQLM-2Bit-1x16-hf`
+- `ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf`
+- `BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf`
+
+> Some of these models are likely to be too large for a single GPU. You can split them across multiple GPUs by setting `--tensor-parallel-size` to the number of required GPUs.
+
+#### GGUF
+
+vLLM supports models that are quantized using GGUF.
+
+Try one yourself by downloading a GGUF quantised model and using the following arguments:
+
+```python
+from huggingface_hub import hf_hub_download
+
+repo_id = "bartowski/Phi-3-medium-4k-instruct-GGUF"
+filename = "Phi-3-medium-4k-instruct-IQ2_M.gguf"
+print(hf_hub_download(repo_id, filename=filename))
+```
+
+```bash
+--model {local-path-printed-above} --tokenizer microsoft/Phi-3-medium-4k-instruct
+```
+
+### CPU offload
+
+The `--cpu-offload-gb` argument can be seen as a virtual way to increase the GPU memory size. For example, if you have one 24 GB GPU and set this to 10, you can virtually think of it as a 34 GB GPU. Then you can load a 13B model with BF16 weights, which requires at least 26 GB of GPU memory. Note that this requires a fast CPU-GPU interconnect, as part of the model is loaded from CPU memory to GPU memory on the fly in each model forward pass.
+
+Try it yourself with the following arguments:
+
+```bash
+--model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
+```
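To make the `--generation-config` behaviour described above concrete, here is a minimal sketch of the underlying API calls; the model name is taken from one of the deleted examples further down in this diff and is illustrative only:

```python
from vllm import LLM

# Load the model's own generation config instead of vLLM's built-in defaults.
llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct", generation_config="auto")

# This is the call that --generation-config feeds into.
sampling_params = llm.get_default_sampling_params()
# Individual fields can still be overridden afterwards.
sampling_params.temperature = 0.5

outputs = llm.generate(["Hello, my name is"], sampling_params)
print(outputs[0].outputs[0].text)
```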
examples/offline_inference/basic/chat.py (new file, 98 lines)
@@ -0,0 +1,98 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from vllm import LLM, EngineArgs
+from vllm.utils import FlexibleArgumentParser
+
+
+def main(args: dict):
+    # Pop arguments not used by LLM
+    max_tokens = args.pop("max_tokens")
+    temperature = args.pop("temperature")
+    top_p = args.pop("top_p")
+    top_k = args.pop("top_k")
+    chat_template_path = args.pop("chat_template_path")
+
+    # Create an LLM
+    llm = LLM(**args)
+
+    # Create sampling params object
+    sampling_params = llm.get_default_sampling_params()
+    if max_tokens is not None:
+        sampling_params.max_tokens = max_tokens
+    if temperature is not None:
+        sampling_params.temperature = temperature
+    if top_p is not None:
+        sampling_params.top_p = top_p
+    if top_k is not None:
+        sampling_params.top_k = top_k
+
+    def print_outputs(outputs):
+        for output in outputs:
+            prompt = output.prompt
+            generated_text = output.outputs[0].text
+            print(f"Prompt: {prompt!r}")
+            print(f"Generated text: {generated_text!r}")
+        print("-" * 80)
+
+    print("=" * 80)
+
+    # In this script, we demonstrate how to pass input to the chat method:
+    conversation = [
+        {
+            "role": "system",
+            "content": "You are a helpful assistant"
+        },
+        {
+            "role": "user",
+            "content": "Hello"
+        },
+        {
+            "role": "assistant",
+            "content": "Hello! How can I assist you today?"
+        },
+        {
+            "role": "user",
+            "content":
+            "Write an essay about the importance of higher education.",
+        },
+    ]
+    outputs = llm.chat(conversation, sampling_params, use_tqdm=False)
+    print_outputs(outputs)
+
+    # You can run batch inference with llm.chat API
+    conversations = [conversation for _ in range(10)]
+
+    # We turn on tqdm progress bar to verify it's indeed running batch inference
+    outputs = llm.chat(conversations, sampling_params, use_tqdm=True)
+    print_outputs(outputs)
+
+    # A chat template can be optionally supplied.
+    # If not, the model will use its default chat template.
+    if chat_template_path is not None:
+        with open(chat_template_path) as f:
+            chat_template = f.read()
+
+        outputs = llm.chat(
+            conversations,
+            sampling_params,
+            use_tqdm=False,
+            chat_template=chat_template,
+        )
+
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser()
+    # Add engine args
+    engine_group = parser.add_argument_group("Engine arguments")
+    EngineArgs.add_cli_args(engine_group)
+    engine_group.set_defaults(model="meta-llama/Llama-3.2-1B-Instruct")
+    # Add sampling params
+    sampling_group = parser.add_argument_group("Sampling parameters")
+    sampling_group.add_argument("--max-tokens", type=int)
+    sampling_group.add_argument("--temperature", type=float)
+    sampling_group.add_argument("--top-p", type=float)
+    sampling_group.add_argument("--top-k", type=int)
+    # Add example params
+    parser.add_argument("--chat-template-path", type=str)
+    args: dict = vars(parser.parse_args())
+    main(args)
examples/offline_inference/basic/classify.py (new file, 42 lines)
@@ -0,0 +1,42 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from argparse import Namespace
+
+from vllm import LLM, EngineArgs
+from vllm.utils import FlexibleArgumentParser
+
+
+def main(args: Namespace):
+    # Sample prompts.
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+
+    # Create an LLM.
+    # You should pass task="classify" for classification models
+    model = LLM(**vars(args))
+
+    # Generate logits. The output is a list of ClassificationRequestOutputs.
+    outputs = model.classify(prompts)
+
+    # Print the outputs.
+    for prompt, output in zip(prompts, outputs):
+        probs = output.outputs.probs
+        probs_trimmed = ((str(probs[:16])[:-1] +
+                          ", ...]") if len(probs) > 16 else probs)
+        print(f"Prompt: {prompt!r} | "
+              f"Class Probabilities: {probs_trimmed} (size={len(probs)})")
+
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser()
+    parser = EngineArgs.add_cli_args(parser)
+    # Set example specific arguments
+    parser.set_defaults(model="jason9693/Qwen2.5-1.5B-apeach",
+                        task="classify",
+                        enforce_eager=True)
+    args = parser.parse_args()
+    main(args)
examples/offline_inference/basic/embed.py (new file, 42 lines)
@@ -0,0 +1,42 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from argparse import Namespace
+
+from vllm import LLM, EngineArgs
+from vllm.utils import FlexibleArgumentParser
+
+
+def main(args: Namespace):
+    # Sample prompts.
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+
+    # Create an LLM.
+    # You should pass task="embed" for embedding models
+    model = LLM(**vars(args))
+
+    # Generate embedding. The output is a list of EmbeddingRequestOutputs.
+    outputs = model.embed(prompts)
+
+    # Print the outputs.
+    for prompt, output in zip(prompts, outputs):
+        embeds = output.outputs.embedding
+        embeds_trimmed = ((str(embeds[:16])[:-1] +
+                           ", ...]") if len(embeds) > 16 else embeds)
+        print(f"Prompt: {prompt!r} | "
+              f"Embeddings: {embeds_trimmed} (size={len(embeds)})")
+
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser()
+    parser = EngineArgs.add_cli_args(parser)
+    # Set example specific arguments
+    parser.set_defaults(model="intfloat/e5-mistral-7b-instruct",
+                        task="embed",
+                        enforce_eager=True)
+    args = parser.parse_args()
+    main(args)
examples/offline_inference/basic/generate.py (new file, 57 lines)
@@ -0,0 +1,57 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from vllm import LLM, EngineArgs
+from vllm.utils import FlexibleArgumentParser
+
+
+def main(args: dict):
+    # Pop arguments not used by LLM
+    max_tokens = args.pop("max_tokens")
+    temperature = args.pop("temperature")
+    top_p = args.pop("top_p")
+    top_k = args.pop("top_k")
+
+    # Create an LLM
+    llm = LLM(**args)
+
+    # Create a sampling params object
+    sampling_params = llm.get_default_sampling_params()
+    if max_tokens is not None:
+        sampling_params.max_tokens = max_tokens
+    if temperature is not None:
+        sampling_params.temperature = temperature
+    if top_p is not None:
+        sampling_params.top_p = top_p
+    if top_k is not None:
+        sampling_params.top_k = top_k
+
+    # Generate texts from the prompts. The output is a list of RequestOutput
+    # objects that contain the prompt, generated text, and other information.
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+    outputs = llm.generate(prompts, sampling_params)
+    # Print the outputs.
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser()
+    # Add engine args
+    engine_group = parser.add_argument_group("Engine arguments")
+    EngineArgs.add_cli_args(engine_group)
+    engine_group.set_defaults(model="meta-llama/Llama-3.2-1B-Instruct")
+    # Add sampling params
+    sampling_group = parser.add_argument_group("Sampling parameters")
+    sampling_group.add_argument("--max-tokens", type=int)
+    sampling_group.add_argument("--temperature", type=float)
+    sampling_group.add_argument("--top-p", type=float)
+    sampling_group.add_argument("--top-k", type=int)
+    args: dict = vars(parser.parse_args())
+    main(args)
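The `--cpu-offload-gb` flag that the test pipeline now passes to `generate.py` maps onto the `cpu_offload_gb` engine argument. For context, a minimal sketch of the equivalent direct API usage, with values mirroring the deleted CPU-offload example further down in this diff:

```python
from vllm import LLM, SamplingParams

# Offload 10 GB of weights to CPU memory so a 13B BF16 model can run on a 24 GB GPU.
llm = LLM(model="meta-llama/Llama-2-13b-chat-hf", cpu_offload_gb=10)

sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
outputs = llm.generate(["Hello, my name is"], sampling_params)
print(outputs[0].outputs[0].text)
```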
examples/offline_inference/basic/score.py (new file, 38 lines)
@@ -0,0 +1,38 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from argparse import Namespace
+
+from vllm import LLM, EngineArgs
+from vllm.utils import FlexibleArgumentParser
+
+
+def main(args: Namespace):
+    # Sample prompts.
+    text_1 = "What is the capital of France?"
+    texts_2 = [
+        "The capital of Brazil is Brasilia.",
+        "The capital of France is Paris.",
+    ]
+
+    # Create an LLM.
+    # You should pass task="score" for cross-encoder models
+    model = LLM(**vars(args))
+
+    # Generate scores. The output is a list of ScoringRequestOutputs.
+    outputs = model.score(text_1, texts_2)
+
+    # Print the outputs.
+    for text_2, output in zip(texts_2, outputs):
+        score = output.outputs.score
+        print(f"Pair: {[text_1, text_2]!r} | Score: {score}")
+
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser()
+    parser = EngineArgs.add_cli_args(parser)
+    # Set example specific arguments
+    parser.set_defaults(model="BAAI/bge-reranker-v2-m3",
+                        task="score",
+                        enforce_eager=True)
+    args = parser.parse_args()
+    main(args)
@@ -1,32 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-from vllm import LLM
-
-# Sample prompts.
-prompts = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
-]
-
-# Create an LLM with built-in default generation config.
-# The generation config is set to None by default to keep
-# the behavior consistent with the previous version.
-# If you want to use the default generation config from the model,
-# you should set the generation_config to "auto".
-llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct", generation_config="auto")
-
-# Load the default sampling parameters from the model.
-sampling_params = llm.get_default_sampling_params()
-# Modify the sampling parameters if needed.
-sampling_params.temperature = 0.5
-
-# Generate texts from the prompts. The output is a list of RequestOutput objects
-# that contain the prompt, generated text, and other information.
-outputs = llm.generate(prompts, sampling_params)
-# Print the outputs.
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
@@ -1,82 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-from vllm import LLM, SamplingParams
-
-llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")
-sampling_params = SamplingParams(temperature=0.5)
-
-
-def print_outputs(outputs):
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-    print("-" * 80)
-
-
-print("=" * 80)
-
-# In this script, we demonstrate how to pass input to the chat method:
-
-conversation = [
-    {
-        "role": "system",
-        "content": "You are a helpful assistant"
-    },
-    {
-        "role": "user",
-        "content": "Hello"
-    },
-    {
-        "role": "assistant",
-        "content": "Hello! How can I assist you today?"
-    },
-    {
-        "role": "user",
-        "content": "Write an essay about the importance of higher education.",
-    },
-]
-outputs = llm.chat(conversation,
-                   sampling_params=sampling_params,
-                   use_tqdm=False)
-print_outputs(outputs)
-
-# You can run batch inference with llm.chat API
-conversation = [
-    {
-        "role": "system",
-        "content": "You are a helpful assistant"
-    },
-    {
-        "role": "user",
-        "content": "Hello"
-    },
-    {
-        "role": "assistant",
-        "content": "Hello! How can I assist you today?"
-    },
-    {
-        "role": "user",
-        "content": "Write an essay about the importance of higher education.",
-    },
-]
-conversations = [conversation for _ in range(10)]
-
-# We turn on tqdm progress bar to verify it's indeed running batch inference
-outputs = llm.chat(messages=conversations,
-                   sampling_params=sampling_params,
-                   use_tqdm=True)
-print_outputs(outputs)
-
-# A chat template can be optionally supplied.
-# If not, the model will use its default chat template.
-
-# with open('template_falcon_180b.jinja', "r") as f:
-#     chat_template = f.read()
-
-# outputs = llm.chat(
-#     conversations,
-#     sampling_params=sampling_params,
-#     use_tqdm=False,
-#     chat_template=chat_template,
-# )
@@ -1,30 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-from vllm import LLM
-
-# Sample prompts.
-prompts = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
-]
-
-# Create an LLM.
-# You should pass task="classify" for classification models
-model = LLM(
-    model="jason9693/Qwen2.5-1.5B-apeach",
-    task="classify",
-    enforce_eager=True,
-)
-
-# Generate logits. The output is a list of ClassificationRequestOutputs.
-outputs = model.classify(prompts)
-
-# Print the outputs.
-for prompt, output in zip(prompts, outputs):
-    probs = output.outputs.probs
-    probs_trimmed = ((str(probs[:16])[:-1] +
-                      ", ...]") if len(probs) > 16 else probs)
-    print(f"Prompt: {prompt!r} | "
-          f"Class Probabilities: {probs_trimmed} (size={len(probs)})")
@@ -1,82 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-from dataclasses import asdict
-
-from vllm import LLM, SamplingParams
-from vllm.engine.arg_utils import EngineArgs
-from vllm.utils import FlexibleArgumentParser
-
-
-def get_prompts(num_prompts: int):
-    # The default sample prompts.
-    prompts = [
-        "Hello, my name is",
-        "The president of the United States is",
-        "The capital of France is",
-        "The future of AI is",
-    ]
-
-    if num_prompts != len(prompts):
-        prompts = (prompts * ((num_prompts // len(prompts)) + 1))[:num_prompts]
-
-    return prompts
-
-
-def main(args):
-    # Create prompts
-    prompts = get_prompts(args.num_prompts)
-
-    # Create a sampling params object.
-    sampling_params = SamplingParams(n=args.n,
-                                     temperature=args.temperature,
-                                     top_p=args.top_p,
-                                     top_k=args.top_k,
-                                     max_tokens=args.max_tokens)
-
-    # Create an LLM.
-    # The default model is 'facebook/opt-125m'
-    engine_args = EngineArgs.from_cli_args(args)
-    llm = LLM(**asdict(engine_args))
-
-    # Generate texts from the prompts.
-    # The output is a list of RequestOutput objects
-    # that contain the prompt, generated text, and other information.
-    outputs = llm.generate(prompts, sampling_params)
-    # Print the outputs.
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-
-
-if __name__ == '__main__':
-    parser = FlexibleArgumentParser()
-    parser = EngineArgs.add_cli_args(parser)
-    group = parser.add_argument_group("SamplingParams options")
-    group.add_argument("--num-prompts",
-                       type=int,
-                       default=4,
-                       help="Number of prompts used for inference")
-    group.add_argument("--max-tokens",
-                       type=int,
-                       default=16,
-                       help="Generated output length for sampling")
-    group.add_argument('--n',
-                       type=int,
-                       default=1,
-                       help='Number of generated sequences per prompt')
-    group.add_argument('--temperature',
-                       type=float,
-                       default=0.8,
-                       help='Temperature for text generation')
-    group.add_argument('--top-p',
-                       type=float,
-                       default=0.95,
-                       help='top_p for text generation')
-    group.add_argument('--top-k',
-                       type=int,
-                       default=-1,
-                       help='top_k for text generation')
-
-    args = parser.parse_args()
-    main(args)
@@ -1,24 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-from vllm import LLM, SamplingParams
-
-# Sample prompts.
-prompts = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
-]
-# Create a sampling params object.
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-
-# Create an LLM.
-llm = LLM(model="meta-llama/Llama-2-13b-chat-hf", cpu_offload_gb=10)
-# Generate texts from the prompts. The output is a list of RequestOutput objects
-# that contain the prompt, generated text, and other information.
-outputs = llm.generate(prompts, sampling_params)
-# Print the outputs.
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
@@ -1,30 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-from vllm import LLM
-
-# Sample prompts.
-prompts = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
-]
-
-# Create an LLM.
-# You should pass task="embed" for embedding models
-model = LLM(
-    model="intfloat/e5-mistral-7b-instruct",
-    task="embed",
-    enforce_eager=True,
-)
-
-# Generate embedding. The output is a list of EmbeddingRequestOutputs.
-outputs = model.embed(prompts)
-
-# Print the outputs.
-for prompt, output in zip(prompts, outputs):
-    embeds = output.outputs.embedding
-    embeds_trimmed = ((str(embeds[:16])[:-1] +
-                       ", ...]") if len(embeds) > 16 else embeds)
-    print(f"Prompt: {prompt!r} | "
-          f"Embeddings: {embeds_trimmed} (size={len(embeds)})")
@@ -1,34 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-from huggingface_hub import hf_hub_download
-
-from vllm import LLM, SamplingParams
-
-
-def run_gguf_inference(model_path, tokenizer):
-    # Sample prompts.
-    prompts = [
-        "How many helicopters can a human eat in one sitting?",
-        "What's the future of AI?",
-    ]
-    prompts = [[{"role": "user", "content": prompt}] for prompt in prompts]
-    # Create a sampling params object.
-    sampling_params = SamplingParams(temperature=0, max_tokens=128)
-
-    # Create an LLM.
-    llm = LLM(model=model_path, tokenizer=tokenizer)
-
-    outputs = llm.chat(prompts, sampling_params)
-    # Print the outputs.
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-
-
-if __name__ == "__main__":
-    repo_id = "bartowski/Phi-3-medium-4k-instruct-GGUF"
-    filename = "Phi-3-medium-4k-instruct-IQ2_M.gguf"
-    tokenizer = "microsoft/Phi-3-medium-4k-instruct"
-    model = hf_hub_download(repo_id, filename=filename)
-    run_gguf_inference(model, tokenizer)
@@ -1,25 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-from vllm import LLM
-
-# Sample prompts.
-text_1 = "What is the capital of France?"
-texts_2 = [
-    "The capital of Brazil is Brasilia.", "The capital of France is Paris."
-]
-
-# Create an LLM.
-# You should pass task="score" for cross-encoder models
-model = LLM(
-    model="BAAI/bge-reranker-v2-m3",
-    task="score",
-    enforce_eager=True,
-)
-
-# Generate scores. The output is a list of ScoringRequestOutputs.
-outputs = model.score(text_1, texts_2)
-
-# Print the outputs.
-for text_2, output in zip(texts_2, outputs):
-    score = output.outputs.score
-    print(f"Pair: {[text_1, text_2]!r} | Score: {score}")
@@ -14,7 +14,7 @@ def test_platform_plugins():
     import os
     example_file = os.path.join(
         os.path.dirname(os.path.dirname(os.path.dirname(current_file))),
-        "examples", "offline_inference/basic.py")
+        "examples", "offline_inference/basic/basic.py")
     runpy.run_path(example_file)
 
     # check if the plugin is loaded correctly