[doc] add missing imports (#15699)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>

commit 2914006fe0
parent 7329ff5468
@@ -23,6 +23,8 @@ It is similar to [its counterpart in HF Transformers](https://huggingface.co/doc
 except that tokenization and detokenization are also performed automatically.
 
 ```python
+from vllm import LLM
+
 llm = LLM(model="facebook/opt-125m")
 outputs = llm.generate("Hello, my name is")
 
@@ -36,6 +38,8 @@ You can optionally control the language generation by passing {class}`~vllm.Samp
 For example, you can use greedy sampling by setting `temperature=0`:
 
 ```python
+from vllm import LLM, SamplingParams
+
 llm = LLM(model="facebook/opt-125m")
 params = SamplingParams(temperature=0)
 outputs = llm.generate("Hello, my name is", params)
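
For reference, a self-contained version of the example this hunk completes might look like the following; the `print` loop at the end is an illustrative addition, not part of the diff:

```python
from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")
params = SamplingParams(temperature=0)  # greedy sampling
outputs = llm.generate("Hello, my name is", params)

for output in outputs:
    # Each RequestOutput carries the prompt and its generated completions.
    print(f"Prompt: {output.prompt!r}, Generated: {output.outputs[0].text!r}")
```
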
@@ -83,6 +87,8 @@ Base models may perform poorly as they are not trained to respond to the chat co
 :::
 
 ```python
+from vllm import LLM
+
 llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")
 conversation = [
     {
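
A minimal runnable sketch of the chat example around this hunk; the conversation contents and the final print are assumptions for illustration:

```python
from vllm import LLM

llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")
conversation = [
    {"role": "system", "content": "You are a helpful assistant"},          # assumed content
    {"role": "user", "content": "Hello, what is the capital of France?"},  # assumed content
]
# chat() applies the model's chat template before generating.
outputs = llm.chat(conversation)
print(outputs[0].outputs[0].text)
```
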
@@ -68,6 +68,8 @@ The {class}`~vllm.LLM.encode` method is available to all pooling models in vLLM.
 It returns the extracted hidden states directly, which is useful for reward models.
 
 ```python
+from vllm import LLM
+
 llm = LLM(model="Qwen/Qwen2.5-Math-RM-72B", task="reward")
 (output,) = llm.encode("Hello, my name is")
 
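
A completed sketch of the reward-model snippet; reading the pooled hidden states via `output.outputs.data` is an assumed usage, not shown in the diff:

```python
from vllm import LLM

llm = LLM(model="Qwen/Qwen2.5-Math-RM-72B", task="reward")
(output,) = llm.encode("Hello, my name is")

# The raw hidden states extracted by the pooler (assumed accessor).
data = output.outputs.data
print(f"Data: {data!r}")
```
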
@@ -81,6 +83,8 @@ The {class}`~vllm.LLM.embed` method outputs an embedding vector for each prompt.
 It is primarily designed for embedding models.
 
 ```python
+from vllm import LLM
+
 llm = LLM(model="intfloat/e5-mistral-7b-instruct", task="embed")
 (output,) = llm.embed("Hello, my name is")
 
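
Likewise for the embedding snippet, a possible end-to-end version (the `output.outputs.embedding` access and the print are assumptions):

```python
from vllm import LLM

llm = LLM(model="intfloat/e5-mistral-7b-instruct", task="embed")
(output,) = llm.embed("Hello, my name is")

# A single embedding vector for the prompt (assumed accessor).
embeds = output.outputs.embedding
print(f"Embeddings: {embeds[:4]!r}... (size={len(embeds)})")
```
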
@@ -96,6 +100,8 @@ The {class}`~vllm.LLM.classify` method outputs a probability vector for each pro
 It is primarily designed for classification models.
 
 ```python
+from vllm import LLM
+
 llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach", task="classify")
 (output,) = llm.classify("Hello, my name is")
 
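
A possible complete form of the classification snippet; the `output.outputs.probs` access is an assumption for illustration:

```python
from vllm import LLM

llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach", task="classify")
(output,) = llm.classify("Hello, my name is")

# One probability per class (assumed accessor).
probs = output.outputs.probs
print(f"Class probabilities: {probs!r} (size={len(probs)})")
```
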
@@ -116,6 +122,8 @@ To handle RAG at a higher level, you should use integration frameworks such as [
 :::
 
 ```python
+from vllm import LLM
+
 llm = LLM(model="BAAI/bge-reranker-v2-m3", task="score")
 (output,) = llm.score("What is the capital of France?",
                       "The capital of Brazil is Brasilia.")
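
A runnable sketch of the scoring snippet; printing `output.outputs.score` is an assumed usage:

```python
from vllm import LLM

llm = LLM(model="BAAI/bge-reranker-v2-m3", task="score")
(output,) = llm.score("What is the capital of France?",
                      "The capital of Brazil is Brasilia.")

# A single relevance score for the (query, document) pair (assumed accessor).
print(f"Score: {output.outputs.score}")
```
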
@@ -31,6 +31,8 @@ vLLM supports an experimental feature chunked prefill. Chunked prefill allows to
 You can enable the feature by specifying `--enable-chunked-prefill` in the command line or setting `enable_chunked_prefill=True` in the LLM constructor.
 
 ```python
+from vllm import LLM
+
 llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True)
 # Set max_num_batched_tokens to tune performance.
 # NOTE: 2048 is the default max_num_batched_tokens for chunked prefill.
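
Completed into something runnable, the chunked-prefill snippet might read as follows; the trailing `generate` call is illustrative:

```python
from vllm import LLM

llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True)
# Set max_num_batched_tokens to tune performance.
# NOTE: 2048 is the default max_num_batched_tokens for chunked prefill.
# llm = LLM(model="meta-llama/Llama-2-7b-hf",
#           enable_chunked_prefill=True,
#           max_num_batched_tokens=2048)

outputs = llm.generate("Hello, my name is")
print(outputs[0].outputs[0].text)
```
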
@@ -21,6 +21,8 @@ To input multi-modal data, follow this schema in {class}`vllm.inputs.PromptType`
 You can pass a single image to the `'image'` field of the multi-modal dictionary, as shown in the following examples:
 
 ```python
+from vllm import LLM
+
 llm = LLM(model="llava-hf/llava-1.5-7b-hf")
 
 # Refer to the HuggingFace repo for the correct format to use
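
A self-contained sketch of the single-image example; the prompt string and the local image path are assumptions:

```python
from PIL import Image
from vllm import LLM

llm = LLM(model="llava-hf/llava-1.5-7b-hf")

# Refer to the HuggingFace repo for the correct format to use
prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"

# Hypothetical local image; any PIL.Image works here.
image = Image.open("example.jpg")

outputs = llm.generate({
    "prompt": prompt,
    "multi_modal_data": {"image": image},
})
for o in outputs:
    print(o.outputs[0].text)
```
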
@@ -65,6 +67,8 @@ Full example: <gh-file:examples/offline_inference/vision_language.py>
 To substitute multiple images inside the same text prompt, you can pass in a list of images instead:
 
 ```python
+from vllm import LLM
+
 llm = LLM(
     model="microsoft/Phi-3.5-vision-instruct",
     trust_remote_code=True,  # Required to load Phi-3.5-vision
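
A possible complete form of the multi-image example; the `limit_mm_per_prompt` option, prompt string, and image paths are assumptions added for illustration:

```python
from PIL import Image
from vllm import LLM

llm = LLM(
    model="microsoft/Phi-3.5-vision-instruct",
    trust_remote_code=True,  # Required to load Phi-3.5-vision
    limit_mm_per_prompt={"image": 2},  # assumed: allow two images per prompt
)

prompt = "<|user|>\n<|image_1|>\n<|image_2|>\nWhat is the content of each image?<|end|>\n<|assistant|>\n"
image1 = Image.open("image_1.jpg")  # hypothetical local files
image2 = Image.open("image_2.jpg")

outputs = llm.generate({
    "prompt": prompt,
    "multi_modal_data": {"image": [image1, image2]},
})
print(outputs[0].outputs[0].text)
```
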
@@ -96,6 +100,8 @@ Full example: <gh-file:examples/offline_inference/vision_language_multi_image.py
 Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos:
 
 ```python
+from vllm import LLM
+
 # Specify the maximum number of frames per video to be 4. This can be changed.
 llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4})
 
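
A sketch of the video-captioning flow built on frames-as-images, as the surrounding doc describes; the frame paths, the base64 helper, and the chat message layout are all assumptions:

```python
import base64
from io import BytesIO

from PIL import Image
from vllm import LLM

# Specify the maximum number of frames per video to be 4. This can be changed.
llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4})

# Hypothetical: four frames already extracted from the video and saved as JPEGs.
frame_paths = ["frame_0.jpg", "frame_1.jpg", "frame_2.jpg", "frame_3.jpg"]

def to_data_url(path: str) -> str:
    # Re-encode the frame as a base64 data URL so it can be sent as an image_url part.
    buf = BytesIO()
    Image.open(path).convert("RGB").save(buf, format="JPEG")
    return "data:image/jpeg;base64," + base64.b64encode(buf.getvalue()).decode()

message = {
    "role": "user",
    "content": [{"type": "text",
                 "text": "Describe this set of frames. Consider them part of the same video."}],
}
for path in frame_paths:
    message["content"].append(
        {"type": "image_url", "image_url": {"url": to_data_url(path)}})

outputs = llm.chat([message])
print(outputs[0].outputs[0].text)
```
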
@@ -139,6 +145,8 @@ To input pre-computed embeddings belonging to a data type (i.e. image, video, or
 pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary.
 
 ```python
+from vllm import LLM
+
 # Inference with image embeddings as input
 llm = LLM(model="llava-hf/llava-1.5-7b-hf")
 
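
A sketch of inference with pre-computed image embeddings; the prompt string and the `torch.load` source are hypothetical:

```python
import torch
from vllm import LLM

# Inference with image embeddings as input
llm = LLM(model="llava-hf/llava-1.5-7b-hf")

prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"

# Hypothetical pre-computed embeddings with shape
# (num_items, feature_size, hidden_size of LM), e.g. saved earlier with torch.save.
image_embeds = torch.load("image_embeds.pt")

outputs = llm.generate({
    "prompt": prompt,
    "multi_modal_data": {"image": image_embeds},
})
print(outputs[0].outputs[0].text)
```
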
@@ -11,6 +11,8 @@ For example, the following code downloads the [`facebook/opt-125m`](https://hugg
 and runs it in vLLM using the default configuration.
 
 ```python
+from vllm import LLM
+
 llm = LLM(model="facebook/opt-125m")
 ```
 
@@ -47,6 +49,8 @@ To fix this, explicitly specify the model architecture by passing `config.json`
 For example:
 
 ```python
+from vllm import LLM
+
 model = LLM(
     model="cerebras/Cerebras-GPT-1.3B",
     hf_overrides={"architectures": ["GPT2LMHeadModel"]},  # GPT-2
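
Completed, the override example is just the constructor call:

```python
from vllm import LLM

model = LLM(
    model="cerebras/Cerebras-GPT-1.3B",
    hf_overrides={"architectures": ["GPT2LMHeadModel"]},  # GPT-2
)
```
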
@@ -92,6 +96,8 @@ You can further reduce memory usage by limiting the context length of the model
 and the maximum batch size (`max_num_seqs` option).
 
 ```python
+from vllm import LLM
+
 llm = LLM(model="adept/fuyu-8b",
           max_model_len=2048,
           max_num_seqs=2)