From 2914006fe09875ebfa33626d945e34173c7441c6 Mon Sep 17 00:00:00 2001
From: Reid <61492567+reidliu41@users.noreply.github.com>
Date: Fri, 28 Mar 2025 23:56:48 +0800
Subject: [PATCH] [doc] add missing imports (#15699)

Signed-off-by: reidliu41
Co-authored-by: reidliu41
---
 docs/source/models/generative_models.md  | 6 ++++++
 docs/source/models/pooling_models.md     | 8 ++++++++
 docs/source/performance/optimization.md  | 2 ++
 docs/source/serving/multimodal_inputs.md | 8 ++++++++
 docs/source/serving/offline_inference.md | 6 ++++++
 5 files changed, 30 insertions(+)

diff --git a/docs/source/models/generative_models.md b/docs/source/models/generative_models.md
index c94e940b..63fc53b0 100644
--- a/docs/source/models/generative_models.md
+++ b/docs/source/models/generative_models.md
@@ -23,6 +23,8 @@ It is similar to [its counterpart in HF Transformers](https://huggingface.co/doc
 except that tokenization and detokenization are also performed automatically.
 
 ```python
+from vllm import LLM
+
 llm = LLM(model="facebook/opt-125m")
 outputs = llm.generate("Hello, my name is")
 
@@ -36,6 +38,8 @@ You can optionally control the language generation by passing {class}`~vllm.Samp
 For example, you can use greedy sampling by setting `temperature=0`:
 
 ```python
+from vllm import LLM, SamplingParams
+
 llm = LLM(model="facebook/opt-125m")
 params = SamplingParams(temperature=0)
 outputs = llm.generate("Hello, my name is", params)
@@ -83,6 +87,8 @@ Base models may perform poorly as they are not trained to respond to the chat co
 :::
 
 ```python
+from vllm import LLM
+
 llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")
 conversation = [
     {
diff --git a/docs/source/models/pooling_models.md b/docs/source/models/pooling_models.md
index f774f3d0..dbcd846c 100644
--- a/docs/source/models/pooling_models.md
+++ b/docs/source/models/pooling_models.md
@@ -68,6 +68,8 @@ The {class}`~vllm.LLM.encode` method is available to all pooling models in vLLM.
 It returns the extracted hidden states directly, which is useful for reward models.
 
 ```python
+from vllm import LLM
+
 llm = LLM(model="Qwen/Qwen2.5-Math-RM-72B", task="reward")
 (output,) = llm.encode("Hello, my name is")
 
@@ -81,6 +83,8 @@ The {class}`~vllm.LLM.embed` method outputs an embedding vector for each prompt.
 It is primarily designed for embedding models.
 
 ```python
+from vllm import LLM
+
 llm = LLM(model="intfloat/e5-mistral-7b-instruct", task="embed")
 (output,) = llm.embed("Hello, my name is")
 
@@ -96,6 +100,8 @@ The {class}`~vllm.LLM.classify` method outputs a probability vector for each pro
 It is primarily designed for classification models.
 
 ```python
+from vllm import LLM
+
 llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach", task="classify")
 (output,) = llm.classify("Hello, my name is")
 
@@ -116,6 +122,8 @@ To handle RAG at a higher level, you should use integration frameworks such as [
 :::
 
 ```python
+from vllm import LLM
+
 llm = LLM(model="BAAI/bge-reranker-v2-m3", task="score")
 (output,) = llm.score("What is the capital of France?", "The capital of Brazil is Brasilia.")
 
diff --git a/docs/source/performance/optimization.md b/docs/source/performance/optimization.md
index 5b0f8421..ccbe8a36 100644
--- a/docs/source/performance/optimization.md
+++ b/docs/source/performance/optimization.md
@@ -31,6 +31,8 @@ vLLM supports an experimental feature chunked prefill. Chunked prefill allows to
 You can enable the feature by specifying `--enable-chunked-prefill` in the command line or setting `enable_chunked_prefill=True` in the LLM constructor.
 
 ```python
+from vllm import LLM
+
 llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True)
 # Set max_num_batched_tokens to tune performance.
 # NOTE: 2048 is the default max_num_batched_tokens for chunked prefill.
diff --git a/docs/source/serving/multimodal_inputs.md b/docs/source/serving/multimodal_inputs.md
index 2e2016c9..f45d36c3 100644
--- a/docs/source/serving/multimodal_inputs.md
+++ b/docs/source/serving/multimodal_inputs.md
@@ -21,6 +21,8 @@ To input multi-modal data, follow this schema in {class}`vllm.inputs.PromptType`
 You can pass a single image to the `'image'` field of the multi-modal dictionary, as shown in the following examples:
 
 ```python
+from vllm import LLM
+
 llm = LLM(model="llava-hf/llava-1.5-7b-hf")
 
 # Refer to the HuggingFace repo for the correct format to use
@@ -65,6 +67,8 @@ Full example:
 To substitute multiple images inside the same text prompt, you can pass in a list of images instead:
 
 ```python
+from vllm import LLM
+
 llm = LLM(
     model="microsoft/Phi-3.5-vision-instruct",
     trust_remote_code=True,  # Required to load Phi-3.5-vision
@@ -96,6 +100,8 @@ Full example:
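
For reference (not part of the diff): with the imports in place, the documented snippets become self-contained. A minimal sketch combining the chunked prefill and greedy sampling examples above, reusing the model name and the default `max_num_batched_tokens` value mentioned there; it assumes the meta-llama/Llama-2-7b-hf weights are accessible to the running environment:

```python
# Illustrative sketch only, based on the examples touched by this patch.
from vllm import LLM, SamplingParams

llm = LLM(
    model="meta-llama/Llama-2-7b-hf",
    enable_chunked_prefill=True,
    max_num_batched_tokens=2048,  # 2048 is the documented default for chunked prefill
)

params = SamplingParams(temperature=0)  # greedy sampling, as in the generation example
outputs = llm.generate("Hello, my name is", params)

for output in outputs:
    print(output.outputs[0].text)
```

This mirrors the pattern the patch establishes: each documented snippet starts with the `from vllm import ...` line it actually needs, so it can be copied and run as-is.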