[Misc] refactor examples series (#16708)

Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
Reid 2025-04-16 18:16:36 +08:00 committed by GitHub
parent 21378a2323
commit 7168920491
11 changed files with 220 additions and 153 deletions

View File

@@ -50,6 +50,13 @@ def initialize_engine(args: argparse.Namespace) -> LLMEngine:
return LLMEngine.from_engine_args(engine_args)
def parse_args():
parser = FlexibleArgumentParser(
description='Demo on using the LLMEngine class directly')
parser = EngineArgs.add_cli_args(parser)
return parser.parse_args()
def main(args: argparse.Namespace):
"""Main function that sets up and runs the prompt processing."""
engine = initialize_engine(args)
@@ -58,8 +65,5 @@ def main(args: argparse.Namespace):
if __name__ == '__main__':
parser = FlexibleArgumentParser(
description='Demo on using the LLMEngine class directly')
parser = EngineArgs.add_cli_args(parser)
args = parser.parse_args()
args = parse_args()
main(args)
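With this change, the LLMEngine demo's entry point reads roughly as follows. This is a minimal sketch: only `initialize_engine`'s return line and the new `parse_args`/`main` wiring appear in the hunks, so the imports and the `EngineArgs.from_cli_args` line are assumptions based on the usual vLLM example layout, and the prompt-processing loop is left elided.

```python
import argparse

from vllm import EngineArgs, LLMEngine
from vllm.utils import FlexibleArgumentParser


def initialize_engine(args: argparse.Namespace) -> LLMEngine:
    """Initialize the LLMEngine from the command line arguments."""
    # Assumption: the body converts CLI args into EngineArgs, as in the
    # stock example; only the return line is visible in the hunk above.
    engine_args = EngineArgs.from_cli_args(args)
    return LLMEngine.from_engine_args(engine_args)


def parse_args():
    parser = FlexibleArgumentParser(
        description='Demo on using the LLMEngine class directly')
    parser = EngineArgs.add_cli_args(parser)
    return parser.parse_args()


def main(args: argparse.Namespace):
    """Main function that sets up and runs the prompt processing."""
    engine = initialize_engine(args)
    # The prompt-processing loop of the example (elided in the hunks) runs
    # here, unchanged by this commit.
    ...


if __name__ == '__main__':
    args = parse_args()
    main(args)
```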

View File

@@ -23,10 +23,6 @@ import gradio as gr
from openai import OpenAI
def create_openai_client(api_key, base_url):
return OpenAI(api_key=api_key, base_url=base_url)
def format_history_to_openai(history):
history_openai_format = [{
"role": "system",

View File

@@ -303,12 +303,7 @@ example_function_map = {
}
def main(args) -> None:
chat_type = args.chat_type
example_function_map[chat_type]()
if __name__ == "__main__":
def parse_args():
parser = FlexibleArgumentParser(
description='Demo on using OpenAI client for online serving with '
'multimodal language models served with vLLM.')
@@ -318,5 +313,14 @@ if __name__ == "__main__":
default="single-image",
choices=list(example_function_map.keys()),
help='Conversation type with multimodal data.')
args = parser.parse_args()
return parser.parse_args()
def main(args) -> None:
chat_type = args.chat_type
example_function_map[chat_type]()
if __name__ == "__main__":
args = parse_args()
main(args)
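The multimodal chat demo now follows the same split. Below is a sketch of the new wiring; `example_function_map` is reduced to a placeholder entry and the `--chat-type` flag name is inferred from `args.chat_type`, since neither is fully shown in the hunks.

```python
from vllm.utils import FlexibleArgumentParser

# Placeholder: the real file defines one function per conversation type and
# registers them all here.
example_function_map = {
    "single-image": lambda: print("run the single-image conversation"),
}


def parse_args():
    parser = FlexibleArgumentParser(
        description='Demo on using OpenAI client for online serving with '
        'multimodal language models served with vLLM.')
    parser.add_argument('--chat-type',  # flag name inferred from args.chat_type
                        type=str,
                        default="single-image",
                        choices=list(example_function_map.keys()),
                        help='Conversation type with multimodal data.')
    return parser.parse_args()


def main(args) -> None:
    chat_type = args.chat_type
    example_function_map[chat_type]()


if __name__ == "__main__":
    args = parse_args()
    main(args)
```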

View File

@@ -1,6 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
"""
To run this example, you can start the vLLM server
without any specific flags:
```bash
@@ -8,7 +8,7 @@ VLLM_USE_V1=0 vllm serve unsloth/Llama-3.2-1B-Instruct \
--guided-decoding-backend outlines
```
This example demonstrates how to generate chat completions
using the OpenAI Python client library.
"""
@@ -18,15 +18,6 @@ from openai import OpenAI
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
tools = [
{
"type": "function",
@@ -116,21 +107,36 @@ messages = [
},
]
chat_completion = client.chat.completions.create(
messages=messages,
model=model,
tools=tools,
tool_choice="required",
stream=True # Enable streaming response
)
for chunk in chat_completion:
if chunk.choices and chunk.choices[0].delta.tool_calls:
print(chunk.choices[0].delta.tool_calls)
def main():
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
chat_completion = client.chat.completions.create(messages=messages,
model=model,
tools=tools,
tool_choice="required")
models = client.models.list()
model = models.data[0].id
print(chat_completion.choices[0].message.tool_calls)
chat_completion = client.chat.completions.create(
messages=messages,
model=model,
tools=tools,
tool_choice="required",
stream=True # Enable streaming response
)
for chunk in chat_completion:
if chunk.choices and chunk.choices[0].delta.tool_calls:
print(chunk.choices[0].delta.tool_calls)
chat_completion = client.chat.completions.create(messages=messages,
model=model,
tools=tools,
tool_choice="required")
print(chat_completion.choices[0].message.tool_calls)
if __name__ == "__main__":
main()
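Consolidated, the tool-calling example's new `main()` issues the streaming request first and the non-streaming one second, exactly as in the hunks. The module-level `tools` schema and `messages` are trimmed to small placeholders here because the hunks only show their tail.

```python
from openai import OpenAI

openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

# Placeholder tool schema and conversation; the real example defines a richer
# get_current_weather tool and message list earlier in the file.
tools = [{
    "type": "function",
    "function": {
        "name": "get_current_weather",
        "description": "Get the current weather in a given location",
        "parameters": {
            "type": "object",
            "properties": {
                "city": {
                    "type": "string"
                }
            },
            "required": ["city"],
        },
    },
}]
messages = [{"role": "user", "content": "What is the weather in Dallas?"}]


def main():
    client = OpenAI(
        # defaults to os.environ.get("OPENAI_API_KEY")
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    models = client.models.list()
    model = models.data[0].id

    chat_completion = client.chat.completions.create(
        messages=messages,
        model=model,
        tools=tools,
        tool_choice="required",
        stream=True  # Enable streaming response
    )

    for chunk in chat_completion:
        if chunk.choices and chunk.choices[0].delta.tool_calls:
            print(chunk.choices[0].delta.tool_calls)

    chat_completion = client.chat.completions.create(messages=messages,
                                                     model=model,
                                                     tools=tools,
                                                     tool_choice="required")

    print(chat_completion.choices[0].message.tool_calls)


if __name__ == "__main__":
    main()
```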

View File

@@ -3,8 +3,8 @@
An example shows how to generate chat completions from reasoning models
like DeepSeekR1.
To run this example, you need to start the vLLM server
with the reasoning parser:
```bash
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
@@ -21,35 +21,44 @@ from openai import OpenAI
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
def main():
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
# Round 1
messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
response = client.chat.completions.create(model=model, messages=messages)
models = client.models.list()
model = models.data[0].id
reasoning_content = response.choices[0].message.reasoning_content
content = response.choices[0].message.content
# Round 1
messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
# ruff: noqa: E501
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
response = client.chat.completions.create(model=model, messages=messages)
print("reasoning_content for Round 1:", reasoning_content)
print("content for Round 1:", content)
reasoning_content = response.choices[0].message.reasoning_content
content = response.choices[0].message.content
# Round 2
messages.append({"role": "assistant", "content": content})
messages.append({
"role": "user",
"content": "How many Rs are there in the word 'strawberry'?",
})
response = client.chat.completions.create(model=model, messages=messages)
print("reasoning_content for Round 1:", reasoning_content)
print("content for Round 1:", content)
reasoning_content = response.choices[0].message.reasoning_content
content = response.choices[0].message.content
# Round 2
messages.append({"role": "assistant", "content": content})
messages.append({
"role":
"user",
"content":
"How many Rs are there in the word 'strawberry'?",
})
response = client.chat.completions.create(model=model, messages=messages)
print("reasoning_content for Round 2:", reasoning_content)
print("content for Round 2:", content)
reasoning_content = response.choices[0].message.reasoning_content
content = response.choices[0].message.content
print("reasoning_content for Round 2:", reasoning_content)
print("content for Round 2:", content)
if __name__ == "__main__":
main()
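Put together, the reasoning example now runs both rounds inside `main()`; the sketch below contains only what the hunks already show.

```python
from openai import OpenAI

openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"


def main():
    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    models = client.models.list()
    model = models.data[0].id

    # Round 1
    messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
    # For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
    response = client.chat.completions.create(model=model, messages=messages)

    reasoning_content = response.choices[0].message.reasoning_content
    content = response.choices[0].message.content

    print("reasoning_content for Round 1:", reasoning_content)
    print("content for Round 1:", content)

    # Round 2
    messages.append({"role": "assistant", "content": content})
    messages.append({
        "role": "user",
        "content": "How many Rs are there in the word 'strawberry'?",
    })
    response = client.chat.completions.create(model=model, messages=messages)

    reasoning_content = response.choices[0].message.reasoning_content
    content = response.choices[0].message.content

    print("reasoning_content for Round 2:", reasoning_content)
    print("content for Round 2:", content)


if __name__ == "__main__":
    main()
```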

View File

@@ -3,7 +3,7 @@
An example shows how to generate chat completions from reasoning models
like DeepSeekR1.
To run this example, you need to start the vLLM server with the reasoning
parser:
```bash
@@ -29,41 +29,49 @@ from openai import OpenAI
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
stream = client.chat.completions.create(model=model,
messages=messages,
stream=True)
print("client: Start streaming chat completions...")
printed_reasoning_content = False
printed_content = False
for chunk in stream:
reasoning_content = None
content = None
# Check the content is reasoning_content or content
if hasattr(chunk.choices[0].delta, "reasoning_content"):
reasoning_content = chunk.choices[0].delta.reasoning_content
elif hasattr(chunk.choices[0].delta, "content"):
content = chunk.choices[0].delta.content
def main():
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
if reasoning_content is not None:
if not printed_reasoning_content:
printed_reasoning_content = True
print("reasoning_content:", end="", flush=True)
print(reasoning_content, end="", flush=True)
elif content is not None:
if not printed_content:
printed_content = True
print("\ncontent:", end="", flush=True)
# Extract and print the content
print(content, end="", flush=True)
models = client.models.list()
model = models.data[0].id
# ruff: noqa: E501
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
stream = client.chat.completions.create(model=model,
messages=messages,
stream=True)
print("client: Start streaming chat completions...")
printed_reasoning_content = False
printed_content = False
for chunk in stream:
reasoning_content = None
content = None
# Check the content is reasoning_content or content
if hasattr(chunk.choices[0].delta, "reasoning_content"):
reasoning_content = chunk.choices[0].delta.reasoning_content
elif hasattr(chunk.choices[0].delta, "content"):
content = chunk.choices[0].delta.content
if reasoning_content is not None:
if not printed_reasoning_content:
printed_reasoning_content = True
print("reasoning_content:", end="", flush=True)
print(reasoning_content, end="", flush=True)
elif content is not None:
if not printed_content:
printed_content = True
print("\ncontent:", end="", flush=True)
# Extract and print the content
print(content, end="", flush=True)
if __name__ == "__main__":
main()
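The streaming variant gets the same wrapper, with the module-level `messages` left where it was; roughly:

```python
from openai import OpenAI

openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]


def main():
    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    models = client.models.list()
    model = models.data[0].id

    # For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
    stream = client.chat.completions.create(model=model,
                                            messages=messages,
                                            stream=True)

    print("client: Start streaming chat completions...")
    printed_reasoning_content = False
    printed_content = False

    for chunk in stream:
        reasoning_content = None
        content = None
        # Check whether the delta carries reasoning_content or content
        if hasattr(chunk.choices[0].delta, "reasoning_content"):
            reasoning_content = chunk.choices[0].delta.reasoning_content
        elif hasattr(chunk.choices[0].delta, "content"):
            content = chunk.choices[0].delta.content

        if reasoning_content is not None:
            if not printed_reasoning_content:
                printed_reasoning_content = True
                print("reasoning_content:", end="", flush=True)
            print(reasoning_content, end="", flush=True)
        elif content is not None:
            if not printed_content:
                printed_content = True
                print("\ncontent:", end="", flush=True)
            # Extract and print the content
            print(content, end="", flush=True)


if __name__ == "__main__":
    main()
```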

View File

@@ -98,7 +98,7 @@ def dse_qwen2_vl(inp: dict):
print("Embedding output:", response_json["data"][0]["embedding"])
if __name__ == '__main__':
def parse_args():
parser = argparse.ArgumentParser(
"Script to call a specified VLM through the API. Make sure to serve "
"the model with --task embed before running this.")
@@ -107,8 +107,10 @@ if __name__ == '__main__':
choices=["vlm2vec", "dse_qwen2_vl"],
required=True,
help="Which model to call.")
args = parser.parse_args()
return parser.parse_args()
def main(args):
if args.model == "vlm2vec":
vlm2vec()
elif args.model == "dse_qwen2_vl":
@@ -120,3 +122,8 @@ if __name__ == '__main__':
"type": "text",
"content": "What is the weather like today?",
})
if __name__ == '__main__':
args = parse_args()
main(args)
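Here is a sketch of the refactored CLI wiring for the multimodal embedding script. The request-building bodies of `vlm2vec()` and `dse_qwen2_vl()` are stubbed out because the hunks elide them, and the `--model` flag name is inferred from `args.model`.

```python
import argparse


def vlm2vec():
    # Placeholder: the real function posts an image+text request to the
    # VLM2Vec embedding endpoint and prints the returned embedding.
    ...


def dse_qwen2_vl(inp: dict):
    # Placeholder: the real function builds a DSE-Qwen2-VL query or document
    # request from `inp` and prints the embedding.
    ...


def parse_args():
    parser = argparse.ArgumentParser(
        "Script to call a specified VLM through the API. Make sure to serve "
        "the model with --task embed before running this.")
    parser.add_argument("--model",  # flag name inferred from args.model
                        type=str,
                        choices=["vlm2vec", "dse_qwen2_vl"],
                        required=True,
                        help="Which model to call.")
    return parser.parse_args()


def main(args):
    if args.model == "vlm2vec":
        vlm2vec()
    elif args.model == "dse_qwen2_vl":
        dse_qwen2_vl({
            "type": "text",
            "content": "What is the weather like today?",
        })


if __name__ == '__main__':
    args = parse_args()
    main(args)
```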

View File

@@ -6,28 +6,36 @@ from openai import OpenAI
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
def main():
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
# Completion API
stream = False
completion = client.completions.create(
model=model,
prompt="A robot may not injure a human being",
echo=False,
n=2,
stream=stream,
logprobs=3)
models = client.models.list()
model = models.data[0].id
print("Completion results:")
if stream:
for c in completion:
print(c)
else:
print(completion)
# Completion API
stream = False
completion = client.completions.create(
model=model,
prompt="A robot may not injure a human being",
echo=False,
n=2,
stream=stream,
logprobs=3)
print("-" * 50)
print("Completion results:")
if stream:
for c in completion:
print(c)
else:
print(completion)
print("-" * 50)
if __name__ == "__main__":
main()
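The Completions API example ends up as follows, assembled straight from the hunks above:

```python
from openai import OpenAI

openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"


def main():
    client = OpenAI(
        # defaults to os.environ.get("OPENAI_API_KEY")
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    models = client.models.list()
    model = models.data[0].id

    # Completion API
    stream = False
    completion = client.completions.create(
        model=model,
        prompt="A robot may not injure a human being",
        echo=False,
        n=2,
        stream=stream,
        logprobs=3)

    print("-" * 50)
    print("Completion results:")
    if stream:
        for c in completion:
            print(c)
    else:
        print(completion)
    print("-" * 50)


if __name__ == "__main__":
    main()
```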

View File

@@ -16,13 +16,15 @@ def post_http_request(prompt: dict, api_url: str) -> requests.Response:
return response
if __name__ == "__main__":
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--host", type=str, default="localhost")
parser.add_argument("--port", type=int, default=8000)
parser.add_argument("--model", type=str, default="BAAI/bge-reranker-v2-m3")
return parser.parse_args()
args = parser.parse_args()
def main(args):
api_url = f"http://{args.host}:{args.port}/score"
model_name = args.model
@@ -30,9 +32,9 @@ if __name__ == "__main__":
text_2 = "The capital of Brazil is Brasilia."
prompt = {"model": model_name, "text_1": text_1, "text_2": text_2}
score_response = post_http_request(prompt=prompt, api_url=api_url)
print("Prompt when text_1 and text_2 are both strings:")
print("\nPrompt when text_1 and text_2 are both strings:")
pprint.pprint(prompt)
print("Score Response:")
print("\nScore Response:")
pprint.pprint(score_response.json())
text_1 = "What is the capital of France?"
@@ -41,9 +43,9 @@ if __name__ == "__main__":
]
prompt = {"model": model_name, "text_1": text_1, "text_2": text_2}
score_response = post_http_request(prompt=prompt, api_url=api_url)
print("Prompt when text_1 is string and text_2 is a list:")
print("\nPrompt when text_1 is string and text_2 is a list:")
pprint.pprint(prompt)
print("Score Response:")
print("\nScore Response:")
pprint.pprint(score_response.json())
text_1 = [
@@ -54,7 +56,12 @@ if __name__ == "__main__":
]
prompt = {"model": model_name, "text_1": text_1, "text_2": text_2}
score_response = post_http_request(prompt=prompt, api_url=api_url)
print("Prompt when text_1 and text_2 are both lists:")
print("\nPrompt when text_1 and text_2 are both lists:")
pprint.pprint(prompt)
print("Score Response:")
print("\nScore Response:")
pprint.pprint(score_response.json())
if __name__ == "__main__":
args = parse_args()
main(args)
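The score-API script gets the same split. A runnable sketch of the new layout, keeping only the first request for brevity; the `post_http_request` body and the first `text_1` value are assumptions, since the hunks elide them.

```python
import argparse
import pprint

import requests


def post_http_request(prompt: dict, api_url: str) -> requests.Response:
    # Assumed body: a plain JSON POST; only the return line appears in the hunk.
    response = requests.post(api_url, json=prompt)
    return response


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=8000)
    parser.add_argument("--model", type=str, default="BAAI/bge-reranker-v2-m3")
    return parser.parse_args()


def main(args):
    api_url = f"http://{args.host}:{args.port}/score"
    model_name = args.model

    text_1 = "What is the capital of France?"  # assumed; taken from the later hunk
    text_2 = "The capital of Brazil is Brasilia."
    prompt = {"model": model_name, "text_1": text_1, "text_2": text_2}
    score_response = post_http_request(prompt=prompt, api_url=api_url)
    print("\nPrompt when text_1 and text_2 are both strings:")
    pprint.pprint(prompt)
    print("\nScore Response:")
    pprint.pprint(score_response.json())


if __name__ == "__main__":
    args = parse_args()
    main(args)
```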

View File

@@ -6,22 +6,29 @@ from openai import OpenAI
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
def main():
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
responses = client.embeddings.create(
input=[
"Hello my name is",
"The best thing about vLLM is that it supports many different models"
],
model=model,
)
models = client.models.list()
model = models.data[0].id
for data in responses.data:
print(data.embedding) # List of float of len 4096
responses = client.embeddings.create(
# ruff: noqa: E501
input=[
"Hello my name is",
"The best thing about vLLM is that it supports many different models"
],
model=model,
)
for data in responses.data:
print(data.embedding) # List of float of len 4096
if __name__ == "__main__":
main()
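The embeddings example, consolidated after the refactor (directly from the hunks above):

```python
from openai import OpenAI

openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"


def main():
    client = OpenAI(
        # defaults to os.environ.get("OPENAI_API_KEY")
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    models = client.models.list()
    model = models.data[0].id

    responses = client.embeddings.create(
        input=[
            "Hello my name is",
            "The best thing about vLLM is that it supports many different models"
        ],
        model=model,
    )

    for data in responses.data:
        print(data.embedding)  # List of float of len 4096


if __name__ == "__main__":
    main()
```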

View File

@@ -17,7 +17,7 @@ def post_http_request(prompt: dict, api_url: str) -> requests.Response:
return response
if __name__ == "__main__":
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--host", type=str, default="localhost")
parser.add_argument("--port", type=int, default=8000)
@@ -25,15 +25,20 @@ if __name__ == "__main__":
type=str,
default="jason9693/Qwen2.5-1.5B-apeach")
args = parser.parse_args()
return parser.parse_args()
def main(args):
api_url = f"http://{args.host}:{args.port}/pooling"
model_name = args.model
# Input like Completions API
prompt = {"model": model_name, "input": "vLLM is great!"}
pooling_response = post_http_request(prompt=prompt, api_url=api_url)
print("-" * 50)
print("Pooling Response:")
pprint.pprint(pooling_response.json())
print("-" * 50)
# Input like Chat API
prompt = {
@@ -50,3 +55,9 @@ if __name__ == "__main__":
pooling_response = post_http_request(prompt=prompt, api_url=api_url)
print("Pooling Response:")
pprint.pprint(pooling_response.json())
print("-" * 50)
if __name__ == "__main__":
args = parse_args()
main(args)
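Finally, the pooling client follows suit. A sketch of the refactored file; the `post_http_request` body and the Chat-style payload are assumptions where the hunks cut off.

```python
import argparse
import pprint

import requests


def post_http_request(prompt: dict, api_url: str) -> requests.Response:
    # Assumed body: a plain JSON POST; only the return line appears in the hunk.
    response = requests.post(api_url, json=prompt)
    return response


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=8000)
    parser.add_argument("--model",
                        type=str,
                        default="jason9693/Qwen2.5-1.5B-apeach")
    return parser.parse_args()


def main(args):
    api_url = f"http://{args.host}:{args.port}/pooling"
    model_name = args.model

    # Input like Completions API
    prompt = {"model": model_name, "input": "vLLM is great!"}
    pooling_response = post_http_request(prompt=prompt, api_url=api_url)
    print("-" * 50)
    print("Pooling Response:")
    pprint.pprint(pooling_response.json())
    print("-" * 50)

    # Input like Chat API (placeholder messages; the hunk elides the payload)
    prompt = {
        "model": model_name,
        "messages": [{
            "role": "user",
            "content": "vLLM is great!",
        }],
    }
    pooling_response = post_http_request(prompt=prompt, api_url=api_url)
    print("Pooling Response:")
    pprint.pprint(pooling_response.json())
    print("-" * 50)


if __name__ == "__main__":
    args = parse_args()
    main(args)
```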