From 716892049190e55b432a0cb84d0250f0e68c310a Mon Sep 17 00:00:00 2001 From: Reid <61492567+reidliu41@users.noreply.github.com> Date: Wed, 16 Apr 2025 18:16:36 +0800 Subject: [PATCH] [Misc] refactor examples series (#16708) Signed-off-by: reidliu41 Co-authored-by: reidliu41 --- .../offline_inference/llm_engine_example.py | 12 ++- .../gradio_openai_chatbot_webserver.py | 4 - ...i_chat_completion_client_for_multimodal.py | 18 +++-- ...t_completion_client_with_tools_required.py | 58 +++++++------- .../openai_chat_completion_with_reasoning.py | 63 ++++++++------- ...hat_completion_with_reasoning_streaming.py | 78 ++++++++++--------- ...ai_chat_embedding_client_for_multimodal.py | 11 ++- .../openai_completion_client.py | 52 +++++++------ .../openai_cross_encoder_score.py | 23 ++++-- .../online_serving/openai_embedding_client.py | 39 ++++++---- .../online_serving/openai_pooling_client.py | 15 +++- 11 files changed, 220 insertions(+), 153 deletions(-) diff --git a/examples/offline_inference/llm_engine_example.py b/examples/offline_inference/llm_engine_example.py index abff90d1..d84cd9ee 100644 --- a/examples/offline_inference/llm_engine_example.py +++ b/examples/offline_inference/llm_engine_example.py @@ -50,6 +50,13 @@ def initialize_engine(args: argparse.Namespace) -> LLMEngine: return LLMEngine.from_engine_args(engine_args) +def parse_args(): + parser = FlexibleArgumentParser( + description='Demo on using the LLMEngine class directly') + parser = EngineArgs.add_cli_args(parser) + return parser.parse_args() + + def main(args: argparse.Namespace): """Main function that sets up and runs the prompt processing.""" engine = initialize_engine(args) @@ -58,8 +65,5 @@ def main(args: argparse.Namespace): if __name__ == '__main__': - parser = FlexibleArgumentParser( - description='Demo on using the LLMEngine class directly') - parser = EngineArgs.add_cli_args(parser) - args = parser.parse_args() + args = parse_args() main(args) diff --git 
a/examples/online_serving/gradio_openai_chatbot_webserver.py b/examples/online_serving/gradio_openai_chatbot_webserver.py index 13331609..314f1c5b 100644 --- a/examples/online_serving/gradio_openai_chatbot_webserver.py +++ b/examples/online_serving/gradio_openai_chatbot_webserver.py @@ -23,10 +23,6 @@ import gradio as gr from openai import OpenAI -def create_openai_client(api_key, base_url): - return OpenAI(api_key=api_key, base_url=base_url) - - def format_history_to_openai(history): history_openai_format = [{ "role": "system", diff --git a/examples/online_serving/openai_chat_completion_client_for_multimodal.py b/examples/online_serving/openai_chat_completion_client_for_multimodal.py index ecfcf05a..18006e2c 100644 --- a/examples/online_serving/openai_chat_completion_client_for_multimodal.py +++ b/examples/online_serving/openai_chat_completion_client_for_multimodal.py @@ -303,12 +303,7 @@ example_function_map = { } -def main(args) -> None: - chat_type = args.chat_type - example_function_map[chat_type]() - - -if __name__ == "__main__": +def parse_args(): parser = FlexibleArgumentParser( description='Demo on using OpenAI client for online serving with ' 'multimodal language models served with vLLM.') @@ -318,5 +313,14 @@ if __name__ == "__main__": default="single-image", choices=list(example_function_map.keys()), help='Conversation type with multimodal data.') - args = parser.parse_args() + return parser.parse_args() + + +def main(args) -> None: + chat_type = args.chat_type + example_function_map[chat_type]() + + +if __name__ == "__main__": + args = parse_args() main(args) diff --git a/examples/online_serving/openai_chat_completion_client_with_tools_required.py b/examples/online_serving/openai_chat_completion_client_with_tools_required.py index 779369d1..97d900bb 100644 --- a/examples/online_serving/openai_chat_completion_client_with_tools_required.py +++ b/examples/online_serving/openai_chat_completion_client_with_tools_required.py @@ -1,6 +1,6 @@ # 
SPDX-License-Identifier: Apache-2.0 """ -To run this example, you can start the vLLM server +To run this example, you can start the vLLM server without any specific flags: ```bash @@ -8,7 +8,7 @@ VLLM_USE_V1=0 vllm serve unsloth/Llama-3.2-1B-Instruct \ --guided-decoding-backend outlines ``` -This example demonstrates how to generate chat completions +This example demonstrates how to generate chat completions using the OpenAI Python client library. """ @@ -18,15 +18,6 @@ from openai import OpenAI openai_api_key = "EMPTY" openai_api_base = "http://localhost:8000/v1" -client = OpenAI( - # defaults to os.environ.get("OPENAI_API_KEY") - api_key=openai_api_key, - base_url=openai_api_base, -) - -models = client.models.list() -model = models.data[0].id - tools = [ { "type": "function", @@ -116,21 +107,36 @@ messages = [ }, ] -chat_completion = client.chat.completions.create( - messages=messages, - model=model, - tools=tools, - tool_choice="required", - stream=True # Enable streaming response -) -for chunk in chat_completion: - if chunk.choices and chunk.choices[0].delta.tool_calls: - print(chunk.choices[0].delta.tool_calls) +def main(): + client = OpenAI( + # defaults to os.environ.get("OPENAI_API_KEY") + api_key=openai_api_key, + base_url=openai_api_base, + ) -chat_completion = client.chat.completions.create(messages=messages, - model=model, - tools=tools, - tool_choice="required") + models = client.models.list() + model = models.data[0].id -print(chat_completion.choices[0].message.tool_calls) + chat_completion = client.chat.completions.create( + messages=messages, + model=model, + tools=tools, + tool_choice="required", + stream=True # Enable streaming response + ) + + for chunk in chat_completion: + if chunk.choices and chunk.choices[0].delta.tool_calls: + print(chunk.choices[0].delta.tool_calls) + + chat_completion = client.chat.completions.create(messages=messages, + model=model, + tools=tools, + tool_choice="required") + + 
print(chat_completion.choices[0].message.tool_calls) + + +if __name__ == "__main__": + main() diff --git a/examples/online_serving/openai_chat_completion_with_reasoning.py b/examples/online_serving/openai_chat_completion_with_reasoning.py index e753cedc..6f5f7b5f 100644 --- a/examples/online_serving/openai_chat_completion_with_reasoning.py +++ b/examples/online_serving/openai_chat_completion_with_reasoning.py @@ -3,8 +3,8 @@ An example shows how to generate chat completions from reasoning models like DeepSeekR1. -To run this example, you need to start the vLLM server with the reasoning -parser: +To run this example, you need to start the vLLM server +with the reasoning parser: ```bash vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \ @@ -21,35 +21,44 @@ from openai import OpenAI openai_api_key = "EMPTY" openai_api_base = "http://localhost:8000/v1" -client = OpenAI( - api_key=openai_api_key, - base_url=openai_api_base, -) -models = client.models.list() -model = models.data[0].id +def main(): + client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, + ) -# Round 1 -messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}] -# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}` -response = client.chat.completions.create(model=model, messages=messages) + models = client.models.list() + model = models.data[0].id -reasoning_content = response.choices[0].message.reasoning_content -content = response.choices[0].message.content + # Round 1 + messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}] + # ruff: noqa: E501 + # For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}` + response = client.chat.completions.create(model=model, messages=messages) -print("reasoning_content for Round 1:", reasoning_content) -print("content for Round 1:", content) + reasoning_content = response.choices[0].message.reasoning_content + content = response.choices[0].message.content -# Round 2 
-messages.append({"role": "assistant", "content": content}) -messages.append({ - "role": "user", - "content": "How many Rs are there in the word 'strawberry'?", -}) -response = client.chat.completions.create(model=model, messages=messages) + print("reasoning_content for Round 1:", reasoning_content) + print("content for Round 1:", content) -reasoning_content = response.choices[0].message.reasoning_content -content = response.choices[0].message.content + # Round 2 + messages.append({"role": "assistant", "content": content}) + messages.append({ + "role": + "user", + "content": + "How many Rs are there in the word 'strawberry'?", + }) + response = client.chat.completions.create(model=model, messages=messages) -print("reasoning_content for Round 2:", reasoning_content) -print("content for Round 2:", content) + reasoning_content = response.choices[0].message.reasoning_content + content = response.choices[0].message.content + + print("reasoning_content for Round 2:", reasoning_content) + print("content for Round 2:", content) + + +if __name__ == "__main__": + main() diff --git a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py index cb13b0c6..90481cdc 100644 --- a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py +++ b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py @@ -3,7 +3,7 @@ An example shows how to generate chat completions from reasoning models like DeepSeekR1. 
-To run this example, you need to start the vLLM server with the reasoning +To run this example, you need to start the vLLM server with the reasoning parser: ```bash @@ -29,41 +29,49 @@ from openai import OpenAI openai_api_key = "EMPTY" openai_api_base = "http://localhost:8000/v1" -client = OpenAI( - api_key=openai_api_key, - base_url=openai_api_base, -) - -models = client.models.list() -model = models.data[0].id - messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}] -# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}` -stream = client.chat.completions.create(model=model, - messages=messages, - stream=True) -print("client: Start streaming chat completions...") -printed_reasoning_content = False -printed_content = False -for chunk in stream: - reasoning_content = None - content = None - # Check the content is reasoning_content or content - if hasattr(chunk.choices[0].delta, "reasoning_content"): - reasoning_content = chunk.choices[0].delta.reasoning_content - elif hasattr(chunk.choices[0].delta, "content"): - content = chunk.choices[0].delta.content +def main(): + client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, + ) - if reasoning_content is not None: - if not printed_reasoning_content: - printed_reasoning_content = True - print("reasoning_content:", end="", flush=True) - print(reasoning_content, end="", flush=True) - elif content is not None: - if not printed_content: - printed_content = True - print("\ncontent:", end="", flush=True) - # Extract and print the content - print(content, end="", flush=True) + models = client.models.list() + model = models.data[0].id + + # ruff: noqa: E501 + # For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}` + stream = client.chat.completions.create(model=model, + messages=messages, + stream=True) + + print("client: Start streaming chat completions...") + printed_reasoning_content = False + printed_content = False + + for chunk in stream: + 
reasoning_content = None + content = None + # Check the content is reasoning_content or content + if hasattr(chunk.choices[0].delta, "reasoning_content"): + reasoning_content = chunk.choices[0].delta.reasoning_content + elif hasattr(chunk.choices[0].delta, "content"): + content = chunk.choices[0].delta.content + + if reasoning_content is not None: + if not printed_reasoning_content: + printed_reasoning_content = True + print("reasoning_content:", end="", flush=True) + print(reasoning_content, end="", flush=True) + elif content is not None: + if not printed_content: + printed_content = True + print("\ncontent:", end="", flush=True) + # Extract and print the content + print(content, end="", flush=True) + + +if __name__ == "__main__": + main() diff --git a/examples/online_serving/openai_chat_embedding_client_for_multimodal.py b/examples/online_serving/openai_chat_embedding_client_for_multimodal.py index 2c63c5ec..c850b5aa 100644 --- a/examples/online_serving/openai_chat_embedding_client_for_multimodal.py +++ b/examples/online_serving/openai_chat_embedding_client_for_multimodal.py @@ -98,7 +98,7 @@ def dse_qwen2_vl(inp: dict): print("Embedding output:", response_json["data"][0]["embedding"]) -if __name__ == '__main__': +def parse_args(): parser = argparse.ArgumentParser( "Script to call a specified VLM through the API. 
Make sure to serve " "the model with --task embed before running this.") @@ -107,8 +107,10 @@ if __name__ == '__main__': choices=["vlm2vec", "dse_qwen2_vl"], required=True, help="Which model to call.") - args = parser.parse_args() + return parser.parse_args() + +def main(args): if args.model == "vlm2vec": vlm2vec() elif args.model == "dse_qwen2_vl": @@ -120,3 +122,8 @@ if __name__ == '__main__': "type": "text", "content": "What is the weather like today?", }) + + +if __name__ == '__main__': + args = parse_args() + main(args) diff --git a/examples/online_serving/openai_completion_client.py b/examples/online_serving/openai_completion_client.py index 06b93d7d..6ab7619b 100644 --- a/examples/online_serving/openai_completion_client.py +++ b/examples/online_serving/openai_completion_client.py @@ -6,28 +6,36 @@ from openai import OpenAI openai_api_key = "EMPTY" openai_api_base = "http://localhost:8000/v1" -client = OpenAI( - # defaults to os.environ.get("OPENAI_API_KEY") - api_key=openai_api_key, - base_url=openai_api_base, -) -models = client.models.list() -model = models.data[0].id +def main(): + client = OpenAI( + # defaults to os.environ.get("OPENAI_API_KEY") + api_key=openai_api_key, + base_url=openai_api_base, + ) -# Completion API -stream = False -completion = client.completions.create( - model=model, - prompt="A robot may not injure a human being", - echo=False, - n=2, - stream=stream, - logprobs=3) + models = client.models.list() + model = models.data[0].id -print("Completion results:") -if stream: - for c in completion: - print(c) -else: - print(completion) + # Completion API + stream = False + completion = client.completions.create( + model=model, + prompt="A robot may not injure a human being", + echo=False, + n=2, + stream=stream, + logprobs=3) + + print("-" * 50) + print("Completion results:") + if stream: + for c in completion: + print(c) + else: + print(completion) + print("-" * 50) + + +if __name__ == "__main__": + main() diff --git 
a/examples/online_serving/openai_cross_encoder_score.py b/examples/online_serving/openai_cross_encoder_score.py index 67c5fc91..20a64ddb 100644 --- a/examples/online_serving/openai_cross_encoder_score.py +++ b/examples/online_serving/openai_cross_encoder_score.py @@ -16,13 +16,15 @@ def post_http_request(prompt: dict, api_url: str) -> requests.Response: return response -if __name__ == "__main__": +def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("--host", type=str, default="localhost") parser.add_argument("--port", type=int, default=8000) parser.add_argument("--model", type=str, default="BAAI/bge-reranker-v2-m3") + return parser.parse_args() - args = parser.parse_args() + +def main(args): api_url = f"http://{args.host}:{args.port}/score" model_name = args.model @@ -30,9 +32,9 @@ if __name__ == "__main__": text_2 = "The capital of Brazil is Brasilia." prompt = {"model": model_name, "text_1": text_1, "text_2": text_2} score_response = post_http_request(prompt=prompt, api_url=api_url) - print("Prompt when text_1 and text_2 are both strings:") + print("\nPrompt when text_1 and text_2 are both strings:") pprint.pprint(prompt) - print("Score Response:") + print("\nScore Response:") pprint.pprint(score_response.json()) text_1 = "What is the capital of France?" 
@@ -41,9 +43,9 @@ if __name__ == "__main__": ] prompt = {"model": model_name, "text_1": text_1, "text_2": text_2} score_response = post_http_request(prompt=prompt, api_url=api_url) - print("Prompt when text_1 is string and text_2 is a list:") + print("\nPrompt when text_1 is string and text_2 is a list:") pprint.pprint(prompt) - print("Score Response:") + print("\nScore Response:") pprint.pprint(score_response.json()) text_1 = [ @@ -54,7 +56,12 @@ if __name__ == "__main__": ] prompt = {"model": model_name, "text_1": text_1, "text_2": text_2} score_response = post_http_request(prompt=prompt, api_url=api_url) - print("Prompt when text_1 and text_2 are both lists:") + print("\nPrompt when text_1 and text_2 are both lists:") pprint.pprint(prompt) - print("Score Response:") + print("\nScore Response:") pprint.pprint(score_response.json()) + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/examples/online_serving/openai_embedding_client.py b/examples/online_serving/openai_embedding_client.py index b7c5651e..bc217f7c 100644 --- a/examples/online_serving/openai_embedding_client.py +++ b/examples/online_serving/openai_embedding_client.py @@ -6,22 +6,29 @@ from openai import OpenAI openai_api_key = "EMPTY" openai_api_base = "http://localhost:8000/v1" -client = OpenAI( - # defaults to os.environ.get("OPENAI_API_KEY") - api_key=openai_api_key, - base_url=openai_api_base, -) -models = client.models.list() -model = models.data[0].id +def main(): + client = OpenAI( + # defaults to os.environ.get("OPENAI_API_KEY") + api_key=openai_api_key, + base_url=openai_api_base, + ) -responses = client.embeddings.create( - input=[ - "Hello my name is", - "The best thing about vLLM is that it supports many different models" - ], - model=model, -) + models = client.models.list() + model = models.data[0].id -for data in responses.data: - print(data.embedding) # List of float of len 4096 + responses = client.embeddings.create( + # ruff: noqa: E501 + input=[ + "Hello 
my name is", + "The best thing about vLLM is that it supports many different models" + ], + model=model, + ) + + for data in responses.data: + print(data.embedding) # List of float of len 4096 + + +if __name__ == "__main__": + main() diff --git a/examples/online_serving/openai_pooling_client.py b/examples/online_serving/openai_pooling_client.py index e17f9c5e..abcfe27c 100644 --- a/examples/online_serving/openai_pooling_client.py +++ b/examples/online_serving/openai_pooling_client.py @@ -17,7 +17,7 @@ def post_http_request(prompt: dict, api_url: str) -> requests.Response: return response -if __name__ == "__main__": +def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("--host", type=str, default="localhost") parser.add_argument("--port", type=int, default=8000) @@ -25,15 +25,20 @@ if __name__ == "__main__": type=str, default="jason9693/Qwen2.5-1.5B-apeach") - args = parser.parse_args() + return parser.parse_args() + + +def main(args): api_url = f"http://{args.host}:{args.port}/pooling" model_name = args.model # Input like Completions API prompt = {"model": model_name, "input": "vLLM is great!"} pooling_response = post_http_request(prompt=prompt, api_url=api_url) + print("-" * 50) print("Pooling Response:") pprint.pprint(pooling_response.json()) + print("-" * 50) # Input like Chat API prompt = { @@ -50,3 +55,9 @@ if __name__ == "__main__": pooling_response = post_http_request(prompt=prompt, api_url=api_url) print("Pooling Response:") pprint.pprint(pooling_response.json()) + print("-" * 50) + + +if __name__ == "__main__": + args = parse_args() + main(args)