From 716892049190e55b432a0cb84d0250f0e68c310a Mon Sep 17 00:00:00 2001
From: Reid <61492567+reidliu41@users.noreply.github.com>
Date: Wed, 16 Apr 2025 18:16:36 +0800
Subject: [PATCH] [Misc] refactor examples series (#16708)

Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
---
 .../offline_inference/llm_engine_example.py   | 12 ++-
 .../gradio_openai_chatbot_webserver.py        |  4 -
 ...i_chat_completion_client_for_multimodal.py | 18 +++--
 ...t_completion_client_with_tools_required.py | 58 +++++++-------
 .../openai_chat_completion_with_reasoning.py  | 63 ++++++++-------
 ...hat_completion_with_reasoning_streaming.py | 78 ++++++++++---------
 ...ai_chat_embedding_client_for_multimodal.py | 11 ++-
 .../openai_completion_client.py               | 52 +++++++------
 .../openai_cross_encoder_score.py             | 23 ++++--
 .../online_serving/openai_embedding_client.py | 39 ++++++----
 .../online_serving/openai_pooling_client.py   | 15 +++-
 11 files changed, 220 insertions(+), 153 deletions(-)

diff --git a/examples/offline_inference/llm_engine_example.py b/examples/offline_inference/llm_engine_example.py
index abff90d1..d84cd9ee 100644
--- a/examples/offline_inference/llm_engine_example.py
+++ b/examples/offline_inference/llm_engine_example.py
@@ -50,6 +50,13 @@ def initialize_engine(args: argparse.Namespace) -> LLMEngine:
     return LLMEngine.from_engine_args(engine_args)
 
 
+def parse_args():
+    parser = FlexibleArgumentParser(
+        description='Demo on using the LLMEngine class directly')
+    parser = EngineArgs.add_cli_args(parser)
+    return parser.parse_args()
+
+
 def main(args: argparse.Namespace):
     """Main function that sets up and runs the prompt processing."""
     engine = initialize_engine(args)
@@ -58,8 +65,5 @@ def main(args: argparse.Namespace):
 
 
 if __name__ == '__main__':
-    parser = FlexibleArgumentParser(
-        description='Demo on using the LLMEngine class directly')
-    parser = EngineArgs.add_cli_args(parser)
-    args = parser.parse_args()
+    args = parse_args()
     main(args)
diff --git a/examples/online_serving/gradio_openai_chatbot_webserver.py b/examples/online_serving/gradio_openai_chatbot_webserver.py
index 13331609..314f1c5b 100644
--- a/examples/online_serving/gradio_openai_chatbot_webserver.py
+++ b/examples/online_serving/gradio_openai_chatbot_webserver.py
@@ -23,10 +23,6 @@ import gradio as gr
 from openai import OpenAI
 
 
-def create_openai_client(api_key, base_url):
-    return OpenAI(api_key=api_key, base_url=base_url)
-
-
 def format_history_to_openai(history):
     history_openai_format = [{
         "role": "system",
diff --git a/examples/online_serving/openai_chat_completion_client_for_multimodal.py b/examples/online_serving/openai_chat_completion_client_for_multimodal.py
index ecfcf05a..18006e2c 100644
--- a/examples/online_serving/openai_chat_completion_client_for_multimodal.py
+++ b/examples/online_serving/openai_chat_completion_client_for_multimodal.py
@@ -303,12 +303,7 @@ example_function_map = {
 }
 
 
-def main(args) -> None:
-    chat_type = args.chat_type
-    example_function_map[chat_type]()
-
-
-if __name__ == "__main__":
+def parse_args():
     parser = FlexibleArgumentParser(
         description='Demo on using OpenAI client for online serving with '
         'multimodal language models served with vLLM.')
@@ -318,5 +313,14 @@ if __name__ == "__main__":
                         default="single-image",
                         choices=list(example_function_map.keys()),
                         help='Conversation type with multimodal data.')
-    args = parser.parse_args()
+    return parser.parse_args()
+
+
+def main(args) -> None:
+    chat_type = args.chat_type
+    example_function_map[chat_type]()
+
+
+if __name__ == "__main__":
+    args = parse_args()
     main(args)
diff --git a/examples/online_serving/openai_chat_completion_client_with_tools_required.py b/examples/online_serving/openai_chat_completion_client_with_tools_required.py
index 779369d1..97d900bb 100644
--- a/examples/online_serving/openai_chat_completion_client_with_tools_required.py
+++ b/examples/online_serving/openai_chat_completion_client_with_tools_required.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 """
-To run this example, you can start the vLLM server 
+To run this example, you can start the vLLM server
 without any specific flags:
 
 ```bash
@@ -8,7 +8,7 @@ VLLM_USE_V1=0 vllm serve unsloth/Llama-3.2-1B-Instruct \
     --guided-decoding-backend outlines
 ```
 
-This example demonstrates how to generate chat completions 
+This example demonstrates how to generate chat completions
 using the OpenAI Python client library.
 """
 
@@ -18,15 +18,6 @@ from openai import OpenAI
 openai_api_key = "EMPTY"
 openai_api_base = "http://localhost:8000/v1"
 
-client = OpenAI(
-    # defaults to os.environ.get("OPENAI_API_KEY")
-    api_key=openai_api_key,
-    base_url=openai_api_base,
-)
-
-models = client.models.list()
-model = models.data[0].id
-
 tools = [
     {
         "type": "function",
@@ -116,21 +107,36 @@ messages = [
     },
 ]
 
-chat_completion = client.chat.completions.create(
-    messages=messages,
-    model=model,
-    tools=tools,
-    tool_choice="required",
-    stream=True  # Enable streaming response
-)
 
-for chunk in chat_completion:
-    if chunk.choices and chunk.choices[0].delta.tool_calls:
-        print(chunk.choices[0].delta.tool_calls)
+def main():
+    client = OpenAI(
+        # defaults to os.environ.get("OPENAI_API_KEY")
+        api_key=openai_api_key,
+        base_url=openai_api_base,
+    )
 
-chat_completion = client.chat.completions.create(messages=messages,
-                                                 model=model,
-                                                 tools=tools,
-                                                 tool_choice="required")
+    models = client.models.list()
+    model = models.data[0].id
 
-print(chat_completion.choices[0].message.tool_calls)
+    chat_completion = client.chat.completions.create(
+        messages=messages,
+        model=model,
+        tools=tools,
+        tool_choice="required",
+        stream=True  # Enable streaming response
+    )
+
+    for chunk in chat_completion:
+        if chunk.choices and chunk.choices[0].delta.tool_calls:
+            print(chunk.choices[0].delta.tool_calls)
+
+    chat_completion = client.chat.completions.create(messages=messages,
+                                                     model=model,
+                                                     tools=tools,
+                                                     tool_choice="required")
+
+    print(chat_completion.choices[0].message.tool_calls)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/online_serving/openai_chat_completion_with_reasoning.py b/examples/online_serving/openai_chat_completion_with_reasoning.py
index e753cedc..6f5f7b5f 100644
--- a/examples/online_serving/openai_chat_completion_with_reasoning.py
+++ b/examples/online_serving/openai_chat_completion_with_reasoning.py
@@ -3,8 +3,8 @@
 An example shows how to generate chat completions from reasoning models
 like DeepSeekR1.
 
-To run this example, you need to start the vLLM server with the reasoning 
-parser:
+To run this example, you need to start the vLLM server
+with the reasoning parser:
 
 ```bash
 vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
@@ -21,35 +21,44 @@ from openai import OpenAI
 openai_api_key = "EMPTY"
 openai_api_base = "http://localhost:8000/v1"
 
-client = OpenAI(
-    api_key=openai_api_key,
-    base_url=openai_api_base,
-)
 
-models = client.models.list()
-model = models.data[0].id
+def main():
+    client = OpenAI(
+        api_key=openai_api_key,
+        base_url=openai_api_base,
+    )
 
-# Round 1
-messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
-# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
-response = client.chat.completions.create(model=model, messages=messages)
+    models = client.models.list()
+    model = models.data[0].id
 
-reasoning_content = response.choices[0].message.reasoning_content
-content = response.choices[0].message.content
+    # Round 1
+    messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
+    # ruff: noqa: E501
+    # For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
+    response = client.chat.completions.create(model=model, messages=messages)
 
-print("reasoning_content for Round 1:", reasoning_content)
-print("content for Round 1:", content)
+    reasoning_content = response.choices[0].message.reasoning_content
+    content = response.choices[0].message.content
 
-# Round 2
-messages.append({"role": "assistant", "content": content})
-messages.append({
-    "role": "user",
-    "content": "How many Rs are there in the word 'strawberry'?",
-})
-response = client.chat.completions.create(model=model, messages=messages)
+    print("reasoning_content for Round 1:", reasoning_content)
+    print("content for Round 1:", content)
 
-reasoning_content = response.choices[0].message.reasoning_content
-content = response.choices[0].message.content
+    # Round 2
+    messages.append({"role": "assistant", "content": content})
+    messages.append({
+        "role":
+        "user",
+        "content":
+        "How many Rs are there in the word 'strawberry'?",
+    })
+    response = client.chat.completions.create(model=model, messages=messages)
 
-print("reasoning_content for Round 2:", reasoning_content)
-print("content for Round 2:", content)
+    reasoning_content = response.choices[0].message.reasoning_content
+    content = response.choices[0].message.content
+
+    print("reasoning_content for Round 2:", reasoning_content)
+    print("content for Round 2:", content)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py
index cb13b0c6..90481cdc 100644
--- a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py
+++ b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py
@@ -3,7 +3,7 @@
 An example shows how to generate chat completions from reasoning models
 like DeepSeekR1.
 
-To run this example, you need to start the vLLM server with the reasoning 
+To run this example, you need to start the vLLM server with the reasoning
 parser:
 
 ```bash
@@ -29,41 +29,49 @@ from openai import OpenAI
 openai_api_key = "EMPTY"
 openai_api_base = "http://localhost:8000/v1"
 
-client = OpenAI(
-    api_key=openai_api_key,
-    base_url=openai_api_base,
-)
-
-models = client.models.list()
-model = models.data[0].id
-
 messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
-# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
-stream = client.chat.completions.create(model=model,
-                                        messages=messages,
-                                        stream=True)
 
-print("client: Start streaming chat completions...")
-printed_reasoning_content = False
-printed_content = False
 
-for chunk in stream:
-    reasoning_content = None
-    content = None
-    # Check the content is reasoning_content or content
-    if hasattr(chunk.choices[0].delta, "reasoning_content"):
-        reasoning_content = chunk.choices[0].delta.reasoning_content
-    elif hasattr(chunk.choices[0].delta, "content"):
-        content = chunk.choices[0].delta.content
+def main():
+    client = OpenAI(
+        api_key=openai_api_key,
+        base_url=openai_api_base,
+    )
 
-    if reasoning_content is not None:
-        if not printed_reasoning_content:
-            printed_reasoning_content = True
-            print("reasoning_content:", end="", flush=True)
-        print(reasoning_content, end="", flush=True)
-    elif content is not None:
-        if not printed_content:
-            printed_content = True
-            print("\ncontent:", end="", flush=True)
-        # Extract and print the content
-        print(content, end="", flush=True)
+    models = client.models.list()
+    model = models.data[0].id
+
+    # ruff: noqa: E501
+    # For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
+    stream = client.chat.completions.create(model=model,
+                                            messages=messages,
+                                            stream=True)
+
+    print("client: Start streaming chat completions...")
+    printed_reasoning_content = False
+    printed_content = False
+
+    for chunk in stream:
+        reasoning_content = None
+        content = None
+        # Check whether the delta carries reasoning_content or content
+        if hasattr(chunk.choices[0].delta, "reasoning_content"):
+            reasoning_content = chunk.choices[0].delta.reasoning_content
+        elif hasattr(chunk.choices[0].delta, "content"):
+            content = chunk.choices[0].delta.content
+
+        if reasoning_content is not None:
+            if not printed_reasoning_content:
+                printed_reasoning_content = True
+                print("reasoning_content:", end="", flush=True)
+            print(reasoning_content, end="", flush=True)
+        elif content is not None:
+            if not printed_content:
+                printed_content = True
+                print("\ncontent:", end="", flush=True)
+            # Extract and print the content
+            print(content, end="", flush=True)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/online_serving/openai_chat_embedding_client_for_multimodal.py b/examples/online_serving/openai_chat_embedding_client_for_multimodal.py
index 2c63c5ec..c850b5aa 100644
--- a/examples/online_serving/openai_chat_embedding_client_for_multimodal.py
+++ b/examples/online_serving/openai_chat_embedding_client_for_multimodal.py
@@ -98,7 +98,7 @@ def dse_qwen2_vl(inp: dict):
     print("Embedding output:", response_json["data"][0]["embedding"])
 
 
-if __name__ == '__main__':
+def parse_args():
     parser = argparse.ArgumentParser(
         "Script to call a specified VLM through the API. Make sure to serve "
         "the model with --task embed before running this.")
@@ -107,8 +107,10 @@ if __name__ == '__main__':
                         choices=["vlm2vec", "dse_qwen2_vl"],
                         required=True,
                         help="Which model to call.")
-    args = parser.parse_args()
+    return parser.parse_args()
 
+
+def main(args):
     if args.model == "vlm2vec":
         vlm2vec()
     elif args.model == "dse_qwen2_vl":
@@ -120,3 +122,8 @@ if __name__ == '__main__':
             "type": "text",
             "content": "What is the weather like today?",
         })
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    main(args)
diff --git a/examples/online_serving/openai_completion_client.py b/examples/online_serving/openai_completion_client.py
index 06b93d7d..6ab7619b 100644
--- a/examples/online_serving/openai_completion_client.py
+++ b/examples/online_serving/openai_completion_client.py
@@ -6,28 +6,36 @@ from openai import OpenAI
 openai_api_key = "EMPTY"
 openai_api_base = "http://localhost:8000/v1"
 
-client = OpenAI(
-    # defaults to os.environ.get("OPENAI_API_KEY")
-    api_key=openai_api_key,
-    base_url=openai_api_base,
-)
 
-models = client.models.list()
-model = models.data[0].id
+def main():
+    client = OpenAI(
+        # defaults to os.environ.get("OPENAI_API_KEY")
+        api_key=openai_api_key,
+        base_url=openai_api_base,
+    )
 
-# Completion API
-stream = False
-completion = client.completions.create(
-    model=model,
-    prompt="A robot may not injure a human being",
-    echo=False,
-    n=2,
-    stream=stream,
-    logprobs=3)
+    models = client.models.list()
+    model = models.data[0].id
 
-print("Completion results:")
-if stream:
-    for c in completion:
-        print(c)
-else:
-    print(completion)
+    # Completion API
+    stream = False
+    completion = client.completions.create(
+        model=model,
+        prompt="A robot may not injure a human being",
+        echo=False,
+        n=2,
+        stream=stream,
+        logprobs=3)
+
+    print("-" * 50)
+    print("Completion results:")
+    if stream:
+        for c in completion:
+            print(c)
+    else:
+        print(completion)
+    print("-" * 50)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/online_serving/openai_cross_encoder_score.py b/examples/online_serving/openai_cross_encoder_score.py
index 67c5fc91..20a64ddb 100644
--- a/examples/online_serving/openai_cross_encoder_score.py
+++ b/examples/online_serving/openai_cross_encoder_score.py
@@ -16,13 +16,15 @@ def post_http_request(prompt: dict, api_url: str) -> requests.Response:
     return response
 
 
-if __name__ == "__main__":
+def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("--host", type=str, default="localhost")
     parser.add_argument("--port", type=int, default=8000)
     parser.add_argument("--model", type=str, default="BAAI/bge-reranker-v2-m3")
+    return parser.parse_args()
 
-    args = parser.parse_args()
+
+def main(args):
     api_url = f"http://{args.host}:{args.port}/score"
     model_name = args.model
 
@@ -30,9 +32,9 @@ if __name__ == "__main__":
     text_2 = "The capital of Brazil is Brasilia."
     prompt = {"model": model_name, "text_1": text_1, "text_2": text_2}
     score_response = post_http_request(prompt=prompt, api_url=api_url)
-    print("Prompt when text_1 and text_2 are both strings:")
+    print("\nPrompt when text_1 and text_2 are both strings:")
     pprint.pprint(prompt)
-    print("Score Response:")
+    print("\nScore Response:")
     pprint.pprint(score_response.json())
 
     text_1 = "What is the capital of France?"
@@ -41,9 +43,9 @@ if __name__ == "__main__":
     ]
     prompt = {"model": model_name, "text_1": text_1, "text_2": text_2}
     score_response = post_http_request(prompt=prompt, api_url=api_url)
-    print("Prompt when text_1 is string and text_2 is a list:")
+    print("\nPrompt when text_1 is string and text_2 is a list:")
     pprint.pprint(prompt)
-    print("Score Response:")
+    print("\nScore Response:")
     pprint.pprint(score_response.json())
 
     text_1 = [
@@ -54,7 +56,12 @@ if __name__ == "__main__":
     ]
     prompt = {"model": model_name, "text_1": text_1, "text_2": text_2}
     score_response = post_http_request(prompt=prompt, api_url=api_url)
-    print("Prompt when text_1 and text_2 are both lists:")
+    print("\nPrompt when text_1 and text_2 are both lists:")
     pprint.pprint(prompt)
-    print("Score Response:")
+    print("\nScore Response:")
     pprint.pprint(score_response.json())
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/examples/online_serving/openai_embedding_client.py b/examples/online_serving/openai_embedding_client.py
index b7c5651e..bc217f7c 100644
--- a/examples/online_serving/openai_embedding_client.py
+++ b/examples/online_serving/openai_embedding_client.py
@@ -6,22 +6,29 @@ from openai import OpenAI
 openai_api_key = "EMPTY"
 openai_api_base = "http://localhost:8000/v1"
 
-client = OpenAI(
-    # defaults to os.environ.get("OPENAI_API_KEY")
-    api_key=openai_api_key,
-    base_url=openai_api_base,
-)
 
-models = client.models.list()
-model = models.data[0].id
+def main():
+    client = OpenAI(
+        # defaults to os.environ.get("OPENAI_API_KEY")
+        api_key=openai_api_key,
+        base_url=openai_api_base,
+    )
 
-responses = client.embeddings.create(
-    input=[
-        "Hello my name is",
-        "The best thing about vLLM is that it supports many different models"
-    ],
-    model=model,
-)
+    models = client.models.list()
+    model = models.data[0].id
 
-for data in responses.data:
-    print(data.embedding)  # List of float of len 4096
+    responses = client.embeddings.create(
+        # ruff: noqa: E501
+        input=[
+            "Hello my name is",
+            "The best thing about vLLM is that it supports many different models"
+        ],
+        model=model,
+    )
+
+    for data in responses.data:
+        print(data.embedding)  # List of float of len 4096
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/online_serving/openai_pooling_client.py b/examples/online_serving/openai_pooling_client.py
index e17f9c5e..abcfe27c 100644
--- a/examples/online_serving/openai_pooling_client.py
+++ b/examples/online_serving/openai_pooling_client.py
@@ -17,7 +17,7 @@ def post_http_request(prompt: dict, api_url: str) -> requests.Response:
     return response
 
 
-if __name__ == "__main__":
+def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("--host", type=str, default="localhost")
     parser.add_argument("--port", type=int, default=8000)
@@ -25,15 +25,20 @@ if __name__ == "__main__":
                         type=str,
                         default="jason9693/Qwen2.5-1.5B-apeach")
 
-    args = parser.parse_args()
+    return parser.parse_args()
+
+
+def main(args):
     api_url = f"http://{args.host}:{args.port}/pooling"
     model_name = args.model
 
     # Input like Completions API
     prompt = {"model": model_name, "input": "vLLM is great!"}
     pooling_response = post_http_request(prompt=prompt, api_url=api_url)
+    print("-" * 50)
     print("Pooling Response:")
     pprint.pprint(pooling_response.json())
+    print("-" * 50)
 
     # Input like Chat API
     prompt = {
@@ -50,3 +55,9 @@ if __name__ == "__main__":
     pooling_response = post_http_request(prompt=prompt, api_url=api_url)
     print("Pooling Response:")
     pprint.pprint(pooling_response.json())
+    print("-" * 50)
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)