[Misc] refactor examples series (#16708)

Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>

parent 21378a2323
commit 7168920491
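Every file touched below gets the same refactor: module-level CLI parsing and client setup move into parse_args() and main() helpers behind an if __name__ == "__main__" guard, so each example can be imported without side effects. A minimal sketch of the target shape, assuming nothing beyond the pattern itself (plain argparse and the --host flag are illustrative; the real examples use vLLM's FlexibleArgumentParser and their own flags):

import argparse


def parse_args():
    # Parse in one place so importing the module never consumes sys.argv.
    parser = argparse.ArgumentParser(description="refactor pattern sketch")
    parser.add_argument("--host", type=str, default="localhost")
    return parser.parse_args()


def main(args: argparse.Namespace):
    # All side effects (clients, requests, prints) live under main().
    print(f"would connect to {args.host}")


if __name__ == "__main__":
    args = parse_args()
    main(args)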
@@ -50,6 +50,13 @@ def initialize_engine(args: argparse.Namespace) -> LLMEngine:
     return LLMEngine.from_engine_args(engine_args)
 
 
+def parse_args():
+    parser = FlexibleArgumentParser(
+        description='Demo on using the LLMEngine class directly')
+    parser = EngineArgs.add_cli_args(parser)
+    return parser.parse_args()
+
+
 def main(args: argparse.Namespace):
     """Main function that sets up and runs the prompt processing."""
     engine = initialize_engine(args)
@@ -58,8 +65,5 @@ def main(args: argparse.Namespace):
 
 
 if __name__ == '__main__':
-    parser = FlexibleArgumentParser(
-        description='Demo on using the LLMEngine class directly')
-    parser = EngineArgs.add_cli_args(parser)
-    args = parser.parse_args()
+    args = parse_args()
     main(args)
@@ -23,10 +23,6 @@ import gradio as gr
 from openai import OpenAI
 
 
-def create_openai_client(api_key, base_url):
-    return OpenAI(api_key=api_key, base_url=base_url)
-
-
 def format_history_to_openai(history):
     history_openai_format = [{
         "role": "system",
@@ -303,12 +303,7 @@ example_function_map = {
 }
 
 
-def main(args) -> None:
-    chat_type = args.chat_type
-    example_function_map[chat_type]()
-
-
-if __name__ == "__main__":
+def parse_args():
     parser = FlexibleArgumentParser(
         description='Demo on using OpenAI client for online serving with '
         'multimodal language models served with vLLM.')
@@ -318,5 +313,14 @@ if __name__ == "__main__":
                         default="single-image",
                         choices=list(example_function_map.keys()),
                         help='Conversation type with multimodal data.')
-    args = parser.parse_args()
+    return parser.parse_args()
+
+
+def main(args) -> None:
+    chat_type = args.chat_type
+    example_function_map[chat_type]()
+
+
+if __name__ == "__main__":
+    args = parse_args()
     main(args)
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 """
-To run this example, you can start the vLLM server
+To run this example, you can start the vLLM server
 without any specific flags:
 
 ```bash
@@ -8,7 +8,7 @@ VLLM_USE_V1=0 vllm serve unsloth/Llama-3.2-1B-Instruct \
     --guided-decoding-backend outlines
 ```
 
-This example demonstrates how to generate chat completions
+This example demonstrates how to generate chat completions
 using the OpenAI Python client library.
 """
 
@@ -18,15 +18,6 @@ from openai import OpenAI
 openai_api_key = "EMPTY"
 openai_api_base = "http://localhost:8000/v1"
 
-client = OpenAI(
-    # defaults to os.environ.get("OPENAI_API_KEY")
-    api_key=openai_api_key,
-    base_url=openai_api_base,
-)
-
-models = client.models.list()
-model = models.data[0].id
-
 tools = [
     {
         "type": "function",
@@ -116,21 +107,36 @@ messages = [
     },
 ]
 
-chat_completion = client.chat.completions.create(
-    messages=messages,
-    model=model,
-    tools=tools,
-    tool_choice="required",
-    stream=True  # Enable streaming response
-)
-
-for chunk in chat_completion:
-    if chunk.choices and chunk.choices[0].delta.tool_calls:
-        print(chunk.choices[0].delta.tool_calls)
-
-chat_completion = client.chat.completions.create(messages=messages,
-                                                 model=model,
-                                                 tools=tools,
-                                                 tool_choice="required")
-
-print(chat_completion.choices[0].message.tool_calls)
+
+def main():
+    client = OpenAI(
+        # defaults to os.environ.get("OPENAI_API_KEY")
+        api_key=openai_api_key,
+        base_url=openai_api_base,
+    )
+
+    models = client.models.list()
+    model = models.data[0].id
+
+    chat_completion = client.chat.completions.create(
+        messages=messages,
+        model=model,
+        tools=tools,
+        tool_choice="required",
+        stream=True  # Enable streaming response
+    )
+
+    for chunk in chat_completion:
+        if chunk.choices and chunk.choices[0].delta.tool_calls:
+            print(chunk.choices[0].delta.tool_calls)
+
+    chat_completion = client.chat.completions.create(messages=messages,
+                                                     model=model,
+                                                     tools=tools,
+                                                     tool_choice="required")
+
+    print(chat_completion.choices[0].message.tool_calls)
+
+
+if __name__ == "__main__":
+    main()
@@ -3,8 +3,8 @@
 An example shows how to generate chat completions from reasoning models
 like DeepSeekR1.
 
-To run this example, you need to start the vLLM server with the reasoning
-parser:
+To run this example, you need to start the vLLM server
+with the reasoning parser:
 
 ```bash
 vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
@@ -21,35 +21,44 @@ from openai import OpenAI
 openai_api_key = "EMPTY"
 openai_api_base = "http://localhost:8000/v1"
 
-client = OpenAI(
-    api_key=openai_api_key,
-    base_url=openai_api_base,
-)
-
-models = client.models.list()
-model = models.data[0].id
-
-# Round 1
-messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
-# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
-response = client.chat.completions.create(model=model, messages=messages)
-
-reasoning_content = response.choices[0].message.reasoning_content
-content = response.choices[0].message.content
-
-print("reasoning_content for Round 1:", reasoning_content)
-print("content for Round 1:", content)
-
-# Round 2
-messages.append({"role": "assistant", "content": content})
-messages.append({
-    "role": "user",
-    "content": "How many Rs are there in the word 'strawberry'?",
-})
-response = client.chat.completions.create(model=model, messages=messages)
-
-reasoning_content = response.choices[0].message.reasoning_content
-content = response.choices[0].message.content
-
-print("reasoning_content for Round 2:", reasoning_content)
-print("content for Round 2:", content)
+
+def main():
+    client = OpenAI(
+        api_key=openai_api_key,
+        base_url=openai_api_base,
+    )
+
+    models = client.models.list()
+    model = models.data[0].id
+
+    # Round 1
+    messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
+    # ruff: noqa: E501
+    # For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
+    response = client.chat.completions.create(model=model, messages=messages)
+
+    reasoning_content = response.choices[0].message.reasoning_content
+    content = response.choices[0].message.content
+
+    print("reasoning_content for Round 1:", reasoning_content)
+    print("content for Round 1:", content)
+
+    # Round 2
+    messages.append({"role": "assistant", "content": content})
+    messages.append({
+        "role":
+        "user",
+        "content":
+        "How many Rs are there in the word 'strawberry'?",
+    })
+    response = client.chat.completions.create(model=model, messages=messages)
+
+    reasoning_content = response.choices[0].message.reasoning_content
+    content = response.choices[0].message.content
+
+    print("reasoning_content for Round 2:", reasoning_content)
+    print("content for Round 2:", content)
+
+
+if __name__ == "__main__":
+    main()
@@ -3,7 +3,7 @@
 An example shows how to generate chat completions from reasoning models
 like DeepSeekR1.
 
-To run this example, you need to start the vLLM server with the reasoning
+To run this example, you need to start the vLLM server with the reasoning
 parser:
 
 ```bash
@@ -29,41 +29,49 @@ from openai import OpenAI
 openai_api_key = "EMPTY"
 openai_api_base = "http://localhost:8000/v1"
 
-client = OpenAI(
-    api_key=openai_api_key,
-    base_url=openai_api_base,
-)
-
-models = client.models.list()
-model = models.data[0].id
-
 messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
-# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
-stream = client.chat.completions.create(model=model,
-                                        messages=messages,
-                                        stream=True)
-
-print("client: Start streaming chat completions...")
-printed_reasoning_content = False
-printed_content = False
-
-for chunk in stream:
-    reasoning_content = None
-    content = None
-    # Check the content is reasoning_content or content
-    if hasattr(chunk.choices[0].delta, "reasoning_content"):
-        reasoning_content = chunk.choices[0].delta.reasoning_content
-    elif hasattr(chunk.choices[0].delta, "content"):
-        content = chunk.choices[0].delta.content
-
-    if reasoning_content is not None:
-        if not printed_reasoning_content:
-            printed_reasoning_content = True
-            print("reasoning_content:", end="", flush=True)
-        print(reasoning_content, end="", flush=True)
-    elif content is not None:
-        if not printed_content:
-            printed_content = True
-            print("\ncontent:", end="", flush=True)
-        # Extract and print the content
-        print(content, end="", flush=True)
+
+
+def main():
+    client = OpenAI(
+        api_key=openai_api_key,
+        base_url=openai_api_base,
+    )
+
+    models = client.models.list()
+    model = models.data[0].id
+
+    # ruff: noqa: E501
+    # For granite: add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
+    stream = client.chat.completions.create(model=model,
+                                            messages=messages,
+                                            stream=True)
+
+    print("client: Start streaming chat completions...")
+    printed_reasoning_content = False
+    printed_content = False
+
+    for chunk in stream:
+        reasoning_content = None
+        content = None
+        # Check the content is reasoning_content or content
+        if hasattr(chunk.choices[0].delta, "reasoning_content"):
+            reasoning_content = chunk.choices[0].delta.reasoning_content
+        elif hasattr(chunk.choices[0].delta, "content"):
+            content = chunk.choices[0].delta.content
+
+        if reasoning_content is not None:
+            if not printed_reasoning_content:
+                printed_reasoning_content = True
+                print("reasoning_content:", end="", flush=True)
+            print(reasoning_content, end="", flush=True)
+        elif content is not None:
+            if not printed_content:
+                printed_content = True
+                print("\ncontent:", end="", flush=True)
+            # Extract and print the content
+            print(content, end="", flush=True)
+
+
+if __name__ == "__main__":
+    main()
@@ -98,7 +98,7 @@ def dse_qwen2_vl(inp: dict):
     print("Embedding output:", response_json["data"][0]["embedding"])
 
 
-if __name__ == '__main__':
+def parse_args():
     parser = argparse.ArgumentParser(
         "Script to call a specified VLM through the API. Make sure to serve "
         "the model with --task embed before running this.")
@@ -107,8 +107,10 @@ if __name__ == '__main__':
                         choices=["vlm2vec", "dse_qwen2_vl"],
                         required=True,
                         help="Which model to call.")
-    args = parser.parse_args()
+    return parser.parse_args()
+
 
+def main(args):
     if args.model == "vlm2vec":
         vlm2vec()
     elif args.model == "dse_qwen2_vl":
@@ -120,3 +122,8 @@ if __name__ == '__main__':
             "type": "text",
             "content": "What is the weather like today?",
         })
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    main(args)
@@ -6,28 +6,36 @@ from openai import OpenAI
 openai_api_key = "EMPTY"
 openai_api_base = "http://localhost:8000/v1"
 
-client = OpenAI(
-    # defaults to os.environ.get("OPENAI_API_KEY")
-    api_key=openai_api_key,
-    base_url=openai_api_base,
-)
-
-models = client.models.list()
-model = models.data[0].id
-
-# Completion API
-stream = False
-completion = client.completions.create(
-    model=model,
-    prompt="A robot may not injure a human being",
-    echo=False,
-    n=2,
-    stream=stream,
-    logprobs=3)
-
-print("Completion results:")
-if stream:
-    for c in completion:
-        print(c)
-else:
-    print(completion)
+
+def main():
+    client = OpenAI(
+        # defaults to os.environ.get("OPENAI_API_KEY")
+        api_key=openai_api_key,
+        base_url=openai_api_base,
+    )
+
+    models = client.models.list()
+    model = models.data[0].id
+
+    # Completion API
+    stream = False
+    completion = client.completions.create(
+        model=model,
+        prompt="A robot may not injure a human being",
+        echo=False,
+        n=2,
+        stream=stream,
+        logprobs=3)
+
+    print("-" * 50)
+    print("Completion results:")
+    if stream:
+        for c in completion:
+            print(c)
+    else:
+        print(completion)
+    print("-" * 50)
+
+
+if __name__ == "__main__":
+    main()
@@ -16,13 +16,15 @@ def post_http_request(prompt: dict, api_url: str) -> requests.Response:
     return response
 
 
-if __name__ == "__main__":
+def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("--host", type=str, default="localhost")
     parser.add_argument("--port", type=int, default=8000)
     parser.add_argument("--model", type=str, default="BAAI/bge-reranker-v2-m3")
+    return parser.parse_args()
 
-    args = parser.parse_args()
 
+def main(args):
     api_url = f"http://{args.host}:{args.port}/score"
     model_name = args.model
+
@@ -30,9 +32,9 @@ if __name__ == "__main__":
     text_2 = "The capital of Brazil is Brasilia."
     prompt = {"model": model_name, "text_1": text_1, "text_2": text_2}
     score_response = post_http_request(prompt=prompt, api_url=api_url)
-    print("Prompt when text_1 and text_2 are both strings:")
+    print("\nPrompt when text_1 and text_2 are both strings:")
     pprint.pprint(prompt)
-    print("Score Response:")
+    print("\nScore Response:")
     pprint.pprint(score_response.json())
 
     text_1 = "What is the capital of France?"
@@ -41,9 +43,9 @@ if __name__ == "__main__":
     ]
     prompt = {"model": model_name, "text_1": text_1, "text_2": text_2}
     score_response = post_http_request(prompt=prompt, api_url=api_url)
-    print("Prompt when text_1 is string and text_2 is a list:")
+    print("\nPrompt when text_1 is string and text_2 is a list:")
     pprint.pprint(prompt)
-    print("Score Response:")
+    print("\nScore Response:")
     pprint.pprint(score_response.json())
 
     text_1 = [
@@ -54,7 +56,12 @@ if __name__ == "__main__":
     ]
     prompt = {"model": model_name, "text_1": text_1, "text_2": text_2}
     score_response = post_http_request(prompt=prompt, api_url=api_url)
-    print("Prompt when text_1 and text_2 are both lists:")
+    print("\nPrompt when text_1 and text_2 are both lists:")
     pprint.pprint(prompt)
-    print("Score Response:")
+    print("\nScore Response:")
     pprint.pprint(score_response.json())
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
@@ -6,22 +6,29 @@ from openai import OpenAI
 openai_api_key = "EMPTY"
 openai_api_base = "http://localhost:8000/v1"
 
-client = OpenAI(
-    # defaults to os.environ.get("OPENAI_API_KEY")
-    api_key=openai_api_key,
-    base_url=openai_api_base,
-)
-
-models = client.models.list()
-model = models.data[0].id
-
-responses = client.embeddings.create(
-    input=[
-        "Hello my name is",
-        "The best thing about vLLM is that it supports many different models"
-    ],
-    model=model,
-)
-
-for data in responses.data:
-    print(data.embedding)  # List of float of len 4096
+
+def main():
+    client = OpenAI(
+        # defaults to os.environ.get("OPENAI_API_KEY")
+        api_key=openai_api_key,
+        base_url=openai_api_base,
+    )
+
+    models = client.models.list()
+    model = models.data[0].id
+
+    responses = client.embeddings.create(
+        # ruff: noqa: E501
+        input=[
+            "Hello my name is",
+            "The best thing about vLLM is that it supports many different models"
+        ],
+        model=model,
+    )
+
+    for data in responses.data:
+        print(data.embedding)  # List of float of len 4096
+
+
+if __name__ == "__main__":
+    main()
@@ -17,7 +17,7 @@ def post_http_request(prompt: dict, api_url: str) -> requests.Response:
     return response
 
 
-if __name__ == "__main__":
+def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("--host", type=str, default="localhost")
     parser.add_argument("--port", type=int, default=8000)
@@ -25,15 +25,20 @@ if __name__ == "__main__":
                         type=str,
                         default="jason9693/Qwen2.5-1.5B-apeach")
 
-    args = parser.parse_args()
+    return parser.parse_args()
+
+
+def main(args):
     api_url = f"http://{args.host}:{args.port}/pooling"
     model_name = args.model
 
     # Input like Completions API
     prompt = {"model": model_name, "input": "vLLM is great!"}
     pooling_response = post_http_request(prompt=prompt, api_url=api_url)
+    print("-" * 50)
     print("Pooling Response:")
     pprint.pprint(pooling_response.json())
+    print("-" * 50)
 
     # Input like Chat API
     prompt = {
@@ -50,3 +55,9 @@ if __name__ == "__main__":
     pooling_response = post_http_request(prompt=prompt, api_url=api_url)
     print("Pooling Response:")
     pprint.pprint(pooling_response.json())
+    print("-" * 50)
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
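A side benefit worth noting, sketched under assumptions rather than taken from the commit: with parsing and I/O moved into parse_args()/main(), a test or wrapper script can import an example and drive main() with a hand-built namespace. The import path below is hypothetical (the commit defines no package for the examples); the host/port/model attributes mirror the pooling client's flags in the diff above.

import argparse

from openai_pooling_client import main  # hypothetical import path

# Build the namespace main(args) expects, matching the flags shown above.
args = argparse.Namespace(host="localhost",
                          port=8000,
                          model="jason9693/Qwen2.5-1.5B-apeach")
main(args)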