# vllm/examples/offline_inference_pixtral.py

# ruff: noqa
import argparse

from vllm import LLM
from vllm.sampling_params import SamplingParams

# This script is an offline demo for running Pixtral.
#
# If you want to run a server/client setup, please follow this code:
#
# - Server:
#
# ```bash
# vllm serve mistralai/Pixtral-12B-2409 --tokenizer_mode mistral --limit_mm_per_prompt 'image=4' --max_num_batched_tokens 16384
# ```
#
# - Client:
#
# ```bash
# curl --location 'http://<your-node-url>:8000/v1/chat/completions' \
# --header 'Content-Type: application/json' \
# --header 'Authorization: Bearer token' \
# --data '{
#     "model": "mistralai/Pixtral-12B-2409",
#     "messages": [
#       {
#         "role": "user",
#         "content": [
#             {"type" : "text", "text": "Describe this image in detail please."},
#             {"type": "image_url", "image_url": {"url": "https://s3.amazonaws.com/cms.ipressroom.com/338/files/201808/5b894ee1a138352221103195_A680%7Ejogging-edit/A680%7Ejogging-edit_hero.jpg"}},
#             {"type" : "text", "text": "and this one as well. Answer in French."},
#             {"type": "image_url", "image_url": {"url": "https://www.wolframcloud.com/obj/resourcesystem/images/a0e/a0ee3983-46c6-4c92-b85d-059044639928/6af8cfb971db031b.png"}}
#         ]
#       }
#     ]
#   }'
# ```
#
# Usage:
#     python offline_inference_pixtral.py simple
#     python offline_inference_pixtral.py advanced


def run_simple_demo():
    """Ask Pixtral to caption one image given by URL and print the answer.

    Builds an ``LLM`` for ``mistralai/Pixtral-12B-2409`` in ``mistral``
    tokenizer mode, sends a single user turn containing a text instruction
    followed by one ``image_url`` chunk through ``LLM.chat``, and prints the
    text of the first completion of the first returned output.
    """
    sampling = SamplingParams(max_tokens=8192)
    engine = LLM(model="mistralai/Pixtral-12B-2409", tokenizer_mode="mistral")

    # One user turn: the instruction first, then the picture it refers to.
    text_part = {
        "type": "text",
        "text": "Describe this image in one sentence.",
    }
    image_part = {
        "type": "image_url",
        "image_url": {"url": "https://picsum.photos/id/237/200/300"},
    }
    conversation = [{"role": "user", "content": [text_part, image_part]}]

    results = engine.chat(conversation, sampling_params=sampling)
    print(results[0].outputs[0].text)


def run_advanced_demo():
    """Run a multi-turn, multi-image Pixtral conversation and print the reply.

    The engine is created with an explicit per-prompt image limit and a
    ``max_num_batched_tokens`` value equal to that limit multiplied by a
    per-image token budget. The conversation holds a first user turn (text
    plus two images), an assistant reply, a text-only user follow-up and a
    final user turn carrying a third image. The text of the first completion
    of the first returned output is printed.
    """
    image_limit = 5
    # NOTE(review): presumably the largest number of tokens one image can
    # occupy for this model -- confirm against the model documentation.
    tokens_per_image = 4096

    def image_part(url):
        """Wrap *url* in an ``image_url`` content chunk."""
        return {"type": "image_url", "image_url": {"url": url}}

    sampling = SamplingParams(max_tokens=8192, temperature=0.7)
    engine = LLM(
        model="mistralai/Pixtral-12B-2409",
        tokenizer_mode="mistral",
        limit_mm_per_prompt={"image": image_limit},
        max_num_batched_tokens=image_limit * tokens_per_image,
    )

    yosemite_url = "https://huggingface.co/datasets/patrickvonplaten/random_img/resolve/main/yosemite.png"
    seeded_url = "https://picsum.photos/seed/picsum/200/300"
    square_url = "https://picsum.photos/id/32/512/512"

    opening_turn = {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe the following image."},
            image_part(yosemite_url),
            image_part(seeded_url),
        ],
    }
    assistant_turn = {
        "role": "assistant",
        "content": "The images show nature.",
    }
    follow_up_turn = {
        "role": "user",
        "content": "More details please and answer only in French!.",
    }
    extra_image_turn = {
        "role": "user",
        "content": [image_part(square_url)],
    }
    conversation = [
        opening_turn,
        assistant_turn,
        follow_up_turn,
        extra_image_turn,
    ]

    results = engine.chat(messages=conversation, sampling_params=sampling)
    print(results[0].outputs[0].text)


def main():
    """Parse the command line and run the requested demo.

    Expects one positional argument, ``mode``, restricted by argparse to
    ``simple`` or ``advanced``. Any other value makes argparse report the
    error and exit before a demo is started.
    """
    cli = argparse.ArgumentParser(
        description="Run a demo in simple or advanced mode.")
    cli.add_argument(
        "mode",
        choices=["simple", "advanced"],
        help="Specify the demo mode: 'simple' or 'advanced'",
    )
    selected = cli.parse_args().mode

    # Map each accepted mode to its banner and entry point; argparse has
    # already rejected anything that is not a key of this table.
    demos = {
        "simple": ("Running simple demo...", run_simple_demo),
        "advanced": ("Running advanced demo...", run_advanced_demo),
    }
    banner, demo = demos[selected]
    print(banner)
    demo()


# Start the command-line interface only when this file is executed as a
# script; importing it as a module does not call main().
if __name__ == "__main__":
    main()
Pixtral (#8377) Co-authored-by: Roger Wang <ywang@roblox.com> 2024-09-11 23:41:55 +02:00			`# ruff: noqa`
			`import argparse`

			`from vllm import LLM`
			`from vllm.sampling_params import SamplingParams`

			`# This script is an offline demo for running Pixtral.`
			`#`
			`# If you want to run a server/client setup, please follow this code:`
			`#`
			`# - Server:`
			`#`
			# ```bash
			`# vllm serve mistralai/Pixtral-12B-2409 --tokenizer_mode mistral --limit_mm_per_prompt 'image=4' --max_num_batched_tokens 16384`
			# ```
			`#`
			`# - Client:`
			`#`
			# ```bash
			`# curl --location 'http://<your-node-url>:8000/v1/chat/completions' \`
			`# --header 'Content-Type: application/json' \`
			`# --header 'Authorization: Bearer token' \`
			`# --data '{`
			`# "model": "mistralai/Pixtral-12B-2409",`
			`# "messages": [`
			`# {`
			`# "role": "user",`
			`# "content": [`
			`# {"type" : "text", "text": "Describe this image in detail please."},`
			`# {"type": "image_url", "image_url": {"url": "https://s3.amazonaws.com/cms.ipressroom.com/338/files/201808/5b894ee1a138352221103195_A680%7Ejogging-edit/A680%7Ejogging-edit_hero.jpg"}},`
			`# {"type" : "text", "text": "and this one as well. Answer in French."},`
			`# {"type": "image_url", "image_url": {"url": "https://www.wolframcloud.com/obj/resourcesystem/images/a0e/a0ee3983-46c6-4c92-b85d-059044639928/6af8cfb971db031b.png"}}`
			`# ]`
			`# }`
			`# ]`
			`# }'`
			# ```
			`#`
			`# Usage:`
			`# python demo.py simple`
			`# python demo.py advanced`


			`def run_simple_demo():`
			`model_name = "mistralai/Pixtral-12B-2409"`
			`sampling_params = SamplingParams(max_tokens=8192)`

			`llm = LLM(model=model_name, tokenizer_mode="mistral")`

			`prompt = "Describe this image in one sentence."`
			`image_url = "https://picsum.photos/id/237/200/300"`

			`messages = [`
			`{`
			`"role":`
			`"user",`
			`"content": [`
			`{`
			`"type": "text",`
			`"text": prompt`
			`},`
			`{`
			`"type": "image_url",`
			`"image_url": {`
			`"url": image_url`
			`}`
			`},`
			`],`
			`},`
			`]`
			`outputs = llm.chat(messages, sampling_params=sampling_params)`

			`print(outputs[0].outputs[0].text)`


			`def run_advanced_demo():`
			`model_name = "mistralai/Pixtral-12B-2409"`
			`max_img_per_msg = 5`
			`max_tokens_per_img = 4096`

			`sampling_params = SamplingParams(max_tokens=8192, temperature=0.7)`
			`llm = LLM(`
			`model=model_name,`
			`tokenizer_mode="mistral",`
			`limit_mm_per_prompt={"image": max_img_per_msg},`
			`max_num_batched_tokens=max_img_per_msg * max_tokens_per_img,`
			`)`

			`prompt = "Describe the following image."`

			`url_1 = "https://huggingface.co/datasets/patrickvonplaten/random_img/resolve/main/yosemite.png"`
			`url_2 = "https://picsum.photos/seed/picsum/200/300"`
			`url_3 = "https://picsum.photos/id/32/512/512"`

			`messages = [`
			`{`
			`"role":`
			`"user",`
			`"content": [`
			`{`
			`"type": "text",`
			`"text": prompt`
			`},`
			`{`
			`"type": "image_url",`
			`"image_url": {`
			`"url": url_1`
			`}`
			`},`
			`{`
			`"type": "image_url",`
			`"image_url": {`
			`"url": url_2`
			`}`
			`},`
			`],`
			`},`
			`{`
			`"role": "assistant",`
			`"content": "The images show nature.",`
			`},`
			`{`
			`"role": "user",`
			`"content": "More details please and answer only in French!.",`
			`},`
			`{`
			`"role": "user",`
			`"content": [`
			`{`
			`"type": "image_url",`
			`"image_url": {`
			`"url": url_3`
			`}`
			`},`
			`],`
			`},`
			`]`

			`outputs = llm.chat(messages=messages, sampling_params=sampling_params)`
			`print(outputs[0].outputs[0].text)`


			`def main():`
			`parser = argparse.ArgumentParser(`
			`description="Run a demo in simple or advanced mode.")`

			`parser.add_argument(`
			`"mode",`
			`choices=["simple", "advanced"],`
			`help="Specify the demo mode: 'simple' or 'advanced'",`
			`)`

			`args = parser.parse_args()`

			`if args.mode == "simple":`
			`print("Running simple demo...")`
			`run_simple_demo()`
			`elif args.mode == "advanced":`
			`print("Running advanced demo...")`
			`run_advanced_demo()`


			`if __name__ == "__main__":`
			`main()`