From 61de3ef74b9cd5594e1d79ba573258adf481382c Mon Sep 17 00:00:00 2001
From: "Ye (Charlotte) Qi"
Date: Thu, 10 Apr 2025 02:36:27 -0700
Subject: [PATCH] [Model] Remove image mm limit for LLaMa4 (#16365)

Signed-off-by: Ye (Charlotte) Qi
---
 .../vision_language_multi_image.py    | 29 +++++++++++++++----
 vllm/model_executor/models/mllama4.py |  4 ++-
 2 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
index da9a616e..d9f84d2f 100644
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -22,6 +22,16 @@ QUESTION = "What is the content of each image?"
 IMAGE_URLS = [
     "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg",
     "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg",
+    "https://upload.wikimedia.org/wikipedia/commons/2/26/Ultramarine_Flycatcher_%28Ficedula_superciliaris%29_Naggar%2C_Himachal_Pradesh%2C_2013_%28cropped%29.JPG",
+    "https://upload.wikimedia.org/wikipedia/commons/thumb/e/e5/Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg/2560px-Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg",
+    "https://upload.wikimedia.org/wikipedia/commons/d/d4/Starfish%2C_Caswell_Bay_-_geograph.org.uk_-_409413.jpg",
+    "https://upload.wikimedia.org/wikipedia/commons/6/69/Grapevinesnail_01.jpg",
+    "https://upload.wikimedia.org/wikipedia/commons/thumb/0/0b/Texas_invasive_Musk_Thistle_1.jpg/1920px-Texas_invasive_Musk_Thistle_1.jpg",
+    "https://upload.wikimedia.org/wikipedia/commons/thumb/7/7a/Huskiesatrest.jpg/2880px-Huskiesatrest.jpg",
+    "https://upload.wikimedia.org/wikipedia/commons/thumb/6/68/Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg/1920px-Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg",
+    "https://upload.wikimedia.org/wikipedia/commons/3/30/George_the_amazing_guinea_pig.jpg",
+    "https://upload.wikimedia.org/wikipedia/commons/thumb/1/1f/Oryctolagus_cuniculus_Rcdo.jpg/1920px-Oryctolagus_cuniculus_Rcdo.jpg",
+    "https://upload.wikimedia.org/wikipedia/commons/9/98/Horse-and-pony.jpg",
 ]
 
 
@@ -285,8 +295,7 @@ def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
 
     engine_args = EngineArgs(
         model=model_name,
-        max_model_len=8192,
-        max_num_seqs=4,
+        max_model_len=131072,
         tensor_parallel_size=8,
         limit_mm_per_prompt={"image": len(image_urls)},
     )
@@ -660,7 +669,7 @@ def run_generate(model, question: str, image_urls: list[str],
             llm.llm_engine.add_lora(lora_request=lora_request)
 
     sampling_params = SamplingParams(temperature=0.0,
-                                     max_tokens=128,
+                                     max_tokens=256,
                                      stop_token_ids=req_data.stop_token_ids)
 
     outputs = llm.generate(
@@ -694,7 +703,7 @@ def run_chat(model: str, question: str, image_urls: list[str],
             llm.llm_engine.add_lora(lora_request=lora_request)
 
     sampling_params = SamplingParams(temperature=0.0,
-                                     max_tokens=128,
+                                     max_tokens=256,
                                      stop_token_ids=req_data.stop_token_ids)
     outputs = llm.chat(
         [{
@@ -729,10 +738,12 @@ def main(args: Namespace):
     method = args.method
     seed = args.seed
 
+    image_urls = IMAGE_URLS[:args.num_images]
+
     if method == "generate":
-        run_generate(model, QUESTION, IMAGE_URLS, seed)
+        run_generate(model, QUESTION, image_urls, seed)
     elif method == "chat":
-        run_chat(model, QUESTION, IMAGE_URLS, seed)
+        run_chat(model, QUESTION, image_urls, seed)
     else:
         raise ValueError(f"Invalid method: {method}")
 
@@ -757,6 +768,12 @@ if __name__ == "__main__":
                         type=int,
                         default=None,
                         help="Set the seed when initializing `vllm.LLM`.")
+    parser.add_argument(
+        "--num-images",
+        "-n",
+        choices=list(range(1, 13)),  # 12 is the max number of images
+        default=2,
+        help="Number of images to use for the demo.")
 
     args = parser.parse_args()
     main(args)
diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py
index 0499fe09..17171f82 100644
--- a/vllm/model_executor/models/mllama4.py
+++ b/vllm/model_executor/models/mllama4.py
@@ -477,7 +477,9 @@ class Mllama4ProcessingInfo(BaseProcessingInfo):
                                          **kwargs)
 
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
-        return {"image": 10}
+        # Although vLLM can support more images from an infra capability
+        # perspective, we do not recommend using >10 images in practice.
+        return {"image": None}
 
     @staticmethod
     def get_patch_per_chunk(vision_config: Llama4VisionConfig) -> int:
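
For reference, the sketch below (not part of the patch) illustrates what the change enables: with `get_supported_mm_limits()` returning `{"image": None}`, the per-prompt image count is no longer clamped to 10 by the model, so the example can size `limit_mm_per_prompt` from the number of images it actually sends, mirroring the new `--num-images` flow. It assumes the `meta-llama/Llama-4-Scout-17B-16E-Instruct` checkpoint and an 8-GPU host, as in the example's `load_llama4()`; treat it as a minimal usage sketch rather than the example script itself.

```python
# Minimal sketch (assumptions: Llama-4 Scout checkpoint, 8 GPUs available).
from vllm import LLM, SamplingParams

IMAGE_URLS = [
    "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg",
]

# Analogous to --num-images in the example: the per-prompt limit tracks
# however many images this prompt really contains.
num_images = len(IMAGE_URLS)

llm = LLM(
    model="meta-llama/Llama-4-Scout-17B-16E-Instruct",
    max_model_len=131072,
    tensor_parallel_size=8,
    # No longer capped at 10 by the model's supported mm limits.
    limit_mm_per_prompt={"image": num_images},
)

# OpenAI-style chat input: one user turn with all images plus the question.
messages = [{
    "role": "user",
    "content": [
        *({"type": "image_url", "image_url": {"url": url}} for url in IMAGE_URLS),
        {"type": "text", "text": "What is the content of each image?"},
    ],
}]

outputs = llm.chat(messages, SamplingParams(temperature=0.0, max_tokens=256))
print(outputs[0].outputs[0].text)
```

Note that the in-code comment added in `mllama4.py` still applies: more than 10 images per prompt is supported from an infra standpoint but not recommended in practice.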