[Bugfix] Make image processor respect mm_processor_kwargs for Qwen2-VL (#10112)
Signed-off-by: Jiahao Li <liplus17@163.com>
This commit is contained in:
parent
a6f332d0d9
commit
999df95b4e
@ -22,8 +22,8 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
"""Inference-only Qwen2-VL model compatible with HuggingFace weights."""
|
"""Inference-only Qwen2-VL model compatible with HuggingFace weights."""
|
||||||
from functools import partial
|
from functools import partial
|
||||||
from typing import (Any, Callable, Iterable, List, Literal, Mapping, Optional,
|
from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping,
|
||||||
Tuple, Type, TypedDict, Union)
|
Optional, Tuple, Type, TypedDict, Union)
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
@ -558,6 +558,17 @@ class Qwen2VisionTransformer(nn.Module):
|
|||||||
# === Vision input helpers === #
|
# === Vision input helpers === #
|
||||||
|
|
||||||
|
|
||||||
|
def get_mm_processor_kwargs(
        min_pixels: Optional[int] = None,
        max_pixels: Optional[int] = None) -> Dict[str, int]:
    """Build the keyword arguments used to override image-processor bounds.

    Only the bounds that were actually supplied end up in the result, so the
    returned dict can be splatted into a call without clobbering defaults.

    Args:
        min_pixels: Optional lower pixel bound. Omitted from the result when
            it is ``None`` or ``0``.
        max_pixels: Optional upper pixel bound. Omitted from the result when
            it is ``None`` or ``0``.

    Returns:
        A new dict holding ``"min_pixels"`` and/or ``"max_pixels"`` for each
        argument that was given a truthy value; empty when neither was.
    """
    mm_processor_kwargs: Dict[str, int] = {}
    # Truthiness check: both ``None`` and ``0`` count as "not set", so a
    # zero bound is never forwarded.
    if min_pixels:
        mm_processor_kwargs["min_pixels"] = min_pixels
    if max_pixels:
        mm_processor_kwargs["max_pixels"] = max_pixels
    return mm_processor_kwargs
|
||||||
|
|
||||||
|
|
||||||
def mm_input_mapper_for_qwen2_vl(
|
def mm_input_mapper_for_qwen2_vl(
|
||||||
ctx: InputContext,
|
ctx: InputContext,
|
||||||
data: MultiModalData[object],
|
data: MultiModalData[object],
|
||||||
@ -575,12 +586,8 @@ def mm_input_mapper_for_qwen2_vl(
|
|||||||
model_config = ctx.model_config
|
model_config = ctx.model_config
|
||||||
# Handle mm processor kwargs; we pass these at creation time
|
# Handle mm processor kwargs; we pass these at creation time
|
||||||
# because preprocess() in transformers doesn't expose them
|
# because preprocess() in transformers doesn't expose them
|
||||||
mm_processor_kwargs = {}
|
mm_processor_kwargs = get_mm_processor_kwargs(min_pixels=min_pixels,
|
||||||
if min_pixels:
|
max_pixels=max_pixels)
|
||||||
mm_processor_kwargs["min_pixels"] = min_pixels
|
|
||||||
if max_pixels:
|
|
||||||
mm_processor_kwargs["max_pixels"] = max_pixels
|
|
||||||
|
|
||||||
image_processor = cached_get_image_processor(
|
image_processor = cached_get_image_processor(
|
||||||
model_config.model,
|
model_config.model,
|
||||||
trust_remote_code=model_config.trust_remote_code,
|
trust_remote_code=model_config.trust_remote_code,
|
||||||
@ -683,7 +690,10 @@ def get_max_qwen2_vl_mm_tokens(ctx: InputContext,
|
|||||||
*,
|
*,
|
||||||
min_pixels=None,
|
min_pixels=None,
|
||||||
max_pixels=None) -> int:
|
max_pixels=None) -> int:
|
||||||
image_processor = cached_get_image_processor(ctx.model_config.model)
|
mm_processor_kwargs = get_mm_processor_kwargs(min_pixels=min_pixels,
|
||||||
|
max_pixels=max_pixels)
|
||||||
|
image_processor = cached_get_image_processor(ctx.model_config.model,
|
||||||
|
**mm_processor_kwargs)
|
||||||
max_resized_height, max_resized_width, max_llm_image_tokens = \
|
max_resized_height, max_resized_width, max_llm_image_tokens = \
|
||||||
_get_max_image_info(image_processor, data_type_key=data_type_key,
|
_get_max_image_info(image_processor, data_type_key=data_type_key,
|
||||||
mm_count=1, min_pixels=min_pixels,
|
mm_count=1, min_pixels=min_pixels,
|
||||||
@ -705,7 +715,10 @@ def dummy_data_for_qwen2_vl(
|
|||||||
min_pixels: Optional[int] = None,
|
min_pixels: Optional[int] = None,
|
||||||
max_pixels: Optional[int] = None
|
max_pixels: Optional[int] = None
|
||||||
) -> Tuple[SequenceData, Optional[MultiModalDataDict]]:
|
) -> Tuple[SequenceData, Optional[MultiModalDataDict]]:
|
||||||
image_processor = cached_get_image_processor(ctx.model_config.model)
|
mm_processor_kwargs = get_mm_processor_kwargs(min_pixels=min_pixels,
|
||||||
|
max_pixels=max_pixels)
|
||||||
|
image_processor = cached_get_image_processor(ctx.model_config.model,
|
||||||
|
**mm_processor_kwargs)
|
||||||
|
|
||||||
num_images = mm_counts["image"]
|
num_images = mm_counts["image"]
|
||||||
max_resized_height, max_resized_width, max_llm_image_tokens = \
|
max_resized_height, max_resized_width, max_llm_image_tokens = \
|
||||||
|
Loading…
x
Reference in New Issue
Block a user