diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py
index ebcd3614..f758c98e 100644
--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -38,7 +38,8 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
 from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors

-from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
+from .interfaces import (MultiModalEmbeddings, SupportsMultiModal, SupportsPP,
+                         SupportsQuant)
 from .utils import (flatten_bn, is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix, merge_multimodal_embeddings)
@@ -927,7 +928,11 @@ class ChameleonModel(nn.Module):
                                         info=ChameleonProcessingInfo,
                                         dummy_inputs=ChameleonDummyInputsBuilder)
 class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal,
-                                        SupportsPP):
+                                        SupportsPP, SupportsQuant):
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"]
+    }

     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py
index a51a0af9..1b1738f8 100644
--- a/vllm/model_executor/models/chatglm.py
+++ b/vllm/model_executor/models/chatglm.py
@@ -29,7 +29,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs import ChatGLMConfig

-from .interfaces import SupportsLoRA, SupportsPP
+from .interfaces import SupportsLoRA, SupportsPP, SupportsQuant
 from .utils import (AutoWeightsLoader, WeightsMapper, is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
@@ -295,7 +295,11 @@ class GLMTransformer(nn.Module):


 @support_torch_compile
-class ChatGLMModel(nn.Module):
+class ChatGLMModel(nn.Module, SupportsQuant):
+    packed_modules_mapping = {
+        "linear_proj.merged_proj":
+        ["linear_proj.gate_proj", "linear_proj.dense_h_to_4h"]
+    }

     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
@@ -395,7 +399,6 @@ class ChatGLMModel(nn.Module):


 class ChatGLMBaseModel(nn.Module):
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_substr={".word_embeddings": ""}, )

@@ -452,7 +455,8 @@ class ChatGLMBaseModel(nn.Module):
         return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)


-class ChatGLMForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP):
+class ChatGLMForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP,
+                         SupportsQuant):
     packed_modules_mapping = {
         "query_key_value": ["query_key_value"],
         "dense_h_to_4h": ["dense_h_to_4h"]
diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py
index e7e73f44..bb8d9bf8 100644
--- a/vllm/model_executor/models/commandr.py
+++ b/vllm/model_executor/models/commandr.py
@@ -49,7 +49,7 @@ from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors

-from .interfaces import SupportsLoRA, SupportsPP
+from .interfaces import SupportsLoRA, SupportsPP, SupportsQuant
 from .utils import (extract_layer_index, is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
@@ -332,7 +332,7 @@ class CohereModel(nn.Module):
         return hidden_states


-class CohereForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
+class CohereForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsQuant):
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
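
Reviewer note (not part of the patch): every hunk above follows the same pattern, so here is a minimal sketch of how a model class opts into the SupportsQuant interface with a packed_modules_mapping. SupportsQuant and the mapping values are taken from the diff; QuantizedToyModel and its layer names are hypothetical placeholders, not code from vLLM.

    # Illustrative sketch only; assumes the SupportsQuant interface imported
    # from .interfaces as in the hunks above.
    import torch.nn as nn

    from .interfaces import SupportsQuant


    class QuantizedToyModel(nn.Module, SupportsQuant):
        # Maps each fused vLLM projection to the original checkpoint
        # sub-modules it packs, so a quantization config can resolve
        # per-layer schemes for the fused layers.
        packed_modules_mapping = {
            "qkv_proj": ["q_proj", "k_proj", "v_proj"],
            "gate_up_proj": ["gate_proj", "up_proj"],
        }

Declaring the mapping as a class attribute (rather than building it in __init__) matches the style of the patched models, where the mapping is static per architecture.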