[SupportsQuant] Chameleon, Chatglm, Commandr (#15952)

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
This commit is contained in:
Kyle Sayers 2025-04-03 11:25:22 -04:00 committed by GitHub
parent 421c462948
commit 82e7e19a6e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 17 additions and 8 deletions

View File

@@ -38,7 +38,8 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
 from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors
-from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
+from .interfaces import (MultiModalEmbeddings, SupportsMultiModal, SupportsPP,
+                         SupportsQuant)
 from .utils import (flatten_bn, is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix, merge_multimodal_embeddings)
@@ -927,7 +928,11 @@ class ChameleonModel(nn.Module):
                                         info=ChameleonProcessingInfo,
                                         dummy_inputs=ChameleonDummyInputsBuilder)
 class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal,
-                                        SupportsPP):
+                                        SupportsPP, SupportsQuant):
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"]
+    }

     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()

View File

@@ -29,7 +29,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs import ChatGLMConfig
-from .interfaces import SupportsLoRA, SupportsPP
+from .interfaces import SupportsLoRA, SupportsPP, SupportsQuant
 from .utils import (AutoWeightsLoader, WeightsMapper, is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
@@ -295,7 +295,11 @@ class GLMTransformer(nn.Module):
 @support_torch_compile
-class ChatGLMModel(nn.Module):
+class ChatGLMModel(nn.Module, SupportsQuant):
+    packed_modules_mapping = {
+        "linear_proj.merged_proj":
+        ["linear_proj.gate_proj", "linear_proj.dense_h_to_4h"]
+    }

     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
@@ -395,7 +399,6 @@ class ChatGLMModel(nn.Module):
 class ChatGLMBaseModel(nn.Module):

     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_substr={".word_embeddings": ""}, )
@@ -452,7 +455,8 @@ class ChatGLMBaseModel(nn.Module):
         return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)

-class ChatGLMForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP):
+class ChatGLMForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP,
+                         SupportsQuant):
     packed_modules_mapping = {
         "query_key_value": ["query_key_value"],
         "dense_h_to_4h": ["dense_h_to_4h"]

View File

@@ -49,7 +49,7 @@ from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
-from .interfaces import SupportsLoRA, SupportsPP
+from .interfaces import SupportsLoRA, SupportsPP, SupportsQuant
 from .utils import (extract_layer_index, is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
@@ -332,7 +332,7 @@ class CohereModel(nn.Module):
         return hidden_states

-class CohereForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
+class CohereForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsQuant):
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",