[SupportsQuant] Chameleon, Chatglm, Commandr (#15952)
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
parent 421c462948
commit 82e7e19a6e
vllm/model_executor/models/chameleon.py

@@ -38,7 +38,8 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
 from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors
 
-from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
+from .interfaces import (MultiModalEmbeddings, SupportsMultiModal, SupportsPP,
+                         SupportsQuant)
 from .utils import (flatten_bn, is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix, merge_multimodal_embeddings)
@@ -927,7 +928,11 @@ class ChameleonModel(nn.Module):
                                         info=ChameleonProcessingInfo,
                                         dummy_inputs=ChameleonDummyInputsBuilder)
 class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal,
-                                        SupportsPP):
+                                        SupportsPP, SupportsQuant):
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"]
+    }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
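The hunk above mixes SupportsQuant into ChameleonForConditionalGeneration and declares packed_modules_mapping, which records how checkpoint projections (q/k/v, gate/up) are fused into single runtime modules. A minimal sketch of the pattern follows; SupportsQuantSketch and unpack_module_name are hypothetical stand-ins for illustration, not vLLM's actual mixin internals.

# Minimal sketch of the packed-modules idea; names here are
# hypothetical stand-ins, not vLLM's real SupportsQuant mixin.
from typing import ClassVar


class SupportsQuantSketch:
    # Maps a fused runtime module name to the checkpoint projections
    # that were merged into it. Empty by default (nothing is fused).
    packed_modules_mapping: ClassVar[dict[str, list[str]]] = {}

    @classmethod
    def unpack_module_name(cls, name: str) -> list[str]:
        # "qkv_proj" -> ["q_proj", "k_proj", "v_proj"];
        # names that are not fused map to themselves.
        return cls.packed_modules_mapping.get(name, [name])


class ChameleonLike(SupportsQuantSketch):
    packed_modules_mapping = {
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
        "gate_up_proj": ["gate_proj", "up_proj"],
    }


assert ChameleonLike.unpack_module_name("qkv_proj") == [
    "q_proj", "k_proj", "v_proj"
]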
vllm/model_executor/models/chatglm.py

@@ -29,7 +29,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs import ChatGLMConfig
 
-from .interfaces import SupportsLoRA, SupportsPP
+from .interfaces import SupportsLoRA, SupportsPP, SupportsQuant
 from .utils import (AutoWeightsLoader, WeightsMapper, is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
@@ -295,7 +295,11 @@ class GLMTransformer(nn.Module):
 
 
 @support_torch_compile
-class ChatGLMModel(nn.Module):
+class ChatGLMModel(nn.Module, SupportsQuant):
+    packed_modules_mapping = {
+        "linear_proj.merged_proj":
+        ["linear_proj.gate_proj", "linear_proj.dense_h_to_4h"]
+    }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
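For ChatGLMModel, the single mapping entry says the projector's gate_proj and dense_h_to_4h checkpoint tensors are loaded into one merged_proj module. A hedged sketch of how a weight loader could consult such a mapping to route unfused checkpoint tensors into shards of the fused parameter; route_weight is illustrative only, not vLLM's loader.

# Illustrative helper only; vLLM's actual weight loading differs.
from typing import Optional, Tuple

packed_modules_mapping = {
    "linear_proj.merged_proj":
    ["linear_proj.gate_proj", "linear_proj.dense_h_to_4h"],
}


def route_weight(name: str) -> Optional[Tuple[str, int]]:
    """Map an unfused checkpoint name to (fused_name, shard_index)."""
    for fused, parts in packed_modules_mapping.items():
        if name in parts:
            return fused, parts.index(name)
    return None  # not part of any fused module; load as-is


assert route_weight("linear_proj.gate_proj") == ("linear_proj.merged_proj", 0)
assert route_weight("linear_proj.dense_h_to_4h") == (
    "linear_proj.merged_proj", 1)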
@@ -395,7 +399,6 @@ class ChatGLMModel(nn.Module):
 
 
 class ChatGLMBaseModel(nn.Module):
-
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_substr={".word_embeddings": ""}, )
 
@@ -452,7 +455,8 @@ class ChatGLMBaseModel(nn.Module):
         return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
 
 
-class ChatGLMForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP):
+class ChatGLMForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP,
+                         SupportsQuant):
     packed_modules_mapping = {
         "query_key_value": ["query_key_value"],
         "dense_h_to_4h": ["dense_h_to_4h"]
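ChatGLMForCausalLM's entries map each name to itself, since ChatGLM checkpoints already store query_key_value and dense_h_to_4h pre-fused; declaring them still marks the modules as packed so per-module quantization schemes resolve consistently. A sketch of that resolution step, with a hypothetical schemes table standing in for a parsed quantization config:

# Hypothetical scheme table and resolver; illustrative only.
packed_modules_mapping = {
    "query_key_value": ["query_key_value"],
    "dense_h_to_4h": ["dense_h_to_4h"],
}

# Pretend per-module schemes parsed from a quantization config.
schemes = {"query_key_value": "w8a8", "dense_h_to_4h": "w8a8"}


def resolve_scheme(fused: str) -> str:
    # Every constituent of a packed module must share one scheme.
    parts = packed_modules_mapping.get(fused, [fused])
    found = {schemes[p] for p in parts}
    if len(found) != 1:
        raise ValueError(f"constituents of {fused} disagree: {found}")
    return found.pop()


assert resolve_scheme("query_key_value") == "w8a8"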
vllm/model_executor/models/commandr.py

@@ -49,7 +49,7 @@ from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
 
-from .interfaces import SupportsLoRA, SupportsPP
+from .interfaces import SupportsLoRA, SupportsPP, SupportsQuant
 from .utils import (extract_layer_index, is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
@@ -332,7 +332,7 @@ class CohereModel(nn.Module):
         return hidden_states
 
 
-class CohereForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
+class CohereForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsQuant):
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
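With SupportsQuant in place, these models should accept quantized checkpoints through vLLM's usual entry point. A hedged usage sketch; the model ID and quantization backend below are placeholders and assume a matching quantized checkpoint exists.

# Placeholders throughout: substitute a model/checkpoint you have.
from vllm import LLM

llm = LLM(
    model="THUDM/chatglm3-6b",  # example ChatGLM id (assumption)
    quantization="gptq",        # requires a GPTQ-quantized checkpoint
    trust_remote_code=True,     # ChatGLM ships custom modeling code
)
out = llm.generate(["Hello"])
print(out[0].outputs[0].text)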