[BugFix] Stop silent failures on compressed-tensors parsing (#9381)
This commit is contained in:
parent: 343f8e0905
commit: 48138a8415
@@ -31,4 +31,4 @@ pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
 einops # Required for Qwen2-VL.
-compressed-tensors == 0.6.0 # required for compressed-tensors
+compressed-tensors == 0.7.1 # required for compressed-tensors
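Note on the pin bump: a quick way to confirm the new floor in a local environment is a runtime guard. This is a hypothetical check, not part of the commit; it assumes the third-party packaging library is installed.

# Hypothetical guard (not in the commit): fail fast if an older
# compressed-tensors release is installed, mirroring the new pin.
from importlib.metadata import version
from packaging.version import Version

installed = Version(version("compressed-tensors"))
assert installed >= Version("0.7.1"), (
    f"compressed-tensors >= 0.7.1 required, found {installed}")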
@@ -100,12 +100,21 @@ class CompressedTensorsConfig(QuantizationConfig):
                 target_scheme_map[target][
                     "weights"] = QuantizationArgs.parse_obj(
                         quant_config.get("weights"))
-                try:
-                    target_scheme_map[target][
-                        "input_activations"] = QuantizationArgs.parse_obj(
-                            quant_config.get("input_activations"))
-                except Exception:
-                    target_scheme_map[target]["input_activations"] = None
+
+                target_scheme_map[target]["input_activations"] = None
+                if is_activation_quantization_format(quant_format):
+                    input_activations = quant_config.get("input_activations")
+                    # The only case where we have activation quant supported
+                    # but no input_activations provided in the config
+                    # should be w8a16fp8. w8a16fp8 can also run for cases where
+                    # there is an input_quant but it is ignored.
+                    if not input_activations:
+                        assert target_scheme_map[target][
+                            "weights"].type == QuantizationType.FLOAT
+                    else:
+                        target_scheme_map[target][
+                            "input_activations"] = QuantizationArgs.parse_obj(
+                                quant_config.get("input_activations"))
 
         return cls(target_scheme_map=target_scheme_map,
                    ignore=ignore,
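This hunk is the core of the fix: the old blanket try/except swallowed every parsing error and coerced input_activations to None, so a malformed config changed which scheme was selected instead of failing fast. A minimal sketch of the difference, not vLLM source: parse_args below is a hypothetical stand-in for QuantizationArgs.parse_obj.

def parse_args(obj):
    # Stand-in parser: rejects anything that is not well-formed.
    if not isinstance(obj, dict) or "num_bits" not in obj:
        raise ValueError(f"malformed quantization args: {obj!r}")
    return obj

cfg = {"input_activations": {"numbits": 8}}  # typo: should be "num_bits"

# Old behavior: the typo is swallowed and the scheme silently degrades.
try:
    input_activations = parse_args(cfg.get("input_activations"))
except Exception:
    input_activations = None  # silent failure; wrong scheme chosen later

# New behavior: only a genuinely absent entry is tolerated; a typo raises.
raw = cfg.get("input_activations")
if not raw:
    input_activations = None  # explicitly allowed (the w8a16fp8 case)
else:
    input_activations = parse_args(raw)  # ValueError surfaces immediately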
@@ -244,8 +253,6 @@ class CompressedTensorsConfig(QuantizationConfig):
                 group_size=weight_quant.group_size,
                 actorder=weight_quant.actorder)
 
-        # Detect If Activation Quantization.
-        # TODO @dsikka: clean-up conditions
         if is_activation_quantization_format(self.quant_format):
             if self._is_fp8_w8a8(weight_quant, input_quant):
                 is_fp8_w8a8_supported = self._check_scheme_supported(
@@ -256,16 +263,19 @@ class CompressedTensorsConfig(QuantizationConfig):
                     is_static_input_scheme=(input_quant
                                             and not input_quant.dynamic))
                 else:
+                    # note: input_quant will be present for converted models;
+                    # will be ignored during inference post loading
                     return CompressedTensorsW8A16Fp8(
                         strategy=weight_quant.strategy,
-                        is_static_input_scheme=(input_quant
-                                                and not input_quant.dynamic))
+                        is_static_input_scheme=not input_quant.dynamic)
 
+        # note: input_quant can be None
         if self._is_fp8_w8a16(weight_quant, input_quant):
+            is_static_input_scheme = (input_quant
+                                      and not input_quant.dynamic)
             return CompressedTensorsW8A16Fp8(
                 strategy=weight_quant.strategy,
-                is_static_input_scheme=(input_quant
-                                        and not input_quant.dynamic))
+                is_static_input_scheme=is_static_input_scheme)
 
         if self._is_static_tensor_w8a8(weight_quant, input_quant):
             return CompressedTensorsW8A8Int8(
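A side note on the refactored keyword argument (an observation, not part of the commit): in the _is_fp8_w8a16 branch input_quant can be None, so the short-circuit expression evaluates to None rather than False. Both are falsy, so the scheme still reads as "not static", but naming the intermediate makes that explicit. A quick illustration with a hypothetical _Quant stand-in:

# Python's `and` returns an operand, not a bool.
class _Quant:
    dynamic = True

input_quant = None
is_static_input_scheme = (input_quant and not input_quant.dynamic)
print(is_static_input_scheme)        # None (falsy: treated as not static)
print(bool(is_static_input_scheme))  # False

input_quant = _Quant()
print(input_quant and not input_quant.dynamic)  # False (dynamic input)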