[Misc] Revert compressed-tensors code reuse (#7521)
parent 951fdd66d3
commit f55a9aea45
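
This revert drops the external compressed-tensors package as a runtime dependency: the pin is removed from requirements-common.txt, the test requirement moves back to compressed-tensors==0.4.0, local copies of CompressionFormat, QuantizationType, QuantizationStrategy, and QuantizationArgs are restored in vllm/model_executor/layers/quantization/compressed_tensors/utils.py, and the config and scheme modules import those names from the local module instead of the external package.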

requirements-common.txt
@@ -24,4 +24,3 @@ librosa # Required for audio processing
 soundfile # Required for audio processing
 gguf == 0.9.1
 importlib_metadata
-compressed-tensors == 0.5.0

requirements-test.txt
@@ -17,7 +17,7 @@ peft
 requests
 ray
 sentence-transformers # required for embedding
-compressed-tensors==0.5.0 # required for compressed-tensors
+compressed-tensors==0.4.0 # required for compressed-tensors
 timm # required for internvl test
 
 # TODO: Add this after fully implementing llava(mantis)

tests/quantization/test_compressed_tensors.py
@@ -5,12 +5,13 @@ Run `pytest tests/quantization/test_compressed_tensors.py`.
 
 import pytest
 import torch
-from compressed_tensors.quantization import QuantizationType
 
 from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
     CompressedTensorsLinearMethod, CompressedTensorsW4A16Sparse24,
     CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8,
     CompressedTensorsW8A16Fp8, CompressedTensorsWNA16)
+from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
+    QuantizationType)
 
 
 @pytest.mark.parametrize("model_args", [

vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -1,10 +1,6 @@
 from typing import Any, Dict, List, Optional
 
 import torch
-from compressed_tensors.config import CompressionFormat
-from compressed_tensors.quantization import (QuantizationArgs,
-                                             QuantizationStrategy,
-                                             QuantizationType)
 from pydantic import BaseModel
 
 from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
@@ -17,7 +13,8 @@ from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsW8A8Int8, CompressedTensorsW8A16Fp8,
     CompressedTensorsWNA16)
 from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
-    find_matched_target, is_activation_quantization_format,
+    CompressionFormat, QuantizationArgs, QuantizationStrategy,
+    QuantizationType, find_matched_target, is_activation_quantization_format,
     should_ignore_layer)
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
 from vllm.platforms import current_platform

vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py
@@ -1,10 +1,11 @@
 from typing import Callable, List, Optional
 
 import torch
-from compressed_tensors.quantization import QuantizationStrategy
 
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme)
+from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
+    QuantizationStrategy)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
     apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (

vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
@@ -1,11 +1,12 @@
 from typing import Callable, List, Optional
 
 import torch
-from compressed_tensors.quantization import QuantizationStrategy
 from torch.nn import Parameter
 
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme)
+from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
+    QuantizationStrategy)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     apply_fp8_linear, cutlass_fp8_supported, requantize_with_max_scale)
 from vllm.model_executor.parameter import (ChannelQuantScaleParameter,

vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
@@ -1,11 +1,12 @@
 from typing import Callable, List, Optional
 
 import torch
-from compressed_tensors.quantization import QuantizationStrategy
 from torch.nn import Parameter
 
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme)
+from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
+    QuantizationStrategy)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     apply_int8_linear, convert_to_channelwise)
 from vllm.model_executor.parameter import (BasevLLMParameter,

vllm/model_executor/layers/quantization/compressed_tensors/utils.py
@@ -1,13 +1,85 @@
 import re
-from typing import Iterable, Optional
+from enum import Enum
+from typing import Any, Dict, Iterable, Optional
 
-from compressed_tensors import CompressionFormat
+from pydantic import BaseModel, Field
 from torch.nn import Module
 
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     FUSED_LAYER_NAME_MAPPING)
 
 
+class CompressionFormat(Enum):
+    dense = "dense"
+    sparse_bitmask = "sparse-bitmask"
+    naive_quantized = "naive-quantized"
+    float_quantized = "float-quantized"
+    int_quantized = "int-quantized"
+    pack_quantized = "pack-quantized"
+    marlin_24 = "marlin-24"
+
+
+class QuantizationType(str, Enum):
+    """
+    Enum storing quantization type options
+    """
+
+    INT = "int"
+    FLOAT = "float"
+
+
+class QuantizationStrategy(str, Enum):
+    """
+    Enum storing quantization strategy options
+    """
+
+    TENSOR = "tensor"
+    CHANNEL = "channel"
+    GROUP = "group"
+    BLOCK = "block"
+    TOKEN = "token"
+
+
+class QuantizationArgs(BaseModel):
+    """
+    User-facing arguments used to define a quantization config
+    for weights or activations
+
+    :param num_bits: quantization bit depth
+    :param type: dtype to quantize to, either int or float
+    :param symmetric: whether or not the quantization scale is symmetric
+    :param strategy: string determining the scope of scale/zero-point to apply
+    :param group_size: group length to use for the group strategy
+    :param block_structure: 2d block structure to use for the block
+        strategy; must be of the format "2x4", "8x16", etc.
+    :param dynamic: set True to perform dynamic quantization -
+        values will not be calibrated during a calibration phase;
+        instead, new quantization ranges will be observed with
+        every sample during inference. Defaults to False for static
+        quantization. Note that enabling dynamic quantization
+        will change the default observer to a memoryless one.
+    """
+
+    num_bits: int = 8
+    type: QuantizationType = QuantizationType.INT
+    symmetric: bool = True
+    group_size: Optional[int] = None
+    strategy: Optional[QuantizationStrategy] = None
+    block_structure: Optional[str] = None
+    dynamic: bool = False
+    observer: str = Field(
+        default="minmax",
+        description=("The class to use to compute the quantization params - "
+                     "scale and zero-point"),
+    )
+    observer_kwargs: Dict[str, Any] = Field(
+        default_factory=dict,
+        description=
+        ("optional dict of kwargs to be passed directly to torch quantization "
+         "Observers constructor, excluding quantization range or symmetry"),
+    )
+
+
 def is_activation_quantization_format(format: str) -> bool:
     _ACTIVATION_QUANTIZATION_FORMATS = [
         CompressionFormat.naive_quantized.value,