vllm/tests/kernels/test_gguf.py

from pathlib import Path
from typing import List

import pytest
import torch
from gguf import GGMLQuantizationType, GGUFReader, ReaderTensor, dequantize
from huggingface_hub import snapshot_download

import vllm._custom_ops as ops
from vllm.utils import seed_everything

# Snapshot of the Hugging Face repo holding the sample GGUF files used by the
# tests below. The download is triggered when this module is imported, and the
# result is used as the directory in which the sample files are looked up.
GGUF_SAMPLE = snapshot_download("Isotr0py/test-gguf-sample")


def get_gguf_sample_tensors(
        hidden_size: int,
        quant_type: GGMLQuantizationType) -> List[ReaderTensor]:
    """Read the tensors of one sample GGUF file.

    The file is located inside the ``GGUF_SAMPLE`` directory and is named
    ``Quant_<quant_type.name>_<hidden_size>.gguf``.

    Args:
        hidden_size: Hidden size encoded in the sample file name.
        quant_type: Quantization type whose name is encoded in the file name.

    Returns:
        The ``tensors`` attribute of a ``GGUFReader`` opened on that file.
    """
    gguf_name = f"Quant_{quant_type.name}_{hidden_size}.gguf"
    reader = GGUFReader(Path(GGUF_SAMPLE) / gguf_name)
    return reader.tensors


# Activation / comparison dtypes the tests are parametrized over.
DTYPES = [torch.half]
# Hidden sizes to test. Each value must have a matching sample file in the
# HF repo; the repo currently provides files for `hidden_size = 256, 1024`.
HIDDEN_SIZES = [256, 1024]
NUM_TOKENS = [7, 83, 128, 2048]  # Arbitrary values for testing
# NOTE(review): SEEDS is not referenced by the tests visible in this file,
# which call `seed_everything(0)` directly.
SEEDS = [0]
# Quantization types covered by the dequantize and mmvq tests.
QUANT_TYPES = [
    # i-matrix
    GGMLQuantizationType.IQ1_M,
    GGMLQuantizationType.IQ1_S,
    GGMLQuantizationType.IQ2_S,
    GGMLQuantizationType.IQ2_XS,
    GGMLQuantizationType.IQ3_S,
    GGMLQuantizationType.IQ3_XXS,
    GGMLQuantizationType.IQ4_NL,
    GGMLQuantizationType.IQ4_XS,
    # k-quants
    GGMLQuantizationType.Q2_K,
    GGMLQuantizationType.Q3_K,
    GGMLQuantizationType.Q4_K,
    GGMLQuantizationType.Q5_K,
    GGMLQuantizationType.Q6_K,
    # standard quantization
    GGMLQuantizationType.Q4_0,
    GGMLQuantizationType.Q5_0,
    GGMLQuantizationType.Q8_0,
]


@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("quant_type", QUANT_TYPES)
@torch.inference_mode()
def test_dequantize(hidden_size: int, dtype: torch.dtype,
                    quant_type: GGMLQuantizationType):
    """Compare `ops.ggml_dequantize` with gguf's reference `dequantize`.

    Every tensor of the sample file for (`hidden_size`, `quant_type`) is
    dequantized both ways and the results, cast to `dtype`, must be close.
    """
    for sample in get_gguf_sample_tensors(hidden_size, quant_type):
        # The last "_"-separated field of the tensor name is parsed as
        # "x"-separated integers and passed on as the output dimensions.
        dims_field = sample.name.split("_")[-1]
        dims = [int(dim) for dim in dims_field.split("x")]

        expected = torch.tensor(dequantize(sample.data, quant_type),
                                device="cuda").to(dtype)

        packed = torch.tensor(sample.data, device="cuda")
        actual = ops.ggml_dequantize(packed, quant_type, *dims).to(dtype)

        torch.testing.assert_close(actual, expected, atol=1e-2, rtol=4e-2)


@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("quant_type", QUANT_TYPES)
@torch.inference_mode()
def test_mmvq(hidden_size: int, dtype: torch.dtype,
              quant_type: GGMLQuantizationType):
    """Compare `ops.ggml_mul_mat_vec_a8` with a dequantized matmul.

    A single random input row of width `hidden_size` is multiplied by each
    sample tensor, once through the quantized op and once through a plain
    matmul against the reference-dequantized weight.
    """
    seed_everything(0)

    samples = get_gguf_sample_tensors(hidden_size, quant_type)
    x = torch.rand((1, hidden_size), dtype=dtype, device="cuda")
    for sample in samples:
        # Reference path: dequantize with gguf, then a regular matmul.
        dense_weight = torch.tensor(dequantize(sample.data, quant_type),
                                    device="cuda").to(dtype)
        expected = torch.matmul(x, dense_weight.T)

        # Path under test: feed the raw quantized data to the custom op.
        packed = torch.tensor(sample.data, device="cuda")
        first_dim = packed.shape[0]
        actual = ops.ggml_mul_mat_vec_a8(packed, x, quant_type,
                                         first_dim).to(dtype)

        torch.testing.assert_close(actual, expected, atol=1, rtol=1e-1)


@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize(
    "quant_type",
    [
        # k-quants
        GGMLQuantizationType.Q2_K,
        GGMLQuantizationType.Q3_K,
        GGMLQuantizationType.Q4_K,
        GGMLQuantizationType.Q5_K,
        GGMLQuantizationType.Q6_K,
        # standard quants
        GGMLQuantizationType.Q4_0,
        GGMLQuantizationType.Q5_0,
        GGMLQuantizationType.Q8_0,
    ])
@torch.inference_mode()
def test_mmq(num_tokens: int, hidden_size: int, dtype: torch.dtype,
             quant_type: GGMLQuantizationType):
    """Compare `ops.ggml_mul_mat_a8` with a dequantized matmul.

    A random `(num_tokens, hidden_size)` input is multiplied by each sample
    tensor, once through the quantized op and once through a plain matmul
    against the reference-dequantized weight.
    """
    seed_everything(0)

    samples = get_gguf_sample_tensors(hidden_size, quant_type)
    x = torch.rand((num_tokens, hidden_size), dtype=dtype, device="cuda")
    for sample in samples:
        # Reference path: dequantize with gguf, then a regular matmul.
        dense_weight = torch.tensor(dequantize(sample.data, quant_type),
                                    device="cuda").to(dtype)
        expected = torch.matmul(x, dense_weight.T)

        # Path under test: feed the raw quantized data to the custom op.
        packed = torch.tensor(sample.data, device="cuda")
        first_dim = packed.shape[0]
        actual = ops.ggml_mul_mat_a8(packed, x, quant_type,
                                     first_dim).to(dtype)

        torch.testing.assert_close(actual, expected, atol=1, rtol=1e-1)
[Bugfix][Kernel] Add `IQ1_M` quantization implementation to GGUF kernel (#8357) 2024-09-16 06:51:44 +08:00			`from pathlib import Path`
			`from typing import List`

			`import pytest`
			`import torch`
			`from gguf import GGMLQuantizationType, GGUFReader, ReaderTensor, dequantize`
			`from huggingface_hub import snapshot_download`

			`import vllm._custom_ops as ops`
[CI/Build] Avoid CUDA initialization (#8534) 2024-09-18 18:38:11 +08:00			`from vllm.utils import seed_everything`
[Bugfix][Kernel] Add `IQ1_M` quantization implementation to GGUF kernel (#8357) 2024-09-16 06:51:44 +08:00
			`GGUF_SAMPLE = snapshot_download("Isotr0py/test-gguf-sample")`


			`def get_gguf_sample_tensors(`
			`hidden_size: int,`
			`quant_type: GGMLQuantizationType) -> List[ReaderTensor]:`
			`sample_dir = GGUF_SAMPLE`
			`filename = f"Quant_{quant_type.name}_{hidden_size}.gguf"`
			`sample_file = Path(sample_dir) / filename`
			`return GGUFReader(sample_file).tensors`


			`DTYPES = [torch.half]`
			`# Hidden_size for testing, must match the sample file in HF repo,`
			# we have `hidden_size = 256, 1024` for test in HF repo currently.
			`HIDDEN_SIZES = [256, 1024]`
			`NUM_TOKENS = [7, 83, 128, 2048] # Arbitrary values for testing`
			`SEEDS = [0]`
			`QUANT_TYPES = [`
			`# i-matrix`
			`GGMLQuantizationType.IQ1_M,`
			`GGMLQuantizationType.IQ1_S,`
			`GGMLQuantizationType.IQ2_S,`
			`GGMLQuantizationType.IQ2_XS,`
			`GGMLQuantizationType.IQ3_S,`
			`GGMLQuantizationType.IQ3_XXS,`
			`GGMLQuantizationType.IQ4_NL,`
			`GGMLQuantizationType.IQ4_XS,`
			`# k-quants`
			`GGMLQuantizationType.Q2_K,`
			`GGMLQuantizationType.Q3_K,`
			`GGMLQuantizationType.Q4_K,`
			`GGMLQuantizationType.Q5_K,`
			`GGMLQuantizationType.Q6_K,`
			`# standard quantization`
			`GGMLQuantizationType.Q4_0,`
			`GGMLQuantizationType.Q5_0,`
			`GGMLQuantizationType.Q8_0,`
			`]`


			`@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)`
			`@pytest.mark.parametrize("dtype", DTYPES)`
			`@pytest.mark.parametrize("quant_type", QUANT_TYPES)`
			`@torch.inference_mode()`
			`def test_dequantize(hidden_size: int, dtype: torch.dtype,`
			`quant_type: GGMLQuantizationType):`
			`tensors = get_gguf_sample_tensors(hidden_size, quant_type)`
			`for tensor in tensors:`
			`shape_str = tensor.name.split("_")[-1]`
			`shape = map(int, shape_str.split("x"))`

			`ref_output = torch.tensor(dequantize(tensor.data, quant_type),`
			`device="cuda").to(dtype)`
			`output = ops.ggml_dequantize(torch.tensor(tensor.data, device="cuda"),`
			`quant_type, *list(shape)).to(dtype)`

			`torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=4e-2)`


			`@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)`
			`@pytest.mark.parametrize("dtype", DTYPES)`
			`@pytest.mark.parametrize("quant_type", QUANT_TYPES)`
			`@torch.inference_mode()`
			`def test_mmvq(hidden_size: int, dtype: torch.dtype,`
			`quant_type: GGMLQuantizationType):`
[CI/Build] Avoid CUDA initialization (#8534) 2024-09-18 18:38:11 +08:00			`seed_everything(0)`
[Bugfix][Kernel] Add `IQ1_M` quantization implementation to GGUF kernel (#8357) 2024-09-16 06:51:44 +08:00
			`tensors = get_gguf_sample_tensors(hidden_size, quant_type)`
			`x = torch.rand((1, hidden_size), dtype=dtype, device="cuda")`
			`for tensor in tensors:`
			`weight = torch.tensor(dequantize(tensor.data, quant_type),`
			`device="cuda").to(dtype)`
			`ref_output = x @ weight.T`

			`qweight = torch.tensor(tensor.data, device="cuda")`
			`output = ops.ggml_mul_mat_vec_a8(qweight, x, quant_type,`
			`qweight.shape[0]).to(dtype)`

			`torch.testing.assert_close(output, ref_output, atol=1, rtol=1e-1)`


			`@pytest.mark.parametrize("num_tokens", NUM_TOKENS)`
			`@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)`
			`@pytest.mark.parametrize("dtype", DTYPES)`
			`@pytest.mark.parametrize(`
			`"quant_type",`
			`[`
			`# k-quants`
			`GGMLQuantizationType.Q2_K,`
			`GGMLQuantizationType.Q3_K,`
			`GGMLQuantizationType.Q4_K,`
			`GGMLQuantizationType.Q5_K,`
			`GGMLQuantizationType.Q6_K,`
			`# standard quants`
			`GGMLQuantizationType.Q4_0,`
			`GGMLQuantizationType.Q5_0,`
			`GGMLQuantizationType.Q8_0,`
			`])`
			`@torch.inference_mode()`
			`def test_mmq(num_tokens: int, hidden_size: int, dtype: torch.dtype,`
			`quant_type: GGMLQuantizationType):`
[CI/Build] Avoid CUDA initialization (#8534) 2024-09-18 18:38:11 +08:00			`seed_everything(0)`
[Bugfix][Kernel] Add `IQ1_M` quantization implementation to GGUF kernel (#8357) 2024-09-16 06:51:44 +08:00
			`tensors = get_gguf_sample_tensors(hidden_size, quant_type)`
			`x = torch.rand((num_tokens, hidden_size), dtype=dtype, device="cuda")`
			`for tensor in tensors:`
			`weight = torch.tensor(dequantize(tensor.data, quant_type),`
			`device="cuda").to(dtype)`
			`ref_output = x @ weight.T`

			`qweight = torch.tensor(tensor.data, device="cuda")`
			`output = ops.ggml_mul_mat_a8(qweight, x, quant_type,`
			`qweight.shape[0]).to(dtype)`

			`torch.testing.assert_close(output, ref_output, atol=1, rtol=1e-1)`