# SPDX-License-Identifier: Apache-2.0
'''Tests whether bitsandbytes computation is enabled correctly.

Run `pytest tests/quantization/test_bitsandbytes.py`.
'''

import gc

import pytest
import torch

from tests.quantization.utils import is_quant_method_supported

from ..utils import compare_two_settings, create_new_process_for_each_test
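

# Each entry pairs a model id with a short description of the scenario it
# exercises; the description only labels the parametrized test case.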
models_4bit_to_test = [
    ("facebook/opt-125m", "quantize opt model inflight"),
    ("mistralai/Mistral-7B-Instruct-v0.3",
     "quantize inflight model with both HF and Mistral format weights")
]

models_pre_quant_4bit_to_test = [
    ('PrunaAI/Einstein-v6.1-Llama3-8B-bnb-4bit-smashed',
     'read pre-quantized 4-bit FP4 model'),
    ('poedator/opt-125m-bnb-4bit', 'read pre-quantized 4-bit NF4 opt model'),
]

models_pre_quant_8bit_to_test = [
    ('meta-llama/Llama-Guard-3-8B-INT8',
     'read pre-quantized llama 8-bit model'),
    ("yec019/fbopt-350m-8bit", "read pre-quantized 8-bit opt model"),
]
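

# Quantize an unquantized checkpoint on the fly with 4-bit bitsandbytes and
# check that vLLM's greedy output matches the HF `load_in_4bit` reference.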
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
                    reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
@create_new_process_for_each_test()
def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                             model_name, description) -> None:

    hf_model_kwargs = {"load_in_4bit": True}
    validate_generated_texts(hf_runner, vllm_runner, example_prompts[:1],
                             model_name, hf_model_kwargs)
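

# Load checkpoints that were already quantized to 4-bit with bitsandbytes
# (FP4 and NF4) and compare vLLM's greedy output against the HF reference.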
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
                    reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description",
                         models_pre_quant_4bit_to_test)
@create_new_process_for_each_test()
def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                                       model_name, description) -> None:

    validate_generated_texts(hf_runner, vllm_runner, example_prompts[:1],
                             model_name)
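

# Load checkpoints that were already quantized to 8-bit with bitsandbytes and
# compare vLLM's greedy output against the HF reference.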
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
                    reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description",
                         models_pre_quant_8bit_to_test)
@create_new_process_for_each_test()
def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                             model_name, description) -> None:

    validate_generated_texts(hf_runner, vllm_runner, example_prompts[:1],
                             model_name)
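

# Inflight 4-bit quantization with tensor parallelism across two GPUs; the
# tensor-parallel vLLM output must still match the single-GPU HF reference.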
@pytest.mark.skipif(torch.cuda.device_count() < 2,
                    reason='Test requires at least 2 GPUs.')
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
                    reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
@create_new_process_for_each_test()
def test_load_tp_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                                model_name, description) -> None:

    hf_model_kwargs = {"load_in_4bit": True}
    validate_generated_texts(hf_runner,
                             vllm_runner,
                             example_prompts[:1],
                             model_name,
                             hf_model_kwargs,
                             vllm_tp_size=2)
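

# Pipeline-parallel variant: run the same 4-bit bitsandbytes model with and
# without --pipeline-parallel-size 2 and let compare_two_settings verify that
# both configurations produce consistent outputs.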
@pytest.mark.skipif(torch.cuda.device_count() < 2,
                    reason='Test requires at least 2 GPUs.')
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
                    reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
@create_new_process_for_each_test()
def test_load_pp_4bit_bnb_model(model_name, description) -> None:

    common_args = [
        "--disable-log-stats",
        "--disable-log-requests",
        "--dtype",
        "bfloat16",
        "--enable-prefix-caching",
        "--quantization",
        "bitsandbytes",
        "--gpu-memory-utilization",
        "0.7",
    ]
    pp_args = [
        *common_args,
        "--pipeline-parallel-size",
        "2",
    ]
    compare_two_settings(model_name, common_args, pp_args)
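

# Collect (prompt, runner_name, generated_text) records so mismatches can be
# reported with full context.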
def log_generated_texts(prompts, outputs, runner_name):
    logged_texts = []
    for i, (_, generated_text) in enumerate(outputs):
        log_entry = {
            "prompt": prompts[i],
            "runner_name": runner_name,
            "generated_text": generated_text,
        }
        logged_texts.append(log_entry)
    return logged_texts
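

# Generate greedily with both the vLLM and HF runners and assert the decoded
# strings are identical for every prompt.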
def validate_generated_texts(hf_runner,
                             vllm_runner,
                             prompts,
                             model_name,
                             hf_model_kwargs=None,
                             vllm_tp_size=1):

    # NOTE: run vLLM first, as it requires a clean process
    # when using distributed inference
    with vllm_runner(model_name,
                     quantization='bitsandbytes',
                     tensor_parallel_size=vllm_tp_size,
                     enforce_eager=False) as llm:
        vllm_outputs = llm.generate_greedy(prompts, 8)
        vllm_logs = log_generated_texts(prompts, vllm_outputs, "VllmRunner")

    # Clean up the GPU memory for the next test
    gc.collect()
    torch.cuda.empty_cache()

    if hf_model_kwargs is None:
        hf_model_kwargs = {}

    # Run with HF runner
    with hf_runner(model_name, model_kwargs=hf_model_kwargs) as llm:
        hf_outputs = llm.generate_greedy(prompts, 8)
        hf_logs = log_generated_texts(prompts, hf_outputs, "HfRunner")

    # Clean up the GPU memory for the next test
    gc.collect()
    torch.cuda.empty_cache()

    # Compare the generated strings
    for hf_log, vllm_log in zip(hf_logs, vllm_logs):
        hf_str = hf_log["generated_text"]
        vllm_str = vllm_log["generated_text"]
        prompt = hf_log["prompt"]

        assert hf_str == vllm_str, (f"Model: {model_name}\n"
                                    f"Mismatch between HF and vLLM outputs:\n"
                                    f"Prompt: {prompt}\n"
                                    f"HF Output: '{hf_str}'\n"
                                    f"vLLM Output: '{vllm_str}'")
|