diff --git a/docs/source/features/quantization/bnb.md b/docs/source/features/quantization/bnb.md
index fc499e76..e356b99d 100644
--- a/docs/source/features/quantization/bnb.md
+++ b/docs/source/features/quantization/bnb.md
@@ -19,17 +19,20 @@ And usually, these repositories have a config.json file that includes a quantiza
 
 ## Read quantized checkpoint
 
+For pre-quantized checkpoints, vLLM will try to infer the quantization method from the config file, so you don't need to explicitly specify the quantization argument.
+
 ```python
 from vllm import LLM
 import torch
 # unsloth/tinyllama-bnb-4bit is a pre-quantized checkpoint.
 model_id = "unsloth/tinyllama-bnb-4bit"
-llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \
-quantization="bitsandbytes")
+llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True)
 ```
 
 ## Inflight quantization: load as 4bit quantization
 
+For inflight 4bit quantization with BitsAndBytes, you need to explicitly specify the quantization argument.
+
 ```python
 from vllm import LLM
 import torch
@@ -40,7 +43,7 @@ quantization="bitsandbytes")
 
 ## OpenAI Compatible Server
 
-Append the following to your 4bit model arguments:
+Append the following to your model arguments for 4bit inflight quantization:
 
 ```console
 --quantization bitsandbytes
diff --git a/tests/quantization/test_bitsandbytes.py b/tests/quantization/test_bitsandbytes.py
index 533b055e..8d9ae282 100644
--- a/tests/quantization/test_bitsandbytes.py
+++ b/tests/quantization/test_bitsandbytes.py
@@ -41,7 +41,7 @@ def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
 
     hf_model_kwargs = {"load_in_4bit": True}
     validate_generated_texts(hf_runner, vllm_runner, example_prompts[:1],
-                             model_name, hf_model_kwargs)
+                             model_name, False, hf_model_kwargs)
 
 
 @pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
@@ -53,7 +53,7 @@ def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                                        model_name, description) -> None:
 
     validate_generated_texts(hf_runner, vllm_runner, example_prompts[:1],
-                             model_name)
+                             model_name, True)
 
 
 @pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
@@ -65,7 +65,7 @@ def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                              model_name, description) -> None:
 
     validate_generated_texts(hf_runner, vllm_runner, example_prompts[:1],
-                             model_name)
+                             model_name, True)
 
 
 @pytest.mark.skipif(torch.cuda.device_count() < 2,
@@ -82,6 +82,7 @@ def test_load_tp_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                              vllm_runner,
                              example_prompts[:1],
                              model_name,
+                             False,
                              hf_model_kwargs,
                              vllm_tp_size=2)
 
@@ -128,13 +129,14 @@ def validate_generated_texts(hf_runner,
                              vllm_runner,
                              prompts,
                              model_name,
+                             pre_quant=False,
                              hf_model_kwargs=None,
                              vllm_tp_size=1):
 
     # NOTE: run vLLM first, as it requires a clean process
     # when using distributed inference
     with vllm_runner(model_name,
-                     quantization='bitsandbytes',
+                     quantization=None if pre_quant else 'bitsandbytes',
                      tensor_parallel_size=vllm_tp_size,
                      enforce_eager=False) as llm:
        vllm_outputs = llm.generate_greedy(prompts, 8)
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 89c9b674..93dba201 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1275,6 +1275,10 @@ class EngineArgs:
             self.model_loader_extra_config[
                 "qlora_adapter_name_or_path"] = self.qlora_adapter_name_or_path
 
+        # bitsandbytes pre-quantized models need a specific model loader
+        if model_config.quantization == "bitsandbytes":
+            self.quantization = self.load_format = "bitsandbytes"
+
         load_config = self.create_load_config()
 
         prompt_adapter_config = PromptAdapterConfig(
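
To illustrate the `EngineArgs` change above, here is a minimal sketch (not part of the patch) of how the new behavior could be exercised. It assumes `EngineArgs.create_engine_config()` can be called stand-alone with network access to fetch the checkpoint's config, and it reuses the `unsloth/tinyllama-bnb-4bit` checkpoint from the documentation example.

```python
# Minimal sketch, not part of the patch: check that a pre-quantized
# bitsandbytes checkpoint is routed to the bitsandbytes loader even though
# neither quantization nor load_format is passed explicitly.
from vllm.engine.arg_utils import EngineArgs

args = EngineArgs(model="unsloth/tinyllama-bnb-4bit", trust_remote_code=True)
vllm_config = args.create_engine_config()

# The quantization method is inferred from the checkpoint's config.json,
# so both fields should now resolve to "bitsandbytes".
print(args.quantization)                      # expected: bitsandbytes
print(args.load_format)                       # expected: bitsandbytes
print(vllm_config.model_config.quantization)  # expected: bitsandbytes
```

Inflight quantization of an unquantized checkpoint is unaffected: it still requires passing `quantization="bitsandbytes"` explicitly, as the documentation change notes.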