diff --git a/docs/source/features/quantization/bnb.md b/docs/source/features/quantization/bnb.md
index 7525e8e7..b81d89c4 100644
--- a/docs/source/features/quantization/bnb.md
+++ b/docs/source/features/quantization/bnb.md
@@ -25,7 +25,7 @@ import torch
 # unsloth/tinyllama-bnb-4bit is a pre-quantized checkpoint.
 model_id = "unsloth/tinyllama-bnb-4bit"
 llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \
-quantization="bitsandbytes", load_format="bitsandbytes")
+quantization="bitsandbytes")
 ```
 
 ## Inflight quantization: load as 4bit quantization
@@ -35,7 +35,7 @@ from vllm import LLM
 import torch
 model_id = "huggyllama/llama-7b"
 llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \
-quantization="bitsandbytes", load_format="bitsandbytes")
+quantization="bitsandbytes")
 ```
 
 ## OpenAI Compatible Server
@@ -43,5 +43,5 @@ quantization="bitsandbytes", load_format="bitsandbytes")
 Append the following to your 4bit model arguments:
 
 ```console
---quantization bitsandbytes --load-format bitsandbytes
+--quantization bitsandbytes
 ```
diff --git a/examples/offline_inference/lora_with_quantization_inference.py b/examples/offline_inference/lora_with_quantization_inference.py
index a4097350..ab235ddd 100644
--- a/examples/offline_inference/lora_with_quantization_inference.py
+++ b/examples/offline_inference/lora_with_quantization_inference.py
@@ -83,7 +83,6 @@ def initialize_engine(model: str, quantization: str,
         engine_args = EngineArgs(model=model,
                                  quantization=quantization,
                                  qlora_adapter_name_or_path=lora_repo,
-                                 load_format="bitsandbytes",
                                  enable_lora=True,
                                  max_lora_rank=64)
     else:
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index bbe780a0..88d70acb 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1170,22 +1170,15 @@ class EngineArgs:
         )
 
     def create_load_config(self) -> LoadConfig:
-        # bitsandbytes quantization needs a specific model loader
-        # so we make sure the quant method and the load format are consistent
-        if (self.quantization == "bitsandbytes" or
-           self.qlora_adapter_name_or_path is not None) and \
-           self.load_format != "bitsandbytes":
-            raise ValueError(
-                "BitsAndBytes quantization and QLoRA adapter only support "
-                f"'bitsandbytes' load format, but got {self.load_format}")
-
-        if (self.load_format == "bitsandbytes" or
-           self.qlora_adapter_name_or_path is not None) and \
+        if (self.qlora_adapter_name_or_path is not None) and \
            self.quantization != "bitsandbytes":
             raise ValueError(
-                "BitsAndBytes load format and QLoRA adapter only support "
+                "QLoRA adapter only supports "
                 f"'bitsandbytes' quantization, but got {self.quantization}")
+
+        if self.quantization == "bitsandbytes":
+            self.load_format = "bitsandbytes"
 
         return LoadConfig(
             load_format=self.load_format,
             download_dir=self.download_dir,
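
For illustration only (not part of the patch): a minimal sketch of the behavior after this change, using the pre-quantized checkpoint already named in the docs above. `create_load_config()` now coerces the load format itself whenever `quantization="bitsandbytes"` is requested, so callers no longer pass `load_format` explicitly.

```python
# Minimal sketch of the post-patch behavior; the model id is the
# pre-quantized checkpoint used in the docs hunk above.
from vllm import EngineArgs

args = EngineArgs(model="unsloth/tinyllama-bnb-4bit",
                  quantization="bitsandbytes")
load_config = args.create_load_config()
# create_load_config() now sets the load format from the quantization
# method, so "bitsandbytes" is used even though it was never passed.
print(load_config.load_format)
```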