[Misc] Clean up the BitsAndBytes arguments (#15140)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Jee Jee Li 2025-03-21 10:17:12 +08:00 committed by GitHub
parent d3ccbd6350
commit 10f55fe6c5
3 changed files with 7 additions and 15 deletions
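
In short: `load_format="bitsandbytes"` no longer needs to be passed alongside `quantization="bitsandbytes"`; the load format is now derived from the quantization method in `EngineArgs.create_load_config()`. A minimal sketch of the simplified call, mirroring the doc change below:

```python
import torch
from vllm import LLM

# unsloth/tinyllama-bnb-4bit is a pre-quantized checkpoint; after this change,
# quantization="bitsandbytes" alone selects the matching load format.
llm = LLM(model="unsloth/tinyllama-bnb-4bit",
          dtype=torch.bfloat16,
          trust_remote_code=True,
          quantization="bitsandbytes")
```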

File 1 of 3: BitsAndBytes quantization docs

@@ -25,7 +25,7 @@ import torch
 # unsloth/tinyllama-bnb-4bit is a pre-quantized checkpoint.
 model_id = "unsloth/tinyllama-bnb-4bit"
 llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \
-quantization="bitsandbytes", load_format="bitsandbytes")
+quantization="bitsandbytes")
 ```
 
 ## Inflight quantization: load as 4bit quantization
@@ -35,7 +35,7 @@ from vllm import LLM
 import torch
 model_id = "huggyllama/llama-7b"
 llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \
-quantization="bitsandbytes", load_format="bitsandbytes")
+quantization="bitsandbytes")
 ```
 
 ## OpenAI Compatible Server
@@ -43,5 +43,5 @@ quantization="bitsandbytes", load_format="bitsandbytes")
 Append the following to your 4bit model arguments:
 
 ```console
---quantization bitsandbytes --load-format bitsandbytes
+--quantization bitsandbytes
 ```

File 2 of 3: LoRA-with-quantization example script

@@ -83,7 +83,6 @@ def initialize_engine(model: str, quantization: str,
         engine_args = EngineArgs(model=model,
                                  quantization=quantization,
                                  qlora_adapter_name_or_path=lora_repo,
-                                 load_format="bitsandbytes",
                                  enable_lora=True,
                                  max_lora_rank=64)
     else:
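
For reference, a sketch of the cleaned-up engine construction from the example above; the model and adapter names here are placeholders, since the script receives them as parameters:

```python
from vllm import EngineArgs

# load_format is intentionally omitted: quantization="bitsandbytes" implies it.
engine_args = EngineArgs(
    model="huggyllama/llama-7b",                     # placeholder model
    quantization="bitsandbytes",
    qlora_adapter_name_or_path="org/qlora-adapter",  # placeholder adapter repo
    enable_lora=True,
    max_lora_rank=64)
```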

File 3 of 3: EngineArgs.create_load_config

@@ -1170,22 +1170,15 @@ class EngineArgs:
         )
 
     def create_load_config(self) -> LoadConfig:
-        # bitsandbytes quantization needs a specific model loader
-        # so we make sure the quant method and the load format are consistent
-        if (self.quantization == "bitsandbytes" or
-                self.qlora_adapter_name_or_path is not None) and \
-                self.load_format != "bitsandbytes":
-            raise ValueError(
-                "BitsAndBytes quantization and QLoRA adapter only support "
-                f"'bitsandbytes' load format, but got {self.load_format}")
-
-        if (self.load_format == "bitsandbytes" or
-                self.qlora_adapter_name_or_path is not None) and \
+        if (self.qlora_adapter_name_or_path is not None) and \
                 self.quantization != "bitsandbytes":
             raise ValueError(
-                "BitsAndBytes load format and QLoRA adapter only support "
+                "QLoRA adapter only supports "
                 f"'bitsandbytes' quantization, but got {self.quantization}")
 
+        if self.quantization == "bitsandbytes":
+            self.load_format = "bitsandbytes"
+
         return LoadConfig(
             load_format=self.load_format,
             download_dir=self.download_dir,
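
Net effect of the `create_load_config()` change: instead of rejecting mismatched settings, the load format is now forced to match the quantization method, and only the QLoRA-adapter check remains. A hedged sketch of the resulting behavior, assuming a vLLM build that includes this commit:

```python
from vllm import EngineArgs

# load_format is left at its default ("auto"); the caller never sets it.
args = EngineArgs(model="unsloth/tinyllama-bnb-4bit",
                  quantization="bitsandbytes")

# create_load_config() now rewrites load_format to "bitsandbytes"
# whenever quantization == "bitsandbytes".
load_config = args.create_load_config()
print(load_config.load_format)  # "bitsandbytes" (as a LoadFormat enum member)
```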