[Misc] Clean up the BitsAndBytes arguments (#15140)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
This commit is contained in:
parent
d3ccbd6350
commit
10f55fe6c5
@ -25,7 +25,7 @@ import torch
|
|||||||
# unsloth/tinyllama-bnb-4bit is a pre-quantized checkpoint.
|
# unsloth/tinyllama-bnb-4bit is a pre-quantized checkpoint.
|
||||||
model_id = "unsloth/tinyllama-bnb-4bit"
|
model_id = "unsloth/tinyllama-bnb-4bit"
|
||||||
llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \
|
llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \
|
||||||
quantization="bitsandbytes", load_format="bitsandbytes")
|
quantization="bitsandbytes")
|
||||||
```
|
```
|
||||||
|
|
||||||
## Inflight quantization: load as 4bit quantization
|
## Inflight quantization: load as 4bit quantization
|
||||||
@ -35,7 +35,7 @@ from vllm import LLM
|
|||||||
import torch
|
import torch
|
||||||
model_id = "huggyllama/llama-7b"
|
model_id = "huggyllama/llama-7b"
|
||||||
llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \
|
llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \
|
||||||
quantization="bitsandbytes", load_format="bitsandbytes")
|
quantization="bitsandbytes")
|
||||||
```
|
```
|
||||||
|
|
||||||
## OpenAI Compatible Server
|
## OpenAI Compatible Server
|
||||||
@ -43,5 +43,5 @@ quantization="bitsandbytes", load_format="bitsandbytes")
|
|||||||
Append the following to your 4bit model arguments:
|
Append the following to your 4bit model arguments:
|
||||||
|
|
||||||
```console
|
```console
|
||||||
--quantization bitsandbytes --load-format bitsandbytes
|
--quantization bitsandbytes
|
||||||
```
|
```
|
||||||
|
@ -83,7 +83,6 @@ def initialize_engine(model: str, quantization: str,
|
|||||||
engine_args = EngineArgs(model=model,
|
engine_args = EngineArgs(model=model,
|
||||||
quantization=quantization,
|
quantization=quantization,
|
||||||
qlora_adapter_name_or_path=lora_repo,
|
qlora_adapter_name_or_path=lora_repo,
|
||||||
load_format="bitsandbytes",
|
|
||||||
enable_lora=True,
|
enable_lora=True,
|
||||||
max_lora_rank=64)
|
max_lora_rank=64)
|
||||||
else:
|
else:
|
||||||
|
@ -1170,22 +1170,15 @@ class EngineArgs:
|
|||||||
)
|
)
|
||||||
|
|
||||||
def create_load_config(self) -> LoadConfig:
|
def create_load_config(self) -> LoadConfig:
|
||||||
# bitsandbytes quantization needs a specific model loader
|
|
||||||
# so we make sure the quant method and the load format are consistent
|
|
||||||
if (self.quantization == "bitsandbytes" or
|
|
||||||
self.qlora_adapter_name_or_path is not None) and \
|
|
||||||
self.load_format != "bitsandbytes":
|
|
||||||
raise ValueError(
|
|
||||||
"BitsAndBytes quantization and QLoRA adapter only support "
|
|
||||||
f"'bitsandbytes' load format, but got {self.load_format}")
|
|
||||||
|
|
||||||
if (self.load_format == "bitsandbytes" or
|
if(self.qlora_adapter_name_or_path is not None) and \
|
||||||
self.qlora_adapter_name_or_path is not None) and \
|
|
||||||
self.quantization != "bitsandbytes":
|
self.quantization != "bitsandbytes":
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"BitsAndBytes load format and QLoRA adapter only support "
|
"QLoRA adapter only support "
|
||||||
f"'bitsandbytes' quantization, but got {self.quantization}")
|
f"'bitsandbytes' quantization, but got {self.quantization}")
|
||||||
|
|
||||||
|
if self.quantization == "bitsandbytes":
|
||||||
|
self.load_format = "bitsandbytes"
|
||||||
return LoadConfig(
|
return LoadConfig(
|
||||||
load_format=self.load_format,
|
load_format=self.load_format,
|
||||||
download_dir=self.download_dir,
|
download_dir=self.download_dir,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user