[Bugfix] add hf_token to EngineArgs (#16093)
Signed-off-by: paolovic <paul-philipp.luley@uzh.ch> Co-authored-by: paolovic <paul-philipp.luley@uzh.ch>
This commit is contained in:
parent
3a100b9278
commit
da224daaa9
@ -173,6 +173,9 @@ class ModelConfig:
|
||||
Defaults to True.
|
||||
config_format: The config format which shall be loaded.
|
||||
Defaults to 'auto' which defaults to 'hf'.
|
||||
hf_token: The token to use as HTTP bearer authorization for remote files.
|
||||
If `True`, will use the token generated when running
|
||||
`huggingface-cli login` (stored in `~/.huggingface`).
|
||||
hf_overrides: If a dictionary, contains arguments to be forwarded to the
|
||||
HuggingFace config. If a callable, it is called to update the
|
||||
HuggingFace config.
|
||||
@ -256,6 +259,7 @@ class ModelConfig:
|
||||
limit_mm_per_prompt: Optional[Mapping[str, int]] = None,
|
||||
use_async_output_proc: bool = True,
|
||||
config_format: ConfigFormat = ConfigFormat.AUTO,
|
||||
hf_token: Optional[Union[bool, str]] = None,
|
||||
hf_overrides: Optional[HfOverrides] = None,
|
||||
mm_processor_kwargs: Optional[dict[str, Any]] = None,
|
||||
disable_mm_preprocessor_cache: bool = False,
|
||||
@ -356,7 +360,7 @@ class ModelConfig:
|
||||
self.hf_text_config = get_hf_text_config(self.hf_config)
|
||||
self.encoder_config = self._get_encoder_config()
|
||||
self.hf_image_processor_config = get_hf_image_processor_config(
|
||||
self.model, revision)
|
||||
self.model, hf_token=hf_token, revision=revision)
|
||||
self.dtype = _get_and_verify_dtype(self.hf_config, dtype)
|
||||
self.use_async_output_proc = use_async_output_proc
|
||||
self.mm_processor_kwargs = mm_processor_kwargs
|
||||
|
@ -138,6 +138,7 @@ class EngineArgs:
|
||||
code_revision: Optional[str] = None
|
||||
rope_scaling: Optional[Dict[str, Any]] = None
|
||||
rope_theta: Optional[float] = None
|
||||
hf_token: Optional[Union[bool, str]] = None
|
||||
hf_overrides: Optional[HfOverrides] = None
|
||||
tokenizer_revision: Optional[str] = None
|
||||
quantization: Optional[str] = None
|
||||
@ -602,6 +603,16 @@ class EngineArgs:
|
||||
help='RoPE theta. Use with `rope_scaling`. In '
|
||||
'some cases, changing the RoPE theta improves the '
|
||||
'performance of the scaled model.')
|
||||
parser.add_argument(
|
||||
'--hf-token',
|
||||
type=str,
|
||||
nargs='?',
|
||||
const=True,
|
||||
default=None,
|
||||
help='The token to use as HTTP bearer authorization'
|
||||
' for remote files. If `True`, will use the token '
|
||||
'generated when running `huggingface-cli login` '
|
||||
'(stored in `~/.huggingface`).')
|
||||
parser.add_argument('--hf-overrides',
|
||||
type=json.loads,
|
||||
default=EngineArgs.hf_overrides,
|
||||
@ -1038,6 +1049,7 @@ class EngineArgs:
|
||||
code_revision=self.code_revision,
|
||||
rope_scaling=self.rope_scaling,
|
||||
rope_theta=self.rope_theta,
|
||||
hf_token=self.hf_token,
|
||||
hf_overrides=self.hf_overrides,
|
||||
tokenizer_revision=self.tokenizer_revision,
|
||||
max_model_len=self.max_model_len,
|
||||
|
@ -117,6 +117,9 @@ class LLM:
|
||||
disable_custom_all_reduce: See :class:`~vllm.config.ParallelConfig`
|
||||
disable_async_output_proc: Disable async output processing.
|
||||
This may result in lower performance.
|
||||
hf_token: The token to use as HTTP bearer authorization for remote files.
|
||||
If `True`, will use the token generated when running
|
||||
`huggingface-cli login` (stored in `~/.huggingface`).
|
||||
hf_overrides: If a dictionary, contains arguments to be forwarded to the
|
||||
HuggingFace config. If a callable, it is called to update the
|
||||
HuggingFace config.
|
||||
@ -177,6 +180,7 @@ class LLM:
|
||||
max_seq_len_to_capture: int = 8192,
|
||||
disable_custom_all_reduce: bool = False,
|
||||
disable_async_output_proc: bool = False,
|
||||
hf_token: Optional[Union[bool, str]] = None,
|
||||
hf_overrides: Optional[HfOverrides] = None,
|
||||
mm_processor_kwargs: Optional[dict[str, Any]] = None,
|
||||
# After positional args are removed, move this right below `model`
|
||||
@ -232,6 +236,7 @@ class LLM:
|
||||
max_seq_len_to_capture=max_seq_len_to_capture,
|
||||
disable_custom_all_reduce=disable_custom_all_reduce,
|
||||
disable_async_output_proc=disable_async_output_proc,
|
||||
hf_token=hf_token,
|
||||
hf_overrides=hf_overrides,
|
||||
mm_processor_kwargs=mm_processor_kwargs,
|
||||
override_pooler_config=override_pooler_config,
|
||||
|
@ -712,6 +712,7 @@ def load_params_config(model: Union[str, Path], revision: Optional[str],
|
||||
|
||||
def get_hf_image_processor_config(
|
||||
model: Union[str, Path],
|
||||
hf_token: Optional[Union[bool, str]] = None,
|
||||
revision: Optional[str] = None,
|
||||
**kwargs,
|
||||
) -> Dict[str, Any]:
|
||||
@ -721,7 +722,10 @@ def get_hf_image_processor_config(
|
||||
# Separate model folder from file path for GGUF models
|
||||
if check_gguf_file(model):
|
||||
model = Path(model).parent
|
||||
return get_image_processor_config(model, revision=revision, **kwargs)
|
||||
return get_image_processor_config(model,
|
||||
token=hf_token,
|
||||
revision=revision,
|
||||
**kwargs)
|
||||
|
||||
|
||||
def get_hf_text_config(config: PretrainedConfig):
|
||||
|
Loading…
x
Reference in New Issue
Block a user