diff --git a/vllm/config.py b/vllm/config.py
index 2669d1a1..d6f931ca 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -173,6 +173,9 @@ class ModelConfig:
             Defaults to True.
         config_format: The config format which shall be loaded.
             Defaults to 'auto' which defaults to 'hf'.
+        hf_token: The token to use as HTTP bearer authorization for remote
+            files. If `True`, will use the token generated when running
+            `huggingface-cli login` (stored in `~/.huggingface`).
         hf_overrides: If a dictionary, contains arguments to be forwarded to
             the HuggingFace config. If a callable, it is called to update the
             HuggingFace config.
@@ -256,6 +259,7 @@ class ModelConfig:
         limit_mm_per_prompt: Optional[Mapping[str, int]] = None,
         use_async_output_proc: bool = True,
         config_format: ConfigFormat = ConfigFormat.AUTO,
+        hf_token: Optional[Union[bool, str]] = None,
         hf_overrides: Optional[HfOverrides] = None,
         mm_processor_kwargs: Optional[dict[str, Any]] = None,
         disable_mm_preprocessor_cache: bool = False,
@@ -356,7 +360,7 @@ class ModelConfig:
         self.hf_text_config = get_hf_text_config(self.hf_config)
         self.encoder_config = self._get_encoder_config()
         self.hf_image_processor_config = get_hf_image_processor_config(
-            self.model, revision)
+            self.model, hf_token=hf_token, revision=revision)
         self.dtype = _get_and_verify_dtype(self.hf_config, dtype)
         self.use_async_output_proc = use_async_output_proc
         self.mm_processor_kwargs = mm_processor_kwargs
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 93dba201..af80541b 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -138,6 +138,7 @@ class EngineArgs:
     code_revision: Optional[str] = None
     rope_scaling: Optional[Dict[str, Any]] = None
     rope_theta: Optional[float] = None
+    hf_token: Optional[Union[bool, str]] = None
     hf_overrides: Optional[HfOverrides] = None
     tokenizer_revision: Optional[str] = None
     quantization: Optional[str] = None
@@ -602,6 +603,16 @@ class EngineArgs:
                             help='RoPE theta. Use with `rope_scaling`. In '
                             'some cases, changing the RoPE theta improves the '
                             'performance of the scaled model.')
+        parser.add_argument(
+            '--hf-token',
+            type=str,
+            nargs='?',
+            const=True,
+            default=None,
+            help='The token to use as HTTP bearer authorization '
+            'for remote files. If `True`, will use the token '
+            'generated when running `huggingface-cli login` '
+            '(stored in `~/.huggingface`).')
         parser.add_argument('--hf-overrides',
                             type=json.loads,
                             default=EngineArgs.hf_overrides,
@@ -1038,6 +1049,7 @@ class EngineArgs:
             code_revision=self.code_revision,
             rope_scaling=self.rope_scaling,
             rope_theta=self.rope_theta,
+            hf_token=self.hf_token,
             hf_overrides=self.hf_overrides,
             tokenizer_revision=self.tokenizer_revision,
             max_model_len=self.max_model_len,
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index f39b011c..d252a2bb 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -117,6 +117,9 @@ class LLM:
         disable_custom_all_reduce: See :class:`~vllm.config.ParallelConfig`
         disable_async_output_proc: Disable async output processing. This may
             result in lower performance.
+        hf_token: The token to use as HTTP bearer authorization for remote
+            files. If `True`, will use the token generated when running
+            `huggingface-cli login` (stored in `~/.huggingface`).
         hf_overrides: If a dictionary, contains arguments to be forwarded to
             the HuggingFace config. If a callable, it is called to update the
             HuggingFace config.
@@ -177,6 +180,7 @@ class LLM:
         max_seq_len_to_capture: int = 8192,
         disable_custom_all_reduce: bool = False,
         disable_async_output_proc: bool = False,
+        hf_token: Optional[Union[bool, str]] = None,
         hf_overrides: Optional[HfOverrides] = None,
         mm_processor_kwargs: Optional[dict[str, Any]] = None,
         # After positional args are removed, move this right below `model`
@@ -232,6 +236,7 @@ class LLM:
             max_seq_len_to_capture=max_seq_len_to_capture,
             disable_custom_all_reduce=disable_custom_all_reduce,
             disable_async_output_proc=disable_async_output_proc,
+            hf_token=hf_token,
             hf_overrides=hf_overrides,
             mm_processor_kwargs=mm_processor_kwargs,
             override_pooler_config=override_pooler_config,
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index d27a126d..fe0319c9 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -712,6 +712,7 @@ def load_params_config(model: Union[str, Path], revision: Optional[str],
 
 def get_hf_image_processor_config(
     model: Union[str, Path],
+    hf_token: Optional[Union[bool, str]] = None,
     revision: Optional[str] = None,
     **kwargs,
 ) -> Dict[str, Any]:
@@ -721,7 +722,10 @@ def get_hf_image_processor_config(
     # Separate model folder from file path for GGUF models
     if check_gguf_file(model):
         model = Path(model).parent
-    return get_image_processor_config(model, revision=revision, **kwargs)
+    return get_image_processor_config(model,
+                                      token=hf_token,
+                                      revision=revision,
+                                      **kwargs)
 
 
 def get_hf_text_config(config: PretrainedConfig):
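For reference, a minimal usage sketch of the new parameter (the checkpoint name and token value below are placeholders; any gated or private Hugging Face repo works the same way):

```python
from vllm import LLM

# Explicit token for a gated or private repo (placeholder value):
llm = LLM(model="meta-llama/Llama-2-7b-hf", hf_token="<your-hf-token>")

# Alternatively, hf_token=True reuses the token cached by
# `huggingface-cli login`:
# llm = LLM(model="meta-llama/Llama-2-7b-hf", hf_token=True)
```

On the command line, `--hf-token <token>` passes an explicit token, while a bare `--hf-token` resolves to `True` (via `nargs='?'` with `const=True`) and falls back to the cached login token.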