[Bugfix] add hf_token to EngineArgs (#16093)

Signed-off-by: paolovic <paul-philipp.luley@uzh.ch> Co-authored-by: paolovic <paul-philipp.luley@uzh.ch>
2025-04-06 16:47:33 +02:00 · 2025-04-06 16:47:33 +02:00 · da224daaa9
commit da224daaa9
parent 3a100b9278
4 changed files with 27 additions and 2 deletions
--- a/vllm/config.py
+++ b/vllm/config.py
@ -173,6 +173,9 @@ class ModelConfig:
            Defaults to True.
        config_format: The config format which shall be loaded.
            Defaults to 'auto' which defaults to 'hf'.
+        hf_token: The token to use as HTTP bearer authorization for remote files
+            . If `True`, will use the token generated when running 
+            `huggingface-cli login` (stored in `~/.huggingface`).
        hf_overrides: If a dictionary, contains arguments to be forwarded to the
            HuggingFace config. If a callable, it is called to update the
            HuggingFace config.
@ -256,6 +259,7 @@ class ModelConfig:
        limit_mm_per_prompt: Optional[Mapping[str, int]] = None,
        use_async_output_proc: bool = True,
        config_format: ConfigFormat = ConfigFormat.AUTO,
+        hf_token: Optional[Union[bool, str]] = None,
        hf_overrides: Optional[HfOverrides] = None,
        mm_processor_kwargs: Optional[dict[str, Any]] = None,
        disable_mm_preprocessor_cache: bool = False,
@ -356,7 +360,7 @@ class ModelConfig:
        self.hf_text_config = get_hf_text_config(self.hf_config)
        self.encoder_config = self._get_encoder_config()
        self.hf_image_processor_config = get_hf_image_processor_config(
-            self.model, revision)
+            self.model, hf_token=hf_token, revision=revision)
        self.dtype = _get_and_verify_dtype(self.hf_config, dtype)
        self.use_async_output_proc = use_async_output_proc
        self.mm_processor_kwargs = mm_processor_kwargs
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@ -138,6 +138,7 @@ class EngineArgs:
    code_revision: Optional[str] = None
    rope_scaling: Optional[Dict[str, Any]] = None
    rope_theta: Optional[float] = None
+    hf_token: Optional[Union[bool, str]] = None
    hf_overrides: Optional[HfOverrides] = None
    tokenizer_revision: Optional[str] = None
    quantization: Optional[str] = None
@ -602,6 +603,16 @@ class EngineArgs:
                            help='RoPE theta. Use with `rope_scaling`. In '
                            'some cases, changing the RoPE theta improves the '
                            'performance of the scaled model.')
+        parser.add_argument(
+            '--hf-token',
+            type=str,
+            nargs='?',
+            const=True,
+            default=None,
+            help='The token to use as HTTP bearer authorization'
+            ' for remote files. If `True`, will use the token '
+            'generated when running `huggingface-cli login` '
+            '(stored in `~/.huggingface`).')
        parser.add_argument('--hf-overrides',
                            type=json.loads,
                            default=EngineArgs.hf_overrides,
@ -1038,6 +1049,7 @@ class EngineArgs:
            code_revision=self.code_revision,
            rope_scaling=self.rope_scaling,
            rope_theta=self.rope_theta,
+            hf_token=self.hf_token,
            hf_overrides=self.hf_overrides,
            tokenizer_revision=self.tokenizer_revision,
            max_model_len=self.max_model_len,
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@ -117,6 +117,9 @@ class LLM:
        disable_custom_all_reduce: See :class:`~vllm.config.ParallelConfig`
        disable_async_output_proc: Disable async output processing.
            This may result in lower performance.
+        hf_token: The token to use as HTTP bearer authorization for remote files
+            . If `True`, will use the token generated when running 
+            `huggingface-cli login` (stored in `~/.huggingface`).
        hf_overrides: If a dictionary, contains arguments to be forwarded to the
            HuggingFace config. If a callable, it is called to update the
            HuggingFace config.
@ -177,6 +180,7 @@ class LLM:
        max_seq_len_to_capture: int = 8192,
        disable_custom_all_reduce: bool = False,
        disable_async_output_proc: bool = False,
+        hf_token: Optional[Union[bool, str]] = None,
        hf_overrides: Optional[HfOverrides] = None,
        mm_processor_kwargs: Optional[dict[str, Any]] = None,
        # After positional args are removed, move this right below `model`
@ -232,6 +236,7 @@ class LLM:
            max_seq_len_to_capture=max_seq_len_to_capture,
            disable_custom_all_reduce=disable_custom_all_reduce,
            disable_async_output_proc=disable_async_output_proc,
+            hf_token=hf_token,
            hf_overrides=hf_overrides,
            mm_processor_kwargs=mm_processor_kwargs,
            override_pooler_config=override_pooler_config,
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@ -712,6 +712,7 @@ def load_params_config(model: Union[str, Path], revision: Optional[str],

 def get_hf_image_processor_config(
    model: Union[str, Path],
+    hf_token: Optional[Union[bool, str]] = None,
    revision: Optional[str] = None,
    **kwargs,
 ) -> Dict[str, Any]:
@ -721,7 +722,10 @@ def get_hf_image_processor_config(
    # Separate model folder from file path for GGUF models
    if check_gguf_file(model):
        model = Path(model).parent
-    return get_image_processor_config(model, revision=revision, **kwargs)
+    return get_image_processor_config(model,
+                                      token=hf_token,
+                                      revision=revision,
+                                      **kwargs)


 def get_hf_text_config(config: PretrainedConfig):