From af295e9b010ff2f7886cde2e5a41a4ef84d82ac1 Mon Sep 17 00:00:00 2001
From: Cyrus Leung
Date: Tue, 11 Mar 2025 22:59:43 +0800
Subject: [PATCH] [Bugfix] Update `--hf-overrides` for `Alibaba-NLP/gte-Qwen2` (#14609)

Signed-off-by: DarkLight1337
---
 docs/source/models/supported_models.md             | 11 ++++-------
 tests/models/embedding/language/test_embedding.py  |  4 ++--
 2 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index c9140bd0..e46934b9 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -541,14 +541,11 @@ You should manually set mean pooling by passing `--override-pooler-config '{"poo
 :::
 
 :::{note}
-Unlike base Qwen2, `Alibaba-NLP/gte-Qwen2-7B-instruct` uses bi-directional attention.
-You can set `--hf-overrides '{"is_causal": false}'` to change the attention mask accordingly.
+The HF implementation of `Alibaba-NLP/gte-Qwen2-1.5B-instruct` is hardcoded to use causal attention despite what is shown in `config.json`. To compare vLLM vs HF results,
+you should set `--hf-overrides '{"is_causal": true}'` in vLLM so that the two implementations are consistent with each other.
 
-On the other hand, its 1.5B variant (`Alibaba-NLP/gte-Qwen2-1.5B-instruct`) uses causal attention
-despite being described otherwise on its model card.
-
-Regardless of the variant, you need to enable `--trust-remote-code` for the correct tokenizer to be
-loaded. See [relevant issue on HF Transformers](https://github.com/huggingface/transformers/issues/34882).
+For both the 1.5B and 7B variants, you also need to enable `--trust-remote-code` for the correct tokenizer to be loaded.
+See [relevant issue on HF Transformers](https://github.com/huggingface/transformers/issues/34882).
 :::
 
 If your model is not in the above list, we will try to automatically convert the model using
diff --git a/tests/models/embedding/language/test_embedding.py b/tests/models/embedding/language/test_embedding.py
index 4b992686..a8ac70d5 100644
--- a/tests/models/embedding/language/test_embedding.py
+++ b/tests/models/embedding/language/test_embedding.py
@@ -42,8 +42,8 @@ def test_models(
     if model == "ssmits/Qwen2-7B-Instruct-embed-base":
         vllm_extra_kwargs["override_pooler_config"] = \
             PoolerConfig(pooling_type="MEAN")
-    if model == "Alibaba-NLP/gte-Qwen2-7B-instruct":
-        vllm_extra_kwargs["hf_overrides"] = {"is_causal": False}
+    if model == "Alibaba-NLP/gte-Qwen2-1.5B-instruct":
+        vllm_extra_kwargs["hf_overrides"] = {"is_causal": True}
 
     # The example_prompts has ending "\n", for example:
     # "Write a short story about a robot that dreams for the first time.\n"
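
Not part of the patch itself: the sketch below illustrates how the overrides documented above might be passed through vLLM's offline `LLM` API. The constructor arguments (`task="embed"`, `hf_overrides`, `trust_remote_code`) and the `LLM.embed()` call are assumed to match the vLLM release current around the time of this commit; treat it as an illustration rather than an official example.

    # Illustrative sketch only (assumed vLLM API, not part of this patch).
    from vllm import LLM

    llm = LLM(
        model="Alibaba-NLP/gte-Qwen2-1.5B-instruct",
        task="embed",
        trust_remote_code=True,            # needed so the correct tokenizer is loaded
        hf_overrides={"is_causal": True},  # match the (causal) HF implementation
    )

    # Embed a prompt and inspect the vector; mirrors what the updated test exercises.
    (output,) = llm.embed(["What is the capital of France?"])
    print(len(output.outputs.embedding))

The same overrides would apply on the command line, e.g. `vllm serve Alibaba-NLP/gte-Qwen2-1.5B-instruct --task embed --trust-remote-code --hf-overrides '{"is_causal": true}'`, again assuming the flag spellings of the vLLM version targeted by this patch.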