From 99ef59cf7f93a6ce7d5ed3fd403db0aa962e5348 Mon Sep 17 00:00:00 2001
From: Yong Hoon Shin <48474650+sarckk@users.noreply.github.com>
Date: Thu, 10 Apr 2025 21:26:07 -0700
Subject: [PATCH] [Llama4] Enable attention temperature tuning by default for
 long context (>32k) (#16439)

Signed-off-by: Ye (Charlotte) Qi
Co-authored-by: Ye (Charlotte) Qi
---
 vllm/model_executor/models/llama4.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py
index 3dbf352a..8785e9dc 100644
--- a/vllm/model_executor/models/llama4.py
+++ b/vllm/model_executor/models/llama4.py
@@ -467,11 +467,15 @@ class Llama4ForCausalLM(LlamaForCausalLM):
     }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
-        # Update temperature tuning config from generation config
+        # update temperature tuning config from generation config
         gen_config = vllm_config.model_config.try_get_generation_config()
         gen_config.update(vllm_config.model_config.override_generation_config)
+        # enable temperature tuning by default when max_model_len > 32K
+        default_attn_temperature_tuning = \
+            vllm_config.model_config.max_model_len > 32768
         vllm_config.model_config.hf_config.attn_temperature_tuning \
-            = gen_config.get("attn_temperature_tuning", False)
+            = gen_config.get(
+                "attn_temperature_tuning", default_attn_temperature_tuning)
 
         super().__init__(vllm_config=vllm_config,
                          prefix=prefix,
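
Note: the following is a minimal, self-contained sketch of the precedence rule this
patch introduces, based only on the logic visible in the diff. The helper name
resolve_attn_temperature_tuning is hypothetical and not part of vLLM; in the real
code the merged generation config and max_model_len come from vllm_config.model_config.

from typing import Any


def resolve_attn_temperature_tuning(
        gen_config: dict[str, Any],
        override_generation_config: dict[str, Any],
        max_model_len: int) -> bool:
    """Mirror the decision made in Llama4ForCausalLM.__init__ after this patch.

    An explicit "attn_temperature_tuning" value in the (possibly overridden)
    generation config always wins; otherwise the feature now defaults to on
    for long-context deployments (max_model_len > 32768).
    """
    merged = dict(gen_config)
    merged.update(override_generation_config)
    default_attn_temperature_tuning = max_model_len > 32768
    return merged.get("attn_temperature_tuning",
                      default_attn_temperature_tuning)


# Short context keeps the previous default (disabled).
assert resolve_attn_temperature_tuning({}, {}, 8192) is False
# Long context (>32k) now enables temperature tuning by default.
assert resolve_attn_temperature_tuning({}, {}, 131072) is True
# An explicit generation-config override still takes precedence over the default.
assert resolve_attn_temperature_tuning(
    {}, {"attn_temperature_tuning": False}, 131072) is False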