From 5b1aca2ae39b4bcdd04d916ec55bfc87f98d4835 Mon Sep 17 00:00:00 2001
From: intervitens <155717317+intervitens@users.noreply.github.com>
Date: Thu, 17 Apr 2025 13:35:07 +0300
Subject: [PATCH] [Bugfix] Fix GLM4 model (#16618)

Signed-off-by: intervitens
---
 docs/source/models/supported_models.md | 2 +-
 tests/models/registry.py               | 2 +-
 vllm/model_executor/models/glm4.py     | 8 ++++----
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index 0b193ca0..34917b5b 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -338,7 +338,7 @@ See [this page](#generative-models) for more information on how to use generativ
   * ✅︎
 - * `Glm4ForCausalLM`
   * GLM-4-0414
-  * `THUDM/GLM-4-32B-Chat-0414`, etc.
+  * `THUDM/GLM-4-32B-0414`, etc.
   * ✅︎
   * ✅︎
 - * `GPT2LMHeadModel`
diff --git a/tests/models/registry.py b/tests/models/registry.py
index 8d50644a..22e03f49 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -147,7 +147,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
                                       min_transformers_version="4.50"),
     "GlmForCausalLM": _HfExamplesInfo("THUDM/glm-4-9b-chat-hf"),
     "Glm4ForCausalLM": _HfExamplesInfo(
-        "THUDM/GLM-4-32B-Chat-0414",
+        "THUDM/GLM-4-32B-0414",
         is_available_online=False,
         min_transformers_version="4.52.dev0"
     ),
diff --git a/vllm/model_executor/models/glm4.py b/vllm/model_executor/models/glm4.py
index cba093cb..28cebfbd 100644
--- a/vllm/model_executor/models/glm4.py
+++ b/vllm/model_executor/models/glm4.py
@@ -82,7 +82,7 @@ class Glm4Attention(nn.Module):
         partial_rotary_factor = getattr(config, "partial_rotary_factor", 0.5)
         self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
         self.head_dim = head_dim or hidden_size // self.total_num_heads
-        self.rotary_dim = int(partial_rotary_factor * self.head_dim)
+        self.rotary_dim = self.head_dim
         self.q_size = self.num_heads * self.head_dim
         self.kv_size = self.num_kv_heads * self.head_dim
         self.scaling = self.head_dim**-0.5
@@ -110,6 +110,7 @@ class Glm4Attention(nn.Module):
             base=self.rope_theta,
             rope_scaling=rope_scaling,
             partial_rotary_factor=partial_rotary_factor,
+            is_neox_style=False,
         )
         self.attn = Attention(self.num_heads,
                               self.head_dim,
@@ -197,13 +198,12 @@ class Glm4DecoderLayer(nn.Module):
         )
         hidden_states = self.post_self_attn_layernorm(hidden_states)
-        hidden_states = residual + hidden_states

         # Fully Connected
-        hidden_states = self.post_attention_layernorm(hidden_states, residual)
+        hidden_states, residual = self.post_attention_layernorm(
+            hidden_states, residual)
         hidden_states = self.mlp(hidden_states)
         hidden_states = self.post_mlp_layernorm(hidden_states)
-        hidden_states = residual + hidden_states
         return hidden_states, residual
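
Note (not part of the patch): the decoder-layer change above drops the manual `residual + hidden_states` additions and instead lets the post-attention norm fuse the residual add, returning both the normalized activations and the updated residual, which is the behavior the unpacking `hidden_states, residual = self.post_attention_layernorm(...)` relies on. The attention changes keep the full head dimension for the rotary embedding and switch to the interleaved (non-NeoX) rotation style via `is_neox_style=False`. Below is a minimal, self-contained sketch of that fused add-and-norm residual pattern; `SimpleRMSNorm` and `toy_sublayer` are hypothetical names used only for illustration, not vLLM code.

# Sketch only, not part of the patch. Illustrates the fused add-and-norm
# pattern the fixed Glm4DecoderLayer.forward depends on: the norm consumes
# the incoming residual, emits the pre-norm sum as the new residual, and no
# manual `residual + hidden_states` additions are needed afterwards.
from typing import Optional, Union

import torch
import torch.nn as nn


class SimpleRMSNorm(nn.Module):

    def __init__(self, hidden_size: int, eps: float = 1e-6) -> None:
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.eps = eps

    def forward(
        self,
        x: torch.Tensor,
        residual: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
        if residual is not None:
            # Fused path: add the residual first, keep the sum as the new
            # residual stream, then normalize it for the next sublayer.
            x = x + residual
            residual = x
        normed = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
        normed = normed * self.weight
        return normed if residual is None else (normed, residual)


def toy_sublayer(
    hidden_states: torch.Tensor,
    residual: torch.Tensor,
    norm: SimpleRMSNorm,
    mlp: nn.Module,
) -> tuple[torch.Tensor, torch.Tensor]:
    # Mirrors the fixed control flow: the norm both takes and returns the
    # residual, so the layer never adds it by hand.
    hidden_states, residual = norm(hidden_states, residual)
    hidden_states = mlp(hidden_states)
    return hidden_states, residual


if __name__ == "__main__":
    hidden = torch.randn(2, 8)
    resid = torch.randn(2, 8)
    out, resid = toy_sublayer(hidden, resid, SimpleRMSNorm(8), nn.Linear(8, 8))
    print(out.shape, resid.shape)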