
Commit 6cafd46

additional fix

Committed by pengdurice
Signed-off-by: pengdurice <pengduhit@gmail.com>

1 parent 8fd4e8b

File tree

2 files changed: +5, -38 lines

src/megatron/bridge/models/glm_moe_dsa/glm5_bridge.py

Lines changed: 4 additions & 37 deletions
@@ -115,8 +115,7 @@ def provider_bridge(self, hf_pretrained: PreTrainedCausalLM) -> GLM5ModelProvider:
 
         configs["make_vocab_size_divisible_by"] = 1280
         configs["moe_router_score_function"] = "sigmoid"
-        # configs["moe_router_enable_expert_bias"] = True  # TODO: uncomment this
-        configs["moe_router_enable_expert_bias"] = False  # TODO: remove this
+        configs["moe_router_enable_expert_bias"] = False
         if hasattr(hf_config, "aux_loss_alpha"):
             configs["moe_aux_loss_coeff"] = hf_config.aux_loss_alpha
 
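The net effect of this hunk: provider_bridge now unconditionally disables the router expert bias instead of carrying two TODO-marked variants. A minimal runnable sketch of the resulting config logic; SimpleNamespace and the aux_loss_alpha value below are stand-ins for the real HF config and are not part of this commit:

from types import SimpleNamespace

# Stand-in for the HF config object consumed by provider_bridge.
hf_config = SimpleNamespace(aux_loss_alpha=0.001)  # hypothetical value

configs = {}
configs["make_vocab_size_divisible_by"] = 1280
configs["moe_router_score_function"] = "sigmoid"  # sigmoid router scores
configs["moe_router_enable_expert_bias"] = False  # expert bias now always off
if hasattr(hf_config, "aux_loss_alpha"):
    # Guarded because not every HF config defines aux_loss_alpha.
    configs["moe_aux_loss_coeff"] = hf_config.aux_loss_alpha

print(configs["moe_aux_loss_coeff"])  # -> 0.001
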
@@ -151,14 +150,6 @@ def build_conversion_tasks(self, hf_pretrained, megatron_model):
     def mapping_registry(self) -> MegatronMappingRegistry:
         mapping_list = []
 
-        # param_mappings = {
-        #     # Embed
-        #     "embedding.word_embeddings.weight": "model.embed_tokens.weight",
-        #     # LM Head
-        #     "decoder.final_layernorm.weight": "model.norm.weight",
-        #     "output_layer.weight": "lm_head.weight",
-        # }
-        # copied from deepseek's common.py
         param_mappings = {
             # Embed
             "embedding.word_embeddings.weight": "model.embed_tokens.weight",
@@ -193,43 +184,19 @@ def mapping_registry(self) -> MegatronMappingRegistry:
             "decoder.layers.*.self_attention.q_layernorm.weight": "model.layers.*.self_attn.q_a_layernorm.weight",
             # For models without MLA
             "decoder.layers.*.self_attention.linear_q_proj.weight": "model.layers.*.self_attn.q_proj.weight",
-
-            # copied from megatron-bridge's pr: https://github.com/NVIDIA-NeMo/Megatron-Bridge/pull/1421
+            # Sparse attention indexer
             "decoder.layers.*.self_attention.core_attention.indexer.linear_wq_b.weight": "model.layers.*.self_attn.indexer.wq_b.weight",
             "decoder.layers.*.self_attention.core_attention.indexer.linear_wk.weight": "model.layers.*.self_attn.indexer.wk.weight",
             "decoder.layers.*.self_attention.core_attention.indexer.k_norm.weight": "model.layers.*.self_attn.indexer.k_norm.weight",
             "decoder.layers.*.self_attention.core_attention.indexer.k_norm.bias": "model.layers.*.self_attn.indexer.k_norm.bias",
-            "decoder.layers.*.self_attention.core_attention.indexer.linear_weights_proj.weight": "model.layers.*.self_attn.indexer.weights_proj.weight",
+            "decoder.layers.*.self_attention.core_attention.indexer.linear_weights_proj.weight": "model.layers.*.self_attn.indexer.weights_proj.weight",
         }
-        # copied from glm45_bridge.py
         layer_specific_mappings = {
-            # Attention
-            # "decoder.layers.*.input_layernorm.weight": "model.layers.*.input_layernorm.weight",
-            # "decoder.layers.*.self_attention.linear_proj.weight": "model.layers.*.self_attn.o_proj.weight",
-            # Reference: https://github.com/NVIDIA/NeMo/blob/50cceb9c90ea1f440d1e14074fa13bd45f60a1c4/nemo/collections/llm/gpt/model/deepseek.py#L637-L650
-            # In GLM, HF weight `model.layers.*.post_attention_layernorm.weight` is mapped to the following mcore weights depending on the layer type:
-            # (a) `decoder.layers.*.pre_mlp_layernorm.weight`, if the layer is MoE
-            # (b) `decoder.layers.*.mlp.linear_fc1.layer_norm_weight`, if the layer is dense
-            # "decoder.layers.*.pre_mlp_layernorm.weight": "model.layers.*.post_attention_layernorm.weight",
             "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight": "model.layers.*.input_layernorm.weight",
-            # "decoder.layers.*.self_attention.q_layernorm.weight": "model.layers.*.self_attn.q_norm.weight",
-            # "decoder.layers.*.self_attention.k_layernorm.weight": "model.layers.*.self_attn.k_norm.weight",
-            # MLP
-            # "decoder.layers.*.mlp.linear_fc2.weight": "model.layers.*.mlp.down_proj.weight",
-            # "decoder.layers.*.mlp.linear_fc1.layer_norm_weight": "model.layers.*.post_attention_layernorm.weight",
-            # "decoder.layers.*.mlp.shared_experts.linear_fc2.weight": "model.layers.*.mlp.shared_experts.down_proj.weight",
             "decoder.layers.*.mlp.shared_experts.router.weight": "model.layers.*.mlp.shared_experts.gate.weight",
-            # "decoder.layers.*.mlp.experts.linear_fc2.weight*": "model.layers.*.mlp.experts.*.down_proj.weight",
-            # "decoder.layers.*.mlp.router.weight": "model.layers.*.mlp.gate.weight",
             "decoder.layers.*.mlp.router.expert_bias": "model.layers.*.mlp.gate.e_score_correction_bias",
-
-            # "decoder.layers.*.self_attention.linear_kv_up_proj.layer_norm_weight": "model.layers.*.self_attn.kv_a_layernorm.weight",  # For Dense MLA
-            # Sparse attention indexer
-
-            # "decoder.layers.*.mlp.router.expert_bias": "model.layers.*.mlp.gate.e_score_correction_bias",
-            # "decoder.layers.*.self_attention.linear_q_up_proj.layer_norm_weight": "model.layers.*.self_attn.q_a_layernorm.weight",
         }
-
+
         for megatron_param, hf_param in param_mappings.items():
             mapping_list.append(AutoMapping(megatron_param=megatron_param, hf_param=hf_param))

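The "*" wildcards in these mappings pair a Megatron-side parameter pattern with its HF counterpart for every layer index; per the loop at the end of the hunk above, each pair is wrapped in an AutoMapping and collected into the MegatronMappingRegistry. As a rough illustration of what such a pattern match does, here is a hypothetical resolver (resolve_mapping is invented for this sketch and is not the real AutoMapping implementation):

import re

def resolve_mapping(megatron_pattern, hf_pattern, megatron_name):
    # Illustrative only: compile the Megatron pattern into a regex, capture
    # what each "*" matched, and splice the captures into the HF pattern.
    regex = "^" + re.escape(megatron_pattern).replace(r"\*", r"([^.]+)") + "$"
    match = re.match(regex, megatron_name)
    if match is None:
        return None
    hf_name = hf_pattern
    for piece in match.groups():
        hf_name = hf_name.replace("*", piece, 1)
    return hf_name

print(resolve_mapping(
    "decoder.layers.*.mlp.router.expert_bias",
    "model.layers.*.mlp.gate.e_score_correction_bias",
    "decoder.layers.3.mlp.router.expert_bias",
))  # -> model.layers.3.mlp.gate.e_score_correction_bias
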
src/megatron/bridge/models/glm_moe_dsa/glm5_provider.py

Lines changed: 1 addition & 1 deletion
@@ -28,5 +28,5 @@ class GLM5ModelProvider(DeepSeekV3ModelProvider):
     moe_aux_loss_coeff: float = 0.001
     sparse_attention_type: str = "dsa"
     index_head_dim: int = 128
-    index_n_heads: int = 64
+    index_n_heads: int = 32
     index_topk: int = 2048

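Combined with the unchanged fields shown in context, the provider's DSA defaults after this commit read roughly as follows. This is a sketch assuming standard dataclass semantics; the DeepSeekV3ModelProvider base is stubbed out and its real fields are elided:

from dataclasses import dataclass

@dataclass
class DeepSeekV3ModelProvider:
    # Stub: the real base class defines the full DeepSeek-V3 provider config.
    pass

@dataclass
class GLM5ModelProvider(DeepSeekV3ModelProvider):
    moe_aux_loss_coeff: float = 0.001
    sparse_attention_type: str = "dsa"
    index_head_dim: int = 128
    index_n_heads: int = 32  # halved from 64 by this commit
    index_topk: int = 2048

print(GLM5ModelProvider().index_n_heads)  # -> 32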