@@ -115,8 +115,7 @@ def provider_bridge(self, hf_pretrained: PreTrainedCausalLM) -> GLM5ModelProvide
 
         configs["make_vocab_size_divisible_by"] = 1280
         configs["moe_router_score_function"] = "sigmoid"
-        # configs["moe_router_enable_expert_bias"] = True  # TODO: uncomment this
-        configs["moe_router_enable_expert_bias"] = False  # TODO: remove this
+        configs["moe_router_enable_expert_bias"] = False
         if hasattr(hf_config, "aux_loss_alpha"):
             configs["moe_aux_loss_coeff"] = hf_config.aux_loss_alpha
 
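For intuition on the router settings in the hunk above: `moe_router_score_function = "sigmoid"` scores experts with a sigmoid rather than a softmax, and `moe_router_enable_expert_bias` controls a per-expert selection bias (surfaced on the HF side as `gate.e_score_correction_bias` later in this diff). A rough, illustrative sketch of that routing scheme, assuming generic tensor shapes; this is not Megatron-Bridge code:

```python
import torch

def sigmoid_route(logits: torch.Tensor, expert_bias: torch.Tensor | None, top_k: int = 8):
    """Illustrative only: sigmoid scores, with an optional per-expert bias used just for top-k selection."""
    scores = torch.sigmoid(logits)  # [num_tokens, num_experts]
    selection = scores + expert_bias if expert_bias is not None else scores
    _, top_idx = selection.topk(top_k, dim=-1)
    weights = scores.gather(-1, top_idx)  # routing weights come from the unbiased scores
    return top_idx, weights
```

With `moe_router_enable_expert_bias` set to `False` as above, the `expert_bias` term is simply absent.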
@@ -151,14 +150,6 @@ def build_conversion_tasks(self, hf_pretrained, megatron_model):
     def mapping_registry(self) -> MegatronMappingRegistry:
         mapping_list = []
 
-        # param_mappings = {
-        #     # Embed
-        #     "embedding.word_embeddings.weight": "model.embed_tokens.weight",
-        #     # LM Head
-        #     "decoder.final_layernorm.weight": "model.norm.weight",
-        #     "output_layer.weight": "lm_head.weight",
-        # }
-        # copied from deepseek's common.py
         param_mappings = {
             # Embed
             "embedding.word_embeddings.weight": "model.embed_tokens.weight",
@@ -193,43 +184,19 @@ def mapping_registry(self) -> MegatronMappingRegistry:
             "decoder.layers.*.self_attention.q_layernorm.weight": "model.layers.*.self_attn.q_a_layernorm.weight",
             # For models without MLA
             "decoder.layers.*.self_attention.linear_q_proj.weight": "model.layers.*.self_attn.q_proj.weight",
-
-            # copied from megatron-bridge's pr: https://github.com/NVIDIA-NeMo/Megatron-Bridge/pull/1421
+            # Sparse attention indexer
             "decoder.layers.*.self_attention.core_attention.indexer.linear_wq_b.weight": "model.layers.*.self_attn.indexer.wq_b.weight",
             "decoder.layers.*.self_attention.core_attention.indexer.linear_wk.weight": "model.layers.*.self_attn.indexer.wk.weight",
             "decoder.layers.*.self_attention.core_attention.indexer.k_norm.weight": "model.layers.*.self_attn.indexer.k_norm.weight",
             "decoder.layers.*.self_attention.core_attention.indexer.k_norm.bias": "model.layers.*.self_attn.indexer.k_norm.bias",
-            "decoder.layers.*.self_attention.core_attention.indexer.linear_weights_proj.weight": "model.layers.*.self_attn.indexer.weights_proj.weight", 
+            "decoder.layers.*.self_attention.core_attention.indexer.linear_weights_proj.weight": "model.layers.*.self_attn.indexer.weights_proj.weight",
         }
-        # copied from glm45_bridge.py
         layer_specific_mappings = {
-            # Attention
-            # "decoder.layers.*.input_layernorm.weight": "model.layers.*.input_layernorm.weight",
-            # "decoder.layers.*.self_attention.linear_proj.weight": "model.layers.*.self_attn.o_proj.weight",
-            # Reference: https://github.com/NVIDIA/NeMo/blob/50cceb9c90ea1f440d1e14074fa13bd45f60a1c4/nemo/collections/llm/gpt/model/deepseek.py#L637-L650
-            # In GLM, HF weight `model.layers.*.post_attention_layernorm.weight` is mapped to the following mcore weights depending on the layer type:
-            # (a) `decoder.layers.*.pre_mlp_layernorm.weight`, if the layer is MoE
-            # (b) `decoder.layers.*.mlp.linear_fc1.layer_norm_weight`, if the layer is dense
-            # "decoder.layers.*.pre_mlp_layernorm.weight": "model.layers.*.post_attention_layernorm.weight",
             "decoder.layers.*.self_attention.linear_qkv.layer_norm_weight": "model.layers.*.input_layernorm.weight",
-            # "decoder.layers.*.self_attention.q_layernorm.weight": "model.layers.*.self_attn.q_norm.weight",
-            # "decoder.layers.*.self_attention.k_layernorm.weight": "model.layers.*.self_attn.k_norm.weight",
-            # MLP
-            # "decoder.layers.*.mlp.linear_fc2.weight": "model.layers.*.mlp.down_proj.weight",
-            # "decoder.layers.*.mlp.linear_fc1.layer_norm_weight": "model.layers.*.post_attention_layernorm.weight",
-            # "decoder.layers.*.mlp.shared_experts.linear_fc2.weight": "model.layers.*.mlp.shared_experts.down_proj.weight",
             "decoder.layers.*.mlp.shared_experts.router.weight": "model.layers.*.mlp.shared_experts.gate.weight",
-            # "decoder.layers.*.mlp.experts.linear_fc2.weight*": "model.layers.*.mlp.experts.*.down_proj.weight",
-            # "decoder.layers.*.mlp.router.weight": "model.layers.*.mlp.gate.weight",
             "decoder.layers.*.mlp.router.expert_bias": "model.layers.*.mlp.gate.e_score_correction_bias",
-
-            # "decoder.layers.*.self_attention.linear_kv_up_proj.layer_norm_weight": "model.layers.*.self_attn.kv_a_layernorm.weight",  # For Dense MLA
-            # Sparse attention indexer
-
-            # "decoder.layers.*.mlp.router.expert_bias": "model.layers.*.mlp.gate.e_score_correction_bias",
-            # "decoder.layers.*.self_attention.linear_q_up_proj.layer_norm_weight": "model.layers.*.self_attn.q_a_layernorm.weight",
         }
-
+
         for megatron_param, hf_param in param_mappings.items():
            mapping_list.append(AutoMapping(megatron_param=megatron_param, hf_param=hf_param))
 
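As a rough illustration of how the wildcard patterns in `param_mappings` and `layer_specific_mappings` pair Megatron and HF parameter names per layer, here is a minimal sketch. The real expansion happens inside `AutoMapping` / `MegatronMappingRegistry`; the `expand_layer` helper below is purely hypothetical and ignores patterns with more than one wildcard (e.g. per-expert weights):

```python
def expand_layer(megatron_pattern: str, hf_pattern: str, layer: int) -> tuple[str, str]:
    """Hypothetical helper: substitute a concrete layer index for the '*' wildcard."""
    return megatron_pattern.replace("*", str(layer)), hf_pattern.replace("*", str(layer))

# One of the layer-specific mappings from the diff, expanded for layer 3:
megatron_name, hf_name = expand_layer(
    "decoder.layers.*.mlp.router.expert_bias",
    "model.layers.*.mlp.gate.e_score_correction_bias",
    layer=3,
)
# megatron_name == "decoder.layers.3.mlp.router.expert_bias"
# hf_name == "model.layers.3.mlp.gate.e_score_correction_bias"
```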