diff --git a/megatron_patch/model/deepseek_v2/transformer_config.py b/megatron_patch/model/deepseek_v2/transformer_config.py index 213d4c0e..2e1f788a 100644 --- a/megatron_patch/model/deepseek_v2/transformer_config.py +++ b/megatron_patch/model/deepseek_v2/transformer_config.py @@ -26,3 +26,7 @@ class DeepSeekV2TransformerConfig(TransformerConfig): rotary_base: int = None rotary_scaling_factor: int = None + + max_position_embeddings: int = None + + moe_aux_loss_coeff: float = 0.0