Add the 'data' axis to the FSDP axis names on Neuron devices

apple · apoorvtintin · Jul 1, 2024 · Jul 1, 2024 · Jul 1, 2024 · Jul 24, 2024
commit 6669a41090f2882ee3364bc9c0d5489e7adfd6a6
@@ -267,12 +267,17 @@ def model_config(
         batch_axis_names=batch_axis_names,
         seq_axis_names="seq",
     )
+
+    device_platform = np.asarray(jax.devices())[0].platform
+    # Neuron uses ZeRO stage 3, so weights are additionally sharded over the "data" axis.
+    fsdp_axis_names = ("expert", "fsdp", "seq") if device_platform != 'neuron' else ("data", "expert", "fsdp", "seq") 
+
     cfg.dtype = jnp.float32
     # Shard some FFN and attention weights over multiple axes.
     set_double_shard_weights_config(
         cfg.decoder.transformer.layer,
         batch_axis_names=batch_axis_names,
-        fsdp_axis_names=("expert", "fsdp", "seq"),
+        fsdp_axis_names=fsdp_axis_names,
         tp_axis_names="model",
         seq_axis_names=("seq",),
     )