Address PR comments

apple · apoorvtintin · Jul 1, 2024 · Jul 1, 2024 · Jul 1, 2024 · Jul 24, 2024
commit 49f9efa2aa149c6355b2474328e13292b2f6e1ed
@@ -1176,7 +1176,7 @@ def create_device_mesh(
     # Check if the devices are part of a multi-granule configuration.
     # <https://github.com/google/jax/blob/b81b79c1b0d2ec/jax/experimental/mesh_utils.py#L313>
     device_platform = devices[0].platform
-    attr = "process_index" if device_platform != "tpu" else "slice_index"
+    attr = "process_index" if device_platform == "gpu" else "slice_index"
     is_multi_granule_env = hasattr(devices[0], attr)
     if not all(el.platform == device_platform for el in devices):
         raise NotImplementedError(f"Not all devices had platform: {device_platform}.")
@@ -1193,7 +1193,7 @@ def create_device_mesh(
         logging.warning("Falling back to ICI-only mesh on GPU, performance may be reduced.")
         return build_standard_mesh(mesh_shape, devices=devices)
 
-    # Neuron also only uses standard mesh 
+    # Neuron also uses only the standard mesh.
     if device_platform == "neuron":
         return build_standard_mesh(mesh_shape, devices=devices)
 

@@ -11,11 +11,11 @@
 """
 
 import math
-import numpy as np
 from typing import Dict, List, Optional, Sequence, Tuple, Union
 
 import jax
 import jax.numpy as jnp
+import numpy as np
 import tensorflow as tf
 from jax.sharding import PartitionSpec
 
@@ -271,8 +271,8 @@ def model_config(
     )
 
     device_platform = np.asarray(jax.devices())[0].platform
-    # neuron uses Zero 3
-    fsdp_axis_names = ("expert", "fsdp", "seq") if device_platform != 'neuron' else ("data", "expert", "fsdp", "seq") 
+    # Trainium will have FSDP support soon; for now, use ZeRO-3.
+    fsdp_axis_names = ("expert", "fsdp", "seq") if device_platform != "neuron" else ("data")
 
     cfg.dtype = jnp.float32
     # Shard some FFN and attention weights over multiple axes.

@@ -83,7 +83,6 @@ class Version(enum.Enum):
     },
 }
 
-TRN_MODEL_AXIS_SIZE=8
 
 def get_trainer_kwargs(
     model_size: str,
@@ -167,9 +166,9 @@ def get_trainer_kwargs(
                     "gpu-(p5.48xlarge|p4de.24xlarge)-(256|512|1024)",
                     mesh_shape_from_axes(data=-1, fsdp=8),
                 ),
-                (   
+                (
                     "neuron-(trn1.32xlarge|trn1n.32xlarge)-(32|64|256|512|1024|2048)",
-                    mesh_shape_from_axes(data=-1, model=TRN_MODEL_AXIS_SIZE),
+                    mesh_shape_from_axes(data=-1, model=8),
                 ),
             ),
         )