remat checkpoints to host

apple · markblee · Aug 14, 2024 · Aug 8, 2024 · Aug 12, 2024 · Aug 12, 2024
commit 5e7786a3d2197ea6f0ea52b2b3d8810f6cf9a6d2
@@ -3874,6 +3874,7 @@ def build_remat_spec(
     ],
     self_attention: bool = True,
     feed_forward: bool = False,
+    offload: bool = False,
 ) -> Optional[RematSpec]:
     """Configures how the Transformer or Conformer stack will save the linearization points.
 
@@ -3891,6 +3892,7 @@ def build_remat_spec(
         stack_cfg: A transformer config.
         self_attention: Checkpoint self attention layer activations if true.
         feed_forward: Checkpoint feed-forward layer activations if true.
+        offload: Offload the checkpoints to host memory instead of TPU memory.
 
     Returns:
         None (if no rematerialization is needed) or a RematSpec.
@@ -3905,17 +3907,27 @@ def build_remat_spec(
         checkpoints.extend(
             [f"{attention_name}.{el}" for el in ["q_proj", "k_proj", "v_proj", "context", "o_proj"]]
         )
+
     if feed_forward and hasattr(stack_cfg.layer, "feed_forward"):
         ffn_name = stack_cfg.layer.feed_forward.klass.__name__
         checkpoints.extend([f"{ffn_name}.{el}" for el in ["activation", "linear2"]])
 
+    policy = config_for_function(jax_remat_policies.save_only_these_names).set(
+        names_which_can_be_saved=checkpoints
+    )
+    if offload:
+        policy = config_for_function(jax_remat_policies.save_and_offload_only_these_names).set(
+            names_which_can_be_saved=[],
+            names_which_can_be_offloaded=checkpoints,
+            offload_src="device",
+            offload_dst="pinned_host",
+        )
+
     return RematSpec(
         prevent_cse=stack_cfg.klass is StackedTransformerLayer,
         # If we are running inside a jax.lax.scan (Repeated/Pipelined transformers
         # or Repeated Conformers) we can enable common subexpression elimination optimizations.
-        policy=config_for_function(jax_remat_policies.save_only_these_names).set(
-            names_which_can_be_saved=checkpoints
-        ),
+        policy=policy,
     )
 
 

@@ -183,7 +183,10 @@ def mesh_shape_from_axes(
 
 
 def update_model_remat_config(
-    *, stack_cfg: causal_lm.TransformerStackConfig, layer_cfg: TransformerLayer.Config
+    *,
+    stack_cfg: causal_lm.TransformerStackConfig,
+    layer_cfg: TransformerLayer.Config,
+    offload: bool = False,
 ):
     """Recomputes and sets the remat_spec based on provided layer_cfg.
 
@@ -203,10 +206,12 @@ def update_model_remat_config(
 
     if layer_cfg.self_attention.attention.klass is not FlashAttention:
         # Enable remat to reduce memory usage for larger models.
-        remat_spec = build_remat_spec(stack_cfg.clone(layer=layer_cfg))
+        remat_spec = build_remat_spec(stack_cfg.clone(layer=layer_cfg), offload=offload)
     else:
         # Checkpointing both ffn and attention to give the best performance.
-        remat_spec = build_remat_spec(stack_cfg, feed_forward=True, self_attention=True)
+        remat_spec = build_remat_spec(
+            stack_cfg, feed_forward=True, self_attention=True, offload=offload
+        )
     layer_cfg.set(remat_spec=remat_spec)
 
 
@@ -230,6 +235,7 @@ def model_config(
     ffn_structure: str = "prenorm",
     atten_structure: str = "prenorm",
     atten_logit_cap: Optional[float] = None,
+    remat_offload: bool = False,
 ) -> causal_lm.Model.Config:
     """Returns an LM model config based on the given hyperparams.
 
@@ -258,6 +264,7 @@ def model_config(
             Options: [prenorm, postnorm, hybridnorm].
         atten_logit_cap: Cap the absolute values of logits by tanh.
             Enabled by setting a positive value.
+        remat_offload: Offload remat checkpoints to host instead of TPU memory.
 
     Returns:
         A causal LM config.
@@ -276,7 +283,7 @@ def model_config(
     layer_cfg.self_attention.structure = atten_structure
     layer_cfg.self_attention.attention.atten_logit_cap = atten_logit_cap
     if stack_cfg.klass is RepeatedTransformerLayer:
-        update_model_remat_config(stack_cfg=stack_cfg, layer_cfg=layer_cfg)
+        update_model_remat_config(stack_cfg=stack_cfg, layer_cfg=layer_cfg, offload=remat_offload)
     # Stack.
     transformer_cfg = stack_cfg.set(num_layers=num_layers, layer=layer_cfg)
     decoder_cfg = Decoder.default_config().set(

@@ -188,6 +188,7 @@ def get_trainer_kwargs(
                 num_kv_heads=None if version == Version.V1 else 8,
                 rope_theta=rope_theta,
                 flash_attention=flash_attention,
+                remat_offload=True,
             ),
             learner_kwargs=dict(peak_lr=1.5e-4, weight_decay=0.1),
             max_sequence_length=max_sequence_length,
@@ -230,6 +231,7 @@ def model_config(
     ffn_dim: Optional[Union[int, config.FunctionConfigBase]] = None,
     flash_attention: bool = False,
     stack_cfg: Optional[BaseStackedTransformerLayer.Config] = None,
+    remat_offload: bool = False,
 ) -> causal_lm.Model.Config:
     """Returns an LM model config based on the given hyperparams.
 
@@ -247,6 +249,7 @@ def model_config(
         flash_attention: Whether to enable flash attention.
         stack_cfg: The transformer stack config.
             If None, defaults to a RepeatedTransformerLayer.
+        remat_offload: Offload remat checkpoints to host instead of TPU memory.
 
     Returns:
         A causal LM config.
@@ -283,6 +286,7 @@ def model_config(
         emb_cfg=TransformerTextEmbeddings.default_config().set(pos_emb=None),
         attention_cfg=flash_attention_config() if flash_attention else atten_cfg,
         attention_qkv_linear=atten_qkv_linear,
+        remat_offload=remat_offload,
     )
     return cfg