Fix checkpoint saving and consolidation for TP #378

Merged · 3 commits · Dec 15, 2023
2 changes: 1 addition & 1 deletion optimum/neuron/distributed/checkpointing.py
@@ -95,6 +95,6 @@ def consolidate_tensor_parallel_checkpoints_to_unified_checkpoint(
         torch.save(shard, output_dir / shard_file)
     if index is not None:
         save_index_file = SAFE_WEIGHTS_INDEX_NAME if save_format == "safetensors" else WEIGHTS_INDEX_NAME
-        with open(save_index_file, "w") as fp:
+        with open(output_dir / save_index_file, "w") as fp:
             content = json.dumps(index, indent=2, sort_keys=True) + "\n"
             fp.write(content)
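For context on the one-line fix above: the consolidated index JSON must be written next to the shard files, not into whatever directory the process happens to run from. Below is a minimal, self-contained sketch of that behavior; the file names and the tiny index dict are made up for illustration and are not the library's actual values.

```python
import json
from pathlib import Path

import torch

output_dir = Path("consolidated_checkpoint")
output_dir.mkdir(exist_ok=True)

# Pretend the tensor-parallel shards were already consolidated into one state dict.
shard = {"model.embed_tokens.weight": torch.zeros(4, 4)}
shard_file = "pytorch_model.bin"  # illustrative shard name
torch.save(shard, output_dir / shard_file)

# The index maps each weight name to the shard file that contains it.
index = {"weight_map": {name: shard_file for name in shard}}

save_index_file = "pytorch_model.bin.index.json"
# Before the fix the index was opened as open(save_index_file, "w"), which wrote it
# into the current working directory instead of next to the shard files.
with open(output_dir / save_index_file, "w") as fp:
    fp.write(json.dumps(index, indent=2, sort_keys=True) + "\n")
```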
8 changes: 7 additions & 1 deletion optimum/neuron/trainers.py
@@ -15,6 +15,7 @@
 """Defines Trainer subclasses to perform training on AWS Neuron instances."""

 import contextlib
+import copy
 import glob
 import os
 import random
@@ -395,7 +396,12 @@ def _save_xla(self, output_dir: Optional[str] = None):
         if self.accelerator.distributed_type is NeuronDistributedType.TENSOR_PARALLELISM:
             logger.info("Model parallelism is enabled, only saving the model sharded state dict.")
             if isinstance(self.model, PreTrainedModel):
-                self.model.config.save_pretrained(output_dir)
+                from neuronx_distributed.parallel_layers.parallel_state import get_tensor_model_parallel_size
+
+                config = copy.deepcopy(self.model.config)
+                if self.args.tp_plugin.parallelize_embeddings:
+                    config.vocab_size = config.vocab_size * get_tensor_model_parallel_size()
+                config.save_pretrained(output_dir)

             parallelizer = ParallelizersManager.parallelizer_for_model(self.model)
             # This mark_step is needed to avoid hang issues.
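The hunk above restores the original vocabulary size in the saved config. When embedding parallelization is enabled, each tensor-parallel rank holds only a slice of the embedding matrix, so `model.config.vocab_size` reflects the per-rank shard rather than the full model. A minimal sketch of the arithmetic follows; the TP size is hard-coded and a dummy config class stands in for the real `PretrainedConfig`, purely for illustration.

```python
import copy
from dataclasses import dataclass


@dataclass
class DummyConfig:
    vocab_size: int


tp_size = 8  # stand-in for get_tensor_model_parallel_size()
sharded_config = DummyConfig(vocab_size=32000 // tp_size)  # per-rank embedding slice

# Deep-copy so the live (sharded) config used during training is left untouched.
config = copy.deepcopy(sharded_config)
config.vocab_size = config.vocab_size * tp_size

assert config.vocab_size == 32000          # full vocabulary restored in the saved config
assert sharded_config.vocab_size == 4000   # in-memory config is unchanged
```

The deep copy is the important detail: mutating `self.model.config.vocab_size` in place would corrupt the configuration the still-sharded model continues to train with.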
4 changes: 4 additions & 0 deletions optimum/neuron/utils/training_utils.py
@@ -269,6 +269,10 @@ def prepare_environment_for_neuron():
     """
     # Set compiler flag to compile for transformer model type
     os.environ["NEURON_CC_FLAGS"] = os.environ.get("NEURON_CC_FLAGS", "") + " --model-type=transformer"
+    # Setting MALLOC_ARENA_MAX is needed because of a memory issue in XLA/glibc, otherwise OOM can happen during
+    # checkpointing. More information here:
+    # https://awsdocs-neuron.readthedocs-hosted.com/en/latest/release-notes/torch/torch-neuronx/index.html#memory-leaking-in-glibc
+    os.environ["MALLOC_ARENA_MAX"] = "64"


 def set_neuron_cc_optlevel_for_model(model: "PreTrainedModel", optlevel: str = "auto"):
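If you prefer to set these environment knobs yourself (for example in a launcher script, before importing anything Neuron-related), a sketch like the one below mirrors the values from the diff. It deliberately deviates from the unconditional assignment above by using `os.environ.setdefault`, so a value exported in the shell takes precedence; treat that as an assumption about your setup, not the library's behavior.

```python
import os

# Mirror the environment setup from prepare_environment_for_neuron(), but only
# fill in a default for MALLOC_ARENA_MAX so a shell-exported value wins.
os.environ["NEURON_CC_FLAGS"] = os.environ.get("NEURON_CC_FLAGS", "") + " --model-type=transformer"
os.environ.setdefault("MALLOC_ARENA_MAX", "64")  # limits glibc malloc arenas to curb memory growth during checkpointing

print(os.environ["MALLOC_ARENA_MAX"])
```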