Skip to content

Commit

Permalink
add timeout for dist
Browse files Browse the repository at this point in the history
  • Loading branch information
jeonsworld committed Nov 4, 2020
1 parent c6649cb commit 215e05a
Showing 1 changed file with 4 additions and 1 deletion.
5 changes: 4 additions & 1 deletion train.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
import random
import numpy as np

from datetime import timedelta

import torch
import torch.distributed as dist

Expand Down Expand Up @@ -296,7 +298,8 @@ def main():
else: # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
torch.distributed.init_process_group(backend='nccl')
torch.distributed.init_process_group(backend='nccl',
timeout=timedelta(minutes=60))
args.n_gpu = 1
args.device = device

Expand Down

0 comments on commit 215e05a

Please sign in to comment.