root@pai-worker1:/home/Data/exports/pytorch-distributed# srun -N1 -n2 --gres gpu:2 python distributed_slurm_main.py --dist-file dist_file
Traceback (most recent call last):
File "distributed_slurm_main.py", line 420, in <module>
main()
File "distributed_slurm_main.py", line 131, in main
mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/spawn.py", line 230, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/spawn.py", line 188, in start_processes
while not context.join():
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/spawn.py", line 150, in join
raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException:
-- Process 1 terminated with the following error:
Traceback (most recent call last):
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/spawn.py", line 59, in _wrap
fn(i, *args)
File "/home/Data/exports/pytorch-distributed/distributed_slurm_main.py", line 137, in main_worker
dist.init_process_group(backend='nccl',
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/distributed_c10d.py", line 608, in init_process_group
_store_based_barrier(rank, store, timeout)
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/distributed_c10d.py", line 244, in _store_based_barrier
raise RuntimeError(
RuntimeError: Timed out initializing process group in store based barrier on rank: 1, for key: store_based_barrier_key:1 (world_size=2, worker_count=4, timeout=0:30:00)
Traceback (most recent call last):
File "distributed_slurm_main.py", line 420, in <module>
main()
File "distributed_slurm_main.py", line 131, in main
mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/spawn.py", line 230, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/spawn.py", line 188, in start_processes
while not context.join():
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/spawn.py", line 150, in join
raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException:
-- Process 0 terminated with the following error:
Traceback (most recent call last):
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/spawn.py", line 59, in _wrap
fn(i, *args)
File "/home/Data/exports/pytorch-distributed/distributed_slurm_main.py", line 137, in main_worker
dist.init_process_group(backend='nccl',
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/distributed_c10d.py", line 608, in init_process_group
_store_based_barrier(rank, store, timeout)
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/distributed_c10d.py", line 244, in _store_based_barrier
raise RuntimeError(
RuntimeError: Timed out initializing process group in store based barrier on rank: 2, for key: store_based_barrier_key:1 (world_size=2, worker_count=4, timeout=0:30:00)
srun: error: pai-worker1: tasks 0-1: Exited with exit code 1
root@pai-worker1:/home/Data/exports/pytorch-distributed#