I am running the lab 1 example as is. Everything runs fine and training succeeds. But when I check the training logs, it all appears to be happening on [1,mpirank:0,algo-1]. I am passing instance_count as two and can see that there are two hosts [algo-1 and algo-2]. Each host has 8 GPUs, so the mpirank goes from 0 to 15, but all the training logs show just [1,mpirank:0,algo-1]. Below is a sample from the log.
[1,mpirank:0,algo-1]<stdout>:#015Epoch 0: 50% 1/2 [00:00<00:00, 6.54it/s, loss=2.29, v_num=0] [1,mpirank:0,algo-1]<stdout>: [1,mpirank:0,algo-1]<stdout>:#015Validation: 0it [00:00, ?it/s]#033[A [1,mpirank:0,algo-1]<stdout>: [1,mpirank:0,algo-1]<stdout>:#015Validation: 0% 0/1 [00:00<?, ?it/s]#033[A [1,mpirank:0,algo-1]<stdout>:#015Validation DataLoader 0: 0% 0/1 [00:00<?, ?it/s]#033[A [1,mpirank:0,algo-1]<stdout>: [1,mpirank:0,algo-1]<stdout>:#015Validation DataLoader 0: 100% 1/1 [00:00<00:00, 1113.73it/s]#033[A [1,mpirank:0,algo-1]<stdout>:#015Epoch 0: 100% 2/2 [00:00<00:00, 12.33it/s, loss=2.29, v_num=0] [1,mpirank:0,algo-1]<stdout>:#015Epoch 0: 100% 2/2 [00:00<00:00, 12.33it/s, loss=2.29, v_num=0] [1,mpirank:0,algo-1]<stdout>:#015Epoch 0: 100% 2/2 [00:00<00:00, 12.10it/s, loss=2.29, v_num=0, val_acc=0.166] [1,mpirank:0,algo-1]<stdout>:#015 #033[A [1,mpirank:0,algo-1]<stdout>:#015Epoch 0: 100% 2/2 [00:00<00:00, 12.05it/s, loss=2.29, v_num=0, val_acc=0.166] [1,mpirank:0,algo-1]<stdout>:#015Epoch 0: 0% 0/2 [00:00<?, ?it/s, loss=2.29, v_num=0, val_acc=0.166] #015Epoch 1: 0% 0/2 [00:00<?, ?it/s, loss=2.29, v_num=0, val_acc=0.166] [1,mpirank:0,algo-1]<stdout>:#015Epoch 1: 50% 1/2 [00:00<00:00, 35.14it/s, loss=2.29, v_num=0, val_acc=0.166] [1,mpirank:0,algo-1]<stdout>:#015Epoch 1: 50% 1/2 [00:00<00:00, 9.28it/s, loss=2.29, v_num=0, val_acc=0.166] [1,mpirank:0,algo-1]<stdout>: [1,mpirank:0,algo-1]<stdout>:#015Validation: 0it [00:00, ?it/s]#033[A [1,mpirank:0,algo-1]<stdout>: [1,mpirank:0,algo-1]<stdout>:#015Validation: 0% 0/1 [00:00<?, ?it/s]#033[A [1,mpirank:0,algo-1]<stdout>:#015Validation DataLoader 0: 0% 0/1 [00:00<?, ?it/s]#033[A [1,mpirank:0,algo-1]<stdout>: [1,mpirank:0,algo-1]<stdout>:#015Validation DataLoader 0: 100% 1/1 [00:00<00:00, 1333.22it/s]#033[A [1,mpirank:0,algo-1]<stdout>:#015Epoch 1: 100% 2/2 [00:00<00:00, 17.19it/s, loss=2.29, v_num=0, val_acc=0.166] [1,mpirank:0,algo-1]<stdout>:#015Epoch 1: 100% 2/2 [00:00<00:00, 17.18it/s, loss=2.29, v_num=0, 
val_acc=0.166] [1,mpirank:0,algo-1]<stdout>:#015Epoch 1: 100% 2/2 [00:00<00:00, 16.85it/s, loss=2.29, v_num=0, val_acc=0.206] [1,mpirank:0,algo-1]<stdout>: [1,mpirank:0,algo-1]<stdout>:#015 [1,mpirank:0,algo-1]<stdout>:#033[A [1,mpirank:0,algo-1]<stdout>:#015Epoch 1: 100% 2/2 [00:00<00:00, 16.77it/s, loss=2.29, v_num=0, val_acc=0.206] [1,mpirank:0,algo-1]<stdout>:#015Epoch 1: 0% 0/2 [00:00<?, ?it/s, loss=2.29, v_num=0, val_acc=0.206] #015Epoch 2: 0% 0/2 [00:00<?, ?it/s, loss=2.29, v_num=0, val_acc=0.206] [1,mpirank:0,algo-1]<stdout>:#015Epoch 2: 50% 1/2 [00:00<00:00, 34.22it/s, loss=2.29, v_num=0, val_acc=0.206] [1,mpirank:0,algo-1]<stdout>:#015Epoch 2: 50% 1/2 [00:00<00:00, 33.82it/s, loss=2.29, v_num=0, val_acc=0.206] [1,mpirank:0,algo-1]<stdout>: [1,mpirank:0,algo-1]<stdout>:#015Validation: 0it [00:00, ?it/s]#033[A [1,mpirank:0,algo-1]<stdout>: [1,mpirank:0,algo-1]<stdout>:#015Validation: 0% 0/1 [00:00<?, ?it/s]#033[A[1,mpirank:0,algo-1]<stdout>: [1,mpirank:0,algo-1]<stdout>:#015Validation DataLoader 0: 0% 0/1 [00:00<?, ?it/s]#033[A [1,mpirank:0,algo-1]<stdout>: [1,mpirank:0,algo-1]<stdout>:#015Validation DataLoader 0: 100% 1/1 [00:00<00:00, 1283.05it/s]#033[A [1,mpirank:0,algo-1]<stdout>:#015Epoch 2: 100% 2/2 [00:00<00:00, 52.55it/s, loss=2.29, v_num=0, val_acc=0.206] [1,mpirank:0,algo-1]<stdout>:#015Epoch 2: 100% 2/2 [00:00<00:00, 47.22it/s, loss=2.29, v_num=0, val_acc=0.246] [1,mpirank:0,algo-1]<stdout>:#015 #033[A [1,mpirank:0,algo-1]<stdout>:#015Epoch 2: 100% 2/2 [00:00<00:00, 46.59it/s, loss=2.29, v_num=0, val_acc=0.246] [1,mpirank:0,algo-1]<stdout>:#015Epoch 2: 0% 0/2 [00:00<?, ?it/s, loss=2.29, v_num=0, val_acc=0.246] #015Epoch 3: 0% 0/2 [00:00<?, ?it/s, loss=2.29, v_num=0, val_acc=0.246] [1,mpirank:0,algo-1]<stdout>:#015Epoch 3: 50% 1/2 [00:00<00:00, 35.53it/s, loss=2.29, v_num=0, val_acc=0.246] [1,mpirank:0,algo-1]<stdout>:#015Epoch 3: 50% 1/2 [00:00<00:00, 34.17it/s, loss=2.29, v_num=0, val_acc=0.246] [1,mpirank:0,algo-1]<stdout>: 
[1,mpirank:0,algo-1]<stdout>:#015Validation: 0it [00:00, ?it/s]#033[A [1,mpirank:0,algo-1]<stdout>: [1,mpirank:0,algo-1]<stdout>:#015Validation: 0% 0/1 [00:00<?, ?it/s]#033[A [1,mpirank:0,algo-1]<stdout>:#015Validation DataLoader 0: 0% 0/1 [00:00<?, ?it/s][1,mpirank:0,algo-1]<stdout>:#033[A [1,mpirank:0,algo-1]<stdout>: [1,mpirank:0,algo-1]<stdout>:#015Validation DataLoader 0: 100% 1/1 [00:00<00:00, 1230.36it/s]#033[A [1,mpirank:0,algo-1]<stdout>:#015Epoch 3: 100% 2/2 [00:00<00:00, 52.96it/s, loss=2.29, v_num=0, val_acc=0.246] [1,mpirank:0,algo-1]<stdout>:#015Epoch 3: 100% 2/2 [00:00<00:00, 47.93it/s, loss=2.29, v_num=0, val_acc=0.277][1,mpirank:0,algo-1]<stdout>: [1,mpirank:0,algo-1]<stdout>:#015 #033[A [1,mpirank:0,algo-1]<stdout>:#015Epoch 3: 100% 2/2 [00:00<00:00, 47.29it/s, loss=2.29, v_num=0, val_acc=0.277] [1,mpirank:0,algo-1]<stdout>:#015Epoch 3: 0% 0/2 [00:00<?, ?it/s, loss=2.29, v_num=0, val_acc=0.277] #015Epoch 4: 0% 0/2 [00:00<?, ?it/s, loss=2.29, v_num=0, val_acc=0.277] [1,mpirank:0,algo-1]<stdout>:#015Epoch 4: 50% 1/2 [00:00<00:00, 35.43it/s, loss=2.29, v_num=0, val_acc=0.277] [1,mpirank:0,algo-1]<stdout>:#015Epoch 4: 50% 1/2 [00:00<00:00, 34.41it/s, loss=2.28, v_num=0, val_acc=0.277] [1,mpirank:0,algo-1]<stdout>: [1,mpirank:0,algo-1]<stdout>:#015Validation: 0it [00:00, ?it/s]#033[A [1,mpirank:0,algo-1]<stdout>: [1,mpirank:0,algo-1]<stdout>:#015Validation: 0% 0/1 [00:00<?, ?it/s]#033[A [1,mpirank:0,algo-1]<stdout>:#015Validation DataLoader 0: 0% 0/1 [00:00<?, ?it/s]#033[A [1,mpirank:0,algo-1]<stdout>: [1,mpirank:0,algo-1]<stdout>:#015Validation DataLoader 0: 100% 1/1 [00:00<00:00, 1197.69it/s]#033[A [1,mpirank:0,algo-1]<stdout>:#015Epoch 4: 100% 2/2 [00:00<00:00, 52.35it/s, loss=2.28, v_num=0, val_acc=0.277] [1,mpirank:0,algo-1]<stdout>:#015Epoch 4: 100% 2/2 [00:00<00:00, 48.23it/s, loss=2.28, v_num=0, val_acc=0.305] [1,mpirank:0,algo-1]<stdout>:#015 #033[A [1,mpirank:0,algo-1]<stdout>:#015Epoch 4: 100% 2/2 [00:00<00:00, 47.55it/s, loss=2.28, v_num=0, 
val_acc=0.305] [1,mpirank:0,algo-1]<stdout>:#015Epoch 4: 0% 0/2 [00:00<?, ?it/s, loss=2.28, v_num=0, val_acc=0.305] #015Epoch 5: 0% 0/2 [00:00<?, ?it/s, loss=2.28, v_num=0, val_acc=0.305] [1,mpirank:0,algo-1]<stdout>:#015Epoch 5: 50% 1/2 [00:00<00:00, 35.41it/s, loss=2.28, v_num=0, val_acc=0.305] [1,mpirank:0,algo-1]<stdout>:#015Epoch 5: 50% 1/2 [00:00<00:00, 34.12it/s, loss=2.28, v_num=0, val_acc=0.305] [1,mpirank:0,algo-1]<stdout>: [1,mpirank:0,algo-1]<stdout>:#015Validation: 0it [00:00, ?it/s]#033[A [1,mpirank:0,algo-1]<stdout>: [1,mpirank:0,algo-1]<stdout>:#015Validation: 0% 0/1 [00:00<?, ?it/s]#033[A[1,mpirank:0,algo-1]<stdout>: [1,mpirank:0,algo-1]<stdout>:#015Validation DataLoader 0: 0% 0/1 [00:00<?, ?it/s]#033[A [1,mpirank:0,algo-1]<stdout>: [1,mpirank:0,algo-1]<stdout>:#015Validation DataLoader 0: 100% 1/1 [00:00<00:00, 1276.42it/s]#033[A [1,mpirank:0,algo-1]<stdout>:#015Epoch 5: 100% 2/2 [00:00<00:00, 52.82it/s, loss=2.28, v_num=0, val_acc=0.305] [1,mpirank:0,algo-1]<stdout>:#015Epoch 5: 100% 2/2 [00:00<00:00, 48.07it/s, loss=2.28, v_num=0, val_acc=0.333] [1,mpirank:0,algo-1]<stdout>:#015 #033[A [1,mpirank:0,algo-1]<stdout>:#015Epoch 5: 100% 2/2 [00:00<00:00, 47.45it/s, loss=2.28, v_num=0, val_acc=0.333] [1,mpirank:0,algo-1]<stdout>:#015Epoch 5: 0% 0/2 [00:00<?, ?it/s, loss=2.28, v_num=0, val_acc=0.333] [1,mpirank:0,algo-1]<stdout>:#015Epoch 6: 0% 0/2 [00:00<?, ?it/s, loss=2.28, v_num=0, val_acc=0.333] [1,mpirank:0,algo-1]<stdout>:#015Epoch 6: 50% 1/2 [00:00<00:00, 35.15it/s, loss=2.28, v_num=0, val_acc=0.333] [1,mpirank:0,algo-1]<stdout>:#015Epoch 6: 50% 1/2 [00:00<00:00, 34.69it/s, loss=2.28, v_num=0, val_acc=0.333] [1,mpirank:0,algo-1]<stdout>: