I want to run distributed training of a model on CPUs across 2 machines. The training script, the launch command file, and the time consumed on each machine are as follows:
On machine1 (ip: 10.0.0.113):
training script on machine1:
import os
import time
import warnings
import socket
import pandas as pd
from collections import OrderedDict
import torch
from torch import nn
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DistributedSampler, DataLoader
import torchmetrics as tm
from model import MyModel
warnings.filterwarnings("ignore",category=DeprecationWarning)
def calculate_value(pred_label, true_label):
    accuracy = tm.Accuracy(task="binary")
    precision = tm.Precision(task="binary")
    recall = tm.Recall(task="binary")
    f1score = tm.F1Score(task="binary")
    auroc = tm.AUROC(task="binary")
    auprc = tm.AveragePrecision(task="binary")
    acc = accuracy(pred_label, true_label)
    prec = precision(pred_label, true_label)
    rec = recall(pred_label, true_label)
    f1s = f1score(pred_label, true_label)
    auroc_ = auroc(pred_label, true_label)
    auprc_ = auprc(pred_label, true_label)
    return acc, prec, rec, f1s, auroc_, auprc_

def train_and_cross_validate(train_val_file, record_file, batch_size, epochs, learn_rate, feature_path):
    # process datasets
    dt_train_val = pd.read_csv(train_val_file, sep="\t", header=0)
    dt_val = dt_train_val[dt_train_val["group"].isin(["gp_3", "gp_4"])].reset_index(drop=True)
    dt_train = dt_train_val[dt_train_val["group"].isin(["gp_1", "gp_2", "gp_5"])].reset_index(drop=True)
    # upsampling of positive samples [only for the train dataset: random repeats of positive samples]
    dt_train_pos = dt_train[dt_train["label"] == 1].reset_index(drop=True)
    dt_train_neg = dt_train[dt_train["label"] == 0].reset_index(drop=True)
    dt_train_pos_ = dt_train_pos
    for n in range(9):
        dt_train_pos_n = dt_train_pos.sample(dt_train_pos.shape[0], random_state=20, replace=True, ignore_index=True)
        dt_train_pos_ = pd.concat([dt_train_pos_, dt_train_pos_n], ignore_index=True)
    dt_train = pd.concat([dt_train_pos_, dt_train_neg], ignore_index=True)
    dt_train = dt_train.sample(frac=1, replace=False, ignore_index=True)
    # default settings
    recordF = open(record_file, "a+")
    device = torch.device("cpu")
    world_size = int(os.environ["OMPI_COMM_WORLD_SIZE"])
    os.environ["WORLD_SIZE"] = str(world_size)
    rank = int(os.environ["OMPI_COMM_WORLD_RANK"])
    # distributed training
    torch.manual_seed(20 + rank)
    dist.init_process_group(backend="gloo", init_method="env://", rank=rank)
    if rank == 0:
        print("IP_Address: ", socket.gethostbyname(socket.gethostname()))
        print("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % ("epoch", "val_acc", "val_prec", "val_rec", "val_f1s", "val_auroc", "val_auprc", "val_loss", "train_acc", "train_loss"), flush=True)
        recordF.write("Start: {}\n".format(time.ctime(time.time())))
        ## recordF.write("%s\t%s\t%s\t%s\t%s\t%s\n" % ("Kfold", "Epoch", "Train_Loss", "Train_Acc", "Val_Loss", "Val_Acc"))
    # Create train and validation dataloaders
    train_set = []
    for i in range(dt_train.shape[0]):
        train_set.append([feature_path + dt_train["virus_unid"][i] + ".pt",
                          feature_path + dt_train["human_unid"][i] + ".pt",
                          torch.tensor(dt_train["label"][i], dtype=torch.float32)])
    train_sampler = DistributedSampler(dataset=train_set)
    train_set_loader = DataLoader(dataset=train_set, batch_size=batch_size, sampler=train_sampler, drop_last=True)
    val_set = []
    for j in range(dt_val.shape[0]):
        val_set.append([feature_path + dt_val["virus_unid"][j] + ".pt",
                        feature_path + dt_val["human_unid"][j] + ".pt",
                        torch.tensor(dt_val["label"][j], dtype=torch.float32)])
    validation_set_loader = DataLoader(dataset=val_set, batch_size=batch_size, drop_last=True, shuffle=True)
    # wrap the model for distributed data parallel training
    model = MyModel().to(device)
    ddp_model = DDP(model)
    # optimizer = torch.optim.Adam(ddp_model.parameters(), lr=learn_rate)
    optimizer = torch.optim.SGD(model.parameters(), lr=learn_rate, momentum=0.6)
    for epoch in range(1, epochs + 1):
        ddp_model.train()
        train_loss = 0.0
        train_correct = 0
        train_total = 0
        for vp, hp, label in train_set_loader:
            vp = torch.stack([torch.load(v) for v in vp])
            hp = torch.stack([torch.load(h) for h in hp])
            vp, hp, label = vp.to(device), hp.to(device), label.to(device)
            optimizer.zero_grad()
            train_output = ddp_model(vp, hp).squeeze().to(torch.float32)
            label = label.squeeze().to(torch.float32)
            # Compute training loss
            # loss_func = nn.CrossEntropyLoss(weight=torch.tensor([1.0 if int(l)==1 else 0.1 for l in label]))
            loss_func = nn.CrossEntropyLoss(reduction="sum")
            tra_loss = loss_func(train_output, label)
            train_loss += tra_loss.item()
            # Compute training accuracy
            # _, predicted = torch.max(train_output, 1)
            predicted = torch.where(train_output > 0.5, 1, 0)
            train_total += label.size(0)
            train_correct += (predicted == label).sum().item()
            tra_loss.backward()
            optimizer.step()
        # Synchronize and gather training loss and accuracy across processes
        torch.distributed.barrier()
        train_loss = torch.tensor(train_loss, device=device)
        train_loss_list = [torch.zeros_like(train_loss) for _ in range(world_size)]
        train_loss_all = torch.zeros_like(train_loss)
        torch.distributed.all_gather(train_loss_list, train_loss)
        for i in range(world_size):
            train_loss_all += train_loss_list[i]
        train_acc = train_correct / train_total
        train_loss_ = train_loss_all.item() / train_total
        # Synchronize and calculate validation loss and accuracy in the rank 0 process
        if rank == 0:
            ddp_model.eval()
            val_loss = 0.0
            val_correct = 0
            val_total = 0
            val_label_list = []
            predicted_list = []
            with torch.no_grad():
                for vp_val, hp_val, label_val in validation_set_loader:
                    vp_val = torch.stack([torch.load(v) for v in vp_val])
                    hp_val = torch.stack([torch.load(h) for h in hp_val])
                    vp_val, hp_val, label_val = vp_val.to(device), hp_val.to(device), label_val.to(device)
                    pred_output = ddp_model(vp_val, hp_val).squeeze().to(torch.float32)
                    label_val = label_val.squeeze().to(torch.float32)
                    # Compute validation loss
                    # loss_func = nn.CrossEntropyLoss(weight=torch.tensor([1.0 if int(l)==1 else 0.1 for l in label_val]))
                    loss_func = nn.CrossEntropyLoss(reduction="sum")
                    val_loss += loss_func(pred_output, label_val).item()
                    # print(pred_output)
                    # Compute validation accuracy
                    # _, predicted = torch.max(pred_output, 1)
                    predicted_ = torch.where(pred_output > 0.5, 1, 0)
                    val_label_list += list(label_val.cpu().detach().numpy())
                    predicted_list += list(predicted_)
                    val_total += label_val.size(0)
                    val_correct += (predicted_ == label_val).sum().item()
            val_loss /= len(validation_set_loader.dataset)
            val_acc = val_correct / val_total
            predicted_list = torch.tensor(predicted_list, dtype=torch.float32)
            val_label_list = torch.tensor(val_label_list, dtype=torch.int)
            acc, prec, rec, f1s, auroc_, auprc_ = calculate_value(predicted_list, val_label_list)
            print("%d\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f" % (epoch, acc, prec, rec, f1s, auroc_, auprc_, val_loss, train_acc, train_loss_), flush=True)
    if rank == 0:
        recordF.write("End: {}\n".format(time.ctime(time.time())))
    recordF.close()
    dist.destroy_process_group()

if __name__ == "__main__":
    train_and_cross_validate(train_val_file="../datasets/train_val.txt",
                             record_file="./record.txt",
                             batch_size=16,
                             epochs=15,
                             learn_rate=0.01,
                             feature_path="../onehot_Features_20/")
mpirun command shell script on machine1:
#!/bin/bash
export MASTER_ADDR="10.0.0.113"
export MASTER_PORT=1234
source activate vhPPI_transformer
mpirun -np 10 \
--hostfile myhostfile \
--prefix /home/zhangzhiyuan/BioSoftware/OpenMPI/ \
python vhPPI_multiCPUs_DataloadBatch_1_1.py
hostfile on machine1:
10.0.0.113
10.0.0.112
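(As far as I understand, an Open MPI hostfile can also state how many processes each host should take via a slots entry; the two lines below are only an illustration of that syntax, not my actual hostfile, with the 10 ranks split evenly as an example:)
10.0.0.113 slots=5
10.0.0.112 slots=5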
Time consumed by the script run on machine1:
Start: Wed Jul 5 14:04:22 2023
End: Wed Jul 5 14:08:19 2023
On machine2 (ip: 10.0.0.112):
training script on machine2:
The Python script on machine2 is the same as the one on machine1, except that its seed line is torch.manual_seed(30 + rank) instead of torch.manual_seed(20 + rank).
mpirun command shell script on machine2:
#!/bin/bash
export MASTER_ADDR="10.0.0.112"
export MASTER_PORT=1234
source activate vhPPI_transformer
mpirun -np 16 \
--hostfile myhostfile \
--prefix /home/zhangzhiyuan/BioSoftware/openmpi/ \
python vhPPI_multiCPUs_DataloadBatch_1_1.py
hostfile on machine2:
10.0.0.113
10.0.0.112
Time consumed by the script run on machine2:
Start: Wed Jul 5 14:04:27 2023
End: Wed Jul 5 14:08:47 2023
Time consumed by the script run on machine1 alone (single machine):
Start: Wed Jul 5 15:59:55 2023
End: Wed Jul 5 16:04:42 2023
My question is:
The time consumed by distributed training across machine1 and machine2 is roughly the same as the time consumed when running only on machine1, which suggests that I have not actually achieved multi-node distributed training.
I have checked the OpenMPI version on machine1 and machine2; both are 4.1.5, and OpenMPI has been added to the environment PATH on both machines. I can log in to machine2 from machine1 (and vice versa) via ssh, and I have also checked the connection between machine1 and machine2 with ping; it works well.
What I expect is that training on machine1 and machine2 together should take less time than training on machine1 alone.
So I want to know: is there any potential problem that makes the distributed training fail?
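For reference, below is a minimal sanity-check script I could launch with the same mpirun command, hostfile, and exported MASTER_ADDR/MASTER_PORT, to see whether the gloo process group really contains ranks running on both machines. This is only my own sketch, not part of the training code: the file name (for example check_ranks.py) is a placeholder, and it assumes a PyTorch version that provides dist.all_gather_object.
import os
import socket
import torch.distributed as dist

# Open MPI sets these variables for every process it launches
rank = int(os.environ["OMPI_COMM_WORLD_RANK"])
world_size = int(os.environ["OMPI_COMM_WORLD_SIZE"])
os.environ["RANK"] = str(rank)
os.environ["WORLD_SIZE"] = str(world_size)
# MASTER_ADDR and MASTER_PORT are assumed to be exported by the launch script

dist.init_process_group(backend="gloo", init_method="env://", rank=rank, world_size=world_size)

# gather every rank's hostname, then report from rank 0
hostnames = [None] * world_size
dist.all_gather_object(hostnames, socket.gethostname())
if rank == 0:
    print("world_size =", world_size)
    for r, h in enumerate(hostnames):
        print("rank", r, "runs on host", h, flush=True)

dist.destroy_process_group()
If every rank reports the hostname of machine1 only, the ranks are not being placed on machine2 at all, and the mpirun/hostfile setup rather than the training script itself would be the thing to look at.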