I want to run distributed training of a model on CPUs across 2 machines. The training script, the launch command file, and the time consumed on each machine are as follows:
On machine1 (ip: 10.0.0.113):
training script on machine1:
import os
import time
import warnings
import socket
import pandas as pd
from collections import OrderedDict
import torch
from torch import nn
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DistributedSampler, DataLoader
import torchmetrics as tm
from model import MyModel
warnings.filterwarnings("ignore",category=DeprecationWarning)
def calculate_value(pred_label, true_label):
    accuracy = tm.Accuracy(task="binary")
    precision = tm.Precision(task="binary")
    recall = tm.Recall(task="binary")
    f1score = tm.F1Score(task="binary")
    auroc = tm.AUROC(task="binary")
    auprc = tm.AveragePrecision(task="binary")
    acc = accuracy(pred_label, true_label)
    prec = precision(pred_label, true_label)
    rec = recall(pred_label, true_label)
    f1s = f1score(pred_label, true_label)
    auroc_ = auroc(pred_label, true_label)
    auprc_ = auprc(pred_label, true_label)
    return acc, prec, rec, f1s, auroc_, auprc_

def train_and_cross_validate(train_val_file, record_file, batch_size, epochs, learn_rate, feature_path):
    # process datasets
    dt_train_val = pd.read_csv(train_val_file, sep="\t", header=0)
    dt_val = dt_train_val[dt_train_val["group"].isin(["gp_3", "gp_4"])].reset_index(drop=True)
    dt_train = dt_train_val[dt_train_val["group"].isin(["gp_1", "gp_2", "gp_5"])].reset_index(drop=True)
    # upsampling of positive samples [only for the train dataset: random repeats of positive samples]
    dt_train_pos = dt_train[dt_train["label"] == 1].reset_index(drop=True)
    dt_train_neg = dt_train[dt_train["label"] == 0].reset_index(drop=True)
    dt_train_pos_ = dt_train_pos
    for n in range(9):
        dt_train_pos_n = dt_train_pos.sample(dt_train_pos.shape[0], random_state=20, replace=True, ignore_index=True)
        dt_train_pos_ = pd.concat([dt_train_pos_, dt_train_pos_n], ignore_index=True)
    dt_train = pd.concat([dt_train_pos_, dt_train_neg], ignore_index=True)
    dt_train = dt_train.sample(frac=1, replace=False, ignore_index=True)
    # default settings
    recordF = open(record_file, "a+")
    device = torch.device("cpu")
    world_size = int(os.environ["OMPI_COMM_WORLD_SIZE"])
    os.environ["WORLD_SIZE"] = str(world_size)
    rank = int(os.environ["OMPI_COMM_WORLD_RANK"])
    # distributed training
    torch.manual_seed(20 + rank)
    dist.init_process_group(backend="gloo", init_method="env://", rank=rank)
    if rank == 0:
        print("IP_Address: ", socket.gethostbyname(socket.gethostname()))
        print("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % ("epoch", "val_acc", "val_prec", "val_rec", "val_f1s", "val_auroc", "val_auprc", "val_loss", "train_acc", "train_loss"), flush=True)
        recordF.write("Start: {}\n".format(time.ctime(time.time())))
        ## recordF.write("%s\t%s\t%s\t%s\t%s\t%s\n" % ("Kfold", "Epoch", "Train_Loss", "Train_Acc", "Val_Loss", "Val_Acc"))
    # Create train and validation dataloaders
    train_set = []
    for i in range(dt_train.shape[0]):
        train_set.append([feature_path + dt_train["virus_unid"][i] + ".pt",
                          feature_path + dt_train["human_unid"][i] + ".pt",
                          torch.tensor(dt_train["label"][i], dtype=torch.float32)])
    train_sampler = DistributedSampler(dataset=train_set)
    train_set_loader = DataLoader(dataset=train_set, batch_size=batch_size, sampler=train_sampler, drop_last=True)
    val_set = []
    for j in range(dt_val.shape[0]):
        val_set.append([feature_path + dt_val["virus_unid"][j] + ".pt",
                        feature_path + dt_val["human_unid"][j] + ".pt",
                        torch.tensor(dt_val["label"][j], dtype=torch.float32)])
    validation_set_loader = DataLoader(dataset=val_set, batch_size=batch_size, drop_last=True, shuffle=True)
    # wrap the model for distributed data parallel training
    model = MyModel().to(device)
    ddp_model = DDP(model)
    # optimizer = torch.optim.Adam(ddp_model.parameters(), lr=learn_rate)
    optimizer = torch.optim.SGD(model.parameters(), lr=learn_rate, momentum=0.6)
    for epoch in range(1, epochs + 1):
        ddp_model.train()
        train_loss = 0.0
        train_correct = 0
        train_total = 0
        for vp, hp, label in train_set_loader:
            vp = torch.stack([torch.load(v) for v in vp])
            hp = torch.stack([torch.load(h) for h in hp])
            vp, hp, label = vp.to(device), hp.to(device), label.to(device)
            optimizer.zero_grad()
            train_output = ddp_model(vp, hp).squeeze().to(torch.float32)
            label = label.squeeze().to(torch.float32)
            # Compute training loss
            # loss_func = nn.CrossEntropyLoss(weight=torch.tensor([1.0 if int(l)==1 else 0.1 for l in label]))
            loss_func = nn.CrossEntropyLoss(reduction="sum")
            tra_loss = loss_func(train_output, label)
            train_loss += tra_loss.item()
            # Compute training accuracy
            # _, predicted = torch.max(train_output, 1)
            predicted = torch.where(train_output > 0.5, 1, 0)
            train_total += label.size(0)
            train_correct += (predicted == label).sum().item()
            tra_loss.backward()
            optimizer.step()
        # Synchronize and gather training loss and accuracy across processes
        torch.distributed.barrier()
        train_loss = torch.tensor(train_loss, device=device)
        train_loss_list = [torch.zeros_like(train_loss) for _ in range(world_size)]
        train_loss_all = torch.zeros_like(train_loss)
        torch.distributed.all_gather(train_loss_list, train_loss)
        for i in range(world_size):
            train_loss_all += train_loss_list[i]
        train_acc = train_correct / train_total
        train_loss_ = train_loss_all.item() / train_total
        # Synchronize and calculate validation loss and accuracy in the rank 0 process
        if rank == 0:
            ddp_model.eval()
            val_loss = 0.0
            val_correct = 0
            val_total = 0
            val_label_list = []
            predicted_list = []
            with torch.no_grad():
                for vp_val, hp_val, label_val in validation_set_loader:
                    vp_val = torch.stack([torch.load(v) for v in vp_val])
                    hp_val = torch.stack([torch.load(h) for h in hp_val])
                    vp_val, hp_val, label_val = vp_val.to(device), hp_val.to(device), label_val.to(device)
                    pred_output = ddp_model(vp_val, hp_val).squeeze().to(torch.float32)
                    label_val = label_val.squeeze().to(torch.float32)
                    # Compute validation loss
                    # loss_func = nn.CrossEntropyLoss(weight=torch.tensor([1.0 if int(l)==1 else 0.1 for l in label_val]))
                    loss_func = nn.CrossEntropyLoss(reduction="sum")
                    val_loss += loss_func(pred_output, label_val).item()
                    # print(pred_output)
                    # Compute validation accuracy
                    # _, predicted = torch.max(pred_output, 1)
                    predicted_ = torch.where(pred_output > 0.5, 1, 0)
                    val_label_list += list(label_val.cpu().detach().numpy())
                    predicted_list += list(predicted_)
                    val_total += label_val.size(0)
                    val_correct += (predicted_ == label_val).sum().item()
            val_loss /= len(validation_set_loader.dataset)
            val_acc = val_correct / val_total
            predicted_list = torch.tensor(predicted_list, dtype=torch.float32)
            val_label_list = torch.tensor(val_label_list, dtype=torch.int)
            acc, prec, rec, f1s, auroc_, auprc_ = calculate_value(predicted_list, val_label_list)
            print("%d\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f" % (epoch, acc, prec, rec, f1s, auroc_, auprc_, val_loss, train_acc, train_loss_), flush=True)
    if rank == 0:
        recordF.write("End: {}\n".format(time.ctime(time.time())))
    recordF.close()
    dist.destroy_process_group()

if __name__ == "__main__":
    train_and_cross_validate(train_val_file="../datasets/train_val.txt",
                             record_file="./record.txt",
                             batch_size=16,
                             epochs=15,
                             learn_rate=0.01,
                             feature_path="../onehot_Features_20/")
mpirun command shell script on machine1:
#!/bin/bash
export MASTER_ADDR="10.0.0.113"
export MASTER_PORT=1234
source activate vhPPI_transformer
mpirun -np 10 \
--hostfile myhostfile \
--prefix /home/zhangzhiyuan/BioSoftware/OpenMPI/ \
python vhPPI_multiCPUs_DataloadBatch_1_1.py
hostfile on machine1:
10.0.0.113
10.0.0.112
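(As far as I understand, an Open MPI hostfile can also state how many processes each host should take via a slots entry; the two lines below are only an illustration of that syntax, not my actual hostfile, with the 10 ranks split evenly as an example:)
10.0.0.113 slots=5
10.0.0.112 slots=5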
Time consumed by the script run on machine1:
Start: Wed Jul 5 14:04:22 2023
End: Wed Jul 5 14:08:19 2023
On machine2 (ip: 10.0.0.112):
training script on machine2:
The Python script on machine2 is the same as the one on machine1, except that its seed line is torch.manual_seed(30 + rank) instead of torch.manual_seed(20 + rank).
mpirun command shell script on machine2:
#!/bin/bash
export MASTER_ADDR="10.0.0.112"
export MASTER_PORT=1234
source activate vhPPI_transformer
mpirun -np 16 \
--hostfile myhostfile \
--prefix /home/zhangzhiyuan/BioSoftware/openmpi/ \
python vhPPI_multiCPUs_DataloadBatch_1_1.py
hostfile on machine2:
10.0.0.113
10.0.0.112
Time consumed by the script run on machine2:
Start: Wed Jul 5 14:04:27 2023
End: Wed Jul 5 14:08:47 2023
Time consumed by the script run on machine1 alone (single machine):
Start: Wed Jul 5 15:59:55 2023
End: Wed Jul 5 16:04:42 2023
My question is:
The time consumed by distributed training across machine1 and machine2 is roughly the same as the time consumed when running only on machine1, which suggests that I have not actually achieved multi-node distributed training.
I have checked the OpenMPI version on machine1 and machine2; both are 4.1.5, and OpenMPI has been added to the environment PATH on both machines. I can log in to machine2 from machine1 (and vice versa) via ssh, and I have also checked the connection between machine1 and machine2 with ping; it works well.
What I expect is that training on machine1 and machine2 together should take less time than training on machine1 alone.
So I want to know: is there any potential problem that makes the distributed training fail?
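For reference, below is a minimal sanity-check script I could launch with the same mpirun command, hostfile, and exported MASTER_ADDR/MASTER_PORT, to see whether the gloo process group really contains ranks running on both machines. This is only my own sketch, not part of the training code: the file name (for example check_ranks.py) is a placeholder, and it assumes a PyTorch version that provides dist.all_gather_object.
import os
import socket
import torch.distributed as dist

# Open MPI sets these variables for every process it launches
rank = int(os.environ["OMPI_COMM_WORLD_RANK"])
world_size = int(os.environ["OMPI_COMM_WORLD_SIZE"])
os.environ["RANK"] = str(rank)
os.environ["WORLD_SIZE"] = str(world_size)
# MASTER_ADDR and MASTER_PORT are assumed to be exported by the launch script

dist.init_process_group(backend="gloo", init_method="env://", rank=rank, world_size=world_size)

# gather every rank's hostname, then report from rank 0
hostnames = [None] * world_size
dist.all_gather_object(hostnames, socket.gethostname())
if rank == 0:
    print("world_size =", world_size)
    for r, h in enumerate(hostnames):
        print("rank", r, "runs on host", h, flush=True)

dist.destroy_process_group()
If every rank reports the hostname of machine1 only, the ranks are not being placed on machine2 at all, and the mpirun/hostfile setup rather than the training script itself would be the thing to look at.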