We recently set up a Slurm cluster with two nodes (one head node that also acts as a compute node, and one dedicated compute node) for HPC CFD simulations. I am now trying to run a Python script used for feature selection in one of our machine learning projects; on a single machine it takes about a day to finish. I have installed Python and all required libraries on both machines, verified node availability in Slurm, and configured a job script with the required parameters (shown below). When I submit the job with sbatch, I can see the script executing on the head node (where I run sbatch) but not on the other node, and it uses exactly the number of cores I specified in the job script. However, when I specify the compute node with --nodelist, the script does run on the other compute node. Kindly help, as I am a beginner in both Slurm cluster administration and Python.
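For reference, this is roughly how I submit the job (job.sh and node02 below are placeholder names, not the actual file or host names on our cluster):

sbatch job.sh                      # runs, but only ever on the head node
sbatch --nodelist=node02 job.sh    # the script then runs on the other compute node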
Python Script
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# import seaborn as sns
from sklearn.feature_selection import f_classif, mutual_info_classif
from lightgbm import LGBMClassifier
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
import matplotlib as mpl
mpl.style.use('seaborn')
# df = pd.read_csv('vib_processed_data.csv')#Vibration
df = pd.read_csv('curr_processed_data.csv')#Current
# print(df.shape)
# print(df.describe().T.to_string())
meta = pd.read_csv("data.csv", names=["mimb", "bbl", "mbl", "fcl", "file_name"],
header=None, skiprows=1)
# print(meta.shape)
meta_new = pd.DataFrame()
for i in range(6):
    dummy = meta.copy()
    dummy["file_name"] = dummy["file_name"].apply(lambda x: f"{x}_{i}")
    meta_new = pd.concat([
        meta_new,
        dummy
    ])
# print(meta_new.head(), meta_new.shape)
meta = meta_new.reset_index(drop=True)
no_single_defects_df = pd.concat([meta.query('mimb==0&bbl==0&mbl==0&fcl==0'),
meta.query('(mimb==1 |mimb==2) &bbl==0&mbl==0&fcl==0'),
meta.query('mimb==0&(bbl==1|bbl==2)&mbl==0&fcl==0'),
meta.query('mimb==0&bbl==0&(mbl==1|mbl==2)&fcl==0'),
meta.query('mimb==0&bbl==0&mbl==0&(fcl==1|fcl==2)')]).reset_index(drop=True)
# print(no_single_defects_df.tail())
no_single_defects_df.reset_index(drop=True,inplace=True)
defects = ['mimb', 'bbl', 'mbl', 'fcl']
labels = np.argmax(no_single_defects_df[defects].values, axis=1).tolist()
no_single_defects_df=pd.merge(no_single_defects_df, df, on="file_name", how="left")
f_values, p_values = f_classif(no_single_defects_df.drop(columns=defects+["file_name"]), labels)
anova_test = pd.DataFrame(columns=["features", "f_values", "p_values"])
anova_test["features"] = no_single_defects_df.drop(columns=defects+["file_name"]).columns
anova_test["f_values"] = f_values
anova_test["p_values"] = p_values
anova_test = anova_test.sort_values(by=["f_values", "p_values"], ascending=False).reset_index(drop=True)
# print(anova_test.head(100).to_string())
features = anova_test[anova_test.index < 100]["features"].values
# corr = df[features].corr()
# mask = np.triu(corr)
# plt.figure(figsize=(20,15))
# sns.heatmap(corr, mask=mask)
# plt.show()
#-----------------------------------------------------------------------------------------------------------------------
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import hamming_loss, multilabel_confusion_matrix, classification_report
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from sklearn.metrics import get_scorer_names
from mlxtend.plotting import plot_sequential_feature_selection
from scipy.special import binom
def multinomial(params):
    if len(params) == 1:
        return 1
    return binom(sum(params), params[-1]) * multinomial(params[:-1])
for i in range(3):
    meta[f"Count_{i}"] = (meta[defects] == i).astype(int).sum(axis=1)
meta['groups'] = meta.apply(lambda row: int(multinomial([row['Count_0'],row['Count_1'], row['Count_2']])),axis=1)
# print(meta.shape)
df = pd.merge(meta, df, on="file_name", how = 'left')
train_data = df[features.tolist()+["groups"]+defects].reset_index(drop=True)
print(train_data.head())
Y = np.zeros((train_data.shape[0], 8))
counter = 0
for idx, defect in enumerate(defects):
    for i in range(1, 3):
        Y[:, counter] = (train_data[defect].to_numpy() == i)
        counter += 1
# print(list(Y))
# print(train_data[defects].tail())
# print(np.sum(Y))
Y_str = []
for label in Y:
    Y_str.append("".join(list(map(str, label))))
# print(len(Y_str))
print('classification start')
gstrf = StratifiedGroupKFold(n_splits=3, shuffle=True, random_state=43)
# clf = Pipeline([('scaler', StandardScaler()), ('lr', OneVsRestClassifier(LogisticRegression()))])
clf = OneVsRestClassifier(LGBMClassifier())
feature_selector = sfs(
estimator=clf,
cv=list(
gstrf.split(df[features],y=Y_str, groups=df["groups"])
),
k_features=15,
scoring='roc_auc',
verbose=2,
floating=True,
n_jobs=-1
)
feature_selector.fit(df[features], Y)
print(feature_selector.k_feature_names_)
selected_features = list(feature_selector.k_feature_names_)
# fig1 = plot_sequential_feature_selection(feature_selector.get_metric_dict(), figsize=(10,20))
# plt.ylim([0,1])
# plt.show()
dump_features_list = selected_features.copy()
dump_features_list.append('file_name')
for defect in defects:
    dump_features_list.append(defect)
dump_features_list.append('groups')
feature_dump = df[dump_features_list]
# feature_dump.to_csv("curr_feature_dump.csv", index=False)
# corr = df[list(feature_selector.k_feature_names_)].corr()
# mask = np.triu(corr)
# plt.figure(figsize=(20,15))
# sns.heatmap(corr, mask=mask, annot=True);
# plt.show()
selected_features = list(feature_selector.k_feature_names_)
plt.rc('font', size=5)
fig,ax = plot_sfs(feature_selector.get_metric_dict(), kind='std_dev',
figsize=(10, 7));
# ax.set_xticklabels(list(feature_selector.k_feature_names_))
plt.ylim(0,1)
ax.tick_params(axis="x", rotation=5)
ax.set_xlabel("Sequential Features")
ax.set_ylabel("Performance - AUC")
fig.align_labels()
plt.show()
fig.savefig("./vib_feature_selection_lgbm.png", dpi=300, bbox_inches='tight')
train_1_indices = list(train_data.query("groups==1|groups==4|groups==6").index)
val_1_indices = list(set(train_data.index) - set(train_1_indices))
train_2_indices = list(train_data.query("groups==1|groups==12").index)
val_2_indices = list(set(train_data.index) - set(train_2_indices))
cv = [
(train_1_indices, val_1_indices),
(train_2_indices, val_2_indices),
(val_1_indices, train_1_indices),
(val_2_indices, train_2_indices),
]
# ps = PredefinedSplit(train_data["groups"])
# train_data["kfold"] = -1
# for fold, (train_indices, val_indices) in enumerate(cv):
# train_data.loc[val_indices, "kfold"] = fold
# train_data[selected_features].head()
for fold, (train_indices, val_indices) in enumerate(cv):
    print(f"FOLD {fold}:\n")
    # splitting train and val based on k-fold
    x_train = train_data.loc[train_indices]
    x_val = train_data.loc[val_indices]
    print(f"train_groups: {x_train.groups.unique()}")
    print(f"val_groups: {x_val.groups.unique()}")
    # exclude 'groups'
    x_train = x_train[selected_features]
    x_val = x_val[selected_features]
    y_train = Y[x_train.index]
    y_val = Y[x_val.index]
    clf = OneVsRestClassifier(LGBMClassifier())
    # clf = OneVsRestClassifier(LGBMClassifier(boosting_type="dart",
    #                                          objective="binary",
    #                                          verbose=-1))
    clf.fit(x_train, y_train)
    # predict train and val
    train_preds = clf.predict(x_train)
    val_preds = clf.predict(x_val)
    # calculate train and val loss (hamming)
    train_loss = hamming_loss(y_train, train_preds)
    val_loss = hamming_loss(y_val, val_preds)
    print(f"TRAIN LOSS: {train_loss}")
    print(f"OOF LOSS: {val_loss}\n")
    print(classification_report(y_val, val_preds))
    cMat = multilabel_confusion_matrix(y_val, val_preds)
    print(cMat)
    print("\n")
Job Script
#!/bin/bash
#SBATCH --job-name=testjob
#SBATCH --nodes=2
#SBATCH --ntasks=32
#SBATCH --ntasks-per-node=16
#SBATCH --partition=accel_ai
python3 featureselector.py
I have also tried prefixing the Python command with srun/mpirun in the job script. That does launch work on both nodes, but, as far as I understand it, the same script is started as a separate instance on every task/core, which is not what I want.
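For completeness, this is roughly the srun variant I tried (a minimal sketch; only the last line of the job script above changes):

#!/bin/bash
#SBATCH --job-name=testjob
#SBATCH --nodes=2
#SBATCH --ntasks=32
#SBATCH --ntasks-per-node=16
#SBATCH --partition=accel_ai
srun python3 featureselector.py    # launches 32 independent copies of the script, one per task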