I am doing a Masters in Construction and Real Estate Management. My topic is about scheduling and how to predict the duration of an activity in a new project using historical data.
I learned most of my knowledge through Code Academy and I am now in the process of writing my model and debugging it on a sample dataset I created myself. The problem I am facing when running it is that the model parameters apparently don't lead to convergence. or perhaps I am choosing wrong models to process my data idk
Here is my code and sample data attached, written using Spyder's Python IDE in Anaconda Desktop A few things to note: I am trying to utilize pipelines for data preprocessing I am trying to use pipelines to iterate over a selection of models and boosting techniques and Hyperparameters to come up with the best model for my data, this is where I think the issue is mostly
ALL MY CODE
# Installation of all necessary external packages and IDE through Anaconda Package Manager
# The following Packages were installed(noting that not all of them were necessarily used): Scikit-Learn, Pandas, NumPy, SciPy, MatPlotLib, Seaborn, XGBoost
import sklearn, xgboost, pandas as pd, numpy as np, scipy as sp, matplotlib.pyplot as plt, seaborn as sns
# Installation of Weights and Biases Command Line Interface (CLI) and Python library through Python’s Package Installer (PIP) in Anaconda Prompt
# Login to W&B. My API Key 961898dc9de1a991c524effe720cfc9b007ea71d
import wandb
wandb.init(project="CONREM_MSc_Thesis_Project")
# Importing the chosen Scikit-Learn Models (A justification for chosen models is included)
from sklearn.linear_model import LinearRegression, LassoLars, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
# Importing other required libraries and Model metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
# Importing Model Boosting Technique
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
# Importing Pipeline Object
from sklearn.pipeline import Pipeline
# Accessing combined schedule database using Pandas
# Read the CSV file with the manually defined column names
Schedules_Historical_Database = pd.read_csv('V4_Randomized_1000_Substructure_Schedules_Database.csv')
Activity_Names_List = ['Excavation','Total']
# Extracting Activity Feature Columns from the Database
Feature_Columns = Schedules_Historical_Database.columns[1:]
# Creating SKLearn Machine Learning Pipelines for Data Preprocessing, Model Selection, and Hyperparameter Tuning
SKLearn_Feature_Columns = Feature_Columns
# Initialize an empty list to store target features for each activity
Target_Features_List = []
# Create the list of target feature names by combining activity names with "Baseline Duration"
for activity_name in Activity_Names_List:
target_feature = f"{activity_name} Baseline Duration"
Target_Features_List.append(target_feature)
print(Target_Features_List)
# Extract the target features from the DataFrame
y = Schedules_Historical_Database[Target_Features_List]
# Removing the target feature column to get the input features
X = Schedules_Historical_Database.drop(columns = Target_Features_List)
Numerical_value_columns = X.select_dtypes(include=np.number).columns
# Splitting the data into training dataset and validation dataset (70% and 30%)
X_train, X_test, y_train, y_test = train_test_split(
X[Numerical_value_columns], y,
train_size=0.7, test_size=0.3,
random_state = 1
)
# Data preprocessing pipeline
Data_preprocessing_pipeline = Pipeline(
[('imputer_type', SimpleImputer(strategy = 'median')),
('scaler',StandardScaler())]
)
Data_preprocessing_pipeline.fit(X_train)
x_transform = Data_preprocessing_pipeline.transform(X_test)
SKLearn_Column_Transformer = ColumnTransformer(
transformers=[("Data Preprocessing", Data_preprocessing_pipeline, Numerical_value_columns)]
)
Training_Data_Transformation = SKLearn_Column_Transformer.fit(X_train)
Testing_Data_Transformation = SKLearn_Column_Transformer.transform(X_test)
# Machine Learning Pipelines, Ensemble Models, and Model Accuracy Boosting Algorithm.
ML_Pipeline_Model_1 = Pipeline(
[('activity_feature_selector', Data_preprocessing_pipeline),
('ML_Ensemble_Model', LinearRegression()),
('ML_Model_Boosting', XGBRegressor)]
)
ML_Pipeline_Model_2 = Pipeline(
[('activity_feature_selector', Data_preprocessing_pipeline),
('ML_Ensemble_Model', ElasticNet()),
('ML_Model_Boosting', XGBRegressor)]
)
ML_Pipeline_Model_3 = Pipeline(
[('activity_feature_selector', Data_preprocessing_pipeline),
('ML_Ensemble_Model', LassoLars()),
('ML_Model_Boosting', XGBRegressor)]
)
ML_Pipeline_Model_4 = Pipeline(
[('activity_feature_selector', Data_preprocessing_pipeline),
('ML_Ensemble_Model', DecisionTreeRegressor()),
('ML_Model_Boosting', XGBRegressor)]
)
ML_Pipeline_Model_5 = Pipeline(
[('activity_feature_selector', Data_preprocessing_pipeline),
('ML_Ensemble_Model', MLPRegressor()),
('ML_Model_Boosting', XGBRegressor)] # 10 MLP NNs is enough
)
Regressor_1 = LinearRegression()
Regressor_2 = ElasticNet()
Regressor_3 = LassoLars()
Regressor_4 = DecisionTreeRegressor()
Regressor_5 = MLPRegressor()
# Creating Model and Hyperparameters Dictionaries
Model_1_Regressor = {
'LinearRegressor': Regressor_1
}
Model_1_HPs = {
'fit_intercept': [True, False],
} #Hyperparamater list of values
Model_2_Regressor = {
'ElasticNet': Regressor_2
}
Model_2_HPs = {
'alpha' : [0.01, 0.05, 0.1, 0.2, 0.4, 0.8],
'l1_ratio' : [0.3, 0.5, 0.7]
}
Model_3_Regressor = {
'LassoLARSRegressor': Regressor_3
}
Model_3_HPs = {
'alpha': [0.3, 0.6, 0.9],
'max_iter': [500, 1000, 2000]
}
Model_4_Regressor = {
'DecisionTreeRegressor': Regressor_4
}
Model_4_HPs = {
'criterion': ["squared_error", "friedman_mse"]
}
Model_5_Regressor = {
'MLPRegressor': Regressor_5
}
Model_5_HPs = {
'hidden_layer_sizes': [100, 200, 400, 800, 1600, 3200],
'activation': ['identity', 'relu'],
'solver': ['sgd', 'adam'],
'learning_rate': ['constant'],
'max_iter': [200, 400, 800, 1600, 3200]
}
# Iterating over Pipelines to select best Model, and subsequent Model Parameters using SKLearn Grid Search Cross Validation
SKLearn_ML_Models_to_GridsearchCV = [
(Regressor_1, Model_1_HPs),
(Regressor_2, Model_2_HPs),
(Regressor_3, Model_3_HPs),
(Regressor_4, Model_4_HPs),
(Regressor_5, Model_5_HPs)
]
best_model_scores = []
for pipeline, hyperparameters in SKLearn_ML_Models_to_GridsearchCV:
Grid_Search_Object = GridSearchCV(
estimator = pipeline, # Set estimator to pipeline
param_grid = hyperparameters,
scoring='neg_mean_squared_error',
cv = 5
)
# Evaluating the best Pipeline and included Ensemble Model
best_pipeline = Grid_Search_Object.fit(X_train, y_train).best_estimator_
best_model_scores.append(Grid_Search_Object.best_score_)
# Add the name of the best model to the list
best_model_scores.append(pipeline.__class__.__name__)
# Best ML Model and corresponding Hyperparameters
print('The most suitable SKLearn ML model is:', pipeline.__class__.__name__)
print('Best hyperparameters:', Grid_Search_Object.best_params_)
# Find the index of the best model based on scores
Best_ML_Model_Index = best_model_scores.index(max(best_model_scores))
Best_ML_Model = SKLearn_ML_Models_to_GridsearchCV[Best_ML_Model_Index][0]
print('The best SKLearn ML model overall is:', Best_ML_Model)
# Formatting the Hyperparameters versus Accuracies into Pandas Dataframe before plotting
Model_Parameters_List = Grid_Search_Object.cv_results_['params']
Model_Accuracies_List = Grid_Search_Object.cv_results_['mean_test_score']
Formatted_Hyperparameters_vs_Model_Scores = pd.concat(
[pd.DataFrame(Model_Parameters_List),
pd.DataFrame(Model_Accuracies_List, columns=['Model Accuracy'])],
axis=1
)
CV_table = Formatted_Hyperparameters_vs_Model_Scores.pivot(index='Hyperparameters List', columns='Model Accuracy')
print(CV_table)
# Plotting Models versus Accuracies
plt.figure(figsize=(16, 9))
for i, row in CV_table.iterrows():
plt.plot(row.index, row.values, marker='o', label=f'Hyperparameters: {i}')
plt.xlabel('Models')
plt.ylabel('Model Accuracy')
plt.title('Models vs Accuracies')
plt.xticks(rotation = 45)
plt.legend()
plt.tight_layout()
plt.show()
# Finding Best Ensemble Model Parameters
Highest_Model_Score = Grid_Search_Object.best_score_
Best_Model_Parameters = Grid_Search_Object.best_params_
# Plotting Best Model Hyperparameters
grouped_data = Formatted_Hyperparameters_vs_Model_Scores.groupby(
Model_Parameters_List[0].keys()
).mean()
plt.plot(grouped_data.index, grouped_data['Model Accuracy'], marker='o')
plt.xlabel('Best Model Hyperparameters')
plt.ylabel('Model Accuracy')
plt.title('Best Model Hyperparameters vs Accuracy')
plt.xticks(rotation = 45)
plt.tight_layout()
plt.show()
print("Highest Model Score", Highest_Model_Score, "Best Model Parameters", Best_Model_Parameters)
# Using Final Model to Predict Baseline durations in Training data
y_pred = Best_ML_Model.predict(X_test)
# Transforming y_pred list versus y_test list into a Table
Tabulated_Prediction_vs_Actual = pd.DataFrame({'Actual Values': y_test, 'Predicted Values': y_pred})
# Final Model Metrics
Final_Model_ConfusionMatrix = Best_ML_Model.confusion_matrix(
y_test, y_pred
)
Final_Model_AccuracyScore = Best_ML_Model.accuracy_score(
y_test, y_pred
)
Final_Model_PrecisionScore = Best_ML_Model.precision_score(
y_test, y_pred
)
Final_Model_RecallScore = Best_ML_Model.recall_score(
y_test, y_pred
)
Final_Model_F1Score = Best_ML_Model.f1_score(
y_test, y_pred
)
# Tabulated Model Metrics
Tabulated_Model_Metrics = pd.DataFrame(
{
'Metric': ['Confusion Matrix', 'Accuracy Score', 'Precision Score', 'Recall Score', 'F1 Score'],
'Value': [Final_Model_ConfusionMatrix, Final_Model_AccuracyScore, Final_Model_PrecisionScore, Final_Model_RecallScore, Final_Model_F1Score]
}
)
# Printing out Tabulated y_pred vs y_test and Model Metrics
print(Tabulated_Model_Metrics)
print(Tabulated_Prediction_vs_Actual)
# Save Trained Model
Final_Trained_Model = Grid_Search_Object
import joblib
joblib.dump(Final_Trained_Model, 'MSc_Final_Trained_Model.pkl')
# Load the trained and optimized model from the file
# loaded_optimized_model = joblib.load('MSc_Final_Trained_Model.pkl')