I'm building an API using scikit-learn regressors (RandomForestRegressor, DecisionTreeRegressor, SVR, LinearRegression, KNeighborsRegressor). Of all the models, LinearRegression achieved the best predictions, as shown below:
LinearRegression Error Metrics:
- MAE: 0.0162
- MSE: 0.0003
- RMSE: 0.0182
- R²: 0.9999
- Explained Variance Score: 0.9999
Mean Scores - Cross validation:
- mean MAE: 0.0162
- mean MSE: 0.00031
- mean RMSE: 0.0178
- mean R2: 0.9999
- mean explained variance: 0.9999
But I find these values very strange: the predictions look almost perfect, even under cross-validation. Where could I be going wrong? Can I trust these results? The full code is below.
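One thing I want to rule out is the targets leaking into the features. A minimal check, using the variable names from the code below (X is the matrix passed to pca.fit), would be:

targets = {"SCORE_SPEAKER_0", "SCORE_SPEAKER_1"}
print("targets present in the PCA input:", targets & set(X.columns))  # should be empty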
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import joblib
import re
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, explained_variance_score, max_error, median_absolute_error
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict
# Load the data
df = pd.read_excel('dataset.xlsx')
df_SPEAKER_0 = pd.read_excel('dataset.xlsx', sheet_name=2, header=0)
df_SPEAKER_1 = pd.read_excel('dataset.xlsx', sheet_name=1, header=0)
df = df.set_index("Filename")
# Error data
df_errors = pd.DataFrame()
df.head(5)
# Model number to be used
MODEL = 1
def create_model(model_type):
    # Only model_type 1 (LinearRegression) is kept in this excerpt
    if model_type == 1:
        from sklearn.linear_model import LinearRegression
        return LinearRegression()
    else:
        raise ValueError("Invalid model type!")
# Normalize the data
index = df.index
scaler = MinMaxScaler()
df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
df = df.set_index(index)
df
# Keep the normalized data before any column filtering
df_original = df
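# NOTE: df_original is the same object as df (no .copy()), so the score
# columns assigned below are also added to df itself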
df_original["SCORE_SPEAKER_0"] = df_SPEAKER_0["TOTAL"].values
df_original["SCORE_SPEAKER_1"] = df_SPEAKER_1["TOTAL"].values
# Calculate the correlation matrix
correlations = df.corr(method='pearson')
# Flatten the correlation matrix into pairs of columns with their correlation
df_correlations = correlations.unstack().reset_index()
df_correlations.columns = ['SPEAKER_0', 'SPEAKER_1', 'Correlation']
# Exclude duplicate correlations
df_correlations = df_correlations[df_correlations['SPEAKER_0'] < df_correlations['SPEAKER_1']]
# Sort the dataframe in descending order of correlation
df_correlations = df_correlations.sort_values(by='Correlation', ascending=False)
excluded = []
# Drop one column from each pair with correlation greater than or equal to 0.98
for i in range(len(df_correlations)):
if df_correlations.iloc[i]['Correlation'] >= 0.98:
colname = df_correlations.iloc[i]['SPEAKER_0']
if colname in df.columns:
excluded.append(colname)
df = df.drop(colname, axis=1)
df["SCORE_SPEAKER_0"] = df_SPEAKER_0["TOTAL"].values
df["SCORE_SPEAKER_1"] = df_SPEAKER_1["TOTAL"].values
df = df.set_index(index)
GRAPH_TYPE = 3  # unused below; plot_values() only implements types 1 and 2
def plot_values(y_test, y_pred, plot_type):
if plot_type == 1: # Bar Chart
indices = np.arange(len(y_test))
width = 0.35
plt.bar(indices, y_test, width, label='Real Values')
plt.bar(indices + width, y_pred, width, label='Predictions')
plt.xlabel('Sample')
plt.ylabel('Value')
plt.title('Bar Chart: Real Values vs. Predictions')
plt.xticks(indices + width / 2, indices)
plt.legend()
plt.show()
elif plot_type == 2: # Line Chart
plt.plot(y_test, label='Real Values')
plt.plot(y_pred, label='Predictions')
plt.xlabel('Sample')
plt.ylabel('Value')
plt.title('Line Chart: Real Values vs. Predictions')
plt.legend()
plt.show()
else:
print("Invalid plot type.")
def errors(y_test, y_pred):
# Calculate metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)  # root mean squared error
r2 = r2_score(y_test, y_pred)
evs = explained_variance_score(y_test, y_pred)
me = max_error(y_test, y_pred)
medae = median_absolute_error(y_test, y_pred)
# Print metrics
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Coefficient of Determination (R²):", r2)
print("Explained Variance Score:", evs)
print("Max Error:", me)
print("Median Absolute Error:", medae)
metrics = {'MAE': mae, 'MSE': mse, 'RMSE': rmse, 'R²': r2, 'Explained Variance Score': evs, 'Max Error': me, 'Median Absolute Error': medae}
return pd.DataFrame.from_dict(metrics, orient='index', columns=['Value'])
# Create the PCA dataframe
X = df
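# NOTE: df still contains SCORE_SPEAKER_0 / SCORE_SPEAKER_1 at this point,
# so the targets are part of the matrix that PCA is fitted on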
pca = PCA()
pca.fit(X)
var_exp = pca.explained_variance_ratio_
cum_var_exp = np.cumsum(var_exp)
n_components = np.argmax(cum_var_exp >= 0.99) + 1
# Re-fit PCA with the number of components needed for 99% explained variance
pca = PCA(n_components=n_components)
# Fit PCA to the data again with the determined number of components
X_pca = pca.fit_transform(X)
X_pca = pd.DataFrame(X_pca, columns=["PC" + str(i) for i in range(1, pca.n_components_+1)])
X_pca.index = df.index
X_pca["SCORE_SPEAKER_0"] = df_SPEAKER_0["TOTAL"].values
X_pca["SCORE_SPEAKER_1"] = df_SPEAKER_1["TOTAL"].values
# Prepare data for regression
df_reg_SPEAKER_0 = df.drop(["SCORE_SPEAKER_1"], axis=1)
cols_to_drop = df_reg_SPEAKER_0.filter(regex='1$').columns
df_reg_SPEAKER_0 = df_reg_SPEAKER_0.drop(columns=cols_to_drop)
df_reg_SPEAKER_0_2 = df.drop(["SCORE_SPEAKER_0"], axis=1)
cols_to_drop = df_reg_SPEAKER_0_2.filter(regex='1$').columns
df_reg_SPEAKER_0_2 = df_reg_SPEAKER_0_2.drop(columns=cols_to_drop)
df_reg_SPEAKER_1 = df.drop(["SCORE_SPEAKER_0"], axis=1)
cols_to_drop = df_reg_SPEAKER_1.filter(regex='0$').columns
df_reg_SPEAKER_1 = df_reg_SPEAKER_1.drop(columns=cols_to_drop)
df_reg_SPEAKER_1_2 = df.drop(["SCORE_SPEAKER_1"], axis=1)
cols_to_drop = df_reg_SPEAKER_1_2.filter(regex='0$').columns
df_reg_SPEAKER_1_2 = df_reg_SPEAKER_1_2.drop(columns=cols_to_drop)
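# NOTE: the df_reg_* frames above are not used in the scenarios below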
from sklearn.linear_model import LinearRegression
# List of model names and corresponding model constructors
model_info = [
("LinearRegression", LinearRegression)
]
# Function to create and execute scenarios
def execute_scenario2(df, model, scenario, df_errors):
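    # NOTE: this function reads the global X_pca; the df and df_errors
    # arguments are not actually used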
print("-------------------")
if scenario == 1:
name = "NORMALIZED + PEARSON + PCA vs SCORE_SPEAKER_0"
print("Scenario 1: BASE REGRESSION " + name)
X = X_pca.drop(["SCORE_SPEAKER_0", "SCORE_SPEAKER_1"], axis=1)
y = X_pca["SCORE_SPEAKER_0"]
elif scenario == 2:
name = "NORMALIZED + PEARSON + PCA vs SCORE_SPEAKER_1"
print("Scenario 2: BASE REGRESSION " + name)
X = X_pca.drop(["SCORE_SPEAKER_0", "SCORE_SPEAKER_1"], axis=1)
y = X_pca["SCORE_SPEAKER_1"]
else:
print("Invalid scenario.")
return
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Create a DataFrame to display the results
results = pd.DataFrame({'Real Value': y_test, 'Prediction': y_pred})
# Print the DataFrame
print(results)
    # Plot real values and predictions against the same (Filename) x-axis
    plt.plot(y_test.index, y_test.values, label='Real Values')
    plt.plot(y_test.index, y_pred, label='Predictions')
plt.legend()
# Rotate x-axis labels
plt.xticks(rotation=90)
plt.show()
# Calculate error metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
explained_variance = explained_variance_score(y_test, y_pred)
    # Cross-validation: cross_val_predict pools the out-of-fold predictions
    # from all 10 folds, so each metric below is computed once on the pooled
    # predictions rather than averaged across folds
    y_pred_cv = cross_val_predict(model, X, y, cv=10)
    mean_mae = mean_absolute_error(y, y_pred_cv)
    mean_mse = mean_squared_error(y, y_pred_cv)
    mean_rmse = np.sqrt(mean_mse)
    mean_r2 = r2_score(y, y_pred_cv)
    mean_explained_variance = explained_variance_score(y, y_pred_cv)
    mean_median_absolute_error = median_absolute_error(y, y_pred_cv)
    mean_scores = {
        'mean MAE': mean_mae,
        'mean MSE': mean_mse,
        'mean RMSE': mean_rmse,
        'mean R2': mean_r2,
        'mean explained variance': mean_explained_variance,
        'mean median absolute error': mean_median_absolute_error,
    }
return {
'MAE': mae,
'MSE': mse,
'RMSE': rmse,
'R²': r2,
'Explained Variance Score': explained_variance
}, mean_scores, name
# Create the dataframe
scenarios = range(1, 2)  # runs scenario 1 only; use range(1, 3) to include scenario 2
error_columns = ["Scenario", "Model", "MAE", "MSE", "RMSE", "R²", "Explained Variance Score"]
df_errors = pd.DataFrame(columns=error_columns)
# Iterate through scenarios
for model_name, model_constructor in model_info:
    for scenario in scenarios:
        model_instance = model_constructor()
        error_metrics, mean_scores, name = execute_scenario2(df_original, model_instance, scenario, df_errors)
print(model_name)
print("Error Metrics:")
for metric, value in error_metrics.items():
print(f"{metric}: {value}")
print("\nMean Scores - Cross validation:")
for metric, value in mean_scores.items():
print(f"{metric}: {value}")
# Concatenate error metrics
df_errors = pd.concat([df_errors, pd.DataFrame([{"Scenario": name, "Model": model_name, **error_metrics, **mean_scores}])], ignore_index=True)
# Show the DataFrame
df_errors
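For comparison, per-fold scores (instead of a single score on the pooled cross_val_predict output) can be obtained with cross_validate. A minimal sketch, assuming the same X and y as in scenario 1:

from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression

# Same inputs as scenario 1
X = X_pca.drop(["SCORE_SPEAKER_0", "SCORE_SPEAKER_1"], axis=1)
y = X_pca["SCORE_SPEAKER_0"]
scoring = ("neg_mean_absolute_error", "neg_root_mean_squared_error", "r2")
cv_results = cross_validate(LinearRegression(), X, y, cv=10, scoring=scoring)
# scikit-learn negates error scorers so that higher is always better
print("mean MAE :", -cv_results["test_neg_mean_absolute_error"].mean())
print("mean RMSE:", -cv_results["test_neg_root_mean_squared_error"].mean())
print("mean R2  :", cv_results["test_r2"].mean())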