
I'm building an API using scikit-learn's RandomForestRegressor, DecisionTreeRegressor, SVR, LinearRegression, and KNeighborsRegressor. Of all the models, LinearRegression achieved the best predictions, as shown below:

LinearRegression Error Metrics:

  • MAE: 0.0162
  • MSE: 0.0003
  • RMSE: 0.0182
  • R²: 0.9999
  • Explained Variance Score: 0.9999

Mean Scores - Cross validation:

  • mean MAE: 0.0162
  • mean MSE: 0.00031
  • mean RMSE: 0.0178
  • mean R2: 0.9999
  • mean explained variance: 0.9999

I find these values very strange: the predictions look almost perfect, even in the cross-validation scenario. Where could I be going wrong? Can I trust these results?
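One thing I'm not sure about is whether fitting the scaler and the PCA on the full dataset before cross-validating can inflate these scores. For reference, here is a minimal sketch of the leakage-safe cross-validation I intend to compare against (`X_raw` and `y` are placeholders for my untransformed feature matrix and one speaker's TOTAL score):

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate

# The scaler and PCA are refit on each training fold, so no information
# from the held-out fold leaks into the preprocessing
pipe = make_pipeline(MinMaxScaler(),
                     PCA(n_components=0.99),  # keep 99% of the variance
                     LinearRegression())
cv_results = cross_validate(
    pipe, X_raw, y, cv=10,
    scoring=("neg_mean_absolute_error", "neg_root_mean_squared_error", "r2"),
)
print("mean MAE :", -cv_results["test_neg_mean_absolute_error"].mean())
print("mean RMSE:", -cv_results["test_neg_root_mean_squared_error"].mean())
print("mean R2  :", cv_results["test_r2"].mean())

If those fold-wise means drop well below the numbers above, the full-dataset preprocessing was leaking information across folds. My full code follows: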

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import joblib
import re

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    mean_absolute_error, mean_squared_error, r2_score,
    explained_variance_score, max_error, median_absolute_error
)

from sklearn.model_selection import cross_validate, cross_val_predict

# Load the data
df = pd.read_excel('dataset.xlsx')
df_SPEAKER_0 = pd.read_excel('dataset.xlsx', sheet_name=2, header=0)
df_SPEAKER_1 = pd.read_excel('dataset.xlsx', sheet_name=1, header=0)

df = df.set_index("Filename")

# Error data
df_errors = pd.DataFrame()

df.head(5)

# Model number to be used
MODEL = 1

def create_model(model_type):
    RANDST=42

    if model_type == 1:
        from sklearn.linear_model import LinearRegression
        return LinearRegression()

    else:
        raise ValueError("Invalid model type!")

# Normalize the data
index = df.index
scaler = MinMaxScaler()
df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
df = df.set_index(index)
df

# Unchanged (but normalized) copy; .copy() keeps the score columns below
# from also being written into df before the correlation step
df_original = df.copy()
df_original["SCORE_SPEAKER_0"] = df_SPEAKER_0["TOTAL"].values
df_original["SCORE_SPEAKER_1"] = df_SPEAKER_1["TOTAL"].values

# Calculate the correlation matrix
correlations = df.corr(method='pearson')

# Flatten the correlation matrix into a dataframe with one row per column pair: 'SPEAKER_0', 'SPEAKER_1', and 'Correlation'
df_correlations = correlations.unstack().reset_index()
df_correlations.columns = ['SPEAKER_0', 'SPEAKER_1', 'Correlation']

# Exclude duplicate correlations
df_correlations = df_correlations[df_correlations['SPEAKER_0'] < df_correlations['SPEAKER_1']]

# Sort the dataframe in descending order of correlation
df_correlations = df_correlations.sort_values(by='Correlation', ascending=False)

excluded = []
# Exclude columns from the dataframe with correlation greater than or equal to 0.98
for i in range(len(df_correlations)):
    if df_correlations.iloc[i]['Correlation'] >= 0.98:
        colname = df_correlations.iloc[i]['SPEAKER_0']
        if colname in df.columns:
            excluded.append(colname)
            df = df.drop(colname, axis=1)

df["SCORE_SPEAKER_0"] = df_SPEAKER_0["TOTAL"].values
df["SCORE_SPEAKER_1"] = df_SPEAKER_1["TOTAL"].values

df = df.set_index(index)

GRAPH_TYPE = 3
def plot_values(y_test, y_pred, plot_type):
    if plot_type == 1:  # Bar Chart
        indices = np.arange(len(y_test))
        width = 0.35
        plt.bar(indices, y_test, width, label='Real Values')
        plt.bar(indices + width, y_pred, width, label='Predictions')
        plt.xlabel('Sample')
        plt.ylabel('Value')
        plt.title('Bar Chart: Real Values vs. Predictions')
        plt.xticks(indices + width / 2, indices)
        plt.legend()
        plt.show()
    elif plot_type == 2:  # Line Chart
        plt.plot(y_test, label='Real Values')
        plt.plot(y_pred, label='Predictions')
        plt.xlabel('Sample')
        plt.ylabel('Value')
        plt.title('Line Chart: Real Values vs. Predictions')
        plt.legend()
        plt.show()

    else:
        print("Invalid plot type.")
        
def errors(y_test, y_pred):
    # Calculate metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    evs = explained_variance_score(y_test, y_pred)
    me = max_error(y_test, y_pred)
    medae = median_absolute_error(y_test, y_pred)

    # Print metrics
    print("Mean Absolute Error (MAE):", mae)
    print("Mean Squared Error (MSE):", mse)
    print("Root Mean Squared Error (RMSE):", rmse)
    print("Coefficient of Determination (R²):", r2)
    print("Explained Variance Score:", evs)
    print("Max Error:", me)
    print("Median Absolute Error:", medae)

    metrics = {'MAE': mae, 'MSE': mse, 'RMSE': rmse, 'R²': r2, 'Explained Variance Score': evs, 'Max Error': me, 'Median Absolute Error': medae}
    return pd.DataFrame.from_dict(metrics, orient='index', columns=['Value'])

# Create the PCA dataframe
X = df
pca = PCA()
pca.fit(X)

var_exp = pca.explained_variance_ratio_
cum_var_exp = np.cumsum(var_exp)
n_components = np.argmax(cum_var_exp >= 0.99) + 1

# Create a new PCA object with the determined number of components
#pca = PCA(n_components=4)

# Create a new PCA object with a number of components according to variance
pca = PCA(n_components=n_components)

# Fit PCA to the data again with the determined number of components
X_pca = pca.fit_transform(X)
X_pca = pd.DataFrame(X_pca, columns=["PC" + str(i) for i in range(1, pca.n_components_+1)])
X_pca.index = df.index

X_pca["SCORE_SPEAKER_0"] = df_SPEAKER_0["TOTAL"].values
X_pca["SCORE_SPEAKER_1"] = df_SPEAKER_1["TOTAL"].values

# Prepare data for regression

df_reg_SPEAKER_0 = df.drop(["SCORE_SPEAKER_1"], axis=1)
cols_to_drop = df_reg_SPEAKER_0.filter(regex='1$').columns
df_reg_SPEAKER_0 = df_reg_SPEAKER_0.drop(columns=cols_to_drop)

df_reg_SPEAKER_0_2 = df.drop(["SCORE_SPEAKER_0"], axis=1)
cols_to_drop = df_reg_SPEAKER_0_2.filter(regex='1$').columns
df_reg_SPEAKER_0_2 = df_reg_SPEAKER_0_2.drop(columns=cols_to_drop)

df_reg_SPEAKER_1 = df.drop(["SCORE_SPEAKER_0"], axis=1)
cols_to_drop = df_reg_SPEAKER_1.filter(regex='0$').columns
df_reg_SPEAKER_1 = df_reg_SPEAKER_1.drop(columns=cols_to_drop)

df_reg_SPEAKER_1_2 = df.drop(["SCORE_SPEAKER_1"], axis=1)
cols_to_drop = df_reg_SPEAKER_1_2.filter(regex='0$').columns
df_reg_SPEAKER_1_2 = df_reg_SPEAKER_1_2.drop(columns=cols_to_drop)

!pip install catboost -q

from sklearn.linear_model import LinearRegression

# List of model names and corresponding model constructors
model_info = [
    ("LinearRegression", LinearRegression)    
]

# Function to create and execute scenarios; note that X and y come from the
# global X_pca (the df argument is currently unused)
def execute_scenario2(df, model, scenario, df_errors):
    print("-------------------")
    if scenario == 1:
        name = "NORMALIZED + PEARSON + PCA vs SCORE_SPEAKER_0"
        print("Scenario 1: BASE REGRESSION " + name)
        X = X_pca.drop(["SCORE_SPEAKER_0", "SCORE_SPEAKER_1"], axis=1)
        y = X_pca["SCORE_SPEAKER_0"]
    elif scenario == 2:
        name = "NORMALIZED + PEARSON + PCA vs SCORE_SPEAKER_1"
        print("Scenario 2: BASE REGRESSION " + name)
        X = X_pca.drop(["SCORE_SPEAKER_0", "SCORE_SPEAKER_1"], axis=1)
        y = X_pca["SCORE_SPEAKER_1"]
    else:
        print("Invalid scenario.")
        return

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    # Create a DataFrame to display the results
    results = pd.DataFrame({'Real Value': y_test, 'Prediction': y_pred})

    # Print the DataFrame
    print(results)

    # Plot real values and predictions
    plt.plot(y_test, label='Real Values')
    plt.plot(y_pred, label='Predictions')
    plt.legend()
    # Rotate x-axis labels
    plt.xticks(rotation=90)
    plt.show()

    # Calculate error metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    explained_variance = explained_variance_score(y_test, y_pred)

    # Cross-validated predictions: cross_val_predict pools the out-of-fold
    # predictions, so each metric below is a single aggregate value over all
    # folds, not a per-fold list
    y_pred = cross_val_predict(model, X, y, cv=10)

    mae_cv = mean_absolute_error(y, y_pred)
    mse_cv = mean_squared_error(y, y_pred)
    rmse_cv = np.sqrt(mse_cv)
    r2_cv = r2_score(y, y_pred)
    explained_variance_cv = explained_variance_score(y, y_pred)
    median_absolute_error_cv = median_absolute_error(y, y_pred)

    mean_scores = {
        'mean MAE': mae_cv,
        'mean MSE': mse_cv,
        'mean RMSE': rmse_cv,
        'mean R2': r2_cv,
        'mean explained variance': explained_variance_cv,
        'mean median absolute error': median_absolute_error_cv,
    }

    return {
        'MAE': mae,
        'MSE': mse,
        'RMSE': rmse,
        'R²': r2,
        'Explained Variance Score': explained_variance
    }, mean_scores, name

# Create the results dataframe
scenarios = range(1, 2)  # runs scenario 1 only
error_columns = ["Scenario", "Model", "MAE", "MSE", "RMSE", "R²", "Explained Variance Score"]
df_errors = pd.DataFrame(columns=error_columns)

# Iterate through scenarios
for model_type, (model_name, model_constructor) in enumerate(model_info, start=1):
    for scenario in scenarios:
        MODEL_num = model_constructor()
        error_metrics, mean_scores, name = execute_scenario2(df_original, MODEL_num, scenario, df_errors)

        print(model_name)

        print("Error Metrics:")
        for metric, value in error_metrics.items():
            print(f"{metric}: {value}")

        print("\nMean Scores - Cross validation:")
        for metric, value in mean_scores.items():
            print(f"{metric}: {value}")

        # Concatenate error metrics
        df_errors = pd.concat([df_errors, pd.DataFrame([{"Scenario": name, "Model": model_name, **error_metrics, **mean_scores}])], ignore_index=True)

# Show the DataFrame
df_errors
Comment from some3128 (Aug 27 '23 at 11:39): A scatter plot of the training data supplied to the model can help with understanding whether the results/predictions are reasonable or not. If the scatter plot of target vs. features given to the model is a straight line with only a bit of scatter, then you'd expect `LinearRegression` to capture that trend very well and yield a high score.
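Edit: following the comment above, here is a minimal sketch of the suggested diagnostic, plotting the target against each principal component actually fed to the model (reusing the X_pca built in the code above):

import numpy as np
import matplotlib.pyplot as plt

# Plot the target against each feature given to LinearRegression;
# near-straight lines here would explain the very high scores
features = X_pca.drop(["SCORE_SPEAKER_0", "SCORE_SPEAKER_1"], axis=1)
target = X_pca["SCORE_SPEAKER_0"]

fig, axes = plt.subplots(1, len(features.columns),
                         figsize=(4 * len(features.columns), 4))
for ax, col in zip(np.atleast_1d(axes), features.columns):
    ax.scatter(features[col], target, s=10)
    ax.set_xlabel(col)
    ax.set_ylabel("SCORE_SPEAKER_0")
plt.tight_layout()
plt.show()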
