
I'm building an API using scikit-learn's RandomForestRegressor, DecisionTreeRegressor, SVR, LinearRegression, and KNeighborsRegressor. Of all the models, LinearRegression achieved the best predictions, as shown below:

LinearRegression Error Metrics:

  • MAE: 0.0162
  • MSE: 0.0003
  • RMSE: 0.0182
  • R²: 0.9999
  • Explained Variance Score: 0.9999

Mean Scores - Cross validation:

  • mean MAE: 0.0162
  • mean MSE: 0.00031
  • mean RMSE: 0.0178
  • mean R2: 0.9999
  • mean explained variance: 0.9999

I find these values very strange: the predictions look almost perfect, even in the cross-validation scenario. Where could I be going wrong? Can I trust these results?
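One thing I'm not sure about is whether fitting the scaler and the PCA on the full dataset before cross-validating can inflate these scores. For reference, here is a minimal sketch of the leakage-safe cross-validation I intend to compare against (`X_raw` and `y` are placeholders for my untransformed feature matrix and one speaker's TOTAL score):

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate

# The scaler and PCA are refit on each training fold, so no information
# from the held-out fold leaks into the preprocessing
pipe = make_pipeline(MinMaxScaler(),
                     PCA(n_components=0.99),  # keep 99% of the variance
                     LinearRegression())
cv_results = cross_validate(
    pipe, X_raw, y, cv=10,
    scoring=("neg_mean_absolute_error", "neg_root_mean_squared_error", "r2"),
)
print("mean MAE :", -cv_results["test_neg_mean_absolute_error"].mean())
print("mean RMSE:", -cv_results["test_neg_root_mean_squared_error"].mean())
print("mean R2  :", cv_results["test_r2"].mean())

If those fold-wise means drop well below the numbers above, the full-dataset preprocessing was leaking information across folds. My full code follows: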

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import joblib
import re

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    mean_absolute_error, mean_squared_error, r2_score,
    explained_variance_score, max_error, median_absolute_error
)

from sklearn.model_selection import cross_validate, cross_val_predict

# Load the data
df = pd.read_excel('dataset.xlsx')
df_SPEAKER_0 = pd.read_excel('dataset.xlsx', sheet_name=2, header=0)
df_SPEAKER_1 = pd.read_excel('dataset.xlsx', sheet_name=1, header=0)

df = df.set_index("Filename")

# Error data
df_errors = pd.DataFrame()

df.head(5)

# Model number to be used
MODEL = 1

def create_model(model_type):
    RANDST=42

    if model_type == 1:
        from sklearn.linear_model import LinearRegression
        return LinearRegression()

    else:
        raise ValueError("Invalid model type!")

# Normalize the data
index = df.index
scaler = MinMaxScaler()
df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
df = df.set_index(index)
df

# Unchanged (but normalized) copy; .copy() keeps the score columns below
# from also being written into df before the correlation step
df_original = df.copy()
df_original["SCORE_SPEAKER_0"] = df_SPEAKER_0["TOTAL"].values
df_original["SCORE_SPEAKER_1"] = df_SPEAKER_1["TOTAL"].values

# Calculate the correlation matrix
correlations = df.corr(method='pearson')

# Flatten the correlation matrix into a dataframe with one row per column pair: 'SPEAKER_0', 'SPEAKER_1', and 'Correlation'
df_correlations = correlations.unstack().reset_index()
df_correlations.columns = ['SPEAKER_0', 'SPEAKER_1', 'Correlation']

# Exclude duplicate correlations
df_correlations = df_correlations[df_correlations['SPEAKER_0'] < df_correlations['SPEAKER_1']]

# Sort the dataframe in descending order of correlation
df_correlations = df_correlations.sort_values(by='Correlation', ascending=False)

excluded = []
# Exclude columns from the dataframe with correlation greater than or equal to 0.98
for i in range(len(df_correlations)):
    if df_correlations.iloc[i]['Correlation'] >= 0.98:
        colname = df_correlations.iloc[i]['SPEAKER_0']
        if colname in df.columns:
            excluded.append(colname)
            df = df.drop(colname, axis=1)

df["SCORE_SPEAKER_0"] = df_SPEAKER_0["TOTAL"].values
df["SCORE_SPEAKER_1"] = df_SPEAKER_1["TOTAL"].values

df = df.set_index(index)

GRAPH_TYPE = 3
def plot_values(y_test, y_pred, plot_type):
    if plot_type == 1:  # Bar Chart
        indices = np.arange(len(y_test))
        width = 0.35
        plt.bar(indices, y_test, width, label='Real Values')
        plt.bar(indices + width, y_pred, width, label='Predictions')
        plt.xlabel('Sample')
        plt.ylabel('Value')
        plt.title('Bar Chart: Real Values vs. Predictions')
        plt.xticks(indices + width / 2, indices)
        plt.legend()
        plt.show()
    elif plot_type == 2:  # Line Chart
        plt.plot(y_test, label='Real Values')
        plt.plot(y_pred, label='Predictions')
        plt.xlabel('Sample')
        plt.ylabel('Value')
        plt.title('Line Chart: Real Values vs. Predictions')
        plt.legend()
        plt.show()

    else:
        print("Invalid plot type.")
        
def errors(y_test, y_pred):
    # Calculate metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    evs = explained_variance_score(y_test, y_pred)
    me = max_error(y_test, y_pred)
    medae = median_absolute_error(y_test, y_pred)

    # Print metrics
    print("Mean Absolute Error (MAE):", mae)
    print("Mean Squared Error (MSE):", mse)
    print("Root Mean Squared Error (RMSE):", rmse)
    print("Coefficient of Determination (R²):", r2)
    print("Explained Variance Score:", evs)
    print("Max Error:", me)
    print("Median Absolute Error:", medae)

    metrics = {'MAE': mae, 'MSE': mse, 'RMSE': rmse, 'R²': r2, 'Explained Variance Score': evs, 'Max Error': me, 'Median Absolute Error': medae}
    return pd.DataFrame.from_dict(metrics, orient='index', columns=['Value'])

# Create the PCA dataframe
X = df
pca = PCA()
pca.fit(X)

var_exp = pca.explained_variance_ratio_
cum_var_exp = np.cumsum(var_exp)
n_components = np.argmax(cum_var_exp >= 0.99) + 1

# Create a new PCA object with the determined number of components
#pca = PCA(n_components=4)

# Create a new PCA object with a number of components according to variance
pca = PCA(n_components=n_components)

# Fit PCA to the data again with the determined number of components
X_pca = pca.fit_transform(X)
X_pca = pd.DataFrame(X_pca, columns=["PC" + str(i) for i in range(1, pca.n_components_+1)])
X_pca.index = df.index

X_pca["SCORE_SPEAKER_0"] = df_SPEAKER_0["TOTAL"].values
X_pca["SCORE_SPEAKER_1"] = df_SPEAKER_1["TOTAL"].values

# Prepare data for regression

df_reg_SPEAKER_0 = df.drop(["SCORE_SPEAKER_1"], axis=1)
cols_to_drop = df_reg_SPEAKER_0.filter(regex='1$').columns
df_reg_SPEAKER_0 = df_reg_SPEAKER_0.drop(columns=cols_to_drop)

df_reg_SPEAKER_0_2 = df.drop(["SCORE_SPEAKER_0"], axis=1)
cols_to_drop = df_reg_SPEAKER_0_2.filter(regex='1$').columns
df_reg_SPEAKER_0_2 = df_reg_SPEAKER_0_2.drop(columns=cols_to_drop)

df_reg_SPEAKER_1 = df.drop(["SCORE_SPEAKER_0"], axis=1)
cols_to_drop = df_reg_SPEAKER_1.filter(regex='0$').columns
df_reg_SPEAKER_1 = df_reg_SPEAKER_1.drop(columns=cols_to_drop)

df_reg_SPEAKER_1_2 = df.drop(["SCORE_SPEAKER_1"], axis=1)
cols_to_drop = df_reg_SPEAKER_1_2.filter(regex='0$').columns
df_reg_SPEAKER_1_2 = df_reg_SPEAKER_1_2.drop(columns=cols_to_drop)

!pip install catboost -q

from sklearn.linear_model import LinearRegression

# List of model names and corresponding model constructors
model_info = [
    ("LinearRegression", LinearRegression)    
]

# Function to create and execute scenarios; note that X and y come from the
# global X_pca (the df argument is currently unused)
def execute_scenario2(df, model, scenario, df_errors):
    print("-------------------")
    if scenario == 1:
        name = "NORMALIZED + PEARSON + PCA vs SCORE_SPEAKER_0"
        print("Scenario 1: BASE REGRESSION " + name)
        X = X_pca.drop(["SCORE_SPEAKER_0", "SCORE_SPEAKER_1"], axis=1)
        y = X_pca["SCORE_SPEAKER_0"]
    elif scenario == 2:
        name = "NORMALIZED + PEARSON + PCA vs SCORE_SPEAKER_1"
        print("Scenario 2: BASE REGRESSION " + name)
        X = X_pca.drop(["SCORE_SPEAKER_0", "SCORE_SPEAKER_1"], axis=1)
        y = X_pca["SCORE_SPEAKER_1"]
    else:
        print("Invalid scenario.")
        return

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    # Create a DataFrame to display the results
    results = pd.DataFrame({'Real Value': y_test, 'Prediction': y_pred})

    # Print the DataFrame
    print(results)

    # Plot real values and predictions
    plt.plot(y_test, label='Real Values')
    plt.plot(y_pred, label='Predictions')
    plt.legend()
    # Rotate x-axis labels
    plt.xticks(rotation=90)
    plt.show()

    # Calculate error metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    explained_variance = explained_variance_score(y_test, y_pred)

    # Cross-validated predictions: cross_val_predict pools the out-of-fold
    # predictions, so each metric below is a single aggregate value over all
    # folds, not a per-fold list
    y_pred = cross_val_predict(model, X, y, cv=10)

    mae_cv = mean_absolute_error(y, y_pred)
    mse_cv = mean_squared_error(y, y_pred)
    rmse_cv = np.sqrt(mse_cv)
    r2_cv = r2_score(y, y_pred)
    explained_variance_cv = explained_variance_score(y, y_pred)
    median_absolute_error_cv = median_absolute_error(y, y_pred)

    mean_scores = {
        'mean MAE': mae_cv,
        'mean MSE': mse_cv,
        'mean RMSE': rmse_cv,
        'mean R2': r2_cv,
        'mean explained variance': explained_variance_cv,
        'mean median absolute error': median_absolute_error_cv,
    }

    return {
        'MAE': mae,
        'MSE': mse,
        'RMSE': rmse,
        'R²': r2,
        'Explained Variance Score': explained_variance
    }, mean_scores, name

# Create the results dataframe
scenarios = range(1, 2)  # runs scenario 1 only
error_columns = ["Scenario", "Model", "MAE", "MSE", "RMSE", "R²", "Explained Variance Score"]
df_errors = pd.DataFrame(columns=error_columns)

# Iterate through scenarios
for model_type, (model_name, model_constructor) in enumerate(model_info, start=1):
    for scenario in scenarios:
        MODEL_num = model_constructor()
        error_metrics, mean_scores, name = execute_scenario2(df_original, MODEL_num, scenario, df_errors)

        print(model_name)

        print("Error Metrics:")
        for metric, value in error_metrics.items():
            print(f"{metric}: {value}")

        print("\nMean Scores - Cross validation:")
        for metric, value in mean_scores.items():
            print(f"{metric}: {value}")

        # Concatenate error metrics
        df_errors = pd.concat([df_errors, pd.DataFrame([{"Scenario": name, "Model": model_name, **error_metrics, **mean_scores}])], ignore_index=True)

# Show the DataFrame
df_errors
Comment from some3128 (Aug 27 '23 at 11:39): A scatter plot of the training data supplied to the model can help with understanding whether the results/predictions are reasonable or not. If the scatter plot of target vs. features given to the model is a straight line with only a bit of scatter, then you'd expect `LinearRegression` to capture that trend very well and yield a high score.
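Edit: following the comment above, here is a minimal sketch of the suggested diagnostic, plotting the target against each principal component actually fed to the model (reusing the X_pca built in the code above):

import numpy as np
import matplotlib.pyplot as plt

# Plot the target against each feature given to LinearRegression;
# near-straight lines here would explain the very high scores
features = X_pca.drop(["SCORE_SPEAKER_0", "SCORE_SPEAKER_1"], axis=1)
target = X_pca["SCORE_SPEAKER_0"]

fig, axes = plt.subplots(1, len(features.columns),
                         figsize=(4 * len(features.columns), 4))
for ax, col in zip(np.atleast_1d(axes), features.columns):
    ax.scatter(features[col], target, s=10)
    ax.set_xlabel(col)
    ax.set_ylabel("SCORE_SPEAKER_0")
plt.tight_layout()
plt.show()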
