The goal is to detect and fix why the report between my sklearn "summary" implementation is not matching with the results of OLS
statsmodels. The only thing is matching, is the beta coefficients.
import pandas as pd
import numpy as np
from statsmodels.regression.linear_model import OLS
from sklearn import linear_model
from scipy.stats import t
class LinearRegression(linear_model.LinearRegression):
"""
LinearRegression class after sklearn's, but calculate t-statistics
and p-values for model coefficients (betas).
Additional attributes available after .fit()
are `t` and `p` which are of the shape (y.shape[1], X.shape[1])
which is (n_features, n_coefs)
This class sets the intercept to 0 by default, since usually we include it
in X.
"""
def __init__(self, *args, **kwargs):
if not "fit_intercept" in kwargs:
kwargs['fit_intercept'] = False
super(LinearRegression, self)\
.__init__(*args, **kwargs)
def fit(self, X, y, n_jobs=1):
self = super(LinearRegression, self).fit(X, y, n_jobs)
# std errors
uhat = (y-(X@self.coef_).ravel())
k = np.shape(X)[1]
s2 = (uhat.T@uhat)/(y.shape[0])
var = s2*np.linalg.inv(X.T@X)
self.se = np.sqrt(np.diag(var))
# T-Stat
self.t_stats = self.coef_/self.se
# p-values
self.df = y.shape[0] - k # -1 degrees of freedom: N minus number of parameters
self.p_values = 2*(t.sf(abs(self.t_stats),self.df))
# Rsquared
tss = (y-np.mean(y)).T@(y-np.mean(y))
rss = uhat.T@uhat
self.rsq = 1 - rss/tss
self.summary = pd.DataFrame({
"beta":self.coef_.reshape(1,-1).tolist()[0],
"se":self.se.reshape(1,-1).tolist()[0],
"t_stats":self.t_stats.reshape(1,-1).tolist()[0],
"p_values":self.p_values.reshape(1,-1).tolist()[0],
})
return self
Running the function in a toy dataset we can test the results:
import statsmodels.api as sm
data = sm.datasets.longley.load_pandas()
# Estimate statsmodels OLS
model = OLS(endog=data.endog,exog=data.exog).fit()
# Estimate Sklearn with report like statsmodels OLS
model2 = LinearRegression(fit_intercept=False).fit(data.exog,np.array(data.endog))
model2.summary
I am worried about some formula is not matching with the correct one.