I applied linear regression and feature selection to predict a target variable (Y) from a set of predictors (X1, X2, X3, X4, X5, X6, X7, and X8). I first fitted a full model containing all predictors, then used stepwise regression (backward, forward, and both directions) to select the most relevant variables. I compared the candidate models using AIC, BIC, and root mean squared error (RMSE) to choose the best one, and finally used that model to predict Y for a specific set of predictor values so I could compare the prediction against the actual value. However, the value of Y for the 39th semester is missing from my data, so I couldn't evaluate the prediction.
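For context, here is a minimal sketch (assuming the 39th semester is the only row with a missing Y and that readxl reads it as NA) of how that row could be kept aside as the observation to predict, rather than dropped outright:

library(readxl)
data2 <- read_excel("D:/third year/alaoui/tpnote/Classeur2.xlsx")  # full data, including the row with missing Y
to_predict <- data2[is.na(data2$Y), ]   # the observation(s) to predict, e.g. the 39th semester
data_fit <- data2[!is.na(data2$Y), ]    # complete rows, usable for model fitting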
# Dataset: Classeur2.xlsx
setwd("D:/third year/alaoui")
# Load the data
library(readxl)
data2 <- read_excel("D:/third year/alaoui/tpnote/Classeur2.xlsx")
data <- data2[-c(39), ]  # drop row 39, where Y is missing
View(data)
# Descriptive analysis
summary(data)
str(data)
# Correlation analysis
#install.packages("psych")
library(psych)
# Check that all columns are numeric; convert any that are not
num_cols <- sapply(data, is.numeric)
if (any(!num_cols)) {
  data[, !num_cols] <- lapply(data[, !num_cols], as.numeric)
}
#
matrice_correlation <- cor(data[,c("Y","X1","X2","X4","X5...5","X5...6","X6","X7","X8")])
KMO(matrice_correlation)
cortest.bartlett(matrice_correlation, n = nrow(data))
# Principal component analysis (PCA)
library("FactoMineR")
library("factoextra")
library("corrplot")
p <- PCA(data, graph = FALSE)
p
pca <- PCA(data, ncp = 2)
print(pca)
eig.val <- get_eigenvalue(pca)
eig.val
fviz_eig(pca)
fviz_pca_var(pca, col.var = "contrib", gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"))
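# Possible complement (sketch): variable contributions to the first two
# principal components, via factoextra's fviz_contrib()
fviz_contrib(pca, choice = "var", axes = 1, top = 10)  # contributions to dimension 1
fviz_contrib(pca, choice = "var", axes = 2, top = 10)  # contributions to dimension 2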
# Linear regression
model <- lm(Y ~ X1 + X2 + X4 + X5...5 + X5...6 + X6 + X7 + X8, data = data)
summary(model)
# Checking the linear regression assumptions
# 1. Linearity
par(mfrow = c(2, 2))
plot(model)
# 2. Homoscedasticity
library(car)
ncvTest(model)
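# As a cross-check on homoscedasticity (sketch), the Breusch-Pagan test from
# lmtest tests the same constant-variance hypothesis as ncvTest()
library(lmtest)
bptest(model)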
# 3. Normality of residuals
library(lmtest)
library(tseries)
residuals <- resid(model)
qqnorm(residuals)
qqline(residuals)
shapiro.test(residuals)
# 4. Independence of residuals
plot(residuals ~ fitted(model))
durbinWatsonTest(model)
# Variable selection
# Fit the full model
full_model <- lm(Y ~ X1 + X2 + X4 + X5...5 + X5...6 + X6 + X7 + X8, data = data)
# Fit the null model (constant only)
null_model <- lm(Y ~ 1, data = data)
# Perform backward stepwise selection
backward_model <- step(full_model, direction = "backward")
# Perform forward stepwise selection
forward_model <- step(null_model, scope = list(lower = null_model, upper = full_model), direction = "forward")
# Perform stepwise selection in both directions
both_model <- step(null_model, scope = list(lower = null_model, upper = full_model), direction = "both")
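# Note (sketch): step() uses an AIC penalty (k = 2) by default; setting k = log(n)
# gives a BIC-like penalty, which tends to select a more parsimonious model
n <- nrow(data)
backward_model_bic <- step(full_model, direction = "backward", k = log(n))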
# Compare AIC, BIC and RMSE for each model
AIC_full <- AIC(full_model)
AIC_backward <- AIC(backward_model)
AIC_forward <- AIC(forward_model)
AIC_both <- AIC(both_model)
BIC_full <- BIC(full_model)
BIC_backward <- BIC(backward_model)
BIC_forward <- BIC(forward_model)
BIC_both <- BIC(both_model)
RMSE_full <- sqrt(mean((resid(full_model))^2))
RMSE_backward <- sqrt(mean((resid(backward_model))^2))
RMSE_forward <- sqrt(mean((resid(forward_model))^2))
RMSE_both <- sqrt(mean((resid(both_model))^2))
#Print the model selection criteria for each model
cat("Full model:")
cat("\tAIC:", AIC_full, "\tBIC:", BIC_full, "\tRMSE:", RMSE_full, "\n")
cat("Backward model:")
cat("\tAIC:", AIC_backward, "\tBIC:", BIC_backward, "\tRMSE:", RMSE_backward, "\n")
cat("Forward model:")
cat("\tAIC:", AIC_forward, "\tBIC:", BIC_forward, "\tRMSE:", RMSE_forward, "\n")
cat("Both model:")
cat("\tAIC:", AIC_both, "\tBIC:", BIC_both, "\tRMSE:", RMSE_both, "\n")
# Select the model with the lowest AIC (BIC and RMSE are reported above for comparison)
model_names <- c("Full Model", "Backward Model", "Forward Model", "Both Model")
best_model <- model_names[which.min(c(AIC_full, AIC_backward, AIC_forward, AIC_both))]
print(best_model)
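# A compact way to see all three criteria side by side (sketch):
comparison <- data.frame(
  Model = model_names,
  AIC   = c(AIC_full, AIC_backward, AIC_forward, AIC_both),
  BIC   = c(BIC_full, BIC_backward, BIC_forward, BIC_both),
  RMSE  = c(RMSE_full, RMSE_backward, RMSE_forward, RMSE_both)
)
print(comparison)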
# predict the value of Y in the 39th semester
predicted_Y <- predict(backward_model, newdata = data.frame(X1 = 500, X2 = 100, X4 = 83, X5...5 = 30, X5...6= 50, X6 = 90, X7 = 300, X8 = 200))
print(predicted_Y)
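# A 95% prediction interval around the point prediction (sketch), using the same
# newdata values as above
predict(backward_model,
        newdata = data.frame(X1 = 500, X2 = 100, X4 = 83, X5...5 = 30, X5...6 = 50,
                             X6 = 90, X7 = 300, X8 = 200),
        interval = "prediction", level = 0.95)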
# To check the prediction against the actual value
# Calculate mean squared error (note: Y for the 39th semester is missing in data2, so this is NA)
MSE <- mean((predicted_Y - data2$Y[39])^2)
#Calculate root mean squared error
RMSE <- sqrt(MSE)
#Calculate R-squared value
R_squared <- summary(backward_model)$r.squared
#Print the results
print(paste("Predicted value of Y:", predicted_Y))
print(paste("Mean Squared Error:", MSE))
print(paste("Root Mean Squared Error:", RMSE))
print(paste("R-Squared value:", R_squared))
# Compare the predicted value with the actual value (missing for the 39th semester)
print(paste("Actual value of Y:", data2$Y[39]))
print(paste("Difference:", abs(predicted_Y - data2$Y[39])))
# Diagnostic plot: residuals vs fitted values
par(xpd = TRUE)
plot(backward_model, which = 1)
abline(h = 0, lty = 2, col = "blue")  # reference line at zero residual
# Normal Q-Q plot of the residuals
plot(backward_model, which = 2)
#Normality test on residuals
shapiro.test(residuals(backward_model))
#Homoscedasticity test on residuals
ncvTest(backward_model)
# Independence (autocorrelation) test on residuals (Durbin-Watson)
dwtest(backward_model)
This is my data file (Classeur2.xlsx):
X4 X5 X5 X6 X7 X8 Y
56 12 50 77 229 98 5540
59 9 17 89 177 225 5439
57 29 89 51 166 263 4290
58 13 107 40 258 321 5502
59 13 143 52 209 407 4872
60 11 61 21 180 247 4708
60 25 -30 40 213 328 4627
60 21 -45 32 201 298 4110
63 8 -28 12 176 218 4123
62 11 76 68 175 410 4842
65 22 144 52 253 93 5741
65 24 113 77 208 307 5094
64 14 128 96 195 107 5383
66 15 10 48 154 305 4888
67 22 -25 27 181 60 4033
67 23 117 73 220 239 4942
66 13 120 62 235 141 5313
68 8 122 25 258 291 5140
69 27 71 74 196 414 5397
71 18 4 63 279 206 5149
69 8 47 29 207 80 5151
70 10 8 91 213 429 4989
73 27 128 74 296 273 5927
73 16 -50 16 245 309 4704
73 32 100 43 276 280 5366
75 20 -40 41 211 315 4630
73 15 68 93 283 212 5712
74 11 88 83 218 118 5095
74 27 27 75 307 345 6124
77 20 59 88 211 141 4787
79 35 142 74 270 83 5036
77 23 126 21 328 398 5288
78 36 30 26 258 124 4647
78 22 18 95 233 118 5316
81 20 42 93 324 161 6180
80 16 -22 50 267 405 4801
81 35 148 83 257 111 5512
82 27 -18 91 267 170 5272
83 30 50 90 300 200 .