I applied linear regression and feature selection to predict a target variable (Y) from a set of predictors (X1, X2, X3, X4, X5, X6, X7, and X8). I first fitted a full model containing all predictors, then used stepwise regression (backward, forward, and both directions) to select the most relevant variables. I compared the candidate models using AIC, BIC, and root mean squared error (RMSE) to choose the best one, and finally used that model to predict Y for a specific set of predictor values so I could compare the prediction against the actual value. However, the value of Y for the 39th semester is missing from my data, so I couldn't evaluate the prediction.
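For context, here is a minimal sketch (assuming the 39th semester is the only row with a missing Y and that readxl reads it as NA) of how that row could be kept aside as the observation to predict, rather than dropped outright:

library(readxl)
data2 <- read_excel("D:/third year/alaoui/tpnote/Classeur2.xlsx")  # full data, including the row with missing Y
to_predict <- data2[is.na(data2$Y), ]   # the observation(s) to predict, e.g. the 39th semester
data_fit <- data2[!is.na(data2$Y), ]    # complete rows, usable for model fitting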
# Dataset: Classeur2.xlsx
setwd("D:/third year/alaoui")
# Load the data
library(readxl)
data2 <- read_excel("D:/third year/alaoui/tpnote/Classeur2.xlsx")
data <- data2[-c(39), ]  # drop row 39, where Y is missing
View(data)
# Descriptive analysis
summary(data)
str(data)
# Correlation analysis
#install.packages("psych")
library(psych)
# Check that all columns are numeric; convert any that are not
num_cols <- sapply(data, is.numeric)
if (any(!num_cols)) {
  data[, !num_cols] <- lapply(data[, !num_cols], as.numeric)
}
#
matrice_correlation <- cor(data[,c("Y","X1","X2","X4","X5...5","X5...6","X6","X7","X8")])
KMO(matrice_correlation)
cortest.bartlett(matrice_correlation, n = nrow(data))
# Principal component analysis (PCA)
library("FactoMineR")
library("factoextra")
library("corrplot")
p <- PCA(data, graph = FALSE)
p
pca <- PCA(data, ncp = 2)
print(pca)
eig.val <- get_eigenvalue(pca)
eig.val
fviz_eig(pca)
fviz_pca_var(pca, col.var = "contrib", gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"))
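# Possible complement (sketch): variable contributions to the first two
# principal components, via factoextra's fviz_contrib()
fviz_contrib(pca, choice = "var", axes = 1, top = 10)  # contributions to dimension 1
fviz_contrib(pca, choice = "var", axes = 2, top = 10)  # contributions to dimension 2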
# Linear regression
model <- lm(Y ~ X1 + X2 + X4 + X5...5 + X5...6 + X6 + X7 + X8, data = data)
summary(model)
# Checking the linear regression assumptions
# 1. Linearity
par(mfrow = c(2, 2))
plot(model)
# 2. Homoscedasticity
library(car)
ncvTest(model)
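# As a cross-check on homoscedasticity (sketch), the Breusch-Pagan test from
# lmtest tests the same constant-variance hypothesis as ncvTest()
library(lmtest)
bptest(model)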
# 3. Normality of residuals
library(lmtest)
library(tseries)
residuals <- resid(model)
qqnorm(residuals)
qqline(residuals)
shapiro.test(residuals)
# 4. Independence of residuals
plot(residuals ~ fitted(model))
durbinWatsonTest(model)
# Variable selection
# Fit the full model
full_model <- lm(Y ~ X1 + X2 + X4 + X5...5 + X5...6 + X6 + X7 + X8, data = data)
# Fit the null model (constant only)
null_model <- lm(Y ~ 1, data = data)
# Perform backward stepwise selection
backward_model <- step(full_model, direction = "backward")
# Perform forward stepwise selection
forward_model <- step(null_model, scope = list(lower = null_model, upper = full_model), direction = "forward")
# Perform stepwise selection in both directions
both_model <- step(null_model, scope = list(lower = null_model, upper = full_model), direction = "both")
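# Note (sketch): step() uses an AIC penalty (k = 2) by default; setting k = log(n)
# gives a BIC-like penalty, which tends to select a more parsimonious model
n <- nrow(data)
backward_model_bic <- step(full_model, direction = "backward", k = log(n))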
# Compare AIC, BIC and RMSE for each model
AIC_full <- AIC(full_model)
AIC_backward <- AIC(backward_model)
AIC_forward <- AIC(forward_model)
AIC_both <- AIC(both_model)
BIC_full <- BIC(full_model)
BIC_backward <- BIC(backward_model)
BIC_forward <- BIC(forward_model)
BIC_both <- BIC(both_model)
RMSE_full <- sqrt(mean((resid(full_model))^2))
RMSE_backward <- sqrt(mean((resid(backward_model))^2))
RMSE_forward <- sqrt(mean((resid(forward_model))^2))
RMSE_both <- sqrt(mean((resid(both_model))^2))
#Print the model selection criteria for each model
cat("Full model:")
cat("\tAIC:", AIC_full, "\tBIC:", BIC_full, "\tRMSE:", RMSE_full, "\n")
cat("Backward model:")
cat("\tAIC:", AIC_backward, "\tBIC:", BIC_backward, "\tRMSE:", RMSE_backward, "\n")
cat("Forward model:")
cat("\tAIC:", AIC_forward, "\tBIC:", BIC_forward, "\tRMSE:", RMSE_forward, "\n")
cat("Both model:")
cat("\tAIC:", AIC_both, "\tBIC:", BIC_both, "\tRMSE:", RMSE_both, "\n")
# Select the model with the lowest AIC (BIC and RMSE are reported above for comparison)
model_names <- c("Full Model", "Backward Model", "Forward Model", "Both Model")
best_model <- model_names[which.min(c(AIC_full, AIC_backward, AIC_forward, AIC_both))]
print(best_model)
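# A compact way to see all three criteria side by side (sketch):
comparison <- data.frame(
  Model = model_names,
  AIC   = c(AIC_full, AIC_backward, AIC_forward, AIC_both),
  BIC   = c(BIC_full, BIC_backward, BIC_forward, BIC_both),
  RMSE  = c(RMSE_full, RMSE_backward, RMSE_forward, RMSE_both)
)
print(comparison)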
# predict the value of Y in the 39th semester
predicted_Y <- predict(backward_model, newdata = data.frame(X1 = 500, X2 = 100, X4 = 83, X5...5 = 30, X5...6= 50, X6 = 90, X7 = 300, X8 = 200))
print(predicted_Y)
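# A 95% prediction interval around the point prediction (sketch), using the same
# newdata values as above
predict(backward_model,
        newdata = data.frame(X1 = 500, X2 = 100, X4 = 83, X5...5 = 30, X5...6 = 50,
                             X6 = 90, X7 = 300, X8 = 200),
        interval = "prediction", level = 0.95)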
# To check the prediction against the actual value
# Calculate mean squared error (note: Y for the 39th semester is missing in data2, so this is NA)
MSE <- mean((predicted_Y - data2$Y[39])^2)
#Calculate root mean squared error
RMSE <- sqrt(MSE)
#Calculate R-squared value
R_squared <- summary(backward_model)$r.squared
#Print the results
print(paste("Predicted value of Y:", predicted_Y))
print(paste("Mean Squared Error:", MSE))
print(paste("Root Mean Squared Error:", RMSE))
print(paste("R-Squared value:", R_squared))
# Compare the predicted value with the actual value (missing for the 39th semester)
print(paste("Actual value of Y:", data2$Y[39]))
print(paste("Difference:", abs(predicted_Y - data2$Y[39])))
# Diagnostic plot: residuals vs fitted values
par(xpd = TRUE)
plot(backward_model, which = 1)
abline(h = 0, lty = 2, col = "blue")  # reference line at zero residual
# Normal Q-Q plot of the residuals
plot(backward_model, which = 2)
#Normality test on residuals
shapiro.test(residuals(backward_model))
#Homoscedasticity test on residuals
ncvTest(backward_model)
# Independence (autocorrelation) test on residuals (Durbin-Watson)
dwtest(backward_model)
This is my data file (Classeur2.xlsx):
X4 X5 X5 X6 X7 X8 Y
56 12 50 77 229 98 5540
59 9 17 89 177 225 5439
57 29 89 51 166 263 4290
58 13 107 40 258 321 5502
59 13 143 52 209 407 4872
60 11 61 21 180 247 4708
60 25 -30 40 213 328 4627
60 21 -45 32 201 298 4110
63 8 -28 12 176 218 4123
62 11 76 68 175 410 4842
65 22 144 52 253 93 5741
65 24 113 77 208 307 5094
64 14 128 96 195 107 5383
66 15 10 48 154 305 4888
67 22 -25 27 181 60 4033
67 23 117 73 220 239 4942
66 13 120 62 235 141 5313
68 8 122 25 258 291 5140
69 27 71 74 196 414 5397
71 18 4 63 279 206 5149
69 8 47 29 207 80 5151
70 10 8 91 213 429 4989
73 27 128 74 296 273 5927
73 16 -50 16 245 309 4704
73 32 100 43 276 280 5366
75 20 -40 41 211 315 4630
73 15 68 93 283 212 5712
74 11 88 83 218 118 5095
74 27 27 75 307 345 6124
77 20 59 88 211 141 4787
79 35 142 74 270 83 5036
77 23 126 21 328 398 5288
78 36 30 26 258 124 4647
78 22 18 95 233 118 5316
81 20 42 93 324 161 6180
80 16 -22 50 267 405 4801
81 35 148 83 257 111 5512
82 27 -18 91 267 170 5272
83 30 50 90 300 200 .