I'm trying to build a decision tree model in R, and when I run the rpart() function, RStudio freezes. Below I have provided a link to the dataset I use, together with the code that processes it up to the point where the decision tree model is built. Any help is appreciated.
https://github.com/ArcanePersona/files/blob/main/vgsales.csv
#Libraries used:
library(tidyverse)
library(Hmisc)
library(mctest)
library(rpart)
library(rpart.plot)
library(RColorBrewer)
library(rattle)
library(missForest)
library(VIM)
library(caret)
library(fmsb)
#Phase One: Data Preprocessing:
# Load the raw "vgsales.csv" data, keeping text columns as plain character
# vectors, and convert the result to a tibble for easier handling.
game_sales <- read.csv("vgsales.csv", header = TRUE, stringsAsFactors = FALSE)
game_sales <- as_tibble(game_sales)

# Diagnostic only: show the rows whose release year is the literal text "N/A".
game_sales %>% filter(Year_of_Release == "N/A")

# Turn the "N/A" placeholders into real NA values and store the year as an
# integer ("chr" -> "int").
game_sales <- game_sales %>%
  mutate(
    Year_of_Release = as.integer(na_if(as.character(Year_of_Release), "N/A"))
  )
str(game_sales$Year_of_Release)

# Impute missing release years with the (rounded) mean of the observed years.
# NOTE: Hmisc::impute(x, 'mean') is not used here because a character second
# argument other than "random" is treated as the fill value itself, which
# would insert the text "mean" and turn the whole column into character.
mean_year <- as.integer(round(mean(game_sales$Year_of_Release, na.rm = TRUE)))
game_sales <- game_sales %>%
  mutate(Year_of_Release = coalesce(Year_of_Release, mean_year))

# Keep only games released from 1991 through 2010 (both bounds inclusive).
game_sales <- game_sales %>%
  filter(Year_of_Release >= 1991, Year_of_Release <= 2010)
# Subset used for machine learning only.
# Step 1: keep the rows that carry critic information, i.e. drop a row only
# when both Critic_Score and Critic_Count are NA (the original note says about
# 50% of the data is missing and cannot be imputed).
ml_subset_x <- game_sales %>%
  filter(!(is.na(Critic_Score) & is.na(Critic_Count)))
# Step 2: keep only the rows whose Rating is one of the listed rating codes;
# rows with any other (or missing) Rating value are dropped.
ml_subset_y <- ml_subset_x %>%
  filter(Rating %in% c("E", "M", "T", "E10+", "AO", "K-A", "RP"))
#Phase Four: Machine Learning:
#Decision Tree:
# Encode the categorical columns as factors. Also make sure User_Score and
# Year_of_Release are numeric: if they reach rpart() as text they are treated
# as categorical predictors with one level per distinct value.
# (User_Score presumably contains "tbd" placeholders -- verify on the data.)
ml_subset_y <- ml_subset_y %>%
  mutate(
    across(all_of(c("Publisher", "Platform", "Genre", "Rating")), as.factor),
    across(any_of("User_Score"), ~ as.numeric(na_if(as.character(.x), "tbd"))),
    across(any_of("Year_of_Release"), ~ as.numeric(as.character(.x)))
  )

#Splitting data into train (70%) and test (30%):
set.seed(1234)
index <- sample(nrow(ml_subset_y), floor(0.7 * nrow(ml_subset_y)))
ml_subset_ytrain <- ml_subset_y[index, ]
ml_subset_ytest <- ml_subset_y[-index, ]

#Modelling the train data using decision tree algorithm:
# Why rpart() appeared to freeze: for a response with more than two classes,
# rpart evaluates all 2^(k - 1) - 1 ways of splitting a categorical predictor
# with k levels. Identifier-like columns with hundreds or thousands of
# distinct values (game name, publisher, developer) make that search
# practically endless and add no generalisable signal, so they are left out
# of the predictors. any_of() ignores names that are not present in the data.
high_cardinality_cols <- c("Name", "Publisher", "Developer")
tree_train <- ml_subset_ytrain %>%
  dplyr::select(-any_of(high_cardinality_cols))
treemodel <- rpart(Rating ~ ., data = tree_train, method = "class")

# Base-graphics view of the tree, then the rattle version.
plot(treemodel, margin = 0.25)
text(treemodel, use.n = TRUE)
fancyRpartPlot(treemodel)
#Testing the model using the test data and using confusion matrix
#to check Accuracy:
# Predict the rating class for every row of the held-out test set.
prediction <- predict(treemodel, newdata = ml_subset_ytest, type = "class")
# Cross-tabulate predictions (rows) against the true ratings (columns).
accuracy_test <- table(Predicted = prediction, Actual = ml_subset_ytest$Rating)
# R is case-sensitive: the caret function is confusionMatrix(), not
# confusionmatrix(). It reports accuracy and per-class statistics.
caret::confusionMatrix(accuracy_test)