I have been trying to use the following code to run the `crossval()` function from quanteda.classifiers. The code works, but the results look really strange to me in the sense that they differ a lot from what I receive when I implement a cross-validation loop myself (see below). Any idea what the issue could be?
# Import the hand-coded classifications.
# library() stops with an error if the package is missing, whereas require()
# only returns FALSE and lets the script fail later at read_excel().
library(readxl)
Classifications_Total <- read_excel("C:/Users/maxim/OneDrive/Desktop/Communication Methods and Measures/Scripts and Data/Classifications_Input.xlsx")

# Recode missing values in Q1 as non-projections (0).
# The mask has to test Q1 itself: testing is.na(Q2) here would overwrite valid
# Q1 codes on rows where only Q2 is missing, and would leave Q1 as NA on rows
# where Q1 is missing but Q2 is not.
Classifications_Total$Q1[is.na(Classifications_Total$Q1)] <- 0

# Recode missing values in Q2 as non-evaluative statements (0).
Classifications_Total$Q2[is.na(Classifications_Total$Q2)] <- 0

# Recode unclear (3) and ambivalent (4) evaluations in Q2 as non-evaluative
# statements (0), given how few of them occur in the annotated corpus.
Classifications_Total$Q2[Classifications_Total$Q2 %in% c(3, 4)] <- 0
# Create the corpus object from the SNIPPET column; the remaining columns of
# Classifications_Total are carried along as document variables.
library(quanteda)
library(quanteda.textmodels)
library(caret)
corp_projections <- corpus(Classifications_Total, text_field = "SNIPPET")
summary(corp_projections, 5)

# set.seed(300)  # uncomment to make the shuffle below reproducible

# Draw a random permutation of the document ids. The number of documents is
# taken from the corpus instead of being hard-coded, so the ids always match
# the TEXT_ID values assigned below.
# NOTE(review): because every id is drawn, a later `TEXT_ID %in% id_train`
# filter keeps all documents -- this is a shuffle, not a train/test split.
n_docs <- ndoc(corp_projections)
id_train <- sample(seq_len(n_docs), replace = FALSE)
head(id_train, 10)
corp_projections$TEXT_ID <- seq_len(n_docs)
# Pre-processing: drop every opening bracket, then turn the remaining "]]-"
# and "]]+" markers into the placeholder tokens "xxneg" and "xxpos".
corp_2 <- gsub("\\[", "", corp_projections)
corp_3 <- gsub("]][-]", " xxneg", corp_2)
corp_4 <- gsub("]][+]", " xxpos", corp_3)

# Stopwords Included & Trigrams Included (SITI)
# The argument is spelled out as `remove_numbers`; the previous
# `remove_number` only worked through partial argument matching.
toks_SITI <- tokens(corp_4, remove_punct = TRUE, remove_numbers = TRUE)
toks_SITI_ngrams <- tokens_ngrams(toks_SITI, n = 1:3)
dfmt_SITI_ngrams <- dfm(toks_SITI_ngrams)

# Trim by term frequency: first with a minimum count of 5, then with a
# maximum of 0.99 on the proportional scale.
dfmt_SITI_trim <- dfm_trim(dfmt_SITI_ngrams, min_termfreq = 5)
dfmt_SITI_trim2 <- dfm_trim(dfmt_SITI_trim, max_termfreq = .99, termfreq_type = "prop")

# Append the Q1 document variable as one extra feature column.
# Called directly rather than through `%>%`, so no pipe operator has to be
# available on the search path.
additional_features <- as.matrix(dfmt_SITI_trim2$Q1)
dfmt_SITI_added <- cbind(dfmt_SITI_trim2, additional_features)
# Train the classifier.
# Keep the documents whose TEXT_ID appears in id_train.
dfmat_training <- dfm_subset(dfmt_SITI_added, TEXT_ID %in% id_train)
library(e1071)
# Fit an SVM text model on the training dfm, using its Q2 document variable
# as the class label.
tmod_svm <- textmodel_svm(dfmat_training, dfmat_training$Q2)
summary(tmod_svm)
# k-fold cross-validation with quanteda.classifiers: 10 folds, per-class
# reporting requested, verbose output on.
# NOTE(review): crossval() receives only the fitted model object; how it
# re-fits and scores each fold is not visible here -- check its documentation
# when comparing against the manual loop.
quanteda.classifiers::crossval(tmod_svm, k = 10, by_class = TRUE, verbose = TRUE)
Running this code returns the following results:
Now, when I run the following manual cross validation loop the results look very different and they make more sense to me.
# Randomly shuffle the documents.
yourData <- dfmt_SITI_added[sample(nrow(dfmt_SITI_added)), ]

# Create 10 equally sized folds.
n_folds <- 10
folds <- cut(seq_len(nrow(yourData)), breaks = n_folds, labels = FALSE)

# One shared set of class levels for every fold. Without it, a fold in which
# some class is never predicted yields a non-square table, which
# confusionMatrix() cannot evaluate.
class_levels <- sort(unique(yourData$Q2))

# Keep each fold's evaluation so per-fold metrics can be inspected (or
# averaged) after the loop, e.g. fold_results[[3]]$byClass.
fold_results <- vector("list", n_folds)

# Perform 10-fold cross-validation.
for (i in seq_len(n_folds)) {
  # Segment the data by fold: fold i is the test set, the rest is training.
  testIndexes <- which(folds == i)
  testData <- yourData[testIndexes, ]
  trainData <- yourData[-testIndexes, ]

  tmod_svm <- textmodel_svm(trainData, trainData$Q2)
  # Values are not auto-printed inside a for loop, so print explicitly.
  print(summary(tmod_svm))

  dfmat_matched <- dfm_match(testData, features = featnames(trainData))
  actual_class <- dfmat_matched$Q2
  predicted_class <- predict(tmod_svm, newdata = dfmat_matched)

  tab_class <- table(
    predicted_class = factor(predicted_class, levels = class_levels),
    actual_class = factor(actual_class, levels = class_levels)
  )
  print(tab_class)

  fold_results[[i]] <- confusionMatrix(tab_class, mode = "everything")
  print(fold_results[[i]])
}
This returns the following results
I was also wondering whether the `crossval()` function has an option to print the performance results for every fold separately (confusion matrices, F1 scores, ...), or whether it only provides the mean performance across all folds?