I am very new in R. I have a data-set with 139 columns and more than 46.5k rows. I have measured pairwise cosine similarity matrices between rows in the data-set where one row will be compared with rest of the other rows and will be excluded during next iteration and the process will go on for rest of the data-set. This implementation work fine with small sample data-set e.g. with 500 rows. However, when i try to do this with whole data-set(46k) it gets nasty (i have waited almost 30 hours running the code but no output). Here is my implementation so-far:
library(reshape2)
library(lsa)
psm_sample <- read.csv("psm_final_sample.csv")
numRows = nrow(psm_sample)
##################################
normalize <- function(x) {
return ( (2 * ((x - min(x)) / (max(x) - min(x))) ) - 1 )
}
##################################
cat_normalize <- function(x) {
norm <- ( (2 * ((x - min(x)) / (max(x) - min(x))) ) - 1 )
return (ifelse(norm < 0 , -1, 1))
}
#############################
cat_gender <- function (sex){
sex <- as.character(sex)
if( sex == 'M' ) {
return (as.integer(1))
}
else{
return(as.integer(2))
}
}
##################################
cat_admsn_type <- function (type){
type <- as.character(type)
if( type == 'EMERGENCY' ) {
return(as.integer(1))
}
else if ( type == 'URGENT'){
return(as.integer(2))
}
else{
return(as.integer(3))
}
}
#############################
cat_first_icu <- function (ficu){
type <- as.character(ficu)
if( ficu == 'CCU' ) {
return(as.integer(1))
}
else if ( ficu == 'CSRU'){
return(as.integer(2))
}
else if ( ficu == 'MICU'){
return(as.integer(3))
}
else if ( ficu == 'NICU'){
return(as.integer(4))
}
else if ( ficu == 'SICU'){
return(as.integer(5))
}
else{
return(as.integer(6))
}
}
##################################
cat_last_icu <- function (licu){
type <- as.character(licu)
if( licu == 'CCU' ) {
return(as.integer(1))
}
else if ( licu == 'CSRU'){
return(as.integer(2))
}
else if ( licu == 'MICU'){
return(as.integer(3))
}
else if ( licu == 'NICU'){
return(as.integer(4))
}
else if ( licu == 'SICU'){
return(as.integer(5))
}
else{
return(as.integer(6))
}
}
#################################################################################
gender <- sapply(psm_sample$gender,cat_gender)
admission_type <- sapply(psm_sample$admission_type,cat_admsn_type)
first_icu_service_type <- sapply(psm_sample$first_icu_service_type,cat_first_icu)
last_icu_service_type <- sapply(psm_sample$last_icu_service_type,cat_last_icu)
################################################################################
psm_sample_cont_norm_df <- as.data.frame(lapply(psm_sample[8:138], normalize))
psm_sample_cat_df <- data.frame(gender,admission_type,first_icu_service_type,last_icu_service_type)
psm_sample_cat_norm_df <- as.data.frame(lapply(psm_sample_cat_df, cat_normalize))
psm_temp_df <- cbind.data.frame(psm_sample[1], psm_sample_cat_norm_df, psm_sample_cont_norm_df)
row.names(psm_temp_df ) <- make.names(paste0("patid.", as.character(1:nrow(psm_temp_df ))))
psm_final_df <- psm_temp_df[2:136]
###############################################################################
#mycosine <- function(x,y){
#c <- sum(x*y) / (sqrt(sum(x*x)) * sqrt(sum(y*y)))
#return(c)
#}
#cosinesim <- function(x) {
# initialize similarity matrix
#m <- matrix(NA, nrow=ncol(x),ncol=ncol(x),dimnames=list(colnames(x),colnames(x)))
#cos <- as.data.frame(m)
#for(i in 1:ncol(x)) {
#for(j in i:ncol(x)) {
#co_rate_1 <- x[which(x[,i] & x[,j]),i]
#co_rate_2 <- x[which(x[,i] & x[,j]),j]
#cos[i,j]= mycosine(co_rate_1,co_rate_2)
#cos[j,i]=cos[i,j]
#}
#}
#return(cos)
#}
cs <- lsa::cosine(t(psm_final_df))
cs_round <-round(cs,digits = 2)
#cs_norm <- as.data.frame(lapply(cs,normalize))
#print(cs_norm)
#print(cs_round)
##########################################
numCols = 3;
totalROws = (numRows * (numRows-1)) / 2;
result <- matrix(nrow = totalROws, ncol = numCols)
#result<- big.matrix( nrow = totalROws, ncol = numCols, type = "double",shared = TRUE)
#options(bigmemory.allow.dimnames=TRUE)
colnames(result) <- c("PatA","PatB","Similarity")
index = 1;
for (i in 1:nrow(cs_round)) {
patA = rownames(cs_round)[i]
for (j in i:ncol(cs_round)) {
if (j > i) {
patB = colnames(cs_round)[j]
result[index, 1] = patA
result[index, 2] = patB
result[index, 3] = cs_round[i,j]
index = index + 1;
}
}
}
print(result)
write.csv(result, file = "C:/cosine/output.csv", row.names = F)
#ord_result<-result[order(result[,3],decreasing=TRUE),]
#print(ord_result)
Under this circumstances, I can split the data-set into highest 10 sub-sets. Then, there will be 4650 rows in each data-sets. Hence, for 4650 rows it is still a very big matrix calculation and I have to wait for a very long time for the output.
I have tried with big-memory, ff and matrix packages with this implementation but no fruitful outcome with my knowledge.
Any kind of suggestion or code modification or how to do it efficiently will be very much helpful for me.
Note:My machine has 8GBDDR3 RAM and i3 Processor with 2.10GHz clock speed.I am using 64 bit R studio.
Link for whole data-set(46.5 KRows - psm_final_without_null.csv) >> https://1drv.ms/u/s!AhoddsPPvdj3hVVFC-yl1tDKEfo8
Link for sample data-set(4700 Rows - psm_final_sample.csv) >> https://1drv.ms/u/s!AhoddsPPvdj3hVjrNTgkV0noqMk8