
I'm currently working on a user-item collaborative filtering model.

I have a set of users and places they have shopped at, and have attempted to build a recommender model using R.

There are two aims of this project: a) recommend new shops to ALL customers, and b) give a statistic to show how accurate the model is.

I have 2 years worth of data.

To answer b), I have subset my data to customers that have purchased in both the first 1.5 years AND in the following 6 months. I have created a model on the transactions from the first 1.5 years, then compared the model's predictions to the ACTUAL 6 months of data.
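
For context, the comparison I mean is along these lines (a minimal sketch with made-up data frames; `predicted` and `actual` are hypothetical stand-ins for my recommendation output and the real 6-month purchases):

library(dplyr)

# Hypothetical stand-ins: recommendations produced from the 1.5-year model vs.
# brands actually purchased in the following 6 months.
predicted <- data.frame(Customer = c("A", "A", "B"), Brand = c("X", "Y", "Z"))
actual    <- data.frame(Customer = c("A", "B"),      Brand = c("Y", "Q"))

# Customers for whom at least one recommended brand was really purchased
n_hit <- predicted %>%
  semi_join(actual, by = c("Customer", "Brand")) %>%
  distinct(Customer) %>%
  nrow()

hit_rate <- n_hit / n_distinct(actual$Customer)
hit_rate  # 0.5 here: one of the two held-out customers got a correct recommendation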

By performing the above, I determined that UBCF with nn=500 was the best option, and I was able to achieve an accuracy of approximately 80%.

However, I am now unsure of how to predict for the ENTIRE user base. I was thinking of applying the model I have just created to the ENTIRE dataset, but this would be biased/inaccurate, as not all shops are represented in the small model I have created.


I have read articles and tutorials where people have done different things. I have seen one where they input the entire dataset and apply the [which] subsetting, so that the model is created on 80% of the data and tested on the remaining 20%.
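
From what I understand, that pattern looks roughly like this in recommenderlab (a sketch only, assuming a binary rating matrix like the fv_matrix_binary built further down; the parameter values are placeholders):

library(recommenderlab)

# 80/20 split: the model is learnt on 80% of users; for the remaining 20%,
# `given` items per user are revealed and the rest are held back for testing.
scheme_split <- evaluationScheme(fv_matrix_binary, method = "split",
                                 train = 0.8, given = 1)

rec <- Recommender(getData(scheme_split, "train"), method = "UBCF",
                   parameter = list(method = "Jaccard", nn = 500))

pred <- predict(rec, getData(scheme_split, "known"), type = "topNList", n = 5)

# Precision/recall etc. against the held-back items of the 20% test users
calcPredictionAccuracy(pred, getData(scheme_split, "unknown"), given = 1)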

My question is: if I were to use this process, how would I then get recommendations for ALL users, when the model only gives predictions for 20% of the users?

Is it best to create the model on the entire dataset?

SUBSET THE DATA

Create period flags

#Flag1 = 1 if the transaction falls in the first 1.5 years; Flag2 = 1 if it falls in the following 6 months.
FV$Flag1<-ifelse(FV$Date<="2018-10-01",1,0)
FV$Flag2<-ifelse(FV$Date>"2018-10-01",1,0)

IDENTIFICATION OF CUSTOMERS TO USE IN TRAINING MODEL

#Load dplyr for the data manipulation below
library(dplyr)

#Create SCV
#FV
FV_SCV<-select(FV, Customer, Flag1, Flag2) %>%
  group_by(Customer) %>%
  summarise_all(sum) #Sum all variables (funs() is deprecated, so pass the function directly)

#Determine which customers to use based on if they have purchased both in the first and second years
FV_SCV$Use<-ifelse(FV_SCV$Flag1>0 & FV_SCV$Flag2>0, 1,0)

EXTRACT CUSTOMER LIST FOR TRAINING MODEL

#Training: customers who have purchased in both periods; the model is only run on the first period.
FV_Train<-FV_SCV %>%
  filter(Use==1)

SUBSET TO CUSTOMERS THAT HAVE PURCHASED IN BOTH PERIODS AND, OF THOSE, KEEP ONLY THEIR FIRST-PERIOD TRANSACTIONS

#FV_SCV$flag_sum<- FV_SCV$Flag1+FV_SCV$Flag2


SCV FOR CUSTOMERS USED IN THE TRAINING MODEL

#Join the Use flag onto the transactions
FV_Train_Transactions<- FV %>%
  left_join(select(FV_Train, Customer, Use), by=c("Customer"="Customer"))

#Replace NA with 0
FV_Train_Transactions[is.na(FV_Train_Transactions)] <- 0
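
A narrower alternative I have considered (assuming only the joined Use column can be NA after the left join, so other fields keep their genuine missing values):

library(tidyr)

# Replace NA only in the Use flag rather than across the whole data frame
FV_Train_Transactions <- FV_Train_Transactions %>%
  replace_na(list(Use = 0))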

##Subset to only the users' transactions to be used in training
FV_Train_Transactions<-FV_Train_Transactions %>%
  filter(Use==1)

##Flag the second-period (test) transactions, so the model built on the first period can be compared against them
FV_Train_Transactions_Compare<-FV_Train_Transactions %>%
  filter(Flag2>0)

##Create SCV for TRAIN
FV_TRAIN_SCV<-FV_Train_Transactions %>%
  filter(Flag1>0) %>%
  select(Customer, Brand)

FV_TRAIN_SCV$Flag<-1

#select() keeps one row per transaction rather than per customer/brand pair, so deduplicate explicitly.
FV_TRAIN_SCV<-distinct(FV_TRAIN_SCV)
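
For what it's worth, the same train SCV can be produced in a single pipeline:

FV_TRAIN_SCV <- FV_Train_Transactions %>%
  filter(Flag1 > 0) %>%
  distinct(Customer, Brand) %>%   # one row per customer/brand pair
  mutate(Flag = 1)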

##Create SCV for TEST
FV_TEST_SCV<-FV_Train_Transactions_Compare %>%
  filter(Flag2>0) %>%
  select(Customer, Brand)
FV_TEST_SCV$Flag<-1
#As above, deduplicate the customer/brand pairs.
FV_TEST_SCV<-distinct(FV_TEST_SCV)

Transpose to columns

install.packages("reshape")
install.packages("reshape2")
install.packages("tidytext")
library(reshape)
library(reshape2)
library(tidytext)
#Melt data for transposition
#Train
fv_train_md<-melt(FV_TRAIN_SCV, id=c("Customer", "Brand"))
FV_TRAIN_SCV2<-dcast(fv_train_md, Customer~Brand, value.var="value", fun.aggregate = mean)
#Test
fv_test_md<-melt(FV_TEST_SCV, id=(c("Customer" , "Brand")))
#Do the same for the overall transactions table
#Make FV_SCV a binary rating matrix
fv_overall<- FV[,c(1,3)] #The table name is case sensitive. Select only the customer and brand columns
fv_overall<- distinct(fv_overall) #Remove dups
fv_overall$Flag<-1

fv_overall_md<-melt(fv_overall, id=(c("Customer", "Brand")))
fv_overall_2<- dcast(fv_overall_md, Customer~Brand, value.var="value", fun.aggregate = mean)


#fv_test_123<-dcast(FV_TEST_SCV, Customer~Brand, value.var="Brand")

#colnames(fv_test_123)
#fv_test_12345<-which(fv_test_123==1, arr.ind=TRUE)
#fv_test_123<-colnames(fv_test_123)[fv_test_12345]
#fv_test_123
#fv_test_123_df<-as.data.frame((fv_test_123))

FV_TRAIN_SCV2<-dcast(fv_train_md, Customer~Brand, value.var="value", fun.aggregate = mean)
FV_TEST_SCV2<-dcast(fv_test_md, Customer~Brand, value.var="value", fun.aggregate = mean)

#Replace NaN with 0. Defining an is.nan() method for data frames lets the replacements below dispatch correctly.
is.nan.data.frame <- function(x)
  do.call(cbind, lapply(x, is.nan))

FV_TRAIN_SCV2[is.nan(FV_TRAIN_SCV2)] <- 0
FV_TEST_SCV2[is.nan(FV_TEST_SCV2)] <- 0
fv_overall_2[is.nan(fv_overall_2)] <- 0
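
As an aside, I believe the melt/dcast step and the NaN clean-up above can be skipped by cross-tabulating directly (a sketch using base xtabs; the names overall_tab, fv_overall_wide and fv_overall_brm are mine):

# One row per customer, one column per brand, 1 if the pair occurs at least once
overall_tab     <- xtabs(~ Customer + Brand, data = fv_overall)
fv_overall_wide <- (as.matrix(overall_tab) > 0) * 1
fv_overall_brm  <- as(fv_overall_wide, "binaryRatingMatrix")  # same end point as fv_matrix_binary below
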
# #
install.packages("recommenderlab")
library(recommenderlab)

row.names(FV_TRAIN_SCV2)<-FV_TRAIN_SCV2$Customer
#Add zero columns for brands with no transactions in the training period, so every brand has a column
FV_TRAIN_SCV2$Hawkers<-0
FV_TRAIN_SCV2$Pollini<-0
FV_TRAIN_SCV2$"Twin Set"<-0
FV_TRAIN_SCV2_matrix<-as.matrix(FV_TRAIN_SCV2[,2:ncol(FV_TRAIN_SCV2)]) #Drop the Customer column
FV_TRAIN_SCV2_binarymatrix<-as(FV_TRAIN_SCV2_matrix,"binaryRatingMatrix")

similarity_FV_train_trans_items<-similarity(FV_TRAIN_SCV2_binarymatrix, method="jaccard", which="items")

train_col<- data.frame(colnames(FV_TRAIN_SCV2))
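
To sanity-check the item similarities, they can be pulled into an ordinary matrix (a small sketch; sim_items is my name, and the 1:5 spot-check assumes at least five brands):

sim_items <- as.matrix(similarity_FV_train_trans_items)  # similarity() returns a dist-like object
sim_items[1:5, 1:5]  # spot-check a few brand pairs
image(sim_items)     # quick heatmap of the Jaccard similarities
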
#------------------------------------------------------------------------------------
row.names(fv_overall_2)<-fv_overall_2$Customer

#Convert NaN to 0
fv_overall_2[is.nan(fv_overall_2)]<-0
#fv_overall_matrix<- as.matrix(fv_overall_2[,2:(ncol(fv_overall_2)-3)])#Convert to matrix
fv_overall_matrix<- as.matrix(fv_overall_2[,2:ncol(fv_overall_2)])#Convert to matrix
#fv_overall_matrix<- as.matrix(fv_overall_matrix[,1:(ncol(fv_overall_2)-3)])
fv_matrix_binary<- as(fv_overall_matrix, "binaryRatingMatrix")  #Make a binary ratings matrix

FV_overall_similarity<-similarity(fv_matrix_binary, method="jaccard", which="items")
overall_col<- data.frame(colnames(fv_overall_2))
#---------------------------------------------------------------------------------------------------------


# #
#Now, define multiple recommender algorithms to compare them all.

algorithms <- list(`user-based CF 50` = list(name = "UBCF",param = list(method = "Jaccard", nn = 50)),
                   `user-based CF 100` = list(name = "UBCF",param = list(method = "Jaccard", nn = 100)),
                   `user-based CF 200` = list(name = "UBCF",param = list(method = "Jaccard", nn = 200)),
                   `user-based CF 500` = list(name = "UBCF",param = list(method = "Jaccard", nn = 500)),
                   #
                   `item-based CF 3` = list(name = "IBCF",param = list(method = "Jaccard", k = 3)),
                   `item-based CF 5` = list(name = "IBCF",param = list(method = "Jaccard", k = 5)),
                   `item-based CF 10` = list(name = "IBCF",param = list(method = "Jaccard", k = 10)),
                   `item-based CF 50` = list(name = "IBCF",param = list(method = "Jaccard", k = 50))
                   )
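
It may also be worth adding non-personalised baselines to the comparison, so the CF variants can be checked against them (an optional sketch; recommenderlab ships RANDOM and POPULAR methods for binary data):

# Non-personalised baselines for reference
algorithms <- c(algorithms,
                list(`random items`  = list(name = "RANDOM",  param = NULL),
                     `popular items` = list(name = "POPULAR", param = NULL)))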

#Note: the second line of each pair below overwrites the first; use either the training matrix or the overall matrix per run.
scheme <- evaluationScheme(FV_TRAIN_SCV2_binarymatrix, method = "cross", k = 4, given = 1)
scheme <- evaluationScheme(fv_matrix_binary, method = "cross", k = 4, given = 1)
results <- evaluate(scheme, algorithms, n = c(1,2,3,4,5,6,7,8))
results <- evaluate(scheme, algorithms, n = c(1,2,3,4,5,6,7,8,50,100,200,500)) #n = the number of recommendations to evaluate at
#names(results) #Check that all results have run. 
#results

#Plot results to help determine which of the above models is best for further analysis
#plot(results, annotate = c(1, 3), legend = "right") #ROC Curve
#plot(results, "prec/rec", annotate = 3) #Precision/Recall Plot


The first of these plots (with FPR on the x-axis) is the ROC curve. The better-performing model is the curve with the largest area under it, so of the tested parameters the best is UBCF with nn=500 (or possibly nn=50).

Based on the precision/recall plot, nn should be set to 500.

MODEL USING UBCF nn = 500

recc_model <- Recommender(data = FV_TRAIN_SCV2_binarymatrix, method = "UBCF", 
                          parameter = list(method = "Jaccard",
                                           nn=500))
model_details <- getModel(recc_model)
model_details




#Running on ENTIRE DATA
recc_model <- Recommender(data = fv_matrix_binary, method = "UBCF", 
                          parameter = list(method = "Jaccard",
                                           nn=500))
model_details <- getModel(recc_model)
model_details

install.packages("plyr")
library(plyr)
#Get the scores. Note that, as the script stands, recc_model holds whichever Recommender() call was run last (the overall-data one above).
recc_predicted <- predict(object = recc_model, newdata = FV_TRAIN_SCV2_binarymatrix, n = 198, type="ratings")
ibcf_scores<-as(recc_predicted, "list")
ibcf_list_scores<-ldply(ibcf_scores, rbind)
ibcf_list_scores[is.na(ibcf_list_scores)]<-0

#Get the list of brands
recc_predicted_list2<-predict(object=recc_model,newdata= FV_TRAIN_SCV2_binarymatrix,type="topNList", n=198)
recc_predicted_list2_a<- as(recc_predicted_list2, "list")
recc_predicted_list2_b<-ldply(recc_predicted_list2_a, rbind)
#recc_predicted_list2_b[is.na(recc_predicted_list2_b)]<-0

#------------------------------------------------------------
#On the overall model:

#Get the scores
recc_predicted <- predict(object = recc_model, newdata = fv_matrix_binary, n = 80, type="ratings")
ibcf_scores<-as(recc_predicted, "list")
ibcf_list_scores<-ldply(ibcf_scores, rbind)
ibcf_list_scores[is.na(ibcf_list_scores)]<-0

#Get the list of brands
recc_predicted_list2<-predict(object=recc_model,newdata= fv_matrix_binary,type="topNList", n=80)
recc_predicted_list2_a<- as(recc_predicted_list2, "list")
recc_predicted_list2_b<-ldply(recc_predicted_list2_a, rbind)
#recc_predicted_list2_b[is.na(recc_predicted_list2_b)]<-0

Reshape the df so all ratings are in one column. Use this to then create a unique table and do a count on it, as doing the equivalent COUNTIF always crashes Excel.

install.packages("data.table")
library(data.table)
df.m1<- melt(ibcf_list_scores, id.vars=c(".id"),
             value.name="Rating")

df.m1.unique<- data.frame(df.m1)
df.m1.unique$variable<-NULL
df.m1.unique$.id<-NULL

#df.m1.unique<-distinct(df.m1.unique)
#df.m1.unique<- df.m1.unique[order(df.m1.unique$Rating),] #This comma means it is only ordering based on this one var.

#Using ave
df.m1.unique$count<- ave(df.m1.unique$Rating, df.m1.unique[,c("Rating")], FUN=length)
rownames(df.m1.unique) <- c() #Remove rownames
df.m1.unique<-distinct(df.m1.unique)
df.m1.unique<- df.m1.unique[order(-df.m1.unique$Rating),] #Sort by descending rating

#Plot this
df.m1.unique.plot<- data.frame(df.m1.unique[2:(nrow(df.m1.unique)-1),])
#plot(x=df.m1.unique.plot$Rating, y=df.m1.unique.plot$count)

#Get the cumulative distribution
df.m1.unique.plot2<- df.m1.unique.plot %>%
  mutate(Percentage=cumsum(100*(count/sum(count))),
         cumsum=cumsum(count))
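
The counting above can also be done without ave(); a short dplyr sketch under the same column names (rating_counts is my name):

library(dplyr)

# One row per distinct rating with its frequency, plus the cumulative share
rating_counts <- df.m1 %>%
  count(Rating) %>%          # frequency column is called n
  arrange(desc(Rating)) %>%
  mutate(Percentage = cumsum(100 * n / sum(n)),
         cumsum     = cumsum(n))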


Remove low ratings

#a) Remove values that are less than specific rating
#Using logical indexing with replacement
ibcf_list_scores_removal<- ibcf_list_scores

#Replace low values with 0
ibcf_list_scores_removal[,2:ncol(ibcf_list_scores_removal)][ibcf_list_scores_removal[,2:ncol(ibcf_list_scores_removal)] < 0.0217] <- 0

#To flag whether customer is recommended the brand, replace all values >0 with 1. Keep 0 as is.
ibcf_list_scores_removal_b<- ibcf_list_scores_removal #Call a new df
ibcf_list_scores_removal_b[,2:ncol(ibcf_list_scores_removal_b)][ibcf_list_scores_removal_b[,2:ncol(ibcf_list_scores_removal_b)] > 0] <- 1#Create the flag
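
Rather than the hard-coded 0.0217, the cut-off could be derived from the score distribution itself (a sketch; the 90th percentile and the names score_cols/threshold are my own choices):

# Keep only roughly the top 10% of scores; everything below the quantile is zeroed out
score_cols <- 2:ncol(ibcf_list_scores_removal)
threshold  <- quantile(as.matrix(ibcf_list_scores[, score_cols]), probs = 0.90, na.rm = TRUE)
ibcf_list_scores_removal[, score_cols][ibcf_list_scores_removal[, score_cols] < threshold] <- 0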

So basically, I'd like to know: how do I create the model on my ENTIRE dataset, and how do I then extract all of the ratings?

Thank you

Larissa K