You could try a regex
solution. Note, however, that this won't sort the names as you may have wanted.
# Join the first two columns of `data` row by row into one comma-separated
# string. `[[` extracts each column as a plain vector.
v1 <- paste(data[[1L]], data[[2L]], sep = ", ")
# Remove every name that appears again later in the same string, keeping the
# last occurrence of each name. The lookahead `(?=.*\\b\\1\\b.*)` requires the
# captured word to occur again further right; the trailing `,\\s*` removes the
# separator together with the blank after it so no doubled space is left.
# `gsub` (rather than `sub`) handles rows with more than one repeated name.
# The result is de-duplicated but NOT sorted.
data$z <- gsub('(\\b\\S+\\b)(?=.*\\b\\1\\b.*),\\s*', "", v1, perl = TRUE)
The regex can be viewed at regex101
Other options include
library(splitstackshape)
library(data.table)
# Build a de-duplicated column `z` from columns `x` and `y` and show it next
# to the first two columns of `data`.
#   1. `data[,1:2]` takes the first two columns for the final cbind().
#   2. `setDT(data)[, indx:=1:.N]` adds a row counter `indx`.
#   3. `cSplit(..., c('x', 'y'), sep=",", 'long')` splits both columns on ","
#      (the unnamed 'long' is presumably the `direction` argument — confirm
#      against the splitstackshape docs).
#   4. Grouped by `indx`, all remaining columns (.SD) are flattened, NAs are
#      dropped, duplicates removed, and the rest collapsed with toString().
#   5. `[,indx:=NULL]` drops the helper column from the aggregated result.
# NOTE(review): setDT() and `:=` work by reference in data.table, so `data`
# itself is presumably left as a data.table with an extra `indx` column after
# this call — confirm before reusing `data`.
cbind(data[,1:2],cSplit(setDT(data)[, indx:=1:.N],
c('x', 'y'), sep=",", 'long')[ ,
list(z=toString(unique(na.omit(unlist(.SD))))),
by=indx][,indx:=NULL])
#   x                             y
#1: kal, Kon, Jor, Kara           Mon, Cir, John, Jor
#2: Bruce, Helena, Martha, Terry  Damian, Terry, Jason
#3: connor, oliver, Roy           Mia, Roy
#4: Alan, Guy, Simon, Kyle        John, Cary
# z
#1: kal, Kon, Jor, Kara, Mon, Cir, John
#2: Bruce, Helena, Martha, Terry, Damian, Jason
#3: connor, oliver, Roy, Mia
#4: Alan, Guy, Simon, Kyle, John, Cary
Or using the stringi package
library(stringi)
# For each row: paste x and y together, pull out every run of word characters,
# then keep the distinct words in sorted order as one ", "-separated string.
data$z <- vapply(
  X = stri_extract_all_regex(paste(data$x, data$y), '\\w+'),
  FUN = function(tokens) toString(sort(unique(tokens))),
  FUN.VALUE = character(1)
)
Benchmarks
Based on a not-so-big dataset,
# Inflate the example data for benchmarking by repeating the whole set of rows
# 3e4 times. seq_len() is used instead of 1:nrow(data) so that a zero-row
# input yields an empty index rather than c(1, 0).
data <- data[rep(seq_len(nrow(data)), times = 3e4), ]
# Reset the row names that the repeated-row subsetting produced.
row.names(data) <- NULL
cath <- function(){
  # Row-wise base-R approach: reads the free variable `data`, takes no
  # arguments, and returns one string per row holding the sorted, distinct
  # names found in the first two fields of that row.
  merge_row <- function(fields) {
    # Glue both fields together so a single split yields every name.
    joined <- paste(fields[1], fields[2], sep = ", ")
    names_in_row <- strsplit(joined, ", ")[[1]]
    paste(sort(unique(names_in_row)), collapse = ", ")
  }
  apply(data, 1, merge_row)
}
akrun2 <- function(){
  # stringi approach: reads the free variable `data` (columns `x` and `y`),
  # takes no arguments, and returns one string per row.
  pasted <- paste(data$x, data$y)
  # One character vector of word tokens per row.
  word_lists <- stri_extract_all_regex(pasted, '\\w+')
  collapse_distinct <- function(words) {
    toString(sort(unique(words)))
  }
  vapply(word_lists, collapse_distinct, character(1))
}
akrun3 <- function(){
  # Regex approach: reads the free variable `data`, takes no arguments, and
  # returns one string per row in which each name from the first two columns
  # appears once (the last occurrence is kept). The result is NOT sorted.
  #
  # `[[` extracts each column as a plain vector.
  joined <- paste(data[[1L]], data[[2L]], sep = ", ")
  # The lookahead `(?=.*\\b\\1\\b.*)` requires the captured word to occur again
  # further right; `,\\s*` removes the separator and the blank after it so no
  # doubled space remains. gsub() (rather than sub()) removes every repeated
  # name, not just the first one.
  gsub('(\\b\\S+\\b)(?=.*\\b\\1\\b.*),\\s*', "", joined, perl = TRUE)
}
# Time the three approaches, 10 runs each, reporting timings relative to the
# fastest expression.
microbenchmark(
  cath(),
  akrun2(),
  akrun3(),
  unit = 'relative',
  times = 10L
)
#Unit: relative
# expr min lq mean median uq max neval cld
# cath() 11.700071 11.979908 11.700118 11.76762 11.57583 11.40806 10 c
#akrun2() 7.175622 7.225212 7.217322 7.19431 7.09539 7.31929 10 b
#akrun3() 1.000000 1.000000 1.000000 1.00000 1.00000 1.00000 10 a