A base option using grepl
and merge
(Variant1).
i <- lapply(strsplit(df2$Items, ", "), function(s) {
Reduce(`&`, lapply(s, grepl, df1$Items, fixed=TRUE)) })
s <- do.call(rbind, Map(function(i, j) cbind(ID = df1$ID[i], Coupons = j),
i, df2$Coupons))
merge(df1, s, all.x = TRUE)
# ID Items Coupons
#1 1 item-6, item-5, item-3 <NA>
#2 2 item-6, item-4, item-9, item-7 coupon-1
#3 2 item-6, item-4, item-9, item-7 coupon-2
#4 3 item-6, item-4, item-5 coupon-1
#5 4 item-1, item-7, item-2, item-9 coupon-2
#6 5 item-5, item-8, item-7, item-2 <NA>
#7 6 item-10, item-1, item-6, item-4 coupon-1
#8 7 item-6, item-7, item-9, item-4, item-5 coupon-1
#9 7 item-6, item-7, item-9, item-4, item-5 coupon-2
#10 8 item-6, item-9, item-1, item-3, item-5 <NA>
#11 9 item-6, item-8, item-7, item-3, item-9 coupon-2
#12 10 item-4, item-7, item-5, item-9 coupon-2
Instead of using merge
subsetting the matches (Variant 2).
i <- lapply(strsplit(df2$Items, ", "), function(s) {
Reduce(`&`, lapply(s, grepl, df1$Items, fixed=TRUE)) })
m <- rbind(cbind(df1[!Reduce(`|`, i),], Coupons = NA), do.call(rbind,
Map(function(i, j) cbind(df1[i,], Coupons = j), i, df2$Coupons)) )
m[order(m$ID),]
# ID Items Coupons
#1 1 item-6, item-5, item-3 <NA>
#4 2 item-6, item-4, item-9, item-7 coupon-1
#8 2 item-6, item-4, item-9, item-7 coupon-2
#5 3 item-6, item-4, item-5 coupon-1
#9 4 item-1, item-7, item-2, item-9 coupon-2
#2 5 item-5, item-8, item-7, item-2 <NA>
#6 6 item-10, item-1, item-6, item-4 coupon-1
#7 7 item-6, item-7, item-9, item-4, item-5 coupon-1
#10 7 item-6, item-7, item-9, item-4, item-5 coupon-2
#3 8 item-6, item-9, item-1, item-3, item-5 <NA>
#11 9 item-6, item-8, item-7, item-3, item-9 coupon-2
#12 10 item-4, item-7, item-5, item-9 coupon-2
Another variant (3)
i <- lapply(strsplit(df2$Items, ", "), function(s) {
Reduce(`&`, lapply(s, grepl, df1$Items, fixed=TRUE)) })
i <- c(i, list(!Reduce(`|`, i)))
cbind(df1[unlist(lapply(i, which)),], Coupons = rep(c(df2$Coupons, NA),
sapply(i, sum)))
Another variant, testing for the string only in cases where the first string had a hit (Variant4).
i <- lapply(strsplit(df2$Items, ", ", TRUE), function(s) {
Reduce(function(a, b) a[grep(b, df1$Items[a], fixed=TRUE)],
s[-1], grep(s[[1]], df1$Items, fixed=TRUE)) })
j <- unique(unlist(i))
i <- if(length(j>0)) c(list(seq_len(nrow(df1))[-j]), i) else c(list(seq_len(nrow(df1))), i)
cbind(df1[unlist(i),], Coupons = rep(c(NA, df2$Coupons), lengths(i)))
Benchmark
bench::mark(check=FALSE,
varaint1 = {i <- lapply(strsplit(df2$Items, ", "), function(s) {
Reduce(`&`, lapply(s, grepl, df1$Items, fixed=TRUE)) })
s <- do.call(rbind, Map(function(i, j) cbind(ID = df1$ID[i], Coupons = j),
i, df2$Coupons))
merge(df1, s, all.x = TRUE)},
variant2 = {i <- lapply(strsplit(df2$Items, ", "), function(s) {
Reduce(`&`, lapply(s, grepl, df1$Items, fixed=TRUE)) })
rbind(cbind(df1[!Reduce(`|`, i),], Coupons = NA), do.call(rbind,
Map(function(i, j) cbind(df1[i,], Coupons = j), i, df2$Coupons)) )},
variant3 = {i <- lapply(strsplit(df2$Items, ", "), function(s) {
Reduce(`&`, lapply(s, grepl, df1$Items, fixed=TRUE)) })
i <- c(i, list(!Reduce(`|`, i)))
cbind(df1[unlist(lapply(i, which)),], Coupons = rep(c(df2$Coupons, NA), sapply(i, sum))) },
variant4 = {i <- lapply(strsplit(df2$Items, ", ", TRUE), function(s) {
Reduce(function(a, b) a[grep(b, df1$Items[a], fixed=TRUE)], s[-1], grep(s[[1]], df1$Items, fixed=TRUE)) })
j <- unique(unlist(i))
i <- if(length(j>0)) c(list(seq_len(nrow(df1))[-j]), i) else c(list(seq_len(nrow(df1))), i)
cbind(df1[unlist(i),], Coupons = rep(c(NA, df2$Coupons), lengths(i))) }
)
Result
expression min median itr/s…¹ mem_a…² gc/se…³ n_itr n_gc total…⁴ result
<bch:expr> <bch:tm> <bch:t> <dbl> <bch:b> <dbl> <int> <dbl> <bch:t> <list>
1 varaint1 435µs 465µs 2131. 17.55KB 21.0 1016 10 477ms <NULL>
2 variant2 703µs 758µs 1322. 16.09KB 21.2 625 10 473ms <NULL>
3 variant3 223µs 241µs 4015. 9.87KB 23.3 1895 11 472ms <NULL>
4 variant4 208µs 224µs 4323. 24.57KB 20.9 2066 10 478ms <NULL>
In this case variant4 is the faster and variant3 uses lowest amount of memory.
Comparing with other methods.
set.seed(99)
df1 <- tibble::tibble(
ID = 1:10,
Items = replicate(10, paste0('item-', sample(1:10, sample(3:5)[1]), collapse = ', '))
)
df2 <- tibble::tibble(
Items = c("item-4, item-6", "item-7, item-9"),
Coupons = c("coupon-1", "coupon-2")
)
library(dplyr)
library(fuzzyjoin)
library(stringr)
library(data.table)
bench::mark(check=FALSE,
Darren1 = {fuzzy_left_join(df1, rename(df2, key = Items), by = c("Items" = "key"),
match_fun = Vectorize(\(x, y) all(strsplit(y, ', ')[[1]] %in% strsplit(x, ', ')[[1]]))) %>%
select(-key)},
Darren2 = {df2_pattern <- df2 %>%
mutate(key = sapply(str_split(Items, ', '), \(x) str_c("(?=.*", x, ")", collapse = "")), .keep = "unused")
fuzzy_left_join(df1, df2_pattern, by = c("Items" = "key"),
match_fun = str_detect) %>%
select(-key) },
arg0naut91A = {df1 %>%
left_join(
full_join(df1, df2 %>% rename(CouponItems = Items), by = character()) %>%
rowwise %>%
filter(all(unlist(strsplit(CouponItems, ', ')) %in% unlist(strsplit(Items, ', ')))) %>%
select(-CouponItems), multiple = "all"
)},
arg0naut91B = {df1 %>%
left_join(
cross_join(df1, df2 %>% rename(CouponItems = Items)) %>%
rowwise %>%
filter(all(unlist(strsplit(CouponItems, ', ')) %in% unlist(strsplit(Items, ', ')))) %>%
select(-CouponItems), multiple = "all"
)},
Thomas = {unique(
na.omit(
setDT(df2)[, .(Items = unlist(strsplit(Items, ", "))), Coupons][
setDT(df1)[, .(Items = unlist(strsplit(Items, ", "))), ID],
on = "Items"
]
)[
,
.SD[uniqueN(Items) > 1], .(ID, Coupons)
][, Items := NULL]
)[df1,
on = "ID",
allow.cartesian = TRUE
][
,
.(ID, Items, Coupons)
]},
GKi3 = {i <- lapply(strsplit(df2$Items, ", "), function(s) {
Reduce(`&`, lapply(s, grepl, df1$Items, fixed=TRUE)) })
i <- c(i, list(!Reduce(`|`, i)))
cbind(df1[unlist(lapply(i, which)),], Coupons = rep(c(df2$Coupons, NA), sapply(i, sum))) },
GKi4 = {i <- lapply(strsplit(df2$Items, ", ", TRUE), function(s) {
Reduce(function(a, b) a[grep(b, df1$Items[a], fixed=TRUE)], s[-1], grep(s[[1]], df1$Items, fixed=TRUE)) })
j <- unique(unlist(i))
i <- if(length(j>0)) c(list(seq_len(nrow(df1))[-j]), i) else c(list(seq_len(nrow(df1))), i)
cbind(df1[unlist(i),], Coupons = rep(c(NA, df2$Coupons), lengths(i))) }
)
Result
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total…¹
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:t>
1 Darren1 34.6ms 34.61ms 28.5 223.7KB 85.4 3 9 105ms
2 Darren2 32.1ms 33.66ms 29.7 173.8KB 53.5 5 9 168ms
3 arg0naut91A 18.6ms 21.15ms 47.6 95.3KB 36.6 13 10 273ms
4 arg0naut91B 14.8ms 16.73ms 60.9 76.3KB 37.2 18 11 296ms
5 Thomas 3.6ms 4.13ms 222. 688.7KB 18.3 97 8 437ms
6 GKi3 289.1µs 322.51µs 2857. 42.4KB 24.0 1308 11 458ms
7 GKi4 284.4µs 312.12µs 3035. 46.5KB 23.6 1412 11 465ms
In this case GKi3 and GKi4 are the fastest and uses the lowest amount of memory.