Goal: I have a table of records (for example people) with a dynamic number of variables (e.g., email, phone, birthday,..., others). I want to compare each row with every other row and sum the number of variables that match.
# Input
my_data <- tibble(person = c("A","B","C","E","F"),
email = c("A@me.com", "A@me.com", NA, NA, NA),
phone = c(NA, NA, NA, 801, 801),
birthday = c("Jan1", "Jan1", NA, NA, NA))
# A tibble: 5 x 4
person email phone birthday
<chr> <chr> <dbl> <chr>
1 A A@me.com NA Jan1
2 B A@me.com NA Jan1
3 C NA NA NA
4 E NA 801 NA
5 F NA 801 NA
Using tidyr::expand_grid
, I can get every combination of records.
# Munging
my_data_a <- my_data %>%
rename_with(~str_c(., "_a"), everything())
my_data_b <- my_data %>%
rename_with(~str_c(., "_b"), everything())
grid <- expand_grid(my_data_a, my_data_b)
# A tibble: 25 x 9
person_a email_a phone_a birthday_a person_b email_b phone_b birthday_b total
<chr> <chr> <dbl> <chr> <chr> <chr> <dbl> <chr> <int>
1 A A@me.com NA Jan1 A A@me.com NA Jan1 2
2 A A@me.com NA Jan1 B A@me.com NA Jan1 2
3 A A@me.com NA Jan1 C NA NA NA 0
4 A A@me.com NA Jan1 E NA 801 NA 0
5 A A@me.com NA Jan1 F NA 801 NA 0
6 B A@me.com NA Jan1 A A@me.com NA Jan1 2
7 B A@me.com NA Jan1 B A@me.com NA Jan1 2
8 B A@me.com NA Jan1 C NA NA NA 0
9 B A@me.com NA Jan1 E NA 801 NA 0
10 B A@me.com NA Jan1 F NA 801 NA 0
# … with 15 more rows
Now I can compare each of the variables manually, but the issue is I will have more than email, phone, birthday.
grid %>%
mutate(email_match = email_a == email_b,
phone_match = phone_a == phone_b,
birthday_match = birthday_a == birthday_b) %>%
mutate(across(everything(), ~replace_na(., 0)),
total = email_match + phone_match + birthday_match) %>%
select(person_a, person_b, total)
# Output
person_a person_b total
<chr> <chr> <dbl>
1 A A 2
2 A B 2
3 A C 0
4 A E 0
5 A F 0
6 B A 2
7 B B 2
8 B C 0
9 B E 0
10 B F 0
# … with 15 more rows
This can be done by brute force in a for loop, but the data set is large:
# Brute force
a_col_start <- 2
a_col_end <- ncol(grid)/2
b_col_start <- a_col_end + 2
b_col_end <- ncol(grid)
for (i in 1:nrow(grid)) {
grid[i,"total"] <- sum(grid[i,a_col_start:a_col_end] == grid[i,b_col_start:b_col_end], na.rm = TRUE)
}
grid %>%
select(person_a, person_b, total)