1

To calculate the unique users in a dataset one can use the following code:

library(tidyverse)
library(igraph)
        
# Label every row of `dat` with the connected component (GRP) its user_id
# belongs to. graph_from_data_frame() reads the first two columns of `dat` as
# edge endpoints, so users linked through a shared value fall into the same
# component. (graph.data.frame() is the deprecated alias of this function.)
graph_from_data_frame(dat) %>%
    components() %>%
    pluck(membership) %>%
    # stack() turns the named membership vector into a (values, ind) data
    # frame, which is renamed so it can be joined back on user_id.
    stack() %>%
    set_names(c("GRP", "user_id")) %>%
    # right_join keeps every original row of `dat`; user_id is converted to a
    # factor because stack() returns the vertex names as a factor.
    right_join(dat %>% mutate(user_id = as.factor(user_id)), by = "user_id")

I was wondering if there is a way to expand/modify that piece of code, so we can do the same procedure for more than 2 fields. For example, for the following data:

# Example data: one row per observed (user_id, phone_number, email) triple.
# The same user_id, phone number, or e-mail address may appear on several rows.
dat <- data.frame(
    user_id = c(
        101, 102, 102, 103, 103, 106, 107, 111, 112
    ),
    phone_number = c(
        4030201, 4030201, 4030202, 4030202, 4030203,
        4030204, 4030205, 4030203, 4030206
    ),
    email = c(
        "a@gmail.com", "b@gmail.com", "c@gmail.com",
        "d@gmail.com", "e@gmail.com", "f@gmail.com",
        "g@gmail.com", "h@gmail.com", "a@gmail.com"
    )
)

Any ideas on how the code can be modified to handle more than 2 fields? Thanks!

1 Answer

1

You can try the igraph code below

# One graph per pair of neighbouring columns: g1 links user_id to phone_number
# (column 3 dropped), g2 links phone_number to email (column 1 dropped).
# NOTE(review): vertices are identified by their value, so the same value
# occurring in two different columns would be treated as one vertex -- confirm
# the columns cannot collide.
g1 <- graph_from_data_frame(dat[-3])
g2 <- graph_from_data_frame(dat[-1])
# The union joins the two graphs through the phone numbers they share, so the
# component id of a user reflects links via phone number and via e-mail.
# igraph::union() is used instead of graph.union(), which is deprecated since
# igraph 2.0.0; the namespace prefix avoids picking up another package's
# union() if one is attached.
merge(
    dat,
    stack(membership(components(igraph::union(g1, g2)))),
    by.x = "user_id",
    by.y = "ind",
    all.x = TRUE
)

which gives

  user_id phone_number       email values
1     101      4030201 a@gmail.com      1
2     102      4030201 b@gmail.com      1
3     102      4030202 c@gmail.com      1
4     103      4030202 d@gmail.com      1
5     103      4030203 e@gmail.com      1
6     106      4030204 f@gmail.com      2
7     107      4030205 g@gmail.com      3
8     111      4030203 h@gmail.com      1
9     112      4030206 a@gmail.com      1

Generalization

For general cases, i.e., more than 2 columns, you can try

# Build one graph per pair of adjacent columns (1-2, 2-3, ...) and merge them
# into a single graph. Rows that share a value in any column end up in the
# same connected component.
# NOTE(review): vertices are identified by their value, so the same value
# occurring in two different columns would be treated as one vertex -- confirm
# the columns cannot collide.
stopifnot(is.data.frame(dat), ncol(dat) >= 2L)

# lapply() over the left column index keeps this working when `dat` has
# exactly two columns (a single pair), and Reduce() returns that single graph
# unchanged instead of needing a union. igraph::union() replaces the
# deprecated graph.union().
g <- Reduce(
    igraph::union,
    lapply(
        seq_len(ncol(dat) - 1L),
        function(i) graph_from_data_frame(dat[c(i, i + 1L)])
    )
)

# Attach each user's component id (column `values`) to the original rows;
# all.x = TRUE keeps every row of `dat`.
merge(
    dat,
    stack(membership(components(g))),
    by.x = "user_id",
    by.y = "ind",
    all.x = TRUE
)
ThomasIsCoding
  • 96,636
  • 9
  • 24
  • 81