1

I have data that takes the format:

have <- structure(list(V1 = c(4L, 28L, 2L), 
                       V2 = c("[{\"group\":1,\"topic\":\"A\"},{\"group\":1,\"topic\":\"B\"},{\"group\":2,\"topic\":\"C\"},{\"group\":2,\"topic\":\"T\"},{\"group\":2,\"topic\":\"U\"},{\"group\":3,\"topic\":\"V\"},{\"group\":3,\"topic\":\"D\"},{\"group\":3,\"topic\":\"R\"},{\"group\":4,\"topic\":\"A\"},{\"group\":4,\"topic\":\"Q\"},{\"group\":4,\"topic\":\"S\"},{\"group\":4,\"topic\":\"W\"},{\"group\":6,\"topic\":\"O\"},{\"group\":6,\"topic\":\"P\"},{\"group\":6,\"topic\":\"E\"},{\"group\":6,\"topic\":\"F\"},{\"group\":6,\"topic\":\"G\"},{\"group\":6,\"topic\":\"H\"},{\"group\":6,\"topic\":\"I\"},{\"group\":6,\"topic\":\"J\"},{\"group\":6,\"topic\":\"K\"},{\"group\":6,\"topic\":\"L\"},{\"group\":6,\"topic\":\"M\"},{\"group\":6,\"topic\":\"N\"}]",
                              "[]", 
                              "[{\"group\":2,\"topic\":\"C\"},{\"group\":3,\"topic\":\"D\"},{\"group\":6,\"topic\":\"O\"},{\"group\":6,\"topic\":\"P\"},{\"group\":6,\"topic\":\"E\"},{\"group\":6,\"topic\":\"G\"},{\"group\":6,\"topic\":\"M\"}]")
), 
row.names = c(NA, 3L), 
class = "data.frame")

The contents of V2 are nested groupings for each row like [{"group":1,"topic":"A"},{"group":1,"topic":"B"}...]

I want to get a wide dataframe that has an indicator (1/0) for each combination of group+topic (see also_have) for each row. Something like this:

# A tibble: 3 x 4
     id topic_id_1 topic_id_2 topic_id_3 topic_id_4 ...
  <dbl>      <dbl>      <dbl>      <dbl>
1     4          1          1          0
2    28          0          0          0
3     2          0          0          0

The first step is to parse the json.

I can use purrr::map(have$V2, jsonlite::fromJSON) to unnest into a list, but I'm not sure how to bind the V1 column (that we might rename to id) to each element of the resulting list (note that list element two is empty because V1==28 is empty). Here's a snippet of what the first element might look like with the id (V1) added.

[[1]]
   group topic  id
1      1     A   4
2      1     B   4
3      2     C   4
4      2     T   4
...

Alternatively, I think purrr::map_df(have$V2, jsonlite::fromJSON) would get me closer to what I ultimately need, but here too I'm not sure how to add the row id (V1).

df <- purrr::map_df(have$V2, jsonlite::fromJSON)
df

What I get:

   group topic
1      1     A
2      1     B
3      2     C
4      2     T
...

What I want (notice `V1==28` does not appear): 

   group topic id
1      1     A  4
2      1     B  4
3      2     C  4
4      2     T  4
5      2     U  4
6      3     V  4
7      3     D  4
8      3     R  4
9      4     A  4
10     4     Q  4
11     4     S  4
12     4     W  4
13     6     O  4
14     6     P  4
15     6     E  4
16     6     F  4
17     6     G  4
18     6     H  4
19     6     I  4
20     6     J  4
21     6     K  4
22     6     L  4
23     6     M  4
24     6     N  4
25     2     C  2
26     3     D  2
27     6     O  2
28     6     P  2
29     6     E  2
30     6     G  2
31     6     M  2

STOP.

I think if I can get the above dataframe with id I can get the rest of the way. The ultimate goal is to join this info with also_have and then pivot wide.

# join
also_have <- expand_grid(c(1:6), c(LETTERS)) %>%
  mutate(topic_id = 1:n()) %>%
  magrittr::set_colnames(c("group", "topic", "topic_id")) %>%
  select(topic_id, group, topic)

# pivot wide

# A tibble: 3 x 4
     id topic_id_1 topic_id_2 topic_id_3 topic_id_4 ...
  <dbl>      <dbl>      <dbl>      <dbl>
1     4          1          1          0
2    28          0          0          0
3     2          0          0          0

Update:

Applying @akrun's solution:

purrr::map_dfr(setNames(have$V2, have$V1), 
                     jsonlite::fromJSON, 
                     .id = 'V1') %>%
  rename(id = V1) %>%
  left_join(also_have, by=c("group", "topic")) %>%
  select(-group, -topic) %>%
  mutate(value = 1) %>%
  pivot_wider(id_cols = id, 
              names_from = topic_id,
              names_prefix = "topic_id",
              values_from = value,
              values_fill = 0
  ) %>% 
  full_join(tibble(id = as.character(have$V1))) %>%
  replace(is.na(.), 0)

# A tibble: 3 x 25
  id    topic_id1 topic_id2 topic_id29 topic_id46 topic_id47 topic_id74 topic_id56
  <chr>     <dbl>     <dbl>      <dbl>      <dbl>      <dbl>      <dbl>      <dbl>
1 4             1         1          1          1          1          1          1
2 2             0         0          1          0          0          0          1
3 28            0         0          0          0          0          0          0
# … with 17 more variables: topic_id70 <dbl>, topic_id79 <dbl>, topic_id95 <dbl>,
#   topic_id97 <dbl>, topic_id101 <dbl>, topic_id145 <dbl>, topic_id146 <dbl>,
#   topic_id135 <dbl>, topic_id136 <dbl>, topic_id137 <dbl>, topic_id138 <dbl>,
#   topic_id139 <dbl>, topic_id140 <dbl>, topic_id141 <dbl>, topic_id142 <dbl>,
#   topic_id143 <dbl>, topic_id144 <dbl>
Eric Green
  • 7,385
  • 11
  • 56
  • 102

1 Answers1

5

We could pass a named vector and then use .id in map_dfr

 purrr::map_dfr(setNames(have$V2, have$V1), jsonlite::fromJSON, .id = 'id')

-output

id group topic
1   4     1     A
2   4     1     B
3   4     2     C
4   4     2     T
5   4     2     U
6   4     3     V
7   4     3     D
8   4     3     R
9   4     4     A
10  4     4     Q
11  4     4     S
12  4     4     W
...

Or this can be done within in dplyr framework itself after using rowwise

library(tidyr)
have %>%
    rowwise %>%
    transmute(ID = V1, V2 = list(fromJSON(V2))) %>% 
    ungroup %>%
    unnest(c(V2), keep_empty = TRUE) %>% 
    select(-V2)
# A tibble: 32 x 3
      ID group topic
   <int> <int> <chr>
 1     4     1 A    
 2     4     1 B    
 3     4     2 C    
 4     4     2 T    
 5     4     2 U    
 6     4     3 V    
 7     4     3 D    
 8     4     3 R    
 9     4     4 A    
10     4     4 Q    
# … with 22 more rows

For the second step do a join

out <-  have %>%
     rowwise %>%
     transmute(ID = V1, V2 = list(fromJSON(V2))) %>% 
     ungroup %>%
     unnest(c(V2), keep_empty = TRUE) %>% 
     select(-V2) %>% right_join(also_have)
out
Joining, by = c("group", "topic")
# A tibble: 163 x 4
      ID group topic topic_id
   <int> <int> <chr>    <int>
 1     4     1 A            1
 2     4     1 B            2
 3     4     2 C           29
 4     4     2 T           46
 5     4     2 U           47
 6     4     3 V           74
 7     4     3 D           56
 8     4     3 R           70
 9     4     4 A           79
10     4     4 Q           95
# … with 153 more rows
akrun
  • 874,273
  • 37
  • 540
  • 662