4

I have two data.frames which have a 3 columns: 1. id - a unique key

  1. target - semicolon separated unique values

  2. source - similar for each of the data frames but different for the two data.frames.

Here's simulated data:

set.seed(1)
df.1 <- data.frame(id=LETTERS[sample(length(LETTERS),10,replace=F)],
                   target=sapply(1:10,function(x) paste(LETTERS[sample(length(LETTERS),5,replace=F)],collapse=";")),
                   source="A",stringsAsFactors=F)

df.2 <- data.frame(id=LETTERS[sample(length(LETTERS),5,replace=F)],
                   target=sapply(1:5,function(x) paste(LETTERS[sample(length(LETTERS),5,replace=F)],collapse=";")),
                   source="B",stringsAsFactors=F)

I'm looking for a function that will collapse the two data.frames together and will create 3 columns:

1.intersected.targets - semicolon separated unique values that are intersected between the two data.frames

2.source1.targets - targets that are unique to the first data.frame

3.source2.targets - targets that are unique to the second data.frame

So for the example above the resulting data.frame will be:

> res.df
   id intersected.targets sourceA.targets sourceB.targets
1   G                  NA       F;E;Q;I;X            <NA>
2   J                  NA       M;R;X;I;Y            <NA>
3   N                  NA       Y;F;P;C;Z            <NA>
4   U                  NA       K;A;J;U;H            <NA>
5   E                  NA       M;O;L;E;S            <NA>
6   S                  NA       R;T;C;Q;J            <NA>
7   W                  NA       V;Q;S;M;L            <NA>
8   M                  NA       U;A;L;Q;P            <NA>
9   B                  NA       C;H;M;P;I            <NA>
10  X                  NA            <NA>       G;L;S;B;T
11  H                  NA            <NA>       I;U;Z;H;K
12  Y                  NA            <NA>       L;R;J;H;Q
13  O                  NA            <NA>       F;R;C;Z;D
14  L                  V       M;K;F;B       X;J;R;Y
Daniel Widdis
  • 8,424
  • 13
  • 41
  • 63
dan
  • 6,048
  • 10
  • 57
  • 125
  • You could start with `library(data.table) ; dcast(rbind(setDT(df.1), setDT(df.2)), id ~ source, value.var = "target")`. Not sure what you want in the `intersected.targets` column as you haven't specified it in your desired output – David Arenburg Aug 16 '16 at 07:21
  • You don't need this modification (not to mention that your code didn't work) because you already have a common "V" there – David Arenburg Aug 16 '16 at 07:57
  • Right, sorry about that. Edited accordingly – dan Aug 16 '16 at 08:02
  • I'm guessing @DavidArenburg has only temporarily deleted his response and is editing it and if that's the case I was going to suggest that he continue with datatable methods using conversion to list columns for the source columns and then run `setdiff`, by=id. – IRTFM Aug 16 '16 at 08:33
  • @42- Nah, I don't have time for this, you can post a solution if have any. – David Arenburg Aug 16 '16 at 08:37
  • Rats, I had gotten as far as you using base methods but was kind of hoping you were further along and I could give up. (I think you're a lot better with data.table than I.) – IRTFM Aug 16 '16 at 08:39
  • concat string, table characters, subset >1, join string? – shayaa Aug 16 '16 at 08:49

2 Answers2

2

The pain in the butt in this type of data cleaning, as @42- mentions, is unlisting data frames of lists.

library(dplyr)
library(stringr)
df <- full_join(df.1, df.2) %>% 
  spread(source, target)  %>%
  mutate(intersect_targets = str_c(A,B,sep = ";"))

df[,4][!is.na(df[,4])] <- names(do.call("c",lapply(df$intersect_targets, function(x) 
which(table(str_split(x, ";"))>1))))

a <- sapply(seq(nrow(df)), function(x) {
str_split(df[x,2:3],";")
})

sa <-  do.call("c",lapply(mapply(setdiff,a[1,], a[2,]),paste0, collapse = ","))
sb <- do.call("c",lapply(mapply(setdiff,a[2,], a[1,]), paste0, collapse = ","))

df[,2:3] <-cbind(sa,sb)

 head(df)
  id         A         B intersect_targets
1  B C,H,M,P,I        NA              <NA>
2  E M,O,L,E,S        NA              <NA>
3  G F,E,Q,I,X        NA              <NA>
4  H        NA I,U,Z,H,K              <NA>
5  J M,R,X,I,Y        NA              <NA>
6  L   M,K,F,B   X,J,R,Y                 V
shayaa
  • 2,787
  • 13
  • 19
  • Doesn't look like you removed teh intersecting elements from the input sets. – IRTFM Aug 16 '16 at 09:56
  • 1
    It's possible we could have done it on the non-list version with something like: `A <- gsub( paste( ";*, intersect_targets), "", A)`. Kind of frustrating to spend an hour at @AM on a question and then get no upvote, eh? – IRTFM Aug 17 '16 at 00:27
  • You have mine. Didn't get a chance to read it start to finish until just now. – shayaa Aug 17 '16 at 01:07
2

This is a continuation of DavidArenberg's deleted answer that taught me the notion of creating a list column in a data.table. I didn't know how to properly implement my idea of using setdiff row by row but eventually after multiple searches found an answer by Frank that does it. Here is David's (partial) answer:

===== Here's a possible solution on a different seed that have more than one intersections and more than one letter in a single intersection

#Generating Data

set.seed(123)
df.1 <- data.frame(id=LETTERS[sample(length(LETTERS),10,replace=F)],
                   target=sapply(1:10,function(x) paste(LETTERS[sample(length(LETTERS),5,
                                                                replace=F)],collapse=";")),
                   source="A",stringsAsFactors=F)

df.2 <- data.frame(id=LETTERS[sample(length(LETTERS),5, replace=F)],
                   target=sapply(1:5,function(x) paste(LETTERS[sample(length(LETTERS),5, 
                                                               replace=F)],collapse=";")),
                   source="B",stringsAsFactors=F)
#Solution

library(data.table) 
library(stringi)
res <- dcast(rbind(setDT(df.1), setDT(df.2)), id ~ source, value.var = "target")
res[!is.na(A) & !is.na(B), intersected.targets := 
                             stri_extract_all(A, regex = gsub(";", "|", B, fixed = TRUE))]
res

==========================

So I used his listifying code to make an A2and B2 column that are the list-version of A and B

res[ , A2 := stri_extract_all(A, regex = "[[:alpha:]]") ]
 res[ , B2 := stri_extract_all(B, regex = "[[:alpha:]]") ]

Then used Map() to do a row by row setdiff:

res[, SourceA := Map( setdiff, A2, intersected.targets)]
res[, SourceB := Map( setdiff, B, intersected.targets)]
 res
#-------------------------------
    id         A         B intersected.targets        A2        B2   SourceA   SourceB
 1:  A M;S;F;H;X        NA                NULL M,S,F,H,X        NA M,S,F,H,X        NA
 2:  C        NA T;P;R;A;K                NULL        NA T,P,R,A,K        NA T,P,R,A,K
 3:  G        NA G;Q;K;S;C                NULL        NA G,Q,K,S,C        NA G,Q,K,S,C
 4:  H Y;L;Q;N;C        NA                NULL Y,L,Q,N,C        NA Y,L,Q,N,C        NA
 5:  J X;R;P;W;O F;J;O;I;C                   O X,R,P,W,O F,J,O,I,C   X,R,P,W   F,J,I,C
 6:  K D;K;J;I;Z        NA                NULL D,K,J,I,Z        NA D,K,J,I,Z        NA
 7:  Q D;F;L;G;S        NA                NULL D,F,L,G,S        NA D,F,L,G,S        NA
 8:  R        NA L;U;T;S;J                NULL        NA L,U,T,S,J        NA L,U,T,S,J
 9:  T X;G;B;H;U        NA                NULL X,G,B,H,U        NA X,G,B,H,U        NA
10:  U S;N;O;G;D        NA                NULL S,N,O,G,D        NA S,N,O,G,D        NA
11:  W Z;W;Q;S;A        NA                NULL Z,W,Q,S,A        NA Z,W,Q,S,A        NA
12:  X B;L;T;C;M        NA                NULL B,L,T,C,M        NA B,L,T,C,M        NA
13:  Z F;D;S;U;I L;Y;V;U;D                 D,U F,D,S,U,I L,Y,V,U,D     F,S,I     L,Y,V

I'm leaving the clean-up as a student exercise.

IRTFM
  • 258,963
  • 21
  • 364
  • 487