0

I have a dataframe (sample below) that has repeating data. The first column is the data from which the sample data was taken. I would like to create a table that calculates the number of overlaps in each possible intersection between the dates. In other words, using 2014.01.14 as an example, I want to see how many of the elements from that date are in the 2014.07.14 sample, in the 2014.04.14 sample, etc.

I have been looking at the setdiff and calculate.overlap functions (from dplyr and the Venn Diagram function), but haven't found an efficient way to do these multiple set overlappings.

Any help would be much appreciated - thanks.

EDIT: Desired output is a table that summarizes all overlap combinations - so an x by x grid where x is the number of distinct dates and the elements the number of overlaps in each combination. In other words, the diagonal of this table would all be 100% since it will be comparing sets with themselves.

structure(list(.id = c("2016.10.14", "2016.07.14", "2014.04.14", 
"2014.01.14", "2017.01.06", "2016.10.14", "2016.07.14", "2016.04.14", 
"2016.07.14", "2016.04.14", "2016.01.14", "2015.10.14", "2015.07.14", 
"2015.04.14", "2015.01.14", "2014.10.14", "2017.03.06", "2017.01.06", 
"2016.10.14", "2016.07.14", "2016.04.14", "2016.01.14", "2015.04.14", 
"2015.01.14", "2014.10.14", "2014.07.14", "2014.04.14", "2014.01.14", 
"2016.07.14", "2016.04.14", "2016.01.14", "2015.10.14", "2015.07.14", 
"2015.04.14", "2015.04.14", "2016.01.14", "2015.10.14", "2015.07.14", 
"2015.04.14", "2015.01.14", "2014.10.14", "2014.07.14", "2014.04.14", 
"2014.01.14", "2017.03.06", "2017.01.06", "2016.10.14", "2016.07.14", 
"2015.01.14", "2014.10.14", "2014.07.14", "2014.04.14", "2014.01.14", 
"2016.04.14", "2016.01.14", "2017.03.06", "2017.01.06", "2016.10.14", 
"2016.07.14", "2016.04.14", "2016.01.14", "2015.10.14", "2015.07.14", 
"2015.04.14", "2015.01.14", "2014.10.14", "2014.07.14", "2014.04.14", 
"2014.01.14", "2017.03.06", "2017.01.06", "2015.10.14", "2015.07.14", 
"2017.03.06", "2017.01.06", "2015.10.14", "2015.07.14", "2015.04.14", 
"2015.01.14", "2014.10.14", "2014.07.14", "2014.04.14", "2014.01.14", 
"2016.01.14", "2015.10.14", "2015.07.14", "2015.04.14", "2015.01.14", 
"2014.10.14", "2015.07.14", "2017.03.06", "2017.01.06", "2016.10.14", 
"2016.07.14", "2016.04.14", "2016.01.14", "2015.10.14", "2015.07.14", 
"2015.04.14", "2015.01.14"), Name = c("ZIMMER BIOMET HOLDINGS INC", 
"ZIMMER BIOMET HOLDINGS INC", "ZIMMER BIOMET HOLDINGS INC", "ZIMMER BIOMET HOLDINGS INC", 
"XYLEM INC", "XYLEM INC", "XYLEM INC", "XYLEM INC", "XL GROUP LTD", 
"XL GROUP LTD", "XL GROUP LTD", "XL GROUP LTD", "XL GROUP LTD", 
"XL GROUP LTD", "XL GROUP LTD", "XL GROUP LTD", "XCEL ENERGY INC", 
"XCEL ENERGY INC", "XCEL ENERGY INC", "XCEL ENERGY INC", "XCEL ENERGY INC", 
"XCEL ENERGY INC", "XCEL ENERGY INC", "XCEL ENERGY INC", "XCEL ENERGY INC", 
"XCEL ENERGY INC", "XCEL ENERGY INC", "XCEL ENERGY INC", "WILLIS TOWERS WATSON PLC", 
"WILLIS TOWERS WATSON PLC", "WEYERHAEUSER CO", "WEYERHAEUSER CO", 
"WEYERHAEUSER CO", "WEYERHAEUSER CO", "WELLTOWER INC", "WELLS FARGO & CO", 
"WELLS FARGO & CO", "WELLS FARGO & CO", "WELLS FARGO & CO", "WELLS FARGO & CO", 
"WELLS FARGO & CO", "WELLS FARGO & CO", "WELLS FARGO & CO", "WELLS FARGO & CO", 
"WEC ENERGY GROUP INC", "WEC ENERGY GROUP INC", "WEC ENERGY GROUP INC", 
"WEC ENERGY GROUP INC", "WEC ENERGY GROUP INC", "WEC ENERGY GROUP INC", 
"WEC ENERGY GROUP INC", "WEC ENERGY GROUP INC", "WEC ENERGY GROUP INC", 
"WATERS CORP", "WATERS CORP", "WASTE MANAGEMENT INC", "WASTE MANAGEMENT INC", 
"WASTE MANAGEMENT INC", "WASTE MANAGEMENT INC", "WASTE MANAGEMENT INC", 
"WASTE MANAGEMENT INC", "WASTE MANAGEMENT INC", "WASTE MANAGEMENT INC", 
"WASTE MANAGEMENT INC", "WASTE MANAGEMENT INC", "WASTE MANAGEMENT INC", 
"WASTE MANAGEMENT INC", "WASTE MANAGEMENT INC", "WASTE MANAGEMENT INC", 
"WALT DISNEY CO/THE", "WALT DISNEY CO/THE", "WALT DISNEY CO/THE", 
"WALT DISNEY CO/THE", "WAL-MART STORES INC", "WAL-MART STORES INC", 
"WAL-MART STORES INC", "WAL-MART STORES INC", "WAL-MART STORES INC", 
"WAL-MART STORES INC", "WAL-MART STORES INC", "WAL-MART STORES INC", 
"WAL-MART STORES INC", "WAL-MART STORES INC", "VORNADO REALTY TRUST", 
"VORNADO REALTY TRUST", "VORNADO REALTY TRUST", "VORNADO REALTY TRUST", 
"VORNADO REALTY TRUST", "VORNADO REALTY TRUST", "VF CORP", "VERIZON COMMUNICATIONS INC", 
"VERIZON COMMUNICATIONS INC", "VERIZON COMMUNICATIONS INC", "VERIZON COMMUNICATIONS INC", 
"VERIZON COMMUNICATIONS INC", "VERIZON COMMUNICATIONS INC", "VERIZON COMMUNICATIONS INC", 
"VERIZON COMMUNICATIONS INC", "VERIZON COMMUNICATIONS INC", "VERIZON COMMUNICATIONS INC"
), X..Wgt..P. = c(0.919761358, 0.933057358, 0.864039801, 0.986472727, 
0.818192145, 0.964415732, 0.937955303, 0.975205821, 0.842865506, 
1.126873419, 1.112947401, 1.099063977, 1.164558843, 1.151605792, 
0.988825771, 0.918127246, 0.961678801, 1.033232496, 1.071266028, 
1.058801025, 0.93020876, 1.006470014, 0.964550593, 1.125389358, 
1.124178769, 1.059881818, 1.085759766, 1.0200896, 0.909300947, 
1.012656226, 0.871314497, 0.94065386, 0.936539398, 0.89578771, 
0.970174408, 0.872847521, 0.949033595, 1.002482323, 0.969762291, 
0.942347801, 0.989303579, 0.98593113, 0.968543074, 0.950396767, 
0.92842183, 0.972461108, 1.006570516, 1.005416119, 0.993368064, 
1.002700203, 0.958855231, 1.096422022, 1.021761901, 0.963388913, 
0.926948746, 1.304745321, 1.199632088, 1.234399398, 1.269585556, 
1.087885176, 1.189337133, 1.146522298, 1.053416881, 1.16547938, 
1.096877887, 1.018844619, 1.014660123, 0.999372549, 1.032241197, 
1.063327494, 1.00757917, 0.963540352, 0.944914081, 0.896543743, 
0.80520587, 0.841142125, 0.921557168, 0.959306858, 1.278648475, 
1.299562929, 1.191759169, 1.222574274, 1.082399965, 0.943414727, 
0.998604251, 0.94817809, 1.097301363, 1.220343667, 0.931967545, 
0.900281393, 1.018445806, 1.18929386, 1.085480841, 1.13721209, 
1.070562566, 1.135912005, 1.0973153, 1.025256697, 1.008197447, 
0.907933413), Mkt.Val..P. = c(60495554.15, 74208256.58, 33088060.26, 
36688942.4, 52333970.68, 63432610.68, 74597801.67, 66667061.11, 
67035085.44, 77035367.75, 61854260.94, 52447251.15, 55756621.76, 
59977265.12, 52416987.1, 41839037.8, 62294332.68, 66088582.56, 
70460485.65, 84208947.55, 63590970.16, 55936568.82, 50235164.73, 
59656130.73, 51228801, 43818397.08, 41578738.06, 37939222.8, 
72318853.08, 69227247.24, 48425032.67, 44887932.18, 44839531.56, 
46653896.1, 50528060.96, 48510233.52, 45287812.5, 47996739.75, 
50506597.32, 49953221.25, 45082541.65, 40761074.48, 37089971.51, 
35347203.52, 60140057.4, 62201466.24, 66205261.4, 79963119.81, 
52657770.96, 45693025.53, 39641683.2, 41987044.93, 38001419.1, 
65859233.16, 51517016.5, 84517033.09, 76731988.76, 81190272.8, 
100973139.4, 74370051.84, 66099771.92, 54711958.7, 50435379, 
60699821.28, 58144756.89, 46428726.18, 41948809.2, 38270574, 
38391165.6, 68878794.6, 64447720.56, 45980073.86, 45240493.7, 
58075101.72, 51503330.42, 40139239.59, 44122213.97, 49962063.55, 
67780293.12, 59221053.22, 49270565.45, 46817995.2, 40256672.96, 
52432146.06, 47653320.52, 45396767.6, 57149013.33, 64689594.6, 
42469739.92, 43103574.75, 65971509.17, 76070725.4, 71395437.96, 
90445164.8, 73185842.88, 63130564.42, 52363804.43, 49087128.75, 
52508354.86, 48128937.78), Ticker = c("ZBH US", "ZBH US", "ZBH US", 
"ZBH US", "XYL US", "XYL US", "XYL US", "XYL US", "XL US", "XL US", 
"XL US", "XL US", "XL US", "XL US", "XL US", "XL US", "XEL US", 
"XEL US", "XEL US", "XEL US", "XEL US", "XEL US", "XEL US", "XEL US", 
"XEL US", "XEL US", "XEL US", "XEL US", "WLTW US", "WLTW US", 
"WY US", "WY US", "WY US", "WY US", "HCN US", "WFC US", "WFC US", 
"WFC US", "WFC US", "WFC US", "WFC US", "WFC US", "WFC US", "WFC US", 
"WEC US", "WEC US", "WEC US", "WEC US", "WEC US", "WEC US", "WEC US", 
"WEC US", "WEC US", "WAT US", "WAT US", "WM US", "WM US", "WM US", 
"WM US", "WM US", "WM US", "WM US", "WM US", "WM US", "WM US", 
"WM US", "WM US", "WM US", "WM US", "DIS US", "DIS US", "DIS US", 
"DIS US", "WMT US", "WMT US", "WMT US", "WMT US", "WMT US", "WMT US", 
"WMT US", "WMT US", "WMT US", "WMT US", "VNO US", "VNO US", "VNO US", 
"VNO US", "VNO US", "VNO US", "VFC US", "VZ US", "VZ US", "VZ US", 
"VZ US", "VZ US", "VZ US", "VZ US", "VZ US", "VZ US", "VZ US"
)), .Names = c(".id", "Name", "X..Wgt..P.", "Mkt.Val..P.", "Ticker"
), row.names = c(NA, 100L), class = "data.frame")
Z_D
  • 797
  • 2
  • 12
  • 30
  • 1
    You ought to sanitize your column names, dates, etc. first, I think. Also, provide desired output. You might also consider dropping the `Name` column that apparently doesn't add anything to the problem (except size). Also, please show desired output corresponding to the example. The description of overlap you give isn't clear enough on its own. – Frank Mar 10 '17 at 16:33
  • Edited the above - hope that helps. – Z_D Mar 10 '17 at 16:37
  • Ok, you might want to look into `table` and `expand.grid`. With dplyr, a very ugly approach would be: `DF %>% group_by(Ticker) %>% do({ud = unique(.$.id); expand.grid(ud, ud)}) %>% with(table(Var1, Var2))`. As far as converting that to a share of overlap, I guess you can figure out some extension. – Frank Mar 10 '17 at 16:51
  • Thank you! That does the trick – Z_D Mar 10 '17 at 23:25

1 Answers1

2

This question hasn't got an answer although Frank has posted a solution in a comment.

Therefore, I will post two solutions which implement Frank's approach using dplyr/tidyr and cross join/dcast() from data.table.

In all three approaches the .id column is expanded for each Ticker, then the number of occurrences of Ticker is counted for each unique combination of .id/.id, and then reshaped from long to wide format, finally.

dplyr/tidyr

library(dplyr)
library(tidyr)
DF %>% 
  group_by(Ticker) %>%
  expand(.id, .id) %>% 
  group_by(.id, .id1) %>% 
  count() %>% 
  spread(.id1, n)
# A tibble: 14 x 15
# Groups:   .id [14]
          .id `2014.01.14` `2014.04.14` `2014.07.14` `2014.10.14` `2015.01.14` `2015.04.14` `2015.07.14` `2015.10.14`
 *      <chr>        <int>        <int>        <int>        <int>        <int>        <int>        <int>        <int>
 1 2014.01.14            6            6            5            5            5            4            3            3
 2 2014.04.14            6            6            5            5            5            4            3            3
 3 2014.07.14            5            5            5            5            5            4            3            3
 4 2014.10.14            5            5            5            7            7            6            5            5
 5 2015.01.14            5            5            5            7            8            7            6            6
 6 2015.04.14            4            4            4            6            7            9            7            7
 7 2015.07.14            3            3            3            5            6            7            9            8
 8 2015.10.14            3            3            3            5            6            7            8            8
 9 2016.01.14            3            3            3            5            6            7            6            6
10 2016.04.14            2            2            2            3            4            4            3            3
11 2016.07.14            4            4            3            4            5            4            3            3
12 2016.10.14            4            4            3            3            4            3            2            2
13 2017.01.06            4            4            4            4            5            4            4            4
14 2017.03.06            4            4            4            4            5            4            4            4
# ... with 6 more variables: `2016.01.14` <int>, `2016.04.14` <int>, `2016.07.14` <int>, `2016.10.14` <int>,
#   `2017.01.06` <int>, `2017.03.06` <int>

data.table

library(data.table)
setDT(DF)[, CJ(.id, .id), by = Ticker][, dcast(.SD, V1 ~ V2, length, value.var = "Ticker")]
            V1 2014.01.14 2014.04.14 2014.07.14 2014.10.14 2015.01.14 2015.04.14 2015.07.14 2015.10.14 2016.01.14
 1: 2014.01.14          6          6          5          5          5          4          3          3          3
 2: 2014.04.14          6          6          5          5          5          4          3          3          3
 3: 2014.07.14          5          5          5          5          5          4          3          3          3
 4: 2014.10.14          5          5          5          7          7          6          5          5          5
 5: 2015.01.14          5          5          5          7          8          7          6          6          6
 6: 2015.04.14          4          4          4          6          7          9          7          7          7
 7: 2015.07.14          3          3          3          5          6          7          9          8          6
 8: 2015.10.14          3          3          3          5          6          7          8          8          6
 9: 2016.01.14          3          3          3          5          6          7          6          6          8
10: 2016.04.14          2          2          2          3          4          4          3          3          5
11: 2016.07.14          4          4          3          4          5          4          3          3          4
12: 2016.10.14          4          4          3          3          4          3          2          2          3
13: 2017.01.06          4          4          4          4          5          4          4          4          3
14: 2017.03.06          4          4          4          4          5          4          4          4          3
    2016.04.14 2016.07.14 2016.10.14 2017.01.06 2017.03.06
 1:          2          4          4          4          4
 2:          2          4          4          4          4
 3:          2          3          3          4          4
 4:          3          4          3          4          4
 5:          4          5          4          5          5
 6:          4          4          3          4          4
 7:          3          3          2          4          4
 8:          3          3          2          4          4
 9:          5          4          3          3          3
10:          7          6          4          4          3
11:          6          8          6          5          4
12:          4          6          6          5          4
13:          4          5          5          7          6
14:          3          4          4          6          6
Community
  • 1
  • 1
Uwe
  • 41,420
  • 11
  • 90
  • 134