
I am trying to take the top x results from a group of nested data. Adding `map(data, slice(seq(n()*0.2)))` to the end of the pipe gives me errors.

So I am trying to get the top 20% of each `tbl_df` and return them (a sketch of what I am after follows the nesting code below).

Secondly, how can I nest the data based on the date column? I would like to nest together all observations between July of year t and June of year t+1. In other words, instead of nesting on Yrs, I would like to nest on a specific date range taken from the date column.

Data looks like (I changed all the years to 2007/2008 for this illustration):

       id       Yrs       date       var   NESTID
1 1144215 2006_2007 2007-02-23 0.7897917     1
2  352510 2011_2012 2007-02-27 0.9626791     1
3  793952 2008_2009 2007-11-27 0.8186645     2
4 1368457 2014_2015 2007-03-02 0.9151516     1
5  319201 2012_2013 2007-07-20 0.9816049     2
6  882095 2017_2018 2008-03-19 0.9924851     2 

So all the 1s would be in one nest and all the 2s would be in another nest; the 2008 observation, which falls before July, would go in the same nest as the 2s.

Code:

nested_result <- x %>%
  as_tibble() %>%
  group_by(Yrs) %>%
  nest(.key = "data")
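For reference, a sketch of the step I am trying to add after the nesting (illustration only; it assumes `var` is the column to rank on and uses `top_frac()` to keep the highest 20% of each nested tibble):

library(dplyr)
library(purrr)

# Sketch only: keep the top 20% of rows in each nested tibble, ranked by `var`.
nested_result %>%
  mutate(data = map(data, ~ top_frac(.x, 0.2, wt = var)))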

Data:

x <- structure(list(id = c(1144215L, 352510L, 793952L, 1368457L, 319201L, 
882095L, 28412L, 1048911L, 109198L, 42293L, 97854L, 764180L, 
53117L, 1038357L, 310158L, 1324404L, 1133421L, 1646383L, 1095073L, 
1166691L, 1133421L, 721083L, 28412L, 1171825L, 764622L, 40493L, 
27904L, 109198L, 1039684L, 916457L, 72741L, 800240L, 1168054L, 
16918L, 4447L, 885639L, 1066806L, 1156039L, 732485L, 63754L, 
1310067L, 720005L, 1168054L, 92122L, 310158L, 1601712L, 42582L, 
70858L, 789388L, 21344L, 310764L, 835910L, 1065696L, 835541L, 
863157L, 51253L, 1137774L, 1018963L, 217346L, 945436L, 1058290L, 
84792L, 50104L, 8868L, 200406L, 929887L, 200406L, 64803L, 709804L, 
41077L, 785161L, 1585689L, 1037868L, 769397L, 63754L, 720005L, 
63541L, 33213L, 723527L, 1021860L, 4904L, 832428L, 915389L, 356028L, 
1361658L, 929887L, 40987L, 352541L, 1518832L, 100893L, 1004440L, 
1633917L, 14693L, 783325L, 92380L, 1043277L, 797468L, 922224L, 
1018963L, 205520L), Yrs = c("2006_2007", "2011_2012", "2008_2009", 
"2014_2015", "2012_2013", "2017_2018", "2008_2009", "2010_2011", 
"2018_2019", "2016_2017", "2011_2012", "2018_2019", "2017_2018", 
"2017_2018", "2013_2014", "2013_2014", "2014_2015", "2018_2019", 
"2008_2009", "2011_2012", "2012_2013", "2010_2011", "2017_2018", 
"2006_2007", "2013_2014", "2008_2009", "2015_2016", "2006_2007", 
"2006_2007", "2010_2011", "2015_2016", "2012_2013", "2017_2018", 
"2011_2012", "2011_2012", "2010_2011", "2006_2007", "2008_2009", 
"2007_2008", "2011_2012", "2016_2017", "2008_2009", "2008_2009", 
"2013_2014", "2008_2009", "2008_2009", "2006_2007", "2013_2014", 
"2008_2009", "2013_2014", "2008_2009", "2009_2010", "2013_2014", 
"2017_2018", "2016_2017", "2006_2007", "2011_2012", "2018_2019", 
"2013_2014", "2012_2013", "2012_2013", "2011_2012", "2013_2014", 
"2017_2018", "2010_2011", "2017_2018", "2018_2019", "2009_2010", 
"2017_2018", "2007_2008", "2012_2013", "2008_2009", "2009_2010", 
"2006_2007", "2009_2010", "2011_2012", "2015_2016", "2007_2008", 
"2007_2008", "2010_2011", "2018_2019", "2007_2008", "2012_2013", 
"2011_2012", "2015_2016", "2006_2007", "2007_2008", "2014_2015", 
"2013_2014", "2009_2010", "2011_2012", "2016_2017", "2011_2012", 
"2007_2008", "2008_2009", "2017_2018", "2014_2015", "2017_2018", 
"2014_2015", "2008_2009"), date = c("2007-02-23", "2012-02-27", 
"2009-11-27", "2015-03-02", "2013-02-20", "2018-03-19", "2009-02-20", 
"2011-03-01", "2019-02-15", "2017-07-24", "2012-02-29", "2019-02-26", 
"2018-03-20", "2018-02-21", "2014-02-27", "2014-02-06", "2015-02-26", 
"2019-02-21", "2009-02-27", "2012-02-28", "2013-02-27", "2011-03-01", 
"2018-02-16", "2007-02-27", "2014-12-10", "2009-08-26", "2016-08-30", 
"2007-02-28", "2007-03-01", "2011-02-17", "2016-02-26", "2013-06-17", 
"2018-02-23", "2012-11-16", "2012-02-24", "2011-02-24", "2007-02-28", 
"2009-10-27", "2008-02-22", "2012-05-21", "2017-03-21", "2009-02-17", 
"2009-03-02", "2014-02-27", "2009-02-27", "2009-02-26", "2007-02-16", 
"2014-02-26", "2009-02-19", "2014-02-27", "2009-12-14", "2010-12-20", 
"2014-02-07", "2018-02-22", "2017-03-24", "2007-02-15", "2012-02-15", 
"2019-02-25", "2014-02-27", "2013-05-30", "2013-02-25", "2012-06-27", 
"2014-02-27", "2018-02-07", "2011-02-25", "2018-02-23", "2019-02-25", 
"2010-02-26", "2018-05-02", "2008-03-14", "2013-02-19", "2009-02-25", 
"2010-02-19", "2007-03-01", "2010-05-24", "2012-02-21", "2016-12-20", 
"2008-02-27", "2008-02-28", "2011-03-11", "2019-02-21", "2008-02-28", 
"2013-02-26", "2012-02-28", "2016-02-26", "2007-02-28", "2008-07-10", 
"2015-03-02", "2014-02-24", "2010-05-07", "2012-02-23", "2017-03-16", 
"2012-10-31", "2008-06-27", "2009-02-11", "2018-02-27", "2015-02-13", 
"2018-02-15", "2015-02-23", "2009-02-25"), var = c(0.789791703652461, 
0.962679090148091, 0.818664455488741, 0.915151626033204, 0.981604875838189, 
0.992485078188146, 0.919096450897814, 1, 1, 0.999495268313767, 
0.971549791181422, 0.142410091613585, 0.854300303926079, 0.963969335350853, 
0.100915282090841, 0.788888929835236, 0.998838037754812, 1, 0.585900427740673, 
0.99197864817115, 0.963194355499937, 0.665254900669739, 0.989803980097533, 
0.989481867437919, 0.50494546710205, 1, 0.955496106519928, 0.926277449400448, 
0.848103309375422, 1, 0.643463529314152, 0.480751853712661, 0.912805193910848, 
0.980011721979559, 0.850678323003415, 0.97329136045642, 0.9463752199707, 
0.984883553015867, 0.950093429532104, 0.694897812240191, 0.988148152699705, 
0.909736787656526, 0.882419459468187, 1, 1, 0.602296327788106, 
0.982390900591669, 0.832481527160082, 0.970062225639918, 0.93938848599177, 
0.420104624462152, 0.822785559294837, 0.922609495465592, 1, 0.847618582834581, 
1, 0.998884518357301, 0.856694145446766, 0.954934410586873, 0.999483529190324, 
0.964333698862739, 0.000025317480951105, 1, 1, 0.988227892981814, 
0.976453170572977, 0.897281824659027, 1, 0.991392575240258, 0.918719921619656, 
0.308549664065154, 0.989794242194186, 0.984928826473983, 0.921184863501297, 
0.993424278980621, 0.981441305437729, 1, 1, 0.945774347345707, 
0.9593304680852, 0.944686323046023, 0.954719328141632, 0.996087437964504, 
0.00723186083983232, 0.998107962441803, 1, 0.999992291359637, 
0.954625273413497, 0.937925176467551, 0.840391718964617, 0.976419264342728, 
0.994956214638493, 0.993807958583037, 0.929808089723338, 0.497183583710065, 
0.826207496536073, 1, 1, 0.827230630432659, 0.714753109251633
)), row.names = c(NA, -100L), class = "data.frame")
    There are multiple distinct questions here. I would recommend breaking them up into 1) top x results from nested data frames, 2) nest on date range. Also, when you say "top x" results, what do you mean? The first `x` records based on the current row order, or the "top x" ranked based on some other variable? – Wil May 01 '19 at 12:18
  • Yes, I do realise I have asked a few questions. What I really want to do is firstly nest the data based on the date ranges as you mentioned, then secondly take the top x observations using something like `top_n()` for each of the nests. Top here meaning 1 is the highest, 0 the lowest; not the first x observations. – user8959427 May 01 '19 at 12:24
  • When you say highest and lowest, is that based on the `var` column? – Wil May 01 '19 at 12:28
  • Ah yes, sorry I should have mentioned that! – user8959427 May 01 '19 at 12:28
  • ok thanks. And what is wrong with nesting by the `Yrs` column? How do you want that to change? – Wil May 01 '19 at 12:29
  • The `Yrs` column was constructed by taking `year(date)` from the `lubridate` package, so the dates correspond to January - December. I would like to do a similar thing but from July of year t to June of year t+1. I think it will be easier if I create an "id_date" column using an `if` statement and then nest on these values. – user8959427 May 01 '19 at 12:34
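A sketch of the "id_date" idea from the comment above (illustration only; it builds a July-to-June period label with `if_else()` and nests on that instead of `Yrs`):

library(tidyverse)
library(lubridate)

# Sketch: label each row with the July-June period it falls in,
# then nest on that label instead of Yrs.
x %>%
  as_tibble() %>%
  mutate(date = as.Date(date),
         start_yr = if_else(month(date) >= 7, year(date), year(date) - 1),
         id_date  = paste(start_yr, start_yr + 1, sep = "_")) %>%
  group_by(id_date) %>%
  nest(.key = "data")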

2 Answers


I use `year(as.Date(date) %m+% months(6))` to move anything in July or later into the next year, then `paste(mod_year - 1, mod_year, sep = "_")` to create the label for that period. I use `group_by()` to rank the rows before nesting, which avoids looping over the nests.

library(tidyverse)
library(lubridate)

x %>%
  as_tibble() %>%
  mutate(mod_year = year(as.Date(date) %m+% months(6)),
         mod_year = paste(mod_year-1,mod_year,sep = "_")) %>%
  group_by(mod_year) %>%
  filter(percent_rank(var) >= .8) %>%
  nest(.key = "data")

# A tibble: 13 x 2
   mod_year  data            
   <chr>     <list>          
 1 2010_2011 <tibble [2 × 4]>
 2 2018_2019 <tibble [2 × 4]>
 3 2014_2015 <tibble [2 × 4]>
 4 2011_2012 <tibble [2 × 4]>
 5 2009_2010 <tibble [2 × 4]>
 6 2013_2014 <tibble [2 × 4]>
 7 2008_2009 <tibble [3 × 4]>
 8 2017_2018 <tibble [3 × 4]>
 9 2006_2007 <tibble [2 × 4]>
10 2012_2013 <tibble [2 × 4]>
11 2016_2017 <tibble [1 × 4]>
12 2007_2008 <tibble [2 × 4]>
13 2015_2016 <tibble [1 × 4]>

UPDATE

Here is a more detailed example showing how the dates are grouped:

data.frame(date = seq.Date(from = as.Date("2017-01-01"),by = "months",length.out = 24)) %>%
  as_tibble() %>%
  mutate(mod_year = year(as.Date(date) %m+% months(6)),
         mod_year = paste(mod_year-1,mod_year,sep = "_")) %>%
  as.data.frame()

         date  mod_year
1  2017-01-01 2016_2017
2  2017-02-01 2016_2017
3  2017-03-01 2016_2017
4  2017-04-01 2016_2017
5  2017-05-01 2016_2017
6  2017-06-01 2016_2017
7  2017-07-01 2017_2018
8  2017-08-01 2017_2018
9  2017-09-01 2017_2018
10 2017-10-01 2017_2018
11 2017-11-01 2017_2018
12 2017-12-01 2017_2018
13 2018-01-01 2017_2018
14 2018-02-01 2017_2018
15 2018-03-01 2017_2018
16 2018-04-01 2017_2018
17 2018-05-01 2017_2018
18 2018-06-01 2017_2018
19 2018-07-01 2018_2019
20 2018-08-01 2018_2019
21 2018-09-01 2018_2019
22 2018-10-01 2018_2019
23 2018-11-01 2018_2019
24 2018-12-01 2018_2019
Wil
  • Thanks! I think this is it! One question: does each nest contain one year of data, i.e. starting in July of year `t` and ending in June of `t+1`? I am looking at the data and it looks good, just wanted to double check. – user8959427 May 01 '19 at 12:57

Your question is slightly confusing, but here is my take on it:

library(tidyverse)
library(lubridate)

x %>%
  mutate(date = as_date(date)) %>%                      # change date column to Date format
  filter(date <= '2019-02-15' & date >= '2011-03-01') %>%  # choose range of dates
  top_n(n = length(x$var) * 0.20, wt = var) %>%         # choose the top 20% of 'var'
  arrange(desc(var), date)                              # arrange dataframe by descending 'var'

This gives the following output:

        id       Yrs       date       var
1  1048911 2010_2011 2011-03-01 1.0000000
2    92122 2013_2014 2014-02-27 1.0000000
3    50104 2013_2014 2014-02-27 1.0000000
4   797468 2014_2015 2015-02-13 1.0000000
5    63541 2015_2016 2016-12-20 1.0000000
6     8868 2017_2018 2018-02-07 1.0000000
7   922224 2017_2018 2018-02-15 1.0000000
8   835541 2017_2018 2018-02-22 1.0000000
9   109198 2018_2019 2019-02-15 1.0000000
10   42293 2016_2017 2017-07-24 0.9994953
11  945436 2012_2013 2013-05-30 0.9994835
12 1137774 2011_2012 2012-02-15 0.9988845
13 1133421 2014_2015 2015-02-26 0.9988380
14 1361658 2015_2016 2016-02-26 0.9981080
15  915389 2012_2013 2013-02-26 0.9960874
16 1633917 2016_2017 2017-03-16 0.9949562
17   14693 2011_2012 2012-10-31 0.9938080
18  882095 2017_2018 2018-03-19 0.9924851
19 1166691 2011_2012 2012-02-28 0.9919786
20  709804 2017_2018 2018-05-02 0.9913926
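
If the 20% cut should instead be taken within each group (for example per `Yrs`) rather than across the whole filtered data, one possible grouped variant is sketched below:

library(tidyverse)
library(lubridate)

# Sketch: take the top 20% of `var` within each Yrs group rather than across
# the whole data set; top_frac() works within the groups set by group_by().
x %>%
  mutate(date = as_date(date)) %>%
  group_by(Yrs) %>%
  top_frac(0.20, wt = var) %>%
  arrange(Yrs, desc(var))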
jfeuerman