Keep the rows where the rowsum of at least one of the predefined subsets of the columns is greater than a threshold

Question

I have a dataframe like this:

df <- data.frame(
  sample1 = c(0, 1, 2, 0, 2, 1),
  sample2 = c(0.3, 3, 2, 0.4, 2, 3),
  sample3 = c(0.2, 1, 3, 0.1, 3, 3),
  sample4 = c(0.4, 2, 4, 0.3, 1, 1),
  sample5 = c(0.1, 2, 4, 0.2, 5, 3),
  sample6 = c(0.2, 3, 1, 0.1, 6, 3),
  sample7 = c(0.2, 1, 1, 0.4, 1, 1)
)

And the groups defined in this df:

groups <- data.frame(
  samples = c("sample1", "sample2", "sample3", "sample4", "sample5", "sample6", "sample7"),
  groups = c("group1", "group1", "group1", "group2", "group2", "group3", "group3")
)

Using R, I want to keep only the rows where the sum in at least one group is greater than 0.5, so that the resulting df would be:

  sample1 sample2 sample3 sample4 sample5 sample6 sample7
2       1       3       1       2       2       3       1
3       2       2       3       4       4       1       1
5       2       2       3       1       5       6       1
6       1       3       3       1       3       3       1

score 1 · Answer 1 · answered May 19 '23 at 21:05

library(dplyr)
library(purrr)
library(tibble)

map({groups %>% 
      group_split(groups, .keep = F) %>% 
      map(~pull(.x, samples))} , ~df %>% 
      rownames_to_column("id") %>% 
      mutate(, sum_grp = select(., all_of(.x)) %>% 
               rowSums) %>% 
      filter(sum_grp > 0.5)) %>% 
  bind_rows() %>% 
  arrange(id) %>% 
  select(-sum_grp, -id) %>% 
  unique()

#>    sample1 sample2 sample3 sample4 sample5 sample6 sample7
#> 1        1       3       1       2       2       3       1
#> 4        2       2       3       4       4       1       1
#> 7        2       2       3       1       5       6       1
#> 10       1       3       3       1       3       3       1

score 1 · Accepted Answer · answered May 19 '23 at 21:47

Here is tidyverse version: Note there is only on row (e.g. row 4) which fulfill the criteria):

library(dplyr)
library(tidyr)

df %>% 
  pivot_longer(everything()) %>% 
  left_join(groups, by = c("name"="samples")) %>% 
  mutate(row =as.integer(gl(n(),ncol(df),n()))) %>% 
  mutate(sum_group = sum(value), .by = c(row, groups)) %>% 
  group_by(row) %>% 
  filter(!all(sum_group <= 0.5)) %>% 
  select(name, value) %>% 
  pivot_wider(names_from = name, values_from = value) 


  row sample1 sample2 sample3 sample4 sample5 sample6 sample7
  <int>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
1     1       0     0.3     0.2     0.4     0.1     0.5     0.2
2     2       1     3       1       2       2       3       1  
3     3       2     2       3       4       4       1       1  
4     5       2     2       3       1       5       6       1  
5     6       1     3       3       1       3       3       1

Keep the rows where the rowsum of at least one of the predefined subsets of the columns is greater than a threshold

2 Answers2