keep rows according to several conditions in R

Question

Given the following data.frame :

# Console call whose printed output follows. The structure(...) expression
# below is not assigned to anything; to recreate the data, assign it to t2
# (i.e. t2 <- structure(...)).
dput(t2)
structure(list(rs. = c("S1A_494392059", "S1A_494392059", "S1A_497201550", 
"S1A_497201550", "S1A_499864157", "S1A_499864157", "S1B_566171302", 
"S1B_566171302", "S1B_642616640", "S1B_642616640", "S2B_24883552", 
"S2B_24883552", "S2B_75832544", "S2B_75832544", "S2B_784544719", 
"S2B_784544719", "S4B_644330895", "S4B_644330895", "S5A_548234618", 
"S5A_548234618", "S5B_24292046", "S5B_24292046", "S5B_47584429", 
"S5B_47584429", "S5B_513712393", "S5B_513712393", "S5D_550192169", 
"S5D_550192169", "S6B_17686703", "S6B_17686703", "S6B_459374225", 
"S6B_459374225", "S7A_12011058", "S7A_12011058", "S7A_7938818", 
"S7A_7938818", "S7B_124548883", "S7B_124548883", "S7B_576927863", 
"S7B_576927863", "S7B_605313385", "S7B_605313385", "S7B_733461150", 
"S7B_733461150"), marker = c("0", "2", "0", "2", "0", "2", "0", 
"2", "0", "2", "0", "2", "0", "2", "0", "2", "0", "2", "0", "2", 
"0", "2", "0", "2", "0", "2", "0", "2", "0", "2", "0", "2", "0", 
"2", "0", "2", "0", "2", "0", "2", "0", "2", "0", "2"), n = c(653L, 
1463L, 943L, 1110L, 960L, 1100L, 708L, 1335L, 148L, 1060L, 208L, 
1938L, 785L, 1254L, 402L, 1695L, 722L, 1326L, 872L, 1176L, 694L, 
1381L, 619L, 1432L, 581L, 1462L, 383L, 1707L, 235L, 1894L, 458L, 
1636L, 794L, 1281L, 589L, 1484L, 163L, 1979L, 740L, 920L, 868L, 
1215L, 573L, 1521L), prop = c(0.298992673992674, 0.669871794871795, 
0.431776556776557, 0.508241758241758, 0.43956043956044, 0.503663003663004, 
0.324175824175824, 0.611263736263736, 0.0677655677655678, 0.485347985347985, 
0.0952380952380952, 0.887362637362637, 0.359432234432234, 0.574175824175824, 
0.184065934065934, 0.776098901098901, 0.330586080586081, 0.607142857142857, 
0.399267399267399, 0.538461538461538, 0.317765567765568, 0.632326007326007, 
0.283424908424908, 0.655677655677656, 0.266025641025641, 0.669413919413919, 
0.1753663003663, 0.781593406593407, 0.107600732600733, 0.867216117216117, 
0.20970695970696, 0.749084249084249, 0.363553113553114, 0.586538461538462, 
0.269688644688645, 0.67948717948718, 0.0746336996336996, 0.906135531135531, 
0.338827838827839, 0.421245421245421, 0.397435897435897, 0.556318681318681, 
0.262362637362637, 0.696428571428571), BASE = c("C", "C", "C", 
"C", "T", "T", "A", "A", "G", "G", "A", "A", "G", "G", "A", "A", 
"G", "G", "A", "A", "A", "A", "C", "C", "A", "A", "T", "T", "G", 
"G", "C", "C", "A", "A", "G", "G", "A", "A", "T", "T", "A", "A", 
"T", "T"), alleles = c("C/G", "C/G", "C/T", "C/T", "C/T", "C/T", 
"G/A", "G/A", "A/G", "A/G", "A/G", "A/G", "A/G", "A/G", "G/A", 
"G/A", "G/C", "G/C", "A/G", "A/G", "C/A", "C/A", "T/C", "T/C", 
"A/G", "A/G", "T/C", "T/C", "G/A", "G/A", "C/T", "C/T", "G/A", 
"G/A", "G/C", "G/C", "G/A", "G/A", "C/T", "C/T", "A/G", "A/G", 
"T/C", "T/C")), row.names = c(NA, -44L), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"), vars = "rs.", drop = TRUE, indices = list(
0:1, 2:3, 4:5, 6:7, 8:9, 10:11, 12:13, 14:15, 16:17, 18:19, 
20:21, 22:23, 24:25, 26:27, 28:29, 30:31, 32:33, 34:35, 36:37, 
38:39, 40:41, 42:43), group_sizes = c(2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L), biggest_group_size = 2L, labels = structure(list(rs. = 
c("S1A_494392059", 
"S1A_497201550", "S1A_499864157", "S1B_566171302", "S1B_642616640", 
"S2B_24883552", "S2B_75832544", "S2B_784544719", "S4B_644330895", 
"S5A_548234618", "S5B_24292046", "S5B_47584429", "S5B_513712393", 
"S5D_550192169", "S6B_17686703", "S6B_459374225", "S7A_12011058", 
"S7A_7938818", "S7B_124548883", "S7B_576927863", "S7B_605313385", 
"S7B_733461150")), row.names = c(NA, -22L), class = "data.frame", vars = 
"rs.", drop = TRUE))

The data frame, named t2, looks like this:

# A tibble: 6 x 6
# Groups:   rs. [3]
   rs.           marker     n  prop BASE  alleles
  <chr>         <chr>  <int> <dbl> <chr> <chr>  
  1 S1A_494392059 0        653 0.299 C     C/G    
  2 S1A_494392059 2       1463 0.670 C     C/G    
  3 S1A_497201550 0        943 0.432 C     C/T    
  4 S1A_497201550 2       1110 0.508 C     C/T    
  5 S1A_499864157 0        960 0.440 T     C/T    
  6 S1A_499864157 2       1100 0.504 T     C/T

I would like a neat way to keep only the rows that satisfy the following conditions:

If t2$BASE is equal to the first character of t2$alleles, keep the row where t2$marker is equal to 2; otherwise, if t2$BASE is equal to the third character of t2$alleles, keep the row where t2$marker is equal to 0. Because the conditions are applied row-wise and each rs. has one row per marker, the desired data frame should have half as many rows as the initial one.

arg0naut91 · Accepted Answer · 2019-02-25T23:50:07.190

1

Could try:

library(dplyr)

# Keep one marker row per rs. depending on which allele BASE matches:
#   * BASE is the first allele  (1st character of e.g. "C/G") -> keep marker 2
#   * BASE is the second allele (3rd character of e.g. "C/G") -> keep marker 0
# marker is a character column; comparing it with a number relies on R
# coercing the number to a string ("2" == 2 is TRUE).
t2 %>%
  group_by(rs.) %>%
  filter(
    # Parentheses only make the grouping explicit: & already binds tighter
    # than |, so the two alternatives are evaluated as written here.
    (marker == 2 & substr(alleles, 1, 1) == BASE) |
      (marker == 0 & substr(alleles, 3, 3) == BASE)
  )

edited Feb 25 '19 at 23:50

answered Feb 25 '19 at 23:38

arg0naut91

14,574
2
17
38

thanks. could you explain what the `(...) { ... }` in `if` and `else if` represent ? I am learning R – moth Feb 25 '19 at 23:44
1

They represent a condition - if it evaluates to `TRUE`, then whatever follows is executed, otherwise it goes on to `else` or `else if` .. On the other hand, in your case this is actually not that relevant - please see my edit and first approach, that would be a more genuine `dplyr` approach – arg0naut91 Feb 25 '19 at 23:46
the syntax of first solution is nice and did not throw any warning messages whereas the second solution gave me some warning message. But both solve the problem, thanks – moth Feb 25 '19 at 23:47
1

Exactly, you should use the first one. Glad it helped. – arg0naut91 Feb 25 '19 at 23:49

keep rows according to several conditions in R

1 Answers1