6

I wonder how can we filter rows with the same values in columns using dplyr package? I tried doing something in opposite to what was asked and proposed in this question but nothing worked.

I used the approach with apply function but received the following error:

same_vals <- apply(mydata, 1, function(x) all(x == x[1]))

final <- mydata %>% 
  filter(same_vals)


Error: Can't subset elements that don't exist.
x Location 1 doesn't exist.
i There are only 0 elements. 

apply drives me crazy every time I try to use it. Neither it worked on my sample data:

set.seed(2022)
test <- tibble(id = floor(runif(10, min = 0, max = 111)),
       var1 = ceiling(runif(10, min = 5, max = 10)),
       var2 = c(6, 5, 4, 8, 12, 1223, 14, 1, 90, 1),
       var3 = c(6, 3, 4, 8, 11, 45, 56, 78, 0, 9))


# A tibble: 10 x 4
      id  var1  var2  var3
   <dbl> <dbl> <dbl> <dbl>
 1    90     6     6     6
 2    71     6     5     3
 3    13     6     4     4
 4    60     8     8     8
 5    20     9    12    11
 6    70     6  1223    45
 7     8     9    14    56
 8     4     8     1    78
 9    41     8    90     0
10    84    10     1     9

test1 <- apply(test, 1, function(x) all(x == x[1]))

test %>% 
  filter(test1)

# A tibble: 0 x 4
# ... with 4 variables: id <dbl>, var1 <dbl>, var2 <dbl>, var3 <dbl>

Desirable output

# A tibble: 10 x 4
      id  var1  var2  var3
   <dbl> <dbl> <dbl> <dbl>
 1    90     6     6     6
 4    60     8     8     8
Waldi
  • 39,242
  • 6
  • 30
  • 78
rg4s
  • 811
  • 5
  • 22

7 Answers7

3

You could compare min and max of each row :

vars <- test[,-1]
test[do.call(pmin, vars) == do.call(pmax, vars),]

# A tibble: 2 x 4
     id  var1  var2  var3
  <dbl> <dbl> <dbl> <dbl>
1    90     6     6     6
2    60     8     8     8
Waldi
  • 39,242
  • 6
  • 30
  • 78
3

Update: Again with the help of fabulous akrun: We can use if_all:

test %>% filter(if_all(var2:var3, ~ . == var1))
     id  var1  var2  var3
  <dbl> <dbl> <dbl> <dbl>
1    90     6     6     6
2    60     8     8     8

Updated benchmark: including Ronak's solution and akrun assisted tarjae if_all solution: enter image description here

First answer: This is a very good question! Whenever I need a bunch of time to solve a question I notice that the solution is not as obvious as I thought it should be. In this case I completely agree with the OP, that sometimes things can drive someone crazy.

I would have expected that if_all should work proper for this task as it was created for such kind of situation (in my knowledge, to use filter across multiple columns). But I was not possible for me to apply it.

Therefore I decided to write a function (notice I am not very experienced in programming) Comments are welcomed to improve myself:

# function to filter only rows with same values across some (not all) columns!

library(dplyr)
library(tidyr)

tarjae <- function(x) {
  x %>% 
    pivot_longer(-1) %>% 
    group_by(group = rep(row_number(), each=ncol(test[,-1]), length.out=n())) %>% 
    add_count(value) %>% 
    filter(ncol(test[,-1]) == n) %>%
    pivot_wider(names_from = name, values_from = value) %>%
    ungroup() %>% 
    select(-c(group, n))
}


tarjae(test)
 id  var1  var2  var3
  <int> <int> <int> <int>
1    90     6     6     6
2    60     8     8     8

Benchmark: Waldi is first, tarjae is last :-(

library(microbenchmark)
mbm = microbenchmark(
  tarjae = tarjae(test),
  Yuriy = test %>% 
    rowwise() %>% 
    filter(n_distinct(c_across(-id)) == 1),
  Mikko1 = test %>% filter(sapply(1:nrow(test), function(i) {all(diff(as.numeric(test[i,2:4])) == 0)})),
  Mikko2 = test %>% 
    rowwise() %>% 
    mutate(diff = sum(diff(var1:var3))) %>% 
    filter(diff == 0),
  Waldi = test[do.call(pmin, test[,-1]) == do.call(pmax, vars),],
  Claudiu1 = test %>% 
    rowwise() %>% 
    mutate(unique =  n_distinct(c_across(var1:var3))),
  Claudiu2 = test %>% 
    rowwise() %>% 
    mutate(unique =  n_distinct(c_across(var1:var3), na.rm = TRUE)),
  times=50
)
mbm

autoplot(mbm)
Unit: microseconds
     expr       min        lq        mean     median        uq       max neval   cld
   tarjae 15099.000 15422.300 16147.84108 15672.5005 16134.201 22529.301    50     e
    Yuriy  3995.102  4106.401  4228.88304  4173.9010  4331.300  4640.902    50   cd 
   Mikko1  1325.702  1369.702  1456.27518  1405.0010  1474.001  2071.201    50  b   
   Mikko2  4159.700  4256.300  4503.31502  4316.2010  4435.201 10515.101    50    d 
    Waldi    63.601    87.901    95.26902    92.9015    94.801   209.901    50 a    
 Claudiu1  3817.500  3877.801  4050.05700  3974.6515  4065.101  5529.501    50   c  
 Claudiu2  4003.502  4093.501  4252.04898  4170.2010  4272.501  6003.801    50   cd 

enter image description here

TarJae
  • 72,363
  • 6
  • 19
  • 66
2

Not quite native tidyverse, but does what you desire?

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union

set.seed(2022)
test <- tibble(id = floor(runif(10, min = 0, max = 111)),
               var1 = ceiling(runif(10, min = 5, max = 10)),
               var2 = c(6, 5, 4, 8, 12, 1223, 14, 1, 90, 1),
               var3 = c(6, 3, 4, 8, 11, 45, 56, 78, 0, 9))

test %>% filter(sapply(1:nrow(test), function(i) {all(diff(as.numeric(test[i,2:4])) == 0)}))
#> # A tibble: 2 × 4
#>      id  var1  var2  var3
#>   <dbl> <dbl> <dbl> <dbl>
#> 1    90     6     6     6
#> 2    60     8     8     8

Created on 2022-01-14 by the reprex package (v2.0.1)

EDIT:

Here is an entirely tidyverse way:

test %>% 
  rowwise() %>% 
  filter(all(diff(var1:var3) == 0))
Mikko
  • 7,530
  • 8
  • 55
  • 92
  • 3
    Yea, but `sum(diff(.))` isn't equivalent to all being duplicated values. You can have that sum to zero in arbitrary cases, e.g. sum(diff(c(6, 5, 6))), because both 0+0=0 and 1-1=0, 2-2=0 etc – Claudiu Papasteri Jan 14 '22 at 08:39
2
df <- data.frame(
          id = c(90, 71, 13, 60, 20, 70, 8, 4, 41, 84),
        var1 = c(6, 6, 6, 8, 9, 6, 9, 8, 8, 10),
        var2 = c(6, 5, 4, 8, 12, 1223, 14, 1, 90, 1),
        var3 = c(6, 3, 4, 8, 11, 45, 56, 78, 0, 9)
                  )

library(tidyverse)
df %>% 
  rowwise() %>% 
  filter(n_distinct(c_across(-id)) == 1) 
#> # A tibble: 2 x 4
#> # Rowwise: 
#>      id  var1  var2  var3
#>   <dbl> <dbl> <dbl> <dbl>
#> 1    90     6     6     6
#> 2    60     8     8     8

Created on 2022-01-14 by the reprex package (v2.0.1)

Yuriy Saraykin
  • 8,390
  • 1
  • 7
  • 14
2

You may select the rows whose rowwise standard deviation or variance is 0.

test[matrixStats::rowSds(as.matrix(test[-1])) == 0, ]

# A tibble: 2 x 4
#     id  var1  var2  var3
#  <dbl> <dbl> <dbl> <dbl>
#1    90     6     6     6
#2    60     8     8     8
Ronak Shah
  • 377,200
  • 20
  • 156
  • 213
1

This is follow-up to @Yuriy Saraykin's answer, whose solution I think is the most representative for the tidyverse way and uses the robust n_distinct function. His answer achieves the outcome you are looking for in virtually all circumstances, but I wanted to expand a bit on using it with missing data because NA is always a unique value.

library(dplyr)
library(tidyr)  

set.seed(2022)
test <- tibble(id = floor(runif(10, min = 0, max = 111)),
               var1 = c(6, 6, 6, 8, 9, NA, NA, 8, 8, NA),
               var2 = c(6, 5, 4, 8, 7, 12, NA, 1, 9, NA),
               var3 = c(6, 3, 4, 8, 7, 45, NA, 78, 0, 3))

# count number of distinct values by row across columns -- NAs are all counted as unique values
test %>% 
  rowwise() %>% 
  mutate(unique =  n_distinct(c_across(var1:var3)))
#> # A tibble: 10 x 5
#> # Rowwise: 
#>       id  var1  var2  var3 unique
#>    <dbl> <dbl> <dbl> <dbl>  <int>
#>  1    90     6     6     6      1
#>  2    71     6     5     3      3
#>  3    13     6     4     4      2
#>  4    60     8     8     8      1
#>  5    20     9     7     7      2
#>  6    70    NA    12    45      3
#>  7     8    NA    NA    NA      1
#>  8     4     8     1    78      3
#>  9    41     8     9     0      3
#> 10    84    NA    NA     3      2

# Filter out rows with same value on colums -- NAs are all counted as unique values
test %>% 
  rowwise() %>% 
  filter(n_distinct(c_across(var1:var3)) == 1) 
#> # A tibble: 3 x 4
#> # Rowwise: 
#>      id  var1  var2  var3
#>   <dbl> <dbl> <dbl> <dbl>
#> 1    90     6     6     6
#> 2    60     8     8     8
#> 3     8    NA    NA    NA


# count number of distinct values by row across columns -- NAs excluded from count
test %>% 
  rowwise() %>% 
  mutate(unique =  n_distinct(c_across(var1:var3), na.rm = TRUE))
#> # A tibble: 10 x 5
#> # Rowwise: 
#>       id  var1  var2  var3 unique
#>    <dbl> <dbl> <dbl> <dbl>  <int>
#>  1    90     6     6     6      1
#>  2    71     6     5     3      3
#>  3    13     6     4     4      2
#>  4    60     8     8     8      1
#>  5    20     9     7     7      2
#>  6    70    NA    12    45      2
#>  7     8    NA    NA    NA      0
#>  8     4     8     1    78      3
#>  9    41     8     9     0      3
#> 10    84    NA    NA     3      1

# Filter out rows with same value on colums -- NAs excluded from count
test %>% 
  rowwise() %>% 
  filter(n_distinct(c_across(var1:var3), na.rm = TRUE) == 1) 
#> # A tibble: 3 x 4
#> # Rowwise: 
#>      id  var1  var2  var3
#>   <dbl> <dbl> <dbl> <dbl>
#> 1    90     6     6     6
#> 2    60     8     8     8
#> 3    84    NA    NA     3


# count number of distinct values by row across columns -- NAs excluded altogether
test %>% 
  tidyr::drop_na(var1:var3) %>%
  rowwise() %>% 
  mutate(unique =  n_distinct(c_across(var1:var3), na.rm = TRUE))
#> # A tibble: 7 x 5
#> # Rowwise: 
#>      id  var1  var2  var3 unique
#>   <dbl> <dbl> <dbl> <dbl>  <int>
#> 1    90     6     6     6      1
#> 2    71     6     5     3      3
#> 3    13     6     4     4      2
#> 4    60     8     8     8      1
#> 5    20     9     7     7      2
#> 6     4     8     1    78      3
#> 7    41     8     9     0      3

# Filter out rows with same value on colums -- NAs excluded altogether
test %>% 
  tidyr::drop_na(var1:var3) %>%
  rowwise() %>% 
  filter(n_distinct(c_across(var1:var3), na.rm = TRUE) == 1)
#> # A tibble: 2 x 4
#> # Rowwise: 
#>      id  var1  var2  var3
#>   <dbl> <dbl> <dbl> <dbl>
#> 1    90     6     6     6
#> 2    60     8     8     8

Created on 2022-01-14 by the reprex package (v2.0.1)

Claudiu Papasteri
  • 2,469
  • 1
  • 17
  • 30
0

Also found for myself another solution which looks almost like Ronak's suggestion. data.table, a little bit of dplyr and matrixStats packages are needed:

setDT(test)

test[, sd_var := matrixStats::rowSds(as.matrix(.SD)),
     .SDcols = c("var1", "var2", "var3"), by = 1:nrow(test)] %>% 
  .[sd_var == 0, , ]

   id var1 var2 var3 sd_var
1: 90    6    6    6      0
2: 60    8    8    8      0
rg4s
  • 811
  • 5
  • 22