0

I have a dataset, which looks like

df <- data.frame(A=rnorm(10), B=rnorm(10), C=rnorm(10), D=rnorm(10), E=rnorm(10))

I need to find the four lowest entries for every row and then build the average across the four.

I tried

df %>% rowwise() %>% mutate(Min = min(A, B, C, D, E))

but I struggle to find a way for the second lowest entry, third lowest entry etc.

As I have 36 column entries and always need the work with the 4 lowest row entries, I tried to work with percentiles(4/36=1/9..) but that seems way to complicated to do.

Is there any easier way I am overlooking?

user438383
  • 5,716
  • 8
  • 28
  • 43
dbraeuni
  • 11
  • 2

5 Answers5

2

This sort of thing tends to be easier if you get the data in long form first.

library(tidyr)
library(dplyr) 
df %>% 
    pivot_longer(everything()) %>% 
    group_by(name) %>% 
    top_n(-4) %>%
    group_by(name) %>%
    summarise(mean_val = mean(value))
# A tibble: 5 x 2
  name  mean_val
  <chr>    <dbl>
1 A       -0.620
2 B       -1.87 
3 C       -1.91 
4 D       -1.17 
5 E       -0.36
user438383
  • 5,716
  • 8
  • 28
  • 43
2

You can use c_across, sort the values, select first 4 of them and get average of them.

library(dplyr)

df <- df %>% 
  rowwise() %>% 
  mutate(Average_4 = mean(head(sort(c_across()), 4)))

df

#       A      B      C       D       E   Average_4
#     <dbl>  <dbl>  <dbl>   <dbl>   <dbl>     <dbl>
# 1 -0.560   1.22  -1.07   0.426  -0.695    -0.474 
# 2 -0.230   0.360 -0.218 -0.295  -0.208    -0.238 
# 3  1.56    0.401 -1.03   0.895  -1.27     -0.249 
# 4  0.0705  0.111 -0.729  0.878   2.17      0.0826
# 5  0.129  -0.556 -0.625  0.822   1.21     -0.0575
# 6  1.72    1.79  -1.69   0.689  -1.12     -0.102 
# 7  0.461   0.498  0.838  0.554  -0.403     0.277 
# 8 -1.27   -1.97   0.153 -0.0619 -0.467    -0.940 
# 9 -0.687   0.701 -1.14  -0.306   0.780    -0.357 
#10 -0.446  -0.473  1.25  -0.380  -0.0834   -0.346 

Or in base R -

df$Average_4 <- apply(df, 1, function(x) mean(head(sort(x), 4)))

data

set.seed(123)
df <- data.frame(A=rnorm(10), B=rnorm(10), C=rnorm(10), D=rnorm(10), E=rnorm(10))
Ronak Shah
  • 377,200
  • 20
  • 156
  • 213
2

With Base R,

df$New <-  colMeans(apply(df,1,sort)[1:4,])
df

gives,

             A          B          C           D           E         New
1  -0.56047565  1.2240818 -1.0678237  0.42646422 -0.69470698 -0.47413553
2  -0.23017749  0.3598138 -0.2179749 -0.29507148 -0.20791728 -0.23778529
3   1.55870831  0.4007715 -1.0260044  0.89512566 -1.26539635 -0.24887592
4   0.07050839  0.1106827 -0.7288912  0.87813349  2.16895597  0.08260834
5   0.12928774 -0.5558411 -0.6250393  0.82158108  1.20796200 -0.05750290
6   1.71506499  1.7869131 -1.6866933  0.68864025 -1.12310858 -0.10152416
7   0.46091621  0.4978505  0.8377870  0.55391765 -0.40288484  0.27744988
8  -1.26506123 -1.9666172  0.1533731 -0.06191171 -0.46665535 -0.94006136
9  -0.68685285  0.7013559 -1.1381369 -0.30596266  0.77996512 -0.35739914
10 -0.44566197 -0.4727914  1.2538149 -0.38047100 -0.08336907 -0.34557336

Data:

set.seed(123)
df <- data.frame(A=rnorm(10), B=rnorm(10), C=rnorm(10), D=rnorm(10), E=rnorm(10))
maydin
  • 3,715
  • 3
  • 10
  • 27
  • Then to close the issue, you may want to have a look at this : https://stackoverflow.com/help/someone-answers – maydin Jul 12 '21 at 08:18
0

Use sort function and subset the second lowest

df %>% rowwise() %>% mutate(Min = sort(c(A, B, C, D, E), FALSE)[1],
                            Min2 = sort(c(A, B, C, D, E), FALSE)[2])

# A tibble: 10 x 7
# Rowwise: 
        A      B      C      D      E    Min   Min2
    <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
 1  0.252 -1.69  -0.612 -1.54   1.50  -1.69  -1.54 
 2  0.642  1.25  -0.372 -1.32  -0.289 -1.32  -0.372
 3 -0.347 -1.77  -0.515 -0.676  1.92  -1.77  -0.676
 4 -0.515  0.223 -0.342 -0.367  0.519 -0.515 -0.367
 5  0.398 -0.561  0.684 -0.514  0.477 -0.561 -0.514
 6  0.954 -0.508  0.798 -0.959  0.303 -0.959 -0.508
 7  0.260 -0.939 -0.800 -0.122  0.821 -0.939 -0.800
 8 -2.05   1.64   0.230  0.666  0.393 -2.05   0.230
 9  0.288 -0.653  1.69   1.24  -1.44  -1.44  -0.653
10  0.362  0.839 -0.768  1.18  -0.869 -0.869 -0.768

If you want to find the average of four lowest values:

df %>% rowwise() %>% mutate(Average_Min = mean(sort(c(A, B, C, D, E), FALSE)[1:4]))

# A tibble: 10 x 6
# Rowwise: 
         A       B      C      D       E Average_Min
     <dbl>   <dbl>  <dbl>  <dbl>   <dbl>       <dbl>
 1 -2.99    0.279   0.999 -0.131  0.128      -0.678 
 2 -0.0505 -0.0756 -1.09  -0.417 -0.225      -0.452 
 3  1.30    0.123   0.854  0.652  0.670       0.575 
 4 -1.35   -1.03   -0.627 -1.65   0.538      -1.16  
 5 -0.950   0.0897  0.421 -0.677 -0.0553     -0.398 
 6  1.69    1.32   -0.396 -1.31   0.502       0.0307
 7  0.244  -0.308  -0.390 -0.405 -0.640      -0.436 
 8 -1.39    1.48    0.384  1.36   1.80        0.458 
 9  0.887  -0.470   1.66  -0.661  0.999       0.189 
10  0.0282 -0.866  -1.13  -0.915  0.878      -0.720
Mohanasundaram
  • 2,889
  • 1
  • 8
  • 18
0

We may use pmap to loop over the rows, sort the vector (c(...)), get the first 'n' values with head and return with the mean to create the new column 'Average_4' in mutate

library(purrr)
library(dplyr)
df %>% 
   mutate(Average_4 = pmap_dbl(cur_data(), ~ mean(head(sort(c(...)), 4))))
             A          B          C           D           E   Average_4
1  -0.56047565  1.2240818 -1.0678237  0.42646422 -0.69470698 -0.47413553
2  -0.23017749  0.3598138 -0.2179749 -0.29507148 -0.20791728 -0.23778529
3   1.55870831  0.4007715 -1.0260044  0.89512566 -1.26539635 -0.24887592
4   0.07050839  0.1106827 -0.7288912  0.87813349  2.16895597  0.08260834
5   0.12928774 -0.5558411 -0.6250393  0.82158108  1.20796200 -0.05750290
6   1.71506499  1.7869131 -1.6866933  0.68864025 -1.12310858 -0.10152416
7   0.46091621  0.4978505  0.8377870  0.55391765 -0.40288484  0.27744988
8  -1.26506123 -1.9666172  0.1533731 -0.06191171 -0.46665535 -0.94006136
9  -0.68685285  0.7013559 -1.1381369 -0.30596266  0.77996512 -0.35739914
10 -0.44566197 -0.4727914  1.2538149 -0.38047100 -0.08336907 -0.34557336

Or this can be done in a faster way with fmean/dapply from collapse

library(collapse)
df$Average_4 <-  dapply(df, MARGIN = 1, FUN = \(x) fmean(ss(sort(x), 1:4)))
df$Average_4
#[1] -0.47413553 -0.23778529 -0.24887592  0.08260834 -0.05750290 -0.10152416  0.27744988 -0.94006136 -0.35739914 -0.34557336

data

set.seed(123)
df <- data.frame(A=rnorm(10), B=rnorm(10), C=rnorm(10), D=rnorm(10), E=rnorm(10))
akrun
  • 874,273
  • 37
  • 540
  • 662