The fastest (and, in my opinion, simplest) way is plain logical subsetting. I'll also show the `ifelse` way of
doing it, in both base R and data.table (which can be more memory efficient).
library(microbenchmark)
library(data.table)
# Overwrite a2 with a1 on every row where a1 is observed, via a logical row
# mask. `mat` must have columns named 'a1' and 'a2'. Rows whose a1 is NA keep
# their existing a2. Returns the updated object.
subset_method <- function(mat) {
  has_a1 <- !is.na(mat[, 'a1'])
  mat[has_a1, 'a2'] <- mat[has_a1, 'a1']
  mat
}
# Same replacement as the subsetting version, expressed with a vectorised
# ifelse(): each a2 becomes a1 unless a1 is NA, in which case a2 is kept.
# `mat` must have columns named 'a1' and 'a2'. Returns the updated object.
ifelse_method <- function(mat) {
  a1 <- mat[, 'a1']
  a2 <- mat[, 'a2']
  mat[, 'a2'] <- ifelse(is.na(a1), a2, a1)
  mat
}
# data.table version of the subsetting approach: on rows where a1 is not NA,
# assign a1 into a2. Works on a copy() because := modifies by reference, so
# the caller's table is left untouched.
dt_sub_method <- function(dt) {
  out <- copy(dt)
  out[!is.na(a1), a2 := a1]
  # Trailing [] makes the returned table print normally after a := update.
  out[]
}
# data.table version of the ifelse() approach: rewrite the whole a2 column,
# taking a1 where it is present and the old a2 otherwise. Works on a copy()
# because := modifies by reference.
dt_ie_method <- function(dt) {
  out <- copy(dt)
  out[, a2 := ifelse(is.na(a1), a2, a1)]
  # Trailing [] makes the returned table print normally after a := update.
  out[]
}
# Reproducible example data: an n x 2 matrix of Poisson(3) counts with columns
# a1 and a2, where a random quarter of the a1 entries are set to NA.
set.seed(1234)
n <- 1e4
mat <- matrix(rpois(2 * n, 3), ncol = 2,
              dimnames = list(NULL, c('a1', 'a2')))
inds <- sample(seq_len(n), floor(.25 * n))
mat[inds, 'a1'] <- NA
# Same data as a data.table, used as input for the data.table variants.
dt <- data.table(mat)
# Preview the first ten rows of the input before any replacement.
head(mat, 10)
#> a1 a2
#> [1,] 1 2
#> [2,] 3 3
#> [3,] 3 3
#> [4,] 3 6
#> [5,] 5 1
#> [6,] 3 2
#> [7,] 0 2
#> [8,] 2 1
#> [9,] 4 2
#> [10,] 3 3
# Sanity check: apply the subsetting version and show the first ten rows of
# the result (a2 should now equal a1 wherever a1 is not NA).
ret <- subset_method(mat)
head(ret, 10)
#> a1 a2
#> [1,] 1 1
#> [2,] 3 3
#> [3,] 3 3
#> [4,] 3 3
#> [5,] 5 5
#> [6,] 3 3
#> [7,] 0 0
#> [8,] 2 2
#> [9,] 4 4
#> [10,] 3 3
# Time all four implementations on the same inputs. Note that the data.table
# variants include the cost of copy() inside each call.
times <- microbenchmark(subset_method(mat), ifelse_method(mat),
dt_sub_method(dt), dt_ie_method(dt))
times
#> Unit: microseconds
#> expr min lq mean median uq max
#> subset_method(mat) 157.401 169.1010 249.3690 174.9015 279.3505 3402.801
#> ifelse_method(mat) 237.500 247.2510 361.2280 260.7010 446.9510 3219.001
#> dt_sub_method(dt) 656.601 785.0015 927.3231 840.8010 901.7510 6646.801
#> dt_ie_method(dt) 493.701 546.2515 698.5841 577.6515 725.6015 3532.701
#> neval
#> 100
#> 100
#> 100
#> 100
Created on 2019-09-17 by the reprex package (v0.3.0)