4

I've got a dataset where there are some NA's but I can manually work out what the values should be as the df is a column for name and the rest of the columns are just numbers followed by a final column with total. Only one NA appears per row at most so I can work out what the value should be by using the total column and the sum of all the other columns. Just wondering what would be the best way to fill these NA's without having to hardcode one by one as the df I'm using is pretty big

example df:

df = structure(list(city = c("sydney", "new york", "london", "beijing", "paris", "madrid"), 
                    year = c(2005:2010), 
                    A = c(1, 4, 5 , NA, 2, 1), 
                    B = c(3, NA, 4 , 9, 0, 6),
                    C = c(3, 4 , 6, 1, 8, NA),
                    total = c(NA, 10, 15, 14, NA, 15)), 
               class = "data.frame", row.names = c(NA, -6L))

df
Anoushiravan R
  • 21,622
  • 3
  • 18
  • 41
P_S_13
  • 65
  • 3
  • Take a look at `fill` and `replace_na` from the tidyr package. – Maël Feb 09 '22 at 11:26
  • 1
    Alternatively the `zoo` package with the `na.approx` function will help too. You can do something like `df = df %>% mutate(A = na.approx(A))` to interpolate the values of A and so on for the other columns. – thehand0 Feb 09 '22 at 11:37

4 Answers4

2

You need to replace the NA in total column first and then you can simply calculate the rest. You can also make a function for the A, B, C columns so you don't repeat the code, but with only 3 columns that shouldn't be the problem.

df = structure(list(city = c("sydney", "new york", "london", "beijing", "paris", "madrid"), 
                    year = c(2005:2010), 
                    A = c(1, 4, 5 , NA, 2, 1), 
                    B = c(3, NA, 4 , 9, 0, 6),
                    C = c(3, 4 , 6, 1, 8, NA),
                    total = c(NA, 10, 15, 14, NA, 15)), 
               class = "data.frame", row.names = c(NA, -6L))

df
#>       city year  A  B  C total
#> 1   sydney 2005  1  3  3    NA
#> 2 new york 2006  4 NA  4    10
#> 3   london 2007  5  4  6    15
#> 4  beijing 2008 NA  9  1    14
#> 5    paris 2009  2  0  8    NA
#> 6   madrid 2010  1  6 NA    15

df$total <- ifelse(is.na(df$total), rowSums(df[, c("A", "B", "C")]), df$total)
df$A <- ifelse(is.na(df$A), df$total - rowSums(df[, c("A", "B", "C")], na.rm = TRUE), df$A)
df$B <- ifelse(is.na(df$B), df$total - rowSums(df[, c("A", "B", "C")], na.rm = TRUE), df$B)
df$C <- ifelse(is.na(df$C), df$total - rowSums(df[, c("A", "B", "C")], na.rm = TRUE), df$C)

df
#>       city year A B C total
#> 1   sydney 2005 1 3 3     7
#> 2 new york 2006 4 2 4    10
#> 3   london 2007 5 4 6    15
#> 4  beijing 2008 4 9 1    14
#> 5    paris 2009 2 0 8    10
#> 6   madrid 2010 1 6 8    15

Created on 2022-02-09 by the reprex package (v2.0.1)

UPDATE: After replacing NA in total column, you can use the na.aprox function from zoo package to interpolate the rest of values.

library(zoo)

df$total <- ifelse(is.na(df$total), rowSums(df[, c("A", "B", "C")]), df$total)   # first totals
df[, c("A", "B", "C")] <- na.approx(df[, c("A", "B", "C", "total")], rule = 2)   # then rest
df
      city year   A   B C total
1   sydney 2005 1.0 3.0 3     7
2 new york 2006 4.0 3.5 4    10
3   london 2007 5.0 4.0 6    15
4  beijing 2008 3.5 9.0 1    14
5    paris 2009 2.0 0.0 8    10
6   madrid 2010 1.0 6.0 8    15
Claudiu Papasteri
  • 2,469
  • 1
  • 17
  • 30
2

This solution can also help you.

library(purrr)
library(dplyr)

df %>%
  rowwise() %>%
  mutate(total = ifelse(is.na(total), sum(c_across(A:C)), total), 
         pmap_df(select(cur_data(), A:total), ~ {x <- c(...)[1:3]
         replace(x, is.na(x), c(...)[4] - sum(x, na.rm = TRUE))}))

# A tibble: 6 x 6
# Rowwise: 
  city      year     A     B     C total
  <chr>    <int> <dbl> <dbl> <dbl> <dbl>
1 sydney    2005     1     3     3     7
2 new york  2006     4     2     4    10
3 london    2007     5     4     6    15
4 beijing   2008     4     9     1    14
5 paris     2009     2     0     8    10
6 madrid    2010     1     6     8    15

A bit hardcoded but can be modified in that sense.

Anoushiravan R
  • 21,622
  • 3
  • 18
  • 41
2

data.table solution

library(data.table)
setDT(df)

cols <- c("A", "B", "C")

df[, (cols) := lapply(.SD, function(x) {
  ifelse(is.na(x), total - rowSums(.SD, na.rm = T), x)
}), .SDcols = cols][is.na(total), total := rowSums(.SD), .SDcols = cols]

df
#        city year A B C total
# 1:   sydney 2005 1 3 3     7
# 2: new york 2006 4 2 4    10
# 3:   london 2007 5 4 6    15
# 4:  beijing 2008 4 9 1    14
# 5:    paris 2009 2 0 8    10
# 6:   madrid 2010 1 6 8    15

data

df = structure(list(
  city = c("sydney", "new york", "london", "beijing", "paris", "madrid"), 
  year = c(2005:2010), 
  A = c(1, 4, 5 , NA, 2, 1), 
  B = c(3, NA, 4 , 9, 0, 6),
  C = c(3, 4 , 6, 1, 8, NA),
  total = c(NA, 10, 15, 14, NA, 15)), 
  class = "data.frame", row.names = c(NA, -6L)
)
Merijn van Tilborg
  • 5,452
  • 1
  • 7
  • 22
1

Here is a base R solution with apply.

df = structure(list(city = c("sydney", "new york", "london", "beijing", "paris", "madrid"), 
                    year = c(2005:2010), 
                    A = c(1, 4, 5 , NA, 2, 1), 
                    B = c(3, NA, 4 , 9, 0, 6),
                    C = c(3, 4 , 6, 1, 8, NA),
                    total = c(NA, 10, 15, 14, NA, 15)), 
               class = "data.frame", row.names = c(NA, -6L))

df[-(1:2)] <- t(apply(df[-(1:2)], 1, \(x) {
  if(is.na(x[4])) {
    x[4] <- sum(x[-4])
  } else if(anyNA(x[-4])) {
      x[-4][is.na(x[-4])] <- x[4] - sum(x[-4][!is.na(x[-4])])
  }
  x
}))
df
#>       city year A B C total
#> 1   sydney 2005 1 3 3     7
#> 2 new york 2006 4 2 4    10
#> 3   london 2007 5 4 6    15
#> 4  beijing 2008 4 9 1    14
#> 5    paris 2009 2 0 8    10
#> 6   madrid 2010 1 6 8    15

Created on 2022-02-09 by the reprex package (v2.0.1)

Rui Barradas
  • 70,273
  • 8
  • 34
  • 66