One approach is to convert your data frame to long format with tidyr::gather() and then compute the share of missing values in each column after grouping.
Supposing a data frame (smaller than yours for illustrative purposes):
library(tidyverse)
df <- tibble(
  # 1,000 columns x 640 lines of values in 0:100, with NAs sprinkled in
  column = rep(paste0("col_", str_pad(1:1000, 4, pad = "0")), each = 640),
  value = sample(c(0:100, NA_integer_), replace = TRUE, 6.4E5),
  line = rep(1:640, 1E3)
) %>%
  spread(column, value)
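That leaves a wide table with one row per line and one column per simulated variable. A quick sanity check of the shape (the NA positions will vary between runs because sample() is not seeded, but the dimensions won't):
dim(df)
# [1]  640 1001    # 640 lines; 1,000 value columns plus line itself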
Convert to long with tidyr::gather(), then group_by() and summarize() to get the share of missing values in each column:
df %>%
  gather(col, value, -line) %>%
  group_by(col) %>%
  summarize(missing_share = mean(is.na(value)))
# A tibble: 1,000 x 2
   col      missing_share
   <chr>            <dbl>
 1 col_0001       0.0109
 2 col_0002       0.0141
 3 col_0003       0.0125
 4 col_0004       0.00938
 5 col_0005       0.0141
 6 col_0006       0.00625
 7 col_0007       0.00312
 8 col_0008       0.00781
 9 col_0009       0.00781
10 col_0010       0.00781
# ... with 990 more rows
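Note that gather() and spread() are superseded in tidyr 1.0+. If you are on a recent version, the same per-column summary can be written with pivot_longer(); a sketch:
df %>%
  pivot_longer(-line, names_to = "col", values_to = "value") %>%
  group_by(col) %>%
  summarize(missing_share = mean(is.na(value)))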
Or skip the grouping step to get overall missingness:
df_NA_overall <- df %>%
  gather(col, value, -line) %>%
  summarize(missing_share = mean(is.na(value)))
# A tibble: 1 x 1
  missing_share
          <dbl>
1       0.00989
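If you only need the overall share, reshaping isn't strictly necessary; assuming line is the only non-value column, one way to get it directly is:
df %>%
  select(-line) %>%   # drop the id column
  is.na() %>%         # logical matrix of missingness
  mean()              # proportion of TRUEs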
The first part can be done even faster with purrr::map, which applies the function to each column directly:
map(df, ~mean(is.na(.)))
$line
[1] 0
$col_0001
[1] 0.0109375
$col_0002
[1] 0.0140625
$col_0003
[1] 0.0125
$col_0004
[1] 0.009375
$col_0005
[1] 0.0140625
$col_0006
[1] 0.00625
$col_0007
[1] 0.003125
$col_0008
[1] 0.0078125
$col_0009
[1] 0.0078125
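If a named numeric vector is more convenient than a list (e.g. for sorting or converting to a data frame), purrr::map_dbl() is a drop-in replacement here:
map_dbl(df, ~ mean(is.na(.x))) %>%
  sort(decreasing = TRUE) %>%
  head()   # columns with the highest share of NAs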