4

I have a tibble with information about diagnoses:

data <- tibble(
  id = c(1:10),
  diagnosis_1 = c("F32", "F431", "R58", "S32", "F11", NA, NA, "Y67", "F32", "Z032"),
  diagnosis_2 = c(NA, NA, NA, NA, NA, NA, "G35", NA, NA, NA),
  diagnosis_3 = c("F40", NA, "R67", "F431", NA, "F60", "S58", "R68", "F11", NA),
  diagnosis_4 = c(NA, NA, "F65", NA, "F19", NA, NA, "F32", NA, NA)
)

As a part of the cleaning process, I have removed all diagnoses not fulfilling certain criteria (i.e. not starting with the letter F, G, or Z). With the following code:

data$diagnosis_1[str_sub(data$diagnosis_1, 1,1) %in% c("R", "S", "Y")] <- NA
data$diagnosis_2[str_sub(data$diagnosis_2, 1,1) %in% c("R", "S", "Y")] <- NA
data$diagnosis_3[str_sub(data$diagnosis_3, 1,1) %in% c("R", "S", "Y")] <- NA
data$diagnosis_4[str_sub(data$diagnosis_4, 1,1) %in% c("R", "S", "Y")] <- NA

Ending up with this tibble:

enter image description here

I now need to move the data to the left to fill the columns from left to right (i.e diagnosis_1 not being empty if diagnosis_2, diagnosis_3 or diagnosis_4 has data). I have tried using ifelse() as it is vectorized but I can`t seem to get it to work with several nested ifelse().

ifelse(is.na(data$diagnosis_1), data$diagnosis_2, data$diagnosis_1))

All suggestions are much appreciated.

Edit: adding expected output:

enter image description here

MartinM
  • 65
  • 6

6 Answers6

2

You can use Reduce along with coalesce from dplyr, i.e.

df$diagnosis_1 <- Reduce(dplyr::coalesce, df[-1])

#id diagnosis_1 diagnosis_2 diagnosis_3 diagnosis_4
#   <int> <chr>       <chr>       <chr>       <chr>      
# 1     1 F32         <NA>        F40         <NA>       
# 2     2 F431        <NA>        <NA>        <NA>       
# 3     3 F65         <NA>        <NA>        F65        
# 4     4 F431        <NA>        F431        <NA>       
# 5     5 F11         <NA>        <NA>        F19        
# 6     6 F60         <NA>        F60         <NA>       
# 7     7 G35         G35         <NA>        <NA>       
# 8     8 F32         <NA>        <NA>        F32        
# 9     9 F32         <NA>        F11         <NA>       
#10    10 Z032        <NA>        <NA>        <NA> 
Sotos
  • 51,121
  • 6
  • 32
  • 66
2

We first replace values which start with either "R", "S" or "Y" to NA and then left shift the non-NA values.

data[-1] <- lapply(data[-1], function(x) replace(x, grepl("^[R|S|Y]", x), NA))  
data[] <- t(apply(data, 1, function(x) `length<-`(na.omit(x), length(x))))

data
# A tibble: 10 x 5
#     id diagnosis_1 diagnosis_2 diagnosis_3 diagnosis_4
#   <chr> <chr>       <chr>       <chr>       <chr>      
# 1 " 1"  F32         F40         NA          NA         
# 2 " 2"  F431        NA          NA          NA         
# 3 " 3"  F65         NA          NA          NA         
# 4 " 4"  F431        NA          NA          NA         
# 5 " 5"  F11         F19         NA          NA         
# 6 " 6"  F60         NA          NA          NA         
# 7 " 7"  G35         NA          NA          NA         
# 8 " 8"  F32         NA          NA          NA         
# 9 " 9"  F32         F11         NA          NA         
#10  10   Z032        NA          NA          NA    

Shifting the non-NA value to left has been taken from David's answer from here. You can try any other approach to shift values from the same question as well.

Ronak Shah
  • 377,200
  • 20
  • 156
  • 213
2

You can try a tidyverse

library(tidyverse)
data %>% 
  mutate_at(vars(starts_with("diagnosis")), funs(ifelse(str_sub(., 1, 1) %in% c("R", "S", "Y"), NA, .))) %>% 
  gather(k,v, -id) %>% 
  group_by(id) %>% 
  arrange(id) %>% 
  mutate(v=ifelse(k == "diagnosis_1", v[!is.na(v)][1], v)) %>% 
  spread(k, v)
# A tibble: 10 x 5
# Groups:   id [10]
      id diagnosis_1 diagnosis_2 diagnosis_3 diagnosis_4
   <int> <chr>       <chr>       <chr>       <chr>      
 1     1 F32         NA          F40         NA         
 2     2 F431        NA          NA          NA         
 3     3 F65         NA          NA          F65        
 4     4 F431        NA          F431        NA         
 5     5 F11         NA          NA          F19        
 6     6 F60         NA          F60         NA         
 7     7 G35         G35         NA          NA         
 8     8 F32         NA          NA          F32        
 9     9 F32         NA          F11         NA         
10    10 Z032        NA          NA          NA 

As its unclear what OP wants (see discussion below) you can also try

data %>% 
  mutate_at(vars(starts_with("diagnosis")), funs(ifelse(str_sub(., 1, 1) %in% c("R", "S", "Y"), NA, .))) %>% 
  gather(k,v, -id) %>% 
  group_by(id) %>% 
  arrange(id) %>% 
  mutate(v=c(v[!is.na(v)], rep(NA, length(v) - length(v[!is.na(v)])))) %>% 
  spread(k, v)
# A tibble: 10 x 5
# Groups:   id [10]
      id diagnosis_1 diagnosis_2 diagnosis_3 diagnosis_4
   <int> <chr>       <chr>       <chr>       <chr>      
 1     1 F32         F40         NA          NA         
 2     2 F431        NA          NA          NA         
 3     3 F65         NA          NA          NA         
 4     4 F431        NA          NA          NA         
 5     5 F11         F19         NA          NA         
 6     6 F60         NA          NA          NA         
 7     7 G35         NA          NA          NA         
 8     8 F32         NA          NA          NA         
 9     9 F32         F11         NA          NA         
10    10 Z032        NA          NA          NA
Roman
  • 17,008
  • 3
  • 36
  • 49
2

Using dplyr and tidyr. Reshape from wide to long, exclude "^RSY" and NA diagnosis, reshape long to wide.

library(dplyr)
library(tidyr)

gather(data, key = "k", value = "v", -id) %>% 
  filter(!(grepl("^[R|S|Y]", v) | is.na(v))) %>% 
  group_by(id) %>% 
  mutate(diagN = paste0("diagnosis_", row_number())) %>% 
  select(-k) %>% 
  spread(key = "diagN", value = "v") %>% 
  ungroup()

# # A tibble: 10 x 3
#       id diagnosis_1 diagnosis_2
#    <int> <chr>       <chr>      
#  1     1 F32         F40        
#  2     2 F431        NA         
#  3     3 F65         NA         
#  4     4 F431        NA         
#  5     5 F11         F19        
#  6     6 F60         NA         
#  7     7 G35         NA         
#  8     8 F32         NA         
#  9     9 F32         F11        
# 10    10 Z032        NA  
zx8754
  • 52,746
  • 12
  • 114
  • 209
0

Below solution using function na_move from package dedupewider.

library(dedupewider)

na_move(data) # 'right' direction is by default

#> # A tibble: 10 x 5
#>    id    diagnosis_1 diagnosis_2 diagnosis_3 diagnosis_4
#>  * <chr> <chr>       <chr>       <lgl>       <lgl>      
#>  1 1     F32         F40         NA          NA         
#>  2 2     F431        <NA>        NA          NA         
#>  3 3     F65         <NA>        NA          NA         
#>  4 4     F431        <NA>        NA          NA         
#>  5 5     F11         F19         NA          NA         
#>  6 6     F60         <NA>        NA          NA         
#>  7 7     G35         <NA>        NA          NA         
#>  8 8     F32         <NA>        NA          NA         
#>  9 9     F32         F11         NA          NA         
#> 10 10    Z032        <NA>        NA          NA
gss
  • 1,334
  • 6
  • 11
0

A update, using pivot_longer and unnest_wider.

  • dplyr 1.0.10 CRAN release: 2022-09-01
  • tidyr 1.2.1 CRAN release: 2022-09-08

Step 1: clean up data

library(dplyr)
library(tidyr)

data <- data %>% 
  mutate(across(starts_with("diag"), ~ 
    replace(.x, grepl(paste0("^", c("R", "S", "Y"), collapse="|"), .x), NA)))

Step 2: left-compact data

data %>% 
  pivot_longer(starts_with("diag")) %>% 
  group_by(id) %>% 
  mutate(value = value[order(is.na(value))]) %>% 
  summarize(col = list(value)) %>% 
  unnest_wider(col, names_sep="_") %>% 
  setNames(colnames({{data}}))
# A tibble: 10 × 5
      id diagnosis_1 diagnosis_2 diagnosis_3 diagnosis_4
   <int> <chr>       <chr>       <chr>       <chr>      
 1     1 F32         F40         NA          NA         
 2     2 F431        NA          NA          NA         
 3     3 F65         NA          NA          NA         
 4     4 F431        NA          NA          NA         
 5     5 F11         F19         NA          NA         
 6     6 F60         NA          NA          NA         
 7     7 G35         NA          NA          NA         
 8     8 F32         NA          NA          NA         
 9     9 F32         F11         NA          NA         
10    10 Z032        NA          NA          NA

data

data <- structure(list(id = 1:10, diagnosis_1 = c("F32", "F431", "R58", 
"S32", "F11", NA, NA, "Y67", "F32", "Z032"), diagnosis_2 = c(NA, 
NA, NA, NA, NA, NA, "G35", NA, NA, NA), diagnosis_3 = c("F40", 
NA, "R67", "F431", NA, "F60", "S58", "R68", "F11", NA), diagnosis_4 = c(NA, 
NA, "F65", NA, "F19", NA, NA, "F32", NA, NA)), class = c("tbl_df", 
"tbl", "data.frame"), row.names = c(NA, -10L))
Andre Wildberg
  • 12,344
  • 3
  • 12
  • 29