Performing a mutate function incorporating character strings

Question

This is my dataframe

data.frame(
  condition = as.factor(c("ecoli_RPMI", "staph_RPMI", "RPMI", "ecoli_DMEM", "staph_DMEM", "DMEM", "ecoli_RPMI", "staph_RPMI", "RPMI", "ecoli_DMEM", "staph_DMEM", "DMEM")),
  time = as.numeric(c(1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2)),
  value = as.numeric(c(0.3, 0.3, 0.2, 0.4, 0.4, 0.1, 0.9, 0.8, 0.1, 0.7, 0.8, 0.2)))

I have many more conditions, representing bacteria i.e. ecoli or staph, followed by the media they were grown in so that the conditions are written like this i.e. "ecoli_RPMI", "staph_RPMI", "ecoli_DMEM", "staph_DMEM". I have multiple time points (50) or so and multiple bacteria and media.

I also have conditions for just the media controls. i.e. "RPMI", "DMEM" which also have corresponding value again across multiple time points

I am trying to subtract the "value" corresponding to (on the same row as) the media control i.e. the value for "RPMI" from all bacteria with the RPMI suffix, x_RPMI i.e. "ecoli_RPMI", "staph_RPMI" and assign the values to a new column named "corrected.values" for example: value for ecoli_RPMI - value for RPMI

The desired result would like this

data.frame(
  condition = as.factor(c("ecoli_RPMI", "staph_RPMI", "RPMI", "ecoli_DMEM", "staph_DMEM", "DMEM", "ecoli_RPMI", "staph_RPMI", "RPMI", "ecoli_DMEM", "staph_DMEM", "DMEM")),
  time = as.numeric(c(1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2)),
  value = as.numeric(c(0.3, 0.3, 0.2, 0.4, 0.4, 0.1, 0.9, 0.8, 0.1, 0.7, 0.8, 0.2)),
  corrected_value = as.numeric(c(0.1, 0.1, 0, 0.3, 0.3, 0, 0.8, 0.7, 0, 0.5, 0.6, 0)))

I have tried all sorts:

doing a group by statement with mutate and case_when

a bit like this
```
df %>%
  group_by(time)%>%
  mutate(corrected_value = case_when(
  conditions == "ecoli_RPMI" ~ value - value[Conditions == "RPMI"],
```
inputing all possibilities but his doesn't seem to work. I wondered if it should be possible to use a string argument to simplify this as the strings are consistent across all of the conditions i.e. always "bacteria"_"media"
I also tried to pivot_wider but didnt have any luck

Many thanks for your help!

score 1 · Answer 1 · answered Jul 27 '23 at 10:36

I think the simplest solution would be to split your condition variable into two variables (bacterium, medium) and then perform the subtraction on grouped data. You could do this:

data.frame(condition = as.factor(
  c(
    "ecoli_RPMI",
    "staph_RPMI",
    "RPMI",
    "ecoli_DMEM",
    "staph_DMEM",
    "DMEM",
    "ecoli_RPMI",
    "staph_RPMI",
    "RPMI",
    "ecoli_DMEM",
    "staph_DMEM",
    "DMEM"
  )
),
time = as.numeric(c(1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2)),
value = as.numeric(c(
  0.3, 0.3, 0.2, 0.4, 0.4, 0.1, 0.9, 0.8, 0.1, 0.7, 0.8, 0.2
))) %>%
  mutate(
    bacterium = case_when(
      str_detect(condition, "ecoli") ~ "Ecoli",
      str_detect(condition, "staph") ~ "Staph",
      TRUE ~ "None"
    ),
    medium = case_when(
      str_detect(condition, "RPMI") ~ "RPMI",
      str_detect(condition, "DMEM") ~ "DMEM",
      TRUE ~ "None"
    )
  ) %>%
  group_by(medium, time) %>%
  mutate(corrected.values = value - value[bacterium == "None"]) %>%
  ungroup()

I am using str_detect() to extract bacterium and medium from conditions. Then for each timepoint and medium combination you can subtract the value you get without bacteria from the whole group.

This produces this result, which seems to be what you are looking for.

# A tibble: 12 × 6
   condition   time value bacterium medium corrected.values
   <fct>      <dbl> <dbl> <chr>     <chr>             <dbl>
 1 ecoli_RPMI     1   0.3 Ecoli     RPMI                0.1
 2 staph_RPMI     1   0.3 Staph     RPMI                0.1
 3 RPMI           1   0.2 None      RPMI                0  
 4 ecoli_DMEM     1   0.4 Ecoli     DMEM                0.3
 5 staph_DMEM     1   0.4 Staph     DMEM                0.3
 6 DMEM           1   0.1 None      DMEM                0  
 7 ecoli_RPMI     2   0.9 Ecoli     RPMI                0.8
 8 staph_RPMI     2   0.8 Staph     RPMI                0.7
 9 RPMI           2   0.1 None      RPMI                0  
10 ecoli_DMEM     2   0.7 Ecoli     DMEM                0.5
11 staph_DMEM     2   0.8 Staph     DMEM                0.6
12 DMEM           2   0.2 None      DMEM                0

TimTeaFan · Answer 2 · 2023-07-28T10:43:04.007

Below are two approaches:

The first uses group_modify():

We first create a variable bacteria. Then we subtract the value where the condition equals the current bacteria group cur_group()$bacteria from the whole value column.

library(tidyverse)

dat |> 
  mutate(bacteria = gsub("^.*_(\\w*$)", "\\1", condition),
         .after = "condition") |> 
  mutate(corrected_values = value - value[condition == cur_group()$bacteria],
         .by = c("time", "bacteria")) |>
  ungroup()

#>     condition bacteria time value corrected_values
#> 1  ecoli_RPMI     RPMI    1   0.3              0.1
#> 2  staph_RPMI     RPMI    1   0.3              0.1
#> 3        RPMI     RPMI    1   0.2              0.0
#> 4  ecoli_DMEM     DMEM    1   0.4              0.3
#> 5  staph_DMEM     DMEM    1   0.4              0.3
#> 6        DMEM     DMEM    1   0.1              0.0
#> 7  ecoli_RPMI     RPMI    2   0.9              0.8
#> 8  staph_RPMI     RPMI    2   0.8              0.7
#> 9        RPMI     RPMI    2   0.1              0.0
#> 10 ecoli_DMEM     DMEM    2   0.7              0.5
#> 11 staph_DMEM     DMEM    2   0.8              0.6
#> 12       DMEM     DMEM    2   0.2              0.0

^{Created on 2023-07-28 with reprex v2.0.2}

The second uses pivot_wider() |> mutate(across()) |> pivot_longer() |> left_join().

The trick lies in the across() statement where we iterate over each column (except from time) and then get the column with the corresponding suffix and subtract it from the current column col.

library(tidyverse)

dat |> 
  pivot_wider(names_from = condition,
              values_from = value) |> 
  mutate(across(! time,
                \(col) col - get(gsub("^.*_(\\w*$)", "\\1", cur_column()))
                )
         ) |> 
  pivot_longer(cols = !time,
               names_to = "condition",
               values_to = "corrected_values") |> 
  left_join(dat, by = c("time", "condition"))

#> # A tibble: 12 x 4
#>     time condition  corrected_values value
#>    <dbl> <chr>                 <dbl> <dbl>
#>  1     1 ecoli_RPMI              0.1   0.3
#>  2     1 staph_RPMI              0.1   0.3
#>  3     1 RPMI                    0     0.2
#>  4     1 ecoli_DMEM              0.3   0.4
#>  5     1 staph_DMEM              0.3   0.4
#>  6     1 DMEM                    0     0.1
#>  7     2 ecoli_RPMI              0.8   0.9
#>  8     2 staph_RPMI              0.7   0.8
#>  9     2 RPMI                    0     0.1
#> 10     2 ecoli_DMEM              0.5   0.7
#> 11     2 staph_DMEM              0.6   0.8
#> 12     2 DMEM                    0     0.2

Data from OP

dat <- data.frame( condition = as.factor(c("ecoli_RPMI", "staph_RPMI", "RPMI", "ecoli_DMEM", "staph_DMEM", "DMEM", "ecoli_RPMI", "staph_RPMI", "RPMI", "ecoli_DMEM", "staph_DMEM", "DMEM")), time = as.numeric(c(1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2)), value = as.numeric(c(0.3, 0.3, 0.2, 0.4, 0.4, 0.1, 0.9, 0.8, 0.1, 0.7, 0.8, 0.2)))

^{Created on 2023-07-27 by the reprex package (v2.0.1)}

Konrad Rudolph · Accepted Answer · 2023-07-27T11:16:35.597

You can use the following, which splits the condition into the constituent strain and medium, and then (per time) subtracts the row which contains no strain from those that do:

data |>
  separate(condition, c("strain", "medium"), "_", remove = FALSE, fill = "left") |>
  group_by(time, medium) |>
  mutate(corrected_value = value - value[is.na(strain)]) |>
  ungroup()

Yielding:

# A tibble: 12 × 6
   condition  strain medium  time value corrected_value
   <fct>      <chr>  <chr>  <dbl> <dbl>           <dbl>
 1 ecoli_RPMI ecoli  RPMI       1   0.3             0.1
 2 staph_RPMI staph  RPMI       1   0.3             0.1
 3 RPMI       NA     RPMI       1   0.2             0
 4 ecoli_DMEM ecoli  DMEM       1   0.4             0.3
 5 staph_DMEM staph  DMEM       1   0.4             0.3
 6 DMEM       NA     DMEM       1   0.1             0
 7 ecoli_RPMI ecoli  RPMI       2   0.9             0.8
 8 staph_RPMI staph  RPMI       2   0.8             0.7
 9 RPMI       NA     RPMI       2   0.1             0
10 ecoli_DMEM ecoli  DMEM       2   0.7             0.5
11 staph_DMEM staph  DMEM       2   0.8             0.6
12 DMEM       NA     DMEM       2   0.2             0

Alternatively, consider reshaping your data into a more semantically meaningful format:

data |>
  separate(condition, c("strain", "medium"), "_", fill = "left") |>
  pivot_wider(names_from = strain) |>
  rename(baseline = `NA`) |>
  pivot_longer(! c(medium, time, baseline), names_to = "strain") |>
  mutate(corrected_value = value - baseline)

# A tibble: 8 × 6
  medium  time baseline strain value corrected_value
  <chr>  <dbl>    <dbl> <chr>  <dbl>           <dbl>
1 RPMI       1      0.2 ecoli    0.3             0.1
2 RPMI       1      0.2 staph    0.3             0.1
3 DMEM       1      0.1 ecoli    0.4             0.3
4 DMEM       1      0.1 staph    0.4             0.3
5 RPMI       2      0.1 ecoli    0.9             0.8
6 RPMI       2      0.1 staph    0.8             0.7
7 DMEM       2      0.2 ecoli    0.7             0.5
8 DMEM       2      0.2 staph    0.8             0.6

Performing a mutate function incorporating character strings

3 Answers3