convert partitioning folder columns to variables

Question

I want to convert data frame like this:

mre <- tibble::tribble(
  ~folder3, ~folder2, ~folder1,
    "V3=4",   "V2=1",   "V1=0",
    "V3=5",   "V2=1",   "V1=0",
    "V3=4",   "V2=2",   "V1=0",
    "V3=5",   "V2=2",   "V1=0",
    "V3=4",   "V2=1",   "V1=1",
    "V3=5",   "V2=1",   "V1=1",
    "V3=4",   "V2=2",   "V1=1",
    "V3=5",   "V2=2",   "V1=1"
  )

to this:

folder3 folder2 folder1 V3  V2  V1
V3=4    V2=1    V1=0    4   1   0
V3=5    V2=1    V1=0    5   1   0
V3=4    V2=2    V1=0    4   2   0
V3=5    V2=2    V1=0    5   2   0
V3=4    V2=1    V1=1    4   1   1
V3=5    V2=1    V1=1    5   1   1
V3=4    V2=2    V1=1    4   2   1
V3=5    V2=2    V1=1    5   2   1

Basically extracting the unique variable names ("V3, "V2", "V1" here, but could be any valid names such as "a", "b", c" ) for each folder? column as the new column name, and keep the values in place.

I have the following for a single "folder" column by using the first row value:

mre %>% 
    tidyr::extract(folder1, into = .$folder1[1] |> word(1, sep="="), "\\S+=(\\d+)", remove = FALSE)

But I don't know how to expand to multiple "folders" columns (the number is not fixed). I tried to use map following the answers here, but could not figure out how to get the variable names from the first row.

Any suggestions?

I am sorry the variable name "V3" coincide with "folder3" in my example. It could be any arbitrary name, so it need to be extracted from the column value itself. — Dong, Dec 22 '21 at 00:40

akrun · Accepted Answer · 2021-12-22T17:48:51.720

Instead of extract, we may create new columns within across itself - mutate across all the columns (everything()), use str_extract to get the digits (\\d+) that succeeds the =, while modifying the column names in names with str_replace

library(dplyr)
library(stringr)
mre %>%
    mutate(across(everything(), 
    ~ as.numeric(str_extract(., "(?<=\\=)\\d+")), 
       .names = "{str_replace(.col, 'folder', 'V')}"))

-output

# A tibble: 8 × 6
  folder3 folder2 folder1    V3    V2    V1
  <chr>   <chr>   <chr>   <dbl> <dbl> <dbl>
1 V3=4    V2=1    V1=0        4     1     0
2 V3=5    V2=1    V1=0        5     1     0
3 V3=4    V2=2    V1=0        4     2     0
4 V3=5    V2=2    V1=0        5     2     0
5 V3=4    V2=1    V1=1        4     1     1
6 V3=5    V2=1    V1=1        5     1     1
7 V3=4    V2=2    V1=1        4     2     1
8 V3=5    V2=2    V1=1        5     2     1

If the column names needs to extracted from the 'folder' column values, then one option is to reshape to 'long' with pivot_longer, split the column with separate_rows, reshape back to 'wide' with pivot_wider and bind the columns with the original data

library(tidyr)
mre %>% 
  mutate(rn = row_number()) %>% 
  pivot_longer(cols= starts_with('folder'), names_to = NULL) %>% 
  separate(value, into = c('name', 'value'), sep="\\=", convert = TRUE) %>% 
  pivot_wider(names_from = name, values_from = value) %>% 
  select(-rn) %>% 
  bind_cols(mre, .)

-output

# A tibble: 8 × 6
  folder3 folder2 folder1    V3    V2    V1
  <chr>   <chr>   <chr>   <int> <int> <int>
1 V3=4    V2=1    V1=0        4     1     0
2 V3=5    V2=1    V1=0        5     1     0
3 V3=4    V2=2    V1=0        4     2     0
4 V3=5    V2=2    V1=0        5     2     0
5 V3=4    V2=1    V1=1        4     1     1
6 V3=5    V2=1    V1=1        5     1     1
7 V3=4    V2=2    V1=1        4     2     1
8 V3=5    V2=2    V1=1        5     2     1

elegant solution with `str_extract` :) Merry Christmas, my friend! — ThomasIsCoding, Dec 21 '21 at 20:13
Thx for the answers. Unfortunately the variable names are not always regular like "V3", "V2", "V1". They need to be extract from the column values. — Dong, Dec 22 '21 at 02:32
Follow your excellent example, the following works to extract the variable name from the first row: `mre %>% mutate(across(everything(), ~ as.numeric(str_remove(., "\\S+=")), .names = "{str_sub(.[1,], 1, 2)}"))`. However, I could not get a more general regex such as `str_remove(.[1,], "=\\d+")` to work. in the braces. — Dong, Dec 22 '21 at 07:21

ThomasIsCoding · Answer 2 · 2021-12-22T08:49:56.907

A base R option

cbind(
  mre,
  unclass(
    xtabs(
      V2 ~ id + factor(V1, levels = unique(V1)),
      do.call(
        rbind,
        Map(function(x) cbind(read.table(text = x, sep = "="), id = seq_along(x)), mre)
      )
    )
  )
)

gives

  folder3 folder2 folder1 V3 V2 V1
1    V3=4    V2=1    V1=0  4  1  0
2    V3=5    V2=1    V1=0  5  1  0
3    V3=4    V2=2    V1=0  4  2  0
4    V3=5    V2=2    V1=0  5  2  0
5    V3=4    V2=1    V1=1  4  1  1
6    V3=5    V2=1    V1=1  5  1  1
7    V3=4    V2=2    V1=1  4  2  1
8    V3=5    V2=2    V1=1  5  2  1

Code Breakdowns

Map(..., mre)

> Map(function(x) cbind(read.table(text = x, sep = "="), id = seq_along(x)), mre)
$folder3
  V1 V2 id
1 V3  4  1
2 V3  5  2
3 V3  4  3
4 V3  5  4
5 V3  4  5
6 V3  5  6
7 V3  4  7
8 V3  5  8

$folder2
  V1 V2 id
1 V2  1  1
2 V2  1  2
3 V2  2  3
4 V2  2  4
5 V2  1  5
6 V2  1  6
7 V2  2  7
8 V2  2  8

$folder1
  V1 V2 id
1 V1  0  1
2 V1  0  2
3 V1  0  3
4 V1  0  4
5 V1  1  5
6 V1  1  6
7 V1  1  7
8 V1  1  8

do.call(rbind, ...)

> do.call(
+   rbind,
+   Map(function(x) cbind(read.table(text = x, sep = "="), id = seq_along(x)), mre)
+ )
          V1 V2 id
folder3.1 V3  4  1
folder3.2 V3  5  2
folder3.3 V3  4  3
folder3.4 V3  5  4
folder3.5 V3  4  5
folder3.6 V3  5  6
folder3.7 V3  4  7
folder3.8 V3  5  8
folder2.1 V2  1  1
folder2.2 V2  1  2
folder2.3 V2  2  3
folder2.4 V2  2  4
folder2.5 V2  1  5
folder2.6 V2  1  6
folder2.7 V2  2  7
folder2.8 V2  2  8
folder1.1 V1  0  1
folder1.2 V1  0  2
folder1.3 V1  0  3
folder1.4 V1  0  4
folder1.5 V1  1  5
folder1.6 V1  1  6
folder1.7 V1  1  7
folder1.8 V1  1  8

xtabs(..., ...)

> xtabs(
+   V2 ~ id + factor(V1, levels = unique(V1)),
+   do.call(
+     rbind,
+     Map(function(x) cbind(read.table(text = x, sep = "="), id = se .... [TRUNCATED]
   factor(V1, levels = unique(V1))
id  V3 V2 V1
  1  4  1  0
  2  5  1  0
  3  4  2  0
  4  5  2  0
  5  4  1  1
  6  5  1  1
  7  4  2  1
  8  5  2  1

Or another option is `mre[paste0("V", 3:1)] <- "dim<-"(as.numeric(sub(".*\\=", "", as.matrix(mre))), dim(mre))` — akrun, Dec 21 '21 at 20:15
@akrun yes, that option seems much shorter than mine. thanks :) — ThomasIsCoding, Dec 21 '21 at 20:16
I think the `type.convert` would be more easier to understand `mre[paste0("V", 3:1)] <- type.convert(sub(".*\\=", "", as.matrix(mre)), as.is = TRUE)` — akrun, Dec 21 '21 at 20:18
Thanks. This works. But it is beyond my understanding at this time. :-( — Dong, Dec 22 '21 at 02:13

score 1 · Answer 3 · answered Dec 22 '21 at 20:21

Here is another solution you could use:

library(dplyr)
library(stringr)
library(purrr)

mre %>%
  bind_cols(
    setNames(seq_len(ncol(mre)) %>%
               map_dfc(~ {y <- unname(unlist(mre[.x]))
               as.integer(str_remove(y, "\\w+="))}), paste0("V", 1:ncol(mre)))
  )

# A tibble: 8 x 6
  folder3 folder2 folder1    V1    V2    V3
  <chr>   <chr>   <chr>   <int> <int> <int>
1 V3=4    V2=1    V1=0        4     1     0
2 V3=5    V2=1    V1=0        5     1     0
3 V3=4    V2=2    V1=0        4     2     0
4 V3=5    V2=2    V1=0        5     2     0
5 V3=4    V2=1    V1=1        4     1     1
6 V3=5    V2=1    V1=1        5     1     1
7 V3=4    V2=2    V1=1        4     2     1
8 V3=5    V2=2    V1=1        5     2     1

convert partitioning folder columns to variables

3 Answers3