2

I want to convert data frame like this:

mre <- tibble::tribble(
  ~folder3, ~folder2, ~folder1,
    "V3=4",   "V2=1",   "V1=0",
    "V3=5",   "V2=1",   "V1=0",
    "V3=4",   "V2=2",   "V1=0",
    "V3=5",   "V2=2",   "V1=0",
    "V3=4",   "V2=1",   "V1=1",
    "V3=5",   "V2=1",   "V1=1",
    "V3=4",   "V2=2",   "V1=1",
    "V3=5",   "V2=2",   "V1=1"
  )

to this:

folder3 folder2 folder1 V3  V2  V1
V3=4    V2=1    V1=0    4   1   0
V3=5    V2=1    V1=0    5   1   0
V3=4    V2=2    V1=0    4   2   0
V3=5    V2=2    V1=0    5   2   0
V3=4    V2=1    V1=1    4   1   1
V3=5    V2=1    V1=1    5   1   1
V3=4    V2=2    V1=1    4   2   1
V3=5    V2=2    V1=1    5   2   1

Basically extracting the unique variable names ("V3, "V2", "V1" here, but could be any valid names such as "a", "b", c" ) for each folder? column as the new column name, and keep the values in place.

I have the following for a single "folder" column by using the first row value:

mre %>% 
    tidyr::extract(folder1, into = .$folder1[1] |> word(1, sep="="), "\\S+=(\\d+)", remove = FALSE)

But I don't know how to expand to multiple "folders" columns (the number is not fixed). I tried to use map following the answers here, but could not figure out how to get the variable names from the first row.

Any suggestions?

Anoushiravan R
  • 21,622
  • 3
  • 18
  • 41
Dong
  • 481
  • 4
  • 15
  • I am sorry the variable name "V3" coincide with "folder3" in my example. It could be any arbitrary name, so it need to be extracted from the column value itself. – Dong Dec 22 '21 at 00:40

3 Answers3

3

Instead of extract, we may create new columns within across itself - mutate across all the columns (everything()), use str_extract to get the digits (\\d+) that succeeds the =, while modifying the column names in names with str_replace

library(dplyr)
library(stringr)
mre %>%
    mutate(across(everything(), 
    ~ as.numeric(str_extract(., "(?<=\\=)\\d+")), 
       .names = "{str_replace(.col, 'folder', 'V')}"))

-output

# A tibble: 8 × 6
  folder3 folder2 folder1    V3    V2    V1
  <chr>   <chr>   <chr>   <dbl> <dbl> <dbl>
1 V3=4    V2=1    V1=0        4     1     0
2 V3=5    V2=1    V1=0        5     1     0
3 V3=4    V2=2    V1=0        4     2     0
4 V3=5    V2=2    V1=0        5     2     0
5 V3=4    V2=1    V1=1        4     1     1
6 V3=5    V2=1    V1=1        5     1     1
7 V3=4    V2=2    V1=1        4     2     1
8 V3=5    V2=2    V1=1        5     2     1

If the column names needs to extracted from the 'folder' column values, then one option is to reshape to 'long' with pivot_longer, split the column with separate_rows, reshape back to 'wide' with pivot_wider and bind the columns with the original data

library(tidyr)
mre %>% 
  mutate(rn = row_number()) %>% 
  pivot_longer(cols= starts_with('folder'), names_to = NULL) %>% 
  separate(value, into = c('name', 'value'), sep="\\=", convert = TRUE) %>% 
  pivot_wider(names_from = name, values_from = value) %>% 
  select(-rn) %>% 
  bind_cols(mre, .)

-output

# A tibble: 8 × 6
  folder3 folder2 folder1    V3    V2    V1
  <chr>   <chr>   <chr>   <int> <int> <int>
1 V3=4    V2=1    V1=0        4     1     0
2 V3=5    V2=1    V1=0        5     1     0
3 V3=4    V2=2    V1=0        4     2     0
4 V3=5    V2=2    V1=0        5     2     0
5 V3=4    V2=1    V1=1        4     1     1
6 V3=5    V2=1    V1=1        5     1     1
7 V3=4    V2=2    V1=1        4     2     1
8 V3=5    V2=2    V1=1        5     2     1
akrun
  • 874,273
  • 37
  • 540
  • 662
  • 1
    elegant solution with `str_extract` :) Merry Christmas, my friend! – ThomasIsCoding Dec 21 '21 at 20:13
  • @ThomasIsCoding thanks. Merry Xmas to you as well! – akrun Dec 21 '21 at 20:13
  • Thx for the answers. Unfortunately the variable names are not always regular like "V3", "V2", "V1". They need to be extract from the column values. – Dong Dec 22 '21 at 02:32
  • Follow your excellent example, the following works to extract the variable name from the first row: `mre %>% mutate(across(everything(), ~ as.numeric(str_remove(., "\\S+=")), .names = "{str_sub(.[1,], 1, 2)}"))`. However, I could not get a more general regex such as `str_remove(.[1,], "=\\d+")` to work. in the braces. – Dong Dec 22 '21 at 07:21
  • @Dong Try the updated soluiton – akrun Dec 22 '21 at 17:49
  • 1
    @akrun Nice trick using the row number. – Dong Dec 23 '21 at 02:55
3

A base R option

cbind(
  mre,
  unclass(
    xtabs(
      V2 ~ id + factor(V1, levels = unique(V1)),
      do.call(
        rbind,
        Map(function(x) cbind(read.table(text = x, sep = "="), id = seq_along(x)), mre)
      )
    )
  )
)

gives

  folder3 folder2 folder1 V3 V2 V1
1    V3=4    V2=1    V1=0  4  1  0
2    V3=5    V2=1    V1=0  5  1  0
3    V3=4    V2=2    V1=0  4  2  0
4    V3=5    V2=2    V1=0  5  2  0
5    V3=4    V2=1    V1=1  4  1  1
6    V3=5    V2=1    V1=1  5  1  1
7    V3=4    V2=2    V1=1  4  2  1
8    V3=5    V2=2    V1=1  5  2  1

Code Breakdowns

  • Map(..., mre)
> Map(function(x) cbind(read.table(text = x, sep = "="), id = seq_along(x)), mre)
$folder3
  V1 V2 id
1 V3  4  1
2 V3  5  2
3 V3  4  3
4 V3  5  4
5 V3  4  5
6 V3  5  6
7 V3  4  7
8 V3  5  8

$folder2
  V1 V2 id
1 V2  1  1
2 V2  1  2
3 V2  2  3
4 V2  2  4
5 V2  1  5
6 V2  1  6
7 V2  2  7
8 V2  2  8

$folder1
  V1 V2 id
1 V1  0  1
2 V1  0  2
3 V1  0  3
4 V1  0  4
5 V1  1  5
6 V1  1  6
7 V1  1  7
8 V1  1  8
  • do.call(rbind, ...)
> do.call(
+   rbind,
+   Map(function(x) cbind(read.table(text = x, sep = "="), id = seq_along(x)), mre)
+ )
          V1 V2 id
folder3.1 V3  4  1
folder3.2 V3  5  2
folder3.3 V3  4  3
folder3.4 V3  5  4
folder3.5 V3  4  5
folder3.6 V3  5  6
folder3.7 V3  4  7
folder3.8 V3  5  8
folder2.1 V2  1  1
folder2.2 V2  1  2
folder2.3 V2  2  3
folder2.4 V2  2  4
folder2.5 V2  1  5
folder2.6 V2  1  6
folder2.7 V2  2  7
folder2.8 V2  2  8
folder1.1 V1  0  1
folder1.2 V1  0  2
folder1.3 V1  0  3
folder1.4 V1  0  4
folder1.5 V1  1  5
folder1.6 V1  1  6
folder1.7 V1  1  7
folder1.8 V1  1  8
  • xtabs(..., ...)
> xtabs(
+   V2 ~ id + factor(V1, levels = unique(V1)),
+   do.call(
+     rbind,
+     Map(function(x) cbind(read.table(text = x, sep = "="), id = se .... [TRUNCATED]
   factor(V1, levels = unique(V1))
id  V3 V2 V1
  1  4  1  0
  2  5  1  0
  3  4  2  0
  4  5  2  0
  5  4  1  1
  6  5  1  1
  7  4  2  1
  8  5  2  1
ThomasIsCoding
  • 96,636
  • 9
  • 24
  • 81
1

Here is another solution you could use:

library(dplyr)
library(stringr)
library(purrr)

mre %>%
  bind_cols(
    setNames(seq_len(ncol(mre)) %>%
               map_dfc(~ {y <- unname(unlist(mre[.x]))
               as.integer(str_remove(y, "\\w+="))}), paste0("V", 1:ncol(mre)))
  )

# A tibble: 8 x 6
  folder3 folder2 folder1    V1    V2    V3
  <chr>   <chr>   <chr>   <int> <int> <int>
1 V3=4    V2=1    V1=0        4     1     0
2 V3=5    V2=1    V1=0        5     1     0
3 V3=4    V2=2    V1=0        4     2     0
4 V3=5    V2=2    V1=0        5     2     0
5 V3=4    V2=1    V1=1        4     1     1
6 V3=5    V2=1    V1=1        5     1     1
7 V3=4    V2=2    V1=1        4     2     1
8 V3=5    V2=2    V1=1        5     2     1
Anoushiravan R
  • 21,622
  • 3
  • 18
  • 41