1

I'm trying to create new variables from existing variables like below:

a1+a2=a3, b1+b2=b3, ..., z1+z2=z3

Here is an example data frame

df <- data.frame(replicate(10,sample(1:10)))
colnames(df) <- c("a1","a2","b1","b2","c1","c2","d1","d2","e1","e2")

Here's my solution with repeating codes

# a solution by base R
df$a3 <- df$a1 + df$a2
df$b3 <- df$b1 + df$b2
df$c3 <- df$c1 + df$c2
df$d3 <- df$d1 + df$d2
df$e3 <- df$e1 + df$e2

Or

# a solution by dplyr
library(dplyr)
df <- df %>%
  mutate(a3 = a1+a2,
         b3 = b1+b2,
         c3 = c1+c2,
         d3 = d1+d2,
         e3 = e1+d2)

Or

# a solution by data.table
library(data.table)
DT <- data.table(df)
DT[,a3:=a1+a2][,b3:=b1+b2][,c3:=c1+c2][,d3:=d1+d2][,e3:=e1+e2]

Actually I have more than 100 variables, so I want to find a way to do so without repeating code... Although I tried to use mutate_ with standard evaluation and regular expression, I lost my way because I'm a newbie in R. Can you mutate multiple variables without repeating code?

Bhargav Rao
  • 50,140
  • 28
  • 121
  • 140
  • 2
    Is the naming pattern always going to be like that? Meaning same first part of the name and then 1 and 2? – Elin Nov 06 '16 at 16:07

4 Answers4

2

A possible solution using data.table:

DT <- data.table(df)[, rn := .I]

DTadd3 <- dcast(melt(DT, measure.vars = 1:10)[, `:=` (let = substr(variable,1,1), rn = 1:.N), variable
                                              ][, s3 := sum(value), .(let,rn)], 
                rn ~ paste0(let,3), value.var = 's3', mean)

DT[DTadd3, on = 'rn'][, rn := NULL][]

which gives:

    a1 a2 b1 b2 c1 c2 d1 d2 e1 e2 a3 b3 c3 d3 e3
 1: 10  5  9  5 10  4  5  3  7 10 15 14 14  8 17
 2:  2  6  6  8  3  8  7  1  4  7  8 14 11  8 11
 3:  6  4  7  4  4  3  4  6  3  3 10 11  7 10  6
 4:  1  2  4  2  9  9  3  7 10  4  3  6 18 10 14
 5:  9 10  8  1  8  7 10  5  9  1 19  9 15 15 10
 6:  8  8 10  6  2  5  2  4  2  6 16 16  7  6  8
 7:  7  9  1  7  5 10  9  2  1  8 16  8 15 11  9
 8:  5  1  2  9  7  2  1  8  5  5  6 11  9  9 10
 9:  3  7  3  3  1  6  8 10  8  9 10  6  7 18 17
10:  4  3  5 10  6  1  6  9  6  2  7 15  7 15  8

A similar solution using dplyr and tidyr:

df %>% 
  bind_cols(., df %>% 
              gather(var, val) %>% 
              group_by(var) %>% 
              mutate(let = substr(var,1,1), rn = 1:n()) %>% 
              group_by(let,rn) %>% 
              summarise(s3 = sum(val)) %>% 
              spread(let, s3) %>% 
              select(-rn)
            )

However, as noted by @Gregor, it is much better to transform your data into long format. The data.table equivalent of @Gregor's answer:

DT <- data.table(df)
melt(DT[, rn := .I], 
     variable.name = 'let',
     measure.vars = patterns('1$','2$'), 
     value.name = paste0('v',1:2)
     )[, `:=` (let = letters[let], v3 = v1 + v2)][]

which gives (first 15 rows):

    rn let v1 v2 v3
 1:  1   a 10  5 15
 2:  2   a  2  6  8
 3:  3   a  6  4 10
 4:  4   a  1  2  3
 5:  5   a  9 10 19
 6:  6   a  8  8 16
 7:  7   a  7  9 16
 8:  8   a  5  1  6
 9:  9   a  3  7 10
10: 10   a  4  3  7
11:  1   b  9  5 14
12:  2   b  6  8 14
13:  3   b  7  4 11
14:  4   b  4  2  6
15:  5   b  8  1  9
Jaap
  • 81,064
  • 34
  • 182
  • 193
2

Your data format is making this hard - I would reshape the data like this. In general, you shouldn't encode actual data information in column names, if the difference between a1 and a2 is meaningful, it is better to have a column with letter, a, b, c and a column with number, 1, 2.

df$id = 1:nrow(df)

library(tidyr)
library(dplyr)
tdf = gather(df, key = key, value = value, -id) %>%
    separate(key, into = c("letter", "number"), sep = 1) %>%
    mutate(number = paste0("V", number)) %>%
    spread(key = number, value = value) 
## now data is "tidy":
head(tdf)
#   id letter V1 V2
# 1  1      a  2  7
# 2  1      b 10  4
# 3  1      c  9 10
# 4  1      d  9  4
# 5  1      e  5  8
# 6  2      a  9  8

## and the operation is simple:
tdf$V3 = tdf$V1 + tdf$V2
head(tdf)
#    id letter V1 V2 V3
# 1   1      a  2  7  9
# 2   1      b 10  4 14
# 3   1      c  9 10 19
# 4   1      d  9  4 13
# 5   1      e  5  8 13
# 6   2      a  9  8 17
Gregor Thomas
  • 136,190
  • 20
  • 167
  • 294
1

My data.table solution:

sapply(c("a", "b", "c", "d", "e"), function(ll) 
  df[ , paste0(ll, 3) := get(paste0(ll, 1)) + get(paste0(ll, 2))])
df[]
#     a1 a2 b1 b2 c1 c2 d1 d2 e1 e2 a3 b3 c3 d3 e3
#  1:  5  2  2  6  4  1 10  7  3  9  7  8  5 17 12
#  2:  4  8  7  3  3  7  9  6  9  7 12 10 10 15 16
#  3: 10  7  6 10  1  9  4  1  2  4 17 16 10  5  6
#  4:  3  4  1  7  6  4  7  4  7  5  7  8 10 11 12
#  5:  8  3  4  2  2  2  3  3  4 10 11  6  4  6 14
#  6:  6  6  5  1  8 10  1 10  5  3 12  6 18 11  8
#  7:  2 10  8  9  5  6  2  5 10  2 12 17 11  7 12
#  8:  1  1 10  8  9  5  6  9  6  8  2 18 14 15 14
#  9:  9  5  3  5 10  3  5  2  1  6 14  8 13  7  7
# 10:  7  9  9  4  7  8  8  8  8  1 16 13 15 16  9

Or, more extensibly:

sapply(c("a", "b", "c", "d", "e"), function(ll) 
  df[ , paste0(ll, 3) := Reduce(`+`, mget(paste0(ll, 1:2)))])

If all of the variables fit the pattern of ending with 1 or 2, you might try:

stems = unique(gsub("[0-9]", "", names(df)))

Then sapply(stems, ...)

MichaelChirico
  • 33,841
  • 14
  • 113
  • 198
0
library(tidyverse)
reduce(.init=df, .x=letters[1:5], .f~{
    mutate(.x, '{.y}3' := get(str_c(.y, 1)) + get(str_c(.y, 2)))
})
Sergey Skripko
  • 336
  • 1
  • 8