3

I have a data frame like this

x1<- c(0,1,1,1,1,0)

df<-data.frame(x1)

I want to add another column that will take the sum of every two rows and store the value for the first two rows. This should look like this.

enter image description here

You can see here that the first two rows' sum is 1 and that is given in the first two rows of the new column (x2). Next, the third and fourth-row sum is given in the 3rd and fourth row of the new column. Can anyone help?

ThomasIsCoding
  • 96,636
  • 9
  • 24
  • 81
P Initiate
  • 79
  • 5

7 Answers7

2

Create an index with gl for every 2 rows and do the sum after grouping

library(dplyr)
df <- df %>%
    group_by(grp = as.integer(gl(n(), 2, n()))) %>% 
    mutate(x2 = sum(x1)) %>% 
    ungroup %>% 
    select(-grp)

-output

df
# A tibble: 6 × 2
     x1    x2
  <dbl> <dbl>
1     0     1
2     1     1
3     1     2
4     1     2
5     1     1
6     0     1

Or using collapse/data.table

library(data.table)
library(collapse)
setDT(df)[, x2 := fsum(x1, g = rep(.I, each = 2, length.out = .N), TRA = 1)]

-output

> df
      x1    x2
   <num> <num>
1:     0     1
2:     1     1
3:     1     2
4:     1     2
5:     1     1
6:     0     1
akrun
  • 874,273
  • 37
  • 540
  • 662
2

Here a way using dplyr where I create a auxiliar column to group by

library(dplyr)


x1<- c(0,1,1,1,1,0)

df <- data.frame(x1)

len_df <- nrow(df)
aux <- rep(seq(1:(len_df/2)),each = 2)[1:len_df]

df %>% 
  mutate(aux = aux) %>% 
  group_by(aux) %>% 
  mutate(x2 = sum(x1)) %>% 
  ungroup() %>%
  select(-aux)

# A tibble: 6 x 2
     x1    x2
  <dbl> <dbl>
1     0     1
2     1     1
3     1     2
4     1     2
5     1     1
6     0     1
Vinícius Félix
  • 8,448
  • 6
  • 16
  • 32
2

You can define the groups using floor division and then simply obtain the grouped sum:

library(dplyr)

df %>%
  mutate(group = (row_number() - 1) %/% 2) %>%
  group_by(group) %>%
  mutate(x2 = sum(x1)) %>%
  ungroup() %>%
  select(-group)
# # A tibble: 6 × 2
#      x1    x2
#   <dbl> <dbl>
# 1     0     1
# 2     1     1
# 3     1     2
# 4     1     2
# 5     1     1
# 6     0     1
Santiago
  • 641
  • 3
  • 14
2

You can use ave + ceiling (both are base R functions)

> transform(df, x2 = ave(x1, ceiling(seq_along(x1) / 2)) * 2)
  x1 x2
1  0  1
2  1  1
3  1  2
4  1  2
5  1  1
6  0  1
ThomasIsCoding
  • 96,636
  • 9
  • 24
  • 81
1

First, a way of making the data.frame without the intermediate variable.

This splits the data.frame into groups of 2, sums, then repeats the pattern into the new variable.

df<-data.frame(x1=c(0,1,1,1,1,0))

df$x2<-rep(lapply(split(df, rep(1:3, each=2)), sum), each=2)

#  x1 x2
#1  0  1
#2  1  1
#3  1  2
#4  1  2
#5  1  1
#6  0  1
M.Viking
  • 5,067
  • 4
  • 17
  • 33
1

in base R you could do:

 transform(df,x2 = ave(x1, gl(nrow(df)/2, 2), FUN = sum))

  x1 x2
1  0  1
2  1  1
3  1  2
4  1  2
5  1  1
6  0  1
Onyambu
  • 67,392
  • 3
  • 24
  • 53
1

A few more options with select benchmarks.

x1 <- sample(0:1, 1e4, 1)

microbenchmark::microbenchmark(
  matrix = rep(colSums(matrix(x1, 2)), each = 2),
  recycle = x1 + x1[seq(x1) + c(1, -1)],
  cumsum = rep(diff(cumsum(c(0, x1))[seq(1, length(x1) + 1, 2)]), each = 2),
  Thomas = ave(x1, ceiling(seq_along(x1)/2))*2,
  onyambu = ave(x1, gl(length(x1)/2, 2), FUN = sum),
  check = "equal"
)
#> Unit: microseconds
#>     expr       min         lq        mean     median         uq       max neval
#>   matrix    65.001    69.6510    79.27203    78.4510    82.1510   148.501   100
#>  recycle    95.001   100.6505   108.65003   107.5510   110.6010   176.901   100
#>   cumsum   137.201   148.9010   169.61090   166.5505   177.7015   340.002   100
#>   Thomas 24645.401 25297.2010 26450.46994 25963.3515 27463.2010 31803.101   100
#>  onyambu  3774.902  3935.7510  4444.36500  4094.3520  4336.1505 11070.301   100

With data.table for large data:

library(data.table)
library(collapse)

x1 <- sample(0:1, 1e6, 1)
df <- data.frame(x1)

microbenchmark::microbenchmark(
  matrix = setDT(df)[, x2 := rep(colSums(matrix(x1, 2)), each = 2)],
  recycle = setDT(df)[, x2 := x1 + x1[.I + c(1, -1)]],
  akrun = setDT(df)[, x2 := fsum(x1, g = rep(.I, each = 2, length.out = .N), TRA = 1)],
  check = "identical"
)
#> Unit: milliseconds
#>     expr       min        lq     mean    median       uq     max neval
#>   matrix  8.053302  8.937301 10.64786  9.376551 12.51890 17.2037   100
#>  recycle 12.117101 12.965950 16.57696 14.003151 17.09805 56.4729   100
#>    akrun 10.071701 10.611051 14.42578 11.291601 14.79090 55.1141   100
jblood94
  • 10,340
  • 1
  • 10
  • 15