2

I have a column that contains means of three different measurements in the first 50% of rows, and the associated standard errors in the last 50% of rows. In the previous column are the names used for each of those (meanNativeSR, meanExoticSR, meanTotalSR, seN, seE, seT). I wanted to create 2 new columns that contain the se_ names in the first column, and their values in the second column, then get rid of that bottom 50% of rows. Here is my dataset:

df <- structure(list(Invasion = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 
3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 1L, 1L, 1L, 2L, 
2L, 2L, 3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 1L, 1L, 
1L, 2L, 2L, 2L, 3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L
), .Label = c("Low", "Medium", "High"), class = "factor"), Growth = structure(c(1L, 
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 
3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 
1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
2L, 3L, 1L, 2L, 3L), .Label = c("cover", "herb", "woody"), class = "factor"), 
    mean_se = c("meanNativeSR", "meanNativeSR", "meanNativeSR", 
    "meanNativeSR", "meanNativeSR", "meanNativeSR", "meanNativeSR", 
    "meanNativeSR", "meanNativeSR", "meanExoticSR", "meanExoticSR", 
    "meanExoticSR", "meanExoticSR", "meanExoticSR", "meanExoticSR", 
    "meanExoticSR", "meanExoticSR", "meanExoticSR", "meanTotalSR", 
    "meanTotalSR", "meanTotalSR", "meanTotalSR", "meanTotalSR", 
    "meanTotalSR", "meanTotalSR", "meanTotalSR", "meanTotalSR", 
    "seN", "seN", "seN", "seN", "seN", "seN", "seN", "seN", "seN", 
    "seE", "seE", "seE", "seE", "seE", "seE", "seE", "seE", "seE", 
    "seT", "seT", "seT", "seT", "seT", "seT", "seT", "seT", "seT"
    ), value = c(0.769230769230769, 0.230769230769231, 0.923076923076923, 
    2.46153846153846, 6.84615384615385, 0.538461538461538, 1.69230769230769, 
    1.76923076923077, 1.15384615384615, 0.384615384615385, 0, 
    1.38461538461538, 1.76923076923077, 0, 2.23076923076923, 
    2.07692307692308, 0.769230769230769, 2.46153846153846, 1.15384615384615, 
    0.230769230769231, 2.53846153846154, 4.23076923076923, 6.84615384615385, 
    3.23076923076923, 3.76923076923077, 2.76923076923077, 3.84615384615385, 
    0.280883362823162, 0.12162606385263, 0.329364937914491, 0.312463015562922, 
    0.705710715103738, 0.24325212770526, 0.36487819155789, 0.280883362823162, 
    0.191021338791684, 0.140441681411581, 0, 0.180400606147055, 
    0.201081886427668, 0, 0.230769230769231, 0.329364937914491, 
    0.12162606385263, 0.24325212770526, 0.273771237231572, 0.12162606385263, 
    0.24325212770526, 0.394738572265145, 0.705710715103738, 0.440772139427464, 
    0.532938710021193, 0.257050482766198, 0.336767321450351)), row.names = c(NA, 
-54L), class = c("tbl_df", "tbl", "data.frame"))

I was able to figure out what I wanted to do with the code below, but surely there must be a more elegant way as this way required me to create unnecessary intermediates.

#create an intermediate data.frame that contains just the means and their values from the first half of original df
df.mean <- head(df, -27)
#rename columns 3 and 4
colnames(df.mean)[3] <- "mean"
colnames(df.mean)[4] <- "mean_value"


#create another intermediate data.frame with standard error values from the bottom half of original df
df.se <- df[28:54,]
#rename columns 3 and 4
colnames(df.se)[3] <- "se"
colnames(df.se)[4] <- "se_value"


#cbind those together to get desired result
df.final <- cbind(df.mean, df.se[,3:4])

#remove intermediates
rm(df.mean); rm(df.se)

Is there a simpler way to accomplish this, perhaps using pipes or some functions in the tidyverse, or even with base R?

E. Moore
  • 321
  • 1
  • 5
  • 11

3 Answers3

3

Here is an approach with pivot_wider and unnest:

library(tidyverse)
df %>%
    mutate(class = str_extract(mean_se,"(N|E|T)"),
           fun = str_extract(mean_se,"(mean|se)")) %>%
    pivot_wider(id_cols = c("Invasion","Growth"), names_from = "fun",
                values_from = c("mean_se","value")) %>%
  unnest()
# A tibble: 27 x 6
   Invasion Growth mean_se_mean mean_se_se value_mean value_se
   <fct>    <fct>  <chr>        <chr>           <dbl>    <dbl>
 1 Low      cover  meanNativeSR seN             0.769    0.281
 2 Low      cover  meanExoticSR seE             0.385    0.140
 3 Low      cover  meanTotalSR  seT             1.15     0.274
 4 Low      herb   meanNativeSR seN             0.231    0.122
 5 Low      herb   meanExoticSR seE             0        0    
 6 Low      herb   meanTotalSR  seT             0.231    0.122
 7 Low      woody  meanNativeSR seN             0.923    0.329
 8 Low      woody  meanExoticSR seE             1.38     0.180
 9 Low      woody  meanTotalSR  seT             2.54     0.243
10 Medium   cover  meanNativeSR seN             2.46     0.312
# … with 17 more rows

You'll get some warnings, but it should work nonetheless.

Ian Campbell
  • 23,484
  • 14
  • 36
  • 57
3

With tidyverse, we could do a group_split, change the column names, and do a inner_join

library(dplyr)
library(purrr)
df %>%
   group_split(grp = row_number() > 27, .keep = FALSE) %>% 
   map2(list(c('mean', 'mean_value'), c('se', 'se_value')),
        ~ {nm1 <- .y
           .x  %>%
             rename_at(3:4, ~ nm1) %>%
             mutate(rn = row_number())} ) %>% 
  reduce(inner_join) %>% 
  select(-rn)

-output

# A tibble: 27 x 6
#   Invasion Growth mean         mean_value se    se_value
#   <fct>    <fct>  <chr>             <dbl> <chr>    <dbl>
# 1 Low      cover  meanNativeSR      0.769 seN      0.281
# 2 Low      herb   meanNativeSR      0.231 seN      0.122
# 3 Low      woody  meanNativeSR      0.923 seN      0.329
# 4 Medium   cover  meanNativeSR      2.46  seN      0.312
# 5 Medium   herb   meanNativeSR      6.85  seN      0.706
# 6 Medium   woody  meanNativeSR      0.538 seN      0.243
# 7 High     cover  meanNativeSR      1.69  seN      0.365
# 8 High     herb   meanNativeSR      1.77  seN      0.281
# 9 High     woody  meanNativeSR      1.15  seN      0.191
#10 Low      cover  meanExoticSR      0.385 seE      0.140
# … with 17 more rows
akrun
  • 874,273
  • 37
  • 540
  • 662
2

I think that other than pulling things together there is no shorter and easier way to accomplish your goals. The longest part of your code is assigning the new colnames, which can't really be shortened. The rest can be put into a single line. But really, you have to always balance terseness and readability.

And the the dplyr methods shown above are really neat, but I believe they are meant to deal with more complex/general cases than yours.

df_final_2 <- cbind(head(df, -27), df[28:54,3:4])
colnames(df_final_2)[3:6] <- c("mean", "mean_value","se", "se_value")
Mario Niepel
  • 1,095
  • 4
  • 19