0

I need to convert a "wide" dataframe of annually repeated measures on individuals into "long" format so that I can model it like lm(y_year2 ~ x_year1) as well as lm(z_year2 ~ y_year2)

I can get it into the format I want "by hand" but cannot get figure out how to melt/dcast it into the shape I want

Below I've illustrated what I'm doing with some simulated data

The dataframe is like this in wide format, one individual per line

ID  SITE    L_03  M_03  R_03  L_04  M_04  R_04  L_05  M_05  R_05
1   forest    X     a   YES     Y     b   YES     Z     c   NO
2   forest    ... 

I'd like it in LONG format:

ID  SITE    L_year1  L_year2  M_year1  M_year2  R_year1   R_year2   year1  year2
1   forest      Z       Y       a         b       YES       YES       03    04
1   forest      Y       Z       b         c       YES       NO        04    05
2   forest      ...  
2   forest      ...

Some Simulated data: L and M are numeric (length & mass), R is a Yes/No factor (reproductive), 3 years of repeated measurements (2003-2005)

    ID <- 1:10; SITE <- c(rep("forest",3), rep("swamp",3), rep("field",4))
    L_03 <- round(rnorm(10, 100, 1),3) ; M_03 <- round((10 + L_03*0.25 + rnorm(10, 0, 1)), 3)
    R_03 <- sample(c("Yes", "No"), 10, replace = TRUE) ; L_04 <- round((2 + L_03*1.25 + rnorm(10, 1,10)), 3) 
    M_04 <- round((10 + L_04*0.25 + rnorm(10, 0,10)), 3) ;R_04 <- sample(c("Yes", "No"), 10, replace = TRUE)
    L_05 <- round((2 + L_04*1.25 + rnorm(10, 1,10)),3) ; M_05 <- round((10 + L_05*0.25 + abs(rnorm(10, 0,10))),3)
    R_05 <- sample(c("Yes", "No"), 10, replace = TRUE); rm_data <- data.frame(ID, SITE, L_03, M_03, R_03, L_04, M_04,R_04, L_05, M_05, R_05)

Approach 1: My ad hoc reshaping "by hand" with rbind 1st, make subset with 2003 & 2004 data, then another w/ 2004 & 2005

rm_data1 <- cbind(rm_data[ ,c(1,2,3:5, 6:8)], rep(2003,10), rep(2004,10))
rm_data2 <- cbind(rm_data[ ,c(1,2,6:8, 9:11)],rep(2004,10), rep(2005,10))
names(rm_data1)[3:10]<- c("L1", "M1", "R1", "L2", "M2", "R2", "yr1", "yr2")
names(rm_data2)[3:10]<- c("L1", "M1", "R1", "L2", "M2", "R2", "yr1", "yr2")
data3 <- rbind(rm_data1, rm_data2)

Approach 2?: I'd like to do this with reshape/melt/dcast. I can't figure out if I can use dcast directly on the wide dataframe or, once I melt it, how to dcast it into the format I want.

library(reshape2)
rm_measure_vars <- c("L_03", "M_03", "R_03", "L_04", "M_04","R_04", "L_05", "M_05", "R_05")
rm_data_melt <-  melt(data = rm_data, id.vars = c("ID", "SITE"), measure.vars = rm_measure_vars, value.name = "data")

I add a designator of the year the measurement was taken to the melted data

obs_year <- gsub("(.*)([0-9]{2})", "\\2", rm_data_melt$variable)
rm_data_melt <- cbind(rm_data_melt, obs_year)

The dcast seems like it should be something like this, but this is not yet what I need

dcast(data = rm_data_melt, formula = ID + SITE + obs_year ~ variable)
   ID   SITE obs_year    L_03   M_03 R_03    L_04   M_04 R_04    L_05   M_05 R_05
1   1 forest       03   99.96 35.364   No    <NA>   <NA> <NA>    <NA>   <NA> <NA>
2   1 forest       04    <NA>   <NA> <NA> 129.595 47.256  Yes    <NA>   <NA> <NA>
3   1 forest       05    <NA>   <NA> <NA>    <NA>   <NA> <NA> 177.607 58.204  Yes

Any suggestions would be greatly appreciated

N Brouwer
  • 4,778
  • 7
  • 30
  • 35
  • You appear to be asking to duplicate portions of the data. Year values for the second year are being expected to be in two different columns, the end of year one col and the beginning of year 2 col. – IRTFM Jan 04 '13 at 19:01
  • Exactly. These data are being used for parameterizing an individual-based demographic model. Individuals will be treated as a random effect during parameter estimation. – N Brouwer Jan 04 '13 at 19:07
  • Well, I would add 3 columns and then do the reshaping. – IRTFM Jan 04 '13 at 19:27

2 Answers2

2

I gave it some try. The reshape is the easy part. The rest needs some semi-manual handling, I believe. The following should give you what you want.

output <- reshape(rm_data, idvar=c("ID","SITE"), varying=3:11, 
                v.names=c("L_","M_","R_"), direction="long")
output$time <- output$time + 2    # to get the year
names(output)[3:6] <- c("year1", "L_year1", "M_year1", "R_year1")
output$year2 <- output$year1+1
rownames(output) <- c()

sapply(output[,4:6], function(x) {
  i <- ncol(output)+1
  output[,i] <<- x[c(2:length(x), NA)]
  names(output)[i] <<- sub("1","2",names(output)[i-4])
})

output <- output[,c(1,2,4,8,5,9,6,10,3,7)]    # rearrange columns as necessary

Hope this helps!

Theodore Lytras
  • 3,955
  • 1
  • 18
  • 25
0

Install onetree packages. devtools::install_github("yikeshu0611/onetree") library(onetree)

3 steps, using onetree package

1 step

reshape the data to a long data

long1=reshape_toLong(data = rm_data,
               id = "ID",
               j = "year",
               value.var.prefix = c("L_","M_","R_"))

2nd step

drop 5 year, choose 3 and 4 year; duplicated year as y

long2=long1[long1$year!=5,]
long2$y=long2$year

reshape long2 to a wide data by year

wide1=reshape_toWide(data = long2,
               id = "ID",
               j = "year",
               value.var.prefix = c("L_","M_","R_","y")
               )

Now, we get data with year 3 and year 4, whic is year1 and year2 in your purpose data. So we repalce 3 with 1, 4 with 2 in the colnames.

colnames(wide1)=gsub(3,1,colnames(wide1))
colnames(wide1)=gsub(4,2,colnames(wide1))

3rd step

do 2nd step again, this time, we drop year3, we choose year4 and year5.

long3=long1[long1$year!=3,]
long3$y=long3$year
wide2=reshape_toWide(data = long3,
                     id = "ID",
                     j = "year",
                     value.var.prefix = c("L_","M_","R_","y")
)
colnames(wide2)=gsub(4,1,colnames(wide2))
colnames(wide2)=gsub(5,2,colnames(wide2))

last

rbind wide1 and wide2

data=rbind(wide1,wide2)
data[order(data$ID),]


   ID   SITE     L_1    M_1 R_1 y1     L_2    M_2 R_2 y2
1   1 forest 100.181 34.279 Yes  3  131.88 50.953  No  4
11  1 forest  131.88 50.953  No  4 158.642 50.255  No  5
2   2 forest 101.645 36.667 Yes  3 123.923 43.915  No  4
12  2 forest 123.923 43.915  No  4  163.81 55.979  No  5
3   3 forest  98.961 33.901 Yes  3 125.928 41.611  No  4
13  3 forest 125.928 41.611  No  4 165.865 57.417  No  5
4   4  swamp 100.807 36.254  No  3 117.856 48.634 Yes  4
14  4  swamp 117.856 48.634 Yes  4 137.487 50.945  No  5
5   5  swamp   99.75 33.881  No  3 132.419 50.563 Yes  4
15  5  swamp 132.419 50.563 Yes  4 168.461 58.373 Yes  5
6   6  swamp 100.463 34.859 Yes  3 122.884 40.301  No  4
16  6  swamp 122.884 40.301  No  4  152.85 57.491  No  5
7   7  field 102.527 34.521  No  3 123.363 35.935  No  4
17  7  field 123.363 35.935  No  4     168 55.692  No  5
8   8  field  99.957 35.236 Yes  3 139.083 34.793  No  4
18  8  field 139.083 34.793  No  4 177.648 62.638 Yes  5
9   9  field  100.16 36.454  No  3 135.468 45.115 Yes  4
19  9  field 135.468 45.115 Yes  4 180.666 57.233  No  5
10 10  field 100.037 35.612  No  3 139.165  46.95  No  4
20 10  field 139.165  46.95  No  4 169.333 55.782 Yes  5
zhang jing
  • 141
  • 9