4

I have a data frame that looks like this (4 rows and 5 columns):

Marker ind1 ind2 ind3 ind4
mark1             CT             TT             CT             TT
mark2             AG             AA             AG             AA
mark3             AC             AA             AC             AA
mark4             CT             TT             CT             TT

What I want to do is split each of the columns (except the first column) into two columns, so the output should look like this (4 rows and 9 columns):

Marker ind1 ind1 ind2 ind2 ind3 ind3 ind4 ind4
mark1             C T             T T             C T             T T
mark2             A G             A A             A G             A A
mark3             A C             A A             A C             A A
mark4             C T             T T             C T             T T

I know how to split one column

# Split each string of one column into single characters, then stack the
# resulting character vectors as the rows of a matrix.
do.call("rbind", strsplit(test[["JRP4RA6119.039"]], split = ""))

which gives this:

      [,1] [,2]
 [1,] "C"  "T" 
 [2,] "A"  "G" 
 [3,] "A"  "C" 
 [4,] "C"  "T" 

What I would like is to loop this over all of the columns and collect the result in a single data frame.

Thanks in advance.

Jaap
  • 81,064
  • 34
  • 182
  • 193
mahmood
  • 1,203
  • 5
  • 16
  • 27
  • Ah, OK. But the column names are not important; I can add them as a row later. – mahmood May 05 '15 at 13:28
  • 4
    @DatamineR, you _can_ have duplicated column names in a data.frame but it's not a good idea to do that. For example `setNames(data.frame(1, 2), c("x", "x"))` – talat May 05 '15 at 13:50

3 Answers3

5

I've got the feeling that it is a bit far-fetched but:

# Split every column of `test` except the first into one column per character.
# lapply() over `test[-1]` (list-style indexing) replaces
# apply(test[, -1], 2, ...): it avoids coercing the data frame to a matrix and
# still works when only one column remains, where `test[, -1]` would drop to a
# plain vector. as.character() lets factor columns be split as well.
split_cols <- lapply(test[-1], function(x) {
  # strsplit(, "") returns one character vector per row; rbind stacks them
  # into a matrix with one row per row of `test`.
  do.call("rbind", strsplit(as.character(x), ""))
})
test_split <- data.frame(
  Marker = test$Marker,
  do.call("cbind", split_cols),
  stringsAsFactors = FALSE  # spelled out: `F` is an ordinary, reassignable name
)
# Name the new columns <original>_1 and <original>_2. `each` is written in
# full rather than relying on partial argument matching (`e = 2`).
colnames(test_split)[-1] <- paste(
  rep(colnames(test)[-1], each = 2), 1:2,
  sep = "_"
)

test_split
#      Marker JRP4RA6119.039_1 JRP4RA6119.039_2 JRP4RA6124.029_1 JRP4RA6124.029_2 JRP4RA6133.051_1 JRP4RA6133.051_2 JRP4RA6125.009_1 JRP4RA6125.009_2
#1 s7e4419xxx                C                T                T                T                C                T                T                T
#2 s7e7001s01                A                G                A                A                A                G                A                A
#3 s7e3049xxx                A                C                A                A                A                C                A                A
#4 s7e4727xxx                C                T                T                T                C                T                T                T
Cath
  • 23,906
  • 5
  • 52
  • 86
5

You could also try cSplit_f from splitstackshape

library(splitstackshape)
# Insert a comma between every pair of adjacent word characters
# ("CT" -> "C,T") in all columns except the first, so that cSplit_f() has a
# delimiter to split on.
df1[-1] <- lapply(
  df1[-1],
  gsub,
  pattern = "(?<=\\w)(?=\\w)",
  replacement = ",",
  perl = TRUE
)
# Split columns 2..ncol(df1) on the inserted comma.
cSplit_f(df1, 2:ncol(df1), sep = ",")
#   Marker ind1_1 ind1_2 ind2_1 ind2_2 ind3_1 ind3_2 ind4_1 ind4_2
#1:  mark1      C      T      T      T      C      T      T      T
#2:  mark2      A      G      A      A      A      G      A      A
#3:  mark3      A      C      A      A      A      C      A      A
#4:  mark4      C      T      T      T      C      T      T      T

Or as @Ananda Mahto suggested, cSplit may be more efficient on large datasets, and this can be used directly without changing the delimiter.

# Split every column except the first on the empty string, i.e. between
# characters; stripWhite is switched off for this call.
cSplit(df1, splitCols = names(df1)[-1], sep = "", stripWhite = FALSE)
#   Marker ind1_1 ind1_2 ind2_1 ind2_2 ind3_1 ind3_2 ind4_1 ind4_2
#1:  mark1      C      T      T      T      C      T      T      T
#2:  mark2      A      G      A      A      A      G      A      A
#3:  mark3      A      C      A      A      A      C      A      A
#4:  mark4      C      T      T      T      C      T      T      T

Or using tstrsplit from data.table

library(data.table) # v1.9.5+
setDT(df1)
# For every column except the first (.SDcols = -1), tstrsplit() splits the
# strings on "" and returns one list element per character position;
# unlist(recursive = FALSE) flattens the per-column lists into one list of
# columns, which is then bound next to the Marker column.
cbind(
  Marker = df1$Marker,
  df1[, unlist(lapply(.SD, tstrsplit, ""), recursive = FALSE), .SDcols = -1]
)
#   Marker ind11 ind12 ind21 ind22 ind31 ind32 ind41 ind42
#1:  mark1     C     T     T     T     C     T     T     T
#2:  mark2     A     G     A     A     A     G     A     A
#3:  mark3     A     C     A     A     A     C     A     A
#4:  mark4     C     T     T     T     C     T     T     T

data

# Example data: four rows, a Marker label column and four `ind` columns that
# each hold a two-letter string.
df1 <- data.frame(
  Marker = c("mark1", "mark2", "mark3", "mark4"),
  ind1 = c("CT", "AG", "AC", "CT"),
  ind2 = c("TT", "AA", "AA", "TT"),
  ind3 = c("CT", "AG", "AC", "CT"),
  ind4 = c("TT", "AA", "AA", "TT"),
  stringsAsFactors = FALSE
)
Community
  • 1
  • 1
akrun
  • 874,273
  • 37
  • 540
  • 662
  • Why `cSplit_f` here instead of `cSplit`? Speed? – Sam Firke May 05 '15 at 13:58
  • @SamFirke `cSplit` can be used for multiple columns, according to `?cSplit_f`, `A variation of the ‘concat.split’ family of functions designed for large _rectangular_ datasets. This function makes use of ‘fread’ from the "data.table" package for very speedy splitting of concatenated columns of data.` – akrun May 05 '15 at 13:58
  • 3
    @SamFirke, I would actually just recommend `cSplit(df1, names(df1)[-1], "", stripWhite = FALSE)` since there may be memory issues with very large datasets when using `cSplit_f` due to how "data.table" preallocates columns. – A5C1D2H2I1M1N2O1R2T1 May 05 '15 at 14:03
  • @AnandaMahto I tried the `cSplit_f(df1, names(df1)[-1], sep="", stripWhite = FALSE)`, but didn't work – akrun May 05 '15 at 14:05
  • 4
    @akrun, nope, it won't. That's why I suggested `cSplit` :-) – A5C1D2H2I1M1N2O1R2T1 May 05 '15 at 14:06
0
> b <- as.data.frame(a[, 1])  # start `b` from the first column of `a`
> b[, 2] <- substr(a[, 2], 1, 1)  # 1st character of column 2 of `a`
> b[, 3] <- substr(a[, 2], 2, 2)  # 2nd character of column 2 of `a`
> b[, 4] <- substr(a[, 3], 1, 1)  # same pattern for column 3 ...
> b[, 5] <- substr(a[, 3], 2, 2)
> b[, 6] <- substr(a[, 4], 1, 1)  # ... column 4 ...
> b[, 7] <- substr(a[, 4], 2, 2)
> b[, 8] <- substr(a[, 5], 1, 1)  # ... and column 5
> b[, 9] <- substr(a[, 5], 2, 2)
> head(b)
  a[, 1] V2 V3 V4 V5 V6 V7 V8 V9
1  mark1  C  T  T  T  C  T  T  T
2  mark2  A  G  A  A  A  G  A  A
3  mark3  A  C  A  A  A  C  A  A
4  mark4  C  T  T  T  C  T  T  T
> dim(b)
[1] 4 9
> names(b) <- c("Marker", "ind1", "ind1","ind2", "ind2", "ind3", "ind3", "ind4", "ind4")  # each "indN" name is deliberately used twice, once per split half
> head(b)
  Marker ind1 ind1 ind2 ind2 ind3 ind3 ind4
1  mark1    C    T    T    T    C    T    T
2  mark2    A    G    A    A    A    G    A
3  mark3    A    C    A    A    A    C    A
4  mark4    C    T    T    T    C    T    T
  ind4
1    T
2    A
3    A
4    T
> 

You could easily make this into a loop, but with the relatively small number of columns I didn't have a need to.

To make it into a loop just set it up as

# Loop form of the column-by-column substr() calls shown above. Assumes `b`
# already holds the first column of `a` (b <- as.data.frame(a[, 1])) and that
# every other column of `a` contains two-character strings.
# seq_len(ncol(a))[-1] is empty when `a` has a single column, whereas
# 2:ncol(a) would count down (2, 1) and run the body on invalid indices.
for (i in seq_len(ncol(a))[-1]) {
  b[, 2 * i - 2] <- substr(a[, i], 1, 1)  # first character of column i
  b[, 2 * i - 1] <- substr(a[, i], 2, 2)  # second character of column i
}
Hack-R
  • 22,422
  • 14
  • 75
  • 131