0

I am "converting" from data.frame to data.table

I now have a data.table:

library(data.table)


DT = data.table(ID = c("ab_cd.de","ab_ci.de","fb_cd.de","xy_cd.de"))
DT

         ID
1: ab_cd.de
2: ab_ci.de
3: fb_cd.de
4: xy_cd.de  

new_DT<- data.table(matrix(ncol = 2))
colnames(new_DT)<- c("test1", "test2")

I would like to to first: delete ".de" after every entry and in the next step separate every entry by the underscore and save the output in two new columns. The final output should look like this:

   test1 test2
1    ab    cd
2    ab    ci
3    fb    cd
4    xy    cd

In data.frame I did:

df = data.frame(ID = c("ab_cd.de","ab_ci.de","fb_cd.de","xy_cd.de"))
df

         ID
1: ab_cd.de
2: ab_ci.de
3: fb_cd.de
4: xy_cd.de


df[,1] <- gsub(".de", "", df[,1], fixed=FALSE)
df

      ID
1: ab_cd
2: ab_ci
3: fb_cd
4: xy_cd



 n <- 1
for (i in (1:length(df[,1]))){
    new_df[n,] <-str_split_fixed(df[i,1], "_", 2)
    n <- n+1
}
new_df

  test1 test2
1    ab    cd
2    ab    ci
3    fb    cd
4    xy    cd

Any help is appreciated!

Rivka
  • 307
  • 1
  • 5
  • 19

2 Answers2

2

You can use tstrsplit to split the column into two after removing the suffix (.de) with sub:

DT[, c("test1", "test2") := tstrsplit(sub("\\.de", "", ID), "_")][, ID := NULL][]

#   test1 test2
#1:    ab    cd
#2:    ab    ci
#3:    fb    cd
#4:    xy    cd
Psidom
  • 209,562
  • 33
  • 339
  • 356
  • what do I need the [] for at the end? – Rivka May 27 '17 at 18:38
  • 1
    `[]` here is a trick to print the result out. As `:=` modify columns in place but doesn't return anything. If you don't need to print, then you don't it. – Psidom May 27 '17 at 18:41
  • I need to repeat every row 5 times. i tried `... :=rep(tstrsplit(sub.....),16)` but this is adding columns, not rows – Rivka May 31 '17 at 13:47
  • 1
    You can repeat rows afterwards. like so `DT[, c("test1", "test2") := tstrsplit(sub("\\.de", "", ID), "_")][, ID := NULL][, lapply(.SD, rep, each=5)]`. – Psidom May 31 '17 at 13:50
1

We can use extract from tidyr

library(tidyr)
df %>% 
   extract(ID, into = c('test1', 'test2'), '([^_]+)_([^.]+).*')
#  test1 test2
#1    ab    cd
#2    ab    ci
#3    fb    cd
#4    xy    cd

Or using data.table

library(data.table)
DT[, .(test1 = sub('_.*', '', ID), test2 = sub('[^_]+_([^.]+)\\..*', '\\1', ID))]
#   test1 test2
#1:    ab    cd
#2:    ab    ci
#3:    fb    cd
#4:    xy    cd
akrun
  • 874,273
  • 37
  • 540
  • 662