5

I have a dataset like that:

Old <- data.frame(
  X1= c(
    "AD=17795,54;ARL=139;DEA=20;DER=20;DP=1785",
    "DP=4784;AD=4753,23;ARL=123;DEA=5;DER=5",
    "ARL=149;AD=30,9;DEA=25;DER=25;DP=3077",
    "AD=244,49;ARL=144;DEA=7;DER=7;DP=245"
    ))


X1
AD=17795,54;ARL=139;DEA=20;DER=20;DP=1785
DP=4784;AD=4753,23;ARL=123;DEA=5;DER=5
ARL=149;AD=30,9;DEA=25;DER=25;DP=3077
AD=244,49;ARL=144;DEA=7;DER=7;DP=245 

I want to extract ";" seperated value for AD=xxx,xx than add to a new column: Desired output is:

X1                                              X2
AD=17795,54;ARL=139;DEA=20;DER=20;DP=1785       17795,54
DP=4784;AD=4753,23;ARL=123;DEA=5;DER=5          4753,23
ARL=149;AD=30,9;DEA=25;DER=25;DP=3077           30,9
AD=244,49;ARL=144;DEA=7;DER=7;DP=245            244,49

I have tried that:

Old$X2<-mapply(
  function(x,  i) x[i],
  strsplit(X1, ";"),
  lapply(strsplit(X1, ";"), function(x) which(x == "AD="))
)
Anoushiravan R
  • 21,622
  • 3
  • 18
  • 41
ersan
  • 393
  • 1
  • 9

5 Answers5

7

I thought this could also help you:

  • We first capture the AD= characters and then reset the starting point of the reported match with \\K in a way that it tells the regex engine to drop AD= and start the matching pattern from then on again
Old$X2 <- regmatches(Old$X1, gregexpr("(AD=)\\K[0-9,.]+(?=;)", Old$X1, perl = TRUE))
Old

                                         X1       X2
1 AD=17795,54;ARL=139;DEA=20;DER=20;DP=1785 17795,54
2    DP=4784;AD=4753,23;ARL=123;DEA=5;DER=5  4753,23
3     ARL=149;AD=30,9;DEA=25;DER=25;DP=3077     30,9
4      AD=244,49;ARL=144;DEA=7;DER=7;DP=245   244,49
Anoushiravan R
  • 21,622
  • 3
  • 18
  • 41
5

Here is a tidyverse solution to separate in 5 columns

Libraries

library(tidyverse)

Data

Old <- data.frame(
   X1= c(
      "AD=17795,54;ARL=139;DEA=20;DER=20;DP=1785",
      "DP=4784;AD=4753,23;ARL=123;DEA=5;DER=5",
      "ARL=149;AD=30,9;DEA=25;DER=25;DP=3077",
      "AD=244,49;ARL=144;DEA=7;DER=7;DP=245"
   ))

Code

Old %>% 
   # Creating 5 columns based on the separator ";"
   separate(col = X1,sep = ";", into = paste0("v",1:5)) %>% 
   # Pivotting data
   pivot_longer(cols = everything()) %>% 
   # Separating the value column based on the separator "="
   separate(value,into = c("var","value"),sep = "=") %>% 
   select(-name) %>% 
   pivot_wider(names_from = var,values_from = value) %>% 
   unnest()

Output

# A tibble: 4 x 5
  AD       ARL   DEA   DER   DP   
  <chr>    <chr> <chr> <chr> <chr>
1 17795,54 139   20    20    1785 
2 4753,23  123   5     5     4784 
3 30,9     149   25    25    3077 
4 244,49   144   7     7     245  
Vinícius Félix
  • 8,448
  • 6
  • 16
  • 32
4

We can use sub

sub(".*AD\\=(\\d+,\\d+);.*", "\\1", Old$X1)
akrun
  • 874,273
  • 37
  • 540
  • 662
  • This is worked when AD=xxx,xx is at the front. AD= can sometimes be in the middle or at the end. – ersan Sep 02 '21 at 21:42
3

Using str_match from stringr -

Old$X2 <- stringr::str_match(Old$X1, 'AD=(\\d+,\\d+)')[, 2]
Old

#                                         X1       X2
#1 AD=17795,54;ARL=139;DEA=20;DER=20;DP=1785 17795,54
#2    DP=4784;AD=4753,23;ARL=123;DEA=5;DER=5  4753,23
#3     ARL=149;AD=30,9;DEA=25;DER=25;DP=3077     30,9
#4      AD=244,49;ARL=144;DEA=7;DER=7;DP=245   244,49

str_match returns a matrix, [, 2] is used to extract the capture group from the regex.

Ronak Shah
  • 377,200
  • 20
  • 156
  • 213
3

Another base R option using regmatches

transform(
  Old,
  X2 = unlist(regmatches(X1, gregexpr("(?<=AD=)(\\d+,)+\\d+(?=;)", X1, perl = TRUE)))
)

gives

                                         X1       X2
1 AD=17795,54;ARL=139;DEA=20;DER=20;DP=1785 17795,54
2    DP=4784;AD=4753,23;ARL=123;DEA=5;DER=5  4753,23
3     ARL=149;AD=30,9;DEA=25;DER=25;DP=3077     30,9
4      AD=244,49;ARL=144;DEA=7;DER=7;DP=245   244,49
ThomasIsCoding
  • 96,636
  • 9
  • 24
  • 81