2

I have a data.table with many rows that look like this in R:

    V1        V2       V3    V4   V5  V6  V7  V8   V9           V10
 NCBINCC    GenBank   gene  331 1008  .   -   .   gene_id=UL1   protein_id=ABV71500.1
 NCBINCC    GenBank   gene  1009 1120  .  -   .  gene_id=UL4   protein_id=ABV71520
 NCBINCC    GenBank   gene  1135 1200  .  -   .  gene_id=UL6   protein_id=ABV71525

Is there a simple way to add quotes around the values that follow gene_id= and protein_id=, so that the quotes enclose only the gene and protein identifiers, as in the following output:

    V1        V2       V3    V4   V5  V6  V7  V8   V9            V10
 NCBINCC    GenBank   gene  331 1008  .   -   .   gene_id="UL1"  protein_id="ABV71500.1"
 NCBINCC    GenBank   gene  1009 1120 .   -   .  gene_id="UL4"  protein_id="ABV71520"
 NCBINCC    GenBank   gene  1135 1200 .   -   .  gene_id="UL6"  protein_id="ABV71525"

I have seen this answer for shell, but wanted to know if there was a way to also do it in R. Thank you kindly.

jay.sf
  • 60,139
  • 8
  • 53
  • 110
vanish007
  • 323
  • 1
  • 10

4 Answers

2

We can use str_replace with a regex lookbehind to match the position just after the =, capture the alphanumeric characters (including the .), and replace them with the backreference (\\1) wrapped in quotes

library(stringr)
library(dplyr)
# Quote the value that follows "=" in columns V9 and V10. The lookbehind pins
# the match to the position right after "=", and the captured run of
# alphanumerics and dots is written back wrapped in double quotes.
df1 <- mutate(
  df1,
  across(
    c(V9, V10),
    function(col) str_replace(col, "(?<=\\=)([[:alnum:].]+)", '"\\1"')
  )
)

-output

df1
#  V1      V2   V3   V4   V5 V6 V7 V8            V9                     V10
#1 NCBINCC GenBank gene  331 1008  .  -  . gene_id="UL1" protein_id="ABV71500.1"
#2 NCBINCC GenBank gene 1009 1120  .  -  . gene_id="UL4"   protein_id="ABV71520"
#3 NCBINCC GenBank gene 1135 1200  .  -  . gene_id="UL6"   protein_id="ABV71525"

I forgot to include the corresponding base R option:

# Base R version: run sub() over each target column. lapply() supplies the
# column as the unnamed argument, which sub() takes as `x` because `pattern`
# and `replacement` are given by name.
nm1 <- c("V9", "V10")
df1[nm1] <- lapply(
  df1[nm1],
  sub,
  pattern = "(?<=\\=)([[:alnum:].]+)",
  replacement = '"\\1"',
  perl = TRUE
)

data

# Reproducible example input: the three rows shown in the question, with the
# unquoted key=value strings in V9 and V10.
df1 <- data.frame(
  V1 = rep("NCBINCC", 3),
  V2 = rep("GenBank", 3),
  V3 = rep("gene", 3),
  V4 = c(331L, 1009L, 1135L),
  V5 = c(1008L, 1120L, 1200L),
  V6 = rep(".", 3),
  V7 = rep("-", 3),
  V8 = rep(".", 3),
  V9 = c("gene_id=UL1", "gene_id=UL4", "gene_id=UL6"),
  V10 = c("protein_id=ABV71500.1", "protein_id=ABV71520", "protein_id=ABV71525"),
  stringsAsFactors = FALSE
)
akrun
  • 874,273
  • 37
  • 540
  • 662
1

I would use mutate and stringr:

# library() stops with an error if a package is missing; require() would only
# return FALSE and let the script fail later with a less helpful message.
library(dplyr)
library(stringr)

# Wrap everything after the first "=" in V9 and V10 in double quotes.
myTable %>%
  mutate(across(
    c(V9, V10),
    function(x) {
      # Everything up to and including the FIRST "=". Using "[^=]*" keeps this
      # in step with secondHalf below, so a value that itself contains "=" is
      # not duplicated across the two halves.
      firstHalf <- str_extract(x, "^[^=]*=")
      # Everything after the first "="
      secondHalf <- str_extract(x, "(?<==).*$")

      # Add quotes to secondHalf
      newSecondHalf <- paste0("\"", secondHalf, "\"")

      # Glue it all back together
      out <- paste0(firstHalf, newSecondHalf)

      # str_extract() returns NA for entries without any "=" (including NA
      # entries); pass those through unchanged instead of pasting the text
      # "NA" into the result.
      noEquals <- is.na(firstHalf)
      out[noEquals] <- as.character(x)[noEquals]
      out
    }
  ))
Captain Hat
  • 2,444
  • 1
  • 14
  • 31
1

Assuming a data table named mydatatable, I used gsub and paste0.

library(dplyr)

# Open the quote right after the first "=" and close it at the end of the
# string. sub() is used rather than gsub() so that only the key/value
# separator is rewritten; a value that itself contains "=" would otherwise
# pick up stray quotes in the middle.
mydatatable <- mydatatable %>%
  mutate(across(c(V9, V10), ~paste0(sub("=", '="', ., fixed = TRUE), '"')))
norie
  • 9,609
  • 2
  • 11
  • 18
1

If you are tired of packages, you may want to try sub in an lapply.

# Quote everything after the first "=" in V9 and V10 using base R only.
# The argument names are written out in full: abbreviations such as
# `pa`/`re` only work through partial argument matching, which is fragile
# and hard to read.
v <- c("V9", "V10")
d[v] <- lapply(d[v], sub, pattern = "\\=(.*)", replacement = '="\\1"')
d
#        V1      V2   V3   V4   V5 V6 V7 V8            V9                     V10
# 1 NCBINCC GenBank gene  331 1008  .  -  . gene_id="UL1" protein_id="ABV71500.1"
# 2 NCBINCC GenBank gene 1009 1120  .  -  . gene_id="UL4"   protein_id="ABV71520"
# 3 NCBINCC GenBank gene 1135 1200  .  -  . gene_id="UL6"   protein_id="ABV71525"

Data

# Example data: parse the whitespace-delimited sample table; the first line
# of the text supplies the column names. TRUE is spelled out because `T` is
# an ordinary variable that can be reassigned.
d <- read.table(header = TRUE, text = 'V1        V2       V3    V4   V5  V6  V7  V8   V9           V10
NCBINCC    GenBank   gene  331 1008  .   -   .   gene_id=UL1   protein_id=ABV71500.1
NCBINCC    GenBank   gene  1009 1120  .  -   .  gene_id=UL4   protein_id=ABV71520
NCBINCC    GenBank   gene  1135 1200  .  -   .  gene_id=UL6   protein_id=ABV71525')
jay.sf
  • 60,139
  • 8
  • 53
  • 110