12

I have a vector of strings

c("YSAHEEHHYDK", "HEHISSDYAGK", "TFAHTESHISK", "ISLGEHEGGGK", 
"LSSGYDGTSYK", "FGTGTYAGGEK", "VGASTGYSGLK", "TASGVGGFSTK", "SYASDFGSSAK", 
"LYSYYSSTESK")

For each string I would like to replace "Y", "S" or "T" with "pY", "pS" or "pT". But I don't want all the replacements to be in the same final string; I want each replacement to generate a new string, e.g.

"YSAHEEHHYDK" turns into

c("pYSAHEEHHYDK",
"YpSAHEEHHYDK",
"YSAHEEHHpYDK")
jpsmith
  • 11,023
  • 5
  • 15
  • 36
Pavel Shliaha
  • 773
  • 5
  • 16

4 Answers4

9

You could write a function in base R:

Edit:

Included the notion of zero-length as shown by @GKi

# Peptide sequences taken from the question.
strings <- c(
  "YSAHEEHHYDK", "HEHISSDYAGK", "TFAHTESHISK", "ISLGEHEGGGK",
  "LSSGYDGTSYK", "FGTGTYAGGEK", "VGASTGYSGLK", "TASGVGGFSTK",
  "SYASDFGSSAK", "LYSYYSSTESK"
)

# Locate every Y, S or T; gregexpr() returns one position vector per string.
reg <- gregexpr("[YST]", strings)

# Repeat each string once per hit and flatten the hit positions so the two
# line up element-wise. Giving every hit a match length of 0 makes the
# replacement form of regmatches() insert "p" in front of the residue
# instead of overwriting it.
`regmatches<-`(
  x = rep(strings, times = lengths(reg)),
  m = structure(unlist(reg), match.length = 0),
  value = "p"
)

#>  [1] "pYSAHEEHHYDK" "YpSAHEEHHYDK" "YSAHEEHHpYDK" "HEHIpSSDYAGK" "HEHISpSDYAGK"
#>  [6] "HEHISSDpYAGK" "pTFAHTESHISK" "TFAHpTESHISK" "TFAHTEpSHISK" "TFAHTESHIpSK"
#> [11] "IpSLGEHEGGGK" "LpSSGYDGTSYK" "LSpSGYDGTSYK" "LSSGpYDGTSYK" "LSSGYDGpTSYK"
#> [16] "LSSGYDGTpSYK" "LSSGYDGTSpYK" "FGpTGTYAGGEK" "FGTGpTYAGGEK" "FGTGTpYAGGEK"
#> [21] "VGApSTGYSGLK" "VGASpTGYSGLK" "VGASTGpYSGLK" "VGASTGYpSGLK" "pTASGVGGFSTK"
#> [26] "TApSGVGGFSTK" "TASGVGGFpSTK" "TASGVGGFSpTK" "pSYASDFGSSAK" "SpYASDFGSSAK"
#> [31] "SYApSDFGSSAK" "SYASDFGpSSAK" "SYASDFGSpSAK" "LpYSYYSSTESK" "LYpSYYSSTESK"
#> [36] "LYSpYYSSTESK" "LYSYpYSSTESK" "LYSYYpSSTESK" "LYSYYSpSTESK" "LYSYYSSpTESK"
#> [41] "LYSYYSSTEpSK"

Created on 2023-02-14 with reprex v2.0.2

You can create a small function to help you out.

#' Insert a marker in front of each match, producing one string per match
#'
#' @param x Character vector of sequences.
#' @param pattern Regular expression selecting the positions to mark.
#'   Defaults to "[YST]".
#' @param prefix Single string inserted immediately before each match.
#'   Defaults to "p".
#' @return A character vector holding, for every element of `x`, one copy
#'   per match with `prefix` inserted before that match. An element without
#'   any match is returned once, unchanged.
my_replace <- function(x, pattern = "[YST]", prefix = "p") {
  reg <- gregexpr(pattern, x)
  # rep() yields one copy of each input per match, lined up with the
  # flattened match positions. A match length of 0 turns every match into
  # an empty range, so `regmatches<-` inserts `prefix` instead of replacing
  # the matched text.
  `regmatches<-`(
    rep(x, lengths(reg)),
    structure(unlist(reg), match.length = 0),
    value = prefix
  )
}
Onyambu
  • 67,392
  • 3
  • 24
  • 53
7

Using the input xx from the Note at the end (the question's data plus two edge cases), we use stringi functions. In particular, note that assigning to stri_sub with an end position one less than the start position inserts the p character rather than replacing anything. If an input string is empty, i.e. "", or does not contain any of Y, S or T, then NA is returned for that string.

library(stringi)

# Insert "p" in front of every located match in `s`, giving one string per
# match. `loc` is a location matrix with a "start" column, as returned for
# one string by stri_locate_all().
add_p <- function(s, loc) {
  from <- loc[, "start"]
  # An end position one before the start describes an empty range, so the
  # assignment inserts "p" rather than overwriting a character.
  stri_sub(s, from = from, to = from - 1) <- "p"
  s
}

# Pair each string in xx with its own matrix of Y/S/T locations.
Map(
  add_p,
  xx,
  stri_locate_all(xx, regex = "[YST]")
)

giving

[1] NA

$ABC
[1] NA

$YSAHEEHHYDK
[1] "pYSAHEEHHYDK" "YpSAHEEHHYDK" "YSAHEEHHpYDK"

$HEHISSDYAGK
[1] "HEHIpSSDYAGK" "HEHISpSDYAGK" "HEHISSDpYAGK"

$TFAHTESHISK
[1] "pTFAHTESHISK" "TFAHpTESHISK" "TFAHTEpSHISK" "TFAHTESHIpSK"

# ...snip...

Note

This is the same as in the question except that we have added the first two strings.

# Question data preceded by two edge cases: an empty string and a string
# containing no Y, S or T.
xx <- c(
  "", "ABC",
  "YSAHEEHHYDK", "HEHISSDYAGK", "TFAHTESHISK", "ISLGEHEGGGK",
  "LSSGYDGTSYK", "FGTGTYAGGEK", "VGASTGYSGLK", "TASGVGGFSTK",
  "SYASDFGSSAK", "LYSYYSSTESK"
)
G. Grothendieck
  • 254,981
  • 17
  • 203
  • 341
5

Perhaps something like this with stringr and purrr.

str_locate_all() returns a 2-column matrix with the start and end of each pattern match, and str_sub(string, start) <- "p" conveniently accepts that same matrix as its start argument. Subtracting 1 from the current end column (i.e. [1, 1] becomes [1, 0]) makes each range empty, so all existing characters are kept and p is inserted.

library(stringr)
library(purrr)

# Peptide sequences taken from the question.
str_ <- c(
  "YSAHEEHHYDK", "HEHISSDYAGK", "TFAHTESHISK", "ISLGEHEGGGK",
  "LSSGYDGTSYK", "FGTGTYAGGEK", "VGASTGYSGLK", "TASGVGGFSTK",
  "SYASDFGSSAK", "LYSYYSSTESK"
)

# Walk over each peptide (named after itself via set_names()) together with
# its matrix of match locations.
map2(
  set_names(str_),
  str_locate_all(str_, "Y|S|T"),
  function(peptide, loc) {
    # Pull every end back by one so each range is empty; assigning "p" to
    # an empty range inserts it and keeps the matched residue.
    loc[, 2] <- loc[, 2] - 1
    str_sub(peptide, loc) <- "p"
    peptide
  }
)

Result as a named list:

#> $YSAHEEHHYDK
#> [1] "pYSAHEEHHYDK" "YpSAHEEHHYDK" "YSAHEEHHpYDK"
#> 
#> $HEHISSDYAGK
#> [1] "HEHIpSSDYAGK" "HEHISpSDYAGK" "HEHISSDpYAGK"
#> 
#> $TFAHTESHISK
#> [1] "pTFAHTESHISK" "TFAHpTESHISK" "TFAHTEpSHISK" "TFAHTESHIpSK"
#> 
#> $ISLGEHEGGGK
#> [1] "IpSLGEHEGGGK"
#> 
#> $LSSGYDGTSYK
#> [1] "LpSSGYDGTSYK" "LSpSGYDGTSYK" "LSSGpYDGTSYK" "LSSGYDGpTSYK" "LSSGYDGTpSYK"
#> [6] "LSSGYDGTSpYK"
#> 
#> $FGTGTYAGGEK
#> [1] "FGpTGTYAGGEK" "FGTGpTYAGGEK" "FGTGTpYAGGEK"
#> 
#> $VGASTGYSGLK
#> [1] "VGApSTGYSGLK" "VGASpTGYSGLK" "VGASTGpYSGLK" "VGASTGYpSGLK"
#> 
#> $TASGVGGFSTK
#> [1] "pTASGVGGFSTK" "TApSGVGGFSTK" "TASGVGGFpSTK" "TASGVGGFSpTK"
#> 
#> $SYASDFGSSAK
#> [1] "pSYASDFGSSAK" "SpYASDFGSSAK" "SYApSDFGSSAK" "SYASDFGpSSAK" "SYASDFGSpSAK"
#> 
#> $LYSYYSSTESK
#> [1] "LpYSYYSSTESK" "LYpSYYSSTESK" "LYSpYYSSTESK" "LYSYpYSSTESK" "LYSYYpSSTESK"
#> [6] "LYSYYSpSTESK" "LYSYYSSpTESK" "LYSYYSSTEpSK"

Created on 2023-02-15 with reprex v2.0.2

margusl
  • 7,804
  • 2
  • 16
  • 20
  • Close, but still not quite correct: YSAHEEHHYDK shouldn't have "pT" in it (third string of `[1] "pYSAHEEHHYDK" "YpSAHEEHHYDK" "YSAHEEHHpTDK"`). This is a deceptively difficult question – jared_mamrot Feb 15 '23 at 00:24
  • @jared_mamrot, indeed and edited, though now it's just a mimic, and convoluted one, of G. Grothendieck's neat solution. – margusl Feb 15 '23 at 01:38
3

A base variant similar to the methods from @G.Grothendieck and @margusl, using gregexpr to find the positions of Y, S or T and regmatches<-, like @onyambu, to insert p at these positions.

# Insert "p" into `s` at each position in `i`, returning one string per
# position. `i` is the match data for `s`, e.g. one element of the list
# returned by gregexpr().
sIn <- function(s, i) {
  # One copy of `s` per position; a match length of 0 makes the assignment
  # an insertion rather than a replacement.
  `regmatches<-`(
    x = rep(s, times = length(i)),
    m = structure(i, match.length = 0),
    value = "p"
  )
}
# Apply sIn() to each string together with its own match positions.
Map(
  sIn,
  s,
  gregexpr(pattern = "[YST]", text = s)
)
#[[1]]
#[1] ""
#
#$ABC
#[1] "ABC"
#
#$YSAHEEHHYDK
#[1] "pYSAHEEHHYDK" "YpSAHEEHHYDK" "YSAHEEHHpYDK"
#
#$HEHISSDYAGK
#[1] "HEHIpSSDYAGK" "HEHISpSDYAGK" "HEHISSDpYAGK"
#...

Or using str_sub<- and str_locate_all from stringr with a non consuming look ahead (?=[YST]).

library(stringr)
# The lookahead (?=[YST]) does not consume the residue, so assigning "p" at
# each located position inserts it in front of Y, S or T.
Map(
  `str_sub<-`,
  s,
  str_locate_all(s, "(?=[YST])"),
  value = "p"
)
#[[1]]
#character(0)
#
#$ABC
#character(0)
#
#$YSAHEEHHYDK
#[1] "pYSAHEEHHYDK" "YpSAHEEHHYDK" "YSAHEEHHpYDK"
#
#$HEHISSDYAGK
#[1] "HEHIpSSDYAGK" "HEHISpSDYAGK" "HEHISSDpYAGK"
#...

Or the same but using stringi.

library(stringi)
# Same lookahead approach with stringi: "p" is assigned at each position
# located in front of Y, S or T.
Map(
  `stri_sub<-`,
  s,
  stri_locate_all(s, regex = "(?=[YST])"),
  value = "p"
)
#[[1]]
#[1] NA
#
#$ABC
#[1] NA
#
#$YSAHEEHHYDK
#[1] "pYSAHEEHHYDK" "YpSAHEEHHYDK" "YSAHEEHHpYDK"
#
#$HEHISSDYAGK
#[1] "HEHIpSSDYAGK" "HEHISpSDYAGK" "HEHISSDpYAGK"
#...

Data (added the first two strings like @G.Grothendieck)

# Question data preceded by two edge cases: an empty string and a string
# containing no Y, S or T.
s <- c(
  "", "ABC",
  "YSAHEEHHYDK", "HEHISSDYAGK", "TFAHTESHISK", "ISLGEHEGGGK",
  "LSSGYDGTSYK", "FGTGTYAGGEK", "VGASTGYSGLK", "TASGVGGFSTK",
  "SYASDFGSSAK", "LYSYYSSTESK"
)
GKi
  • 37,245
  • 2
  • 26
  • 48