I have an AppleScript that finds and replaces about a hundred terms using regular expressions. I'd like to import these find-and-replace rules into R. So, in Script Editor, I saved the AppleScript as a text file and imported it into R via readLines(). The dput() result of this import looks like punct.out, below. When I create my own data frame of patterns and replacements from raw vectors rather than from the import (see punct, below), the find and replace on a test string (see test, below) works just fine. But when I try the same command with the imported data frame, it doesn't work: it returns NA.
So somehow the imported text is not being interpreted as regular expressions, or as character vectors... I can't figure it out.
# Structure of my imported patterns and replacements (dput() output).
# A data frame with two character columns, declared with 127 rows:
#   replace - the patterns to search for
#   with    - the replacement text for each pattern
# The first two and the last two `replace` entries are NA (paired with the
# literal string "character(0)" in `with`), and several `replace` entries are
# empty strings.
# NOTE(review): the NA / empty / "character(0)" entries look like artifacts of
# the import rather than real rules -- confirm, and consider dropping them
# before using this table.
punct.out<-structure(list(replace = c(NA, NA, "good-bye[a-z]+|good-bye",
"good bye[a-z]+|good bye", "good-", "ill at ease", "ill-", "-like",
" well,", "- well,", ", well,", "as well", ".,", ".... well",
"... well", ". Well,", ": well,", "well-", "well,", "well,",
"well,", "Well,", "- okay,", ", okay,", "okay,", " okay,", ".... okay",
"... okay", ". Okay,", ": okay,", "OK", "'okay,", "okay,", "Okay,",
"Okay", ", too", "too /", "too,", "too.", "too?", "too:", "(No)(. )([0- 9]+)",
"( [A-Z])(.)( )", "www.", "ain't", "let's", "won't", "can't",
"n't", "cannot", "'d", "'ll", "'m", "'ve", "'re", "!", "?", ";",
"", ",", "--", "-", "-", "é", "è", "à", "ç", "&", "%", "per cent",
"_", "Que.", "Ont.", "Nfld.", "Alta.", "Man.", "Sask.", "St.",
"Ste.", "i.e.", "Mr.", "Ms.", "Mrs.", "Prof.", ".com", "a. m.",
"p. m.", "a.m.", "p.m.", "Jan.", "Feb.", "Mar.", "Apr.", "Jun.",
"Jul.", "Aug.", "Sept.", "Oct.", "Nov.", "Dec.", "gen.", "Dr.",
"e. coli", "(.)([A-Z])(.)", "([A-Z])(.)([A-Z])", "([A-Z])(.)([A-Z])",
"([A-Z])(.)([A-Z])", "([A-Z])(.)([A-Z])", "([A-Z])(.)([A-Z])",
"([0-9])(.)([0-9])", "()(S)", "([a-z]+)(')", "(')([a-z]+)", "bull ' s eye",
"no man ' s land", "pandora ' s box", "....", "...", ".", ",",
":", "", "", "", "", NA, NA), with = c("character(0)", "character(0)",
"goodbye", "goodbye", "good x", "ill at xease", "ill x", " xlike",
" xwell", " xwell", " xwell", "as xwell", " ", " xwell", " xwell",
". xWell", ": xwell", "well x", "xwell", " xwell", "xwell", "xWell",
" xokay", " xokay", " xokay", " xokay", " xokay", " xokay", ". xOkay",
": xokay", "okay", "xokay", "xokay", "xOkay", "xOkay", " xtoo",
"xtoo /", "xtoo", "xtoo.", "xtoo.", "xtoo", "#\\\\3", "\\\\1\\\\3",
"www", "am not", "let us", "will not", "can not", " not", "can not",
" would", " will", " am", " have", " are", ".", ".", "", "",
"", " ", " ", " ", "e", "e", "a", "c", "and", "percent", "percent",
" ", "Que", "Ont", "Nfld", "Alta", "Man", "Sask", "St", "Ste",
"ie", "Mr", "Ms", "Mrs", "Prof", "com", "am", "pm", " am", " pm",
"Jan", "Feb", "Mar", "Apr", "Jun", "Jul", "Aug", "Sept", "Oct",
"Nov", "Dec", "gen", "Dr", "e coli", "\\\\1\\\\2 ", "\\\\1\\\\3",
"\\\\1\\\\3", "\\\\1\\\\3", "\\\\1\\\\3", "\\\\1\\\\3", "\\\\1dot\\\\3",
"\\\\1 \\\\2", "\\\\1 \\\\2", "\\\\1 \\\\2", "bull's eye", "no man's land",
"pandora's box", "", "", " . ", " ,", "", " ", " ", " ", " ",
"character(0)", "character(0)")), .Names = c("replace", "with"
), row.names = c(NA, -127L), class = "data.frame")
#library
library(stringi)
# Test strings
test <- c(
  "Sept.", "Mr.", "Oct.", "ill at ease", "as well", "Dr.", "OK", "well,",
  ".com"
)

# Hand-built data frame of patterns (`replace`) and replacements (`with`).
# Each pattern literal is kept on a single line: a line break inside the
# quotes would become part of the pattern, so "as well" would never match.
# stringsAsFactors = FALSE keeps both columns as plain character vectors
# regardless of the R version's default.
punct <- data.frame(
  replace = c(
    "ill at ease", "Sept.", "Mr.", "Oct.", "as well", "Dr.", "OK", "well,",
    ".com"
  ),
  with = c(
    "ill at xease", "Sept", "Mr", "Oct", "as xwell", "Dr", "okay", "xwell",
    "com"
  ),
  stringsAsFactors = FALSE
)

# This works: with vectorize_all = FALSE every pattern/replacement pair is
# applied to every element of `test`.
stri_replace_all_regex(test, punct$replace, punct$with, vectorize_all = FALSE)

# But this doesn't: the same call with the imported table returns NA.
# NOTE(review): punct.out$replace contains NA and empty-string entries, which
# presumably cause the NA results -- confirm against the stringi documentation.
stri_replace_all_regex(test, punct.out$replace, punct.out$with,
  vectorize_all = FALSE)
Second problem: I solved the problem above based on the comments below. But some specific problems are emerging with some of the regexes. Specifically, I don't know how to escape backslashes so that the replacement refers to the first and second groups matched by the regex, i.e. \1, \2, etc.
# Define data: regex patterns (`replace`) and their replacements (`with`).
# Each literal is kept on one line; a line break inside a pattern would end up
# inside the regex itself (e.g. inside the [A-Z] character class).
# The replacements contain two literal backslashes before each group number.
punct.out <- structure(
  list(
    replace = c(
      "(\\.)([A-Z])(\\.)",
      "([A-Z])(\\.)([A-Z])",
      "([0-9])(\\.)([0-9])",
      "([a-z]+)(')",
      "(') ([a-z]+)"
    ),
    with = c(
      "\\\\1\\\\2 ",
      "\\\\1\\\\3",
      "\\\\1dot\\\\3",
      "\\\\1 \\\\2",
      "\\\\1 \\\\2"
    )
  ),
  names = c("replace", "with"),
  row.names = c(104L, 105L, 110L, 112L, 113L),
  class = "data.frame"
)

# Test strings that the regexes above are supposed to match
test <- c(".B.", "B.B", "1.1", "premier's")

# This sort of works, but I clearly haven't figured out how to properly escape
# the backslashes to capture the references
stri_replace_all_regex(test, punct.out$replace, punct.out$with,
  vectorize_all = FALSE)

# Based on the help for stri_replace I also tried using $ to capture the
# references: each literal pair of backslashes in `with` is swapped for a
# single "$" (fixed = TRUE, so the pattern is matched literally, not as a
# regex).
punct.out$with <- gsub("\\\\", "$", punct.out$with, fixed = TRUE)

# And it did work. The call must use the converted punct.out table; `punct`
# is a different, hand-built table with no group references.
stri_replace_all_regex(test, punct.out$replace, punct.out$with,
  vectorize_all = FALSE)