Here is another option. Please note that I have added some words before your marker word ("José"), since I assume that not all of your sentences will start with it. The key idea is to use `stri_locate_first_coll` to detect the end of the marker word, add one character position, and pass that position, together with the position of the last character in the string, to `stri_sub`. Please note that the function provided is not fail-safe, e.g., for cases where there are no matches.
Please also see the benchmarks below for the three solutions proposed so far.
library(stringi)
library(magrittr)
# Example sentences: each one has a few words in front of a differently
# spelled variant of the marker name (josé / José / jose / Jose).
sentencas <- c(
  "some words josé é um trabalhador responsável",
  "only one word José é um trabalhador responsável",
  "several words jose é um trabalhador responsável",
  "and again some words Jose é um trabalhador responsável"
)
#' Extract the words that follow a marker word
#'
#' Finds the first occurrence of `marker` in each element of `str` with a
#' collation-based search at strength 1 (the example sentences "josé", "José",
#' "jose" and "Jose" all match the marker "jose"), takes everything after the
#' match, and returns the first run of whitespace-separated words from it.
#'
#' @param marker Character string to search for.
#' @param str Character vector of sentences to search in.
#' @param words_after_marker Two counts `c(min, max)`: the minimum and maximum
#'   number of words to return after the marker. Only the first two elements
#'   are used.
#' @return A character vector the same length as `str`. An element is `NA`
#'   when the marker is not found or fewer than `min` words follow it.
stri_word <- function(marker, str, words_after_marker) {
  # as.numeric() keeps character counts such as c("1", "3") working.
  bounds <- as.numeric(words_after_marker[1:2])
  if (length(bounds) != 2L || any(!is.finite(bounds)) || any(bounds < 0) ||
      any(bounds != trunc(bounds)) || bounds[1] > bounds[2]) {
    stop(
      "`words_after_marker` must hold two non-negative whole numbers ",
      "c(min, max) with min <= max.",
      call. = FALSE
    )
  }
  # One "word" is a whitespace character followed by word characters; the
  # quantifier asks for between min and max such words in a row.
  word_run <- sprintf("(\\s\\w+){%.0f,%.0f}", bounds[1], bounds[2])

  # Last character position of the first marker match (NA when not found).
  marker_end <- stri_locate_first_coll(str, marker, strength = 1)[, "end"]

  # Everything after the marker; to = -1L means "up to the last character".
  remainder <- stri_sub(str, from = marker_end + 1L, to = -1L)

  # The match starts with the separating whitespace, so trim it off.
  stri_trim_both(stri_extract_first_regex(remainder, word_run))
}
# Example: one to three words after the first "jose"-like marker per sentence.
stri_word(marker = "jose", str = sentencas, words_after_marker = c(1, 3))
#[1] "é um trabalhador" "é um trabalhador" "é um trabalhador" "é um trabalhador"
# Benchmarks
library(microbenchmark)
library(stringr)
# Benchmark candidate: position-based extraction. Locates the end of the
# marker with stri_locate_first_coll(), cuts out the rest of each sentence
# with stri_sub(), and pulls the following words out with a regex.
stringi_positions <- function() {
  # Helper is defined locally, so this candidate carries its own copy.
  stri_word <- function(marker, str, words_after_marker) {
    marker_end <- stri_locate_first_coll(str, marker, strength = 1)[, "end"]
    remainder <- stri_sub(str, cbind(marker_end + 1, nchar(str)))
    word_run <- paste0(
      "(\\s\\w+){", words_after_marker[1], ",", words_after_marker[2], "}"
    )
    found <- stri_extract_first_regex(remainder, word_run)
    gsub("^\\s+|\\s+$", "", found)
  }

  stri_word("jose", sentencas, c(1, 3))
}
# Benchmark candidate: word-by-word search with purrr. Splits each sentence
# in `sentencas` into words, tests every word for the marker "jose" at
# collation strength 1, and returns the words that follow the first hit.
#
# Returns a list with one character vector per sentence; an element is
# NA_character_ when the marker is not found in that sentence.
#
# purrr functions are namespace-qualified because purrr is not attached by
# any library() call in this script.
#
# NOTE(review): the window below spans four words, while stringi_positions()
# and semi_stringi() return at most three -- confirm which is intended.
stringi_map <- function() {
  purrr::map(sentencas, function(sentence) {
    words <- stri_extract_all_words(sentence)[[1L]]

    # Kept as a per-word map so the benchmark still measures the purrr-based
    # approach; strength = 1L alone makes the comparison ignore case.
    is_marker <- purrr::map_lgl(words, stri_detect_coll, "jose", strength = 1L)
    pos <- which(is_marker)
    if (length(pos) == 0L) {
      return(NA_character_)
    }

    # Use the first hit only, so several matches cannot break the range.
    first <- pos[1L] + 1L
    words[first:(first + 3L)]
  })
}
# Benchmark candidate: split each sentence at the marker "jose " (collation
# strength 1) and keep words one to three of the piece after the marker,
# using word() from stringr.
semi_stringi <- function() {
  pieces <- stri_split_coll(sentencas, "jose ", strength = 1, simplify = TRUE)
  # Column 2 holds the text after the first marker occurrence.
  after_marker <- pieces[, 2]
  word(after_marker, 1, 3)
}
# Time the three candidates; times = 100L is microbenchmark's default.
microbenchmark(stringi_map(), semi_stringi(), stringi_positions(), times = 100L)
# Unit: microseconds
# expr min lq mean median uq max neval
# stringi_map() 3498.667 3752.886 4059.0339 4038.0925 4214.3480 7365.635 100
# semi_stringi() 485.543 558.966 805.0216 593.9015 652.7195 15806.567 100
# stringi_positions() 288.958 325.669 456.9946 344.6180 384.4865 10719.428 100