import re
# Sample sentence that the place-adverb capture is meant to restructure.
capture_where_capsule = (
    "alrededor ((NOUN)del auto rojizo, algo grande y completamente veloz). "
    "Luego dentro del baúl rápidamente abajo de una caja por sobre ello vimos una caña."
)

# Adverbs of place; each one marks where a capture starts.
list_all_adverbs_of_place = [
    "adentro",
    "dentro",
    "al rededor",
    "alrededor",
    "abajo",
    "hacía",
    "hacia",
    "por sobre",
    "sobre",
]

# Regex for a "((NOUN)...)" group: the literal opener, then one or more
# characters from [\w,;.] each optionally followed by whitespace, then ")".
# Per the original note, everything after such a group should be cut off in
# `place_reference_string` (that name is not defined in this part of the file).
descriptive_noun_pattern = r"\(\(NOUN\)(?:[\w,;.]\s*)+\)"

# Word groups that, together with the limiting words below, end a capture.
list_verbs = [
    "vimos",
    "hemos visto",
    "encontramos",
    "hemos encontrado",
]
list_adverbs_of_manner = [
    "rápidamente",
    "rapidamente",
    "intensamente",
    "gradualmente",
    "completamente",
]
list_adverbs_of_time = [
    "durante",
    "luego",
    "ahora",
    "mientras tanto",
]
# NOTE: "a los" and "a las" each appear twice; the duplicates are kept so the
# list contents and order stay exactly as they were.
list_limitant_words = [
    "a las", "a los", "a la", "a el", "a los", "a las",
    "a mí", "a mi", "a sus", "a su", "a él", "a ella",
    "talvez", "tal vez", "tal", "al",
    "los", "las", "él", "el", "la",
    "cómo", "como", "con",
    "en su", "en mi", "en",
    ".", ":", ";", ",",
    "(", ")", "[", "]",
    "¿", "?", "¡", "!", "&", "=",
]

# Every element that acts as an end limit for a capture, in one list:
# verbs, then manner adverbs, then time adverbs, then the limiting words.
list_limiting_elements = [
    *list_verbs,
    *list_adverbs_of_manner,
    *list_adverbs_of_time,
    *list_limitant_words,
]

print(f"{capture_where_capsule!r}")  # --> output
I need to wrap, using the encapsulation format ((PL_ADVB)the text),
every substring that begins with one of the elements of list_all_adverbs_of_place,
which is a short list of adverbs of place (the adverb itself must be included in the capture).
A capture ends right after a ((NOUN)some text here) pattern, if one follows the adverb;
otherwise it ends just before the first element of list_limiting_elements
that appears.
The restructuring should be done with re.sub(pattern, replacement, capture_where_capsule, flags=re.IGNORECASE), so that the input string ends up looking like this output:
"((PL_ADVB)alrededor ((NOUN)del auto rojizo, algo grande y completamente veloz)). Luego ((PL_ADVB)dentro del baúl) rápidamente ((PL_ADVB)abajo de una caja) ((PL_ADVB)por sobre ello) vimos una caña."
Keep in mind that an adverb from list_all_adverbs_of_place
marks the beginning of a capture, but if a second such adverb appears within the captured text, it acts as the end limit of the current capture.