import re
# Sample inputs for the interval standardizer.  Examples 1 and 2 place the
# date between the two times; examples 3 and 4 place it after both times.
# Examples 2 and 4 use a day range "(DD_--_DD)" instead of a single day.
EXAMPLE_INPUTS = (
    "estoy segura que empezaria desde las 15:00 pm del 2002_-_11_-_01 hasta las 16:00 hs pm",  # example 1
    "estoy segura que empezara desde las 15:00 pm h.s. del 2002_-_11_-_(01_--_15) hasta las 16:10 pm hs, aunque no se cuando podria acabar",  # example 2
    "probablemente dure desde las 01:00 am hasta las 16:00 pm del 2002_-_11_-_01 pero seguramente no mucho mas que eso",  # example 3
    "desde las 11:00 am hasta las 16:00 pm del 2002_-_11_-_(01_--_17) o quizas desde las 15:00 pm hs hasta las 16:00 pm del 2003_-_11_-_(01_--_17)",  # example 4
)

# Previously the four samples were assigned to the same name one after the
# other, so only the last one (example 4) was ever used.  Keep that as the
# active sample; change the index to try another one.
input_text = EXAMPLE_INPUTS[3]
# Optional "hours" marker that may appear before or after am/pm.  The dots
# are escaped so that ordinary words such as "hasta" cannot be mistaken for it.
_HOUR_SUFFIX = r"(?:h\.s\.|h\.s|hs)"


def _time_pattern(prefix):
    """Return a regex for one clock time whose groups are named with *prefix*."""
    return (
        r"(?P<" + prefix + r"_hour>\d{1,2})[\s:]"
        + r"(?P<" + prefix + r"_minute>\d{0,2})"
        + r"\s*(?:" + _HOUR_SUFFIX + r"\s*)?"
        + r"(?P<" + prefix + r"_meridiem>am|pm)"
        # Whitespace is consumed only when an "hs" marker really follows, so
        # the blank separating the match from the next word is preserved.
        + r"(?:\s*" + _HOUR_SUFFIX + r")?"
    )


def _format_time(match, prefix):
    """Render the time captured under *prefix* as ``hh:mm am`` / ``hh:mm pm``."""
    hour = match.group(prefix + "_hour").zfill(2)
    # The minutes are optional in the pattern; an empty capture becomes "00".
    minute = match.group(prefix + "_minute").zfill(2)
    return hour + ":" + minute + " " + match.group(prefix + "_meridiem")


def _format_interval(match):
    """Build the ``(date (start_--_end))`` replacement text for one match."""
    start = _format_time(match, "start")
    end = _format_time(match, "end")
    return "(" + match.group("date") + " (" + start + "_--_" + end + "))"


def standardize_time_interval_associated_to_date(input_text, identify_only_4_digit_years=True):
    """Rewrite "from <time> ... until <time>" phrases tied to a date.

    Two Spanish phrasings are recognised, both introduced by ``desde`` or
    ``a partir``:

    * ``desde las <time> del <date> hasta las <time>``
    * ``desde las <time> hasta las <time> del <date>``

    ``<date>`` is ``YYYY_-_MM_-_DD`` or ``YYYY_-_MM_-_(DD_--_DD)`` and is
    copied to the output unchanged.  Each recognised phrase is replaced by
    ``(<date> (hh:mm am|pm_--_hh:mm am|pm))``; any optional ``hs`` / ``h.s`` /
    ``h.s.`` marker around am/pm is dropped and hours/minutes are zero-padded
    to two digits.  Text that does not match is returned untouched.

    Args:
        input_text: The string to search and rewrite.
        identify_only_4_digit_years: When true, only years written with
            exactly four digits are recognised; otherwise a year may have any
            number of digits (at least one).

    Returns:
        The string with every recognised interval rewritten.
    """
    year = r"\d{4}" if identify_only_4_digit_years else r"\d+"
    # The whole date is captured as one group because it is emitted verbatim.
    date = (
        r"(?P<date>" + year + r"_-_\d{2}_-_"
        + r"(?:\(\d{1,2}_--_\d{1,2}\)|\d{2}))"
    )
    opening = r"\b(?:desde|a\s*partir)\s*(?:de)?\s*(?:las|la)?\s*"
    until = r"\s*(?:hasta|al)\s*(?:las|la)?\s*"
    of_date = r"\s*(?:del|de\s*el|de\s)\s*"
    start = _time_pattern("start")
    end = _time_pattern("end")

    patterns = (
        # <start time> del <date> hasta <end time>
        opening + start + of_date + date + until + end,
        # <start time> hasta <end time> del <date>
        opening + start + until + end + of_date + date,
    )
    for pattern in patterns:
        # re.sub needs a callable (or template string) as its second argument;
        # the callable receives each match object and returns the new text.
        input_text = re.sub(pattern, _format_interval, input_text)
    return input_text
# Run the standardizer on the sample text.  The flag passed alongside the
# string tells the function to recognise only four-digit years in dates.
input_text = standardize_time_interval_associated_to_date(
    input_text,
    identify_only_4_digit_years=True,
)
print(repr(input_text))  # show the rewritten string
What should I pass as the second argument of re.sub(), instead of print(lambda m: print(m[1])),
so that the following string replacements are possible?
Each replacement is expected to follow this generic structure:
(YYYY_-_MM_-_DD (hh:mm am or pm_--_hh:mm am or pm))
Bearing in mind that the goal of the program is to find and rearrange information in the main string, this is the output I need for each of the example input strings:
"estoy segura que empezaria (2002_-_11_-_01 (15:00 pm_--_16:00 pm))" #for example 1
"estoy segura que empezara (2002_-_11_-_(01_--_15) (15:00 pm_--_16:10 pm)), aunque no se cuando podria acabar" #for example 2
"probablemente dure (2002_-_11_-_01 (01:00 am_--_16:00 pm)) pero seguramente no mucho mas que eso" #for example 3
"(2002_-_11_-_(01_--_17) (11:00 am_--_16:00 pm)) o quizas (2003_-_11_-_(01_--_17) (15:00 pm_--_16:00 pm))" #for example 4