import re, datetime
from calendar import monthrange
# Example inputs. Each assignment overwrites the previous one, so only the last
# uncommented example is the one processed by the rest of the script.
input_text_substring = "los juegos se jugaran durante el transcurso del mes de octubre" #example 1
input_text_substring = "empiezan durante el transcurso del mes de febrero del año 2020" #example 2
input_text_substring = "empiezan durante el periodo del mes de septiembre" #example 3
input_text_substring = "empezaran durante el transcurso del mes de febrero del 2023" #example 4
input_text_substring = "creo que empezarian durante el transcurso del mes de diciembre 2021" #example 5

# Spanish month name -> zero-padded month number (kept as a string)
es_month_dict = {
    "enero": "01",
    "febrero": "02",
    "marzo": "03",
    "abril": "04",
    "mayo": "05",
    "junio": "06",
    "julio": "07",
    "agosto": "08",
    "septiembre": "09",
    "octubre": "10",
    "noviembre": "11",
    "diciembre": "12",
}

# Assume the current year if it is not explicitly indicated. A year counts as
# "indicated" when the text contains "del año" / "de el ano" (optionally followed
# by digits) or when the text ends with four digits.
if not re.search(r"(?:(?:del|de el)[\s|]*(?:año|ano)[\s|]*\d*|.*\d{4}$)", input_text_substring):
    # Append (rather than overwrite) so the original sentence is preserved
    input_text_substring += " de " + datetime.datetime.today().strftime('%Y') + " "
#do substring identification capture groups...
identified_year = #extract year
identified_month = #extract month
# monthrange(year, month) returns (weekday_of_first_day, days_in_month);
# index 1 is therefore the last day number of that month in that year.
last_day_in_this_month = monthrange(int(identified_year), int(identified_month))[1]
# Replacement text of the form "[01 -- <last day>] de <month>"
time_period_in_this_month = "[01 -- " + str(last_day_in_this_month) + "] de " + str(identified_month)
# Alternation of the Spanish month names, spliced into the pattern below
months = r"enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre"
# Two alternatives, both ending in a month name:
#   1) "en|durante" + "el|los" + "transcurso|trancurso|periodo|dias" + "del|de" + "mes [de]" + <month>
#   2) "durante" + optional "el mes [de]" + <month>
# The separators are [\s|]*, i.e. any run of whitespace and/or literal "|" characters.
pattern_to_replace = r"(?:(?:en|durante)[\s|]*(?:el|los)[\s|]*(?:transcurso|trancurso|periodo|dias)[\s|]*(?:del|de)[\s|]*(?:mes[\s|]*de|mes)[\s|]*(?:" + months + r")|durante[\s|]*(?:el[\s|]*mes[\s|]*de|el[\s|]*mes|)[\s|]*(?:" + months + r"))"
# Substitute every match of pattern_to_replace with the day-range text
compiled_pattern = re.compile(pattern_to_replace)
input_text_substring = compiled_pattern.sub(time_period_in_this_month, input_text_substring)
# Show the result with repr() so leading/trailing whitespace is visible
print(repr(input_text_substring)) #output
The correct outputs that the code needs to produce for each of the examples:
input_text_substring = "los juegos se jugaran [01 -- 31] 10 2022" #example 1
input_text_substring = "empiezan [01 -- 29] 02 2020" #example 2
input_text_substring = "empiezan [01 -- 30] 09 2022" #example 3
input_text_substring = "empezaran [01 -- 28] 02 2023" #example 4
input_text_substring = "creo que empezarian [01 -- 31] 12 2021" #example 5
How should I extract the month and the year so that I can pass them to monthrange(int(identified_year), int(identified_month))
and have it return the number of days of that month in that year, and then do the replacement in the original string to obtain these outputs?
CodePudding user response:
To find the month, you can simply check which key of the month dictionary appears in the string; to find the year, you can use a regex to extract it from the string.
Example:
# Month: take the number of the first dictionary key (in dictionary order)
# that occurs anywhere in the text. identified_month stays unset if none does.
for key, value in es_month_dict.items():
    if key in input_text_substring:
        identified_month = value
        break
# Year: collect the word-bounded, non-whitespace tokens that contain "20" and keep
# the first one that is exactly 4 characters long. identified_year stays unset
# if no such token exists.
wordList = re.findall(r'\b\S*%s\S*\b' % re.escape('20'), input_text_substring)
for word in wordList:
    if len(word) == 4:
        identified_year = word
        break
print(identified_month, ' ', identified_year)
The preceding code produces the following outputs:
10 2022
02 2020
09 2022
02 2023
12 2021