import re
#input_example
# Sample input: a "PL_ADVB" capsule holding a quoted place phrase, followed by
# an adverb of place and a closing parenthesis.
capture_where_capsule = "((PL_ADVB='la gran biblioteca rápidamente y luego llegamos allí')hacía)"
list_all_adverbs_of_place = ["de allí", "de alli", "allí", "alli", "de allá", "de alla", "allá", "alla", "arriba", "abajo", "a dentro", "adentro", "dentro", "a fuera", "afuera", "fuera", "hacía", "hacia", "encíma de", "encima de", "por sobre", "sobre"]
# One or more word characters, each optionally followed by whitespace (i.e. a
# run of words). The outer parentheses form the capture group the phrase is
# supposed to land in.
place_reference = r"((?i:\w\s*)+)"
# The pattern pieces are joined with "+" (a bare name between string literals
# is a syntax error).
# NOTE: this is the snippet the question below asks about, so its two mistakes
# are intentionally kept:
#   1. the "|"-joined adverbs are not wrapped in a group, so "|" splits the
#      whole pattern at the top level;
#   2. m1.group()[1] indexes the matched string (its 2nd character) instead of
#      selecting capture group 1.
pattern = re.compile(r"\(\(PL_ADVB='" + place_reference + r"'\)" + rf"{'|'.join(list_all_adverbs_of_place)}" + r"\)", re.IGNORECASE)
m1 = re.search(pattern, capture_where_capsule)
if m1:
    place_reference_string = m1.group()[1]
    print(repr(place_reference_string))
Why does this capturing group fail to capture the whole substring?
The parentheses of the capturing group enclose the entire pattern that should be responsible for capturing the text that matches.
The substring it should capture is exactly this one (and nothing else):
'la gran biblioteca rápidamente y luego llegamos allí'
CodePudding user response:
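You need to put all the alternatives in a group. Otherwise `|` splits the whole pattern at the top level, so only the first alternative is concatenated to the `\(\(PL_ADVB='...'\)` part of the regexp (and only the last one to the closing `\)`).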
Then use `m1.group(1)` to get that capture group. `m1.group()[1]` indexes the string returned by `m1.group()`, so it is just the 2nd character of the entire match.
# Wrap the "|"-joined adverbs in their own group so the alternation only
# chooses between adverbs; the PL_ADVB prefix and the closing "\)" then apply
# to every alternative.
pattern = re.compile(rf"\(\(PL_ADVB='{place_reference}'\)({'|'.join(list_all_adverbs_of_place)})\)", re.IGNORECASE)
m1 = re.search(pattern, capture_where_capsule)
if m1:
    # group(1) selects the first capture group (the one opened by
    # place_reference), not a character of the matched text.
    place_reference_string = m1.group(1)
    print(repr(place_reference_string))