I wrote the search code below, and I want everything between double quotes (" ") to be stored as a single item in the list. How can I do that?
I used regex. The list should contain everything outside the parentheses (), stored in a list called must.
import re
# Sample query: bare words, a quoted phrase, a parenthesised OR-group and
# two negated terms.
message = 'all all "exact exact" (any OR "anyone") -none -"none"'

# Remove the parenthesised group, then split what is left on whitespace.
# NOTE: the whitespace split also breaks quoted phrases apart, which is why
# '"exact exact"' ends up as two separate entries in `must`.
# Raw strings keep `\(` from being read as an (invalid) string escape.
others = ' '.join(re.split(r'\(.*\)', message))
others_split = others.split()

# Capture the text between the parentheses.
to_compile = re.compile(r'.*\((.*)\).*')
to_match = to_compile.match(message)
ors_string = to_match.group(1)

# Keep every term outside the parentheses that does not start with '-'.
must = [word for word in others_split if not word.startswith('-')]
print(f'must: {must}')
Output:
must: ['all', 'all', '"exact', 'exact"']
Wanted result:
must: ['all', 'all', '"exact exact"']
CodePudding user response:
To correctly parse a search string with state (e.g. the negations, parentheses), you'll need a real stateful parser, e.g. here with re.Scanner:
import re
# Tokenizer for the search syntax. Each rule turns the matched text into a
# (token_type, text) pair.
scanner = re.Scanner(
    [
        (r'"', lambda scanner, token: ("QUOTE", token)),
        (r"\(", lambda scanner, token: ("OPEN_PAR", token)),
        (r"\)", lambda scanner, token: ("CLOSE_PAR", token)),
        (r"-", lambda scanner, token: ("NOT", token)),
        # `+` makes a whole run of whitespace / word characters one token.
        (r"\s+", lambda scanner, token: ("WS", token)),
        (r"\w+", lambda scanner, token: ("TEXT", token)),
    ]
)


def parse_search(search):
    """Yield the terms of a search string with their nesting and negation.

    This is a generator: each item is a ``(par_depth, is_negated, phrase)``
    tuple, where ``par_depth`` is the number of parentheses open around the
    term, ``is_negated`` is True when a ``-`` preceded it, and ``phrase`` is
    either a single word or a quoted phrase with its quotes kept.

    Inside quotes every token (parentheses, dashes, whitespace) is taken
    literally and appended to the phrase.

    Raises:
        ValueError: while iterating, if a ``)`` has no matching ``(`` or if
            a quote is still open at the end of the input.
    """

    def emit_current():
        # Yield the phrase built so far, then reset the per-term state.
        nonlocal par_depth, is_negated, curr_phrase
        yield (par_depth, is_negated, curr_phrase)
        curr_phrase = ""
        is_negated = False

    # NOTE: the second value is any trailing text the scanner could not
    # tokenize; it is ignored here.
    result, _unscanned = scanner.scan(search)
    is_negated = False
    in_quotes = False
    curr_phrase = ""
    par_depth = 0
    for token_type, value in result:
        if in_quotes and token_type != "QUOTE":
            # Everything between quotes belongs to the phrase verbatim.
            curr_phrase += value
            continue
        if token_type == "OPEN_PAR":
            par_depth += 1
            continue
        if token_type == "CLOSE_PAR":
            if par_depth == 0:
                raise ValueError("Unbalanced parentheses")
            par_depth -= 1
            continue
        if token_type == "QUOTE":
            curr_phrase += value  # keep quote in phrase
            in_quotes = not in_quotes
            if not in_quotes:
                # Closing quote: the phrase is complete.
                yield from emit_current()
            continue
        if token_type == "NOT":
            is_negated = True
            continue
        if token_type == "TEXT":
            curr_phrase += value
            yield from emit_current()
            continue
        # Remaining token type (WS outside quotes) only separates terms.
    if in_quotes:
        raise ValueError("Unbalanced quotes")
def main():
    """Sort a sample query's top-level terms into required and excluded lists and print both."""
    message = 'all "((( oh no )))" alley "exact text" (any OR "anyone") -no_no_no -"none"'
    must = []
    must_not = []
    for depth, is_excluded, term in parse_search(message):
        # Only top-level terms are collected; anything inside (nested)
        # parentheses is ignored by this implementation.
        if depth <= 0:
            if is_excluded:
                must_not.append(term)
            else:
                must.append(term)
    print(f"{must=}", f"{must_not=}", sep="\n")


if __name__ == "__main__":
    main()
The output is
must=['all', '"((( oh no )))"', 'alley', '"exact text"']
must_not=['no_no_no', '"none"']
CodePudding user response:
Instead of splitting, an easier approach would be to use re.findall with an alternation pattern that matches a parentheses-enclosed string, a quoted string or a word, but captures only the latter two:
# The first alternative consumes a parenthesised group without capturing it,
# so findall returns '' for it; `if term` drops those empty entries. The
# second alternative captures an optionally negated quoted phrase or word.
[
    term
    for term in re.findall(r'\(.*?\)|(-?(?:".*?"|\w+))', message)
    if term and not term.startswith('-')
]
Demo: https://replit.com/@blhsing/CylindricalTalkativeNetbsd
CodePudding user response:
If you want only the text between the double quotes (" "):
message = 'all all "exact exact" (any OR "anyone") -none -"none"'
# Collect the text found between each pair of double quotes, without the
# quotes themselves (group 1 of every match).
result = [found.group(1) for found in re.finditer('"([^"]*)"', message)]
['exact exact', 'anyone', 'none']