How to store a string between quotation marks - CodePudding

I wrote some search code and I want to store whatever is between double quotes (" ") as a single item in the list. How can I do that?

I used regex. The list, called must, should contain everything outside the parentheses ().

import re

# Sample search query: plain words, a quoted phrase, a parenthesized OR group,
# and two negated terms (prefixed with '-').
message = 'all all "exact exact" (any OR "anyone") -none -"none"'

# Drop the parenthesized span (greedy: first '(' to last ')') and split what is
# left on whitespace. Splitting on whitespace is what breaks the quoted phrase
# '"exact exact"' into two separate items.
others = ' '.join(re.split(r'\(.*\)', message))
others_split = others.split()

# Text found inside the parentheses; falls back to '' when the message has no
# parenthesized group instead of failing on a None match.
to_compile = re.compile(r'.*\((.*)\).*')
to_match = to_compile.match(message)
ors_string = to_match.group(1) if to_match else ''

# Keep only the terms that are not negated with a leading '-'.
must = [word for word in others_split if not word.startswith('-')]

print(f'must: {must}')

Output:

must: ['all', 'all', '"exact', 'exact"']

Wanted result:

must: ['all', 'all', '"exact exact"']

CodePudding user response：

To correctly parse a search string with state (e.g. the negations, parentheses), you'll need a real stateful parser too, e.g. here with re.Scanner:

import re

# Tokenizer for the search syntax. Each rule pairs a regex with a callback that
# tags the matched text as a (kind, text) tuple.
scanner = re.Scanner(
    [
        (r'"', lambda scanner, token: ("QUOTE", token)),
        (r"\(", lambda scanner, token: ("OPEN_PAR", token)),
        (r"\)", lambda scanner, token: ("CLOSE_PAR", token)),
        (r"-", lambda scanner, token: ("NOT", token)),
        # One-or-more quantifiers: a whole run of whitespace or word
        # characters becomes a single token.
        (r"\s+", lambda scanner, token: ("WS", token)),
        (r"\w+", lambda scanner, token: ("TEXT", token)),
    ]
)


def parse_search(search):
    """Yield ``(par_depth, is_negated, phrase)`` for each term in *search*.

    A term is either a bare word or a double-quoted phrase; quoted phrases
    are yielded with their surrounding quotes and with everything between
    the quotes kept verbatim. ``par_depth`` is the parenthesis nesting level
    at which the term appeared (0 = top level) and ``is_negated`` is True
    when a ``-`` was seen since the previously emitted term.

    This is a generator, so errors surface while iterating.

    Raises:
        ValueError: on a ``)`` with no matching ``(`` or an unterminated
            quote. An unclosed ``(`` is not reported.
    """

    def emit_current():
        # Yield the phrase built so far, then reset the per-term state.
        nonlocal is_negated, curr_phrase
        yield (par_depth, is_negated, curr_phrase)
        curr_phrase = ""
        is_negated = False

    # The second element is the unscanned remainder (input starting at the
    # first character no rule matches); it is ignored here.
    result, _rest = scanner.scan(search)
    is_negated = False
    in_quotes = False
    curr_phrase = ""
    par_depth = 0
    for kind, value in result:
        if in_quotes and kind != "QUOTE":
            # Inside quotes every token is literal text, including
            # parentheses, dashes and whitespace.
            curr_phrase += value
            continue
        if kind == "OPEN_PAR":
            par_depth += 1
            continue
        if kind == "CLOSE_PAR":
            if par_depth == 0:
                raise ValueError("Unbalanced parentheses")
            par_depth -= 1
            continue
        if kind == "QUOTE":
            curr_phrase += value  # keep quote in phrase
            in_quotes = not in_quotes
            if not in_quotes:
                # Closing quote: the phrase is complete.
                yield from emit_current()
            continue
        if kind == "NOT":
            is_negated = True
            continue
        if kind == "TEXT":
            curr_phrase += value
            yield from emit_current()
            continue
        # Remaining kind is "WS" outside quotes, which only separates terms.
    if in_quotes:
        raise ValueError("Unbalanced quotes")


def main():
    """Parse a sample query and print its required and excluded terms.

    Terms that appear inside parentheses (at any nesting depth) are skipped;
    the remaining terms are sorted into ``must`` or ``must_not`` depending on
    whether they were negated.
    """
    message = 'all "((( oh no )))" alley "exact text" (any OR "anyone") -no_no_no -"none"'

    must, must_not = [], []

    for depth, is_negated, term in parse_search(message):
        # Only top-level terms are collected; parenthesized segments are ignored.
        if depth <= 0:
            bucket = must_not if is_negated else must
            bucket.append(term)

    print(f"{must=}", f"{must_not=}", sep="\n")


# Run the demo only when this file is executed as a script.
if __name__ == "__main__":
    main()

The output is

must=['all', '"((( oh no )))"', 'alley', '"exact text"']
must_not=['no_no_no', '"none"']

CodePudding user response：

Instead of splitting, an easier approach would be to use re.findall with an alternation pattern that matches a parentheses-enclosed string, a quoted string or a word, but captures only the latter two:

# Parenthesized spans match the first alternative, which has no capture group,
# so findall reports them as '' and the `if term` filter drops them. Quoted
# phrases and whole words (optionally prefixed with '-') are captured; negated
# terms are then filtered out.
[
    term
    for term in re.findall(r'\(.*?\)|(-?(?:".*?"|\w+))', message)
    if term and not term.startswith('-')
]

Demo: https://replit.com/@blhsing/CylindricalTalkativeNetbsd

CodePudding user response：

If you only want the words between double quotes (" "):

# Sample query; only the double-quoted phrases are extracted from it.
message = 'all all "exact exact" (any OR "anyone") -none -"none"'

# Capture the text between each pair of double quotes (quotes excluded).
quoted = re.compile('"([^"]*)"')
result = quoted.findall(message)

['exact exact', 'anyone', 'none']