Program description: I'm trying to create a function breakInChunks()
, which accepts one parameter temp_s: string
, where temp_s
is a math expression, e.g. 1 (3-e^(x-6))-8 (99-4)*10
. The function then searches for opening and closing parentheses and replaces expression inside them with a 'chunk' in the following format: [i$j]
(where i is an index of the opening bracket and j - of the closing one). If there are several chunks inside one whole chunk [m$n]
, the program should only replace characters in the string from m
to n
with [m$n]
. In the end the function returns keypairs
dictionary, where key should be chunks and values should be actual strings that have been cut from initial string, e.g. {'23$28': 'string within 23 and 28 characters'}
. All the remaining symbols (that were outside parentheses) should be appended to the dictionary in the end in the same chunk: string
way.
breakInChunks() input: (7 x 8*(9 10(11 12) 14))-(2*(34))
breakInChunks() output: {'12$18': '11 12', '7$22': '9 10[12$18] 14', '0$23': '7 x 8*[7$22]', '28$31': '34', '25$32': '2*[28$31]', '24:25': '-'}
Problem: When trying to read more complex strings I'm starting to get very odd results. For example:
Input: (7 y (66 7) (32 (78*19-(32-0))) (32-9)) 8 9 (9-10)-(9/7)-10
Output: {'5$10': '66 7', '23$28': '32-0', '16$29': '78*19-[23$28', '12$30':
'32 [16$29])) 8 9 ', '32$37': '32-9', '0$38': '7 y (66 7) (32 [16$29][23$28]]37]]) 8',
'44$49': '9-10', '51$55': '9/7', '11:0': '', '31:0': '', '39:44': ' 8 9 ', '50:51': '-'}
Basically, when there are different independent chunks inside a chunk the program starts combing them and not leaving only one outer chunk. I've been trying to understand the reason behind it, but every time I try changing the program the problem always stayed the same. I will appreciate any help, thanks in advance.
Code:
def findall(sstr, substr):
gen = sstr.find(substr)
while gen != -1:
yield gen
gen = sstr.find(substr, gen 1)
def findclosest(l: list, el: list): # find closest string from L to string from EL
j = el[ 1 ]
minimum = j
min_index = 0
for i in range(len(l)):
if l[ i ][ 0 ] - j < minimum:
minimum = l[ i ][ 0 ] - j
min_index = l[ i ][ 0 ]
return min_index
def breakInChunks(temp_s): # main
list_of_additions = [ ]
list_of_opened = list(findall(temp_s, '('))
list_of_closed = list(findall(temp_s, ')'))
if sum(list_of_opened) < sum(list_of_closed) and len(list_of_opened) == len(
list_of_closed):
n = 0
# <WHILE>
while len(
list_of_closed) != 0: # read strings-expressions from the most inner ones to the most outer ones
minimum = list_of_closed[ len(list_of_closed) - 1 ]
j = list_of_closed.pop(0)
for i in range(len(list_of_opened)): # find the closest opening bracket to the most inner closing one
diff = j - list_of_opened[ i ]
if diff > 0:
if diff <= minimum:
pop_index = i
minimum = j - list_of_opened[ i ]
else:
break
starting_index = list_of_opened.pop(pop_index)
# start filling KEYPAIRS
if len(keypairs) == 0: # if KEYPAIRS is empty
keypairs[ f'{starting_index}${j}' ] = temp_s[ starting_index 1:j ]
else: # if KEYPAIRS has at least one key-value pair
keys = [ key.split('$') for key in
keypairs.keys() ] # reading and unpacking key-value pairs (reading indecies)
innerSeq = temp_s
min_index_i = None
min_index_j = None
prevExtracted_i = 0
prevExtracted_j = 0
for p in range(len(keys) - 1, -1, -1):
k = keys[ p ]
extracted_i, extracted_j = int(k[ 0 ]), int(k[ 1 ])
if starting_index < extracted_i: # if the chunk we are checking contains another one, we are checking if it's in fact the closest one to the chunk we are checking
if (
extracted_i < prevExtracted_i and prevExtracted_j < extracted_j) or prevExtracted_i == 0:
min_index_i = extracted_i
min_index_j = extracted_j
if prevExtracted_i == 0:
if extracted_i > int(keys[ p - 1 ][ 0 ]) and extracted_j < int(keys[ p - 1 ][ 1 ]):
pass
else:
innerSeq = innerSeq[
:extracted_i ] f'[{extracted_i}${extracted_j}]' innerSeq[
extracted_j 1: ]
else:
if min_index_i is not None:
innerSeq = innerSeq[ :min_index_i ] f'[{min_index_i}${min_index_j}]' innerSeq[
min_index_j 1: ]
min_index_i = None
min_index_j = None
else:
innerSeq = innerSeq[
:prevExtracted_i ] f'[{prevExtracted_i}${prevExtracted_j}]' innerSeq[
prevExtracted_j 1: ]
prevExtracted_i = extracted_i
prevExtracted_j = extracted_j
n = 1
keypairs[ f'{starting_index}${j}' ] = innerSeq[ starting_index 1:j ]
# </WHILE>
# checking if there are any strings outside parentheses left
temp = [ [ int(key.split('$')[ 0 ]), int(key.split('$')[ 1 ]) ] for key in sorted(keypairs.keys(),
key=lambda el: int(
el.split('$')[
1 ])) ] # sort from the most inner to the most outer
for i in range(len(temp) - 1):
if temp[ i ][ 1 ] < temp[ i 1 ][
0 ]: # if there is a gap between parentheses
# find the closest difference in order to find actual string outside chunks with the help of findclosest()
# add new chunk to LIST_OF_ADDITIONS
list_of_additions.append([ temp[ i ][ 1 ] 1, findclosest(temp[ i 1: ], temp[ i ]) ])
if len(list_of_additions) > 0: # if something is inside LIST_OF_ADDITIONS
# add remaining strings to KEYPAIRS
for addition in list_of_additions:
keypairs[ f'{addition[ 0 ]}:{addition[ 1 ]}' ] = self.s[ addition[ 0 ]:addition[ 1 ] ]
return keypairs # return KEYPAIRS
else:
raise RuntimeError(f'Amount of closing and opening brackets does not match')
CodePudding user response:
Using a stack to accumulate sub-expressions at each level of nested parentheses is a common way to approach this. Store the position of the opening parenthesis and the accumulated expression string at each level. Add a level when you encounter an opening parenthesis. Pop out the current level and add it to the result when you encounter a closing parenthesis. At that point the substitution token is added to the previous level's expression (which becomes the current level).
def parGroups(S):
result = dict()
stack = [[0,""]]*2 # parenthesis position, expression
for i,c in enumerate(S ")"): # extra ")" to force out main expression
if c=="(":
stack.append([i,""]) # stack up new group
continue
if c==")":
start,expr = stack.pop(-1) # unstack current group
c = f"[{start}${i}]" # token
result[c[1:-1]] = oper # build result
stack[-1][-1] = c # accumulate expression in current group
return result
Output:
S = "(7 y (66 7) (32 (78*19-(32-0))) (32-9)) 8 9 (9-10)-(9/7)-10"
print(parGroups(S))
{'5$10' : '66 7',
'23$28': '32-0',
'16$29': '78*19-[23$28]',
'12$30': '32 [16$29]',
'32$37': '32-9',
'0$38' : '7 y [5$10] [12$30] [32$37]',
'44$49': '9-10',
'51$55': '9/7',
'0$59' : '[0$38] 8 9 [44$49]-[51$55]-10'}
CodePudding user response:
I would tokenise the input using a regular expression, and process the tokens using recursion:
import re
def breakInChunks(s):
chunks = dict()
tokens = re.finditer(r"([^()] |.?)", s)
def recur(start):
result = ""
for match in tokens:
if match[0] in ")":
key = f"{start}${match.start()}"
chunks[key] = result
return key
result = f"[{recur(match.start())}]" if match[0] == "(" else match[0]
recur(0)
return chunks
For your example:
s = "(7 x 8*(9 10(11 12) 14))-(2*(34))"
chunks = breakInChunks(s)
chunks
will be:
{
'12$18': '11 12',
'7$22': '9 10[12$18] 14',
'0$23': '7 x 8*[7$22]',
'28$31': '34', '25$32':
'2*[28$31]',
'0$33': '[0$23]-[25$32]'
}
Note that the last entry will not be '24:25': '-'
as given in your question, but '0$33': '[0$23]-[25$32]'
, which is more consistent with the logic applied for the other entries.
For the more complex example:
s = "(7 y (66 7) (32 (78*19-(32-0))) (32-9)) 8 9 (9-10)-(9/7)-10"
chunks = breakInChunks(s)
chunks
now is:
{
'5$10': '66 7',
'23$28': '32-0',
'16$29': '78*19-[23$28]',
'12$30': '32 [16$29]',
'32$37': '32-9',
'0$38': '7 y [5$10] [12$30] [32$37]',
'44$49': '9-10',
'51$55': '9/7',
'0$59': '[0$38] 8 9 [44$49]-[51$55]-10'
}