I have a list text file with the following
Aerospace
Aerodynamics
Aerospace Technician/Engineer
Performance Engineer
Flight Dynamics & Control
Aircraft Loads Professional
Ground Test Rig Architecture Designer
Biomedical
Bioinstrumentation
Biomedical Engineer
Biomaterials Engineer
Biomaterials
Researcher
Systems Development Engineer
and I want to convert it into a format that looks something like this
[
{
"label" : "Aerospace",
"children" : [
{
"label" : "Aerodynamics",
"children" : [
...
]
}
]
},
{
"label" : "Biomedical",
"children" : ...
}
]
I tried my hand at recursion, but I wasn't able to create an algo to do this. I'm hoping someone smarter than I can figure this out / give me pointers as to where I went wrong.
This is what I tried
def recurse(remaining_lines, depth):
out = []
for i in range(len(remaining_lines)):
line = remaining_lines[i]
nspaces = len(line) - len(line.lstrip())
if nspaces > depth:
out.extend(recurse(remaining_lines[i 1:], nspaces))
else:
return [{
"id" : str(uuid.uuid4()),
"label" : line.strip(),
"children" : out,
}]
return out
# lines is an an array created by splitting the text file by \n
out = recurse(lines, -1)
CodePudding user response:
You can maintain a stack that maintains the current path in the hierarchy of (children) lists, where each list is accompanied with its indent. Whenever a node is created at that indent, it will be added to the corresponding list:
def create_hierarchy(lines):
stack = [(-1, [])]
for line in lines:
label = line.lstrip()
indent = len(line) - len(label)
while indent <= stack[-1][0]:
stack.pop()
node = { "label": label, "children": [] }
stack[-1][1].append(node)
stack.append((indent, node["children"]))
return stack[0][1]
CodePudding user response:
This seems to work
def parse_line(line, indent_str=' '*4):
ind = 0
while line.startswith(indent_str):
line = line[len(indent_str):]
ind = 1
return line, ind
# In Python >= 3.9 you can use:
# def parse_line(line, indent_str=' '*4):
# ind = 0
# while line != (line := line.removeprefix(indent_str)):
# ind = 1
# return line, ind
assert(parse_line("Aerospace") == ('Aerospace', 0))
assert(parse_line(" Researcher") == ('Researcher', 2))
assert(parse_line("---A", indent_str='-') == ('A', 3))
def parse_lines(lines, ind=0, indent_str=' '*4):
lines = lines.copy()
lines_out = []
while len(lines) > 0:
line, new_ind = parse_line(lines[0], indent_str)
if new_ind < ind:
break
if new_ind == ind 1:
sub_lines, lines, new_ind = parse_lines(lines, new_ind, indent_str)
lines_out[-1] = {"label": lines_out[-1], "children": sub_lines}
elif new_ind == ind:
lines_out.append(line)
lines.pop(0)
elif new_ind > ind 1:
raise ValueError("Invalid indenting")
if ind > 0:
return lines_out, lines, new_ind
assert(len(lines) == 0) # make sure everything was parsed
return lines_out
assert(parse_lines([' a', ' b'], 1, indent_str=' ') == (['a', 'b'], [], 1))
assert(parse_lines(['a', 'b', ' c', ' d', ' e', 'f'], indent_str=' ') ==
['a', {'label': 'b', 'children': ['c', {'label': 'd', 'children': ['e']}]}, 'f'])```
Output:
In [10]: parse_lines(lines)
Out[10]:
[{'label': 'Aerospace',
'children': [{'label': 'Aerodynamics',
'children': ['Aerospace Technician/Engineer', 'Performance Engineer']},
{'label': 'Flight Dynamics & Control',
'children': ['Aircraft Loads Professional',
'Ground Test Rig Architecture Designer']}]},
{'label': 'Biomedical',
'children': [{'label': 'Bioinstrumentation',
'children': ['Biomedical Engineer', 'Biomaterials Engineer']},
{'label': 'Biomaterials',
'children': ['Researcher', 'Systems Development Engineer']}]}]