Let's say I have the following text:
# Sample input: key lines start at column 0; the value lines that belong to a
# key are indented by one space (the indentation the parsing code relies on).
text = '''
test1:
 aaa: aaa
test2:
 bbb:bbb
test3:
 ccc:ccc
 eee:eee
test2:
 ddd:ddd
'''
I would like to parse that text and create a dict in which the keys are the lines that have zero leading spaces, and each value is a list of the lines that follow that key. If the same key appears twice, like 'test2', then a second list should be added under the same key. So my end result dict should look like the following:
{
"test1": [
[" aaa: aaa"],
],
"test2": [
[" bbb: bbb"],
[" ddd: ddd"],
],
"test3": [
[
[" ccc:ccc", " eee:eee"],
]
],
"test": [
[" ddd:ddd"],
]
}
Until now I have written the following code, but I need some help to finish it:
import re

# Maps each key to a list of groups; each group is the list of value lines
# that followed one occurrence of that key.
parsed = {}
proj = None  # key currently collecting lines; None until the first key line
lines = []   # value lines gathered for the current occurrence of `proj`
# Walk every line: slicing with [1:-1] would drop the final value line,
# because splitlines() yields no trailing empty string to discard.
for line in text.splitlines():
    if not line.strip():
        continue  # blank lines carry no data
    leading_whitespaces = len(line) - len(line.lstrip())
    if leading_whitespaces == 0 and re.match(r'^[a-z_]', line):
        # A new key starts: store the group collected for the previous key.
        if proj is not None and lines:
            parsed[proj].append(lines)
        proj = line.split(':')[0]
        # setdefault keeps the groups already stored for a repeated key.
        parsed.setdefault(proj, [])
        # Rebind instead of clearing so the stored group is not aliased.
        lines = []
    elif proj is not None:
        # Any other line is a value of the current key (whitespace kept).
        lines.append(line)
# The last group has no following key line to trigger the flush above.
if proj is not None and lines:
    parsed[proj].append(lines)
CodePudding user response:
You can use defaultdict from the collections module to handle the dictionary of lists.
from collections import defaultdict

# Sample input: key lines start at column 0, value lines are indented.
text = '''
test1:
 aaa: aaa
test2:
 bbb:bbb
test3:
 ccc:ccc
 eee:eee
test2:
 ddd:ddd
'''


def group_blocks(text):
    """Group the indented lines of ``text`` under the key line above them.

    A key line is a non-blank line whose first character is not whitespace;
    its trailing ``:`` is removed to form the key. Each occurrence of a key
    that is followed by at least one indented line contributes one list of
    those lines (kept verbatim) to the key's entry.

    Args:
        text: The multi-line string to parse.

    Returns:
        A ``defaultdict(list)`` mapping each key to a list of line groups.
        Keys that never have value lines get no entry, and indented lines
        that appear before the first key are ignored.
    """
    out = defaultdict(list)
    key = None
    values = []
    for line in text.splitlines():
        if not line.strip():
            continue
        if not line[0].isspace():
            # A new key starts: store what was collected for the previous one.
            if key is not None and values:
                out[key].append(values)
            # rstrip (not strip) so only the trailing colon is removed.
            key = line.rstrip().rstrip(':')
            values = []
        elif key is not None:
            values.append(line)
    # Flush the final group explicitly; a for/else clause would run
    # unconditionally here and also store empty or key-less groups.
    if key is not None and values:
        out[key].append(values)
    return out


out = group_blocks(text)
CodePudding user response:
You could try something like this:
# Maps each key to a flat list of its value lines, with the leading spaces
# of every value removed.
temp_dict = {}
latest_key = ""  # empty until the first key line has been seen
for line in text.splitlines():
    if not line.strip():
        # Skip blank lines without touching latest_key, so a blank line in
        # the middle of the text cannot detach the values that follow it.
        continue
    if line.startswith(" "):
        # Indented lines are values; those seen before any key are ignored
        # rather than being registered as keys themselves.
        if latest_key:
            temp_dict[latest_key].append(line.lstrip(" "))
    else:
        latest_key = line.split(":")[0]
        if latest_key:
            # setdefault keeps the values already stored for a repeated key.
            temp_dict.setdefault(latest_key, [])
print(temp_dict)
CodePudding user response:
You can do that with a bit of text parsing. If a line starts with no leading space, update the key, otherwise append the new value to the dict.
from io import StringIO

# Sample input: key lines start at column 0, value lines are indented.
text = '''
test1:
 aaa: aaa
test2:
 bbb:bbb
test3:
 ccc:ccc
 eee:eee
test2:
 ddd:ddd
'''


def collect_values(text):
    """Collect the indented lines of ``text`` under the key line above them.

    A key line is a non-blank line whose first character is not whitespace;
    trailing whitespace and trailing ``:`` are removed to form the key.
    Value lines keep their leading whitespace and lose trailing whitespace.
    All values of a repeated key end up in the same flat list.

    Args:
        text: The multi-line string to parse.

    Returns:
        A dict mapping each key that has at least one value line to the list
        of its value lines. Indented lines that appear before the first key
        are ignored.
    """
    out = {}
    key = None
    with StringIO(text) as f:
        for line in f:
            if not line.strip():  # empty lines, skip
                continue
            if not line[0].isspace():  # new key
                # rstrip (not strip) so only the trailing colon is removed.
                key = line.rstrip().rstrip(":")
            elif key is not None:  # new value for the current key
                out.setdefault(key, []).append(line.rstrip())
    return out


out = collect_values(text)
# Bare expression: shows the result when run in a REPL or notebook cell.
out
#{'test1': [' aaa: aaa'],
# 'test2': [' bbb:bbb', ' ddd:ddd'],
# 'test3': [' ccc:ccc', ' eee:eee']}
CodePudding user response:
You could try to use groupby():
from itertools import groupby
def new_block(line):
    """Return True when ``line`` does not begin with a space character."""
    return line[:1] != " "
# Maps each key to a list of value groups; a repeated key gains one more group.
parsed = {}
# Group list of the most recently seen key; None until the first key appears,
# so value lines that precede every key are skipped instead of raising.
last = None
# filter(None, ...) drops the empty strings produced by blank lines.
lines = filter(None, text.splitlines())
# groupby yields alternating runs of key lines (True) and value lines (False).
for new, block in groupby(lines, key=new_block):
    if new:
        # A run can hold several consecutive keys (keys without values):
        # register each one and let the values that follow go to the last.
        for header in block:
            last = parsed.setdefault(header.rstrip(":"), [])
    elif last is not None:
        last.append(list(block))
Result for your example text:
{'test1': [[' aaa: aaa']],
'test2': [[' bbb:bbb'], [' ddd:ddd']],
'test3': [[' ccc:ccc', ' eee:eee']]}