Sorry for the bad title; it was the best I could come up with.
I have a list of strings containing an article's headers and contents, and the headers are nested.
# Flat sample input: header lines (numbered "N.", "- ", "[...]", "◈ ") mixed
# with plain content lines, in document order.
string_list = [
'1. Hello',
'2. World',
'content string 1',
'content string 2',
'- hyphen',
'content string 3',
'[Bracket]',
'content string 4',
'content string 5',
'3. This is',
'content string 6',
'4. Numbering header',
'content string 7',
'[Bracket]',
'content string 8',
'[Nested Bracket]',
'content string 9',
'◈ Square header',
'content string 10',
'- hyphen',
'content string 11',
'[Nested Bracket]'
]
I need to group the data so that headers of the same type end up at the same depth; however, in rare cases the same header type is also used in a nested way.
The `string_list` list must be converted to the dict structure below.
[
{
"TITLE": "1. Hello",
"CONTENT": []
},
{
"TITLE": "2. World",
"CONTENT": [
"content string 1",
"content string 2",
{
"TITLE": "- hyphen",
"CONTENT": [
"content string 3",
{
"TITLE": "[Bracket]",
"CONTENT": [
"content string 4",
"content string 5"
]
}
]
}
]
},
{
"TITLE": "3. This is",
"CONTENT": [
"content string 6"
]
},
{
"TITLE": "4. Numbering header",
"CONTENT": [
"content string 7",
{
"TITLE": "[Bracket]",
"CONTENT": [
"content string 8",
{
"TITLE": "[Nested Bracket]",
"CONTENT": [
"content string 9",
{
"TITLE": "\\u25c8 Square header",
"CONTENT": [
"content string 10",
{
"TITLE": "- hyphen",
"CONTENT": [
"content string 11"
]
}
]
}
]
},
{
"TITLE": "[Nested Bracket]",
"CONTENT": []
}
]
}
]
}
]
I cannot discriminate between or classify `[Bracket]` and `[Nested Bracket]`, because any words can appear inside square brackets. I have already written 2k lines of regex patterns, but it is still impossible.
I wrote some Python code before I realized that some of the data has this defect. Unsurprisingly, the code below generates the wrong output.
def build_header_dict(header_str: str, content: Any) -> dict:
    """
    Build the output dict for one header.

    :param header_str: header line, stored under the ``TITLE`` key
    :param content: value stored under ``CONTENT``; any falsy value
        (``None``, empty list, ...) is replaced by a new empty list
    :return: dict with the keys ``TITLE`` and ``CONTENT``
    """
    return {"TITLE": header_str, "CONTENT": content or []}
class Header:
    """
    One header line together with the content collected under it.

    ``header_str`` is the raw header line, ``header_type`` the matched header
    type, ``header_index`` the header's number (``-1`` is passed for header
    types without a number) and ``content`` the list of items that belong to
    this header.
    """

    def __init__(self, header_str: str, header_type: str, header_index: int, content: Optional[list] = None):
        """
        :param header_str: raw header line
        :param header_type: type of the header
        :param header_index: number of the header
        :param content: optional initial content; it is copied so that later
            ``append_content`` calls never mutate the caller's list
        """
        self.header_str: str = header_str
        self.header_type: str = header_type
        self.header_index: int = header_index
        self.content: list = list(content) if content else []

    def to_dict(self) -> dict:
        """Return this header as a ``TITLE``/``CONTENT`` dict."""
        return build_header_dict(self.header_str, self.content)

    def append_content(self, new_content: Union[str, list]) -> None:
        """
        Add content below this header.

        :param new_content: a list is spliced in item by item; anything else
            is appended as a single item
        """
        if isinstance(new_content, list):
            self.content.extend(new_content)
        else:
            self.content.append(new_content)
def build_nested_list(content_list: list) -> list:
    """
    Group a flat list of strings under the headers of the first header type found.

    The type of the first header in ``content_list`` becomes the grouping type
    of this level. Every later string is attached to the most recently opened
    header of that type; strings seen before any header stay at this level.
    The content of each header is then processed recursively in the same way.

    :param content_list: list of content strings
    :return: new nested list of plain strings and ``TITLE``/``CONTENT`` dicts
    """
    # Most recently opened header of this level (None until the first one).
    current: Optional[Header] = None
    # Items of this level: plain strings and Header objects, in input order.
    new_content_list: list = []
    # Only the first matching header type is used for grouping at this level.
    first_header_type: Optional[str] = None

    for cont_str in content_list:
        # `is_header_text` searches all regex patterns and returns a boolean.
        if is_header_text(cont_str):
            # `find_header_type_and_char_index` returns one of 3 cases:
            # 1. Not matched: (None, -1)
            # 2. Matched, non-indexed header such as "- hyphen text": (type_str, -1)
            # 3. Matched, indexed header such as "1. Header string": (type_str, int >= 0)
            header_type, header_index = find_header_type_and_char_index(cont_str)
            if not first_header_type:
                first_header_type = header_type

            if first_header_type == header_type:
                # TODO: how to filter out duplicated use of non index header type?
                # A header of the grouping type opens a new group when it is the
                # first one, has no index, or has a larger index than the current
                # header; otherwise it is treated as plain text of the current header.
                opens_group = (
                    current is None
                    or header_index == -1
                    or current.header_index < header_index
                )
                if opens_group:
                    current = Header(header_str=cont_str, header_type=header_type, header_index=header_index)
                    new_content_list.append(current)
                else:
                    current.append_content(cont_str)
                continue

        # Plain text, or a header of a different type: it belongs to the current
        # header if there is one, otherwise it stays at this level.
        if current is not None:
            current.append_content(cont_str)
        else:
            new_content_list.append(cont_str)

    # Recurse into each header's content, then convert the headers to dicts.
    result: list = []
    for item in new_content_list:
        if isinstance(item, Header):
            item.content = build_nested_list(item.content)
            result.append(item.to_dict())
        else:
            result.append(item)
    return result
My code matched all square-bracket headers at the same depth, so it generated wrong output like this:
[
{
"TITLE": "1. Hello",
"CONTENT": []
},
{
"TITLE": "2. World",
"CONTENT": [
"content string 1",
"content string 2",
{
"TITLE": "- hyphen",
"CONTENT": [
"content string 3",
{
"TITLE": "[Bracket]",
"CONTENT": [
"content string 4",
"content string 5"
]
}
]
}
]
},
{
"TITLE": "3. This is",
"CONTENT": [
"content string 6"
]
},
{
"TITLE": "4. Numbering header",
"CONTENT": [
"content string 7",
{
"TITLE": "[Bracket]",
"CONTENT": [
"content string 8"
]
},
{
"TITLE": "[Nested Bracket]",
"CONTENT": [
"content string 9",
{
"TITLE": "\\u25c8 Square header",
"CONTENT": [
"content string 10",
{
"TITLE": "- hyphen",
"CONTENT": [
"content string 11"
]
}
]
}
]
},
{
"TITLE": "[Nested Bracket]",
"CONTENT": []
}
]
}
]
I'm struggling to solve this problem and can't think of an appropriate approach. Is there an approach that makes this possible?
Real data is here (Korean): https://dart.fss.or.kr/report/viewer.do?rcpNo=20210427000533&dcmNo=8044871&eleId=13&offset=551483&length=2243274&dtd=dart3.xsd
Thank you.
CodePudding user response:
Using recursion:
import re
# Header patterns, most specific first: group() uses the first pattern that
# matches a line as that line's header type. Raw strings keep the regex
# escapes intact; `\d+` matches one or more leading digits ("1.", "10.", ...).
patterns = [r'^\d+\.', r'^\-', r'^\[Nested', r'^\[', r'^◈']


def group(d):
    """
    Lazily turn a flat list of lines into nested title/content dicts.

    The first header line found fixes the header type of this level. Each
    later line of that same type starts a new group; every other line seen
    after a header (plain text or a header of another type) is collected as
    that group's content and grouped recursively. Plain lines that come
    before any header are yielded unchanged.

    :param d: iterable of strings
    :return: generator of plain strings and
        ``{'title': ..., 'content': [...]}`` dicts
    """
    content, m, h = [], None, None
    for i in d:
        if (m_p := [j for j in patterns if re.search(j, i)]):
            if m is None or m_p[0] == m:
                # A header of this level: flush the previous group first.
                if h is not None:
                    yield {'title': h, 'content': [*group(content)]}
                    content = []
                m, h = m_p[0], i
            else:
                # A header of another type is content of the open group.
                content.append(i)
        elif h is not None:
            content.append(i)
        else:
            yield i
    # Flush the last open group.
    if h is not None:
        yield {'title': h, 'content': [*group(content)]}
import json
# Sample input from the question: header lines and content lines in one flat list.
string_list = ['1. Hello', '2. World', 'content string 1', 'content string 2', '- hyphen', 'content string 3', '[Bracket]', 'content string 4', 'content string 5', '3. This is', 'content string 6', '4. Numbering header', 'content string 7', '[Bracket]', 'content string 8', '[Nested Bracket]', 'content string 9', '◈ Square header', 'content string 10', '- hyphen', 'content string 11', '[Nested Bracket]']
# group() is a generator, so unpack it into a list before serializing.
print(json.dumps([*group(string_list)], indent=4))
[
{
"title": "1. Hello",
"content": []
},
{
"title": "2. World",
"content": [
"content string 1",
"content string 2",
{
"title": "- hyphen",
"content": [
"content string 3",
{
"title": "[Bracket]",
"content": [
"content string 4",
"content string 5"
]
}
]
}
]
},
{
"title": "3. This is",
"content": [
"content string 6"
]
},
{
"title": "4. Numbering header",
"content": [
"content string 7",
{
"title": "[Bracket]",
"content": [
"content string 8",
{
"title": "[Nested Bracket]",
"content": [
"content string 9",
{
"title": "\u25c8 Square header",
"content": [
"content string 10",
{
"title": "- hyphen",
"content": [
"content string 11"
]
}
]
}
]
},
{
"title": "[Nested Bracket]",
"content": []
}
]
}
]
}
]
Edit: you can pass the result from `group` to another function, which performs the subnesting of bracketed groups:
from itertools import groupby
def combine(d):
    """
    Chain a run of header dicts so that each one is nested inside the previous one.

    ``combine([a, b, c])`` returns a copy of ``a`` whose content ends with a
    copy of ``b``, whose content in turn ends with a copy of ``c``. The input
    dicts and their content lists are not modified.

    :param d: non-empty list of ``{'title': ..., 'content': [...]}`` dicts
    :return: new dict for ``d[0]`` with the remaining items nested inside it
    :raises IndexError: if ``d`` is empty
    """
    if not d:
        raise IndexError('combine() requires at least one item')
    nested = None
    # Build from the innermost (last) item outwards in a single pass.
    for item in reversed(d):
        content = [*item['content']]
        if nested is not None:
            content.append(nested)
        nested = {**item, 'content': content}
    return nested
def merge(d):
    """
    Nest consecutive bracketed header dicts inside one another, at every depth.

    Items are split into runs of bracketed headers (dicts whose title starts
    with ``'['``) and runs of everything else. The content of every dict is
    merged recursively first; a bracketed run is then collapsed into a single
    nested dict with ``combine``, while other runs are kept as they are.

    :param d: list of plain strings and ``{'title': ..., 'content': [...]}`` dicts
    :return: new list with bracketed runs nested
    """
    def is_bracketed(x):
        # startswith() also copes with an empty title, unlike title[0].
        return isinstance(x, dict) and x['title'].startswith('[')

    r = []
    for bracketed, run in groupby(d, key=is_bracketed):
        run = [{**i, 'content': merge(i['content'])} if isinstance(i, dict) else i for i in run]
        if bracketed:
            r.append(combine(run))
        else:
            r.extend(run)
    return r
# Note: run this with patterns = [r'^\d+\.', r'^\-', r'^\[', r'^◈'], i.e. without
# the '^\[Nested' entry, so that group() treats all bracketed headers as one
# header type and merge() does the nesting of consecutive bracketed groups.
print(json.dumps(merge([*group(string_list)]), indent=4))
Output:
[
{
"title": "1. Hello",
"content": []
},
{
"title": "2. World",
"content": [
"content string 1",
"content string 2",
{
"title": "- hyphen",
"content": [
"content string 3",
{
"title": "[Bracket]",
"content": [
"content string 4",
"content string 5"
]
}
]
}
]
},
{
"title": "3. This is",
"content": [
"content string 6"
]
},
{
"title": "4. Numbering header",
"content": [
"content string 7",
{
"title": "[Bracket]",
"content": [
"content string 8",
{
"title": "[Nested Bracket]",
"content": [
"content string 9",
{
"title": "\u25c8 Square header",
"content": [
"content string 10",
{
"title": "- hyphen",
"content": [
"content string 11"
]
}
]
},
{
"title": "[Nested Bracket]",
"content": []
}
]
}
]
}
]
}
]
CodePudding user response:
def nest(lines):
    """
    Build a tree of ``TITLE``/``CONTENT`` dicts from a flat list of lines.

    Lines are classified as follows:

    * ``"<digits>. ..."`` starts a new top-level section;
    * an empty line or a line starting with a lowercase letter is plain
      content of the most recently opened header;
    * any other line is a sub-header nested under the most recently opened
      header. For bracketed sub-headers (``"[...]"``), the first two seen in
      a section nest normally; any later one is attached as a sibling of the
      second, i.e. under the first.

    Content or sub-headers that appear before the first numbered section are
    placed at the top level of the tree.

    :param lines: iterable of strings
    :return: list of plain strings and ``TITLE``/``CONTENT`` dicts
    """
    tree = []
    # List that currently receives content; the tree itself until a header opens.
    container = tree
    # Bracketed headers recorded for the current section (at most two).
    brackets = []
    for line in lines:
        number, dot, _ = line.partition('.')
        if dot and number.isdigit():
            # Numbered header: start a new top-level section.
            brackets = []
            section = dict(TITLE=line, CONTENT=[])
            tree.append(section)
            container = section['CONTENT']
        elif not line or line[0].islower():
            container.append(line)
        else:
            child = dict(TITLE=line, CONTENT=[])
            if line.startswith('['):
                if len(brackets) > 1:
                    # Third or later bracket: attach under the first bracket.
                    container = brackets[-2]['CONTENT']
                else:
                    brackets.append(child)
            container.append(child)
            container = child['CONTENT']
    return tree