How to create nested JSON from a list of strings that contains heading titles and contents-CodePudding

Sorry for the poor title; it was the best I could come up with.

I have a list of strings containing an article's headers and contents, and they are nested.

# Flat sample input in document order: header lines ('1. ...', '- ...',
# '[...]', '◈ ...') interleaved with plain 'content string N' lines.
string_list = [
    '1. Hello',
    '2. World',
    'content string 1',
    'content string 2',
    '- hyphen',
    'content string 3',
    '[Bracket]',
    'content string 4',
    'content string 5',
    '3. This is',
    'content string 6',
    '4. Numbering header',
    'content string 7',
    '[Bracket]',
    'content string 8',
    '[Nested Bracket]',
    'content string 9',
    '◈ Square header',
    'content string 10',
    '- hyphen',
    'content string 11',
    '[Nested Bracket]'
]

I need to group the data so that headers of the same type sit at the same depth; in rare cases, however, the same header type is also used again in a nested position. The string_list list must be converted into the dict structure below.

[
    {
        "TITLE": "1. Hello",
        "CONTENT": []
    },
    {
        "TITLE": "2. World",
        "CONTENT": [
            "content string 1",
            "content string 2",
            {
                "TITLE": "- hyphen",
                "CONTENT": [
                    "content string 3",
                    {
                        "TITLE": "[Bracket]",
                        "CONTENT": [
                            "content string 4",
                            "content string 5"
                        ]
                    }
                ]
            }
        ]
    },
    {
        "TITLE": "3. This is",
        "CONTENT": [
            "content string 6"
        ]
    },
    {
        "TITLE": "4. Numbering header",
        "CONTENT": [
            "content string 7",
            {
                "TITLE": "[Bracket]",
                "CONTENT": [
                    "content string 8",
                    {
                        "TITLE": "[Nested Bracket]",
                        "CONTENT": [
                            "content string 9",
                            {
                                "TITLE": "\\u25c8 Square header",
                                "CONTENT": [
                                    "content string 10",
                                    {
                                        "TITLE": "- hyphen",
                                        "CONTENT": [
                                            "content string 11"
                                        ]
                                    }
                                ]
                            }
                        ]
                    },
                    {
                        "TITLE": "[Nested Bracket]",
                        "CONTENT": []
                    }
                ]
            }
        ]
    }
]

I cannot distinguish or classify [Bracket] and [Nested Bracket], because any word can appear inside square brackets. I have already written 2k lines of regex patterns, but it is still impossible.

I wrote some Python code before I realized that some of the data has this defect. As expected, the code below generates the wrong output.


def build_header_dict(header_str: str, content: Any) -> dict:
    """
    Build the output mapping for a single header.

    :param header_str: header line, stored under the ``TITLE`` key
    :param content: content of the header; any falsy value is replaced by a new empty list
    :return: dict with the keys ``TITLE`` and ``CONTENT``
    """
    return {"TITLE": header_str, "CONTENT": content or []}


class Header:
    """
    One header line together with the content collected underneath it.

    ``header_str`` is the raw header line, ``header_type`` and ``header_index``
    are whatever the caller's classifier reported for it, and ``content`` is the
    list of items gathered under the header.
    """

    def __init__(self, header_str: str, header_type: str, header_index: int, content: Optional[list] = None):
        self.header_str: str = header_str
        self.header_type: str = header_type
        self.header_index: int = header_index
        # A missing or empty ``content`` gets a new list; a non-empty list is
        # stored as-is (not copied).
        self.content: list = content or []

    def to_dict(self) -> dict:
        """Return this header as a ``TITLE``/``CONTENT`` dict."""
        return build_header_dict(self.header_str, self.content)

    def append_content(self, new_content: Union[str, list]) -> None:
        """
        Add content below this header.

        :param new_content: a list is spliced in element by element, anything
            else is appended as a single item
        """
        items = new_content if isinstance(new_content, list) else [new_content]
        self.content.extend(items)


def build_nested_list(content_list: list) -> list:
    """
    Nest a flat list of strings below the headers found in it.

    The type of the first header found in ``content_list`` decides which lines
    open a header at this depth. Everything that follows an open header is
    collected as its content, and that content is nested recursively the same way.

    :param content_list: list of content strings
    :return: new nested list of plain strings and ``TITLE``/``CONTENT`` dicts
    """
    # Most recently opened header at this depth; later lines are attached to it.
    current = None
    # Items of this depth in input order: Header objects and leading plain strings.
    level_items: list = []
    # Header type of the first header seen; only this type opens headers here.
    level_type: Optional[str] = None

    for text in content_list:
        opens_header = False
        # `is_header_text` searches all regexes and returns the result as a boolean
        if is_header_text(text):
            # `find_header_type_and_char_index` returns one of 3 cases:
            # 1. not matched: (None, -1)
            # 2. matched, but non-index header such as "- hyphen text": (type_str, -1)
            # 3. matched with an index such as "1. Header string": (type_str, int >= 0)
            found_type, found_index = find_header_type_and_char_index(text)
            if not level_type:
                level_type = found_type
            if found_type == level_type:
                # TODO: how to filter out duplicated use of non index header type?
                # The very first header always opens; later ones open only when
                # they carry no index or an index larger than the current header's.
                opens_header = (
                    current is None
                    or found_index == -1
                    or current.header_index < found_index
                )

        if opens_header:
            current = Header(header_str=text, header_type=found_type, header_index=found_index)
            level_items.append(current)
        elif current is not None:
            # plain text, a header of another type, or a same-type header whose
            # index did not increase: all become content of the current header
            current.append_content(text)
        else:
            # nothing opened yet, so the string stays at this depth
            level_items.append(text)

    # nest every header's content recursively, then convert headers to dicts
    result: list = []
    for item in level_items:
        if isinstance(item, Header):
            item.content = build_nested_list(item.content)
            result.append(item.to_dict())
        else:
            result.append(item)
    return result

My code matched all of the identical square-bracket headers at the same depth, so it generated wrong output like this.

[
    {
        "TITLE": "1. Hello",
        "CONTENT": []
    },
    {
        "TITLE": "2. World",
        "CONTENT": [
            "content string 1",
            "content string 2",
            {
                "TITLE": "- hyphen",
                "CONTENT": [
                    "content string 3",
                    {
                        "TITLE": "[Bracket]",
                        "CONTENT": [
                            "content string 4",
                            "content string 5"
                        ]
                    }
                ]
            }
        ]
    },
    {
        "TITLE": "3. This is",
        "CONTENT": [
            "content string 6"
        ]
    },
    {
        "TITLE": "4. Numbering header",
        "CONTENT": [
            "content string 7",
            {
                "TITLE": "[Bracket]",
                "CONTENT": [
                    "content string 8"
                ]
            },
            {
                "TITLE": "[Nested Bracket]",
                "CONTENT": [
                    "content string 9",
                    {
                        "TITLE": "\\u25c8 Square header",
                        "CONTENT": [
                            "content string 10",
                            {
                                "TITLE": "- hyphen",
                                "CONTENT": [
                                    "content string 11"
                                ]
                            }
                        ]
                    }
                ]
            },
            {
                "TITLE": "[Nested Bracket]",
                "CONTENT": []
            }
        ]
    }
]

I'm struggling to solve this problem, but I can't think of an appropriate approach. Is there any approach to make this job possible?

Real data is here (Korean): https://dart.fss.or.kr/report/viewer.do?rcpNo=20210427000533&dcmNo=8044871&eleId=13&offset=551483&length=2243274&dtd=dart3.xsd

Thank you.

CodePudding user response：

Using recursion:

import re
# Header patterns, tried in order; the first one that matches a line decides its
# header kind, so the more specific '[Nested' must come before the generic '['.
# Raw strings keep the regex escapes intact, and '\d+' accepts multi-digit numbering.
patterns = [r'^\d+\.', r'^-', r'^\[Nested', r'^\[', r'^◈']


def group(d):
    """
    Lazily nest a flat iterable of lines below their headers.

    The kind of the first header found in ``d`` (the first entry of ``patterns``
    that matches it) defines the headers of this depth. Every later line of that
    kind opens a sibling; all other lines are collected as content of the open
    header and grouped recursively.

    :param d: iterable of strings in document order
    :return: generator yielding plain strings (lines seen before any header)
        and ``{'title': ..., 'content': [...]}`` dicts
    """
    content, m, h = [], None, None
    for i in d:
        # first matching pattern, or None when the line is not a header
        kind = next((p for p in patterns if re.search(p, i)), None)
        if kind is not None and (m is None or kind == m):
            # a header of this depth: emit the one collected so far, open a new one
            if h is not None:
                yield {'title': h, 'content': [*group(content)]}
                content = []
            m, h = kind, i
        elif h is not None:
            # plain text or a header of another kind below an open header
            content.append(i)
        else:
            # plain text before any header stays at this depth
            yield i
    if h is not None:
        yield {'title': h, 'content': [*group(content)]}

import json
# Same sample input as in the question, written on a single line.
string_list = ['1. Hello', '2. World', 'content string 1', 'content string 2', '- hyphen', 'content string 3', '[Bracket]', 'content string 4', 'content string 5', '3. This is', 'content string 6', '4. Numbering header', 'content string 7', '[Bracket]', 'content string 8', '[Nested Bracket]', 'content string 9', '◈ Square header', 'content string 10', '- hyphen', 'content string 11', '[Nested Bracket]']
# group() is a generator, so it is unpacked into a list before being serialized.
print(json.dumps([*group(string_list)], indent=4))

[
    {
        "title": "1. Hello",
        "content": []
    },
    {
        "title": "2. World",
        "content": [
            "content string 1",
            "content string 2",
            {
                "title": "- hyphen",
                "content": [
                    "content string 3",
                    {
                        "title": "[Bracket]",
                        "content": [
                            "content string 4",
                            "content string 5"
                        ]
                    }
                ]
            }
        ]
    },
    {
        "title": "3. This is",
        "content": [
            "content string 6"
        ]
    },
    {
        "title": "4. Numbering header",
        "content": [
            "content string 7",
            {
                "title": "[Bracket]",
                "content": [
                    "content string 8",
                    {
                        "title": "[Nested Bracket]",
                        "content": [
                            "content string 9",
                            {
                                "title": "\u25c8 Square header",
                                "content": [
                                    "content string 10",
                                    {
                                        "title": "- hyphen",
                                        "content": [
                                            "content string 11"
                                        ]
                                    }
                                ]
                            }
                        ]
                    },
                    {
                        "title": "[Nested Bracket]",
                        "content": []
                    }
                ]
            }
        ]
    }
]

Edit: you can pass the result from group to another function, which can perform the subnesting of bracketed groups:

from itertools import groupby
def combine(d):
    """
    Chain a run of header dicts so that each one nests inside its predecessor.

    ``combine([a, b, c])`` returns a copy of ``a`` whose content ends with a copy
    of ``b``, whose content in turn ends with a copy of ``c``. Built iteratively
    from the last item backwards, so long runs neither hit the recursion limit
    nor re-slice the list on every step.

    :param d: non-empty sequence of dicts that each have a ``'content'`` list
    :return: new dict for the first item with the rest nested inside it
    :raises IndexError: if ``d`` is empty
    """
    if not d:
        raise IndexError('combine() needs at least one header dict')
    nested = None
    for item in reversed(d):
        tail = [] if nested is None else [nested]
        nested = {**item, 'content': [*item['content'], *tail]}
    return nested


def merge(d):
    """
    Sub-nest consecutive bracketed headers in the output of ``group``.

    Every run of adjacent dicts whose title starts with ``'['`` is collapsed by
    ``combine`` into one chain; all other items are kept in place. The content of
    every dict is merged recursively first.

    :param d: list of plain strings and ``{'title': ..., 'content': [...]}`` dicts
    :return: new list with bracketed runs nested
    """
    r = []
    # startswith() also copes with an empty title, where title[0] would raise
    for is_bracket_run, run in groupby(
        d, key=lambda x: isinstance(x, dict) and x['title'].startswith('[')
    ):
        merged = [
            {**i, 'content': merge(i['content'])} if isinstance(i, dict) else i
            for i in run
        ]
        if is_bracket_run:
            r.append(combine(merged))
        else:
            r.extend(merged)
    return r

# note: run this with the '[Nested' pattern removed, i.e. with
# patterns = [r'^\d+\.', r'^\-', r'^\[', r'^◈']
print(json.dumps(merge([*group(string_list)]), indent=4))

Output:

[
    {
        "title": "1. Hello",
        "content": []
    },
    {
        "title": "2. World",
        "content": [
            "content string 1",
            "content string 2",
            {
                "title": "- hyphen",
                "content": [
                    "content string 3",
                    {
                        "title": "[Bracket]",
                        "content": [
                            "content string 4",
                            "content string 5"
                        ]
                    }
                ]
            }
        ]
    },
    {
        "title": "3. This is",
        "content": [
            "content string 6"
        ]
    },
    {
        "title": "4. Numbering header",
        "content": [
            "content string 7",
            {
                "title": "[Bracket]",
                "content": [
                    "content string 8",
                    {
                        "title": "[Nested Bracket]",
                        "content": [
                            "content string 9",
                            {
                                "title": "\u25c8 Square header",
                                "content": [
                                    "content string 10",
                                    {
                                        "title": "- hyphen",
                                        "content": [
                                            "content string 11"
                                        ]
                                    }
                                ]
                            },
                            {
                                "title": "[Nested Bracket]",
                                "content": []
                            }
                        ]
                    }
                ]
            }
        ]
    }
]

CodePudding user response：

def nest(lines):
    """
    Build a TITLE/CONTENT tree from a flat list of lines using simple heuristics.

    Line classification:

    * ``<digits>.`` at the start of the line: a top-level header; it also resets
      the bracket bookkeeping.
    * an empty line, or a line starting with a lowercase letter: plain content of
      the most recently opened header.
    * anything else: a sub-header nested below the most recently opened header.
      Within one top-level section, the first two headers starting with ``'['``
      are remembered; every later ``'['`` header becomes a sibling of the second
      one (a child of the first).

    Lines that appear before the first numbered header are placed at the top
    level of the result instead of failing.

    :param lines: iterable of strings in document order
    :return: list of plain strings and ``{'TITLE': ..., 'CONTENT': [...]}`` dicts
    """
    tree = []
    # list that content and new sub-headers are appended to; top level at first
    target = tree
    brackets = []
    for line in lines:
        head, dot, _ = line.partition('.')
        if dot and head.isdigit():
            # numbered header: new top-level section
            brackets = []
            section = dict(TITLE=line, CONTENT=[])
            tree.append(section)
            target = section['CONTENT']
        elif not line or line[0].islower():
            # plain content (empty lines are kept as content, too)
            target.append(line)
        else:
            child = dict(TITLE=line, CONTENT=[])
            if line.startswith('['):
                if len(brackets) > 1:
                    # third or later bracket header: attach below the first one
                    target = brackets[-2]['CONTENT']
                else:
                    brackets.append(child)
            target.append(child)
            target = child['CONTENT']
    return tree