I have a list filled with the text content of paragraph tags (<p></p>) scraped from a site using beautifulsoup4.
I would like to break that list into a nested structure in which the sublist names are incremented dynamically, with each increment triggered by a byte-size check on the current sublist. The result should then be used to create a JSON object.
My current code:
import json
import requests
from bs4 import BeautifulSoup

def getContent():
    page = requests.get("https://www.example.com")
    soup = BeautifulSoup(page.content, "html.parser")
    results = soup.prettify()
    data = {}
    SECTION_INDEX = 1
    data_container = []
    article_section_size = 0
    article_section_data = []
    for tag in soup.find_all("p"):
        text = tag.text
        data_container.append(text)
    for p in data_container:
        article_section = "CONTENT_SECTION_" + str(SECTION_INDEX)
        article_section_data.append(p)
        data[article_section] = article_section_data
        if article_section_size >= 300:
            SECTION_INDEX = SECTION_INDEX + 1
    return data

def createJson():
    data = getContent()
    json_source = {
        "ARTICLE_DATA": data
    }
    json_object = json.dumps(json_source, indent=2)

def main():
    createJson()
The actual result:
{
  "CONTENT_DATA": {
    "CONTENT_SECTION_1": [
      "the actual paragraphs",
      "content goes there",
      "some more content",
      "even more content from the site",
      "and some even more",
      "and finally, some more"
    ],
    "CONTENT_SECTION_2": [
      "the actual paragraphs",
      "content goes there",
      "some more content",
      "even more content from the site",
      "and some even more",
      "and finally, some more"
    ],
    "CONTENT_SECTION_3": [
      "the actual paragraphs",
      "content goes there",
      "some more content",
      "even more content from the site",
      "and some even more",
      "and finally, some more"
    ]
  }
}
The desired result:
{
  "CONTENT_DATA": {
    "CONTENT_SECTION_1": [
      "the actual paragraphs",
      "content goes there"
    ],
    "CONTENT_SECTION_2": [
      "some more content",
      "even more content from the site"
    ],
    "CONTENT_SECTION_3": [
      "and some even more",
      "and finally, some more"
    ]
  }
}
How can I achieve this, and why does my code produce the repeated pattern shown in the actual result above?
CodePudding user response:
To achieve the desired result, you can track the size of the current article section with the sys.getsizeof function and split the data_container list into smaller lists based on the desired byte size. Here's the updated code:
import requests
from bs4 import BeautifulSoup
import sys

def getContent():
    page = requests.get("https://www.example.com")
    soup = BeautifulSoup(page.content, "html.parser")
    data = {}
    SECTION_INDEX = 1
    data_container = []
    article_section_size = 0
    article_section_data = []
    for tag in soup.find_all("p"):
        text = tag.text
        data_container.append(text)
    for p in data_container:
        article_section = "CONTENT_SECTION_" + str(SECTION_INDEX)
        article_section_data.append(p)
        # Accumulate the size of the current section. Note that
        # sys.getsizeof returns the in-memory size of the str object,
        # not the UTF-8 byte length of the text itself.
        article_section_size += sys.getsizeof(p)
        if article_section_size >= 300:
            # The section is full: store it, then start a fresh list so
            # the next section does not share the same list object.
            data[article_section] = article_section_data
            article_section_data = []
            article_section_size = 0
            SECTION_INDEX = SECTION_INDEX + 1
    # Store any remaining paragraphs that never reached the threshold.
    if article_section_data:
        data[article_section] = article_section_data
    return data
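To wire this back into the rest of the script, here is a minimal sketch of the surrounding createJson/main code; it mirrors the question's own functions (including its "ARTICLE_DATA" top-level key, even though the sample output shows "CONTENT_DATA"), and the print call is added purely for illustration:

import json

def createJson():
    data = getContent()
    json_source = {
        "ARTICLE_DATA": data
    }
    # Serialize the nested dict to a JSON string and show the result.
    json_object = json.dumps(json_source, indent=2)
    print(json_object)

def main():
    createJson()

if __name__ == "__main__":
    main()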
The repeated pattern in the actual result comes from always appending the p element to the same article_section_data list and never resetting it to an empty list once the desired byte size was reached. Because data[article_section] stores a reference to that list rather than a copy, every CONTENT_SECTION_ key ended up pointing at the same ever-growing list, which is why each section showed the complete set of paragraphs.
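You can reproduce the aliasing effect in isolation with a few lines; the names below are purely illustrative:

data = {}
shared = []

shared.append("first paragraph")
data["CONTENT_SECTION_1"] = shared  # stores a reference, not a copy

shared.append("second paragraph")
data["CONTENT_SECTION_2"] = shared  # the very same list object

# Both keys point at one list, so both sections show both paragraphs:
print(data)
# {'CONTENT_SECTION_1': ['first paragraph', 'second paragraph'],
#  'CONTENT_SECTION_2': ['first paragraph', 'second paragraph']}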