How to return null if h4 not found in python-CodePudding

My script:

import requests
from bs4 import BeautifulSoup
import csv
import re
from itertools import zip_longest

# Fetch the page and collect the two heading levels as two separate lists.
url = requests.get('https://example.com')
soup = BeautifulSoup(url.text, 'lxml')
headers = soup.find_all(["h3"])
texts = soup.find_all(["h4"])

# zip_longest pairs the two lists purely by position and pads the shorter one
# with ''. Because the pairing is positional, a heading that is missing in the
# middle of the page shifts every later pair by one.
for header, text in zip_longest(headers, texts, fillvalue=''):
    # The '' fill value is a plain str with no .text attribute, so both sides
    # need the guard, not just `text`.
    row = [header.text.strip() if header else '', text.text.strip() if text else '']

Website:

<h4>xxx</h4>
<h3>yyy</h3>
<h4>sss</h4>
h3 is missing
<h4>zzz</h4>
<h3>ooo</h3>

My result: {xxx,yyy},{sss,ooo},{zzz,null}. The correct result should be: {xxx,yyy},{sss,null},{zzz,ooo}

CodePudding user response：

It's a somewhat hacky solution, but you can do something like the following:

soup = BeautifulSoup(url.text, 'html.parser') #not lxml

# Find every <h4> whose immediately preceding sibling element is not an <h3>
# (CSS adjacent-sibling combinator "+"). Such an <h4> follows another <h4>,
# which means the <h3> that should sit between them is missing, so an empty
# <h3> placeholder is inserted in front of it.
for x in soup.select(':not(h3) + h4'):
    x.insert_before(soup.new_tag('h3'))

# With the placeholders in place the two lists line up pairwise for the
# sample page (zip would silently drop a trailing <h4> that has no <h3>).
headers = soup.find_all("h3")
texts = soup.find_all("h4")
for text, header in zip(texts, headers):
    row = [text.text.strip(), header.text.strip()]
    print(row)

Output:

['xxx', 'yyy']
['sss', '']
['zzz', 'ooo']

CodePudding user response：

For this I ended up defining a put_Nones function which fills the gaps in the list with Nones. I think it's flexible enough to work with any ordering of two distinct tags and fill the gaps appropriately. Once that's done, you can iterate over the generated list and build your desired output. Here is the final code (I've put comments in the code):

from bs4 import BeautifulSoup
# Sample markup for the demo: "sss" is followed directly by another <h4>,
# so there is no <h3> between "sss" and "zzz".
html = """<h4>xxx</h4>
<h3>yyy</h3>
<h4>sss</h4>
<h4>zzz</h4>
<h3>ooo</h3>"""

soup = BeautifulSoup(html, 'html.parser')

# Collect the 'h3' and 'h4' tags in a single find_all call so both kinds end
# up in one list.
h3_and_h4_tags = soup.find_all(['h3', 'h4'])

def put_Nones(lst):
    """Return a copy of *lst* with ``None`` inserted between repeated tags.

    *lst* is a sequence of objects exposing a ``name`` attribute. Whenever an
    item has the same ``name`` as the item right before it, a ``None``
    placeholder is inserted between the two to stand for the missing partner.

    The input list is not modified. An empty input yields an empty list.
    """
    full_list = []
    for tag in lst:
        last = full_list[-1] if full_list else None

        # Two consecutive tags of the same kind: the partner that should sit
        # between them is absent, so mark the gap before adding the tag.
        if last is not None and tag.name == last.name:
            full_list.append(None)
        full_list.append(tag)
    return full_list

full_list = put_Nones(h3_and_h4_tags)

# Walk the padded list two slots at a time: slot i is the first tag of a pair
# and slot i + 1 its partner. Either slot may be None where a tag was missing.
result = []
for i in range(0, len(full_list), 2):
    first = full_list[i]
    # If the list has odd length, the final tag has no partner slot at all;
    # treat that like a missing tag instead of indexing past the end.
    second = full_list[i + 1] if i + 1 < len(full_list) else None
    tag1 = None if first is None else first.string
    tag2 = None if second is None else second.string
    result.append((tag1, tag2))

print(full_list)
print(result)

output :

[<h4>xxx</h4>, <h3>yyy</h3>, <h4>sss</h4>, None, <h4>zzz</h4>, <h3>ooo</h3>]
[('xxx', 'yyy'), ('sss', None), ('zzz', 'ooo')]

CodePudding user response：

For my solution, I process the h4 and h3 tags together and use a simple state machine to get the correct ordering of tag.text according to tag.name.

I stubbed where necessary to make sure it actually runs. You can extract out what you need for your more comprehensive solution.

The below state machine only catches missing h3 tags, but it could be expanded to catch missing h4 tags, as well.

import requests
from bs4 import BeautifulSoup
import csv
import re
from itertools import zip_longest
from collections import namedtuple # NEW

# Kludge for input data: the question's sample markup is inlined here and
# parsed below in place of a fetched page.
Website = """
<h4>xxx</h4>
<h3>yyy</h3>
<h4>sss</h4>
h3 is missing
<h4>zzz</h4>
<h3>ooo</h3>
"""

# (h4 text, h3 text) pairs: the misaligned result reported in the question and
# the result the asker wants instead.
My_result = [('xxx','yyy'),('sss','ooo'),('zzz','')]
Correct = [('xxx','yyy'),('sss',''),('zzz','ooo')]

# Network call stubbed out; the inline sample above is parsed instead.
# url = requests.get('https://example.com')
soup = BeautifulSoup(Website, 'lxml') # url.text
# Request both heading levels in one find_all call so they share one list.
tags = soup.find_all(["h3", "h4"])

# Build (h4 text, h3 text) pairs from the combined tag list. An <h3> closes
# the pair for the most recent <h4>; two <h4> in a row mean the first one's
# <h3> is missing, so that <h4> is emitted with '' in the h3 slot.
tag_pairs = []
h4 = h3 = ''
# Stand-in "previous tag" with an empty name so the first loop iteration can
# compare against prev_tag.name without a special case.
prev_tag = namedtuple('Tag', 'text name')('', '')

for tag in tags:
    # Update the current h4/h3 text; a new h4 resets the h3 slot.
    if tag.name == 'h4':
        h4, h3 = tag.text, ''
    elif tag.name == 'h3':
        h3 = tag.text
    else:
        raise Exception(f"Unexpected Tag Name {tag.name}")
    # Decide whether a pair is complete.
    if tag.name == 'h4' == prev_tag.name:
        # h3 was just reset to '', so this emits the previous h4 unpaired.
        tag_pairs.append((prev_tag.text, h3))
    elif tag.name == 'h3':
        tag_pairs.append((h4, h3))
    prev_tag = tag

# An <h4> that is the very last tag is never followed by anything that would
# trigger one of the appends above, so flush it here with an empty h3.
if prev_tag.name == 'h4':
    tag_pairs.append((prev_tag.text, ''))

print(f"{tag_pairs=}")
print(tag_pairs == Correct)

# LEGACY CODE BELOW

# Positional pairing of the separately collected h3 and h4 lists; the shorter
# list is padded with '' and each row is stored as (h4 text, h3 text).
headers = soup.find_all(["h3"])
texts = soup.find_all(["h4"])
rows = [
    (
        h4_tag.text.strip() if h4_tag else '',
        h3_tag.text.strip() if h3_tag else '',
    )
    for h3_tag, h4_tag in zip_longest(headers, texts, fillvalue='')
]

print(f"{rows=}")

print(rows == My_result)