I have 3 different types of HTML snippets, each of which is part of a larger page, as follows:
<html>
<body>
<span _ngcontent-dna-c199="" class="font-weight-bold">
<span _ngcontent-dna-c199="" class="ng-star-inserted">
<span _ngcontent-dna-c199="" translate="">
issue_number
</span>
4 Näköispainos
</span>
<span _ngcontent-dna-c199="" class="ng-star-inserted">
6.12.1939
</span>
</span>
</body>
</html>
and
<html>
<body>
<span _ngcontent-sut-c199="" class="font-weight-bold">
<span _ngcontent-sut-c199="" class="ng-star-inserted">
<span _ngcontent-sut-c199="" translate="">
issue_number
</span>
8
</span>
<span _ngcontent-sut-c199="" class="ng-star-inserted">
1998
</span>
</span>
</body>
</html>
and
<html>
<body>
<span _ngcontent-dgu-c199="" class="font-weight-bold">
<span _ngcontent-dgu-c199="" class="ng-star-inserted">
1905
</span>
</span>
</body>
</html>
Given the following code:
from bs4 import BeautifulSoup

# `html` stands for one of the three snippets shown above (html_1, html_2, html_3).
soup = BeautifulSoup(html, "lxml")

# Take the first bold wrapper span; its text includes the text of every nested span.
res = soup.find("span", class_="font-weight-bold")

# Split on any whitespace so each word becomes one list element.
print(res.text.split())
I get the following results:
['issue_number', '4', 'Näköispainos', '6.12.1939'] # html_1
['issue_number', '8', '1998'] # html_2
['1905'] # html_3
However, my desired list should always have 4 elements and look like this:
desired_list = ["issue_number", "number", "extension", "date"]
So if a piece of information is not available in an HTML snippet, I'd like to get None (or simply "-") in the corresponding position of my desired list, as follows:
['issue_number', '4', 'Näköispainos', '6.12.1939'] # html_1
['issue_number', '8', None, '1998'] # html_2
[None, None, None, '1905'] # html_3
Is there any way to manipulate the result list to obtain the desired list using soup.find()?
CodePudding user response:
Try it like below:
from bs4 import BeautifulSoup

# The three sample documents from the question (html_1, html_2, html_3).
# The outer span carries class "font-weight-bold"; the spans holding the
# "number [extension]" text and the date carry class "ng-star-inserted".
pages = [
    '''
<html>
<body>
<span _ngcontent-dna-c199="" class="font-weight-bold">
<span _ngcontent-dna-c199="" class="ng-star-inserted">
<span _ngcontent-dna-c199="" translate="">
issue_number
</span>
4 Näköispainos
</span>
<span _ngcontent-dna-c199="" class="ng-star-inserted">
6.12.1939
</span>
</span>
</body>
</html>
''',
    '''
<html>
<body>
<span _ngcontent-sut-c199="" class="font-weight-bold">
<span _ngcontent-sut-c199="" class="ng-star-inserted">
<span _ngcontent-sut-c199="" translate="">
issue_number
</span>
8
</span>
<span _ngcontent-sut-c199="" class="ng-star-inserted">
1998
</span>
</span>
</body>
</html>
''',
    '''
<html>
<body>
<span _ngcontent-dgu-c199="" class="font-weight-bold">
<span _ngcontent-dgu-c199="" class="ng-star-inserted">
1905
</span>
</span>
</body>
</html>
''',
]

# Placeholder stored for a field that is absent from the markup.
# It is the *string* 'None', matching the result shown at the bottom.
MISSING = 'None'

# Maps 'html_<n>' (1-based) to [issue label, number, extension, date].
my_dct = {}
for page_no, page in enumerate(pages, start=1):
    soup = BeautifulSoup(page, "lxml")
    spans = soup.find_all("span", attrs={"class": "ng-star-inserted"})

    label = number = extension = date = MISSING
    if spans:  # a page without any matching span yields four placeholders
        first = spans[0]

        # The issue label, when present, is the span nested in the first match.
        if first.span is not None:
            label = first.span.get_text(strip=True) or MISSING

        # With two matching spans the first one holds "number [extension]"
        # and the second one the date; with a single span it is the date.
        if len(spans) > 1:
            # Only text placed directly inside the first span, so the nested
            # label never has to be stripped out by string replacement.
            own_text = "".join(first.find_all(string=True, recursive=False))
            words = own_text.split()
            if words:
                number = words[0]
                extension = " ".join(words[1:]) or MISSING
            date_span = spans[1]
        else:
            date_span = first
        date = date_span.get_text(strip=True)

    my_dct['html_' + str(page_no)] = [label, number, extension, date]

print(my_dct)

# Result:
# {
#     'html_1': ['issue_number', '4', 'Näköispainos', '6.12.1939'],
#     'html_2': ['issue_number', '8', 'None', '1998'],
#     'html_3': ['None', 'None', 'None', '1905']
# }
Regards...
CodePudding user response:
You could search for each part individually:
def bs2customList(soup):
    """Extract ``[issue_label, number, extension, date]`` from a parsed page.

    Looks inside the first ``<span class="font-weight-bold">`` and classifies
    every non-blank text node by the ``<span>`` that directly contains it:

    * no ``class`` and ``translate=""``                   -> issue label
    * ``class="ng-star-inserted"`` with one nested span   -> "number [extension]"
    * ``class="ng-star-inserted"`` with no nested span    -> date / year

    Args:
        soup: A ``BeautifulSoup`` document (or tag) to search.

    Returns:
        A list of exactly four items; any part that is not present in the
        markup is ``None``.
    """
    bold = soup.find("span", class_="font-weight-bold")
    if bold is None:
        return [None, None, None, None]

    # One row per non-blank text node:
    # (text, parent's class list, parent's translate attribute, nested span count)
    rows = [
        (
            node.get_text(strip=True),
            node.parent.get("class", []),
            node.parent.get("translate"),
            len(node.parent.select("span")),
        )
        for node in bold.descendants
        if node.name is None
        and node.parent.name == "span"
        and node.get_text(strip=True)
    ]

    def first_text(*signature):
        """Return the first text whose parent matches *signature*, else None."""
        return next(
            (text for text, *rest in rows if tuple(rest) == signature), None
        )

    issue_label = first_text([], "", 0)
    date = first_text(["ng-star-inserted"], None, 0)
    number_and_ext = first_text(["ng-star-inserted"], None, 1)

    # Split into words and keep the number whole: indexing the number string
    # itself would keep only its first character ("12" -> "1").
    words = number_and_ext.split() if number_and_ext else []
    number = words[0] if words else None
    # Everything after the number is the extension, however many words it has.
    extension = " ".join(words[1:]) or None
    return [issue_label, number, extension, date]
or maybe this is more understandable:
def bs2customList(soup):
    """Extract ``[issue_label, number, extension, date]`` from a parsed page.

    Works on the ``span.ng-star-inserted`` elements found inside the first
    ``<span class="font-weight-bold">``:

    * the first one that contains a ``span[translate]`` supplies the issue
      label (text of that nested span) and the "number [extension]" text
      (its own direct text);
    * the first one without a ``span[translate]`` supplies the date / year.

    Args:
        soup: A ``BeautifulSoup`` document (or tag) to search.

    Returns:
        A list of exactly four items; any part that is not present in the
        markup is ``None``.
    """
    fwb = soup.find("span", class_="font-weight-bold")
    if fwb is None:
        return [None] * 4
    inserted = fwb.select("span.ng-star-inserted")  # queried once, reused below
    if not inserted:
        return [None] * 4

    # Partition the spans by whether they wrap a translated label.
    with_label, without_label = [], []
    for tag in inserted:
        if tag.select("span[translate]"):
            with_label.append(tag)
        else:
            without_label.append(tag)

    issue_num = num = ext = None
    if with_label:
        holder = with_label[0]
        issue_num = holder.select_one("span[translate]").get_text(strip=True)
        # Text nodes that are direct children of the holder (nested label excluded).
        own_texts = [
            child.get_text(strip=True)
            for child in holder.children
            if child.name is None and child.get_text(strip=True)
        ]
        if own_texts:
            words = own_texts[0].split()
            # keep words[0].isdigit() only if "number" is always an integer
            if len(words) > 1 and words[0].isdigit():
                num = words[0]
                ext = " ".join(words[1:])
            else:
                # Single word, or no leading integer: treat it all as the number.
                num = " ".join(words)

    ydate = without_label[0].get_text(strip=True) if without_label else None
    return [issue_num, num, ext, ydate]
Whichever version of the function is used, with the below test set:
# Test documents for bs2customList: the three snippets from the question,
# a page whose bold span has no text at all, and an empty string.
# The class attributes are what the function searches for
# ("font-weight-bold" on the wrapper, "ng-star-inserted" on the inner spans).
htmls = [
    '''
<html>
<body>
<span _ngcontent-dna-c199="" class="font-weight-bold">
<span _ngcontent-dna-c199="" class="ng-star-inserted">
<span _ngcontent-dna-c199="" translate="">
issue_number
</span>
4 Näköispainos
</span>
<span _ngcontent-dna-c199="" class="ng-star-inserted">
6.12.1939
</span>
</span>
</body>
</html>
''',
    '''
<html>
<body>
<span _ngcontent-sut-c199="" class="font-weight-bold">
<span _ngcontent-sut-c199="" class="ng-star-inserted">
<span _ngcontent-sut-c199="" translate="">
issue_number
</span>
8
</span>
<span _ngcontent-sut-c199="" class="ng-star-inserted">
1998
</span>
</span>
</body>
</html>
''',
    '''
<html>
<body>
<span _ngcontent-dgu-c199="" class="font-weight-bold">
<span _ngcontent-dgu-c199="" class="ng-star-inserted">
1905
</span>
</span>
</body>
</html>
''',
    '<html><body><span class="font-weight-bold"></span></body></html>',
    '',  # empty str
]
printing with
# Parse each test document and print the four-element list extracted from it.
for markup in htmls:
    parsed = BeautifulSoup(markup, 'lxml')
    print(bs2customList(parsed))
gives the same output [with both versions]:
['issue_number', '4', 'Näköispainos', '6.12.1939']
['issue_number', '8', None, '1998']
[None, None, None, '1905']
[None, None, None, None]
[None, None, None, None]
(The last 2 tests are with an empty [textless] html and an empty string.)