I want to preprocess HTML documents using bs4
and need a way to find all leaf nodes in an HTML document that do not have siblings.
from typing import List
import bs4
def _is_leaf_node(tag: bs4.Tag) -> bool:
if isinstance(tag, bs4.NavigableString):
return False
if len(tag.find_all(text=False)) > 0:
return False
return True
def _has_sibling_nodes(tag: bs4.Tag) -> bool:
if tag.previous_sibling is not None:
return True
if tag.next_sibling is not None:
return True
return False
_is_leaf_node_without_siblings = lambda node: _is_leaf_node(node) and not _has_sibling_nodes(node)
_is_leaf_node_without_siblings.__name__ = "_is_leaf_node_without_siblings"
def _find_leaf_nodes_without_siblings(soup: bs4.BeautifulSoup) -> List[bs4.Tag]:
leaf_nodes_without_siblings = soup.find_all(_is_leaf_node_without_siblings)
return list(leaf_nodes_without_siblings)
def main():
html = """<html>
<body>
<div>
<ul>
<li>Test</li>
<li>Test</li>
</ul>
</div>
<div>
<span>Test</span>
</div>
</body>
</html>"""
soup = bs4.BeautifulSoup(html, "html.parser")
print(_find_leaf_nodes_without_siblings(soup))
if __name__ == "__main__":
main()
Currently my implementation returns []
but it should return
[<span>Test</span>]
.
CodePudding user response:
Your problem is that you're using previous_sibling
, which returns text nodes as well. Note that <span>Test</span>
has whitespace around it, so technically it has siblings. If you want the previous sibling element returned, do find_previous_sibling()
instead. Ditto for next_sibling
.
(The find_*()
methods return elements when no parameters are passed; it seems like a criminally undocumented feature as I don't see it mentioned in the docs(?))
Tip: I'd call "leaf node without siblings" a brat, as children without siblings are often spoiled brats. It'd make the code shorter.
Tip: Alternatively, use xpath: //*[position()=1 and position()=last()][not(*)]
.
CodePudding user response:
The following should return the expected result, with less complexity:
from bs4 import BeautifulSoup, NavigableString
html = '''
<html>
<body>
<div>
<ul>
<li>Test</li>
<li>Test</li>
</ul>
</div>
<div>
<span>Test</span>
</div>
</body>
</html>
'''
soup = BeautifulSoup(html, "html.parser")
print([el for el in soup.find_all() if (len(el.find_next_siblings()) == 0 and
len(el.find_previous_siblings()) == 0 and
len(el.contents) == 1 and
isinstance(el.contents[0], NavigableString)) or (len(el.find_next_siblings()) == 0 and
len(el.find_previous_siblings()) == 0 and
len(el.contents) == 0)])
Resulting in:
[<span>Test</span>]
This covers both scenarios where leaf node is empty, or not (has text).