I'm getting an HTML document and need to parse it so that I can read the text under a certain heading. More specifically, there is a div tag that includes several H2 elements, and I need to read only the text between the 3rd and 4th H2 headings, i.e. the Resume section.
Screenshot of the HTML example
Thanks
Patrik
CodePudding user response:
Good question. A recursive function works well for this. The function takes the start point (the third h2) and the end point (the fourth h2), and then visits every node between these two points. Here I write the output to the console, but you could just as well concatenate it into a string.
/**
 * Collects the non-blank text nodes that appear, in document order, after
 * `startNode` and before `endNode` within the subtree rooted at `rootNode`.
 *
 * The boundary nodes themselves are not descended into, so their own text is
 * excluded from the result. If `startNode` is never encountered the result is
 * empty; if `endNode` is never encountered, collection runs to the end of the
 * subtree.
 *
 * @param {Node} rootNode - Root of the subtree to walk.
 * @param {Node} startNode - Node after which text nodes start being collected.
 * @param {Node} endNode - Node at which the walk stops.
 * @returns {Node[]} The matching text nodes, in traversal order.
 */
function getTextFromTo(rootNode, startNode, endNode) {
  const TEXT_NODE = 3; // numeric value compared against node.nodeType
  const textNodes = [];
  let pastStartNode = false;
  let reachedEndNode = false;

  function getTextNodes(node) {
    if (node === startNode) {
      pastStartNode = true;
      return;
    }
    if (node === endNode) {
      reachedEndNode = true;
      return;
    }
    if (node.nodeType === TEXT_NODE) {
      // Keep only text between the boundaries, skipping whitespace-only nodes.
      if (pastStartNode && !reachedEndNode && !/^\s*$/.test(node.nodeValue)) {
        textNodes.push(node);
      }
      return;
    }
    // The index must advance (i++); without it the walk would revisit the
    // first child forever. Stop early once the end node has been seen.
    for (let i = 0, len = node.childNodes.length; !reachedEndNode && i < len; i++) {
      getTextNodes(node.childNodes[i]);
    }
  }

  // Nothing to walk when no root was supplied (e.g. a failed lookup).
  if (rootNode == null) {
    return textNodes;
  }
  getTextNodes(rootNode);
  return textNodes;
}
// Boundary elements for the sample markup: in that markup the 5th child of the
// div is the "Resume" heading and the 11th child is the heading that follows.
const from = document.querySelector('div :nth-child(5)'); // from
const to = document.querySelector('div :nth-child(11)'); // to
const root = document.querySelector('div');
const textNodes = getTextFromTo(root, from, to);
// Print the text of every collected node; for...of guarantees the loop
// advances and terminates.
for (const textNode of textNodes) {
  console.log(textNode.data);
}
<div >
<h2>title 1</h2>
<ul><li></li></ul>
<h2>title 2</h2>
<ul><li></li></ul>
<h2>Resume</h2>
<p>text 1</p>
<p>text 2</p>
<p>text 3 this one</p>
<p>text 4</p>
<p>text 5 this one</p>
<h2>next title</h2>
</div>
The originator of this cool function is @TimDown; I just adapted it from his answer to the question "How can I find all text nodes between two element nodes with JavaScript/jQuery?"
CodePudding user response:
You can use regex for it
/(?<=<h2>Résumé<\/h2>)(.|\n)*?(?=<h2>)/g
This will get all the text after <h2>Résumé</h2> up to the next <h2> tag.