I'm trying to get the HTML structure of multiple websites using Node.js, and I'm having difficulties. I want to get just the HTML structure of the document, with no content. I want to preserve classes, ids, and other attributes. Here is an example of what I want back:
<title></title>
</head>
<body>
<h1></h1>
<div>
<div >
<p></p>
</div>
</div>
</body>
Any suggestions on how to do this? Thanks.
CodePudding user response:
Basically, you want to remove all text nodes, so it's time to traverse the elements.
But first, we load the HTML string using DOMParser (a browser API; in Node.js it has to come from a DOM library such as jsdom).
/**
 * Helper that empties every text node beneath a DOM element while leaving the
 * element tree itself (tags and attributes) untouched.
 */
const EnglishCharFixer = {
  /**
   * Blanks all text nodes found under `elem`.
   *
   * @param {Node} elem - Root of the subtree to process; mutated in place.
   * @returns {Node} The same `elem` that was passed in.
   */
  do_elem(elem) {
    const nodes = this.textNodesUnder(elem);
    this.process_text_nodes(nodes);
    return elem;
  },

  /**
   * Collects the text nodes below `node`, depth-first, in sibling order.
   *
   * @param {Node} node - Node whose descendants are searched.
   * @returns {Node[]} Every descendant whose `nodeType` is 3 (text).
   */
  textNodesUnder(node) {
    const TEXT_NODE = 3;
    const found = [];
    // Walk with a separate cursor so the `node` parameter is never reassigned.
    for (let child = node.firstChild; child; child = child.nextSibling) {
      if (child.nodeType === TEXT_NODE) {
        found.push(child);
      } else {
        for (const nested of this.textNodesUnder(child)) {
          found.push(nested);
        }
      }
    }
    return found;
  },

  /**
   * Sets `nodeValue` of each given node to the empty string.
   *
   * @param {Node[]} nodes - Nodes to blank; each one is mutated in place.
   * @returns {void}
   */
  process_text_nodes(nodes) {
    for (const node of nodes) {
      node.nodeValue = '';
    }
  },
};
// Sample document used to demonstrate content stripping.
// The closing script tag is written as `<\/script>` so that this literal can
// also live inside an inline <script> element without ending it early; the
// escape evaluates to a plain `/`, so the string contains `</script>`.
const htmlString = `
<html>
<head>
<script>var x=12<\/script>
</head>
<body>
<h1>this is test</h1>
<div>
<p>THIS IS TEXT THAT SHOULDN'T BE IN OUTPUT</p>
</div>
</body>
</html>
`;
/**
 * Parses an HTML string, blanks its text nodes via `EnglishCharFixer`, and
 * returns the serialized markup of the resulting document element.
 *
 * @param {string} html - HTML source to parse.
 * @param {{parseFromString: function(string, string): Document}} [parser] -
 *   Parser to use. Defaults to a new `DOMParser`, which only exists in
 *   browsers; pass a parser from a DOM library when no global one is available.
 * @returns {string} `outerHTML` of the parsed document's root element after
 *   its text nodes have been emptied.
 */
function removeContentKeepStructure(html, parser = new DOMParser()) {
  const doc = parser.parseFromString(html, "text/html");
  EnglishCharFixer.do_elem(doc.documentElement);
  return doc.documentElement.outerHTML;
}
// Demo: print what removeContentKeepStructure returns for the sample document.
console.log(removeContentKeepStructure(htmlString))
CodePudding user response:
One solution is to match
the opening and closing tags with the regex /<\/?.*?>/g,
which produces an array of all opening and closing tags without their content, and then join
the array.
// Sample input for the tag-extraction demo.
const html = `<html>
<head>
<title>title</title>
</head>
<body>
<h1>header</h1>
<div>
<div >
<p>paragrapth</p>
</div>
</div>
</body>
</html>`;

/**
 * Returns only the opening and closing tags of `markup`, concatenated in order.
 *
 * The pattern uses `[^>]*` rather than `.*?` so that a tag whose attributes
 * span several lines is still matched (`.` does not match newlines).
 * This is a purely textual scan, not an HTML parser: a `>` inside an attribute
 * value or comment ends the match early.
 *
 * @param {string} markup - HTML source text.
 * @returns {string} The tags joined together, or '' when there are none.
 */
function extractTags(markup) {
  // `match` yields null when nothing matches, so fall back to an empty list.
  const tags = markup.match(/<\/?[^>]*>/g) || [];
  return tags.join('');
}

const result = extractTags(html);
console.log(result);
CodePudding user response:
Using recursion to simply clear .textContent
from each node and then finishing with the .outerHTML
property works well.
<html>
<head>
<title>This is <span>the title</span></title>
<meta http-equiv="X-UA-Compatible" content="IE=edge">
</head>
<body >
<main id="rt">
<h1>This is a header</h1>
<div>
<div >
<p>This is a <span>paragraph</span></p>
</div>
<div id="shadow-rt">
<div>
<span id="shadow-dom-child"></span>
</div>
</div>
</div>
</main>
</body>
<script>
/**
 * Recursively clears `textContent` throughout the subtree rooted at `node`.
 *
 * Children are visited before their parent. By the time a parent is checked,
 * the text of its descendants has already been emptied, so a parent is only
 * overwritten when it still reports non-empty text itself.
 *
 * @param {Node|null|undefined} node - Subtree root; nothing happens when it is
 *   null or undefined.
 * @returns {void}
 */
function walkTree(node) {
  if (node == null) {
    return;
  }
  for (const child of node.childNodes) {
    walkTree(child);
  }
  if (node.textContent) {
    node.textContent = "";
  }
}
// Look up the #rt element, attach a closed shadow root to it, then clear the
// text of its subtree.
const rtElement = document.getElementById("rt");
rtElement.attachShadow({ mode: 'closed' });
walkTree(rtElement);

// Log the markup of the first <html> element.
const htmlElements = document.getElementsByTagName("HTML");
console.log(htmlElements[0].outerHTML);
</script>
</html>