Home > Blockchain >  How to extract a websites html tags in dom and shadowdom
How to extract a website's HTML tags in the DOM and shadow DOM

Time:10-21

I'm trying to get the HTML structure of multiple websites using Node.js, and I'm having difficulties. I want just the HTML structure of the document, with no content, while preserving classes, ids, and other attributes. Here is an example of what I want back:

<head>
<title></title>
</head>
<body>
  <h1></h1>
  <div>
    <div >
      <p></p>
    </div>
  </div>
</body>

Any suggestion on how to do this? Thanks

CodePudding user response:

Basically you want to remove all text nodes. Time to traverse the elements.

But first, we load the html string using DOMParser.

/**
 * Helper that blanks out every text node beneath an element while leaving
 * the element structure (tags and attributes) untouched.
 */
const EnglishCharFixer = {
  /**
   * Empties all text nodes found under `elem`.
   *
   * @param {Node} elem - Root of the subtree to process.
   * @returns {Node} The same `elem`, modified in place.
   */
  do_elem(elem) {
    const nodes = this.textNodesUnder(elem);
    this.process_text_nodes(nodes);
    return elem;
  },

  /**
   * Collects, in document order, every descendant of `node` whose
   * `nodeType` is 3 (a text node).
   *
   * @param {Node} node - Node whose descendants are searched.
   * @returns {Node[]} The text nodes found; empty when there are none.
   */
  textNodesUnder(node) {
    const found = [];
    // Walk the sibling chain with a local cursor instead of reassigning the parameter.
    for (let child = node.firstChild; child; child = child.nextSibling) {
      if (child.nodeType === 3) {
        found.push(child);
      } else {
        for (const textNode of this.textNodesUnder(child)) {
          found.push(textNode);
        }
      }
    }
    return found;
  },

  /**
   * Sets `nodeValue` to the empty string on each node in `nodes`.
   * The nodes stay in the tree; only their text is cleared.
   *
   * @param {Node[]} nodes - Nodes to blank out.
   */
  process_text_nodes(nodes) {
    for (const node of nodes) {
      node.nodeValue = "";
    }
  },
};


// Sample document used to demonstrate the text-stripping helper.
// The "script" tag name is split across concatenated literals so that an
// HTML parser hosting this snippet inline does not treat it as a real
// opening or closing script tag.
const htmlString = `
<html>
<head>
  <scr` + `ipt>var x=12</scr` + `ipt>
</head>
<body>
  <h1>this is test</h1>
  <div>
    <p>THIS IS TEXT THAT SHOULDN'T BE IN OUTPUT</p>
  </div> 
</body>
</html>
`;

/**
 * Parses an HTML string, empties every text node in the resulting document,
 * and serialises the remaining element structure.
 *
 * Relies on the global `DOMParser` and on `EnglishCharFixer` being in scope.
 *
 * @param {string} html - Markup to parse as "text/html".
 * @returns {string} `outerHTML` of the parsed document's root element after
 *   its text nodes have been blanked.
 */
function removeContentKeepStructure(html) {
  const parsedDocument = new DOMParser().parseFromString(html, "text/html");
  const rootElement = parsedDocument.documentElement;
  EnglishCharFixer.do_elem(rootElement);
  return rootElement.outerHTML;
}


// Run the helper on the sample markup and print the structure-only result.
const structureOnlyHtml = removeContentKeepStructure(htmlString);
console.log(structureOnlyHtml);

CodePudding user response:

One solution is to match the opening and closing tags with the regex `/<\/?.*?>/g`, which produces an array of all opening and closing tags without their content, and then join that array.

// Sample markup whose tags are extracted below.
const html = `<html>
<head>
 <title>title</title> 
</head>
<body>
  <h1>header</h1>
  <div>
    <div >
      <p>paragrapth</p>
    </div>
  </div>
</body>
</html>`

// Lazily matches anything from "<" (optionally followed by "/") up to the
// next ">", i.e. one opening or closing tag per match.
const TAG_PATTERN = /<\/?.*?>/g;

// Every tag in source order; the text between tags is never matched.
const matchedTags = html.match(TAG_PATTERN);

// Concatenate the tags back together with nothing in between.
const result = matchedTags.join('');

console.log(result)

CodePudding user response:

Using recursion to simply clear .textContent from each node and then finishing with the .outerHTML property works well.

<html>
    <head>
        <title>This is <span>the title</span></title>
        <meta http-equiv="X-UA-Compatible" content="IE=edge">
    </head>
    <body >
        <main id="rt">
          <h1>This is a header</h1>
          <div>
            <div >
              <p>This is a <span>paragraph</span></p>
            </div>
            <div id="shadow-rt">
                <div>
                    <span id="shadow-dom-child"></span>
                </div>
            </div>
          </div>
        </main>
    </body>
        <script>
            /**
             * Recursively clears text from `node` and all of its descendants.
             *
             * Children are visited before the node itself. By the time an
             * element is checked, the text of its descendants has already
             * been cleared, so its own `textContent` is normally empty and
             * its child elements are kept.
             *
             * @param {Node|null} node - Root of the subtree; `null` is a no-op.
             * @returns {void}
             */
            function walkTree(node) {
              if (node === null) {
                return;
              }
              for (let i = 0; i < node.childNodes.length; i++) {
                walkTree(node.childNodes[i]);
              }
              // Only write when there is still text, which in practice means leaf nodes.
              if (node.textContent) {
                node.textContent = "";
              }
            }
            // Look up the #rt element once and reuse it for both steps.
            const treeRoot = document.getElementById("rt");
            // Attach a closed shadow root to #rt.
            treeRoot.attachShadow({mode: 'closed'});
            // Clear text under #rt only; nodes outside it are not visited.
            walkTree(treeRoot);
            // Print the serialised markup of the whole document.
            const htmlElement = document.getElementsByTagName("HTML")[0];
            console.log(htmlElement.outerHTML);
        </script>
</html>

  • Related