How to extract a website's HTML tags in the DOM and shadow DOM - CodePudding

I'm trying to get the HTML structure of multiple websites using Node.js, and I'm having difficulties. I want just the HTML structure of each document, with no text content, while preserving classes, ids, and other attributes. Here is an example of what I want back:

<head>
<title></title>
</head>
<body>
  <h1></h1>
  <div>
    <div >
      <p></p>
    </div>
  </div>
</body>

Any suggestion on how to do this? Thanks

CodePudding user response：

Basically you want to remove all text nodes. Time to traverse the elements.

But first, we load the html string using DOMParser.

/**
 * Helper that strips the text out of a DOM subtree while leaving the
 * element structure (tags and attributes) untouched.
 *
 * Only the `firstChild`, `nextSibling`, `nodeType` and `nodeValue`
 * properties of the nodes are used.
 */
var EnglishCharFixer = {

  /**
   * Empties every text node found beneath `elem`.
   *
   * @param {Node} elem - Root of the subtree to process.
   * @returns {Node} The same `elem` that was passed in.
   */
  do_elem: function(elem) {
    this.process_text_nodes(this.textNodesUnder(elem));
    return elem;
  },

  /**
   * Collects, in document order, all text nodes that are descendants of `node`.
   *
   * @param {Node|null|undefined} node - Root of the subtree to search.
   * @returns {Node[]} The text nodes found; empty when `node` is missing
   *   or has no text descendants.
   */
  textNodesUnder: function(node) {
    const TEXT_NODE = 3; // numeric value of Node.TEXT_NODE
    const found = [];
    if (!node) {
      return found;
    }
    // A separate cursor is used so the caller-visible parameter is not reassigned.
    for (let child = node.firstChild; child; child = child.nextSibling) {
      if (child.nodeType === TEXT_NODE) {
        found.push(child);
      } else {
        // Append in place instead of re-copying the accumulator via concat().
        for (const nested of this.textNodesUnder(child)) {
          found.push(nested);
        }
      }
    }
    return found;
  },

  /**
   * Sets the value of every node in `nodes` to the empty string.
   *
   * @param {Node[]} nodes - Text nodes to blank out.
   */
  process_text_nodes: function(nodes) {
    for (let index = 0; index < nodes.length; index++) {
      nodes[index].nodeValue = "";
    }
  }

};


// Sample document used to demonstrate the text-stripping helper.
// The script tag names are assembled with "+" so that, when this snippet is
// itself embedded inline in an HTML page, the page's parser never sees a
// literal closing script tag inside the string.
const htmlString = `
<html>
<head>
  <scr` + `ipt>var x=12</scr` + `ipt>
</head>
<body>
  <h1>this is test</h1>
  <div>
    <p>THIS IS TEXT THAT SHOULDN'T BE IN OUTPUT</p>
  </div> 
</body>
</html>
`;

/**
 * Parses `html` as a "text/html" document, blanks its text nodes via
 * EnglishCharFixer, and serialises the result.
 *
 * Relies on a global `DOMParser` being available.
 *
 * @param {string} html - Markup to parse.
 * @returns {string} The `outerHTML` of the parsed document's root element
 *   after its text nodes have been emptied.
 */
function removeContentKeepStructure(html) {
  const parsedDocument = new DOMParser().parseFromString(html, "text/html");
  const rootElement = parsedDocument.documentElement;
  EnglishCharFixer.do_elem(rootElement);
  return rootElement.outerHTML;
}


// Demo: log the sample document after its text has been stripped.
const strippedMarkup = removeContentKeepStructure(htmlString);
console.log(strippedMarkup);

CodePudding user response：

One solution is to match the opening and closing tags with the regex /<\/?.*?>/g, which produces an array of all opening and closing tags without their content, and then join that array into a single string.

// Sample document whose text content should be discarded.
const html = `<html>
<head>
 <title>title</title> 
</head>
<body>
  <h1>header</h1>
  <div>
    <div >
      <p>paragrapth</p>
    </div>
  </div>
</body>
</html>`

/**
 * Returns only the tags of `markup`, concatenated in their original order.
 *
 * This is a purely textual scan, not an HTML parse: anything between a "<"
 * and the next ">" is treated as a tag, so comments, or "<" / ">" characters
 * inside script code or attribute values, can produce unexpected pieces.
 *
 * @param {string} markup - HTML source text.
 * @returns {string} The matched tags joined together, or "" when `markup`
 *   contains no tags.
 */
function extractTags(markup) {
  // [^>]* (rather than .*?) also matches line breaks, so a tag whose
  // attributes wrap onto several lines is still captured.
  const tags = markup.match(/<\/?[^>]*>/g);
  // match() yields null, not an empty array, when nothing matches.
  return tags === null ? '' : tags.join('');
}

const result = extractTags(html);

console.log(result)

CodePudding user response：

Using recursion to simply clear .textContent from each node and then finishing with the .outerHTML property works well.

<html>
    <head>
        <title>This is <span>the title</span></title>
        <meta http-equiv="X-UA-Compatible" content="IE=edge">
    </head>
    <body >
        <main id="rt">
          <h1>This is a header</h1>
          <div>
            <div >
              <p>This is a <span>paragraph</span></p>
            </div>
            <div id="shadow-rt">
                <div>
                    <span id="shadow-dom-child"></span>
                </div>
            </div>
          </div>
        </main>
    </body>
        <script>
            /**
             * Recursively empties the text of `node` and all of its descendants.
             *
             * Children are visited before the node itself. That order matters:
             * assigning `textContent` on an element replaces all of its children,
             * but once every descendant's text has been emptied the element's own
             * `textContent` is already "", so the assignment below is skipped and
             * the element structure survives.
             *
             * @param {Node|null|undefined} node - Root of the subtree to clear.
             * @returns {void}
             */
            function walkTree(node) {
              if (node == null) {
                return;
              }
              for (let i = 0; i < node.childNodes.length; i++) {
                walkTree(node.childNodes[i]);
              }
              if (node.textContent) {
                node.textContent = "";
              }
            }
            // Attach a closed shadow root to the #rt element, clear the text
            // beneath that element, then log the whole document's markup.
            const rootElement = document.getElementById("rt");
            rootElement.attachShadow({mode: 'closed'});
            walkTree(rootElement);
            const htmlElement = document.getElementsByTagName("HTML")[0];
            console.log(htmlElement.outerHTML);
        </script>
</html>