如何在DOM和shadowDOM中提取網站HTML標簽-有解無憂

我正在嘗試使用 NodeJS 獲取多個網站的 html 結構，但遇到了困難。我只想獲取檔案的 HTML 結構，而沒有內容。我想保留類、ID 和其他屬性。

我想要回來的例子：

<head>
  <title></title>
</head>
<body>
  <h1></h1>
  <div>
    <div class="something">
      <p></p>
    </div>
  </div>
</body>

關于如何做到這一點的任何建議？謝謝

uj5u.com熱心網友回復：

基本上你想刪除所有的文本節點。是時候遍歷元素了。

但首先，我們使用 DOMParser 加載 html 字串。

/**
 * Helper that blanks every text node beneath an element, leaving the element
 * structure (tags and attributes) untouched.
 */
const EnglishCharFixer = {

  /**
   * Empties all text nodes found under `elem`.
   *
   * @param {Node} elem - Root node whose descendant text nodes are blanked.
   * @returns {Node} The same `elem`, modified in place.
   */
  do_elem(elem) {
    const nodes = this.textNodesUnder(elem);
    this.process_text_nodes(nodes);
    return elem;
  },

  /**
   * Collects, in document order, every descendant of `node` whose
   * `nodeType` is 3 (the DOM text-node type).
   *
   * @param {Node} node - Root of the subtree to scan (not itself included).
   * @returns {Node[]} The text nodes found under `node`.
   */
  textNodesUnder(node) {
    const all = [];
    // Accumulate into one shared array instead of concatenating a fresh
    // array at every recursion level.
    const collect = (parent) => {
      for (let child = parent.firstChild; child; child = child.nextSibling) {
        if (child.nodeType === 3) {
          all.push(child);
        } else {
          collect(child);
        }
      }
    };
    collect(node);
    return all;
  },

  /**
   * Sets `nodeValue` of each given node to the empty string.
   *
   * @param {Node[]} nodes - Nodes to blank.
   */
  process_text_nodes(nodes) {
    for (const node of nodes) {
      node.nodeValue = "";
    }
  },

};


// Sample HTML document used as input for the demo below.
// The "script" tag names are split across concatenated string pieces so the
// literal tag text never appears verbatim in this source (relevant when the
// code itself is embedded inside an inline <script> element).
const htmlString = `
<html>
<head>
  <scr` + `ipt>var x=12</scr` + `ipt>
</head>
<body>
  <h1>this is test</h1>
  <div>
    <p>THIS IS TEXT THAT SHOULDN'T BE IN OUTPUT</p>
  </div> 
</body>
</html>
`;

/**
 * Parses an HTML string, blanks its text nodes via EnglishCharFixer, and
 * returns the serialized markup of the resulting document element.
 *
 * @param {string} html - HTML source to process.
 * @returns {string} `outerHTML` of the parsed document's root element after
 *   EnglishCharFixer.do_elem has been applied to it.
 */
function removeContentKeepStructure(html) {
  const rootElement = new DOMParser()
    .parseFromString(html, "text/html")
    .documentElement;
  EnglishCharFixer.do_elem(rootElement);
  return rootElement.outerHTML;
}


// Demo: strip the text content from the sample document and print the result.
const structureOnlyHtml = removeContentKeepStructure(htmlString);
console.log(structureOnlyHtml);

uj5u.com熱心網友回復：

一種解決方案是用正則運算式 /<\/?.*?>/g 對 HTML 字串呼叫 match，取得一個只包含所有開始和結束標簽（不含內容）的陣列，然後用 join 把陣列合併成字串。

// Sample document whose tags are extracted below.
const html = `<html>
<head>
 <title>title</title> 
</head>
<body>
  <h1>header</h1>
  <div>
    <div >
      <p>paragrapth</p>
    </div>
  </div>
</body>
</html>`;

// Gather every opening/closing tag matched by the pattern and glue them
// together, dropping everything that sits between tags.
const result = Array.from(html.matchAll(/<\/?.*?>/g), ([tag]) => tag).join('');

console.log(result);

uj5u.com熱心網友回復：

如果 OP 標記他的問題：如何在 DOM 和 shadowDOM 中提取網站 HTML 標簽

那么為什么不使用TreeWalker API（在所有瀏覽器中都可用......自2011 年以來）

您不想提取 HTML 標記...

您要刪除的是 textNodes：

  /**
   * Blanks every text node under `root` using a TreeWalker restricted to
   * text nodes, then returns the root's serialized markup.
   *
   * @param {Node} [root=document.body] - Subtree root to process in place.
   * @returns {string} `root.outerHTML` after the text nodes were emptied.
   */
  function removeTextNodes( root = document.body ) {
    const walker = document.createTreeWalker(root, NodeFilter.SHOW_TEXT);
    for (let textNode = walker.nextNode(); textNode; textNode = walker.nextNode()) {
      textNode.textContent = "";
    }
    return root.outerHTML;
  }

如果你確實有開放的 shadowRoots，你需要遞回地深入研究 shadowDOMs

uj5u.com熱心網友回復：

使用遞回簡單地清除每個節點的 .textContent，然後使用 .outerHTML 屬性輸出結果，效果很好。

<html>
    <head>
        <title>This is <span>the title</span></title>
        <meta http-equiv="X-UA-Compatible" content="IE=edge">
    </head>
    <body class="my-class">
        <main id="rt">
          <h1>This is a header</h1>
          <div>
            <div class="something">
              <p>This is a <span>paragraph</span></p>
            </div>
            <div id="shadow-rt">
                <div>
                    <span id="shadow-dom-child"></span>
                </div>
            </div>
          </div>
        </main>
    </body>
        <script>
            /**
             * Recursively visits `node` and all of its descendants and
             * blanks any non-empty `textContent` it finds.
             *
             * @param {Node|null} node - Subtree root; `null` is ignored.
             */
            function walkTree(node) {
              if (node === null) {
                return;
              }
              // Children are processed before the node itself. Per the DOM
              // textContent semantics, an element's textContent is the
              // concatenated text of its descendants, so once they are
              // blanked the check below is falsy for the element and its
              // child elements are not replaced.
              for (let i = 0; i < node.childNodes.length; i++) {
                walkTree(node.childNodes[i]);
              }
              if (node.textContent) {
                node.textContent = "";
              }
            }
            // Attach a closed shadow root to #rt, blank the text beneath
            // #rt, then print the serialized <html> element.
            const rtElement = document.getElementById("rt");
            rtElement.attachShadow({mode: 'closed'});
            walkTree(rtElement);
            console.log(document.getElementsByTagName("HTML")[0].outerHTML);
        </script>
</html>

轉載請註明出處，本文鏈接：https://www.uj5u.com/qukuanlian/522737.html

標籤：javascript、html、dom、shadow-dom、treewalker

上一篇：為什么document.getElementById()在VSCode中不起作用？

下一篇：嘗試迭代TypeScript中的HTML元素時，型別「Node | null」不可分配給型別「Node」