There is a web page that I would like to scrape some information from.
I start off by gathering a bunch of HTML elements.
// Collect every element on the page that carries the class 'theID'.
var theSearch = document.getElementsByClassName('theID');
I then take that HTMLCollection and turn it into an array.
// Copy the collection's current elements into a plain array.
var arr = Array.from(theSearch);
Now comes the tricky part.
I'd like to scroll down the page, and grab new items that have appeared on the page.
// Scroll to the very bottom of the document (x = 0, y = full body height).
window.scrollTo(0, document.body.scrollHeight);
How does one access the newly inserted DOM nodes? Something like ...
// Query the same class again after scrolling, to pick up items inserted in the meantime.
var theSearch2 = document.getElementsByClassName('theID');
... and casting it into a new array ...
// Convert the second query result (theSearch2, not the first collection) into its own array.
var arr2 = Array.prototype.slice.call( theSearch2 );
... and pushing the items from arr2 to arr, like ...
// Append every element of the second array onto the first one.
// NOTE(review): arr2 comes from a query over the whole document, so it
// presumably still contains the elements already in arr — dedupe if that matters.
arr.push(...arr2);
And how would one achieve an ongoing process which keeps scraping until no new items are appended to the page's DOM?
CodePudding user response:
MutationObserver
The MutationObserver interface provides the ability to watch for changes being made to the DOM tree.
// Log every node that gets inserted into the observed #lists element.
var observer = new MutationObserver((mutations) => {
  for (const mutation of mutations) {
    for (const addedNode of mutation.addedNodes) {
      console.log(addedNode, "@@@"); // your new item
    }
  }
});

// Watch child-list changes on #lists itself; subtree observation is switched off.
observer.observe(document.getElementById("lists"), {
  subtree: false,
  childList: true,
});
TRY THIS OUT:
// Demo: once the page has loaded, append a random-digit <li> to #lists every
// two seconds (six items in total) while a MutationObserver logs each newly
// inserted node.
window.addEventListener('load', function () {
  let count = 0;

  // Appends one <li class="item"> holding a random digit (0-9) to #lists and
  // stops the timer after the sixth item.
  function addListItem() {
    console.log("called");
    const ul = document.getElementById("lists");
    const li = document.createElement("li");
    li.setAttribute("class", "item");
    ul.appendChild(li);
    // The value is plain text, so textContent is used instead of innerHTML.
    li.textContent = String(Math.floor(Math.random() * 10));
    count++;
    if (count > 5) {
      myStopFunction();
    }
  }

  // Declared locally so the timer id is not leaked as an implicit global.
  const myInterval = setInterval(addListItem, 2000);

  // Cancels the repeating timer started above.
  function myStopFunction() {
    clearInterval(myInterval);
  }

  // HERE IS THE SOLUTION
  const observer = new MutationObserver(function (mutations) {
    mutations.forEach(function (mutation) {
      mutation.addedNodes.forEach(function (addedNode) {
        console.log(addedNode, "@@@"); // your new item
      });
    });
  });

  // Watch child-list changes on #lists itself; subtree observation is off.
  observer.observe(document.getElementById("lists"), {
    childList: true,
    subtree: false
  });
});
<!DOCTYPE html>
<html>
  <head>
    <title>Parcel Sandbox</title>
    <meta charset="UTF-8" />
  </head>
  <body>
    <div>
      <!-- The demo script looks this list up by its id "lists". -->
      <ul id="lists">
        <li>Rand</li>
      </ul>
    </div>
  </body>
</html>
CodePudding user response:
The OP might have a look into MutationObserver. Whenever new items are rendered into the DOM (triggered by the scrolling), the observer's callback receives a list of MutationRecord instances which the OP can act upon.
/**
 * MutationObserver callback: appends every node reported as added by a
 * `childList` mutation to `scrapedContentNodes` and logs the running list.
 *
 * @param {MutationRecord[]} mutationList - records delivered by the observer.
 */
function handleChildlistChanges(mutationList/*, observer*/) {
  for (const record of mutationList) {
    // Skip anything that is not a child insertion/removal record.
    if (record.type !== 'childList') {
      continue;
    }
    scrapedContentNodes.push(...record.addedNodes);
    console.log({ scrapedContentNodes });
  }
}
// Accumulates every node the observer callback reports as added.
const scrapedContentNodes = [];

// Element whose child list is being watched.
const target = document.querySelector('#items');

// Only child-list changes are requested; the other options stay disabled.
const options = {
  //attributes: true,
  childList: true,
  //subtree: true,
};

const observer = new MutationObserver(handleChildlistChanges);
observer.observe(target, options);
// test case ... creating content.
// Appends one <p> per phrase to `target`, staggered 600 ms apart.
['the quick', 'brown fox', 'jumped over', 'the lazy dog.']
  .forEach((content, idx) => {
    const contentNode = document.createElement('p');
    contentNode.appendChild(document.createTextNode(content));

    setTimeout(() => target.appendChild(contentNode), 600 * idx);
  });
/* Console wrapper: 70% wide, at least full height, left edge unpinned. */
.as-console-wrapper {
  left: auto !important;
  width: 70%;
  min-height: 100%;
}
<!-- Container observed by the script; the test paragraphs are appended here. -->
<div id="items">
</div>