I want to perform a web scrapping for all the data within the main table body in the url https://data.anbima.com.br/debentures/AGRU12/agenda
... However as it implements pagination, I am unable to get that done easily... I came up with the following code which is not workng... I am getting the error ReferenceError: list is not defined
, though I have defned it right before the while loop...
const puppeteer = require('puppeteer');
const fs = require('fs');
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto(`https://data.anbima.com.br/debentures/AGRU12/agenda`);
await page.waitForSelector('.normal-text');
var list = [];
while (true) {
let nextButton;
await page.evaluate(async () => {
const nodeList = document.querySelectorAll(
'.anbima-ui-table > tbody > tr'
);
let nodeArray = [...nodeList];
nextButton = document.querySelector('.anbima-ui-pagination__next-button');
let listA = nodeArray
.map((tbody) => [...tbody.children].map((td) => [...td.children]))
.map((tr) =>
tr.map((span) =>
span[0].innerHTML
.replace('<label class="flag__children">', '')
.replace('</label>', '')
)
);
list.push(listA);
});
if (!nextButton) {
break;
} else {
await page.goto(nextButton.href);
}
}
fs.writeFile('eventDates.json', JSON.stringify(list[0], null, 2), (err) => {
if (err) throw new Error('Something went wrong');
console.log('well done you got the dates');
});
await browser.close();
})();
CodePudding user response:
List is undefined inside the callback function. You would need to return the array in page.evaluate and then use that returned array to push it to list.
const list = [];
while (true) {
let nextButton;
const listA = await page.evaluate(async () => {
const nodeList = document.querySelectorAll(
'.anbima-ui-table > tbody > tr'
);
let nodeArray = [...nodeList];
nextButton = document.querySelector('.anbima-ui-pagination__next-button');
return nodeArray
.map((tbody) => [...tbody.children].map((td) => [...td.children]))
.map((tr) =>
tr.map((span) =>
span[0].innerHTML
.replace('<label class="flag__children">', '')
.replace('</label>', '')
)
);
});
list.push(...listA);
Edit: Corrected the last line in my example.