I have the table shown below. I'm trying to build an array of objects from it, one object per data <tr>
element, using the contents of that row's cells. For example, the output would look like:
[{Current: 757, Peak: 976, Name: XXX},{Current: 594, Peak: 684, Name: XXX2}]
Any insight into what I'm doing wrong would be greatly appreciated.
<div id="detail">
<table>
<tbody>
<tr>
<td align="right"><b>Current</b></td>
<td align="right"><b>Peak</b></td>
<td width="20"> </td>
<td align="left"><b>Name</b></td>
</tr>
<tr>
<td colspan="5"> </td>
</tr>
<tr >
<td align="right">
<span >757</span>
</td>
<td align="right">
<span >976</span>
</td>
<td width="20"> </td>
<td>
<a href="xxx">XXX</a>
</td>
</tr>
<tr >
<td align="right">
<span >594</span>
</td>
<td align="right">
<span >684</span>
</td>
<td width="20"> </td>
<td>
<a href="xxx">XXX2</a>
</td>
</tr>
</tbody>
</table>
</div>
// Launches a browser, opens the stats page in an incognito context, and tries
// to collect the cells of the table under #detail into a list, which is then
// logged. Takes no arguments; the returned promise resolves to undefined.
// NOTE(review): `puppeteer` is not imported anywhere in the visible snippet.
let scrape = async() => {
try {
const browser = await puppeteer.launch({});
const context = await browser.createIncognitoBrowserContext();
const page = await context.newPage();
await page.goto(`https://xxx/stats/`, {
waitUntil: 'domcontentloaded'
})
// The callback below runs inside the page; only its return value comes back.
const gameRecordsList = await page.evaluate(() => {
// A single shared record object, not one object per table row.
let record = {
'name': '',
'current': '',
'peak': ''
}
let nameCountList = []
// Despite its name, `row` is a flat array of every <td> matched by the
// selector (cells from all rows, including header and spacer rows); it is not
// grouped per <tr>.
const row = Array.from(
document.querySelectorAll('#detail > table > tbody > tr > td')
)
// NOTE(review): each `td` here is one cell element, not a row of cells, so
// `td[2]` / `td[0]` / `td[1]` are undefined and reading `.innerText` from them
// throws a TypeError. Also, in the sample markup the name link sits in the
// fourth cell of a row (index 3); index 2 is the width="20" spacer cell.
record.name = (row.map(td => td[2].innerText))
record.current = (row.map(td => td[0].innerText))
record.peak = (row.map(td => td[1].innerText))
// The record is only added when at least 200 cells matched; otherwise an
// empty list is returned.
if (row.length >= 200) {
nameCountList.push(record)
}
return nameCountList;
});
// NOTE(review): close() is not awaited, and because it sits inside the try
// it is skipped whenever anything above throws, leaving the browser open.
browser.close();
console.log(gameRecordsList)
} catch (err) {
// Intended to report failures.
// NOTE(review): the caught value is `err`; `error` is not defined in the
// visible code, so unless it is a helper declared elsewhere these calls throw
// a ReferenceError. Nothing in this branch actually closes the browser,
// despite the message text.
console.log(error(err));
console.log(error("Error, Browser Closed"));
}
};
// Fire-and-forget invocation: the returned promise is neither awaited nor
// given a rejection handler.
scrape()
CodePudding user response:
When scraping tables, if you first select by `td`s, it can be difficult to cleanly re-assemble the rows.
I would first select the row elements, then for each row, select the cells. This produces a 2d array that's much easier to manipulate.
After that, you can use a simple `map` to create the array of objects.
const puppeteer = require("puppeteer"); // ^16.2.0
const html = `<div id="detail"> <table> <tbody> <tr> <td align="right"><b>Current</b></td> <td align="right"><b>Peak</b></td> <td width="20"> </td> <td align="left"><b>Name</b></td> </tr> <tr> <td colspan="5"> </td> </tr> <tr > <td align="right"> <span >757</span> </td> <td align="right"> <span >976</span> </td> <td width="20"> </td> <td> <a href="xxx">XXX</a> </td> </tr> <tr > <td align="right"> <span >594</span> </td> <td align="right"> <span >684</span> </td> <td width="20"> </td> <td> <a href="xxx">XXX2</a> </td> </tr> </tbody> </table> </div>`;
// Declared outside the async function so the final cleanup step can reach it
// even when launching or scraping fails part-way through.
let browser;
(async () => {
  browser = await puppeteer.launch();
  const [page] = await browser.pages();
  await page.setContent(html);

  // Runs in the page: skip the first two <tr> elements (header and spacer in
  // the sample markup) and turn every remaining row into an array of its
  // trimmed cell texts.
  const cellTextsPerRow = await page.$$eval("#detail tr", (trs) => {
    const dataRows = trs.slice(2);
    return dataRows.map((tr) =>
      Array.from(tr.querySelectorAll("td"), (td) => td.textContent.trim())
    );
  });

  // Back in Node: name the columns. The third cell is the empty spacer
  // column, so it is skipped in the destructuring pattern.
  const data = cellTextsPerRow.map((cells) => {
    const [current, peak, , name] = cells;
    return {current, peak, name};
  });
  console.log(data);
})()
  .catch((err) => console.error(err))
  .finally(() => browser?.close());
A more general approach is to filter out empty cells:
// Variant that does not depend on where the spacer column sits: every cell
// whose trimmed text is empty is dropped before the columns are named.
const data = (
  await page.$$eval("#detail tr", (trs) => {
    // Trimmed texts of one row's cells, with empty strings removed.
    const nonEmptyCellTexts = (tr) =>
      Array.from(tr.querySelectorAll("td"), (td) => td.textContent.trim())
        .filter((text) => text !== "");
    // The first two rows are skipped, as in the sample table's header and spacer.
    return trs.slice(2).map((tr) => nonEmptyCellTexts(tr));
  })
).map((cells) => {
  const [current, peak, name] = cells;
  return {current, peak, name};
});
See also: