I'm writing an API with express, puppeteer-cluster and cheerio that returns all anchor elements containing one or more words that can be added as query parameters. I want to use puppeteer in order to get elements that are javascript generated too. But for some reason it's not working, I get an empty array as an output printed on the browser.
I'm still trying to understand this library but has been 2 days and I made no progress. Any help is deeply appreciated.
Update: I added async to all my functions and they run now, but the result is still empty :(
Update 2: I started logging everything, every step and found that data.name is being passed to the cheerio function as a Promise. '-' I think that is the problem, but don't know how to fix it yet.
Update 3: One of the issues was that the page content (html code) was not being handled properly to the cheerio function. In the browser, however, the response is empty and the console shows an error:
Error handling response: TypeError: Cannot read properties of undefined (reading 'innerText').
So, I think the response is not json formatted. Is res.json()
not the right way to do it?
My code:
app.js
const PORT = process.env.PORT || 8000;
var path = require("path");
const express = require("express");
// Routes
const indexRouter = require("./routes/index");
const allNews = require("./routes/news");
const clusterRouter = require("./routes/cluster");
const app = express();
app.use(cors());
app.use(express.json());
app.use(express.urlencoded({ extended: false }));
app.use(express.static(path.join(__dirname, "public")));
app.use("/", indexRouter);
app.use("/news", allNews);
app.use("/cluster", clusterRouter);
app.listen(PORT, () => console.log(`server running on PORT ${PORT}`));
cluster.js
const express = require("express");
const { Cluster } = require("puppeteer-cluster");
const puppeteer = require("puppeteer-extra");
const cheerio = require("cheerio");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
var router = express.Router();
const newspapers = [
{
"name": "CNN",
"address": "https://edition.cnn.com/specials/world/cnn-climate",
"base": "https://edition.cnn.com"
},
{
"name": "The Guardian",
"address": "https://www.theguardian.com/environment/climate-crisis",
"base": "https://www.theguardian.com"
}]
const app = express();
puppeteer.use(StealthPlugin());
const result = [];
router.get("/", async (req, res) => {
(async () => {
// Query String
const query = checkForQuery(req);
const wordsToSearch = query ? verifyQuery(query) : "";
console.log("Running tests.."); // This is printed on console
//Functions
function checkForQuery(request) {
if (request.originalUrl.indexOf("?") !== -1) {
console.log(request.query);
return request.query;
} else {
return false;
}
}
// // Validates query and remove invalid values
function verifyQuery(queryString) {
const queryParams = {
only: queryString.only ? queryString.only : "",
also: queryString.also ? queryString.also : "",
};
// Creates new list containing valid terms for search
var newList = {
only: [],
also: [],
};
for (const [key, value] of Object.entries(queryParams)) {
const tempId = key.toString();
const tempVal =
queryParams[tempId].length >= 2
? queryParams[tempId].split(",")
: queryParams[tempId];
console.log(queryParams[tempId], " and ", tempVal);
if (tempVal.length > 1) {
console.log("helloooooo");
tempVal.forEach((term) => {
if (topics.indexOf(term) != -1) {
newList[tempId].push(term);
}
});
} else {
if (topics.indexOf(queryParams[tempId]) != -1) {
newList[tempId].push(queryParams[tempId]);
}
}
}
console.log(newList);
return newList;
}
function storeData(element, base, name) {
const results = [];
element.find("style").remove();
const title = element.text();
const urlRaw = element.attr("href");
const url =
urlRaw.includes("www") || urlRaw.includes("http")
? urlRaw
: base urlRaw;
// Check for duplicated url
if (tempUrls.indexOf(url) === -1) {
// Check for social media links and skip
if (!exceptions.some((el) => url.toLowerCase().includes(el))) {
tempUrls.push(url);
// Get img if child of anchor tag
const imageElement = element.find("img");
if (imageElement.length > 0) {
// Get the src attribute of the image element
results.push({
title: title.replace(/(\r\n|\n|\r)/gm, ""),
url,
source: name,
imgUrl: getImageFromElement(imageElement),
});
} else {
results.push({
title: title.replace(/(\r\n|\n|\r)/gm, ""),
url: url,
source: name,
});
}
}
}
return results;
}
function getElementsCheerio(html, base, name, searchterms) {
console.log(html, base, name);
const $ = cheerio.load(html);
console.log(searchterms);
const concatInfo = [];
if (searchterms) {
const termsAlso = searchterms.also;
const termsOnly = searchterms.only;
termsAlso.forEach((term) => {
$(`a:has(:contains("climate"):contains(${term}))`).each(function () {
const tempData = storeData($(this), base, name);
tempData.map((el) => concatInfo.push(el));
});
});
termsOnly.forEach((term) => {
// $(`a:has(:contains(${term}))`).each(function () {
$(`a:contains(${term})`).each(function () {
const tempData = storeData($(this), base, name);
tempData.map((el) => concatInfo.push(el));
});
});
} else {
$('a:contains("climate")').each(function () {
const tempData = storeData($(this), base, name);
tempData.map((el) => concatInfo.push(el));
});
}
return concatInfo;
}
const cluster = await Cluster.launch({
concurrency: Cluster.CONCURRENCY_CONTEXT,
maxConcurrency: 2,
puppeteerOptions: {
headless: true,
args: ["--no-sandbox", "--disable-setuid-sandbox"],
userDataDir: "./tmp",
defaultViewport: false,
},
});
await cluster.task(async ({ page, data }) => {
await page.goto(data.address);
await page.waitForSelector("body");
// console.log here prints that data.name is a Promise :(
const elements = await getElementsCheerio(
document.body.innerHTML,
data.base,
data.name,
wordsToSearch
);
result.push(elements);
});
newspapers.map((newspaper) => {
console.log("queue" newspaper); // This logs correctly: queue[object Object]
cluster.queue(newspaper);
});
await cluster.idle();
await cluster.close();
// Display final object
res.json(result);
})();
});
module.exports = router;
I don't get any errors, but on screen I get an empty [ ]. Anyone can see what I am doing wrong here? :(
CodePudding user response:
In general, it's an antipattern to mix Puppeteer with another selection library like Cheerio. In addition to being redundant, the extra HTML parser doesn't work on the live document as Puppeteer does, so you have to snapshot the HTML at a particular moment with Puppeteer to capture it as a string and plug that string into Cheerio, where it's re-parsed back to a traversible tree structure.
Introducing this extra step creates opportunity for bugs and confusion to creep in, and that's what happened here.
The code
const elements = await getElementsCheerio(
document.body.innerHTML,
data.base,
data.name,
wordsToSearch
);
is problematic. document.body.innerHTML
doesn't refer to anything related to Puppeteer. Instead, use Puppeteer's await page.content()
to snapshot the HTML.
As a minor point, there's no need for Cheerio functions to be async
, because they never use await
. It's a fully synchronous API.
Here's a minimal set up for using Cheerio with Puppeteer, assuming you accept the terms and conditions and are sure that intoducing this usually unnecessary layer of indirection is appropriate for your use case:
const cheerio = require("cheerio"); // 1.0.0-rc.12
const puppeteer = require("puppeteer"); // ^19.0.0
let browser;
(async () => {
browser = await puppeteer.launch();
const [page] = await browser.pages();
const url = "https://www.example.com";
await page.goto(url, {waitUntil: "domcontentloaded"});
const html = await page.content();
const $ = cheerio.load(html);
// do cheerio stuff synchronously
console.log($("h1").text()); // => Example Domain
})()
.catch(err => console.error(err))
.finally(() => browser?.close());
It's basically the same for puppeteer-cluster: just drop the lines starting with const html = await page.content();
into the cluster.task
callback that operates on page
.