This is a simple scraper in which, for example, all product links are stored in var productUrls.
A nested for loop then takes each search string from var searchUrls[x][0] and, if that string is included in the URL from var productUrls, runs the store-specific function from var searchUrls[x][1].
The problem is that when I call console.log(await storeFunc); from the for loop (storeFunc is searchUrls[0][1] in this case), the function alzask(productUrl) has already been run from var searchUrls, and there its argument productUrl is undefined, although it should be productUrls[i][0].
Am I missing something? Or is this too complicated, and could it be written better? Thank you.
const puppeteer = require("puppeteer");
/**
 * Question's version of the scraper: visits each product URL and logs the
 * price returned by the matching store function.
 *
 * The "+" characters that were lost from this listing (loop increments,
 * string concatenation, the mironet URL) are restored so that it parses and
 * the loops terminate. The bug the question asks about is intentionally left
 * in place and marked below.
 */
async function scrapeData() {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  var productUrls = [
    ['https://www.alza.sk/nest-hub-2nd-gen-charcoal-eu-d6845008.htm'],
    ['https://edigital.sk/prenosny-reproduktor/google-ga01331gr-nest-hub-2-smart-reproduktor-svetlo-sedy-p899177'],
    ['https://www.mironet.cz/google-nest-hub-2-charcoal-7-displej-wifi-bt-50-google-assistant+dp497233/'],
    ['https://www.planeo.sk/katalog/1316639-google-nest-hub-2nd-gen-chalk-hlasovy-asistent.html']
  ];
  // BUG (the subject of the question): alzask(productUrl) is CALLED right here,
  // while the array is being built. productUrl is only assigned inside the
  // loops below, so it is still undefined at this point, and the array stores
  // the promise returned by that call instead of the function itself.
  // The other two entries are plain strings, not functions.
  var searchUrls = [
    ["alza.sk", alzask(productUrl)],
    ["edigital.sk", "edigitalsk(productUrl)"],
    ["mironet.cz", "mironetcz(productUrl)"]
  ];
  for (let i = 0; i < productUrls.length; i++) {
    for (let j = 0; j < searchUrls.length; j++) {
      var productUrl = productUrls[i][0];
      var searchUrl = searchUrls[j][0];
      var storeFunc = searchUrls[j][1];
      if (productUrl.includes(searchUrl)) {
        // storeFunc is not called here; whatever was stored in searchUrls is awaited.
        console.log(await storeFunc);
        break;
      }
      else if (j == searchUrls.length - 1 && !productUrl.includes(searchUrl)) {
        // Last search string checked and still no match.
        console.log("Product with url: " + productUrl + " doesn't have storeFunc in database.");
      }
    }
  }
  /** Opens productUrl and returns the trimmed text of the alza.sk price element. */
  async function alzask(productUrl) {
    var priceSelector = ".price-box__price";
    await page.goto(productUrl);
    const priceData = await page.$eval(priceSelector, price => price.textContent.trim());
    return priceData;
  }
  await browser.close();
}
scrapeData();
Expected result: the async function alzask(productUrl) runs with an argument productUrl that is not undefined, but is the URL string from var productUrls.
CodePudding user response:
I see the following issue with the code mentioned above.
// Excerpt of the problematic array from the question.
var searchUrls = [
// productUrl is still undefined here: it is only assigned later, inside the loops
// of the full script. This line therefore calls alzask(undefined) immediately and
// stores the promise returned by that call, not the function itself.
["alza.sk",alzask(productUrl)],
// These two entries are plain strings, not functions, so they cannot be called later.
["edigital.sk","edigitalsk(productUrl)"],
["mironet.cz","mironetcz(productUrl)"]
]
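productUrl is still undefined at the point where the code above uses it: alzask(productUrl) is called immediately, while the searchUrls array is being built, which is before the loop assigns productUrl.
The following is the correct implementation. Save a reference to the function in searchUrls, and then call that function with the corresponding productUrl inside the loop: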
const puppeteer = require("puppeteer");
/**
 * Visits every product URL and logs the price returned by the scraper
 * registered for its store.
 *
 * Product URLs whose store has no registered scraper are reported with a
 * log message instead of being scraped.
 *
 * @returns {Promise<void>} Resolves once all URLs were processed and the
 *   browser has been closed; rejects if launching, navigation or scraping fails.
 */
async function scrapeData() {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();

    const productUrls = [
      ['https://www.alza.sk/nest-hub-2nd-gen-charcoal-eu-d6845008.htm'],
      ['https://edigital.sk/prenosny-reproduktor/google-ga01331gr-nest-hub-2-smart-reproduktor-svetlo-sedy-p899177'],
      ['https://www.mironet.cz/google-nest-hub-2-charcoal-7-displej-wifi-bt-50-google-assistant+dp497233/'],
      ['https://www.planeo.sk/katalog/1316639-google-nest-hub-2nd-gen-chalk-hlasovy-asistent.html'],
    ];

    /**
     * Scrapes the price from an alza.sk product page.
     *
     * @param {string} productUrl - Product page to open.
     * @returns {Promise<string>} Trimmed text of the price element.
     */
    const alzask = async (productUrl) => {
      const priceSelector = '.price-box__price';
      await page.goto(productUrl);
      return page.$eval(priceSelector, (price) => price.textContent.trim());
    };

    // Pairs of [search string, scraper]. Store a REFERENCE to the function
    // (alzask), not the result of calling it (alzask(productUrl)); the call
    // happens later in the loop, once productUrl is known.
    // Only implemented scrapers belong here: a placeholder string such as
    // "edigitalsk" would throw "storeFunc is not a function" when called.
    // Add ["edigital.sk", edigitalsk] and ["mironet.cz", mironetcz] once
    // those functions exist.
    const searchUrls = [
      ['alza.sk', alzask],
    ];

    for (const [productUrl] of productUrls) {
      const match = searchUrls.find(([searchUrl]) => productUrl.includes(searchUrl));
      if (!match) {
        console.log(`Product with url: ${productUrl} doesn't have storeFunc in database.`);
        continue;
      }
      const [, storeFunc] = match;
      // Call the stored function here, with the productUrl of this iteration.
      console.log(await storeFunc(productUrl));
    }
  } finally {
    // Close the browser even when scraping throws, so no Chromium process is left running.
    await browser.close();
  }
}

scrapeData().catch((err) => {
  // Report the failure instead of leaving an unhandled promise rejection.
  console.error(err);
  process.exitCode = 1;
});
Another issue with your code is that it uses nested for loops, which are not needed. You can make searchUrls an object keyed by hostname, which makes the lookup simpler and faster, like this:
// Fragment: belongs inside scrapeData(), in place of the searchUrls array and
// the nested loops. Maps each store's hostname (without a leading "www.") to
// its scraper function. Every function listed here must be defined the same
// way as alzask.
const searchUrls = {
  'alza.sk': alzask,
  'edigital.sk': edigitalsk,
  // and so on...
};
for (const [productUrl] of productUrls) {
  // new URL(...).hostname yields e.g. "www.alza.sk"; stripping "www." lets one
  // key match the store whether or not the link uses the www prefix.
  const searchUrl = new URL(productUrl).hostname.replace(/^www\./, '');
  // Own-property check so that a hostname can never resolve to something
  // inherited from Object.prototype.
  if (Object.hasOwn(searchUrls, searchUrl)) {
    const storeFunc = searchUrls[searchUrl];
    console.log(await storeFunc(productUrl));
  } else {
    console.log(`Product with url: ${productUrl} doesn't have storeFunc in database.`);
  }
}