Blockquote
I am trying to learn JS/Puppeteer by building a simple web scraper that scrapes book info for educational purposes. I am trying to get the web scraper to fill UPC numbers from a CSV file into the search bar of a book website. I managed to get the web scraper to scrape the website if I use a single UPC number.
But I have a CSV with a list of UPCs and would love for the web scraper:
- to read the CSV file,
- grab the UPC from first line,
- search for the UPC on website,
- scrape the information,
- grab the UPC from 2nd line,
- repeat the search and scrape steps (3 and 4) for every remaining line
Sample CSV:
DATE,QUANTITY,NAME,CODECONTENT,CODETYPE
2021-10-13 20:16:44 1100,1,"Book 1","9781250035288",9
2021-10-13 20:16:40 1100,1,"Book 2","9781847245601",9
2021-10-13 20:16:35 1100,1,"Book 3","9780007149247",9
2021-10-13 20:16:30 1100,1,"Book 4","9780749958084",9
2021-10-13 20:16:26 1100,1,"Book 5","9781405920384",9
This is my code so far. I am stuck at the async function for the CSV parser, where it gives me an undefined result when I do a
console.log(allupcs);
Plus I am not sure how to get the
await page.type('#book-search-form > div.el-wrap.header-search-el-wrap > input.text-input','9781509847556');
to accept the UPCs
See code below:
const puppeteer = require('puppeteer');
const parse = require('csv-parser');
const fs = require('fs');
/**
 * Reads Book_Bulk.csv and collects the CODECONTENT column of every row.
 *
 * The stream callbacks run long after this function has returned, so the
 * result is delivered through a Promise that settles in the 'end' handler.
 * Callers can then simply `await getupcs()`.
 *
 * @returns {Promise<string[]>} Resolves with the CODECONTENT value of each row;
 *   rejects if the file cannot be read or parsed.
 */
function getupcs() {
  return new Promise((resolve, reject) => {
    const upcData = [];
    fs.createReadStream('Book_Bulk.csv')
      // pipe() does not forward errors, so a missing file must be handled here.
      .on('error', reject)
      // csv-parser's option is `separator` (there is no `delimiter` option).
      .pipe(parse({ separator: ',' }))
      .on('data', (csvrow) => {
        upcData.push(csvrow.CODECONTENT);
      })
      .on('end', () => resolve(upcData))
      .on('error', reject);
  });
}
/**
 * Launches a visible (non-headless) browser, searches Book Depository for one
 * hard-coded UPC and logs the scraped title, author, genre, format, publisher,
 * year and price.
 *
 * The browser is always closed, even when scraping fails.
 *
 * @returns {Promise<void>}
 */
async function main() {
  // const allupcs = await upcData();
  // console.log(allupcs);
  const browser = await puppeteer.launch({ headless: false, defaultViewport: null, args: ['--start-maximized'] });
  try {
    const page = await browser.newPage();

    // Waits for `selector`, then returns the requested text property of its first match.
    const textOf = async (selector, property = 'innerText', waitOptions = {}) => {
      await page.waitForSelector(selector, waitOptions);
      return page.$eval(selector, (el, prop) => el[prop], property);
    };

    await page.goto('https://www.bookdepository.com/');
    await page.type('#book-search-form > div.el-wrap.header-search-el-wrap > input.text-input', '9781509847556');
    await page.click('#book-search-form > div.el-wrap.header-search-el-wrap > button');

    const title = await textOf('.item-info h1', 'textContent');
    const author = await textOf('div.author-info.hidden-md > span > a > span');
    const genre = await textOf('.active a');
    const format = await textOf('.item-info li');
    const publisher = await textOf('div.biblio-wrap > div > ul > li:nth-child(4) > span > a > span');
    const year = await textOf('div.biblio-wrap > div > ul > li:nth-child(3) > span');
    const newyear = year.slice(-4);

    // Price: try the candidates in priority order and keep the first one found.
    // (A `finally` clause always runs, so it cannot be used as a "last fallback".)
    const priceSelectors = [
      'div.price.item-price-wrap.hidden-xs.hidden-sm > span.sale-price',
      'div.price.item-price-wrap.hidden-xs.hidden-sm > span',
      'p.list-price',
    ];
    let newprice = null;
    for (const selector of priceSelectors) {
      try {
        const price = await textOf(selector, 'innerText', { timeout: 1000 });
        newprice = price.slice(-6);
        break;
      } catch (err) {
        // A timeout only means this variant is not on the page; anything else is a real failure.
        if (err.name !== 'TimeoutError') {
          throw err;
        }
      }
    }

    console.log(title);
    console.log(author);
    console.log(genre);
    console.log(format);
    console.log(publisher);
    console.log(newyear);
    console.log(newprice);
  } finally {
    await browser.close();
  }
}

// Report failures instead of leaving an unhandled promise rejection.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
Updated: with code from Answer
const puppeteer = require('puppeteer');
const parse = require('csv-parser');
const fs = require('fs');
/** Maximum time in milliseconds to wait for each optional price element. */
const PRICE_PROBE_TIMEOUT_MS = 1000;

/** Price elements in priority order: sale price, any price span, list price. */
const PRICE_SELECTORS = [
  'div.price.item-price-wrap.hidden-xs.hidden-sm > span.sale-price',
  'div.price.item-price-wrap.hidden-xs.hidden-sm > span',
  'p.list-price',
];

/**
 * Waits for `selector` to appear and returns a text property of its first match.
 *
 * @param {object} page - Page-like object providing `waitForSelector` and `$eval`.
 * @param {string} selector - CSS selector of the element to read.
 * @param {string} [property='innerText'] - Element property to return.
 * @param {object} [waitOptions={}] - Options forwarded to `page.waitForSelector`.
 * @returns {Promise<string>} The value of `property` on the matched element.
 */
async function readElementText(page, selector, property = 'innerText', waitOptions = {}) {
  await page.waitForSelector(selector, waitOptions);
  return page.$eval(selector, (el, prop) => el[prop], property);
}

/**
 * Returns the inner text of the first selector that shows up on the page.
 *
 * @param {object} page - Page-like object providing `waitForSelector` and `$eval`.
 * @param {string[]} selectors - Candidate selectors, highest priority first.
 * @param {number} timeout - Per-selector wait limit in milliseconds.
 * @returns {Promise<string|null>} The text, or `null` when no selector matched in time.
 * @throws Any error that is not a timeout (identified by `name === 'TimeoutError'`).
 */
async function readFirstAvailableText(page, selectors, timeout) {
  for (const selector of selectors) {
    try {
      return await readElementText(page, selector, 'innerText', { timeout });
    } catch (err) {
      // A timeout only means this variant is not on the page; anything else is a real failure.
      if (err.name !== 'TimeoutError') {
        throw err;
      }
    }
  }
  return null;
}

/**
 * Searches Book Depository for `upc` and scrapes the resulting product page.
 *
 * @param {object} page - Puppeteer page (or compatible object) to drive.
 * @param {string|number} upc - Code typed into the site's search box.
 * @returns {Promise<{title: string, author: string, genre: string, format: string,
 *   publisher: string, year: string, price: (string|null)}>} The scraped fields.
 *   `year` is the last 4 characters of the date text, `price` the last 6 characters
 *   of the price text, or `null` when no price element was found.
 */
async function getpageData(page, upc) {
  await page.goto('https://www.bookdepository.com/');
  // page.type needs a string; CSV values may arrive as numbers after post-processing.
  await page.type('#book-search-form > div.el-wrap.header-search-el-wrap > input.text-input', String(upc));
  await page.click('#book-search-form > div.el-wrap.header-search-el-wrap > button');

  const title = await readElementText(page, '.item-info h1', 'textContent');
  const author = await readElementText(page, 'div.author-info.hidden-md > span > a > span');
  const genre = await readElementText(page, '.active a');
  const format = await readElementText(page, '.item-info li');
  const publisher = await readElementText(page, 'div.biblio-wrap > div > ul > li:nth-child(4) > span > a > span');
  const year = await readElementText(page, 'div.biblio-wrap > div > ul > li:nth-child(3) > span');

  // The original try/catch/finally always executed the `finally` lookup, so every
  // book without a sale price failed. Probe the variants in order instead.
  const priceText = await readFirstAvailableText(page, PRICE_SELECTORS, PRICE_PROBE_TIMEOUT_MS);

  return {
    title,
    author,
    genre,
    format,
    publisher,
    year: year.slice(-4),
    price: priceText === null ? null : priceText.slice(-6),
  };
}
/**
 * Reads a CSV file and resolves with the CODECONTENT value of every row.
 *
 * @param {string} filename - Path of the CSV file.
 * @param {string} [delimiter=','] - Column separator used in the file.
 * @param {string} [encoding='utf-8'] - File encoding.
 * @returns {Promise<Array>} Resolves with one CODECONTENT entry per row; rejects
 *   if reading or parsing fails.
 */
function readCsvAsync(filename, delimiter = ',', encoding = 'utf-8') {
  // Synchronous throws inside the executor reject the promise automatically.
  return new Promise((resolve, reject) => {
    const rows = [];
    fs.createReadStream(filename, { encoding })
      // pipe() does not forward errors, so a missing file must be handled here.
      .on('error', reject)
      // csv-parser calls this option `separator`; `delimiter` would be silently ignored.
      .pipe(parse({ separator: delimiter }))
      .on('data', (row) => rows.push(row.CODECONTENT))
      .on('end', () => resolve(rows))
      .on('error', reject);
  });
}

/**
 * Loads the codes listed in Book_Bulk.csv.
 *
 * @returns {Promise<Array>} The CODECONTENT values, or an empty array (after
 *   logging the error) when the file could not be read, so callers can always
 *   iterate the result.
 */
async function upcData() {
  try {
    // The file is comma-separated; ':' only appeared to work because the
    // misnamed option was never applied.
    return await readCsvAsync('Book_Bulk.csv', ',');
  } catch (err) {
    console.error(err);
    return [];
  }
}
/**
 * Scrapes every code returned by `upcData()` one after another in a single
 * browser page and logs the collected results.
 *
 * A failure for one code is logged and skipped so the rest of the batch still
 * runs, and the browser is always closed.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const loaded = await upcData();
  // upcData() may yield undefined when reading the CSV failed.
  const allupcs = Array.isArray(loaded) ? loaded : [];
  if (allupcs.length === 0) {
    console.log([]);
    return;
  }

  const browser = await puppeteer.launch({ headless: false, defaultViewport: null, args: ['--start-maximized'] });
  const scrapedData = [];
  try {
    const page = await browser.newPage();
    // Sequential on purpose: all lookups share one page.
    for (const upc of allupcs) {
      try {
        scrapedData.push(await getpageData(page, upc));
      } catch (err) {
        console.error(`Failed to scrape UPC ${upc}:`, err);
      }
    }
  } finally {
    await browser.close();
  }
  console.log(scrapedData);
}

// Report failures instead of leaving an unhandled promise rejection.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
CodePudding user response:
As you have noticed, the CSV parser is asynchronous. "asynchronous" means you can't do this:
// Anti-pattern demo: the trailing numbers give the real order of execution.
// Steps 3 and 4 run before any row has been read, so upcData is still empty there.
const upcData = []; // 1
fs.createReadStream('Book_Bulk.csv') // 2
  // csv-parser's option is `separator`; a `delimiter` option is silently ignored.
  .pipe(parse({ separator: ',' }))
  .on('data', (csvrow) => { // 5 6 7 8 9
    upcData.push(csvrow.CODECONTENT);
  })
  .on('end', () => { // 10
    console.log(upcData);
  });
console.log(upcData); // 3
// call puppeteer or whatever // 4
I've outlined the order of execution. The last `console.log()` runs immediately after you set up the read stream, so `upcData` will not contain anything at that point.
It will contain data at point #10, after the `data` events (#5 to #9) have filled it.
That means: whatever you want to do with `upcData`, do it inside the `'end'` event handler.
// Fragment: continues the createReadStream(...).pipe(...) chain.
// 'end' is the last step in the numbered sequence (#10), after the 'data'
// handler has pushed every row, so upcData is ready to be consumed here.
.on('end',function() { // 10
console.log(upcData);
// Loop over the collected codes one by one.
for (let upc of upcData) {
// call puppeteer or whatever
}
});
Since the CSV reader emits one `data` event per row, you can also process each row directly in the `data` event handler and not build an `upcData` array at all.
.on('data', (csvrow) { // 5 6 7 8 9
const upc = csvrow.CODECONTENT;
// call puppeteer or whatever
})
If you want to be able to `await` the whole thing, you must turn it into a promise first. Here, too, the relevant step (resolving the promise) happens in the `end` callback:
/**
 * Reads a CSV file and resolves with all of its parsed rows.
 *
 * @param {string} filename - Path of the CSV file.
 * @param {string} [delimiter=','] - Column separator used in the file.
 * @param {string} [encoding='utf-8'] - File encoding.
 * @returns {Promise<object[]>} Resolves with every row emitted by the parser;
 *   rejects if reading or parsing fails.
 */
function readCsvAsync(filename, delimiter = ',', encoding = 'utf-8') {
  // Synchronous throws inside the executor reject the promise automatically.
  return new Promise((resolve, reject) => {
    const rows = [];
    fs.createReadStream(filename, { encoding })
      // pipe() does not forward errors, so a missing file must be handled here.
      .on('error', reject)
      // csv-parser calls this option `separator`; `delimiter` would be silently ignored.
      .pipe(parse({ separator: delimiter }))
      .on('data', (row) => rows.push(row))
      .on('end', () => resolve(rows))
      .on('error', reject);
  });
}

/**
 * Example entry point: awaits the parsed rows before doing any further work.
 *
 * @returns {Promise<void>}
 */
async function main() {
  try {
    // The sample file is comma-separated, so pass ',' rather than ':'.
    const rows = await readCsvAsync('Book_Bulk.csv', ',');
    // call puppeteer or whatever
  } catch (err) {
    console.log(err);
  }
}