I have the following web data collector:
/**
 * Fetches every URL in `urls`, collects the text of each `.fixedttitle2`
 * element per page into `mainarray[url]`, and writes `mainarray` to `1.json`
 * once all pages have been processed.
 *
 * NOTE(review): `Promise.map` is not part of the native Promise API; this
 * presumably relies on Bluebird being bound to `Promise` - confirm.
 * NOTE(review): `mainarray` is defined outside this function and is used as a
 * URL-keyed dictionary; it should be a plain object for JSON.stringify to keep
 * those keys - confirm.
 *
 * @param {string[]} urls - Page URLs to fetch.
 * @returns {Promise<void>} Settles after `1.json` has been written; rejects if
 *   any request or parse step fails.
 */
function start(urls) {
  return Promise.map(urls, requestPromise)
    .map((htmlPage, index) => {
      const $ = cheerio.load(htmlPage);
      // A fresh list per page, so one page's texts never leak into another's.
      const texts = [];
      $(".fixedttitle2").each(function () {
        texts.push($(this).text());
      });
      mainarray[urls[index]] = texts;
    })
    .then(() => {
      // Write only after every page has been parsed; writing synchronously
      // right after starting the requests would save the data before it exists.
      fs.writeFileSync("1.json", JSON.stringify(mainarray));
    });
}
// Build the 100 page URLs and hand each one to start() on its own call.
// start() expects a list of URLs, so each URL is wrapped in a one-element array.
const urls = [];
for (let i = 1; i <= 100; i++) {
  const url = "https://thisurl.com/" + i.toString();
  urls.push(url);
  start([url]);
}
Now I want to check the response of each request first. How can I check the response code up front, in order to get rid of the URLs that return a 500 error? How can I handle it?
CodePudding user response:
You might be looking for something like this.
- `scrape` (née `start`) processes a single URL and returns a promise of `[url, content]`, or if there's an error, `[url, null]`.
- `main` generates the list of URLs to scrape, then starts `scrape` for all of them.
  - Note that all 100 requests start at once; this may or may not be a problem for you.
- Finally, when all of the scrape promises complete, their return values are gathered into `response`, and that's written into the JSON file.
  - This differs from the original in that the original kept re-writing the file as new content was scraped.
/**
 * Downloads one page and gathers the text of every `.fixedttitle2` element.
 *
 * Any error raised while requesting or parsing the page is logged to stderr
 * and reported as a `null` payload instead of being rethrown, so a single
 * failing URL never rejects the returned promise.
 *
 * @param {string} url - Address of the page to fetch.
 * @returns {Promise<[string, string[] | null]>} `[url, texts]` on success,
 *   `[url, null]` when anything went wrong.
 */
async function scrape(url) {
  try {
    const $ = cheerio.load(await requestPromise(url));
    const collected = [];
    $('.fixedttitle2').each((_index, element) => {
      collected.push($(element).text());
    });
    return [url, collected];
  } catch (failure) {
    console.error(`Error processing url: ${url}: ${failure}`);
    return [url, null];
  }
}
/**
 * Scrapes pages 1..100 concurrently and writes all results to `1.json`.
 *
 * Every `scrape` call is started before any of them is awaited, so all 100
 * requests are in flight at once. The file is written a single time, after
 * every scrape promise has settled.
 *
 * @returns {Promise<void>} Resolves once `1.json` has been written.
 */
async function main() {
  const urls = [];
  for (let i = 1; i <= 100; i++) {
    urls.push(`https://thisurl.com/${i}`);
  }
  // Wrap in an arrow so scrape receives only the URL, not map's index/array.
  const response = await Promise.all(urls.map((url) => scrape(url)));
  fs.writeFileSync('1.json', JSON.stringify(response));
}
If you'd like the requests to be done sequentially, you can `await scrape()` in the loop:
/**
 * Scrapes pages 1..100 one after another and writes all results to `1.json`.
 *
 * Each `scrape` call is awaited before the next one starts, so at most one
 * request is in flight at a time. The file is written once, after the loop.
 *
 * @returns {Promise<void>} Resolves once `1.json` has been written.
 */
async function main() {
  const response = [];
  for (let i = 1; i <= 100; i++) {
    const url = `https://thisurl.com/${i}`;
    // Awaiting inside the loop is deliberate: it serializes the requests.
    response.push(await scrape(url));
  }
  fs.writeFileSync('1.json', JSON.stringify(response));
}
You could also move the write file call into the loop if you wanted the same incremental behavior your original code had.