I would like to scrape multiple websites using Node.js, Express, Cheerio and Axios.
I am now able to scrape one website and display the information in the HTML page.
But when I try to scrape multiple websites, looking for the same element on each, the code doesn't go through the whole forEach (it stops after one cycle). Note my loop, which doesn't work correctly:
urls.forEach(url => {
The two most important files are index.js and App.js. index.js:
// --- Dependencies ---
const axios = require('axios');
const cheerio = require('cheerio');
const express = require('express');
const cors = require('cors');

// --- Server configuration ---
const PORT = 8000;
// Pages that the scraper fetches.
const urls = ['https://www.google.nl', 'https://www.google.de'];
// const url = 'https://www.heineken.com/nl/nl/'

// --- Express application, with the CORS middleware applied to every route ---
const app = express();
app.use(cors());
// Root route: replies with the JSON string "Robin".
app.get('/', (_req, res) => {
  res.json('Robin');
});
// For every entry in `urls`, register a GET '/results' handler that scrapes that URL.
// NOTE(review): each iteration registers a handler for the SAME path. Express answers a
// request with the first matching handler (later ones only run if next() is called), so
// only the first URL is ever scraped -- this is the "stops after 1 cycle" symptom.
urls.forEach(url => {
app.get('/results', (req, res) => {
// Fetch the page; the response body is the raw HTML.
axios(url)
.then(response => {
const html = response.data
const $ = cheerio.load(html)
// Collected { link } objects for this page.
const articles = []
// Visit every <script> element; `html` is passed as the selector's context argument.
$('script', html).each(function(){
// Read the `namespace` property of the underlying node for the current element.
const link = $(this).get()[0].namespace
if (link !== undefined) {
// Keep only values that contain 'w3.org'.
if (link.indexOf('w3.org') > -1) {
articles.push({
link
})
}
}
})
// Reply with the list collected for this single URL.
res.json(articles)
// NOTE(review): a failure is only logged; no response is sent on this path.
}).catch(err => console.log(err))
})
})
// Start the HTTP server. The message must be a backtick template literal:
// inside ordinary quotes `${PORT}` is printed literally instead of being interpolated.
app.listen(PORT, () => console.log(`server running on PORT ${PORT}`));
App.js:
// Fetch the scraped results from the local API and render each link as an <h3> inside #root.
const root = document.querySelector('#root');

fetch('http://localhost:8000/results')
  .then((response) => {
    // Surface HTTP errors explicitly instead of failing later while parsing the body.
    if (!response.ok) {
      throw new Error(`Request failed with status ${response.status}`);
    }
    return response.json();
  })
  .then((data) => {
    console.log(data);
    data.forEach((article) => {
      // Scraped values are untrusted: insert them as text, never as HTML markup.
      const title = document.createElement('h3');
      title.textContent = article.link;
      root.append(title);
    });
  })
  // Never leave the promise chain without a rejection handler.
  .catch((err) => console.error(err));
CodePudding user response:
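You're registering multiple route handlers for the same route. Express only routes a request to the first matching handler (later ones run only if an earlier one calls next()), so only the first URL is ever scraped. Move your URL loop inside app.get("/results", ...) instead: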
...
/**
 * GET /results
 *
 * Fetches every URL in `urls` in parallel, collects the `namespace` value of each
 * <script> element that contains "w3.org", and responds with one flat JSON array
 * of `{ link }` objects covering all pages.
 *
 * If any request or parse step fails, the error is logged and forwarded to
 * Express via `next(err)` so the client still receives an error response.
 */
app.get("/results", async (req, res, next) => {
  try {
    const articlesPerUrl = await Promise.all(
      urls.map(async (url) => {
        const { data } = await axios(url);
        const $ = cheerio.load(data);
        const articles = [];

        // The page is already loaded into `$`, so the selector needs no context
        // argument (there is no `html` variable in this scope).
        $("script").each((_index, element) => {
          const link = element.namespace;
          // Skip elements without a string namespace instead of throwing on them.
          if (typeof link === "string" && link.includes("w3.org")) {
            articles.push({ link });
          }
        });

        return articles;
      })
    );

    // Un-nest the per-URL arrays into a single list of articles.
    res.json(articlesPerUrl.flat());
  } catch (err) {
    console.error(err);
    next(err); // make sure Express responds with an error
  }
});