I've been trying to scrape the next-page link iteratively from a webpage using `axios` and `cheerio`. When I run the script, I always get `undefined` as a result. How can I keep fetching the next-page link until there are no more pages?
This is my current attempt:
const axios = require("axios");
const cheerio = require("cheerio");
// Site origin; relative "next page" hrefs are resolved against it.
const base = "https://stackoverflow.com";
// First page of the tagged-question listing, where the crawl starts.
const url = `${base}/questions/tagged/web-scraping`;
/**
 * Crawls the paginated listing that starts at `url`, following each page's
 * `a[rel="next"]` link until a page has none. Every next-page href is logged;
 * the last value logged is `undefined`, which marks the final page.
 *
 * Any rejection from `axios.get` propagates to the caller.
 *
 * @returns {Promise<void>} Resolves once a page without a next link is reached.
 */
async function main() {
  // Same headers for every request, so build the options object once.
  const requestOptions = {
    headers: {
      "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36",
    },
  };

  // `url` is declared with const, so track the page being fetched in a local
  // variable instead of reassigning it.
  let currentUrl = url;

  while (currentUrl) {
    const response = await axios.get(currentUrl, requestOptions);
    // axios resolves to a response object; the HTML body is under `.data`.
    const $ = cheerio.load(response.data);
    const nextPage = $('a[rel="next"]').eq(0).attr("href");
    console.log(nextPage);
    // Resolve the href against the site origin; stop when there is no next link.
    currentUrl = nextPage ? new URL(nextPage, base).href : undefined;
  }
}

// Handle failures explicitly instead of leaving the promise floating.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
CodePudding user response:
The promise returned by `axios.get` resolves to a response object, which contains the actual HTML response body under `<axiosResponse>.data`. So if you change it to the following, it should work:
...
// axios.get resolves to a response object, not to the HTML string itself.
const response = await axios.get(url,{
headers: {
"User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36"
}
});
// The HTML body is under response.data, so that is what cheerio must parse.
const $ = cheerio.load(response.data);
...