I am trying to scrape data from https://coinmarketcap.com/.
What is the solution?
CodePudding user response:
You need to add an Accept-Encoding header
with the value text/html; charset=UTF-8 to the axios GET request.
const axios = require("axios")
const cheerio = require("cheerio")
const express = require("express")
/**
 * Requests the CoinMarketCap home page with axios, parses the body with
 * cheerio, and logs, in order: the raw response body, the document as
 * re-serialized by cheerio, and then every element matched by the row
 * selector followed by its index.
 *
 * Takes no arguments and returns nothing useful: any error raised inside the
 * try block is caught and reported with console.error instead of propagating.
 */
async function getPriceFeed() {
  const pageUrl = 'https://coinmarketcap.com/';
  // The selector relies on generated class names (e.g. "sc-1a736df3-0"), so it
  // presumably needs updating whenever the site is rebuilt - verify before use.
  const rowSelector = "#__next > div > div.main-content > div.sc-1a736df3-0.PimrZ.cmc-body-wrapper > div > div:nth-child(1) > div.sc-f7a61dda-2.efhsPu > table > tbody > tr";
  const requestOptions = {
    method: "GET",
    url: pageUrl,
    // NOTE(review): this value is a media type, not a content coding;
    // presumably it is here so the server replies with an uncompressed
    // body - confirm before changing it.
    headers: { 'Accept-Encoding': 'text/html; charset=UTF-8' },
  };

  try {
    const { data: html } = await axios(requestOptions);
    console.log(html);

    const $ = cheerio.load(html);
    console.log($.html());

    const rows = $(rowSelector);
    rows.each((position, node) => {
      console.log(node);
      console.log(position);
    });
  } catch (failure) {
    console.error(failure);
  }
}

getPriceFeed();
Result
... removed
Element {
parent: [Circular *2],
prev: [Element],
next: null,
startIndex: null,
endIndex: null,
children: [Array],
name: 'td',
attribs: [Object: null prototype] {},
type: 'tag',
namespace: 'http://www.w3.org/1999/xhtml',
'x-attribsNamespace': [Object: null prototype] {},
'x-attribsPrefix': [Object: null prototype] {}
}
],
name: 'tr',
attribs: [Object: null prototype] { class: 'sc-428ddaf3-0 bKFMfg' },
type: 'tag',
namespace: 'http://www.w3.org/1999/xhtml',
'x-attribsNamespace': [Object: null prototype] { class: undefined },
'x-attribsPrefix': [Object: null prototype] { class: undefined }
}
99
How about using another web-crawling library such as Puppeteer? It makes XPath queries easier to handle.
Here is demo code using Puppeteer:
const puppeteer = require("puppeteer");
const cheerio = require("cheerio");
/**
 * Opens the CoinMarketCap home page in a browser launched through Puppeteer,
 * captures the outer HTML of <body>, and uses cheerio to collect the text of
 * every element matching ".cmc-link > div > div > p" into an array of
 * { name } objects, which is printed with console.log.
 *
 * If any step fails, the error is logged with console.error and the process
 * exit code is set to 1.
 */
(async () => {
  const url = "https://coinmarketcap.com/";
  const browser = await puppeteer.launch();

  let body;
  try {
    const page = await browser.newPage();
    await page.goto(url);
    body = await page.evaluate(() => document.body.outerHTML);
  } finally {
    // Close the browser even when navigation or evaluation throws; otherwise
    // the launched browser would be left running.
    await browser.close();
  }

  const $ = cheerio.load(body);
  const names = [];
  $(".cmc-link > div > div > p").each((i, item) => {
    names.push({ name: $(item).text() });
  });
  console.log(names);
})().catch((err) => {
  // Do not leave the promise floating: report the failure and signal it to
  // the caller through a non-zero exit code.
  console.error(err);
  process.exitCode = 1;
});
Result
$ node get-data.js
[
{ name: 'Bitcoin' },
{ name: 'Ethereum' },
{ name: 'Tether' },
{ name: 'BNB' },
{ name: 'USD Coin' },
{ name: 'Binance USD' },
{ name: 'XRP' },
{ name: 'Dogecoin' },
{ name: 'Cardano' },
{ name: 'Polygon' }
]