I am creating a Scrapy bot that scrapes ETFs from a website, but I cannot get it to handle pagination. I want it to scrape the second page, but whenever I try, it scrapes the first page at the base URL instead.
Code:
import scrapy

class EtfsSpider(scrapy.Spider):
    name = "etfs"
    start_urls = ['https://etfdb.com/etfs/asset-class/bond/#etfs&sort_name=assets_under_management&sort_order=desc&page=2']

    def parse(self, response):
        etf_table = response.css('table#etfs tbody')
        for etf in etf_table.css('tr'):
            symbol = etf.css('td[data-th="Symbol"] a::text').get()
            name = etf.css('td[data-th="ETF Name"] a::text').get()
            total_assets = etf.css('td[data-th="Total Assets ($MM)"]::text').get()
            avg_daily_vol = etf.css('td[data-th="Avg. Daily Volume"]::text').get()
            closing_price = etf.css('td[data-th="Previous Closing Price"]::text').get()
            yield {
                "symbol": symbol,
                "name": name,
                "total assets": total_assets,
                "average daily volume": avg_daily_vol,
                "last closing price": closing_price
            }
In my mind, this would go to the URL in start_urls, which here is the second page of the ETF table, but this is the output I get in the console:
2022-08-13 22:36:44 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://etfdb.com/robots.txt> (referer: None)
2022-08-13 22:36:45 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://etfdb.com/etfs/asset-class/bond/#etfs&sort_name=assets_under_management&sort_order=desc&page=2> (referer: None)
2022-08-13 22:36:46 [scrapy.core.scraper] DEBUG: Scraped from <200 https://etfdb.com/etfs/asset-class/bond/>
{'symbol': 'BND', 'name': 'Vanguard Total Bond Market ETF', 'total assets': '$84,446.60', 'average daily volume': None, 'last closing price': '$75.95'}
So the log says it crawled the right URL, but the items are actually scraped from the base URL, which is just the first page. I have no idea how to fix this.
CodePudding user response:
Everything after the # in your start URL is a fragment; it is never sent to the server, so Scrapy always receives the first page, and the table rows are then rendered client-side with JavaScript. Instead of parsing the HTML, you can pull the data from the JSON endpoint the page calls.
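Before wiring this into a spider, you can sanity-check the endpoint and see which fields each row exposes. This is just a sketch using the requests library; the exact headers required may vary, and the tm and cond query parameters are copied from the devtools request:

import requests

url = ('https://etfdb.com/data_set/?tm=92960'
       '&cond={"by_type":["Etfdb::EtfType",374,null,false,false]}'
       '&no_null_sort=true&count_by_id=&limit=25'
       '&sort=assets_under_management&order=desc&offset=0')
# the endpoint expects an AJAX-style request
resp = requests.get(url, headers={"X-Requested-With": "XMLHttpRequest"})
print(resp.json()['rows'][0].keys())  # inspect the available fields

The spider below pages through the same endpoint by incrementing the offset query parameter: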
import scrapy

class EtfsSpider(scrapy.Spider):
    name = "etfs"
    # I got the JSON url from the "network" tab in the browser's devtools
    start_urls = ['https://etfdb.com/data_set/?tm=92960&cond={"by_type":["Etfdb::EtfType",374,null,false,false]}&no_null_sort=true&count_by_id=&limit=25&sort=assets_under_management&order=desc&limit=25&offset=']
    # the offset starts at zero and grows by the "limit" value on every page
    offset = 0
    # I got the headers from the "network" tab in the browser's devtools
    headers = {
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.5",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Content-Type": "application/json",
        "DNT": "1",
        "Host": "etfdb.com",
        "Pragma": "no-cache",
        "Referer": "https://etfdb.com/etfs/asset-class/bond/",
        "Sec-Fetch-Dest": "empty",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        "Sec-GPC": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest"
    }
    # use a small delay to reduce the chance of getting blocked
    custom_settings = {'DOWNLOAD_DELAY': 0.3}

    def start_requests(self):
        url = self.start_urls[0] + str(self.offset)
        yield scrapy.Request(url=url, headers=self.headers)

    def parse(self, response):
        json_data = response.json()
        # stopping condition: an empty "rows" list means there are no more pages
        if not json_data['rows']:
            self.logger.info("Finished scraping")
            return
        for item in json_data['rows']:
            # "symbol" and "name" are HTML snippets, so parse them with a
            # Selector in order to get the text
            symbol = item['symbol']
            selector = scrapy.Selector(text=symbol, type="html")
            symbol = selector.xpath('//text()').get()
            name = item['name']
            selector = scrapy.Selector(text=name, type="html")
            name = selector.xpath('//text()').get()
            total_assets = item['assets_under_management']
            avg_daily_vol = item['three_month_average_volume']
            closing_price = item['price']
            yield {
                "symbol": symbol,
                "name": name,
                "total assets": total_assets,
                "average daily volume": avg_daily_vol,
                "last closing price": closing_price
            }
        # next page: advance the offset and request the next batch
        self.offset += 25
        url = self.start_urls[0] + str(self.offset)
        yield scrapy.Request(url=url, headers=self.headers)
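To try it out, assuming the spider lives in a normal Scrapy project, you can run it from the project directory and dump the items to a file (the filename here is just an example; the -O flag, available since Scrapy 2.1, overwrites the output file on each run):

scrapy crawl etfs -O bond_etfs.json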