I want to extract information, such as the price, from a website and store it as values in a dictionary. However, I'm trying to learn Scrapy, so I'd like to know how to achieve this with it.
Here's what it would look like with requests and BeautifulSoup:
from collections import defaultdict

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Result pages to scrape; the five URLs are identical except for the `_pgn`
# page number (1-5).
html = [
    'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=1&_sop=16',
    'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=2&_sop=16',
    'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=3&_sop=16',
    'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=4&_sop=16',
    'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=5&_sop=16',
]

# Column-oriented result: data["card"][i] belongs with data["price"][i].
data = defaultdict(list)
for page_url in html:
    # A timeout keeps a stalled connection from hanging the script forever.
    response = requests.get(page_url, timeout=30)
    soup = BeautifulSoup(response.content, 'lxml')
    name = soup.select(".s-item__title")
    value = soup.select(".ITALIC")
    # zip() stops at the shorter sequence, so any surplus titles or prices
    # on a page are silently dropped.
    for n, v in zip(name, value):
        data["card"].append(n.text.strip())
        data["price"].append(v.text.strip())
Here's what I have tried with Scrapy, but I do not get any values when I look at the JSON output; I just get the links. How do I get output like that of the code above?
import scrapy
from scrapy.loader import ItemLoader
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.crawler import CrawlerProcess
# Start pages for the crawl, held as an object-dtype NumPy array. The five
# URLs are identical except for the `_pgn` page number (1-5).
html = np.array(
    [
        'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=1&_sop=16',
        'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=2&_sop=16',
        'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=3&_sop=16',
        'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=4&_sop=16',
        'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=5&_sop=16',
    ],
    dtype=object,
)

# Single-column frame ('data') wrapping the same URLs; the spider reads its
# start URLs from this column.
url = pd.DataFrame(data=html, columns=['data'])
class StatisticsItem(scrapy.Item):
    """Scraped record: one extracted value plus the page it came from.

    Both fields use ``TakeFirst`` as their output processor, so a loader
    emits a single value per field instead of a list.
    """

    # Value extracted from the listing page.
    statistics_div = scrapy.Field(output_processor=TakeFirst())
    # Address of the page the value was taken from.
    url = scrapy.Field(output_processor=TakeFirst())
class StatisticsSpider(scrapy.Spider):
    """Collect listing prices from the configured eBay result pages.

    One ``StatisticsItem`` is yielded per price element found on a page,
    with the price text in ``statistics_div`` and the page address in
    ``url``.
    """

    name = 'statistics'
    # A plain list of str instead of a NumPy array: Scrapy only documents
    # list-like start_urls, and an array has no well-defined truth value.
    start_urls = url['data'].tolist()

    def start_requests(self):
        """Yield one ``scrapy.Request`` per entry in ``start_urls``."""
        for page_url in self.start_urls:
            yield scrapy.Request(page_url)

    def parse(self, response):
        """Yield a ``StatisticsItem`` for every price element in ``response``.

        Args:
            response: The downloaded listing page.

        Yields:
            StatisticsItem: ``statistics_div`` holds the whitespace-normalised
            price text, ``url`` holds ``response.url``.
        """
        # NOTE(review): the price class appears to sit on a <span>, not a
        # <div>, so the previous div XPath matched nothing and only the URL
        # was exported - confirm against the live markup.
        price_nodes = response.xpath("//span[contains(@class, 's-item__price')]")
        for price_node in price_nodes:
            loader = ItemLoader(item=StatisticsItem())
            # The field name must be one declared on StatisticsItem; an
            # undeclared name (previously 'values') raises KeyError as soon
            # as a non-empty value is added.
            loader.add_value(
                'statistics_div',
                price_node.xpath('normalize-space(.)').get(),
            )
            loader.add_value('url', response.url)
            yield loader.load_item()
# Run the spider in-process and export the scraped items to ebay_data.json
# in JSON Lines format (one JSON object per line).
process = CrawlerProcess(
    settings={
        # FEEDS replaces the deprecated FEED_URI / FEED_FORMAT pair; the
        # output path and format are unchanged.
        'FEEDS': {
            'ebay_data.json': {'format': 'jsonlines'},
        },
    }
)
process.crawl(StatisticsSpider)
# Blocks here until the crawl has finished.
process.start()
CodePudding user response:
Here is the minimal working solution so far
Code:
import numpy as np
import requests
import pandas as pd
from bs4 import BeautifulSoup
# Result pages to scrape; the five URLs are identical except for the `_pgn`
# page number (1-5).
urls = [
    'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=1&_sop=16',
    'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=2&_sop=16',
    'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=3&_sop=16',
    'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=4&_sop=16',
    'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=5&_sop=16',
]

# One [title, price] row per <li> inside the results list of each page.
data = []
for url in urls:
    # A timeout keeps a stalled connection from hanging the script forever.
    r = requests.get(url, timeout=30)
    soup = BeautifulSoup(r.content, 'lxml')
    cards = soup.select("ul.b-list__items_nofooter.srp-results.srp-grid li")
    for card in cards:
        title_tag = card.select_one('h3.s-item__title')
        price_tag = card.select_one('span.s-item__price span')
        # Store the text, not the bs4 Tag: a Tag placed in the DataFrame
        # prints like a list (e.g. "[THE ULTIMATE ...") instead of a string.
        # select_one() returns None when nothing matches, hence the guards.
        title = title_tag.get_text(strip=True) if title_tag is not None else None
        price = price_tag.text if price_tag is not None else None
        data.append([title, price])

cols = ['Title', 'Price']
df = pd.DataFrame(data, columns=cols)
print(df)
# Optional: persist the table to disk.
# df.to_csv('info.csv', index=False)
Output:
Title Price
0 [THE ULTIMATE COMPLETE 1ST EDITION POKEMON BOO... £4,466,944.61
1 [Japanese Old Back Pokemon Trophy Card NO.2 Ne... £1,315,157.83
2 [Non-Ultimate Pokemon WOTC Booster Box Collect... £1,095,964.86
3 [First Digital Charizard Lv.80 from Pokemon Re... £850,245.68
4 [First Digital Blastoise Lv.100 from Pokemon R... £850,245.68
.. ... ...
187 [PSA 9 BLASTOISE 1999 Pokemon 1st Edition THIN... £29,779.62
188 [Pokemon Charizard No Rarity Symbol