Home > OS >  Store values into keys with scrapy
Store values into keys with scrapy

Time:11-16

I want to extract information from a website, such as prices, and store it as values in a dictionary. However, I'm trying to learn Scrapy, so I'd like to know how to achieve this with it.

Here's how it would look with requests and BeautifulSoup:

from collections import defaultdict  # was missing: defaultdict is used below

import pandas as pd
import requests  # was "import requests as r", which the loop then shadowed/broke
from bs4 import BeautifulSoup  # fixed typo: was "BeauitfulSoup"

# Five pages of the same eBay category listing, sorted by price (_sop=16).
html = ['https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=1&_sop=16',
       'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=2&_sop=16',
       'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=3&_sop=16',
       'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=4&_sop=16',
       'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=5&_sop=16']

data = defaultdict(list)
# Iterate the URLs directly instead of indexing (the original
# "for i in range(0, len(html):" was also missing a closing paren).
for page_url in html:
    resp = requests.get(page_url)
    soup = BeautifulSoup(resp.content, 'lxml')
    names = soup.select(".s-item__title")
    values = soup.select(".ITALIC")
    # zip stops at the shorter list, so names/prices stay paired.
    for n, v in zip(names, values):
        data["card"].append(n.text.strip())
        data["price"].append(v.text.strip())

Here's what I have tried with Scrapy, but I do not get any values in the JSON output — I just get the links. How do I get output like the code above?

import scrapy
from scrapy.loader import ItemLoader
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.crawler import CrawlerProcess

import numpy as np  # was missing in this snippet: np.array is used below
import pandas as pd  # was missing in this snippet: pd.DataFrame is used below

# Five pages of the same eBay category listing, sorted by price (_sop=16).
html = np.array(['https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=1&_sop=16',
       'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=2&_sop=16',
       'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=3&_sop=16',
       'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=4&_sop=16',
       'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=5&_sop=16'],
      dtype=object)

# Wrap the URLs in a DataFrame; the spider reads them via url.data.values.
url = pd.DataFrame(html, columns=['data'])

class StatisticsItem(scrapy.Item):
    """One scraped listing page: the raw price markup plus the page URL."""

    # TakeFirst collapses the loader's list of collected values to its
    # first entry, so each field serializes as a scalar, not a list.
    statistics_div = scrapy.Field(output_processor=TakeFirst())
    url = scrapy.Field(output_processor=TakeFirst())


class StatisticsSpider(scrapy.Spider):
    """Fetch each eBay listing page and yield the first price div and the URL."""

    name = 'statistics'
    # Pulled from the module-level DataFrame of page URLs.
    start_urls = url.data.values

    def start_requests(self):
        # Issue one request per page; parsing defaults to self.parse.
        for page_url in self.start_urls:
            yield scrapy.Request(page_url)

    def parse(self, response):
        # First price cell on the page; .get() returns None if nothing matches.
        table = response.xpath("//div[@class='s-item__price']").get()

        loader = ItemLoader(item=StatisticsItem())
        # BUG FIX: the original called add_value('values', ...), but 'values'
        # is not a declared field on StatisticsItem (only statistics_div and
        # url are), so the loader raised KeyError and no price was output.
        loader.add_value('statistics_div', table)
        loader.add_value('url', response.url)
        yield loader.load_item()


# Run the spider in-process and export each item as one JSON line.
# FEED_URI / FEED_FORMAT are deprecated since Scrapy 2.1; the FEEDS
# mapping is the supported way to configure exports.
process = CrawlerProcess(
    settings={
        'FEEDS': {
            'ebay_data.json': {'format': 'jsonlines'},
        },
    }
)
process.crawl(StatisticsSpider)
process.start()  # blocks until the crawl finishes

CodePudding user response:

Here is the minimal working solution so far

Code:

import numpy as np
import requests 
import pandas as pd
from bs4 import BeautifulSoup

# Five pages of the same eBay category listing, sorted by price (_sop=16).
urls = ['https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=1&_sop=16',
       'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=2&_sop=16',
       'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=3&_sop=16',
       'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=4&_sop=16',
       'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=5&_sop=16']

data = []
for url in urls:
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'lxml')
    # Each <li> in the results grid is one listing card.
    cards = soup.select("ul.b-list__items_nofooter.srp-results.srp-grid li")
    for card in cards:
        t = card.select_one('h3.s-item__title')
        # BUG FIX: the original appended the Tag object itself, so the
        # DataFrame held bracketed HTML reprs ("[THE ULTIMATE ...") instead
        # of plain title strings. Extract the text, guarding against None.
        title = t.get_text(strip=True) if t else None
        p = card.select_one('span.s-item__price span')
        price = p.get_text(strip=True) if p else None
        data.append([title, price])

df = pd.DataFrame(data, columns=['Title', 'Price'])
print(df)
# df.to_csv('info.csv', index=False)

Output:

                                               Title          Price 
0    [THE ULTIMATE COMPLETE 1ST EDITION POKEMON BOO...  £4,466,944.61 
1    [Japanese Old Back Pokemon Trophy Card NO.2 Ne...  £1,315,157.83 
2    [Non-Ultimate Pokemon WOTC Booster Box Collect...  £1,095,964.86 
3    [First Digital Charizard Lv.80 from Pokemon Re...    £850,245.68 
4    [First Digital Blastoise Lv.100 from Pokemon R...    £850,245.68 
..                                                 ...            ... 
187  [PSA 9 BLASTOISE 1999 Pokemon 1st Edition THIN...     £29,779.62 
188  [Pokemon Charizard No Rarity Symbol            
  • Related