Home > Net >  Passing requests with Scrapy
Passing requests with Scrapy

Time:12-21

I'm trying to send a request with Scrapy based on the brand number in the URL, extract the IDs from the response that provide information on the next page, and then iterate over the following pages to grab the product IDs.

I can pass the request, parse the product data, and pass it into further requests; however, I'm unsure how to define the function that lets me grab the cursor for the next page.

Here's my code:

class DepopItem(scrapy.Item):
    """Item holding one scraped product; every field uses a TakeFirst output processor."""
    # Declared, but the spider code shown with this item never adds a value to it.
    brands = Field(output_processor=TakeFirst())
    # Filled from the 'id' key of each entry in the response's 'products' list.
    ID = Field(output_processor=TakeFirst())
    # Filled with the brand number passed to the parse callback.
    brand = Field(output_processor=TakeFirst())

class DepopSpider(scrapy.Spider):
    """Spider that requests product search results from the Depop web API per brand."""
    name = 'depop'
    start_urls = ['https://webapi.depop.com/api/v2/search/filters/aggregates/?brands=1596&itemsPerPage=24&country=gb&currency=GBP&sort=relevance']

    # Brand numbers to request products for.
    brands = [1596]

    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
    }
    
    def start_requests(self, cursor=''):
        """Yield the spider's initial requests.

        Args:
            cursor: Value sent as the 'cursor' query parameter; defaults to ''.
        """
        for brand in self.brands:
            # NOTE(review): create_product_request takes a single `response`
            # parameter and calls `.json()` on it, so passing the int brand here
            # matches the reported "'int' object has no attribute 'json'" error.
            for item in self.create_product_request(brand):
                yield item
    
        # This request sits outside the loop above, so it is built once and uses
        # whatever value `brand` held on the last iteration.
        yield scrapy.FormRequest(
            url='https://webapi.depop.com/api/v2/search/products/',
            method='GET',
            formdata={
                'brands': str(brand),
                'cursor': cursor,
                'itemsPerPage': '24',
                'country': 'gb',
                'currency': 'GBP',
                'sort': 'relevance'
            },
            # Forwarded to the callback as the `brand` keyword argument.
            cb_kwargs={'brand': brand}
        )

    def parse(self, response, brand):
        """Yield one DepopItem per entry in the response's 'products' list.

        Args:
            response: Response whose body is decoded with `response.json()`.
            brand: Brand number stored on every item that is yielded.
        """

        # load stuff
        for item in response.json().get('products'):
            loader = ItemLoader(DepopItem())
            loader.add_value('brand', brand)
            loader.add_value('ID', item.get('id'))
            
            yield loader.load_item()

        # A truthy cursor under 'meta' is treated as the signal to continue.
        cursor = response.json()['meta'].get('cursor')
        if cursor:
            # NOTE(review): two arguments are passed here, but
            # create_product_request only declares one parameter besides self.
            for item in self.create_product_request(brand, cursor):
                yield item

    def create_product_request(self, response):
        """Yield the 'cursor' value found under 'meta' in the response's JSON body."""
        test = response.json()['meta'].get('cursor')
        yield test

I get the following error:

AttributeError: 'int' object has no attribute 'json'

Expected output:

{"brand": 1596, "ID": 273027529}
{"brand": 1596, "ID": 274115361}
{"brand": 1596, "ID": 270641301}
{"brand": 1596, "ID": 274505678}
{"brand": 1596, "ID": 262857014}
{"brand": 1596, "ID": 270088589}
{"brand": 1596, "ID": 208498028}
{"brand": 1596, "ID": 270426792}
{"brand": 1596, "ID": 274483351}
{"brand": 1596, "ID": 274109923}
{"brand": 1596, "ID": 273424157}
..
..
..

CodePudding user response:

`start_requests` runs before any request has been made, so there is no response inside it to read a cursor from.

You can handle the pagination recursively.

import scrapy
from scrapy.loader import ItemLoader
from scrapy import Field
from scrapy.loader.processors import TakeFirst


class DepopItem(scrapy.Item):
    """Container for one product result; each field's output processor is TakeFirst."""
    # Not populated by the spider defined alongside this item.
    brands = Field(output_processor=TakeFirst())
    # Product identifier, read from the 'id' key of a product entry.
    ID = Field(output_processor=TakeFirst())
    # Brand number the request was issued for.
    brand = Field(output_processor=TakeFirst())


class DepopSpider(scrapy.Spider):
    """Collect product IDs from the Depop search API for every brand in ``brands``.

    ``parse`` starts one first-page search per brand. ``parse_brand`` emits the
    products of a page and then follows that same page's cursor, so pagination
    stays inside the brand-filtered result set.
    """

    name = 'depop'

    # The response to this URL only triggers ``parse``; the brand-filtered
    # requests are built there.
    start_urls = ['https://webapi.depop.com/api/v2/search/products/']

    # Brand numbers to search for.
    brands = [1596]

    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
    }

    def _brand_request(self, brand, cursor=None):
        """Build the GET request for one page of search results of *brand*.

        Args:
            brand: Brand number to filter on.
            cursor: Cursor taken from the previous page of the same brand
                search, or ``None`` to request the first page.

        Returns:
            A ``scrapy.FormRequest`` whose callback is ``parse_brand``.
        """
        query = {
            'brands': str(brand),
            'itemsPerPage': '24',
            'country': 'gb',
            'currency': 'GBP',
            'sort': 'relevance',
        }
        if cursor:
            # Only sent for follow-up pages; the first page has no cursor.
            query['cursor'] = cursor

        return scrapy.FormRequest(
            url='https://webapi.depop.com/api/v2/search/products/',
            method='GET',
            formdata=query,
            cb_kwargs={'brand': brand},
            callback=self.parse_brand,
        )

    def parse(self, response):
        """Start a first-page search for each configured brand.

        The start response is not brand-filtered, so neither its products nor
        its cursor are used: reusing that cursor for a brand search would skip
        the first page of the brand's own results.
        """
        for brand in self.brands:
            yield self._brand_request(brand)

    def parse_brand(self, response, brand):
        """Yield the products of one result page, then request the next page.

        Args:
            response: JSON response of a brand-filtered search request.
            brand: Brand number the request was made for; stored on each item.

        Yields:
            One ``DepopItem`` per product, followed by the request for the
            next page when the response reports more results.
        """
        json_data = response.json()

        # load stuff
        # A missing or null 'products' key is treated as an empty page.
        for product in json_data.get('products') or []:
            loader = ItemLoader(DepopItem())
            loader.add_value('brand', brand)
            loader.add_value('ID', product.get('id'))
            yield loader.load_item()

        # pagination
        # Read the cursor defensively: it may be absent on the last page.
        meta = json_data.get('meta') or {}
        cursor = meta.get('cursor')
        if meta.get('hasMore') and cursor:
            yield self._brand_request(brand, cursor)

Output:

{'ID': 245137362, 'brand': 1596}
{'ID': 244263081, 'brand': 1596}
{'ID': 242128472, 'brand': 1596}
{'ID': 239929000, 'brand': 1596}
...
...
...

BTW, use rotating proxies or something similar, because I got blocked for 10 minutes with a "too many requests" response.

  • Related