I'm trying to pass a request with scrapy
based on the brand
number in the url, and then extract id's
from the webpage which provide information on the next page, then iterate over the next pages to grab the product ID's.
I can pass the request and parse the data for products and send this into requests, however I'm unsure on defining the function to let me grab the cursors for the next page.
Here's my code:
class DepopItem(scrapy.Item):
brands = Field(output_processor=TakeFirst())
ID = Field(output_processor=TakeFirst())
brand = Field(output_processor=TakeFirst())
class DepopSpider(scrapy.Spider):
name = 'depop'
start_urls = ['https://webapi.depop.com/api/v2/search/filters/aggregates/?brands=1596&itemsPerPage=24&country=gb¤cy=GBP&sort=relevance']
brands = [1596]
custom_settings = {
'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
}
def start_requests(self, cursor=''):
for brand in self.brands:
for item in self.create_product_request(brand):
yield item
yield scrapy.FormRequest(
url='https://webapi.depop.com/api/v2/search/products/',
method='GET',
formdata={
'brands': str(brand),
'cursor': cursor,
'itemsPerPage': '24',
'country': 'gb',
'currency': 'GBP',
'sort': 'relevance'
},
cb_kwargs={'brand': brand}
)
def parse(self, response, brand):
# load stuff
for item in response.json().get('products'):
loader = ItemLoader(DepopItem())
loader.add_value('brand', brand)
loader.add_value('ID', item.get('id'))
yield loader.load_item()
cursor = response.json()['meta'].get('cursor')
if cursor:
for item in self.create_product_request(brand, cursor):
yield item
def create_product_request(self, response):
test = response.json()['meta'].get('cursor')
yield test
I get the following error:
AttributeError: 'int' object has no attribute 'json'
Expected output:
{"brand": 1596, "ID": 273027529}
{"brand": 1596, "ID": 274115361}
{"brand": 1596, "ID": 270641301}
{"brand": 1596, "ID": 274505678}
{"brand": 1596, "ID": 262857014}
{"brand": 1596, "ID": 270088589}
{"brand": 1596, "ID": 208498028}
{"brand": 1596, "ID": 270426792}
{"brand": 1596, "ID": 274483351}
{"brand": 1596, "ID": 274109923}
{"brand": 1596, "ID": 273424157}
..
..
..
CodePudding user response:
start_requests
is ran before the requests are made.
You can handle the pagination recursively.
import scrapy
from scrapy.loader import ItemLoader
from scrapy import Field
from scrapy.loader.processors import TakeFirst
class DepopItem(scrapy.Item):
brands = Field(output_processor=TakeFirst())
ID = Field(output_processor=TakeFirst())
brand = Field(output_processor=TakeFirst())
class DepopSpider(scrapy.Spider):
name = 'depop'
start_urls = ['https://webapi.depop.com/api/v2/search/products/']
brands = [1596]
custom_settings = {
'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
}
def parse(self, response):
json_data = response.json()
# pagination
cursor = json_data['meta']['cursor']
if json_data['meta']['hasMore']:
yield scrapy.FormRequest(
url='https://webapi.depop.com/api/v2/search/products/',
method='GET',
formdata={'cursor': cursor}
)
for brand in self.brands:
yield scrapy.FormRequest(
url='https://webapi.depop.com/api/v2/search/products/',
method='GET',
formdata={
'brands': str(brand),
'cursor': cursor,
'itemsPerPage': '24',
'country': 'gb',
'currency': 'GBP',
'sort': 'relevance'
},
cb_kwargs={'brand': brand},
callback=self.parse_brand
)
def parse_brand(self, response, brand):
# load stuff
for item in response.json().get('products'):
loader = ItemLoader(DepopItem())
loader.add_value('brand', brand)
loader.add_value('ID', item.get('id'))
yield loader.load_item()
Output:
{'ID': 245137362, 'brand': 1596}
{'ID': 244263081, 'brand': 1596}
{'ID': 242128472, 'brand': 1596}
{'ID': 239929000, 'brand': 1596}
...
...
...
BTW, use rotating proxies or something because I got blocked for 10 minutes because "too many requests".