I'm trying to extract some values from keys in the JSON response of a webpage. Unfortunately, when I do this it returns only the keys, and I cannot seem to grab the values. Because there is a long list of keys and they are numbered, I cannot figure out how to get the values for all of them.
For example, here's my working code:
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.loader import ItemLoader
from scrapy.item import Field
from itemloaders.processors import TakeFirst
class DepopItem(scrapy.Item):
    """Scraped item exposing a single ``brands`` field."""

    # TakeFirst reduces the values collected by the loader to the first
    # non-empty one, so the field holds a scalar instead of a list.
    brands = Field(
        output_processor=TakeFirst(),
    )
class DepopSpider(scrapy.Spider):
    """Spider that fetches Depop's search-aggregates JSON and loads its ``brands`` entries."""

    name = 'depop'
    allowed_domains = ["depop.com"]
    # The query string must contain "&currency=GBP"; the "&curren" prefix had
    # been decoded as the HTML entity for the currency sign, corrupting the URL.
    start_urls = ['https://webapi.depop.com/api/v2/search/filters/aggregates/?brands=1596&itemsPerPage=24&country=gb&currency=GBP&sort=relevance']
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
    }

    def start_requests(self):
        """Yield one request per entry in ``start_urls``, handled by ``parse``."""
        for url in self.start_urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse,
            )

    def parse(self, response):
        """Yield one ``DepopItem`` per entry of the ``brands`` object in the JSON body."""
        resp = response.json()['brands']
        # Iterating the mapping directly yields its keys, so ``item`` is the
        # key string here and not the nested value stored under that key.
        for item in resp:
            loader = ItemLoader(DepopItem(), selector=item)
            loader.add_value('brands', item)
            yield loader.load_item()
This returns a list of the keys:
{"brands": "1"}
{"brands": "2"}
{"brands": "3"}
{"brands": "4"}
{"brands": "5"}
{"brands": "7"}
{"brands": "9"}
Instead I want the values corresponding to these keys:
{"brands": 946}
{"brands": 2376}
{"brands": 1286}
{"brands": 2774}
{"brands": 489}
{"brands": 11572}
{"brands": 1212}
CodePudding user response:
Either iterate over resp.values() or look each value up with resp[item].
Example:
import scrapy
from scrapy.loader import ItemLoader
from scrapy.item import Field
from itemloaders.processors import TakeFirst
class DepopItem(scrapy.Item):
    """Item carrying one ``brands`` value per scraped brand entry."""

    # The output processor keeps only the first non-empty collected value.
    brands = Field(
        output_processor=TakeFirst(),
    )
class DepopSpider(scrapy.Spider):
    """Spider that loads the ``count`` of every brand in Depop's search-aggregates JSON."""

    name = 'depop'
    allowed_domains = ["depop.com"]
    # The query string must contain "&currency=GBP"; the "&curren" prefix had
    # been decoded as the HTML entity for the currency sign, corrupting the URL.
    start_urls = ['https://webapi.depop.com/api/v2/search/filters/aggregates/?brands=1596&itemsPerPage=24&country=gb&currency=GBP&sort=relevance']
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
    }

    def parse(self, response):
        """Yield one ``DepopItem`` holding the ``count`` of each brand entry."""
        resp = response.json()['brands']
        # Iterate the values rather than the keys: each value is read as a
        # dict with a ``count`` entry.
        for item in resp.values():
            # No ``selector`` is passed: the value comes from parsed JSON via
            # add_value, so there is nothing for XPath/CSS extraction to use.
            loader = ItemLoader(DepopItem())
            loader.add_value('brands', item['count'])
            yield loader.load_item()
Output:
{'brands': 888}
{'brands': 1}
{'brands': 52}
{'brands': 138}
{'brands': 148}
...
...
...
CodePudding user response:
I'm not sure about Scrapy, but you could simply do this with requests:
import requests
import json
from itertools import starmap
from requests.models import Response
from typing import Dict, List
# The query string must contain "&currency=GBP"; the "&curren" prefix had been
# decoded as the HTML entity for the currency sign, corrupting the URL.
url = "https://webapi.depop.com/api/v2/search/filters/aggregates/?brands=1596&itemsPerPage=24&country=gb&currency=GBP&sort=relevance"
# A timeout keeps the script from hanging forever on an unresponsive server.
resp: Response = requests.get(url, timeout=10)
# Fail loudly on an HTTP error instead of trying to parse an error body.
resp.raise_for_status()
# Fall back to an empty mapping so a payload without "brands" gives an empty list.
data: Dict = json.loads(resp.text).get("brands") or {}
# One {"brands": count} record per brand entry; the keys are not needed.
values: List[Dict] = [{"brands": entry["count"]} for entry in data.values()]
OUTPUT:
[{'brands': 989},
{'brands': 1838},
{'brands': 2415},
{'brands': 1344},
...]