Why is my spider crawling 700 items when there are actually only 245?

There are not more than 245 items, but my spider scrapes around 700, even though I use a for loop to crawl only selected pages and set CLOSESPIDER_ITEMCOUNT = 244 in settings.py.

Is there any possible solution?

Here is my code:

import scrapy
import json
from ..items import HmsItem
from scrapy.loader import ItemLoader


class HmSpider(scrapy.Spider):
    name = 'hm'
    allowed_domains = ['hm.com']
    
    def start_requests(self):
        for i in range(36,252, 36): #there is a diff of 36 on each next url
            yield scrapy.Request(
                url = f"https://www2.hm.com/en_us/men/new-arrivals/view-all/_jcr_content/main/productlisting.display.json?sort=stock&image-size=small&image=model&offset=0&page-size={i}",
                method='GET',
                callback= self.parse
            )

    def parse(self, response):
        # with open('initial.json', 'wb') as f:
        #     f.write(response.body)
        json_resp = json.loads(response.body)
        products = json_resp.get('products')
        for product in products:
            loader = ItemLoader(item=HmsItem())
            title = loader.add_value('title', product.get('title'))
            articleCode = loader.add_value('articleCode', product.get('articleCode'))
            category = loader.add_value('category', product.get('category'))
            src = loader.add_value('src', product.get('image')[0].get('src'))
            price = loader.add_value('price', product.get('price'))
            swatchesTotal = loader.add_value('swatchesTotal', product.get('swatchesTotal'))
            brandName = loader.add_value('brandName', product.get('brandName'))
            yield loader.load_item()

Here is my settings.py:

BOT_NAME = 'hms'
SPIDER_MODULES = ['hms.spiders']
NEWSPIDER_MODULE = 'hms.spiders'
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36"
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 8
FEED = 'json'
FEED_EXPORT_ENCODING = 'utf-8'
CLOSESPIDER_ITEMCOUNT = 244
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_TARGET_CONCURRENCY = 4.0

Here is my items.py:

import scrapy

class HmsItem(scrapy.Item):
    title = scrapy.Field()
    articleCode = scrapy.Field()
    category = scrapy.Field()
    src = scrapy.Field()
    price = scrapy.Field()
    swatchesTotal = scrapy.Field()
    brandName = scrapy.Field()

CodePudding user response:

The problem is not that Scrapy cannot handle query-string parameters; it is the way the pagination URLs are built. Every request in the loop keeps offset=0 and only increases page-size (36, 72, 108, 144, 180, 216), so each request returns the catalogue from the very beginning again and the same products are scraped over and over (36 + 72 + ... + 216 = 756 items across the requests), which is where the duplicates come from. CLOSESPIDER_ITEMCOUNT = 244 is not a hard cap either: it only asks Scrapy to close the spider once the count is exceeded, and responses that are already in flight are still processed, so the final count overshoots 244.

If you want to keep paginating, keep page-size fixed and advance offset so the requests never overlap (a sketch of that approach follows below). The simpler way to get the correct output, though, is to fetch everything in a single request by putting the total item count (243) into the page-size= parameter of the URL.
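
A minimal sketch of that fixed-page-size/offset approach, reusing the project's HmsItem. It assumes the endpoint honours its offset parameter in the obvious way (not verified against the real API); the spider name and the constants are illustrative.

import scrapy
import json
from ..items import HmsItem
from scrapy.loader import ItemLoader

PAGE_SIZE = 36      # items per request
TOTAL_ITEMS = 243   # current catalogue size, taken from this answer

class HmPaginatedSpider(scrapy.Spider):
    name = 'hm_paginated'

    def start_requests(self):
        # keep the page size constant and move the offset forward,
        # so the requested slices are disjoint and no duplicates appear
        base = ("https://www2.hm.com/en_us/men/new-arrivals/view-all/_jcr_content/"
                "main/productlisting.display.json?sort=stock&image-size=small"
                "&image=model&offset={offset}&page-size={size}")
        for offset in range(0, TOTAL_ITEMS, PAGE_SIZE):
            yield scrapy.Request(
                url=base.format(offset=offset, size=PAGE_SIZE),
                callback=self.parse,
            )

    def parse(self, response):
        # same parsing as the original spider, shortened to two fields here
        for product in json.loads(response.body).get('products', []):
            loader = ItemLoader(item=HmsItem())
            loader.add_value('title', product.get('title'))
            loader.add_value('articleCode', product.get('articleCode'))
            yield loader.load_item()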

Working code:

import scrapy
import json
from ..items import HmsItem
from scrapy.loader import ItemLoader


class HmSpider(scrapy.Spider):
    name = 'hm'
    #allowed_domains = ['hm.com']

    def start_requests(self):
        u="https://www2.hm.com/en_us/men/new-arrivals/view-all/_jcr_content/main/productlisting.display.json?sort=stock&image-size=small&image=model&offset=0&page-size=243"
        
        yield scrapy.Request(
            url = u,
            method='GET',
            callback=self.parse
            )

    def parse(self, response):
        # with open('initial.json', 'wb') as f:
        #     f.write(response.body)
        json_resp = json.loads(response.body)
        products = json_resp.get('products')
        for product in products:
            loader = ItemLoader(item=HmsItem())
            # the return value of add_value isn't used, so the earlier
            # "title = ..." style assignments are dropped
            loader.add_value('title', product.get('title'))
            loader.add_value('articleCode', product.get('articleCode'))
            loader.add_value('category', product.get('category'))
            loader.add_value('src', product.get('image')[0].get('src'))
            loader.add_value('price', product.get('price'))
            loader.add_value('swatchesTotal', product.get('swatchesTotal'))
            loader.add_value('brandName', product.get('brandName'))
            yield loader.load_item()

Output:

'title': ['Reflective Running Gloves']}
2022-09-23 18:36:09 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www2.hm.com/en_us/men/new-arrivals/view-all/_jcr_content/main/productlisting.display.json?sort=stock&image-size=small&image=model&offset=0&page-size=243>    
{'articleCode': ['1104789002'],
 'brandName': ['H&M'],
 'category': ['men_sport_bottoms_leggingstights'],
 'price': ['$ 49.99'],
 'src': ['//lp2.hm.com/hmgoepprod?set=source[/40/a6/40a6eddd4f5cfa2f899b56c796022baeb23fe18f.jpg],origin[dam],category[],type[DESCRIPTIVESTILLLIFE],res[m],hmver[2]&call=url[file:/product/style]'],
 'swatchesTotal': ['2'],
 'title': ['Running Tights']}
2022-09-23 18:36:09 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www2.hm.com/en_us/men/new-arrivals/view-all/_jcr_content/main/productlisting.display.json?sort=stock&image-size=small&image=model&offset=0&page-size=243>    
{'articleCode': ['1104789001'],
 'brandName': ['H&M'],
 'category': ['men_sport_bottoms_leggingstights'],
 'price': ['$ 49.99'],
 'src': ['//lp2.hm.com/hmgoepprod?set=source[/c5/69/c569b49c77761dc8e2690b476dbd38662e2157f9.jpg],origin[dam],category[],type[LOOKBOOK],res[m],hmver[1]&call=url[file:/product/style]'],
 'swatchesTotal': ['2'],
 'title': ['Running Tights']}
Finished Request
2022-09-23 18:36:11 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 431,
 'downloader/request_count': 1,
 'downloader/request_method_count/GET': 1,
 'downloader/response_bytes': 47993,
 'downloader/response_count': 1,
 'downloader/response_status_count/200': 1,
 'elapsed_time_seconds': 4.229539,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2022, 9, 23, 12, 36, 9, 21036),
 'httpcompression/response_bytes': 477132,
 'httpcompression/response_count': 1,
 'item_scraped_count': 243,
 'log_count/DEBUG': 258,
 'log_count/INFO': 10,
 'response_received_count': 1,
 'scheduler/dequeued': 1,
 'scheduler/dequeued/memory': 1,
 'scheduler/enqueued': 1,
 'scheduler/enqueued/memory': 1,
 'start_time': datetime.datetime(2022, 9, 23, 12, 36, 4, 791497)}
2022-09-23 18:36:11 [scrapy.core.engine] INFO: Spider closed (finished)
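
A possible refinement, not part of the answer above: page-size=243 is hard-coded, so the number goes stale as soon as H&M adds or removes products. If the listing JSON exposes an overall product count, the spider could read it from a tiny probe request and then fetch the full list in a second request. The field name 'total' below is only a guess and has to be checked against the real payload; the class and spider names are illustrative.

import scrapy
import json
from ..items import HmsItem
from scrapy.loader import ItemLoader

BASE = ("https://www2.hm.com/en_us/men/new-arrivals/view-all/_jcr_content/"
        "main/productlisting.display.json?sort=stock&image-size=small"
        "&image=model&offset=0&page-size={size}")

class HmTwoStepSpider(scrapy.Spider):
    name = 'hm_two_step'

    def start_requests(self):
        # probe request for a single item, only to learn the total count
        yield scrapy.Request(url=BASE.format(size=1), callback=self.parse_count)

    def parse_count(self, response):
        data = json.loads(response.body)
        # 'total' is an assumed key -- inspect the JSON and use whatever
        # attribute actually carries the overall number of products
        total = data.get('total')
        if total:
            yield scrapy.Request(url=BASE.format(size=total), callback=self.parse)

    def parse(self, response):
        for product in json.loads(response.body).get('products', []):
            loader = ItemLoader(item=HmsItem())
            loader.add_value('title', product.get('title'))
            loader.add_value('articleCode', product.get('articleCode'))
            yield loader.load_item()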