Duplicated data from a JSON API scraper


I have this script:

import scrapy
from scrapy.crawler import CrawlerProcess
from datetime import datetime
import os
    
    
if os.path.exists('jfs_hombre.csv'):
    os.remove('jfs_hombre.csv')
    print("The file has been deleted successfully")
else:
    print("The file does not exist!")
    
    
class JfsSpider_hombre(scrapy.Spider):
    name = 'jfs_hombre'

    #start_urls = ["https://www.justforsport.com.ar/hombre?page=1"]

    custom_settings = {"FEEDS": {'jfs_hombre.csv': {'format': 'csv'}}} 

    def start_requests(self):
    
        yield scrapy.Request(
            url='https://www.justforsport.com.ar/_v/segment/graphql/v1?workspace=master&maxAge=short&appsEtag=remove&domain=store&locale=es-AR&__bindingId=e841e6ce-1216-4569-a2ad-0188ba5a92fc&operationName=productSearchV3&variables={}&extensions={"persistedQuery":{"version":1,"sha256Hash":"6869499be99f20964918e2fe0d1166fdf6c006b1766085db9e5a6bc7c4b957e5","sender":"[email protected]","provider":"[email protected]"},"variables":"eyJoaWRlVW5hdmFpbGFibGVJdGVtcyI6ZmFsc2UsInNrdXNGaWx0ZXIiOiJGSVJTVF9BVkFJTEFCTEUiLCJzaW11bGF0aW9uQmVoYXZpb3IiOiJkZWZhdWx0IiwiaW5zdGFsbG1lbnRDcml0ZXJpYSI6Ik1BWF9XSVRIT1VUX0lOVEVSRVNUIiwicHJvZHVjdE9yaWdpblZ0ZXgiOmZhbHNlLCJtYXAiOiJjIiwicXVlcnkiOiJob21icmUiLCJvcmRlckJ5IjoiT3JkZXJCeVJlbGVhc2VEYXRlREVTQyIsImZyb20iOjY0LCJ0byI6OTUsInNlbGVjdGVkRmFjZXRzIjpbeyJrZXkiOiJjIiwidmFsdWUiOiJob21icmUifV0sIm9wZXJhdG9yIjoiYW5kIiwiZnV6enkiOiIwIiwic2VhcmNoU3RhdGUiOm51bGwsImZhY2V0c0JlaGF2aW9yIjoiU3RhdGljIiwiY2F0ZWdvcnlUcmVlQmVoYXZpb3IiOiJkZWZhdWx0Iiwid2l0aEZhY2V0cyI6ZmFsc2V9"}',
            callback=self.parse,
            method="GET"
        )
    
    def parse(self, response):
        resp = response.json()
        #print(resp)
        for item in range(0,576,32):
            resp['recordsFiltered']=item
           
            for result  in resp['data']['productSearch']['products']:
                yield {
                    'Casa':'Just_For_Sports',
                    'Sku' :result['productReference'],
                    'Name': result['productName'],
                    'precio': result['priceRange']['sellingPrice']['highPrice'],
                    'Link': 'https://www.justforsport.com.ar' + result['link'],
                    'Date':datetime.today().strftime('%Y-%m-%d')
                    }
    
    
  

if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(JfsSpider_hombre)
    process.start()

It works fine and gets 576 rows, but the problem is that they are duplicated. When I drop the duplicates I get only 32 unique values, so I think I'm getting values from only one page (32 products per page). How could I iterate through all the elements? I think it has something to do with this line:

for item in range(0,576,32):

Thanks in advance
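
For reference, the variables query parameter in the request URL above is base64-encoded JSON. A minimal decode sketch (standard library only) shows that the request is pinned to a single fixed window of 32 products, which matches the 32 unique values:

import base64
import json

# The base64 payload from the "variables" field of the request URL above, copied verbatim
BLOB = "eyJoaWRlVW5hdmFpbGFibGVJdGVtcyI6ZmFsc2UsInNrdXNGaWx0ZXIiOiJGSVJTVF9BVkFJTEFCTEUiLCJzaW11bGF0aW9uQmVoYXZpb3IiOiJkZWZhdWx0IiwiaW5zdGFsbG1lbnRDcml0ZXJpYSI6Ik1BWF9XSVRIT1VUX0lOVEVSRVNUIiwicHJvZHVjdE9yaWdpblZ0ZXgiOmZhbHNlLCJtYXAiOiJjIiwicXVlcnkiOiJob21icmUiLCJvcmRlckJ5IjoiT3JkZXJCeVJlbGVhc2VEYXRlREVTQyIsImZyb20iOjY0LCJ0byI6OTUsInNlbGVjdGVkRmFjZXRzIjpbeyJrZXkiOiJjIiwidmFsdWUiOiJob21icmUifV0sIm9wZXJhdG9yIjoiYW5kIiwiZnV6enkiOiIwIiwic2VhcmNoU3RhdGUiOm51bGwsImZhY2V0c0JlaGF2aW9yIjoiU3RhdGljIiwiY2F0ZWdvcnlUcmVlQmVoYXZpb3IiOiJkZWZhdWx0Iiwid2l0aEZhY2V0cyI6ZmFsc2V9"

# Pad in case the blob is unpadded, then decode
variables = json.loads(base64.b64decode(BLOB + "=" * (-len(BLOB) % 4)))
print(variables["from"], variables["to"])  # -> 64 95: one fixed page of 32 products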

CodePudding user response:

You are using 'Casa': 'Just_For_Sports', which is not correct; at best it would be result['Just_For_Sports'], but the more important question is where 'Just_For_Sports' comes from. I didn't find it in the product list, and you shouldn't yield keys that don't exist in the products. The same goes for 'Date': datetime.today().strftime('%Y-%m-%d'); you won't find that key in the product list either, so both are commented out below. The request now also sends a content-type: application/json header, sets dont_filter=True, and updates recordsFiltered under resp['data']['productSearch'] instead of at the top level. Now you can check whether duplicated values still exist.

import scrapy
from scrapy.crawler import CrawlerProcess
from datetime import datetime
import os
    
    
if os.path.exists('jfs_hombre.csv'):
    os.remove('jfs_hombre.csv')
    print("The file has been deleted successfully")
else:
    print("The file does not exist!")
    
    
class JfsSpider_hombre(scrapy.Spider):
    name = 'jfs_hombre'

    #start_urls = ["https://www.justforsport.com.ar/hombre?page=1"]

    custom_settings = {"FEEDS": {'jfs_hombre.csv': {'format': 'csv'}}} 

    def start_requests(self):
        headers = {"content-type": "application/json"}
        yield scrapy.Request(
            url='https://www.justforsport.com.ar/_v/segment/graphql/v1?workspace=master&maxAge=short&appsEtag=remove&domain=store&locale=es-AR&__bindingId=e841e6ce-1216-4569-a2ad-0188ba5a92fc&operationName=productSearchV3&variables={}&extensions={"persistedQuery":{"version":1,"sha256Hash":"6869499be99f20964918e2fe0d1166fdf6c006b1766085db9e5a6bc7c4b957e5","sender":"[email protected]","provider":"[email protected]"},"variables":"eyJoaWRlVW5hdmFpbGFibGVJdGVtcyI6ZmFsc2UsInNrdXNGaWx0ZXIiOiJGSVJTVF9BVkFJTEFCTEUiLCJzaW11bGF0aW9uQmVoYXZpb3IiOiJkZWZhdWx0IiwiaW5zdGFsbG1lbnRDcml0ZXJpYSI6Ik1BWF9XSVRIT1VUX0lOVEVSRVNUIiwicHJvZHVjdE9yaWdpblZ0ZXgiOmZhbHNlLCJtYXAiOiJjIiwicXVlcnkiOiJob21icmUiLCJvcmRlckJ5IjoiT3JkZXJCeVJlbGVhc2VEYXRlREVTQyIsImZyb20iOjY0LCJ0byI6OTUsInNlbGVjdGVkRmFjZXRzIjpbeyJrZXkiOiJjIiwidmFsdWUiOiJob21icmUifV0sIm9wZXJhdG9yIjoiYW5kIiwiZnV6enkiOiIwIiwic2VhcmNoU3RhdGUiOm51bGwsImZhY2V0c0JlaGF2aW9yIjoiU3RhdGljIiwiY2F0ZWdvcnlUcmVlQmVoYXZpb3IiOiJkZWZhdWx0Iiwid2l0aEZhY2V0cyI6ZmFsc2V9"}',
            callback=self.parse,
            method="GET",
            headers=headers,
            dont_filter=True
        )
    
    def parse(self, response):
        resp = response.json()
        #print(resp)
        for item in range(0,576,32):
            resp['data']['productSearch']['recordsFiltered']=item
           
            for result  in resp['data']['productSearch']['products']:
                yield {
                    #'Casa':'Just_For_Sports',
                    'Sku' :result['productReference'],
                    'Name': result['productName'],
                    'precio': result['priceRange']['sellingPrice']['highPrice'],
                    'Link': 'https://www.justforsport.com.ar' + result['link'],
                    # 'Date':datetime.today().strftime('%Y-%m-%d')
                    }
    
    
if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(JfsSpider_hombre)
    process.start()

You can prove it with a set() (a sketch for inspecting the set follows the output below):

import scrapy
from scrapy.crawler import CrawlerProcess
from datetime import datetime
import os
    
    
if os.path.exists('jfs_hombre.csv'):
    os.remove('jfs_hombre.csv')
    print("The file has been deleted successfully")
else:
    print("The file does not exist!")
    
 
class JfsSpider_hombre(scrapy.Spider):
    name = 'jfs_hombre'
    unique_data = set() 

    #start_urls = ["https://www.justforsport.com.ar/hombre?page=1"]

    custom_settings = {"FEEDS": {'jfs_hombre.csv': {'format': 'csv'}}} 

    def start_requests(self):
        headers = {"content-type": "application/json"}
        yield scrapy.Request(
            url='https://www.justforsport.com.ar/_v/segment/graphql/v1?workspace=master&maxAge=short&appsEtag=remove&domain=store&locale=es-AR&__bindingId=e841e6ce-1216-4569-a2ad-0188ba5a92fc&operationName=productSearchV3&variables={}&extensions={"persistedQuery":{"version":1,"sha256Hash":"6869499be99f20964918e2fe0d1166fdf6c006b1766085db9e5a6bc7c4b957e5","sender":"[email protected]","provider":"[email protected]"},"variables":"eyJoaWRlVW5hdmFpbGFibGVJdGVtcyI6ZmFsc2UsInNrdXNGaWx0ZXIiOiJGSVJTVF9BVkFJTEFCTEUiLCJzaW11bGF0aW9uQmVoYXZpb3IiOiJkZWZhdWx0IiwiaW5zdGFsbG1lbnRDcml0ZXJpYSI6Ik1BWF9XSVRIT1VUX0lOVEVSRVNUIiwicHJvZHVjdE9yaWdpblZ0ZXgiOmZhbHNlLCJtYXAiOiJjIiwicXVlcnkiOiJob21icmUiLCJvcmRlckJ5IjoiT3JkZXJCeVJlbGVhc2VEYXRlREVTQyIsImZyb20iOjY0LCJ0byI6OTUsInNlbGVjdGVkRmFjZXRzIjpbeyJrZXkiOiJjIiwidmFsdWUiOiJob21icmUifV0sIm9wZXJhdG9yIjoiYW5kIiwiZnV6enkiOiIwIiwic2VhcmNoU3RhdGUiOm51bGwsImZhY2V0c0JlaGF2aW9yIjoiU3RhdGljIiwiY2F0ZWdvcnlUcmVlQmVoYXZpb3IiOiJkZWZhdWx0Iiwid2l0aEZhY2V0cyI6ZmFsc2V9"}',
            callback=self.parse,
            method="GET",
            headers=headers,
            dont_filter=True
        )
    
    def parse(self, response):
        resp = response.json()
        #print(resp)
        for item in range(0,576,32):
            resp['data']['productSearch']['recordsFiltered']=item
           
            for result  in resp['data']['productSearch']['products']:
                s=result['productReference']
                self.unique_data.add(s)
                yield {
                    #'Casa':'Just_For_Sports',
                    'Sku' :s,
                    'Name': result['productName'],
                    'precio': result['priceRange']['sellingPrice']['highPrice'],
                    'Link': 'https://www.justforsport.com.ar' + result['link'],
                    # 'Date':datetime.today().strftime('%Y-%m-%d')
                    }
    
    
if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(JfsSpider_hombre)
    process.start()

Output:

'item_scraped_count': 576,
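
Note that unique_data is filled but never reported. A small, hypothetical addition to the spider above would log the set's size when the crawl finishes (Scrapy calls a spider's closed() method on shutdown), making it easy to compare against item_scraped_count:

    def closed(self, reason):
        # If this reports 32 while item_scraped_count is 576, the rows really are duplicated
        self.logger.info("unique SKUs collected: %d", len(self.unique_data))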
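Finally, a note on the root cause: for item in range(0,576,32) only re-reads the same 32-product response, and assigning to resp['data']['productSearch']['recordsFiltered'] mutates a local dict; it never triggers a new HTTP request. The usual fix is to issue one request per 32-product window by re-encoding the variables blob with a different from/to pair. The sketch below assumes the endpoint accepts such a re-encoded blob; the spider name is made up, and the hash, binding id, and sender/provider values are copied verbatim from the URL in the question (sender/provider appear there redacted as "[email protected]"):

import base64
import json
from urllib.parse import urlencode

import scrapy
from scrapy.crawler import CrawlerProcess


class JfsPaginated(scrapy.Spider):
    name = 'jfs_hombre_paginated'  # hypothetical name for this sketch
    custom_settings = {"FEEDS": {'jfs_hombre_paginated.csv': {'format': 'csv'}}}

    base_url = 'https://www.justforsport.com.ar/_v/segment/graphql/v1'

    # Decoded "variables" from the original request; from/to are overwritten per page
    base_variables = {
        "hideUnavailableItems": False, "skusFilter": "FIRST_AVAILABLE",
        "simulationBehavior": "default", "installmentCriteria": "MAX_WITHOUT_INTEREST",
        "productOriginVtex": False, "map": "c", "query": "hombre",
        "orderBy": "OrderByReleaseDateDESC", "from": 0, "to": 31,
        "selectedFacets": [{"key": "c", "value": "hombre"}], "operator": "and",
        "fuzzy": "0", "searchState": None, "facetsBehavior": "Static",
        "categoryTreeBehavior": "default", "withFacets": False,
    }

    def start_requests(self):
        for offset in range(0, 576, 32):  # one request per 32-product window
            variables = {**self.base_variables, "from": offset, "to": offset + 31}
            blob = base64.b64encode(json.dumps(variables).encode()).decode()
            extensions = {
                "persistedQuery": {
                    "version": 1,
                    "sha256Hash": "6869499be99f20964918e2fe0d1166fdf6c006b1766085db9e5a6bc7c4b957e5",
                    # sender/provider copied verbatim from the question's URL
                    "sender": "[email protected]",
                    "provider": "[email protected]",
                },
                "variables": blob,
            }
            params = {
                "workspace": "master", "maxAge": "short", "appsEtag": "remove",
                "domain": "store", "locale": "es-AR",
                "__bindingId": "e841e6ce-1216-4569-a2ad-0188ba5a92fc",
                "operationName": "productSearchV3", "variables": "{}",
                "extensions": json.dumps(extensions),
            }
            yield scrapy.Request(
                url=f"{self.base_url}?{urlencode(params)}",
                headers={"content-type": "application/json"},
                callback=self.parse,
            )

    def parse(self, response):
        for result in response.json()['data']['productSearch']['products']:
            yield {
                'Sku': result['productReference'],
                'Name': result['productName'],
                'precio': result['priceRange']['sellingPrice']['highPrice'],
                'Link': 'https://www.justforsport.com.ar' + result['link'],
            }


if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(JfsPaginated)
    process.start()

Each response then contributes its own 32 products, so parse() needs no range loop and no deduplication.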