I am trying to scrape all cars from the website www.webuycars.co.za.
I am using Scrapy to do this, and each page has 24 vehicles that I want to send to a JSON file. Looking at the data, it seems that I am only scraping the first page, or else overwriting the variable used to create the JSON file.
import json
import scrapy
from scrapy.crawler import CrawlerProcess


class carSpider(scrapy.Spider):
    name = 'car'
    body = {"to":24,"size":24,"type":"All","filter_type":"all","subcategory":None,"q":"","Make":None,"Roadworthy":None,"Auctions":[],"Model":None,"Variant":None,"DealerKey":None,"FuelType":None,"BodyType":None,"Gearbox":None,"AxleConfiguration":None,"Colour":None,"FinanceGrade":None,"Priced_Amount_Gte":0,"Priced_Amount_Lte":0,"MonthlyInstallment_Amount_Gte":0,"MonthlyInstallment_Amount_Lte":0,"auctionDate":None,"auctionEndDate":None,"auctionDurationInSeconds":None,"Kilometers_Gte":0,"Kilometers_Lte":0,"Priced_Amount_Sort":"","Bid_Amount_Sort":"","Kilometers_Sort":"","Year_Sort":"","Auction_Date_Sort":"","Auction_Lot_Sort":"","Year":[],"Price_Update_Date_Sort":"","Online_Auction_Date_Sort":"","Online_Auction_In_Progress":""}

    def start_requests(self):
        yield scrapy.Request(
            url='https://website-elastic-api.webuycars.co.za/api/search',
            callback=self.parse,
            body=json.dumps(self.body),
            method="POST",
            headers={
                "content-type": "application/json",
                "User-Agent": "mozilla/5.0"
            }
        )

    def parse(self, response):
        response = json.loads(response.body)
        cars = []
        filename = "webuycar.json"
        for item in range(0, 6528, 24):
            response['total']['value'] = item
            cars.append(response['data'])
        with open(filename, "w") as f:
            json.dump(cars, f, indent=4)
        for resp in response['data']:
            yield {
                'Title': resp['OnlineDescription']
            }


# Code that runs the spider
process = CrawlerProcess()
process.crawl(carSpider)
process.start()
I would like to fix this because it undermines the accuracy of the database I have built and fills it with redundant data.
I have inspected the JSON file to check whether the problem was in the extraction; it seems the scraper itself is at fault. I would appreciate some thoughts on this.
CodePudding user response:
You shouldn't dump the data to a file from inside the parse method. Either pass the output file on the command line (from inside a Scrapy project, something like scrapy crawl car -O webuycar.json), or, since you are running the spider as a script, configure feed exports through the FEEDS setting.
Like this:
import json
import scrapy
from scrapy.crawler import CrawlerProcess


class carSpider(scrapy.Spider):
    name = 'car'
    body = {"to":24,"size":24,"type":"All","filter_type":"all","subcategory":None,"q":"","Make":None,"Roadworthy":None,"Auctions":[],"Model":None,"Variant":None,"DealerKey":None,"FuelType":None,"BodyType":None,"Gearbox":None,"AxleConfiguration":None,"Colour":None,"FinanceGrade":None,"Priced_Amount_Gte":0,"Priced_Amount_Lte":0,"MonthlyInstallment_Amount_Gte":0,"MonthlyInstallment_Amount_Lte":0,"auctionDate":None,"auctionEndDate":None,"auctionDurationInSeconds":None,"Kilometers_Gte":0,"Kilometers_Lte":0,"Priced_Amount_Sort":"","Bid_Amount_Sort":"","Kilometers_Sort":"","Year_Sort":"","Auction_Date_Sort":"","Auction_Lot_Sort":"","Year":[],"Price_Update_Date_Sort":"","Online_Auction_Date_Sort":"","Online_Auction_In_Progress":""}
    custom_settings = {
        "FEEDS": {
            "webuycar.json": {
                'format': 'json',
                'encoding': 'utf8',
                'store_empty': False,
                'indent': 4
            }
        }
    }

    def start_requests(self):
        yield scrapy.Request(
            url='https://website-elastic-api.webuycars.co.za/api/search',
            callback=self.parse,
            body=json.dumps(self.body),
            method="POST",
            headers={
                "content-type": "application/json",
                "User-Agent": "mozilla/5.0"
            }
        )

    def parse(self, response):
        data = response.json()
        for item in range(0, 6528, 24):
            data['total']['value'] = item
            yield data
        for item in data['data']:
            yield {'Title': item['OnlineDescription']}


# Code that runs the spider
process = CrawlerProcess()
process.crawl(carSpider)
process.start()
I'm not totally sure this solves your problem, because you are still scraping a single URL, but it should at least stop the file from being overwritten. Be warned, though: the parse method above still yields the full response dict once for every step of that range, which is why, when I tested it, the output JSON file ended up 7143882 lines long.
Update:
After taking a closer look at your code, I think this is closer to what you are actually trying to achieve. It makes many calls to the API, stepping the "to" field forward by 24 each time, and extracts all 24 OnlineDescription fields from each response.
import json
import scrapy
from scrapy.crawler import CrawlerProcess


class carSpider(scrapy.Spider):
    name = 'car'
    body = {"to":24,"size":24,"type":"All","filter_type":"all","subcategory":None,"q":"","Make":None,"Roadworthy":None,"Auctions":[],"Model":None,"Variant":None,"DealerKey":None,"FuelType":None,"BodyType":None,"Gearbox":None,"AxleConfiguration":None,"Colour":None,"FinanceGrade":None,"Priced_Amount_Gte":0,"Priced_Amount_Lte":0,"MonthlyInstallment_Amount_Gte":0,"MonthlyInstallment_Amount_Lte":0,"auctionDate":None,"auctionEndDate":None,"auctionDurationInSeconds":None,"Kilometers_Gte":0,"Kilometers_Lte":0,"Priced_Amount_Sort":"","Bid_Amount_Sort":"","Kilometers_Sort":"","Year_Sort":"","Auction_Date_Sort":"","Auction_Lot_Sort":"","Year":[],"Price_Update_Date_Sort":"","Online_Auction_Date_Sort":"","Online_Auction_In_Progress":""}
    custom_settings = {
        "FEEDS": {
            "webuycar.json": {
                'format': 'json',
                'encoding': 'utf8',
                'store_empty': False,
                'indent': 4
            }
        }
    }

    def start_requests(self):
        # Step the "to" field through the catalogue, one request per 24 listings.
        for i in range(24, 6528, 24):
            self.body["to"] = i
            yield scrapy.Request(
                url='https://website-elastic-api.webuycars.co.za/api/search',
                callback=self.parse,
                body=json.dumps(self.body),
                method="POST",
                headers={
                    "content-type": "application/json",
                    "User-Agent": "mozilla/5.0"
                }
            )

    def parse(self, response):
        # Each response carries one page of listings under "data".
        data = response.json()
        for item in data['data']:
            yield {"Title": item['OnlineDescription']}


# Code that runs the spider
process = CrawlerProcess()
process.crawl(carSpider)
process.start()
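The update still hard-codes the 6528 upper bound in range(24, 6528, 24), which will drift as the site's inventory changes. A possible refinement, purely a sketch resting on two assumptions, namely that total['value'] in the response really is the overall listing count (as the original parse method implies) and that "to" behaves as the offset the update treats it as, is to probe the API once, read the count, and paginate from there, reusing the carSpider class from the update:

class carSpiderDynamic(carSpider):
    # Same body, headers, and FEEDS settings as carSpider above;
    # only the paging logic changes.

    def start_requests(self):
        # Probe the API once with the default body (to=24) to learn the total.
        yield scrapy.Request(
            url='https://website-elastic-api.webuycars.co.za/api/search',
            callback=self.parse_first,
            body=json.dumps(self.body),
            method="POST",
            headers={"content-type": "application/json", "User-Agent": "mozilla/5.0"}
        )

    def parse_first(self, response):
        data = response.json()
        total = data['total']['value']  # assumption: overall number of listings
        # Emit the first page's items...
        for item in data['data']:
            yield {"Title": item['OnlineDescription']}
        # ...then request the remaining pages, 24 listings at a time,
        # handing each one to the inherited parse method.
        for offset in range(48, total + 24, 24):
            page_body = dict(self.body, to=offset)
            yield scrapy.Request(
                url='https://website-elastic-api.webuycars.co.za/api/search',
                callback=self.parse,
                body=json.dumps(page_body),
                method="POST",
                headers={"content-type": "application/json", "User-Agent": "mozilla/5.0"}
            )

Run it with process.crawl(carSpiderDynamic) instead of process.crawl(carSpider). Whether "to" is an offset or an inclusive upper bound is worth confirming against a real response; the point is only to avoid the fixed 6528.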