Home > OS >  Valid json pipeline Scrapy
Valid json pipeline Scrapy

Time:11-06

I output the scraped data in JSON format. My custom Scrapy pipeline writes out a list of dictionaries, but the result is not valid JSON. The scraped items look like this:

[{
    "product_id": "11980174",
    "brand_id": 25354,
    "brand_name": "Gucci",
    "title": "beige and brown Dionysus GG Supreme mini canvas shoulder bag",
    "slug": "/shopping/gucci-beige-and-brown-dionysus-gg-supreme-mini-canvas-shoulder-bag-11980174"
},
{
    "product_id": "17070807",
    "brand_id": 1168391,
    "brand_name": "Jonathan Adler",
    "title": "Clear acrylic chess set",
    "slug": "/shopping/jonathan-adler-clear-acrylic-chess-set-17070807"
},
{
    "product_id": "17022890",
    "brand_id": 3543122,
    "brand_name": "Anissa Kermiche",
    "title": "pink, green and red Mini Jugs Jug earthenware vase set",
    "slug": "/shopping/anissa-kermiche-pink-green-and-red-mini-jugs-jug-earthenware-vase-set-17022890"
},]

But I want to export the data in a valid json format:

[{
    "product_id": "11980174",
    "brand_id": 25354,
    "brand_name": "Gucci",
    "title": "beige and brown Dionysus GG Supreme mini canvas shoulder bag",
    "slug": "/shopping/gucci-beige-and-brown-dionysus-gg-supreme-mini-canvas-shoulder-bag-11980174"
},
{
    "product_id": "17070807",
    "brand_id": 1168391,
    "brand_name": "Jonathan Adler",
    "title": "Clear acrylic chess set",
    "slug": "/shopping/jonathan-adler-clear-acrylic-chess-set-17070807"
},
{
    "product_id": "17022890",
    "brand_id": 3543122,
    "brand_name": "Anissa Kermiche",
    "title": "pink, green and red Mini Jugs Jug earthenware vase set",
    "slug": "/shopping/anissa-kermiche-pink-green-and-red-mini-jugs-jug-earthenware-vase-set-17022890"
}]

I need to remove the trailing comma after the last JSON object so the output becomes valid JSON.

Here is my custom scrapy json pipeline:

from scrapy import signals
import boto3
from scrapy.utils.project import get_project_settings
import time
import json


class JsonWriterPipeline(object):
    """Stream scraped items into ``<spider>_items.json`` as a valid JSON array,
    then upload the finished file to an S3 bucket when the spider closes.

    The trailing-comma problem is avoided by writing the ``",\\n"`` separator
    *before* every item except the first, so the last item is never followed
    by a comma and the file is valid JSON once ``]`` is appended.
    """

    def __init__(self):
        # Timestamped key path used in the S3 object name,
        # e.g. "2021/2021_11/2021.11.06/2021.11.06".
        self.spider_time = f'{time.strftime("%Y/%G_%m/%Y.%m.%d/%Y.%m.%d")}'
        # True until the first item has been written; controls comma placement.
        self.first_item = True

    @classmethod
    def from_crawler(cls, crawler):
        """Scrapy factory hook: build the pipeline and connect spider signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # Open the output file and emit the opening bracket of the JSON array.
        self.file = open("%s_items.json" % spider.name, "w")
        self.file.write("[")
        self.first_item = True

    def process_item(self, item, spider):
        """Serialize one item; prefix a comma for every item after the first."""
        if self.first_item:
            self.first_item = False
        else:
            self.file.write(",\n")
        self.file.write(json.dumps(dict(item), indent=4))
        return item

    def spider_closed(self, spider):
        """Close the JSON array, then upload the file to the configured bucket."""
        self.file.write("]")
        self.file.close()
        settings = get_project_settings()
        my_session = boto3.session.Session()
        s3 = my_session.resource(
            "s3",
            endpoint_url=settings.get("AWS_ENDPOINT_URL"),
            aws_access_key_id=settings.get("AWS_ACCESS_KEY_ID"),
            aws_secret_access_key=settings.get("AWS_SECRET_ACCESS_KEY"),
        )
        boto_test_bucket = s3.Bucket(settings.get("AWS_STORAGE_BUCKET_NAME"))
        boto_test_bucket.upload_file(
            "%s_items.json" % spider.name,
            f"brownsfashion-feeds/{spider.name}_{self.spider_time}.json",
        )

Please advise me of any solutions. Thank you.

CodePudding user response:

Don't assemble the JSON text by hand — writing the separators yourself is what leaves the trailing comma. Serialize the whole list in a single call to json.dumps() from the json module, which produces valid JSON every time. Like this:

import json


# Sample scraped products, exactly as the pipeline would collect them.
data = [
    {
        "product_id": "11980174",
        "brand_id": 25354,
        "brand_name": "Gucci",
        "title": "beige and brown Dionysus GG Supreme mini canvas shoulder bag",
        "slug": "/shopping/gucci-beige-and-brown-dionysus-gg-supreme-mini-canvas-shoulder-bag-11980174",
    },
    {
        "product_id": "17070807",
        "brand_id": 1168391,
        "brand_name": "Jonathan Adler",
        "title": "Clear acrylic chess set",
        "slug": "/shopping/jonathan-adler-clear-acrylic-chess-set-17070807",
    },
    {
        "product_id": "17022890",
        "brand_id": 3543122,
        "brand_name": "Anissa Kermiche",
        "title": "pink, green and red Mini Jugs Jug earthenware vase set",
        "slug": "/shopping/anissa-kermiche-pink-green-and-red-mini-jugs-jug-earthenware-vase-set-17022890",
    },
]

# json.dumps inserts all separators itself, so the output is always valid JSON.
print(json.dumps(data))

CodePudding user response:

You can rewrite your code like this,

class JsonWriterPipeline(object):
    """Buffer scraped items in memory and write them once as a valid JSON array.

    Serializing the whole list with a single ``json.dump`` call in
    ``spider_closed`` sidesteps the trailing-comma problem entirely: the
    ``json`` module emits a syntactically valid array by construction.
    """

    def __init__(self):
        # Timestamped key path used in the S3 object name.
        self.spider_time = f'{time.strftime("%Y/%G_%m/%Y.%m.%d/%Y.%m.%d")}'
        # All scraped items, accumulated until the spider closes.
        self.items = []

    @classmethod
    def from_crawler(cls, crawler):
        """Scrapy factory hook: build the pipeline and connect spider signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # Open the output file; nothing is written until the spider closes.
        self.file = open("%s_items.json" % spider.name, "w")

    def process_item(self, item, spider):
        """Buffer one item; dict() also converts scrapy.Item instances."""
        self.items.append(dict(item))
        return item

    def spider_closed(self, spider):
        """Dump the buffered items as one JSON array, then upload to S3."""
        json.dump(self.items, self.file, indent=4)
        self.file.close()
        settings = get_project_settings()
        my_session = boto3.session.Session()
        s3 = my_session.resource(
            "s3",
            endpoint_url=settings.get("AWS_ENDPOINT_URL"),
            aws_access_key_id=settings.get("AWS_ACCESS_KEY_ID"),
            aws_secret_access_key=settings.get("AWS_SECRET_ACCESS_KEY"),
        )
        boto_test_bucket = s3.Bucket(settings.get("AWS_STORAGE_BUCKET_NAME"))
        boto_test_bucket.upload_file(
            "%s_items.json" % spider.name,
            f"brownsfashion-feeds/{spider.name}_{self.spider_time}.json",
        )
  • Related