I output the scraped data in JSON format. My custom Scrapy pipeline writes a list of dictionaries as JSON, and the items currently look like this:
[{
    "product_id": "11980174",
    "brand_id": 25354,
    "brand_name": "Gucci",
    "title": "beige and brown Dionysus GG Supreme mini canvas shoulder bag",
    "slug": "/shopping/gucci-beige-and-brown-dionysus-gg-supreme-mini-canvas-shoulder-bag-11980174"
},
{
    "product_id": "17070807",
    "brand_id": 1168391,
    "brand_name": "Jonathan Adler",
    "title": "Clear acrylic chess set",
    "slug": "/shopping/jonathan-adler-clear-acrylic-chess-set-17070807"
},
{
    "product_id": "17022890",
    "brand_id": 3543122,
    "brand_name": "Anissa Kermiche",
    "title": "pink, green and red Mini Jugs Jug earthenware vase set",
    "slug": "/shopping/anissa-kermiche-pink-green-and-red-mini-jugs-jug-earthenware-vase-set-17022890"
},]
But I want to export the data in valid JSON format:
[{
    "product_id": "11980174",
    "brand_id": 25354,
    "brand_name": "Gucci",
    "title": "beige and brown Dionysus GG Supreme mini canvas shoulder bag",
    "slug": "/shopping/gucci-beige-and-brown-dionysus-gg-supreme-mini-canvas-shoulder-bag-11980174"
},
{
    "product_id": "17070807",
    "brand_id": 1168391,
    "brand_name": "Jonathan Adler",
    "title": "Clear acrylic chess set",
    "slug": "/shopping/jonathan-adler-clear-acrylic-chess-set-17070807"
},
{
    "product_id": "17022890",
    "brand_id": 3543122,
    "brand_name": "Anissa Kermiche",
    "title": "pink, green and red Mini Jugs Jug earthenware vase set",
    "slug": "/shopping/anissa-kermiche-pink-green-and-red-mini-jugs-jug-earthenware-vase-set-17022890"
}]
I need to remove the comma after the last JSON object to make the output valid JSON.
Here is my custom scrapy json pipeline:
from scrapy import signals
import boto3
from scrapy.utils.project import get_project_settings
import time
import json


class JsonWriterPipeline(object):
    def __init__(self):
        self.spider_time = f'{time.strftime("%Y/%G_%m/%Y.%m.%d/%Y.%m.%d")}'

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.file = open("%s_items.json" % spider.name, "w")
        self.file.write("[")

    def process_item(self, item, spider):
        # A comma is appended after every item, including the last one.
        line = json.dumps(dict(item), indent=4) + ",\n"
        self.file.write(line)
        return item

    def spider_closed(self, spider):
        self.file.write("]")
        self.file.close()
        settings = get_project_settings()
        my_session = boto3.session.Session()
        s3 = my_session.resource(
            "s3",
            endpoint_url=settings.get("AWS_ENDPOINT_URL"),
            aws_access_key_id=settings.get("AWS_ACCESS_KEY_ID"),
            aws_secret_access_key=settings.get("AWS_SECRET_ACCESS_KEY"),
        )
        boto_test_bucket = s3.Bucket(settings.get("AWS_STORAGE_BUCKET_NAME"))
        boto_test_bucket.upload_file(
            "%s_items.json" % spider.name,
            f"brownsfashion-feeds/{spider.name}_{self.spider_time}.json",
        )
Please advise me of any solutions. Thank you.
CodePudding user response:
Don't try to convert a dictionary to JSON yourself. Instead, use json.dumps() from the json package, like this:
import json
data = [{
    "product_id": "11980174",
    "brand_id": 25354,
    "brand_name": "Gucci",
    "title": "beige and brown Dionysus GG Supreme mini canvas shoulder bag",
    "slug": "/shopping/gucci-beige-and-brown-dionysus-gg-supreme-mini-canvas-shoulder-bag-11980174"
},
{
    "product_id": "17070807",
    "brand_id": 1168391,
    "brand_name": "Jonathan Adler",
    "title": "Clear acrylic chess set",
    "slug": "/shopping/jonathan-adler-clear-acrylic-chess-set-17070807"
},
{
    "product_id": "17022890",
    "brand_id": 3543122,
    "brand_name": "Anissa Kermiche",
    "title": "pink, green and red Mini Jugs Jug earthenware vase set",
    "slug": "/shopping/anissa-kermiche-pink-green-and-red-mini-jugs-jug-earthenware-vase-set-17022890"
}]
valid_json = json.dumps(data)
print(valid_json)
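If you want to keep this inside the pipeline, a minimal sketch of the same idea (the class name JsonListWriterPipeline is only for illustration, and the file name / S3 upload from your pipeline can be reused in close_spider) is to collect the items in a list and serialize the whole list once when the spider closes:

import json

class JsonListWriterPipeline(object):  # hypothetical name, just for illustration
    def open_spider(self, spider):
        self.items = []

    def process_item(self, item, spider):
        # Accumulate plain dicts; nothing is written to disk yet.
        self.items.append(dict(item))
        return item

    def close_spider(self, spider):
        # Dump the whole list in one call, so the output is always valid JSON
        # with no trailing comma to worry about.
        with open("%s_items.json" % spider.name, "w") as f:
            json.dump(self.items, f, indent=4)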
CodePudding user response:
You can rewrite your code like this:
class JsonWriterPipeline(object):
    def __init__(self):
        self.spider_time = f'{time.strftime("%Y/%G_%m/%Y.%m.%d/%Y.%m.%d")}'

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.file = open("%s_items.json" % spider.name, "w")

    def process_item(self, item, spider):
        # json.dump serializes the item and writes it directly into your file.
        json.dump(dict(item), self.file)
        return item

    def spider_closed(self, spider):
        self.file.close()
        settings = get_project_settings()
        my_session = boto3.session.Session()
        s3 = my_session.resource(
            "s3",
            endpoint_url=settings.get("AWS_ENDPOINT_URL"),
            aws_access_key_id=settings.get("AWS_ACCESS_KEY_ID"),
            aws_secret_access_key=settings.get("AWS_SECRET_ACCESS_KEY"),
        )
        boto_test_bucket = s3.Bucket(settings.get("AWS_STORAGE_BUCKET_NAME"))
        boto_test_bucket.upload_file(
            "%s_items.json" % spider.name,
            f"brownsfashion-feeds/{spider.name}_{self.spider_time}.json",
        )
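This writes each item to the file as soon as it is scraped. If you need the output to be a single JSON array like in your example, a minimal sketch (building on your original spider_opened/process_item/spider_closed, with a hypothetical first_item flag) is to write the comma before every item except the first, so there is never a trailing comma:

    def spider_opened(self, spider):
        self.file = open("%s_items.json" % spider.name, "w")
        self.file.write("[")
        self.first_item = True  # hypothetical flag: tracks whether a separator is needed

    def process_item(self, item, spider):
        # Write the separator before the item, never after it,
        # so the last item is not followed by a comma.
        if not self.first_item:
            self.file.write(",\n")
        self.first_item = False
        self.file.write(json.dumps(dict(item), indent=4))
        return item

    def spider_closed(self, spider):
        self.file.write("]")
        self.file.close()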