The desired output format is:
{"content": "text", "scrape_date": "36456456456"}
{"content": "text", "scrape_date": "56445435435"}
My spider.py:
import scrapy
import time
import json
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.crawler import CrawlerProcess
from urllib.parse import urlparse

DICT = {
    'quotes.toscrape.com': 'domain1.json',
    'stadt-koeln.de': 'domain2.json',
}

class PagingIncremental(CrawlSpider):
    name = "my_spider"

    allowed_domains = ['quotes.toscrape.com', 'stadt-koeln.de']

    start_urls = [
        'https://quotes.toscrape.com/page/1/',
        'https://www.stadt-koeln.de/leben-in-koeln/planen-bauen/bebauungsplaene/aufstellen-eines-bauleitplanes'
    ]

    custom_settings = {
        'DOWNLOAD_DELAY': '0',
        'FEED_EXPORT_ENCODING': 'utf-8',
        'DEPTH_LIMIT': '1',
        'AUTOTHROTTLE_ENABLED': 'True',
        'AUTOTHROTTLE_START_DELAY': '1',
        'AUTOTHROTTLE_MAX_DELAY': '3'
    }

    # Visit all found sublinks
    rules = (
        Rule(LinkExtractor(allow=r""), callback='parse', follow=False),
    )

    def parse(self, response):
        item = {}
        # get the domain of each sub page
        domain = urlparse(response.url).netloc
        domain = domain.replace("www.", "")
        # if the domain of the subpage matches an entry in DICT above,
        # all its sublinks are stored in the same output file
        item["filename"] = DICT[domain]
        item["content"] = response.xpath("//p/text()").getall()
        item['scrape_date'] = int(time.time())
        yield item

if __name__ == "__main__":
    process = CrawlerProcess(settings={
    })
    # process = CrawlerProcess()
    process.crawl(PagingIncremental)
    process.start()
My pipelines.py:
from scrapy.exporters import JsonItemExporter

class SaveJsonPipeline:
    def process_item(self, item, spider):
        filename = item['filename']
        del item['filename']
        # if the file exists it will append the data
        JsonItemExporter(open(filename, "ab")).export_item(item)
        return item
My settings.py:
ITEM_PIPELINES = {
    '<project_name>.pipelines.SaveJsonPipeline': 300,
}
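For illustration, if the Scrapy project were named myproject (a hypothetical name), the dotted path would read:

ITEM_PIPELINES = {
    'myproject.pipelines.SaveJsonPipeline': 300,  # hypothetical project name
}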
If I use "a" instead of "ab" in pipelines.py to export the data in non-binary (text) mode:

JsonItemExporter(open(filename, "a")).export_item(item)

Scrapy says:

  File "c:\python\lib\site-packages\scrapy\exporters.py", line 135, in export_item
    self.file.write(to_bytes(data, self.encoding))
TypeError: write() argument must be str, not bytes
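The error itself is not Scrapy-specific: the exporter converts each item to bytes before writing, and a file opened in text mode only accepts str. A minimal sketch with just the standard library reproduces it:

# a file opened in text mode ("a") only accepts str;
# the exporter hands it bytes, hence the TypeError
with open("demo.json", "a") as f:
    f.write(b'{"content": "text"}')  # TypeError: write() argument must be str, not bytes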
Any ideas and solutions are appreciated!
CodePudding user response:
You should use JsonLinesItemExporter instead of JsonItemExporter to get every item on a separate line. And don't bother with the bytes, because the documentation mentions that the exporter's file has to be opened in bytes mode.
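The only line that changes in pipelines.py is the exporter class; the file stays opened in "ab" (see the full code below):

JsonLinesItemExporter(open(filename, "ab")).export_item(item)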
And in pandas.read_json() you can use the option lines=True to read JSONL (JSON Lines, one JSON object per line):
df = pd.read_json('domain1.json', lines=True)
Full working code below. All the code is in one file so everyone can simply copy and test it. I used '__main__.SaveJsonPipeline' to load the pipeline class from the current file. I also added code that strips whitespace from the content and joins it into one string:

" ".join([x.strip() for x in response.xpath("//p/text()").getall()]).strip()
import time
import scrapy
#import json
from scrapy.spiders import CrawlSpider, Rule
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor
from urllib.parse import urlparse
from scrapy.exporters import JsonItemExporter, JsonLinesItemExporter

class SaveJsonPipeline:
    def process_item(self, item, spider):
        filename = item['filename']
        del item['filename']
        # if the file exists it will append the data
        JsonLinesItemExporter(open(filename, "ab")).export_item(item)
        return item

DICT = {
    'quotes.toscrape.com': 'domain1.json',
    'stadt-koeln.de': 'domain2.json',
}

class PagingIncremental(CrawlSpider):
    name = "my_spider"

    allowed_domains = ['quotes.toscrape.com', 'stadt-koeln.de']

    start_urls = [
        'https://quotes.toscrape.com/page/1/',
        'https://www.stadt-koeln.de/leben-in-koeln/planen-bauen/bebauungsplaene/aufstellen-eines-bauleitplanes'
    ]

    custom_settings = {
        'DOWNLOAD_DELAY': '0',
        'FEED_EXPORT_ENCODING': 'utf-8',
        'DEPTH_LIMIT': '1',
        'AUTOTHROTTLE_ENABLED': 'True',
        'AUTOTHROTTLE_START_DELAY': '1',
        'AUTOTHROTTLE_MAX_DELAY': '3'
    }

    # Visit all found sublinks
    rules = (
        Rule(LinkExtractor(allow=r""), callback='parse', follow=False),
    )

    def parse(self, response):
        item = {}
        # get the domain of each sub page
        domain = urlparse(response.url).netloc
        domain = domain.replace("www.", "")
        # if the domain of the subpage matches an entry in DICT above,
        # all its sublinks are stored in the same output file
        item["filename"] = DICT[domain]
        #item["content"] = [x.strip() for x in response.xpath("//p/text()").getall()]
        item["content"] = " ".join([x.strip() for x in response.xpath("//p/text()").getall()]).strip()
        item['scrape_date'] = int(time.time())
        yield item

if __name__ == "__main__":
    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0',
        'ITEM_PIPELINES': {'__main__.SaveJsonPipeline': 1},  # use the pipeline class defined in this file (needs '__main__')
    })
    # process = CrawlerProcess()
    process.crawl(PagingIncremental)
    process.start()

    import pandas as pd
    df = pd.read_json('domain1.json', lines=True)
    print(df.head())
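As a possible refinement (a sketch, not part of the answer above): opening the output file anew for every item never explicitly closes the handles. Keeping one exporter per filename and closing the files in close_spider avoids that. The open_spider/close_spider methods are the standard Scrapy pipeline hooks; the dictionary layout is just one way to structure it:

from scrapy.exporters import JsonLinesItemExporter

class SaveJsonPipeline:
    """Variant that opens each output file once and closes it when the spider finishes."""

    def open_spider(self, spider):
        self.files = {}      # filename -> open file handle
        self.exporters = {}  # filename -> JsonLinesItemExporter

    def process_item(self, item, spider):
        filename = item.pop('filename')
        if filename not in self.exporters:
            f = open(filename, 'ab')  # binary mode, as the exporter requires
            self.files[filename] = f
            self.exporters[filename] = JsonLinesItemExporter(f)
        self.exporters[filename].export_item(item)
        return item

    def close_spider(self, spider):
        for f in self.files.values():
            f.close()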