The desired output format is:
{"content": "text", "scrape_date": "36456456456"}
{"content": "text", "scrape_date": "56445435435"}
My spider.py:
import scrapy
import time
import json
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.crawler import CrawlerProcess
from urllib.parse import urlparse

DICT = {
    'quotes.toscrape.com': 'domain1.json',
    'stadt-koeln.de': 'domain2.json',
}

class PagingIncremental(CrawlSpider):
    name = "my_spider"

    allowed_domains = ['quotes.toscrape.com', 'stadt-koeln.de']

    start_urls = [
        'https://quotes.toscrape.com/page/1/',
        'https://www.stadt-koeln.de/leben-in-koeln/planen-bauen/bebauungsplaene/aufstellen-eines-bauleitplanes'
    ]

    custom_settings = {
        'DOWNLOAD_DELAY': '0',
        'FEED_EXPORT_ENCODING': 'utf-8',
        'DEPTH_LIMIT': '1',
        'AUTOTHROTTLE_ENABLED': 'True',
        'AUTOTHROTTLE_START_DELAY': '1',
        'AUTOTHROTTLE_MAX_DELAY': '3'
    }

    # Visit all found sublinks
    rules = (
        Rule(LinkExtractor(allow=r""), callback='parse', follow=False),
    )

    def parse(self, response):
        item = {}
        # get the domain of each sub page
        domain = urlparse(response.url).netloc
        domain = domain.replace("www.", "")
        # if the domain of the subpage matches an entry in DICT above,
        # all its sublinks are stored in the same output file
        item["filename"] = DICT[domain]
        item["content"] = response.xpath("//p/text()").getall()
        item['scrape_date'] = int(time.time())
        yield item

if __name__ == "__main__":
    process = CrawlerProcess(settings={
    })
    # process = CrawlerProcess()
    process.crawl(PagingIncremental)
    process.start()
My pipelines.py:
from scrapy.exporters import JsonItemExporter

class SaveJsonPipeline:
    def process_item(self, item, spider):
        filename = item['filename']
        del item['filename']
        # if the file exists it will append the data
        JsonItemExporter(open(filename, "ab")).export_item(item)
        return item
My settings.py:
ITEM_PIPELINES = {
    '<project_name>.pipelines.SaveJsonPipeline': 300,
}
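For illustration, if the Scrapy project were named myproject (a hypothetical name), the dotted path would read:

ITEM_PIPELINES = {
    'myproject.pipelines.SaveJsonPipeline': 300,  # hypothetical project name
}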
If I use "a" instead of "ab" in pipelines.py to export the data in non-binary (text) mode:

JsonItemExporter(open(filename, "a")).export_item(item)

Scrapy says:

  File "c:\python\lib\site-packages\scrapy\exporters.py", line 135, in export_item
    self.file.write(to_bytes(data, self.encoding))
TypeError: write() argument must be str, not bytes
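The error itself is not Scrapy-specific: the exporter converts each item to bytes before writing, and a file opened in text mode only accepts str. A minimal sketch with just the standard library reproduces it:

# a file opened in text mode ("a") only accepts str;
# the exporter hands it bytes, hence the TypeError
with open("demo.json", "a") as f:
    f.write(b'{"content": "text"}')  # TypeError: write() argument must be str, not bytes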
Any ideas and solutions are appreciated!
CodePudding user response:
You should use JsonLinesItemExporter instead of JsonItemExporter to get every item on a separate line. And don't bother with the bytes, because the documentation mentions that the exporter's file has to be opened in bytes mode.
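The only line that changes in pipelines.py is the exporter class; the file stays opened in "ab" (see the full code below):

JsonLinesItemExporter(open(filename, "ab")).export_item(item)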
And in pandas.read_json() you can use the option lines=True to read JSONL (JSON Lines, one JSON object per line):
df = pd.read_json('domain1.json', lines=True)
Full working code below. All the code is in one file so everyone can simply copy and test it. I used '__main__.SaveJsonPipeline' to load the pipeline class from the current file. I also added code that strips whitespace from the content and joins it into one string:

" ".join([x.strip() for x in response.xpath("//p/text()").getall()]).strip()
import time
import scrapy
#import json
from scrapy.spiders import CrawlSpider, Rule
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor
from urllib.parse import urlparse
from scrapy.exporters import JsonItemExporter, JsonLinesItemExporter

class SaveJsonPipeline:
    def process_item(self, item, spider):
        filename = item['filename']
        del item['filename']
        # if the file exists it will append the data
        JsonLinesItemExporter(open(filename, "ab")).export_item(item)
        return item

DICT = {
    'quotes.toscrape.com': 'domain1.json',
    'stadt-koeln.de': 'domain2.json',
}

class PagingIncremental(CrawlSpider):
    name = "my_spider"

    allowed_domains = ['quotes.toscrape.com', 'stadt-koeln.de']

    start_urls = [
        'https://quotes.toscrape.com/page/1/',
        'https://www.stadt-koeln.de/leben-in-koeln/planen-bauen/bebauungsplaene/aufstellen-eines-bauleitplanes'
    ]

    custom_settings = {
        'DOWNLOAD_DELAY': '0',
        'FEED_EXPORT_ENCODING': 'utf-8',
        'DEPTH_LIMIT': '1',
        'AUTOTHROTTLE_ENABLED': 'True',
        'AUTOTHROTTLE_START_DELAY': '1',
        'AUTOTHROTTLE_MAX_DELAY': '3'
    }

    # Visit all found sublinks
    rules = (
        Rule(LinkExtractor(allow=r""), callback='parse', follow=False),
    )

    def parse(self, response):
        item = {}
        # get the domain of each sub page
        domain = urlparse(response.url).netloc
        domain = domain.replace("www.", "")
        # if the domain of the subpage matches an entry in DICT above,
        # all its sublinks are stored in the same output file
        item["filename"] = DICT[domain]
        #item["content"] = [x.strip() for x in response.xpath("//p/text()").getall()]
        item["content"] = " ".join([x.strip() for x in response.xpath("//p/text()").getall()]).strip()
        item['scrape_date'] = int(time.time())
        yield item

if __name__ == "__main__":
    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0',
        'ITEM_PIPELINES': {'__main__.SaveJsonPipeline': 1},  # use the pipeline class defined in this file (needs '__main__')
    })
    # process = CrawlerProcess()
    process.crawl(PagingIncremental)
    process.start()

    import pandas as pd
    df = pd.read_json('domain1.json', lines=True)
    print(df.head())
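As a possible refinement (a sketch, not part of the answer above): opening the output file anew for every item never explicitly closes the handles. Keeping one exporter per filename and closing the files in close_spider avoids that. The open_spider/close_spider methods are the standard Scrapy pipeline hooks; the dictionary layout is just one way to structure it:

from scrapy.exporters import JsonLinesItemExporter

class SaveJsonPipeline:
    """Variant that opens each output file once and closes it when the spider finishes."""

    def open_spider(self, spider):
        self.files = {}      # filename -> open file handle
        self.exporters = {}  # filename -> JsonLinesItemExporter

    def process_item(self, item, spider):
        filename = item.pop('filename')
        if filename not in self.exporters:
            f = open(filename, 'ab')  # binary mode, as the exporter requires
            self.files[filename] = f
            self.exporters[filename] = JsonLinesItemExporter(f)
        self.exporters[filename].export_item(item)
        return item

    def close_spider(self, spider):
        for f in self.files.values():
            f.close()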