I am looking for a way to prefix each log produced by Scrapy with the name of the spider that generated it. Until now, I was launching each spider synchronously in a loop, so it was easy to track which spider generated which log. But I have recently refactored my code in order to either accept a list of spiders as argument, or launch them all at once through CrawlerProcess(). The result is that they are launched asynchronously, so the logs are all mixed up.
I have thought about adding something like [%(name)s]
to the LOG_FORMAT setting, but the name produced is the module that called it (scrapy.core.engine, scrapy.utils.log, etc.) and not the spider's name.
I also tried creating an extension that would modify the crawler's settings by retrieving spider.name
and adding it to the LOG_FORMAT constant, but as far as I'm aware changing the settings while the crawler is running has no effect (and I haven't found a clean way of doing it since they are immutable).
Any help would be greatly appreciated! Thank you.
- I tried setting a custom LOG_FORMAT, but there does not seem to be any way to access the spider's name;
- I tried using an extension to catch the crawler's settings and modify them, but they are immutable and they are only evaluated at the beginning of the process.
CodePudding user response:
You need to create a custom log format, and set it as the log formatter for the project.
Basically you need to extend Scrapy's log formatter and set the message with the new format.
main2.py:
from scrapy import logformatter
import logging
import os
from twisted.python.failure import Failure
from scrapy.utils.request import referer_str
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
# Log-message templates used by ExampleLogFormatter below.  These mirror
# Scrapy's default wording; the spider's name is prefixed at format time.
# NOTE: the '+' operators were stripped by the paste — restored here.
SCRAPEDMSG = "Scraped from %(src)s" + os.linesep + "%(item)s"
# DROPPEDMSG = "Dropped: %(exception)s" + os.linesep + "%(item)s"
CRAWLEDMSG = "Crawled (%(status)s) %(request)s%(request_flags)s (referer: %(referer)s)%(response_flags)s"
# ITEMERRORMSG = "Error processing %(item)s"
# SPIDERERRORMSG = "Spider error processing %(request)s (referer: %(referer)s)"
# DOWNLOADERRORMSG_SHORT = "Error downloading %(request)s"
# DOWNLOADERRORMSG_LONG = "Error downloading %(request)s: %(errmsg)s"
class ExampleLogFormatter(logformatter.LogFormatter):
    """Log formatter that prefixes crawl/scrape messages with the name of
    the spider that produced them.

    Scrapy passes the spider to each formatter hook, so ``spider.name``
    is available here even though the stdlib log record's ``%(name)s``
    only carries the emitting module.
    """

    def crawled(self, request, response, spider):
        """Return the log entry dict for a crawled page, prefixed with spider.name."""
        request_flags = f' {str(request.flags)}' if request.flags else ''
        response_flags = f' {str(response.flags)}' if response.flags else ''
        return {
            'level': logging.DEBUG,
            # CRAWLEDMSG keeps its %(...)s placeholders; only the spider
            # name is interpolated by the f-string.
            'msg': f'{spider.name} {CRAWLEDMSG}',
            'args': {
                'status': response.status,
                'request': request,
                'request_flags': request_flags,
                'referer': referer_str(request),
                'response_flags': response_flags,
                # backward compatibility with Scrapy logformatter below 1.4 version
                'flags': response_flags
            }
        }

    def scraped(self, item, response, spider):
        """Return the log entry dict for a scraped item, prefixed with spider.name."""
        if isinstance(response, Failure):
            # A download/parse failure is logged with its error message.
            src = response.getErrorMessage()
        else:
            src = response
        return {
            'level': logging.DEBUG,
            'msg': f'{spider.name} {SCRAPEDMSG}',
            'args': {
                'src': src,
                'item': item,
            }
        }
if __name__ == "__main__":
    # Spider is referenced by name; CrawlerProcess resolves it through the
    # project's spider loader.
    spider = 'example_spider'
    settings = get_project_settings()
    settings['USER_AGENT'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
    # Point Scrapy at the custom formatter above (imported by dotted path).
    settings['LOG_FORMATTER'] = 'tempbuffer.main2.ExampleLogFormatter'
    process = CrawlerProcess(settings)
    process.crawl(spider)
    process.start()
spider.py:
import scrapy
class ExampleSpider(scrapy.Spider):
    """Demo spider: scrapes title and price from a single exercise page."""
    name = 'example_spider'
    allowed_domains = ['scrapingclub.com']
    start_urls = ['https://scrapingclub.com/exercise/detail_basic/']

    def parse(self, response):
        """Yield one item with the page's title and price."""
        item = dict()
        item['title'] = response.xpath('//h3/text()').get()
        # NOTE(review): the attribute predicate was lost in the paste
        # ('//div[@]'); presumably the price lives in the card body div —
        # confirm the class name against the live page.
        item['price'] = response.xpath('//div[@class="card-body"]/h4/text()').get()
        yield item
Output:
[scrapy.core.engine] DEBUG: example_spider Crawled (200) <GET https://scrapingclub.com/exercise/detail_basic/> (referer: None)
[scrapy.core.scraper] DEBUG: example_spider Scraped from <200 https://scrapingclub.com/exercise/detail_basic/>
{'title': 'Long-sleeved Jersey Top', 'price': '$12.99'}
Update:
A non-global working solution:
import logging
import scrapy
from scrapy.utils.log import configure_logging
class ExampleSpider(scrapy.Spider):
    """Spider that installs its own root log handler whose format embeds
    this spider's name."""
    name = 'example_spider'
    allowed_domains = ['scrapingclub.com']
    start_urls = ['https://scrapingclub.com/exercise/detail_basic/']

    # Runs at class-definition time: disable Scrapy's root handler and
    # install a basicConfig one prefixed with this class's `name`.  This is
    # process-global, so it only distinguishes logs with one spider class.
    # NOTE: the '+' was stripped by the paste — restored here.
    configure_logging(install_root_handler=False)
    logging.basicConfig(level=logging.DEBUG, format=name + ': %(levelname)s: %(message)s')

    def parse(self, response):
        """Yield one item with the page's title and price."""
        item = dict()
        item['title'] = response.xpath('//h3/text()').get()
        # NOTE(review): attribute predicate garbled in the paste
        # ('//div[@]'); presumably '//div[@class="card-body"]' — confirm.
        item['price'] = response.xpath('//div[@class="card-body"]/h4/text()').get()
        yield item
Update 2: Finally a working solution.
main2.py:
import logging
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
# create a logging filter
class ContentFilter(logging.Filter):
    """Logging filter that injects a ``spider_name`` attribute into every
    record so a formatter can reference ``%(spider_name)s``.

    Scrapy passes the running spider to ``logger.log(..., extra={'spider': ...})``,
    so records emitted during a crawl carry a ``spider`` attribute.
    """

    def filter(self, record):
        # Always define the attribute so the formatter never fails on
        # records emitted outside any spider (e.g. scrapy.utils.log).
        record.spider_name = ''
        # enter the spider's name
        if hasattr(record, 'spider'):
            record.spider_name = record.spider.name
        return True

    # record.spider.name was enough for my tests, but maybe you'll need this:
    # record.spider_name = ''
    # if hasattr(record, 'crawler'):
    #     record.spider_name = record.crawler.spidercls.name
    # elif hasattr(record, 'spider'):
    #     record.spider_name = record.spider.name
    # return True
# Extend scrapy.Spider class
class Spider(scrapy.Spider):
    """scrapy.Spider subclass that, on construction, rewires every root log
    handler to prefix log lines with the emitting spider's name."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # the new format with "spider_name" variable:
        formatter = logging.Formatter('[%(spider_name)s]: %(levelname)s: %(message)s')
        # add the new format and filter to all the handlers
        for handler in logging.root.handlers:
            handler.formatter = formatter
            handler.addFilter(ContentFilter())
if __name__ == "__main__":
    # Spiders are referenced by name; CrawlerProcess resolves them through
    # the project's spider loader.
    spider1 = 'example_spider'
    spider2 = 'example_spider2'
    settings = get_project_settings()
    settings['USER_AGENT'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
    process = CrawlerProcess(settings)
    # Both spiders run concurrently in one reactor; the per-record filter
    # installed by Spider.__init__ keeps their logs distinguishable.
    process.crawl(spider1)
    process.crawl(spider2)
    process.start()
spider.py:
from tempbuffer.main2 import Spider
# use the extended "Spider" class
class ExampleSpider(Spider):
    """First demo spider: scrapes the price from the exercise page."""
    name = 'example_spider'
    allowed_domains = ['scrapingclub.com']
    start_urls = ['https://scrapingclub.com/exercise/detail_basic/']

    def parse(self, response):
        """Yield one item with the page's price."""
        item = dict()
        # NOTE(review): the attribute predicate was lost in the paste
        # ('//div[@]'); presumably '//div[@class="card-body"]' — confirm.
        item['price'] = response.xpath('//div[@class="card-body"]/h4/text()').get()
        yield item
# use the extended "Spider" class
class ExampleSpider2(Spider):
    """Second demo spider: scrapes the title from the same exercise page."""
    name = 'example_spider2'
    allowed_domains = ['scrapingclub.com']
    start_urls = ['https://scrapingclub.com/exercise/detail_basic/']

    def parse(self, response):
        """Yield one item with the page's title."""
        item = dict()
        item['title'] = response.xpath('//h3/text()').get()
        yield item
CodePudding user response:
Thanks to @SuperUser I managed to do what I needed without having to add code in each spider individually. Everything happens inside of an extension, more specifically inside the spider_opened
method. Here is the code:
class CustomLogExtension:
    """Scrapy extension that prefixes every emitted log line with the name
    of the spider that produced it.

    Enable by setting ``CUSTOM_LOG_EXTENSION = True`` and registering this
    class in the project's ``EXTENSIONS`` setting.
    """

    class ContentFilter(logging.Filter):
        """Filter that injects a ``spider_name`` attribute into each log
        record so the formatter can reference ``%(spider_name)s``."""

        def filter(self, record):
            # Always define the attribute so the formatter never fails on
            # records emitted outside any spider.
            record.spider_name = ''
            # enter the spider's name
            if hasattr(record, 'spider'):
                record.spider_name = record.spider.name
            return True

    @classmethod
    def from_crawler(cls, crawler):
        # first check if the extension should be enabled and raise NotConfigured otherwise
        if not crawler.settings.getbool('CUSTOM_LOG_EXTENSION'):
            raise NotConfigured
        # instantiate the extension object
        ext = cls()
        # connect the extension object to signals
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        # return the extension object
        return ext

    def spider_opened(self, spider):
        """Prefixes the spider's name to every log emitted."""
        formatter = logging.Formatter('[%(spider_name)s] %(asctime)s [%(name)s] %(levelname)s: %(message)s')
        # add the new format and filter to all the handlers
        for handler in logging.root.handlers:
            handler.formatter = formatter
            handler.addFilter(self.ContentFilter())