i have the following working spider as a process - so i can start the spider with: python xyz.py
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
class onlyLinks(scrapy.Spider):
name = 'onlyLinks'
allowed_domains = ['magnatiles.com']
start_urls = ['https://www.magnatiles.com/products/page/1/']
def parse(self, response):
tmpProd = response.xpath("//ul[@class='products']//ancestor::li")
for p in tmpProd:
yield {
"link": p.xpath("(./a)[1]/@href").get(),
"name": p.xpath(".//h2/text()").get(),
"sku": p.css("a.button::attr(data-product_sku)").get(),
"price": p.xpath("//span[@class='price']//ancestor::bdi/text()").get()
}
process = CrawlerProcess(settings={
"FEEDS": {
"items.json": {"format": "json"},
# "items.csv": {"format": "csv"},
# "items.xlsx": {"format": "xlsx"},
},
})
process.crawl(onlyLinks)
process.start()
But how can i use the other informations / settings from the settings.py? eg. ROBOTSTXT_OBEY = False, AUTOTHROTTLE_ENABLED = True, AUTOTHROTTLE_START_DELAY = 5 etc.
And how can i make the output to json in that way - so the output get overwritten? (when i run the program now the data is allways appended to the existing items.json-file) (when i run a spider with scrapy runspider i would use -o or -O to overwrite or not overwrite the output-file - but how can i do this here?)
CodePudding user response:
You can do that using key value pairs meaning dict
format as follows:
process = CrawlerProcess ({
"FEEDS": {"items.json": {"format": "json", "overwrite": True}},
'ROBOTSTXT_OBEY':'False',
'USER_AGENT': 'Mozilla/5.0',
'AUTOTHROTTLE_ENABLED':'True',
'AUTOTHROTTLE_START_DELAY': '5'
})
process.crawl(onlyLinks)
process.start()