Having an issue with XMLFeedSpider. I can get the parsing to work on the scrapy shell, so it seems there is something going on with either the request, or the spider's engagement. Whether I add a start_request()
method or not, I seem to get the same error.
No output_file.csv
is produced after running the spider.
I am able to get a scrapy.Spider and CrawlSpider to work, but can't seem to figure out what I am doing wrong with the XMLFeedSpider.
This is the spider:
from ..items import TheItem
from scrapy.loader import ItemLoader
import scrapy
from scrapy.crawler import CrawlerProcess
class TheSpider(scrapy.spiders.XMLFeedSpider):
name = 'stuff_spider'
allowed_domains = ['www.website.net']
start_urls = ['https://www.website.net/10016/stuff/otherstuff.xml']
namespaces = [('xsi', 'https://schemas.website.net/xml/uslm'), ]
itertag = 'xsi:item'
iterator = 'xml'
def start_requests(self):
yield scrapy.Request('https://www.website.net/10016/stuff/otherstuff.xml', callback=self.parse_node)
def parse_node(self, response, node):
l = ItemLoader(item=TheItem(), selector=node, response=response)
just_want_something = 'just want the csv to show some output'
l.add_xpath('title', response.xpath('//xsi:title/text()').extract())
l.add_xpath('date', response.xpath('//xsi:date/text()').extract())
l.add_xpath('category', node.xpath('//xsi:cat1/text()').extract())
l.add_value('content', node.xpath('//xsi:content/text()'))
l.add_value('manditory', just_want_something)
yield l.load_item()
process = CrawlerProcess(settings={
'FEEDS': 'output_file.csv',
'FEED_FORMAT': 'csv',
'DOWNLOAD_DELAY': 1.25,
'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0'
})
process.crawl(TheSpider)
process.start()
This is the item:
from scrapy import Item, Field
from itemloaders.processors import Identity, Compose
def all_lower(value):
return value.lower()
class TheItem(Item):
title = Field(
input_processor=Compose(all_lower),
output_processor=Identity()
)
link = Field(
input_processor=Compose(all_lower),
output_processor=Identity()
)
date = Field(
input_processor=Compose(all_lower),
output_processor=Identity()
)
category = Field(
input_processor=Compose(all_lower),
output_processor=Identity()
)
manditory = Field(
input_processor=Compose(all_lower),
output_processor=Identity()
)
This is the output:
D:\GitFolder\scrapyProjects\TheProject\venv\Scripts\python.exe D:\GitFolder\scrapyProjects\TheProject\TheSpider\TheSpider\spiders\TheSpider.py
Traceback (most recent call last):
File "D:\GitFolder\scrapyProjects\TheProject\TheSpider\TheSpider\spiders\TheSpider.py", line 43, in <module>
process = CrawlerProcess(settings={
File "D:\GitFolder\scrapyProjects\TheProject\venv\lib\site-packages\scrapy\crawler.py", line 289, in __init__
super().__init__(settings)
File "D:\GitFolder\scrapyProjects\TheProject\venv\lib\site-packages\scrapy\crawler.py", line 164, in __init__
settings = Settings(settings)
File "D:\GitFolder\scrapyProjects\TheProject\venv\lib\site-packages\scrapy\settings\__init__.py", line 454, in __init__
self.update(values, priority)
File "D:\GitFolder\scrapyProjects\TheProject\venv\lib\site-packages\scrapy\settings\__init__.py", line 323, in update
self.set(name, value, priority)
File "D:\GitFolder\scrapyProjects\TheProject\venv\lib\site-packages\scrapy\settings\__init__.py", line 265, in set
self.attributes[name].set(value, priority)
File "D:\GitFolder\scrapyProjects\TheProject\venv\lib\site-packages\scrapy\settings\__init__.py", line 50, in set
value = BaseSettings(value, priority=priority)
File "D:\GitFolder\scrapyProjects\TheProject\venv\lib\site-packages\scrapy\settings\__init__.py", line 86, in __init__
self.update(values, priority)
File "D:\GitFolder\scrapyProjects\TheProject\venv\lib\site-packages\scrapy\settings\__init__.py", line 316, in update
values = json.loads(values)
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.2032.0_x64__qbz5n2kfra8p0\lib\json\__init__.py", line 346, in loads
return _default_decoder.decode(s)
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.2032.0_x64__qbz5n2kfra8p0\lib\json\decoder.py", line 337, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.2032.0_x64__qbz5n2kfra8p0\lib\json\decoder.py", line 355, in raw_decode
raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
Process finished with exit code 1
And if I remove the start_requests()
method, I get this output:
D:\GitFolder\scrapyProjects\TheProject\venv\Scripts\python.exe D:\GitFolder\scrapyProjects\TheProject\TheSpider\TheSpider\spiders\TheSpider.py
Traceback (most recent call last):
File "D:\GitFolder\scrapyProjects\TheProject\TheSpider\TheSpider\spiders\TheSpider.py", line 43, in <module>
process = CrawlerProcess(settings={
File "D:\GitFolder\scrapyProjects\TheProject\venv\lib\site-packages\scrapy\crawler.py", line 289, in __init__
super().__init__(settings)
File "D:\GitFolder\scrapyProjects\TheProject\venv\lib\site-packages\scrapy\crawler.py", line 164, in __init__
settings = Settings(settings)
File "D:\GitFolder\scrapyProjects\TheProject\venv\lib\site-packages\scrapy\settings\__init__.py", line 454, in __init__
self.update(values, priority)
File "D:\GitFolder\scrapyProjects\TheProject\venv\lib\site-packages\scrapy\settings\__init__.py", line 323, in update
self.set(name, value, priority)
File "D:\GitFolder\scrapyProjects\TheProject\venv\lib\site-packages\scrapy\settings\__init__.py", line 265, in set
self.attributes[name].set(value, priority)
File "D:\GitFolder\scrapyProjects\TheProject\venv\lib\site-packages\scrapy\settings\__init__.py", line 50, in set
value = BaseSettings(value, priority=priority)
File "D:\GitFolder\scrapyProjects\TheProject\venv\lib\site-packages\scrapy\settings\__init__.py", line 86, in __init__
self.update(values, priority)
File "D:\GitFolder\scrapyProjects\TheProject\venv\lib\site-packages\scrapy\settings\__init__.py", line 316, in update
values = json.loads(values)
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.2032.0_x64__qbz5n2kfra8p0\lib\json\__init__.py", line 346, in loads
return _default_decoder.decode(s)
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.2032.0_x64__qbz5n2kfra8p0\lib\json\decoder.py", line 337, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.2032.0_x64__qbz5n2kfra8p0\lib\json\decoder.py", line 355, in raw_decode
raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
Process finished with exit code 1
Both ultimately end up with the same error.
CodePudding user response:
According to https://docs.scrapy.org/en/latest/topics/feed-exports.html#feeds FEED
param should be a dict
. Like:
process = CrawlerProcess(settings={
"FEEDS": {
"items.json": {"format": "json"},
},
})