I'm new to Scrapy and am having some problems with the output from my first spider. No matter what I try, the output JSON file is always empty. I'm using version 2.5.1 because I ran into a bug in the current 2.6.1 release. The spider's code is:
import scrapy
from scrapy.item import Field, Item
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
class WormSpider(CrawlSpider):
    """Crawl parahumans.wordpress.com and emit one item per matched page.

    Every link whose URL matches ``category/stories-arcs-`` is followed and
    its response is passed to :meth:`parse_item`.
    """

    name = 'Worm'
    allowed_domains = ['parahumans.wordpress.com']
    start_urls = ['https://parahumans.wordpress.com/']

    # Page-count limit for the CloseSpider extension; the run log reports
    # ``finish_reason: closespider_pagecount`` when it triggers.
    custom_settings = {'CLOSESPIDER_PAGECOUNT': 10}

    rules = (
        # Rule(
        #     LinkExtractor(allow = r'notebook_Desde_'), follow = True
        # ),
        Rule(
            LinkExtractor(allow=r'category/stories-arcs-'),
            callback='parse_item',
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Yield one dict holding the ``Arco`` and ``tags`` text lists.

        Both values are the lists returned by ``.getall()`` on an XPath
        query against ``response``.
        """
        # NOTE(review): ``[@]`` names no attribute and is not a valid XPath
        # predicate -- the attribute test appears to have been lost from
        # this snippet. Restore the original predicate before running.
        # ``/text()`` only selects text nodes that are direct children of
        # the matched element.
        arc_titles = response.xpath('//h1[@]/text()').getall()
        tag_names = response.xpath('//span[@]/text()').getall()
        yield {
            'Arco': arc_titles,
            'tags': tag_names,
        }
And this is the terminal log output when I run `scrapy crawl Worm -O Testeo.json`:
[]
2022-07-03 00:05:34 [scrapy.core.engine] INFO: Spider opened
2022-07-03 00:05:34 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2022-07-03 00:05:34 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2022-07-03 00:05:35 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://parahumans.wordpress.com/> (referer: None)
2022-07-03 00:05:35 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://parahumans.wordpress.com/category/stories-arcs-1-10/arc-1-gestation/1-01/> (referer: https://parahumans.wordpress.com/)
2022-07-03 00:05:36 [scrapy.dupefilters] DEBUG: Filtered duplicate request: <GET https://parahumans.wordpress.com/category/stories-arcs-1-10/arc-1-gestation/1-01/#content> - no more duplicates will be shown (see DUPEFILTER_DEBUG to show all duplicates)
2022-07-03 00:05:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://parahumans.wordpress.com/category/stories-arcs-1-10/arc-6-tangle/6-02/> (referer: https://parahumans.wordpress.com/)
2022-07-03 00:05:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://parahumans.wordpress.com/category/stories-arcs-1-10/arc-5-hive/5-10/> (referer: https://parahumans.wordpress.com/)
2022-07-03 00:05:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://parahumans.wordpress.com/category/stories-arcs-1-10/arc-5-hive/5-08/> (referer: https://parahumans.wordpress.com/)
2022-07-03 00:05:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://parahumans.wordpress.com/category/stories-arcs-1-10/arc-5-hive/5-09/> (referer: https://parahumans.wordpress.com/)
2022-07-03 00:05:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://parahumans.wordpress.com/category/stories-arcs-1-10/arc-5-hive/5-x-interlude/> (referer: https://parahumans.wordpress.com/)
2022-07-03 00:05:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://parahumans.wordpress.com/category/stories-arcs-1-10/arc-5-hive/5-07/> (referer: https://parahumans.wordpress.com/)
2022-07-03 00:05:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://parahumans.wordpress.com/category/stories-arcs-1-10/arc-6-tangle/6-01/> (referer: https://parahumans.wordpress.com/)
2022-07-03 00:05:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://parahumans.wordpress.com/category/stories-arcs-1-10/arc-6-tangle/> (referer: https://parahumans.wordpress.com/)
2022-07-03 00:05:36 [scrapy.core.engine] INFO: Closing spider (closespider_pagecount)
2022-07-03 00:05:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://parahumans.wordpress.com/category/stories-arcs-1-10/arc-5-hive/5-05/> (referer: https://parahumans.wordpress.com/)
2022-07-03 00:05:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://parahumans.wordpress.com/category/stories-arcs-1-10/arc-5-hive/5-04/> (referer: https://parahumans.wordpress.com/)
2022-07-03 00:05:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://parahumans.wordpress.com/category/stories-arcs-1-10/arc-5-hive/5-03/> (referer: https://parahumans.wordpress.com/)
2022-07-03 00:05:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://parahumans.wordpress.com/category/stories-arcs-1-10/arc-5-hive/5-06/> (referer: https://parahumans.wordpress.com/)
2022-07-03 00:05:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://parahumans.wordpress.com/category/stories-arcs-1-10/arc-5-hive/5-01/> (referer: https://parahumans.wordpress.com/)
2022-07-03 00:05:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://parahumans.wordpress.com/category/stories-arcs-1-10/arc-5-hive/> (referer: https://parahumans.wordpress.com/)
2022-07-03 00:05:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://parahumans.wordpress.com/category/stories-arcs-21/arc-30-speck/30-05/> (referer: https://parahumans.wordpress.com/)
2022-07-03 00:05:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://parahumans.wordpress.com/category/stories-arcs-21/arc-30-speck/30-07/> (referer: https://parahumans.wordpress.com/)
2022-07-03 00:05:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://parahumans.wordpress.com/category/stories-arcs-21/arc-30-speck/30-02/> (referer: https://parahumans.wordpress.com/)
2022-07-03 00:05:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://parahumans.wordpress.com/category/stories-arcs-21/arc-30-speck/30-04/> (referer: https://parahumans.wordpress.com/)
2022-07-03 00:05:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://parahumans.wordpress.com/category/stories-arcs-21/arc-30-speck/30-06/> (referer: https://parahumans.wordpress.com/)
2022-07-03 00:05:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://parahumans.wordpress.com/category/stories-arcs-21/arc-30-speck/30-03/> (referer: https://parahumans.wordpress.com/)
2022-07-03 00:05:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://parahumans.wordpress.com/category/stories-arcs-1-10/arc-5-hive/5-02/> (referer: https://parahumans.wordpress.com/)
2022-07-03 00:05:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://parahumans.wordpress.com/category/stories-arcs-21/arc-30-speck/30-01/> (referer: https://parahumans.wordpress.com/)
2022-07-03 00:05:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://parahumans.wordpress.com/category/stories-arcs-21/arc-30-speck/> (referer: https://parahumans.wordpress.com/)
2022-07-03 00:05:39 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 9450,
'downloader/request_count': 25,
'downloader/request_method_count/GET': 25,
'downloader/response_bytes': 1085925,
'downloader/response_count': 25,
'downloader/response_status_count/200': 25,
'dupefilter/filtered': 7992,
'elapsed_time_seconds': 4.61569,
'finish_reason': 'closespider_pagecount',
'finish_time': datetime.datetime(2022, 7, 3, 3, 5, 39, 261991),
'httpcompression/response_bytes': 4498394,
'httpcompression/response_count': 25,
'log_count/DEBUG': 26,
'log_count/INFO': 10,
'memusage/max': 55934976,
'memusage/startup': 55934976,
'request_depth_max': 2,
'response_received_count': 25,
'scheduler/dequeued': 25,
'scheduler/dequeued/memory': 25,
'scheduler/enqueued': 333,
'scheduler/enqueued/memory': 333,
'start_time': datetime.datetime(2022, 7, 3, 3, 5, 34, 646301)}
2022-07-03 00:05:39 [scrapy.core.engine] INFO: Spider closed (closespider_pagecount)
CodePudding user response:
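You are close to your goal. `/text()` only selects text nodes that are direct children of the matched element, and here the text you want appears to sit inside nested `<a>` links. Use `//a//text()`
instead of `/text()`: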
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.crawler import CrawlerProcess
class WormSpider(CrawlSpider):
    """Crawl parahumans.wordpress.com and export link text from arc pages.

    Every link whose URL matches ``category/stories-arcs-`` is followed and
    its response is passed to :meth:`parse_item`.
    """

    name = 'Worm'

    # Feed-export settings: a ``data.json`` feed in ``json`` format, with
    # the export encoding set to UTF-8.
    custom_settings = {
        "FEEDS": {'data.json': {'format': 'json'}},
        "FEED_EXPORT_ENCODING": "utf-8",
    }

    allowed_domains = ['parahumans.wordpress.com']
    start_urls = ['https://parahumans.wordpress.com/']
    #custom_settings = {'CLOSESPIDER_PAGECOUNT': 10}

    rules = (
        Rule(
            LinkExtractor(allow=r'category/stories-arcs-'),
            callback='parse_item',
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Yield one dict holding the ``Arco`` and ``tags`` text lists.

        Both values are the lists returned by ``.getall()`` on an XPath
        query against ``response``.
        """
        # NOTE(review): ``[@]`` names no attribute and is not a valid XPath
        # predicate -- the attribute test appears to have been lost from
        # this snippet. Restore the original predicate before running.
        # ``//a//text()`` selects every text node under any descendant
        # ``<a>`` of the matched element.
        arc_titles = response.xpath('//h1[@]//a//text()').getall()
        tag_names = response.xpath('//span[@]//a//text()').getall()
        yield {
            'Arco': arc_titles,
            'tags': tag_names,
        }
if __name__ == "__main__":
    # Run the spider directly from this script via CrawlerProcess.
    crawler_process = CrawlerProcess()
    crawler_process.crawl(WormSpider)
    crawler_process.start()
Output:
[
{
"Arco": ["Gestation 1.1"],
"tags": ["Emma", "Madison", "Mr. G", "Sophia", "Taylor"]
},
{
"Arco": ["Agitation 3.9"],
"tags": [
"Aegis",
"Angelica",
"Bitch",
"Brutus",
"Clockblocker",
"Grue",
"Judas",
"Regent",
"Tattletale",
"Taylor",
"Vista"
]
},
{
"Arco": ["Agitation 3.8"],
"tags": [
"Angelica",
"Bitch",
"Brutus",
"Grue",
"Judas",
"Regent",
"Tattletale",
"Taylor"
]
},
{
"Arco": ["Agitation 3.7"],
"tags": [
"Angelica",
"Bitch",
"Brutus",
"Grue",
"Judas",
"Regent",
"Tattletale",
"Taylor"
]
},
{ "Arco": ["Agitation 3.5"], "tags": ["Armsmaster", "Taylor"] },
{
"Arco": [
"Interlude 4",
"Shell 4.11",
"Shell 4.10",
"Shell 4.9",
"Shell 4.8",
"Shell 4.7",
"Interlude 3½ (Bonus)",
"Shell 4.6",
"Shell 4.5",
"Shell 4.4"
],
"tags": [
"Angelica",
"Bitch",
"Brutus",
"Judas",
"Doctor Q",
"Grue",
"Tattletale",
"Taylor",
"Taylor's Dad",
"Bakuda",
"Grue",
"Taylor",
"Bakuda",
"Grue",
"Regent",
"Tattletale",
"Taylor",
"Bakuda",
"Grue",
"Regent",
"Tattletale",
"Taylor",
"Bakuda",
"Grue",
"Regent",
"Tattletale",
"Taylor",
"Über",
"Bakuda",
"Grue",
"Leet",
"Regent",
"Tattletale",
"Taylor",
"Über",
"Grue",
"Leet",
"Regent",
"Tattletale",
"Taylor",
"Grue",
"Regent",
"Tattletale",
"Taylor"
]
},
{
"Arco": ["Agitation 3.4"],
"tags": ["Armsmaster", "Taylor", "Taylor's Dad"]
},
{ "Arco": ["Agitation 3.6"], "tags": ["Tattletale", "Taylor"] },
{
"Arco": ["Shell 4.1"],
"tags": ["Emma", "Grue", "Regent", "Tattletale", "Taylor"]
},
{
"Arco": ["Agitation 3.2"],
"tags": ["Grue", "Regent", "Tattletale", "Taylor"]
},
{
"Arco": ["Agitation 3.1"],
"tags": ["Bitch", "Grue", "Mrs. Knott", "Sophia", "Taylor"]
},
{
"Arco": ["Agitation 3.3"],
"tags": ["Angelica", "Bitch", "Grue", "Regent", "Tattletale", "Taylor"]
... and so on