Home > Other > Why can't my spider crawl more than one page? Asking the experts for help
Why can't my spider crawl more than one page? Asking the experts for help


The main program
import scrapy
from scrapyDemo.QSBK.QSBK.items import QsbkItem


class QsbkSpiderSpider(scrapy.Spider):
    """Crawl the qiushibaike.com text section, one item per post.

    Starts from page 1 and keeps following the link in the last pagination
    entry for as long as one is present on the page.
    """

    # Scrapy needs a spider name; the posted snippet did not show one.
    # NOTE(review): "qsbk_spider" is inferred from the class name (genspider
    # naming convention) -- confirm against the real project.
    name = "qsbk_spider"
    # Must cover the host used in start_urls. The previous value ("qsbk.com")
    # made the offsite filter drop every next-page request, which is why only
    # the first page was ever crawled.
    allowed_domains = ["qiushibaike.com"]
    start_urls = ["https://www.qiushibaike.com/text/page/1/"]

    def parse(self, response):
        """Yield a QsbkItem per post, then a request for the next page.

        Args:
            response: the downloaded listing page.

        Yields:
            QsbkItem objects with ``author`` and ``content`` filled in,
            followed by at most one Request for the next listing page.
        """
        posts = response.xpath("//div[@class='col1 old-style-col1']/div")
        for post in posts:
            # .get(default="") instead of .getall()[0]: a post without an
            # author/content node must not raise IndexError and abort the
            # whole page before the next-page request is scheduled.
            author = post.xpath(
                ".//div[@class='author clearfix']/a[2]/h2/text()"
            ).get(default="").strip()
            content = post.xpath(
                ".//div[@class='content']/span/text()"
            ).get(default="").replace("\n", "")
            yield QsbkItem(author=author, content=content)

        # URL of the next page: href of the link in the last pagination entry.
        next_url = response.xpath(
            "//ul[@class='pagination']/li[last()]/a/@href"
        ).get()
        if next_url:
            # response.follow resolves relative hrefs against the current
            # page, so no separately maintained base-domain attribute is needed.
            yield response.follow(next_url, callback=self.parse)

settings.py
# Headers sent with every request.
# NOTE(review): the post only showed the entries, not the enclosing dict
# names; DEFAULT_REQUEST_HEADERS / ITEM_PIPELINES are the standard Scrapy
# setting names -- confirm against the real settings.py.
DEFAULT_REQUEST_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36"
    ),
}

# Enabled item pipelines, keyed by import path; the value is the run order.
ITEM_PIPELINES = {
    "QSBK.pipelines.QsbkPipeline": 300,
}

import scrapy


class QsbkItem(scrapy.Item):
    """Container for one scraped post."""

    # Author name as extracted by the spider.
    author = scrapy.Field()
    # Text content of the post.
    content = scrapy.Field()

from scrapy.exporters import JsonLinesItemExporter


class QsbkPipeline:
    """Write every scraped item to duanzi.json, one JSON object per line."""

    def __init__(self):
        # The exporter writes bytes to the file, so it is opened in "wb" mode.
        self.fp = open("duanzi.json", "wb")
        self.exporter = JsonLinesItemExporter(
            self.fp, ensure_ascii=False, encoding="utf-8"
        )

    def open_spider(self, spider):
        """Start the export when the spider opens."""
        self.exporter.start_exporting()
        print("crawlers start")

    def process_item(self, item, spider):
        """Export the item and pass it on unchanged to later pipelines."""
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        """Finish the export and close the output file when the spider closes."""
        self.exporter.finish_exporting()
        self.fp.close()
        print("end of the crawler")

The results are as follows:
  • Related