Why can't my spider crawl more pages? Asking the experts for advice

Time:09-15

The main program
 
import scrapy
from scrapyDemo.QSBK.QSBK.items import QsbkItem


class QsbkSpiderSpider(scrapy.Spider):
    name = 'qsbk_spider'
    allowed_domains = ['qsbk.com']
    start_urls = ['https://www.qiushibaike.com/text/page/1/']
    base_domain = "https://www.qiushibaike.com"

    def parse(self, response):
        all_div = response.xpath("//div[@class='col1 old-style-col1']/div")
        for div in all_div:
            author = div.xpath(".//div[@class='author clearfix']/a[2]/h2/text()").getall()[0].strip()
            content = div.xpath(".//div[@class='content']/span/text()").getall()[0].replace("\n", "")
            # duanzi = {"author": author, "content": content}
            item = QsbkItem(author=author, content=content)
            yield item
        # URL of the next page
        next_url = response.xpath("//ul[@class='pagination']/li[last()]/a/@href").get()
        # print(next_url)
        if next_url:
            yield scrapy.Request(self.base_domain + next_url, callback=self.parse)
        else:
            return
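For anyone trying to reproduce this, a spider laid out like the one above is normally started from the project root with the Scrapy CLI, using the name defined on the class (qsbk_spider here):

    scrapy crawl qsbk_spider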


settings.py
 
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 1
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
}
ITEM_PIPELINES = {
    'QSBK.pipelines.QsbkPipeline': 300,
}

 
items.py

import scrapy

class QsbkItem(scrapy.Item):
    author = scrapy.Field()
    content = scrapy.Field()

 
pipelines.py

from scrapy.exporters import JsonLinesItemExporter

class QsbkPipeline:
    def __init__(self):
        # The exporter writes bytes to the file, so open it in "wb" mode
        self.fp = open("duanzi.json", "wb")
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding="utf-8")

    def open_spider(self, spider):
        print("crawler started")

    def process_item(self, item, spider):
        # export the item
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.fp.close()
        print("crawler finished")


The results are as follows: