I really don't know what to do. My teacher is pushing hard on the project deadline, so I'm asking everyone here for help.
Spider file: zongheng.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Spider, Request
from w3lib.html import remove_tags
from zongheng.items import ZonghengItem
from selenium import webdriver


class PassageSpider(Spider):
    name = 'passage'

    def __init__(self):
        self.browser = webdriver.Firefox()
        self.browser.set_page_load_timeout(30)

    def closed(self, spider):
        print('spider closed')
        self.browser.close()

    def start_requests(self):
        start_urls = ['http://book.zongheng.com/store/c0/c0/b0/u4/p48/v9/s1/t0/u0/i1/ALL.html']
        for i in start_urls:
            yield Request(url=i, callback=self.parse, dont_filter=True)

    def parse(self, response):
        # collect the links to all novels on the listing page
        book_url_list = response.xpath(
            "/html/body/div[2]/em/div[1]/div[1]/div/div[2]/div[1]/a/@href"
        ).extract()
        for book_url in book_url_list:
            yield Request(book_url, callback=self.parse_read, dont_filter=True)

    def parse_read(self, response):
        # enter the novel's table-of-contents page
        book_catalogue_list = self.browser.find_element_by_xpath(
            '/html/body/div[2]/div[5]/div[1]/div[1]/div[1]/div[2]/div[5]/div[1][2]/a'
        )
        book_catalogue = book_catalogue_list.get_attribute('href')
        yield Request(book_catalogue, callback=self.parse_chapter)

    def parse_chapter(self, response):
        # collect the chapter links
        book_directory = response.xpath(
            '/html/body/div[@class="container"]/div/div[@class="volume-list"]/div/ul[@class="chapter-list clearfix"]/li/a/@href'
        ).extract()
        for chapter in book_directory:
            yield Request(chapter, callback=self.parse_content)

    def parse_content(self, response):
        # extract the chapter text
        name = response.xpath("/html/body/div[2]/div[3]/div[2]/a[3]/text()").extract_first()
        print(name)
        chapter_name = response.xpath(
            "/html/body/div[2]/div[3]/div[3]/div/div[2]/div[2]/text()"
        ).extract()
        chapter_content0 = response.xpath(
            "/html/body/div[2]/div[3]/div[3]/div/div[5]//text()"
        ).extract()
        chapter_content1 = []
        for chapter in chapter_content0:
            chapter1 = remove_tags(chapter)
            chapter_content1.append(chapter1)
        chapter_content = "".join(chapter_content1)
        item = ZonghengItem()
        item['name'] = name
        item['chap_name'] = chapter_name[0]
        item['chap_content'] = chapter_content
        yield item
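For reference, the ZonghengItem imported above is not shown in the post. Based on the fields the spider fills in (name, chap_name, chap_content), its items.py would presumably look roughly like this; this is an assumed sketch, not the poster's actual file:

# items.py - assumed definition, inferred from the fields used in parse_content
import scrapy

class ZonghengItem(scrapy.Item):
    name = scrapy.Field()          # novel title
    chap_name = scrapy.Field()     # chapter title
    chap_content = scrapy.Field()  # chapter text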
middlewares.py (this part is the code I added; I didn't touch anything else)
import time

from scrapy.http import HtmlResponse
from selenium.common.exceptions import TimeoutException


class SeleniumMiddleware(object):
    def process_request(self, request, spider):
        if spider.name == 'passage':
            try:
                # render the page in the spider's Firefox instance
                spider.browser.get(request.url)
            except TimeoutException:
                print('timeout')
                spider.browser.execute_script('window.stop()')
            time.sleep(2)
            return HtmlResponse(url=spider.browser.current_url,
                                body=spider.browser.page_source,
                                encoding='utf-8', request=request)
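One thing worth checking: a downloader middleware only runs if it is enabled in settings.py. Assuming the Scrapy project is named zongheng, the entry would look roughly like this (the exact module path depends on your project layout):

# settings.py - assumed project name "zongheng"
DOWNLOADER_MIDDLEWARES = {
    'zongheng.middlewares.SeleniumMiddleware': 543,
}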
CodePudding user response:
I tried opening your start_urls. The p48 parameter should refer to page 48 of the listing, and that page only shows a single novel, "the city of the sea battle". Could that be where the problem is? I haven't looked through the rest of the code yet.
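If the goal is to crawl more than that one listing page, one option is to generate the URLs for several p values in start_requests instead of hard-coding p48. A rough sketch, assuming the other URL parameters stay the same and that the page range is p1 through p48:

    def start_requests(self):
        # sketch: iterate listing pages p1..p48 instead of only p48
        for page in range(1, 49):
            url = 'http://book.zongheng.com/store/c0/c0/b0/u4/p%d/v9/s1/t0/u0/i1/ALL.html' % page
            yield Request(url=url, callback=self.parse, dont_filter=True)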