here is the code
the from scrapy. Linkextractors import LinkExtractor
The from scrapy. Spiders import CrawlSpider, Rule
The class Txzp1Spider (CrawlSpider) :
Name='txzp1'
# allowed_domains=[' hr.tencent.com ']
Start_urls=[' https://hr.tencent.com/position.php?lid=&tid=&keywords=java&start=0#a ']
Rules=(
Rule (LinkExtractor (allow=r 'position. PHP? Lid=& amp; Tid=& amp; Keywords=java& Start=\ d# a '), follow=True),
Rule (LinkExtractor (=r 'allow position_detail. PHP? Id=\ d + & amp; Keywords=java& Tid=0 & amp; Lid=0 '),
The callback="parse_detail", follow=False),
)
Def parse_detail (self, response) :
Print ("===========")
Title=the response. Xpath ("//tr [@ class='h']/td/text () "). The get ()
Region=response. Xpath ("//tr [@ class='c bottomline] [1]/td/text () "). The get ()
Position_type=response. Xpath ("//tr [@ class='c bottomline] [2]/td/text () "). The get ()
Number=response. Xpath ("//tr [@ class='c bottomline] [3]/td/text () "). The get ()
Duty=response. Xpath (
"//table [@ class=" tablelist textl ']//tr [@ class='c'] [1]//ul/@ class='squareli']/li/text () "). The getall ()
Yaoqiu=response. Xpath (
"//table [@ class=" tablelist textl ']//tr [@ class='c'] [2]//ul/@ class='squareli']/li/text () "). The getall ()
Item={" title ": the title," position_type ": position_type," number ": the number of" region ": region," duty ": duty,
"Yaoqiu" : yaoqiu}
Print (item)
CodePudding user response:
Should this address isLinkExtractor (allow=r 'position. PHP? Lid=& amp; Tid=& amp; Keywords=java& Start=\ d# a '), follow=True
Start behind should pick a specific page Numbers, so quite so preach a \ d does not flip, so to get less than content
CodePudding user response:
Should be dynamically the page number to send the past to startCodePudding user response:
Data is empty: less yield itemCodePudding user response:
Rule write wrong, wrong is empty