import re
import json
from scrapy.selector import Selector
try:
    from scrapy.spiders import Spider
except ImportError:
    from scrapy.spiders import BaseSpider as Spider
from scrapy.utils.response import get_base_url
from scrapy.utils.url import urljoin_rfc
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor as sle
from itzhaopin.items import *
from itzhaopin.misc.log import *


class TencentSpider(CrawlSpider):
    name = "tencent"
    allowed_domains = ["tencent.com"]
    start_urls = [
        "http://hr.tencent.com/position.php"
    ]
    rules = [  # define the crawl URL rules
        Rule(sle(allow=("/position.php\?&start=\d{4}#b")), follow=True, callback='parse_item')
    ]

    def parse_item(self, response):  # extract data into Items, mainly with XPath and CSS selectors
        items = []
        sel = Selector(response)
        base_url = get_base_url(response)
        sites_even = sel.css('table.tablelist tr.even')
        for site in sites_even:
            item = TencentItem()
            item['name'] = site.css('.l.square a').xpath('text()').extract()
            relative_url = site.css('.l.square a').xpath('@href').extract()[0]
            item['detailLink'] = urljoin_rfc(base_url, relative_url)
            item['catalog'] = site.css('tr > td:nth-child(2)::text').extract()
            item['workLocation'] = site.css('tr > td:nth-child(4)::text').extract()
            item['recruitNumber'] = site.css('tr > td:nth-child(3)::text').extract()
            item['publishTime'] = site.css('tr > td:nth-child(5)::text').extract()
            items.append(item)
            # print repr(item).decode("unicode-escape") + '\n'
        sites_odd = sel.css('table.tablelist tr.odd')
        for site in sites_odd:
            item = TencentItem()
            item['name'] = site.css('.l.square a').xpath('text()').extract()
            relative_url = site.css('.l.square a').xpath('@href').extract()[0]
            item['detailLink'] = urljoin_rfc(base_url, relative_url)
            item['catalog'] = site.css('tr > td:nth-child(2)::text').extract()
            item['workLocation'] = site.css('tr > td:nth-child(4)::text').extract()
            item['recruitNumber'] = site.css('tr > td:nth-child(3)::text').extract()
            item['publishTime'] = site.css('tr > td:nth-child(5)::text').extract()
            items.append(item)
            # print repr(item).decode("unicode-escape") + '\n'
        info("parsed " + str(response))
        return items

    def _process_request(self, request):
        info('process ' + str(request))
        return request
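The TencentItem class imported from itzhaopin.items is not shown above; here is a minimal sketch of what it presumably declares, inferred purely from the keys parse_item assigns (the real itzhaopin.items module may differ):

from scrapy.item import Item, Field

class TencentItem(Item):
    # fields inferred from the keys assigned in parse_item above
    name = Field()
    detailLink = Field()
    catalog = Field()
    workLocation = Field()
    recruitNumber = Field()
    publishTime = Field()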
Based on this, I wrote a Baidu Baike (encyclopedia) spider in Python that extracts the page links and crawls the title, description, and url:
import re
from scrapy.selector import Selector
from scrapy.utils.response import get_base_url
from scrapy.spiders import Rule, CrawlSpider
from scrapy.linkextractors import LinkExtractor as sle
from baikeSpider.items import *


class BaikeSpider(CrawlSpider):
    name = "baike"
    allowed_domains = ["baike.baidu.com"]
    start_urls = [
        "http://baike.baidu.com/view/21087.htm"
    ]
    rule = [
        Rule(sle(allow=("/view/\d+\.htm")), follow=True, callback='parse_item')
    ]

    def parse_item(self, response):
        sel = Selector(response)
        base_url = get_base_url(response)
        item = BaikespiderItem()
        item['title'] = sel.xpath('//dd[contains(@class, "lemmaWgt-lemmaTitle-title")]/h1/text()').extract()[0]
        item['url'] = base_url
        item['desc'] = sel.xpath('//div[contains(@class, "lemma-summary")]/div/text()').extract()
        return item

    def _process_request(self, request):
        print('process ' + str(request))
        return request
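Likewise, the BaikespiderItem imported from baikeSpider.items is not shown; a minimal sketch inferred from the three keys parse_item assigns (the real items module may differ):

from scrapy.item import Item, Field

class BaikespiderItem(Item):
    # fields inferred from the keys assigned in parse_item above
    title = Field()
    url = Field()
    desc = Field()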
This one can't crawl: in testing, parse_item is never executed. If I rename parse_item to parse, it is able to crawl the start_urls. I'm puzzled: why can the original example be written that way but mine can't? And how should I change mine?
CodePudding user response:
You haven't explained it clearly, and you didn't post all of the code.
CodePudding user response:
Well, which part do you think I didn't explain clearly? And tell me which part of the code is missing — I can post it.
You're funny — I think you're the one who hasn't been clear, and I'm supposed to have been? Since you say one can run successfully and one can't, post both pieces of code.
CodePudding user response:
If you can't tell me where I wasn't clear, how am I supposed to know what you mean? The spider code is posted in full, and the items and pipelines don't affect the crawl. What else do you want me to post???
CodePudding user response:
Scrapy calls parse by default. You want it to use parse_item, which shows that this line never took effect:
rule = [Rule(sle(allow=("/view/\d+\.htm")), follow=True, callback='parse_item')]
The regular expression didn't match view/21087.htm; brush up on regular expressions and fix it yourself.
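For what it's worth, a minimal sketch of a rules definition that CrawlSpider will actually pick up. One thing to note beyond the pattern itself: CrawlSpider only reads a class attribute named rules (plural), so a rule = [...] attribute like the one posted above is silently ignored. The spider name and the logging callback below are placeholders, not the original code:

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class FixedBaikeSpider(CrawlSpider):
    name = "baike_fixed"  # placeholder name
    allowed_domains = ["baike.baidu.com"]
    start_urls = ["http://baike.baidu.com/view/21087.htm"]
    # must be "rules" (plural) for CrawlSpider to compile and apply it
    rules = [
        # r"/view/\d+\.htm" matches numeric pages such as /view/21087.htm
        Rule(LinkExtractor(allow=(r"/view/\d+\.htm",)), follow=True, callback='parse_item'),
    ]

    def parse_item(self, response):
        # placeholder callback, just to show the rule firing
        self.logger.info("parse_item called for %s", response.url)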
CodePudding user response:
Hi OP, I've run into the same problem. Did you ever find a solution in the end?