Home > Other > CrawlSpider cookie problem — asking for help
CrawlSpider cookie problem — asking for help

Time:09-20

 
# -*- coding: utf-8 -*-
import re

import requests
import scrapy
from lxml import etree
from scrapy import Request
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from ..items import KejianItem


class A53kejianSpider(CrawlSpider):
    """Crawl a courseware site (logged in via hard-coded session cookies) and
    extract title / content / extraction code / download URL for each item.

    NOTE(review): the original paste was machine-translation mangled; the
    domain, rule regexes, URL paths and cookie values below are reconstructions
    and must be confirmed against the real site.
    """

    name = ''                          # spider name was redacted in the paste — fill in
    allowed_domains = ['com']          # TODO confirm: domain was redacted in the paste
    start_urls = ['http://www.com/']   # TODO confirm: domain was redacted in the paste

    rules = (
        # Category/listing pages: follow links but do not parse them.
        Rule(LinkExtractor(allow=r'/\w{2,9}/$'), follow=True),
        # Item detail pages: hand off to parse_item.
        Rule(LinkExtractor(allow=r'/\w{2,9}/\d+\.html'),
             callback='parse_item', follow=False),
    )

    def start_requests(self):
        """Override start_requests so the very first request carries the
        session cookies (the site requires a logged-in session)."""
        raw = ('ASPSESSIONIDAQSSQBCQ=ANELFNGBHNAFGJGBPCOOHACN; '
               'Hm_lvt_0cb4e81fdd1a5b0b04d6edd93bfa0928='
               '1587718373,1587728873,1587741214,1587773867; '
               'Hm_lpvt_0cb4e81fdd1a5b0b04d6edd93bfa0928=1587776002')
        # split('=', 1): only split on the FIRST '=', so '=' characters
        # inside a cookie value are not truncated.
        cookies = {pair.split('=', 1)[0]: pair.split('=', 1)[1]
                   for pair in raw.split('; ')}
        yield scrapy.Request(url=self.start_urls[0], cookies=cookies,
                             callback=self.parse, dont_filter=True)

    def parse_item(self, response):
        """Extract one courseware item from a detail page, then chain a
        request for its download link (handled by down_url)."""
        item = KejianItem()
        # title
        item['title'] = response.xpath('//div/h1/text()').get()
        # content: all text fragments of the main body, joined
        content = response.xpath('//div[@class="p20"]/p//text()').getall()
        item['content'] = ''.join(content)
        # extraction code: positions 1, 2 and 9 per the page's text layout
        codes = response.xpath('//*[@id="container"]/div[1]/div[4]//text()').getall()
        item['code'] = codes[1] + codes[2] + codes[9]
        # download address: the numeric item id is embedded in the page URL
        number = ''.join(re.findall(r'\d', response.url))
        # TODO confirm: the download URL path was garbled in the paste
        url = 'http://www.com/plug/down.asp?id=' + number + '&order=0'
        yield scrapy.Request(url, meta={'item': item},
                             callback=self.down_url, dont_filter=True)

    def down_url(self, response):
        """Pull the download URL from the download page and emit the item."""
        item = response.meta['item']
        result = ''.join(response.xpath('//text()').getall())
        # NOTE(review): indexing the JOINED string yields one character, which
        # is almost certainly not a URL. The paste did this too — the real
        # code probably indexed the node list before joining. Verify.
        item['downurl'] = result[1]  # download address
        yield item


I overrode start_requests so that the parse request carries the cookie information, but the download link still cannot be obtained.


Outside the Scrapy framework, using the requests module, I CAN get the download link:
 
# Stand-alone check outside Scrapy: the same request made with the requests
# module successfully returns the download link.
from lxml import etree
import requests

# `number` is the numeric item id extracted earlier (undefined in this
# fragment as pasted).
# TODO confirm: the download URL path was garbled in the paste
url = 'http://www.com/plug/down.asp?id=' + number + '&order=0'
raw = ('ASPSESSIONIDAQSSQBCQ=OIEKFNGBBMBKAOACCCJJCJCK; '
       'Hm_lvt_0cb4e81fdd1a5b0b04d6edd93bfa0928='
       '1587685941,1587718373,1587728873,1587741214; '
       # NOTE(review): this cookie value was garbled in the paste — verify
       '1hl5yp=5f9353%3d6123%3a381; '
       'Hm_lpvt_0cb4e81fdd1a5b0b04d6edd93bfa0928=1587742953')
# split('=', 1) keeps '=' characters inside cookie values intact
cookies = {pair.split('=', 1)[0]: pair.split('=', 1)[1]
           for pair in raw.split('; ')}
resp = requests.get(url, cookies=cookies)  # fetch the download page source
data = etree.HTML(resp.text)
downl = data.xpath('//text()')
print(downl)



**Experts, why is this? In Scrapy I also carry the cookies, so why can't I get the download address?**
  • Related