Target address: http://www.ccgp-gansu.gov.cn/web/article/128/0/index.htm
The site takes a POST submission and returns HTML text; the full code I am using is below.
What I want to crawl: the project entries in the result list.
Problem: the body that scrapy receives contains no ul/li list data.
Tried to solve it with cookiejar: True in the request meta, but there is still no data.
Hoping someone knowledgeable can give me a hint, much appreciated.
Spider source file:
# -*- coding: utf-8 -*-
import re
import scrapy
import scrapy_splash
from demo.items import DemoItem
from datetime import datetime


class GgzyfwSpider(scrapy.Spider):
    name = 'GSCCGP'
    allowed_domains = ['www.ccgp-gansu.gov.cn']
    start_urls = ['http://www.ccgp-gansu.gov.cn/web/doSearchmxarticle.action']
    url = 'http://www.ccgp-gansu.gov.cn/web/doSearchmxarticle.action'

    def get_form_data(self, page):
        content = {'articleSearchInfoVo.releasestarttime': '',
                   'articleSearchInfoVo.releaseendtime': '',
                   'articleSearchInfoVo.tflag': '1',
                   'articleSearchInfoVo.classname': '128',
                   'articleSearchInfoVo.dtype': '0',
                   'articleSearchInfoVo.days': '',
                   'articleSearchInfoVo.releasestarttimeold': '',
                   'articleSearchInfoVo.releaseendtimeold': '',
                   'articleSearchInfoVo.title': '',
                   'articleSearchInfoVo.agentname': '',
                   'articleSearchInfoVo.bidcode': '',
                   'articleSearchInfoVo.proj_name': '',
                   'articleSearchInfoVo.buyername': '',
                   'total': '5402',
                   'limit': '20',
                   'current': str(page),
                   'SJM': '7466'}
        return content

    def start_requests(self):
        yield scrapy_splash.SplashFormRequest(method='post',
                                              formdata=self.get_form_data(1),
                                              url=self.url,
                                              callback=self.parse)

    def parse(self, response):
        tr_list = response.xpath("//ul[@class='Expand_SearchSLisi']/li")
        if not tr_list:
            return
        current = self.settings.get('CURRENT_DATA')
        domain = 'http://www.ccgp-gansu.gov.cn'
        for li in tr_list:
            date_str = li.xpath("string(.//span[1]//text())").get().strip()
            # e.g. "Opening time: | Release date:2020-03-12 20:41:01 | Purchaser:Kongdong
            # district town government, Pingliang | Agent:Gansu Haitian construction
            # engineering cost consulting Co., Ltd."
            date_arr = date_str.split('|')
            # maxsplit=1, so the colons inside the time itself survive
            date = date_arr[1].split(':', 1)[1].strip()
            buy_person = date_arr[2].split(':')[1].strip()
            middle_name = date_arr[3].split(':')[1].strip()
            if date:
                date_time = datetime.strptime(date, "%Y-%m-%d %H:%M:%S")
                now_time = datetime.now()
                diff_day = (now_time - date_time).days
                if diff_day > current:
                    # can't simply close the spider here, because items already
                    # scheduled (e.g. images) would then go unprocessed
                    # self.crawler.engine.close_spider(self, 'date')
                    # print('>>> date out of range')
                    return
                # print('>>> can continue')
            item = DemoItem()
            item['publish_date'] = date
            item['source_url'] = self.start_urls[0]
            item['project_name'] = li.xpath(".//a//text()").get()
            href = li.xpath('.//a/@href').get()
            item['url'] = domain + href
            # e.g. "Cancelled/terminated notice | Kongdong district, Pingliang |
            # agriculture, forestry, animal husbandry, fishery"
            other_str = li.xpath("string(.//span/strong//text())").get().strip()
            other_arr = other_str.split('|')
            item['status'] = other_arr[0].strip()
            item['buy_area'] = other_arr[1].strip()
            item['project_type_name'] = other_arr[2].strip()
            item['buy_person'] = buy_person
            item['middle_name'] = middle_name
            print(item)
            # yield item
        depth = response.meta.get('depth', 0)
        page = depth + 1
        url = domain + '/web/doSearchmxarticle.action?limit=20&start=' + str(page * 20)
        yield scrapy.Request(url=url, callback=self.parse)
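One pitfall worth noting in the metadata line that `parse` splits apart: splitting the "Release date:..." field on ':' without a maxsplit truncates the timestamp at the first colon inside the time portion, so `strptime` would fail. A small stand-alone check (the sample line below is a translated, made-up stand-in for the real list text):

```python
from datetime import datetime

# Hypothetical metadata line matching the format described in the comment above
line = ("Opening time: | Release date:2020-03-12 20:41:01 | "
        "Purchaser:Kongdong district government | Agent:Gansu Haitian Consulting")

date_field = line.split('|')[1]
# Naive split on every colon stops inside "20:41:01":
truncated = date_field.split(':')[1].strip()   # '2020-03-12 20'
# maxsplit=1 keeps everything after the first colon (the label separator):
date = date_field.split(':', 1)[1].strip()     # '2020-03-12 20:41:01'
parsed = datetime.strptime(date, "%Y-%m-%d %H:%M:%S")
print(truncated)
print(date)
```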
CodePudding user response:
Excerpt of the code: if you run the request with plain requests and only then run the parsing code, the data comes back successfully.
# -*- coding: utf-8 -*-
import re
from datetime import datetime


class GgzyfwSpider():
    name = 'GSCCGP'
    allowed_domains = ['www.ccgp-gansu.gov.cn']
    start_urls = ['http://www.ccgp-gansu.gov.cn/web/doSearchmxarticle.action']
    url = 'http://www.ccgp-gansu.gov.cn/web/doSearchmxarticle.action'

    def get_form_data(self, page):
        content = {'articleSearchInfoVo.releasestarttime': '',
                   'articleSearchInfoVo.releaseendtime': '',
                   'articleSearchInfoVo.tflag': '1',
                   'articleSearchInfoVo.classname': '128',
                   'articleSearchInfoVo.dtype': '0',
                   'articleSearchInfoVo.days': '',
                   'articleSearchInfoVo.releasestarttimeold': '',
                   'articleSearchInfoVo.releaseendtimeold': '',
                   'articleSearchInfoVo.title': '',
                   'articleSearchInfoVo.agentname': '',
                   'articleSearchInfoVo.bidcode': '',
                   'articleSearchInfoVo.proj_name': '',
                   'articleSearchInfoVo.buyername': '',
                   'total': '5402',
                   'limit': '20',
                   'current': str(page),
                   'SJM': '7466'}
        return content

    def start_requests(self):
        import requests
        resp = requests.post(url=self.url, data=self.get_form_data(1))
        self.parse(resp)

    def parse(self, response):
        import lxml
        # (the snippet ends here in the original post)
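The snippet above stops at `import lxml`, but the remaining step would be to pull each list item's link text and turn the relative href into an absolute URL. A minimal stdlib sketch of that step (using `html.parser` rather than lxml, with a hypothetical fragment mimicking the result-list markup):

```python
from html.parser import HTMLParser
from urllib.parse import urljoin

DOMAIN = 'http://www.ccgp-gansu.gov.cn'

class ListParser(HTMLParser):
    """Collects [title, absolute_url] pairs from <a> tags in the fed HTML."""
    def __init__(self):
        super().__init__()
        self.in_link = False
        self.results = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            href = dict(attrs).get('href', '')
            # Resolve the relative href against the site domain
            self.results.append([None, urljoin(DOMAIN, href)])
            self.in_link = True

    def handle_data(self, data):
        if self.in_link and data.strip():
            self.results[-1][0] = data.strip()

    def handle_endtag(self, tag):
        if tag == 'a':
            self.in_link = False

parser = ListParser()
# Made-up fragment in the shape of the result list
parser.feed('<ul class="Expand_SearchSLisi">'
            '<li><a href="/web/article/128/123.htm">Some tender notice</a></li></ul>')
print(parser.results)
```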
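One more sanity check worth doing offline: confirm that the list selector itself, with a properly bracketed predicate (`//ul[@class='Expand_SearchSLisi']/li`), matches the markup you expect. The stdlib `xml.etree` supports this limited XPath form, so it can be tested against a made-up fragment without any network access:

```python
import xml.etree.ElementTree as ET

# Hypothetical fragment mimicking the list markup on the results page
html = """
<div>
  <ul class="Expand_SearchSLisi">
    <li><a href="/web/article/1.htm">Project A</a></li>
    <li><a href="/web/article/2.htm">Project B</a></li>
  </ul>
  <ul class="other"><li>noise</li></ul>
</div>
"""

root = ET.fromstring(html)
# The bracketed attribute predicate selects only <li> items in the target <ul>
items = root.findall(".//ul[@class='Expand_SearchSLisi']/li")
titles = [li.find('a').text for li in items]
print(titles)  # ['Project A', 'Project B']
```

If the same expression returns nothing against the live response body, the list is most likely not present in the HTML at all (rendered client-side or returned only for the right POST), which matches the behavior described in the question.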