Home > Enterprise >  Parsing ASPX site with Python POST request
Parsing ASPX site with Python POST request

Time:10-19

I am trying to perform parsing, but when I send a POST request to get the search results, I get a page with this error: The requested URL was rejected. Please consult with your administrator.

Website: https://prod.ceidg.gov.pl/CEIDG/CEIDG.Public.UI/Search.aspx

I've collected data such as the viewstate, viewstategenerator, etc. to pass through the form, but it doesn't work. What am I missing?

#import requests
from bs4 import BeautifulSoup
import lxml
import urllib
from requests_html import HTMLSession
from requests_html import AsyncHTMLSession
import time

#s = HTMLSession(browser_args=["--no-sandbox", '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'])
# One session so the cookies set on the GET are reused on the POST.
s = HTMLSession()

# FIX: real HTTP header names use hyphens ('User-Agent', 'Accept').
# 'User_Agent' / 'HTTP_ACCEPT' are CGI environment-variable spellings, so the
# server (and its WAF) saw a request with no User-Agent at all.  Also restored
# the '+' that was lost from 'application/xhtml+xml'.
header_simple = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Content-Type': 'application/x-www-form-urlencoded',
}

# FIX: fetch over https (same scheme as the POST) so the session cookies set
# here are sent back on the POST; also send the browser-like headers on the
# first request, not just the second.
r = s.request('get', 'https://prod.ceidg.gov.pl/CEIDG/CEIDG.Public.UI/Search.aspx', headers=header_simple)
soup_dummy = BeautifulSoup(r.content, "lxml")

# Parse and retrieve the ASP.NET state fields the server validates on postback.
viewstate = soup_dummy.select("#__VIEWSTATE")[0]['value']
viewstategen = soup_dummy.select("#__VIEWSTATEGENERATOR")[0]['value']
eventvalidation = soup_dummy.select("#__EVENTVALIDATION")[0]['value']
english = soup_dummy.select("#hfEnglishWebsiteUrl")[0]['value']

data = {
    '__VIEWSTATE': viewstate,
    '__VIEWSTATEGENERATOR': viewstategen,
    '__EVENTVALIDATION': eventvalidation,
    'ctl00$MainContent$txtName': 'bank',
    'ctl00$MainContent$cbIncludeCeased': 'on',
    'ctl00$MainContent$btnSearch': 'Find',
    'ctl00$hfAuthRequired': 'False',
    'ctl00$hfEnglishWebsiteUrl': english,
    'ctl00$stWarningLength': '30',
    'ctl00$stIdleAfter': '1200',
    'ctl00$stPollingInterval': '60',
    'ctl00$stMultiTabTimeoutSyncInterval': '20'
}
time.sleep(3)

# FIX: the form fields must travel in the POST body (data=), not as URL query
# parameters (params=) -- ASP.NET reads the postback from the request body,
# and an over-long query string is exactly the kind of thing a WAF rejects.
p = s.request('post', 'https://prod.ceidg.gov.pl/CEIDG/CEIDG.Public.UI/Search.aspx', data=data, headers=header_simple)

print(p.content)

CodePudding user response:

This is one of the ways you can populate results from that page using the requests module. Be sure to include all of the form's keys and values in the data parameter when sending the POST request in order to access the desired content.

Working script:

import lxml
import requests
from pprint import pprint
from bs4 import BeautifulSoup

with requests.Session() as session:
    # Present a desktop-browser identity for every request in this session.
    session.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'

    # The initial GET primes the session cookies and yields the hidden
    # ASP.NET state fields (__VIEWSTATE, __EVENTVALIDATION, ...).
    landing = session.get('http://prod.ceidg.gov.pl/CEIDG/CEIDG.Public.UI/Search.aspx')
    page = BeautifulSoup(landing.text, "lxml")

    # Start from every named <input> on the page, then overlay the search terms.
    form_fields = {}
    for tag in page.select('input[name]'):
        form_fields[tag['name']] = tag.get('value', '')

    form_fields['ctl00$MainContent$txtName'] = 'bank'
    form_fields['ctl00$MainContent$cbIncludeCeased'] = 'on'
    form_fields['ctl00$MainContent$btnSearch'] = 'Find'
    # Only one submit button may be "pressed" per postback, so drop the others.
    del form_fields['ctl00$MainContent$btnClear']
    del form_fields['ctl00$versionDetails$btnClose']

    # pprint(form_fields)   # inspect the payload that will be posted

    result = session.post('https://prod.ceidg.gov.pl/CEIDG/CEIDG.Public.UI/Search.aspx', data=form_fields)
    page = BeautifulSoup(result.text, "lxml")
    print(page.select_one("table#MainContent_DataListEntities"))
  • Related