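I am trying to scrape the WA Department of Transport vehicle registration page (the URL is in the code below)
and return my rego details.
I am not getting any registration information back. In the returned HTML I can see that the plate field has been filled in with my rego, but no results are included.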
import requests
from bs4 import BeautifulSoup
# Question script: POST a plate number straight to the registration page with
# browser-like headers and dump the response headers and parsed HTML.
url = 'https://online.transport.wa.gov.au/webExternal/registration/page'
# NOTE(review): the Referer and Cookie values below embed a jsessionid /
# JSESSIONID copied from one browser session; presumably they expire, so a
# fresh session is likely needed on each run - confirm against the site.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 '
                  'Safari/537.36 Edg/107.0.1418.62',
    'Accept-Language': 'en-GB,en;q=0.9,en-US;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Referer': 'https://online.transport.wa.gov.au/webExternal/registration/;jsessionid=_4ndF3LkVxrzXCIH58W_'
               'dGyYg3zAXOpnScRtW2kndXAICNK2kUBx!-1720409949!-1658794803?0',
    'Host': 'online.transport.wa.gov.au',
    'Connection': 'keep-alive',
    # The media type is "application/xhtml+xml"; the "+" had been lost.
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/'
              'signed-exchange;v=b3;q=0.9',
    'DNT': '1',
    'Cache-Control': 'no-cache',
    'Pragma': 'no-cache',
    'Cookie': 'JSESSIONID=70Pb5udz8enjqPcRk4OjOODjfORHr82eUe90pPSpitM83k2EWbRh!-1720409949!-1658794803; TS012ba7f5=0'
              '1becb1e5b6b9eda0a43a7a09fe67c8e8d893f7792d1d8270a95b4267bb1b1754adc3fc326aa6104de47ae36b87d71d4c1afa8f1'
              '73d2583279ddc291f0caef515f0f85c8ed',
    # Client-hint values are quoted strings; the outer double quotes were missing.
    'sec-ch-ua': '"Microsoft Edge";v="107", "Chromium";v="107", "Not=A?Brand";v="24"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
}
data = {"plate": "1hdv242"}
r = requests.post(url, headers=headers, data=data)
soup = BeautifulSoup(r.text, 'html.parser')
print(r.headers)
print(soup)
This is the code I am using at the moment. I have tried both POST and GET, and neither returns any results.
Can someone please advise what the problem is, or what I need to read up on to be able to get this working?
CodePudding user response:
I am answering from an iPad at the moment. The site could be handled with a single request if you reverse-engineered
its JS function, but I don't have time to do that right now.
Below is a solution using requests:
import requests
import re
import pandas as pd
# Default request headers applied to the session: a desktop Firefox user agent.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) "
        "Gecko/20100101 Firefox/107.0"
    ),
}
def main(url, plate="1hdv242"):
    """Look up a plate on the registration site and print the result table.

    The lookup takes three requests on one session: a GET of ``url``, a
    Wicket Ajax POST that submits the search form, and a GET of the
    ``wicket...`` link found in the Ajax response.

    Args:
        url: Base URL of the registration page; the link extracted from the
            Ajax response is appended to it.
        plate: Plate number to search for.

    Raises:
        RuntimeError: If the Ajax response contains no ``wicket...`` link.
    """
    with requests.Session() as req:
        req.headers.update(headers)
        # Use the URL the first request actually ended up at as the base for
        # the Ajax listener URL.
        r1 = req.get(url)
        nurl = r1.url + "-1.IBehaviorListener.1-layout-layout_body-registrationRequestForm-searchButton="
        data = {
            "id3_hf_0": "",
            "plate": plate,
            "searchButton": 1,
        }
        # Mark the form submission as a Wicket Ajax request.
        req.headers.update({
            'Wicket-Ajax': 'true',
            'Wicket-Ajax-BaseURL': '.',
        })
        r = req.post(nurl, data=data)
        # The Ajax response carries a relative "wicket..." link terminated by "]".
        found = re.search(r'(wicket.*?)]', r.text)
        if found is None:
            raise RuntimeError("no wicket result link found in the Ajax response")
        match = url + found.group(1)
        r = req.get(match)
        df = pd.read_html(r.content, attrs={'class': 'registrationTable'})[0]
        # Transposed so each field becomes a column.
        print(df.T)
# Run the lookup against the registration landing page.
main('https://online.transport.wa.gov.au/webExternal/registration/')
Output:
0 1 2 3 4 5
0 Plate Number Make Model Year Colour This vehicle licence expires on
1 1HDV242 KIA RIO 2020 BLACK 12/02/2023