Python -Requests -BS4 and scraping JS websites (ajax)-CodePudding

Trying to scrape

https://online.transport.wa.gov.au/webExternal/registration/;jsessionid=f5niZvGzB71vhrxMujeQPc2lxEWUqj5nc_fuXDvU9ZvvmTEcncK7!1976798565!-891443374?0

and return my Rego details.

I am not getting any information returned. In the HTML text I can see the plate field is completed with my Rego.

import requests
from bs4 import BeautifulSoup


# Endpoint the registration search form is submitted to.
url = 'https://online.transport.wa.gov.au/webExternal/registration/page'


# Headers copied from a browser request.
# NOTE(review): the Referer and Cookie values embed a specific JSESSIONID
# captured from one browser session; presumably they are no longer valid when
# the script runs later -- confirm, or let a requests.Session obtain fresh ones.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 '
                  'Safari/537.36 Edg/107.0.1418.62',
    'Accept-Language': 'en-GB,en;q=0.9,en-US;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Referer': 'https://online.transport.wa.gov.au/webExternal/registration/;jsessionid=_4ndF3LkVxrzXCIH58W_'
               'dGyYg3zAXOpnScRtW2kndXAICNK2kUBx!-1720409949!-1658794803?0',
    'Host': 'online.transport.wa.gov.au',
    'Connection': 'keep-alive',
    # The MIME type is "application/xhtml+xml"; the "+" is required.
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/'
              'signed-exchange;v=b3;q=0.9',
    'DNT': '1',
    'Cache-Control': 'no-cache',
    'Pragma': 'no-cache',
    'Cookie': 'JSESSIONID=70Pb5udz8enjqPcRk4OjOODjfORHr82eUe90pPSpitM83k2EWbRh!-1720409949!-1658794803; TS012ba7f5=0'
              '1becb1e5b6b9eda0a43a7a09fe67c8e8d893f7792d1d8270a95b4267bb1b1754adc3fc326aa6104de47ae36b87d71d4c1afa8f1'
              '73d2583279ddc291f0caef515f0f85c8ed',
    'sec-ch-ua': 'Microsoft Edge";v="107", "Chromium";v="107", "Not=A?Brand";v="24',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': 'windows',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1'
}


# Form payload: the plate number to look up.
data = {"plate": "1hdv242"}

# Submit the form as a single POST with the browser-like headers above.
r = requests.post(url, headers=headers, data=data)


# Parse the returned HTML so it can be inspected.
soup = BeautifulSoup(r.text, 'html.parser')

# Dump the response headers and the parsed page for debugging.
print(r.headers)
print(soup)

This is the code I am using at the moment; I have tried both POST and GET and still no results.

Can someone please advise what the problem is, or what I need to read up on to be able to complete this?

CodePudding user response：

I'm currently answering from an iPad. The site could be handled with a single request if you reverse-engineered the JS function, but I don't have time to do that right now.

Below is a solution using requests

import requests
import re
import pandas as pd

# Default headers applied to the session: only a desktop Firefox User-Agent.
headers = {}
headers["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/107.0"


def main(url):
    """Look up plate ``1hdv242`` on the registration site and print the result.

    Performs three requests inside one ``requests.Session`` so cookies set by
    the first response are reused: a GET of *url*, an Ajax-style POST of the
    search form, and a GET of the ``wicket...`` path found in the POST
    response. The table with class ``registrationTable`` from the final page
    is printed transposed.

    Args:
        url: Base URL of the registration page. The ``wicket...`` path taken
            from the Ajax response is appended directly to it, so it is
            presumably expected to end with ``/`` -- confirm against caller.

    Raises:
        ValueError: If the Ajax response contains no ``wicket...]`` fragment
            to follow.
    """
    with requests.Session() as req:
        req.headers.update(headers)
        # r1.url is the final URL after any redirects; the listener suffix
        # for the search button is appended to it to build the POST target.
        r1 = req.get(url)
        nurl = r1.url + "-1.IBehaviorListener.1-layout-layout_body-registrationRequestForm-searchButton="
        data = {
            "id3_hf_0": "",
            "plate": "1hdv242",
            "searchButton": 1
        }
        # Sent on the POST (and every later request in this session) to mark
        # the request as a Wicket Ajax call.
        req.headers.update({
            'Wicket-Ajax': 'true',
            'Wicket-Ajax-BaseURL': '.'
        })
        r = req.post(nurl, data=data)
        # The response text embeds the relative path of the results page:
        # everything from "wicket" up to the next "]".
        found = re.search(r'(wicket.*?)]', r.text)
        if found is None:
            raise ValueError("no 'wicket...' results path found in the Ajax response")
        match = url + found.group(1)
        r = req.get(match)
        # read_html returns a list of matching tables; take the first one.
        df = pd.read_html(r.content, attrs={'class': 'registrationTable'})[0]
        print(df.T)


# Run the lookup only when executed as a script, so that importing this
# module does not trigger network requests.
if __name__ == "__main__":
    main('https://online.transport.wa.gov.au/webExternal/registration/')

Output:

              0     1      2     3       4                                5
0  Plate Number  Make  Model  Year  Colour  This vehicle licence expires on
1       1HDV242   KIA    RIO  2020   BLACK                       12/02/2023