I am trying to scrape data from https://www.brickeconomy.com/sets/theme/collectable-minifigures (I just need the URL of each LEGO set's page), but the site paginates via the JavaScript __doPostBack function. From other relevant answers I gathered that I need to inspect the POST request and identify the form data it sends, as seen here:
Screenshot of Request's Form Data
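As a side note, the hidden ASP.NET fields the form posts back can also be listed straight from the parsed page instead of reading them off the screenshot; a rough sketch of that check (just for illustration, not part of my actual script):

import requests
from bs4 import BeautifulSoup

url = "https://www.brickeconomy.com/sets/theme/collectable-minifigures"
soup = BeautifulSoup(requests.get(url).content, 'html.parser')

# List every hidden input the form would send back (e.g. __VIEWSTATE, __VIEWSTATEGENERATOR)
for inp in soup.find_all('input', {'type': 'hidden'}):
    print(inp.get('name'), '=>', (inp.get('value') or '')[:60])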
My current code is as follows:
import requests
from bs4 import BeautifulSoup
url = "http://www.brickeconomy.com/sets/theme/collectable-minifigures"
page_content = requests.get(url).content
soup = BeautifulSoup(page_content, 'html.parser')
VIEWSTATEGENERATOR = soup.find('input',{'id':'__VIEWSTATEGENERATOR'}).get('value')
VIEWSTATE = soup.find('input',{'id':'__VIEWSTATE'}).get('value')
headers = {'user-agent': 'Mozilla/5.0'}
data = {
    "ctl00$ScriptManager1": "ctl00$ContentPlaceHolder1$ctlSets$UpdatePanelMain|ctl00$ContentPlaceHolder1$ctlSets$GridViewSets",
    "ctl00$txtSearchHeader2": "",
    "ctl00$txtSearchHeader": "",
    "subthemesorter": "",
    "setsorter": "SetNumberDESC",
    "ctl00$LoginModalUsername": "",
    "ctl00$LoginModalPassword": "",
    "__EVENTTARGET": "ctl00$ContentPlaceHolder1$ctlSets$GridViewSets",
    "__EVENTARGUMENT": "Page$2",
    "__VIEWSTATE": VIEWSTATE,
    "__VIEWSTATEGENERATOR": VIEWSTATEGENERATOR,
    "__ASYNCPOST": 'true'
}
res = requests.post(url, data=data, headers=headers).content
BeautifulSoup(res, 'html.parser').find_all(class_='mb-5')
However, it still returns the data from the first page. I would appreciate any advice here. Thank you!
CodePudding user response:
You were sending the POST request to the wrong URL. Once I replaced your existing URL with the correct one, the script started to work:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
base = 'https://www.brickeconomy.com'
start_url = 'http://www.brickeconomy.com/sets/theme/collectable-minifigures'
post_url = 'https://www.brickeconomy.com/sets/theme/sets/theme/collectable-minifigures'
data = {
    "ctl00$ScriptManager1": "ctl00$ContentPlaceHolder1$ctlSets$UpdatePanelMain|ctl00$ContentPlaceHolder1$ctlSets$GridViewSets",
    "ctl00$txtSearchHeader2": "",
    "ctl00$txtSearchHeader": "",
    "subthemesorter": "",
    "setsorter": "SetNumberDESC",
    "ctl00$LoginModalUsername": "",
    "ctl00$LoginModalPassword": "",
    "__EVENTTARGET": "ctl00$ContentPlaceHolder1$ctlSets$GridViewSets",
    "__EVENTARGUMENT": "Page$2",
    "__ASYNCPOST": 'true'
}
with requests.Session() as s:
    s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'
    # Fetch the first page to pick up the ASP.NET hidden fields
    r = s.get(start_url)
    soup = BeautifulSoup(r.text, "lxml")
    data['__VIEWSTATE'] = soup.find('input', {'id': '__VIEWSTATE'}).get('value')
    data['__VIEWSTATEGENERATOR'] = soup.find('input', {'id': '__VIEWSTATEGENERATOR'}).get('value')
    # Post the pagination request to the correct endpoint
    res = s.post(post_url, data=data)
    soup = BeautifulSoup(res.text, "lxml")
    for item in soup.select("table.table > tr h4 > a"):
        inner_url = urljoin(base, item.get("href"))
        print(inner_url)
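The script above only fetches Page$2. If you want every page, the same POST can be repeated inside the with requests.Session() as s: block (in place of the single s.post call), bumping the page number in __EVENTARGUMENT each time. This is only a rough sketch resting on two assumptions I have not verified against the site: that the __VIEWSTATE taken from the first page keeps being accepted for later pages, and that a page past the end comes back with an empty results table:

page = 2
while True:
    data['__EVENTARGUMENT'] = f'Page${page}'  # Page$2, Page$3, ...
    res = s.post(post_url, data=data)
    soup = BeautifulSoup(res.text, "lxml")
    links = soup.select("table.table > tr h4 > a")
    if not links:
        break  # assumption: no rows returned means we ran past the last page
    for item in links:
        print(urljoin(base, item.get("href")))
    page += 1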