I want to create a for loop that scrapes a URL with multiple pages. I have found examples of this, but my code requires authentication, so I haven't shared the actual URL. I have entered an example URL that contains the same key identifier, "currentPage=1".
So for this example, for page i it would be currentPage=i, where i will be 1, 2, 3, 4, ...
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

def requests_retry_session(retries=10,
                           backoff_factor=0.3,
                           status_forcelist=(500, 502, 503, 504),
                           session=None):
    session = session or requests.Session()
    retry = Retry(total=retries,
                  read=retries,
                  connect=retries,
                  backoff_factor=backoff_factor,
                  status_forcelist=status_forcelist)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session
import io
import urllib3
import pandas as pd
from requests_kerberos import OPTIONAL, HTTPKerberosAuth
import mwinit2web
a = mwinit2web.get_mwinit_cookie()
urls = "https://example-url.com/ABCD/customer.currentPage=1&end"
def Scraper(url):
    urllib3.disable_warnings()
    with requests_retry_session() as req:
        resp = req.get(url,
                       timeout=30,
                       verify=False,
                       allow_redirects=True,
                       auth=HTTPKerberosAuth(mutual_authentication=OPTIONAL),
                       cookies=a)
    global df
    data = pd.read_html(resp.text, flavor=None, header=0, index_col=0)
    df = pd.concat(data, sort=False)
    print(df)
s = Scraper(urls)
df
CodePudding user response:
pageCount = 4  # range stops before pageCount, so this yields pages 1, 2, 3
urlsList = []
base = "https://example-url.com/ABCD/customer.currentPage={}&end"  # the curly braces are filled in by str.format

for x in range(1, pageCount):
    urlsList.append(base.format(x))
Then you can pass each URL in the list to your function.
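For example, here is a minimal sketch of the full loop, assuming Scraper is tweaked to end with return df instead of setting a global:

all_pages = []  # one DataFrame per page

for url in urlsList:
    all_pages.append(Scraper(url))  # assumes Scraper was changed to `return df`

# stack every page's table into a single DataFrame
df = pd.concat(all_pages, ignore_index=True)
print(df)

Returning the DataFrame rather than using global df also keeps the function reusable if you later scrape several different reports in one script.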