I am scraping a website with pages from 1 to 40. This is the website: https://gb.kompass.com/d/surrey/gb_gbr09_sw/. As you can see, the pagination has no next button, so I need to pass the page number in my API call. I added a range from 1 to 41 because I manually checked the number of pages, but I don't want to hard-code the last page number. What can I do to make my code more scalable so that it stops on the last page?
Please note that when it reaches the last page, the website automatically goes back to the first page.
import time
import undetected_chromedriver as uc
import pandas as pd
from bs4 import BeautifulSoup
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
    "Upgrade-Insecure-Requests": "1",
    "DNT": "1",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate"
}
def get_links():
    try:
        driver = uc.Chrome()
        driver.get('https://url')
        time.sleep(15)
        print("driver", driver)
        content = driver.page_source
        soup = BeautifulSoup(content, 'html.parser')
        body = soup.body
        LINKS = []
        for x in range(1, 41):
            tags = body.find_all('div', {'class': 'col col-left company-container'})
            for tag in tags:
                try:
                    a = tag.find_all('a', href=True)
                    print("a", a[0].get('href'))
                    url = a[0].get('href')
                    LINKS.append(url)
                    df = pd.DataFrame({
                        'LINKS': LINKS
                    })
                    df.to_csv('Links.csv', index=False)
                except:
                    pass
            try:
                next_page = 'https://url/page-' + str(x) + '/'
                print('next_page', next_page)
                driver.get(next_page)
                time.sleep(5)
                content = driver.page_source
                soup = BeautifulSoup(content, 'html.parser')
                body = soup.body
            except:
                pass
    except:
        pass

if __name__ == '__main__':
    get_links()
CodePudding user response:
Run it in a while loop and check for some element that exists (or does not exist) only on the last page. The pagination widget is usually useful for this.
For example, this site's last page has no > button linking to a next page, and the last element in the pagination carries the class active. I use the active class to detect the last page; a sketch of the missing-next-button check follows below.
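If you prefer the other signal, here is a minimal sketch, assuming the next-page link is rendered as an a element whose text is > inside ul.pagination (verify the real markup in the page source before relying on it):

def has_next_button(soup):
    # Assumption: the next-page link is an <a> labelled ">" in ul.pagination.
    for a in soup.select('ul.pagination li a'):
        if a.get_text(strip=True) == '>':
            return True
    return False

# inside the while loop, after parsing the page:
#     if not has_next_button(soup):
#         break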
In the code below I use number = 38 only to test it quickly, but you should start with number = 1.
import time
import undetected_chromedriver as uc
from bs4 import BeautifulSoup
import pandas as pd
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
    "Upgrade-Insecure-Requests": "1",
    "DNT": "1",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate"
}
def get_links():
    # --- before loop ---
    LINKS = []

    # --- loop ---
    try:
        driver = uc.Chrome()
        url = 'https://gb.kompass.com/d/surrey/gb_gbr09_sw/'
        number = 38   # only for a fast test
        #number = 1   # use this for the real run
        while True:
            print(f'page {number}:', url)
            driver.get(url)
            time.sleep(5)
            html = driver.page_source
            soup = BeautifulSoup(html, 'html.parser')
            # --- collect company links on the current page ---
            tags = soup.find_all('div', {'class': 'col col-left company-container'})
            for tag in tags:
                a = tag.find_all('a', href=True)
                url = a[0].get('href')
                print("url:", url)
                LINKS.append(url)
            # --- detect the last page: its last pagination item has class "active" ---
            pagination = soup.select('ul.pagination li')
            last = pagination[-1]
            if 'active' in last.attrs.get('class'):
                print('it is last page')
                break
            else:
                number += 1
                url = f'https://gb.kompass.com/d/surrey/gb_gbr09_sw/page-{number}/'
    except Exception as ex:
        print('Exception:', ex)

    # --- after loop ---
    df = pd.DataFrame({'LINKS': LINKS})
    df.to_csv('Links.csv', index=False)

if __name__ == '__main__':
    get_links()
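Alternatively, you could exploit the behaviour described in the question, i.e. that the site sends you back to the first page once you go past the last one. This is a minimal sketch, assuming the redirect is reflected in driver.current_url (verify that against the real site); the link-parsing part is elided:

import time
import undetected_chromedriver as uc

def get_last_page(base_url):
    # Assumption: requesting a page past the end redirects back, and the
    # redirect changes driver.current_url to a URL without that page number.
    driver = uc.Chrome()
    number = 1
    while True:
        page_url = base_url if number == 1 else f'{base_url}page-{number}/'
        driver.get(page_url)
        time.sleep(5)
        if number > 1 and f'page-{number}' not in driver.current_url:
            # We were bounced back, so the previous page was the last one.
            break
        # ... parse driver.page_source and collect links here ...
        number += 1
    driver.quit()
    return number - 1

print(get_last_page('https://gb.kompass.com/d/surrey/gb_gbr09_sw/'))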