How can I make my web scraper stop on the last page without hard-coding the last page value (Python)?


I am scraping a website with pages 1 to 40: https://gb.kompass.com/d/surrey/gb_gbr09_sw/. As you can see, the pagination has no next button, so I have to pass the page number in the URL I request. I added a range from 1 to 41 because I manually checked how many pages there are, but I don't want to hard-code the last page number. What can I do to make my code more scalable so that it stops on the last page?

Please note that when it goes past the last page, the website automatically goes back to the first page.


import time
import undetected_chromedriver as uc
import pandas as pd
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
    "Upgrade-Insecure-Requests": "1",
    "DNT": "1",
    "Accept": "text/html,application/xhtml xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate"
}

def get_links():
    try:
        driver = uc.Chrome()
        driver.get(
            'https://url')
        time.sleep(15)
        print("driver", driver)
        content = driver.page_source

        soup = BeautifulSoup(content, 'html.parser')
        body = soup.body
        LINKS = []
        for x in range(1, 41):
            tags = body.find_all('div', {'class': 'col col-left company-container'})
            for tag in tags:
                try:
                    a = tag.find_all('a', href=True)
                    print("a", a[0].get('href'))
                    url = a[0].get('href')
                    LINKS.append(url)
                    df = pd.DataFrame({
                        'LINKS': LINKS
                    })
                    df.to_csv('Links.csv', index=False)
                except:
                    pass
                try:
                    next_page = 'https://url/page-' + str(x) + '/'
                    print('next_page', next_page)
                    driver.get(next_page)
                    time.sleep(5)
                    content = driver.page_source
                    soup = BeautifulSoup(content, 'html.parser')
                    body = soup.body
                except:
                    pass

    except:
        pass


if __name__ == '__main__':
    get_links()

CodePudding user response:

Run the scraping in a while-loop and check for some element that exists (or does not exist) only on the last page.

The pagination block is usually useful for this.

For example, the last page has no ">" button linking to a next page.
The last page also has the class "active" on the last element of the pagination, and that is what I use here to detect the last page.
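
The same check can be pulled out into a small helper. A minimal sketch, reusing the ul.pagination li selector from the code below; the fallback test for a ">" link is an assumption about the markup and may need adjusting:

from bs4 import BeautifulSoup

def is_last_page(html):
    """True when the pagination says there is nothing after the current page."""
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.select('ul.pagination li')
    if not items:
        return True                      # no pagination at all: treat as a single page
    last = items[-1]
    if 'active' in (last.get('class') or []):
        return True                      # current page is the last pagination entry
    # assumed markup: on earlier pages the last entry is the ">" button with a link
    return last.find('a', href=True) is None

Inside the while-loop you would then simply write: if is_last_page(html): break.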

In this code I use number = 38 only to test it quickly, but you should start with number = 1.

import time
import undetected_chromedriver as uc
from bs4 import BeautifulSoup
import pandas as pd

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
    "Upgrade-Insecure-Requests": "1",
    "DNT": "1",
    "Accept": "text/html,application/xhtml xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate"
}

def get_links():
    
    # --- before loop ---
    
    LINKS = []
    
    # --- loop ---
    
    try:
        driver = uc.Chrome()
        url = 'https://gb.kompass.com/d/surrey/gb_gbr09_sw/'
        
        number = 38
        #number = 1
        while True:
            print(f'page {number}:', url)
            
            driver.get(url)
            time.sleep(5)
            html = driver.page_source
        
            soup = BeautifulSoup(html, 'html.parser')

            # -----
            
            # collect the company links on the current page
            tags = soup.find_all('div', {'class': 'col col-left company-container'})
            for tag in tags:
                a = tag.find_all('a', href=True)
                link = a[0].get('href')   # separate name so it does not overwrite the page url
                print("link:", link)
                LINKS.append(link)

            # -----

            # on the last page the final pagination entry is the current ('active') page
            pagination = soup.select('ul.pagination li')
            last = pagination[-1]

            if 'active' in last.get('class', []):
                print('it is last page')
                break
            else:
                number += 1
                url = f'https://gb.kompass.com/d/surrey/gb_gbr09_sw/page-{number}/'

    except Exception as ex:
        print('Exception:', ex)

    # --- after loop ---
    
    df = pd.DataFrame({'LINKS': LINKS})
    df.to_csv('Links.csv', index=False)

if __name__ == '__main__':
    get_links()
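
The question also notes that requesting a page past the last one makes the site serve page 1 again. That behaviour gives a second, independent stop condition: stop as soon as a page yields no links you have not already collected. A rough sketch of that check (a hypothetical helper, assuming the company links are unique per page):

def no_new_links(page_links, seen):
    """True when every link on this page was already collected earlier."""
    fresh = [link for link in page_links if link not in seen]
    seen.update(fresh)
    return not fresh

You would create seen = set() before the while-loop, collect the hrefs of the current page into a list, and break when no_new_links(page_links, seen) returns True.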