Home > front end >  How to stop scraping empty pages in category?
How to stop scraping empty pages in category?

Time:01-23

I am trying to scrape content from 4 categories. Yes, the scraping for all 4 categories works. The thing is, I want to prevent non-existing pages from being scraped, but my code below does not do that. The reason the for loop's range goes up to 15 is that my fnb category has 14 pages.

fnb: 14 pages

services: 1 page

electronic: 2 pages

fashion: 4 pages


from bs4 import BeautifulSoup
import requests

def parse():
    """Print the name and description of every shop in each category.

    For each category in ``cate`` the function requests pages 1 to 14 of the
    store directory and prints what it finds on each page.

    NOTE: the page range is fixed, and nothing in the loop stops early, so
    categories with fewer than 14 pages are still requested for the
    remaining (empty) page numbers.
    """
    cate = ["Services", "Food & Beverage", "Fashion & Accessories", "Electronics & Technology"]
    url = "https://www.jurongpoint.com.sg/store-directory"

    for cat in cate:
        for page in range(1, 15):
            print(f'Scraping category {cat} page {page}')
            payload = {
                'level': '',
                'cate': cat,
                'page': page,
            }
            resp = requests.get(url, params=payload)
            soup = BeautifulSoup(resp.text, 'html.parser')

            # NOTE: `link` itself is never used; the lookups below search the
            # whole page once per 'entry-content' div that was found.
            for link in soup.find_all('div', class_='entry-content'):
                try:
                    shops = soup.find_all('div', class_="col-9")
                    names = soup.find_all('tr', class_="clickable")

                    # Pair each name row with the description block at the
                    # same position on the page.
                    for n, k in zip(names, shops):
                        name = n.find_all('td')[1].text.replace(' ', '')
                        desc = k.text.replace(' ', '')
                        # The concatenation operator was missing here, which
                        # made the whole snippet a syntax error.
                        print(name + "\n")
                        print(desc)
                except AttributeError as e:
                    print(e)

# Run the scraper as soon as the script is executed.
parse()

This is a portion of my output from the code above:

enter image description here <- Output shows scraping of empty pages

CodePudding user response:

I'm not sure if this is the type of output you are after. However, the following implementation should help you get rid of empty content issues.

from bs4 import BeautifulSoup
import requests

# Store-directory endpoint that every request is sent to.
url = "https://www.jurongpoint.com.sg/store-directory"

# Category names sent as the 'cate' query parameter.
categories = [
    "Services",
    "Food & Beverage",
    "Fashion & Accessories",
    "Electronics & Technology",
]

# Query-string template used by parse(); 'cate' and 'page' are set per request.
payload = dict(level='', cate='', page='')

def _text_or_empty(node):
    """Return the stripped text of *node*, or "" when the lookup found nothing."""
    return node.get_text(strip=True) if node is not None else ""


def parse():
    """Print the description and name of every shop in each category.

    Pages are requested in order for each entry of ``categories``. As soon as
    a page contains no ``table.table > tbody`` element, the category is
    treated as exhausted and the loop moves on to the next category, so empty
    pages are not scraped.
    """
    for cat in categories:
        for page in range(1, 15):
            print(f'Scraping category {cat} page {page}')
            # Build the query from the template instead of mutating the shared
            # module-level dict, so no state leaks between requests or calls.
            params = {**payload, 'cate': cat, 'page': page}
            resp = requests.get(url, params=params)
            soup = BeautifulSoup(resp.text, 'html.parser')

            # Run the selector once and reuse the result for both the
            # "is this page empty?" check and the extraction loop.
            items = soup.select("table.table > tbody")
            if not items:
                break

            for item in items:
                shop = _text_or_empty(item.select_one("div.col-9"))
                name = _text_or_empty(item.select_one("tr.clickable > td:nth-of-type(2)"))
                print(shop, name)


if __name__ == '__main__':
    parse()

CodePudding user response:

Note: The number of pages can also increase, and you would then lose results. So wouldn't it be easier to use the approach already answered at https://stackoverflow.com/a/75200178/14460824 to avoid static range values in general? Just add the scraping part that extracts the necessary parts.

Example

from bs4 import BeautifulSoup
import requests

# Accumulates one {'name': ..., 'desc': ...} dict per shop across all parse() calls.
data = []


def parse(c):
    """Append the name and description of every shop in category *c* to ``data``.

    Starts at the first page of the category and keeps following the pager's
    "next" link until there is none, so no page count is hard-coded.
    """
    from urllib.parse import urljoin

    url = f'https://www.jurongpoint.com.sg/store-directory/?level=&cate={requests.utils.quote(c)}'
    while True:
        # Name the parser explicitly so the result does not depend on which
        # parser libraries happen to be installed.
        soup = BeautifulSoup(requests.get(url).text, 'html.parser')

        for e in soup.select('tbody:not([id])'):
            strings = list(e.stripped_strings)
            desc_body = e.find_next_sibling('tbody')
            data.append({
                # Fall back to '' rather than raising when a row has no text
                # or no following <tbody>.
                'name': strings[0] if strings else '',
                'desc': desc_body.get_text(strip=True) if desc_body is not None else '',
            })

        next_button = soup.select_one('.PagedList-skipToNext a')
        href = next_button.get('href') if next_button else None
        if not href:
            break
        # Resolve against the current URL; an absolute href is left unchanged.
        url = urljoin(url, href)

# Scrape every category in turn; each call appends its shops to ``data``.
for c in ["Services","Food & Beverage","Fashion & Accessories","Electronics & Technology"]:
    parse(c)

# Bare expression: shows the collected results in an interactive session or
# notebook; it has no effect when run as a plain script.
data

Output

[{'name': 'Keyshoe',
  'desc': 'Our full range of services includes key duplication, access cards and auto gate access setup. Keyshoe also provides dyeing, cleaning and repair services for a wide range of products, ranging from sofas, handbags, luggage, shoes, belts and other leather goods.'},
 {'name': 'LA Barbershop',
  'desc': 'LA Barbershop is one of the first few brands in town to offer a complete bespoke grooming experience for men, from shaving to haircut, massage to ear-cleaning services at value for money prices. Its concept is inspired by the traditional English barbershop culture that dates back to 18th century.'},
 {'name': 'My Digital Lock',
  'desc': 'My Digital Lock specialise in Gateman, KEYWE Smart Home, EPIC and Samsung Digital Lock with Digital lock installation for EC Condo and BTO HDB Fire Rated Main Door in Singapore'},
 {'name': 'POSB', 'desc': ''},
 {'name': '4Fingers Crispy Chicken',
  'desc': 'Hand-painted crispy chicken with a twist!!    All outlets in Singapore are halal.'},...]
  • Related