How to web-scrape multiple pages efficiently using Beautiful Soup?


How long does it normally take to web-scrape multiple pages? I have kept the script running and have not gotten any output. Currently it only scrapes one URL, but I plan to scrape 5 URLs. Does this mean my loop is stuck in an infinite loop?

Additionally, if anyone knows how to make my j loop more efficient instead of accessing the tag elements one by one, that would be great. I assume that is one way to increase the efficiency and speed of my script. Any help is appreciated.

# -*- coding: utf-8 -*-
'''
Import the necessary modules to run the script. 
If error, use pip to install modules
'''
import pandas as pd #Data analysis and manipulation tool
from urllib.request import urlopen, Request #Package that helps in opening URLs
from bs4 import BeautifulSoup as bsoup #Package to pull data out from HTML and XML files
import ssl #TLS/SSL wrapper for accessing OS socket

'''
Use headers to prevent getting blocked by websites when scraping high volumes of data frequently
''' 
def get_headers():
   #Headers
   headers={'accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
           'accept-language':'en-US,en;q=0.9',
           'cache-control':'max-age=0',
           'upgrade-insecure-requests':'1',
           'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
   return headers

'''
Create SSL wrapper and check connection
'''
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
count = 1 # for pagination


'''
Create a list holder
'''
info = {
        'name': [],
        'date': [],
        'address': [],
        'district': [],
        'city': [],
        'price': [],
        'area_sqm': [],
        'rooms': [],
        'floor': [],
        'commission_year': [],
        'total_floors': [],
        'garage': [],
        'balcony': [],
        'windows': [],
        'windows_type': [],
        'door_type': [],
        'leasing': [],
        'description': [],
        'link': []
        }

urls = ['https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/4-r/']

'''
Loop to scrape multiple pages of data.
Change the while count limit to indicate how many pages to scrape.
'''

for i in urls:
    count=1
    y=i
    while(count<2):
        http_request = Request(i, headers=get_headers())
        html_file = urlopen(http_request)
        html_text = html_file.read()
        soup = bsoup(html_text, 'html.parser')
        
        for tag in soup.findAll('div', attrs={'class':'list-announcement-block'}):
            name = tag.find('a', attrs={'itemprop':'name'})
            description = tag.find('div', attrs={'class':'announcement-block__description'})
            link = name['href']
            date = tag.find('div', attrs={'class':'announcement-block__date'})
            price = tag.find('meta', attrs={'itemprop':'price'})
            price2 = tag.find('div', attrs={'class':'announcement-block__price _premium'})
            
            info['name'].append(name['content'] if name else 'N/A')
            info['description'].append(description.get_text().strip() if description else 'N/A')
            info['link'].append('http://www.unegui.mn' + link if link else 'N/A')
            info['date'].append(date.get_text().strip() if date else 'N/A')
            info['price'].append(price['content'] if price else price2.get_text().strip())
            
            for j in info['link']:
                for litag in soup.findAll('ul', attrs={'class':'chars-column'}):
                  floor = litag.find_all(attrs={'class':'value-chars'})[0]
                  balcony = litag.find_all(attrs={'class':'value-chars'})[1]
                  year = litag.find_all(attrs={'class':'value-chars'})[2]
                  garage = litag.find_all(attrs={'class':'value-chars'})[3]
                  window_type = litag.find_all(attrs={'class':'value-chars'})[4]
                  building_floor = litag.find_all(attrs={'class':'value-chars'})[5]
                  door_type = litag.find_all(attrs={'class':'value-chars'})[6]
                  area_sqm = litag.find_all(attrs={'class':'value-chars'})[7]
                  floor = litag.find_all(attrs={'class':'value-chars'})[8]
                  leasing = litag.find_all(attrs={'class':'value-chars'})[9]
                  district = litag.find_all(attrs={'class':'value-chars'})[10]
                  windows_num = litag.find_all(attrs={'class':'value-chars'})[11]
                  location = litag.find_all(attrs={'class':'value-chars'})[12]
                  
                  info['floor'].append(floor.get_text().strip() if date else 'N/A')
                    
# Go to next page
count = count + 1
page = '?page=' + str(count)
i = y + page
     
df = pd.DataFrame(list(zip(info['name'], info['description'], info['link'], info['date'], info['price'], info['floor'])),columns=['Name', 'Description', 'Link', 'Date', 'Price', 'Floor'])
print(df)

CodePudding user response:

There are some probably redundant loops going on here:

floor = litag.find_all(attrs={'class':'value-chars'})[0]
balcony = litag.find_all(attrs={'class':'value-chars'})[1]
...

Could be more efficiently written as:

value_chars = litag.find_all(attrs={'class':'value-chars'})
floor, balcony, ... = value_chars

I.e. don't call the find_all every time. However, it's not clear what you actually do with your variables when you get them. Did you mean to put them in info? And if so, why overwrite them every time?
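
If they are meant to go in info, the unpacked values can be appended straight away. Here is a minimal sketch, assuming the value-chars elements always appear in that fixed order (note that your original code assigns both index 0 and index 8 to floor, which looks like another bug):

value_chars = litag.find_all(attrs={'class': 'value-chars'})
# One find_all call, unpacked once
(floor, balcony, year, garage, window_type, building_floor, door_type,
 area_sqm, floor_num, leasing, district, windows_num, location) = value_chars

info['floor'].append(floor.get_text().strip())
info['balcony'].append(balcony.get_text().strip())
info['commission_year'].append(year.get_text().strip())
# ... and so on for the remaining keys ...

That way each element is looked up once and nothing is silently overwritten.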

Infinite Looping

Here is your while loop, stripped of content:

count = 0
while count < 2:
    ...

count += 1

Do you see the problem? count is never modified inside the loop, so the loop will run forever. Whilst indenting the count += 1 would fix it, use a for loop:

for count in range(1,3):
    ...

or more sensibly here:

for page in range(1,3):
    ...

given that that's apparently what count is. Note that your algorithm implies doing this inside the outer for i in urls loop, but you don't actually do so.
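
A minimal sketch of that combined structure, reusing the urls list, get_headers() and html.parser from your script, might look like this:

for base_url in urls:
    for page in range(1, 3):  # pages 1 and 2; raise the upper bound to scrape more
        page_url = f'{base_url}?page={page}'
        print(f'Currently scraping {page_url}')
        http_request = Request(page_url, headers=get_headers())
        soup = bsoup(urlopen(http_request).read(), 'html.parser')
        # ... find the 'list-announcement-block' divs in soup and append to info here ...

With range driving the counter, there is nothing to forget to increment.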

Avoiding redundant scraping

If you don't need a value, don't scrape it. Currently you look up and then throw away all kinds of things. But the main problem is just the infinite loop.

Progress

A print(f"Currently scraping page {count}") at the top of your while loop would have showed you what was happening. In general when testing loops, print something every time they run, so you can see what is running.

CodePudding user response:

In order to get the floor information, you need to access the href for each property. The code below runs very slowly. This is not due to the performance of Python but is a result of poor response times from the multitude of hyperlinks that we have to visit.

One enhancement over your code is that my answer accounts for all known pages; the number of pages cannot be predetermined. However, if we try to access (for example) page 12 and page 12 doesn't exist, we are redirected to the highest known page, in this example page 11. By keeping track of the last page visited, we can work out when to break out of the loop.

Hope this helps:

import requests
from bs4 import BeautifulSoup as BS
import pandas as pd
import re


def main():
    page = 0
    name = []
    description = []
    link = []
    date = []
    price = []
    floor = []
    
    BASE = 'https://www.unegui.mn'
    URL = f'{BASE}/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/4-r/?page='
    COLUMNS=['Name', 'Description', 'Link', 'Date', 'Price', 'Floor']
    with requests.Session() as session:
        while True:
            (r := session.get(f'{URL}{page + 1}')).raise_for_status()
            m = re.search(r'.*page=(\d+)$', r.url)
            if m and int(m.group(1)) == page:
                break
            page += 1
            print(f'Processing page {page}')
            soup = BS(r.text, 'lxml')
            for tag in soup.findAll('div', class_='list-announcement-block'):
                _name = tag.find('a', attrs={'itemprop': 'name'})
                name.append(_name.get('content', 'N/A'))
                if (_link := _name.get('href', None)):
                    link.append(f'{BASE}{_link}')
                    (_r := session.get(link[-1])).raise_for_status()
                    _floor = BS(_r.text, 'lxml').find('span', class_='value-chars')
                    floor.append(_floor.get_text().strip() if _floor else 'N/A')
                description.append(tag.find('div', class_='announcement-block__description').get_text().strip())
                date.append(tag.find('div', class_='announcement-block__date').get_text().split(',')[0].strip())
                if (_price := tag.find('meta', attrs={'itemprop': 'price'})) is None:
                    _price = tag.find('div', class_='announcement-block__price _premium')
                price.append(_price.get_text().strip() if _price else 'N/A')
        df = pd.DataFrame(zip(name, description, link, date, price, floor), columns=COLUMNS)
        print(df)


if __name__ == '__main__':
    main()