Home > Enterprise >  Create a for loop to webscrape multiple pages from multiple URLs using beautifulsoup
Create a for loop to webscrape multiple pages from multiple URLs using beautifulsoup

Time:10-26

I am trying to scrape multiple pages from multiple URLs efficiently. I have been able to scrape multiple pages from one URL successfully, but I am unable to implement this for multiple URLs. Any help would be greatly appreciated. Thank you.

Current Loop Code:

BASE = 'https://www.unegui.mn'
URL = f'{BASE}/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/5-r/?page='
COLUMNS=['Name','Date','Address','District','City','Price','Area_sqm','Rooms','Floor','Commission_year',
         'Building_floors','Garage', 'Balcony','Windows','Window_type','Floor_type','door_type','Leasing','Description','Link']
page = 0  # number of the last page fetched so far
with requests.Session() as session:
    while True:
        # Ask for the page after the last one fetched.
        (r := session.get(f'{URL}{page + 1}')).raise_for_status()
        # r.url is the final URL after any redirect. If its page number is
        # still the page already fetched, there is no further page
        # (presumably the site redirects past-the-end requests to the last
        # valid page -- confirm against the site).
        m = re.search(r'.*page=(\d+)$', r.url)
        if m and int(m.group(1)) == page:
            break
        page += 1
        print(f'Scrapping page {page}')

Desired URL Loop:

The only part that changes from one URL to the next is the 1-r, 2-r, 3-r, ... segment. There are 5 URLs in total.

# One paginated listing URL per `<n>-r` segment, n = 1..5; the page number
# is appended to each entry when it is requested.
URL = [
    f'{BASE}/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/{n}-r/?page='
    for n in range(1, 6)
]

Full Code:

import requests
from bs4 import BeautifulSoup as BS
from datetime import datetime
import pandas as pd
import re
import csv

# Current local date as two-digit year, month and day followed by a space
# (the trailing space is part of the format string).
today = datetime.now().strftime('%y%m%d ')

# Column names filled from the detail page, in the order the matching
# elements are read from it (index 0, 1, 2, ...). The ordering mirrors the
# positional lookups of the original script -- verify against the live page.
_DETAIL_SPAN_FIELDS = ('Floor_type', 'Balcony', 'Garage', 'Window_type',
                       'door_type', 'Windows')
_DETAIL_LINK_FIELDS = ('Commission_year', 'Building_floors', 'Area_sqm',
                       'Floor', 'Leasing', 'District', 'Address')


def _text_at(elements, index):
    """Return the stripped text of ``elements[index]``, or 'N/A' if absent."""
    if index < len(elements):
        return elements[index].get_text().strip()
    return 'N/A'


def _block_text(tag, css_class):
    """Return the stripped text of the first matching ``div``, or 'N/A'."""
    found = tag.find('div', class_=css_class)
    return found.get_text().strip() if found is not None else 'N/A'


def _scrape_details(session, url):
    """Fetch a listing's detail page and return its 'value-chars' fields."""
    (response := session.get(url)).raise_for_status()
    # Parse the page once and reuse the tree for both element kinds.
    soup = BS(response.text, 'lxml')
    spans = soup.find_all('span', class_='value-chars')
    links = soup.find_all('a', class_='value-chars')
    details = {
        field: _text_at(spans, i)
        for i, field in enumerate(_DETAIL_SPAN_FIELDS)
    }
    details.update(
        (field, _text_at(links, i))
        for i, field in enumerate(_DETAIL_LINK_FIELDS)
    )
    return details


def _parse_listing(session, base, tag):
    """Build one complete row (column name -> value) for a listing block."""
    # Start every detail column at 'N/A' so a listing without a link still
    # yields a full row instead of shifting the columns of later rows.
    row = dict.fromkeys(_DETAIL_SPAN_FIELDS + _DETAIL_LINK_FIELDS, 'N/A')
    row['Link'] = 'N/A'

    name_tag = tag.find('a', attrs={'itemprop': 'name'})
    row['Name'] = name_tag.get('content', 'N/A') if name_tag is not None else 'N/A'
    href = name_tag.get('href') if name_tag is not None else None
    if href:
        row['Link'] = f'{base}{href}'
        row.update(_scrape_details(session, row['Link']))

    crumbs = _block_text(tag, 'announcement-block__breadcrumbs').split('»')
    row['Rooms'] = crumbs[1].strip() if len(crumbs) > 1 else 'N/A'
    row['Description'] = _block_text(tag, 'announcement-block__description')
    row['Date'] = _block_text(tag, 'announcement-block__date').split(',')[0].strip()

    city_meta = tag.find('meta', attrs={'itemprop': 'areaServed'})
    row['City'] = city_meta.get('content') if city_meta is not None else 'N/A'

    price_meta = tag.find('meta', attrs={'itemprop': 'price'})
    if price_meta is not None:
        # A <meta> element has no text; its value is in the `content`
        # attribute, exactly as for `areaServed` above.
        row['Price'] = (price_meta.get('content') or 'N/A').strip()
    else:
        premium = tag.find('div', class_='announcement-block__price _premium')
        row['Price'] = premium.get_text().strip() if premium is not None else 'N/A'
    return row


def main(categories=(5,)):
    """Scrape every result page of the given listing categories.

    For each category number ``n`` the paginated URL
    ``.../oron-suuts-zarna/<n>-r/?page=`` is walked page by page. Each
    listing block on a page, together with its detail page, becomes one row.

    Args:
        categories: Iterable of numbers substituted into the ``<n>-r`` URL
            segment. The default ``(5,)`` scrapes only the ``5-r`` URL, as
            the original script did; pass ``range(1, 6)`` for all five.

    Returns:
        A pandas DataFrame with one row per listing and the columns listed
        in ``COLUMNS``. Values that cannot be found are 'N/A'.

    Raises:
        requests.HTTPError: If any request returns an error status.
    """
    BASE = 'https://www.unegui.mn'
    COLUMNS = ['Name', 'Date', 'Address', 'District', 'City', 'Price', 'Area_sqm', 'Rooms', 'Floor', 'Commission_year',
               'Building_floors', 'Garage', 'Balcony', 'Windows', 'Window_type', 'Floor_type', 'door_type', 'Leasing', 'Description', 'Link']
    page_number = re.compile(r'.*page=(\d+)$')
    rows = []

    with requests.Session() as session:
        for category in categories:
            URL = f'{BASE}/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/{category}-r/?page='
            page = 0  # last page scraped for this category
            while True:
                (r := session.get(f'{URL}{page + 1}')).raise_for_status()
                # r.url is the final URL after redirects. If it still points
                # at the page already scraped, there is no further page
                # (presumably past-the-end requests are redirected to the
                # last valid page -- confirm against the site).
                m = page_number.search(r.url)
                if m and int(m.group(1)) == page:
                    break
                page += 1
                print(f'Scrapping page {page}')
                soup = BS(r.text, 'lxml')
                blocks = soup.find_all('div', class_='list-announcement-block')
                if not blocks:
                    # A page without listings has nothing more to offer;
                    # stop rather than requesting pages indefinitely.
                    break
                for tag in blocks:
                    rows.append(_parse_listing(session, BASE, tag))

    # Building from per-listing dicts keeps every value in its own row;
    # `columns` fixes the column order.
    return pd.DataFrame(rows, columns=COLUMNS)

if __name__ == '__main__':
    # Scrape, then write the table to '<today>HPD.csv' without the index
    # column. With errors='ignore', characters that cp1251 cannot encode
    # are dropped from the output.
    df = main()
    output_file = f'{today}HPD.csv'
    df.to_csv(output_file, index=False, encoding='cp1251', errors='ignore')

CodePudding user response:

You can combine for loops with Python's range() function.

The range() function provides a sequence of integers based upon the function's arguments.

range(start, stop[, step])

The start argument is the first value in the range. If range() is called with only one argument, then Python assumes start = 0.

The stop argument is the upper bound of the range. It is important to realize that this upper value is not included in the range.

Example:

# The site root is the same for every URL, so it is set once up front.
BASE = 'https://www.unegui.mn'
for i in range(1, 6):  # yields 1, 2, 3, 4, 5 -- the stop value 6 is excluded
    URL = BASE + f'/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/{i}-r/?page='
    print(URL)

Output:

https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/1-r/?page=
https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/2-r/?page=
https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/3-r/?page=
https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/4-r/?page=
https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/5-r/?page=
  • Related