Home > Back-end >  How to scrape multiple urls in one Python script with Beautiful soup
How to scrape multiple urls in one Python script with Beautiful soup

Time:10-07

I have the following Python script that works well for what I need and gives me the output I want. However, I have another url (https://www.website2/page-) that I'd like to add to the script. Currently I manually swap the urls and run them as separate scripts but I'd like to do it in one go, is this possible?

Ps - required script for each site is identical other than the url property. TIA!

import itertools
import random
import time
import typing
import signal

import requests
from bs4 import BeautifulSoup

from model import Model, Data

RUNNING = True


def sigint_handler(*args: typing.Any) -> None:
    global RUNNING
    print("Signal received, exiting gracefully ...")
    RUNNING = False


def scrape(url: str, model: Model, session: requests.Session, headers: typing.Dict[str, str]) -> None:
    for page in itertools.count(1):
        if not RUNNING:
            break
        req = session.get(f"{url}{page}", headers=headers)
        soup = BeautifulSoup(req.content, 'html.parser')

        for li in soup.find_all('li', class_="container"):
            title = li.find('h2').text
            price = li.find('p', class_="price-text").text
            print(f"Title: {title}, Price: {price}")
            model.insert_or_update(Data(address=title, price=price))

        time.sleep(random.randint(1, 5))


def run() -> None:
    url = "https://www.website1/page-"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
    }
    model, session = Model(), requests.Session()
    scrape(url, model, session, headers)


if __name__ == '__main__':
    signal.signal(signal.SIGINT, sigint_handler)
    run()

CodePudding user response:

I think you just need an array and for loop:

def run() -> None:
    urls = ["https://www.website1/page-", "https://www.website2/page-"]
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) 
        AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
    }
    model, session = Model(), requests.Session()
    for url in urls:
        scrape(url, model, session, headers)

CodePudding user response:

if you want it to parse another page add a comma

## Added Comma meaning 2 urls can be parsed for efficiency as you wouldn't want to rewrite your code again
import itertools
import random
import time
import typing
import signal
import requests
from bs4 import BeautifulSoup
from model import Model, Data
RUNNING = True
def sigint_handler(*args: typing.Any) -> None:
    global RUNNING
    print("Signal received, exiting gracefully ...")
    RUNNING = False


def scrape(url: str, model: Model, session: requests.Session, headers: typing.Dict[str, str]) -> None:
    for page in itertools.count(1):
        if not RUNNING:
            break
        req = session.get(f"{url}{page}", headers=headers)
        soup = BeautifulSoup(req.content, 'html.parser')

        for li in soup.find_all('li', class_="container"):
            title = li.find('h2').text
            price = li.find('p', class_="price-text").text
            print(f"Title: {title}, Price: {price}")
            model.insert_or_update(Data(address=title, price=price))

        time.sleep(random.randint(1, 5))


def run() -> None:
    url = "https://www.website1/page- , https://www.website1/page- "


    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
    }
    model, session = Model(), requests.Session()
    scrape(url, model, session, headers)


if __name__ == '__main__':
    signal.signal(signal.SIGINT, sigint_handler)
    run()

  • Related