I have the following Python script that works well for what I need and gives me the output I want. However, I have another url (https://www.website2/page-
) that I'd like to add to the script. Currently I manually swap the urls and run them as separate scripts but I'd like to do it in one go, is this possible?
P.S. — the required script for each site is identical apart from the url value. Thanks in advance!
import itertools
import random
import time
import typing
import signal
import requests
from bs4 import BeautifulSoup
from model import Model, Data
RUNNING = True
def sigint_handler(*args: typing.Any) -> None:
global RUNNING
print("Signal received, exiting gracefully ...")
RUNNING = False
def scrape(url: str, model: Model, session: requests.Session, headers: typing.Dict[str, str]) -> None:
for page in itertools.count(1):
if not RUNNING:
break
req = session.get(f"{url}{page}", headers=headers)
soup = BeautifulSoup(req.content, 'html.parser')
for li in soup.find_all('li', class_="container"):
title = li.find('h2').text
price = li.find('p', class_="price-text").text
print(f"Title: {title}, Price: {price}")
model.insert_or_update(Data(address=title, price=price))
time.sleep(random.randint(1, 5))
def run() -> None:
url = "https://www.website1/page-"
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
}
model, session = Model(), requests.Session()
scrape(url, model, session, headers)
if __name__ == '__main__':
signal.signal(signal.SIGINT, sigint_handler)
run()
CodePudding user response:
I think you just need a list and a for loop:
def run() -> None:
urls = ["https://www.website1/page-", "https://www.website2/page-"]
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
}
model, session = Model(), requests.Session()
for url in urls:
scrape(url, model, session, headers)
CodePudding user response:
If you want it to parse another page, add a second URL.
## NOTE(review): commas inside one string do NOT produce two URLs — the string would need to be split before use; a list of URLs is the safer fix.
import itertools
import random
import time
import typing
import signal
import requests
from bs4 import BeautifulSoup
from model import Model, Data
RUNNING = True
def sigint_handler(*args: typing.Any) -> None:
global RUNNING
print("Signal received, exiting gracefully ...")
RUNNING = False
def scrape(url: str, model: Model, session: requests.Session, headers: typing.Dict[str, str]) -> None:
for page in itertools.count(1):
if not RUNNING:
break
req = session.get(f"{url}{page}", headers=headers)
soup = BeautifulSoup(req.content, 'html.parser')
for li in soup.find_all('li', class_="container"):
title = li.find('h2').text
price = li.find('p', class_="price-text").text
print(f"Title: {title}, Price: {price}")
model.insert_or_update(Data(address=title, price=price))
time.sleep(random.randint(1, 5))
def run() -> None:
url = "https://www.website1/page- , https://www.website1/page- "
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
}
model, session = Model(), requests.Session()
scrape(url, model, session, headers)
if __name__ == '__main__':
signal.signal(signal.SIGINT, sigint_handler)
run()