Using Python/Selenium, I have defined a `sleepy` decorator that pauses for 20 seconds after every 5 function calls (requests to the server) in a web-scraping project, so that I don't overwhelm their server. Judging by the terminal output it seems to work as I intend, but when I watch the output file "Hitachi.csv" while it is being created, the pause doesn't seem to happen at the fifth URL but at the end, which leads me to believe that the sleepy decorator isn't actually pausing on the 5th call. Please help :)
def sleepy(f):
    """Decorate ``f`` so that every 5th call is preceded by a 20 second pause.

    The returned wrapper exposes a ``calls`` attribute holding the number of
    times it has been invoked. The counter update is not synchronized, so
    the count is only reliable when calls do not overlap.
    """
    import functools

    @functools.wraps(f)
    def wrapped(*args, **kwargs):
        # Increment (not reset) the counter; otherwise the modulo check
        # below can never become true.
        wrapped.calls += 1
        print(f"{f.__name__} called {wrapped.calls} times")
        if wrapped.calls % 5 == 0:
            print("Sleeping...")
            sleep(20)
        return f(*args, **kwargs)

    wrapped.calls = 0
    return wrapped
# script_concurrent.py
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from concurrent.futures import ThreadPoolExecutor, wait
from time import sleep, time
from selenium import webdriver
import datetime
import os
from scrapers.scraper import connect_to_base, parse_html, write_to_file
def counted(f):
    """Decorate ``f`` so the number of invocations is tracked.

    The returned wrapper exposes a ``calls`` attribute that starts at 0 and
    grows by one on every call, before ``f`` itself is executed.
    """
    import functools

    @functools.wraps(f)
    def wrapped(*args, **kwargs):
        # Accumulate the count instead of overwriting it with 1.
        wrapped.calls += 1
        return f(*args, **kwargs)

    wrapped.calls = 0
    return wrapped
def sleepy(f):
    """Decorate ``f`` so that every 5th call is preceded by a 20 second pause.

    The returned wrapper exposes a ``calls`` attribute holding the number of
    times it has been invoked. The counter update is not synchronized, so
    the count is only reliable when calls do not overlap.
    """
    import functools

    @functools.wraps(f)
    def wrapped(*args, **kwargs):
        # Increment (not reset) the counter; otherwise the modulo check
        # below can never become true.
        wrapped.calls += 1
        print(f"{f.__name__} called {wrapped.calls} times")
        if wrapped.calls % 5 == 0:
            print("Sleeping...")
            sleep(20)
        return f(*args, **kwargs)

    wrapped.calls = 0
    return wrapped
@counted
@sleepy
def run_process(filename="Hitachi.csv"):
    """Open a Chrome session, scrape one page and append the result to a CSV.

    Args:
        filename: Name of the output file handed to ``write_to_file``.

    The browser is always shut down, even when connecting, parsing or
    writing raises, so a failed run does not leave a Chrome process behind.
    """
    # init browser
    # NOTE(review): presumably meant to quiet webdriver-manager logging;
    # confirm the variable is honoured by the installed version.
    os.environ["WDM_LOG_LEVEL"] = "0"
    browser = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    try:
        if connect_to_base(browser):
            # Short pause before reading the page source.
            sleep(2)
            html = browser.page_source
            output_list = parse_html(html)
            write_to_file(output_list, filename)
        else:
            print("Error connecting to AVS")
    finally:
        # exit
        browser.quit()
if __name__ == "__main__":
    # Script entry point: fan out ten run_process() calls on a thread pool
    # and report the total wall-clock time and the call counter.
    start_time = time()

    now = datetime.datetime.now()
    output_timestamp = now.strftime("%Y%m%d%H%M%S")
    output_filename = f"output_{output_timestamp}.csv"

    futures = []
    with ThreadPoolExecutor() as executor:
        for _ in range(10):
            futures.append(executor.submit(run_process))
        # Block until every submitted task has finished.
        wait(futures)

    end_time = time()
    elapsed_time = end_time - start_time
    print(f"Elapsed run time: {elapsed_time / 60:.2f} minutes.")
    print(f"Calls to run_process: {run_process.calls}")
# scraper.py
import requests
import csv
from pathlib import Path
import itertools
import pandas as pd
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
BASE_DIR = Path(__file__).resolve(strict=True).parent.parent
def csv_to_iter(filename, idx=0):
    """Return an iterator over the sorted, de-duplicated values of one CSV column.

    Args:
        filename: Path of the CSV file to read with pandas.
        idx: Zero-based position of the column to extract.

    Returns:
        An iterator yielding each distinct value of the column in sorted order.
    """
    # Global pandas display option, set on every call as a side effect.
    pd.set_option("display.max_rows", None)

    frame = pd.read_csv(filename)
    # Select the column as a one-column frame so each row is a 1-element list.
    rows = frame.iloc[:, [idx]].values.tolist()
    distinct = {cell for row in rows for cell in row}
    ordered = sorted(distinct)
    return iter(ordered)
# Module-level iterator built once when this module is imported.
# connect_to_base() takes one value from it per call via next().
# NOTE(review): the path is hard-coded to one machine; the first column is
# presumably the list of URLs to scrape -- confirm against the CSV.
my_iter = csv_to_iter(
filename="/Users/myusername/Downloads/Code/AVS-concurrent-web-scraping/Sorted_MAH_Hitachi_urls.csv"
)
def connect_to_base(browser):
    """Load the next URL from ``my_iter`` in ``browser``, retrying on failure.

    Args:
        browser: Selenium WebDriver instance used to load the page.

    Returns:
        True once an element matching the ``.container`` CSS selector is
        present, False after three failed attempts.

    Raises:
        StopIteration: If ``my_iter`` has no URLs left.
    """
    my_next_iter = next(my_iter)
    connection_attempts = 0
    while connection_attempts < 3:
        try:
            browser.get(my_next_iter)
            # wait (up to 5 seconds) for an element matching ".container"
            # to be present before returning True
            WebDriverWait(browser, 5).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, ".container"))
            )
            return True
        except Exception as e:
            print(e)
            # Count the failure; assigning a constant here would keep the
            # loop condition true forever.
            connection_attempts += 1
            print(f"Error connecting to {my_next_iter}.")
            print(f"Attempt #{connection_attempts}.")
    return False
def _cell_texts(soup, data_title):
    """Return the stripped text of every <td> whose data-title is ``data_title``."""
    return [
        item.text.strip() for item in soup.findAll("td", {"data-title": data_title})
    ]


def parse_html(html):
    """Extract the parts table and page metadata from a catalogue page.

    Args:
        html: Page source as a string.

    Returns:
        A one-element list holding a dict that maps each output column name
        to the list of values found on the page.
    """
    # create soup object
    soup = BeautifulSoup(html, "html.parser")

    # The part-number column header is looked up under two spellings: the
    # mis-encoded form and the proper "№" form. The proper form wins when
    # both are present; an empty list is used when neither matches (the
    # name was previously left unbound in that case).
    part_number_1 = _cell_texts(soup, "Part â„–")
    part_number_2 = _cell_texts(soup, "Part №")
    part_number = part_number_2 or part_number_1

    part_qty = _cell_texts(soup, "Qty")
    part_name = _cell_texts(soup, "Part name")
    part_comments = _cell_texts(soup, "Comments")

    # First and third whitespace-separated tokens of each node-content article.
    articles = soup.findAll("article", {"id": "node-content"})
    machine = [item.text.split()[0] for item in articles]
    alternative_machines = [item.text.split()[2] for item in articles]

    title = [item.text for item in soup.findAll("span", {"class": "trans"})]
    # NOTE(review): this keeps the <h3> tag objects, not their text -- confirm
    # that is what should end up in the CSV.
    parts_group = [item.h3 for item in soup.findAll("div", {"class": "card-header"})]

    # The "Pos." column is not collected at the moment.
    article_info = {
        "Part No": part_number,
        "Qty": part_qty,
        "Parts name": part_name,
        "Comments": part_comments,
        "Machine": machine,
        "Alternative_machines": alternative_machines,
        "Title": title,
        "Parts_group": parts_group,
    }
    return [article_info]
def get_load_time(article_url):
    """Return how long the server took to respond to a GET of ``article_url``.

    Args:
        article_url: URL to request.

    Returns:
        The elapsed time in seconds as a float, or the string
        "Loading Error" if the request raised any exception.
    """
    try:
        # set headers
        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"
        }
        # make get request to article_url; the body is streamed and never
        # read, so close the response explicitly to release the connection
        with requests.get(
            article_url, headers=headers, stream=True, timeout=3.000
        ) as response:
            # get page load time
            load_time = response.elapsed.total_seconds()
    except Exception as e:
        print(e)
        load_time = "Loading Error"
    return load_time
def write_to_file(output_list, filename="Hitachi.csv"):
    """Append each dict in ``output_list`` as one CSV row under ``BASE_DIR``.

    Args:
        output_list: Iterable of dicts keyed by the column names below.
            Missing keys are written as empty cells.
        filename: Name of the CSV file, resolved relative to ``BASE_DIR``.

    No header row is written. Nothing is opened or created when
    ``output_list`` is empty.
    """
    fieldnames = [
        "Pos.",
        "Part No",
        "Qty",
        "Parts name",
        "Comments",
        "Machine",
        "Alternative_machines",
        "Title",
        "Parts_group",
    ]
    rows = list(output_list)
    if not rows:
        return
    # Open once for all rows; newline="" lets the csv module control line
    # endings, as its documentation requires.
    with open(Path(BASE_DIR).joinpath(filename), "a", newline="") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writerows(rows)
Output
run_process called 1 times
[WDM] - ====== WebDriver manager ======
2022-07-10 14:45:59,433 INFO ====== WebDriver manager ======
run_process called 2 times
[WDM] - ====== WebDriver manager ======
2022-07-10 14:45:59,439 INFO ====== WebDriver manager ======
run_process called 3 times
[WDM] - ====== WebDriver manager ======
2022-07-10 14:45:59,440 INFO ====== WebDriver manager ======
run_process called 4 times
[WDM] - ====== WebDriver manager ======
2022-07-10 14:45:59,450 INFO ====== WebDriver manager ======
run_process called 5 times
Sleeping...
run_process called 6 times
[WDM] - ====== WebDriver manager ======
2022-07-10 14:45:59,461 INFO ====== WebDriver manager ======
run_process called 7 times
[WDM] - ====== WebDriver manager ======
2022-07-10 14:45:59,467 INFO ====== WebDriver manager ======
run_process called 8 times
[WDM] - ====== WebDriver manager ======
2022-07-10 14:45:59,477 INFO ====== WebDriver manager ======
[WDM] - Current google-chrome version is 103.0.5060
2022-07-10 14:45:59,690 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-10 14:45:59,690 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-10 14:45:59,720 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-10 14:45:59,720 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-10 14:45:59,733 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-10 14:45:59,733 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-10 14:45:59,789 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-10 14:45:59,790 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-10 14:45:59,793 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-10 14:45:59,793 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-10 14:45:59,798 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-10 14:45:59,798 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-10 14:45:59,807 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-10 14:45:59,807 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-10 14:45:59,868 INFO Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
[WDM] - Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-10 14:45:59,909 INFO Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
[WDM] - Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-10 14:45:59,946 INFO Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
[WDM] - Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-10 14:45:59,974 INFO Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
[WDM] - Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-10 14:46:00,007 INFO Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
[WDM] - Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-10 14:46:00,016 INFO Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
[WDM] - Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-10 14:46:00,038 INFO Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
[WDM] - ====== WebDriver manager ======
2022-07-10 14:46:19,459 INFO ====== WebDriver manager ======
[WDM] - Current google-chrome version is 103.0.5060
2022-07-10 14:46:19,552 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-10 14:46:19,552 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-10 14:46:19,647 INFO Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
run_process called 9 times
[WDM] - ====== WebDriver manager ======
2022-07-10 14:46:42,827 INFO ====== WebDriver manager ======
[WDM] - Current google-chrome version is 103.0.5060
2022-07-10 14:46:43,131 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-10 14:46:43,131 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-10 14:46:43,745 INFO Driver [/Users/myusername/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
run_process called 10 times
Sleeping...
Data
0
https://spare.avspart.com/catalog/hitachi/101:uh02/06e2437d-a240-49d0-ac8d-fc553bff6c53/
https://spare.avspart.com/catalog/hitachi/101:uh02/0d79c019-4621-4a47-8127-bd7baa5f0c0b/
https://spare.avspart.com/catalog/hitachi/101:uh02/1a7f894f-c1b8-456b-8ed3-bf78c60e4a71/
https://spare.avspart.com/catalog/hitachi/101:uh02/1c6fe013-e139-4112-81a5-c01fc4591803/
https://spare.avspart.com/catalog/hitachi/101:uh02/1d07c2a9-d4f8-4b50-a6bc-e64951cd7e8e/
https://spare.avspart.com/catalog/hitachi/101:uh02/2780b803-2f37-4777-a5c6-97ea9e54137d/
https://spare.avspart.com/catalog/hitachi/101:uh02/3aa2c54f-154e-4aae-8f2a-efb05b471bfa/
https://spare.avspart.com/catalog/hitachi/101:uh02/3c0b42bb-c6c9-4f60-8c2e-d5258a703d76/
https://spare.avspart.com/catalog/hitachi/101:uh02/47a76d4e-70b0-4b6d-9308-67b91a4619ad/
https://spare.avspart.com/catalog/hitachi/101:uh02/540f4b09-795a-41de-9715-8825e296018b/
https://spare.avspart.com/catalog/hitachi/101:uh02/57cefeb3-9dd2-4f99-a552-50dc452b6565/
https://spare.avspart.com/catalog/hitachi/101:uh02/58c4d3b6-9a15-4be0-8082-19980c2119fe/
https://spare.avspart.com/catalog/hitachi/101:uh02/5b2f40e4-a61f-4a3d-a15f-a41659595b28/
CodePudding user response:
The `run_process called` and `Sleeping...` lines do seem to be in order, but if you look more closely you will see that almost all of the printout
[WDM] - Current google-chrome version is 103.0.5060
2022-07-10 14:45:59,690 INFO Current google-chrome version is 103.0.5060
appears between `run_process called 8` and `run_process called 9`.
This output comes from
webdriver.Chrome(service=Service(ChromeDriverManager().install()))
which is where the actual work happens.
The cause is the use of `ThreadPoolExecutor`: `run_process()` is executed 10 times simultaneously, even if the printout suggests otherwise (you can't rely on its order, because printing to the console isn't synchronized).
You can add a lock in `sleepy` around the counting-and-waiting block:
def sleepy(f):
    """Decorate ``f`` so that every 5th call is preceded by a 20 second pause.

    Counting and sleeping happen while holding a lock shared by all calls of
    the wrapper, so concurrent callers are counted one at a time and have to
    wait while another caller is sleeping. ``f`` itself runs outside the
    lock. The wrapper exposes the running count as ``calls``.
    """
    import functools
    import threading

    lock = threading.Lock()

    @functools.wraps(f)
    def wrapped(*args, **kwargs):
        with lock:
            # Increment (not reset) the counter; otherwise the modulo check
            # below can never become true.
            wrapped.calls += 1
            print(f"{f.__name__} called {wrapped.calls} times")
            if wrapped.calls % 5 == 0:
                print("Sleeping...")
                # Sleep while holding the lock so other threads are held
                # back as well.
                sleep(20)
        return f(*args, **kwargs)

    wrapped.calls = 0
    return wrapped