"Questions seeking debugging help ("Why isn't this code working?") must include the desired behavior, a specific problem or error and the shortest code necessary to reproduce it in the question itself."
The desired behavior is to create an output file of scraped pages, as this working code does in non-headless mode; below is the shortest code necessary to reproduce the problem.
# script_concurrent.py
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from concurrent.futures import ThreadPoolExecutor, wait
from time import sleep, time
from selenium import webdriver
from termcolor import colored
from random import randint
import threading
import datetime
import os
from scrapers.scraper import connect_to_base, parse_html, write_to_file
def counted(f):
def wrapped(*args, **kwargs):
        wrapped.calls += 1
return f(*args, **kwargs)
wrapped.calls = 0
return wrapped
def sleepy(f):
def wrapped(*args, **kwargs):
with lock:
            wrapped.calls += 1
print(f"{f.__name__} called {wrapped.calls} times")
if wrapped.calls % 20 == 0:
print(colored("Sleeping...", "blue"))
sleep(randint(60, 65))
return f(*args, **kwargs)
lock = threading.Lock()
wrapped.calls = 0
return wrapped
@counted
@sleepy
def run_process(filename="Hitachi.csv"):
# init browser
os.environ["WDM_LOG_LEVEL"] = "0"
browser = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
if connect_to_base(browser):
sleep(2)
html = browser.page_source
output_list = parse_html(html)
write_to_file(output_list, filename)
else:
print("Error connecting to AVS")
# exit
browser.quit()
if __name__ == "__main__":
start_time = time()
output_timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
output_filename = f"output_{output_timestamp}.csv"
futures = []
with ThreadPoolExecutor() as executor:
futures.extend(executor.submit(run_process) for _ in range(2, 12))
wait(futures)
end_time = time()
elapsed_time = end_time - start_time
print(f"Elapsed run time: {elapsed_time / 60:.2f} minutes.")
print(f"Calls to run_process: {run_process.calls}")
# scraper.py
import requests
import csv
from pathlib import Path
import itertools
import pandas as pd
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
BASE_DIR = Path(__file__).resolve(strict=True).parent.parent
def csv_to_iter(filename, idx=0):
pd.set_option("display.max_rows", None)
df = pd.read_csv(filename)
df = df.iloc[:, [idx]]
df = df.values.tolist()
df = list(itertools.chain(*df))
df = sorted(list(set(df)))
return iter(df)
my_iter = csv_to_iter(
filename="/Users/myusername/Downloads/Code/AVS-concurrent-web-scraping/Sorted_MAH_Hitachi_urls.csv"
)
def connect_to_base(browser):
my_next_iter = next(my_iter)
connection_attempts = 0
while connection_attempts < 3:
try:
browser.get(my_next_iter)
            # wait for the .container element to load
            # before returning True
WebDriverWait(browser, 5).until(
EC.presence_of_element_located((By.CSS_SELECTOR, ".container"))
)
return True
except Exception as e:
print(e)
            connection_attempts += 1
print(f"Error connecting to {my_next_iter}.")
print(f"Attempt #{connection_attempts}.")
return False
def parse_html(html):
# create soup object
soup = BeautifulSoup(html, "html.parser")
    # parse the soup object to pull part numbers, quantities, names, comments, machine info and titles
# part_position = [
# item.text.strip() for item in soup.findAll("td", {"data-title": "Pos."})
# ]
part_number_1 = [
item.text.strip() for item in soup.findAll("td", {"data-title": "Part â„–"})
]
part_number_2 = [
item.text.strip() for item in soup.findAll("td", {"data-title": "Part №"})
]
    # use whichever part-number column variant matched (empty list if neither is present)
    part_number = part_number_2 or part_number_1
part_qty = [item.text.strip() for item in soup.findAll("td", {"data-title": "Qty"})]
part_name = [
item.text.strip() for item in soup.findAll("td", {"data-title": "Part name"})
]
part_comments = [
item.text.strip() for item in soup.findAll("td", {"data-title": "Comments"})
]
machine = [
item.text.split()[0] for item in soup.findAll("article", {"id": "node-content"})
]
alternative_machines = [
item.text.split()[2] for item in soup.findAll("article", {"id": "node-content"})
]
title = [item.text for item in soup.findAll("span", {"class": "trans"})]
parts_group = [item.h3 for item in soup.findAll("div", {"class": "card-header"})]
article_info = {
# "Pos.": part_position,
"Part No": part_number,
"Qty": part_qty,
"Parts name": part_name,
"Comments": part_comments,
"Machine": machine,
"Alternative_machines": alternative_machines,
"Title": title,
"Parts_group": parts_group,
}
return [article_info]
def get_load_time(article_url):
try:
# set headers
headers = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"
}
# make get request to article_url
response = requests.get(
article_url, headers=headers, stream=True, timeout=3.000
)
# get page load time
load_time = response.elapsed.total_seconds()
except Exception as e:
print(e)
load_time = "Loading Error"
return load_time
def write_to_file(output_list, filename="Hitachi.csv"):
for row in output_list:
with open(Path(BASE_DIR).joinpath(filename), "a") as csvfile:
fieldnames = [
"Pos.",
"Part No",
"Qty",
"Parts name",
"Comments",
"Machine",
"Alternative_machines",
"Title",
"Parts_group",
]
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writerow(row)
Output
run_process called 1 times
[WDM] - ====== WebDriver manager ======
2022-07-13 15:35:26,409 INFO ====== WebDriver manager ======
run_process called 2 times
[WDM] - ====== WebDriver manager ======
2022-07-13 15:35:26,410 INFO ====== WebDriver manager ======
run_process called 3 times
[WDM] - ====== WebDriver manager ======
2022-07-13 15:35:26,410 INFO ====== WebDriver manager ======
run_process called 4 times
[WDM] - ====== WebDriver manager ======
2022-07-13 15:35:26,415 INFO ====== WebDriver manager ======
run_process called 5 times
[WDM] - ====== WebDriver manager ======
2022-07-13 15:35:26,417 INFO ====== WebDriver manager ======
run_process called 6 times
[WDM] - ====== WebDriver manager ======
2022-07-13 15:35:26,418 INFO ====== WebDriver manager ======
run_process called 7 times
[WDM] - ====== WebDriver manager ======
2022-07-13 15:35:26,420 INFO ====== WebDriver manager ======
run_process called 8 times
[WDM] - ====== WebDriver manager ======
2022-07-13 15:35:26,426 INFO ====== WebDriver manager ======
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:35:26,616 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:35:26,617 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:35:26,695 INFO Current google-chrome version is 103.0.5060
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:35:26,697 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:35:26,700 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:35:26,699 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:35:26,701 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:35:26,699 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:35:26,710 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:35:26,710 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:35:26,713 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:35:26,713 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:35:26,717 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:35:26,717 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
(.venv) martinhewing@Martins-MacBook-Pro AVS-concurrent-web-scraping % python3 script_concurrent.py
run_process called 1 times
[WDM] - ====== WebDriver manager ======
2022-07-13 15:36:45,472 INFO ====== WebDriver manager ======
run_process called 2 times
[WDM] - ====== WebDriver manager ======
2022-07-13 15:36:45,476 INFO ====== WebDriver manager ======
run_process called 3 times
[WDM] - ====== WebDriver manager ======
2022-07-13 15:36:45,479 INFO ====== WebDriver manager ======
run_process called 4 times
[WDM] - ====== WebDriver manager ======
2022-07-13 15:36:45,480 INFO ====== WebDriver manager ======
run_process called 5 times
Sleeping...
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:36:45,616 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:36:45,617 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:36:45,650 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:36:45,650 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:36:45,660 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:36:45,660 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
(.venv) martinhewing@Martins-MacBook-Pro AVS-concurrent-web-scraping % python3 script_concurrent.py
run_process called 1 times
[WDM] - ====== WebDriver manager ======
2022-07-13 15:37:46,546 INFO ====== WebDriver manager ======
run_process called 2 times
[WDM] - ====== WebDriver manager ======
2022-07-13 15:37:46,550 INFO ====== WebDriver manager ======
run_process called 3 times
[WDM] - ====== WebDriver manager ======
2022-07-13 15:37:46,555 INFO ====== WebDriver manager ======
run_process called 4 times
[WDM] - ====== WebDriver manager ======
2022-07-13 15:37:46,695 INFO ====== WebDriver manager ======
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:37:46,708 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:37:46,708 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:37:46,724 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:37:46,725 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:37:46,733 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:37:46,734 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:37:46,752 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:37:46,753 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
run_process called 5 times
Sleeping...
[WDM] - Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-13 15:37:46,843 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
[WDM] - Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-13 15:37:46,843 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
[WDM] - Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-13 15:37:46,844 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
[WDM] - Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-13 15:37:46,942 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
https://spare.avspart.com/catalog/hitachi/101:uh02/0d79c019-4621-4a47-8127-bd7baa5f0c0b/
https://spare.avspart.com/catalog/hitachi/101:uh02/1a7f894f-c1b8-456b-8ed3-bf78c60e4a71/
https://spare.avspart.com/catalog/hitachi/101:uh02/06e2437d-a240-49d0-ac8d-fc553bff6c53/
https://spare.avspart.com/catalog/hitachi/101:uh02/1c6fe013-e139-4112-81a5-c01fc4591803/
[WDM] - ====== WebDriver manager ======
2022-07-13 15:38:48,773 INFO ====== WebDriver manager ======
run_process called 6 times
[WDM] - ====== WebDriver manager ======
2022-07-13 15:38:48,778 INFO ====== WebDriver manager ======
run_process called 7 times
[WDM] - ====== WebDriver manager ======
2022-07-13 15:38:48,783 INFO ====== WebDriver manager ======
run_process called 8 times
[WDM] - ====== WebDriver manager ======
2022-07-13 15:38:48,793 INFO ====== WebDriver manager ======
run_process called 9 times
[WDM] - ====== WebDriver manager ======
2022-07-13 15:38:48,802 INFO ====== WebDriver manager ======
run_process called 10 times
Sleeping...
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:38:48,947 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:38:48,948 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:38:48,964 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:38:48,964 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:38:48,967 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:38:48,967 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:38:48,971 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:38:48,973 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:38:48,989 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:38:48,994 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-13 15:38:49,065 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
[WDM] - Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-13 15:38:49,108 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
[WDM] - Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-13 15:38:49,129 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
[WDM] - Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-13 15:38:49,181 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
[WDM] - Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-13 15:38:49,189 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
https://spare.avspart.com/catalog/hitachi/101:uh02/1d07c2a9-d4f8-4b50-a6bc-e64951cd7e8e/
https://spare.avspart.com/catalog/hitachi/101:uh02/3aa2c54f-154e-4aae-8f2a-efb05b471bfa/
https://spare.avspart.com/catalog/hitachi/101:uh02/3c0b42bb-c6c9-4f60-8c2e-d5258a703d76/
https://spare.avspart.com/catalog/hitachi/101:uh02/2780b803-2f37-4777-a5c6-97ea9e54137d/
https://spare.avspart.com/catalog/hitachi/101:uh02/47a76d4e-70b0-4b6d-9308-67b91a4619ad/
[WDM] - ====== WebDriver manager ======
2022-07-13 15:39:49,816 INFO ====== WebDriver manager ======
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:39:50,147 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:39:50,148 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-13 15:39:50,368 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
https://spare.avspart.com/catalog/hitachi/101:uh02/540f4b09-795a-41de-9715-8825e296018b/
Elapsed run time: 2.27 minutes.
Calls to run_process: 10
Data
0
https://spare.avspart.com/catalog/hitachi/101:uh02/06e2437d-a240-49d0-ac8d-fc553bff6c53/
https://spare.avspart.com/catalog/hitachi/101:uh02/0d79c019-4621-4a47-8127-bd7baa5f0c0b/
https://spare.avspart.com/catalog/hitachi/101:uh02/1a7f894f-c1b8-456b-8ed3-bf78c60e4a71/
https://spare.avspart.com/catalog/hitachi/101:uh02/1c6fe013-e139-4112-81a5-c01fc4591803/
https://spare.avspart.com/catalog/hitachi/101:uh02/1d07c2a9-d4f8-4b50-a6bc-e64951cd7e8e/
https://spare.avspart.com/catalog/hitachi/101:uh02/2780b803-2f37-4777-a5c6-97ea9e54137d/
https://spare.avspart.com/catalog/hitachi/101:uh02/3aa2c54f-154e-4aae-8f2a-efb05b471bfa/
https://spare.avspart.com/catalog/hitachi/101:uh02/3c0b42bb-c6c9-4f60-8c2e-d5258a703d76/
https://spare.avspart.com/catalog/hitachi/101:uh02/47a76d4e-70b0-4b6d-9308-67b91a4619ad/
https://spare.avspart.com/catalog/hitachi/101:uh02/540f4b09-795a-41de-9715-8825e296018b/
https://spare.avspart.com/catalog/hitachi/101:uh02/57cefeb3-9dd2-4f99-a552-50dc452b6565/
https://spare.avspart.com/catalog/hitachi/101:uh02/58c4d3b6-9a15-4be0-8082-19980c2119fe/
https://spare.avspart.com/catalog/hitachi/101:uh02/5b2f40e4-a61f-4a3d-a15f-a41659595b28/
Here is my attempt at implementing headless mode.
def get_driver(headless):
options = webdriver.Options()
if headless:
options.add_argument("--headless")
# initialize driver
driver = webdriver.Chrome(
service=Service(ChromeDriverManager().install()), options=options
)
return driver
# script_concurrent.py
from concurrent.futures import ThreadPoolExecutor, wait
from time import sleep, time
from termcolor import colored
from random import randint
import threading
import datetime
import sys
from scrapers.scraper import get_driver, connect_to_base, parse_html, write_to_file
def counted(f):
def wrapped(*args, **kwargs):
        wrapped.calls += 1
return f(*args, **kwargs)
wrapped.calls = 0
return wrapped
def sleepy(f):
def wrapped(*args, **kwargs):
with lock:
            wrapped.calls += 1
print(f"{f.__name__} called {wrapped.calls} times")
if wrapped.calls % 20 == 0:
print(colored("Sleeping...", "blue"))
sleep(randint(60, 65))
return f(*args, **kwargs)
lock = threading.Lock()
wrapped.calls = 0
return wrapped
@counted
@sleepy
def run_process(filename, headless):
# init browser
browser = get_driver(headless)
if connect_to_base(browser):
sleep(2)
html = browser.page_source
output_list = parse_html(html)
write_to_file(output_list, filename)
# exit
browser.quit()
else:
print("Error connecting to AVS")
browser.quit()
if __name__ == "__main__":
headless = False
if len(sys.argv) > 1:
if sys.argv[1] == "headless":
print("Running in headless mode")
headless = True
start_time = time()
output_timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
output_filename = f"Hitachi_{output_timestamp}.csv"
futures = []
with ThreadPoolExecutor() as executor:
futures.extend(
executor.submit(run_process, output_filename, headless)
for _ in range(2, 202)
)
wait(futures)
end_time = time()
elapsed_time = end_time - start_time
print(f"Elapsed run time: {elapsed_time / 60:.2f} minutes.")
print(f"Calls to run_process: {run_process.calls}")
# script.py
import csv
import requests
import itertools
import pandas as pd
from pathlib import Path
from selenium import webdriver
from termcolor import colored
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
BASE_DIR = Path(__file__).resolve(strict=True).parent.parent
def csv_to_iter(filename, idx=0):
pd.set_option("display.max_rows", None)
df = pd.read_csv(filename)
df = df.iloc[:, [idx]]
df = df.values.tolist()
df = list(itertools.chain(*df))
df = sorted(list(set(df)))
return iter(df)
my_iter = csv_to_iter(
filename="/Users/martinhewing/Downloads/Code/AVS-concurrent-web-scraping/Sorted_MAH_Hitachi_urls.csv"
)
def get_driver(headless):
options = webdriver.Options()
if headless:
options.add_argument("--headless")
# initialize driver
driver = webdriver.Chrome(
service=Service(ChromeDriverManager().install()), options=options
)
return driver
def connect_to_base(browser):
my_next_iter = next(my_iter)
connection_attempts = 0
while connection_attempts < 3:
try:
browser.get(my_next_iter)
print(colored(browser.current_url, "green"))
            # wait for the .container element to load
            # before returning True
WebDriverWait(browser, 5).until(
EC.presence_of_element_located((By.CSS_SELECTOR, ".container"))
)
return True
except Exception as e:
print(e)
            connection_attempts += 1
print(f"Error connecting to {my_next_iter}.")
print(f"Attempt #{connection_attempts}.")
return False
def parse_html(html):
# create soup object
soup = BeautifulSoup(html, "html.parser")
    # parse the soup object to pull part numbers, quantities, names, comments, machine info and titles
# part_position = [
# item.text.strip() for item in soup.findAll("td", {"data-title": "Pos."})
# ]
part_number_1 = [
item.text.strip() for item in soup.findAll("td", {"data-title": "Part â"})
]
part_number_2 = [
item.text.strip() for item in soup.findAll("td", {"data-title": "Part №"})
]
    # use whichever part-number column variant matched (empty list if neither is present)
    part_number = part_number_2 or part_number_1
part_qty = [item.text.strip() for item in soup.findAll("td", {"data-title": "Qty"})]
part_name = [
item.text.strip() for item in soup.findAll("td", {"data-title": "Part name"})
]
part_comments = [
item.text.strip() for item in soup.findAll("td", {"data-title": "Comments"})
]
machine = [
item.text.split()[0] for item in soup.findAll("article", {"id": "node-content"})
]
alternative_machines = [
item.text.split()[2] for item in soup.findAll("article", {"id": "node-content"})
]
title = [item.text for item in soup.findAll("span", {"class": "trans"})]
parts_group = [item.h3 for item in soup.findAll("div", {"class": "card-header"})]
article_info = {
# "Pos.": part_position,
"Part No": part_number,
"Qty": part_qty,
"Parts name": part_name,
"Comments": part_comments,
"Machine": machine,
"Alternative_machines": alternative_machines,
"Title": title,
"Parts_group": parts_group,
}
return [article_info]
def get_load_time(article_url):
try:
# set headers
headers = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"
}
# make get request to article_url
response = requests.get(
article_url, headers=headers, stream=True, timeout=3.000
)
# get page load time
load_time = response.elapsed.total_seconds()
except Exception as e:
print(e)
load_time = "Loading Error"
return load_time
def write_to_file(output_list, filename):
for row in output_list:
with open(Path(BASE_DIR).joinpath(filename), "a") as csvfile:
fieldnames = [
"Pos.",
"Part No",
"Qty",
"Parts name",
"Comments",
"Machine",
"Alternative_machines",
"Title",
"Parts_group",
]
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writerow(row)
Output
Running in headless mode
run_process called 1 times
run_process called 2 times
run_process called 3 times
run_process called 4 times
run_process called 5 times
run_process called 6 times
run_process called 7 times
run_process called 8 times
run_process called 9 times
run_process called 10 times
run_process called 11 times
run_process called 12 times
run_process called 13 times
run_process called 14 times
run_process called 15 times
run_process called 16 times
run_process called 17 times
run_process called 18 times
run_process called 19 times
run_process called 20 times
Sleeping...
Data
0
https://spare.avspart.com/catalog/hitachi/101:uh02/06e2437d-a240-49d0-ac8d-fc553bff6c53/
https://spare.avspart.com/catalog/hitachi/101:uh02/0d79c019-4621-4a47-8127-bd7baa5f0c0b/
https://spare.avspart.com/catalog/hitachi/101:uh02/1a7f894f-c1b8-456b-8ed3-bf78c60e4a71/
https://spare.avspart.com/catalog/hitachi/101:uh02/1c6fe013-e139-4112-81a5-c01fc4591803/
https://spare.avspart.com/catalog/hitachi/101:uh02/1d07c2a9-d4f8-4b50-a6bc-e64951cd7e8e/
https://spare.avspart.com/catalog/hitachi/101:uh02/2780b803-2f37-4777-a5c6-97ea9e54137d/
https://spare.avspart.com/catalog/hitachi/101:uh02/3aa2c54f-154e-4aae-8f2a-efb05b471bfa/
https://spare.avspart.com/catalog/hitachi/101:uh02/3c0b42bb-c6c9-4f60-8c2e-d5258a703d76/
https://spare.avspart.com/catalog/hitachi/101:uh02/47a76d4e-70b0-4b6d-9308-67b91a4619ad/
https://spare.avspart.com/catalog/hitachi/101:uh02/540f4b09-795a-41de-9715-8825e296018b/
https://spare.avspart.com/catalog/hitachi/101:uh02/57cefeb3-9dd2-4f99-a552-50dc452b6565/
https://spare.avspart.com/catalog/hitachi/101:uh02/58c4d3b6-9a15-4be0-8082-19980c2119fe/
https://spare.avspart.com/catalog/hitachi/101:uh02/5b2f40e4-a61f-4a3d-a15f-a41659595b28/
When I run in headless mode there's no error, but there's no output either. I have reviewed similar questions, but I am at a loss to understand what might be causing this. Please help :)
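One detail worth noting about the "no error" part: ThreadPoolExecutor futures only re-raise a worker's exception when .result() is called, so wait(futures) alone will hide any crash inside run_process. A minimal sketch (reusing the names from the script above) of how such swallowed exceptions can be surfaced:

from concurrent.futures import ThreadPoolExecutor, wait

with ThreadPoolExecutor() as executor:
    futures = [executor.submit(run_process, output_filename, headless) for _ in range(10)]
    wait(futures)
    for future in futures:
        try:
            future.result()  # re-raises any exception raised in the worker thread
        except Exception as exc:
            print(f"run_process raised: {exc!r}")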
Answer:
Website blocking:
The website could be detecting that you are scraping it. There are a couple of different solutions you could try.
Change your user agent:
chrome_options.add_argument("user-agent=USER AGENT")
Replace "USER AGENT" with the user-agent string shown at this link: What is my user agent
Selenium Stealth:
stealth(driver,
    user_agent="USER AGENT",
    languages=["en-US", "en"],
    vendor="Google Inc.",
    platform="Win32",
    webgl_vendor="Intel Inc.",
    renderer="Intel Iris OpenGL Engine",
    fix_hairline=True,
)
Selenium Stealth is a Python package that works alongside Selenium to help avoid detection. It adjusts key properties of the Selenium-driven browser in order to bypass bot-detection software.
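Putting it together, a minimal, self-contained sketch of driving headless Chrome through selenium-stealth (assuming selenium-stealth is installed; the option values mirror the package's usual example, and the URL is one of those from the question):

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium_stealth import stealth
from webdriver_manager.chrome import ChromeDriverManager

options = webdriver.ChromeOptions()
options.add_argument("--headless")
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()), options=options
)

# apply the stealth patches before the first navigation
stealth(
    driver,
    languages=["en-US", "en"],
    vendor="Google Inc.",
    platform="Win32",
    webgl_vendor="Intel Inc.",
    renderer="Intel Iris OpenGL Engine",
    fix_hairline=True,
)

driver.get("https://spare.avspart.com/catalog/hitachi/101:uh02/0d79c019-4621-4a47-8127-bd7baa5f0c0b/")
print(driver.page_source[:500])  # quick sanity check that content actually loaded
driver.quit()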