Home > Software design >  No Output While Running Selenium in Headless Mode, Working in Non Headless
No Output While Running Selenium in Headless Mode, Working in Non Headless

Time:07-20

"Questions seeking debugging help ("Why isn't this code working?") must include the desired behavior, a specific problem or error and the shortest code necessary to reproduce it in the question itself."

The desired behavior is to create an output file of scraped pages as per this working code (in non headless mode) and here's the shortest code necessary to reproduce it in the question itself.

# script_concurrent.py

from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from concurrent.futures import ThreadPoolExecutor, wait
from time import sleep, time
from selenium import webdriver
import datetime
import os

from scrapers.scraper import connect_to_base, parse_html, write_to_file


def counted(f):
    def wrapped(*args, **kwargs):
        wrapped.calls  = 1
        return f(*args, **kwargs)

    wrapped.calls = 0

    return wrapped


def sleepy(f):
    def wrapped(*args, **kwargs):
        with lock:
            wrapped.calls  = 1
            print(f"{f.__name__} called {wrapped.calls} times")
            if wrapped.calls % 20 == 0:
                print(colored("Sleeping...", "blue"))
                sleep(randint(60, 65))
        return f(*args, **kwargs)

    lock = threading.Lock()
    wrapped.calls = 0

    return wrapped

@counted
@sleepy
def run_process(filename="Hitachi.csv"):

    # init browser
    os.environ["WDM_LOG_LEVEL"] = "0"
    browser = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

    if connect_to_base(browser):
        sleep(2)
        html = browser.page_source
        output_list = parse_html(html)
        write_to_file(output_list, filename)
    else:
        print("Error connecting to AVS")

    # exit
    browser.quit()


if __name__ == "__main__":

    start_time = time()
    output_timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    output_filename = f"output_{output_timestamp}.csv"

    futures = []

    with ThreadPoolExecutor() as executor:
        futures.extend(executor.submit(run_process) for _ in range(2, 12))

    wait(futures)
    end_time = time()
    elapsed_time = end_time - start_time
    print(f"Elapsed run time: {elapsed_time / 60:.2f} minutes.")
    print(f"Calls to run_process: {run_process.calls}")

# scraper.py

import requests
import csv
from pathlib import Path
import itertools
import pandas as pd
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

BASE_DIR = Path(__file__).resolve(strict=True).parent.parent


def csv_to_iter(filename, idx=0):
    pd.set_option("display.max_rows", None)
    df = pd.read_csv(filename)
    df = df.iloc[:, [idx]]
    df = df.values.tolist()
    df = list(itertools.chain(*df))
    df = sorted(list(set(df)))
    return iter(df)


my_iter = csv_to_iter(
    filename="/Users/myusername/Downloads/Code/AVS-concurrent-web-scraping/Sorted_MAH_Hitachi_urls.csv"
)


def connect_to_base(browser):
    my_next_iter = next(my_iter)
    connection_attempts = 0
    while connection_attempts < 3:
        try:
            browser.get(my_next_iter)
            # wait for table element with id = 'content' to load
            # before returning True
            WebDriverWait(browser, 5).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, ".container"))
            )
            return True
        except Exception as e:
            print(e)
            connection_attempts  = 1
            print(f"Error connecting to {my_next_iter}.")
            print(f"Attempt #{connection_attempts}.")
    return False


def parse_html(html):
    # create soup object
    soup = BeautifulSoup(html, "html.parser")
    # parse soup object to get wikipedia article url, title, and last modified date
    # part_position = [
    #     item.text.strip() for item in soup.findAll("td", {"data-title": "Pos."})
    # ]
    part_number_1 = [
        item.text.strip() for item in soup.findAll("td", {"data-title": "Part â„–"})
    ]
    part_number_2 = [
        item.text.strip() for item in soup.findAll("td", {"data-title": "Part №"})
    ]

    if not part_number_1:
        pass
    else:
        part_number = part_number_1

    if not part_number_2:
        pass
    else:
        part_number = part_number_2

    part_qty = [item.text.strip() for item in soup.findAll("td", {"data-title": "Qty"})]

    part_name = [
        item.text.strip() for item in soup.findAll("td", {"data-title": "Part name"})
    ]

    part_comments = [
        item.text.strip() for item in soup.findAll("td", {"data-title": "Comments"})
    ]

    machine = [
        item.text.split()[0] for item in soup.findAll("article", {"id": "node-content"})
    ]

    alternative_machines = [
        item.text.split()[2] for item in soup.findAll("article", {"id": "node-content"})
    ]

    title = [item.text for item in soup.findAll("span", {"class": "trans"})]

    parts_group = [item.h3 for item in soup.findAll("div", {"class": "card-header"})]

    article_info = {
        # "Pos.": part_position,
        "Part No": part_number,
        "Qty": part_qty,
        "Parts name": part_name,
        "Comments": part_comments,
        "Machine": machine,
        "Alternative_machines": alternative_machines,
        "Title": title,
        "Parts_group": parts_group,
    }

    return [article_info]


def get_load_time(article_url):
    try:
        # set headers
        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"
        }
        # make get request to article_url
        response = requests.get(
            article_url, headers=headers, stream=True, timeout=3.000
        )
        # get page load time
        load_time = response.elapsed.total_seconds()
    except Exception as e:
        print(e)
        load_time = "Loading Error"
    return load_time


def write_to_file(output_list, filename="Hitachi.csv"):
    for row in output_list:
        with open(Path(BASE_DIR).joinpath(filename), "a") as csvfile:
            fieldnames = [
                "Pos.",
                "Part No",
                "Qty",
                "Parts name",
                "Comments",
                "Machine",
                "Alternative_machines",
                "Title",
                "Parts_group",
            ]
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writerow(row)

Output

run_process called 1 times

[WDM] - ====== WebDriver manager ======
2022-07-13 15:35:26,409 INFO ====== WebDriver manager ======
run_process called 2 times

[WDM] - ====== WebDriver manager ======
2022-07-13 15:35:26,410 INFO ====== WebDriver manager ======
run_process called 3 times

[WDM] - ====== WebDriver manager ======
2022-07-13 15:35:26,410 INFO ====== WebDriver manager ======
run_process called 4 times

[WDM] - ====== WebDriver manager ======
2022-07-13 15:35:26,415 INFO ====== WebDriver manager ======
run_process called 5 times

[WDM] - ====== WebDriver manager ======
2022-07-13 15:35:26,417 INFO ====== WebDriver manager ======
run_process called 6 times

[WDM] - ====== WebDriver manager ======
2022-07-13 15:35:26,418 INFO ====== WebDriver manager ======
run_process called 7 times

[WDM] - ====== WebDriver manager ======
2022-07-13 15:35:26,420 INFO ====== WebDriver manager ======
run_process called 8 times

[WDM] - ====== WebDriver manager ======
2022-07-13 15:35:26,426 INFO ====== WebDriver manager ======
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:35:26,616 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:35:26,617 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:35:26,695 INFO Current google-chrome version is 103.0.5060
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:35:26,697 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:35:26,700 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:35:26,699 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:35:26,701 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:35:26,699 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:35:26,710 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:35:26,710 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:35:26,713 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:35:26,713 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:35:26,717 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:35:26,717 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
(.venv) martinhewing@Martins-MacBook-Pro AVS-concurrent-web-scraping % python3 script_concurrent.py
run_process called 1 times

[WDM] - ====== WebDriver manager ======
2022-07-13 15:36:45,472 INFO ====== WebDriver manager ======
run_process called 2 times

[WDM] - ====== WebDriver manager ======
2022-07-13 15:36:45,476 INFO ====== WebDriver manager ======
run_process called 3 times

[WDM] - ====== WebDriver manager ======
2022-07-13 15:36:45,479 INFO ====== WebDriver manager ======
run_process called 4 times

[WDM] - ====== WebDriver manager ======
2022-07-13 15:36:45,480 INFO ====== WebDriver manager ======
run_process called 5 times
Sleeping...
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:36:45,616 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:36:45,617 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:36:45,650 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:36:45,650 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:36:45,660 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:36:45,660 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
(.venv) martinhewing@Martins-MacBook-Pro AVS-concurrent-web-scraping % python3 script_concurrent.py
run_process called 1 times

[WDM] - ====== WebDriver manager ======
2022-07-13 15:37:46,546 INFO ====== WebDriver manager ======
run_process called 2 times

[WDM] - ====== WebDriver manager ======
2022-07-13 15:37:46,550 INFO ====== WebDriver manager ======
run_process called 3 times

[WDM] - ====== WebDriver manager ======
2022-07-13 15:37:46,555 INFO ====== WebDriver manager ======
run_process called 4 times

[WDM] - ====== WebDriver manager ======
2022-07-13 15:37:46,695 INFO ====== WebDriver manager ======
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:37:46,708 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:37:46,708 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:37:46,724 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:37:46,725 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:37:46,733 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:37:46,734 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:37:46,752 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:37:46,753 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
run_process called 5 times
Sleeping...
[WDM] - Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-13 15:37:46,843 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
[WDM] - Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-13 15:37:46,843 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
[WDM] - Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-13 15:37:46,844 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
[WDM] - Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-13 15:37:46,942 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
https://spare.avspart.com/catalog/hitachi/101:uh02/0d79c019-4621-4a47-8127-bd7baa5f0c0b/
https://spare.avspart.com/catalog/hitachi/101:uh02/1a7f894f-c1b8-456b-8ed3-bf78c60e4a71/
https://spare.avspart.com/catalog/hitachi/101:uh02/06e2437d-a240-49d0-ac8d-fc553bff6c53/
https://spare.avspart.com/catalog/hitachi/101:uh02/1c6fe013-e139-4112-81a5-c01fc4591803/

[WDM] - ====== WebDriver manager ======
2022-07-13 15:38:48,773 INFO ====== WebDriver manager ======
run_process called 6 times

[WDM] - ====== WebDriver manager ======
2022-07-13 15:38:48,778 INFO ====== WebDriver manager ======
run_process called 7 times

[WDM] - ====== WebDriver manager ======
2022-07-13 15:38:48,783 INFO ====== WebDriver manager ======
run_process called 8 times

[WDM] - ====== WebDriver manager ======
2022-07-13 15:38:48,793 INFO ====== WebDriver manager ======
run_process called 9 times

[WDM] - ====== WebDriver manager ======
2022-07-13 15:38:48,802 INFO ====== WebDriver manager ======
run_process called 10 times
Sleeping...
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:38:48,947 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:38:48,948 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:38:48,964 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:38:48,964 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:38:48,967 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:38:48,967 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:38:48,971 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:38:48,973 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:38:48,989 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:38:48,994 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-13 15:38:49,065 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
[WDM] - Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-13 15:38:49,108 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
[WDM] - Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-13 15:38:49,129 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
[WDM] - Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-13 15:38:49,181 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
[WDM] - Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-13 15:38:49,189 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
https://spare.avspart.com/catalog/hitachi/101:uh02/1d07c2a9-d4f8-4b50-a6bc-e64951cd7e8e/
https://spare.avspart.com/catalog/hitachi/101:uh02/3aa2c54f-154e-4aae-8f2a-efb05b471bfa/
https://spare.avspart.com/catalog/hitachi/101:uh02/3c0b42bb-c6c9-4f60-8c2e-d5258a703d76/
https://spare.avspart.com/catalog/hitachi/101:uh02/2780b803-2f37-4777-a5c6-97ea9e54137d/
https://spare.avspart.com/catalog/hitachi/101:uh02/47a76d4e-70b0-4b6d-9308-67b91a4619ad/

[WDM] - ====== WebDriver manager ======
2022-07-13 15:39:49,816 INFO ====== WebDriver manager ======
[WDM] - Current google-chrome version is 103.0.5060
2022-07-13 15:39:50,147 INFO Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
2022-07-13 15:39:50,148 INFO Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
2022-07-13 15:39:50,368 INFO Driver [/Users/martinhewing/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache
https://spare.avspart.com/catalog/hitachi/101:uh02/540f4b09-795a-41de-9715-8825e296018b/
Elapsed run time: 2.27 minutes.
Calls to run_process: 10

Data

0
https://spare.avspart.com/catalog/hitachi/101:uh02/06e2437d-a240-49d0-ac8d-fc553bff6c53/
https://spare.avspart.com/catalog/hitachi/101:uh02/0d79c019-4621-4a47-8127-bd7baa5f0c0b/
https://spare.avspart.com/catalog/hitachi/101:uh02/1a7f894f-c1b8-456b-8ed3-bf78c60e4a71/
https://spare.avspart.com/catalog/hitachi/101:uh02/1c6fe013-e139-4112-81a5-c01fc4591803/
https://spare.avspart.com/catalog/hitachi/101:uh02/1d07c2a9-d4f8-4b50-a6bc-e64951cd7e8e/
https://spare.avspart.com/catalog/hitachi/101:uh02/2780b803-2f37-4777-a5c6-97ea9e54137d/
https://spare.avspart.com/catalog/hitachi/101:uh02/3aa2c54f-154e-4aae-8f2a-efb05b471bfa/
https://spare.avspart.com/catalog/hitachi/101:uh02/3c0b42bb-c6c9-4f60-8c2e-d5258a703d76/
https://spare.avspart.com/catalog/hitachi/101:uh02/47a76d4e-70b0-4b6d-9308-67b91a4619ad/
https://spare.avspart.com/catalog/hitachi/101:uh02/540f4b09-795a-41de-9715-8825e296018b/
https://spare.avspart.com/catalog/hitachi/101:uh02/57cefeb3-9dd2-4f99-a552-50dc452b6565/
https://spare.avspart.com/catalog/hitachi/101:uh02/58c4d3b6-9a15-4be0-8082-19980c2119fe/
https://spare.avspart.com/catalog/hitachi/101:uh02/5b2f40e4-a61f-4a3d-a15f-a41659595b28/

Here's my attempt to implement Headless Mode.

def get_driver(headless):
    options = webdriver.Options()
    if headless:
        options.add_argument("--headless")

    # initialize driver
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()), options=options
    )
    return driver
# script_concurrent.py

from concurrent.futures import ThreadPoolExecutor, wait
from time import sleep, time
from termcolor import colored
from random import randint
import threading
import datetime
import sys

from scrapers.scraper import get_driver, connect_to_base, parse_html, write_to_file


def counted(f):
    def wrapped(*args, **kwargs):
        wrapped.calls  = 1
        return f(*args, **kwargs)

    wrapped.calls = 0

    return wrapped


def sleepy(f):
    def wrapped(*args, **kwargs):
        with lock:
            wrapped.calls  = 1
            print(f"{f.__name__} called {wrapped.calls} times")
            if wrapped.calls % 20 == 0:
                print(colored("Sleeping...", "blue"))
                sleep(randint(60, 65))
        return f(*args, **kwargs)

    lock = threading.Lock()
    wrapped.calls = 0

    return wrapped


@counted
@sleepy
def run_process(filename, headless):

    # init browser
    browser = get_driver(headless)

    if connect_to_base(browser):
        sleep(2)
        html = browser.page_source
        output_list = parse_html(html)
        write_to_file(output_list, filename)
        # exit
        browser.quit()
    else:
        print("Error connecting to AVS")
        browser.quit()


if __name__ == "__main__":

    headless = False
    if len(sys.argv) > 1:
        if sys.argv[1] == "headless":
            print("Running in headless mode")
            headless = True

    start_time = time()
    output_timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    output_filename = f"Hitachi_{output_timestamp}.csv"

    futures = []

    with ThreadPoolExecutor() as executor:
        futures.extend(
            executor.submit(run_process, output_filename, headless)
            for _ in range(2, 202)
        )

    wait(futures)
    end_time = time()
    elapsed_time = end_time - start_time
    print(f"Elapsed run time: {elapsed_time / 60:.2f} minutes.")
    print(f"Calls to run_process: {run_process.calls}")

# script.py

import csv
import requests
import itertools
import pandas as pd
from pathlib import Path
from selenium import webdriver
from termcolor import colored
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

BASE_DIR = Path(__file__).resolve(strict=True).parent.parent


def csv_to_iter(filename, idx=0):
    pd.set_option("display.max_rows", None)
    df = pd.read_csv(filename)
    df = df.iloc[:, [idx]]
    df = df.values.tolist()
    df = list(itertools.chain(*df))
    df = sorted(list(set(df)))
    return iter(df)


my_iter = csv_to_iter(
    filename="/Users/martinhewing/Downloads/Code/AVS-concurrent-web-scraping/Sorted_MAH_Hitachi_urls.csv"
)


def get_driver(headless):
    options = webdriver.Options()
    if headless:
        options.add_argument("--headless")

    # initialize driver
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()), options=options
    )
    return driver


def connect_to_base(browser):
    my_next_iter = next(my_iter)
    connection_attempts = 0
    while connection_attempts < 3:
        try:
            browser.get(my_next_iter)
            print(colored(browser.current_url, "green"))
            # wait for table element with id = 'content' to load
            # before returning True
            WebDriverWait(browser, 5).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, ".container"))
            )
            return True
        except Exception as e:
            print(e)
            connection_attempts  = 1
            print(f"Error connecting to {my_next_iter}.")
            print(f"Attempt #{connection_attempts}.")
    return False


def parse_html(html):
    # create soup object
    soup = BeautifulSoup(html, "html.parser")
    # parse soup object to get wikipedia article url, title, and last modified date
    # part_position = [
    #     item.text.strip() for item in soup.findAll("td", {"data-title": "Pos."})
    # ]
    part_number_1 = [
        item.text.strip() for item in soup.findAll("td", {"data-title": "Part â"})
    ]
    part_number_2 = [
        item.text.strip() for item in soup.findAll("td", {"data-title": "Part №"})
    ]

    if not part_number_1:
        pass
    else:
        part_number = part_number_1

    if not part_number_2:
        pass
    else:
        part_number = part_number_2

    part_qty = [item.text.strip() for item in soup.findAll("td", {"data-title": "Qty"})]

    part_name = [
        item.text.strip() for item in soup.findAll("td", {"data-title": "Part name"})
    ]

    part_comments = [
        item.text.strip() for item in soup.findAll("td", {"data-title": "Comments"})
    ]

    machine = [
        item.text.split()[0] for item in soup.findAll("article", {"id": "node-content"})
    ]

    alternative_machines = [
        item.text.split()[2] for item in soup.findAll("article", {"id": "node-content"})
    ]

    title = [item.text for item in soup.findAll("span", {"class": "trans"})]

    parts_group = [item.h3 for item in soup.findAll("div", {"class": "card-header"})]

    article_info = {
        # "Pos.": part_position,
        "Part No": part_number,
        "Qty": part_qty,
        "Parts name": part_name,
        "Comments": part_comments,
        "Machine": machine,
        "Alternative_machines": alternative_machines,
        "Title": title,
        "Parts_group": parts_group,
    }

    return [article_info]


def get_load_time(article_url):
    try:
        # set headers
        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"
        }
        # make get request to article_url
        response = requests.get(
            article_url, headers=headers, stream=True, timeout=3.000
        )
        # get page load time
        load_time = response.elapsed.total_seconds()
    except Exception as e:
        print(e)
        load_time = "Loading Error"
    return load_time


def write_to_file(output_list, filename):
    for row in output_list:
        with open(Path(BASE_DIR).joinpath(filename), "a") as csvfile:
            fieldnames = [
                "Pos.",
                "Part No",
                "Qty",
                "Parts name",
                "Comments",
                "Machine",
                "Alternative_machines",
                "Title",
                "Parts_group",
            ]
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writerow(row)

Output

Running in headless mode
run_process called 1 times
run_process called 2 times
run_process called 3 times
run_process called 4 times
run_process called 5 times
run_process called 6 times
run_process called 7 times
run_process called 8 times
run_process called 9 times
run_process called 10 times
run_process called 11 times
run_process called 12 times
run_process called 13 times
run_process called 14 times
run_process called 15 times
run_process called 16 times
run_process called 17 times
run_process called 18 times
run_process called 19 times
run_process called 20 times
Sleeping...

Data

0
https://spare.avspart.com/catalog/hitachi/101:uh02/06e2437d-a240-49d0-ac8d-fc553bff6c53/
https://spare.avspart.com/catalog/hitachi/101:uh02/0d79c019-4621-4a47-8127-bd7baa5f0c0b/
https://spare.avspart.com/catalog/hitachi/101:uh02/1a7f894f-c1b8-456b-8ed3-bf78c60e4a71/
https://spare.avspart.com/catalog/hitachi/101:uh02/1c6fe013-e139-4112-81a5-c01fc4591803/
https://spare.avspart.com/catalog/hitachi/101:uh02/1d07c2a9-d4f8-4b50-a6bc-e64951cd7e8e/
https://spare.avspart.com/catalog/hitachi/101:uh02/2780b803-2f37-4777-a5c6-97ea9e54137d/
https://spare.avspart.com/catalog/hitachi/101:uh02/3aa2c54f-154e-4aae-8f2a-efb05b471bfa/
https://spare.avspart.com/catalog/hitachi/101:uh02/3c0b42bb-c6c9-4f60-8c2e-d5258a703d76/
https://spare.avspart.com/catalog/hitachi/101:uh02/47a76d4e-70b0-4b6d-9308-67b91a4619ad/
https://spare.avspart.com/catalog/hitachi/101:uh02/540f4b09-795a-41de-9715-8825e296018b/
https://spare.avspart.com/catalog/hitachi/101:uh02/57cefeb3-9dd2-4f99-a552-50dc452b6565/
https://spare.avspart.com/catalog/hitachi/101:uh02/58c4d3b6-9a15-4be0-8082-19980c2119fe/
https://spare.avspart.com/catalog/hitachi/101:uh02/5b2f40e4-a61f-4a3d-a15f-a41659595b28/

When I run in headless mode there's no error but no output either, I have reviewed similar questions, however, I am at a lost to understand what might be causing this to happen. Please help:)

CodePudding user response:

Website Blocking:

The website could be detecting you scrape. There are a couple different solutions you could try.


Change your user agent:

chrome_options.add_argument("USER AGENT")

Change the "User Agent" string with the contents shown in this link: What is my user agent


Selenium Stealth:

stealth(driver,
        user_agent: 'USER AGENT',
        languages=["en-US", "en"],
        vendor="Google Inc.",
        platform="Win32",
        webgl_vendor="Intel Inc.",
        renderer="Intel Iris OpenGL Engine",
        fix_hairline=True,
        )

Selenium Stealth is a python package that is paired with Selenium and used to prevent detection. It manipulates key elements of your Selenium browser in order to bypass bot detection software.

  • Related