Home > Back-end >  Trouble implementing ThreadPoolExecutor with selenium, why soup is duplicated based on number of threads
Trouble implementing ThreadPoolExecutor with selenium, why soup is duplicated based on number of threads

Time:09-30

I'm trying to iterate through each URL in the allItemUrls list and apply getItemDetails() to each one. I want to use ThreadPoolExecutor for this, but I keep getting duplicated data, and the number of duplicates depends on the number of workers specified. For example, with 4 workers I get 4 identical SKUs printed out, instead of 4 unique URLs being processed in parallel and returning 4 unique SKUs. The problem starts when I pass the URL into getSoup(): I seem to get the same soup back for all 4 workers.

# Configure a headless Chrome instance used by the whole script.
options = Options()
options.add_argument("--headless")
options.add_argument("--disable-extensions")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--no-sandbox")
options.add_argument(f"user-agent={user_agent}")
# NOTE(review): a single WebDriver is created here and then used from every
# worker thread below. WebDriver is not thread-safe, so concurrent
# driver.get() calls race — this is the likely cause of the duplicated soups.
driver = Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Accumulator for the scraped records; appended to from worker threads.
allItemsDetails = []

def getSoup(url):
    """Navigate the shared driver to *url* and parse the loaded page.

    Returns a BeautifulSoup tree built from ``driver.page_source`` with the
    "lxml" parser. NOTE(review): this relies on the single module-level
    ``driver``, so concurrent callers race on which page_source they read.
    """
    driver.get(url)
    page_html = driver.page_source
    return BeautifulSoup(page_html, "lxml")

def getItemDetails(itemUrl):
    """Scrape the SKU from *itemUrl* and append it to ``allItemsDetails``.

    Fetches the page through the shared driver via getSoup(). Pages without
    a SKU element are skipped with a diagnostic message instead of crashing
    the worker thread.
    """
    soup = getSoup(itemUrl)

    try:
        # find() returns None when the element is absent; .text then raises
        # AttributeError. The original printed this expression BEFORE the
        # try block, so a missing SKU crashed the worker despite the except.
        sku = soup.find("span", {"itemprop": "sku"}).text
    except AttributeError:
        # Narrowed from a bare `except:` so real bugs are no longer hidden.
        print("Something went wrong at getItemDetails")
        return

    print(sku)  # preserve the original diagnostic output, now crash-safe

    # list.append mutates in place and never rebinds the name, so the
    # `global` statement in the original was unnecessary.
    allItemsDetails.append({"sku": sku})

# Fan the item URLs out over 4 worker threads; the `with` block waits for
# every worker to finish before continuing.
# NOTE(review): all four workers drive the SAME Chrome instance, so the page
# loaded by one worker can be parsed by the others — hence the duplicates.
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    executor.map(getItemDetails, allItemUrls)

CodePudding user response:

Don't use a global in your case, as that will lead to a race condition.

from concurrent.futures import ThreadPoolExecutor, as_completed

from selenium.common.exceptions import TimeoutException
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# User-Agent header sent with every request; replace with a real value.
user_agent = 'Your defined one!'

# Populate with the item-detail URLs to scrape.
allurls = []


def parser(driver, url):
    """Open *url* in *driver* and return ``{"SKU": text}``, or None on timeout.

    Waits up to 10 seconds for the SKU element to be present in the DOM.
    """
    driver.get(url)
    sku_locator = (By.CSS_SELECTOR, "span[itemprop=sku]")
    try:
        element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(sku_locator)
        )
    except TimeoutException:
        return None
    return {"SKU": element.text}


def main():
    """Scrape the SKU for every URL in ``allurls`` and print the results."""
    options = ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--disable-extensions")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--no-sandbox")
    options.add_argument(f"user-agent={user_agent}")
    allitems = []
    # The Chrome context manager quits the browser on exit; the executor
    # context waits for all submitted futures before leaving the block.
    # NOTE(review): a single driver is still shared by every worker thread
    # here — WebDriver is not thread-safe, so concurrent driver.get() calls
    # can interleave. Confirm whether one driver per thread is needed.
    with ThreadPoolExecutor() as executor, Chrome(options=options) as driver:
        fs = (executor.submit(parser, driver, url) for url in allurls)
        for f in as_completed(fs):
            # Future.result() caches its value, so the double call is cheap.
            if f.result():
                allitems.append(f.result())
    print(allitems)


# Script entry point: only run the scrape when executed directly.
if __name__ == '__main__':
    main()

  • Related