I'm trying to iterate through each URL in the allItemUrls list and apply getItemDetails() to each one. I want to use ThreadPoolExecutor to do this, but I keep getting duplicated data, and the number of duplicates depends on the number of workers specified. For example, with 4 workers I get 4 identical SKUs printed out, instead of 4 unique URLs being processed in parallel and returning 4 unique SKUs. The problem starts when I pass the URL into getSoup(): I seem to get the same soup for all 4 workers.
# Chrome command-line switches applied to the shared browser instance.
options = Options()
for _flag in (
    "--headless",
    "--disable-extensions",
    "--disable-dev-shm-usage",
    "--no-sandbox",
    f"user-agent={user_agent}",
):
    options.add_argument(_flag)

# Single module-level browser, started as soon as this code runs; the Service
# is built from whatever path ChromeDriverManager().install() returns.
_chromedriver_service = Service(ChromeDriverManager().install())
driver = Chrome(service=_chromedriver_service, options=options)

# Accumulator that getItemDetails() appends one dict per scraped item to.
allItemsDetails = []
import threading

# Guards the single module-level ``driver`` so that only one thread at a time
# can navigate it and read back the resulting page.
_driver_lock = threading.Lock()


def getSoup(url):
    """Load *url* in the shared browser and return the page parsed as soup.

    The module-level ``driver`` is one browser shared by every worker thread.
    Without synchronisation, thread A's ``driver.get(url)`` can be overwritten
    by thread B's navigation before A reads ``page_source``, so several
    threads end up parsing the same page.  Holding the lock across both the
    navigation and the read guarantees the returned soup belongs to *url*.

    Note that this serialises page loads; true parallel fetching needs a
    separate driver per worker thread.

    Args:
        url: Address of the page to load.

    Returns:
        A ``BeautifulSoup`` object built from the page source with the
        ``lxml`` parser.
    """
    with _driver_lock:
        driver.get(url)
        html = driver.page_source
    # Parsing does not touch the browser, so it can run outside the lock.
    return BeautifulSoup(html, "lxml")
def getItemDetails(itemUrl):
    """Scrape the SKU from *itemUrl* and record it in ``allItemsDetails``.

    The page is fetched through ``getSoup()``; the text of the first
    ``<span itemprop="sku">`` element is printed and appended to the
    module-level ``allItemsDetails`` list as ``{"sku": <text>}``.

    If the page has no such element, a diagnostic message is printed and
    nothing is recorded.  (Previously the lookup ran once outside the ``try``
    block, so a missing element raised ``AttributeError`` instead of reaching
    the error message.)

    Args:
        itemUrl: Address of the item page to scrape.
    """
    soup = getSoup(itemUrl)

    # Look the element up once and check it explicitly instead of relying on
    # a bare ``except`` to catch ``None.text``.
    sku_tag = soup.find("span", {"itemprop": "sku"})
    if sku_tag is None:
        print("Something went wrong at getItemDetails")
        return

    sku = sku_tag.text
    print(sku)
    # ``append`` mutates the existing list, so no ``global`` statement is
    # needed to reach the module-level name.
    allItemsDetails.append({"sku": sku})
# Schedule getItemDetails() for every item URL on a pool of four worker
# threads.  Return values are not collected; leaving the ``with`` block waits
# for all scheduled calls to finish.
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    for _item_url in allItemUrls:
        executor.submit(getItemDetails, _item_url)
CodePudding user response:
Don't use global in your case, as that will lead to a race condition.
from concurrent.futures import ThreadPoolExecutor, as_completed
from selenium.common.exceptions import TimeoutException
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# Placeholder value: main() passes it to Chrome as the "user-agent=" argument,
# so replace it with the User-Agent string you want to use.
user_agent = 'Your defined one!'
# URLs that main() submits to parser(); empty here and meant to be filled in.
allurls = []
def parser(driver, url):
    """Open *url* in *driver* and extract the item's SKU.

    Waits up to 10 seconds for an element matching the CSS selector
    ``span[itemprop=sku]`` to be present on the page.

    Args:
        driver: Browser instance used to load the page.
        url: Address of the page to load.

    Returns:
        ``{"SKU": <element text>}`` when the element is found, or ``None``
        when a ``TimeoutException`` is raised while waiting for it.
    """
    driver.get(url)

    sku_locator = (By.CSS_SELECTOR, "span[itemprop=sku]")
    sku_is_present = EC.presence_of_element_located(sku_locator)
    try:
        sku_text = WebDriverWait(driver, 10).until(sku_is_present).text
    except TimeoutException:
        return None
    return {"SKU": sku_text}
def main(max_workers=4):
    """Scrape every URL in ``allurls`` concurrently and print the results.

    Each URL is handed to ``parser()`` on a thread pool.  A browser instance
    must not be driven by several threads at once: concurrent ``get`` calls
    on one driver overwrite each other's page, producing duplicated results.
    Every worker thread therefore lazily starts its own headless Chrome and
    reuses it for all URLs that thread processes.  All browsers started this
    way are quit when the work finishes, including when a task raises.

    Args:
        max_workers: Maximum number of worker threads, and therefore the
            maximum number of Chrome instances started.

    Side effects:
        Prints the list of non-empty ``parser()`` results, in completion
        order.
    """
    import threading  # local import: only needed for per-thread drivers

    options = ChromeOptions()
    for flag in (
        "--headless",
        "--disable-extensions",
        "--disable-dev-shm-usage",
        "--no-sandbox",
        f"user-agent={user_agent}",
    ):
        options.add_argument(flag)

    per_thread = threading.local()
    # Every driver ever started, so all of them can be quit at the end.
    started_drivers = []

    def scrape(url):
        """Run parser() on *url* with the calling thread's own driver."""
        driver = getattr(per_thread, "driver", None)
        if driver is None:
            driver = Chrome(options=options)
            per_thread.driver = driver
            started_drivers.append(driver)
        return parser(driver, url)

    allitems = []
    try:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [executor.submit(scrape, url) for url in allurls]
            for future in as_completed(futures):
                item = future.result()  # re-raises any exception from the task
                if item:
                    allitems.append(item)
    finally:
        # The pool has been shut down by now, so no thread is still using
        # these browsers.
        for driver in started_drivers:
            driver.quit()
    print(allitems)
# Run the scraper only when this file is executed directly, not on import.
if __name__ == '__main__':
    main()