Home > database >  How can I make this selenium code run in parallel?
How can I make this selenium code run in parallel?

Time:04-26

I have two separate Selenium scripts that each scrape a website and download a file. I am trying to merge them into one script and make them run simultaneously rather than sequentially. Can someone create working code that merges the two so that they run in parallel?

Here is the first code:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
import os
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Run Chrome headless with a fixed 1920x1080 window size.
options = Options()
options.add_argument("--headless")
options.add_argument("--window-size=1920,1080")

driver = webdriver.Chrome(options=options)

# Set the page download behavior to "allow", saving into the current working directory.
params = {'behavior': 'allow', 'downloadPath': os.getcwd()}
driver.execute_cdp_cmd('Page.setDownloadBehavior', params)

driver.get("https://www.ons.gov.uk/")

wait = WebDriverWait(driver, 20)

# Type the query exactly once; sending it twice would double the text in the search box.
wait.until(EC.element_to_be_clickable((By.NAME, "q"))).send_keys("Education and childcare")

# Click the search button and then the three follow-up links, in order.
# Each element is waited for (up to 20 s) instead of being looked up immediately,
# because the preceding click presumably triggers a page load.
# find_element_by_xpath is avoided: it was removed in Selenium 4.3.
for xpath in (
    '//*[@id="nav-search-submit"]',
    '//*[@id="results"]/div[1]/div[2]/div[1]/h3/a/span',
    '//*[@id="main"]/div[2]/div[1]/section/div/div[1]/div/div[2]/h3/a/span',
    '//*[@id="main"]/div[2]/div/div[1]/div[2]/p[2]/a',
):
    wait.until(EC.element_to_be_clickable((By.XPATH, xpath))).click()

and here is the second code:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
import os
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
# Headless mode and the fixed window size are left disabled in this script.
#options.add_argument("--headless")
#options.add_argument("--window-size=1920,1080")

driver = webdriver.Chrome(options=options)

# Set the page download behavior to "allow", saving into the current working directory.
params = {'behavior': 'allow', 'downloadPath': os.getcwd()}
driver.execute_cdp_cmd('Page.setDownloadBehavior', params)

driver.get("https://data.gov.uk/")

wait = WebDriverWait(driver, 20)

# Fill in the search box once it is clickable.
wait.until(
    EC.element_to_be_clickable((By.XPATH, "/html/body/div[3]/main/div[2]/form/div/div/input"))
).send_keys("Forestry Statistics 2018: Recreation")

# Click the search button and then the two follow-up links, in order.
# Each element is waited for (up to 20 s) instead of being looked up immediately,
# because the preceding click presumably triggers a page load.
# find_element_by_xpath is avoided: it was removed in Selenium 4.3.
for xpath in (
    '/html/body/div[3]/main/div[2]/form/div/div/div/button',
    '/html/body/div[3]/form/main/div/div[2]/div[2]/div[2]/h2/a',
    '/html/body/div[3]/main/div/div/div/section/table/tbody/tr[2]/td[1]/a',
):
    wait.until(EC.element_to_be_clickable((By.XPATH, xpath))).click()

CodePudding user response:

The simplest approach is to just create a multithreading pool of size 2 (you do not need a multiprocessing pool since each Chrome driver is already running in its own process):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
import os
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

from multiprocessing.pool import ThreadPool
from functools import partial

def getDriver():
    """Create and return a Chrome driver running headless with a 1920x1080 window."""
    chrome_options = Options()
    for flag in ("--headless", "--window-size=1920,1080"):
        chrome_options.add_argument(flag)
    return webdriver.Chrome(options=chrome_options)

def task1():
    """Search www.ons.gov.uk for "Education and childcare" and click through to the file link.

    Creates its own driver via getDriver() and always quits it, even when a
    step fails.

    Raises:
        Whatever Selenium raises for a failed step, e.g. a timeout when an
        element does not become clickable within 20 seconds.
    """
    driver = getDriver()
    try:
        # Set the page download behavior to "allow", saving into the current working directory.
        params = {'behavior': 'allow', 'downloadPath': os.getcwd()}
        driver.execute_cdp_cmd('Page.setDownloadBehavior', params)

        driver.get("https://www.ons.gov.uk/")

        wait = WebDriverWait(driver, 20)

        # Type the query exactly once; sending it twice would double the text in the search box.
        wait.until(EC.element_to_be_clickable((By.NAME, "q"))).send_keys("Education and childcare")

        # Click the search button and then the three follow-up links, in order.
        # Each element is waited for instead of being looked up immediately,
        # because the preceding click presumably triggers a page load.
        # find_element_by_xpath is avoided: it was removed in Selenium 4.3.
        for xpath in (
            '//*[@id="nav-search-submit"]',
            '//*[@id="results"]/div[1]/div[2]/div[1]/h3/a/span',
            '//*[@id="main"]/div[2]/div[1]/section/div/div[1]/div/div[2]/h3/a/span',
            '//*[@id="main"]/div[2]/div/div[1]/div[2]/p[2]/a',
        ):
            wait.until(EC.element_to_be_clickable((By.XPATH, xpath))).click()
        # NOTE(review): the driver is quit right after the final click. If that click
        # starts a file download, it may not have finished yet -- confirm, and wait
        # for the file to appear before quitting if necessary.
    finally:
        driver.quit()

def task2():
    """Search data.gov.uk for "Forestry Statistics 2018: Recreation" and click through to the file link.

    Creates its own driver via getDriver() and always quits it, even when a
    step fails.

    Raises:
        Whatever Selenium raises for a failed step, e.g. a timeout when an
        element does not become clickable within 20 seconds.
    """
    driver = getDriver()
    try:
        # Set the page download behavior to "allow", saving into the current working directory.
        params = {'behavior': 'allow', 'downloadPath': os.getcwd()}
        driver.execute_cdp_cmd('Page.setDownloadBehavior', params)

        driver.get("https://data.gov.uk/")

        wait = WebDriverWait(driver, 20)

        # Fill in the search box once it is clickable.
        wait.until(
            EC.element_to_be_clickable((By.XPATH, "/html/body/div[3]/main/div[2]/form/div/div/input"))
        ).send_keys("Forestry Statistics 2018: Recreation")

        # Click the search button and then the two follow-up links, in order.
        # Each element is waited for instead of being looked up immediately,
        # because the preceding click presumably triggers a page load.
        # find_element_by_xpath is avoided: it was removed in Selenium 4.3.
        for xpath in (
            '/html/body/div[3]/main/div[2]/form/div/div/div/button',
            '/html/body/div[3]/form/main/div/div[2]/div[2]/div[2]/h2/a',
            '/html/body/div[3]/main/div/div/div/section/table/tbody/tr[2]/td[1]/a',
        ):
            wait.until(EC.element_to_be_clickable((By.XPATH, xpath))).click()
        # NOTE(review): the driver is quit right after the final click. If that click
        # starts a file download, it may not have finished yet -- confirm, and wait
        # for the file to appear before quitting if necessary.
    finally:
        driver.quit()

def error_callback(task_name, e):
    """Print a one-line report that the task named *task_name* ended with exception *e*."""
    message = '{} completed with exception {}'.format(task_name, e)
    print(message)

# Two worker threads: one for each of the two tasks submitted below.
POOL_SIZE = 2
pool = ThreadPool(POOL_SIZE)

# Submit both tasks; if one raises, error_callback is called with that task's name
# and the exception.
for label, job in (('task1', task1), ('task2', task2)):
    pool.apply_async(job, error_callback=partial(error_callback, label))

# Accept no further work, then block until the submitted tasks have finished.
pool.close()
pool.join()
  • Related