I have two seperate selenium codes that scrape a website and download a file. I am trying to merge them into one script and make them run simultaneously rather than sequentially. Can someone create a working code that merges the two so that they run in parallel?.
Here is the first code:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
import os
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
options=Options()
options.add_argument("--headless")
options.add_argument("--window-size=1920,1080")
driver=webdriver.Chrome(options=options)
params={'behavior':'allow','downloadPath':os.getcwd()}
driver.execute_cdp_cmd('Page.setDownloadBehavior',params)
driver.get("https://www.ons.gov.uk/")
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.NAME, "q"))).send_keys("Education and childcare")
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.NAME, "q"))).send_keys("Education and childcare")
click_button=driver.find_element_by_xpath('//*[@id="nav-search-submit"]').click()
click_button=driver.find_element_by_xpath('//*[@id="results"]/div[1]/div[2]/div[1]/h3/a/span').click()
click_button=driver.find_element_by_xpath('//*[@id="main"]/div[2]/div[1]/section/div/div[1]/div/div[2]/h3/a/span').click()
click_button=driver.find_element_by_xpath('//*[@id="main"]/div[2]/div/div[1]/div[2]/p[2]/a').click()
and here is the second code:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
import os
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
options=Options()
#options.add_argument("--headless")
#options.add_argument("--window-size=1920,1080")
driver=webdriver.Chrome(options=options)
params={'behavior':'allow','downloadPath':os.getcwd()}
driver.execute_cdp_cmd('Page.setDownloadBehavior',params)
driver.get("https://data.gov.uk/")
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "/html/body/div[3]/main/div[2]/form/div/div/input"))).send_keys("Forestry Statistics 2018: Recreation")
click_button=driver.find_element_by_xpath('/html/body/div[3]/main/div[2]/form/div/div/div/button').click()
click_button=driver.find_element_by_xpath('/html/body/div[3]/form/main/div/div[2]/div[2]/div[2]/h2/a').click()
click_button=driver.find_element_by_xpath('/html/body/div[3]/main/div/div/div/section/table/tbody/tr[2]/td[1]/a').click()
CodePudding user response:
The simplest approach is to just create a multithreading pool of size 2 (you do not need a multiprocessing pool since each Chrome driver is already running in its own process):
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
import os
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from multiprocessing.pool import ThreadPool
from functools import partial
def getDriver():
options = Options()
options.add_argument("--headless")
options.add_argument("--window-size=1920,1080")
driver = webdriver.Chrome(options=options)
return driver
def task1():
driver = getDriver()
try:
params = {'behavior':'allow','downloadPath':os.getcwd()}
driver.execute_cdp_cmd('Page.setDownloadBehavior',params)
driver.get("https://www.ons.gov.uk/")
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.NAME, "q"))).send_keys("Education and childcare")
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.NAME, "q"))).send_keys("Education and childcare")
click_button = driver.find_element_by_xpath('//*[@id="nav-search-submit"]').click()
click_button = driver.find_element_by_xpath('//*[@id="results"]/div[1]/div[2]/div[1]/h3/a/span').click()
click_button = driver.find_element_by_xpath('//*[@id="main"]/div[2]/div[1]/section/div/div[1]/div/div[2]/h3/a/span').click()
click_button = driver.find_element_by_xpath('//*[@id="main"]/div[2]/div/div[1]/div[2]/p[2]/a').click()
finally:
driver.quit()
def task2():
driver = getDriver()
try:
params={'behavior':'allow','downloadPath':os.getcwd()}
driver.execute_cdp_cmd('Page.setDownloadBehavior',params)
driver.get("https://data.gov.uk/")
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "/html/body/div[3]/main/div[2]/form/div/div/input"))).send_keys("Forestry Statistics 2018: Recreation")
click_button = driver.find_element_by_xpath('/html/body/div[3]/main/div[2]/form/div/div/div/button').click()
click_button = driver.find_element_by_xpath('/html/body/div[3]/form/main/div/div[2]/div[2]/div[2]/h2/a').click()
click_button = driver.find_element_by_xpath('/html/body/div[3]/main/div/div/div/section/table/tbody/tr[2]/td[1]/a').click()
finally:
driver.quit()
def error_callback(task_name, e):
print(f'{task_name} completed with exception {e}')
POOL_SIZE = 2 # We only need 2 for this case
pool = ThreadPool(POOL_SIZE)
pool.apply_async(task1, error_callback=partial(error_callback, 'task1'))
pool.apply_async(task2, error_callback=partial(error_callback, 'task2'))
# Wait for tasks to complete
pool.close()
pool.join()