I'm a total beginner at Python and am currently running this nested-for-loop web-scraping program to download several Excel files for each of the thousands of observations in my dataset. However, my code runs so slowly that I need to speed it up by processing 5-20 observations AT ONCE. People have suggested threading or asyncio, but I don't know how to use them or what code to write, as the online documentation is extremely obtuse and gives no real explanation of what Python 3.9 (Spyder) is doing during my trial-and-error process.
My code is LONG, but the main point is that I need to iterate over multiple elements i (in the first line of code) at once, but I don't know how to do so. I'm looking for an easy fix. I realize this code is very clunky, but processing power/speed is not an issue. Please only help me address the concurrency issues!
Here's my code. The very first line is the loop whose iterations I need to run concurrently, i.e. for multiple (10-20) elements of the array at once.
# Resolve the chromedriver binary once; the result does not depend on the
# committee being scraped, so there is no need to repeat it per iteration.
_CHROMEDRIVER_PATH = ChromeDriverManager().install()

# "Export" button; the same XPath is used on the receipts and the
# disbursements listing pages.
_EXPORT_BUTTON_XPATH = '//*[@id="main"]/section/div[2]/div[1]/div[1]/div/div[2]/button'
# "Download" link under "Your downloads".
_DOWNLOAD_LINK_XPATH = '/html/body/div[4]/div/ul/li/div/a'


def _report(i, message, cycle):
    """Print a two-line status note for committee index ``i``.

    ``message`` is a template with one ``{}`` placeholder that receives the
    election cycle value ``cycle``.
    """
    print('For PCC ID {},'.format(''.join(commid[i])))
    print(message.format(''.join(cycle)))


def _download_receipts(driver, i, cycle):
    """Browse, export and download the receipts for one election cycle.

    Every step is best-effort: a failure is reported with ``print`` and the
    remaining steps are skipped. ``except Exception`` (rather than a bare
    ``except``) keeps that behaviour while still letting Ctrl-C stop the run.
    """
    try:
        # "Browse receipts" button on the committee homepage.
        receipts = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(
                (By.XPATH, '//*[@id="total-raised"]/div[1]/a')))
        # Click through JavaScript instead of WebElement.click().
        driver.execute_script("arguments[0].click();", receipts)
        sleep(randint(10, 15))
    except Exception:
        # The original guarded this branch with `if NoSuchElementException:`,
        # which tests a class object and is therefore always true; the
        # branch is simply unconditional.
        try:
            driver.find_element(
                By.XPATH, '/html/body/main/div[2]/header/div/span[3]')
        except Exception:
            _report(i, 'Webpage does not exist for election year {}.', cycle)
            driver.back()
        else:
            _report(i, 'Receipts do not exist for election year {}.', cycle)
        return

    try:
        # "Export" button on the receipts listing page.
        export_button = WebDriverWait(driver, 15).until(
            EC.element_to_be_clickable((By.XPATH, _EXPORT_BUTTON_XPATH)))
        export_button.click()
        sleep(randint(5, 7))
    except Exception:
        _report(
            i,
            'There is no Receipt Data to export for election year {}.',
            cycle)
        sleep(randint(5, 7))
        # NOTE(review): there is no driver.back() on this path, so the
        # browser presumably stays on the receipts listing page -- confirm
        # that the following steps can cope with that.
        return

    try:
        # "Download" link that fetches the receipts as a .csv file.
        download_link = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, _DOWNLOAD_LINK_XPATH)))
        sleep(randint(5, 7))
        download_link.click()
        sleep(randint(5, 7))
        driver.back()
        sleep(randint(5, 7))
    except Exception:
        _report(
            i,
            'I cannot download Receipt Data, since there is none to export '
            'for election year {}.',
            cycle)
        driver.back()  # Go back to PCC homepage
        sleep(randint(5, 7))


def _download_disbursements(driver, i, cycle):
    """Browse, export and download the disbursements for one election cycle.

    Same best-effort policy as the receipts steps: report the failure with
    ``print`` and skip whatever is left.
    """
    try:
        # "Browse disbursements" link on the committee homepage.
        disbursements = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(
                (By.LINK_TEXT, "Browse disbursements")))
        driver.execute_script("arguments[0].click();", disbursements)
        sleep(randint(10, 15))
    except Exception:
        _report(i, 'Disbursements do not exist for election year {}.', cycle)
        sleep(randint(5, 7))
        return

    try:
        # "Export" button on the disbursements listing page.
        export_button = WebDriverWait(driver, 15).until(
            EC.element_to_be_clickable((By.XPATH, _EXPORT_BUTTON_XPATH)))
        export_button.click()
        sleep(randint(5, 7))
    except Exception:
        _report(
            i,
            'There is no Disbursement Data to export for election year {}.',
            cycle)
        sleep(randint(5, 7))
        # NOTE(review): no driver.back() here either -- see the matching
        # branch for receipts; confirm this is intended.
        return

    try:
        # "Download" link that fetches the disbursements as a .csv file.
        download_link = WebDriverWait(driver, 15).until(
            EC.element_to_be_clickable((By.XPATH, _DOWNLOAD_LINK_XPATH)))
        sleep(randint(5, 7))
        download_link.click()
        driver.back()
        sleep(randint(5, 7))
    except Exception:
        _report(
            i,
            'I cannot download Disbursement Data, since there is none to '
            'export for election year {}.',
            cycle)
        driver.back()  # Go back to PCC homepage
        sleep(randint(5, 7))


def _scrape_committee(i):
    """Download receipts and disbursements for ``commid[i]``, all cycles.

    Opens a dedicated Chrome session, searches fec.gov for the committee ID,
    opens the first search hit and then walks every value offered by the
    ``summary-cycle`` drop-down. The browser is always closed on the way
    out, including when an unexpected exception propagates.

    The function reads only the index ``i`` and the module-level ``commid``,
    so it can be called once per index from a plain loop or from a worker
    pool.
    """
    # Say what iteration this is
    print('Beginning iteration')
    print(i)

    driver = webdriver.Chrome(_CHROMEDRIVER_PATH)
    try:
        # Open FEC webpage
        driver.get("https://www.fec.gov/")

        # Find the search bar and search the PCC ID
        searchbar = driver.find_element(
            By.XPATH, '/html/body/header[2]/div/ul/li[3]/form/div/span/input')
        searchbar.send_keys(commid[i])
        searchbar.send_keys(Keys.RETURN)

        # Click on PCC homepage (first search result)
        pcc = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable(
                (By.XPATH,
                 '/html/body/main/main/div[2]/div[2]/section/ul/li/h3/a')))
        pcc.click()

        try:
            # Two-year election cycle drop-down on the PCC homepage
            select = driver.find_element(
                By.XPATH, "//select[@id='summary-cycle']")
            options = select.find_elements(By.TAG_NAME, "option")
        except Exception:
            # No cycle selector: nothing to download for this committee.
            return

        # Collect the values up front; the option elements themselves are
        # not used again once navigation starts.
        cycles = [option.get_attribute("value") for option in options]

        for cycle in cycles:
            # Select the election cycle of interest. The element is looked
            # up again on every pass, after the navigation done by the
            # previous pass.
            dropdown = Select(driver.find_element(By.ID, 'summary-cycle'))
            dropdown.select_by_value(cycle)
            sleep(randint(5, 7))

            _download_receipts(driver, i, cycle)
            _download_disbursements(driver, i, cycle)
    finally:
        # The script previously never closed the browser, leaving one Chrome
        # instance running per iteration. Every download click above is
        # followed by a pause before control reaches this point.
        driver.quit()


for i in range(0, 33000):
    _scrape_committee(i)
CodePudding user response:
You can try concurrent.futures. Define the code you want to run as a function with one argument, and pass it like so:
import concurrent.futures
def my_func(i):
    """Process the single work item ``i`` (placeholder body)."""
    do_something


# One work item per index.
my_list = list(range(0, 33000))

with concurrent.futures.ThreadPoolExecutor() as executor:
    # Keep the Future objects: an exception raised inside my_func is stored
    # on its Future and is only visible if somebody asks for it. Submitting
    # without ever looking at the results would silently hide every failure.
    futures = {executor.submit(my_func, item): item for item in my_list}
    for future in concurrent.futures.as_completed(futures):
        error = future.exception()
        if error is not None:
            # Report and carry on, so one bad item does not stop the rest.
            print('Item {} failed: {!r}'.format(futures[future], error))
Each entry of my_list is passed into my_func. If you want to pass in more arguments to my_func(), look into "How to use multiprocessing pool.map with multiple arguments", but it doesn't look like you need that. You can play around with how many threads are used via the max_workers argument of ThreadPoolExecutor.