wait = WebDriverWait(driver, 20) #wait 20s to let webpage load
driver.get('https://beta.clinicaltrials.gov/') #getting website url
driver.maximize_window()
time.sleep(1)
country = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="content"]/div/ctg-home/div/div[2]/ctg-advanced-search-home/div[2]/div[1]/fieldset/div[2]/div[3]/ctg-location-search-input/form/div[2]/div/label')))
country.click()
searchBar = driver.find_element("id",'location-input')
searchBar.send_keys("Singapore") #input country name into searchBar
search_dropdown = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="mat-option-14"]/span'))) #wait till xpath is visible
search_dropdown.click()
search_button = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="content"]/div/ctg-home/div/div[2]/ctg-advanced-search-home/div[2]/div[2]/div/div[2]/button')))
search_button.click()
#finding filter button for recruiting status
filter_button = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="filter-button-statusGroup"]')))
filter_button.click()
#clicking on 'recruiting' status
recruiting = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="adv-check-status"]/div[2]/div[2]/div/label')))
recruiting.click()
#scraping each clinical trial details
clinical_trial = {} #empty dict to store details
name_list = []
phone_list = []
email_list = []
enrollment = []
condition_list = []
#loop to go through all the clinical trials in the search page (10 per page)
for i in range(1,11):
time.sleep(2) #wait 2s to let page load
xpath = '//*[@id="content"]/div/ctg-search-results/div[2]/div/div[2]/div/div[2]/div[1]/ctg-search-hit-card[{}]/div/header/a'.format(i)
trials = driver.find_element("xpath", xpath)
trials.click()
#time.sleep(5) #wait 5s to let page load
#getting contact person name
name = wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="studyDetailsInfo"]/ctg-study-info/div/ctg-study-info-view/div/div[2]/ctg-study-contacts-and-locations/div/div/div/ctg-study-contact-info/p[1]/span')))
name_list.append(name.text) #adding each name to the list
#phone number of contact person
phone = wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="studyDetailsInfo"]/ctg-study-info/div/ctg-study-info-view/div/div[2]/ctg-study-contacts-and-locations/div/div/div/ctg-study-contact-info/p[2]/span')))
phone_list.append(phone.text) #adding each phone number to the list
#email of contact person
email = wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="studyDetailsInfo"]/ctg-study-info/div/ctg-study-info-view/div/div[2]/ctg-study-contacts-and-locations/div/div/div/ctg-study-contact-info/p[3]/ctg-study-contact-email/span/a')))
email_list.append(email.text) #adding each email address to the list
#number of enrollment
enrollment_num = wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="studyDetailsInfo"]/ctg-study-info/div/ctg-study-info-view/div/div[1]/ctg-study-overview/div[3]/div[2]/div[3]/div[2]')))
enrollment.append(enrollment_num.text) #adding each enrollment number to the list
#condition of study
conditions = wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="studyDetailsInfo"]/ctg-study-info/div/ctg-study-info-view/div/div[1]/ctg-study-overview/div[3]/div[2]/div[1]/div[2]')))
condition_list.append(conditions.text) #adding conditions of the study to list
driver.back() #return to search page
#adding all the different list details to the contact_details dict
clinical_trial["name"] = name_list
clinical_trial["phone_num"] = phone_list
clinical_trial["email_address"] = email_list
clinical_trial["Enrollment"] = enrollment
clinical_trial["Conditions"] = condition_list
I am having an issue with selenium somehow not finding the xpath for enrollment_num
in the loop. The loop runs through the 10 clickable links on the webpage, however it gives a TimeoutException
error at the 9th link. Why is that so? When i change the loop to iterate through 8 links instead of the usual 10 links, it works fine. Its just that one link which creates the error.
CodePudding user response:
Page number 9 is different from all the other pages. The difference is hard to spot. Tipp: to compare strings i use Notepad with the compare plugin. This page does not have these 2 elements:
enrollment_num =...ctg-study-overview/div[3]/div[2]/di...'
here it is:
enrollment_num =...ctg-study-overview/div[2]/div[2]/di...
conditions = ...ctg-study-overview/div[3]/di...
here it is:
...ctg-study-overview/div[2]/di...
This is why it runs into a timeout. You could build a try:except:else around these to avoid the program from crashing. Below a quick fix. Of course you should tidy it up. I hope this helps.
# number of enrollment
try:
enrollment_num = wait.until(EC.visibility_of_element_located((By.XPATH,
'//*[@id="studyDetailsInfo"]/ctg-study-info/div/ctg-study-info-view/div/div[1]/ctg-study-overview/div[3]/div[2]/div[3]/div[2]')))
enrollment.append(enrollment_num.text) # adding each enrollment number to the list
except:
print("enrollement div[3] but div[2]")
enrollment_num = wait.until(EC.visibility_of_element_located((By.XPATH,
'//*[@id="studyDetailsInfo"]/ctg-study-info/div/ctg-study-info-view/div/div[1]/ctg-study-overview/div[2]/div[2]/div[3]/div[2]')))
enrollment.append(enrollment_num.text) # adding each enrollment number to the list
else:
pass
# condition of study
try:
conditions = wait.until(EC.visibility_of_element_located((By.XPATH,
'//*[@id="studyDetailsInfo"]/ctg-study-info/div/ctg-study-info-view/div/div[1]/ctg-study-overview/div[3]/div[2]/div[1]/div[2]')))
condition_list.append(conditions.text) # adding conditions of the study to list
except:
print("condition_list non div[3] but div[2]")
conditions = wait.until(EC.visibility_of_element_located((By.XPATH,
'//*[@id="studyDetailsInfo"]/ctg-study-info/div/ctg-study-info-view/div/div[1]/ctg-study-overview/div[2]/div[2]/div[1]/div[2]')))
condition_list.append(conditions.text) # adding conditions of the study to list
else:
pass