Extracting the links from a website using python-CodePudding

I have a dataframe `df` with two columns (PMID, DOI). I want to submit each PMID to a website as the input value, read the DOI link (the `href`) from the result, and store it in the second column (DOI). An example is shown below:

PMID     | DOI
20022636   10.1016/j.molimm.2009.11.027
20023032   10.1128/JB.01375-09  
2002360
2002352




 # Start Chrome using the driver path returned by ChromeDriverManager().install().
 driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get("https://www.pmid2cite.com/pmid-to-doi-converter")
# Wait up to 10 s for the "Consent" element to become clickable, then click it.
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//p[text()='Consent']"))).click()
# Submit every PMID in df through the page's form and record the returned link text.
for index, row in df.iterrows():
    print(str(row['PMID']))
    
    # Wait for the #formInput box to be clickable and type the PMID into it.
    # NOTE(review): the box is never cleared here; if the page keeps the previous
    # value, successive PMIDs would be appended to each other - confirm.
    WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#formInput"))).send_keys(row['PMID'])
    # Click the form's button, located by an absolute XPath.
    driver.find_element_by_xpath("/html/body/div[5]/div[2]/form/button").click()
    #print(driver.page_source)
    #print(WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.XPATH, "/html/body/div[5]/div[3]/p[1]/span[2]/a"))).get_attribute('href'))
    #print([my_elem.get_attribute("innerHTML") for my_elem in WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.XPATH, "/html/body/div[5]/div[3]/p[1]/span[2]/a")))])
    # Wait up to 20 s for the result anchor(s) to be visible and collect their innerHTML.
    # No exception is handled here, so a wait that times out stops the loop.
    res = [my_elem.get_attribute("innerHTML") for my_elem in WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.XPATH, "/html/body/div[5]/div[3]/p[1]/span[2]/a")))]
    # Store the first result in the second column of the current row.
    # NOTE(review): `index` is the row label from iterrows() while iloc is
    # positional; the two only coincide for a default 0..n-1 index - confirm.
    df.iloc[index, 1] = res[0]
    
print('Done')

CodePudding user response：

You are getting the error `ElementNotInteractableException: Message: element not interactable` because a consent form appears when the web page loads. You need to accept the consent form first; only then can you interact with the other elements.

Use WebDriverWait() to wait for the consent button and click it first, and then wait for the input element.

# Load the converter page.
driver.get("https://www.pmid2cite.com/pmid-to-doi-converter")

# A single 10-second waiter serves both lookups below.
wait = WebDriverWait(driver, 10)

# The consent form must be accepted before the rest of the page can be used.
consent_button = wait.until(EC.element_to_be_clickable((By.XPATH, "//p[text()='Consent']")))
consent_button.click()

# With the consent form gone, type the PMID into the form's input box.
pmid_input = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#formInput")))
pmid_input.send_keys("20022636")

You need to import the following:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

browser snapshot:

CodePudding user response：

Here's the answer; it returns the DOI for every PMID in a given dataframe.

from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException, WebDriverException, NoSuchElementException

# Look up the DOI for every PMID in `df` through the pmid2cite converter page
# and write it into the second column (DOI). Rows for which no result link
# shows up within the wait time are filled with 'NA'.
driver = webdriver.Chrome(ChromeDriverManager().install())
try:
    driver.get("https://www.pmid2cite.com/pmid-to-doi-converter")
    # The consent form must be accepted before the rest of the page is usable.
    WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//p[text()='Consent']"))).click()

    # Walk the rows by position: iloc is positional, whereas the labels that
    # iterrows() yields only match positions for a default 0..n-1 index.
    for position, pmid in enumerate(df['PMID']):
        try:
            pmid_input = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#formInput")))
            # Drop any text left from the previous lookup so PMIDs never concatenate.
            pmid_input.clear()
            # send_keys expects text; a numpy integer from the dataframe is not a str.
            pmid_input.send_keys(str(pmid))
            # find_element(By.XPATH, ...) replaces the removed find_element_by_xpath helper.
            driver.find_element(By.XPATH, "/html/body/div[5]/div[2]/form/button").click()
            res = [my_elem.get_attribute("innerHTML") for my_elem in WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.XPATH, "/html/body/div[5]/div[3]/p[1]/span[2]/a")))]
        except TimeoutException:
            # No input box or no result link appeared in time: mark the DOI as unavailable.
            df.iloc[position, 1] = 'NA'
        else:
            print(res)
            df.iloc[position, 1] = res[0]
finally:
    # Always close the browser, even if a lookup raised an unexpected error.
    driver.quit()

print('Done')

    PMID        DOI
0   277290620   10.1186/s13075-016-1133-8
1   110534327   10.1074/jbc.M004690200
2   97257504    NA
3   78270291    NA