Apologies if this is a basic question as I am still relatively new to python. I am trying to develop a webscraping script using Selenium and have gotten almost all the necessary functions down (navigating from page to page, locating and opening all the urls in one page).
However, due to the nature of the site I am trying to scrape, a few elements are omitted on some pages, while present on others. In the case that they are omitted, the terminal returns an IndexError
, which I am currently bypassing using an exception.
When I try to print the scraped data however, I get the following error:
UnboundLocalError: local variable 'manufacturer' referenced before assignment
I understand that this is likely because I catch the IndexError and pass, so the variable is never assigned, and I then reference that unassigned variable in the subsequent print
command.
Is there a way for me to still scrape the information I am looking for by circumventing both issues, and if so, how would I then export the scraped data into a csv file?
My code for the function is as follows:
def scrape():
# Scrape six product fields from the current page; `browser` is a module-level Selenium WebDriver.
browser.implicitly_wait(7)
try:
# Each find_elements(...)[0] raises IndexError when the element is absent on this page.
collection = browser.find_elements(By.XPATH,'//*[@id="page-content-wrapper"]/div/ul/li[5]/a')[0].text
description = browser.find_elements(By.XPATH,'//*[(@id = "child-1")]//p')[0].text
# NOTE(review): '(count(preceding-sibling::*) 1)' is not valid XPath — a '+' appears to have
# been lost in transit; presumably '(count(preceding-sibling::*) + 1)'. Confirm against the site.
dimension = browser.find_elements(By.XPATH,'//*[(@id = "detailed-description")]//div[(((count(preceding-sibling::*) 1) = 1) and parent::*)]//p')[0].text
finish = browser.find_elements(By.XPATH,'//*[(@id = "detailed-description")]//div[(((count(preceding-sibling::*) 1) = 2) and parent::*)]//p')[0].text
country = browser.find_elements(By.XPATH,'//*[(@id = "detailed-description")]//div[(((count(preceding-sibling::*) 1) = 3) and parent::*)]//p')[0].text
manufacturer = browser.find_elements(By.XPATH,'//div[(((count(preceding-sibling::*) 1) = 4) and parent::*)]//p')[0].text
except IndexError:
# BUG (the one the question is about): swallowing IndexError here leaves the failing
# variable — and every variable after it — unassigned, so the print() below raises
# UnboundLocalError whenever any lookup failed.
pass
print(collection, description, dimension, finish, country, manufacturer)
browser.back()
Many thanks!
CodePudding user response:
Since your variables are being created inside the try block, if one of them fails then that variable and anything below that variable will not be created, and when you try to reference it the interpreter does not know what that variable is supposed to be. If you do the following the print statement will work:
def scrape():
# Step 1 of the answer: pre-assign every variable so print() can never hit UnboundLocalError.
browser.implicitly_wait(7)
# Defaults: "" stands in for any field that is missing on this page.
collection = ""
description = ""
dimension = ""
finish = ""
country = ""
manufacturer = ""
try:
collection = browser.find_elements(By.XPATH,'//*[@id="page-content-wrapper"]/div/ul/li[5]/a')[0].text
description = browser.find_elements(By.XPATH,'//*[(@id = "child-1")]//p')[0].text
dimension = browser.find_elements(By.XPATH,'//*[(@id = "detailed-description")]//div[(((count(preceding-sibling::*) 1) = 1) and parent::*)]//p')[0].text
finish = browser.find_elements(By.XPATH,'//*[(@id = "detailed-description")]//div[(((count(preceding-sibling::*) 1) = 2) and parent::*)]//p')[0].text
country = browser.find_elements(By.XPATH,'//*[(@id = "detailed-description")]//div[(((count(preceding-sibling::*) 1) = 3) and parent::*)]//p')[0].text
manufacturer = browser.find_elements(By.XPATH,'//div[(((count(preceding-sibling::*) 1) = 4) and parent::*)]//p')[0].text
except IndexError:
# Limitation (explained in the prose below): a single shared try block means the first
# failing lookup aborts the rest, so every field after it keeps its "" default.
pass
print(collection, description, dimension, finish, country, manufacturer)
browser.back()
So now you have to take care of the problem of the variables below the failing one also not being assigned; I recommend using a dictionary:
def scrape():
    """Scrape six product fields from the current page into a dict.

    Each lookup gets its own try/except so a missing element only blanks
    that one field instead of aborting all subsequent lookups.
    `browser` is the module-level Selenium WebDriver.
    """
    browser.implicitly_wait(7)
    # Default every field to "" so the dict is always complete, even when
    # some elements are missing from this particular page.
    page_elements = {
        'collection': "",
        'description': "",
        'dimension': "",
        'finish': "",
        'country': "",
        'manufacturer': "",
    }
    # FIX: the original assigned each value to an unused local variable
    # (e.g. `collection = ...`), so print(page_elements) always showed six
    # empty strings. Assign into the dict instead.
    # NOTE: the XPath predicates read '(count(preceding-sibling::*) + 1) = N';
    # the '+' had been lost in the original, which is invalid XPath.
    try:
        page_elements['collection'] = browser.find_elements(By.XPATH, '//*[@id="page-content-wrapper"]/div/ul/li[5]/a')[0].text
    except IndexError:
        pass
    try:
        page_elements['description'] = browser.find_elements(By.XPATH, '//*[(@id = "child-1")]//p')[0].text
    except IndexError:
        pass
    try:
        page_elements['dimension'] = browser.find_elements(By.XPATH, '//*[(@id = "detailed-description")]//div[(((count(preceding-sibling::*) + 1) = 1) and parent::*)]//p')[0].text
    except IndexError:
        pass
    try:
        page_elements['finish'] = browser.find_elements(By.XPATH, '//*[(@id = "detailed-description")]//div[(((count(preceding-sibling::*) + 1) = 2) and parent::*)]//p')[0].text
    except IndexError:
        pass
    try:
        page_elements['country'] = browser.find_elements(By.XPATH, '//*[(@id = "detailed-description")]//div[(((count(preceding-sibling::*) + 1) = 3) and parent::*)]//p')[0].text
    except IndexError:
        pass
    try:
        page_elements['manufacturer'] = browser.find_elements(By.XPATH, '//div[(((count(preceding-sibling::*) + 1) = 4) and parent::*)]//p')[0].text
    except IndexError:
        pass
    print(page_elements)
    browser.back()
And again, using try blocks for each statement is a little cumbersome, so if you want to save the paths in a different dictionary you could do something like this:
def scrape():
    """Scrape six product fields from the current page into a dict.

    Data-driven variant: field names map to their XPaths, and one loop with
    one try/except replaces the six hand-written try blocks. A missing
    element leaves that field as "". `browser` is the module-level Selenium
    WebDriver.
    """
    # FIX: original had typo `browser.implicityl_wait(7)` (AttributeError).
    browser.implicitly_wait(7)
    # NOTE: the XPath predicates read '(count(preceding-sibling::*) + 1) = N';
    # the '+' had been lost in the original, which is invalid XPath.
    element_paths = {
        'collection': '//*[@id="page-content-wrapper"]/div/ul/li[5]/a',
        'description': '//*[(@id = "child-1")]//p',
        'dimension': '//*[(@id = "detailed-description")]//div[(((count(preceding-sibling::*) + 1) = 1) and parent::*)]//p',
        'finish': '//*[(@id = "detailed-description")]//div[(((count(preceding-sibling::*) + 1) = 2) and parent::*)]//p',
        'country': '//*[(@id = "detailed-description")]//div[(((count(preceding-sibling::*) + 1) = 3) and parent::*)]//p',
        'manufacturer': '//div[(((count(preceding-sibling::*) + 1) = 4) and parent::*)]//p',
    }
    # Default every field to "" so the dict is complete even on failures.
    page_elements = {field: "" for field in element_paths}
    for field, xpath in element_paths.items():
        try:
            page_elements[field] = browser.find_elements(By.XPATH, xpath)[0].text
        except IndexError:
            # Element absent on this page — keep the "" default.
            pass
    print(page_elements)
    browser.back()