Apologies if this is a basic question as I am still relatively new to python. I am trying to develop a webscraping script using Selenium and have gotten almost all the necessary functions down (navigating from page to page, locating and opening all the urls in one page).
However, due to the nature of the site I am trying to scrape, a few elements are omitted on some pages, while present on others. In the case that they are omitted, the terminal returns an IndexError
, which I am currently bypassing using an exception.
When I try to print the scraped data however, I get the following error:
UnboundLocalError: local variable 'manufacturer' referenced before assignment
I understand that this is likely because I catch the IndexError and pass, so the variable is never assigned, and I then reference that unassigned variable in the subsequent print
command.
Is there a way for me to still scrape the information I am looking for by circumventing both issues, and if so, how would I then export the scraped data into a csv file?
My code for the function is as follows:
def scrape():
# Scrape six product fields from the current page; `browser` is a module-level Selenium WebDriver.
browser.implicitly_wait(7)
try:
# Each find_elements(...)[0] raises IndexError when the element is absent on this page.
collection = browser.find_elements(By.XPATH,'//*[@id="page-content-wrapper"]/div/ul/li[5]/a')[0].text
description = browser.find_elements(By.XPATH,'//*[(@id = "child-1")]//p')[0].text
# NOTE(review): '(count(preceding-sibling::*) 1)' is not valid XPath — a '+' appears to have
# been lost in transit; presumably '(count(preceding-sibling::*) + 1)'. Confirm against the site.
dimension = browser.find_elements(By.XPATH,'//*[(@id = "detailed-description")]//div[(((count(preceding-sibling::*) 1) = 1) and parent::*)]//p')[0].text
finish = browser.find_elements(By.XPATH,'//*[(@id = "detailed-description")]//div[(((count(preceding-sibling::*) 1) = 2) and parent::*)]//p')[0].text
country = browser.find_elements(By.XPATH,'//*[(@id = "detailed-description")]//div[(((count(preceding-sibling::*) 1) = 3) and parent::*)]//p')[0].text
manufacturer = browser.find_elements(By.XPATH,'//div[(((count(preceding-sibling::*) 1) = 4) and parent::*)]//p')[0].text
except IndexError:
# BUG (the one the question is about): swallowing IndexError here leaves the failing
# variable — and every variable after it — unassigned, so the print() below raises
# UnboundLocalError whenever any lookup failed.
pass
print(collection, description, dimension, finish, country, manufacturer)
browser.back()
Many thanks!
CodePudding user response:
Since your variables are being created inside the try block, if one of them fails then that variable and anything below that variable will not be created, and when you try to reference it the interpreter does not know what that variable is supposed to be. If you do the following the print statement will work:
def scrape():
# Step 1 of the answer: pre-assign every variable so print() can never hit UnboundLocalError.
browser.implicitly_wait(7)
# Defaults: "" stands in for any field that is missing on this page.
collection = ""
description = ""
dimension = ""
finish = ""
country = ""
manufacturer = ""
try:
collection = browser.find_elements(By.XPATH,'//*[@id="page-content-wrapper"]/div/ul/li[5]/a')[0].text
description = browser.find_elements(By.XPATH,'//*[(@id = "child-1")]//p')[0].text
dimension = browser.find_elements(By.XPATH,'//*[(@id = "detailed-description")]//div[(((count(preceding-sibling::*) 1) = 1) and parent::*)]//p')[0].text
finish = browser.find_elements(By.XPATH,'//*[(@id = "detailed-description")]//div[(((count(preceding-sibling::*) 1) = 2) and parent::*)]//p')[0].text
country = browser.find_elements(By.XPATH,'//*[(@id = "detailed-description")]//div[(((count(preceding-sibling::*) 1) = 3) and parent::*)]//p')[0].text
manufacturer = browser.find_elements(By.XPATH,'//div[(((count(preceding-sibling::*) 1) = 4) and parent::*)]//p')[0].text
except IndexError:
# Limitation (explained in the prose below): a single shared try block means the first
# failing lookup aborts the rest, so every field after it keeps its "" default.
pass
print(collection, description, dimension, finish, country, manufacturer)
browser.back()
So now you have to take care of the problem of the variables below the failing one also not being assigned; I recommend using a dictionary:
def scrape():
    """Scrape six product fields from the current page into a dict.

    Each lookup gets its own try/except so a missing element only blanks
    that one field instead of aborting all subsequent lookups.
    `browser` is the module-level Selenium WebDriver.
    """
    browser.implicitly_wait(7)
    # Default every field to "" so the dict is always complete, even when
    # some elements are missing from this particular page.
    page_elements = {
        'collection': "",
        'description': "",
        'dimension': "",
        'finish': "",
        'country': "",
        'manufacturer': "",
    }
    # FIX: the original assigned each value to an unused local variable
    # (e.g. `collection = ...`), so print(page_elements) always showed six
    # empty strings. Assign into the dict instead.
    # NOTE: the XPath predicates read '(count(preceding-sibling::*) + 1) = N';
    # the '+' had been lost in the original, which is invalid XPath.
    try:
        page_elements['collection'] = browser.find_elements(By.XPATH, '//*[@id="page-content-wrapper"]/div/ul/li[5]/a')[0].text
    except IndexError:
        pass
    try:
        page_elements['description'] = browser.find_elements(By.XPATH, '//*[(@id = "child-1")]//p')[0].text
    except IndexError:
        pass
    try:
        page_elements['dimension'] = browser.find_elements(By.XPATH, '//*[(@id = "detailed-description")]//div[(((count(preceding-sibling::*) + 1) = 1) and parent::*)]//p')[0].text
    except IndexError:
        pass
    try:
        page_elements['finish'] = browser.find_elements(By.XPATH, '//*[(@id = "detailed-description")]//div[(((count(preceding-sibling::*) + 1) = 2) and parent::*)]//p')[0].text
    except IndexError:
        pass
    try:
        page_elements['country'] = browser.find_elements(By.XPATH, '//*[(@id = "detailed-description")]//div[(((count(preceding-sibling::*) + 1) = 3) and parent::*)]//p')[0].text
    except IndexError:
        pass
    try:
        page_elements['manufacturer'] = browser.find_elements(By.XPATH, '//div[(((count(preceding-sibling::*) + 1) = 4) and parent::*)]//p')[0].text
    except IndexError:
        pass
    print(page_elements)
    browser.back()
And again, using try blocks for each statement is a little cumbersome, so if you want to save the paths in a different dictionary you could do something like this:
def scrape():
    """Scrape six product fields from the current page into a dict.

    Data-driven variant: field names map to their XPaths, and one loop with
    one try/except replaces the six hand-written try blocks. A missing
    element leaves that field as "". `browser` is the module-level Selenium
    WebDriver.
    """
    # FIX: original had typo `browser.implicityl_wait(7)` (AttributeError).
    browser.implicitly_wait(7)
    # NOTE: the XPath predicates read '(count(preceding-sibling::*) + 1) = N';
    # the '+' had been lost in the original, which is invalid XPath.
    element_paths = {
        'collection': '//*[@id="page-content-wrapper"]/div/ul/li[5]/a',
        'description': '//*[(@id = "child-1")]//p',
        'dimension': '//*[(@id = "detailed-description")]//div[(((count(preceding-sibling::*) + 1) = 1) and parent::*)]//p',
        'finish': '//*[(@id = "detailed-description")]//div[(((count(preceding-sibling::*) + 1) = 2) and parent::*)]//p',
        'country': '//*[(@id = "detailed-description")]//div[(((count(preceding-sibling::*) + 1) = 3) and parent::*)]//p',
        'manufacturer': '//div[(((count(preceding-sibling::*) + 1) = 4) and parent::*)]//p',
    }
    # Default every field to "" so the dict is complete even on failures.
    page_elements = {field: "" for field in element_paths}
    for field, xpath in element_paths.items():
        try:
            page_elements[field] = browser.find_elements(By.XPATH, xpath)[0].text
        except IndexError:
            # Element absent on this page — keep the "" default.
            pass
    print(page_elements)
    browser.back()