I'm trying to scrape a Goodreads Page to get all editions of a book, but when I run the code I get this error:
Traceback (most recent call last):
File "C:/xxx/PycharmProjects/wikipedia_pageview/isbn.py", line 141, in <module>
ed_details = get_editions_details(isbn)
File "C:/xxx/PycharmProjects/wikipedia_pageview/isbn.py", line 79, in get_editions_details
if ed_link := f"https://www.goodreads.com{ed_item['href']}":...
TypeError: 'NoneType' object is not subscriptable
I tried adding conditions in the marked places to guard against this, but they don't work. Code:
def get_editions_details(isbn):
# Purpose: search Goodreads for `isbn`, find the "other editions" link on the
# returned page, and return the tuple (ed_link, int(ed_num), isbn).
# NOTE(review): the indentation of this snippet was lost, so the nesting of the
# conditionals and of the final `return` is ambiguous - confirm against the
# original file before relying on the notes below.
# Create the search URL with the ISBN of the book
data = {'q': isbn}
book_url = get_page("https://www.goodreads.com/search", data)
#print(book_url)
# Parse the markup with Beautiful Soup
soup = bs(book_url.text, 'lxml')
# Retrieve from the book's page the link for other editions
# and the total number of editions
# Each walrus rebinds `ed_item`: first to the result of the div lookup, then to
# the result of the <a> lookup inside it. If either lookup finds nothing,
# `ed_item` is left holding that falsy result.
if ed_item := soup.find("div", class_="otherEditionsLink"):
if ed_item := ed_item.find("a"):
print(ed_item)
else:
pass
if ed_item:
# The edition count is taken from the last space-separated word of the link
# text, with surrounding parentheses stripped.
ed_num = ed_item.text.strip().split(' ')[-1].strip('()')
# The f-string always starts with the literal site prefix, so it is never empty
# and this `if` can never take its `else` branch. `ed_item['href']` is evaluated
# first, which is the expression that fails in the reported traceback when
# `ed_item` is None.
if ed_link := f"https://www.goodreads.com{ed_item['href']}":  # to understand...
print(ed_link)
else:
pass
# `ed_link` and `ed_num` are only assigned in the lines above; if those lines
# are skipped, this return references names that were never bound.
return((ed_link, int(ed_num), isbn))
if __name__ == "__main__":
    # Ensure the ./urls_files directory exists. exist_ok=True tolerates a
    # directory that is already there, while genuine failures (for example a
    # permission error) are raised instead of being silently swallowed.
    os.makedirs('./urls_files', exist_ok=True)

    # Look up the editions details for every ISBN returned by get_isbn() and
    # pass each result on to get_editions_urls().
    isbns = get_isbn()
    for isbn in isbns:
        ed_details = get_editions_details(isbn)
        get_editions_urls(ed_details)
CodePudding user response:
What happens?
The indentation in your example does not seem to be correct, and the code does not handle wrong or missing ISBNs or edition links.
How to fix?
Assign the values to `ed_link` and `ed_num` only at the point where you can be sure that there is an `href` in `ed_item`; otherwise set them to `None` or `0`, or handle the issue in another way:
def get_editions_details(isbn):
    """Search Goodreads for *isbn* and locate the "other editions" link.

    Args:
        isbn: Value sent as the ``q`` query parameter of the Goodreads search.

    Returns:
        A tuple ``(ed_link, ed_num, isbn)``. ``ed_link`` is the absolute URL
        built from the link's ``href``, or ``None`` when the page has no
        ``otherEditionsLink`` div, no ``<a>`` inside it, or no ``href`` on
        that anchor. ``ed_num`` is the integer parsed from the last word of
        the link text, or ``0`` when there is no usable link or that word is
        not a number.
    """
    response = requests.get("https://www.goodreads.com/search", params={'q': isbn})
    soup = bs(response.text, 'lxml')

    # Defaults mean "no editions found", so the return below is always valid.
    ed_link = None
    ed_num = 0

    editions_div = soup.find("div", class_="otherEditionsLink")
    anchor = editions_div.find("a") if editions_div else None
    # .get() rather than ['href']: an anchor without an href must not raise.
    href = anchor.get("href") if anchor else None

    if href:
        ed_link = f"https://www.goodreads.com{href}"
        # The count is taken from the last word of the link text, with any
        # surrounding parentheses removed.
        count_text = anchor.text.strip().split(' ')[-1].strip('()')
        try:
            ed_num = int(count_text)
        except ValueError:
            # Link text did not end in a number; keep the default of 0.
            ed_num = 0

    return (ed_link, ed_num, isbn)
if __name__ == "__main__":
    # Demonstration only: 1 is not a real ISBN, so this exercises the
    # "nothing found" path.
    ed_details = get_editions_details(1)

    # The first tuple element is the editions link; report when it is missing,
    # otherwise continue with the normal processing.
    if not ed_details[0]:
        print(f'no editionlinks for isbn:{ed_details[2]}')
    else:
        get_editions_urls(ed_details)