Problem while using bs4: 'NoneType' object is not subscriptable-CodePudding

I'm trying to scrape a Goodreads Page to get all editions of a book, but when I run the code I get this error:

Traceback (most recent call last):
  File "C:/xxx/PycharmProjects/wikipedia_pageview/isbn.py", line 141, in <module>
    ed_details = get_editions_details(isbn) 
  File "C:/xxx/PycharmProjects/wikipedia_pageview/isbn.py", line 79, in get_editions_details
    if ed_link := f"https://www.goodreads.com{ed_item['href']}":...
TypeError: 'NoneType' object is not subscriptable

I tried adding conditions in the relevant places to guard against this, but they don't work. Code:

def get_editions_details(isbn):
# Searches Goodreads for `isbn` and returns (ed_link, int(ed_num), isbn).
# NOTE: the function body below lost its indentation when the snippet was pasted.
# Create the search URL with the ISBN of the book
data = {'q': isbn}
book_url = get_page("https://www.goodreads.com/search", data)
#print(book_url)
# Parse the markup with Beautiful Soup
soup = bs(book_url.text, 'lxml')
# Retrieve from the book's page the link for other editions
# and the total number of editions
# ed_item ends up None when either the "otherEditionsLink" div or the <a>
# inside it is not found.
if ed_item := soup.find("div", class_="otherEditionsLink"):
    if ed_item := ed_item.find("a"):
        print(ed_item)
    else:
        pass

# ed_num is only assigned when ed_item is truthy; otherwise it is never defined.
if ed_item:
    ed_num = ed_item.text.strip().split(' ')[-1].strip('()')

# ed_item['href'] is evaluated unconditionally here, so a None ed_item raises
# the TypeError shown in the traceback. The f-string is never empty, so this
# condition cannot be false and the else branch is unreachable.
if ed_link := f"https://www.goodreads.com{ed_item['href']}":# to understand...
    print(ed_link)
else:
    pass
return((ed_link, int(ed_num), isbn))  



if __name__ == "__main__":
    # Make sure ./urls_files exists. exist_ok=True tolerates an existing
    # directory without hiding unrelated failures (e.g. permission errors)
    # the way a blanket `except Exception: pass` would.
    os.makedirs('./urls_files', exist_ok=True)

    isbns = get_isbn()

    # Look up the editions details for every ISBN and pass them on to
    # get_editions_urls.
    for isbn in isbns:
        ed_details = get_editions_details(isbn)
        get_editions_urls(ed_details)

CodePudding user response：

What happens?

The indentation in your example does not appear to be correct, and the code does not handle a wrong or missing ISBN or a missing editions link.

How to fix?

Assign ed_link and ed_num only at the point where you can be sure that ed_item exists and has an href; otherwise set them to None or 0, or handle the issue in another way:

def get_editions_details(isbn):
    """Search Goodreads for *isbn* and locate the book's "other editions" link.

    Args:
        isbn: ISBN to look up; sent as the ``q`` query parameter of the
            Goodreads search URL.

    Returns:
        A tuple ``(ed_link, ed_num, isbn)``. ``ed_link`` is the absolute URL
        built from the link's ``href`` and ``ed_num`` is the integer parsed
        from the last word of the link text. When the page has no
        ``otherEditionsLink`` div, no ``<a>`` inside it, or the ``<a>`` has no
        ``href``, the result is ``(None, 0, isbn)``. ``ed_num`` is also ``0``
        when the link text does not end in a number.
    """
    data = {'q': isbn}
    book_url = requests.get("https://www.goodreads.com/search", params=data)
    soup = bs(book_url.text, 'lxml')

    ed_link = None
    ed_num = 0

    container = soup.find("div", class_="otherEditionsLink")
    anchor = container.find("a") if container is not None else None
    # Tag.get() returns None for a missing attribute, whereas anchor['href']
    # would raise KeyError on an <a> without href.
    href = anchor.get("href") if anchor is not None else None

    if href:
        ed_link = f"https://www.goodreads.com{href}"
        # The count is taken from the last space-separated word of the link
        # text with surrounding parentheses removed.
        count_text = anchor.text.strip().split(' ')[-1].strip('()')
        try:
            # int() rejects thousands separators, so drop commas first.
            ed_num = int(count_text.replace(',', ''))
        except ValueError:
            # Link text did not end in a number; keep the "unknown" default
            # instead of crashing the caller.
            ed_num = 0

    return ed_link, ed_num, isbn


if __name__ == "__main__":
    # Demo run: 1 is not a real ISBN, so this exercises the
    # "no editions link" path.
    ed_details = get_editions_details(1)
    if not ed_details[0]:
        # No link was found; report the ISBN instead of collecting URLs.
        print(f'no editionlinks for isbn:{ed_details[2]}')
    else:
        get_editions_urls(ed_details)