I have written some code to scrape a website: https://books.toscrape.com/catalogue/page-1.html
but I'm getting an error:
AttributeError: 'NoneType' object has no attribute 'text'
I failed to find a solution for this, so how can I fix this error?
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Accumulates one dict per scraped book; extract_info() appends to it.
all_books = []
url = 'https://books.toscrape.com/catalogue/page-1.html'
# requests expects headers as a mapping of header name -> value. The value
# originally stored here was only a link to a user-agent description page
# (https://developers.whatismybrowser.com/useragents/parse/22526098chrome-windows-blink),
# so a generic Chrome-on-Windows User-Agent string is sent instead.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
    ),
}


def get_page(url):
    """Download *url* and parse the response body as HTML.

    Args:
        url: Address of the page to fetch.

    Returns:
        A two-item list ``[soup, status]``: the parsed ``BeautifulSoup``
        document and the HTTP status code of the response.
    """
    # Pass the headers by keyword: the second positional parameter of
    # requests.get() is ``params``, which ends up in the query string instead
    # of the request headers. The timeout keeps a stalled server from
    # blocking the scraper forever.
    page = requests.get(url, headers=headers, timeout=10)
    status = page.status_code
    soup = BeautifulSoup(page.text, 'html.parser')
    return [soup, status]
# get all books links
def get_links(soup):
    """Collect the link of every book listed on a parsed catalogue page.

    Args:
        soup: Parsed listing page; every element with class ``product_pod``
            is treated as one book entry.

    Returns:
        A list with one URL string per entry, in document order.
    """
    links = []
    listings = soup.find_all(class_='product_pod')
    for listing in listings:
        bk_link = listing.find("h3").a.get("href")
        # NOTE(review): the href is appended to a *page* URL, so the result
        # looks like ".../page-1.html<href>". This line is kept exactly as the
        # asker wrote it because it is the fault the answer below addresses.
        base_url = 'https://books.toscrape.com/catalogue/page-1.html'
        cmplt_link = base_url + bk_link
        links.append(cmplt_link)
    return links
# extract info from each link
def extract_info(links):
    """Fetch every URL in *links* and append one record per book to ``all_books``.

    Each record is a dict with the keys ``name``, ``price``, ``desc`` and
    ``cat``. Nothing is returned; the module-level list is mutated.

    Args:
        links: Iterable of product-page URLs.
    """
    for link in links:
        r = requests.get(link).text
        book_soup = BeautifulSoup(r, 'html.parser')
        # NOTE(review): find() returns None when nothing matches, and None has
        # no ``.text``. The lookups below are kept as the asker wrote them;
        # they are where the reported AttributeError comes from.
        # ``name`` and ``price`` use the same selector, so they hold the same text.
        name = book_soup.find(class_='col-sm-6 product_main').text.strip()
        price = book_soup.find(class_='col-sm-6 product_main').text.strip()
        desc = book_soup.find(class_='sub-header').text.strip()
        # The first positional argument of find() is a tag name, so this
        # HTML fragment can never match an element.
        cat = book_soup.find('"../category/books/poetry_23/index.html">Poetry').text.strip()
        book = {'name': name, 'price': price, 'desc': desc, 'cat': cat}
        all_books.append(book)
# Walk the catalogue page by page, starting at page 48, until a page request
# no longer answers with HTTP 200.
pg = 48
while True:
    url = f'https://books.toscrape.com/catalogue/page-{pg}.html'
    soup_status = get_page(url)  # [soup, status]
    if soup_status[1] == 200:
        print(f"scrapping page{pg}")
        extract_info(get_links(soup_status[0]))
        # Advance to the next page; assigning 1 here would restart the crawl
        # at page 1 on every pass and the loop would never end.
        pg += 1
    else:
        print("The End")
        break
# Build a table from everything collected in all_books and show it.
df = pd.DataFrame(all_books)
print(df)
CodePudding user response:
Note: First of all, always take a look at your soup - therein lies the truth. The contents can differ slightly or even drastically from what you see in the dev tools.
What happens?
There are different issues you should keep in mind:
base_url='https://books.toscrape.com/catalogue/page-1.html'
will lead to 404 errors and is the first reason for your "'NoneType' object has no attribute 'text'" error. You also try to find the category like this:
cat=book_soup.find('"../category/books/poetry_23/index.html">Poetry').text.strip()
which won't work either and will lead to the same error. There are some more selections that will not lead to the expected result; take a look at my example, where I edited them to give you a clue how to reach the goal.
How to fix?
Change
base_url='https://books.toscrape.com/catalogue/page-1.html'
to base_url='https://books.toscrape.com/catalogue/'
Select the category more specifically; it is the last <a> in the breadcrumb:
cat=book_soup.select('.breadcrumb a')[-1].text.strip()
Example
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Accumulates one dict per scraped book; extract_info() appends to it.
all_books = []
url = 'https://books.toscrape.com/catalogue/page-1.html'
# requests expects headers as a mapping of header name -> value. The value
# originally stored here was only a link to a user-agent description page
# (https://developers.whatismybrowser.com/useragents/parse/22526098chrome-windows-blink),
# so a generic Chrome-on-Windows User-Agent string is sent instead.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
    ),
}


def get_page(url):
    """Download *url* and parse the response body as HTML.

    Args:
        url: Address of the page to fetch.

    Returns:
        A two-item list ``[soup, status]``: the parsed ``BeautifulSoup``
        document and the HTTP status code of the response.
    """
    # Pass the headers by keyword: the second positional parameter of
    # requests.get() is ``params``, which ends up in the query string instead
    # of the request headers. The timeout keeps a stalled server from
    # blocking the scraper forever.
    page = requests.get(url, headers=headers, timeout=10)
    status = page.status_code
    soup = BeautifulSoup(page.text, 'html.parser')
    return [soup, status]
# get all books links
def get_links(soup):
    """Build the absolute product-page URL of every book on a listing page.

    Args:
        soup: Parsed listing page; every element with class ``product_pod``
            is treated as one book entry.

    Returns:
        A list with one absolute URL string per entry, in document order.
    """
    # The hrefs are joined onto the catalogue root, not onto a page URL.
    base_url = 'https://books.toscrape.com/catalogue/'
    links = []
    for listing in soup.find_all(class_='product_pod'):
        bk_link = listing.find("h3").a.get("href")
        links.append(base_url + bk_link)
    return links
# extract info from each link
def extract_info(links):
    """Fetch every URL in *links* and append one record per book to ``all_books``.

    Each record is a dict with the keys ``name``, ``price``, ``desc`` and
    ``cat``. Nothing is returned; the module-level list is mutated.

    Args:
        links: Iterable of product-page URLs.
    """
    for link in links:
        # The timeout keeps one stalled request from hanging the whole crawl.
        r = requests.get(link, timeout=10).text
        book_soup = BeautifulSoup(r, 'html.parser')
        name = book_soup.h1.text.strip()
        # Adjacent-sibling selector: the <p> directly after the <h1>.
        # NOTE(review): the "+" was missing in this listing (as in
        # "base_url bk_link" and "pg =1"); assumes the price paragraph
        # immediately follows the title - confirm against the live markup.
        price = book_soup.select_one('h1 + p').text.strip()
        # NOTE(review): assumes the description <p> is the sibling right after
        # the #product_description element - confirm against the live markup.
        # select_one() returns None when nothing matches, so fall back to ''.
        desc_tag = book_soup.select_one('#product_description + p')
        desc = desc_tag.text.strip() if desc_tag is not None else ''
        # The category is the last link in the breadcrumb trail.
        cat = book_soup.select('.breadcrumb a')[-1].text.strip()
        book = {'name': name, 'price': price, 'desc': desc, 'cat': cat}
        all_books.append(book)
# Walk the catalogue page by page, starting at page 48, until a page request
# no longer answers with HTTP 200.
pg = 48
while True:
    url = f'https://books.toscrape.com/catalogue/page-{pg}.html'
    soup_status = get_page(url)  # [soup, status]
    if soup_status[1] == 200:
        print(f"scrapping page{pg}")
        extract_info(get_links(soup_status[0]))
        # Advance to the next page; assigning 1 here would restart the crawl
        # at page 1 on every pass and the loop would never end.
        pg += 1
    else:
        print("The End")
        break
# Bare expression: presumably meant to display the result in a notebook/REPL.
all_books
CodePudding user response:
Use the function below when you need to grab the text of an element.
It will protect you from None elements.
def get_text(book_soup, clazz):
    """Return the stripped text of the first element with class *clazz*.

    Args:
        book_soup: Parsed document (or any object with a compatible
            ``find(class_=...)`` method).
        clazz: CSS class value to look up.

    Returns:
        The element's text without surrounding whitespace, or ``''`` when no
        element matches, so callers never dereference ``None``.
    """
    ele = book_soup.find(class_=clazz)
    return ele.text.strip() if ele is not None else ''
Example: instead of
name=book_soup.find(class_='col-sm-6 product_main').text.strip()
do
name=get_text(book_soup,'col-sm-6 product_main')