def getAllBooksPagesURLs():
lists_of_url = []
lists_of_url.append(r"http://books.toscrape.com/")
for j in range(2,51):
lists_of_url.append(r"http://books.toscrape.com/catalogue/page-%d.html"%j)
return lists_of_url
def getAndParseURL(url):
result = requests.get(url)
soup = BeautifulSoup(result.text, 'html.parser')
return soup
def getBooksURLs(url,z):
soup = getAndParseURL(url)
return([z x.a.get('href') for x in soup.findAll( "div", class_="image_container")])
books_url = []
title_list = []
main_page_list = []
list_of_rewiew_num = []
list_of_bookpage = []
list_of_resultitle = []
books_done_page = []
list_of_review_num=[]
for y in getAllBooksPagesURLs()[0:1]:
main_page=getAndParseURL(y)
result_of_title = main_page.findAll("h3")
for x in result_of_title:
list_of_resultitle.append(x.find("a").get("title"))
books_url = getBooksURLs(y,y)
for b in books_url:
print(b)
books_page = getAndParseURL(b)
if books_page.find("td") is None:
list_of_review_num.append(0)
else:
review_num =books_page.find("td").contents[0]
list_of_review_num.append(review_num)
books_url
list_of_resultitle
list_of_review_num
above is my code ,the result is
['a897fe39b1053632', '90fa61229261140a', '6957f44c3847a760', 'e00eb4fd7b871a48', '4165285e1663650f', 'f77dbf2323deb740', '2597b5a345f45e1b', 'e72a5dfc7e9267b2', 'e10e1e165dc8be4a', '1dfe412b8ac00530', '0312262ecafa5a40', '30a7f60cd76ca58c', 'ce6396b0f23f6ecc', '3b1c02bac2a429e6', 'a34ba96d4081e6a4', 'deda3e61b9514b83', 'feb7cc7701ecf901', 'e30f54cea9b38190', 'a18a4f574854aced', 'a22124811bfa8350']
the garble codes are like 'a22124811bfa8350', is it about dynamic html? I donnot know. my desire output of list_of_review_num should be
[0,1,2,3]
how to get the correct output?could you plz help me? thank u in advance
CodePudding user response:
The reason your code is outputting the result that you have is that you are using .find()
which will find the first occurrence of the td
tag, since there are numerous tags on the page you are working with and that the reviews would be the last td
tag you should do something like this.
if books_page.find("td") is None: # saying that there is no td tags at all
list_of_review_num.append(0)
else:
review_num = books_page.find_all("td")[-1].contents[0] # using find_all and accessing the last td tag element
list_of_review_num.append(review_num)
CodePudding user response:
Issue here, you select upc information and not the reviews. I recommend to avoid all these lists to store your results, better use dicts instead:
data = []
for y in getAllBooksPagesURLs()[0:1]:
main_page=getAndParseURL(y)
books_url = getBooksURLs(y,y)
for b in books_url:
books_page = getAndParseURL(b)
d = {
'title': books_page.h1.text,
'url':b
}
d.update(dict(x.stripped_strings for x in books_page.select('table tr')))
data.append(d)
data
Example
import requests
from bs4 import BeautifulSoup
def getAllBooksPagesURLs():
lists_of_url = []
lists_of_url.append(r"http://books.toscrape.com/")
for j in range(2,51):
lists_of_url.append(r"http://books.toscrape.com/catalogue/page-%d.html"%j)
return lists_of_url
def getAndParseURL(url):
result = requests.get(url)
soup = BeautifulSoup(result.text, 'html.parser')
return soup
def getBooksURLs(url,z):
soup = getAndParseURL(url)
return([z x.a.get('href') for x in soup.find_all( "div", class_="image_container")])
data = []
for y in getAllBooksPagesURLs()[0:1]:
books_url = getBooksURLs(y,y)
for b in books_url:
books_page = getAndParseURL(b)
d = {
'title': books_page.h1.text,
'url':b
}
d.update(dict(x.stripped_strings for x in books_page.select('table tr')))
data.append(d)
data
Output:
[{'title': 'A Light in the Attic',
'url': 'http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html',
'UPC': 'a897fe39b1053632',
'Product Type': 'Books',
'Price (excl. tax)': '£51.77',
'Price (incl. tax)': '£51.77',
'Tax': '£0.00',
'Availability': 'In stock (22 available)',
'Number of reviews': '0'},
{'title': 'Tipping the Velvet',
'url': 'http://books.toscrape.com/catalogue/tipping-the-velvet_999/index.html',
'UPC': '90fa61229261140a',
'Product Type': 'Books',
'Price (excl. tax)': '£53.74',
'Price (incl. tax)': '£53.74',
'Tax': '£0.00',
'Availability': 'In stock (20 available)',
'Number of reviews': '0'},...]
Note: In newer code avoid old syntax findAll()
instead use find_all()
or select()
with css selectors
- For more take a minute to check docs