Using bs4, I get garbled-looking strings when extracting values inside a tag - CodePudding

def getAllBooksPagesURLs():
    """Build the list of listing-page URLs for books.toscrape.com.

    Returns:
        A list of 50 URL strings: the site root first, followed by the
        ``catalogue/page-N.html`` URLs for N = 2 through 50.
    """
    first_page = r"http://books.toscrape.com/"
    page_template = r"http://books.toscrape.com/catalogue/page-%d.html"
    # The site root is listed first, so the numbered pages start at 2.
    return [first_page] + [page_template % page_no for page_no in range(2, 51)]

def getAndParseURL(url):
    """Fetch ``url`` with ``requests.get`` and parse the response as HTML.

    Args:
        url: Address of the page to download.

    Returns:
        A ``BeautifulSoup`` object built from the response text using the
        ``'html.parser'`` backend.
    """
    page_text = requests.get(url).text
    return BeautifulSoup(page_text, 'html.parser')

def getBooksURLs(url,z):
    """Collect the product-page URLs linked from one listing page.

    Args:
        url: Listing page to download and scan.
        z: Base URL against which each relative ``href`` is resolved.

    Returns:
        A list of absolute URL strings, one per ``div.image_container``
        element found on the page.
    """
    # Imported locally so the function does not depend on file-level imports.
    from urllib.parse import urljoin

    soup = getAndParseURL(url)
    # urljoin gives the same result as plain concatenation when ``z`` is the
    # site root, and also resolves correctly when ``z`` is a
    # ``catalogue/page-N.html`` URL, where concatenation would glue the href
    # directly after ".html".
    return [
        urljoin(z, x.a.get('href'))
        for x in soup.find_all("div", class_="image_container")
    ]

# Module-level accumulators; only some of them are filled by the loop below.
books_url = []
title_list = []
main_page_list = []
# NOTE(review): spelled "rewiew" and never appended to below;
# list_of_review_num is the list the loop actually fills.
list_of_rewiew_num = []
list_of_bookpage = []
list_of_resultitle = []
books_done_page = []
list_of_review_num=[]

# The [0:1] slice restricts the run to the first listing page.
for y in getAllBooksPagesURLs()[0:1]:
    main_page=getAndParseURL(y)
    # Every <h3> is searched for an <a> whose "title" attribute is recorded.
    result_of_title = main_page.findAll("h3")
    for x in  result_of_title:
        list_of_resultitle.append(x.find("a").get("title"))
        # NOTE(review): this call and the loop over books_url below sit inside
        # the per-<h3> loop, so they run once per title - confirm intended.
        books_url = getBooksURLs(y,y)

        for b in books_url:
    
             print(b)
             books_page = getAndParseURL(b)
             # No <td> anywhere on the page: record 0.
             if books_page.find("td") is None:
                 list_of_review_num.append(0)
             else:
                 # find() returns only the first <td>; its first child is what
                 # gets stored.
                 review_num =books_page.find("td").contents[0]

                 list_of_review_num.append(review_num)
# Bare names - presumably to display the values in a notebook/REPL.
books_url
list_of_resultitle
list_of_review_num

Above is my code; the result is:

['a897fe39b1053632', '90fa61229261140a', '6957f44c3847a760', 'e00eb4fd7b871a48', '4165285e1663650f', 'f77dbf2323deb740', '2597b5a345f45e1b', 'e72a5dfc7e9267b2', 'e10e1e165dc8be4a', '1dfe412b8ac00530', '0312262ecafa5a40', '30a7f60cd76ca58c', 'ce6396b0f23f6ecc', '3b1c02bac2a429e6', 'a34ba96d4081e6a4', 'deda3e61b9514b83', 'feb7cc7701ecf901', 'e30f54cea9b38190', 'a18a4f574854aced', 'a22124811bfa8350']

The garbled values look like 'a22124811bfa8350'. Is this related to dynamic HTML? I don't know. My desired output for list_of_review_num would be something like:

[0,1,2,3]

How can I get the correct output? Could you please help me? Thank you in advance.

CodePudding user response：

Your code produces that result because .find() returns only the first occurrence of the td tag. The page you are working with contains numerous td tags, and the number of reviews is in the last one, so you should do something like this:

td_cells = books_page.find_all("td")  # every td tag on the product page
if not td_cells:  # there are no td tags at all
    list_of_review_num.append(0)
else:
    review_num = td_cells[-1].contents[0]  # first child of the last td tag
    list_of_review_num.append(review_num)

CodePudding user response：

The issue here is that you select the UPC value and not the number of reviews. I also recommend avoiding all these separate lists for storing your results; use dicts instead:

# One dict per book is appended here.
data = []

# The [0:1] slice restricts the run to the first listing page.
for y in getAllBooksPagesURLs()[0:1]:
    # getBooksURLs downloads the listing page itself, so no separate
    # getAndParseURL(y) call is needed here.
    books_url = getBooksURLs(y,y)

    for b in books_url:
        books_page = getAndParseURL(b)
        d = {
            'title': books_page.h1.text,
            'url':b
        }
        # Each table row contributes one key/value pair built from its
        # stripped strings; dict() needs exactly two strings per row.
        d.update(dict(x.stripped_strings for x in books_page.select('table tr')))
        data.append(d)
data

Example

import requests
from bs4 import BeautifulSoup

def getAllBooksPagesURLs():
    """Return the listing-page URLs of books.toscrape.com as a list.

    Returns:
        50 URL strings: the site root, then ``catalogue/page-N.html`` for
        N = 2 through 50.
    """
    numbered_pages = [
        r"http://books.toscrape.com/catalogue/page-%d.html" % page_no
        for page_no in range(2, 51)
    ]
    # The site root goes first, ahead of the numbered pages.
    return [r"http://books.toscrape.com/", *numbered_pages]

def getAndParseURL(url):
    """Download ``url`` and return its HTML parsed by BeautifulSoup.

    Args:
        url: Address of the page to request with ``requests.get``.

    Returns:
        A ``BeautifulSoup`` object created from the response text with the
        ``'html.parser'`` backend.
    """
    return BeautifulSoup(requests.get(url).text, 'html.parser')

def getBooksURLs(url,z):
    """Collect the product-page URLs linked from one listing page.

    Args:
        url: Listing page to download and scan.
        z: Base URL against which each relative ``href`` is resolved.

    Returns:
        A list of absolute URL strings, one per ``div.image_container``
        element found on the page.
    """
    # Imported locally so the function does not depend on file-level imports.
    from urllib.parse import urljoin

    soup = getAndParseURL(url)
    # urljoin matches plain concatenation when ``z`` is the site root, and
    # also resolves correctly when ``z`` is a ``catalogue/page-N.html`` URL,
    # where concatenation would glue the href directly after ".html".
    return [
        urljoin(z, x.a.get('href'))
        for x in soup.find_all("div", class_="image_container")
    ]

# One dict per book is appended here.
data = []

# The [0:1] slice restricts the run to the first listing page.
for y in getAllBooksPagesURLs()[0:1]:
    books_url = getBooksURLs(y,y)
    for b in books_url:
        books_page = getAndParseURL(b)
        # Start from title and URL, then fold in one key/value pair per
        # table row, taken from that row's stripped strings.
        d = {'title': books_page.h1.text, 'url': b}
        table_rows = books_page.select('table tr')
        d.update(dict(row.stripped_strings for row in table_rows))
        data.append(d)
data

Output:

[{'title': 'A Light in the Attic',
  'url': 'http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html',
  'UPC': 'a897fe39b1053632',
  'Product Type': 'Books',
  'Price (excl. tax)': '£51.77',
  'Price (incl. tax)': '£51.77',
  'Tax': '£0.00',
  'Availability': 'In stock (22 available)',
  'Number of reviews': '0'},
 {'title': 'Tipping the Velvet',
  'url': 'http://books.toscrape.com/catalogue/tipping-the-velvet_999/index.html',
  'UPC': '90fa61229261140a',
  'Product Type': 'Books',
  'Price (excl. tax)': '£53.74',
  'Price (incl. tax)': '£53.74',
  'Tax': '£0.00',
  'Availability': 'In stock (20 available)',
  'Number of reviews': '0'},...]

Note: In newer code, avoid the old findAll() syntax; use find_all() or select() with CSS selectors instead. For more, take a minute to check the docs.