Home > Mobile >  python scrape site with multiple pages
python scrape site with multiple pages

Time:09-27

Hey, how can I change this code so that it visits each page and collects the info I want (the book name and the URL of the book)?

the url is : http://books.toscrape.com/

I wrote this code (with Google's help), but I want to get all the books from all the pages (50 pages).

# import web grabbing client and
# HTML parser
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import requests 


# Scrape the landing page of books.toscrape.com and print one line per
# book: its title followed by the absolute URL of its detail page.

# base URL of the site; also used as the prefix for the relative book links
booksURL = 'http://books.toscrape.com/'

# download the page; the `with` block closes the connection even if
# read() raises
with uReq(booksURL) as urlClient:
    page_html = urlClient.read()

# call BeautifulSoup for parsing
page_soup = soup(page_html, "html.parser")
# grabs all the products under list tag
bookshelf = page_soup.findAll(
    "li", {"class": "col-xs-6 col-sm-4 col-md-3 col-lg-3"})

for books in bookshelf:
    # the title is read from the "title" attribute of the link inside <h3>
    book_title = books.h3.a["title"]
    # href of the first link in the product entry
    book_url = books.find("a")["href"]

    # join the pieces with explicit "+" operators; book_url is appended to
    # booksURL to form the full address
    print(book_title + "-" + booksURL + book_url)
 

I tried to add this code, but I don't know how to integrate it into my script:

# Catalogue pages are numbered 1..50; range() excludes its end bound, so
# the stop value is the number of pages plus one.
for i in range(1, 51):
    url = "https://books.toscrape.com/catalogue/page-{}.html".format(i)
    r = requests.get(url)
    # BeautifulSoup was imported under the alias `soup`, so call it by that
    # name and bind the parsed page to a different variable so the alias is
    # not overwritten on the first iteration.
    page_soup = soup(r.content, "html.parser")

CodePudding user response:

This might work. I have removed uReq because I prefer using requests ;)

# import web grabbing client and
# HTML parser
from bs4 import BeautifulSoup as soup
import requests

# Walk every catalogue page and print "<title> - <href>" for each book.
# Pages are numbered 1..50; range() excludes its end bound, hence 51.
for i in range(1, 51):
    url = "https://books.toscrape.com/catalogue/page-{}.html".format(i)
    response = requests.get(url)

    # call BeautifulSoup for parsing
    page_soup = soup(response.content, "html.parser")
    # grabs all the products under list tag
    bookshelf = page_soup.findAll(
        "li", {"class": "col-xs-6 col-sm-4 col-md-3 col-lg-3"})

    for books in bookshelf:
        # the title is read from the "title" attribute of the link in <h3>
        book_title = books.h3.a["title"]
        # href of the first link in the product entry, printed as found
        book_url = books.find("a")["href"]

        # explicit "+" operators are required to concatenate the strings
        print(book_title + " - " + book_url)
  • Related