Home > front end >  Web Crawler Looping the URL to crawl many pages
Web Crawler Looping the URL to crawl many pages

Time:05-05

I am lost trying to write a loop that goes through all of the pages on this book site. The URL ends in 'all?page=' followed by the page number, so I thought it should be easy, but I'm stuck. All the info gathering works fine; I just don't know how to move to the next pages. Any help would be appreciated.

import requests
from bs4 import BeautifulSoup

# Asker's original (non-working) pagination attempt, kept as posted.
# NOTE: `page` is read on the URL line before it is assigned, and `max_pages`
# is not defined anywhere in this snippet, so it does not run as written.
URL = 'https://www.bookdepository.com/category/352/Science-Fiction/browse/viewmode/all?page=' + str(page)
page = 1
page += 1
for page in max_pages:
    html = requests.get(URL)
    soup = BeautifulSoup(html.content, "html.parser")
# ^This part I need help with^


# results = the container holding all books present on the page
# books = each individual book entry inside that container
results = soup.find(class_='tab search')
books = results.find_all('div', class_='book-item')

for book in books:
    title = book.h3.a
    author = book.p.span

    # A book with no rating block (or no filled stars) is reported as 0 stars.
    rating_wrap = book.find('div', 'rating-wrap')
    if rating_wrap is None:
        rating = 0
    else:
        rating = len(rating_wrap.find_all('span', 'full-star'))

    publish_date = book.find(class_='published')
    book_format = book.find(class_='format')  # not `format`: avoids shadowing the builtin

    # Unneeded text such as 'US' before the price is removed.
    price = book.find('span', class_='sale-price').text.strip().replace('US', '')

    # The 'rrp' and 'price-save' tags are looked up per book and reset to None
    # when absent, so a value from the previous book is never reused.
    rrp_tag = book.find(class_='rrp')
    original_price = None
    if rrp_tag is not None:
        original_price = rrp_tag.text.strip().replace('US', '')

    save_tag = book.find(class_='price-save')
    discount = None
    if save_tag is not None:
        discount = save_tag.text.strip().replace('Save US', '')

    # .text.strip() gets the text and removes surrounding whitespace
    print(title.text.strip())
    print(author.text.strip())
    print(rating, 'stars')
    print(publish_date.text.strip())
    print(book_format.text.strip())
    print(price)
    # Discount details are printed only when the book actually has them.
    if original_price is not None:
        print(original_price)
    if discount is not None:
        print(discount, 'in savings!')

CodePudding user response:

What this code does is loop 5 times (in this case), with page going up by one every single time.

# Number of result pages to fetch.
max_pages = 5
# Start at 1 (not 0) so the first request is for page 1; the upper bound is
# max_pages + 1 because range() excludes its end value.
for page in range(1, max_pages + 1):
    URL = f"https://www.bookdepository.com/category/352/Science-Fiction/browse/viewmode/all?page={page}"
    html = requests.get(URL)
    soup = BeautifulSoup(html.content, "html.parser")

CodePudding user response:

You can implement the pagination using a for loop and the range function, as follows:

import requests
from bs4 import BeautifulSoup

# URL template: the '{}' placeholder is filled with the page number by
# URL.format(page). Without a placeholder, format() returns the string
# unchanged and every request would fetch page 1.
URL = 'https://www.bookdepository.com/category/352/Science-Fiction/browse/viewmode/all?page={}'

# Fetch and print the books of pages 1 through 10.
for page in range(1, 11):
    print(page)
    html = requests.get(URL.format(page))
    soup = BeautifulSoup(html.content, "html.parser")

    # results = the container holding all books present on the page
    # books = each individual book entry inside that container
    results = soup.find(class_='tab search')
    books = results.find_all('div', class_='book-item')

    for book in books:
        title = book.h3.a
        author = book.p.span

        # A book with no rating block (or no filled stars) is reported as 0 stars.
        rating_wrap = book.find('div', 'rating-wrap')
        if rating_wrap is None:
            rating = 0
        else:
            rating = len(rating_wrap.find_all('span', 'full-star'))

        publish_date = book.find(class_='published')
        book_format = book.find(class_='format')  # not `format`: avoids shadowing the builtin

        # Unneeded text such as 'US' before the price is removed.
        price = book.find('span', class_='sale-price').text.strip().replace('US', '')

        # The 'rrp' and 'price-save' tags are looked up per book and reset to
        # None when absent, so a value from the previous book is never reused.
        rrp_tag = book.find(class_='rrp')
        original_price = None
        if rrp_tag is not None:
            original_price = rrp_tag.text.strip().replace('US', '')

        save_tag = book.find(class_='price-save')
        discount = None
        if save_tag is not None:
            discount = save_tag.text.strip().replace('Save US', '')

        # .text.strip() gets the text and removes surrounding whitespace
        print(title.text.strip())
        print(author.text.strip())
        print(rating, 'stars')
        print(publish_date.text.strip())
        print(book_format.text.strip())
        print(price)
        # Discount details are printed only when the book actually has them.
        if original_price is not None:
            print(original_price)
        if discount is not None:
            print(discount, 'in savings!')

It works without any issues. See the proof below:

import pandas as pd
import requests
from bs4 import BeautifulSoup
# One URL per result page, for pages 1 through 10.
urls = [
    'https://www.bookdepository.com/category/352/Science-Fiction/browse/viewmode/all?page=' + str(x)
    for x in range(1, 11)
]

# Collect one {'Title': ...} record per book across all pages.
data = []
for url in urls:
    html = requests.get(url)
    soup = BeautifulSoup(html.content, "lxml")

    # results = the container holding all books present on the page
    # books = each individual book entry inside that container
    results = soup.find(class_='tab search')
    books = results.find_all('div', class_='book-item')

    for book in books:
        title = book.select_one('.title a')
        title = title.get_text(strip=True)
        data.append({'Title': title})

# Build a single-column DataFrame from the collected records and show it.
df = pd.DataFrame(data)
print(df)

Output:

                              Title
0                              1984
1                       Animal Farm
2               The Love Hypothesis
3                              Dune
4                  Ready Player One
..                              ...
295                 Ancillary Sword
296           The Mysterious Island
297                     Red Country
298          A Memory Called Empire
299  An Absolutely Remarkable Thing

[300 rows x 1 columns]
  • Related