I'm lost trying to make a loop that goes through all of the pages on this book site. The URL ends in 'all?page=' followed by the page number, so I thought it would be easy, but I'm stuck. All the info gathering works fine; I just don't know how to move on to the next pages. Any help would be appreciated.
import requests
from bs4 import BeautifulSoup

URL = 'https://www.bookdepository.com/category/352/Science-Fiction/browse/viewmode/all?page=' + str(page)
page = 1

for page in max_pages:
    html = requests.get(URL)
    soup = BeautifulSoup(html.content, "html.parser")
    # ^This part I need help with^

    # results = all books present on page
    # books = each individual book on the page
    results = soup.find(class_='tab search')
    books = results.find_all('div', class_='book-item')

    for book in books:
        title = book.h3.a
        author = book.p.span
        # in case there is no rating on a book
        if len(book.find('div', 'rating-wrap').findAll('span', 'full-star')) == None:
            pass
        else:
            rating = len(book.find('div', 'rating-wrap').findAll('span', 'full-star'))
        publish_date = book.find(class_='published')
        format = book.find(class_='format')
        price = book.find('span', class_='sale-price').text.strip()
        # if there is no discount
        if book.find(class_='rrp') == None:
            pass
        else:
            original_price = book.find(class_='rrp').text.strip()
        if book.find(class_='price-save') == None:
            pass
        else:
            discount = book.find(class_='price-save').text.strip()
        # unneeded text removed such as 'US' before the price shown
        price = price.replace('US', '')
        original_price = original_price.replace('US', '')
        discount = discount.replace('Save US', '')
        # .text.strip() gets text and rids of empty spaces
        print(title.text.strip())
        print(author.text.strip())
        print(rating, 'stars')
        print(publish_date.text.strip())
        print(format.text.strip())
        print(price)
        print(original_price)
        print(discount, 'in savings!')
CodePudding user response:
This code loops max_pages times (five in this case), with page going from 1 to 5, increasing by one each iteration:
max_pages = 5

for page in range(1, max_pages + 1):
    URL = f"https://www.bookdepository.com/category/352/Science-Fiction/browse/viewmode/all?page={page}"
    html = requests.get(URL)
    soup = BeautifulSoup(html.content, "html.parser")
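If you don't know how many pages there are up front, you can also keep requesting pages until one comes back without any book entries. This is only a sketch, reusing the 'book-item' selector from your code, and it assumes the site serves a page with no book items once you go past the last page:
import itertools
import requests
from bs4 import BeautifulSoup

for page in itertools.count(1):  # 1, 2, 3, ... until we break out
    url = f"https://www.bookdepository.com/category/352/Science-Fiction/browse/viewmode/all?page={page}"
    html = requests.get(url)
    soup = BeautifulSoup(html.content, "html.parser")
    books = soup.find_all('div', class_='book-item')
    if not books:
        # assumption: a page past the last one contains no 'book-item' divs
        break
    for book in books:
        pass  # parse each book exactly as in your original loop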
CodePudding user response:
You can handle the pagination with a for loop and the range() function as follows:
import requests
from bs4 import BeautifulSoup

# {} is filled in with the page number by str.format inside the loop
URL = 'https://www.bookdepository.com/category/352/Science-Fiction/browse/viewmode/all?page={}'

for page in range(1, 11):
    print(page)
    html = requests.get(URL.format(page))
    soup = BeautifulSoup(html.content, "html.parser")

    # results = all books present on page
    # books = each individual book on the page
    results = soup.find(class_='tab search')
    books = results.find_all('div', class_='book-item')

    for book in books:
        title = book.h3.a
        author = book.p.span
        # in case there is no rating on a book
        if len(book.find('div', 'rating-wrap').findAll('span', 'full-star')) == None:
            pass
        else:
            rating = len(book.find('div', 'rating-wrap').findAll('span', 'full-star'))
        publish_date = book.find(class_='published')
        format = book.find(class_='format')
        price = book.find('span', class_='sale-price').text.strip()
        # if there is no discount
        if book.find(class_='rrp') == None:
            pass
        else:
            original_price = book.find(class_='rrp').text.strip()
        if book.find(class_='price-save') == None:
            pass
        else:
            discount = book.find(class_='price-save').text.strip()
        # unneeded text removed such as 'US' before the price shown
        price = price.replace('US', '')
        original_price = original_price.replace('US', '')
        discount = discount.replace('Save US', '')
        # .text.strip() gets text and rids of empty spaces
        print(title.text.strip())
        print(author.text.strip())
        print(rating, 'stars')
        print(publish_date.text.strip())
        print(format.text.strip())
        print(price)
        print(original_price)
        print(discount, 'in savings!')
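One optional addition: pausing briefly between requests keeps the load on the site low. A minimal sketch of the same loop with a delay (the one-second pause is an arbitrary choice):
import time
import requests
from bs4 import BeautifulSoup

URL = 'https://www.bookdepository.com/category/352/Science-Fiction/browse/viewmode/all?page={}'

for page in range(1, 11):
    html = requests.get(URL.format(page))
    soup = BeautifulSoup(html.content, "html.parser")
    # ...parse the page as above...
    time.sleep(1)  # wait a second before fetching the next page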
The pagination works without any issues. Here's a proof of concept that collects the titles from all ten pages into a pandas DataFrame:
import pandas as pd
import requests
from bs4 import BeautifulSoup

urls = ['https://www.bookdepository.com/category/352/Science-Fiction/browse/viewmode/all?page=' + str(x) for x in range(1, 11)]
data = []

for url in urls:
    html = requests.get(url)
    soup = BeautifulSoup(html.content, "lxml")

    # results = all books present on page
    # books = each individual book on the page
    results = soup.find(class_='tab search')
    books = results.find_all('div', class_='book-item')

    for book in books:
        title = book.select_one('.title a')
        title = title.get_text(strip=True)
        data.append({'Title': title})

df = pd.DataFrame(data)
print(df)
Output:
Title
0 1984
1 Animal Farm
2 The Love Hypothesis
3 Dune
4 Ready Player One
.. ...
295 Ancillary Sword
296 The Mysterious Island
297 Red Country
298 A Memory Called Empire
299 An Absolutely Remarkable Thing
[300 rows x 1 columns]
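If you want to keep the collected titles, pandas can write the DataFrame straight to a CSV file; the filename here is just an example:
df.to_csv('science_fiction_books.csv', index=False)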