from bs4 import BeautifulSoup
import requests
import csv
# headers = {
# "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
# }
csv_file = open("scifi_audible.csv", "w", newline="", encoding="utf-8")  # newline="" avoids blank rows on Windows
csv_writer = csv.writer(csv_file)
csv_writer.writerow(["title", "link", "rating", "reviews"])
url = "https://www.audible.de/"
audiobooklinks = []
for x in range(1, 2):
    source = requests.get(f"https://www.audible.de/search?node=16245852031&page={x}")
    soup = BeautifulSoup(source.content, "lxml")
    audiobooks = soup.find_all("h3", class_="bc-heading")
    for item in audiobooks:
        for link in item.find_all("a", href=True):
            audiobooklinks.append(url + link["href"])  # build the absolute link
#testlink = 'https://www.audible.de/pd/Mortarion-The-Pale-King-Hoerbuch/B0BCQXVJML'
for link in audiobooklinks:
    r = requests.get(link)  # headers=headers
    soup = BeautifulSoup(r.content, "lxml")
    try:
        title = soup.find("h1", class_="bc-heading").text.strip()
    except AttributeError:
        title = "no output possible"
    try:
        rating = soup.find("span", attrs={"aria-hidden": "true", "class": "bc-text"}).text.strip()
    except AttributeError:
        rating = "no rating"
    try:
        raw_reviews = soup.find("li", class_="bc-list-item ratingsLabel").text.strip()
    except AttributeError:
        raw_reviews = "no raw_reviews"
    try:
        reviews = raw_reviews.split("(")[-1].split()[0].replace(".", "")
    except (AttributeError, IndexError):
        reviews = "no reviews"
    print(title, link, rating, reviews)
    csv_writer.writerow([title, link, rating, reviews])
csv_file.close()
Most of the time this works, but every now and then a line is printed like this:
"no output possible https://www.audible.de//pd/Mortarion-The-Pale-King-Hoerbuch/B0BCQXVJML no rating no"
What do I have to change so that the h1 and the li are always found?
CodePudding user response:
When I tried to reproduce the "no output possible" scenario, I got it about 2% of the time; it was nearly always due to a 503 Service Unavailable error. Occasionally the status was 200 [OK] but the content was empty - I don't really know what might be causing that. (Btw, when working with requests, you should generally check that status_code == 200 before proceeding.)
One way to handle this would be to append the failed link back to the list so it gets retried, something like
repeats = 0                   # count of bad responses so far
maxRepeats = 10               # limit of allowed errors
abl_copy = audiobooklinks[:]  # work on a copy to preserve the original list
for link in abl_copy:
    r = requests.get(link)  # headers=headers
    if r.status_code != 200 or not r.content:
        print(f'! {r.status_code} {r.reason} - for {link} !')
        repeats += 1
        if maxRepeats < repeats:
            print('! Stopping because of too many bad responses !')
            break
        abl_copy.append(link)  # re-queue the link so it gets retried
        continue
    soup = BeautifulSoup(r.content, "lxml")
    # rest of your for loop
There are many other ways to handle it as well - you could add a wait every time you get a 503, you could add all the bad responses (the r objects) to a list and explore/handle them after the loop, etc...
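For example, a minimal sketch of the wait-and-retry idea, assuming a hypothetical get_with_retry helper with placeholder values for the retry limit and delay:

import time

max_tries = 3       # assumed retry limit - tune as needed
wait_seconds = 5    # assumed pause between attempts

def get_with_retry(link):
    # request the page, sleeping and retrying on a non-200 status or an empty body
    for attempt in range(max_tries):
        r = requests.get(link)  # headers=headers could be passed here as well
        if r.status_code == 200 and r.content:
            return r
        print(f'! {r.status_code} {r.reason} - attempt {attempt + 1} for {link} !')
        time.sleep(wait_seconds)
    return None  # caller decides how to handle links that never succeeded

In the main loop you would then call r = get_with_retry(link) and skip (or store for later inspection) any link for which the helper returns None.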