I'm trying to scrape the href values for the items on the following page, however only if the items show as in stock: https://www.waitrosecellar.com/whisky-shop/view-all-whiskies/whisky-by-brand/macallan
With the following code, I've managed to successfully scrape the hrefs, however the out_of_stock flag does not appear to be working and still returns items that are out of stock in the print list. My code:
import ssl
import requests
import sys
import time
import smtplib
from email.message import EmailMessage
import hashlib
from urllib.request import urlopen
from datetime import datetime
import json
import random
import requests
from itertools import cycle
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from urllib3.exceptions import InsecureRequestWarning
from requests_html import HTMLSession
session = HTMLSession()
user_agent_list = [
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
]
for i in range(1,4):
#Pick a random user agent
user_agent = random.choice(user_agent_list)
#Set the headers
headers = {'User-Agent': user_agent}
url = 'https://www.waitrosecellar.com/whisky-shop/view-all-whiskies/whisky-by-brand/macallan'
response = requests.get(url,headers=headers)
soup = BeautifulSoup(response.text,features="html.parser")
test = []
for product in soup.find_all('div', class_="productName"):
out_of_stock=False
for span in product.parent.find_all('span', ):
if "Out of stock" in span.text:
out_of_stock = True
break
if not out_of_stock:
test.append(product.a['href'])
print(test)
Please could I have suggestions as to how to make the out_of_stock flag work correctly, in order to only print items that are in stock. Thank you!
CodePudding user response:
Here is one way to differentiate between out of stock/available products:
import requests
from bs4 import BeautifulSoup as bs
headers = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36'
}
url = 'https://www.waitrosecellar.com/whisky-shop/view-all-whiskies/whisky-by-brand/macallan'
r = requests.get(url, headers=headers)
soup = bs(r.text, 'html.parser')
cards = soup.select('div[]')
for c in cards:
product = c.select_one('div[ ] a').text.strip()
product_url = c.select_one('div[ ] a').get('href')
availability = 'Product Available' if c.select_one('div[]').get('style') == 'display:none;' else 'Out of Stock'
if availability == 'Product Available':
print(product, product_url, availability)
Result in terminal:
Macallan 12 Year Old Sherry Oak https://www.waitrosecellar.com/macallan-12-year-old-sherry-oak-717201 Product Available
Of course you can get other data points about products as well. See BeautifulSoup documentation here: https://beautiful-soup-4.readthedocs.io/en/latest/ Also, Requests-Html seems to be unmaintained, last release being almost 4 years ago? Released: Feb 17, 2019