How to scrape href information for a product, but only if the product is in stock?


I'm trying to scrape a list of products on the following page: https://www.beermerchants.com/browse/brewery/cantillon, but I only want to print products that are in stock. I've been able to scrape the full list of products with the code below. How can I modify it so that only in-stock products are included?

import random
import requests
from bs4 import BeautifulSoup
from urllib3.exceptions import InsecureRequestWarning

# Suppress only the single warning from urllib3 needed.
requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)

user_agent_list = [
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
]
# Pick a random user agent and set the request headers
user_agent = random.choice(user_agent_list)
headers = {'User-Agent': user_agent}


url = 'https://www.beermerchants.com/browse/brewery/cantillon'

response = requests.get(url,headers=headers)

soup = BeautifulSoup(response.text,features="html.parser")
link = []


for product in soup.find_all('a', href=True, class_="product-item-link"):
    link.append(product['href'])

print(link)

Thanks in advance!!!

CodePudding user response:

I have adapted your code to use XPath, which makes it easier to express more complex conditions.

I check for products that can be added to the cart (meaning they are in stock):

import random
import requests
from bs4 import BeautifulSoup
from lxml import etree
from urllib3.exceptions import InsecureRequestWarning

# Suppress only the single warning from urllib3 needed.
requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)

user_agent_list = [
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
]
# Pick a random user agent and set the request headers
user_agent = random.choice(user_agent_list)
headers = {'User-Agent': user_agent}


url = 'https://www.beermerchants.com/browse/brewery/cantillon'

response = requests.get(url,headers=headers)

soup = BeautifulSoup(response.text,features="html.parser")
link = []

dom = etree.HTML(str(soup))
for i in dom.xpath('//div[contains(@class, "product-item-info") and .//form[@data-role="tocart-form"]]//a[@class="product-item-link"]/@href'):
    print(i)
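
If you want to collect the results into the link list that is already initialised above, instead of printing them one by one, the loop body just appends, for example:

for href in dom.xpath('//div[contains(@class, "product-item-info") and .//form[@data-role="tocart-form"]]//a[@class="product-item-link"]/@href'):
    link.append(href)   # each result of an @href XPath is already a plain string
print(link)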

CodePudding user response:

You have to check whether the product's container holds a span with "Out of stock". A simple example:

for product in soup.find_all('a', href=True, class_="product-item-link"):
    out_of_stock = False
    for span in product.parent.parent.find_all('span'):
        if "Out of stock" in span.text:
            out_of_stock = True
            break
    if not out_of_stock:
        link.append(product['href'])
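
For reference, here is a minimal self-contained version of this approach. It is only a sketch: it reuses the request setup from the question with one hard-coded User-Agent, and the product.parent.parent step assumes the link sits two levels inside the product container, as in the snippet above.

import requests
from bs4 import BeautifulSoup

url = 'https://www.beermerchants.com/browse/brewery/cantillon'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'}

response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, features="html.parser")

link = []
for product in soup.find_all('a', href=True, class_="product-item-link"):
    # Walk up to the product container and look for an "Out of stock" label
    spans = product.parent.parent.find_all('span')
    if not any("Out of stock" in span.get_text() for span in spans):
        link.append(product['href'])

print(link)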

CodePudding user response:

The following code grabs only the links for products that are in stock, i.e. products that can be added to the cart.

import requests
import pandas as pd
from bs4 import BeautifulSoup
url_link = 'https://www.beermerchants.com/browse/brewery/cantillon'
lst = []
response = requests.get(url_link)

soup = BeautifulSoup(response.text, 'lxml')

for card in soup.select('div[class*="products-grid"] > ol li:has(:-soup-contains("Add to Cart"))'):
    e=card.a.get('href')
    lst.append(e)
print(lst)
   
   

output:

https://www.beermerchants.com/cantillon-gueuze-75cl-bottle
https://www.beermerchants.com/cantillon-rose-de-gambrinus-75cl-bottle
https://www.beermerchants.com/cantillon-kriek-75cl-bottle
https://www.beermerchants.com/cantillon-grand-cru-bruocsella-75cl-bottle
https://www.beermerchants.com/cantillon-kriek-37-5cl-bottle
https://www.beermerchants.com/cantillon-rose-de-gambrinus-37-5cl-bottle
https://www.beermerchants.com/cantillon-c-est-bon-aluminium-sign
https://www.beermerchants.com/cantillon-gueuze-new-aluminium-sign
https://www.beermerchants.com/cantillon-traditionnal-gueuze-glas-33-cl
https://www.beermerchants.com/cantillon-super-tasting-glass-magnifica
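
Note that the :has() and :-soup-contains() pseudo-classes come from soupsieve (:-soup-contains() needs soupsieve 2.1 or newer), which is installed together with recent beautifulsoup4 releases. If you also want to persist the result, one option is to reuse the pandas import from above; the file name here is only an illustration:

# Save the in-stock links to a CSV file
pd.DataFrame({'href': lst}).to_csv('cantillon_in_stock.csv', index=False)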
 