Python scrape issue with class on a website?-CodePudding

I would like to scrape a website and have the script message me on Telegram whenever the website changes.

This is working, but I get too many messages, so I want to change the script to check a specific class on the website instead.

So on the website I want to check this element: <span >-49%</span>

I want a message only if the value is between -65% and -99%. Is this possible? The script that checks for changes is below:

import requests
from bs4 import BeautifulSoup
import difflib
import time
from datetime import datetime
import re
import os
import schedule
import cloudscraper


# target URL
# NOTE(review): this address is only used in the log line and in the Telegram
# message; the monitoring loop below downloads "https://nl.pepper.com/nieuw".
url = "https://nl.pepper.com/groep/prijsfout"
# act like a browser
# (both User-Agent header definitions below are commented out and unused)
#headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36         (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
#headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36     (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246'}

# HTTP client the monitoring loop uses to download the page
scraper = cloudscraper.create_scraper()


# Send a message via a telegram bot
def telegram_bot_sendtext(bot_message):
    """Send ``bot_message`` to the configured Telegram chat via the Bot API.

    Args:
        bot_message: Text of the message to deliver.

    Returns:
        The decoded JSON body of the response to the ``sendMessage`` call.
    """
    bot_token = '17XXXX32:AAFd5jXXXXXXXXXXXXC5UJgG5pses8'
    bot_chatID = '-XXXXX'
    send_text = 'https://api.telegram.org/bot' + bot_token + '/sendMessage'

    # Hand the fields over as query parameters so requests percent-encodes
    # them. Characters such as '&', '#', '+' or newlines in the message
    # would otherwise corrupt a hand-concatenated query string.
    response = requests.get(
        send_text,
        params={
            'chat_id': bot_chatID,
            'parse_mode': 'Markdown',
            'text': bot_message,
        },
    )

    return response.json()


# State carried across iterations: the last memorised page text and a flag
# marking the very first pass.
PrevVersion = ""
FirstRun = True
while True:

    # download the page
    # NOTE(review): the address fetched here differs from `url`, which is
    # what gets reported in the log and in the Telegram message - confirm.
    response = scraper.get("https://nl.pepper.com/nieuw").content
    # parse the downloaded homepage
    soup = BeautifulSoup(response, 'html.parser')

    # remove all scripts and styles
    for script in soup(["script", "style"]):
        script.extract()
    # Keep the text in its own name so `soup` stays the parsed document.
    page_text = soup.get_text()

    if FirstRun:
        # on the first run - just memorize the page
        PrevVersion = page_text
        FirstRun = False
        print("Start Monitoring " + url + " " + str(datetime.now()))
    elif PrevVersion != page_text:
        print("Changes detected at: " + str(datetime.now()))
        # compare versions and highlight changes using difflib
        diff = difflib.context_diff(
            PrevVersion.splitlines(), page_text.splitlines(), n=0
        )
        # Drop blank lines and trailing whitespace from the diff output.
        out_text = "\n".join(
            line.rstrip()
            for line in "\n".join(diff).splitlines()
            if line.strip()
        )
        print(out_text)
        # Send the message; the newline keeps the URL separate from the diff.
        telegram_bot_sendtext(
            "Nieuwe prijsfout op Pepper " + url + "\n" + out_text
        )
        PrevVersion = page_text
    else:
        print("No Changes " + str(datetime.now()))
    time.sleep(5)

Maybe there is also a problem with cookies in this script (or they are not defined).

CodePudding user response：

Many thanks, you don't know how happy I am!

But I have another question: I think I am doing something wrong now, because I can't get it to work at the moment.

`# target URL
# NOTE(review): this address is only used in the log line and in the Telegram
# message; the loop further down downloads "https://nl.pepper.com/nieuw".
url = "https://nl.pepper.com/groep/prijsfout"
# act like a browser
# (both User-Agent header definitions below are commented out and unused)
#headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1)             AppleWebKit/537.36         (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
#headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)     AppleWebKit/537.36     (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36     Edge/12.246'}

# HTTP client the loop uses to download the page
scraper = cloudscraper.create_scraper()


# Send a message via a telegram bot
def telegram_bot_sendtext(bot_message):
    """Send ``bot_message`` to the configured Telegram chat via the Bot API.

    Args:
        bot_message: Text of the message to deliver.

    Returns:
        The decoded JSON body of the response to the ``sendMessage`` call.
    """
    bot_token = '17XXXX32:AAFd5jXXXXXXXXXXXXC5UJgG5pses8'
    bot_chatID = '-XXXXXX'
    send_text = 'https://api.telegram.org/bot' + bot_token + '/sendMessage'

    # Hand the fields over as query parameters so requests percent-encodes
    # them. Characters such as '&', '#', '+' or newlines in the message
    # would otherwise corrupt a hand-concatenated query string.
    response = requests.get(
        send_text,
        params={
            'chat_id': bot_chatID,
            'parse_mode': 'Markdown',
            'text': bot_message,
        },
    )

    return response.json()

# State carried across iterations: the last memorised page text and a flag
# marking the very first pass.
PrevVersion = ""
FirstRun = True
while True:


# NOTE(review): assuming everything below is meant to be the loop body, the
# helper is defined and called before the page is downloaded and parsed
# further down. At this point `soup` is either not assigned yet (first pass)
# or the plain-text string left over from the previous pass, and neither
# offers select().
def get_discounts(soup):
for d in soup.select('.cept-discount'):
    # NOTE(review): both branches return, so only the first element is examined.
    if d.text != '' and 65 < int(''.join(filter(str.isdigit, d.text))) < 99:
        return True
    else:
        return False
    
# NOTE(review): the result of this first call is discarded.
get_discounts(soup)

discounts = get_discounts(soup)

# NOTE(review): this text is overwritten by the freshly parsed page below.
soup = soup.get_text()

# download the page
response = scraper.get("https://nl.pepper.com/nieuw").content
# parse the downloaded homepage
soup =  BeautifulSoup(response, 'html.parser')

# remove all scripts and styles
for script in soup(["script", "style"]):
    script.extract()
# From here on `soup` is a plain string, no longer a BeautifulSoup object.
soup = soup.get_text()
# compare the page text to the previous version and check if there are any discounts in your range

# NOTE(review): the first-run memorisation below only happens when `discounts`
# is truthy, because it sits inside this combined condition.
if PrevVersion != soup and discounts:       
# on the first run - just memorize the page
    if FirstRun == True:
        PrevVersion = soup
        FirstRun = False
        print ("Start Monitoring " url  ""  str(datetime.now()))
    else:
        print ("Changes detected at: "  str(datetime.now()))
        OldPage = PrevVersion.splitlines()
        NewPage = soup.splitlines()
        # compare versions and highlight changes using difflib
        #d = difflib.Differ()
        #diff = d.compare(OldPage, NewPage)
        diff = difflib.context_diff(OldPage,NewPage,n=0)
        # drop blank lines and trailing whitespace from the diff output
        out_text = "\n".join([ll.rstrip() for ll in '\n'.join(diff).splitlines()     if     ll.strip()])
        print (out_text)
        # NOTE(review): OldPage is not read again after this assignment.
        OldPage = NewPage
        # Send the message (such as with a telegram bot provided below)
        telegram_bot_sendtext("Nieuwe prijsfout op Pepper "   url   out_text )

       # print ('\n'.join(diff))
        PrevVersion = soup
else:
    print( "No Changes "  str(datetime.now()))
# wait five seconds before the next poll
time.sleep(5)
continue`

CodePudding user response：

A simple possible solution to find out whether there are any discounts between -65% and -99% could be the following.

This function takes your soup, looks for the discounts in general, and returns True if there is any discount in your range or False if not:

def get_discounts(soup):
    """Report whether the page lists at least one discount in the wanted range.

    Args:
        soup: Parsed page offering ``select(css_selector)``, whose results
            expose a ``text`` attribute (e.g. a BeautifulSoup object).

    Returns:
        True if any ``.cept-discount`` element shows a percentage strictly
        between 65 and 99, otherwise False.
    """
    for d in soup.select('.cept-discount'):
        digits = ''.join(filter(str.isdigit, d.text))
        # Badges without any digit carry no percentage; skip them rather
        # than letting int('') raise ValueError.
        if digits and 65 < int(digits) < 99:
            return True
    # Give up only after every badge has been inspected, not after the
    # first one that falls outside the range.
    return False
        
get_discounts(soup)

Note: Call the function before you call soup = soup.get_text(). The order is crucial because that call replaces the content of soup with plain text.

It might be better to store the text in a different variable, e.g. souptext, so you can be sure that soup always contains the BeautifulSoup object, which represents the document as a nested data structure.

So you will end up with something like this:

import requests, time, difflib, os, re, schedule, cloudscraper
from bs4 import BeautifulSoup
from datetime import datetime

# target URL
# NOTE(review): this address is only used in the log line and in the Telegram
# message; the monitoring loop below downloads "https://nl.pepper.com/nieuw".
url = "https://nl.pepper.com/groep/prijsfout"
# act like a browser
# (both User-Agent header definitions below are commented out and unused)
#headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36         (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
#headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36     (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246'}

# HTTP client the monitoring loop uses to download the page
scraper = cloudscraper.create_scraper()


# Send a message via a telegram bot
def telegram_bot_sendtext(bot_message):
    """Send ``bot_message`` to the configured Telegram chat via the Bot API.

    Args:
        bot_message: Text of the message to deliver.

    Returns:
        The decoded JSON body of the response to the ``sendMessage`` call.
    """
    bot_token = '17XXXX32:AAFd5jXXXXXXXXXXXXC5UJgG5pses8'
    bot_chatID = '-XXXXX'
    send_text = 'https://api.telegram.org/bot' + bot_token + '/sendMessage'

    # Hand the fields over as query parameters so requests percent-encodes
    # them. Characters such as '&', '#', '+' or newlines in the message
    # would otherwise corrupt a hand-concatenated query string.
    response = requests.get(
        send_text,
        params={
            'chat_id': bot_chatID,
            'parse_mode': 'Markdown',
            'text': bot_message,
        },
    )

    return response.json()


# State carried across iterations: the page text seen on the previous pass
# and a flag marking the very first pass.
PrevVersion = ""
FirstRun = True
while True:

    # download the page
    # NOTE(review): the address fetched here differs from `url`, which is
    # what gets reported in the log and in the Telegram message - confirm.
    response = scraper.get("https://nl.pepper.com/nieuw").content
    # parse the downloaded homepage
    soup = BeautifulSoup(response, 'html.parser')

    # remove all scripts and styles
    for script in soup(["script", "style"]):
        script.extract()

    # Evaluate the discounts while `soup` is still the parsed document; the
    # flattened text below no longer offers select().
    discounts = get_discounts(soup)
    page_text = soup.get_text()

    if FirstRun:
        # On the first run only a baseline is recorded (see the assignment
        # at the end of the loop), whether or not a discount is present.
        FirstRun = False
        print("Start Monitoring " + url + " " + str(datetime.now()))
    elif PrevVersion != page_text and discounts:
        # the page changed and at least one discount is in the wanted range
        print("Changes detected at: " + str(datetime.now()))
        # compare versions and highlight changes using difflib
        diff = difflib.context_diff(
            PrevVersion.splitlines(), page_text.splitlines(), n=0
        )
        # Drop blank lines and trailing whitespace from the diff output.
        out_text = "\n".join(
            line.rstrip()
            for line in "\n".join(diff).splitlines()
            if line.strip()
        )
        print(out_text)
        # Send the message; the newline keeps the URL separate from the diff.
        telegram_bot_sendtext(
            "Nieuwe prijsfout op Pepper " + url + "\n" + out_text
        )
    else:
        print("No Changes " + str(datetime.now()))

    # Always refresh the baseline so the next diff is taken against the most
    # recent page, even when this pass sent no notification.
    PrevVersion = page_text
    time.sleep(5)