I scrape a website and send myself a Telegram message whenever the page changes.
This works, but I get too many messages, so I want to change the script to check a specific class
on the website instead.
On the website I want to check the discount element, e.g. <span >-49%</span>.
I only want a message if the value is between -65% and -99%. Is this possible? The script that checks for changes is below:
import requests
from bs4 import BeautifulSoup
import difflib
import time
from datetime import datetime
import re
import os
import schedule
import cloudscraper
# Target URL; it is interpolated into the start-up log line and the Telegram message text.
# NOTE(review): the polling loop downloads https://nl.pepper.com/nieuw, not this URL - confirm which page is meant.
url = "https://nl.pepper.com/groep/prijsfout"
# Browser-like User-Agent headers kept for reference; they are disabled and nothing passes them on.
#headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
#headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246'}
# cloudscraper session that the polling loop uses to download the page.
scraper = cloudscraper.create_scraper()
# Send a message via a telegram bot
def telegram_bot_sendtext(bot_message):
    """Send ``bot_message`` to the configured Telegram chat via the Bot API.

    The text is handed to ``requests`` through ``params`` so that it gets
    URL-encoded. The diff output this script sends contains newlines and
    characters such as ``+``, ``#`` and ``&`` that would otherwise corrupt
    the query string (``#`` would even cut the message off).

    Args:
        bot_message: Message text to deliver (sent with Markdown parse mode).

    Returns:
        The decoded JSON body of the Telegram API response.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the request exceeds the timeout.
    """
    bot_token = '17XXXX32:AAFd5jXXXXXXXXXXXXC5UJgG5pses8'
    bot_chatID = '-XXXXX'
    send_url = 'https://api.telegram.org/bot' + bot_token + '/sendMessage'
    payload = {
        'chat_id': bot_chatID,
        'parse_mode': 'Markdown',
        'text': bot_message,
    }
    # A timeout keeps the polling loop from hanging forever on a stalled connection.
    response = requests.get(send_url, params=payload, timeout=10)
    return response.json()
# State carried across polling cycles: the last page text seen and a first-pass flag.
PrevVersion = ""
FirstRun = True
while True:
    # download the page
    # NOTE(review): this fetches /nieuw while `url` (used in the messages) points
    # at /groep/prijsfout - confirm which page is meant.
    response = scraper.get("https://nl.pepper.com/nieuw").content
    # parse the downloaded homepage
    soup = BeautifulSoup(response, 'html.parser')
    # remove all scripts and styles
    for script in soup(["script", "style"]):
        script.extract()
    # keep the text in its own name so `soup` stays the parsed document
    page_text = soup.get_text()
    # compare the page text to the previous version
    if PrevVersion != page_text:
        if FirstRun:
            # on the first run - just memorize the page
            PrevVersion = page_text
            FirstRun = False
            print("Start Monitoring " + url + "" + str(datetime.now()))
        else:
            print("Changes detected at: " + str(datetime.now()))
            OldPage = PrevVersion.splitlines()
            NewPage = page_text.splitlines()
            # compare versions and highlight changes using difflib
            diff = difflib.context_diff(OldPage, NewPage, n=0)
            # drop blank lines and trailing whitespace from the diff output
            out_text = "\n".join([ll.rstrip() for ll in '\n'.join(diff).splitlines() if ll.strip()])
            print(out_text)
            # send the diff to the Telegram chat
            telegram_bot_sendtext("Nieuwe prijsfout op Pepper " + url + out_text)
            PrevVersion = page_text
    else:
        print("No Changes " + str(datetime.now()))
    # wait before polling again
    time.sleep(5)
There may also be a problem with cookies in this script (or they are not defined at all).
CodePudding user response:
Thank you so much, you don't know how happy I am!
One more question, though: I think I am doing something wrong now, because I cannot get it to work at the moment.
# target URL (interpolated into the log line and the Telegram message)
# NOTE(review): the loop below downloads /nieuw, not this URL - confirm which page is meant.
url = "https://nl.pepper.com/groep/prijsfout"
# act like a browser
#headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
#headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246'}
scraper = cloudscraper.create_scraper()


# Send a message via a telegram bot
def telegram_bot_sendtext(bot_message):
    """Send ``bot_message`` to the configured Telegram chat via the Bot API.

    The text is handed to ``requests`` through ``params`` so that it gets
    URL-encoded; the diff output contains newlines and characters such as
    ``+``, ``#`` and ``&`` that would otherwise corrupt the query string.

    Args:
        bot_message: Message text to deliver (sent with Markdown parse mode).

    Returns:
        The decoded JSON body of the Telegram API response.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the request exceeds the timeout.
    """
    bot_token = '17XXXX32:AAFd5jXXXXXXXXXXXXC5UJgG5pses8'
    bot_chatID = '-XXXXXX'
    send_url = 'https://api.telegram.org/bot' + bot_token + '/sendMessage'
    payload = {
        'chat_id': bot_chatID,
        'parse_mode': 'Markdown',
        'text': bot_message,
    }
    # A timeout keeps the polling loop from hanging forever on a stalled connection.
    response = requests.get(send_url, params=payload, timeout=10)
    return response.json()


def get_discounts(soup, low=65, high=99):
    """Report whether any discount badge lies within ``low``..``high`` percent.

    Every element matching the ``.cept-discount`` selector is inspected and
    the digits of its text are read as the percentage (``-70%`` counts as 70).

    Args:
        soup: Parsed ``BeautifulSoup`` document (not the text from ``get_text()``).
        low: Smallest percentage that counts as a hit (inclusive).
        high: Largest percentage that counts as a hit (inclusive).

    Returns:
        ``True`` if at least one discount is in range, otherwise ``False``.
    """
    for badge in soup.select('.cept-discount'):
        digits = ''.join(filter(str.isdigit, badge.text))
        # Skip empty or non-numeric labels instead of crashing on int('');
        # keep scanning so a later badge can still match.
        if digits and low <= int(digits) <= high:
            return True
    return False


# State carried across polling cycles: the last page text seen and a first-pass flag.
PrevVersion = ""
FirstRun = True
while True:
    # download the page
    response = scraper.get("https://nl.pepper.com/nieuw").content
    # parse the downloaded homepage
    soup = BeautifulSoup(response, 'html.parser')
    # remove all scripts and styles
    for script in soup(["script", "style"]):
        script.extract()
    # the discount check needs the parsed tree, so it runs before the text is extracted
    discounts = get_discounts(soup)
    page_text = soup.get_text()
    if FirstRun:
        # on the first run - just memorize the page
        PrevVersion = page_text
        FirstRun = False
        print("Start Monitoring " + url + "" + str(datetime.now()))
    elif PrevVersion != page_text:
        if discounts:
            print("Changes detected at: " + str(datetime.now()))
            OldPage = PrevVersion.splitlines()
            NewPage = page_text.splitlines()
            # compare versions and highlight changes using difflib
            diff = difflib.context_diff(OldPage, NewPage, n=0)
            out_text = "\n".join([ll.rstrip() for ll in '\n'.join(diff).splitlines() if ll.strip()])
            print(out_text)
            # send the diff to the Telegram chat
            telegram_bot_sendtext("Nieuwe prijsfout op Pepper " + url + out_text)
        else:
            print("Changes detected, but no discount in range " + str(datetime.now()))
        # always advance the baseline so the next diff only covers new changes
        PrevVersion = page_text
    else:
        print("No Changes " + str(datetime.now()))
    # wait before polling again
    time.sleep(5)
CodePudding user response:
A simple way to find out whether there are any discounts between -65% and -99% could be the following.
This function takes your soup (the BeautifulSoup object),
looks at all the discounts on the page and returns True
if any discount is in your range, or False
if not:
def get_discounts(soup, low=65, high=99):
    """Report whether any discount badge lies within ``low``..``high`` percent.

    Every element matching the ``.cept-discount`` CSS selector is inspected;
    the digits in its text are read as the discount percentage, so ``-70%``
    counts as 70. Elements whose text contains no digits are skipped.

    Args:
        soup: Parsed document exposing ``select()`` (a ``BeautifulSoup``
            object, not the plain text returned by ``get_text()``).
        low: Smallest percentage that counts as a hit (inclusive).
        high: Largest percentage that counts as a hit (inclusive).

    Returns:
        ``True`` if at least one discount is in range, otherwise ``False``
        (also for a page without any discount elements).
    """
    for badge in soup.select('.cept-discount'):
        digits = ''.join(filter(str.isdigit, badge.text))
        # Skip empty or non-numeric labels instead of crashing on int('');
        # keep scanning so a later badge can still match.
        if digits and low <= int(digits) <= high:
            return True
    return False
# Example call: `soup` must still be the BeautifulSoup object here, not the text from get_text().
get_discounts(soup)
Note: call the function before you call soup = soup.get_text().
The order is crucial, because that call replaces the content of soup
with plain text.
It might be better to store the text in a differently named variable, e.g. souptext, so you can be sure that soup
always contains the BeautifulSoup
object,
which represents the document
as a nested data structure.
You will then end up with something like this:
import requests, time, difflib, os, re, schedule, cloudscraper
from bs4 import BeautifulSoup
from datetime import datetime
# Target URL; it is interpolated into the start-up log line and the Telegram message text.
# NOTE(review): the polling loop downloads https://nl.pepper.com/nieuw, not this URL - confirm which page is meant.
url = "https://nl.pepper.com/groep/prijsfout"
# Browser-like User-Agent headers kept for reference; they are disabled and nothing passes them on.
#headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
#headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246'}
# cloudscraper session that the polling loop uses to download the page.
scraper = cloudscraper.create_scraper()
# Send a message via a telegram bot
def telegram_bot_sendtext(bot_message):
    """Send ``bot_message`` to the configured Telegram chat via the Bot API.

    The text is handed to ``requests`` through ``params`` so that it gets
    URL-encoded. The diff output this script sends contains newlines and
    characters such as ``+``, ``#`` and ``&`` that would otherwise corrupt
    the query string (``#`` would even cut the message off).

    Args:
        bot_message: Message text to deliver (sent with Markdown parse mode).

    Returns:
        The decoded JSON body of the Telegram API response.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the request exceeds the timeout.
    """
    bot_token = '17XXXX32:AAFd5jXXXXXXXXXXXXC5UJgG5pses8'
    bot_chatID = '-XXXXX'
    send_url = 'https://api.telegram.org/bot' + bot_token + '/sendMessage'
    payload = {
        'chat_id': bot_chatID,
        'parse_mode': 'Markdown',
        'text': bot_message,
    }
    # A timeout keeps the polling loop from hanging forever on a stalled connection.
    response = requests.get(send_url, params=payload, timeout=10)
    return response.json()
# State carried across polling cycles: the last page text seen and a first-pass flag.
PrevVersion = ""
FirstRun = True
while True:
    # download the page
    # NOTE(review): this fetches /nieuw while `url` (used in the messages) points
    # at /groep/prijsfout - confirm which page is meant.
    response = scraper.get("https://nl.pepper.com/nieuw").content
    # parse the downloaded homepage
    soup = BeautifulSoup(response, 'html.parser')
    # remove all scripts and styles
    for script in soup(["script", "style"]):
        script.extract()
    # the discount check needs the parsed tree, so it runs before the text is extracted
    discounts = get_discounts(soup)
    # keep the text in its own name so `soup` stays the parsed document
    page_text = soup.get_text()
    if FirstRun:
        # on the first run - just memorize the page, whether or not a discount is in range
        PrevVersion = page_text
        FirstRun = False
        print("Start Monitoring " + url + "" + str(datetime.now()))
    elif PrevVersion != page_text:
        # only notify when the page changed and a discount is in range
        if discounts:
            print("Changes detected at: " + str(datetime.now()))
            OldPage = PrevVersion.splitlines()
            NewPage = page_text.splitlines()
            # compare versions and highlight changes using difflib
            diff = difflib.context_diff(OldPage, NewPage, n=0)
            # drop blank lines and trailing whitespace from the diff output
            out_text = "\n".join([ll.rstrip() for ll in '\n'.join(diff).splitlines() if ll.strip()])
            print(out_text)
            # send the diff to the Telegram chat
            telegram_bot_sendtext("Nieuwe prijsfout op Pepper " + url + out_text)
        else:
            print("Changes detected, but no discount in range " + str(datetime.now()))
        # always advance the baseline so the next diff only covers new changes
        PrevVersion = page_text
    else:
        print("No Changes " + str(datetime.now()))
    # wait before polling again
    time.sleep(5)