I have created a simple threaded request script:
import random
import threading
import time
from concurrent.futures import as_completed
from concurrent.futures.thread import ThreadPoolExecutor
import requests
from bs4 import BeautifulSoup
URLS = [
    'https://github.com/search?q=hello world',
    'https://github.com/search?q=python 3',
    'https://github.com/search?q=world',
    'https://github.com/search?q=i love python',
    'https://github.com/search?q=sport today',
    'https://github.com/search?q=how to code',
    'https://github.com/search?q=banana',
    'https://github.com/search?q=android vs iphone',
    'https://github.com/search?q=please help me',
    'https://github.com/search?q=batman',
]

def doScrape(response):
    soup = BeautifulSoup(response.text, 'html.parser')
    t = soup.find("div", {"class": "codesearch-results"}).find("h3")
    return {
        'url': response.url,
        'repository_results': t.text.strip()
    }

def doRequest(url):
    response = requests.get(url)
    time.sleep(random.randint(1, 3))
    return response

def ourLoop():
    with ThreadPoolExecutor(max_workers=2) as executor:
        future_tasks = [
            executor.submit(
                doRequest,
                url
            ) for url in URLS]
        for future in as_completed(future_tasks):
            response = future.result()
            if response.status_code == 200:
                result = doScrape(response)
                print(result)

while True:
    t = threading.Thread(target=ourLoop)
    t.start()
    print('Joining thread and waiting for it to finish...')
    t.join()
where I first start a thread that runs a ThreadPoolExecutor with 2 workers. The idea is to monitor 24/7 and notify myself whenever there has been a change (in this case, whenever the repository_results differs between the previous request and the latest request). Whenever there is a change, I want to print a message saying that there is a difference.
How can I do that using ThreadPoolExecutor, and how can I monitor a specific URL to see whether a change has happened or not?
CodePudding user response:
You can do this by storing the previous result for each URL in the list itself and passing it along with the response to doScrape:
import random
import threading
import time
from concurrent.futures import as_completed
from concurrent.futures.thread import ThreadPoolExecutor
import requests
from bs4 import BeautifulSoup
URLS = [
    'https://github.com/search?q=hello world',
    'https://github.com/search?q=python 3',
    'https://github.com/search?q=world',
    'https://github.com/search?q=i love python',
    'https://github.com/search?q=sport today',
    'https://github.com/search?q=how to code',
    'https://github.com/search?q=banana',
    'https://github.com/search?q=android vs iphone',
    'https://github.com/search?q=please help me',
    'https://github.com/search?q=batman',
]

# Create a list of dictionaries with urls and their previous result as None
url_ = []
for url in URLS:
    url_.append({'url': url, 'repository_results': None})

def doScrape(response, url_dict):
    result = {'url': url_dict['url'], 'repository_results': None, 'change': False}
    soup = BeautifulSoup(response.text, 'html.parser')
    t = soup.find("div", {"class": "codesearch-results"}).find("h3")
    current_response = t.text.strip()
    # If the previous result does not match the current one, set 'change' to True.
    # The only exception is when the previous result is None, i.e. this is the
    # first time we are scraping this URL.
    if current_response != url_dict['repository_results'] and url_dict['repository_results'] is not None:
        result['change'] = True
    result['repository_results'] = current_response
    return result

def doRequest(url_dict):
    response = requests.get(url_dict['url'])
    time.sleep(random.randint(1, 3))
    return response, url_dict

def ourLoop():
    with ThreadPoolExecutor(max_workers=2) as executor:
        future_tasks = [
            executor.submit(
                doRequest,
                url_dict
            ) for url_dict in url_]
        for future in as_completed(future_tasks):
            response, url_dict = future.result()
            if response.status_code == 200:
                result = doScrape(response, url_dict)
                # Write the latest result back so the next pass compares against it
                url_dict['repository_results'] = result['repository_results']
                print(result)
                if result['change']:
                    print(f'Changed for url : {result["url"]}!')

while True:
    t = threading.Thread(target=ourLoop)
    t.start()
    print('Joining thread and waiting for it to finish...')
    t.join()
The only case this misses is a change that happened before the very first run of the loop, since at that point we do not yet know the previous value of the scraped element.
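If you also want previously seen values to survive a restart of the script, you could persist them between runs. A minimal sketch, assuming you are fine with writing a baseline.json file next to the script (the file name and helper names below are illustrative, not part of the code above):

import json
import os

BASELINE_FILE = 'baseline.json'  # illustrative path, adjust as needed

def load_baseline():
    # Restore previously seen results so a restart does not reset the history
    if os.path.exists(BASELINE_FILE):
        with open(BASELINE_FILE) as f:
            saved = json.load(f)
        for url_dict in url_:
            url_dict['repository_results'] = saved.get(url_dict['url'])

def save_baseline():
    # Persist the latest results, e.g. after each pass of ourLoop()
    with open(BASELINE_FILE, 'w') as f:
        json.dump({d['url']: d['repository_results'] for d in url_}, f)

You would call load_baseline() once before the while True loop and save_baseline() after each t.join().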
Also, if you are planning to run this in a loop and only want to print when there is a change, make sure to update the repository_results key in the url_dict itself (inside doScrape); you can then omit the return result line as well:
import random
import threading
import time
from concurrent.futures import as_completed
from concurrent.futures.thread import ThreadPoolExecutor
import requests
from bs4 import BeautifulSoup
URLS = [
    'https://github.com/search?q=hello world',
    'https://github.com/search?q=python 3',
    'https://github.com/search?q=world',
    'https://github.com/search?q=i love python',
    'https://github.com/search?q=sport today',
    'https://github.com/search?q=how to code',
    'https://github.com/search?q=banana',
    'https://github.com/search?q=android vs iphone',
    'https://github.com/search?q=please help me',
    'https://github.com/search?q=batman',
]

# Create a list of dictionaries with urls and their previous result as None
url_ = []
for url in URLS:
    url_.append({'url': url, 'repository_results': None})

def doScrape(response, url_dict):
    soup = BeautifulSoup(response.text, 'html.parser')
    t = soup.find("div", {"class": "codesearch-results"}).find("h3")
    current_response = t.text.strip()
    # If the previous result does not match the current one, report a change.
    # The only exception is when the previous result is None, i.e. this is the
    # first time we are scraping this URL.
    if current_response != url_dict['repository_results'] and url_dict['repository_results'] is not None:
        print(f'Changed for url : {url_dict["url"]}')
    url_dict['repository_results'] = current_response

def doRequest(url_dict):
    response = requests.get(url_dict['url'])
    time.sleep(random.randint(1, 3))
    return response, url_dict

def ourLoop():
    with ThreadPoolExecutor(max_workers=2) as executor:
        future_tasks = [
            executor.submit(
                doRequest,
                url_dict
            ) for url_dict in url_]
        for future in as_completed(future_tasks):
            response, url_dict = future.result()
            if response.status_code == 200:
                doScrape(response, url_dict)

while True:
    t = threading.Thread(target=ourLoop)
    t.start()
    print('Joining thread and waiting for it to finish...')
    t.join()