I have created a simple threaded request script:
import random
import threading
import time
from concurrent.futures import as_completed
from concurrent.futures.thread import ThreadPoolExecutor
import requests
from bs4 import BeautifulSoup
URLS = [
    'https://github.com/search?q=hello world',
    'https://github.com/search?q=python 3',
    'https://github.com/search?q=world',
    'https://github.com/search?q=i love python',
    'https://github.com/search?q=sport today',
    'https://github.com/search?q=how to code',
    'https://github.com/search?q=banana',
    'https://github.com/search?q=android vs iphone',
    'https://github.com/search?q=please help me',
    'https://github.com/search?q=batman',
]

def doScrape(response):
    soup = BeautifulSoup(response.text, 'html.parser')
    t = soup.find("div", {"class": "codesearch-results"}).find("h3")
    return {
        'url': response.url,
        'repository_results': t.text.strip()
    }

def doRequest(url):
    response = requests.get(url)
    time.sleep(random.randint(1, 3))
    return response

def ourLoop():
    with ThreadPoolExecutor(max_workers=2) as executor:
        future_tasks = [
            executor.submit(
                doRequest,
                url
            ) for url in URLS]
        for future in as_completed(future_tasks):
            response = future.result()
            if response.status_code == 200:
                result = doScrape(response)
                print(result)

while True:
    t = threading.Thread(target=ourLoop)
    t.start()
    print('Joining thread and waiting for it to finish...')
    t.join()
where I first start a thread that runs a ThreadPoolExecutor with 2 workers. The idea is to monitor 24/7 and notify myself whenever there has been a change (in this case, whenever the repository_results differs between the previous request and the latest request). Whenever there is a change, I want to print a message saying that there is a difference.
How can I do that using ThreadPoolExecutor, and how can I monitor a specific URL to see whether a change has happened or not?
CodePudding user response:
You can do this by storing the previous result for each URL in the list itself and passing it along with the response to doScrape:
import random
import threading
import time
from concurrent.futures import as_completed
from concurrent.futures.thread import ThreadPoolExecutor
import requests
from bs4 import BeautifulSoup
URLS = [
    'https://github.com/search?q=hello world',
    'https://github.com/search?q=python 3',
    'https://github.com/search?q=world',
    'https://github.com/search?q=i love python',
    'https://github.com/search?q=sport today',
    'https://github.com/search?q=how to code',
    'https://github.com/search?q=banana',
    'https://github.com/search?q=android vs iphone',
    'https://github.com/search?q=please help me',
    'https://github.com/search?q=batman',
]

# Create a list of dictionaries with urls and their previous result as None
url_ = []
for url in URLS:
    url_.append({'url': url, 'repository_results': None})

def doScrape(response, url_dict):
    result = {'url': url_dict['url'], 'repository_results': None, 'change': False}
    soup = BeautifulSoup(response.text, 'html.parser')
    t = soup.find("div", {"class": "codesearch-results"}).find("h3")
    current_response = t.text.strip()
    # If the previous result does not match the current one, set 'change' to True.
    # The only exception is when the previous result is None, i.e. this is the
    # first time we are scraping this URL.
    if current_response != url_dict['repository_results'] and url_dict['repository_results'] is not None:
        result['change'] = True
    result['repository_results'] = current_response
    return result

def doRequest(url_dict):
    response = requests.get(url_dict['url'])
    time.sleep(random.randint(1, 3))
    return response, url_dict

def ourLoop():
    with ThreadPoolExecutor(max_workers=2) as executor:
        future_tasks = [
            executor.submit(
                doRequest,
                url_dict
            ) for url_dict in url_]
        for future in as_completed(future_tasks):
            response, url_dict = future.result()
            if response.status_code == 200:
                result = doScrape(response, url_dict)
                # Write the latest result back so the next pass compares against it
                url_dict['repository_results'] = result['repository_results']
                print(result)
                if result['change']:
                    print(f'Changed for url : {result["url"]}!')

while True:
    t = threading.Thread(target=ourLoop)
    t.start()
    print('Joining thread and waiting for it to finish...')
    t.join()
The only case this misses is a change that happened before the very first run of the loop, since at that point we do not yet know the previous value of the scraped element.
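If you also want previously seen values to survive a restart of the script, you could persist them between runs. A minimal sketch, assuming you are fine with writing a baseline.json file next to the script (the file name and helper names below are illustrative, not part of the code above):

import json
import os

BASELINE_FILE = 'baseline.json'  # illustrative path, adjust as needed

def load_baseline():
    # Restore previously seen results so a restart does not reset the history
    if os.path.exists(BASELINE_FILE):
        with open(BASELINE_FILE) as f:
            saved = json.load(f)
        for url_dict in url_:
            url_dict['repository_results'] = saved.get(url_dict['url'])

def save_baseline():
    # Persist the latest results, e.g. after each pass of ourLoop()
    with open(BASELINE_FILE, 'w') as f:
        json.dump({d['url']: d['repository_results'] for d in url_}, f)

You would call load_baseline() once before the while True loop and save_baseline() after each t.join().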
Also, if you are planning to run this in a loop and only want to print when there is a change, make sure to update the repository_results key in the url_dict itself (inside doScrape); you can then omit the return result line as well:
import random
import threading
import time
from concurrent.futures import as_completed
from concurrent.futures.thread import ThreadPoolExecutor
import requests
from bs4 import BeautifulSoup
URLS = [
    'https://github.com/search?q=hello world',
    'https://github.com/search?q=python 3',
    'https://github.com/search?q=world',
    'https://github.com/search?q=i love python',
    'https://github.com/search?q=sport today',
    'https://github.com/search?q=how to code',
    'https://github.com/search?q=banana',
    'https://github.com/search?q=android vs iphone',
    'https://github.com/search?q=please help me',
    'https://github.com/search?q=batman',
]

# Create a list of dictionaries with urls and their previous result as None
url_ = []
for url in URLS:
    url_.append({'url': url, 'repository_results': None})

def doScrape(response, url_dict):
    soup = BeautifulSoup(response.text, 'html.parser')
    t = soup.find("div", {"class": "codesearch-results"}).find("h3")
    current_response = t.text.strip()
    # If the previous result does not match the current one, report a change.
    # The only exception is when the previous result is None, i.e. this is the
    # first time we are scraping this URL.
    if current_response != url_dict['repository_results'] and url_dict['repository_results'] is not None:
        print(f'Changed for url : {url_dict["url"]}')
    url_dict['repository_results'] = current_response

def doRequest(url_dict):
    response = requests.get(url_dict['url'])
    time.sleep(random.randint(1, 3))
    return response, url_dict

def ourLoop():
    with ThreadPoolExecutor(max_workers=2) as executor:
        future_tasks = [
            executor.submit(
                doRequest,
                url_dict
            ) for url_dict in url_]
        for future in as_completed(future_tasks):
            response, url_dict = future.result()
            if response.status_code == 200:
                doScrape(response, url_dict)

while True:
    t = threading.Thread(target=ourLoop)
    t.start()
    print('Joining thread and waiting for it to finish...')
    t.join()