Scraping only new elements from website using requests and bs4


I want to watch a certain website and collect data from it. On the first visit to the site I collect all the data that is already there, so it can be ignored later. Then I want to perform a certain action whenever a new row is added (for example, print it, as in the code below). But whenever a new item appears, it seems to print every single row on the website, even though I'm checking whether the row already exists in the dictionary. I don't know how to fix it; can anyone take a look?

import requests
import re
import copy
import time

from datetime import date
from bs4 import BeautifulSoup


class KillStatistics:
    def __init__(self):
        self.records = {}
        self.watched_names = ["Test"]
        self.iter = 0

    def parse_records(self):
        r = requests.get("http://149.56.28.71/?subtopic=killstatistics")
        soup = BeautifulSoup(r.content, "html.parser")
        table = soup.findChildren("table")

        for record in table:
            for data in record:
                if data.text == "Last Deaths":
                    pass
                else:
                    entry = data.text
                    entry = re.split("..?(?=[0-9][A-Z]).", data.text)
                    entry[0] = entry[0].split(", ")
                    entry[0][0] = entry[0][0].split(".")

                    entry_id, day, month, year, hour = (
                        entry[0][0][0],
                        entry[0][0][1],
                        entry[0][0][2],
                        entry[0][0][3],
                        entry[0][1],
                    )

                    message = entry[1]
                    nickname = (re.findall(".+?(?=at)", message)[0]).strip()
                    killed_by = (re.findall(r"(?<=\bby).*", message)[0]).strip()
                    if self.iter < 1:
                        """Its the first visit to the website, i want it to download all the data and store it in dictionary"""
                        self.records[
                            entry_id
                        ] = f"{nickname} was killed by {killed_by} at {day}-{month}-{year} {hour}"
                    elif (
                        self.iter > 1
                        and f"{nickname} was killed by {killed_by} at {day}-{month}-{year} {hour}"
                        not in self.records.values()
                    ):
                        """Here I want to look into the dictionary to check if the element exists in it, 
                        if not print it and add to the dictionary at [entry_id] so we can skip it in next iteration
                        Don't know why but whenever a new item appears on the website it seems to edit every item in the dictionary instead of just editing the one
                        that wasnt there"""
                        print(
                            f"{nickname} was killed by {killed_by} at {day}-{month}-{year} {hour}"
                        )
                        self.records[
                            entry_id
                        ] = f"{nickname} was killed by {killed_by} at {day}-{month}-{year} {hour}"
        print("---")
        self.iter += 1


ks = KillStatistics()
if __name__ == "__main__":
    while True:
        ks.parse_records()
        time.sleep(10)

The entry_id values are always the same: there are 500 rows of data with ids 1, 2, 3 ... 500, and the newest is always 1. I know I could always check id 1 to get the newest, but sometimes, for example, 10 players can die at the same time, so I would like to check all the rows and only print the ones that are new.

Current output:

Velerion was killed by Rat and Cave Rat at 27-12-2021 16:53
Scrappy was killed by Cursed Queen at 27-12-2021 16:52
Velerion was killed by Rat at 27-12-2021 16:28
Velerion was killed by Rat at 27-12-2021 16:22
Velerion was killed by Rat at 27-12-2021 16:21
Velerion was killed by Rat at 27-12-2021 15:51
Shade was killed by Tentacle Slayer at 27-12-2021 15:46
Mr Yahoo was killed by Immortal Hunter at 27-12-2021 15:41
Scrappy was killed by Witch Hunter at 27-12-2021 15:39
Barbudo Arqueiro was killed by Seahorse at 27-12-2021 15:23
Emperor Martino was killed by Dark Slayer at 27-12-2021 15:14
Shade was killed by Tentacle Slayer at 27-12-2021 15:11
Head Hunter was killed by Demon Blood Slayer at 27-12-2021 15:09

Expected output:

Velerion was killed by Rat and Cave Rat at 27-12-2021 16:53
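
For illustration, here is what I think happens to the ids when one new death appears (the messages are made up, only the shift matters). Every id suddenly points at a different message, which would explain why writing by entry_id touches every entry:

old_rows = {1: "B was killed by Rat", 2: "C was killed by Bear", 3: "D was killed by Wolf"}
# One new death pushes every row down a position, so the same ids now map to different messages:
new_rows = {1: "A was killed by Demon", 2: "B was killed by Rat", 3: "C was killed by Bear"}

# Keying on the id therefore rewrites every entry; comparing the message text itself
# only flags the genuinely new row:
print(set(new_rows.values()) - set(old_rows.values()))  # {'A was killed by Demon'}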

CodePudding user response:

Here's how I've changed your processing. I'm using one regex to parse all the header information; that gets me all 7 numeric fields at once, and the overall length of the match tells me where the message starts.

Next, I'm using the timestamp to determine which data is new. The newest entry is always first, so I grab the first timestamp of the batch and use it as the threshold for the next pass.

Then, I'm storing the entries in a list instead of a dict. If you don't really need to store them forever, but just want to print them, then you don't need to track the list at all.
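
Before the full version, here is a quick standalone check of those two ideas against a made-up cell text (the real cells on the site may be shaped slightly differently):

import re

# Assumed shape of one cell: "id.day.month.year, hh:mm:ss" followed directly by the message.
sample = "1.27.12.2021, 16:53:21Velerion at level 30 killed by Rat and Cave Rat"

prefix = r"(\d+)\.(\d+)\.(\d+)\.(\d+)\, (\d+):(\d+):(\d+)"
mo = re.match(prefix, sample)
print(mo.groups())            # ('1', '27', '12', '2021', '16', '53', '21')
print(sample[mo.span()[1]:])  # 'Velerion at level 30 killed by Rat and Cave Rat'

# Tuples compare element by element, so (year, month, day, hh, mm, ss) tuples
# order chronologically without needing datetime objects:
print((2021, 12, 27, 16, 53, 21) > (2021, 12, 27, 16, 52, 59))  # True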

import requests
import re
import copy
import time

from datetime import date
from bs4 import BeautifulSoup

prefix = r"(\d+)\.(\d+)\.(\d+)\.(\d+)\, (\d+):(\d+):(\d+)"

class KillStatistics:
    def __init__(self):
        self.records = []
        self.latest = (0, 0, 0, 0, 0, 0)  # (year, month, day, hour, minute, second)
        self.watched_names = ["Test"]

    def parse_records(self):
        r = requests.get("http://149.56.28.71/?subtopic=killstatistics")
        soup = BeautifulSoup(r.content, "html.parser")
        table = soup.findChildren("table")
        latest = None

        for record in table:
            for data in record:
                if data.text == "Last Deaths":
                    continue
                entry = data.text
                mo = re.match(prefix, entry)
                entry_id, day, month, year, hour, mm, ss = mo.groups()
                stamp = tuple(int(i) for i in (year, month, day, hour, mm, ss))
                if latest is None:
                    latest = stamp

                if stamp > self.latest:
                    rest = entry[mo.span()[1]:]
                    i = rest.find(" at ")
                    j = rest.find(" by ")
                    nickname = rest[:i]
                    killed_by = rest[j + 4:]
                    msg = f"{nickname} was killed by {killed_by} at {day}-{month}-{year} {hour}"
                    print( msg )
                    self.records.append( msg )
        print("---")
        self.latest = latest

ks = KillStatistics()
if __name__ == "__main__":
    while True:
        ks.parse_records()
        time.sleep(10)

CodePudding user response:

I managed to find the solution myself. In this specific case, I needed to create a list of entries and keep a copy of it. After each lookup I compare the newly created list with the old one, and using set().difference() I get back the new records.
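
In isolation the comparison looks like this (made-up records in the same "nickname,killed_by,date" format; note that set.difference() ignores order and would collapse two byte-for-byte identical records into one):

old = ["Scrappy,Cursed Queen,27-12-2021 16:52", "Velerion,Rat,27-12-2021 16:28"]
new = [
    "Velerion,Rat and Cave Rat,27-12-2021 16:53",
    "Scrappy,Cursed Queen,27-12-2021 16:52",
    "Velerion,Rat,27-12-2021 16:28",
]

# Only entries missing from the previous snapshot survive the difference:
print(list(set(new).difference(old)))  # ['Velerion,Rat and Cave Rat,27-12-2021 16:53']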

import requests
import re
import copy
import time

from datetime import date
from bs4 import BeautifulSoup


class KillStatistics:
    def __init__(self):
        self.records = []
        self.old_table = []
        self.watched_names = ["Test"]
        self.visited = False

    def parse_records(self):
        r = requests.get("http://149.56.28.71/?subtopic=killstatistics")
        soup = BeautifulSoup(r.content, "html.parser")
        table = soup.findChildren("table")
        self.records = []

        for record in table:
            for data in record:
                if data.text == "Last Deaths":
                    continue
                else:

                    entry = data.text
                    entry = re.split("..?(?=[0-9][A-Z]).", data.text)
                    entry[0] = entry[0].split(", ")
                    entry[0][0] = entry[0][0].split(".")

                    entry_id, day, month, year, hour = (
                        entry[0][0][0],
                        entry[0][0][1],
                        entry[0][0][2],
                        entry[0][0][3],
                        entry[0][1],
                    )
                    ## record timestamps are in EST; they can be converted to Brazilian time and to CET
                    message = entry[1]
                    nickname = (re.findall(".+?(?=at)", message)[0]).strip()
                    killed_by = (re.findall(r"(?<=\bby).*", message)[0]).strip()
                    record_to_add = (
                        f"{nickname},{killed_by},{day}-{month}-{year} {hour}"
                    )

                    self.records.append(record_to_add)

        if len(self.old_table) > 0:
            self.compare_records(self.old_table, self.records)
        else:
            print("Setting up initial data...")
        print("---")
        self.visited = True

    def compare_records(self, old_records, new_records):
        new_record = list(set(new_records).difference(old_records))
        if len(new_record) > 0:
            print("We got new record")
            for i in new_record:
                print(i)
                with open("/home/sammy/gp/new_records", "a") as f:
                    f.write(i + "\n")
        else:
            print("No new records")

class DiscordBot:
    def __init__(self):
        pass


if __name__ == "__main__":
    ks = KillStatistics()
    while True:
        ks.parse_records()
        ks.old_table = copy.deepcopy(ks.records)
        time.sleep(10)
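
One small simplification: the records are plain strings, so the snapshot in the main loop does not need copy.deepcopy; a shallow copy (or a set) preserves the previous state just as well. A possible variant of the loop, assuming the class above stays unchanged:

if __name__ == "__main__":
    ks = KillStatistics()
    while True:
        ks.parse_records()
        # The records are immutable strings, so a shallow copy is enough
        # to freeze the previous snapshot for the next comparison.
        ks.old_table = list(ks.records)
        time.sleep(10)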
