Scraping a table with BeautifulSoup and getting table data only if it is no older than 3 months [closed]-CodePudding

I'm stuck scraping data from the website https://security.paloaltonetworks.com/?severity=CRITICAL&severity=HIGH&product=GlobalProtect App&product=PAN-OS&sort=-updated&limit=100 and would like to scrape only the columns and rows that were updated at most 3 months ago; if a row is older than that, I don't want to scrape it. This is what I have come up with so far:

#Scraper
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime

def remove_attrs(soup, to_remove=tuple()):
    """Delete the named attributes from every tag in ``soup``.

    Args:
        soup: Parsed document; must provide ``findAll(True)`` yielding tags
            that expose ``attrs`` and support ``del tag[name]``.
        to_remove: Collection of attribute names to strip.

    Returns:
        The same ``soup`` object, modified in place.
    """
    for element in soup.findAll(True):
        # Snapshot the matching names first so that deleting does not
        # disturb iteration over the tag's attributes.
        doomed = tuple(name for name in element.attrs if name in to_remove)
        for name in doomed:
            del element[name]
    return soup

# Fetch the advisory list (requests percent-encodes the space in the URL).
url = 'https://security.paloaltonetworks.com/?severity=CRITICAL&severity=HIGH&product=GlobalProtect App&product=PAN-OS&sort=-updated&limit=10'
response = requests.get(url, timeout=30)  # never hang forever on a stalled connection
response.raise_for_status()  # fail loudly instead of parsing an HTTP error page
data = response.text
soup = BeautifulSoup(data, 'html.parser') #parsing content
remove_attrs(soup, ['onclick', 'href'])
tablePaloAlto = soup.find('table', {'class':'tbl salist wide'})
if tablePaloAlto is None:
    raise SystemExit('advisory table not found - the page layout may have changed')
tableAllTr = tablePaloAlto.find_all('tr')

max_age_days = 90  # "no older than 3 months"
today = datetime.now().date()
paloalto_list = []  # one list of cell texts per sufficiently recent row
for table_row in tableAllTr:
    cells = table_row.find_all('td')
    if not cells:  # e.g. a header row that only has <th> cells
        continue
    # The "Updated" date is taken from the last column (assumes the page keeps
    # that layout). re.search needs a string, so pass the cell's text, not the Tag.
    match = re.search(r'\d{4}-\d{2}-\d{2}', cells[-1].get_text())
    if match is None:  # no date in this row - nothing to compare against
        continue
    updated = datetime.strptime(match.group(), '%Y-%m-%d').date()
    if (today - updated).days <= max_age_days:
        row_values = [cell.get_text(strip=True) for cell in cells]
        paloalto_list.append(row_values)
        print(row_values)

Could you advise what to do next and guide me with some logic? Thank you very much!!

CodePudding user response：

You were pretty close!

When parsing tables, what you want to do is iterate through the table rows and parse the columns of each row.

For your case, you can use Python's datetime module to turn the text into datetime objects and compare them to today's date.
I've modified the for-loop part of your code to do exactly this:

from datetime import datetime, timedelta, timezone

# Naive UTC "now" (datetime.utcnow() is deprecated since Python 3.12).
now = datetime.now(timezone.utc).replace(tzinfo=None)
max_age = timedelta(days=90)  # roughly three months

for row in tableAllTr:
    # find column values of this row
    columns = row.find_all("td")
    if not columns:  # empty row? (e.g. a header row without <td> cells)
        continue
    # "updated on" data is last column
    # turn it into datetime object; strip surrounding whitespace first,
    # because strptime rejects any text beyond the format
    updated_on = datetime.strptime(columns[-1].text.strip(), "%Y-%m-%d")
    # how long has it been since update?
    elapsed = now - updated_on
    if elapsed < max_age:
        print([col.text for col in columns])

For more on this, see the documentation for Python's datetime module: https://docs.python.org/3.8/library/datetime.html - alternatively, the third-party datetime library arrow is a bit easier to work with.

CodePudding user response：

I think this should do it:

import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta

url = 'https://security.paloaltonetworks.com/?severity=CRITICAL&severity=HIGH&product=GlobalProtect App&product=PAN-OS&sort=-updated&limit=10'
# A timeout keeps the script from hanging forever on a stalled connection.
data = requests.get(url, timeout=30).text
soup = BeautifulSoup(data, 'html.parser') #parsing content
max_age = timedelta(days=30*3)  # "no older than 3 months"
now = datetime.now()  # taken once so every row is compared to the same instant
paloalto_list = []  # <tr> elements whose last cell holds a recent enough date
for table_row in soup.select("table tr"):
    cells = table_row.find_all('td')
    if not cells:  # rows without <td> cells carry no date to check
        continue
    # The date is read from the last column; strip whitespace that
    # strptime would otherwise reject.
    date_cell = cells[-1].text.strip()
    date = datetime.strptime(date_cell, '%Y-%m-%d')
    # Keep rows that are at most max_age old (the original kept the older ones).
    if (now - date) <= max_age:
        paloalto_list.append(table_row)