I'm working on a code to scrape data on NBA players age straight from Wikipedia, pretty much finished it some days ago, returned to it today to check if it's working, whole code looks like this (I've added try/except statement in yyyy_mm_dd_to_age function and commented out the extra code that works fine):
import bs4
import pandas as pd
import numpy as np
import requests
import statistics
import datetime as dt
from typing import Generator, Tuple, List, T
def tables_by_class(soup: bs4.BeautifulSoup,
html_class: str) -> Generator[pd.DataFrame, None, None]:
tables = soup.find_all("table", {"class": html_class})
tables = pd.read_html(str(tables))
for table in tables:
yield table
def divs_by_id(soup: bs4.BeautifulSoup,
html_id: str) -> bs4.element.ResultSet:
elements = soup.find_all("div", {"id": html_id})
return elements
def yyyy_mm_dd_to_age(b_date: str) -> float:
try:
b_date = dt.datetime.strptime(b_date, "%Y-%m-%d")
today = dt.date.today()
today = dt.datetime(year = today.year,
month = today.month,
day = today.day)
return round((today - b_date).days / 365.25, 2)
except ValueError:
return b_date
def l_flatten(l: List[T]) -> List[T]:
return [j for i in l for j in i]
def l_to_bins(l: List[float],
bins: List[float]) -> List[float]:
pairs = {(i, j): 0 for i, j in zip(bins, bins[1:])}
for pair in pairs:
pairs[pair] = len([i for i in l if pair[0] < i <= pair[1]])
return pairs
def team_data():
url = "https://en.wikipedia.org/wiki/List_of_current_NBA_team_rosters"
html_content = requests.get(url).text
soup = bs4.BeautifulSoup(html_content, "lxml")
team_names = divs_by_id(soup, "toc")[0]
team_names = team_names.find_all("a", href = True)
words_out = ["Conference", "Division", "See", "References", "External"]
team_names = [i["href"][1:] for i in team_names
if not any(j in i["href"] for j in words_out)]
team_tables = tables_by_class(soup, "sortable")
team_tables = [table["DOB (YYYY-MM-DD)"].tolist() for table in team_tables]
team_tables = [[yyyy_mm_dd_to_age(b_date) for b_date in table]
for table in team_tables]
return team_tables
# this is necessary, but i commented it out to figure out the bug
#team_tables = [sorted(table) for table in team_tables]
#age_dict = {name: table for name, table in zip(team_names, team_tables)}
#age_dict = dict(sorted(age_dict.items()))
#age_dataframe = pd.DataFrame(list(age_dict.values()),
# index = list(age_dict.keys()))
#age_dist = l_flatten(age_dataframe.values.tolist())
#age_dist = l_to_bins(age_dist, [i for i in np.linspace(18, 42, 9)])
#return age_dataframe, age_dist
data = team_data()
print(data)
The output is this:
[[27.56, 24.97, 23.17, 23.85, 26.05, 35.37, 29.4, 21.97, 24.97, 24.5, 22.0, 26.42, 25.94, 23.71, 28.08, 28.08, 27.61, 23.62, 22.87, 23.99], [36.24, 27.28, '1997–01–20', 25.17, 25.02, 26.09, 22.5, 22.01, 33.04, 21.18, 22.28, 32.58, 32.14, 30.11, 29.56, 34.65, 33.18, 36.68, '1999–09–30', 19.94, 20.01, 23.2], [21.34, 30.24, 28.96, 36.31, 26.04, 21.44, 22.18, 21.1, 27.52, 22.33, 26.88, 23.54, 33.03, 27.04, 22.66, 22.99, 23.62, 31.44, 23.55], [20.96, 31.15, 28.18, 27.58, 34.32, 29.25, 28.02, 22.13, 22.29, 24.23, 21.03, 25.05, 28.33, 22.31, 24.68, 25.24, 19.06, 24.62], [22.07, 24.25, 21.94, 20.21, 29.05, 21.94, 28.76, 20.3, 27.44, 35.44, 23.43, 20.63, 24.35, 27.54, 22.74, 27.64, 27.09, 27.01], [23.97, 23.21, 22.22, 27.63, 24.06, 32.19, 21.74, 22.2, 28.23, 25.48, 25.38, 24.66, 26.6, 22.0, 27.2, 30.98, 21.66, 20.14], [23.49, 23.29, 25.85, 21.72, 24.18, 33.1, 24.4, 20.33, 22.57, 20.72, 26.52, 28.72, 30.98, 22.78, 24.27, 27.91, 24.9, 25.06], [22.52, 20.05, 30.4, 23.21, 22.8, 27.59, 20.22, 23.45, 24.68, 30.15, 22.31, 23.22, 25.94, 30.21, 30.49, 23.81, 21.81, 22.16, 20.4, 26.54], [22.24, 23.32, 28.84, 29.56, 24.34, 22.35, 32.53, 19.76, 23.73, 29.38, 27.14, 26.2, 29.56, 25.45, 27.79, 22.06, 25.56, 32.22, 28.11, 21.56], [26.02, 26.86, 29.24, 26.49, 28.77, 23.52, 24.7, 35.45, 31.34, 28.99, 22.78, 33.54, 22.4, 30.18, 23.1, 28.37, 26.86, 26.68, 24.01, 23.76], [29.16, 27.41, 24.06, 20.34, 31.74, 33.19, 27.7, 30.58, 23.13, 23.87, 22.13, 19.82, 26.44, 24.11, 25.83, 20.84, 22.12, 34.97, 29.47, 23.07], [20.15, 21.07, 23.57, 20.64, 22.25, 31.56, 26.82, 20.74, 23.78, 21.59, 26.05, 23.7, 24.04, 25.85, 31.61, 23.88, 27.58, 33.28, 19.14, 23.15], [24.24, 32.08, 32.18, 22.93, 41.35, 21.74, 35.56, 26.05, 32.12, 22.47, 27.45, 23.53, 27.48, 22.37, 23.79, 22.22, 25.55, 36.45, 25.34, 23.32], [21.42, 23.43, 22.77, 22.5, 30.02, 24.43, 23.38, 25.25, 20.68, 27.09, 24.03, 33.54, 32.64, 23.16, 30.69, 24.54, 20.37, 22.89, 20.13, 24.47], [20.78, 28.3, 28.92, 24.21, 28.65, 28.53, 24.47, 23.04, 28.99, 22.98, 23.68, 27.72, 25.04, 23.14, 22.62, 26.23, 29.41, 23.34, 19.99, 23.63], [30.77, 21.9, 21.91, 30.57, 24.51, 26.22, 24.97, 26.08, 31.32, 35.13, 22.62, 21.08, 26.65, 26.3, 24.64, 20.76, 23.3, 26.35, 29.2, 24.5], [24.88, 33.26, 21.09, 23.04, 20.19, 24.07, 27.61, 22.82, 21.04, 25.52, 22.93, 22.27, 23.12, 27.57, 22.14, 25.64, 25.07, 25.92, 22.54, 22.97], [21.34, 26.68, 24.73, 22.49, 30.25, 19.01, 23.26, 24.27, 21.32, 20.34, 20.7, 30.29, 19.8, 20.95, 23.7, 26.79, 22.78, 26.87], [25.89, 20.12, 24.29, 28.56, 30.84, 21.33, 31.25, 21.68, 30.07, 28.67, 28.79, 27.15, 32.59, 28.39, 22.35, 23.89, 29.93, 20.93, 29.03], [24.08, 22.08, 32.49, 23.5, 21.14, 29.36, 34.01, 24.28, 23.34, 35.16, 29.3, 23.6, 34.04, 25.1, 28.36, 24.2, 24.94, 24.34, 32.34], [26.77, 33.43, 30.89, 25.89, 33.59, 31.62, 37.71, 19.03, 28.98, 25.69, 19.38, 27.34, 28.87, 22.32, 28.37, 31.68, 28.51, 26.64, 20.54], [32.84, 31.85, 19.88, 24.33, 31.46, 23.48, 23.45, 32.07, 31.5, 19.6, 25.31, 27.75, 30.3, 24.99, 32.12, 22.18, 21.12, 25.56, 22.81, 24.58], [37.38, 36.29, 21.61, 32.29, 22.86, 28.6, 20.81, 33.88, 20.89, 35.85, 36.79, 33.23, 22.77, 23.69, 26.2, 25.26, 24.64, 23.38, 35.64, 32.92], [23.23, 24.96, 25.13, 31.28, 25.47, 25.62, 28.53, 33.74, 28.05, 36.44, 27.19, 27.64, 28.69, 27.52, 24.59, 21.58], [22.59, 29.38, 23.73, 24.42, 23.82, 21.63, 28.43, 28.83, 28.0, 26.29, 22.53, 28.33, 24.57, 23.11, 22.26, 20.35, 20.04, 25.15, 30.59, 22.09], [22.01, 26.68, 25.12, 30.58, 28.92, 28.16, 22.63, 28.45, 20.91, 29.58, 22.28, 23.81, 29.71, 33.17, 23.71, 23.22, 24.67, 26.2, 30.24, 21.05], [33.93, 23.68, 23.36, 19.85, 26.26, 19.6, 32.8, 19.68, 28.36, 23.73, 20.77, 19.67, 28.75, 21.45, 19.23, 25.97, 29.53, 31.11, 26.05], [28.24, 20.76, 28.07, 23.31, 25.73, 25.07, 22.65, 27.58, 22.08, 25.43, 25.57, 24.95, 23.38, 25.42, 22.18, 22.39, 23.61, 22.76, 20.35, 20.09], [23.12, 23.51, 23.75, 24.56, 26.64, 26.61, 21.4, 27.39, 25.97, 25.28, 24.12, 23.03, 20.53, 22.29, 23.72, 21.33, 29.96, 35.44, 29.44, 21.28], [31.07, 25.73, 23.9, 24.7, 28.23, 22.01, 21.77, 25.97, 29.78, 25.07, 26.0, 18.81, 21.14, 22.84, 27.29, 22.15, 33.32]]
As you see, all the dates have been successfully converted to age except for two, and I've been going crazy trying to figure it out. Can anybody help me out?
CodePudding user response:
some of the date time format does not using dash "-"
as separator
they are using "–"
try change your dt.datetime.strptime(b_date, "%Y-%m-%d")
to try_datetime(b_date)
and it may work
def try_datetime(b_date: str) -> dt.datetime:
try:
return dt.datetime.strptime(b_date, "%Y-%m-%d")
except:
return dt.datetime.strptime(b_date, "%Y–%m–%d")