import pandas as pd
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
cba_url = "https://www.cbabbq.com/events.cfm"
client = uReq(cba_url)
page_html = client.read()
page_soup = soup(page_html, features='lxml')
# CBA Event Names
cba_event_tags = page_soup.findAll("td", {"class":"th1"})
cba_event_names = []
for cba_event_name in cba_event_tags:
    cba_event_names.append(cba_event_name.text)
df_cba_event_names = pd.DataFrame(cba_event_names)
#print(df_cba_event_names)
# CBA Event Location
cba_location_tags = page_soup.findAll("td", {"class":"th2"})
cba_event_locations = []
for cba_event_location in cba_location_tags:
    cba_event_locations.append(cba_event_location.text)
df_cba_event_locations = pd.DataFrame(cba_event_locations)
#print(df_cba_event_locations)
# CBA Event Date
cba_date_tags = page_soup.findAll("td")
cba_date = str(cba_date_tags).split("<")[5].split(">")[1]
cba_dates = []
for k in cba_date_tags:
    cba_dates.append(k.text)
print(cba_dates)
I am trying to obtain the dates, but they sit in the same "td" tags as the other fields with no distinguishing class, so I'm unsure how to target that specific line of HTML.
CodePudding user response:
Here is how to get everything from that page:
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
url = "https://www.cbabbq.com/events.cfm"
page = uReq(url)
page_soup = soup(page.read(), 'lxml')
data = page_soup.select('.table2 td')
for event, date, location in zip(data[0::3], data[1::3], data[2::3]):
    print(f'{event.text} -- {date.text} -- {location.text}')
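Since the question already builds pandas DataFrames, here is a minimal sketch that collects those same rows into a single DataFrame; it assumes the three-cells-per-row layout above, and the column names are just placeholders:
import pandas as pd

# Group the cells three at a time and strip whitespace from each value
rows = [(e.text.strip(), d.text.strip(), l.text.strip())
        for e, d, l in zip(data[0::3], data[1::3], data[2::3])]
df_events = pd.DataFrame(rows, columns=['event', 'date', 'location'])
print(df_events)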
Another way to iterate over found rows:
for i in range(0, len(data) - 2, 3):
    event, date, location = data[i:i+3]
    print(f'{event.text} -- {date.text} -- {location.text}')
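If the markup uses standard table rows (an assumption, I haven't re-checked the page), you can also iterate per tr and avoid counting cells by hand:
for row in page_soup.select('.table2 tr'):
    cells = row.find_all('td')
    if len(cells) == 3:  # skip header or malformed rows
        event, date, location = cells
        print(f'{event.text} -- {date.text} -- {location.text}')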
If you want to just get the dates:
dates = page_soup.select('.table2 td:nth-of-type(3)')
for d in dates:
    print(d.text)
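And if you want real datetime values rather than strings, a small sketch using pandas (errors='coerce' because I haven't verified the site's date format, so anything unparseable becomes NaT):
import pandas as pd

# Parse the scraped date strings into a DatetimeIndex
date_values = pd.to_datetime([d.text.strip() for d in dates], errors='coerce')
print(date_values)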