PROBLEM: I have a list of sites that I want BS and Pandas to grab a data table for. I want to add all the iterative results to the same xlsx or csv file.
My current code below will iterate over each of the 3 sites, but the final product is just the last page to get scraped. Removing my export function and just printing df
, I can see all 3 pages of data; so I'm not sure how to correctly append each iteration into my output file.
from bs4 import BeautifulSoup
import requests
import pandas as pd
from time import gmtime, strftime
#Pass in the URL
url = ["https://www.nfl.com/standings/league/2021/reg", "https://www.nfl.com/standings/league/2020/reg", "https://www.nfl.com/standings/league/2019/reg"]
for site in url:
#Load the page html
page = requests.get(site)
soup = BeautifulSoup(page.text, 'lxml')
# Get all the table data
table = soup.find('table', {'summary':'Standings - Detailed View'})
headers = []
for i in table.find_all('th'):
title = i.text.strip()
headers.append(title)
#Dataframe the headers into columns
df = pd.DataFrame(columns = headers)
# TR for the rows, TD for the values
for row in table.find_all('tr')[1:]:
data = row.find_all('td')
row_data = [td.text.strip() for td in data]
length = len(df)
df.loc[length] = row_data
#Write the collected data out to an Excel file
dateTime = strftime("%d%b%Y_%H%M", gmtime())
writer = pd.ExcelWriter(dateTime "Z" ".xlsx")
df.to_excel(writer)
writer.save()
print('[*] Data successfully written to Excel File.')
CodePudding user response:
Try the following. You need to capture all the dataframes from each url, then concatenate them, then write the new df to excel. This should work, but untested. See comments inline.
from bs4 import BeautifulSoup
import requests
import pandas as pd
from time import gmtime, strftime
#Pass in the URL
url = ["https://www.nfl.com/standings/league/2021/reg", "https://www.nfl.com/standings/league/2020/reg", "https://www.nfl.com/standings/league/2019/reg"]
df_hold_list = [] #collect each dataframe separately
for site in url:
#Load the page html
page = requests.get(site)
soup = BeautifulSoup(page.text, 'lxml')
# Get all the table data
table = soup.find('table', {'summary':'Standings - Detailed View'})
headers = []
for i in table.find_all('th'):
title = i.text.strip()
headers.append(title)
#Dataframe the headers into columns
df = pd.DataFrame(columns = headers)
# TR for the rows, TD for the values
for row in table.find_all('tr')[1:]:
data = row.find_all('td')
row_data = [td.text.strip() for td in data]
length = len(df)
df.loc[length] = row_data
df_hold_list.append(df) # add the dfs to the list
final_df = pd.concat(df_hold_list, axis=1) # put them together-check that axis=1 is correct, otherwise axis=0
# move this out of loop
#Write the collected data out to an Excel file
dateTime = strftime("%d%b%Y_%H%M", gmtime())
writer = pd.ExcelWriter(dateTime "Z" ".xlsx")
final_df.to_excel(writer) # write final_df to excel
writer.save()
print('[*] Data successfully written to Excel File.')