I am trying to scrape population data for each US state for a range of years. Here is my code so far:
#Import libraries
import numpy as np
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Years to scrape: 1991 through 2018 inclusive.
years = list(range(1991, 2019))
#Check to see if the website allows scraping
# (evaluating the Response object shows the HTTP status;
#  <Response [200]> means the request went through)
url = 'https://fred.stlouisfed.org/release/tables?rid=118&eid=259194'
page = requests.get(url)
page
# Set up an empty result dataframe with only the columns we will collect.
# (The original seeded it with a row of empty strings, which would pollute
# the column dtypes and leave a junk row in any later concat/append.)
df = pd.DataFrame(columns=['State', 'Year', 'Population'])
# Scrape one FRED release table per year and stack the results.
# Fixes relative to the original:
#   * `url '&od=' str(i) ...` was missing the `+` concatenation operators
#     (a SyntaxError as written).
#   * the inner loops reused `i`, clobbering the outer year variable.
#   * the year was "added" via full_df.append({'Year': i}), which appends
#     one near-empty row instead of tagging every row — use a column.
#   * df_lists was created AFTER the loop and iterated while empty, so
#     nothing was ever collected — build the list inside the loop instead.
df_lists = []
for year in years:
    full_link = url + '&od=' + str(year) + '-01-01#'
    page = requests.get(full_link)
    soup = BeautifulSoup(page.text, 'lxml')
    # Grab the state names
    states_list = [span.text
                   for span in soup.find_all('span', class_='fred-rls-elm-nm')]
    states_df = pd.DataFrame(states_list, columns=['State'])
    states_df['index'] = np.arange(len(states_df))
    # Grab the population values
    pop_list = [td.text
                for td in soup.find_all('td', class_='fred-rls-elm-vl-td')]
    pop_df = pd.DataFrame(pop_list, columns=['Pop'])
    # Clean up population values: drop newlines, surrounding whitespace and
    # thousands separators, then convert to numbers.
    # (errors='coerce' replaces the deprecated apply(..., errors='ignore');
    # non-numeric cells become NaN instead of silently staying strings.)
    pop_df['Pop'] = pop_df['Pop'].replace('\n', "", regex=True).str.strip()
    pop_df['Pop'] = pop_df['Pop'].replace(',', "", regex=True)
    pop_df['Pop'] = pd.to_numeric(pop_df['Pop'], errors='coerce')
    # Keep every third value — presumably the page repeats each state's
    # figure across three cells; confirm against the live table.
    pop_df = pop_df.iloc[::3, :]
    # Source figures appear to be in thousands (hence the *1000).
    pop_df['Pop'] = (pop_df['Pop'] * 1000).astype(int)
    # Create index for joining
    pop_df['index'] = np.arange(len(pop_df))
    # Combine state names with population values positionally
    full_df = pd.merge(
        states_df,
        pop_df,
        how='left',
        on=['index']
    )
    # Tag every row with the year this page covers
    full_df['Year'] = year
    df_lists.append(full_df)
# Print the list of per-year dataframes
df_lists
I have been able to get this script to work when I hard-code the year into the URL, but that is really inefficient. I want to loop through the years in the URL, grab the population data for each one, and package everything together into a single tidy dataframe. So I would expect the dataframe to look like this:
State Year Population
Alabama 1991 4567
Alaska 1991 4563
Arizona 1991 4532
.........................
West Virginia 2018 3454
Wisconsin 2018 3423
Wyoming 2018 9843
Can someone help point me in the right direction? I feel like I'm close, but I'm missing one or two details somewhere.
Any help would be appreciated! Thank you!
CodePudding user response:
An easy way to do it with pandas:
for x in range(1900, 2022): df_list = pd.read_html(f'https://fred.stlouisfed.org/release/tables?rid=118&eid=259194&od={x}-01-01#') print(df_list[0].head()) ## do stuff with data, save it, merge dfs, etc
CodePudding user response:
I figured it out - the main fix was giving the nested loops distinct variable names (subscripts j and k) so they no longer clobbered the outer year variable i, and collecting the per-year dataframes into a list that is concatenated at the end.
#Set up dataframe (placeholder skeleton; the real result is built below)
df = pd.DataFrame(
    {
        'State': [''],
        'Year': [''],
        'Population': ['']
    }
)

list_of_dfs = []
#For each year - run through this process
for i in years:
    # NOTE: the `+` operators were lost when this code was pasted;
    # string concatenation requires them.
    full_link = url + '&od=' + str(i) + '-01-01#'
    page = requests.get(full_link)
    soup = BeautifulSoup(page.text, 'lxml')
    #Grab the state names — the inner loop uses j so it does not
    #clobber the outer year variable i
    states = soup.find_all('span', class_='fred-rls-elm-nm')
    states_list = []
    for j in states:
        state_name = j.text
        states_list.append(state_name)
    #Package into df
    states_df = pd.DataFrame(states_list, columns=['State'])
    states_df['index'] = np.arange(len(states_df))
    #Grab the population values — again a distinct loop variable, k
    pops = soup.find_all('td', class_='fred-rls-elm-vl-td')
    pop_list = []
    for k in pops:
        pop_num = k.text
        pop_list.append(pop_num)
    #Package into a df
    pop_df = pd.DataFrame(pop_list, columns=['Pop'])
    #Clean up population values: strip newlines/whitespace and thousands
    #separators, then convert to numbers. errors='coerce' replaces the
    #deprecated apply(..., errors='ignore') form; bad cells become NaN.
    pop_df['Pop'] = pop_df['Pop'].replace('\n', "", regex=True).str.strip()
    pop_df['Pop'] = pop_df['Pop'].replace(',', "", regex=True)
    pop_df['Pop'] = pd.to_numeric(pop_df['Pop'], errors='coerce')
    #Grab every third value — presumably each figure is repeated across
    #three cells on the page; confirm against the live table
    pop_df = pop_df.iloc[::3, :]
    #Source figures appear to be in thousands, hence the *1000
    pop_df['Pop'] = (pop_df['Pop'] * 1000).astype(int)
    #Create index for joining
    pop_df['index'] = np.arange(len(pop_df))
    #Combine dataframes positionally
    full_df = pd.merge(
        states_df,
        pop_df,
        how='left',
        on=['index']
    )
    #Tag every row with the year
    year_pop = str(i)
    full_df['Year'] = year_pop
    list_of_dfs.append(full_df)

#Stack all per-year frames into one dataframe
full_df = pd.concat(list_of_dfs)
full_df