I'm new to Python and am trying to scrape election results from this Wikipedia page:
In order to keep them out of our dataframe, we can create a function to filter them from our selection:
def rowHasColspan(tr):
    """Return True if any ``<td>`` cell in the row spans more than one column.

    Args:
        tr: A table-row element exposing ``select('td')``; every cell it
            returns must support ``get('colspan')``.

    Returns:
        bool: True when at least one cell carries a ``colspan`` greater
        than 1, otherwise False (including for rows without ``<td>`` cells).
    """
    for cell in tr.select('td'):
        colspan = cell.get('colspan')
        if colspan is None:
            continue
        try:
            span = int(colspan)
        except (TypeError, ValueError):
            # A malformed colspan (e.g. "abc") counts as "no span" instead
            # of aborting the whole scrape.
            continue
        if span > 1:
            return True
    return False
Now, by putting it all together, we can finally scrape the data:
# Collect one list per candidate row: [electorate title, party, candidate, ...].
df = []
titles = content.select('h3')
# NOTE(review): assumes the n-th <h3> is the heading of the n-th table -- confirm
# against the page markup. zip() stops at the shorter sequence instead of
# raising IndexError when there are fewer headings than tables.
for h3, table in zip(titles, tables):
    span = h3.find('span')
    # Fall back to the heading's own text when it contains no <span>.
    title = span.decode_contents() if span is not None else h3.get_text(strip=True)
    in_tpp_section = False
    for row in table.select('tr'):
        if not in_tpp_section:
            # Skip rows until the "Two-party-preferred" header row is reached.
            in_tpp_section = bool(isRowTitle(row) and isRowTPP(row))
            continue
        if isRowTitle(row):
            # The next header row ends the two-party-preferred section.
            break
        if rowHasColspan(row):
            continue
        cells = row.find_all('td')
        if not cells:
            continue
        # The first cell is dropped; the title takes its place.
        data = [title] + [d.text.rstrip() for d in cells[1:]]
        df.append(data)
Here's the entire script:
from bs4 import BeautifulSoup
import pandas as pd
import requests
# Download the results page and parse it.
url = 'https://en.wikipedia.org/wiki/Results_of_the_2019_New_South_Wales_state_election_(Legislative_Assembly)'
# A timeout keeps the script from hanging forever on a stalled connection, and
# raise_for_status() stops us from parsing an HTTP error page as if it were data.
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.content
soup = BeautifulSoup(html, 'html.parser')
content = soup.find('div', {'id': 'content'})
if content is None:
    raise RuntimeError("Could not find the <div id='content'> element in the page")
# Drop the navigation boxes (and their style blocks) so that the tables they
# contain are not picked up by the table selection below.
for css_class in ('navbox', 'navbox-styles'):
    undesired = content.find_all('div', {'class': css_class})
    for udiv in undesired:
        udiv.decompose()
tables = content.select('table')
del tables[:1]  # Remove Top-Right table (no error if no table was found)
def isRowTPP(tr):
    """Return True if the row's header cell links to 'Two-party-preferred'.

    Args:
        tr: A table-row element exposing ``find('th')``; the header cell it
            returns must expose ``find('a')``.

    Returns:
        bool: True only when the first ``<th>`` contains an ``<a>`` whose
        inner markup is exactly ``'Two-party-preferred'``. Rows without a
        ``<th>`` or without a link yield False.
    """
    th = tr.find('th')
    if th is None:
        # Data rows have no header cell; report False rather than failing.
        return False
    a = th.find('a')
    return a is not None and a.decode_contents() == 'Two-party-preferred'
def isRowTitle(tr):
    """Return True if the row contains a header (``<th>``) cell.

    Args:
        tr: A table-row element exposing ``find('th')``.

    Returns:
        bool: True when ``find('th')`` returns an element, False when it
        returns None.
    """
    return tr.find('th') is not None
def rowHasColspan(tr):
    """Return True if any ``<td>`` cell in the row spans more than one column.

    Args:
        tr: A table-row element exposing ``select('td')``; every cell it
            returns must support ``get('colspan')``.

    Returns:
        bool: True when at least one cell carries a ``colspan`` greater
        than 1, otherwise False (including for rows without ``<td>`` cells).
    """
    for cell in tr.select('td'):
        colspan = cell.get('colspan')
        if colspan is None:
            continue
        try:
            span = int(colspan)
        except (TypeError, ValueError):
            # A malformed colspan (e.g. "abc") counts as "no span" instead
            # of aborting the whole scrape.
            continue
        if span > 1:
            return True
    return False
# Collect one list per candidate row: [electorate title, party, candidate, ...].
records = []
titles = content.select('h3')
# NOTE(review): assumes the n-th <h3> is the heading of the n-th table -- confirm
# against the page markup. zip() stops at the shorter sequence instead of
# raising IndexError when there are fewer headings than tables.
for h3, table in zip(titles, tables):
    span = h3.find('span')
    # Fall back to the heading's own text when it contains no <span>.
    title = span.decode_contents() if span is not None else h3.get_text(strip=True)
    in_tpp_section = False
    for row in table.select('tr'):
        if not in_tpp_section:
            # Skip rows until the "Two-party-preferred" header row is reached.
            in_tpp_section = bool(isRowTitle(row) and isRowTPP(row))
            continue
        if isRowTitle(row):
            # The next header row ends the two-party-preferred section.
            break
        if rowHasColspan(row):
            continue
        cells = row.find_all('td')
        if not cells:
            continue
        # The first cell is dropped; the title takes its place.
        records.append([title] + [d.text.rstrip() for d in cells[1:]])
# The first column (named ' ') holds the title taken from the <h3> heading.
df = pd.DataFrame(records, columns=[' ','Party','Candidate','Votes','Pct','Pct_ch'])
print(df)