Home > Software engineering >  Scrape specific rows across multiple tables on Wikipedia
Scrape specific rows across multiple tables on Wikipedia

Time:10-10

I'm new to Python and am trying to scrape election results from this Wikipedia page:

Rows with Colspan

To keep rows that contain a cell with a colspan out of our DataFrame, we can write a function that detects such rows so they can be filtered from our selection:

def rowHasColspan(tr):
    """Return True if any <td> in the row spans more than one column.

    Args:
        tr: A table-row element exposing ``select('td')``; every selected
            cell must expose ``get('colspan')`` (e.g. a BeautifulSoup Tag).

    Returns:
        bool: True when at least one cell carries a ``colspan`` whose
        integer value is greater than 1, otherwise False.
    """
    for cell in tr.select('td'):
        span = cell.get('colspan')
        if span is None:
            continue
        try:
            if int(span) > 1:
                return True
        except (TypeError, ValueError):
            # A malformed colspan ("" or "abc") is treated as a normal,
            # non-spanning cell instead of aborting the whole scrape.
            continue

    return False

Now, by putting it all together, we can finally scrape the data:

# Collect, for every results table, the data rows that sit between the
# 'Two-party-preferred' header row and the next header row.
df = []
titles = content.select('h3')
# Headings and tables are paired purely by position. zip() stops at the
# shorter list, so a page with fewer headings than tables no longer raises
# IndexError.
for h3, table in zip(titles, tables):
    spans = h3.select('span')
    # Use the first nested <span> when present; otherwise fall back to the
    # heading's own text so a span-less heading does not raise IndexError.
    title = spans[0].decode_contents() if spans else h3.get_text().strip()

    in_tpp_section = False
    for row in table.select('tr'):
        if not in_tpp_section:
            # Skip rows until the 'Two-party-preferred' header row is found.
            if isRowTitle(row) and isRowTPP(row):
                in_tpp_section = True
            continue
        if isRowTitle(row):
            # The next header row marks the end of the section.
            break
        if rowHasColspan(row):
            continue
        cells = row.find_all('td')
        if not cells:
            # A row without any <td> has no leading cell to replace.
            continue
        # The row's first cell is dropped and the heading title takes its place.
        data = [title] + [d.text.rstrip() for d in cells[1:]]
        df.append(data)

Here's the entire script:

from bs4 import BeautifulSoup
import pandas as pd
import requests

url = 'https://en.wikipedia.org/wiki/Results_of_the_2019_New_South_Wales_state_election_(Legislative_Assembly)'
# Bound the request so a stalled connection cannot hang the script, and fail
# fast on an HTTP error instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.content
soup = BeautifulSoup(html, 'html.parser')

content = soup.find('div', {'id': 'content'})
if content is None:
    # Everything below dereferences `content`; report the real cause here
    # rather than an AttributeError on None further down.
    raise RuntimeError("Could not find <div id='content'> in the downloaded page")

# Remove navigation boxes and their style wrappers before selecting tables
# (presumably so tables nested inside them are not scraped -- confirm).
# Each class gets its own search-and-remove pass, in this order.
for css_class in ('navbox', 'navbox-styles'):
    undesired = content.find_all('div', {'class': css_class})
    for udiv in undesired:
        udiv.decompose()

tables = content.select('table')
tables.pop(0) # Remove Top-Right table

def isRowTPP(tr):
    """Return True if the row's header cell links to 'Two-party-preferred'.

    Args:
        tr: A table-row element exposing ``find('th')``; the header cell
            must expose ``find('a')`` and the link ``decode_contents()``
            (e.g. BeautifulSoup Tags).

    Returns:
        bool: True only when the row has a <th> containing an <a> whose
        inner markup is exactly ``'Two-party-preferred'``. Rows with no
        <th>, or a <th> with no link, yield False.
    """
    th = tr.find('th')
    if th is None:
        # Plain data rows have no header cell; answer False rather than
        # failing with AttributeError on None.
        return False

    a = th.find('a')
    return a is not None and a.decode_contents() == 'Two-party-preferred'

def isRowTitle(tr):
    """Look up a <th> inside the row; the result is truthy for header rows.

    Args:
        tr: A table-row element exposing ``find('th')``.

    Returns:
        Whatever ``tr.find('th')`` returns, unchanged: the matching header
        cell when one exists, otherwise None. Callers use it as a truth value.
    """
    header_cell = tr.find('th')
    return header_cell

def rowHasColspan(tr):
    """Return True if any <td> in the row spans more than one column.

    Args:
        tr: A table-row element exposing ``select('td')``; every selected
            cell must expose ``get('colspan')`` (e.g. a BeautifulSoup Tag).

    Returns:
        bool: True when at least one cell carries a ``colspan`` whose
        integer value is greater than 1, otherwise False.
    """
    for cell in tr.select('td'):
        span = cell.get('colspan')
        if span is None:
            continue
        try:
            if int(span) > 1:
                return True
        except (TypeError, ValueError):
            # A malformed colspan ("" or "abc") is treated as a normal,
            # non-spanning cell instead of aborting the whole scrape.
            continue

    return False

# Collect, for every results table, the data rows that sit between the
# 'Two-party-preferred' header row and the next header row.
df = []
titles = content.select('h3')
# Headings and tables are paired purely by position. zip() stops at the
# shorter list, so a page with fewer headings than tables no longer raises
# IndexError.
for h3, table in zip(titles, tables):
    spans = h3.select('span')
    # Use the first nested <span> when present; otherwise fall back to the
    # heading's own text so a span-less heading does not raise IndexError.
    title = spans[0].decode_contents() if spans else h3.get_text().strip()

    in_tpp_section = False
    for row in table.select('tr'):
        if not in_tpp_section:
            # Skip rows until the 'Two-party-preferred' header row is found.
            if isRowTitle(row) and isRowTPP(row):
                in_tpp_section = True
            continue
        if isRowTitle(row):
            # The next header row marks the end of the section.
            break
        if rowHasColspan(row):
            continue
        cells = row.find_all('td')
        if not cells:
            # A row without any <td> has no leading cell to replace.
            continue
        # The row's first cell is dropped and the heading title takes its place.
        data = [title] + [d.text.rstrip() for d in cells[1:]]
        df.append(data)

# The first column (named ' ') holds the heading title inserted above.
df = pd.DataFrame(df, columns=[' ','Party','Candidate','Votes','Pct','Pct_ch'])
print(df)
  • Related