How to webscrape from a selected tab with an embedded table on a website?

Time:12-02

I am trying to scrape data from https://www.onthesnow.com/alberta/lake-louise/historical-snowfall, but my script only returns the monthly totals, not the annual totals. On the webpage you have to select the 'Annual' tab to show the annual totals. I can successfully scrape the monthly totals, but I have been unable to retrieve the annual totals table.

import requests
from bs4 import BeautifulSoup

def historic_snowfall():
    # Resort naming convention on OnTheSnow.com
    # Revelstoke, BC: british-columbia, revelstoke-mountain
    # Whistler, BC: british-columbia, whistler-blackcomb
    # Lake Louise, AB: alberta, lake-louise
    # Big Sky, MT: montana, big-sky-resort
    # Snowbird, UT: utah, snowbird
    # Palisades, CA: california, squaw-valley-usa
    # Steamboat, CO: colorado, steamboat
    # Copper Mountain, CO: colorado, copper-mountain
    # Aspen, CO: colorado, aspen-snowmass
    # Jackson Hole, WY: wyoming, jackson-hole
    # Taos, NM: taos-ski-valley
    
    resort_table = ['alberta/lake-louise',
                    'montana/big-sky-resort',
                    'utah/snowbird',
                    'california/squaw-valley-usa',
                    'colorado/steamboat',
                    'colorado/copper-mountain',
                    'colorado/aspen-snowmass',
                    'wyoming/jackson-hole',
                    'new-mexico/taos-ski-valley'
                    ]

    resort = resort_table[1]

    hsnow_url = 'https://www.onthesnow.com/' + resort + '/historical-snowfall'
    page = requests.get(hsnow_url) 
    soup = BeautifulSoup(page.content, "html.parser")
    
 
    data = []
    table = soup.find('table')                      # Find table
    table_body = table.find('tbody')                # Find body of table

    rows = table_body.find_all('tr')                #find rows within table
    for row in rows:
        cols = row.find_all('td')                   # within rows pull column data
        cols = [ele.text.strip() for ele in cols]
        data.append([ele for ele in cols if ele]) # Get rid of empty values
        
    return data     #returns monthly averages not yearly results

CodePudding user response:

You can use the JSON data embedded inside the HTML page (the Next.js `__NEXT_DATA__` script tag) to get the annual info:

import json
import requests
import pandas as pd
from bs4 import BeautifulSoup


url = "https://www.onthesnow.com/alberta/lake-louise/historical-snowfall"

soup = BeautifulSoup(requests.get(url).content, "html.parser")

data = json.loads(soup.select_one("#__NEXT_DATA__").text)
df = pd.DataFrame(data["props"]["pageProps"]["snowfallInfoAnnual"])
print(df.to_markdown(index=False))

Prints:

| date       | totalSnow | snowDays | baseDepth | summitDepth | maxBaseDepth | biggestSnowfall |
|:-----------|----------:|---------:|----------:|------------:|-------------:|----------------:|
| 2012-01-01 |       467 |       80 |   25.2892 |     128.724 |          118 |              28 |
| 2013-01-01 |       756 |       89 |   111.013 |     123.061 |          207 |              40 |
| 2014-01-01 |       465 |       64 |   77.3352 |     105.181 |          158 |              34 |
| 2015-01-01 |       339 |       60 |   75.5234 |     95.4746 |          125 |              54 |
| 2016-01-01 |       544 |       95 |   99.4935 |      147.88 |          177 |              30 |
| 2017-01-01 |       737 |       89 |    114.79 |      153.75 |          192 |              41 |
| 2018-01-01 |       505 |       81 |   98.2466 |     124.705 |          158 |              71 |
| 2019-01-01 |       537 |       73 |   76.7257 |      84.805 |          188 |              36 |
| 2020-01-01 |       445 |       76 |   88.5694 |     110.969 |          183 |              38 |
| 2021-01-01 |       669 |       96 |    118.13 |     168.105 |          178 |              54 |
| 2022-01-01 |       111 |       14 |   39.9333 |     47.7667 |           52 |              24 |
| Average    |       507 |       75 |        85 |         118 |          158 |              41 |
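One caveat with this approach: if the site's layout changes, `soup.select_one("#__NEXT_DATA__")` returns `None` and the `.text` access raises a confusing `AttributeError`. A small stdlib-only sketch of the same lookup that fails loudly instead (the function name `extract_next_data` and the sample HTML are illustrative, not from the site):

```python
import json
import re

def extract_next_data(html: str) -> dict:
    # Locate the Next.js JSON "data island"; fail loudly if the page layout changes
    m = re.search(r'<script id="__NEXT_DATA__"[^>]*>(.*?)</script>', html, re.S)
    if m is None:
        raise ValueError("__NEXT_DATA__ script tag not found; page layout may have changed")
    return json.loads(m.group(1))

# Minimal fabricated page snippet, just to demonstrate the shape of the lookup
sample_html = (
    '<html><script id="__NEXT_DATA__" type="application/json">'
    '{"props":{"pageProps":{"snowfallInfoAnnual":'
    '[{"date":"2012-01-01","totalSnow":467,"snowDays":80}]}}}'
    '</script></html>'
)

annual = extract_next_data(sample_html)["props"]["pageProps"]["snowfallInfoAnnual"]
```

In the real script you would pass `requests.get(url).text` instead of `sample_html`.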

CodePudding user response:

I would leave this as a comment but I don't have enough reputation. You can get the data directly from the site's API:

Monthly: https://api.onthesnow.com/api/v2/resort/368/snowfall/monthly

Annual: https://api.onthesnow.com/api/v2/resort/368/snowfall/annual

Example data returned:

{"snowfallItems":[{"date":"2012-01-01","totalSnow":467,"snowDays":80,"baseDepth":25.2892,"summitDepth":128.72395779996586,"maxBaseDepth":118,"biggestSnowfall":28}]}

Note: Each measurement is in centimetres rather than inches, so you might have to convert it.
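A minimal sketch of that conversion, applied to the sample payload shown above (the key names come from that payload; `CM_PER_INCH`, `DEPTH_KEYS`, and `cm_to_inches` are just illustrative names, and which fields are depth measurements is an assumption):

```python
import json

CM_PER_INCH = 2.54  # 1 inch is exactly 2.54 cm
DEPTH_KEYS = ("totalSnow", "baseDepth", "summitDepth", "maxBaseDepth", "biggestSnowfall")

# The sample payload shown above, as returned by the annual endpoint
payload = json.loads(
    '{"snowfallItems":[{"date":"2012-01-01","totalSnow":467,"snowDays":80,'
    '"baseDepth":25.2892,"summitDepth":128.72395779996586,'
    '"maxBaseDepth":118,"biggestSnowfall":28}]}'
)

def cm_to_inches(value_cm):
    return round(value_cm / CM_PER_INCH, 1)

# Convert only the depth fields; leave dates and day counts untouched
converted = [
    {k: (cm_to_inches(v) if k in DEPTH_KEYS else v) for k, v in item.items()}
    for item in payload["snowfallItems"]
]
```

So 467 cm of total snow becomes roughly 183.9 inches, while `snowDays` stays a plain count.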

Edit: The other (better) answer basically does this, and also presents the result nicely in a table.

CodePudding user response:

If you use the find_all() function instead of find() when searching for tables, you can get the annual table as well. The code would then look like:

from bs4 import BeautifulSoup
import requests 

def historic_snowfall():
    # Resort naming convention on OnTheSnow.com
    # Revelstoke, BC: british-columbia, revelstoke-mountain
    # Whistler, BC: british-columbia, whistler-blackcomb
    # Lake Louise, AB: alberta, lake-louise
    # Big Sky, MT: montana, big-sky-resort
    # Snowbird, UT: utah, snowbird
    # Palisades, CA: california, squaw-valley-usa
    # Steamboat, CO: colorado, steamboat
    # Copper Mountain, CO: colorado, copper-mountain
    # Aspen, CO: colorado, aspen-snowmass
    # Jackson Hole, WY: wyoming, jackson-hole
    # Taos, NM: taos-ski-valley
    
    resort_table = ['alberta/lake-louise',
                    'montana/big-sky-resort',
                    'utah/snowbird',
                    'california/squaw-valley-usa',
                    'colorado/steamboat',
                    'colorado/copper-mountain',
                    'colorado/aspen-snowmass',
                    'wyoming/jackson-hole',
                    'new-mexico/taos-ski-valley'
                    ]

    resort = resort_table[1]

    hsnow_url = 'https://www.onthesnow.com/' + resort + '/historical-snowfall'
    page = requests.get(hsnow_url) 
    soup = BeautifulSoup(page.content, "html.parser")
    

    data = []
    data2 = []
    table = soup.find_all('table')              # Find all tables on the page

    table_body_1 = table[0].find('tbody')                # Find body of table
    rows_table_1 = table_body_1.find_all('tr')                #find rows within table
    for row in rows_table_1:
        cols = row.find_all('td')                   # within rows pull column data
        cols = [ele.text.strip() for ele in cols]
        data.append([ele for ele in cols if ele]) # Get rid of empty values

    table_body_2 = table[1].find('tbody')                # Find body of table
    rows_table_2 = table_body_2.find_all('tr')                #find rows within table
    for row in rows_table_2:
        cols = row.find_all('td')                   # within rows pull column data
        cols = [ele.text.strip() for ele in cols]
        data2.append([ele for ele in cols if ele]) # Get rid of empty values
        
    return data, data2     # returns both the monthly and the annual tables


print(historic_snowfall())