Scraping wind data from Weather Underground using selenium in Python-CodePudding

I'm trying to scrape the next day's forecast for time, wind speed and wind direction from Weather Underground. I adapted the code in this tutorial, and my minimal working example (MWE) is:

import numpy as np
import pandas as pd
from datetime import datetime
from datetime import date, timedelta
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service as ChromeService

# Scrape tomorrow's hourly forecast for Sydney from Weather Underground.
# The hour column is read first, then the wind column(s); every scraped cell
# becomes a one-row frame, and the frames are combined into `df` at the end.

# define future date (tomorrow)
start_date = date.today() + pd.Timedelta(days=1)

# get data for Sydney; the placeholders are filled with year, month, day
page = 'https://www.wunderground.com/hourly/au/sydney/date/{}-{}-{}'

df = pd.DataFrame()

options = webdriver.ChromeOptions()
options.add_argument('headless')
options.set_capability("loggingPrefs", {'performance': 'ALL'})
service = ChromeService(executable_path='chromedriver.exe')
driver = webdriver.Chrome(service=service, options=options)

# full class attribute of the <td> cells to read, one entry per item in `name`
classlist = ["mat-cell cdk-cell cdk-column-timeHour mat-column-timeHour ng-star-inserted",
             "mat-cell cdk-cell cdk-column-wind mat-column-wind ng-star-inserted",
             "mat-cell cdk-cell cdk-column-wind mat-column-wind ng-star-inserted",]
name = ['time', 'windspeed_mph', 'winddirection']

# NOTE(review): the attribute predicates of the <span> lookups were lost from
# the original listing (they read './/span[@]', which is not valid XPath).
# The class names below are a best-guess reconstruction -- confirm them
# against the live page markup before relying on the results.
time_span_xpath = './/span[@class="ng-star-inserted"]'
direction_span_xpath = './/span[@class="wu-suffix ng-star-inserted"]'
value_span_xpath = './/span[@class="wu-value wu-value-to"]'

# one single-row frame per scraped cell; combined with one concat at the end
# (DataFrame.append no longer exists in pandas 2.x)
frames = []

print('gathering data from: ', start_date)

formatted_lookup_URL = page.format(start_date.year,
                                   start_date.month,
                                   start_date.day)

try:
    driver.get(formatted_lookup_URL)

    # wait (up to 20 s) for the hour cells, matched on their exact class string
    rows = WebDriverWait(driver, 20).until(
        EC.visibility_of_all_elements_located(
            (By.XPATH, '//td[@class="' + classlist[0] + '"]')))

    for row in rows:

        time = row.find_element(By.XPATH, time_span_xpath).text

        # queue new row for the table
        frames.append(pd.DataFrame({"Day": [str(start_date)], "time": [time]}))

    del classlist[0]
    # NOTE: `name` still has three entries here while `classlist` has two, so
    # in the loop below name[0] ('time') is paired with classlist[0] and
    # name[1] ('windspeed_mph') with classlist[1]; the 'winddirection' branch
    # is never taken as written. Left exactly as posted so the discussion
    # that follows this listing still applies.

    for ii in range(len(classlist)):

        rows = WebDriverWait(driver, 20).until(
            EC.visibility_of_all_elements_located(
                (By.XPATH, '//td[@class="' + classlist[ii] + '"]')))

        for row in rows:

            if name[ii] == 'winddirection':
                data = row.find_element(By.XPATH, direction_span_xpath).text
                print(data)

            else:
                data = row.find_element(By.XPATH, value_span_xpath).text

            # queue new row for the table
            frames.append(pd.DataFrame({name[ii]: [data]}))
finally:
    # always shut the browser down, even when a wait times out
    driver.quit()

# stack the single-row frames; columns missing from a row are filled with NaN
df = pd.concat(frames, ignore_index=True)

# remove NaN: compact each column so its values start at row 0
df = df.apply(lambda x: pd.Series(x.dropna().values))
print(df.head())

The final dataframe `df` contains the time and wind speed, but not the wind direction. I suspect the cause is the `row.find_element(By.XPATH, ...)` call in the `winddirection` branch, which is meant to read the wind-direction `<span>`, but I'm not sure how to fix it.

CodePudding user response：

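It seems that `classlist` and `name` have different lengths after the line `del classlist[0]`: `classlist` is left with two entries while `name` still has three, so inside the loop `name[ii]` is only ever 'time' or 'windspeed_mph' and the 'winddirection' branch never runs. Fix it by adding this line right after deleting the first element of `classlist`: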

# drop 'time' as well, so that name[ii] lines up with classlist[ii] again
del name[0]