I'm trying to scrape the next day's forecast for time, wind speed and wind direction from Weather Underground. I adapted the code in this tutorial and my MWE is
import numpy as np
import pandas as pd
from datetime import datetime
from datetime import date, timedelta
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service as ChromeService
# define future date
start_date = date.today() pd.Timedelta(days=1)
# get data for Sydney
page = 'https://www.wunderground.com/hourly/au/sydney/date/{}-{}-{}'
df = pd.DataFrame()
options = webdriver.ChromeOptions()
options.add_argument('headless')
options.set_capability("loggingPrefs", {'performance': 'ALL'})
service = ChromeService(executable_path='chromedriver.exe')
driver = webdriver.Chrome(service=service, options=options)
classlist = ["mat-cell cdk-cell cdk-column-timeHour mat-column-timeHour ng-star-inserted",
"mat-cell cdk-cell cdk-column-wind mat-column-wind ng-star-inserted",
"mat-cell cdk-cell cdk-column-wind mat-column-wind ng-star-inserted",]
name = ['time', 'windspeed_mph', 'winddirection']
print('gathering data from: ', start_date)
formatted_lookup_URL = page.format(start_date.year,
start_date.month,
start_date.day)
driver.get(formatted_lookup_URL)
rows = WebDriverWait(driver, 20).until( \
EC.visibility_of_all_elements_located((By.XPATH, \
'//td[@]')))
for row in rows:
time = row.find_element(By.XPATH,'.//span[@]').text
# append new row to table
df = df.append(pd.DataFrame({"Day":[str(start_date)],"time":[time],}),
ignore_index = True)
del classlist[0]
for ii in range(len(classlist)):
rows = WebDriverWait(driver, 20).until( \
EC.visibility_of_all_elements_located((By.XPATH, \
'//td[@]')))
for row in rows:
if name[ii]=='winddirection':
data = row.find_element(By.XPATH,
'.//span[@]').text
print(data)
else:
data = row.find_element(By.XPATH,
'.//span[@]').text
# append new row to table
df = df.append(pd.DataFrame({name[ii]:[data]}), ignore_index=True)
driver.quit()
# remove NaN
df = df.apply(lambda x: pd.Series(x.dropna().values))
print(df.head())
The final dataframe df
contains the time and wind speed, but not the wind direction. I suspect it's because of the line data = row.find_element(By.XPATH, './/span[@]').text
but I'm not sure how to fix it.
CodePudding user response:
Seems that the classlist
and name
have different lengths after line del classlist[0]
. Fix it by adding this line after deleting first element of classlist:
del name[0]