I am trying to scrape a webpage with hourly energy prices. I want to use the data for home automation: if the hourly price is <= the baseload price, certain timers should be turned on via MQTT. I managed to get the baseload price and the hourly prices from their column. However, the output from the column does not seem to be one list but 24 separate lists. Is that correct? How can I fix this so that each hourly price can be compared with the baseload price?
import datetime
import pytz
import requests
from bs4 import BeautifulSoup as bs
# Current time converted to the Europe/Amsterdam timezone.
today_utc = pytz.utc.localize(datetime.datetime.utcnow())
today = today_utc.astimezone(pytz.timezone("Europe/Amsterdam"))
# NOTE(review): "%y" produces a two-digit year (e.g. "22-09-14"); confirm the
# site accepts that form, otherwise switch to "%Y" for a four-digit year.
text_today = today.strftime("%y-%m-%d")
print(today)
print(text_today)

# Same moment one day earlier, also in Amsterdam time.
yesterday = datetime.datetime.now(tz=pytz.timezone("Europe/Amsterdam")) - datetime.timedelta(1)
text_yesterday = yesterday.strftime("%y-%m-%d")
print(yesterday)
print(text_yesterday)

# Build the query URL: trading date is yesterday, delivery date is today.
url_part1 = 'https://www.epexspot.com/en/market-data?market_area=NL&trading_date='
url_part2 = '&delivery_date='
url_part3 = '&underlying_year=&modality=Auction&sub_modality=DayAhead&technology=&product=60&data_mode=table&period=&production_period='
url_text = url_part1 + text_yesterday + url_part2 + text_today + url_part3
print(url_text)

html_text = requests.get(url_text).text
#print(html_text)
soup = bs(html_text, 'lxml')
#print(soup.prettify())

# Print the text of the first <span> in every 'flex day-1' div, spaces removed.
baseload = soup.find_all('div', class_='flex day-1')
for baseload_price in baseload:
    baseload_price = baseload_price.find('span').text.replace(' ', '')
    print(baseload_price)

table = soup.find_all('tr', {'class': "child"})
#print(table)

# Each iteration handles ONE table row, so a separate list is built and
# printed per row -- this is why the output shows many small lists.
for columns in table:
    column3 = columns.find_all('td')[3:]
    #print(columns)
    column3_text = [td.text.strip() for td in column3]
    print(column3_text)
CodePudding user response:
You simply need to use join:
# Concatenate the stripped text of every cell in column3 into a single string.
column3_text = "".join(cell.text.strip() for cell in column3)
CodePudding user response:
In the for loop `for columns in table`, you are creating a new list `column3_text` on every iteration. If you intend for `column3_text` to be a list of the next 24 hours, you can replace this for loop with this:
# One flat list: the stripped text of the fourth <td> of every row in table.
column3_text = [
    row.find_all("td")[3].text.strip()
    for row in table
]
Additionally, if you are going to be comparing the baseload price to the hourly prices, you'll want to convert the strings to floats or Decimals. :)
CodePudding user response:
If you want to compare the values, use `pandas`.
Here's how:
import datetime
import urllib.parse
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Browser-like User-Agent sent with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:104.0) Gecko/20100101 Firefox/104.0",
}

# Trading date is yesterday, delivery date is today, both as YYYY-MM-DD.
today = datetime.datetime.today().strftime("%Y-%m-%d")
yesterday = (
    datetime.datetime.today() - datetime.timedelta(days=1)
).strftime("%Y-%m-%d")

url = "https://www.epexspot.com/en/market-data?"
data = {
    "market_area": "NL",
    "trading_date": yesterday,
    "delivery_date": today,
    "underlying_year": "",
    "modality": "Auction",
    "sub_modality": "DayAhead",
    "technology": "",
    "product": "60",
    "data_mode": "table",
    "period": "",
    "production_period": "",
}
# urlencode takes care of escaping the query parameters.
query_url = f"{url}{urllib.parse.urlencode(data)}"

with requests.Session() as s:
    s.headers.update(headers)
    response = s.get(query_url).text

# The baseload price is the text of the first <span> child of the ".day-1" element.
baseload = (
    BeautifulSoup(response, "html.parser")
    .select_one(".day-1 > span:nth-child(1)")
    .text
)
print(f"Baselaod: {baseload}")
# Convert once so both comparisons below use the same numeric value.
baseload_value = float(baseload)

# NOTE: pandas >= 2.1 deprecates passing literal HTML to read_html; wrap the
# text in io.StringIO there to avoid the FutureWarning.
df = pd.concat(pd.read_html(response, flavor="lxml"), ignore_index=True)
# Replace the scraped headers with positional integers 0..n-1.
df.columns = range(df.shape[1])
# Keep only the first four columns; positions 4-7 are dropped.
df = df.drop(df.columns[[4, 5, 6, 7]], axis=1)
# Column 3 holds the hourly price; compare it against the baseload.
df['is_higher'] = df[3] >= baseload_value
df['price_diff'] = df[3] - baseload_value
df = df.set_axis(
    [
        "buy_volume",
        "sell_volume",
        "volume",
        "price",
        "is_higher",
        "price_diff",
    ],
    axis=1,
    copy=False,
)
# Label each row with its delivery hour; only the start hour is zero-padded,
# e.g. "00:00 - 1:00", "09:00 - 10:00", "23:00 - 24:00".
df.insert(
    0,
    "hours",
    [f"{value:02d}:00 - {value + 1}:00" for value in range(0, 24)],
)
print(df)
Output:
Baselaod: 144.32
hours buy_volume sell_volume ... price is_higher price_diff
0 00:00 - 1:00 2052.2 3608.7 ... 124.47 False -19.85
1 01:00 - 2:00 2467.8 3408.9 ... 119.09 False -25.23
2 02:00 - 3:00 2536.8 3220.5 ... 116.32 False -28.00
3 03:00 - 4:00 2552.0 3206.5 ... 114.60 False -29.72
4 04:00 - 5:00 2524.4 3010.0 ... 115.07 False -29.25
5 05:00 - 6:00 2542.4 3342.7 ... 123.54 False -20.78
6 06:00 - 7:00 2891.2 3872.2 ... 145.42 True 1.10
7 07:00 - 8:00 3413.2 3811.0 ... 166.40 True 22.08
8 08:00 - 9:00 3399.4 3566.0 ... 168.00 True 23.68
9 09:00 - 10:00 2919.3 3159.4 ... 153.30 True 8.98
10 10:00 - 11:00 2680.2 3611.5 ... 143.35 False -0.97
11 11:00 - 12:00 2646.8 3722.3 ... 141.95 False -2.37
12 12:00 - 13:00 2606.4 3723.3 ... 141.96 False -2.36
13 13:00 - 14:00 2559.7 3232.3 ... 145.96 True 1.64
14 14:00 - 15:00 2544.9 3261.2 ... 155.00 True 10.68
15 15:00 - 16:00 2661.7 3428.0 ... 169.15 True 24.83
16 16:00 - 17:00 3072.2 3529.4 ... 173.36 True 29.04
17 17:00 - 18:00 3593.7 3091.4 ... 192.00 True 47.68
18 18:00 - 19:00 3169.0 3255.4 ... 182.86 True 38.54
19 19:00 - 20:00 2710.1 3630.3 ... 167.96 True 23.64
20 20:00 - 21:00 2896.3 3728.8 ... 147.17 True 2.85
21 21:00 - 22:00 3160.3 3639.2 ... 136.78 False -7.54
22 22:00 - 23:00 3506.2 3196.3 ... 119.90 False -24.42
23 23:00 - 24:00 3343.8 3414.1 ... 100.00 False -44.32