Home > Back-end >  Scraping table with Beautifulsoup and output to compare with value
Scraping table with Beautifulsoup and output to compare with value

Time:10-31

I am trying to scrape a webpage with hourly energy prices. I want to use the data for home automation: if the hourly price <= baseload price, certain times should turn on via MQTT. I managed to get the baseload price and the hourly prices from their column. The output from the column seems not to be in one list but in 24 separate lists. Is that correct? How can I fix this so that the hourly prices can be compared with the baseload price?

import datetime
import pytz
import requests  
from bs4 import BeautifulSoup as bs

# Current time in Amsterdam, derived from a timezone-aware UTC timestamp
# (datetime.utcnow() returns a naive value and is deprecated).
today_utc = datetime.datetime.now(tz=pytz.utc)
today = today_utc.astimezone(pytz.timezone("Europe/Amsterdam"))
# %Y gives a four-digit year (e.g. 2022-10-31); %y would emit only "22".
text_today = today.strftime("%Y-%m-%d")
print(today)
print(text_today)

yesterday = datetime.datetime.now(tz=pytz.timezone("Europe/Amsterdam")) - datetime.timedelta(1)
text_yesterday = yesterday.strftime("%Y-%m-%d")
print(yesterday)
print(text_yesterday)

url_part1 = 'https://www.epexspot.com/en/market-data?market_area=NL&trading_date='
url_part2 = '&delivery_date='
url_part3 = '&underlying_year=&modality=Auction&sub_modality=DayAhead&technology=&product=60&data_mode=table&period=&production_period='
# Trading date is yesterday, delivery date is today.
url_text = url_part1 + text_yesterday + url_part2 + text_today + url_part3
print(url_text)

html_text = requests.get(url_text).text
#print(html_text)

soup = bs(html_text,'lxml')
#print(soup.prettify())

# Print the text of the first <span> inside every <div class="flex day-1">.
baseload = soup.find_all('div', class_='flex day-1')
for baseload_price in baseload:
    baseload_price = baseload_price.find('span').text.replace(' ', '')
    print(baseload_price)

table = soup.find_all('tr',{'class':"child"})
#print(table)
# Each iteration builds and prints a separate list for one table row,
# taking the cells from index 3 onwards.
for columns in table:
    column3 = columns.find_all('td')[3:]
    #print(columns)
    column3_text = [td.text.strip() for td in column3]
    print(column3_text)

CodePudding user response:

You simply need to use join:

# Merge the stripped cell texts into one string, with no separator between them.
column3_text = "".join(cell.text.strip() for cell in column3)

CodePudding user response:

In the loop `for columns in table`, you are creating a new `column3_text` list on every iteration. If you intend for `column3_text` to be a single list covering the next 24 hours, you can replace this for loop with this:

# One flat list: the stripped text of the fourth <td> (index 3) of every row.
column3_text = [row.find_all("td")[3].text.strip() for row in table]

Additionally, if you are going to be comparing the baseload price to the hourly prices, you'll want to convert the strings to floats or Decimals. :)

CodePudding user response:

If you want to compare the values, use pandas.

Here's how:

import datetime
import urllib.parse

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Browser-like User-Agent sent with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:104.0) Gecko/20100101 Firefox/104.0",
}

today = datetime.datetime.today().strftime("%Y-%m-%d")
yesterday = (
        datetime.datetime.today() - datetime.timedelta(days=1)
).strftime("%Y-%m-%d")

url = "https://www.epexspot.com/en/market-data?"

# Query parameters: trading date is yesterday, delivery date is today.
data = {
    "market_area": "NL",
    "trading_date": yesterday,
    "delivery_date": today,
    "underlying_year": "",
    "modality": "Auction",
    "sub_modality": "DayAhead",
    "technology": "",
    "product": "60",
    "data_mode": "table",
    "period": "",
    "production_period": "",
}

query_url = f"{url}{urllib.parse.urlencode(data)}"

with requests.Session() as s:
    s.headers.update(headers)
    response = s.get(query_url).text
    # Baseload price: text of the first <span> child of the ".day-1" element.
    baseload = (
        BeautifulSoup(response, "html.parser")
        .select_one(".day-1 > span:nth-child(1)")
        .text
    )
    print(f"Baselaod: {baseload}")

    # Stack every HTML table on the page into one frame with positional column labels.
    df = pd.concat(pd.read_html(response, flavor="lxml"), ignore_index=True)
    df.columns = range(df.shape[1])
    df = df.drop(df.columns[[4, 5, 6, 7]], axis=1)
    # Column 3 (named "price" below) is compared with the baseload value.
    df['is_higher'] = df[[3]].apply(lambda x: (x >= float(baseload)), axis=1)
    df['price_diff'] = df[[3]].apply(lambda x: (x - float(baseload)), axis=1)
    df = df.set_axis(
        [
            "buy_volume",
            "sell_volume",
            "volume",
            "price",
            "is_higher",
            "price_diff",
        ],
        axis=1,
        copy=False,
    )
    # Label the 24 rows with their hour range; only the start hour is zero-padded.
    df.insert(
        0,
        "hours",
        [
            f"0{value}:00 - {value + 1}:00" if value < 10
            else f"{value}:00 - {value + 1}:00"
            for value in range(0, 24)
        ],
    )
    print(df)

Output:

Baselaod: 144.32

            hours  buy_volume  sell_volume  ...   price  is_higher  price_diff
0    00:00 - 1:00      2052.2       3608.7  ...  124.47      False      -19.85
1    01:00 - 2:00      2467.8       3408.9  ...  119.09      False      -25.23
2    02:00 - 3:00      2536.8       3220.5  ...  116.32      False      -28.00
3    03:00 - 4:00      2552.0       3206.5  ...  114.60      False      -29.72
4    04:00 - 5:00      2524.4       3010.0  ...  115.07      False      -29.25
5    05:00 - 6:00      2542.4       3342.7  ...  123.54      False      -20.78
6    06:00 - 7:00      2891.2       3872.2  ...  145.42       True        1.10
7    07:00 - 8:00      3413.2       3811.0  ...  166.40       True       22.08
8    08:00 - 9:00      3399.4       3566.0  ...  168.00       True       23.68
9   09:00 - 10:00      2919.3       3159.4  ...  153.30       True        8.98
10  10:00 - 11:00      2680.2       3611.5  ...  143.35      False       -0.97
11  11:00 - 12:00      2646.8       3722.3  ...  141.95      False       -2.37
12  12:00 - 13:00      2606.4       3723.3  ...  141.96      False       -2.36
13  13:00 - 14:00      2559.7       3232.3  ...  145.96       True        1.64
14  14:00 - 15:00      2544.9       3261.2  ...  155.00       True       10.68
15  15:00 - 16:00      2661.7       3428.0  ...  169.15       True       24.83
16  16:00 - 17:00      3072.2       3529.4  ...  173.36       True       29.04
17  17:00 - 18:00      3593.7       3091.4  ...  192.00       True       47.68
18  18:00 - 19:00      3169.0       3255.4  ...  182.86       True       38.54
19  19:00 - 20:00      2710.1       3630.3  ...  167.96       True       23.64
20  20:00 - 21:00      2896.3       3728.8  ...  147.17       True        2.85
21  21:00 - 22:00      3160.3       3639.2  ...  136.78      False       -7.54
22  22:00 - 23:00      3506.2       3196.3  ...  119.90      False      -24.42
23  23:00 - 24:00      3343.8       3414.1  ...  100.00      False      -44.32
  • Related