One of the data sources I have been trying to scrape has been proving elusive, which may be intentional on the data providers part as a way of pushing users onto their subscription service. This is the website below.
https://www.theice.com/marketdata/reports/255
It should link to NGX Fin FF,FP for AESO Flat (CA/MWh),Alberta and for Time Period: Current Settlement.
I can get it working for the same day, but once the day is over the scrape no longer works which has completely stumped me.
import datetime
import logging
import pandas as pd
import requests
import lxml
from datetime import date
import azure.functions as func
def main(mytimer: func.TimerRequest,volumeBlob: func.Out[bytes]):
utc_timestamp = datetime.datetime.utcnow().replace(
tzinfo=datetime.timezone.utc).isoformat()
if mytimer.past_due:
logging.info('The timer is past due!')
logging.info('Python timer trigger function ran at %s', utc_timestamp)
cookies = {
'JSESSIONID': '477F62CBEBC61CDFEEE9731813B1D538',
'OptanonAlertBoxClosed': '2022-05-17T15:59:03.608Z',
'_ga': 'GA1.2.1094311247.1652803144',
'iceBanner': 'rcDisclaimer',
'_gcl_au': '1.1.195669816.1660653521',
'_gid': 'GA1.2.215733394.1661950999',
'ICE_MARKETDATA': '1174809866.6426.0000',
'OptanonConsent': 'isIABGlobal=false&datestamp=Thu Sep 01 2022 09:31:42 GMT-0600 (Mountain Daylight Time)&version=6.16.0&hosts=&consentId=76d26ec4-5ee0-408b-8380-9df9a3c10a8e&interactionCount=1&landingPath=NotLandingPage&groups=C0001:1,C0005:1,C0004:1,C0003:1,C0002:1&geolocation=;&AwaitingReconsent=false',
'TS01fbe106': '013ade2eabdbfe5126fd8f2840815f288765427548e635980fd6984ed85b0b81b487f984c193676d29a538b9db33b64856b5418d25cd18f783ce2043dffa94a861829dc2fa',
}
headers = {
'Accept': '*/*',
'Accept-Language': 'en-US,en;q=0.9',
'Connection': 'keep-alive',
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
# Requests sorts cookies= alphabetically
# 'Cookie': 'JSESSIONID=477F62CBEBC61CDFEEE9731813B1D538; OptanonAlertBoxClosed=2022-05-17T15:59:03.608Z; _ga=GA1.2.1094311247.1652803144; iceBanner=rcDisclaimer; _gcl_au=1.1.195669816.1660653521; _gid=GA1.2.215733394.1661950999; ICE_MARKETDATA=1174809866.6426.0000; OptanonConsent=isIABGlobal=false&datestamp=Thu Sep 01 2022 09:31:42 GMT-0600 (Mountain Daylight Time)&version=6.16.0&hosts=&consentId=76d26ec4-5ee0-408b-8380-9df9a3c10a8e&interactionCount=1&landingPath=NotLandingPage&groups=C0001:1,C0005:1,C0004:1,C0003:1,C0002:1&geolocation=;&AwaitingReconsent=false; TS01fbe106=013ade2eabdbfe5126fd8f2840815f288765427548e635980fd6984ed85b0b81b487f984c193676d29a538b9db33b64856b5418d25cd18f783ce2043dffa94a861829dc2fa',
'Origin': 'https://www.theice.com',
'Referer': 'https://www.theice.com/marketdata/reports/255',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-origin',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36',
'X-Requested-With': 'XMLHttpRequest',
'sec-ch-ua': '"Chromium";v="104", " Not A;Brand";v="99", "Google Chrome";v="104"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
}
data = 'reportId=255&group=Power&selectedMarket=NGX Fin FF, FP for AESO Flat, (CA/MWh), Alberta&selectedTimePeriod=Current Settlement&rcMode=2'
response = requests.post('https://www.theice.com/marketdata/public-web/ngx/daily-settlement-price/report', cookies=cookies, headers=headers, data=data)
dfs = pd.read_html(response.text)
df = dfs[0]
df.drop(df.columns[[0]], axis=1, inplace=True)
df['BeginDate'] = pd.to_datetime(df['BeginDate'])
df['EndDate'] = pd.to_datetime(df['EndDate'])
df['days'] = df['EndDate'] - df['BeginDate']
df['days'] = df['days'].dt.days
df = df[(df['days'] > 1)]
df['BeginDate'] = df['BeginDate'].dt.date
df['EndDate'] = df['EndDate'].dt.date
df.set_index('BeginDate',inplace=True,drop=True)
pd.to_numeric(df["# Trades"],errors="ignore")
pd.to_numeric(df["TotalVolume"],errors="ignore")
pd.to_numeric(df["Open"],errors="ignore")
pd.to_numeric(df["High"],errors="ignore")
pd.to_numeric(df["Low"],errors="ignore")
pd.to_numeric(df["Settle"],errors="ignore")
pd.to_numeric(df["Net OI"],errors="ignore")
pd.to_numeric(df["days"],errors="ignore")
df['DateModified'] = date.today()
print(df)
volumeBlob.set(df.to_csv())
I have used a curl -> python to create the requests param and header body. Wondering if this is where I have gone wrong (and why?). https://curlconverter.com/python
Thanks in advance!
CodePudding user response:
Using pandas Dataframe and API, getting the following ResultSet
import requests
import pandas as pd
api_url = 'https://www.theice.com/marketdata/api/reports/all'
headers= {
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36",
}
req=requests.get(api_url,headers=headers).json()
df = pd.DataFrame(req)
print(df)
Output:
id name ... active recaptchaRequired
0 101 Historical Daily Volume ... True True
1 102 Daily Settlement Prices ... True True
2 104 Deliveries - Gasoil Delivery (LS Gasoil from F... ... True True
3 284 Midland WTI American Gulf Coast Futures Settle... ... True True
4 114 Preliminary Open Interest - Futures ... True True
.. ... ... ... ... ...
125 96 Historical Daily Volume - Futures ... True True
126 97 Historical Daily Volume - Options ... True True
127 98 Daily Volume and OI ... True True
128 99 Historical Daily Volume ... True True
129 100 Daily Volume and OI ... True True
[130 rows x 7 columns]
CodePudding user response:
To read the table to Panda's dataframe you can use next example:
import requests
import pandas as pd
url = "https://www.theice.com/marketdata/public-web/ngx/daily-settlement-price/report"
data = {
"reportId": "255",
"group": "Power",
"selectedMarket": "NGX Fin FF, FP for AESO Flat, (CA/MWh), Alberta",
"selectedTimePeriod": "Current Settlement",
"rcMode": "2",
}
df = pd.read_html(requests.post(url, data=data).text)[0]
print(df)
Prints:
Market BeginDate EndDate # Trades TotalVolume Open High Low Settle Net OI
0 NGX Fin FF, FP for AESO Flat, (CA/MWh), Alberta 2022-08-01 2022-08-31 0 0 NaN NaN NaN 254.1346 951576
1 NGX Fin FF, FP for AESO Flat, (CA/MWh), Alberta 2022-08-30 2022-08-30 0 0 NaN NaN NaN 325.0000 0
2 NGX Fin FF, FP for AESO Flat, (CA/MWh), Alberta 2022-08-31 2022-08-31 0 0 NaN NaN NaN 450.0000 0
3 NGX Fin FF, FP for AESO Flat, (CA/MWh), Alberta 2022-09-01 2022-09-01 0 0 NaN NaN NaN 250.0000 0
...