Home > front end >  Scraping difficult website - may be intentional
Scraping difficult website - may be intentional

Time:09-02

One of the data sources I have been trying to scrape has been proving elusive, which may be intentional on the data providers part as a way of pushing users onto their subscription service. This is the website below.

https://www.theice.com/marketdata/reports/255

It should link to NGX Fin FF,FP for AESO Flat (CA/MWh),Alberta and for Time Period: Current Settlement.

I can get it working for the same day, but once the day is over the scrape no longer works which has completely stumped me.

import datetime
import logging
import pandas as pd
import requests
import lxml
from datetime import date

import azure.functions as func


def main(mytimer: func.TimerRequest,volumeBlob: func.Out[bytes]):
    utc_timestamp = datetime.datetime.utcnow().replace(
        tzinfo=datetime.timezone.utc).isoformat()

if mytimer.past_due:
    logging.info('The timer is past due!')

logging.info('Python timer trigger function ran at %s', utc_timestamp)


cookies = {
    'JSESSIONID': '477F62CBEBC61CDFEEE9731813B1D538',
    'OptanonAlertBoxClosed': '2022-05-17T15:59:03.608Z',
    '_ga': 'GA1.2.1094311247.1652803144',
    'iceBanner': 'rcDisclaimer',
    '_gcl_au': '1.1.195669816.1660653521',
    '_gid': 'GA1.2.215733394.1661950999',
    'ICE_MARKETDATA': '1174809866.6426.0000',
    'OptanonConsent': 'isIABGlobal=false&datestamp=Thu Sep 01 2022 09:31:42 GMT-0600 (Mountain Daylight Time)&version=6.16.0&hosts=&consentId=76d26ec4-5ee0-408b-8380-9df9a3c10a8e&interactionCount=1&landingPath=NotLandingPage&groups=C0001:1,C0005:1,C0004:1,C0003:1,C0002:1&geolocation=;&AwaitingReconsent=false',
    'TS01fbe106': '013ade2eabdbfe5126fd8f2840815f288765427548e635980fd6984ed85b0b81b487f984c193676d29a538b9db33b64856b5418d25cd18f783ce2043dffa94a861829dc2fa',
}

headers = {
    'Accept': '*/*',
    'Accept-Language': 'en-US,en;q=0.9',
    'Connection': 'keep-alive',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    # Requests sorts cookies= alphabetically
    # 'Cookie': 'JSESSIONID=477F62CBEBC61CDFEEE9731813B1D538; OptanonAlertBoxClosed=2022-05-17T15:59:03.608Z; _ga=GA1.2.1094311247.1652803144; iceBanner=rcDisclaimer; _gcl_au=1.1.195669816.1660653521; _gid=GA1.2.215733394.1661950999; ICE_MARKETDATA=1174809866.6426.0000; OptanonConsent=isIABGlobal=false&datestamp=Thu Sep 01 2022 09:31:42 GMT-0600 (Mountain Daylight Time)&version=6.16.0&hosts=&consentId=76d26ec4-5ee0-408b-8380-9df9a3c10a8e&interactionCount=1&landingPath=NotLandingPage&groups=C0001:1,C0005:1,C0004:1,C0003:1,C0002:1&geolocation=;&AwaitingReconsent=false; TS01fbe106=013ade2eabdbfe5126fd8f2840815f288765427548e635980fd6984ed85b0b81b487f984c193676d29a538b9db33b64856b5418d25cd18f783ce2043dffa94a861829dc2fa',
    'Origin': 'https://www.theice.com',
    'Referer': 'https://www.theice.com/marketdata/reports/255',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
    'sec-ch-ua': '"Chromium";v="104", " Not A;Brand";v="99", "Google Chrome";v="104"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}

data = 'reportId=255&group=Power&selectedMarket=NGX Fin FF, FP for AESO Flat, (CA/MWh), Alberta&selectedTimePeriod=Current Settlement&rcMode=2'

response = requests.post('https://www.theice.com/marketdata/public-web/ngx/daily-settlement-price/report', cookies=cookies, headers=headers, data=data)
dfs = pd.read_html(response.text)
df = dfs[0]
df.drop(df.columns[[0]], axis=1, inplace=True)
df['BeginDate'] = pd.to_datetime(df['BeginDate'])
df['EndDate'] = pd.to_datetime(df['EndDate'])
df['days'] = df['EndDate'] - df['BeginDate']
df['days'] = df['days'].dt.days
df = df[(df['days'] > 1)]

df['BeginDate'] = df['BeginDate'].dt.date
df['EndDate'] = df['EndDate'].dt.date
df.set_index('BeginDate',inplace=True,drop=True)
pd.to_numeric(df["# Trades"],errors="ignore")
pd.to_numeric(df["TotalVolume"],errors="ignore")
pd.to_numeric(df["Open"],errors="ignore")
pd.to_numeric(df["High"],errors="ignore")
pd.to_numeric(df["Low"],errors="ignore")
pd.to_numeric(df["Settle"],errors="ignore")
pd.to_numeric(df["Net OI"],errors="ignore")
pd.to_numeric(df["days"],errors="ignore")
df['DateModified'] = date.today()
print(df)
volumeBlob.set(df.to_csv())

I have used a curl -> python to create the requests param and header body. Wondering if this is where I have gone wrong (and why?). https://curlconverter.com/python

Thanks in advance!

CodePudding user response:

Using pandas Dataframe and API, getting the following ResultSet

import requests
import pandas as pd
api_url = 'https://www.theice.com/marketdata/api/reports/all'
headers= {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36",
    }

req=requests.get(api_url,headers=headers).json()

df = pd.DataFrame(req)
print(df)

Output:

id                                               name  ... active recaptchaRequired
0    101                            Historical Daily Volume  ...   True              True
1    102                            Daily Settlement Prices  ...   True              True
2    104  Deliveries - Gasoil Delivery (LS Gasoil from F...  ...   True              True
3    284  Midland WTI American Gulf Coast Futures Settle...  ...   True              True
4    114                Preliminary Open Interest - Futures  ...   True              True
..   ...                                                ...  ...    ...               ...
125   96                  Historical Daily Volume - Futures  ...   True              True
126   97                  Historical Daily Volume - Options  ...   True              True
127   98                                Daily Volume and OI  ...   True              True
128   99                            Historical Daily Volume  ...   True              True
129  100                                Daily Volume and OI  ...   True              True

[130 rows x 7 columns]

CodePudding user response:

To read the table to Panda's dataframe you can use next example:

import requests
import pandas as pd

url = "https://www.theice.com/marketdata/public-web/ngx/daily-settlement-price/report"
data = {
    "reportId": "255",
    "group": "Power",
    "selectedMarket": "NGX Fin FF, FP for AESO Flat, (CA/MWh), Alberta",
    "selectedTimePeriod": "Current Settlement",
    "rcMode": "2",
}

df = pd.read_html(requests.post(url, data=data).text)[0]
print(df)

Prints:

                                              Market   BeginDate     EndDate  # Trades  TotalVolume   Open   High    Low    Settle   Net OI
0    NGX Fin FF, FP for AESO Flat, (CA/MWh), Alberta  2022-08-01  2022-08-31         0            0    NaN    NaN    NaN  254.1346   951576
1    NGX Fin FF, FP for AESO Flat, (CA/MWh), Alberta  2022-08-30  2022-08-30         0            0    NaN    NaN    NaN  325.0000        0
2    NGX Fin FF, FP for AESO Flat, (CA/MWh), Alberta  2022-08-31  2022-08-31         0            0    NaN    NaN    NaN  450.0000        0
3    NGX Fin FF, FP for AESO Flat, (CA/MWh), Alberta  2022-09-01  2022-09-01         0            0    NaN    NaN    NaN  250.0000        0


...

  • Related