Scraping data from Morningstar using an API


I have a very specific issue for which I have not been able to find a solution. I recently began a project in which I monitor about 100 ETFs and mutual funds based on data acquired from Morningstar. The current solution works great, but I later found out that I need more data from another tab on the website. Specifically, I am trying to get the first table from the following page: https://www.morningstar.dk/dk/funds/snapshot/snapshot.aspx?id=F00000Z1MC&tab=1

Right now, I have the code below, which scrapes data from a table on the "Indhold" tab of the website and exports it to Excel. My question is therefore: how do I adjust the code to scrape data from another part of the website?

To briefly explain the code and reiterate: the code below scrapes data from another tab of the same pages. The long list of IDs identifies the individual mutual funds/ETFs, one per page. The setup works very well, so I am hoping to simply adjust it (if that is possible) to extract the table from the link above. I have very limited knowledge of the topic, so any help is much, much appreciated.

import requests
import re
import pandas as pd
from openpyxl import load_workbook

auth = 'https://www.morningstar.dk/Common/funds/snapshot/PortfolioSAL.aspx'

# Open the existing macro-enabled workbook and attach it to a pandas Excel writer (openpyxl engine).
path = r'/Users/karlemilthulstrup/Downloads/data2.xlsm'
book = load_workbook(path, read_only=False, keep_vba=True)
writer = pd.ExcelWriter(path, engine='openpyxl')
writer.book = book

ids = ['F00000VA2N','F0GBR064OO','F00000YKC2','F000015MVX','F0000020YA','0P00015YTR','0P00015YTT','F0GBR05V8D','F0GBR06XKI','F000013CKH','F00000MG6K','F000014G49',
'F00000WC0Z','F00000QSD2','F000016551','F0000146QH','F0000146QI','F0GBR04KZS','F0GBR064VU','F00000VXLM','F0000119R1','F0GBR04L4T','F000015CS3','F000015CS5','F000015CS6',
'F000015CS4','F000013BZE','F0GBR05W0Q','F000016M1C','F0GBR04L68','F00000Z9T9','F0GBR04JI8','F00000Z9TG','F0GBR04L2P','F000014CU8','F00000ZG2G','F00000MLEW',
'F000013ZOY','F000016614','F00000WUI9','F000015KRL','F0GBR04LCR','F000010ES9','F00000P780','F0GBR04HC3','F000015CV6','F00000YWCK','F00000YWCJ','F00000NAI5',
'F0GBR04L81','F0GBR05KNU','F0GBR06XKB','F00000NAI3','F0GBR06XKF','F000016UA9','F000013FC2','F000014NRE','0P0000CNVT','0P0000CNVX','F000015KRI',
'F000015KRG','F00000XLK7','F0GBR04IDG','F00000XLK6','F00000073J','F00000XLK4','F000013CKG','F000013CKJ','F000013CKK','F000016P8R','F000016P8S','F000011JG6',
'F000014UZQ','F0000159PE','F0GBR04KZG','F0000002OY','F00000TW9K','F0000175CC','F00000NBEL','F000016054','F000016056','F00000TEYP','F0000025UI','F0GBR04FV7',
'F00000WP01','F000011SQ4','F0GBR04KZO','F000010E19','F000013ZOX','F0GBR04HD7','F00000YKC1','F0GBR064UG','F00000JSDD','F000010ROF','F0000100CA','F0000100CD',
'FOGBR05KQ0','F0GBR04LBB','F0GBR04LBZ','F0GBR04LCN','F00000WLA7','F0000147D7','F00000ZB5E','F00000WC0Y']
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Mobile Safari/537.36'}
# Parameters for the us-api factorProfile endpoint (kept separate from the auth parameters below).
api_payload = {
    'languageId': 'da-DK',
    'locale': 'da-DK',
    'clientId': 'MDC_intl',
    'benchmarkId': 'category',
    'component': 'sal-components-mip-factor-profile',
    'version': '3.40.1'}

for api_id in ids:
    # Request the snapshot page for this fund to obtain a bearer token.
    auth_payload = {
        'Site': 'dk',
        'FC': api_id,
        'IT': 'FO',
        'LANG': 'da-DK'}

    response = requests.get(auth, params=auth_payload)

    # Extract the maas token from the page source and build the Authorization header.
    search = re.search(r'(tokenMaaS:[\w\s]*\")(.*)(\")', response.text, re.IGNORECASE)
    bearer = 'Bearer ' + search.group(2)

    headers.update({'Authorization': bearer})

    url = 'https://www.us-api.morningstar.com/sal/sal-service/fund/factorProfile/%s/data' % api_id
    jsonData = requests.get(url, headers=headers, params=api_payload).json()
    
    rows = []
    for k, v in jsonData['factors'].items():
        row = {}
        row['factor'] = k
        
        historicRange = v.pop('historicRange')
        row.update(v)
        
        for each in historicRange:
            row.update(each)
            
            rows.append(row.copy())
        
    
    df = pd.DataFrame(rows)
    sheetName = jsonData['id']
    df.to_excel(writer, sheet_name=sheetName, index=False)
    print('Finished: %s' %sheetName)

writer.save()
writer.close()

CodePudding user response:

If I understand you correctly, you want to get the first table of that URL as a pandas DataFrame:

import requests
import pandas as pd
from bs4 import BeautifulSoup


# load the page into soup:
url = "https://www.morningstar.dk/dk/funds/snapshot/snapshot.aspx?id=F00000Z1MC&tab=1"
soup = BeautifulSoup(requests.get(url).content, "html.parser")

# find correct table:
tbl = soup.select_one(".returnsCalenderYearTable")

# remove the first row (it's not the header):
tbl.tr.extract()

# convert the html to pandas DF:
df = pd.read_html(str(tbl))[0]

# move the first row to header:
df.columns = map(str, df.loc[0])
df = df.loc[1:].reset_index(drop=True).rename(columns={"nan": "Name"})

print(df)

Prints:

              Name 2014* 2015* 2016* 2017*  2018  2019  2020 31-08
0  Samlet afkast %  2627  1490  1432   584  -589  2648  -482  1841
1     +/- Kategori  1130   583   808  -255   164    22  -910  -080
2       +/- Indeks   788   591   363  -320  -127  -262 -1106  -162
3  Rank i kategori     2     9     4    80    38    54    92    63
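
To tie this back to the Excel workflow in the question, here is a minimal sketch that writes the scraped table into the same macro-enabled workbook the original script updates. The path, the keep_vba workbook handling and the writer.book = book assignment are copied from the question and assume the same pandas/openpyxl versions; using the fund id as the sheet name is just one possible convention:

import requests
import pandas as pd
from bs4 import BeautifulSoup
from openpyxl import load_workbook

# Reuse the macro-enabled workbook from the question (path taken from the original script).
path = r'/Users/karlemilthulstrup/Downloads/data2.xlsm'
book = load_workbook(path, read_only=False, keep_vba=True)
writer = pd.ExcelWriter(path, engine='openpyxl')
writer.book = book  # relies on the same (older) pandas behaviour as the original script

fund_id = 'F00000Z1MC'
url = 'https://www.morningstar.dk/dk/funds/snapshot/snapshot.aspx?id=%s&tab=1' % fund_id

# Scrape and clean the calendar-year table exactly as above.
soup = BeautifulSoup(requests.get(url).content, "html.parser")
tbl = soup.select_one(".returnsCalenderYearTable")
tbl.tr.extract()

df = pd.read_html(str(tbl))[0]
df.columns = map(str, df.loc[0])
df = df.loc[1:].reset_index(drop=True).rename(columns={"nan": "Name"})

# One sheet per fund id, mirroring the original export.
df.to_excel(writer, sheet_name=fund_id, index=False)

writer.save()
writer.close()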

EDIT: To load from multiple URLs:

import requests
import pandas as pd
from bs4 import BeautifulSoup


urls = [
    "https://www.morningstar.dk/dk/funds/snapshot/snapshot.aspx?id=F00000VA2N&tab=1",
    "https://www.morningstar.dk/dk/funds/snapshot/snapshot.aspx?id=F0GBR064OO&tab=1",
    "https://www.morningstar.dk/dk/funds/snapshot/snapshot.aspx?id=F00000YKC2&tab=1",
    "https://www.morningstar.dk/dk/funds/snapshot/snapshot.aspx?id=F000015MVX&tab=1",
]


all_data = []
for url in urls:
    print("Loading URL {}".format(url))

    # load the page into soup:
    soup = BeautifulSoup(requests.get(url).content, "html.parser")

    # find correct table:
    tbl = soup.select_one(".returnsCalenderYearTable")

    # remove the first row (it's not the header):
    tbl.tr.extract()

    # convert the html to pandas DF:
    df = pd.read_html(str(tbl))[0]

    # move the first row to header:
    df.columns = map(lambda x: str(x).replace("*", "").strip(), df.loc[0])
    df = df.loc[1:].reset_index(drop=True).rename(columns={"nan": "Name"})

    df["Company"] = soup.h1.text.split("\n")[0].strip()
    df["URL"] = url
    all_data.append(df.loc[:, ~df.isna().all()])

df = pd.concat(all_data, ignore_index=True)
print(df)

Prints:

               Name    2016    2017    2018    2019   2020 31-08                          Company                                                                             URL
0   Samlet afkast %  1755.0   942.0 -1317.0  1757.0 -189.0  3018        Great Dane Globale Aktier  https://www.morningstar.dk/dk/funds/snapshot/snapshot.aspx?id=F00000VA2N&tab=1
1      +/- Kategori   966.0   -54.0  -186.0  -662.0 -967.0  1152        Great Dane Globale Aktier  https://www.morningstar.dk/dk/funds/snapshot/snapshot.aspx?id=F00000VA2N&tab=1
2        +/- Indeks   686.0    38.0  -854.0 -1153.0 -813.0  1015        Great Dane Globale Aktier  https://www.morningstar.dk/dk/funds/snapshot/snapshot.aspx?id=F00000VA2N&tab=1
3   Rank i kategori    10.0    24.0    85.0    84.0   77.0     4        Great Dane Globale Aktier  https://www.morningstar.dk/dk/funds/snapshot/snapshot.aspx?id=F00000VA2N&tab=1
4   Samlet afkast %     NaN  1016.0  -940.0  1899.0  767.0  2238      Independent Generations ESG  https://www.morningstar.dk/dk/funds/snapshot/snapshot.aspx?id=F0GBR064OO&tab=1
5      +/- Kategori     NaN    20.0   190.0  -520.0  -12.0   373      Independent Generations ESG  https://www.morningstar.dk/dk/funds/snapshot/snapshot.aspx?id=F0GBR064OO&tab=1
6        +/- Indeks     NaN   112.0  -478.0 -1011.0  143.0   235      Independent Generations ESG  https://www.morningstar.dk/dk/funds/snapshot/snapshot.aspx?id=F0GBR064OO&tab=1
7   Rank i kategori     NaN    26.0    69.0    92.0   43.0    25      Independent Generations ESG  https://www.morningstar.dk/dk/funds/snapshot/snapshot.aspx?id=F0GBR064OO&tab=1
8   Samlet afkast %     NaN     NaN  -939.0  1898.0  766.0  2239  Independent Generations ESG Akk  https://www.morningstar.dk/dk/funds/snapshot/snapshot.aspx?id=F00000YKC2&tab=1
9      +/- Kategori     NaN     NaN   191.0  -521.0  -12.0   373  Independent Generations ESG Akk  https://www.morningstar.dk/dk/funds/snapshot/snapshot.aspx?id=F00000YKC2&tab=1
10       +/- Indeks     NaN     NaN  -477.0 -1012.0  142.0   236  Independent Generations ESG Akk  https://www.morningstar.dk/dk/funds/snapshot/snapshot.aspx?id=F00000YKC2&tab=1
11  Rank i kategori     NaN     NaN    68.0    92.0   44.0    24  Independent Generations ESG Akk  https://www.morningstar.dk/dk/funds/snapshot/snapshot.aspx?id=F00000YKC2&tab=1
12  Samlet afkast %     NaN     NaN     NaN     NaN    NaN  2384       Investin Sustainable World  https://www.morningstar.dk/dk/funds/snapshot/snapshot.aspx?id=F000015MVX&tab=1
13     +/- Kategori     NaN     NaN     NaN     NaN    NaN   518       Investin Sustainable World  https://www.morningstar.dk/dk/funds/snapshot/snapshot.aspx?id=F000015MVX&tab=1
14       +/- Indeks     NaN     NaN     NaN     NaN    NaN   381       Investin Sustainable World  https://www.morningstar.dk/dk/funds/snapshot/snapshot.aspx?id=F000015MVX&tab=1
15  Rank i kategori     NaN     NaN     NaN     NaN    NaN    18       Investin Sustainable World  https://www.morningstar.dk/dk/funds/snapshot/snapshot.aspx?id=F000015MVX&tab=1
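
Since the real goal is to cover roughly 100 funds, the URL list does not need to be hard-coded: it can be built from the same ids list used in the original script. Below is a sketch under the same assumptions as the answer (every snapshot page exposes the .returnsCalenderYearTable element and an h1 fund name); the pause between requests and the extra ID column are illustrative additions, not requirements:

import time

import requests
import pandas as pd
from bs4 import BeautifulSoup

ids = ["F00000VA2N", "F0GBR064OO", "F00000YKC2", "F000015MVX"]  # extend with the full list from the question

all_data = []
for fund_id in ids:
    url = "https://www.morningstar.dk/dk/funds/snapshot/snapshot.aspx?id={}&tab=1".format(fund_id)
    print("Loading URL {}".format(url))

    soup = BeautifulSoup(requests.get(url).content, "html.parser")

    # find the calendar-year returns table; skip the fund if it is missing:
    tbl = soup.select_one(".returnsCalenderYearTable")
    if tbl is None:
        print("No table found for {}".format(fund_id))
        continue

    # drop the non-header first row and convert to a DataFrame, as above:
    tbl.tr.extract()
    df = pd.read_html(str(tbl))[0]
    df.columns = map(lambda x: str(x).replace("*", "").strip(), df.loc[0])
    df = df.loc[1:].reset_index(drop=True).rename(columns={"nan": "Name"})

    df["Company"] = soup.h1.text.split("\n")[0].strip()
    df["ID"] = fund_id
    all_data.append(df.loc[:, ~df.isna().all()])

    time.sleep(1)  # small, arbitrary pause to avoid hammering the site

df = pd.concat(all_data, ignore_index=True)
print(df)

The combined DataFrame can then be written into the workbook with df.to_excel(writer, ...) exactly as in the original script.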