I am trying to figure out how to scrape all 'symbol' elements from the URL below. I would like to put all symbol items into a list.
I want to get this: lst = ['NRGU', 'CHIE' .... 'PSCE', 'FXN']
I tested this concept, but it gave me a lot more than I want...I don't know how to distill it down to just the 'symbol' items.
# Print All TR & TD ELements in Web Page
from bs4 import BeautifulSoup
import requests
import pandas as pd
import csv
# NOTE: everything after '#' in this URL is a fragment. Fragments are handled
# client-side and are not sent in the HTTP request, so the filters encoded
# there never reach the server.
url = "https://etfdb.com/screener/#page=1&tab=returns&sort_by=one_week_return&sort_direction=desc&asset_class=equity&one_week_return_start=2&one_month_return_start=5&ytd_start=20&fifty_two_week_start=30"
page = requests.get(url)
pagetext = page.text
print(pagetext)
soup = BeautifulSoup(pagetext, 'html.parser')

# Walk every table row and every cell inside it, printing the text of the
# whole row and then the text of the individual cell.
for row in soup.find_all('tr'):
    for col in row.find_all('td'):
        info1 = row.text
        print(info1)
        info2 = col.text
        print(info2)
Any thoughts on how to do this?
CodePudding user response:
import requests
def main(url):
    """Fetch one page of screener results from the API and print the symbols.

    Args:
        url: Endpoint that accepts the screener filters as a JSON POST body.

    Returns:
        The list of symbol strings that was printed.

    Raises:
        requests.HTTPError: If the server responds with an error status.
    """
    # Filter and sort parameters sent as the JSON request body.
    payload = {
        "asset_class": "equity",
        "fifty_two_week_start": "30",
        "one_month_return_start": "5",
        "one_week_return_start": "2",
        "only": [
            "meta",
            "data",
            "count",
        ],
        "page": 1,
        "sort_by": "one_week_return",
        "sort_direction": "desc",
        "tab": "returns",
        "ytd_start": "20",
    }
    # Without a timeout a stalled connection would block the script forever.
    r = requests.post(url, json=payload, timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    r.raise_for_status()
    symbols = [row['symbol']['text'] for row in r.json()['data']]
    print(symbols)
    return symbols


if __name__ == "__main__":
    main('https://etfdb.com/api/screener/')
CodePudding user response:
Actually, the data is loaded dynamically from an external source via an API, using the POST method with a JSON payload. In other words, the browser exchanges the data with the server through that API, which is why bs4 can't parse it and we can't pull the data from the HTML DOM. So there are two options to grab the data: 1. use an automation tool such as Selenium, or 2. extract the data directly from the API, which is the easiest and most robust way.
import requests
import pandas as pd
# Filter and sort parameters sent as the JSON request body. The "page" entry
# is overwritten for every request in the loop below.
payload = {
    "asset_class": "equity",
    "fifty_two_week_start": "30",
    "one_month_return_start": "5",
    "one_week_return_start": "2",
    "only": [
        "meta",
        "data",
        "count",
    ],
    "page": 1,
    "sort_by": "one_week_return",
    "sort_direction": "desc",
    "tab": "returns",
    "ytd_start": "20",
}
api_url = 'https://etfdb.com/api/screener/'
headers = {
    "content-type": "application/json",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36",
}

lst = []
# Request result pages 1 and 2 and collect one {"symbol": ...} dict per row.
for page_number in range(1, 3):
    payload['page'] = page_number
    # Without a timeout a stalled connection would block the script forever.
    response = requests.post(api_url, headers=headers, json=payload, timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    response.raise_for_status()
    lst.extend(
        {"symbol": item['symbol']['text']} for item in response.json()['data']
    )
print(lst)

# Optional: load the collected rows into a DataFrame.
# df = pd.DataFrame(lst)
# print(df)
Output:
[{'symbol': 'NRGU'}, {'symbol': 'GUSH'}, {'symbol': 'ERX'}, {'symbol': 'DIG'}, {'symbol': 'EPV'}, {'symbol': 'CHIE'}, {'symbol': 'PXI'}, {'symbol': 'PXE'}, {'symbol': 'OIH'}, {'symbol': 'IEO'}, {'symbol': 'IEZ'}, {'symbol': 'FCG'},
{'symbol': 'FTXN'}, {'symbol': 'XOP'}, {'symbol': 'PSCE'}, {'symbol': 'XES'}, {'symbol': 'XLE'}, {'symbol': 'RYE'},
{'symbol': 'FENY'}, {'symbol': 'JHME'}, {'symbol': 'VDE'}, {'symbol': 'FILL'}, {'symbol': 'IYE'}, {'symbol': 'IXC'}, {'symbol': 'PXJ'}, {'symbol': 'FXN'}, {'symbol': 'NANR'}, {'symbol': 'MLPR'}, {'symbol': 'IGE'}, {'symbol': 'AMLP'}, {'symbol': 'AMZA'}]