im trying to scrape a stock website scrape the sectors and industry (next question) and add it to a csv. I'm getting the info I want for 1 page but the next one is different so that's where I'm stuck
share_details1 = soup.find('a', href="../Industry/Industry_Data.php?s=100") results : Basic Materials i want to find tags that range from 100 - 1300 going by hundreds like href="../Industry/Industry_Data.php?s=200" 300 400 500 600 and so on to 1300
from bs4 import BeautifulSoup as bs
import csv
import requests
LSE = 'https://csimarket.com/stocks/at_glance.php?code=aa'
def get_stocks():
with open('tickers.csv') as ticker_file:
return list(map(lambda ticker: ticker.strip(), ticker_file))
def to_csv(stocks):
with open('stocks.csv', 'w') as sectors:
writer = csv.writer(sectors)
writer.writerow(stocks[0].keys())
for stock in stocks:
writer.writerow(stock.values())
def get_soup(url):
return bs(requests.get(url).text, 'html.parser')
def get_sector(ticker):
soup = get_soup(LSE ticker)
try:
share_details1 = soup.find('a', href="../Industry/Industry_Data.php?s=100")
messy = share_details1.find("span")
messy.decompose()
sector = share_details1.text.strip()
except:
print('No sector information availible for ', ticker)
return {'ticker': ticker, 'sector': ''}
print(ticker, sector)
return {'ticker': ticker, 'sector': sector}
def get_industry(ticker):
soup1 = get_soup(LSE ticker)
try:
share_details1 = soup1.find('a', href="../Industry/Industry_Data.php?ind=104")
messy = share_details1.find("span")
messy.decompose()
industry = share_details1.text.strip()
except:
print('No industry information availible for ', ticker)
return {'ticker': ticker, 'industry': ''}
print(ticker, industry)
return {'ticker': ticker, 'industry': industry}
if __name__ == '__main__':
to_csv(list(map(lambda ticker: get_sector(ticker), get_stocks())))
# to_csv(list(map(lambda ticker: get_industry(ticker), get_stocks())))
here is the sample of the csv
ticker,sector
A,
AA,Basic Materials
AADI,
AAIC,
AAL,
AAN,
AAOI,
AAON,
AAP,
AAPL,
AAT,
AAU,Basic Materials
AAWW,
AB,
ABB,
ABBV,
ABC,
ABCB,
ABCL,
ABEO,
ABEV,
ABG,
ABIO,
ABM,
ABMD,
ABNB,
ABOS,
ABR,
ABSI,
ABST,
ABT,
ABTX,
ABUS,
ACA,Basic Materials
ACAD,
ACB,
ACC,
ACCD,
ACCO,Basic Materials
ACEL,
ACER,
ACET,
ACEV,
ACGL,
ACH,Basic Materials
ACHC,
ACHR,
ACHV,
ACI,
ACIU,
CodePudding user response:
It looks like those hrefs are dynamic. You're better off just looking for 'Sector'
or 'Industry'
and then parse it.
You could also use regex to pull out that info. But here's my fix.
from bs4 import BeautifulSoup as bs
import pandas as pd
import re
import requests
LSE = 'https://csimarket.com/stocks/at_glance.php?code='
def get_stocks():
df = pd.read_csv('tickers.csv')
return list(df['ticker'])
def to_csv(stocks):
df = pd.DataFrame(stocks)
df.to_csv('stocks.csv', index=False)
def get_soup(url):
return bs(requests.get(url).text, 'html.parser')
def get_sector(ticker):
soup = get_soup(LSE ticker)
try:
sector = soup.find('span', text='Sector').find_next('a').text.replace('\n','').replace('•','').strip()
except:
print('No sector information availible for ', ticker)
return {'ticker': ticker, 'sector': ''}
print(ticker, sector)
return {'ticker': ticker, 'sector': sector}
def get_industry(ticker):
soup1 = get_soup(LSE ticker)
try:
industry = soup1.find('span', text='Industry').find_next('a').text.replace('\n','').replace('•','').strip()
except:
print('No industry information availible for ', ticker)
return {'ticker': ticker, 'industry': ''}
print(ticker, industry)
return {'ticker': ticker, 'industry': industry}
if __name__ == '__main__':
to_csv(list(map(lambda ticker: get_sector(ticker), get_stocks())))
# to_csv(list(map(lambda ticker: get_industry(ticker), get_stocks())))