Web scraper BeautifulSoup different tags in Python


I'm trying to scrape a stock website for each stock's sector and industry (the industry part is a follow-up question) and write them to a CSV. I'm getting the info I want for one page, but the link is different on the next page, and that's where I'm stuck.

share_details1 = soup.find('a', href="../Industry/Industry_Data.php?s=100") returns: Basic Materials. I want to find tags whose href value ranges from s=100 to s=1300 in steps of 100, e.g. href="../Industry/Industry_Data.php?s=200", then 300, 400, 500, 600 and so on up to 1300.

from bs4 import BeautifulSoup as bs
import csv
import requests

LSE = 'https://csimarket.com/stocks/at_glance.php?code=aa'

def get_stocks():
    with open('tickers.csv') as ticker_file:
        return list(map(lambda ticker: ticker.strip(), ticker_file))


def to_csv(stocks):
    with open('stocks.csv', 'w') as sectors:
        writer = csv.writer(sectors)
        writer.writerow(stocks[0].keys())
        for stock in stocks:
            writer.writerow(stock.values())


def get_soup(url):
    return bs(requests.get(url).text, 'html.parser')

def get_sector(ticker):
    soup = get_soup(LSE + ticker)
    try:
        share_details1 = soup.find('a', href="../Industry/Industry_Data.php?s=100")
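        # the nested <span> holds a bullet/label; remove it so only the sector text is left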
        messy = share_details1.find("span")
        messy.decompose()
        sector = share_details1.text.strip()

    except:
        print('No sector information available for ', ticker)
        return {'ticker': ticker, 'sector': ''}

    print(ticker, sector)
    return {'ticker': ticker, 'sector': sector}


def get_industry(ticker):
    soup1 = get_soup(LSE + ticker)
    try:
        share_details1 = soup1.find('a', href="../Industry/Industry_Data.php?ind=104")
        messy = share_details1.find("span")
        messy.decompose()
        industry = share_details1.text.strip()

    except:
        print('No industry information available for ', ticker)
        return {'ticker': ticker, 'industry': ''}

    print(ticker, industry)
    return {'ticker': ticker, 'industry': industry}


if __name__ == '__main__':
    to_csv(list(map(lambda ticker: get_sector(ticker), get_stocks())))
    # to_csv(list(map(lambda ticker: get_industry(ticker), get_stocks())))

Here is a sample of the resulting CSV:

ticker,sector
A,
AA,Basic Materials
AADI,
AAIC,
AAL,
AAN,
AAOI,
AAON,
AAP,
AAPL,
AAT,
AAU,Basic Materials
AAWW,
AB,
ABB,
ABBV,
ABC,
ABCB,
ABCL,
ABEO,
ABEV,
ABG,
ABIO,
ABM,
ABMD,
ABNB,
ABOS,
ABR,
ABSI,
ABST,
ABT,
ABTX,
ABUS,
ACA,Basic Materials
ACAD,
ACB,
ACC,
ACCD,
ACCO,Basic Materials
ACEL,
ACER,
ACET,
ACEV,
ACGL,
ACH,Basic Materials
ACHC,
ACHR,
ACHV,
ACI,
ACIU,

CodePudding user response:

It looks like those hrefs are dynamic. You're better off just looking for 'Sector' or 'Industry' and parsing from there.

You could also use regex to pull out that info; a rough sketch of that is after the code below. But here's my fix.

from bs4 import BeautifulSoup as bs
import pandas as pd
import re
import requests

LSE = 'https://csimarket.com/stocks/at_glance.php?code='

def get_stocks():
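    # unlike the original version, this assumes tickers.csv has a header row with a 'ticker' column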
    df = pd.read_csv('tickers.csv')
    return list(df['ticker'])


def to_csv(stocks):
    df = pd.DataFrame(stocks)
    df.to_csv('stocks.csv', index=False)

def get_soup(url):
    return bs(requests.get(url).text, 'html.parser')

def get_sector(ticker):
    soup = get_soup(LSE + ticker)
    try:
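        # find the 'Sector' label <span>, then take the next <a>, which holds the sector name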
        sector = soup.find('span', text='Sector').find_next('a').text.replace('\n','').replace('•','').strip()
    except:
        print('No sector information available for ', ticker)
        return {'ticker': ticker, 'sector': ''}

    print(ticker, sector)
    return {'ticker': ticker, 'sector': sector}


def get_industry(ticker):
    soup1 = get_soup(LSE + ticker)
    try:
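        # same approach as get_sector, keyed on the 'Industry' label instead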
        industry = soup1.find('span', text='Industry').find_next('a').text.replace('\n','').replace('•','').strip()
    except:
        print('No industry information available for ', ticker)
        return {'ticker': ticker, 'industry': ''}

    print(ticker, industry)
    return {'ticker': ticker, 'industry': industry}


if __name__ == '__main__':
    to_csv(list(map(lambda ticker: get_sector(ticker), get_stocks())))
    # to_csv(list(map(lambda ticker: get_industry(ticker), get_stocks())))
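
And here is a rough sketch of the regex idea mentioned above. It assumes the sector links all look like ../Industry/Industry_Data.php?s=<number> (the s=100 to s=1300 values from the question), so one pattern can match the whole range instead of hardcoding each href; the function name is just illustrative:

from bs4 import BeautifulSoup as bs
import re
import requests

LSE = 'https://csimarket.com/stocks/at_glance.php?code='

def get_sector_by_href(ticker):
    soup = bs(requests.get(LSE + ticker).text, 'html.parser')
    # match any ../Industry/Industry_Data.php?s=<digits> link (s=100 .. s=1300)
    link = soup.find('a', href=re.compile(r'Industry_Data\.php\?s=\d+'))
    if link is None:
        return {'ticker': ticker, 'sector': ''}
    span = link.find('span')
    if span is not None:
        span.decompose()  # drop the nested <span> bullet, as in the original code
    return {'ticker': ticker, 'sector': link.text.strip()}

The same idea with a ?ind=\d+ pattern would cover the industry links.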