How to iterate elements and scrape data to dataframe?-CodePudding

I'm trying to understand the best way to loop over data in Python that isn't formatted as a table (tr/td).

Example Data:

Trying to create a table for Name, headshot URL, Company, Address, Education.

I have tried the following so far, but I cannot work out how to get into the divs of the content component:

# Download the page and parse it (the 'html5lib' parser needs the html5lib
# package to be installed).
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html5lib')
# Every agent card on the page is matched by this class string.
table = soup.find_all('div', attrs={'class': 'col-lg-6 agent'})
for a in table:
    # Look the headshot container up once and reuse the result.
    headshot = a.find('div', attrs={'class': 'headshot'})
    if headshot is not None and headshot.img is not None:
        # Store the image URL itself rather than the <img> tag object.
        headshot_url = headshot.img.get('src')

CodePudding user response：

Simply iterate over all the agents and pick out the specific information you need, storing it in a list of dicts:

# One dict per agent; the list can later be handed to pandas.DataFrame.
data = []

for e in soup.select('.agent'):
    # The education text sits in a nested div that may be absent, so look it
    # up first and guard it the same way the address is guarded below.
    education = e.select_one('.education div div')
    data.append({
        'name': e.h3.get_text(strip=True).replace('\xa0', ' '),
        'headshot_url': e.img.get('src'),
        'company': e.h5.get_text(strip=True),
        'address': e.address.get_text(strip=True) if e.address else None,
        'education': education.get_text(strip=True) if education is not None else None,
    })

This could be transformed into a dataframe.

Example

import requests
from bs4 import BeautifulSoup
import pandas as pd

url = 'https://www.nhlpa.com/the-pa/certified-agents?range=A-Z'
# A timeout prevents the script from hanging indefinitely on a stalled
# connection; raise_for_status() stops early on an HTTP error page instead of
# silently parsing it.
r = requests.get(url, timeout=30)
r.raise_for_status()
# Name the parser explicitly so the result does not depend on which optional
# parser libraries happen to be installed (and to avoid bs4's warning).
soup = BeautifulSoup(r.text, 'html.parser')

# One dict per agent card.
data = []

for e in soup.select('.agent'):
    # The education text sits in a nested div that may be absent, so look it
    # up first and guard it the same way the address is guarded below.
    education = e.select_one('.education div div')
    data.append({
        'name': e.h3.get_text(strip=True).replace('\xa0', ' '),
        'headshot_url': e.img.get('src'),
        'company': e.h5.get_text(strip=True),
        'address': e.address.get_text(strip=True) if e.address else None,
        'education': education.get_text(strip=True) if education is not None else None,
    })

# Keep a reference to the frame and print it so the result is visible when the
# example is run as a script, not only in a notebook cell.
df = pd.DataFrame(data)
print(df)

Output

	name	headshot_url	company	address	education
0	Wade Arnott	https://cdn.nhlpa.com/img/assets/agents/headshots/48x48/9207.jpg	Newport Sports Management Inc.	201 City Centre Drive, Suite 400Mississauga, OntarioCANADA, L5B 2T4	Concord Law School, E.J.D.
					Wilfrid Laurier University, Hons. Bus. Admin., '91.
1	Patrik Aronsson	https://cdn.nhlpa.com/img/assets/agents/headshots/48x48/56469.jpg	AC Hockey	Faktorvagen 17 RKungsbackaSweden, 43437	Not available.
2	Shumi Babaev	https://cdn.nhlpa.com/img/assets/agents/headshots/48x48/56794.jpg	Shumi Babaev Agency		Moscow Mining University (Moscow) 1989-1994 - Masters' Degree
3	Mika Backman	https://cdn.nhlpa.com/img/assets/agents/headshots/48x48/58054.jpg	WSG Finland Ltd.	Kappelikuja 6 C02200 EspooFinland,	Helsinki University Law School (1992-1998) - Master of Law

...

CodePudding user response：

Hope this helps <3

r = requests.get(url)
print("fetched")
soup = BeautifulSoup(r.text, 'html.parser')
table = soup.find_all('div', attrs={'class': 'col-lg-6 agent'})

# Collect one dict per agent. Assigning to a single variable inside the loop
# would keep only the last agent once the loop finishes.
data_sets = []
for a in table:
    # First div with the "headshot" class inside this agent card, or None.
    headshots = a.find('div', attrs={'class': 'headshot'})
    if headshots is not None and headshots.img is not None:
        headshot_url = headshots.img.get("src")
    else:
        # Keep the column present even when there is no picture.
        headshot_url = None

    # Default every content-derived field to None so a missing element never
    # leaves a stale value from the previous iteration or raises.
    name = None
    company = None
    address = None

    # First div with the "content" class inside this agent card, or None.
    content = a.find('div', attrs={'class': 'content'})
    if content is not None:
        if content.h3 is not None and content.h3.contents:
            # Replace non-breaking spaces with ordinary spaces in the name.
            name = str(content.h3.contents[0]).replace("\xa0", " ")
        if content.h5 is not None and content.h5.contents:
            company = content.h5.contents[0]
        # The address lookup must stay inside the content guard: reading
        # content.address when content is None raises AttributeError.
        html_address = content.address
        if html_address is not None and html_address.contents:
            # Only the first child of <address> is kept; adjust this if the
            # remaining address lines are needed as well.
            address = html_address.contents[0]

    education = None
    # First div with the "education" class; guard it before searching inside.
    edu_block = a.find("div", attrs={'class': 'education'})
    if edu_block is not None:
        # Inner div that carries no class attribute.
        edu = edu_block.find("div", attrs={"class": None})
        if edu is not None and edu.contents:
            education = edu.contents[0]

    # Row for this agent.
    data_set = {"headshot_url":headshot_url,"name":name,"company":company,'address':address,'education':education}
    data_sets.append(data_set)

# data_sets now holds every agent's row and can be passed to pandas.DataFrame.