How do I get two DIV's text, so that it becomes a table using BeautifulSoup in Python?

Time: 12-10

How can I iterate through the links, access specific divs on each linked page, and assemble the results into something like a table, using Python?

This is as far as I've gotten, but the output is not right:

from bs4 import BeautifulSoup
import urllib3

http = urllib3.PoolManager()
base_url = 'http://www.warrencountyschools.org'
url = 'https://www.warrencountyschools.org/district_staff.aspx?action=search&location=29&department=0'
response = http.request('GET', url)
soup = BeautifulSoup(response.data, 'html.parser')

# the second tr in the table - index starts at 0
table = soup.find('table', {'class': 'content staff-table'})
rows = table.findAll('tr')

fieldContent = []

for tr in rows:
    cols = tr.findAll('td')
    if len(cols) >= 3:
        link = cols[2].find('a').get('href')
        abs_link = base_url + link

        profileURL = abs_link
        profilePagResp = http.request('GET', profileURL)
        soup2 = BeautifulSoup(profilePagResp.data)

        flDiv = soup2.findAll('div', {'class': 'field-label'})
        fcDiv = soup2.find('div', {'class': 'field-content'})
        for fl in flDiv:
            fieldContent.append(fcDiv.text)

print(fieldContent)

The output now consists of each name repeated once per iteration, while it should look like this:

Name Email Website Phone Buildings
SomeName email@ wwww. 78978978 SomeBuildin
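The repetition happens because `find` grabs only the first `field-content` div, which then gets appended once per label. A minimal sketch of pairing each label with its matching content div instead, using a made-up HTML snippet in the same `field-label` / `field-content` shape as the profile pages (names and values are hypothetical):

```python
from bs4 import BeautifulSoup

# Hypothetical profile snippet mirroring the site's structure.
html = """
<div class="field-label">Name:</div><div class="field-content">Jane Doe</div>
<div class="field-label">Phone:</div><div class="field-content">555-0100</div>
"""
soup = BeautifulSoup(html, 'html.parser')

labels = soup.find_all('div', class_='field-label')
contents = soup.find_all('div', class_='field-content')

# Pair each label with the content div at the same position instead of
# re-reading the first field-content on every pass.
row = {lab.get_text(strip=True).rstrip(':'): val.get_text(strip=True)
       for lab, val in zip(labels, contents)}
print(row)  # {'Name': 'Jane Doe', 'Phone': '555-0100'}
```

Collecting one such dict per profile gives you rows whose keys line up as table columns.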

CodePudding user response:

You could use an async library like trio, as this task is I/O bound: you will be awaiting responses for requests to individual staff pages. I have added a custom sort, based on last name, in an attempt to recreate the original result order. For larger result sets this might not match perfectly in case of ties; you could then extend it by adding a first-name sort. The additional sort column can be dropped afterwards.

There does seem to be a FIFO processing option within trio, but I haven't explored that.
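Another way to keep the original order, without sorting afterwards, is to give each task a fixed result slot indexed by its position in the link list. A small sketch of that idea using stdlib asyncio (the task names and delays are made up; this only illustrates the slot trick, not the scrape):

```python
import asyncio

async def fetch(i, results):
    # Simulate variable latency; a real task would await an HTTP request here.
    await asyncio.sleep(0.01 * (5 - i))
    results[i] = f"staff-{i}"   # write into a fixed slot instead of appending

async def main():
    results = [None] * 5
    # Tasks finish in reverse order, but each one writes to its own index,
    # so the final list matches the order the tasks were started in.
    await asyncio.gather(*(fetch(i, results) for i in range(5)))
    return results

print(asyncio.run(main()))  # ['staff-0', 'staff-1', 'staff-2', 'staff-3', 'staff-4']
```

The same pattern works with a trio nursery: pass the index into `start_soon` and assign into a pre-sized list.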

import pandas as pd
import httpx
import trio
from bs4 import BeautifulSoup

LINK = 'https://www.warrencountyschools.org/district_staff.aspx?action=search&location=29&department=0'
ALL_INFO = []
  
async def get_soup(content):
    return BeautifulSoup(content, 'lxml')


async def get_staff_info(link, nurse):
    async with httpx.AsyncClient(timeout=None) as client:
        r = await client.get(link)
        soup = await get_soup(r.text)
        info_items = ['Name', 'Email', 'Website', 'Phone', 'Buildings']
        staff_info = {}
        for key in info_items:
            try:
                if key == 'Website':
                    value = 'http://www.warrencountyschools.org' + soup.select_one(
                        f'.field-label:-soup-contains("{key}:") + .field-content > a')['href']
                else:
                    value = soup.select_one(
                        f'.field-label:-soup-contains("{key}:") + .field-content').text.strip()
            except Exception:
                value = 'N/A'
            finally:
                staff_info[key.lower()] = value
        ALL_INFO.append(staff_info)


async def get_links(LINK, nurse):
    async with httpx.AsyncClient(timeout=None) as client:
        r = await client.get(LINK)
        soup = await get_soup(r.text)
        for x in soup.select('#ctl00_ctl00_MasterContent_ContentColumnRight_ctl01_dg_staff .staff-profile-button > a'):
            nurse.start_soon(
                get_staff_info, 'http://www.warrencountyschools.org' + x['href'], nurse)


async def main():
    async with trio.open_nursery() as nurse:
        nurse.start_soon(get_links, LINK, nurse)

if __name__ == "__main__":

    trio.run(main)
    df = pd.DataFrame(ALL_INFO)
    df['sort_value'] = [i.strip().split(' ')[-1] for i in df['name'].tolist()]
    df.sort_values(by=['sort_value'], ascending=True, inplace=True)
    #print(df)
    df.to_csv('staff.csv',
              encoding='utf-8-sig', index=False)
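The first-name tie-breaker mentioned above could look like this; a sketch on a hypothetical frame, splitting the name into separate last/first sort keys and dropping them once the rows are ordered:

```python
import pandas as pd

# Hypothetical names standing in for the scraped 'name' column.
df = pd.DataFrame({'name': ['Adam Smith', 'Jane Doe', 'Adam Doe']})

# Sort by last name, then first name, so surname ties break consistently.
parts = df['name'].str.strip().str.split()
df['last'] = parts.str[-1]
df['first'] = parts.str[0]
df = df.sort_values(by=['last', 'first']).drop(columns=['last', 'first'])
print(df['name'].tolist())  # ['Adam Doe', 'Jane Doe', 'Adam Smith']
```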

CodePudding user response:

@Antonio Santos, the profile data aren't in the same order on every page, so you can only grab the data as follows:

Script

from bs4 import BeautifulSoup
import requests
import pandas as pd

base_url = 'http://www.warrencountyschools.org'
url = 'https://www.warrencountyschools.org/district_staff.aspx?action=search&location=29&department=0'
response = requests.get(url)
soup = BeautifulSoup(response.content,'html.parser')

# the second tr in the table - index starts at 0
table = soup.find('table', {'class': 'content staff-table'})
rows = table.findAll('tr')

for tr in rows:
    cols = tr.findAll('td')
    if len(cols) >= 3:
        link = cols[2].find('a').get('href')
        abs_link = base_url + link
        #print(abs_link)
        
        final_page = requests.get(abs_link)
        soup2 = BeautifulSoup(final_page.text, 'html.parser')

        profile_data =[x.get_text(strip=True) for x in soup2.findAll("div","field-content")]
        
        print(profile_data) 

Output:

['Greg Blewett', '[email protected]', 'Access Staff Website', '270-746-7205', 'Greg Blewett - Construction-Carpentry - Warren County Area Technology Center']
['Adrian Boggess', 'Staff', '[email protected]', 'Tike Barton - Computerized Manufacturing and Machining - Warren County Area Technology Center']
['Kim Coomer', 'Teacher', '[email protected]', '270-746-7205', 'Kim Coomer - Career Specialist - Warren County Area Technology Center']
['Rex Cundiff', '[email protected]', 'Access Staff Website', '270-746-7205', 'Rex Cundiff - Welding - Warren County Area Technology Center']
['Susan Devore', '[email protected]', 'Access Staff Website', '270-746-7205', 'Susan Devore - Information Technology - Warren County Area Technology Center']
['Michael Emberton', '[email protected]', 'Access Staff Website', 'Micheal Emberton - Automotive - Warren County Area Technology Center']
['Jacob Hildebrant', 'Staff', '[email protected]', 'Greg Blewett - Construction-Carpentry - Warren County Area Technology Center']
['Jeton Hyseni', 'Staff', '[email protected]', 'Administrative Assistant - Warren County Area Technology Center']
['Jesse Muse', 'Staff', '[email protected]', 'Tike Barton - Computerized Manufacturing and Machining - Warren County Area Technology Center']
['Chris Riggs', 'Staff', '[email protected]', '467-7500', 'Administrative Assistant - Warren County Area Technology Center']
['Allison Runner', 'Staff', '[email protected]', 'Administrative Assistant - Warren County Area Technology Center']
['Jacob Thomas', 'Staff', '[email protected]', 'Greg Blewett - Construction-Carpentry - Warren County Area Technology Center']
['Brooke Weakly', 'Staff', '[email protected]', 'Administrative Assistant - Warren County Area Technology Center']
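Because the profiles don't all expose the same fields, positional lists like the output above put different kinds of data in the same column. Keying each row by its field label instead lets pandas align the columns and fill the gaps; a sketch with made-up contact values (only the names come from the output above):

```python
import pandas as pd

# One dict per profile, keyed by field label; the second profile has no
# phone field, mirroring how some pages above lack a phone number.
rows = [
    {'name': 'Greg Blewett', 'email': 'gb@example.org', 'phone': '270-746-7205'},
    {'name': 'Chris Riggs', 'email': 'cr@example.org'},
]
df = pd.DataFrame(rows)   # missing fields become NaN automatically
print(df.columns.tolist())  # ['name', 'email', 'phone']
```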