I want to extract the Name, Position and Email of every person listed on the webpage. I was able to extract the name and position, but the email cannot be extracted because you have to contact each person to get it, so I just want to extract their contact URL instead. Every person has a different contact URL. I want the CSV file to contain a third column, Contact, in each person's row, so that clicking it redirects me to that particular person's contact page. Here is my code:
import requests
from bs4 import BeautifulSoup
from csv import writer
# Scrape the staff directory into FCPS.csv: one row per staff member holding
# the name, the position and the absolute URL of that person's contact page.
from urllib.parse import urljoin

base_url = 'https://fairfaxhs.fcps.edu'
directory_url = base_url + '/staff-directory?field_last_name_from=&field_last_name_to=&items_per_page=10&keywords=&page={page}'


def cell_text(row, css_class):
    """Return the stripped text of the row's <td> with css_class, or 'N/A'."""
    cell = row.find('td', class_=css_class)
    return cell.get_text(strip=True) if cell else 'N/A'


def contact_url(row):
    """Return the absolute contact-page URL found in the row, or 'N/A'.

    NOTE(review): assumes the link to the contact page is the first <a href>
    inside the 'views-field-rendered-item' cell -- confirm against the page.
    """
    cell = row.find('td', class_='views-field views-field-rendered-item')
    link = cell.find('a', href=True) if cell else None
    # urljoin handles both site-relative and already-absolute hrefs.
    return urljoin(base_url, link['href']) if link else 'N/A'


# Open the file once in write mode so the header appears exactly once and a
# re-run replaces the previous output instead of appending duplicates to it.
with open('FCPS.csv', 'w', encoding='utf8', newline='') as csv_file:
    thewriter = writer(csv_file)
    thewriter.writerow(['Name', 'Position', 'Contact'])
    for page in range(0, 30):
        # A timeout keeps one stalled request from hanging the whole run.
        response = requests.get(directory_url.format(page=page), timeout=30)
        soup = BeautifulSoup(response.content, 'html.parser')
        for row in soup.find_all('tr'):
            # Rows without any <td> (e.g. the table header) carry no person.
            if row.find('td') is None:
                continue
            thewriter.writerow([
                cell_text(row, 'views-field views-field-field-last-name'),
                cell_text(row, 'views-field views-field-field-staff-title'),
                contact_url(row),
            ])
CodePudding user response:
You can grab the data you require from each listing page and from each person's detail page,
as shown in the following example:
import pandas as pd
from bs4 import BeautifulSoup
import requests
# Walk every listing page of the staff directory, follow each person's profile
# link, and save name, position and the profile (contact) URL to out.csv.
from urllib.parse import urljoin

base_url = 'https://fairfaxhs.fcps.edu'
url = 'https://fairfaxhs.fcps.edu/staff-directory?field_last_name_from=&field_last_name_to=&items_per_page=10&keywords=&page={page}'
data = []
for page in range(0, 30):
    soup = BeautifulSoup(requests.get(url.format(page=page)).text, 'lxml')
    # A listing page without a table has nothing to follow.
    if soup.table is None:
        continue
    # NOTE(review): the original selector was lost ('td[]'); this assumes the
    # profile link lives in the 'views-field-rendered-item' cell -- confirm.
    for cell in soup.table.select('tr td.views-field-rendered-item'):
        href = cell.a.get('href') if cell.a else None
        if not href:
            continue
        # urljoin handles both site-relative and already-absolute hrefs.
        u = urljoin(base_url, href)
        try:
            soup2 = BeautifulSoup(requests.get(u).text, 'lxml')
            d = {
                'Name': soup2.select_one('h1.node__title.fcps-color--dark11').get_text(strip=True),
                'Position': soup2.select_one('h1 div').get_text(strip=True),
                'contact_url': u,
            }
        except (requests.RequestException, AttributeError) as exc:
            # Best effort: skip only the profile that failed to download or
            # lacks the expected elements (select_one returned None), and say
            # so instead of silently dropping the rest of the page.
            print(f'Skipping {u}: {exc!r}')
            continue
        data.append(d)

# Keep the DataFrame in df (to_csv returns None when given a path).
df = pd.DataFrame(data)
df.to_csv('out.csv', index=False)