By referring to https://medium.com/analytics-vidhya/how-to-scrape-data-from-a-website-using-python-for-beginner-5c770a1fbe2d I have started scraping data from a website that requires login information. My website is a bit different, and I do get a result — but it is in the wrong format. Code:
from pprint import pprint
import datetime
import pandas as pd  # was missing: pd.DataFrame below raised NameError
import requests
from bs4 import BeautifulSoup

# Session cookies copied from a logged-in browser session.
# NOTE(review): several values contain spaces where '+' characters appear to
# have been stripped during copy/paste (e.g. 'reese84', 'nlbi_*',
# 'incap_ses_*'). These tokens also expire quickly -- re-copy them from the
# browser dev tools if the site starts returning the login page.
cookies = {
'CFID': '180615757',
'CFTOKEN': '64089929988eb934-58E2ACC9-AD21-785B-2AFBCE86106B41FE',
'visid_incap_2388351': '0Vmr7QpDRvmVw8fbXUJFkB5XEWMAAAAAQUIPAAAAAADtlXunU/D8GLU5VofHHier',
'_ga_6ZQNJ4ELG2': 'GS1.1.1662315508.15.1.1662315668.0.0.0',
'_ga': 'GA1.2.147261521.1662080801',
'_gid': 'GA1.2.1149490171.1662080801',
'reese84': '3:yMGXsdMquwoCj3IoSFRCMg==:Vf20HwL77P8oWYTTKbE0XigwyQE3d2lLQpPVoZYcoL8SJTmLeqAani 7GspfC2BiJYOOytBlkIp9MewLgs/XbkaiLrSvLnMdZ0aT8/M9FvBohByybnJXNl25ya/yfpGhL9oT1HKMZYnKqSR0Sg8 nHTUEO0/YErJgQmfoeYIT4kmE01S8cndGIemtuGjvq1hzB/D9VAQL7S3idutOumBNu84j5FyCdOBClCJTriE X9j40lj1swIxFlryTmBAtLHnEvN9M57N4LMb13yuSBaCawrv4fnron0JnUvfKpLU0CXTnpcM9hJNGv9Ekb4Ap43CZDPdeLVzEmj 39wCVtXPtMqBNCU6mPVBSeJCRHyRuQjY y0Sv5w7ME2LXhT8bEGHyE8yeuxddxvoG51STebu pb0mSp5n iKotUEn9h sA=:WH64twwKGqtE4pUorYOeGylONeXRsfG 3Qe3zAfpdrs=',
'__atuvc': '65|35,2|36',
'COOKIESTATUS': 'ON',
'HIDECOOKIEBANNER': 'TRUE',
'nlbi_2388351': 'jGGxMFazFBqnU x okRrFAAAAAC/AJ/k R2U vs5Q4LIRTS7',
'nlbi_2388351_2147483392': 'PUildkEvtiZ9uje3okRrFAAAAABv1NR/7gPLX7Lc/iS5ei8N',
'incap_ses_989_2388351': 'mWy Uq7aLX000xomDaO5DfTrFGMAAAAA6XmB42vG5CO6i609/RhyKg==',
'incap_ses_468_2388351': 'sDNcR2labTHyNXYlUqx BipAFGMAAAAAImV2A07lGANZGfpvhvPlLg==',
'__atuvs': '6314ec0cdbe92a78001',
'_gat_gtag_UA_12825325_1': '1',
}
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:104.0) Gecko/20100101 Firefox/104.0',
# '+' restored: the standard Firefox Accept value is 'application/xhtml+xml'
# (it had been stripped to 'application/xhtml xml' in the paste).
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'Referer': 'https://www.higheredjobs.com/admin/',
'Connection': 'keep-alive',
# The 'Cookie' header is intentionally omitted: requests builds it from
# the cookies= argument. 'Accept-Encoding' and 'TE' are also left to
# requests' defaults.
'Upgrade-Insecure-Requests': '1',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'same-origin',
}
# Search filter: job category 141 = "Academic Advising".
params = {
'JobCat': '141',
'CatName': 'Academic Advising',
}
# Fetch one search-results page using the captured browser session.
response = requests.get('https://www.higheredjobs.com/admin/search.cfm', params=params, cookies=cookies, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
# Each matching <div> is either a whole job row ("row record") or the
# right-hand cell ("col-sm-5 text-sm-right"); .text on these returns one
# unstructured blob per element, which is why the DataFrame looks wrong.
job_title = soup.find_all('div', class_=["row record", "col-sm-5 text-sm-right"])
jobs_list = [tag.text for tag in job_title]
df = pd.DataFrame({'Jobs title': jobs_list})
Present output:
df =
Jobs title
0 \n\nRe-Sort\n\n\r\n\t\t\tResults 1 - 70 of 70\...
1 \n\n\r\n\t\t\t\t\t\t\t\t\t\t\tAssistant Profes...
2 \r\n\t\t\t\t\t\t\t\t\t\t\r\n\t\t\t\t\t\t\t\t\t...
Expected output:
df =
Jobs title Company name location Posted
0 Assistant Professor/Associate University of Southern Indiana Evansville, IN 09/02/22
Professor of Engineering,
Pott College of Science,
Engineering, and Education - F22057F1
CodePudding user response:
The main issue is that you try to create your DataFrame from unstructured data collected in your list. So structure each row first, e.g. as a dict, append that to your list, and then create the DataFrame:
# Map each job row's text fragments onto named columns so the DataFrame
# comes out structured instead of one blob per row.
column_names = ['title', 'university', 'location', 'study', 'date']
jobs_list = []
for record in soup.select('.row.record'):
    row = dict(zip(column_names, record.stripped_strings))
    jobs_list.append(row)
pd.DataFrame(jobs_list)
Note: if you would like different column headers, change this list -> ['title','university','location','study','date']
Example
import pandas as pd  # was missing: pd.DataFrame below raised NameError
from bs4 import BeautifulSoup

# Minimal sample of the site's markup. The class attributes had been
# stripped from the original paste (every tag read '<div >'); they are
# restored here from the selectors used in the question
# ("row record" and "col-sm-5 text-sm-right") so '.row.record' matches.
# NOTE(review): the class of the left-hand inner <div> is not visible in
# the question -- it is left without a class, which does not affect the
# '.row.record' selector.
html = '''
<div class="row record">
<div><a href="details.cfm?JobCode=178085874&Title=Assistant Professor/Associate Professor of Engineering, Pott College of Science, Engineering, and Education - F22057F1">
Assistant Professor/Associate Professor of Engineering, Pott College of Science, Engineering, and Education - F22057F1</a>
<br/>
University of Southern Indiana <br/>
Evansville, IN
</div>
<div class="col-sm-5 text-sm-right">
Electrical Engineering
<br/> Posted 09/02/22<br/>
</div>
</div>
<div class="row record">
<div>
<a href="details.cfm?JobCode=178085843&Title=Assistant Professor of Engineering F99507">
Assistant Professor of Engineering F99507</a>
<br/>
McNeese State University <br/>
Lake Charles, LA
</div>
<div class="col-sm-5 text-sm-right">
Electrical Engineering
<br/> Posted 09/02/22<br/>
</div>
</div>
'''
# Explicit parser avoids bs4's GuessedAtParserWarning and keeps results
# consistent across machines.
soup = BeautifulSoup(html, 'html.parser')
jobs_list = []
# One dict per job row: zip the row's stripped text fragments onto
# named columns.
for i in soup.select('.row.record'):
    jobs_list.append(dict(zip(['title', 'university', 'location', 'study', 'date'], i.stripped_strings)))
df = pd.DataFrame(jobs_list)
print(df)
Output
title | university | location | study | date | |
---|---|---|---|---|---|
0 | Assistant Professor/Associate Professor of Engineering, Pott College of Science, Engineering, and Education - F22057F1 | University of Southern Indiana | Evansville, IN | Electrical Engineering | Posted 09/02/22 |
1 | Assistant Professor of Engineering F99507 | McNeese State University | Lake Charles, LA | Electrical Engineering | Posted 09/02/22 |
CodePudding user response:
The following is a complete example of how you can extract the jobs under 'Academic Advising' from that website:
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs
from tqdm import tqdm

headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36'
}
# A Session reuses the TCP connection and carries the headers (and any
# cookies the site sets) across all paginated requests.
s = requests.Session()
s.headers.update(headers)

big_list = []
# The category has ~1337 postings; request 100 per page and step the
# StartRow offset. tqdm just draws a progress bar over the page loop.
for x in tqdm(range(1, 1337, 100)):
    url = f'https://www.higheredjobs.com/admin/search.cfm?JobCat=141&StartRow={x}&SortBy=4&NumJobs=100'
    r = s.get(url)
    soup = bs(r.text, 'html.parser')
    # Selector restored: the paste had the invalid 'div[]' (the class
    # attribute was stripped). Each posting is a <div class="row record">
    # inside the results container.
    jobs = soup.select_one('div#js-results').select('div.row.record')
    for job in jobs:
        job_title = job.select_one('a').get_text(strip=True)
        job_url = job.select_one('a').get('href')
        big_list.append((job_title, job_url))

# set() de-duplicates rows that appear on overlapping pages.
df = pd.DataFrame(list(set(big_list)), columns=['Job', 'Url'])
print(df)
Result is a dataframe with all those jobs (1337):
Job Url
0 Director, Usha Kundu, MD College of Health Adv... details.cfm?JobCode=178071028&Title=Director%2...
1 Academic Advisor, College of Natural, Behavior... details.cfm?JobCode=178061977&Title=Academic%2...
2 Part-Time Academic Advisor for EAP & Foreign L... details.cfm?JobCode=177870235&Title=Part-Tim...
3 Student Service Assistant ll (Temp) details.cfm?JobCode=178044985&Title=Student ...
4 On-Call Academic Advisor (Applicant Pool) details.cfm?JobCode=177522145&Title=On-Call%...
... ... ...
1332 Part-Time Academic Support Coach details.cfm?JobCode=178060131&Title=Part-Tim...
1333 Academic Advisor details.cfm?JobCode=178005430&Title=Academic%2...
1334 Retention Coordinator/Academic Advisor details.cfm?JobCode=178077784&Title=Retention%...
1335 P220178 - Academic Advisor, School of Public H... details.cfm?JobCode=177930648&Title=P220178 ...
1336 Director of Academic Advising - Georgetown Uni... details.cfm?JobCode=178021588&Title=Director%2...
CodePudding user response:
To remove the whitespace (\n and \t), call the get_text() method with strip=True instead of using the .text property:
# strip=True removes leading/trailing whitespace (newlines, tabs) from
# each text fragment before they are joined.
name = i.get_text(strip=True)