Home > database >  Python: scrape useful information from a webpage with login
Python: scrape useful information from a webpage with login

Time:09-05

By referring to https://medium.com/analytics-vidhya/how-to-scrape-data-from-a-website-using-python-for-beginner-5c770a1fbe2d I have started scraping data from a website that requires login information. My website is a bit different, and I do get a result, but it is in a different format. Code:

from pprint import pprint
import datetime
import requests
import pandas as pd  # was missing: pd.DataFrame is used at the bottom
from bs4 import BeautifulSoup

# Session cookies copied from the browser's logged-in session.
# NOTE(review): several values below contain spaces where '+' characters were
# most likely lost when the snippet was copied — re-copy them from the browser
# dev tools if the request is not authenticated.
cookies = {
    'CFID': '180615757',
    'CFTOKEN': '64089929988eb934-58E2ACC9-AD21-785B-2AFBCE86106B41FE',
    'visid_incap_2388351': '0Vmr7QpDRvmVw8fbXUJFkB5XEWMAAAAAQUIPAAAAAADtlXunU/D8GLU5VofHHier',
    '_ga_6ZQNJ4ELG2': 'GS1.1.1662315508.15.1.1662315668.0.0.0',
    '_ga': 'GA1.2.147261521.1662080801',
    '_gid': 'GA1.2.1149490171.1662080801',
    'reese84': '3:yMGXsdMquwoCj3IoSFRCMg==:Vf20HwL77P8oWYTTKbE0XigwyQE3d2lLQpPVoZYcoL8SJTmLeqAani 7GspfC2BiJYOOytBlkIp9MewLgs/XbkaiLrSvLnMdZ0aT8/M9FvBohByybnJXNl25ya/yfpGhL9oT1HKMZYnKqSR0Sg8 nHTUEO0/YErJgQmfoeYIT4kmE01S8cndGIemtuGjvq1hzB/D9VAQL7S3idutOumBNu84j5FyCdOBClCJTriE X9j40lj1swIxFlryTmBAtLHnEvN9M57N4LMb13yuSBaCawrv4fnron0JnUvfKpLU0CXTnpcM9hJNGv9Ekb4Ap43CZDPdeLVzEmj 39wCVtXPtMqBNCU6mPVBSeJCRHyRuQjY y0Sv5w7ME2LXhT8bEGHyE8yeuxddxvoG51STebu pb0mSp5n iKotUEn9h sA=:WH64twwKGqtE4pUorYOeGylONeXRsfG 3Qe3zAfpdrs=',
    '__atuvc': '65|35,2|36',
    'COOKIESTATUS': 'ON',
    'HIDECOOKIEBANNER': 'TRUE',
    'nlbi_2388351': 'jGGxMFazFBqnU x okRrFAAAAAC/AJ/k R2U vs5Q4LIRTS7',
    'nlbi_2388351_2147483392': 'PUildkEvtiZ9uje3okRrFAAAAABv1NR/7gPLX7Lc/iS5ei8N',
    'incap_ses_989_2388351': 'mWy Uq7aLX000xomDaO5DfTrFGMAAAAA6XmB42vG5CO6i609/RhyKg==',
    'incap_ses_468_2388351': 'sDNcR2labTHyNXYlUqx BipAFGMAAAAAImV2A07lGANZGfpvhvPlLg==',
    '__atuvs': '6314ec0cdbe92a78001',
    '_gat_gtag_UA_12825325_1': '1',
}

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:104.0) Gecko/20100101 Firefox/104.0',
    # fixed: the '+' in 'xhtml+xml' was lost in the original paste
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    # 'Accept-Encoding': 'gzip, deflate, br',
    'Referer': 'https://www.higheredjobs.com/admin/',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    # Requests doesn't support trailers
    # 'TE': 'trailers',
}

# Search filter: job category 141 = "Academic Advising".
params = {
    'JobCat': '141',
    'CatName': 'Academic Advising',
}

response = requests.get('https://www.higheredjobs.com/admin/search.cfm', params=params, cookies=cookies, headers=headers)

soup = BeautifulSoup(response.text, 'html.parser')
# Each job posting sits in a <div class="row record">; the date column uses
# "col-sm-5 text-sm-right".
job_title = soup.find_all('div', class_=["row record", "col-sm-5 text-sm-right"])
jobs_list = []
for i in job_title:
    # get_text(strip=True) drops the \n / \t padding that .text kept,
    # which is what produced the messy "Present output" below.
    name = i.get_text(strip=True)
    jobs_list.append(name)
df = pd.DataFrame({'Jobs title': jobs_list})

Present output:

df = 
Jobs title
0   \n\nRe-Sort\n\n\r\n\t\t\tResults 1 - 70 of 70\...
1   \n\n\r\n\t\t\t\t\t\t\t\t\t\t\tAssistant Profes...
2   \r\n\t\t\t\t\t\t\t\t\t\t\r\n\t\t\t\t\t\t\t\t\t...

Expected output:

df = 
     Jobs title                     Company name                   location         Posted
0   Assistant Professor/Associate  University of Southern Indiana  Evansville, IN   09/02/22
    Professor of Engineering, 
    Pott College of Science, 
    Engineering, and Education - F22057F1

CodePudding user response:

The main issue is that you are trying to create your DataFrame from unstructured data collected in your list.

So try to structure it first e.g. as dict, append it to your list and then create your DataFrame:

# Structure each job card as a dict so the DataFrame gets named columns.
columns = ['title', 'university', 'location', 'study', 'date']
jobs_list = [
    dict(zip(columns, card.stripped_strings))
    for card in soup.select('.row.record')
]

pd.DataFrame(jobs_list)

Note: If you would like to change the column headers, change this list -> ['title','university','location','study','date']

Example

import pandas as pd  # was missing: pd.DataFrame is used at the bottom
from bs4 import BeautifulSoup

# NOTE(review): the class attributes were stripped when this HTML was pasted
# (the tags read "<div >"); class="row record" is restored on the job cards so
# the '.row.record' selector below actually matches — verify the inner column
# classes against the live page markup.
html = '''
<div class="row record">
<div><a href="details.cfm?JobCode=178085874&amp;Title=Assistant Professor/Associate Professor of Engineering, Pott College of Science, Engineering, and Education - F22057F1">
                                            Assistant Professor/Associate Professor of Engineering, Pott College of Science, Engineering, and Education - F22057F1</a>
<br/>
                                        University of Southern Indiana <br/>
                                            Evansville, IN 
                                    </div>
<div>
                                        
                                        Electrical Engineering 
                                            <br/> Posted 09/02/22<br/>
</div>
</div>
<div class="row record">
<div>
<a href="details.cfm?JobCode=178085843&amp;Title=Assistant Professor of Engineering F99507">
                                            Assistant Professor of Engineering F99507</a>
<br/>
                                        McNeese State University <br/>
                                            Lake Charles, LA 
                                    </div>
<div>
                                        
                                        Electrical Engineering 
                                            <br/> Posted 09/02/22<br/>
</div>
</div>
'''
# Explicit parser avoids bs4's GuessedAtParserWarning.
soup = BeautifulSoup(html, 'html.parser')

# One dict per job card -> DataFrame columns come out named and aligned.
jobs_list = []
for i in soup.select('.row.record'):
    jobs_list.append(dict(zip(['title', 'university', 'location', 'study', 'date'],
                              i.stripped_strings)))

pd.DataFrame(jobs_list)

Output

title university location study date
0 Assistant Professor/Associate Professor of Engineering, Pott College of Science, Engineering, and Education - F22057F1 University of Southern Indiana Evansville, IN Electrical Engineering Posted 09/02/22
1 Assistant Professor of Engineering F99507 McNeese State University Lake Charles, LA Electrical Engineering Posted 09/02/22

CodePudding user response:

The following is a complete example of how you can extract the jobs under 'Academic Advising' from that website:

import requests
import pandas as pd
from bs4 import BeautifulSoup as bs
from tqdm import tqdm

headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36'
}
# A Session reuses the connection and keeps cookies across the paged requests.
s = requests.Session()
s.headers.update(headers)
big_list = []
# 1337 postings at the time of writing, fetched 100 per page via StartRow.
for x in tqdm(range(1, 1337, 100)):
    url = f'https://www.higheredjobs.com/admin/search.cfm?JobCat=141&StartRow={x}&SortBy=4&NumJobs=100'
    r = s.get(url)
    soup = bs(r.text, 'html.parser')
    # NOTE(review): the pasted selector read "div[]" — its attribute filter was
    # lost in the copy; 'div.row.record' matches the job cards per the
    # question's markup — confirm against the live page.
    jobs = soup.select_one('div#js-results').select('div.row.record')
    for job in jobs:
        job_title = job.select_one('a').get_text(strip=True)
        job_url = job.select_one('a').get('href')
        big_list.append((job_title, job_url))
# set() de-duplicates any posting that appears on two pages.
df = pd.DataFrame(list(set(big_list)), columns=['Job', 'Url'])
print(df)

Result is a dataframe with all those jobs (1337):

Job Url
0   Director, Usha Kundu, MD College of Health Adv...   details.cfm?JobCode=178071028&Title=Director%2...
1   Academic Advisor, College of Natural, Behavior...   details.cfm?JobCode=178061977&Title=Academic%2...
2   Part-Time Academic Advisor for EAP & Foreign L...   details.cfm?JobCode=177870235&Title=Part-Tim...
3   Student Service Assistant ll (Temp) details.cfm?JobCode=178044985&Title=Student ...
4   On-Call Academic Advisor (Applicant Pool)   details.cfm?JobCode=177522145&Title=On-Call%...
... ... ...
1332    Part-Time Academic Support Coach    details.cfm?JobCode=178060131&Title=Part-Tim...
1333    Academic Advisor    details.cfm?JobCode=178005430&Title=Academic%2...
1334    Retention Coordinator/Academic Advisor  details.cfm?JobCode=178077784&Title=Retention%...
1335    P220178 - Academic Advisor, School of Public H...   details.cfm?JobCode=177930648&Title=P220178 ...
1336    Director of Academic Advising - Georgetown Uni...   details.cfm?JobCode=178021588&Title=Director%2...

CodePudding user response:

To remove whitespace such as \n and \t, call the get_text() method with strip=True instead of using the .text attribute:

name = i.get_text(strip=True)
  • Related