How to extract a job description (web scraping) from a site using Python - CodePudding

I tried to extract job descriptions from a job site. I got all the details except the job description. I'm attaching my code and details below. With this code I get the company name, location, and some other data separately. In the same way, I need the full job description for each job. When I run the code, nothing gets appended to Job_Description.

import requests
from bs4 import BeautifulSoup
# Browser-like User-Agent sent with every request below.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}
url = "https://in.indeed.com/jobs?q=software engineer&l=Kerala&sort=date&vjk=ce1481bc5c182a25"


def pagination_hrefs(page_soup):
    """Return the href of every link inside the page's pagination bar.

    Returns an empty list when the page has no ``div.pagination`` element,
    rather than failing with AttributeError on ``None``.
    """
    pagination = page_soup.find("div", {"class": "pagination"})
    if pagination is None:
        return []
    # href=True skips anchors without an href, which would raise KeyError.
    return [tag["href"] for tag in pagination.find_all("a", href=True)]


def text_blocks(page_soup, tag_name, css_class):
    """Return one list of stripped text fragments per matching element."""
    return [
        [line.strip() for line in block.stripped_strings]
        for block in page_soup.find_all(tag_name, class_=[css_class], style=None)
    ]


soup = BeautifulSoup(requests.get(url, headers=headers).content, "html.parser")
Page = pagination_hrefs(soup)

# The accumulators are created once, outside the page loop, so the results of
# every fetched page are kept instead of only those of the last page.
Job_Ids = []
Company_Name = []
Location = []
Job_Description = []

for start in range(0, 20, 10):  # requests the listing with start=0 and start=10
    website = f'https://in.indeed.com/jobs?q=software engineer&l=Kerala&sort=date&start={start}'
    soup = BeautifulSoup(requests.get(website, headers=headers).content, "html.parser")
    # Merge the pagination links of this page into the de-duplicated set.
    Page = sorted(set(Page + pagination_hrefs(soup)))

    # Anchors whose id starts with "job_" carry the job id after the last "_".
    Job_Ids.extend(job["id"].split("_")[-1] for job in soup.select('a[id^="job_"]'))
    Company_Name.extend(text_blocks(soup, 'span', 'companyName'))
    Location.extend(text_blocks(soup, 'div', 'companyLocation'))
    # NOTE: this is the selector reported to collect nothing when it is run
    # against the search-results pages.
    Job_Description.extend(
        text_blocks(soup, 'div', 'jobsearch-JobComponent-description icl-u-xs-mt--md')
    )

CodePudding user response：

Since you are working with the paginated search results of indeed.com, you are not going to get the full job description unless you select the job and go into it.

With that said, I believe what you are looking for is the job snippet, which would give you the results you are looking for based on the search criteria in your code.

# Collect the short description ("job snippet") shown on each search result:
# one list of stripped text fragments per matching <div>.
Job_Description.extend(
    [fragment.strip() for fragment in snippet.stripped_strings]
    for snippet in soup.find_all('div', class_=['job-snippet'], style=None)
)

Based on what you are looking for, I think you actually want to get all of the data instead of just the snippet, so I would consider doing it this way.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import requests, json
from bs4 import BeautifulSoup

# JavaScript assignment that precedes the embedded job-card JSON in the page.
JOB_CARDS_MARKER = 'window.mosaic.providerData["mosaic-provider-jobcards"]='


def parse_job_cards(html):
    """Return the job-card data embedded in a search-results page.

    Scans *html* line by line for the ``JOB_CARDS_MARKER`` assignment and
    decodes the JSON object that follows it on the same line.

    Args:
        html: the page source as a single string.

    Returns:
        The decoded JSON value (a dict for the expected page layout).

    Raises:
        ValueError: if no line contains the marker, or if the text after
            the marker is not valid JSON (``json.JSONDecodeError`` is a
            subclass of ``ValueError``).
    """
    for row in html.split('\n'):
        if JOB_CARDS_MARKER not in row:
            continue
        _, _, payload = row.partition(JOB_CARDS_MARKER)
        # Drop only the trailing statement terminator. Removing every ';'
        # would corrupt string values inside the JSON that contain one.
        return json.loads(payload.strip().rstrip(';'))
    raise ValueError('job-card data not found in page: {!r} is missing'.format(JOB_CARDS_MARKER))


def main():
    """Print every job of the search page, with its full description, as JSON.

    Fetches the search-results page, reads the job list embedded in it,
    then requests each job's own page (``viewJobLink``) and stores the text
    of its ``jobsearch-jobDescriptionText`` block under ``full_description``.
    """
    url = "https://in.indeed.com/jobs?q=software engineer&l=Kerala&sort=date&vjk=ce1481bc5c182a25"
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'}
    soup = BeautifulSoup(requests.get(url, headers=headers).content, "html.parser")
    job_card_data = parse_job_cards(str(soup))
    job_list = []
    for job in job_card_data['metaData']['mosaicProviderJobCardsModel']['results']:
        job_full_soup_url = 'https://in.indeed.com{}'.format(job['viewJobLink'])
        job_full_soup = BeautifulSoup(requests.get(job_full_soup_url, headers=headers).content, "html.parser")
        # If several blocks match, the last one wins; if none match, the key
        # is left unset.
        for div_block in job_full_soup.find_all('div', class_=['jobsearch-jobDescriptionText'], style=None):
            job['full_description'] = [line.strip() for line in div_block.stripped_strings]
        job_list.append(job)
    print(json.dumps(job_list, indent=4))

if __name__ == '__main__':
    main()