Error while web scraping full job descriptions across multiple pages using BeautifulSoup


Would be great to get your help/input on this!

I'm trying to scrape job info from indeed. Everything in the code was working fine until I tried to get full job descriptions across multiple pages using the job href. Now I keep getting the following error:

job_link = job_posts.find(name="a", class_="jcs-JobTitle").get("href")
AttributeError: 'NoneType' object has no attribute 'get'

Please find the code below:

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Extract function
def extract(page):
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 \
        (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36"}
    url = f"https://uk.indeed.com/jobs?q=data analyst £30,000&l=London, Greater London&jt=fulltime&start={page}"
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, "html.parser")
    return soup

# Transform function
def transform(soup):
    # Get list of all job_postings
    job_postings = soup.find_all(name="div", class_="slider_item")

    # Get job elements
    for job_posts in job_postings:
        job_title = job_posts.select_one("a span[title]").text
        company_name = job_posts.find(name="span", class_="companyName").text
        try:
            salary = job_posts.find(name="div", class_="salary-snippet").find("span").getText()
        except:
            salary = "n/a"
        summary_text = job_posts.find(name="div", class_="job-snippet").text.replace("\n", "")

        # full job descriptions
        job_link = job_posts.find(name="a", class_="jcs-JobTitle").get("href")
        absolute_link = 'https://uk.indeed.com' + job_link

        job_desc_r = requests.get(absolute_link)
        job_desc_data = job_desc_r.text
        job_desc_soup = BeautifulSoup(job_desc_data, "html.parser")

        full_description = [item.text for item in
                            job_desc_soup.find(name="div", class_="jobsearch-jobDescriptionText").find_all("li")]

        # Append jobs to job list
        job = {
                'Job Title': job_title,
                'Company': company_name,
                'Salary': salary,
                'Summary': summary_text,
                'Full Descriptions': full_description
                }

        job_list.append(job)
    return


job_list = []

# Loop across multiple pages 
for page_num in range(0, 40, 10):
    extract_output = extract(page_num)
    transform(extract_output)

I've tried fixing the error using try and except (see below), but this has resulted in a lot of empty job descriptions.

        # Get full job descriptions
        try:
            job_link = job_posts.find(name="a", class_="jcs-JobTitle").get("href")
        except:
            job_link = ""
        absolute_link = 'https://uk.indeed.com' + job_link

        # Connect to each job's webpage via its link
        job_desc_r = requests.get(absolute_link)
        job_desc_data = job_desc_r.text
        job_desc_soup = BeautifulSoup(job_desc_data, "html.parser")

        try:
            full_description = [item.text for item in
                                job_desc_soup.find(name="div", class_="jobsearch-jobDescriptionText").find_all("li")]
        except:
            full_description = ""

Thank you in advance! P.S. I'm using PyCharm CE on my Mac.

CodePudding user response:

Without looking at the raw scraped data, I am almost certain why the error is occurring. It is highly likely that some of the anchor elements you are retrieving do not contain a link in the href attribute.

HTML does not require that an href attribute contain a hyperlink. For example, href="javascript:this.classList.add('class')" is a valid href that contains no link at all.

Instead of treating every anchor element in job_posts as a job link, check whether the href actually contains a valid hyperlink before following it:

if "http" in href:
  #execute code
  pass
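
Note that Indeed's job links are usually relative paths rather than absolute URLs, so a guard that also handles a missing anchor might look like this. This is only a sketch, assuming the same class name as in the question's code; urllib.parse.urljoin resolves both relative and absolute hrefs:

from urllib.parse import urljoin

anchor = job_posts.find("a", class_="jcs-JobTitle")
href = anchor.get("href") if anchor is not None else None
if href:
    # urljoin resolves relative paths like "/rc/clk?..." against the base URL
    absolute_link = urljoin("https://uk.indeed.com", href)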

CodePudding user response:

It means that no matching <a> tag was found at all: find() returned None, and calling .get("href") on None raises the AttributeError.
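
A minimal illustration of that failure mode:

from bs4 import BeautifulSoup

soup = BeautifulSoup("<div class='slider_item'>no anchor here</div>", "html.parser")
anchor = soup.find("a", class_="jcs-JobTitle")
print(anchor)  # prints None: no matching <a> tag in this snippet
# anchor.get("href")  # AttributeError: 'NoneType' object has no attribute 'get'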

You could use the try/except much the same way you used it for salary:

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Extract function
def extract(page):
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 \
        (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36"}
    url = f"https://uk.indeed.com/jobs?q=data analyst £30,000&l=London, Greater London&jt=fulltime&start={page}"
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, "html.parser")
    return soup

# Transform function
def transform(soup):
    # Get list of all job_postings
    job_postings = soup.find_all(name="div", class_="slider_item")

    # Get job elements
    for job_posts in job_postings:
        job_title = job_posts.select_one("a span[title]").text
        company_name = job_posts.find(name="span", class_="companyName").text
        print(company_name, job_title)
        try:
            salary = job_posts.find(name="div", class_="salary-snippet").find("span").getText()
        except:
            salary = "n/a"
        summary_text = job_posts.find(name="div", class_="job-snippet").text.replace("\n", "")

        # full job descriptions
        try:
            job_link = job_posts.find(name="a", class_="jcs-JobTitle").get("href")
            absolute_link = 'https://uk.indeed.com' + job_link
            job_desc_r = requests.get(absolute_link)
            job_desc_data = job_desc_r.text
            job_desc_soup = BeautifulSoup(job_desc_data, "html.parser")

            full_description = [item.text for item in
                                job_desc_soup.find(name="div", class_="jobsearch-jobDescriptionText").find_all("li")]
        
        except Exception as e:
            print(e)
            full_description = 'N/A'
            

        # Append jobs to job list
        job = {
                'Job Title': job_title,
                'Company': company_name,
                'Salary': salary,
                'Summary': summary_text,
                'Full Descriptions': full_description
                }

        job_list.append(job)
    return


job_list = []

# Loop across multiple pages 
for page_num in range(0, 40, 10):
    extract_output = extract(page_num)
    transform(extract_output)
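
pandas is imported but never used; presumably the intent is to build a DataFrame from job_list once the loop finishes. A minimal sketch (the filename is illustrative):

# Build a DataFrame from the collected jobs and save it (filename is illustrative)
df = pd.DataFrame(job_list)
df.to_csv("indeed_jobs.csv", index=False)
print(df.head())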