Would be great to get your help/input on this!
I'm trying to scrape job info from Indeed. Everything in the code was working fine until I tried to get full job descriptions across multiple pages using the job href. Now I keep getting the following error:
job_link = job_posts.find(name="a", class_="jcs-JobTitle").get("href")
AttributeError: 'NoneType' object has no attribute 'get'
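As far as I can tell, the error means that find() returned None, i.e. BeautifulSoup found no matching tag, so there was nothing to call .get("href") on. A minimal snippet (just for illustration) reproduces it:

from bs4 import BeautifulSoup

soup = BeautifulSoup("<div class='slider_item'></div>", "html.parser")
anchor = soup.find(name="a", class_="jcs-JobTitle")
print(anchor)  # None: the div contains no matching <a> tag
anchor.get("href")  # raises AttributeError: 'NoneType' object has no attribute 'get'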
Please find the code below:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Extract function
def extract(page):
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36"
    }
    url = f"https://uk.indeed.com/jobs?q=data analyst £30,000&l=London, Greater London&jt=fulltime&start={page}"
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, "html.parser")
    return soup
# Transform function
def transform(soup):
    # Get list of all job postings on the page
    job_postings = soup.find_all(name="div", class_="slider_item")
    # Get job elements
    for job_posts in job_postings:
        job_title = job_posts.select_one("a span[title]").text
        company_name = job_posts.find(name="span", class_="companyName").text
        try:
            salary = job_posts.find(name="div", class_="salary-snippet").find("span").getText()
        except AttributeError:
            salary = "n/a"
        summary_text = job_posts.find(name="div", class_="job-snippet").text.replace("\n", "")
        # Full job descriptions
        job_link = job_posts.find(name="a", class_="jcs-JobTitle").get("href")
        absolute_link = 'https://uk.indeed.com' + job_link
        job_desc_r = requests.get(absolute_link)
        job_desc_data = job_desc_r.text
        job_desc_soup = BeautifulSoup(job_desc_data, "html.parser")
        full_description = [item.text for item in
                            job_desc_soup.find(name="div", class_="jobsearch-jobDescriptionText").find_all("li")]
        # Append job to job list
        job = {
            'Job Title': job_title,
            'Company': company_name,
            'Salary': salary,
            'Summary': summary_text,
            'Full Descriptions': full_description
        }
        job_list.append(job)
    return
job_list = []

# Loop across multiple pages
for page_num in range(0, 40, 10):
    extract_output = extract(page_num)
    transform(extract_output)
I've tried fixing the error using try and except (see below), but this has resulted in a lot of empty job descriptions.
# Get full job descriptions
try:
    job_link = job_posts.find(name="a", class_="jcs-JobTitle").get("href")
except AttributeError:
    job_link = ""
absolute_link = 'https://uk.indeed.com' + job_link
# For each job, connect to its page via the link
job_desc_r = requests.get(absolute_link)
job_desc_data = job_desc_r.text
job_desc_soup = BeautifulSoup(job_desc_data, "html.parser")
try:
    full_description = [item.text for item in
                        job_desc_soup.find(name="div", class_="jobsearch-jobDescriptionText").find_all("li")]
except AttributeError:
    full_description = ""
Thank you in advance! P.S. I'm using PyCharm CE on my Mac.
CodePudding user response:
Without looking at the raw web scrape data, I am almost certain I understand why the error is occurring. It is highly likely that some of the anchor elements you are retrieving do not contain a link in the href attribute.
HTML does not specify that href attributes are exclusively for hyperlinks. For example, an href could be href="this.classList.add('class')" - such an href contains no link.
Instead of building your job_posts columns from every anchor element, check whether the href actually contains a valid hyperlink first:
if "http" in href:
    # execute code
    pass
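Applied to the code in the question, a guard along those lines might look like the sketch below (the None check is my addition, since find() can also return no anchor at all, and Indeed's job links are typically relative rather than absolute):

anchor = job_posts.find(name="a", class_="jcs-JobTitle")
href = anchor.get("href", "") if anchor is not None else ""
if "http" in href or href.startswith("/"):
    # Relative links need the site prefix; absolute ones can be used as-is
    absolute_link = href if href.startswith("http") else 'https://uk.indeed.com' + href
    # ... fetch and parse the description page as before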
CodePudding user response:
It means that there is no href attribute in the <a> tag, or in fact that no matching <a> tag was found at all.
You could use try/except much the same way you used it for salary:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Extract function
def extract(page):
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36"
    }
    url = f"https://uk.indeed.com/jobs?q=data analyst £30,000&l=London, Greater London&jt=fulltime&start={page}"
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, "html.parser")
    return soup
# Transform function
def transform(soup):
    # Get list of all job postings on the page
    job_postings = soup.find_all(name="div", class_="slider_item")
    # Get job elements
    for job_posts in job_postings:
        job_title = job_posts.select_one("a span[title]").text
        company_name = job_posts.find(name="span", class_="companyName").text
        print(company_name, job_title)
        try:
            salary = job_posts.find(name="div", class_="salary-snippet").find("span").getText()
        except AttributeError:
            salary = "n/a"
        summary_text = job_posts.find(name="div", class_="job-snippet").text.replace("\n", "")
        # Full job descriptions: wrap the whole lookup so a missing
        # anchor or missing description div falls through to 'N/A'
        try:
            job_link = job_posts.find(name="a", class_="jcs-JobTitle").get("href")
            absolute_link = 'https://uk.indeed.com' + job_link
            job_desc_r = requests.get(absolute_link)
            job_desc_data = job_desc_r.text
            job_desc_soup = BeautifulSoup(job_desc_data, "html.parser")
            full_description = [item.text for item in
                                job_desc_soup.find(name="div", class_="jobsearch-jobDescriptionText").find_all("li")]
        except Exception as e:
            print(e)
            full_description = 'N/A'
        # Append job to job list
        job = {
            'Job Title': job_title,
            'Company': company_name,
            'Salary': salary,
            'Summary': summary_text,
            'Full Descriptions': full_description
        }
        job_list.append(job)
    return

job_list = []

# Loop across multiple pages
for page_num in range(0, 40, 10):
    extract_output = extract(page_num)
    transform(extract_output)
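Since pandas is already imported, presumably to tabulate the results, you could finish by loading job_list into a DataFrame and saving it (the filename here is just an example):

# Collect the scraped jobs into a DataFrame and export them
df = pd.DataFrame(job_list)
print(df.head())
df.to_csv("indeed_jobs.csv", index=False)  # example output file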