I just started learning how to use Python to explore scraping a jobs portal site, so please bear with me if I ask very fundamental questions.
Situation: I've managed to build out the following lines:
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
driver = webdriver.Chrome('C:/Users/ - Home/Desktop/Web Scraper/chromedriver.exe')
driver.get('https://www.mycareersfuture.gov.sg/search?sortBy=relevancy&page=0')
results = []
content = driver.page_source
soup = BeautifulSoup(content, 'html.parser')
listing = soup.find('div', class_='card-list')
job = listing.find('p')
print(job)
Complication: I can't seem to extract the following items from the job card:
- Job title
- Company name
- Salary
I've looked up several tutorials, and each of them says to look for h2 tags or divs with the relevant classes. However, the site I'm scraping doesn't appear to use those.
Link to site: https://www.mycareersfuture.gov.sg/search?sortBy=relevancy&page=0
For example, I've inspected the HTML and found the job title to be somewhere in this line; however, I just can't seem to extract it.
<span data-cy="job-card__job-title" style="overflow-wrap: break-word;">2402 - IT Manager [ Amber Rd / / 5 days ]</span>
I would really appreciate any help on this. I've been researching solutions all night, but to no avail...
CodePudding user response:
One of the possible solutions:
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
options = webdriver.ChromeOptions()
# set headless mode
# options.add_argument("--headless")
# suppress the automation banner and chromedriver log messages in the console
options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
# use a raw string so backslashes in the Windows path aren't treated as escapes
service = Service(executable_path=r'path\to\your\chromedriver.exe')
driver = webdriver.Chrome(service=service, options=options)
# set an explicit wait (10 sec)
wait = WebDriverWait(driver, 10)
url = 'https://www.mycareersfuture.gov.sg/search?sortBy=relevancy&page=0'
# page where parsing will stop
last_page = 5225
# loads a web page
driver.get(url)
while True:
    try:
        # wait (up to 10 sec) until at least one element matching each selector is present on the page
        company_names = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'p[data-testid="company-hire-info"]')))
        job_titles = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'span[data-cy="job-card__job-title"]')))
        # NOTE: the attribute selector for the salary element was lost in this post; the value below
        # is a guess -- inspect a job card's salary element and substitute its actual attribute
        salaries = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div[data-cy="salary-range"]')))
    except TimeoutException:
        # on a timeout, refresh the page and try again
        driver.refresh()
        continue
    # pull the text out of the received web elements
    for company, title, salary in zip(company_names, job_titles, salaries):
        row = {
            'Company name': company.text,
            'Job title': title.text,
            'Salary': salary.text
        }
        # append the row to the csv
        with open(file='mycareersfuture.csv', mode='a', encoding="utf-8") as f:
            writer = csv.writer(f, lineterminator='\n')
            writer.writerow([row['Company name'], row['Job title'], row['Salary']])
    # wait until the "Next" button is present in the DOM, then click it
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'button[aria-label="Next"]'))).click()
    # stop once the current URL ends with the last page number
    if driver.current_url.endswith(str(last_page)):
        break
driver.quit()
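If you prefer to keep BeautifulSoup for the parsing step, the same data attributes work as CSS selectors there too. A minimal sketch (run it while the driver is still open, i.e. before driver.quit(); it reuses the data-cy/data-testid attributes shown above):
from bs4 import BeautifulSoup
# parse the page that Selenium has already rendered
soup = BeautifulSoup(driver.page_source, 'html.parser')
# match on the data attributes instead of h2 tags or classes
titles = [s.get_text(strip=True) for s in soup.select('span[data-cy="job-card__job-title"]')]
companies = [p.get_text(strip=True) for p in soup.select('p[data-testid="company-hire-info"]')]
print(titles[:3], companies[:3])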
Output mycareersfuture.csv:
THE SUPREME HR ADVISORY PTE. LTD.,2402 - IT Manager [ Amber Rd / / 5 days ],$6 500to$7 000
TRITON AI PTE. LTD.,"Property Executive, Town Council (Facilities Management)",$2 000to$3 000
PISTACHIO RESTAURANT PTE. LTD.,Service Crew / Supervisor,$1 700to$3 000
THE SUPREME HR ADVISORY PTE. LTD.,2402 - Quantity Surveyor [ Admiralty / 5 days ],$3 000to$3 500
THE SUPREME HR ADVISORY PTE. LTD.,2402 - WSH Co-ordinator [ 5 days / WSQ Advanced Cert ],$2 200to$3 500
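Since you already import pandas, you can load the finished CSV and split the Salary column into numeric bounds. A rough sketch, assuming every salary comes out in the "$6 500to$7 000" shape shown above (the file has no header row, hence the explicit names):
import pandas as pd
df = pd.read_csv('mycareersfuture.csv', names=['Company name', 'Job title', 'Salary'])
# "$6 500to$7 000" -> 6500.0 and 7000.0: drop "$" and spaces, then split on "to"
bounds = (df['Salary'].str.replace('[$ ]', '', regex=True)
                      .str.split('to', expand=True)
                      .astype(float))
df['Salary min'], df['Salary max'] = bounds[0], bounds[1]
print(df.head())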