Home > Software engineering >  Difficulty Selecting Element using Selenium
Difficulty Selecting Element using Selenium

Time:07-15

Glassdoor Job Search Link

Currently attempting to select the "sector" for a job posting. Not having any luck with XPath or CSS selectors thus far — any assistance would be appreciated! My code is already iterating through each job posting successfully and pulling company name, location, job description, etc. My code is below.

Original code credit: Omer Sakarya and Ken Jee.

Here are some of my attempts:

'.//span[@]'

 './/div[@id="EmpBasicInfo"]//div[@]/div[5]/span[@]'

'.//div[@]//span[text()="Sector"]//following-sibling::*'

Glassdoor Job Posting/Sector Element

from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
from selenium import webdriver
import time
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.common.alert import Alert
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def get_jobs(keyword, num_jobs, verbose, path, slp_time):
    """Scrape up to ``num_jobs`` Glassdoor job postings matching ``keyword``.

    Parameters
    ----------
    keyword : str
        Search term spliced into the Glassdoor jobs URL.
    num_jobs : int
        Target number of postings to collect before stopping.
    verbose : bool
        When True, print every scraped field (debugging aid).
    path : str
        Filesystem path to the chromedriver executable.
    slp_time : float
        Seconds to sleep after each page load; tune to connection speed.

    Returns
    -------
    pandas.DataFrame
        One row per posting (title, salary, description, rating, company
        details...). Fields that could not be located are recorded as -1.
    """
    # Initializing the webdriver
    options = webdriver.ChromeOptions()

    # Uncomment the line below if you'd like to scrape without a new Chrome window every time.
    # options.add_argument('headless')

    # Change the path to where chromedriver is in your home folder.
    driver = webdriver.Chrome(executable_path=path, options=options)
    driver.set_window_size(1120, 1000)

    # BUG FIX: the '+' operators had been stripped from this concatenation,
    # which is a SyntaxError ('str' literal followed by a bare name).
    url = 'https://www.glassdoor.com/Job/' + keyword + '-jobs-SRCH_KO0,14.htm'
    driver.get(url)
    jobs = []

    while len(jobs) < num_jobs:  # If true, should be still looking for new jobs.

        # Let the page load. Change this number based on your internet speed.
        # Or, wait until the webpage is loaded, instead of hardcoding it.
        time.sleep(slp_time)

        # Test for the "Sign Up" prompt and get rid of it.
        try:
            driver.find_element(By.CSS_SELECTOR, '[data-selected="true"]').click()
        except (ElementClickInterceptedException, NoSuchElementException):
            # BUG FIX: also swallow NoSuchElementException — when the prompt
            # is simply absent, the original crashed here instead of moving on.
            pass

        time.sleep(.1)

        try:
            driver.find_element(By.XPATH, './/div[@id="JAModal"]//span[@alt="Close"]').click()
        except NoSuchElementException:
            pass

        # Going through each job in this page
        # jl for Job Listing. These are the buttons we're going to click.
        job_buttons = driver.find_elements(By.CSS_SELECTOR, '[data-test="job-link"]')
        for job_button in job_buttons:

            # BUG FIX: restored the missing '+' operators in this print.
            print("Progress: {}".format("" + str(len(jobs)) + "/" + str(num_jobs)))
            if len(jobs) >= num_jobs:
                break

            job_button.click()
            time.sleep(1)
            collected_successfully = False

            # NOTE(review): many XPaths below contain bare '[@]' predicates —
            # the class-attribute tests were lost when the code was pasted.
            # Restore the original predicates (e.g. [@class="..."]) before
            # running; as written these are invalid XPath expressions.
            while not collected_successfully:
                try:
                    company_name = driver.find_element(By.XPATH, './/div[@]').text
                    location = driver.find_element(By.XPATH, './/div[@]').text
                    job_title = driver.find_element(By.XPATH, './/div[contains(@class, "css-1j389vi e1tk4kwz2")]').text
                    job_description = driver.find_element(By.XPATH, './/div[@]').text
                    collected_successfully = True
                except NoSuchElementException:
                    # BUG FIX: narrowed from a bare `except` so real failures
                    # (KeyboardInterrupt, a dead driver session, ...) are not
                    # retried forever; missing elements still get a 5 s retry.
                    time.sleep(5)

            try:
                salary_estimate = driver.find_element(By.XPATH, './/span[@]').text
            except NoSuchElementException:
                salary_estimate = -1  # You need to set a "not found" value. It's important.

            try:
                rating = driver.find_element(By.CSS_SELECTOR, '[data-test="detailRating"]').text
            except NoSuchElementException:
                rating = -1  # You need to set a "not found" value. It's important.

            # Printing for debugging
            if verbose:
                print("Job Title: {}".format(job_title))
                print("Salary Estimate: {}".format(salary_estimate))
                print("Job Description: {}".format(job_description[:500]))
                print("Rating: {}".format(rating))
                print("Company Name: {}".format(company_name))
                print("Location: {}".format(location))

            # Going to the Company tab...
            # clicking on this:
            # <div data-tab-type="overview"><span>Company</span></div>
            try:
                driver.find_element(By.XPATH, './/div[@ and @data-tab-type="overview"]').click()

                try:
                    # <div>
                    #     <label>Headquarters</label>
                    #     <span>San Francisco, CA</span>
                    # </div>
                    headquarters = driver.find_element(By.XPATH, './/div[@]//label[text()="Headquarters"]//following-sibling::*').text
                except NoSuchElementException:
                    headquarters = -1

                try:
                    size = driver.find_element(By.XPATH, './/div[@id="EmpBasicInfo"]//div[@]/div[1]/span[@]').text
                except NoSuchElementException:
                    size = -1

                try:
                    founded = driver.find_element(By.XPATH, './/div[@]//span[text()="Founded"]//following-sibling::*').text
                except NoSuchElementException:
                    founded = -1

                try:
                    type_of_ownership = driver.find_element(By.XPATH, './/div[@]//label[text()="Type"]//following-sibling::*').text
                except NoSuchElementException:
                    type_of_ownership = -1

                try:
                    industry = driver.find_element(By.XPATH, './/div[@id="EmpBasicInfo"]//div[@]/div[4]/span[@]').text
                except NoSuchElementException:
                    industry = -1

                try:
                    sector = driver.find_element(By.XPATH, ".//div[@id='EmpBasicInfo']//div[@class='d-flex flex-wrap']/div[5]/span[@class='css-1pldt9b e1pvx6aw1']//following-sibling::*").text
                except NoSuchElementException:
                    sector = -1

                try:
                    revenue = driver.find_element(By.XPATH, './/span[@]').text
                except NoSuchElementException:
                    revenue = -1

                try:
                    competitors = driver.find_element(By.XPATH, './/div[@]//label[text()="Competitors"]//following-sibling::*').text
                except NoSuchElementException:
                    competitors = -1

            except NoSuchElementException:  # Rarely, some job postings do not have the "Company" tab.
                headquarters = -1
                size = -1
                founded = -1
                type_of_ownership = -1
                industry = -1
                sector = -1
                revenue = -1
                competitors = -1

            if verbose:
                print("Headquarters: {}".format(headquarters))
                print("Size: {}".format(size))
                print("Founded: {}".format(founded))
                print("Type of Ownership: {}".format(type_of_ownership))
                print("Industry: {}".format(industry))
                print("Sector: {}".format(sector))
                print("Revenue: {}".format(revenue))
                print("Competitors: {}".format(competitors))
                print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")

            # add job to jobs
            jobs.append({"Job Title": job_title,
                         "Salary Estimate": salary_estimate,
                         "Job Description": job_description,
                         "Rating": rating,
                         "Company Name": company_name,
                         "Location": location,
                         "Headquarters": headquarters,
                         "Size": size,
                         "Founded": founded,
                         "Type of ownership": type_of_ownership,
                         "Industry": industry,
                         "Sector": sector,
                         "Revenue": revenue,
                         "Competitors": competitors})

        # Clicking on the "next page" button
        try:
            # BUG FIX: `.click` without parentheses only referenced the bound
            # method and never clicked, so pagination silently did nothing.
            driver.find_element(By.CSS_SELECTOR, "[alt='next-icon']").click()
        except NoSuchElementException:
            print("Scraping terminated before reaching target number of jobs. Needed {}, got {}.".format(num_jobs, len(jobs)))
            break

    return pd.DataFrame(jobs)  # This line converts the list of dicts into a pandas DataFrame.

CodePudding user response:

You have to use the XPath below for the job posting:

//a[contains(@data-test, 'post-jobs')]

CodePudding user response:

It looks like I was missing the click-through to the other tab on the page:

driver.find_element(By.XPATH,'.//div[@]').click()

Thanks all that attempted to assist!

  • Related