Selenium, Scrolling page


I am trying to scroll a webpage with Selenium: https://jobsearch.az/vacancies. When you open it and click on a vacancy, there are two panes side by side that scroll independently. I need to scroll the one in the middle (the vacancy list) so Selenium can go on and collect info from the other jobs too. Right now it stops at 14 vacancies, which is all you can see if you don't scroll.

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time
import pandas as pd 
from selenium.webdriver.common.action_chains import ActionChains

path = "C:/Users/nihad/OneDrive/Documents/aInternship/chromedriver.exe"
driver = webdriver.Chrome(path)

url = "https://jobsearch.az/vacancies"


driver.get(url)
time.sleep(10)

soup = BeautifulSoup(driver.page_source, 'html.parser')
lists = soup.find_all('div', {'class': 'list__item'})
jobs_list=[]
print('A')

x=True
while x:
    
    print('B')
    driver.maximize_window()
    for index, _list in enumerate(lists):

        link = _list.find(
            'a', class_="list__item__text")["href"]

        current_url = "https://jobsearch.az" + link

        driver.get(current_url)
        time.sleep(10)

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        jobs = soup.find_all('div', {'class': 'vacancy'})


        for index, job in enumerate(jobs):

            company = job.find(  # div.a.
                'div', class_="vacancy__start").text.strip()
            print(f'Company:{company}')

            category = job.find(  # div.a.
                'span', class_="company__industry").text
            print(f'Category:{category}')

#         key_requirements = job.find(  # div.a.
#             'div', class_="content-text").text
#         print(f'Key requirements:{key_requirements}')

            job_title = job.find(  # div.a.
                'h1', class_="vacancy__title").text.strip()
            print(f'Job title: {job_title}')

            deadline = job.find(  # div.a.
                'span', class_="vacancy__deadline").text.strip()
            print(f'Deadline: {deadline}')
            views = _list.find('div', class_="list__item__end").text.strip()
            print(f'Views: {views}')
            data = {
                    "job_title":job_title,
                    "company":company,
                    "category":category,
                    "deadline":deadline,
                    "views":views
                }
            jobs_list.append(data)

    driver.minimize_window()
    print('C')
    driver.find_element_by_xpath('//h3[@]').click()
    driver.execute_script("window.scrollBy(0, document.body.scrollHeight)")
    for i in range(5):
        driver.find_element_by_tag_name('h3').send_keys(Keys.END)
        time.sleep(4)
    driver.execute_script("window.scrollBy(0, 1000)", "")
    time.sleep(5)
    print('after executing scrolling')

    
    # element = driver.find_element_by_class_name('vacancy__title')
    # actions = ActionChains(driver)
    # actions.move_to_element(element).perform()

        




dataframe = pd.DataFrame(jobs_list)
    
dataframe


driver.close()

I tried all three scrolling techniques, but none of them works.
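For reference, when the scrollbar belongs to an inner container rather than the window, window.scrollBy on the document has no effect; the container element itself has to be scrolled. Below is a minimal sketch of that technique; the .list CSS selector and the driver path are placeholders, not taken from the page, so substitute whatever selector actually matches the middle pane:

import time
from selenium import webdriver

driver = webdriver.Chrome("C:/path/to/chromedriver.exe")  # placeholder path
driver.get("https://jobsearch.az/vacancies")
time.sleep(10)

# the scrollable pane; ".list" is a guess, inspect the page for the real selector
pane = driver.find_element_by_css_selector(".list")

last_height = driver.execute_script("return arguments[0].scrollHeight", pane)
while True:
    # scroll the container element itself, not the window
    driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", pane)
    time.sleep(2)  # give lazy-loaded vacancies time to render
    new_height = driver.execute_script("return arguments[0].scrollHeight", pane)
    if new_height == last_height:
        break  # height stopped growing, so the list is fully loaded
    last_height = new_height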

CodePudding user response:

I don't know if you need to use Selenium, but here is an example with another library, requests. Since the API uses cookies, we must use a session to pass the XSRF token.

import requests
import pandas as pd

jobs_list = []


def foo(url):
    headers = {
        'accept': 'application/json, text/plain, */*',
        'x-requested-with': 'XMLHttpRequest'  # mimic the site's own AJAX requests
    }
    response = session.request("GET", url, headers=headers)

    for job in response.json()['items']:
        data = {
            "job_title": job['title'],
            "company": job['company']['title'],
            "category": job['category']['title'],
            "deadline": job['deadline_at'],
            "views": job['view_count']
        }
        jobs_list.append(data)
    # the API paginates: follow the 'next' link until it disappears
    if 'next' in response.json():
        foo(response.json()['next'])


session = requests.session()
# the initial page load sets the session cookies (including the XSRF token) that the API expects
response = session.get('https://jobsearch.az/vacancies')
foo('https://jobsearch.az/api-az/vacancies-az?hl=az')
dataframe = pd.DataFrame(jobs_list)
dataframe
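Note the recursion at the end of foo: while more pages remain, the JSON response carries a next URL, so the function keeps calling itself until that key disappears. This is also why foo is seeded with the api-az endpoint rather than the HTML page.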

OUTPUT:

                                              job_title  ... views
0                                       Revenue manager  ...   356
1                           Operator (Satış təmsilçisi)  ...   236
2                                        Satıcı (xanım)  ...   766
3     ADM ISO 9001 Beynəlxalq Sertifikat Proqramını ...  ...  1.6K
4                     Avto-sənayedə operator montajçısı  ...   218
...                                                 ...  ...   ...
1656                        Receptionist (gecə növbəsi)  ...   735
1657                                           Складчик  ...   400
1658                                  Android proqramçı  ...   358
1659                             Credit Risk Specialist  ...   587
1660  İpoteka üzrə mütəxəssis, aparıcı mütəxəssis (B...  ...   439

[1661 rows x 5 columns]
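If the results need to be kept, the dataframe can be written out with a standard pandas call (the filename here is arbitrary):

# persist the scraped vacancies; the filename is arbitrary
dataframe.to_csv('vacancies.csv', index=False)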