I am trying to scroll webpage with Selenium- "https://jobsearch.az/vacancies". But, you see when you open it and click on vacancy, there are two pages side by side to scroll. I need to scroll the one in the middle, so selenium can go and take info from other jobs too. But, now, it stops when it comes to 14, which is all you can see if you dont scroll.
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time
import pandas as pd
from selenium.webdriver.common.action_chains import ActionChains
path = "C:/Users/nihad/OneDrive/Documents/aInternship/chromedriver.exe"
driver = webdriver.Chrome(path)
url = "https://jobsearch.az/vacancies"
driver.get(url)
time.sleep(10)
soup = BeautifulSoup(driver.page_source, 'html.parser')
lists = soup.find_all('div', {'class': 'list__item'})
jobs_list=[]
print('A')
x=True
while x:
print('B')
driver.maximize_window()
for index, _list in enumerate(lists):
link = _list.find(
'a', class_="list__item__text")["href"]
current_url = "https://jobsearch.az" link
driver.get(current_url)
time.sleep(10)
soup = BeautifulSoup(driver.page_source, 'html.parser')
jobs = soup.find_all('div', {'class': 'vacancy'})
for index, job in enumerate(jobs):
company = job.find( # div.a.
'div', class_="vacancy__start").text.strip()
print(f'Company:{company}')
category = job.find( # div.a.
'span', class_="company__industry").text
print(f'Category:{category}')
# key_requirements = job.find( # div.a.
# 'div', class_="content-text").text
# print(f'Key requirements:{key_requirements}')
job_title = job.find( # div.a.
'h1', class_="vacancy__title").text.strip()
print(f'Job title: {job_title}')
deadline = job.find( # div.a.
'span', class_="vacancy__deadline").text.strip()
print(f'Deadline: {deadline}')
views = _list.find('div', class_="list__item__end").text.strip()
print(f'Views: {views}')
data = {
"job_title":job_title,
"company":company,
"category":category,
"deadline":deadline,
"views":views
}
jobs_list.append(data)
driver.minimize_window()
print('C')
driver.find_element_by_xpath('//h3[@]').click()
driver.execute.script("window.scrollBy(0, document.body.scrollHeight)")
for i in range(5):
driver.find_element_by_tag_name('h3').send_keys(Keys.END)
time.sleep(4)
driver.execute_script("window.scrollBy(0, 1000)", "")
time.sleep(5)
print('after executing scrolling')
# element = driver.find_element_by_class_name('vacancy__title')
# actions = ActionChains(driver)
# actions.move_to_element(element).perform()
dataframe = pd.DataFrame(jobs_list)
dataframe
driver.close()
I wrote all 3 techniques, but nothing works.
CodePudding user response:
I don't know if you need to use selenium. But I have an example on another library - requests. Since the api uses cookies, we must use the session to pass the XSRF token.
import requests
import pandas as pd
jobs_list = []
def foo(url):
headers = {
'accept': 'application/json, text/plain, */*',
'x-requested-with': 'XMLHttpRequest'
}
response = session.request("GET", url, headers=headers)
for job in response.json()['items']:
data = {
"job_title": job['title'],
"company": job['company']['title'],
"category": job['category']['title'],
"deadline": job['deadline_at'],
"views": job['view_count']
}
jobs_list.append(data)
if 'next' in response.json():
foo(response.json()['next'])
session = requests.session()
response = session.get('https://jobsearch.az/vacancies')
foo('https://jobsearch.az/api-az/vacancies-az?hl=az')
dataframe = pd.DataFrame(jobs_list)
dataframe
OUTPUT:
job_title ... views
0 Revenue manager ... 356
1 Operator (Satış təmsilçisi) ... 236
2 Satıcı (xanım) ... 766
3 ADM ISO 9001 Beynəlxalq Sertifikat Proqramını ... ... 1.6K
4 Avto-sənayedə operator montajçısı ... 218
... ... ... ...
1656 Receptionist (gecə növbəsi) ... 735
1657 Складчик ... 400
1658 Android proqramçı ... 358
1659 Credit Risk Specialist ... 587
1660 İpoteka üzrə mütəxəssis, aparıcı mütəxəssis (B... ... 439
[1661 rows x 5 columns]