Home > Software design >  Scraping after select from drop down list
Scraping after select from drop down list

Time:11-01

I'm trying to scrape data from the following web site:

https://shafafiyah.socpa.org.sa/EmployeeDetails.aspx

The issue is that the table I need is not showing, and I think that's because I need to select an option from a drop-down list before scraping.

I tried the scraping code below, but it returns null.

# Original attempt: fetch the page with requests and parse it with BeautifulSoup.
# Fixed defects: stray leading whitespace on several lines caused an
# IndentationError, and the User-Agent string literal was split across two
# physical lines (a SyntaxError). Note this plain GET still cannot see the
# table -- it only appears after the server-side postback (see answers below).
import requests
from bs4 import BeautifulSoup
import html5lib  # imported so BeautifulSoup's 'html5lib' parser backend is available

headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"
    )
}

URL2 = "https://shafafiyah.socpa.org.sa/EmployeeDetails.aspx"
r2 = requests.get(URL2, headers=headers)
soup2 = BeautifulSoup(r2.content, "html5lib")

How can I reveal the hidden tables?

CodePudding user response:

The data comes from a POST request that you need to mimic.

Here's how:

# Mimic the ASP.NET WebForms postback that populates the employees table,
# then let pandas extract the table from the response HTML.
#
# Fixes vs. the original answer:
#  * payload key "__VTATE" was a typo for "__VSTATE" -- the view state
#    harvested from the initial GET was never sent back under the key the
#    server reads, so the postback state round-trip was broken
#  * the body must be form-encoded (data=payload as a dict), not a JSON
#    string: WebForms postbacks expect application/x-www-form-urlencoded
#  * the now-unused `import json` was removed
import pandas as pd
import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:104.0) Gecko/20100101 Firefox/104.0",
    "Origin": "https://shafafiyah.socpa.org.sa",
    "Referer": "https://shafafiyah.socpa.org.sa/EmployeeDetails.aspx",
    # NOTE(review): this hard-coded session cookie will expire; the Session
    # below should normally pick up fresh cookies from the initial GET.
    "Cookie": "ASP.NET_SessionId=kipasiwnunzznecfoy0yhlju; BIGipServerVIS=319032330.47873.0000;",
}

url = "https://shafafiyah.socpa.org.sa/EmployeeDetails.aspx"

with requests.Session() as s:
    s.headers.update(headers)
    # Initial GET: harvest the hidden ASP.NET state fields that the server
    # validates on every postback.
    soup = BeautifulSoup(s.get(url).text, "lxml")
    hidden_field = soup.find(id="_TSM_HiddenField_")["value"]
    v_state = soup.find(id="__VSTATE")["value"]
    event_validation = soup.find(id="__EVENTVALIDATION")["value"]

    payload = {
        "__EVENTTARGET": "ctl00$MainContent$btnSearch",
        "__EVENTARGUMENT": "",
        "_TSM_HiddenField_": hidden_field,
        "__VSTATE": v_state,  # key was misspelled "__VTATE" in the original
        "__VIEWSTATE": "",
        "__EVENTVALIDATION": event_validation,
        # Firm id; valid values come from the <option value="..."> elements
        # in the page source (see mapping note below).
        "ctl00$MainContent$ddlAccountingFirm": "74",
        "ctl00$MainContent$lseAccountingFirm_ClientState": "",
    }
    # Form-encoded POST (requests URL-encodes the dict); the response HTML
    # contains the rendered employees table, which read_html extracts.
    df = pd.read_html(s.post(url, data=payload).text, flavor="lxml")
    print(df[0])

This should give you:

                                                     م  ...  الخبرة الوظيفية
0                                                    1  ...             21.0
1                                                    2  ...             33.0
2                                                    3  ...             14.0
3                                                    4  ...             22.0
4                                                    5  ...             17.0
..                                                 ...  ...              ...
96                                                  97  ...              4.0
97                                                  98  ...              4.0
98                                                  99  ...              4.0
99                                                 100  ...              5.0
100  م  رمز الموظف  نوع الوظيفة  الجنسية  الدرجة ال...  ...              NaN

[101 rows x 7 columns]

The values for ctl00$MainContent$ddlAccountingFirm are in the source HTML so you can build a mapping.

<option value="74"> إرنست ويونغ محاسبون قانونيون-45                  </option>
    <option value="204">أديب بن محمد ابانمي محاسبون ومراجعون قانونيون-418                 </option>
    <option value="345">آل ربيع محاسبون قانونيون واستشاريون-675</option>

CodePudding user response:

Please find below a solution using Selenium.

# Selenium alternative: drive a real browser, select every firm in the
# drop-down in turn, click Search, and scrape the per-firm summary block.
#
# Fixes vs. the original answer:
#  * `range(1, leanth 1)` was a SyntaxError (missing `+`); the upper bound
#    also over-ran the drop-down -- select_by_index is 0-based, so valid
#    indices are 0..len(options)-1 (index 0 is the placeholder, so we loop
#    1..len(options)-1)
#  * `df_master.append(df)` -- DataFrame.append was removed in pandas 2.0;
#    frames are collected in a list and concatenated once at the end
#  * unused imports (Keys, numpy) removed; `webdriver.Chrome(Path)` with a
#    positional path is deprecated -- Selenium finds chromedriver on PATH

import pandas as pd
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

driver = webdriver.Chrome()  # assumes chromedriver is on PATH
driver.get("https://shafafiyah.socpa.org.sa/EmployeeDetails.aspx")

# Count the drop-down options once; index 0 is the "select a firm" placeholder.
num_options = len(
    Select(driver.find_element(By.ID, "MainContent_ddlAccountingFirm")).options
)

frames = []
for idx in range(1, num_options):
    # Re-locate the <select> on every iteration: each postback replaces the
    # DOM, so a stale element reference would otherwise be used.
    select_list = Select(driver.find_element(By.ID, "MainContent_ddlAccountingFirm"))
    select_list.select_by_index(idx)

    # Trigger the search postback for the selected firm.
    driver.find_element(By.ID, "MainContent_btnSearch").click()

    try:
        summary = driver.find_element(By.ID, "MainContent_divEmployeeSummaryDetails")

        # First text line is the firm name; the remaining lines alternate
        # label / value, so pair them up into a dict.
        lines = summary.text.split("\n")
        firm_name = lines[0]
        pairs = dict(zip(it := iter(lines[1:]), it))

        # One row per firm: labels become columns, firm name goes in 'Date'.
        df = pd.DataFrame(pairs.items(), columns=["Date", "DateValue"])
        df = df.transpose().reset_index()
        df.columns = df.iloc[0]
        df = df.drop(0)
        df["Date"] = firm_name
        frames.append(df)
    except NoSuchElementException:
        # Some firms render no summary block -- skip them deliberately.
        pass

    # Reset the form before selecting the next firm.
    driver.find_element(By.ID, "MainContent_btnReset").click()

df_master = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame([])
  • Related