I'm trying to scrape data from the following web site:
The issue is that the table I need is not showing, and I think that's because I need to pick an item from a drop-down list before scraping.
I'm trying the scraping code below, but it returns null.
import requests
from bs4 import BeautifulSoup
import html5lib

# Fix: the User-Agent value must be one unbroken string literal — the original
# was wrapped across two physical lines, which is a SyntaxError in Python.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"
    )
}

URL2 = "https://shafafiyah.socpa.org.sa/EmployeeDetails.aspx"

# NOTE(review): a plain GET only returns the page shell — the employee table is
# injected by an ASP.NET post-back after a firm is chosen from the drop-down,
# so it will not appear in this response no matter how it is parsed.
r2 = requests.get(URL2, headers=headers)
soup2 = BeautifulSoup(r2.content, "html5lib")
How can I make the hidden table appear so I can scrape it?
CodePudding user response:
The data comes from a POST
request that you need to mimic.
Here's how:
import pandas as pd
import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:104.0) Gecko/20100101 Firefox/104.0",
    "Origin": "https://shafafiyah.socpa.org.sa",
    "Referer": "https://shafafiyah.socpa.org.sa/EmployeeDetails.aspx",
    # NOTE(review): a hard-coded session cookie expires quickly; the Session
    # below would also pick up fresh cookies from the initial GET — confirm
    # whether this header is still required.
    "Cookie": "ASP.NET_SessionId=kipasiwnunzznecfoy0yhlju; BIGipServerVIS=319032330.47873.0000;",
}

url = "https://shafafiyah.socpa.org.sa/EmployeeDetails.aspx"

with requests.Session() as s:
    s.headers.update(headers)

    # First GET: harvest the hidden ASP.NET state fields the post-back needs.
    soup = BeautifulSoup(s.get(url).text, "lxml")
    hidden_field = soup.find(id="_TSM_HiddenField_")["value"]
    v_state = soup.find(id="__VSTATE")["value"]
    event_validation = soup.find(id="__EVENTVALIDATION")["value"]

    payload = {
        "__EVENTTARGET": "ctl00$MainContent$btnSearch",
        "__EVENTARGUMENT": "",
        "_TSM_HiddenField_": hidden_field,
        # Fix: the field name is "__VSTATE" — the original's "__VTATE" typo
        # meant the state scraped above was never sent under the right key.
        "__VSTATE": v_state,
        "__VIEWSTATE": "",
        "__EVENTVALIDATION": event_validation,
        # "74" selects one accounting firm; see the <option> values in the
        # page source for the full value -> firm-name mapping.
        "ctl00$MainContent$ddlAccountingFirm": "74",
        "ctl00$MainContent$lseAccountingFirm_ClientState": "",
    }

    # Fix: an ASP.NET post-back expects form-encoded fields, which requests
    # produces when `data=` is given a dict. The original json.dumps() sent a
    # single JSON string body that the server cannot read as form fields.
    df = pd.read_html(s.post(url, data=payload).text, flavor="lxml")

print(df[0])
This should give you:
م ... الخبرة الوظيفية
0 1 ... 21.0
1 2 ... 33.0
2 3 ... 14.0
3 4 ... 22.0
4 5 ... 17.0
.. ... ... ...
96 97 ... 4.0
97 98 ... 4.0
98 99 ... 4.0
99 100 ... 5.0
100 م رمز الموظف نوع الوظيفة الجنسية الدرجة ال... ... NaN
[101 rows x 7 columns]
The values for ctl00$MainContent$ddlAccountingFirm
are in the source HTML
so you can build a mapping.
<option value="74"> إرنست ويونغ محاسبون قانونيون-45 </option>
<option value="204">أديب بن محمد ابانمي محاسبون ومراجعون قانونيون-418 </option>
<option value="345">آل ربيع محاسبون قانونيون واستشاريون-675</option>
CodePudding user response:
Please find below a solution using Selenium:
# modules
import pandas as pd
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import numpy as np

# driver
path = "chromedriver"
driver = webdriver.Chrome(path)
driver.get("https://shafafiyah.socpa.org.sa/EmployeeDetails.aspx")

# access the drop-down list of accounting firms
dropdown = driver.find_element(By.ID, "MainContent_ddlAccountingFirm")
select_list = Select(dropdown)
length = len(select_list.options)

# Loop over every real option. Index 0 is the placeholder entry, so the firms
# occupy indices 1 .. length-1; range(1, length) covers exactly those.
# (The original `range(1,leanth 1)` was a SyntaxError — stripped "+" — and,
# once repaired to length+1, would raise on the final, out-of-range index.)
frames = []
for x in range(1, length):
    # Re-locate the <select> each pass: the post-back replaces the DOM, so the
    # previous element reference goes stale.
    dropdown = driver.find_element(By.ID, "MainContent_ddlAccountingFirm")
    Select(dropdown).select_by_index(x)

    # search
    driver.find_element(By.ID, "MainContent_btnSearch").click()

    try:
        # container that holds this firm's employee summary
        summary = driver.find_element(By.ID, "MainContent_divEmployeeSummaryDetails")

        # First line is the firm name; the remaining lines alternate
        # label / value, so pair them up into a dict.
        result = summary.text.split("\n")
        name_com = result[0]
        new_result = result[1:]
        data = dict(zip(i := iter(new_result), i))

        # one wide row per firm: labels become columns, firm name in 'Date'
        df = pd.DataFrame(data.items(), columns=["Date", "DateValue"])
        df = df.transpose().reset_index()
        df.columns = df.iloc[0]
        df = df.drop(0)
        df["Date"] = name_com
        frames.append(df)
    except NoSuchElementException:
        # some firms have no summary table; skip them
        pass

    # reset the form before selecting the next firm
    driver.find_element(By.ID, "MainContent_btnReset").click()

# Fix: DataFrame.append was deprecated and removed in pandas 2.0 — collect the
# per-firm frames and concatenate once at the end instead.
df_master = pd.concat(frames) if frames else pd.DataFrame([])