I'm coding a Python program to scrape this site using Selenium and BeautifulSoup:
https://www.argentina.gob.ar/desarrollosocial/registrocuidadores
I was able to go through the steps to access the first table I need (that's what the code does), but then the webdriver closes itself and I get this error in the console:
Traceback (most recent call last):
  File "/Users/martin/Desktop/Scrap/scrapy1-3.py", line 33, in <module>
    select2.select_by_visible_text(option2.text)
                                   ^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/selenium/webdriver/remote/webelement.py", line 89, in text
    return self._execute(Command.GET_ELEMENT_TEXT)["value"]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/selenium/webdriver/remote/webelement.py", line 410, in _execute
    return self._parent.execute(command, params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/selenium/webdriver/remote/webdriver.py", line 444, in execute
    self.error_handler.check_response(response)
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/selenium/webdriver/remote/errorhandler.py", line 249, in check_response
    raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
  (Session info: chrome=109.0.5414.87)
This is my code right now:
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
import time
from bs4 import BeautifulSoup
driver = webdriver.Chrome(
    '/Users/martin/Downloads/chromedriver_mac64/chromedriver')

# Open the website
driver.get("https://registroncd.senaf.gob.ar/ListadoCuidadores.aspx")

# Wait for the page to load before scraping
time.sleep(3)

boton1 = driver.find_element(
    By.XPATH, "//*[@id='ContentPlaceHolder1_DropDownListProv']")
select1 = Select(boton1)
options1 = select1.options

for option1 in options1:
    select1.select_by_visible_text(option1.text)
    time.sleep(3)  # wait for the page to load
    boton2 = driver.find_element(
        By.XPATH, "//*[@id='ContentPlaceHolder1_DropDownListLoc']")
    select2 = Select(boton2)
    options2 = select2.options
    for i in range(1, len(options2)):
        option2 = options2[i]
        select2.select_by_visible_text(option2.text)
        time.sleep(3)  # wait for the page to load
        boton3 = driver.find_element(By.ID, "ContentPlaceHolder1_ButtonBuscar")
        boton3.click()
        time.sleep(3)
        wait = WebDriverWait(driver, 10)
        element = wait.until(EC.presence_of_element_located(
            (By.ID, "ContentPlaceHolder1_GridView1")))
        soup = BeautifulSoup(driver.page_source, "html.parser")
        table = soup.find("table", class_="gridview")
        if table:
            rows = table.find_all("tr")
            for row in rows:
                cells = row.find_all("td")
                for cell in cells:
                    print(cell.text)
        else:
            print("The table was not found")
CodePudding user response:
To use only bs4 without Selenium, you can try the approach below. The page is an ASP.NET WebForms application, so each request has to send the form's hidden state fields (__VIEWSTATE, __EVENTVALIDATION, ...) back to the server:
import requests
import pandas as pd
from bs4 import BeautifulSoup

# first page:
url = 'https://registroncd.senaf.gob.ar/ListadoCuidadores.aspx'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')

# carry every form field (including the hidden ASP.NET __VIEWSTATE /
# __EVENTVALIDATION inputs) back to the server:
data = {}
for inp in soup.select('input[value]'):
    data[inp['name']] = inp['value']

soup = BeautifulSoup(requests.post(url, data=data).content, 'html.parser')
df = pd.read_html(str(soup))[0]
print(df[:-1])  # drop the last (pager) row

# for additional pages:
for page in range(2, 4):
    data = {}
    for inp in soup.select('input[value]'):
        data[inp['name']] = inp['value']
    # fire the GridView's paging event instead of the search button:
    del data['ctl00$ContentPlaceHolder1$ButtonBuscar']
    data['__EVENTTARGET'] = 'ctl00$ContentPlaceHolder1$GridView1'
    data['__EVENTARGUMENT'] = f"Page${page}"
    soup = BeautifulSoup(requests.post(url, data=data).content, 'html.parser')
    df = pd.read_html(str(soup))[0]
    print(df[:-1])
Prints:
Nombre Apellido Provincia Localidad Telefono Email Capacitaciones
0 BLANCA BEATRIZ AGUILAR Buenos Aires 25 de Mayo 0234515532692 [email protected] Ver
1 RUBEN OSVALDO CABALLERO Buenos Aires 25 de Mayo 0234515400320 [email protected] Ver
2 DAVID ALEJANDRO GIGLIO Buenos Aires 25 de Mayo 0234515517152 [email protected] Ver
3 LILIANA RAQUEL MACHAROLI Buenos Aires 25 de Mayo 0234515438703 [email protected] Ver
4 PATRICIA ELIZABETH MATTIA Buenos Aires 25 de Mayo 0234515433654 [email protected] Ver
5 ANDREA SILVINA PEREZ Buenos Aires 25 de Mayo 0234515513612 [email protected] Ver
6 NATALIA CLARISA LOPEZ Buenos Aires 25 de Mayo 0234515400562 [email protected] Ver
7 LUCIANA KARINA MARA Buenos Aires 25 de Mayo 0234515668788 [email protected] Ver
...and so on.
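The same hidden-field trick extends to the province and locality dropdowns that the question drives with Selenium. Below is a sketch of simulating a dropdown's AutoPostBack; the field name ctl00$ContentPlaceHolder1$DropDownListProv is an assumption derived from the element ID in the question (following the same ctl00$... pattern as the search button above), so verify it against the live form first:

import requests
from bs4 import BeautifulSoup

url = 'https://registroncd.senaf.gob.ar/ListadoCuidadores.aspx'
PROV = 'ctl00$ContentPlaceHolder1$DropDownListProv'  # assumed field name

session = requests.Session()
soup = BeautifulSoup(session.get(url).content, 'html.parser')

# province <option> values from the initial page (skip empty placeholders)
provinces = [o['value'] for o in soup.select(f'select[name="{PROV}"] option')
             if o.get('value')]

for prov in provinces:
    # carry the hidden ASP.NET state forward and pick the province
    data = {inp['name']: inp['value'] for inp in soup.select('input[value]')}
    data[PROV] = prov
    data['__EVENTTARGET'] = PROV  # simulate the dropdown's AutoPostBack
    data['__EVENTARGUMENT'] = ''
    data.pop('ctl00$ContentPlaceHolder1$ButtonBuscar', None)
    soup = BeautifulSoup(session.post(url, data=data).content, 'html.parser')
    # ...then set the locality field and post ButtonBuscar the same way...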
CodePudding user response:
Try using WebDriverWait instead of the fixed time.sleep() calls:

from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from selenium.webdriver.common.by import By

wait = WebDriverWait(driver, 10)
wait.until(EC.element_to_be_clickable((By.ID, "ContentPlaceHolder1_ButtonBuscar"))).click()
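On its own, though, WebDriverWait won't cure the StaleElementReferenceException: selecting a province triggers an ASP.NET postback that rebuilds the page, so the option elements collected before the postback are no longer attached to the DOM. A minimal sketch of the usual remedy, re-locating the <select> on every pass instead of reusing old references (same IDs and driver as in the question):

from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
import time

prov_id = 'ContentPlaceHolder1_DropDownListProv'
count = len(Select(driver.find_element(By.ID, prov_id)).options)

for i in range(1, count):  # index 0 skipped, mirroring the question's inner loop
    # re-find the dropdown each iteration: the previous element was
    # destroyed by the postback, which is exactly what made it stale
    select1 = Select(driver.find_element(By.ID, prov_id))
    select1.select_by_index(i)
    time.sleep(3)  # a condition-based wait (e.g. EC.staleness_of) is more robust
    # ...re-locate and iterate the locality dropdown the same way...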