I was trying to scrape data from a web page. This is the starting page.
After that, I got another page like this.
I also successfully clicked the first link in the list, but then comes the problem: the next page was generated and I tried to scrape the company name, address and email, but the page data can't be fetched with Selenium!
The page looks like this, and I am trying to scrape the data marked in red.
Here is my code. Can anyone please tell me what my mistake is?
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
import csv
import time
# Location of the ChromeDriver executable. A raw string keeps the Windows
# backslashes literal instead of relying on unrecognised escape sequences.
PATH = r"C:\Program Files (x86)\chromedriver.exe"
driver = webdriver.Chrome(PATH)
driver.get('https://www.outils.ffbatiment.fr/federation-francaise-du-batiment/laffb/annuaire.html')
driver.implicitly_wait(1)

# Pick a value in each of the three search drop-downs
# (element IDs ending in DDL_SectAct, DDL_Act and DDL_Departement).
selectOne = Select(driver.find_element_by_xpath('//*[@id="ctl00_ctl00_ContentPlaceHolderGlobal_ContentPlaceHolderContenu_DDL_SectAct"]'))
selectOne.select_by_value('1')
driver.implicitly_wait(2)
selectTwo = Select(driver.find_element_by_xpath('//*[@id="ctl00_ctl00_ContentPlaceHolderGlobal_ContentPlaceHolderContenu_DDL_Act"]'))
selectTwo.select_by_value('704')
driver.implicitly_wait(2)
selectThree = Select(driver.find_element_by_xpath('//*[@id="ctl00_ctl00_ContentPlaceHolderGlobal_ContentPlaceHolderContenu_DDL_Departement"]'))
selectThree.select_by_value('85')
driver.implicitly_wait(2)

# Submit the search form.
button = driver.find_element_by_xpath('//*[@id="ctl00_ctl00_ContentPlaceHolderGlobal_ContentPlaceHolderContenu_Button1"]')
button.click()
driver.implicitly_wait(2)

# Count the rows of the result grid.
getData = driver.find_element_by_xpath('//*[@id="ctl00_ctl00_ContentPlaceHolderGlobal_ContentPlaceHolderContenu_RadGrid1_ctl00"]/tbody')
getTr = getData.find_elements(By.TAG_NAME, "tr")
print("size ", len(getTr))
length = len(getTr)

for i in range(length) :
    # Look the grid rows up again on every pass of the outer loop.
    getData = driver.find_element_by_xpath('//*[@id="ctl00_ctl00_ContentPlaceHolderGlobal_ContentPlaceHolderContenu_RadGrid1_ctl00"]/tbody')
    getTr = getData.find_elements(By.TAG_NAME, "tr")
    for item in getTr :
        # XPath of the link in the first cell of the row whose id ends in "__<i>".
        xxpath = '//*[@id="ctl00_ctl00_ContentPlaceHolderGlobal_ContentPlaceHolderContenu_RadGrid1_ctl00__' + str(i) + '"]/td[1]/a'
        link = item.find_elements(By.XPATH,xxpath)
        link[0].send_keys(Keys.RETURN)
        driver.implicitly_wait(10)

        # Fields to read from the page opened by the link.
        company = item.find_elements(By.XPATH,'//*[@id="ctl00_ctl00_ContentPlaceHolderGlobal_ContentPlaceHolderContenu_FormView1_Ent_NomLabel"]')
        address = item.find_elements(By.XPATH,'//*[@id="ctl00_ctl00_ContentPlaceHolderGlobal_ContentPlaceHolderContenu_PanelDetails"]/div[1]/div[2]/div/div[1]/p[3]')
        postal = item.find_elements(By.XPATH,'//*[@id="ctl00_ctl00_ContentPlaceHolderGlobal_ContentPlaceHolderContenu_FormView1_Ent_CpLabel"]')
        city = item.find_elements(By.XPATH,'//*[@id="ctl00_ctl00_ContentPlaceHolderGlobal_ContentPlaceHolderContenu_FormView1_Ent_VilleLabel"]')
        phone = item.find_elements(By.XPATH,'//*[@id="ctl00_ctl00_ContentPlaceHolderGlobal_ContentPlaceHolderContenu_FormView1_Ent_TelHyperLink"]')
        email = item.find_elements(By.XPATH,'//*[@id="ctl00_ctl00_ContentPlaceHolderGlobal_ContentPlaceHolderContenu_FormView1_Ent_EmailHyperLink"]')
        print(company," ",postal," ",city," ",phone," ",email)

        # Use the page's own "back" control (id ending in LButtonRetour).
        back = item.find_elements(By.XPATH,'//*[@id="ctl00_ctl00_ContentPlaceHolderGlobal_ContentPlaceHolderContenu_FormView1_LButtonRetour"]')
        back.send_keys(Keys.RETURN)
        time.sleep(10)
driver.quit()
CodePudding user response:
# Imported here so this snippet can name the one exception it expects
# instead of swallowing every error with a bare ``except``.
from selenium.common.exceptions import TimeoutException

# Explicit wait helper: polls for a condition for up to 20 seconds.
wait = WebDriverWait(driver, 20)
driver.get('https://www.outils.ffbatiment.fr/federation-francaise-du-batiment/laffb/annuaire.html')

# Click the cookie "accept" button if it becomes clickable.
try:
    wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "div#cookiescript_accept"))).click()
except TimeoutException:
    # The button never became clickable within the timeout; carry on.
    pass

# Pick a value in each of the three search drop-downs.
sl1 = driver.find_element(By.XPATH, '//*[@id="ctl00_ctl00_ContentPlaceHolderGlobal_ContentPlaceHolderContenu_DDL_SectAct"]')
selectOne = Select(sl1)
selectOne.select_by_value('1')
sl2 = driver.find_element(By.XPATH, '//*[@id="ctl00_ctl00_ContentPlaceHolderGlobal_ContentPlaceHolderContenu_DDL_Act"]')
selectTwo = Select(sl2)
selectTwo.select_by_value('704')
sl3 = driver.find_element(By.XPATH, '//*[@id="ctl00_ctl00_ContentPlaceHolderGlobal_ContentPlaceHolderContenu_DDL_Departement"]')
selectThree = Select(sl3)
selectThree.select_by_value('85')

# Submit the search form.
button = driver.find_element(By.XPATH, '//*[@id="ctl00_ctl00_ContentPlaceHolderGlobal_ContentPlaceHolderContenu_Button1"]')
button.click()

# Count the rows of the result grid.
getData = driver.find_element(By.XPATH, '//*[@id="ctl00_ctl00_ContentPlaceHolderGlobal_ContentPlaceHolderContenu_RadGrid1_ctl00"]/tbody')
getTr = getData.find_elements(By.TAG_NAME, "tr")
print("size ", len(getTr))

# XPath positions are 1-based, so the loop runs over 1..len(getTr).
length = len(getTr) + 1
for i in range(1, length):
    # Locate the i-th link afresh on every pass: elements found before a
    # navigation cannot be reused afterwards.
    driver.find_element(By.XPATH, f"(//tr[@class='rgRow']//a)[{i}]").click()

    # Gather the text of every element whose id starts with the
    # "...Ent_AdresseLabel" prefix and join the pieces into one string.
    address = [x.text for x in driver.find_elements(By.XPATH, '//*[starts-with(@id,"ctl00_ctl00_ContentPlaceHolderGlobal_ContentPlaceHolderContenu_FormView1_Ent_AdresseLabel")]')]
    address = ''.join(address)
    print(address)

    # Go back to the result list and reload it before the next click.
    driver.back()
    driver.refresh()
- Handle the accept button when it pops up.
- When looping through the 20 or so items that come up, make sure to use the index and grab that item to click.
- When you land on a new page, make sure to re-fetch the elements.
- When getting the company and address, make sure you get all of the information.
- When going back a page, make sure to refresh so that all the information is resent.
I didn't do them all, but this is a good start for you to figure out how to handle elements when going through pages. Remember: if you can't find an element, handle that case properly.
Outputs:
size 20
Z.A. Sud42, rue du Commerce
ZA LES NOUETTES
RUE DES FRERESZA DE BRECHARD
ZI des Plesses118 rue des PlessesCHATEAU D OLONNE
11 IMPASSE DU VIGNAUDLA VERRIE
2 RUE JOSEPH CUGNOTZONE DES ROCHES
Z.A. Sud42, rue du Commerce
704 ROUTE DES ABOIRESLE MOTTEAU
60 avenue Villebois MareuilMONTAIGU
PARC D'ACTIVITE DE LA BLOIRE6 RUE FRANCOIS MANSART
PARC D'ACTIVITE DE LA BLOIRE6 RUE FRANCOIS MANSART
704 ROUTE DES ABOIRESLE MOTTEAU
9 RUE DE LA CHAPELLEFONTAINES
ZA la promenade200 allée du pré chacun
ZA DU CHARFAIT
RUE DU MOULIN GROSMONTAIGU
3 ZA DES CINQ MOULINS
ROUTE DU COMMERCE
ROUTE DE ST GILLESZA ESPACE OCEANE