I am trying to scrape all the pages of a URL with Selenium in Python, but I can only get the values from the first page. The code does navigate to the next page, but the same scraping code then fails with an error: "Element ... is not clickable at point (208, 17). Other element would receive the click: ...". Here is the code:
import pandas as pd
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait as W
from selenium.webdriver.support import expected_conditions as E
def page_scrape():
    """Scrape address and service details for every KFC location on the
    currently displayed results page.

    Relies on the module-level ``driver`` (a Selenium WebDriver) already
    pointing at the store-finder page. Prints what it finds; returns None.

    NOTE(review): the original paste lost all indentation; the loop
    structure below is reconstructed — confirm against the intended logic.
    """
    driver.maximize_window()

    ADDRESS_LOCATIONS_TEASER = '//div[contains(@class,"all-stores accordian ng-star-inserted")]'
    ADDRESS_LOCATIONS = './/div[contains(@class,"accordian-header")]'
    teaser = driver.find_element(By.XPATH, ADDRESS_LOCATIONS_TEASER)
    locations = teaser.find_elements(By.XPATH, ADDRESS_LOCATIONS)

    for loc in locations:
        # Expand this location's accordion entry. A plain .click() raises
        # "Element ... is not clickable at point ... Other element would
        # receive the click" when a sticky header/overlay covers the link,
        # so scroll it into view and wait until it is genuinely clickable.
        # (The original also assigned the click's None return to `add`.)
        link = loc.find_element(By.XPATH, './/a[@href]')
        driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", link)
        W(driver, 10).until(E.element_to_be_clickable(link)).click()

        add1 = driver.find_element(
            By.XPATH, ".//address[contains(@class, 'address-block')]").text
        print(add1)

        Services_Types_TEASER = '//div[contains(@class, "store-accordian store-accordian-flex ng-star-inserted")]'
        Service_Types = './/div[contains(@class, "store-dine-flx ng-star-inserted")]'
        services_teaser = driver.find_element(By.XPATH, Services_Types_TEASER)
        services_list = services_teaser.find_elements(By.XPATH, Service_Types)

        types_of_services = '//div[contains(@id,"divrestaurant2")]'
        types_of_services_teaser = './/div[contains(@class,"services ng-star-inserted")]'
        types_of_services_find = driver.find_element(By.XPATH, types_of_services)
        types_of_service_list = types_of_services_find.find_elements(
            By.XPATH, types_of_services_teaser)

        text = []
        for types in types_of_service_list:
            if types.text == 'Services in diesem Restaurant':
                text = "'{}':".format(types.text)
            elif types.text == 'Betreiber':
                # Original used a bare `exit`, which only evaluates the
                # builtin name and does nothing; `break` (stop at the
                # operator row) is what was intended.
                break
            else:
                text = "{0},'{1}'".format(text, types.text)
        print(text)

        for services_types in services_list:
            print(services_types.text)
# --- driver setup and pagination loop ---------------------------------
driver = webdriver.Chrome("C:/Users/doyel/Downloads/chromedriver_win32/chromedriver.exe")
driver.get('https://www.kfc.de/find-a-kfc')

# BUG FIX: the original column list was ['address', 'PLZ', 'Telephone'
# 'Restaurant Services'] — the missing comma made Python silently
# concatenate the last two strings into one column name.
results = pd.DataFrame(columns=['address', 'PLZ', 'Telephone', 'Restaurant Services'])

# Accept the cookie banner before anything else: while it is visible it
# overlays the page and intercepts clicks ("Other element would receive
# the click"). Wait until it is clickable instead of clicking blindly.
COOKIE_PATH = '//button[contains(@id,"onetrust-accept-btn-handler")]'
W(driver, 10).until(E.element_to_be_clickable((By.XPATH, COOKIE_PATH))).click()

while True:
    page_scrape()
    next_page = '//a[@aria-label="Next page"]'
    try:
        # Scroll the pagination link into view so no overlay covers it.
        btn = driver.find_element(By.XPATH, next_page)
        driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", btn)
        btn.click()
        print("next page")
        time.sleep(2)
    except Exception:
        # No "Next page" link (or it is no longer clickable): assume the
        # pagination is exhausted. Narrowed from a bare `except:`.
        print("last page reached")
        break

driver.quit()
CodePudding user response:
While not trivial, it's doable, and here is one way to do it:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import pandas as pd
import time as t
from tqdm import tqdm  # if using a Jupyter notebook: from tqdm.notebook import tqdm

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument('disable-notifications')
chrome_options.add_argument("window-size=1920,1080")

webdriver_service = Service("chromedriver/chromedriver")  # path to where you saved the chromedriver binary
driver = webdriver.Chrome(service=webdriver_service, options=chrome_options)
wait = WebDriverWait(driver, 5)

restaurant_list = []
driver.get('https://www.kfc.de/find-a-kfc')

# Dismiss the OneTrust cookie banner first: while visible it overlays the
# page and intercepts every click underneath it.
try:
    wait.until(EC.element_to_be_clickable((By.ID, "onetrust-reject-all-handler"))).click()
    print('dismissed cookies')
except Exception:
    print('no cookie button!')

# Remove the sticky site header from the DOM entirely so it can never
# cover an element Selenium is about to click.
header = wait.until(EC.element_to_be_clickable((By.TAG_NAME, "app-common-header")))
driver.execute_script("""
var element = arguments[0];
element.parentNode.removeChild(element);
""", header)

# NOTE(review): every `[@]` predicate below lost its attribute contents
# (e.g. [@class="..."]) when this answer was transcribed — restore the
# real attribute filters from the live page before running. TODO confirm.
for page in tqdm(range(1, 21)):
    kfc_rests = wait.until(EC.presence_of_all_elements_located(
        (By.XPATH, '//div[@]//app-accordian[@]')))
    for k in kfc_rests:
        # Scroll each accordion entry into view before clicking so the
        # click is not intercepted by another element.
        k.location_once_scrolled_into_view
        k.click()
        name = k.find_element(By.TAG_NAME, 'strong').text
        address = wait.until(EC.element_to_be_clickable(
            (By.XPATH, '//address[@]/p'))).text.replace('\n', ' ').strip()
        try:
            services = ', '.join(
                [s.text.strip() for s in wait.until(
                    EC.presence_of_all_elements_located((By.XPATH, '//div[@]//li')))])
        except Exception:
            services = 'Not specified'
        restaurant_list.append((name, address, services))
    try:
        next_page = wait.until(EC.element_to_be_clickable(
            (By.XPATH, '//li[@]//a[@aria-label="Next page"]')))
        next_page.location_once_scrolled_into_view
        next_page.click()
    except Exception:
        print('end of list')
        break

df = pd.DataFrame(restaurant_list, columns=['Name', 'Address', 'Services'])
print(df)
Result in terminal:
dismissed cookies
95%
19/20 [01:41<00:05, 5.91s/it]
end of list
Name Address Services
0 KFC BERLIN Grenzallee 37 12057 Berlin Lieferung, Drive Thru, Free Refill, EC-Zahlung, Click & Collect
1 KFC BERLIN Gatower Straße 56 13595 Berlin Lieferung, Drive Thru, Free Refill, EC-Zahlung, Click & Collect
2 KFC BERLIN Mall of Berlin Leipziger Platz 12 10117 Berlin Lieferung, Free Refill, EC-Zahlung, Click & Collect
3 KFC BERLIN Klosterstraße 3 13581 Berlin Lieferung, EC-Zahlung, Click & Collect
4 KFC BERLIN Schnellerstr. 18a 12439 Berlin Drive Thru, Free Refill, EC-Zahlung, Click & Collect
... ... ... ...
191 KFC SAARBRÜCKEN Wolfseck 6 66130 Saarbrücken Drive Thru, Free Refill, EC-Zahlung
192 KFC SAARLOUIS Provinzialstr. 246 66740 Saarlouis Drive Thru, Free Refill, EC-Zahlung
193 KFC OFFENBURG Heinrich-Hertz-Str. 3 77656 Offenburg Drive Thru, Free Refill, EC-Zahlung
194 KFC FREIBURG Tullastraße 68 79108 Freiburg Lieferung, Drive Thru, Free Refill, EC-Zahlung
195 KFC FRANKFURT FLUGHAFEN Tullastraße 68 79108 Freiburg Lieferung, Drive Thru, Free Refill, EC-Zahlung
196 rows × 3 columns
Selenium documentation can be found at: https://www.selenium.dev/documentation/
Pandas documentation: https://pandas.pydata.org/docs/
And for TQDM, go to https://pypi.org/project/tqdm/