I am trying to write code that scrapes all reviews for a single hotel on TripAdvisor. The code runs through all pages except the last one, where it fails on the next.click() in the loop. I assume this is because "next" is still present in the DOM on the last page, but disabled. Does anyone know how to fix this? I basically want it to stop trying to click next when it reaches the last page, i.e. when the button is disabled but still technically present. Any help would be much appreciated!
#maybe3.1
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Extract the HTML and create a BeautifulSoup object.
url = ('https://www.tripadvisor.com/Hotel_Review-g46833-d256905-Reviews-Knights_Inn_South_Hackensack-South_Hackensack_New_Jersey.html#REVIEWS')
user_agent = ({'User-Agent':
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) \
AppleWebKit/537.36 (KHTML, like Gecko) \
Chrome/90.0.4430.212 Safari/537.36',
'Accept-Language': 'en-US, en;q=0.5'})
driver = webdriver.Chrome()
driver.get(url)
# Find and extract the data elements.
wait = WebDriverWait(driver,30)
wait.until(EC.element_to_be_clickable((By.XPATH,'//*[@id="component_15"]/div/div[3]/div[13]/div')))
#explicit wait here
next = driver.find_element(By.XPATH, './/a[@class="ui_button nav next primary "]')
here = next.is_displayed()
# Collect titles and reviews across all pages.
Titles = []
reviews = []
while here == True:
    time.sleep(2)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    time.sleep(2)
    # Review titles on the current page.
    for title in soup.findAll('a', {'class': 'Qwuub'}):
        Titles.append(title.text.strip())
    # Review bodies on the current page.
    for review in soup.findAll('q', {'class': 'QewHA H4 _a'}):
        reviews.append(review.text.strip())
    next.click()
    if here != True:
        time.sleep(2)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        time.sleep(8)
        break
# Create the dictionary.
data = {'Review Title': Titles, 'Reviews/Feedback': reviews}
# Create the dataframe.
datafr = pd.DataFrame.from_dict(data)
datafr.head(10)
# Convert dataframe to CSV file.
datafr.to_csv('hotels1.855.csv', index=False, header=True)
CodePudding user response:
This question might be in the same vein as: python selenium to check if this text field is disabled or not
You can check whether an element is enabled with:
driver.find_element(By.ID, "id").is_enabled()
You can also wrap the code in a try/except block.
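Applied to the loop in the question, a minimal sketch could look like this (the next-button XPath is the one from the question; note that is_enabled() can report True for an <a> element even when it is visually disabled, in which case checking its class attribute is a possible fallback):
from selenium.common.exceptions import ElementNotInteractableException

while True:
    time.sleep(2)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    # ... collect the titles and reviews from soup here ...
    next_btn = driver.find_element(By.XPATH, './/a[@class="ui_button nav next primary "]')
    if not next_btn.is_enabled():
        # Last page: the button is still present in the DOM but disabled.
        break
    try:
        next_btn.click()
    except ElementNotInteractableException:
        # Fallback in case the disabled button still refuses the click.
        break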
CodePudding user response:
import time

page = 2
while True:
    try:
        # your code
        driver.find_element(By.XPATH, f"//a[@class='pageNum ' and text()='{page}']").click()
        page += 1
        time.sleep(1)
    except:
        break
This should be a simple loop that goes through all the pages and stops once the a tag in question is no longer valid.
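Filling the scraping from the question into that loop, a rough end-to-end sketch (assuming the Qwuub/QewHA class names from the question and the pageNum selector above still match the live site) might look like:
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

driver = webdriver.Chrome()
driver.get(url)  # url as defined in the question

Titles, reviews = [], []
page = 2
while True:
    time.sleep(2)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    for title in soup.findAll('a', {'class': 'Qwuub'}):
        Titles.append(title.text.strip())
    for review in soup.findAll('q', {'class': 'QewHA H4 _a'}):
        reviews.append(review.text.strip())
    try:
        # Click the link whose text is the next page number; once it no
        # longer exists, the last page has been scraped.
        driver.find_element(By.XPATH, f"//a[@class='pageNum ' and text()='{page}']").click()
        page += 1
    except NoSuchElementException:
        break

pd.DataFrame({'Review Title': Titles, 'Reviews/Feedback': reviews}).to_csv('hotels1.855.csv', index=False)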