I am learning how to use Selenium to scrape data from TripAdvisor with Python. I would like to extract information about the hotels listed at https://en.tripadvisor.com.hk/Hotels-g294217-Hong_Kong-Hotels.html after the list has been sorted by "Traveler Ranked". Specifically, I want to extract the name of each hotel and the value of its "data-location=" attribute in the HTML page.
[The HTML code of "data-location="][1]

[1]: https://i.stack.imgur.com/x668S.png
Here is my code. I don't know why it does not print the hotel names. I also don't know how to list the number stored in the "data-location=" attribute.
!pip install selenium
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
# Only Selenium failures are skipped in the result loop below; anything else
# (including Ctrl-C) is allowed to propagate.
from selenium.common.exceptions import WebDriverException

# Raw string so the backslashes in the Windows path can never be read as
# escape sequences.
browser = webdriver.Chrome(executable_path=r'C:\ProgramData\Anaconda3\Lib\site-packages\jupyterlab\chromedriver.exe')
browser.get('https://en.tripadvisor.com.hk/Hotels-g294217-Hong_Kong-Hotels.html')
browser.maximize_window()

# Click through the check-in / check-out / rooms widgets.
# NOTE(review): these absolute XPaths depend on the exact page layout —
# presumably they dismiss the booking pop-up; confirm against the live page.
CheckinDate = browser.find_element(By.XPATH, '//*[@id="BODY_BLOCK_JQUERY_REFLOW"]/div[4]/div[2]/div/div[2]/div/div/div[2]/div/div[2]/div[1]/div[3]/div[3]/div[1]')
CheckinDate.click()
CheckOutDate = browser.find_element(By.XPATH, '//*[@id="BODY_BLOCK_JQUERY_REFLOW"]/div[4]/div[2]/div/div[2]/div/div/div[2]/div/div[2]/div[1]/div[3]/div[3]/div[2]')
CheckOutDate.click()
Roombutton = browser.find_element(By.XPATH, '//*[@id="BODY_BLOCK_JQUERY_REFLOW"]/div[4]/div[2]/div/div[2]/div/div[4]/button')
Roombutton.click()

# Wait (up to 30 s) for the control inside #component_15 to become clickable,
# open it, then click the entry nested under it.
# NOTE(review): presumably the sort drop-down and its "Traveler Ranked" option.
WebDriverWait(browser, 30).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="component_15"]/div[2]/div[2]/span[1]/div/div'))).click()
browser.find_element(By.XPATH, '//*[@id="component_15"]/div[2]/div[2]/span[1]/div/div[2]/div[1]/div').click()

# Collect the listing containers. If this selector matches nothing the list
# is empty and the loop below prints nothing.
results = browser.find_elements(By.CSS_SELECTOR, '#bodycon_main .prw_meta_hsx_responsive_listing')
for result in results:
    try:
        # Anchor that is expected to hold the hotel name, relative to the listing.
        link = result.find_element(By.XPATH, "./div/div[1]/div[2]/div[1]/div/a")
        print(link.text)
    except WebDriverException:
        # Best effort: skip listings that do not have the expected structure.
        continue
Many thanks!
CodePudding user response:
The locator you used for the results variable did not match any elements, so it returned an empty result and nothing was printed. The following code should work.
Code Snippet-
# time is needed for the fixed pause below; only Selenium failures are
# skipped in the result loop.
import time

from selenium.common.exceptions import WebDriverException

# Click through the check-in / check-out / rooms widgets.
# NOTE(review): these absolute XPaths depend on the exact page layout —
# confirm against the live page.
CheckinDate = browser.find_element(By.XPATH, '//*[@id="BODY_BLOCK_JQUERY_REFLOW"]/div[4]/div[2]/div/div[2]/div/div/div[2]/div/div[2]/div[1]/div[3]/div[3]/div[1]')
CheckinDate.click()
CheckOutDate = browser.find_element(By.XPATH, '//*[@id="BODY_BLOCK_JQUERY_REFLOW"]/div[4]/div[2]/div/div[2]/div/div/div[2]/div/div[2]/div[1]/div[3]/div[3]/div[2]')
CheckOutDate.click()
Roombutton = browser.find_element(By.XPATH, '//*[@id="BODY_BLOCK_JQUERY_REFLOW"]/div[4]/div[2]/div/div[2]/div/div[4]/button')
Roombutton.click()

# Wait (up to 30 s) for the control inside #component_15 to become clickable,
# open it, then click the entry nested under it.
WebDriverWait(browser, 30).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="component_15"]/div[2]/div[2]/span[1]/div/div'))).click()
browser.find_element(By.XPATH, '//*[@id="component_15"]/div[2]/div[2]/span[1]/div/div[2]/div[1]/div').click()

# Fixed pause so all results can load after the preferences are applied;
# adjust the duration as needed.
time.sleep(10)

# NOTE(review): the attribute predicates of the three XPaths below were lost
# in the original snippet (they read '[@]', which is not valid XPath). The
# class names used here are reconstructions — confirm against the page markup.

# Locate all hotel results.
results = browser.find_elements(By.XPATH, '//div[contains(@class, "prw_meta_hsx_responsive_listing")]')

# For each hotel in the page results:
for result in results:
    try:
        # Find the anchor holding the hotel name.
        link = result.find_element(By.XPATH, './/div[contains(@class, "listing_title")]/a')
        # Find the div that carries the data-location attribute and read it.
        data_location = result.find_element(By.XPATH, './/div[@data-location]').get_attribute("data-location")
    except WebDriverException:
        # Best effort: skip listings that do not have the expected structure.
        continue
    print(link.text)
    print(data_location)