Unable to scrape reviews from target.com due to some error in xpath


from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import configparser
from datetime import datetime
import time
import random
import json

parser = configparser.RawConfigParser()
parser.read('config.ini')
url = parser['PROPERTIES']['URL']
OMIT_KEYWORDS = parser['FILTERS']['OMIT'].split(',')
INCLUDE_KEYWORDS = parser['FILTERS']['INCLUDE'].split(',')
END_DATE = datetime.strptime(parser['DATE']['END'], '%Y-%m-%d')
START_DATE = datetime.strptime(parser['DATE']['START'], '%Y-%m-%d')
minimum_comment_length = int(parser['PROPERTIES']['MIN_COMMENT_LENGTH'])
maximum_comment_length = int(parser['PROPERTIES']['MAX_COMMENT_LENGTH'])

# Setting up driver options
options = webdriver.ChromeOptions()
# Setting up Path to chromedriver executable file
CHROMEDRIVER_PATH = r'C:\Users\HP\Desktop\INTERNSHIP\Target\chromedriver.exe'
# Adding options
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option("useAutomationExtension", False)
# Setting up chrome service
service = ChromeService(executable_path=CHROMEDRIVER_PATH)
# Establishing Chrome web driver using the service and options set above
driver = webdriver.Chrome(service=service, options=options)
wait = WebDriverWait(driver, 20)      
driver.get(url)
time.sleep(random.choice([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]))


reviews = wait.until(EC.visibility_of_all_elements_located((By.XPATH, ".//div[contains(@class,'styles__ReviewRow-sc-4u2mi2-1')]")))
item_list = []

for review in reviews:
    item = {
        'review_text': review.find_element_by_xpath(".//div[contains(@class,'h-margin-t-default h-text-md')]").text,
    }
    item_list.append(item)
print(item_list)
# Write the scraped reviews to output.json
with open("output.json", "w") as outfile:
    json.dump(item_list, outfile)

The link I am trying to scrape is https://www.target.com/p/lysol-lemon-breeze-scented-all-purpose-cleaner-38-disinfectant-spray-32oz/-/A-14062784. I am not getting the review text due to some problem in the XPath of the reviews variable. Please help me out.

CodePudding user response:

For the webpage you mentioned, not all elements are present once the initial page load completes; the remaining ones, including the reviews, are loaded as you scroll down the page. So I added a page scroll-down statement.

driver.get("https://www.target.com/p/lysol-lemon-breeze-scented-all-purpose-cleaner-38-disinfectant-spray-32oz/-/A-14062784")
driver.implicitly_wait(10)
time.sleep(2)
# Scroll to the bottom so the lazily loaded review section is rendered
driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")

reviews = driver.find_elements(By.CSS_SELECTOR,".h-margin-t-default.h-text-md")

for review_text in reviews:
    print("Review text: ",review_text.text)

It prints all 8 reviews that are displayed on that page.
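
If more reviews keep appearing as you scroll (a single scroll only loads the first batch), a rough sketch that keeps scrolling until the page height stops growing and then waits for the review rows might look like the following. It reuses the wait object from the question, and the "ReviewRow" class fragment is an assumption taken from the question's markup; these names are auto-generated, so they may change.

# Keep scrolling until the page height stops growing, i.e. no more content is loaded
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
    time.sleep(2)  # give the lazily loaded reviews time to render
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

# Then wait explicitly for the review rows before collecting their texts
wait.until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "div[class*='ReviewRow']")))
reviews = driver.find_elements(By.CSS_SELECTOR, ".h-margin-t-default.h-text-md")
print([review.text for review in reviews])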

CodePudding user response:

Yes, your xpaths are not valid. Here is what I'd use to locate these elements:

driver.find_elements_by_css_selector('.h-margin-t-default.h-text-md')

Or, if you're set on using an xpath, it could be:

driver.find_elements_by_xpath("//div[contains(@class, 'h-margin-t-default h-text-md')]")

Finally, I'm not sure your code will do what you're looking for even if you get the right selectors. As you loop through the reviews, do not try to identify elements with selectors, as you will repeatedly just get the first element. A sketch of your code could look something like this:

# Wait until the review rows are visible, then collect the text of each review
wait.until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, ".styles__ReviewRow-sc-4u2mi2-1")))
reviews = driver.find_elements_by_css_selector('.h-margin-t-default.h-text-md')
review_text = []
for review in reviews:
    review_text.append(review.text)
print(review_text)

etc.
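
To finish the flow from the question, the collected texts can then be dumped to JSON; note that the output file has to be opened in write mode:

# Write the scraped review texts to output.json
with open("output.json", "w") as outfile:
    json.dump(review_text, outfile, indent=2)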
