I need to scrape all reviews from target.com but I am getting reviews for a single page only


from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import configparser
from datetime import datetime, timedelta, date
import time
import json
import requests



parser = configparser.RawConfigParser()
parser.read('config.ini')

load = parser['PROPERTIES']['loads']
url = parser['PROPERTIES']['URL']
OMIT_KEYWORDS = parser['FILTERS']['OMIT'].split(',')
INCLUDE_KEYWORDS = parser['FILTERS']['INCLUDE'].split(',')
START_DATE = datetime.strptime(parser['DATE']['START'], '%Y-%m-%d')
END_DATE = datetime.strptime(parser['DATE']['END'], '%Y-%m-%d')
minimum_comment_length = int(parser['PROPERTIES']['MIN_COMMENT_LENGTH'])
maximum_comment_length = int(parser['PROPERTIES']['MAX_COMMENT_LENGTH'])

# Setting up driver options
options = webdriver.ChromeOptions()
# Setting up Path to chromedriver executable file
CHROMEDRIVER_PATH =r'C:\Users\HP\Desktop\INTERNSHIP\Target\chromedriver.exe'
# Adding options
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option("useAutomationExtension", False)
# Setting up chrome service
service = ChromeService(executable_path=CHROMEDRIVER_PATH)
# Establishing Chrome web driver using the service and options set above
driver = webdriver.Chrome(service=service, options=options)
wait = WebDriverWait(driver, 20)      
driver.get(url)
driver.implicitly_wait(10)
time.sleep(2)
driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")

reviews = driver.find_elements(By.CSS_SELECTOR, ".styles__ReviewRow-sc-4u2mi2-1")
item_list = []

for review in reviews:
    item = {
        'stars': review.find_element(By.CSS_SELECTOR, '.utils__ScreenReaderOnly-sc-1b93ups-0.dsPOPg').text.replace("out of 5 stars", ""),
        'username': review.find_element(By.XPATH, ".//span[contains(@data-test,'review-card--username')]").text,
        'userurl': "NA",
        'title': review.find_element(By.XPATH, ".//h4[contains(@data-test,'review-card--title')]").text,
        'review_text': review.find_element(By.CSS_SELECTOR, '.h-margin-t-default.h-text-md').text,
        'permalink': "NA",
        'reviewlocation': "NA",
        #'reviewdate': current_date,
        'subproductname': "NA",
        'subproductlink': "NA",
    }
    item_list.append(item)
print(item_list)
with open("output.json","r ") as outfile:
    json.dump(item_list,outfile, default=lambda o: '<not serializable>') 

The link I am using for this scraping is https://www.target.com/p/lysol-lemon-breeze-scented-all-purpose-cleaner-38-disinfectant-spray-32oz/-/A-14062784. Please tell me how I can make my code scrape all the reviews by clicking on "Load more" multiple times.
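
For reference, the config.ini the script reads has this shape (the values below are placeholders, not real settings):

[PROPERTIES]
loads = 10
URL = https://www.target.com/p/lysol-lemon-breeze-scented-all-purpose-cleaner-38-disinfectant-spray-32oz/-/A-14062784
MIN_COMMENT_LENGTH = 10
MAX_COMMENT_LENGTH = 1000

[FILTERS]
OMIT = spam,fake
INCLUDE = clean,scent

[DATE]
START = 2022-01-01
END = 2022-09-20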

CodePudding user response:

The following code works for me:

import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

options = Options()
options.add_argument("start-maximized")


webdriver_service = Service(r'C:\webdrivers\chromedriver.exe')  # raw string so the backslashes are not treated as escapes
driver = webdriver.Chrome(service=webdriver_service, options=options)
url = 'https://www.target.com/p/lysol-lemon-breeze-scented-all-purpose-cleaner-38-disinfectant-spray-32oz/-/A-14062784'
driver.get(url)
wait = WebDriverWait(driver, 3)

item_list = []
time.sleep(2)
driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
while True:
    try:
        reviews = wait.until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, ".styles__ReviewRow-sc-4u2mi2-1")))

        time.sleep(2)
        # "Load more" appends to the already-rendered reviews, so skip the
        # ones collected on previous iterations to avoid duplicates
        for review in reviews[len(item_list):]:
            item = {
                'stars': review.find_element(By.CSS_SELECTOR, '.utils__ScreenReaderOnly-sc-1b93ups-0.dsPOPg').text.replace("out of 5 stars", ""),
                'username': review.find_element(By.XPATH, ".//span[contains(@data-test,'review-card--username')]").text,
                'userurl': "NA",
                'title': review.find_element(By.XPATH, ".//h4[contains(@data-test,'review-card--title')]").text,
                'review_text': review.find_element(By.CSS_SELECTOR, '.h-margin-t-default.h-text-md').text,
                'permalink': "NA",
                'reviewlocation': "NA",
                'subproductname': "NA",
                'subproductlink': "NA",
            }
            item_list.append(item)
        driver.execute_script("window.scrollBy(0, arguments[0]);", 400)
        time.sleep(2)
        wait.until(EC.element_to_be_clickable((By.XPATH, "//button[contains(.,'Load')][contains(.,'more')]"))).click()
        time.sleep(2)
    except Exception:  # typically a TimeoutException once the "Load more" button is gone
        break

print(item_list)

I omitted the configparser and JSON related code here since it is not directly related to the Selenium issue of clicking the "Load 8 more" button (a sketch of writing the results back out follows below).
The basic logic is simple: I loop over essentially your code, clicking the "Load 8 more" button each time, with the scrolling moved inside the loop; only reviews that were not collected on a previous pass are appended, so the list contains no duplicates.
I added WebDriverWait expected conditions to wait for element visibility and clickability. The loop runs until the "Load 8 more" button no longer appears. At that point wait.until(EC.element_to_be_clickable((By.XPATH, "//button[contains(.,'Load')][contains(.,'more')]"))).click() throws a TimeoutException, which is caught by the except and triggers a break out of the while loop.
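
If you need the JSON output back, a minimal sketch (assuming the same item_list and writing to output.json in the working directory) would be:

import json

# Write the collected reviews once, after the while loop has finished.
# Note the "w" mode: json.dump needs a writable file handle, not "r".
with open("output.json", "w", encoding="utf-8") as outfile:
    json.dump(item_list, outfile, indent=2, default=lambda o: '<not serializable>')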
