from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import configparser
from datetime import datetime, timedelta, date
import time
import json
import requests
# --- Load configuration ------------------------------------------------------
parser = configparser.RawConfigParser()
parser.read('config.ini')

load = parser['PROPERTIES']['loads']
url = parser['PROPERTIES']['URL']
OMIT_KEYWORDS = parser['FILTERS']['OMIT'].split(',')
INCLUDE_KEYWORDS = parser['FILTERS']['INCLUDE'].split(',')
END_DATE = datetime.strptime(parser['DATE']['END'], '%Y-%m-%d')
START_DATE = datetime.strptime(parser['DATE']['START'], '%Y-%m-%d')
minimum_comment_length = int(parser['PROPERTIES']['MIN_COMMENT_LENGTH'])
maximum_comment_length = int(parser['PROPERTIES']['MAX_COMMENT_LENGTH'])

# --- Configure the Chrome driver ---------------------------------------------
options = webdriver.ChromeOptions()
# Path to the chromedriver executable (raw string: Windows backslashes).
CHROMEDRIVER_PATH = r'C:\Users\HP\Desktop\INTERNSHIP\Target\chromedriver.exe'
# Hide the "Chrome is being controlled by automated software" banner and
# skip loading the automation extension.
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option("useAutomationExtension", False)
service = ChromeService(executable_path=CHROMEDRIVER_PATH)
driver = webdriver.Chrome(service=service, options=options)
wait = WebDriverWait(driver, 20)

# --- Load the page and scrape the currently visible reviews ------------------
driver.get(url)
driver.implicitly_wait(10)
time.sleep(2)
# Scroll to the bottom so the lazily-loaded review section is rendered.
driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")

# Bug fix: find_elements_by_css_selector / find_element_by_* were removed in
# Selenium 4; use the find_element(s)(By.<STRATEGY>, ...) API instead.
reviews = driver.find_elements(By.CSS_SELECTOR, ".styles__ReviewRow-sc-4u2mi2-1")
item_list = []
for review in reviews:
    item = {
        'stars': review.find_element(By.CSS_SELECTOR, '.utils__ScreenReaderOnly-sc-1b93ups-0.dsPOPg').text.replace("out of 5 stars", ""),
        'username': review.find_element(By.XPATH, ".//span[contains(@data-test,'review-card--username')]").text,
        'userurl': "NA",
        'title': review.find_element(By.XPATH, ".//h4[contains(@data-test,'review-card--title')]").text,
        'review_text': review.find_element(By.CSS_SELECTOR, '.h-margin-t-default.h-text-md').text,
        'permalink': "NA",
        'reviewlocation': "NA",
        #'reviewdate': current_date,
        'subproductname': "NA",
        'subproductlink': "NA",
    }
    item_list.append(item)
print(item_list)

# Bug fix: the file must be opened for *writing*; the original mode "r " is
# invalid (and plain "r" would not allow json.dump to write anyway).
with open("output.json", "w") as outfile:
    json.dump(item_list, outfile, default=lambda o: '<not serializable>')
The link I am using for this scraping is https://www.target.com/p/lysol-lemon-breeze-scented-all-purpose-cleaner-38-disinfectant-spray-32oz/-/A-14062784. Please tell me how I can improve my code so that it scrapes all of the reviews by clicking the "load more" button multiple times.
CodePudding user response:
The following code works for me:
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
# --- Configure and start the Chrome driver -----------------------------------
options = Options()
options.add_argument("start-maximized")
# Raw string avoids any accidental backslash escapes in the Windows path.
webdriver_service = Service(r'C:\webdrivers\chromedriver.exe')
driver = webdriver.Chrome(service=webdriver_service, options=options)
url = 'https://www.target.com/p/lysol-lemon-breeze-scented-all-purpose-cleaner-38-disinfectant-spray-32oz/-/A-14062784'
driver.get(url)
wait = WebDriverWait(driver, 3)
item_list = []
time.sleep(2)
# Scroll to the bottom so the lazily-loaded review section is rendered.
driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")

# Number of reviews already scraped: on each pass we only process reviews
# beyond this offset.  (Bug fix: the original loop re-appended EVERY visible
# review on every iteration, filling item_list with duplicates.)
scraped = 0
while True:
    try:
        reviews = wait.until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, ".styles__ReviewRow-sc-4u2mi2-1")))
        time.sleep(2)
        for review in reviews[scraped:]:
            item = {
                'stars': review.find_element(By.CSS_SELECTOR, '.utils__ScreenReaderOnly-sc-1b93ups-0.dsPOPg').text.replace("out of 5 stars", ""),
                'username': review.find_element(By.XPATH, ".//span[contains(@data-test,'review-card--username')]").text,
                'userurl': "NA",
                'title': review.find_element(By.XPATH, ".//h4[contains(@data-test,'review-card--title')]").text,
                'review_text': review.find_element(By.CSS_SELECTOR, '.h-margin-t-default.h-text-md').text,
                'permalink': "NA",
                'reviewlocation': "NA",
                'subproductname': "NA",
                'subproductlink': "NA",
            }
            item_list.append(item)
        scraped = len(reviews)
        # Scroll the "Load N more" button into view, then click it to fetch
        # the next batch of reviews.
        driver.execute_script("window.scrollBy(0, arguments[0]);", 400)
        time.sleep(2)
        wait.until(EC.element_to_be_clickable((By.XPATH, "//button[contains(.,'Load')][contains(.,'more')]"))).click()
        time.sleep(2)
    except Exception:
        # The "Load N more" button (or the reviews) no longer appears, so the
        # clickability wait timed out: all reviews have been loaded.  (Bug
        # fix: a bare "except:" would also swallow KeyboardInterrupt.)
        break
print(item_list)
I omitted the `parser`- and `json`-related code here, since it is not directly related to the Selenium issue of clicking the "Load 8 more" button.
The basic logic here is simple: I loop over essentially your code, clicking the "Load 8 more" button, with scrolling added inside the loop. I added `WebDriverWait` expected conditions to wait for the elements' visibility and clickability. The loop runs until the "Load 8 more" button no longer appears; at that point `wait.until(EC.element_to_be_clickable((By.XPATH, "//button[contains(.,'Load')][contains(.,'more')]"))).click()` throws an exception, which is caught by the `except` clause and triggers a `break` to get out of the `while` loop.