Scrape whole table with Selenium and Beautiful Soup?


I would like to scrape the whole table in the middle of this site: https://www.brilliantearth.com/lab-diamonds-search/

I tried it with the following code, but it only gets me the first 200 rows of the table:

import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from fake_useragent import UserAgent

if __name__ == '__main__':
  WAIT = 3
  ua = UserAgent()
  userAgent = ua.random
  options = Options()
  # options.add_argument('--headless')
  options.add_experimental_option('excludeSwitches', ['enable-logging'])
  options.add_argument('start-maximized')
  options.add_argument('window-size=1920x1080')
  options.add_argument('--no-sandbox')
  options.add_argument('--disable-gpu')
  options.add_argument(f'user-agent={userAgent}')
  srv = Service(ChromeDriverManager().install())
  driver = webdriver.Chrome(service=srv, options=options)
  waitWebDriver = WebDriverWait(driver, 10)

  link = "https://www.brilliantearth.com/lab-diamonds-search/"
  # driver.minimize_window()        # optional
  driver.get(link)
  time.sleep(WAIT)
  # dismiss the cookie banner
  driver.find_element(By.XPATH, "(//button[@title='Accept All'])[1]").click()
  time.sleep(WAIT)
  soup = BeautifulSoup(driver.page_source, 'html.parser')
  tmpSearch = soup.find("div", {"id": "diamond_search_wrapper"})
  tmpDIVs = tmpSearch.select("div.inner.item")
  for idx, elem in enumerate(tmpDIVs):
    row = [td.text for td in elem.find_all("td")]
    print(idx, row)

I would like to use Selenium to scroll down to the very bottom of this table. But when I scroll down, only the overall page scrolls, not the table inside it.

How can I scroll to the bottom of the table (and then, presumably, scrape all of its elements)?

CodePudding user response:

It would be quicker and easier to scrape their backend network calls. To explore them, open your browser's Developer Tools, go to Network > Fetch/XHR, then refresh the page or scroll through the data you want; you can watch the network calls happen. I've recreated them below and dumped the data into a CSV:

import requests
import pandas as pd

headers = {
    'accept': 'application/json, text/javascript, */*; q=0.01',
    'accept-encoding': 'gzip, deflate, br',
    'referer': 'https://www.brilliantearth.com/lab-diamonds-search/',
    'sec-fetch-site': 'same-origin',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
    'x-requested-with': 'XMLHttpRequest'
}

final = []
for page in range(1, 10):   # pages 1-9; each call returns up to 200 rows (requestedDataSize=200 in the URL)
    print(f'Scraping page {page}')
    new_url = f'https://www.brilliantearth.com/lab-diamonds/list/?page={page}&shapes=Round&cuts=Fair,Good,Very Good,Ideal,Super Ideal&colors=J,I,H,G,F,E,D&clarities=SI2,SI1,VS2,VS1,VVS2,VVS1,IF,FL&polishes=Good,Very Good,Excellent&symmetries=Good,Very Good,Excellent&fluorescences=Very Strong,Strong,Medium,Faint,None&min_carat=0.30&max_carat=8.18&min_table=45.00&max_table=82.50&min_depth=5.00&max_depth=85.80&min_price=350&max_price=128290&stock_number=&row=0&requestedDataSize=200&order_by=price&order_method=asc&currency=$&has_v360_video=&dedicated=&min_ratio=1.00&max_ratio=2.75&exclude_quick_ship_suppliers=&MIN_PRICE=350&MAX_PRICE=128290&MIN_CARAT=0.3&MAX_CARAT=8.18&MIN_TABLE=45&MAX_TABLE=82.5&MIN_DEPTH=5&MAX_DEPTH=85.8'
    resp = requests.get(new_url, headers=headers).json()

    for diamond in resp['diamonds']:
        diamond.pop('v360_src', None)   # drop long video and image links to keep the CSV tidy
        diamond.pop('images', None)
        final.append(diamond)

df = pd.DataFrame(final)
df.to_csv('diamonds.csv', encoding='utf-8', index=False)
print('Saved to diamonds.csv')
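
The fixed range(1, 10) stops after nine pages. If you don't know how many pages there are, here is a minimal sketch of open-ended pagination, assuming the endpoint returns a short or empty 'diamonds' list once you request past the last page (BASE_QUERY below is a stand-in for the full filter query string from the URL above):

import requests

# `headers` is the same dict as above; BASE_QUERY stands in for the full
# filter query string used above (everything except the page number)
BASE_QUERY = 'requestedDataSize=200&order_by=price&order_method=asc'

final = []
page = 1
while True:
    print(f'Scraping page {page}')
    url = f'https://www.brilliantearth.com/lab-diamonds/list/?page={page}&{BASE_QUERY}'
    diamonds = requests.get(url, headers=headers).json().get('diamonds', [])
    final.extend(diamonds)
    if len(diamonds) < 200:   # a short (or empty) batch means this was the last page
        break
    page += 1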

CodePudding user response:

You can scroll each row of the inner table into view with this code (updated to the Selenium 4 find_elements API, since find_elements_by_css_selector has been removed):

rows = driver.find_elements(By.CSS_SELECTOR, "#diamond_search_wrapper div.inner.item")

for row in rows:
    driver.execute_script("arguments[0].scrollIntoView();", row)
    # scrape the data etc.
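
Note that the table loads rows in batches, so a single pass over the rows found at load time still only covers the first 200. Here is a minimal sketch, assuming the site keeps appending new rows to the same container as you scroll, that repeats the scroll until the row count stops growing:

import time
from selenium.webdriver.common.by import By

seen = 0
while True:
    rows = driver.find_elements(By.CSS_SELECTOR, "#diamond_search_wrapper div.inner.item")
    if len(rows) == seen:   # no new rows appeared, so the bottom of the table was reached
        break
    seen = len(rows)
    driver.execute_script("arguments[0].scrollIntoView();", rows[-1])
    time.sleep(2)           # give the site a moment to fetch the next batch

After the loop exits, driver.page_source contains every loaded row and can be parsed with the BeautifulSoup code from the question.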