Scraping multiple tables in python efficiently-CodePudding

I am currently trying to scrape data from the https://essentialoils.org/db#. I am trying to use selenium, but it will take too long to scrape the needed data. I am eventually taking the data into two CSV files first with (Name, Latin Botanical, Citation, CAS) and then another (Essential oil, Ingredient name, Percentage, DB 5, Carbowax). If someone could help me by directing me the fastest way to do this it would be much help. Here is my current code and a couple of images. https://imgur.com/n8vlfkR https://imgur.com/pHHcD3u https://imgur.com/PHHoEHa

 #import libraries
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from time import sleep
import pandas as pd

#set up the driver for firefox
driver = webdriver.Firefox()
driver.get("https://essentialoils.org/auth/login")
# wait for page to load
sleep(1)
# select the username and password fields
username = driver.find_element_by_id("email")
username.click()
# input the username
username.send_keys("[email protected]")
password = driver.find_element_by_id("password")
password.click()
# input the password
password.send_keys("password")
# press the submit button
submit = driver.find_element_by_id("process_login")
submit.click()
# wait for 2 seconds
sleep(2)
# go to the database page
driver.get("https://essentialoils.org/db")
# wait for page to load
sleep(2)
# find table
table = driver.find_element_by_xpath("//*[@id='item-table']/tbody")
# find all the td elements in the table
tds = table.find_elements_by_tag_name("td")
# create a list of the td elements
names=[]
latin_name=[] 
citation=[]
cas=[]
# select the first td then skip 3 
for td in tds[0::3]:
    names.append(td.text)
print('Done collecting names')
for td in tds[1::3]:
    latin_name.append(td.text)
print('Done collecting latin names')
for td in tds[2::3]:
    citation.append(td.text)
print('Done collecting citation')
#click on each row to get data then go back
for id,td in enumerate(tds[0::3]):
    id = id 1
    if id == 4153:
        # stop and continue
        break
    else:
        print(f'{id} of 4152')
        x_path = f'//*[@id="item-table"]/tbody/tr[{id}]/td[1]'
        driver.find_element_by_xpath(x_path).click()
        sleep(2)
        #find the element by class name and append to the list
        cas_number = driver.find_element_by_xpath('//*[@id="eoudb"]/div/div/div/div[3]/div[2]/div/div[2]/div[1]/div[3]/span')
        cas.append(cas_number.text[5:])
        #go back to the table url db wait to load
        driver.get("https://essentialoils.org/db")
        sleep(5)
# create pandas dataframe with column names Name,Latin Botanical,Citation,CAS
df = pd.DataFrame(
    {'Name': names,
        'Latin Botanical': latin_name,
        'Citation': citation,
        'CAS': cas
    }
)
# save the dataframe to a csv file
df.to_csv('essential_oils.csv', index=False)

CodePudding user response：

You can do this much faster using Requests and BeautifulSoup. I'm not going to post my cookie information, however if you go to the network tab on your browser, you can see what headers are being used for the request.

from bs4 import BeautifulSoup as bs 
import requests
import json

cookies = {
    'XSRF-TOKEN': '',
    'laravel_session': '',
    '_pk_id.1.5d5e': '',
    '_pk_ses.1.5d5e': '*',
    '_ga': '',
    '_gid': '',
    '__stripe_sid': '',
    '__stripe_mid': '',
    'laravel_token': '',
    '_gat_gtag_UA_118297571_1': '1',
}

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0',
    'Accept': 'application/json, text/plain, */*',
    'Accept-Language': 'en-GB,en;q=0.5',
    'X-CSRF-TOKEN': 'CHANGE FOR YOUR TOKEN',
    'X-Requested-With': 'XMLHttpRequest',
    'Connection': 'keep-alive',
    'Referer': 'https://essentialoils.org/db',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
}

response = requests.get('https://essentialoils.org/api/oil?fields=id,name,name_sort,CAS,abstract_publication,name_botanical,name_botanical_sort&pages=*&reset_cache&synonym', cookies=cookies, headers=headers)
soup = bs(response.text, 'lxml')
json_ptag = soup.select_one('p').text.strip()
json_text = json.loads(json_text)

print(json_text)

This returns a list of dictionaries, one is displayed below:

{'id': 1, 'name': 'Abies alba from cones', 'name_sort': 'ABIESALBAFROMCONES', 'CAS': '8021-27-0', 'abstract_publication': 'Riechstoffe, Aromen, Koerperpflegemittel, Vol. 18, 376 (1968) ', 'name_botanical': 'Abies alba Mill., fam. Pinaceae', 'name_botanical_sort': 'ABIES ALBA MILL., FAM. PINACEAE'}