Efficiently web scrape tables with selenium?-CodePudding

I have some questions regarding web scraping with Selenium for Python. I attempted to web scrape a table of Pokémon names and stats from pokemondb.net, and I saved that data into a pandas DataFrame in my Jupyter notebook. The problem is that it takes 2-3 minutes to scrape all the data, and I assume that this is a bit too time-consuming a process. I was wondering if maybe I did a poor job of coding my web scraping program? I also programmed it to scrape all the table data one column at a time, and I believe that this may be one reason why it is not as efficient as possible. I would appreciate it if anyone could take a look and offer any suggestions.


from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By

import os
import numpy as np
import pandas as pd
import matplotlib as plt



# Imported here because this snippet's import header does not bring in Service.
from selenium.webdriver.chrome.service import Service

# Hand the chromedriver path to Chrome through a Service object. Passing the
# path as the first positional argument was deprecated in Selenium 4 and later
# removed, so the old call fails on current Selenium releases.
driver = webdriver.Chrome(service=Service('drivers/chromedriver.exe'))

driver.get('https://pokemondb.net/pokedex/all')  # open the browser and load the full Pokedex page

driver.minimize_window()  # keep the browser window out of the way while scraping


# Every column is collected the same way: locate all matching cells on the
# page, then read the text of each located element into a plain list.
# Note that each `.text` access is a separate request to the browser.

# pokemon id column
pokemon_id_html = driver.find_elements(By.CLASS_NAME, 'infocard-cell-data')
pokemon_id = [cell.text for cell in pokemon_id_html]

# pokemon name column
pokemon_name_html = driver.find_elements(By.CLASS_NAME, 'ent-name')
pokemon_name = [cell.text for cell in pokemon_name_html]

# pokemon type column
pokemon_type_html = driver.find_elements(By.CLASS_NAME, 'cell-icon')
pokemon_type = [cell.text for cell in pokemon_type_html]

# pokemon total stats column
pokemon_total_html = driver.find_elements(By.CLASS_NAME, 'cell-total')
pokemon_total = [cell.text for cell in pokemon_total_html]

# The six individual stats all use the class 'cell-num'; they are told apart
# by the XPath position index (1 through 6) at the end of each expression.

# hp stat
pokemon_hp_html = driver.find_elements(By.XPATH, "//*[@class='cell-num'][1]")
pokemon_hp = [cell.text for cell in pokemon_hp_html]

# attack stat
pokemon_attack_html = driver.find_elements(By.XPATH, "//*[@class='cell-num'][2]")
pokemon_attack = [cell.text for cell in pokemon_attack_html]

# defense stat
pokemon_defense_html = driver.find_elements(By.XPATH, "//*[@class='cell-num'][3]")
pokemon_defense = [cell.text for cell in pokemon_defense_html]

# sp. attack stat
pokemon_special_attack_html = driver.find_elements(By.XPATH, "//*[@class='cell-num'][4]")
pokemon_special_attack = [cell.text for cell in pokemon_special_attack_html]

# sp. defense stat
pokemon_special_defense_html = driver.find_elements(By.XPATH, "//*[@class='cell-num'][5]")
pokemon_special_defense = [cell.text for cell in pokemon_special_defense_html]

# speed stat
pokemon_speed_html = driver.find_elements(By.XPATH, "//*[@class='cell-num'][6]")
pokemon_speed = [cell.text for cell in pokemon_speed_html]


# quit() ends the WebDriver session and shuts down the chromedriver process.
# close() only closes the current browser window and can leave the driver
# process running in the background.
driver.quit()

# column names (labels) for the dataset
columns = [
    'id', 'name', 'type', 'total', 'hp', 'attack', 'defense',
    'special-attack', 'special-defense', 'speed',
]

# one list of scraped values per column, in the same order as `columns`
attributes = [
    pokemon_id, pokemon_name, pokemon_type, pokemon_total, pokemon_hp,
    pokemon_attack, pokemon_defense, pokemon_special_attack,
    pokemon_special_defense, pokemon_speed,
]

CodePudding user response：

Though @platipus_on_fire_333's answer using page_source works perfectly, as an alternative you can also locate the <table> element directly and achieve a similar result.

Solution

To scrape the table of Pokémon names and stats from pokemondb.net, use WebDriverWait with visibility_of_element_located() to wait until the table is visible, then parse its outerHTML into a pandas DataFrame with pd.read_html(). The following locator strategy works:

Code Block:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd

# Imported here because this snippet's import header does not include it.
from io import StringIO

# `driver` is assumed to be an already-started WebDriver instance; it is not
# created in this snippet.
# Navigate with the public get() API instead of dispatching the raw "get" command.
driver.get('https://pokemondb.net/pokedex/all')

# Wait up to 20 seconds for the Pokedex table to become visible, then take only
# that element's markup rather than the whole page.
data = WebDriverWait(driver, 20).until(
    EC.visibility_of_element_located((By.CSS_SELECTOR, "table#pokedex"))
).get_attribute("outerHTML")

# read_html returns a list of DataFrames, so `df` is a list and the table itself
# is df[0]. The markup is wrapped in StringIO because passing a literal HTML
# string to read_html is deprecated in recent pandas releases.
df = pd.read_html(StringIO(data))
print(df)

Console Output:

[        #                      Name             Type  Total   HP  Attack  Defense  Sp. Atk  Sp. Def  Speed
0       1                 Bulbasaur     Grass Poison    318   45      49       49       65       65     45
1       2                   Ivysaur     Grass Poison    405   60      62       63       80       80     60
2       3                  Venusaur     Grass Poison    525   80      82       83      100      100     80
3       3    Venusaur Mega Venusaur     Grass Poison    625   80     100      123      122      120     80
4       4                Charmander             Fire    309   39      52       43       60       50     65
...   ...                       ...              ...    ...  ...     ...      ...      ...      ...    ...
1070  902        Basculegion Female      Water Ghost    530  120      92       65      100       75     78
1071  903                  Sneasler  Poison Fighting    510   80     130       60       40       80    120
1072  904                  Overqwil      Dark Poison    510   85     115       95       65       65     85
1073  905  Enamorus Incarnate Forme     Fairy Flying    580   74     115       70      135       80    106
1074  905    Enamorus Therian Forme     Fairy Flying    580   74     115      110      135      100     46

[1075 rows x 10 columns]]

CodePudding user response：

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd


# Imported here because this snippet's import header does not include it.
from io import StringIO

chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
# Optional: uncomment to run Chrome without a visible window.
# chrome_options.add_argument("--headless")

webdriver_service = Service("chromedriver/chromedriver")  # path to where you saved the chromedriver binary
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)

try:
    browser.get("https://pokemondb.net/pokedex/all")
    # Fetch the rendered page once; all parsing afterwards is done by pandas.
    page_html = browser.page_source
finally:
    # Always end the session so the browser and chromedriver do not linger,
    # even if loading the page fails.
    browser.quit()

# read_html returns a list of DataFrames; the markup is wrapped in StringIO
# because passing a literal HTML string is deprecated in recent pandas releases.
dfs = pd.read_html(StringIO(page_html))
dfs[0]

This returns a dataframe with 1075 rows × 10 columns:

#   Name    Type    Total   HP  Attack  Defense Sp. Atk Sp. Def Speed
0   1   Bulbasaur   Grass Poison    318 45  49  49  65  65  45
1   2   Ivysaur Grass Poison    405 60  62  63  80  80  60
2   3   Venusaur    Grass Poison    525 80  82  83  100 100 80
3   3   Venusaur Mega Venusaur  Grass Poison    625 80  100 123 122 120 80
4   4   Charmander  Fire    309 39  52  43  60  50  65
... ... ... ... ... ... ... ... ...