I have some questions regarding web scraping with Selenium for Python. I scraped a table of Pokémon names and stats from pokemondb.net and saved the data into a pandas DataFrame in my Jupyter notebook. The problem is that it takes 2-3 minutes to scrape all the data, which seems too time-consuming. Did I do a poor job of coding my web scraping program? I programmed it to scrape the table data one column at a time, and I believe this may be one reason why it is not as efficient as it could be. I would appreciate it if anyone could take a look and offer suggestions.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import os
import numpy as np
import pandas as pd
import matplotlib as plt
# Scrape the Pokedex table on pokemondb.net one column at a time, collecting
# the text of every cell in a column into its own list.
driver = webdriver.Chrome('drivers/chromedriver.exe')  # start Chrome using the chromedriver at this path


def _column_text(by, locator):
    """Return the ``.text`` of every element that ``(by, locator)`` matches."""
    return [element.text for element in driver.find_elements(by, locator)]


try:
    driver.get('https://pokemondb.net/pokedex/all')  # open the browser and navigate to the URL
    driver.minimize_window()  # minimize window

    # NOTE(review): each `.text` read is presumably a separate round-trip to
    # the browser, which would explain the multi-minute runtime; parsing the
    # whole table once (e.g. pandas.read_html on the page source) avoids it.
    pokemon_id = _column_text(By.CLASS_NAME, 'infocard-cell-data')  # pokemon id column
    pokemon_name = _column_text(By.CLASS_NAME, 'ent-name')  # pokemon name column
    pokemon_type = _column_text(By.CLASS_NAME, 'cell-icon')  # pokemon type
    pokemon_total = _column_text(By.CLASS_NAME, 'cell-total')  # pokemon total stats

    # The six per-stat cells share the exact class 'cell-num'; the positional
    # predicate picks the n-th such cell under each parent element.
    pokemon_hp = _column_text(By.XPATH, "//*[@class='cell-num'][1]")  # hp stat
    pokemon_attack = _column_text(By.XPATH, "//*[@class='cell-num'][2]")  # attack stat
    pokemon_defense = _column_text(By.XPATH, "//*[@class='cell-num'][3]")  # defense stat
    pokemon_special_attack = _column_text(By.XPATH, "//*[@class='cell-num'][4]")  # sp. attack stat
    pokemon_special_defense = _column_text(By.XPATH, "//*[@class='cell-num'][5]")  # sp. defense stat
    pokemon_speed = _column_text(By.XPATH, "//*[@class='cell-num'][6]")  # speed stat
finally:
    # End the session even if a lookup above raises, so no browser or
    # chromedriver process is left running.
    driver.quit()

# column names (labels) for dataset
columns = [
    'id', 'name', 'type', 'total', 'hp', 'attack', 'defense',
    'special-attack', 'special-defense', 'speed',
]
# list of values for each column (rows) for dataset, in the same order as `columns`
attributes = [
    pokemon_id, pokemon_name, pokemon_type, pokemon_total, pokemon_hp,
    pokemon_attack, pokemon_defense, pokemon_special_attack,
    pokemon_special_defense, pokemon_speed,
]
CodePudding user response:
Though @platipus_on_fire_333's answer using page_source works perfectly, as an alternative you can also identify the <table> element directly and achieve a similar result.
Solution
To scrape the table of Pokémon names and stats from pokemondb.net, use WebDriverWait with visibility_of_element_located() to wait for the table element, then pass its outerHTML to pandas' read_html(), which builds the DataFrame. You can use the following locator strategy:
Code Block:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from io import StringIO

# `driver` is assumed to be an existing WebDriver session; it is not created
# in this snippet.
driver.get('https://pokemondb.net/pokedex/all')

# Wait up to 20 seconds for the Pokedex table to become visible, then take the
# table element's own HTML rather than the whole page.
data = WebDriverWait(driver, 20).until(
    EC.visibility_of_element_located((By.CSS_SELECTOR, "table#pokedex"))
).get_attribute("outerHTML")

# read_html returns a list of DataFrames, one per <table> in the markup.
# The markup is wrapped in StringIO because newer pandas releases deprecate
# passing a literal HTML string.
df = pd.read_html(StringIO(data))
print(df)
Console Output:
[ # Name Type Total HP Attack Defense Sp. Atk Sp. Def Speed
0 1 Bulbasaur Grass Poison 318 45 49 49 65 65 45
1 2 Ivysaur Grass Poison 405 60 62 63 80 80 60
2 3 Venusaur Grass Poison 525 80 82 83 100 100 80
3 3 Venusaur Mega Venusaur Grass Poison 625 80 100 123 122 120 80
4 4 Charmander Fire 309 39 52 43 60 50 65
... ... ... ... ... ... ... ... ... ... ...
1070 902 Basculegion Female Water Ghost 530 120 92 65 100 75 78
1071 903 Sneasler Poison Fighting 510 80 130 60 40 80 120
1072 904 Overqwil Dark Poison 510 85 115 95 65 65 85
1073 905 Enamorus Incarnate Forme Fairy Flying 580 74 115 70 135 80 106
1074 905 Enamorus Therian Forme Fairy Flying 580 74 115 110 135 100 46
[1075 rows x 10 columns]]
CodePudding user response:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from io import StringIO

chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
# Optional: enable the next line to run Chrome without a visible window.
# chrome_options.add_argument("--headless")

webdriver_service = Service("chromedriver/chromedriver")  ## path to where you saved chromedriver binary
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)
try:
    browser.get("https://pokemondb.net/pokedex/all")
    page_html = browser.page_source  # grab the whole page's HTML in a single call
finally:
    # Shut the browser down even if loading the page fails.
    browser.quit()

# read_html returns a list of DataFrames, one per <table> on the page.
# The markup is wrapped in StringIO because newer pandas releases deprecate
# passing a literal HTML string.
dfs = pd.read_html(StringIO(page_html))
dfs[0]
This returns a dataframe with 1075 rows × 10 columns:
# Name Type Total HP Attack Defense Sp. Atk Sp. Def Speed
0 1 Bulbasaur Grass Poison 318 45 49 49 65 65 45
1 2 Ivysaur Grass Poison 405 60 62 63 80 80 60
2 3 Venusaur Grass Poison 525 80 82 83 100 100 80
3 3 Venusaur Mega Venusaur Grass Poison 625 80 100 123 122 120 80
4 4 Charmander Fire 309 39 52 43 60 50 65
... ... ... ... ... ... ... ... ...