Home > OS >  Python web scraping "All arrays must be of the same length" pandas error
Python web scraping "All arrays must be of the same length" pandas error

Time:03-25

I am trying to scrape Udemy, and I get more course titles than course prices, but I do not know why this happens. I know some ways to avoid the pandas error, but that is not the solution: the solution must be to scrape exactly the same number of names and prices:

from selenium import webdriver
import pandas as pd
import time 
import selenium


# I set all these options to keep Udemy from detecting Selenium as a bot

from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

# Scrape course titles and prices from a Udemy search page and export them
# to an Excel file with one column per list.
options = Options()
options.add_argument("start-maximized")
# Both switches are passed in ONE call: setting the same experimental option
# name twice keeps only the value from the last call, which would silently
# drop "enable-automation".
options.add_experimental_option(
    "excludeSwitches", ["enable-automation", "enable-logging"]
)
options.add_experimental_option('useAutomationExtension', False)
options.add_argument('--disable-blink-features=AutomationControlled')

website = "https://www.udemy.com/courses/search/?src=ukw&q=python"

s = Service('C:\\Users\\Albin Rodriguez\\Documents\\Aprendiendo\\web_scraping\\chromedriver.exe')
driver = webdriver.Chrome(service=s, options=options)
try:
    driver.get(website)

    # Fixed pause so the search results have time to render.
    time.sleep(5)

    # find_elements(By.XPATH, ...) replaces the find_elements_by_xpath helper,
    # which newer Selenium 4 releases no longer provide.
    # NOTE(review): '[@]' is an empty attribute predicate and is not valid
    # XPath; the attribute test seems to have been lost from this snippet --
    # restore the intended one before running.
    titles = driver.find_elements(By.XPATH, '//h3[@]')
    prices = driver.find_elements(By.XPATH, '//div[@data-purpose="price-text-container"]//span/span')

    # Read the text while the browser session is still open.
    courses = [title.text for title in titles]
    prices_courses = [price.text for price in prices]

    input()  # Pause here to check in the browser that all the prices are shown.
finally:
    # Always close the browser and stop chromedriver, even after an error.
    driver.quit()

#pandas
# The two lists are collected independently, so they can differ in length;
# building a DataFrame from a dict of lists requires equal lengths, which is
# the "All arrays must be of the same length" error this script runs into.
df = pd.DataFrame({'cursos': courses, 'precios': prices_courses})
df.to_excel("precio_cursos2.xlsx", index=False)

CodePudding user response:

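Build the rows with zip(titles, prices) so that both columns always end up with the same number of entries. Now run the code: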

from selenium import webdriver
import pandas as pd
import time 
import selenium


# I set all these options to keep Udemy from detecting Selenium as a bot

from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

# Scrape course titles and prices from a Udemy search page, pair them up
# row by row, and export the pairs to an Excel file.
options = Options()
options.add_argument("start-maximized")
# Both switches are passed in ONE call: setting the same experimental option
# name twice keeps only the value from the last call, which would silently
# drop "enable-automation".
options.add_experimental_option(
    "excludeSwitches", ["enable-automation", "enable-logging"]
)
options.add_experimental_option('useAutomationExtension', False)
options.add_argument('--disable-blink-features=AutomationControlled')

website = "https://www.udemy.com/courses/search/?src=ukw&q=python"

s = Service('C:\\Users\\Albin Rodriguez\\Documents\\Aprendiendo\\web_scraping\\chromedriver.exe')
driver = webdriver.Chrome(service=s, options=options)
try:
    driver.get(website)

    # Fixed pause so the search results have time to render.
    time.sleep(5)

    # find_elements(By.XPATH, ...) replaces the find_elements_by_xpath helper,
    # which newer Selenium 4 releases no longer provide.
    # NOTE(review): '[@]' is an empty attribute predicate and is not valid
    # XPath; the attribute test seems to have been lost from this snippet --
    # restore the intended one before running.
    titles = [x.text for x in driver.find_elements(By.XPATH, '//h3[@]/a')]
    prices = [x.text for x in driver.find_elements(By.XPATH, '//div[@data-purpose="price-text-container"]//span/span')]
finally:
    # Always close the browser and stop chromedriver, even after an error.
    driver.quit()

#pandas
# zip() stops at the shorter list, so every row has both a title and a price
# and no hard-coded slice length is needed.
# NOTE(review): pairing is purely positional -- it assumes the i-th price
# element belongs to the i-th title; confirm against the page markup.
rows = list(zip(titles, prices))
df = pd.DataFrame(data=rows, columns=['cursos', 'precios'])
df.to_excel("precio_cursos2.xlsx", index=False)
  • Related