I am trying to scrape Udemy, and I get more course titles than course prices, but I do not know why this happens. I know some ways to avoid the pandas error, but that is not a real solution; the solution must scrape exactly the same number of titles and prices:
from selenium import webdriver
import pandas as pd
import time
import selenium
# These options are set so that Udemy is less likely to detect Selenium as a bot.
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
# Scrape course titles and prices from a Udemy search page and write them
# to an Excel sheet.

# Chrome options meant to make the automated browser look less like a bot.
options = Options()
options.add_argument("start-maximized")
# add_experimental_option() keeps a single value per option name, so calling
# it twice with "excludeSwitches" would discard the first list. Pass both
# switches in one call so "enable-automation" is actually excluded.
options.add_experimental_option(
    "excludeSwitches", ["enable-automation", "enable-logging"]
)
options.add_experimental_option('useAutomationExtension', False)
options.add_argument('--disable-blink-features=AutomationControlled')

website = "https://www.udemy.com/courses/search/?src=ukw&q=python"
s = Service('C:\\Users\\Albin Rodriguez\\Documents\\Aprendiendo\\web_scraping\\chromedriver.exe')
driver = webdriver.Chrome(service=s, options=options)
try:
    driver.get(website)
    # Fixed pause to give the page time to render before it is queried.
    time.sleep(5)

    # NOTE(review): the attribute test after '@' appears to have been lost
    # from this expression; '//h3[@]' is not valid XPath as written.
    # TODO: restore the intended attribute predicate.
    titles = driver.find_elements(By.XPATH, '//h3[@]')
    prices = driver.find_elements(
        By.XPATH, '//div[@data-purpose="price-text-container"]//span/span'
    )

    # Read the text while the browser session is still alive.
    courses = [title.text for title in titles]
    prices_courses = [price.text for price in prices]

    input()  # Manual pause to check in the browser that all prices are shown.
finally:
    # Always close the browser and stop chromedriver, even on failure.
    driver.quit()

# pandas
# Both lists must have the same length for this dict-of-lists constructor;
# a title/price count mismatch has to be resolved in the locators above.
df = pd.DataFrame({'cursos': courses, 'precios': prices_courses})
df.to_excel("precio_cursos2.xlsx", index=False)
CodePudding user response:
Now run the code
from selenium import webdriver
import pandas as pd
import time
import selenium
# These options are set so that Udemy is less likely to detect Selenium as a bot.
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
# Scrape course titles and prices from a Udemy search page, pair them up
# positionally, and write the pairs to an Excel sheet.

# Chrome options meant to make the automated browser look less like a bot.
options = Options()
options.add_argument("start-maximized")
# add_experimental_option() keeps a single value per option name, so calling
# it twice with "excludeSwitches" would discard the first list. Pass both
# switches in one call so "enable-automation" is actually excluded.
options.add_experimental_option(
    "excludeSwitches", ["enable-automation", "enable-logging"]
)
options.add_experimental_option('useAutomationExtension', False)
options.add_argument('--disable-blink-features=AutomationControlled')

website = "https://www.udemy.com/courses/search/?src=ukw&q=python"
s = Service('C:\\Users\\Albin Rodriguez\\Documents\\Aprendiendo\\web_scraping\\chromedriver.exe')
driver = webdriver.Chrome(service=s, options=options)
try:
    driver.get(website)
    # Fixed pause to give the page time to render before it is queried.
    time.sleep(5)

    # NOTE(review): the attribute test after '@' appears to have been lost
    # from this expression; '//h3[@]/a' is not valid XPath as written.
    # TODO: restore the intended attribute predicate.
    titles = [x.text for x in driver.find_elements(By.XPATH, '//h3[@]/a')]
    # Only the first 31 matched price elements are kept.
    prices = [
        x.text
        for x in driver.find_elements(
            By.XPATH, '//div[@data-purpose="price-text-container"]//span/span'
        )[:31]
    ]
finally:
    # Always close the browser and stop chromedriver, even on failure.
    driver.quit()

# pandas
# zip() stops at the shorter list, so the frame always has matching columns;
# pairing is purely by position in each list.
df = pd.DataFrame(data=list(zip(titles, prices)), columns=['cursos', 'precios'])
df.to_excel("precio_cursos2.xlsx", index=False)