I have written a Python script that scrapes products from AliExpress.
Here is my code:
from selenium.webdriver.edge.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from pymongo import MongoClient
from time import sleep
from lxml import html
import pandas as pd
import cssselect
import pymongo
import json
import csv
def _first_or_none(matches):
    """Return the first XPath match, or the placeholder string 'None' if there is none."""
    return matches[0] if matches else 'None'


# --- Browser setup -----------------------------------------------------------
options = Options()
options.headless = True
driver = webdriver.Edge(executable_path=r"C:\Users\aicha\Desktop\mycode\aliexpress_scrap\scrap\codes\msedgedriver",options=options)

# The query string previously read `...earphones<ype=wholesale...`; that looks
# like `&ltype=wholesale` after an HTML-unescape turned `&lt` into `<`, so the
# separator and parameter name are restored here.
url = 'https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0&SearchText=bluetooth earphones&ltype=wholesale&SortType=default&page={}'
baseurl = 'https://www.aliexpress.com'

# Rows are accumulated across every page, so the list is created once, before
# the page loop, instead of being reset for each page.
results = []

try:
    for page_nb in range(1, 2):
        print('---', page_nb, '---')
        driver.get(url.format(page_nb))
        sleep(2)

        # Scroll down one viewport at a time until the offset stops growing,
        # so that lazily loaded products are present in the page source.
        current_offset = 0
        while True:
            driver.execute_script("window.scrollBy(0, window.innerHeight);")
            sleep(.5)  # JavaScript has time to add elements
            new_offset = driver.execute_script("return window.pageYOffset;")
            print(new_offset, current_offset)
            if new_offset <= current_offset:
                break
            current_offset = new_offset

        sleep(3)
        tree = html.fromstring(driver.page_source)

        # NOTE(review): every `[@]` predicate below is empty, which is not a
        # valid XPath attribute test. The attribute name/value (presumably a
        # class selector) is missing from this copy of the script and must be
        # restored before the queries can run.
        for product in tree.xpath('//div[@]//a'):
            title = product.xpath('.//h1/text()')
            if not title:
                # Anchors without a title are not product cards; skip them.
                continue
            title = title[0]

            # The first span holds the currency symbol, the rest the amount.
            price_parts = [x.text or '' for x in product.cssselect('div.mGXnE._37W_B span')]
            if price_parts:
                currency = price_parts[0]
                price = ''.join(price_parts[1:])
            else:
                # No price spans matched: use the same placeholder as other fields.
                currency = 'None'
                price = 'None'

            stars = _first_or_none(product.xpath('.//span[@]/text()'))
            nb_sold = _first_or_none(product.xpath('.//span[@]/text()'))
            supl = _first_or_none(product.xpath('.//a[@]/text()'))
            ship_cost = _first_or_none(product.xpath('.//span[@]/text()'))

            product_links = product.xpath('./@href')
            if product_links:
                product_links = str(baseurl) + str(product_links[0])
            else:
                product_links = 'None'

            row = [title, price, currency, stars, nb_sold, ship_cost, supl, product_links]
            results.append(row)

    print('len(results):', len(results))
finally:
    # quit() ends the whole WebDriver session; running it in `finally` makes
    # sure the browser is released even if scraping raises.
    driver.quit()

df = pd.DataFrame(results , columns=("Title","Price", "Currency", "Stars", "Orders", "Shipcost", "Supplier", "Productlinks" ))

####### Insert in database #############
client = MongoClient("mongodb://localhost:27017/")
collection = client['db2']['aliex2']
data = df.to_dict(orient = 'records')
if data:
    collection.insert_many(data)
else:
    # insert_many() rejects an empty document list, so skip the call.
    print('No products scraped; nothing inserted.')
My question:
I need to add a timer that measures how long the whole process takes and returns that value, so I know how much time the code needs. I also want a way to get a new collection after each scraping run, because when I run my code a second time, the new data ends up in the old collection.
I appreciate any help. Thank you!
CodePudding user response:
The code below may solve your problem:
from selenium.webdriver.edge.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from pymongo import MongoClient
from time import sleep
from lxml import html
import pandas as pd
import cssselect
import pymongo
import json
import csv
import time as Time
def _first_or_none(matches):
    """Return the first XPath match, or the placeholder string 'None' if there is none."""
    return matches[0] if matches else 'None'


# Start of the whole run; perf_counter() is a monotonic, high-resolution clock
# intended for measuring elapsed intervals.
run_start_time = Time.perf_counter()

# --- Browser setup -----------------------------------------------------------
options = Options()
options.headless = True
driver = webdriver.Edge(executable_path=r"C:\Users\aicha\Desktop\mycode\aliexpress_scrap\scrap\codes\msedgedriver",options=options)

# The query string previously read `...earphones<ype=wholesale...`; that looks
# like `&ltype=wholesale` after an HTML-unescape turned `&lt` into `<`, so the
# separator and parameter name are restored here.
url = 'https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0&SearchText=bluetooth earphones&ltype=wholesale&SortType=default&page={}'
baseurl = 'https://www.aliexpress.com'

# Rows are accumulated across every page, so the list is created once, before
# the page loop, instead of being reset for each page.
results = []

try:
    for page_nb in range(1, 2):
        print('---', page_nb, '---')
        driver.get(url.format(page_nb))
        sleep(2)

        # Scroll down one viewport at a time until the offset stops growing,
        # so that lazily loaded products are present in the page source.
        current_offset = 0
        while True:
            driver.execute_script("window.scrollBy(0, window.innerHeight);")
            sleep(.5)  # JavaScript has time to add elements
            new_offset = driver.execute_script("return window.pageYOffset;")
            print(new_offset, current_offset)
            if new_offset <= current_offset:
                break
            current_offset = new_offset

        sleep(3)
        tree = html.fromstring(driver.page_source)

        # NOTE(review): every `[@]` predicate below is empty, which is not a
        # valid XPath attribute test. The attribute name/value (presumably a
        # class selector) is missing from this copy of the script and must be
        # restored before the queries can run.
        for product in tree.xpath('//div[@]//a'):
            # Per-product timer: covers only the parsing of this one element.
            start_time = Time.perf_counter()

            title = product.xpath('.//h1/text()')
            if not title:
                # Anchors without a title are not product cards; skip them.
                continue
            title = title[0]

            # The first span holds the currency symbol, the rest the amount.
            price_parts = [x.text or '' for x in product.cssselect('div.mGXnE._37W_B span')]
            if price_parts:
                currency = price_parts[0]
                price = ''.join(price_parts[1:])
            else:
                # No price spans matched: use the same placeholder as other fields.
                currency = 'None'
                price = 'None'

            stars = _first_or_none(product.xpath('.//span[@]/text()'))
            nb_sold = _first_or_none(product.xpath('.//span[@]/text()'))
            supl = _first_or_none(product.xpath('.//a[@]/text()'))
            ship_cost = _first_or_none(product.xpath('.//span[@]/text()'))

            product_links = product.xpath('./@href')
            if product_links:
                product_links = str(baseurl) + str(product_links[0])
            else:
                product_links = 'None'

            # Seconds spent extracting this product's fields.
            diffrence_time = Time.perf_counter() - start_time
            row = [title, price, currency, stars, nb_sold, ship_cost, supl, product_links, diffrence_time]
            results.append(row)

    print('len(results):', len(results))
finally:
    # quit() ends the whole WebDriver session; running it in `finally` makes
    # sure the browser is released even if scraping raises.
    driver.quit()

df = pd.DataFrame(results , columns=("Title","Price", "Currency", "Stars", "Orders", "Shipcost", "Supplier", "Productlinks", "Time Taken"))

####### Insert in database #############
client = MongoClient("mongodb://localhost:27017/")
collection = client['db2']['aliex2']
data = df.to_dict(orient = 'records')
if data:
    collection.insert_many(data)
else:
    # insert_many() rejects an empty document list, so skip the call.
    print('No products scraped; nothing inserted.')

# Elapsed time of the whole process: page loads, scrolling, parsing and insert.
total_time = Time.perf_counter() - run_start_time
print('total time (s):', total_time)
I measured the time taken inside the loop for each product and stored that time difference in the DataFrame.