I have created a scrapping tool to crawl data from aliexpress website using python and selenium. This is my script in python :
from selenium.webdriver.edge.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from pymongo import MongoClient
from time import sleep
from lxml import html
import pandas as pd
import cssselect
import pymongo
import time
def scrap(subject):
start_time = time.time()
options = Options()
options.headless = True
driver = webdriver.Edge(executable_path=r"C:\Users\aicha\Desktop\mycode\aliexpress_scrap\scrap\codes\msedgedriver",options=options)
url = 'https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0&SearchText=' subject '<ype=wholesale&SortType=default&page={}'
# baseurl = 'https://www.aliexpress.com'
for page_nb in range(1, 5):
print('---', page_nb, '---')
driver.get(url.format(page_nb))
sleep(2)
current_offset = 0
while True:
driver.execute_script("window.scrollBy(0, window.innerHeight);")
sleep(.5) # JavaScript has time to add elements
new_offset = driver.execute_script("return window.pageYOffset;")
if new_offset <= current_offset:
break
current_offset = new_offset
sleep(3)
tree = html.fromstring(driver.page_source)
results = []
for product in tree.xpath('//div[@]//a'):
title = product.xpath('.//h1/text()')
if title:
title = title[0]
price = product.cssselect('div.mGXnE._37W_B span')
price = [x.text for x in price]
currency = price[0]
price = ''.join(price[1:])
stars = product.xpath('.//span[@]/text()')
if stars :
stars = stars [0]
else:
stars = 'None'
nb_sold = product.xpath('.//span[@]/text()')
if nb_sold:
nb_sold = nb_sold[0]
else:
nb_sold = 'None'
supl = product.xpath('.//a[@]/text()')
if supl:
supl = supl[0]
else:
supl = 'None'
ship_cost = product.xpath('.//span[@]/text()')
if ship_cost:
ship_cost = ship_cost[0]
else:
ship_cost = 'None'
product_links = product.xpath('./@href')
if product_links:
product_links = str( product_links[0])
row = [title, price, currency, stars, nb_sold, ship_cost, supl, product_links]
results.append(row)
# driver.close()-------Remove this code so driver is open and can open URL
df = pd.DataFrame(results , columns=("Title","Price", "Currency", "Stars", "Orders", "Shipcost", "Supplier", "Productlinks" ))
client = MongoClient("mongodb://localhost:27017/")
collection = client['db2']['aliex2']
data = df.to_dict(orient = 'records')
collection.insert_many(data)
print("--- %s seconds ---" % (time.time() - start_time))
driver.quit()
return
Explanation :
My code get all data from aliexpress website and store it in a Mongodb database collection to display it in datatable format using django. The issue that I am struggling is that when I execute my code for the first time it all goes right. But when I execute it for the second time I get the data in the same collection instead of having a new collection and a new datatable. In otherwords I want for each code execution a new collection and a new datatable.
My datatable view :
def datatable_view(request):
if request.method =='POST':
form = Scraping(request.POST)
if form.is_valid():
subject=form.cleaned_data['subject']
scrap(subject)
client = pymongo.MongoClient("mongodb://localhost:27017/")
db= client["db2"]
col = db["aliex2"]
products = col.find()
context = {'products' : products}
return render(request,'datatable.html', context)
My datatable html :
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Aliexpress Datatable </title>
<meta name="description" content="Aliexpress DataTable.">
<link rel="stylesheet" href="//cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/4.0.0/css/bootstrap.css">
<link rel="stylesheet" href="//cdn.datatables.net/1.11.5/css/jquery.dataTables.min.css">
</head>
<body >
<div >
<div >
<div >
<table cellspacing="5" cellpadding="5">
<tbody>
<tr>
<td>Minimum star:</td>
<td><input type="text" id="min1" name="min1"></td>
</tr>
<tr>
<td>Maximum star:</td>
<td><input type="text" id="max1" name="max1"></td>
</tr>
<tr>
<td>Minimum price:</td>
<td><input type="text" id="min2" name="min2"></td>
</tr>
<tr>
<td>Maximum price:</td>
<td><input type="text" id="max2" name="max2"></td>
</tr>
<tr>
<td> Minimum number of orders :</td>
<td><input type="text" id="min3" name="min3"></td>
</tr>
<tr>
<td> Maximum number of orders:</td>
<td><input type="text" id="max3" name="max3"></td>
</tr>
</tbody>
</table>
<table id="scrap" style="width:100%">
<thead>
<tr>
<th >Title</th>
<th >Price</th>
<th >Currency</th>
<th >Stars</th>
<th >Number Of Orders</th>
<th >Shipping Cost</th>
<th >Supplier</th>
<th >Product Links</th>
</thead>
<tbody>
{% for product in products %}
<tr>
<td> {{ product.Title }} </td>
<td> {{ product.Price }} </td>
<td> {{ product.Currency }} </td>
<td> {{ product.Stars }} </td>
<td> {{ product.Orders}} </td>
<td> {{ product.Shipcost }} </td>
<td> {{ product.Supplier }} </td>
<td><a href="{{product.Productlinks }}"> Click here</a></td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
</div>
</div>
<script src="https://code.jquery.com/jquery-3.5.1.js"></script>
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.1.0/jquery.min.js"></script>
<script type="text/javascript" src="https://cdn.datatables.net/1.11.5/js/jquery.dataTables.min.js"></script>
<script>
$.fn.dataTable.ext.search.push(
function( settings, data, dataIndex ) {
var min1 = parseFloat( $('#min1').val(), 10 );
var max1 = parseFloat( $('#max1').val(), 10 );
var stars = parseFloat( data[3] ) || 0; // use data for the stars column
if ( ( isNaN( min1 ) && isNaN( max1 ) ) ||
( isNaN( min1) && stars <= max1 ) ||
( min1 <= stars && isNaN( max1 ) ) ||
( min1 <= stars && stars <= max1 ) )
{
return true;
}
return false;
}
);
$(document).ready(function() {
var table = $('#scrap').DataTable();
// Event listener to the two range filtering inputs to redraw on input
$('#min1, #max1').keyup( function() {
table.draw();
} );
} );
$.fn.dataTable.ext.search.push(
function( settings, data, dataIndex ) {
var min2 = parseFloat( $('#min2').val(), 10 );
var max2 = parseFloat( $('#max2').val(), 10 );
var price = parseFloat( data[1] ) || 0; // use data for the price column
if ( ( isNaN( min2 ) && isNaN( max2 ) ) ||
( isNaN( min2 ) && price <= max2 ) ||
( min2 <= price && isNaN( max2 ) ) ||
( min2 <= price && price <= max2 ) )
{
return true;
}
return false;
}
);
$(document).ready(function() {
var table = $('#scrap').DataTable();
// Event listener to the two range filtering inputs to redraw on input
$('#min2, #max2').keyup( function() {
table.draw();
} );
} );
$.fn.dataTable.ext.search.push(
function( settings, data, dataIndex ) {
var min3 = parseFloat( $('#min3').val(), 10 );
var max3 = parseFloat( $('#max3').val(), 10 );
var nb_o = parseFloat( data[4] ) || 0; // use data for the number of orders column
if ( ( isNaN( min3 ) && isNaN( max3 ) ) ||
( isNaN( min3 ) && nb_o <= max3 ) ||
( min3 <= nb_o && isNaN( max3 ) ) ||
( min3 <= nb_o && nb_o <= max3 ) )
{
return true;
}
return false;
}
);
$(document).ready(function() {
var table = $('#scrap').DataTable();
// Event listener to the two range filtering inputs to redraw on input
$('#min3, #max3').keyup( function() {
table.draw();
} );
} );
</script>
</body>
</html>
ps : subject is the name product that the user will enter before scraping.
I would be so grateful if you help me. Thank you in advance !
CodePudding user response:
You are always saving the date in the collection called aliex2
which is the reason it is storing in the same collection. If you want to store in a new collection, everytime you scrape new data make sure you use a unique collection name instead of aliex2
CodePudding user response:
MongoDB creates new databases and collections once you try to reference them. So by changing the referenced db and/or collection at the end of your scraping run, you could write to a new db/collection. To create a new collection for every subject, you could change the database reference in your scrap method to something like:
client = MongoClient("mongodb://localhost:27017/")
# use variable db and collection names
collection_name = subject
collection = client["db2"][collection_name]
data = df.to_dict(orient = 'records')
collection.insert_many(data)
To use the variable reference in the datatable_view, you would have to use the subject from the POST request. Using the following snippet might brake your request, but allows you to read the crawled subject dynamically:
def datatable_view(request):
if request.method =='POST':
form = Scraping(request.POST)
if form.is_valid():
subject=form.cleaned_data['subject']
scrap(subject)
client = pymongo.MongoClient("mongodb://localhost:27017/")
# use variable names for db and collection reference
db= client["db2"]
col = db[subject]
products = col.find()
context = {'products' : products}
return render(request,'datatable.html', context)
return
CodePudding user response:
By watching you problem I got an Idea may it helps you:
from selenium.webdriver.edge.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from pymongo import MongoClient
from time import sleep
from lxml import html
import pandas as pd
import cssselect
import pymongo
import time
def scrap(subject):
start_time = time.time()
options = Options()
options.headless = True
driver = webdriver.Edge(executable_path=r"C:\Users\aicha\Desktop\mycode\aliexpress_scrap\scrap\codes\msedgedriver",options=options)
url = 'https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0&SearchText=' subject '<ype=wholesale&SortType=default&page={}'
# baseurl = 'https://www.aliexpress.com'
for page_nb in range(1, 5):
print('---', page_nb, '---')
driver.get(url.format(page_nb))
sleep(2)
current_offset = 0
while True:
driver.execute_script("window.scrollBy(0, window.innerHeight);")
sleep(.5) # JavaScript has time to add elements
new_offset = driver.execute_script("return window.pageYOffset;")
if new_offset <= current_offset:
break
current_offset = new_offset
sleep(3)
tree = html.fromstring(driver.page_source)
results = []
for product in tree.xpath('//div[@]//a'):
title = product.xpath('.//h1/text()')
if title:
title = title[0]
price = product.cssselect('div.mGXnE._37W_B span')
price = [x.text for x in price]
currency = price[0]
price = ''.join(price[1:])
stars = product.xpath('.//span[@]/text()')
if stars :
stars = stars [0]
else:
stars = 'None'
nb_sold = product.xpath('.//span[@]/text()')
if nb_sold:
nb_sold = nb_sold[0]
else:
nb_sold = 'None'
supl = product.xpath('.//a[@]/text()')
if supl:
supl = supl[0]
else:
supl = 'None'
ship_cost = product.xpath('.//span[@]/text()')
if ship_cost:
ship_cost = ship_cost[0]
else:
ship_cost = 'None'
product_links = product.xpath('./@href')
if product_links:
product_links = str( product_links[0])
row = [title, price, currency, stars, nb_sold, ship_cost, supl, product_links]
results.append(row)
# driver.close()-------Remove this code so driver is open and can open URL
#I just put this below line out of the loop
df = pd.DataFrame(results , columns=("Title","Price", "Currency", "Stars", "Orders", "Shipcost", "Supplier", "Productlinks" ))
client = MongoClient("mongodb://localhost:27017/")
collection = client['db2']['aliex2']
data = df.to_dict(orient = 'records')
collection.insert_many(data)
print("--- %s seconds ---" % (time.time() - start_time))
driver.quit()
return
I had put database four line out off the page_nb loop so what it will do is rows of the all page will added to result list and at last result list will be added to dataframe and to database.
datatable view and datatable HTML will Remain same
Try this simple Idea to solve your problem.