I have written code that scrapes data from the AliExpress website, with pagination so I can parse several pages and collect their data.
But when I execute the code I get this error after the first page:
Traceback (most recent call last):
File "c:\Users\aicha\Desktop\mycode\aliexpress\draft.py", line 24, in <module>
driver.get(url.format(page_nb))
File "C:\Users\aicha\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\selenium\webdriver\remote\webdriver.py", line 437, in get
self.execute(Command.GET, {'url': url})
File "C:\Users\aicha\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\selenium\webdriver\remote\webdriver.py", line 425, in execute
self.error_handler.check_response(response)
File "C:\Users\aicha\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\selenium\webdriver\remote\errorhandler.py", line 247, in check_response
raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.InvalidSessionIdException: Message: invalid session id
The code:
from selenium.webdriver.edge.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from pymongo import MongoClient
from time import sleep
from lxml import html
import pandas as pd
import cssselect
import pymongo
import time

start_time = time.time()

options = Options()
options.headless = True
driver = webdriver.Edge(executable_path=r"C:\Users\aicha\Desktop\mycode\aliexpress_scrap\scrap\codes\msedgedriver", options=options)

url = 'https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0&SearchText=bluetooth earphones&ltype=wholesale&SortType=default&page={}'
baseurl = 'https://www.aliexpress.com'

for page_nb in range(1, 5):
    print('---', page_nb, '---')

    driver.get(url.format(page_nb))
    sleep(2)

    current_offset = 0
    while True:
        driver.execute_script("window.scrollBy(0, window.innerHeight);")
        sleep(.5)  # give JavaScript time to add elements
        new_offset = driver.execute_script("return window.pageYOffset;")
        print(new_offset, current_offset)
        if new_offset <= current_offset:
            break
        current_offset = new_offset

    sleep(3)

    tree = html.fromstring(driver.page_source)

    results = []

    for product in tree.xpath('//div[@]//a'):
        title = product.xpath('.//h1/text()')
        if title:
            title = title[0]

            price = product.cssselect('div.mGXnE._37W_B span')
            price = [x.text for x in price]
            currency = price[0]
            price = ''.join(price[1:])

            stars = product.xpath('.//span[@]/text()')
            if stars:
                stars = stars[0]
            else:
                stars = 'None'

            nb_sold = product.xpath('.//span[@]/text()')
            if nb_sold:
                nb_sold = nb_sold[0]
            else:
                nb_sold = 'None'

            supl = product.xpath('.//a[@]/text()')
            if supl:
                supl = supl[0]
            else:
                supl = 'None'

            ship_cost = product.xpath('.//span[@]/text()')
            if ship_cost:
                ship_cost = ship_cost[0]
            else:
                ship_cost = 'None'

            product_links = product.xpath('./@href')
            if product_links:
                product_links = str(baseurl) + str(product_links[0])

            row = [title, price, currency, stars, nb_sold, ship_cost, supl, product_links]
            results.append(row)

    print('len(results):', len(results))

    driver.close()

    df = pd.DataFrame(results, columns=("Title", "Price", "Currency", "Stars", "Orders", "Shipcost", "Supplier", "Productlinks"))

    ####### Insert in database #############
    client = MongoClient("mongodb://localhost:27017/")
    collection = client['db2']['aliex2']
    data = df.to_dict(orient='records')
    collection.insert_many(data)

print("--- %s seconds ---" % (time.time() - start_time))
I don't know how to deal with this error; I would be grateful if you could help me. Thank you!
CodePudding user response:
The main problem in your code is that `driver.close()` is called inside the loop: the browser session ends after the first page, so the next `driver.get()` has no session to work with and raises `InvalidSessionIdException`.
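For illustration, here is a minimal sketch of the same failure, reusing the driver setup from your own script (only the two example `get()` calls are added): once `close()` has ended the session, the next command fails exactly like in your traceback.

from selenium import webdriver
from selenium.webdriver.edge.options import Options
from selenium.common.exceptions import WebDriverException

options = Options()
options.headless = True
# same driver setup as in your script
driver = webdriver.Edge(executable_path=r"C:\Users\aicha\Desktop\mycode\aliexpress_scrap\scrap\codes\msedgedriver", options=options)

driver.get('https://www.aliexpress.com')   # first page loads fine
driver.close()                             # the session is gone once the last window is closed

try:
    driver.get('https://www.aliexpress.com')   # the same call your loop makes for page 2
except WebDriverException as exc:
    print('second get() failed:', exc)         # surfaces as InvalidSessionIdException, as in the traceback above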
There are two solutions below.
Solution 1:
from selenium.webdriver.edge.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from pymongo import MongoClient
from time import sleep
from lxml import html
import pandas as pd
import cssselect
import pymongo
import time

start_time = time.time()

options = Options()
options.headless = True
driver = webdriver.Edge(executable_path=r"C:\Users\aicha\Desktop\mycode\aliexpress_scrap\scrap\codes\msedgedriver", options=options)

url = 'https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0&SearchText=bluetooth earphones&ltype=wholesale&SortType=default&page={}'
baseurl = 'https://www.aliexpress.com'

for page_nb in range(1, 5):
    print('---', page_nb, '---')

    driver.get(url.format(page_nb))
    sleep(2)

    current_offset = 0
    while True:
        driver.execute_script("window.scrollBy(0, window.innerHeight);")
        sleep(.5)  # give JavaScript time to add elements
        new_offset = driver.execute_script("return window.pageYOffset;")
        print(new_offset, current_offset)
        if new_offset <= current_offset:
            break
        current_offset = new_offset

    sleep(3)

    tree = html.fromstring(driver.page_source)

    results = []

    for product in tree.xpath('//div[@]//a'):
        title = product.xpath('.//h1/text()')
        if title:
            title = title[0]

            price = product.cssselect('div.mGXnE._37W_B span')
            price = [x.text for x in price]
            currency = price[0]
            price = ''.join(price[1:])

            stars = product.xpath('.//span[@]/text()')
            if stars:
                stars = stars[0]
            else:
                stars = 'None'

            nb_sold = product.xpath('.//span[@]/text()')
            if nb_sold:
                nb_sold = nb_sold[0]
            else:
                nb_sold = 'None'

            supl = product.xpath('.//a[@]/text()')
            if supl:
                supl = supl[0]
            else:
                supl = 'None'

            ship_cost = product.xpath('.//span[@]/text()')
            if ship_cost:
                ship_cost = ship_cost[0]
            else:
                ship_cost = 'None'

            product_links = product.xpath('./@href')
            if product_links:
                product_links = str(baseurl) + str(product_links[0])

            row = [title, price, currency, stars, nb_sold, ship_cost, supl, product_links]
            results.append(row)

    print('len(results):', len(results))

    # driver.close()  # removed: the driver must stay open to load the next page

    df = pd.DataFrame(results, columns=("Title", "Price", "Currency", "Stars", "Orders", "Shipcost", "Supplier", "Productlinks"))

    ####### Insert in database #############
    client = MongoClient("mongodb://localhost:27017/")
    collection = client['db2']['aliex2']
    data = df.to_dict(orient='records')
    collection.insert_many(data)

print("--- %s seconds ---" % (time.time() - start_time))
The code above never closes the browser inside the loop, so the same driver keeps running and can simply open the next page.
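With Solution 1 you may still want to shut the browser down once, after all pages are done. A minimal sketch (just a suggestion, everything inside the loop stays exactly as above) is to call `driver.quit()` right before the final timing print:

# after the for loop has processed all pages:
driver.quit()   # close the browser and end the session once, when scraping is finished
print("--- %s seconds ---" % (time.time() - start_time))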
Solution 2:
from selenium.webdriver.edge.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from pymongo import MongoClient
from time import sleep
from lxml import html
import pandas as pd
import cssselect
import pymongo
import time

start_time = time.time()

options = Options()
options.headless = True

url = 'https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0&SearchText=bluetooth earphones&ltype=wholesale&SortType=default&page={}'
baseurl = 'https://www.aliexpress.com'

for page_nb in range(1, 5):
    # open the driver inside the loop: it is closed at the end of each page and reopened for the next one
    driver = webdriver.Edge(executable_path=r"C:\Users\aicha\Desktop\mycode\aliexpress_scrap\scrap\codes\msedgedriver", options=options)

    print('---', page_nb, '---')

    driver.get(url.format(page_nb))
    sleep(2)

    current_offset = 0
    while True:
        driver.execute_script("window.scrollBy(0, window.innerHeight);")
        sleep(.5)  # give JavaScript time to add elements
        new_offset = driver.execute_script("return window.pageYOffset;")
        print(new_offset, current_offset)
        if new_offset <= current_offset:
            break
        current_offset = new_offset

    sleep(3)

    tree = html.fromstring(driver.page_source)

    results = []

    for product in tree.xpath('//div[@]//a'):
        title = product.xpath('.//h1/text()')
        if title:
            title = title[0]

            price = product.cssselect('div.mGXnE._37W_B span')
            price = [x.text for x in price]
            currency = price[0]
            price = ''.join(price[1:])

            stars = product.xpath('.//span[@]/text()')
            if stars:
                stars = stars[0]
            else:
                stars = 'None'

            nb_sold = product.xpath('.//span[@]/text()')
            if nb_sold:
                nb_sold = nb_sold[0]
            else:
                nb_sold = 'None'

            supl = product.xpath('.//a[@]/text()')
            if supl:
                supl = supl[0]
            else:
                supl = 'None'

            ship_cost = product.xpath('.//span[@]/text()')
            if ship_cost:
                ship_cost = ship_cost[0]
            else:
                ship_cost = 'None'

            product_links = product.xpath('./@href')
            if product_links:
                product_links = str(baseurl) + str(product_links[0])

            row = [title, price, currency, stars, nb_sold, ship_cost, supl, product_links]
            results.append(row)

    print('len(results):', len(results))

    driver.close()

    df = pd.DataFrame(results, columns=("Title", "Price", "Currency", "Stars", "Orders", "Shipcost", "Supplier", "Productlinks"))

    ####### Insert in database #############
    client = MongoClient("mongodb://localhost:27017/")
    collection = client['db2']['aliex2']
    data = df.to_dict(orient='records')
    collection.insert_many(data)

print("--- %s seconds ---" % (time.time() - start_time))
When the first page is finished, the code above closes the browser, opens a new one, and continues with the second page.
Solution 2 takes more time than Solution 1 because it closes and reopens the browser for every page.
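If you prefer Solution 2, one optional refinement (only a sketch, not required for the fix) is to wrap each page in try/finally, so the browser is always closed even when scraping or the database insert fails for that page:

for page_nb in range(1, 5):
    driver = webdriver.Edge(executable_path=r"C:\Users\aicha\Desktop\mycode\aliexpress_scrap\scrap\codes\msedgedriver", options=options)
    try:
        print('---', page_nb, '---')
        driver.get(url.format(page_nb))
        # ... scroll, parse and insert into MongoDB exactly as in Solution 2 ...
    finally:
        driver.quit()   # always release the session, even if this page raised an error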