How to make our webscraping script check both scenarios but execute only the one needed


I'm scraping some data from a website; here's my script:

import warnings
warnings.filterwarnings("ignore")

import re
import requests
from requests import get
from bs4 import BeautifulSoup

import os
import time
import pandas as pd
import numpy as np
import shutil
from selenium import webdriver

from selenium.webdriver.support.select import Select
from selenium.webdriver.support.ui import WebDriverWait     
from selenium.webdriver.common.by import By     
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3',
    'Referer': 'https://www.espncricinfo.com/',
    'Upgrade-Insecure-Requests': '1',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
}


PATH = r"driver\chromedriver.exe"

options = webdriver.ChromeOptions() 


options.add_argument("--disable-gpu")
#options.add_argument('enable-logging')
options.add_argument("start-maximized")
#options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
options.add_experimental_option('excludeSwitches', ['enable-logging'])

driver = webdriver.Chrome(options=options, executable_path=PATH)

driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.53 Safari/537.36'})

url = 'https://www.boursorama.com/'

driver.get(url)

cookie = WebDriverWait(driver, 50).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="didomi-notice-agree-button"]')))

try:
    cookie.click()
except:
    pass

df = pd.read_excel('liste.xlsx')

df2 = pd.DataFrame(df)

df3 = df2['Entreprises'].values.tolist()

currencies = []

for i in df3:

    try :

        print(i)
        searchbar = WebDriverWait(driver, 50).until(EC.element_to_be_clickable((By.XPATH, 'html/body/div[6]/div[3]/div[2]/ol/li[1]/button')))
        searchbar.click()

        searchbar2 = WebDriverWait(driver, 50).until(EC.element_to_be_clickable((By.XPATH, '/html/body/div[6]/div[1]/div[2]/form/div/input')))
        searchbar2.click()
        searchbar2.send_keys(i + '\n')

        time.sleep(2)

        links = driver.find_elements_by_xpath('//*[@id="main-content"]/div/div/div[4]/div[1]/div[3]/div/div/div[2]/div[1]/div/div[3]/div/div[1]/div/table/tbody/tr[1]/td[1]/div/div[2]/a')
        for k in links:
            data = k.get_attribute("href")

        results = requests.get(data)

        soup = BeautifulSoup(results.text, "html.parser")

        currency = soup.find('span', class_= 'c-instrument c-instrument--last').text

        currencies.append(currency)

    except :

        print(i)
        searchbar = WebDriverWait(driver, 50).until(EC.element_to_be_clickable((By.XPATH, 'html/body/div[6]/div[3]/div[2]/ol/li[1]/button')))
        searchbar.click()

        searchbar2 = WebDriverWait(driver, 50).until(EC.element_to_be_clickable((By.XPATH, '/html/body/div[6]/div[1]/div[2]/form/div/input')))
        searchbar2.click()
        searchbar2.send_keys(i + '\n')

        time.sleep(2)

        url2 = driver.current_url

        results = requests.get(url2)

        soup = BeautifulSoup(results.text, "html.parser")

        currency = soup.find('span', class_= 'c-instrument c-instrument--last').text

        currencies.append(currency)

print(currencies)

liste.xlsx is just an Excel file with the company names for my loop:

liste

And here's my output :

TotalEnergies
TotalEnergies
Engie
Engie
BNP
BNP
['45.59', '11.07', '49.03']

I don't understand: it seems my script executes both the try and the except. I get 3 results as intended, but each company name is printed twice. My goal was: execute the try branch if possible, otherwise execute the except branch.

Can I improve my code so that it executes only one of them? The one that is needed.

Because sometimes when searching for a company you need to be more specific, and the site offers you some alternatives, hence this code:

try :
    print(i)
    searchbar = WebDriverWait(driver, 50).until(EC.element_to_be_clickable((By.XPATH, 'html/body/div[6]/div[3]/div[2]/ol/li[1]/button')))
    searchbar.click()

    searchbar2 = WebDriverWait(driver, 50).until(EC.element_to_be_clickable((By.XPATH, '/html/body/div[6]/div[1]/div[2]/form/div/input')))
    searchbar2.click()
    searchbar2.send_keys(i + '\n')

    time.sleep(2)

    links = driver.find_elements_by_xpath('//*[@id="main-content"]/div/div/div[4]/div[1]/div[3]/div/div/div[2]/div[1]/div/div[3]/div/div[1]/div/table/tbody/tr[1]/td[1]/div/div[2]/a')
    for k in links:
        data = k.get_attribute("href")

    results = requests.get(data)

    soup = BeautifulSoup(results.text, "html.parser")

    currency = soup.find('span', class_= 'c-instrument c-instrument--last').text

    currencies.append(currency)

And sometimes, when you type the exact name into the search bar, the website goes straight to the desired page, hence this code:

except :
    print(i)
    searchbar = WebDriverWait(driver, 50).until(EC.element_to_be_clickable((By.XPATH, 'html/body/div[6]/div[3]/div[2]/ol/li[1]/button')))
    searchbar.click()

    searchbar2 = WebDriverWait(driver, 50).until(EC.element_to_be_clickable((By.XPATH, '/html/body/div[6]/div[1]/div[2]/form/div/input')))
    searchbar2.click()
    searchbar2.send_keys(i + '\n')

    time.sleep(2)

    url2 = driver.current_url

    results = requests.get(url2)

    soup = BeautifulSoup(results.text, "html.parser")

    currency = soup.find('span', class_= 'c-instrument c-instrument--last').text

    currencies.append(currency)

But how can I make the script check both scenarios and execute only the one needed, to improve performance?

CodePudding user response:

"My goal was to : if needed execute try, else execute except."

This is exactly what it's doing. I'd suggest looking into how to debug code: you would be able to run it line by line, follow the logic, and see what is occurring.

When you use try/except, Python tries to execute the code in the try block. If it succeeds, it skips the except block. If it fails at some point within the try block, execution jumps to the except block and runs that code instead.

The reason it APPEARS to be running both is that, technically, as described above, it does run parts of both. You are seeing each name printed twice because of the placement of your print() statements.

It enters the try block and prints i with print(i) right at the beginning. At some point in the try block after that print(i), an exception is raised, so execution jumps to the except block, where print(i) at the beginning of that block prints i again.
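This double-print can be reproduced in isolation. The sketch below (a hypothetical stand-in for the scraping loop, with the exception forced by a flag) collects what would be printed instead of printing it, so the flow is easy to inspect:

```python
# Minimal reproduction of the double-print: the first append mirrors the
# print(i) at the top of the try block; when an exception is raised after
# it, the except block appends the name a second time.
def lookup(name, fail):
    printed = []
    try:
        printed.append(name)              # print(i) at the top of try
        if fail:
            raise ValueError("element not found")
        printed.append("try-result")      # scraping succeeded in try
    except Exception:
        printed.append(name)              # print(i) at the top of except
        printed.append("except-result")   # scraping done in except instead
    return printed

print(lookup("Engie", fail=True))   # ['Engie', 'Engie', 'except-result']
print(lookup("Engie", fail=False))  # ['Engie', 'try-result']
```

Moving the print(i) above the try (or removing it from the except block) would make each name appear only once, but the try block would still run up to the failure point before the except block takes over.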

If you want it to check for a condition and only execute the branch you want, then you need an if block that tests that condition, not a try/except.
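If you do keep Selenium, the condition can come from find_elements itself: the plural form returns an empty list instead of raising when nothing matches, so its result can drive an if/else directly. A minimal sketch of that branching, with the Selenium calls factored out (in the real script, links would be the result of driver.find_elements_by_xpath(...) on the results-table link and current_url would be driver.current_url):

```python
def pick_url(links, current_url):
    """Decide which page to fetch: the first result link if the search
    offered alternatives, otherwise the page Selenium landed on."""
    if links:  # scenario 1: the site showed a results table
        return links[0].get_attribute("href")
    return current_url  # scenario 2: went straight to the instrument page

# Tiny stand-in for a Selenium WebElement, just to exercise the branch:
class FakeLink:
    def get_attribute(self, name):
        return "https://www.boursorama.com/cours/1rTTE/"

print(pick_url([FakeLink()], "https://fallback"))  # uses the result link
print(pick_url([], "https://fallback"))            # falls back to current_url
```

With this, the loop performs the search once, calls pick_url, and fetches exactly one page per company, with no exception driving the control flow.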

With that being said, it would be far more efficient to get the data from the source (the site's own JSON endpoints) instead of rendering pages with Selenium. You also get far more data: the JSON response contains much more than what you'd want to pull from the rendered page.

Code:

import requests
from bs4 import BeautifulSoup

df3 = ['TotalEnergies','Engie','BNP']

currencies = []
for i in df3:
    url = f'https://www.boursorama.com/recherche/ajax?query={i}&searchId='
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    symbol = soup.find('a', {'class':'search__list-link'})['href'].split('/')[-2]
    
    url = 'https://www.boursorama.com/bourse/action/graph/ws/GetTicksEOD'
    payload = {
            'symbol': symbol,
            'length': '1',
            'period': '0',
            'guid': ''}
    
    jsonData = requests.get(url, params=payload).json()
    data = jsonData['d']
    
    name = data['Name']
    qd = data['qd']['c']
    
    currencies.append(qd)
    print(f'{name}: {qd}')

print(currencies)

Output:

TOTALENERGIES: 45.59
ENGIE: 11.07
BNP PARIBAS: 49.03
[45.59, 11.07, 49.03]