Home > Software engineering >  Selenium getting stuck when looping through pages
Selenium getting stuck when looping through pages

Time:04-09

I have written code that pulls text from websites and analyses it for readability. However, the program sometimes gets stuck on a site. Is there any way to make it skip to the next iteration of the for loop if a page takes longer than x seconds? If you have any questions or need clarification, just let me know in the comments.

import time
import numpy as np
import pandas as pd
import openpyxl
import reqto as rq
from bs4 import BeautifulSoup
# from SpacySylGetter import *
# import readability
import selenium
from selenium import webdriver

# Load the input workbook; four of its columns drive the rest of the script.
TextIn = pd.read_excel('C:\\Users\\Max von Klemperer\\Desktop\\KeywordLinks\\Aus2.xlsx')

# Copy each required column out of the frame into a plain Python list.
WebURLs, Region, Keywords, Rankings = (
    list(TextIn[column].values)
    for column in ("URL", "Region", "Keyword", "Ranking")
)

# Numeric state, all initialised to zero.
spaces = syls = counter = characters = sentences = 0

# Empty accumulators (each name gets its own list object).
CLIs, FL, FLAuto, WebTexts = [], [], [], []
goodurl, goodKW, goodRegion, goodRanking = [], [], [], []

# Start the Chrome session used for every page fetch.
driver = webdriver.Chrome('C:\\Users\\Max von Klemperer\\Downloads\\chromedriver.exe')

# Upper bound, in seconds, on a single page load. Without it driver.get() can
# block indefinitely on a slow or hanging site; tune the value as needed.
PAGE_LOAD_TIMEOUT_SECONDS = 30

# Characters kept from the raw page text; everything else is filtered out.
ALLOWED_CHARS = frozenset(
    '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPGQRSTUVWXYZ-.?! \n')

# Make driver.get() raise once the timeout elapses; the `except` below then
# treats the page like any other failed URL and the loop moves on.
driver.set_page_load_timeout(PAGE_LOAD_TIMEOUT_SECONDS)

# Visit every URL, extract the visible <body> text, normalise it and keep the
# pages whose cleaned text falls inside the accepted length window.
# enumerate() ties `counter` to the row index of the current URL, so a failed
# page can no longer shift Region/Keywords/Rankings out of alignment.
for counter, i in enumerate(WebURLs):
    try:
        time.sleep(1)  # short pause between consecutive requests
        url = i
        driver.get(url)
        el = driver.find_element_by_tag_name('body')
        initText = el.text

        # Drop every character that is not in the whitelist, then collapse
        # all runs of whitespace (including newlines) into single spaces.
        TextPros = ''.join(ch for ch in initText if ch in ALLOWED_CHARS)
        cleanedStr = ' '.join(TextPros.split())
        print(i)

        # Discard the first and last 600 characters of the page text
        # (presumably header/footer boilerplate - confirm with the author).
        textToProc = cleanedStr[600:len(cleanedStr) - 600]

        # str.replace() returns a new string; the result must be assigned
        # back or the replacement is silently lost.
        textToProc = textToProc.replace("...", ".")
        textToProc = textToProc.replace("-", " ")
        textToProc = textToProc.replace(".com", " ")

        if 1000 < len(textToProc) < 100000:
            print(textToProc)
            WebTexts.append(textToProc)
            goodurl.append(i)
            goodRegion.append(Region[counter])
            goodKW.append(Keywords[counter])
            goodRanking.append(Rankings[counter])

    except Exception:
        # Best-effort scraping: any failure (including a page-load timeout)
        # skips this URL and continues with the next one.
        print("Bounced")

# Score every collected page text and append the result to CLIs, one entry per
# WebTexts entry and in the same order.
# NOTE(review): "CLI" is presumably the Coleman-Liau index - the constants
# resemble a common form of that formula; confirm with the author.
for i in WebTexts:
    words = len(i.split())
    commas = i.count(",")
    spaces = i.count(" ")
    Hyphens = i.count("-")
    # syls = sylsGet(i)
    # print(syls)

    # Sentence count is approximated by the number of terminators.
    sentences = i.count(".") + i.count("?") + i.count("!")
    # Characters exclude spaces, sentence terminators, commas and hyphens.
    characters = len(i) - spaces - sentences - commas - Hyphens

    if words == 0:
        # No words means the ratios are undefined; record NaN rather than
        # raising, so CLIs keeps one entry per text.
        CLI = float("nan")
    else:
        CLI = ((5.89 * (characters / words)) - (0.296 * sentences / (words / 100))) - 15.8
    CLIs.append(CLI)
    print(CLI)
    # FLK = 206.835 - (1.015 * words / sentences) - (84.6 * syls / words)
    # print(FLK)
    # FL.append(FLK)

# End the whole WebDriver session. close() only closes the current window and
# can leave the browser driver session running after the script finishes.
driver.quit()

# Assemble the result table, one row per accepted page. Every column must
# have the same length, otherwise pandas raises ValueError.
CLIExcel = pd.DataFrame({
    "URL": np.array(goodurl),
    "CLI's": np.array(CLIs),
    "Region": np.array(goodRegion),
    "Keyword": np.array(goodKW),
    "Ranking": np.array(goodRanking),
    # "Flesch Kinkaid": np.array(FL),  # currently disabled
})

print(CLIExcel)
CLIExcel.to_excel('C:\\Users\\Max von Klemperer\\Desktop\\WorkedCLI.xlsx')

CodePudding user response:

Replace your line el = driver.find_element_by_tag_name('body') with

# Maximum number of seconds to wait for the <body> element.
timeout = 10
try:
    # Wait until a <body> element is present, giving up after `timeout` seconds.
    el = WebDriverWait(driver, timeout).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
except TimeoutException:
    # The element did not appear in time: skip to the next URL of the
    # enclosing for loop (this snippet is meant to be pasted inside it).
    continue

This waits up to 10 seconds for the element to appear; if it is not found in that time, `continue` skips to the next URL in your for loop.

You will also need the following imports:

from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
  • Related