I have written code that pulls text from sites and analyses it for readability. However, my program sometimes gets stuck on a site. Is there any way to have it skip to the next iteration of the for loop if a site takes longer than x seconds? If you have any questions or need clarification, just let me know in the comments.
import time
import numpy as np
import pandas as pd
import openpyxl
import reqto as rq
from bs4 import BeautifulSoup
# from SpacySylGetter import *
# import readability
import selenium
from selenium import webdriver
# Load the spreadsheet that lists the pages to analyse.
TextIn = pd.read_excel('C:\\Users\\Max von Klemperer\\Desktop\\KeywordLinks\\Aus2.xlsx')
# print(TextIn)

# One list per spreadsheet column; the lists stay index-aligned by row.
WebURLs, Region, Keywords, Rankings = (
    list(TextIn[column].values)
    for column in ("URL", "Region", "Keyword", "Ranking")
)

# Scalar working values; `counter` is the row index used while scraping and
# the others are reassigned while scoring each text.
spaces = syls = counter = characters = sentences = 0

# Score accumulators (FL and FLAuto are only referenced by commented-out code).
CLIs, FL, FLAuto = [], [], []

# Accepted page texts and the spreadsheet metadata that belongs to each one.
WebTexts = []
goodurl, goodKW, goodRegion, goodRanking = [], [], [], []
# Characters kept from the raw page text. Anything else (commas and other
# punctuation, non-ASCII letters, tabs, ...) is dropped before analysis.
ALLOWED_CHARS = frozenset(
    '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-.?! \n')

# Longest time, in seconds, a single page may take to load. When it is
# exceeded driver.get() raises, the URL is reported as bounced and the loop
# moves on instead of hanging on that site.
PAGE_LOAD_TIMEOUT = 30

driver = webdriver.Chrome('C:\\Users\\Max von Klemperer\\Downloads\\chromedriver.exe')
driver.set_page_load_timeout(PAGE_LOAD_TIMEOUT)

# Walk the spreadsheet columns in lockstep so a bounced URL can never shift
# the region/keyword/ranking that gets attached to later URLs.
for url, region, keyword, ranking in zip(WebURLs, Region, Keywords, Rankings):
    time.sleep(1)  # fixed one-second pause before every page load
    try:
        driver.get(url)
        raw_text = driver.find_element_by_tag_name('body').text
    except Exception:
        # Best effort: any load/lookup failure (including a page-load
        # timeout) skips this URL and continues with the next one.
        print("Bounced")
        continue

    filtered = ''.join(ch for ch in raw_text if ch in ALLOWED_CHARS)
    normalised = ' '.join(filtered.split())  # collapse whitespace runs
    print(url)

    # Drop the first and last 600 characters of the page text
    # (presumably navigation/footer boilerplate - confirm with the author).
    body_text = normalised[600:-600]

    # str.replace returns a new string, so the result has to be kept.
    body_text = (
        body_text.replace("...", ".")
        .replace("-", " ")
        .replace(".com", " ")
    )

    # Only keep texts whose length falls strictly inside this window.
    if 1000 < len(body_text) < 100000:
        print(body_text)
        WebTexts.append(body_text)
        goodurl.append(url)
        goodRegion.append(region)
        goodKW.append(keyword)
        goodRanking.append(ranking)
# Score every accepted page text with the Coleman-Liau index (CLI) and
# collect the scores in CLIs, in the same order as WebTexts.
for text in WebTexts:
    words = len(text.split())
    commas = text.count(",")
    spaces = text.count(" ")
    Hyphens = text.count("-")
    # Every '.', '?' and '!' is counted as one sentence terminator.
    sentences = text.count(".") + text.count("?") + text.count("!")
    # Characters that count towards the index: everything except spaces,
    # sentence terminators, commas and hyphens.
    characters = len(text) - spaces - sentences - commas - Hyphens
    CLI = ((5.89 * (characters / words)) - (0.296 * sentences / (words / 100))) - 15.8
    CLIs.append(CLI)
    print(CLI)
    # Flesch-Kincaid scoring is currently disabled; it needs a syllable count
    # (presumably from the commented-out SpacySylGetter import).
    # syls = sylsGet(text)
    # print(syls)
    # FLK = 206.835 - (1.015 * words / sentences) - (84.6 * syls / words)
    # print(FLK)
    # FL.append(FLK)
# quit() ends the whole WebDriver session (all windows plus the driver
# process); close() would only close the current window.
driver.quit()

# One row per accepted page: the URL, its Coleman-Liau score and the
# spreadsheet metadata collected alongside it. All lists must have the same
# length, otherwise the DataFrame constructor raises ValueError.
CLIExcel = pd.DataFrame({
    "URL": np.array(goodurl),
    "CLI's": np.array(CLIs),
    "Region": np.array(goodRegion),
    "Keyword": np.array(goodKW),
    "Ranking": np.array(goodRanking),
    # "Flesch Kinkaid": np.array(FL),  # disabled together with the FL scoring
})
print(CLIExcel)
CLIExcel.to_excel('C:\\Users\\Max von Klemperer\\Desktop\\WorkedCLI.xlsx')
CodePudding user response:
Replace your line el = driver.find_element_by_tag_name('body')
with
timeout = 10
try:
el = WebDriverWait(driver, timeout).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
except TimeoutException:
continue
This waits up to 10 seconds for the element to appear; if it is not found in that time, `continue`
skips to the next URL in your for-loop.
You will also need the following imports:
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait