I am having trouble iterating over a list of URLs and opening each one with Selenium. The issue is in the part of my code labeled #SECOND PART. `linklinkfin` is a list of length 9 at the moment, but this length can change as more URLs are collected over time. When the code runs, it appears to open the very first URL over and over, and it does not appear to execute the append action in the nested while loop, since at the end, when I print `textreal_listing`, it is empty. As the code runs I see https://www.nj.gov/dobi/division_insurance/bfd/enforcement2014.htm opening/refreshing continually until the program ends. At the end of each pass through the loop, 1 should be added to `browsercount` and the code should repeat with the new URL, but this doesn't appear to be happening. Any ideas?
my code:
#FIRST PART
from selenium.common.exceptions import NoSuchElementException
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import time

# Accumulates the text of every enforcement-action paragraph scraped.
textreal_listing = []

browser = webdriver.Chrome(r'\\homedirpva1a01\USERSNC$\603225\chromedriver\chromedriver.exe')
time.sleep(5)
browser.get("https://www.nj.gov/dobi/division_insurance/bfd/enforcement.htm")
time.sleep(5)

# Two XPaths are needed because some year links sit inside an extra <font> tag.
linkslist = browser.find_elements_by_xpath("/html/body/div/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/tr[6]/td[1]/table/tbody/tr[2]/td/ul/li/font/a")
linkslist2 = browser.find_elements_by_xpath("/html/body/div/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/tr[6]/td[1]/table/tbody/tr[2]/td/ul/li/font/font/a")
# FIX: the two element lists must be concatenated with `+`
# (the original `linkslist linkslist2` is a SyntaxError).
linklinkfin = linkslist + linkslist2

#SECOND PART
textcount = 1      # paragraph index within the current table row
textpage = 6       # table-row index holding the listing paragraphs
browsercount = 2014  # year suffix appended to the enforcement URL

for i in linklinkfin:
    browser.get("https://www.nj.gov/dobi/division_insurance/bfd/enforcement{}.htm".format(browsercount))
    time.sleep(2)
    if "404 Error" in browser.page_source:
        # Some years publish the page with a .html extension instead of .htm.
        browser.get("https://www.nj.gov/dobi/division_insurance/bfd/enforcement{}.html".format(browsercount))
        time.sleep(2)
    # The original duplicated this identical while-loop in both the `if` and
    # `else` branches; a single copy after the 404 retry behaves the same.
    while len(textreal_listing) < 100:
        texttreesing = browser.find_element_by_xpath("/html/body/div/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/tr[{}]/td/p[{}]".format(textpage, textcount))
        textreal_listing.append(texttreesing.text)
        # FIX: these counters must be *incremented* (+=); plain `=1` kept
        # re-reading the same paragraph and the same 2014 URL forever.
        textcount += 1
        if len(browser.find_elements_by_xpath("/html/body/div/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/tr[{}]/td/p[{}]".format(textpage, textcount))) == 0:
            # Row exhausted: jump three rows down and restart at paragraph 2.
            textpage += 3
            textcount = 2
            if len(browser.find_elements_by_xpath("/html/body/div/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/tr[{}]/td/p[{}]".format(textpage, textcount))) == 0:
                # No paragraphs in the next row either: this year is done.
                break
    browsercount += 1  # advance to the next year's URL
print(textreal_listing)
CodePudding user response:
This worked — the key was to collect the `href` attribute strings into a plain list first, then iterate over those URLs directly:
#FIRST PART
import pandas as pd
from selenium.common.exceptions import NoSuchElementException
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import time

# Accumulates the text of every enforcement-action paragraph scraped.
textreal_listing = []

browser = webdriver.Chrome(r'\\homedirpva1a01\USERSNC$\603225\chromedriver\chromedriver.exe')
time.sleep(5)
browser.get("https://www.nj.gov/dobi/division_insurance/bfd/enforcement.htm")
time.sleep(5)

# Two XPaths are needed because some year links sit inside an extra <font> tag.
linkslist = browser.find_elements_by_xpath("/html/body/div/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/tr[6]/td[1]/table/tbody/tr[2]/td/ul/li/font/a")
linkslist2 = browser.find_elements_by_xpath("/html/body/div/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/tr[6]/td[1]/table/tbody/tr[2]/td/ul/li/font/font/a")

# Extract the href *strings* now, while the elements are still attached to the
# landing page; navigating away would make the WebElement references stale.
linktext = []
for my_href in linkslist:
    linktext.append(my_href.get_attribute("href"))
for my_hrefs in linkslist2:
    linktext.append(my_hrefs.get_attribute("href"))

#SECOND Part
textcount = 1  # paragraph index within the current table row
textpage = 6   # table-row index holding the listing paragraphs

for i in linktext:
    # NOTE(review): textpage/textcount are not reset per URL — confirm the
    # year pages really continue from where the previous one left off.
    browser.get(i)
    time.sleep(5)
    while len(textreal_listing) < 100:
        texttreesing = browser.find_element_by_xpath("/html/body/div/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/tr[{}]/td/p[{}]".format(textpage, textcount))
        textreal_listing.append(texttreesing.text)
        # FIX: counters must be *incremented* (+=) — the pasted code lost the `+`.
        textcount += 1
        if len(browser.find_elements_by_xpath("/html/body/div/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/tr[{}]/td/p[{}]".format(textpage, textcount))) == 0:
            # Row exhausted: jump three rows down and restart at paragraph 2.
            textpage += 3
            textcount = 2
            if len(browser.find_elements_by_xpath("/html/body/div/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/tr[{}]/td/p[{}]".format(textpage, textcount))) == 0:
                # No paragraphs in the next row either: this page is done.
                break
print(textreal_listing)