import time
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
# Chrome start-up flags used for the scraping session.
options = webdriver.ChromeOptions()
# options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
# Chrome's switch takes "WIDTH,HEIGHT"; the "1920x1080" spelling is not parsed.
options.add_argument("--window-size=1920,1080")
options.add_argument("--disable-extensions")

# Module-level driver instance; the chromedriver binary path comes from
# webdriver_manager's ChromeDriverManager().install().
chrome_driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options,
)
# NOTE(review): the indentation of this snippet was lost when it was pasted;
# the statements below are meant to be nested (def > with > for > if).
# Purpose: open the lawyers panel page, collect every <a> inside a
# div with class "list-group", and print the href of the matching links.
def supplyvan_scraper():
# Uses the module-level chrome_driver as a context manager for the session.
with chrome_driver as driver:
# Implicit wait of 15 (seconds per Selenium's API - confirm) for element lookups.
driver.implicitly_wait(15)
URL = 'https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx'
driver.get(URL)
# Fixed pause after navigation before the links are queried.
time.sleep(3)
# `link` holds the list of WebElements; `links` below is a single element.
link=driver.find_elements(By.XPATH, "//div[@class='list-group']//a")
for links in link:
# BUG (the error this question is about): `links` is a WebElement, not a
# string, so it has no startsWith method. Read the href first with
# links.get_attribute("href") and call Python's str.startswith on that value.
if(links.startsWith("https://www.ifep.ro/")):
print(links.get_attribute("href"))
I get an error on this line (the `startsWith` check). The results also contain some unwanted links, and I want to remove them. This is the page link:
CodePudding user response:
This is because a WebElement
is not a string. You first have to extract the `href` attribute from the WebElement with `get_attribute("href")`
and then call `startswith` (all lowercase in Python)
on the resulting string.
Here is the complete code:
import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Chrome start-up flags for the session.
chrome_options = Options()
# chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-gpu")
# Chrome's switch takes "WIDTH,HEIGHT"; the "1920x1080" spelling is not parsed.
chrome_options.add_argument("--window-size=1920,1080")
chrome_options.add_argument("--disable-extensions")

# Selenium 4 takes the chromedriver path through a Service object; the
# executable_path keyword of webdriver.Chrome has been removed.
driver = webdriver.Chrome(
    service=Service(executable_path="./chromedriver"),
    options=chrome_options,
)
try:
    driver.get("https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx")
    driver.maximize_window()
    time.sleep(3)

    # find_elements_by_xpath was removed in Selenium 4; use find_elements + By.
    links = driver.find_elements(By.XPATH, "//div[@class='list-group']//a")
    for link in links:
        # The href must be read from the WebElement before any string test.
        link_href = link.get_attribute("href")
        # get_attribute returns None for anchors without an href; skip those.
        if link_href and link_href.startswith("https://www.ifep.ro/"):
            print(link_href)
finally:
    # Always close the browser, even if the page or a lookup fails.
    driver.quit()
Alternatively, you can change only this part of your code:
# Drop-in replacement for the link loop; relies on the `driver` and the `By`
# import from the question's script.
# find_elements_by_xpath was removed in Selenium 4; use find_elements + By.
links = driver.find_elements(By.XPATH, "//div[@class='list-group']//a")
for link in links:
    # The href must be read from the WebElement before any string test.
    link_href = link.get_attribute("href")
    # get_attribute returns None for anchors without an href; skip those.
    if link_href and link_href.startswith("https://www.ifep.ro/"):
        print(link_href)
Output:
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=33353&Signature=387599
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=34493&Signature=387599
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=15868&Signature=387599
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=33526&Signature=387599
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=33459&Signature=387599
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=9100&Signature=387599
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=27125&Signature=387599
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=24811&Signature=387599
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=1932&Signature=387599
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=7746&Signature=387599
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=18864&Signature=387599
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=23966&Signature=387599
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=3840&Signature=387599
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=16192&Signature=387599
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=16350&Signature=387599
CodePudding user response:
By trying to filter the links by a partial @href, you are solving an XY problem. There is no need to filter the links; just use the correct XPath to select only the required links:
# Select only the <a> elements that are direct children of a "list-group" div
# which itself sits directly inside a <td>, so no href filtering is needed.
links = driver.find_elements(By.XPATH, "//td/div[@class='list-group']/a")
for link in links:
    print(link.get_attribute("href"))
CodePudding user response:
Links have multiple attributes: target, location, text...
You most likely want the text.
`links.text` (the Python counterpart of Java's `links.getText()`)
should work.