I have been working on a project to pull specific href links from a site. Everything is working, but I also want to pull only a specific piece of data out of each href link (the version part of the file name, as shown in the desired output below). I haven't been able to figure that part out.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
# Scrape three download links and their release-month cells from the source
# page's table, then print them on one line each.
service = Service(executable_path="c:/temp/chromedriver.exe")
driver = webdriver.Chrome(service=service)
# Absolute XPath of the table body shared by every lookup below; keeping it
# in one place avoids six copies drifting apart when the page layout changes.
TBODY_XPATH = "/html/body/div[3]/div[4]/table[1]/tbody"
try:
    #Source site
    # NOTE: "site.com" is a placeholder; a real URL needs its scheme (https://...).
    driver.get("site.com")
    #The xpath to the links
    s = driver.find_element(By.XPATH, f"{TBODY_XPATH}/tr[2]/td[4]/a").get_attribute('href')
    smonth = driver.find_element(By.XPATH, f"{TBODY_XPATH}/tr[2]/td[3]")
    b = driver.find_element(By.XPATH, f"{TBODY_XPATH}/tr[6]/td[4]/a").get_attribute('href')
    bmonth = driver.find_element(By.XPATH, f"{TBODY_XPATH}/tr[6]/td[3]")
    # This row keeps its link in td[3] and its month in td[2], one column
    # to the left of the two rows above.
    bb = driver.find_element(By.XPATH, f"{TBODY_XPATH}/tr[28]/td[3]/a").get_attribute('href')
    bbmonth = driver.find_element(By.XPATH, f"{TBODY_XPATH}/tr[28]/td[2]")
    #The output of the links
    # The month elements must be read here, while the browser session is
    # still alive; their .text is fetched from the live page.
    print("S:", smonth.text, s, "\nB:", bmonth.text, b, "\nBB:", bbmonth.text, bb)
finally:
    # quit() ends the whole WebDriver session (browser and driver process),
    # and the finally clause guarantees that even if a lookup above raises.
    driver.quit()
Here is the output
S: July 2022 https://bdn-ak-ssl.site.com/software/s96_8_80.exe
B: July 2022 https://bdn-ak-ssl.site.com/software/b43_6_56.exe
BB: July 2022 https://bdn-ak-ssl.site.com/software/bb202_2_100.exe
I'm trying to get the output to look like this
S: July 2022 https://bdn-ak-ssl.site.com/software/s96_8_80.exe Version: s96_8_80
B: July 2022 https://bdn-ak-ssl.site.com/software/b43_6_56.exe Version: b43_6_56
BB: July 2022 https://bdn-ak-ssl.site.com/software/bb202_2_100.exe Version: bb202_2_100
CodePudding user response:
Use regex
import re

# Example line as produced by the scraper: label, month, then the download URL.
output = "BB: July 2022 https://bdn-ak-ssl.site.com/software/bb202_2_100.exe"
# Capture the file name between "software/" and the ".exe" suffix. The dot is
# escaped so it matches a literal "." rather than any character.
match = re.search(r'software/(.*?)\.exe', output)
# `result` is the version string, or None when the line holds no such URL.
result = match.group(1) if match else None
if result is not None:
    # Append to the existing line instead of overwriting it, so the final
    # text keeps the label, month and URL and gains the version at the end.
    output = f"{output} Version: {result}"
print(output)
Output
BB: July 2022 https://bdn-ak-ssl.site.com/software/bb202_2_100.exe Version: bb202_2_100
CodePudding user response:
You can simply remove parts of the string to return the version
# Derive the version from the download URL held in `s`: take the last path
# segment (the file name) so the result does not depend on a hard-coded host
# or directory, then drop a trailing ".exe" if present.
filename = s.rsplit('/', 1)[-1]
version = filename[:-len('.exe')] if filename.endswith('.exe') else filename
# print() already inserts a space between arguments, so the label carries no
# trailing space of its own.
print('Version:', version)