Hello guys, I'm trying to scrape data from websites, but here is the problem: I don't know how to extract the href links. I can get the text by writing jobs = jobs.text, but how can I do the same for href links? Here is the code (you don't need to check all of it; you can check just List4):
from ctypes.wintypes import tagRECT
from traceback import print_tb
from turtle import clear
import requests
from bs4 import BeautifulSoup
# Accumulators for the scraped fields. Each list keeps values in first-seen
# order and never stores the same value twice.
Jobs_Name_List = list()
Jobs_Description = list()
Job_Company = list()
jobs_link = list()

# Search-results page to scrape. The query string must not contain whitespace,
# otherwise the first parameter name is sent with a leading space.
url = ("https://www.seek.co.nz/jobs?onsite_campaign=TATSOI_TGJB_Aware&onsite_content=TATSOI_TGJB_Aware_CANZ_AW_OS_Ban_Half01A&onsite_medium=Display&onsite_source=SEEK&tracking=SEK-SNZ-BAN-TATSOI_TGJB_Aware-30419")
R = requests.get(url)
Soup = BeautifulSoup(R.content, "html5lib")

# Job titles: text of every <h3> carrying this exact class string.
# NOTE(review): the class strings below look build-generated and may change
# between site deployments -- verify against the live page.
List = Soup.find_all("h3", attrs={"class": "yvsb870 _1qw3t4i0 _1qw3t4ih _1d0g9qk4 _1qw3t4ip _1qw3t4i1x"})
for jobs in List:
    jobs = jobs.text
    if jobs not in Jobs_Name_List:
        Jobs_Name_List.append(jobs)
print(Jobs_Name_List)
print("--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------")

# Company names: text of every <span> carrying this exact class string.
List2 = Soup.find_all("span", attrs={"class": "yvsb870 _14uh9944u _1qw3t4i0 _1qw3t4i1x _1qw3t4i2 _1d0g9qk4 _1qw3t4ie"})
for companies in List2:
    companies = companies.text
    if companies not in Job_Company:
        Job_Company.append(companies)
print(Job_Company)
print("-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------")

# Job descriptions: text of every <span> carrying this exact class string.
List3 = Soup.find_all("span", attrs={"class": "yvsb870 _14uh9944u _1qw3t4i0 _1qw3t4i1y _1qw3t4i1 _1d0g9qk4 _1qw3t4i8"})
for descriptions in List3:
    descriptions = descriptions.text
    if descriptions not in Jobs_Description:
        Jobs_Description.append(descriptions)
print(Jobs_Description)

# Links: href=True selects only <a> tags that actually have an href attribute
# (passing a set such as {"href"} as attrs does not filter on that attribute).
# Unlike .text, an attribute value is read by indexing the tag with its name.
List4 = Soup.find_all("a", href=True)
for anchor in List4:
    link = anchor["href"]
    if link not in jobs_link:
        jobs_link.append(link)
Here is the HTML code. I need to add the href links to the jobs_link list.
CodePudding user response:
This appends to List5 the href value of every anchor tag whose text is "Receptionist/Administrator".
# Collect the href of every <a> tag that has an href attribute and whose
# text is exactly "Receptionist/Administrator".
List4 = Soup.find_all("a")
List5 = [
    a.get('href')
    for a in List4
    if 'href' in a.attrs and a.text == "Receptionist/Administrator"
]
This collects only the links whose href value contains type=promoted.
# Collect the href of every <a> tag whose href contains the substring
# "type=promoted"; anchors without an href attribute are skipped.
List4 = Soup.find_all("a")
List5 = [
    a.get('href')
    for a in List4
    if 'href' in a.attrs and "type=promoted" in a.attrs['href']
]