Home > Enterprise >  Python BeautifulSoup web scraping
Python BeautifulSoup web scraping

Time:04-11

Hello guys, I'm trying to scrape data from a website, but here is the problem: I don't know how to get the href links. I can get the text of an element with jobs = jobs.text, but how can I do the same for the href links? Here is the code (you don't need to check all of it; you can just look at List4):

from ctypes.wintypes import tagRECT
from traceback import print_tb
from turtle import clear
import requests
from bs4 import BeautifulSoup
# Scrape one SEEK search-results page and collect, without duplicates and in
# first-seen order: job titles, company names, short descriptions and the
# href value of every anchor on the page.
Jobs_Name_List = []
Jobs_Description = []
Job_Company = []
jobs_link = []

# NOTE: the original URL had a stray space right after "?", which would end up
# inside the first query-parameter name; it is removed here.
url = ("https://www.seek.co.nz/jobs?onsite_campaign=TATSOI_TGJB_Aware&onsite_content=TATSOI_TGJB_Aware_CANZ_AW_OS_Ban_Half01A&onsite_medium=Display&onsite_source=SEEK&tracking=SEK-SNZ-BAN-TATSOI_TGJB_Aware-30419")
R = requests.get(url)
Soup = BeautifulSoup(R.content, "html5lib")

# The class strings below are matched against the full class attribute.
# NOTE(review): they look auto-generated and may change when the site is
# redeployed -- confirm against the live markup if a list comes back empty.
List = Soup.find_all("h3", attrs={"class": "yvsb870 _1qw3t4i0 _1qw3t4ih _1d0g9qk4 _1qw3t4ip _1qw3t4i1x"})
for job_tag in List:
    title = job_tag.text
    if title not in Jobs_Name_List:
        Jobs_Name_List.append(title)
print(Jobs_Name_List)
print("--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------")

List2 = Soup.find_all("span", attrs={"class": "yvsb870 _14uh9944u _1qw3t4i0 _1qw3t4i1x _1qw3t4i2 _1d0g9qk4 _1qw3t4ie"})
for company_tag in List2:
    company = company_tag.text
    if company not in Job_Company:
        Job_Company.append(company)

print(Job_Company)
print("-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------")

List3 = Soup.find_all("span", attrs={"class": "yvsb870 _14uh9944u _1qw3t4i0 _1qw3t4i1y _1qw3t4i1 _1d0g9qk4 _1qw3t4i8"})
for description_tag in List3:
    description = description_tag.text
    if description not in Jobs_Description:
        Jobs_Description.append(description)

print(Jobs_Description)

# href=True keeps only <a> tags that actually carry an href attribute.
# (Passing attrs={"href"} -- a set, not a dict -- is treated by Beautiful Soup
# as a CSS-class filter, so it would look for class="href" instead.)
List4 = Soup.find_all("a", href=True)
for anchor in List4:
    # Tag attributes are read with item access, unlike .text for the content.
    link = anchor["href"]
    if link not in jobs_link:
        jobs_link.append(link)
    

Here is the HTML code. I need to get the href links into the jobs_link list.

CodePudding user response:

This appends the href value of every anchor tag whose text is "Receptionist/Administrator" to List5.

# Assumes `Soup` is an already-parsed BeautifulSoup document.
# Gather every anchor, then keep the href of those whose visible text is
# exactly "Receptionist/Administrator" (anchors without an href are skipped).
List4 = Soup.find_all("a")
List5 = [
    anchor.get('href')
    for anchor in List4
    if 'href' in anchor.attrs and anchor.text == "Receptionist/Administrator"
]

This collects only the links whose href value contains type=promoted.


# Assumes `Soup` is an already-parsed BeautifulSoup document.
# Gather every anchor, then keep only the href values that contain the
# substring "type=promoted" (anchors without an href are skipped).
List4 = Soup.find_all("a")
List5 = [
    anchor.get('href')
    for anchor in List4
    if 'href' in anchor.attrs and "type=promoted" in anchor.attrs['href']
]
  • Related