How to get the tokens in data-search-sol-meta - CodePudding

def extract(page):
    """Download one job-search results page and return it parsed.

    ``page`` is interpolated into the search URL as its last path segment.
    The raw response body is parsed with BeautifulSoup's built-in
    ``html.parser`` and the resulting soup object is returned.
    """
    response = requests.get(
        f'https://www.jobstreet.com.my/en/job-search/administrative-assistant-jobs/{page}/'
    )
    return BeautifulSoup(response.content, 'html.parser')


def transform(soup):
    """Append one ``{'more details': ...}`` record per matching div to ``joblist``.

    Looks up the job containers by their full class string, examines at most
    the first 29 of them, and inside each one finds every ``div`` carrying a
    ``data-search-sol-meta`` attribute.

    Note that the recorded value is the stripped *text content* of that div
    (``.text``); the value of the ``data-search-sol-meta`` attribute itself is
    never read here.
    """
    containers = soup.find_all('div', class_='sx2jih0 zcydq876 zcydq866 zcydq896 zcydq886 zcydq8n zcydq856 zcydq8f6 zcydq8eu')
    for container in containers[:29]:
        meta_divs = container.find_all('div', attrs={'data-search-sol-meta': True})
        # joblist is the module-level accumulator shared with the driver script.
        joblist.extend({'more details': div.text.strip()} for div in meta_divs)


# Accumulator filled by transform(): one dict per scraped element.
joblist = []
# Number of result pages to fetch.
dummy = 2
for i in range(0, dummy, 1):
    # The loop index is 0-based, so extract() is called with pages 1..dummy.
    c = extract(i + 1)
    transform(c)

    print(f'Progress Page: [{int(i) + 1}/{dummy}]')
    # Pause between consecutive page requests.
    time.sleep(4)

# One row per collected record.
df = pd.DataFrame(joblist)

I want to scrape the tokens in those data-search-sol-meta attributes. How do I get them?

   <div data-search-sol-meta="{"searchRequestToken":"62781aeb-4a14-43c9-b985-8be617cc1107","token":"0~62781aeb-4a14-43c9-b985-8be617cc1107","jobId":"jobstreet-my-job-5011156","section":"MAIN","sectionRank":1,"jobAdType":"ORGANIC","tags":{"mordor__flights":"mordor_80","jobstreet:userGroup":"BB","jobstreet:s_vi":"[CS]v1|314CC40D0D655F39-400007A66AC825EB[CE]"}}">

The result I get in the pandas DataFrame (the 'more details' column) is just "None".

CodePudding user response：

I would use a more robust CSS selector list, i.e. one that does not rely on the dynamic classes. Select an element high enough in the DOM that you can reach both the attribute you want and the job info. You can then read the attribute that holds the tokens and parse it with the json library to list the values separately.

import requests, json
from bs4 import BeautifulSoup


def extract(page, timeout=30):
    """Fetch one job-search results page and return it as parsed HTML.

    Args:
        page: Page number, interpolated into the search URL as its last
            path segment.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        The response body parsed with BeautifulSoup's ``html.parser``.

    Raises:
        requests.exceptions.Timeout: If the server does not respond within
            ``timeout`` seconds.
    """
    url = f"https://www.jobstreet.com.my/en/job-search/administrative-assistant-jobs/{page}/"
    # Without an explicit timeout, requests can wait indefinitely on a
    # server that never answers.
    r = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(r.content, "html.parser")
    return soup


def transform(soup):
    """Print the title, raw metadata and tokens of every job listing.

    Listings are the direct ``div`` children of the ``jobListing`` container
    that contain an ``article``. For each one this prints the text of its
    ``h1 span``, the raw ``data-search-sol-meta`` attribute value, and then
    the ``searchRequestToken`` and ``token`` fields decoded from that
    attribute as JSON.
    """
    for listing in soup.select("[data-automation=jobListing] > div:has(article)"):
        title = listing.select_one("h1 span").text
        print(title, end="\n\n")

        # The attribute value is a JSON document; show it raw, then decoded.
        raw_meta = listing["data-search-sol-meta"]
        print(raw_meta, end="\n\n")

        meta = json.loads(raw_meta)
        print("searchRequestToken: ", meta["searchRequestToken"])
        print("token: ", meta["token"], end="\n\n")


# Fetch results page 1 and hand the parsed document to transform().
soup = extract(1)
transform(soup)