</div><section class="">
<div class="wrap">
<div class="l-12">
<div class="l-gi">
<div class="cards-heading">
<a href="/en/your-mediacorp/our-artistes/tca/male-artistes"><h3>male.celebs</h3></a>
</div>
<div class="search-loading-inner hidden"></div>
<div class="cards-wrap" id="cards-11831178"><div class="group-wrap wrap-3">
<div class="cards-group collective-group">
<div class="card-item person" id="content-12357686" data-item-index="0">
<div class="card-media">
<div class="card-image">
<a href="/en/your-mediacorp/our-artistes/tca/male-artistes/ayden-sng-12357686">
<img data-sizes="auto" src="data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==" data-srcset="/image/13663916/1x1/480/480/f957f70cc7d1c19a3b1d6ccef61118c4/Cp/ayden-sng-2020--tight-shot-.jpg 320w, /image/13663916/1x1/640/640/f957f70cc7d1c19a3b1d6ccef61118c4/Rb/ayden-sng-2020--tight-shot-.jpg 480w" class="lazyautosizes lazyloaded" alt="Ayden Sng 2020 (Tight Shot)" sizes="339px" srcset="/image/13663916/1x1/480/480/f957f70cc7d1c19a3b1d6ccef61118c4/Cp/ayden-sng-2020--tight-shot-.jpg 320w, /image/13663916/1x1/640/640/f957f70cc7d1c19a3b1d6ccef61118c4/Rb/ayden-sng-2020--tight-shot-.jpg 480w">
</a>
</div>
</div>
This is the HTML I get from the website, and I'm trying to get the link from it. Here is my code:
from bs4 import BeautifulSoup
import requests
import re
def getHTMLdocument(url, timeout=30):
    """Download *url* with an HTTP GET and return the body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The decoded response body (``response.text``).
    """
    response = requests.get(url, timeout=timeout)
    return response.text
# Listing page whose anchors are printed below.
url_to_scrape = "https://www.mediacorp.sg/en/your-mediacorp/our-artistes/tca/male-artistes"
html_document = getHTMLdocument(url_to_scrape)
soup = BeautifulSoup(html_document, 'lxml')

# Print the href of every <a> tag whose href matches the "/en/" pattern.
for link in soup.find_all('a', attrs={'href': re.compile("/en/")}):
    print(link.get('href'))
However, the output from the code gives me all the links on the page except for the ones under the h3 tag. Is there any way for me to access the links under the h3 tag?
CodePudding user response:
I'm assuming you are trying to get the "/en/your-mediacorp/our-artistes/tca/male-artistes/ayden-sng-12357686"
links under the "card-media"
elements. That data is loaded dynamically, and what you get back from requests is only the static HTML. You need to request the URL that actually returns that data.
from bs4 import BeautifulSoup
import requests
import re
def getHTMLdocument(url, payload, timeout=30):
    """Download *url* with an HTTP GET and return the body as text.

    Args:
        url: Address of the endpoint to fetch.
        payload: Mapping passed to ``requests.get`` as ``params``, i.e. the
            query-string parameters of the request.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The decoded response body (``response.text``).
    """
    response = requests.get(url, params=payload, timeout=timeout)
    return response.text
# Endpoint requested instead of the listing page itself.
url_to_scrape = 'https://www.mediacorp.sg/en/toggle/paginateLocalItems'

# Query parameters for the request. NOTE(review): presumably pageIndex/pageSize
# select which slice of cards is returned -- confirm against the site.
payload = {
    'pageSize': '12',
    'pageIndex': '0',
    'contentId': '11831178',
    'navigationId': '12302270',
    'viewType': 'ajaxContentCardListing',
}

html_document = getHTMLdocument(url_to_scrape, payload)
soup = BeautifulSoup(html_document, 'lxml')

# Collect each matching href once, printing it the first time it is seen.
links = []
for link in soup.find_all('a', attrs={'href': re.compile("/en/")}):
    href = link.get('href')
    if href not in links:
        links.append(href)
        print(href)
You can change those parameters in the payload to get the other pages as well:
from bs4 import BeautifulSoup
import requests
import re
def getHTMLdocument(url, payload, timeout=30):
    """Download *url* with an HTTP GET and return the body as text.

    Args:
        url: Address of the endpoint to fetch.
        payload: Mapping passed to ``requests.get`` as ``params``, i.e. the
            query-string parameters of the request.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The decoded response body (``response.text``).
    """
    response = requests.get(url, params=payload, timeout=timeout)
    return response.text
# Endpoint requested once per page.
url_to_scrape = 'https://www.mediacorp.sg/en/toggle/paginateLocalItems'

# Compiled once; reused for every page.
href_pattern = re.compile("/en/")

links = []
page = 0
while True:
    # print('Page: %s' % (page + 1))
    payload = {
        'pageSize': '12',
        'pageIndex': str(page),
        'contentId': '11831178',
        'navigationId': '12302270',
        'viewType': 'ajaxContentCardListing',
    }
    html_document = getHTMLdocument(url_to_scrape, payload)
    soup = BeautifulSoup(html_document, 'lxml')

    # A page without any matching anchors ends the loop.
    anchors = soup.find_all('a', attrs={'href': href_pattern})
    if not anchors:
        break

    # Collect each href once, printing it the first time it is seen.
    for link in anchors:
        href = link.get('href')
        if href not in links:
            links.append(href)
            print(href)

    # Advance to the next page; assigning a constant here would request the
    # same page forever and the loop would never terminate.
    page += 1
Output:
/en/your-mediacorp/our-artistes/tca/male-artistes/ayden-sng-12357686
/en/your-mediacorp/our-artistes/tca/male-artistes/ben-yeo-12357544
/en/your-mediacorp/our-artistes/tca/male-artistes/benjamin-tan-12357684
/en/your-mediacorp/our-artistes/tca/male-artistes/brandon-wong-12357582
/en/your-mediacorp/our-artistes/tca/male-artistes/bryan-wong-12357546
/en/your-mediacorp/our-artistes/tca/male-artistes/cavin-soh-12357626
/en/your-mediacorp/our-artistes/tca/male-artistes/chen-han-wei-12357548
/en/your-mediacorp/our-artistes/tca/male-artistes/chen-shu-cheng-12357584
/en/your-mediacorp/our-artistes/tca/male-artistes/chen-tian-wen-12357586
/en/your-mediacorp/our-artistes/tca/male-artistes/chen-yi-xi-12357646
/en/your-mediacorp/our-artistes/tca/male-artistes/chew-chor-meng-12357588
/en/your-mediacorp/our-artistes/tca/male-artistes/desmond-ng-12357570
/en/your-mediacorp/our-artistes/tca/male-artistes/desmond-tan-12357590
/en/your-mediacorp/our-artistes/tca/male-artistes/edwin-goh-12357550
/en/your-mediacorp/our-artistes/tca/male-artistes/elvin-ng-12357552
/en/your-mediacorp/our-artistes/tca/male-artistes/glenn-yong-14218688
/en/your-mediacorp/our-artistes/tca/male-artistes/guo-liang-12357592
/en/your-mediacorp/our-artistes/tca/male-artistes/herman-keh-12357682
/en/your-mediacorp/our-artistes/tca/male-artistes/ian-fang-12357594
/en/your-mediacorp/our-artistes/tca/male-artistes/james-seah-12357596
/en/your-mediacorp/our-artistes/tca/male-artistes/jarrell-huang-12357654
/en/your-mediacorp/our-artistes/tca/male-artistes/jeffrey-xu-12357554
/en/your-mediacorp/our-artistes/tca/male-artistes/jeremy-chan-15172284
/en/your-mediacorp/our-artistes/tca/male-artistes/lee-teng--12357630
/en/your-mediacorp/our-artistes/tca/male-artistes/mark-lee-12357632
/en/your-mediacorp/our-artistes/tca/male-artistes/nick-teo-12357598
/en/your-mediacorp/our-artistes/tca/male-artistes/pierre-png--12357556
/en/your-mediacorp/our-artistes/tca/male-artistes/qi-yuwu-12357600
/en/your-mediacorp/our-artistes/tca/male-artistes/rayson-tan-12357562
/en/your-mediacorp/our-artistes/tca/male-artistes/richard-low-12357602
/en/your-mediacorp/our-artistes/tca/male-artistes/richie-koh-12357642
/en/your-mediacorp/our-artistes/tca/male-artistes/romeo-tan-12357558
/en/your-mediacorp/our-artistes/tca/male-artistes/shaun-chen-12357560
/en/your-mediacorp/our-artistes/tca/male-artistes/terence-cao-12357604
/en/your-mediacorp/our-artistes/tca/male-artistes/tyler-ten--12357694
/en/your-mediacorp/our-artistes/tca/male-artistes/yao-wen-long-12357564
/en/your-mediacorp/our-artistes/tca/male-artistes/zane-lim-12357692
/en/your-mediacorp/our-artistes/tca/male-artistes/zhai-siming--12357690
/en/your-mediacorp/our-artistes/tca/male-artistes/zhang-yao-dong-12357568
/en/your-mediacorp/our-artistes/tca/male-artistes/zhang-ze-tong--12357688
/en/your-mediacorp/our-artistes/tca/male-artistes/zheng-ge-ping-12357622
/en/your-mediacorp/our-artistes/tca/male-artistes/zhu-hou-ren-12358080
CodePudding user response:
As an alternative approach to requests,
you can also use selenium
to get the URLs.
Example
from bs4 import BeautifulSoup
from selenium import webdriver
# Start a Chrome session; replace the placeholder with the chromedriver path.
driver = webdriver.Chrome('YOUR PATH TO DRIVER')
try:
    driver.get('https://www.mediacorp.sg/en/your-mediacorp/our-artistes/tca/male-artistes')
    # Parse the page source as the browser currently holds it.
    soup = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    # Always close the browser, even if loading or parsing fails.
    driver.quit()

# hrefs of the anchors that are direct children of an <h3> inside a card item.
# Kept as the final bare expression so a REPL/notebook displays the list.
[x['href'] for x in soup.select('div.card-item h3 > a')]
Output - List with urls
['/en/your-mediacorp/our-artistes/tca/male-artistes/ayden-sng-12357686',
'/en/your-mediacorp/our-artistes/tca/male-artistes/ben-yeo-12357544',
'/en/your-mediacorp/our-artistes/tca/male-artistes/benjamin-tan-12357684',
'/en/your-mediacorp/our-artistes/tca/male-artistes/brandon-wong-12357582',
'/en/your-mediacorp/our-artistes/tca/male-artistes/bryan-wong-12357546',
'/en/your-mediacorp/our-artistes/tca/male-artistes/cavin-soh-12357626',
'/en/your-mediacorp/our-artistes/tca/male-artistes/chen-han-wei-12357548',
'/en/your-mediacorp/our-artistes/tca/male-artistes/chen-shu-cheng-12357584',
'/en/your-mediacorp/our-artistes/tca/male-artistes/chen-tian-wen-12357586',
'/en/your-mediacorp/our-artistes/tca/male-artistes/chen-yi-xi-12357646',
'/en/your-mediacorp/our-artistes/tca/male-artistes/chew-chor-meng-12357588',
'/en/your-mediacorp/our-artistes/tca/male-artistes/desmond-ng-12357570']