I am trying to collect the headline and content from this news website (https://www.nusabali.com/search?keyword=umkm). I can get the headline and the content summary/preview, but I want to retrieve the full article. I see that I would need to click into each article, return to the previous page, and continue to the next article, but I couldn't figure out how. Thanks for any help. Here is my code:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
# empty list
headlines_list = []
content_list = []
# chrome driver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get('https://www.nusabali.com/search?keyword=umkm')
count = 0
while count < 5:
    try:
        time.sleep(1)
        # click the "load more" button (long selector copied from the page)
        driver.execute_script("arguments[0].click();", WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#main-content > div.wrapper.clearfix > div.col-a.pull-left > section.widget-area-2.pull-right > div > div > div.row > div > button'))))
        count += 1
    except Exception:
        break
# Select all the parent elements that contain the headline and content
parents = driver.find_elements(By.CSS_SELECTOR, ".card-deck > .card")
# Iterate through each parent element
for parent in parents:
    try:
        # headline
        headline = parent.find_element(By.TAG_NAME, "h5")
        headlines_list.append(headline.text)
    except NoSuchElementException:
        headlines_list.append(None)
    try:
        # content
        content = parent.find_element(By.TAG_NAME, "p")
        content_list.append(content.text)
    except NoSuchElementException:
        content_list.append(None)
# trim the longer list so both have the same length
while len(headlines_list) > len(content_list):
    headlines_list.pop()
while len(headlines_list) < len(content_list):
    content_list.pop()
# create dataframe
df = pd.DataFrame({'Headline': headlines_list, 'Content': content_list})
CodePudding user response:
"I see that I would need to click into each article, return to the previous page, and continue to the next article"
Not necessarily. Instead, you could just gather all links into a list that you can loop through and scrape.
# ... after your while-loop that clicks [ load more ] ...
article_links = driver.find_elements(By.CSS_SELECTOR, "div.row div.row h5>a[href]")
article_links = list(set([l.get_attribute('href') for l in article_links]))  # set <-> unique
for url in article_links:
    driver.get(url)
    ### SCRAPE ARTICLE ###
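For the ### SCRAPE ARTICLE ### step, something along these lines should work, reusing the imports already in your script. The selectors here (h1 for the headline, all p tags for the body) are assumptions about the article pages' markup, so verify them in devtools and narrow them down once you know the article container:
# minimal sketch of the scrape step -- selectors are assumptions, not verified
articles = []
for url in article_links:
    driver.get(url)
    try:
        # assumed: the article title is the page's h1
        headline = driver.find_element(By.TAG_NAME, "h1").text
    except NoSuchElementException:
        headline = None
    # assumed: body text sits in <p> tags; restrict this to the article
    # container once you've identified it, or you'll pick up nav/footer text
    paragraphs = driver.find_elements(By.CSS_SELECTOR, "p")
    full_text = "\n".join(p.text for p in paragraphs if p.text.strip())
    articles.append({"Headline": headline, "Content": full_text})
df = pd.DataFrame(articles)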
Note: I can't find any elements matching the selector .card-deck > .card, but div.row.feature-items, or [excluding the thumbnail area] div.row div.row, seems to work for the article containers.
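So for your headline/preview loop, swapping in one of those selectors should be enough; I haven't verified anything beyond the selectors matching:
# swap the selector; the rest of your loop stays the same
parents = driver.find_elements(By.CSS_SELECTOR, "div.row div.row")
# or, including the thumbnail area:
# parents = driver.find_elements(By.CSS_SELECTOR, "div.row.feature-items")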
Btw, about

driver.execute_script("arguments[0].click();", WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#main-content > div.wrapper.clearfix > div.col-a.pull-left > section.widget-area-2.pull-right > div > div > div.row > div > button'))))

simply button.load-mores:has(a.load-more-basic) [or even just button.load-mores] should be enough to select the load-more button.
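For example, the load-more part of your script could shrink to something like this (selector as above, and based on the page's current markup, so it could change; you'd also need TimeoutException imported):
from selenium.common.exceptions import TimeoutException

# click [ load more ] up to 5 times, using the short selector from above
for _ in range(5):
    try:
        btn = WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "button.load-mores"))
        )
        driver.execute_script("arguments[0].click();", btn)
        time.sleep(1)  # give the newly loaded cards a moment to render
    except TimeoutException:
        break  # button gone or never clickable -> nothing more to load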