Scraping news articles from January 2021 to date using Selenium and BeautifulSoup


I am trying to scrape the following information (titles, dates, links, and content) for each news article on this website: https://www.dailymaverick.co.za/section/maverick-news/. I was able to scrape the information on the first page, which contains 21 articles. However, I want to scrape everything starting from January 2021. I tried using Selenium to click on "load more", and it worked, showing 21 more articles per click. However, only the 21 articles from the first page were returned. My code is provided below:



import requests
import json
import pandas as pd
from bs4 import BeautifulSoup
import time
from datetime import timedelta
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

session = requests.Session()
session.headers.update({"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"})

article_title = []
article_date = []
article_link = []

pagesToGet = ['section/maverick-news']


options = webdriver.ChromeOptions()

#options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")



driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options
    )

for page in range(0, len(pagesToGet)):
    print('processing page : \n')
    url = 'https://www.dailymaverick.co.za/' + str(pagesToGet[page])
    print(url)

    driver.maximize_window()
    driver.get(url)
    time.sleep(3)

    k = 0
    click_count = 0
    while k <= 25:

        driver.execute_script("arguments[0].scrollIntoView();",
                             driver.find_element(By.CSS_SELECTOR, '.ajax-loader'))
        WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, '.ajax-loader'))).click()
        time.sleep(5)
        k += 1
    click_count = click_count + k
    print("Number of Clicks on More News = ", click_count)

    r = session.get(url)
    time.sleep(3)

    if r.status_code != 200:
        print(f'Error occurred: {r.status_code} on url: {url}')
    else:
        soup = BeautifulSoup(r.content, "html5lib")

        news = soup.find_all('div', attrs={'class': 'media-item'})

        for j in news:
            # Getting titles
            if j.h1 not in ([], None):
                titles = j.h1.get_text(strip=True)
                article_title.append(titles)

            # Getting date
            if j.h6 not in ([], None):
                dates = j.find('h6', attrs={'class': 'date'})
                date = dates.text.strip()
                article_date.append(date)

            # Getting URL
            address = j.find('a').get('href')
            # Countermeasure for links with full url
            if "https://" in address:
                news_link = address
                article_link.append(news_link)

        df = pd.DataFrame({'Article_Title': article_title,
                           'Date': article_date, 'Source': article_link})

        # Getting Content Section
        news_articles = []  # to store the content of each news article
        news_count = 0
        for link in df['Source']:
            start_time = time.monotonic()
            print('Article No. ', news_count)
            print('Link: ', link)
            news_response = session.get(link)
            time.sleep(1)
            news_data = news_response.content
            news_soup = BeautifulSoup(news_data, 'html.parser')
            art_cont = news_soup.find('div', 'article-content')
            # Countermeasure for links with subscribe form
            try:
                try:
                    article = art_cont.text.split("Subscribe")[0] + art_cont.text.split("Sign up")[1]
                except:
                    article = art_cont.text
                article = " ".join((article).strip().split())
            except:
                article = f"Login to read {link}"
            news_count += 1
            news_articles.append(article)
            end_time = time.monotonic()
            print(timedelta(seconds=end_time - start_time))
            print('\n')

        # Create a column to add all the scraped text
        df['News'] = news_articles
        print('\n')

df.drop_duplicates(subset="Source", keep=False, inplace=True)

# Don't store links
df.drop(columns=['Source'], axis=1, inplace=True)
# df.to_excel('SA_news.xlsx')
df.to_csv('maverick2.csv')





CodePudding user response:

General tip for scraping: you don't necessarily need Selenium to access information that is loaded with JavaScript. In this case, if you look at the Network tab in Chrome's developer tools, you can see that the information is retrieved from a URL that looks like this: https://www.dailymaverick.co.za/wp-admin/admin-ajax.php?action=ajaxCategoryPost&lazy_load_offset={OFFSET}&ajax_count=4&cat_name=maverick-news&taxonomy_name=section

where {OFFSET} is replaced with the offset of the articles you want to see. This lets you fetch more articles with plain requests, as sketched below.
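For example, here is a minimal sketch using plain requests (the parameter names come straight from the URL above; the assumption that the response contains the same 'media-item' markup as the page itself is mine and may need adjusting):

import requests
from bs4 import BeautifulSoup

AJAX_URL = ("https://www.dailymaverick.co.za/wp-admin/admin-ajax.php"
            "?action=ajaxCategoryPost&lazy_load_offset={offset}"
            "&ajax_count=4&cat_name=maverick-news&taxonomy_name=section")
HEADERS = {"user-agent": "Mozilla/5.0"}

def fetch_batch(offset):
    """Fetch one lazy-load batch of articles starting at the given offset."""
    r = requests.get(AJAX_URL.format(offset=offset), headers=HEADERS)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, "html.parser")
    # Assumption: the endpoint returns the same 'media-item' blocks as the page
    return soup.find_all("div", attrs={"class": "media-item"})

# Walk the feed in steps of 4 (ajax_count=4) until nothing more comes back
offset = 0
while True:
    items = fetch_batch(offset)
    if not items:
        break
    for item in items:
        print(item.find("a").get("href"))
    offset += 4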

As for your question, I couldn't find a way to reach articles that old, but if you can find a way to see them in the browser, you can use the same method (the Network tab in Chrome) to see how they are requested.

Make sure you check the site's robots.txt file before you start scraping, to confirm that it allows access to this information.
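If you want to automate that check, the standard library's robots.txt parser is enough; a minimal example (the two URLs are simply the ones from this question):

from urllib.robotparser import RobotFileParser

# Read the site's robots.txt and test whether the section page may be fetched
rp = RobotFileParser("https://www.dailymaverick.co.za/robots.txt")
rp.read()
print(rp.can_fetch("*", "https://www.dailymaverick.co.za/section/maverick-news/"))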

CodePudding user response:

You want to load more articles with Selenium and then scrape them with BeautifulSoup. That approach is fine, but the way you combine the two is wrong. When you run

# load articles with selenium
...

# scrape with beautifulsoup
r = session.get(url)
...
soup = BeautifulSoup(r.content, "html5lib")

you are not using the articles loaded by Selenium; you are requesting the homepage again (without the extra articles Selenium loaded). You should instead pass BeautifulSoup the page source held by the Selenium driver, which contains all the loaded articles. To do this, run

soup = BeautifulSoup(driver.page_source, 'lxml')
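In context, the change to your loop looks roughly like this (driver and the 'media-item' selector come from your own code; 'lxml' must be installed, otherwise keep 'html5lib'):

from bs4 import BeautifulSoup

# ... your Selenium click loop runs first and loads the extra articles ...

# Parse the DOM the browser actually holds, instead of requests.get(url)
soup = BeautifulSoup(driver.page_source, "lxml")

# The rest of the parsing stays the same as in your code
news = soup.find_all("div", attrs={"class": "media-item"})
print(len(news))  # should now grow with every "load more" click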