Unable to scrape and make a dictionary from a news website

I want to scrape the news articles from a number of pages on the site: https://koreajoongangdaily.joins.com/section/business

At the end, I want to create a dictionary out of the scraped data which should have the date, UTC_date, title, authors_name, news_content, url.

Here is my code; I tried it but couldn't build the dictionary.

Import all the necessary modules:

from bs4 import BeautifulSoup as soup
import requests
import numpy as np
from pymongo import MongoClient
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd
from time import sleep
import uuid
import datetime
import time
from fake_useragent import UserAgent
import os
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import sys

import warnings
warnings.filterwarnings('ignore')
import re
from tqdm import tqdm

def string_to_date(x):
    return datetime.datetime.strptime(x, '%Y/%m/%d')

def datee(pp):
    return str(pp.date())

To get the links,

def get_article_links(res):
    href_list = []
    for link in res.select('a'):  # every <a> tag inside the result block
        href = link.get('href')
        href_list.append(href)
    return href_list

To get the article body, title, authors, date and UTC date from every link:

def get_article(url):
    news_list = []
    title_list= []
    page = requests.get(url)
    bsobj = soup(page.content, 'html.parser')
    for title in bsobj.findAll('h1',{'class':'view-article-title serif'}):
        title_list.append(title.text.strip())
        
    for news in bsobj.findAll('div',{'class':'article-content-left pb-30'}):
        news_list.append(news.text.strip())
        
    author_list = []
    for f in news_list:
        author = ""
        pattern = r"BY\b(.+)(?=\[.+\])"
        result = re.search(pattern, f)
        if result is not None:
            author = result.group(0).strip()[3:]
        author_list.append(author)
        
    # the date is given in every article link, hence we can use that
    date_list_1 = []
    separator = '/business'
    for link in href_list:
        new_set1 = link.replace('https://koreajoongangdaily.joins.com/', '')
        new_set2 = new_set1.split(separator, 1)[0]
        date_list_1.append(new_set2)
    new_set4 = list(map(datee, map(string_to_date, date_list_1)))
    # no separate time given, so add 00:00:00 for UTC
    utc_date = []
    for x in new_set4:
        utc_date.append(str(x) + " 00:00:00")

    return news_list, title_list, author_list, new_set4, utc_date

The n denotes the number of pages I want to scrape:

def scrape_the_article(n):
    options = webdriver.ChromeOptions()
    
    lists = ['disable-popup-blocking']

    caps = DesiredCapabilities().CHROME
    caps["pageLoadStrategy"] = "normal"

    options.add_argument("--window-size=1920,1080")
    options.add_argument("--disable-extensions")
    options.add_argument("--disable-notifications")
    options.add_argument("--disable-Advertisement")
    options.add_argument("--disable-popup-blocking")

    driver = webdriver.Chrome(executable_path= r"E:\chromedriver\chromedriver.exe", options=options)  # paste your own chromedriver path
    url = "https://koreajoongangdaily.joins.com/section/business"
    driver.get(url)
    
    page = 0
    for step in tqdm(range(n)):          # set the page range here, how many pages you want to scrape
        page += 1
        time.sleep(2)
        try:
            button = driver.find_element_by_class_name("service-more-btn")
            button.click()
        except Exception as e:
            print("trying to scroll")
            driver.execute_script("window.scrollBy(0, 100);")
        print("Page: ", page)
        
        
    html = driver.page_source
    bs = BeautifulSoup(html, 'html.parser')
    res_list = bs.select('div[]')
    
    for res in res_list:
        links = get_article_links(res)
        article = get_article(links) 
scrape_the_article(4)

And at the end I want to make a dictionary which will look like this:

data = {'date': new_set4, 'utc_date_time': utc_date, 'title': title_list,'author': authors,
            'content': news_list,'link': href_list}

But I couldn't get back the dictionary I wanted. Please help me with this. Thank you!

CodePudding user response:

There's an API endpoint that holds (almost) all data you need and each item is a dictionary, so you can construct your own data structure out of the API response.

NOTE: There's no author key in the response, so if you really need it, you'll have to visit each article URL (a sketch for that is at the end of this answer).

Here's how to get the first 10 items:

import datetime

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:104.0) Gecko/20100101 Firefox/104.0',
    'X-Requested-With': 'XMLHttpRequest'
}

api_endpoint = "https://koreajoongangdaily.joins.com/section/business"

payload = {
    "url": "/section/business",
    "currPage": "1",
}

results = requests.post(api_endpoint, headers=headers, data=payload)

for result in results.json()['RESULT_LIST']:
    date = (
        datetime.datetime
        .strptime(result['service_date'], '%Y%m%d%H%M%S')
        .strftime('%Y-%m-%d %H:%M:%S')
    )
    print(date)
    print(f"{result['list_title']}\n{result['cmss_url']}")
    print(f"{result['summary']}")
    print("-" * 50)

Output:

2022-10-25 18:20:42
Bio business
https://koreajoongangdaily.joins.com/2022/10/25/business/industry/Korea-World-Bio-Summit-Seoul/20221025182043006.html
President Yoon Suk-yeol delivers an opening address at the World Bio Summit 2022 held at the Grand Walkerhill Seoul in Gwangjin District, eastern Seoul, on Tuesday.
--------------------------------------------------
2022-10-25 18:20:33
Mirae Group invests in Musk's Twitter takeover
https://koreajoongangdaily.joins.com/2022/10/25/business/tech/Korea-Twitter-Elon-Musk/20221025182048690.html
Mirae Asset Financial Group will invest $212 million in Elon Musks’ $44 billion acquisition of Twitter, according to electronic disclosures and local media reports.  
--------------------------------------------------
2022-10-25 18:20:00
Smart chair
https://koreajoongangdaily.joins.com/2022/10/25/imageNews/photos/KT-robotics-smart-autonomous-chairs/20221025182003312.html
A demonstration of an autonomous “smart” robot chair at the Dongdaemun Design Plaza in Seoul. KT announced that it is making the smart robotic chair available for three weeks to visitors attending the DDP-NFT exhibition.
--------------------------------------------------

and more ...
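
Since each item in RESULT_LIST is already a dictionary, you can collect the fields from your question (minus the author, per the NOTE above) straight into the structure you wanted. Here's a minimal sketch built on the same endpoint and payload as above; note that service_date looks like local (KST) time, so treating it as the UTC date-time is an assumption you may want to correct:

import datetime

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:104.0) Gecko/20100101 Firefox/104.0',
    'X-Requested-With': 'XMLHttpRequest'
}

api_endpoint = "https://koreajoongangdaily.joins.com/section/business"

payload = {
    "url": "/section/business",
    "currPage": "1",
}

results = requests.post(api_endpoint, headers=headers, data=payload)

# the dictionary of lists you described in the question
data = {'date': [], 'utc_date_time': [], 'title': [], 'content': [], 'link': []}

for result in results.json()['RESULT_LIST']:
    dt = datetime.datetime.strptime(result['service_date'], '%Y%m%d%H%M%S')
    data['date'].append(dt.strftime('%Y-%m-%d'))
    data['utc_date_time'].append(dt.strftime('%Y-%m-%d %H:%M:%S'))  # assumed KST, not converted to UTC
    data['title'].append(result['list_title'])
    data['content'].append(result['summary'])  # summary only, not the full article body
    data['link'].append(result['cmss_url'])

print(data['title'][:3])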

To paginate the API, try this example:

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:104.0) Gecko/20100101 Firefox/104.0',
    'X-Requested-With': 'XMLHttpRequest'
}

api_endpoint = "https://koreajoongangdaily.joins.com/section/business"

payload = {
    "url": "/section/business",
    "currPage": "1",
}

with requests.Session() as s:
    for page in range(1, 100, 10):
        payload["currPage"] = str(page)
        results = s.post(api_endpoint, headers=headers, data=payload)
        for result in results.json()['RESULT_LIST']:
            print(result['service_date'])
            print(f"{result['list_title']}\n{result['cmss_url']}")
            print(f"{result['summary']}")
            print("-" * 50)

NOTE: I'd highly recommend throttling the requests to 1-3 seconds between each attempt.
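
If you also need the author, you'd have to fetch each cmss_url and parse the article page, as noted above. Here's a rough sketch that reuses the div class and the "BY ..." byline pattern from your own code; both are assumptions about the current page markup, so verify them against a live article:

import re
import time

import requests
from bs4 import BeautifulSoup


def get_author(article_url):
    # fetch one article page and try to pull the "BY ..." byline out of the body
    page = requests.get(article_url, headers={'User-Agent': 'Mozilla/5.0'})
    bsobj = BeautifulSoup(page.content, 'html.parser')
    # class name taken from the question's code; it may have changed on the site
    body = bsobj.find('div', {'class': 'article-content-left pb-30'})
    if body is None:
        return ""
    match = re.search(r"BY\s+(.+?)\s*\[", body.get_text())
    return match.group(1).strip() if match else ""


# example: enrich the rows collected in the previous sketch with an author field
# data['author'] = []
# for link in data['link']:
#     data['author'].append(get_author(link))
#     time.sleep(2)  # throttle, as recommended above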
