How can I scrape the content of a news item based on its title?


This is a little personal utility I built for fun. I have a Listbox where the titles and times of the news are scraped from 2 links and printed after clicking the "View Titles" button. This part works correctly. All ok!


Now I would like to select a news title in the Listbox, click the "View Content" button, and view the content of that news item in a multiline textbox below. Note that the title is also the link to the news content. But I have a problem with the function that should do this:

def content():
    if title.select:

        #click on title-link
        driver.find_element_by_tag_name("title").click()

        #Download Content to class for every title
        content_download =(" ".join([span.text for span in div.select("text mbottom")]))

        #Print Content in textbox
        textbox_download.insert(tk.END, content_download)

So I imagined that, to get this, I would have to simulate a click on the title of the news to open it (in the HTML it is title), then select the text of the content (in the HTML it is text mbottom) and copy it into the textbox of my window. Is that the right approach? What do you say? Obviously I have written the code poorly and it doesn't work. I'm not very good at scraping. Could anyone help me? Thank you.

The complete code is below (it runs correctly and scrapes the titles and times; I don't call the content function from the button yet). Aside from the function above, the code works well and fetches the title and time of each news item.

from tkinter import *
from tkinter import ttk
import tkinter as tk
import sqlite3
import random
import tkinter.font as tkFont
from tkinter import ttk

window=Tk()
window.title("x")
window.geometry("800x800")

textbox_title = tk.Listbox(window, width=80, height=16, font=('helvetic', 12), selectbackground="#960000", selectforeground="white", bg="white") # previously this was self.tutti_pronostici, to show the calls from the other window
textbox_title.place(x=1, y=1)

textbox_download = tk.Listbox(window, width=80, height=15, font=('helvetic', 12), selectbackground="#960000", selectforeground="white", bg="white") # previously this was self.tutti_pronostici, to show the calls from the other window
textbox_download.place(x=1, y=340)

#Download All Titles and Time
def all_titles():

    allnews = []

    import requests
    from bs4 import BeautifulSoup

    # mock browser request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }


    #ATALANTA
    site_atalanta = requests.get('https://www.tuttomercatoweb.com/atalanta/', headers=headers)
    soup = BeautifulSoup(site_atalanta.content, 'html.parser')

    news = soup.find_all('div', attrs={"class": "tcc-list-news"})

    for each in news:
        for div in each.find_all("div"):
            time= (div.find('span', attrs={'class': 'hh serif'}).text)
            title=(" ".join([span.text for span in div.select("a > span")]))

            news = (f" {time} {'ATALANTA'}, {title} (TMW)")
            allnews.append(news)        


    #BOLOGNA
    site_bologna = requests.get('https://www.tuttomercatoweb.com/bologna/', headers=headers)
    soup = BeautifulSoup(site_bologna.content, 'html.parser')

    news = soup.find_all('div', attrs={"class": "tcc-list-news"})

    for each in news:
        for div in each.find_all("div"):
            time= (div.find('span', attrs={'class': 'hh serif'}).text)
            title=(" ".join([span.text for span in div.select("a > span")]))

            news = (f" {time} {'BOLOGNA'}, {title} (TMW)")
            allnews.append(news)           
                            

    allnews.sort(reverse=True)

    for news in allnews:
        textbox_title.insert(tk.END, news)

#Download Content of News
def content():
    if titolo.select:

        #click on title-link
        driver.find_element_by_tag_name("title").click()

        #Download Content to class for every title
        content_download =(" ".join([span.text for span in div.select("text mbottom")]))

        #Print Content in textbox
        textbox_download.insert(tk.END, content_download)



button = tk.Button(window, text="View Titles", command= lambda: [all_titles()])
button.place(x=1, y=680)

button2 = tk.Button(window, text="View Content", command= lambda: [content()])
button2.place(x=150, y=680)

window.mainloop()

CodePudding user response:

When you get the title and time, you could also directly get the link to the page with the details - and keep them together as a pair.

            news = f" {time} '{place}', {title} (TMW)"
            link = div.find('a')['href']

            results.append( [news, link] )

Later you display only the news text, but when you select a title you can get its index, read the link from allnews and download the page directly - using requests instead of driver.

def content():
    # tuple with indexes of all selected titles
    selection = listbox_title.curselection()
    print('selection:', selection)

    if selection:
        
        item = allnews[selection[-1]]
        print('item:', item)

        url = item[1]
        print('url:', url)

To select the full news text you have to use select(".text.mbottom") - with dots, because text and mbottom are CSS classes.
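
For example (the HTML string below is made up just to show the difference between the two selectors):

from bs4 import BeautifulSoup

html = '<div class="text mbottom"><p>First paragraph.</p><p>Second paragraph.</p></div>'
soup = BeautifulSoup(html, 'html.parser')

print(soup.select("text mbottom"))    # [] - this looks for <mbottom> tags inside <text> tags
print(soup.select(".text.mbottom"))   # [<div class="text mbottom">...] - elements that have both CSS classes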

And to display the news content it is better to use Text() instead of Listbox().
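
A Listbox treats every insert() as a separate selectable row, while Text() holds the whole article as one multi-line block - a minimal sketch (standalone, separate from the full code below):

import tkinter as tk

root = tk.Tk()

text_download = tk.Text(root, bg="white")
text_download.pack(fill='both', expand=True)

article = "First line of the article.\nSecond line of the article."
text_download.delete('1.0', 'end')    # Text indexes start at '1.0' (line 1, column 0), not at 0
text_download.insert('end', article)  # the whole multi-line string goes in as one block

root.mainloop()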

Because you run the same code for ATALANTA and BOLOGNA, I moved it into a function get_data_for(place), and now a for-loop can run it for more places.

for place in ['atalanta',  'bologna']: 
    results = get_data_for(place)
    allnews += results

Full working code (1) - I tried to keep only important elements.

I used pack() instead of place() because it lets the window be resized, and it will also resize the Listbox() and Text().

import tkinter as tk   # PEP8: `import *` is not preferred
from tkinter import ttk
import requests
from bs4 import BeautifulSoup

# PEP8: all imports at the beginning

# --- functions ---   # PEP8: all functions directly after imports

def get_data_for(place):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }
    
    results = []
    
    response = requests.get(f'https://www.tuttomercatoweb.com/{place}/', headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    news = soup.find_all('div', attrs={"class": "tcc-list-news"})

    for each in news:
        for div in each.find_all("div"):
            time = div.find('span', attrs={'class': 'hh serif'}).text
            title = " ".join(span.text for span in div.select("a > span"))
            news = f" {time} {place.upper()}, {title} (TMW)"
            link = div.find('a')['href']
            results.append( [news, link] )
    
    return results

def all_titles():
    global allnews  # inform function to use global variable instead of local variable

    allnews = []

    for place in ['atalanta',  'bologna']: 
        print('search:', place)
        results = get_data_for(place)
        print('found:', len(results))
        allnews += results

    allnews.sort(reverse=True)

    listbox_title.delete('0', 'end')

    for news in allnews:
        listbox_title.insert('end', news[0])

#Download Content of News
def content():
    # tuple
    selection = listbox_title.curselection()
    print('selection:', selection)

    if selection:
        
        item = allnews[selection[-1]]
        print('item:', item)
        url = item[1]
        print('url:', url)

        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
        }
             
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')

        content_download = "\n".join(item.get_text() for item in soup.select("div.text.mbottom"))

        text_download.delete('1.0', 'end')  # remove previous content
        text_download.insert('end', content_download)

# --- main ---

allnews = []  # global variable with default value at start

window = tk.Tk()
window.geometry("800x800")

listbox_title = tk.Listbox(window, selectbackground="#960000", selectforeground="white", bg="white")
listbox_title.pack(fill='both', expand=True, pady=5, padx=5)

text_download = tk.Text(window, bg="white")
text_download.pack(fill='both', expand=True, pady=0, padx=5)

buttons_frame = tk.Frame(window)
buttons_frame.pack(fill='x')

button1 = tk.Button(buttons_frame, text="View Titles", command=all_titles)  # don't use `[]` to execute functions
button1.pack(side='left', pady=5, padx=5)

button2 = tk.Button(buttons_frame, text="View Content", command=content)   # don't use `[]` to execute functions
button2.pack(side='left', pady=5, padx=(0,5))

window.mainloop()

Result: (screenshot of the running GUI)


EDIT:

Problem with sorting: today's titles end up at the end of the list but they should be at the beginning - because the items are sorted using only the time, while they would need to be sorted by date and time (or by a block number and the time).

If you enumerate every tcc-list-news block, then every day gets its own number and the items sort (almost) correctly. Because you want to sort in reverse order, you may need -number instead of number to get the correct order.

    for number, each in enumerate(news):
        for div in each.find_all("div"):
            time  = div.find('span', attrs={'class': 'hh serif'}).text
            title = " ".join(span.text for span in div.select("a > span"))
            news  = f" {time} {place.upper()}, {title} (TMW)"
            link  = div.find('a')['href']
            results.append( [-number, news, link] )

and after sorting

    for number, news, url in allnews:
        listbox_title.insert('end', news)

Full working code (2)

import tkinter as tk   # PEP8: `import *` is not preferred
from tkinter import ttk
import requests
from bs4 import BeautifulSoup

# PEP8: all imports at the beginning

# --- functions ---   # PEP8: all functions directly after imports

def get_data_for(place):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }
    
    results = []
    
    response = requests.get(f'https://www.tuttomercatoweb.com/{place}/', headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    news = soup.find_all('div', attrs={"class": "tcc-list-news"})

    for number, each in enumerate(news):
        for div in each.find_all("div"):
            time  = div.find('span', attrs={'class': 'hh serif'}).text
            title = " ".join(span.text for span in div.select("a > span"))
            news  = f" {time} {place.upper()}, {title} (TMW)"
            link  = div.find('a')['href']
            results.append( [-number, news, link] )
    
    return results

def all_titles():
    global allnews  # inform function to use global variable instead of local variable

    allnews = []

    for place in ['atalanta',  'bologna']: 
        print('search:', place)
        results = get_data_for(place)
        print('found:', len(results))
        allnews += results

    allnews.sort(reverse=True)

    listbox_title.delete('0', 'end')
    
    for number, news, url in allnews:
        listbox_title.insert('end', news)

#Download Content of News
def content():
    # tuple
    selection = listbox_title.curselection()
    print('selection:', selection)

    if selection:
        
        item = allnews[selection[-1]]
        print('item:', item)
        url = item[2]
        print('url:', url)

        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
        }
             
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')

        content_download = "\n".join(item.get_text() for item in soup.select("div.text.mbottom"))

        text_download.delete('1.0', 'end')  # remove previous content
        text_download.insert('end', content_download)

# --- main ---

allnews = []  # global variable with default value at start

window = tk.Tk()
window.geometry("800x800")

listbox_title = tk.Listbox(window, selectbackground="#960000", selectforeground="white", bg="white")
listbox_title.pack(fill='both', expand=True, pady=5, padx=5)

text_download = tk.Text(window, bg="white")
text_download.pack(fill='both', expand=True, pady=0, padx=5)

buttons_frame = tk.Frame(window)
buttons_frame.pack(fill='x')

button1 = tk.Button(buttons_frame, text="View Titles", command=all_titles)  # don't use `[]` to execute functions
button1.pack(side='left', pady=5, padx=5)

button2 = tk.Button(buttons_frame, text="View Content", command=content)   # don't use `[]` to execute functions
button2.pack(side='left', pady=5, padx=(0,5))

window.mainloop()

BTW

Because you sort in reverse order, you get 00:30 BOLOGNA before 00:30 ATALANTA - to get 00:30 ATALANTA before 00:30 BOLOGNA you would have to keep time and place as separate values and use key= in sort() with a function that reverses only the time but not the place and number. Maybe it would be simpler to put everything into a pandas.DataFrame, which has better methods for sorting.
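
A minimal sketch of that idea with plain sort() (the data below is made up, and it assumes every result is kept as [number, time, place, title, link]): sort() is stable, so you can sort several times, from the least significant key to the most significant one, and reverse only the passes that need it.

allnews = [
    [0, '00:30', 'bologna',  'Title B', 'https://example.com/b'],
    [0, '00:30', 'atalanta', 'Title A', 'https://example.com/a'],
    [0, '09:15', 'atalanta', 'Title C', 'https://example.com/c'],
]

allnews.sort(key=lambda item: item[2])                # place: ascending (ATALANTA before BOLOGNA)
allnews.sort(key=lambda item: item[1], reverse=True)  # time: descending (latest first)
allnews.sort(key=lambda item: item[0])                # block/day number: ascending (today first)

for number, time, place, title, link in allnews:
    print(time, place.upper(), title)

# 09:15 ATALANTA Title C
# 00:30 ATALANTA Title A
# 00:30 BOLOGNA Title B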


Version with pandas.DataFrame (screenshot of the running GUI):

Full working code (4)

import tkinter as tk   # PEP8: `import *` is not preferred
from tkinter import ttk
from tkinter.scrolledtext import ScrolledText  # https://docs.python.org/3/library/tkinter.scrolledtext.html
import requests
import requests_cache  # https://github.com/reclosedev/requests-cache
from bs4 import BeautifulSoup
import pandas as pd

# PEP8: all imports at the beginning

# --- functions ---   # PEP8: all functions directly after imports

def get_data_for(place):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }

    results = []

    response = requests.get(f'https://www.tuttomercatoweb.com/{place}/', headers=headers)
    print('url:', response.url)
    print('status:', response.status_code)
    #print('html:', response.text[:1000])

    soup = BeautifulSoup(response.content, 'html.parser')

    news = soup.find_all('div', attrs={"class": "tcc-list-news"})

    for number, each in enumerate(news):
        for div in each.find_all("div"):
            time  = div.find('span', attrs={'class': 'hh serif'}).text
            title = " ".join(span.text for span in div.select("a > span"))
            news = f" {time} {place.upper()}, {title} (TMW)"
            link  = div.find('a')['href']
            results.append( [number, time, place, title, news, link] )

    return results

def all_titles():
    global df

    allnews = []  # local variable

    for place in ['atalanta',  'bologna']:
        print('search:', place)
        results = get_data_for(place)
        print('found:', len(results))
        allnews += results
        text_download.insert('end', f"search: {place}\nfound: {len(results)}\n")

    df = pd.DataFrame(allnews, columns=['number', 'time', 'place', 'title', 'news', 'link'])
    df = df.sort_values(by=['number', 'time', 'place', 'title'], ascending=[True, False, True, True])
    df = df.reset_index()

    listbox_title.delete('0', 'end')

    for index, row in df.iterrows():
        listbox_title.insert('end', row['news'])

def content(event=None):   # `command=` executes without `event`, but `bind` executes with `event` - so it needs default value
    # tuple
    selection = listbox_title.curselection()
    print('selection:', selection)

    if selection:

        item = df.iloc[selection[-1]]
        #print('item:', item)

        url = item['link']
        #print('url:', url)

        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
        }

        # keep page in database `SQLite` 
        # https://github.com/reclosedev/requests-cache
        # https://sqlite.org/index.html
        session = requests_cache.CachedSession('titles')
        response = session.get(url, headers=headers)
        #response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')

        content_download = "\n".join(item.get_text() for item in soup.select("div.text.mbottom"))

        text_download.delete('1.0', 'end')  # remove previous content
        text_download.insert('end', content_download)

# --- main ---

df = None

window = tk.Tk()
window.geometry("800x800")

# ---
# [Tkinter: How to display Listbox with Scrollbar — furas.pl](https://blog.furas.pl/python-tkitner-how-to-display-listbox-with-scrollbar-gb.html)

frame_title = tk.Frame(window)
frame_title.pack(fill='both', expand=True, pady=5, padx=5)

listbox_title = tk.Listbox(frame_title, selectbackground="#960000", selectforeground="white", bg="white")
listbox_title.pack(side='left', fill='both', expand=True)

scrollbar_title = tk.Scrollbar(frame_title)
scrollbar_title.pack(side='left', fill='y')

scrollbar_title['command'] = listbox_title.yview
listbox_title.config(yscrollcommand=scrollbar_title.set)

listbox_title.bind('<Double-Button-1>', content)  # it executes `content(event)`

# ----

text_download = ScrolledText(window, bg="white")
text_download.pack(fill='both', expand=True, pady=0, padx=5)

# ----

buttons_frame = tk.Frame(window)
buttons_frame.pack(fill='x')

button1 = tk.Button(buttons_frame, text="View Titles", command=all_titles)  # don't use `[]` to execute functions
button1.pack(side='left', pady=5, padx=5)

button2 = tk.Button(buttons_frame, text="View Content", command=content)   # don't use `[]` to execute functions
button2.pack(side='left', pady=5, padx=(0,5))

window.mainloop()