My little personal utility built for fun. I have a Listbox where the titles and news time are scraped from 2 links and printed in the Listbox after clicking on the "View Title" button. This works correctly. All ok!
Now I would like to select the newspaper title from the Listbox, click on the "View Content" button, and view the news content in a multiline textbox. So I would like to view the content of the news of the selected title in the textbox below. I specify that the title is the same as the link of the news content. But I have a problem with the function to build this:
def content():
    # NOTE(review): this draft cannot run as written — `title` is never
    # defined, and a Tk Listbox exposes its selection via .curselection(),
    # not a `.select` attribute.
    if title.select:
        #click on title-link
        # NOTE(review): `driver` (a Selenium WebDriver) is never created or
        # imported anywhere in this script; no browser automation is needed,
        # the article URL could be fetched with requests instead.
        driver.find_element_by_tag_name("title").click()
        #Download Content to class for every title
        # NOTE(review): `div` is a local of all_titles() and is not in scope
        # here; CSS class selectors also need dots: ".text.mbottom".
        content_download =(" ".join([span.text for span in div.select("text mbottom")]))
        #Print Content in textobox
        textbox_download.insert(tk.END, content_download)
So I imagined that to get this, we would have to simulate clicking on the title of the news to open it (in html it is title
), then select the text of the content (in html it is text mbottom
) and then copy it into the textbox of my file. Should it work like that? What do you think? Obviously I have written the code poorly and it doesn't work. I'm not very good at scraping. Could anyone help me? Thank you
The complete code is below (it runs correctly and scrapes titles and times; I don't call the content function from the button). Aside from the function above, the code works well and fetches the title and news time.
from tkinter import *
from tkinter import ttk
import tkinter as tk
import sqlite3
import random
import tkinter.font as tkFont
from tkinter import ttk
# Main application window.
window=Tk()
window.title("x")
window.geometry("800x800")
# Listbox holding one scraped headline per row.
# (was previously self.tutti_pronostici, to display the calls from the other window)
textbox_title = tk.Listbox(window, width=80, height=16, font=('helvetic', 12), selectbackground="#960000", selectforeground="white", bg="white")
textbox_title.place(x=1, y=1)
# Second Listbox intended to show the article body of the selected headline.
# (was previously self.tutti_pronostici, to display the calls from the other window)
textbox_download = tk.Listbox(window, width=80, height=15, font=('helvetic', 12), selectbackground="#960000", selectforeground="white", bg="white")
textbox_download.place(x=1, y=340)
#Download All Titles and Time
def all_titles():
    """Scrape headline + time for each tracked team and fill the title Listbox.

    Downloads each team's news list page, extracts " <time> <TEAM>, <title> (TMW)"
    strings, sorts them in reverse (newest time prefix first) and inserts
    them into the module-level ``textbox_title`` Listbox.
    """
    import requests
    from bs4 import BeautifulSoup

    # mock browser request so the site serves the page
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }

    def scrape(place):
        """Return the formatted news strings for one team page."""
        response = requests.get(f'https://www.tuttomercatoweb.com/{place}/', headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        entries = []
        for block in soup.find_all('div', attrs={"class": "tcc-list-news"}):
            for div in block.find_all("div"):
                news_time = div.find('span', attrs={'class': 'hh serif'}).text
                title = " ".join(span.text for span in div.select("a > span"))
                entries.append(f" {news_time} {place.upper()}, {title} (TMW)")
        return entries

    allnews = []
    # The original duplicated the whole scraping block per team (ATALANTA,
    # BOLOGNA); loop over the team slugs instead — same output strings.
    for place in ('atalanta', 'bologna'):
        allnews.extend(scrape(place))

    allnews.sort(reverse=True)
    for news in allnews:
        textbox_title.insert(tk.END, news)
#Download Content of News
def content():
    # NOTE(review): this function is the broken draft the question asks
    # about and cannot run as written:
    # NOTE(review): `titolo` is never defined in this file, and a Tk Listbox
    # has no `.select` attribute — the selection is read with .curselection().
    if titolo.select:
        #click on title-link
        # NOTE(review): `driver` (a Selenium WebDriver) is never created or
        # imported anywhere in this script.
        driver.find_element_by_tag_name("title").click()
        #Download Content to class for every title
        # NOTE(review): `div` is a local of all_titles() and is not in scope
        # here; the CSS selector also needs dots for classes: ".text.mbottom".
        content_download =(" ".join([span.text for span in div.select("text mbottom")]))
        #Print Content in textobox
        textbox_download.insert(tk.END, content_download)
# Buttons: `command=` takes a bare callable; wrapping the call in a
# `lambda: [...]` list built only for its side effect is redundant,
# so pass the functions directly.
button = tk.Button(window, text="View Titles", command=all_titles)
button.place(x=1, y=680)
button2 = tk.Button(window, text="View Content", command=content)
button2.place(x=150, y=680)
# Start the Tk event loop (blocks until the window is closed).
window.mainloop()
CodePudding user response:
When you get title
and time
then you could directly get link
to page with details - and keep them as pair.
news = f" {time} '{place}', {title} (TMW)"
link = div.find('a')['href']
results.append( [news, link] )
and later you can display only news
but when you select title then you can get index and get link
from allnews
and directly download it - using requests
instead of driver
def content():
# tuple with indexes of all selected titles
selection = listbox_title.curselection()
print('selection:', selection)
if selection:
item = allnews[selection[-1]]
print('item:', item)
url = item[1]
print('url:', url)
To select full news you have to use select(".text.mbottom")
with dots.
And to display news it would be better to use Text()
instead Listbox()
Because you run the same code for ATALANTA
and BOLOGNA
so I moved this code to function get_data_for(place)
and now I can even use for
-loop to run it for more places.
for place in ['atalanta', 'bologna']:
results = get_data_for(place)
allnews = results
Full working code (1) - I tried to keep only important elements.
I used pack()
instead of place()
because it allows resizing the window, which will also resize the Listbox()
and Text()
import tkinter as tk # PEP8: `import *` is not preferred
from tkinter import ttk
import requests
from bs4 import BeautifulSoup
# PEP8: all imports at the beginning
# --- functions --- # PEP8: all functions directly after imports
def get_data_for(place):
    """Scrape the news list page for one team.

    Returns a list of ``[display_string, article_url]`` pairs, one per news
    entry found on ``https://www.tuttomercatoweb.com/<place>/``.
    """
    # Pretend to be a regular browser so the site serves the page.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }

    response = requests.get(f'https://www.tuttomercatoweb.com/{place}/', headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    results = []
    for container in soup.find_all('div', attrs={"class": "tcc-list-news"}):
        for div in container.find_all("div"):
            news_time = div.find('span', attrs={'class': 'hh serif'}).text
            title = " ".join(span.text for span in div.select("a > span"))
            display = f" {news_time} {place.upper()}, {title} (TMW)"
            link = div.find('a')['href']
            # Keep headline and its article link together as a pair.
            results.append([display, link])
    return results
def all_titles():
    """Fetch headlines for all tracked teams and (re)fill the title Listbox."""
    global allnews  # rebind the module-level list read later by content()

    allnews = []
    for place in ['atalanta', 'bologna']:
        print('search:', place)
        results = get_data_for(place)
        print('found:', len(results))
        # BUG FIX: the original did `allnews = results`, which *replaced*
        # the list on every iteration so only the last place survived.
        # Accumulate across places instead.
        allnews += results

    allnews.sort(reverse=True)

    listbox_title.delete('0', 'end')  # clear any previous run
    for news in allnews:
        listbox_title.insert('end', news[0])  # show only the display string
#Download Content of News
def content():
    """Download and display the article body for the selected headline."""
    selection = listbox_title.curselection()  # tuple of selected indexes
    print('selection:', selection)
    if not selection:
        return

    # Use the most recently selected row to look up its stored URL.
    item = allnews[selection[-1]]
    print('item:', item)
    url = item[1]
    print('url:', url)

    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Class selectors need leading dots: <div class="text mbottom">.
    paragraphs = [node.get_text() for node in soup.select("div.text.mbottom")]
    content_download = "\n".join(paragraphs)

    text_download.delete('1.0', 'end')  # remove previous content
    text_download.insert('end', content_download)
# --- main ---
allnews = [] # global variable with default value at start
window = tk.Tk()
window.geometry("800x800")
# Listbox of headlines; content() maps the selected row index back into allnews.
listbox_title = tk.Listbox(window, selectbackground="#960000", selectforeground="white", bg="white")
listbox_title.pack(fill='both', expand=True, pady=5, padx=5)
# Multiline Text widget for the article body (a Listbox would be one-line rows).
text_download = tk.Text(window, bg="white")
text_download.pack(fill='both', expand=True, pady=0, padx=5)
buttons_frame = tk.Frame(window)
buttons_frame.pack(fill='x')
button1 = tk.Button(buttons_frame, text="View Titles", command=all_titles) # don't use `[]` to execute functions
button1.pack(side='left', pady=5, padx=5)
button2 = tk.Button(buttons_frame, text="View Content", command=content) # don't use `[]` to execute functions
button2.pack(side='left', pady=5, padx=(0,5))
# Blocks until the window is closed.
window.mainloop()
Result:
EDIT:
Problem with sorting: today's titles are at the end of list but they should be at the beginning - all because they are sorted using only time
but they would need to be sorted using date time
or number time
.
You would enumerate
every tcc-list-news
and then every day would have own number and they would sort (almost) correctly. because you want to sort in reverse order then you may need -number
instead of number
to get correct order.
for number, each in enumerate(news):
for div in each.find_all("div"):
time = div.find('span', attrs={'class': 'hh serif'}).text
title = " ".join(span.text for span in div.select("a > span"))
news = f" {time} {place.upper()}, {title} (TMW)"
link = div.find('a')['href']
results.append( [-number, news, link] )
and after sorting
for number, news, url in allnews:
listbox_title.insert('end', news)
Full working code (2)
import tkinter as tk # PEP8: `import *` is not preferred
from tkinter import ttk
import requests
from bs4 import BeautifulSoup
# PEP8: all imports at the beginning
# --- functions --- # PEP8: all functions directly after imports
def get_data_for(place):
    """Scrape one team's news page.

    Returns a list of ``[-day_block_index, display_string, article_url]``
    triples; the negated block index keeps newer day-blocks first when the
    caller sorts in reverse order.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }
    response = requests.get(f'https://www.tuttomercatoweb.com/{place}/', headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    results = []
    day_blocks = soup.find_all('div', attrs={"class": "tcc-list-news"})
    for number, block in enumerate(day_blocks):
        for div in block.find_all("div"):
            news_time = div.find('span', attrs={'class': 'hh serif'}).text
            title = " ".join(span.text for span in div.select("a > span"))
            display = f" {news_time} {place.upper()}, {title} (TMW)"
            link = div.find('a')['href']
            # Negative so that reverse sorting preserves day-block order.
            results.append([-number, display, link])
    return results
def all_titles():
    """Fetch headlines for all tracked teams and (re)fill the title Listbox."""
    global allnews  # rebind the module-level list read later by content()

    allnews = []
    for place in ['atalanta', 'bologna']:
        print('search:', place)
        results = get_data_for(place)
        print('found:', len(results))
        # BUG FIX: `allnews = results` replaced the accumulated list on every
        # iteration, so only the last place was ever shown.  Extend instead.
        allnews += results

    allnews.sort(reverse=True)

    listbox_title.delete('0', 'end')  # clear any previous run
    for number, news, url in allnews:
        listbox_title.insert('end', news)  # show only the display string
#Download Content of News
def content():
    """Fetch the selected article and show its text in the Text widget."""
    selection = listbox_title.curselection()  # indexes of selected rows
    print('selection:', selection)
    if not selection:
        return

    item = allnews[selection[-1]]
    print('item:', item)
    url = item[2]  # item layout: [sort_key, display_string, link]
    print('url:', url)

    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Article body paragraphs live in <div class="text mbottom"> elements.
    content_download = "\n".join(node.get_text() for node in soup.select("div.text.mbottom"))

    text_download.delete('1.0', 'end')  # drop any previously shown article
    text_download.insert('end', content_download)
# --- main ---
allnews = [] # global variable with default value at start
window = tk.Tk()
window.geometry("800x800")
# Headline list; content() resolves the selected index against allnews.
listbox_title = tk.Listbox(window, selectbackground="#960000", selectforeground="white", bg="white")
listbox_title.pack(fill='both', expand=True, pady=5, padx=5)
# Multiline article view.
text_download = tk.Text(window, bg="white")
text_download.pack(fill='both', expand=True, pady=0, padx=5)
buttons_frame = tk.Frame(window)
buttons_frame.pack(fill='x')
button1 = tk.Button(buttons_frame, text="View Titles", command=all_titles) # don't use `[]` to execute functions
button1.pack(side='left', pady=5, padx=5)
button2 = tk.Button(buttons_frame, text="View Content", command=content) # don't use `[]` to execute functions
button2.pack(side='left', pady=5, padx=(0,5))
# Blocks until the window is closed.
window.mainloop()
BTW
Because you sort in reverse order so you get 00:30 bologna
before 00:30 atalanta
- to get 00:30 atalanta
before 00:30 bologna
you would have to keep time
, place
as separated values and use key=
in sort()
to assign function which would reverse only time
but not place
and number
. Maybe it would be simpler to put all in pandas.DataFrame
which has a better method for sorting it.
Version with pandas.DataFrame
and
Full working code (4)
import tkinter as tk # PEP8: `import *` is not preferred
from tkinter import ttk
from tkinter.scrolledtext import ScrolledText # https://docs.python.org/3/library/tkinter.scrolledtext.html
import requests
import requests_cache # https://github.com/reclosedev/requests-cache
from bs4 import BeautifulSoup
import pandas as pd
# PEP8: all imports at the beginning
# --- functions --- # PEP8: all functions directly after imports
def get_data_for(place):
    """Scrape one team's news page into rows for a pandas DataFrame.

    Each row is ``[number, time, place, title, news, link]`` where ``number``
    is the index of the day-block the entry came from.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }
    response = requests.get(f'https://www.tuttomercatoweb.com/{place}/', headers=headers)
    print('url:', response.url)
    print('status:', response.status_code)
    #print('html:', response.text[:1000])

    soup = BeautifulSoup(response.content, 'html.parser')

    results = []
    day_blocks = soup.find_all('div', attrs={"class": "tcc-list-news"})
    for number, block in enumerate(day_blocks):
        for div in block.find_all("div"):
            news_time = div.find('span', attrs={'class': 'hh serif'}).text
            title = " ".join(span.text for span in div.select("a > span"))
            display = f" {news_time} {place.upper()}, {title} (TMW)"
            link = div.find('a')['href']
            results.append([number, news_time, place, title, display, link])
    return results
def all_titles():
    """Scrape every tracked team, sort with pandas, and fill the Listbox."""
    global df  # content() reads the DataFrame to resolve the clicked row

    allnews = []  # local accumulator
    for place in ['atalanta', 'bologna']:
        print('search:', place)
        results = get_data_for(place)
        print('found:', len(results))
        # BUG FIX: the original assigned `allnews = results`, discarding the
        # previous team's rows on every iteration; accumulate instead.
        allnews += results
        text_download.insert('end', f"search: {place}\nfound: {len(results)}\n")

    df = pd.DataFrame(allnews, columns=['number', 'time', 'place', 'title', 'news', 'link'])
    # Day-blocks ascending (newer blocks first on the page), time descending
    # within a block, then place/title for a stable tie-break.
    df = df.sort_values(by=['number', 'time', 'place', 'title'], ascending=[True, False, True, True])
    df = df.reset_index()

    listbox_title.delete('0', 'end')  # clear any previous run
    for index, row in df.iterrows():
        listbox_title.insert('end', row['news'])
def content(event=None):
    """Show the article for the selected headline.

    ``event`` defaults to None because ``command=`` calls this with no
    argument while the Listbox ``bind`` passes an Event.
    """
    selection = listbox_title.curselection()  # indexes of selected rows
    print('selection:', selection)
    if not selection:
        return

    row = df.iloc[selection[-1]]
    #print('item:', row)
    url = row['link']
    #print('url:', url)

    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }
    # Cache fetched pages in a local SQLite database so repeated clicks on
    # the same headline do not re-download the article.
    # https://github.com/reclosedev/requests-cache
    # https://sqlite.org/index.html
    session = requests_cache.CachedSession('titles')
    response = session.get(url, headers=headers)
    #response = requests.get(url, headers=headers)

    soup = BeautifulSoup(response.content, 'html.parser')
    content_download = "\n".join(node.get_text() for node in soup.select("div.text.mbottom"))

    text_download.delete('1.0', 'end')  # clear the previous article
    text_download.insert('end', content_download)
# --- main ---
df = None  # filled by all_titles(); read by content()
window = tk.Tk()
window.geometry("800x800")
# ---
# [Tkinter: How to display Listbox with Scrollbar — furas.pl](https://blog.furas.pl/python-tkitner-how-to-display-listbox-with-scrollbar-gb.html)
# Frame groups the Listbox with its Scrollbar side by side.
frame_title = tk.Frame(window)
frame_title.pack(fill='both', expand=True, pady=5, padx=5)
listbox_title = tk.Listbox(frame_title, selectbackground="#960000", selectforeground="white", bg="white")
listbox_title.pack(side='left', fill='both', expand=True)
scrollbar_title = tk.Scrollbar(frame_title)
scrollbar_title.pack(side='left', fill='y')
# Wire scrollbar and listbox to each other.
scrollbar_title['command'] = listbox_title.yview
listbox_title.config(yscrollcommand=scrollbar_title.set)
listbox_title.bind('<Double-Button-1>', content) # it executes `content(event)`
# ----
# ScrolledText = Text widget with a built-in vertical scrollbar.
text_download = ScrolledText(window, bg="white")
text_download.pack(fill='both', expand=True, pady=0, padx=5)
# ----
buttons_frame = tk.Frame(window)
buttons_frame.pack(fill='x')
button1 = tk.Button(buttons_frame, text="View Titles", command=all_titles) # don't use `[]` to execute functions
button1.pack(side='left', pady=5, padx=5)
button2 = tk.Button(buttons_frame, text="View Content", command=content) # don't use `[]` to execute functions
button2.pack(side='left', pady=5, padx=(0,5))
# Blocks until the window is closed.
window.mainloop()