from bs4 import BeautifulSoup as bs
import requests
import tkinter as tk
# Create the main window.
# A Tk geometry string has the form 'WIDTHxHEIGHT+X+Y'; the offsets must carry
# their '+' (or '-') signs -- a space-separated string is rejected by Tk.
root = tk.Tk()
root.title('Language Filter')
root.geometry('500x100+700+200')
def rip():
    """Fetch the page whose URL is typed into ``scriptE`` and save its script.

    The text of the first ``<td class="scrtext">`` element on the page is
    written to ``script.txt`` in the current working directory.
    """
    response = requests.get(scriptE.get())
    soup = bs(response.content, "html.parser")
    script_cell = soup.find('td', class_='scrtext')
    with open('script.txt', 'w') as out:
        out.write(script_cell.get_text())
def censor():
    """Replace every banned word found in ``script.txt`` with ``****``.

    Banned words are read from ``bannedwords.txt`` (one per line) and the
    censored text is written to ``file.txt``. Matching is plain substring
    replacement, so a banned word is also masked when it occurs inside a
    longer word.
    """
    with open('bannedwords.txt', 'r') as file:
        # Skip blank entries (e.g. produced by a trailing newline): replacing
        # the empty string would insert "****" between every character.
        words = [word for word in file.read().split('\n') if word]
    with open('script.txt', 'r') as file:
        filedata = file.read()
    for word in words:
        filedata = filedata.replace(word, "****")
    with open('file.txt', 'w') as file:
        file.write(filedata)
# Appearance shared by both action buttons.
button_options = dict(width=60, bg='red', fg='white', font=('helvetica', 9, 'bold'))

# Caption for the URL box; it is placed on the row below the entry.
scriptL = tk.Label(root, text='Enter the URL to the script on IMSDB')
scriptL.grid(row=1, column=0)

# URL input; rip() reads its contents.
scriptE = tk.Entry(root, width=80)
scriptE.grid(row=0, column=0, padx=8)

# One button per action, stacked under the caption.
scriptB = tk.Button(root, text="Rip the Script", command=rip, **button_options)
scriptB.grid(row=2, column=0)
scriptC = tk.Button(root, text="Censor the Script", command=censor, **button_options)
scriptC.grid(row=3, column=0)

# Hand control to Tk's event loop.
root.mainloop()
Hello, I'm trying to create a simple program that rips a movie script from IMSDB with one click and then censors it using a list of profane words contained in another text file. The rip function works and the censor function works, but it censors any word that matches the list.
For example, the word "hello" becomes "****o". I know that you can normally use r"\bstring\b" to tell Python to match only that exact word and not any longer word that merely contains the string.
Since I am looping over a list with filedata.replace(x, "****"), how do I use r"\bstring\b" or some variation of it to ensure words like "ass" get censored, but words like "classic" don't become "cl****ic"?
Here is a script I was using https://imsdb.com/scripts/Joker.html
The BannedWords text file is fairly lengthy but I guess creating a .txt with a few swear words in it should be enough to test if needed.
I'm new to Python and this was just a mini-exercise for me. I'm also open to advice on my coding format or maybe some redundant things inside of it. Thanks in advance.
CodePudding user response:
You will need to use regex for this. You can be a bit clever and turn the entire banned
list into one big regex. Then you don't need to worry about loops. Since the first argument of sub
can either be the replacement string or a function, we can use a lambda
to return an asterisk for every character in the match.
This is arguably the most efficient way to do this. Regex is slow. If you turned every banned word into a separate expression and looped them, it would crawl to the finish line. Don't keep remaking the expression, either. Make it once, and reuse it as much as you need.
import re
banned_words = ['wood', 'chuck']
#combine banned words into one big expression
#re.escape keeps any regex metacharacters inside a word literal
#produces: r'\b(wood|chuck)\b'
banned = re.compile(rf'\b({"|".join(map(re.escape, banned_words))})\b')
data = 'How much wood could a woodchuck chuck if a woodchuck could chuck wood?'
#replace each whole-word match with one asterisk per matched character
data = banned.sub(lambda m: '*'*len(m.group(1)), data)
print(data) #How much **** could a woodchuck ***** if a woodchuck could ***** ****?
Here's a completed version of your script that uses the above method. You said you were open to pointers. The script below illustrates a number of them.
import tkinter as tk, requests, re
from bs4 import BeautifulSoup
#build the banned-word filter once, at start-up
with open('bannedwords.txt', 'r') as ban:
    #one word per line; strip surrounding whitespace and drop blank lines
    words = [word for word in map(str.strip, ban) if word]
#escape every word so regex metacharacters in the list are matched literally
#with an empty list this compiles to r'\b()\b', which only matches empty
#strings, so substituting with it leaves the text unchanged
banned = re.compile(fr'\b({"|".join(map(re.escape, words))})\b')
#create root
root = tk.Tk()
root.title('Language Filter')
#Tk geometry is 'WIDTHxHEIGHT+X+Y'; the offsets need their '+' signs
root.geometry('500x100+700+200')
#this is the only widget that needs a name
(entry := tk.Entry(root, width=80)).grid(row=0,column=0, padx=8)
#unless you intend to modify any of them, the labels and buttons don't need names
tk.Label(root, text='Enter the URL to the script on IMSDB').grid(row=1,column=0)
#default button config
btncfg = dict(width=60, bg='red', fg='white', font='Helvetica 9 bold')
#rip script ~ above rip button
def rip() -> None:
    """Download the page at the URL in the entry box and save its script text.

    The UTF-8 encoded text of the page's ``<td class="scrtext">`` element is
    written to ``script.txt``.

    Raises:
        requests.RequestException: the request failed, timed out, or the
            server answered with an HTTP error status.
        ValueError: the page contains no ``<td class="scrtext">`` element.
    """
    url = entry.get().strip()
    #a timeout keeps an unresponsive server from hanging the GUI indefinitely
    with requests.get(url, timeout=30) as resp:
        resp.raise_for_status()
        page = BeautifulSoup(resp.content, "html.parser")
    script = page.find('td', class_='scrtext')
    if script is None:
        raise ValueError(f'no <td class="scrtext"> element found at {url!r}')
    #only open (and truncate) the output once there is something to write
    with open('script.txt', 'wb') as file:
        file.write(script.get_text().encode('utf-8'))

tk.Button(root, text="Rip the Script", command=rip, **btncfg).grid(row=2,column=0)
#censor script ~ above censor button
def censor() -> None:
    """Mask banned words in ``script.txt`` and write the result to ``censored.txt``.

    Every match of the module-level ``banned`` pattern is replaced by one
    asterisk per matched character. The output is written as UTF-8.
    """
    #rip() stores the script as UTF-8, so decode it as UTF-8 instead of
    #relying on the platform's default encoding
    with open('script.txt', 'r', encoding='utf-8') as scr:
        text = scr.read()
    censored = banned.sub(lambda m: '*'*len(m.group(0)), text)
    #open the output only after the input has been read and processed
    with open('censored.txt', 'wb') as cen:
        cen.write(censored.encode('utf-8'))

tk.Button(root, text="Censor the Script", command=censor, **btncfg).grid(row=3,column=0)
#enter the Tk event loop only when this file is run directly, not on import
if __name__ == '__main__':
    root.mainloop()
Here's an identical version written as a class
import tkinter as tk, requests, re
from bs4 import BeautifulSoup
class App(tk.Tk):
    """Main window that rips a script from a URL and censors banned words in it."""

    def __init__(self, **kwargs):
        """Build the widgets and compile the banned-word filter.

        ``**kwargs`` is accepted but not used. ``bannedwords.txt`` (one word
        per line) is read from the current working directory.
        """
        super().__init__()
        self.title('Language Filter')
        #Tk geometry is 'WIDTHxHEIGHT+X+Y'; the offsets need their '+' signs
        self.geometry('500x100+700+200')

        #this is the only widget that needs a name
        self._entry = tk.Entry(self, width=80)
        self._entry.grid(row=0,column=0, padx=8)

        #unless you intend to modify any of them, the labels and buttons don't need names
        tk.Label(self, text='Enter the URL to the script on IMSDB').grid(row=1,column=0)

        #buttons
        btncfg = dict(width=60, bg='red', fg='white', font='Helvetica 9 bold')
        tk.Button(self, text="Rip the Script", command=self._rip, **btncfg).grid(row=2,column=0)
        tk.Button(self, text="Censor the Script", command=self._censor, **btncfg).grid(row=3,column=0)

        #make banned words filter
        with open('bannedwords.txt', 'r') as ban:
            self.banned = self._compile_banned(ban)

    @staticmethod
    def _compile_banned(words):
        """Return one compiled whole-word regex matching any of ``words``.

        ``words`` is any iterable of strings (e.g. the lines of a file).
        Surrounding whitespace is stripped, blank entries are dropped and each
        word is escaped so regex metacharacters are matched literally.
        """
        cleaned = [word for word in map(str.strip, words) if word]
        #with no words this is r'\b()\b', which only matches empty strings,
        #so substituting with it leaves the text unchanged
        return re.compile(fr'\b({"|".join(map(re.escape, cleaned))})\b')

    #load, parse and save script
    def _rip(self) -> None:
        """Download the page at the URL in the entry box and save its script text.

        The UTF-8 encoded text of the page's ``<td class="scrtext">`` element
        is written to ``script.txt``.

        Raises:
            requests.RequestException: the request failed, timed out, or the
                server answered with an HTTP error status.
            ValueError: the page contains no ``<td class="scrtext">`` element.
        """
        url = self._entry.get().strip()
        #a timeout keeps an unresponsive server from hanging the GUI indefinitely
        with requests.get(url, timeout=30) as resp:
            resp.raise_for_status()
            page = BeautifulSoup(resp.content, "html.parser")
        script = page.find('td', class_='scrtext')
        if script is None:
            raise ValueError(f'no <td class="scrtext"> element found at {url!r}')
        #only open (and truncate) the output once there is something to write
        with open('script.txt', 'wb') as file:
            file.write(script.get_text().encode('utf-8'))

    #censor script
    def _censor(self) -> None:
        """Mask banned words in ``script.txt`` and write the result to ``censored.txt``.

        Every match of ``self.banned`` is replaced by one asterisk per matched
        character. The output is written as UTF-8.
        """
        #_rip() stores the script as UTF-8, so decode it as UTF-8 instead of
        #relying on the platform's default encoding
        with open('script.txt', 'r', encoding='utf-8') as scr:
            text = scr.read()
        censored = self.banned.sub(lambda m: '*'*len(m.group(0)), text)
        #open the output only after the input has been read and processed
        with open('censored.txt', 'wb') as cen:
            cen.write(censored.encode('utf-8'))
#start the GUI only when this file is run directly, not on import
if __name__ == '__main__':
    app = App()
    app.mainloop()