How do I use r"\b \b" when replacing text in a file while working with a list?-CodePudding

from bs4 import BeautifulSoup as bs
import requests
import tkinter as tk

# Create the main window.
root = tk.Tk()
root.title('Language Filter')
# Tk geometry strings are 'WIDTHxHEIGHT+XOFFSET+YOFFSET'; the offsets need
# explicit signs, space-separated numbers are rejected with a TclError.
root.geometry('500x100+700+200')

def rip():
    """Fetch the page whose URL is in the entry box and save its script text.

    The text of the page's <td class="scrtext"> element is written to
    script.txt in the current directory.
    """
    url = scriptE.get()
    response = requests.get(url)
    soup = bs(response.content, "html.parser")
    script_cell = soup.find('td', class_='scrtext')

    with open('script.txt', 'w') as out:
        out.write(script_cell.get_text())


def censor():
    """Replace every banned word in script.txt with "****" and save to file.txt.

    Banned words are read from bannedwords.txt, one per line. Matching uses
    plain str.replace, so a banned word is also replaced when it appears
    inside a longer word.
    """
    with open('bannedwords.txt', 'r') as words_file:
        # Skip blank lines: replacing the empty string would insert "****"
        # between every character of the script.
        lines = [line for line in words_file.read().split('\n') if line]

    with open('script.txt', 'r') as script_file:
        filedata = script_file.read()

    for x in lines:
        filedata = filedata.replace(x, "****")

    # Write the result once, after all replacements, instead of rewriting
    # the whole file for every banned word.
    with open('file.txt', 'w') as out_file:
        out_file.write(filedata)


# The URL entry box sits in the top row; its caption goes in the row below.
scriptL = tk.Label(root, text='Enter the URL to the script on IMSDB')
scriptL.grid(row=1, column=0)
scriptE = tk.Entry(root, width=80)
scriptE.grid(row=0, column=0, padx=8)

# Options shared by both action buttons.
button_options = {
    'width': 60,
    'bg': 'red',
    'fg': 'white',
    'font': ('helvetica', 9, 'bold'),
}

scriptB = tk.Button(root, text="Rip the Script", command=rip, **button_options)
scriptB.grid(row=2, column=0)
scriptC = tk.Button(root, text="Censor the Script", command=censor, **button_options)
scriptC.grid(row=3, column=0)

# Start the Tk event loop.
root.mainloop()

Hello, I'm trying to create a simple program that rips a movie script from IMSDB with one click and then censors it using a list of profane words contained in another text file. The rip function works and the censor function works, but it censors every occurrence of a listed word, even when that word is only part of a longer word.

For example, the word "hello" becomes "****o". I know that you can normally use r"\bstring\b" in a regular expression to tell Python you only want whole words that are exactly that string, and not any longer word that contains it.

Since I am using a list in filedata.replace(x, "****"), how do I use r"\bstring\b" or some variation of it to ensure words like "ass" get censored, but words like "classic" won't become "cl****ic"?

Here is a script I was using https://imsdb.com/scripts/Joker.html

The BannedWords text file is fairly lengthy but I guess creating a .txt with a few swear words in it should be enough to test if needed.

I'm new to Python and this was just a mini-exercise for me. I'm also open to advice on my coding format or maybe some redundant things inside of it. Thanks in advance.

CodePudding user response：

You will need to use regex for this. You can be a bit clever and turn the entire banned list into one big regex. Then you don't need to worry about loops. Since the first argument of sub can either be the replacement string or a function, we can use a lambda to return an asterisk for every character in the match.

This is arguably the most efficient way to do this. Regex is slow. If you turned every banned word into a separate expression and looped them, it would crawl to the finish line. Don't keep remaking the expression, either. Make it once, and reuse it as much as you need.

import re

banned = ['wood', 'chuck']

#combine banned words into one big expression
#each word is escaped so any regex metacharacters in it are matched literally
#produces: r'\b(wood|chuck)\b'
banned = re.compile(rf'\b({"|".join(map(re.escape, banned))})\b')

data = 'How much wood could a woodchuck chuck if a woodchuck could chuck wood?'
#replace each whole-word match with one asterisk per matched character
data = banned.sub(lambda m: '*'*len(m.group(1)), data)

print(data) #How much **** could a woodchuck ***** if a woodchuck could ***** ****?

Here's a completed version of your script that uses the above method. You said you were open to pointers. The script below illustrates a number of them.

import tkinter as tk, requests, re
from bs4 import BeautifulSoup

#build one compiled pattern that matches any banned word as a whole word
#bannedwords.txt holds one word (or phrase) per line
with open('bannedwords.txt', 'r') as ban:
    banned = [word.strip() for word in ban.read().splitlines()]
#skip blank lines and escape each word so regex metacharacters match literally
banned = [re.escape(word) for word in banned if word]
banned = re.compile(fr'\b({"|".join(banned)})\b')

#create root
root = tk.Tk()
root.title('Language Filter')
#geometry is 'WIDTHxHEIGHT+XOFFSET+YOFFSET' ~ the offsets need explicit signs
#Tk raises TclError for space-separated offsets
root.geometry('500x100+700+200')

#this is the only widget that needs a name
(entry := tk.Entry(root, width=80)).grid(row=0,column=0, padx=8)

#unless you intend to modify any of them, the labels and buttons don't need names
tk.Label(root, text='Enter the URL to the script on IMSDB').grid(row=1,column=0)

#default button config
btncfg = dict(width=60, bg='red', fg='white', font='Helvetica 9 bold')

#rip script ~ above rip button
def rip() -> None:
    """Download the page named in the entry box and save its script text.

    The text of the page's <td class="scrtext"> element is written to
    script.txt as UTF-8.

    Raises:
        requests.RequestException: the request failed, timed out or returned
            an HTTP error status.
        ValueError: the page has no <td class="scrtext"> element.
    """
    url = entry.get().strip()
    with requests.get(url, timeout=30) as resp:
        resp.raise_for_status()
        page = BeautifulSoup(resp.content, "html.parser")

    script = page.find('td', class_='scrtext')
    if script is None:
        raise ValueError(f'no <td class="scrtext"> element found at {url!r}')

    #open the output only once there is something to write, so a failed
    #download or parse does not wipe an existing script.txt
    with open('script.txt', 'wb') as file:
        file.write(script.get_text().encode('utf-8'))

#button that runs rip(), styled with the shared button config
tk.Button(
    root,
    text="Rip the Script",
    command=rip,
    **btncfg
).grid(row=2, column=0)

#censor script ~ above censor button
def censor() -> None:
    """Write a censored copy of script.txt to censored.txt.

    Every whole-word match of the compiled ``banned`` pattern is replaced by
    one asterisk per matched character. The output is written as UTF-8.
    """
    #rip() writes script.txt as UTF-8, so read it back as UTF-8 rather than
    #with the platform default encoding
    with open('script.txt', 'r', encoding='utf-8') as scr:
        text = scr.read()

    censored = banned.sub(lambda m: '*'*len(m.group(1)), text)

    #open the output only after the input was read successfully
    with open('censored.txt', 'wb') as cen:
        cen.write(censored.encode('utf-8'))

#button that runs censor(), styled with the shared button config
tk.Button(
    root,
    text="Censor the Script",
    command=censor,
    **btncfg
).grid(row=3, column=0)

#start the Tk event loop only when this file is run as a script
if __name__ == '__main__':
    root.mainloop()

Here's an identical version written as a class

import tkinter as tk, requests, re
from bs4 import BeautifulSoup

    
class App(tk.Tk):
    """Language Filter window: rips a script from a URL and censors banned words.

    The banned-word pattern is built once from bannedwords.txt when the
    window is created and is reused for every censor run.
    """

    def __init__(self, **kwargs):
        #pass keyword arguments on to tk.Tk instead of silently dropping them
        super().__init__(**kwargs)

        self.title('Language Filter')
        #geometry is 'WIDTHxHEIGHT+XOFFSET+YOFFSET' ~ the offsets need explicit
        #signs, Tk raises TclError for space-separated offsets
        self.geometry('500x100+700+200')

        #this is the only widget that needs a name
        self._entry = tk.Entry(self, width=80)
        self._entry.grid(row=0,column=0, padx=8)

        #unless you intend to modify any of them, the labels and buttons don't need names
        tk.Label(self, text='Enter the URL to the script on IMSDB').grid(row=1,column=0)

        #buttons
        btncfg = dict(width=60, bg='red', fg='white', font='Helvetica 9 bold')
        tk.Button(self, text="Rip the Script", command=self._rip, **btncfg).grid(row=2,column=0)
        tk.Button(self, text="Censor the Script", command=self._censor, **btncfg).grid(row=3,column=0)

        #make banned words filter ~ one word (or phrase) per line
        with open('bannedwords.txt', 'r') as ban:
            words = [word.strip() for word in ban.read().splitlines()]
        #skip blank lines and escape each word so regex metacharacters match literally
        words = [re.escape(word) for word in words if word]
        self.banned = re.compile(fr'\b({"|".join(words)})\b')

    #load, parse and save script
    def _rip(self) -> None:
        """Download the page named in the entry box and save its script text.

        The text of the page's <td class="scrtext"> element is written to
        script.txt as UTF-8.

        Raises:
            requests.RequestException: the request failed, timed out or
                returned an HTTP error status.
            ValueError: the page has no <td class="scrtext"> element.
        """
        url = self._entry.get().strip()
        with requests.get(url, timeout=30) as resp:
            resp.raise_for_status()
            page = BeautifulSoup(resp.content, "html.parser")

        script = page.find('td', class_='scrtext')
        if script is None:
            raise ValueError(f'no <td class="scrtext"> element found at {url!r}')

        #open the output only once there is something to write, so a failed
        #download or parse does not wipe an existing script.txt
        with open('script.txt', 'wb') as file:
            file.write(script.get_text().encode('utf-8'))

    #censor script
    def _censor(self) -> None:
        """Write a censored copy of script.txt to censored.txt.

        Every whole-word match of ``self.banned`` is replaced by one asterisk
        per matched character. The output is written as UTF-8.
        """
        #_rip writes script.txt as UTF-8, so read it back as UTF-8 rather
        #than with the platform default encoding
        with open('script.txt', 'r', encoding='utf-8') as scr:
            text = scr.read()

        censored = self.banned.sub(lambda m: '*'*len(m.group(1)), text)

        #open the output only after the input was read successfully
        with open('censored.txt', 'wb') as cen:
            cen.write(censored.encode('utf-8'))


#build the window and start the Tk event loop only when run as a script
if __name__ == '__main__':
    app = App()
    app.mainloop()