I have written a code which extracts stop words from a text file and outputs two new text files. One file contains the stop words from that text file and another file contains the data without stop words. Now I have more than 100 text file in a folder, I would like to perform the same operation on all those file simultaneously.
For example there is a Folder A which contains 100 text file the code should be executed on all those text files simultaneously. The output should be two new text files such as 'Stop_Word_Consist_Filename.txt' and 'Stop_word_not_Filename.txt' which should be stored in a separate folder.That means for every 100 text files there will 200 output text files stored in a new folder. Please note the 'Filename' in both these output file is the actual name of the text file meaning 'Walmart.txt' should have 'Stop_Word_Consist_Walmart.txt' and 'Stop_word_not_Walmart.txt'. I did try few things and I know loop in involved giving the path directory but I didn't get any success.
Apologies for such a long question.
Following is the code for 1 file.
import numpy as np
import pandas as pd
# Pathes of source files and that for after-modifications
files_path = os.getcwd()
# another folder, your should create first to store files after modifications in
files_after_path = os.getcwd() '/' 'Stopwords_folder'
os.makedirs(files_after_path, exist_ok=True)
text_files = os.listdir(files_path)
data = pd.DataFrame(text_files)
data.columns = ["Review_text"]
import re
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
def clean_text(df):
all_reviews = list()
#lines = df["Review_text"].values.tolist()
lines = data.values.tolist()
for text in lines:
#text = text.lower()
text = [word.lower() for word in text]
pattern = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.& ]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F])) ')
text = pattern.sub('', str(text))
emoji = re.compile("["
u"\U0001F600-\U0001FFFF" # emoticons
u"\U0001F300-\U0001F5FF" # symbols & pictographs
u"\U0001F680-\U0001F6FF" # transport & map symbols
u"\U0001F1E0-\U0001F1FF" # flags (iOS)
u"\U00002702-\U000027B0"
u"\U000024C2-\U0001F251"
"] ", flags=re.UNICODE)
text = emoji.sub(r'', text)
text = re.sub(r"i'm", "i am", text)
text = re.sub(r"he's", "he is", text)
text = re.sub(r"she's", "she is", text)
text = re.sub(r"that's", "that is", text)
text = re.sub(r"what's", "what is", text)
text = re.sub(r"where's", "where is", text)
text = re.sub(r"\'ll", " will", text)
text = re.sub(r"\'ve", " have", text)
text = re.sub(r"\'re", " are", text)
text = re.sub(r"\'d", " would", text)
text = re.sub(r"\'ve", " have", text)
text = re.sub(r"won't", "will not", text)
text = re.sub(r"don't", "do not", text)
text = re.sub(r"did't", "did not", text)
text = re.sub(r"can't", "can not", text)
text = re.sub(r"it's", "it is", text)
text = re.sub(r"couldn't", "could not", text)
text = re.sub(r"have't", "have not", text)
text = re.sub(r"[,.\"!@#$%^&*(){}?/;`~:<> =-]", "", text)
tokens = word_tokenize(text)
table = str.maketrans('', '', string.punctuation)
stripped = [w.translate(table) for w in tokens]
words = [word for word in stripped if word.isalpha()]
stop_words = set(stopwords.words("english"))
stop_words.discard("not")
PS = PorterStemmer()
words = [PS.stem(w) for w in words if not w in stop_words]
words = ' '.join(words)
all_reviews.append(words)
return all_reviews,stop_words
for entry in data:
#all_reviews , stop_words = clean_text(entry)
for r in all_reviews:
if not r in stop_words:
appendFile = open(f'No_Stopwords{entry}.txt','a')
appendFile.write(" " r)
appendFile.close()
for r in stop_words:
appendFile = open(f'Stop_Word_Consist{entry}.txt','a')
appendFile.write(" " r)
appendFile.close()
all_reviews , stop_words = clean_text(entry)
UPDATE :
So I have made changes to the code. I did got two output files Stop_Word_Consist and No_Stop_word. But I am not getting the required Data inside. Meaning Stop_word consist does not have the stop words I am looking for. I am pretty sure I made some mistakes in indentation. I would appreciate the help.
CodePudding user response:
You can use OS.listdir to get the number of text files, and use a for loop to run each time. To assign a name to the output file you can use an f-string in its creation so it looks like f'Stop_Word_Consist_{fileName}':
for entry in OS.listdir(folder location):
all_reviews , stop_words = clean_text(data_1)
all_reviews[:]
for r in all_reviews:
if not r in stop_words:
appendFile = open('Stop_Word_hrb02-phil-usa.txt.txt','a')
appendFile.write(" " r)
appendFile.close()
for r in stop_words:
appendFile = open(f'Stop_Word_Consist{entry}.txt','a')
appendFile.write(" " r)
appendFile.close()