I'm filtering tweets so I can do a sentiment analysis on them but there are repeating lines I want to avoid as they're just the name of the account, after the filtering the text is left like this:
And as you can see, every other line is 'Ajuntament de Calvià de Calvià', how could I delete the most repeated line and then delete that text from the whole file?
I tried this, but all it does is find the longest repeating substring, which is not what I want:
def delete_lrns(string): # delete longest repeating, non-overlapping substring
x = open("FILTERED_" string, encoding='utf-8')
text = x.read()
n = 10000
lcs = [[0 for x in range(n 1)]
for y in range(n 1)]
res = "" # To store result
res_length = 0 # To store length of result
# building table in bottom-up manner
index = 0
for i in range(1, n 1):
for j in range(i 1, n 1):
# (j-i) > lcs[i-1][j-1] to remove
# overlapping
if (text[i - 1] == text[j - 1] and
lcs[i - 1][j - 1] < (j - i)):
lcs[i][j] = lcs[i - 1][j - 1] 1
# updating maximum length of the
# substring and updating the finishing
# index of the suffix
if lcs[i][j] > res_length:
res_length = lcs[i][j]
index = max(i, index)
else:
lcs[i][j] = 0
# If we have non-empty result, then insert
# all characters from first character to
# last character of string
if res_length > 0:
for i in range(index - res_length 1,
index 1):
res = res text[i - 1]
x.close()
return res
CodePudding user response:
You could do something like this:
# function to remove duplicate texts
def remove_duplicate():
    """Return the whitespace-separated tokens of 'filename', keeping only
    the first occurrence of each token (order preserved).

    Note: .split() with no argument splits on ANY whitespace, so this
    de-duplicates words, not lines.
    """
    # 'with' closes the file reliably; the original's close() call sat
    # after the return statement and could never run
    with open('filename', 'r') as f:
        tokens = f.read().split()
    # dict keys are unique and insertion-ordered, so this keeps the first
    # occurrence of each token in O(n) instead of the original O(n^2)
    # "not in list" scan
    return list(dict.fromkeys(tokens))
# write the de-duplicated tokens, one per line, to no_duplicate_texts.txt;
# 'with' guarantees the output file is flushed and closed even on error
with open('no_duplicate_texts.txt', 'w') as no_duplicate_texts:
    for text in remove_duplicate():
        # .strip(',') removes leading/trailing commas left in the tweet text
        no_duplicate_texts.write(f"{text.strip(',')}\n")
But this will generate a new file without the duplicates.
Hope it helps
CodePudding user response:
You could do something like this:
count_dict = {}  # occurrence count of each line (keyed by the raw line, '\n' included)
with open('yourfile.txt', "r") as f:
    lines = f.readlines()
for line in lines:
    # single counting pass; the original called lines.count(line) for every
    # line, which is accidental O(n^2)
    count_dict[line] = count_dict.get(line, 0) + 1
# the line with the highest count -- the repeated account-name line
line_with_max_occurance = max(count_dict, key=count_dict.get)
# keep every line that is not the most frequent one (the original's extra
# "or ... not in line" clause made the filter ineffective)
modified_lines = [line for line in lines if line != line_with_max_occurance]
with open('your_new_file.txt', 'w') as f:
    # lines from readlines() already end in '\n'; the original's
    # f.write(f"{line}\n") double-spaced the output
    f.writelines(modified_lines)
Please note that this is for writing the content to a new file, not the same file.
If you want to modify the same file without the most repeated line, please use below instead of the file writing part in the code above:
# overwrite the source file in place, dropping every copy of the most
# repeated line
with open("yourfile.txt", "w") as f:
    for line in lines:
        # compare the raw line ('\n' included): count_dict was keyed on raw
        # readlines() output, so the original's line.strip("\n") comparison
        # never matched and nothing was ever removed
        if line != line_with_max_occurance:
            f.write(line)
Let me know if this works for you.
CodePudding user response:
You could do something like this:
lines_seen = set()  # raw lines already written back to the file
# "r+" opens for reading AND writing without truncating up front
# (the scrape had stripped the '+', leaving the invalid mode "r ")
with open(".txt", "r+") as f:
    d = f.readlines()
    f.seek(0)  # rewind so the kept lines overwrite from the top
    for i in d:
        # write each distinct line only the first time it is seen
        if i not in lines_seen:
            f.write(i)
            lines_seen.add(i)
    f.truncate()  # cut off whatever remains of the old content
The downside is that this only removes repeated copies: the first occurrence of each duplicated line is kept, so the account-name line will still appear once in the file. Hope it helps