I have a dataset named news_collection.csv that contains news text. What I am struggling to do is replace words in the dataset with synonyms from a pre-built collection called syno.txt. If a word in the dataset appears on a line of syno.txt, I want to replace it with the first value on that synonym line.
Below is the news_collection.csv
created_at,text
5/13/2021 3:27:55 PM,"my mom went with her mommy to bring the food for us"
5/13/2021 3:27:55 PM,"that is my dad and haven't your dada talk to my father"
Below is the syno.txt
mother, mommy, mom, ma
father, dad, daddy, dada
Below is the expected result
created_at,text
5/13/2021 3:27:55 PM,"my mother went with her mother to bring the food for us"
5/13/2021 3:27:55 PM,"that is my father and haven't your father talk to my father"
Below is what I have tried up to now
import pandas as pd
import re
from nltk.tokenize import word_tokenize
def similarity():
    """Load ``news_collection.csv`` and return it with synonyms replaced.

    Reads the CSV, keeps the ``created_at`` and ``text`` columns, parses
    ``created_at`` as datetimes, converts every ``text`` cell to ``str`` and
    then passes each text through ``replacesynonyms``.

    Returns:
        The resulting ``pandas.DataFrame``.
    """
    raw = pd.read_csv(r'news_collection.csv')
    frame = pd.DataFrame(raw, columns=['created_at', 'text'])
    frame['created_at'] = pd.to_datetime(frame['created_at'])
    # Convert to str first so replacesynonyms always receives a string.
    frame['text'] = frame['text'].apply(str)
    frame["text"] = frame["text"].apply(replacesynonyms)
    return frame
# Unfinished stub from the question: it opens syno.txt for reading but does
# not yet read it, replace any word in `text`, or return a value.
def replacesynonyms(text):
# NOTE(review): this handle is opened but never closed in the lines shown.
file = open('syno.txt', 'r', encoding="utf8")
# NOTE(review): the placeholder below uses `//`, which is not Python comment
# syntax; it marks where the replacement logic is still missing.
//code to be added
Can someone help me implement this algorithm?
CodePudding user response:
Try this:
def similarity():
    """Build the news DataFrame with every synonym replaced.

    Steps: read ``news_collection.csv``, select the ``created_at`` and
    ``text`` columns, parse ``created_at`` into datetimes, convert ``text``
    cells to ``str``, and run ``replacesynonyms`` over each text.

    Returns:
        The processed ``pandas.DataFrame``.
    """
    loaded = pd.read_csv(r'news_collection.csv')
    news = pd.DataFrame(loaded, columns=['created_at', 'text'])
    news['created_at'] = pd.to_datetime(news['created_at'])
    # replacesynonyms calls text.split(), so make sure each cell is a str.
    news['text'] = news['text'].apply(str)
    news["text"] = news["text"].apply(replacesynonyms)
    return news
def create_sets(path='syno.txt'):
    """Read the synonym file and return one set of words per line.

    Each line of the file is a comma-separated group of synonyms; every word
    is stripped of surrounding whitespace before being stored.

    Args:
        path: Location of the synonym file. Defaults to ``'syno.txt'`` in the
            current working directory.

    Returns:
        A list containing one ``set`` of words per line, in file order. Every
        line yields an entry (a blank line yields ``{''}``) so that list
        positions stay aligned with the output of ``create_syn_list``.
    """
    # The context manager guarantees the file handle is closed after reading.
    with open(path, 'r', encoding="utf8") as file:
        return [{word.strip() for word in line.split(',')} for line in file]
def create_syn_list(path='syno.txt'):
    """Return the first word of every line in the synonym file.

    The first comma-separated entry on a line is the word that all other
    synonyms on that line are replaced with.

    Args:
        path: Location of the synonym file. Defaults to ``'syno.txt'`` in the
            current working directory.

    Returns:
        A list with one stripped word per line, in file order. Every line
        yields an entry (a blank line yields ``''``) so that list positions
        stay aligned with the output of ``create_sets``.
    """
    # The context manager guarantees the file handle is closed after reading.
    with open(path, 'r', encoding="utf8") as file:
        return [line.split(',')[0].strip() for line in file]
# Load the synonym data once at module level; replacesynonyms reads these two
# lists for every text it processes.
# lists_sets[i] is the set of all words found on line i of syno.txt.
lists_sets = create_sets()
# first_syn_list[i] is the first word on line i of syno.txt, i.e. the word
# that replaces any member of lists_sets[i].
first_syn_list = create_syn_list()
def replacesynonyms(text, synonym_sets=None, canonical_names=None):
    """Replace every known synonym in ``text`` with its canonical word.

    The text is split on whitespace and each token is looked up in the
    synonym groups; the first group containing the token wins. A token that
    does not match verbatim is retried with leading/trailing punctuation
    removed, so ``"mom,"`` becomes ``"mother,"`` and ``"(dad)"`` becomes
    ``"(father)"``. Punctuation inside a token (``"haven't"``) is untouched.

    Args:
        text: The sentence to process.
        synonym_sets: Sequence of word sets, one per synonym group. Defaults
            to the module-level ``lists_sets``.
        canonical_names: Sequence of replacement words, aligned by position
            with ``synonym_sets``. Defaults to the module-level
            ``first_syn_list``.

    Returns:
        The processed tokens joined by single spaces (any run of whitespace
        in the input is therefore collapsed to one space).
    """
    import string

    if synonym_sets is None:
        synonym_sets = lists_sets
    if canonical_names is None:
        canonical_names = first_syn_list

    def canonical_for(candidate):
        # First matching group wins, mirroring the order of the synonym file.
        for canonical, group in zip(canonical_names, synonym_sets):
            if candidate in group:
                return canonical
        return None

    rewritten = []
    for token in text.split():
        replacement = canonical_for(token)
        if replacement is None:
            # Retry without surrounding punctuation, then put it back so the
            # sentence keeps its commas, quotes, brackets, etc.
            core = token.strip(string.punctuation)
            if core and core != token:
                canonical = canonical_for(core)
                if canonical is not None:
                    lead = len(token) - len(token.lstrip(string.punctuation))
                    replacement = (
                        token[:lead] + canonical + token[lead + len(core):]
                    )
        rewritten.append(token if replacement is None else replacement)
    return ' '.join(rewritten)
# Run the whole pipeline and print each processed text on its own line.
df = similarity()
sen = df['text'].tolist()
for sentence in sen:
    print(sentence)