Replace words in a sentence with synonyms using Python-CodePudding

I have a dataset named news_collection.csv that contains news text. I am struggling to replace words in the dataset with synonyms from a pre-built collection called syno.txt. If a word in the dataset appears on a synonym line in syno.txt, I want to replace it with the first value of that line.

Below is the news_collection.csv

created_at,text
5/13/2021 3:27:55 PM,"my mom went with her mommy to bring the food for us"
5/13/2021 3:27:55 PM,"that is my dad and haven't your dada talk to my father"

Below is the syno.txt

mother, mommy, mom, ma
father, dad, daddy, dada

Below is the expected result

created_at,text
5/13/2021 3:27:55 PM,"my mother went with her mother to bring the food for us"
5/13/2021 3:27:55 PM,"that is my father and haven't your father talk to my father"

Below is what I have tried up to now

import pandas as pd
import re
from nltk.tokenize import word_tokenize


def similarity():
    """Load news_collection.csv and replace synonyms in its ``text`` column.

    Reads the CSV, keeps the ``created_at`` and ``text`` columns, parses
    ``created_at`` as datetimes, converts every ``text`` value to ``str`` and
    runs each one through ``replacesynonyms``.

    Returns:
        The resulting DataFrame.
    """
    tweets = pd.read_csv(r'news_collection.csv')
    df = pd.DataFrame(tweets, columns=['created_at', 'text'])
    df['created_at'] = pd.to_datetime(df['created_at'])
    df['text'] = df['text'].apply(lambda x: str(x))
    df["text"] = df["text"].apply(lambda x: replacesynonyms(x))

    # The return must be indented into the function body; at column 0 it is
    # a "'return' outside function" SyntaxError.
    return df

def replacesynonyms(text):
    """Replace words of *text* with synonyms from syno.txt (not yet written)."""
    file = open('syno.txt', 'r', encoding="utf8")
    # code to be added

Can someone help me solve this?

CodePudding user response：

Try this:

def similarity():
    """Load the news CSV and return it with synonyms replaced in ``text``.

    Reads ``news_collection.csv``, keeps the ``created_at`` and ``text``
    columns, parses ``created_at`` as datetimes, coerces every ``text`` value
    to ``str`` and passes each one through ``replacesynonyms``.
    """
    raw = pd.read_csv(r'news_collection.csv')
    df = pd.DataFrame(raw, columns=['created_at', 'text'])
    df['created_at'] = pd.to_datetime(df['created_at'])
    # Stringify first so replacesynonyms always receives a str.
    df['text'] = df['text'].apply(str).apply(replacesynonyms)
    return df


def create_sets(path='syno.txt'):
    """Read the synonym file and return one set of words per line.

    Each line of the file is a comma-separated list of synonyms. Every line
    (a blank one included) yields exactly one set, so the result has the same
    length and order as the file's lines and stays index-aligned with
    ``create_syn_list``.

    Args:
        path: Location of the synonym file; defaults to ``syno.txt``.

    Returns:
        A list of sets, each holding the whitespace-stripped words of one line.
    """
    # The context manager closes the file; previously the handle was leaked.
    with open(path, 'r', encoding="utf8") as file:
        return [{word.strip() for word in line.split(',')} for line in file]


def create_syn_list(path='syno.txt'):
    """Return the first (canonical) word of every line in the synonym file.

    Each line of the file is a comma-separated list of synonyms; the first
    entry is the word the others are replaced with. One entry is produced per
    line, in file order.

    Args:
        path: Location of the synonym file; defaults to ``syno.txt``.

    Returns:
        A list with the whitespace-stripped first word of each line.
    """
    # The context manager closes the file; previously the handle was leaked.
    with open(path, 'r', encoding="utf8") as file:
        return [line.split(',')[0].strip() for line in file]

# Load the synonym data once at import time so replacesynonyms() can reuse it
# for every row. Both lists get one entry per line of syno.txt, so index i in
# lists_sets corresponds to index i in first_syn_list.
lists_sets = create_sets()
first_syn_list = create_syn_list()


def replacesynonyms(text):
    """Return *text* with each known synonym swapped for its canonical word.

    The text is split on whitespace; every token found in one of the
    module-level ``lists_sets`` is replaced by the entry of
    ``first_syn_list`` at the same index, and all other tokens are kept
    unchanged. Tokens are re-joined with single spaces.
    """
    def canonical(token):
        # The first synonym group containing the token wins.
        for position, group in enumerate(lists_sets):
            if token in group:
                return first_syn_list[position]
        return token

    return ' '.join(canonical(token) for token in text.split())

# Run the pipeline and print each rewritten sentence on its own line.
df = similarity()
sen = df['text'].tolist()
for sentence in sen:
    print(sentence)