I have a column of words, but currently I get just one row — how can I get all the words? The problem is in the stemmer: it returns only one word instead of all of them.
My purpose is to clean the data and print all words separated by commas.
Input: each row of the column df["Tag"] contains word1,word2,word3,word4,word5
and the output should be one long list with all the values: word1,word2,word3,word4,word5,word6,word7....
from nltk.corpus import stopwords
import re
from nltk.stem import PorterStemmer
import pandas as pd
import spacy
import pytextrank
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
def Clean_stop_words(data):
    """Attempted cleanup: drop stopwords, strip symbols, stem the words.

    NOTE(review): reconstructed formatting/operators lost by the markdown
    rendering; the logic bugs below are the ones the question is about.
    """
    #print(stopwords.words('english'))
    stop_words = stopwords.words('english')
    new_data = ""
    for word in data:
        # BUG: return value of np.char.lower is discarded, so nothing is
        # lowercased (and `np` is never imported in this snippet).
        np.char.lower(word)
        if word not in stop_words:
            # BUG: concatenates onto `data` instead of accumulating into
            # `new_data`, so previous words are thrown away each iteration.
            new_data = data + " , " + word
    print(new_data)
    symbols = "!\"#$%&()* -./:;<=>?@[\]^_`{|}~\n"
    for i in symbols:
        # BUG: `new_text` is undefined — presumably `new_data` was meant.
        new_data = np.char.replace(new_text, i, ' ')
    #print(data)
    stemmer = PorterStemmer()
    # BUG: this runs AFTER the loop, so only the final value of `word` is
    # stemmed — this is why only one word comes out instead of all of them.
    new_data = stemmer.stem(word)
    #print(new_data)

Clean_stop_words(df["Tag"])
#print(data)
Thank you in advance
CodePudding user response:
Notice -
I decided to clean the special characters with regex, you can change the method if you wish.
Moreover, please look at the apply function of pandas that takes each row and executes the Clean_stop_words function.
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import numpy as np
import pandas as pd
import re
# One-row sample frame whose "Tag" cell holds a comma-separated word list
# (deliberately noisy, to exercise the cleaning step).
sample_rows = ["'word1,wording,w#ord,he##llo,sleeping,don't"]
df = pd.DataFrame({"Tag": sample_rows})
def Clean_stop_words(data):
    """Clean one comma-separated string of words.

    Splits *data* on commas, lowercases each word, strips every
    non-alphanumeric character, drops English stopwords and empty
    leftovers, stems what remains, and returns the words joined
    by " , ".

    Parameters
    ----------
    data : str
        Comma-separated words, e.g. "word1,word2,word3".

    Returns
    -------
    str
        Cleaned, stemmed words separated by " , ".
    """
    stemmer = PorterStemmer()
    # set() gives O(1) membership tests instead of scanning a list per word
    stop_words = set(stopwords.words('english'))
    cleaned = []
    for raw in data.split(','):
        # keep the lowered value (the original discarded np.char.lower's result)
        # and collapse runs of non-alphanumeric characters to nothing
        word = re.sub(r'[^A-Za-z0-9]+', '', raw.lower())
        # skip words that cleaning emptied out, and stopwords
        if word and word not in stop_words:
            # keep the stemmed value (the original discarded stemmer.stem's result)
            cleaned.append(stemmer.stem(word))
    # join once instead of quadratic string concatenation
    return " , ".join(cleaned)
# Run the cleaner element-wise over the tag column, then display the result.
df['Tag'] = df['Tag'].map(Clean_stop_words)
print(df['Tag'])