I have a column of words, but currently I get just one row — how can I get all the words? The problem is in the stemmer: it returns only one word instead of all of them.
My purpose is to clean the data and print all words separated by commas.
Input: each row of the column df["Tag"] contains word1,word2,word3,word4,word5
and the output should be one long list with all the values: word1,word2,word3,word4,word5,word6,word7....
from nltk.corpus import stopwords
import re
from nltk.stem import PorterStemmer
import pandas as pd
import spacy
import pytextrank
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
def Clean_stop_words(data):
    """Attempted cleanup: drop stopwords, strip symbols, stem the words.

    NOTE(review): reconstructed formatting/operators lost by the markdown
    rendering; the logic bugs below are the ones the question is about.
    """
    #print(stopwords.words('english'))
    stop_words = stopwords.words('english')
    new_data = ""
    for word in data:
        # BUG: return value of np.char.lower is discarded, so nothing is
        # lowercased (and `np` is never imported in this snippet).
        np.char.lower(word)
        if word not in stop_words:
            # BUG: concatenates onto `data` instead of accumulating into
            # `new_data`, so previous words are thrown away each iteration.
            new_data = data + " , " + word
    print(new_data)
    symbols = "!\"#$%&()* -./:;<=>?@[\]^_`{|}~\n"
    for i in symbols:
        # BUG: `new_text` is undefined — presumably `new_data` was meant.
        new_data = np.char.replace(new_text, i, ' ')
    #print(data)
    stemmer = PorterStemmer()
    # BUG: this runs AFTER the loop, so only the final value of `word` is
    # stemmed — this is why only one word comes out instead of all of them.
    new_data = stemmer.stem(word)
    #print(new_data)

Clean_stop_words(df["Tag"])
#print(data)
Thank you in advance
CodePudding user response:
Notice -
I decided to clean the special characters with regex, you can change the method if you wish.
Moreover, please look at the apply function of pandas that takes each row and executes the Clean_stop_words function.
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import numpy as np
import pandas as pd
import re
# One-row sample frame whose "Tag" cell holds a comma-separated word list
# (deliberately noisy, to exercise the cleaning step).
sample_rows = ["'word1,wording,w#ord,he##llo,sleeping,don't"]
df = pd.DataFrame({"Tag": sample_rows})
def Clean_stop_words(data):
    """Clean one comma-separated string of words.

    Splits *data* on commas, lowercases each word, strips every
    non-alphanumeric character, drops English stopwords and empty
    leftovers, stems what remains, and returns the words joined
    by " , ".

    Parameters
    ----------
    data : str
        Comma-separated words, e.g. "word1,word2,word3".

    Returns
    -------
    str
        Cleaned, stemmed words separated by " , ".
    """
    stemmer = PorterStemmer()
    # set() gives O(1) membership tests instead of scanning a list per word
    stop_words = set(stopwords.words('english'))
    cleaned = []
    for raw in data.split(','):
        # keep the lowered value (the original discarded np.char.lower's result)
        # and collapse runs of non-alphanumeric characters to nothing
        word = re.sub(r'[^A-Za-z0-9]+', '', raw.lower())
        # skip words that cleaning emptied out, and stopwords
        if word and word not in stop_words:
            # keep the stemmed value (the original discarded stemmer.stem's result)
            cleaned.append(stemmer.stem(word))
    # join once instead of quadratic string concatenation
    return " , ".join(cleaned)
# Run the cleaner element-wise over the tag column, then display the result.
df['Tag'] = df['Tag'].map(Clean_stop_words)
print(df['Tag'])