How can I write a function that cleans a given text passed into a DataFrame? The text will be my variable, so I can pass in any sentence and the function will clean it by converting it to lower case, removing special characters, etc. My attempt looks like this:
def my_function(x):
# Attempts to clean the text held in the first column (iloc[:, 0]) of the
# DataFrame x.
# NOTE(review): every statement below builds a new Series and discards it;
# nothing is assigned back to x and there is no return statement, so calling
# this function evaluates to None and x is left as it was.
# Applies a few cleaning steps to the exceptions df:
# Sets text to lower case:
x.iloc[:, 0].str.lower()
# Removes breaks:
x.iloc[:, 0].replace(r'\n', ' ', regex=True)
# Sets text to lower case (same call as the first step):
x.iloc[:, 0].str.lower()
# Removes a more extensive set of 'special' characters:
# NOTE(review): the list contains " " (replaced by itself in the loop) and
# lists "*" twice.
remove_these = ["!",'"',"%","&","'","(",")","#","*","?",
" ",",","-",".","/",":",";","<","=",">",
"@","[","\\","]","^","_","`","{","|","}",
"~","–","’", "*"]
for char in remove_these:
x.iloc[:, 0].str.replace(char, ' ')
# Removes numbers:
# NOTE(review): the pattern matches one digit followed by a space.
x.iloc[:, 0].replace(r'\d ', ' ', regex=True)
# Removes single characters:
x.iloc[:, 0].replace(r'\b[a-zA-Z]\b', ' ', regex=True)
# Removes extra spaces (trim) from both ends:
x.iloc[:, 0].str.strip()
# Removes double spacing:
# NOTE(review): as written, pattern and replacement are both one space.
x.iloc[:, 0].replace(r' ', ' ', regex=True)
# Removes spaces --:
x.iloc[:, 0].replace(r'--', '', regex=True)
Since the text variable would be passed in as a DataFrame, I thought of always using the first column, hence the iloc[:, 0].
Then my variable text would be set like this:
# The raw sentence to be cleaned.
my_variable = "WHAT A WONDERFUL WORLD!"
# Wraps the sentence in a DataFrame; {my_variable} is a set literal holding
# the single string.
df_Text = pd.DataFrame({my_variable})
But when I apply this, it doesn't work; the output is 'None':
# Runs the cleaning function on the one-cell DataFrame and keeps whatever
# the call returns.
output = my_function(df_Text)
# Prints the value returned by my_function.
print(output)
What am I doing wrong? Thanks a lot.
CodePudding user response:
Your function doesn't actually alter the dataframe in any way, and it doesn't return anything.
Try this.
import pandas as pd
def my_function(x):
    """Clean the text stored in the first column of a DataFrame.

    The cleaning steps, applied in order, are:

    1. convert the text to lower case;
    2. replace every "special" character listed in ``remove_these`` with a
       space;
    3. replace every run of digits with a space;
    4. replace stand-alone single letters (e.g. "a") with a space;
    5. collapse every run of whitespace (spaces, line breaks, tabs) into a
       single space and trim both ends.

    Args:
        x: A pandas DataFrame whose first column (``iloc[:, 0]``) holds the
            strings to clean.

    Returns:
        The same DataFrame object ``x``. Its first column is overwritten in
        place with the cleaned text, so the caller's frame is modified too.
    """
    remove_these = ["!", '"', "%", "&", "'", "(", ")", "#", "*", "?",
                    ",", "-", ".", "/", ":", ";", "<", "=", ">",
                    "@", "[", "\\", "]", "^", "_", "`", "{", "|", "}",
                    "~", "–", "’"]
    # A translation table swaps all special characters in a single pass and
    # treats each one literally. Several of them ("(", "*", "?", "[", "\\")
    # are regex metacharacters, so passing them to str.replace one by one
    # depends on the pandas version's default for the `regex` argument.
    to_space = str.maketrans({char: " " for char in remove_these})

    text = x.iloc[:, 0].str.lower()
    text = text.str.translate(to_space)
    # r'\d+' removes digits wherever they occur, including at the end of the
    # string or glued to letters (not only digits followed by a space).
    text = text.str.replace(r"\d+", " ", regex=True)
    text = text.str.replace(r"\b[a-zA-Z]\b", " ", regex=True)
    # Collapse whitespace last: the steps above leave runs of spaces behind,
    # and r'\s+' also takes care of line breaks. Trimming afterwards ensures
    # no leading/trailing space survives. No separate '--' step is needed
    # because "-" has already been replaced above.
    text = text.str.replace(r"\s+", " ", regex=True).str.strip()

    x.iloc[:, 0] = text
    return x
# Example run: wrap one sentence in a single-cell DataFrame, clean it with
# my_function, and print the returned frame.
my_variable = "WHAT A WONDERFUL WORLD!"
# Pass a list rather than a set literal ({my_variable}): a set is unordered,
# while a list yields the same one-row frame and keeps row order if more
# sentences are added later.
df_Text = pd.DataFrame([my_variable])
output = my_function(df_Text)
print(output)
0
0 what wonderful world