I am trying to remove emojis from a column in a pandas DataFrame, using this code:
def remove_emoji(string: str) -> str:
    """Return *string* with every run of emoji/pictograph characters removed.

    Args:
        string: Text to clean.

    Returns:
        A copy of ``string`` in which each maximal run of characters from the
        code-point ranges below has been deleted. Whitespace around the
        removed characters is left untouched.
    """
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002702-\U000027B0"  # dingbats
        # NOTE: this last range is very broad (U+24C2 .. U+1F251); besides
        # enclosed characters it also spans CJK, Hangul and many other
        # non-emoji code points, so those are removed as well.
        "\U000024C2-\U0001F251"
        # The quantifier must be "+" (one or more). With a literal space here
        # an emoji was only removed when a space happened to follow it.
        "]+",
        flags=re.UNICODE,
    )
    return emoji_pattern.sub("", string)
def decontracted(phrase: str) -> str:
    """Clean a tweet-like string: strip URLs, @mentions and non-ASCII text.

    Args:
        phrase: Raw text to clean.

    Returns:
        The cleaned text after, in order: whitespace collapsing, URL removal,
        @mention removal, non-ASCII removal, and the ``remove_accented_chars``,
        ``remove_special_characters`` and ``remove_emoji`` helpers.
    """
    # specific
    # split()/join collapses every whitespace run to one space and also trims
    # both ends, so no separate rstrip() is needed.
    phrase = ' '.join(phrase.split())
    # URLs: scheme://host(.domain)*(/path-segment)*
    phrase = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', phrase)
    # @mentions made of word characters.
    phrase = re.sub(r'@[\w]+', '', phrase)
    # Drop every non-ASCII character (this already removes real emoji and
    # accented letters).
    # NOTE: after this step the string is pure ASCII, so the accent and emoji
    # helpers below have nothing left to remove; only
    # remove_special_characters can still change the text.
    phrase = re.sub(r'[^\x00-\x7f]', r'', phrase)
    # general
    # Any remaining "@..." token up to the next whitespace.
    phrase = re.sub(r'@[^\s]+', '', phrase)
    phrase = remove_accented_chars(phrase)
    phrase = remove_special_characters(phrase)
    phrase = remove_emoji(phrase)
    return phrase
def remove_accented_chars(text):
    """Return *text* reduced to ASCII, with accented letters de-accented.

    The text is NFKD-normalized, which splits an accented letter into its
    base letter plus combining marks; encoding to ASCII with errors ignored
    then discards the marks and any other non-ASCII character.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')
def remove_special_characters(text: str) -> str:
    """Return *text* with every character outside a small whitelist removed.

    Args:
        text: Text to clean.

    Returns:
        ``text`` keeping only ASCII letters, digits, whitespace and the
        punctuation ``. , ! ? / : ; " '``.
    """
    # define the pattern to keep
    # The letter range must be A-Z: the former "A-z" also covered the six
    # characters between "Z" and "a" ([ \ ] ^ _ `), so backslashes, brackets
    # and underscores were silently kept.
    pat = r'[^a-zA-Z0-9.,!?/:;\"\'\s]'
    return re.sub(pat, '', text)
Applying it to the dataframe column like so:
# Run the whole cleaning pipeline (decontracted) on every value of the
# "sentence" column and store the result back in that column.
AAVE["sentence"] = AAVE["sentence"].apply(decontracted)
['He better hurry amp; come back from playing cards', 'I ordered a new phone', 'lol okay baby \ud83d\ude18\u2764\ud83d\ude0d', 'imma cry']
Above is an example of the text I'm testing on. \ud83d\ude18\u2764\ud83d\ude0d is not removed.
-------------edit------------
Here is the code I am using to load the data that is in a TSV file:
# Read the tab-separated file; malformed lines are skipped instead of raising.
AAVE = pd.read_csv('twitteraae_all_aa', sep='\t', on_bad_lines='skip')
# Drop every column listed here, then normalise the text column's name.
columns = ['ID', 'Date', 'Num', 'Location','Num2', 'AA', 'Hispanic', 'Other', 'White']
AAVE.drop(columns, inplace=True, axis=1)
AAVE = AAVE.rename(columns={'Sentence': 'sentence'})
# Constant label for every row of this dataset.
AAVE['label'] = 1
# Only the first 391165 values are assigned back; the assignment aligns on the
# index, so rows past the slice presumably become NA (verify) and are then
# removed by dropna() below, together with any other row containing NA.
AAVE['sentence'] = AAVE['sentence'][0:391165].astype('string')
AAVE = AAVE.dropna()
# Clean each sentence into a new column; the column is cast to pandas'
# "string" dtype before and after applying decontracted.
AAVE['sentence1'] = AAVE['sentence'].astype('string').apply(decontracted).astype('string')
The code works if I create a list of strings and apply the `decontracted` function to it, but when I apply it to the DataFrame column, everything else that I want removed is removed, except the emojis.
CodePudding user response:
You have to apply the function row by row, using:
# Row-wise apply (axis=1): call remove_emoji on each row's "sentence" value
# and write the results back to the "sentence" column.
AAVE["sentence"] = AAVE.apply(lambda row: remove_emoji(row["sentence"]), axis=1)
CodePudding user response:
This line of code removes emojis, operating column by column:
# Cast every column to str, then, column by column, encode to ASCII with
# errors ignored (dropping every non-ASCII character) and decode back to str.
# The result is returned, not assigned: df itself is left unchanged.
df.astype(str).apply(lambda x: x.str.encode('ascii', 'ignore').str.decode('ascii'))
Note that it will also remove all non-English letters and special characters (anything outside ASCII); if you need to keep them, the code can be adjusted.
CodePudding user response:
Your functions work for me:
# Sample sentences from the question. The \u escapes sit inside a Python
# string literal here, so they are turned into the corresponding code points
# when the literal is evaluated.
arr = ['He better hurry amp; come back from playing cards', 'I ordered a new phone',
'lol okay baby \ud83d\ude18\u2764\ud83d\ude0d', 'imma cry']
# Two-column frame: a running number and the sample sentences.
df = pd.DataFrame({"column1": [0, 1, 2, 3], "column2": arr})
df
column1 column2
0 0 He better hurry amp; come back from playing cards
1 1 I ordered a new phone
2 2 lol okay baby \ud83d\ude18❤\ud83d\ude0d
3 3 imma cry
# Apply the question's cleaning pipeline to each sentence in the test frame.
df["column2"] = df["column2"].apply(decontracted)
df
column1 column2
0 0 He better hurry amp; come back from playing cards
1 1 I ordered a new phone
2 2 lol okay baby
3 3 imma cry
Could it be an issue with how the text is stored in your dataframe?