def cleanTweets(text):
    """Return *text* with mentions, '#' signs, retweet markers and links removed.

    Args:
        text: A single tweet as a ``str``, or ``None``.

    Returns:
        The cleaned string, or ``None`` when *text* is ``None`` (a null row
        would otherwise make ``re.sub`` raise ``TypeError``).
    """
    if text is None:
        return None
    # Each pattern needs a ``+`` quantifier so the whole token is consumed,
    # and the digit range must use an ASCII hyphen ("0-9"), not an en dash.
    text = re.sub(r'@[A-Za-z0-9]+', '', text)  # remove the mentions
    text = re.sub(r'#', '', text)  # remove the '#' but keep the tag word
    text = re.sub(r'RT\s+', '', text)  # remove the RT marker
    text = re.sub(r'https?://\S+', '', text)  # remove hyperlinks
    return text
# udf() must wrap the function object itself (not the result of calling it);
# the resulting UDF, which returns a string by default, is then applied to
# the 'Text' column.
tweets_df_cleaned = tweets_df.withColumn('Tweets', udf(cleanTweets)(col('Text')))
How can I apply this function to tweets_df, a Spark DataFrame that has a column Text to clean? In pandas this can be done with apply.
CodePudding user response:
You can use the pandas apply method (this works on a pandas DataFrame):
# Assumes tweets_df is a pandas DataFrame with a 'Tweets' column — TODO confirm
# against the actual data (the question names the column 'Text').
# apply() passes the column's values to cleanTweets and the results are
# stored in a new 'Tweets_New' column.
tweets_df['Tweets_New'] = tweets_df['Tweets'].apply(cleanTweets)
CodePudding user response:
Use a pandas UDF (user-defined function). Check your Spark version first, because this solution is for Spark 3.x.
from pyspark.sql.functions import pandas_udf, PandasUDFType
@pandas_udf("string", PandasUDFType.SCALAR)
def cleanTweets(text):
    """Strip mentions, '#' signs, retweet markers and links from a batch of tweets.

    The decorator registers this as a scalar pandas UDF, so it is called with
    a pandas Series holding a batch of column values (not a single ``str``)
    and must return a Series of the same length.

    Args:
        text: pandas Series of tweet strings.

    Returns:
        pandas Series of cleaned strings; missing values are passed through
        unchanged by ``Series.str.replace``.
    """
    # re.sub() cannot operate on a Series, so the vectorised string accessor
    # is used instead. Patterns use ``+`` quantifiers and an ASCII "0-9" range.
    text = text.str.replace(r'@[A-Za-z0-9]+', '', regex=True)  # remove the mentions
    text = text.str.replace('#', '', regex=False)  # remove the '#'
    text = text.str.replace(r'RT\s+', '', regex=True)  # remove the RT marker
    text = text.str.replace(r'https?://\S+', '', regex=True)  # remove hyperlinks
    return text


# The decorated function is already a UDF, so it is called directly with the
# name of the column to clean.
tweets_df_cleaned = tweets_df.withColumn("Tweets", cleanTweets("Text"))