I created the following method
import numpy as np
import re
from nltk.corpus import stopwords
def clean_tweet(tweet):
if type(tweet) == np.float:
return ""
temp = tweet.lower()
temp = re.sub("'", "", temp) # to avoid removing contractions in english
temp = re.sub("@[A-Za-z0-9_] ","", temp)
temp = re.sub("#[A-Za-z0-9_] ","", temp)
temp = re.sub(r'http\S ', '', temp)
temp = re.sub('[()!?]', ' ', temp)
temp = re.sub('\[.*?\]',' ', temp)
temp = re.sub("[^a-z0-9]"," ", temp)
temp = temp.split()
temp = [w for w in temp if not w in stopwords]
temp = " ".join(word for word in temp)
return temp
and I have a pandas dataframe with 1000 tweets to clean
If I try this:
df['cleantweet'] = df.apply(lambda row : clean_tweet(row['Tweet']), axis = 1)
I get this error:
<1 sec
TypeError: argument of type 'WordListCorpusReader' is not iterable
Update: How did I fill the dataframe
paginator = tweepy.Paginator(
client.search_recent_tweets, # The method you want to use
"#GunControlNow -is:retweet", # Some argument for this method
max_results=100 # How many tweets asked per request
)
import pandas as pd
tweets = []
for tweet in paginator.flatten(limit=10000): # Total number of tweets to retrieve
tweets.append(tweet.text)
df = pd.DataFrame (tweets, columns = ['Tweet'])
df
from azureml.core import Workspace, Dataset
subscription_id = 'x'
resource_group = 'x'
workspace_name = 'x'
workspace = Workspace(subscription_id, resource_group, workspace_name)
from azureml.core import Datastore, Dataset
datastore = Datastore.get(workspace, 'workspaceblobstore')
dataset = Dataset.Tabular.register_pandas_dataframe(df, datastore, "tweets", show_progress=True)
CodePudding user response:
refer to the following: WordListCorpusReader is not iterable
You just need to define a variable for the stopwords that reads from the stopwords object that you import from nltk corpus: stopwords = set(stopwords.words("english"))