How can I get all the stop words from spacy.lang.en without getting any errors?
from spacy.lang.en import stop_words as stop_words
def tokenize(sentence):
    """Lemmatize *sentence* and strip stop words and punctuation.

    NOTE(review): as written this raises
    ``TypeError: argument of type 'module' is not iterable`` — the name
    ``stop_words`` imported above is the *module* ``spacy.lang.en.stop_words``,
    not a collection, so ``word not in stop_words`` fails.  Use
    ``stop_words.STOP_WORDS`` (a set) instead.  ``punctuations`` is also
    not defined anywhere in this snippet.
    """
    sentence = nlp(sentence)
    # lemmatizing; spaCy reports pronoun lemmas as "-PRON-", in which case
    # the lower-cased surface form is kept instead
    sentence = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in sentence ]
    # removing stop words
    sentence = [ word for word in sentence if word not in stop_words and word not in punctuations ]
    return sentence
tokenize("Hallo ik ben leyla en ")
Then I got the following error:
TypeError: argument of type 'module' is not iterable
CodePudding user response:
Make sure stop_words
and punctuations
are a list
or a set.
To get a set of all stop words after
from spacy.lang.en import stop_words
you can use stop_words.STOP_WORDS,
or as an alternative solution you can use nlp.Defaults.stop_words
.
import spacy
from string import punctuation
from spacy.lang.en import stop_words
# Load the small English pipeline; this is what tokenize() uses below.
nlp = spacy.load('en_core_web_sm')

# Bind stop_words to spaCy's set of English stop words (the module itself
# is not iterable — that was the cause of the original TypeError).
stop_words = stop_words.STOP_WORDS
# print(stop_words)
# as an alternative solution
# stop_words = nlp.Defaults.stop_words

# BUG FIX: the original ``','.join(punctuation).split(',')`` corrupted the
# list, because ',' is itself one of the characters in string.punctuation —
# splitting on it drops the comma (so commas were never filtered) and
# inserts empty strings.  A string is already iterable per character, so
# list() gives the correct, complete character list in one step.
punctuations = list(punctuation)
print(punctuations)
# ['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']
def tokenize(sentence):
    """Return *sentence* as a list of lower-cased lemmas.

    Runs the text through the spaCy pipeline, lemmatizes every token,
    then discards stop words and punctuation characters.
    """
    doc = nlp(sentence)

    # Lemmatize each token.  spaCy marks pronoun lemmas as "-PRON-"; for
    # those we keep the lower-cased surface form instead of the lemma.
    lemmas = []
    for token in doc:
        if token.lemma_ != "-PRON-":
            lemmas.append(token.lemma_.lower().strip())
        else:
            lemmas.append(token.lower_)

    # Keep only words that are neither stop words nor punctuation.
    return [word for word in lemmas if word not in stop_words and word not in punctuations]
>>> tokenize("Hallo ik ben leyla en ")
['hallo', 'ik', 'ben', 'leyla', 'en']