#!/usr/bin/python
# coding=utf-8
from numpy import *
# Filter for malicious messages. Labels: insulting = 1, non-insulting = 0.
# Create an experimental sample.
def loadDataSet():
    """Build the small hand-labelled sample used to try out the filter.

    Returns:
        A ``(postingList, classVec)`` tuple. ``postingList`` is a list of
        six tokenised documents (each a list of word strings) and
        ``classVec`` holds one label per document, in the same order:
        1 for an insulting document, 0 for a non-insulting one.
    """
    # NOTE(review): a few tokens ('the problems', 'do', 'question',
    # 'express', 'the steak', 'Posting') look like translation artefacts.
    # They are kept as found; verify them against the original dataset.
    postingList = [
        ['my', 'dog', 'has', 'flea', 'the problems', 'help', 'do'],
        ['maybe', 'not', 'take', 'question', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'express', 'I', 'love', 'question'],
        ['stop', 'Posting', 'stupid', 'worthless', 'garbage'],
        ['Mr', 'licks', 'ate', 'my', 'the steak', 'how', 'to', 'stop',
         'question'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    # One label per document above: 1 = insulting, 0 = not insulting.
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec
# Create a list of the unique (non-repeated) words that appear across all documents.
def createVocabList(dataSet):
    """Return the unique words that occur anywhere in ``dataSet``.

    Args:
        dataSet: An iterable of documents, each an iterable of hashable
            words.

    Returns:
        A list containing every distinct word exactly once. The order is
        whatever the underlying set yields, so it is not guaranteed.
    """
    vocabSet = set()  # start from an empty set
    for document in dataSet:
        vocabSet |= set(document)  # union with this document's words
    return list(vocabSet)
# Convert the words of a document into a term vector over the vocabulary.
def setOfWords2Vec(vocabList, inputSet):
    """Turn a document into a word-count vector over ``vocabList``.

    Despite the name, this accumulates counts (bag-of-words model): a word
    that appears several times in ``inputSet`` increments its slot each
    time. A set-of-words variant would assign 1 instead of adding 1.

    Args:
        vocabList: List of vocabulary words; position ``i`` in the result
            corresponds to ``vocabList[i]``.
        inputSet: Iterable of words making up one document.

    Returns:
        A list of ints with the same length as ``vocabList``. Words that
        are not in the vocabulary are reported on stdout and otherwise
        ignored.
    """
    returnVec = [0] * len(vocabList)  # one zeroed slot per vocabulary word

    # Map each word to its first position once, instead of scanning the
    # list twice per word (``in`` followed by ``.index``).
    position = {}
    for idx, vocabWord in enumerate(vocabList):
        position.setdefault(vocabWord, idx)

    for word in inputSet:
        if word in position:
            returnVec[position[word]] += 1
        else:
            print("the word: %s is not in my Vocabulary!" % word)
    return returnVec
# CodePudding user response:
# Please add a note that this is from my blog, thank you.