Novice for help: the small white, can you tell me the use simple doc2vec text study, how to solve the mistakes?
# coding: utf-8
The import jieba
The import gensim
The from gensim. Models. Doc2vec import doc2vec
TaggededDocument=gensim. Models. Doc2vec. TaggedDocument
Def get_datasest () :
With the open (" F: \ CSDN \ shuju TXT ", 'r', encoding='GBK', errors='ignore') as the cf: # # for the training set, read from a file, the content inside is a line of words
Docs=cf. Readlines ()
X_train=[]
CNT=0
For I, the text enumerate in (docs) :
CNT=CNT + 1
Word_list="'. Join (jieba. The cut (text. The split (' \ n ') [0])). The split (' ')
L=len (word_list)
=word_list word_list [l - 1] [l - 1]. Strip ()
The document=TaggededDocument (word_list, tags=[I])
X_train. Append (document)
Return x_train
Def "train" (x_train vector_size=100, epoch_num=1) :
Model_dm=Doc2Vec (x_train min_count=1, the window=5, vector_size=vector_size, sample=1-3, e negative=5, workers=4)
Model_dm. "train" (x_train, total_examples=model_dm corpus_count, epochs=70)
Model_dm. Save (' picking. The model ') # # model to save the location of the
Return model_dm
Def ceshi () :
Model_dm=Doc2Vec. Load (F: \ "CSDN \ picking model")
Str1='computer operating systems'
Test_text="'. Join (jieba. The cut (str1)). The encode (' GBK '). The split (' ')
Inferred_vector_dm=model_dm. Infer_vector (test_text) # # of text vector
Print (inferred_vector_dm)
Return inferred_vector_dm
If __name__=="__main__ ':
X_train=get_datasest ()
Model_dm="train" (x_train)
Doc_2_vec=ceshi ()
Print the
Type (doc_2_vec)
Print (doc_2_vec. Shape)