求助:新手小白,請問使用簡單doc2vec進行文本學習,出現這個錯誤該如何解決?

# coding:utf-8
import jieba
import gensim
from gensim.models.doc2vec import Doc2Vec
TaggededDocument = gensim.models.doc2vec.TaggedDocument
def get_datasest():
with open("F:\csdn\shuju.txt", 'r',encoding='gbk',errors='ignore') as cf: ##獲取訓練集的地方,從一個檔案中讀取出來,里面的內容是一行一句話
docs = cf.readlines()
x_train = []
cnt = 0
for i, text in enumerate(docs):
cnt = cnt + 1
word_list = ' '.join(jieba.cut(text.split('\n')[0])).split(' ')
l = len(word_list)
word_list[l - 1] = word_list[l - 1].strip()
document = TaggededDocument(word_list, tags=[i])
x_train.append(document)
return x_train
def train(x_train, vector_size=100, epoch_num=1):
model_dm = Doc2Vec(x_train, min_count=1, window=5, vector_size=vector_size, sample=1e-3, negative=5, workers=4)
model_dm.train(x_train, total_examples=model_dm.corpus_count, epochs=70)
model_dm.save('couple.model') ##模型保存的位置
return model_dm
def ceshi():
model_dm = Doc2Vec.load("F:\csdn\couple.model")
str1 = '計算機作業系統'
test_text = ' '.join(jieba.cut(str1)).encode('gbk').split(' ')
inferred_vector_dm = model_dm.infer_vector(test_text) ##得到文本的向量
print(inferred_vector_dm)
return inferred_vector_dm
if __name__ == '__main__':
x_train = get_datasest()
model_dm = train(x_train)
doc_2_vec = ceshi()
type(doc_2_vec)
print(doc_2_vec.shape)
轉載請註明出處,本文鏈接:https://www.uj5u.com/qita/79343.html
標籤:人工智能技術
