以下是完整代碼:
import spacy
from tensorflow.keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense,LSTM,Embedding
def read_file(filepath, encoding=None):
    """Return the entire contents of a text file as a single string.

    Args:
        filepath: Path of the file to read.
        encoding: Text encoding passed to ``open``. The default ``None``
            keeps the platform's default encoding; pass e.g. ``"utf-8"``
            to make decoding independent of the machine's locale.

    Returns:
        The full text of the file.
    """
    with open(filepath, encoding=encoding) as f:
        return f.read()
# Load the raw corpus and run it through the spaCy English pipeline.
moby_text = read_file('moby_dick.txt')
nlp = spacy.load('en_core_web_sm')
doc = nlp(moby_text)

# Lower-cased text of every spaCy token.
tokens = [token.text.lower() for token in doc]

# Drop punctuation / whitespace tokens. `in` on a string is a substring
# test, so any token that occurs as a substring of this literal is removed.
# The `+` in `*+,` is restored here; it appears to have been lost the same
# way as the `+` in `10 + 1` below.
tokens = [token for token in tokens if token not in '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n ']

train_len = 10 + 1  # 10 i/p and 1 o/p

# Sliding windows of `train_len` consecutive tokens: tokens[0:11], tokens[1:12], ...
text_sequences = [tokens[i - train_len:i] for i in range(train_len, len(tokens))]

# Map every word to an integer index and encode each window.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)
sequences = tokenizer.texts_to_sequences(text_sequences)

# Show the index -> word mapping for the first window as a sanity check.
for i in sequences[0]:
    print(f'{i} : {tokenizer.index_word[i]}')

sequences = np.array(sequences)

# Number of distinct words seen by the tokenizer.
# NOTE(review): the answer further down states that Tokenizer indices start
# at 1 (0 is reserved), so the largest index equals this count. This is the
# value behind the reported "index 2718 is out of bounds ... size 2718".
vocabulary_size = len(tokenizer.word_counts)
def create_model(vocabulary_size, seq_len):
    """Build and compile the next-word prediction network.

    The stack is: a 25-dimensional embedding, two 100-unit LSTM layers
    (the first returns the full sequence so the second can consume it),
    a 100-unit ReLU dense layer, and a softmax over ``vocabulary_size``
    classes. The model summary is printed before returning.

    Args:
        vocabulary_size: Size of the embedding table and of the softmax
            output. Every integer index fed to the model must be strictly
            smaller than this value.
        seq_len: Number of token indices in each input sequence.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(Embedding(vocabulary_size, 25, input_length=seq_len))
    model.add(LSTM(100, return_sequences=True))
    model.add(LSTM(100))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(vocabulary_size, activation='softmax'))
    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    model.summary()
    return model
# Split every encoded window: all columns but the last are the inputs,
# the last column is the word to predict.
X, y = sequences[:, :-1], sequences[:, -1]
# One-hot encode the targets over `vocabulary_size` classes.
y = to_categorical(y, num_classes=vocabulary_size)
在 to_categorical 中,我遇到了錯誤。我不明白為什麼?在閱讀了這麼多文章之後,我仍然不知道如何解決它。
IndexError: index 2718 is out of bounds for axis 1 with size 2718
錯誤
# Length of each input sequence = number of columns in X.
seq_len = X.shape[1]
# Build the network for this vocabulary / sequence length and train it.
model = create_model(vocabulary_size=vocabulary_size, seq_len=seq_len)
model.fit(X, y, epochs=100, verbose=1)
我不明白這個錯誤。我搜索了錯誤並嘗試了不同的方法來解決它,但我找不到任何解決方法。另外,我想這是因為串列的索引從 0 開始,所以我嘗試了:
# Attempted workaround: shift the target indices down by one.
# NOTE(review): `Y` (capital) is not assigned anywhere in the visible code;
# presumably the lower-case `y` was meant — confirm.
Y = Y - 1
# One-hot encode `y` over `vocabulary_size` classes.
y = to_categorical(y, num_classes=vocabulary_size)
但這不起作用,因為它會在模型中產生錯誤。所以我回到原點。
Node: 'sequential/embedding/embedding_lookup'
indices[13,9] = 2718 is not in [0, 2718)
[[{{node sequential/embedding/embedding_lookup}}]] [Op:__inference_train_function_5647]
那么我該如何解決呢?有人可以幫幫我嗎?謝謝!!!
uj5u.com熱心網友回復:
Tokenizer不使用 0,它從 1 開始計數:
0 是保留索引,不會分配給任何單詞。
嘗試這個:
# Index 0 is reserved and word indices start at 1, so one extra slot is
# needed for the largest index to be valid.
vocabulary_size = len(tokenizer.word_counts) + 1
轉載請註明出處,本文鏈接:https://www.uj5u.com/shujuku/535077.html
