NLP多標簽分類輸入張量大小錯誤問題-有解無憂

入門菜雞代碼都跑不起來，大佬們救救孩子

Traceback (most recent call last):

  File "D:\WJYworkplace\PythonWorkPlace\NLP\multi_label_classfication\keras_albert_multi_label_cls-master\model_train.py", line 164, in <module>

    validation_steps=len(test_D)

  File "D:\MyGeekTool\DevelopTool\MicrosoftVisualStudio\Shared\Python37_64\lib\site-packages\tensorflow\python\keras\engine\training.py", line 1861, in fit_generator

    initial_epoch=initial_epoch)

  File "D:\MyGeekTool\DevelopTool\MicrosoftVisualStudio\Shared\Python37_64\lib\site-packages\tensorflow\python\keras\engine\training.py", line 1100, in fit

    tmp_logs = self.train_function(iterator)

  File "D:\MyGeekTool\DevelopTool\MicrosoftVisualStudio\Shared\Python37_64\lib\site-packages\tensorflow\python\eager\def_function.py", line 828, in __call__

    result = self._call(*args, **kwds)

  File "D:\MyGeekTool\DevelopTool\MicrosoftVisualStudio\Shared\Python37_64\lib\site-packages\tensorflow\python\eager\def_function.py", line 888, in _call

    return self._stateless_fn(*args, **kwds)

  File "D:\MyGeekTool\DevelopTool\MicrosoftVisualStudio\Shared\Python37_64\lib\site-packages\tensorflow\python\eager\function.py", line 2943, in __call__

    filtered_flat_args, captured_inputs=graph_function.captured_inputs)  # pylint: disable=protected-access

  File "D:\MyGeekTool\DevelopTool\MicrosoftVisualStudio\Shared\Python37_64\lib\site-packages\tensorflow\python\eager\function.py", line 1919, in _call_flat

    ctx, args, cancellation_manager=cancellation_manager))

  File "D:\MyGeekTool\DevelopTool\MicrosoftVisualStudio\Shared\Python37_64\lib\site-packages\tensorflow\python\eager\function.py", line 560, in call

    ctx=ctx)

  File "D:\MyGeekTool\DevelopTool\MicrosoftVisualStudio\Shared\Python37_64\lib\site-packages\tensorflow\python\eager\execute.py", line 60, in quick_execute

    inputs, attrs, num_outputs)

tensorflow.python.framework.errors_impl.InvalidArgumentError:  Input to reshape is a tensor with 374400 values, but the requested shape has 1277952

	 [[node model_2/model_1/Embed-Token/Reshape_2 (defined at D:\MyGeekTool\DevelopTool\MicrosoftVisualStudio\Shared\Python37_64\lib\site-packages\keras_adaptive_softmax\embedding.py:172) ]] [Op:__inference_train_function_12406]



Function call stack:

train_function

訓練資料的輸入張量大小不對，要在哪裡修改啊？下面是模型訓練的代碼：

# -*- coding: utf-8 -*-

# @Time : 2020/12/23 14:19

# @Author : Jclian91

# @File : model_train.py

# @Place : Yangpu, Shanghai

import json

import codecs

import pandas as pd

import numpy as np

from keras_bert import Tokenizer

from keras.layers import *

from keras.models import Model

from keras.optimizers import Adam



from albert import load_brightmart_albert_zh_checkpoint



# Recommended length <= 510.
# Texts are cut to this many characters before being encoded (see DataGenerator).

maxlen = 256

# Default number of samples per batch used by DataGenerator.
BATCH_SIZE = 8

# Vocabulary file, read one token per line.
dict_path = './albert_tiny/vocab.txt'





# Maps each vocabulary token to an integer id.
token_dict = {}

with codecs.open(dict_path, 'r', 'utf-8') as reader:

    for line in reader:

        # Surrounding whitespace (including the line ending) is removed.
        token = line.strip()

        # The id is the dict size at insertion time, so a token that repeats
        # after stripping overwrites its earlier id instead of adding an entry.
        token_dict[token] = len(token_dict)





class OurTokenizer(Tokenizer):
    """Tokenizer that splits text into single characters."""

    def _tokenize(self, text):
        """Return one token per character of *text*.

        A character found in the vocabulary is kept as-is; every other
        character is replaced by the '[UNK]' token.
        """
        vocab = self._token_dict
        return [ch if ch in vocab else '[UNK]' for ch in text]





# Module-level tokenizer built from the vocabulary mapping; used by DataGenerator.
tokenizer = OurTokenizer(token_dict)





def seq_padding(X, padding=0):
    """Right-pad every sequence in *X* to the length of the longest one.

    Args:
        X: a non-empty collection of sequences.
        padding: value appended to sequences that are shorter than the longest.

    Returns:
        A numpy array built from the padded sequences.

    Raises:
        ValueError: if *X* is empty (there is no longest sequence).
    """
    longest = max([len(seq) for seq in X])
    rows = []
    for seq in X:
        shortfall = longest - len(seq)
        if shortfall > 0:
            # Only short sequences are extended; full-length ones pass through untouched.
            seq = np.concatenate([seq, [padding] * shortfall])
        rows.append(seq)
    return np.array(rows)





class DataGenerator:
    """Endless source of padded training batches.

    Each element of ``data`` is indexed as ``(text, label_vector)``. Iterating
    yields ``([X1, X2], Y)`` where X1 and X2 are the two padded sequences
    returned by ``tokenizer.encode`` and Y is the padded label array.
    """

    def __init__(self, data, batch_size=BATCH_SIZE):
        """Store the data and pre-compute the number of batches per pass."""
        self.data = data
        self.batch_size = batch_size
        # Ceiling division: a trailing partial batch counts as one extra step.
        whole, leftover = divmod(len(self.data), self.batch_size)
        self.steps = whole + 1 if leftover != 0 else whole

    def __len__(self):
        """Return the number of batches in one pass over the data."""
        return self.steps

    def __iter__(self):
        """Yield batches forever, reshuffling the sample order on every pass."""
        while True:
            order = list(range(len(self.data)))
            np.random.shuffle(order)
            final_pos = len(order) - 1
            first_seqs, second_seqs, targets = [], [], []
            for pos, sample_idx in enumerate(order):
                sample = self.data[sample_idx]
                # Cut the raw text to maxlen characters before encoding.
                encoded_a, encoded_b = tokenizer.encode(first=sample[0][:maxlen])
                first_seqs.append(encoded_a)
                second_seqs.append(encoded_b)
                targets.append(sample[1])
                # Keep collecting until the batch is full or the pass is over.
                if len(first_seqs) != self.batch_size and pos != final_pos:
                    continue
                yield [seq_padding(first_seqs), seq_padding(second_seqs)], seq_padding(targets)
                first_seqs, second_seqs, targets = [], [], []





# 構建模型

# Build the model
def create_cls_model(num_labels):
    """Build and compile the ALBERT-based multi-label classifier.

    Args:
        num_labels: number of output units, one per label.

    Returns:
        The compiled Keras model taking two variable-length inputs.
    """
    encoder = load_brightmart_albert_zh_checkpoint("albert_tiny", training=False)
    # Every layer of the loaded encoder is left trainable.
    for sublayer in encoder.layers:
        sublayer.trainable = True

    input_a = Input(shape=(None,))
    input_b = Input(shape=(None,))

    encoded = encoder([input_a, input_b])
    # Keep only the vector at position 0 (the [CLS] slot) for classification.
    cls_vector = Lambda(lambda seq: seq[:, 0])(encoded)
    # One sigmoid unit per label.
    probabilities = Dense(num_labels, activation='sigmoid')(cls_vector)

    classifier = Model([input_a, input_b], probabilities)
    # Use a sufficiently small learning rate.
    optimizer = Adam(1e-5)
    classifier.compile(
        loss='binary_crossentropy',
        optimizer=optimizer,
        metrics=['accuracy'],
    )
    classifier.summary()

    return classifier





def build_label_list(raw_labels):
    """Return the distinct atomic labels in first-seen order.

    Args:
        raw_labels: iterable of label strings; one string may join several
            labels with "|".

    Returns:
        A list of the individual labels, without duplicates.
    """
    labels = []
    for raw in raw_labels:
        # A string without "|" splits into itself, so one loop covers both cases.
        for part in raw.split("|"):
            if part not in labels:
                labels.append(part)
    return labels


def encode_examples(df, labels):
    """Convert a two-column (label, content) frame into training examples.

    Args:
        df: DataFrame whose first column is the "|"-joined label string and
            whose second column is the text.
        labels: ordered list of known labels.

    Returns:
        A list of ``(content, multi_hot)`` tuples, where ``multi_hot[j]`` is 1
        when ``labels[j]`` occurs in the row's label string and 0 otherwise.
        Labels that are not in *labels* are ignored.
    """
    examples = []
    for raw_label, content in df.itertuples(index=False, name=None):
        present = set(raw_label.split("|"))
        examples.append((content, [1 if name in present else 0 for name in labels]))
    return examples


if __name__ == '__main__':

    # Data processing: read the training and test sets
    print("begin data processing...")
    train_df = pd.read_csv("data/train.csv").fillna(value="")
    test_df = pd.read_csv("data/test.csv").fillna(value="")

    # The label vocabulary is derived from the training set only.
    labels = build_label_list(train_df["label"].unique())
    with open("label.json", "w", encoding="utf-8") as f:
        f.write(json.dumps(dict(enumerate(labels)), ensure_ascii=False, indent=2))

    train_data = encode_examples(train_df, labels)
    test_data = encode_examples(test_df, labels)

    print("finish data processing!")

    # Model training
    model = create_cls_model(len(labels))
    train_D = DataGenerator(train_data)
    test_D = DataGenerator(test_data)

    print("begin model training...")
    model.fit_generator(
        train_D.__iter__(),
        steps_per_epoch=len(train_D),
        epochs=10,
        validation_data=test_D.__iter__(),
        validation_steps=len(test_D)
    )

    print("finish model training!")

    # Save the model
    model.save('albert_base_multi_label_ee.h5')
    print("Model saved!")

    result = model.evaluate_generator(test_D.__iter__(), steps=len(test_D))
    print("模型評估結果:", result)

轉載請註明出處，本文鏈接：https://www.uj5u.com/qita/277538.html

標籤：腳本語言(Perl/Python)

上一篇：python爬蟲遇到JSONDecodeError: Expecting value: line 1 column 1 (char 0)

下一篇：python tk canvas，如何獲得滑鼠按下后在畫布中的相對位置？

NLP多標簽分類 輸入張量大小錯誤問題

NLP多標簽分類輸入張量大小錯誤問題