如果訓練集大小很大，反向傳播會計算NaN值-有解無憂

我正在從頭開始開發一個淺層全連接 ANN，它通過帶有動量演算法的梯度下降來學習。這是代碼

import numpy as np
from scipy.special import expit, xlog1py

def softmax(y):
    e_y = np.exp(y - np.max(y))
    return e_y / e_y.sum()

def cross_entropy(y, t, derivative=False, post_process=True):
    if post_process:
        if derivative:
            return y - t
        return -np.sum(np.sum(xlog1py(t, softmax(y)), axis=0))

def sigmoid(a, derivative=False):
    f_a = expit(-a)
    df_a = np.multiply(f_a, (1 - f_a))  # element-wise
    if derivative:
        return df_a
    return f_a


def identity(a, derivative=False):
    f_a = a
    df_a = np.ones(np.shape(a))
    if derivative:
        return df_a
    return f_a


def generate_data(n_items, n_features, n_classes):
    X = np.asmatrix(np.random.normal(size=(n_items, n_features)))
    targets = np.asarray(np.random.randint(n_classes, size=n_items))
    targets = one_hot(targets)
    return X, targets


def one_hot(targets):
    return np.asmatrix(np.eye(np.max(targets)   1)[targets]).T


class NeuralNetwork:

    def __init__(self):
        self.layers = []

    def add_layer(self, layer):
        self.layers.append(layer)

    def build(self):
        for i, layer in enumerate(self.layers):
            if i == 0:
                layer.type = "input"
            else:
                layer.type = "output" if i == len(self.layers) - 1 else "hidden"
                layer.configure(self.layers[i - 1].neurons)

    def fit(self, X, targets):
        MAX_EPOCHS = 200
        epoch_loss = []

        # batch mode
        for epoch in range(MAX_EPOCHS):
            predictions = self.predict(X)
            self.back_prop(targets, cross_entropy)
            self.learning_rule(l_rate=0.01, momentum=0.01)
            loss = cross_entropy(predictions, targets)
            epoch_loss.append(loss)
            print("E(%d) on TrS is:" % epoch, loss)


    # Columns of predictions  
    def predict(self, dataset):
        z = dataset.T 
        for layer in self.layers:
            z = layer.forward_prop_step(z)
        return z

    def back_prop(self, target, loss):
        for i, layer in enumerate(self.layers[:0:-1]):
            next_layer = self.layers[-i]
            prev_layer = self.layers[-i - 2]  
            layer.back_prop_step(next_layer, prev_layer, target, loss)

    def learning_rule(self, l_rate, momentum):
        # Momentum GD
        for layer in [layer for layer in self.layers if layer.type != "input"]:
            layer.update_weights(l_rate, momentum)
            layer.update_bias(l_rate, momentum)


class Layer:

    def __init__(self, neurons, type=None, activation=None):
        self.dE_dW = 0  
        self.dE_db = 0
        self.dEn_db = None  
        self.dEn_dW = None  
        self.dact_a = None  
        self.out = None
        self.weights = None  
        self.bias = None
        self.w_sum = None  
        self.neurons = neurons  
        self.type = type  
        self.activation = activation  
        self.deltas = None

    def configure(self, prev_layer_neurons):
        self.weights = np.asmatrix(np.random.normal(-1, 1, (self.neurons, prev_layer_neurons)))
        self.bias = np.asmatrix(np.random.normal(-1, 1, self.neurons)).T  # vettore colonna
        if self.activation is None:
            # th approx universale
            if self.type == "hidden":
                self.activation = sigmoid
            elif self.type == "output":
                self.activation = identity

    def forward_prop_step(self, z):
        if self.type == "input":
            self.out = z
        else:
            self.w_sum = np.dot(self.weights, z)   self.bias
            self.out = self.activation(self.w_sum)
        return self.out

    def back_prop_step(self, next_layer, prev_layer, target, local_loss):
        if self.type == "output":
            self.dact_a = self.activation(self.w_sum, derivative=True)
            self.deltas = np.multiply(self.dact_a,
                                      local_loss(self.out, target, derivative=True))  # (c,batch_size)
        else:
            self.dact_a = self.activation(self.w_sum, derivative=True)  # (m,batch_size)
            debug = np.dot(next_layer.weights.T, next_layer.deltas)  # <<<< problem here
            self.deltas = np.multiply(self.dact_a, debug)

        self.dEn_dW = self.deltas * prev_layer.out.T

        self.dEn_db = self.deltas

        self.dE_dW = self.dEn_dW

        self.dE_db = self.dEn_db  

    def update_weights(self, l_rate, momentum):
        # Momentum GD
        self.weights = self.weights - l_rate * self.dE_dW
        self.weights = -l_rate * self.dE_dW   momentum * self.weights

    def update_bias(self, l_rate, momentum):
        # Momentum GD
        self.bias = self.bias - l_rate * self.dE_db
        self.bias = -l_rate * self.dE_db   momentum * self.bias


if __name__ == '__main__':
    # Dog: 0 -> 000
    # Cat: 1 -> 010
    # Mouse: 2 -> 001
    net = NeuralNetwork()
    d = 4  # (n_features)
    c = 3  # classes
    n_items = 10  # increasing this gives NaN in EBP formula, in debug variable

    for m in (d, 4, c):
        layer = Layer(m)  
        net.add_layer(layer)

    net.build()

    X, targets = generate_data(n_items=n_items, n_features=d, n_classes=c)

    net.fit(X, targets)

如果n_items值較低，例如 10 或 100，則學習正常：

E(0) on TrS is: -0.27547576455869305
E(1) on TrS is: -0.33774479466660445
E(2) on TrS is: -0.3295771015279694
...
E(199) on TrS is: -0.33026951829371987

不幸的是，當n_items變大時，例如 1000，我收到此錯誤：

RuntimeWarning：在乘法中遇到無效值 self.deltas = np.multiply(self.dact_a, debug)

和：

E(0) on TrS is: -0.3337489007828587
E(1) on TrS is: -0.01614463421285259
E(2) on TrS is: -0.33594156066981384
E(3) on TrS is: -0.11378512597000995
E(4) on TrS is: -0.33508867936192843
E(5) on TrS is: -0.33276323614435077
E(6) on TrS is: -0.33310949105565746
E(7) on TrS is: -0.224661060748479
E(8) on TrS is: -0.3321560115270673
E(9) on TrS is: -0.22289014654421438
...
E(138) on TrS is: nan
E(139) on TrS is: nan
E(140) on TrS is: nan
E(141) on TrS is: nan
...
E(199) on TrS is: nan

我認為這是由不斷增長的變數引起的debug，直到達到.sys.float_info.max1.7976931348623157e 308

我怎么解決這個問題？

uj5u.com熱心網友回復：

看起來你有爆炸梯度。也許嘗試一些正則化。

不過，有幾件一般的事情。您的權重初始化將給出非常大的起始值，這可能會導致梯度爆炸。

如果訓練集大小很大，反向傳播會計算 NaN 值

相反，考慮更多類似的東西：

np.random.normal(-0.1,0.02,...

這個例子不影響權重，但也值得看看你的一些方法的邏輯。例如，sigmoid 總是計算導數，不管它是否被使用。也許改為使用兩種方法（每種方法一項作業）或至少計算 if 內的導數：

def sigmoid(a, derivative=False):
f_a = expit(-a)
if derivative:
    df_a = np.multiply(f_a, (1 - f_a))  # element-wise
    return df_a
return f_a

有關爆炸梯度和權重初始化的更多資訊，請參閱此https://medium.com/usf-msds/deep-learning-best-practices-1-weight-initialization-14e5c0295b94。

轉載請註明出處，本文鏈接：https://www.uj5u.com/net/511432.html

標籤：Python机器学习深度学习神经网络楠

上一篇：如何在BertForSequenceClassification之上放置一個不同的分類器？

下一篇：我應該將我的資料分成不同的批次，然后對每個批次執行tsne嗎？