Points
- Implement Weight Normalization on top of an FC (fully connected) layer and compare it with other methods (the reparameterization is recalled right after this list).
- Confirm that using or skipping Weight Initialization affects performance.
- Further verification based on Convolution and LSTM layers is planned.
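As a reminder, Weight Normalization [1] reparameterizes each weight vector w by a direction v and a scalar scale g, and trains v and g directly:

    w = \frac{g}{\lVert v \rVert} \, v

so that \lVert w \rVert = g regardless of the direction of v, decoupling the length of the weight vector from its direction.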
Reference
1. Salimans, T. and Kingma, D. P. (2016). Weight Normalization: A Simple Reparameterization to Accelerate Training of Deep Neural Networks. NeurIPS 2016.
Verification Method
- Use the MNIST handwritten digit data.
- Of the two fully connected layers, apply Normalization to the input of the activation function of the first layer.
- Check the effect of Weight Initialization (the data-dependent scheme from [1], recalled below, is used).
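Following [1], the data-dependent initialization sets the scale and bias from the minibatch statistics of the normalized pre-activation t = x \cdot v / \lVert v \rVert:

    g \leftarrow \frac{1}{\sqrt{\sigma^2[t] + \epsilon}}, \qquad b \leftarrow -\frac{\mu[t]}{\sqrt{\sigma^2[t] + \epsilon}}

where \mu[t] and \sigma^2[t] are the minibatch mean and variance of t, and \epsilon corresponds to the 1e-10 in the code below; the initial outputs g \cdot t + b then have zero mean and unit variance.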
Data
MNIST handwritten digits
# import mnist data
from tensorflow.examples.tutorials.mnist import input_data

mnist = input_data.read_data_sets('***/mnist', one_hot=True)
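The DataSet objects returned by read_data_sets expose plain NumPy arrays; for orientation, the arrays passed to the training routine below look like this (shapes are those of the standard MNIST split):

images_train = mnist.train.images  # (55000, 784), float32 in [0, 1]
labels_train = mnist.train.labels  # (55000, 10), one-hot
images_test = mnist.test.images    # (10000, 784)
labels_test = mnist.test.labels    # (10000, 10)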
Verification Results
n_units = 128
learning_rate = 0.1
batch_size = 128
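With these settings, a call to the fit_wn method defined below might look like the following sketch; the class name, n_iter, show_step, is_saving, and model_path are illustrative assumptions, not values from the original:

model = WeightNormModel()  # hypothetical class holding the methods below
model.fit_wn(images_train, labels_train, images_test, labels_test,
             n_in=784, n_units=128, n_out=10, learning_rate=0.1,
             n_iter=1000, batch_size=128, show_step=100,  # n_iter, show_step assumed
             is_saving=False, model_path='***/model')     # path placeholder as above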
Sample Code
import numpy as np
import tensorflow as tf

def inference_wn_first(self, x, n_in, n_units, n_out):
    # First forward pass: data-dependent initialization of g and b.
    with tf.variable_scope('layer1'):
        v = self.weight_variable('v', [n_in, n_units])
        g = self.weight_variable('g', [n_units])
        b = self.bias_variable('b', [n_units])
        # w = g * v / ||v||, with the norm taken over the input axis.
        v_norm = tf.nn.l2_normalize(v, axis=0)
        t = tf.matmul(x, v_norm)
        mean, var = tf.nn.moments(t, axes=[0])
        # Set g and b from the minibatch statistics of t.
        g = tf.assign(g, 1.0 / tf.sqrt(var + 1e-10))
        b = tf.assign(b, -mean / tf.sqrt(var + 1e-10))
        w = g * v_norm
        y = tf.add(tf.matmul(x, w), b)
        y = tf.nn.relu(y)
    with tf.variable_scope('layer2'):
        v = self.weight_variable('v', [n_units, n_out])
        g = self.weight_variable('g', [n_out])
        b = self.bias_variable('b', [n_out])
        v_norm = tf.nn.l2_normalize(v, axis=0)
        t = tf.matmul(y, v_norm)
        mean, var = tf.nn.moments(t, axes=[0])
        g = tf.assign(g, 1.0 / tf.sqrt(var + 1e-10))
        b = tf.assign(b, -mean / tf.sqrt(var + 1e-10))
        w = g * v_norm
        y = tf.add(tf.matmul(y, w), b)
        y = tf.nn.softmax(y, axis=1)
    return y
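# The helpers weight_variable and bias_variable are referenced above but
# not shown in the original; the definitions below are a minimal sketch
# (the choice of initializers is an assumption, not from the source).
def weight_variable(self, name, shape):
    init = tf.random_normal_initializer(stddev=0.05)  # assumed initializer
    return tf.get_variable(name, shape=shape, initializer=init)

def bias_variable(self, name, shape):
    init = tf.constant_initializer(value=0.0)  # assumed initializer
    return tf.get_variable(name, shape=shape, initializer=init)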
def inference_wn_after(self, x, n_in, n_units, n_out):
    # Subsequent forward passes: reuse the variables initialized above.
    with tf.variable_scope('layer1', reuse=True):
        v = tf.get_variable('v')
        g = tf.get_variable('g')
        b = tf.get_variable('b')
        v_norm = tf.nn.l2_normalize(v, axis=0)
        w = g * v_norm
        y = tf.add(tf.matmul(x, w), b)
        y = tf.nn.relu(y)
    with tf.variable_scope('layer2', reuse=True):
        v = tf.get_variable('v')
        g = tf.get_variable('g')
        b = tf.get_variable('b')
        v_norm = tf.nn.l2_normalize(v, axis=0)
        w = g * v_norm
        y = tf.add(tf.matmul(y, w), b)
        y = tf.nn.softmax(y, axis=1)
    return y
def fit_wn(self, images_train, labels_train, images_test, labels_test,
           n_in, n_units, n_out, learning_rate, n_iter, batch_size,
           show_step, is_saving, model_path):
    tf.reset_default_graph()

    x = tf.placeholder(shape=[None, n_in], dtype=tf.float32)
    t = tf.placeholder(shape=[None, n_out], dtype=tf.float32)

    # WN: one graph for the initializing pass, one for the reusing passes.
    y_first = self.inference_wn_first(x, n_in, n_units, n_out)
    y_after = self.inference_wn_after(x, n_in, n_units, n_out)

    loss_first = self.loss(y_first, t)
    loss_after = self.loss(y_after, t)
    train_step_first = self.training(loss_first, learning_rate)
    train_step_after = self.training(loss_after, learning_rate)
    acc_first = self.accuracy(y_first, t)
    acc_after = self.accuracy(y_after, t)

    init = tf.global_variables_initializer()
    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(init)

        history_loss_train = []
        history_acc_train = []
        history_loss_test = []
        history_acc_test = []

        for i in range(n_iter):
            # Train on a random minibatch.
            rand_index = np.random.choice(len(images_train), size=batch_size)
            x_batch = images_train[rand_index]
            y_batch = labels_train[rand_index]
            feed_dict = {x: x_batch, t: y_batch}

            if i == 0:
                # First step: run the data-dependent initialization graph.
                sess.run(train_step_first, feed_dict=feed_dict)
                temp_loss = sess.run(loss_first, feed_dict=feed_dict)
                temp_acc = sess.run(acc_first, feed_dict=feed_dict)
            else:
                sess.run(train_step_after, feed_dict=feed_dict)
                temp_loss = sess.run(loss_after, feed_dict=feed_dict)
                temp_acc = sess.run(acc_after, feed_dict=feed_dict)

            history_loss_train.append(temp_loss)
            history_acc_train.append(temp_acc)

            if (i + 1) % show_step == 0:
                print('--------------------')
                print('Iteration: ' + str(i + 1) + '  Loss: ' +
                      str(temp_loss) + '  Accuracy: ' + str(temp_acc))
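# The loss, training, and accuracy helpers used by fit_wn are also not
# shown in the original; the sketch below is one plausible definition
# (cross-entropy on the softmax outputs plus plain SGD are assumptions).
def loss(self, y, t):
    # y is already a softmax output, so take the log directly,
    # clipping to avoid log(0).
    return tf.reduce_mean(-tf.reduce_sum(
        t * tf.log(tf.clip_by_value(y, 1e-10, 1.0)), axis=1))

def training(self, loss, learning_rate):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    return optimizer.minimize(loss)

def accuracy(self, y, t):
    correct = tf.equal(tf.argmax(y, axis=1), tf.argmax(t, axis=1))
    return tf.reduce_mean(tf.cast(correct, tf.float32))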
def batch_norm(self, x, n_units):
    # Normalize each unit over the batch axis, then scale and shift.
    with tf.variable_scope('bn'):
        init_const1 = tf.constant_initializer(value=0.0, dtype=tf.float32)
        init_const2 = tf.constant_initializer(value=1.0, dtype=tf.float32)
        beta = tf.get_variable('beta', shape=[n_units],
                               initializer=init_const1)
        gamma = tf.get_variable('gamma', shape=[n_units],
                                initializer=init_const2)
        mean, var = tf.nn.moments(x, [0])
        x = gamma * (x - mean) / tf.sqrt(var + 1e-5) + beta
    return x
def layer_norm(self, x, batch_size):
    # Normalize each sample over the feature axis, then scale and shift.
    # Note: beta and gamma are per-sample here, hence the batch_size shape.
    with tf.variable_scope('ln'):
        init_const1 = tf.constant_initializer(value=0.0, dtype=tf.float32)
        init_const2 = tf.constant_initializer(value=1.0, dtype=tf.float32)
        beta = tf.get_variable('beta', shape=[batch_size],
                               initializer=init_const1)
        gamma = tf.get_variable('gamma', shape=[batch_size],
                                initializer=init_const2)
        mean, var = tf.nn.moments(x, [1])
        mean = tf.expand_dims(mean, axis=1)
        var = tf.expand_dims(var, axis=1)
        beta = tf.expand_dims(beta, axis=1)
        gamma = tf.expand_dims(gamma, axis=1)
        x = gamma * (x - mean) / tf.sqrt(var + 1e-5) + beta
    return x
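For comparison with the Weight Normalization path, the two helpers above would be applied to the pre-activation of the first layer, as described in the Verification Method section. A minimal sketch follows; the function name and the surrounding layer code are assumptions, not from the original:

def inference_bn(self, x, n_in, n_units, n_out):
    # Batch Normalization applied to the layer-1 pre-activation.
    with tf.variable_scope('layer1'):
        w = self.weight_variable('w', [n_in, n_units])
        b = self.bias_variable('b', [n_units])
        y = tf.add(tf.matmul(x, w), b)
        y = self.batch_norm(y, n_units)  # or: self.layer_norm(y, batch_size)
        y = tf.nn.relu(y)
    with tf.variable_scope('layer2'):
        w = self.weight_variable('w', [n_units, n_out])
        b = self.bias_variable('b', [n_out])
        y = tf.add(tf.matmul(y, w), b)
    return tf.nn.softmax(y, axis=1)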