Notes on Implementing the Diversified Visual Attention Network

Posted at 2018-06-18

Key Points

  • Implemented the Diversified Visual Attention Network and checked its behavior with concrete numbers.
  • Performance evaluation is planned as a next step.

References

1. Diversified Visual Attention Networks for Fine-Grained Object Classification

[Figures: model overview diagrams, quoted from the reference paper]

Data

28x28 grayscale images in 10 classes (MNIST), as implied by the input placeholders ([None, 28 * 28] and [None, 10]) in the sample code below.

Sample Code

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt


class DiversifiedVisualAttention():
  def __init__(self):
    pass

  def weight_variable(self, name, shape):
    initializer = tf.truncated_normal_initializer(mean = 0.0, stddev = 0.01, dtype = tf.float32)
    return tf.get_variable(name, shape, initializer = initializer)

  def bias_variable(self, name, shape):
    initializer = tf.constant_initializer(value = 0.0, dtype = tf.float32)
    return tf.get_variable(name, shape, initializer = initializer)

  def attention_canvas_generation(self, x, height, width):
    canvases = []

    # full image
    img_full = tf.reshape(x, [-1, height, width, 1])
    canvases.append(img_full)

    # 2x2: split into four quadrants, then resize each back to full size
    imgs = tf.split(img_full, 2, axis = 1)
    imgs1 = tf.split(imgs[0], 2, axis = 2)
    imgs2 = tf.split(imgs[1], 2, axis = 2)
    imgs = tf.concat([imgs1, imgs2], axis = 0)

    for i in range(4):
      img = tf.image.resize_images(imgs[i], [height, width])
      canvases.append(img)

    # 3x3 (disabled)
    #if height == 28:
    #  imgs = tf.pad(img_full, [[0, 0], [1, 1], [1, 1], [0, 0]])

    #imgs = tf.split(imgs, 3, axis = 1)
    #imgs1 = tf.split(imgs[0], 3, axis = 2)
    #imgs2 = tf.split(imgs[1], 3, axis = 2)
    #imgs3 = tf.split(imgs[2], 3, axis = 2)

    #imgs = tf.concat([imgs1, imgs2, imgs3], axis = 0)

    #for i in range(9):
    #  img = tf.image.resize_images(imgs[i], [height, width])
    #  canvases.append(img)

    return canvases
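
  # Shape check (added note): for a 28x28 input this returns five canvases,
  # the full image plus its four 14x14 quadrants, each resized back to
  # [batch, 28, 28, 1]; this matches n_canvases = 5 in the parameter section.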

  def cnn_feature_learning(self, x, filter_size, n_filters_1, n_filters_2):
    w_1 = self.weight_variable('w_1', [filter_size, filter_size, 1, n_filters_1])
    b_1 = self.bias_variable('b_1', [n_filters_1])

    conv_1 = tf.nn.relu(tf.nn.conv2d(x, w_1, strides = [1, 1, 1, 1], padding = 'SAME') + b_1)
    conv_1 = tf.nn.max_pool(conv_1, ksize = [1, 2, 2, 1], strides = [1, 2, 2, 1], padding = 'SAME')

    w_2 = self.weight_variable('w_2', [filter_size, filter_size, n_filters_1, n_filters_2])
    b_2 = self.bias_variable('b_2', [n_filters_2])

    conv_2 = tf.nn.relu(tf.nn.conv2d(conv_1, w_2, strides = [1, 1, 1, 1], padding = 'SAME') + b_2)

    return tf.nn.max_pool(conv_2, ksize = [1, 2, 2, 1], strides = [1, 2, 2, 1], padding = 'SAME')
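
  # Shape note (added): each 2x2 max-pool halves the spatial size, so a
  # 28x28 canvas comes out as [batch, 7, 7, n_filters_2]. This is where the
  # hard-coded height_2 = width_2 = 7 in the inference methods comes from.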

  def two_fc_layers(self, x, height, width, n_filters, n_fc_units_1, n_fc_units_2, keep_prob):
    x = tf.reshape(x, [-1, height * width * n_filters])

    w_1 = self.weight_variable('w_1', [height * width * n_filters, n_fc_units_1])
    b_1 = self.bias_variable('b_1', [n_fc_units_1])

    fc_1 = tf.nn.relu(tf.matmul(x, w_1) + b_1)
    fc_1_dropout = tf.nn.dropout(fc_1, keep_prob)

    w_2 = self.weight_variable('w_2', [n_fc_units_1, n_fc_units_2])
    b_2 = self.bias_variable('b_2', [n_fc_units_2])

    return tf.matmul(fc_1_dropout, w_2) + b_2

  def diversified_visual_attention_average(self, x, height, width, n_canvases, \
                                          batch_size, n_in, n_lstm_units, n_fc_units):

    w_x = self.weight_variable('w_x', [n_in, n_lstm_units * 4])
    w_h = self.weight_variable('w_h', [n_lstm_units, n_lstm_units * 4])
    b = self.bias_variable('b', [n_lstm_units * 4])

    w_fc = self.weight_variable('w_fc', [n_lstm_units, n_fc_units])
    b_fc = self.bias_variable('b_fc', [n_fc_units])

    h = tf.zeros(shape = [batch_size, n_lstm_units], dtype = tf.float32)
    c = tf.zeros(shape = [batch_size, n_lstm_units], dtype = tf.float32)

    dva_outs = []

    # LSTM
    for t in range(n_canvases):
      # average-pool the CNN feature map of canvas t into a single vector
      x_t = tf.reduce_mean(x[t], axis = [1, 2])

      i, f, o, g = tf.split(tf.add(tf.add(tf.matmul(x_t, w_x), tf.matmul(h, w_h)), b), 4, axis = 1)

      i = tf.nn.sigmoid(i)
      f = tf.nn.sigmoid(f)
      o = tf.nn.sigmoid(o)
      g = tf.nn.tanh(g)

      c = tf.add(tf.multiply(f, c), tf.multiply(i, g))
      h = tf.multiply(o, tf.nn.tanh(c))

      dva_out = tf.matmul(h, w_fc) + b_fc

      dva_outs.append(dva_out)

    return dva_outs
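
  # Reference note (added): the loop above is a hand-rolled LSTM cell. With
  # [i; f; o; g] = x_t w_x + h w_h + b split into four equal parts:
  #   i, f, o = sigmoid(.)   (input / forget / output gates)
  #   g = tanh(.)            (candidate cell state)
  #   c = f * c + i * g
  #   h = o * tanh(c)
  # Each step's hidden state is passed through the shared head (w_fc, b_fc)
  # and collected in dva_outs.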

  def diversified_visual_attention_multi(self, x, height, width, n_canvases, \
                                          batch_size, n_in, n_lstm_units, n_fc_units):

    w_a = self.weight_variable('w_a', [n_lstm_units, n_in])
    w_x = self.weight_variable('w_x', [n_in, n_lstm_units * 4])
    w_h = self.weight_variable('w_h', [n_lstm_units, n_lstm_units * 4])
    b = self.bias_variable('b', [n_lstm_units * 4])

    w_fc = self.weight_variable('w_fc', [n_lstm_units, n_fc_units])
    b_fc = self.bias_variable('b_fc', [n_fc_units])

    h = tf.zeros(shape = [batch_size, n_lstm_units], dtype = tf.float32)
    c = tf.zeros(shape = [batch_size, n_lstm_units], dtype = tf.float32)

    dva_outs = []

    # LSTM
    for t in range(n_canvases):
      # multiplicative attention
      q = tf.matmul(h, w_a)
      q = tf.reshape(q, [batch_size, 1, n_in])
      k = tf.reshape(x[t], [batch_size, height * width, n_in])
      v = k
      a = tf.matmul(q, tf.transpose(k, [0, 2, 1]))
      a = tf.nn.softmax(a, axis = 2)
      x_t = tf.matmul(a, v)
      x_t = tf.reshape(x_t, [batch_size, n_in])

      i, f, o, g = tf.split(tf.add(tf.add(tf.matmul(x_t, w_x), tf.matmul(h, w_h)), b), 4, axis = 1)

      i = tf.nn.sigmoid(i)
      f = tf.nn.sigmoid(f)
      o = tf.nn.sigmoid(o)
      g = tf.nn.tanh(g)

      c = tf.add(tf.multiply(f, c), tf.multiply(i, g))
      h = tf.multiply(o, tf.nn.tanh(c))

      dva_out = tf.matmul(h, w_fc) + b_fc

      dva_outs.append(dva_out)

    return dva_outs
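
  # Reference note (added): this is standard multiplicative (dot-product)
  # attention. The previous hidden state gives the query q = h w_a, and the
  # flattened feature map of the current canvas serves as both keys and
  # values, so x_t = softmax(q k^T) v is a spatially attended feature vector
  # of length n_in, fed into the same LSTM cell as the average variant.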

  def classification(self, x):
    # stack per-canvas logits to [n_canvases, batch, n_classes], apply
    # softmax over classes, then average the predictions across canvases
    predictions = tf.nn.softmax(tf.stack(x), axis = 2)

    return tf.reduce_mean(predictions, axis = 0)
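
  # Note (added): averaging the per-canvas softmax outputs treats the
  # canvases as an ensemble; with a single canvas this reduces to ordinary
  # softmax classification.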

  # Multi Canvas
  def inference_multi(self, x, height, width, n_canvases, filter_size, n_filters_1, n_filters_2, \
               n_units_1, n_units_2, keep_prob):
    canvases = self.attention_canvas_generation(x, height, width)

    cnn_features = []
    for i in range(n_canvases):
      with tf.variable_scope('cnn_feature_{}'.format(i)):
        cnn_feature = self.cnn_feature_learning(canvases[i], filter_size, n_filters_1, n_filters_2)
        cnn_features.append(cnn_feature) 

    height_2 = 7  # 28 / 2 / 2
    width_2 = 7   # 28 / 2 / 2

    fc_outs = []
    for i in range(n_canvases):
      with tf.variable_scope('fc_out_{}'.format(i)):
        fc_out = self.two_fc_layers(cnn_features[i], height_2, width_2, \
                                    n_filters_2, n_units_1, n_units_2, keep_prob)
        fc_outs.append(fc_out) 

    return self.classification(fc_outs)

  # Single Canvas
  def inference_single(self, x, height, width, filter_size, n_filters_1, n_filters_2, \
                  n_fc_units_1, n_fc_units_2, keep_prob):
    x_reshaped = tf.reshape(x, [-1, height, width, 1])

    with tf.variable_scope('conv_1'):
      w = self.weight_variable('w', [filter_size, filter_size, 1, n_filters_1])
      b = self.bias_variable('b', [n_filters_1])

      # no max_pooling
      #conv_1 = tf.nn.relu(tf.nn.conv2d(x_reshaped, w, strides = [1, 2, 2, 1], padding = 'SAME') + b)

      # with max_pooling
      conv_1 = tf.nn.relu(tf.nn.conv2d(x_reshaped, w, strides = [1, 1, 1, 1], padding = 'SAME') + b)
      conv_1 = tf.nn.max_pool(conv_1, ksize = [1, 2, 2, 1], strides = [1, 2, 2, 1], padding = 'SAME')

    with tf.variable_scope('conv_2'):
      w = self.weight_variable('w', [filter_size, filter_size, n_filters_1, n_filters_2])
      b = self.bias_variable('b', [n_filters_2])

      # no max_pooling
      #conv_2 = tf.nn.relu(tf.nn.conv2d(conv_1, w, strides = [1, 2, 2, 1], padding = 'SAME') + b)

      # with max_pooling
      conv_2 = tf.nn.relu(tf.nn.conv2d(conv_1, w, strides = [1, 1, 1, 1], padding = 'SAME') + b)
      conv_2 = tf.nn.max_pool(conv_2, ksize = [1, 2, 2, 1], strides = [1, 2, 2, 1], padding = 'SAME')

    conv_2_flat = tf.reshape(conv_2, [-1, 7 * 7 * n_filters_2])

    with tf.variable_scope('fc_1'):
      w = self.weight_variable('w', [7 * 7 * n_filters_2, n_fc_units_1])
      b = self.bias_variable('b', [n_fc_units_1])

      fc_1 = tf.nn.relu(tf.matmul(conv_2_flat, w) + b)

    fc_1_dropout = tf.nn.dropout(fc_1, keep_prob)

    with tf.variable_scope('fc_2'):
      w = self.weight_variable('w', [n_fc_units_1, n_fc_units_2])
      b = self.bias_variable('b', [n_fc_units_2])

      fc_2 = tf.matmul(fc_1_dropout, w) + b

    return tf.nn.softmax(fc_2, axis = 1)

  # DVAN
  def inference(self, x, height, width, n_canvases, filter_size, n_filters_1, n_filters_2, \
                    batch_size, n_in, n_lstm_units, n_fc_units):
    canvases = self.attention_canvas_generation(x, height, width)

    cnn_features = []
    for i in range(n_canvases):
      with tf.variable_scope('cnn_feature_{}'.format(i)):
        cnn_feature = self.cnn_feature_learning(canvases[i], filter_size, n_filters_1, n_filters_2)
        cnn_features.append(cnn_feature) 

    height_2 = 7  # 28 / 2 / 2
    width_2 = 7   # 28 / 2 / 2

    # DVAN Average
    dva_outs = self.diversified_visual_attention_average(cnn_features, height_2, width_2, n_canvases, \
                                                         batch_size, n_in, n_lstm_units, n_fc_units)

    # DVAN Multiplicative
    #dva_outs = self.diversified_visual_attention_multi(cnn_features, height_2, width_2, n_canvases, \
    #                                                     batch_size, n_in, n_lstm_units, n_fc_units)

    return self.classification(dva_outs)

  def loss(self, y, t):
    cross_entropy = - tf.reduce_mean(tf.reduce_sum(t * tf.log(tf.clip_by_value(y, 1e-10, 1.0)), axis = 1))
    return cross_entropy

  def accuracy(self, y, t):
    correct_preds = tf.equal(tf.argmax(y, axis = 1), tf.argmax(t, axis = 1))

    return tf.reduce_mean(tf.cast(correct_preds, tf.float32))

  def training(self, loss, learning_rate):
    optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate)
    train_step = optimizer.minimize(loss)
    return train_step

  def training_clipped(self, loss, learning_rate, clip_norm):
    optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate)

    grads_and_vars = optimizer.compute_gradients(loss)
    # skip variables without gradients to avoid passing None to clip_by_norm
    clipped_grads_and_vars = [(tf.clip_by_norm(grad, clip_norm = clip_norm), var) \
                              for grad, var in grads_and_vars if grad is not None]
    train_step = optimizer.apply_gradients(clipped_grads_and_vars)

    return train_step
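
  # Note (added): tf.clip_by_norm clips each gradient tensor independently;
  # tf.clip_by_global_norm, which bounds the joint norm over all variables,
  # is the more common choice for recurrent models.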

  def fit(self, images_train, labels_train, images_test, labels_test, \
          n_canvases, filter_size, n_filters_1, n_filters_2, n_fc_units, \
          n_lstm_units, learning_rate, n_iter, batch_size, show_step, is_saving, model_path):

    tf.reset_default_graph()

    x = tf.placeholder(shape = [None, 28 * 28], dtype = tf.float32)
    t = tf.placeholder(shape = [None, 10], dtype = tf.float32)
    keep_prob = tf.placeholder(shape = (), dtype = tf.float32)

    # Multi Canvas
    #y = self.inference_multi(x, 28, 28, n_canvases, filter_size, n_filters_1, n_filters_2, \
    #           n_fc_units, 10, keep_prob)

    # Single Canvas
    y = self.inference_single(x, 28, 28, filter_size, n_filters_1, n_filters_2, \
                  n_fc_units, 10, keep_prob)

    # DVAN
    #y = self.inference(x, 28, 28, n_canvases, filter_size, n_filters_1, n_filters_2, \
    #                       batch_size, n_filters_2, n_lstm_units, 10)

    loss = self.loss(y, t)

    # Without Gradient Clipping
    train_step = self.training(loss, learning_rate)
    # With Gradient Clipping
    #train_step = self.training_clipped(loss, learning_rate, 0.1)

    acc = self.accuracy(y, t)

    init = tf.global_variables_initializer()
    saver = tf.train.Saver()

    with tf.Session() as sess:

      sess.run(init)

      history_loss_train = []
      history_acc_train = []
      history_loss_test = []
      history_acc_test = []

      for i in range(n_iter):
        # Train
        rand_index = np.random.choice(len(images_train), size = batch_size)
        x_batch = images_train[rand_index]
        y_batch = labels_train[rand_index]

        feed_dict = {x: x_batch, t: y_batch, keep_prob: 0.7}

        sess.run(train_step, feed_dict = feed_dict)

        temp_loss = sess.run(loss, feed_dict = feed_dict)
        temp_acc = sess.run(acc, feed_dict = feed_dict)

        history_loss_train.append(temp_loss)
        history_acc_train.append(temp_acc)

        if (i + 1) % show_step == 0:
          print ('--------------------')
          print ('Iteration: ' + str(i + 1) + '  Loss: ' + str(temp_loss) + \
                '  Accuracy: ' + str(temp_acc))

        # Test
        rand_index = np.random.choice(len(images_test), size = batch_size)
        x_batch = images_test[rand_index]
        y_batch = labels_test[rand_index]

        feed_dict = {x: x_batch, t: y_batch, keep_prob: 1.0}

        temp_loss = sess.run(loss, feed_dict = feed_dict)
        temp_acc = sess.run(acc, feed_dict = feed_dict)

        history_loss_test.append(temp_loss)
        history_acc_test.append(temp_acc)

      if is_saving:
        model_path = saver.save(sess, model_path)
        print ('done saving at ', model_path)

    fig = plt.figure(figsize = (10, 3))
    ax1 = fig.add_subplot(1, 2, 1)
    ax1.plot(range(n_iter), history_loss_train, 'b-', label = 'Train')
    ax1.plot(range(n_iter), history_loss_test, 'r--', label = 'Test')
    ax1.set_title('Loss')
    ax1.legend(loc = 'upper right')

    ax2 = fig.add_subplot(1, 2, 2)
    ax2.plot(range(n_iter), history_acc_train, 'b-', label = 'Train')
    ax2.plot(range(n_iter), history_acc_test, 'r--', label = 'Test')
    ax2.set_title('Accuracy')
    ax2.legend(loc = 'lower right')

    plt.show()

Parameters

n_canvases = 5
filter_size = 5
n_filters_1 = 32
n_filters_2 = 32
n_fc_units = 64
n_lstm_units = 64
learning_rate = 0.001
batch_size = 64
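
A minimal usage sketch (added, not from the original post): assuming the data is MNIST loaded via the TF 1.x tutorial helper, and with hypothetical values for n_iter, show_step, and model_path, the model could be driven as follows.

from tensorflow.examples.tutorials.mnist import input_data

# load MNIST as flat 784-dimensional images with one-hot labels (assumption)
mnist = input_data.read_data_sets('MNIST_data', one_hot = True)

model = DiversifiedVisualAttention()
model.fit(mnist.train.images, mnist.train.labels, \
          mnist.test.images, mnist.test.labels, \
          n_canvases = 5, filter_size = 5, n_filters_1 = 32, n_filters_2 = 32, \
          n_fc_units = 64, n_lstm_units = 64, learning_rate = 0.001, \
          n_iter = 1000, batch_size = 64, show_step = 100, \
          is_saving = False, model_path = './dvan_model')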

Output

  1. Single Canvas
    [loss/accuracy training curves]

  2. Multi Canvas
    [loss/accuracy training curves]

  3. DVAN-Avg
    [loss/accuracy training curves]

  4. DVAN
    [loss/accuracy training curves]
