Notes on Implementing the Diversified Visual Attention Network

Posted at 2018-06-18

Key Points

  • Implemented the Diversified Visual Attention Network and checked its behavior with concrete numbers.
  • Performance evaluation is planned as a next step.

References

1. Diversified Visual Attention Networks for Fine-Grained Object Classification

[Figures: model overview diagrams, quoted from the reference paper]

Data

28x28 grayscale images in 10 classes (MNIST), as implied by the input placeholders ([None, 28 * 28] and [None, 10]) in the sample code below.

Sample Code

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt


class DiversifiedVisualAttention():
  def __init__(self):
    pass

  def weight_variable(self, name, shape):
    initializer = tf.truncated_normal_initializer(mean = 0.0, stddev = 0.01, dtype = tf.float32)
    return tf.get_variable(name, shape, initializer = initializer)

  def bias_variable(self, name, shape):
    initializer = tf.constant_initializer(value = 0.0, dtype = tf.float32)
    return tf.get_variable(name, shape, initializer = initializer)

  def attention_canvas_generation(self, x, height, width):
    canvases = []

    # full image
    img_full = tf.reshape(x, [-1, height, width, 1])
    canvases.append(img_full)

    # 2x2: split into four quadrants, then resize each back to full size
    imgs = tf.split(img_full, 2, axis = 1)
    imgs1 = tf.split(imgs[0], 2, axis = 2)
    imgs2 = tf.split(imgs[1], 2, axis = 2)
    imgs = tf.concat([imgs1, imgs2], axis = 0)

    for i in range(4):
      img = tf.image.resize_images(imgs[i], [height, width])
      canvases.append(img)

    # 3x3 (disabled)
    #if height == 28:
    #  imgs = tf.pad(img_full, [[0, 0], [1, 1], [1, 1], [0, 0]])

    #imgs = tf.split(imgs, 3, axis = 1)
    #imgs1 = tf.split(imgs[0], 3, axis = 2)
    #imgs2 = tf.split(imgs[1], 3, axis = 2)
    #imgs3 = tf.split(imgs[2], 3, axis = 2)

    #imgs = tf.concat([imgs1, imgs2, imgs3], axis = 0)

    #for i in range(9):
    #  img = tf.image.resize_images(imgs[i], [height, width])
    #  canvases.append(img)

    return canvases
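
  # Shape check (added note): for a 28x28 input this returns five canvases,
  # the full image plus its four 14x14 quadrants, each resized back to
  # [batch, 28, 28, 1]; this matches n_canvases = 5 in the parameter section.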

  def cnn_feature_learning(self, x, filter_size, n_filters_1, n_filters_2):
    w_1 = self.weight_variable('w_1', [filter_size, filter_size, 1, n_filters_1])
    b_1 = self.bias_variable('b_1', [n_filters_1])

    conv_1 = tf.nn.relu(tf.nn.conv2d(x, w_1, strides = [1, 1, 1, 1], padding = 'SAME') + b_1)
    conv_1 = tf.nn.max_pool(conv_1, ksize = [1, 2, 2, 1], strides = [1, 2, 2, 1], padding = 'SAME')

    w_2 = self.weight_variable('w_2', [filter_size, filter_size, n_filters_1, n_filters_2])
    b_2 = self.bias_variable('b_2', [n_filters_2])

    conv_2 = tf.nn.relu(tf.nn.conv2d(conv_1, w_2, strides = [1, 1, 1, 1], padding = 'SAME') + b_2)

    return tf.nn.max_pool(conv_2, ksize = [1, 2, 2, 1], strides = [1, 2, 2, 1], padding = 'SAME')
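
  # Shape note (added): each 2x2 max-pool halves the spatial size, so a
  # 28x28 canvas comes out as [batch, 7, 7, n_filters_2]. This is where the
  # hard-coded height_2 = width_2 = 7 in the inference methods comes from.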

  def two_fc_layers(self, x, height, width, n_filters, n_fc_units_1, n_fc_units_2, keep_prob):
    x = tf.reshape(x, [-1, height * width * n_filters])

    w_1 = self.weight_variable('w_1', [height * width * n_filters, n_fc_units_1])
    b_1 = self.bias_variable('b_1', [n_fc_units_1])

    fc_1 = tf.nn.relu(tf.matmul(x, w_1) + b_1)
    fc_1_dropout = tf.nn.dropout(fc_1, keep_prob)

    w_2 = self.weight_variable('w_2', [n_fc_units_1, n_fc_units_2])
    b_2 = self.bias_variable('b_2', [n_fc_units_2])

    return tf.matmul(fc_1_dropout, w_2) + b_2

  def diversified_visual_attention_average(self, x, height, width, n_canvases, \
                                          batch_size, n_in, n_lstm_units, n_fc_units):

    w_x = self.weight_variable('w_x', [n_in, n_lstm_units * 4])
    w_h = self.weight_variable('w_h', [n_lstm_units, n_lstm_units * 4])
    b = self.bias_variable('b', [n_lstm_units * 4])

    w_fc = self.weight_variable('w_fc', [n_lstm_units, n_fc_units])
    b_fc = self.bias_variable('b_fc', [n_fc_units])

    h = tf.zeros(shape = [batch_size, n_lstm_units], dtype = tf.float32)
    c = tf.zeros(shape = [batch_size, n_lstm_units], dtype = tf.float32)

    dva_outs = []

    # LSTM
    for t in range(n_canvases):
      # average-pool the CNN feature map of canvas t into a single vector
      x_t = tf.reduce_mean(x[t], axis = [1, 2])

      i, f, o, g = tf.split(tf.add(tf.add(tf.matmul(x_t, w_x), tf.matmul(h, w_h)), b), 4, axis = 1)

      i = tf.nn.sigmoid(i)
      f = tf.nn.sigmoid(f)
      o = tf.nn.sigmoid(o)
      g = tf.nn.tanh(g)

      c = tf.add(tf.multiply(f, c), tf.multiply(i, g))
      h = tf.multiply(o, tf.nn.tanh(c))

      dva_out = tf.matmul(h, w_fc) + b_fc

      dva_outs.append(dva_out)

    return dva_outs
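
  # Reference note (added): the loop above is a hand-rolled LSTM cell. With
  # [i; f; o; g] = x_t w_x + h w_h + b split into four equal parts:
  #   i, f, o = sigmoid(.)   (input / forget / output gates)
  #   g = tanh(.)            (candidate cell state)
  #   c = f * c + i * g
  #   h = o * tanh(c)
  # Each step's hidden state is passed through the shared head (w_fc, b_fc)
  # and collected in dva_outs.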

  def diversified_visual_attention_multi(self, x, height, width, n_canvases, \
                                          batch_size, n_in, n_lstm_units, n_fc_units):

    w_a = self.weight_variable('w_a', [n_lstm_units, n_in])
    w_x = self.weight_variable('w_x', [n_in, n_lstm_units * 4])
    w_h = self.weight_variable('w_h', [n_lstm_units, n_lstm_units * 4])
    b = self.bias_variable('b', [n_lstm_units * 4])

    w_fc = self.weight_variable('w_fc', [n_lstm_units, n_fc_units])
    b_fc = self.bias_variable('b_fc', [n_fc_units])

    h = tf.zeros(shape = [batch_size, n_lstm_units], dtype = tf.float32)
    c = tf.zeros(shape = [batch_size, n_lstm_units], dtype = tf.float32)

    dva_outs = []

    # LSTM
    for t in range(n_canvases):
      # multiplicative attention
      q = tf.matmul(h, w_a)
      q = tf.reshape(q, [batch_size, 1, n_in])
      k = tf.reshape(x[t], [batch_size, height * width, n_in])
      v = k
      a = tf.matmul(q, tf.transpose(k, [0, 2, 1]))
      a = tf.nn.softmax(a, axis = 2)
      x_t = tf.matmul(a, v)
      x_t = tf.reshape(x_t, [batch_size, n_in])

      i, f, o, g = tf.split(tf.add(tf.add(tf.matmul(x_t, w_x), tf.matmul(h, w_h)), b), 4, axis = 1)

      i = tf.nn.sigmoid(i)
      f = tf.nn.sigmoid(f)
      o = tf.nn.sigmoid(o)
      g = tf.nn.tanh(g)

      c = tf.add(tf.multiply(f, c), tf.multiply(i, g))
      h = tf.multiply(o, tf.nn.tanh(c))

      dva_out = tf.matmul(h, w_fc) + b_fc

      dva_outs.append(dva_out)

    return dva_outs
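
  # Reference note (added): this is standard multiplicative (dot-product)
  # attention. The previous hidden state gives the query q = h w_a, and the
  # flattened feature map of the current canvas serves as both keys and
  # values, so x_t = softmax(q k^T) v is a spatially attended feature vector
  # of length n_in, fed into the same LSTM cell as the average variant.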

  def classification(self, x):
    # stack per-canvas logits to [n_canvases, batch, n_classes], apply
    # softmax over classes, then average the predictions across canvases
    predictions = tf.nn.softmax(tf.stack(x), axis = 2)

    return tf.reduce_mean(predictions, axis = 0)
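
  # Note (added): averaging the per-canvas softmax outputs treats the
  # canvases as an ensemble; with a single canvas this reduces to ordinary
  # softmax classification.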

  # Multi Canvas
  def inference_multi(self, x, height, width, n_canvases, filter_size, n_filters_1, n_filters_2, \
               n_units_1, n_units_2, keep_prob):
    canvases = self.attention_canvas_generation(x, height, width)

    cnn_features = []
    for i in range(n_canvases):
      with tf.variable_scope('cnn_feature_{}'.format(i)):
        cnn_feature = self.cnn_feature_learning(canvases[i], filter_size, n_filters_1, n_filters_2)
        cnn_features.append(cnn_feature) 

    height_2 = 7  # 28 / 2 / 2
    width_2 = 7   # 28 / 2 / 2

    fc_outs = []
    for i in range(n_canvases):
      with tf.variable_scope('fc_out_{}'.format(i)):
        fc_out = self.two_fc_layers(cnn_features[i], height_2, width_2, \
                                    n_filters_2, n_units_1, n_units_2, keep_prob)
        fc_outs.append(fc_out) 

    return self.classification(fc_outs)

  # Single Canvas
  def inference_single(self, x, height, width, filter_size, n_filters_1, n_filters_2, \
                  n_fc_units_1, n_fc_units_2, keep_prob):
    x_reshaped = tf.reshape(x, [-1, height, width, 1])

    with tf.variable_scope('conv_1'):
      w = self.weight_variable('w', [filter_size, filter_size, 1, n_filters_1])
      b = self.bias_variable('b', [n_filters_1])

      # no max_pooling
      #conv_1 = tf.nn.relu(tf.nn.conv2d(x_reshaped, w, strides = [1, 2, 2, 1], padding = 'SAME') + b)

      # with max_pooling
      conv_1 = tf.nn.relu(tf.nn.conv2d(x_reshaped, w, strides = [1, 1, 1, 1], padding = 'SAME') + b)
      conv_1 = tf.nn.max_pool(conv_1, ksize = [1, 2, 2, 1], strides = [1, 2, 2, 1], padding = 'SAME')

    with tf.variable_scope('conv_2'):
      w = self.weight_variable('w', [filter_size, filter_size, n_filters_1, n_filters_2])
      b = self.bias_variable('b', [n_filters_2])

      # no max_pooling
      #conv_2 = tf.nn.relu(tf.nn.conv2d(conv_1, w, strides = [1, 2, 2, 1], padding = 'SAME') + b)

      # with max_pooling
      conv_2 = tf.nn.relu(tf.nn.conv2d(conv_1, w, strides = [1, 1, 1, 1], padding = 'SAME') + b)
      conv_2 = tf.nn.max_pool(conv_2, ksize = [1, 2, 2, 1], strides = [1, 2, 2, 1], padding = 'SAME')

    conv_2_flat = tf.reshape(conv_2, [-1, 7 * 7 * n_filters_2])

    with tf.variable_scope('fc_1'):
      w = self.weight_variable('w', [7 * 7 * n_filters_2, n_fc_units_1])
      b = self.bias_variable('b', [n_fc_units_1])

      fc_1 = tf.nn.relu(tf.matmul(conv_2_flat, w) + b)

    fc_1_dropout = tf.nn.dropout(fc_1, keep_prob)

    with tf.variable_scope('fc_2'):
      w = self.weight_variable('w', [n_fc_units_1, n_fc_units_2])
      b = self.bias_variable('b', [n_fc_units_2])

      fc_2 = tf.matmul(fc_1_dropout, w) + b

    return tf.nn.softmax(fc_2, axis = 1)

  # DVAN
  def inference(self, x, height, width, n_canvases, filter_size, n_filters_1, n_filters_2, \
                    batch_size, n_in, n_lstm_units, n_fc_units):
    canvases = self.attention_canvas_generation(x, height, width)

    cnn_features = []
    for i in range(n_canvases):
      with tf.variable_scope('cnn_feature_{}'.format(i)):
        cnn_feature = self.cnn_feature_learning(canvases[i], filter_size, n_filters_1, n_filters_2)
        cnn_features.append(cnn_feature) 

    height_2 = 7  # 28 / 2 / 2
    width_2 = 7   # 28 / 2 / 2

    # DVAN Average
    dva_outs = self.diversified_visual_attention_average(cnn_features, height_2, width_2, n_canvases, \
                                                         batch_size, n_in, n_lstm_units, n_fc_units)

    # DVAN Multiplicative
    #dva_outs = self.diversified_visual_attention_multi(cnn_features, height_2, width_2, n_canvases, \
    #                                                     batch_size, n_in, n_lstm_units, n_fc_units)

    return self.classification(dva_outs)

  def loss(self, y, t):
    cross_entropy = - tf.reduce_mean(tf.reduce_sum(t * tf.log(tf.clip_by_value(y, 1e-10, 1.0)), axis = 1))
    return cross_entropy

  def accuracy(self, y, t):
    correct_preds = tf.equal(tf.argmax(y, axis = 1), tf.argmax(t, axis = 1))

    return tf.reduce_mean(tf.cast(correct_preds, tf.float32))

  def training(self, loss, learning_rate):
    optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate)
    train_step = optimizer.minimize(loss)
    return train_step

  def training_clipped(self, loss, learning_rate, clip_norm):
    optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate)

    grads_and_vars = optimizer.compute_gradients(loss)
    # skip variables without gradients to avoid passing None to clip_by_norm
    clipped_grads_and_vars = [(tf.clip_by_norm(grad, clip_norm = clip_norm), var) \
                              for grad, var in grads_and_vars if grad is not None]
    train_step = optimizer.apply_gradients(clipped_grads_and_vars)

    return train_step
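
  # Note (added): tf.clip_by_norm clips each gradient tensor independently;
  # tf.clip_by_global_norm, which bounds the joint norm over all variables,
  # is the more common choice for recurrent models.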

  def fit(self, images_train, labels_train, images_test, labels_test, \
          n_canvases, filter_size, n_filters_1, n_filters_2, n_fc_units, \
          n_lstm_units, learning_rate, n_iter, batch_size, show_step, is_saving, model_path):

    tf.reset_default_graph()

    x = tf.placeholder(shape = [None, 28 * 28], dtype = tf.float32)
    t = tf.placeholder(shape = [None, 10], dtype = tf.float32)
    keep_prob = tf.placeholder(shape = (), dtype = tf.float32)

    # Multi Canvas
    #y = self.inference_multi(x, 28, 28, n_canvases, filter_size, n_filters_1, n_filters_2, \
    #           n_fc_units, 10, keep_prob)

    # Single Canvas
    y = self.inference_single(x, 28, 28, filter_size, n_filters_1, n_filters_2, \
                  n_fc_units, 10, keep_prob)

    # DVAN
    #y = self.inference(x, 28, 28, n_canvases, filter_size, n_filters_1, n_filters_2, \
    #                       batch_size, n_filters_2, n_lstm_units, 10)

    loss = self.loss(y, t)

    # Without Gradient Clipping
    train_step = self.training(loss, learning_rate)
    # With Gradient Clipping
    #train_step = self.training_clipped(loss, learning_rate, 0.1)

    acc = self.accuracy(y, t)

    init = tf.global_variables_initializer()
    saver = tf.train.Saver()

    with tf.Session() as sess:

      sess.run(init)

      history_loss_train = []
      history_acc_train = []
      history_loss_test = []
      history_acc_test = []

      for i in range(n_iter):
        # Train
        rand_index = np.random.choice(len(images_train), size = batch_size)
        x_batch = images_train[rand_index]
        y_batch = labels_train[rand_index]

        feed_dict = {x: x_batch, t: y_batch, keep_prob: 0.7}

        sess.run(train_step, feed_dict = feed_dict)

        temp_loss = sess.run(loss, feed_dict = feed_dict)
        temp_acc = sess.run(acc, feed_dict = feed_dict)

        history_loss_train.append(temp_loss)
        history_acc_train.append(temp_acc)

        if (i + 1) % show_step == 0:
          print ('--------------------')
          print ('Iteration: ' + str(i + 1) + '  Loss: ' + str(temp_loss) + \
                '  Accuracy: ' + str(temp_acc))

        # Test
        rand_index = np.random.choice(len(images_test), size = batch_size)
        x_batch = images_test[rand_index]
        y_batch = labels_test[rand_index]

        feed_dict = {x: x_batch, t: y_batch, keep_prob: 1.0}

        temp_loss = sess.run(loss, feed_dict = feed_dict)
        temp_acc = sess.run(acc, feed_dict = feed_dict)

        history_loss_test.append(temp_loss)
        history_acc_test.append(temp_acc)

      if is_saving:
        model_path = saver.save(sess, model_path)
        print ('done saving at ', model_path)

    fig = plt.figure(figsize = (10, 3))
    ax1 = fig.add_subplot(1, 2, 1)
    ax1.plot(range(n_iter), history_loss_train, 'b-', label = 'Train')
    ax1.plot(range(n_iter), history_loss_test, 'r--', label = 'Test')
    ax1.set_title('Loss')
    ax1.legend(loc = 'upper right')

    ax2 = fig.add_subplot(1, 2, 2)
    ax2.plot(range(n_iter), history_acc_train, 'b-', label = 'Train')
    ax2.plot(range(n_iter), history_acc_test, 'r--', label = 'Test')
    ax2.set_title('Accuracy')
    ax2.legend(loc = 'lower right')

    plt.show()

Parameters

n_canvases = 5
filter_size = 5
n_filters_1 = 32
n_filters_2 = 32
n_fc_units = 64
n_lstm_units = 64
learning_rate = 0.001
batch_size = 64
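
A minimal usage sketch (added, not from the original post): assuming the data is MNIST loaded via the TF 1.x tutorial helper, and with hypothetical values for n_iter, show_step, and model_path, the model could be driven as follows.

from tensorflow.examples.tutorials.mnist import input_data

# load MNIST as flat 784-dimensional images with one-hot labels (assumption)
mnist = input_data.read_data_sets('MNIST_data', one_hot = True)

model = DiversifiedVisualAttention()
model.fit(mnist.train.images, mnist.train.labels, \
          mnist.test.images, mnist.test.labels, \
          n_canvases = 5, filter_size = 5, n_filters_1 = 32, n_filters_2 = 32, \
          n_fc_units = 64, n_lstm_units = 64, learning_rate = 0.001, \
          n_iter = 1000, batch_size = 64, show_step = 100, \
          is_saving = False, model_path = './dvan_model')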

Output

  1. Single Canvas
    [loss/accuracy training curves]

  2. Multi Canvas
    [loss/accuracy training curves]

  3. DVAN-Avg
    [loss/accuracy training curves]

  4. DVAN
    [loss/accuracy training curves]
