ポイント
- Diversified Visual Attention Network を実装し、具体的な数値で確認。
- 今後、パフォーマンス検証を実施。
レファレンス
1. Diversified Visual Attention Networks for Fine-Grained Object Classification
(参照論文より引用)
データ
- Distorted MNIST。作成方法は「Distorted MNIST 作成に関するメモ」を参照。
サンプルコード
class DiversifiedVisualAttention():
    """Diversified Visual Attention Network (DVAN) experiments in TensorFlow 1.x.

    Three graph variants can be built:

    * ``inference_single`` - a plain two-layer CNN on the full image.
    * ``inference_multi``  - one CNN + FC head per canvas, predictions averaged.
    * ``inference``        - one CNN per canvas, features fed sequentially to a
      hand-written LSTM (average pooling or multiplicative attention),
      per-step predictions averaged.

    The module-level names ``tf`` (TensorFlow 1.x), ``np`` (NumPy) and ``plt``
    (matplotlib.pyplot) are expected to be imported by the surrounding file.
    """

    def __init__(self):
        pass

    # ------------------------------------------------------------------
    # Variable helpers
    # ------------------------------------------------------------------
    def weight_variable(self, name, shape):
        """Create/get a float32 variable initialised from N(0, 0.01) (truncated)."""
        initializer = tf.truncated_normal_initializer(mean=0.0, stddev=0.01, dtype=tf.float32)
        return tf.get_variable(name, shape, initializer=initializer)

    def bias_variable(self, name, shape):
        """Create/get a float32 variable initialised to zero."""
        initializer = tf.constant_initializer(value=0.0, dtype=tf.float32)
        return tf.get_variable(name, shape, initializer=initializer)

    @staticmethod
    def _pooled_size(size, n_pools=2):
        """Return the spatial size left after ``n_pools`` stride-2 'SAME' poolings.

        Each 'SAME' pooling with stride 2 yields ``ceil(size / 2)``.
        For ``size == 28`` and two poolings this is 7.
        """
        for _ in range(n_pools):
            size = (size + 1) // 2
        return size

    # ------------------------------------------------------------------
    # Building blocks
    # ------------------------------------------------------------------
    def attention_canvas_generation(self, x, hight, width):
        """Cut the input into attention canvases.

        Args:
            x: tensor reshapeable to ``[-1, hight, width, 1]``.
            hight: image height (must be even for the 2x2 split).
            width: image width (must be even for the 2x2 split).

        Returns:
            A list of 5 tensors, each ``[batch, hight, width, 1]``: the full
            image followed by the four quadrants (top-left, top-right,
            bottom-left, bottom-right) resized back to ``hight x width``.
        """
        # full
        img_full = tf.reshape(x, [-1, hight, width, 1])
        canvases = [img_full]

        # 2x2: split rows first, then columns of each half.
        top, bottom = tf.split(img_full, 2, axis=1)
        quadrants = list(tf.split(top, 2, axis=2)) + list(tf.split(bottom, 2, axis=2))
        for quadrant in quadrants:
            canvases.append(tf.image.resize_images(quadrant, [hight, width]))

        # NOTE: a 3x3 grid (padding the 28x28 image by 1 pixel per side so it
        # divides by 3, then splitting/resizing as above) was tried by the
        # original author and left disabled; it is intentionally not generated.
        return canvases

    def cnn_feature_learning(self, x, filter_size, n_filters_1, n_filters_2):
        """Two conv(ReLU) + 2x2 max-pool stages on a single-channel image.

        Variables ``w_1, b_1, w_2, b_2`` are created in the current variable
        scope. The output has ``n_filters_2`` channels and each spatial
        dimension reduced by two stride-2 'SAME' poolings.
        """
        w_1 = self.weight_variable('w_1', [filter_size, filter_size, 1, n_filters_1])
        b_1 = self.bias_variable('b_1', [n_filters_1])
        conv_1 = tf.nn.relu(tf.nn.conv2d(x, w_1, strides=[1, 1, 1, 1], padding='SAME') + b_1)
        conv_1 = tf.nn.max_pool(conv_1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

        w_2 = self.weight_variable('w_2', [filter_size, filter_size, n_filters_1, n_filters_2])
        b_2 = self.bias_variable('b_2', [n_filters_2])
        conv_2 = tf.nn.relu(tf.nn.conv2d(conv_1, w_2, strides=[1, 1, 1, 1], padding='SAME') + b_2)
        return tf.nn.max_pool(conv_2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

    def two_fc_layers(self, x, hight, width, n_filters, n_fc_units_1, n_fc_units_2, keep_prob):
        """Flatten ``x`` and apply FC(ReLU) -> dropout -> FC; returns logits.

        ``hight``/``width``/``n_filters`` describe the feature map being
        flattened; ``keep_prob`` is the dropout keep probability.
        """
        flat_dim = hight * width * n_filters
        x = tf.reshape(x, [-1, flat_dim])
        w_1 = self.weight_variable('w_1', [flat_dim, n_fc_units_1])
        b_1 = self.bias_variable('b_1', [n_fc_units_1])
        fc_1 = tf.nn.relu(tf.matmul(x, w_1) + b_1)
        fc_1_dropout = tf.nn.dropout(fc_1, keep_prob)
        w_2 = self.weight_variable('w_2', [n_fc_units_1, n_fc_units_2])
        b_2 = self.bias_variable('b_2', [n_fc_units_2])
        return tf.matmul(fc_1_dropout, w_2) + b_2

    def _lstm_step(self, x_t, h, c, w_x, w_h, b):
        """Run one LSTM cell update and return the new ``(h, c)``.

        The fused pre-activation is split, in order, into input gate, forget
        gate, output gate and candidate cell value.
        """
        pre_activation = tf.matmul(x_t, w_x) + tf.matmul(h, w_h) + b
        in_gate, forget_gate, out_gate, candidate = tf.split(pre_activation, 4, axis=1)
        in_gate = tf.nn.sigmoid(in_gate)
        forget_gate = tf.nn.sigmoid(forget_gate)
        out_gate = tf.nn.sigmoid(out_gate)
        candidate = tf.nn.tanh(candidate)
        c = forget_gate * c + in_gate * candidate
        h = out_gate * tf.nn.tanh(c)
        return h, c

    def diversified_visual_attention_average(self, x, hight, width, n_canvases,
                                             batch_size, n_in, n_lstm_units, n_fc_units):
        """LSTM over canvases using spatially averaged CNN features.

        Args:
            x: sequence of per-canvas feature maps ``[batch, h, w, n_in]``.
            hight, width: unused here; kept so the signature matches
                ``diversified_visual_attention_multi``.
            n_canvases: number of LSTM steps (canvases) to consume.
            batch_size: fixed batch size used for the zero initial state.
            n_in: channel count of each feature map.
            n_lstm_units: LSTM state size.
            n_fc_units: size of the per-step output (logits).

        Returns:
            List of ``n_canvases`` tensors ``[batch_size, n_fc_units]``.
        """
        w_x = self.weight_variable('w_x', [n_in, n_lstm_units * 4])
        w_h = self.weight_variable('w_h', [n_lstm_units, n_lstm_units * 4])
        b = self.bias_variable('b', [n_lstm_units * 4])
        w_fc = self.weight_variable('w_fc', [n_lstm_units, n_fc_units])
        b_fc = self.bias_variable('b_fc', [n_fc_units])

        h = tf.zeros(shape=[batch_size, n_lstm_units], dtype=tf.float32)
        c = tf.zeros(shape=[batch_size, n_lstm_units], dtype=tf.float32)

        dva_outs = []
        # LSTM
        for step in range(n_canvases):
            # average over the spatial axes
            x_t = tf.reduce_mean(x[step], axis=[1, 2])
            h, c = self._lstm_step(x_t, h, c, w_x, w_h, b)
            dva_outs.append(tf.matmul(h, w_fc) + b_fc)
        return dva_outs

    def diversified_visual_attention_multi(self, x, hight, width, n_canvases,
                                           batch_size, n_in, n_lstm_units, n_fc_units):
        """LSTM over canvases using multiplicative attention on CNN features.

        At each step the previous hidden state is projected to a query, scored
        against every spatial location of the canvas feature map (dot product),
        soft-maxed over locations, and the weighted sum of locations is the
        LSTM input.

        Args:
            x: sequence of per-canvas feature maps ``[batch_size, hight, width, n_in]``.
            hight, width: spatial size of each feature map.
            n_canvases: number of LSTM steps (canvases) to consume.
            batch_size: fixed batch size (used in reshapes and initial state).
            n_in: channel count of each feature map.
            n_lstm_units: LSTM state size.
            n_fc_units: size of the per-step output (logits).

        Returns:
            List of ``n_canvases`` tensors ``[batch_size, n_fc_units]``.
        """
        w_a = self.weight_variable('w_a', [n_lstm_units, n_in])
        w_x = self.weight_variable('w_x', [n_in, n_lstm_units * 4])
        w_h = self.weight_variable('w_h', [n_lstm_units, n_lstm_units * 4])
        b = self.bias_variable('b', [n_lstm_units * 4])
        w_fc = self.weight_variable('w_fc', [n_lstm_units, n_fc_units])
        b_fc = self.bias_variable('b_fc', [n_fc_units])

        h = tf.zeros(shape=[batch_size, n_lstm_units], dtype=tf.float32)
        c = tf.zeros(shape=[batch_size, n_lstm_units], dtype=tf.float32)

        dva_outs = []
        # LSTM
        for step in range(n_canvases):
            # multiplicative attention
            query = tf.reshape(tf.matmul(h, w_a), [batch_size, 1, n_in])
            keys = tf.reshape(x[step], [batch_size, hight * width, n_in])
            values = keys
            scores = tf.matmul(query, tf.transpose(keys, [0, 2, 1]))
            weights = tf.nn.softmax(scores, axis=2)
            x_t = tf.reshape(tf.matmul(weights, values), [batch_size, n_in])

            h, c = self._lstm_step(x_t, h, c, w_x, w_h, b)
            dva_outs.append(tf.matmul(h, w_fc) + b_fc)
        return dva_outs

    def classification(self, x):
        """Average the per-canvas softmax predictions.

        Args:
            x: list of logits tensors, each ``[batch, n_classes]``.

        Returns:
            Tensor ``[batch, n_classes]`` of averaged class probabilities.
        """
        stacked = tf.stack(x, axis=0)  # [n_canvases, batch, n_classes]
        predictions = tf.nn.softmax(stacked, axis=2)
        return tf.reduce_mean(predictions, axis=0)

    def _canvas_cnn_features(self, x, hight, width, n_canvases,
                             filter_size, n_filters_1, n_filters_2):
        """Generate canvases and run a separate CNN on the first ``n_canvases``."""
        canvases = self.attention_canvas_generation(x, hight, width)
        if not 1 <= n_canvases <= len(canvases):
            raise ValueError('n_canvases must be between 1 and {}, got {}'.format(
                len(canvases), n_canvases))
        cnn_features = []
        for idx in range(n_canvases):
            with tf.variable_scope('cnn_feature_{}'.format(idx)):
                cnn_features.append(self.cnn_feature_learning(
                    canvases[idx], filter_size, n_filters_1, n_filters_2))
        return cnn_features

    # ------------------------------------------------------------------
    # Model variants
    # ------------------------------------------------------------------
    # Multi Canvas
    def inference_multi(self, x, hight, width, n_canvases, filter_size, n_filters_1, n_filters_2,
                        n_units_1, n_units_2, keep_prob):
        """Per-canvas CNN + FC head; returns the averaged class probabilities.

        Raises:
            ValueError: if ``n_canvases`` exceeds the number of generated canvases.
        """
        cnn_features = self._canvas_cnn_features(
            x, hight, width, n_canvases, filter_size, n_filters_1, n_filters_2)

        # Feature-map size after the two stride-2 poolings (7 for 28x28 input).
        hight_2 = self._pooled_size(hight)
        width_2 = self._pooled_size(width)

        fc_outs = []
        for idx in range(n_canvases):
            with tf.variable_scope('fc_out_{}'.format(idx)):
                fc_outs.append(self.two_fc_layers(
                    cnn_features[idx], hight_2, width_2,
                    n_filters_2, n_units_1, n_units_2, keep_prob))
        return self.classification(fc_outs)

    # Single Canvas
    def inference_single(self, x, hight, width, filter_size, n_filters_1, n_filters_2,
                         n_fc_units_1, n_fc_units_2, keep_prob):
        """Plain CNN on the full image; returns class probabilities.

        Architecture: conv(ReLU)+pool -> conv(ReLU)+pool -> FC(ReLU) ->
        dropout -> FC -> softmax.
        """
        x_reshaped = tf.reshape(x, [-1, hight, width, 1])

        with tf.variable_scope('conv_1'):
            w = self.weight_variable('w', [filter_size, filter_size, 1, n_filters_1])
            b = self.bias_variable('b', [n_filters_1])
            # Stride-1 convolution followed by max pooling (a stride-2
            # convolution without pooling was the alternative tried).
            conv_1 = tf.nn.relu(tf.nn.conv2d(x_reshaped, w, strides=[1, 1, 1, 1], padding='SAME') + b)
            conv_1 = tf.nn.max_pool(conv_1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

        with tf.variable_scope('conv_2'):
            w = self.weight_variable('w', [filter_size, filter_size, n_filters_1, n_filters_2])
            b = self.bias_variable('b', [n_filters_2])
            conv_2 = tf.nn.relu(tf.nn.conv2d(conv_1, w, strides=[1, 1, 1, 1], padding='SAME') + b)
            conv_2 = tf.nn.max_pool(conv_2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

        # 7 * 7 * n_filters_2 for a 28x28 input.
        flat_dim = self._pooled_size(hight) * self._pooled_size(width) * n_filters_2
        conv_2_flat = tf.reshape(conv_2, [-1, flat_dim])

        with tf.variable_scope('fc_1'):
            w = self.weight_variable('w', [flat_dim, n_fc_units_1])
            b = self.bias_variable('b', [n_fc_units_1])
            fc_1 = tf.nn.relu(tf.matmul(conv_2_flat, w) + b)
            fc_1_dropout = tf.nn.dropout(fc_1, keep_prob)

        with tf.variable_scope('fc_2'):
            w = self.weight_variable('w', [n_fc_units_1, n_fc_units_2])
            b = self.bias_variable('b', [n_fc_units_2])
            fc_2 = tf.matmul(fc_1_dropout, w) + b

        return tf.nn.softmax(fc_2, axis=1)

    # DVAN
    def inference(self, x, hight, width, n_canvases, filter_size, n_filters_1, n_filters_2,
                  batch_size, n_in, n_lstm_units, n_fc_units, attention='average'):
        """Build the DVAN graph; returns the averaged class probabilities.

        Args:
            attention: ``'average'`` (default, the original behaviour) feeds
                spatially averaged features to the LSTM; ``'multi'`` uses
                multiplicative attention instead.

        Raises:
            ValueError: on an unknown ``attention`` value or if ``n_canvases``
                exceeds the number of generated canvases.
        """
        if attention == 'average':
            attend = self.diversified_visual_attention_average
        elif attention == 'multi':
            attend = self.diversified_visual_attention_multi
        else:
            raise ValueError("attention must be 'average' or 'multi', got {!r}".format(attention))

        cnn_features = self._canvas_cnn_features(
            x, hight, width, n_canvases, filter_size, n_filters_1, n_filters_2)

        # Feature-map size after the two stride-2 poolings (7 for 28x28 input).
        hight_2 = self._pooled_size(hight)
        width_2 = self._pooled_size(width)

        dva_outs = attend(cnn_features, hight_2, width_2, n_canvases,
                          batch_size, n_in, n_lstm_units, n_fc_units)
        return self.classification(dva_outs)

    # ------------------------------------------------------------------
    # Loss / metrics / optimisation
    # ------------------------------------------------------------------
    def loss(self, y, t):
        """Mean cross-entropy between probabilities ``y`` and one-hot targets ``t``.

        ``y`` is clipped to ``[1e-10, 1.0]`` before the log to avoid ``log(0)``.
        """
        cross_entropy = - tf.reduce_mean(
            tf.reduce_sum(t * tf.log(tf.clip_by_value(y, 1e-10, 1.0)), axis=1))
        return cross_entropy

    def accuracy(self, y, t):
        """Fraction of rows where ``argmax(y)`` equals ``argmax(t)``."""
        correct_preds = tf.equal(tf.argmax(y, axis=1), tf.argmax(t, axis=1))
        return tf.reduce_mean(tf.cast(correct_preds, tf.float32))

    def training(self, loss, learning_rate):
        """Return an Adam training op minimising ``loss``."""
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        return optimizer.minimize(loss)

    def training_clipped(self, loss, learning_rate, clip_norm):
        """Return an Adam training op with each gradient clipped by L2 norm.

        Variables that receive no gradient (``compute_gradients`` reports
        ``None`` for them) are skipped instead of being passed to
        ``tf.clip_by_norm``, which cannot handle ``None``.
        """
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        grads_and_vars = optimizer.compute_gradients(loss)
        clipped_grads_and_vars = [
            (tf.clip_by_norm(grad, clip_norm=clip_norm), var)
            for grad, var in grads_and_vars
            if grad is not None
        ]
        return optimizer.apply_gradients(clipped_grads_and_vars)

    # ------------------------------------------------------------------
    # Training loop
    # ------------------------------------------------------------------
    def fit(self, images_train, labels_train, images_test, labels_Test,
            n_canvases, filter_size, n_filters_1, n_filters_2, n_fc_units,
            n_lstm_units, learning_rate, n_iter, batch_size, show_step, is_saving, model_path,
            model='single', attention='average', clip_norm=None):
        """Build a model on 28x28 inputs / 10 classes, train it and plot curves.

        Every iteration draws a random training batch (with replacement),
        performs one optimisation step, records train loss/accuracy, then
        evaluates one random test batch. Loss and accuracy histories are
        plotted at the end.

        Args:
            images_train: array indexable by an index array, rows of 784 floats.
            labels_train: one-hot training labels, 10 columns.
            images_test: test images, same layout as ``images_train``.
            labels_Test: one-hot test labels (parameter name kept as-is for
                backward compatibility with keyword callers).
            n_canvases: canvases used by the 'multi' / 'dvan' models.
            filter_size, n_filters_1, n_filters_2: CNN hyper-parameters.
            n_fc_units: hidden FC width ('single' and 'multi' models).
            n_lstm_units: LSTM state size ('dvan' model).
            learning_rate: Adam learning rate.
            n_iter: number of training iterations.
            batch_size: batch size for both train and test batches.
            show_step: print training metrics every ``show_step`` iterations.
            is_saving: if true, save a checkpoint to ``model_path`` at the end.
            model_path: checkpoint path used when ``is_saving`` is true.
            model: ``'single'`` (default, original behaviour), ``'multi'`` or
                ``'dvan'``.
            attention: attention variant forwarded to ``inference`` when
                ``model == 'dvan'``.
            clip_norm: if not ``None``, clip each gradient to this L2 norm.

        Raises:
            ValueError: if ``model`` is not one of the supported names.
        """
        if model not in ('single', 'multi', 'dvan'):
            raise ValueError("model must be 'single', 'multi' or 'dvan', got {!r}".format(model))

        # The signature spells this parameter ``labels_Test``; the original
        # body read an undefined ``labels_test``. Bind it explicitly.
        labels_test = labels_Test

        hight, width, n_classes = 28, 28, 10

        tf.reset_default_graph()

        x = tf.placeholder(shape=[None, hight * width], dtype=tf.float32)
        t = tf.placeholder(shape=[None, n_classes], dtype=tf.float32)
        keep_prob = tf.placeholder(shape=(), dtype=tf.float32)

        if model == 'multi':
            # Multi Canvas
            y = self.inference_multi(x, hight, width, n_canvases, filter_size,
                                     n_filters_1, n_filters_2, n_fc_units, n_classes, keep_prob)
        elif model == 'dvan':
            # DVAN: the LSTM input size equals the CNN output channel count.
            y = self.inference(x, hight, width, n_canvases, filter_size,
                               n_filters_1, n_filters_2, batch_size, n_filters_2,
                               n_lstm_units, n_classes, attention=attention)
        else:
            # Single Canvas
            y = self.inference_single(x, hight, width, filter_size,
                                      n_filters_1, n_filters_2, n_fc_units, n_classes, keep_prob)

        loss = self.loss(y, t)
        if clip_norm is None:
            train_step = self.training(loss, learning_rate)
        else:
            train_step = self.training_clipped(loss, learning_rate, clip_norm)
        acc = self.accuracy(y, t)

        init = tf.global_variables_initializer()
        saver = tf.train.Saver()

        history_loss_train = []
        history_acc_train = []
        history_loss_test = []
        history_acc_test = []

        with tf.Session() as sess:
            sess.run(init)

            for step in range(n_iter):
                # Train
                rand_index = np.random.choice(len(images_train), size=batch_size)
                feed_dict = {x: images_train[rand_index],
                             t: labels_train[rand_index],
                             keep_prob: 0.7}
                sess.run(train_step, feed_dict=feed_dict)
                # One run for both metrics so they share the same dropout mask.
                temp_loss, temp_acc = sess.run([loss, acc], feed_dict=feed_dict)
                history_loss_train.append(temp_loss)
                history_acc_train.append(temp_acc)

                if (step + 1) % show_step == 0:
                    print('--------------------')
                    print('Iteration: ' + str(step + 1) + ' Loss: ' + str(temp_loss) +
                          ' Accuracy: ' + str(temp_acc))

                # Test (dropout disabled)
                rand_index = np.random.choice(len(images_test), size=batch_size)
                feed_dict = {x: images_test[rand_index],
                             t: labels_test[rand_index],
                             keep_prob: 1.0}
                temp_loss, temp_acc = sess.run([loss, acc], feed_dict=feed_dict)
                history_loss_test.append(temp_loss)
                history_acc_test.append(temp_acc)

            if is_saving:
                model_path = saver.save(sess, model_path)
                print('done saving at ', model_path)

        fig = plt.figure(figsize=(10, 3))
        ax1 = fig.add_subplot(1, 2, 1)
        ax1.plot(range(n_iter), history_loss_train, 'b-', label='Train')
        ax1.plot(range(n_iter), history_loss_test, 'r--', label='Test')
        ax1.set_title('Loss')
        ax1.legend(loc='upper right')

        ax2 = fig.add_subplot(1, 2, 2)
        ax2.plot(range(n_iter), history_acc_train, 'b-', label='Train')
        ax2.plot(range(n_iter), history_acc_test, 'r--', label='Test')
        ax2.set_title('Accuracy')
        ax2.legend(loc='lower right')

        plt.show()
パラメータ
# Hyper-parameters. The names match parameters of
# DiversifiedVisualAttention.fit; presumably they are passed to it by the
# caller (the call itself is not shown here).

# Canvases: the full image plus the four 2x2 quadrants.
n_canvases = 5

# Convolutional feature extractor: kernel side length and filter counts.
filter_size = 5
n_filters_1 = n_filters_2 = 32

# Widths of the fully connected hidden layer and of the LSTM state.
n_fc_units = n_lstm_units = 64

# Optimisation settings.
batch_size = 64
learning_rate = 1e-3