More than 5 years have passed since last update.

画像オートエンコーダの学習におけるロスの選び方の違い（CelebAの場合）

Last updated at 2017-06-19 / Posted at 2017-06-17

（後編「画像オートエンコーダの学習におけるロスの選び方の違い 後編（MIT Places2の場合）」を書きました。）

目的

オートエンコーダやVAEなどで画像の再構成のロス（入力画像からの遠さ）を最小化するように学習します。いろいろな実装を見ているとロスの式で二乗誤差ロスを採用しているものとバイナリクロスエントロピロス（logloss?）を採用しているものがありどちらが良いのかいまいちわかりません。また画素単位のロスをそのまま最適化するものとサンプル内で画素毎のロスの平均を取るものがあり、こちらもどちらが良いのかよくわかりません。

最適化の流れから考えれば挙動の違いもなんとなく想像はつくのですがとりあえずベンチマークを取ってしまいます。

注意

結果画像として鮮明な顔写真が出てきますがオートエンコーダの出力であり自動生成的なものではありません。「驚異の人工知能実現！」みたいな記事ではありません。
いろいろと厳密性を欠く実験ではあります。実験の条件は次に述べます。

実験の概要

実験に使用したモデル

前半部分がモデル、後半部分がロスです。特に凝ったことはしていない畳み込み層によるオートエンコーダです。

中間の特徴ベクトルサイズは512
畳み込み層の活性化関数はReLU
線形層の活性化関数はtanh
デコーダの最終出力の活性化関数はSigmoid
デコーダの最終出力以外にはバッチノーマライズを使用
ロスの定義は二乗誤差とバイナリクロスエントロピの2種類を画素毎、サンプル毎、ミニバッチ全体の3通りで計6種類
オプティマイザはAdam

import tensorflow as tf
import math
from castanea.layers import conv2d, conv2d_transpose, linear, LayerParameter

# Maximum number of images per image summary. In the code shown here it is
# only referenced by the disabled (commented-out) summary in decoder().
IMAGE_SUMMARY_MAX_OUTPUTS = 8

# Size of the bottleneck feature vector produced by encoder().
# NOTE: the name is misspelled ("DIMENTION"); it is kept as-is because other
# code refers to it by this name.
LATENT_DIMENTION = 512
# Flags forwarded to every castanea LayerParameter built in this file
# (decoder()'s output layer overrides batch normalization with False).
WEIGHT_NORMALIZE = False
BATCH_NORMALIZE = True
WITH_BIAS = True
# Step size handed to the Adam optimizer in train().
LEARNING_RATE = 1e-4
# Activation functions: convolutional layers, linear layers, and the final
# image-producing layer of the decoder, respectively.
RECTIFIER = tf.nn.relu
LINEAR_RECTIFIER = tf.tanh
IMAGE_RECTIFIER = tf.sigmoid

def encoder(images):
    """Encode a batch of images into latent feature vectors.

    Builds, under the 'encoder' variable scope, one convolution with 32
    output channels followed by four convolutions (64, 128, 256, 512
    channels) that use strides [1, 2, 2, 1], then a linear layer whose
    output is shaped [-1, LATENT_DIMENTION]. Convolutions use RECTIFIER and
    the linear layer uses LINEAR_RECTIFIER as activation.

    Args:
        images: input image tensor.
            # assumes NHWC layout, as implied by the stride spec - TODO confirm

    Returns:
        The output tensor of the final linear layer.
    """
    kernel = 5  # square kernel edge length shared by all convolutions

    # Settings common to both layer parameter objects.
    common = dict(
        with_bias=WITH_BIAS,
        with_weight_normalize=WEIGHT_NORMALIZE,
        with_batch_normalize=BATCH_NORMALIZE,
        var_device='/gpu:0')
    conv_param = LayerParameter(rectifier=RECTIFIER, **common)
    linear_param = LayerParameter(rectifier=LINEAR_RECTIFIER, **common)

    with tf.variable_scope('encoder'):
        # First convolution is called without an explicit strides argument.
        hidden = conv2d(images, kernel, kernel, 32, parameter=conv_param)

        # Remaining convolutions stride by 2 along both spatial axes.
        for channels in (64, 128, 256, 512):
            hidden = conv2d(
                hidden, kernel, kernel, channels,
                strides=[1, 2, 2, 1], parameter=conv_param)

        return linear(
            hidden, [-1, LATENT_DIMENTION], parameter=linear_param)

def decoder(features, core):
    """Decode latent feature vectors back into images.

    Builds, under the 'decoder' variable scope, a linear layer whose output
    is shaped [-1, core, core, 512], four transposed convolutions (256, 128,
    64, 32 channels) with strides [1, 2, 2, 1], and a final convolution with
    3 output channels. The final layer uses IMAGE_RECTIFIER as activation
    and has batch normalization turned off; all other layers follow the
    module-level settings.

    Args:
        features: latent feature tensor, e.g. the output of encoder().
        core: side length of the square feature map produced by the linear
            layer.
            # NOTE(review): the original inline notes (16, 32, 64, 128 after
            # each transposed convolution) suggest core=8 for 128x128
            # output - confirm against the caller.

    Returns:
        The output tensor of the final 3-channel convolution.
    """
    kernel = 5  # square kernel edge length shared by all (de)convolutions

    # Settings common to all three layer parameter objects.
    common = dict(
        with_bias=WITH_BIAS,
        with_weight_normalize=WEIGHT_NORMALIZE,
        var_device='/gpu:0')
    deconv_param = LayerParameter(
        rectifier=RECTIFIER,
        with_batch_normalize=BATCH_NORMALIZE,
        **common)
    linear_param = LayerParameter(
        rectifier=LINEAR_RECTIFIER,
        with_batch_normalize=BATCH_NORMALIZE,
        **common)
    # Output layer: image activation, never batch-normalized.
    image_param = LayerParameter(
        rectifier=IMAGE_RECTIFIER,
        with_batch_normalize=False,
        **common)

    with tf.variable_scope('decoder'):
        hidden = linear(
            features, [-1, core, core, 512], parameter=linear_param)

        for channels in (256, 128, 64, 32):
            hidden = conv2d_transpose(
                hidden, kernel, kernel, channels,
                strides=[1, 2, 2, 1], parameter=deconv_param)

        reconstructed = conv2d(
            hidden, kernel, kernel, 3, parameter=image_param)

        # Image summary of the decoder output is intentionally disabled:
        # tf.summary.image('decoder', reconstructed,
        #     collections=['image', 'encoder'],
        #     max_outputs=IMAGE_SUMMARY_MAX_OUTPUTS)

        return reconstructed

def loss_pixel_squared_difference(real_images, generated_images):
    """Squared-error reconstruction loss with no reduction.

    Also records two scalar summaries: the mean of the loss under
    'loss_pixel_squared_difference', and the mean squared difference scaled
    by 1e+7 under 'loss_index'.

    Args:
        real_images: target image tensor.
        generated_images: reconstructed image tensor.

    Returns:
        The element-wise squared difference tensor (unreduced).
    """
    summary_collections = [tf.GraphKeys.SUMMARIES, 'loss', 'scalar']

    with tf.name_scope('loss'):
        squared_error = tf.squared_difference(real_images, generated_images)

        tf.summary.scalar(
            'loss_pixel_squared_difference',
            tf.reduce_mean(squared_error),
            collections=summary_collections)

        # 'loss_index' is always the scaled mean squared error, so that runs
        # using different losses can be compared on one metric.
        tf.summary.scalar(
            'loss_index',
            tf.reduce_mean(squared_error) * 1e+7,
            collections=summary_collections)

    return squared_error

def loss_sample_squared_difference(real_images, generated_images):
    """Squared-error reconstruction loss averaged along axis 0.

    Also records two scalar summaries: the mean of the returned loss under
    'loss_sample_squared_difference', and the mean squared difference scaled
    by 1e+7 under 'loss_index'.

    NOTE(review): the reduction runs over axis 0. If axis 0 is the batch
    axis (as the NHWC stride spec used elsewhere suggests), this averages
    across samples and keeps one value per pixel position, rather than one
    value per sample as the function name implies. A per-sample mean would
    reduce over the remaining axes instead - confirm the intent before
    changing, since that alters the returned shape.

    Args:
        real_images: target image tensor.
        generated_images: reconstructed image tensor.

    Returns:
        The squared difference tensor reduced by mean over axis 0.
    """
    summary_collections = [tf.GraphKeys.SUMMARIES, 'loss', 'scalar']

    with tf.name_scope('loss'):
        squared_error = tf.squared_difference(real_images, generated_images)
        axis0_mean = tf.reduce_mean(squared_error, axis=[0])

        tf.summary.scalar(
            'loss_sample_squared_difference',
            tf.reduce_mean(axis0_mean),
            collections=summary_collections)

        # 'loss_index' is computed from the unreduced squared error.
        tf.summary.scalar(
            'loss_index',
            tf.reduce_mean(squared_error) * 1e+7,
            collections=summary_collections)

    return axis0_mean

def loss_minibatch_squared_difference(real_images, generated_images):
    """Squared-error reconstruction loss averaged over every element.

    Also records two scalar summaries: the loss itself under
    'loss_minibatch_squared_difference', and the same mean squared
    difference scaled by 1e+7 under 'loss_index'.

    Args:
        real_images: target image tensor.
        generated_images: reconstructed image tensor.

    Returns:
        A scalar tensor: the mean of the element-wise squared difference.
    """
    summary_collections = [tf.GraphKeys.SUMMARIES, 'loss', 'scalar']

    with tf.name_scope('loss'):
        squared_error = tf.squared_difference(real_images, generated_images)
        loss = tf.reduce_mean(squared_error)

        # The loss is already a scalar, so it is logged directly; reducing
        # it a second time would be a no-op.
        tf.summary.scalar(
            'loss_minibatch_squared_difference', loss,
            collections=summary_collections)

        # 'loss_index' is the scaled mean squared error, which for this loss
        # is the loss value itself times 1e+7.
        tf.summary.scalar(
            'loss_index', loss * 1e+7, collections=summary_collections)

        return loss

def loss_pixel_binary_cross_entropy(real_images, generated_images):
    """Binary cross-entropy reconstruction loss with no reduction.

    A constant 0.001 is added inside each logarithm so that the argument
    stays away from zero. Also records two scalar summaries: the mean of the
    loss under 'loss_pixel_binary_cross_entropy', and the mean squared
    difference scaled by 1e+7 under 'loss_index'.

    Args:
        real_images: target image tensor.
            # presumably scaled to [0, 1] - verify against caller
        generated_images: reconstructed image tensor.
            # presumably in [0, 1] (sigmoid output) - verify against caller

    Returns:
        The element-wise binary cross-entropy tensor (unreduced).
    """
    summary_collections = [tf.GraphKeys.SUMMARIES, 'loss', 'scalar']
    epsilon = 0.001  # offset added inside each log

    with tf.name_scope('loss'):
        log_on = tf.log(generated_images + epsilon)
        log_off = tf.log(1.0 - generated_images + epsilon)
        cross_entropy = -(
            real_images * log_on + (1.0 - real_images) * log_off)

        # Squared error is only used for the shared 'loss_index' metric.
        squared_error = tf.squared_difference(real_images, generated_images)

        tf.summary.scalar(
            'loss_pixel_binary_cross_entropy',
            tf.reduce_mean(cross_entropy),
            collections=summary_collections)

        tf.summary.scalar(
            'loss_index',
            tf.reduce_mean(squared_error) * 1e+7,
            collections=summary_collections)

    return cross_entropy

def loss_sample_binary_cross_entropy(real_images, generated_images):
    """Binary cross-entropy reconstruction loss averaged along axis 0.

    A constant 0.001 is added inside each logarithm so that the argument
    stays away from zero. Also records two scalar summaries: the mean of the
    returned loss under 'loss_sample_binary_cross_entropy', and the mean
    squared difference scaled by 1e+7 under 'loss_index'.

    NOTE(review): the reduction runs over axis 0. If axis 0 is the batch
    axis (as the NHWC stride spec used elsewhere suggests), this averages
    across samples and keeps one value per pixel position, rather than one
    value per sample as the function name implies. A per-sample mean would
    reduce over the remaining axes instead - confirm the intent before
    changing, since that alters the returned shape.

    Args:
        real_images: target image tensor.
            # presumably scaled to [0, 1] - verify against caller
        generated_images: reconstructed image tensor.
            # presumably in [0, 1] (sigmoid output) - verify against caller

    Returns:
        The binary cross-entropy tensor reduced by mean over axis 0.
    """
    summary_collections = [tf.GraphKeys.SUMMARIES, 'loss', 'scalar']
    epsilon = 0.001  # offset added inside each log

    with tf.name_scope('loss'):
        log_on = tf.log(generated_images + epsilon)
        log_off = tf.log(1.0 - generated_images + epsilon)
        cross_entropy = -(
            real_images * log_on + (1.0 - real_images) * log_off)

        # Squared error is only used for the shared 'loss_index' metric.
        squared_error = tf.squared_difference(real_images, generated_images)
        axis0_mean = tf.reduce_mean(cross_entropy, axis=[0])

        tf.summary.scalar(
            'loss_sample_binary_cross_entropy',
            tf.reduce_mean(axis0_mean),
            collections=summary_collections)

        tf.summary.scalar(
            'loss_index',
            tf.reduce_mean(squared_error) * 1e+7,
            collections=summary_collections)

    return axis0_mean

def loss_minibatch_binary_cross_entropy(real_images, generated_images):
    """Binary cross-entropy reconstruction loss averaged over every element.

    A constant 0.001 is added inside each logarithm so that the argument
    stays away from zero. Also records two scalar summaries: the loss itself
    under 'loss_minibatch_binary_cross_entropy', and the mean squared
    difference scaled by 1e+7 under 'loss_index'.

    Args:
        real_images: target image tensor.
            # presumably scaled to [0, 1] - verify against caller
        generated_images: reconstructed image tensor.
            # presumably in [0, 1] (sigmoid output) - verify against caller

    Returns:
        A scalar tensor: the mean of the element-wise binary cross-entropy.
    """
    summary_collections = [tf.GraphKeys.SUMMARIES, 'loss', 'scalar']
    epsilon = 0.001  # offset added inside each log

    with tf.name_scope('loss'):
        cross_entropy = -(
            real_images * tf.log(generated_images + epsilon) +
            (1.0 - real_images) * tf.log(1.0 - generated_images + epsilon))
        loss = tf.reduce_mean(cross_entropy)

        # The loss is already a scalar, so it is logged directly; reducing
        # it a second time would be a no-op.
        tf.summary.scalar(
            'loss_minibatch_binary_cross_entropy', loss,
            collections=summary_collections)

        # 'loss_index' is always the scaled mean squared error, so that runs
        # using different losses can be compared on one metric.
        squared_error = tf.squared_difference(real_images, generated_images)
        tf.summary.scalar(
            'loss_index',
            tf.reduce_mean(squared_error) * 1e+7,
            collections=summary_collections)

        return loss

def train(loss, global_step):
    """Create the Adam training op for the given loss.

    Gradients are computed and applied in two explicit steps under the
    'train' name scope, using LEARNING_RATE as the optimizer's learning
    rate.

    Args:
        loss: loss tensor to minimize (any of the loss_* functions' output).
        global_step: variable passed to apply_gradients as the step counter.

    Returns:
        The op returned by the optimizer's apply_gradients call.
    """
    with tf.name_scope('train'):
        optimizer = tf.train.AdamOptimizer(LEARNING_RATE)
        grads_and_vars = optimizer.compute_gradients(loss)
        train_op = optimizer.apply_gradients(
            grads_and_vars, global_step=global_step)
        return train_op