#はじめに
本記事ではLight CNN(LCNN)という深層学習モデルを実装したのでまとめさせていただきました。
最初にLCNNとその特徴であるMax Feature Mapping (MFM)という技術について説明した後、実装と評価をしていきます。
コードは全てPython、LCNNの実装はTensorFlowとKerasを使って行います。
なお、LCNNを実装するためのコードはGitHubに載せているので参考にしてください。
Github URL : https://github.com/ozora-ogino/LCNN
#Light CNN
LCNNは2015年に提案され、現在はSTCという機関により研究されている深層学習手法で、画像分類や音声分類等の分野で使用されています。
LCNNは9層の畳み込み層から構成されており、各層における活性化関数でMax Feature Mappingと呼ばれるものを使っているのが大きな特徴となっています。
#Max Feature Mapping
MFMについてはこちらに詳しくまとめたので参考にしてみてください。
元の論文も載せておきます。
["A Light CNN for Deep Face Representation with Noisy Labels"](https://arxiv.org/pdf/1511.02683.pdf)
#実装
GitHubに載せているものと同じコードになります。
import tensorflow as tf
from keras.layers import Activation, Dense, BatchNormalization, MaxPool2D, Lambda, Input, Flatten, Dropout
from keras.layers.convolutional import Conv2D
from keras.models import Model
from keras.initializers import he_normal
#Custom layer
from .layers import Maxout
#function that returns the stack of Conv2D and MFM
def MaxOutConv2D(x, dim, kernel_size, strides, padding='same'):
    """Apply a Conv2D layer followed by a Maxout (MFM) layer.

    Arguments:
        x :
            Input tensor fed to the convolution.
        dim (int) :
            Number of Conv2D filters. The Maxout layer is constructed
            with ``int(dim / 2)``.
        kernel_size :
            Forwarded to Conv2D.
        strides :
            Forwarded to Conv2D.
        padding (str) :
            Forwarded to Conv2D. Defaults to 'same'.

    Returns:
        The output of the Maxout layer applied to the convolution output.
    """
    features = Conv2D(
        dim,
        kernel_size=kernel_size,
        strides=strides,
        padding=padding,
    )(x)
    # The Maxout layer is created only after the convolution has been applied,
    # in the same order as the layers are stacked.
    max_feature_map = Maxout(int(dim / 2))
    return max_feature_map(features)
#function that returns the stack of FC and MFM
def MaxOutDense(x, dim):
    """Apply a Dense (fully connected) layer followed by a Maxout (MFM) layer.

    Arguments:
        x :
            Input tensor fed to the Dense layer.
        dim (int) :
            Number of Dense units. The Maxout layer is constructed
            with ``int(dim / 2)``.

    Returns:
        The output of the Maxout layer applied to the Dense output.
    """
    projected = Dense(dim)(x)
    # The Maxout layer is created only after the Dense layer has been applied.
    max_feature_map = Maxout(int(dim / 2))
    return max_feature_map(projected)
# this function helps to build LCNN.
def build_lcnn(shape, n_label=2):
    """Build the LCNN as a Keras functional ``Model``.

    The network stacks nine ``MaxOutConv2D`` blocks (interleaved with max
    pooling and batch normalization), followed by one ``MaxOutDense`` block
    and a softmax classifier.

    Arguments:
        shape (list) :
            Input shape for LCNN. (Example : [128, 128, 1])
        n_label (int) :
            Number of labels that LCNN should predict.

    Returns:
        A ``Model`` whose input has shape ``shape`` and whose output is a
        softmax over ``n_label`` classes. The model is not compiled.
    """
    # Named ``inputs`` (not ``input``) so the Python builtin is not shadowed.
    inputs = Input(shape=shape)

    # Block 1: 5x5 convolution, then 2x2 max pooling.
    x = MaxOutConv2D(inputs, 64, kernel_size=5, strides=1, padding='same')
    x = MaxPool2D(pool_size=(2, 2), strides=(2, 2))(x)

    # Block 2: 1x1 convolution.
    x = MaxOutConv2D(x, 64, kernel_size=1, strides=1, padding='same')
    x = BatchNormalization()(x)

    # Block 3: 3x3 convolution, then 2x2 max pooling.
    x = MaxOutConv2D(x, 96, kernel_size=3, strides=1, padding='same')
    x = MaxPool2D(pool_size=(2, 2), strides=(2, 2))(x)
    x = BatchNormalization()(x)

    # Block 4: 1x1 convolution.
    x = MaxOutConv2D(x, 96, kernel_size=1, strides=1, padding='same')
    x = BatchNormalization()(x)

    # Block 5: 3x3 convolution, then 2x2 max pooling (no batch norm here).
    x = MaxOutConv2D(x, 128, kernel_size=3, strides=1, padding='same')
    x = MaxPool2D(pool_size=(2, 2), strides=(2, 2))(x)

    # Block 6: 1x1 convolution.
    x = MaxOutConv2D(x, 128, kernel_size=1, strides=1, padding='same')
    x = BatchNormalization()(x)

    # Block 7: 3x3 convolution.
    x = MaxOutConv2D(x, 64, kernel_size=3, strides=1, padding='same')
    x = BatchNormalization()(x)

    # Block 8: 1x1 convolution.
    x = MaxOutConv2D(x, 64, kernel_size=1, strides=1, padding='same')
    x = BatchNormalization()(x)

    # Block 9: 3x3 convolution, then the final 2x2 max pooling.
    x = MaxOutConv2D(x, 64, kernel_size=3, strides=1, padding='same')
    x = MaxPool2D(pool_size=(2, 2), strides=(2, 2))(x)

    # Classifier head: flatten, FC + MFM, batch norm, dropout, softmax.
    x = Flatten()(x)
    x = MaxOutDense(x, 160)
    x = BatchNormalization()(x)
    x = Dropout(0.75)(x)
    outputs = Dense(n_label, activation='softmax')(x)

    return Model(inputs=inputs, outputs=outputs)
#評価
実装したLCNNは音声認識コンペで使用されたモデルになりますが、簡単な画像認識でどの程度の性能が出るのかmnistとCIFAR10で試してみました。
モデルチューニング等は一切していませんがmnistでは99%、CIFAR10では75%程度の性能を示すことができました。
mnist
import numpy as np
from keras.callbacks import EarlyStopping
from keras.utils import to_categorical
from keras.datasets import mnist
# Adam is used by lcnn.compile() below; without this import the script
# fails with NameError.
from keras.optimizers import Adam

# Training hyperparameters.
lr = 0.001
epochs = 10
batch_size = 256

(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Divide pixel values by 255 and append a trailing channel axis of size 1 so
# the images match the (height, width, channels) input that build_lcnn gets.
x_train = x_train / 255
x_train = x_train.reshape((x_train.shape[0], x_train.shape[1], x_train.shape[2], 1))
y_train = to_categorical(y_train)

# Per-sample input shape (everything except the leading sample axis).
input_shape = x_train.shape[1:]

lcnn = build_lcnn(input_shape, n_label=10)
lcnn.compile(optimizer=Adam(learning_rate=lr), loss='categorical_crossentropy', metrics=['accuracy'])

# Stop training once val_loss has not improved for 3 consecutive epochs.
es = EarlyStopping(monitor='val_loss', patience=3, verbose=1)
history = lcnn.fit(
    x_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,
    callbacks=[es],
)

# Apply the same preprocessing to the test split before evaluating.
x_test = x_test / 255
x_test = x_test.reshape((x_test.shape[0], x_test.shape[1], x_test.shape[2], 1))
y_test = to_categorical(y_test)

loss, acc = lcnn.evaluate(x_test, y_test)
print(f'Accuracy : {acc*100}')  # Result --> Accuracy : 99.90999794006348
print(f'Loss : {loss}')  # Result --> Loss : 0.04250425341885457
CIFAR10
import numpy as np
from keras.callbacks import EarlyStopping
from keras.utils import to_categorical
from keras.datasets import cifar10
# Adam is used by lcnn.compile() below; without this import the script
# fails with NameError.
from keras.optimizers import Adam

# Training hyperparameters.
lr = 0.001
epochs = 100
batch_size = 64

(x_train, y_train), (x_test, y_test) = cifar10.load_data()

# Divide pixel values by 255; no reshape is applied here, the loaded arrays
# are passed to the model as they are.
x_train = x_train / 255
y_train = to_categorical(y_train)

# Per-sample input shape (everything except the leading sample axis).
input_shape = x_train.shape[1:]

lcnn = build_lcnn(input_shape, n_label=10)
lcnn.compile(optimizer=Adam(learning_rate=lr), loss='categorical_crossentropy', metrics=['accuracy'])

# Stop training once val_loss has not improved for 5 consecutive epochs.
es = EarlyStopping(monitor='val_loss', patience=5, verbose=1)
history = lcnn.fit(
    x_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,
    callbacks=[es],
)

# Apply the same preprocessing to the test split before evaluating.
x_test = x_test / 255
y_test = to_categorical(y_test)

loss, acc = lcnn.evaluate(x_test, y_test)
print(f'Accuracy : {acc*100}')  # Result --> Accuracy : 75.1200020313263
print(f'Loss : {loss}')  # Result --> Loss : 1.2616282165050507
#まとめ
LCNNという深層学習モデルを実装したのでまとめさせていただきました。
参考になれば幸いです。
Github URL : https://github.com/ozora-ogino/LCNN
#Reference
["A Light CNN for Deep Face Representation with Noisy Labels"](https://arxiv.org/pdf/1511.02683.pdf)
"STC Antispoofing Systems for the ASVspoof2019 Challenge"
"Audio replay attack detection with deep learning frameworks"