More than 3 years have passed since last update.

ViTで櫻坂メンバーの顔分類をやってみた

Posted at 2021-10-28

最近流行りのTransformerを試してみたいと思ったので、keras版のViTパッケージを使って櫻坂メンバーの顔分類をやってみた記録です。

環境

GCP Jupyter lab GPU T4
python: 3.7
tensorflow: 2.6.0

画像データ

コチラで作成した櫻坂メンバーの顔画像データセットを使います。128×128で11331枚のデータセットです。クラス数は25クラスです。

必要なものをインポート

import numpy as np
import cv2
import os
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model, load_model, Sequential
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.utils import to_categorical

from vit_keras import vit

前処理、水増し

以前よりもData Augmentationの知識はついている（と思いたい）ので色々実施しました。

・左右反転、90,180,270度回転
・blend
・RGB-permute
・cutblur
・cutout

左右反転、90,180,270度回転はその名の通りで、合計7種類の水増し画像ができます。
blendは元画像にランダムな色を加えるものです。
RGB-permuteはチャネルの順番をRGB以外の順番にランダムに変えます。
cutblurは主に超解像タスクで用いられている水増し手法で、画像の一部分を低解像度のものに置き換えるものです。
cutoutは画像の一部分の情報を落とすものです。

cutmixやmixupも試してみましたが、訓練データが難しすぎて精度は上がらなかったです。
そもそも顔画像なので、誰かと誰かを混ぜるのはモデルにとって異常データになりそう。画像データの特徴に合わせて効果のある水増し手法を選択することが大事ですね。

これらの水増しを行う自作データローダーをTensorFlowのImageDataGeneratorクラスを拡張して、作成します。

class MyGenerator(ImageDataGenerator):
    """ImageDataGenerator whose ``flow`` applies extra per-sample augmentations.

    Each flag enables one augmentation; ``flow`` picks, per sample, one of the
    enabled augmentations (or "do nothing") uniformly at random.

    Args:
        basic_aug: Apply a random left-right flip / 90-degree-multiple rotation.
        blend: Blend the image with a random solid colour.
        rgb_permute: Reorder the colour channels.
        cutblur: Swap a region between the image and a low-resolution copy.
        cutout: Zero out individual pixels at random.
        cutmix: Paste a region of another sample of the batch and mix labels.
        mixup: Linearly mix the image and label with another sample.
        *args, **kwargs: Forwarded unchanged to ``ImageDataGenerator``.
    """

    # (flip left-right first?, number of counter-clockwise quarter turns),
    # indexed by the random mode drawn in ``img_basic_aug``. Mode 7 is identity.
    _BASIC_AUG_MODES = (
        (True, 0),
        (False, 2),
        (True, 2),
        (False, 1),
        (False, 3),
        (True, 1),
        (True, 3),
        (False, 0),
    )

    # Every channel order except the identity (0, 1, 2).
    _RGB_PERMUTATIONS = (
        (0, 2, 1),
        (2, 0, 1),
        (2, 1, 0),
        (1, 0, 2),
        (1, 2, 0),
    )

    def __init__(self,
                 basic_aug=False,
                 blend=False,
                 rgb_permute=False,
                 cutblur=False,
                 cutout=False,
                 cutmix=False,
                 mixup=False,
                 *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.basic_aug = basic_aug
        self.blend = blend
        self.rgb_permute = rgb_permute
        self.cutblur = cutblur
        self.cutout = cutout
        self.cutmix = cutmix
        self.mixup = mixup

    # Left-right flip and 90/180/270-degree rotations
    def img_basic_aug(self, img):
        """Return ``img`` under one of 8 flip/rotation variants chosen at random.

        Uses NumPy instead of per-image eager TensorFlow ops; ``np.fliplr`` and
        ``np.rot90`` (counter-clockwise, axes 0/1) match
        ``tf.image.flip_left_right`` and ``tf.image.rot90``.
        Odd quarter turns swap height and width, so ``flow`` can only write the
        result back in place for square images.
        """
        flip, quarter_turns = self._BASIC_AUG_MODES[np.random.randint(0, 8)]
        out = np.fliplr(img) if flip else img
        if quarter_turns:
            out = np.rot90(out, k=quarter_turns)
        # fliplr/rot90 return views; materialise them so that writing the
        # result back into the source batch never aliases its own input.
        return np.ascontiguousarray(out)

    # blend
    def img_blend(self, img):
        """Blend ``img`` with a random solid colour (image weight in [0.6, 1)).

        The colour is drawn in the value range of the image: [0, 1] when the
        image maximum is <= 1 (e.g. already divided by 255), otherwise
        [0, 255]. Drawing in [0, 255] unconditionally would swamp a normalised
        image.
        """
        value_max = 255.0 if img.size and np.max(img) > 1.0 else 1.0
        color = (np.random.random(3) * value_max).astype(np.float32)

        v = np.random.uniform(0.6, 1)
        # ``color`` has shape (3,) and broadcasts over height and width.
        return v * img + (1 - v) * color

    # RGB-permute
    def img_rgb_permute(self, img):
        """Return the first three channels of ``img`` in a random non-RGB order."""
        order = self._RGB_PERMUTATIONS[np.random.randint(0, 5)]
        return img[:, :, list(order)]

    # cutblur
    def img_cutblur(self, img):
        """Swap a random region between ``img`` and a 4x-downsampled copy.

        With probability 0.5 the result is the blurred image with a sharp
        region, otherwise the sharp image with a blurred region. The region
        covers about 70% of each side.
        """
        # Clip so the region can never exceed the image, whatever is drawn.
        cut_ratio = float(np.clip(np.random.randn() * 0.01 + 0.7, 0.0, 1.0))

        h, w = img.shape[0], img.shape[1]
        # Built-in int: the ``np.int`` alias was removed in NumPy 1.24.
        ch, cw = int(h * cut_ratio), int(w * cut_ratio)
        cy = np.random.randint(0, h - ch + 1)
        cx = np.random.randint(0, w - cw + 1)

        # Down- then up-sample to obtain a low-resolution version of the image.
        low_res = cv2.resize(img, (w // 4, h // 4), interpolation=cv2.INTER_CUBIC)
        low_res = cv2.resize(low_res, (w, h), interpolation=cv2.INTER_CUBIC)

        if np.random.random() > 0.5:
            # ``low_res`` is a fresh array from cv2.resize, so editing it in
            # place does not touch the caller's image.
            out = low_res
            out[cy:cy + ch, cx:cx + cw] = img[cy:cy + ch, cx:cx + cw]
        else:
            out = img.copy()
            out[cy:cy + ch, cx:cx + cw] = low_res[cy:cy + ch, cx:cx + cw]

        return out

    # cutout
    def img_cutout(self, img):
        """Zero out each pixel (all channels together) with probability 0.01."""
        height, width, _ = img.shape
        drop_prob = 0.01
        # One mask value per pixel, broadcast across the channel axis.
        keep_mask = np.random.choice(
            [0, 1], size=(height, width, 1), p=[drop_prob, 1 - drop_prob])

        return img * keep_mask

    # cutmix
    def img_cutmix(self, img, label, blend_img, blend_label):
        """Paste a random box of ``blend_img`` into a copy of ``img``.

        Returns:
            ``(mixed_img, mixed_label)``. The label weights equal the fraction
            of pixels each source contributes after the box is clipped to the
            image. ``img`` itself is left unmodified.
        """
        height, width, _ = img.shape
        ry = np.random.randint(0, height)
        rx = np.random.randint(0, width)
        side_ratio = np.random.normal(0.7, 0.01)
        rw = side_ratio * width
        rh = side_ratio * height
        x1 = int(np.clip(rx - rw / 2, 0, width))
        x2 = int(np.clip(rx + rw / 2, 0, width))
        y1 = int(np.clip(ry - rh / 2, 0, height))
        y2 = int(np.clip(ry + rh / 2, 0, height))

        mixed_img = img.copy()
        mixed_img[y1:y2, x1:x2] = blend_img[y1:y2, x1:x2]

        # Weight labels by the area actually pasted, not by the side ratio.
        pasted = (x2 - x1) * (y2 - y1) / float(width * height)
        mix_label = label * (1 - pasted) + blend_label * pasted

        return mixed_img, mix_label

    # mixup
    def img_mixup(self, img, label, blend_img, blend_label):
        """Mix image and label with another sample using a Beta(1.2, 1.2) weight."""
        v = np.random.beta(1.2, 1.2)
        mix_img = v * img + (1 - v) * blend_img
        mix_label = v * label + (1 - v) * blend_label

        return mix_img, mix_label

    def _enabled_ops(self):
        """Return the ids of the selectable second-stage ops (0 means no-op)."""
        flags = (self.blend, self.rgb_permute, self.cutblur,
                 self.cutout, self.cutmix, self.mixup)
        return [0] + [op for op, on in enumerate(flags, start=1) if on]

    # Custom flow
    def flow(self, *args, **kwargs):
        """Yield augmented ``(X, y)`` batches forever.

        Arguments are forwarded to ``ImageDataGenerator.flow``; labels must be
        supplied because every batch is unpacked as ``(X, y)``.

        Per batch: optionally apply ``img_basic_aug`` to every sample, then for
        each sample apply one op drawn uniformly from the enabled ops plus
        "do nothing".
        """
        batches = super().flow(*args, **kwargs)

        while True:
            X, y = next(batches)
            batch_size = X.shape[0]
            if self.cutmix or self.mixup:
                # Untouched copies used as mixing partners for cutmix / mixup.
                copy_X, copy_y = X.copy(), y.copy()
            if self.basic_aug:
                for i in range(batch_size):
                    X[i] = self.img_basic_aug(X[i])

            second_list = self._enabled_ops()
            for i in range(batch_size):
                n = np.random.choice(second_list)
                if n == 1:
                    X[i] = self.img_blend(X[i])
                elif n == 2:
                    X[i] = self.img_rgb_permute(X[i])
                elif n == 3:
                    X[i] = self.img_cutblur(X[i])
                elif n == 4:
                    X[i] = self.img_cutout(X[i])
                elif n in (5, 6):
                    if batch_size < 2:
                        # No other sample to mix with (e.g. a final batch of
                        # one); retrying for j != i would never terminate.
                        continue
                    # Uniform draw over every index except i, without retrying.
                    j = np.random.randint(0, batch_size - 1)
                    if j >= i:
                        j += 1
                    mix = self.img_cutmix if n == 5 else self.img_mixup
                    X[i], y[i] = mix(X[i], y[i], copy_X[j], copy_y[j])
            yield (X, y)

flow関数の流れとしては

ミニバッチを取り出す
左右反転と90・180・270度回転を組み合わせた7種類に「何もしない」を加えた計8種類の中から、ランダムに1つを適用する
その後、Trueに指定した水増し手法と「何もしない」の中から、ランダムに1つを選んで適用する

またcutoutに関してはよくある一部分を正方形で黒く隠すものではなく、ピクセルごとにランダムで黒くする手法をとってます。このcutout手法は超解像タスクでよく使われるようです。

前処理は、これらの水増しと、画素値を255で割る正規化の2つです。

モデル構築

今回は簡単にViTモデルを構築できるvit_kerasを使います。
pipで簡単にインストールできます。

!pip install vit_keras

ViTは規模によってたくさんの種類がありますが今回はB16を使用しました。
今回はファインチューニングしたいので、最後に全結合層を加える形でモデルを構築してます。
optimizerはRAdamを使いました。

# Side length passed to the ViT backbone as image_size (the dataset images are 128x128).
image_size = 128
# Width of the final softmax layer (the dataset has 25 classes).
num_classes = 25

def buildModel():
    """Build and compile a ViT-B/16 classifier for fine-tuning.

    The vit_keras B16 model is created with pretrained weights and without its
    classification top, and a softmax Dense layer of ``num_classes`` units is
    stacked on it. The model is compiled with RectifiedAdam (learning rate
    1e-4), categorical cross-entropy and an accuracy metric.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    # NOTE(review): ``activation`` presumably only affects the built-in top,
    # which is disabled by include_top=False - confirm against vit_keras.
    backbone = vit.vit_b16(
        image_size=image_size,
        activation='softmax',
        pretrained=True,
        include_top=False,
        pretrained_top=False,
    )
    classifier_head = Dense(num_classes, activation='softmax')

    model = Sequential([backbone, classifier_head])

    optimizer = tfa.optimizers.RectifiedAdam(learning_rate=1e-4)
    model.compile(
        optimizer=optimizer,
        loss="categorical_crossentropy",
        metrics=["accuracy"],
    )
    return model

model = buildModel()

学習

まずデータを訓練用と評価用に分けます

# Load the face dataset archive; images are stored under 'x', labels under 'y'.
data = np.load('sakurazaka_face.npz')
Images = data['x']
labels = data['y']

# Normalize pixel values (assumes 0-255 input - confirm) and one-hot encode the labels
X = Images.astype('float32') / 255
y = to_categorical(labels)

# Hold out 20% for validation, stratified on the labels, with a fixed random_state.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, stratify=y, random_state=46)

X_train.shape, y_train.shape, X_valid.shape, y_valid.shape
# ((9064, 128, 128, 3), (9064, 25), (2267, 128, 128, 3), (2267, 25))

自作ジェネレータを作成します。訓練用データだけ水増しをします。前述の通り、cutmixとmixup以外の水増しを行います。

# The augmenting generator class defined in this article is MyGenerator;
# the previous name MyPatchGenerator is not defined and raised NameError.
# Training data: every augmentation except cutmix and mixup.
train_generator = MyGenerator(basic_aug=True, blend=True, rgb_permute=True, cutblur=True,
                              cutout=True, cutmix=False, mixup=False)
# Validation data: all augmentation flags left at their False defaults.
val_generator = MyGenerator()

BATCH_SIZE = 32
train_data_gen = train_generator.flow(X_train, y_train, batch_size=BATCH_SIZE, shuffle=True, seed=46)
val_data_gen = val_generator.flow(X_valid, y_valid, batch_size=BATCH_SIZE, shuffle=False)

乱数固定しておきます

def set_seed(seed=46):
    """Request deterministic TensorFlow ops and seed NumPy and TensorFlow.

    Args:
        seed: Value handed to both ``np.random.seed`` and ``tf.random.set_seed``.
    """
    os.environ['TF_DETERMINISTIC_OPS'] = '1'
    for seed_rng in (np.random.seed, tf.random.set_seed):
        seed_rng(seed)

# NOTE(review): in this article the call runs after the model and the data
# generators have been created, so anything they already drew from the RNGs
# is not covered by this seed - consider calling it first.
set_seed()

各種コールバックを定義して学習スタートです。

# Stop when validation accuracy has not improved for 5 consecutive epochs.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', verbose=1,
                                                  patience=5, mode='auto')

# Save weights only, and only on epochs where validation accuracy improves.
# save_freq='epoch' replaces the deprecated period=1 argument.
file_path='weights.{epoch:02d}-val_loss.{val_loss:.2f}.h5'
check_point = tf.keras.callbacks.ModelCheckpoint(file_path, monitor='val_accuracy', save_best_only=True,
                                                 save_weights_only=True, save_freq='epoch', mode='auto')

# Halve the learning rate after 2 epochs without improvement, down to 1e-6.
lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_accuracy', factor=0.5, patience=2,
                                                    verbose=0, mode='auto', min_lr=1e-6)

# Large upper bound; early stopping decides when training actually ends.
epochs = 1000

# Round up so the final partial batch is evaluated too. With a floored count
# the last samples are skipped and, since the validation generator is
# presumably not rewound between epochs, each epoch would score a slightly
# different window of the data.
validation_steps = (X_valid.shape[0] + BATCH_SIZE - 1) // BATCH_SIZE

# fit's shuffle argument is ignored for generator input, so it is not passed;
# the training generator already shuffles via flow(shuffle=True).
history = model.fit(train_data_gen,
                    epochs=epochs,
                    steps_per_epoch=X_train.shape[0]//BATCH_SIZE,
                    validation_data=val_data_gen,
                    validation_steps=validation_steps,
                    callbacks=[early_stopping, check_point, lr_scheduler])