TensorFlow Eager Execution と Focal Loss について #TensorFlow

目的

TensorFlow の eager execution に慣れる。
不均衡データ (imbalanced data) に対し、focal loss を試す。

参照

コード (Jupyter Notebook)

ライブラリー

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from tqdm import tqdm

from sklearn.model_selection  import train_test_split
from sklearn import metrics

import tensorflow as tf
import tensorflow.contrib.eager as tfe
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

np.random.seed(0)

Eager execution（TF 1.x の場合）

# Switch TensorFlow (1.x API) into eager mode.
tf.enable_eager_execution()

# Report the installed version and confirm that eager mode is active.
for template, value in (
    ("TensorFlow version: {}", tf.VERSION),
    ("Eager execution: {}", tf.executing_eagerly()),
):
    print (template.format(value))

データ

Fraud detection dataset on Kaggle

# Load the fraud-detection CSV.
# NOTE(review): `path` and `file` are not defined in this snippet; they must
# be set to the dataset location before this cell runs.
df_data = pd.read_csv(os.path.join(path, file))

# Remove the three columns that are not fed to the models.
df_data = df_data.drop(['nameDest', 'nameOrig', 'type'], axis=1)

print ('shape: ', df_data.shape)
df_data.head()

# Class balance: raw counts, then the fraction of rows labelled 0.
print (df_data['isFraud'].value_counts())
print ()
print ('% of 0: ', (df_data['isFraud'] == 0).sum() / df_data.shape[0])

# Bar chart of the label distribution.
plt.figure(figsize=(5, 3))
sns.countplot(df_data['isFraud'])
plt.show()

def feature_normalize(dataset):
    """Standardize each column of ``dataset`` to zero mean and unit variance.

    Args:
        dataset: array-like whose first axis indexes samples; statistics are
            taken along axis 0.

    Returns:
        ``(dataset - mean) / std`` computed column-wise. A column with zero
        standard deviation (a constant column) is mapped to all zeros
        instead of NaN/inf.
    """
    mu = np.mean(dataset, axis=0)
    sigma = np.std(dataset, axis=0)
    # Where sigma is exactly 0 the numerator is 0 as well; adding the boolean
    # mask turns those divisors into 1 so the result is 0 rather than 0/0.
    sigma = sigma + (sigma == 0)
    return (dataset - mu) / sigma

# All columns except the last two are features; the second-to-last column is
# the label (presumably `isFraud` -- confirm against the CSV column order).
X = df_data.iloc[:, :-2]
y = tf.keras.utils.to_categorical(df_data.iloc[:, -2], num_classes=2)

# NOTE(review): the normalization statistics are computed over the whole
# dataset, i.e. before the train/test split below.
X = feature_normalize(X.values)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1)

# float32 tensor copies of the four splits, used by the hand-written loops.
X_train_tf, y_train_tf, X_test_tf, y_test_tf = (
    tf.convert_to_tensor(split, dtype=tf.float32)
    for split in (X_train, y_train, X_test, y_test)
)

Model の書き方の例

同一のアーキテクチャを３パターンのやり方で実装。

パターン１

class Pattern1():
    """Fully connected classifier written with raw variables and matmuls.

    Layer widths are input -> 10 -> 20 -> 10 -> output, with ReLU after the
    first three layers and softmax on the last one. The input and output
    sizes are read from the module-level ``X_train`` / ``y_train`` arrays
    when the object is constructed.

    Attributes:
        fc1_w .. fc4_w: weight matrices of layers 1-4.
        fc1_b .. fc4_b: bias vectors of layers 1-4.
        variables: list of all eight variables in layer order (weight, bias).
    """

    def __init__(self):
        input_size = X_train.shape[1]
        hidden_size1 = 10
        hidden_size2 = 20
        hidden_size3 = 10
        output_size = y_train.shape[1]

        def weight(n_in, n_out):
            # Truncated-normal initial weights, stddev 0.1.
            return tfe.Variable(
                tf.truncated_normal([n_in, n_out], stddev=0.1),
                dtype=tf.float32)

        def bias(n_out):
            # Constant 0.1 initial bias.
            return tfe.Variable(tf.constant(0.1, shape=[n_out]),
                                dtype=tf.float32)

        self.fc1_w = weight(input_size, hidden_size1)
        self.fc1_b = bias(hidden_size1)
        self.fc2_w = weight(hidden_size1, hidden_size2)
        self.fc2_b = bias(hidden_size2)
        self.fc3_w = weight(hidden_size2, hidden_size3)
        self.fc3_b = bias(hidden_size3)
        self.fc4_w = weight(hidden_size3, output_size)
        self.fc4_b = bias(output_size)
        self.variables = [
            self.fc1_w, self.fc1_b,
            self.fc2_w, self.fc2_b,
            self.fc3_w, self.fc3_b,
            self.fc4_w, self.fc4_b,
        ]

    def __call__(self, x):
        """Run the forward pass and return class probabilities for ``x``."""
        h = tf.nn.relu(tf.matmul(x, self.fc1_w) + self.fc1_b)
        h = tf.nn.relu(tf.matmul(h, self.fc2_w) + self.fc2_b)
        h = tf.nn.relu(tf.matmul(h, self.fc3_w) + self.fc3_b)
        logits = tf.matmul(h, self.fc4_w) + self.fc4_b
        # Pattern2 and Pattern3 end in a softmax layer and the loss functions
        # in this file take the log of the model output, so this model must
        # return probabilities rather than raw logits.
        return tf.nn.softmax(logits)

パターン２

class Pattern2(tf.keras.Model):
    """Subclassed Keras model: Dense 10 -> 20 -> 10 (ReLU) -> softmax output.

    The input shape and the number of output units are read from the
    module-level ``X_train`` / ``y_train`` arrays at construction time.
    """

    def __init__(self):
        super(Pattern2, self).__init__()

        input_shape = X_train.shape[1:]
        hidden_size1 = 10
        hidden_size2 = 20
        hidden_size3 = 10
        output_size = y_train.shape[1]

        self.dense1 = tf.keras.layers.Dense(hidden_size1, activation='relu',
                                            input_shape=input_shape)
        self.dense2 = tf.keras.layers.Dense(hidden_size2, activation='relu')
        self.dense3 = tf.keras.layers.Dense(hidden_size3, activation='relu')
        self.dense4 = tf.keras.layers.Dense(output_size, activation='softmax')

    def call(self, x):
        """Forward pass; returns the softmax output of the last layer.

        Defined as ``call`` (not ``__call__``) so that ``model(x)`` goes
        through the ``__call__`` wrapper provided by ``tf.keras.Model``
        instead of replacing it.
        """
        h = self.dense1(x)
        h = self.dense2(h)
        h = self.dense3(h)
        h = self.dense4(h)
        return h

パターン３

# Layer sizes, shared with the other two patterns.
input_shape = X_train.shape[1:]
hidden_size1, hidden_size2, hidden_size3 = 10, 20, 10
output_size = y_train.shape[1]

# Sequential stack: input layer, three ReLU Dense layers, softmax output.
# Note that Pattern3 is a model *instance*, not a class.
Pattern3 = tf.keras.Sequential(
    [tf.keras.layers.InputLayer(input_shape=input_shape)]
    + [tf.keras.layers.Dense(units, activation='relu')
       for units in (hidden_size1, hidden_size2, hidden_size3)]
    + [tf.keras.layers.Dense(output_size, activation='softmax')]
)

Training のやり方の例

パターン４と５が focal loss を使用。

パターン１

# Keras compile/fit workflow with plain categorical cross-entropy.
model = Pattern3

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Both callbacks watch the validation loss.
# NOTE(review): patience (5) exceeds the number of epochs (3) configured
# below, so presumably neither callback triggers in this run.
early_stopping = EarlyStopping(monitor='val_loss', mode='min',
                               patience=5, verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', mode='min',
                              factor=0.1, patience=5, min_lr=0.00001,
                              verbose=1)

epochs, batch_size = 3, 1000

# The held-out test split doubles as the validation set.
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=epochs,
    batch_size=batch_size,
    verbose=2,
    callbacks=[early_stopping, reduce_lr],
    validation_data=(X_test, y_test),
)

# One figure for the loss curves, then one for the accuracy curves; each
# shows the training and validation series recorded in `history`.
for title, train_key, val_key, train_label, val_label in (
    ('Loss', "loss", "val_loss", "Train loss", "Validation loss"),
    ('Accuracy', "acc", "val_acc", "Train accuracy", "Validation accuracy"),
):
    plt.figure(figsize=(5, 3))
    plt.plot(history.epoch, history.history[train_key], label=train_label)
    plt.plot(history.epoch, history.history[val_key], label=val_label)
    plt.title(title)
    plt.xlabel('epoch')
    plt.legend(loc='best')
    plt.show()

# Class probabilities for the test split.
preds = model.predict(X_test, batch_size=1000)

LABELS = ['Normal','Fraud']

# Turn one-hot labels and predicted probabilities into class indices.
truth = y_test.argmax(axis=1)
predictions = preds.argmax(axis=1)
confusion_matrix = metrics.confusion_matrix(y_true=truth, y_pred=predictions)

# Annotated heatmap of the confusion matrix (rows: truth, columns: prediction).
plt.figure(figsize=(3, 3))
sns.heatmap(
    confusion_matrix,
    annot=True, fmt="d", annot_kws={"size": 10},
    xticklabels=LABELS, yticklabels=LABELS,
    linewidth=0.1, linecolor='white',
);
plt.title("Confusion matrix", fontsize=10)
plt.ylabel('True label', fontsize=10)
plt.xlabel('Predicted label', fontsize=10)
plt.show()

# Misclassified samples = all entries minus the diagonal of the matrix.
values = confusion_matrix.view()
error_count = values.sum() - values.trace()

# Scores from scikit-learn with its default settings.
f1, precision, recall = (
    scorer(truth, predictions)
    for scorer in (metrics.f1_score,
                   metrics.precision_score,
                   metrics.recall_score)
)

print ('error count: ', error_count)
for report_line, score in (
    ('f1: {:.3f}', f1),
    ('precision: {:.3f}', precision),
    ('recall: {:.3f}', recall),
):
    print (report_line.format(score))

パターン２

# Keras compile/fit workflow with per-class loss weights.
#
# Pattern3 is a single model instance, so assigning it directly would carry
# over any weights it has already learned in an earlier training pattern.
# Clone it to get the same architecture with newly initialized weights.
model = tf.keras.models.clone_model(Pattern3)

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Errors on class 1 count 20x as much as errors on class 0.
class_weight = {0 : 1., 1: 20.}

epochs = 3
batch_size = 1000

# The held-out test split doubles as the validation set.
history = model.fit(X_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    validation_data=(X_test, y_test),
                    class_weight=class_weight,
                    verbose=2,
                    )

パターン３

def loss(model, x, y):
    """Return the mean categorical cross-entropy of ``model`` on ``(x, y)``.

    ``model(x)`` is treated as probabilities: it is clipped to
    [1e-10, 1.0] before the log so that a zero output cannot produce -inf.
    ``y`` is multiplied element-wise with the log-probabilities and summed
    over axis 1, then averaged over the batch.
    """
    probs = tf.clip_by_value(model(x), 1e-10, 1.0)
    per_sample = tf.reduce_sum(y * tf.log(probs), axis=1)
    return - tf.reduce_mean(per_sample)

def accuracy(y_true, y_pred):
    """Return the fraction of rows whose argmax agrees in both tensors.

    Args:
        y_true: reference scores/labels, compared by argmax along axis 1.
        y_pred: predicted scores, compared by argmax along axis 1.

    Returns:
        Scalar float32 tensor with the mean of the per-row matches.
    """
    true_class = tf.argmax(y_true, axis=1)
    pred_class = tf.argmax(y_pred, axis=1)
    hits = tf.cast(tf.equal(true_class, pred_class), tf.float32)
    return tf.reduce_mean(hits)

def grad(model, x, y):
    """Return the gradients of ``loss(model, x, y)`` w.r.t. ``model.variables``.

    The forward pass is recorded on a gradient tape; the returned list is
    in the same order as ``model.variables``.
    """
    with tf.GradientTape() as recorder:
        batch_loss = loss(model, x, y)
    params = model.variables
    return recorder.gradient(batch_loss, params)

# Hand-written eager training loop with the cross-entropy `loss` function.
model = Pattern2()

epochs, batch_size = 3, 1000

optimizer = tf.train.AdamOptimizer()

for epoch in tqdm(range(epochs)):

    # Fresh random visiting order of the training rows for this epoch.
    n = tf.shape(X_train_tf).numpy()[0]
    perm = np.random.permutation(n)

    for start in range(0, n, batch_size):
        batch_idx = perm[start:start + batch_size]
        batch_x = tf.gather(X_train_tf, batch_idx)
        batch_y = tf.gather(y_train_tf, batch_idx)

        # One optimizer step on this mini-batch.
        grads = grad(model, batch_x, batch_y)
        grads_and_vars = zip(grads, model.variables)
        optimizer.apply_gradients(
            grads_and_vars,
            global_step=tf.train.get_or_create_global_step())

    # End-of-epoch metrics over the full training and test tensors.
    train_loss = loss(model, X_train_tf, y_train_tf)
    train_acc = accuracy(y_train_tf, model(X_train_tf))

    valid_loss = loss(model, X_test_tf, y_test_tf)
    valid_acc = accuracy(y_test_tf, model(X_test_tf))

    # The modulus is the reporting interval (currently every epoch).
    epoch_number = epoch + 1
    if epoch_number % 1 == 0:
        tqdm.write(
            'epoch:\t{}\tloss:\t{:.5f}\tacc:\t{:.5f}\tval_loss:\t{:.5f}\tval_acc:\t{:.5f}'.format(
                epoch_number, train_loss, train_acc, valid_loss, valid_acc))

パターン４

def focal_loss(gamma=2., alpha=4.):
    """Build a Keras-compatible focal loss function.

    The returned callable computes FL(p_t) = -alpha * (1 - p_t)^gamma * ln(p_t)
    (Focal Loss for Dense Object Detection, https://arxiv.org/abs/1708.02002).

    Keyword Arguments:
        gamma {float} -- exponent of the (1 - p_t) factor (default: {2.0})
        alpha {float} -- overall scale factor (default: {4.0})

    Returns:
        [function] -- ``focal_loss_fixed(y_true, y_pred)`` returning a scalar
        loss tensor.
    """
    gamma = float(gamma)
    alpha = float(alpha)

    def focal_loss_fixed(y_true, y_pred):
        """Focal loss for multi-class classification.

        Notice: y_pred is expected to be probabilities (after softmax), so
        the gradient is taken with respect to p_t rather than the logits.

        Arguments:
            y_true {tensor} -- ground truth labels, shape of [batch_size, num_cls]
            y_pred {tensor} -- model's output, shape of [batch_size, num_cls]

        Returns:
            [tensor] -- loss averaged over the batch.
        """
        epsilon = 1.e-9
        y_true = tf.convert_to_tensor(y_true, tf.float32)
        y_pred = tf.convert_to_tensor(y_pred, tf.float32)

        # Shift the probabilities by epsilon so log(0) cannot occur.
        probs = y_pred + epsilon
        ce = y_true * (-tf.log(probs))
        modulator = y_true * tf.pow(1. - probs, gamma)
        fl = alpha * (modulator * ce)
        # Entries are zero wherever y_true is zero, so for one-hot labels the
        # row-wise max picks out the true-class term.
        per_sample = tf.reduce_max(fl, axis=1)
        return tf.reduce_mean(per_sample)
    return focal_loss_fixed

# Keras compile/fit workflow with focal loss.
#
# Pattern3 is a single model instance, so assigning it directly would carry
# over any weights it has already learned in an earlier training pattern.
# Clone it to get the same architecture with newly initialized weights.
model = tf.keras.models.clone_model(Pattern3)

# `focal_loss` here is the factory taking (gamma, alpha) and returning a
# Keras-style loss(y_true, y_pred).
model.compile(loss=focal_loss(alpha=1),
              optimizer='adam',
              metrics=['accuracy'])

# Both callbacks watch the validation loss.
early_stopping = EarlyStopping(monitor='val_loss', mode='min', 
                               patience=5, verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', mode='min', 
                              factor=0.1, patience=5, min_lr=0.00001, verbose=1)

epochs = 3
batch_size = 1000

# The held-out test split doubles as the validation set.
history = model.fit(X_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    validation_data=(X_test, y_test),
                    callbacks=[early_stopping, reduce_lr],
                    verbose=2,                    
                    )

パターン５

def ce_loss(model, x, y):
    """Return the mean categorical cross-entropy of ``model`` on ``(x, y)``.

    ``model(x)`` is treated as probabilities and clipped to [1e-10, 1.0]
    before the log; the per-row sums are averaged over the batch.
    """
    probs = tf.clip_by_value(model(x), 1e-10, 1.0)
    per_sample = tf.reduce_sum(y * tf.log(probs), axis=1)
    return - tf.reduce_mean(per_sample)

def focal_loss(model, x, y, gamma=2., alpha=4.):
    """Return the batch-mean focal loss of ``model`` on ``(x, y)``.

    Computes -alpha * (1 - p)^gamma * ln(p) on the entries selected by ``y``,
    where p is ``model(x)`` shifted by a small epsilon.

    NOTE: this definition reuses the name of the Keras-style
    ``focal_loss(gamma, alpha)`` factory defined earlier in the file and
    replaces it once this cell has run.

    Args:
        model: callable mapping ``x`` to class probabilities.
        x: input batch.
        y: labels with the same shape as ``model(x)``.
        gamma: exponent of the (1 - p) factor.
        alpha: overall scale factor.
    """
    epsilon = 1.e-9
    y_true = tf.convert_to_tensor(y, tf.float32)
    y_pred = tf.convert_to_tensor(model(x), tf.float32)

    # Shift the probabilities by epsilon so log(0) cannot occur.
    probs = y_pred + epsilon
    ce = y_true * (-tf.log(probs))
    modulator = y_true * tf.pow(1. - probs, gamma)
    fl = alpha * (modulator * ce)
    # Entries are zero wherever y_true is zero, so for one-hot labels the
    # row-wise max picks out the true-class term.
    per_sample = tf.reduce_max(fl, axis=1)
    return tf.reduce_mean(per_sample)

def accuracy(y_true, y_pred):
    """Return the fraction of rows whose argmax agrees in both tensors.

    Both arguments are reduced with argmax along axis 1; the result is the
    float32 mean of the per-row equality.
    """
    matches = tf.equal(tf.argmax(y_true, axis=1), tf.argmax(y_pred, axis=1))
    as_float = tf.cast(matches, tf.float32)
    return tf.reduce_mean(as_float)

def grad(model, loss, x, y):
    """Return the gradients of ``loss(model, x, y)`` w.r.t. ``model.variables``.

    Args:
        model: model exposing a ``variables`` list.
        loss: callable ``loss(model, x, y)`` returning a scalar tensor.
        x: input batch.
        y: label batch.
    """
    with tf.GradientTape() as recorder:
        batch_loss = loss(model, x, y)
    params = model.variables
    return recorder.gradient(batch_loss, params)

# Hand-written eager training loop with a pluggable loss function.
model = Pattern2()
# Rebinds the module-level name `loss` to the focal loss defined above.
loss = focal_loss
epochs, batch_size = 3, 1000

optimizer = tf.train.AdamOptimizer()

for epoch in tqdm(range(epochs)):

    # Fresh random visiting order of the training rows for this epoch.
    n = tf.shape(X_train_tf).numpy()[0]
    perm = np.random.permutation(n)

    for start in range(0, n, batch_size):
        batch_idx = perm[start:start + batch_size]
        batch_x = tf.gather(X_train_tf, batch_idx)
        batch_y = tf.gather(y_train_tf, batch_idx)

        # One optimizer step on this mini-batch.
        grads = grad(model, loss, batch_x, batch_y)
        grads_and_vars = zip(grads, model.variables)
        optimizer.apply_gradients(
            grads_and_vars,
            global_step=tf.train.get_or_create_global_step())

    # End-of-epoch metrics over the full training and test tensors.
    train_loss = loss(model, X_train_tf, y_train_tf)
    train_acc = accuracy(y_train_tf, model(X_train_tf))

    valid_loss = loss(model, X_test_tf, y_test_tf)
    valid_acc = accuracy(y_test_tf, model(X_test_tf))

    # The modulus is the reporting interval (currently every epoch).
    epoch_number = epoch + 1
    if epoch_number % 1 == 0:
        tqdm.write(
            'epoch:\t{}\tloss:\t{:.5f}\tacc:\t{:.5f}\tval_loss:\t{:.5f}\tval_acc:\t{:.5f}'.format(
                epoch_number, train_loss, train_acc, valid_loss, valid_acc))

# Forward pass of the eager-trained model on the test tensors.
preds = model(X_test_tf)

LABELS = ['Normal','Fraud']

# Turn one-hot labels and predicted probabilities into class indices.
truth = np.argmax(y_test_tf, axis=1)
predictions = np.argmax(preds, axis=1)
confusion_matrix = metrics.confusion_matrix(y_true=truth, y_pred=predictions)

# Annotated heatmap of the confusion matrix (rows: truth, columns: prediction).
plt.figure(figsize=(3, 3))
sns.heatmap(
    confusion_matrix,
    annot=True, fmt="d", annot_kws={"size": 10},
    xticklabels=LABELS, yticklabels=LABELS,
    linewidth=0.1, linecolor='white',
);
plt.title("Confusion matrix", fontsize=10)
plt.ylabel('True label', fontsize=10)
plt.xlabel('Predicted label', fontsize=10)
plt.show()