1
3

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?

More than 3 years have passed since last update.

mnistをauto encoderで教師なし学習し、潜在変数をクラスタリングして評価する

Last updated at Posted at 2020-11-03

mnistをauto encoderで教師なし学習し、最終段をクラスタリングして評価する

# 必要なライブラリのインポート
from keras.datasets import mnist
import numpy as np
import pandas as pd
import sklearn
# Jupyter notebookを利用している際に、notebook内にplot結果を表示するようにする
import matplotlib.pyplot as plt
%matplotlib inline

from keras.layers import Input, Dense
from keras.models import Model
from keras import backend as K
import gc
Using TensorFlow backend.
feature_dims = range(8, 32+1, 8)
display(list(feature_dims))
[8, 16, 24, 32]
#Kerasの関数でデータの読み込み。データをシャッフルして学習データと訓練データに分割
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# 2次元データを数値に変換
x_train = x_train.reshape(60000, 784)
x_test = x_test.reshape(10000, 784)
# 型変換
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
# 255で割ったものを新たに変数とする
x_train /= 255
x_test /= 255

# one-hot encodingを施すためのメソッド
from keras.utils.np_utils import to_categorical
# クラス数は10
num_classes = 10
y_train = y_train.astype('int32')
y_test = y_test.astype('int32')
labels = y_test
# one-hot encoding
y_train = to_categorical(y_train, num_classes)
y_test =  to_categorical(y_test, num_classes)
def fitting(feature_dim, x_train, y_train, x_test, y_test):
    # モデル構築
    layer_name = 'encoded'
    input_img = Input(shape=(784,))
    x1 = Dense(256, activation='relu')(input_img)  
    x2 = Dense(64, activation='relu')(x1)  
    encoded = Dense(feature_dim, activation='relu', name=layer_name)(x2) 
    x3 = Dense(64, activation='relu')(encoded)
    x4 = Dense(256, activation='relu')(x3)  
    decoded = Dense(784, activation='sigmoid')(x4)
    autoencoder = Model(input=input_img, output=decoded)

    z_layer_model = Model(inputs=autoencoder.input,
                                    outputs=autoencoder.get_layer(layer_name).output)

    autoencoder.compile(optimizer='adadelta', loss='binary_crossentropy')
    autoencoder.summary()  

    # 学習
    history = autoencoder.fit(x_train, x_train,
                    nb_epoch=40,    
                    batch_size=256,
                    shuffle=True,
                    validation_data=(x_test, x_test))

    result = [autoencoder.predict(x_test), z_layer_model.predict(x_test)]

    K.clear_session() # ←これです
    gc.collect()
    from IPython.display import clear_output
    clear_output()
    return (history, autoencoder, result)
#model = fitting(10, x_train, y_train, x_test, y_test)
models = [None] * len(feature_dims)
histories = [None] * len(feature_dims)
dec_imgs = [None] * len(feature_dims)
results = [None] * len(feature_dims)
for i in range(len(feature_dims)):
    (histories[i], models[i], dec_imgs[i]) = fitting(feature_dims[i], x_train, y_train, x_test, y_test)
for i in range(len(feature_dims)):
    print(feature_dims[i])
    # テスト画像と変換画像の表示
    n = 10
    plt.figure(figsize=(10, 2))
    for j in range(n):
        # テスト画像を表示
        ax = plt.subplot(2, n, j+1)
        plt.imshow(x_test[j].reshape(28, 28))
        plt.gray()
        ax.get_xaxis().set_visible(False)
        ax.get_yaxis().set_visible(False)

        # 変換された画像を表示
        ax = plt.subplot(2, n, j+1+n)
        plt.imshow(dec_imgs[i][0][j].reshape(28, 28))
        plt.gray()
        ax.get_xaxis().set_visible(False)
        ax.get_yaxis().set_visible(False)
    plt.show()
8

output_5_1.png

16

output_5_3.png

24

output_5_5.png

32

output_5_7.png

for i in range(len(feature_dims)):
    results[i] = dec_imgs[i][1]
#model.save('model/mnist-10')
#model = keras.models.load_model('model/mnist-10')
#for i in range(len(feature_dims)):
#    models[i].pop() # 最終段のsoftmax層を取り除いて、特徴量の層を最終段とする
#    models[i].summary()
#result = model.predict(x_test)
#results = [None] * len(feature_dims)
#for i in range(len(feature_dims)):
#    keras.backend.clear_session()
#    results[i] = models[i].predict(x_test)
def tsne(result):
    #t-SNEで次元削減
    from sklearn.manifold import TSNE
    tsne = TSNE(n_components=2, random_state = 0, perplexity = 30, n_iter = 1000)
    return tsne.fit_transform(result)
#tsne = tsne(result)
tsnes = [None] * len(feature_dims)
for i in range(len(feature_dims)):
    tsnes[i] = tsne(results[i])
#df = pd.DataFrame(tsne, columns = ['x', 'y'])
#df['label'] = labels
def km(n_clusters, result):
    # k-meansでクラスタリングする
    from sklearn.cluster import KMeans
    return KMeans(n_clusters).fit_predict(result)
#km = km(10, result)
#df['km'] = km
kms = [None] * len(feature_dims)
for i in range(len(feature_dims)):
    kms[i] = km(10, results[i])
def DBSCAN(n_clusters, result):
    from sklearn.cluster import DBSCAN
    db = DBSCAN(eps=0.2, min_samples=n_clusters).fit(result)
    return db.labels_
#dbscan = DBSCAN(20, result)
#df['DBSCAN'] = dbscan
def hierarchy(result):
    from scipy.cluster.hierarchy import linkage, dendrogram
    result1 = linkage(result, 
                  metric = 'braycurtis', 
                  #metric = 'canberra', 
                  #metric = 'chebyshev', 
                  #metric = 'cityblock', 
                  #metric = 'correlation', 
                  #metric = 'cosine', 
                  #metric = 'euclidean', 
                  #metric = 'hamming', 
                  #metric = 'jaccard', 
                  #method= 'single')
                  method = 'average')
                  #method= 'complete')
                  #method='weighted')
    return result1
#hierarchy = hierarchy(result)
#display(hierarchy)
def label_to_colors(label):
    color_dict = dict([(color[0], color[1]['color']) for color in zip(np.unique(label), plt.rcParams['axes.prop_cycle'])])
    colors = np.empty(label.shape, np.object)
    for k, v in color_dict.items():
        colors[label==k] = v
    return colors

#def cluster_visualization(x, y, label, cluster, method, n_clusters):
def cluster_visualization(x, y, label, cluster):
    plt.figure(figsize = (30, 15))
    plt.subplot(1,2,1)
    plt.scatter(x, y, c=label_to_colors(label))
#    for i in range(10):
#        tmp_df = df[df['label'] == i]
#        plt.scatter(tmp_df['x'], tmp_df['y'], label=i)
#    plt.legend(loc='upper left', bbox_to_anchor=(1,1))
    plt.subplot(1,2,2)
    plt.scatter(x, y, c=label_to_colors(cluster))
#    for i in range(n_clusters):
#        tmp_df = df[df[method] == i]
#        plt.scatter(tmp_df['x'], tmp_df['y'], label=i)
#    plt.legend(loc='upper left', bbox_to_anchor=(1,1))
for i in range(len(feature_dims)):
    cluster_visualization(tsnes[i][:,0], tsnes[i][:,1], labels, kms[i])

output_19_0.png

output_19_1.png

output_19_2.png

output_19_3.png

# https://qiita.com/mamika311/items/75c24f6892f85593f7e7
from sklearn.metrics.cluster import adjusted_rand_score
for i in range(len(feature_dims)):
    print("dim:" + str(feature_dims[i]) + " ARI: " + str(adjusted_rand_score(labels, kms[i])))
dim:8 ARI: 0.3987309485653015
dim:16 ARI: 0.40738458796211546
dim:24 ARI: 0.3677837864385967
dim:32 ARI: 0.43182464556112676
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.normalized_mutual_info_score.html
# https://qiita.com/kotap15/items/38289edfe822005e1e44
from sklearn.metrics import normalized_mutual_info_score
#display(normalized_mutual_info_score(labels, df['km']))
for i in range(len(feature_dims)):
    print("dim:" + str(feature_dims[i]) + " NMI: " + str(normalized_mutual_info_score(labels, kms[i])))
dim:8 NMI: 0.525123015401584
dim:16 NMI: 0.5452028060642871
dim:24 NMI: 0.5173700351804098
dim:32 NMI: 0.5592638372411443
def shilhouette(clusters, x_test):
    from sklearn.metrics import silhouette_samples
    from matplotlib import cm
    plt.figure(figsize = (10, 10))
    cluster_labels=np.unique(clusters)
    n_clusters=cluster_labels.shape[0]
    silhouette_vals=silhouette_samples(x_test,clusters,metric='euclidean')
    y_ax_lower,y_ax_upper=0,0
    yticks=[]
    for i,c in enumerate(cluster_labels):
        c_silhouette_vals=silhouette_vals[clusters==c]
        print(len(c_silhouette_vals))
        c_silhouette_vals.sort()
        y_ax_upper +=len(c_silhouette_vals)
        color=cm.jet(float(i)/n_clusters)
        plt.barh(range(y_ax_lower,y_ax_upper),
                c_silhouette_vals,
                height=1.0,
                edgecolor='none',
                color=color
                )
        yticks.append((y_ax_lower+y_ax_upper)/2.)
        y_ax_lower += len(c_silhouette_vals)

    #シルエット係数が1であれば 良くクラスタリングできてる 
    #またシルエットの幅がクラスタ数で平均して等しいとき、全体のデータを等分割できていることを示す
    #この分割幅=シルエットバーの幅が等しくなり、かつ、シルエット係数が1に近づくようにkを最適化することが設定手法として考えられる.

    #平均の位置に線を引く
    silhouette_avg=np.mean(silhouette_vals)
    plt.axvline(silhouette_avg,color="red",linestyle="--")
    plt.ylabel("Cluster")
    plt.xlabel("Silhouette coefficient")
for i in range(len(feature_dims)):
    shilhouette(kms[i], x_test)
1077
1368
1273
824
854
1070
1251
848
758
677
1047
1660
869
824
1400
532
926
770
1314
658
793
784
929
1452
889
733
1592
1381
521
926
1503
843
810
1343
500
908
1559
744
973
817

output_23_1.png

output_23_2.png

output_23_3.png

output_23_4.png


1
3
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
1
3

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?