LoginSignup
10
7

t-SNE & UMAP 試行・分けられた分布の中にある数字の形の傾向を調べる (MNIST 28x28 60000枚で再試行→1分弱で高速)

Last updated at Posted at 2020-05-05

下記の試行をしました:

  • t-SNE、UMAPを試行。
  • MNIST 28x28手書き数字、学習用画像、60000枚を使用。
  • それぞれ分けられた分布の中にある数字の形の傾向を調べる。

結果:

  • 6万枚の画像を処理すると、t-SNEは1時間40分もかかる結果となる。
  • t-SNE、おおむね良い分解の性能。グループ同士の距離が近め、分解があいまいな部分もある。
  • UMAPは、t-SNEと比較して、処理に1分もかからず(ローカルPC CPUのみ)、圧倒的に高速で処理される模様。
  • UMAPの方が、全体的にはグループ同士の距離が遠くなるが、近い部分はほとんどグループが接してしまう。グループの塊だけを見ていると、別グループへ分解するのは困難。
  • 双方の手法、同じグループの中でも、さらに、同じような見た目の数字は同じ場所付近に分布し、傾向がさらに分解され、良好。

前回の試行は:

参照したもの:

ライブラリをLoad

import numpy as np
import matplotlib.pyplot as plt

import torch
import torchvision
import sklearn
!pip show torch
Name: torch
Version: 1.5.0+cpu
Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration
Home-page: https://pytorch.org/
Author: PyTorch Team
Author-email: packages@pytorch.org
License: BSD-3
Location: ...\wpy64-3741\python-3.7.4.amd64\lib\site-packages
Requires: numpy, future
Required-by: torchvision

データをLoad

# sec: load the MNIST training split (train=True) from / into trains/pytorch-mnist

ds = torchvision.datasets.MNIST(root="trains/pytorch-mnist", train=True, download=True)
print(ds)
Dataset MNIST
    Number of datapoints: 60000
    Root location: trains/pytorch-mnist
    Split: Train
# sec: MNISTの画像をグリッド状に描画

def draw_digits(i_list, n_grid=(10, 10), annosize=10, figsize=(12, 12)):
    """Draw dataset images on a grid of subplots, one image per cell.

    Images and labels are read from the module-level dataset ``ds``; every
    ``ds[i]`` is used as an ``(image, label)`` pair.

    Args:
        i_list: Iterable of dataset indices, drawn in order into successive
            grid cells. A ``None`` entry leaves its cell blank.
        n_grid: ``(rows, cols)`` of the subplot grid. Once all cells of a
            figure are used, a new figure is started.
        annosize: Font size of the index / label annotations drawn on each
            image. ``None`` hides both annotations.
        figsize: Size passed to ``plt.figure`` for every figure created.
    """
    n_cells = n_grid[0] * n_grid[1]

    fig = None
    i_ax = 0
    for i_img in i_list:

        if fig is None or i_ax >= n_cells:  # first image, or the current figure is full
            fig = plt.figure(figsize=figsize)
            plt.subplots_adjust(hspace=0.02, wspace=0)
            i_ax = 0
        i_ax += 1

        ax = fig.add_subplot(n_grid[0], n_grid[1], i_ax)
        if i_img is None:  # placeholder entry: keep the cell empty
            ax.axis('off')
            continue

        # Index the dataset once per image instead of once for the image and
        # again for the label.
        img, label = ds[i_img]
        ax.imshow(img, cmap='gray', interpolation='none')
        if annosize is not None:  # annotate with the dataset index and the true label
            ax.annotate("%d" % i_img,
                xy=(0, 0.98), xycoords='axes fraction', ha='left', va='top', color='y', fontsize=annosize)
            ax.annotate("L:%d" % label,
                xy=(1, 0.98), xycoords='axes fraction', ha='right', va='top', color='c', fontsize=annosize)
        ax.axis('off')

    plt.show()
    
draw_digits(list(range(24)))

output_5_0.png

t-SNEを実行

from sklearn.manifold import TSNE
# sec: data conversion -- turn the dataset into plain NumPy arrays

n_data = len(ds) # smaller values tried: 2000, 10000; len(ds) uses all 60k images, which makes the processing time enormous
# one row per image: each image flattened into a 1-D vector
data_tsne = np.array([np.asarray(ds[i][0]).flatten() for i in range(n_data)])
# the label of each image as a 1-D array (used later to color the scatter plots)
data_true = np.array([np.asarray(ds[i][1]) for i in range(n_data)]).flatten()
print(data_tsne.shape)
print(data_true.shape)
(60000, 784)
(60000,)
# sec: run (this is slow, so the output shown here comes from a run on Google Colab)

model = TSNE(n_components=2) # reduce to 2 dimensions
%time res = model.fit_transform(data_tsne) # pass a 2-D array: (number of images) x (flattened 1-D image vector)
print(res.shape)
CPU times: user 1h 39min 20s, sys: 1.1 s, total: 1h 39min 21s
Wall time: 1h 39min 23s
(60000, 2)
# sec: plot the result -- scatter of the 2-D embedding, colored by the true label

import matplotlib.cm as cm
plt.figure(figsize=(12, 12))
plt.scatter(res[:,0], res[:,1], s=3, c=data_true, cmap=cm.tab10)
plt.colorbar()
plt.grid()
plt.show()

output_10_0.png

結果のグループを調べる

# sec: save the result, because it changes on every run

import pickle
with open("./results/2005 t-SNE/res-tsne-60k.pickle", 'wb') as file:
    pickle.dump(res, file)
# sec: load the saved result, to continue from the previous session
# NOTE: pickle.load can execute code embedded in the file; only load files you created yourself.

import pickle
with open("./results/2005 t-SNE/res-tsne-60k.pickle", 'rb') as file:
    res = pickle.load(file)

分布の位置に画像を当てはめて格子状に表示

def draw_digits_at_tsne(res, x_min, x_max, y_min, y_max,
    n_grid=(15, 15), annosize=8, figsize=(12, 12)):
    """Show which images sit where inside a rectangle of a 2-D embedding.

    The rectangle ``[x_min, x_max] x [y_min, y_max]`` is split into
    ``n_grid[0]`` rows by ``n_grid[1]`` columns. For every cell, the point of
    ``res`` lying strictly inside the cell and closest to the cell center is
    selected (``None`` if the cell holds no point). Two things are then drawn:

    1. a 6x6 scatter plot of ``res`` limited to the rectangle, colored by the
       module-level ``data_true``;
    2. the selected images, laid out on the same grid via ``draw_digits``.

    Args:
        res: 2-D array whose columns 0 and 1 are the x and y coordinates.
        x_min, x_max, y_min, y_max: Bounds of the rectangle to inspect.
        n_grid: ``(rows, cols)`` of the cell grid.
        annosize: Forwarded to ``draw_digits``.
        figsize: Forwarded to ``draw_digits`` (the scatter plot size is fixed).
    """
    n_rows = n_grid[0]
    n_cols = n_grid[1]
    x_pitch = (x_max - x_min) / n_cols
    y_pitch = (y_max - y_min) / n_rows
    xs = res[:, 0]
    ys = res[:, 1]

    i_draw_list = []
    for i_y in range(n_rows):
        # Rows run from the top (y_max) downwards so the image grid matches the plot.
        y_i = y_max - y_pitch * i_y - y_pitch/2  # cell center
        in_row = (y_i-y_pitch/2 < ys) & (ys < y_i+y_pitch/2)
        for i_x in range(n_cols):
            x_i = x_min + x_pitch * i_x + x_pitch/2  # cell center
            in_col = (x_i-x_pitch/2 < xs) & (xs < x_i+x_pitch/2)

            # Indices of the points inside this cell.
            candidates = np.flatnonzero(in_col & in_row)
            if candidates.size == 0:  # no point in this cell
                i_draw_list.append(None)
            else:
                # Squared distance to the cell center, measured in units of the cell size.
                dx = (xs[candidates] - x_i) / x_pitch
                dy = (ys[candidates] - y_i) / y_pitch
                nearest = candidates[np.argmin(dx**2 + dy**2)]
                i_draw_list.append(nearest)

    # Overview of the point distribution within the requested rectangle.
    plt.figure(figsize=(6, 6))
    plt.scatter(xs, ys, s=10, c=data_true, cmap=cm.tab10)
    plt.axis([x_min, x_max, y_min, y_max])
    plt.grid()
    plt.show()

    draw_digits(i_draw_list, n_grid=n_grid, annosize=annosize, figsize=figsize)

draw_digits_at_tsne(res, -55, -20, -30, -5)

output_15_0.png

output_15_1.png

全体の分布状況を描画

x_max, y_max = np.amax(res, axis=0)
x_min, y_min = np.amin(res, axis=0)

draw_digits_at_tsne(res, x_min, x_max, y_min, y_max, n_grid=(50, 50), annosize=None)

output_17_0.png

output_17_1.png

分割して拡大、分布状況を描画

# sec: 分割

def gen_grid_rect(x_min, x_max, y_min, y_max, n_part=3):
    """Split a rectangle into ``n_part`` x ``n_part`` equal sub-rectangles.

    Args:
        x_min, x_max, y_min, y_max: Bounds of the rectangle to split.
        n_part: Number of divisions along each axis.

    Returns:
        A list of ``[x1, x2, y1, y2]`` lists, ordered row by row: y increases
        in the outer order, x increases within each row.
    """
    x_edges = np.linspace(x_min, x_max, n_part + 1)
    y_edges = np.linspace(y_min, y_max, n_part + 1)

    rects = []
    # Consecutive edge pairs are the lower / upper bound of each strip.
    for y_lo, y_hi in zip(y_edges[:-1], y_edges[1:]):
        for x_lo, x_hi in zip(x_edges[:-1], x_edges[1:]):
            rects.append([x_lo, x_hi, y_lo, y_hi])
    return rects

gen_grid_rect(x_min, x_max, y_min, y_max)
[[-51.664737701416016,
  -15.292266845703125,
  -57.17302703857422,
  -19.207008361816406],
 [-15.292266845703125,
  21.080204010009766,
  -57.17302703857422,
  -19.207008361816406],
 [21.080204010009766,
  57.452674865722656,
  -57.17302703857422,
  -19.207008361816406],
 [-51.664737701416016,
  -15.292266845703125,
  -19.207008361816406,
  18.759010314941406],
 [-15.292266845703125,
  21.080204010009766,
  -19.207008361816406,
  18.759010314941406],
 [21.080204010009766,
  57.452674865722656,
  -19.207008361816406,
  18.759010314941406],
 [-51.664737701416016,
  -15.292266845703125,
  18.759010314941406,
  56.72502899169922],
 [-15.292266845703125,
  21.080204010009766,
  18.759010314941406,
  56.72502899169922],
 [21.080204010009766,
  57.452674865722656,
  18.759010314941406,
  56.72502899169922]]
for x1, x2, y1, y2 in gen_grid_rect(x_min, x_max, y_min, y_max):
    draw_digits_at_tsne(res, x1, x2, y1, y2, n_grid=(25, 25), annosize=None)

output_20_0.png

output_20_1.png

output_20_2.png

output_20_3.png

output_20_4.png

output_20_5.png

output_20_6.png

output_20_7.png

output_20_8.png

output_20_9.png

output_20_10.png

output_20_11.png

output_20_12.png

output_20_13.png

output_20_14.png

output_20_15.png

output_20_16.png

output_20_17.png

UMAPを試す

t-SNEの時と同様に、MNIST 28x28の学習用画像、6万枚を使用。t-SNEの試行の中で作った描画用の関数を使用。

UMAPは、t-SNEと比較して、処理に1分もかからず(ローカルPC CPUのみ)、圧倒的に高速で処理される模様。

!pip3 install umap-learn
Requirement already satisfied: umap-learn in ...\wpy64-3741\python-3.7.4.amd64\lib\site-packages (0.3.10)
Requirement already satisfied: scipy>=0.19 in ...\wpy64-3741\python-3.7.4.amd64\lib\site-packages (from umap-learn) (1.3.1)
Requirement already satisfied: numpy>=1.13 in ...\wpy64-3741\python-3.7.4.amd64\lib\site-packages (from umap-learn) (1.16.5+mkl)
Requirement already satisfied: numba>=0.37 in ...\wpy64-3741\python-3.7.4.amd64\lib\site-packages (from umap-learn) (0.45.1)
Requirement already satisfied: scikit-learn>=0.16 in ...\wpy64-3741\python-3.7.4.amd64\lib\site-packages (from umap-learn) (0.21.3)
Requirement already satisfied: llvmlite>=0.29.0 in ...\wpy64-3741\python-3.7.4.amd64\lib\site-packages (from numba>=0.37->umap-learn) (0.29.0)
Requirement already satisfied: joblib>=0.11 in ...\wpy64-3741\python-3.7.4.amd64\lib\site-packages (from scikit-learn>=0.16->umap-learn) (0.13.2)


WARNING: You are using pip version 19.2.3, however version 20.1 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.

UMAPを実行

import umap
from scipy.sparse.csgraph import connected_components
# This is mentioned on the official GitHub: without the import above, an error is raised.
# sec: run -> takes less than a minute; appears to be overwhelmingly faster

%time res_umap = umap.UMAP().fit_transform(data_tsne)
print(res_umap.shape)
...\WPy64-3741\python-3.7.4.amd64\lib\site-packages\numba\compiler.py:602: NumbaPerformanceWarning: 
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "..\..\python-3.7.4.amd64\lib\site-packages\umap\nndescent.py", line 47:
    @numba.njit(parallel=True)
    def nn_descent(
    ^

  self.func_ir.loc))


Wall time: 56.5 s
(60000, 2)
# sec: plot the result -- scatter of the 2-D UMAP embedding, colored by the true label

import matplotlib.cm as cm
plt.figure(figsize=(12, 12))
plt.scatter(res_umap[:,0], res_umap[:,1], s=3, c=data_true, cmap=cm.tab10)
plt.colorbar()
plt.grid()
plt.show()

output_26_0.png

# sec: run (the same run, also tried on Google Colab)

%time res_umap = umap.UMAP().fit_transform(data_tsne)
print(res_umap.shape)
CPU times: user 2min 18s, sys: 3.5 s, total: 2min 21s
Wall time: 1min 31s
(60000, 2)
# sec: plot the result -- scatter of the 2-D UMAP embedding, colored by the true label

import matplotlib.cm as cm
plt.figure(figsize=(12, 12))
plt.scatter(res_umap[:,0], res_umap[:,1], s=3, c=data_true, cmap=cm.tab10)
plt.colorbar()
plt.grid()
plt.show()

output_28_0.png

結果のグループを調べる

# sec: save the result, because it changes on every run

import pickle
with open("./results/2005 t-SNE/res-umap-60k.pickle", 'wb') as file:
    pickle.dump(res_umap, file)
# sec: load the saved result, to continue from the previous session
# NOTE: pickle.load can execute code embedded in the file; only load files you created yourself.

import pickle
with open("./results/2005 t-SNE/res-umap-60k.pickle", 'rb') as file:
    res_umap = pickle.load(file)

全体の分布状況を描画

x_max, y_max = np.amax(res_umap, axis=0)
x_min, y_min = np.amin(res_umap, axis=0)

draw_digits_at_tsne(res_umap, x_min, x_max, y_min, y_max, n_grid=(50, 50), annosize=None)

output_33_0.png

output_33_1.png

分割して拡大、分布状況を描画

# sec: 分割

def gen_grid_rect(x_min, x_max, y_min, y_max, n_part=3):
    """Split a rectangle into ``n_part`` x ``n_part`` equal sub-rectangles.

    Args:
        x_min, x_max, y_min, y_max: Bounds of the rectangle to split.
        n_part: Number of divisions along each axis.

    Returns:
        A list of ``[x1, x2, y1, y2]`` lists, ordered row by row: y increases
        in the outer order, x increases within each row.
    """
    x_edges = np.linspace(x_min, x_max, n_part + 1)
    y_edges = np.linspace(y_min, y_max, n_part + 1)

    rects = []
    # Consecutive edge pairs are the lower / upper bound of each strip.
    for y_lo, y_hi in zip(y_edges[:-1], y_edges[1:]):
        for x_lo, x_hi in zip(x_edges[:-1], x_edges[1:]):
            rects.append([x_lo, x_hi, y_lo, y_hi])
    return rects

gen_grid_rect(x_min, x_max, y_min, y_max)
[[-5.68484354019165,
  1.8034880956013994,
  -4.267199516296387,
  2.2263097763061523],
 [1.8034880956013994,
  9.29181973139445,
  -4.267199516296387,
  2.2263097763061523],
 [9.29181973139445, 16.7801513671875, -4.267199516296387, 2.2263097763061523],
 [-5.68484354019165,
  1.8034880956013994,
  2.2263097763061523,
  8.719819068908691],
 [1.8034880956013994, 9.29181973139445, 2.2263097763061523, 8.719819068908691],
 [9.29181973139445, 16.7801513671875, 2.2263097763061523, 8.719819068908691],
 [-5.68484354019165, 1.8034880956013994, 8.719819068908691, 15.21332836151123],
 [1.8034880956013994, 9.29181973139445, 8.719819068908691, 15.21332836151123],
 [9.29181973139445, 16.7801513671875, 8.719819068908691, 15.21332836151123]]
for x1, x2, y1, y2 in gen_grid_rect(x_min, x_max, y_min, y_max):
    draw_digits_at_tsne(res_umap, x1, x2, y1, y2, n_grid=(25, 25), annosize=None)

output_36_0.png

output_36_1.png

output_36_2.png

output_36_3.png

output_36_4.png

output_36_5.png

output_36_6.png

output_36_7.png

output_36_8.png

output_36_9.png

output_36_10.png

output_36_11.png

output_36_12.png

output_36_13.png

output_36_14.png

output_36_15.png

output_36_16.png

output_36_17.png


10
7
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
10
7