More than 1 year has passed since last update.

t-SNE & UMAP 試行・分けられた分布の中にある数字の形の傾向を調べる (MNIST 28x28 60000枚で再試行→1分弱で高速)

Last updated at 2024-05-14 / Posted at 2020-05-05

下記の試行をしました:

t-SNE、UMAPを試行。
MNIST 28x28手書き数字、学習用画像、60000枚を使用。
それぞれ分けられた分布の中にある数字の形の傾向を調べる。

結果:

6万枚の画像を処理すると、t-SNEは1時間40分もかかる結果となる。
t-SNE、おおむね良い分解の性能。グループ同士の距離が近め、分解があいまいな部分もある。
UMAPは、t-SNEと比較して、処理に1分もかからず(ローカルPC CPUのみ)、圧倒的に高速で処理される模様。
UMAPの方が、全体的にはグループ同士の距離が遠くなるが、近い部分はほとんどグループが接してしまう。グループの塊だけを見ていると、別グループへ分解するのは困難。
双方の手法、同じグループの中でも、さらに、同じような見た目の数字は同じ場所付近に分布し、傾向がさらに分解され、良好。

前回の試行は:

参照したもの:

ライブラリをLoad

import numpy as np
import matplotlib.pyplot as plt

import torch
import torchvision
import sklearn

!pip show torch

Name: torch
Version: 1.5.0+cpu
Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration
Home-page: https://pytorch.org/
Author: PyTorch Team
Author-email: packages@pytorch.org
License: BSD-3
Location: ...\wpy64-3741\python-3.7.4.amd64\lib\site-packages
Requires: numpy, future
Required-by: torchvision

データをLoad

# sec: load

ds = torchvision.datasets.MNIST(root="trains/pytorch-mnist", train=True, download=True)
print(ds)

Dataset MNIST
    Number of datapoints: 60000
    Root location: trains/pytorch-mnist
    Split: Train

# sec: MNISTの画像をグリッド状に描画

def draw_digits(i_list, n_grid=(10, 10), annosize=10, figsize=(12, 12)):
    """Draw dataset images on a grid of subplots.

    Reads the module-level dataset ``ds``; ``ds[i]`` must yield an
    ``(image, label)`` pair. A new figure is opened whenever the current
    grid is full, and all figures are shown together at the end.

    Args:
        i_list: Iterable of indices into ``ds``. A ``None`` entry leaves
            its grid cell blank.
        n_grid: ``(rows, columns)`` of the subplot grid of each figure.
        annosize: Font size of the index / label annotations drawn on each
            image; ``None`` hides both annotations.
        figsize: Figure size passed to ``plt.figure``.
    """
    cells_per_fig = n_grid[0] * n_grid[1]
    fig = None
    i_ax = 0
    for i_img in i_list:

        if fig is None or i_ax >= cells_per_fig:
            # First image, or the current figure is full: start a new one.
            fig = plt.figure(figsize=figsize)
            plt.subplots_adjust(hspace=0.02, wspace=0)
            i_ax = 0
        i_ax += 1

        ax = fig.add_subplot(n_grid[0], n_grid[1], i_ax)
        if i_img is None:
            # Placeholder entry: keep the cell empty.
            ax.axis('off')
            continue

        # Index the dataset once; the pair carries both image and label.
        img, label = ds[i_img]
        ax.imshow(img, cmap='gray', interpolation='none')
        if annosize is not None:  # annotate with the image index and its true label
            ax.annotate("%d" % i_img,
                xy=(0, 0.98), xycoords='axes fraction', ha='left', va='top', color='y', fontsize=annosize)
            ax.annotate("L:%d" % label,
                xy=(1, 0.98), xycoords='axes fraction', ha='right', va='top', color='c', fontsize=annosize)
        ax.axis('off')

    plt.show()
    
draw_digits(list(range(24)))

t-SNEを実行

from sklearn.manifold import TSNE

# sec: data conversion

n_data = len(ds) # 2000, 10000, len(ds) <- using all 60k images makes t-SNE take a very long time
# ds[i] yields the (image, label) pair, so one pass over the dataset is enough
# to build both arrays instead of indexing every sample twice.
images, labels = [], []
for i in range(n_data):
    img, label = ds[i]
    images.append(np.asarray(img).flatten())  # 2D image -> 1D feature vector
    labels.append(np.asarray(label))
data_tsne = np.array(images)
data_true = np.array(labels).flatten()
del images, labels  # the per-sample copies are no longer needed
print(data_tsne.shape)
print(data_true.shape)

(60000, 784)
(60000,)

# sec: 実行 (時間がかかるため、ここはGoogle Colab上で処理した結果)

model = TSNE(n_components=2) # 2軸へ次元圧縮
%time res = model.fit_transform(data_tsne) # 画像数 x 1Dベクトルの2D配列を渡す
print(res.shape)

CPU times: user 1h 39min 20s, sys: 1.1 s, total: 1h 39min 21s
Wall time: 1h 39min 23s
(60000, 2)

# sec: 結果の描画

import matplotlib.cm as cm
plt.figure(figsize=(12, 12))
plt.scatter(res[:,0], res[:,1], s=3, c=data_true, cmap=cm.tab10)
plt.colorbar()
plt.grid()
plt.show()

結果のグループを調べる

# sec: save the result, because it changes on every run

import os
import pickle

res_tsne_path = "./results/2005 t-SNE/res-tsne-60k.pickle"
# open() does not create missing folders, so make sure the target directory exists.
os.makedirs(os.path.dirname(res_tsne_path), exist_ok=True)
with open(res_tsne_path, 'wb') as file:
    pickle.dump(res, file)

# sec: load the result, to continue from a previous session

import pickle
# NOTE: only unpickle files written by this notebook itself; pickle.load can
# execute code stored in the file.
with open("./results/2005 t-SNE/res-tsne-60k.pickle", 'rb') as file:
    res = pickle.load(file)

分布の位置に画像を当てはめて格子状に表示

def _grid_cell_mask(values, edges, i_cell):
    # Boolean mask of `values` inside cell `i_cell` of the 1-D partition `edges`.
    # Cells are half-open [lo, hi) so a point on an interior edge belongs to
    # exactly one cell; the last cell is closed so the maximum is not lost.
    inside = values >= edges[i_cell]
    if i_cell == len(edges) - 2:
        return inside & (values <= edges[i_cell + 1])
    return inside & (values < edges[i_cell + 1])


def draw_digits_at_tsne(res, x_min, x_max, y_min, y_max, 
    n_grid=(15, 15), annosize=8, figsize=(12, 12)):
    """Plot a rectangle of a 2-D embedding and the digit images that represent it.

    The rectangle is divided into ``n_grid`` cells. For every cell, the
    embedded point closest to the cell centre is chosen and its image is
    drawn at the matching grid position through ``draw_digits``; cells
    without points stay blank. Points lying exactly on a cell edge or on the
    rectangle border are included, so the extreme points of a range taken
    from ``np.amin`` / ``np.amax`` are not dropped.

    Uses the module-level names ``data_true`` (scatter colours), ``cm``,
    ``plt`` and ``draw_digits``.

    Args:
        res: Array indexed as ``res[:, 0]`` (x) and ``res[:, 1]`` (y), one
            row per sample.
        x_min, x_max, y_min, y_max: Bounds of the rectangle to inspect.
        n_grid: ``(rows, columns)`` of the cell grid.
        annosize: Forwarded to ``draw_digits``.
        figsize: Forwarded to ``draw_digits``.
    """
    n_rows, n_cols = n_grid[0], n_grid[1]
    x_pitch = (x_max - x_min) / n_cols
    y_pitch = (y_max - y_min) / n_rows
    # Neighbouring cells share the very same edge value, so rounding can
    # neither duplicate nor lose a point between them.
    x_edges = np.linspace(x_min, x_max, n_cols + 1)
    y_edges = np.linspace(y_min, y_max, n_rows + 1)
    xs, ys = res[:, 0], res[:, 1]

    # Column membership does not depend on the row: compute it once.
    col_masks = [_grid_cell_mask(xs, x_edges, i_x) for i_x in range(n_cols)]

    i_draw_list = []
    for i_y in range(n_rows):
        i_row = n_rows - 1 - i_y  # images are laid out from the top (largest y) downwards
        y_i = (y_edges[i_row] + y_edges[i_row + 1]) / 2  # cell centre
        row_mask = _grid_cell_mask(ys, y_edges, i_row)
        for i_x in range(n_cols):
            x_i = (x_edges[i_x] + x_edges[i_x + 1]) / 2  # cell centre

            i_list = np.where(col_masks[i_x] & row_mask)[0]  # points inside this cell
            if len(i_list) == 0:  # no point in this cell
                i_draw_list.append(None)
                continue

            # Squared distance to the cell centre, measured in cell units.
            r2_i = ((xs[i_list] - x_i) / x_pitch)**2 + ((ys[i_list] - y_i) / y_pitch)**2
            i_draw_list.append(i_list[np.argmin(r2_i)])  # point closest to the centre

    plt.figure(figsize=(6, 6))
    plt.scatter(res[:, 0], res[:, 1], s=10, c=data_true, cmap=cm.tab10)  # distribution, cropped to the rectangle below
    plt.axis([x_min, x_max, y_min, y_max]); plt.grid(); plt.show()

    draw_digits(i_draw_list, n_grid=n_grid, annosize=annosize, figsize=figsize)

draw_digits_at_tsne(res, -55, -20, -30, -5)

全体の分布状況を描画

x_max, y_max = np.amax(res, axis=0)
x_min, y_min = np.amin(res, axis=0)

draw_digits_at_tsne(res, x_min, x_max, y_min, y_max, n_grid=(50, 50), annosize=None)

分割して拡大、分布状況を描画

# sec: 分割

def gen_grid_rect(x_min, x_max, y_min, y_max, n_part = 3): # n_part: number of splits per axis
    """Split a rectangle into n_part x n_part equally sized sub-rectangles.

    Returns a list of ``[x1, x2, y1, y2]`` entries ordered band by band:
    y ascending, and x ascending within each y band.
    """
    x_edges = np.linspace(x_min, x_max, n_part + 1)
    y_edges = np.linspace(y_min, y_max, n_part + 1)
    # Pair every edge with its successor to get the (low, high) span of each cell.
    x_spans = list(zip(x_edges[:-1], x_edges[1:]))
    y_spans = list(zip(y_edges[:-1], y_edges[1:]))

    rects = []
    for y_lo, y_hi in y_spans:
        for x_lo, x_hi in x_spans:
            rects.append([x_lo, x_hi, y_lo, y_hi])
    return rects

gen_grid_rect(x_min, x_max, y_min, y_max)

[[-51.664737701416016,
  -15.292266845703125,
  -57.17302703857422,
  -19.207008361816406],
 [-15.292266845703125,
  21.080204010009766,
  -57.17302703857422,
  -19.207008361816406],
 [21.080204010009766,
  57.452674865722656,
  -57.17302703857422,
  -19.207008361816406],
 [-51.664737701416016,
  -15.292266845703125,
  -19.207008361816406,
  18.759010314941406],
 [-15.292266845703125,
  21.080204010009766,
  -19.207008361816406,
  18.759010314941406],
 [21.080204010009766,
  57.452674865722656,
  -19.207008361816406,
  18.759010314941406],
 [-51.664737701416016,
  -15.292266845703125,
  18.759010314941406,
  56.72502899169922],
 [-15.292266845703125,
  21.080204010009766,
  18.759010314941406,
  56.72502899169922],
 [21.080204010009766,
  57.452674865722656,
  18.759010314941406,
  56.72502899169922]]

for x1, x2, y1, y2 in gen_grid_rect(x_min, x_max, y_min, y_max):
    draw_digits_at_tsne(res, x1, x2, y1, y2, n_grid=(25, 25), annosize=None)

UMAPを試す

t-SNEの時と同様に、MNIST 28x28の学習用画像、6万枚を使用。t-SNEの試行の中で作った描画用の関数を使用。

UMAPは、t-SNEと比較して、処理に1分もかからず(ローカルPC CPUのみ)、圧倒的に高速で処理される模様。

!pip3 install umap-learn

Requirement already satisfied: umap-learn in ...\wpy64-3741\python-3.7.4.amd64\lib\site-packages (0.3.10)
Requirement already satisfied: scipy>=0.19 in ...\wpy64-3741\python-3.7.4.amd64\lib\site-packages (from umap-learn) (1.3.1)
Requirement already satisfied: numpy>=1.13 in ...\wpy64-3741\python-3.7.4.amd64\lib\site-packages (from umap-learn) (1.16.5+mkl)
Requirement already satisfied: numba>=0.37 in ...\wpy64-3741\python-3.7.4.amd64\lib\site-packages (from umap-learn) (0.45.1)
Requirement already satisfied: scikit-learn>=0.16 in ...\wpy64-3741\python-3.7.4.amd64\lib\site-packages (from umap-learn) (0.21.3)
Requirement already satisfied: llvmlite>=0.29.0 in ...\wpy64-3741\python-3.7.4.amd64\lib\site-packages (from numba>=0.37->umap-learn) (0.29.0)
Requirement already satisfied: joblib>=0.11 in ...\wpy64-3741\python-3.7.4.amd64\lib\site-packages (from scikit-learn>=0.16->umap-learn) (0.13.2)


WARNING: You are using pip version 19.2.3, however version 20.1 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.

UMAPを実行

import umap
from scipy.sparse.csgraph import connected_components
# 公式GitHubには書いてあるのですが、↑を書かないとエラーが出てしまいます。

# sec: 実行 →1分かからず、圧倒的に高速で処理される模様

%time res_umap = umap.UMAP().fit_transform(data_tsne)
print(res_umap.shape)

...\WPy64-3741\python-3.7.4.amd64\lib\site-packages\numba\compiler.py:602: NumbaPerformanceWarning: 
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "..\..\python-3.7.4.amd64\lib\site-packages\umap\nndescent.py", line 47:
    @numba.njit(parallel=True)
    def nn_descent(
    ^

  self.func_ir.loc))


Wall time: 56.5 s
(60000, 2)

# sec: 結果の描画

import matplotlib.cm as cm
plt.figure(figsize=(12, 12))
plt.scatter(res_umap[:,0], res_umap[:,1], s=3, c=data_true, cmap=cm.tab10)
plt.colorbar()
plt.grid()
plt.show()

# sec: 実行 (Google Colabでも実行してみると)

%time res_umap = umap.UMAP().fit_transform(data_tsne)
print(res_umap.shape)

CPU times: user 2min 18s, sys: 3.5 s, total: 2min 21s
Wall time: 1min 31s
(60000, 2)

# sec: 結果の描画

import matplotlib.cm as cm
plt.figure(figsize=(12, 12))
plt.scatter(res_umap[:,0], res_umap[:,1], s=3, c=data_true, cmap=cm.tab10)
plt.colorbar()
plt.grid()
plt.show()

結果のグループを調べる

# sec: save the result, because it changes on every run

import os
import pickle

res_umap_path = "./results/2005 t-SNE/res-umap-60k.pickle"
# open() does not create missing folders, so make sure the target directory exists.
os.makedirs(os.path.dirname(res_umap_path), exist_ok=True)
with open(res_umap_path, 'wb') as file:
    pickle.dump(res_umap, file)

# sec: load the result, to continue from a previous session

import pickle
# NOTE: only unpickle files written by this notebook itself; pickle.load can
# execute code stored in the file.
with open("./results/2005 t-SNE/res-umap-60k.pickle", 'rb') as file:
    res_umap = pickle.load(file)

全体の分布状況を描画

x_max, y_max = np.amax(res_umap, axis=0)
x_min, y_min = np.amin(res_umap, axis=0)

draw_digits_at_tsne(res_umap, x_min, x_max, y_min, y_max, n_grid=(50, 50), annosize=None)

分割して拡大、分布状況を描画

# sec: 分割

def gen_grid_rect(x_min, x_max, y_min, y_max, n_part = 3): # n_part: number of splits per axis
    """Return the n_part x n_part sub-rectangles that tile the given rectangle.

    Each entry is ``[x1, x2, y1, y2]``; entries are listed with y ascending
    and, within one y band, x ascending.
    """
    x_edges = np.linspace(x_min, x_max, n_part + 1)
    y_edges = np.linspace(y_min, y_max, n_part + 1)
    n_x = len(x_edges) - 1  # number of cells along x
    n_y = len(y_edges) - 1  # number of cells along y
    return [
        [x_edges[i_x], x_edges[i_x + 1], y_edges[i_y], y_edges[i_y + 1]]
        for i_y in range(n_y)
        for i_x in range(n_x)
    ]

gen_grid_rect(x_min, x_max, y_min, y_max)

[[-5.68484354019165,
  1.8034880956013994,
  -4.267199516296387,
  2.2263097763061523],
 [1.8034880956013994,
  9.29181973139445,
  -4.267199516296387,
  2.2263097763061523],
 [9.29181973139445, 16.7801513671875, -4.267199516296387, 2.2263097763061523],
 [-5.68484354019165,
  1.8034880956013994,
  2.2263097763061523,
  8.719819068908691],
 [1.8034880956013994, 9.29181973139445, 2.2263097763061523, 8.719819068908691],
 [9.29181973139445, 16.7801513671875, 2.2263097763061523, 8.719819068908691],
 [-5.68484354019165, 1.8034880956013994, 8.719819068908691, 15.21332836151123],
 [1.8034880956013994, 9.29181973139445, 8.719819068908691, 15.21332836151123],
 [9.29181973139445, 16.7801513671875, 8.719819068908691, 15.21332836151123]]

for x1, x2, y1, y2 in gen_grid_rect(x_min, x_max, y_min, y_max):
    draw_digits_at_tsne(res_umap, x1, x2, y1, y2, n_grid=(25, 25), annosize=None)

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up