下記の試行をしました:
- t-SNE、UMAPを試行。
- MNIST 28x28手書き数字、学習用画像、60000枚を使用。
- それぞれ分けられた分布の中にある数字の形の傾向を調べる。
結果:
- 6万枚の画像を処理すると、t-SNEは1時間40分もかかる結果となる。
- t-SNEは、おおむね良い分解性能。グループ同士の距離が近めで、分解があいまいな部分もある。
- UMAPは、t-SNEと比較して、処理に1分もかからず(ローカルPC CPUのみ)、圧倒的に高速で処理される模様。
- UMAPの方が、全体的にはグループ同士の距離が遠くなるが、近い部分はほとんどグループが接してしまう。グループの塊だけを見ていると、別グループへ分解するのは困難。
- 双方の手法、同じグループの中でも、さらに、同じような見た目の数字は同じ場所付近に分布し、傾向がさらに分解され、良好。
前回の試行は:
参照したもの:
ライブラリをLoad
import numpy as np
import matplotlib.pyplot as plt
import torch
import torchvision
import sklearn
!pip show torch
Name: torch
Version: 1.5.0+cpu
Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration
Home-page: https://pytorch.org/
Author: PyTorch Team
Author-email: packages@pytorch.org
License: BSD-3
Location: ...\wpy64-3741\python-3.7.4.amd64\lib\site-packages
Requires: numpy, future
Required-by: torchvision
データをLoad
# sec: load
# MNIST training split (train=True); download=True allows fetching it into `root`.
ds = torchvision.datasets.MNIST(root="trains/pytorch-mnist", train=True, download=True)
print(ds)  # print the dataset summary
Dataset MNIST
Number of datapoints: 60000
Root location: trains/pytorch-mnist
Split: Train
# sec: MNISTの画像をグリッド状に描画
def draw_digits(i_list, n_grid=(10, 10), annosize=10, figsize=(12, 12)):
    """Draw images of the module-level dataset ``ds`` on a grid of subplots.

    Args:
        i_list: Iterable of indices into ``ds``. A ``None`` entry leaves its
            grid cell blank.
        n_grid: ``(rows, columns)`` of one figure. When a figure is full, a
            new figure is started for the remaining entries.
        annosize: Font size of the annotations (image index at the top left,
            ``L:<label>`` at the top right). ``None`` hides both.
        figsize: Size passed to ``plt.figure`` for every figure created.

    Returns:
        None. All created figures are displayed with a single ``plt.show()``.
    """
    n_rows, n_cols = n_grid[0], n_grid[1]
    cells_per_fig = n_rows * n_cols

    fig = None
    i_ax = 0
    for i_img in i_list:
        # Start the first figure, or a fresh one once the current grid is full.
        if fig is None or i_ax >= cells_per_fig:
            fig = plt.figure(figsize=figsize)
            plt.subplots_adjust(hspace=0.02, wspace=0)
            i_ax = 0
        i_ax += 1
        ax = fig.add_subplot(n_rows, n_cols, i_ax)

        if i_img is None:  # placeholder: keep the cell empty
            ax.axis('off')
            continue

        # Fetch the sample once; indexing the dataset builds the image anew
        # on every access, so image and label are taken from the same lookup.
        sample = ds[i_img]
        ax.imshow(sample[0], cmap='gray', interpolation='none')
        if annosize is not None:  # annotate with image index and true label
            ax.annotate("%d" % i_img,
                        xy=(0, 0.98), xycoords='axes fraction',
                        ha='left', va='top', color='y', fontsize=annosize)
            ax.annotate("L:%d" % sample[1],
                        xy=(1, 0.98), xycoords='axes fraction',
                        ha='right', va='top', color='c', fontsize=annosize)
        ax.axis('off')
    plt.show()
# Preview the images with indices 0-23, annotated with index and label.
draw_digits(list(range(24)))
t-SNEを実行
from sklearn.manifold import TSNE
# sec: data conversion
n_data = len(ds)  # values tried: 2000, 10000, len(ds); using all 60k images makes the processing time enormous
# One row per image: each image is converted to an array and flattened to 1-D.
data_tsne = np.array([np.asarray(ds[i][0]).flatten() for i in range(n_data)])
# The label of each image, as a 1-D array.
data_true = np.array([np.asarray(ds[i][1]) for i in range(n_data)]).flatten()
print(data_tsne.shape)
print(data_true.shape)
(60000, 784)
(60000,)
# sec: run (this is slow, so the output shown here comes from a run on Google Colab)
model = TSNE(n_components=2)  # reduce to 2 dimensions
%time res = model.fit_transform(data_tsne)  # pass a 2-D array: number of images x flattened pixel vector
print(res.shape)
CPU times: user 1h 39min 20s, sys: 1.1 s, total: 1h 39min 21s
Wall time: 1h 39min 23s
(60000, 2)
# sec: plot the result
import matplotlib.cm as cm
plt.figure(figsize=(12, 12))
# One dot per image at its embedded position, coloured by its label.
plt.scatter(res[:,0], res[:,1], s=3, c=data_true, cmap=cm.tab10)
plt.colorbar()
plt.grid()
plt.show()
結果のグループを調べる
# sec: save the result, because it changes on every run
import pickle
with open("./results/2005 t-SNE/res-tsne-60k.pickle", 'wb') as file:
    pickle.dump(res, file)
# sec: load the saved result to continue from the previous session
# NOTE: only unpickle files you created yourself; pickle.load can execute arbitrary code.
import pickle
with open("./results/2005 t-SNE/res-tsne-60k.pickle", 'rb') as file:
    res = pickle.load(file)
分布の位置に画像を当てはめて格子状に表示
def draw_digits_at_tsne(res, x_min, x_max, y_min, y_max,
                        n_grid=(15, 15), annosize=8, figsize=(12, 12)):
    """Show which digit sits where inside a rectangle of a 2-D embedding.

    The rectangle ``[x_min, x_max] x [y_min, y_max]`` is divided into
    ``n_grid = (rows, columns)`` cells. For each cell the point lying
    strictly inside it and closest to the cell centre is chosen (distance
    measured in units of the cell pitch); cells without points yield
    ``None``. Points exactly on a cell edge belong to no cell.

    First a scatter plot of ``res`` limited to the rectangle is shown,
    coloured by the module-level ``data_true`` with ``cm.tab10``; then the
    chosen images are drawn via ``draw_digits``.

    Args:
        res: 2-D array; column 0 is used as x and column 1 as y.
        x_min, x_max, y_min, y_max: Bounds of the rectangle to inspect.
        n_grid: ``(rows, columns)`` of the cell grid. Rows run from ``y_max``
            downwards, columns from ``x_min`` upwards.
        annosize: Forwarded to ``draw_digits``.
        figsize: Forwarded to ``draw_digits``.
    """
    n_rows, n_cols = n_grid[0], n_grid[1]
    x_pitch = (x_max - x_min) / n_cols
    y_pitch = (y_max - y_min) / n_rows
    xs = res[:, 0]
    ys = res[:, 1]

    i_draw_list = []
    for i_row in range(n_rows):
        y_c = y_max - y_pitch * i_row - y_pitch/2  # cell centre (y)
        in_row = (y_c-y_pitch/2 < ys) & (ys < y_c+y_pitch/2)
        for i_col in range(n_cols):
            x_c = x_min + x_pitch * i_col + x_pitch/2  # cell centre (x)
            in_col = (x_c-x_pitch/2 < xs) & (xs < x_c+x_pitch/2)
            candidates = np.where(in_col & in_row)[0]  # points inside the cell
            if len(candidates) == 0:  # empty cell
                i_draw_list.append(None)
                continue
            # Squared distance to the cell centre, normalised by the pitch.
            dx = (xs[candidates] - x_c) / x_pitch
            dy = (ys[candidates] - y_c) / y_pitch
            nearest = candidates[np.argmin(dx**2 + dy**2)]
            i_draw_list.append(nearest)

    # Distribution of the points within the requested rectangle.
    plt.figure(figsize=(6, 6))
    plt.scatter(xs, ys, s=10, c=data_true, cmap=cm.tab10)
    plt.axis([x_min, x_max, y_min, y_max])
    plt.grid()
    plt.show()

    draw_digits(i_draw_list, n_grid=n_grid, annosize=annosize, figsize=figsize)
# Inspect the region x in [-55, -20], y in [-30, -5] of the t-SNE result (default 15x15 grid).
draw_digits_at_tsne(res, -55, -20, -30, -5)
全体の分布状況を描画
# Bounding box of the whole embedding: per-column max/min of res (column 0 = x, column 1 = y).
x_max, y_max = np.amax(res, axis=0)
x_min, y_min = np.amin(res, axis=0)
# Overview of the full extent on a 50x50 grid, without annotations.
draw_digits_at_tsne(res, x_min, x_max, y_min, y_max, n_grid=(50, 50), annosize=None)
分割して拡大、分布状況を描画
# sec: 分割
def gen_grid_rect(x_min, x_max, y_min, y_max, n_part=3):
    """Split a rectangle into ``n_part`` x ``n_part`` equally sized rectangles.

    Args:
        x_min, x_max: Horizontal extent of the rectangle.
        y_min, y_max: Vertical extent of the rectangle.
        n_part: Number of divisions along each axis.

    Returns:
        A list of ``[x1, x2, y1, y2]`` lists. The y band starting at
        ``y_min`` comes first; within a band the rectangles run from
        ``x_min`` towards ``x_max``.
    """
    x_edges = np.linspace(x_min, x_max, n_part + 1)
    y_edges = np.linspace(y_min, y_max, n_part + 1)

    rects = []
    for y_lo, y_hi in zip(y_edges[:-1], y_edges[1:]):
        for x_lo, x_hi in zip(x_edges[:-1], x_edges[1:]):
            rects.append([x_lo, x_hi, y_lo, y_hi])
    return rects
# Display the sub-rectangles for the default 3x3 split (shown as the notebook cell output).
gen_grid_rect(x_min, x_max, y_min, y_max)
[[-51.664737701416016,
-15.292266845703125,
-57.17302703857422,
-19.207008361816406],
[-15.292266845703125,
21.080204010009766,
-57.17302703857422,
-19.207008361816406],
[21.080204010009766,
57.452674865722656,
-57.17302703857422,
-19.207008361816406],
[-51.664737701416016,
-15.292266845703125,
-19.207008361816406,
18.759010314941406],
[-15.292266845703125,
21.080204010009766,
-19.207008361816406,
18.759010314941406],
[21.080204010009766,
57.452674865722656,
-19.207008361816406,
18.759010314941406],
[-51.664737701416016,
-15.292266845703125,
18.759010314941406,
56.72502899169922],
[-15.292266845703125,
21.080204010009766,
18.759010314941406,
56.72502899169922],
[21.080204010009766,
57.452674865722656,
18.759010314941406,
56.72502899169922]]
# Draw every sub-rectangle enlarged, on a 25x25 grid without annotations.
for x1, x2, y1, y2 in gen_grid_rect(x_min, x_max, y_min, y_max):
    draw_digits_at_tsne(res, x1, x2, y1, y2, n_grid=(25, 25), annosize=None)
UMAPを試す
t-SNEの時と同様に、MNIST 28x28の学習用画像、6万枚を使用。t-SNEの試行の中で作った描画用の関数を使用。
UMAPは、t-SNEと比較して、処理に1分もかからず(ローカルPC CPUのみ)、圧倒的に高速で処理される模様。
!pip3 install umap-learn
Requirement already satisfied: umap-learn in ...\wpy64-3741\python-3.7.4.amd64\lib\site-packages (0.3.10)
Requirement already satisfied: scipy>=0.19 in ...\wpy64-3741\python-3.7.4.amd64\lib\site-packages (from umap-learn) (1.3.1)
Requirement already satisfied: numpy>=1.13 in ...\wpy64-3741\python-3.7.4.amd64\lib\site-packages (from umap-learn) (1.16.5+mkl)
Requirement already satisfied: numba>=0.37 in ...\wpy64-3741\python-3.7.4.amd64\lib\site-packages (from umap-learn) (0.45.1)
Requirement already satisfied: scikit-learn>=0.16 in ...\wpy64-3741\python-3.7.4.amd64\lib\site-packages (from umap-learn) (0.21.3)
Requirement already satisfied: llvmlite>=0.29.0 in ...\wpy64-3741\python-3.7.4.amd64\lib\site-packages (from numba>=0.37->umap-learn) (0.29.0)
Requirement already satisfied: joblib>=0.11 in ...\wpy64-3741\python-3.7.4.amd64\lib\site-packages (from scikit-learn>=0.16->umap-learn) (0.13.2)
WARNING: You are using pip version 19.2.3, however version 20.1 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.
UMAPを実行
import umap
from scipy.sparse.csgraph import connected_components
# As noted on the official GitHub page, an error occurs unless the import above is present.
# sec: run -> takes less than a minute; appears to be overwhelmingly faster
%time res_umap = umap.UMAP().fit_transform(data_tsne)
print(res_umap.shape)
...\WPy64-3741\python-3.7.4.amd64\lib\site-packages\numba\compiler.py:602: NumbaPerformanceWarning:
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.
To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.
File "..\..\python-3.7.4.amd64\lib\site-packages\umap\nndescent.py", line 47:
@numba.njit(parallel=True)
def nn_descent(
^
self.func_ir.loc))
Wall time: 56.5 s
(60000, 2)
# sec: plot the result
import matplotlib.cm as cm
plt.figure(figsize=(12, 12))
# One dot per image at its embedded position, coloured by its label.
plt.scatter(res_umap[:,0], res_umap[:,1], s=3, c=data_true, cmap=cm.tab10)
plt.colorbar()
plt.grid()
plt.show()
# sec: run (the same step, also tried on Google Colab)
%time res_umap = umap.UMAP().fit_transform(data_tsne)
print(res_umap.shape)
CPU times: user 2min 18s, sys: 3.5 s, total: 2min 21s
Wall time: 1min 31s
(60000, 2)
# sec: plot the result
import matplotlib.cm as cm
plt.figure(figsize=(12, 12))
# One dot per image at its embedded position, coloured by its label.
plt.scatter(res_umap[:,0], res_umap[:,1], s=3, c=data_true, cmap=cm.tab10)
plt.colorbar()
plt.grid()
plt.show()
結果のグループを調べる
# sec: save the result, because it changes on every run
import pickle
with open("./results/2005 t-SNE/res-umap-60k.pickle", 'wb') as file:
    pickle.dump(res_umap, file)
# sec: load the saved result to continue from the previous session
# NOTE: only unpickle files you created yourself; pickle.load can execute arbitrary code.
import pickle
with open("./results/2005 t-SNE/res-umap-60k.pickle", 'rb') as file:
    res_umap = pickle.load(file)
全体の分布状況を描画
# Bounding box of the whole UMAP embedding: per-column max/min of res_umap (column 0 = x, column 1 = y).
x_max, y_max = np.amax(res_umap, axis=0)
x_min, y_min = np.amin(res_umap, axis=0)
# Overview of the full extent on a 50x50 grid, without annotations.
draw_digits_at_tsne(res_umap, x_min, x_max, y_min, y_max, n_grid=(50, 50), annosize=None)
分割して拡大、分布状況を描画
# sec: 分割
def gen_grid_rect(x_min, x_max, y_min, y_max, n_part=3):
    """Split a rectangle into ``n_part`` x ``n_part`` equally sized rectangles.

    Args:
        x_min, x_max: Horizontal extent of the rectangle.
        y_min, y_max: Vertical extent of the rectangle.
        n_part: Number of divisions along each axis.

    Returns:
        A list of ``[x1, x2, y1, y2]`` lists. The y band starting at
        ``y_min`` comes first; within a band the rectangles run from
        ``x_min`` towards ``x_max``.
    """
    x_edges = np.linspace(x_min, x_max, n_part + 1)
    y_edges = np.linspace(y_min, y_max, n_part + 1)

    rects = []
    for y_lo, y_hi in zip(y_edges[:-1], y_edges[1:]):
        for x_lo, x_hi in zip(x_edges[:-1], x_edges[1:]):
            rects.append([x_lo, x_hi, y_lo, y_hi])
    return rects
# Display the sub-rectangles for the default 3x3 split (shown as the notebook cell output).
gen_grid_rect(x_min, x_max, y_min, y_max)
[[-5.68484354019165,
1.8034880956013994,
-4.267199516296387,
2.2263097763061523],
[1.8034880956013994,
9.29181973139445,
-4.267199516296387,
2.2263097763061523],
[9.29181973139445, 16.7801513671875, -4.267199516296387, 2.2263097763061523],
[-5.68484354019165,
1.8034880956013994,
2.2263097763061523,
8.719819068908691],
[1.8034880956013994, 9.29181973139445, 2.2263097763061523, 8.719819068908691],
[9.29181973139445, 16.7801513671875, 2.2263097763061523, 8.719819068908691],
[-5.68484354019165, 1.8034880956013994, 8.719819068908691, 15.21332836151123],
[1.8034880956013994, 9.29181973139445, 8.719819068908691, 15.21332836151123],
[9.29181973139445, 16.7801513671875, 8.719819068908691, 15.21332836151123]]
# Draw every sub-rectangle of the UMAP result enlarged, on a 25x25 grid without annotations.
for x1, x2, y1, y2 in gen_grid_rect(x_min, x_max, y_min, y_max):
    draw_digits_at_tsne(res_umap, x1, x2, y1, y2, n_grid=(25, 25), annosize=None)