はじめに
次元削減(じげんさくげん、英: Dimensionality reduction、dimension reduction)とは、高次元空間から低次元空間へデータを変換しながら、低次元表現が元データの何らかの意味ある特性を保持することである。
出典:Wikipedia
PCA
主成分分析(principal component analysis; PCA)
pip install scikit-learn
from sklearn.decomposition import PCA
# Project `array` onto its leading principal components.
reducer = PCA(n_components=5)  # number of principal components to keep
# fit() only trains the model and returns the PCA estimator itself, which
# cannot be sliced like res[:, 0]; fit_transform() returns the projected data.
res = reducer.fit_transform(array)
t-SNE
pip install scikit-learn
from sklearn.manifold import TSNE
# Embed `array` into two dimensions with t-SNE.
reducer = TSNE(n_components=2)
# t-SNE is fitted and applied in a single call; `res` holds the embedding.
res = reducer.fit_transform(X=array)
UMAP
pip install umap-learn
import umap
# If the plain `import umap` does not work, retry with this import instead:
# import umap.umap_ as umap
reducer = umap.UMAP()  # all parameters left at their defaults
# Fit UMAP on `array` and keep the resulting embedding in `res`.
res = reducer.fit_transform(X=array)
(おまけ) k-meansクラスタリング
from sklearn.cluster import KMeans
# Cluster the rows of `array` with k-means.
clusterer = KMeans(n_clusters=8)  # number of clusters
pred = clusterer.fit(array)  # fit() returns the fitted estimator
labels = pred.labels_  # cluster labels produced by the fit
可視化
import matplotlib.pyplot as plt
# Scatter the first two columns of `res` on a small square figure.
fig = plt.figure(figsize=(3, 3))
ax1 = fig.add_subplot(111)
# Fully opaque, very small markers.
plt.scatter(x=res[:, 0], y=res[:, 1], alpha=1, s=0.1)
plt.show()
pandas.DataFrame を用いる場合
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap.umap_ as umap
from sklearn.cluster import KMeans
# Convert the DataFrame to a plain ndarray before any result columns are added
# (assumes every column of `df` is numeric -- TODO confirm for your data).
array = np.array(df)

# fit_transform() returns the projected coordinates; fit() alone would return
# the PCA estimator, which cannot be sliced with res[:, 0] below.
reducer = PCA(n_components=5)
res = reducer.fit_transform(array)

# Cluster the original (unreduced) features with k-means.
pred = KMeans(n_clusters=8).fit(array)
labels = pred.labels_

# Store the first two principal components and the cluster labels in `df`.
df['PCA_one'] = res[:, 0]
df['PCA_two'] = res[:, 1]
df['kmeans'] = labels

# Scatter plot of the first two components, colored by cluster label.
fig = plt.figure(figsize=(3, 3))
ax1 = fig.add_subplot(111)
plt.scatter(
    x=df['PCA_one'],
    y=df['PCA_two'],
    alpha=1,
    s=0.1,
    c=df['kmeans'],
    cmap='tab10',
)
plt.show()