背景
分類データを扱う際、PCAやNMDSで次元削減した後、matplotlib/seabornで散布図を描くことがあります。
その際に「分類ごとに楕円を描きたい」と思ったのですが、方法がなかなか見つからなかったので、メモとして残します。
ドキュメント
コード
# ライブラリのインポート
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
import matplotlib.transforms as transforms
# 楕円を描く関数
# https://github.com/matplotlib/matplotlib/blob/main/examples/statistics/confidence_ellipse.py
def confidence_ellipse(x, y, ax, n_std=3.0, facecolor='none', **kwargs):
    """
    Create a plot of the covariance confidence ellipse of *x* and *y*.

    Parameters
    ----------
    x, y : array-like, shape (n, )
        Input data. Converted with `numpy.asarray`, so lists, tuples and
        pandas Series are accepted as well as ndarrays.
    ax : matplotlib.axes.Axes
        The axes object to draw the ellipse into.
    n_std : float
        The number of standard deviations to determine the ellipse's radiuses.
    facecolor : color, default: 'none'
        Fill color passed to the ellipse patch.
    **kwargs
        Forwarded to `~matplotlib.patches.Ellipse`

    Returns
    -------
    matplotlib.patches.Ellipse
        The patch returned by ``ax.add_patch``.

    Raises
    ------
    ValueError
        If *x* and *y* do not contain the same number of elements.
    """
    # The docstring promises array-like input; coerce first so that plain
    # sequences have a ``.size`` and the size check below works for them.
    x = np.asarray(x)
    y = np.asarray(y)
    if x.size != y.size:
        raise ValueError("x and y must be the same size")

    cov = np.cov(x, y)
    # Pearson correlation coefficient from the 2x2 covariance matrix.
    pearson = cov[0, 1] / np.sqrt(cov[0, 0] * cov[1, 1])

    # Using a special case to obtain the eigenvalues of this
    # two-dimensional dataset: the ellipse is first built at the origin
    # from the correlation alone, and is rotated/scaled/shifted afterwards.
    ell_radius_x = np.sqrt(1 + pearson)
    ell_radius_y = np.sqrt(1 - pearson)
    ellipse = Ellipse((0, 0), width=ell_radius_x * 2, height=ell_radius_y * 2,
                      facecolor=facecolor, **kwargs)

    # Standard deviation of x (square root of the variance) multiplied by
    # the requested number of standard deviations.
    scale_x = np.sqrt(cov[0, 0]) * n_std
    mean_x = np.mean(x)

    # Same for y.
    scale_y = np.sqrt(cov[1, 1]) * n_std
    mean_y = np.mean(y)

    # Rotate by 45 degrees, stretch by the per-axis scale, then move the
    # centre to the data mean.
    transf = (
        transforms.Affine2D()
        .rotate_deg(45)
        .scale(scale_x, scale_y)
        .translate(mean_x, mean_y)
    )

    # Compose with the axes' data transform so the patch is drawn in data
    # coordinates.
    ellipse.set_transform(transf + ax.transData)
    return ax.add_patch(ellipse)
使用してみた
iris
from sklearn.datasets import load_iris
import seaborn as sns
import pandas as pd
from sklearn.decomposition import PCA
# Load the iris dataset into a DataFrame (one column per feature)
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)

# Standardize each column: subtract its mean and divide by its standard deviation
df = df.apply(lambda col: (col - col.mean()) / col.std(), axis=0)

# PCA: fit on the standardized data, then project the same data
_n_components = 2
pca = PCA(n_components=_n_components)
feature = pca.fit(df).transform(df)
feature_df = pd.DataFrame(
    feature,
    columns=[f"PC{i}" for i in range(1, _n_components + 1)],
)

# Attach the target variable, then replace the integer codes 0/1/2 with
# the corresponding entries of iris.target_names
feature_df["label"] = iris.target
feature_df["label"] = feature_df["label"].map(dict(enumerate(iris.target_names)))
# Scatter plot with seaborn, one fixed color per class label
colors = {
    "setosa": "blue",
    "virginica": "orange",
    "versicolor": "green",
}
fig, ax = plt.subplots()
sns.scatterplot(data=feature_df, x="PC1", y="PC2", hue="label", palette=colors, ax=ax)

# Add one ellipse per class label, outlined in that label's color
for class_name, edge_color in colors.items():
    group = feature_df.loc[feature_df["label"] == class_name]
    confidence_ellipse(group["PC1"].values, group["PC2"].values, ax, edgecolor=edge_color)

plt.show()
参考にさせていただきました