前回作った改良型のbiplotを、目的変数の値で点を色分けして分かりやすく表示する関数を作りました。
関数
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
def biplot_spv(df, y_name):
    """Draw a PCA biplot whose sample points are coloured by a target column.

    Every column of ``df`` except ``y_name`` is standardised with
    ``StandardScaler`` and decomposed with ``PCA``. The scores on the first
    two principal components are drawn as a scatter plot coloured by
    ``df[y_name]`` (``"brg"`` colormap). On a second, independently scaled
    pair of axes, one red arrow per feature is drawn from the origin to
    ``sqrt(explained_variance_ratio_) * components_`` for that feature, and
    labelled with the column name. The figure is shown with ``plt.show()``.

    Args:
        df: pandas DataFrame containing the feature columns and the target.
        y_name: Name of the target column; it is removed from the features
            and passed to ``scatter`` as the colour values.

    Returns:
        None.

    Raises:
        ValueError: If PCA yields fewer than two components (for example
            only one feature column or only one row), since a 2-D biplot
            cannot be drawn.
    """
    target = df[y_name]
    features = df.drop(y_name, axis=1)

    scaled = StandardScaler().fit_transform(features)

    pca = PCA()
    pca.fit(scaled)
    scores = pca.transform(scaled)

    # Validate before creating a figure so a failure does not leave an
    # empty window behind or surface as an opaque IndexError.
    if pca.components_.shape[0] < 2:
        raise ValueError(
            "biplot_spv needs at least two principal components; "
            "got %d" % pca.components_.shape[0]
        )

    # Row i holds the loadings of every feature on principal component i.
    loadings = (
        np.sqrt(pca.explained_variance_ratio_)[:, np.newaxis]
        * pca.components_
    )
    load_x, load_y = loadings[0], loadings[1]

    _, ax = plt.subplots()
    # twinx/twiny give the scatter and the arrows separate scales: the
    # scores and the loadings live in very different numeric ranges.
    scatter_ax = ax.twinx()
    arrow_ax = ax.twiny()
    scatter_ax.scatter(scores[:, 0], scores[:, 1], c=target, cmap="brg")

    # Symmetric limits keep the origin of the arrows centred.
    x_extent = np.abs(load_x).max()
    y_extent = np.abs(load_y).max()
    arrow_ax.set_xlim(-x_extent, x_extent)
    arrow_ax.set_ylim(-y_extent, y_extent)

    arrow_style = dict(shrink=0, width=1, headwidth=6,
                       headlength=10, connectionstyle='arc3',
                       facecolor='red', edgecolor='red')
    for name, lx, ly in zip(features.columns, load_x, load_y):
        # Pass a fresh copy so no call can affect the next one's style.
        arrow_ax.annotate("", xytext=[0, 0], xy=[lx, ly],
                          arrowprops=dict(arrow_style))
        arrow_ax.text(lx, ly, name)
    plt.show()
使用例
import pandas as pd
# Draw one biplot per sample dataset: (CSV file, target column name).
for csv_path, target_column in (("wine.csv", "Wine"), ("boston.csv", "PRICE")):
    df = pd.read_csv(csv_path)
    biplot_spv(df, target_column)