Qiita Teams that are logged in
You are not logged in to any team

Community
Service
Qiita Jobs / Qiita Zine / Qiita Blog
2
Help us understand the problem. What is going on with this article?
@0NE_shoT_

# はじパタ4章の実行例をPythonで再現

More than 1 year has passed since last update.

# 実行例4.1 標準化

• 必要なライブラリの読み込み
```import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
```
• アヤメのデータの読み込み
```from sklearn.datasets import load_iris

data = pd.DataFrame(iris['data'], columns=iris.feature_names)
target = pd.Series(iris['target']).map({0: 'setosa', 1:'versicolor', 2:'virginica'})
```
• アヤメの花弁の長さと幅の平均値と共分散行列
```
# Mean vector and covariance matrix of petal length/width (Example 4.1).
print('mu = ', np.mean(data[['petal length (cm)', 'petal width (cm)']]))
print('Sigma = ', np.cov(data[['petal length (cm)', 'petal width (cm)']], rowvar=0))

# Output =>
# mu =
#  petal length (cm)    3.758000
# petal width (cm)     1.199333
# dtype: float64
# Sigma =
#  [[3.11627785 1.2956094 ]
#  [1.2956094  0.58100626]]
```
• 標準化前の散布図 - 図4.2(a)
```for t in target.unique():
data_tmp = data[target==t]
plt.scatter(data_tmp['petal length (cm)'], data_tmp['petal width (cm)'], label=t)
plt.legend()
plt.xlabel('petal length (cm)')
plt.ylabel('petal width (cm')
plt.xlim([1, 7])
plt.ylim([-1, 4])
plt.show()
```

• 標準化後の散布図 - 図4.2(b)
```def standardize(X):
return (X - np.mean(X)) / np.std(X)

std_data = standardize(data[['petal length (cm)', 'petal width (cm)']])
for t in target.unique():
data_tmp = std_data[target==t]
plt.scatter(data_tmp['petal length (cm)'], data_tmp['petal width (cm)'], label=t)
plt.legend()
plt.xlabel('petal length (cm)')
plt.ylabel('petal width (cm)')
plt.xlim([-2, 2])
plt.ylim([-2, 2])
plt.show()
```

# 実行例4.2 無相関化

• 無相関化後の散布図 - 図4.4(b)
```def no_correlation(X):
cov_mat = np.cov(X, rowvar=0)
eig_vals, eig_vecs = np.linalg.eig(cov_mat)
return np.dot(X, eig_vecs)

no_corr_data = no_correlation(data[['petal length (cm)', 'petal width (cm)']])
for t in target.unique():
data_tmp = no_corr_data[target==t]
plt.scatter(data_tmp[:, 0], data_tmp[:, 1], label=t)
plt.legend()
plt.xlabel('petal length (cm)')
plt.ylabel('petal width (cm)')
plt.xlim([0, 8])
plt.ylim([-3, 3])
plt.show()
```

```
# Eigendecomposition of the covariance matrix: Lambda holds the eigenvalues,
# S the corresponding eigenvectors (as columns).
Sigma = np.cov(data[['petal length (cm)', 'petal width (cm)']], rowvar=0)
Lambda, S = np.linalg.eig(Sigma)
print('Lambda = \n', Lambda)
print('S = \n', S)
# Output =>
# Lambda =
#  [3.66123805 0.03604607]
# S =
#  [[ 0.92177769 -0.38771882]
#  [ 0.38771882  0.92177769]]
```
• 無相関化後の共分散行列
```
# Covariance after decorrelation: off-diagonal terms are ~1e-16, i.e. zero
# up to floating-point noise.
print('Sigma = \n', np.cov(no_corr_data, rowvar=0))
# Output =>
# Sigma =
#  [[3.66123805e+00 6.01365790e-16]
#  [6.01365790e-16 3.60460707e-02]]
```

# 実行例4.3 白色化

• 白色化後の散布図 - 図4.6(b)
```def whitening(X):
cov_mat = np.cov(X, rowvar=0)
eig_vals, eig_vecs = np.linalg.eig(cov_mat)
diag_sqrt_eig_vals = np.diag(np.sqrt(eig_vals))

return np.dot(np.dot(X-np.mean(X, axis=0), eig_vecs), np.linalg.inv(diag_sqrt_eig_vals.T))

whitened_data = whitening(data[['petal length (cm)', 'petal width (cm)']])
for t in target.unique():
data_tmp = whitened_data[target==t]
plt.scatter(data_tmp[:, 0], data_tmp[:, 1], label=t)
plt.legend()
plt.xlabel('petal length (cm)')
plt.ylabel('petal width (cm)')
plt.xlim([-3, 3])
plt.ylim([-3, 4])
plt.show()
```

• 白色化後の共分散行列
```
# Covariance after whitening: identity matrix up to floating-point noise.
print('Sigma = \n', np.cov(whitened_data, rowvar=0))
# Output =>
# Sigma =
#  [[1.00000000e+00 1.64411249e-15]
#  [1.64411249e-15 1.00000000e+00]]
```

# 実行例4.4 2次識別関数/線形識別関数

クラス条件付き確率が正規分布に従っていると仮定して、クラスを識別する関数です。以下、コードです。

• ピマ・インディアンデータの読み込み

データはRdatasetsからお借りしています。

```
# Pima Indians diabetes training split (MASS::Pima.tr) from Rdatasets.
pima_train = pd.read_csv('https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/MASS/Pima.tr.csv')
```
• 学習データの散布図 - 図4.8
```for t in ['Yes', 'No']:
pima_tmp = pima_train[pima_train['type']==t]
plt.scatter(pima_tmp['glu'], pima_tmp['bmi'], label=t)
plt.legend()
plt.xlabel('glu')
plt.ylabel('bmi')
plt.xlim([50, 200])
plt.ylim([15, 50])
plt.show()
```

• 2次識別関数の実装
```class QuadraticDiscriminantFunction:
def __init__(self, threshold=0):
self._S = None
self._c = None
self._F = None
self._threshold = threshold

def fit(self, X, Y):
X_pos = X[Y==1]
X_neg = X[Y==0]

mu_pos = np.mean(X_pos, axis=0)
mu_neg = np.mean(X_neg, axis=0)
Sigma_pos = np.cov(X_pos, rowvar=0)
Sigma_neg = np.cov(X_neg, rowvar=0)
p_pos = len(Y[Y==1])/len(Y)
p_neg = len(Y[Y==0])/len(Y)

self._S = np.linalg.inv(Sigma_pos) - np.linalg.inv(Sigma_neg)
self._c = np.dot(mu_neg, np.linalg.inv(Sigma_neg)) - np.dot(mu_pos, np.linalg.inv(Sigma_pos))
self._F = np.dot(np.dot(mu_pos, np.linalg.inv(Sigma_pos)), mu_pos) - np.dot(np.dot(mu_neg, np.linalg.inv(Sigma_neg)), mu_neg)\
+ np.log(np.linalg.det(Sigma_pos)/np.linalg.det(Sigma_neg))\
- 2*np.log(p_pos/p_neg)

def predict(self, X):
xSx = np.diag(np.dot(np.dot(X, self._S), X.T))
cx = np.dot(self._c, X.T)
Y_hat = xSx + 2*cx + self._F
return np.where(Y_hat < self._threshold, 1, 0)
```
• 線形識別関数の実装
```class LinearDiscriminantFunction:
def __init__(self, threshold=0):
self._c = None
self._F = None
self._threshold = threshold

def fit(self, X, Y):
X_pos = X[Y==1]
X_neg = X[Y==0]

mu_pos = np.mean(X_pos, axis=0)
mu_neg = np.mean(X_neg, axis=0)
Sigma_pos = np.cov(X_pos, rowvar=0)
Sigma_neg = np.cov(X_neg, rowvar=0)
p_pos = len(Y[Y==1])/len(Y)
p_neg = len(Y[Y==0])/len(Y)
Sigma_pool = p_pos*Sigma_pos + p_neg*Sigma_neg

self._c = np.dot(mu_neg, np.linalg.inv(Sigma_pool)) - np.dot(mu_pos, np.linalg.inv(Sigma_pool))
self._F = np.dot(np.dot(mu_pos, np.linalg.inv(Sigma_pool)), mu_pos) - np.dot(np.dot(mu_neg, np.linalg.inv(Sigma_pool)), mu_neg)\
- 2*np.log(p_pos/p_neg)

def predict(self, X):
cx = np.dot(self._c, X.T)
Y_hat = 2*cx + self._F
return np.where(Y_hat < self._threshold, 1, 0)
```
• ベイズ識別境界を描画するための学習と予測
```X_train = pima_train[['glu', 'bmi']].values
Y_train = pima_train['type'].map({'Yes':1, 'No':0})

qdf.fit(X_train, Y_train)
ldf = LinearDiscriminantFunction(threshold=0)
ldf.fit(X_train, Y_train)

X1_min, X1_max = np.min(X_train[:, 0])-1, np.max(X_train[:, 0])+1
X2_min, X2_max = np.min(X_train[:, 1])-1, np.max(X_train[:, 1])+1
X1, X2 = np.meshgrid(np.arange(X1_min, X1_max, 1), np.arange(X2_min, X2_max, 0.5))
X = np.c_[X1.ravel(), X2.ravel()]

Y_qdf_pred = qdf.predict(X)
Y_ldf_pred = ldf.predict(X)
```
• 2次識別関数から得られるベイズ識別境界 - 図4.9(a)
```plt.contour(X1, X2, Y_qdf_pred.reshape(X1.shape), colors='limegreen')
for t in ['Yes', 'No']:
pima_tmp = pima_train[pima_train['type']==t]
plt.scatter(pima_tmp['glu'], pima_tmp['bmi'], label=t)
plt.legend()
plt.xlabel('glu')
plt.ylabel('bmi')
plt.xlim([50, 200])
plt.ylim([15, 50])
plt.show()
```

• 2次識別関数の誤り率
```X_test = pima_test[['glu', 'bmi']].values
Y_test = pima_test['type'].map({'Yes':1, 'No':0})

Y_train_qdf_pred = qdf.predict(X_train)
print('error rate (train) = {:.1f}%'.format(np.mean(Y_train_qdf_pred != Y_train)*100))
Y_test_qdf_pred = qdf.predict(X_test)
print('error rate (test) = {:.1f}%'.format(np.mean(Y_test_qdf_pred != Y_test)*100))

# 出力 =>
# error rate (train) = 24.5%
# error rate (test) = 23.8%
```
• 線形識別関数から得られるベイズ識別境界 - 図4.9(b)
```plt.contour(X1, X2, Y_ldf_pred.reshape(X1.shape), colors='limegreen')
for t in ['Yes', 'No']:
pima_tmp = pima_train[pima_train['type']==t]
plt.scatter(pima_tmp['glu'], pima_tmp['bmi'], label=t)
plt.legend()
plt.xlabel('glu')
plt.ylabel('bmi')
plt.xlim([50, 200])
plt.ylim([15, 50])
plt.show()
```

• 線形識別関数の誤り率
```
# Training/test error rates of the linear discriminant.
Y_train_ldf_pred = ldf.predict(X_train)
print('error rate (train) = {:.1f}%'.format(np.mean(Y_train_ldf_pred != Y_train)*100))
Y_test_ldf_pred = ldf.predict(X_test)
print('error rate (test) = {:.1f}%'.format(np.mean(Y_test_ldf_pred != Y_test)*100))
# Output =>
# error rate (train) = 24.0%
# error rate (test) = 22.0%
```
• 2次識別関数/線形識別関数のROC曲線 - 図4.10
```qdf_tpr_list = []
qdf_fpr_list = []
ldf_tpr_list = []
ldf_fpr_list = []

for th in np.arange(-9, 7, 0.1):
qdf.fit(X_train, Y_train)
Y_qdf_pred = qdf.predict(X_test)

tpr_qdf = len(Y_test[(Y_test==0)&(Y_qdf_pred==0)]) / len(Y_test[Y_test==0])
qdf_tpr_list.append(tpr_qdf)
fpr_qdf = len(Y_test[(Y_test==1)&(Y_qdf_pred==0)]) / len(Y_test[Y_test==1])
qdf_fpr_list.append(fpr_qdf)

ldf = LinearDiscriminantFunction(threshold=th)
ldf.fit(X_train, Y_train)
Y_ldf_pred = ldf.predict(X_test)

tpr_ldf = len(Y_test[(Y_test==0)&(Y_ldf_pred==0)]) / len(Y_test[Y_test==0])
ldf_tpr_list.append(tpr_ldf)
fpr_ldf = len(Y_test[(Y_test==1)&(Y_ldf_pred==0)]) / len(Y_test[Y_test==1])
ldf_fpr_list.append(fpr_ldf)

plt.plot(ldf_fpr_list, ldf_tpr_list, label='Linear')
plt.legend()
plt.xlabel('false positive ratio')
plt.ylabel('true positive ratio')
plt.show()
```

# おわりに

はじパタ4章の実行例について、Pythonで再現してみました。コードの正しさについては、描画した図が書籍の図とだいたい合っているからヨシ！程度の確認のみしておりますので、誤りなどありましたら編集リクエストをして頂けると幸いです。

2
Help us understand the problem. What is going on with this article?
Why not register and get more from Qiita?
1. We will deliver articles that match you
By following users and tags, you can catch up information on technical fields that you are interested in as a whole
2. You can read useful information later efficiently
By "stocking" the articles you like, you can search right away
アナリティクス・コンサルタントとして働いています。最近はデジタルマーケティングの意思決定の判断材料となるデータ分析をしています。