This article is a Private article. Only a writer and users who know the URL can access it.
Please change open range to public in publish setting if you want to share this article with other users.

More than 5 years have passed since last update.

【読書会】Python機械学習プログラミング_3章

機械学習

Last updated at 2016-12-12Posted at 2016-12-08

# 分類問題 - 機械学習ライブラリscikit-learnの活用

よく使用されている機械学習ライブラリを一通り見る
ここでは、分類を目的とした教師あり学習のアルゴリズム間の相違点を学ぶ

## 分類アルゴリズムの選択
アルゴリズムにはそれぞれ特徴があるため、それらを考慮して最適なものを選んでいく必要がある
アルゴリズムの選定は
・特徴量やサンプルの個数
・データセットに含まれるノイズの量
・クラスの線形分離可能性
などの利用可能な学習データに大きく依存することになる

## scikit-learnでパーセプトロン
Iris(あやめ)のデータ分析を行う
ここでは可視化のため、データセットの特徴量を二つだけ使用する
※150個のサンプルの「花びらの長さ」と「花びらの幅」を使用

from sklearn import datasets
import numpy as np
from sklearn.preprocessing import StandardScaler

# train_test_split is provided by sklearn.model_selection in current
# scikit-learn; the older sklearn.cross_validation module no longer exists
# there. Fall back to the old location only when the new one is missing.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Load the Iris dataset bundled with scikit-learn.
iris = datasets.load_iris()
# Keep only columns 2 and 3 (the two features plotted later as petal length
# and petal width) so the decision regions can be drawn in two dimensions.
X = iris.data[:, [2, 3]]
y = iris.target

# print('Class labels:', np.unique(y))
# Hold out 30% of the samples as a test set; random_state=0 keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Standardize the features. The scaler is fitted on the training data only,
# so the test set is transformed with the training statistics.
sc = StandardScaler()
sc.fit(X_train)

# Apply the fitted mean and standard deviation to both splits.
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

次にパーセプトロンを行ってみる

from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score

# Create a perceptron trained for 40 epochs with a learning rate of 0.1.
# The epoch count must be an integer; it was previously written as 4.0,
# which contradicted the intended 40 epochs.
# NOTE(review): newer scikit-learn releases replace `n_iter` with
# `max_iter` -- confirm against the installed version.
ppn = Perceptron(n_iter=40, eta0=0.1, random_state=0, shuffle=True)
# Fit the model to the standardized training data.
ppn.fit(X_train_std, y_train)

# Predict class labels for the standardized test data.
y_pred = ppn.predict(X_test_std)

# Report how many test samples were misclassified.
print('Misclasfied samples: %d' % (y_test != y_pred).sum())

# Report the classification accuracy on the test set.
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))

Misclasfied samples: 10
Accuracy: 0.78

これだけでパーセプトロンが実行できる!

最後に画像に出力する。

from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import warnings


def versiontuple(v):
    """Convert a dotted version string into a tuple of ints.

    Only the leading numeric release components are used, so pre-release
    and development suffixes do not break the conversion:
    ``"1.9.0"`` -> ``(1, 9, 0)``, ``"1.20.0rc1"`` -> ``(1, 20, 0)``,
    ``"1.22.0.dev0+abc"`` -> ``(1, 22, 0)``.

    Parameters
    ----------
    v : str
        Version string with components separated by ``"."``.

    Returns
    -------
    tuple of int
        The numeric components, suitable for tuple comparison.

    Raises
    ------
    ValueError
        If the string does not start with a numeric component.
    """
    import re

    parts = []
    for piece in v.split("."):
        match = re.match(r"\s*(\d+)\s*(.*)", piece, re.DOTALL)
        if match is None:
            # A component with no leading digits (e.g. "dev0") ends the
            # numeric release part.
            break
        parts.append(int(match.group(1)))
        if match.group(2):
            # Trailing text such as "rc1" marks the end of the release part.
            break
    if not parts:
        raise ValueError("no numeric version component in %r" % (v,))
    return tuple(parts)


def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
    """Draw a classifier's decision regions over two features.

    The function draws on the current pyplot figure and returns nothing.

    Parameters
    ----------
    X : array
        Samples, read as ``X[:, 0]`` (horizontal axis) and ``X[:, 1]``
        (vertical axis).
    y : array
        Class label of each row of ``X``. At most five distinct labels are
        supported, because only five markers and colors are defined.
    classifier : object
        Fitted model whose ``predict`` method accepts an ``(n, 2)`` array.
    test_idx : sequence of int, optional
        Row indices of ``X`` to highlight as test samples.
    resolution : float
        Step of the grid on which the classifier is evaluated.
    """

    # setup marker generator and color map
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    labels = np.unique(y)
    cmap = ListedColormap(colors[:len(labels)])

    # plot the decision surface: predict on a grid that extends one unit
    # beyond the data range in each direction
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    # plot the samples class by class
    for idx, cl in enumerate(labels):
        # Pass the color by name: an RGBA tuple given as `c` can be mistaken
        # for per-point values when a class has three or four samples.
        plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1],
                    alpha=0.8, c=colors[idx],
                    marker=markers[idx], label=cl)

    # highlight test samples
    # Test against None and the length explicitly: the truth value of a
    # NumPy index array with more than one element is ambiguous.
    if test_idx is not None and len(test_idx) > 0:
        if not versiontuple(np.__version__) >= versiontuple('1.9.0'):
            # Older NumPy needs an explicit list as the index.
            X_highlight = X[list(test_idx), :]
            warnings.warn('Please update to NumPy 1.9.0 or newer')
        else:
            X_highlight = X[test_idx, :]

        # Hollow circles: an empty string is not a valid color in current
        # Matplotlib, so request "no face color" explicitly.
        plt.scatter(X_highlight[:, 0],
                    X_highlight[:, 1],
                    facecolors='none',
                    edgecolors='black',
                    alpha=1.0,
                    linewidths=1,
                    marker='o',
                    s=55, label='test set')
        
# Stack the training rows first and the test rows after them, so the test
# samples occupy the tail of the combined arrays.
X_combined_std = np.vstack((X_train_std, X_test_std))
y_combined = np.hstack((y_train, y_test))

# Derive the test-row indices from the actual lengths instead of hard-coding
# 105..149, so the plot stays correct if the split size changes.
plot_decision_regions(X=X_combined_std, y=y_combined,
                      classifier=ppn,
                      test_idx=range(len(y_train), len(y_combined)))
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')

plt.tight_layout()
# plt.savefig('./figures/iris_perceptron_scikit.png', dpi=300)
plt.show()

図3_1

## scikit-learnでロジスティック回帰

ロジスティック回帰をやってみる


from sklearn.linear_model import LogisticRegression

# Fit a logistic regression model on the standardized training data.
lr = LogisticRegression(C=1000.0, random_state=0)
lr.fit(X_train_std, y_train)

# The test rows follow the training rows in the combined arrays, so their
# indices are derived from the lengths rather than hard-coded as 105..149.
plot_decision_regions(X_combined_std, y_combined,
                      classifier=lr,
                      test_idx=range(len(y_train), len(y_combined)))
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/logistic_regression.png', dpi=300)
plt.show()

これだけでできてしまう。

## scikit-learnでサポートベクトルマシン


from sklearn.svm import SVC

# Fit a support vector classifier with a linear kernel on the standardized
# training data.
svm = SVC(kernel='linear', C=1.0, random_state=0)
svm.fit(X_train_std, y_train)

# The test rows follow the training rows in the combined arrays, so their
# indices are derived from the lengths rather than hard-coded as 105..149.
plot_decision_regions(X_combined_std, y_combined,
                      classifier=svm,
                      test_idx=range(len(y_train), len(y_combined)))
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/support_vector_machine_linear.png', dpi=300)
plt.show()

## scikit-learnで決定木


from sklearn.tree import DecisionTreeClassifier

# Fit a depth-limited decision tree using the entropy criterion. The tree is
# trained on the raw (unstandardized) features, so the axes below are in cm.
tree = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=0)
tree.fit(X_train, y_train)

# Stack the training rows first and the test rows after them.
X_combined = np.vstack((X_train, X_test))
y_combined = np.hstack((y_train, y_test))
# Derive the test-row indices from the actual lengths instead of hard-coding
# 105..149, so the plot stays correct if the split size changes.
plot_decision_regions(X_combined, y_combined,
                      classifier=tree,
                      test_idx=range(len(y_train), len(y_combined)))

plt.xlabel('petal length [cm]')
plt.ylabel('petal width [cm]')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/decision_tree_decision.png', dpi=300)
plt.show()

決定木も作れる

from sklearn.tree import export_graphviz

# Write the fitted tree to tree.dot, labelling the two input features by name.
export_graphviz(
    tree,
    feature_names=['petal length', 'petal width'],
    out_file='tree.dot',
)

ファイルを保存した場所へ移動し、コマンドラインから以下のコマンドを実行し、dotファイルをpngに変換する。

$ dot -Tpng tree.dot -o tree.png

すると決定木の図をみることができる。
こういう事ができると意思決定の理由をビジュアルで説明できる。

## その他にもいろいろできる

ランダムフォレスト
k近傍法
などなど

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up