LoginSignup
1
1

More than 3 years have passed since last update.

ハードマージンSVM-Breast Cancer Wisconsin (Diagnostic) Data Set-

Last updated at Posted at 2019-05-16

やりたいこと

サポートベクトルマシンを作ってみる。
やってみた感想としては、libsvmのまま使ったほうが使いやすいかも?

コード

'''
Created on 2019/05/15

@author: tatsunidas
'''
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import seaborn as sns # used for plot interactive graph. 
#データの読み込み
from sklearn.datasets import load_breast_cancer
dataset = load_breast_cancer()
df = pd.DataFrame(data=dataset.data, columns=dataset.feature_names)

# トレーニングデータとテストデータに分割
# 乱数を制御するパラメータ random_state は None にすると毎回異なるデータを生成する
X, X_test, y, y_test = train_test_split(dataset.data, dataset.target, test_size=0.2, random_state=None)
print(X.shape)

# データの標準化処理
sc = StandardScaler()
sc.fit(X)
X = sc.transform(X)
X_test = sc.transform(X_test)

# fit the model
h = .02  # step size in the mesh
# we create an instance of SVM and fit out data. We do not scale our
# data since we want to plot the support vectors
svc = svm.SVC(kernel='linear', C=1.).fit(X, y)#One-vs-One
lin_svc = svm.LinearSVC(C=1., tol=2e-03).fit(X, y)#One-vs-All
rbf_svc = svm.NuSVC(kernel='rbf', gamma='auto').fit(X, y)
# polyのみdegreeをオプションで使える。チューニングは難しい
poly_svc = svm.NuSVC(kernel='poly', gamma='auto', degree=3,decision_function_shape='ovr', nu=0.05).fit(X, y)

#show accuracy
print('score :: 1 svc, 2 lin_svc, 3 rbf_svc, 4 poly_svc')
for j, clf in enumerate((svc, lin_svc, rbf_svc, poly_svc)):
    print(str(j+1))
    print('正解率(train):{:.3f}'.format(clf.score(X,y)))
    print('正解率(test):{:.3f}'.format(clf.score(X_test,y_test)))

'''
graphを作ってみる
'''
#相関を見てみる
df = pd.DataFrame(data=dataset.data, columns=dataset.feature_names)
corr = df.corr() # .corr is used to find corelation
f,ax = plt.subplots(figsize=(20, 20))
sns.heatmap(corr, cbar = True,  square = True, annot = True, fmt= '.1f', 
            xticklabels= True, yticklabels= True
            ,cmap="coolwarm", linewidths=.5, ax=ax);
plt.show()

#最初の2つの特徴を使って境界を描画してみる
# create a mesh to plot in
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

# title for the plots
titles = ['SVC with linear kernel',
          'LinearSVC (linear kernel)',
          'SVC with RBF kernel',
          'SVC with polynomial kernel']
plt.figure(figsize=(10,12))
for j, clf in enumerate((svc, lin_svc, rbf_svc, poly_svc)):
    # Plot the decision boundary by assigning a color to each point in the mesh 
    plt.subplot(2, 2, j + 1)
    plt.subplots_adjust(wspace=0.4, hspace=0.4)

    pre_pred = np.array([xx.ravel(), yy.ravel()] + [np.repeat(0, xx.ravel().size) for _ in range(28)]).T
    print(pre_pred.shape)
    pred = clf.predict(pre_pred)
    Z = pred.reshape(xx.shape)
    decf = clf.decision_function(pre_pred).reshape(xx.shape)    

    # Put the result into a color plot
    # decision func
    plt.contourf(xx, yy, decf,alpha=1.0, cmap="RdYlGn", levels=np.linspace(decf.min(), decf.max(), 100))
    # boundary (これを使うときは一行上の plt.contourf(xx, yy, decf...をコメントアウトして)
#     plt.contourf(xx, yy, Z, cmap=plt.cm.get_cmap('RdBu_r'), alpha=0.6)

# Ploting  the training points
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.get_cmap('RdBu_r'))
    plt.xlabel(dataset.feature_names[0],size=20)
    plt.ylabel(dataset.feature_names[1],size=20)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.xticks(())
    plt.yticks(())
    plt.title(titles[j],size=20);

plt.show()

# 5/17 added
# ROC curve(ロックカーブ)
titles = ['SVC with linear kernel',
#           'LinearSVC (linear kernel)', # proba関数がない。予測を取得したいだけなのでpredict関数でもOKと思う。
          'SVC with RBF kernel',
          'SVC with polynomial kernel']
plt.figure(figsize=(10,12))
for j, clf in enumerate((svc, rbf_svc, poly_svc)):# exclude lin_svc
    # Plot each ROC curve
    plt.subplot(2, 2, j + 1)
    plt.subplots_adjust(wspace=0.4, hspace=0.4)
    #予測確率を取得
    pred = clf.predict_proba(X_test)[:,1]
    #偽陽性率と真陽性率の算出
    fpr, tpr, thretholds = roc_curve(y_test,pred) 
    # aucの算出
#     auc = auc(fpr,tpr)
    auc = roc_auc_score(y_test, pred)
    # Ploting  ROC
    plt.plot(fpr, tpr,label='ROC curve (auc = %.2f)'%(auc), marker='o', color='red')
    plt.plot([0,1],[0,1], color = 'black',linestyle='--')

    plt.xlabel('false positive rate')
    plt.ylabel('true positive rate')
    plt.xlim(-0.05, 1.0)
    plt.ylim(0.0, 1.05)
    plt.title('receiver operating characteristic'+'\n'+ titles[j]);
    plt.legend()
    plt.grid()

plt.show()

結果

(455, 30)#形状の確認用
score :: 1 svc, 2 lin_svc, 3 rbf_svc, 4 poly_svc
1
正解率(train):0.987
正解率(test):0.991
2
正解率(train):0.989
正解率(test):0.982
3
正解率(train):0.947
正解率(test):0.965
4
正解率(train):0.998
正解率(test):0.974
(175616, 30)
(175616, 30)
(175616, 30)
(175616, 30)

途中で描画するグラフ

データセット内の特徴相関図
Figure_1_corr.png

最初の2つの特徴を使って描画した境界面
Figure_1_svm.png
(polyはチューニングが難しい。)

ROC curve

Figure_1_roc.png

参考文献やURL

https://qiita.com/kazuki_hayakawa/items/18b7017da9a6f73eba77
https://scikit-learn.org/stable/modules/generated/sklearn.svm.NuSVC.html
https://www.kaggle.com/rcfreitas/python-ml-breast-cancer-diagnostic-data-set
https://ameblo.jp/cognitive-solution/entry-12290094948.html
https://scikit-learn.org/stable/auto_examples/svm/plot_svm_nonlinear.html#sphx-glr-auto-examples-svm-plot-svm-nonlinear-py
https://www.kaggle.com/selener/breast-cancer-diagnosis/notebook
https://stackoverflow.com/questions/45384185/what-is-the-difference-between-linearsvc-and-svckernel-linear/45390526
https://www.kaggle.com/sachin1512/breast-cancer-dataset/downloads/breast-cancer-dataset.zip/1
https://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+(original)

1
1
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
1
1