scikit-learn の Perceptron の使い方は線形回帰とほぼ同じで、骨組みの部分は

skPC.py

import numpy as np
from sklearn.linear_model import Perceptron

# read data
(略)

# build the model
# NOTE: `n_iter` was deprecated in scikit-learn 0.19 and removed in 0.21.
# `max_iter` is its replacement; `tol=None` disables the early-stopping
# criterion so that exactly `maxIter` epochs are run, as `n_iter` did.
maxIter = 100
model = Perceptron(max_iter=maxIter, tol=None)

# training
model.fit(features, labels)

# make test data
(略)

# predict
p= model.predict(test_data)

と、データ準備 → モデル作成 → fit() → predict() の流れですが、

x (独立変数)が複数ある
線形回帰のようにモデルから ax + b のパラメータを取り出して線を引いて評価ということができない

ところが前回と違っている点です。

2.1 学習データの準備

これはcsvファイルの各行の最初の２つの項目を features、３つ目の項目を labels としてリストにするだけです。
x, y だとわかりにくいので、features=独立変数、labels=従属変数とします。
features = [x,y]（xy座標）、labels = 0 or 1 になります。

skPC.py

import numpy as np
from sklearn.linear_model import Perceptron

# read data                                                                          
import csv
def readArrayWithCSV(dataFile):
    """Read a 3-column CSV file into feature and label lists.

    Each row is expected to look like '0.1, 0.2, 1.0': the first two
    columns are the (x, y) coordinates and the third is the class label.

    Args:
        dataFile: Path of the CSV file to read.

    Returns:
        A tuple (features, labels) where features is a list of [x, y]
        float pairs, e.g. [[0.1, 1.1], [0.2, 1.2], ...], and labels is a
        list of ints, e.g. [0, 1, ...].

    Raises:
        IndexError: If a non-empty row has fewer than three columns.
        ValueError: If a column cannot be parsed as a number.
    """
    features = []
    labels = []
    # Use a context manager so the file is closed even if parsing fails.
    with open(dataFile, 'r') as f:
        for row in csv.reader(f):
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing
                # newline at the end of the file); skip them.
                continue
            features.append([float(row[0]), float(row[1])])  # [x, y]
            # Labels are stored as floats ('1.0'), so go through float first.
            labels.append(int(float(row[2])))
    return features,labels

# read training data
# The CSV is parsed into plain Python lists first, then converted to
# NumPy arrays, which are what model.fit() is given later.
trainFile = "trainPC.csv"
features0,labels0 = readArrayWithCSV(trainFile)
features = np.array(features0)
labels = np.array(labels0)

2.2 モデルの作成

ここはそのままです。

skPC.py

# build the model
# NOTE: `n_iter` was deprecated in scikit-learn 0.19 and removed in 0.21.
# `max_iter` is its replacement; `tol=None` disables the early-stopping
# criterion so that exactly `maxIter` epochs are run, as `n_iter` did.
maxIter = 100
model = Perceptron(max_iter=maxIter, tol=None)

# training
model.fit(features, labels)

2.3 モデルを用いた予測

予測も model.predict(テストデータ) を呼ぶだけなのですが、結果を表示するために、(0,0)〜(10,5)の範囲で0.1単位のxy座標のマトリックスを作り、全点についてラベル（0 or 1）を予測して散布図を作ることにしました。xy座標のマトリックスは numpy.meshgrid() を使うと簡単に作れました。

skPC.py

# Build a grid of test points over (0, 0)-(10, 5) in steps of 0.1
# (np.arange excludes the end value, so x < 10.0 and y < 5.0).
npax = np.arange(0.0, 10.0, 0.1)
npay = np.arange(0.0, 5.0, 0.1)
npx,npy = np.meshgrid(npax, npay)
# Flatten both coordinate grids and pair them up as rows of [x, y].
npxy = np.c_[npx.ravel(), npy.ravel()]

numpy.arange(s, e, i) は s 以上 e 未満の範囲で i 単位の 1次元配列を作ります（終点 e は含まれません）。
xn,yn = numpy.meshgrid(x, y) は、x を y の要素数だけ縦に並べた xn と、y を x の要素数だけ横に並べた yn の、２つの２次元配列（形状はどちらも len(y) × len(x)）を作成します。
a.ravel() は多次元行列を１次元行列に直します。
np.c_[x, y] は [[x0,y0],[x1,y1]...] の形で行列を結合します

>>> import numpy as np
>>> x = np.arange(0, 9, 3)
>>> x
array([0, 3, 6])
>>> y = np.arange(2, 8, 2)
>>> y
array([2, 4, 6])
>>> xn,yn = np.meshgrid(x, y)
>>> xn
array([[0, 3, 6],
       [0, 3, 6],
       [0, 3, 6]])
>>> yn
array([[2, 2, 2],
       [4, 4, 4],
       [6, 6, 6]])
>>> xn.ravel()
array([0, 3, 6, 0, 3, 6, 0, 3, 6])
>>> yn.ravel()
array([2, 2, 2, 4, 4, 4, 6, 6, 6])
>>> np.c_[xn.ravel(), yn.ravel()]
array([[0, 2],
       [3, 2],
       [6, 2],
       [0, 4],
       [3, 4],
       [6, 4],
       [0, 6],
       [3, 6],
       [6, 6]])

これで作成したテストデータを predict() に渡すと全データの予測値 (0 or 1) が１次元行列で得られるので、x * y 行列に直して結果をプロットします。

skPC.py

# predict
# predict() returns one label per row of npxy as a 1-D array;
# reshape it back to the grid shape so it lines up with npx / npy.
npp = model.predict(npxy)
npz = npp.reshape(npx.shape)

# output
# NOTE: a and b (the coefficients of the true boundary y = a*x + b) are
# assigned in the "eval" step of the script and must exist before this call.
from plotPC import plotPC
title = "predPC_with_sklearn"
plotPC(title, npx, npy, npz, a, b, maxIter)

プロット関数は Spark.ml、TensorFlow でも使うので別ファイルで定義しておきます。

plotPC.py

import numpy as np
import matplotlib.pyplot as plt

def plotPC(title, px, py, pp, a, b, it, ac=0):
    """Draw predicted classes as a scatter plot and save it as a PNG file.

    Points whose prediction is truthy are drawn in blue, the others in red,
    and the true boundary y = a * x + b is drawn in yellow for x in [0, 10].
    The image is written to '<title>_<it>.png' in the current directory.

    Args:
        title: Plot title; also used as the prefix of the output file name.
        px: Array of x coordinates (same shape as pp).
        py: Array of y coordinates (same shape as pp).
        pp: Array of predicted labels (0 or 1) for each (px, py) point.
        a: Slope of the true boundary line.
        b: Intercept of the true boundary line.
        it: Iteration count, shown in the plot and used in the file name.
        ac: Accuracy to display; only shown when greater than 0.
    """
    fig = plt.figure()
    ax = fig.add_subplot(1,1,1)
    # Draw the (px, py) points whose prediction is True in blue.
    # The builtin `bool` is used because the `np.bool` alias was removed
    # in NumPy 1.24.
    bp = pp.astype(bool)
    bx = px[bp]
    by = py[bp]
    ax.scatter(bx, by, color='blue')
    # Draw the (px, py) points whose prediction is False in red.
    # `~` is the logical NOT for boolean arrays; unary minus on a boolean
    # array raises TypeError in current NumPy.
    rp = ~bp
    rx = px[rp]
    ry = py[rp]
    ax.scatter(rx, ry, color='red')
    # Draw the true boundary in yellow.
    textL = 'True Model:x=' + str(a) + ', y=' + str(b)
    lx = np.array([0,10])
    ly = a * lx + b
    ax.plot(lx, ly, color='yellow', label=textL)

    ax.set_title(title)
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.legend(loc=2)
    textC = 'iteration=' + str(it)
    if ac > 0:
        textC = textC + ', accuracy=' + str(round(ac, 2))
    ax.text(5.0, 0.1, textC)
    #fig.show()
    # Save the figure to a file.
    imageFile = title + '_' + str(it) + '.png'
    fig.savefig(imageFile)

結果は以下になりました。

また、正しい値と予測値があれば、正解率を出す関数もそれぞれの機械学習ライブラリに用意されています。scikit-learn では sklearn.metrics.accuracy_score になります。

skPC.py

# eval
# a, b: coefficients of the true boundary line y = a*x + b.
a = 0.4
b = 0.8
# Boolean grid of correct labels: True where the point lies above the line.
npl = a * npx + b < npy
from sklearn.metrics import accuracy_score
# Compare the flattened correct labels with the 1-D predictions.
ac = accuracy_score(npl.ravel(), npp)

最後にプログラム全体を載せておきます。

skPC.py

#!/usr/bin/env python                                                                

import numpy as np
from sklearn.linear_model import Perceptron

# read data                                                                          
import csv
def readArrayWithCSV(dataFile):
    """Read a 3-column CSV file into feature and label lists.

    Each row holds the (x, y) coordinates in its first two columns and
    the class label in the third.

    Args:
        dataFile: Path of the CSV file to read.

    Returns:
        A tuple (features, labels): features is a list of [x, y] float
        pairs and labels is a list of ints.

    Raises:
        IndexError: If a non-empty row has fewer than three columns.
        ValueError: If a column cannot be parsed as a number.
    """
    features = []
    labels = []
    # Use a context manager so the file is closed even if parsing fails.
    with open(dataFile, 'r') as f:
        for row in csv.reader(f):
            if not row:
                # csv.reader yields [] for blank lines; skip them.
                continue
            features.append([float(row[0]), float(row[1])])
            # Labels are stored as floats ('1.0'), so go through float first.
            labels.append(int(float(row[2])))
    return features,labels

# read training data
trainFile = "trainPC.csv"
features0,labels0 = readArrayWithCSV(trainFile)
features = np.array(features0)
labels = np.array(labels0)

# build the model
# NOTE: `n_iter` was deprecated in scikit-learn 0.19 and removed in 0.21.
# `max_iter` is its replacement; `tol=None` disables the early-stopping
# criterion so that exactly `maxIter` epochs are run, as `n_iter` did.
maxIter = 100
model = Perceptron(max_iter=maxIter, tol=None)

# training
model.fit(features, labels)

# Build a grid of test points over (0, 0)-(10, 5) in steps of 0.1
# (np.arange excludes the end value).
npax = np.arange(0.0, 10.0, 0.1)
npay = np.arange(0.0, 5.0, 0.1)
npx,npy = np.meshgrid(npax, npay)
npxy = np.c_[npx.ravel(), npy.ravel()]

# predict
# One label per grid point, reshaped back to the grid for plotting.
npp = model.predict(npxy)
npz = npp.reshape(npx.shape)

# eval
# a, b: coefficients of the true boundary line y = a*x + b.
a = 0.4
b = 0.8
# Correct labels: True where the point lies above the line.
npl = a * npx + b < npy
from sklearn.metrics import accuracy_score
ac = accuracy_score(npl.ravel(), npp)

# output
from plotPC import plotPC
title = "predPC_with_sklearn"
plotPC(title, npx, npy, npz, a, b, maxIter, ac)

scikit-learn、Spark.ml、TensorFlow で Perceptron〜（２）scikit-learn

2.1 学習データの準備

2.2 モデルの作成

2.3 モデルを用いた予測