Trying Theano on Kaggle's MNIST Data: Logistic Regression


Overview

I implemented logistic regression on Kaggle's MNIST data, following the Theano tutorial ( http://deeplearning.net/tutorial/contents.html ).
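Concretely, the model is plain multiclass logistic regression: a softmax over an affine transform of the input, trained by minimizing the negative log-likelihood −ln p(t|w) that the code comments below cite (PRML eq. 4.90):

$$
P(y = i \mid x, W, b) = \mathrm{softmax}_i(Wx + b) = \frac{e^{W_i x + b_i}}{\sum_j e^{W_j x + b_j}},
\qquad
L(W, b) = -\sum_{n} \ln P(y = y_n \mid x_n, W, b)
$$

The negative_log_likelihood() method in the code computes the minibatch mean of this quantity rather than the sum.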

Implementation

from __future__ import print_function

__docformat__ = 'restructedtext en'

import six.moves.cPickle as pickle
import timeit
import numpy
import numpy as np  # the code below uses both aliases
import theano
import theano.tensor as T

rng = numpy.random.RandomState(8000)

# Shared variables
# Data defined this way lives in Theano-managed storage (e.g. on the GPU),
# so compiled Theano functions can use it without copying it on every call.

#-------------------------------------------------------------------------
# Logistic Regression class
#-------------------------------------------------------------------------
class LogisticRegression(object):
    """Multi-class Logistic Regression Class
    """
    def __init__(self, input, n_in, n_out):


        # MODEL
        # Parameters as shared variables
        # Weight matrix W, initialized to zeros
        self.W = theano.shared(
            value=numpy.zeros(
                (n_in, n_out),
                dtype=theano.config.floatX
            ),
            name='W',
            borrow=True
        )
        # borrow decides whether the shared variable shares the underlying
        # data defined in Python space instead of copying it
        # Bias vector b, initialized to zeros
        self.b = theano.shared(
            value=numpy.zeros(
                (n_out,),
                dtype=theano.config.floatX
            ),
            name='b',
            borrow=True
        )
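        # Note: borrow=True is only honored for CPU-backed storage; likewise,
        # get_value(borrow=True) returns the internal buffer without a copy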


        # Softmax: probability assigned to each class
        #p(y=i|x,W,b)
        self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b)
        # Prediction: take the argmax over class probabilities
        self.y_pred = T.argmax(self.p_y_given_x, axis=1)
        # Store the parameters in .params
        self.params = [self.W, self.b]
        # Keep a reference to the input
        self.input = input

    # Loss function
    def negative_log_likelihood(self, y):
        # Negative log-likelihood: -ln p(t|w), eq. (4.90) in PRML
        # self.p_y_given_x is the softmax output, one row per example
        # [T.arange(y.shape[0]), y] picks out, for each row n, the
        # probability assigned to the correct class y[n]
        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])
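        # Example: with p_y_given_x = [[.1, .9], [.8, .2], [.3, .7]] and
        # y = [1, 0, 1], the indexing selects [.9, .8, .7], and the loss
        # is -mean(log([.9, .8, .7]))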

    # Error rate
    def errors(self, y):

        # Check that y has the same dimensionality as y_pred
        if y.ndim != self.y_pred.ndim:
            raise TypeError(
                'y should have the same shape as self.y_pred',
                ('y', y.type, 'y_pred', self.y_pred.type)
            )
        # Check that the labels are integers
        if y.dtype.startswith('int'):
            # T.neq is "not equal": 1 where prediction and label differ,
            # so the mean over the minibatch is the misclassification rate
            # First argument: class with the highest output probability
            # Second argument: the correct labels
            return T.mean(T.neq(self.y_pred, y))
        else:
            raise NotImplementedError()
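        # Example: y_pred = [3, 1, 5] against y = [3, 8, 5] gives
        # T.neq = [0, 1, 0], so errors() returns 1/3 for that minibatch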

# Minibatch stochastic gradient descent
def sgd_optimization_mnist(learning_rate=0.10, n_epochs=100,
                           dataset='mnist.pkl.gz',
                           batch_size=500):

    dtype = np.float32
    data = np.loadtxt("../input/train.csv", dtype=dtype,
                      delimiter=',', skiprows=1)
    test_data = np.loadtxt("../input/test.csv", dtype=dtype,
                           delimiter=',', skiprows=1)

    print(data.shape)

    labels = data[:, 0]
    data = data[:, 1:]

    NUM_TRAIN = len(data)
    NUM_TEST = len(test_data)
    # Truncate the training data to a whole number of minibatches
    if NUM_TRAIN % batch_size != 0:
        whole = (NUM_TRAIN // batch_size) * batch_size
        data = data[:whole]
        NUM_TRAIN = len(data)

    # Randomly shuffle the training rows
    indices = rng.permutation(NUM_TRAIN)
    data, labels = data[indices, :], labels[indices]
    # With batch_size = 500, each block of 500 rows is split (480, 20):
    # train on 96% of the data and validate on the rest
    is_train = numpy.array( ([0]* (batch_size - 20) + [1] * 20) * (NUM_TRAIN // batch_size))

    # Kaggle's test.csv has no labels, so it cannot be scored locally;
    # the held-out "test" split below reuses the validation rows, and
    # test.csv itself is only used for the final submission in predict().
    # Prepare the train, valid, and test sets
    train_set_x, train_set_y = numpy.array(data[is_train == 0]), labels[is_train == 0]
    valid_set_x, valid_set_y = numpy.array(data[is_train == 1]), labels[is_train == 1]
    test_set_x, test_set_y = numpy.array(data[is_train == 1]), labels[is_train == 1]
    # Number of minibatches per split
    n_train_batches = len(train_set_y) // batch_size
    n_valid_batches = len(valid_set_y) // batch_size
    n_test_batches = len(test_set_y) // batch_size

    ###################
    # BUILD THE MODEL #
    ###################

    print('... building the model')

    # Minibatch index (long scalar)
    index = T.lscalar()

    # Matrix of rasterized images
    x = T.matrix('x')
    # Vector of integer labels
    y = T.ivector('y')

    # Build the logistic regression classifier
    # Each MNIST image is 28x28 pixels
    classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10)

    # The cost to minimize is the negative log-likelihood
    cost = classifier.negative_log_likelihood(y)

    # Wrap the datasets as shared variables so theano.function can slice minibatches
    train_set_x = theano.shared(numpy.asarray(train_set_x, dtype=theano.config.floatX))
    train_set_y = T.cast(theano.shared(numpy.asarray(train_set_y, dtype=theano.config.floatX)), 'int32')
    test_set_x = theano.shared(numpy.asarray(test_set_x, dtype=theano.config.floatX))
    test_set_y = T.cast(theano.shared(numpy.asarray(test_set_y, dtype=theano.config.floatX)), 'int32')
    valid_set_x = theano.shared(numpy.asarray(valid_set_x, dtype=theano.config.floatX))
    valid_set_y = T.cast(theano.shared(numpy.asarray(valid_set_y, dtype=theano.config.floatX)), 'int32')
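    # Labels are stored as floatX (shared variables on the GPU hold floats)
    # and cast back to int32 so they can serve as indices; this follows the
    # Theano tutorial's shared_dataset() idiom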

    # Validation model: error rate on one validation minibatch
    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size: (index + 1) * batch_size],
            y: valid_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    # Test model: error rate on one minibatch of the held-out test split
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        }
    )

    # Gradient of the cost with respect to W
    g_W = T.grad(cost=cost, wrt=classifier.W)
    # Gradient of the cost with respect to b
    g_b = T.grad(cost=cost, wrt=classifier.b)

    # Update rules for the parameters (W, b)
    updates = [(classifier.W, classifier.W - learning_rate * g_W),
               (classifier.b, classifier.b - learning_rate * g_b)]
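    # i.e. one SGD step: W <- W - learning_rate * dcost/dW, and likewise for b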

    # Training model: returns the minibatch cost and, as a side effect,
    # applies the parameter updates defined above
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    ###################
    # TRAIN THE MODEL #
    ###################
    print('... training the model')
    # Early-stopping parameters
    # look at this many minibatches regardless
    patience = 50000
    # wait this much longer when a significantly better model is found
    patience_increase = 5
    # a relative improvement of this much counts as significant
    improvement_threshold = 0.995
    # how often to check the validation set; in this setup, every epoch
    validation_frequency = min(n_train_batches, patience // 2)
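    # With Kaggle's 42,000-row train.csv this comes to 80 training
    # minibatches per epoch (40,320 training rows / 500), so validation
    # runs once per epoch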

    best_validation_loss = numpy.inf
    test_score = 0.
    start_time = timeit.default_timer()

    # Main training loop
    done_looping = False
    epoch = 0
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):

            minibatch_avg_cost = train_model(minibatch_index)
            # total number of minibatch updates so far
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                # compute the zero-one loss on the validation set
                validation_losses = [validate_model(i)
                                     for i in range(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)

                print(
                    'epoch %i, minibatch %i/%i, validation error %f %%' %
                    (
                        epoch,
                        minibatch_index + 1,
                        n_train_batches,
                        this_validation_loss * 100.
                    )
                )

                # if this is the best validation loss so far
                if this_validation_loss < best_validation_loss:

                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss
                    # evaluate on the held-out test split
                    test_losses = [test_model(i)
                                   for i in range(n_test_batches)]
                    test_score = numpy.mean(test_losses)

                    print(
                        (
                            '     epoch %i, minibatch %i/%i, test error of'
                            ' best model %f %%'
                        ) %
                        (
                            epoch,
                            minibatch_index + 1,
                            n_train_batches,
                            test_score * 100.
                        )
                    )

                    # save the best model
                    with open('best_model.pkl', 'wb') as f:
                        pickle.dump(classifier, f)

            if patience <= iter:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print(
        (
            'Optimization complete with best validation score of %f %%,'
            'with test performance %f %%'
        )
        % (best_validation_loss * 100., test_score * 100.)
    )
    print('The code ran for %d epochs, at %f epochs/sec' % (
        epoch, 1. * epoch / (end_time - start_time)))

def predict():
    """
    An example of how to load a trained model and use it
    to predict labels.
    """
    # Load the trained model (the pickle must be opened in binary mode)
    with open('best_model.pkl', 'rb') as f:
        classifier = pickle.load(f)

    # Prediction model
    predict_model = theano.function(
        inputs=[classifier.input],
        outputs=classifier.y_pred)

    # Load the test set (dtype was local to sgd_optimization_mnist)
    test_data = np.loadtxt("../input/test.csv", dtype=np.float32,
                           delimiter=',', skiprows=1)
    # Predict
    predicted_values = predict_model(test_data)
    print("Predicted values for the first 10 examples in test set:")
    print(predicted_values)

    # Write the Kaggle submission file
    np.savetxt('Submit_TheanoLR3.csv',
               np.c_[range(1, len(test_data) + 1), predicted_values],
               delimiter=',', comments='', header='ImageId,Label', fmt='%d')

if __name__ == '__main__':
    sgd_optimization_mnist()
    # Generate the submission file from the saved best model
    predict()

Results

The submission scored 0.90757. scikit-learn gave me 0.90900, so this is about what I expected from plain logistic regression.
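For reference, here is a minimal sketch of the kind of scikit-learn baseline mentioned above; the exact solver, preprocessing, and evaluation protocol behind the 0.90900 figure are not stated in the post, so these settings are assumptions:

import numpy as np
from sklearn.linear_model import LogisticRegression

# Column 0 of Kaggle's train.csv is the label; the rest are pixel values
data = np.loadtxt("../input/train.csv", delimiter=',', skiprows=1)
X, y = data[:, 1:] / 255.0, data[:, 0].astype(int)  # scaling to [0,1] is an assumption

clf = LogisticRegression(multi_class='multinomial', solver='lbfgs')
clf.fit(X, y)
print(clf.score(X, y))  # training accuracy, not the Kaggle leaderboard score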
