前置き

衝撃のサンプルとはGoogle公式の以下です。
財務時系列データを使用した機械学習

サンプルをもとにTensorFlow 1.3でN225の予測はもうしましたが、さらに理解を深めます。

前回までのコード

import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from pandas.plotting import autocorrelation_plot
from pandas.plotting import scatter_matrix
# Symbols of the stock indices whose downloaded CSV files are used.
INDEIES = ["N225",  # Nikkei 225, Japan
           "HSI",   # Hang Seng, Hong Kong
           "GDAXI", # DAX, German
           "DJI",   # Dow, US
           "GSPC",  # S&P 500, US
           "SSEC",  # Shanghai Composite Index (China)
           "BVSP"]  # BOVESPA, Brazil
def tf_confusion_metrics(model, actual_classes, session, feed_dict):
    """Print precision, recall, F1 score and accuracy of a two-class model.

    Class index 1 (the second column) is counted as the "positive" class of
    the confusion matrix and class index 0 as the "negative" class.

    NOTE(review): study() in this file feeds label columns ordered
    ["positive", "negative"], so index 1 is the "negative" column there.
    Confirm that this is the class the metrics are meant to describe.

    Args:
        model: Tensor of per-class scores; argmax is taken along axis 1.
        actual_classes: Tensor/placeholder of true labels with one column per
            class; argmax is taken along axis 1.
        session: tf.Session used to evaluate the confusion-matrix counts.
        feed_dict: Feed dictionary handed to ``session.run``.

    Returns:
        dict with the keys "tp", "tn", "fp", "fn", "tpr", "fpr", "accuracy",
        "precision", "recall" and "f1_score". A ratio whose denominator is
        zero is reported as 0.0 instead of raising ZeroDivisionError.
    """
    predictions = tf.argmax(model, 1)
    actuals = tf.argmax(actual_classes, 1)

    actual_is_one = tf.equal(actuals, tf.ones_like(actuals))
    actual_is_zero = tf.equal(actuals, tf.zeros_like(actuals))
    predicted_is_one = tf.equal(predictions, tf.ones_like(predictions))
    predicted_is_zero = tf.equal(predictions, tf.zeros_like(predictions))

    def count_op(actual_mask, predicted_mask):
        # Number of rows where both boolean masks are true.
        return tf.reduce_sum(
            tf.cast(tf.logical_and(actual_mask, predicted_mask), "float"))

    tp, tn, fp, fn = (float(value) for value in session.run(
        [
            count_op(actual_is_one, predicted_is_one),
            count_op(actual_is_zero, predicted_is_zero),
            count_op(actual_is_zero, predicted_is_one),
            count_op(actual_is_one, predicted_is_zero),
        ],
        feed_dict
    ))

    def ratio(numerator, denominator):
        # An empty denominator means the metric is undefined; report 0.0.
        return numerator / denominator if denominator else 0.0

    tpr = ratio(tp, tp + fn)
    # False-positive rate is FP over all actual negatives (FP + TN); the
    # previous code divided by (TP + FN), which is the recall denominator.
    fpr = ratio(fp, fp + tn)

    accuracy = ratio(tp + tn, tp + fp + fn + tn)

    recall = tpr
    precision = ratio(tp, tp + fp)

    f1_score = ratio(2 * (precision * recall), precision + recall)

    print('Precision = ', precision)
    print('Recall = ', recall)
    print('F1 Score = ', f1_score)
    print('Accuracy = ', accuracy)

    return {
        "tp": tp, "tn": tn, "fp": fp, "fn": fn,
        "tpr": tpr, "fpr": fpr,
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1_score,
    }
def _train_and_report(session, model, feature, actual_classes,
                      train_predictors, train_classes,
                      test_predictors, test_classes,
                      steps=300000, report_every=5000):
    """Train ``model`` with Adam on the training data and print test metrics.

    Prints the training accuracy every ``report_every`` steps and finally
    calls tf_confusion_metrics on the test data.
    """
    # Cross-entropy summed over all rows.
    cost = -tf.reduce_sum(actual_classes * tf.log(model))
    training_step = tf.train.AdamOptimizer(learning_rate=0.0001).minimize(cost)
    session.run(tf.global_variables_initializer())

    correct_prediction = tf.equal(tf.argmax(model, 1), tf.argmax(actual_classes, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    train_feed = {feature: train_predictors, actual_classes: train_classes}
    for i in range(1, steps + 1):
        session.run(training_step, feed_dict=train_feed)
        if i % report_every == 0:
            print(i, session.run(accuracy, feed_dict=train_feed))

    test_feed = {feature: test_predictors, actual_classes: test_classes}
    tf_confusion_metrics(model, actual_classes, session, test_feed)


def study():
    """Train and evaluate two N225 up/down classifiers on index log returns.

    Reads ./data/<symbol>.csv for every symbol in INDEIES, converts closing
    prices to daily log returns, builds rows holding the N225 direction of a
    day together with the returns of all indices on the one to three
    preceding rows, and then trains a softmax regression and a network with
    two hidden layers (50 and 25 units) on the first 80% of the rows. The
    metrics on the remaining 20% are printed for both models.
    """
    closing = pd.DataFrame()
    for index in INDEIES:
        # The CSV files spell missing values as the literal string "null".
        df = pd.read_csv("./data/" + index + ".csv",na_values=["null"])
        df["Date"] = pd.to_datetime(df["Date"])
        df = df.set_index("Date")
        closing[index] = df["Close"]
    # Fill gaps with the most recent earlier value.
    closing = closing.ffill()
    for index in INDEIES:
        # Series.max() skips NaN; the builtin max() is order-dependent when
        # NaN values are present.
        closing[index] = closing[index] / closing[index].max()
        closing[index] = np.log(closing[index] / closing[index].shift())

    # Label columns: exactly one of them is 1 whenever the N225 return is
    # a number (>= 0 -> positive, < 0 -> negative).
    closing["positive"] = 0
    closing.loc[closing["N225"] >= 0, "positive"] = 1
    closing["negative"] = 0
    closing.loc[closing["N225"] < 0, "negative"] = 1

    # Use the returns of the 1 to 3 preceding rows as predictors.
    days_before = range(1,4)

    columns = ["positive", "negative"]
    for i in days_before :
        columns += [index + "_" + str(i) for index in INDEIES]

    # Plain arrays for positional access (replaces the removed ``.ix``).
    values = {name: closing[name].values
              for name in ["positive", "negative"] + INDEIES}

    # Collect the rows first and build the frame once; DataFrame.append in a
    # loop copies the whole frame on every iteration.
    # NOTE(review): the start offset 7 is taken over unchanged; the look-back
    # itself only needs the first few rows to be skipped.
    rows = []
    for i in range(7, len(closing)):
        # The labels come from the current row ...
        data = {"positive": values["positive"][i],
                "negative": values["negative"][i]}
        # ... and the predictors from the preceding rows.
        for index in INDEIES:
            for before in days_before :
                data[index + "_" + str(before)] = values[index][i - before]
        rows.append(data)
    training = pd.DataFrame(rows, columns=columns)

    # Columns from the third onward are the predictors.
    predictors = training[training.columns[2:]]
    # The first two columns are the classes to predict.
    classes = training[training.columns[:2]]

    # Use the first 80% of the rows for training.
    training_size = int(len(training) * 0.8)

    training_predictors = predictors[:training_size]
    training_classes = classes[:training_size]
    test_predictors = predictors[training_size:]
    test_classes = classes[training_size:]

    # Number of predictor columns.
    num_predictors = len(training_predictors.columns)
    # Number of class columns.
    num_classes = len(training_classes.columns)

    # Model 1: softmax regression without hidden layers.
    session = tf.Session()
    feature = tf.placeholder(tf.float32, shape=(None, num_predictors))
    actual_classes = tf.placeholder(tf.float32,  shape=(None, num_classes))
    weights = tf.Variable(tf.truncated_normal([num_predictors, num_classes], stddev=0.0001))
    biases = tf.Variable(tf.ones([num_classes]))
    model = tf.nn.softmax(tf.matmul(feature, weights) + biases)

    _train_and_report(
        session, model, feature, actual_classes,
        training_predictors.values, training_classes.values,
        test_predictors.values, test_classes.values)

    # Model 2: two ReLU hidden layers (50 and 25 units) before the softmax.
    session2 = tf.Session()

    feature = tf.placeholder(tf.float32, shape=(None, num_predictors))
    actual_classes = tf.placeholder(tf.float32,  shape=(None, num_classes))

    weights1 = tf.Variable(tf.truncated_normal([num_predictors, 50], stddev=0.0001))
    biases1 = tf.Variable(tf.ones([50]))

    weights2 = tf.Variable(tf.truncated_normal([50, 25], stddev=0.0001))
    biases2 = tf.Variable(tf.ones([25]))

    weights3 = tf.Variable(tf.truncated_normal([25, 2], stddev=0.0001))
    biases3 = tf.Variable(tf.ones([2]))

    hidden_layer_1 = tf.nn.relu(tf.matmul(feature, weights1) + biases1)
    hidden_layer_2 = tf.nn.relu(tf.matmul(hidden_layer_1, weights2) + biases2)
    model = tf.nn.softmax(tf.matmul(hidden_layer_2, weights3) + biases3)

    _train_and_report(
        session2, model, feature, actual_classes,
        training_predictors.values, training_classes.values,
        test_predictors.values, test_classes.values)


# Run the experiment only when this file is executed as a script.
if __name__ == "__main__":
    study()

なんとなく整理しちゃう

コードがよくわからない時にまずいつもやっています。
なんかコードが冗長だしカラムの数をハードコーディングしてたりするのを改善します。

またテストの出力は以下のようなものがでてきます。

Precision =  0.6395939086294417
Recall =  0.5338983050847458
F1 Score =  0.581986143187067
Accuracy =  0.6298568507157464

Precisionは「上がる」と予測したもののうち、実際に上がっていた割合。
Recallは実際に上がったもののうち、「上がる」と予測できた割合。
F1 ScoreはPrecisionとRecallの調和平均。

うん　見てもどう評価していいのかわからないので消しちゃいます。Accuracyが高ければいいもん。
あとは単純にプログラムの知識だけで出力が変わらないようリファクタをし、ついでに隠れ層はいくつになってもいいように拡張しました。

study.py

import numpy as np
import pandas as pd
from model import Model
# Symbols of the stock indices whose downloaded CSV files are used.
INDEIES = ["N225",  # Nikkei 225, Japan
           "HSI",   # Hang Seng, Hong Kong
           "GDAXI", # DAX, German
           "DJI",   # Dow, US
           "GSPC",  # S&P 500, US
           "SSEC",  # Shanghai Composite Index (China)
           "BVSP"]  # BOVESPA, Brazil
def getClosing(indices=None, data_dir="./data/"):
    """Load closing prices and turn them into daily log returns with labels.

    For every symbol the file ``<data_dir>/<symbol>.csv`` is read; it must
    provide the columns "Date" and "Close". Missing prices are filled with
    the most recent earlier value before the log return
    ``log(close / previous close)`` is computed.

    Args:
        indices: Symbols to load. Defaults to the module-level INDEIES.
            "N225" must be included because the labels are derived from it.
        data_dir: Directory that contains the CSV files.

    Returns:
        pandas.DataFrame indexed by date with one log-return column per
        symbol plus the 0/1 columns "positive" (N225 return >= 0) and
        "negative" (N225 return < 0). Rows whose N225 return is NaN have 0
        in both label columns.
    """
    import os

    if indices is None:
        indices = INDEIES
    closing = pd.DataFrame()
    for index in indices:
        # The CSV files spell missing values as the literal string "null".
        df = pd.read_csv(os.path.join(data_dir, index + ".csv"), na_values=["null"])
        df["Date"] = pd.to_datetime(df["Date"])
        df = df.set_index("Date")
        closing[index] = df["Close"]
    # Fill gaps with the most recent earlier value.
    closing = closing.ffill()
    for index in indices:
        # Series.max() skips NaN; the builtin max() is order-dependent when
        # NaN values are present.
        closing[index] = closing[index] / closing[index].max()
        closing[index] = np.log(closing[index] / closing[index].shift())
    # ``.loc`` replaces the removed ``.ix`` indexer for boolean-mask updates.
    closing["positive"] = 0
    closing.loc[closing["N225"] >= 0, "positive"] = 1
    closing["negative"] = 0
    closing.loc[closing["N225"] < 0, "negative"] = 1
    return closing
def getTraningData(closing=None, indices=None):
    """Build the feature and answer tables used for training and testing.

    Each output row belongs to one row ``i`` of ``closing`` (starting at row
    7): the answers are that row's "positive"/"negative" labels and the
    features are the values of every index on rows ``i - 1`` to ``i - 3``.

    Args:
        closing: DataFrame with one column per index plus "positive" and
            "negative". Defaults to the result of getClosing().
        indices: Index column names to use. Defaults to INDEIES.

    Returns:
        (features, answers): two DataFrames with the same number of rows and
        a fresh 0..n-1 index. The feature columns are named
        "<index>_<days before>" and ordered by day first, then by index.
    """
    if closing is None:
        closing = getClosing()
    if indices is None:
        indices = INDEIES

    # Use the values of the 1 to 3 preceding rows as features.
    days_before = range(1,4)

    columns = []
    for i in days_before :
        columns += [index + "_" + str(i) for index in indices]

    # Plain arrays for positional access (replaces the removed ``.ix``).
    positive = closing["positive"].values
    negative = closing["negative"].values
    values = {index: closing[index].values for index in indices}

    # Collect the rows first and build each frame once; DataFrame.append in
    # a loop copies the whole frame on every iteration.
    # NOTE(review): the start offset 7 is taken over unchanged; the look-back
    # itself only needs the first few rows to be skipped.
    answer_rows = []
    feature_rows = []
    for i in range(7, len(closing)):
        # The answer comes from the current row ...
        answer_rows.append({"positive": positive[i], "negative": negative[i]})
        # ... and the features from the preceding rows.
        data = {}
        for index in indices:
            for before in days_before :
                data[index + "_" + str(before)] = values[index][i - before]
        feature_rows.append(data)

    answers = pd.DataFrame(answer_rows, columns=["positive", "negative"])
    features = pd.DataFrame(feature_rows, columns=columns)
    return features,answers
if __name__ == "__main__":
    features, answers = getTraningData()
    # An empty list means no hidden layer; [50, 25] means two hidden layers.
    hidden_layer_configs = [[], [50, 25]]
    for layers in hidden_layer_configs:
        model = Model(features, answers, layers)
        model.train(3000)
        test_accuracy = model.test()
        print('Accuracy = ', test_accuracy)

model.py

import tensorflow as tf
class featuresAndAnswers:
    """Pair of a feature table and the answer table that belongs to it."""

    def __init__(self, features, answers):
        # Both objects are stored as given, without copying.
        self.features, self.answers = features, answers
class trainingAndTest():
    """Split features and answers into a leading training part and a trailing test part."""

    def __init__(self, features, answers ,percentage):
        # Rows before the split point are for training, the rest for testing.
        split_at = int(len(features) * percentage)
        training_part = featuresAndAnswers(features[:split_at], answers[:split_at])
        test_part = featuresAndAnswers(features[split_at:], answers[split_at:])
        self.traning = training_part
        self.test = test_part
        # Number of feature columns (N225_1 and so on).
        self.feature_type_count = len(features.columns)
        # Number of answer columns (positive and negative).
        self.answer_type_count = len(answers.columns)
class Model:
    """Feed-forward softmax classifier with an arbitrary number of hidden layers.

    The given data is split 80% / 20% into a training and a test part. The
    network is trained with Adam on a summed cross-entropy cost.
    """

    def __init__(self, features, answers, layers=None):
        """Build the graph, create a session and initialize all variables.

        Args:
            features: Table of input features, one column per feature.
            answers: Table of expected outputs, one column per class.
            layers: Sizes of the hidden layers in order; None or an empty
                sequence means no hidden layer.
        """
        # None instead of a shared mutable [] default; copied so that the
        # caller's list is never aliased.
        layers = [] if layers is None else list(layers)
        # Use 80% of the rows for training.
        self.data = trainingAndTest(features, answers, 0.8)
        # Placeholders are the inputs that are fed at run time.
        self.real_answer = tf.placeholder(tf.float32,  shape=(None, self.data.answer_type_count))
        self.feature = tf.placeholder(tf.float32, shape=(None, self.data.feature_type_count))
        self.model = self.createTfModel(layers)
        # Error against the expected answers, summed over all rows.
        cost = -tf.reduce_sum(self.real_answer*tf.log(self.model))
        self.step = tf.train.AdamOptimizer(learning_rate=0.0001).minimize(cost)
        self.session = tf.Session()
        self.session.run(tf.global_variables_initializer())

        # Ops for the share and the number of correctly predicted rows. They
        # are built once here so that test() does not add new ops to the
        # graph on every call.
        correct_prediction = tf.equal(tf.argmax(self.model, 1), tf.argmax(self.real_answer, 1))
        correct_as_float = tf.cast(correct_prediction, tf.float32)
        self.accuracy = tf.reduce_mean(correct_as_float)
        self._correct_count = tf.reduce_sum(correct_as_float)

    def _dense(self, layer_input, input_size, output_size):
        """Return ``layer_input * weights + biases`` with fresh variables."""
        # Weights start as small truncated-normal noise, biases as ones.
        weights = tf.Variable(tf.truncated_normal([input_size, output_size], stddev=0.0001))
        biases = tf.Variable(tf.ones([output_size]))
        return tf.matmul(layer_input, weights) + biases

    def createTfModel(self, layers):
        """Build the network output tensor.

        Every entry of ``layers`` adds a ReLU hidden layer of that size; the
        final layer has one output per answer column and uses softmax.

        Args:
            layers: Sizes of the hidden layers in order (may be empty).

        Returns:
            The softmax output tensor.
        """
        layer_input = self.feature
        input_size = self.data.feature_type_count
        for hidden_size in layers:
            # The output of a hidden layer is the input of the next layer.
            layer_input = tf.nn.relu(self._dense(layer_input, input_size, hidden_size))
            input_size = hidden_size
        logits = self._dense(layer_input, input_size, self.data.answer_type_count)
        return tf.nn.softmax(logits)

    def train(self,count=30000,print_count=5):
        """Run ``count`` optimization steps on the training data.

        Args:
            count: Number of optimization steps.
            print_count: Approximate number of progress lines to print; 0 or
                less disables the progress output.
        """
        feed_dict = {
               self.feature: self.data.traning.features,
               self.real_answer: self.data.traning.answers
            }
        # Integer interval: the former float division never matched when
        # count was not divisible by print_count and failed for 0.
        report_every = max(1, count // print_count) if print_count > 0 else 0
        for i in range(1, count+1):
            self.session.run(self.step,feed_dict)
            if report_every and i % report_every == 0:
                # Accuracy on the training data at this step.
                print( i, self.session.run(self.accuracy,feed_dict))

    def test(self):
        """Return the share of correctly predicted rows of the test data."""
        test_row_count = len(self.data.test.answers)
        if test_row_count == 0:
            # Nothing to evaluate; the accuracy is undefined.
            return float("nan")
        correct_answer_count = self.session.run(
            self._correct_count,
            {self.feature: self.data.test.features,
             self.real_answer: self.data.test.answers}
          )
        return  correct_answer_count/test_row_count

意図しなくてもしっかり２つにわかれました。study.pyはデータの取得と学習の呼び出しだけなのでここまでの記事で理解しました。
もともとある変数名がtp_opとかpredictorとかよくわからなかったので簡単な単語に変えさらにリファクタリングすると分かりやすくなりました。コメント見てもらえれば大体わかると思います。
隠れ層があると実行するごとにAccuracyが微妙に変わるのですが、同じAccuracyが出るまで確認しました。

    feed_dict= {
      feature: test_predictors.values,
      actual_classes: test_classes.values.reshape(len(test_classes.values), 2)
    }

サンプルでは上記のように60000回もreshapeしてたのはなんだったんだろう。

プログラムは根性だ！

前 TensorFlowを投資につかうまで衝撃のサンプル編 (4)
次 日経平均株価の予測をTensorFlowで機械学習させ精度の向上を試みた　62％から68％位になった

TensorFlowを投資につかうまで 衝撃のサンプル編 (5)

前置き

前回までのコード

なんとなく整理しちゃう

TensorFlowを投資につかうまで衝撃のサンプル編 (5)