Qiita Teams that are logged in
You are not logged in to any team

Log in to Qiita Team
Community
OrganizationEventAdvent CalendarQiitadon (β)
Service
Qiita JobsQiita ZineQiita Blog
13
Help us understand the problem. What is the problem?

More than 3 years have passed since last update.

@verizi

# 前置き

サンプルをもとにTensorFlow1.3でN225の予測をしましたが、さらに理解を深めます。

# 前回までのコード

``````import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from pandas.plotting import autocorrelation_plot
from pandas.plotting import scatter_matrix
# World stock-index symbols whose CSVs were downloaded into ./data/.
# (Note: "INDEIES" is a misspelling of "INDICES"; kept because the rest
# of the code references this name.)
INDEIES = ["N225",  # Nikkei 225, Japan
           "HSI",   # Hang Seng, Hong Kong
           "GDAXI", # DAX, German
           "DJI",   # Dow, US
           "GSPC",  # S&P 500, US
           "SSEC",  # Shanghai Composite Index (China)
           "BVSP"]  # BOVESPA, Brazil
def tf_confusion_metrics(model, actual_classes, session, feed_dict):
    """Evaluate a one-hot classifier and print precision/recall/F1/accuracy.

    Args:
        model: tensor of per-class scores, shape (batch, num_classes).
        actual_classes: one-hot ground-truth tensor, same shape as model.
        session: tf.Session used to evaluate the counting ops.
        feed_dict: feed dict supplying the placeholders `model` depends on.
    """
    # Class index 1 is treated as the "positive" class.
    predictions = tf.argmax(model, 1)
    actuals = tf.argmax(actual_classes, 1)

    ones_like_actuals = tf.ones_like(actuals)
    zeros_like_actuals = tf.zeros_like(actuals)
    ones_like_predictions = tf.ones_like(predictions)
    zeros_like_predictions = tf.zeros_like(predictions)

    # True positives: actual 1 and predicted 1.
    tp_op = tf.reduce_sum(
        tf.cast(
            tf.logical_and(
                tf.equal(actuals, ones_like_actuals),
                tf.equal(predictions, ones_like_predictions)
            ),
            "float"
        )
    )

    # True negatives: actual 0 and predicted 0.
    tn_op = tf.reduce_sum(
        tf.cast(
            tf.logical_and(
                tf.equal(actuals, zeros_like_actuals),
                tf.equal(predictions, zeros_like_predictions)
            ),
            "float"
        )
    )

    # False positives: actual 0 but predicted 1.
    fp_op = tf.reduce_sum(
        tf.cast(
            tf.logical_and(
                tf.equal(actuals, zeros_like_actuals),
                tf.equal(predictions, ones_like_predictions)
            ),
            "float"
        )
    )

    # False negatives: actual 1 but predicted 0.
    fn_op = tf.reduce_sum(
        tf.cast(
            tf.logical_and(
                tf.equal(actuals, ones_like_actuals),
                tf.equal(predictions, zeros_like_predictions)
            ),
            "float"
        )
    )

    tp, tn, fp, fn = \
        session.run(
            [tp_op, tn_op, fp_op, fn_op],
            feed_dict
        )

    total = float(tp) + float(tn) + float(fp) + float(fn)

    # Guard every ratio against an empty denominator so a degenerate split
    # prints 0.0 instead of raising ZeroDivisionError.
    tpr = float(tp) / (float(tp) + float(fn)) if (tp + fn) else 0.0
    # BUG FIX: the false positive rate is fp / (fp + tn); the original
    # divided by (tp + fn). (fpr is computed for completeness; it is not
    # printed, matching the original behavior.)
    fpr = float(fp) / (float(fp) + float(tn)) if (fp + tn) else 0.0

    accuracy = (float(tp) + float(tn)) / total if total else 0.0

    recall = tpr
    precision = float(tp) / (float(tp) + float(fp)) if (tp + fp) else 0.0

    f1_score = (2 * (precision * recall)) / (precision + recall) \
        if (precision + recall) else 0.0

    print('Precision = ', precision)
    print('Recall = ', recall)
    print('F1 Score = ', f1_score)
    print('Accuracy = ', accuracy)
def study():
    """Predict whether the N225 index closes up or down from the 1-3-day-old
    log returns of several world indices.

    Trains two TF1 models on the first 80% of days — a softmax regression
    and a 50/25-unit two-hidden-layer network — printing training accuracy
    every 5000 steps and confusion-matrix metrics on the held-out 20%.
    """
    closing = pd.DataFrame()
    for index in INDEIES:
        # na_values: treat the literal string "null" as NaN (the CSVs contain "null").
        df = pd.read_csv("./data/" + index + ".csv",na_values=["null"])
        df["Date"] = pd.to_datetime(df["Date"])
        df = df.set_index("Date")
        closing[index] = df["Close"]
    # Forward-fill gaps with the most recent earlier value.
    closing = closing.fillna(method="ffill")
    for index in INDEIES:
        # Scale by the series max, then take the log daily return.
        closing[index] = closing[index] / max(closing[index])
        closing[index] = np.log(closing[index] / closing[index].shift())

    closing["positive"] = 0
    # Put 1 in "positive" on rows where closing["N225"] >= 0.
    closing.ix[closing["N225"] >= 0, "positive"] = 1
    closing["negative"] = 0
    # Put 1 in "negative" on rows where closing["N225"] < 0.
    closing.ix[closing["N225"] < 0, "negative"] = 1

    # Use the data from 1-3 days before as predictors.
    days_before = range(1,4)

    columns = ["positive", "negative"]
    for i in days_before :
        columns += [index + "_" + str(i) for index in INDEIES]

    training = pd.DataFrame(
        # ['positive', 'negative', 'N225_1', 'HSI_1', 'GDAXI_1', 'DJI_1', 'GSPC_1', 'SSEC_1', 'BVSP_1', 'N225_2', 'HSI_2', 'GDAXI_2', 'DJI_2', 'GSPC_2', 'SSEC_2', 'BVSP_2', 'N225_3', 'HSI_3', 'GDAXI_3', 'DJI_3', 'GSPC_3', 'SSEC_3', 'BVSP_3']
        columns = columns
    )

    # NOTE(review): unclear why the loop starts at 7 — presumably to skip
    # rows without enough history, though only 3 days back are used; confirm.
    for i in range(7, len(closing)):
        data = {}
        # The labels come from the current day's data.
        data["positive"] = closing["positive"].ix[i]
        data["negative"] = closing["negative"].ix[i]
        # The other indicators use the data from earlier days.
        for index in INDEIES:
            for before in days_before :
                data[index + "_" + str(before)] = closing[index].ix[i - before]
        training = training.append(data, ignore_index=True)
    # Sanity check:
    #print(training.describe())

    # Columns from the 3rd onward are the input data for prediction.
    predictors = training[training.columns[2:]]
    # The first two columns are the values to be predicted.
    classes = training[training.columns[:2]]

    # Use 80% for training.
    training_size = int(len(training) * 0.8)

    training_predictors = predictors[:training_size]
    training_classes = classes[:training_size]
    test_predictors = predictors[training_size:]
    test_classes = classes[training_size:]

    # Number of predictor columns.
    num_predictors = len(training_predictors.columns)
    # Number of target columns.
    num_classes = len(training_classes.columns)

    # Model 1: plain softmax (logistic) regression.
    session = tf.Session()
    feature = tf.placeholder(tf.float32, shape=(None, num_predictors))
    actual_classes = tf.placeholder(tf.float32,  shape=(None, num_classes))
    weights = tf.Variable(tf.truncated_normal([num_predictors, num_classes], stddev=0.0001))
    biases = tf.Variable(tf.ones([num_classes]))
    model = tf.nn.softmax(tf.matmul(feature, weights) + biases)
    # Cross-entropy against the one-hot targets, summed over the batch.
    cost = -tf.reduce_sum(actual_classes*tf.log(model))
    training_step = tf.train.AdamOptimizer(learning_rate=0.0001).minimize(cost)
    init = tf.global_variables_initializer()
    session.run(init)
    correct_prediction = tf.equal(tf.argmax(model, 1), tf.argmax(actual_classes, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    # Reshape the targets once up front instead of on every step.
    actual_classes_param = training_classes.values.reshape(len(training_classes.values), 2)

    for i in range(1, 300001):
        session.run(
            training_step,
            feed_dict={
                feature: training_predictors.values,
                actual_classes: actual_classes_param
            }
        )
        if i%5000 == 0:
            print( i, session.run(
                accuracy,
                feed_dict={
                    feature: training_predictors.values,
                    actual_classes: actual_classes_param
                }
            ))
    feed_dict= {
        feature: test_predictors.values,
        actual_classes: test_classes.values.reshape(len(test_classes.values), 2)
    }

    tf_confusion_metrics(model, actual_classes, session, feed_dict)

    # Model 2: two ReLU hidden layers (50 and 25 units) before the softmax.
    session2 = tf.Session()

    feature = tf.placeholder(tf.float32, shape=(None, num_predictors))
    actual_classes = tf.placeholder(tf.float32,  shape=(None, num_classes))

    weights1 = tf.Variable(tf.truncated_normal([num_predictors, 50], stddev=0.0001))
    biases1 = tf.Variable(tf.ones([50]))

    weights2 = tf.Variable(tf.truncated_normal([50, 25], stddev=0.0001))
    biases2 = tf.Variable(tf.ones([25]))

    weights3 = tf.Variable(tf.truncated_normal([25, 2], stddev=0.0001))
    biases3 = tf.Variable(tf.ones([2]))

    hidden_layer_1 = tf.nn.relu(tf.matmul(feature, weights1) + biases1)
    hidden_layer_2 = tf.nn.relu(tf.matmul(hidden_layer_1, weights2) + biases2)
    model = tf.nn.softmax(tf.matmul(hidden_layer_2, weights3) + biases3)

    cost = -tf.reduce_sum(actual_classes*tf.log(model))

    train_op1 = tf.train.AdamOptimizer(learning_rate=0.0001).minimize(cost)

    init = tf.global_variables_initializer()
    session2.run(init)

    correct_prediction = tf.equal(tf.argmax(model, 1), tf.argmax(actual_classes, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    for i in range(1, 300001):
        session2.run(
            train_op1,
            feed_dict={
                feature: training_predictors.values,
                actual_classes: actual_classes_param
            }
        )
        if i%5000 == 0:
            print( i, session2.run(
                accuracy,
                feed_dict={
                    feature: training_predictors.values,
                    actual_classes: actual_classes_param
                }
            ))

    feed_dict= {
        feature: test_predictors.values,
        actual_classes: test_classes.values.reshape(len(test_classes.values), 2)
    }

    tf_confusion_metrics(model, actual_classes, session2, feed_dict)

# Script entry point: run the full load/train/evaluate pipeline.
if __name__ == "__main__":
    study()
``````

# なんとなく整理しちゃう

コードがよくわからない時にまずいつもやっています。
なんかコードが冗長だしカラムの数をハードコーディングしていたりするのを改善します。

またテストの出力は以下のようなものがでてきます。

``````Precision =  0.6395939086294417
Recall =  0.5338983050847458
F1 Score =  0.581986143187067
Accuracy =  0.6298568507157464
``````

Precisionは上がっていると判断したもののうち実際に上がっていた割合。
Recallは実際に上がっていたもののうち上がっていると判断できた割合。
F1 ScoreはPrecisionとRecallの調和平均。

うん　見てもどう評価していいのかわからないので消しちゃいます。Accuracyが高ければいいもん。
あとは単純にプログラムの知識だけで出力が変わらないようリファクタをし、ついでに隠れ層はいくつになってもいいように拡張しました。

study.py
``````import numpy as np
import pandas as pd
from model import Model
# ダウンロードしてきたやつ
# World stock-index symbols whose CSVs were downloaded into ./data/.
# ("INDEIES" is a misspelling of "INDICES"; kept for compatibility.)
INDEIES = ["N225",  # Nikkei 225, Japan
           "HSI",   # Hang Seng, Hong Kong
           "GDAXI", # DAX, German
           "DJI",   # Dow, US
           "GSPC",  # S&P 500, US
           "SSEC",  # Shanghai Composite Index (China)
           "BVSP"]  # BOVESPA, Brazil
def getClosing():
    """Load daily closes for every index in INDEIES and normalize them.

    Each index column becomes its log daily return (after scaling by the
    series max); one-hot "positive"/"negative" label columns are added from
    the sign of the N225 return.

    Returns:
        pd.DataFrame indexed by Date: one column per index plus
        "positive" and "negative".
    """
    closing = pd.DataFrame()
    for index in INDEIES:
        # na_values: treat the literal string "null" as NaN (the CSVs contain "null").
        df = pd.read_csv("./data/" + index + ".csv", na_values=["null"])
        df["Date"] = pd.to_datetime(df["Date"])
        df = df.set_index("Date")
        closing[index] = df["Close"]
    # Forward-fill gaps with the most recent earlier value.
    # (DataFrame.ffill replaces the deprecated fillna(method="ffill").)
    closing = closing.ffill()
    for index in INDEIES:
        # Scale by the series max, then take the log daily return.
        closing[index] = closing[index] / max(closing[index])
        closing[index] = np.log(closing[index] / closing[index].shift())
    closing["positive"] = 0
    # .loc replaces the .ix indexer removed from modern pandas;
    # put 1 in "positive" on rows where the N225 return is >= 0.
    closing.loc[closing["N225"] >= 0, "positive"] = 1
    closing["negative"] = 0
    # Put 1 in "negative" on rows where the N225 return is < 0.
    closing.loc[closing["N225"] < 0, "negative"] = 1
    return closing
def getTraningData():
    """Build the supervised data set from the normalized closing frame.

    For each day i (from 7 onward) the answer row is that day's
    positive/negative flags and the feature row holds every index's
    log return from 1-3 days earlier.

    Returns:
        (features, answers): two row-aligned DataFrames.
    """
    closing = getClosing()

    # Use the data from 1-3 days before as features.
    days_before = range(1, 4)

    feature_columns = []
    for before in days_before:
        feature_columns += [index + "_" + str(before) for index in INDEIES]

    # Collect rows in plain lists and build each DataFrame once:
    # DataFrame.append was removed in pandas 2.x and was O(n^2) anyway.
    answer_rows = []
    feature_rows = []
    # NOTE(review): unclear why this starts at 7 rather than the maximum
    # look-back (3); kept as-is to preserve the original behavior.
    for i in range(7, len(closing)):
        # The labels use the current day's data (.iloc replaces the removed
        # .ix positional indexing on the Date-indexed frame).
        answer_rows.append({
            "positive": closing["positive"].iloc[i],
            "negative": closing["negative"].iloc[i]})
        data = {}
        # The other indicators use the data from earlier days.
        for index in INDEIES:
            for before in days_before:
                data[index + "_" + str(before)] = closing[index].iloc[i - before]
        feature_rows.append(data)
    answers = pd.DataFrame(answer_rows, columns=["positive", "negative"])
    features = pd.DataFrame(feature_rows, columns=feature_columns)
    # Input data for prediction, then the data to be predicted.
    return features, answers
# Script entry point: train once with no hidden layers and once with two
# hidden layers ([] = none, [50,25] = two layers of 50 and 25 units).
if __name__ == "__main__":
    features,answers = getTraningData()
    for layers in [[],[50,25]]:
        model = Model(features,answers,layers)
        model.train(3000)
        print('Accuracy = ',  model.test())

``````
model.py
``````import tensorflow as tf
class featuresAndAnswers:
    """Plain pair object: a feature frame and its row-aligned answer frame."""

    def __init__(self, features, answers):
        # Store both halves of the pair; no validation, by design.
        self.answers = answers
        self.features = features
class trainingAndTest():
    """Split a (features, answers) pair into training head and test tail.

    `percentage` is the fraction of rows that goes to training; the
    remainder becomes the test set.
    """

    def __init__(self, features, answers, percentage):
        split_at = int(len(features) * percentage)
        # First split_at rows train the model; the rest evaluate it.
        # (The attribute name "traning" is a historical typo kept for
        # compatibility with callers.)
        self.traning = featuresAndAnswers(features[:split_at], answers[:split_at])
        self.test = featuresAndAnswers(features[split_at:], answers[split_at:])
        # Number of feature columns (N225_1, HSI_1, ...).
        self.feature_type_count = len(features.columns)
        # Number of answer columns (positive / negative, i.e. 2).
        self.answer_type_count = len(answers.columns)
class Model:
    """TF1 feed-forward softmax classifier over a prepared feature frame.

    `layers` lists hidden-layer widths: [] (or None) builds plain softmax
    regression; [50, 25] builds two ReLU hidden layers.
    """

    def __init__(self, features, answers, layers=None):
        # BUG FIX: avoid a mutable default argument; None means "no hidden
        # layers" and is backward-compatible with the old default of [].
        if layers is None:
            layers = []
        # Use 80% for training, the rest for testing.
        self.data = trainingAndTest(features, answers, 0.8)
        # Placeholders are graph inputs supplied at run time via feed_dict.
        self.real_answer = tf.placeholder(tf.float32,  shape=(None, self.data.answer_type_count))
        self.feature = tf.placeholder(tf.float32, shape=(None, self.data.feature_type_count))
        self.model = self.createTfModel(layers)
        # Cross-entropy against the targets, summed over the whole batch.
        cost = -tf.reduce_sum(self.real_answer*tf.log(self.model))
        # Adam optimizer; TensorFlow ships about ten alternatives.
        self.step = tf.train.AdamOptimizer(learning_rate=0.0001).minimize(cost)
        self.session = tf.Session()
        self.session.run(tf.global_variables_initializer())

        # Accuracy: fraction of rows whose predicted argmax matches the target.
        correct_prediction = tf.equal(tf.argmax(self.model, 1), tf.argmax(self.real_answer, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    def createTfModel(self, layers):
        """Build the layered graph and return the softmax output tensor."""
        hidden_layer = None
        loop_count_max = len(layers)+1
        for loop_count in range(0,loop_count_max):
            # BUG FIX: compare ints with ==, not "is" — identity comparison
            # of ints is implementation-dependent (SyntaxWarning in 3.8+).
            if loop_count == len(layers):
                # The final layer outputs one score per answer column
                # (positive/negative, i.e. 2).
                need_answer_count = self.data.answer_type_count
            else:
                # Hidden layers output their configured neuron count.
                need_answer_count = layers[loop_count]
            if loop_count == 0:
                # The first layer consumes the raw input features.
                feature = self.feature
                feature_type_count = self.data.feature_type_count
            else:
                # Later layers consume the previous hidden layer's output.
                feature = hidden_layer
                feature_type_count = layers[loop_count-1]
            # truncated_normal: random init within 2 stddev (0.0001) of 0.
            weights = tf.Variable(tf.truncated_normal([feature_type_count, need_answer_count], stddev=0.0001))
            # Bias vector, one per output unit.
            biases = tf.Variable(tf.ones([need_answer_count]))
            # matmul is the matrix product feature * weights.
            logits = tf.matmul(feature, weights) + biases
            if loop_count == loop_count_max - 1:
                # The last layer is softmax.
                return tf.nn.softmax(logits)
            # ReLU (Rectified Linear Unit) activation for hidden layers.
            hidden_layer = tf.nn.relu(logits)

    def train(self,count=30000,print_count=5):
        """Run `count` optimizer steps on the training split, printing the
        training accuracy print_count times along the way."""
        feed_dict = {
            self.feature: self.data.traning.features,
            self.real_answer: self.data.traning.answers
        }
        for i in range(1, count+1):
            # Evaluate self.step (one gradient update) from feed_dict.
            self.session.run(self.step,feed_dict)
            if i % (count/print_count) == 0:
                # Evaluating self.accuracy returns the current accuracy.
                print( i, self.session.run(self.accuracy,feed_dict))

    def test(self):
        """Return the accuracy on the held-out test split."""
        predictions = tf.argmax(self.model, 1)
        real_answers = tf.argmax(self.real_answer, 1)
        count_correct_answer = tf.reduce_sum(
            # Cast booleans to float so each True counts as 1.
            tf.cast(
                tf.equal(real_answers, predictions),
                tf.float32
            )
        )
        correct_answer_count = self.session.run(
            count_correct_answer,
            {self.feature: self.data.test.features,
             self.real_answer: self.data.test.answers}
        )
        return  correct_answer_count/len(self.data.test.answers)

``````

もともとある変数名がtp_opとかpredictorとかよくわからなかったので簡単な単語に変えさらにリファクタリングすると分かりやすくなりました。コメント見てもらえれば大体わかると思います。

``````    feed_dict= {
feature: test_predictors.values,
actual_classes: test_classes.values.reshape(len(test_classes.values), 2)
}
``````

サンプルでは上記のように60000回もreshapeしてたのはなんだったんだろう。

プログラムは根性だ！

Why not register and get more from Qiita?
1. We will deliver articles that match you
By following users and tags, you can catch up information on technical fields that you are interested in as a whole
2. you can read useful information later efficiently
By "stocking" the articles you like, you can search right away
13
Help us understand the problem. What is the problem?