## MLPでテキスト分類してみよう
参照に挙げた書籍の6章(p.290)で、テキスト分類をするコードがKerasで書かれていたので、TensorFlowで書き直しました。
#### コード
import tensorflow as tf
import json
from sklearn.model_selection import train_test_split
from keras.utils import np_utils
from sklearn.utils import shuffle
# Training-loop settings.
batch_size = 64  # number of samples fed to each optimizer step
epochs = 20      # number of passes over the training split

# Number of target categories: used as the one-hot label width and as the
# size of the network's output layer.
nb_classes = 9
def weight_variable(name, shape):
    """Create a weight variable named ``"W_" + name``.

    The initial values come from ``tf.truncated_normal`` with a standard
    deviation of 0.01.

    Args:
        name: Suffix appended to ``"W_"`` to form the variable's name.
        shape: Shape handed to ``tf.truncated_normal``.

    Returns:
        The newly created ``tf.Variable``.
    """
    return tf.Variable(
        tf.truncated_normal(shape, stddev=0.01),
        name="W_" + name,
    )
def bias_variable(name, size):
    """Create a bias variable named ``"b_" + name``.

    Every one of the ``size`` entries starts at the constant 0.1.

    Args:
        name: Suffix appended to ``"b_"`` to form the variable's name.
        size: Length of the one-dimensional bias vector.

    Returns:
        The newly created ``tf.Variable``.
    """
    initial_value = tf.constant(0.1, shape=[size])
    return tf.Variable(initial_value, name="b_" + name)
# Load the dataset.  To run against the alternative data file instead, change
# the path to "./newstext/data-mini.json".
# The context manager closes the file handle as soon as parsing finishes
# (the bare json.load(open(...)) form left it open).
with open("./newstext/data.json", encoding="utf-8") as data_file:
    data = json.load(data_file)

X = data["X"]
Y = data["Y"]
# Input width of the network, taken from the first row.
# NOTE(review): assumes every row of X has this same length -- confirm.
max_words = len(X[0])

# Hold out part of the data for evaluation (train_test_split defaults).
X_train, X_test, Y_train, Y_test = train_test_split(X, Y)

# Convert the labels to one-hot rows of width nb_classes.
Y_train = np_utils.to_categorical(Y_train, nb_classes)
Y_test = np_utils.to_categorical(Y_test, nb_classes)
print(len(X_train), len(Y_train))
# ---- Model: one ReLU hidden layer with dropout, then a softmax output ----
hidden_units = 512

# Inputs: feature rows of width max_words and one-hot labels of width
# nb_classes; the leading dimension is left open (None).
x = tf.placeholder(tf.float32, shape=[None, max_words])
y_ = tf.placeholder(tf.float32, shape=[None, nb_classes])

# Hidden layer.
W = weight_variable("num1", [max_words, hidden_units])
b = bias_variable("num1", hidden_units)
hidden_pre_activation = tf.matmul(x, W) + b
h = tf.nn.relu(hidden_pre_activation)

# The dropout keep probability is supplied through feed_dict at run time.
keep_prob = tf.placeholder(tf.float32)
h_drop = tf.nn.dropout(h, keep_prob)

# Output layer.
W2 = weight_variable("num2", [hidden_units, nb_classes])
b2 = bias_variable("num2", nb_classes)
logits = tf.matmul(h_drop, W2) + b2
y = tf.nn.softmax(logits)

# Cross-entropy summed over everything that is fed in.  The probabilities are
# clipped to [1e-10, 1.0] before tf.log so that log(0) is never evaluated.
clipped_probabilities = tf.clip_by_value(y, 1e-10, 1.0)
cross_entoropy = -tf.reduce_sum(y_ * tf.log(clipped_probabilities))

optimizer = tf.train.AdamOptimizer(1e-4)
train_step = optimizer.minimize(cross_entoropy)

# Accuracy: fraction of rows whose arg-max label matches the arg-max output.
true_class = tf.argmax(y_, 1)
predicted_class = tf.argmax(y, 1)
predict_step = tf.equal(true_class, predicted_class)
accuracy_step = tf.reduce_mean(tf.cast(predict_step, tf.float32))
# Train for `epochs` passes, printing the test loss and test accuracy after
# each pass and the test accuracy once more when training is finished.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    # Evaluation always feeds the held-out split with a keep probability of 1,
    # so the feed dict is built once instead of once per sess.run call.
    eval_feed = {x: X_test, y_: Y_test, keep_prob: 1.0}

    N_train = len(X_train)
    # Floor division: the trailing N_train % batch_size samples of each
    # shuffled epoch are not used for a training step.
    n_batches = N_train // batch_size

    for epoch in range(epochs):
        # Reshuffle features and labels together at the start of every epoch.
        X_, Y_ = shuffle(X_train, Y_train)

        for i in range(n_batches):
            start = i * batch_size
            end = start + batch_size
            sess.run(train_step, feed_dict={
                x: X_[start:end],
                y_: Y_[start:end],
                keep_prob: 0.5,
            })

        # Fetch both metrics in a single run; two separate runs would push
        # the whole test split through the network twice per epoch.
        loss, accuracy = sess.run(
            [cross_entoropy, accuracy_step], feed_dict=eval_feed
        )
        print(loss)
        print(accuracy)

    acc = sess.run(accuracy_step, feed_dict=eval_feed)
    print(acc)
#### 結果

各エポック終了時に、テストデータに対する損失(cross_entoropy)と正解率を1行ずつ交互に出力しています。

2323.9214
0.83821934
1160.6143
0.9082519
747.41626
0.92508143
572.6161
0.9391965
427.56378
0.9457112
397.68292
0.94679695
371.37494
0.95222586
353.74292
0.95222586
340.4296
0.95222586
331.1926
0.95222586
326.9084
0.9533116
320.22305
0.9538545
313.026
0.95276874
311.2329
0.9533116
306.23077
0.9516829
307.86145
0.9533116
305.60995
0.95222586
303.2419
0.9533116
300.61127
0.9516829
## 参照
Pythonによるスクレイピング&機械学習 開発テクニック BeautifulSoup、scikit-learn、TensorFlowを使ってみよう(著者:クジラ飛行机、出版社:ソシム株式会社)
http://www.socym.co.jp/book/1079