LoginSignup
5
4

More than 5 years have passed since last update.

Pythonによるスクレイピング&機械学習[6-4]

Posted at

MLPでテキスト分類してみよう

6章で、テキスト分類をするコードがKerasで書かれていたので、TensorFlowで書き直しました。(p290)

コード

import tensorflow as tf
import json
from sklearn.model_selection import train_test_split
from keras.utils import np_utils
from sklearn.utils import shuffle

nb_classes = 9

batch_size = 64
epochs = 20

def weight_variable(name, shape):
    """Create a TensorFlow variable named ``"W_" + name``.

    The initial value is sampled from a truncated normal distribution
    with standard deviation 0.01 and the given ``shape``.
    """
    return tf.Variable(
        tf.truncated_normal(shape, stddev=0.01),
        name="W_" + name,
    )

def bias_variable(name, size):
    """Create a TensorFlow variable named ``"b_" + name``.

    The initial value is a 1-D constant tensor of length ``size`` whose
    elements are all 0.1.
    """
    initial_value = tf.constant(0.1, shape=[size])
    return tf.Variable(initial_value, name="b_" + name)

# Load the dataset. To use the smaller dataset instead, point this at
# "./newstext/data-mini.json".
# The context manager guarantees the file handle is closed once parsing
# is done (the bare ``open()`` call previously left it to the GC).
with open("./newstext/data.json", encoding="utf-8") as data_file:
    data = json.load(data_file)
X = data["X"]
Y = data["Y"]
# Width of the network input, taken from the first sample
# (assumes every sample in X has the same length -- TODO confirm).
max_words = len(X[0])

# Split into training and test portions using train_test_split's defaults.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y)
# Convert the class labels into nb_classes-wide categorical vectors so they
# match the shape of the label placeholder.
Y_train = np_utils.to_categorical(Y_train, nb_classes)
Y_test = np_utils.to_categorical(Y_test, nb_classes)
print(len(X_train), len(Y_train))

# Placeholders: a batch of input vectors and the matching label vectors.
x = tf.placeholder(tf.float32, shape=[None, max_words])
y_ = tf.placeholder(tf.float32, shape=[None, nb_classes])

# Hidden layer: max_words -> 512 units followed by ReLU.
W = weight_variable("num1", [max_words, 512])
b = bias_variable("num1", 512)
h = tf.nn.relu(tf.add(tf.matmul(x, W), b))

# Dropout on the hidden activations; the keep probability is fed at run
# time so training and evaluation can use different values.
keep_prob = tf.placeholder(tf.float32)
h_drop = tf.nn.dropout(h, keep_prob)

# Output layer: 512 -> nb_classes units followed by softmax.
W2 = weight_variable("num2", [512, nb_classes])
b2 = bias_variable("num2", nb_classes)
logits = tf.add(tf.matmul(h_drop, W2), b2)
y = tf.nn.softmax(logits)

# Cross-entropy summed over every element of the fed batch. The softmax
# output is clipped to [1e-10, 1.0] so that log() never receives zero.
clipped_y = tf.clip_by_value(y, 1e-10, 1.0)
cross_entoropy = tf.negative(
    tf.reduce_sum(tf.multiply(y_, tf.log(clipped_y)))
)

optimizer = tf.train.AdamOptimizer(1e-4)
train_step = optimizer.minimize(cross_entoropy)

# Accuracy: fraction of rows whose arg-max prediction equals the arg-max
# of the label vector.
true_class = tf.argmax(y_, 1)
predicted_class = tf.argmax(y, 1)
predict_step = tf.equal(true_class, predicted_class)
accuracy_step = tf.reduce_mean(tf.cast(predict_step, tf.float32))

# Train for `epochs` passes over the training data, printing the test-set
# loss and accuracy after each epoch and the final test accuracy at the end.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    N_train = len(X_train)

    # The test set is always evaluated with dropout disabled
    # (keep_prob = 1), so the feed dict is built once and reused.
    test_feed = {
        x: X_test,
        y_: Y_test,
        keep_prob: 1.0
    }

    for epoch in range(epochs):
        # Re-shuffle every epoch so mini-batches differ between epochs.
        X_, Y_ = shuffle(X_train, Y_train)

        # Step through the data in strides of batch_size. Unlike an
        # ``N_train // batch_size`` batch count, this also trains on the
        # trailing partial batch and still runs when N_train < batch_size;
        # slicing past the end simply yields a shorter batch.
        for start in range(0, N_train, batch_size):
            end = start + batch_size

            sess.run(train_step, feed_dict={
                x: X_[start:end],
                y_: Y_[start:end],
                keep_prob: 0.5
            })

        # Fetch loss and accuracy in a single run so the test set passes
        # through the network once per epoch instead of twice.
        loss, accuracy = sess.run(
            [cross_entoropy, accuracy_step], feed_dict=test_feed
        )
        print(loss)
        print(accuracy)

    # Final test accuracy after all epochs.
    acc = sess.run(accuracy_step, feed_dict=test_feed)
    print(acc)

結果

2323.9214
0.83821934
1160.6143
0.9082519
747.41626
0.92508143
572.6161
0.9391965
427.56378
0.9457112
397.68292
0.94679695
371.37494
0.95222586
353.74292
0.95222586
340.4296
0.95222586
331.1926
0.95222586
326.9084
0.9533116
320.22305
0.9538545
313.026
0.95276874
311.2329
0.9533116
306.23077
0.9516829
307.86145
0.9533116
305.60995
0.95222586
303.2419
0.9533116
300.61127
0.9516829

参照

Pythonによるスクレイピング&機械学習 開発テクニック BeautifulSoup、scikit-learn、TensorFlowを使ってみよう(著者:クジラ飛行机、出版社:ソシム株式会社)
http://www.socym.co.jp/book/1079

5
4
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
5
4