Python
Machine Learning
Text Mining
Data Science
TensorFlow

Predicting movie review sentiment with Word2vec


I create word embeddings with a CBOW-style Word2vec model and test whether they can be classified with logistic regression. The data used here are movie review texts together with labels indicating whether each review is positive or negative.


Run in a Jupyter notebook.

First, create a function that downloads the data.

In[1]:

import os
import requests
import tarfile

def load_movie_data():
    save_folder_name = 'temp'
    pos_file = os.path.join(save_folder_name, 'rt-polaritydata', 'rt-polarity.pos')
    neg_file = os.path.join(save_folder_name, 'rt-polaritydata', 'rt-polarity.neg')

    # Check if files are already downloaded
    if not os.path.exists(os.path.join(save_folder_name, 'rt-polaritydata')):
        movie_data_url = 'http://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz'

        # Save tar.gz file
        req = requests.get(movie_data_url, stream=True)
        with open('temp_movie_review_temp.tar.gz', 'wb') as f:
            for chunk in req.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
                    f.flush()

        # Extract tar.gz file into temp folder
        tar = tarfile.open('temp_movie_review_temp.tar.gz', "r:gz")
        tar.extractall(path='temp')
        tar.close()

    pos_data = []
    with open(pos_file, 'r', encoding='latin-1') as f:
        for line in f:
            pos_data.append(line.encode('ascii', errors='ignore').decode())
    pos_data = [x.rstrip() for x in pos_data]

    neg_data = []
    with open(neg_file, 'r', encoding='latin-1') as f:
        for line in f:
            neg_data.append(line.encode('ascii', errors='ignore').decode())
    neg_data = [x.rstrip() for x in neg_data]

    texts = pos_data + neg_data
    target = [1]*len(pos_data) + [0]*len(neg_data)

    return texts, target

texts, targets = load_movie_data()
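
As a quick sanity check (this snippet is not in the original notebook), the loaded data can be inspected directly; the rt-polarity dataset should contain 5,331 positive and 5,331 negative sentences:

# Quick sanity check on the loaded data (illustrative; not in the original notebook)
print(len(texts), len(targets))   # expected: 10662 10662
print(texts[0])                   # first (positive) review
print(targets[0], targets[-1])    # 1 for positive reviews, 0 for negative reviews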

Next, import the required libraries.

In[2]:

import tensorflow as tf

import matplotlib.pyplot as plt
import numpy as np
import random
import pickle
import string
import collections
import io
from nltk.corpus import stopwords  # requires the NLTK stopwords corpus (nltk.download('stopwords'))

Define a few helper functions used to prepare the text for the word embeddings: normalization, building the vocabulary, and converting words to integer indices.

In[3]:

# Normalize text
def normalize_text(texts, stops):
    # Lower case
    texts = [x.lower() for x in texts]

    # Remove punctuation
    texts = [''.join(c for c in x if c not in string.punctuation) for x in texts]

    # Remove numbers
    texts = [''.join(c for c in x if c not in '0123456789') for x in texts]

    # Remove stopwords
    texts = [' '.join([word for word in x.split() if word not in stops]) for x in texts]

    # Trim extra whitespace
    texts = [' '.join(x.split()) for x in texts]

    return texts

# Build dictionary of words
def build_dictionary(sentences, vocabulary_size):
    # Turn sentences (list of strings) into lists of words
    split_sentences = [s.split() for s in sentences]
    words = [x for sublist in split_sentences for x in sublist]

    # Initialize list of [word, word_count] pairs, starting with the unknown token
    count = [['RARE', -1]]

    # Add the most frequent words, limited to the N most frequent (N = vocabulary size)
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))

    # Create the dictionary: each word is assigned the current dictionary length,
    # so indices are handed out in order of frequency ('RARE' gets index 0)
    word_dict = {}
    for word, word_count in count:
        word_dict[word] = len(word_dict)

    return word_dict

# Turn text data into lists of integers from the dictionary
def text_to_numbers(sentences, word_dict):
    # Initialize the returned data
    data = []
    for sentence in sentences:
        sentence_data = []
        # For each word, use its dictionary index or the rare-word index (0)
        for word in sentence.split(' '):
            if word in word_dict:
                word_ix = word_dict[word]
            else:
                word_ix = 0
            sentence_data.append(word_ix)
        data.append(sentence_data)
    return data
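
To get a feel for what these helpers do, here is a small illustrative example on two made-up sentences (not part of the original notebook; it loads the NLTK stopword list itself since stops is only defined in the next cell):

# Small illustrative example of the helper functions (not in the original notebook)
stops_example = stopwords.words('english')

toy = ["This movie was great, really great!", "A dull and boring film from 2017."]
toy_norm = normalize_text(toy, stops_example)     # lowercase, strip punctuation/digits/stopwords
toy_dict = build_dictionary(toy_norm, 10)         # 'RARE' gets index 0, then words by frequency
toy_nums = text_to_numbers(toy_norm, toy_dict)    # each sentence becomes a list of word indices
print(toy_norm)
print(toy_dict)
print(toy_nums)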

Preprocessing, combining (averaging) the word embeddings into review vectors, and defining the logistic regression are all done in TensorFlow.

In[4]:

from sklearn.model_selection import train_test_split

def variable_summaries(var):
    """Attach a lot of summaries to a Tensor (for TensorBoard visualization)."""
    with tf.name_scope('summaries'):
        mean = tf.reduce_mean(var)
        tf.summary.scalar('mean', mean)
        with tf.name_scope('stddev'):
            stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
        tf.summary.scalar('stddev', stddev)
        tf.summary.scalar('max', tf.reduce_max(var))
        tf.summary.scalar('min', tf.reduce_min(var))
        tf.summary.histogram('histogram', var)

with tf.name_scope("init") as scope:
    embedding_size = 200
    vocabulary_size = 2000
    batch_size = 100
    max_words = 100
    stops = stopwords.words('english')

with tf.name_scope("preprocessing") as scope:
    texts, targets = load_movie_data()
    word_dictionary = build_dictionary(texts, vocabulary_size)

    texts = normalize_text(texts, stops)
    # Keep only reviews with more than two words
    targets = np.array([targets[ix] for ix, x in enumerate(texts) if len(x.split()) > 2])
    texts = [x for x in texts if len(x.split()) > 2]

    # Convert words to indices and pad/trim each review to max_words entries
    texts = np.array(text_to_numbers(texts, word_dictionary))
    texts = np.array([x[0:max_words] for x in [y + [0]*max_words for y in texts]])

    X_train, X_test, y_train, y_test = train_test_split(texts, targets)

with tf.name_scope("placeholders"):
    x_data = tf.placeholder(shape=[None, max_words], dtype=tf.int32)
    y_target = tf.placeholder(shape=[None, 1], dtype=tf.float32)

with tf.name_scope("embeddings"):
    embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    embed = tf.nn.embedding_lookup(embeddings, x_data)
    # Average the word embeddings of each review into a single vector
    embed_avg = tf.reduce_mean(embed, 1)

with tf.name_scope("logistic_regression"):
    with tf.name_scope("weights"):
        A = tf.Variable(tf.random_normal(shape=[embedding_size, 1]))
        variable_summaries(A)
    with tf.name_scope("bias"):
        b = tf.Variable(tf.random_normal(shape=[1, 1]))
        variable_summaries(b)
    with tf.name_scope("output"):
        model_output = tf.add(tf.matmul(embed_avg, A), b)
        variable_summaries(model_output)

with tf.name_scope("total"):
    loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=model_output, labels=y_target))

with tf.name_scope("train"):
    optimizer = tf.train.AdagradOptimizer(0.005)
    train_step = optimizer.minimize(loss)

with tf.name_scope("score"):
    with tf.name_scope("prediction"):
        prediction = tf.round(tf.sigmoid(model_output))
        predictions_correct = tf.cast(tf.equal(prediction, y_target), tf.float32)
    with tf.name_scope("accuracy"):
        accuracy = tf.reduce_mean(predictions_correct)

tf.summary.scalar('loss', loss)
tf.summary.scalar('accuracy', accuracy)
merged = tf.summary.merge_all()
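
Before training, it can be useful to confirm the tensor shapes. Printing the tensors (a check I added; it is not in the original notebook) should show that each review of max_words indices becomes a single averaged embedding vector and one logit:

# Sanity check of tensor shapes (illustrative; not in the original notebook)
print(embed)         # shape (?, 100, 200): one embedding per word position
print(embed_avg)     # shape (?, 200): review vector = mean of its word embeddings
print(model_output)  # shape (?, 1): one logit per review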

Run the training loop and write logs for TensorBoard.

In[5]:

with tf.Session() as sess:
    train_writer = tf.summary.FileWriter('logs_2/train', sess.graph)
    test_writer = tf.summary.FileWriter('logs_2/test')
    sess.run(tf.global_variables_initializer())

    for i in range(5000):
        # Sample a random minibatch
        index = np.random.choice(X_train.shape[0], size=batch_size)
        X_batch = X_train[index]
        y_batch = np.transpose([y_train[index]])
        summary, _ = sess.run([merged, train_step], feed_dict={x_data: X_batch, y_target: y_batch})
        train_writer.add_summary(summary, i)

        if i % 1000 == 0:
            acc, ls = sess.run([accuracy, loss], feed_dict={x_data: X_batch, y_target: y_batch})
            print("Iter " + str(i) + ", Minibatch Loss= " +
                  "{:.6f}".format(ls) + ", Training Accuracy= " +
                  "{:.5f}".format(acc))
        if i % 10 == 0:
            summary, acc = sess.run([merged, accuracy], feed_dict={x_data: X_test, y_target: np.transpose([y_test])})
            test_writer.add_summary(summary, i)

    test_acc = sess.run(accuracy, feed_dict={x_data: X_test, y_target: np.transpose([y_test])})
    print("Test Accuracy:", test_acc)

Out[5]:

Iter 0, Minibatch Loss= 0.859230, Training Accuracy= 0.52000

Iter 1000, Minibatch Loss= 0.698935, Training Accuracy= 0.55000
Iter 2000, Minibatch Loss= 0.688453, Training Accuracy= 0.52000
Iter 3000, Minibatch Loss= 0.684841, Training Accuracy= 0.51000
Iter 4000, Minibatch Loss= 0.698602, Training Accuracy= 0.53000
Test Accuracy: 0.503075

Let's look at how the loss and accuracy evolve, and at the network graph, in TensorBoard (started from a terminal with tensorboard --logdir logs_2, pointing at the log directory written above).

(Screenshots: TensorBoard loss curve, accuracy curve, and computation graph)

Looking at the results, the model is only marginally more accurate than a random predictor, so it is not a very good model. For comparison, here are the results using scikit-learn's tf-idf:

In[6]:

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, train_test_split

# Re-load the raw review strings (texts was converted to integer indices above)
texts, targets = load_movie_data()

pipe = Pipeline([("vectorizer", TfidfVectorizer(min_df=4)), ("clf", LogisticRegression())])
grid = GridSearchCV(pipe, param_grid={"vectorizer__min_df": [4, 5, 6], "clf__C": [0.001, 0.01, 0.1, 1, 10]})
X_train, X_test, y_train, y_test = train_test_split(texts, targets)
grid.fit(X_train, y_train)
grid.best_params_, grid.best_score_

Out[6]:

({'clf__C': 1, 'vectorizer__min_df': 5}, 0.73461730865432717)

Using scikit-learn's tf-idf features gives more than 20 percentage points higher accuracy.
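
Since grid.best_score_ is a cross-validation score on the training split, a fairer comparison with the TensorFlow model's test accuracy is to evaluate the fitted grid on the held-out split as well. A minimal check (not in the original notebook) would be:

# Illustrative: accuracy of the best tf-idf + logistic regression model on the held-out split
print(grid.score(X_test, y_test))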


References

https://github.com/nfmcclure/tensorflow_cookbook/tree/master/07_Natural_Language_Processing