# 日本語の固有表現抽出をanaGoで行う

Hironsanの記事( https://qiita.com/Hironsan/items/326b66711eb4196aa9d4 )で使われているhironsan.txtを用いて、彼の作ったanaGoを試します。
( https://github.com/Hironsan/anago )

# hironsan.txtダウンロードと整形

https://github.com/Hironsan/IOB2Corpus

transform.py
# coding: utf-8
with open("hironsan.txt") as file:

data_spl = [x.split("\t") for x in data]
data_fin = [[x[0],x[-1]] for x in data_spl]
with open("hironsan_anago.txt", "w") as file:
for x in data_fin:
file.write(str(x[0])+"\t"+str(x[1])+"\n")
train_test_split.py
# coding: utf-8
def my_train_test_split(data, train_size=0.8):
train_num = int(len(data) * train_size)
return data[:train_num], data[train_num:]

with open("hironsan_anago.txt", "r") as file:
train, test = my_train_test_split(data)
valid, test = my_train_test_split(test, train_size=0.5)

with open("hironsan_train.txt", "w") as file:
for x in train:
file.write(x+"\n")

with open("hironsan_test.txt", "w") as file:
for x in test:
file.write(x+"\n")

with open("hironsan_valid.txt", "w") as file:
for x in valid:
file.write(x+"\n")

これらを実行すると、4つのファイルが生成されます。

• hironsan_anago.txt
• hironsan_train.txt
• hironsan_test.txt
• hironsan_valid.txt

# 学習済みword2vecを取得・embeddingファイルを生成

\$ mkdir w2v_model
\$ cd w2v_model
\$ wget http://public.shiroyagi.s3.amazonaws.com/latest-ja-word2vec-gensim-model.zip
\$ unzip latest-ja-word2vec-gensim-model.zip
# coding: utf-8
from gensim.models import Word2Vec
word_vectors = model.wv
word_vectors.save_word2vec_format("emb.txt", binary=False)

emb.txtはgithub上で説明されているglove.6B.100d.txtに対応します。

# anaGoの修正

def load_word_embeddings(vocab, glove_filename, dim):
"""Loads GloVe vectors in numpy array.

Args:
vocab (): dictionary vocab[word] = index.
glove_filename (str): a path to a glove file.
dim (int): dimension of embeddings.

Returns:
numpy array: an array of word embeddings.
"""

embeddings = np.zeros([len(vocab), dim])
with open(glove_filename) as f:
for line in f:
line = line.strip().split(' ')
#word = line[0]
word_i = 1
for i in range(dim):
try:
tmp = float(line[i])
if tmp < 1.0:
break
else:
word_i = i+1
except:
word_i = i+1

#print(word_i)
word = ' '.join([str(x) for x in line[0: word_i]])
embedding = [float(x) for x in line[word_i:dim+word_i]]
if word in vocab:
word_idx = vocab[word]
embeddings[word_idx] = np.asarray(embedding)

return embeddings

evaluator.py
def eval(self, x_test, y_test):

# Prepare test data(steps, generator)
train_steps, train_batches = batch_iter(x_test, y_test, self.config.batch_size, preprocessor=self.preprocessor)

# Build the model
model = SeqLabeling(self.config, ntags=len(self.preprocessor.vocab_tag))

# Build the evaluator and evaluate the model
f1score = F1score(train_steps, train_batches, self.preprocessor)
f1score.model = model
f1score.on_epoch_end(epoch=-1)  # epoch takes any integer.

この場合、batch_iterにx_testとy_testを渡していますが、元のコードは以下のようになっています。

wrongver_evaluator.py
def eval(self, x_test, y_test):

# Prepare test data(steps, generator)
train_steps, train_batches = batch_iter(
list(zip(x_test, y_test)), self.config.batch_size, preprocessor=self.preprocessor)

# Build the model
model = SeqLabeling(self.config, ntags=len(self.preprocessor.vocab_tag))

# Build the evaluator and evaluate the model
f1score = F1score(train_steps, train_batches, self.preprocessor)
f1score.model = model
f1score.on_epoch_end(epoch=-1)

batch_iterへの引数の渡し方が間違っていたようです。

config.py
class ModelConfig(object):
"""Wrapper class for model hyperparameters."""

def __init__(self):
"""Sets the default model hyperparameters."""

# Number of unique words in the vocab (plus 2, for <UNK>, <PAD>).
self.vocab_size = None
self.char_vocab_size = None

# Batch size.
self.batch_size = 32

# Scale used to initialize model variables.
self.initializer_scale = 0.08

# LSTM input and output dimensionality, respectively.
self.char_embedding_size = 25
self.num_char_lstm_units = 25
self.word_embedding_size = 50
self.num_word_lstm_units = 50

# If < 1.0, the dropout keep probability applied to LSTM variables.
self.dropout = 0.5

# If True, use character feature.
self.char_feature = True

# If True, use crf.
self.crf = True

class TrainingConfig(object):
"""Wrapper class for training hyperparameters."""

def __init__(self):
"""Sets the default training hyperparameters."""

# Batch size
self.batch_size = 10

# Optimizer for training the model.

# Learning rate for the initial phase of training.
self.learning_rate = 0.001
self.lr_decay = 0.9

# If not None, clip gradients to this value.

# The number of max epoch size
self.max_epoch = 50

# Parameters for early stopping
self.early_stopping = True
self.patience = 3

# Fine-tune word embeddings
self.train_embeddings = True

# How many model checkpoints to keep.
self.max_checkpoints_to_keep = 5

# 訓練・テスト

それでは、実行してみます。hironsan_*.txtはdata2/hionsan/へ入れて、modelsディレクトリとlogsディレクトリを作成してくだい。emb.txtはdata2/へ入れます。

# coding: utf-8
import os
import anago
from anago.data.preprocess import prepare_preprocessor
from anago.config import ModelConfig, TrainingConfig

DATA_ROOT = 'data2/hironsan/'
SAVE_ROOT = './models'  # trained model
LOG_ROOT = './logs'     # checkpoint, tensorboard
embedding_path = './data2/emb.txt'

model_config = ModelConfig()
training_config = TrainingConfig()

train_path = os.path.join(DATA_ROOT, 'hironsan_train.txt')
valid_path = os.path.join(DATA_ROOT, 'hironsan_valid.txt')
test_path = os.path.join(DATA_ROOT, 'hironsan_test.txt')

x_train, y_train = load_data_and_labels(train_path)
x_valid, y_valid = load_data_and_labels(valid_path)
x_test, y_test = load_data_and_labels(test_path)

p = prepare_preprocessor(x_train, y_train)

embeddings = load_word_embeddings(p.vocab_word, embedding_path, 50)
model_config.vocab_size = len(p.vocab_word)
model_config.char_vocab_size = len(p.vocab_char)

trainer = anago.Trainer(model_config, training_config, checkpoint_path=LOG_ROOT, save_path=SAVE_ROOT,preprocessor=p, embeddings=embeddings)
trainer.train(x_train, y_train, x_valid, y_valid)

weights = 'model_weights.h5'
evaluator = anago.Evaluator(model_config, weights, save_path=SAVE_ROOT, preprocessor=p)
evaluator.eval(x_test, y_test)

Out[1]:

- f1: 56.88

F値はそれほど高くはないことがわかります。

# 考察

また、利用したword2vecモデルがどのように分かち書きされたのかにも影響するでしょう。

config.pyのearly_stoppingをFalseにしてepochを増やせば精度が高まる可能性があります。

