
# Japanese Named Entity Recognition with anaGo


Using the hironsan.txt file from Hironsan's article ( https://qiita.com/Hironsan/items/326b66711eb4196aa9d4 ), let's try anaGo, the NER library he wrote ( https://github.com/Hironsan/anago ).

# Downloading and reformatting hironsan.txt

https://github.com/Hironsan/IOB2Corpus

```python:transform.py
# coding: utf-8
# hironsan.txt is tab-separated; keep only the first column (surface form)
# and the last column (IOB2 tag).
with open("hironsan.txt") as file:
    data = [line.rstrip("\n") for line in file]

data_spl = [x.split("\t") for x in data]
data_fin = [[x[0], x[-1]] for x in data_spl]

with open("hironsan_anago.txt", "w") as file:
    for x in data_fin:
        file.write(str(x[0]) + "\t" + str(x[1]) + "\n")
```
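For reference, this turns multi-column IOB2 lines into simple token/tag pairs. An illustrative input line (the middle morphological-feature column is written from memory, so treat it as an assumption):

```
2005年	名詞,数,*,*,*,*,*	B-DATE
```

becomes, in hironsan_anago.txt:

```
2005年	B-DATE
```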
```python:train_test_split.py
# coding: utf-8
def my_train_test_split(data, train_size=0.8):
    train_num = int(len(data) * train_size)
    return data[:train_num], data[train_num:]

with open("hironsan_anago.txt", "r") as file:
    data = [line.rstrip("\n") for line in file]

# 80% train; the remaining 20% is split in half into validation and test.
train, test = my_train_test_split(data)
valid, test = my_train_test_split(test, train_size=0.5)

with open("hironsan_train.txt", "w") as file:
    for x in train:
        file.write(x + "\n")

with open("hironsan_test.txt", "w") as file:
    for x in test:
        file.write(x + "\n")

with open("hironsan_valid.txt", "w") as file:
    for x in valid:
        file.write(x + "\n")
```
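Because the split is sequential rather than random, lines keep their original order. A quick check of the helper:

```python
>>> my_train_test_split(list(range(10)), train_size=0.8)
([0, 1, 2, 3, 4, 5, 6, 7], [8, 9])
>>> my_train_test_split([8, 9], train_size=0.5)
([8], [9])
```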

Running these two scripts produces four files:

- hironsan_anago.txt
- hironsan_train.txt
- hironsan_test.txt
- hironsan_valid.txt

# Downloading a pretrained word2vec model and generating the embedding file

```bash
$ mkdir w2v_model
$ cd w2v_model
$ wget http://public.shiroyagi.s3.amazonaws.com/latest-ja-word2vec-gensim-model.zip
$ unzip latest-ja-word2vec-gensim-model.zip
```
```python
# coding: utf-8
from gensim.models import Word2Vec

# Load the pretrained model (the zip should contain word2vec.gensim.model).
model = Word2Vec.load("word2vec.gensim.model")
word_vectors = model.wv
# Dump the vectors as a word2vec-format text file.
word_vectors.save_word2vec_format("emb.txt", binary=False)
```

emb.txt plays the role of the glove.6B.100d.txt file described in the anaGo documentation on GitHub.
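One difference from a GloVe file worth knowing: gensim's text dump starts with a header line `<vocab_size> <dim>`. A quick sanity check that the dimensionality matches the 50 used later:

```python
# Check the header of emb.txt: "<vocab_size> <dim>".
with open("emb.txt") as f:
    vocab_size, dim = f.readline().split()

print(vocab_size, dim)  # dim should be 50 to match word_embedding_size
```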

# Fixing anaGo

First, `load_word_embeddings` is modified so that it can handle lines whose word field itself contains spaces:

```python
def load_word_embeddings(vocab, glove_filename, dim):
    """Loads GloVe-format vectors into a numpy array.

    Args:
        vocab (dict): dictionary vocab[word] = index.
        glove_filename (str): a path to a glove-format file.
        dim (int): dimension of embeddings.

    Returns:
        numpy array: an array of word embeddings.
    """
    embeddings = np.zeros([len(vocab), dim])
    with open(glove_filename) as f:
        for line in f:
            line = line.strip().split(' ')
            # The word field may contain spaces, so scan for the first
            # token that parses as a float < 1.0 and treat everything
            # before it as the word.
            word_i = 1
            for i in range(dim):
                try:
                    tmp = float(line[i])
                    if tmp < 1.0:
                        break
                    else:
                        word_i = i + 1
                except (ValueError, IndexError):
                    word_i = i + 1

            word = ' '.join([str(x) for x in line[0:word_i]])
            embedding = [float(x) for x in line[word_i:dim + word_i]]
            if word in vocab:
                word_idx = vocab[word]
                embeddings[word_idx] = np.asarray(embedding)

    return embeddings
```
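To see what this boundary heuristic does, here is a minimal standalone reimplementation (`find_word_end` is a name introduced here, not part of anaGo) applied to an illustrative multi-word entry:

```python
def find_word_end(tokens, dim):
    """Mirror of the heuristic above: scan up to dim tokens and return
    the index where the word ends and the vector begins."""
    word_i = 1
    for i in range(dim):
        try:
            if float(tokens[i]) < 1.0:
                break
            word_i = i + 1
        except (ValueError, IndexError):
            word_i = i + 1
    return word_i

tokens = "New York 0.12 -0.34 0.56".split(" ")
assert find_word_end(tokens, dim=3) == 2  # word = "New York"
```

One limitation: an entry whose first vector component is >= 1.0 would have that number absorbed into the word.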

```python:evaluator.py
def eval(self, x_test, y_test):

    # Prepare test data (steps, generator)
    train_steps, train_batches = batch_iter(
        x_test, y_test, self.config.batch_size, preprocessor=self.preprocessor)

    # Build the model
    model = SeqLabeling(self.config, ntags=len(self.preprocessor.vocab_tag))

    # Build the evaluator and evaluate the model
    f1score = F1score(train_steps, train_batches, self.preprocessor)
    f1score.model = model
    f1score.on_epoch_end(epoch=-1)  # epoch can be any integer
```

Here, x_test and y_test are passed to batch_iter as separate arguments, whereas the original code looked like this:

```python:wrongver_evaluator.py
def eval(self, x_test, y_test):

    # Prepare test data (steps, generator)
    train_steps, train_batches = batch_iter(
        list(zip(x_test, y_test)), self.config.batch_size, preprocessor=self.preprocessor)

    # Build the model
    model = SeqLabeling(self.config, ntags=len(self.preprocessor.vocab_tag))

    # Build the evaluator and evaluate the model
    f1score = F1score(train_steps, train_batches, self.preprocessor)
    f1score.model = model
    f1score.on_epoch_end(epoch=-1)
```

It seems the arguments were simply being passed to batch_iter in the wrong form.
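For context, the signature the fixed call assumes would look roughly like the sketch below: data and labels as separate arguments, returning a step count and an endless batch generator (`preprocessor.transform` is assumed here; this is not anaGo's actual implementation):

```python
import math

def batch_iter(data, labels, batch_size, preprocessor=None):
    # Number of batches needed to cover the data once.
    num_batches = math.ceil(len(data) / batch_size)

    def generator():
        while True:  # Keras-style generators loop forever
            for i in range(num_batches):
                X = data[i * batch_size:(i + 1) * batch_size]
                y = labels[i * batch_size:(i + 1) * batch_size]
                if preprocessor is not None:
                    X, y = preprocessor.transform(X, y)
                yield X, y

    return num_batches, generator()
```

Under a signature like this, passing `list(zip(x_test, y_test))` as the first argument leaves the `labels` and `batch_size` parameters misaligned, which is why the call had to be fixed.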

config.py holds the hyperparameters; note that `word_embedding_size` is 50, matching the dimensionality of emb.txt:

```python:config.py
class ModelConfig(object):
    """Wrapper class for model hyperparameters."""

    def __init__(self):
        """Sets the default model hyperparameters."""

        # Number of unique words in the vocab (plus 2, for <UNK>, <PAD>).
        self.vocab_size = None
        self.char_vocab_size = None

        # Batch size.
        self.batch_size = 32

        # Scale used to initialize model variables.
        self.initializer_scale = 0.08

        # LSTM input and output dimensionality, respectively.
        self.char_embedding_size = 25
        self.num_char_lstm_units = 25
        self.word_embedding_size = 50
        self.num_word_lstm_units = 50

        # If < 1.0, the dropout keep probability applied to LSTM variables.
        self.dropout = 0.5

        # If True, use character feature.
        self.char_feature = True

        # If True, use crf.
        self.crf = True


class TrainingConfig(object):
    """Wrapper class for training hyperparameters."""

    def __init__(self):
        """Sets the default training hyperparameters."""

        # Batch size
        self.batch_size = 10

        # Optimizer for training the model.

        # Learning rate for the initial phase of training.
        self.learning_rate = 0.001
        self.lr_decay = 0.9

        # If not None, clip gradients to this value.

        # The maximum number of epochs
        self.max_epoch = 50

        # Parameters for early stopping
        self.early_stopping = True
        self.patience = 3

        # Fine-tune word embeddings
        self.train_embeddings = True

        # How many model checkpoints to keep.
        self.max_checkpoints_to_keep = 5
```

# Training and testing

Now let's run it. Put the hironsan_*.txt files in data2/hironsan/, create the models and logs directories, and put emb.txt in data2/.
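The layout can be set up like this (paths matching the script below):

```bash
$ mkdir -p data2/hironsan models logs
$ mv hironsan_train.txt hironsan_valid.txt hironsan_test.txt data2/hironsan/
$ cp w2v_model/emb.txt data2/
```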

```python
# coding: utf-8
import os

import anago
from anago.config import ModelConfig, TrainingConfig
from anago.data.preprocess import prepare_preprocessor
# Assumption: load_data_and_labels and load_word_embeddings are imported
# from anaGo's data-loading module in the revision used here.
from anago.data.reader import load_data_and_labels, load_word_embeddings

DATA_ROOT = 'data2/hironsan/'
SAVE_ROOT = './models'  # trained model
LOG_ROOT = './logs'     # checkpoint, tensorboard
embedding_path = './data2/emb.txt'

model_config = ModelConfig()
training_config = TrainingConfig()

train_path = os.path.join(DATA_ROOT, 'hironsan_train.txt')
valid_path = os.path.join(DATA_ROOT, 'hironsan_valid.txt')
test_path = os.path.join(DATA_ROOT, 'hironsan_test.txt')

x_train, y_train = load_data_and_labels(train_path)
x_valid, y_valid = load_data_and_labels(valid_path)
x_test, y_test = load_data_and_labels(test_path)

p = prepare_preprocessor(x_train, y_train)

# 50 must match the dimensionality of emb.txt.
embeddings = load_word_embeddings(p.vocab_word, embedding_path, 50)
model_config.vocab_size = len(p.vocab_word)
model_config.char_vocab_size = len(p.vocab_char)

trainer = anago.Trainer(model_config, training_config, checkpoint_path=LOG_ROOT,
                        save_path=SAVE_ROOT, preprocessor=p, embeddings=embeddings)
trainer.train(x_train, y_train, x_valid, y_valid)

weights = 'model_weights.h5'
evaluator = anago.Evaluator(model_config, weights, save_path=SAVE_ROOT, preprocessor=p)
evaluator.eval(x_test, y_test)
```

Output:

```
- f1: 56.88
```

You can see that the F1 score is not particularly high.

# Discussion

Among other things, the result likely depends on how the corpus behind the word2vec model was tokenized: if that segmentation does not match the tokenization of hironsan.txt, many tokens will simply not be found in the embedding vocabulary.

Setting early_stopping to False in config.py and increasing max_epoch might improve the score.
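For example (attribute names as in config.py above; the epoch count is illustrative):

```python
from anago.config import TrainingConfig

training_config = TrainingConfig()
training_config.early_stopping = False  # disable early stopping
training_config.max_epoch = 100         # illustrative value
```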
