Part-of-Speech Tagging with an RNN

Posted at 2018-12-10

ABOUT

Tag each word with its part of speech (noun, verb, etc.) using an LSTM or a GRU.
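
The snippets below rely on NLTK, NumPy, Keras and scikit-learn. A minimal set of imports that should cover them, assuming the standalone Keras 2.x API that was current at the time of writing:

import os
import collections

import nltk
import numpy as np
from keras.models import Sequential
from keras.layers import (Activation, Dense, Dropout, Embedding,
                          LSTM, RepeatVector, TimeDistributed)
from keras.preprocessing import sequence
from keras.utils import np_utils
from sklearn.model_selection import train_test_split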

Splitting the data into words and POS tags


DATA_DIR = "Downloads"
# Write the data out to two files: one with the words, one with the POS tags
with open(os.path.join(DATA_DIR, "treebank_sents.txt"), "w") as fedata, \
        open(os.path.join(DATA_DIR, "treebank_poss.txt"), "w") as ffdata:
    # Load the tagged sentences from the Penn Treebank sample
    sents = nltk.corpus.treebank.tagged_sents()
    for sent in sents:
        # words = words, poss = POS tags
        words, poss = [], []
        for word, pos in sent:
            if pos == "-NONE-":
                continue
            words.append(word)
            poss.append(pos)
        fedata.write("{:s}\n".format(" ".join(words)))
        ffdata.write("{:s}\n".format(" ".join(poss)))
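
As a quick sanity check (not in the original listing), each element of tagged_sents() is a list of (word, POS tag) pairs, which is why the loop above unpacks word and pos:

# Inspect the first tagged sentence: a list of (word, POS tag) pairs
first = nltk.corpus.treebank.tagged_sents()[0]
print(first[:5])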

If you get a "Resource treebank not found" error, the following code fixes it:


import nltk
nltk.download('treebank')

join


w = ['I', 'like', 'apples']
# Join the words using a space as the separator
c = ' '.join(w)
print('c :', c)
# Join the words with no separator
d = ''.join(w)
print('d :', d)

c : I like apples
d : Ilikeapples

Counting the vocabulary size, maximum sentence length, and number of sentences

# word_freqs --- counts how many times each word occurs
# maxlen --- maximum number of words in a single sentence
# num_recs --- number of sentences

def parse_sentences(filename):
    word_freqs = collections.Counter()
    num_recs, maxlen = 0, 0
    with open(filename, "r") as fin:
        for line in fin:
            words = line.strip().lower().split()
            for word in words:
                word_freqs[word] += 1
            maxlen = max(maxlen, len(words))
            num_recs += 1
    return word_freqs, maxlen, num_recs

collections.Counter()


col = collections.Counter()
col['apple'] += 1
col['egg'] += 1
col['apple'] += 1
print(col)

Counter({'apple': 2, 'egg': 1})

strip() and split()


line = '     I like reading books   '
# strip() --- remove leading and trailing whitespace
a = line.strip()
print('a : ', a)
# split() --- split on whitespace
b = a.split()
print('b : ', b)

a : I like reading books
b : ['I', 'like', 'reading', 'books']

Getting basic information about the data


s_wordfreqs, s_maxlen, s_numrecs = \
    parse_sentences(os.path.join(DATA_DIR, "treebank_sents.txt"))
t_wordfreqs, t_maxlen, t_numrecs = \
    parse_sentences(os.path.join(DATA_DIR, "treebank_poss.txt"))
print(" records: {:d}".format(s_numrecs))
print(" unique words: {:d}".format(len(s_wordfreqs)))
print(" unique POS tags: {:d}".format(len(t_wordfreqs)))
print(" words/sentence: max: {:d}".format(s_maxlen))

records: 3914
unique words: 10947
unique POS tags: 45
words/sentence: max: 249

Assigning indices to the words and POS tags


MAX_SEQLEN = 250
S_MAX_FEATURES = 5000
T_MAX_FEATURES = 45

# s_vocabsize = vocabulary actually used + PAD + UNK
s_vocabsize = min(len(s_wordfreqs), S_MAX_FEATURES) + 2
# Index only the most frequent words (5,000 here)
# i+2 because PAD gets index 0 and UNK gets index 1
s_word2index = {x[0]: i+2 for i, x in
                enumerate(s_wordfreqs.most_common(S_MAX_FEATURES))}
s_word2index["PAD"] = 0
s_word2index["UNK"] = 1
s_index2word = {v: k for k, v in s_word2index.items()}

# t_vocabsize = POS tags used + PAD
t_vocabsize = len(t_wordfreqs) + 1
t_word2index = {x[0]: i+1 for i, x in
                enumerate(t_wordfreqs.most_common(T_MAX_FEATURES))}
# All POS tags are kept this time, so no UNK is needed
t_word2index["PAD"] = 0
t_index2word = {v: k for k, v in t_word2index.items()}
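
As a small check of the dictionaries above (a hypothetical example sentence, not from the article), a sentence can be mapped to indices by hand; any word outside the 5,000 most frequent ones falls back to UNK:

sample = "the cat sat on the mat".split()
ids = [s_word2index.get(w, s_word2index["UNK"]) for w in sample]
print(ids)                             # one index per word; rare words become 1 (UNK)
print([s_index2word[i] for i in ids])  # map back; unknown words come back as "UNK"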

Converting the sentences to index sequences


def build_tensor(filename, numrecs, word2index, maxlen):
    data = np.empty((numrecs, ), dtype=list)
    with open(filename, "r") as fin:
        for i, line in enumerate(fin):
            wids = []
            for word in line.strip().lower().split():
                if word in word2index:
                    wids.append(word2index[word])
                else:
                    wids.append(word2index["UNK"])
            data[i] = wids
    pdata = sequence.pad_sequences(data, maxlen=maxlen)
    return pdata

To get a feel for the code above:

dat = np.empty((3, ), dtype=list)
print('empty dat :', dat)
dat[2] = [1,2,3,5]
dat[0] = [1,3,6,8,9,2]
dat[1] = [5,6]
print('dat with data :', dat)
pdat = sequence.pad_sequences(dat, maxlen=5)
print('padding dat :', pdat)

empty dat : [None None None]
dat with data : [list([1, 3, 6, 8, 9, 2]) list([5, 6]) list([1, 2, 3, 5])]
padding dat : [[3 6 8 9 2]
 [0 0 0 5 6]
 [0 1 2 3 5]]

Training and test data


# Convert the sentences to index sequences
X = build_tensor(os.path.join(DATA_DIR, "treebank_sents.txt"),
                 s_numrecs, s_word2index, MAX_SEQLEN)
Y = build_tensor(os.path.join(DATA_DIR, "treebank_poss.txt"),
                 t_numrecs, t_word2index, MAX_SEQLEN)

# One-hot encode the POS tags
Y = np.array([np_utils.to_categorical(d, t_vocabsize) for d in Y])

# Split into training and test data
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=42)

print('X.shape :', X.shape)
print('Y.shape :', Y.shape)

X.shape : (3914, 250)
Y.shape : (3914, 250, 46)

Model

EMBED_SIZE = 128
HIDDEN_SIZE = 64
BATCH_SIZE = 32
NUM_EPOCHS = 1


model = Sequential()
model.add(Embedding(s_vocabsize, EMBED_SIZE, input_length=MAX_SEQLEN))
model.add(Dropout(0.2))
model.add(LSTM(HIDDEN_SIZE, dropout=0.2, recurrent_dropout=0.2))
model.add(RepeatVector(MAX_SEQLEN))
model.add(LSTM(HIDDEN_SIZE, return_sequences=True))
model.add(TimeDistributed(Dense(t_vocabsize)))
model.add(Activation("softmax"))
model.compile(loss="categorical_crossentropy",
              optimizer="adam",
              metrics=["accuracy"])

model.summary()

RepeatVector
・The first LSTM returns only the output of its last time step,
 so its output shape is (None, HIDDEN_SIZE)
(because return_sequences=False by default)
・RepeatVector copies that output MAX_SEQLEN times so the second LSTM receives input of shape (None, MAX_SEQLEN, HIDDEN_SIZE)

TimeDistributed
・The input to the Dense layer has shape (None, MAX_SEQLEN, HIDDEN_SIZE)
・Wrapping the Dense layer in TimeDistributed applies it to each time step independently (see the shape-check sketch below)
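
To verify those shapes, here is a small sketch of my own (same hyperparameters as above) that prints the output shape after each layer is added:

from keras.models import Sequential
from keras.layers import Embedding, LSTM, RepeatVector, TimeDistributed, Dense

m = Sequential()
m.add(Embedding(s_vocabsize, EMBED_SIZE, input_length=MAX_SEQLEN))
print(m.output_shape)   # (None, 250, 128)
m.add(LSTM(HIDDEN_SIZE))
print(m.output_shape)   # (None, 64) -- only the last time step
m.add(RepeatVector(MAX_SEQLEN))
print(m.output_shape)   # (None, 250, 64)
m.add(LSTM(HIDDEN_SIZE, return_sequences=True))
print(m.output_shape)   # (None, 250, 64)
m.add(TimeDistributed(Dense(t_vocabsize)))
print(m.output_shape)   # (None, 250, 46)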

Training


model.fit(Xtrain, Ytrain, batch_size=BATCH_SIZE,
          epochs=NUM_EPOCHS, validation_data=(Xtest, Ytest))

Test results


score, acc = model.evaluate(Xtest, Ytest, batch_size=BATCH_SIZE)
print("Test score: {:.3f}, accuracy: {:.3f}".format(score, acc))

Test score: 0.620, accuracy: 0.902

Results of swapping the LSTM for a GRU

LSTM

Test score: 0.620, accuracy: 0.902

GRU

Test score: 0.617, accuracy: 0.902
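
The GRU numbers above were presumably obtained by swapping the two LSTM layers for GRU layers; a sketch of that variant with otherwise identical settings (my reconstruction, not the original code):

from keras.layers import GRU

model = Sequential()
model.add(Embedding(s_vocabsize, EMBED_SIZE, input_length=MAX_SEQLEN))
model.add(Dropout(0.2))
model.add(GRU(HIDDEN_SIZE, dropout=0.2, recurrent_dropout=0.2))
model.add(RepeatVector(MAX_SEQLEN))
model.add(GRU(HIDDEN_SIZE, return_sequences=True))
model.add(TimeDistributed(Dense(t_vocabsize)))
model.add(Activation("softmax"))
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])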
