ABOUT

LSTMもしくはGRUを用いて、単語に対応する品詞（名詞・動詞など）のタグ付けを行う

データを単語と品詞に分ける


DATA_DIR = "Downloads"
# Output files: one sentence per line, words in one file and the matching
# POS tags (same order) in the other.
sents_out_path = os.path.join(DATA_DIR, "treebank_sents.txt")
poss_out_path = os.path.join(DATA_DIR, "treebank_poss.txt")
with open(sents_out_path, "w") as fedata, open(poss_out_path, "w") as ffdata:
    # Read the tagged sentences from NLTK's treebank corpus.
    for sent in nltk.corpus.treebank.tagged_sents():
        # Keep only (word, tag) pairs whose tag is not "-NONE-".
        kept = [(word, pos) for word, pos in sent if pos != "-NONE-"]
        # words = tokens of the sentence, poss = their POS tags
        words = [word for word, _ in kept]
        poss = [pos for _, pos in kept]
        fedata.write("{:s}\n".format(" ".join(words)))
        ffdata.write("{:s}\n".format(" ".join(poss)))

「Resource treebank not found」というエラーが出たら、下記のコードを実行して解決する


import nltk
# Fetch the treebank corpus so that nltk.corpus.treebank can be loaded.
# The downloader entry point is `nltk.download`; `nltk.downloads` does not
# exist and raises AttributeError.
nltk.download('treebank')

join


w = ['I', 'like', 'apples']
# c: words joined with a single space as the separator
# d: words joined with no separator at all
c, d = ' '.join(w), ''.join(w)
print('c :', c)
print('d :', d)

c : I like apples
d : Ilikeapples

語彙数・文章の長さの最大値・文章数を調べる

def parse_sentences(filename):
    """Collect vocabulary statistics from a one-sentence-per-line text file.

    Every line is stripped, lower-cased and split on whitespace before it is
    counted.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(word_freqs, maxlen, num_recs)`` where

        * ``word_freqs`` is a ``collections.Counter`` mapping each token to
          the number of times it occurred,
        * ``maxlen`` is the largest number of tokens found on a single line
          (0 for an empty file),
        * ``num_recs`` is the number of lines (sentences) read.
    """
    word_freqs = collections.Counter()
    line_lengths = []
    with open(filename, "r") as fin:
        for line in fin:
            tokens = line.strip().lower().split()
            word_freqs.update(tokens)
            line_lengths.append(len(tokens))
    return word_freqs, max(line_lengths, default=0), len(line_lengths)

collections.Counter()


col = collections.Counter()
# Feeding an iterable to update() adds one to the count of each element seen.
col.update(['apple', 'egg', 'apple'])
print(col)

Counter({'apple': 2, 'egg': 1})

strip()・split()


line = '     I like reading books   '
# strip() removes the whitespace on both ends of the string
a = line.strip()
# split() with no argument cuts the string at runs of whitespace
b = a.split()
print('a : ', a)
print('b : ', b)

a : I like reading books
b : ['I', 'like', 'reading', 'books']

データの情報を取得


# Gather statistics for the sentence file (s_*) and the POS-tag file (t_*).
sents_stats_path = os.path.join(DATA_DIR, "treebank_sents.txt")
poss_stats_path = os.path.join(DATA_DIR, "treebank_poss.txt")
s_wordfreqs, s_maxlen, s_numrecs = parse_sentences(sents_stats_path)
t_wordfreqs, t_maxlen, t_numrecs = parse_sentences(poss_stats_path)

# Report: number of sentences, vocabulary size, tag-set size, longest sentence.
summary_rows = (
    (" records: {:d}", s_numrecs),
    (" unique words: {:d}", len(s_wordfreqs)),
    (" unique POS tags: {:d}", len(t_wordfreqs)),
    (" words/sentence: max: {:d}", s_maxlen),
)
for template, value in summary_rows:
    print(template.format(value))

records: 3914
unique words: 10947
unique POS tags: 45
words/sentence: max: 249

単語と品詞にインデックスを与える


MAX_SEQLEN = 250
S_MAX_FEATURES = 5000
T_MAX_FEATURES = 45

# s_vocabsize = words kept in the vocabulary + PAD + UNK
s_vocabsize = min(len(s_wordfreqs), S_MAX_FEATURES) + 2
# Only the most frequent words (at most S_MAX_FEATURES) receive an index.
# Numbering starts at 2 because index 0 is reserved for PAD and 1 for UNK.
s_word2index = {
    word: idx
    for idx, (word, _count) in enumerate(
        s_wordfreqs.most_common(S_MAX_FEATURES), start=2)
}
s_word2index.update({"PAD": 0, "UNK": 1})
s_index2word = {idx: word for word, idx in s_word2index.items()}

# t_vocabsize = number of POS tags + PAD
t_vocabsize = len(t_wordfreqs) + 1
# Tag numbering starts at 1 because index 0 is reserved for PAD.
t_word2index = {
    tag: idx
    for idx, (tag, _count) in enumerate(
        t_wordfreqs.most_common(T_MAX_FEATURES), start=1)
}
# Every POS tag is used here, so no UNK entry is added for tags.
t_word2index["PAD"] = 0
t_index2word = {idx: tag for tag, idx in t_word2index.items()}

文章をインデックス化


def build_tensor(filename, numrecs, word2index, maxlen):
    """Convert a one-sentence-per-line file into padded sequences of ids.

    Each line is stripped, lower-cased and split on whitespace; every token
    is then replaced by its id from ``word2index``.

    Args:
        filename: Path of the text file to read.
        numrecs: Number of lines expected in the file; sizes the
            intermediate object array.
        word2index: Mapping from token to integer id. Its ``"UNK"`` entry is
            looked up only when a token is missing from the mapping.
        maxlen: Forwarded to ``sequence.pad_sequences`` as ``maxlen``.

    Returns:
        The result of ``sequence.pad_sequences`` applied to the per-line
        id lists.
    """
    rows = np.empty((numrecs, ), dtype=list)
    with open(filename, "r") as fin:
        for row, line in enumerate(fin):
            tokens = line.strip().lower().split()
            # Keep the "UNK" lookup inside the conditional: a mapping without
            # an "UNK" key must keep working as long as every token is known.
            rows[row] = [
                word2index[tok] if tok in word2index else word2index["UNK"]
                for tok in tokens
            ]
    return sequence.pad_sequences(rows, maxlen=maxlen)

上記のコードのイメージをつかむために

# Toy walk-through of the steps used in build_tensor: an object array is
# filled with id lists of different lengths, then padded to a fixed length.
dat = np.empty((3, ), dtype=list)
print('empty dat :', dat)
dat[0] = [1,3,6,8,9,2]
dat[1] = [5,6]
dat[2] = [1,2,3,5]
print('dat with data :', dat)
pdat = sequence.pad_sequences(dat, maxlen=5)
print('padding dat :', pdat)

empty dat : [None None None]
dat with data : [list([1, 3, 6, 8, 9, 2]) list([5, 6]) list([1, 2, 3, 5])]
padding dat : [[3 6 8 9 2][0 0 0 5 6][0 1 2 3 5]]

訓練データ・テストデータ


# Turn the sentences and their POS tags into padded id sequences.
x_source_path = os.path.join(DATA_DIR, "treebank_sents.txt")
y_source_path = os.path.join(DATA_DIR, "treebank_poss.txt")
X = build_tensor(x_source_path, s_numrecs, s_word2index, MAX_SEQLEN)
Y = build_tensor(y_source_path, t_numrecs, t_word2index, MAX_SEQLEN)

# One-hot encode the tag ids of every sentence.
one_hot_rows = [np_utils.to_categorical(tag_ids, t_vocabsize) for tag_ids in Y]
Y = np.array(one_hot_rows)

# Split into training data and test data.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X, Y, test_size=0.2, random_state=42)

X.shape : (3914, 250)
Y.shape : (3914, 250, 46)

モデル

EMBED_SIZE = 128
HIDDEN_SIZE = 64
BATCH_SIZE = 32
NUM_EPOCHS = 1


# Layers are listed in the order in which they are stacked.
model = Sequential([
    Embedding(s_vocabsize, EMBED_SIZE, input_length=MAX_SEQLEN),
    Dropout(0.2),
    # First LSTM: return_sequences is left at its default.
    LSTM(HIDDEN_SIZE, dropout=0.2, recurrent_dropout=0.2),
    # Repeat the first LSTM's output MAX_SEQLEN times for the second LSTM.
    RepeatVector(MAX_SEQLEN),
    LSTM(HIDDEN_SIZE, return_sequences=True),
    # Apply the same Dense layer to every time step.
    TimeDistributed(Dense(t_vocabsize)),
    Activation("softmax"),
])
model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

model.summary()

RepeatVector
・１つ目のLSTMから返されるのは最後のタイムステップの出力のみ
　形は（None, HIDDEN_SIZE)
（デフォルトはreturn_sequences=Falseのため）
・２つ目のLSTMの入力を(None, MAX_SEQLEN, HIDDEN_SIZE)の形にするために１つ目のLSTMの出力を複製する

TimeDistributed
・Dense層へのインプットは(None, MAX_SEQLEN, HIDDEN_SIZE)
・TimeDistributedを用いることで、それぞれのタイムステップに対してDense層を適用できるようになる

訓練


# Train the model and evaluate on the held-out split after each epoch.
# Keras documents validation_data as a tuple (x_val, y_val); a list is not
# the documented form and may be interpreted differently between versions.
model.fit(Xtrain, Ytrain, batch_size=BATCH_SIZE,
          epochs=NUM_EPOCHS, validation_data=(Xtest, Ytest))

テスト結果


# evaluate() returns the loss followed by the compiled metric (accuracy).
test_metrics = model.evaluate(Xtest, Ytest, batch_size=BATCH_SIZE)
score, acc = test_metrics
print("Test score: {:.3f}, accuracy: {:.3f}".format(score, acc))

Test score: 0.620, accuracy: 0.902

LSTMをGRUに変えてみた結果

LSTM