
Problem: inference results of a Bidirectional LSTM text classifier in PyTorch change with batch_size


The problem

Referring to the page below, I solved a text classification task with a Bidirectional LSTM.

When I ran inference with the trained model using the code below, the results changed depending on batch_size.

models.py
import numpy as np
import torch
import torch.nn as nn


class LSTMClassifier(nn.Module):
    def __init__(
        self,
        feature_vocab_size,  # vocabulary size for the embedding layer
        embedding_size,      # embedding dimension
        hidden_size,         # hidden size of the LSTM
        tagset_size,         # output size (number of classes)
        num_layers,          # number of LSTM layers
        dropout,             # dropout rate
        is_bidirectional     # whether the LSTM is bidirectional
    ):
        super(LSTMClassifier, self).__init__()
        self.is_bidirectional = is_bidirectional

        self.embedding = nn.Embedding(
            num_embeddings=feature_vocab_size,
            embedding_dim=embedding_size,
            padding_idx=0  # index 0 is reserved for padding
        )

        self.lstm = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bias=True,
            batch_first=True,
            bidirectional=is_bidirectional,
            dropout=dropout,
        )
        self.fc = nn.Linear(
            in_features=hidden_size*2 if is_bidirectional else hidden_size,
            out_features=tagset_size,
        )
        self.logsoftmax = nn.LogSoftmax(dim=1)

    def forward(self, sentence):
        embeds = self.embedding(sentence)
        _, (h, _) = self.lstm(embeds)
        if self.is_bidirectional:
            hidden = torch.cat((h[-2,:,:], h[-1,:,:]), dim=1)
            dence_output = self.fc(hidden)
            output = self.logsoftmax(dence_output)
        else:
            dence_output = self.fc(h)
            output = self.logsoftmax(dence_output.squeeze())

        return output
inference.py
import pandas as pd
import torch
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
from models import LSTMClassifier
from utils import sentence2index
from vocab import Vocabulary

tqdm.pandas()  # needed for df.progress_apply below

df = pd.read_csv('test.csv')
df = df[:5]

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# hyperparameters
feature_vocab = Vocabulary.load('feature.dict')
EMBEDDING_DIM = 200
HIDDEN_DIM = 128
TAGSET_SIZE = 2
NUM_LAYERS = 1
DROPOUT = 0
is_bidirectional = True

model = LSTMClassifier(len(feature_vocab), EMBEDDING_DIM, HIDDEN_DIM, TAGSET_SIZE, NUM_LAYERS, DROPOUT, is_bidirectional).to(device)
model.load_state_dict(torch.load('lstm_classifier_weight.pth', map_location=device))
model.eval()

df["sentence_wakati"] = df["sentence"].progress_apply(SudachiPreProcessor(normalize=False).make_wakati) # 形態素解析
df["sentence_id"] = df["sentence_wakati"].progress_apply(lambda x: torch.tensor(sentence2index(x, feature_vocab), dtype=torch.long)) id化
sentence_id_padded = pad_sequence(df["sentence_id"], batch_first=True, padding_value=0).to(device) # 0padding

all_outputs = []
batch_size = 3

for i in tqdm(range(0, len(sentence_id_padded), batch_size)):
    batch_data = sentence_id_padded[i:i+batch_size]
    with torch.no_grad():
        batch_output = model(batch_data)
        batch_probs = torch.exp(batch_output)    
    all_outputs.append(batch_probs)

lstm_output = torch.cat(all_outputs, dim=0)

df["lstm_prob_1"] = lstm_output[:, 1].cpu().numpy().tolist()
df

Running inference.py with batch_size set to 3 versus 5 gives different results.
Since that example is somewhat involved, let's reproduce the issue with a simpler input (the snippets below assume torch, torch.nn as nn, and pad_sequence from torch.nn.utils.rnn are imported).

Experiment

word_id = [torch.tensor([ 7, 3, 5]), 
           torch.tensor([ 9, 5]), 
           torch.tensor([ 7, 1, 8, 4, 4])]

We use this as the input. Padding it with 0 gives:

word_id_padded = pad_sequence(
    word_id, 
    batch_first=True, 
    padding_value=0
)
print(word_id_padded)
print(word_id_padded.shape)

# output
tensor([[7, 3, 5, 0, 0],
        [9, 5, 0, 0, 0],
        [7, 1, 8, 4, 4]])
torch.Size([3, 5])

Define a simplified version of LSTMClassifier.
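These layers are initialized randomly, so the exact numbers printed below are specific to one run. To get reproducible values, one could fix the RNG seed before creating them (the seed value here is arbitrary):

torch.manual_seed(0)  # arbitrary seed; the values printed in this article come from the author's own run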

embedding = nn.Embedding(10, 5, padding_idx=0)
bilstm = nn.LSTM(
            input_size=5,
            hidden_size=2,
            num_layers=1,
            batch_first=True,
            bidirectional=True,
            dropout = 0
        )
fc = nn.Linear(
            in_features=2*2,
            out_features=2
        )
logsoftmax = nn.LogSoftmax(dim=1)

Feed the input through each component in turn.

embeds = embedding(word_id_padded)
print(embeds)

# output
tensor([[[ 0.5258, -0.4880,  1.1914, -0.8140, -0.7360],
         [ 0.7624,  1.6423, -0.1596, -0.4974,  0.4396],
         [ 1.2964,  0.6105,  1.3347, -0.2316,  0.0418],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000]],

        [[ 1.8446, -1.1845,  1.3835, -1.2024,  0.7078],
         [ 1.2964,  0.6105,  1.3347, -0.2316,  0.0418],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000]],

        [[ 0.5258, -0.4880,  1.1914, -0.8140, -0.7360],
         [-1.2345, -0.0431, -1.6047, -0.7521,  1.6487],
         [-0.8371, -0.9224, -0.0635,  0.6756, -0.0978],
         [-0.7581,  1.0783,  0.8008,  1.6806,  1.2791],
         [-0.7581,  1.0783,  0.8008,  1.6806,  1.2791]]],
       grad_fn=<EmbeddingBackward0>)
lstm, (ht, ct)  = bilstm(embeds)
print(lstm)
print('------------------------------')
print(ht)
print('------------------------------')
print(ct)

# output
tensor([[[-0.2505,  0.0171,  0.0254, -0.1418],
         [-0.3123, -0.0533, -0.0896, -0.0707],
         [-0.2907, -0.0187,  0.0544, -0.1012],
         [-0.1526, -0.1095, -0.0719, -0.1345],
         [-0.1467, -0.1438, -0.0471, -0.1056]],

        [[ 0.2302, -0.0285,  0.3353, -0.2002],
         [-0.2159, -0.0116,  0.0483, -0.1012],
         [-0.1501, -0.1022, -0.0847, -0.1427],
         [-0.1470, -0.1401, -0.0719, -0.1345],
         [-0.1452, -0.1584, -0.0471, -0.1056]],

        [[-0.2505,  0.0171, -0.0439, -0.1206],
         [-0.0625, -0.1108, -0.3307,  0.5002],
         [-0.0310, -0.1351, -0.0076,  0.0042],
         [-0.0240, -0.0487,  0.0045,  0.1633],
         [-0.0315, -0.0466,  0.0051,  0.0968]]], grad_fn=<TransposeBackward0>)
------------------------------
tensor([[[-0.1467, -0.1438],
         [-0.1452, -0.1584],
         [-0.0315, -0.0466]],

        [[ 0.0254, -0.1418],
         [ 0.3353, -0.2002],
         [-0.0439, -0.1206]]], grad_fn=<StackBackward0>)
------------------------------
tensor([[[-0.2872, -0.5081],
         [-0.2826, -0.5671],
         [-0.4834, -0.4408]],

        [[ 0.1156, -0.2991],
         [ 1.0408, -0.3255],
         [-0.2172, -0.2240]]], grad_fn=<StackBackward0>)
bilstm_out = torch.cat([ht[0], ht[1]], dim=1)
dence_output = fc(bilstm_out)
tag_scores = logsoftmax(dence_output)
print(tag_scores)

# output
tensor([[-0.5184, -0.9050],
        [-0.5797, -0.8211],
        [-0.4972, -0.9370]], grad_fn=<LogSoftmaxBackward0>)

Now try the same thing with the following tensors.

word_id = [torch.tensor([ 7, 3, 5]),  
           torch.tensor([ 9, 5])]
word_id_padded = pad_sequence(
    word_id, 
    batch_first=True, 
    padding_value=0
)
print(word_id_padded)
print(word_id_padded.shape)

# output
tensor([[7, 3, 5],
        [9, 5, 0]])
torch.Size([2, 3])
embeds = embedding(word_id_padded)
print(embeds)

# output
tensor([[[ 0.5258, -0.4880,  1.1914, -0.8140, -0.7360],
         [ 0.7624,  1.6423, -0.1596, -0.4974,  0.4396],
         [ 1.2964,  0.6105,  1.3347, -0.2316,  0.0418]],

        [[ 1.8446, -1.1845,  1.3835, -1.2024,  0.7078],
         [ 1.2964,  0.6105,  1.3347, -0.2316,  0.0418],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000]]],
       grad_fn=<EmbeddingBackward0>)
torch.Size([2, 3, 5])

Looking at this, the embedding vectors themselves do not change when batch_size changes, so no problem at this stage.
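To confirm this programmatically, one could compare the overlapping part of the two embedding tensors (embeds_batch3 is a hypothetical name for the (3, 5, 5) tensor from the first run):

# embeds_batch3: hypothetical name for the embedding tensor of the 3-sentence batch above
print(torch.allclose(embeds, embeds_batch3[:2, :3]))  # True: the first 3 steps of the first 2 rows are identical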

lstm, (ht, ct)  = bilstm(embeds)
print(lstm)
print('------------------------------')
print(ht)
print('------------------------------')
print(ct)

# output
tensor([[[-0.2505,  0.0171,  0.0371, -0.1423],
         [-0.3123, -0.0533, -0.0743, -0.0648],
         [-0.2907, -0.0187,  0.0826, -0.0983]],

        [[ 0.2302, -0.0285,  0.3482, -0.2011],
         [-0.2159, -0.0116,  0.0652, -0.1007],
         [-0.1501, -0.1022, -0.0471, -0.1056]]], grad_fn=<TransposeBackward0>)
------------------------------
tensor([[[-0.2907, -0.0187],
         [-0.1501, -0.1022]],

        [[ 0.0371, -0.1423],
         [ 0.3482, -0.2011]]], grad_fn=<StackBackward0>)
------------------------------
tensor([[[-0.4802, -0.0926],
         [-0.2976, -0.3531]],

        [[ 0.1704, -0.2987],
         [ 1.1247, -0.3260]]], grad_fn=<StackBackward0>)
bilstm_out = torch.cat([ht[0], ht[1]], dim=1)
dence_output = fc(bilstm_out)
tag_scores = logsoftmax(dence_output)
print(tag_scores)

# output
tensor([[-0.5579, -0.8496],
        [-0.5897, -0.8085]], grad_fn=<LogSoftmaxBackward0>)

Comparing this with the batch-of-3 run, the final scores for the same sentences have changed.
Logically, the inference result for a given sentence should not depend on batch_size, so something is clearly wrong.
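The reason can be read off the LSTM outputs above: the forward-direction final state ht[0] is simply the LSTM output at the last time step of each padded row, and how much padding a row gets depends on the longest sentence in its batch, so shorter sentences run over a different number of padding steps in each run. A quick check (lstm3/ht3 and lstm2/ht2 are hypothetical names for the outputs of the batch-of-3 and batch-of-2 runs above):

# hypothetical names: lstm3, ht3 are bilstm's outputs for the 3-sentence batch,
# lstm2, ht2 for the 2-sentence batch (both runs are shown above)
print(ht3[0, 0], lstm3[0, -1, :2])  # forward final state of [7, 3, 5]: taken at step 5, i.e. after 2 padding steps
print(ht2[0, 0], lstm2[0, -1, :2])  # forward final state of the same sentence: taken at step 3, no padding -> different values

The backward direction is affected in the same way, since it starts reading each row from its padded end.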

Solution

The fundamental fix was found on the page below.

models.py (excerpt)
# sentence_lengths holds the (pre-padding) length of each sentence
def forward(self, sentence, sentence_lengths):
    embeds = self.embedding(sentence)

    # tell the LSTM the real sequence length of each sentence
    embeds_packed = nn.utils.rnn.pack_padded_sequence(embeds, sentence_lengths.cpu(), batch_first=True, enforce_sorted=False)
    
    _, (h, _) = self.lstm(embeds_packed)
    if self.is_bidirectional:
        hidden = torch.cat((h[-2,:,:], h[-1,:,:]), dim=1)
        dence_output = self.fc(hidden)
        output = self.logsoftmax(dence_output)
    else:
        dence_output = self.fc(h)
        output = self.logsoftmax(dence_output.squeeze())

    return output

The root cause is that the 0s inserted for padding are not treated as padding when they reach the LSTM layer: padding_idx=0 only zeroes out their embedding vectors, and the LSTM still runs over those zero vectors as ordinary time steps. The fix is to tell the LSTM which part of each sequence is real before the input reaches it, which is exactly what pack_padded_sequence does.
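On the inference side, the sequence lengths now have to be passed along with the padded batch. A minimal sketch of how the loop in inference.py could be adapted, assuming the lengths are taken from the un-padded id tensors (sentence_lengths and batch_lengths are names introduced here; pack_padded_sequence expects the lengths on the CPU):

# lengths of the sentences before padding (df["sentence_id"] still holds the un-padded tensors)
sentence_lengths = torch.tensor([len(t) for t in df["sentence_id"]], dtype=torch.long)

all_outputs = []
batch_size = 3
for i in range(0, len(sentence_id_padded), batch_size):
    batch_data = sentence_id_padded[i:i + batch_size]
    batch_lengths = sentence_lengths[i:i + batch_size]  # kept on the CPU for pack_padded_sequence
    with torch.no_grad():
        batch_probs = torch.exp(model(batch_data, batch_lengths))  # the modified forward(sentence, sentence_lengths)
    all_outputs.append(batch_probs)

lstm_output = torch.cat(all_outputs, dim=0)  # now the same whether batch_size is 3 or 5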

Environment

M1 Mac
Python==3.11.6
torch==2.0.1
