The problem
I was solving a text classification task with a Bidirectional LSTM, following the pages below.
- https://qiita.com/m__k/items/db1a81bb06607d5b0ec5
- https://tma15.github.io/blog/2020/09/06/%E8%87%AA%E7%84%B6%E8%A8%80%E8%AA%9E%E5%87%A6%E7%90%86lstm%E3%81%AB%E5%9F%BA%E3%81%A5%E3%81%8F%E6%96%87%E6%9B%B8%E5%88%86%E9%A1%9E-pytorch%E3%82%B3%E3%83%BC%E3%83%89%E4%BB%98%E3%81%8D/
When I ran inference with a model trained with the code below, the predictions changed depending on batch_size.
models.py
import numpy as np
import torch
import torch.nn as nn
class LSTMClassifier(nn.Module):
    def __init__(
        self,
        feature_vocab_size,  # vocabulary size (number of words to embed)
        embedding_size,      # size of each embedding vector
        hidden_size,         # hidden size of the LSTM
        tagset_size,         # output size (number of classes)
        num_layers,          # number of LSTM layers
        dropout,             # dropout ratio
        is_bidirectional     # whether to make the LSTM bidirectional
    ):
        super(LSTMClassifier, self).__init__()
        self.is_bidirectional = is_bidirectional
        self.embedding = nn.Embedding(
            num_embeddings=feature_vocab_size,
            embedding_dim=embedding_size,
            padding_idx=0  # index 0 is reserved for padding
        )
        self.lstm = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bias=True,
            batch_first=True,
            bidirectional=is_bidirectional,
            dropout=dropout,
        )
        self.fc = nn.Linear(
            in_features=hidden_size * 2 if is_bidirectional else hidden_size,
            out_features=tagset_size,
        )
        self.logsoftmax = nn.LogSoftmax(dim=1)

    def forward(self, sentence):
        embeds = self.embedding(sentence)
        # h has shape (num_layers * num_directions, batch, hidden_size)
        _, (h, _) = self.lstm(embeds)
        if self.is_bidirectional:
            # h[-2] is the last layer's forward final state, h[-1] the backward one
            hidden = torch.cat((h[-2, :, :], h[-1, :, :]), dim=1)
            dence_output = self.fc(hidden)
            output = self.logsoftmax(dence_output)
        else:
            dence_output = self.fc(h)
            output = self.logsoftmax(dence_output.squeeze())
        return output
inference.py
import pandas as pd
import torch
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm

from models import LSTMClassifier
from utils import sentence2index
from vocab import Vocabulary
# SudachiPreProcessor (used below for tokenization) comes from a project-specific module; its import is omitted here.

tqdm.pandas()  # enable .progress_apply

df = pd.read_csv('test.csv')
df = df[:5]

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# hyperparameters
feature_vocab = Vocabulary.load('feature.dict')
EMBEDDING_DIM = 200
HIDDEN_DIM = 128
TAGSET_SIZE = 2
NUM_LAYERS = 1
DROPOUT = 0
is_bidirectional = True

model = LSTMClassifier(len(feature_vocab), EMBEDDING_DIM, HIDDEN_DIM, TAGSET_SIZE, NUM_LAYERS, DROPOUT, is_bidirectional).to(device)
model.load_state_dict(torch.load('lstm_classifier_weight.pth', map_location=device))
model.eval()

df["sentence_wakati"] = df["sentence"].progress_apply(SudachiPreProcessor(normalize=False).make_wakati)  # morphological analysis (tokenization)
df["sentence_id"] = df["sentence_wakati"].progress_apply(lambda x: torch.tensor(sentence2index(x, feature_vocab), dtype=torch.long))  # convert tokens to IDs
sentence_id_padded = pad_sequence(df["sentence_id"], batch_first=True, padding_value=0).to(device)  # zero-padding

all_outputs = []
batch_size = 3
for i in tqdm(range(0, len(sentence_id_padded), batch_size)):
    batch_data = sentence_id_padded[i:i+batch_size]
    with torch.no_grad():
        batch_output = model(batch_data)
        batch_probs = torch.exp(batch_output)  # back from log-probabilities to probabilities
    all_outputs.append(batch_probs)
lstm_output = torch.cat(all_outputs, dim=0)

df["lstm_prob_1"] = lstm_output[:, 1].cpu().numpy().tolist()
df
Running inference.py with batch_size set to 3 versus 5 gives different results for the same rows.
Since that example involves too many moving parts, let's reproduce the issue with a small, simple input.
Experiment
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence

word_id = [torch.tensor([7, 3, 5]),
           torch.tensor([9, 5]),
           torch.tensor([7, 1, 8, 4, 4])]
This is the input. Zero-padding it gives:
word_id_padded = pad_sequence(
word_id,
batch_first=True,
padding_value=0
)
print(word_id_padded)
print(word_id_padded.shape)
# output
tensor([[7, 3, 5, 0, 0],
[9, 5, 0, 0, 0],
[7, 1, 8, 4, 4]])
torch.Size([3, 5])
Define a simplified version of LSTMClassifier.
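Note that these toy modules are initialized with random weights, so the exact numbers printed below will differ from run to run; what matters is the comparison between the two batch sizes. If you want to reproduce a walkthrough like this exactly, you could seed PyTorch before defining the modules (an optional step, not used for the outputs shown here):

torch.manual_seed(0)  # fixes the random initialization of embedding / bilstm / fc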
embedding = nn.Embedding(10, 5, padding_idx=0)
bilstm = nn.LSTM(
input_size=5,
hidden_size=2,
num_layers=1,
batch_first=True,
bidirectional=True,
dropout = 0
)
fc = nn.Linear(
in_features=2*2,
out_features=2
)
logsoftmax = nn.LogSoftmax(dim=1)
Now feed the input through each layer in turn.
embeds = embedding(word_id_padded)
print(embeds)
# output
tensor([[[ 0.5258, -0.4880, 1.1914, -0.8140, -0.7360],
[ 0.7624, 1.6423, -0.1596, -0.4974, 0.4396],
[ 1.2964, 0.6105, 1.3347, -0.2316, 0.0418],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
[[ 1.8446, -1.1845, 1.3835, -1.2024, 0.7078],
[ 1.2964, 0.6105, 1.3347, -0.2316, 0.0418],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
[[ 0.5258, -0.4880, 1.1914, -0.8140, -0.7360],
[-1.2345, -0.0431, -1.6047, -0.7521, 1.6487],
[-0.8371, -0.9224, -0.0635, 0.6756, -0.0978],
[-0.7581, 1.0783, 0.8008, 1.6806, 1.2791],
[-0.7581, 1.0783, 0.8008, 1.6806, 1.2791]]],
grad_fn=<EmbeddingBackward0>)
lstm, (ht, ct) = bilstm(embeds)
print(lstm)
print('------------------------------')
print(ht)
print('------------------------------')
print(ct)
# output
tensor([[[-0.2505, 0.0171, 0.0254, -0.1418],
[-0.3123, -0.0533, -0.0896, -0.0707],
[-0.2907, -0.0187, 0.0544, -0.1012],
[-0.1526, -0.1095, -0.0719, -0.1345],
[-0.1467, -0.1438, -0.0471, -0.1056]],
[[ 0.2302, -0.0285, 0.3353, -0.2002],
[-0.2159, -0.0116, 0.0483, -0.1012],
[-0.1501, -0.1022, -0.0847, -0.1427],
[-0.1470, -0.1401, -0.0719, -0.1345],
[-0.1452, -0.1584, -0.0471, -0.1056]],
[[-0.2505, 0.0171, -0.0439, -0.1206],
[-0.0625, -0.1108, -0.3307, 0.5002],
[-0.0310, -0.1351, -0.0076, 0.0042],
[-0.0240, -0.0487, 0.0045, 0.1633],
[-0.0315, -0.0466, 0.0051, 0.0968]]], grad_fn=<TransposeBackward0>)
------------------------------
tensor([[[-0.1467, -0.1438],
[-0.1452, -0.1584],
[-0.0315, -0.0466]],
[[ 0.0254, -0.1418],
[ 0.3353, -0.2002],
[-0.0439, -0.1206]]], grad_fn=<StackBackward0>)
------------------------------
tensor([[[-0.2872, -0.5081],
[-0.2826, -0.5671],
[-0.4834, -0.4408]],
[[ 0.1156, -0.2991],
[ 1.0408, -0.3255],
[-0.2172, -0.2240]]], grad_fn=<StackBackward0>)
bilstm_out = torch.cat([ht[0], ht[1]], dim=1)
dence_output = fc(bilstm_out)
tag_scores = logsoftmax(dence_output)
print(tag_scores)
# output
tensor([[-0.5184, -0.9050],
[-0.5797, -0.8211],
[-0.4972, -0.9370]], grad_fn=<LogSoftmaxBackward0>)
Next, try the following tensors (the first two sentences from before, without the long third one).
word_id = [torch.tensor([ 7, 3, 5]),
torch.tensor([ 9, 5])]
word_id_padded = pad_sequence(
word_id,
batch_first=True,
padding_value=0
)
print(word_id_padded)
print(word_id_padded.shape)
# output
tensor([[7, 3, 5],
[9, 5, 0]])
torch.Size([2, 3])
embeds = embedding(word_id_padded)
print(embeds)
print(embeds.shape)
# output
tensor([[[ 0.5258, -0.4880, 1.1914, -0.8140, -0.7360],
[ 0.7624, 1.6423, -0.1596, -0.4974, 0.4396],
[ 1.2964, 0.6105, 1.3347, -0.2316, 0.0418]],
[[ 1.8446, -1.1845, 1.3835, -1.2024, 0.7078],
[ 1.2964, 0.6105, 1.3347, -0.2316, 0.0418],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]]],
grad_fn=<EmbeddingBackward0>)
torch.Size([2, 3, 5])
Judging from this, the embedding vectors for the two sentences are identical regardless of the batch size (the batch of 2 is just the batch of 3 with less padding), so the embedding layer is not the problem.
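A quick way to check that claim programmatically (a sketch; embeds_batch3 is a hypothetical name, assuming the embeddings from the 3-sentence batch had been kept around before being overwritten):

print(torch.equal(embeds_batch3[:2, :3], embeds))  # True: identical vectors, just less padding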
lstm, (ht, ct) = bilstm(embeds)
print(lstm)
print('------------------------------')
print(ht)
print('------------------------------')
print(ct)
# output
tensor([[[-0.2505, 0.0171, 0.0371, -0.1423],
[-0.3123, -0.0533, -0.0743, -0.0648],
[-0.2907, -0.0187, 0.0826, -0.0983]],
[[ 0.2302, -0.0285, 0.3482, -0.2011],
[-0.2159, -0.0116, 0.0652, -0.1007],
[-0.1501, -0.1022, -0.0471, -0.1056]]], grad_fn=<TransposeBackward0>)
------------------------------
tensor([[[-0.2907, -0.0187],
[-0.1501, -0.1022]],
[[ 0.0371, -0.1423],
[ 0.3482, -0.2011]]], grad_fn=<StackBackward0>)
------------------------------
tensor([[[-0.4802, -0.0926],
[-0.2976, -0.3531]],
[[ 0.1704, -0.2987],
[ 1.1247, -0.3260]]], grad_fn=<StackBackward0>)
bilstm_out = torch.cat([ht[0], ht[1]], dim=1)
dence_output = fc(bilstm_out)
tag_scores = logsoftmax(dence_output)
print(tag_scores)
# output
tensor([[-0.5579, -0.8496],
[-0.5897, -0.8085]], grad_fn=<LogSoftmaxBackward0>)
Comparing this with the batch-of-3 run, the final scores for the same two sentences have changed: in the batch of 3 they were padded out to length 5, in the batch of 2 only to length 3, and both ht and the resulting tag_scores differ between the two runs.
Logically, changing the batch size should not change the prediction for a given sentence, so something is clearly wrong.
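To pin down the cause before looking at the fix, here is a minimal sketch reusing the toy embedding and bilstm defined above: the embedding of padding index 0 is an all-zero vector, but the LSTM still runs its recurrence over those zero steps, so the amount of padding changes the final hidden state ht.

with torch.no_grad():
    s = torch.tensor([[7, 3, 5]])               # the first sentence, no padding
    s_padded = torch.tensor([[7, 3, 5, 0, 0]])  # the same sentence padded to length 5
    _, (ht_plain, _) = bilstm(embedding(s))
    _, (ht_padded, _) = bilstm(embedding(s_padded))
    print(torch.allclose(ht_plain, ht_padded))  # False: the padding steps alone changed ht

Since the maximum sequence length, and therefore the amount of padding, depends on which sentences happen to be in the batch, the final hidden state, and with it the prediction, ends up depending on the batch composition.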
Solution
The following pages describe the fundamental fix.
- https://www.kaggle.com/code/hoangpham51/text-classification-pytorch-torchtext-lstm#Build-Model
- https://towardsdatascience.com/multiclass-text-classification-using-lstm-in-pytorch-eac56baed8df
models.py (excerpt)
# sentence_lengths holds the (unpadded) length of each sentence
def forward(self, sentence, sentence_lengths):
    embeds = self.embedding(sentence)
    # tell the LSTM the true sequence length of each sentence
    embeds_packed = nn.utils.rnn.pack_padded_sequence(embeds, sentence_lengths.cpu(), batch_first=True, enforce_sorted=False)
    _, (h, _) = self.lstm(embeds_packed)
    if self.is_bidirectional:
        hidden = torch.cat((h[-2, :, :], h[-1, :, :]), dim=1)
        dence_output = self.fc(hidden)
        output = self.logsoftmax(dence_output)
    else:
        dence_output = self.fc(h)
        output = self.logsoftmax(dence_output.squeeze())
    return output
It turns out the problem was that, when the padded batch is fed to the LSTM layer, the 0s inserted for padding are not recognized as padding; the LSTM simply runs over them as ordinary time steps. So before feeding the batch into the LSTM layer, we just have to tell it how long each sequence really is, i.e. which trailing 0s are padding, which is exactly what pack_padded_sequence does.
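As a sanity check, here is a sketch that re-runs the toy experiment with packing, reusing the embedding, bilstm, fc and logsoftmax defined above (toy_forward_packed is a helper introduced only for this sketch); with the true lengths supplied, the scores for the first two sentences should no longer depend on whether the long third sentence is in the batch.

from torch.nn.utils.rnn import pack_padded_sequence, pad_sequence

def toy_forward_packed(sentences):
    # pad, embed, pack with the true lengths, then run the toy BiLSTM and classifier head
    lengths = torch.tensor([len(s) for s in sentences])
    padded = pad_sequence(sentences, batch_first=True, padding_value=0)
    packed = pack_padded_sequence(embedding(padded), lengths, batch_first=True, enforce_sorted=False)
    _, (ht, _) = bilstm(packed)
    return logsoftmax(fc(torch.cat([ht[0], ht[1]], dim=1)))

with torch.no_grad():
    scores_batch3 = toy_forward_packed([torch.tensor([7, 3, 5]),
                                        torch.tensor([9, 5]),
                                        torch.tensor([7, 1, 8, 4, 4])])
    scores_batch2 = toy_forward_packed([torch.tensor([7, 3, 5]),
                                        torch.tensor([9, 5])])
    print(torch.allclose(scores_batch3[:2], scores_batch2))  # True (up to floating-point error)

inference.py then needs a matching change: compute the true lengths before padding and pass the corresponding slice to the model in each step of the loop (a sketch, using the column names from inference.py above):

sentence_lengths = torch.tensor([len(x) for x in df["sentence_id"]])  # lengths before padding
batch_output = model(batch_data, sentence_lengths[i:i+batch_size])    # inside the inference loop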
Environment
M1 mac
Python==3.11.6
torch==2.0.1