The problem
I was solving a text classification task with a Bidirectional LSTM, following the pages below.
- https://qiita.com/m__k/items/db1a81bb06607d5b0ec5
- https://tma15.github.io/blog/2020/09/06/%E8%87%AA%E7%84%B6%E8%A8%80%E8%AA%9E%E5%87%A6%E7%90%86lstm%E3%81%AB%E5%9F%BA%E3%81%A5%E3%81%8F%E6%96%87%E6%9B%B8%E5%88%86%E9%A1%9E-pytorch%E3%82%B3%E3%83%BC%E3%83%89%E4%BB%98%E3%81%8D/
When I ran inference with a model trained with the code below, the predictions changed depending on batch_size.
models.py
import numpy as np
import torch
import torch.nn as nn
class LSTMClassifier(nn.Module):
    def __init__(
        self,
        feature_vocab_size,  # vocabulary size (number of words to embed)
        embedding_size,      # size of each embedding vector
        hidden_size,         # hidden size of the LSTM
        tagset_size,         # output size (number of classes)
        num_layers,          # number of LSTM layers
        dropout,             # dropout ratio
        is_bidirectional     # whether to make the LSTM bidirectional
    ):
        super(LSTMClassifier, self).__init__()
        self.is_bidirectional = is_bidirectional
        self.embedding = nn.Embedding(
            num_embeddings=feature_vocab_size,
            embedding_dim=embedding_size,
            padding_idx=0  # index 0 is reserved for padding
        )
        self.lstm = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bias=True,
            batch_first=True,
            bidirectional=is_bidirectional,
            dropout=dropout,
        )
        self.fc = nn.Linear(
            in_features=hidden_size * 2 if is_bidirectional else hidden_size,
            out_features=tagset_size,
        )
        self.logsoftmax = nn.LogSoftmax(dim=1)

    def forward(self, sentence):
        embeds = self.embedding(sentence)
        # h has shape (num_layers * num_directions, batch, hidden_size)
        _, (h, _) = self.lstm(embeds)
        if self.is_bidirectional:
            # h[-2] is the last layer's forward final state, h[-1] the backward one
            hidden = torch.cat((h[-2, :, :], h[-1, :, :]), dim=1)
            dence_output = self.fc(hidden)
            output = self.logsoftmax(dence_output)
        else:
            dence_output = self.fc(h)
            output = self.logsoftmax(dence_output.squeeze())
        return output
inference.py
import pandas as pd
import torch
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm

from models import LSTMClassifier
from utils import sentence2index
from vocab import Vocabulary
# SudachiPreProcessor (used below for tokenization) comes from a project-specific module; its import is omitted here.

tqdm.pandas()  # enable .progress_apply

df = pd.read_csv('test.csv')
df = df[:5]

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# hyperparameters
feature_vocab = Vocabulary.load('feature.dict')
EMBEDDING_DIM = 200
HIDDEN_DIM = 128
TAGSET_SIZE = 2
NUM_LAYERS = 1
DROPOUT = 0
is_bidirectional = True

model = LSTMClassifier(len(feature_vocab), EMBEDDING_DIM, HIDDEN_DIM, TAGSET_SIZE, NUM_LAYERS, DROPOUT, is_bidirectional).to(device)
model.load_state_dict(torch.load('lstm_classifier_weight.pth', map_location=device))
model.eval()

df["sentence_wakati"] = df["sentence"].progress_apply(SudachiPreProcessor(normalize=False).make_wakati)  # morphological analysis (tokenization)
df["sentence_id"] = df["sentence_wakati"].progress_apply(lambda x: torch.tensor(sentence2index(x, feature_vocab), dtype=torch.long))  # convert tokens to IDs
sentence_id_padded = pad_sequence(df["sentence_id"], batch_first=True, padding_value=0).to(device)  # zero-padding

all_outputs = []
batch_size = 3
for i in tqdm(range(0, len(sentence_id_padded), batch_size)):
    batch_data = sentence_id_padded[i:i+batch_size]
    with torch.no_grad():
        batch_output = model(batch_data)
        batch_probs = torch.exp(batch_output)  # back from log-probabilities to probabilities
    all_outputs.append(batch_probs)
lstm_output = torch.cat(all_outputs, dim=0)

df["lstm_prob_1"] = lstm_output[:, 1].cpu().numpy().tolist()
df
Running inference.py with batch_size set to 3 versus 5 gives different results for the same rows.
Since that example involves too many moving parts, let's reproduce the issue with a small, simple input.
Experiment
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence

word_id = [torch.tensor([7, 3, 5]),
           torch.tensor([9, 5]),
           torch.tensor([7, 1, 8, 4, 4])]
This is the input. Zero-padding it gives:
word_id_padded = pad_sequence(
word_id,
batch_first=True,
padding_value=0
)
print(word_id_padded)
print(word_id_padded.shape)
# output
tensor([[7, 3, 5, 0, 0],
[9, 5, 0, 0, 0],
[7, 1, 8, 4, 4]])
torch.Size([3, 5])
Define a simplified version of LSTMClassifier.
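Note that these toy modules are initialized with random weights, so the exact numbers printed below will differ from run to run; what matters is the comparison between the two batch sizes. If you want to reproduce a walkthrough like this exactly, you could seed PyTorch before defining the modules (an optional step, not used for the outputs shown here):

torch.manual_seed(0)  # fixes the random initialization of embedding / bilstm / fc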
embedding = nn.Embedding(10, 5, padding_idx=0)
bilstm = nn.LSTM(
input_size=5,
hidden_size=2,
num_layers=1,
batch_first=True,
bidirectional=True,
dropout = 0
)
fc = nn.Linear(
in_features=2*2,
out_features=2
)
logsoftmax = nn.LogSoftmax(dim=1)
Now feed the input through each layer in turn.
embeds = embedding(word_id_padded)
print(embeds)
# output
tensor([[[ 0.5258, -0.4880, 1.1914, -0.8140, -0.7360],
[ 0.7624, 1.6423, -0.1596, -0.4974, 0.4396],
[ 1.2964, 0.6105, 1.3347, -0.2316, 0.0418],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
[[ 1.8446, -1.1845, 1.3835, -1.2024, 0.7078],
[ 1.2964, 0.6105, 1.3347, -0.2316, 0.0418],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]],
[[ 0.5258, -0.4880, 1.1914, -0.8140, -0.7360],
[-1.2345, -0.0431, -1.6047, -0.7521, 1.6487],
[-0.8371, -0.9224, -0.0635, 0.6756, -0.0978],
[-0.7581, 1.0783, 0.8008, 1.6806, 1.2791],
[-0.7581, 1.0783, 0.8008, 1.6806, 1.2791]]],
grad_fn=<EmbeddingBackward0>)
lstm, (ht, ct) = bilstm(embeds)
print(lstm)
print('------------------------------')
print(ht)
print('------------------------------')
print(ct)
# output
tensor([[[-0.2505, 0.0171, 0.0254, -0.1418],
[-0.3123, -0.0533, -0.0896, -0.0707],
[-0.2907, -0.0187, 0.0544, -0.1012],
[-0.1526, -0.1095, -0.0719, -0.1345],
[-0.1467, -0.1438, -0.0471, -0.1056]],
[[ 0.2302, -0.0285, 0.3353, -0.2002],
[-0.2159, -0.0116, 0.0483, -0.1012],
[-0.1501, -0.1022, -0.0847, -0.1427],
[-0.1470, -0.1401, -0.0719, -0.1345],
[-0.1452, -0.1584, -0.0471, -0.1056]],
[[-0.2505, 0.0171, -0.0439, -0.1206],
[-0.0625, -0.1108, -0.3307, 0.5002],
[-0.0310, -0.1351, -0.0076, 0.0042],
[-0.0240, -0.0487, 0.0045, 0.1633],
[-0.0315, -0.0466, 0.0051, 0.0968]]], grad_fn=<TransposeBackward0>)
------------------------------
tensor([[[-0.1467, -0.1438],
[-0.1452, -0.1584],
[-0.0315, -0.0466]],
[[ 0.0254, -0.1418],
[ 0.3353, -0.2002],
[-0.0439, -0.1206]]], grad_fn=<StackBackward0>)
------------------------------
tensor([[[-0.2872, -0.5081],
[-0.2826, -0.5671],
[-0.4834, -0.4408]],
[[ 0.1156, -0.2991],
[ 1.0408, -0.3255],
[-0.2172, -0.2240]]], grad_fn=<StackBackward0>)
bilstm_out = torch.cat([ht[0], ht[1]], dim=1)
dence_output = fc(bilstm_out)
tag_scores = logsoftmax(dence_output)
print(tag_scores)
# output
tensor([[-0.5184, -0.9050],
[-0.5797, -0.8211],
[-0.4972, -0.9370]], grad_fn=<LogSoftmaxBackward0>)
Next, try the following tensors (the first two sentences from before, without the long third one).
word_id = [torch.tensor([ 7, 3, 5]),
torch.tensor([ 9, 5])]
word_id_padded = pad_sequence(
word_id,
batch_first=True,
padding_value=0
)
print(word_id_padded)
print(word_id_padded.shape)
# output
tensor([[7, 3, 5],
[9, 5, 0]])
torch.Size([2, 3])
embeds = embedding(word_id_padded)
print(embeds)
print(embeds.shape)
# output
tensor([[[ 0.5258, -0.4880, 1.1914, -0.8140, -0.7360],
[ 0.7624, 1.6423, -0.1596, -0.4974, 0.4396],
[ 1.2964, 0.6105, 1.3347, -0.2316, 0.0418]],
[[ 1.8446, -1.1845, 1.3835, -1.2024, 0.7078],
[ 1.2964, 0.6105, 1.3347, -0.2316, 0.0418],
[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]]],
grad_fn=<EmbeddingBackward0>)
torch.Size([2, 3, 5])
Judging from this, the embedding vectors for the two sentences are identical regardless of the batch size (the batch of 2 is just the batch of 3 with less padding), so the embedding layer is not the problem.
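A quick way to check that claim programmatically (a sketch; embeds_batch3 is a hypothetical name, assuming the embeddings from the 3-sentence batch had been kept around before being overwritten):

print(torch.equal(embeds_batch3[:2, :3], embeds))  # True: identical vectors, just less padding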
lstm, (ht, ct) = bilstm(embeds)
print(lstm)
print('------------------------------')
print(ht)
print('------------------------------')
print(ct)
# output
tensor([[[-0.2505, 0.0171, 0.0371, -0.1423],
[-0.3123, -0.0533, -0.0743, -0.0648],
[-0.2907, -0.0187, 0.0826, -0.0983]],
[[ 0.2302, -0.0285, 0.3482, -0.2011],
[-0.2159, -0.0116, 0.0652, -0.1007],
[-0.1501, -0.1022, -0.0471, -0.1056]]], grad_fn=<TransposeBackward0>)
------------------------------
tensor([[[-0.2907, -0.0187],
[-0.1501, -0.1022]],
[[ 0.0371, -0.1423],
[ 0.3482, -0.2011]]], grad_fn=<StackBackward0>)
------------------------------
tensor([[[-0.4802, -0.0926],
[-0.2976, -0.3531]],
[[ 0.1704, -0.2987],
[ 1.1247, -0.3260]]], grad_fn=<StackBackward0>)
bilstm_out = torch.cat([ht[0], ht[1]], dim=1)
dence_output = fc(bilstm_out)
tag_scores = logsoftmax(dence_output)
print(tag_scores)
# output
tensor([[-0.5579, -0.8496],
[-0.5897, -0.8085]], grad_fn=<LogSoftmaxBackward0>)
Comparing this with the batch-of-3 run, the final scores for the same two sentences have changed: in the batch of 3 they were padded out to length 5, in the batch of 2 only to length 3, and both ht and the resulting tag_scores differ between the two runs.
Logically, changing the batch size should not change the prediction for a given sentence, so something is clearly wrong.
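To pin down the cause before looking at the fix, here is a minimal sketch reusing the toy embedding and bilstm defined above: the embedding of padding index 0 is an all-zero vector, but the LSTM still runs its recurrence over those zero steps, so the amount of padding changes the final hidden state ht.

with torch.no_grad():
    s = torch.tensor([[7, 3, 5]])               # the first sentence, no padding
    s_padded = torch.tensor([[7, 3, 5, 0, 0]])  # the same sentence padded to length 5
    _, (ht_plain, _) = bilstm(embedding(s))
    _, (ht_padded, _) = bilstm(embedding(s_padded))
    print(torch.allclose(ht_plain, ht_padded))  # False: the padding steps alone changed ht

Since the maximum sequence length, and therefore the amount of padding, depends on which sentences happen to be in the batch, the final hidden state, and with it the prediction, ends up depending on the batch composition.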
Solution
The following pages describe the fundamental fix.
- https://www.kaggle.com/code/hoangpham51/text-classification-pytorch-torchtext-lstm#Build-Model
- https://towardsdatascience.com/multiclass-text-classification-using-lstm-in-pytorch-eac56baed8df
models.py (excerpt)
# sentence_lengths holds the (unpadded) length of each sentence
def forward(self, sentence, sentence_lengths):
    embeds = self.embedding(sentence)
    # tell the LSTM the true sequence length of each sentence
    embeds_packed = nn.utils.rnn.pack_padded_sequence(embeds, sentence_lengths.cpu(), batch_first=True, enforce_sorted=False)
    _, (h, _) = self.lstm(embeds_packed)
    if self.is_bidirectional:
        hidden = torch.cat((h[-2, :, :], h[-1, :, :]), dim=1)
        dence_output = self.fc(hidden)
        output = self.logsoftmax(dence_output)
    else:
        dence_output = self.fc(h)
        output = self.logsoftmax(dence_output.squeeze())
    return output
It turns out the problem was that, when the padded batch is fed to the LSTM layer, the 0s inserted for padding are not recognized as padding; the LSTM simply runs over them as ordinary time steps. So before feeding the batch into the LSTM layer, we just have to tell it how long each sequence really is, i.e. which trailing 0s are padding, which is exactly what pack_padded_sequence does.
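As a sanity check, here is a sketch that re-runs the toy experiment with packing, reusing the embedding, bilstm, fc and logsoftmax defined above (toy_forward_packed is a helper introduced only for this sketch); with the true lengths supplied, the scores for the first two sentences should no longer depend on whether the long third sentence is in the batch.

from torch.nn.utils.rnn import pack_padded_sequence, pad_sequence

def toy_forward_packed(sentences):
    # pad, embed, pack with the true lengths, then run the toy BiLSTM and classifier head
    lengths = torch.tensor([len(s) for s in sentences])
    padded = pad_sequence(sentences, batch_first=True, padding_value=0)
    packed = pack_padded_sequence(embedding(padded), lengths, batch_first=True, enforce_sorted=False)
    _, (ht, _) = bilstm(packed)
    return logsoftmax(fc(torch.cat([ht[0], ht[1]], dim=1)))

with torch.no_grad():
    scores_batch3 = toy_forward_packed([torch.tensor([7, 3, 5]),
                                        torch.tensor([9, 5]),
                                        torch.tensor([7, 1, 8, 4, 4])])
    scores_batch2 = toy_forward_packed([torch.tensor([7, 3, 5]),
                                        torch.tensor([9, 5])])
    print(torch.allclose(scores_batch3[:2], scores_batch2))  # True (up to floating-point error)

inference.py then needs a matching change: compute the true lengths before padding and pass the corresponding slice to the model in each step of the loop (a sketch, using the column names from inference.py above):

sentence_lengths = torch.tensor([len(x) for x in df["sentence_id"]])  # lengths before padding
batch_output = model(batch_data, sentence_lengths[i:i+batch_size])    # inside the inference loop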
Environment
M1 mac
Python==3.11.6
torch==2.0.1