
All inference results from a CNN built with PyTorch are identical

What I want to solve

I built a CNN with PyTorch and want to run inference on test data after training. However, I get the same output for every sample in the dataset.

Current situation

For my research, I am trying to build a CNN that computes an object's eigenvalue from an image of it. The network takes a 224×224 pixel image as input and outputs the first-mode eigenvalue. The architecture is modeled on VGG19, and the loss function is MAE. Judging from the loss history, training appears to be progressing, but after training finishes the model returns the same output no matter which data I feed it. I suspect the weight parameters are either not being updated or are being reset somewhere (all weights zero, so everything downstream produces the same output).
As a test, I inspected the parameter values with

model.state_dict()

but I could not find the cause...
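
For reference, the kind of comparison I have in mind looks like this (a minimal sketch; "before" is just an illustrative name): snapshot state_dict() before training and report the largest element-wise change per tensor afterwards. If every delta stays at 0, the weights really are not being updated.

# Minimal sketch: snapshot the parameters before training, then report
# the largest element-wise change in each tensor after training.
before = {k: v.detach().clone() for k, v in model.state_dict().items()}

# ... run the training loop here ...

for name, tensor in model.state_dict().items():
    delta = (tensor.float() - before[name].float()).abs().max().item()
    print(f"{name}: max |change| = {delta:.3e}")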

Environment

PyTorch 1.12.0+cu116
Python 3.8.10

Observed problem / error

Below is the output of running inference on several data samples. The output is identical across different inputs.

>>> y_eval
tensor([[-0.0040],
        [-0.0040],
        [-0.0040],
        [-0.0040],
           ...(omitted)...
        [-0.0040],
        [-0.0040],
        [-0.0040],
        [-0.0040]], device='cuda:0', grad_fn=<AddmmBackward0>)

Relevant source code

Sorry for pasting the whole script in one block...

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchsummary import summary
import torchvision
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import time

#Parameter
DATA_DIR = "../../../pinn/data/supervised/"
TEST_SIZE = 0.2
BATCH_SIZE = 1
LEARNING_RATE = 0.001
EPOCH = 200
PIXEL = 224

#Select device
def get_device(use_gpu):
    if use_gpu and torch.cuda.is_available():
        torch.backends.cudnn.deterministic = True
        return torch.device("cuda")

    else:
        return torch.device("cpu")
device = get_device(use_gpu=True)


#Input & Output
X = np.load(DATA_DIR + "input.npy")
X = np.reshape(X, (X.shape[0], 1, PIXEL, PIXEL))
X = torch.tensor(X, dtype=torch.float)
y = np.load(DATA_DIR + "output.npy")
y = np.reshape(y, (y.shape[0], 1))
y = torch.tensor(y, dtype=torch.float)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=0)

train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
train_dataset = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataset = torch.utils.data.TensorDataset(X_test, y_test)
test_dataset = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)

#Make Model
class VGG19(nn.Module):
    def __init__(self, input_image_channels):
        super(VGG19, self).__init__()
        self.conv1 = nn.Conv2d(input_image_channels, 64, kernel_size=(3, 3), stride=(1, 1), padding=1)
        self.conv2 = nn.Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=1)
        self.conv4 = nn.Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=1)
        self.conv5 = nn.Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=1)
        self.conv6 = nn.Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=1)
        self.conv7 = nn.Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=1)
        self.conv8 = nn.Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=1)

        self.max_pool = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))

        self.bn1 = nn.BatchNorm2d(64)
        self.bn2 = nn.BatchNorm2d(128)
        self.bn3 = nn.BatchNorm2d(256)
        self.bn4 = nn.BatchNorm2d(512)
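        # (note: these BatchNorm layers are defined but never called in forward())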

        self.fc1 = nn.Linear(7*7*512, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 64)
        self.fc4 = nn.Linear(64, 16)
        self.fc5 = nn.Linear(16, 1)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = self.max_pool(x)

        x = F.relu(self.conv3(x))
        x = F.relu(self.conv4(x))
        x = self.max_pool(x)

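        # note: conv6 here (and conv8 in the blocks below) is applied several
        # times in a row, so those repeated applications share one set of
        # weights; VGG19 proper has a distinct conv layer at each position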
        x = F.relu(self.conv5(x))
        x = F.relu(self.conv6(x))
        x = F.relu(self.conv6(x))
        x = F.relu(self.conv6(x))
        x = self.max_pool(x)

        x = F.relu(self.conv7(x))
        x = F.relu(self.conv8(x))
        x = F.relu(self.conv8(x))
        x = F.relu(self.conv8(x))
        x = self.max_pool(x)

        x = F.relu(self.conv8(x))
        x = F.relu(self.conv8(x))
        x = F.relu(self.conv8(x))
        x = F.relu(self.conv8(x))
        x = self.max_pool(x)

        x = x.view(-1, 7*7*512)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        x = self.fc5(x)
        return x

class MAELoss(nn.Module):
    def __init__(self): 
        super(MAELoss, self).__init__()

    def forward(self, outputs, targets):

        loss = torch.mean(torch.abs(outputs - targets))
        return loss

# Model Check
model = VGG19(1).to(device)
summary(model, input_size=(1, 224, 224))

# Loss & Optimizer
loss_fn = MAELoss()
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)

train_loss_value=[] 
train_eval_value=[]  
test_loss_value=[]  
test_eval_value=[]    

# Compile
for epoch in range(EPOCH):
    model.train()
    running_train_loss = 0.0

    torch.cuda.synchronize()
    start = time.time()
    with torch.set_grad_enabled(True):
        for (input, true) in train_dataset:
            input, true = input.to(device), true.to(device)
            optimizer.zero_grad()
            output = model(input)
            loss = loss_fn(output, true)
            running_train_loss += loss.item()
            loss.backward()
            optimizer.step()

    train_loss_value.append(running_train_loss * BATCH_SIZE / len(train_dataset))

    model.eval()
    running_test_loss = 0.0
    with torch.set_grad_enabled(False):
        for (input, true) in test_dataset:
            input, true = input.to(device), true.to(device)
            optimizer.zero_grad()
            output = model(input)
            loss = loss_fn(output, true)
            running_test_loss += loss.item()

    test_loss_value.append(running_test_loss * BATCH_SIZE / len(test_dataset))

    torch.cuda.synchronize()
    elapsed_time = time.time() - start

    print('#EPOCH:{}\ttrain loss: {}\tvalid loss: {}\ttime: {}'.format(epoch, running_train_loss * BATCH_SIZE / len(train_dataset), running_test_loss * BATCH_SIZE / len(test_dataset), elapsed_time))

# Plot history
fig,[ax1,ax2] =plt.subplots(1,2,figsize=(12.5,5))
fontsize = 16
labelsize = 15
c1, c2, c3, c4 = "blue", "red", "blue", "red"
l1, l2,l3, l4 = "train", "validation", "train", "validation"
epoch_list = np.arange(EPOCH) + 1

ax1.plot(epoch_list, train_loss_value, color=c1, label=l1)
ax1.plot(epoch_list, test_loss_value, color=c2, label=l2)
ax1.set_title("Loss Function", fontsize=fontsize)
ax1.set_xlabel('Epoch', fontsize=fontsize)
ax1.set_ylabel('MAE', fontsize=fontsize)
ax1.legend(loc = 'upper right')
ax1.tick_params(axis='x', labelsize=labelsize)
ax1.tick_params(axis='y', labelsize=labelsize)
ax1.grid()

# ax2.plot(epoch_list, train_eval_value, color=c3, label=l3)
# ax2.plot(epoch_list, test_eval_value, color=c4, label=l4)
ax2.set_title("Evaluate Function", fontsize=fontsize)
ax2.set_xlabel('Epoch', fontsize=fontsize)
ax2.set_ylabel('MAPE[%]', fontsize=fontsize)
ax2.legend(loc = 'upper right')
ax2.tick_params(axis='x', labelsize=labelsize)
ax2.tick_params(axis='y', labelsize=labelsize)
ax2.grid()

plt.show()


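# note: calling the model directly here still builds a computation graph,
# which is why the printed tensor carries grad_fn=<AddmmBackward0>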
model.eval()
y_eval = model(X.to(device))
print(y_eval)

What I have tried myself

I tried changing the architecture to a simple NN with two fully connected layers, and I swapped out the optimizer, the loss function, and various other parameters, but the output was still the same for every sample. My guess is that the parameter updates are not working correctly during training, but I have no idea where the problem is occurring. Any help would be appreciated.
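
A check like the following (a minimal sketch, assuming the training loop above) should at least reveal whether any gradients reach the early layers:

# Minimal sketch: inside the training loop, immediately after loss.backward(),
# print the gradient norm of every parameter.
for name, param in model.named_parameters():
    if param.grad is None:
        print(f"{name}: no gradient")
    else:
        print(f"{name}: grad norm = {param.grad.norm().item():.3e}")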
