はじめに

この記事は、機械学習をしていて「これは便利になったな」と感じた工夫を紹介する記事です。題材としてAlexNetを実装しています。

背景

最近機械学習に手を出し始めました。ニューラルネットワークを使った画像認識をやっていたのですが、私のパソコンにはGPUが積まれておらず、まともに学習させられません。なので、Google Colaboratoryを使わせていただいてます。
ところが、学習を実行している途中にネットワークエラーになったり、しばらく放置していてランタイムエラーになってしまったりすることがありますし、途中でいったん学習を止めたいな、ということもありました。そういった際に毎回最初から学習し直すのは非効率だと感じたので、epoch毎にモデルと損失を保存するようにしました。1epochに掛かる時間が極めて短く、すぐに学習が終わる場合は必要ないですが、今回はGoogle Colabを使っていても学習完了に1~2時間かかるので、やってみました。

調べていないですが、多分普通に常套手段ですよね。プログラミング初心者でもわかるような実装なので、そういった方向けです。もっとこうしたら良いというのがあれば教えてください。

概要

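仕組みとしては、epoch毎にモデルを上書き保存し、損失をテキストファイルに書いているだけです。テキストファイル（loss.txt）の1行目には最後に完了したepoch番号を、2行目以降には各epochの損失を1行ずつ記録しています。学習を再開するときは、この1行目を読み取って続きのepochから学習します。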

import os
import torch.nn as nn
import torch
from tqdm import tqdm
import numpy as np
import torch
import torchvision
import torchvision.transforms as transforms
import random

# Mount Google Drive inside the Colab runtime; the checkpoint directory used
# below lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# IPython magic (works only in a notebook cell, not in a plain .py file):
# change the working directory so that relative paths such as
# ./model/parameter.pth resolve inside the project folder.
%cd /content/drive/MyDrive/AlexNet

class Model(nn.Module):
    """AlexNet-style CNN with 10 output classes.

    ``forward`` flattens the feature map to ``256 * 4 * 4`` values per sample,
    so the input size must make the feature extractor produce a 256x4x4 map
    (three 2x2 poolings; e.g. 3x32x32 images).
    """

    def __init__(self):
        """Build the convolutional feature extractor and the classifier head."""
        super().__init__()
        num_classes = 10

        # Feature extractor. The attribute name (and therefore the mangled
        # state_dict prefix "_Model__features") must stay as is so that
        # previously saved checkpoints keep loading.
        conv_layers = [
            nn.Conv2d(3, 64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
        ]
        self.__features = nn.Sequential(*conv_layers)

        # Classifier head (same remark about the attribute name applies).
        dense_layers = [
            nn.Dropout(),
            nn.Linear(256 * 4 * 4, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, num_classes),
        ]
        self.__classifier = nn.Sequential(*dense_layers)

    def features(self, x):
        """Return the convolutional feature map for the batch ``x``."""
        return self.__features(x)

    def forward(self, x):
        """Return the class scores (logits) for the batch ``x``."""
        feature_map = self.features(x)
        flat = feature_map.view(feature_map.size(0), 256 * 4 * 4)
        return self.__classifier(flat)

class AlexNet(object):
    """Train and evaluate ``Model`` with per-epoch checkpointing.

    After every epoch trained with ``update(..., mode=True)`` the weights are
    written to ``parameter.pth`` and the loss log ``loss.txt`` is rewritten.

    ``loss.txt`` layout: line 1 holds the number of the last completed epoch,
    each following line holds the summed batch loss of one epoch.
    """

    def __init__(self, mode=False, model_path="./model/parameter.pth",
                 save_dir='/content/drive/MyDrive/AlexNet/model'):
        """Create the model, loss, optimizer and checkpoint paths.

        Args:
            mode: ``True`` to resume (load weights from ``model_path`` and read
                the last completed epoch from ``loss.txt``); ``False`` to start
                from scratch (an existing ``loss.txt`` is emptied).
            model_path: Checkpoint file loaded when ``mode`` is true.
            save_dir: Directory that receives ``parameter.pth``, ``loss.txt``
                and ``acc.txt``. Created if missing.

        Raises:
            FileNotFoundError: ``mode`` is true and ``model_path`` is missing.
            ValueError: ``mode`` is true and the first line of ``loss.txt`` is
                not an integer.
        """
        # Device: GPU when available, otherwise CPU.
        self.__device = "cuda" if torch.cuda.is_available() else "cpu"
        self.__model = Model().to(self.__device)

        if mode:
            # map_location lets a checkpoint that was saved on a GPU runtime
            # be restored on a CPU-only runtime (and vice versa).
            state = torch.load(model_path, map_location=self.__device)
            self.__model.load_state_dict(state)
            self.__model.eval()

        # Learning rate, loss (cross entropy) and optimizer (plain SGD).
        self.__lr = 1e-3
        self.__loss_func = nn.CrossEntropyLoss()
        self.__opt = torch.optim.SGD(self.__model.parameters(), lr=self.__lr)

        # Output directory; makedirs also creates missing parent directories.
        self.FILE_PATH = save_dir
        os.makedirs(self.FILE_PATH, exist_ok=True)

        self.LOSS_SAVE = os.path.join(self.FILE_PATH, 'loss.txt')
        self.PARAM_SAVE = os.path.join(self.FILE_PATH, 'parameter.pth')

        # Number of the last completed epoch (0 = nothing trained yet).
        self.current_epoch = 0
        if mode:
            self.current_epoch = self._read_last_epoch(self.LOSS_SAVE)
        elif os.path.exists(self.LOSS_SAVE):
            # Fresh run: discard the log of the previous run.
            with open(self.LOSS_SAVE, 'w'):
                pass

    @staticmethod
    def _read_last_epoch(loss_path):
        """Return the epoch on line 1 of ``loss_path``; 0 if missing or empty."""
        try:
            with open(loss_path) as f:
                first_line = f.readline().strip()
        except FileNotFoundError:
            return 0
        return int(first_line) if first_line else 0

    def _save_checkpoint(self, e, epoch_loss):
        """Persist the weights and append ``epoch_loss`` to the loss log.

        Both files are written to a temporary name and then moved into place
        with ``os.replace`` so that an interruption cannot leave a truncated
        file behind. The weights are stored before the log so the epoch number
        in ``loss.txt`` never runs ahead of ``parameter.pth``.
        """
        param_tmp = self.PARAM_SAVE + '.tmp'
        torch.save(self.__model.state_dict(), param_tmp)
        os.replace(param_tmp, self.PARAM_SAVE)

        # Keep the losses already logged (everything after the epoch line).
        try:
            with open(self.LOSS_SAVE) as f:
                logged_losses = f.readlines()[1:]
        except FileNotFoundError:
            logged_losses = []

        loss_tmp = self.LOSS_SAVE + '.tmp'
        with open(loss_tmp, 'w') as f:
            f.write(str(e) + '\n')
            f.writelines(logged_losses)
            f.write(str(epoch_loss) + '\n')
        os.replace(loss_tmp, self.LOSS_SAVE)

    def update(self, data, mode=False, epoch=100):
        """Train until ``epoch`` epochs in total have been completed.

        Training continues after ``self.current_epoch``; e.g. with 40 epochs
        already done and ``epoch=100`` the epochs 41..100 are run.

        Args:
            data: Iterable of ``(X, y)`` batches.
            mode: ``True`` to save the weights and the loss after each epoch.
            epoch: Total number of epochs to reach.
        """
        # The model may be in eval mode (it is after resuming from a
        # checkpoint); training must run with Dropout enabled.
        self.__model.train()

        for e in range(self.current_epoch + 1, epoch + 1):
            sum_log = 0
            for X, y in data:
                X = X.to(self.__device)
                y = y.to(self.__device)

                # Forward pass and cross-entropy loss against the labels.
                pred_y = self.__model(X)
                loss = self.__loss_func(pred_y, y)

                # Reset gradients, back-propagate, apply the SGD step.
                self.__opt.zero_grad()
                loss.backward()
                self.__opt.step()

                sum_log += loss.item()

            print(f'epoch:{e}, loss:{sum_log}')

            # Keep the in-memory counter in step with the log so that calling
            # update() again on this object continues instead of restarting.
            self.current_epoch = e

            if mode:
                self._save_checkpoint(e, sum_log)

    def test_accuracy(self, data, mode=False):
        """Return the classification accuracy in percent on ``data``.

        Args:
            data: Iterable of ``(X, y)`` batches.
            mode: ``True`` to also write the accuracy to ``acc.txt``.

        Returns:
            Accuracy in percent; ``0.0`` when ``data`` yields no samples.
        """
        data = tqdm(data)
        correct = 0
        total = 0

        # Evaluate with Dropout disabled, then restore the previous mode.
        was_training = self.__model.training
        self.__model.eval()
        try:
            with torch.no_grad():
                for X, y in data:
                    X = X.to(self.__device)
                    y = y.to(self.__device)
                    pred = self.__model(X)
                    correct += (pred.argmax(dim=1) == y).sum().item()
                    total += y.size(0)
        finally:
            self.__model.train(was_training)

        # Accuracy [%]; guard against an empty data set.
        acc = 100 * (correct / total) if total else 0.0

        print("\n ====================== \n")
        print(f"acc:{acc}")
        print("\n ====================== \n")

        if mode:
            np.savetxt(os.path.join(self.FILE_PATH, 'acc.txt'), [acc])

        return acc

# Fix the seed of every random number generator in use (PyTorch CPU,
# PyTorch CUDA, NumPy, Python's random) so that runs are repeatable.
seed = 1000
for seed_rng in (torch.manual_seed, torch.cuda.manual_seed_all,
                 np.random.seed, random.seed):
    seed_rng(seed)

# cuDNN: turn off the auto-tuner and request deterministic algorithms.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Training data: CIFAR-10 training split, converted to tensors and
# normalised per channel with mean 0.5 and std 0.5.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])

trainset = torchvision.datasets.CIFAR10(
    root='data',
    train=True,
    transform=transform,
    download=True,
)
trainloader = torch.utils.data.DataLoader(
    trainset,
    shuffle=True,
    batch_size=4,
    num_workers=2,
)
print('train_dataset = ', len(trainset))

# Batches fed to the trainer
data = trainloader

# mode=True: load the saved weights and continue from the last logged epoch
cnn = AlexNet(mode=True)

# Train; mode=True saves the weights and the loss after every epoch
cnn.update(data, epoch=100, mode=True)

# Test data: CIFAR-10 test split with the same tensor conversion and
# per-channel normalisation (mean 0.5, std 0.5).
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])

testset = torchvision.datasets.CIFAR10(
    root="./data",
    train=False,
    transform=transform,
    download=True,
)
testloader = torch.utils.data.DataLoader(
    testset,
    shuffle=False,
    batch_size=4,
    num_workers=2,
)

# Batches fed to the evaluation
data = testloader

# Rebuild the network from the saved checkpoint
cnn = AlexNet(model_path="./model/parameter.pth", mode=True)

# Measure the accuracy on the test data; mode=True also writes it to acc.txt
cnn.test_accuracy(data, mode=True)

今回私が実装したのは、

        # ファイル path
        self.LOSS_SAVE=os.path.join(self.FILE_PATH, 'loss.txt')
        self.current_epoch = 0
        if os.path.exists(self.LOSS_SAVE):
            if not mode:
                with open(self.LOSS_SAVE, 'r+') as f:
                    f.truncate(0)
            else:
                with open(self.LOSS_SAVE) as f:
                    self.current_epoch = int(f.readline().rstrip())
        self.PARAM_SAVE=os.path.join(self.FILE_PATH, 'parameter.pth')

の部分と、

            # 損失,モデル保存
            if mode:
                # ファイルが存在しない場合または空の場合、"e"を書き込む
                if not os.path.exists(self.LOSS_SAVE) or os.stat(self.LOSS_SAVE).st_size == 0:
                    with open(self.LOSS_SAVE, 'w') as file:
                        file.write(str(e)+'\n')

                # ファイルを読み取りモードで開き、現在の行数を取得する
                with open(self.LOSS_SAVE, 'r') as file:
                    lines = file.readlines()
                    line_count = len(lines)

                # 2回目以降の書き込みの際、1行目のみを更新する
                with open(self.LOSS_SAVE, 'w') as file:
                    file.write(str(e)+'\n')
                    file.writelines(lines[1:])
                    file.write(str(sum_log) + '\n')
                # 学習したパラメータを保存
                torch.save(self.__model.state_dict(), self.PARAM_SAVE)

ここら辺です。
この実装のおかげで、

# Build the network; mode=True resumes from the saved checkpoint
cnn=AlexNet(mode = True)
# Train; mode=True saves the model and the loss after every epoch
cnn.update(data, mode=True, epoch = 100)

で、cnn=AlexNet(mode = False)にすれば最初からの学習になり（既存のloss.txtは空にされます）、続きから学習したい場合はcnn=AlexNet(mode = True)にするだけでよくなりました。便利！

おわりに

まあ大したことはしてないですけど、満足する形で実装できてよかったです。
何かこうしたらいいよというのがあれば、どしどしコメントください。

機械学習でepoch毎に損失とモデルを保存する

はじめに

背景

概要

おわりに