Using PyTorch Lightning
These notes summarize what I learned from the following YouTube playlist.
https://www.youtube.com/playlist?list=PLhhyoLH6IjfyL740PTuXef4TstxAK6nGP
Modules used
import torch
import torch.nn.functional as F
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch import nn, optim
from torch.utils.data import DataLoader
from tqdm import tqdm
from torch.utils.data import random_split
The PyTorch code
The following PyTorch code will be rewritten with PyTorch Lightning.
# block1------
class NN(nn.Module):
    def __init__(self, input_size, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_size, 50)
        self.fc2 = nn.Linear(50, num_classes)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x
# ---- end block1
# set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Hyperparameters
input_size = 784
num_classes = 10
learning_rate = 0.001
batch_size = 64
num_epochs = 3
# block 4 ------
# Load Data
entire_dataset = datasets.MNIST(root="dataset/", train=True, transform=transforms.ToTensor(), download=True)
train_ds, val_ds = random_split(entire_dataset, [50000, 10000])
test_ds = datasets.MNIST(root="dataset/", train=False, transform=transforms.ToTensor(), download=True)
train_loader = DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_ds, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(dataset=test_ds, batch_size=batch_size, shuffle=False)
# ---- end block4
# block 5 ----
# Initialize network
model = NN(input_size=input_size, num_classes=num_classes).to(device)
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# -----end block5
# block 2 ----
# Train Network
for epoch in range(num_epochs):
    for batch_idx, (data, targets) in enumerate(tqdm(train_loader)):
        # --- training step ---
        # Get data to cuda if possible
        data = data.to(device=device)
        targets = targets.to(device=device)
        # Get to correct shape
        data = data.reshape(data.shape[0], -1)
        # Forward
        scores = model(data)
        loss = criterion(scores, targets)
        # Backward
        optimizer.zero_grad()
        loss.backward()
        # Gradient descent or adam step
        optimizer.step()
        # --- end training step ---
# --- end block 2
# block 3 ----
# Check accuracy on training & test to see how good our model is
def check_accuracy(loader, model):
    num_correct = 0
    num_samples = 0
    model.eval()
    # we don't need to keep track of gradients here,
    # so we wrap it in torch.no_grad()
    with torch.no_grad():
        # Loop through the data
        for x, y in loader:
            # Move data to device
            x = x.to(device=device)
            y = y.to(device=device)
            # Get to correct shape
            x = x.reshape(x.shape[0], -1)
            # Forward pass
            scores = model(x)
            _, predictions = scores.max(1)
            # Check how many we got correct
            num_correct += (predictions == y).sum()
            # Keep track of number of samples
            num_samples += predictions.size(0)
    model.train()
    return num_correct / num_samples
# ---- end block 3
# Check accuracy on training, validation & test sets to see how good our model is
model.to(device)
print(f"Accuracy on training set: {check_accuracy(train_loader, model)*100:.2f}")
print(f"Accuracy on validation set: {check_accuracy(val_loader, model)*100:.2f}")
print(f"Accuracy on test set: {check_accuracy(test_loader, model)*100:.2f}")
Rewriting block by block
Fixing block 1
- Content of block 1: the deep-learning model definition
# Fix block 1 -> change it to a LightningModule -> block 1' (prime)
import lightning.pytorch as pl

class NN(pl.LightningModule):
    def __init__(self, input_size, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_size, 50)
        self.fc2 = nn.Linear(50, num_classes)
        # Added
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

    # Added (this defines the training step from block 2; the zero_grad() and backward() calls become unnecessary)
    def training_step(self, batch, batch_idx):
        # --- (1) start ---
        x, y = batch
        x = x.reshape(x.size(0), -1)
        scores = self.forward(x)
        loss = self.loss_fn(scores, y)  # or: loss = F.cross_entropy(scores, y)
        self.log('training_loss', loss)
        # --- (1) end ---
        # the code inside (1) can be replaced with: loss, scores, y = self._common_step(batch, batch_idx)
        return loss

    # Added
    def validation_step(self, batch, batch_idx):
        loss, scores, y = self._common_step(batch, batch_idx)
        self.log('val_loss', loss)
        return loss

    # Added
    def test_step(self, batch, batch_idx):
        loss, scores, y = self._common_step(batch, batch_idx)
        self.log('test_loss', loss)
        return loss

    # Added
    def _common_step(self, batch, batch_idx):
        x, y = batch
        x = x.reshape(x.size(0), -1)
        scores = self.forward(x)
        loss = self.loss_fn(scores, y)
        return loss, scores, y

    # Added
    def configure_optimizers(self):
        return optim.Adam(self.parameters(), lr=0.001)

    # Added
    def predict_step(self, batch, batch_idx):
        x, y = batch
        x = x.reshape(x.size(0), -1)
        scores = self.forward(x)
        preds = torch.argmax(scores, dim=1)
        return preds
- The following are newly defined inside the class:
  - the loss function
  - the training step
  - the validation step
  - the test step
  - batch handling
  - the optimization step
  - the prediction function
- By defining these methods here, the corresponding boilerplate code no longer has to be written elsewhere (one more optional hook is sketched after this list).
- For the full set of methods that can be defined, check the LightningModule documentation.
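- For example, a minimal sketch of one more optional hook, on_validation_epoch_end(), which Lightning calls once after each validation epoch (trainer.callback_metrics and current_epoch are standard LightningModule attributes; the print is just for illustration):
# inside class NN(pl.LightningModule)
def on_validation_epoch_end(self):
    # read back the epoch-level 'val_loss' aggregated by self.log() in validation_step()
    val_loss = self.trainer.callback_metrics.get("val_loss")
    print(f"epoch {self.current_epoch}: val_loss = {val_loss}")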
Fixing block 2
- Content of block 2: the training step
- With the changes to class NN() above, block 2 (Train Network) is reduced as follows, and the training loop is handed over to pl.Trainer()
# Fix block 2 as follows -> block 2' (prime)
trainer = pl.Trainer(accelerator="gpu", devices=[0], min_epochs=1, max_epochs=3, precision=16)
trainer.fit(model, train_loader, val_loader)
trainer.validate(model, val_loader)
trainer.test(model, test_loader)
Deleting block 3 and using torchmetrics
- Content of block 3: computing accuracy
- Along with deleting block 3, training_step() in block 1' is revised as well (adding the accuracy() and f1_score() metrics)
# First, delete block 3
# Add the following modules for the metrics
import torchmetrics
from torchmetrics import Metric
# Next, add the following to __init__ of block 1' (class NN()) (attach the metric objects to the NN class)
self.accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=num_classes)
self.f1_score = torchmetrics.F1Score(task="multiclass", num_classes=num_classes)
# Then rewrite training_step() of block 1' (class NN()) as follows
def training_step(self, batch, batch_idx):
    loss, scores, y = self._common_step(batch, batch_idx)
    # added code below
    accuracy = self.accuracy(scores, y)
    f1_score = self.f1_score(scores, y)
    self.log_dict({'train_loss': loss, 'train_accuracy': accuracy, 'train_f1score': f1_score}, on_step=False, on_epoch=True, prog_bar=True)
    return {'loss': loss, 'scores': scores, 'y': y}
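- A side note on the self.log()/self.log_dict() calls: by default Lightning typically writes these metrics under lightning_logs/ via a TensorBoard logger. A minimal sketch of configuring the logger explicitly (the names "tb_logs" and "mnist" are arbitrary choices for this example):
from lightning.pytorch.loggers import TensorBoardLogger

# send the logged metrics to tb_logs/mnist/ instead of the default directory
logger = TensorBoardLogger("tb_logs", name="mnist")
trainer = pl.Trainer(logger=logger, accelerator="gpu", devices=[0], max_epochs=3)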
- The above calls the built-in metrics (Accuracy(), F1Score()); if you want a custom metric instead, an example follows
class MyAccuracy(Metric):
    def __init__(self):
        super().__init__()
        self.add_state("total", default=torch.tensor(0), dist_reduce_fx="sum")
        self.add_state("correct", default=torch.tensor(0), dist_reduce_fx="sum")

    def update(self, preds, target):
        preds = torch.argmax(preds, dim=1)
        assert preds.shape == target.shape
        self.correct += torch.sum(preds == target)
        self.total += target.numel()

    def compute(self):
        return self.correct.float() / self.total.float()
# If you create a custom Metric like this, add the following to __init__ of class NN()
self.my_accuracy = MyAccuracy()
# and change the corresponding line in training_step() of class NN() to
accuracy = self.my_accuracy(scores, y)
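- As a quick sanity check, the custom metric can also be driven by hand, outside Lightning; a minimal sketch with dummy data:
# standalone check of MyAccuracy with dummy predictions (3 samples, 2 classes)
metric = MyAccuracy()
preds = torch.tensor([[0.9, 0.1], [0.2, 0.8], [0.7, 0.3]])
target = torch.tensor([0, 1, 1])
metric.update(preds, target)
print(metric.compute())  # tensor(0.6667) -- 2 of the 3 argmax predictions match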
Fixing block 4
- Content of block 4: loading the data and setting up the DataLoaders
- Change it to a Lightning DataModule
class MnistDataModule(pl.LightningDataModule):
    def __init__(self, data_dir, batch_size, num_workers):
        super().__init__()
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.num_workers = num_workers

    def prepare_data(self):  # not needed if the data is already downloaded
        datasets.MNIST(self.data_dir, train=True, download=True)
        datasets.MNIST(self.data_dir, train=False, download=True)

    def setup(self, stage):
        entire_dataset = datasets.MNIST(root=self.data_dir, train=True, transform=transforms.ToTensor(), download=False)
        self.train_ds, self.val_ds = random_split(entire_dataset, [50000, 10000])
        self.test_ds = datasets.MNIST(root=self.data_dir, train=False, transform=transforms.ToTensor(), download=False)

    def train_dataloader(self):
        # this is torch.utils.data's DataLoader
        return DataLoader(self.train_ds, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.val_ds, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=False)

    def test_dataloader(self):
        return DataLoader(self.test_ds, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=False)
- This also removes block 5 and changes block 2' as follows
# Change block 2' as follows
model = NN(input_size=input_size, num_classes=num_classes)
dm = MnistDataModule(data_dir="dataset/", batch_size=batch_size, num_workers=4)
trainer = pl.Trainer(accelerator="gpu", devices=1, min_epochs=1, max_epochs=3, precision=16)
trainer.fit(model, dm)
trainer.validate(model, dm)
trainer.test(model, dm)
# With this, block 5 is deleted as well.
Summary
- The fixes above are organized into the following four files:
  - config.py
  - dataset.py
  - model.py
  - train.py
config.py
- Defines the configuration values
## config.py
# Training hyperparameters
INPUT_SIZE = 784
NUM_CLASSES = 10
LEARNING_RATE = 0.001
BATCH_SIZE = 64
NUM_EPOCHS = 3
# Dataset
DATA_DIR = 'dataset/'
NUM_WORKERS = 4
# Compute related
ACCELERATOR = 'gpu'
DEVICES = [0]
PRECISION = 16
dataset.py
- Downloads and sets up the data
## dataset.py
# Data download and setup
import torch
import torch.nn.functional as F
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch import nn, optim
from torch.utils.data import DataLoader
from torch.utils.data import random_split
# import pytorch_lightning as pl
import lightning.pytorch as pl
class MnistDataModule(pl.LightningDataModule):
    def __init__(self, data_dir, batch_size, num_workers):
        super().__init__()
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.num_workers = num_workers

    def prepare_data(self):
        datasets.MNIST(self.data_dir, train=True, download=True)
        datasets.MNIST(self.data_dir, train=False, download=True)

    def setup(self, stage):
        entire_dataset = datasets.MNIST(root=self.data_dir, train=True, transform=transforms.ToTensor(), download=False)
        self.train_ds, self.val_ds = random_split(entire_dataset, [50000, 10000])
        self.test_ds = datasets.MNIST(root=self.data_dir, train=False, transform=transforms.ToTensor(), download=False)

    def train_dataloader(self):
        return DataLoader(self.train_ds, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.val_ds, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=False)

    def test_dataloader(self):
        return DataLoader(self.test_ds, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=False)
- Differences from plain PyTorch in the code above
- In plain PyTorch, the loaded data was passed to a separate DataLoader() instance, which was then iterated over in a loop. With pl.LightningDataModule, the dataloader methods are defined inside the class and wrap the DataLoader instances, so the data and its batching are bundled into a single object (the methods can also be called by hand, as sketched below).
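- pl.Trainer calls prepare_data(), setup(), and the *_dataloader() methods at the right moments during fit/validate/test, but calling them by hand is a handy way to check the data pipeline; a minimal sketch:
# manually stepping through the DataModule (the Trainer normally does this internally)
dm = MnistDataModule(data_dir="dataset/", batch_size=64, num_workers=4)
dm.prepare_data()        # download MNIST once
dm.setup(stage="fit")    # build the 50000/10000 train/val split
x, y = next(iter(dm.train_dataloader()))
print(x.shape, y.shape)  # torch.Size([64, 1, 28, 28]) torch.Size([64])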
model.py
- Defines the LightningModule class
## model.py
import torch
import torch.nn.functional as F
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch import nn, optim
from torch.utils.data import DataLoader
from tqdm import tqdm
import lightning.pytorch as pl
import torchmetrics
from torchmetrics import Metric
class NN(pl.LightningModule):
    def __init__(self, input_size, learning_rate, num_classes):
        super().__init__()
        self.lr = learning_rate
        self.fc1 = nn.Linear(input_size, 50)
        self.fc2 = nn.Linear(50, num_classes)
        self.loss_fn = nn.CrossEntropyLoss()
        self.accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=num_classes)
        self.f1_score = torchmetrics.F1Score(task="multiclass", num_classes=num_classes)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

    def training_step(self, batch, batch_idx):
        loss, scores, y = self._common_step(batch, batch_idx)
        accuracy = self.accuracy(scores, y)
        f1_score = self.f1_score(scores, y)
        self.log_dict({'train_loss': loss, 'train_accuracy': accuracy, 'train_f1_score': f1_score}, on_step=False, on_epoch=True, prog_bar=True)
        return {'loss': loss, 'scores': scores, 'y': y}

    def validation_step(self, batch, batch_idx):
        loss, scores, y = self._common_step(batch, batch_idx)
        self.log('val_loss', loss)
        return loss

    def test_step(self, batch, batch_idx):
        loss, scores, y = self._common_step(batch, batch_idx)
        self.log('test_loss', loss)
        return loss

    def _common_step(self, batch, batch_idx):
        x, y = batch
        x = x.reshape(x.size(0), -1)
        scores = self.forward(x)
        loss = self.loss_fn(scores, y)
        return loss, scores, y

    def predict_step(self, batch, batch_idx):
        x, y = batch
        x = x.reshape(x.size(0), -1)
        scores = self.forward(x)
        preds = torch.argmax(scores, dim=1)
        return preds

    def configure_optimizers(self):
        return optim.Adam(self.parameters(), lr=self.lr)
- In the PyTorch Lightning version above, the training step, loss computation, prediction, and so on, which are written in scattered places in plain PyTorch, are contained as methods of the pl.LightningModule class.
- Furthermore, passing this NN class (a subclass of LightningModule) to a pl.Trainer object makes the batch loop run internally, with no explicit for loop required.
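- Prediction follows the same pattern: the predict_step() defined above is driven by trainer.predict(). A minimal sketch, assuming the model, dm, and trainer objects from the revised block 2' (the test dataloader is reused here, since the DataModule defines no predict_dataloader()):
# run predict_step() over the test set; trainer.predict() returns one tensor per batch
dm.prepare_data()
dm.setup(stage="test")
preds = trainer.predict(model, dataloaders=dm.test_dataloader())
print(torch.cat(preds).shape)  # torch.Size([10000])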
train.py
- This imports and runs the config.py, dataset.py, and model.py created above
## train.py
import torch
# import pytorch_lightning as pl
import lightning.pytorch as pl
from model import NN
from dataset import MnistDataModule
import config
if __name__ == "__main__":
    # load the pl.LightningModule
    model = NN(
        input_size=config.INPUT_SIZE,
        learning_rate=config.LEARNING_RATE,
        num_classes=config.NUM_CLASSES,
    )
    # load the pl.LightningDataModule
    dm = MnistDataModule(
        data_dir=config.DATA_DIR,
        batch_size=config.BATCH_SIZE,
        num_workers=config.NUM_WORKERS,
    )
    # create the pl.Trainer instance
    trainer = pl.Trainer(
        accelerator=config.ACCELERATOR,
        devices=config.DEVICES,
        min_epochs=1,
        max_epochs=3,
        precision=config.PRECISION,
    )
    trainer.fit(model, dm)
    trainer.validate(model, dm)
    trainer.test(model, dm)
- Thanks to the pl.Trainer class, there is no longer any need to write the per-batch loop that plain PyTorch required.
- Likewise, optimizer.zero_grad(), which was mandatory in plain PyTorch, is now executed automatically and no longer has to be written.
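- As a closing note, trainer.fit() also saves checkpoints automatically (by default under lightning_logs/version_*/checkpoints/). A minimal sketch of restoring a trained model; the checkpoint path below is a placeholder, and since NN() does not call self.save_hyperparameters(), the constructor arguments must be passed again:
# restore a trained model from a checkpoint written by trainer.fit()
ckpt_path = "lightning_logs/version_0/checkpoints/last.ckpt"  # placeholder path
model = NN.load_from_checkpoint(
    ckpt_path,
    input_size=config.INPUT_SIZE,
    learning_rate=config.LEARNING_RATE,
    num_classes=config.NUM_CLASSES,
)
model.eval()  # switch to inference mode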