More than 3 years have passed since last update.

【Image Classification】Pytorch VGG19 modelを使ってFashion Imageを分類

Last updated at 2021-09-18Posted at 2021-09-17

概要

Fashionデータを利用してカバン、シャツ、スカートなどイメージを分類しました。

構成

データパイプライン : データをロードする時に負担を緩和するためにパイプライン構築
早めに学習終了設定 : 学習改善があまり進めても出来ない場合は早めに学習終了を設定する。
TensorBoard設定 : TensorBoardと連結して学習進行状態を見る。

ディレクトリ

以下のようにdatasetディレクトリにバッグ、シャツ、スカートなどフォルダが分かれてるし、画像ファイルがあります。

├─dataset
│  ├─bag
│  ├─shirt
│  └─skirt
└─runs
    └─fashion_classification

データをロード

import os
import time
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import random 

import torch
import torch.nn as nn
from torch import optim, cuda
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from torchvision import datasets, models, transforms
from torchvision.utils import make_grid
import matplotlib.pyplot as plt

from torchsummary import summary

BASE_DIR = os.getcwd()
DATASET_PATH = os.path.join(BASE_DIR, 'dataset')
MODE_PATH = os.path.join(BASE_DIR, 'models')

npyファイルをチェックする理由は最初の訓練時はイメージでnpyファイルを生成し次の訓練の時からnpyファイルをロードして時間を節約するため。

classes = []
dataset = []

for fullpath, dirnames, filenames in os.walk(DATASET_PATH):
    if DATASET_PATH == fullpath:
        classes = dirnames
        continue        
        
    label_name = fullpath.split(os.path.sep)[-1]        
    for filename in filenames:    
        if filename.find('.npy') != -1:
            continue
        class_index = classes.index(label_name)
        dataset.append((os.path.join(fullpath, filename), class_index))

random.shuffle(dataset) 

total_len = len(dataset)
train_ratio = int(total_len * 0.8)

train_data = dataset[:train_ratio]
test_data = dataset[train_ratio:]

データパイプライン

すべてのデータを取得してメモリが負荷することがある。そのために訓練時バッチサイズだけ取得してデータを加工する。
最初の訓練時のイメージのパスでnpyファイルを作成し、次の訓練の時からnpyファイルだけをロードして、時間をデータの読み込み時間を節約する。

class Dataset(object):
    def __init__(self, transforms, dataset):
        self.transforms = transforms
        self.dataset = dataset

    def __getitem__(self, idx):
        img_path, label = self.dataset[idx]        
        
        fullpath, extension = os.path.splitext(img_path)
        npy_path = fullpath + '.npy'
        
        if os.path.exists(npy_path):
            np_data = np.load(npy_path)
            img = Image.fromarray(np.uint8(np_data))
        else:
            img = Image.open(img_path).convert("RGB")  
            np_data = np.asarray(img)
            np.save(npy_path, np_data)
        
        if self.transforms is not None:
            img = self.transforms(img)                
        return img, label

    def __len__(self): 
        return len(self.dataset)

transform = transforms.Compose(
    [
         transforms.Resize((224, 224)),
         transforms.ToTensor(),
         transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ]
)

訓練および評価データパイプラインを生成する。

train_dataset = Dataset(transform, train_data)
test_dataset = Dataset(transform, test_data)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=20)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=5)

データ視覚化

イメージ一つを視覚化して見る。

dataloader = iter(train_loader)
imgs, target = dataloader.next()

tf = transforms.ToPILImage()
pil_img = tf(imgs[0])
plt.figure()
plt.imshow(pil_img)
plt.colorbar()
plt.gca().grid(False)

モデル設定

vgg19モデルを参考。

learning_rate = 0.001
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 
device

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv = nn.Sequential(            
            nn.Conv2d(3, 64, 3, padding=1),nn.LeakyReLU(0.2),
            nn.Conv2d(64, 64, 3, padding=1),nn.LeakyReLU(0.2),
            nn.MaxPool2d(2, 2),
            
            nn.Conv2d(64, 128, 3, padding=1),nn.LeakyReLU(0.2),
            nn.Conv2d(128, 128, 3, padding=1),nn.LeakyReLU(0.2),
            nn.MaxPool2d(2, 2),
            
            nn.Conv2d(128, 256, 3, padding=1),nn.LeakyReLU(0.2),
            nn.Conv2d(256, 256, 3, padding=1),nn.LeakyReLU(0.2),
            nn.Conv2d(256, 256, 3, padding=1),nn.LeakyReLU(0.2),
            nn.MaxPool2d(2, 2),
            
            nn.Conv2d(256, 512, 3, padding=1),nn.LeakyReLU(0.2),
            nn.Conv2d(512, 512, 3, padding=1),nn.LeakyReLU(0.2),
            nn.Conv2d(512, 512, 3, padding=1),nn.LeakyReLU(0.2),
            nn.MaxPool2d(2, 2),
            
            nn.Conv2d(512, 512, 3, padding=1),nn.LeakyReLU(0.2),
            nn.Conv2d(512, 512, 3, padding=1),nn.LeakyReLU(0.2),
            nn.Conv2d(512, 512, 3, padding=1),nn.LeakyReLU(0.2),
            nn.MaxPool2d(2, 2)
        )
        self.avg_pool = nn.AvgPool2d(7)
        self.classifier = nn.Linear(512, 10)

    def forward(self, x):
        features = self.conv(x)
        x = self.avg_pool(features)
        x = x.view(features.size(0), -1)
        x = self.classifier(x)
        return x, features

model = Net().to(device)
model

Net(
  (conv): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): LeakyReLU(negative_slope=0.2)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): LeakyReLU(negative_slope=0.2)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): LeakyReLU(negative_slope=0.2)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): LeakyReLU(negative_slope=0.2)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): LeakyReLU(negative_slope=0.2)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): LeakyReLU(negative_slope=0.2)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): LeakyReLU(negative_slope=0.2)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (17): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (18): LeakyReLU(negative_slope=0.2)
    (19): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (20): LeakyReLU(negative_slope=0.2)
    (21): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (22): LeakyReLU(negative_slope=0.2)
    (23): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (24): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (25): LeakyReLU(negative_slope=0.2)
    (26): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (27): LeakyReLU(negative_slope=0.2)
    (28): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (29): LeakyReLU(negative_slope=0.2)
    (30): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (avg_pool): AvgPool2d(kernel_size=7, stride=7, padding=0)
  (classifier): Linear(in_features=512, out_features=10, bias=True)
)

criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
summary(model.to(device), (3, 224, 224))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
================================================================
            Conv2d-1         [-1, 64, 224, 224]           1,792
         LeakyReLU-2         [-1, 64, 224, 224]               0
            Conv2d-3         [-1, 64, 224, 224]          36,928
         LeakyReLU-4         [-1, 64, 224, 224]               0
         MaxPool2d-5         [-1, 64, 112, 112]               0
            Conv2d-6        [-1, 128, 112, 112]          73,856
         LeakyReLU-7        [-1, 128, 112, 112]               0
            Conv2d-8        [-1, 128, 112, 112]         147,584
         LeakyReLU-9        [-1, 128, 112, 112]               0
        MaxPool2d-10          [-1, 128, 56, 56]               0
           Conv2d-11          [-1, 256, 56, 56]         295,168
        LeakyReLU-12          [-1, 256, 56, 56]               0
           Conv2d-13          [-1, 256, 56, 56]         590,080
        LeakyReLU-14          [-1, 256, 56, 56]               0
           Conv2d-15          [-1, 256, 56, 56]         590,080
        LeakyReLU-16          [-1, 256, 56, 56]               0
        MaxPool2d-17          [-1, 256, 28, 28]               0
           Conv2d-18          [-1, 512, 28, 28]       1,180,160
        LeakyReLU-19          [-1, 512, 28, 28]               0
           Conv2d-20          [-1, 512, 28, 28]       2,359,808
        LeakyReLU-21          [-1, 512, 28, 28]               0
           Conv2d-22          [-1, 512, 28, 28]       2,359,808
        LeakyReLU-23          [-1, 512, 28, 28]               0
        MaxPool2d-24          [-1, 512, 14, 14]               0
           Conv2d-25          [-1, 512, 14, 14]       2,359,808
        LeakyReLU-26          [-1, 512, 14, 14]               0
           Conv2d-27          [-1, 512, 14, 14]       2,359,808
        LeakyReLU-28          [-1, 512, 14, 14]               0
           Conv2d-29          [-1, 512, 14, 14]       2,359,808
        LeakyReLU-30          [-1, 512, 14, 14]               0
        MaxPool2d-31            [-1, 512, 7, 7]               0
        AvgPool2d-32            [-1, 512, 1, 1]               0
           Linear-33                   [-1, 10]           5,130
================================================================
Total params: 14,719,818
Trainable params: 14,719,818
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.57
Forward/backward pass size (MB): 218.40
Params size (MB): 56.15
Estimated Total Size (MB): 275.12
----------------------------------------------------------------

class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience."""
    def __init__(self, patience=10, verbose=False, delta=0, path='checkpoint.pt', trace_func=print):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement. 
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
            path (str): Path for the checkpoint to be saved to.
                            Default: 'checkpoint.pt'
            trace_func (function): trace print function.
                            Default: print            
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.Inf
        self.delta = delta
        self.path = path
        self.trace_func = trace_func
    def __call__(self, val_loss, model):

        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            self.trace_func(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            self.trace_func(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

writer = SummaryWriter('runs/fashion_classification')

def matplotlib_imshow(img, one_channel=False):
    if one_channel:
        img = img.mean(dim=0)
    img = img / 2 + 0.5
    npimg = img.cpu().numpy()
    if one_channel:
        plt.imshow(npimg, cmap="Greys")
    else:
        plt.imshow(np.transpose(npimg, (1, 2, 0)))

def images_to_probs(model, images):
    output,f  = model(images)
    _, preds_tensor = torch.max(output, 1)
    preds = np.squeeze(preds_tensor.cpu().numpy())
    return preds, [nn.functional.softmax(el, dim=0)[i].item() for i, el in zip(preds, output)]

def plot_classes_preds(model, images, labels):
    preds, probs = images_to_probs(model, images)
    fig = plt.figure(figsize=(12, 48))
    for idx in np.arange(4):        
        if len(images) - 1 < idx:
            break        
        ax = fig.add_subplot(1, 4, idx+1, xticks=[], yticks=[])
        matplotlib_imshow(images[idx], one_channel=True)
        ax.set_title("{0}, {1:.1f}%\n(label: {2})".format(
            classes[preds[idx]],
            probs[idx] * 100.0,
            classes[labels[idx]]), color=("green" if preds[idx]==labels[idx].item() else "red"))
    return fig

学習（訓練）

def train_model(model, n_epochs):

    train_losses = []
    valid_losses = []
    early_stopping = EarlyStopping(verbose = True)

    for epoch in range(n_epochs):

        epoch_loss = 0
        start = time.time()
        model.train()
        for batch, (data, target) in enumerate(train_loader, 1):    
            
            data = data.to(device)
            target = target.to(device)
            
            output, f = model(data)
            loss = criterion(output, target)
            
            optimizer.zero_grad()   
            loss.backward()
            optimizer.step()
            epoch_loss += loss
            
            if batch % 100 == 0:
                writer.add_scalar('training loss', epoch_loss / 100, epoch * len(train_loader) + batch)
                writer.add_figure('predictions vs. actuals', plot_classes_preds(model, data, target), global_step=epoch * len(train_loader) + batch)                        
        print(f'epoch : {epoch+1}, Loss : {epoch_loss}, time : {time.time() - start}')
            
        model.eval() 
        for data, target in test_loader :
            
            data = data.to(device)
            target = target.to(device)            
            
            output, f = model(data)
            loss = criterion(output, target)
            valid_losses.append(loss.item())

        train_loss = np.average(train_losses)
        valid_loss = np.average(valid_losses)

        epoch_len = len(str(n_epochs))      
        train_losses = []
        valid_losses = []
        early_stopping(valid_loss, model)

        if early_stopping.early_stop:
            print("Early stopping")
            break

n_epochs = 50
train_model(model, n_epochs)

Tensorboard観察

tensorboard --logdir runs

予測

columns = 5
rows = 3
fig = plt.figure(figsize=(20,10))
 
model.eval()
for i in range(1, columns*rows+1):
    idx = np.random.randint(len(test_data))
    img_path, class_index = test_data[idx]
    pil_img = Image.open(img_path).convert('RGB')
    img = transform(pil_img).unsqueeze(dim=0).to(device)    
    output,f  = model(img)
    _, argmax = torch.max(output, 1)
    pred = classes[argmax.item()]
    label = classes[class_index]
    
    fig.add_subplot(rows, columns, i)
    plt.title(pred)    
    plt.imshow(pil_img)
    plt.axis('off')
    
plt.show()

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up