Introduction
Using TensorFlow as the development platform and MNIST as the training data, this tutorial walks through the whole process: training a neural network and then running real-time inference with the trained NN.
The steps below were verified in the following environment:
・OS: Windows 11 Pro (64-bit)
・Anaconda Navigator: 2.6.3
・Python: 3.9.21
・UI: Jupyter Notebook 7.3.2
・OpenCV: 4.11.0
・TensorFlow: 2.10.1
・Camera: ELECOM UCAM-310FBBK
1. Installing TensorFlow
In a terminal (if you want version 2.10.1):
pip install tensorflow==2.10.1
(If pandas is not installed yet, install it as well:)
pip install pandas
Check the installed versions:
import numpy as np
import tensorflow as tf
import keras
print(tf.__version__)
print(keras.__version__)
print(tf.config.list_physical_devices('GPU'))
2. Downloading the MNIST dataset
MNIST consists of 60,000 training images and 10,000 test images, each a 28x28 grayscale picture of a handwritten digit.
x_train: images (for training), y_train: labels (for training)
x_test: images (for testing), y_test: labels (for testing)
import tensorflow.keras
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras import regularizers
from tensorflow.keras.datasets import mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
print("Shape of x_train: {}".format(x_train.shape))
print("Shape of y_train: {}".format(y_train.shape))
print()
print("Shape of x_test: {}".format(x_test.shape) )
print("Shape of y_test: {}".format(y_test.shape) )
3. A look at the data (not needed for training)
Display a single digit (as an image)
# Display a single digit (as an image)
import matplotlib.pyplot as plt
import numpy as np
digit = 105 # Change to choose new digit
a = x_train[digit]
plt.imshow(a, cmap='gray', interpolation='nearest')
print("Image (#{}): Which is digit '{}'".format(digit, y_train[digit]) )
Display a single digit (as raw pixel values)
# Display a single digit (as numbers)
from IPython.display import display
import pandas as pd
# Display as text
pd.set_option('display.max_columns', 29)
pd.set_option('display.max_rows', 29)
print("Shape for dataset: {}".format(x_train.shape))
print("Labels: {}".format(y_train))
# single MNIST digit
digit = 105
single = x_train[digit]
print("Shape for single: {}".format(single.shape))
pd.DataFrame(single.reshape(28,28))
Display several randomly chosen digits (as images)
# Display several random digits
import random
ROWS = 6
random_indices = random.sample(range(x_train.shape[0]), ROWS*ROWS)
sample_images = x_train[random_indices, :]
plt.clf()
fig, axes = plt.subplots(ROWS, ROWS, figsize=(ROWS, ROWS), sharex=True, sharey=True)
for i in range(ROWS*ROWS):
    subplot_row = i // ROWS
    subplot_col = i % ROWS
    ax = axes[subplot_row, subplot_col]
    plottable_image = np.reshape(sample_images[i, :], (28, 28))
    ax.imshow(plottable_image, cmap='gray_r')
    ax.set_xbound([0, 28])
plt.tight_layout()
plt.show()
4. Preprocessing the input data
・Reshape the data to (number of samples, rows, columns, channels)
・Convert the 0-255 integer pixel values to floating-point (float32) values in the range 0-1
# Preprocess the input data
import tensorflow.keras
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras import backend as K
batch_size = 128
num_classes = 10
epochs = 12
#input image dimensions
img_rows, img_cols = 28, 28
if K.image_data_format() == 'channels_first':
    x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
    x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
    input_shape = (1, img_rows, img_cols)
else:
    x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
    x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
    input_shape = (img_rows, img_cols, 1)
x_train= x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255.0
x_test /= 255.0
print("Training samples: {}".format(x_train.shape[0]))
print("Test samples: {}".format(x_test.shape[0]))
# convert class vectors to binary class matrices
y_train = tensorflow.keras.utils.to_categorical(y_train, num_classes)
y_test = tensorflow.keras.utils.to_categorical(y_test, num_classes)
print('x_train shape:', x_train.shape)
print('y_train shape:', y_train.shape)
print('x_test shape:', x_test.shape)
print('y_test shape:', y_test.shape)
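As a quick illustration of what to_categorical does (a minimal example added here, independent of the code above): each integer label becomes a one-hot vector of length num_classes.
# Example: the label 5 becomes a vector with a 1 at index 5 and 0 elsewhere
print(tensorflow.keras.utils.to_categorical([5], 10))
# -> [[0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]]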
5. Building the neural network (choose one of the following)
5.1 A simple model with fully connected layers only
import tensorflow.keras
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras import backend as K
# Build the neural network and show its summary
model = Sequential()
model.add(Flatten(input_shape=input_shape))
model.add(Dense(128, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
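As a sanity check of the number model.summary() reports for this model, the trainable parameter count can be worked out by hand (simple arithmetic, added here for illustration):
# Parameters of a Dense layer = inputs * units + units (bias)
dense_1 = 28 * 28 * 1 * 128 + 128   # 100,480
dense_2 = 128 * 10 + 10             # 1,290
print(dense_1 + dense_2)            # 101,770 trainable parameters in total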
5.2 CNN (a single convolutional layer)
import tensorflow.keras
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras import backend as K
# Build the neural network and show its summary
model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 3),
                 activation='relu',
                 input_shape=input_shape))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
5.3 CNN (two convolutional layers)
import tensorflow.keras
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras import backend as K
# Build the neural network and show its summary
model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 3),
                 activation='relu',
                 input_shape=input_shape))
model.add(Conv2D(64,(3,3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
6. Training, testing, and saving the trained NN
Training
# Train the model
import tensorflow as tf
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          verbose=2,
          validation_data=(x_test, y_test)
          )
score = model.evaluate(x_test, y_test, verbose=0)
print('Test loss: {}'.format(score[0]))
print('Test accuracy: {}'.format(score[1]))
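EarlyStopping is imported in section 2 but not used above. As an optional, hedged sketch (not part of the original training run), it could be passed to model.fit so that training stops once the validation loss stops improving:
# Optional: early stopping on the validation loss
from tensorflow.keras.callbacks import EarlyStopping
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          verbose=2,
          validation_data=(x_test, y_test),
          callbacks=[early_stop])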
Testing the trained network
# Test and evaluate the trained network
score = model.evaluate(x_test, y_test, verbose=0)
print('Test loss: {}'.format(score[0]))
print('Test accuracy: {}'.format(score[1]))
from sklearn import metrics
# Check accuracy on a small subset of the test data
small_x = x_test[1:100]
small_y = y_test[1:100]
small_y2 = np.argmax(small_y, axis=1)
pred = model.predict(small_x)
pred = np.argmax(pred,axis=1)
score = metrics.accuracy_score(small_y2, pred)
print('Accuracy: {}'.format(score))
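To see which digits are mistaken for which, scikit-learn can also compute a confusion matrix on the same small subset (a small optional addition, not in the original):
# Optional: confusion matrix (rows = true digits, columns = predicted digits)
cm = metrics.confusion_matrix(small_y2, pred)
print(cm)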
Saving the trained network
# Save the trained network
import os
os.makedirs('saved_model', exist_ok=True)
model.save('saved_model/my_model')
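model.save with a path that has no extension writes a SavedModel directory. If a single file is preferred, Keras can also save in HDF5 format (a minimal alternative sketch; requires h5py, which is installed together with TensorFlow):
# Alternative: save the whole model as a single HDF5 file
model.save('saved_model/my_model.h5')
# It can later be restored with tf.keras.models.load_model('saved_model/my_model.h5')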
7. Loading the saved NN
Loading
import tensorflow as tf
new_model = tf.keras.models.load_model('saved_model/my_model')
# Check its architecture
new_model.summary()
Testing the loaded model
Download the MNIST data again, preprocess it, and feed it to the loaded model.
# Check the loaded model
import tensorflow.keras
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras import regularizers
from tensorflow.keras.datasets import mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
print("Shape of x_train: {}".format(x_train.shape))
print("Shape of y_train: {}".format(y_train.shape))
print()
print("Shape of x_test: {}".format(x_test.shape) )
print("Shape of y_test: {}".format(y_test.shape) )
#input image dimensions
from tensorflow.keras import backend as K
img_rows, img_cols = 28, 28
if K.image_data_format() == 'channels_first':
    x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
    x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
    input_shape = (1, img_rows, img_cols)
else:
    x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
    x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
    input_shape = (img_rows, img_cols, 1)
x_train= x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255.0
x_test /= 255.0
num_classes = 10
print("Training samples: {}".format(x_train.shape[0]))
print("Test samples: {}".format(x_test.shape[0]))
# convert class vectors to binary class matrices
y_train = tensorflow.keras.utils.to_categorical(y_train, num_classes)
y_test = tensorflow.keras.utils.to_categorical(y_test, num_classes)
score = new_model.evaluate(x_test, y_test, verbose=0)
print('Test loss: {}'.format(score[0]))
print('Test accuracy: {}'.format(score[1]))
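As one more check (a minimal sketch, not in the original), the loaded model can predict a single test image and the result can be compared with its one-hot label:
# Optional: predict one test image with the loaded model
import numpy as np
idx = 0  # index of the test image to try (arbitrary choice)
pred = new_model.predict(x_test[idx:idx + 1], verbose=0)
print("Predicted digit:", np.argmax(pred))
print("True digit:     ", np.argmax(y_test[idx]))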
8. Real-time inference with the loaded model (exercise)
The part that feeds the camera image into the network is left partly unfinished; complete it yourself.
The digit captured by the camera has to be turned, through image processing, into something resembling the black-and-white MNIST handwritten digits used for training. Modify img4 in the program below. (While the program is running, 'b'/'w' toggle the black/white inversion used for detection and 'q' quits.)
import cv2, matplotlib
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
new_model = tf.keras.models.load_model('saved_model/my_model')
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    print("Could not open the camera")
    exit()
flag = 0
while True:
    # Read one frame from the camera
    ret, img = cap.read()
    # Stop if the frame could not be read
    if not ret:
        print("Could not read a frame")
        break
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Binarize for object detection (flag toggles black/white inversion)
    if flag == 1:
        ret, img2 = cv2.threshold(gray, 0, 255,
                                  cv2.THRESH_OTSU | cv2.THRESH_BINARY_INV)
    else:
        ret, img2 = cv2.threshold(gray, 0, 255, cv2.THRESH_OTSU)
    # Object detection via connected components
    label = cv2.connectedComponentsWithStats(img2)
    n = label[0] - 1
    data = np.delete(label[2], 0, 0)
    center = np.delete(label[3], 0, 0)
    # If nothing besides the background was detected, skip this frame
    if data.shape[0] == 0:
        continue
    # Keep only the largest detected component
    max_index = np.argmax(data[:, 4])
    gx0 = data[:, 0][max_index]
    gy0 = data[:, 1][max_index]
    gwidth = data[:, 2][max_index]
    gheight = data[:, 3][max_index]
    # Crop the detected region from the grayscale image
    img4 = gray[gy0:gy0 + gheight, gx0:gx0 + gwidth]
    height4 = int(img4.shape[0])
    width4 = int(img4.shape[1])
    img5 = cv2.resize(img4, (width4, height4))
    cv2.rectangle(img, (gx0, gy0), (gx0 + gwidth, gy0 + gheight),
                  color=(255, 0, 0), thickness=2)
    s = "WIDTH=" + str(gwidth)
    img3 = cv2.putText(img, s, (10, 50), cv2.FONT_HERSHEY_DUPLEX, 0.7,
                       color=(0, 255, 0), thickness=2, lineType=cv2.LINE_AA)
    s = "HEIGHT=" + str(gheight)
    img3 = cv2.putText(img3, s, (10, 80), cv2.FONT_HERSHEY_DUPLEX, 0.7,
                       color=(0, 255, 0), thickness=2, lineType=cv2.LINE_AA)
    s = "AREA=" + str(data[:, 4][max_index])
    img3 = cv2.putText(img3, s, (10, 110), cv2.FONT_HERSHEY_DUPLEX, 0.7,
                       color=(0, 255, 0), thickness=2, lineType=cv2.LINE_AA)
    height = 480
    width = 640
    img3 = cv2.resize(img3, (width, height))
    #img2 = cv2.resize(img2,(width, height))
    ###############################################
    # Process img4 here so that the digit is fed
    # to the network in a usable form
    ###############################################
    # Show the processed image
    cv2.imshow("Processing", img4)
    cv2.moveWindow("Processing", 640, 10)
    figureX = img4
    #figureX = cv2.bitwise_not(img7)  # invert black and white
    # Resize to 28x28 so it can be fed to the network
    figureX = cv2.resize(figureX, (28, 28))
    figureX_kakudai = cv2.resize(figureX, (256, 256))
    cv2.imshow("INPUT", figureX_kakudai)
    cv2.moveWindow("INPUT", 384, 512)
    input_x = figureX.reshape(1, 28, 28, 1) * 1.0 / 255
    # Inference
    predictions = new_model.predict(input_x, verbose=0)
    predictions = predictions.reshape(10)
    max_index = np.argmax(predictions)  # the predicted digit
    max_value = np.nanmax(predictions)  # the score of that digit
    stext = str(max_index) + ' rate:' + str(max_value)
    # Show the result
    cv2.putText(img3, text=stext,
                org=(gx0 + 10, gy0 + 30),
                fontFace=cv2.FONT_HERSHEY_SIMPLEX,
                fontScale=0.8,
                color=(0, 255, 255),
                thickness=2,
                lineType=cv2.LINE_4)
    cv2.imshow("Original", img3)
    cv2.moveWindow("Original", 10, 10)
    key = cv2.waitKey(1) & 0xFF
    if key == ord('b'):
        flag = 1
    if key == ord('w'):
        flag = 0
    if key == ord('q'):
        break
# Release resources
cap.release()
cv2.destroyAllWindows()
9. Exercises
Exercise 1
Using OpenCV's various image-processing operations, make the camera image look like an MNIST digit so that the network can recognize it.
[Hints] (a rough sketch follows this list)
・Morphological transformations (cv2.erode, cv2.dilate, etc.)
・Binarization (cv2.threshold)
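As a starting point only (a hedged sketch of one possible direction, not the intended solution), the cropped grayscale patch img4 could be binarized with Otsu's method, inverted so the digit is white on a black background like MNIST, thickened slightly, and given a black margin before the existing 28x28 resize:
# One possible replacement for the "figureX = img4" line in section 8
_, binimg = cv2.threshold(img4, 0, 255, cv2.THRESH_OTSU | cv2.THRESH_BINARY_INV)  # white digit on black
binimg = cv2.dilate(binimg, np.ones((3, 3), np.uint8), iterations=1)              # thicken thin strokes
pad = max(binimg.shape) // 4          # add a black margin, as MNIST digits have
figureX = cv2.copyMakeBorder(binimg, pad, pad, pad, pad, cv2.BORDER_CONSTANT, value=0)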
Exercise 2
(For those who already have real-time handwritten-digit recognition working perfectly and have time to spare)