This section describes a core AI building block. A multi-layer perceptron (MLP) is trained on labeled training data and, once trained, predicts outputs for new input data. One or more hidden layers sit between the input layer and the output layer, and training adjusts the weights (parameters) that connect them.
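As a minimal sketch of the computation (matching the _forward method in the code below, with one ReLU hidden layer and a sigmoid output for the binary case):

\[ z^{(1)} = W^{(1)} x + b^{(1)}, \qquad a^{(1)} = \mathrm{ReLU}(z^{(1)}), \qquad \hat{y} = \sigma\big(W^{(2)} a^{(1)} + b^{(2)}\big) \]

Training adjusts every \(W^{(i)}\) and \(b^{(i)}\) so that \(\hat{y}\) approaches the training labels.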
Sample program
mlp_numpy.py
① XOR: not linearly separable, but an MLP can solve it
Training data (4 samples, each with 2 inputs and 1 output)
[[0, 0], [0, 1], [1, 0], [1, 1]]
[0, 1, 1, 0]
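A short check of why XOR defeats a linear classifier \(w_1 x_1 + w_2 x_2 + b\) (predict 1 when positive): the four labels would require

\[ b \le 0, \qquad w_1 + b > 0, \qquad w_2 + b > 0, \qquad w_1 + w_2 + b \le 0. \]

Adding the two middle inequalities gives \(w_1 + w_2 + 2b > 0\); since \(b \le 0\), this forces \(w_1 + w_2 + b > 0\), contradicting the last constraint. A hidden layer resolves this by learning a non-linear decision boundary.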
Activation functions: [hidden layers] ReLU, [output layer] sigmoid
Weight initialization: Xavier / He
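These labels refer to the scales used in _init_params below: weights are drawn from a zero-mean normal distribution with standard deviation

\[ \sqrt{2/\mathrm{fan\_in}} \ \text{(He, for ReLU hidden layers)}, \qquad \sqrt{1/\mathrm{fan\_in}} \ \text{(Xavier, for tanh and the output layer)}, \]

which keeps the variance of the activations roughly constant from layer to layer.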
Settings
Hidden layers: 8, 8 (two hidden layers with 8 units each)
Number of output classes: 2
(either 1 output node + sigmoid, or 2 output nodes + softmax; the two are equivalent for binary classification)
lr: 0.1 (learning rate; each update steps the weights by 0.1 times the gradient; see the update rule after this list)
Epochs: 2000 (number of passes over the training data)
Batch size: 4 (each update uses all 4 samples, i.e. full-batch learning)
Random seed: 42
Activation function: ReLU
Predictions (for the same 4 inputs as above)
[0, 1, 1, 0]
Accuracy
1.0 (the mean of 1 per correct prediction and 0 per incorrect prediction; all 4 are correct)
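As noted for lr above, the _step method below applies the plain gradient-descent update with step size \(\eta = \mathrm{lr}\):

\[ W \leftarrow W - \eta \, \frac{\partial L}{\partial W}, \qquad b \leftarrow b - \eta \, \frac{\partial L}{\partial b} \]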
import numpy as np
from dataclasses import dataclass
# ------- Utilities -------
def one_hot(y, num_classes):
y = y.astype(int).ravel()
oh = np.zeros((y.size, num_classes), dtype=float)
oh[np.arange(y.size), y] = 1.0
return oh
def sigmoid(z):
return 1.0 / (1.0 + np.exp(-z))
def dsigmoid(a):  # a = sigmoid(z); unused below, since the sigmoid+BCE output delta simplifies to (AL - y)
return a * (1.0 - a)
def relu(z):
return np.maximum(0.0, z)
def drelu(z):
return (z > 0).astype(float)
def softmax(z):
    z = z - z.max(axis=1, keepdims=True)  # numerical stabilization: subtract the row max before exp
e = np.exp(z)
return e / e.sum(axis=1, keepdims=True)
# ------- Model definition -------
@dataclass
class MLPConfig:
input_dim: int
    hidden_dims: tuple = (16, 16)  # units per hidden layer
    num_classes: int = 2  # 2 means binary classification (sigmoid output)
lr: float = 0.05
epochs: int = 2000
batch_size: int = 32
    l2: float = 0.0  # L2 regularization coefficient
seed: int | None = 0
activation: str = "relu" # "relu" or "tanh"
class MLP:
def __init__(self, cfg: MLPConfig):
self.cfg = cfg
self.rng = np.random.default_rng(cfg.seed)
self.params = self._init_params()
    # Parameter initialization (Xavier/He)
def _init_params(self):
sizes = [self.cfg.input_dim, *self.cfg.hidden_dims,
(1 if self.cfg.num_classes == 2 else self.cfg.num_classes)]
params = {}
for i in range(len(sizes) - 1):
fan_in, fan_out = sizes[i], sizes[i + 1]
            if i < len(sizes) - 2:  # hidden layer
if self.cfg.activation == "relu":
scale = np.sqrt(2.0 / fan_in) # He init
else: # tanh
scale = np.sqrt(1.0 / fan_in) # Xavier
            else:  # output layer
scale = np.sqrt(1.0 / fan_in) # Xavier
params[f"W{i+1}"] = self.rng.normal(0.0, scale, size=(fan_in, fan_out))
params[f"b{i+1}"] = np.zeros((1, fan_out))
return params
    # Forward pass
def _forward(self, X):
caches = {}
A = X
L = len(self.params) // 2
        for i in range(1, L):  # hidden layers
Z = A @ self.params[f"W{i}"] + self.params[f"b{i}"]
if self.cfg.activation == "relu":
A = relu(Z)
else:
A = np.tanh(Z)
caches[f"Z{i}"], caches[f"A{i}"] = Z, A
        # output layer
ZL = A @ self.params[f"W{L}"] + self.params[f"b{L}"]
if self.cfg.num_classes == 2:
AL = sigmoid(ZL) # shape: (N,1)
else:
AL = softmax(ZL) # shape: (N,C)
caches[f"Z{L}"], caches[f"A{L}"] = ZL, AL
caches["A0"] = X
return AL, caches
    # Loss (cross-entropy + L2)
def _loss(self, AL, y):
N = y.shape[0]
if self.cfg.num_classes == 2:
# y: (N,) or (N,1) in {0,1}; AL: (N,1)
y = y.reshape(-1, 1)
eps = 1e-12
loss = -(y * np.log(AL + eps) + (1 - y) * np.log(1 - AL + eps)).mean()
else:
# y: (N,) in {0..C-1}; AL: (N,C)
Y = one_hot(y, self.cfg.num_classes)
eps = 1e-12
loss = -(Y * np.log(AL + eps)).sum(axis=1).mean()
if self.cfg.l2 > 0:
L = len(self.params) // 2
l2sum = sum((self.params[f"W{i}"] ** 2).sum() for i in range(1, L + 1))
loss += 0.5 * self.cfg.l2 * l2sum / N
return loss
    # Backward pass
def _backward(self, caches, y):
grads = {}
L = len(self.params) // 2
A_prev = caches[f"A{L-1}"] if L > 1 else caches["A0"]
AL = caches[f"A{L}"]
N = A_prev.shape[0]
        # output-layer delta
if self.cfg.num_classes == 2:
y = y.reshape(-1, 1)
dZL = (AL - y) # BCE with sigmoid
else:
Y = one_hot(y, self.cfg.num_classes)
dZL = (AL - Y) # CE with softmax
grads[f"dW{L}"] = (A_prev.T @ dZL) / N + self.cfg.l2 * self.params[f"W{L}"] / N
grads[f"db{L}"] = dZL.mean(axis=0, keepdims=True)
dA_prev = dZL @ self.params[f"W{L}"].T
        # hidden layers in reverse order
for i in range(L - 1, 0, -1):
Z = caches[f"Z{i}"]
A_prev = caches[f"A{i-1}"] if i > 1 else caches["A0"]
if self.cfg.activation == "relu":
dZ = dA_prev * drelu(Z)
else:
dZ = dA_prev * (1 - np.tanh(Z) ** 2) # dtanh
grads[f"dW{i}"] = (A_prev.T @ dZ) / N + self.cfg.l2 * self.params[f"W{i}"] / N
grads[f"db{i}"] = dZ.mean(axis=0, keepdims=True)
dA_prev = dZ @ self.params[f"W{i}"].T
return grads
    # Parameter update (SGD)
def _step(self, grads):
for k in self.params.keys():
if k.startswith("W"):
i = k[1:]
self.params[k] -= self.cfg.lr * grads[f"dW{i}"]
self.params[f"b{i}"] -= self.cfg.lr * grads[f"db{i}"]
    # Training loop
def fit(self, X, y, X_val=None, y_val=None, verbose=True):
X = np.asarray(X, dtype=float)
y = np.asarray(y)
N = X.shape[0]
bs = min(self.cfg.batch_size, N)
history = {"loss": [], "val_loss": []}
for ep in range(1, self.cfg.epochs + 1):
            # shuffle with self.rng so the configured seed keeps runs reproducible
            idx = self.rng.permutation(N)
Xs, ys = X[idx], y[idx]
            # mini-batch updates
for st in range(0, N, bs):
ed = st + bs
XB, yB = Xs[st:ed], ys[st:ed]
AL, caches = self._forward(XB)
grads = self._backward(caches, yB)
self._step(grads)
            # per-epoch logging
AL_train, _ = self._forward(X)
train_loss = self._loss(AL_train, y)
history["loss"].append(train_loss)
if X_val is not None and y_val is not None:
AL_val, _ = self._forward(X_val)
val_loss = self._loss(AL_val, y_val)
history["val_loss"].append(val_loss)
if verbose and (ep % max(1, self.cfg.epochs // 10) == 0 or ep == 1):
if history["val_loss"]:
print(f"epoch {ep:4d} loss={train_loss:.4f} val_loss={val_loss:.4f}")
else:
print(f"epoch {ep:4d} loss={train_loss:.4f}")
return history
    # Prediction
def predict(self, X):
X = np.asarray(X, dtype=float)
AL, _ = self._forward(X)
if self.cfg.num_classes == 2:
return (AL.ravel() >= 0.5).astype(int)
else:
return AL.argmax(axis=1)
def predict_proba(self, X):
X = np.asarray(X, dtype=float)
AL, _ = self._forward(X)
return AL
def score(self, X, y):
y_pred = self.predict(X)
return (y_pred == y).mean()
# ------- Demo: XOR (binary) and a 3-class toy dataset -------
if __name__ == "__main__":
    # --- XOR (not linearly separable -> an MLP can solve it) ---
X_xor = np.array([[0,0],[0,1],[1,0],[1,1]], dtype=float)
y_xor = np.array([0,1,1,0], dtype=int)
cfg_bin = MLPConfig(input_dim=2, hidden_dims=(8, 8), num_classes=2,
lr=0.1, epochs=2000, batch_size=4, seed=42, activation="relu")
mlp_bin = MLP(cfg_bin)
mlp_bin.fit(X_xor, y_xor, verbose=False)
print("XOR pred:", mlp_bin.predict(X_xor), "true:", y_xor)
print("XOR acc :", mlp_bin.score(X_xor, y_xor))
    # --- Multi-class toy data (3 concentric-circle classes) ---
rng = np.random.default_rng(0)
N = 300
angles = rng.uniform(0, 2*np.pi, N)
radii = rng.choice([0.6, 1.2, 1.8], size=N, replace=True)
Xc = np.c_[radii*np.cos(angles), radii*np.sin(angles)]
yc = ((radii > 0.9).astype(int) + (radii > 1.5).astype(int)) # 0,1,2
cfg_mc = MLPConfig(input_dim=2, hidden_dims=(32, 32), num_classes=3,
lr=0.05, epochs=1000, batch_size=32, seed=1, activation="relu")
mlp_mc = MLP(cfg_mc)
mlp_mc.fit(Xc, yc, verbose=False)
print("3-class acc:", mlp_mc.score(Xc, yc))
Program output:
XOR pred: [0 1 1 0] true: [0 1 1 0]
XOR acc : 1.0
3-class acc: 1.0
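A common way to verify a hand-written backward pass like the one above is a finite-difference gradient check: nudge each parameter up and down, and compare the numerical slope of the loss against the analytic gradient. The sketch below is ours, not part of the original program; it assumes it runs in the same module as the classes above, and uses tanh because ReLU's kink at zero can make the two gradients disagree near z = 0.

import numpy as np

def grad_check(model, X, y, eps=1e-5):
    _, caches = model._forward(X)          # analytic gradients at the current point
    grads = model._backward(caches, y)
    for k, P in model.params.items():
        num = np.zeros_like(P)
        it = np.nditer(P, flags=["multi_index"])
        for _ in it:
            idx = it.multi_index
            old = P[idx]
            P[idx] = old + eps             # central difference: loss at (w + eps) ...
            lp = model._loss(model._forward(X)[0], y)
            P[idx] = old - eps             # ... minus loss at (w - eps)
            lm = model._loss(model._forward(X)[0], y)
            P[idx] = old                   # restore the parameter
            num[idx] = (lp - lm) / (2 * eps)
        g = grads["d" + k]
        denom = np.abs(num).max() + np.abs(g).max() + 1e-12
        print(k, "max relative error:", np.abs(num - g).max() / denom)

X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=float)
y = np.array([0, 1, 1, 0])
cfg = MLPConfig(input_dim=2, hidden_dims=(4,), num_classes=2,
                seed=0, activation="tanh")
grad_check(MLP(cfg), X, y)  # relative errors on the order of 1e-7 or smaller indicate a correct backward pass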