概要
- ニューラルネットワークにおける計算式のメモ
Softmax node with Cross Entropy
\begin{align}
y&=\mathrm{Softmax}(x),\quad y_{tk}=\frac{\exp x_{tk}}{\sum_{k'} \exp x_{tk'}},\quad \sum_k y_{tk}=1\\
L(y,t)&=-\sum_{t,k} \log(y_{tk}) t_{tk},\quad \sum_k t_{tk}=1\\
\frac{\partial L}{\partial X}&=-\sum_{t,k} \frac{t_{tk}}{y_{tk}}\frac{\partial y_{tk}}{\partial X}\\
&=-\sum_{t,k} \frac{t_{tk}}{y_{tk}}\left(\frac{\exp x_{tk}}{\sum_{k'} \exp x_{tk'}}\frac{\partial x_{tk}}{\partial X}
-\frac{\exp x_{tk}}{(\sum_{k'} \exp x_{tk'})^2}\sum_{k'} \exp x_{tk'}\frac{\partial x_{tk'}}{\partial X}\right)\\
&=-\sum_{t,k}\frac{t_{tk}}{y_{tk}}\left(y_{tk}\frac{\partial x_{tk}}{\partial X}
-y_{tk}\sum_{k'} y_{tk'}\frac{\partial x_{tk'}}{\partial X}\right)\\
&=-\sum_{t,k}t_{tk}\frac{\partial x_{tk}}{\partial X}
+\sum_{t,k}t_{tk}\sum_{k'} y_{tk'}\frac{\partial x_{tk'}}{\partial X}\\
&=-\sum_{t,k}t_{tk}\frac{\partial x_{tk}}{\partial X}
+\sum_{t}\sum_{k'} y_{tk'}\frac{\partial x_{tk'}}{\partial X}
=\sum_{t,k}(y_{tk}-t_{tk})\frac{\partial x_{tk}}{\partial X}\\
\frac{\partial L}{\partial x_{tk}}&=y_{tk}-t_{tk}
\end{align}
Batch Normalization
\begin{align}
y_{ti}&=\hat{x}_{ti} \gamma_i + \beta_i,\quad
\hat{x}_{ti}=\frac{x_{ti}-\mu_{bi}}{\sqrt{\sigma_{bi}^2+\epsilon}}\\
\mu_{bi}&=\frac{1}{m}\sum_{t=1}^m x_{ti},\quad
\sigma_{bi}^2=\frac{1}{m}\sum_{t=1}^m (x_{ti}-\mu_{bi})^2\\
\frac{\partial L}{\partial \beta_i}&=\sum_{t=1}^m\frac{\partial L}{\partial y_{ti}},\quad
\frac{\partial L}{\partial \gamma_i}=\sum_{t=1}^m\frac{\partial L}{\partial y_{ti}}\hat{x}_{ti}\\
\frac{\partial L}{\partial x_{ti}}&=\sum_{u=1}^m\frac{\partial L}{\partial y_{ui}}\gamma_i\left[
\frac{\delta_{ut}-\frac{1}{m}}{\sqrt{\sigma_{bi}^2+\epsilon}}
-\frac{1}{2}\frac{\hat{x}_{ui}}{(\sigma_{bi}^2+\epsilon)}
\frac{2}{m}\sum_{s=1}^m (x_{si}-\mu_{bi})\left(\delta_{st}-\frac{1}{m}\right)\right]\\
&=\sum_{u=1}^m\frac{\partial L}{\partial y_{ui}}\frac{\gamma_i}{\sqrt{\sigma_{bi}^2+\epsilon}}\left[
\delta_{ut}-\frac{1}{m}-\frac{\hat{x}_{ui}}{m}\sum_{s=1}^m\hat{x}_{si}\left(\delta_{st}-\frac{1}{m}\right)\right]\\
&=\frac{\gamma_i}{\sqrt{\sigma_{bi}^2+\epsilon}}\sum_{u=1}^m\frac{\partial L}{\partial y_{ui}}\left[
\delta_{ut}-\frac{1}{m}-\frac{\hat{x}_{ui}\hat{x}_{ti}}{m}\right]\\
&=\frac{\gamma_i}{\sqrt{\sigma_{bi}^2+\epsilon}}\left[
\frac{\partial L}{\partial y_{ti}}
-\frac{1}{m}\sum_{u=1}^m\frac{\partial L}{\partial y_{ui}}
-\frac{\hat{x}_{ti}}{m}\sum_{u=1}^m\frac{\partial L}{\partial y_{ui}}\hat{x}_{ui}\right]
\end{align}
Adam(Adaptive Moment Estimation)
\begin{align}
m_t&=\beta_1m_{t-1}+(1-\beta_1)g_t,\quad m_0=0\\
v_t&=\beta_2v_{t-1}+(1-\beta_2)g_t^2,\quad v_0=0\\
\theta_{t+1}&=\theta_{t}-\alpha\frac{\hat{m}_t}{\sqrt{\hat{v}_t}+\epsilon},\quad
\hat{m}_t=\frac{m_t}{1-\beta_1^t},\quad\hat{v}_t=\frac{v_t}{1-\beta_2^t}\\
&\alpha=0.001,\beta_1=0.9,\beta_2=0.999,\epsilon=10^{-8}\\
\theta_{t+1}&=\theta_{t}-\alpha_t\frac{m_t}{\sqrt{v_t}+\hat{\epsilon}},\quad
\alpha_t=\alpha\frac{\sqrt{1-\beta_2^t}}{1-\beta_1^t},\quad
\hat{\epsilon}=\epsilon\sqrt{1-\beta_2^t}\quad\text{(modified)}
\end{align}