Overview
This is an R port of the MATLAB example for computing WBIC published on Sumio Watanabe's page "広く使えるベイズ情報量規準WBIC" (Widely Applicable Bayesian Information Criterion, WBIC).
The main changes are:
- Added the constant term to the log-likelihood.
- Fixed the part of the RLCT-estimate calculation that could overflow.
- The original appears to estimate the RLCT by numerical differentiation with SMALLVAL, so a derivative-based value (RLCT_d) is added for reference; see the formulas just below.
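For reference, writing s = SMALLVAL, β = 1/log n, and nL_n = -Σ_i log p(Y_i|X_i,A,B), the two estimates reported below (RLCT_est and RLCT_d) correspond, as far as I can tell, to the following expressions, where E_β denotes the expectation over the posterior at inverse temperature β:
\begin{align}
\hat\lambda_{\mathrm{est}}&=\frac{E_{\beta}[nL_n]-E_{\beta(1+s)}[nL_n]}{1/\beta-1/(\beta(1+s))}
=\frac{E_{\beta}[nL_n]-E_{\beta(1+s)}[nL_n]}{\left(1-\frac{1}{1+s}\right)\log n}\\
\hat\lambda_{d}&=\beta^2\,\mathrm{Var}_{\beta}[nL_n]
\end{align}
The second is the s → 0 limit of the first, since dE_β[nL_n]/d(1/β) = β² Var_β[nL_n]. In the script, E_{β(1+s)}[·] is obtained by reweighting the β-posterior sample with exp(-sβ(nL_n - min nL_n)), which is where the overflow fix comes in.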
WBIC formula
\begin{align}
y&=BAx+\epsilon,\quad \Sigma(\epsilon)=\sigma_2^2I_{nn}\quad\text{model}\\
&y\ (nn\times 1),\ B\ (nn\times hh),\ A\ (hh\times mm),\ x\ (mm\times 1)\\
p(y|x,A,B)&=(2\pi)^{-\frac{nn}{2}}|\Sigma|^{-\frac{1}{2}}\exp\left(-\frac{1}{2}(y-BAx)^t\Sigma^{-1}(y-BAx)\right)\\
&=(2\pi)^{-\frac{nn}{2}}\sigma_2^{-nn}\exp\left(-\frac{1}{2\sigma_2^2}(y-BAx)^t(y-BAx)\right)\\
p(A)&=\prod_a (2\pi)^{-\frac{1}{2}}\sigma_3^{-1}\exp\left(-\frac{a^2}{2\sigma_3^2}\right)
\quad A\sim N(0,\sigma_3^2)\quad\text{prior}\\
p(B)&=\prod_b (2\pi)^{-\frac{1}{2}}\sigma_3^{-1}\exp\left(-\frac{b^2}{2\sigma_3^2}\right)
\quad B\sim N(0,\sigma_3^2)\quad\text{prior}\\
x_0&\sim N(0,\sigma_1^2)\ (mm\times 1),\quad A_0\sim N(0,\sigma_6^2)\ (h_0\times mm),\quad B_0\sim N(0,\sigma_6^2)\ (nn\times h_0),\quad h_0=3\quad\text{true model}\\
p_\beta(A,B|D)&\propto p(A)p(B)\prod_i p(Y_i|X_i,A,B)^\beta\quad\text{posterior at inverse temperature }\beta\\
\mathrm{WBIC}&=-\iint\sum_i\log p(Y_i|X_i,A,B)\,p_\beta(A,B|D)\,dA\,dB\quad(\beta=1/\log n)
\end{align}
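A brief reminder of why the model with the smallest WBIC is selected below: according to Watanabe's WBIC paper, WBIC approximates the Bayes free energy F_n, and both share the leading terms involving the RLCT λ (stated loosely here, with w_0 the optimal parameter):
\begin{align}
F_n&=-\log\iint p(A)p(B)\prod_i p(Y_i|X_i,A,B)\,dA\,dB\\
F_n&=nL_n(w_0)+\lambda\log n+O_p(\log\log n),\qquad \mathrm{WBIC}=F_n+O_p(\sqrt{\lambda\log n})
\end{align}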
R script
####### Constants in Reduced Rank Regression Y=B*A*X+noise ##############
MM=6 ### Input Dimension
HMAX=6 ### Hidden Dimension
NN=6 ### Output Dimension
H0=3 ### True Hidden Dimension
SD1=3 ### Standard Deviation of Input Distribution
SD2=0.1 ### Standard Deviation of Output Noise
SD3=10.0 ### Standard Deviation of Prior Distribution
SD6=0.2 ### Standard Deviation of True Parameter Making
####### Constants in Metropolis Method #################
KK=2000 ### Number of Parameters from Posterior Distribution
n=500 ### Number of training samples;
BURNIN=50000 ### Burn-in in Metropolis method
INTER=100 ### Sampling interval in Metropolis method
MONTEC=BURNIN+KK*INTER ### Total sampling in Metropolis method
BETA=1/log(n) ### inverse temperature
SMALLVAL=0.5 ### Constant for calculation of RLCT
SD4=0.0012 ### Standard Deviation of Monte Carlo Step for A
SD5=0.0012 ### Standard Deviation of Monte Carlo Step for B
### SD4 and SD5 are determined so that 0.1< acceptance probability <0.9.
#############################################################
randn = function(n,m){array(rnorm(n*m),dim=c(n,m))}
rand = function(n,m){array(runif(n*m),dim=c(n,m))}
mod = function(a,b){a%%b}
######################### True parameter is determined ################
set.seed(10)
A0=SD6*randn(H0,MM)
B0=SD6*randn(NN,H0)
########################## Input and Output are determined ##############
X = SD1*randn(MM,n) ### random inputs
Y = B0 %*% A0 %*% X + SD2*randn(NN,n) ### random outputs
################### Functions for Likelihood and prior ####################
# LOGL=@(A,B)((1/(2*SD2*SD2)*(trace((Y-B*A*X)*(Y-B*A*X)')))+NN*n*log(SD2));
LOGL = function(A,B){
w = (Y - B %*% A %*% X)/SD2
sum(w*w)/2+NN*n*log(SD2)+NN*n*log(sqrt(2*pi))
}
# PRIO=@(A,B)(1/(2*SD3*SD3)*(trace(A*A')+trace(B*B')));
PRIO = function(A,B){
1/(2*SD3*SD3)*(sum(A*A)+sum(B*B))+(length(A)+length(B))*log(sqrt(2*pi)*SD3)
}
### In reduced rank regression,
### LOGL = - sum_i log p(y_i|x_i,A,B)
###      = sum_i (1/(2*sigma^2))*|y_i - B*A*x_i|^2 + NN*n/2*log(2*pi*sigma^2)
###      = (1/(2*sigma^2))*tr((Y-B*A*X)(Y-B*A*X)') + NN*n*log(sigma) + NN*n*log(sqrt(2*pi))
### PRIO = - log(prior density)
############################################################
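### Optional sanity check (not in the original MATLAB code): with the constant term
### included, LOGL must equal the negative sum of Gaussian log-densities of Y given
### B*A*X. Verified here at the true parameter (A0, B0).
stopifnot(abs(LOGL(A0,B0) + sum(dnorm(Y, mean=B0%*%A0%*%X, sd=SD2, log=TRUE))) < 1e-6)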
sprintf("Model selection by WBIC in Reduced Rank regression %g-%g-%g. True rank = %g.",MM,HMAX,NN,H0)
sprintf("Number of Training samples = %g. Number of Metropolis Parameters = %g.",n,KK)
########### Model Selection Start ################################
t0 <- proc.time()
ret1 = t(sapply(1:HMAX,function(HH){
################### Initial parameters #########################
set.seed(HH)
A=array(0,dim=c(HH,MM))
B=array(0,dim=c(NN,HH))
##################### Metropolis preparation #########################
ENERGY1=LOGL(A,B)
HAMILTON1=BETA*ENERGY1+PRIO(A,B)
rec=0
acceptance=0
LLL=array(0,dim=c(1,KK)) ##### minus log-likelihood for the sampled parameters
###################### Metropolis BEGIN #############################
for(t in 1:MONTEC){
###################### Metropolis Step ###############
AA=A+SD4*randn(HH,MM)
BB=B+SD5*randn(NN,HH)
ENERGY2=LOGL(AA,BB)
HAMILTON2=BETA*ENERGY2+PRIO(AA,BB)
DELTA=HAMILTON2-HAMILTON1
#################### Accept or Reject ####################
if(exp(-DELTA)>rand(1,1)){
A=AA
B=BB
HAMILTON1=HAMILTON2
ENERGY1=ENERGY2
if(t>BURNIN){
acceptance=acceptance+1
}
}
###################### Record Likelihood ##################
if(mod(t,INTER)==0 && t>BURNIN){
rec=rec+1
LLL[rec]=ENERGY1
}
}
############################ Metropolis END #########################
############################ Estimate RLCT BEGIN ############################
sum1 = mean(LLL) ### WBIC estimate: posterior mean of nL_n at inverse temperature BETA
minlll = min(LLL) ### subtract the minimum before exponentiating to avoid overflow
sum2 = mean(LLL*exp(-SMALLVAL*BETA*(LLL-minlll)))
sum3 = mean(exp(-SMALLVAL*BETA*(LLL-minlll)))
sum4 = sum2/sum3 ### mean of nL_n at inverse temperature BETA*(1+SMALLVAL), via reweighting
lambda2 = (sum1-sum4)/((1-(1/(1+SMALLVAL)))*log(n)) ### finite-difference RLCT estimate
lambda3 = mean((LLL-sum1)*(LLL-sum1))*BETA*BETA ### derivative form: BETA^2 * Var[nL_n]
############################## Estimate RLCT End ##########################
################ Theory RLCT of reduced rank regression BEGIN ###############
lambda1=(2*(HH+H0)*(MM+NN)-(MM-NN)*(MM-NN)-(HH+H0)*(HH+H0))/8
if(mod(MM+NN+HH+H0,2)==1){
lambda1=(2*(HH+H0)*(MM+NN)-(MM-NN)*(MM-NN)-(HH+H0)*(HH+H0)+1)/8
}
if(HH<H0){
lambda1=HH*(MM+NN-HH)/2
}
##### Theoretical RLCTs were proved in the paper,
##### M.Aoyagi, S. Watanabe, "Stochastic complexities of reduced rank regression
##### in Bayesian estimation," Neural Networks, Vol.18,No.7,pp.924-933,2005.
############################### Theory RLCT End ################################
###########################################################################
acceptr=acceptance/(MONTEC-BURNIN)
c(Rank=HH,WBIC=sum1,RLCT=lambda1,RLCT_est=lambda2,RLCT_d=lambda3,Accept=acceptr)
}))
proc.time() - t0
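Since ret1 is a matrix with one row per candidate rank, the selected model can be read off directly; for example (assuming the script above has just been run):
ret1[which.min(ret1[,"WBIC"]), ] ### row of the rank with the smallest WBIC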
Results
WBIC is smallest for Rank=3, which is the true model.
Rank WBIC RLCT RLCT_est RLCT_d Accept
[1,] 1 4979.831 5.5 5.215818 4.944885 0.735510
[2,] 2 -1566.378 10.0 9.714045 9.397373 0.565965
[3,] 3 -2511.113 13.5 14.258048 14.940474 0.493385
[4,] 4 -2503.035 15.0 14.962545 15.762366 0.546195
[5,] 5 -2500.002 16.0 15.722722 15.562083 0.511370
[6,] 6 -2496.732 17.0 15.899789 16.093919 0.445145