#主成分分析
#計算例4
#分散共分散行列から始めた場合;
data=data.frame(list=c("x1(身長)","x2(体重)","x3(胸囲)","x4(座高)"),average=c(149.06,40.26,73.98,80.02),variance=c(19.94,23.56,20.95,7.55),standard_deviation=c(4.47,4.85,4.58,2.75))
variance_covariance_matrix=matrix(c(19.94,10.5,6.59,8.63,10.5,23.56,19.71,7.97,6.59,19.71,20.95,3.93,8.63,7.97,3.93,7.55),ncol=4,nrow=4)
#固有値、固有ベクトル
eigen_value=eigen(variance_covariance_matrix)$values
eigen_vector=eigen(variance_covariance_matrix)$vectors
#累積寄与率と寄与率
propotion=eigen_value/sum(diag(variance_covariance_matrix))
accumulated_propotion=cumsum(propotion)
#何番目の要素から80%を超えるか調べる
check_vector=data.frame(num=1:length(accumulated_propotion),check=0)
count=0 #80%を超えたかどうか確認するのに用いる
for(j in 1:length(accumulated_propotion)){
if(count==0){
if(accumulated_propotion[j]>0.80){
count=1;check_vector$check[j]=1;
}
}else{
# do nothing!
}
}
#初めて累積寄与率が80%を超えたところはどこか?
print(paste0(check_vector$num[check_vector$check==1],"番目です。累積寄与率はそれぞれ",round(accumulated_propotion[1]*1000)/1000,"、",round(accumulated_propotion[2]*1000)/1000,"、",round(accumulated_propotion[3]*1000)/1000,"、",round(accumulated_propotion[4]*1000)/1000,"となった。"))
print(paste0("よって、1~",check_vector$num[check_vector$check==1],"を主成分として考える。"))
#因子負荷量
factor_loading=t(sqrt(eigen_value)*t(eigen_vector))*(1/sqrt(diag(variance_covariance_matrix)))
#相関行列から始めた場合;
R=matrix(c(1,0.484,0.322,0.703,0.484,1,0.887,0.597,0.322,0.887,1,0.313,0.703,0.597,0.313,1),ncol=4,nrow=4)
#固有値、固有ベクトル
eigen_value=eigen(R)$value
eigen_vector=eigen(R)$vectors
#"x1(身長)","x2(体重)","x3(胸囲)","x4(座高)"
N=20
data=rbind(data.frame(x1=145+runif(N,min=1,max=25),x2=40+runif(N,min=1,max=25),x3=60+runif(N,min=15,max=30),x4=60+runif(N,min=30,max=40)),data.frame(x1=145+runif(N,min=1,max=5),x2=40+runif(N,min=20,max=25),x3=60+runif(N,min=25,max=30),x4=60+runif(N,min=10,max=20)))
#for(j in 1:ncol(data)){
#data[,j]=(data[,j]-apply(data,2,mean)[j])/sqrt(apply(data,2,var)[j])
#}
#相関行列から始めた場合;
R=matrix(cor(data),ncol=4,nrow=4)
#固有値、固有ベクトル
eigen_value=eigen(R)$value
eigen_vector=eigen(R)$vectors
propotion=eigen_value/sum(eigen_value)
check_data=data.frame(check=0,accumulated_propotion=cumsum(propotion))
check_data$check[check_data$accumulated_propotion>=0.8]=1
count=0
for(j in 1:nrow(check_data)){
if(check_data$check[j]!=1){
count=count+1
}
}
count=count+1
#初めて累積寄与率が80%を超えたところはどこか?
print(paste0(count,"番目です。"))
for(j in 1:nrow(check_data)){
if(j %in% c(1:(nrow(check_data)-1))){
print(paste0("累積寄与率は",check_data$accumulated_propotion[j],"と"))
}else{
print(paste0("累積寄与率は",check_data$accumulated_propotion[j],"です。"))
}
}
eigen_and_accumulated_propotion=data.frame(principal_component=1:nrow(check_data),eigen_value=eigen_value,accumulated_propotion=cumsum(propotion))
library(pforeach)
eigen_value_and_factor_loading=pforeach(j=1:count,.c=cbind)({
z=(t((eigen_vector[,j]))%*%R)
r=c()
for(k in 1:ncol(R)){
r=c(r,cov(t(z),R[,k])/sqrt(var(c(z))*var(R[,k])))
}
data=data.frame(eigen_vector=eigen_vector[,j],factor_loading=r)
})
z=data.frame(num=1:nrow(data))
for(k in 1:count){
Z=data.frame(num=1:nrow(data),z=0)
for(j in 1:nrow(data)){
Z$z[j]=sum(data[j,]*eigen_vector[,k])
}
Z=Z %>% dplyr::select(z)
colnames(Z)=c(paste0("z.",k))
z=cbind(z,Z)
}
plot(z[,2],z[,3],type="p",xlab="肥満;痩身",ylab="大柄;小柄")
print("z1が高いとき足が長くて、胸囲が大きい。低いときは小柄")
print("z2が高いとき痩身、低いとき肥満")
X=as.matrix(data)
X_sub=as.matrix(data)-apply(data,2,mean)
d=diag(svd(X)$d)
u=svd(X)$u;v=svd(X)$v
a=c()
for(j in 1:nrow(X)){
a=c(a,(t(X[j,]-apply(data,2,mean)))%*%(diag(1,ncol(X))-t(u)%*%(u))%*%t(t(X[j,]-apply(data,2,mean))))
}
x2_mat=array(0,dim=c(dim(X)))
x1_mat=array(0,dim=c(dim(X)))
for(j in 1:nrow(X)){
x2_mat[j,]=t(u)%*%(u)%*%c(X[j,]-apply(data,2,mean))
x1_mat[j,]=c(X[j,]-apply(data,2,mean))-t(u)%*%(u)%*%c(X[j,]-apply(data,2,mean))
}
a_t2=ncol(X)*a
#確率主成分?
#"x1(身長)","x2(体重)","x3(胸囲)","x4(座高)"
N=20
data=rbind(data.frame(x1=145+runif(N,min=1,max=25),x2=40+runif(N,min=1,max=25),x3=60+runif(N,min=15,max=30),x4=60+runif(N,min=30,max=40)),data.frame(x1=145+runif(N,min=1,max=5),x2=40+runif(N,min=20,max=25),x3=60+runif(N,min=25,max=30),x4=60+runif(N,min=10,max=20)))
#相関行列から始めた場合;
R=cor(data);S=cov(data)
#固有値、固有ベクトル
eigen_value=eigen(S)$value
propotion=eigen_value/sum(eigen_value)
eigen_vector=eigen(S)$vectors
D=4;K=2
sigma_hat=sum(eigen_value[(K+1):length(eigen_value)])/(D-K)
P=eigen_vector[,1:K]
W=P%*%(diag(eigen_value[1:K],K)-diag(sigma_hat,K))^(1/2)
library(mvtnorm)
n=10000
samples=array(0,dim=c(D,n))
for(j in 1:ncol(samples)){
samples[,j]=(W%*%c(rmvnorm(1,rep(0,K),diag(1,K)))+t(rmvnorm(1,rep(0,D),diag(sigma_hat,D))))
}
apply(samples,1,mean)
l=-(N*D*log(2*pi))/2-(N/2)*log(det(W%*%t(W)+diag(sigma_hat,D)))
cost=0
for(j in 1:nrow(data)){
cost=t(c(as.numeric(data[j,])))%*%solve(W%*%t(W)+diag(sigma_hat,D))%*%t(t(c(as.numeric(data[j,]))))+cost
}
l=l-(1/2)*cost
plot((as.matrix(data))%*%as.matrix(W))
#入門 機械学習における異常検知 p.140
#確率主成分分析
dim=3
X=as.matrix(iris[,1:4])
n=nrow(X);m=ncol(X)
sigma2=1
W=array(1,dim=c(m,dim))
x_bar=apply(X,2,mean)
times=1000
for(l in 1:times){
sigma2_pre=sigma2;W_pre=W
M=t(W)%*%W+diag(sigma2,dim)
W1=array(0,dim=c(ncol(X),dim))
W2=array(0,dim=c(dim,dim))
sigma2_hat=c()
for(i in 1:nrow(X)){
z_n=solve((M))%*%t(W)%*%c(X[i,]-x_bar)
zz_n=sigma2*solve(M)+z_n%*%t(z_n)
W1=W1+t(t(X[i,]-x_bar))%*%t(z_n)
W2=W2+zz_n
sigma2_hat=c(sigma2_hat,sum((X[i,]-x_bar)^2)-2*t(X[i,]-x_bar)%*%W%*%z_n+sum(diag(zz_n%*%t(W)%*%W)))
}
W=W1%*%solve(W2);sigma2=sum(sigma2_hat)/prod(dim(X))
L=-(prod(dim(X)))/2-(sigma2*sum(diag(W2)))/(2*sigma2)-ncol(X)*log(2*pi*sigma2)/2
BIC=-2*L+(ncol(X)*dim-(dim*(dim-1))/2+dim+1)*log(nrow(X))
print(BIC)
#print(sum(abs(W_pre-W))+abs(sigma2_pre-sigma2))
}
#異常検知
C=W%*%t(W)+diag(sigma2,ncol(X))
#データの標本分散共分散行列
var(X)
#推定されたパラメータsigma2
sigma2
#計算例4
#分散共分散行列から始めた場合;
data=data.frame(list=c("x1(身長)","x2(体重)","x3(胸囲)","x4(座高)"),average=c(149.06,40.26,73.98,80.02),variance=c(19.94,23.56,20.95,7.55),standard_deviation=c(4.47,4.85,4.58,2.75))
variance_covariance_matrix=matrix(c(19.94,10.5,6.59,8.63,10.5,23.56,19.71,7.97,6.59,19.71,20.95,3.93,8.63,7.97,3.93,7.55),ncol=4,nrow=4)
#固有値、固有ベクトル
eigen_value=eigen(variance_covariance_matrix)$values
eigen_vector=eigen(variance_covariance_matrix)$vectors
#累積寄与率と寄与率
propotion=eigen_value/sum(diag(variance_covariance_matrix))
accumulated_propotion=cumsum(propotion)
cov_mat=variance_covariance_matrix
#iterationによる主成分の計算法
#1回目
#Broyden Quasi-Newton Algorithm p.213
f=function(x){
lam=x[1];x=t(t(c(x[-1])))
f1=2*cov_mat%*%x-2*lam*x
f2=1-t(x)%*%x
return(c(f1,f2))
}
X=rep(3,ncol(cov_mat)+1)
ite=10000;eta=0.01
H=diag(f(X))
for(l in 1:ite){
if(sum(abs(f(X)))>10^(-9)){
X_pre=X
X=X-eta*H%*%f(X)
s=X-X_pre
y=f(X)-f(X_pre)
H=H+((s-H%*%y)/as.numeric(t(s)%*%H%*%y))%*%t(s)%*%H
print((f(X)))
}
}
lam1=X[1]
w1=X[-1]
#2回目
#Broyden Quasi-Newton Algorithm p.213
f=function(x){
lam=x[1];r=x[2];x=t(t(c(x[-c(1:2)])))
f3=2*cov_mat%*%x-2*lam*x+r*w1
f1=1-t(x)%*%x
f2=sum(w1*x)
return(c(f1,f2,f3))
}
X=rep(2.5,ncol(cov_mat)+2)
ite=100000;eta=0.01
H=diag(f(X))
for(l in 1:ite){
if(sum(abs(f(X)))>10^(-9)){
X_pre=X
X=X-eta*H%*%f(X)
s=X-X_pre
y=f(X)-f(X_pre)
H=H+((s-H%*%y)/as.numeric(t(s)%*%H%*%y))%*%t(s)%*%H
print(sum(abs(f(X))))
}
}
lam2=X[1]
w2=X[-c(1:2)]