LoginSignup
2
1

More than 1 year has passed since last update.

主成分分析(多変量解析入門Ⅰ 河口至商先生)

Last updated at Posted at 2018-03-06
#主成分分析

#計算例4

#分散共分散行列から始めた場合;
data=data.frame(list=c("x1(身長)","x2(体重)","x3(胸囲)","x4(座高)"),average=c(149.06,40.26,73.98,80.02),variance=c(19.94,23.56,20.95,7.55),standard_deviation=c(4.47,4.85,4.58,2.75))

variance_covariance_matrix=matrix(c(19.94,10.5,6.59,8.63,10.5,23.56,19.71,7.97,6.59,19.71,20.95,3.93,8.63,7.97,3.93,7.55),ncol=4,nrow=4)
#固有値、固有ベクトル
eigen_value=eigen(variance_covariance_matrix)$values
eigen_vector=eigen(variance_covariance_matrix)$vectors
#累積寄与率と寄与率
propotion=eigen_value/sum(diag(variance_covariance_matrix))
accumulated_propotion=cumsum(propotion)
#何番目の要素から80%を超えるか調べる
check_vector=data.frame(num=1:length(accumulated_propotion),check=0)
count=0 #80%を超えたかどうか確認するのに用いる
for(j in 1:length(accumulated_propotion)){
  if(count==0){
  if(accumulated_propotion[j]>0.80){
   count=1;check_vector$check[j]=1;
  }
  }else{
   # do nothing! 
   }
}
#初めて累積寄与率が80%を超えたところはどこか?
print(paste0(check_vector$num[check_vector$check==1],"番目です。累積寄与率はそれぞれ",round(accumulated_propotion[1]*1000)/1000,"、",round(accumulated_propotion[2]*1000)/1000,"、",round(accumulated_propotion[3]*1000)/1000,"、",round(accumulated_propotion[4]*1000)/1000,"となった。"))
print(paste0("よって、1~",check_vector$num[check_vector$check==1],"を主成分として考える。"))
#因子負荷量
factor_loading=t(sqrt(eigen_value)*t(eigen_vector))*(1/sqrt(diag(variance_covariance_matrix)))

#相関行列から始めた場合;

R=matrix(c(1,0.484,0.322,0.703,0.484,1,0.887,0.597,0.322,0.887,1,0.313,0.703,0.597,0.313,1),ncol=4,nrow=4)

#固有値、固有ベクトル
eigen_value=eigen(R)$value
eigen_vector=eigen(R)$vectors


#"x1(身長)","x2(体重)","x3(胸囲)","x4(座高)"

N=20

data=rbind(data.frame(x1=145+runif(N,min=1,max=25),x2=40+runif(N,min=1,max=25),x3=60+runif(N,min=15,max=30),x4=60+runif(N,min=30,max=40)),data.frame(x1=145+runif(N,min=1,max=5),x2=40+runif(N,min=20,max=25),x3=60+runif(N,min=25,max=30),x4=60+runif(N,min=10,max=20)))

#for(j in 1:ncol(data)){

#data[,j]=(data[,j]-apply(data,2,mean)[j])/sqrt(apply(data,2,var)[j])

#}

#相関行列から始めた場合;

R=matrix(cor(data),ncol=4,nrow=4)

#固有値、固有ベクトル
eigen_value=eigen(R)$value
eigen_vector=eigen(R)$vectors

propotion=eigen_value/sum(eigen_value)

check_data=data.frame(check=0,accumulated_propotion=cumsum(propotion))

check_data$check[check_data$accumulated_propotion>=0.8]=1

count=0

for(j in 1:nrow(check_data)){
  
if(check_data$check[j]!=1){
  
  count=count+1
  
}  
  
}

count=count+1

#初めて累積寄与率が80%を超えたところはどこか?
print(paste0(count,"番目です。"))

for(j in 1:nrow(check_data)){
  
if(j %in% c(1:(nrow(check_data)-1))){
             
print(paste0("累積寄与率は",check_data$accumulated_propotion[j],"と"))
  
}else{
  
 print(paste0("累積寄与率は",check_data$accumulated_propotion[j],"です。"))
 
}
}

eigen_and_accumulated_propotion=data.frame(principal_component=1:nrow(check_data),eigen_value=eigen_value,accumulated_propotion=cumsum(propotion))

library(pforeach)

eigen_value_and_factor_loading=pforeach(j=1:count,.c=cbind)({
  
  z=(t((eigen_vector[,j]))%*%R)
  
  r=c()
  
  for(k in 1:ncol(R)){
  
  r=c(r,cov(t(z),R[,k])/sqrt(var(c(z))*var(R[,k])))
  
  }
  
  data=data.frame(eigen_vector=eigen_vector[,j],factor_loading=r)
  
})


z=data.frame(num=1:nrow(data))


  
  for(k in 1:count){
    
  Z=data.frame(num=1:nrow(data),z=0)
    
  for(j in 1:nrow(data)){
 
  Z$z[j]=sum(data[j,]*eigen_vector[,k])

  }
  
Z=Z %>% dplyr::select(z)

colnames(Z)=c(paste0("z.",k))
  
z=cbind(z,Z)    
    
  }



plot(z[,2],z[,3],type="p",xlab="肥満;痩身",ylab="大柄;小柄")

print("z1が高いとき足が長くて、胸囲が大きい。低いときは小柄")

print("z2が高いとき痩身、低いとき肥満")

X=as.matrix(data)

X_sub=as.matrix(data)-apply(data,2,mean)

d=diag(svd(X)$d)

u=svd(X)$u;v=svd(X)$v

a=c()

for(j in 1:nrow(X)){

a=c(a,(t(X[j,]-apply(data,2,mean)))%*%(diag(1,ncol(X))-t(u)%*%(u))%*%t(t(X[j,]-apply(data,2,mean))))

}

x2_mat=array(0,dim=c(dim(X)))

x1_mat=array(0,dim=c(dim(X)))

for(j in 1:nrow(X)){

x2_mat[j,]=t(u)%*%(u)%*%c(X[j,]-apply(data,2,mean))

x1_mat[j,]=c(X[j,]-apply(data,2,mean))-t(u)%*%(u)%*%c(X[j,]-apply(data,2,mean))

}

a_t2=ncol(X)*a


#確率主成分?

#"x1(身長)","x2(体重)","x3(胸囲)","x4(座高)"

N=20

data=rbind(data.frame(x1=145+runif(N,min=1,max=25),x2=40+runif(N,min=1,max=25),x3=60+runif(N,min=15,max=30),x4=60+runif(N,min=30,max=40)),data.frame(x1=145+runif(N,min=1,max=5),x2=40+runif(N,min=20,max=25),x3=60+runif(N,min=25,max=30),x4=60+runif(N,min=10,max=20)))



#相関行列から始めた場合;

R=cor(data);S=cov(data)



#固有値、固有ベクトル
eigen_value=eigen(S)$value

propotion=eigen_value/sum(eigen_value)

eigen_vector=eigen(S)$vectors

D=4;K=2

sigma_hat=sum(eigen_value[(K+1):length(eigen_value)])/(D-K)

P=eigen_vector[,1:K]

W=P%*%(diag(eigen_value[1:K],K)-diag(sigma_hat,K))^(1/2)

library(mvtnorm)

n=10000

samples=array(0,dim=c(D,n))

for(j in 1:ncol(samples)){

samples[,j]=(W%*%c(rmvnorm(1,rep(0,K),diag(1,K)))+t(rmvnorm(1,rep(0,D),diag(sigma_hat,D))))
  
} 

apply(samples,1,mean)

l=-(N*D*log(2*pi))/2-(N/2)*log(det(W%*%t(W)+diag(sigma_hat,D)))

cost=0

for(j in 1:nrow(data)){
  
cost=t(c(as.numeric(data[j,])))%*%solve(W%*%t(W)+diag(sigma_hat,D))%*%t(t(c(as.numeric(data[j,]))))+cost 
  
}

l=l-(1/2)*cost

plot((as.matrix(data))%*%as.matrix(W))

  

#入門 機械学習における異常検知 p.140

#確率主成分分析

dim=3

X=as.matrix(iris[,1:4])

n=nrow(X);m=ncol(X)

sigma2=1

W=array(1,dim=c(m,dim))

x_bar=apply(X,2,mean)


times=1000

for(l in 1:times){
  
sigma2_pre=sigma2;W_pre=W  
  
M=t(W)%*%W+diag(sigma2,dim)

W1=array(0,dim=c(ncol(X),dim))

W2=array(0,dim=c(dim,dim))

sigma2_hat=c()
  
for(i in 1:nrow(X)){
  
z_n=solve((M))%*%t(W)%*%c(X[i,]-x_bar) 

zz_n=sigma2*solve(M)+z_n%*%t(z_n)

W1=W1+t(t(X[i,]-x_bar))%*%t(z_n)

W2=W2+zz_n

sigma2_hat=c(sigma2_hat,sum((X[i,]-x_bar)^2)-2*t(X[i,]-x_bar)%*%W%*%z_n+sum(diag(zz_n%*%t(W)%*%W)))
  
}  



W=W1%*%solve(W2);sigma2=sum(sigma2_hat)/prod(dim(X))

L=-(prod(dim(X)))/2-(sigma2*sum(diag(W2)))/(2*sigma2)-ncol(X)*log(2*pi*sigma2)/2

BIC=-2*L+(ncol(X)*dim-(dim*(dim-1))/2+dim+1)*log(nrow(X))

print(BIC)
  
#print(sum(abs(W_pre-W))+abs(sigma2_pre-sigma2))  

}



#異常検知
C=W%*%t(W)+diag(sigma2,ncol(X))

#データの標本分散共分散行列
var(X)

#推定されたパラメータsigma2
sigma2


#計算例4

#分散共分散行列から始めた場合;
data=data.frame(list=c("x1(身長)","x2(体重)","x3(胸囲)","x4(座高)"),average=c(149.06,40.26,73.98,80.02),variance=c(19.94,23.56,20.95,7.55),standard_deviation=c(4.47,4.85,4.58,2.75))

variance_covariance_matrix=matrix(c(19.94,10.5,6.59,8.63,10.5,23.56,19.71,7.97,6.59,19.71,20.95,3.93,8.63,7.97,3.93,7.55),ncol=4,nrow=4)
#固有値、固有ベクトル
eigen_value=eigen(variance_covariance_matrix)$values
eigen_vector=eigen(variance_covariance_matrix)$vectors
#累積寄与率と寄与率
propotion=eigen_value/sum(diag(variance_covariance_matrix))
accumulated_propotion=cumsum(propotion)

cov_mat=variance_covariance_matrix

#iterationによる主成分の計算法

#1回目

#Broyden Quasi-Newton Algorithm p.213

f=function(x){

lam=x[1];x=t(t(c(x[-1])))  

f1=2*cov_mat%*%x-2*lam*x

f2=1-t(x)%*%x

return(c(f1,f2))

}

X=rep(3,ncol(cov_mat)+1)

ite=10000;eta=0.01

H=diag(f(X))

for(l in 1:ite){

if(sum(abs(f(X)))>10^(-9)){  

X_pre=X  

X=X-eta*H%*%f(X)  

s=X-X_pre

y=f(X)-f(X_pre)

H=H+((s-H%*%y)/as.numeric(t(s)%*%H%*%y))%*%t(s)%*%H

print((f(X)))

}

}  

lam1=X[1]

w1=X[-1]


#2回目

#Broyden Quasi-Newton Algorithm p.213

f=function(x){

lam=x[1];r=x[2];x=t(t(c(x[-c(1:2)])))  

f3=2*cov_mat%*%x-2*lam*x+r*w1

f1=1-t(x)%*%x

f2=sum(w1*x)

return(c(f1,f2,f3))

}

X=rep(2.5,ncol(cov_mat)+2)

ite=100000;eta=0.01

H=diag(f(X))

for(l in 1:ite){

if(sum(abs(f(X)))>10^(-9)){  

X_pre=X  

X=X-eta*H%*%f(X)  

s=X-X_pre

y=f(X)-f(X_pre)

H=H+((s-H%*%y)/as.numeric(t(s)%*%H%*%y))%*%t(s)%*%H

print(sum(abs(f(X))))

}

}  

lam2=X[1]

w2=X[-c(1:2)]


2
1
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
2
1