LoginSignup
1
2

More than 3 years have passed since last update.

重回帰(多変量解析入門Ⅰ 河田至商先生)

Last updated at Posted at 2018-11-16
#multiple regression 

#例1
data=data.frame(num=1:20,y=c(167,167.5,168.4,172,155.3,151.4,163,174,168,160.4,164.7,171,162.6,164.8,163.3,167.6,169.2,168,167.4,172),x1=c(84,87,86,85,82,87,92,94,88,84.9,78,90,88,87,82,84,86,83,85.2,82),x2=c(61,55.5,57,57,50,50,66.5,65,60.5,49.5,49.5,61,59.5,58.4,53.5,54,60,58.8,54,56))

#平均と標準偏差のデータ
Expect_and_sd_data=data.frame(value=c("mean","sd"),y=c(mean(data$y),sd(data$y)),x1=c(mean(data$x1),sd(data$x1)),x2=c(mean(data$x2),sd(data$x2)))

library(dplyr)
Data=data %>% select(x1,x2,y)
#偏差積和行列
A=array(0,dim=c(3,3))
for(i in 1:ncol(A)){
  for(j in 1:nrow(A)){
A[i,j]=sum((Data[,i]-mean(Data[,i]))*(Data[,j]-mean(Data[,j])))
    }
}

part_of_inverse_A=solve(A[1:2,1:2])

#偏回帰係数
b1=sum(A[1:2,3]*part_of_inverse_A[1,1:2])
b2=sum(A[1:2,3]*part_of_inverse_A[2,1:2])
b0=mean(data$y)-b1*mean(data$x1)-b2*mean(data$x2)

print(paste0("Y=",signif(b0,digits=4),signif(b1,digits=4),"x1+",signif(b2,digits=4),"x2"))

#重回帰式の有意性の検定

ayy=A[3,3]
#残差の変動
EV=sum((Data$y-(b0+b1*Data$x1+b2*Data$x2))^2)
#回帰による変動
RV=sum(((b0+b1*Data$x1+b2*Data$x2)-mean(b0+b1*Data$x1+b2*Data$x2))^2)

if(signif(ayy,digits=6)==signif(EV+RV,digits=6)){
  print("your answer is right!")
}

#寄与率(propotion)
R=(1-EV/(EV+RV))^(1/2)
R^2

#標準変量の追加
data=data %>% mutate(Y=(y-mean(y))/sd(y),X1=(x1-mean(x1))/sd(x1),X2=(x2-mean(x2))/sd(x2))

plot(data$Y,ylim=c(-5,5),type="p",col=1)

plot(data$X1,ylim=c(-5,5),type="p",col=2)

plot(data$X2,ylim=c(-5,5),type="p",col=3)

B1=b1*((A[1,1]/ayy)^(1/2));B2=b2*((A[2,2]/ayy)^(1/2))

print(paste0("Y=",signif(B1,digits = 4),"X1+",signif(B2,digits=4),"X2"))

V_R=RV/(ncol(Data)-1);V_e=EV/(nrow(Data)-(ncol(Data)-1)-1)

static_F=V_R/V_e

F_dis=function(n1,n2,x){

  y<-(gamma((n1+n2)/2)*((n1/n2)^(n1/2))*(x^((n1/2)-1))*((1+(n1*x)/n2)^(-(n1+n2)/2)))/(gamma(n1/2)*gamma(n2/2))
  return(y)

}
n1=(ncol(Data)-1);n2=(nrow(Data)-(ncol(Data)-1)-1)

F_dis_data=data.frame(num=seq(0.01,10,by=0.01)) %>% mutate(value=F_dis(n1,n2,num))
plot(F_dis_data$num,F_dis_data$value,type="l")

#実際にパッケージの値と比べてみてください。
df(0.01,n1,n2) #0.01のところはF_dis_dataのnumの値、valueは出力値になります

#F分布の確率点
alpha=0.05
x_point=qf((1-alpha),2,17)
if(x_point<static_F){
  print(paste0("危険率",100*alpha,"%を許すならば有意である"))
}
rm(list=ls())

#例2
data=data.frame(prefecture=c("Hokkaido","Aomori","Iwate","Miyagi","Akita","Yamagata","Fukushima","Ibaragi","Totigi","Gumma","Saitama","Tiba","Tokyo","Kanagawa","Niigata","Toyama","Ishikawa","Fukui","Yamanashi","Nagano","Gifu","Shizuoka","Aichi","Mie","Shiga","Kyoto","Oosaka","Hyogo","Nara","Wakayama","Tottori","Shimane","Okayama","Hiroshima","Yamaguchi","Tokushima","Kagawa","Ehime","Kouchi","Hukuoka","Saga","Nagasaki","Kumamoto","Ooita","Miyazaki","Kagoshima"),x1=c(6364,7135,7266,7186,6274,6497,6740,6687,6917,6725,7928,7403,13500,8537,7315,6730,8875,7589,7249,6319,7673,7270,8149,6385,7049,10861,11548,9986,6728,7596,7324,6166,5889,8170,5147,6997,6460,6657,7979,7301,6564,8866,6928,6983,6016,7688),x2=c(90547,12634,20101,15486,12471,12446,27026,19486,19919,16611,42768,37630,147481,55648,27569,11709,13660,11161,10092,22957,20135,36377,48255,19562,9889,16925,146758,75350,16405,13007,8005,10334,22146,23029,15488,9905,7407,14795,12387,31172,8151,11117,14634,11504,11415,12546),x3=c(19.7,20,16.3,20.6,18.1,21.2,14.2,9.6,28.8,16.3,15.5,28.4,49.1,31.8,14.6,35.3,30.8,32.9,24.2,11.4,18.3,19.6,21.4,14.6,21.4,23.6,40.9,24.8,10.7,16.8,26.6,10.2,11,16.3,23.3,10.8,21.9,16.2,11.8,17.7,24.7,17.5,18.5,20.9,25.2,32.5),x4=c(640.9,128.2,113.8,171.7,97.4,134.4,166.5,225.1,187.5,212,337.3,319.4,1584.3,559.2,234.3,121.4,120,93.5,93.2,238.1,255.1,419.1,825.1,160.5,91.8,233.6,807.5,404.6,85.8,111,44.1,52.9,156.6,256.1,126.8,71.4,81.5,108.1,72.7,367,70.6,89.6,136.6,97.5,96.3,119.4),y=c(44523,10395,8669,12808,7640,7094,17556,20320,18715,16755,34601,27811,88406,48030,18213,8839,12086,8871,8961,15280,17837,37421,50998,14809,12610,35478,75497,55464,7916,14259,6244,5400,19015,36771,13827,9372,11088,9960,8516,51175,10384,10652,15987,10216,8460,12932))

Data=data %>% select(x1,x2,x3,x4,y)
library(dplyr)

Expect_and_sd_data=data.frame(value=c("mean","sd"),x1=c(mean(Data$x1),sd(Data$x1)),x2=c(mean(Data$x2),sd(Data$x2)),x3=c(mean(Data$x3),sd(Data$x3)),x4=c(mean(Data$x4),sd(Data$x4)),y=c(mean(Data$y),sd(Data$y)))

#偏差積和
A=array(0,dim=c(ncol(Data),ncol(Data)))
for(i in 1:ncol(A)){
  for(j in 1:nrow(A)){
   A[i,j]=sum((Data[,i]-mean(Data[,i]))*(Data[,j]-mean(Data[,j]))) 
   }
}

inv_A=solve(A[1:(ncol(A)-1),1:(nrow(A)-1)])

#相関行列
correlation=array(0,dim=c(ncol(Data),ncol(Data)))
for(i in 1:ncol(correlation)){
  for(j in 1:nrow(correlation)){
    correlation[i,j]=(sum((Data[,i]-mean(Data[,i]))*(Data[,j]-mean(Data[,j]))))/((sum((Data[,i]-mean(Data[,i]))^2)*sum((Data[,j]-mean(Data[,j]))^2))^(1/2))
  }
}

#偏差積和(y)
A_y=array(0,dim=c(1,ncol(Data)-1))
for(j in 1:ncol(A_y)){
  A_y[1,j]=sum((Data[,j]-mean(Data[,j]))*(Data[,ncol(Data)]-mean(Data[,ncol(Data)])))
}

#偏回帰係数
B=array(0,dim=c(1,(ncol(Data)-1)))
for(j in 1:length(B)){
B[1,j]=sum(inv_A[j,]*A_y)
}
B0=mean(Data[,ncol(Data)])-sum(B*apply(Data[,1:(ncol(Data)-1)],2,mean))
B=cbind(B0,B)

#t値
ayy=A[ncol(A),nrow(A)]

RV=sum(A[1:(nrow(A)-1),ncol(A)]*B[1,2:length(B)])
EV=ayy-RV

V_e=EV/(nrow(Data)-(ncol(Data)-1)-1)
V_R=RV/(ncol(Data)-1)
t=array(0,dim=c(1,(ncol(Data)-1)))

for(j in 1:(length(t))){

  t[1,j]=abs(B[1,1+j])/((inv_A[j,j]*V_e)^(1/2))
}

#t0
element=array(0,dim=c((ncol(Data)-1),(ncol(Data)-1)))
for(j in 1:(ncol(Data)-1)){
 for(i in 1:(ncol(Data)-1)){
   element=mean(Data[,i])*mean(Data[,j])*inv_A[i,j]
 } 
}


t0=abs(B0)/(((1/nrow(Data)+sum(element))*V_e)^(1/2))

t=cbind(t0,t)

#偏回帰係数の検定

Hypothesis_B_data=data.frame(num=0:(length(B)-1),B_value=0,percent5=0,percent1=0)
Hypothesis_B_data$B_value=t(B)
for(j in 1:nrow(Hypothesis_B_data)){

alpha=0.95
if(qt(alpha,(nrow(Data)-(ncol(Data)-1)-1))<=t[1,j]){
  Hypothesis_B_data$percent5[j]=1
}

alpha=0.99

if(qt(alpha,(nrow(Data)-(ncol(Data)-1)-1))<=t[1,j]){
  Hypothesis_B_data$percent1[j]=1
}
}

#1:有意,0:有意でない

#例3

#偏差積和行列

A=t(matrix(c(0.215119*(10)^4,0,0,0,0,0,-0.37170193*(10^6),0.45847536*(10)^9,0,0,0,0,0.1231154*(10)^6,-0.95198132*(10)^8,0.69426706*(10)^9,0,0,0,-0.2105072*(10)^4,0.10520093*(10)^7,0.40752956*(10)^5,0.898905*(10)^4,0,0,-0.236726*(10)^4,0.15493275*(10)^7,-0.51346271*(10)^6,0.1046695*(10)^5,0.1366869*(10)^5,0,-0.217398*(10)^4,0.12777251*(10)^7,-0.23044251*(10)^6,0.263454*(10)^4,0.3495001*(10)^4,0.517279*(10)^4),ncol=6,nrow=6))

A=A+t(A);diag(A)<-diag(A)/2
inv_A=solve(A)

b2=A[2,ncol(A)]/A[2,2]
A[ncol(A),nrow(A)]-(A[2,ncol(A)]^2)/A[2,2]

#相関行列

correlation=t(matrix(c(1,0,0,0,0,0,-0.37,1,0,0,0,0,0.1,-0.17,1,0,0,0,-0.48,0.52,0.02,1,0,0,-0.44,0.62,-0.17,0.94,1,0,-0.65,0.83,-0.12,0.39,0.42,1),ncol=6,nrow=6))

correlation=correlation+(t(correlation-diag(1,nrow(correlation))))

result1=data.frame(method=c(as.character("手順1"),as.character("手順1")),variable=c(as.character("x2"),as.character("b0")),propotion=c(0,"-"),F_value=c(0,0),F_test=c(0,0),multiple_correlation_coefficient=c(0,0))

1
2
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
1
2