#回帰分析の基礎
library(dplyr)
data=data.frame(years=c(1900,1904,1908,1912,1920,1924,1928,1932,1936,1948,1952,1956,1960,1964,1968,1972,1976,1980),seconds=c(22.2,21.6,22.6,21.7,22,21.6,21.8,21.2,20.7,21.1,20.7,20.6,20.5,20.3,19.8,20,20.23,20.19))
x=(data$years-1940)/4;y=data$seconds
S_xx=var(x)*length(x);S_yy=var(y)*length(y)
r=cor(x,y)
b=r*sqrt(S_yy/S_xx)
a=mean(y)-b*mean(x)
S_R=var(a+b*x)*length(x)
R2=S_R/S_yy
S_e=S_yy-S_R
c=(x-mean(x))/S_xx
#sum(c)=0
sum(c)
#sum(c*x)=1
sum(c*x)
#sum(c^2)=1/S_xx
sum(c^2)
1/S_xx
b_hat=b;a_hat=a
#各x_i,i=1,2,...に対して分散を1と仮定すると、bの分布は;
sigma=1
#1/times=0とみなせるぐらい大きくとる。
times=100000
b_vec=c()
for(j in 1:times){
b=b_hat-sum(c*rnorm(length(c),mean=0,sd=sigma))
b_vec=c(b_vec,b)
}
#b_hatの平均値は(=b_hat)
mean(b_vec)
#b_hatの分散は(=sigma^2/S_xx)
var(b_vec)
a_vec=mean(y)-b_vec*mean(x)
#aの平均は(=a_hat)
mean(a_vec)
#aの分散は(=(sigma^2)*((mean(x)^2)/S_xx))
var(a_vec)
#kai2
S_e_var=c()
N=10
for(j in 1:times){
S_e_var=c(S_e_var,sum(floor(((rnorm(N-2,mean=0,sd=sigma)/sigma)^2)*10)/10))
}
#S_eの平均は(=(N-2)*sigma^2)
mean(S_e_var)
#例題2.2
#t分布の区間推定
sigma_hat=sqrt(S_e/(length(x)-2))
alpha=0.05
t0_under=b_hat+qt(alpha/2,df=length(x)-2)*sigma_hat*(1/sqrt(S_xx))
t0_upper=b_hat+qt(1-alpha/2,df=length(x)-2)*sigma_hat*(1/sqrt(S_xx))
#95%の可能性でこの期間4年ごとに記録が0.09~0.14秒の間で向上しているといえる
#例題2.3
data2=data.frame(x=c(4:14),y=c(4.26,5.68,7.24,4.82,6.95,8.81,8.04,8.33,10.84,7.58,9.96))
n=nrow(data2);x_bar=mean(data2$x);S_xx=n*var(data2$x);S_yy=n*var(data2$y);r=cor(data2$x,data2$y)
b_hat=r*sqrt(S_yy/S_xx);a_hat=mean(data2$y)-b_hat*mean(data2$x)
S_R=n*var(a_hat+b_hat*data2$x)
S_e=S_yy-S_R;sigma_hat=S_e/(n-2)
predict=data.frame(x=data2$x,under=0,upper=0,predict=0)
alpha=0.05
for(j in 1:nrow(predict)){
predict$under[j]=a_hat+b_hat*predict$x[j]+qt(alpha/2,df=n-2)*sigma_hat*sqrt(1/n+((predict$x[j]-mean(data2$x))^2)/S_xx)
predict$upper[j]=a_hat+b_hat*predict$x[j]+qt(1-alpha/2,df=n-2)*sigma_hat*sqrt(1/n+((predict$x[j]-mean(data2$x))^2)/S_xx)
predict$predict[j]=a_hat+b_hat*predict$x[j]
}
plot(predict$x,predict$predict,xlab="x",ylab="y",col=1,type="o",xlim=c(4,14),ylim=c(min(predict[,colnames(predict)!="x"]),max(predict[,colnames(predict)!="x"])))
par(new=T)
plot(predict$x,predict$under,xlab="x",ylab="y",col=2,type="o",xlim=c(4,14),ylim=c(min(predict[,colnames(predict)!="x"]),max(predict[,colnames(predict)!="x"])))
par(new=T)
plot(predict$x,predict$upper,xlab="x",ylab="y",col=2,type="o",xlim=c(4,14),ylim=c(min(predict[,colnames(predict)!="x"]),max(predict[,colnames(predict)!="x"])))
#Hb:b=0 <=> XとYは独立
alpha=0.05
sqrt(n-2)*abs(r)/sqrt(1-r^2)<=qt(1-alpha,df=n-2)
data2=data.frame(x=c(4:14),y=c(4.26,5.68,7.24,4.82,6.95,8.81,8.04,8.33,10.84,7.58,9.96))
n=nrow(data2);x_bar=mean(data2$x);S_xx=n*var(data2$x);S_yy=n*var(data2$y);r=cor(data2$x,data2$y)
b_hat=r*sqrt(S_yy/S_xx);a_hat=mean(data2$y)-b_hat*mean(data2$x)
S_R=n*var(a_hat+b_hat*data2$x)
S_e=S_yy-S_R;sigma_hat=S_e/(n-2)
predict=a_hat+b_hat*data2$x
d=(data2$y-predict)/sqrt((n-1)/n-(data2$x-mean(data2$x))^2/S_xx)
plot(d) #average:0,sd:sigma^2
y=data2$y;x=data2$x
#Hi:yi=a_hat+b*xi+ei,ei~N(0,sigma^2)
#Ki:yi=b_hat2+a_hat2*log(xi), ei~N(0,sigma^2)
#Ki;
a_hat2=(sum(n*y*log(x))-sum(y)*sum(log(x)))/(sum(n*log(x)^2)-sum(log(x))^2)
b_hat2=mean(y)-a_hat*mean(log(x))
sigma2_k=c();sigma2_h=c();ave=c();
for(j in 1:length(y)){
y_data=data.frame(num=1:length(y),x=x,y=y,predict1=a_hat+b_hat*x,predict2=a_hat2*log(x)+b_hat2)
y_data=y_data %>% filter(num!=j)
sigma2_k=c(sigma2_k,(1/(n-3))*sum((y_data$y-y_data$predict2)^2))
sigma2_h=c(sigma2_h,(1/(n-3))*sum((y_data$y-y_data$predict1)^2))
ave=c(ave,(1/(n-1))*sum(y_data$x))
}
y_data=data.frame(num=1:length(y),y=y,predict1=a_hat+b_hat*x,predict2=a_hat2*log(x)+b_hat2)
t_data=data.frame(num=1:length(y),t=0)
for(j in 1:nrow(t_data)){
t_data$t[j]=(y[j]-y_data$predict1[j])/(sigma2_k[j]*sqrt(1+(1/(n-1))+((x[j]-ave[j])^2)/S_xx))
}
alpha=0.05
t_data=t_data %>% mutate(t_value=qt(1-alpha,df=n-3))
#等分散性について(levene test)
#H0:sigma1^2=sigma2^2
N1=100;N2=50
y1=rnorm(N1,mean=1,sd=3);y2=rnorm(N2,mean=2,sd=3)
alpha=0.05
Y=cbind(y1,y2)
Z=abs(Y-apply(Y,2,mean))
N=N1+N2
W=((N-2)*sum(((apply(Z,2,mean)-mean(Z))^2)*c(N1,N2)))/((2-1)*sum((Z-apply(Z,2,mean))^2))
alpha=0.05
W>qf(1-alpha,2-1,N-2) #H0を棄却
#等分散性について(Bartlett test)
#H0:sigma1^2=sigma2^2
S_b=sum((c(N1,N2)-1)*apply(Y,2,var))/(N-2)
T=((N-2)*log(S_b)-sum((c(N1,N2)-1)*log(apply(Y,2,var))))/(1+(1/3)*(sum(1/(c(N1,N2)-1))-1/(N-2)))
alpha=0.01
T>qchisq(1-alpha,df=2-1) #H0を棄却
#Box-cox transform
data=data.frame(years=c(1941:1981),values=c(4,1,4,6,4,5,10,8,10,12,10,15,21,12,19,24,14,30,33,23,25,26,27,43,50,33,55,63,46,57,58,78,68,81,90,75,75,127,110,130,185))
x_bar=mean(data$years-1900)
y_dot=(prod(data$values)^(1/nrow(data)))
S_xx=var(data$years)*nrow(data)
S_yy_vec=c();y_bar=c();a_vec=c();b_vec=c()
lambda_data=data.frame(lambda=seq(-0.5,0.5,0.1),S_e=0)
for(j in 1:nrow(lambda_data)){
if(lambda_data$lambda[j]==0){
y_sub=function(s){
z=log(s)*y_dot
return(z)
}
}else{
y_sub=function(s){
z=(s^lambda_data$lambda[j]-1)/(lambda_data$lambda[j]*y_dot^(lambda_data$lambda[j]-1))
return(z)
}
}
y2=c()
for(i in 1:nrow(data)){
y2=c(y2,y_sub(data$values[i]))
}
y_bar=c(y_bar,mean(y2))
S_yy=var(y2)*nrow(data)
S_yy_vec=c(S_yy_vec,S_yy)
r=cor(data$years-1900,y2)
b_hat=r*sqrt(S_yy/S_xx)
a_hat=mean(y2)-b_hat*x_bar
a_vec=c(a_vec,a_hat);b_vec=c(b_vec,b_hat)
lambda_data$S_e[j]=sum((y2-a_hat-b_hat*(data$years-1900))^2)
plot(data$years-1900,y2,xlab="years",ylab="values",xlim=c(min(data$years-1900),max(data$years-1900)),ylim=c(min(y2),max(y2)),main=paste0("lambda;",lambda_data$lambda[j],",S_e;",lambda_data$S_e[j]),col=2,type="p")
par(new=T)
plot(data$years-1900,a_hat+b_hat*(data$years-1900),xlab="years",ylab="values",xlim=c(min(data$years-1900),max(data$years-1900)),ylim=c(min(y2),max(y2)),main=paste0("lambda;",lambda_data$lambda[j],",S_e;",lambda_data$S_e[j]),col=1,type="l")
}
#Ridge regression
data=data.frame(num=1:20,y=c(167,167.5,168.4,172,155.3,151.4,163,174,168,160.4,164.7,171,162.6,164.8,163.3,167.6,169.2,168,167.4,172),x1=c(84,87,86,85,82,87,92,94,88,84.9,78,90,88,87,82,84,86,83,85.2,82),x2=c(61,55.5,57,57,50,50,66.5,65,60.5,49.5,49.5,61,59.5,58.4,53.5,54,60,58.8,54,56))
X=data[,colnames(data) %in% c("x1","x2")]
Y=data$y
for(j in 1:ncol(X)){
X[,j]=(X[,j]-mean(X[,j]))
}
R=cor(X)
r1y=cor(X[,1],Y)
b_R=function(k){
return(solve(R+k*diag(rep(1,ncol(X))))%*%t(X)%*%Y)
}
lambda=eigen(R)$values
#直交多項式回帰 p.128
X=seq(10,30,2)
Y=c(0.37,0.523,0.667,0.751,0.811,0.826,0.808,0.804,0.79,0.746,0.697)
P_x=function(x,d,dim){
P_mat=array(1,dim=c(length(x),dim+1))
if(dim>=1){
P_mat[,2]=(x-mean(x))/d
if(dim>=2){
P_mat[,3]=((x-mean(x))/d)^2-((length(x)^2-1)/12)
if(dim>=3){
P_mat[,4]=((x-mean(x))/d)^3-((3*length(x)^2-7)/20)*((x-mean(x))/d)
}
}
}
return(P_mat)
}
X_mat=P_x(X,2,3)
alpha=solve(t(X_mat)%*%X_mat)%*%t(X_mat)%*%Y
S_e=sum((Y-X_mat%*%alpha)^2)
S_R=sum((t(X_mat)%*%Y*alpha)[-1])
S_yy=S_e+S_R
#回帰変動の比率
S_R/S_yy