#ランダムサンプリング
#p.23
library(dplyr)
# Random sampling (p.23): how the sampling variance and covariance of the
# sample mean change with the sample size n.
N <- 100          # population size
alpha <- 0.05     # two-sided significance level for the confidence limits
Y <- rnorm(N)     # study variable
X <- rnorm(N, 2)  # auxiliary variable (mean 2)
# Population ratio of the means, needed by the ratio-estimator variance V_R
# inside the loop.  (The original defined R only after the loop, so the
# first run stopped with "object 'R' not found".)
R <- mean(Y) / mean(X)
sampling_data <- data.frame(n = 1:N, V_y = 0, cov = 0, under_Y = 0, upper_Y = 0,
                            under_Ny = 0, upper_Ny = 0, V_R = 0)
for (n in 1:N) {
  S2 <- sum((Y - mean(Y))^2) / (N - 1)  # population variance, N - 1 divisor
  sigma <- sum((Y - mean(Y))^2) / N     # population variance, N divisor
  f <- n / N                            # sampling fraction
  # Variance of the sample mean: V(ybar) = (1 - f) * S^2 / n
  V_y <- (1 - f) * S2 / n
  sigma_y <- sqrt(V_y)
  # NOTE(review): intent of this expression (sqrt applied to both the fpc
  # and n) is unclear from the source -- verify against the text.
  V_y_hat <- (((N^2) * S2)) * sqrt(1 - f) / sqrt(n)
  # Covariance of the sample means of Y and X under SRS without replacement
  E_cov <- (N - n) * sum((Y - mean(Y)) * (X - mean(X))) / ((n * N) * (N - 1))
  sampling_data$V_y[n] <- V_y
  sampling_data$cov[n] <- E_cov
  # Variance proxy built from the first n units (not a random draw);
  # undefined (0/0) at n = 1, so row 1 of the limits is NaN.
  s2 <- (sum((Y - mean(Y))^2) - (mean(Y[1:n]) - mean(Y))^2) / (n - 1)
  # Normal-approximation confidence limits for the mean ...
  sampling_data$under_Y[n] <- mean(Y[1:n]) - qnorm(1 - alpha/2) * sqrt(s2) * sqrt(1 - f) / sqrt(n)
  sampling_data$upper_Y[n] <- mean(Y[1:n]) + qnorm(1 - alpha/2) * sqrt(s2) * sqrt(1 - f) / sqrt(n)
  # ... and for the population total N * ybar
  sampling_data$under_Ny[n] <- N * mean(Y[1:n]) - N * qnorm(1 - alpha/2) * sqrt(s2) * sqrt(1 - f) / sqrt(n)
  sampling_data$upper_Ny[n] <- N * mean(Y[1:n]) + N * qnorm(1 - alpha/2) * sqrt(s2) * sqrt(1 - f) / sqrt(n)
  # Variance of the ratio estimator (infinite at n = 1)
  V_R <- (1 - f) * sum((Y - R * X)^2) / (n * (n - 1) * mean(X)^2)
  sampling_data$V_R[n] <- V_R
}
# Residuals from the ratio fit; their mean is ~0 by construction of R.
d <- Y - R * X
mean(d); var(d)
# Estimating means of subpopulations (p.37)
K=10                              # number of subpopulations
N_vec=sample(50:100,K)            # population size of each subpopulation
n_vec=c();S_vec=c()               # per-subpopulation sample sizes and sample variances
s_y=c();y_ave=c();y_ave_sub=c()   # s.e. of the mean; sample means; re-drawn subsample means
y2_sum=c();lindberg=c()           # sums of squares; Lindeberg-condition terms
tau=0.1                           # tolerance used in the Lindeberg-type boundary
for(j in 1:K){
n=sample(2:N_vec[j],1)            # sample size for subpopulation j (at least 2)
n_vec=c(n_vec,n)
y_vec=sample(1:100,n)             # draw the sample values
S_vec=c(S_vec,sum((y_vec-mean(y_vec))^2)/(n-1))   # sample variance
# Standard error of the mean with the finite population correction
s_y=c(s_y,sqrt(sum((y_vec-mean(y_vec))^2)/(n-1))*sqrt(1-n/N_vec[j])/sqrt(n))
y_ave=c(y_ave,mean(y_vec))
y_ave_sub=c(y_ave_sub,mean(sample(y_vec,n)))      # mean of a re-drawn subsample of the same size
y2_sum=c(y2_sum,sum(y_vec^2))
boundary=sqrt(n*(1-n/N_vec[j]))*sqrt(sum((y_vec-mean(y_vec))^2)/(n-1))*tau
# NOTE(review): this appends n values per iteration (one per observation),
# so `lindberg` does not end up length K -- confirm the intended use.
lindberg=c(lindberg,ifelse(abs(y_vec-mean(y_vec))>boundary,(y_vec-mean(y_vec))^2,0)/((N_vec[j]-1)*(sum((y_vec-mean(y_vec))^2)/(n-1))))
}
# Estimated overall total.  NOTE(review): y_ave_sub*n_vec is a length-K
# vector, so Y_hat is a vector; possibly sum(y_ave_sub*n_vec) was intended.
Y_hat=sum(N_vec)*(y_ave_sub*n_vec)/sum(n_vec)
S_dash=(y2_sum-((y_ave_sub*n_vec)^2)/sum(N_vec))/(sum(N_vec)-1)   # variance-like term per subpopulation
# Standard error and variance of the estimated total (with fpc)
sigma_Y_hat=sum(N_vec)*sqrt(S_dash)*sqrt(1-sum(n_vec)/sum(N_vec))/sqrt(sum(n_vec))
V_Y_hat=(S_dash*sum(N_vec)^2)*(1-sum(n_vec)/sum(N_vec))/sum(n_vec)
# Variance of the difference between the means of subpopulations i and j
V_yj_yk=array(0,dim=c(K,K))
for(j in 1:K){
for(i in 1:K){
V_yj_yk[i,j]=S_vec[i]/n_vec[i]+S_vec[j]/n_vec[j]
}
}
# Variance of each estimated subpopulation total (with fpc)
V_Nj_yj=((N_vec)^2)*S_dash*(1-n_vec/N_vec)/n_vec
# Binary data (p.46): estimating the mean of a 0/1 variable.
h <- 1                                     # class value assigned to "success"
vals <- rpois(2, 5); prob <- vals / sum(vals)   # random two-class probabilities
n <- 100
samples <- sample(0:1, n, prob = prob, replace = TRUE)
a <- sum(samples)                          # number of successes
y_ave <- a * h / n                         # sample mean
# Sample standard deviation: s^2 = (sum(y^2) - (sum y)^2 / n) / (n - 1);
# with 0/h data, sum(y^2) = a*h^2 and sum(y) = a*h.  (The original omitted
# the 1/(n - 1) factor, so `s` was a root sum of squares, not an s.d.)
s <- sqrt((a * h^2 - (a * h)^2 / n) / (n - 1))
s_y_ave <- sqrt(s^2 / n)                   # standard error of the sample mean
alpha <- 0.05
# Normal-approximation confidence limits for the mean.  (The original used
# `s_y`, a stale length-10 vector left over from the previous section.)
under <- y_ave - qnorm(1 - alpha / 2) * s_y_ave
upper <- y_ave + qnorm(1 - alpha / 2) * s_y_ave
# Table 2.3 (p.48): moments and cumulants of a grouped frequency
# distribution, and the variance of the sample variance.
library(dplyr)
y <- c(-0.9, 0, 1:13)                 # class values (first class centred at -0.9)
f <- c(47, 143, 154, 82, 62, 33, 13, 6, 4, 6, 2, 0, 2, 0, 2)  # frequencies
# Raw moments of the distribution
E_Y <- sum(f * y) / sum(f)
E_Y2 <- sum(f * y^2) / sum(f)
E_Y3 <- sum(f * y^3) / sum(f)
E_Y4 <- sum(f * y^4) / sum(f)
# Third cumulant (= third central moment)
k3 <- E_Y3 - 3 * E_Y2 * E_Y + 2 * E_Y^3
sigma2 <- E_Y2 - E_Y^2                # variance
G1 <- k3 / (sqrt(sigma2)^3)           # skewness coefficient
# Rule of thumb: the normal approximation is adequate when n > 25 * G1^2.
# NOTE(review): n here is carried over from the previous section (n = 100).
n > 25 * G1^2
# Fourth cumulant k4 = mu4 - 3 sigma^4, where the fourth central moment is
# mu4 = E[Y^4] - 4 E[Y^3] E[Y] + 6 E[Y^2] E[Y]^2 - 3 E[Y]^4.
# (The original expression mixed terms of different order and ended in a
# bare "- 3", which is dimensionally inconsistent.)
k4 <- E_Y4 - 4 * E_Y3 * E_Y + 6 * E_Y2 * E_Y^2 - 3 * E_Y^4 - 3 * sigma2^2
# V(s^2) = sigma^4 * (2/(n - 1) + G2/n) with G2 = k4 / sigma^4
V_s2 <- 2 * (sqrt(sigma2)^4) / (n - 1) + k4 / n
plot(y, f)
# p.56: qualitative attributes -- sampling to estimate proportions and percentages
N=100
Y=sample(0:1,N,replace = T)          # 0/1 attribute over the population
P=sum(Y)/N;Q=1-P                     # population proportion and its complement
n=50
y=sample(Y,n)                        # simple random sample
p=sum(y)/n                           # sample proportion
A=N*P;a=n*p                          # counts with the attribute (population / sample)
S2=sum((Y-P)^2)/(N-1)                # population variance of the 0/1 variable
s2=sum((y-p)^2)/(n-1)                # sample variance
V_p=S2*(1-n/N)/n                     # variance of the sample proportion (with fpc)
V_A=(P*Q*N^2)*((N-n)/(N-1))/n        # variance of the estimated count N*p
coef_var=(sqrt(Q)/sqrt(n*P))         # coefficient of variation of p
finite_population_correction=sqrt((N-n)/(N-1))
# p.63: confidence limits, with the continuity correction 1/(2n).
# NOTE(review): these use the population P and Q; the usual estimated limits
# use the sample p and q -- verify against the text.
a_u=P+qnorm(1-0.01/2)*sqrt(1-n/N)*sqrt(P*Q/(n-1))+1/(2*n)
a_l=P-qnorm(1-0.01/2)*sqrt(1-n/N)*sqrt(P*Q/(n-1))-1/(2*n)
# p.69: estimating proportions within subclasses
n=10                                 # number of subclasses
a=sample(1:50,n);A=sample(1:100,n)   # per-subclass counts with / without the attribute
n1=a+A                               # subclass sample sizes
N1=sample(100:1000,n)                # subclass population sizes
p1=a/(a+A);q1=1-p1                   # subclass sample proportions
A1=N1*p1                             # estimated subclass totals
# Standard error of each estimated total (with fpc).
# NOTE(review): n1 can exceed N1 for these random draws, making 1-n1/N1
# negative (NaN) -- confirm the intended ranges.
s_A1=N1*sqrt(1-n1/N1)*sqrt(p1*q1/(n1-1))
# Cluster sampling: estimating proportions (p.71)
# Example 1: 41.8% negative / 52.8% positive quoted as percentages, n = 366.
p <- 41.8
q <- 58.2
n <- 366
# Standard error of the percentage, binomial form with an n - 1 divisor.
s.e.p <- sqrt(p * q / (n - 1))
# Frequency table: number of clusters observed for each count 0..6.
f <- c(17, 11, 4, 4, 7, 14, 4)
data <- data.frame(num = 0:6, f = f, fy = f * (0:6))
EX2 <- sum(data$f * data$num^2)   # sum of f * y^2
EX <- sum(data$fy)                # sum of f * y
# Standard error of the mean count per cluster.
s.e.y <- sqrt((EX2 - EX^2 / sum(f)) / (sum(f) * (sum(f) - 1)))
# Example 2 (p.71): 30 clusters (households); m = cluster size,
# a_man/a_women = adults by sex, a_y/a_n = members answering yes/no.
data <- data.frame(
  no = 1:30,
  m = c(5, 6, 3, 3, 2, 3, 3, 3, 4, 4, 3, 2, 7, 4, 3, 5, 4, 4, 3, 3, 4, 3, 3, 1, 2, 4, 3, 4, 2, 4),
  a_man = c(1, 3, 1, 1, 1, 1, 1, 1, 2, 3, 2, 1, 3, 3, 2, 3, 3, 3, 2, 1, 1, 2, 2, 0, 1, 3, 1, 2, 1, 2),
  a_women = c(4, 3, 2, 2, 1, 2, 2, 2, 2, 1, 1, 1, 4, 1, 1, 2, 1, 1, 1, 2, 3, 1, 1, 1, 1, 1, 2, 2, 1, 2),
  a_y = c(5, 0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 1, 2, 0, 0, 1, 3, 2, 0, 0, 0, 2, 2, 0, 2, 0, 1),
  a_n = c(0, 6, 1, 0, 2, 3, 3, 3, 4, 4, 3, 2, 7, 0, 2, 3, 4, 4, 2, 0, 2, 3, 3, 1, 0, 2, 3, 2, 2, 3)
)
n <- sum(data$m)                 # total number of persons across all clusters
# Overall proportion answering "yes".  (The original used nrow(data)/n,
# which happens to equal sum(a_y)/n for this data set -- both are 30 -- but
# is not the proportion in general.)
p <- sum(data$a_y) / n
q <- 1 - p
v_bin <- p * q / n               # binomial variance, ignoring the clustering
# Column sums of squares (the leading cluster-id column is dropped)
summary2 <- apply(data^2, 2, sum)[2:ncol(data)]
# Ratio-estimator (cluster) variance of p
v_p <- (sum(data$a_y^2) - 2 * p * sum(data$a_y * data$m) + (p^2) * sum(data$m^2)) /
  (nrow(data) * (nrow(data) - 1) * mean(data$m)^2)
# Binomial variance for comparison
p * (1 - p) / n
# Clustering inflates the variance relative to the binomial assumption
v_p > p * (1 - p) / n
# p.84: sample size needed to estimate a proportion
prob=rpois(2,5);prob=prob/sum(prob)   # random two-class probabilities
p=prob[1]                             # target proportion
N=300;n=sample(100:200,1)             # population size and a provisional sample size
sigma_p=sqrt((N-n)/(N-1))*sqrt((p*(1-p))/n)   # s.e. of p with the fpc
t=qnorm(1-0.01/2)                     # normal deviate for 99% confidence
d=t*sigma_p;                          # margin of error achieved by this n
# First approximation n0 = t^2 p q / d^2, then the finite-population
# correction: n = n0 / (1 + (n0 - 1)/N)
((p*(1-p)*t^2/(d^2))/(1+((p*(1-p)*t^2/(d^2))-1)/N))
# Formula for the required n with continuous data (p.85)
N=1000;n=100
Y=sample(1:50,N,replace = T)          # population values
y=sample(Y,n)                         # simple random sample
S2=sum((Y-mean(Y))^2)/(N-1)           # population variance
sigma_y=sqrt((N-n)/N)*sqrt(S2)/sqrt(n)   # s.e. of the sample mean (with fpc)
t=qnorm(1-0.01/2)                     # normal deviate for 99% confidence
d=t*sqrt((N-n)/N)*sqrt(S2)/sqrt(n)    # margin of error achieved by this n
# Required sample size
# n0 = (t S / d)^2 corrected for the finite population: n0 / (1 + n0 / N)
((t*sqrt(S2))/d)^2/(1+((t*sqrt(S2))/d)^2/N)
# Stratified random sampling (p.99)
h=10                                  # number of strata
N=sample(100:1000,h)                  # stratum population sizes
n=c()                                 # stratum sample sizes
for(j in 1:h){
n=c(n,sample(1:N[j],1))
}
y_ave1=c();y_ave2=c();S_h=c();s_h=c() # population means; sample means; variances; sample SS
for(j in 1:h){
samples=rnorm(N[j],mean=sample(1:9,1))   # stratum population (random mean)
y_ave1=c(y_ave1,mean(samples))           # stratum population mean
samp=sample(samples,n[j])                # stratum sample
y_ave2=c(y_ave2,mean(samp))              # stratum sample mean
s_h=c(s_h,sum((samp-mean(samp))^2))      # within-stratum sum of squares (NOT divided by n_h - 1)
S_h=c(S_h,sum((samples-mean(samples))^2)/(N[j]-1))   # stratum population variance
}
y_st=sum(N*y_ave2)/sum(N)             # stratified estimate of the population mean
y_bar=sum(n*y_ave2)/sum(n)            # sample-size-weighted mean, for comparison
n/N                                   # per-stratum sampling fractions
E_y_st=sum(N*y_ave1)/sum(N)           # expectation of y_st (the true mean)
Y_bar=sum(N*y_ave1)/sum(N)            # population mean
V_st=sum(N*(N/n-1)*S_h)/(sum(N)^2)    # variance of y_st
# Proportional allocation (p.103)
n_hat=sum(n)*N/sum(N)                 # proportionally allocated stratum sample sizes
# Variance of the stratified mean under proportional allocation
sum((N/sum(N))*(S_h/sum(n))*(sum(N)-sum(n))/sum(N))
# Target function for Newton-Raphson: equal sample size x across all strata.
# Evaluates sum_h N_h (N_h - x) S_h / x, scaled by sum(N^2); the root is the
# common stratum sample size sought.  Reads the stratum sizes N and the
# stratum variances S_h from the enclosing script.
fun <- function(x) {
  weighted <- sum(N * (N - x) * S_h / x)
  weighted / (sum(N^2))
}
# Newton-Raphson iteration on fun(), approximating the derivative with a
# forward difference of step 0.01 and recording every step for inspection.
times <- 100
newton_raphson <- data.frame(times = 1:times, value = 0, f = 0, df = 0)
value <- 100  # starting point
for (step in seq_len(nrow(newton_raphson))) {
  f <- fun(value)
  df <- (fun(value + 0.01) - fun(value)) / (0.01)
  newton_raphson$f[step] <- f
  newton_raphson$df[step] <- df
  # Standard Newton update: x <- x - f(x)/f'(x)
  value <- value - f / df
  newton_raphson$value[step] <- value
}
# The last iterate is taken as the root (no convergence check is made).
solution <- newton_raphson$value[nrow(newton_raphson)]
# Confidence intervals and confidence limits
W=N/sum(N)                            # stratum weights
# Estimated variance of y_st.  NOTE(review): uses the population variances
# S_h where estimated (sample) variances would normally appear -- verify.
s2_y_st=sum((S_h*W^2)/n)-sum(W*S_h)/sum(N)
# When all strata can be assumed to share a common variance:
# pooled within-stratum variance (s_h are sums of squares, so dividing the
# total SS by the pooled degrees of freedom is correct here)
s2_w=sum(s_h)/(sum(n)-h)
abs(s2_w-mean(S_h))                   # distance of the pooled value from the average S_h
v_y_st=s2_w*(sum(N)-sum(n))/(sum(n)*sum(N))
# Confidence interval for the population mean
upper=y_st+qnorm(1-0.01/2)*sqrt(s2_y_st)
under=y_st-qnorm(1-0.01/2)*sqrt(s2_y_st)
# Confidence interval for the population total.
# NOTE(review): N is the vector of stratum sizes, so these are vectors;
# sum(N)*upper / sum(N)*under may have been intended.
N_upper=N*upper
N_under=N*under
# Effective degrees of freedom (Satterthwaite-type formula).
g_h=N*(N-n)/n
# NOTE(review): s_h are sums of squares rather than variances here --
# confirm this matches the textbook's formula.
n_e=sum(g_h*s_h)^2/sum((g_h^2)*(s_h^2)/(n-1))
# Optimum allocation (p.108)
h=10                                  # number of strata
N=sample(100:1000,h);C=sample(1:30,h+1)   # stratum sizes; costs (C[1] is the overhead)
n=c()                                 # initial stratum sample sizes
for(j in 1:h){
n=c(n,sample(1:N[j],1))
}
y_ave1=c();y_ave2=c();S_h=c();s_h=c() # population/sample means; variances; sample SS
for(j in 1:h){
samples=rnorm(N[j],mean=sample(1:9,1))   # stratum population
y_ave1=c(y_ave1,mean(samples))
samp=sample(samples,n[j])                # stratum sample
y_ave2=c(y_ave2,mean(samp))
s_h=c(s_h,sum((samp-mean(samp))^2))      # within-stratum sum of squares
S_h=c(S_h,sum((samples-mean(samples))^2)/(N[j]-1))   # stratum population variance
}
W=N/sum(N)                            # stratum weights
V_st=sum((1-n/N)*(S_h*W^2)/n)         # variance of the stratified mean, initial allocation
# Sample sizes that minimize V_st for a given cost function
# (V_st recomputed below is the minimized variance)
C_s=sum(C*c(1,n))                     # total cost: overhead plus per-unit stratum costs
n_all=(C_s-C[1])*sum(N*sqrt(S_h)/sqrt(C[2:length(C)]))/sum(N*sqrt(S_h)*sqrt(C[2:length(C)]))
# Optimum allocation: n_h proportional to W_h S_h / sqrt(c_h)
n_h=n_all*W*sqrt(S_h)/(sqrt(C[2:length(C)])*sum(W*sqrt(S_h)/sqrt(C[2:length(C)])))
lambda=(sum(W*sqrt(S_h)/sqrt(C[2:length(C)]))/n_all)^2   # Lagrange multiplier
V_st=sum((1-n_h/N)*(S_h*W^2)/n_h)     # variance at the optimum allocation
a_h=W*sqrt(S_h)/sqrt(n_h)
b_h=sqrt(C[2:length(C)]*n_h)
# Cauchy-Schwarz inequality check (truncated to 5 decimals to dodge rounding)
floor(sum(a_h^2)*sum(b_h^2)*10^5)/(10^5)>=floor((sum(a_h*b_h)^2)*(10^5))/(10^5)
b_h/a_h                               # equality holds when this ratio is constant
S2=sum(S_h*(N-1))/(sum(N)-1)          # pooled population variance
# Neyman allocation (p.112)
h=10                                  # number of strata
N=sample(100:1000,h);                 # stratum population sizes
n=c()                                 # initial stratum sample sizes
for(j in 1:h){
n=c(n,sample(1:N[j],1))
}
y_ave1=c();y_ave2=c();S_h=c();s_h=c() # population/sample means; variances; sample SS
for(j in 1:h){
samples=rnorm(N[j],mean=sample(1:9,1))   # stratum population
y_ave1=c(y_ave1,mean(samples))
samp=sample(samples,n[j])                # stratum sample
y_ave2=c(y_ave2,mean(samp))
s_h=c(s_h,sum((samp-mean(samp))^2))      # within-stratum sum of squares
S_h=c(S_h,sum((samples-mean(samples))^2)/(N[j]-1))   # stratum population variance
}
W=N/sum(N)                            # stratum weights
V_st=sum((1-n/N)*(S_h*W^2)/n)         # variance under the initial allocation
n_all=sum(n)                          # fixed total sample size
n_h=n_all*W*sqrt(S_h)/sum(W*sqrt(S_h))   # Neyman allocation: n_h proportional to N_h S_h
V_st_h=sum((1-n_h/N)*(S_h*W^2)/n_h)   # minimized variance
S2=sum(S_h*(N-1))/(sum(N)-1)          # pooled population variance
Y_bar= sum(y_ave1*N)/sum(N)           # population mean
S_bar=sum(N*sqrt(S_h))/sum(N)         # weighted mean of the stratum standard deviations
# Variance decomposition: simple random >= proportional >= optimum
V_ran=sum(N*S_h)/(n_all*sum(N))+sum(N*(y_ave1-Y_bar)^2)/(n_all*sum(N))
V_prop=V_ran-sum(N*(y_ave1-Y_bar)^2)/(n_all*sum(N))
V_opt=V_prop-sum(N*(sqrt(S_h)-S_bar)^2)/(n_all*sum(N))
V_opt<V_prop
V_prop<V_ran
# 5.9: estimating the required sample size for continuous data
w_h=n_h/sum(n_h)                      # allocation fractions actually used
V=sum(s_h*W^2/w_h)/sum(n_h)-sum(W*s_h)/sum(N)   # anticipated variance
sum(s_h*W^2)/(V+sum(W*s_h)/sum(N))    # required total sample size
# 5.10: stratified sampling for estimating proportions (p.121)
h=10                                  # number of strata
n=c();N=c();A=c();a=c()               # sample/population sizes and attribute counts
for(j in 1:h){
samples=sample(0:1,sample(50:100,1),replace = T)   # stratum population of 0/1 values
N=c(N,length(samples))
sub_samples=sample(samples,sample(1:length(samples),1))   # stratum sample
n=c(n,length(sub_samples))
A=c(A,sum(samples))                   # stratum count with the attribute
a=c(a,sum(sub_samples))               # sample count with the attribute
}
P_h=A/N;p_h=a/n                       # stratum population / sample proportions
p_st=sum(N*p_h)/sum(N)                # stratified estimate of the overall proportion
V_p_st=sum((N-n)*(N^2)*P_h*(1-P_h)/((N-1)*n))/(sum(N)^2)   # its variance (with fpc)
S_h=N*P_h*(1-P_h)/(N-1)               # stratum variances for a 0/1 variable
# Sample sizes minimizing the variance (Neyman allocation for proportions)
n_h=sum(n)*(N*sqrt(P_h*(1-P_h)))/sum(N*sqrt(P_h*(1-P_h)))
V_p_st_h=sum((N-n_h)*(N^2)*P_h*(1-P_h)/((N-1)*n_h))/(sum(N)^2)
sum(n_h)==sum(n)                      # the allocation preserves the total sample size
# When a cost function is given (total n not fixed in advance)
C=sample(1:50,h+1);C_s=sum(C*c(1,n))  # per-stratum costs (C[1] is the overhead) and budget
n_all=((C_s-C[1])*sum(N*sqrt(S_h)/sqrt(C[2:length(C)]))/sum(N*sqrt(S_h)*sqrt(C[2:length(C)])))
n_h=n_all*(N*sqrt(P_h*(1-P_h)/C[2:length(C)]))/sum(N*sqrt(P_h*(1-P_h)/C[2:length(C)]))
floor(sum(C*c(1,n_h)))==C_s           # check the allocation spends (about) the budget
# Stratified sampling, further details (p.131)
h=10                                  # number of strata
N=sample(100:1000,h);                 # stratum population sizes
n=c()                                 # actual stratum sample sizes
for(j in 1:h){
n=c(n,sample(1:N[j],1))
}
y_ave1=c();y_ave2=c();S_h=c();s_h=c() # population/sample means; variances; sample SS
for(j in 1:h){
samples=rnorm(N[j],mean=sample(1:9,1))   # stratum population
y_ave1=c(y_ave1,mean(samples))
samp=sample(samples,n[j])                # stratum sample
y_ave2=c(y_ave2,mean(samp))
s_h=c(s_h,sum((samp-mean(samp))^2))      # within-stratum sum of squares
S_h=c(S_h,sum((samples-mean(samples))^2)/(N[j]-1))   # stratum population variance
}
W=N/sum(N)                            # stratum weights
n_dash_h=sum(n)*W*sqrt(S_h)/sum(W*sqrt(S_h))   # optimum (Neyman) allocation n'_h
V_min_y_st=sum(W*sqrt(S_h))^2/sum(n)-sum(W*S_h)/sum(N)   # minimum attainable variance
# (5A.4): increase in variance caused by deviating from the optimum allocation
V_df=sum((n-n_dash_h)^2/(n))*sum(W*sqrt(S_h))^2/(sum(n)^2)
# (5A.5): relative increase in the standard error
sqrt(sum((n-n_dash_h)^2/(n))/sum(n))
# Effect of errors in the assumed stratum sizes
h=10                                  # number of strata
N=sample(100:1000,h);                 # true stratum population sizes
n=c()                                 # stratum sample sizes
for(j in 1:h){
n=c(n,sample(1:N[j],1))
}
y_ave1=c();y_ave2=c();S_h=c();s_h=c() # population/sample means; variances; sample SS
for(j in 1:h){
samples=rnorm(N[j],mean=sample(1:9,1))   # stratum population
y_ave1=c(y_ave1,mean(samples))
samp=sample(samples,n[j])                # stratum sample
y_ave2=c(y_ave2,mean(samp))
s_h=c(s_h,sum((samp-mean(samp))^2))      # within-stratum sum of squares
S_h=c(S_h,sum((samples-mean(samples))^2)/(N[j]-1))   # stratum population variance
}
W=N/sum(N)                            # true stratum weights
w=n/sum(n)                            # weights implied by the sample allocation
# Bias of the weighted mean when the wrong weights w are used
sum((w-W)*y_ave1)
# Mean square error (5A.6): variance term plus squared bias.
# NOTE(review): the first term lacks the w^2/n_h factors usually present in
# (5A.6) -- verify against the text.
MSE=sum(w*S_h*(1-n/N))+sum((w-W)*y_ave1)^2
#p.138
library(dplyr);library(stringr)
# p.138: allocation when several variables must each meet a target variance.
# Two strata; S1h..S3h are the stratum standard deviations of three variables.
data <- data.frame(leaf = c(1, 2), W = c(0.8, 0.2),
                   S1h = c(4, 4), S2h = c(2, 6), S3h = c(1, 8))
# Columns whose name contains "S" hold the per-variable stratum s.d. values
# (base grepl gives the same selection as stringr::str_count(...) > 0).
S_mat <- as.matrix(data[, grepl("S", colnames(data))])
W <- data$W
# Target standard error 0.1 for y1 = y2 = y3, i.e. target variance 0.01.
V_j <- 0.01
WS <- apply(S_mat * W, 2, sum)          # sum_h W_h S_jh for each variable j
# Total sample size driven by the hardest-to-estimate variable
n <- max(WS)^2 / V_j
# NOTE(review): this sums the whole weighted matrix rather than one column
# per variable; kept exactly as in the original.
V_yj_st <- sum(S_mat * W)^2 / n
# Optimum allocation computed separately for each variable: n_h ~ W_h S_jh
n_mat <- vapply(
  seq_len(ncol(S_mat)),
  function(k) n * W * S_mat[, k] / sum(W * S_mat[, k]),
  numeric(nrow(S_mat))
)
# V_y_st_mat[a, b]: variance achieved for variable b when using the
# allocation optimized for variable a.
V_y_st_mat <- array(0, dim = c(ncol(S_mat), ncol(n_mat)))
for (a in seq_len(ncol(V_y_st_mat))) {
  for (b in seq_len(ncol(V_y_st_mat))) {
    V_y_st_mat[a, b] <- sum((W^2) * c(S_mat[, b]^2) / c(n_mat[, a]))
  }
}
# Target standard errors y1 = y2 = 0.1, y3 = 0.08 (p.140): target variances
V_j <- c(0.01, 0.01, 0.0064)
# Example 2: four strata and two variables.
# Target variances for the two variables (overwrites V_j above).
V_j <- c(0.04, 0.01)
data <- data.frame(leaf = 1:4, W = c(0.4, 0.3, 0.2, 0.1),
                   S1h = c(5, 5, 5, 5), S2h = c(1, 2, 4, 8))
# Select the stratum-s.d. columns by name (base grepl; no stringr needed).
S_mat <- as.matrix(data[, grepl("S", colnames(data))])
W <- data$W
WS <- apply(S_mat * W, 2, sum)   # sum_h W_h S_jh per variable
# Required sample size per variable: n_j = (sum_h W_h S_jh)^2 / V_j
n <- WS^2 / V_j
#p.144
m_mat=t(matrix(c(15,21,17,9,10,8,13,7,6,9,5,8,4,3,6,6,3,2,5,8),ncol=5))
p_mat=m_mat/sum(m_mat)
ni.=apply(m_mat,1,sum);n.j=apply(m_mat,2,sum)
kai2=(t(t(ni.))%*%t(n.j)/sum(m_mat)-m_mat)^2
p=t(t(ni.))%*%t(n.j)/sum(m_mat)
kai2=kai2*c(1/apply(m_mat,2,mean))
m=nrow(kai2)-1;n=ncol(kai2)-1
kai2=sum(kai2)
qchisq(1-0.01,m*n)
n=sum(m_mat)
y_bar_U=array(0,dim=c(nrow(m_mat),ncol(m_mat)))
for(i in 1:nrow(y_bar_U)){
for(j in 1:ncol(y_bar_U)){
y_bar_U[i,j]=n*p_mat[i,j]*mean(m_mat)/(ni.[i]*n.j[j])
}
}
y_bar_U=sum(y_bar_U)
# p.175: ratio estimation of a population total (Cochran's cities data).
# N: number of cities in the population, n: number of sampled cities,
# X: known population total of the auxiliary variable x.
N <- 196; n <- 49; X <- 22919
data <- data.frame(
  y = c(80, 143, 67, 50, 464, 48, 63, 115, 69, 459, 104, 183, 106, 86, 57, 65, 50, 634, 260, 113, 64, 58, 89, 63, 77, 142, 60, 64, 52, 139, 130, 53, 291, 105, 111, 79, 288, 61, 57, 85, 50, 317, 46, 232, 93, 53, 54, 58, 75),
  x = c(76, 138, 67, 29, 381, 23, 37, 120, 61, 387, 93, 172, 78, 66, 60, 46, 2, 507, 179, 121, 50, 44, 77, 64, 64, 56, 40, 40, 38, 136, 116, 46, 243, 87, 30, 71, 256, 43, 25, 94, 43, 298, 36, 161, 74, 45, 36, 50, 48)
)
y <- data$y
x <- data$x
# Ratio estimate of the total: Y_R = (sum y / sum x) * X
R_hat <- sum(y) / sum(x)
Y_R <- R_hat * X
# Estimate of Y based on the per-city sample mean alone, for comparison
Y_hat <- N * mean(y)
# p.179: ratio estimation from a sample of n = 20 of N = 49 cities.
data <- data.frame(num = 1:49,
                   y = c(80, 143, 67, 50, 464, 48, 63, 115, 69, 459, 104, 183, 106, 86, 57, 65, 50, 634, 260, 113, 64, 58, 89, 63, 77, 142, 60, 64, 52, 139, 130, 53, 291, 105, 111, 79, 288, 61, 57, 85, 50, 317, 46, 232, 93, 53, 54, 58, 75),
                   x = c(76, 138, 67, 29, 381, 23, 37, 120, 61, 387, 93, 172, 78, 66, 60, 46, 2, 507, 179, 121, 50, 44, 77, 64, 64, 56, 40, 40, 38, 136, 116, 46, 243, 87, 30, 71, 256, 43, 25, 94, 43, 298, 36, 161, 74, 45, 36, 50, 48))
N <- 49; n <- 20               # population (cities) and sample sizes
X <- data$x; Y <- data$y       # population values of the auxiliary / study variables
# Draw ONE sample of cities and keep their paired (y, x) values.  (The
# original sampled x and y independently, destroying the y-x pairing that
# ratio estimation relies on.)
idx <- sample(N, n)
x <- X[idx]; y <- Y[idx]
f <- n / N                     # sampling fraction
# Ratio estimate of the population total of y: (ybar/xbar) * total of x.
# (The original multiplied by the vector X, yielding 49 values.)
Y_hat_R <- sum(X) * mean(y) / mean(x)
R <- mean(y) / mean(x)         # sample ratio
# Estimated variance of Y_hat_R:
# v(Y_hat_R) = (1 - f) N^2 sum((y - R x)^2) / (n (n - 1)).
# (The original divided by n (N - 1) and used the ratio carried over from
# the previous section instead of the in-sample R.)
V_Y_hat_R <- ((1 - f) * (N^2) * sum((y - R * x)^2)) / (n * (n - 1))
# Confidence interval for the total
under <- Y_hat_R - qnorm(1 - 0.01/2) * sqrt(V_Y_hat_R)
upper <- Y_hat_R + qnorm(1 - 0.01/2) * sqrt(V_Y_hat_R)
# Ratio estimate of the population mean, and the variances of the mean and
# of the ratio itself.  (The original referenced the undefined `V_Y_R`.)
Y_R_bar <- mean(X) * mean(y) / mean(x)
V_Y_R_bar <- V_Y_hat_R / (N^2)
V_R_hat <- V_Y_R_bar / (mean(X)^2)
# Confidence interval for the ratio
under <- R - qnorm(1 - 0.01/2) * sqrt(V_R_hat)
upper <- R + qnorm(1 - 0.01/2) * sqrt(V_R_hat)
# Relative variance of R_hat in terms of coefficients of variation
C_xy <- cov(x, y) / (mean(x) * mean(y))
C_y <- var(y) / (mean(y)^2); C_x <- var(x) / (mean(x)^2)
cv2 <- (1 - f) * (C_y + C_x - 2 * C_xy) / n
# Relative deviation of the sample mean of x from the population mean
s <- (mean(x) - mean(X)) / mean(X)
# Partial geometric series in s, keeping terms above 1e-10.
# NOTE(review): the expansion of 1/(1 + s) has alternating signs (-s)^j and
# a leading 1; kept exactly as in the original -- verify against the text.
taylor <- c()
if (abs(s) < 1) {
  for (j in 1:100) {
    value <- s^j
    if (abs(value) > 10^(-10)) {
      taylor <- c(taylor, value)
    }
  }
}
# Leading bias term of R_hat, using the population ratio sum(Y)/sum(X).
# (The original relied on R_hat left over from the p.175 section; the value
# is identical since the data are the same.)
R_hat_R <- (mean(y) - (sum(Y) / sum(X)) * mean(x)) * sum(taylor) / mean(X)
# p.182: leading term of the relative bias of the ratio estimator,
# E(R_hat - R)/R ~ (1 - f)(C_x - C_xy)/n, using the coefficients of
# variation and sampling fraction computed in the p.179 section above.
library(dplyr)
E_R_hat_R=(1-f)*(C_x-C_xy)/n