# StanとRでベイズ統計モデリング(アヒル本)をPythonにしてみる - 8.1 階層モデルの導入

### インポート

```import numpy as np
import pandas as pd
import pystan
import matplotlib.pyplot as plt
from matplotlib.figure import figaspect
from matplotlib.gridspec import GridSpec
from matplotlib.markers import MarkerStyle
import seaborn as sns
%matplotlib inline
```

### データ読み込み

```salary2 = pd.read_csv('./data/data-salary-2.txt')
```

# 8.1 階層モデルの導入

## 8.1.1 解析の目的とデータの分布の確認

```fig = plt.figure(figsize=figaspect(2/4))

gs1 = GridSpec(1, 1, figure=fig)
ax1 = plt.subplot(gs1[0, 0])
for kid in salary2['KID'].unique():
ax1.scatter('X', 'Y', data=salary2.query('KID==@kid'), label=kid, marker=MarkerStyle.filled_markers[kid])
regkws = dict(x='X', y='Y', data=salary2, scatter=False, ci=None, color='gray')
sns.regplot(ax=ax1, **regkws)
ax1.legend(title='KID')

xlim = ax1.get_xlim()
ylim = ax1.get_ylim()
gs2 = GridSpec(2, 2, figure=fig)
for row, col in np.ndindex(2, 2):
kid = row + 1 + col * 2
sns.regplot('X', 'Y', data=salary2.query('KID==@kid'), ci=None, label=kid, ax=ax, color=sns.color_palette()[kid-1], marker=MarkerStyle.filled_markers[kid], line_kws={'linestyle': 'dashed'})
plt.setp(ax, xlim=xlim, ylim=ylim)
sns.regplot(ax=ax, **regkws)
ax.legend(title='KID')
plt.setp(ax, xlabel='X' if row == 1 else '', ylabel='Y' if col == 0 else '')
if row == 0:
plt.setp(ax.get_xticklabels(), visible=False)
if col == 1:
plt.setp(ax.get_yticklabels(), visible=False)
gs1.tight_layout(fig, rect=[None, None, 0.5, None])
gs2.tight_layout(fig, rect=[0.5, None, None, None])
plt.show()
```

## 8.1.2 グループ差を考えない場合

```data = dict(
N=salary2.index.size,
X=salary2['X'],
Y=salary2['Y']
)
fit1 = pystan.stan('./stan/model8-1.stan', data=data, seed=1234)
```

## 8.1.3 グループごとに切片と傾きを持つ場合

```data = {col: salary2[col] for col in salary2.columns}
data['N'] = salary2.index.size
data['K'] = salary2['KID'].max()
fit2 = pystan.stan('./stan/model8-2.stan', data=data, seed=1234)
```

## 8.1.4 階層モデル

```%load_ext rpy2.ipython
```
```%%R
set.seed(123)
N <- 40
K <- 4
N_k <- c(15, 12, 10, 3)
a0 <- 350
b0 <- 12
s_a <- 60
s_b <- 4
s_Y <- 25
X <- sample(x=0:35, size=N, replace=TRUE)
KID <- rep(1:4, times=N_k)

a <- rnorm(K, mean=0, sd=s_a) + a0
b <- rnorm(K, mean=0, sd=s_b) + b0
d <- data.frame(X=X, KID=KID, a=a[KID], b=b[KID])
d <- transform(d, Y_sim=rnorm(N, mean=a + b*X, sd=s_Y))
```
```import rpy2.robjects as ro
d = ro.pandas2ri.ri2py_dataframe(ro.r['d'])
```

データ作成ここまで

```_, axes = plt.subplots(2, 2, figsize=figaspect(2/2), sharex=True, sharey=True)
rows, cols = axes.shape
for (row, col), ax in np.ndenumerate(axes):
kid = row * cols + col + 1
sns.regplot('X', 'Y_sim', data=d.query('KID==@kid'), ci=None, label=kid, marker=MarkerStyle.filled_markers[kid], line_kws={'color': 'k', 'alpha': 0.8, 'linestyle': 'dashed'}, ax=ax)
ax.legend(title='KID')
plt.setp(ax, xlabel='X' if row == rows - 1 else '', ylabel='Y' if col == 0 else '')
plt.show()
```

```data = dict(
N=salary2.index.size,
K=salary2['KID'].unique().size,
X=salary2['X'],
Y=salary2['Y'],
KID=salary2['KID']
)
fit3 = pystan.stan('./stan/model8-3.stan', data=data, seed=1234)
```

## 8.1.5 モデルの比較

グラフ外にまとめればいいのでしょうが、右側に凡例をつけるとごちゃごちゃするので省略しました。

```ms1 = fit1.extract()
ms2 = fit2.extract()
ms3 = fit3.extract()

K = 4
probs = (2.5, 25, 50, 75, 97.5)
d_qua1 = pd.DataFrame(np.percentile(ms2['a'], probs, axis=0).T, columns=['p{}'.format(p) for p in probs])
d_qua1['KID'] = np.arange(K) + 1 - 0.1
d_qua1['Model'] = '8-2'

d_qua2 = pd.DataFrame(np.percentile(ms3['a'], probs, axis=0).T, columns=['p{}'.format(p) for p in probs])
d_qua2['KID'] = np.arange(K) + 1 + 0.1
d_qua2['Model'] = '8-3'
d_qua = pd.concat([d_qua1, d_qua2], axis=0, ignore_index=True)

fig = plt.figure(figsize=figaspect(2/4))

gs1 = GridSpec(1, 1, figure=fig)
ax1 = plt.subplot(gs1[0, 0])
for model, linestyle, color in zip(d_qua['Model'].unique(), ['dashed', 'solid'], ['w', 'k']):
d_sub = d_qua.query('Model==@model')
ax1.vlines('KID', 'p2.5', 'p97.5', data=d_sub, linestyles=linestyle, label=model)
ax1.scatter('KID', 'p50', data=d_sub, c=color, edgecolors='k', label=None)
xmin, xmax = ax1.get_xlim()
ax1.hlines(np.median(ms1['a']), xmin, xmax, alpha=0.3)
ax1.legend(title='Model', loc='upper left')
plt.setp(ax1, xlabel='KID', ylabel='a', xlim=(xmin, xmax))

K = 4
N_mcmc = ms1['lp__'].size
dx = salary2.groupby('KID').agg({'X': ['min', 'max']})
dx.columns = ['Xmin', 'Xmax']

X_new = np.arange(dx.values.min(), dx.values.max()+1)
N_X = X_new.size
y_base_mcmc1 = ms1['a'].reshape((-1, 1)) + np.outer(ms1['b'], X_new)
y_base_med1 = np.median(y_base_mcmc1, axis=0)
d1 = pd.DataFrame(dict(
X=np.tile(X_new, 4),
Y=np.tile(y_base_med1, 4),
KID=np.repeat(np.arange(4)+1, N_X),
Model='8-1'
))

d2 = pd.DataFrame({col: [] for col in d1.columns})
d3 = pd.DataFrame({col: [] for col in d1.columns})
for i in range(K):
kid = i + 1
X_new = np.arange(dx.loc[kid, 'Xmin'], dx.loc[kid, 'Xmax']+1)
N_X = X_new.size
y_base_mcmc2 = ms2['a'][:, i].reshape((-1, 1)) + np.outer(ms2['b'][:, i], X_new)
y_base_mcmc3 = ms3['a'][:, i].reshape((-1, 1)) + np.outer(ms3['b'][:, i], X_new)
d2 = d2.append(pd.DataFrame({'X': X_new, 'Y': np.median(y_base_mcmc2, axis=0), 'KID': kid, 'Model': '8-2'}))
d3 = d3.append(pd.DataFrame({'X': X_new, 'Y': np.median(y_base_mcmc3, axis=0), 'KID': kid, 'Model': '8-3'}))

rows, cols = 2, 2
gs2 = GridSpec(rows, cols)
for row, col in np.ndindex((rows, cols)):
kid = row + 1 + col * rows
ax = fig.add_subplot(gs2[row, col], sharex=ax if kid > 1 else None, sharey=ax if kid > 1 else None)
q = 'KID==@kid'
ax.scatter('X', 'Y', data=salary2.query(q), color='k', marker=MarkerStyle.filled_markers[kid], alpha=0.3, label=kid)
ax.plot('X', 'Y', data=d1.query(q), color='k', alpha=0.2, linestyle='solid', label='8-1')
ax.plot('X', 'Y', data=d2.query(q), color='k', alpha=0.6, linestyle='dashed', label='8-2')
ax.plot('X', 'Y', data=d3.query(q), color='k', alpha=1, linestyle='solid', label='8-3')
plt.setp(ax, title=kid, xlabel='X' if row == rows-1 else '', ylabel='Y' if col == 0 else '')
if row < rows -1:
plt.setp(ax.get_xticklabels(), visible=False)
if col > 0:
plt.setp(ax.get_yticklabels(), visible=False)

gs1.tight_layout(fig, rect=[None, None, 0.5, None])
gs2.tight_layout(fig, rect=[0.5, None, None, None])
top = min(gs1.top, gs2.top)
bottom = max(gs1.bottom, gs2.bottom)
gs1.update(top=top, bottom=bottom)
gs2.update(top=top, bottom=bottom)
plt.show()
```

## 8.1.6 階層モデルの等価な表現

```data = {col: salary2[col] for col in salary2.columns}
data['N'] = salary2.index.size
data['K'] = salary2['KID'].max()
fit = pystan.stan('./stan/model8-4.stan', data=data, seed=1234)
```