インポート
import pandas as pd
import statsmodels.formula.api as smf
import pystan
import matplotlib.pyplot as plt
from matplotlib.figure import figaspect
from matplotlib.gridspec import GridSpec
from matplotlib.markers import MarkerStyle
import seaborn as sns
%matplotlib inline
データ読み込み
salary3 = pd.read_csv('./data/data-salary-3.txt')
8.2 複数の階層を持つ階層モデル
8.2.1 解析の目的とデータの分布の確認
fig = plt.figure(figsize=figaspect(2/4))
gs1 = GridSpec(1, 1, figure=fig)
ax1 = plt.subplot(gs1[0, 0])
for gid in salary3['GID'].unique():
ax1.scatter('X', 'Y', data=salary3.query('GID==@gid'), marker=MarkerStyle.filled_markers[gid], label=gid)
sns.regplot('X', 'Y', data=salary3, scatter=False, ci=None, line_kws={'alpha': 0.3}, ax=ax1)
xlim = ax1.get_xlim()
ylim = ax1.get_ylim()
ax1.legend(title='GID')
gs2 = GridSpec(2, 2, figure=fig)
for i, gid in enumerate(salary3['GID'].unique()):
row = i // 2
col = i % 2
ax = fig.add_subplot(gs2[row, col], sharex=ax1, sharey=ax1)
ax.scatter('X', 'Y', data=salary3.query('GID==@gid'), c=sns.color_palette()[i], marker=MarkerStyle.filled_markers[gid], alpha=0.8, label=gid)
sns.regplot('X', 'Y', data=salary3, scatter=False, ci=None, line_kws={'alpha': 0.3}, ax=ax)
ax.legend(title='GID')
if row == 1:
plt.setp(ax, xlabel='X')
else:
plt.setp(ax, xlabel='')
plt.setp(ax.get_xticklabels(), visible=False)
if col == 0:
plt.setp(ax, ylabel='Y')
else:
plt.setp(ax, ylabel='')
plt.setp(ax.get_yticklabels(), visible=False)
plt.setp(ax1, xlim=xlim, ylim=ylim)
gs1.tight_layout(fig, rect=[None, None, 0.5, None])
gs2.tight_layout(fig, rect=[0.5, None, None, None])
plt.show()
KIDGID = salary3.loc[:, ('KID', 'GID')].drop_duplicates()
N = salary3.index.size
K = 30
G = 3
coefs = salary3.groupby('KID').apply(lambda d: smf.ols('Y~X', data=d).fit().params.rename({'Intercept': 'a', 'X': 'b'})).reset_index()
d_plot = pd.merge(coefs, KIDGID, on='KID')
fig = plt.figure(figsize=figaspect(3/4))
for i, gid in enumerate(salary3['GID'].unique()):
ax1 = fig.add_subplot(3, 2, i*2+1, sharex=ax1 if i > 0 else None, sharey=ax1 if i > 0 else None)
sns.distplot(d_plot.query('GID==@i+1')['a'], ax=ax1)
plt.setp(ax1, title=gid)
ax2 = fig.add_subplot(3, 2, i*2+2, sharex=ax2 if i > 0 else None, sharey=ax2 if i > 0 else None)
sns.distplot(d_plot.query('GID==@i+1')['b'], ax=ax2)
plt.setp(ax2, title=gid)
if i < salary3['GID'].unique().size - 1:
plt.setp(ax1, xlabel='')
plt.setp(ax1.get_xticklabels(), visible=False)
plt.setp(ax2, xlabel='')
plt.setp(ax2.get_xticklabels(), visible=False)
plt.tight_layout()
plt.show()
8.2.4 Stanで実装−その1
data = dict(
N=salary3.index.size,
G=salary3['GID'].max(),
K=salary3['KID'].max(),
X=salary3['X'],
Y=salary3['Y'],
KID=salary3['KID'],
K2G=salary3.drop_duplicates(subset=['KID', 'GID'])['GID']
)
fit = pystan.stan('./stan/model8-5.stan', data=data, seed=12345)
8.2.7 Stanで実装−その2
data = {col: salary3[col] for col in salary3.columns}
data['N'] = salary3.index.size
data['G'] = salary3['GID'].max()
data['K'] = salary3['KID'].max()
data['K2G'] = salary3.drop_duplicates(subset=['KID', 'GID'])['GID']
fit = pystan.stan('./stan/model8-6.stan', data=data, seed=123)