Python
Stan
PyStan

StanとRでベイズ統計モデリング(アヒル本)をPythonにしてみる - 8.2 複数の階層を持つ階層モデル

実行環境

インポート

import pandas as pd
import statsmodels.formula.api as smf
import pystan
import matplotlib.pyplot as plt
from matplotlib.figure import figaspect
from matplotlib.gridspec import GridSpec
from matplotlib.markers import MarkerStyle
import seaborn as sns
%matplotlib inline

データ読み込み

salary3 = pd.read_csv('./data/data-salary-3.txt')

8.2 複数の階層を持つ階層モデル

8.2.1 解析の目的とデータの分布の確認

fig = plt.figure(figsize=figaspect(2/4))

gs1 = GridSpec(1, 1, figure=fig)
ax1 = plt.subplot(gs1[0, 0])
for gid in salary3['GID'].unique():
    ax1.scatter('X', 'Y', data=salary3.query('GID==@gid'), marker=MarkerStyle.filled_markers[gid], label=gid)
sns.regplot('X', 'Y', data=salary3, scatter=False, ci=None, line_kws={'alpha': 0.3}, ax=ax1)
xlim = ax1.get_xlim()
ylim = ax1.get_ylim()
ax1.legend(title='GID')

gs2 = GridSpec(2, 2, figure=fig)
for i, gid in enumerate(salary3['GID'].unique()):
    row = i // 2
    col = i % 2
    ax = fig.add_subplot(gs2[row, col], sharex=ax1, sharey=ax1)
    ax.scatter('X', 'Y', data=salary3.query('GID==@gid'), c=sns.color_palette()[i], marker=MarkerStyle.filled_markers[gid], alpha=0.8, label=gid)
    sns.regplot('X', 'Y', data=salary3, scatter=False, ci=None, line_kws={'alpha': 0.3}, ax=ax)
    ax.legend(title='GID')
    if row == 1:
        plt.setp(ax, xlabel='X')
    else:
        plt.setp(ax, xlabel='')
        plt.setp(ax.get_xticklabels(), visible=False)
    if col == 0:
        plt.setp(ax, ylabel='Y')
    else:
        plt.setp(ax, ylabel='')
        plt.setp(ax.get_yticklabels(), visible=False)
plt.setp(ax1, xlim=xlim, ylim=ylim)
gs1.tight_layout(fig, rect=[None, None, 0.5, None])
gs2.tight_layout(fig, rect=[0.5, None, None, None])
plt.show()

fig8-5.png

KIDGID = salary3.loc[:, ('KID', 'GID')].drop_duplicates()

N = salary3.index.size
K = 30
G = 3
coefs = salary3.groupby('KID').apply(lambda d: smf.ols('Y~X', data=d).fit().params.rename({'Intercept': 'a', 'X': 'b'})).reset_index()
d_plot = pd.merge(coefs, KIDGID, on='KID')

fig = plt.figure(figsize=figaspect(3/4))
for i, gid in enumerate(salary3['GID'].unique()):
    ax1 = fig.add_subplot(3, 2, i*2+1, sharex=ax1 if i > 0 else None, sharey=ax1 if i > 0 else None)
    sns.distplot(d_plot.query('GID==@i+1')['a'], ax=ax1)
    plt.setp(ax1, title=gid)
    ax2 = fig.add_subplot(3, 2, i*2+2, sharex=ax2 if i > 0 else None, sharey=ax2 if i > 0 else None)
    sns.distplot(d_plot.query('GID==@i+1')['b'], ax=ax2)
    plt.setp(ax2, title=gid)
    if i < salary3['GID'].unique().size - 1:
        plt.setp(ax1, xlabel='')
        plt.setp(ax1.get_xticklabels(), visible=False)
        plt.setp(ax2, xlabel='')
        plt.setp(ax2.get_xticklabels(), visible=False)

plt.tight_layout()
plt.show()

fig8-6.png

8.2.4 Stanで実装−その1

data = dict(
    N=salary3.index.size,
    G=salary3['GID'].max(),
    K=salary3['KID'].max(),
    X=salary3['X'],
    Y=salary3['Y'],
    KID=salary3['KID'],
    K2G=salary3.drop_duplicates(subset=['KID', 'GID'])['GID']
)
fit = pystan.stan('./stan/model8-5.stan', data=data, seed=12345)

8.2.7 Stanで実装−その2

data = {col: salary3[col] for col in salary3.columns}
data['N'] = salary3.index.size
data['G'] = salary3['GID'].max()
data['K'] = salary3['KID'].max()
data['K2G'] = salary3.drop_duplicates(subset=['KID', 'GID'])['GID']
fit = pystan.stan('./stan/model8-6.stan', data=data, seed=123)