Python
Stan
PyStan

StanとRでベイズ統計モデリング(アヒル本)をPythonにしてみる - Chapter 8 練習問題

実行環境

インポート

import numpy as np
import pandas as pd
import pystan
import matplotlib.pyplot as plt
from matplotlib.figure import figaspect
from matplotlib.markers import MarkerStyle
import seaborn as sns
%matplotlib inline

データ読み込み

salary2 = pd.read_csv('./data/data-salary-2.txt')
salary3 = pd.read_csv('./data/data-salary-3.txt')
attendance41 = pd.read_csv('./data/data-attendance-4-1.txt')
attendance42 = pd.read_csv('./data/data-attendance-4-2.txt')
d = pd.read_csv('./data/data7a.csv')
d1 = pd.read_csv('./data/d1.csv')

(1)

N = salary2.index.size
K = salary2['KID'].max()
data = {col: salary2[col] for col in salary2.columns}
data['N'] = N
data['K'] = K
fit1 = pystan.stan('./stan/model8-2.stan', data=data, seed=1234)
ms1 = fit1.extract()

N_mcmc = ms1['lp__'].size
X_new = np.arange(salary2['X'].max()+1)
N_X = X_new.size

d2 = pd.DataFrame([])
for k in range(K):
    loc = ms1['a'][:, k].reshape((-1, 1)) + np.outer(ms1['b'][:, k], X_new)
    scale = np.broadcast_to(ms1['s_Y'].reshape((-1, 1)), loc.shape)
    y_base_mcmc2 = np.random.normal(loc, scale)
    d2 = d2.append(pd.DataFrame(np.hstack([X_new.reshape((-1, 1)), np.percentile(y_base_mcmc2, (2.5, 50, 97.5), axis=0).T, np.full_like(X_new, k+1).reshape((-1, 1))]), columns=('X', 'p2.5', 'p50', 'p97.5', 'KID')), ignore_index=True)

_, axes = plt.subplots(2, 2, figsize=figaspect(2/2), sharex=True, sharey=True)
for i, ax in enumerate(axes.flat):
    kid = i + 1
    q = 'KID==@kid'
    ax.fill_between('X', 'p2.5', 'p97.5', data=d2.query(q), color='k', alpha=1/5)
    ax.plot('X', 'p50', data=d2.query(q), color='k', alpha=0.8, label='')
    ax.scatter('X', 'Y', data=salary2.query(q), c='k', marker=MarkerStyle.filled_markers[i], label=kid)
    ax.legend(title='KID')
    plt.setp(ax, title=kid)
plt.tight_layout()
plt.show()

chap08ex1.png

(2)

N = salary2.index.size
K = salary2['KID'].max()
data = {col: salary2[col] for col in salary2.columns}
data['N'] = N
data['K'] = K
fit2 = pystan.stan('./stan/model8-3.stan', data=data, seed=1234)
ms2 = fit2.extract()

N_mcmc = ms2['lp__'].size
X_new = np.arange(salary2['X'].max()+1)
N_X = X_new.size

d3 = pd.DataFrame([])
for k in range(K):
    loc = ms2['a'][:, k].reshape((-1, 1)) + np.outer(ms2['b'][:, k], X_new)
    scale = np.broadcast_to(ms2['s_Y'].reshape((-1, 1)), loc.shape)
    y_base_mcmc3 = np.random.normal(loc, scale)
    d3 = d3.append(pd.DataFrame(np.hstack([X_new.reshape((-1, 1)), np.percentile(y_base_mcmc3, (2.5, 50, 97.5), axis=0).T, np.full_like(X_new, k+1).reshape((-1, 1))]), columns=('X', 'p2.5', 'p50', 'p97.5', 'KID')), ignore_index=True)

_, axes = plt.subplots(2, 2, figsize=figaspect(2/2), sharex=True, sharey=True)
for i, ax in enumerate(axes.flat):
    kid = i + 1
    q = 'KID==@kid'
    ax.fill_between('X', 'p2.5', 'p97.5', data=d3.query(q), color='k', alpha=1/5)
    ax.plot('X', 'p50', data=d3.query(q), color='k', alpha=0.8, label='')
    ax.scatter('X', 'Y', data=salary2.query(q), c='k', marker=MarkerStyle.filled_markers[i], label=kid)
    ax.legend(title='KID')
    plt.setp(ax, title=kid)
plt.tight_layout()
plt.show()

chap08ex2.png

(3)

N = salary3.index.size
G = salary3['GID'].max()
K = salary3['KID'].max()
data = dict(
    N=N,
    G=G,
    K=K,
    X=salary3['X'],
    Y=salary3['Y'],
    KID=salary3['KID'],
    K2G=salary3.drop_duplicates(('KID', 'GID'))['GID']
)
fit5 = pystan.stan('./stan/model8-5.stan', data=data, seed=12345)
ms3 = fit5.extract()
N_mcmc = ms3['lp__'].size

param_names = ['mcmc'] + ['a1_{}'.format(i+1) for i in range(3)] + ['b1_{}'.format(i+1) for i in range(3)]
d_est = pd.DataFrame(np.hstack([np.arange(N_mcmc).reshape((-1, 1)), ms3['a1'], ms3['b1']]), columns=param_names)
d_qua = d_est.quantile((0.025, 0.5, 0.975)).transpose().reset_index()
d_qua.columns = ('X', 'p2.5', 'p50', 'p97.5')
d_melt = pd.melt(d_est, id_vars=('mcmc'), var_name='X')

_, axes = plt.subplots(1, 2, figsize=figaspect(5/8))
for ax, param in zip(axes, ['a', 'b']):
    q = 'X.str.startswith("{}1")'.format(param)
    ax.hlines('X', 'p2.5', 'p97.5', data=d_qua.query(q))
    sns.violinplot('value', 'X', data=d_melt.query(q), scale='width', inner=None, color='w', ax=ax)
    ax.scatter('p50', 'X', data=d_qua.query(q), c='k')
    plt.setp(ax, xlabel='value', ylabel='parameter')
plt.tight_layout()
plt.show()

fig8-ex3.png

(4)

d_person = attendance42.groupby('PersonID')['Y'].mean()
plt.figure(figsize=figaspect(3/4))
sns.distplot(d_person)
plt.show()

fig8-ex4-person.png

d_course = attendance42.groupby('CourseID')['Y'].mean()
plt.figure(figsize=figaspect(3/4))
sns.distplot(d_course)
plt.show()

fig8-ex4-course.png

(5)

d_conv = pd.Series([0, 0.2, 1], index=('A', 'B', 'C'))
data = dict(
    N=attendance41.index.size,
    C=attendance42['CourseID'].max(),
    I=attendance42.index.size,
    A=attendance41['A'],
    Score=attendance41['Score']/200,
    PID=attendance42['PersonID'],
    CID=attendance42['CourseID'],
    W=d_conv[attendance42['Weather']],
    Y=attendance42['Y']
)
fit = pystan.stan('./stan/model8ex5.stan', data=data, pars=('b', 'b_P', 'b_C', 's_P', 's_C', 'q'), seed=1234)

(6)

data = dict(
    N=d.index.size,
    Y=d['y']
)
fit = pystan.stan('./stan/model8ex6.stan', data=data, seed=1234)

(7)

F2int = pd.Series([0, 1], index=('C', 'T'))
pots = d1['pot'].unique()
N_Pot = pots.size
Pot2int = pd.Series(np.arange(N_Pot)+1, index=pots)

data = dict(
    N=d1.index.size,
    N_Pot=N_Pot,
    F=F2int[d1['f']],
    N2Pot=Pot2int[d1['pot']],
    Y=d1['y']
)
fit = pystan.stan('./stan/model8ex7.stan', data=data, seed=1234)