Edited at

StanとRでベイズ統計モデリング(アヒル本)をPythonにしてみる - Chapter 8 練習問題

実行環境


インポート

import numpy as np

import pandas as pd
import pystan
import matplotlib.pyplot as plt
from matplotlib.figure import figaspect
from matplotlib.markers import MarkerStyle
import seaborn as sns
%matplotlib inline


データ読み込み

salary2 = pd.read_csv('./data/data-salary-2.txt')

salary3 = pd.read_csv('./data/data-salary-3.txt')
attendance41 = pd.read_csv('./data/data-attendance-4-1.txt')
attendance42 = pd.read_csv('./data/data-attendance-4-2.txt')
d = pd.read_csv('./data/data7a.csv')
d1 = pd.read_csv('./data/d1.csv')


(1)

N = salary2.index.size

K = salary2['KID'].max()
data = {col: salary2[col] for col in salary2.columns}
data['N'] = N
data['K'] = K
fit1 = pystan.stan('./stan/model8-2.stan', data=data, seed=1234)

ms1 = fit1.extract()

N_mcmc = ms1['lp__'].size
X_new = np.arange(salary2['X'].max()+1)
N_X = X_new.size

d2 = pd.DataFrame([])
for k in range(K):
loc = ms1['a'][:, k].reshape((-1, 1)) + np.outer(ms1['b'][:, k], X_new)
scale = np.broadcast_to(ms1['s_Y'].reshape((-1, 1)), loc.shape)
y_base_mcmc2 = np.random.normal(loc, scale)
d2 = d2.append(pd.DataFrame(np.hstack([X_new.reshape((-1, 1)), np.percentile(y_base_mcmc2, (2.5, 50, 97.5), axis=0).T, np.full_like(X_new, k+1).reshape((-1, 1))]), columns=('X', 'p2.5', 'p50', 'p97.5', 'KID')), ignore_index=True)

_, axes = plt.subplots(2, 2, figsize=figaspect(2/2), sharex=True, sharey=True)
for i, ax in enumerate(axes.flat):
kid = i + 1
q = 'KID==@kid'
ax.fill_between('X', 'p2.5', 'p97.5', data=d2.query(q), color='k', alpha=1/5)
ax.plot('X', 'p50', data=d2.query(q), color='k', alpha=0.8, label='')
ax.scatter('X', 'Y', data=salary2.query(q), c='k', marker=MarkerStyle.filled_markers[i], label=kid)
ax.legend(title='KID')
plt.setp(ax, title=kid)
plt.tight_layout()
plt.show()

chap08ex1.png


(2)

N = salary2.index.size

K = salary2['KID'].max()
data = {col: salary2[col] for col in salary2.columns}
data['N'] = N
data['K'] = K
fit2 = pystan.stan('./stan/model8-3.stan', data=data, seed=1234)

ms2 = fit2.extract()

N_mcmc = ms2['lp__'].size
X_new = np.arange(salary2['X'].max()+1)
N_X = X_new.size

d3 = pd.DataFrame([])
for k in range(K):
loc = ms2['a'][:, k].reshape((-1, 1)) + np.outer(ms2['b'][:, k], X_new)
scale = np.broadcast_to(ms2['s_Y'].reshape((-1, 1)), loc.shape)
y_base_mcmc3 = np.random.normal(loc, scale)
d3 = d3.append(pd.DataFrame(np.hstack([X_new.reshape((-1, 1)), np.percentile(y_base_mcmc3, (2.5, 50, 97.5), axis=0).T, np.full_like(X_new, k+1).reshape((-1, 1))]), columns=('X', 'p2.5', 'p50', 'p97.5', 'KID')), ignore_index=True)

_, axes = plt.subplots(2, 2, figsize=figaspect(2/2), sharex=True, sharey=True)
for i, ax in enumerate(axes.flat):
kid = i + 1
q = 'KID==@kid'
ax.fill_between('X', 'p2.5', 'p97.5', data=d3.query(q), color='k', alpha=1/5)
ax.plot('X', 'p50', data=d3.query(q), color='k', alpha=0.8, label='')
ax.scatter('X', 'Y', data=salary2.query(q), c='k', marker=MarkerStyle.filled_markers[i], label=kid)
ax.legend(title='KID')
plt.setp(ax, title=kid)
plt.tight_layout()
plt.show()

chap08ex2.png


(3)

N = salary3.index.size

G = salary3['GID'].max()
K = salary3['KID'].max()
data = dict(
N=N,
G=G,
K=K,
X=salary3['X'],
Y=salary3['Y'],
KID=salary3['KID'],
K2G=salary3.drop_duplicates(('KID', 'GID'))['GID']
)
fit5 = pystan.stan('./stan/model8-5.stan', data=data, seed=12345)

ms3 = fit5.extract()

N_mcmc = ms3['lp__'].size

param_names = ['mcmc'] + ['a1_{}'.format(i+1) for i in range(3)] + ['b1_{}'.format(i+1) for i in range(3)]
d_est = pd.DataFrame(np.hstack([np.arange(N_mcmc).reshape((-1, 1)), ms3['a1'], ms3['b1']]), columns=param_names)
d_qua = d_est.quantile((0.025, 0.5, 0.975)).transpose().reset_index()
d_qua.columns = ('X', 'p2.5', 'p50', 'p97.5')
d_melt = pd.melt(d_est, id_vars=('mcmc'), var_name='X')

_, axes = plt.subplots(1, 2, figsize=figaspect(5/8))
for ax, param in zip(axes, ['a', 'b']):
q = 'X.str.startswith("{}1")'.format(param)
ax.hlines('X', 'p2.5', 'p97.5', data=d_qua.query(q))
sns.violinplot('value', 'X', data=d_melt.query(q), scale='width', inner=None, color='w', ax=ax)
ax.scatter('p50', 'X', data=d_qua.query(q), c='k')
plt.setp(ax, xlabel='value', ylabel='parameter')
plt.tight_layout()
plt.show()

fig8-ex3.png


(4)

d_person = attendance42.groupby('PersonID')['Y'].mean()

plt.figure(figsize=figaspect(3/4))
sns.distplot(d_person)
plt.show()

fig8-ex4-person.png

d_course = attendance42.groupby('CourseID')['Y'].mean()

plt.figure(figsize=figaspect(3/4))
sns.distplot(d_course)
plt.show()

fig8-ex4-course.png


(5)

d_conv = pd.Series([0, 0.2, 1], index=('A', 'B', 'C'))

data = dict(
N=attendance41.index.size,
C=attendance42['CourseID'].max(),
I=attendance42.index.size,
A=attendance41['A'],
Score=attendance41['Score']/200,
PID=attendance42['PersonID'],
CID=attendance42['CourseID'],
W=d_conv[attendance42['Weather']],
Y=attendance42['Y']
)
fit = pystan.stan('./stan/model8ex5.stan', data=data, pars=('b', 'b_P', 'b_C', 's_P', 's_C', 'q'), seed=1234)


(6)

data = dict(

N=d.index.size,
Y=d['y']
)
fit = pystan.stan('./stan/model8ex6.stan', data=data, seed=1234)


(7)

F2int = pd.Series([0, 1], index=('C', 'T'))

pots = d1['pot'].unique()
N_Pot = pots.size
Pot2int = pd.Series(np.arange(N_Pot)+1, index=pots)

data = dict(
N=d1.index.size,
N_Pot=N_Pot,
F=F2int[d1['f']],
N2Pot=Pot2int[d1['pot']],
Y=d1['y']
)
fit = pystan.stan('./stan/model8ex7.stan', data=data, seed=1234)