LoginSignup
1
2

More than 5 years have passed since last update.

StanとRでベイズ統計モデリング(アヒル本)をPythonにしてみる - 7.2 対数をとるか否か

Last updated at Posted at 2018-08-18

実行環境

インポート

import numpy as np
from scipy import stats
import pandas as pd
import pystan
import matplotlib.pyplot as plt
from matplotlib.figure import figaspect
from matplotlib.ticker import ScalarFormatter
import seaborn as sns
%matplotlib inline

データ読み込み

rental = pd.read_csv('./data/data-rental.txt')

7.2 対数をとるか否か

_, (ax1, ax2) = plt.subplots(1, 2, figsize=figaspect(3/8))
rental.plot.scatter('Area', 'Y', ax=ax1)
plt.setp(ax1, xticks=np.arange(20, 120, 20), yticks=np.arange(500, 2000, 500))
rental.plot.scatter('Area', 'Y', ax=ax2)
xticks = (1,2,5,10,20,50,100)
yticks = (200,500,1000,2000)
ax2.get_xaxis().set_major_formatter(ScalarFormatter())
plt.setp(ax2, xscale='log', yscale='log', yticks=(10,20,50,100,200,500,1000,2000))
plt.setp(ax2, xticks=xticks, xticklabels=xticks, yticks=yticks, yticklabels=yticks, xlim=(9, 120))
plt.show()

fig7-1.png

N_new = 50
Area_new = np.linspace(10, 120, N_new)
data = {col: rental[col] for col in rental.columns}
data['N'] = rental.index.size
data['N_new'] = N_new
data['Area_new'] = Area_new
fit1 = pystan.stan('./stan/model7-1.stan', data=data, seed=1234)
for key in ['Area', 'Y', 'Area_new']:
    data[key] = np.log10(data[key])
fit2 = pystan.stan('./stan/model7-2.stan', data=data, seed=1234)
ms1 = fit1.extract()
ms2 = fit2.extract()
for key in ['y_pred', 'y_new']:
    ms2[key] = np.power(10, ms2[key])
_, axes = plt.subplots(1, 2, figsize=figaspect(3/8), sharex=True, sharey=True)
for ms, ax in zip([ms1, ms2], axes):
    d_est = np.percentile(ms['y_new'], (10, 25, 50, 75, 90), axis=0)
    ax.fill_between(Area_new, d_est[0], d_est[-1], color='k', alpha=0.2)
    ax.fill_between(Area_new, d_est[1], d_est[-2], color='k', alpha=0.4)
    ax.plot(Area_new, d_est[2], color='k')
    rental.plot.scatter('Area', 'Y', ax=ax, color='k')
plt.show()

fig7-2.png

_, axes = plt.subplots(1, 2, figsize=figaspect(1/2), sharex=True, sharey=True)
for ms, ax in zip([ms1, ms2], axes):
    d_est = np.percentile(ms['y_pred'], (10, 50, 90), axis=0)
    ax.errorbar(rental['Y'], d_est[1], yerr=[d_est[1]-d_est[0], d_est[-1]-d_est[1]], fmt='o', color='k', elinewidth=1)
    xmin, xmax = ax.get_xlim()
    ymin, ymax = ax.get_ylim()
    lim = (min(xmin, ymin), max(xmax, ymax))
    ax.plot(lim, lim, c='k', linestyle='dashed')
    plt.setp(ax, xlim=lim, ylim=lim, xlabel='Obserbed', ylabel='Predicted')
plt.show()

fig7-3.png

N_mcmc1 = len(ms1['lp__'])
N_mcmc2 = len(ms2['lp__'])
d_noise1 = pd.DataFrame(rental['Y'].values - ms1['mu'], columns=['noise{}'.format(i) for i in range(rental.index.size)])
d_noise2 = pd.DataFrame(np.log10(rental['Y'].values) - ms2['mu'], columns=['noise{}'.format(i) for i in range(rental.index.size)])
d_est1 = d_noise1.join(pd.Series(np.arange(N_mcmc1), name='mcmc'))
d_est2 = d_noise2.join(pd.Series(np.arange(N_mcmc2), name='mcmc'))

def get_map(col):
    kernel = stats.gaussian_kde(col)
    dens_x = np.linspace(col.min(), col.max(), kernel.n)
    dens_y = kernel.pdf(dens_x)
    mode_i = np.argmax(dens_y)
    mode_x = dens_x[mode_i]
    mode_y = dens_y[mode_i]
    return pd.Series([mode_x, mode_y], index=['X', 'Y'])

d_mode1 = d_noise1.apply(get_map).T
d_mode2 = d_noise2.apply(get_map).T

s_MAP1 = get_map(ms1['s_Y'])['X']
s_MAP2 = get_map(ms2['s_Y'])['X']

_, axes = plt.subplots(1, 2, figsize=figaspect(3/8))
for d_mode, s_MAP, ax in zip([d_mode1, d_mode2], [s_MAP1, s_MAP2], axes):
    sns.distplot(d_mode['X'], bins=16, color='k', hist_kws={'facecolor': 'w', 'edgecolor': 'k'}, kde_kws={'shade': True}, ax=ax)
    xdata = ax.lines[0].get_xdata()
    ax.plot(xdata, stats.norm.pdf(xdata, scale=s_MAP), color='k', linestyle='dashed')
    plt.setp(ax, xlabel='value')

plt.show()

fig7-4.png

1
2
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
1
2