Gaussian Process Regression with PyMC3 (personal notes)

Posted at 2020-04-25
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import arviz as az
import pymc3 as pm
import matplotlib.gridspec as gridspec
import theano.tensor as tt
%matplotlib inline
plt.rcParams['font.size']=15

def plt_legend_out(frameon=True):
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0, frameon=frameon)

The Gaussian kernel, where $||x-x'||^2$ is the squared Euclidean distance:

$K(x,x')=\exp{\left(\frac{-||x-x'||^2}{w}\right)}$
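
A quick numeric check (a throwaway sketch; K and w below are just local names): the kernel equals 1 when the two points coincide and decays toward 0 as they move apart.

import numpy as np

w = 2
K = lambda a, b: np.exp(-(a - b)**2 / w)

print(K(1.0, 1.0))  # 1.0: maximal when the points coincide
print(K(1.0, 2.0))  # ~0.61
print(K(1.0, 5.0))  # ~0.0003: distant points barely contribute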

Kernelized linear regression:

$y=f(x)+\epsilon$

With a coefficient vector $\gamma$, ordinary linear regression is

$f(x)=\mu=\sum_i^N\gamma_ix_i$

Polynomial regression replaces the inputs with basis functions $\phi_i$:

$\mu=\sum_i^N\gamma_i\phi_i(x)$

The kernelized version instead uses kernels centered at knots (or centroids) $x'$, as the sketch below makes concrete:

$\mu=\sum_i^N\gamma_iK_i(x,x')$
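
A small sketch (the names here are mine) building the design matrices for the three variants on the same grid; in every case the model is just a linear combination $\Phi\gamma$ of the columns.

import numpy as np

xs = np.linspace(0, 10, 50)
knots = np.linspace(0, 10, 5)
w = 2

Phi_linear = xs[:, None]                                     # phi(x) = x
Phi_poly = np.vander(xs, N=4, increasing=True)               # phi_i(x) = x**i
Phi_kernel = np.exp(-(xs[:, None] - knots[None, :])**2 / w)  # phi_i(x) = K(x, knot_i)

gamma = np.random.randn(Phi_kernel.shape[1])
mu = Phi_kernel @ gamma  # one function from the kernelized model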

np.random.seed(1)
x = np.random.uniform(0, 10, size=20)
y = np.random.normal(np.sin(x), 0.2)
plt.plot(x, y, 'o')
plt.xlabel('$x$', fontsize=16)
plt.ylabel('$f(x)$', fontsize=16, rotation=0)
plt.show()

[Figure: scatter plot of the 20 noisy observations of sin(x)]

def gauss_kernel(x, n_knots):
    """
    Simple Gaussian radial kernel
    """
    knots = np.linspace(x.min(), x.max(), n_knots)    
    w = 2 
    return np.array([np.exp(-(x-k)**2/w) for k in knots])

The Gaussian kernel again, where $||x-x'||^2$ is the squared Euclidean distance. As long as $w$ is positive, the argument of $\exp$ is non-positive, so the kernel grows as $x$ and $x'$ get closer, up to a maximum of 1 at $x=x'$.

$K(x,x')=\exp{\left(\frac{-||x-x'||^2}{w}\right)}$

tmp = np.linspace(-5,1,100)
plt.plot(tmp,np.exp(tmp))
plt.xlabel('x')
plt.ylabel('$\exp{(x)}$')
plt.axvline(x=0,color='gray',lw=0.5)
plt.axhline(y=0,color='gray',lw=0.5)
plt.show()

[Figure: $\exp(x)$ over $[-5, 1]$, with reference lines at $x=0$ and $y=0$]

n_knots = 5
gk = gauss_kernel(x,n_knots)
plt.scatter(range(n_knots),np.linspace(x.min(), x.max(), n_knots))
plt.xlabel('index')
plt.ylabel('knots')
plt.show()

[Figure: the 5 evenly spaced knots plotted against their index]

plt.plot(x, y, 'o')
plt.xlabel('$x$', fontsize=16)
plt.ylabel('$f(x)$', fontsize=16, rotation=0)
plt.show()

[Figure: the data scatter plot again, for reference]

plt.scatter(range(len(x)),x)
plt.xlabel('index')
plt.ylabel('x')
plt.axhline(y=np.min(x),color='gray',lw=0.5)
plt.axhline(y=np.max(x),color='gray',lw=0.5)
plt.show()

[Figure: the x values plotted against their index, with horizontal lines at min(x) and max(x)]

knots = np.linspace(x.min(), x.max(), n_knots)

for i in range(n_knots):
    plt.plot(gk[i], label='$x\'=$' + str(knots[i]))
plt.ylabel('$K(x,x\')$')
plt.xlabel('index')
plt_legend_out()
plt.show()

[Figure: the 5 Gaussian kernels $K(x,x')$, one curve per knot, evaluated at the data points]

$y=f(x)+\color{green}{\epsilon}=\mu+\color{green}{\epsilon}=\sum_i^N\color{red}{\gamma_i}K_i(x,x')+\color{green}{\epsilon}$

$K(x,x')=\exp{\left(\frac{-||x-x'||^2}{w}\right)}$

Sample $\color{red}{\gamma}$ from a Cauchy distribution and the scale of $\color{green}{\epsilon}$ from a uniform distribution:

with pm.Model() as kernel_model:
    gamma = pm.Cauchy('gamma', alpha=0, beta=1, shape=n_knots)
    sd = pm.Uniform('sd',0,  10)
    mu = pm.math.dot(gamma, gauss_kernel(x, n_knots))
    yl = pm.Normal('yl', mu=mu, sd=sd, observed=y)
    kernel_trace = pm.sample(10000, step=pm.Metropolis())
Multiprocess sampling (4 chains in 4 jobs)
CompoundStep
>Metropolis: [sd]
>Metropolis: [gamma]
Sampling 4 chains, 0 divergences: 100%|██████████| 42000/42000 [00:07<00:00, 5320.97draws/s]
The number of effective samples is smaller than 10% for some parameters.
chain = kernel_trace[5000:]
pm.traceplot(chain);
plt.show()

[Figure: trace plots for gamma and sd]

# pm.sample_ppc was renamed to pm.sample_posterior_predictive in later PyMC3 releases
ppc = pm.sample_posterior_predictive(chain, model=kernel_model, samples=100)

plt.plot(x, ppc['yl'].T, 'ro', alpha=0.1)

plt.plot(x, y, 'bo')
plt.xlabel('$x$', fontsize=16)
plt.ylabel('$f(x)$', fontsize=16, rotation=0)
plt.show()
/opt/conda/lib/python3.7/site-packages/pymc3/sampling.py:1247: UserWarning: samples parameter is smaller than nchains times ndraws, some draws and/or chains may not be represented in the returned posterior predictive sample
  "samples parameter is smaller than nchains times ndraws, some draws "
100%|██████████| 100/100 [00:00<00:00, 574.95it/s]

[Figure: posterior predictive samples (red) over the data (blue)]

new_x = np.linspace(x.min(), x.max(), 100)
k = gauss_kernel(new_x, n_knots)
gamma_pred = chain['gamma']
for i in range(100):
    idx = np.random.randint(0, len(gamma_pred))
    y_pred = np.dot(gamma_pred[idx], k)
    plt.plot(new_x, y_pred, 'r-', alpha=0.1)
plt.plot(x, y, 'bo')
plt.xlabel('$x$', fontsize=16)
plt.ylabel('$f(x)$', fontsize=16, rotation=0)
plt.show()

[Figure: 100 posterior mean functions (red) over the data (blue)]

Gaussian processes

$\color{red}{\mu}$ is the mean function and $\color{blue}{K(x,x')}$ is the covariance function. So far we have been estimating $p(\theta|x)$; the idea of a GP is to estimate $p(f|x)$ instead.

$f(x)\sim \text{GP}(\color{red}{\mu(x)},\color{blue}{K(x,x')})$

squared_distance = lambda x, y: np.array([[(x[i] - y[j])**2 for i in range(len(x))] for j in range(len(y))])
np.random.seed(1)
test_points = np.linspace(0, 10, 100)
cov = np.exp(-squared_distance(test_points, test_points))
plt.plot(test_points, stats.multivariate_normal.rvs(cov=cov, size=6).T)
plt.xlabel('$x$', fontsize=16)
plt.ylabel('$f(x)$', fontsize=16, rotation=0)
plt.show()

[Figure: six functions drawn from the GP prior]

Noise is modeled by adjusting the diagonal elements of the covariance matrix so that they are no longer exactly 1:

\begin{eqnarray}
K_{i,j}=\left\{ \begin{array}{ll}
\eta\exp{(-\rho D)} & (i\neq j) \\
\eta+\sigma & (i=j)
\end{array} \right.
\end{eqnarray}
np.random.seed(1)
eta = 1
rho = 0.5
sigma = 0.03
D = squared_distance(test_points, test_points)

cov = eta * np.exp(-rho * D)
diag = eta + sigma  # the i = j case above

np.fill_diagonal(cov, diag)

for i in range(6):
    plt.plot(test_points, stats.multivariate_normal.rvs(cov=cov))
plt.xlabel('$x$', fontsize=16)
plt.ylabel('$f(x)$', fontsize=16, rotation=0)
plt.show()

[Figure: six draws from the GP prior with the adjusted diagonal]

Posterior predictive density

$p(f(X_*)|X_*,X,y)\sim N(\mu_*,\Sigma_*)$

\begin{eqnarray}
\mu_* &=& K_*^TK^{-1}y\\
\Sigma_* &=& \color{red}{K_{**}}-\color{green}{K_*}^TK^{-1}\color{green}{K_*}\\
K &=& K(X,X)\\
\color{red}{K_{**}} &=& K(X_*,X_*)\\
\color{green}{K_*} &=& K(X,X_*)
\end{eqnarray}

When $X$ and $X_\ast$ are far apart, $\color{green}{K_\ast}=K(X,X_\ast)$ approaches zero, so $\color{green}{K_\ast}^TK^{-1}\color{green}{K_\ast}$ approaches zero as well, and $\Sigma_\ast=\color{red}{K_{\ast\ast}}-\color{green}{K_\ast}^TK^{-1}\color{green}{K_\ast}$ grows toward $\color{red}{K_{\ast\ast}}$. In other words, the farther the function gets from the data points, the larger its uncertainty.

np.random.seed(1)

K_oo = eta * np.exp(-rho * D) 

D_x = squared_distance(x, x)
K = eta * np.exp(-rho * D_x)
diag_x = eta + sigma
np.fill_diagonal(K, diag_x)

D_off_diag = squared_distance(x, test_points)
K_o = eta * np.exp(-rho * D_off_diag)

# Posterior mean
mu_post = np.dot(np.dot(K_o, np.linalg.inv(K)), y)
# Posterior covariance
SIGMA_post = K_oo - np.dot(np.dot(K_o, np.linalg.inv(K)), K_o.T)


for i in range(100):
    fx = stats.multivariate_normal.rvs(mean=mu_post, cov=SIGMA_post)
    plt.plot(test_points, fx, 'r-', alpha=0.1)

plt.plot(x, y, 'o')

plt.xlabel('$x$', fontsize=16)
plt.ylabel('$f(x)$', fontsize=16, rotation=0)
plt.show()

[Figure: 100 draws from the analytic GP posterior (red) over the data (blue)]
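
As a quick check of the claim above, here is a small sketch (reusing SIGMA_post and test_points from the block just run) plotting the posterior predictive standard deviation; it dips near the observed x values and grows away from them. Tiny negative diagonal entries caused by round-off are clipped to zero first.

# posterior predictive sd; clip tiny negative diagonal values caused by round-off
sd_post = np.sqrt(np.clip(np.diag(SIGMA_post), 0, None))
plt.plot(test_points, sd_post)
plt.xlabel('$x$', fontsize=16)
plt.ylabel('posterior sd')
plt.show()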

Cholesky decomposition

Computing a matrix inverse explicitly takes $O(n^3)$ operations and is numerically fragile, so the computation above is expensive. The "Cholesky decomposition" lets us solve the same linear systems more cheaply and stably.
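
The idea in a minimal sketch (the matrix and names below are made up for illustration): factor $K=LL^T$ once, then replace every multiplication by $K^{-1}$ with triangular solves.

import numpy as np
from scipy.linalg import cho_factor, cho_solve

rng = np.random.default_rng(0)
A = rng.normal(size=(200, 200))
K_demo = A @ A.T + 200 * np.eye(200)  # a positive-definite stand-in for a covariance matrix
y_demo = rng.normal(size=200)

alpha_inv = np.linalg.inv(K_demo) @ y_demo          # explicit inverse: wasteful, less stable
alpha_chol = cho_solve(cho_factor(K_demo), y_demo)  # factor once, then triangular solves
print(np.allclose(alpha_inv, alpha_chol))           # True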

np.random.seed(1)
eta = 1
rho = 0.5
sigma = 0.03

# This is the true unknown function we are trying to approximate
f = lambda x: np.sin(x).flatten()

# Define the kernel
def kernel(a, b):
    """ GP squared exponential kernel """
    D = np.sum(a**2,1).reshape(-1,1) + np.sum(b**2,1) - 2*np.dot(a, b.T)
    return eta * np.exp(- rho * D)

N = 20         # number of training points.
n = 100         # number of test points.

# Sample some input points and noisy versions of the function evaluated at
# these points. 
X = np.random.uniform(0, 10, size=(N,1))
y = f(X) + sigma * np.random.randn(N)

K = kernel(X, X)
L = np.linalg.cholesky(K + sigma * np.eye(N))

# points we're going to make predictions at.
Xtest = np.linspace(0, 10, n).reshape(-1,1)

# compute the mean at our test points:
# Lk = L^{-1} K_*, so mu = Lk^T (L^{-1} y) = K_*^T K^{-1} y  (using K = L L^T)
Lk = np.linalg.solve(L, kernel(X, Xtest))
mu = np.dot(Lk.T, np.linalg.solve(L, y))

# compute the variance at our test points:
# var = diag(K_**) - diag(K_*^T K^{-1} K_*) = diag(K_**) - sum(Lk**2, axis=0)
K_ = kernel(Xtest, Xtest)
sd_pred = (np.diag(K_) - np.sum(Lk**2, axis=0))**0.5


plt.fill_between(Xtest.flat, mu - 2 * sd_pred, mu + 2 * sd_pred, color="r", alpha=0.2)
plt.plot(Xtest, mu, 'r', lw=2)
plt.plot(X, y, 'o')
plt.xlabel('$x$', fontsize=16)
plt.ylabel('$f(x)$', fontsize=16, rotation=0)
plt.show()

[Figure: posterior mean (red line) with a 2-sd band from the Cholesky computation]

import warnings
warnings.filterwarnings('ignore')
with pm.Model() as GP:
    mu = np.zeros(N)
    eta = pm.HalfCauchy('eta', 5)
    rho = pm.HalfCauchy('rho', 5)
    sigma = pm.HalfCauchy('sigma', 5)

    D = squared_distance(x, x)

    K = tt.fill_diagonal(eta * pm.math.exp(-rho * D), eta + sigma)

    obs = pm.MvNormal('obs', mu, cov=K, observed=y)

    test_points = np.linspace(0, 10, 100)
    D_pred = squared_distance(test_points, test_points)
    D_off_diag = squared_distance(x, test_points)

    K_oo = eta * pm.math.exp(-rho * D_pred)
    K_o = eta * pm.math.exp(-rho * D_off_diag)

    mu_post = pm.Deterministic('mu_post', pm.math.dot(pm.math.dot(K_o, tt.nlinalg.matrix_inverse(K)), y))
    SIGMA_post = pm.Deterministic('SIGMA_post', K_oo - pm.math.dot(pm.math.dot(K_o, tt.nlinalg.matrix_inverse(K)), K_o.T))

    start = pm.find_MAP()
    trace = pm.sample(1000, start=start)
logp = 15.78, ||grad|| = 1.6532e-05: 100%|██████████| 22/22 [00:00<00:00, 647.40it/s]  
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [sigma, rho, eta]
Sampling 4 chains, 0 divergences: 100%|██████████| 6000/6000 [01:37<00:00, 61.74draws/s] 
varnames = ['eta', 'rho', 'sigma']
chain = trace[100:]
pm.traceplot(chain, varnames)
plt.show()

[Figure: trace plots for eta, rho, and sigma]

pm.summary(chain, varnames).round(4)

        mean     sd  hpd_3%  hpd_97%  mcse_mean  mcse_sd  ess_mean  ess_sd  ess_bulk  ess_tail  r_hat
eta    2.729  3.053   0.234    7.215      0.082    0.060    1370.0  1303.0    1494.0    1730.0    1.0
rho    0.132  0.056   0.055    0.233      0.001    0.001    1396.0  1354.0    1470.0    1622.0    1.0
sigma  0.001  0.000   0.000    0.001      0.000    0.000    1297.0  1254.0    1631.0    1790.0    1.0
y_pred = [np.random.multivariate_normal(m, S) for m,S in zip(chain['mu_post'][::5], chain['SIGMA_post'][::5])]

for yp in y_pred:
    plt.plot(test_points, yp, 'r-', alpha=0.1)

plt.plot(x, y, 'bo')
plt.xlabel('$x$', fontsize=16)
plt.ylabel('$f(x)$', fontsize=16, rotation=0)
plt.show()

[Figure: posterior predictive functions (red) from the PyMC3 GP over the data (blue)]
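
For reference, PyMC3 also ships a dedicated GP module (pm.gp), so the covariance matrix does not have to be assembled by hand. A minimal, untested sketch of a comparable model with pm.gp.Marginal (the variable names are mine; it assumes the same x, y, and test_points as above):

X_col = x[:, None]  # pm.gp expects 2-D inputs

with pm.Model() as gp_marginal:
    eta = pm.HalfCauchy('eta', 5)
    ls = pm.HalfCauchy('ls', 5)
    sigma = pm.HalfCauchy('sigma', 5)

    cov = eta * pm.gp.cov.ExpQuad(1, ls=ls)  # squared-exponential covariance
    gp = pm.gp.Marginal(cov_func=cov)
    y_ = gp.marginal_likelihood('y_', X=X_col, y=y, noise=sigma)

    mp = pm.find_MAP()
    # posterior predictive mean and variance on the test grid
    mu_pred, var_pred = gp.predict(test_points[:, None], point=mp, diag=True)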

Periodic Kernel

Replacing the squared Euclidean distance with $\sin^2((x-x')/2)$ makes the covariance a periodic function of $x-x'$:

periodic = lambda x, y: np.array([[np.sin((x[i] - y[j])/2)**2 for i in range(len(x))] for j in range(len(y))])
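
To see the periodicity, a small sketch plotting the resulting covariance $\eta\exp(-\rho\sin^2((x-x')/2))$ against the lag $x-x'$ (with $\eta=\rho=1$); it repeats every $2\pi$:

delta = np.linspace(0, 20, 500)  # lag x - x'
plt.plot(delta, np.exp(-np.sin(delta / 2)**2))
plt.xlabel("$x - x'$")
plt.ylabel("covariance")
plt.show()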
with pm.Model() as GP_periodic:
    mu = np.zeros(N)
    eta = pm.HalfCauchy('eta', 5)
    rho = pm.HalfCauchy('rho', 5)
    sigma = pm.HalfCauchy('sigma', 5)

    P = periodic(x, x)

    K = tt.fill_diagonal(eta * pm.math.exp(-rho * P), eta + sigma)

    obs = pm.MvNormal('obs', mu, cov=K, observed=y)

    test_points = np.linspace(0, 10, 100)
    D_pred = periodic(test_points, test_points)
    D_off_diag = periodic(x, test_points)

    K_oo = eta * pm.math.exp(-rho * D_pred)
    K_o = eta * pm.math.exp(-rho * D_off_diag)

    mu_post = pm.Deterministic('mu_post', pm.math.dot(pm.math.dot(K_o, tt.nlinalg.matrix_inverse(K)), y))
    SIGMA_post = pm.Deterministic('SIGMA_post', K_oo - pm.math.dot(pm.math.dot(K_o, tt.nlinalg.matrix_inverse(K)), K_o.T))

    start = pm.find_MAP()
    trace = pm.sample(1000, start=start)
/opt/conda/lib/python3.7/site-packages/theano/tensor/basic.py:6611: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  result[diagonal_slice] = x
logp = 23.985, ||grad|| = 1.9188: 100%|██████████| 18/18 [00:00<00:00, 797.56it/s]  
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [sigma, rho, eta]
Sampling 4 chains, 0 divergences: 100%|██████████| 6000/6000 [01:01<00:00, 98.19draws/s] 
varnames = ['eta', 'rho', 'sigma']
chain = trace[100:]
pm.traceplot(chain, varnames);

[Figure: trace plots for eta, rho, and sigma (periodic kernel)]

y_pred = [np.random.multivariate_normal(m, S) for m,S in zip(chain['mu_post'][::5], chain['SIGMA_post'][::5])]

for yp in y_pred:
    plt.plot(test_points, yp, 'r-', alpha=0.1)

plt.plot(x, y, 'bo')
plt.xlabel('$x$', fontsize=16)
plt.ylabel('$f(x)$', fontsize=16, rotation=0)
plt.show()
/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:1: RuntimeWarning: covariance is not positive-semidefinite.
  """Entry point for launching an IPython kernel.

[Figure: posterior predictive functions (red) with the periodic kernel over the data (blue)]

