
Notes on kernel visualization

https://nbviewer.jupyter.org/github/adamian/adamian.github.io/blob/master/talks/Brown2016.ipynb

from IPython.display import HTML

HTML('''
<script>  
code_show=true; 
function code_toggle() {
  if (code_show){
    $(\'div.input\').hide();
  } else {
    $(\'div.input\').show();
  }
  code_show = !code_show
}  
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()">
    <input type="submit" value="Toggle code">
</form>
''')
%pylab inline

# Import relevant Python libraries
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import numpy as np
import scipy as sp
from scipy.stats import multivariate_normal
import seaborn as sns
#%matplotlib inline
#plt.rcParams['font.size']=15
Populating the interactive namespace from numpy and matplotlib



Sampling from a Gaussian distribution

def gen_Gaussian_samples(mu, sigma, N=200):
    """
    Generate N samples from a multivariate Gaussian with mean mu and covariance sigma
    """
    D = mu.shape[0]
    samples = np.zeros((N,D))
    for i in np.arange(N):
        samples[i,:] = np.random.multivariate_normal(mean=mu, cov=sigma)
    return samples.copy()


def gen_plot_Gaussian_samples(mu, sigma,N=1000):
    """
    Generate N samples from a multivariate Gaussian with mean mu and covariance sigma
    and plot the samples as they're generated
    """
    for i in np.arange(N):
        sample = np.random.multivariate_normal(mean=mu, cov=sigma)
        plt.plot(sample[0],sample[1], '.',color='r',alpha=0.6)
    plt.grid()


def plot_Gaussian_contours(x,y,mu,sigma,N=100):
    """
    Plot contours of a 2D multivariate Gaussian based on an N x N grid of points. The given
    points x and y determine the limits of the contours.
    """
    X, Y = np.meshgrid(np.linspace(x.min()-0.3,x.max()+0.3,N), np.linspace(y.min()-0.3,y.max()+0.3,N))
    rv = multivariate_normal(mu, sigma)
    Z = rv.pdf(np.dstack((X,Y)))
    plt.contour(X,Y,Z)
    plt.xlabel('$x_1$')
    plt.ylabel('$x_2$')

def plot_sample_dimensions(samples, colors=None, markers=None, ms=10):
    """
    Given a set of samples from a bivariate Gaussian, plot them, but instead of plotting them
    x1 vs x2, plot them as [x1 x2] vs ['1', '2']
    """
    N = samples.shape[0]
    D = samples.shape[1]

    t=np.array(range(1,D+1))

    for i in np.arange(N):
        if colors is None and markers is None:
            plt.plot(t,samples[i,:], '-o',ms=ms)
        elif colors is None:
            plt.plot(t,samples[i,:], '-o',marker=markers[i],ms=ms)
        elif markers is None:
            plt.plot(t,samples[i,:], '-o',color=colors[i],ms=ms)
        else:
            plt.plot(t,samples[i,:], '-o',color=colors[i],marker=markers[i],ms=ms)
    plt.grid()
    plt.xlim([0.8,t[-1]+0.2])
    plt.ylim([samples.min()-0.3, samples.max()+0.3])
    plt.xlabel('d = {' + str(t) + '}')
    plt.ylabel('[x_d]')
    plt.gca().set_title(str(N) + ' samples from a bivariate Gaussian')


def set_limits(samples):
    plt.xlim([samples[:,0].min()-0.3, samples[:,0].max()+0.3])
    plt.ylim([samples[:,1].min()-0.3, samples[:,1].max()+0.3])
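
A small aside: `np.random.multivariate_normal` can draw all N samples in a single call via its `size` argument, so the loop in `gen_Gaussian_samples` above is mainly for readability. A minimal vectorized equivalent would be:

```python
import numpy as np

def gen_Gaussian_samples_vectorized(mu, sigma, N=200):
    """Draw N samples from N(mu, sigma) in one call; returns an (N, D) array."""
    return np.random.multivariate_normal(mean=mu, cov=sigma, size=N)
```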

Sampling from a 2D Gaussian distribution

$\mu=(0,0)$

$
\Sigma = \left(
\begin{array}{cc}
1 & 0.5 \\
0.5 & 1
\end{array}
\right)
$

The right-hand plot is a parallel-coordinates plot.

colors = ['r','g','b','m','k']
markers = ['p','d','o','v','<']

N=5 # Number of samples
mu = np.array([0,0])  # Mean of the 2D Gaussian
sigma = np.array([[1, 0.5], [0.5, 1]]); # covariance of the Gaussian


# Generate samples
samples = gen_Gaussian_samples(mu,sigma,N) 


f=figure(figsize=(12,12)); 
ax1=plt.subplot(1, 2, 1,autoscale_on=False, aspect='equal')
set_limits(samples)
plot_Gaussian_contours(samples[:,0],samples[:,1],mu,sigma)

# Plot samples
for i in np.arange(N):
    plt.plot(samples[i,0],samples[i,1], 'o', color=colors[i], marker=markers[i],ms=10)
plt.grid()
plt.gca().set_title(str(N) + ' samples of a bivariate Gaussian.')
plt.axvline(x=0,color='k')
plt.axhline(y=0,color='k')

ax2=plt.subplot(1, 2, 2,autoscale_on=False, aspect='equal')
plot_sample_dimensions(samples=samples, colors=colors, markers=markers)
plt.tick_params(bottom=False,labelbottom=False)
plt.axvline(x=1,color='k')
plt.text(1,np.min(samples)-0.5,'$x_1$')
plt.text(2,np.min(samples)-0.5,'$x_2$')
plt.axvline(x=2,color='k')
plt.xlabel('')
plt.ylabel('value')
plt.show()
#ax2.set(autoscale_on=False, aspect='equal')

image.png

Changing the covariance matrix parameters

Weak correlation

$
\Sigma_1 = \left(
\begin{array}{cc}
1 & 0.02 \\
0.02 & 1
\end{array}
\right)
$

Strong correlation

$
\Sigma_2 = \left(
\begin{array}{cc}
1 & 0.95 \\
0.95 & 1
\end{array}
\right)
$

# Plot with contours. Compare a correlated vs almost uncorrelated Gaussian

sigmaUncor = np.array([[1, 0.02], [0.02, 1]]);
sigmaCor = np.array([[1, 0.95], [0.95, 1]]);

f=figure(figsize=(15,15)); 

ax=plt.subplot(1, 2, 1); ax.set_aspect('equal')
samplesUncor=gen_Gaussian_samples(mu,sigmaUncor)
plot_Gaussian_contours(samplesUncor[:,0],samplesUncor[:,1], mu, sigmaUncor)
gen_plot_Gaussian_samples(mu, sigmaUncor)
plt.gca().set_title('Weakly correlated Gaussian')

ax=plt.subplot(1, 2, 2); ax.set_aspect('equal')
samplesCor=gen_Gaussian_samples(mu,sigmaCor)
plot_Gaussian_contours(samplesCor[:,0],samplesCor[:,1], mu, sigmaCor)
gen_plot_Gaussian_samples(mu, sigmaCor)
plt.gca().set_title('Strongly correlated Gaussian')
plt.show()

image.png

Parallel-coordinates plot. When the correlation is strong, the line segments become nearly parallel (right-hand figure).

# But let's plot them as before dimension-wise...

f=figure(figsize=(18,5)); 
perm = np.random.permutation(samplesUncor.shape[0])[0::14]

ax1=plt.subplot(1, 2, 1); ax1.set_aspect('auto')
plot_sample_dimensions(samplesUncor[perm,:])
plt.gca().set_title('Weakly correlated')
plt.axvline(x=1,color='k')
plt.axvline(x=2,color='k')

ax2=plt.subplot(1, 2, 2,sharey=ax1); ax2.set_aspect('auto')
plot_sample_dimensions(samplesCor[perm,:])
plt.gca().set_title('Strongly correlated')
plt.ylim([samplesUncor.min()-0.3, samplesUncor.max()+0.3])
plt.axvline(x=1,color='k')
plt.axvline(x=2,color='k')
plt.show()

image.png

Generate 5 samples from an 8-dimensional Gaussian

N=5
mu = np.array([0,0,0,0,0,0,0,0])
D = mu.shape[0]

# Generate random covariance matrix
tmp = np.sort(np.random.rand(D))[:,None]
tmp2 = tmp**np.arange(5)
sigma = 5*np.dot(tmp2,tmp2.T) + 0.005*np.eye(D)

samples = gen_Gaussian_samples(mu,sigma,N)

for i in np.arange(N):
    plt.plot(tmp,samples[i,:], '-o')
plt.grid()

plt.gca().set_title(str(N) + ' samples of a ' + str(D) + ' dimensional Gaussian')
plt.ylabel('value')
plt.xlabel('d')
plt.axhline(y=0,color='k')
plt.show()

image.png

Covariance matrix

sns.heatmap(sigma, linewidths=.5)
plt.title('covariance matrix')
plt.show()

image.png

Generate 5 samples from a 200-dimensional Gaussian

N=5
D=200
mu = np.zeros((D,1))[:,0]

# Generate random covariance matrix
tmp = np.sort(np.random.rand(D))[:,None]
tmp2 = tmp**np.arange(5)
sigma1 = 5*np.dot(tmp2,tmp2.T)
sigma2 = 5*np.dot(tmp2,tmp2.T) + 0.0005*np.eye(D)

plt.figure(figsize=(10,3))

plt.subplot(121)
samples = gen_Gaussian_samples(mu,sigma1,N)
for i in np.arange(N):
    plt.plot(tmp,samples[i,:], '-')
plt.gca().set_title(str(N) + ' samples of a ' + str(D) + ' dimensional Gaussian\nnoise : False')
plt.xlabel('d')
plt.ylabel('value')
plt.axhline(y=0,color='gray',lw=0.5)

plt.subplot(122)
samples = gen_Gaussian_samples(mu,sigma2,N)
for i in np.arange(N):
    plt.plot(tmp,samples[i,:], '-')
plt.gca().set_title(str(N) + ' samples of a ' + str(D) + ' dimensional Gaussian\nnoise : True')
plt.xlabel('d')
plt.ylabel('value')
plt.axhline(y=0,color='gray',lw=0.5)

plt.show()

image.png

plt.figure(figsize=(8,3))

plt.subplot(121)
sns.heatmap(sigma1)
plt.title('covariance matrix\nnoise : False')

plt.subplot(122)
sns.heatmap(sigma2)
plt.title('covariance matrix\nnoise : True')
plt.show()

image.png

From the 200-dimensional Gaussian we obtained smooth vector-valued samples.
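
One detail worth noting (my own check, not part of the original notebook): `tmp2` has shape (D, 5), so `sigma1 = 5*np.dot(tmp2, tmp2.T)` has rank at most 5 and is therefore singular, while the small `0.0005*np.eye(D)` term in `sigma2` makes the covariance strictly positive definite and the sampling numerically well behaved. Assuming `sigma1` and `sigma2` from the cell above are still in scope:

```python
import numpy as np

print(np.linalg.matrix_rank(sigma1))   # at most 5, far below D=200
print(np.linalg.matrix_rank(sigma2))   # full rank D thanks to the added diagonal

_ = np.linalg.cholesky(sigma2)         # succeeds: strictly positive definite
try:
    np.linalg.cholesky(sigma1)         # typically fails: only positive semi-definite
except np.linalg.LinAlgError as e:
    print('sigma1 is not positive definite:', e)
```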

Introduction to Gaussian processes

* $x,x' \in \mathcal{X}$ : input domain
* $k(x,x')$ : covariance function
* $\mathbf{X}$ : training inputs
* $\mathbf{K} = k(\mathbf{X}, \mathbf{X})$ : covariance matrix
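
To make this notation concrete, here is how it maps onto the code in this article (an illustrative sketch; `cov_RBF` is the RBF covariance function implemented further below):

```python
import numpy as np

X = np.linspace(0, 5, 10)[:, None]        # training inputs X, shape (N, Q=1)
X_star = np.linspace(0, 5, 50)[:, None]   # test inputs x_*

theta = np.array([1.0, 1.0])              # [variance, lengthscale]
K = cov_RBF(X, X, theta=theta)            # K = k(X, X), the (10, 10) covariance matrix
K_star = cov_RBF(X, X_star, theta=theta)  # cross-covariance k(X, X_*), shape (10, 50)
```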

The joint distribution of $\mathbf{f_A}$ and $\mathbf{f_B}$

Joint

Let's start with a multivariate Gaussian. Assume that we have a random variable $\mathbf{f}$ which follows a multivariate Gaussian, and we partition its dimensions into two sets, $A,B$. Then, the joint distribution can be written as:

```math
p(\underbrace{f_1, f_2, \cdots, f_s}_{\mathbf{f}_A}, \underbrace{f_{s+1}, f_{s+2}, \cdots, f_N}_{\mathbf{f}_B}) \sim \mathcal{N}(\boldsymbol \mu, \mathbf{K}).
```

with:
```math
\boldsymbol \mu =
\begin{bmatrix}
\boldsymbol \mu_A \\
\boldsymbol \mu_B
\end{bmatrix}
\; \; \text{and} \; \;
\mathbf{K} =
\begin{bmatrix}
\mathbf{K}_{A A} & \mathbf{K}_{A B} \\
\mathbf{K}_{B A} & \mathbf{K}_{B B}
\end{bmatrix}
```

The marginal distribution of $\mathbf{f_A}$

Marginal

And the *marginal* distribution can be written as:

```math
\begin{eqnarray}
p(\mathbf{f}_A, \mathbf{f}_B)  \sim \mathcal{N}(\boldsymbol \mu, \mathbf{K}). \text{ Then:} \\
p(\mathbf{f}_A)  = \int_{\mathbf{f}_B} p(\mathbf{f}_A, \mathbf{f}_B) \,\text{d} \mathbf{f}_B =
       \mathcal{N}(\boldsymbol \mu_A, \mathbf{K}_{A A})
\end{eqnarray}
```
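
A quick numerical illustration of the marginalization property (my own sketch, not from the original notebook): if we build a 3-dimensional Gaussian and only ever look at the first two dimensions, the samples behave exactly as if they had been drawn from the 2D Gaussian with mean $\boldsymbol\mu_A$ and covariance $\mathbf{K}_{AA}$, i.e. the corresponding sub-block of the full covariance:

```python
import numpy as np

mu_full = np.zeros(3)
K_full = np.array([[1.0, 0.7, 0.2],
                   [0.7, 1.0, 0.4],
                   [0.2, 0.4, 1.0]])

# Partition into A = dimensions {0, 1} and B = dimension {2}
mu_A, K_AA = mu_full[:2], K_full[:2, :2]

samples = np.random.multivariate_normal(mu_full, K_full, size=100000)
print(np.cov(samples[:, :2].T))  # empirical covariance of f_A, close to K_AA
print(K_AA)
```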

The marginalization property means that the training data that we have and any (potentially infinite in number) test data $f_{\ast}$ that we have not seen (yet) jointly follow a (potentially infinite-dimensional) Gaussian distribution with mean and covariance:

```math
\boldsymbol \mu_{\infty} =
\begin{bmatrix}
  \boldsymbol \mu_{\mathbf{X}} \\
  \cdots \\
  \cdots
\end{bmatrix}
\; \; \text{and} \; \;
\mathbf{K}_{\infty} =
\begin{bmatrix}
  \mathbf{K}_{\mathbf{X} \mathbf{X}} & \cdots \\
  \cdots & \cdots
\end{bmatrix}
```

where $\mathbf{X}$ denotes the training inputs and $\mathbf{K}_{XX}$ is the covariance matrix constructed by evaluating the covariance *function* at all given inputs.

The joint distribution over $\mathbf{X}$ and $\mathbf{X}_{\ast}$ when $\boldsymbol{\mu}=0$

So, in the Gaussian process case (assuming 0 mean) we have a joint Gaussian distribution of the training and the (potentially infinite!) test data:

```math
\begin{bmatrix}\mathbf{f} \\ \mathbf{f}^{\ast}\end{bmatrix} \sim \mathcal{N}\left(\mathbf{0}, \begin{bmatrix} \mathbf{K} & \mathbf{K}_\ast \\ \mathbf{K}_\ast^\top & \mathbf{K}_{\ast,\ast}\end{bmatrix}\right)
```

Here, $\mathbf{K}_\ast$ is the (cross-)covariance matrix obtained by evaluating the covariance function on pairs of training inputs $\mathbf{X}$ and test inputs $\mathbf{X}_{\ast}$, i.e.

```math
\mathbf{K}_\ast = k(\mathbf{X}, \mathbf{X}_{\ast}) .
```

Similarly:

```math
\mathbf{K}_{\ast\ast} = k(\mathbf{X}_{\ast}, \mathbf{X}_{\ast}) .
```

Formulating the conditional distribution $p(\mathbf{f_A}|\mathbf{f_B})$ of $\mathbf{f_A}$ given $\mathbf{f_B}$

Strictly speaking, $f$ is a random variable, but here we use it as an evaluator (i.e. to denote function evaluations).

Given a finite set of inputs $\mathbf{X}=[x_1,x_2,\cdots,x_N]$, we can obtain the function value at any $x_{\ast} \in \mathbb{R}$ through this conditional distribution.

Conditional

Interestingly, conditioning a multivariate Gaussian to obtain the posterior distribution also yields a Gaussian:
Again, if
```math
p(\mathbf{f}_A, \mathbf{f}_B) \sim \mathcal{N}(\boldsymbol \mu, \mathbf{K}). \; \; \text{Then:} \\
p(\mathbf{f}_A | \mathbf{f}_B) = \mathcal{N}(\boldsymbol \mu_A + \mathbf{K}_{AB} \mathbf{K}^{-1}_{BB} (\mathbf{f}_B - \boldsymbol \mu_B), \; \mathbf{K}_{AA} - \mathbf{K}_{AB}\mathbf{K}_{BB}^{-1}\mathbf{K}_{BA})
```

In the GP context this can be used for inter/extrapolation. Assume that we have a function $f$ with input domain $\mathcal{X} = \mathbb{R}$ and we place a GP prior on $f$ (so, now we use $f$ to denote function evaluations, rather than random variables). Also assume that we have a training set $\mathbf{X} = [x_1, x_2, \dots x_N]$. Then, we can condition on the function outputs evaluated on the training set in order to perform inference for the function value at *any* input location $x_{\ast} \in \mathbb{R}$. This conditioning means finding the GP posterior *process*:

```math
p(\mathbf{f_{\ast}} | \mathbf{f_1}, \cdots, \mathbf{f_N}) =
p(f(x_{\ast}) | f(x_1), \cdots, f(x_N))
\sim \mathcal{N}(\mathbf{K}_{\ast}^\top \mathbf{K}^{-1} \mathbf{f}\; , \; \mathbf{K}_{{\ast},{\ast}} - \mathbf{K}_{\ast}^\top \mathbf{K}^{-1} \mathbf{K}_{\ast})
```
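
To make the conditioning step concrete, here is a minimal numpy sketch of the noise-free posterior above (my own illustration, using a toy RBF covariance rather than the article's helper functions):

```python
import numpy as np

def rbf(A, B, variance=1.0, lengthscale=1.0):
    """Toy RBF covariance between column vectors A (N,1) and B (M,1)."""
    sqdist = (A - B.T) ** 2
    return variance * np.exp(-0.5 * sqdist / lengthscale ** 2)

X = np.linspace(0, 5, 8)[:, None]         # training inputs
f = np.sin(X)                             # observed (noise-free) function values
X_star = np.linspace(0, 5, 100)[:, None]  # test inputs

K = rbf(X, X) + 1e-8 * np.eye(len(X))     # add jitter for numerical stability
K_star = rbf(X, X_star)                   # k(X, X_*)
K_ss = rbf(X_star, X_star)                # k(X_*, X_*)

# Posterior mean K_*^T K^{-1} f and covariance K_** - K_*^T K^{-1} K_*
alpha = np.linalg.solve(K, f)
mu_post = K_star.T @ alpha
cov_post = K_ss - K_star.T @ np.linalg.solve(K, K_star)
```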

Remember, the test inputs $\mathbf{X}_{\ast}$ appear in the above expression inside $\mathbf{K}_{\ast}$ and $\mathbf{K}_{{\ast}{\ast}}$.

Noise model

As is standard in probabilistic regression, we assume a noise model. We take:

```math
y = f(x) + \epsilon
```

where:

```math
f \sim \mathcal{GP}(0, k(x,x'))
```

and

```math
\epsilon \sim \mathcal{N}(0,\sigma^2 I) \; \; \; \; \; \; \; \; \; (1)
```

where non-bold symbols now denote single elements from the training vectors.

The *covariance function* $k(x,x')$ is a function which takes as inputs pairs in the input domain and returns their covariance. By $k(\mathbf{X},\mathbf{X})$ we mean that we evaluate the covariance function on the whole training set, $\mathbf{X}$, and this gives us back a covariance matrix.

The assumption about Gaussian noise says that the *training data* $(x,y) \in (\mathbf{X}, \mathbf{Y})$ are related by a function $f$ whose output is then corrupted by Gaussian noise (i.e. we have noisy observations).
The above construction gives us the following probabilities:

```math
p(\mathbf{y}|\mathbf{f}) = \mathcal{N}(\mathbf{y}|\mathbf{f}, \sigma^2 \mathbf{I})
```

```math
p(\mathbf{f}|\mathbf{x}) = \mathcal{N}(\mathbf{f}|\mathbf{0}, K_{ff}) = (2 \pi)^{-n/2} |K_{ff}|^{-1/2} \exp\left( -\frac{1}{2} \mathbf{f}^\top K_{ff}^{-1} \mathbf{f} \right), \quad \text{where } K_{ff} = k(\mathbf{x},\mathbf{x}) \; \; \; \; (2)
```

```math
p(\mathbf{y}|\mathbf{x}) = \int p(\mathbf{y}|\mathbf{f})\, p(\mathbf{f}|\mathbf{x}) \, d\mathbf{f} = \mathcal{N}(\mathbf{y}|\mathbf{0},K_{ff}+\sigma^{2} \mathbf{I}) \; \; \; \; (3)
```

where the last quantity is called the *marginal likelihood* and is tractable because of our choice of normally distributed noise $\epsilon$.
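
Because (3) is a Gaussian, its logarithm has a simple closed form; this is exactly the quantity implemented later in this article as `GP.likelihood()`:

```math
\log p(\mathbf{y}|\mathbf{x}) = -\frac{1}{2}\left( N \log 2\pi + \log \left|K_{ff} + \sigma^2 \mathbf{I}\right| + \mathbf{y}^\top \left(K_{ff} + \sigma^2 \mathbf{I}\right)^{-1} \mathbf{y} \right)
```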

Predictions

Now, for a test point $x_{\ast}$ we want to compute its output on the observed space, i.e. we want to compute $y_{\ast}$. Building on the noise model and the previously shown expressions, the posterior for the test outputs is given by:

```math
\mathbf{y}^{\ast} | \mathbf{y}, \mathbf{x}, \mathbf{x_{\ast}} \sim \mathcal{N}(\boldsymbol \mu_{\text{pred}},\mathbf{K}_{\text{pred}}) \; \; \; \; (4)
```

with

```math
\boldsymbol \mu_{\text{pred}} = \mathbf{K}_{\ast}^\top \left[\mathbf{K} + \sigma^2 \mathbf{I}\right]^{-1} \mathbf{y}
```

and
```math
\mathbf{K}_{\text{pred}} = \mathbf{K}_{{\ast},{\ast}} - \mathbf{K}_{\ast}^\top \left[\mathbf{K} + \sigma^2 \mathbf{I}\right]^{-1} \mathbf{K}_{\ast}.
```
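
In code, the two prediction equations above amount to a couple of linear solves. A minimal sketch, assuming `K`, `K_star`, `K_ss` and noisy targets `y` have already been built with a covariance function as in the earlier conditional example:

```python
import numpy as np

def gp_predict(K, K_star, K_ss, y, sigma2):
    """Predictive mean and covariance for the test points, i.e. equation (4)."""
    Ky = K + sigma2 * np.eye(K.shape[0])           # K + sigma^2 I
    mu_pred = K_star.T @ np.linalg.solve(Ky, y)    # K_*^T [K + sigma^2 I]^{-1} y
    K_pred = K_ss - K_star.T @ np.linalg.solve(Ky, K_star)
    return mu_pred, K_pred
```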

Covariance functions, aka kernels

We saw above their role in creating covariance matrices from training inputs, thereby allowing us to work with finite objects (covariance matrices) even when the input domain is potentially infinite.

We'll see below that the covariance function is what encodes our assumption about the GP. By selecting a covariance function, we are making implicit assumptions about the shape of the function we wish to encode with the GP, for example how smooth it is.

Even if the covariance function has a parametric form, combined with the GP it gives us a nonparametric model. In other words, the covariance function is specifying the general properties of the GP function we wish to encode, and not a specific parametric form for it.

Below we define two very common covariance functions: The RBF (also known as Exponentiated Quadratic or Gaussian kernel) which is differentiable infinitely many times (hence, very smooth), and the linear one:

```math
k_{RBF}(\mathbf{x}_{i,:},\mathbf{x}_{j,:}) = \sigma^2 \exp \left( -\frac{1}{2\ell^2} \sum_{q=1}^Q (x_{i,q} - x_{j,q})^2\right)
```

where $Q$ denotes the dimensionality of the input space. Its parameters are the *lengthscale* $\ell$ and the variance $\sigma^2$.

```math
k_{lin}(\mathbf{x}_{i,:},\mathbf{x}_{j,:}) = \sigma^2 \mathbf{x}_{i,:}^T \mathbf{x}_{j,:}
```

Its only parameter is the variance $\sigma^2$.

Below, we will implement and investigate them.

Visualizing covariance matrices

Defining covariance function forms

def cov_linear(x,x2=None,theta=1):
    if x2 is None:
        return np.dot(x, x.T)*theta
    else:
        return np.dot(x, x2.T)*theta


def cov_RBF(x, x2=None, theta=np.array([1,1])):
    """
    Compute the Euclidean distance between each row of X and X2, or between
    each pair of rows of X if X2 is None, and feed it to the kernel.
    """
    variance    = theta[0]
    lengthscale = theta[1]
    if x2 is None:
        xsq = np.sum(np.square(x),1)
        r2 = -2.*np.dot(x,x.T) + (xsq[:,None] + xsq[None,:])
    else:
        x1sq = np.sum(np.square(x),1)
        x2sq = np.sum(np.square(x2),1)
        r2 = -2.*np.dot(x, x2.T) + x1sq[:,None] + x2sq[None,:]
    # Guard against tiny negative values caused by floating-point round-off
    r2 = np.clip(r2, 0, np.inf)
    r = np.sqrt(r2)/lengthscale

    return variance * np.exp(-0.5 * r**2)
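
A quick sanity check of these two functions (my own usage example): the returned matrices should be symmetric, and for the RBF the diagonal should equal the variance parameter, since $k(x,x) = \sigma^2$:

```python
import numpy as np

x = np.random.rand(5, 1)

K_rbf = cov_RBF(x, theta=np.array([2.0, 0.5]))  # variance=2, lengthscale=0.5
K_lin = cov_linear(x, theta=1.5)

print(np.allclose(K_rbf, K_rbf.T), np.allclose(K_lin, K_lin.T))  # both symmetric
print(np.allclose(np.diag(K_rbf), 2.0))                          # diagonal equals the variance
```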

Experimenting with covariance function parameters

Visualizing the covariance matrix

First, let's look at $\mathbf{X}$ (400 points, so the resulting Gaussian is 400-dimensional).

X = np.sort(np.random.rand(400, 1) * 6 , axis=0)

plt.figure(figsize=(8,3))

plt.subplot(121)
plt.scatter(range(len(X)),X,s=0.1)
plt.xlabel('index')
plt.ylabel('value')
plt.title('$\mathbf{X}$')

plt.subplot(122)
sns.heatmap(np.dot(X,X.T))
plt.title('$\mathbf{X}\cdot \mathbf{X}^T$')

plt.show()

image.png

Keep the inputs $\mathbf{X}$ fixed and vary the kernel parameters.

For the RBF kernel, fix the variance (var) at 1 and vary the lengthscale.

For the linear kernel, the var parameter scales the overall variance.

params_linear = [0.01, 0.05, 1, 2, 4, 10]
params_rbf    = [0.005, 0.1, 1, 2, 5, 12]
K = len(params_linear)

plt.figure(figsize=(25,25))
#plt.subplots_adjust(wspace=0.4)
j=1
for i in range(K):
    plt.subplot(K,2,j)
    K_rbf = cov_RBF(X,X,theta=np.array([1,params_rbf[i]]))
    plt.imshow(K_rbf)
    plt.colorbar()
    plt.gca().set_title('RBF (l=' + str(params_rbf[i]) + ')')

    plt.subplot(K,2,j+1)
    K_lin = cov_linear(X,X,theta=params_linear[i])
    plt.imshow(K_lin)
    plt.colorbar()
    plt.gca().set_title('Lin (var=' + str(params_linear[i]) + ')')

    j+=2

plt.suptitle('RBF (left) and Linear (right) cov. matrices created with different parameters', fontsize=20)
plt.show()

image.png

Sampling with the RBF kernel

Sampling GP instances from covariance functions

Given hyperparameters l, we plot the resulting cov. matrix and samples from a GP with this cov. function.

Sampling from a GP with the RBF kernel. A small lengthscale gives weak correlations; as the lengthscale grows, the correlations become stronger and the sampled curves become smoother.

num_samples=5
plt.figure(figsize=(25,25))
j=1
for i in range(K):
    plt.subplot(K,2,j)
    K_rbf = cov_RBF(X,X,theta=np.array([1,params_rbf[i]]))
    plt.imshow(K_rbf)
    plt.colorbar()
    plt.gca().set_title('RBF Cov. Matrix (l=' + str(params_rbf[i]) + ')')

    plt.subplot(K,2,j+1)
    # Assume a GP with zero mean
    mu=np.zeros((1,K_rbf.shape[0]))[0,:]
    for s in range(num_samples):
        # Jitter is a small noise addition to the diagonal to ensure positive definiteness
        jitter = 1e-5*np.eye(K_rbf.shape[0])
        sample = np.random.multivariate_normal(mean=mu, cov=K_rbf+jitter)
        plt.plot(sample)
    plt.gca().set_title('GP Samples from RBF (l=' + str(params_rbf[i]) + ')')
    j+=2

image.png

As we see, short lengthscales give very small correlations in the cov. matrix, resulting in very non-smooth functions. Conversely, long lengthscales result in very strongly correlated matrices and, therefore, very smooth functions.
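
As a concrete number (my own example): for two inputs a distance of 1 apart, a unit-variance RBF kernel gives correlation $\exp(-\tfrac{1}{2\ell^2})$, which is essentially 0 for $\ell = 0.1$ but about 0.88 for $\ell = 2$:

```python
import numpy as np

for l in [0.1, 0.5, 1.0, 2.0]:
    # RBF covariance between two points at distance 1, with unit variance
    print(l, np.exp(-0.5 / l**2))
```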

Other covariance functions

As we saw, each covariance function can be parameterized and (as we'll see next) adapted to the data. It is important, however, to start from a reasonable cov. function if we have some knowledge about our data. Importantly, by doing so we make our modelling assumptions explicit, which is a good principle in Bayesian statistics.

There are other covariance functions apart from the linear and the RBF. There is a very good GPy tutorial on this.

We can get a quick taste:

Checking other kernels with GPy

import GPy

Q=1
x_tmp = np.linspace(0,10,200)[:,None]

plt.figure(figsize=(40,40))
i=1
plt.subplot(6,2,i); i+=1
kern = GPy.kern.RBF(Q)
kern.plot(ax=gca());  plt.gca().set_title('RBF', fontsize=30); 
plt.subplot(6,2,i); i+=1
plt.imshow(kern.K(x_tmp)); plt.gca().set_title('Cov matrix:' + kern.name, fontsize=30)
plt.subplot(6,2,i); i+=1

kern = GPy.kern.Matern32(Q)
kern.plot(ax=gca()); plt.gca().set_title('Matern 3/2', fontsize=30);
plt.subplot(6,2,i); i+=1
plt.imshow(kern.K(x_tmp)); plt.gca().set_title('Cov matrix:' + kern.name, fontsize=30)
plt.subplot(6,2,i); i+=1

kern = GPy.kern.PeriodicExponential(Q,period=np.pi/2)
kern.plot(ax=gca()); plt.gca().set_title('Periodic', fontsize=30);
plt.subplot(6,2,i); i+=1
plt.imshow(kern.K(x_tmp)); plt.gca().set_title('Cov matrix:' + kern.name, fontsize=30)
plt.subplot(6,2,i); i+=1

kern = GPy.kern.Bias(Q)
kern.plot(ax=gca()); plt.gca().set_title('Bias (constant)', fontsize=30);
plt.subplot(6,2,i); i+=1
plt.imshow(kern.K(x_tmp)); plt.gca().set_title('Cov matrix:' + kern.name, fontsize=30)
plt.subplot(6,2,i); i+=1


# We can even add kernels and get a new one...
kern = GPy.kern.RBF(Q) + GPy.kern.Linear(Q,variances=0.1)
kern.plot(ax=gca()); plt.gca().set_title('RBF + linear', fontsize=30)
plt.subplot(6,2,i); i+=1
plt.imshow(kern.K(x_tmp)); plt.gca().set_title('Cov matrix:' + kern.name, fontsize=30)
plt.subplot(6,2,i); i+=1


# ...or even multiply them!
kern = GPy.kern.RBF(Q) * GPy.kern.PeriodicExponential(Q,period=np.pi/2)
kern.plot(ax=gca()); plt.gca().set_title('RBF x Periodic', fontsize=30)
plt.subplot(6,2,i); i+=1
plt.imshow(kern.K(x_tmp)); plt.gca().set_title('Cov matrix:' + kern.name, fontsize=30)

plt.suptitle('Cov. function form (left) and Sample cov. matrix (right)', fontsize=40)
plt.show()

image.png

Prediction from training data

"Full" GP Implementation!

Let's finally implement our GP framework.

class CovFunctions(object):
    """
    A wrapper for covariance functions which is compatible with the GP class.
    """
    def __init__(self, covType, theta):
        self.covType = covType
        self.theta = theta
        if covType == 'linear':
            self.compute = self.linear
        elif covType == 'RBF':
            self.compute = self.RBF

    def set_theta(self, theta):

        self.theta = theta

    def get_theta(self):
        return self.theta

    def linear(self, x,x2=None):
        return cov_linear(x,x2,self.theta)

    def RBF(self, x, x2=None):
        return cov_RBF(x,x2,self.theta)


class GP(object):
    def __init__(self, X, Y, sigma2, covType, theta):
        self.X = X
        self.Y = Y
        self.N = self.X.shape[0]
        self.sigma2 = sigma2
        self.kern = CovFunctions(covType, theta)
        self.K = self.kern.compute(X)
        # Force computations
        self.update_stats()

    def get_params(self):
        return np.hstack((self.sigma2, self.kern.get_theta()))

    def set_params(self, params):
        self.sigma2 = params[0]
        self.kern.set_theta(params[1:])

    def update_stats(self):
        self.K = self.kern.compute(self.X)
        self.Kinv = np.linalg.inv(self.K+self.sigma2*np.eye(self.N))
        self.logdet = np.log(np.linalg.det(self.K+self.sigma2*np.eye(self.N)))
        self.KinvY = np.dot(self.Kinv, self.Y)
        # Equivalent to: np.trace(np.dot(self.Y, self.KinvY.T))
        self.YKinvY = (self.Y*self.KinvY).sum()

    def likelihood(self):
        """ 
        That's actually the logarithm of equation (3)
        Since logarithm is a convex function, maximum likelihood and maximum log likelihood wrt parameters
        would yield the same solutuion, but logarithm is better to manage mathematically and
        numerically.
        """
        return -0.5{\ast}(self.N{\ast}np.log(2{\ast}np.pi) + self.logdet + self.YKinvY)

    def posterior(self, x_star):
        """
        Implements equation (4)
        """
        self.update_stats()
        K_starstar = self.kern.compute(x_star, x_star)
        K_star = self.kern.compute(self.X, x_star)
        KinvK_star = np.dot(self.Kinv, K_star)
        mu_pred = np.dot(KinvK_star.T, self.Y)
        K_pred = K_starstar - np.dot(KinvK_star.T, K_star)
        return mu_pred, K_pred

    def objective(self,params):
        self.set_params(params)
        self.update_stats()
        return -self.likelihood()

Testing the framework:

We'll create some sample 1D data as a draw from a GP with RBF covariance and lengthscale = 0.85. Next, we normalize the data and keep some for testing. We then plot the data.

Generate data from an RBF kernel with lengthscale = 0.85

N=22 # number of training points
Nstar = 70 # number of test points

# create toy data
X = np.sort(np.random.rand(N+Nstar, 1) * 6 , axis=0)

K_rbf = cov_RBF(X,X,theta=np.array([1,0.85]))
mu=np.zeros((1,K_rbf.shape[0]))[0,:]
jitter = 1e-5*np.eye(K_rbf.shape[0])
Y = np.random.multivariate_normal(mean=mu, cov=K_rbf+jitter)[:,None]
#Y = np.sin(X*2) + np.random.randn(*X.shape) * 0.008

# split data into training and test set
perm = np.random.permutation(X.shape[0])
Xtr = X[np.sort(perm[0:N],axis=0),:]
Ytr = Y[np.sort(perm[0:N],axis=0),]
X_star = X[np.sort(perm[N:N+Nstar],axis=0),:]
Y_star = Y[np.sort(perm[N:N+Nstar],axis=0),:]

# Normalize data to be 0 mean and 1 std
Ymean = Ytr.mean()
Ystd = Ytr.std()
Ytr-=Ymean
Ytr/=Ystd
Y_star -= Ymean
Y_star /= Ystd

# plot data
plt.plot(X,(Y-Ymean)/Ystd, 'k-')
plt.plot(Xtr,Ytr, 'b<',label='training')
plt.plot(X_star,Y_star, 'ro',label='test')
plt.legend()
plt.show()

image.png

In general, we want a function that plots the fit showing the mean and variance:

def plot_fit(x,y,mu,var):
    """
    Plot the fit of a GP
    """
    plt.plot(x,y, 'k-o',label='true')
    plt.plot(x,mu, 'b-<',label='predicted')
    plt.plot(x, mu+2*np.sqrt(var), 'r--',label='var')
    plt.plot(x, mu-2*np.sqrt(var), 'r--')
    plt.legend()

Now we'll define two GP models, one with linear and one with RBF cov. function. We'll use them to predict the training and the test data.

# Define GP models with initial parameters
g_lin = GP(Xtr, Ytr, 0.1, 'linear', 2)
g_rbf = GP(Xtr, Ytr, 0.1, 'RBF', np.array([1,2]))

# Get the posterior of the two GPs on the *training* data
mu_lin_tr,var_lin_tr = g_lin.posterior(Xtr)
mu_rbf_tr,var_rbf_tr = g_rbf.posterior(Xtr)

# Get the posterior of the two GPs on the *test* data
mu_lin_star,var_lin_star = g_lin.posterior(X_star)
mu_rbf_star,var_rbf_star = g_rbf.posterior(X_star)

# Plot the fit of the two GPs on the training and test data
f=figure(figsize=(20,5))
plt.subplots_adjust(hspace=0.4)
ax=plt.subplot(2, 2, 1); ax.set_aspect('auto')
plot_fit(Xtr, Ytr, mu_lin_tr, np.diag(var_lin_tr)[:,None])
plt.gca().set_title('Linear, training')

ax=plt.subplot(2, 2, 2); ax.set_aspect('auto')
plot_fit(X_star, Y_star, mu_lin_star, np.diag(var_lin_star)[:,None])
plt.gca().set_title('Linear, test')

ax=plt.subplot(2, 2, 3); ax.set_aspect('auto')
plot_fit(Xtr, Ytr, mu_rbf_tr, np.diag(var_rbf_tr)[:,None])
plt.gca().set_title('RBF (l=' +  str(g_rbf.kern.get_theta()[1]) + '), training')

ax=plt.subplot(2, 2, 4); ax.set_aspect('auto')
plot_fit(X_star, Y_star, mu_rbf_star, np.diag(var_rbf_star)[:,None])
plt.gca().set_title('RBF, test')
plt.show()

image.png

Even if we did not optimize the GPs, we see that both do a reasonably good job of fitting the data. This is because the GP is a nonparametric model, in the sense that the data themselves act as additional (constant) parameters. Indeed, you can see that the posterior GP is given by an expression in which the training data appear directly.

Let's convince ourselves that there is actually something to learn in the GP to get an even better fit. In the following, we'll test the fit of the RBF kernel for various choices of the lengthscale.

Remember that the true lengthscale generating the data is 0.85, but we'd expect to recover it exactly only with infinite data. Otherwise, something close to it is also expected.

In the next plot we show the fit for each lengthscale, and the likelihood achieved by it.

Recap: a small lengthscale makes GP samples only weakly correlated with each other, whereas a large lengthscale makes them strongly correlated, so the parallel-coordinates plot becomes a smooth curve. The behaviour looks quite different between lengthscale = 0.1 and 1.

Compute the likelihood for each lengthscale. In this example, the variance of the test predictions differs considerably between lengthscale = 0.2 and 0.85.

f=figure(figsize=(20,28)) 
i = 1
# Following array holds different lengthscales to test one by one
test_l=[0.008, 0.05, 0.2, 0.85, 1.5, 2, 6, 12]
for l in test_l:
    g_rbf = GP(Xtr, Ytr, 0.1, 'RBF', np.array([1,l]))   # train
    mu_rbf_tr  , var_rbf_tr   = g_rbf.posterior(Xtr)    # train data
    mu_rbf_star, var_rbf_star = g_rbf.posterior(X_star) # test  data

    ax=plt.subplot(len(test_l), 2,i); ax.set_aspect('auto')
    plot_fit(Xtr, Ytr, mu_rbf_tr, np.diag(var_rbf_tr)[:,None])
    ll = g_rbf.likelihood()
    plt.gca().set_title('RBF (l=' +  str(g_rbf.kern.get_theta()[1]) + '), training. L=' + str(ll) )

    ax=plt.subplot(len(test_l), 2, i+1); ax.set_aspect('auto')
    plot_fit(X_star, Y_star, mu_rbf_star, np.diag(var_rbf_star)[:,None])
    ll = g_rbf.likelihood()
    plt.gca().set_title('RBF (l=' +  str(g_rbf.kern.get_theta()[1]) + '), test. L=' + str(ll) )
    i+=2

image.png

Fitting and Overfitting

We see that very short lengthscales give more "wiggly" functions, which are capable of interpolating perfectly between the training points. However, such an overfitted model is biased to be more "surprised" by anything other than the exact training data... hence performing poorly on the test set.

On the other hand, high lengthscales give very flat functions, which in the limit look like the linear one and underfit the data.

So, how do we automatically find the correct lengthscale?

The principled way of selecting the correct parameter is not a visual check or testing the error in training/held out data. Instead, we wish to look at the likelihood, telling us what is the probability of the specific model (with lengthscale l) having generated the data.

To show this, we'll try as before different settings for the lengthscale of the GP-RBF, but this time we'll keep track of the likelihood and also the training/test error.

First, create some helper functions:

# Root mean squared error
def rmse(pred, truth):
    pred = pred.flatten()
    truth = truth.flatten()
    return np.sqrt(np.mean((pred-truth)**2))

# Make data 0 mean and 1 std.
def standardize(x):
    return (x-x.mean())/x.std()

test_l=np.linspace(0.01,5, 100)
ll = []
err_tr = []
err_test = []
for l in test_l:
    g_rbf = GP(Xtr, Ytr, 0.1, 'RBF', np.array([1,l]))
    g_rbf.update_stats()
    ll.append(g_rbf.likelihood())
    mu_rbf_tr,var_rbf_tr = g_rbf.posterior(Xtr)
    err_tr.append(rmse(mu_rbf_tr, Ytr))
    mu_rbf_star,var_rbf_star = g_rbf.posterior(X_star)
    err_test.append(rmse(mu_rbf_star, Y_star))
ll = standardize(np.array(ll))
err_tr = standardize(np.array(err_tr))
err_test = standardize(np.array(err_test))

Plot the lengthscale versus the likelihood, training error, and test error:

max_ll, argmax_ll = np.max(ll), np.argmax(ll)
min_err_tr, argmin_err_tr = np.min(err_tr), np.argmin(err_tr)
min_err_test, argmin_err_test = np.min(err_test), np.argmin(err_test)

plt.plot(test_l, ll,label='likelihood')
plt.plot(test_l, err_tr,label='err_tr')
plt.plot(test_l, err_test,label='err_test')
tmp_x = np.ones(test_l.shape[0])
ylim=plt.gca().get_ylim()
tmp_y = np.linspace(ylim[0],ylim[1], tmp_x.shape[0])
plt.plot(tmp_x*test_l[argmax_ll], tmp_y,'--', label='maxL')
plt.plot(tmp_x*test_l[argmin_err_tr], tmp_y,'--', label='minErrTr')
plt.plot(tmp_x*test_l[argmin_err_test], tmp_y,'--', label='minErrTest')

plt.legend()
plt.xlabel('lengthscale')
plt.ylabel('value')
plt.show()

print('Best lengthscale according to likelihood    :' + str(test_l[argmax_ll]))
print('Best lengthscale according to training error:' + str(test_l[argmin_err_tr]))
print('Best lengthscale according to test error    :' + str(test_l[argmin_err_test]))

image.png

Best lengthscale according to likelihood    :0.7660606060606061
Best lengthscale according to training error:0.4132323232323233
Best lengthscale according to test error    :0.4636363636363637

The fact that the best lengthscale (according to the likelihood) is not necessarily the one giving the lowest training error is a good property, i.e. the model avoids overfitting.

The chosen lengthscale is also quite close to the true one. Running the experiment with more data would reveal an even closer match (bear in mind that numerical issues in this simple implementation might affect the computations).

GPy

One way to determine the hyperparameters is to find the maximum likelihood solution using gradient-based optimization.

To properly optimize the kernel, we want to avoid exhaustive search of the correct parameter, especially since there are also other parameters which above we considered fixed.

The way to optimize the GP according to maximum likelihood is by using gradient information.
As a "homework", feel free to implement the gradients of the GP wrt parameters. Then, by feeding the functions of the gradients and the function of the objective (already implemented in GP.objective) to a gradient optimizer you will get automatically the result. Check the function optimize of scipy.

To save some time and effort here, we'll demonstrate the optimization with GPy [http://github.com/SheffieldML/GPy], a GP software package written in Python.

Training with the RBF kernel

# Import the Gaussian process Python package
import GPy

Q = Xtr.shape[1]
k = GPy.kern.RBF(Q)
m = GPy.models.GPRegression(X=Xtr, Y=Ytr, kernel=k)

Print params and plot unoptimized model

print(m)
m.plot()
Name : GP regression
Objective : 26.578998228962337
Number of Parameters : 3
Number of Optimization Parameters : 3
Updates : True
Parameters:
  GP_regression.           |  value  |  constraints  |  priors
  rbf.variance             |    1.0  |      +ve      |
  rbf.lengthscale          |    1.0  |      +ve      |
  Gaussian_noise.variance  |    1.0  |      +ve      |






image.png

Optimize using gradient method

m.optimize(messages=1)

Print params and plot optimized model

Observe how the optimized lengthscale is close to the one we found before by grid search and even closer to the true one.

print(m)
m.plot()
Name : GP regression
Objective : -44.021574988285664
Number of Parameters : 3
Number of Optimization Parameters : 3
Updates : True
Parameters:
  GP_regression.           |                  value  |  constraints  |  priors
  rbf.variance             |      1.201694789009845  |      +ve      |
  rbf.lengthscale          |     0.9078772857387518  |      +ve      |
  Gaussian_noise.variance  |  8.829089785857163e-06  |      +ve      |






image.png

Predict test data

mu,var = m.predict(X_star)
plot_fit(X_star, Y_star, mu, var)

image.png
