Introduction
- I implemented the computation from the earlier article "A classification model based on Gaussian processes" in R, Python, and Julia and compared the processing times.
- For good measure, I also implemented it in C# and C++.
- The posterior distribution is approximated with a Laplace approximation, and we search for the parameters that maximize the marginal likelihood (the evidence; the formula is sketched after this list).
- For the optimizer I chose the Nelder-Mead algorithm, which needs no gradient information.
- Environment: Windows 10 Pro, 16.0 GB RAM, i7-8550U CPU @ 1.80GHz
- The Julia Programming Language
- JuliaNLSolvers/Optim.jl
- [SciPy](https://www.scipy.org/index.html)
- Accord.NET Framework
- Math.NET Numerics
- Eigen: C++ template library for linear algebra
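For reference, the quantity every implementation maximizes is the Laplace-approximated log marginal likelihood of a Gaussian process probit classifier, as in Rasmussen & Williams' GPML (Algorithm 3.1). In the notation shared by all five listings below:

$$
\log q(\mathbf{y} \mid X, \theta) = -\frac{1}{2}\hat{\mathbf{f}}^{\top} K^{-1} \hat{\mathbf{f}} + \sum_{i=1}^{n} \log \Phi(y_i \hat{f}_i) - \frac{1}{2}\log\lvert B\rvert, \qquad B = I + W^{1/2} K W^{1/2},
$$

where $K$ is the kernel matrix with entries $k(\mathbf{x}, \mathbf{x}') = \sigma^2 \exp\!\left(-\lVert \mathbf{x} - \mathbf{x}' \rVert^2 / (2\,\mathrm{scale}^2)\right)$ (parameters `k_sigma` and `k_scale`, optimized on the log scale), $\hat{\mathbf{f}}$ is the posterior mode, $W = -\nabla\nabla \log p(\mathbf{y} \mid \hat{\mathbf{f}})$, and $\Phi$ is the standard normal CDF. In the code, $\hat{\mathbf{f}}^{\top} K^{-1}\hat{\mathbf{f}}$ is computed as `sum(a * f)` with $\mathbf{a} = K^{-1}\hat{\mathbf{f}}$, and $\tfrac{1}{2}\log\lvert B\rvert$ as the sum of the log diagonal of the Cholesky factor of $B$.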
Results
Taking the number of objective-function evaluations into account, Julia is the fastest, followed by C++, Python, R, and C#.
Language (environment) | Time (sec) | Calls | Sec/call | Ratio (vs. Julia) |
---|---|---|---|---|
Julia(1.1.0) | 10.86 | 70 | 0.155 | 1 |
C++(VS2017) | 8.51 | 47 | 0.181 | 1.17 |
Python(3.6.7/WSL) | 13.06 | 59 | 0.221 | 1.43 |
R(3.5.3) | 38.6 | 47 | 0.822 | 5.3 |
C#(VS2017) | 47.6 | 47 | 1.013 | 6.5 |
When I first ran the Python version, CPU utilization hit 100% and the run took about 24 seconds. Restricting it to a single thread with `OMP_NUM_THREADS=1` made it faster (the script below sets this via `os.environ` before importing NumPy).
C# is by far the slowest, but it speeds up dramatically when you install the Intel Math Kernel Library (MKL) provider with `Install-Package MathNet.Numerics.MKL.Win-x64`.
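Installing the package alone is not enough; Math.NET also has to be switched over to the native provider at startup. A minimal sketch (`Control.UseNativeMKL()` throws if the native binaries cannot be found):

```csharp
using MathNet.Numerics;

class Program
{
    static void Main()
    {
        // Swap the managed linear algebra provider for the native MKL one.
        Control.UseNativeMKL();
    }
}
```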
Language (environment) | Time (sec) | Calls | Sec/call | Ratio (vs. Julia) |
---|---|---|---|---|
C++(VS2017)+MKL | 4.58 | 47 | 0.097 | 0.63 |
C#+MKL | 6.59 | 47 | 0.140 | 0.90 |
R Open(3.5.2)+MKL | 7.51 | 47 | 0.160 | 1.03 |
Python(3.7.3)+MKL | 10.9 | 59 | 0.184 | 1.19 |
Microsoft R Open uses MKL by default; the thread count was limited to 1. Installing Python through Miniconda is fast because its NumPy is built against MKL. The fastest combination is C++/Eigen/MKL. Julia+MKL was not evaluated, since using MKL there requires building Julia from source. The bulk of the work is the Cholesky decomposition of a 767x767 matrix, which is why MKL has such a large effect.
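To check how much of the per-call time the factorization accounts for, one can time it in isolation; a rough sketch in Python (the matrix here is a synthetic SPD stand-in, and the numbers depend entirely on the BLAS backend):

```python
import time
import numpy as np

n = 767
A = np.random.rand(n, n)
B = A @ A.T + n * np.eye(n)  # synthetic symmetric positive-definite matrix

start = time.time()
for _ in range(100):
    np.linalg.cholesky(B)
print("sec per factorization:", (time.time() - start) / 100)
```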
Sample data
USPS handwritten digit data
http://www.gaussianprocess.org/gpml/data/
R
## Read the data
library("R.matlab")
mat_data <- readMat('usps_resampled.mat')
index3 = (mat_data$train.labels[3+1,] == 1)
index5 = (mat_data$train.labels[5+1,] == 1)
y_train = ifelse(index3,1,-1)[index3 | index5]
x_train = mat_data$train.patterns[,index3 | index5]
## Kernel function and sigmoid function
# Squared Euclidean distances between all pairs of training columns
x_train_L2 = apply(x_train, 2, function(a){ colSums((x_train - a)^2) })
my_kernel_train = function(k_sigma, k_scale){
  exp(x_train_L2 / (-2 * k_scale^2)) * k_sigma^2
}
my_sigmoid = function(y, f){
  pnorm(y * f)
}
my_sigmoid_log_1st = function(y, f, prob){
  y * dnorm(f) / prob
}
my_sigmoid_log_2nd = function(y, f, prob){
  a = dnorm(f) / prob
  a * (a + y * f)
}
## Maximize the marginal likelihood
my_evidence_optim = function(par){
  k_scale = exp(par[1])
  k_sigma = exp(par[2])
  K11 = my_kernel_train(k_sigma, k_scale)
  n = length(y_train)
  f = seq(from=0, to=1, length.out=n)
  # Newton iteration for the posterior mode f-hat
  while(TRUE){
    prob = my_sigmoid(y_train, f)
    V1 = my_sigmoid_log_1st(y_train, f, prob)  # gradient of log p(y|f)
    V2 = my_sigmoid_log_2nd(y_train, f, prob)  # W: negative Hessian diagonal
    V2q = sqrt(pmax(V2, 0))
    B = diag(n) + t(K11 * V2q) * V2q           # B = I + W^(1/2) K W^(1/2)
    U = chol(B) # t(U) %*% U
    b = V2 * f + V1
    a = b - backsolve(U, forwardsolve(t(U), (K11 %*% b) * V2q)) * V2q
    f0 = f
    f = drop(K11 %*% a)
    if(max(abs(f - f0)) < 1e-5) break
  }
  # Laplace-approximated log marginal likelihood (see the formula above)
  -sum(a * f) / 2 + sum(log(prob)) - sum(log(diag(U)))
}
start = proc.time()
ret_optim = optim(c(2.85,2.35), my_evidence_optim, control=list(fnscale=-1))
proc.time() - start
ret_optim
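All five implementations share the same inner loop: a Newton iteration for the posterior mode $\hat{\mathbf{f}}$, in the numerically stable form of GPML Algorithm 3.1. In the code's variables (`V1` is the gradient of the log likelihood, `V2` the diagonal of $W$, `V2q` its elementwise square root):

$$
\mathbf{b} = W \mathbf{f} + \nabla \log p(\mathbf{y} \mid \mathbf{f}), \qquad
\mathbf{a} = \mathbf{b} - W^{1/2} B^{-1} W^{1/2} K \mathbf{b}, \qquad
\mathbf{f} \leftarrow K \mathbf{a},
$$

iterated until $\max_i \lvert f_i - f_i^{\mathrm{old}} \rvert < 10^{-5}$. The solve against $B$ reuses the Cholesky factor that later enters the $\log\lvert B\rvert$ term of the evidence.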
Julia
Unlike R, a `while` loop introduces its own variable scope. Also, I wish `sum()` would drop the summed dimension.
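A minimal sketch of the scoping rule, and the reason `evidence` is initialized before the `while` loop in the listing below:

```julia
function scope_demo()
    while true
        x = 1        # local to the while block
        break
    end
    # `x` is not defined here: the while block had its own scope
    y = 0            # created in the enclosing function scope
    while true
        y = 1        # assigns the existing outer `y`
        break
    end
    return y         # 1
end
```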
using MAT
using Distributions
using Optim
using LinearAlgebra
## Read the data
matfile = matopen("usps_resampled.mat")
train_labels = read(matfile, "train_labels")
train_patterns = read(matfile, "train_patterns")
index3 = (train_labels[3+1,:] .== 1)
index5 = (train_labels[5+1,:] .== 1)
y_train = ifelse.(index3,1,-1)[index3 .| index5]
x_train = train_patterns[:,index3 .| index5]
## Kernel function and sigmoid function
# Squared Euclidean distances between all pairs of training columns
x_train_L2 = mapslices(t -> dropdims(sum((x_train .- t) .^ 2, dims=1), dims=1), x_train, dims=1)
function my_kernel_train(k_sigma,k_scale)
    exp.(x_train_L2 / (-2 * k_scale^2)) * k_sigma^2
end
function my_sigmoid(y,f)
    cdf.(Normal(), y .* f)
end
function my_sigmoid_log_1st(y,f,prob)
    y .* pdf.(Normal(), f) ./ prob
end
function my_sigmoid_log_2nd(y,f,prob)
    a = pdf.(Normal(), f) ./ prob
    a .* (a .+ y .* f)
end
## Maximize the marginal likelihood
function my_evidence_optim(par)
    k_scale = exp(par[1])
    k_sigma = exp(par[2])
    K11 = my_kernel_train(k_sigma,k_scale)
    n = length(y_train)
    f = Array(range(0,1,length=n))
    evidence = 0.0   # initialized here so it survives the while block's scope
    # Newton iteration for the posterior mode f-hat
    while true
        prob = my_sigmoid(y_train,f)
        V1 = my_sigmoid_log_1st(y_train,f,prob)
        V2 = my_sigmoid_log_2nd(y_train,f,prob)
        V2q = sqrt.(max.(V2,0))
        B = I + K11 .* V2q .* V2q'   # B = I + W^(1/2) K W^(1/2)
        C = cholesky(Symmetric(B))
        b = V2 .* f + V1
        a = b - (C.U \ (C.L \ ((K11 * b) .* V2q))) .* V2q
        f0 = f
        f = K11 * a
        if maximum(abs.(f - f0)) < 1e-5
            evidence = -dot(a,f) / 2 + sum(log.(prob)) - log(det(C)) / 2
            break
        end
    end
    evidence
end
@time result = maximize(my_evidence_optim, [2.85,2.35], NelderMead())
Python
NumPy offers both `np.array` and `np.matrix`, and the two behave differently.
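For example, `*` is element-wise on `np.array` but matrix multiplication on `np.matrix`; the listing below sticks to `np.array` and uses `np.dot` for matrix products. A quick illustration:

```python
import numpy as np

a = np.array([[1, 2], [3, 4]])
m = np.matrix([[1, 2], [3, 4]])

print(a * a)  # element-wise:   [[ 1  4] [ 9 16]]
print(m * m)  # matrix product: [[ 7 10] [15 22]]
```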
import os
os.environ["OMP_NUM_THREADS"] = "1"  # must be set before NumPy is imported
import time
import numpy as np
from scipy import io
from scipy.stats import norm
from scipy.linalg import cholesky
from scipy.linalg import solve_triangular
from scipy.optimize import minimize
## Read the data
matdata = io.loadmat('usps_resampled.mat')
train_labels = matdata["train_labels"]
train_patterns = matdata["train_patterns"]
index3 = (train_labels[3,:]==1)
index5 = (train_labels[5,:]==1)
y_train = np.where(index3,1,-1)[np.logical_or(index3,index5)]
x_train = train_patterns[:,np.logical_or(index3,index5)]
## Kernel function and sigmoid function
# Squared Euclidean distances between all pairs of training columns
x_train_L2 = np.apply_along_axis(lambda x: np.sum(np.power(x_train-x.reshape(-1,1),2),0),0,x_train)
def my_kernel_train(k_sigma,k_scale):
    return np.exp(x_train_L2 / (-2 * k_scale * k_scale)) * (k_sigma * k_sigma)
def my_sigmoid(y,f):
    return norm.cdf(y * f)
def my_sigmoid_log_1st(y,f,prob):
    return y * norm.pdf(f) / prob
def my_sigmoid_log_2nd(y,f,prob):
    a = norm.pdf(f) / prob
    return a * (a + y * f)
## Maximize the marginal likelihood
def my_evidence_optim(par):
    k_scale = np.exp(par[0])
    k_sigma = np.exp(par[1])
    K11 = my_kernel_train(k_sigma,k_scale)
    n = len(y_train)
    f = np.linspace(0,1,n)
    evidence = 0
    # Newton iteration for the posterior mode f-hat
    while True:
        prob = my_sigmoid(y_train,f)
        V1 = my_sigmoid_log_1st(y_train,f,prob)
        V2 = my_sigmoid_log_2nd(y_train,f,prob)
        V2q = np.sqrt(np.maximum(V2,0))
        B = np.identity(n) + K11 * V2q * V2q.reshape(-1,1)  # B = I + W^(1/2) K W^(1/2)
        U = cholesky(B)  # upper triangular: U.T @ U == B
        b = V2 * f + V1
        a = b - solve_triangular(U,solve_triangular(U,np.dot(K11,b) * V2q,trans=1)) * V2q
        f0 = f
        f = np.dot(K11,a)
        if np.max(np.abs(f - f0)) < 1e-5:
            evidence = -np.dot(a,f) / 2 + np.sum(np.log(prob)) - np.sum(np.log(np.diag(U)))
            break
    return -evidence
start = time.time()
par = np.array([2.85,2.35])
ret = minimize(my_evidence_optim,par,method='Nelder-Mead')
print(time.time() - start)
print(ret)
C#
`.mat` files are read with `Accord.Math`, while the matrix computations and the optimizer come from `MathNet.Numerics`. Extracting and recombining the data takes a little extra work.
using System;
using System.Diagnostics;
using Accord.IO;                   // MatReader
using MathNet.Numerics;
using MathNet.Numerics.Distributions;
using MathNet.Numerics.LinearAlgebra;
using MathNet.Numerics.LinearAlgebra.Double;
using MathNet.Numerics.Optimization;

class BinaryHandwrittenDigit
{
    // Extract the "3" and "5" digits and build the label vector and pattern matrix.
    (Vector<double>, Matrix<double>) Select(Int16[,] labels, Double[,] patterns)
    {
        int nrow = patterns.GetLength(0);
        int ncol = patterns.GetLength(1);
        int count_3 = 0;
        int count_5 = 0;
        for (int j = 0; j < ncol; j++)
        {
            if (labels[3, j] == 1) count_3++;
            else if (labels[5, j] == 1) count_5++;
        }
        Vector<double> y = new DenseVector(count_3 + count_5);
        Matrix<double> x = new DenseMatrix(nrow, count_3 + count_5);
        int i = 0;
        for (int j = 0; j < ncol; j++)
        {
            if (labels[3, j] == 1)
            {
                for (int k = 0; k < nrow; k++) x[k, i] = patterns[k, j];
                y[i++] = 1;
            }
            else if (labels[5, j] == 1)
            {
                for (int k = 0; k < nrow; k++) x[k, i] = patterns[k, j];
                y[i++] = -1;
            }
        }
        return (y, x);
    }

    Vector<double> y_train;
    Matrix<double> x_train;

    void ReadMat()
    {
        string path_to_file = "usps_resampled.mat";
        var reader = new MatReader(path_to_file);
        var train_labels = reader.Read<Int16[,]>("train_labels");
        var train_patterns = reader.Read<Double[,]>("train_patterns");
        (y_train, x_train) = Select(train_labels, train_patterns);
    }

    Matrix<double> x_train_L2;

    // Squared Euclidean distances between all pairs of training columns.
    Matrix<double> MakeMatrixL2(Matrix<double> x)
    {
        int m = x.ColumnCount;
        var result = new DenseMatrix(m, m);
        for (int j = 0; j < m; j++)
        {
            var cj = x.Column(j);
            for (int i = 0; i < m; i++)
            {
                var s = x.Column(i) - cj;
                result[i, j] = s * s;
            }
        }
        return result;
    }

    Matrix<double> kernel_train(double k_sigma, double k_scale)
    {
        return (x_train_L2 / (-2 * k_scale * k_scale)).PointwiseExp() * (k_sigma * k_sigma);
    }

    Vector<double> sigmoid(Vector<double> y, Vector<double> f)
    {
        return y.PointwiseMultiply(f).Map((x) => Normal.CDF(0, 1, x));
    }

    Vector<double> sigmoid_log_1st(Vector<double> y, Vector<double> f, Vector<double> prob)
    {
        var df = f.Map((x) => Normal.PDF(0, 1, x));
        return y.PointwiseDivide(prob).PointwiseMultiply(df);
    }

    Vector<double> sigmoid_log_2nd(Vector<double> y, Vector<double> f, Vector<double> prob)
    {
        var df = f.Map((x) => Normal.PDF(0, 1, x));
        var a = df.PointwiseDivide(prob);
        return a.PointwiseMultiply(a + y.PointwiseMultiply(f));
    }

    // Returns I + diag(b) * X * diag(b), built from a copy of X.
    Matrix<double> bXbPlusOne(Matrix<double> X, Vector<double> b)
    {
        int n = X.RowCount;
        var result = X.Clone();
        for (int j = 0; j < n; j++)
        {
            for (int i = 0; i < n; i++)
            {
                result[i, j] *= b[i] * b[j];
            }
            result[j, j] += 1;
        }
        return result;
    }

    double GetEvidence(Vector<double> par)
    {
        double k_scale = Math.Exp(par[0]);
        double k_sigma = Math.Exp(par[1]);
        var K11 = kernel_train(k_sigma, k_scale);
        int n = y_train.Count;
        Vector<double> f = new DenseVector(n);
        for (int i = 0; i < n; i++)
        {
            f[i] = 1.0 / (n - 1) * i;
        }
        double evidence = 0;
        // Newton iteration for the posterior mode f-hat.
        while (true)
        {
            var prob = sigmoid(y_train, f);
            var V1 = sigmoid_log_1st(y_train, f, prob);
            var V2 = sigmoid_log_2nd(y_train, f, prob);
            var V2q = V2.PointwiseMaximum(0).PointwiseSqrt();
            var B = bXbPlusOne(K11, V2q);
            var b = V2.PointwiseMultiply(f) + V1;
            var c = K11.Multiply(b).PointwiseMultiply(V2q);
            var chol = B.Cholesky();
            var a = b - chol.Solve(c).PointwiseMultiply(V2q);
            var f0 = f;
            f = K11 * a;
            if ((f - f0).AbsoluteMaximum() < 1e-5)
            {
                evidence = -(a * f) / 2 + prob.PointwiseLog().Sum() - Math.Log(chol.Determinant) / 2;
                break;
            }
        }
        return -evidence;
    }

    public void MarginalLikelihoodMaximization()
    {
        ReadMat();
        x_train_L2 = MakeMatrixL2(x_train);
        var evidence_optim = ObjectiveFunction.Value(GetEvidence);
        Stopwatch sw = Stopwatch.StartNew();
        var ret = NelderMeadSimplex.Minimum(evidence_optim, new DenseVector(new[] { 2.85, 2.35 }));
        sw.Stop();
        Console.WriteLine("{0}sec", (double)(sw.ElapsedMilliseconds) / 1000);
        var par_m = ret.MinimizingPoint;
        Console.WriteLine("par={0},{1}", par_m[0], par_m[1]);
        Console.WriteLine("Value={0},Reason={1},Iterations={2}", ret.FunctionInfoAtMinimum.Value, ret.ReasonForExit, ret.Iterations);
    }
}
class Program
{
    static void Main(string[] args)
    {
        // Keep Math.NET single-threaded for a fair comparison.
        Control.MaxDegreeOfParallelism = 1;
        var sample = new BinaryHandwrittenDigit();
        sample.MarginalLikelihoodMaximization();
    }
}
C++
The data is read from files exported by the C# program (code omitted). The optimization routine `nmmin` was adapted from R's source file `/src/appl/optim.c` (also omitted). Beware that using `auto` with Eigen expressions yields a different type than you might expect, as sketched below.
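A minimal illustration of the pitfall: Eigen's operators return lazy expression templates, so `auto` captures the unevaluated expression instead of a matrix.

```cpp
#include "Eigen/Dense"
using Eigen::MatrixXd;

int main()
{
    MatrixXd a = MatrixXd::Constant(2, 2, 1.0);
    MatrixXd b = MatrixXd::Constant(2, 2, 2.0);
    auto expr = a + b;     // CwiseBinaryOp<...>, not MatrixXd; nothing computed yet
    MatrixXd sum = a + b;  // forces evaluation into an actual matrix
    a(0, 0) = 100.0;       // `sum` is unaffected, but `expr` now sees the new `a`
    MatrixXd later = expr; // later(0,0) == 102, not 3
}
```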
To use MKL, add `#define EIGEN_USE_BLAS` and link `mkl_intel_lp64.lib`, `mkl_sequential.lib`, and `mkl_core.lib`; see the Intel® Math Kernel Library Link Line Advisor for the exact link line.
#include <stdio.h>
#include <direct.h>
#include <chrono>
#include <cmath>
//#define EIGEN_USE_BLAS
#include "Eigen/Dense"

using namespace std;
using namespace Eigen;

// Objective function type and Nelder-Mead minimizer adapted from R's /src/appl/optim.c.
typedef double optimfn(int n, const double *x0, void *ex);
void nmmin(int n, double *Bvec, double *X, double *Fmin, optimfn fminfn,
           int *fail, double abstol, double intol, void *ex,
           double alpha, double bet, double gamm, int trace,
           int *fncount, int maxit);

double Normal_PDF(double value)
{
    const double pi = 3.1415926535897932384626433832795;
    return exp(-value * value / 2) / sqrt(2 * pi);
}

double Normal_CDF(double value)
{
    return erfc(-value / sqrt(2)) / 2;
}

class BinaryHandwrittenDigit {
private:
    int nrow;
    int ncol;
    MatrixXd x_train;
    ArrayXd y_train;
    MatrixXd x_train_L2;
public:
    void read_data(); // definition omitted: reads the files exported from C#

    // Squared Euclidean distances between all pairs of training columns.
    void make_matrix_L2() {
        x_train_L2.resize(ncol, ncol);
        for (int j = 0; j < ncol; j++) {
            for (int i = 0; i < ncol; i++) {
                double t = 0;
                for (int k = 0; k < nrow; k++) t += (x_train(k, i) - x_train(k, j)) * (x_train(k, i) - x_train(k, j));
                x_train_L2(i, j) = t;
            }
        }
    }
    MatrixXd kernel_train(double k_sigma, double k_scale) {
        return (x_train_L2 / (-2 * k_scale * k_scale)).array().exp() * (k_sigma * k_sigma);
    }
    ArrayXd sigmoid(const ArrayXd& y, const ArrayXd& f) {
        ArrayXd result(y.size());
        for (int j = 0; j < y.size(); j++) {
            result(j) = Normal_CDF(y(j) * f(j));
        }
        return result;
    }
    ArrayXd sigmoid_log_1st(const ArrayXd& y, const ArrayXd& f, const ArrayXd& prob) {
        ArrayXd result(y.size());
        for (int j = 0; j < y.size(); j++) {
            result(j) = y(j) * Normal_PDF(f(j)) / prob(j);
        }
        return result;
    }
    ArrayXd sigmoid_log_2nd(const ArrayXd& y, const ArrayXd& f, const ArrayXd& prob) {
        ArrayXd result(y.size());
        for (int j = 0; j < y.size(); j++) {
            double a = Normal_PDF(f(j)) / prob(j);
            result(j) = a * (a + y(j) * f(j));
        }
        return result;
    }
    // Returns I + diag(b) * X * diag(b), built from a copy of X.
    MatrixXd bXbPlusOne(const MatrixXd& X, const ArrayXd& b)
    {
        int n = X.rows();
        MatrixXd result(X);
        for (int j = 0; j < n; j++)
        {
            for (int i = 0; i < n; i++)
            {
                result(i, j) *= b[i] * b[j];
            }
            result(j, j) += 1;
        }
        return result;
    }
    double get_evidence(const double* par) {
        double k_scale = exp(par[0]);
        double k_sigma = exp(par[1]);
        MatrixXd K11 = kernel_train(k_sigma, k_scale);
        int n = y_train.size();
        ArrayXd f = ArrayXd::LinSpaced(n, 0, 1);
        double evidence = 0;
        // Newton iteration for the posterior mode f-hat.
        while (true) {
            ArrayXd prob = sigmoid(y_train, f);
            ArrayXd V1 = sigmoid_log_1st(y_train, f, prob);
            ArrayXd V2 = sigmoid_log_2nd(y_train, f, prob);
            ArrayXd V2q = V2.cwiseMax(0.0).cwiseSqrt();
            MatrixXd B = bXbPlusOne(K11, V2q);
            ArrayXd b = V2 * f + V1;
            ArrayXd c = (K11 * VectorXd(b)).array() * V2q;
            LLT<MatrixXd> chol(B);
            ArrayXd a = b - chol.solve(VectorXd(c)).array() * V2q;
            ArrayXd f0 = f;
            f = K11 * VectorXd(a);
            if ((f - f0).abs().maxCoeff() < 1e-5) {
                // log|L| = log|B| / 2, since B = L * L^T.
                evidence = -(a * f).sum() / 2 + prob.log().sum() - log(chol.matrixL().determinant());
                break;
            }
        }
        return -evidence;
    }
public:
    static double get_evidence_optim(int n, const double *x, void *ex) {
        auto p = (BinaryHandwrittenDigit*)ex;
        return p->get_evidence(x);
    }
};

int main()
{
    auto sample = new BinaryHandwrittenDigit();
    sample->read_data();
    sample->make_matrix_L2();
    double ninf = -INFINITY;
    double eps = 2.220446e-16;
    ArrayXd xini(2); xini << 2.85, 2.35;
    ArrayXd xout(2);
    double fmin = 0.0;
    int fail = 0;
    double abstol = ninf;
    double intol = sqrt(eps);
    void *ex = sample;
    double alpha = 1.0;  // reflection coefficient
    double bet = 0.5;    // contraction coefficient
    double gamm = 2.0;   // expansion coefficient
    int trace = 0;
    int fncount = 0;
    int maxit = 500;
    auto start = chrono::system_clock::now();
    nmmin(xini.size(), xini.data(), xout.data(), &fmin, BinaryHandwrittenDigit::get_evidence_optim, &fail, abstol, intol, ex, alpha, bet, gamm, trace, &fncount, maxit);
    auto end = chrono::system_clock::now();
    long elapsed = chrono::duration_cast<chrono::milliseconds>(end - start).count();
    printf("fmin=%f, fncount=%d, fail=%d\n", fmin, fncount, fail);
    for (int i = 0; i < xout.size(); i++) printf("%d, %f\n", i, xout[i]);
    printf("%d msec\n", elapsed);
    delete sample;
}