Performance comparison of R, Python, and Julia


Introduction

I compared the performance of R, Python, Julia, C++, and C# on the same task: maximizing the marginal likelihood of a Gaussian process binary classifier on the USPS handwritten digit data.

Results

Taking the number of objective-function evaluations into account, Julia is the fastest, followed by C++, Python, R, and C#.

Language (environment)    Seconds   Calls   Avg sec/call   Ratio (vs. Julia)
Julia (1.1.0)             10.86     70      0.155          1
C++ (VS2017)              8.51      47      0.181          1.17
Python (3.6.7/WSL)        13.06     59      0.221          1.43
R (3.5.3)                 38.6      47      0.822          5.3
C# (VS2017)               47.6      47      1.013          6.5

When I first measured Python, CPU utilization went to 100% and the run took about 24 seconds; limiting it to a single thread with OMP_NUM_THREADS=1 made it faster.
C# is far too slow as-is, but it becomes dramatically faster with the Intel Math Kernel Library (MKL),
installed via Install-Package MathNet.Numerics.MKL.Win-x64.

Language (environment)    Seconds   Calls   Avg sec/call   Ratio (vs. Julia)
C++ + MKL                 4.58      47      0.097          0.63
C# + MKL                  6.59      47      0.140          0.90
R Open (3.5.2) + MKL      7.51      47      0.160          1.03
Python (3.7.3) + MKL      10.9      59      0.184          1.19

Microsoft R Open uses MKL by default; here it was restricted to a single thread as well.
Installing Python with Miniconda also speeds things up, because its numpy is built against MKL.
The fastest combination is C++/Eigen/MKL.
Julia+MKL was not evaluated, since using MKL requires building Julia from source.
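
A quick way to check (my addition, not part of the original measurements) whether a given numpy build is linked against MKL:

import numpy as np

# An MKL-linked build (e.g. numpy from Miniconda's default channel) lists
# the mkl libraries in this output; the standard pip wheel shows OpenBLAS.
np.show_config()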

Most of the work is the Cholesky decomposition of a 767x767 matrix, so MKL has a large effect.
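
As a rough illustration of that point (my own sketch, using a random positive definite matrix rather than the actual kernel matrix), timing just the 767x767 Cholesky factorization isolates the part MKL accelerates:

import time
import numpy as np

n = 767
A = np.random.rand(n, n)
S = A @ A.T + n * np.eye(n)  # symmetric positive definite test matrix

start = time.time()
for _ in range(100):
    np.linalg.cholesky(S)
print("avg", (time.time() - start) / 100, "sec per factorization")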

Sample data

USPS handwritten digit data
http://www.gaussianprocess.org/gpml/data/

R

## Read the data

library("R.matlab")

mat_data <- readMat('usps_resampled.mat')

index3 = (mat_data$train.labels[3+1,] == 1)
index5 = (mat_data$train.labels[5+1,] == 1)

y_train = ifelse(index3,1,-1)[index3 | index5]
x_train = mat_data$train.patterns[,index3 | index5]

## Kernel function and sigmoid functions

x_train_L2 = apply(x_train,2,function(a){colSums((x_train-a)^2)})
my_kernel_train = function(k_sigma,k_scale){
  exp(x_train_L2 / (-2 * k_scale^2)) * k_sigma^2
}

my_sigmoid = function(y,f){
  pnorm(y * f)
}
my_sigmoid_log_1st = function(y,f,prob){
  y * dnorm(f) / prob
}
my_sigmoid_log_2nd = function(y,f,prob){
  a = dnorm(f) / prob
  a * (a + y * f)
}

## Maximize the marginal likelihood

my_evidence_optim = function(par){
  k_scale = exp(par[1])
  k_sigma = exp(par[2])
  K11 = my_kernel_train(k_sigma,k_scale)
  n = length(y_train)
  f = seq(from=0,to=1,length.out=n)
  while(TRUE){
    prob = my_sigmoid(y_train,f)
    V1 = my_sigmoid_log_1st(y_train,f,prob)
    V2 = my_sigmoid_log_2nd(y_train,f,prob)
    V2q = sqrt(pmax(V2,0))
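    # B = I + W^(1/2) K W^(1/2), with W = diag(pmax(V2, 0))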
    B = diag(n) + t(K11 * V2q) * V2q
    U = chol(B) # t(U) %*% U
    b = V2 * f + V1
    a = b - backsolve(U,forwardsolve(t(U),(K11 %*% b) * V2q)) * V2q
    f0 = f
    f = drop(K11 %*% a)
    if(max(abs(f - f0))<1e-5)break
  }
  -sum(a * f) / 2 + sum(log(prob)) - sum(log(diag(U)))
}
start = proc.time()
ret_optim = optim(c(2.85,2.35),my_evidence_optim,control=list(fnscale=-1))
proc.time() - start
ret_optim
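
For reference (my annotation, not in the original): the inner loop is the Newton iteration of the Laplace approximation for Gaussian process binary classification, essentially Algorithm 3.1 of Rasmussen & Williams, Gaussian Processes for Machine Learning. With the probit likelihood $p(y_i \mid f_i) = \Phi(y_i f_i)$ and $W$ the negative Hessian of the log likelihood (the clipped V2 above), each pass computes

$$
B = I + W^{1/2} K W^{1/2}, \quad
b = W f + \nabla \log p(y \mid f), \quad
a = b - W^{1/2} B^{-1} W^{1/2} K b, \quad
f \leftarrow K a,
$$

and at convergence the approximate log marginal likelihood (evidence) is

$$
\log q(y \mid X, \theta) = -\tfrac{1}{2} a^{\top} f + \log p(y \mid f) - \tfrac{1}{2} \log |B|,
$$

where $\tfrac{1}{2}\log|B| = \sum_i \log U_{ii}$ for the Cholesky factor $U$, which is exactly the sum(log(diag(U))) term in the code.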

Julia

Unlike in R, a while block introduces its own variable scope in Julia (hence evidence is initialized outside the loop).
I wish sum() would drop the reduced dimension automatically (hence the dropdims() call below).

using MAT
using Distributions
using Optim
using LinearAlgebra

## Read the data

matfile = matopen("usps_resampled.mat")

train_labels = read(matfile, "train_labels")
train_patterns = read(matfile, "train_patterns")

index3 = (train_labels[3+1,:] .== 1)
index5 = (train_labels[5+1,:] .== 1)

y_train = ifelse.(index3,1,-1)[index3 .| index5]
x_train = train_patterns[:,index3 .| index5]

## Kernel function and sigmoid functions

x_train_L2 = mapslices(t -> dropdims(sum((x_train .- t) .^ 2, dims=1), dims=1), x_train, dims=1)
function my_kernel_train(k_sigma,k_scale)
  exp.(x_train_L2 / (-2 * k_scale^2)) * k_sigma^2
end

function my_sigmoid(y,f)
  cdf.(Normal(), y .* f)
end
function my_sigmoid_log_1st(y,f,prob)
  y .* pdf.(Normal(), f) ./ prob
end
function my_sigmoid_log_2nd(y,f,prob)
  a = pdf.(Normal(), f) ./ prob
  a .* (a .+ y .* f)
end

## Maximize the marginal likelihood

function my_evidence_optim(par)
  k_scale = exp(par[1])
  k_sigma = exp(par[2])
  K11 = my_kernel_train(k_sigma,k_scale)
  n = length(y_train)
  f = Array(range(0,1,length=n))
  evidence = 0
  while true
    prob = my_sigmoid(y_train,f)
    V1 = my_sigmoid_log_1st(y_train,f,prob)
    V2 = my_sigmoid_log_2nd(y_train,f,prob)
    V2q = sqrt.(max.(V2,0))
    B = I + K11 .* V2q .* V2q'
    C = cholesky(Symmetric(B))
    b = V2 .* f + V1
    a = b - (C.U \ (C.L \ ((K11 * b) .* V2q))) .* V2q
    f0 = f
    f = K11 * a
    if maximum(abs.(f - f0)) < 1e-5
      evidence = -dot(a,f) / 2 + sum(log.(prob)) - log(det(C)) / 2
      break
    end
  end
  evidence
end

@time result = maximize(my_evidence_optim, [2.85,2.35], NelderMead())

Python

NumPy has two array types, np.array and np.matrix, and their behavior differs.
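
A small illustration (mine): with np.array the * operator is element-wise, while with np.matrix it is matrix multiplication, so the same expression can compute two different things. The code below sticks to plain np.array (np.matrix is now discouraged anyway).

import numpy as np

a = np.array([[1, 2], [3, 4]])
m = np.matrix([[1, 2], [3, 4]])

print(a * a)  # element-wise:   [[ 1  4] [ 9 16]]
print(m * m)  # matrix product: [[ 7 10] [15 22]]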

import os
os.environ["OMP_NUM_THREADS"] = "1"

import time
import numpy as np
from scipy import io
from scipy.stats import norm
from scipy.linalg import cholesky
from scipy.linalg import solve_triangular
from scipy.optimize import minimize

## Read the data

matdata = io.loadmat('usps_resampled.mat')

train_labels = matdata["train_labels"]
train_patterns = matdata["train_patterns"]

index3 = (train_labels[3,:]==1)
index5 = (train_labels[5,:]==1)

y_train = np.where(index3,1,-1)[np.logical_or(index3,index5)]
x_train = train_patterns[:,np.logical_or(index3,index5)]

## Kernel function and sigmoid functions

x_train_L2 = np.apply_along_axis(lambda x: np.sum(np.power(x_train-x.reshape(-1,1),2),0),0,x_train)
def my_kernel_train(k_sigma,k_scale):
  return np.exp(x_train_L2 / (-2 * k_scale * k_scale)) * (k_sigma * k_sigma)
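
# Aside (my note): the pairwise squared distances above can also be computed
# without apply_along_axis by expanding ||xi - xj||^2 = ||xi||^2 + ||xj||^2
# - 2 xi.xj, which hands the heavy lifting to BLAS:
#
#   x2 = np.sum(x_train ** 2, axis=0)
#   x_train_L2 = x2[:, None] + x2[None, :] - 2 * np.dot(x_train.T, x_train)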

def my_sigmoid(y,f):
  return norm.cdf(y * f)

def my_sigmoid_log_1st(y,f,prob):
  return y * norm.pdf(f) / prob

def my_sigmoid_log_2nd(y,f,prob):
  a = norm.pdf(f) / prob
  return a * (a + y * f)

## Maximize the marginal likelihood

def my_evidence_optim(par):
  k_scale = np.exp(par[0])
  k_sigma = np.exp(par[1])
  K11 = my_kernel_train(k_sigma,k_scale)
  n = len(y_train)
  f = np.linspace(0,1,n)
  evidence = 0
  while True:
    prob = my_sigmoid(y_train,f)
    V1 = my_sigmoid_log_1st(y_train,f,prob)
    V2 = my_sigmoid_log_2nd(y_train,f,prob)
    V2q = np.sqrt(np.maximum(V2,0))
    B = np.identity(n) + K11 * V2q * V2q.reshape(-1,1)
    U = cholesky(B)
    b = V2 * f + V1
    a = b - solve_triangular(U,solve_triangular(U,np.dot(K11,b) * V2q,trans=1)) * V2q
    f0 = f
    f = np.dot(K11,a)
    if np.max(np.abs(f - f0)) < 1e-5:
      evidence = -np.dot(a,f) / 2 + np.sum(np.log(prob)) - np.sum(np.log(np.diag(U)))
      break
  return -evidence

start = time.time()
par = np.array([2.85,2.35])
ret = minimize(my_evidence_optim,par,method='Nelder-Mead')
print(time.time() - start)
print(ret)
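
The "Calls" column in the tables presumably comes from each optimizer's own evaluation counter; for scipy it is available on the result object:

print(ret.nfev)  # number of objective-function evaluations (59 in the table)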

C#

Accord.Math is used to read the .mat file.
MathNet.Numerics is used for the matrix computations and the optimization.
Extracting and combining the data takes a little extra work.

using System;
using System.Diagnostics;
using Accord.IO;
using MathNet.Numerics;
using MathNet.Numerics.Distributions;
using MathNet.Numerics.LinearAlgebra;
using MathNet.Numerics.LinearAlgebra.Double;
using MathNet.Numerics.Optimization;

class BinaryHandwrittenDigit
{
    (Vector<double>, Matrix<double>) Select(Int16[,] labels, Double[,] patterns)
    {
        int nrow = patterns.GetLength(0);
        int ncol = patterns.GetLength(1);
        int count_3 = 0;
        int count_5 = 0;
        for (int j = 0; j < ncol; j++)
        {
            if (labels[3, j] == 1) count_3++;
            else if (labels[5, j] == 1) count_5++;
        }
        Vector<double> y = new DenseVector(count_3 + count_5);
        Matrix<double> x = new DenseMatrix(nrow, count_3 + count_5);
        int i = 0;
        for (int j = 0; j < ncol; j++)
        {
            if (labels[3, j] == 1)
            {
                for (int k = 0; k < nrow; k++) x[k, i] = patterns[k, j];
                y[i++] = 1;
            }
            else if (labels[5, j] == 1)
            {
                for (int k = 0; k < nrow; k++) x[k, i] = patterns[k, j];
                y[i++] = -1;
            }
        }
        return (y, x);
    }
    Vector<double> y_train;
    Matrix<double> x_train;
    void ReadMat()
    {
        string path_to_file = "usps_resampled.mat";
        var reader = new MatReader(path_to_file);
        var train_labels = reader.Read<Int16[,]>("train_labels");
        var train_patterns = reader.Read<Double[,]>("train_patterns");
        (y_train, x_train) = Select(train_labels, train_patterns);
    }
    Matrix<double> x_train_L2;
    Matrix<double> MakeMatrixL2(Matrix<double> x)
    {
        int m = x.ColumnCount;
        var result = new DenseMatrix(m, m);
        for (int j = 0; j < m; j++)
        {
            var cj = x.Column(j);
            for (int i = 0; i < m; i++)
            {
                var s = x.Column(i) - cj;
                result[i, j] = s * s;
            }
        }
        return result;
    }
    Matrix<double> kernel_train(double k_sigma, double k_scale)
    {
        return (x_train_L2 / (-2 * k_scale * k_scale)).PointwiseExp() * (k_sigma * k_sigma);
    }
    Vector<double> sigmoid(Vector<double> y, Vector<double> f)
    {
        return y.PointwiseMultiply(f).Map((x) => Normal.CDF(0, 1, x));
    }
    Vector<double> sigmoid_log_1st(Vector<double> y, Vector<double> f, Vector<double> prob)
    {
        var df = f.Map((x) => Normal.PDF(0, 1, x));
        return y.PointwiseDivide(prob).PointwiseMultiply(df);
    }
    Vector<double> sigmoid_log_2nd(Vector<double> y, Vector<double> f, Vector<double> prob)
    {
        var df = f.Map((x) => Normal.PDF(0, 1, x));
        var a = df.PointwiseDivide(prob);
        return a.PointwiseMultiply(a + y.PointwiseMultiply(f));
    }
    Matrix<double> bXbPlusOne(Matrix<double> X, Vector<double> b)
    {
        int n = X.RowCount;
        var result = X.Clone();
        for (int j = 0; j < n; j++)
        {
            for (int i = 0; i < n; i++)
            {
                result[i, j] *= b[i] * b[j];
            }
            result[j, j] += 1;
        }
        return result;
    }
    double GetEvidence(Vector<double> par)
    {
        double k_scale = Math.Exp(par[0]);
        double k_sigma = Math.Exp(par[1]);
        var K11 = kernel_train(k_sigma, k_scale);
        int n = y_train.Count;
        Vector<double> f = new DenseVector(n);
        for (int i = 0; i < n; i++)
        {
            f[i] = 1.0 / (n - 1) * i;
        }
        double evidence = 0;
        while (true)
        {
            var prob = sigmoid(y_train, f);
            var V1 = sigmoid_log_1st(y_train, f, prob);
            var V2 = sigmoid_log_2nd(y_train, f, prob);
            var V2q = V2.PointwiseMaximum(0).PointwiseSqrt();
            var B = bXbPlusOne(K11, V2q);
            var b = V2.PointwiseMultiply(f) + V1;
            var c = K11.Multiply(b).PointwiseMultiply(V2q);
            var chol = B.Cholesky();
            var a = b - chol.Solve(c).PointwiseMultiply(V2q);
            var f0 = f;
            f = K11 * a;
            if ((f - f0).AbsoluteMaximum() < 1e-5)
            {
                evidence = -(a * f) / 2 + prob.PointwiseLog().Sum() - Math.Log(chol.Determinant) / 2;
                break;
            }
        }
        return -evidence;
    }
    public void MarginalLikelihoodMaximization()
    {
        ReadMat();
        x_train_L2 = MakeMatrixL2(x_train);
        var evidence_optim = ObjectiveFunction.Value(GetEvidence);
        Stopwatch sw = Stopwatch.StartNew();
        var ret = NelderMeadSimplex.Minimum(evidence_optim, new DenseVector(new[] { 2.85, 2.35 }));
        sw.Stop();
        Console.WriteLine("{0}sec", (double)(sw.ElapsedMilliseconds) / 1000);
        var par_m = ret.MinimizingPoint;
        Console.WriteLine("par={0},{1}", par_m[0], par_m[1]);
        Console.WriteLine("Value={0},Reason={1},Iterations={2}", ret.FunctionInfoAtMinimum.Value, ret.ReasonForExit, ret.Iterations);
    }
}
class Program
{
    static void Main(string[] args)
    {
        Control.MaxDegreeOfParallelism = 1;
        var sample = new BinaryHandwrittenDigit();
        sample.MarginalLikelihoodMaximization();
    }
}

C++

The data is read from a file exported by the C# program; that code is omitted.
The optimizer nmmin was adapted from R's source file /src/appl/optim.c; also omitted.
Beware that with Eigen, auto captures an expression-template type rather than an evaluated matrix.

To use MKL, add #define EIGEN_USE_BLAS and link
mkl_intel_lp64.lib, mkl_sequential.lib, and mkl_core.lib.
See the Intel® Math Kernel Library Link Line Advisor for the exact link line.

# include <stdio.h>
# include <direct.h>

# include <chrono>
# include <cmath>

//#define EIGEN_USE_BLAS
# include "Eigen/Dense"

using namespace std;
using namespace Eigen;

typedef double optimfn(int n, const double *x0, void *ex);

void nmmin(int n, double *Bvec, double *X, double *Fmin, optimfn fminfn,
    int *fail, double abstol, double intol, void *ex,
    double alpha, double bet, double gamm, int trace,
    int *fncount, int maxit);

double Normal_PDF(double value)
{
    const double pi = 3.1415926535897932384626433832795;
    return exp(-value * value / 2) / sqrt(2 * pi);
}
double Normal_CDF(double value)
{
    return erfc(-value / sqrt(2)) / 2;
}

class BinaryHandwrittenDigit {
private:
    int nrow;
    int ncol;
    MatrixXd x_train;
    ArrayXd y_train;
    MatrixXd x_train_L2;
public:
    void read_data();
    void make_matrix_L2() {
        x_train_L2.resize(ncol, ncol);
        for (int j = 0; j < ncol; j++) {
            for (int i = 0; i < ncol; i++) {
                double t = 0;
                for (int k = 0; k < nrow; k++) t += (x_train(k, i) - x_train(k, j)) * (x_train(k, i) - x_train(k, j));
                x_train_L2(i, j) = t;
            }
        }
    }
    MatrixXd kernel_train(double k_sigma, double k_scale) {
        return (x_train_L2 / (-2 * k_scale * k_scale)).array().exp() * (k_sigma * k_sigma);
    }
    ArrayXd sigmoid(const ArrayXd& y, const ArrayXd& f) {
        ArrayXd result(y.size());
        for (int j = 0; j < y.size(); j++) {
            result(j) = Normal_CDF(y(j) * f(j));
        }
        return result;
    }
    ArrayXd sigmoid_log_1st(const ArrayXd& y, const ArrayXd& f, const ArrayXd& prob) {
        ArrayXd result(y.size());
        for (int j = 0; j < y.size(); j++) {
            result(j) = y(j) * Normal_PDF(f(j)) / prob(j);
        }
        return result;
    }
    ArrayXd sigmoid_log_2nd(const ArrayXd& y, const ArrayXd& f, const ArrayXd& prob) {
        ArrayXd result(y.size());
        for (int j = 0; j < y.size(); j++) {
            double a = Normal_PDF(f(j)) / prob(j);
            result(j) = a * (a + y(j) * f(j));
        }
        return result;
    }
    MatrixXd bXbPlusOne(const MatrixXd& X, const ArrayXd& b)
    {
        int n = X.rows();
        MatrixXd result(X);
        for (int j = 0; j < n; j++)
        {
            for (int i = 0; i < n; i++)
            {
                result(i, j) *= b[i] * b[j];
            }
            result(j, j) += 1;
        }
        return result;
    }
    double get_evidence(const double* par) {
        double k_scale = exp(par[0]);
        double k_sigma = exp(par[1]);
        MatrixXd K11 = kernel_train(k_sigma, k_scale);
        int n = y_train.size();
        ArrayXd f = ArrayXd::LinSpaced(n, 0, 1);
        double evidence = 0;
        while (true) {
            ArrayXd prob = sigmoid(y_train, f);
            ArrayXd V1 = sigmoid_log_1st(y_train, f, prob);
            ArrayXd V2 = sigmoid_log_2nd(y_train, f, prob);
            ArrayXd V2q = V2.cwiseMax(0.0).cwiseSqrt();
            MatrixXd B = bXbPlusOne(K11, V2q);
            ArrayXd b = V2 * f + V1;
            ArrayXd c = (K11 * VectorXd(b)).array() * V2q;
            LLT<MatrixXd> chol(B);
            ArrayXd a = b - chol.solve(VectorXd(c)).array() * V2q;
            ArrayXd f0 = f;
            f = K11 * VectorXd(a);
            if ((f - f0).abs().maxCoeff() < 1e-5) {
                evidence = -(a * f).sum() / 2 + prob.log().sum() - log(chol.matrixL().determinant());
                break;
            }
        }
        return -evidence;
    }
public:
    static double get_evidence_optim(int n, const double *x, void *ex) {
        auto p = (BinaryHandwrittenDigit*)ex;
        return p->get_evidence(x);
    }
};

int main()
{
    auto sample = new BinaryHandwrittenDigit();
    sample->read_data();
    sample->make_matrix_L2();

    double ninf = -INFINITY;
    double eps = 2.220446e-16;

    ArrayXd xini(2); xini << 2.85, 2.35;
    ArrayXd xout(2);
    double fmin = 0.0;
    int fail = 0;
    double abstol = ninf;
    double intol = sqrt(eps);
    void *ex = sample;
    double alpha = 1.0;
    double bet = 0.5;
    double gamm = 2.0;
    int trace = 0;
    int fncount = 0;
    int maxit = 500;

    auto start = chrono::system_clock::now();
    nmmin(xini.size(), xini.data(), xout.data(), &fmin, BinaryHandwrittenDigit::get_evidence_optim, &fail, abstol, intol, ex, alpha, bet, gamm, trace, &fncount, maxit);
    auto end = chrono::system_clock::now();
    long elapsed = chrono::duration_cast<chrono::milliseconds>(end - start).count();
    printf("fmin=%f, fncount=%d, fail=%d\n", fmin, fncount, fail);
    for (int i = 0; i < xout.size(); i++) printf("%d, %f\n", i, xout[i]);
    printf("%d msec\n", elapsed);

    delete sample;
}