Introduction
- I implemented the computation from the earlier article "A classification model based on Gaussian processes" in R, Python, and Julia and compared the processing times.
- For good measure, I also implemented it in C# and C++.
- The posterior distribution is approximated with a Laplace approximation, and we search for the parameters that maximize the marginal likelihood (the evidence; the formula is sketched after this list).
- For the optimizer I chose the Nelder-Mead algorithm, which needs no gradient information.
- Environment: Windows 10 Pro, 16.0 GB RAM, i7-8550U CPU @ 1.80GHz
- The Julia Programming Language
- JuliaNLSolvers/Optim.jl
- [SciPy](https://www.scipy.org/index.html)
- Accord.NET Framework
- Math.NET Numerics
- Eigen: C++ template library for linear algebra
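For reference, the quantity every implementation maximizes is the Laplace-approximated log marginal likelihood of a Gaussian process probit classifier, as in Rasmussen & Williams' GPML (Algorithm 3.1). In the notation shared by all five listings below:

$$
\log q(\mathbf{y} \mid X, \theta) = -\frac{1}{2}\hat{\mathbf{f}}^{\top} K^{-1} \hat{\mathbf{f}} + \sum_{i=1}^{n} \log \Phi(y_i \hat{f}_i) - \frac{1}{2}\log\lvert B\rvert, \qquad B = I + W^{1/2} K W^{1/2},
$$

where $K$ is the kernel matrix with entries $k(\mathbf{x}, \mathbf{x}') = \sigma^2 \exp\!\left(-\lVert \mathbf{x} - \mathbf{x}' \rVert^2 / (2\,\mathrm{scale}^2)\right)$ (parameters `k_sigma` and `k_scale`, optimized on the log scale), $\hat{\mathbf{f}}$ is the posterior mode, $W = -\nabla\nabla \log p(\mathbf{y} \mid \hat{\mathbf{f}})$, and $\Phi$ is the standard normal CDF. In the code, $\hat{\mathbf{f}}^{\top} K^{-1}\hat{\mathbf{f}}$ is computed as `sum(a * f)` with $\mathbf{a} = K^{-1}\hat{\mathbf{f}}$, and $\tfrac{1}{2}\log\lvert B\rvert$ as the sum of the log diagonal of the Cholesky factor of $B$.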
Results
Taking the number of objective-function evaluations into account, Julia is the fastest, followed by C++, Python, R, and C#.
Language (environment) | Time (sec) | Calls | Sec/call | Ratio (vs. Julia) |
---|---|---|---|---|
Julia(1.1.0) | 10.86 | 70 | 0.155 | 1 |
C++(VS2017) | 8.51 | 47 | 0.181 | 1.17 |
Python(3.6.7/WSL) | 13.06 | 59 | 0.221 | 1.43 |
R(3.5.3) | 38.6 | 47 | 0.822 | 5.3 |
C#(VS2017) | 47.6 | 47 | 1.013 | 6.5 |
When I first ran the Python version, CPU utilization hit 100% and the run took about 24 seconds. Restricting it to a single thread with `OMP_NUM_THREADS=1` made it faster (the script below sets this via `os.environ` before importing NumPy).
C# is by far the slowest, but it speeds up dramatically when you install the Intel Math Kernel Library (MKL) provider with `Install-Package MathNet.Numerics.MKL.Win-x64`.
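Installing the package alone is not enough; Math.NET also has to be switched over to the native provider at startup. A minimal sketch (`Control.UseNativeMKL()` throws if the native binaries cannot be found):

```csharp
using MathNet.Numerics;

class Program
{
    static void Main()
    {
        // Swap the managed linear algebra provider for the native MKL one.
        Control.UseNativeMKL();
    }
}
```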
Language (environment) | Time (sec) | Calls | Sec/call | Ratio (vs. Julia) |
---|---|---|---|---|
C++(VS2017)+MKL | 4.58 | 47 | 0.097 | 0.63 |
C#+MKL | 6.59 | 47 | 0.140 | 0.90 |
R Open(3.5.2)+MKL | 7.51 | 47 | 0.160 | 1.03 |
Python(3.7.3)+MKL | 10.9 | 59 | 0.184 | 1.19 |
Microsoft R Open uses MKL by default; the thread count was limited to 1. Installing Python through Miniconda is fast because its NumPy is built against MKL. The fastest combination is C++/Eigen/MKL. Julia+MKL was not evaluated, since using MKL there requires building Julia from source. The bulk of the work is the Cholesky decomposition of a 767x767 matrix, which is why MKL has such a large effect.
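To check how much of the per-call time the factorization accounts for, one can time it in isolation; a rough sketch in Python (the matrix here is a synthetic SPD stand-in, and the numbers depend entirely on the BLAS backend):

```python
import time
import numpy as np

n = 767
A = np.random.rand(n, n)
B = A @ A.T + n * np.eye(n)  # synthetic symmetric positive-definite matrix

start = time.time()
for _ in range(100):
    np.linalg.cholesky(B)
print("sec per factorization:", (time.time() - start) / 100)
```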
Sample data
USPS handwritten digit data
http://www.gaussianprocess.org/gpml/data/
R
## Read the data
library("R.matlab")
mat_data <- readMat('usps_resampled.mat')
index3 = (mat_data$train.labels[3+1,] == 1)
index5 = (mat_data$train.labels[5+1,] == 1)
y_train = ifelse(index3,1,-1)[index3 | index5]
x_train = mat_data$train.patterns[,index3 | index5]
## Kernel function and sigmoid function
# Squared Euclidean distances between all pairs of training columns
x_train_L2 = apply(x_train, 2, function(a){ colSums((x_train - a)^2) })
my_kernel_train = function(k_sigma, k_scale){
  exp(x_train_L2 / (-2 * k_scale^2)) * k_sigma^2
}
my_sigmoid = function(y, f){
  pnorm(y * f)
}
my_sigmoid_log_1st = function(y, f, prob){
  y * dnorm(f) / prob
}
my_sigmoid_log_2nd = function(y, f, prob){
  a = dnorm(f) / prob
  a * (a + y * f)
}
## Maximize the marginal likelihood
my_evidence_optim = function(par){
  k_scale = exp(par[1])
  k_sigma = exp(par[2])
  K11 = my_kernel_train(k_sigma, k_scale)
  n = length(y_train)
  f = seq(from=0, to=1, length.out=n)
  # Newton iteration for the posterior mode f-hat
  while(TRUE){
    prob = my_sigmoid(y_train, f)
    V1 = my_sigmoid_log_1st(y_train, f, prob)  # gradient of log p(y|f)
    V2 = my_sigmoid_log_2nd(y_train, f, prob)  # W: negative Hessian diagonal
    V2q = sqrt(pmax(V2, 0))
    B = diag(n) + t(K11 * V2q) * V2q           # B = I + W^(1/2) K W^(1/2)
    U = chol(B) # t(U) %*% U
    b = V2 * f + V1
    a = b - backsolve(U, forwardsolve(t(U), (K11 %*% b) * V2q)) * V2q
    f0 = f
    f = drop(K11 %*% a)
    if(max(abs(f - f0)) < 1e-5) break
  }
  # Laplace-approximated log marginal likelihood (see the formula above)
  -sum(a * f) / 2 + sum(log(prob)) - sum(log(diag(U)))
}
start = proc.time()
ret_optim = optim(c(2.85,2.35), my_evidence_optim, control=list(fnscale=-1))
proc.time() - start
ret_optim
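All five implementations share the same inner loop: a Newton iteration for the posterior mode $\hat{\mathbf{f}}$, in the numerically stable form of GPML Algorithm 3.1. In the code's variables (`V1` is the gradient of the log likelihood, `V2` the diagonal of $W$, `V2q` its elementwise square root):

$$
\mathbf{b} = W \mathbf{f} + \nabla \log p(\mathbf{y} \mid \mathbf{f}), \qquad
\mathbf{a} = \mathbf{b} - W^{1/2} B^{-1} W^{1/2} K \mathbf{b}, \qquad
\mathbf{f} \leftarrow K \mathbf{a},
$$

iterated until $\max_i \lvert f_i - f_i^{\mathrm{old}} \rvert < 10^{-5}$. The solve against $B$ reuses the Cholesky factor that later enters the $\log\lvert B\rvert$ term of the evidence.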
Julia
Unlike R, a `while` loop introduces its own variable scope. Also, I wish `sum()` would drop the summed dimension.
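A minimal sketch of the scoping rule, and the reason `evidence` is initialized before the `while` loop in the listing below:

```julia
function scope_demo()
    while true
        x = 1        # local to the while block
        break
    end
    # `x` is not defined here: the while block had its own scope
    y = 0            # created in the enclosing function scope
    while true
        y = 1        # assigns the existing outer `y`
        break
    end
    return y         # 1
end
```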
using MAT
using Distributions
using Optim
using LinearAlgebra
## Read the data
matfile = matopen("usps_resampled.mat")
train_labels = read(matfile, "train_labels")
train_patterns = read(matfile, "train_patterns")
index3 = (train_labels[3+1,:] .== 1)
index5 = (train_labels[5+1,:] .== 1)
y_train = ifelse.(index3,1,-1)[index3 .| index5]
x_train = train_patterns[:,index3 .| index5]
## Kernel function and sigmoid function
# Squared Euclidean distances between all pairs of training columns
x_train_L2 = mapslices(t -> dropdims(sum((x_train .- t) .^ 2, dims=1), dims=1), x_train, dims=1)
function my_kernel_train(k_sigma,k_scale)
    exp.(x_train_L2 / (-2 * k_scale^2)) * k_sigma^2
end
function my_sigmoid(y,f)
    cdf.(Normal(), y .* f)
end
function my_sigmoid_log_1st(y,f,prob)
    y .* pdf.(Normal(), f) ./ prob
end
function my_sigmoid_log_2nd(y,f,prob)
    a = pdf.(Normal(), f) ./ prob
    a .* (a .+ y .* f)
end
## Maximize the marginal likelihood
function my_evidence_optim(par)
    k_scale = exp(par[1])
    k_sigma = exp(par[2])
    K11 = my_kernel_train(k_sigma,k_scale)
    n = length(y_train)
    f = Array(range(0,1,length=n))
    evidence = 0.0   # initialized here so it survives the while block's scope
    # Newton iteration for the posterior mode f-hat
    while true
        prob = my_sigmoid(y_train,f)
        V1 = my_sigmoid_log_1st(y_train,f,prob)
        V2 = my_sigmoid_log_2nd(y_train,f,prob)
        V2q = sqrt.(max.(V2,0))
        B = I + K11 .* V2q .* V2q'   # B = I + W^(1/2) K W^(1/2)
        C = cholesky(Symmetric(B))
        b = V2 .* f + V1
        a = b - (C.U \ (C.L \ ((K11 * b) .* V2q))) .* V2q
        f0 = f
        f = K11 * a
        if maximum(abs.(f - f0)) < 1e-5
            evidence = -dot(a,f) / 2 + sum(log.(prob)) - log(det(C)) / 2
            break
        end
    end
    evidence
end
@time result = maximize(my_evidence_optim, [2.85,2.35], NelderMead())
Python
NumPy offers both `np.array` and `np.matrix`, and the two behave differently.
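For example, `*` is element-wise on `np.array` but matrix multiplication on `np.matrix`; the listing below sticks to `np.array` and uses `np.dot` for matrix products. A quick illustration:

```python
import numpy as np

a = np.array([[1, 2], [3, 4]])
m = np.matrix([[1, 2], [3, 4]])

print(a * a)  # element-wise:   [[ 1  4] [ 9 16]]
print(m * m)  # matrix product: [[ 7 10] [15 22]]
```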
import os
os.environ["OMP_NUM_THREADS"] = "1"  # must be set before NumPy is imported
import time
import numpy as np
from scipy import io
from scipy.stats import norm
from scipy.linalg import cholesky
from scipy.linalg import solve_triangular
from scipy.optimize import minimize
## Read the data
matdata = io.loadmat('usps_resampled.mat')
train_labels = matdata["train_labels"]
train_patterns = matdata["train_patterns"]
index3 = (train_labels[3,:]==1)
index5 = (train_labels[5,:]==1)
y_train = np.where(index3,1,-1)[np.logical_or(index3,index5)]
x_train = train_patterns[:,np.logical_or(index3,index5)]
## Kernel function and sigmoid function
# Squared Euclidean distances between all pairs of training columns
x_train_L2 = np.apply_along_axis(lambda x: np.sum(np.power(x_train-x.reshape(-1,1),2),0),0,x_train)
def my_kernel_train(k_sigma,k_scale):
    return np.exp(x_train_L2 / (-2 * k_scale * k_scale)) * (k_sigma * k_sigma)
def my_sigmoid(y,f):
    return norm.cdf(y * f)
def my_sigmoid_log_1st(y,f,prob):
    return y * norm.pdf(f) / prob
def my_sigmoid_log_2nd(y,f,prob):
    a = norm.pdf(f) / prob
    return a * (a + y * f)
## Maximize the marginal likelihood
def my_evidence_optim(par):
    k_scale = np.exp(par[0])
    k_sigma = np.exp(par[1])
    K11 = my_kernel_train(k_sigma,k_scale)
    n = len(y_train)
    f = np.linspace(0,1,n)
    evidence = 0
    # Newton iteration for the posterior mode f-hat
    while True:
        prob = my_sigmoid(y_train,f)
        V1 = my_sigmoid_log_1st(y_train,f,prob)
        V2 = my_sigmoid_log_2nd(y_train,f,prob)
        V2q = np.sqrt(np.maximum(V2,0))
        B = np.identity(n) + K11 * V2q * V2q.reshape(-1,1)  # B = I + W^(1/2) K W^(1/2)
        U = cholesky(B)  # upper triangular: U.T @ U == B
        b = V2 * f + V1
        a = b - solve_triangular(U,solve_triangular(U,np.dot(K11,b) * V2q,trans=1)) * V2q
        f0 = f
        f = np.dot(K11,a)
        if np.max(np.abs(f - f0)) < 1e-5:
            evidence = -np.dot(a,f) / 2 + np.sum(np.log(prob)) - np.sum(np.log(np.diag(U)))
            break
    return -evidence
start = time.time()
par = np.array([2.85,2.35])
ret = minimize(my_evidence_optim,par,method='Nelder-Mead')
print(time.time() - start)
print(ret)
C#
`.mat` files are read with `Accord.Math`, while the matrix computations and the optimizer come from `MathNet.Numerics`. Extracting and recombining the data takes a little extra work.
using System;
using System.Diagnostics;
using Accord.IO;                   // MatReader
using MathNet.Numerics;
using MathNet.Numerics.Distributions;
using MathNet.Numerics.LinearAlgebra;
using MathNet.Numerics.LinearAlgebra.Double;
using MathNet.Numerics.Optimization;

class BinaryHandwrittenDigit
{
    // Extract the "3" and "5" digits and build the label vector and pattern matrix.
    (Vector<double>, Matrix<double>) Select(Int16[,] labels, Double[,] patterns)
    {
        int nrow = patterns.GetLength(0);
        int ncol = patterns.GetLength(1);
        int count_3 = 0;
        int count_5 = 0;
        for (int j = 0; j < ncol; j++)
        {
            if (labels[3, j] == 1) count_3++;
            else if (labels[5, j] == 1) count_5++;
        }
        Vector<double> y = new DenseVector(count_3 + count_5);
        Matrix<double> x = new DenseMatrix(nrow, count_3 + count_5);
        int i = 0;
        for (int j = 0; j < ncol; j++)
        {
            if (labels[3, j] == 1)
            {
                for (int k = 0; k < nrow; k++) x[k, i] = patterns[k, j];
                y[i++] = 1;
            }
            else if (labels[5, j] == 1)
            {
                for (int k = 0; k < nrow; k++) x[k, i] = patterns[k, j];
                y[i++] = -1;
            }
        }
        return (y, x);
    }

    Vector<double> y_train;
    Matrix<double> x_train;

    void ReadMat()
    {
        string path_to_file = "usps_resampled.mat";
        var reader = new MatReader(path_to_file);
        var train_labels = reader.Read<Int16[,]>("train_labels");
        var train_patterns = reader.Read<Double[,]>("train_patterns");
        (y_train, x_train) = Select(train_labels, train_patterns);
    }

    Matrix<double> x_train_L2;

    // Squared Euclidean distances between all pairs of training columns.
    Matrix<double> MakeMatrixL2(Matrix<double> x)
    {
        int m = x.ColumnCount;
        var result = new DenseMatrix(m, m);
        for (int j = 0; j < m; j++)
        {
            var cj = x.Column(j);
            for (int i = 0; i < m; i++)
            {
                var s = x.Column(i) - cj;
                result[i, j] = s * s;
            }
        }
        return result;
    }

    Matrix<double> kernel_train(double k_sigma, double k_scale)
    {
        return (x_train_L2 / (-2 * k_scale * k_scale)).PointwiseExp() * (k_sigma * k_sigma);
    }

    Vector<double> sigmoid(Vector<double> y, Vector<double> f)
    {
        return y.PointwiseMultiply(f).Map((x) => Normal.CDF(0, 1, x));
    }

    Vector<double> sigmoid_log_1st(Vector<double> y, Vector<double> f, Vector<double> prob)
    {
        var df = f.Map((x) => Normal.PDF(0, 1, x));
        return y.PointwiseDivide(prob).PointwiseMultiply(df);
    }

    Vector<double> sigmoid_log_2nd(Vector<double> y, Vector<double> f, Vector<double> prob)
    {
        var df = f.Map((x) => Normal.PDF(0, 1, x));
        var a = df.PointwiseDivide(prob);
        return a.PointwiseMultiply(a + y.PointwiseMultiply(f));
    }

    // Returns I + diag(b) * X * diag(b), built from a copy of X.
    Matrix<double> bXbPlusOne(Matrix<double> X, Vector<double> b)
    {
        int n = X.RowCount;
        var result = X.Clone();
        for (int j = 0; j < n; j++)
        {
            for (int i = 0; i < n; i++)
            {
                result[i, j] *= b[i] * b[j];
            }
            result[j, j] += 1;
        }
        return result;
    }

    double GetEvidence(Vector<double> par)
    {
        double k_scale = Math.Exp(par[0]);
        double k_sigma = Math.Exp(par[1]);
        var K11 = kernel_train(k_sigma, k_scale);
        int n = y_train.Count;
        Vector<double> f = new DenseVector(n);
        for (int i = 0; i < n; i++)
        {
            f[i] = 1.0 / (n - 1) * i;
        }
        double evidence = 0;
        // Newton iteration for the posterior mode f-hat.
        while (true)
        {
            var prob = sigmoid(y_train, f);
            var V1 = sigmoid_log_1st(y_train, f, prob);
            var V2 = sigmoid_log_2nd(y_train, f, prob);
            var V2q = V2.PointwiseMaximum(0).PointwiseSqrt();
            var B = bXbPlusOne(K11, V2q);
            var b = V2.PointwiseMultiply(f) + V1;
            var c = K11.Multiply(b).PointwiseMultiply(V2q);
            var chol = B.Cholesky();
            var a = b - chol.Solve(c).PointwiseMultiply(V2q);
            var f0 = f;
            f = K11 * a;
            if ((f - f0).AbsoluteMaximum() < 1e-5)
            {
                evidence = -(a * f) / 2 + prob.PointwiseLog().Sum() - Math.Log(chol.Determinant) / 2;
                break;
            }
        }
        return -evidence;
    }

    public void MarginalLikelihoodMaximization()
    {
        ReadMat();
        x_train_L2 = MakeMatrixL2(x_train);
        var evidence_optim = ObjectiveFunction.Value(GetEvidence);
        Stopwatch sw = Stopwatch.StartNew();
        var ret = NelderMeadSimplex.Minimum(evidence_optim, new DenseVector(new[] { 2.85, 2.35 }));
        sw.Stop();
        Console.WriteLine("{0}sec", (double)(sw.ElapsedMilliseconds) / 1000);
        var par_m = ret.MinimizingPoint;
        Console.WriteLine("par={0},{1}", par_m[0], par_m[1]);
        Console.WriteLine("Value={0},Reason={1},Iterations={2}", ret.FunctionInfoAtMinimum.Value, ret.ReasonForExit, ret.Iterations);
    }
}
class Program
{
    static void Main(string[] args)
    {
        // Keep Math.NET single-threaded for a fair comparison.
        Control.MaxDegreeOfParallelism = 1;
        var sample = new BinaryHandwrittenDigit();
        sample.MarginalLikelihoodMaximization();
    }
}
C++
The data is read from files exported by the C# program (code omitted). The optimization routine `nmmin` was adapted from R's source file `/src/appl/optim.c` (also omitted). Beware that using `auto` with Eigen expressions yields a different type than you might expect, as sketched below.
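A minimal illustration of the pitfall: Eigen's operators return lazy expression templates, so `auto` captures the unevaluated expression instead of a matrix.

```cpp
#include "Eigen/Dense"
using Eigen::MatrixXd;

int main()
{
    MatrixXd a = MatrixXd::Constant(2, 2, 1.0);
    MatrixXd b = MatrixXd::Constant(2, 2, 2.0);
    auto expr = a + b;     // CwiseBinaryOp<...>, not MatrixXd; nothing computed yet
    MatrixXd sum = a + b;  // forces evaluation into an actual matrix
    a(0, 0) = 100.0;       // `sum` is unaffected, but `expr` now sees the new `a`
    MatrixXd later = expr; // later(0,0) == 102, not 3
}
```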
To use MKL, add `#define EIGEN_USE_BLAS` and link `mkl_intel_lp64.lib`, `mkl_sequential.lib`, and `mkl_core.lib`; see the Intel® Math Kernel Library Link Line Advisor for the exact link line.
#include <stdio.h>
#include <direct.h>
#include <chrono>
#include <cmath>
//#define EIGEN_USE_BLAS
#include "Eigen/Dense"

using namespace std;
using namespace Eigen;

// Objective function type and Nelder-Mead minimizer adapted from R's /src/appl/optim.c.
typedef double optimfn(int n, const double *x0, void *ex);
void nmmin(int n, double *Bvec, double *X, double *Fmin, optimfn fminfn,
           int *fail, double abstol, double intol, void *ex,
           double alpha, double bet, double gamm, int trace,
           int *fncount, int maxit);

double Normal_PDF(double value)
{
    const double pi = 3.1415926535897932384626433832795;
    return exp(-value * value / 2) / sqrt(2 * pi);
}

double Normal_CDF(double value)
{
    return erfc(-value / sqrt(2)) / 2;
}

class BinaryHandwrittenDigit {
private:
    int nrow;
    int ncol;
    MatrixXd x_train;
    ArrayXd y_train;
    MatrixXd x_train_L2;
public:
    void read_data(); // definition omitted: reads the files exported from C#

    // Squared Euclidean distances between all pairs of training columns.
    void make_matrix_L2() {
        x_train_L2.resize(ncol, ncol);
        for (int j = 0; j < ncol; j++) {
            for (int i = 0; i < ncol; i++) {
                double t = 0;
                for (int k = 0; k < nrow; k++) t += (x_train(k, i) - x_train(k, j)) * (x_train(k, i) - x_train(k, j));
                x_train_L2(i, j) = t;
            }
        }
    }
    MatrixXd kernel_train(double k_sigma, double k_scale) {
        return (x_train_L2 / (-2 * k_scale * k_scale)).array().exp() * (k_sigma * k_sigma);
    }
    ArrayXd sigmoid(const ArrayXd& y, const ArrayXd& f) {
        ArrayXd result(y.size());
        for (int j = 0; j < y.size(); j++) {
            result(j) = Normal_CDF(y(j) * f(j));
        }
        return result;
    }
    ArrayXd sigmoid_log_1st(const ArrayXd& y, const ArrayXd& f, const ArrayXd& prob) {
        ArrayXd result(y.size());
        for (int j = 0; j < y.size(); j++) {
            result(j) = y(j) * Normal_PDF(f(j)) / prob(j);
        }
        return result;
    }
    ArrayXd sigmoid_log_2nd(const ArrayXd& y, const ArrayXd& f, const ArrayXd& prob) {
        ArrayXd result(y.size());
        for (int j = 0; j < y.size(); j++) {
            double a = Normal_PDF(f(j)) / prob(j);
            result(j) = a * (a + y(j) * f(j));
        }
        return result;
    }
    // Returns I + diag(b) * X * diag(b), built from a copy of X.
    MatrixXd bXbPlusOne(const MatrixXd& X, const ArrayXd& b)
    {
        int n = X.rows();
        MatrixXd result(X);
        for (int j = 0; j < n; j++)
        {
            for (int i = 0; i < n; i++)
            {
                result(i, j) *= b[i] * b[j];
            }
            result(j, j) += 1;
        }
        return result;
    }
    double get_evidence(const double* par) {
        double k_scale = exp(par[0]);
        double k_sigma = exp(par[1]);
        MatrixXd K11 = kernel_train(k_sigma, k_scale);
        int n = y_train.size();
        ArrayXd f = ArrayXd::LinSpaced(n, 0, 1);
        double evidence = 0;
        // Newton iteration for the posterior mode f-hat.
        while (true) {
            ArrayXd prob = sigmoid(y_train, f);
            ArrayXd V1 = sigmoid_log_1st(y_train, f, prob);
            ArrayXd V2 = sigmoid_log_2nd(y_train, f, prob);
            ArrayXd V2q = V2.cwiseMax(0.0).cwiseSqrt();
            MatrixXd B = bXbPlusOne(K11, V2q);
            ArrayXd b = V2 * f + V1;
            ArrayXd c = (K11 * VectorXd(b)).array() * V2q;
            LLT<MatrixXd> chol(B);
            ArrayXd a = b - chol.solve(VectorXd(c)).array() * V2q;
            ArrayXd f0 = f;
            f = K11 * VectorXd(a);
            if ((f - f0).abs().maxCoeff() < 1e-5) {
                // log|L| = log|B| / 2, since B = L * L^T.
                evidence = -(a * f).sum() / 2 + prob.log().sum() - log(chol.matrixL().determinant());
                break;
            }
        }
        return -evidence;
    }
public:
    static double get_evidence_optim(int n, const double *x, void *ex) {
        auto p = (BinaryHandwrittenDigit*)ex;
        return p->get_evidence(x);
    }
};

int main()
{
    auto sample = new BinaryHandwrittenDigit();
    sample->read_data();
    sample->make_matrix_L2();
    double ninf = -INFINITY;
    double eps = 2.220446e-16;
    ArrayXd xini(2); xini << 2.85, 2.35;
    ArrayXd xout(2);
    double fmin = 0.0;
    int fail = 0;
    double abstol = ninf;
    double intol = sqrt(eps);
    void *ex = sample;
    double alpha = 1.0;  // reflection coefficient
    double bet = 0.5;    // contraction coefficient
    double gamm = 2.0;   // expansion coefficient
    int trace = 0;
    int fncount = 0;
    int maxit = 500;
    auto start = chrono::system_clock::now();
    nmmin(xini.size(), xini.data(), xout.data(), &fmin, BinaryHandwrittenDigit::get_evidence_optim, &fail, abstol, intol, ex, alpha, bet, gamm, trace, &fncount, maxit);
    auto end = chrono::system_clock::now();
    long elapsed = chrono::duration_cast<chrono::milliseconds>(end - start).count();
    printf("fmin=%f, fncount=%d, fail=%d\n", fmin, fncount, fail);
    for (int i = 0; i < xout.size(); i++) printf("%d, %f\n", i, xout[i]);
    printf("%d msec\n", elapsed);
    delete sample;
}