More than 5 years have passed since last update.

Julia Advent Calendar 2018

JuliaとFortran(ifort+MKL)の速度比較をしてみた。

Julia

Last updated at 2018-12-02Posted at 2018-12-02

この記事はJulia Advent Calender 2018 3日目の記事です。

Julia 1.0は高速な言語と言われています。
それでは、Fortranと比べてみましょう。
どうせならIntel系CPU上のFortranで最速と言われているコンパイラであるIntel Fortran コンパイラifortと勝負をしてみたいと思います。数値計算では行列の演算が多いので、行列の演算に関する高速なライブラリであるMath Kernel Library (MKL)を使うことにしましょう。MKLは最近は無料となりました。

計算コード

計算に使うコードは、Green関数の行列要素を計算するパッケージRSCG.jl
https://github.com/cometscome/RSCG.jl
とします。これは、Green関数：

G(z) = [z I - H]^{-1}

の行列要素$G_{ij}(z)$を計算します。ここで、$H$は$N \times N$のエルミート（あるいは対称）行列です。$z$は複素数です。縮約シフト共役勾配法(RSCG法)では、複数の複素数$z$に対する$G_{ij}(z)$をCG法で同時に解くことができます。
なお、専門的な話をすると、RSCG法は$H$のクリロフ部分空間(あるベクトル$x$に対して$x,Hx,H^2x,H^3 x, \cdots $が張る空間)と$z I - H$のクリロフ部分空間が等しいことを利用することで、一つのCG法を使うことで複数の$z$に対する連立方程式$(z I - H) x = b$を解いています。

Juliaコード

今回使うRSCG.jlのコードは

module RSCG
    using LinearAlgebra
    using LinearMaps
    using SparseArrays

    export greensfunctions

    function greensfunctions(i::Integer,j::Integer,σ,A;eps=1e-12,maximumsteps=20000)

        M=length(σ)
        N=size(A,1)
        atype = eltype(A)
        σtype = eltype(σ)
        b = zeros(atype,N)
        b[j] = 1.0

    #--Line 2 in Table III.
        x = zeros(atype,N)
        r = copy(b)
        p = copy(b)
        αm = 1.0
        βm = 0.0

        m = 1
    #--
        Σ =zeros(atype,m)
        Σ[1] = b[i] #Line 3

        Θ = zeros(σtype,M)
        Π = ones(σtype,M,m)
        for mm=1:m
            Π[:,mm] *= Σ[mm]
        end
        ρk = ones(σtype,M)
        ρkm = copy(ρk)
        ρkp = copy(ρk)
        Ap = similar(p)

        for k=0:maximumsteps
            mul!(Ap,A,-p)
            #A_mul_B!(Ap,A,-p)
            rnorm = r'*r
            α = rnorm/(p'*Ap)
            x += α*p #Line 7
            r += -α*Ap #Line 8
            β = r'*r/rnorm #Line9
            p = r + β*p #Line 10

            Σ[1] = r[i] #Line 11

            for j = 1:M
                update = ifelse(abs(ρk[j]) > eps,true,false)
                if update
                    ρkp[j] = ρk[j]*ρkm[j]*αm/(ρkm[j]*αm*(1.0+α*σ[j])+α*βm*(ρkm[j]-ρk[j]))#Line 13
                    αkj = α*ρkp[j]/ρk[j]#Line 14
                    Θ[j] += αkj*Π[j,1] #Line 15
                    βkj = ((ρkp[j]/ρk[j])^2)*β #Line 16
                    Π[j,:] = ρkp[j]*Σ+ βkj*Π[j,:] #Line 17

                end
                ρkm[j] = ρk[j]
                ρk[j] = ρkp[j]
            end
            αm = α
            βm = β
            hi = real(rnorm)*maximum(abs.(ρk))
            if hi < eps
                return Θ
            end
        end


        println("After $maximumsteps steps, this was not converged")
        return Θ

    end
end

です。

Juliaコード `@inbounds`つき

@inboundsマクロをつけた方が速くなる、という話があるので、

@inbounds function greensfunctions_in(i::Integer,j::Integer,σ,A;eps=1e-12,maximumsteps=20000)
# 以下省略

というコードも用意しました。中身は全く変えていません。

Fortranコード

この二つのJuliaコードと勝負をするのは、Fortran90で書かれたコードで、

RSCG.f90

module rscg
contains
  subroutine greensfunctions(i,jj,vec_sigma,M,n,val,row,col,&
    val_l,row_l,col_l,eps,maximumsteps,Theta) bind(C, name="greensfunctions")
    implicit none
    integer(8),intent(in)::M,n
    complex(8),intent(in)::vec_sigma(1:M)
    integer(8),intent(in)::val_l,col_l,row_l
    complex(8),intent(in)::val(1:val_l)
    integer(8),intent(in)::col(1:col_l),row(1:row_l)
    integer(8),intent(in)::i,jj
    complex(8),intent(out)::Theta(1:M)
    real(8),intent(in),optional::eps
    integer(8),intent(in),optional::maximumsteps
    complex(8),allocatable::b(:),x(:),r(:),p(:),Ap(:)
    complex(8)::alpham,betam,Sigma
    complex(8),allocatable::Pi(:)
    complex(8)::rhok(1:M),rhokm(1:M),rhokp(1:M)
    integer::k,j
    real(8)::rnorm,hi
    complex(8)::alpha,beta,alphakj,betakj

    allocate(b(1:N))
    b = 0d0
    b(jj) = 1d0
    allocate(x(1:N))
    x = 0d0
    allocate(r(1:N))
    r = b
    allocate(p(1:N))
    p = b
    alpham = 1d0
    betam = 1d0
    Sigma = b(i)
    Theta = 0d0
    allocate(Pi(1:M))
    Pi = Sigma
    rhok = 1d0
    rhokm = rhok
    rhokp = rhok
    allocate(Ap(1:N))
    Ap = 0d0
    do k=0,maximumsteps
      call mkl_zcsrgemv("T", n, val, col, row, -p, Ap)
      rnorm = dot_product(r,r)
      alpha = rnorm/dot_product(p,Ap)
      x = x + alpha*p
      r = r -alpha*Ap
      beta = dot_product(r,r)/rnorm
      p = r + beta*p
      Sigma = r(i)
      do j=1,M
        if(abs(rhok(j))> eps) then
          rhokp(j) = rhok(j)*rhokm(j)*alpham/(rhokm(j)*alpham* &
            (1d0+alpha*vec_sigma(j))+alpha*betam*(rhokm(j)-rhok(j)))
          alphakj = alpha*rhokp(j)/rhok(j)
          Theta(j) = Theta(j) + alphakj*Pi(j)
          betakj = ((rhokp(j)/rhok(j))**2)*beta
          Pi(j) = rhokp(j)*Sigma+betakj*Pi(j)
        endif
        rhokm(j) = rhok(j)
        rhok(j) = rhokp(j)
      enddo
      alpham = alpha
      betam = beta
      hi = rnorm*maxval(abs(rhok))
      if (hi < eps) then
        return
      endif

    enddo
    write(*,*) "not converged"
    return
  end
end

です。疎行列とベクトル積を行うために、MKLのSparse BLAS level 2のライブラリmkl_zcsrgemvを呼び出しています。Juliaで呼び出すことを考え、転置Tをつけています。これは、Juliaの疎行列が行列要素を縦方向に格納した格納形式CSCとなっているからです。MKLのこのライブラリは行列要素を横方向に格納した格納形式CSRにのみ対応しているため、行列を転置することでJuliaから得た疎行列に対しても正しい結果となるようにしました。

このコードは、

ifort RSCG.f90 -o RSCG.so -shared -fPIC -lmkl_intel_ilp64 -lmkl_sequential -lmkl_core

としてifortとMKLでコンパイルし、RSCG.soというファイルにします。コンパイラのバージョンはifort 17.0.5です。
このコードをJuliaから読むには、

   function greensfunctions_fortran(i,j,σ,A)
        n = size(A,1)
        M = length(σ)
        Theta = zeros(ComplexF64,M)
        col = A.colptr
        val = A.nzval
        row = A.rowval
        val_l = length(val)
        row_l = length(row)
        col_l = length(col)
        eps = 1e-12
        maximumsteps = 20000
        ccall((:greensfunctions,"./RSCG.so"),Nothing,
        (Ref{Int64},#i
        Ref{Int64},#j
        Ref{ComplexF64}, #σ
        Ref{Int64},#M
        Ref{Int64},#n
        Ref{ComplexF64},#val
        Ref{Int64},#row
        Ref{Int64},#col
        Ref{Int64},#val_l
        Ref{Int64},#col_l
        Ref{Int64},#row_l
        Ref{Float64},#eps
        Ref{Int64},#maximumsteps
        Ref{ComplexF64}),#Theta
        i,j,σ,M,n,val,row,col,val_l,row_l,col_l,eps,maximumsteps,Theta)
        return Theta
    end

というfunctionを作っておきます。このfunctionを呼べば、コンパイルされたFortranコードが呼び出されます。

今回解く問題

今回解く問題として、複素数$z$を

M = 100
σ = zeros(ComplexF64,M)
σmin = -4.0*im-1.2
σmax = 4.0*im +4.3
for i=1:M
    σ[i] = (i-1)*(σmax-σmin)/(M-1) + σmin
end

として、100個用意します。ここではzではなく$\sigma$としています。複素平面上の適当な値であることがわかると思います。

行列$H$としては、

using SparseArrays
function make_mat(n)
    A = spzeros(ComplexF64,n,n)
    t = -1.0
    μ = -1.5
    for i=1:n
        dx = 1
        jp = i+dx
        jp += ifelse(jp > n,-n,0) #+1
        dx = -1
        jm = i+dx
        jm += ifelse(jm < 1,n,0) #-1
        A[i,jp] = t
        A[i,i] = -μ
        A[i,jm] = t
    end
    return A
end

n=100000
A1 = make_mat(n)

というものを用意します。これは、一次元の二階微分方程式を差分化した行列に似たものです。
要素数が非常に少ないので、疎行列として扱います。行列の次元は10万($n=100000$)としました。

ベンチマーク結果

ベンチマークは、Xeon E5-2680v3 12C 2.5GHzが搭載された計算ノードの上で1コアを使って実行しました。ベンチマークに使ったコードは

i = 1
j = 1
using .RSCG
for k=1:10
   println("Fortran")
   @time Gij1 = greensfunctions_fortran(i,j,σ,A1) 
   println("Julia")
   @time Gij2 = greensfunctions(i,j,σ,A1)
   println("Inbounds")
   @time Gij3 = greensfunctions_in(i,j,σ,A1) 
end

となります。Juliaは最初の1回目のfunctionの呼び出しが遅いことがある（コンパイルしているため）ので、10回呼び出して、2回目から9回目の平均を取ってみることにします。
1回目から10回目の出力結果は、

Fortran
  3.465828 seconds (87.82 k allocations: 4.613 MiB)
Julia
  5.107032 seconds (1.90 M allocations: 11.897 GiB, 11.10% gc time)
Inbounds
  4.101198 seconds (359.52 k allocations: 11.823 GiB, 9.74% gc time)
Fortran
  2.432493 seconds (1 allocation: 1.766 KiB)
Julia
  4.079534 seconds (64.66 k allocations: 11.809 GiB, 10.25% gc time)
Inbounds
  3.593802 seconds (64.66 k allocations: 11.809 GiB, 9.73% gc time)
Fortran
  2.425974 seconds (1 allocation: 1.766 KiB)
Julia
  3.598211 seconds (64.66 k allocations: 11.809 GiB, 9.71% gc time)
Inbounds
  3.583639 seconds (64.66 k allocations: 11.809 GiB, 9.75% gc time)
Fortran
  2.428606 seconds (1 allocation: 1.766 KiB)
Julia
  3.651773 seconds (64.66 k allocations: 11.809 GiB, 9.54% gc time)
Inbounds
  3.562352 seconds (64.66 k allocations: 11.809 GiB, 7.76% gc time)
Fortran
  2.424241 seconds (1 allocation: 1.766 KiB)
Julia
  3.691068 seconds (64.66 k allocations: 11.809 GiB, 8.18% gc time)
Inbounds
  4.495625 seconds (64.66 k allocations: 11.809 GiB, 9.66% gc time)
Fortran
  2.424586 seconds (1 allocation: 1.766 KiB)
Julia
  3.686324 seconds (64.66 k allocations: 11.809 GiB, 8.10% gc time)
Inbounds
  3.559118 seconds (64.66 k allocations: 11.809 GiB, 7.74% gc time)
Fortran
  2.426093 seconds (1 allocation: 1.766 KiB)
Julia
  3.549117 seconds (64.66 k allocations: 11.809 GiB, 7.75% gc time)
Inbounds
  3.561412 seconds (64.66 k allocations: 11.809 GiB, 7.80% gc time)
Fortran
  2.426092 seconds (1 allocation: 1.766 KiB)
Julia
  3.603876 seconds (64.66 k allocations: 11.809 GiB, 7.93% gc time)
Inbounds
  4.121707 seconds (64.66 k allocations: 11.809 GiB, 8.69% gc time)
Fortran
  2.422536 seconds (1 allocation: 1.766 KiB)
Julia
  4.044804 seconds (64.66 k allocations: 11.809 GiB, 8.57% gc time)
Inbounds
  4.044304 seconds (64.66 k allocations: 11.809 GiB, 8.52% gc time)
Fortran
  2.423250 seconds (1 allocation: 1.766 KiB)
Julia
  3.537327 seconds (64.66 k allocations: 11.809 GiB, 7.72% gc time)
Inbounds
  3.528620 seconds (64.66 k allocations: 11.809 GiB, 7.68% gc time)

となりました。

9回の平均は

Fortran: 2.1567356666666666 [sec]
Julia: 3.3227452222222222 [sec]
Julia with @inbounds: 3.3913287777777774 [sec]

となりました。あれ、@inboundsが思ったより遅いですね。付け方が悪かったのでしょうか？

結論

9回の平均を見ると、「Juliaコードはifort+MKLのFortranコードと比べて、約1.5倍遅い」
という結果になりました。ifort+MKLという、数値計算界最速のペアと比べてこの程度の遅さで済んでいるというのは、大健闘だと思います。
なお、実際に本気で重たい数値計算を行う場合には、JuliaからFortranコードを簡単に呼び出せるので、「重たい部分だけFortran」が良いと思われます。ただ、1.5倍程度であれば、数百CPUコア並列などを用いるスパコンレベルではなくPCレベルであれば、許容できる速度差ではないかなと思います。

ベンチマークに使ったコードはすべてここに書きましたので、Juliaコードをもっと速くするアイディアは常に募集中です。

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up