"JuliaとFortran(ifort+MKL)の速度比較をしてみた。"
https://qiita.com/cometscome_phys/items/4c36576dd73e09e7a1e1
ではLinux OS上のifort+MKLに勝負を挑みましたが、今回はgfortran+MKLに勝負を挑んでみました。
特に今回は、配列の代入を少し変えることでJuliaの高速化ができましたので、そちらについて述べたいと思います。

環境

Julia: 1.1
gfortran: gcc version 8.2.0 (Homebrew GCC 8.2.0)
MKL: 2019年　Mac OS版

Mac OS: 10.14.3
MacBook Pro (13-inch, 2018, Four Thunderbolt 3 Ports)
2.7 GHz Intel Core i7

計算コード

計算に使うコードは、Green関数の行列要素を計算するパッケージRSCG.jl
https://github.com/cometscome/RSCG.jl
とします。詳しくは前回の記事を参照してください。

Juliaコード（前回版）

今回は二つのJuliaコードを用意しました。
一つ目は前回の記事で使ったもの

    function greensfunctions(i::Integer,j::Integer,σ,A;eps=1e-12,maximumsteps=20000)

        M=length(σ)
        N=size(A,1)
        atype = eltype(A)
        σtype = eltype(σ)
        b = zeros(atype,N)
        b[j] = 1.0

    #--Line 2 in Table III.
        x = zeros(atype,N)
        r = copy(b)
        p = copy(b)
        αm = 1.0
        βm = 0.0

        m = 1
    #--
        Σ =zeros(atype,m)
        Σ[1] = b[i] #Line 3

        Θ = zeros(σtype,M)
        Π = ones(σtype,M,m)
        for mm=1:m
            Π[:,mm] *= Σ[mm]
        end
        ρk = ones(σtype,M)
        ρkm = copy(ρk)
        ρkp = copy(ρk)
        Ap = similar(p)

        for k=0:maximumsteps
            mul!(Ap,A,-p)
            #A_mul_B!(Ap,A,-p)
            rnorm = r'*r
            α = rnorm/(p'*Ap)
            x += α*p #Line 7
            r += -α*Ap #Line 8
            β = r'*r/rnorm #Line9
            p = r + β*p #Line 10

            Σ[1] = r[i] #Line 11

            for j = 1:M
                update = ifelse(abs(ρk[j]) > eps,true,false)
                if update
                    ρkp[j] = ρk[j]*ρkm[j]*αm/(ρkm[j]*αm*(1.0+α*σ[j])+α*βm*(ρkm[j]-ρk[j]))#Line 13
                    αkj = α*ρkp[j]/ρk[j]#Line 14
                    Θ[j] += αkj*Π[j,1] #Line 15
                    βkj = ((ρkp[j]/ρk[j])^2)*β #Line 16
                    Π[j,:] = ρkp[j]*Σ+ βkj*Π[j,:] #Line 17

                end
                ρkm[j] = ρk[j]
                ρk[j] = ρkp[j]
            end
            αm = α
            βm = β
            hi = real(rnorm)*maximum(abs.(ρk))
            if hi < eps
                return Θ
            end
        end


        println("After $maximumsteps steps, this was not converged")
        return Θ

    end

です。

Juliaコード（高速化版）

今回新しく用意したコードは、

    @inbounds function greensfunctions_fast(i::Integer,j::Integer,σ,A;eps=1e-12,maximumsteps=20000)

        M=length(σ)
        N=size(A,1)
        atype = eltype(A)
        σtype = eltype(σ)
        b = zeros(atype,N)
        b[j] = 1.0

    #--Line 2 in Table III.
        x = zeros(atype,N)
        r = copy(b)
        p = copy(b)
        αm = 1.0
        βm = 0.0

        m = 1
    #--
        Σ =zeros(atype,m)
        Σ[1] = b[i] #Line 3

        Θ = zeros(σtype,M)
        Π = ones(σtype,M,m)
        for mm=1:m
            Π[:,mm] *= Σ[mm]
        end
        ρk = ones(σtype,M)
        ρkm = copy(ρk)
        ρkp = copy(ρk)
        Ap = similar(p)

        for k=0:maximumsteps            
            mul!(Ap,A,-p)
            #A_mul_B!(Ap,A,-p)
            rnorm = dot(r,r)#r'*r
            α = rnorm/dot(p,Ap) #(p'*Ap)
            @. x += α*p #Line 7            
            @. r += -α*Ap #Line 8
            β = dot(r,r)/rnorm #Line9
#            β = r'*r/rnorm #Line9
            @. p = r + β*p #Line 10

            Σ[1] = r[i] #Line 11

            @simd for j = 1:M
                if abs(ρk[j]) > eps

                    ρkp[j] = ρk[j]*ρkm[j]*αm/(ρkm[j]*αm*(1.0+α*σ[j])+α*βm*(ρkm[j]-ρk[j]))#Line 13
                    αkj = α*ρkp[j]/ρk[j]#Line 14
                    Θ[j] += αkj*Π[j,1] #Line 15
                    βkj = ((ρkp[j]/ρk[j])^2)*β #Line 16
                    Π[j,:] = ρkp[j]*Σ+ βkj*Π[j,:] #Line 17
                    ρkm[j] = ρk[j]
                    ρk[j] = ρkp[j]
                end

            end
            αm = α
            βm = β
            hi = real(rnorm)*maximum(abs.(ρk))
            if hi < eps
                return Θ
            end
        end


        println("After $maximumsteps steps, this was not converged")
        return Θ

    end

です。@simdや@inboundsなどをつけましたが、結果的にはこれらはあまり速度に影響しませんでした。もっとも速度に影響したのは、ベクトルの足し算の部分につけた@.マクロでした。
これによって配列の確保が減ったようです。

Fortranコード

Fortranコードは前回と同じで、

module rscg
    contains
      subroutine greensfunctions(i,jj,vec_sigma,M,n,val,row,col,&
        val_l,row_l,col_l,eps,maximumsteps,Theta) bind(C, name="greensfunctions")
        implicit none
        integer(8),intent(in)::M,n
        complex(8),intent(in)::vec_sigma(1:M)
        integer(8),intent(in)::val_l,col_l,row_l
        complex(8),intent(in)::val(1:val_l)
        integer(8),intent(in)::col(1:col_l),row(1:row_l)
        integer(8),intent(in)::i,jj
        complex(8),intent(out)::Theta(1:M)
        real(8),intent(in),optional::eps
        integer(8),intent(in),optional::maximumsteps
        complex(8),allocatable::b(:),x(:),r(:),p(:),Ap(:)
        complex(8)::alpham,betam,Sigma
        complex(8),allocatable::Pi(:)
        complex(8)::rhok(1:M),rhokm(1:M),rhokp(1:M)
        integer::k,j
        real(8)::rnorm,hi
        complex(8)::alpha,beta,alphakj,betakj

        allocate(b(1:N))
        b = 0d0
        b(jj) = 1d0
        allocate(x(1:N))
        x = 0d0
        allocate(r(1:N))
        r = b
        allocate(p(1:N))
        p = b
        alpham = 1d0
        betam = 1d0
        Sigma = b(i)
        Theta = 0d0
        allocate(Pi(1:M))
        Pi = Sigma
        rhok = 1d0
        rhokm = rhok
        rhokp = rhok
        allocate(Ap(1:N))
        Ap = 0d0
        do k=0,maximumsteps
          call mkl_zcsrgemv("T", n, val, col, row, -p, Ap)
          rnorm = dot_product(r,r)
          alpha = rnorm/dot_product(p,Ap)
          x = x + alpha*p
          r = r -alpha*Ap
          beta = dot_product(r,r)/rnorm
          p = r + beta*p
          Sigma = r(i)
          do j=1,M
            if(abs(rhok(j))> eps) then
              rhokp(j) = rhok(j)*rhokm(j)*alpham/(rhokm(j)*alpham* &
                (1d0+alpha*vec_sigma(j))+alpha*betam*(rhokm(j)-rhok(j)))
              alphakj = alpha*rhokp(j)/rhok(j)
              Theta(j) = Theta(j) + alphakj*Pi(j)
              betakj = ((rhokp(j)/rhok(j))**2)*beta
              Pi(j) = rhokp(j)*Sigma+betakj*Pi(j)
            endif
            rhokm(j) = rhok(j)
            rhok(j) = rhokp(j)
          enddo
          alpham = alpha
          betam = beta
          hi = rnorm*maxval(abs(rhok))
          if (hi < eps) then
            return
          endif

        enddo
        write(*,*) "not converged"
        return
      end
    end

となります。MKLは疎行列ベクトル積にのみ使っています。
コンパイルにはgfortranを用いて、

gfortran  RSCG.f90 -o RSCG.so -shared -fPIC -lmkl_intel_ilp64 -lmkl_sequential -lmkl_core

としました。
これをJuliaから呼び出すためのコードも前回と同じで、

    function greensfunctions_fortran(i,j,σ,A)
        n = size(A,1)
        M = length(σ)
        Theta = zeros(ComplexF64,M)
        col = A.colptr
        val = A.nzval
        row = A.rowval
        val_l = length(val)
        row_l = length(row)
        col_l = length(col)
        eps = 1e-12
        maximumsteps = 20000
        ccall((:greensfunctions,"./RSCG.so"),Nothing,
        (Ref{Int64},#i
        Ref{Int64},#j
        Ref{ComplexF64}, #σ
        Ref{Int64},#M
        Ref{Int64},#n
        Ref{ComplexF64},#val
        Ref{Int64},#row
        Ref{Int64},#col
        Ref{Int64},#val_l
        Ref{Int64},#col_l
        Ref{Int64},#row_l
        Ref{Float64},#eps
        Ref{Int64},#maximumsteps
        Ref{ComplexF64}),#Theta
        i,j,σ,M,n,val,row,col,val_l,row_l,col_l,eps,maximumsteps,Theta)
        return Theta
    end

です。

全体のコード

その他のコードを含めたJuliaコードの全体は、

module RSCG
    using LinearAlgebra
    using LinearMaps
    using SparseArrays

    export greensfunctions,greensfunctions_fast,greensfunctions_fortran

    function greensfunctions(i::Integer,j::Integer,σ,A;eps=1e-12,maximumsteps=20000)

        M=length(σ)
        N=size(A,1)
        atype = eltype(A)
        σtype = eltype(σ)
        b = zeros(atype,N)
        b[j] = 1.0

    #--Line 2 in Table III.
        x = zeros(atype,N)
        r = copy(b)
        p = copy(b)
        αm = 1.0
        βm = 0.0

        m = 1
    #--
        Σ =zeros(atype,m)
        Σ[1] = b[i] #Line 3

        Θ = zeros(σtype,M)
        Π = ones(σtype,M,m)
        for mm=1:m
            Π[:,mm] *= Σ[mm]
        end
        ρk = ones(σtype,M)
        ρkm = copy(ρk)
        ρkp = copy(ρk)
        Ap = similar(p)

        for k=0:maximumsteps
            mul!(Ap,A,-p)
            #A_mul_B!(Ap,A,-p)
            rnorm = r'*r
            α = rnorm/(p'*Ap)
            x += α*p #Line 7
            r += -α*Ap #Line 8
            β = r'*r/rnorm #Line9
            p = r + β*p #Line 10

            Σ[1] = r[i] #Line 11

            for j = 1:M
                update = ifelse(abs(ρk[j]) > eps,true,false)
                if update
                    ρkp[j] = ρk[j]*ρkm[j]*αm/(ρkm[j]*αm*(1.0+α*σ[j])+α*βm*(ρkm[j]-ρk[j]))#Line 13
                    αkj = α*ρkp[j]/ρk[j]#Line 14
                    Θ[j] += αkj*Π[j,1] #Line 15
                    βkj = ((ρkp[j]/ρk[j])^2)*β #Line 16
                    Π[j,:] = ρkp[j]*Σ+ βkj*Π[j,:] #Line 17

                end
                ρkm[j] = ρk[j]
                ρk[j] = ρkp[j]
            end
            αm = α
            βm = β
            hi = real(rnorm)*maximum(abs.(ρk))
            if hi < eps
                return Θ
            end
        end


        println("After $maximumsteps steps, this was not converged")
        return Θ

    end

    @inbounds function greensfunctions_fast(i::Integer,j::Integer,σ,A;eps=1e-12,maximumsteps=20000)

        M=length(σ)
        N=size(A,1)
        atype = eltype(A)
        σtype = eltype(σ)
        b = zeros(atype,N)
        b[j] = 1.0

    #--Line 2 in Table III.
        x = zeros(atype,N)
        r = copy(b)
        p = copy(b)
        αm = 1.0
        βm = 0.0

        m = 1
    #--
        Σ =zeros(atype,m)
        Σ[1] = b[i] #Line 3

        Θ = zeros(σtype,M)
        Π = ones(σtype,M,m)
        for mm=1:m
            Π[:,mm] *= Σ[mm]
        end
        ρk = ones(σtype,M)
        ρkm = copy(ρk)
        ρkp = copy(ρk)
        Ap = similar(p)

        for k=0:maximumsteps            
            mul!(Ap,A,-p)
            #A_mul_B!(Ap,A,-p)
            rnorm = dot(r,r)#r'*r
            α = rnorm/dot(p,Ap) #(p'*Ap)
            @. x += α*p #Line 7            
            @. r += -α*Ap #Line 8
            β = dot(r,r)/rnorm #Line9
#            β = r'*r/rnorm #Line9
            @. p = r + β*p #Line 10

            Σ[1] = r[i] #Line 11

            @simd for j = 1:M
                if abs(ρk[j]) > eps

                    ρkp[j] = ρk[j]*ρkm[j]*αm/(ρkm[j]*αm*(1.0+α*σ[j])+α*βm*(ρkm[j]-ρk[j]))#Line 13
                    αkj = α*ρkp[j]/ρk[j]#Line 14
                    Θ[j] += αkj*Π[j,1] #Line 15
                    βkj = ((ρkp[j]/ρk[j])^2)*β #Line 16
                    Π[j,:] = ρkp[j]*Σ+ βkj*Π[j,:] #Line 17
                    ρkm[j] = ρk[j]
                    ρk[j] = ρkp[j]
                end

            end
            αm = α
            βm = β
            hi = real(rnorm)*maximum(abs.(ρk))
            if hi < eps
                return Θ
            end
        end


        println("After $maximumsteps steps, this was not converged")
        return Θ

    end

    function greensfunctions_fortran(i,j,σ,A)
        n = size(A,1)
        M = length(σ)
        Theta = zeros(ComplexF64,M)
        col = A.colptr
        val = A.nzval
        row = A.rowval
        val_l = length(val)
        row_l = length(row)
        col_l = length(col)
        eps = 1e-12
        maximumsteps = 20000
        ccall((:greensfunctions,"./RSCG.so"),Nothing,
        (Ref{Int64},#i
        Ref{Int64},#j
        Ref{ComplexF64}, #σ
        Ref{Int64},#M
        Ref{Int64},#n
        Ref{ComplexF64},#val
        Ref{Int64},#row
        Ref{Int64},#col
        Ref{Int64},#val_l
        Ref{Int64},#col_l
        Ref{Int64},#row_l
        Ref{Float64},#eps
        Ref{Int64},#maximumsteps
        Ref{ComplexF64}),#Theta
        i,j,σ,M,n,val,row,col,val_l,row_l,col_l,eps,maximumsteps,Theta)
        return Theta
    end

end

M = 100
σ = zeros(ComplexF64,M)
σmin = -4.0*im-1.2
σmax = 4.0*im +4.3
for i=1:M
    σ[i] = (i-1)*(σmax-σmin)/(M-1) + σmin
end

using SparseArrays
function make_mat(n)
    A = spzeros(ComplexF64,n,n)
    t = -1.0
    μ = -1.5
    for i=1:n
        dx = 1
        jp = i+dx
        jp += ifelse(jp > n,-n,0) #+1
        dx = -1
        jm = i+dx
        jm += ifelse(jm < 1,n,0) #-1
        A[i,jp] = t
        A[i,i] = -μ
        A[i,jm] = t
    end
    return A
end

n=100000
A1 = make_mat(n)


using .RSCG
i = 1
j = 1

for k=1:10
    println("Fortran")
    @time Gij1 = greensfunctions_fortran(i,j,σ,A1) 
    println("Julia")
    @time Gij2 = greensfunctions(i,j,σ,A1)
    println(sum(abs.(Gij2-Gij1))/sum(abs.(Gij1)))
    println("Julia fast")
    @time Gij3 = greensfunctions_fast(i,j,σ,A1)
    println(sum(abs.(Gij3-Gij1))/sum(abs.(Gij1)))
 end

となっています。10回実行して、それぞれどのくらいかかるかを調べます。

出力結果

出力結果は、

Fortran
  2.469293 seconds (79.79 k allocations: 4.210 MiB, 1.47% gc time)
Julia
  3.158820 seconds (1.93 M allocations: 11.899 GiB, 18.58% gc time)
1.044857504308614e-15
Julia fast
  1.673092 seconds (1.06 M allocations: 1.744 GiB, 6.02% gc time)
1.044857504308614e-15
Fortran
  2.335390 seconds (1 allocation: 1.766 KiB)
Julia
  2.703623 seconds (64.66 k allocations: 11.809 GiB, 21.05% gc time)
1.044857504308614e-15
Julia fast
  1.257365 seconds (47.69 k allocations: 1.697 GiB, 5.82% gc time)
1.044857504308614e-15
Fortran
  2.386221 seconds (1 allocation: 1.766 KiB)
Julia
  2.744226 seconds (64.66 k allocations: 11.809 GiB, 20.05% gc time)
1.044857504308614e-15
Julia fast
  1.246285 seconds (47.69 k allocations: 1.697 GiB, 5.96% gc time)
1.044857504308614e-15
Fortran
  2.395273 seconds (1 allocation: 1.766 KiB)
Julia
  2.656693 seconds (64.66 k allocations: 11.809 GiB, 20.62% gc time)
1.044857504308614e-15
Julia fast
  1.295450 seconds (47.69 k allocations: 1.697 GiB, 5.82% gc time)
1.044857504308614e-15
Fortran
  2.459751 seconds (1 allocation: 1.766 KiB)
Julia
  2.720672 seconds (64.66 k allocations: 11.809 GiB, 20.95% gc time)
1.044857504308614e-15
Julia fast
  1.325351 seconds (47.69 k allocations: 1.697 GiB, 5.51% gc time)
1.044857504308614e-15
Fortran
  2.566647 seconds (1 allocation: 1.766 KiB)
Julia
  2.697798 seconds (64.66 k allocations: 11.809 GiB, 19.86% gc time)
1.044857504308614e-15
Julia fast
  1.350757 seconds (47.69 k allocations: 1.697 GiB, 5.59% gc time)
1.044857504308614e-15
Fortran
  2.537515 seconds (1 allocation: 1.766 KiB)
Julia
  2.729953 seconds (64.66 k allocations: 11.809 GiB, 20.15% gc time)
1.044857504308614e-15
Julia fast
  1.354522 seconds (47.69 k allocations: 1.697 GiB, 5.72% gc time)
1.044857504308614e-15
Fortran
  2.708372 seconds (1 allocation: 1.766 KiB)
Julia
  2.747928 seconds (64.66 k allocations: 11.809 GiB, 20.74% gc time)
1.044857504308614e-15
Julia fast
  1.294631 seconds (47.69 k allocations: 1.697 GiB, 5.62% gc time)
1.044857504308614e-15
Fortran
  2.599797 seconds (1 allocation: 1.766 KiB)
Julia
  2.672015 seconds (64.66 k allocations: 11.809 GiB, 19.80% gc time)
1.044857504308614e-15
Julia fast
  1.272056 seconds (47.69 k allocations: 1.697 GiB, 5.67% gc time)
1.044857504308614e-15
Fortran
  2.603039 seconds (1 allocation: 1.766 KiB)
Julia
  2.708823 seconds (64.66 k allocations: 11.809 GiB, 20.01% gc time)
1.044857504308614e-15
Julia fast
  1.358685 seconds (47.69 k allocations: 1.697 GiB, 5.47% gc time)
1.044857504308614e-15

となりました。FortranとJuliaコードの値が倍精度実数の範囲で一致していることがわかります。
また、@.をつけた新しいJuliaコードはgfortran+MKLに大きく勝っていることがわかります。

まとめ

配列同士の足し算をするときは@.をつけよう

Julia 1.1コードがgfortran+MKLに勝った

環境