LoginSignup
4
3

More than 5 years have passed since last update.

Julia 1.1コードがgfortran+MKLに勝った

Posted at

"JuliaとFortran(ifort+MKL)の速度比較をしてみた。"
https://qiita.com/cometscome_phys/items/4c36576dd73e09e7a1e1
ではLinux OS上のifort+MKLに勝負を挑みましたが、今回はgfortran+MKLに勝負を挑んでみました。
特に今回は、配列の代入を少し変えることでJuliaの高速化ができましたので、そちらについて述べたいと思います。

環境

Julia: 1.1
gfortran: gcc version 8.2.0 (Homebrew GCC 8.2.0)
MKL: 2019年 Mac OS版

Mac OS: 10.14.3
MacBook Pro (13-inch, 2018, Four Thunderbolt 3 Ports)
2.7 GHz Intel Core i7

計算コード

計算に使うコードは、Green関数の行列要素を計算するパッケージRSCG.jl
https://github.com/cometscome/RSCG.jl
とします。詳しくは前回の記事を参照してください。

Juliaコード(前回版)

今回は二つのJuliaコードを用意しました。
一つ目は前回の記事で使ったもの

    function greensfunctions(i::Integer,j::Integer,σ,A;eps=1e-12,maximumsteps=20000)

        M=length(σ)
        N=size(A,1)
        atype = eltype(A)
        σtype = eltype(σ)
        b = zeros(atype,N)
        b[j] = 1.0

    #--Line 2 in Table III.
        x = zeros(atype,N)
        r = copy(b)
        p = copy(b)
        αm = 1.0
        βm = 0.0

        m = 1
    #--
        Σ =zeros(atype,m)
        Σ[1] = b[i] #Line 3

        Θ = zeros(σtype,M)
        Π = ones(σtype,M,m)
        for mm=1:m
            Π[:,mm] *= Σ[mm]
        end
        ρk = ones(σtype,M)
        ρkm = copy(ρk)
        ρkp = copy(ρk)
        Ap = similar(p)

        for k=0:maximumsteps
            mul!(Ap,A,-p)
            #A_mul_B!(Ap,A,-p)
            rnorm = r'*r
            α = rnorm/(p'*Ap)
            x += α*p #Line 7
            r += -α*Ap #Line 8
            β = r'*r/rnorm #Line9
            p = r + β*p #Line 10

            Σ[1] = r[i] #Line 11

            for j = 1:M
                update = ifelse(abs(ρk[j]) > eps,true,false)
                if update
                    ρkp[j] = ρk[j]*ρkm[j]*αm/(ρkm[j]*αm*(1.0+α*σ[j])+α*βm*(ρkm[j]-ρk[j]))#Line 13
                    αkj = α*ρkp[j]/ρk[j]#Line 14
                    Θ[j] += αkj*Π[j,1] #Line 15
                    βkj = ((ρkp[j]/ρk[j])^2)*β #Line 16
                    Π[j,:] = ρkp[j]*Σ+ βkj*Π[j,:] #Line 17

                end
                ρkm[j] = ρk[j]
                ρk[j] = ρkp[j]
            end
            αm = α
            βm = β
            hi = real(rnorm)*maximum(abs.(ρk))
            if hi < eps
                return Θ
            end
        end


        println("After $maximumsteps steps, this was not converged")
        return Θ

    end

です。

Juliaコード(高速化版)

今回新しく用意したコードは、

    @inbounds function greensfunctions_fast(i::Integer,j::Integer,σ,A;eps=1e-12,maximumsteps=20000)

        M=length(σ)
        N=size(A,1)
        atype = eltype(A)
        σtype = eltype(σ)
        b = zeros(atype,N)
        b[j] = 1.0

    #--Line 2 in Table III.
        x = zeros(atype,N)
        r = copy(b)
        p = copy(b)
        αm = 1.0
        βm = 0.0

        m = 1
    #--
        Σ =zeros(atype,m)
        Σ[1] = b[i] #Line 3

        Θ = zeros(σtype,M)
        Π = ones(σtype,M,m)
        for mm=1:m
            Π[:,mm] *= Σ[mm]
        end
        ρk = ones(σtype,M)
        ρkm = copy(ρk)
        ρkp = copy(ρk)
        Ap = similar(p)

        for k=0:maximumsteps            
            mul!(Ap,A,-p)
            #A_mul_B!(Ap,A,-p)
            rnorm = dot(r,r)#r'*r
            α = rnorm/dot(p,Ap) #(p'*Ap)
            @. x += α*p #Line 7            
            @. r += -α*Ap #Line 8
            β = dot(r,r)/rnorm #Line9
#            β = r'*r/rnorm #Line9
            @. p = r + β*p #Line 10

            Σ[1] = r[i] #Line 11

            @simd for j = 1:M
                if abs(ρk[j]) > eps

                    ρkp[j] = ρk[j]*ρkm[j]*αm/(ρkm[j]*αm*(1.0+α*σ[j])+α*βm*(ρkm[j]-ρk[j]))#Line 13
                    αkj = α*ρkp[j]/ρk[j]#Line 14
                    Θ[j] += αkj*Π[j,1] #Line 15
                    βkj = ((ρkp[j]/ρk[j])^2)*β #Line 16
                    Π[j,:] = ρkp[j]*Σ+ βkj*Π[j,:] #Line 17
                    ρkm[j] = ρk[j]
                    ρk[j] = ρkp[j]
                end

            end
            αm = α
            βm = β
            hi = real(rnorm)*maximum(abs.(ρk))
            if hi < eps
                return Θ
            end
        end


        println("After $maximumsteps steps, this was not converged")
        return Θ

    end

です。@simd@inboundsなどをつけましたが、結果的にはこれらはあまり速度に影響しませんでした。もっとも速度に影響したのは、ベクトルの足し算の部分につけた@.マクロでした。
これによって配列の確保が減ったようです。

Fortranコード

Fortranコードは前回と同じで、

module rscg
    contains
      subroutine greensfunctions(i,jj,vec_sigma,M,n,val,row,col,&
        val_l,row_l,col_l,eps,maximumsteps,Theta) bind(C, name="greensfunctions")
        implicit none
        integer(8),intent(in)::M,n
        complex(8),intent(in)::vec_sigma(1:M)
        integer(8),intent(in)::val_l,col_l,row_l
        complex(8),intent(in)::val(1:val_l)
        integer(8),intent(in)::col(1:col_l),row(1:row_l)
        integer(8),intent(in)::i,jj
        complex(8),intent(out)::Theta(1:M)
        real(8),intent(in),optional::eps
        integer(8),intent(in),optional::maximumsteps
        complex(8),allocatable::b(:),x(:),r(:),p(:),Ap(:)
        complex(8)::alpham,betam,Sigma
        complex(8),allocatable::Pi(:)
        complex(8)::rhok(1:M),rhokm(1:M),rhokp(1:M)
        integer::k,j
        real(8)::rnorm,hi
        complex(8)::alpha,beta,alphakj,betakj

        allocate(b(1:N))
        b = 0d0
        b(jj) = 1d0
        allocate(x(1:N))
        x = 0d0
        allocate(r(1:N))
        r = b
        allocate(p(1:N))
        p = b
        alpham = 1d0
        betam = 1d0
        Sigma = b(i)
        Theta = 0d0
        allocate(Pi(1:M))
        Pi = Sigma
        rhok = 1d0
        rhokm = rhok
        rhokp = rhok
        allocate(Ap(1:N))
        Ap = 0d0
        do k=0,maximumsteps
          call mkl_zcsrgemv("T", n, val, col, row, -p, Ap)
          rnorm = dot_product(r,r)
          alpha = rnorm/dot_product(p,Ap)
          x = x + alpha*p
          r = r -alpha*Ap
          beta = dot_product(r,r)/rnorm
          p = r + beta*p
          Sigma = r(i)
          do j=1,M
            if(abs(rhok(j))> eps) then
              rhokp(j) = rhok(j)*rhokm(j)*alpham/(rhokm(j)*alpham* &
                (1d0+alpha*vec_sigma(j))+alpha*betam*(rhokm(j)-rhok(j)))
              alphakj = alpha*rhokp(j)/rhok(j)
              Theta(j) = Theta(j) + alphakj*Pi(j)
              betakj = ((rhokp(j)/rhok(j))**2)*beta
              Pi(j) = rhokp(j)*Sigma+betakj*Pi(j)
            endif
            rhokm(j) = rhok(j)
            rhok(j) = rhokp(j)
          enddo
          alpham = alpha
          betam = beta
          hi = rnorm*maxval(abs(rhok))
          if (hi < eps) then
            return
          endif

        enddo
        write(*,*) "not converged"
        return
      end
    end

となります。MKLは疎行列ベクトル積にのみ使っています。
コンパイルにはgfortranを用いて、

gfortran  RSCG.f90 -o RSCG.so -shared -fPIC -lmkl_intel_ilp64 -lmkl_sequential -lmkl_core

としました。
これをJuliaから呼び出すためのコードも前回と同じで、

    function greensfunctions_fortran(i,j,σ,A)
        n = size(A,1)
        M = length(σ)
        Theta = zeros(ComplexF64,M)
        col = A.colptr
        val = A.nzval
        row = A.rowval
        val_l = length(val)
        row_l = length(row)
        col_l = length(col)
        eps = 1e-12
        maximumsteps = 20000
        ccall((:greensfunctions,"./RSCG.so"),Nothing,
        (Ref{Int64},#i
        Ref{Int64},#j
        Ref{ComplexF64}, 
        Ref{Int64},#M
        Ref{Int64},#n
        Ref{ComplexF64},#val
        Ref{Int64},#row
        Ref{Int64},#col
        Ref{Int64},#val_l
        Ref{Int64},#col_l
        Ref{Int64},#row_l
        Ref{Float64},#eps
        Ref{Int64},#maximumsteps
        Ref{ComplexF64}),#Theta
        i,j,σ,M,n,val,row,col,val_l,row_l,col_l,eps,maximumsteps,Theta)
        return Theta
    end

です。

全体のコード

その他のコードを含めたJuliaコードの全体は、

module RSCG
    using LinearAlgebra
    using LinearMaps
    using SparseArrays

    export greensfunctions,greensfunctions_fast,greensfunctions_fortran

    function greensfunctions(i::Integer,j::Integer,σ,A;eps=1e-12,maximumsteps=20000)

        M=length(σ)
        N=size(A,1)
        atype = eltype(A)
        σtype = eltype(σ)
        b = zeros(atype,N)
        b[j] = 1.0

    #--Line 2 in Table III.
        x = zeros(atype,N)
        r = copy(b)
        p = copy(b)
        αm = 1.0
        βm = 0.0

        m = 1
    #--
        Σ =zeros(atype,m)
        Σ[1] = b[i] #Line 3

        Θ = zeros(σtype,M)
        Π = ones(σtype,M,m)
        for mm=1:m
            Π[:,mm] *= Σ[mm]
        end
        ρk = ones(σtype,M)
        ρkm = copy(ρk)
        ρkp = copy(ρk)
        Ap = similar(p)

        for k=0:maximumsteps
            mul!(Ap,A,-p)
            #A_mul_B!(Ap,A,-p)
            rnorm = r'*r
            α = rnorm/(p'*Ap)
            x += α*p #Line 7
            r += -α*Ap #Line 8
            β = r'*r/rnorm #Line9
            p = r + β*p #Line 10

            Σ[1] = r[i] #Line 11

            for j = 1:M
                update = ifelse(abs(ρk[j]) > eps,true,false)
                if update
                    ρkp[j] = ρk[j]*ρkm[j]*αm/(ρkm[j]*αm*(1.0+α*σ[j])+α*βm*(ρkm[j]-ρk[j]))#Line 13
                    αkj = α*ρkp[j]/ρk[j]#Line 14
                    Θ[j] += αkj*Π[j,1] #Line 15
                    βkj = ((ρkp[j]/ρk[j])^2)*β #Line 16
                    Π[j,:] = ρkp[j]*Σ+ βkj*Π[j,:] #Line 17

                end
                ρkm[j] = ρk[j]
                ρk[j] = ρkp[j]
            end
            αm = α
            βm = β
            hi = real(rnorm)*maximum(abs.(ρk))
            if hi < eps
                return Θ
            end
        end


        println("After $maximumsteps steps, this was not converged")
        return Θ

    end

    @inbounds function greensfunctions_fast(i::Integer,j::Integer,σ,A;eps=1e-12,maximumsteps=20000)

        M=length(σ)
        N=size(A,1)
        atype = eltype(A)
        σtype = eltype(σ)
        b = zeros(atype,N)
        b[j] = 1.0

    #--Line 2 in Table III.
        x = zeros(atype,N)
        r = copy(b)
        p = copy(b)
        αm = 1.0
        βm = 0.0

        m = 1
    #--
        Σ =zeros(atype,m)
        Σ[1] = b[i] #Line 3

        Θ = zeros(σtype,M)
        Π = ones(σtype,M,m)
        for mm=1:m
            Π[:,mm] *= Σ[mm]
        end
        ρk = ones(σtype,M)
        ρkm = copy(ρk)
        ρkp = copy(ρk)
        Ap = similar(p)

        for k=0:maximumsteps            
            mul!(Ap,A,-p)
            #A_mul_B!(Ap,A,-p)
            rnorm = dot(r,r)#r'*r
            α = rnorm/dot(p,Ap) #(p'*Ap)
            @. x += α*p #Line 7            
            @. r += -α*Ap #Line 8
            β = dot(r,r)/rnorm #Line9
#            β = r'*r/rnorm #Line9
            @. p = r + β*p #Line 10

            Σ[1] = r[i] #Line 11

            @simd for j = 1:M
                if abs(ρk[j]) > eps

                    ρkp[j] = ρk[j]*ρkm[j]*αm/(ρkm[j]*αm*(1.0+α*σ[j])+α*βm*(ρkm[j]-ρk[j]))#Line 13
                    αkj = α*ρkp[j]/ρk[j]#Line 14
                    Θ[j] += αkj*Π[j,1] #Line 15
                    βkj = ((ρkp[j]/ρk[j])^2)*β #Line 16
                    Π[j,:] = ρkp[j]*Σ+ βkj*Π[j,:] #Line 17
                    ρkm[j] = ρk[j]
                    ρk[j] = ρkp[j]
                end

            end
            αm = α
            βm = β
            hi = real(rnorm)*maximum(abs.(ρk))
            if hi < eps
                return Θ
            end
        end


        println("After $maximumsteps steps, this was not converged")
        return Θ

    end

    function greensfunctions_fortran(i,j,σ,A)
        n = size(A,1)
        M = length(σ)
        Theta = zeros(ComplexF64,M)
        col = A.colptr
        val = A.nzval
        row = A.rowval
        val_l = length(val)
        row_l = length(row)
        col_l = length(col)
        eps = 1e-12
        maximumsteps = 20000
        ccall((:greensfunctions,"./RSCG.so"),Nothing,
        (Ref{Int64},#i
        Ref{Int64},#j
        Ref{ComplexF64}, 
        Ref{Int64},#M
        Ref{Int64},#n
        Ref{ComplexF64},#val
        Ref{Int64},#row
        Ref{Int64},#col
        Ref{Int64},#val_l
        Ref{Int64},#col_l
        Ref{Int64},#row_l
        Ref{Float64},#eps
        Ref{Int64},#maximumsteps
        Ref{ComplexF64}),#Theta
        i,j,σ,M,n,val,row,col,val_l,row_l,col_l,eps,maximumsteps,Theta)
        return Theta
    end

end

M = 100
σ = zeros(ComplexF64,M)
σmin = -4.0*im-1.2
σmax = 4.0*im +4.3
for i=1:M
    σ[i] = (i-1)*(σmax-σmin)/(M-1) + σmin
end

using SparseArrays
function make_mat(n)
    A = spzeros(ComplexF64,n,n)
    t = -1.0
    μ = -1.5
    for i=1:n
        dx = 1
        jp = i+dx
        jp += ifelse(jp > n,-n,0) #+1
        dx = -1
        jm = i+dx
        jm += ifelse(jm < 1,n,0) #-1
        A[i,jp] = t
        A[i,i] = -μ
        A[i,jm] = t
    end
    return A
end

n=100000
A1 = make_mat(n)


using .RSCG
i = 1
j = 1

for k=1:10
    println("Fortran")
    @time Gij1 = greensfunctions_fortran(i,j,σ,A1) 
    println("Julia")
    @time Gij2 = greensfunctions(i,j,σ,A1)
    println(sum(abs.(Gij2-Gij1))/sum(abs.(Gij1)))
    println("Julia fast")
    @time Gij3 = greensfunctions_fast(i,j,σ,A1)
    println(sum(abs.(Gij3-Gij1))/sum(abs.(Gij1)))
 end

となっています。10回実行して、それぞれどのくらいかかるかを調べます。

出力結果

出力結果は、

Fortran
  2.469293 seconds (79.79 k allocations: 4.210 MiB, 1.47% gc time)
Julia
  3.158820 seconds (1.93 M allocations: 11.899 GiB, 18.58% gc time)
1.044857504308614e-15
Julia fast
  1.673092 seconds (1.06 M allocations: 1.744 GiB, 6.02% gc time)
1.044857504308614e-15
Fortran
  2.335390 seconds (1 allocation: 1.766 KiB)
Julia
  2.703623 seconds (64.66 k allocations: 11.809 GiB, 21.05% gc time)
1.044857504308614e-15
Julia fast
  1.257365 seconds (47.69 k allocations: 1.697 GiB, 5.82% gc time)
1.044857504308614e-15
Fortran
  2.386221 seconds (1 allocation: 1.766 KiB)
Julia
  2.744226 seconds (64.66 k allocations: 11.809 GiB, 20.05% gc time)
1.044857504308614e-15
Julia fast
  1.246285 seconds (47.69 k allocations: 1.697 GiB, 5.96% gc time)
1.044857504308614e-15
Fortran
  2.395273 seconds (1 allocation: 1.766 KiB)
Julia
  2.656693 seconds (64.66 k allocations: 11.809 GiB, 20.62% gc time)
1.044857504308614e-15
Julia fast
  1.295450 seconds (47.69 k allocations: 1.697 GiB, 5.82% gc time)
1.044857504308614e-15
Fortran
  2.459751 seconds (1 allocation: 1.766 KiB)
Julia
  2.720672 seconds (64.66 k allocations: 11.809 GiB, 20.95% gc time)
1.044857504308614e-15
Julia fast
  1.325351 seconds (47.69 k allocations: 1.697 GiB, 5.51% gc time)
1.044857504308614e-15
Fortran
  2.566647 seconds (1 allocation: 1.766 KiB)
Julia
  2.697798 seconds (64.66 k allocations: 11.809 GiB, 19.86% gc time)
1.044857504308614e-15
Julia fast
  1.350757 seconds (47.69 k allocations: 1.697 GiB, 5.59% gc time)
1.044857504308614e-15
Fortran
  2.537515 seconds (1 allocation: 1.766 KiB)
Julia
  2.729953 seconds (64.66 k allocations: 11.809 GiB, 20.15% gc time)
1.044857504308614e-15
Julia fast
  1.354522 seconds (47.69 k allocations: 1.697 GiB, 5.72% gc time)
1.044857504308614e-15
Fortran
  2.708372 seconds (1 allocation: 1.766 KiB)
Julia
  2.747928 seconds (64.66 k allocations: 11.809 GiB, 20.74% gc time)
1.044857504308614e-15
Julia fast
  1.294631 seconds (47.69 k allocations: 1.697 GiB, 5.62% gc time)
1.044857504308614e-15
Fortran
  2.599797 seconds (1 allocation: 1.766 KiB)
Julia
  2.672015 seconds (64.66 k allocations: 11.809 GiB, 19.80% gc time)
1.044857504308614e-15
Julia fast
  1.272056 seconds (47.69 k allocations: 1.697 GiB, 5.67% gc time)
1.044857504308614e-15
Fortran
  2.603039 seconds (1 allocation: 1.766 KiB)
Julia
  2.708823 seconds (64.66 k allocations: 11.809 GiB, 20.01% gc time)
1.044857504308614e-15
Julia fast
  1.358685 seconds (47.69 k allocations: 1.697 GiB, 5.47% gc time)
1.044857504308614e-15

となりました。FortranとJuliaコードの値が倍精度実数の範囲で一致していることがわかります。
また、@.をつけた新しいJuliaコードはgfortran+MKLに大きく勝っていることがわかります。

まとめ

配列同士の足し算をするときは@.をつけよう

4
3
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
4
3