Fortran
Julia

Julia 1.1コードがgfortran+MKLに勝った

"JuliaとFortran(ifort+MKL)の速度比較をしてみた。"

https://qiita.com/cometscome_phys/items/4c36576dd73e09e7a1e1

ではLinux OS上のifort+MKLに勝負を挑みましたが、今回はgfortran+MKLに勝負を挑んでみました。

特に今回は、配列の代入を少し変えることでJuliaの高速化ができましたので、そちらについて述べたいと思います。


環境

Julia: 1.1

gfortran: gcc version 8.2.0 (Homebrew GCC 8.2.0)

MKL: 2019年 Mac OS版

Mac OS: 10.14.3

MacBook Pro (13-inch, 2018, Four Thunderbolt 3 Ports)

2.7 GHz Intel Core i7


計算コード

計算に使うコードは、Green関数の行列要素を計算するパッケージRSCG.jl

https://github.com/cometscome/RSCG.jl

とします。詳しくは前回の記事を参照してください。


Juliaコード(前回版)

今回は二つのJuliaコードを用意しました。

一つ目は前回の記事で使ったもの

    function greensfunctions(i::Integer,j::Integer,σ,A;eps=1e-12,maximumsteps=20000)

M=length(σ)
N=size(A,1)
atype = eltype(A)
σtype = eltype(σ)
b = zeros(atype,N)
b[j] = 1.0

#--Line 2 in Table III.
x = zeros(atype,N)
r = copy(b)
p = copy(b)
αm = 1.0
βm = 0.0

m = 1
#--
Σ =zeros(atype,m)
Σ[1] = b[i] #Line 3

Θ = zeros(σtype,M)
Π = ones(σtype,M,m)
for mm=1:m
Π[:,mm] *= Σ[mm]
end
ρk = ones(σtype,M)
ρkm = copy(ρk)
ρkp = copy(ρk)
Ap = similar(p)

for k=0:maximumsteps
mul!(Ap,A,-p)
#A_mul_B!(Ap,A,-p)
rnorm = r'*r
α = rnorm/(p'*Ap)
x += α*p #Line 7
r += -α*Ap #Line 8
β = r'*r/rnorm #Line9
p = r + β*p #Line 10

Σ[1] = r[i] #Line 11

for j = 1:M
update = ifelse(abs(ρk[j]) > eps,true,false)
if update
ρkp[j] = ρk[j]*ρkm[j]*αm/(ρkm[j]*αm*(1.0+α*σ[j])+α*βm*(ρkm[j]-ρk[j]))#Line 13
αkj = α*ρkp[j]/ρk[j]#Line 14
Θ[j] += αkj*Π[j,1] #Line 15
βkj = ((ρkp[j]/ρk[j])^2)*β #Line 16
Π[j,:] = ρkp[j]*Σ+ βkj*Π[j,:] #Line 17

end
ρkm[j] = ρk[j]
ρk[j] = ρkp[j]
end
αm = α
βm = β
hi = real(rnorm)*maximum(abs.(ρk))
if hi < eps
return Θ
end
end

println("After $maximumsteps steps, this was not converged")
return Θ

end

です。


Juliaコード(高速化版)

今回新しく用意したコードは、

    @inbounds function greensfunctions_fast(i::Integer,j::Integer,σ,A;eps=1e-12,maximumsteps=20000)

M=length(σ)
N=size(A,1)
atype = eltype(A)
σtype = eltype(σ)
b = zeros(atype,N)
b[j] = 1.0

#--Line 2 in Table III.
x = zeros(atype,N)
r = copy(b)
p = copy(b)
αm = 1.0
βm = 0.0

m = 1
#--
Σ =zeros(atype,m)
Σ[1] = b[i] #Line 3

Θ = zeros(σtype,M)
Π = ones(σtype,M,m)
for mm=1:m
Π[:,mm] *= Σ[mm]
end
ρk = ones(σtype,M)
ρkm = copy(ρk)
ρkp = copy(ρk)
Ap = similar(p)

for k=0:maximumsteps
mul!(Ap,A,-p)
#A_mul_B!(Ap,A,-p)
rnorm = dot(r,r)#r'*r
α = rnorm/dot(p,Ap) #(p'*Ap)
@. x += α*p #Line 7
@. r += -α*Ap #Line 8
β = dot(r,r)/rnorm #Line9
# β = r'*r/rnorm #Line9
@. p = r + β*p #Line 10

Σ[1] = r[i] #Line 11

@simd for j = 1:M
if abs(ρk[j]) > eps

ρkp[j] = ρk[j]*ρkm[j]*αm/(ρkm[j]*αm*(1.0+α*σ[j])+α*βm*(ρkm[j]-ρk[j]))#Line 13
αkj = α*ρkp[j]/ρk[j]#Line 14
Θ[j] += αkj*Π[j,1] #Line 15
βkj = ((ρkp[j]/ρk[j])^2)*β #Line 16
Π[j,:] = ρkp[j]*Σ+ βkj*Π[j,:] #Line 17
ρkm[j] = ρk[j]
ρk[j] = ρkp[j]
end

end
αm = α
βm = β
hi = real(rnorm)*maximum(abs.(ρk))
if hi < eps
return Θ
end
end

println("After $maximumsteps steps, this was not converged")
return Θ

end

です。@simd@inboundsなどをつけましたが、結果的にはこれらはあまり速度に影響しませんでした。もっとも速度に影響したのは、ベクトルの足し算の部分につけた@.マクロでした。

これによって配列の確保が減ったようです。


Fortranコード

Fortranコードは前回と同じで、

module rscg

contains
subroutine greensfunctions(i,jj,vec_sigma,M,n,val,row,col,&
val_l,row_l,col_l,eps,maximumsteps,Theta) bind(C, name="greensfunctions")
implicit none
integer(8),intent(in)::M,n
complex(8),intent(in)::vec_sigma(1:M)
integer(8),intent(in)::val_l,col_l,row_l
complex(8),intent(in)::val(1:val_l)
integer(8),intent(in)::col(1:col_l),row(1:row_l)
integer(8),intent(in)::i,jj
complex(8),intent(out)::Theta(1:M)
real(8),intent(in),optional::eps
integer(8),intent(in),optional::maximumsteps
complex(8),allocatable::b(:),x(:),r(:),p(:),Ap(:)
complex(8)::alpham,betam,Sigma
complex(8),allocatable::Pi(:)
complex(8)::rhok(1:M),rhokm(1:M),rhokp(1:M)
integer::k,j
real(8)::rnorm,hi
complex(8)::alpha,beta,alphakj,betakj

allocate(b(1:N))
b = 0d0
b(jj) = 1d0
allocate(x(1:N))
x = 0d0
allocate(r(1:N))
r = b
allocate(p(1:N))
p = b
alpham = 1d0
betam = 1d0
Sigma = b(i)
Theta = 0d0
allocate(Pi(1:M))
Pi = Sigma
rhok = 1d0
rhokm = rhok
rhokp = rhok
allocate(Ap(1:N))
Ap = 0d0
do k=0,maximumsteps
call mkl_zcsrgemv("T", n, val, col, row, -p, Ap)
rnorm = dot_product(r,r)
alpha = rnorm/dot_product(p,Ap)
x = x + alpha*p
r = r -alpha*Ap
beta = dot_product(r,r)/rnorm
p = r + beta*p
Sigma = r(i)
do j=1,M
if(abs(rhok(j))> eps) then
rhokp(j) = rhok(j)*rhokm(j)*alpham/(rhokm(j)*alpham* &
(1d0+alpha*vec_sigma(j))+alpha*betam*(rhokm(j)-rhok(j)))
alphakj = alpha*rhokp(j)/rhok(j)
Theta(j) = Theta(j) + alphakj*Pi(j)
betakj = ((rhokp(j)/rhok(j))**2)*beta
Pi(j) = rhokp(j)*Sigma+betakj*Pi(j)
endif
rhokm(j) = rhok(j)
rhok(j) = rhokp(j)
enddo
alpham = alpha
betam = beta
hi = rnorm*maxval(abs(rhok))
if (hi < eps) then
return
endif

enddo
write(*,*) "not converged"
return
end
end

となります。MKLは疎行列ベクトル積にのみ使っています。

コンパイルにはgfortranを用いて、

gfortran  RSCG.f90 -o RSCG.so -shared -fPIC -lmkl_intel_ilp64 -lmkl_sequential -lmkl_core

としました。

これをJuliaから呼び出すためのコードも前回と同じで、

    function greensfunctions_fortran(i,j,σ,A)

n = size(A,1)
M = length(σ)
Theta = zeros(ComplexF64,M)
col = A.colptr
val = A.nzval
row = A.rowval
val_l = length(val)
row_l = length(row)
col_l = length(col)
eps = 1e-12
maximumsteps = 20000
ccall((:greensfunctions,"./RSCG.so"),Nothing,
(Ref{Int64},#i
Ref{Int64},#j
Ref{ComplexF64},
Ref{Int64},#M
Ref{Int64},#n
Ref{ComplexF64},#val
Ref{Int64},#row
Ref{Int64},#col
Ref{Int64},#val_l
Ref{Int64},#col_l
Ref{Int64},#row_l
Ref{Float64},#eps
Ref{Int64},#maximumsteps
Ref{ComplexF64}),#Theta
i,j,σ,M,n,val,row,col,val_l,row_l,col_l,eps,maximumsteps,Theta)
return Theta
end

です。


全体のコード

その他のコードを含めたJuliaコードの全体は、

module RSCG

using LinearAlgebra
using LinearMaps
using SparseArrays

export greensfunctions,greensfunctions_fast,greensfunctions_fortran

function greensfunctions(i::Integer,j::Integer,σ,A;eps=1e-12,maximumsteps=20000)

M=length(σ)
N=size(A,1)
atype = eltype(A)
σtype = eltype(σ)
b = zeros(atype,N)
b[j] = 1.0

#--Line 2 in Table III.
x = zeros(atype,N)
r = copy(b)
p = copy(b)
αm = 1.0
βm = 0.0

m = 1
#--
Σ =zeros(atype,m)
Σ[1] = b[i] #Line 3

Θ = zeros(σtype,M)
Π = ones(σtype,M,m)
for mm=1:m
Π[:,mm] *= Σ[mm]
end
ρk = ones(σtype,M)
ρkm = copy(ρk)
ρkp = copy(ρk)
Ap = similar(p)

for k=0:maximumsteps
mul!(Ap,A,-p)
#A_mul_B!(Ap,A,-p)
rnorm = r'*r
α = rnorm/(p'*Ap)
x += α*p #Line 7
r += -α*Ap #Line 8
β = r'*r/rnorm #Line9
p = r + β*p #Line 10

Σ[1] = r[i] #Line 11

for j = 1:M
update = ifelse(abs(ρk[j]) > eps,true,false)
if update
ρkp[j] = ρk[j]*ρkm[j]*αm/(ρkm[j]*αm*(1.0+α*σ[j])+α*βm*(ρkm[j]-ρk[j]))#Line 13
αkj = α*ρkp[j]/ρk[j]#Line 14
Θ[j] += αkj*Π[j,1] #Line 15
βkj = ((ρkp[j]/ρk[j])^2)*β #Line 16
Π[j,:] = ρkp[j]*Σ+ βkj*Π[j,:] #Line 17

end
ρkm[j] = ρk[j]
ρk[j] = ρkp[j]
end
αm = α
βm = β
hi = real(rnorm)*maximum(abs.(ρk))
if hi < eps
return Θ
end
end

println("After $maximumsteps steps, this was not converged")
return Θ

end

@inbounds function greensfunctions_fast(i::Integer,j::Integer,σ,A;eps=1e-12,maximumsteps=20000)

M=length(σ)
N=size(A,1)
atype = eltype(A)
σtype = eltype(σ)
b = zeros(atype,N)
b[j] = 1.0

#--Line 2 in Table III.
x = zeros(atype,N)
r = copy(b)
p = copy(b)
αm = 1.0
βm = 0.0

m = 1
#--
Σ =zeros(atype,m)
Σ[1] = b[i] #Line 3

Θ = zeros(σtype,M)
Π = ones(σtype,M,m)
for mm=1:m
Π[:,mm] *= Σ[mm]
end
ρk = ones(σtype,M)
ρkm = copy(ρk)
ρkp = copy(ρk)
Ap = similar(p)

for k=0:maximumsteps
mul!(Ap,A,-p)
#A_mul_B!(Ap,A,-p)
rnorm = dot(r,r)#r'*r
α = rnorm/dot(p,Ap) #(p'*Ap)
@. x += α*p #Line 7
@. r += -α*Ap #Line 8
β = dot(r,r)/rnorm #Line9
# β = r'*r/rnorm #Line9
@. p = r + β*p #Line 10

Σ[1] = r[i] #Line 11

@simd for j = 1:M
if abs(ρk[j]) > eps

ρkp[j] = ρk[j]*ρkm[j]*αm/(ρkm[j]*αm*(1.0+α*σ[j])+α*βm*(ρkm[j]-ρk[j]))#Line 13
αkj = α*ρkp[j]/ρk[j]#Line 14
Θ[j] += αkj*Π[j,1] #Line 15
βkj = ((ρkp[j]/ρk[j])^2)*β #Line 16
Π[j,:] = ρkp[j]*Σ+ βkj*Π[j,:] #Line 17
ρkm[j] = ρk[j]
ρk[j] = ρkp[j]
end

end
αm = α
βm = β
hi = real(rnorm)*maximum(abs.(ρk))
if hi < eps
return Θ
end
end

println("After $maximumsteps steps, this was not converged")
return Θ

end

function greensfunctions_fortran(i,j,σ,A)
n = size(A,1)
M = length(σ)
Theta = zeros(ComplexF64,M)
col = A.colptr
val = A.nzval
row = A.rowval
val_l = length(val)
row_l = length(row)
col_l = length(col)
eps = 1e-12
maximumsteps = 20000
ccall((:greensfunctions,"./RSCG.so"),Nothing,
(Ref{Int64},#i
Ref{Int64},#j
Ref{ComplexF64},
Ref{Int64},#M
Ref{Int64},#n
Ref{ComplexF64},#val
Ref{Int64},#row
Ref{Int64},#col
Ref{Int64},#val_l
Ref{Int64},#col_l
Ref{Int64},#row_l
Ref{Float64},#eps
Ref{Int64},#maximumsteps
Ref{ComplexF64}),#Theta
i,j,σ,M,n,val,row,col,val_l,row_l,col_l,eps,maximumsteps,Theta)
return Theta
end

end

M = 100
σ = zeros(ComplexF64,M)
σmin = -4.0*im-1.2
σmax = 4.0*im +4.3
for i=1:M
σ[i] = (i-1)*(σmax-σmin)/(M-1) + σmin
end

using SparseArrays
function make_mat(n)
A = spzeros(ComplexF64,n,n)
t = -1.0
μ = -1.5
for i=1:n
dx = 1
jp = i+dx
jp += ifelse(jp > n,-n,0) #+1
dx = -1
jm = i+dx
jm += ifelse(jm < 1,n,0) #-1
A[i,jp] = t
A[i,i] = -μ
A[i,jm] = t
end
return A
end

n=100000
A1 = make_mat(n)

using .RSCG
i = 1
j = 1

for k=1:10
println("Fortran")
@time Gij1 = greensfunctions_fortran(i,j,σ,A1)
println("Julia")
@time Gij2 = greensfunctions(i,j,σ,A1)
println(sum(abs.(Gij2-Gij1))/sum(abs.(Gij1)))
println("Julia fast")
@time Gij3 = greensfunctions_fast(i,j,σ,A1)
println(sum(abs.(Gij3-Gij1))/sum(abs.(Gij1)))
end

となっています。10回実行して、それぞれどのくらいかかるかを調べます。


出力結果

出力結果は、

Fortran

2.469293 seconds (79.79 k allocations: 4.210 MiB, 1.47% gc time)
Julia
3.158820 seconds (1.93 M allocations: 11.899 GiB, 18.58% gc time)
1.044857504308614e-15
Julia fast
1.673092 seconds (1.06 M allocations: 1.744 GiB, 6.02% gc time)
1.044857504308614e-15
Fortran
2.335390 seconds (1 allocation: 1.766 KiB)
Julia
2.703623 seconds (64.66 k allocations: 11.809 GiB, 21.05% gc time)
1.044857504308614e-15
Julia fast
1.257365 seconds (47.69 k allocations: 1.697 GiB, 5.82% gc time)
1.044857504308614e-15
Fortran
2.386221 seconds (1 allocation: 1.766 KiB)
Julia
2.744226 seconds (64.66 k allocations: 11.809 GiB, 20.05% gc time)
1.044857504308614e-15
Julia fast
1.246285 seconds (47.69 k allocations: 1.697 GiB, 5.96% gc time)
1.044857504308614e-15
Fortran
2.395273 seconds (1 allocation: 1.766 KiB)
Julia
2.656693 seconds (64.66 k allocations: 11.809 GiB, 20.62% gc time)
1.044857504308614e-15
Julia fast
1.295450 seconds (47.69 k allocations: 1.697 GiB, 5.82% gc time)
1.044857504308614e-15
Fortran
2.459751 seconds (1 allocation: 1.766 KiB)
Julia
2.720672 seconds (64.66 k allocations: 11.809 GiB, 20.95% gc time)
1.044857504308614e-15
Julia fast
1.325351 seconds (47.69 k allocations: 1.697 GiB, 5.51% gc time)
1.044857504308614e-15
Fortran
2.566647 seconds (1 allocation: 1.766 KiB)
Julia
2.697798 seconds (64.66 k allocations: 11.809 GiB, 19.86% gc time)
1.044857504308614e-15
Julia fast
1.350757 seconds (47.69 k allocations: 1.697 GiB, 5.59% gc time)
1.044857504308614e-15
Fortran
2.537515 seconds (1 allocation: 1.766 KiB)
Julia
2.729953 seconds (64.66 k allocations: 11.809 GiB, 20.15% gc time)
1.044857504308614e-15
Julia fast
1.354522 seconds (47.69 k allocations: 1.697 GiB, 5.72% gc time)
1.044857504308614e-15
Fortran
2.708372 seconds (1 allocation: 1.766 KiB)
Julia
2.747928 seconds (64.66 k allocations: 11.809 GiB, 20.74% gc time)
1.044857504308614e-15
Julia fast
1.294631 seconds (47.69 k allocations: 1.697 GiB, 5.62% gc time)
1.044857504308614e-15
Fortran
2.599797 seconds (1 allocation: 1.766 KiB)
Julia
2.672015 seconds (64.66 k allocations: 11.809 GiB, 19.80% gc time)
1.044857504308614e-15
Julia fast
1.272056 seconds (47.69 k allocations: 1.697 GiB, 5.67% gc time)
1.044857504308614e-15
Fortran
2.603039 seconds (1 allocation: 1.766 KiB)
Julia
2.708823 seconds (64.66 k allocations: 11.809 GiB, 20.01% gc time)
1.044857504308614e-15
Julia fast
1.358685 seconds (47.69 k allocations: 1.697 GiB, 5.47% gc time)
1.044857504308614e-15

となりました。FortranとJuliaコードの値が倍精度実数の範囲で一致していることがわかります。

また、@.をつけた新しいJuliaコードはgfortran+MKLに大きく勝っていることがわかります。


まとめ

配列同士の足し算をするときは@.をつけよう