More than 1 year has passed since last update.

Juliaのコードの最適化：Fortranとの比較

Posted at 2022-12-26

Fortranにおける配列のダブルバッファリングのうまい使い方を調査するに触発されまして、Juliaでどうなるのかを見てみることにしました。

Fortranコード

まず、記事にあるFortranのコードを再現し、手元で計算を実行してみます。
使ったFortranコードは以下のようになります。まず、モジュールとして、

module test
    use,intrinsic :: iso_fortran_env
    implicit none

    

    contains 


    subroutine array()
        
        implicit none
        integer(int32)::Nx,Lx,t_end,Nt
        real(real64)::dx,U_conv,Co,dt,PI
        real(real64), allocatable :: f(:, :)
        integer(int32) :: curr, next

        Lx = 5
        Nx = 10001 !!10001
        dx = Lx/dble(Nx - 1)
        U_conv = 1
        Co = 0.01d0
        t_end = 4
        dt = Co*dx/U_conv
        Nt = nint(t_end/dt)
        PI = 3.141592653589793238462d0
        

        curr = 0; next = 1
        allocate (f(Nx, curr:next), source=0d0)

        init: block
            integer(int32) :: i
            real(real64), allocatable :: x(:)

            x = [(dble(i - 1)*dx, i=1, Nx)]
            f(:, curr) = merge(((1d0 - cos(2d0*PI*x))/2d0)**2, 0d0, x < 1d0)
        end block init
        !write(*,*)f(:, curr) 
        !stop


        time_marching: block
            integer(int32) :: n
            real(real64) :: start_s, end_s

            call cpu_time(start_s)
            do n = 1, Nt
                f(2:Nx-1,next) = f(2:Nx-1,curr) - dt*U_conv*(f(3:Nx,curr) - f(1:Nx-2,curr))/(2d0*dx)
                call update(curr, next)
            end do
            call cpu_time(end_s)
            print *, end_s - start_s
        end block time_marching

        print *, sum(f(:,curr))*dx

        deallocate (f)    

        contains 

        subroutine update(a, b)
            implicit none
            integer(int32), intent(inout) :: a, b

            b = a
            a = 1 - a
        end subroutine update
        
    end subroutine




    subroutine nativecopy()
        use,intrinsic :: iso_fortran_env
        implicit none
        integer(int32)::Nx,Lx,t_end,Nt
        real(real64)::dx,U_conv,Co,dt,PI
        real(real64), allocatable :: f_curr(:) ! f at current time t
        real(real64), allocatable :: f_next(:) ! f at next time t+dt

        Lx = 5
        Nx = 10001
        dx = Lx/dble(Nx - 1)
        U_conv = 1
        Co = 0.01d0
        t_end = 4
        dt = Co*dx/U_conv
        Nt = nint(t_end/dt)
        PI = 3.141592653589793238462d0






        init: block
            integer(int32) :: i
            real(real64), allocatable :: x(:)

            x = [(dble(i - 1)*dx, i=1, Nx)]
            f_curr = merge(((1d0 - cos(2d0*PI*x))/2d0)**2, 0d0, x < 1d0)
            allocate (f_next, source=f_curr)
        end block init

        time_marching: block
            integer(int32) :: n
            real(real64) :: start_s, end_s

            call cpu_time(start_s)
            do n = 1, Nt
                f_next(2:Nx-1) = f_curr(2:Nx-1) - dt*U_conv*(f_curr(3:Nx) - f_curr(1:Nx-2))/(2d0*dx) !&
                f_curr(:) = f_next(:)
            end do
            call cpu_time(end_s)
            print *, end_s - start_s
        end block time_marching
        print *, sum(f_curr)*dx

        deallocate (f_curr)
        deallocate (f_next)
    end 

    subroutine nativecopy_subroutine()
        use,intrinsic :: iso_fortran_env
        implicit none
        integer(int32)::Nx,Lx,t_end,Nt
        real(real64)::dx,U_conv,Co,dt,PI
        real(real64), allocatable :: f_curr(:) ! f at current time t
        real(real64), allocatable :: f_next(:) ! f at next time t+dt

        Lx = 5
        Nx = 10001
        dx = Lx/dble(Nx - 1)
        U_conv = 1
        Co = 0.01d0
        t_end = 4
        dt = Co*dx/U_conv
        Nt = nint(t_end/dt)
        PI = atan(1d0)*4d0!3.141592653589793238462d0

        init: block
            integer(int32) :: i
            real(real64), allocatable :: x(:)

            x = [(dble(i - 1)*dx, i=1, Nx)]
            f_curr = merge(((1d0 - cos(2d0*PI*x))/2d0)**2, 0d0, x < 1d0)
            allocate (f_next, source=f_curr)
        end block init


        time_marching: block
            integer(int32) :: n
            real(real64) :: start_s, end_s

            call cpu_time(start_s)
            do n = 1, Nt
                call integrate_convection_equation(f_curr, f_next )
                !f_next(2:Nx-1) = f_curr(2:Nx-1) - dt*U_conv*(f_curr(3:Nx) - f_curr(1:Nx-2))/(2d0*dx) !&
                f_curr(:) = f_next(:)
                !print *, sum(f_curr),n
            end do
            call cpu_time(end_s)
            print *, end_s - start_s
        end block time_marching
        print *, sum(f_curr)*dx

        deallocate (f_curr)
        deallocate (f_next)
    end 


    subroutine array_subroutine()
        implicit none
        integer(int32)::Nx,Lx,t_end,Nt
        real(real64)::dx,U_conv,Co,dt,PI
        real(real64), allocatable :: f(:, :)
        integer(int32) :: curr, next

        Lx = 5
        Nx = 10001
        dx = Lx/dble(Nx - 1)
        U_conv = 1
        Co = 0.01d0
        t_end = 4
        dt = Co*dx/U_conv
        Nt = nint(t_end/dt)
        PI = 3.141592653589793238462d0
        

        curr = 0; next = 1
        allocate (f(Nx, curr:next), source=0d0)

        init: block
            integer(int32) :: i
            real(real64), allocatable :: x(:)

            x = [(dble(i - 1)*dx, i=1, Nx)]
            f(:, curr) = merge(((1d0 - cos(2d0*PI*x))/2d0)**2, 0d0, x < 1d0)
        end block init

        time_marching: block
            integer(int32) :: n
            real(real64) :: start_s, end_s

            call cpu_time(start_s)
            do n = 1, Nt
                call integrate_convection_equation(f(:,curr), f(:,next) )
                !f(2:Nx-1,next) = f(2:Nx-1,curr) - dt*U_conv*(f(3:Nx,curr) - f(1:Nx-2,curr))/(2d0*dx)
                call update(curr, next)
            end do
            call cpu_time(end_s)
            print *, end_s - start_s
        end block time_marching
        print *, sum(f(:,curr))*dx

        deallocate (f)    

        contains 

        subroutine update(a, b)
            implicit none
            integer(int32), intent(inout) :: a, b

            b = a
            a = 1 - a
        end subroutine update
        
    end subroutine




    subroutine integrate_convection_equation(f_curr, f_next)
        implicit none
        integer(int32)::Nx,Lx,t_end,Nt
        real(real64)::dx,U_conv,Co,dt,PI
        real(real64), intent(in) :: f_curr(:)
        real(real64), intent(inout) :: f_next(:)

        Lx = 5
        Nx = 10001
        dx = Lx/dble(Nx - 1)
        U_conv = 1
        Co = 0.01d0
        t_end = 4
        dt = Co*dx/U_conv
        Nt = nint(t_end/dt)
        PI = 3.141592653589793238462d0
        f_next(2:Nx-1) = f_curr(2:Nx-1) - dt*U_conv*(f_curr(3:Nx) - f_curr(1:Nx-2))/(2d0*dx) !&
    end subroutine integrate_convection_equation

end module

を定義しておき、

program main
    use test
    implicit none

    write(*,*) "copy"
    call nativecopy()
    write(*,*) "copy with subroutine"
    call nativecopy_subroutine()
    write(*,*) "array"
    call array()
    write(*,*) "array with subroutine"
    call array_subroutine()
    
end program

として、計算速度を比べてみます。コンパイラはgfortranとし、記事と同じコンパイルオプション

-O3 -Wimplicit-interface -fPIC -fmax-errors=1 -funroll-loops -fcoarray=single

をつけました。実行結果は

 copy
   3.4856549999999999     
  0.52432147453468680     
 copy with subroutine
   3.4232230000000001     
  0.52432147453468680     
 array
   3.6973959999999995     
  0.52432147453468680     
 array with subroutine
   2.0372640000000004     
  0.52432147453468680

となりました。記事では2次元の計算でしたが、こちらは1次元です。記事と同じく、arrayでサブルーチンを使うものが2秒と最速です。

Juliaのコード

Julia 1.8.1を用いて、同じことを行ってみます。

シンプルコード

まず、一番シンプルなコードは

function copyarray_00()
    Lx = 5
    Nx = 10001
    dx = Lx/(Nx - 1)
    U_conv = 1
    Co = 0.01
    t_end = 4
    dt = Co*dx/U_conv
    Nt = Int64(round(t_end/dt, RoundNearestTiesUp))
    #println(Nt)

    x = Float64[(i - 1)*dx  for i=1:Nx]
    f_curr = map(k -> ifelse(k < 1,((1 - cos(2*pi*k))/2)^2,0.0),x)

    f_next = deepcopy(f_curr)

    @time for n = 1:Nt
        for k=2:Nx-1
            f_next[k] = f_curr[k] - dt*U_conv*(f_curr[k+1] - f_curr[k-1])/(2*dx) 
        end
        for k=1:Nx
            f_curr[k] = f_next[k]
        end
       
    end 
    println(sum(f_curr)*dx)
end

です。for文でシンプルにまとめてみました。
これを

function main()
    copyarray_00()
    copyarray_00()
end
main()

と2回実行してみると、

  8.206216 seconds
0.5208227304964476
  8.303908 seconds
0.5208227304964476

となります。値がFortranと少しだけずれていますが、Nxを小さくした時には一致しましたので、丸め誤差の違いではないかと推測しています。時間は悪くはないですが、2-3倍くらいFortranより遅いです。

一部関数化

次に、Juliaは関数にすると最適化が効いて速くなるということを信じて、

function integrate_convection_equation!(Nx,dt,dx,U_conv,f_curr, f_next)
    for k=2:Nx-1
        f_next[k] = f_curr[k] - dt*U_conv*(f_curr[k+1] - f_curr[k-1])/(2*dx) 
    end
end

function copyarray_0()
    Lx = 5
    Nx = 10001
    dx = Lx/(Nx - 1)
    U_conv = 1
    Co = 0.01
    t_end = 4
    dt = Co*dx/U_conv
    Nt = Int64(round(t_end/dt, RoundNearestTiesUp))
    #println(Nt)

    x = Float64[(i - 1)*dx  for i=1:Nx]
    f_curr = map(k -> ifelse(k < 1,((1 - cos(2*pi*k))/2)^2,0.0),x)

    f_next = deepcopy(f_curr)

    @time for n = 1:Nt
        integrate_convection_equation!(Nx,dt,dx,U_conv,f_curr, f_next)
        for k=1:Nx
            f_curr[k] = f_next[k]
        end
    end 
    
    println(sum(f_curr)*dx)
   
end

と一箇所を関数にしてみました。

function main()
    copyarray_0()
    copyarray_0()
end

main()

を実行してみると、

  8.120514 seconds
0.5208227304964476
  8.138643 seconds
0.5208227304964476

となり、少しだけ速くなりました。でも、これではまだ遅いですね。

さらに関数化

もう少し関数にまとめてみます。nに関するループの部分も関数にしてみます。


function timeevolve!(Nt,Nx,dt,dx,U_conv,f_curr,f_next)
    for n = 1:Nt
        integrate_convection_equation!(Nx,dt,dx,U_conv,f_curr, f_next)
        for k=1:Nx
            f_curr[k] = f_next[k]
        end
    end 
end

function copyarray()
    Lx = 5
    Nx = 10001
    dx = Lx/(Nx - 1)
    U_conv = 1
    Co = 0.01
    t_end = 4
    dt = Co*dx/U_conv
    Nt = Int64(round(t_end/dt, RoundNearestTiesUp))
    #println(Nt)

    x = Float64[(i - 1)*dx  for i=1:Nx]
    f_curr = map(k -> ifelse(k < 1,((1 - cos(2*pi*k))/2)^2,0.0),x)

    f_next = deepcopy(f_curr)
    @time timeevolve!(Nt,Nx,dt,dx,U_conv,f_curr,f_next)
    println(sum(f_curr)*dx)
end

これを

function main()
    copyarray()
    copyarray()
end

main()

で実行すると、

  3.198903 seconds
0.5208227304964476
  3.163727 seconds
0.5208227304964476

となります。Fortranのコードと同じ程度になりました！

arrayを使用する

次に、Fortranと同じようにarrayを使用してみます。

function integrate_convection_equation_array!(Nx,dt,dx,U_conv,f,curr,next)
    for k=2:Nx-1
        f[k,next+1] = f[k,curr+1] - dt*U_conv*(f[k+1,curr+1] - f[k-1,curr+1])/(2*dx) 
    end
end

function timeevolve_array!(Nt,Nx,dt,dx,U_conv,f)
    curr = 0
    next = 1
    for n = 1:Nt
        integrate_convection_equation_array!(Nx,dt,dx,U_conv,f,curr,next)
        curr,next = next,curr
    end 
    return curr
end

function array()
    Lx = 5
    Nx = 10001
    dx = Lx/(Nx - 1)
    U_conv = 1
    Co = 0.01
    t_end = 4
    dt = Co*dx/U_conv
    Nt = Int64(round(t_end/dt, RoundNearestTiesUp))
    #println(Nt)

    x = Float64[(i - 1)*dx  for i=1:Nx]
    f = zeros(Float64,Nx,2)
    curr = 0
    next = 1
    f[:,curr+1] = map(k -> ifelse(k < 1,((1 - cos(2*pi*k))/2)^2,0.0),x)
    f[:,next+1] = deepcopy(f[:,curr+1])

    @time curr = timeevolve_array!(Nt,Nx,dt,dx,U_conv,f)
    println(sum(@view(f[:,curr+1]))*dx)
end

これを

function main()
    array()
    array()
end

main()

で実行してみると、

  2.230833 seconds
0.5208227304964476
  2.242601 seconds
0.5208227304964476

となりました。Fortranの現状最速のコードとほぼ同じ速度になりました！

まとめ

Fortranで使える高速化の技法はJuliaでも使えるようです。Juliaの場合、適当に部品のようにfunctionを作っていくと、Fortranに匹敵する速度になるようです。

なお、ここに書いたコードは一番速そうなコードですが、メモリーコピーをするような形、例えば、

f_next[2:Nx-1] = f_curr[2:Nx-1] - dt*U_conv*(f_curr[3:Nx] - f_curr[1:Nx-2])/(2*dx)

などとすると異常に遅くなることがわかります。これは、右辺で[2:Nx-1]のようにスライスを取ると配列のコピーを作成するからです。

もしかするともっとうまくやれば、ここに書いたコードよりも速くすることも可能かもしれません。

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up