More than 1 year has passed since last update.

Cuda FortranでThrust (reduce, stable_sort_by_key)

Posted at 2024-05-15

0. はじめに

もともと所有するFortranコードをGPGPU移植するにあたって、Cuda Fortranが最も移植コストが少ないと考えた。キー配列を用いた配列の並べ替えやReduction（総和）の計算が必要になる計算コードで、CUDA CのThrustで実装できることはわかっていたが、Cuda FortranでThrustはまだサポートされていない。
CUDA Cで書いたコードをCUDA Fortranカーネルから呼び出すことで実装を行った。

環境

OS： Ubuntu 22.4
コンパイラ： nvfortran 2024 / 24.3
GPU: RTX 1660 Super

1. 実装

はじめにchatGPT(2024年5月15日時点のGPT-4o)に聞いて出てきたコードが以下

質問：
thrust::stable_sort_by_keyをCudaFortranで使うための方法を教えてください。なお、コンパイラはNVIDIA SDKを使用します。

thrust_sort.cu

#include <thrust/sort.h>
#include <thrust/device_vector.h>

extern "C" void sort_by_key(int* keys, int* values, int size) {
    thrust::device_vector<int> d_keys(keys, keys + size);
    thrust::device_vector<int> d_values(values, values + size);

    thrust::stable_sort_by_key(d_keys.begin(), d_keys.end(), d_values.begin());

    thrust::copy(d_keys.begin(), d_keys.end(), keys);
    thrust::copy(d_values.begin(), d_values.end(), values);
}

thrust_sort.F90

module thrust_sort
  use iso_c_binding
  implicit none

  interface
    subroutine sort_by_key(keys, values, size) bind(C, name="sort_by_key")
      use iso_c_binding
      integer(c_int), dimension(*), intent(inout) :: keys
      integer(c_int), dimension(*), intent(inout) :: values
      integer(c_int), intent(in) :: size
    end subroutine sort_by_key
  end interface

contains

  subroutine sort_keys_and_values(keys, values, size)
    use iso_c_binding
    integer(c_int), dimension(*), intent(inout) :: keys
    integer(c_int), dimension(*), intent(inout) :: values
    integer(c_int), intent(in) :: size

    call sort_by_key(keys, values, size)
  end subroutine sort_keys_and_values

end module thrust_sort

text.f90

program test_sort
  use thrust_sort
  implicit none

  integer(c_int), dimension(5) :: keys = [3, 1, 4, 1, 5]
  integer(c_int), dimension(5) :: values = [9, 2, 8, 7, 6]

  call sort_keys_and_values(keys, values, 5)

  print *, 'Sorted keys: ', keys
  print *, 'Sorted values: ', values
end program test_sort

nvcc -c thrust_sort.cu -o thrust_sort.o
nvfortran thrust_sort.F90 test_sort.F90 thrust_sort.o -o test_sort_app

かなり優秀にコードを出力できたとは思うが、コンパイルは通らなかった。

なお、そもそもの連携法について解説はこちら (https://www.slideshare.net/ssuserf87701/pgi-cuda-fortrangpu) などが詳しい。

コンパイルに関して修正

最初に出てきたエラーは「ライブラリへのリンクができていない」というものだった。試行錯誤するうちにnvfortranのオプションにc++ライブラリへのリンクをつけること、PTX（Parallel Thread Execution）のバージョンが実行時とコンパイル時でことなることが問題であるということがわかってきた

（参考サイトを忘れてしまいました。。。）

2. 最終コード

ソートや総和計算は型が違う場合それぞれ定義する必要がるので、bool,int,float,doubleの4パターンで用意した。

CUDA Cコード

thrust_sort.cu

#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/reduce.h>
#include <thrust/functional.h>
// #include <thrust/execution_policy.h>

extern "C"
{
    // ######################################### //
    // ############### Reduction ############### //
    // ######################################### //

    // Reduce for bool array (true =1, false = 0)
    int thrust_reduce_bool(bool *d_array, int N)
    {
        thrust::device_ptr<bool> d_ptr(d_array);
        int result = thrust::reduce(d_ptr, d_ptr + N, 0, thrust::plus<int>());
        return result;
    }
    // Reduce for int array
    int thrust_reduce_int(int *d_array, int N)
    {
        thrust::device_ptr<int> d_ptr(d_array);
        int result = thrust::reduce(d_ptr, d_ptr + N, 0, thrust::plus<int>());
        return result;
    }
    // Reduce for float array
    float thrust_reduce_float(float *d_array, int N)
    {
        thrust::device_ptr<float> d_ptr(d_array);
        float result = thrust::reduce(d_ptr, d_ptr + N, 0.0f, thrust::plus<float>());
        return result;
    }
    // Reduce for double array
    double thrust_reduce_double(double *d_array, int N)
    {
        thrust::device_ptr<double> d_ptr(d_array);
        double result = thrust::reduce(d_ptr, d_ptr + N, 0.0, thrust::plus<double>());
        return result;
    }

    // ######################################### //
    // ############## SORT_BY_KEY ############## //
    // ######################################### //

    // Sort int array by int key array
    void thrust_sort_int_by_intkey(int *keys, int *values, int n)
    {
        thrust::device_ptr<int> dev_keys(keys);
        thrust::device_ptr<int> dev_values(values);
        thrust::stable_sort_by_key(dev_keys, dev_keys + n, dev_values);
    }
    // Sort float array by int key array
    void thrust_sort_float_by_intkey(int *keys, float *values, int n)
    {
        thrust::device_ptr<int> dev_keys(keys);
        thrust::device_ptr<float> dev_values(values);
        thrust::stable_sort_by_key(dev_keys, dev_keys + n, dev_values);
    }
    // Sort double array by bool key array
    void thrust_sort_double_by_intkey(int *keys, double *values, int n)
    {
        thrust::device_ptr<int> dev_keys(keys);
        thrust::device_ptr<double> dev_values(values);
        thrust::stable_sort_by_key(dev_keys, dev_keys + n, dev_values);
    }
    // Sort int array by bool key array
    void thrust_sort_int_by_boolkey(bool *keys, int *values, int n)
    {
        thrust::device_ptr<bool> dev_keys(keys);
        thrust::device_ptr<int> dev_values(values);
        thrust::stable_sort_by_key(dev_keys, dev_keys + n, dev_values);
    }
    // Sort float array by bool key array
    void thrust_sort_float_by_boolkey(bool *keys, float *values, int n)
    {
        thrust::device_ptr<bool> dev_keys(keys);
        thrust::device_ptr<float> dev_values(values);
        thrust::stable_sort_by_key(dev_keys, dev_keys + n, dev_values);
    }
    // Sort double array by bool key array
    void thrust_sort_double_by_boolkey(bool *keys, double *values, int n)
    {
        thrust::device_ptr<bool> dev_keys(keys);
        thrust::device_ptr<double> dev_values(values);
        thrust::stable_sort_by_key(dev_keys, dev_keys + n, dev_values);
    }
}

Fortranで呼び出すためのモジュール

thrust_module.f90

module thrust_module
    use iso_c_binding
    implicit none
    interface

        ! ################ REDUCTION ################# !
        function dev_reduce_bool(values, n) bind(C, name="thrust_reduce_bool") result(result)
            use iso_c_binding
            implicit none
            logical(c_bool), device :: values(*)
            integer(c_int), value :: n
            integer(c_int) :: result
        end function dev_reduce_bool

        function dev_reduce_int(values, n) bind(C, name="thrust_reduce_int") result(result)
            use iso_c_binding
            implicit none
            integer(c_int), value :: n
            integer(c_int), device :: values(*)
            integer(c_int) :: result
        end function dev_reduce_int

        function dev_reduce_float(values, n) bind(C, name="thrust_reduce_float") result(result)
            use iso_c_binding
            integer(c_int), value :: n
            real(c_float), device :: values(*)
            real(c_float) :: result
        end function dev_reduce_float

        function dev_reduce_double(values, n) bind(C, name="thrust_reduce_double") result(result)
            use iso_c_binding
            integer(c_int), value :: n
            double precision(c_double), device :: values(*)
            double precision(c_double) :: result
        end function dev_reduce_double
        ! ################ REDUCTION ################# !

        ! ################ SORT BY KEY ################### !
        subroutine dev_sort_int_by_intkey(keys, values, n) bind(C, name="thrust_sort_int_by_intkey")
            use iso_c_binding
            implicit none
            integer(c_int), value :: n
            integer(c_int), device :: keys(*)
            integer(c_int), device :: values(*)
        end subroutine dev_sort_int_by_intkey

        subroutine dev_sort_float_by_intkey(keys, values, n) bind(C, name="thrust_sort_float_by_intkey")
            use iso_c_binding
            implicit none
            integer(c_int), value :: n
            integer(c_int), device :: keys(*)
            real(c_float), device :: values(*)
        end subroutine dev_sort_float_by_intkey

        subroutine dev_sort_double_by_intkey(keys, values, n) bind(C, name="thrust_sort_double_by_intkey")
            use iso_c_binding
            implicit none
            integer(c_int), value :: n
            integer(c_int), device :: keys(*)
            double precision(c_double), device :: values(*)
        end subroutine dev_sort_double_by_intkey

        ! --------------------------------------------------

        subroutine dev_sort_int_by_boolkey(keys, values, n) bind(C, name="thrust_sort_int_by_boolkey")
            use iso_c_binding
            implicit none
            integer(c_int), value :: n
            logical(c_bool), device :: keys(*)
            integer(c_int), device :: values(*)
        end subroutine dev_sort_int_by_boolkey

        subroutine dev_sort_float_by_boolkey(keys, values, n) bind(C, name="thrust_sort_float_by_boolkey")
            use iso_c_binding
            implicit none
            integer(c_int), value :: n
            logical(c_bool), device :: keys(*)
            real(c_float), device :: values(*)
        end subroutine dev_sort_float_by_boolkey

        subroutine dev_sort_double_by_boolkey(keys, values, n) bind(C, name="thrust_sort_double_by_boolkey")
            use iso_c_binding
            implicit none
            integer(c_int), value :: n
            logical(c_bool), device :: keys(*)
            double precision(c_double), device :: values(*)
        end subroutine dev_sort_double_by_boolkey

        ! ################ SORT BY KEY ################### !

    end interface
end module thrust_module

Mainプログラム

main.f90

program main
    use cudafor
    use thrust_module
    implicit none

    integer, parameter :: n = 10
    logical(c_bool), device :: d_keys_bool(n)
    integer, device :: d_keys(n)
    integer, device :: d_int_values(n)
    real, device :: d_float_values(n)
    double precision, device :: d_double_values(n)

    logical(c_bool) :: keys_bool(n)
    integer :: keys(n)
    integer :: int_values(n)
    real :: float_values(n)
    double precision :: double_values(n)

    integer :: i
    integer :: sum_result_bool
    integer :: sum_result_int
    real :: sum_result_float
    double precision :: sum_result_double


    ! -----------bool 型キーを用いて整列する-------------

    ! 初期データを設定
    keys_bool = (/1, 1, 0, 1, 0, 1, 0, 1, 0, 1/)
    int_values = (/3, 1, 4, 1, 5, 9, 2, 6, 5, 3/)
    float_values = (/3.0, 1.0, 4.0, 1.1, 5.0, 9.0, 2.0, 6.0, 5.1, 3.1/)
    double_values = (/3.0d0, 1.0d0, 4.0d0, 1.1d0, 5.0d0, 9.0d0, 2.0d0, 6.0d0, 5.1d0, 3.1d0/)
    ! デバイスメモリにコピー
    print *, '-------------------------------'
    print *, 'Sort by Bool keys: ', keys_bool
    d_keys_bool = keys_bool
    d_int_values = int_values
    d_float_values = float_values
    d_double_values = double_values
    call dev_sort_int_by_boolkey(d_keys_bool, d_int_values, n)
    d_keys_bool = keys_bool
    call dev_sort_float_by_boolkey(d_keys_bool, d_float_values, n)
    d_keys_bool = keys_bool
    call dev_sort_double_by_boolkey(d_keys_bool, d_double_values, n)
    ! 結果をホストにコピー
    keys_bool = d_keys_bool
    keys = d_keys
    int_values = d_int_values
    float_values = d_float_values
    double_values = d_double_values
    ! 結果を表示
    print *, 'Sorted keys: ', keys_bool
    print *, 'Sorted int values: ', int_values
    print *, 'Sorted float values: ', float_values
    print *, 'Sorted double values: ', double_values
    print *, '-------------------------------'

    ! -----------int 型キーを用いて整列する-------------
    ! 初期データを設定
    keys = (/3, 1, 4, 1, 5, 9, 2, 6, 5, 3/)
    int_values = (/3, 1, 4, 1, 5, 9, 2, 6, 5, 3/)
    float_values = (/3.0, 1.0, 4.0, 1.1, 5.0, 9.0, 2.0, 6.0, 5.1, 3.1/)
    double_values = (/3.0d0, 1.0d0, 4.0d0, 1.1d0, 5.0d0, 9.0d0, 2.0d0, 6.0d0, 5.1d0, 3.1d0/)
    ! デバイスメモリにコピー
    d_keys = keys
    d_int_values = int_values
    d_float_values = float_values
    d_double_values = double_values
    ! Thrustを使ってキーによる安定ソートを実行
    print *, '-------------------------------'
    print *, 'Sort by Integer keys: ', keys
    call dev_sort_int_by_intkey(d_keys, d_int_values, n)
    d_keys = keys
    call dev_sort_float_by_intkey(d_keys, d_float_values, n)
    d_keys = keys
    call dev_sort_double_by_intkey(d_keys, d_double_values, n)
    ! 結果をホストにコピー
    keys_bool = d_keys_bool
    keys = d_keys
    int_values = d_int_values
    float_values = d_float_values
    double_values = d_double_values
    ! 結果を表示
    print *, 'Sorted keys: ', keys
    print *, 'Sorted int values: ', int_values
    print *, 'Sorted float values: ', float_values
    print *, 'Sorted double values: ', double_values
    print *, '-------------------------------'
    !-----------------------------------

    ! -------- Reduceで総和を求める----------
    sum_result_bool = dev_reduce_bool(d_keys_bool, n)
    sum_result_int = dev_reduce_int(d_int_values, n)
    sum_result_float = dev_reduce_float(d_float_values, n)
    sum_result_double = dev_reduce_double(d_double_values, n)
    print *, '-------------------------------'
    print *, "Sum(bol): ", sum_result_bool
    print *, "Sum(int): ", sum_result_int
    print *, "Sum(flt): ", sum_result_float
    print *, "Sum(dbl): ", sum_result_double
    print *, '-------------------------------'

end program main

boolでソート、intでソート、総和の計算が入っている。
ソートするたびにキー配列もソートされることに注意。

コンパイル・実行コマンド

do.sh

nvcc -c -arch=sm_70 thrust_sort.cu -o thrust_sort.o
nvfortran -cuda -gpu=cc70 thrust_sort.o thrust_module.cuf main.cuf -o main -lstdc++
./main

3. 実行結果

 -------------------------------
 Sort by Bool keys:   T  T  F  T  F  T  F  T  F  T
 Sorted keys:   F  F  F  F  T  T  T  T  T  T
 Sorted int values:             4            5            2            5 
            3            1            1            9            6            3
 Sorted float values:     4.000000        5.000000        2.000000     
    5.100000        3.000000        1.000000        1.100000     
    9.000000        6.000000        3.100000    
 Sorted double values:     4.000000000000000         5.000000000000000      
    2.000000000000000         5.100000000000000         3.000000000000000      
    1.000000000000000         1.100000000000000         9.000000000000000      
    6.000000000000000         3.100000000000000     
 -------------------------------
 -------------------------------
 Sort by Integer keys:             3            1            4            1 
            5            9            2            6            5            3
 Sorted keys:             1            1            2            3            3
             4            5            5            6            9
 Sorted int values:             1            1            2            3 
            3            4            5            5            6            9
 Sorted float values:     1.000000        1.100000        2.000000     
    3.000000        3.100000        4.000000        5.000000     
    5.100000        6.000000        9.000000    
 Sorted double values:     1.000000000000000         1.100000000000000      
    2.000000000000000         3.000000000000000         3.100000000000000      
    4.000000000000000         5.000000000000000         5.100000000000000      
    6.000000000000000         9.000000000000000     
 -------------------------------
 -------------------------------
 Sum(bol):             6
 Sum(int):            39
 Sum(flt):     39.30000    
 Sum(dbl):     39.30000000000000     
 -------------------------------

4. 参考

PGI CUDA FortranとGPU最適化ライブラリの一連携法

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up