Introduction
My GPU compute code needs to call Thrust::sort every time step. Timing inside the program showed that this part accounted for 30-50% of the total run time, which was very inefficient.
The array being sorted is large, on the order of 10M elements, amounting to several GB of memory. A plain thrust::sort allocates and deallocates its temporary buffer on every call, so I suspected that this is where the time is lost when the buffers are that large.
A quick search showed that this issue has been pointed out before, and that a cached allocator, which reuses the cached memory region after the first allocation, seems to give a measurable benefit.
So I checked how much of a speed-up it actually gives.
In addition, my own code is written in Fortran and calls CUDA C through bind(c) (see reference); since getting that to work took some effort, I also record the code that finally ran without errors.
References
- CUDA で一時領域の確保・破棄を回避して速度低下を防ぐ
- Thrust/CUDA tip: reuse temporary buffer across multiple transforms
- Sample code provided by NVIDIA
Environment
OS: Ubuntu 22.04 (WSL2)
Compiler: nvfortran 2024 / 24.3
GPU: RTX 4060 Ti, 12 GB memory
Code and results (CUDA C)
What the code does
- Define the cached allocator struct.
- Create the array to be sorted on the host.
- Sort 5 times without the cached allocator and take the average time of the sort alone (one untimed sort is done first, so that the memory layout is already in a favorable state).
- Sort 5 times with the cached allocator and take the average time of the sort alone (again one untimed sort is done first, so that the allocation and the memory layout are already in place).
The element type being sorted is the struct used in my own program, with the sort key stored as a struct member, but apart from the first argument to thrust::sort and the definition of the cached allocator this is ordinary thrust::sort usage, so please refer to other sites as needed.
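For orientation: the only difference at the call site is the execution policy passed as the first argument; everything else is ordinary thrust::sort. Roughly like this (d_first, d_last, comp and alloc are placeholders here, not names from the listing below):
// Plain call: Thrust allocates and frees its temporary buffer internally on every call.
thrust::sort(thrust::device, d_first, d_last, comp);
// Cached call: temporary-buffer requests are routed to alloc (a cached_allocator
// instance as defined below), so the buffer can be reused across calls.
thrust::sort(thrust::cuda::par(alloc), d_first, d_last, comp);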
Code
#include <thrust/system/cuda/vector.h>
#include <thrust/host_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/generate.h>
#include <thrust/sort.h>
#include <thrust/pair.h>
#include <thrust/execution_policy.h> // for thrust::device
#include <cstdlib>
#include <iostream>
#include <sstream>
#include <map>
#include <cassert>
// This example demonstrates how to intercept calls to get_temporary_buffer
// and return_temporary_buffer to control how Thrust allocates temporary storage
// during algorithms such as thrust::sort. The idea will be to create a simple
// cache of allocations to search when temporary storage is requested. If a hit
// is found in the cache, we quickly return the cached allocation instead of
// resorting to the more expensive thrust::system::cuda::malloc.
//
// Note: this implementation cached_allocator is not thread-safe. If multiple
// (host) threads use the same cached_allocator then they should gain exclusive
// access to the allocator before accessing its methods.
// cached_allocator: a simple allocator for caching allocation requests
struct not_my_pointer
{
not_my_pointer(void *p)
: message()
{
std::stringstream s;
s << "Pointer `" << p << "` was not allocated by this allocator.";
message = s.str();
}
virtual ~not_my_pointer() {}
virtual const char *what() const
{
return message.c_str();
}
private:
std::string message;
};
// A simple allocator for caching cudaMalloc allocations.
struct cached_allocator
{
typedef char value_type;
cached_allocator() { std::cout << "Allocator created.\n"; }
~cached_allocator() { free_all(); }
char *allocate(std::ptrdiff_t num_bytes)
{
std::cout << "cached_allocator::allocate(): num_bytes == "
<< num_bytes
<< std::endl;
char *result = 0;
// Search the cache for a free block.
free_blocks_type::iterator free_block = free_blocks.find(num_bytes);
if (free_block != free_blocks.end())
{
std::cout << "cached_allocator::allocate(): found a free block"
<< std::endl;
result = free_block->second;
// Erase from the `free_blocks` map.
free_blocks.erase(free_block);
}
else
{
// No allocation of the right size exists, so create a new one with
// `thrust::cuda::malloc`.
try
{
std::cout << "cached_allocator::allocate(): allocating new block"
<< std::endl;
// Allocate memory and convert the resulting `thrust::cuda::pointer` to
// a raw pointer.
result = thrust::cuda::malloc<char>(num_bytes).get();
}
catch (std::runtime_error &)
{
throw;
}
}
// Insert the allocated pointer into the `allocated_blocks` map.
allocated_blocks.insert(std::make_pair(result, num_bytes));
return result;
}
void deallocate(char *ptr, size_t)
{
std::cout << "cached_allocator::deallocate(): ptr == "
<< reinterpret_cast<void *>(ptr) << std::endl;
// Erase the allocated block from the allocated blocks map.
allocated_blocks_type::iterator iter = allocated_blocks.find(ptr);
if (iter == allocated_blocks.end())
throw not_my_pointer(reinterpret_cast<void *>(ptr));
std::ptrdiff_t num_bytes = iter->second;
allocated_blocks.erase(iter);
// Insert the block into the free blocks map.
free_blocks.insert(std::make_pair(num_bytes, ptr));
}
void free_all()
{
std::cout << "cached_allocator::free_all()" << std::endl;
// Deallocate all outstanding blocks in both lists.
for (free_blocks_type::iterator i = free_blocks.begin(); i != free_blocks.end(); ++i)
{
// Transform the pointer to cuda::pointer before calling cuda::free.
thrust::cuda::free(thrust::cuda::pointer<char>(i->second));
}
for (allocated_blocks_type::iterator i = allocated_blocks.begin(); i != allocated_blocks.end(); ++i)
{
// Transform the pointer to cuda::pointer before calling cuda::free.
thrust::cuda::free(thrust::cuda::pointer<char>(i->first));
}
// Clear the lists
free_blocks.clear();
allocated_blocks.clear();
}
typedef std::multimap<std::ptrdiff_t, char *> free_blocks_type;
typedef std::map<char *, std::ptrdiff_t> allocated_blocks_type;
free_blocks_type free_blocks;
allocated_blocks_type allocated_blocks;
};
struct dim
{
double x;
double y;
double z;
};
struct part
{
dim pos;
dim vel;
double E_rot;
int tag;
int loc;
};
// Comparison functor: order parts by their loc member.
struct compare_by_elementloc
{
__host__ __device__ bool operator()(const part &a, const part &b)
{
return a.loc < b.loc;
}
};
int main()
{
const int N = 100;
int num_trials = 5;
thrust::host_vector<part> h_parts(N);
// Initialize the host array.
for (int i = 0; i < N; ++i)
{
h_parts[i].pos.x = (i + 1) * 1.0;
h_parts[i].pos.y = (i + 1) * 2.0;
h_parts[i].pos.z = (i + 1) * 3.0;
h_parts[i].vel.x = (i + 1) * 0.1;
h_parts[i].vel.y = (i + 1) * 0.2;
h_parts[i].vel.z = (i + 1) * 0.3;
h_parts[i].E_rot = (i + 1) * 5.0;
h_parts[i].tag = i + 1;
h_parts[i].loc = N - i; // set loc in reverse order
}
thrust::system::cuda::vector<part> d_parts = h_parts;
cached_allocator alloc;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
float sum_milliseconds = 0;
float milliseconds = 0;
thrust::copy(h_parts.begin(), h_parts.end(), d_parts.begin());
thrust::sort(thrust::device, d_parts.begin(), d_parts.end(), compare_by_elementloc());
for (std::size_t i = 0; i < num_trials; ++i)
{
thrust::copy(h_parts.begin(), h_parts.end(), d_parts.begin());
cudaEventRecord(start);
thrust::sort(thrust::device, d_parts.begin(), d_parts.end(), compare_by_elementloc());
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&milliseconds, start, stop);
sum_milliseconds += milliseconds;
}
std::cout << "Without cache, Average time: " << sum_milliseconds / num_trials << " ms\n";
sum_milliseconds = 0; // reset the accumulator before timing the cached-allocator runs
thrust::copy(h_parts.begin(), h_parts.end(), d_parts.begin());
thrust::sort(thrust::cuda::par(alloc), d_parts.begin(), d_parts.end(), compare_by_elementloc());
for (std::size_t i = 0; i < num_trials; ++i)
{
thrust::copy(h_parts.begin(), h_parts.end(), d_parts.begin());
cudaEventRecord(start);
// Pass alloc through cuda::par as the first parameter to sort
// to cause allocations to be handled by alloc during sort.
thrust::sort(thrust::cuda::par(alloc), d_parts.begin(), d_parts.end(), compare_by_elementloc());
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&milliseconds, start, stop);
sum_milliseconds += milliseconds;
}
std::cout << "With cache, Average time: " << sum_milliseconds / num_trials << " ms\n";
cudaEventDestroy(start);
cudaEventDestroy(stop);
// Print the sorted result (optional).
if (N <= 100)
{
thrust::copy(d_parts.begin(), d_parts.end(), h_parts.begin());
for (int i = 0; i < N; ++i)
{
part p = h_parts[i];
std::cout << "Part " << i << ": Position(" << p.pos.x << ", " << p.pos.y << ", " << p.pos.z
<< "), Velocity(" << p.vel.x << ", " << p.vel.y << ", " << p.vel.z
<< "), E_rot(" << p.E_rot << "), Tag(" << p.tag << ")\n";
}
}
return 0;
}
Compile command
nvcc -arch=sm_80 -o repetitive_sort_test repetitive_sort_test.cu
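As an aside, the RTX 4060 Ti is compute capability 8.9, so targeting it directly should also be possible (I did not re-measure with this):
nvcc -arch=sm_89 -o repetitive_sort_test repetitive_sort_test.cu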
Execution results
(The printout of the sorted array is omitted.)
Allocator created.
Without cache, Average time: 0.101965 ms
cached_allocator::allocate(): num_bytes == 6911
cached_allocator::allocate(): allocating new block
cached_allocator::deallocate(): ptr == 0x1307001a00
cached_allocator::allocate(): num_bytes == 6911
cached_allocator::allocate(): found a free block
cached_allocator::deallocate(): ptr == 0x1307001a00
cached_allocator::allocate(): num_bytes == 6911
cached_allocator::allocate(): found a free block
cached_allocator::deallocate(): ptr == 0x1307001a00
cached_allocator::allocate(): num_bytes == 6911
cached_allocator::allocate(): found a free block
cached_allocator::deallocate(): ptr == 0x1307001a00
cached_allocator::allocate(): num_bytes == 6911
cached_allocator::allocate(): found a free block
cached_allocator::deallocate(): ptr == 0x1307001a00
cached_allocator::allocate(): num_bytes == 6911
cached_allocator::allocate(): found a free block
cached_allocator::deallocate(): ptr == 0x1307001a00
With cache, Average time: 0.239437 ms
cached_allocator::free_all()
In this case the cached version is actually slower.
The table below shows what happens as the array size grows.
| Array size N | Average time without cache (ms) | Average time with cache (ms) |
|---|---|---|
| $10^2$ | 0.102 | 0.239 |
| $10^4$ | 0.593 | 0.597 |
| $10^6$ | 12.80 | 7.70 |
- The benefit only shows up from somewhere above roughly 10k elements; for small arrays the cached allocator actually makes things slower.
- Beyond about 1M elements it clearly contributes to the speed-up.
- Note: this presumably also depends strongly on how the sort is done and on the GPU.
Code and results (calling from Fortran via bind)
What the code does
- In the .cu file, define the cached allocator struct.
- In the .cu file, define wrapper functions that run thrust::sort.
- In the calling Fortran code, bind the functions and structs.
- Create the array to be sorted on the Fortran host side.
- Sort 5 times without the cached allocator and average the time of the sort alone (one untimed sort is done first, so that the memory layout is already in a favorable state).
- Sort 5 times with the cached allocator and average the time of the sort alone (again one untimed sort is done first, so that the allocation and the memory layout are already in place).
Code
Two points to note:
- the allocator could only be handled correctly when it was explicitly allocated with new and passed around as a pointer, and
- when allocated_blocks was defined inside the struct, it could not be found when called from thrust::sort, which caused a segmentation fault.
(I am not very familiar with structs or binding, so I would be glad if someone who understands the underlying cause could explain it.)
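Conceptually, the binding boils down to an opaque-handle pattern: the allocator object lives entirely on the C++ side, and only a raw pointer ever crosses the language boundary. Stripped down to just that idea (a sketch reusing the cached_allocator, part and compare_by_elementloc types from the listing below; the function names here are placeholders):
// C++ side: hide construction and destruction behind extern "C" entry points,
// so Fortran only ever stores and passes back an opaque pointer.
extern "C" void create_handle(cached_allocator **out) { *out = new cached_allocator(); }
extern "C" void destroy_handle(cached_allocator *h) { delete h; }
extern "C" void sort_with_handle(part *d, int n, cached_allocator *h)
{
    thrust::device_ptr<part> p(d);
    thrust::sort(thrust::cuda::par(*h), p, p + n, compare_by_elementloc());
}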
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/reduce.h>
#include <thrust/functional.h>
#include <thrust/pair.h>
// #include <thrust/execution_policy.h>
#include <thrust/memory.h>
#include <thrust/system/cuda/vector.h>
#include <cstdlib>
#include <iostream>
#include <sstream>
#include <map>
#include <cassert>
struct dim
{
double x;
double y;
double z;
};
struct part
{
dim pos;
dim vel;
double E_rot;
int tag;
int loc;
};
struct compare_by_elementloc
{
__host__ __device__ bool operator()(const part &a, const part &b)
{
return a.loc < b.loc;
}
};
// cached_allocator: a simple allocator for caching allocation requests
struct not_my_pointer
{
not_my_pointer(void *p)
: message()
{
std::stringstream s;
s << "Pointer `" << p << "` was not allocated by this allocator.";
message = s.str();
}
virtual ~not_my_pointer() {}
virtual const char *what() const
{
return message.c_str();
}
private:
std::string message;
};
// If the following are defined inside the struct, they can no longer be found when thrust::sort is called (why?).
typedef std::multimap<std::ptrdiff_t, char *> free_blocks_type;
typedef std::map<char *, std::ptrdiff_t> allocated_blocks_type;
free_blocks_type free_blocks;
allocated_blocks_type allocated_blocks;
// A simple allocator for caching cudaMalloc allocations.
struct cached_allocator
{
typedef char value_type;
cached_allocator() { std::cout << "Allocator created.\n"; }
~cached_allocator() { free_all(); }
char *allocate(std::ptrdiff_t num_bytes)
{
std::cout << "cached_allocator::allocate(): num_bytes == "
<< num_bytes
<< std::endl;
char *alloc_ptr = nullptr;
// Search the cache for a free block.
free_blocks_type::iterator free_block = free_blocks.find(num_bytes);
if (free_block != free_blocks.end())
{
std::cout << "cached_allocator::allocate(): found a free block"
<< std::endl;
alloc_ptr = free_block->second;
// Erase from the `free_blocks` map.
free_blocks.erase(free_block);
}
else
{
// No allocation of the right size exists, so create a new one with
// `thrust::cuda::malloc`.
try
{
// Allocate memory and convert the resulting `thrust::cuda::pointer` to
// a raw pointer.
alloc_ptr = thrust::cuda::malloc<char>(num_bytes).get();
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess)
{
std::cerr << "CUDA Error: " << cudaGetErrorString(error) << std::endl;
return nullptr;
}
}
catch (std::runtime_error &)
{
throw;
}
}
// // debug //
std::cout << "cached_allocator::allocate(): ptr == "
<< reinterpret_cast<void *>(alloc_ptr) << std::endl;
// Insert the allocated pointer into the `allocated_blocks` map.
allocated_blocks.insert(std::make_pair(alloc_ptr, num_bytes));
return alloc_ptr;
}
void deallocate(char *ptr, size_t)
{
std::cout << "cached_allocator::deallocate(): ptr == "
<< reinterpret_cast<void *>(ptr) << std::endl;
// Erase the allocated block from the allocated blocks map.
allocated_blocks_type::iterator iter = allocated_blocks.find(ptr);
if (iter == allocated_blocks.end())
throw not_my_pointer(reinterpret_cast<void *>(ptr));
std::ptrdiff_t num_bytes = iter->second;
allocated_blocks.erase(iter);
// Insert the block into the free blocks map.
free_blocks.insert(std::make_pair(num_bytes, ptr));
}
void free_all()
{
std::cout << "cached_allocator::free_all()" << std::endl;
// Deallocate all outstanding blocks in both lists.
for (free_blocks_type::iterator i = free_blocks.begin(); i != free_blocks.end(); ++i)
{
// Transform the pointer to cuda::pointer before calling cuda::free.
thrust::cuda::free(thrust::cuda::pointer<char>(i->second));
}
for (allocated_blocks_type::iterator i = allocated_blocks.begin(); i != allocated_blocks.end(); ++i)
{
// Transform the pointer to cuda::pointer before calling cuda::free.
thrust::cuda::free(thrust::cuda::pointer<char>(i->first));
}
// Clear the lists
free_blocks.clear();
allocated_blocks.clear();
}
// The following had to be moved out of the struct (to namespace scope above).
// typedef std::multimap<std::ptrdiff_t, char *> free_blocks_type;
// typedef std::map<char *, std::ptrdiff_t> allocated_blocks_type;
// free_blocks_type free_blocks;
// allocated_blocks_type allocated_blocks;
};
extern "C"
{
void create_cached_allocator(cached_allocator** allocator)
{
if (allocator)
{
*allocator = new cached_allocator();
}
}
void delete_cached_allocator(cached_allocator* allocator)
{
if (allocator)
{
delete allocator;
}
}
void thrust_sort_part_by_element(part *parts, int n)
{
thrust::device_ptr<part> dev_values(parts);
thrust::sort(thrust::device, dev_values, dev_values + n, compare_by_elementloc());
}
void thrust_sort_parts_using_cached_allocator(part *parts, int n, cached_allocator* alloc)
{
thrust::device_ptr<part> dev_parts(parts);
thrust::sort(thrust::cuda::par(*alloc), dev_parts, dev_parts + n, compare_by_elementloc());
}
}
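No compile command was recorded for this version; something along the following lines should work, assuming the C++/Thrust part is in thrust_sort.cu and the Fortran part in main.cuf (the file names and the -c++libs flag are my assumptions, not taken from the original setup):
nvcc -c -arch=sm_80 thrust_sort.cu -o thrust_sort.o
nvfortran -cuda main.cuf thrust_sort.o -c++libs -o repetitive_sort_fortran
Here -c++libs links the C++ runtime needed by the allocator's std::map and std::string; adding -lstdc++ explicitly is an alternative.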
module thrust_module
use iso_c_binding
implicit none
type, bind(c) :: dim
real(c_double) :: x
real(c_double) :: y
real(c_double) :: z
end type dim
type, bind(c) :: part
type(dim) :: pos
type(dim) :: vel
real(c_double) :: E_rot
integer(c_int) :: tag
integer(c_int) :: loc
end type part
type, bind(c) :: cached_allocator
type(c_ptr) :: handle
end type cached_allocator
interface
subroutine create_cached_allocator(alloc) bind(c, name="create_cached_allocator")
import :: cached_allocator
type(cached_allocator), pointer :: alloc
end subroutine create_cached_allocator
subroutine delete_cached_allocator(alloc) bind(c, name="delete_cached_allocator")
import :: cached_allocator
type(cached_allocator) :: alloc
end subroutine delete_cached_allocator
subroutine dev_sort_part_by_element_key(parts, n) bind(C, name="thrust_sort_part_by_element")
use iso_c_binding
import :: part
integer(c_int), value :: n
type(part), device :: parts(n)
end subroutine dev_sort_part_by_element_key
subroutine dev_sort_parts_cachedallocator(parts, n, alloc) bind(c, name="thrust_sort_parts_using_cached_allocator")
use iso_c_binding
import :: part, cached_allocator
integer(c_int), value :: n
type(part), device :: parts(n)
type(cached_allocator), pointer :: alloc
end subroutine dev_sort_parts_cachedallocator
end interface
end module thrust_module
program main
use cudafor
use thrust_module
implicit none
type(cached_allocator), pointer :: alloc
real(c_float) :: elapsed_time
integer, parameter :: n = 100
integer :: num_trials = 5
double precision :: sum_time = 0.0d0
integer :: i
integer :: istat
type(cudaEvent) :: start_event, stop_event
! ------------------------------------------
type(part), device :: d_part_values(n)
type(part) :: part_values(n)
! ------------------------------------------
do i = 1, n
part_values(i)%pos%x = i*1.0d0
part_values(i)%pos%y = i*2.0d0
part_values(i)%pos%z = i*3.0d0
part_values(i)%vel%x = i*0.1d0
part_values(i)%vel%y = i*0.2d0
part_values(i)%vel%z = i*0.3d0
part_values(i)%E_rot = i*5.0d0
part_values(i)%tag = i
part_values(i)%loc = n - i + 1 ! set loc in reverse order
end do
istat = cudaEventCreate(start_event)
istat = cudaEventCreate(stop_event)
!--------------------------------------------------------------------------
sum_time = 0.0
d_part_values = part_values
call dev_sort_part_by_element_key(d_part_values, n) ! warm-up
do i = 1, num_trials
d_part_values = part_values
istat = cudaEventRecord(start_event, 0)
call dev_sort_part_by_element_key(d_part_values, n)
istat = cudaEventRecord(stop_event, 0)
istat = cudaEventSynchronize(stop_event)
istat = cudaEventElapsedTime(elapsed_time, start_event, stop_event)
sum_time = sum_time + elapsed_time;
end do
print *, "average calc time without buffer is ", sum_time/num_trials, " ms"
!--------------------------------------------------------------------------
call create_cached_allocator(alloc)
sum_time = 0.0
d_part_values = part_values
call dev_sort_parts_cachedallocator(d_part_values, n, alloc) ! warm-up
do i = 1, num_trials
d_part_values = part_values
istat = cudaEventRecord(start_event, 0)
call dev_sort_parts_cachedallocator(d_part_values, n, alloc)
istat = cudaEventRecord(stop_event, 0)
istat = cudaEventSynchronize(stop_event)
istat = cudaEventElapsedTime(elapsed_time, start_event, stop_event)
sum_time = sum_time + elapsed_time;
end do
print *, "average calc time with buffer is ", sum_time/num_trials, " ms"
call delete_cached_allocator(alloc)
!--------------------------------------------------------------------------
istat = cudaEventDestroy(start_event)
istat = cudaEventDestroy(stop_event)
if (n<=100) then
part_values = d_part_values
print *, '-------------------------------'
do i = 1, n
print '(A, I10, A)', 'Sorted Part', i, ':'
print '(A, I10)', ' Tag:', part_values(i)%tag
print '(A, I10)', ' Loc:', part_values(i)%loc
end do
print *, '-------------------------------'
end if
end program main
Results
average calc time without buffer is 0.1416959986090660 ms
Allocator created.
cached_allocator::allocate(): num_bytes == 6911
cached_allocator::allocate(): ptr == 0x1307001a00
cached_allocator::deallocate(): ptr == 0x1307001a00
cached_allocator::allocate(): num_bytes == 6911
cached_allocator::allocate(): found a free block
cached_allocator::allocate(): ptr == 0x1307001a00
cached_allocator::deallocate(): ptr == 0x1307001a00
cached_allocator::allocate(): num_bytes == 6911
cached_allocator::allocate(): found a free block
cached_allocator::allocate(): ptr == 0x1307001a00
cached_allocator::deallocate(): ptr == 0x1307001a00
cached_allocator::allocate(): num_bytes == 6911
cached_allocator::allocate(): found a free block
cached_allocator::allocate(): ptr == 0x1307001a00
cached_allocator::deallocate(): ptr == 0x1307001a00
cached_allocator::allocate(): num_bytes == 6911
cached_allocator::allocate(): found a free block
cached_allocator::allocate(): ptr == 0x1307001a00
cached_allocator::deallocate(): ptr == 0x1307001a00
cached_allocator::allocate(): num_bytes == 6911
cached_allocator::allocate(): found a free block
cached_allocator::allocate(): ptr == 0x1307001a00
cached_allocator::deallocate(): ptr == 0x1307001a00
average calc time with buffer is 0.1865024000406265 ms
cached_allocator::free_all()
| Array size N | Average time without cache (ms) | Average time with cache (ms) |
|---|---|---|
| $10^2$ | 0.142 | 0.187 |
| $10^4$ | 0.401 | 0.528 |
| $10^6$ | 12.78 | 7.30 |
There is a fair amount of scatter, but at 1M elements the cached allocator gives a reliable improvement here as well.
Addendum: Extending the use of the custom allocator
Addendum 1: Applying it to thrust::sort_by_key
The same approach works for sort_by_key (here stable_sort_by_key), as long as the custom allocator is passed through the first argument.
void thrust_sort_part_by_intkey_using_cached_allocator(int *keys, part *parts, int n, cached_allocator *alloc)
{
thrust::device_ptr<int> dev_keys(keys);
thrust::device_ptr<part> dev_parts(parts);
thrust::stable_sort_by_key(thrust::cuda::par(*alloc), dev_keys, dev_keys + n, dev_parts);
}
subroutine dev_sort_part_by_intkey_cachedallocator(keys, parts, n, alloc) bind(c, name="thrust_sort_part_by_intkey_using_cached_allocator")
use iso_c_binding
import :: part, cached_allocator
integer(c_int), value :: n
integer(c_int), device :: keys(n)
type(part), device :: parts(n)
type(cached_allocator), pointer :: alloc
end subroutine dev_sort_part_by_intkey_cachedallocator
Addendum 2: Handling a sort size that changes every step
If the number of elements to sort can change from one loop iteration to the next, the original code only reuses a cached block whose size matches exactly, so it keeps allocating new blocks. The version below instead checks whether a cached block at least as large as the request exists; if so, that whole block is handed out and later re-registered with its original size, which prevents the repeated allocations.
Note, however, that the first call then needs to set up a reasonably generous cache (i.e. warm up with the largest array you expect), so be careful.
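The key change is replacing free_blocks.find(num_bytes) with free_blocks.lower_bound(num_bytes), which returns the first cached block whose size is greater than or equal to the request. A tiny standalone illustration of that behaviour (the sizes are made-up values, not taken from the measurements):
#include <cstddef>
#include <cstdio>
#include <map>

int main()
{
    // Sizes of currently free cached blocks, keyed by size as in free_blocks_type.
    std::multimap<std::ptrdiff_t, char *> free_blocks;
    free_blocks.insert({70672127, nullptr}); // one ~70 MB block sitting in the cache

    std::ptrdiff_t request = 63000000; // a smaller request from a later, shorter sort

    // find() only matches the exact size, so the smaller request misses the cache...
    bool exact_hit = (free_blocks.find(request) != free_blocks.end());
    // ...while lower_bound() returns the first block at least as large as the request.
    bool reuse_hit = (free_blocks.lower_bound(request) != free_blocks.end());

    std::printf("exact hit: %d, reuse hit: %d\n", exact_hit, reuse_hit); // prints 0, 1
    return 0;
}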
// A simple allocator for caching cudaMalloc allocations.
struct cached_allocator
{
typedef char value_type;
cached_allocator() { std::cout << "Allocator created.\n"; }
~cached_allocator() { free_all(); }
char *allocate(std::ptrdiff_t num_bytes)
{
char *alloc_ptr = nullptr;
// Search the cache for a free block.
// free_blocks_type::iterator free_block = free_blocks.find(num_bytes);
// Changed here: use lower_bound instead of find.
free_blocks_type::iterator free_block = free_blocks.lower_bound(num_bytes);
// block_size is the cached block's size if a free block is found, otherwise the newly allocated size.
// (Keep track of the original block size.)
std::ptrdiff_t block_size;
if (free_block != free_blocks.end())
{
std::cout << "cached_allocator::allocate(): found a free block"
<< std::endl;
block_size = free_block->first;
alloc_ptr = free_block->second;
// Erase from the `free_blocks` map.
free_blocks.erase(free_block);
}
else
{
// No allocation of the right size exists, so create a new one with
// `thrust::cuda::malloc`.
try
{
// Allocate memory and convert the resulting `thrust::cuda::pointer` to
// a raw pointer.
alloc_ptr = thrust::cuda::malloc<char>(num_bytes).get();
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess)
{
std::cerr << "CUDA Error: " << cudaGetErrorString(error) << std::endl;
return nullptr;
}
}
catch (std::runtime_error &)
{
throw;
}
block_size = num_bytes;
std::cout << "cached_allocator::allocate(): ptr == "
<< reinterpret_cast<void *>(alloc_ptr) << std::endl;
std::cout << "cached_allocator::allocate(): num_bytes == "
<< num_bytes
<< std::endl;
}
// Insert the allocated pointer into the `allocated_blocks` map.
// If a cached block was reused, record its original (possibly larger) size.
allocated_blocks.insert(std::make_pair(alloc_ptr, block_size));
return alloc_ptr;
}
void deallocate(char *ptr, size_t)
{
std::cout << "cached_allocator::deallocate(): ptr == "
<< reinterpret_cast<void *>(ptr) << std::endl;
// Erase the allocated block from the allocated blocks map.
allocated_blocks_type::iterator iter = allocated_blocks.find(ptr);
if (iter == allocated_blocks.end())
throw not_my_pointer(reinterpret_cast<void *>(ptr));
std::ptrdiff_t num_bytes = iter->second;
allocated_blocks.erase(iter);
// Insert the block into the free blocks map.
free_blocks.insert(std::make_pair(num_bytes, ptr));
}
void free_all()
{
std::cout << "cached_allocator::free_all()" << std::endl;
// Deallocate all outstanding blocks in both lists.
for (free_blocks_type::iterator i = free_blocks.begin(); i != free_blocks.end(); ++i)
{
// Transform the pointer to cuda::pointer before calling cuda::free.
thrust::cuda::free(thrust::cuda::pointer<char>(i->second));
}
for (allocated_blocks_type::iterator i = allocated_blocks.begin(); i != allocated_blocks.end(); ++i)
{
// Transform the pointer to cuda::pointer before calling cuda::free.
thrust::cuda::free(thrust::cuda::pointer<char>(i->first));
}
// Clear the lists
free_blocks.clear();
allocated_blocks.clear();
}
};
Example run using Addenda 1 and 2
subroutine sort_part_by_intkey_using_chachedallocator
type(cached_allocator), pointer :: alloc
real(c_float) :: elapsed_time
integer, parameter :: n = 1E6
integer :: n_sorted
integer :: num_trials = 5
double precision :: sum_time = 0.0d0
integer :: i
integer :: istat
type(cudaEvent) :: start_event, stop_event
! ------------------------------------------
type(part), device :: d_part_values(n)
integer, device :: d_keys(n)
type(part) :: part_values(n)
integer :: keys(n)
! ------------------------------------------
do i = 1, n
keys(i) = n - i + 1
part_values(i)%pos%x = i*1.0d0
part_values(i)%pos%y = i*2.0d0
part_values(i)%pos%z = i*3.0d0
part_values(i)%vel%x = i*0.1d0
part_values(i)%vel%y = i*0.2d0
part_values(i)%vel%z = i*0.3d0
part_values(i)%E_rot = i*5.0d0
part_values(i)%tag = i
part_values(i)%loc = n - i + 1 ! set loc in reverse order
end do
istat = cudaEventCreate(start_event)
istat = cudaEventCreate(stop_event)
!--------------------------------------------------------------------------
sum_time = 0.0
d_keys = keys
d_part_values = part_values
call dev_sort_part_by_intkey(d_keys, d_part_values, n) ! warm-up
do i = 1, num_trials
d_keys = keys
d_part_values = part_values
istat = cudaEventRecord(start_event, 0)
call dev_sort_part_by_intkey(d_keys, d_part_values, n)
istat = cudaEventRecord(stop_event, 0)
istat = cudaEventSynchronize(stop_event)
istat = cudaEventElapsedTime(elapsed_time, start_event, stop_event)
sum_time = sum_time + elapsed_time;
end do
print *, "average calc time without buffer is ", sum_time/num_trials, " ms"
!--------------------------------------------------------------------------
call create_cached_allocator(alloc)
sum_time = 0.0
d_keys = keys
d_part_values = part_values
call dev_sort_part_by_intkey_cachedallocator(d_keys, d_part_values, n, alloc) ! warm-up
call delete_cached_allocator(alloc)
call create_cached_allocator(alloc)
sum_time = 0.0
d_keys = keys
d_part_values = part_values
call dev_sort_part_by_intkey_cachedallocator(d_keys, d_part_values, n, alloc) ! warm-up
do i = 1, num_trials
n_sorted = n - i*mod(i,3) - n/10 ! reduce the count somewhat arbitrarily
print *, "n_sorted:", n_sorted
d_keys = keys
d_part_values = part_values
istat = cudaEventRecord(start_event, 0)
call dev_sort_part_by_intkey_cachedallocator(d_keys, d_part_values, n_sorted, alloc)
istat = cudaEventRecord(stop_event, 0)
istat = cudaEventSynchronize(stop_event)
istat = cudaEventElapsedTime(elapsed_time, start_event, stop_event)
sum_time = sum_time + elapsed_time;
end do
print *, "average calc time with buffer is ", sum_time/num_trials, " ms"
call delete_cached_allocator(alloc)
!--------------------------------------------------------------------------
istat = cudaEventDestroy(start_event)
istat = cudaEventDestroy(stop_event)
if (n<=100) then
part_values = d_part_values
print *, '-------------------------------'
do i = 1, n
print '(A, I10, A)', 'Sorted Part', i, ':'
print '(A, I10)', ' Tag:', part_values(i)%tag
end do
print *, '-------------------------------'
end if
end subroutine sort_part_by_intkey_using_chachedallocator
average calc time without buffer is 8.149798488616943 ms
Allocator created.
cached_allocator::allocate(): ptr == 0x130b400000
cached_allocator::allocate(): num_bytes == 70672127
cached_allocator::deallocate(): ptr == 0x130b400000
cached_allocator::free_all()
Allocator created.
cached_allocator::allocate(): ptr == 0x130b400000
cached_allocator::allocate(): num_bytes == 70672127
cached_allocator::deallocate(): ptr == 0x130b400000
n_sorted: 899900
cached_allocator::allocate(): found a free block
cached_allocator::deallocate(): ptr == 0x130b400000
n_sorted: 899600
cached_allocator::allocate(): found a free block
cached_allocator::deallocate(): ptr == 0x130b400000
n_sorted: 900000
cached_allocator::allocate(): found a free block
cached_allocator::deallocate(): ptr == 0x130b400000
n_sorted: 899600
cached_allocator::allocate(): found a free block
cached_allocator::deallocate(): ptr == 0x130b400000
n_sorted: 899000
cached_allocator::allocate(): found a free block
cached_allocator::deallocate(): ptr == 0x130b400000
average calc time with buffer is 2.532217550277710 ms
cached_allocator::free_all()
The measured timings are shown below.
| Array size N | Average time without cache (ms) | Average time with cache (ms) |
|---|---|---|
| $10^2$ | 0.237 | 0.273 |
| $10^4$ | 0.338 | 0.400 |
| $10^6$ | 8.15 | 2.53 |
With a large number of elements the speed-up is clear.
Incidentally, sorting with a separate key array appears to be faster than sorting on a struct member.
Closing remarks
It was good to find that the array sort, which is otherwise hard for me to speed up, can be accelerated (or at least has room to be). The size of the effect will, however, depend heavily on the environment, on what is being sorted, and on the initial ordering, so how much it helps in the actual code remains to be seen. Also, I have not checked what happens when GPU memory is tight, for example whether the cache simply cannot be allocated and the run fails, so keep that in mind.