// Streamを使ってみる(CUDA7.0, Windows, GTX980)。 [Trying out CUDA streams.]
// File: sample.cu
#include <helper_cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
// Number of CUDA streams created by cuda_exec_stream(); the work is divided by this count.
const int N_STREAMS = 10;
// Dimensions used to generate the test data.
const int width = 3840;
const int height = 2160;
// Total number of int elements in every host and device buffer.
const int size = width*height;
//---------------------------------------------
// Element-wise integer addition kernel: d_c[idx] = d_a[idx] + d_b[idx].
//
// Uses only the x dimension of the launch configuration. Each thread handles
// exactly one element, located at its flat global thread index plus `offset`.
//
//   d_a, d_b : input arrays, dereferenced on the device
//   d_c      : output array, dereferenced on the device
//   n        : exclusive upper bound on the element index (threads whose
//              index is >= n do nothing)
//   offset   : added to every thread's index, so one launch can cover the
//              sub-range [offset, n) of the arrays
//---------------------------------------------
__global__ void
kernel(int *d_a, int *d_b, int *d_c, int n, int offset = 0)
{
    const int idx = offset + threadIdx.x + blockDim.x * blockIdx.x;

    // Threads past the end of the requested range have nothing to do.
    if (idx >= n) {
        return;
    }

    d_c[idx] = d_a[idx] + d_b[idx];
}
//---------------------------------------------
// Helpers shared by cuda_exec_stream() and cuda_exec()
//---------------------------------------------
// Evaluate a CUDA runtime call; on failure print the location and the CUDA
// error string to stderr and terminate the process.
// Guarded so that an identical definition elsewhere does not clash.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t cuda_check_err_ = (call);                             \
        if (cuda_check_err_ != cudaSuccess) {                             \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(cuda_check_err_));                 \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)
#endif

// Fill the host input buffers with the deterministic test pattern and preset
// every element of h_c to `fill`, so that an element never written back from
// the device is recognisable. All three buffers must hold `size` ints.
static void init_host_buffers(int *h_a, int *h_b, int *h_c, int fill)
{
    for (int i = 0; i < height; i++) {
        for (int j = 0; j < width; j++) {
            // Rows are `width` elements long, so the row stride is `width`.
            // (Striding by `height` left the tail of the buffers uninitialised.)
            const int idx = i*width + j;
            h_a[idx] = (i + j)*height/(width+1);
            h_b[idx] = (i + j)*width/(height+1);
            h_c[idx] = fill;
        }
    }
}

//---------------------------------------------
// Streamed variant.
//
// Splits the `size` elements into N_STREAMS contiguous chunks. For each chunk
// the host->device copies of the inputs, the kernel launch and the
// device->host copy of the result are all queued on that chunk's own stream,
// so the work of different chunks may overlap.
//
// Returns a pinned host buffer of `size` ints holding a[i] + b[i]. The caller
// owns it and must release it with cudaFreeHost(). Any CUDA failure
// terminates the process (see CUDA_CHECK).
//---------------------------------------------
int* cuda_exec_stream()
{
    const size_t bytes = sizeof(int)*size;

    // Host-side data. cudaHostAlloc gives pinned memory, which asynchronous
    // copies need.
    int *h_a = NULL;
    int *h_b = NULL;
    int *h_c = NULL;
    CUDA_CHECK(cudaHostAlloc((void **)&h_a, bytes, cudaHostAllocDefault));
    CUDA_CHECK(cudaHostAlloc((void **)&h_b, bytes, cudaHostAllocDefault));
    CUDA_CHECK(cudaHostAlloc((void **)&h_c, bytes, cudaHostAllocDefault));
    init_host_buffers(h_a, h_b, h_c, -1);

    // Device-side buffers.
    int *d_a = NULL;
    int *d_b = NULL;
    int *d_c = NULL;
    CUDA_CHECK(cudaMalloc((void **)&d_a, bytes));
    CUDA_CHECK(cudaMalloc((void **)&d_b, bytes));
    CUDA_CHECK(cudaMalloc((void **)&d_c, bytes));
    // Clear the output buffer before any kernel writes to it.
    CUDA_CHECK(cudaMemset(d_c, 0, bytes));

    // Create the streams.
    cudaStream_t stream[N_STREAMS];
    for (int i = 0; i < N_STREAMS; ++i) {
        CUDA_CHECK(cudaStreamCreate(&stream[i]));
    }

    // Discard any stale error recorded before this function was entered so
    // the launch checks below only report failures of the launches themselves.
    (void)cudaGetLastError();

    // Queue copy-in / kernel / copy-out per chunk.
    const int th = 1024;
    const int chunk = size / N_STREAMS;
    for (int i = 0; i < N_STREAMS; ++i) {
        const int begin = i*chunk;
        // The last stream also takes the remainder when size is not a
        // multiple of N_STREAMS.
        const int count = (i == N_STREAMS - 1) ? (size - begin) : chunk;
        if (count == 0) {
            continue;   // a launch with zero blocks is invalid
        }
        const size_t chunk_bytes = sizeof(int)*count;

        // Transfer only this stream's slice of the inputs; copying the whole
        // arrays on every stream repeats the transfer N_STREAMS times and
        // overwrites data other streams' kernels are reading.
        CUDA_CHECK(cudaMemcpyAsync(d_a + begin, h_a + begin, chunk_bytes,
                                   cudaMemcpyHostToDevice, stream[i]));
        CUDA_CHECK(cudaMemcpyAsync(d_b + begin, h_b + begin, chunk_bytes,
                                   cudaMemcpyHostToDevice, stream[i]));

        // The kernel gets the full base pointers; `offset`/`n` select the
        // element range [begin, begin + count).
        kernel<<<(count + th - 1) / th, th, 0, stream[i]>>>(
            d_a, d_b, d_c, begin + count, begin);
        CUDA_CHECK(cudaGetLastError());

        CUDA_CHECK(cudaMemcpyAsync(h_c + begin, d_c + begin, chunk_bytes,
                                   cudaMemcpyDeviceToHost, stream[i]));
    }

    // Wait for every stream to drain, then clean up.
    for (int i = 0; i < N_STREAMS; ++i) {
        CUDA_CHECK(cudaStreamSynchronize(stream[i]));
        CUDA_CHECK(cudaStreamDestroy(stream[i]));
    }
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));
    CUDA_CHECK(cudaFreeHost(h_a));
    CUDA_CHECK(cudaFreeHost(h_b));
    return h_c;
}

//---------------------------------------------
// Non-streamed variant.
//
// Copies both inputs to the device with blocking cudaMemcpy calls, runs a
// single kernel launch over all `size` elements and copies the result back.
//
// Returns a pinned host buffer of `size` ints holding a[i] + b[i]. The caller
// owns it and must release it with cudaFreeHost(). Any CUDA failure
// terminates the process (see CUDA_CHECK).
//---------------------------------------------
int* cuda_exec()
{
    const size_t bytes = sizeof(int)*size;

    // Host-side data.
    int *h_a = NULL;
    int *h_b = NULL;
    int *h_c = NULL;
    CUDA_CHECK(cudaHostAlloc((void **)&h_a, bytes, cudaHostAllocDefault));
    CUDA_CHECK(cudaHostAlloc((void **)&h_b, bytes, cudaHostAllocDefault));
    CUDA_CHECK(cudaHostAlloc((void **)&h_c, bytes, cudaHostAllocDefault));
    init_host_buffers(h_a, h_b, h_c, -2);

    // Host -> device transfer.
    int *d_a = NULL;
    int *d_b = NULL;
    int *d_c = NULL;
    CUDA_CHECK(cudaMalloc((void **)&d_a, bytes));
    CUDA_CHECK(cudaMalloc((void **)&d_b, bytes));
    CUDA_CHECK(cudaMalloc((void **)&d_c, bytes));
    CUDA_CHECK(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemset(d_c, 0, bytes));

    // Discard any stale error recorded before this function was entered so
    // the launch check below only reports a failure of the launch itself.
    (void)cudaGetLastError();

    // Kernel launch: one thread per element, rounded up to whole blocks.
    const int th = 1024;
    const int bl = (size + th - 1) / th;
    dim3 threads(th, 1, 1);
    dim3 blocks(bl, 1, 1);
    kernel<<<blocks, threads>>>(d_a, d_b, d_c, size);
    CUDA_CHECK(cudaGetLastError());

    // Device -> host transfer. The blocking copy also surfaces any error
    // raised while the kernel was executing.
    CUDA_CHECK(cudaMemcpy((void*)h_c, (void*)d_c, bytes, cudaMemcpyDeviceToHost));

    // Clean up.
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));
    CUDA_CHECK(cudaFreeHost(h_a));
    CUDA_CHECK(cudaFreeHost(h_b));
    return h_c;
}
// Runs the non-streamed and the streamed variant once each, prints the time
// each took (as measured by clock()) to stderr, and prints every element at
// which the two result buffers disagree.
void cuda_main()
{
    // How the host thread waits for the device to finish:
    //   cudaDeviceScheduleSpin : force polling, to decrease latency when waiting for a result
    //   cudaDeviceScheduleYield: force yielding the CPU, to increase general performance
    //   cudaDeviceScheduleAuto : default behaviour
    cudaError_t err = cudaSetDeviceFlags(cudaDeviceScheduleAuto);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaSetDeviceFlags: %s\n", cudaGetErrorString(err));
        // Reset the recorded error so it is not reported again by a later
        // cudaGetLastError() call.
        (void)cudaGetLastError();
    }

    // The CUDA runtime initialises lazily. Force that to happen here so the
    // one-time start-up cost is not charged to whichever variant runs first.
    cudaFree(0);

    // Without streams.
    clock_t t1 = clock();
    int *result1 = cuda_exec();
    clock_t t2 = clock();
    fprintf(stderr, "cuda_exec: %f\n", (double)(t2 - t1) / CLOCKS_PER_SEC);

    // With streams.
    t1 = clock();
    int *result2 = cuda_exec_stream();
    t2 = clock();
    fprintf(stderr, "cuda_exec_stream: %f\n", (double)(t2 - t1) / CLOCKS_PER_SEC);

    // Compare the results element by element.
    if (result1 != NULL && result2 != NULL) {
        for (int i = 0; i < size; i++) {
            if (result1[i] != result2[i]) {
                fprintf(stderr, "[%d] %d %d\n", i, result1[i], result2[i]);
            }
        }
    } else {
        fprintf(stderr, "cuda_main: missing result buffer, comparison skipped\n");
    }

    // Both result buffers were allocated with cudaHostAlloc by the callees.
    if (result1 != NULL) {
        cudaFreeHost(result1);
    }
    if (result2 != NULL) {
        cudaFreeHost(result2);
    }
    return;
}