4
4

Delete article

Deleted articles cannot be recovered.

The draft of this article will also be deleted.

Are you sure you want to delete this article?

More than 5 years have passed since the last update.

cudaサンプルその2

Posted at

Streamを使ってみる(CUDA7.0, Windows, GTX980)。

sample.cu
#include <helper_cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <time.h>
// Problem dimensions: a width x height grid of ints, processed as a flat
// array of `size` elements.
const int width = 3840, height = 2160;
const int size = width * height;
// Number of CUDA streams the streamed variant splits the work across.
const int N_STREAMS = 10;

//---------------------------------------------
// CUDA kernel: element-wise integer add, d_c[i] = d_a[i] + d_b[i].
//
// Expects a 1D grid of 1D blocks. Each thread handles the single element at
// (its global thread index + offset) and does nothing when that index is
// not below n. `offset` lets one launch cover a sub-range [offset, n) of
// the arrays; it defaults to 0 for a whole-array launch.
//---------------------------------------------
__global__ void
kernel(int *d_a, int *d_b, int *d_c, int n, int offset = 0)
{
    const unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const int idx = tid + offset;
    if (idx >= n) {
        return;
    }
    d_c[idx] = d_a[idx] + d_b[idx];
}

//---------------------------------------------
// Variant that uses CUDA streams
//---------------------------------------------
// Runs a CUDA runtime call only while no earlier call in the enclosing
// function has failed. The first failure is logged to stderr and kept in the
// local variable `err`, so later CUDA_TRY calls become no-ops.
#define CUDA_TRY(call)                                                   \
    do {                                                                 \
        if (err == cudaSuccess) {                                        \
            err = (call);                                                \
            if (err != cudaSuccess) {                                    \
                fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__,   \
                        #call, cudaGetErrorString(err));                 \
            }                                                            \
        }                                                                \
    } while (0)

// Computes c = a + b over `size` ints on the GPU, splitting the arrays into
// up to N_STREAMS contiguous chunks. Each chunk gets its own stream, which
// uploads only that chunk of a and b, runs the kernel on it, and downloads
// that chunk of c.
//
// Returns a pinned host buffer of `size` ints holding the result; the caller
// releases it with cudaFreeHost. Returns NULL if any CUDA call failed (the
// failure is reported on stderr and all resources are released).
int* cuda_exec_stream()
{
    const size_t bytes = sizeof(int) * size;
    cudaError_t err = cudaSuccess;
    cudaStream_t stream[N_STREAMS];
    int n_created = 0;  // how many entries of stream[] are valid

    // Host-side data (pinned, as required for truly asynchronous copies).
    int* h_a = NULL;
    int* h_b = NULL;
    int* h_c = NULL;
    CUDA_TRY(cudaHostAlloc((void **)&h_a, bytes, cudaHostAllocDefault));
    CUDA_TRY(cudaHostAlloc((void **)&h_b, bytes, cudaHostAllocDefault));
    CUDA_TRY(cudaHostAlloc((void **)&h_c, bytes, cudaHostAllocDefault));
    if (err == cudaSuccess) {
        for (int i = 0; i < height; i++) {
            for (int j = 0; j < width; j++) {
                // Row-major index: the row stride is `width`, so every one
                // of the `size` elements is written exactly once.
                const int idx = i * width + j;
                h_a[idx] = (i + j)*height/(width+1);
                h_b[idx] = (i + j)*width/(height+1);
                h_c[idx] = -1;  // sentinel, overwritten by the download
            }
        }
    }

    // Device buffers.
    int *d_a = NULL;
    int *d_b = NULL;
    int *d_c = NULL;
    CUDA_TRY(cudaMalloc((void **)&d_a, bytes));
    CUDA_TRY(cudaMalloc((void **)&d_b, bytes));
    CUDA_TRY(cudaMalloc((void **)&d_c, bytes));
    CUDA_TRY(cudaMemset(d_c, 0, bytes));

    // Create the streams.
    for (int i = 0; i < N_STREAMS && err == cudaSuccess; ++i) {
        CUDA_TRY(cudaStreamCreate(&stream[i]));
        if (err == cudaSuccess) {
            ++n_created;
        }
    }

    // Per-stream pipeline: upload chunk -> kernel -> download chunk.
    // The chunk size is rounded up so a `size` that is not a multiple of
    // N_STREAMS is still fully covered; the last chunk is simply shorter.
    const int th = 1024;
    const int chunk = (size + N_STREAMS - 1) / N_STREAMS;
    for (int i = 0; i < N_STREAMS && err == cudaSuccess; ++i) {
        const int offset = i * chunk;
        if (offset >= size) {
            break;
        }
        const int remaining = size - offset;
        const int count = (remaining < chunk) ? remaining : chunk;
        const size_t chunk_bytes = sizeof(int) * count;
        const dim3 threads(th, 1, 1);
        const dim3 blocks((count + th - 1) / th, 1, 1);

        // Copy only this stream's slice. The slices are disjoint, so no
        // stream writes device memory that another stream's kernel reads.
        CUDA_TRY(cudaMemcpyAsync(d_a + offset, h_a + offset, chunk_bytes,
                                 cudaMemcpyHostToDevice, stream[i]));
        CUDA_TRY(cudaMemcpyAsync(d_b + offset, h_b + offset, chunk_bytes,
                                 cudaMemcpyHostToDevice, stream[i]));
        if (err == cudaSuccess) {
            // The kernel adds `offset` to the thread index and stops at
            // offset + count, i.e. it covers exactly this slice.
            kernel<<<blocks, threads, 0, stream[i]>>>(d_a, d_b, d_c, offset + count, offset);
        }
        CUDA_TRY(cudaGetLastError());  // surfaces launch-configuration errors
        CUDA_TRY(cudaMemcpyAsync(h_c + offset, d_c + offset, chunk_bytes,
                                 cudaMemcpyDeviceToHost, stream[i]));
    }

    // Cleanup. Streams are always drained, even after a failure, so that no
    // queued copy is still touching the buffers released below.
    for (int i = 0; i < n_created; ++i) {
        const cudaError_t sync_err = cudaStreamSynchronize(stream[i]);
        if (sync_err != cudaSuccess && err == cudaSuccess) {
            err = sync_err;
            fprintf(stderr, "cudaStreamSynchronize: %s\n", cudaGetErrorString(sync_err));
        }
        cudaStreamDestroy(stream[i]);
    }
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    if (h_a != NULL) {
        cudaFreeHost(h_a);
    }
    if (h_b != NULL) {
        cudaFreeHost(h_b);
    }
    if (err != cudaSuccess) {
        if (h_c != NULL) {
            cudaFreeHost(h_c);
        }
        return NULL;
    }
    return h_c;
}
#undef CUDA_TRY

//---------------------------------------------
// Variant that does not use streams
//---------------------------------------------
// Runs a CUDA runtime call only while no earlier call in the enclosing
// function has failed. The first failure is logged to stderr and kept in the
// local variable `err`, so later CUDA_TRY calls become no-ops.
#define CUDA_TRY(call)                                                   \
    do {                                                                 \
        if (err == cudaSuccess) {                                        \
            err = (call);                                                \
            if (err != cudaSuccess) {                                    \
                fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__,   \
                        #call, cudaGetErrorString(err));                 \
            }                                                            \
        }                                                                \
    } while (0)

// Computes c = a + b over `size` ints on the GPU with blocking copies and a
// single kernel launch covering the whole array.
//
// Returns a pinned host buffer of `size` ints holding the result; the caller
// releases it with cudaFreeHost. Returns NULL if any CUDA call failed (the
// failure is reported on stderr and all resources are released).
int* cuda_exec()
{
    const size_t bytes = sizeof(int) * size;
    cudaError_t err = cudaSuccess;

    // Host-side data.
    int* h_a = NULL;
    int* h_b = NULL;
    int* h_c = NULL;
    CUDA_TRY(cudaHostAlloc((void **)&h_a, bytes, cudaHostAllocDefault));
    CUDA_TRY(cudaHostAlloc((void **)&h_b, bytes, cudaHostAllocDefault));
    CUDA_TRY(cudaHostAlloc((void **)&h_c, bytes, cudaHostAllocDefault));
    if (err == cudaSuccess) {
        for (int i = 0; i < height; i++) {
            for (int j = 0; j < width; j++) {
                // Row-major index: the row stride is `width`, so every one
                // of the `size` elements is written exactly once.
                const int idx = i * width + j;
                h_a[idx] = (i + j)*height/(width+1);
                h_b[idx] = (i + j)*width/(height+1);
                h_c[idx] = -2;  // sentinel, overwritten by the download
            }
        }
    }

    // Host -> device transfer.
    int *d_a = NULL;
    int *d_b = NULL;
    int *d_c = NULL;
    CUDA_TRY(cudaMalloc((void **)&d_a, bytes));
    CUDA_TRY(cudaMalloc((void **)&d_b, bytes));
    CUDA_TRY(cudaMalloc((void **)&d_c, bytes));
    CUDA_TRY(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice));
    CUDA_TRY(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice));
    CUDA_TRY(cudaMemset(d_c, 0, bytes));

    // Kernel launch: one thread per element, rounded up to whole blocks.
    const int th = 1024;
    const int bl = (size + th - 1) / th;
    const dim3 threads(th, 1, 1);
    const dim3 blocks(bl, 1, 1);
    if (err == cudaSuccess) {
        kernel<<<blocks, threads>>>(d_a, d_b, d_c, size);
    }
    CUDA_TRY(cudaGetLastError());  // surfaces launch-configuration errors

    // Device -> host transfer (blocking, so the result is complete on return).
    CUDA_TRY(cudaMemcpy((void*)h_c, (void*)d_c, bytes, cudaMemcpyDeviceToHost));

    // Cleanup.
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    if (h_a != NULL) {
        cudaFreeHost(h_a);
    }
    if (h_b != NULL) {
        cudaFreeHost(h_b);
    }
    if (err != cudaSuccess) {
        if (h_c != NULL) {
            cudaFreeHost(h_c);
        }
        return NULL;
    }
    return h_c;
}
#undef CUDA_TRY

// Runs the non-streamed and the streamed variant once each, prints the
// clock() time each call took to stderr, and reports every index at which
// the two result buffers differ.
void cuda_main()
{
    // Host behaviour while waiting for a CUDA kernel to finish:
    //   cudaDeviceScheduleSpin:  force polling, to decrease latency when we wait for a result
    //   cudaDeviceScheduleYield: force yielding, to increase general performance
    //   cudaDeviceScheduleAuto:  default behaviour
    cudaSetDeviceFlags(cudaDeviceScheduleAuto);

    // Without streams.
    clock_t t1 = clock();
    int* result1 = cuda_exec();
    clock_t t2 = clock();
    fprintf(stderr, "cuda_exec: %f\n", (double)(t2 - t1) / CLOCKS_PER_SEC);

    // With streams.
    t1 = clock();
    int* result2 = cuda_exec_stream();
    t2 = clock();
    fprintf(stderr, "cuda_exec_stream: %f\n", (double)(t2 - t1) / CLOCKS_PER_SEC);

    // Compare the results; a NULL buffer cannot be indexed, so skip then.
    if (result1 != NULL && result2 != NULL) {
        for (int i = 0; i < size; i++) {
            if (result1[i] != result2[i]) {
                fprintf(stderr, "[%d] %d %d\n", i, result1[i], result2[i]);
            }
        }
    } else {
        fprintf(stderr, "cuda_main: missing result buffer, comparison skipped\n");
    }

    // Both buffers are released here with cudaFreeHost.
    if (result1 != NULL) {
        cudaFreeHost(result1);
    }
    if (result2 != NULL) {
        cudaFreeHost(result2);
    }
}
4
4
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
4
4

Delete article

Deleted articles cannot be recovered.

The draft of this article will also be deleted.

Are you sure you want to delete this article?