More than 5 years have passed since last update.
cudaサンプルその2

CUDA
Posted at 2015-10-05
Streamを使ってみる（CUDA7.0, Windows, GTX980）。
sample.cu
#include <helper_cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
// Number of CUDA streams the work is divided across in cuda_exec_stream().
const int N_STREAMS = 10;

// Dimensions of the 2-D test data, in elements.
const int width = 3840;
const int height = 2160;

// Total number of elements in each host/device buffer.
const int size = height * width;

//---------------------------------------------
// CUDA kernel: element-wise integer addition
//---------------------------------------------
// Each thread handles the single element at
//   (flat 1-D global thread index) + offset
// and writes d_c[idx] = d_a[idx] + d_b[idx] when idx < n.
//
//   d_a, d_b : input arrays
//   d_c      : output array
//   n        : exclusive upper bound on the element index
//   offset   : added to the thread index so that a launch can start at an
//              arbitrary element (defaults to 0)
//
// Only the x dimension of the grid and block is used for indexing.
__global__ void
kernel(int *d_a, int *d_b, int *d_c, int n, int offset = 0)
{
    const unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const int idx = tid + offset;

    // Threads past the upper bound have nothing to do.
    if (idx >= n) {
        return;
    }
    d_c[idx] = d_a[idx] + d_b[idx];
}

//---------------------------------------------
// Element-wise add using multiple CUDA streams
//---------------------------------------------

// Abort with a diagnostic when a CUDA runtime call fails.
// Guarded so that an identical definition elsewhere in this file is harmless.
#ifndef SAMPLE_CUDA_CHECK
#define SAMPLE_CUDA_CHECK(call)                                              \
    do {                                                                     \
        cudaError_t sample_err_ = (call);                                    \
        if (sample_err_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
                    cudaGetErrorString(sample_err_));                        \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)
#endif

/*
 * Computes c = a + b on the GPU for `size` ints, splitting the work into
 * N_STREAMS contiguous chunks. Each chunk gets its own stream in which its
 * slice of a and b is copied to the device, the kernel is run over that slice,
 * and the resulting slice of c is copied back, so that transfers and kernels
 * of different chunks are free to overlap.
 *
 * Returns a pinned host buffer of `size` ints holding the result. Ownership
 * passes to the caller, who must release it with cudaFreeHost().
 * Any CUDA runtime failure terminates the process with a diagnostic.
 */
int* cuda_exec_stream()
{
    cudaStream_t stream[N_STREAMS];
    const size_t total_bytes = sizeof(int) * size;

    // Prepare host data in pinned memory (required for truly asynchronous copies).
    int* h_a = NULL;
    int* h_b = NULL;
    int* h_c = NULL;
    SAMPLE_CUDA_CHECK(cudaHostAlloc((void **)&h_a, total_bytes, cudaHostAllocDefault));
    SAMPLE_CUDA_CHECK(cudaHostAlloc((void **)&h_b, total_bytes, cudaHostAllocDefault));
    SAMPLE_CUDA_CHECK(cudaHostAlloc((void **)&h_c, total_bytes, cudaHostAllocDefault));
    for (int i = 0; i < height; i++) {
        for (int j = 0; j < width; j++) {
            // Row-major layout: the row stride is the row length (width), so
            // every one of the `size` elements is initialised exactly once.
            const int idx = i * width + j;
            h_a[idx] = (i + j)*height/(width+1);
            h_b[idx] = (i + j)*width/(height+1);
            h_c[idx] = -1;
        }
    }

    // Device buffers.
    int *d_a = NULL;
    int *d_b = NULL;
    int *d_c = NULL;
    SAMPLE_CUDA_CHECK(cudaMalloc((void **)&d_a, total_bytes));
    SAMPLE_CUDA_CHECK(cudaMalloc((void **)&d_b, total_bytes));
    SAMPLE_CUDA_CHECK(cudaMalloc((void **)&d_c, total_bytes));
    SAMPLE_CUDA_CHECK(cudaMemset(d_c, 0, total_bytes));

    // Create the streams.
    for (int i = 0; i < N_STREAMS; ++i) {
        SAMPLE_CUDA_CHECK(cudaStreamCreate(&stream[i]));
    }

    // Per-stream pipeline: H2D copy of the chunk -> kernel -> D2H copy of the chunk.
    // Ceil-divide so the tail is still processed when size is not a multiple
    // of N_STREAMS (the last chunk is then simply shorter).
    const int chunk = (size + N_STREAMS - 1) / N_STREAMS;
    const int th = 1024;
    for (int i = 0; i < N_STREAMS; ++i) {
        const int begin = i * chunk;
        if (begin >= size) {
            break;  // nothing left for the remaining streams
        }
        const int count = (size - begin < chunk) ? (size - begin) : chunk;
        const int end = begin + count;
        const size_t chunk_bytes = sizeof(int) * count;
        const int bl = (count + th - 1) / th;

        // Transfer only this stream's slice; the kernel launched in the same
        // stream reads nothing outside [begin, end).
        SAMPLE_CUDA_CHECK(cudaMemcpyAsync(&d_a[begin], &h_a[begin], chunk_bytes,
                                          cudaMemcpyHostToDevice, stream[i]));
        SAMPLE_CUDA_CHECK(cudaMemcpyAsync(&d_b[begin], &h_b[begin], chunk_bytes,
                                          cudaMemcpyHostToDevice, stream[i]));
        kernel<<<bl, th, 0, stream[i]>>>(d_a, d_b, d_c, end, begin);
        SAMPLE_CUDA_CHECK(cudaGetLastError());  // catches launch-configuration errors
        SAMPLE_CUDA_CHECK(cudaMemcpyAsync(&h_c[begin], &d_c[begin], chunk_bytes,
                                          cudaMemcpyDeviceToHost, stream[i]));
    }

    // Wait for all queued work, then clean up.
    for (int i = 0; i < N_STREAMS; ++i) {
        SAMPLE_CUDA_CHECK(cudaStreamSynchronize(stream[i]));
        SAMPLE_CUDA_CHECK(cudaStreamDestroy(stream[i]));
    }
    SAMPLE_CUDA_CHECK(cudaFree(d_a));
    SAMPLE_CUDA_CHECK(cudaFree(d_b));
    SAMPLE_CUDA_CHECK(cudaFree(d_c));
    SAMPLE_CUDA_CHECK(cudaFreeHost(h_a));
    SAMPLE_CUDA_CHECK(cudaFreeHost(h_b));
    return h_c;
}

//---------------------------------------------
// Element-wise add without streams (baseline)
//---------------------------------------------

// Abort with a diagnostic when a CUDA runtime call fails.
// Guarded so that an identical definition elsewhere in this file is harmless.
#ifndef SAMPLE_CUDA_CHECK
#define SAMPLE_CUDA_CHECK(call)                                              \
    do {                                                                     \
        cudaError_t sample_err_ = (call);                                    \
        if (sample_err_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
                    cudaGetErrorString(sample_err_));                        \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)
#endif

/*
 * Computes c = a + b on the GPU for `size` ints using blocking copies and a
 * single kernel launch on the default stream.
 *
 * Returns a pinned host buffer of `size` ints holding the result. Ownership
 * passes to the caller, who must release it with cudaFreeHost().
 * Any CUDA runtime failure terminates the process with a diagnostic.
 */
int* cuda_exec()
{
    const size_t total_bytes = sizeof(int) * size;

    // Prepare host data.
    int* h_a = NULL;
    int* h_b = NULL;
    int* h_c = NULL;
    SAMPLE_CUDA_CHECK(cudaHostAlloc((void **)&h_a, total_bytes, cudaHostAllocDefault));
    SAMPLE_CUDA_CHECK(cudaHostAlloc((void **)&h_b, total_bytes, cudaHostAllocDefault));
    SAMPLE_CUDA_CHECK(cudaHostAlloc((void **)&h_c, total_bytes, cudaHostAllocDefault));
    for (int i = 0; i < height; i++) {
        for (int j = 0; j < width; j++) {
            // Row-major layout: the row stride is the row length (width), so
            // every one of the `size` elements is initialised exactly once.
            const int idx = i * width + j;
            h_a[idx] = (i + j)*height/(width+1);
            h_b[idx] = (i + j)*width/(height+1);
            h_c[idx] = -2;
        }
    }

    // Host -> device transfer.
    int *d_a = NULL;
    int *d_b = NULL;
    int *d_c = NULL;
    SAMPLE_CUDA_CHECK(cudaMalloc((void **)&d_a, total_bytes));
    SAMPLE_CUDA_CHECK(cudaMalloc((void **)&d_b, total_bytes));
    SAMPLE_CUDA_CHECK(cudaMalloc((void **)&d_c, total_bytes));
    SAMPLE_CUDA_CHECK(cudaMemcpy(d_a, h_a, total_bytes, cudaMemcpyHostToDevice));
    SAMPLE_CUDA_CHECK(cudaMemcpy(d_b, h_b, total_bytes, cudaMemcpyHostToDevice));
    SAMPLE_CUDA_CHECK(cudaMemset(d_c, 0, total_bytes));

    // Kernel launch: one thread per element, ceil-divided into blocks.
    const int th = 1024;
    const int bl = (size + th - 1) / th;
    dim3 threads(th, 1, 1);
    dim3 blocks(bl, 1, 1);
    kernel<<<blocks, threads>>>(d_a, d_b, d_c, size);
    SAMPLE_CUDA_CHECK(cudaGetLastError());  // catches launch-configuration errors

    // Device -> host transfer. This blocking copy also waits for the kernel,
    // so an execution error in the kernel is reported here.
    SAMPLE_CUDA_CHECK(cudaMemcpy(h_c, d_c, total_bytes, cudaMemcpyDeviceToHost));

    // Clean up everything except the result buffer.
    SAMPLE_CUDA_CHECK(cudaFree(d_a));
    SAMPLE_CUDA_CHECK(cudaFree(d_b));
    SAMPLE_CUDA_CHECK(cudaFree(d_c));
    SAMPLE_CUDA_CHECK(cudaFreeHost(h_a));
    SAMPLE_CUDA_CHECK(cudaFreeHost(h_b));
    return h_c;
}

/*
 * Runs the non-stream and the stream implementation once each, prints the
 * elapsed time of both to stderr, and reports every element at which the two
 * result buffers differ (as "[index] non-stream-value stream-value").
 */
void cuda_main()
{
    // Host behaviour while waiting for the device to finish:
    //   cudaDeviceScheduleSpin : force polling (lower latency when waiting for a result)
    //   cudaDeviceScheduleYield: force yielding (better overall system performance)
    //   cudaDeviceScheduleAuto : default behaviour
    // A failure here is not fatal for the comparison, so only warn.
    cudaError_t flag_status = cudaSetDeviceFlags(cudaDeviceScheduleAuto);
    if (flag_status != cudaSuccess) {
        fprintf(stderr, "cudaSetDeviceFlags failed: %s\n", cudaGetErrorString(flag_status));
    }

    // Without streams. Both exec functions block until their result is on the
    // host, so the interval between the clock() calls covers the GPU work.
    clock_t t1 = clock();
    int* result1 = cuda_exec();
    clock_t t2 = clock();
    fprintf(stderr, "cuda_exec: %f\n", (double)(t2 - t1) / CLOCKS_PER_SEC);

    // With streams.
    t1 = clock();
    int* result2 = cuda_exec_stream();
    t2 = clock();
    fprintf(stderr, "cuda_exec_stream: %f\n", (double)(t2 - t1) / CLOCKS_PER_SEC);

    // Compare the results element by element.
    for (int i = 0; i < size; i++) {
        if (result1[i] != result2[i]) {
            fprintf(stderr, "[%d] %d %d\n", i, result1[i], result2[i]);
        }
    }

    // The result buffers are pinned host allocations owned by this function.
    cudaFreeHost(result1);
    cudaFreeHost(result2);
}
You get articles that match your needs
You can efficiently revisit useful information later
You can use dark theme
What you can do by signing up