0
0

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?

More than 5 years have passed since last update.

CUDA > 時間計測をする実装 > QueryPerformanceCounter()使用

Last updated at Posted at 2016-09-10
動作環境
GeForce GTX 750 Ti
Windows 8.1 pro (64bit)
Visual Studio Community 2013 (以下、VS)
CUDA 7.5

CPUでの処理とGPUでの処理を比較しようと思っている。
それには時間計測が必要だ。

情報

stackoverflowに例があった。
http://stackoverflow.com/questions/14337278/precise-time-measurement

answered Jan 15 '13 at 12:04
Constantinius

によるコード

準備

http://qiita.com/7of9/items/e9c185351803d9794db5
を参考にCUDAをビルドできるslnを用意する (160910_measureTimeとした)。

code > cudaDeviceReset()の時間計測

LARGE_INTEGERを使うにはWindows.hをincludeする必要あり。

kernel.cu

# include "cuda_runtime.h"
# include "device_launch_parameters.h"

# include <Windows.h> // LARGE_INTEGERなどに必要
# include <stdio.h>

// Placeholder kernel for the timing experiment: the element-wise add is
// commented out, so the body currently does nothing.
__global__ void addKernel(int *c, const int *a, const int *b)
{
    //int i = threadIdx.x;
    //c[i] = a[i] + b[i];
}

// Times a single cudaDeviceReset() call with the Windows high-resolution
// performance counter and prints the elapsed wall-clock time in milliseconds.
// Returns 0 on success, or 1 if cudaDeviceReset() reports an error (in which
// case no timing is printed).
int main()
{
    LARGE_INTEGER ticksPerSecond;   // counter frequency
    LARGE_INTEGER startTicks;       // counter value before the measured call
    LARGE_INTEGER stopTicks;        // counter value after the measured call

    QueryPerformanceFrequency(&ticksPerSecond);
    QueryPerformanceCounter(&startTicks);

    // ---- measured region: begin ----
    const cudaError_t resetStatus = cudaDeviceReset();
    if (resetStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }
    // ---- measured region: end ----

    QueryPerformanceCounter(&stopTicks);

    // Convert the tick difference to milliseconds.
    const LONGLONG tickDelta = stopTicks.QuadPart - startTicks.QuadPart;
    const double elapsedMsec = tickDelta * 1000.0 / ticksPerSecond.QuadPart;

    printf("%.1f msec\n", elapsedMsec);
    return 0;
}
結果
C:\CudaDev\160910_measureTime\Debug>160910_measureTime.exe
13.5 msec

こちらの環境ではcudaDeviceReset()にはだいたい13msecかかる。

今後はGPUを使う計算を実装し、それをCPU処理と比較する。

code > GPU処理の時間計測

CUDA 7.5 RuntimeのサンプルコードでのGPU処理の時間を計測してみた。

kernel.cu

# include "cuda_runtime.h"
# include "device_launch_parameters.h"

# include <Windows.h> // LARGE_INTEGERなどに必要
# include <stdio.h>

cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);

// Element-wise vector add: each thread writes c[idx] = a[idx] + b[idx], where
// idx is the thread's index within its block.
// Only threadIdx.x is used and there is no bounds check, so the kernel must be
// launched with a single block holding exactly one thread per element.
__global__ void addKernel(int *c, const int *a, const int *b)
{
    const int idx = threadIdx.x;
    c[idx] = a[idx] + b[idx];
}

// Adds two fixed 5-element vectors on the GPU via addWithCuda(), timing that
// call with the Windows high-resolution performance counter.
// Prints the elapsed time in milliseconds followed by the resulting vector.
// Returns 0 on success, or 1 if addWithCuda() or cudaDeviceReset() fails.
int main()
{
    const int arraySize = 5;
    const int a[arraySize] = { 1, 2, 3, 4, 5 };
    const int b[arraySize] = { 10, 20, 30, 40, 50 };
    int c[arraySize] = { 0 };

    LARGE_INTEGER ticksPerSecond;   // counter frequency
    LARGE_INTEGER startTicks;       // counter value before the measured call
    LARGE_INTEGER stopTicks;        // counter value after the measured call

    QueryPerformanceFrequency(&ticksPerSecond);
    QueryPerformanceCounter(&startTicks);

    // ---- measured region: begin ----
    // The whole helper is timed: device setup, allocation, copies, kernel.
    cudaError_t status = addWithCuda(c, a, b, arraySize);
    if (status != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }
    // ---- measured region: end ----

    QueryPerformanceCounter(&stopTicks);

    // Convert the tick difference to milliseconds.
    const LONGLONG tickDelta = stopTicks.QuadPart - startTicks.QuadPart;
    const double elapsedMsec = tickDelta * 1000.0 / ticksPerSecond.QuadPart;

    printf("%.1f msec\n", elapsedMsec);

    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
        c[0], c[1], c[2], c[3], c[4]);

    // Reset the device before exiting so that profiling and tracing tools
    // such as Nsight and Visual Profiler can show complete traces.
    status = cudaDeviceReset();
    if (status != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    return 0;
}

// Helper function for using CUDA to add vectors in parallel.
//
// Copies the host arrays `a` and `b` (`size` ints each) to device 0, launches
// addKernel with one block of `size` threads, waits for it to finish, and
// copies the result into the host array `c`.
//
// Returns cudaSuccess, or the first CUDA error encountered (a message is also
// written to stderr). The device buffers are released on every path.
//
// size == 0 is treated as a successful no-op once the device is selected:
// a launch with zero threads per block is an invalid configuration, so an
// empty input would otherwise be reported as a failure.
// Because a single block is used, `size` must not exceed the device's
// maximum threads per block; a larger value surfaces as a launch error.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    cudaError_t cudaStatus;
    // Declared before any goto so no jump crosses an initialization.
    const size_t bytes = size * sizeof(int);

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Nothing to add: skip allocation and the (otherwise invalid) launch.
    // cudaStatus is already cudaSuccess here, and the buffers are still null.
    if (size == 0) {
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch a kernel on the GPU with one thread for each element.
    addKernel<<<1, size>>>(dev_c, dev_a, dev_b);

    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    // Shared cleanup for success and failure; freeing a null pointer is a
    // no-op. Return values are deliberately ignored so the original status
    // is what the caller sees.
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}

結果については以下の記事にまとめた。
http://qiita.com/7of9/items/70b21e72c3cb9a094f2d

実行の度に処理時間が減っていくのが未消化。

同じ処理をCPUで実行すると0.0msecとなった。処理が速すぎてmsecでは追えない。
一方でGPUの場合はオーバーヘッドが結果として出ているのかもしれない。

0
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
0
0

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?