21
23

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?

More than 1 year has passed since last update.

関西学院大学Advent Calendar 2017

Day 11

OpenCLでGPUをブン回してみた

Last updated at Posted at 2017-12-21

経緯

pythonで距離行列を計算する時,とある記事を参考に高速に書くことが出来るが,更に高速化できないか興味本位でOpenCLに首を突っ込んでみた.

実験環境

Macbook Pro 2016 13inch
CPU : i5
GPU : Intel Iris Graphics 540
Mem : 8GB
OpenCL version : 1.2

予備実験

まず,比較対象として,Cで距離行列を計算するプログラムを作って試してみた.
作ったプログラムのソースコードはこんな感じ.
ランダムで2次ベクトル10000個を作って,距離行列を計算する部分だけ時間を計測した.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>

#define ARAAY_SIZE 10000

int main(void) {
  srand(time(NULL));

  float p;
  float *x ;
  float *y;

  float *out;
  int size = ARAAY_SIZE;

  x=(float *)malloc( size* sizeof(float));
  y=(float *)malloc( size* sizeof(float));
  out=(float *)malloc(size*size * sizeof(float));

  for(int i =0; i<size;i++){
      x[i] = (float)rand() / 32767.0;
      y[i] = (float)rand() / 32767.0;
  }

  clock_t start,end;
  printf("start\n");
  start = clock();
  for(int i= 0 ; i < size; i++){
      for(int j = 0; j < size; j++){
          out[j + i * size] = sqrt(pow(x[i]-x[j],2)+pow(y[i]-y[j],2));
      }
  }
  end = clock();
  printf("END\n");
  printf("%lf秒かかりました\n",(double)(end-start)/CLOCKS_PER_SEC);

  free(out);
  free(x);
  free(y);

  return 0;
}

で実行結果はこんな感じ

$ gcc -lm main.c
$ ./a.out
start
END
6.582786秒かかりました

平均して約6秒前後かかった.

実は,これだと,Pythonの方が高速で計算できてしまう...

OpenCLを使う

MacだとXcode を入れるとOpenCL関連のライブラリが入って来るのでそれを使った.

実装

CPUで動かすホストコードはこんな感じになった,

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <OpenCL/opencl.h>

#define MAX_SOURCE_SIZE 0x100000
#define ARAAY_SIZE 10000
#define ITEM_SIZE 200

int main(void) {
  srand(time(NULL));

  cl_device_id device_id = NULL;
  cl_context context = NULL;
  cl_command_queue command_queue = NULL;
  cl_mem memobj = NULL;
  cl_mem memobjb = NULL;
  cl_mem memobjout = NULL;
  cl_mem memor = NULL;
  cl_mem memosize = NULL;
  cl_program program = NULL;
  cl_kernel kernel = NULL;
  cl_platform_id platform_id = NULL;
  cl_uint ret_num_devices, ret_num_platforms;
  cl_int ret;

  float p;
  float *x ;
  float *y;

  float *out;

  // read kernel code
  FILE *fp;
  char fileName[] = "./test.cl";
  char *source_str;
  size_t source_size;
  x=(float *)malloc( ARAAY_SIZE* sizeof(float));
   y=(float *)malloc( ARAAY_SIZE* sizeof(float));
   int size = ARAAY_SIZE;
  out=(float *)malloc(ARAAY_SIZE*ARAAY_SIZE * sizeof(float));

  for(int i =0; i<ARAAY_SIZE;i++){
      x[i] = (float)rand() / 32767.0;
      y[i] = (float)rand() / 32767.0;
  }

  // read kernel code
  fp = fopen(fileName, "r");
  if(!fp) {
    exit(1);
  }
  source_str = (char*)malloc(MAX_SOURCE_SIZE);
  source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
  fclose(fp);

  // get device check
  ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
  ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices);

  // make context
  context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);

  // make command queue
  command_queue = clCreateCommandQueue(context, device_id, 0, &ret);

  // make kernel code Obj
  program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);

  // compile kernel code
  ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);

  // set func
  kernel = clCreateKernel(program, "test", &ret);

  // make buffer
  memobj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*ARAAY_SIZE, NULL, &ret);
  memobjb = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*ARAAY_SIZE, NULL, &ret);
  memobjout = clCreateBuffer(context, CL_MEM_READ_WRITE, ARAAY_SIZE*ARAAY_SIZE * sizeof(float), NULL, &ret);
  memosize = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int), NULL, &ret);

  // write buffer
  ret = clEnqueueWriteBuffer(command_queue, memobj, CL_TRUE, 0, sizeof(float)*ARAAY_SIZE, x, 0, NULL, NULL);
  ret = clEnqueueWriteBuffer(command_queue, memobjb, CL_TRUE, 0, sizeof(float)*ARAAY_SIZE, y, 0, NULL, NULL);
  ret = clEnqueueWriteBuffer(command_queue, memobjout, CL_TRUE, 0, sizeof(float)*ARAAY_SIZE*ARAAY_SIZE, out, 0, NULL, NULL);
  ret = clEnqueueWriteBuffer(command_queue, memosize, CL_TRUE, 0, sizeof(int), &size, 0, NULL, NULL);

  // set args
  ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobj);
  ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&memobjb);
  ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&memobjout);
  ret = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&memosize);

  size_t local_item_size = ITEM_SIZE;
  size_t global_item_size = ITEM_SIZE;
  clock_t start,end;
  printf("start\n");
  start = clock();

  ret=clEnqueueNDRangeKernel(command_queue,kernel, 1, NULL,&global_item_size, &local_item_size, 0, NULL, NULL);

  end = clock();
  printf("END\n");
  printf("%lf秒かかりました\n",(double)(end-start)/CLOCKS_PER_SEC);

  ret = clFlush(command_queue);
  ret = clFinish(command_queue);
  ret = clReleaseKernel(kernel);
  ret = clReleaseProgram(program);
  ret = clReleaseMemObject(memobj);
  ret = clReleaseCommandQueue(command_queue);
  ret = clReleaseContext(context);

  free(source_str);
  free(out);
  free(x);
  free(y);

  return 0;
}

つぎに,GPUで動かすカーネルコードはこんな感じ

__kernel void test(__global float *x ,__global float *y
                    ,__global float *out,__global int *size) {
    size_t gx = get_global_id(0);
    int i, j, k, x, y;
    k = (*size) * (*size) / get_global_size(0);
    for(i = 0; i < k; i++){
        j = i + k * gx;
        a = j / (*size);
        b = j % (*size);
        out[j] = sqrt(pow(x[a] - x[b], 2) + pow(y[a] - y[b], 2));
    }
}

OpenCLでは,ホストコードからカーネルコードを読み込みコンパイルして実行するらしい.
一応,違うデバイス間でデータのやり取りするので,APIを使っていたりかなり複雑になったイメージがする.
local_item_size等は,あまり理解していないので適当に値を入れた.

結果

実行結果はこんな感じになった.

$ gcc main.c -framework OpenCL
$ ./a.out
start
END
0.000006秒かかりました

な,な,なんと,1,000,000倍以上高速に計算できた!!
まあ,結果は異常な高速化ができたのですか...
メモリの読み書きを考慮せずに計測していたので,読み書きも含めて計測し直した.
コードはこんな感じ.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <OpenCL/opencl.h>

#define MAX_SOURCE_SIZE 0x100000
#define ARAAY_SIZE 10000
#define ITEM_SIZE 200

int main(void) {
  srand(time(NULL));

  cl_device_id device_id = NULL;
  cl_context context = NULL;
  cl_command_queue command_queue = NULL;
  cl_mem memobj = NULL;
  cl_mem memobjb = NULL;
  cl_mem memobjout = NULL;
  cl_mem memor = NULL;
  cl_mem memosize = NULL;
  cl_program program = NULL;
  cl_kernel kernel = NULL;
  cl_platform_id platform_id = NULL;
  cl_uint ret_num_devices, ret_num_platforms;
  cl_int ret;

  float p;
  float *x ;
  float *y;

  float *out;

  // read kernel code
  FILE *fp;
  char fileName[] = "./test.cl";
  char *source_str;
  size_t source_size;
  x=(float *)malloc( ARAAY_SIZE* sizeof(float));
   y=(float *)malloc( ARAAY_SIZE* sizeof(float));
   int size = ARAAY_SIZE;
  out=(float *)malloc(ARAAY_SIZE*ARAAY_SIZE * sizeof(float));

  for(int i =0; i<ARAAY_SIZE;i++){
      x[i] = (float)rand() / 32767.0;
      y[i] = (float)rand() / 32767.0;
  }

  clock_t start,end;
  printf("start\n");
  start = clock();

  // read kernel code
  fp = fopen(fileName, "r");
  if(!fp) {
    exit(1);
  }
  source_str = (char*)malloc(MAX_SOURCE_SIZE);
  source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
  fclose(fp);

  // get device check
  ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
  ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices);

  // make context
  context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);

  // make command queue
  command_queue = clCreateCommandQueue(context, device_id, 0, &ret);

  // make kernel code Obj
  program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);

  // compile kernel code
  ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);

  // set func
  kernel = clCreateKernel(program, "test", &ret);

  // make buffer
  memobj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*ARAAY_SIZE, NULL, &ret);
  memobjb = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*ARAAY_SIZE, NULL, &ret);
  memobjout = clCreateBuffer(context, CL_MEM_READ_WRITE, ARAAY_SIZE*ARAAY_SIZE * sizeof(float), NULL, &ret);
  memosize = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int), NULL, &ret);

  // write buffer
  ret = clEnqueueWriteBuffer(command_queue, memobj, CL_TRUE, 0, sizeof(float)*ARAAY_SIZE, x, 0, NULL, NULL);
  ret = clEnqueueWriteBuffer(command_queue, memobjb, CL_TRUE, 0, sizeof(float)*ARAAY_SIZE, y, 0, NULL, NULL);
  ret = clEnqueueWriteBuffer(command_queue, memobjout, CL_TRUE, 0, sizeof(float)*ARAAY_SIZE*ARAAY_SIZE, out, 0, NULL, NULL);
  ret = clEnqueueWriteBuffer(command_queue, memosize, CL_TRUE, 0, sizeof(int), &size, 0, NULL, NULL);

  // set args
  ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobj);
  ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&memobjb);
  ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&memobjout);
  ret = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&memosize);

  size_t local_item_size = ITEM_SIZE;
  size_t global_item_size = ITEM_SIZE;

  ret=clEnqueueNDRangeKernel(command_queue,kernel, 1, NULL,&global_item_size, &local_item_size, 0, NULL, NULL);

  ret = clEnqueueReadBuffer(command_queue, memobjout, CL_TRUE, 0, sizeof(float) * size * size, out, 0, NULL, NULL);

  end = clock();
  printf("END\n");
  printf("%lf秒かかりました\n",(double)(end-start)/CLOCKS_PER_SEC);

  ret = clFlush(command_queue);
  ret = clFinish(command_queue);
  ret = clReleaseKernel(kernel);
  ret = clReleaseProgram(program);
  ret = clReleaseMemObject(memobj);
  ret = clReleaseCommandQueue(command_queue);
  ret = clReleaseContext(context);

  free(source_str);
  free(out);
  free(x);
  free(y);

  return 0;
}

ホストコードの計測開始場所を変更して,結果の読み込みを追加しただけです.
実行結果は

$gcc main.c -framework OpenCL
$./a.out
start
END
0.169318秒かかりました

約160msでpythonの高速実装したときとほとんど変わらない速度です.
愚直にCで書いたプログラムに比べて,高速化はされた.
まあ,I/Oがボトルネックになってるのが大きい感じだけど,実際使うとなるともっと計算能力のいる処理するので,ボトルネック分は十分すぎるくらい取り返せそうな気がする,

それなりに,I/O遅いの意識して書いていたけど,もしかしたらこれ以上に良い書き方があると思う.

あと,あまり行列のサイズを大きくしすぎるとメモリーが足りなくなる...
(100000*100000で計算するとPCが落ちた)

21
23
5

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
21
23

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?