Implemented the BiCGSTAB method in CUDA

As an exercise in learning CUDA, I implemented the BiCGSTAB method.
Finite sliding contact calculations and other FEM analyses often solve their linear systems with BiCGSTAB, so my motivation is to accelerate that step on the GPU.

Here is the code.


#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#define N 1024
#define tol 0.000001f
#define itr_tol 100

// Test kernel (not used by the solver): out = a + 2*b
__global__ void vector_add(float *out, float *a, float *b, int n) {
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if(i >= n){
        return;
    }
    out[i] = a[i] + 2.0f*b[i];
}


__global__ void get_residual(float *r, float *a, float *b, float *x, int n) {
    // r = b - Ax (one matrix row per thread)
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if(i >= n){
        return;
    }
    r[i] = 0.0f;
    for(int j = 0; j < n; j++){
        r[i] -= a[n*i + j]*x[j];
    }
    r[i] += b[i];
}


__global__ void cuda_matmul(float *r, float *a, float *x, int n) {
    // r = Ax (one matrix row per thread)
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if(i >= n){
        return;
    }
    r[i] = 0.0f;
    for(int j = 0; j < n; j++){
        r[i] += a[n*i + j]*x[j];
    }
}


__global__ void cuda_dot_product(float *r1, float *r2, float *rr, int n) {
    // rr[i] = dot(r1, r2); every thread redundantly computes the full sum,
    // so the scalar ends up broadcast across all entries of rr
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if(i >= n){
        return;
    }
    rr[i] = 0.0f;
    for(int j = 0; j < n; j++){
        rr[i] += r1[j]*r2[j];
    }
}


__global__ void cuda_e_residual_bicg(float *e, float *r, float *alp, float *y, int n) {
    // e = r - alp*y (elementwise; alp holds a broadcast scalar)
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if(i >= n){
        return;
    }
    e[i] = r[i] - alp[i]*y[i];
}

__global__ void cuda_add_x_alp_p_c3_e(float *x, float *alp, float *p, float *c3, float *e, int n) {
    // x = x + alp*p + c3*e
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if(i >= n){
        return;
    }
    x[i] = x[i] + alp[i]*p[i] + c3[i]*e[i];
}


__global__ void cuda_update_p_bicg(float *p, float *r, float *bet, float *c3, float *y, int n) {
    // p = r + bet*(p - c3*y)
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if(i >= n){
        return;
    }
    p[i] = r[i] + bet[i]*(p[i] - c3[i]*y[i]);
}

__global__ void cuda_divide_a_by_b(float *ret, float *a, float *b, int n) {
    // ret = a / b (elementwise)
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if(i >= n){
        return;
    }
    ret[i] = a[i]/b[i];
}

__global__ void cuda_divide_a_by_b_by_c(float *ret, float *a, float *b, float *c, int n) {
    // ret = a / b / c (elementwise)
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if(i >= n){
        return;
    }
    ret[i] = a[i]/b[i]/c[i];
}


__global__ void cuda_check_float(float *x){
    // Debug helper: print one element per thread
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    printf("x[%i] : %f\n", i, x[i]);
}


__global__ void cuda_copy_vector(float *x, float *copy){
    // copy = x (elementwise)
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    copy[i] = x[i];
}



int main(){
    float *a, *b, *x, *out;
    float *r, *r0, *p, *y, *e, *v, *er,*er0;
    float *alp, *bet, *c1,*c2, *c3, *ev, *vv, *rr,*init_rr;

    float *d_a, *d_b, *d_x, *d_out;
    float *d_r, *d_r0, *d_p, *d_y, *d_e, *d_v, *d_er;
    float *d_alp, *d_bet, *d_c1,*d_c2, *d_c3, *d_ev, *d_vv, *d_rr, *d_er0,*d_init_rr;

    // Allocate memory
    a = (float*)malloc(sizeof(float) * N*N);
    b = (float*)malloc(sizeof(float) * N);
    x = (float*)malloc(sizeof(float) * N);
    r = (float*)malloc(sizeof(float) * N);
    r0= (float*)malloc(sizeof(float) * N);
    p = (float*)malloc(sizeof(float) * N);
    y = (float*)malloc(sizeof(float) * N);
    e = (float*)malloc(sizeof(float) * N);
    v = (float*)malloc(sizeof(float) * N);

    alp = (float*)malloc(sizeof(float) * N);
    bet = (float*)malloc(sizeof(float) * N);
    c1  = (float*)malloc(sizeof(float) * N);
    c2 = (float*)malloc(sizeof(float) * N);
    c3 = (float*)malloc(sizeof(float) * N);
    ev = (float*)malloc(sizeof(float) * N);
    vv = (float*)malloc(sizeof(float) * N);
    rr= (float*)malloc(sizeof(float) * N);
    er= (float*)malloc(sizeof(float) * N);
    er0= (float*)malloc(sizeof(float) * N);
    init_rr= (float*)malloc(sizeof(float) * N);

    out = (float*)malloc(sizeof(float) * N);


    // Initialize arrays
    // A is a diagonal matrix with a[i][i] = i + 1
    int itr = 0;
    for(int i = 0; i < N*N; i++){
        a[i] = 0.0f;
        if(i == itr*N + itr){
            a[i] = float(itr) + 1.0f;
            itr = itr + 1;
        }
    }

    // b = 2 and out = 2 (out is for the test kernel); everything else starts at zero
    for(int i = 0; i < N; i++){
        b[i] = 2.0f;
        out[i] = 2.0f;
        x[i] = 0.0f;
        r[i] = 0.0f;
        r0[i] = 0.0f;
        p[i] = 0.0f;
        y[i] = 0.0f;
        e[i] = 0.0f;
        v[i] = 0.0f;
        alp[i] = 0.0f;
        bet[i] = 0.0f;
        c1[i] = 0.0f;
        c2[i] = 0.0f;
        c3[i] = 0.0f;
        ev[i] = 0.0f;
        vv[i] = 0.0f;
        rr[i] = 0.0f;
        er[i] = 0.0f;
        er0[i] = 0.0f;
        init_rr[i] = 0.0f;
    }

    // Allocate device memory
    cudaMalloc((void**)&d_a, sizeof(float) * N*N);
    cudaMalloc((void**)&d_b, sizeof(float) * N);
    cudaMalloc((void**)&d_x, sizeof(float) * N);
    cudaMalloc((void**)&d_out, sizeof(float) * N);
    cudaMalloc((void**)&d_r, sizeof(float) * N);
    cudaMalloc((void**)&d_r0, sizeof(float) * N);
    cudaMalloc((void**)&d_p, sizeof(float) * N);
    cudaMalloc((void**)&d_y, sizeof(float) * N);
    cudaMalloc((void**)&d_e, sizeof(float) * N);
    cudaMalloc((void**)&d_v, sizeof(float) * N);

    cudaMalloc((void**)&d_alp, sizeof(float) * N);
    cudaMalloc((void**)&d_bet, sizeof(float) * N);
    cudaMalloc((void**)&d_c1, sizeof(float) * N);
    cudaMalloc((void**)&d_c2, sizeof(float) * N);
    cudaMalloc((void**)&d_c3, sizeof(float) * N);
    cudaMalloc((void**)&d_ev, sizeof(float) * N);
    cudaMalloc((void**)&d_vv, sizeof(float) * N);
    cudaMalloc((void**)&d_rr, sizeof(float) * N);
    cudaMalloc((void**)&d_er0, sizeof(float) * N);
    cudaMalloc((void**)&d_er, sizeof(float) * N);
    cudaMalloc((void**)&d_init_rr, sizeof(float) * N);


    // Transfer data from host to device memory
    cudaMemcpy(d_a, a, sizeof(float) * N*N, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, sizeof(float) * N, cudaMemcpyHostToDevice);
    cudaMemcpy(d_x, x, sizeof(float) * N, cudaMemcpyHostToDevice);
    cudaMemcpy(d_out, out, sizeof(float) * N, cudaMemcpyHostToDevice);
    cudaMemcpy(d_r, r, sizeof(float) * N, cudaMemcpyHostToDevice);
    cudaMemcpy(d_r0, r0, sizeof(float) * N, cudaMemcpyHostToDevice);
    cudaMemcpy(d_p, p, sizeof(float) * N, cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, y, sizeof(float) * N, cudaMemcpyHostToDevice);
    cudaMemcpy(d_e, e, sizeof(float) * N, cudaMemcpyHostToDevice);
    cudaMemcpy(d_v, v, sizeof(float) * N, cudaMemcpyHostToDevice);

    cudaMemcpy(d_alp, alp, sizeof(float) * N, cudaMemcpyHostToDevice);
    cudaMemcpy(d_bet, bet, sizeof(float) * N, cudaMemcpyHostToDevice);
    cudaMemcpy(d_c1, c1, sizeof(float) * N, cudaMemcpyHostToDevice);
    cudaMemcpy(d_c2, c2, sizeof(float) * N, cudaMemcpyHostToDevice);
    cudaMemcpy(d_c3, c3, sizeof(float) * N, cudaMemcpyHostToDevice);
    cudaMemcpy(d_ev, ev, sizeof(float) * N, cudaMemcpyHostToDevice);
    cudaMemcpy(d_vv, vv, sizeof(float) * N, cudaMemcpyHostToDevice);
    cudaMemcpy(d_rr, rr, sizeof(float) * N, cudaMemcpyHostToDevice);
    cudaMemcpy(d_er0, er0, sizeof(float) * N, cudaMemcpyHostToDevice);
    cudaMemcpy(d_init_rr, init_rr, sizeof(float) * N, cudaMemcpyHostToDevice);

    // Thread configuration (N is assumed here to be a multiple of blocksize)
    int blocksize = 512;
    dim3 block (blocksize, 1, 1);
    dim3 grid (N / block.x, 1, 1);




    // Executing kernels for BiCGSTAB
    // Test problem: A = diag(1..N), b = 2.0, initial guess x = 0.0
    get_residual<<<grid, block>>>(d_r, d_a, d_b, d_x, N);


    // c1 = dot(r, r): squared norm of the initial residual
    cuda_dot_product<<<grid, block>>>(d_r, d_r, d_c1, N);

    cuda_copy_vector<<<grid, block>>>(d_c1, d_init_rr);

    //printf("%f",d_c1[0]);
    //FILE *outfile;
    //outfile = fopen("amat.txt","w");
    //for(int i=0; i<N*N; i++){
    //    fprintf(outfile,"%f\n",a[i]);
    //}
    //fclose(outfile);

    cudaMemcpy(c1, d_c1, sizeof(float) * N, cudaMemcpyDeviceToHost);

    if(c1[0]>float(tol) ){


        // p = r, r0 = r (initial search direction and shadow residual)
        cuda_copy_vector<<<grid, block>>>(d_r, d_p);
        cuda_copy_vector<<<grid, block>>>(d_r, d_r0);

        for(int k = 0; k < itr_tol; k++){

            // c1 = (r, r0)
            cuda_dot_product<<<grid, block>>>(d_r, d_r0, d_c1, N);

            // y = Ap
            cuda_matmul<<<grid, block>>>(d_y, d_a, d_p, N);

            // c2 = (r0, y), alp = c1 / c2
            cuda_dot_product<<<grid, block>>>(d_r0, d_y, d_c2, N);
            cuda_divide_a_by_b<<<grid, block>>>(d_alp, d_c1, d_c2, N);

            // e = r - alp*y, v = Ae
            cuda_e_residual_bicg<<<grid, block>>>(d_e, d_r, d_alp, d_y, N);
            cuda_matmul<<<grid, block>>>(d_v, d_a, d_e, N);

            // ev = (e, v), vv = (v, v)
            cuda_dot_product<<<grid, block>>>(d_e, d_v, d_ev, N);
            cuda_dot_product<<<grid, block>>>(d_v, d_v, d_vv, N);

            cudaMemcpy(vv, d_vv, sizeof(float) * N, cudaMemcpyDeviceToHost);

            if(vv[0] == 0.0f){
                printf("cuda_BiCGSTAB: divide by zero\n");
                return 1;
            }else{
                // c3 = ev / vv (the stabilization parameter, often called omega)
                cuda_divide_a_by_b<<<grid, block>>>(d_c3, d_ev, d_vv, N);
                // x = x + alp*p + c3*e
                cuda_add_x_alp_p_c3_e<<<grid, block>>>(d_x, d_alp, d_p, d_c3, d_e, N);
                // r = e - c3*v
                cuda_e_residual_bicg<<<grid, block>>>(d_r, d_e, d_c3, d_v, N);
                cuda_dot_product<<<grid, block>>>(d_r, d_r, d_rr, N);
                cudaMemcpy(rr, d_rr, sizeof(float) * N, cudaMemcpyDeviceToHost);

                // Converged when the squared residual norm falls below tol
                if(rr[0] < tol){
                    break;
                }else{
                    // bet = (r0, r_new) / c2 / c3, then p = r + bet*(p - c3*y)
                    cuda_dot_product<<<grid, block>>>(d_r0, d_r, d_c1, N);
                    cuda_divide_a_by_b_by_c<<<grid, block>>>(d_bet, d_c1, d_c2, d_c3, N);
                    cuda_update_p_bicg<<<grid, block>>>(d_p, d_r, d_bet, d_c3, d_y, N);
                }
            }
            //printf("itr = %i, residual = %f\n", k, rr[0]);
            if(k+1==itr_tol){
                printf("ERROR :: bicgstab did not converge.\n");
                return 1;
            }

        }


    }





    //vector_add<<<grid,block>>>(d_out, d_a, d_b, N);

    // Transfer data back to host memory
    cudaMemcpy(out, d_out, sizeof(float) * N, cudaMemcpyDeviceToHost);
    cudaMemcpy(rr, d_rr, sizeof(float) * N, cudaMemcpyDeviceToHost);
    cudaMemcpy(x, d_x, sizeof(float) * N, cudaMemcpyDeviceToHost);
    cudaMemcpy(alp, d_alp, sizeof(float) * N, cudaMemcpyDeviceToHost);


    // Dump the solution vector to x.txt
    FILE *outfile;
    outfile = fopen("x.txt","w");
    for(int i=0; i<N; i++){
        fprintf(outfile,"%f\n",x[i]);
    }
    fclose(outfile);

    //printf("%f\n", out[0]);
    //printf("alp[0] : %f\n", alp[0]);
    //printf("rr[0] : %f\n", rr[0]);
    //printf("x[0] : %f\n", x[0]);

    //printf("%i, %i, %i, %i, %i,%i\n",block.x, block.y, block.z,grid.x,grid.y,grid.z);
    // Deallocate device memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_x);
    cudaFree(d_out);

    cudaFree(d_r);
    cudaFree(d_r0);
    cudaFree(d_p);
    cudaFree(d_y);
    cudaFree(d_e);
    cudaFree(d_v);
    cudaFree(d_er);
    cudaFree(d_er0);

    cudaFree(d_alp);
    cudaFree(d_bet);
    cudaFree(d_c1);
    cudaFree(d_c2);
    cudaFree(d_c3);
    cudaFree(d_ev);
    cudaFree(d_vv);
    cudaFree(d_rr);
    cudaFree(d_init_rr);


    // Deallocate host memory
    free(a);
    free(b);
    free(x);
    free(out);

    free(r);
    free(r0);
    free(p);
    free(y);
    free(e);
    free(v);
    free(er);
    free(er0);

    free(alp);
    free(bet);
    free(c1);
    free(c2);
    free(c3);
    free(ev);
    free(vv);
    free(rr);
    free(init_rr);

    return 0;
}
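
For reference, assuming the source is saved as bicgstab.cu (a file name of my own choosing), it builds and runs with nvcc bicgstab.cu -o bicgstab followed by ./bicgstab.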

I omit the numerical results, but the solution agreed almost exactly with a CPU implementation of the same solver.
"Almost" because of the difference in floating-point precision (double on the CPU, float on the GPU); I plan to verify the accuracy in future work.

Points I deliberately designed a certain way, and points that deserve a second look, are as follows.

  • Scalar values such as c1, c2, and alp are stored as length-N arrays. This wastes GPU memory, but in exchange the elementwise kernels can consume them directly, reducing transfer overhead. It was meant as an optimization but may well backfire, so I will verify it later. The redundant per-thread dot product is related; see the reduction sketch after this list.
  • The argument order of the kernel functions is a mess, so I plan to tidy it up.
  • The computation uses float. This time only the diagonal entries are nonzero, but non-symmetric matrices and matrices that are not diagonally dominant also need to be tested.
  • The matrix storage is simply the dense matrix flattened into a one-dimensional array, which feels wasteful. I would like to try the CRS storage format next; see the SpMV sketch after this list.
  • I want to call this from Fortran, so I will turn the main function into something like a bicgstab function, remove the #defines, and make it easy to bind from Fortran; see the extern "C" sketch after this list.
  • I do not yet have a good grasp of the dim3-typed grid and block. This time I chose a round system of 1024 equations, but I will improve the code to handle systems of arbitrary size; see the launch-configuration sketch after this list.
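
On the first point, a standard alternative to having every thread recompute the full sum is a block-level tree reduction in shared memory. Below is a minimal, untested sketch assuming blockDim.x is a power of two (512, as above); the kernel name dot_reduce and the partial array are my own, and the per-block partial sums still need a final summation on the host or in a second kernel.

__global__ void dot_reduce(const float *r1, const float *r2, float *partial, int n) {
    __shared__ float cache[512];                   // one slot per thread in the block
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    cache[threadIdx.x] = (i < n) ? r1[i]*r2[i] : 0.0f;
    __syncthreads();
    // Tree reduction within the block (requires blockDim.x to be a power of two)
    for(int s = blockDim.x/2; s > 0; s >>= 1){
        if(threadIdx.x < s){
            cache[threadIdx.x] += cache[threadIdx.x + s];
        }
        __syncthreads();
    }
    if(threadIdx.x == 0){
        partial[blockIdx.x] = cache[0];            // one partial sum per block
    }
}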
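
On the CRS point, the switch would mainly replace cuda_matmul. Here is a minimal scalar CSR SpMV sketch, one row per thread; the array names row_ptr, col_idx, and val are hypothetical, following the usual CSR convention that row i owns the nonzeros val[row_ptr[i]] .. val[row_ptr[i+1]-1].

__global__ void csr_matmul(float *r, const int *row_ptr, const int *col_idx,
                           const float *val, const float *x, int n) {
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if(i >= n){
        return;
    }
    float sum = 0.0f;
    for(int jj = row_ptr[i]; jj < row_ptr[i+1]; jj++){
        sum += val[jj]*x[col_idx[jj]];             // only stored nonzeros are touched
    }
    r[i] = sum;
}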
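
On the Fortran point, a minimal binding sketch; the function name and signature are assumptions, not a finished interface. The idea is to move the device allocation, kernel launches, and copy-back out of main into this function, replacing the #defines with arguments.

// Fortran would declare a matching interface with iso_c_binding, roughly:
//   subroutine bicgstab(a, b, x, n, tolerance, max_iter) bind(c)
//     real(c_float)         :: a(*), b(*), x(*)
//     integer(c_int), value :: n, max_iter
//     real(c_float),  value :: tolerance
extern "C" void bicgstab(float *a, float *b, float *x,
                         int n, float tolerance, int max_iter) {
    // body of main() goes here, with N -> n, tol -> tolerance, itr_tol -> max_iter
}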
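
On the grid/block point, arbitrary sizes only need two pieces: a ceil division when computing the grid, and a guard such as if(i >= n) return; in each kernel so the spare threads of the last block do nothing. A minimal sketch (the helper name make_grid is my own):

// Rounds the grid up so at least n threads are launched even when n is
// not a multiple of blocksize, e.g. n = 1000, blocksize = 512 -> 2 blocks
dim3 make_grid(int n, int blocksize) {
    return dim3((n + blocksize - 1) / blocksize, 1, 1);
}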

I plan to add more details later.
As for speed, comparing a single core of an 8th-generation i5 against the same 8th-generation i5 with an RTX 2070, the RTX 2070 was somewhat faster.
Next I will increase the problem size and tune the grid and block settings.

Any advice would be much appreciated.
