More than 3 years have passed since last update.

NVIDIA OptiXでRay tracing in One Weekend - Part2 (GPU編)

Last updated at 2021-12-24Posted at 2021-12-24

この記事はレイトレ Advent Calendar 2021の記事として作成されました。

さて、Part1ではGPU上でレイトレーシングを行うために必要なプログラムやAcceleration structure (AS)、Shader binding table (SBT)の準備を行いました。

ここからは、GPU上で光線を追跡していきます。

レイトレーシングの概要

デバイス(GPU)側では、ホスト側でカーネルを起動したあと、基本的にはRay generation, Intersection, Closesthit, Missの4種類のプログラムによってレイトレーシングが行われます。optixTrace()関数によってPart1で作成したASに対してトラバースを行い、交差判定が認めらたジオメトリにおいてClosesthitプログラムが実行されます。どのジオメトリとも交差が認められなかった場合は、Missプログラムが呼び出されます。私の場合は、MissプログラムでIBLの処理を行うことが多いです。

カーネルの起動

GPU上でレイトレーシングを行うには、まずホスト側でoptixLaunch関数を起動します。これを呼ぶと、GPU側でRay generationプログラムが起動され、レイトレーシングが開始されます。

// Pipeline launch parameterをGPU上にコピー
CUDA_CHECK( cudaMemcpyAsync(
            reinterpret_cast<void*>( state.d_params ),
            &state.params, sizeof( Params ),
            cudaMemcpyHostToDevice, state.stream
            ) );

// レイトレーシングカーネルの起動
OPTIX_CHECK( optixLaunch(
            state.pipeline,
            state.stream,
            reinterpret_cast<CUdeviceptr>( state.d_params ),
            sizeof( Params ),
            &state.sbt,
            state.params.width,   // launch width
            state.params.height,  // launch height
            1                     // launch depth
            ) );

optixLaunch関数の引数は以下の通りです。これまでに作成したパイプラインやlaunch parameter、SBTを使用してカーネルを起動させます。width, height, depthはカーネルを起動させるスレッドの次元です。基本的には(width, height)が出力となる画像サイズ、depthは1で良いでしょう。

optixLaunch( 
    OptixPipeline                  pipeline,
    CUstream                       stream,
    CUdeviceptr                    pipelineParams,
    size_t                         pipelineParamsSize,
    const OptixShaderBindingTable* sbt,
    unsigned int                   width,
    unsigned int                   height,
    unsigned int                   depth 
)

Ray generation

レイトレーシングカーネルが起動されたら、まずはRay generation プログラムが実行されます。
launch parameterからカメラ情報や画像の次元を取得し、レイを飛ばします。
レイトレースの開始はoptixTraceによって行われ、引数には追跡対象となるASのhandle、レイの視点・方向・最小距離・最大距離を指定します。また、0 ～ 8 個の任意の数のペイロード値を設定することができます。

static __forceinline__ __device__ void trace(
        OptixTraversableHandle handle,
        float3                 ray_origin,
        float3                 ray_direction,
        float                  tmin,
        float                  tmax,
        SurfaceInfo*           si
        )
{
    // SurfaceInfoのポインタを2つのペイロードにパックする
    unsigned int u0, u1;
    packPointer( si, u0, u1 );
    optixTrace(
            handle,
            ray_origin,
            ray_direction,
            tmin,
            tmax,
            0.0f,     // rayTime
            OptixVisibilityMask( 1 ),
            OPTIX_RAY_FLAG_NONE,
            0,        // SBT offset
            1,        // SBT stride
            0,        // missSBTIndex
            u0, u1 );
}

extern "C" __global__ void __raygen__pinhole()
{
    const int w = params.width; 
    const int h = params.height;
    const float3 eye = params.eye;
    const float3 U = params.U;
    const float3 V = params.V; 
    const float3 W = params.W;
    const uint3 idx = optixGetLaunchIndex();
    const int subframe_index = params.subframe_index;
    const int samples_per_launch = params.samples_per_launch;

    // 現在のスレッドIDから乱数用のシード値を生成
    unsigned int seed = tea<4>(idx.y * w + idx.x, subframe_index);

    float3 result = make_float3(0.0f);
    for (int i = 0; i < samples_per_launch; i++)
    {
        const float2 subpixel_jitter = make_float2(rnd(seed), rnd(seed));

        const float2 d = 2.0f * make_float2(
            ((float)idx.x + subpixel_jitter.x) / (float)w, 
            ((float)idx.y + subpixel_jitter.y) / (float)h
        ) - 1.0f;

        // 光線の向きと原点を設定
        float3 ray_direction = normalize(d.x * U + d.y * V + W);
        float3 ray_origin = eye;

        SurfaceInfo si;
        si.emission = make_float3(0.0f);
        si.albedo = make_float3(0.0f);
        si.trace_terminate = false;
        si.seed = seed;

        float3 throughput = make_float3(1.0f);

        int depth = 0;
        for (;;)
        {
            if (depth >= params.max_depth)
                break;

            // IASに対してレイトレース
            trace(params.handle, ray_origin, ray_direction, 0.01f, 1e16f, &si);

            if (si.trace_terminate) {
                result += si.emission * throughput;
                break;
            }

            // Direct callable関数を使って各マテリアルにおける散乱方向とマテリアルの色を計算
            float3 scattered;
            optixDirectCall<void, SurfaceInfo*, void*, float3&>(
                si.material.prg_id, &si, si.material.data, scattered
            );

            throughput *= si.albedo;

            ray_origin = si.p;
            ray_direction = scattered;

            ++depth;
        }
    }

    const unsigned int image_index = idx.y * params.width + idx.x;
    float3 accum_color = result / static_cast<float>(params.samples_per_launch);

    if (subframe_index > 0)
    {
        const float a = 1.0f / static_cast<float>(subframe_index + 1);
        const float3 accum_color_prev = make_float3(params.accum_buffer[image_index]);
        accum_color = lerp(accum_color_prev, accum_color, a);
    }
    // 取得した輝度値を出力バッファに書き込む
    params.accum_buffer[image_index] = make_float4(accum_color, 1.0f);
    params.frame_buffer[image_index] = make_color(accum_color);
}

optixTraceで設定したペイロードは、Intersection, Anyhit, Closesthit, Missプログラムで受け取れます。ただし、最大で8個のペイロードしか設定できないので、それ以上の値や構造体を設定したい場合に困ります。例えば、今回の場合だと、Raygen, Miss, HitGroup間における衝突位置における法線やマテリアル情報のやりとりをSurfaceInfoを介して行いますが、このままではペイロードに格納することができません。

struct SurfaceInfo
{
    // 発光度
    float3 emission;
    // 物体表面の色
    float3 albedo;
    // 衝突位置
    float3 p;
    // レイの方向
    float3 direction;
    // 法線
    float3 n;
    // テクスチャ座標
    float2 texcoord;

    // 乱数のシード値
    unsigned int seed;
    // 光線追跡を終了するか否か
    int trace_terminate;

    // マテリアル用のデータとCallablesプログラムのID
    Material material;
};

この場合にはデバイス上のローカルメモリもしくはグローバルメモリに対するポインタをいくつかのペイロードにパックする方法が取られ、trace関数内のpackPointer()がそれにあたります。

// ポインタをunsigned long longに変換してから、前側32bitをi0に、後側32bitをi1に格納する
static __forceinline__ __device__ void  packPointer( void* ptr, unsigned int& i0, unsigned int& i1 )
{
    const unsigned long long uptr = reinterpret_cast<unsigned long long>( ptr );
    i0 = uptr >> 32;
    i1 = uptr & 0x00000000ffffffff;
}

なぜ、unsigned long longにポインタを格納する必要があるのかは、CUDAドライバとの兼ね合いがあると考えられます。
CUDAドライバはデバイス上のメモリに対するアクセスを、整数型で定義されるCUdeviceptrを介して行うようです。このCUdeviceptrのサイズは実行プラットフォームのポインタサイズと同じであり、64bit環境では64bitとなります。OptiX SDKのINSTALL_WIN.txtにはCMakeの際に必ず64bitアプリを指定するように記載されていますが、この辺の兼ね合いが関係しているのかもしれませんね。

実際、ASのトラバースに用いられるOptixTraversableHandleも内部的にはunsigned long longで64bitの整数型でしたので、ASが格納されているメモリ領域を整数型のhandleを通じてアクセスしているように思えます。

参考
- CUdeviceptr を void ポインタに変換するメモ
- Why does CUDA CUdeviceptr use unsigned int instead of void?

ペイロードに格納されたポインタを、元のSurfaceInfo型のポインタに戻してあげるには、先ほどと逆で、64bitに変換したi0を32bit分左シフトしてから、i1とORを取れば復元できます。

static __forceinline__ __device__ void* unpackPointer( unsigned int i0, unsigned int i1 )
{
    const unsigned long long uptr = static_cast<unsigned long long>( i0 ) << 32 | i1;
    void*           ptr = reinterpret_cast<void*>( uptr );
    return ptr;
}

// 0番目と1番目のペイロードにパックされているSurfaceInfoのポインタを取得
static __forceinline__ __device__ SurfaceInfo* getSurfaceInfo()
{
    const unsigned int u0 = optixGetPayload_0();
    const unsigned int u1 = optixGetPayload_1();
    return reinterpret_cast<SurfaceInfo*>( unpackPointer( u0, u1 ) );
}

さて、__raygen__pinhole()関数のforループの中身についてです。ここでは、レイトレーシングにおける再帰的なパスの接続を行っています。本家の週末レイトレでは以下のコードのように再帰関数を用いていました。

color ray_color(ray, scene, depth) {
    if パスの深さが最大深さを超えている
        パスの寄与を0とし、処理終了

    if シーンのジオメトリとレイが衝突したら
        衝突点における情報を収集: 点 p, 法線 n, マテリアルのポインタ mat_ptr
        if mat_ptrが光源でないなら
            mat_ptrの表面色 albedo と レイの散乱方向 d を計算し、pとdから次のレイ(scattered)を生成
            return albedo * ray_color(scattered, world, depth-1);

    return background_color;
}

本来なら、OptiXでも再帰関数を使えれば楽なのですが、OptiXでは再帰関数がサポートされていないためできません。

パストレーシング（今回のは正確なパストレではないです）においては、以下のような積分式とパスにおけるBRDFの総乗が求まればよいので、BRDFの総乗を計算するためのthroughputを用意すれば、for文でもシンプルに実装できます。

\begin{align}
I &= \int_\Omega f(X) dX \\
f(X) &= L_e(x_0 \rightarrow x_1) \prod^k_{i=1} f_r(x_{i-1}\rightarrow x_i \rightarrow x_{i+1}) \\
\end{align}

$X:深度kのパス(x_0, x_1, ... x_k), x_0:光源上の点, L_e:光源の輝度, f_r:BRDF$

float3 throughput = make_float3(1.0f);

int depth = 0;
for (;;)
{
    if (depth >= params.max_depth)
        break;

    // IASに対してレイトレース
    trace(params.handle, ray_origin, ray_direction, 0.01f, 1e16f, &si);

    // Miss プログラムが実行されたら、si.trace_terminateはtrueになる
    // 背景色からの輝度とthroughputをかけて、パスの寄与を算出
    if (si.trace_terminate) {
        result += si.emission * throughput;
        break;
    }

    // Direct callable関数を使って各マテリアルにおける散乱方向とマテリアルの色を計算
    float3 scattered;
    optixDirectCall<void, SurfaceInfo*, void*, float3&>(
        si.material.prg_id, &si, si.material.data, scattered
    );

    throughput *= si.albedo;

    ray_origin = si.p;
    ray_direction = scattered;

    ++depth;
}

Direct callableプログラムによる疑似ポリモーフィズム

ここで、レイの散乱方向の計算についてDirect callableプログラムを用いていることについて説明します。Part1でも述べた通り、CallablesプログラムはSBT内のCallablesプログラムが紐づいているSBT indexから関数を呼び出します。ただ、本家週末レイトレをそのまま実装するなら、Direct callableなど使わずに基底クラスのポインタを介して、派生クラスの関数呼び出しを行えばよいでしょう。

しかし、悲しいことにOptiX 7.x 系ではポインタを介した仮想関数呼び出しがサポートされません。おそらくは仮想関数テーブルをデバイス上に作る代わりに、SBT内にCallables関数のテーブルを作っているからそちらを使ってねという感じなんでしょう。

そこで、今回はこのポリモーフィズム部分をDirect callableで置き換えます。Part1でも触れましたが、今回は物体表面のマテリアルに、週末レイトレのようなポインタではなく、構造体Materialを使っています。

struct Material {
    // マテリアル(Lambertian, Glass, Metal)のデータ
    // デバイス上に確保されたポインタを紐づけておく
    // 共用体(union)を使わずに汎用ポインタにすることで、
    // 異なるデータ型の構造体を追加したいときに対応しやすくなる。
    void* data; 

    // マテリアルにおける散乱方向や色を計算するためのCallablesプログラムのID
    // OptiX 7.x では仮想関数が使えないので、Callablesプログラムを使って
    // 疑似的なポリモーフィズムを実現する
    unsigned int prg_id;
};

このMaterialをSurfaceInfoに持つようにし、Closesthitプログラムが実行されたときにジオメトリに紐づいているMaterialをSurfaceInfoに設定しておくことで、Closesthit内でマテリアルの種類に応じた条件分岐を行わずに異なるマテリアルの表現が可能になります。

extern "C" __global__ void __closesthit__sphere()
{
    HitGroupData* data = (HitGroupData*)optixGetSbtDataPointer();

    // ...

    SurfaceInfo* si = getSurfaceInfo();
    si->p = P;
    si->n = world_n;
    si->direction = direction;
    si->texcoord = texcoord;
    si->material = data->material;
}

参考
- Virtual function in OptiX 7.0

Intersection/Closesehit

Intersection/Anyhit/Closesthitプログラムでは、HitGroupレコードに紐づけられたデータから交差判定やシェーディング等を行います。
とはいえここまでくればもう話すことはほとんどありません。Intersectionプログラムでジオメトリとの交差判定を行い、SurfaceInfoに交点情報を保存するのみです。三角形に関しては交差判定がOptiXにビルトイン実装されいてるので、こちら側で実装する必要はないです。以下に、球体用のIntersection/Closesthitプログラムを載せておきます。

extern "C" __global__ void __intersection__sphere()
{
    // Shader binding tableからデータを取得
    HitGroupData* data = (HitGroupData*)optixGetSbtDataPointer();
    // AABBとの交差判定が認められた球体のGAS内のIDを取得
    const int prim_idx = optixGetPrimitiveIndex();
    const SphereData sphere_data = ((SphereData*)data->shape_data)[prim_idx];

    const float3 center = sphere_data.center;
    const float radius = sphere_data.radius;

    // オブジェクト空間におけるレイの原点と方向を取得
    const float3 origin = optixGetObjectRayOrigin();
    const float3 direction = optixGetObjectRayDirection();
    // レイの最小距離と最大距離を取得
    const float tmin = optixGetRayTmin();
    const float tmax = optixGetRayTmax();

    // 球体との交差判定処理（判別式を解いて、距離tを計算)
    const float3 oc = origin - center;
    const float a = dot(direction, direction);
    const float half_b = dot(oc, direction);
    const float c = dot(oc, oc) - radius * radius;

    const float discriminant = half_b * half_b - a * c;
    if (discriminant < 0) return;
    
    const float sqrtd = sqrtf(discriminant);

    float root = (-half_b - sqrtd) / a;
    if (root < tmin || tmax < root)
    {
        root = (-half_b + sqrtd) / a;
        if (root < tmin || tmax < root)
            return;
    }

    // オブジェクト空間におけるレイと球の交点を計算
    const float3 P = origin + root * direction;
    const float3 normal = (P - center) / radius;

    // 球体におけるテクスチャ座標を算出 (Z up)と仮定して、xとyから方位角、zから仰角を計算
    float phi = atan2(normal.y, normal.x);
    if (phi < 0) phi += 2.0f * M_PIf;
    const float theta = acosf(normal.z);
    const float2 texcoord = make_float2(phi / (2.0f * M_PIf), theta / M_PIf);

    // レイと球の交差判定を認める
    optixReportIntersection(root, 0, 
        __float_as_int(normal.x), __float_as_int(normal.y), __float_as_int(normal.z),
        __float_as_int(texcoord.x), __float_as_int(texcoord.y)
    );
}

extern "C" __global__ void __closesthit__sphere()
{
    // Shader binding tableからデータを取得
    HitGroupData* data = (HitGroupData*)optixGetSbtDataPointer();

    // 0 - 2番目のAttributeからIntersectionプログラムで計算した法線を取得
    const float3 local_n = make_float3(
        __int_as_float(optixGetAttribute_0()),
        __int_as_float(optixGetAttribute_1()),
        __int_as_float(optixGetAttribute_2())
    );
    // Instanceに紐付いている行列からオブジェクト空間における法線をグローバル空間にマップする
    const float3 world_n = normalize(optixTransformNormalFromObjectToWorldSpace(local_n));

    // 3 - 4番目のAttributeからテクスチャ座標を取得
    const float2 texcoord = make_float2(
        __int_as_float(optixGetAttribute_3()),
        __int_as_float(optixGetAttribute_4())
    );

    // グローバル空間におけるレイの原点と方向を計算し、交点座標の位置を計算
    const float3 origin = optixGetWorldRayOrigin();
    const float3 direction = optixGetWorldRayDirection();
    const float3 P = origin + optixGetRayTmax() * direction;

    // PayloadからSurfaceInfoのポインタを取得し、交点上の情報を格納
    SurfaceInfo* si = getSurfaceInfo();
    si->p = P;
    si->n = world_n;
    si->direction = direction;
    si->texcoord = texcoord;
    // HitGroupDataに紐付いているマテリアル情報をSurfaceInfoに紐付ける
    si->material = data->material;
}

Intersectionプログラムではジオメトリとの交差判定が認められる場合、optixReportIntersection()関数を実行します。
hitTはレイの距離、hitKindはAnyhit、Closesthitプログラムで使用できる任意の値です。交差判定だけジオメトリごとに実装して、Closesthitを共通化する際などに使うといいんでしょうかね。使ったことはありません。
a0 ... a7 は任意の数のAttributeを設定することができます。型はunsigned int型になっており、Anyhitプログラム、Closesthitプログラムで取得することができます。個人的なイメージとしては、Intersectionがvertex シェーダで、Anyhit、Closesthitをfragmentとするとその間で頂点情報を渡すAttribute変数と考えるとしっくりきます。

optixReportIntersection(hitT, hitKind, a0 ... a7);

ただし、unsigned int型だけでなくて、float型を送りたい場面もあるかと思います。というか大抵の場合がそうですね。その場合はビルトイン関数の __float_as_int()を使います。Closesthit側でAttributeをfloatで受け取りたい場合は、__int_as_float(optixGetAttribute_0())とします。

Miss

MissプログラムはASのトラバースでジオメトリとの交差が認められなかった場合に実行されます。今回は週末レイトレ同様に背景色をSurfaceInfoの発光度に指定するのと、光線追跡を終了するフラグを立てておきます。

extern "C" __global__ void __miss__radiance()
{
    const MissData* miss = (MissData*)optixGetSbtDataPointer();

    SurfaceInfo* si = getSurfaceInfo();

    // ベクトルのy成分から背景色を計算
    const float3 unit_direction = normalize(optixGetWorldRayDirection());
    const float t = 0.5f * (unit_direction.y + 1.0f);
    si->emission = (1.0f - t) * make_float3(1.0f) + t * make_float3(0.5f, 0.7f, 1.0f);
    si->trace_terminate      = true;
}

おわりに

長くなってしまいましたが、Part1、Part2でOptiX 7.x 系におけるレイトレーシングの大体の流れが説明できたと思います。
そのほかにも、ジオメトリ型ではCurvesプリミティブが存在していたり、IASの下にIASを構築する multi-level instancing が可能で、DirectX/Vulkanよりもかゆいところには手が届きやすいのではないのかなと思います。

ただ、シンプルなレイトレシーンを作るだけで非常にやることが多い & コード量がどうしても増えがちです。
OptiXは使いたいけどSBTの構築をいちいち考えたり、そんなにたくさんコード書きたくないという方も多いと思います。

個人的には、ライブラリとして@shocker-0x15さんのOptiX_Utilityをオススメします。準備編のようなASやSBTの構築に対するラッパーはもちろんのこと、デバイス側でのユーティリティ関数も豊富に実装されていて、比較的簡単にOptiXを触れると思います。

また、私自身もopenFrameworksのレイトレ版があるといいなという思いから、OptiXを使ったリアルタイムレイトレ環境を開発しています。examplesなどがだんだん増えてきてますが、まだまだ絶賛開発継続中なので、もし使ってくださる方がいればご意見等いただきたいです。

寒さも厳しくなりつつありますので、くれぐれも健康に気を付けながら、レイトレでGPUぶん回して暖を取りましょう！

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up