プロプライエタリな CUDA Runtime や ptxas を使わずに Vulkan API + NVIDIA ドライバだけで PTX を実行できるようなのでそれについてを記しておきます。もしかしたら配布問題、依存問題が軽減されるかもしれません。
まずは実行したい CUDA コードを用意します。
inc.cu
extern "C" __global__
void add_one(unsigned int* ptr)
{
(*ptr)++;
}
CUDA を LLVM (clang++) でコンパイルして PTX にします。
$ clang++ --cuda-gpu-arch=sm_86 --cuda-device-only -include llvm_offload_wrappers/__llvm_offload.h -include __clang_cuda_builtin_vars.h -xcuda -nocudainc -nocudalib -S inc.cu -o inc.ptx
clang++: warning: CUDA version is newer than the latest partially supported version 12.8 [-Wunknown-cuda-version]
なおclang++の引数の-xcudaの前に以下を付けると数学関数もコンパイルできるようになるものの、libdeviceのリンクまでは行ってくれないようなので実行で弾かれます…。
-DCUDA_VERSION=130000 -D__forceinline__="__device__ __inline__ __attribute__((always_inline))" -include limits.h -include math.h -include __clang_cuda_libdevice_declares.h -include __clang_cuda_device_functions.h --include __clang_cuda_math.h
話を戻して上記の CUDA から生成された PTX はこんな感じになりました。
inc.ptx
//
// Generated by LLVM NVPTX Back-End
//
.version 8.7
.target sm_86
.address_size 64
// .globl add_one // -- Begin function add_one
// @add_one
.visible .entry add_one(
.param .u64 .ptr .align 1 add_one_param_0
)
{
.local .align 8 .b8 __local_depot0[8];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .b32 %r<3>;
.reg .b64 %rd<5>;
// %bb.0:
mov.u64 %SPL, __local_depot0;
cvta.local.u64 %SP, %SPL;
ld.param.u64 %rd1, [add_one_param_0];
cvta.to.global.u64 %rd2, %rd1;
cvta.global.u64 %rd3, %rd2;
st.u64 [%SP], %rd3;
ld.u64 %rd4, [%SP];
ld.u32 %r1, [%rd4];
add.s32 %r2, %r1, 1;
st.u32 [%rd4], %r2;
ret;
// -- End function
}
ここからが本番です。NVIDIAドライバでは Vulkan の VK_NVX_binary_import を使うことで PTX を直接実行できるという話を聞いたものの、ドキュメントがちゃんと書かれていなく使い方が分からないため、ChatGPT にコードを書いてもらいました:
inc.cpp
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <vulkan/vulkan.h>
int main()
{
VkInstance instance;
std::ifstream ptx_file("inc.ptx");
if (!ptx_file.is_open()) {
std::cerr << "Error opening file." << std::endl;
return 1;
}
std::stringstream ptx_buffer;
ptx_buffer << ptx_file.rdbuf();
std::string ptx = ptx_buffer.str();
VkApplicationInfo app{};
app.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO;
app.apiVersion = VK_API_VERSION_1_1;
VkInstanceCreateInfo ici{};
ici.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO;
ici.pApplicationInfo = &app;
VkResult r = vkCreateInstance(&ici, nullptr, &instance);
if (r != VK_SUCCESS) {
printf("vkCreateInstance failed\n");
return 1;
}
uint32_t gpuCount = 0;
vkEnumeratePhysicalDevices(instance, &gpuCount, nullptr);
if (gpuCount == 0) {
printf("no gpu\n");
return 1;
}
VkPhysicalDevice* gpus =
(VkPhysicalDevice*) malloc(sizeof(VkPhysicalDevice) * gpuCount);
vkEnumeratePhysicalDevices(instance, &gpuCount, gpus);
VkPhysicalDevice physicalDevice = gpus[0];
uint32_t queueCount = 0;
vkGetPhysicalDeviceQueueFamilyProperties(physicalDevice, &queueCount, nullptr);
VkQueueFamilyProperties* props =
(VkQueueFamilyProperties*) malloc(sizeof(*props) * queueCount);
vkGetPhysicalDeviceQueueFamilyProperties(physicalDevice, &queueCount, props);
uint32_t queueFamily = UINT32_MAX;
for (uint32_t i = 0; i < queueCount; i++) {
if (props[i].queueFlags & VK_QUEUE_COMPUTE_BIT) {
queueFamily = i;
break;
}
}
if (queueFamily == UINT32_MAX) {
printf("no compute queue\n");
return 1;
}
float priority = 1.0f;
VkDeviceQueueCreateInfo qci{};
qci.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
qci.queueFamilyIndex = queueFamily;
qci.queueCount = 1;
qci.pQueuePriorities = &priority;
const char* exts[] =
{
VK_NVX_BINARY_IMPORT_EXTENSION_NAME,
VK_KHR_BUFFER_DEVICE_ADDRESS_EXTENSION_NAME
};
VkDeviceCreateInfo dci{};
dci.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
dci.queueCreateInfoCount = 1;
dci.pQueueCreateInfos = &qci;
dci.enabledExtensionCount = 2;
dci.ppEnabledExtensionNames = exts;
VkPhysicalDeviceBufferDeviceAddressFeatures bda{};
bda.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_DEVICE_ADDRESS_FEATURES;
bda.bufferDeviceAddress = VK_TRUE;
dci.pNext = &bda;
VkDevice device;
r = vkCreateDevice(physicalDevice, &dci, nullptr, &device);
printf("vkCreateDevice=%d\n", r);
if (r != VK_SUCCESS)
return 1;
auto vkCreateCuModuleNVX = (PFN_vkCreateCuModuleNVX) vkGetDeviceProcAddr(device,
"vkCreateCuModuleNVX");
if (!vkCreateCuModuleNVX) {
printf("vkCreateCuModuleNVX not found\n");
return 1;
}
VkCuModuleCreateInfoNVX mi{};
mi.sType = VK_STRUCTURE_TYPE_CU_MODULE_CREATE_INFO_NVX;
mi.dataSize = ptx.size();
mi.pData = ptx.data();
VkCuModuleNVX module;
r = vkCreateCuModuleNVX(device, &mi, nullptr, &module);
printf("vkCreateCuModuleNVX=%d\n", r);
if (r == VK_SUCCESS)
printf("PTX accepted\n");
else
printf("PTX rejected\n");
PFN_vkCreateCuFunctionNVX pCreateCuFunctionNVX =
(PFN_vkCreateCuFunctionNVX) vkGetDeviceProcAddr(device, "vkCreateCuFunctionNVX");
if (!pCreateCuFunctionNVX) {
printf("not found\n");
return 1;
}
VkCuFunctionCreateInfoNVX fi{};
fi.sType = VK_STRUCTURE_TYPE_CU_FUNCTION_CREATE_INFO_NVX;
fi.module = module;
fi.pName = "add_one";
VkCuFunctionNVX func;
VkResult r2 = pCreateCuFunctionNVX(device, &fi, nullptr, &func);
printf("r2=%d func=%p\n", r2, (void*)func);
PFN_vkCmdCuLaunchKernelNVX pCmdCuLaunchKernelNVX =
(PFN_vkCmdCuLaunchKernelNVX) vkGetDeviceProcAddr(device, "vkCmdCuLaunchKernelNVX");
printf("pCmdCuLaunchKernelNVX=%p\n", (void*)pCmdCuLaunchKernelNVX);
VkQueue queue;
vkGetDeviceQueue(device, queueFamily, 0, &queue);
printf("queue=%p\n", (void*)queue);
VkCommandPool pool;
VkCommandPoolCreateInfo poolInfo{};
poolInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO;
poolInfo.queueFamilyIndex = queueFamily;
VkResult r3 = vkCreateCommandPool(device, &poolInfo, nullptr, &pool);
printf("vkCreateCommandPool=%d\n", r3);
VkCommandBuffer cmd;
VkCommandBufferAllocateInfo ai{};
ai.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
ai.commandPool = pool;
ai.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
ai.commandBufferCount = 1;
VkResult r4 = vkAllocateCommandBuffers(device, &ai, &cmd);
printf("vkAllocateCommandBuffers=%d\n", r4);
VkBuffer buffer;
VkBufferCreateInfo bci{};
bci.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
bci.size = sizeof(uint32_t);
bci.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT;
VkResult rbuf = vkCreateBuffer(device, &bci, nullptr, &buffer);
printf("vkCreateBuffer=%d\n", rbuf);
VkMemoryRequirements req;
vkGetBufferMemoryRequirements(device, buffer, &req);
VkPhysicalDeviceMemoryProperties memProps;
vkGetPhysicalDeviceMemoryProperties(physicalDevice, &memProps);
uint32_t memIndex = UINT32_MAX;
for(uint32_t i=0;i<memProps.memoryTypeCount;i++) {
if(req.memoryTypeBits & (1u << i)) {
if(memProps.memoryTypes[i].propertyFlags &
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) {
memIndex = i;
break;
}
}
}
printf("memIndex=%u\n", memIndex);
VkMemoryAllocateFlagsInfo flags{};
flags.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO;
flags.flags = VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT;
VkMemoryAllocateInfo mai{};
mai.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
mai.pNext = &flags;
mai.allocationSize = req.size;
mai.memoryTypeIndex = memIndex;
VkDeviceMemory memory;
VkResult rmem = vkAllocateMemory(device, &mai, nullptr, &memory);
printf("vkAllocateMemory=%d\n", rmem);
VkResult rbind = vkBindBufferMemory(device, buffer, memory, 0);
printf("vkBindBufferMemory=%d\n", rbind);
void* mapped = nullptr;
VkResult rmap = vkMapMemory(device, memory, 0, sizeof(uint32_t), 0, &mapped);
printf("vkMapMemory=%d\n", rmap);
*(uint32_t*)mapped = 776;
printf("before_value=%u\n", *(uint32_t*)mapped);
vkUnmapMemory(device, memory);
VkBufferDeviceAddressInfo addrInfo{};
addrInfo.sType = VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO;
addrInfo.buffer = buffer;
VkDeviceAddress addr = vkGetBufferDeviceAddress(device, &addrInfo);
printf("addr=0x%llx\n", (unsigned long long)addr);
uint64_t ptr = (uint64_t)addr;
void* params[] = {&ptr};
VkCuLaunchInfoNVX launch{};
launch.sType = VK_STRUCTURE_TYPE_CU_LAUNCH_INFO_NVX;
launch.function = func;
launch.gridDimX = 1;
launch.gridDimY = 1;
launch.gridDimZ = 1;
launch.blockDimX = 1;
launch.blockDimY = 1;
launch.blockDimZ = 1;
launch.sharedMemBytes = 0;
launch.paramCount = 1;
launch.pParams = params;
launch.extraCount = 0;
launch.pExtras = nullptr;
VkCommandBufferBeginInfo bi{};
bi.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
VkResult r5 = vkBeginCommandBuffer(cmd, &bi);
printf("vkBeginCommandBuffer=%d\n", r5);
pCmdCuLaunchKernelNVX(cmd, &launch);
printf("kernel recorded\n");
VkResult rend = vkEndCommandBuffer(cmd);
printf("vkEndCommandBuffer=%d\n", rend);
VkSubmitInfo submit{};
submit.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
submit.commandBufferCount = 1;
submit.pCommandBuffers = &cmd;
VkResult rsubmit = vkQueueSubmit(queue, 1, &submit, VK_NULL_HANDLE);
printf("vkQueueSubmit=%d\n", rsubmit);
VkResult ridle = vkQueueWaitIdle(queue);
printf("vkQueueWaitIdle=%d\n", ridle);
rmap = vkMapMemory(device, memory, 0, sizeof(uint32_t), 0, &mapped);
printf("vkMapMemory(read)=%d\n", rmap);
printf("after_value=%u\n", *(uint32_t*)mapped);
vkUnmapMemory(device, memory);
vkDestroyDevice(device, nullptr);
vkDestroyInstance(instance, nullptr);
free(props);
free(gpus);
return 0;
}
コンパイルして実行するとちゃんと動いているようです。
$ g++ inc.cpp -o inc -lvulkan
$ ./inc
vkCreateDevice=0
vkCreateCuModuleNVX=0
PTX accepted
r2=0 func=0x59f246a60240
pCmdCuLaunchKernelNVX=0x79be441547e0
queue=0x59f246c08b50
vkCreateCommandPool=0
vkAllocateCommandBuffers=0
vkCreateBuffer=0
memIndex=3
vkAllocateMemory=0
vkBindBufferMemory=0
vkMapMemory=0
before_value=776
addr=0x4310000
vkBeginCommandBuffer=0
kernel recorded
vkEndCommandBuffer=0
vkQueueSubmit=0
vkQueueWaitIdle=0
vkMapMemory(read)=0
after_value=777