AMDのGPU(確認できたデバイスはRadeon RX Vega 56のみです)では、vkCmdPushConstantsで与えた値をシェーダから読み出す際に、シェーダの記述次第で期待にそぐわない挙動を示します。
読み出す場所がshader invocation(HLSLのスレッド、OpenCLのワークアイテムと同義)毎に異なる可能性がある場合、間違った場所から読むように思えます。
次のプログラムをビルドし実行しました。
プログラムのソースコード
#include <algorithm>
#include <array>
#include <cstdint>
#include <cstdlib>
#include <format>
#include <functional>
#include <iostream>
#include <random>
#include <vector>
#include <vulkan/vulkan.hpp>
#include <glslang/Include/glslang_c_interface.h>
#include <glslang/Public/resource_limits_c.h>
#pragma comment(lib, "vulkan-1.lib")
#pragma comment(lib, "glslang.lib")
#pragma comment(lib, "MachineIndependent.lib")
#pragma comment(lib, "GenericCodeGen.lib")
#pragma comment(lib, "glslang-default-resource-limits.lib")
#pragma comment(lib, "SPIRV.lib")
#pragma comment(lib, "SPIRV-Tools.lib")
#pragma comment(lib, "SPIRV-Tools-opt.lib")
// Compiles the GLSL compute shader source `code` to SPIR-V and stores the
// resulting 32-bit words in `binary`. Defined after main().
bool CompileShader(std::vector<uint32_t>& binary, const char *code);
// GLSL source of the compute shader under test.
// A workgroup has 32 invocations (local_size_x = 32). Each invocation reads
// Input[gl_LocalInvocationID.x] from the push-constant block and writes it to
// the same index of Output, a storage buffer at binding 0. The push-constant
// array is therefore indexed with a value that differs per invocation.
// NOTE: the text inside the raw string is runtime data handed to the shader
// compiler; it is intentionally left exactly as written.
const char *kCode = R"(
#version 450
layout (local_size_x = 32) in;
layout (push_constant) uniform PushConstantsBlock
{
uint Input[32];
};
layout (binding = 0) writeonly buffer BindingBlock_0
{
uint Output[32];
};
void main()
{
uint index = gl_LocalInvocationID.x;
Output[index] = Input[index];
}
)";
// Reproduction program for the push-constant read issue.
//
// Steps:
//   1. Compile kCode to SPIR-V and build a compute pipeline whose layout has one
//      storage-buffer binding and a 128-byte push-constant range.
//   2. Fill 32 random uint32_t values and pass them via vkCmdPushConstants.
//   3. Dispatch a single workgroup; the shader copies the push constants into
//      the device buffer.
//   4. Copy the device buffer into a host-visible staging buffer and print each
//      original value next to the value read back, in hexadecimal.
//
// Returns EXIT_SUCCESS when the run completes, EXIT_FAILURE when the shader
// cannot be compiled or a required queue family / memory type is missing.
// Vulkan-Hpp reports API errors by throwing; those exceptions are not caught.
int main()
{
    static constexpr uint32_t kDataElements = 32;
    static constexpr uint32_t kPushConstantSize = sizeof (uint32_t) * kDataElements;
    static constexpr uint32_t kBufferSize = kPushConstantSize;

    // Do not continue with an empty or failed compilation result; creating a
    // shader module from it would be invalid.
    std::vector<uint32_t> shader_binary;
    if (!CompileShader(shader_binary, kCode) || shader_binary.empty())
    {
        std::cerr << "Failed to compile the compute shader." << std::endl;
        return EXIT_FAILURE;
    }

    vk::Instance vulkan_instance;
    {
        vk::InstanceCreateInfo create_info;
        vk::ApplicationInfo app_info;
        const char *layers[] = {"VK_LAYER_KHRONOS_validation"};
        app_info.setApiVersion(VK_API_VERSION_1_3);
        create_info.setPApplicationInfo(&app_info);
        create_info.enabledLayerCount = 1;
        create_info.ppEnabledLayerNames = layers;
        vulkan_instance = vk::createInstance(create_info);
    }

    // The first enumerated physical device is used.
    vk::PhysicalDevice physical_device = vulkan_instance.enumeratePhysicalDevices().at(0);
    vk::Device device;
    uint32_t queue_index = UINT32_MAX;
    {
        // Pick the first queue family that supports compute.
        const std::vector<vk::QueueFamilyProperties> queue_props = physical_device.getQueueFamilyProperties();
        for (uint32_t i = 0; i < queue_props.size(); ++i)
        {
            if (queue_props.at(i).queueFlags & vk::QueueFlagBits::eCompute)
            {
                queue_index = i;
                break;
            }
        }
        if (queue_index == UINT32_MAX)
        {
            std::cerr << "No compute-capable queue family was found." << std::endl;
            return EXIT_FAILURE;
        }

        vk::DeviceCreateInfo device_create_info;
        vk::DeviceQueueCreateInfo queue_create_info;
        float queue_priorities[1] = {1.0f};
        const char *layers[] = {"VK_LAYER_KHRONOS_validation"};
        queue_create_info.queueFamilyIndex = queue_index;
        queue_create_info.queueCount = 1;
        queue_create_info.pQueuePriorities = queue_priorities;
        device_create_info.queueCreateInfoCount = 1;
        device_create_info.pQueueCreateInfos = &queue_create_info;
        device_create_info.enabledLayerCount = 1;
        device_create_info.ppEnabledLayerNames = layers;
        device = physical_device.createDevice(device_create_info);
    }

    vk::CommandPool command_pool;
    {
        vk::CommandPoolCreateInfo create_info;
        create_info.queueFamilyIndex = queue_index;
        create_info.flags = vk::CommandPoolCreateFlagBits::eResetCommandBuffer;
        command_pool = device.createCommandPool(create_info);
    }

    vk::CommandBuffer command_buffer;
    {
        vk::CommandBufferAllocateInfo create_info;
        create_info.commandPool = command_pool;
        create_info.commandBufferCount = 1;
        create_info.level = vk::CommandBufferLevel::ePrimary;
        command_buffer = device.allocateCommandBuffers(create_info).at(0);
    }

    vk::ShaderModule shader_module;
    vk::DescriptorSetLayout descriptor_set_layout;
    vk::PipelineLayout pipeline_layout;
    vk::Pipeline compute_pipeline;
    vk::DescriptorPool descriptor_pool;
    vk::DescriptorSet descriptor_set;
    {
        {
            vk::ShaderModuleCreateInfo create_info;
            // codeSize is expressed in bytes, the binary in 32-bit words.
            create_info.setCodeSize(sizeof (uint32_t) * shader_binary.size());
            create_info.setPCode(shader_binary.data());
            shader_module = device.createShaderModule(create_info);
        }
        {
            // Binding 0: the storage buffer the shader writes to.
            vk::DescriptorSetLayoutCreateInfo create_info;
            vk::DescriptorSetLayoutBinding binding;
            binding.setBinding(0);
            binding.setDescriptorType(vk::DescriptorType::eStorageBuffer);
            binding.setDescriptorCount(1);
            binding.setStageFlags(vk::ShaderStageFlagBits::eCompute);
            create_info.bindingCount = 1;
            create_info.pBindings = &binding;
            descriptor_set_layout = device.createDescriptorSetLayout(create_info);
        }
        {
            // One push-constant range covering the whole Input[32] array.
            vk::PipelineLayoutCreateInfo create_info;
            vk::PushConstantRange push_range;
            create_info.setLayoutCount = 1;
            create_info.pSetLayouts = &descriptor_set_layout;
            push_range.setStageFlags(vk::ShaderStageFlagBits::eCompute);
            push_range.setOffset(0);
            push_range.setSize(kPushConstantSize);
            create_info.setPushConstantRangeCount(1);
            create_info.setPushConstantRanges(push_range);
            pipeline_layout = device.createPipelineLayout(create_info);
        }
        {
            vk::ComputePipelineCreateInfo create_info;
            vk::PipelineShaderStageCreateInfo shader_stage_info;
            shader_stage_info.setStage(vk::ShaderStageFlagBits::eCompute);
            shader_stage_info.setModule(shader_module);
            shader_stage_info.setPName("main");
            create_info.setStage(shader_stage_info);
            create_info.setLayout(pipeline_layout);
            compute_pipeline = device.createComputePipelines({}, create_info).value.at(0);
        }
        {
            vk::DescriptorPoolCreateInfo create_info;
            vk::DescriptorPoolSize pool_size;
            pool_size.setType(vk::DescriptorType::eStorageBuffer);
            pool_size.setDescriptorCount(1);
            create_info.setFlags(vk::DescriptorPoolCreateFlagBits::eFreeDescriptorSet);
            create_info.setMaxSets(1);
            create_info.setPoolSizeCount(1);
            create_info.setPoolSizes(pool_size);
            descriptor_pool = device.createDescriptorPool(create_info);
        }
        {
            vk::DescriptorSetAllocateInfo create_info;
            create_info.descriptorPool = descriptor_pool;
            create_info.descriptorSetCount = 1;
            create_info.pSetLayouts = &descriptor_set_layout;
            descriptor_set = device.allocateDescriptorSets(create_info).at(0);
        }
    }

    // device_buffer receives the shader output; staging_buffer is the copy
    // destination that the host reads back.
    vk::Buffer device_buffer;
    vk::Buffer staging_buffer;
    {
        vk::BufferCreateInfo create_info;
        create_info.setSize(kBufferSize);
        create_info.setUsage(vk::BufferUsageFlagBits::eStorageBuffer | vk::BufferUsageFlagBits::eTransferSrc);
        device_buffer = device.createBuffer(create_info);
        create_info.setUsage(vk::BufferUsageFlagBits::eTransferDst);
        staging_buffer = device.createBuffer(create_info);
    }

    vk::DeviceMemory device_memory;
    vk::DeviceMemory staging_memory;
    {
        const vk::PhysicalDeviceMemoryProperties mem_props = physical_device.getMemoryProperties();

        // Returns the first memory type permitted by `requirements` that has all
        // of the `wanted` property flags, or UINT32_MAX when none exists.
        const auto find_memory_type =
            [&mem_props](const vk::MemoryRequirements& requirements, vk::MemoryPropertyFlags wanted) -> uint32_t
        {
            for (uint32_t i = 0; i < mem_props.memoryTypeCount; ++i)
            {
                const bool allowed = ((uint32_t(1) << i) & requirements.memoryTypeBits) != 0;
                const bool matches = (mem_props.memoryTypes[i].propertyFlags & wanted) == wanted;
                if (allowed && matches)
                    return i;
            }
            return UINT32_MAX;
        };

        vk::MemoryAllocateInfo allocate_info;

        const vk::MemoryRequirements device_requires = device.getBufferMemoryRequirements(device_buffer);
        const uint32_t device_type = find_memory_type(device_requires, vk::MemoryPropertyFlagBits::eDeviceLocal);
        if (device_type == UINT32_MAX)
        {
            std::cerr << "No device-local memory type is available for the device buffer." << std::endl;
            return EXIT_FAILURE;
        }
        allocate_info.setAllocationSize(device_requires.size);
        allocate_info.setMemoryTypeIndex(device_type);
        device_memory = device.allocateMemory(allocate_info);

        // Host-coherent memory is required because the mapped range is read
        // below without an explicit vkInvalidateMappedMemoryRanges call.
        const vk::MemoryRequirements staging_requires = device.getBufferMemoryRequirements(staging_buffer);
        const uint32_t staging_type = find_memory_type(
            staging_requires,
            vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
        if (staging_type == UINT32_MAX)
        {
            std::cerr << "No host-visible, host-coherent memory type is available for the staging buffer." << std::endl;
            return EXIT_FAILURE;
        }
        allocate_info.setAllocationSize(staging_requires.size);
        allocate_info.setMemoryTypeIndex(staging_type);
        staging_memory = device.allocateMemory(allocate_info);
    }
    device.bindBufferMemory(device_buffer, device_memory, 0);
    device.bindBufferMemory(staging_buffer, staging_memory, 0);

    {
        // Point binding 0 of the descriptor set at the device buffer.
        vk::WriteDescriptorSet write;
        vk::DescriptorBufferInfo info;
        info.setBuffer(device_buffer);
        info.setOffset(0);
        info.setRange(kBufferSize);
        write.setDstSet(descriptor_set);
        write.setDstBinding(0);
        write.setDstArrayElement(0);
        write.setDescriptorCount(1);
        write.setDescriptorType(vk::DescriptorType::eStorageBuffer);
        write.setBufferInfo(info);
        device.updateDescriptorSets(write, nullptr);
    }

    // Random input values that are passed to the shader as push constants.
    std::array<uint32_t, kDataElements> data;
    {
        std::random_device random_device;
        std::generate(data.begin(), data.end(), std::ref(random_device));
    }

    {
        vk::CommandBufferBeginInfo begin_info;
        vk::BufferMemoryBarrier barrier;
        vk::BufferCopy copy_region;
        begin_info.setFlags(vk::CommandBufferUsageFlagBits::eOneTimeSubmit);
        command_buffer.begin(begin_info);
        command_buffer.pushConstants(pipeline_layout, vk::ShaderStageFlagBits::eCompute, 0, kPushConstantSize, data.data());
        command_buffer.bindPipeline(vk::PipelineBindPoint::eCompute, compute_pipeline);
        command_buffer.bindDescriptorSets(vk::PipelineBindPoint::eCompute, pipeline_layout, 0, descriptor_set, nullptr);
        // A single workgroup (32 invocations) covers all 32 elements.
        command_buffer.dispatch(1, 1, 1);
        // Make the shader writes visible to the transfer that follows.
        barrier.setSrcAccessMask(vk::AccessFlagBits::eShaderWrite);
        barrier.setDstAccessMask(vk::AccessFlagBits::eTransferRead);
        barrier.setSrcQueueFamilyIndex(VK_QUEUE_FAMILY_IGNORED);
        barrier.setDstQueueFamilyIndex(VK_QUEUE_FAMILY_IGNORED);
        barrier.setBuffer(device_buffer);
        barrier.setOffset(0);
        barrier.setSize(kBufferSize);
        command_buffer.pipelineBarrier(vk::PipelineStageFlagBits::eComputeShader, vk::PipelineStageFlagBits::eTransfer, {}, {}, {barrier}, {});
        copy_region.setSrcOffset(0);
        copy_region.setDstOffset(0);
        copy_region.setSize(kBufferSize);
        command_buffer.copyBuffer(device_buffer, staging_buffer, {copy_region});
        command_buffer.end();
    }

    {
        vk::SubmitInfo submit_info;
        vk::Queue queue = device.getQueue(queue_index, 0);
        submit_info.setCommandBufferCount(1);
        submit_info.setCommandBuffers(command_buffer);
        queue.submit(submit_info);
        // Block until the GPU work has finished before reading the result.
        queue.waitIdle();
    }

    {
        // Print "<value pushed>, <value read back>" for every element.
        const void *mapped_memory = device.mapMemory(staging_memory, 0, kBufferSize);
        const uint32_t *readback_data = static_cast<const uint32_t *>(mapped_memory);
        for (std::size_t i = 0; i < data.size(); ++i)
        {
            std::cout << std::format("{:x}, {:x}", data.at(i), readback_data[i]) << '\n';
        }
        std::cout.flush();
        device.unmapMemory(staging_memory);
    }
    // destroy objects...
    return EXIT_SUCCESS;
}
bool CompileShader(std::vector<uint32_t>& binary, const char *code)
{
glslang_initialize_process();
glslang_input_t shader_input {};
shader_input.language = GLSLANG_SOURCE_GLSL;
shader_input.stage = GLSLANG_STAGE_COMPUTE;
shader_input.client = GLSLANG_CLIENT_VULKAN;
shader_input.client_version = GLSLANG_TARGET_VULKAN_1_3;
shader_input.target_language = GLSLANG_TARGET_SPV;
shader_input.target_language_version = GLSLANG_TARGET_SPV_1_6;
shader_input.code = code;
shader_input.default_version = 450;
shader_input.default_profile = GLSLANG_NO_PROFILE;
shader_input.force_default_version_and_profile = false;
shader_input.forward_compatible = 0;
shader_input.messages = GLSLANG_MSG_DEFAULT_BIT;
shader_input.resource = glslang_default_resource();
glslang_shader_t *shader = glslang_shader_create(&shader_input);
glslang_shader_preprocess(shader, &shader_input);
glslang_shader_parse(shader, &shader_input);
glslang_program_t *program = glslang_program_create();
glslang_program_add_shader(program, shader);
glslang_program_link(program, GLSLANG_MSG_SPV_RULES_BIT| GLSLANG_MSG_VULKAN_RULES_BIT);
glslang_program_SPIRV_generate(program, GLSLANG_STAGE_COMPUTE);
size_t binary_size = glslang_program_SPIRV_get_size(program);
binary.clear();
binary.resize(binary_size);
glslang_program_SPIRV_get(program, binary.data());
glslang_program_delete(program);
glslang_shader_delete(shader);
glslang_finalize_process();
return true;
}
Intel UHD 630を利用した際は次のような出力を得ました。
63064ee1, 63064ee1
7e61860f, 7e61860f
c266bde6, c266bde6
54b87425, 54b87425
84416824, 84416824
b01d4c87, b01d4c87
c84266e2, c84266e2
28e5b7bc, 28e5b7bc
<略>
一方、AMD Radeon RX Vega 56では次のような出力を得ました。
d0cfcb8e, d0cfcb8e
61e86148, d0cfcb8e
fce1ffa0, d0cfcb8e
95764b7f, d0cfcb8e
9ab37dbd, d0cfcb8e
f7e42d9f, d0cfcb8e
4ab1c545, d0cfcb8e
80491d66, d0cfcb8e
<略>
Radeon GPU Analyzerでシェーダをビルドしてみたところ、次の出力を得られました。
0x000000 v_ashrrev_i32_e32 v1, 31, v0 Vector ALU 2202009F
0x000004 v_lshlrev_b64 v[1:2], 2, v[0:1] Vector ALU D28F0001 00020082
0x00000C s_getpc_b64 s[0:1] Vector ALU BE801C00
0x000010 v_mov_b32_e32 v3, s1 Vector ALU 7E060201
0x000014 v_add_co_u32_e32 v1, vcc, s3, v1 Vector ALU 32020203
0x000018 v_addc_co_u32_e32 v2, vcc, v3, v2, vcc Vector ALU 38040503
0x00001C global_load_dword v1, v[1:2], off offset:4 Vector Memory DC508004 017F0001
0x000024 s_mov_b32 s3, s1 Scalar ALU BE830001
0x000028 s_load_dwordx4 s[0:3], s[2:3], 0x0 Scalar Memory C00A0001 00000000
0x000030 v_lshlrev_b32_e32 v0, 2, v0 Vector ALU 24000082
0x000034 s_waitcnt vmcnt(0) lgkmcnt(0) Flow Control BF8C0070
0x000038 buffer_store_dword v1, v0, s[0:3], 0 offen Vector ALU E0701000 80000100
0x000040 s_endpgm Flow Control BF810000
global_load_dwordで読み取り、buffer_store_dwordで書き込んでいます。
ここでシェーダの一部を次のように書き換えてみます。
// Variant of main(): every invocation copies all 32 elements itself, so Input
// is indexed only by the loop counter, which takes the same values in every
// invocation.
void main()
{
// Kept from the first version; this variant no longer uses it.
uint index = gl_LocalInvocationID.x;
for (uint i=0; i<32; ++i)
Output[i] = Input[i];
}
AMD Radeon RX Vega 56で次のような出力を得ました。
1a7e0d59, 1a7e0d59
48348045, 48348045
5b141b90, 5b141b90
9d44e6f9, 9d44e6f9
fc5430c7, fc5430c7
2c948604, 2c948604
f8f73374, f8f73374
4babbeef, 4babbeef
<略>
Radeon GPU Analyzerで次の出力を得ました。
0x000000 s_getpc_b64 s[20:21] Vector ALU BE941C00
0x000004 s_mov_b32 s0, s3 Scalar ALU BE800003
0x000008 s_mov_b32 s1, s21 Scalar ALU BE810015
0x00000C s_load_dwordx16 s[4:19], s[0:1], 0x4 Scalar Memory C0120100 00000004
0x000014 s_mov_b32 s3, s21 Scalar ALU BE830015
0x000018 s_load_dwordx4 s[20:23], s[2:3], 0x0 Scalar Memory C00A0501 00000000
0x000020 s_waitcnt lgkmcnt(0) Flow Control BF8CC07F
0x000024 v_mov_b32_e32 v0, s4 Vector ALU 7E000204
0x000028 v_mov_b32_e32 v1, s5 Vector ALU 7E020205
0x00002C v_mov_b32_e32 v2, s6 Vector ALU 7E040206
0x000030 v_mov_b32_e32 v3, s7 Vector ALU 7E060207
0x000034 buffer_store_dwordx4 v[0:3], off, s[20:23], 0 Vector ALU E07C0000 80050000
0x00003C s_nop 0 Flow Control BF800000
0x000040 v_mov_b32_e32 v0, s8 Vector ALU 7E000208
0x000044 v_mov_b32_e32 v1, s9 Vector ALU 7E020209
0x000048 v_mov_b32_e32 v2, s10 Vector ALU 7E04020A
0x00004C v_mov_b32_e32 v3, s11 Vector ALU 7E06020B
0x000050 buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:16 Vector ALU E07C0010 80050000
0x000058 s_nop 0 Flow Control BF800000
0x00005C v_mov_b32_e32 v0, s12 Vector ALU 7E00020C
0x000060 v_mov_b32_e32 v1, s13 Vector ALU 7E02020D
0x000064 v_mov_b32_e32 v2, s14 Vector ALU 7E04020E
0x000068 v_mov_b32_e32 v3, s15 Vector ALU 7E06020F
0x00006C buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:32 Vector ALU E07C0020 80050000
0x000074 s_nop 0 Flow Control BF800000
0x000078 v_mov_b32_e32 v0, s16 Vector ALU 7E000210
0x00007C v_mov_b32_e32 v1, s17 Vector ALU 7E020211
0x000080 v_mov_b32_e32 v2, s18 Vector ALU 7E040212
0x000084 v_mov_b32_e32 v3, s19 Vector ALU 7E060213
0x000088 buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:48 Vector ALU E07C0030 80050000
0x000090 s_load_dwordx16 s[0:15], s[0:1], 0x44 Scalar Memory C0120000 00000044
0x000098 s_waitcnt lgkmcnt(0) Flow Control BF8CC07F
0x00009C v_mov_b32_e32 v0, s0 Vector ALU 7E000200
0x0000A0 v_mov_b32_e32 v1, s1 Vector ALU 7E020201
0x0000A4 v_mov_b32_e32 v2, s2 Vector ALU 7E040202
0x0000A8 v_mov_b32_e32 v3, s3 Vector ALU 7E060203
0x0000AC buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:64 Vector ALU E07C0040 80050000
0x0000B4 s_nop 0 Flow Control BF800000
0x0000B8 v_mov_b32_e32 v0, s4 Vector ALU 7E000204
0x0000BC v_mov_b32_e32 v1, s5 Vector ALU 7E020205
0x0000C0 v_mov_b32_e32 v2, s6 Vector ALU 7E040206
0x0000C4 v_mov_b32_e32 v3, s7 Vector ALU 7E060207
0x0000C8 buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:80 Vector ALU E07C0050 80050000
0x0000D0 s_nop 0 Flow Control BF800000
0x0000D4 v_mov_b32_e32 v0, s8 Vector ALU 7E000208
0x0000D8 v_mov_b32_e32 v1, s9 Vector ALU 7E020209
0x0000DC v_mov_b32_e32 v2, s10 Vector ALU 7E04020A
0x0000E0 v_mov_b32_e32 v3, s11 Vector ALU 7E06020B
0x0000E4 buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:96 Vector ALU E07C0060 80050000
0x0000EC s_nop 0 Flow Control BF800000
0x0000F0 v_mov_b32_e32 v0, s12 Vector ALU 7E00020C
0x0000F4 v_mov_b32_e32 v1, s13 Vector ALU 7E02020D
0x0000F8 v_mov_b32_e32 v2, s14 Vector ALU 7E04020E
0x0000FC v_mov_b32_e32 v3, s15 Vector ALU 7E06020F
0x000100 buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:112 Vector ALU E07C0070 80050000
0x000108 s_endpgm Flow Control BF810000
scalar data cacheからs_load_dwordx16で読み取り、buffer_store_dwordx4で書き込んでいます。
vkCmdPushConstantsで与えた値はscalar data cacheに書き込まれるようですが、シェーダの記述次第では間違ってglobal memoryから読み取ろうとするため期待にそぐわない挙動をするようです。また、どこから読み取ろうとしているのかわからないマイクロコードなので、よろしくないものと思われます。
scalar data cacheにあるデータをshader invocation毎に異なる場所から読み出すマイクロコードはとても冗長なものとなるので、実行速度の観点から修正されることはないものと思われます。