0
0

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?

AMDのGPUでPush Constantsを使う際の注意点

Posted at

AMDのGPU(確認したデバイスはRadeon RX Vega 56のみ)では、vkCmdPushConstantsで与えた値を読み出す際に、シェーダの記述次第で期待と異なる挙動を示します。
読み出す場所がshader invocation(HLSLのスレッド、OpenCLのワークアイテムと同義)毎に異なる可能性がある場合、間違った場所から読むように思えます。

次のプログラムをビルドし実行しました。

プログラムのソースコード
#include <algorithm>
#include <array>
#include <cstdint>
#include <cstdlib>
#include <format>
#include <functional>
#include <iostream>
#include <random>
#include <stdexcept>
#include <vector>

#include <vulkan/vulkan.hpp>

#include <glslang/Include/glslang_c_interface.h>
#include <glslang/Public/resource_limits_c.h>

#pragma comment(lib, "vulkan-1.lib")
#pragma comment(lib, "glslang.lib")
#pragma comment(lib, "MachineIndependent.lib")
#pragma comment(lib, "GenericCodeGen.lib")
#pragma comment(lib, "glslang-default-resource-limits.lib")
#pragma comment(lib, "SPIRV.lib")
#pragma comment(lib, "SPIRV-Tools.lib")
#pragma comment(lib, "SPIRV-Tools-opt.lib")

// Compiles the GLSL compute shader source `code` and stores the resulting
// SPIR-V words in `binary`. Defined further down in this file.
bool CompileShader(std::vector<uint32_t>& binary, const char *code);

// GLSL source of the compute shader under test. One workgroup of 32
// invocations runs; each invocation copies Input[index] from the push constant
// block to Output[index] in the storage buffer at binding 0, where index is
// gl_LocalInvocationID.x. The push constant array is therefore indexed with a
// value that differs per invocation.
const char *kCode = R"(
#version 450

layout (local_size_x = 32) in;

layout (push_constant) uniform PushConstantsBlock
{
	uint Input[32];
};

layout (binding = 0) writeonly buffer BindingBlock_0
{
	uint Output[32];
};

void main()
{
	uint index = gl_LocalInvocationID.x;

	Output[index] = Input[index];
}
)";

// Returns the index of the first queue family that supports compute work, or
// UINT32_MAX when the physical device exposes no such family.
static uint32_t FindComputeQueueFamily(vk::PhysicalDevice physical_device)
{
	const std::vector<vk::QueueFamilyProperties> queue_props = physical_device.getQueueFamilyProperties();

	for (uint32_t i=0; i<queue_props.size(); ++i)
	{
		if (queue_props[i].queueFlags & vk::QueueFlagBits::eCompute)
			return i;
	}

	return UINT32_MAX;
}

// Returns the index of the first memory type that is allowed by `type_bits`
// (VkMemoryRequirements::memoryTypeBits) and has every flag in
// `required_flags`. Throws std::runtime_error when no memory type qualifies,
// instead of handing an out-of-range index to vkAllocateMemory.
static uint32_t FindMemoryTypeIndex(const vk::PhysicalDeviceMemoryProperties& mem_props, uint32_t type_bits, vk::MemoryPropertyFlags required_flags)
{
	for (uint32_t i=0; i<mem_props.memoryTypeCount; ++i)
	{
		const bool is_allowed = ((uint32_t(1) << i) & type_bits) != 0;
		const bool has_flags = (mem_props.memoryTypes[i].propertyFlags & required_flags) == required_flags;

		if (is_allowed && has_flags)
			return i;
	}

	throw std::runtime_error("no suitable memory type found");
}

// Runs the experiment: uploads 32 random words as push constants, dispatches
// the compute shader in kCode once, copies the shader output to a host-visible
// buffer and prints "<pushed value>, <value read back>" in hex for each element.
//
// All Vulkan objects are held in vk::Unique* handles, so they are released in
// reverse declaration order on both the normal and the exceptional path.
// Throws std::exception-derived errors (vk::SystemError, std::runtime_error).
static void RunPushConstantTest()
{
	constexpr uint32_t kDataElements = 32;
	constexpr uint32_t kPushConstantSize = sizeof (uint32_t) * kDataElements;
	constexpr uint32_t kBufferSize = kPushConstantSize;

	std::vector<uint32_t> shader_binary;

	if (!CompileShader(shader_binary, kCode) || shader_binary.empty())
		throw std::runtime_error("failed to compile the compute shader");

	vk::UniqueInstance vulkan_instance;
	{
		vk::InstanceCreateInfo create_info;
		vk::ApplicationInfo app_info;
		const char *layers[] = {"VK_LAYER_KHRONOS_validation"};

		app_info.setApiVersion(VK_API_VERSION_1_3);
		create_info.setPApplicationInfo(&app_info);
		create_info.enabledLayerCount = 1;
		create_info.ppEnabledLayerNames = layers;
		vulkan_instance = vk::createInstanceUnique(create_info);
	}

	const std::vector<vk::PhysicalDevice> physical_devices = vulkan_instance->enumeratePhysicalDevices();

	if (physical_devices.empty())
		throw std::runtime_error("no Vulkan physical device available");

	// The first enumerated device is the one under test.
	const vk::PhysicalDevice physical_device = physical_devices.front();

	const uint32_t queue_index = FindComputeQueueFamily(physical_device);

	if (queue_index == UINT32_MAX)
		throw std::runtime_error("no compute-capable queue family found");

	vk::UniqueDevice device;
	{
		vk::DeviceCreateInfo device_create_info;
		vk::DeviceQueueCreateInfo queue_create_info;
		float queue_priorities[1] = {1.0f};
		// NOTE(review): device-level layers are deprecated in current Vulkan;
		// kept so that the instance and device layer lists stay identical.
		const char *layers[] = {"VK_LAYER_KHRONOS_validation"};

		queue_create_info.queueFamilyIndex = queue_index;
		queue_create_info.queueCount = 1;
		queue_create_info.pQueuePriorities = queue_priorities;

		device_create_info.queueCreateInfoCount = 1;
		device_create_info.pQueueCreateInfos = &queue_create_info;
		device_create_info.enabledLayerCount = 1;
		device_create_info.ppEnabledLayerNames = layers;

		device = physical_device.createDeviceUnique(device_create_info);
	}

	vk::UniqueCommandPool command_pool;
	{
		vk::CommandPoolCreateInfo create_info;

		create_info.queueFamilyIndex = queue_index;
		create_info.flags = vk::CommandPoolCreateFlagBits::eResetCommandBuffer;
		command_pool = device->createCommandPoolUnique(create_info);
	}

	vk::UniqueCommandBuffer command_buffer;
	{
		vk::CommandBufferAllocateInfo allocate_info;

		allocate_info.commandPool = *command_pool;
		allocate_info.commandBufferCount = 1;
		allocate_info.level = vk::CommandBufferLevel::ePrimary;
		command_buffer = std::move(device->allocateCommandBuffersUnique(allocate_info).at(0));
	}

	vk::UniqueShaderModule shader_module;
	{
		vk::ShaderModuleCreateInfo create_info;

		// codeSize is expressed in bytes, the binary is stored as 32-bit words.
		create_info.setCodeSize(sizeof (uint32_t) * shader_binary.size());
		create_info.setPCode(shader_binary.data());
		shader_module = device->createShaderModuleUnique(create_info);
	}

	vk::UniqueDescriptorSetLayout descriptor_set_layout;
	{
		vk::DescriptorSetLayoutCreateInfo create_info;
		vk::DescriptorSetLayoutBinding binding;

		binding.setBinding(0);
		binding.setDescriptorType(vk::DescriptorType::eStorageBuffer);
		binding.setDescriptorCount(1);
		binding.setStageFlags(vk::ShaderStageFlagBits::eCompute);

		create_info.bindingCount = 1;
		create_info.pBindings = &binding;
		descriptor_set_layout = device->createDescriptorSetLayoutUnique(create_info);
	}

	vk::UniquePipelineLayout pipeline_layout;
	{
		vk::PipelineLayoutCreateInfo create_info;
		vk::PushConstantRange push_range;

		create_info.setLayoutCount = 1;
		create_info.pSetLayouts = &descriptor_set_layout.get();

		push_range.setStageFlags(vk::ShaderStageFlagBits::eCompute);
		push_range.setOffset(0);
		push_range.setSize(kPushConstantSize);

		// Sets both the range count and the range pointer.
		create_info.setPushConstantRanges(push_range);

		pipeline_layout = device->createPipelineLayoutUnique(create_info);
	}

	vk::UniquePipeline compute_pipeline;
	{
		vk::ComputePipelineCreateInfo create_info;
		vk::PipelineShaderStageCreateInfo shader_stage_info;

		shader_stage_info.setStage(vk::ShaderStageFlagBits::eCompute);
		shader_stage_info.setModule(*shader_module);
		shader_stage_info.setPName("main");

		create_info.setStage(shader_stage_info);
		create_info.setLayout(*pipeline_layout);

		auto created = device->createComputePipelineUnique(vk::PipelineCache{}, create_info);

		if (created.result != vk::Result::eSuccess)
			throw std::runtime_error("failed to create the compute pipeline");

		compute_pipeline = std::move(created.value);
	}

	vk::UniqueDescriptorPool descriptor_pool;
	{
		vk::DescriptorPoolCreateInfo create_info;
		vk::DescriptorPoolSize pool_size;

		pool_size.setType(vk::DescriptorType::eStorageBuffer);
		pool_size.setDescriptorCount(1);

		// eFreeDescriptorSet is required because the unique descriptor set
		// handle frees the set individually before the pool is destroyed.
		create_info.setFlags(vk::DescriptorPoolCreateFlagBits::eFreeDescriptorSet);
		create_info.setMaxSets(1);
		create_info.setPoolSizes(pool_size);

		descriptor_pool = device->createDescriptorPoolUnique(create_info);
	}

	vk::UniqueDescriptorSet descriptor_set;
	{
		vk::DescriptorSetAllocateInfo allocate_info;

		allocate_info.descriptorPool = *descriptor_pool;
		allocate_info.descriptorSetCount = 1;
		allocate_info.pSetLayouts = &descriptor_set_layout.get();

		descriptor_set = std::move(device->allocateDescriptorSetsUnique(allocate_info).at(0));
	}

	// The memory handles are declared before the buffers so that the buffers
	// are destroyed first and the memory they are bound to is freed afterwards.
	vk::UniqueDeviceMemory device_memory;
	vk::UniqueDeviceMemory staging_memory;

	vk::UniqueBuffer device_buffer;
	vk::UniqueBuffer staging_buffer;
	{
		vk::BufferCreateInfo create_info;

		create_info.setSize(kBufferSize);
		create_info.setUsage(vk::BufferUsageFlagBits::eStorageBuffer| vk::BufferUsageFlagBits::eTransferSrc);
		device_buffer = device->createBufferUnique(create_info);

		create_info.setUsage(vk::BufferUsageFlagBits::eTransferDst);
		staging_buffer = device->createBufferUnique(create_info);
	}

	{
		const vk::PhysicalDeviceMemoryProperties mem_props = physical_device.getMemoryProperties();
		vk::MemoryAllocateInfo allocate_info;
		vk::MemoryRequirements mem_requires;

		// Shader output buffer: device-local memory.
		mem_requires = device->getBufferMemoryRequirements(*device_buffer);
		allocate_info.setAllocationSize(mem_requires.size);
		allocate_info.setMemoryTypeIndex(FindMemoryTypeIndex(mem_props, mem_requires.memoryTypeBits, vk::MemoryPropertyFlagBits::eDeviceLocal));
		device_memory = device->allocateMemoryUnique(allocate_info);

		// Readback buffer: host-visible AND host-coherent, because the mapped
		// range is read below without vkInvalidateMappedMemoryRanges.
		mem_requires = device->getBufferMemoryRequirements(*staging_buffer);
		allocate_info.setAllocationSize(mem_requires.size);
		allocate_info.setMemoryTypeIndex(FindMemoryTypeIndex(mem_props, mem_requires.memoryTypeBits, vk::MemoryPropertyFlagBits::eHostVisible| vk::MemoryPropertyFlagBits::eHostCoherent));
		staging_memory = device->allocateMemoryUnique(allocate_info);
	}

	device->bindBufferMemory(*device_buffer, *device_memory, 0);
	device->bindBufferMemory(*staging_buffer, *staging_memory, 0);

	{
		vk::WriteDescriptorSet write;
		vk::DescriptorBufferInfo info;

		info.setBuffer(*device_buffer);
		info.setOffset(0);
		info.setRange(kBufferSize);

		write.setDstSet(*descriptor_set);
		write.setDstBinding(0);
		write.setDstArrayElement(0);
		write.setDescriptorCount(1);
		write.setDescriptorType(vk::DescriptorType::eStorageBuffer);
		write.setBufferInfo(info);

		device->updateDescriptorSets(write, nullptr);
	}

	// Random input so that each element is distinguishable in the output.
	std::array<uint32_t, kDataElements> data;
	{
		std::random_device random_device;

		std::generate(data.begin(), data.end(), std::ref(random_device));
	}

	{
		vk::CommandBufferBeginInfo begin_info;
		vk::BufferMemoryBarrier compute_to_copy;
		vk::BufferMemoryBarrier copy_to_host;
		vk::BufferCopy copy_region;

		begin_info.setFlags(vk::CommandBufferUsageFlagBits::eOneTimeSubmit);

		command_buffer->begin(begin_info);
		command_buffer->pushConstants(*pipeline_layout, vk::ShaderStageFlagBits::eCompute, 0, kPushConstantSize, data.data());

		command_buffer->bindPipeline(vk::PipelineBindPoint::eCompute, *compute_pipeline);
		command_buffer->bindDescriptorSets(vk::PipelineBindPoint::eCompute, *pipeline_layout, 0, *descriptor_set, nullptr);
		command_buffer->dispatch(1, 1, 1);

		// Make the shader writes visible to the transfer that follows.
		compute_to_copy.setSrcAccessMask(vk::AccessFlagBits::eShaderWrite);
		compute_to_copy.setDstAccessMask(vk::AccessFlagBits::eTransferRead);
		compute_to_copy.setSrcQueueFamilyIndex(VK_QUEUE_FAMILY_IGNORED);
		compute_to_copy.setDstQueueFamilyIndex(VK_QUEUE_FAMILY_IGNORED);
		compute_to_copy.setBuffer(*device_buffer);
		compute_to_copy.setOffset(0);
		compute_to_copy.setSize(kBufferSize);
		command_buffer->pipelineBarrier(vk::PipelineStageFlagBits::eComputeShader, vk::PipelineStageFlagBits::eTransfer, {}, {}, {compute_to_copy}, {});

		copy_region.setSrcOffset(0);
		copy_region.setDstOffset(0);
		copy_region.setSize(kBufferSize);
		command_buffer->copyBuffer(*device_buffer, *staging_buffer, {copy_region});

		// Make the transfer writes visible to host reads; waiting for the
		// queue to go idle orders execution but is not a host memory dependency.
		copy_to_host.setSrcAccessMask(vk::AccessFlagBits::eTransferWrite);
		copy_to_host.setDstAccessMask(vk::AccessFlagBits::eHostRead);
		copy_to_host.setSrcQueueFamilyIndex(VK_QUEUE_FAMILY_IGNORED);
		copy_to_host.setDstQueueFamilyIndex(VK_QUEUE_FAMILY_IGNORED);
		copy_to_host.setBuffer(*staging_buffer);
		copy_to_host.setOffset(0);
		copy_to_host.setSize(kBufferSize);
		command_buffer->pipelineBarrier(vk::PipelineStageFlagBits::eTransfer, vk::PipelineStageFlagBits::eHost, {}, {}, {copy_to_host}, {});

		command_buffer->end();
	}

	{
		vk::SubmitInfo submit_info;
		vk::Queue queue = device->getQueue(queue_index, 0);

		submit_info.setCommandBuffers(*command_buffer);

		queue.submit(submit_info);
		queue.waitIdle();
	}

	{
		void *mapped_memory = device->mapMemory(*staging_memory, 0, kBufferSize);
		const uint32_t *readback_data = static_cast<const uint32_t *>(mapped_memory);

		// Left column: value pushed; right column: value the shader wrote.
		for (std::size_t i=0; i<data.size(); ++i)
		{
			std::cout << std::format("{:x}, {:x}", data[i], readback_data[i]) << '\n';
		}

		std::cout.flush();
		device->unmapMemory(*staging_memory);
	}
}

// Entry point. Runs the push constant experiment and reports any failure
// (shader compilation, missing device/queue/memory type, Vulkan error) on
// stderr. Returns EXIT_SUCCESS on success and EXIT_FAILURE otherwise.
int main()
{
	try
	{
		RunPushConstantTest();
	}
	catch (const std::exception& error)
	{
		std::cerr << "error: " << error.what() << '\n';
		return EXIT_FAILURE;
	}

	return EXIT_SUCCESS;
}

bool CompileShader(std::vector<uint32_t>& binary, const char *code)
{
	glslang_initialize_process();

	glslang_input_t shader_input {};
	shader_input.language = GLSLANG_SOURCE_GLSL;
	shader_input.stage = GLSLANG_STAGE_COMPUTE;
	shader_input.client = GLSLANG_CLIENT_VULKAN;
	shader_input.client_version = GLSLANG_TARGET_VULKAN_1_3;
	shader_input.target_language = GLSLANG_TARGET_SPV;
	shader_input.target_language_version = GLSLANG_TARGET_SPV_1_6;
	shader_input.code = code;
	shader_input.default_version = 450;
	shader_input.default_profile = GLSLANG_NO_PROFILE;
	shader_input.force_default_version_and_profile = false;
	shader_input.forward_compatible = 0;
	shader_input.messages = GLSLANG_MSG_DEFAULT_BIT;
	shader_input.resource = glslang_default_resource();

	glslang_shader_t *shader = glslang_shader_create(&shader_input);
	glslang_shader_preprocess(shader, &shader_input);
	glslang_shader_parse(shader, &shader_input);

	glslang_program_t *program = glslang_program_create();
	glslang_program_add_shader(program, shader);
	glslang_program_link(program, GLSLANG_MSG_SPV_RULES_BIT| GLSLANG_MSG_VULKAN_RULES_BIT);
	glslang_program_SPIRV_generate(program, GLSLANG_STAGE_COMPUTE);
	size_t binary_size = glslang_program_SPIRV_get_size(program);
	binary.clear();
	binary.resize(binary_size);
	glslang_program_SPIRV_get(program, binary.data());
	glslang_program_delete(program);
	glslang_shader_delete(shader);
	glslang_finalize_process();

	return true;
}

Intel UHD 630を利用した際は次のような出力を得ました。

63064ee1, 63064ee1
7e61860f, 7e61860f
c266bde6, c266bde6
54b87425, 54b87425
84416824, 84416824
b01d4c87, b01d4c87
c84266e2, c84266e2
28e5b7bc, 28e5b7bc
<略>

一方、AMD Radeon RX Vega 56では次のような出力を得ました。

d0cfcb8e, d0cfcb8e
61e86148, d0cfcb8e
fce1ffa0, d0cfcb8e
95764b7f, d0cfcb8e
9ab37dbd, d0cfcb8e
f7e42d9f, d0cfcb8e
4ab1c545, d0cfcb8e
80491d66, d0cfcb8e
<略>

Radeon GPU Analyzerでシェーダをビルドしてみたところ、次の出力を得られました。

0x000000    v_ashrrev_i32_e32     v1, 31, v0                  Vector ALU       2202009F
0x000004    v_lshlrev_b64         v[1:2], 2, v[0:1]           Vector ALU       D28F0001 00020082
0x00000C    s_getpc_b64           s[0:1]                      Vector ALU       BE801C00
0x000010    v_mov_b32_e32         v3, s1                      Vector ALU       7E060201
0x000014    v_add_co_u32_e32      v1, vcc, s3, v1             Vector ALU       32020203
0x000018    v_addc_co_u32_e32     v2, vcc, v3, v2, vcc        Vector ALU       38040503
0x00001C    global_load_dword     v1, v[1:2], off offset:4    Vector Memory    DC508004 017F0001
0x000024    s_mov_b32             s3, s1                      Scalar ALU       BE830001
0x000028    s_load_dwordx4        s[0:3], s[2:3], 0x0         Scalar Memory    C00A0001 00000000
0x000030    v_lshlrev_b32_e32     v0, 2, v0                   Vector ALU       24000082
0x000034    s_waitcnt             vmcnt(0) lgkmcnt(0)         Flow Control     BF8C0070
0x000038    buffer_store_dword    v1, v0, s[0:3], 0 offen     Vector ALU       E0701000 80000100
0x000040    s_endpgm                                          Flow Control     BF810000

global_load_dwordで読み取り、buffer_store_dwordで書き込んでいます。

ここでシェーダの一部を次のように書き換えてみます。

// Variant of main(): every invocation copies all 32 elements, so Input is
// indexed by the loop counter and no longer by the per-invocation ID.
void main()
{
	// Not used for indexing in this variant.
	uint index = gl_LocalInvocationID.x;

	for (uint i=0; i<32; ++i)
		Output[i] = Input[i];
}

AMD Radeon RX Vega 56で次のような出力を得ました。

1a7e0d59, 1a7e0d59
48348045, 48348045
5b141b90, 5b141b90
9d44e6f9, 9d44e6f9
fc5430c7, fc5430c7
2c948604, 2c948604
f8f73374, f8f73374
4babbeef, 4babbeef
<略>

Radeon GPU Analyzerで次の出力を得ました。

0x000000    s_getpc_b64             s[20:21]                               Vector ALU       BE941C00
0x000004    s_mov_b32               s0, s3                                 Scalar ALU       BE800003
0x000008    s_mov_b32               s1, s21                                Scalar ALU       BE810015
0x00000C    s_load_dwordx16         s[4:19], s[0:1], 0x4                   Scalar Memory    C0120100 00000004
0x000014    s_mov_b32               s3, s21                                Scalar ALU       BE830015
0x000018    s_load_dwordx4          s[20:23], s[2:3], 0x0                  Scalar Memory    C00A0501 00000000
0x000020    s_waitcnt               lgkmcnt(0)                             Flow Control     BF8CC07F
0x000024    v_mov_b32_e32           v0, s4                                 Vector ALU       7E000204
0x000028    v_mov_b32_e32           v1, s5                                 Vector ALU       7E020205
0x00002C    v_mov_b32_e32           v2, s6                                 Vector ALU       7E040206
0x000030    v_mov_b32_e32           v3, s7                                 Vector ALU       7E060207
0x000034    buffer_store_dwordx4    v[0:3], off, s[20:23], 0               Vector ALU       E07C0000 80050000
0x00003C    s_nop                   0                                      Flow Control     BF800000
0x000040    v_mov_b32_e32           v0, s8                                 Vector ALU       7E000208
0x000044    v_mov_b32_e32           v1, s9                                 Vector ALU       7E020209
0x000048    v_mov_b32_e32           v2, s10                                Vector ALU       7E04020A
0x00004C    v_mov_b32_e32           v3, s11                                Vector ALU       7E06020B
0x000050    buffer_store_dwordx4    v[0:3], off, s[20:23], 0 offset:16     Vector ALU       E07C0010 80050000
0x000058    s_nop                   0                                      Flow Control     BF800000
0x00005C    v_mov_b32_e32           v0, s12                                Vector ALU       7E00020C
0x000060    v_mov_b32_e32           v1, s13                                Vector ALU       7E02020D
0x000064    v_mov_b32_e32           v2, s14                                Vector ALU       7E04020E
0x000068    v_mov_b32_e32           v3, s15                                Vector ALU       7E06020F
0x00006C    buffer_store_dwordx4    v[0:3], off, s[20:23], 0 offset:32     Vector ALU       E07C0020 80050000
0x000074    s_nop                   0                                      Flow Control     BF800000
0x000078    v_mov_b32_e32           v0, s16                                Vector ALU       7E000210
0x00007C    v_mov_b32_e32           v1, s17                                Vector ALU       7E020211
0x000080    v_mov_b32_e32           v2, s18                                Vector ALU       7E040212
0x000084    v_mov_b32_e32           v3, s19                                Vector ALU       7E060213
0x000088    buffer_store_dwordx4    v[0:3], off, s[20:23], 0 offset:48     Vector ALU       E07C0030 80050000
0x000090    s_load_dwordx16         s[0:15], s[0:1], 0x44                  Scalar Memory    C0120000 00000044
0x000098    s_waitcnt               lgkmcnt(0)                             Flow Control     BF8CC07F
0x00009C    v_mov_b32_e32           v0, s0                                 Vector ALU       7E000200
0x0000A0    v_mov_b32_e32           v1, s1                                 Vector ALU       7E020201
0x0000A4    v_mov_b32_e32           v2, s2                                 Vector ALU       7E040202
0x0000A8    v_mov_b32_e32           v3, s3                                 Vector ALU       7E060203
0x0000AC    buffer_store_dwordx4    v[0:3], off, s[20:23], 0 offset:64     Vector ALU       E07C0040 80050000
0x0000B4    s_nop                   0                                      Flow Control     BF800000
0x0000B8    v_mov_b32_e32           v0, s4                                 Vector ALU       7E000204
0x0000BC    v_mov_b32_e32           v1, s5                                 Vector ALU       7E020205
0x0000C0    v_mov_b32_e32           v2, s6                                 Vector ALU       7E040206
0x0000C4    v_mov_b32_e32           v3, s7                                 Vector ALU       7E060207
0x0000C8    buffer_store_dwordx4    v[0:3], off, s[20:23], 0 offset:80     Vector ALU       E07C0050 80050000
0x0000D0    s_nop                   0                                      Flow Control     BF800000
0x0000D4    v_mov_b32_e32           v0, s8                                 Vector ALU       7E000208
0x0000D8    v_mov_b32_e32           v1, s9                                 Vector ALU       7E020209
0x0000DC    v_mov_b32_e32           v2, s10                                Vector ALU       7E04020A
0x0000E0    v_mov_b32_e32           v3, s11                                Vector ALU       7E06020B
0x0000E4    buffer_store_dwordx4    v[0:3], off, s[20:23], 0 offset:96     Vector ALU       E07C0060 80050000
0x0000EC    s_nop                   0                                      Flow Control     BF800000
0x0000F0    v_mov_b32_e32           v0, s12                                Vector ALU       7E00020C
0x0000F4    v_mov_b32_e32           v1, s13                                Vector ALU       7E02020D
0x0000F8    v_mov_b32_e32           v2, s14                                Vector ALU       7E04020E
0x0000FC    v_mov_b32_e32           v3, s15                                Vector ALU       7E06020F
0x000100    buffer_store_dwordx4    v[0:3], off, s[20:23], 0 offset:112    Vector ALU       E07C0070 80050000
0x000108    s_endpgm                                                       Flow Control     BF810000

scalar data cacheからs_load_dwordx16で読み取り、buffer_store_dwordx4で書き込んでいます。

vkCmdPushConstantsで与えた値はscalar data cacheに書き込まれるようですが、シェーダの記述次第では間違ってglobal memoryから読み取ろうとするため、期待と異なる挙動になるようです。また、生成されたマイクロコードからは読み取り元のアドレスが何を指しているのか判別できないため、望ましくない状態だと思われます。

scalar data cacheにあるデータをshader invocation毎に異なる場所から読み出すマイクロコードはとても冗長なものとなるので、実行速度の観点から修正されることはないものと思われます。

0
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
0
0

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?