v3dv: implement VK_EXT_inline_uniform_block

Inline uniform blocks store their contents in pool memory rather
than a separate buffer, and are intended to give some platforms a
way to provide more efficient access to the uniform data, similar
to push constants but with more flexible size constraints.

We implement these in a similar way to push constants: for constant
access we copy the data into the uniform stream (using the new
QUNIFORM_INLINE_UBO_* enums to identify the inline buffer from
which we need to copy), and for indirect access we fall back to
regular UBO access.

Because at the NIR level there is no distinction between inline and
regular UBOs and the compiler isn't aware of Vulkan descriptor
sets, we use the UBO index on UBO load intrinsics to identify
inline UBOs, just like we do for push constants. In particular,
we reserve indices 1..MAX_INLINE_UNIFORM_BUFFERS for this.
However, unlike push constants, inline buffers are accessed
through descriptor sets, so we need to make sure they are
located in the first slots of the UBO descriptor map. This
means we store them in the first MAX_INLINE_UNIFORM_BUFFERS
slots of the map, with regular UBOs always coming after these
slots.
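
To illustrate the convention, here is a hypothetical helper (not part of
the patch; names are made up for the example) that maps the shader-facing
UBO index to its descriptor map slot:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define MAX_INLINE_UNIFORM_BUFFERS 4

/* Shader-facing UBO index:  0 = push constants,
 *                           1..MAX_INLINE_UNIFORM_BUFFERS = inline UBOs,
 *                           anything above = regular UBOs.
 * Descriptor map slot:      shader index - 1, so inline UBOs land in
 *                           slots 0..MAX_INLINE_UNIFORM_BUFFERS - 1 and
 *                           regular UBOs always come after them.
 */
static inline bool
ubo_index_is_inline(uint32_t shader_index)
{
   return shader_index >= 1 && shader_index <= MAX_INLINE_UNIFORM_BUFFERS;
}

static inline uint32_t
ubo_index_to_map_slot(uint32_t shader_index)
{
   assert(shader_index > 0); /* index 0 never reaches the descriptor map */
   return shader_index - 1;
}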

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15575>
Author: Iago Toral Quiroga, 2022-03-24 10:05:17 +01:00 (committed by Marge Bot)
parent 37c0f68500
commit ea3223e7a4
12 changed files with 453 additions and 111 deletions

@ -483,7 +483,7 @@ Vulkan 1.3 -- all DONE: anv, radv, lvp
VK_EXT_4444_formats DONE (anv, lvp, radv, tu, v3dv)
VK_EXT_extended_dynamic_state DONE (anv, lvp, radv, tu)
VK_EXT_extended_dynamic_state2 DONE (anv, lvp, radv, tu)
VK_EXT_inline_uniform_block DONE (anv, radv)
VK_EXT_inline_uniform_block DONE (anv, radv, v3dv)
VK_EXT_pipeline_creation_cache_control DONE (anv, radv, v3dv)
VK_EXT_pipeline_creation_feedback DONE (anv, radv, v3dv)
VK_EXT_private_data DONE (anv, lvp, radv, tu, v3dv)

@ -67,4 +67,7 @@
/* Sub-pixel precision bits in the rasterizer */
#define V3D_COORD_SHIFT 6
/* Size of a cache line */
#define V3D_NON_COHERENT_ATOM_SIZE 256
#endif /* V3D_LIMITS_H */
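
V3D_NON_COHERENT_ATOM_SIZE is used below both for nonCoherentAtomSize and
for buffer alignment. As a minimal sketch of the rounding this implies
(Mesa's align64() does the equivalent):

#include <stdint.h>

#define V3D_NON_COHERENT_ATOM_SIZE 256

/* Round a size up to the next multiple of the non-coherent atom size,
 * as v3dv_CreateBuffer does with align64() further down. */
static inline uint64_t
align_to_atom(uint64_t size)
{
   return (size + V3D_NON_COHERENT_ATOM_SIZE - 1) &
          ~(uint64_t)(V3D_NON_COHERENT_ATOM_SIZE - 1);
}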

@ -2638,41 +2638,54 @@ vir_emit_tlb_color_read(struct v3d_compile *c, nir_intrinsic_instr *instr)
vir_MOV(c, color_reads_for_sample[component]));
}
static bool
try_emit_uniform(struct v3d_compile *c,
int offset,
int num_components,
nir_dest *dest,
enum quniform_contents contents)
{
/* Even though ldunif is strictly 32-bit we can still use it
* to load scalar 8-bit/16-bit uniforms so long as their offset
* is 32-bit aligned. In this case, ldunif would still load
* 32-bit into the destination with the 8-bit/16-bit uniform
* data in the LSB and garbage in the MSB, but that is fine
* because we should only be accessing the valid bits of the
* destination.
*
* FIXME: if in the future we improve our register allocator to
* pack 2 16-bit variables in the MSB and LSB of the same
* register then this optimization would not be valid as is,
* since the load clobbers the MSB.
*/
if (offset % 4 != 0)
return false;
/* We need dwords */
offset = offset / 4;
for (int i = 0; i < num_components; i++) {
ntq_store_dest(c, dest, i,
vir_uniform(c, contents, offset + i));
}
return true;
}
static void
ntq_emit_load_uniform(struct v3d_compile *c, nir_intrinsic_instr *instr)
{
/* We scalarize general TMU access for anything that is not 32-bit. */
assert(nir_dest_bit_size(instr->dest) == 32 ||
instr->num_components == 1);
/* Try to emit ldunif if possible, otherwise fallback to general TMU */
if (nir_src_is_const(instr->src[0])) {
int offset = (nir_intrinsic_base(instr) +
nir_src_as_uint(instr->src[0]));
/* Even though ldunif is strictly 32-bit we can still use it
* to load scalar 8-bit/16-bit uniforms so long as their offset
is 32-bit aligned. In this case, ldunif would still load
* 32-bit into the destination with the 8-bit/16-bit uniform
* data in the LSB and garbage in the MSB, but that is fine
* because we should only be accessing the valid bits of the
* destination.
*
* FIXME: if in the future we improve our register allocator to
* pack 2 16-bit variables in the MSB and LSB of the same
* register then this optimization would not be valid as is,
* since the load clobbers the MSB.
*/
if (offset % 4 == 0) {
/* We need dwords */
offset = offset / 4;
/* We scalarize general TMU access for anything that
* is not 32-bit.
*/
assert(nir_dest_bit_size(instr->dest) == 32 ||
instr->num_components == 1);
for (int i = 0; i < instr->num_components; i++) {
ntq_store_dest(c, &instr->dest, i,
vir_uniform(c, QUNIFORM_UNIFORM,
offset + i));
}
if (try_emit_uniform(c, offset, instr->num_components,
&instr->dest, QUNIFORM_UNIFORM)) {
return;
}
}
@ -2680,6 +2693,41 @@ ntq_emit_load_uniform(struct v3d_compile *c, nir_intrinsic_instr *instr)
ntq_emit_tmu_general(c, instr, false);
}
static bool
ntq_emit_inline_ubo_load(struct v3d_compile *c, nir_intrinsic_instr *instr)
{
if (c->compiler->max_inline_uniform_buffers <= 0)
return false;
/* On Vulkan we use indices 1..MAX_INLINE_UNIFORM_BUFFERS for inline
* uniform buffers which we want to handle more like push constants
* than regular UBOs. OpenGL doesn't implement this feature.
*/
assert(c->key->environment == V3D_ENVIRONMENT_VULKAN);
uint32_t index = nir_src_as_uint(instr->src[0]);
if (index == 0 || index > c->compiler->max_inline_uniform_buffers)
return false;
/* We scalarize general TMU access for anything that is not 32-bit */
assert(nir_dest_bit_size(instr->dest) == 32 ||
instr->num_components == 1);
if (nir_src_is_const(instr->src[1])) {
/* Index 0 is reserved for push constants */
assert(index > 0);
uint32_t inline_index = index - 1;
int offset = nir_src_as_uint(instr->src[1]);
if (try_emit_uniform(c, offset, instr->num_components,
&instr->dest,
QUNIFORM_INLINE_UBO_0 + inline_index)) {
return true;
}
}
/* Fallback to regular UBO load */
return false;
}
static void
ntq_emit_load_input(struct v3d_compile *c, nir_intrinsic_instr *instr)
{
@ -3199,6 +3247,9 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
break;
case nir_intrinsic_load_ubo:
if (ntq_emit_inline_ubo_load(c, instr))
break;
FALLTHROUGH;
case nir_intrinsic_load_ssbo:
if (!ntq_emit_load_unifa(c, instr)) {
ntq_emit_tmu_general(c, instr, false);
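
As a hypothetical model of the constant path above: for a 3-component load
at byte offset 16 from inline buffer 1, try_emit_uniform() adds one uniform
stream entry per dword, roughly as this standalone sketch prints (values
illustrative):

#include <stdio.h>

int main(void)
{
   /* Constant load: 3 components at byte offset 16 from inline buffer 1 */
   int byte_offset = 16, num_components = 3;

   /* ldunif requires a 32-bit aligned offset; otherwise the compiler
    * falls back to a regular (TMU) UBO load. */
   if (byte_offset % 4 == 0) {
      int dword = byte_offset / 4;
      for (int i = 0; i < num_components; i++)
         printf("(QUNIFORM_INLINE_UBO_1, %d)\n", dword + i);
   }
   return 0;
}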

@ -338,6 +338,14 @@ enum quniform_contents {
* Current value of gl_ViewIndex for Multiview rendering.
*/
QUNIFORM_VIEW_INDEX,
/**
* Inline uniform buffers
*/
QUNIFORM_INLINE_UBO_0,
QUNIFORM_INLINE_UBO_1,
QUNIFORM_INLINE_UBO_2,
QUNIFORM_INLINE_UBO_3,
};
static inline uint32_t v3d_unit_data_create(uint32_t unit, uint32_t value)
@ -574,6 +582,7 @@ enum v3d_compilation_result {
*/
struct v3d_compiler {
const struct v3d_device_info *devinfo;
uint32_t max_inline_uniform_buffers;
struct ra_regs *regs;
struct ra_class *reg_class_any[3];
struct ra_class *reg_class_r5[3];
@ -1045,7 +1054,8 @@ vir_has_uniform(struct qinst *inst)
return inst->uniform != ~0;
}
const struct v3d_compiler *v3d_compiler_init(const struct v3d_device_info *devinfo);
const struct v3d_compiler *v3d_compiler_init(const struct v3d_device_info *devinfo,
uint32_t max_inline_uniform_buffers);
void v3d_compiler_free(const struct v3d_compiler *compiler);
void v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s);

@ -517,13 +517,15 @@ vir_link_blocks(struct qblock *predecessor, struct qblock *successor)
}
const struct v3d_compiler *
v3d_compiler_init(const struct v3d_device_info *devinfo)
v3d_compiler_init(const struct v3d_device_info *devinfo,
uint32_t max_inline_uniform_buffers)
{
struct v3d_compiler *compiler = rzalloc(NULL, struct v3d_compiler);
if (!compiler)
return NULL;
compiler->devinfo = devinfo;
compiler->max_inline_uniform_buffers = max_inline_uniform_buffers;
if (!vir_init_reg_sets(compiler)) {
ralloc_free(compiler);

@ -31,16 +31,23 @@
* binding layout, and array_index, it returns the map region assigned to it
* from the descriptor pool bo.
*/
static void*
static void *
descriptor_bo_map(struct v3dv_device *device,
struct v3dv_descriptor_set *set,
const struct v3dv_descriptor_set_binding_layout *binding_layout,
uint32_t array_index)
{
assert(v3dv_X(device, descriptor_bo_size)(binding_layout->type) > 0);
/* Inline uniform blocks use BO memory to store UBO contents, not
* descriptor data, so their descriptor BO size is 0 even though they
* do use BO memory.
*/
uint32_t bo_size = v3dv_X(device, descriptor_bo_size)(binding_layout->type);
assert(bo_size > 0 ||
binding_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT);
return set->pool->bo->map +
set->base_offset + binding_layout->descriptor_offset +
array_index * v3dv_X(device, descriptor_bo_size)(binding_layout->type);
array_index * bo_size;
}
static bool
@ -102,7 +109,7 @@ v3dv_descriptor_map_get_descriptor(struct v3dv_descriptor_state *descriptor_stat
* It also returns the descriptor type, so the caller can do extra
* validation or add extra offsets if the bo contains more than one field.
*/
static struct v3dv_cl_reloc
struct v3dv_cl_reloc
v3dv_descriptor_map_get_descriptor_bo(struct v3dv_device *device,
struct v3dv_descriptor_state *descriptor_state,
struct v3dv_descriptor_map *map,
@ -125,8 +132,10 @@ v3dv_descriptor_map_get_descriptor_bo(struct v3dv_device *device,
const struct v3dv_descriptor_set_binding_layout *binding_layout =
&set->layout->binding[binding_number];
assert(v3dv_X(device, descriptor_bo_size)(binding_layout->type) > 0);
*out_type = binding_layout->type;
assert(binding_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT ||
v3dv_X(device, descriptor_bo_size)(binding_layout->type) > 0);
if (out_type)
*out_type = binding_layout->type;
uint32_t array_index = map->array_index[index];
assert(array_index < binding_layout->array_size);
@ -364,6 +373,10 @@ v3dv_CreateDescriptorPool(VkDevice _device,
uint32_t bo_size = 0;
uint32_t descriptor_count = 0;
const VkDescriptorPoolInlineUniformBlockCreateInfo *inline_info =
vk_find_struct_const(pCreateInfo->pNext,
DESCRIPTOR_POOL_INLINE_UNIFORM_BLOCK_CREATE_INFO);
assert(pCreateInfo->poolSizeCount > 0);
for (unsigned i = 0; i < pCreateInfo->poolSizeCount; ++i) {
/* Verify supported descriptor type */
@ -379,6 +392,7 @@ v3dv_CreateDescriptorPool(VkDevice _device,
case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT:
break;
default:
unreachable("Unimplemented descriptor type");
@ -386,9 +400,28 @@ v3dv_CreateDescriptorPool(VkDevice _device,
}
assert(pCreateInfo->pPoolSizes[i].descriptorCount > 0);
descriptor_count += pCreateInfo->pPoolSizes[i].descriptorCount;
bo_size += v3dv_X(device, descriptor_bo_size)(pCreateInfo->pPoolSizes[i].type) *
pCreateInfo->pPoolSizes[i].descriptorCount;
if (pCreateInfo->pPoolSizes[i].type ==
VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
/* Inline uniform blocks are specified to use the descriptor array
* size as the size in bytes of the block.
*/
assert(inline_info);
descriptor_count++;
bo_size += pCreateInfo->pPoolSizes[i].descriptorCount;
} else {
descriptor_count += pCreateInfo->pPoolSizes[i].descriptorCount;
bo_size += v3dv_X(device, descriptor_bo_size)(pCreateInfo->pPoolSizes[i].type) *
pCreateInfo->pPoolSizes[i].descriptorCount;
}
}
/* We align all our buffers to V3D_NON_COHERENT_ATOM_SIZE, so make sure we
* allocate enough memory to honor that requirement for all our inline
* buffers too.
*/
if (inline_info) {
bo_size += V3D_NON_COHERENT_ATOM_SIZE *
inline_info->maxInlineUniformBlockBindings;
}
if (!(pCreateInfo->flags & VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT)) {
@ -599,6 +632,7 @@ v3dv_CreateDescriptorSetLayout(VkDevice _device,
case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT:
/* Nothing here, just to keep the descriptor type filtering below */
break;
default:
@ -624,16 +658,36 @@ v3dv_CreateDescriptorSetLayout(VkDevice _device,
samplers_offset += sizeof(struct v3dv_sampler) * binding->descriptorCount;
}
descriptor_count += binding->descriptorCount;
dynamic_offset_count += binding->descriptorCount *
set_layout->binding[binding_number].dynamic_offset_count;
set_layout->shader_stages |= binding->stageFlags;
set_layout->binding[binding_number].descriptor_offset = set_layout->bo_size;
set_layout->bo_size +=
v3dv_X(device, descriptor_bo_size)(set_layout->binding[binding_number].type) *
binding->descriptorCount;
if (binding->descriptorType != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
dynamic_offset_count += binding->descriptorCount *
set_layout->binding[binding_number].dynamic_offset_count;
descriptor_count += binding->descriptorCount;
set_layout->binding[binding_number].descriptor_offset =
set_layout->bo_size;
set_layout->bo_size +=
v3dv_X(device, descriptor_bo_size)(set_layout->binding[binding_number].type) *
binding->descriptorCount;
} else {
/* We align all our buffers, inline buffers too. We made sure to take
* this into account when calculating total BO size requirements at pool
* creation time.
*/
set_layout->bo_size = align(set_layout->bo_size,
V3D_NON_COHERENT_ATOM_SIZE);
set_layout->binding[binding_number].descriptor_offset =
set_layout->bo_size;
/* Inline uniform blocks are not arrayed; instead, descriptorCount
* specifies the size of the buffer in bytes.
*/
set_layout->bo_size += binding->descriptorCount;
descriptor_count++;
}
}
free(bindings);
@ -931,6 +985,31 @@ write_buffer_view_descriptor(struct v3dv_device *device,
sizeof(bview->texture_shader_state));
}
static void
write_inline_uniform_descriptor(struct v3dv_device *device,
struct v3dv_descriptor *descriptor,
struct v3dv_descriptor_set *set,
const struct v3dv_descriptor_set_binding_layout *binding_layout,
const void *data,
size_t offset,
size_t size)
{
assert(binding_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT);
descriptor->type = VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT;
descriptor->buffer = NULL;
void *desc_map = descriptor_bo_map(device, set, binding_layout, 0);
memcpy(desc_map + offset, data, size);
/* The pool allocates BO space up front for all the inline buffers it
* may hold, and this space is assigned to individual descriptors when
* they are written, so we define the range of an inline buffer as the
* largest range of data that the client has written to it.
*/
descriptor->offset = 0;
descriptor->range = MAX2(descriptor->range, offset + size);
}
VKAPI_ATTR void VKAPI_CALL
v3dv_UpdateDescriptorSets(VkDevice _device,
uint32_t descriptorWriteCount,
@ -949,9 +1028,20 @@ v3dv_UpdateDescriptorSets(VkDevice _device,
struct v3dv_descriptor *descriptor = set->descriptors;
descriptor += binding_layout->descriptor_index;
descriptor += writeset->dstArrayElement;
for (uint32_t j = 0; j < writeset->descriptorCount; ++j) {
/* Inline uniform blocks are not arrayed; instead they use dstArrayElement
* to specify the byte offset of the uniform update and descriptorCount
* to specify the size (in bytes) of the update.
*/
uint32_t descriptor_count;
if (writeset->descriptorType != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
descriptor += writeset->dstArrayElement;
descriptor_count = writeset->descriptorCount;
} else {
descriptor_count = 1;
}
for (uint32_t j = 0; j < descriptor_count; ++j) {
switch(writeset->descriptorType) {
case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
@ -1006,6 +1096,18 @@ v3dv_UpdateDescriptorSets(VkDevice _device,
writeset->dstArrayElement + j);
break;
}
case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT: {
const VkWriteDescriptorSetInlineUniformBlock *inline_write =
vk_find_struct_const(writeset->pNext,
WRITE_DESCRIPTOR_SET_INLINE_UNIFORM_BLOCK);
assert(inline_write->dataSize == writeset->descriptorCount);
write_inline_uniform_descriptor(device, descriptor, set,
binding_layout,
inline_write->pData,
writeset->dstArrayElement, /* offset */
inline_write->dataSize);
break;
}
default:
unreachable("unimplemented descriptor type");
break;
@ -1032,9 +1134,25 @@ v3dv_UpdateDescriptorSets(VkDevice _device,
struct v3dv_descriptor *dst_descriptor = dst_set->descriptors;
src_descriptor += src_binding_layout->descriptor_index;
src_descriptor += copyset->srcArrayElement;
dst_descriptor += dst_binding_layout->descriptor_index;
if (src_binding_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
/* {src,dst}ArrayElement specifies src/dst start offset and
* descriptorCount specifies size (in bytes) to copy.
*/
const void *src_data = src_set->pool->bo->map +
src_set->base_offset +
src_binding_layout->descriptor_offset +
copyset->srcArrayElement;
write_inline_uniform_descriptor(device, dst_descriptor, dst_set,
dst_binding_layout,
src_data,
copyset->dstArrayElement,
copyset->descriptorCount);
continue;
}
src_descriptor += copyset->srcArrayElement;
dst_descriptor += copyset->dstArrayElement;
for (uint32_t j = 0; j < copyset->descriptorCount; j++) {
@ -1179,8 +1297,7 @@ v3dv_UpdateDescriptorSetWithTemplate(
struct v3dv_descriptor *descriptor =
set->descriptors +
binding_layout->descriptor_index +
entry->array_element;
binding_layout->descriptor_index;
switch (entry->type) {
case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
@ -1190,7 +1307,8 @@ v3dv_UpdateDescriptorSetWithTemplate(
for (uint32_t j = 0; j < entry->array_count; j++) {
const VkDescriptorBufferInfo *info =
pData + entry->offset + j * entry->stride;
write_buffer_descriptor(descriptor + j, entry->type, info);
write_buffer_descriptor(descriptor + entry->array_element + j,
entry->type, info);
}
break;
@ -1204,9 +1322,9 @@ v3dv_UpdateDescriptorSetWithTemplate(
pData + entry->offset + j * entry->stride;
V3DV_FROM_HANDLE(v3dv_image_view, iview, info->imageView);
V3DV_FROM_HANDLE(v3dv_sampler, sampler, info->sampler);
write_image_descriptor(device, descriptor + j, entry->type,
set, binding_layout, iview, sampler,
entry->array_element + j);
write_image_descriptor(device, descriptor + entry->array_element + j,
entry->type, set, binding_layout, iview,
sampler, entry->array_element + j);
}
break;
@ -1216,12 +1334,22 @@ v3dv_UpdateDescriptorSetWithTemplate(
const VkBufferView *_bview =
pData + entry->offset + j * entry->stride;
V3DV_FROM_HANDLE(v3dv_buffer_view, bview, *_bview);
write_buffer_view_descriptor(device, descriptor + j, entry->type,
set, binding_layout, bview,
write_buffer_view_descriptor(device,
descriptor + entry->array_element + j,
entry->type, set, binding_layout, bview,
entry->array_element + j);
}
break;
case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT: {
write_inline_uniform_descriptor(device, descriptor, set,
binding_layout,
pData + entry->offset,
entry->array_element, /* offset */
entry->array_count); /* size */
break;
}
default:
unreachable("Unsupported descriptor type");
}
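
Putting the API side together, a client would create a pool with inline
uniform block capacity and then write byte ranges into the block roughly as
follows (a minimal sketch; the device, descriptor set, and sizes are
assumed for illustration):

#include <vulkan/vulkan.h>

static void
example_inline_uniform_usage(VkDevice device, VkDescriptorSet set)
{
   /* Pool creation: for inline uniform blocks, descriptorCount is the
    * total byte capacity reserved for inline uniform data. */
   VkDescriptorPoolInlineUniformBlockCreateInfoEXT inline_pool_info = {
      .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_INLINE_UNIFORM_BLOCK_CREATE_INFO_EXT,
      .maxInlineUniformBlockBindings = 2,
   };
   VkDescriptorPoolSize pool_size = {
      .type = VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT,
      .descriptorCount = 2 * 256, /* bytes, not descriptors */
   };
   VkDescriptorPoolCreateInfo pool_info = {
      .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
      .pNext = &inline_pool_info,
      .maxSets = 2,
      .poolSizeCount = 1,
      .pPoolSizes = &pool_size,
   };
   VkDescriptorPool pool;
   vkCreateDescriptorPool(device, &pool_info, NULL, &pool);

   /* Descriptor write: dstArrayElement is the byte offset into the
    * block and descriptorCount the byte size of the update; both must
    * be multiples of 4. */
   const uint32_t data[8] = { 0 };
   VkWriteDescriptorSetInlineUniformBlockEXT inline_write = {
      .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET_INLINE_UNIFORM_BLOCK_EXT,
      .dataSize = sizeof(data), /* must match descriptorCount */
      .pData = data,
   };
   VkWriteDescriptorSet write = {
      .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
      .pNext = &inline_write,
      .dstSet = set,
      .dstBinding = 0,
      .dstArrayElement = 16,           /* byte offset */
      .descriptorCount = sizeof(data), /* byte size */
      .descriptorType = VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT,
   };
   vkUpdateDescriptorSets(device, 1, &write, 0, NULL);
}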

@ -153,6 +153,7 @@ get_device_extensions(const struct v3dv_physical_device *device,
.EXT_4444_formats = true,
.EXT_color_write_enable = true,
.EXT_custom_border_color = true,
.EXT_inline_uniform_block = true,
.EXT_external_memory_dma_buf = true,
.EXT_host_query_reset = true,
.EXT_image_drm_format_modifier = true,
@ -812,7 +813,8 @@ physical_device_init(struct v3dv_physical_device *device,
if (result != VK_SUCCESS)
goto fail;
device->compiler = v3d_compiler_init(&device->devinfo);
device->compiler = v3d_compiler_init(&device->devinfo,
MAX_INLINE_UNIFORM_BUFFERS);
device->next_program_id = 0;
ASSERTED int len =
@ -1089,6 +1091,20 @@ v3dv_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice,
{
v3dv_GetPhysicalDeviceFeatures(physicalDevice, &pFeatures->features);
VkPhysicalDeviceVulkan13Features vk13 = {
.inlineUniformBlock = true,
/* Inline buffers work like push constants, so after they are bound
* some of their contents may be copied into the uniform stream as soon
* as the next draw/dispatch is recorded in the command buffer. This means
* that if the client updates the buffer contents after binding it to
* a command buffer, the next queue submit of that command buffer may
* not use the latest update to the buffer contents, but the data that
* was present in the buffer at the time it was bound to the command
* buffer.
*/
.descriptorBindingInlineUniformBlockUpdateAfterBind = false,
};
VkPhysicalDeviceVulkan12Features vk12 = {
.hostQueryReset = true,
.uniformAndStorageBuffer8BitAccess = true,
@ -1173,6 +1189,15 @@ v3dv_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice,
break;
}
case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_INLINE_UNIFORM_BLOCK_FEATURES_EXT: {
VkPhysicalDeviceInlineUniformBlockFeaturesEXT *features =
(VkPhysicalDeviceInlineUniformBlockFeaturesEXT *)ext;
features->inlineUniformBlock = vk13.inlineUniformBlock;
features->descriptorBindingInlineUniformBlockUpdateAfterBind =
vk13.descriptorBindingInlineUniformBlockUpdateAfterBind;
break;
}
case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COLOR_WRITE_ENABLE_FEATURES_EXT: {
VkPhysicalDeviceColorWriteEnableFeaturesEXT *features = (void *) ext;
features->colorWriteEnable = true;
@ -1385,7 +1410,7 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice,
.maxPushConstantsSize = MAX_PUSH_CONSTANTS_SIZE,
.maxMemoryAllocationCount = mem_size / page_size,
.maxSamplerAllocationCount = 64 * 1024,
.bufferImageGranularity = 256, /* A cache line */
.bufferImageGranularity = V3D_NON_COHERENT_ATOM_SIZE,
.sparseAddressSpaceSize = 0,
.maxBoundDescriptorSets = MAX_SETS,
.maxPerStageDescriptorSamplers = V3D_MAX_TEXTURE_SAMPLERS,
@ -1499,7 +1524,7 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice,
.standardSampleLocations = false,
.optimalBufferCopyOffsetAlignment = 32,
.optimalBufferCopyRowPitchAlignment = 32,
.nonCoherentAtomSize = 256,
.nonCoherentAtomSize = V3D_NON_COHERENT_ATOM_SIZE,
};
*pProperties = (VkPhysicalDeviceProperties) {
@ -1575,6 +1600,18 @@ v3dv_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice,
};
break;
}
case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_INLINE_UNIFORM_BLOCK_PROPERTIES_EXT: {
VkPhysicalDeviceInlineUniformBlockProperties *props =
(VkPhysicalDeviceInlineUniformBlockProperties *)ext;
props->maxInlineUniformBlockSize = 4096;
props->maxPerStageDescriptorInlineUniformBlocks =
MAX_INLINE_UNIFORM_BUFFERS;
props->maxDescriptorSetInlineUniformBlocks =
MAX_INLINE_UNIFORM_BUFFERS;
props->maxPerStageDescriptorUpdateAfterBindInlineUniformBlocks = 0;
props->maxDescriptorSetUpdateAfterBindInlineUniformBlocks = 0;
break;
}
case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROVOKING_VERTEX_PROPERTIES_EXT: {
VkPhysicalDeviceProvokingVertexPropertiesEXT *props =
(VkPhysicalDeviceProvokingVertexPropertiesEXT *)ext;
@ -2516,7 +2553,7 @@ v3dv_CreateBuffer(VkDevice _device,
buffer->size = pCreateInfo->size;
buffer->usage = pCreateInfo->usage;
buffer->alignment = 256; /* nonCoherentAtomSize */
buffer->alignment = V3D_NON_COHERENT_ATOM_SIZE;
/* Limit allocations to 32-bit */
const VkDeviceSize aligned_size = align64(buffer->size, buffer->alignment);
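
On the application side, support is discovered by chaining the EXT structs
into the feature/property queries handled above (a sketch; the physical
device handle is assumed):

#include <stdbool.h>
#include <vulkan/vulkan.h>

static bool
has_inline_uniform_block(VkPhysicalDevice pdev)
{
   VkPhysicalDeviceInlineUniformBlockFeaturesEXT iub_features = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_INLINE_UNIFORM_BLOCK_FEATURES_EXT,
   };
   VkPhysicalDeviceFeatures2 features = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2,
      .pNext = &iub_features,
   };
   vkGetPhysicalDeviceFeatures2(pdev, &features);

   VkPhysicalDeviceInlineUniformBlockPropertiesEXT iub_props = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_INLINE_UNIFORM_BLOCK_PROPERTIES_EXT,
   };
   VkPhysicalDeviceProperties2 props = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2,
      .pNext = &iub_props,
   };
   vkGetPhysicalDeviceProperties2(pdev, &props);

   /* v3dv reports maxInlineUniformBlockSize = 4096 and up to 4 blocks
    * per stage, per the patch above. */
   return iub_features.inlineUniformBlock == VK_TRUE;
}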

@ -44,6 +44,7 @@
#define MAX_INPUT_ATTACHMENTS 4
#define MAX_UNIFORM_BUFFERS 12
#define MAX_INLINE_UNIFORM_BUFFERS 4
#define MAX_STORAGE_BUFFERS 8
#define MAX_DYNAMIC_UNIFORM_BUFFERS 8

@ -465,17 +465,19 @@ descriptor_map_add(struct v3dv_descriptor_map *map,
int binding,
int array_index,
int array_size,
int start_index,
uint8_t return_size)
{
assert(array_index < array_size);
assert(return_size == 16 || return_size == 32);
unsigned index = 0;
for (unsigned i = 0; i < map->num_desc; i++) {
if (set == map->set[i] &&
binding == map->binding[i] &&
array_index == map->array_index[i]) {
assert(array_size == map->array_size[i]);
unsigned index = start_index;
for (; index < map->num_desc; index++) {
if (map->used[index] &&
set == map->set[index] &&
binding == map->binding[index] &&
array_index == map->array_index[index]) {
assert(array_size == map->array_size[index]);
if (return_size != map->return_size[index]) {
/* If the return_size is different it means that the same sampler
* was used for operations with different precision
@ -485,18 +487,21 @@ descriptor_map_add(struct v3dv_descriptor_map *map,
map->return_size[index] = 32;
}
return index;
} else if (!map->used[index]) {
break;
}
index++;
}
assert(index == map->num_desc);
assert(index < DESCRIPTOR_MAP_SIZE);
assert(!map->used[index]);
map->set[map->num_desc] = set;
map->binding[map->num_desc] = binding;
map->array_index[map->num_desc] = array_index;
map->array_size[map->num_desc] = array_size;
map->return_size[map->num_desc] = return_size;
map->num_desc++;
map->used[index] = true;
map->set[index] = set;
map->binding[index] = binding;
map->array_index[index] = array_index;
map->array_size[index] = array_size;
map->return_size[index] = return_size;
map->num_desc = MAX2(map->num_desc, index + 1);
return index;
}
@ -536,8 +541,11 @@ pipeline_get_descriptor_map(struct v3dv_pipeline *pipeline,
&pipeline->shared_data->maps[broadcom_stage]->sampler_map :
&pipeline->shared_data->maps[broadcom_stage]->texture_map;
case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT:
return &pipeline->shared_data->maps[broadcom_stage]->ubo_map;
case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
return &pipeline->shared_data->maps[broadcom_stage]->ssbo_map;
default:
unreachable("Descriptor type unknown or not having a descriptor map");
@ -563,31 +571,53 @@ lower_vulkan_resource_index(nir_builder *b,
struct v3dv_descriptor_set_binding_layout *binding_layout =
&set_layout->binding[binding];
unsigned index = 0;
const VkDescriptorType desc_type = nir_intrinsic_desc_type(instr);
switch (desc_type) {
switch (binding_layout->type) {
case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: {
case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT: {
struct v3dv_descriptor_map *descriptor_map =
pipeline_get_descriptor_map(pipeline, desc_type, shader->info.stage, false);
pipeline_get_descriptor_map(pipeline, binding_layout->type,
shader->info.stage, false);
if (!const_val)
unreachable("non-constant vulkan_resource_index array index");
/* At compile-time we will need to know if we are processing a UBO load
* for an inline or a regular UBO so we can handle inline loads like
* push constants. At the NIR level, however, the inline
* information is gone, so we rely on the index to make this distinction.
* In particular, we reserve indices 1..MAX_INLINE_UNIFORM_BUFFERS for
* inline buffers. This means that at the descriptor map level
* we store inline buffers at slots 0..MAX_INLINE_UNIFORM_BUFFERS - 1,
* and regular UBOs at indices starting from MAX_INLINE_UNIFORM_BUFFERS.
*/
uint32_t start_index = 0;
if (binding_layout->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER ||
binding_layout->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC) {
start_index = MAX_INLINE_UNIFORM_BUFFERS;
}
index = descriptor_map_add(descriptor_map, set, binding,
const_val->u32,
binding_layout->array_size,
start_index,
32 /* return_size: doesn't really apply for this case */);
if (desc_type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
/* skip index 0 which is used for push constants */
/* We always reserve index 0 for push constants */
if (binding_layout->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER ||
binding_layout->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
binding_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
index++;
}
break;
}
default:
unreachable("unsupported desc_type for vulkan_resource_index");
unreachable("unsupported descriptor type for vulkan_resource_index");
break;
}
@ -698,6 +728,7 @@ lower_tex_src_to_offset(nir_builder *b, nir_tex_instr *instr, unsigned src_idx,
deref->var->data.binding,
array_index,
binding_layout->array_size,
0,
return_size);
if (is_sampler)
@ -807,6 +838,7 @@ lower_image_deref(nir_builder *b,
deref->var->data.binding,
array_index,
binding_layout->array_size,
0,
32 /* return_size: doesn't apply for textures */);
/* Note: we don't need to do anything here in relation to the precision and
@ -1752,12 +1784,12 @@ pipeline_lower_nir(struct v3dv_pipeline *pipeline,
*/
UNUSED unsigned index =
descriptor_map_add(&pipeline->shared_data->maps[p_stage->stage]->sampler_map,
-1, -1, -1, 0, 16);
-1, -1, -1, 0, 0, 16);
assert(index == V3DV_NO_SAMPLER_16BIT_IDX);
index =
descriptor_map_add(&pipeline->shared_data->maps[p_stage->stage]->sampler_map,
-2, -2, -2, 0, 32);
-2, -2, -2, 0, 0, 32);
assert(index == V3DV_NO_SAMPLER_32BIT_IDX);
/* Apply the actual pipeline layout to UBOs, SSBOs, and textures */

@ -1353,8 +1353,8 @@ struct v3dv_descriptor {
struct {
struct v3dv_buffer *buffer;
uint32_t offset;
uint32_t range;
size_t offset;
size_t range;
};
struct v3dv_buffer_view *buffer_view;
@ -1727,8 +1727,8 @@ struct v3dv_pipeline_layout {
* FIXME: one alternative would be to allocate the map as big as you need for
* each descriptor type. That would mean more individual allocations.
*/
#define DESCRIPTOR_MAP_SIZE MAX3(V3D_MAX_TEXTURE_SAMPLERS, \
MAX_UNIFORM_BUFFERS, \
#define DESCRIPTOR_MAP_SIZE MAX3(V3D_MAX_TEXTURE_SAMPLERS, \
MAX_UNIFORM_BUFFERS + MAX_INLINE_UNIFORM_BUFFERS, \
MAX_STORAGE_BUFFERS)
@ -1739,6 +1739,7 @@ struct v3dv_descriptor_map {
int binding[DESCRIPTOR_MAP_SIZE];
int array_index[DESCRIPTOR_MAP_SIZE];
int array_size[DESCRIPTOR_MAP_SIZE];
bool used[DESCRIPTOR_MAP_SIZE];
/* NOTE: the following is only for sampler, but this is the easier place to
* put it.
@ -2073,6 +2074,14 @@ v3dv_descriptor_map_get_descriptor(struct v3dv_descriptor_state *descriptor_stat
uint32_t index,
uint32_t *dynamic_offset);
struct v3dv_cl_reloc
v3dv_descriptor_map_get_descriptor_bo(struct v3dv_device *device,
struct v3dv_descriptor_state *descriptor_state,
struct v3dv_descriptor_map *map,
struct v3dv_pipeline_layout *pipeline_layout,
uint32_t index,
VkDescriptorType *out_type);
const struct v3dv_sampler *
v3dv_descriptor_map_get_sampler(struct v3dv_descriptor_state *descriptor_state,
struct v3dv_descriptor_map *map,

@ -56,7 +56,8 @@ struct state_bo_list {
struct v3dv_bo *states[MAX_TOTAL_STATES];
};
#define MAX_TOTAL_UNIFORM_BUFFERS (1 + MAX_UNIFORM_BUFFERS * MAX_STAGES)
#define MAX_TOTAL_UNIFORM_BUFFERS (1 + (MAX_UNIFORM_BUFFERS + \
MAX_INLINE_UNIFORM_BUFFERS) * MAX_STAGES)
#define MAX_TOTAL_STORAGE_BUFFERS (MAX_STORAGE_BUFFERS * MAX_STAGES)
struct buffer_bo_list {
struct v3dv_bo *ubo[MAX_TOTAL_UNIFORM_BUFFERS];
@ -247,10 +248,12 @@ write_ubo_ssbo_uniforms(struct v3dv_cmd_buffer *cmd_buffer,
uint32_t dynamic_offset = 0;
/* For ubos, index is shifted, as 0 is reserved for push constants.
/* For ubos, index is shifted, as 0 is reserved for push constants
* and 1..MAX_INLINE_UNIFORM_BUFFERS are reserved for inline uniform
* buffers.
*/
if (content == QUNIFORM_UBO_ADDR &&
v3d_unit_data_get_unit(data) == 0) {
uint32_t index = v3d_unit_data_get_unit(data);
if (content == QUNIFORM_UBO_ADDR && index == 0) {
/* This call is to ensure that the push_constant_ubo is
* updated. It already takes into account whether it should do the
* update or not.
@ -266,40 +269,97 @@ write_ubo_ssbo_uniforms(struct v3dv_cmd_buffer *cmd_buffer,
offset + dynamic_offset);
buffer_bos->ubo[0] = resource->bo;
} else {
uint32_t index =
content == QUNIFORM_UBO_ADDR ?
v3d_unit_data_get_unit(data) - 1 :
data;
if (content == QUNIFORM_UBO_ADDR) {
/* We reserve index 0 for push constants and artificially increase our
* indices by one for that reason, fix that now before accessing the
* descriptor map.
*/
assert(index > 0);
index--;
} else {
index = data;
}
struct v3dv_descriptor *descriptor =
v3dv_descriptor_map_get_descriptor(descriptor_state, map,
pipeline->layout,
index, &dynamic_offset);
/* Inline UBO descriptors store UBO data in descriptor pool memory,
* instead of an external buffer.
*/
assert(descriptor);
assert(descriptor->buffer);
assert(descriptor->buffer->mem);
assert(descriptor->buffer->mem->bo);
if (content == QUNIFORM_GET_SSBO_SIZE ||
content == QUNIFORM_GET_UBO_SIZE) {
cl_aligned_u32(uniforms, descriptor->range);
} else {
cl_aligned_u32(uniforms, descriptor->buffer->mem->bo->offset +
descriptor->buffer->mem_offset +
descriptor->offset +
offset + dynamic_offset);
/* Inline uniform buffers store their contents in pool memory instead
* of an external buffer.
*/
struct v3dv_bo *bo;
uint32_t addr;
if (descriptor->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
assert(dynamic_offset == 0);
struct v3dv_cl_reloc reloc =
v3dv_descriptor_map_get_descriptor_bo(cmd_buffer->device,
descriptor_state, map,
pipeline->layout, index,
NULL);
bo = reloc.bo;
addr = reloc.bo->offset + reloc.offset + offset;
} else {
assert(descriptor->buffer);
assert(descriptor->buffer->mem);
assert(descriptor->buffer->mem->bo);
bo = descriptor->buffer->mem->bo;
addr = bo->offset +
descriptor->buffer->mem_offset +
descriptor->offset +
offset + dynamic_offset;
}
cl_aligned_u32(uniforms, addr);
if (content == QUNIFORM_UBO_ADDR) {
assert(index + 1 < MAX_TOTAL_UNIFORM_BUFFERS);
buffer_bos->ubo[index + 1] = descriptor->buffer->mem->bo;
assert(index < MAX_TOTAL_UNIFORM_BUFFERS);
buffer_bos->ubo[index] = bo;
} else {
assert(index < MAX_TOTAL_STORAGE_BUFFERS);
buffer_bos->ssbo[index] = descriptor->buffer->mem->bo;
buffer_bos->ssbo[index] = bo;
}
}
}
}
static void
write_inline_uniform(struct v3dv_cl_out **uniforms,
uint32_t index,
uint32_t offset,
struct v3dv_cmd_buffer *cmd_buffer,
struct v3dv_pipeline *pipeline,
enum broadcom_shader_stage stage)
{
assert(index < MAX_INLINE_UNIFORM_BUFFERS);
struct v3dv_descriptor_state *descriptor_state =
v3dv_cmd_buffer_get_descriptor_state(cmd_buffer, pipeline);
struct v3dv_descriptor_map *map =
&pipeline->shared_data->maps[stage]->ubo_map;
struct v3dv_cl_reloc reloc =
v3dv_descriptor_map_get_descriptor_bo(cmd_buffer->device,
descriptor_state, map,
pipeline->layout, index,
NULL);
/* Offset comes in 32-bit units */
uint32_t *addr = reloc.bo->map + reloc.offset + 4 * offset;
cl_aligned_u32(uniforms, *addr);
}
static uint32_t
get_texture_size_from_image_view(struct v3dv_image_view *image_view,
enum quniform_contents contents,
@ -432,6 +492,15 @@ v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer,
cl_aligned_u32(&uniforms, cmd_buffer->push_constants_data[data]);
break;
case QUNIFORM_INLINE_UBO_0:
case QUNIFORM_INLINE_UBO_1:
case QUNIFORM_INLINE_UBO_2:
case QUNIFORM_INLINE_UBO_3:
write_inline_uniform(&uniforms,
uinfo->contents[i] - QUNIFORM_INLINE_UBO_0, data,
cmd_buffer, pipeline, variant->stage);
break;
case QUNIFORM_VIEWPORT_X_SCALE:
cl_aligned_f(&uniforms, dynamic->viewport.scale[0][0] * 256.0f);
break;

@ -184,7 +184,7 @@ v3d_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
return screen->devinfo.ver >= 40;
case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT:
return 256;
return V3D_NON_COHERENT_ATOM_SIZE;
case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS:
if (screen->devinfo.ver < 40)
@ -872,7 +872,7 @@ v3d_screen_create(int fd, const struct pipe_screen_config *config,
v3d_resource_screen_init(pscreen);
screen->compiler = v3d_compiler_init(&screen->devinfo);
screen->compiler = v3d_compiler_init(&screen->devinfo, 0);
#ifdef ENABLE_SHADER_CACHE
v3d_disk_cache_init(screen);