turnip: implement VK_KHR_shader_draw_parameters

Note: going by the blob, VFD_INDEX_OFFSET/VFD_INSTANCE_START_OFFSET seem
completely unused by indirect draws, so this changes them to only be set
for non-indirect draws (and moves them to the vs_params draw state).

Passes dEQP-VK.draw.shader_draw_parameters.*

Signed-off-by: Jonathan Marek <jonathan@marek.ca>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5635>
This commit is contained in:
Jonathan Marek 2020-06-24 16:00:30 -04:00 committed by Marge Bot
parent 16a9e233da
commit 62de79ac44
5 changed files with 136 additions and 74 deletions

View File

@ -919,6 +919,7 @@ tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AB00, 0x5);
tu_cs_emit_write_reg(cs, REG_A6XX_VPC_GS_SIV_CNTL, 0x0000ffff);
/* TODO: set A6XX_VFD_ADD_OFFSET_INSTANCE and fix ir3 to avoid adding base instance */
tu_cs_emit_write_reg(cs, REG_A6XX_VFD_ADD_OFFSET, A6XX_VFD_ADD_OFFSET_VERTEX);
tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8811, 0x00000010);
tu_cs_emit_write_reg(cs, REG_A6XX_PC_MODE_CNTL, 0x1f);
@ -2973,46 +2974,6 @@ tu6_emit_consts(struct tu_cmd_buffer *cmd,
return tu_cs_end_sub_stream(&cmd->sub_cs, &cs);
}
static VkResult
tu6_emit_vs_params(struct tu_cmd_buffer *cmd,
uint32_t first_instance,
struct tu_cs_entry *entry)
{
/* TODO: fill out more than just base instance */
const struct tu_program_descriptor_linkage *link =
&cmd->state.pipeline->program.link[MESA_SHADER_VERTEX];
const struct ir3_const_state *const_state = &link->const_state;
struct tu_cs cs;
if (const_state->offsets.driver_param >= link->constlen) {
*entry = (struct tu_cs_entry) {};
return VK_SUCCESS;
}
VkResult result = tu_cs_begin_sub_stream(&cmd->sub_cs, 8, &cs);
if (result != VK_SUCCESS)
return result;
tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_GEOM, 3 + 4);
tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(const_state->offsets.driver_param) |
CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |
CP_LOAD_STATE6_0_NUM_UNIT(1));
tu_cs_emit(&cs, 0);
tu_cs_emit(&cs, 0);
STATIC_ASSERT(IR3_DP_INSTID_BASE == 2);
tu_cs_emit(&cs, 0);
tu_cs_emit(&cs, 0);
tu_cs_emit(&cs, first_instance);
tu_cs_emit(&cs, 0);
*entry = tu_cs_end_sub_stream(&cmd->sub_cs, &cs);
return VK_SUCCESS;
}
static struct tu_cs_entry
tu6_emit_vertex_buffers(struct tu_cmd_buffer *cmd,
const struct tu_pipeline *pipeline)
@ -3156,9 +3117,7 @@ static VkResult
tu6_draw_common(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
bool indexed,
uint32_t vertex_offset,
uint32_t first_instance,
/* note: draw_count count is 0 for indirect */
/* note: draw_count is 0 for indirect */
uint32_t draw_count)
{
const struct tu_pipeline *pipeline = cmd->state.pipeline;
@ -3171,10 +3130,6 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
/* TODO lrz */
tu_cs_emit_regs(cs,
A6XX_VFD_INDEX_OFFSET(vertex_offset),
A6XX_VFD_INSTANCE_START_OFFSET(first_instance));
tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0(
.primitive_restart =
pipeline->ia.primitive_restart && indexed,
@ -3225,11 +3180,6 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
if (cmd->state.dirty & TU_CMD_DIRTY_VERTEX_BUFFERS)
cmd->state.vertex_buffers_ib = tu6_emit_vertex_buffers(cmd, pipeline);
struct tu_cs_entry vs_params;
result = tu6_emit_vs_params(cmd, first_instance, &vs_params);
if (result != VK_SUCCESS)
return result;
bool has_tess =
pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT;
struct tu_cs_entry tess_consts = {};
@ -3269,7 +3219,7 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_DESC_SETS, cmd->state.desc_sets_ib);
tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_DESC_SETS_LOAD, cmd->state.desc_sets_load_ib);
tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_VB, cmd->state.vertex_buffers_ib);
tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_VS_PARAMS, vs_params);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VS_PARAMS, cmd->state.vs_params);
for (uint32_t i = 0; i < ARRAY_SIZE(cmd->state.dynamic_state); i++) {
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DYNAMIC + i,
@ -3306,7 +3256,7 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_DESC_SETS_LOAD, cmd->state.desc_sets_load_ib);
if (cmd->state.dirty & TU_CMD_DIRTY_VERTEX_BUFFERS)
tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_VB, cmd->state.vertex_buffers_ib);
tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_VS_PARAMS, vs_params);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VS_PARAMS, cmd->state.vs_params);
}
tu_cs_sanity_check(cs);
@ -3352,6 +3302,68 @@ tu_draw_initiator(struct tu_cmd_buffer *cmd, enum pc_di_src_sel src_sel)
return initiator;
}
/* Returns the vertex shader's driver-param constant offset for the currently
 * bound pipeline, or 0 when that offset is at or past the VS constlen.
 */
static uint32_t
vs_params_offset(struct tu_cmd_buffer *cmd)
{
   /* this layout is required by CP_DRAW_INDIRECT_MULTI */
   STATIC_ASSERT(IR3_DP_DRAWID == 0);
   STATIC_ASSERT(IR3_DP_VTXID_BASE == 1);
   STATIC_ASSERT(IR3_DP_INSTID_BASE == 2);

   const struct tu_program_descriptor_linkage *vs_link =
      &cmd->state.pipeline->program.link[MESA_SHADER_VERTEX];
   const struct ir3_const_state *consts = &vs_link->const_state;

   if (consts->offsets.driver_param >= vs_link->constlen)
      return 0;

   /* 0 means disabled for CP_DRAW_INDIRECT_MULTI */
   assert(consts->offsets.driver_param != 0);

   return consts->offsets.driver_param;
}
static struct tu_draw_state
tu6_emit_vs_params(struct tu_cmd_buffer *cmd,
uint32_t vertex_offset,
uint32_t first_instance)
{
uint32_t offset = vs_params_offset(cmd);
struct tu_cs cs;
VkResult result = tu_cs_begin_sub_stream(&cmd->sub_cs, 3 + (offset ? 8 : 0), &cs);
if (result != VK_SUCCESS) {
cmd->record_result = result;
return (struct tu_draw_state) {};
}
/* TODO: don't make a new draw state when it doesn't change */
tu_cs_emit_regs(&cs,
A6XX_VFD_INDEX_OFFSET(vertex_offset),
A6XX_VFD_INSTANCE_START_OFFSET(first_instance));
if (offset) {
tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_GEOM, 3 + 4);
tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |
CP_LOAD_STATE6_0_NUM_UNIT(1));
tu_cs_emit(&cs, 0);
tu_cs_emit(&cs, 0);
tu_cs_emit(&cs, 0);
tu_cs_emit(&cs, vertex_offset);
tu_cs_emit(&cs, first_instance);
tu_cs_emit(&cs, 0);
}
struct tu_cs_entry entry = tu_cs_end_sub_stream(&cmd->sub_cs, &cs);
return (struct tu_draw_state) {entry.bo->iova + entry.offset, entry.size / 4};
}
void
tu_CmdDraw(VkCommandBuffer commandBuffer,
uint32_t vertexCount,
@ -3362,7 +3374,9 @@ tu_CmdDraw(VkCommandBuffer commandBuffer,
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
struct tu_cs *cs = &cmd->draw_cs;
tu6_draw_common(cmd, cs, false, firstVertex, firstInstance, vertexCount);
cmd->state.vs_params = tu6_emit_vs_params(cmd, firstVertex, firstInstance);
tu6_draw_common(cmd, cs, false, vertexCount);
tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_INDEX));
@ -3381,7 +3395,9 @@ tu_CmdDrawIndexed(VkCommandBuffer commandBuffer,
TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
struct tu_cs *cs = &cmd->draw_cs;
tu6_draw_common(cmd, cs, true, vertexOffset, firstInstance, indexCount);
cmd->state.vs_params = tu6_emit_vs_params(cmd, vertexOffset, firstInstance);
tu6_draw_common(cmd, cs, true, indexCount);
tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 7);
tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_DMA));
@ -3403,13 +3419,25 @@ tu_CmdDrawIndirect(VkCommandBuffer commandBuffer,
TU_FROM_HANDLE(tu_buffer, buf, _buffer);
struct tu_cs *cs = &cmd->draw_cs;
tu6_draw_common(cmd, cs, false, 0, 0, 0);
cmd->state.vs_params = (struct tu_draw_state) {};
for (uint32_t i = 0; i < drawCount; i++) {
tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT, 3);
tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_INDEX));
tu_cs_emit_qw(cs, buf->bo->iova + buf->bo_offset + offset + stride * i);
}
tu6_draw_common(cmd, cs, false, 0);
/* workaround for a firmware bug with CP_DRAW_INDIRECT_MULTI, where it
* doesn't wait for WFIs to be completed and leads to GPU fault/hang
* TODO: this could be worked around in a more performant way,
* or there may exist newer firmware that has been fixed
*/
if (cmd->device->physical_device->gpu_id != 650)
tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 6);
tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_INDEX));
tu_cs_emit(cs, A6XX_CP_DRAW_INDIRECT_MULTI_1_OPCODE(INDIRECT_OP_NORMAL) |
A6XX_CP_DRAW_INDIRECT_MULTI_1_DST_OFF(vs_params_offset(cmd)));
tu_cs_emit(cs, drawCount);
tu_cs_emit_qw(cs, buf->bo->iova + buf->bo_offset + offset);
tu_cs_emit(cs, stride);
tu_bo_list_add(&cmd->bo_list, buf->bo, MSM_SUBMIT_BO_READ);
}
@ -3425,15 +3453,27 @@ tu_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer,
TU_FROM_HANDLE(tu_buffer, buf, _buffer);
struct tu_cs *cs = &cmd->draw_cs;
tu6_draw_common(cmd, cs, true, 0, 0, 0);
cmd->state.vs_params = (struct tu_draw_state) {};
for (uint32_t i = 0; i < drawCount; i++) {
tu_cs_emit_pkt7(cs, CP_DRAW_INDX_INDIRECT, 6);
tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_DMA));
tu_cs_emit_qw(cs, cmd->state.index_va);
tu_cs_emit(cs, A5XX_CP_DRAW_INDX_INDIRECT_3_MAX_INDICES(cmd->state.max_index_count));
tu_cs_emit_qw(cs, buf->bo->iova + buf->bo_offset + offset + stride * i);
}
tu6_draw_common(cmd, cs, true, 0);
/* workaround for a firmware bug with CP_DRAW_INDIRECT_MULTI, where it
* doesn't wait for WFIs to be completed and leads to GPU fault/hang
* TODO: this could be worked around in a more performant way,
* or there may exist newer firmware that has been fixed
*/
if (cmd->device->physical_device->gpu_id != 650)
tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 9);
tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_DMA));
tu_cs_emit(cs, A6XX_CP_DRAW_INDIRECT_MULTI_1_OPCODE(INDIRECT_OP_INDEXED) |
A6XX_CP_DRAW_INDIRECT_MULTI_1_DST_OFF(vs_params_offset(cmd)));
tu_cs_emit(cs, drawCount);
tu_cs_emit_qw(cs, cmd->state.index_va);
tu_cs_emit(cs, cmd->state.max_index_count);
tu_cs_emit_qw(cs, buf->bo->iova + buf->bo_offset + offset);
tu_cs_emit(cs, stride);
tu_bo_list_add(&cmd->bo_list, buf->bo, MSM_SUBMIT_BO_READ);
}
@ -3450,7 +3490,9 @@ void tu_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer,
TU_FROM_HANDLE(tu_buffer, buf, _counterBuffer);
struct tu_cs *cs = &cmd->draw_cs;
tu6_draw_common(cmd, cs, false, 0, firstInstance, 0);
cmd->state.vs_params = tu6_emit_vs_params(cmd, 0, firstInstance);
tu6_draw_common(cmd, cs, false, 0);
tu_cs_emit_pkt7(cs, CP_DRAW_AUTO, 6);
tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_XFB));

View File

@ -590,8 +590,8 @@ tu_GetPhysicalDeviceFeatures(VkPhysicalDevice physicalDevice,
.sampleRateShading = true,
.dualSrcBlend = true,
.logicOp = true,
.multiDrawIndirect = false,
.drawIndirectFirstInstance = false,
.multiDrawIndirect = true,
.drawIndirectFirstInstance = true,
.depthClamp = true,
.depthBiasClamp = false,
.fillModeNonSolid = false,
@ -636,6 +636,22 @@ tu_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice,
vk_foreach_struct(ext, pFeatures->pNext)
{
switch (ext->sType) {
case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES:
*((VkPhysicalDeviceVulkan11Features*) ext) = (VkPhysicalDeviceVulkan11Features) {
.storageBuffer16BitAccess = false,
.uniformAndStorageBuffer16BitAccess = false,
.storagePushConstant16 = false,
.storageInputOutput16 = false,
.multiview = false,
.multiviewGeometryShader = false,
.multiviewTessellationShader = false,
.variablePointersStorageBuffer = false,
.variablePointers = false,
.protectedMemory = false,
.samplerYcbcrConversion = true,
.shaderDrawParameters = true,
};
break;
case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VARIABLE_POINTERS_FEATURES: {
VkPhysicalDeviceVariablePointersFeatures *features = (void *) ext;
features->variablePointersStorageBuffer = false;
@ -653,7 +669,7 @@ tu_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice,
case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_DRAW_PARAMETERS_FEATURES: {
VkPhysicalDeviceShaderDrawParametersFeatures *features =
(VkPhysicalDeviceShaderDrawParametersFeatures *) ext;
features->shaderDrawParameters = false;
features->shaderDrawParameters = true;
break;
}
case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROTECTED_MEMORY_FEATURES: {

View File

@ -81,6 +81,7 @@ EXTENSIONS = [
Extension('VK_EXT_filter_cubic', 1, 'device->gpu_id == 650'),
Extension('VK_EXT_index_type_uint8', 1, True),
Extension('VK_EXT_vertex_attribute_divisor', 1, True),
Extension('VK_KHR_shader_draw_parameters', 1, True),
]
MAX_API_VERSION = VkVersion(MAX_API_VERSION)

View File

@ -815,6 +815,8 @@ struct tu_cmd_state
struct tu_cs_entry desc_sets_ib, desc_sets_load_ib;
struct tu_cs_entry ia_gmem_ib, ia_sysmem_ib;
struct tu_draw_state vs_params;
/* Index buffer */
uint64_t index_va;
uint32_t max_index_count;

View File

@ -48,6 +48,7 @@ tu_spirv_to_nir(struct ir3_compiler *compiler,
.caps = {
.transform_feedback = true,
.tessellation = true,
.draw_parameters = true,
},
};
const nir_shader_compiler_options *nir_options =