turnip: Split tu6_emit_xs()

Emit all the state layout config (such as push-const CONSTLEN) first,
before emitting anything that depends on that state.  This fixes an
issue that was showing up when FLUT is enabled in ir3 (which results
in a higher probability of not having any immediates lowered to push-
consts).

Signed-off-by: Rob Clark <robdclark@chromium.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8705>
This commit is contained in:
Rob Clark 2021-07-12 12:00:38 -07:00 committed by Marge Bot
parent 71003e3c84
commit 4e802538e7
4 changed files with 149 additions and 87 deletions

View File

@ -469,12 +469,15 @@ r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, uint32_t num_
.gfx_bindless = 0x1f,
.cs_bindless = 0x1f));
tu6_emit_xs_config(cs, MESA_SHADER_VERTEX, &vs);
tu6_emit_xs_config(cs, MESA_SHADER_TESS_CTRL, NULL);
tu6_emit_xs_config(cs, MESA_SHADER_TESS_EVAL, NULL);
tu6_emit_xs_config(cs, MESA_SHADER_GEOMETRY, NULL);
tu6_emit_xs_config(cs, MESA_SHADER_FRAGMENT, &fs);
struct tu_pvtmem_config pvtmem = {};
tu6_emit_xs_config(cs, MESA_SHADER_VERTEX, &vs, &pvtmem, global_iova(cmd, shaders[GLOBAL_SH_VS]));
tu6_emit_xs_config(cs, MESA_SHADER_TESS_CTRL, NULL, &pvtmem, 0);
tu6_emit_xs_config(cs, MESA_SHADER_TESS_EVAL, NULL, &pvtmem, 0);
tu6_emit_xs_config(cs, MESA_SHADER_GEOMETRY, NULL, &pvtmem, 0);
tu6_emit_xs_config(cs, MESA_SHADER_FRAGMENT, &fs, &pvtmem, global_iova(cmd, shaders[fs_id]));
tu6_emit_xs(cs, MESA_SHADER_VERTEX, &vs, &pvtmem, global_iova(cmd, shaders[GLOBAL_SH_VS]));
tu6_emit_xs(cs, MESA_SHADER_FRAGMENT, &fs, &pvtmem, global_iova(cmd, shaders[fs_id]));
tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0());
tu_cs_emit_regs(cs, A6XX_VFD_CONTROL_0());

View File

@ -2121,7 +2121,8 @@ tu_CmdBindPipeline(VkCommandBuffer commandBuffer,
struct tu_cs *cs = &cmd->draw_cs;
uint32_t mask = ~pipeline->dynamic_state_mask & BITFIELD_MASK(TU_DYNAMIC_STATE_COUNT);
tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (6 + util_bitcount(mask)));
tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (7 + util_bitcount(mask)));
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_CONFIG, pipeline->program.config_state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM, pipeline->program.state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_BINNING, pipeline->program.binning_state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VI, pipeline->vi.state);
@ -3719,6 +3720,7 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
if (cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE) {
tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (TU_DRAW_STATE_COUNT - 2));
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_CONFIG, pipeline->program.config_state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM, pipeline->program.state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_BINNING, pipeline->program.binning_state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_TESS, tess_consts);

View File

@ -330,63 +330,69 @@ tu_blend_state_is_dual_src(const VkPipelineColorBlendStateCreateInfo *info)
return false;
}
/*
 * Per-shader-stage register offsets, indexed by gl_shader_stage
 * (MESA_SHADER_VERTEX .. MESA_SHADER_COMPUTE).
 *
 * The table lives at file scope so that both tu6_emit_xs_config() and
 * tu6_emit_xs() can look up the entry for a stage via &xs_config[stage].
 *
 * Each entry uses positional initializers, so the registers listed for a
 * stage must appear in the same order as the members of struct xs_config.
 *
 * The struct tag and the array share the name "xs_config"; this is legal C
 * because tags and ordinary identifiers live in separate namespaces.
 */
static const struct xs_config {
uint16_t reg_sp_xs_ctrl;
uint16_t reg_sp_xs_config;
uint16_t reg_sp_xs_instrlen;
uint16_t reg_hlsq_xs_ctrl;
uint16_t reg_sp_xs_first_exec_offset;
uint16_t reg_sp_xs_pvt_mem_hw_stack_offset;
} xs_config[] = {
/* vertex shader (VS) registers */
[MESA_SHADER_VERTEX] = {
REG_A6XX_SP_VS_CTRL_REG0,
REG_A6XX_SP_VS_CONFIG,
REG_A6XX_SP_VS_INSTRLEN,
REG_A6XX_HLSQ_VS_CNTL,
REG_A6XX_SP_VS_OBJ_FIRST_EXEC_OFFSET,
REG_A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET,
},
/* tessellation control stage uses the HS register bank */
[MESA_SHADER_TESS_CTRL] = {
REG_A6XX_SP_HS_CTRL_REG0,
REG_A6XX_SP_HS_CONFIG,
REG_A6XX_SP_HS_INSTRLEN,
REG_A6XX_HLSQ_HS_CNTL,
REG_A6XX_SP_HS_OBJ_FIRST_EXEC_OFFSET,
REG_A6XX_SP_HS_PVT_MEM_HW_STACK_OFFSET,
},
/* tessellation evaluation stage uses the DS register bank */
[MESA_SHADER_TESS_EVAL] = {
REG_A6XX_SP_DS_CTRL_REG0,
REG_A6XX_SP_DS_CONFIG,
REG_A6XX_SP_DS_INSTRLEN,
REG_A6XX_HLSQ_DS_CNTL,
REG_A6XX_SP_DS_OBJ_FIRST_EXEC_OFFSET,
REG_A6XX_SP_DS_PVT_MEM_HW_STACK_OFFSET,
},
/* geometry shader (GS) registers */
[MESA_SHADER_GEOMETRY] = {
REG_A6XX_SP_GS_CTRL_REG0,
REG_A6XX_SP_GS_CONFIG,
REG_A6XX_SP_GS_INSTRLEN,
REG_A6XX_HLSQ_GS_CNTL,
REG_A6XX_SP_GS_OBJ_FIRST_EXEC_OFFSET,
REG_A6XX_SP_GS_PVT_MEM_HW_STACK_OFFSET,
},
/* fragment shader (FS) registers */
[MESA_SHADER_FRAGMENT] = {
REG_A6XX_SP_FS_CTRL_REG0,
REG_A6XX_SP_FS_CONFIG,
REG_A6XX_SP_FS_INSTRLEN,
REG_A6XX_HLSQ_FS_CNTL,
REG_A6XX_SP_FS_OBJ_FIRST_EXEC_OFFSET,
REG_A6XX_SP_FS_PVT_MEM_HW_STACK_OFFSET,
},
/* compute shader (CS) registers */
[MESA_SHADER_COMPUTE] = {
REG_A6XX_SP_CS_CTRL_REG0,
REG_A6XX_SP_CS_CONFIG,
REG_A6XX_SP_CS_INSTRLEN,
REG_A6XX_HLSQ_CS_CNTL,
REG_A6XX_SP_CS_OBJ_FIRST_EXEC_OFFSET,
REG_A6XX_SP_CS_PVT_MEM_HW_STACK_OFFSET,
},
};
void
tu6_emit_xs_config(struct tu_cs *cs,
gl_shader_stage stage, /* xs->type, but xs may be NULL */
const struct ir3_shader_variant *xs,
const struct tu_pvtmem_config *pvtmem,
uint64_t binary_iova)
const struct ir3_shader_variant *xs)
{
static const struct xs_config {
uint16_t reg_sp_xs_ctrl;
uint16_t reg_sp_xs_config;
uint16_t reg_hlsq_xs_ctrl;
uint16_t reg_sp_xs_first_exec_offset;
uint16_t reg_sp_xs_pvt_mem_hw_stack_offset;
} xs_config[] = {
[MESA_SHADER_VERTEX] = {
REG_A6XX_SP_VS_CTRL_REG0,
REG_A6XX_SP_VS_CONFIG,
REG_A6XX_HLSQ_VS_CNTL,
REG_A6XX_SP_VS_OBJ_FIRST_EXEC_OFFSET,
REG_A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET,
},
[MESA_SHADER_TESS_CTRL] = {
REG_A6XX_SP_HS_CTRL_REG0,
REG_A6XX_SP_HS_CONFIG,
REG_A6XX_HLSQ_HS_CNTL,
REG_A6XX_SP_HS_OBJ_FIRST_EXEC_OFFSET,
REG_A6XX_SP_HS_PVT_MEM_HW_STACK_OFFSET,
},
[MESA_SHADER_TESS_EVAL] = {
REG_A6XX_SP_DS_CTRL_REG0,
REG_A6XX_SP_DS_CONFIG,
REG_A6XX_HLSQ_DS_CNTL,
REG_A6XX_SP_DS_OBJ_FIRST_EXEC_OFFSET,
REG_A6XX_SP_DS_PVT_MEM_HW_STACK_OFFSET,
},
[MESA_SHADER_GEOMETRY] = {
REG_A6XX_SP_GS_CTRL_REG0,
REG_A6XX_SP_GS_CONFIG,
REG_A6XX_HLSQ_GS_CNTL,
REG_A6XX_SP_GS_OBJ_FIRST_EXEC_OFFSET,
REG_A6XX_SP_GS_PVT_MEM_HW_STACK_OFFSET,
},
[MESA_SHADER_FRAGMENT] = {
REG_A6XX_SP_FS_CTRL_REG0,
REG_A6XX_SP_FS_CONFIG,
REG_A6XX_HLSQ_FS_CNTL,
REG_A6XX_SP_FS_OBJ_FIRST_EXEC_OFFSET,
REG_A6XX_SP_FS_PVT_MEM_HW_STACK_OFFSET,
},
[MESA_SHADER_COMPUTE] = {
REG_A6XX_SP_CS_CTRL_REG0,
REG_A6XX_SP_CS_CONFIG,
REG_A6XX_HLSQ_CS_CNTL,
REG_A6XX_SP_CS_OBJ_FIRST_EXEC_OFFSET,
REG_A6XX_SP_CS_PVT_MEM_HW_STACK_OFFSET,
},
};
const struct xs_config *cfg = &xs_config[stage];
if (!xs) {
@ -399,6 +405,34 @@ tu6_emit_xs_config(struct tu_cs *cs,
return;
}
tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_config, 1);
tu_cs_emit(cs, A6XX_SP_VS_CONFIG_ENABLED |
COND(xs->bindless_tex, A6XX_SP_VS_CONFIG_BINDLESS_TEX) |
COND(xs->bindless_samp, A6XX_SP_VS_CONFIG_BINDLESS_SAMP) |
COND(xs->bindless_ibo, A6XX_SP_VS_CONFIG_BINDLESS_IBO) |
COND(xs->bindless_ubo, A6XX_SP_VS_CONFIG_BINDLESS_UBO) |
A6XX_SP_VS_CONFIG_NTEX(xs->num_samp) |
A6XX_SP_VS_CONFIG_NSAMP(xs->num_samp));
tu_cs_emit_pkt4(cs, cfg->reg_hlsq_xs_ctrl, 1);
tu_cs_emit(cs, A6XX_HLSQ_VS_CNTL_CONSTLEN(xs->constlen) |
A6XX_HLSQ_VS_CNTL_ENABLED);
}
void
tu6_emit_xs(struct tu_cs *cs,
gl_shader_stage stage, /* xs->type, but xs may be NULL */
const struct ir3_shader_variant *xs,
const struct tu_pvtmem_config *pvtmem,
uint64_t binary_iova)
{
const struct xs_config *cfg = &xs_config[stage];
if (!xs) {
/* shader stage disabled */
return;
}
enum a6xx_threadsize thrsz =
xs->info.double_threadsize ? THREAD128 : THREAD64;
switch (stage) {
@ -459,20 +493,9 @@ tu6_emit_xs_config(struct tu_cs *cs,
unreachable("bad shader stage");
}
tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_config, 2);
tu_cs_emit(cs, A6XX_SP_VS_CONFIG_ENABLED |
COND(xs->bindless_tex, A6XX_SP_VS_CONFIG_BINDLESS_TEX) |
COND(xs->bindless_samp, A6XX_SP_VS_CONFIG_BINDLESS_SAMP) |
COND(xs->bindless_ibo, A6XX_SP_VS_CONFIG_BINDLESS_IBO) |
COND(xs->bindless_ubo, A6XX_SP_VS_CONFIG_BINDLESS_UBO) |
A6XX_SP_VS_CONFIG_NTEX(xs->num_samp) |
A6XX_SP_VS_CONFIG_NSAMP(xs->num_samp));
tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_instrlen, 1);
tu_cs_emit(cs, xs->instrlen);
tu_cs_emit_pkt4(cs, cfg->reg_hlsq_xs_ctrl, 1);
tu_cs_emit(cs, A6XX_HLSQ_VS_CNTL_CONSTLEN(xs->constlen) |
A6XX_HLSQ_VS_CNTL_ENABLED);
/* emit program binary & private memory layout
* binary_iova should be aligned to 1 instrlen unit (128 bytes)
*/
@ -578,7 +601,8 @@ tu6_emit_cs_config(struct tu_cs *cs, const struct tu_shader *shader,
.cs_state = true,
.cs_ibo = true));
tu6_emit_xs_config(cs, MESA_SHADER_COMPUTE, v, pvtmem, binary_iova);
tu6_emit_xs_config(cs, MESA_SHADER_COMPUTE, v);
tu6_emit_xs(cs, MESA_SHADER_COMPUTE, v, pvtmem, binary_iova);
uint32_t shared_size = MAX2(((int)v->shared_size - 1) / 1024, 1);
tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_UNKNOWN_A9B1, 1);
@ -1523,6 +1547,26 @@ tu6_emit_geom_tess_consts(struct tu_cs *cs,
}
}
/*
 * Emit the program "config" state for a graphics pipeline into cs.
 *
 * Writes an HLSQ_INVALIDATE_CMD covering the VS/HS/DS/GS/FS state and the
 * graphics IBOs, then calls tu6_emit_xs_config() once per stage slot of the
 * builder, starting at MESA_SHADER_VERTEX.
 *
 * cs:      command stream to emit into
 * builder: pipeline builder supplying the per-stage shader variants
 */
static void
tu6_emit_program_config(struct tu_cs *cs,
struct tu_pipeline_builder *builder)
{
gl_shader_stage stage = MESA_SHADER_VERTEX;
/* The loop below counts upward from `stage` and uses it as an array index,
 * so the vertex stage is required to be index 0.
 */
STATIC_ASSERT(MESA_SHADER_VERTEX == 0);
tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(
.vs_state = true,
.hs_state = true,
.ds_state = true,
.gs_state = true,
.fs_state = true,
.gfx_ibo = true));
/* The variant is passed through unchecked: tu6_emit_xs_config() accepts a
 * NULL xs for a stage that is not present.
 *
 * NOTE(review): the bound is taken from builder->shaders while the index is
 * applied to builder->variants; this assumes both arrays have the same
 * length -- confirm against the struct definition.
 */
for (; stage < ARRAY_SIZE(builder->shaders); stage++) {
tu6_emit_xs_config(cs, stage, builder->variants[stage]);
}
}
static void
tu6_emit_program(struct tu_cs *cs,
struct tu_pipeline_builder *builder,
@ -1540,22 +1584,12 @@ tu6_emit_program(struct tu_cs *cs,
builder->create_info->pTessellationState->patchControlPoints : 0;
bool multi_pos_output = builder->shaders[MESA_SHADER_VERTEX]->multi_pos_output;
STATIC_ASSERT(MESA_SHADER_VERTEX == 0);
tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(
.vs_state = true,
.hs_state = true,
.ds_state = true,
.gs_state = true,
.fs_state = true,
.gfx_ibo = true));
/* Don't use the binning pass variant when GS is present because we don't
* support compiling correct binning pass variants with GS.
*/
if (binning_pass && !gs) {
vs = bs;
tu6_emit_xs_config(cs, stage, bs, &builder->pvtmem, builder->binning_vs_iova);
tu6_emit_xs(cs, stage, bs, &builder->pvtmem, builder->binning_vs_iova);
stage++;
}
@ -1565,7 +1599,7 @@ tu6_emit_program(struct tu_cs *cs,
if (stage == MESA_SHADER_FRAGMENT && binning_pass)
fs = xs = NULL;
tu6_emit_xs_config(cs, stage, xs, &builder->pvtmem, builder->shader_iova[stage]);
tu6_emit_xs(cs, stage, xs, &builder->pvtmem, builder->shader_iova[stage]);
}
uint32_t multiview_views = util_logbase2(builder->multiview_mask) + 1;
@ -2456,6 +2490,22 @@ tu_pipeline_builder_parse_shader_stages(struct tu_pipeline_builder *builder,
struct tu_pipeline *pipeline)
{
struct tu_cs prog_cs;
/* Emit HLSQ_xS_CNTL/HLSQ_SP_xS_CONFIG *first*, before emitting anything
* else that could depend on that state (like push constants)
*
* Note also that this always uses the full VS even in binning pass. The
* binning pass variant has the same const layout as the full VS, and
* the constlen for the VS will be the same or greater than the constlen
* for the binning pass variant. It is required that the constlen state
* matches between binning and draw passes, as some parts of the push
* consts are emitted in state groups that are shared between the binning
* and draw passes.
*/
tu_cs_begin_sub_stream(&pipeline->cs, 512, &prog_cs);
tu6_emit_program_config(&prog_cs, builder);
pipeline->program.config_state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs);
tu_cs_begin_sub_stream(&pipeline->cs, 512, &prog_cs);
tu6_emit_program(&prog_cs, builder, false, pipeline);
pipeline->program.state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs);

View File

@ -498,6 +498,7 @@ enum tu_dynamic_state
enum tu_draw_state_group_id
{
TU_DRAW_STATE_PROGRAM_CONFIG,
TU_DRAW_STATE_PROGRAM,
TU_DRAW_STATE_PROGRAM_BINNING,
TU_DRAW_STATE_TESS,
@ -1159,6 +1160,7 @@ struct tu_pipeline
struct
{
struct tu_draw_state config_state;
struct tu_draw_state state;
struct tu_draw_state binning_state;
@ -1235,9 +1237,14 @@ struct tu_pvtmem_config {
void
tu6_emit_xs_config(struct tu_cs *cs,
gl_shader_stage stage,
const struct ir3_shader_variant *xs,
const struct tu_pvtmem_config *pvtmem,
uint64_t binary_iova);
const struct ir3_shader_variant *xs);
/*
 * Emit the per-stage shader program state for `stage` into cs.
 *
 * stage:       shader stage to emit; equals xs->type when xs is non-NULL
 * xs:          shader variant, or NULL when the stage is disabled, in which
 *              case the function returns without emitting anything
 * pvtmem:      private memory configuration used for the private memory
 *              layout
 * binary_iova: address of the program binary; should be aligned to one
 *              instrlen unit (128 bytes)
 */
void
tu6_emit_xs(struct tu_cs *cs,
gl_shader_stage stage,
const struct ir3_shader_variant *xs,
const struct tu_pvtmem_config *pvtmem,
uint64_t binary_iova);
void
tu6_emit_vpc(struct tu_cs *cs,