diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c
index e4857288d64..a56c0296e19 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.c
+++ b/src/freedreno/vulkan/tu_cmd_buffer.c
@@ -496,7 +496,6 @@ tu_cs_emit_draw_state(struct tu_cs *cs, uint32_t id, struct tu_draw_state state)
    switch (id) {
    case TU_DRAW_STATE_PROGRAM:
    case TU_DRAW_STATE_VI:
-   case TU_DRAW_STATE_FS_CONST:
    /* The blob seems to not enable this (DESC_SETS_LOAD) for binning, even
     * when resources would actually be used in the binning shader.
     * Presumably the overhead of prefetching the resources isn't
@@ -847,13 +846,14 @@ tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
    tu_cs_emit_write_reg(cs, REG_A6XX_SP_CHICKEN_BITS, 0x00000410);
    tu_cs_emit_write_reg(cs, REG_A6XX_SP_IBO_COUNT, 0);
    tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B182, 0);
-   tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_SHARED_CONSTS, 0);
+   tu_cs_emit_regs(cs, A6XX_HLSQ_SHARED_CONSTS(.enable = false));
    tu_cs_emit_write_reg(cs, REG_A6XX_UCHE_UNKNOWN_0E12, 0x3200000);
    tu_cs_emit_write_reg(cs, REG_A6XX_UCHE_CLIENT_PF, 4);
    tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8E01, 0x0);
    tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_A9A8, 0);
-   tu_cs_emit_write_reg(cs, REG_A6XX_SP_MODE_CONTROL,
-                        A6XX_SP_MODE_CONTROL_CONSTANT_DEMOTION_ENABLE | 4);
+   tu_cs_emit_regs(cs, A6XX_SP_MODE_CONTROL(.constant_demotion_enable = true,
+                                            .isammode = ISAMMODE_GL,
+                                            .shared_consts_enable = false));
 
    /* TODO: set A6XX_VFD_ADD_OFFSET_INSTANCE and fix ir3 to avoid adding base instance */
    tu_cs_emit_write_reg(cs, REG_A6XX_VFD_ADD_OFFSET, A6XX_VFD_ADD_OFFSET_VERTEX);
@@ -1060,7 +1060,7 @@ tu6_emit_binning_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
    tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3);
    tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(0) |
                   CP_SET_DRAW_STATE__0_DISABLE |
-                  CP_SET_DRAW_STATE__0_GROUP_ID(TU_DRAW_STATE_SHADER_GEOM_CONST));
+                  CP_SET_DRAW_STATE__0_GROUP_ID(TU_DRAW_STATE_CONST));
    tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0));
    tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0));
 
@@ -3676,9 +3676,9 @@ tu6_user_consts_size(const struct tu_pipeline *pipeline,
       &pipeline->program.link[type];
    uint32_t dwords = 0;
 
-   if (link->push_consts.count > 0) {
-      unsigned num_units = link->push_consts.count;
-      dwords += 4 + num_units * 4;
+   if (link->push_consts.dwords > 0) {
+      unsigned num_units = link->push_consts.dwords;
+      dwords += 4 + num_units;
    }
 
    return dwords;
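[note — not part of the patch] In tu6_user_consts_size() above, the "4 +
num_units" is one pkt7 header dword plus the three CP_LOAD_STATE6_0..2
dwords, followed by the payload, which is now counted directly in dwords
instead of vec4s. A self-contained sketch of that accounting (function
names here are illustrative, not driver code):

    #include <stdint.h>
    #include <assert.h>

    /* CP_LOAD_STATE6 overhead: 1 pkt7 header dword + 3 header dwords */
    #define LOAD_STATE_OVERHEAD 4

    static uint32_t size_vec4(uint32_t count_vec4)   /* old accounting */
    {
       return count_vec4 ? LOAD_STATE_OVERHEAD + count_vec4 * 4 : 0;
    }

    static uint32_t size_dwords(uint32_t dwords)     /* new accounting */
    {
       return dwords ? LOAD_STATE_OVERHEAD + dwords : 0;
    }

    int main(void)
    {
       /* 2 vec4s of push constants == 8 dwords: both formulas agree */
       assert(size_vec4(2) == size_dwords(8));
       return 0;
    }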
@@ -3693,47 +3693,81 @@ tu6_emit_user_consts(struct tu_cs *cs,
    const struct tu_program_descriptor_linkage *link =
       &pipeline->program.link[type];
 
-   if (link->push_consts.count > 0) {
-      unsigned num_units = link->push_consts.count;
+   if (link->push_consts.dwords > 0) {
+      unsigned num_units = link->push_consts.dwords;
       unsigned offset = link->push_consts.lo;
-      tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + num_units * 4);
-      tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
+
+      /* DST_OFF and NUM_UNIT require vec4 units */
+      tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + num_units);
+      tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset / 4) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
+                 CP_LOAD_STATE6_0_NUM_UNIT(num_units / 4));
+      tu_cs_emit(cs, 0);
+      tu_cs_emit(cs, 0);
+      for (unsigned i = 0; i < num_units; i++)
+         tu_cs_emit(cs, push_constants[i + offset]);
+   }
+}
+
+static void
+tu6_emit_shared_consts(struct tu_cs *cs,
+                       const struct tu_pipeline *pipeline,
+                       uint32_t *push_constants,
+                       bool compute)
+{
+   if (pipeline->shared_consts.dwords > 0) {
+      /* Offset and num_units for shared consts are in units of dwords. */
+      unsigned num_units = pipeline->shared_consts.dwords;
+      unsigned offset = pipeline->shared_consts.lo;
+
+      enum a6xx_state_type st = compute ? ST6_UBO : ST6_CONSTANTS;
+      uint32_t cp_load_state = compute ? CP_LOAD_STATE6_FRAG : CP_LOAD_STATE6;
+
+      tu_cs_emit_pkt7(cs, cp_load_state, 3 + num_units);
+      tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
+                 CP_LOAD_STATE6_0_STATE_TYPE(st) |
+                 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
+                 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_IBO) |
                  CP_LOAD_STATE6_0_NUM_UNIT(num_units));
       tu_cs_emit(cs, 0);
       tu_cs_emit(cs, 0);
-      for (unsigned i = 0; i < num_units * 4; i++)
-         tu_cs_emit(cs, push_constants[i + offset * 4]);
+
+      for (unsigned i = 0; i < num_units; i++)
+         tu_cs_emit(cs, push_constants[i + offset]);
    }
 }
 
+static uint32_t
+tu6_const_size(struct tu_cmd_buffer *cmd,
+               const struct tu_pipeline *pipeline,
+               bool compute)
+{
+   uint32_t dwords = 0;
+
+   if (pipeline->shared_consts.dwords > 0) {
+      dwords = pipeline->shared_consts.dwords + 4;
+   } else {
+      if (compute) {
+         dwords = tu6_user_consts_size(pipeline, MESA_SHADER_COMPUTE);
+      } else {
+         for (uint32_t type = MESA_SHADER_VERTEX; type <= MESA_SHADER_FRAGMENT; type++)
+            dwords += tu6_user_consts_size(pipeline, type);
+      }
+   }
+
+   return dwords;
+}
+
 static struct tu_draw_state
 tu6_emit_consts(struct tu_cmd_buffer *cmd,
                 const struct tu_pipeline *pipeline,
-                gl_shader_stage type)
-{
-   uint32_t dwords = tu6_user_consts_size(pipeline, type);
-   if (dwords == 0)
-      return (struct tu_draw_state) {};
-
-   struct tu_cs cs;
-   tu_cs_begin_sub_stream(&cmd->sub_cs, dwords, &cs);
-
-   tu6_emit_user_consts(&cs, pipeline, type, cmd->push_constants);
-
-   return tu_cs_end_draw_state(&cmd->sub_cs, &cs);
-}
-
-static struct tu_draw_state
-tu6_emit_consts_geom(struct tu_cmd_buffer *cmd,
-                     const struct tu_pipeline *pipeline)
+                bool compute)
 {
    uint32_t dwords = 0;
-   for (uint32_t type = MESA_SHADER_VERTEX; type < MESA_SHADER_FRAGMENT; type++)
-      dwords += tu6_user_consts_size(pipeline, type);
+
+   dwords = tu6_const_size(cmd, pipeline, compute);
 
    if (dwords == 0)
       return (struct tu_draw_state) {};
@@ -3741,8 +3775,16 @@ tu6_emit_consts_geom(struct tu_cmd_buffer *cmd,
    struct tu_cs cs;
    tu_cs_begin_sub_stream(&cmd->sub_cs, dwords, &cs);
 
-   for (uint32_t type = MESA_SHADER_VERTEX; type < MESA_SHADER_FRAGMENT; type++)
-      tu6_emit_user_consts(&cs, pipeline, type, cmd->push_constants);
+   if (pipeline->shared_consts.dwords > 0) {
+      tu6_emit_shared_consts(&cs, pipeline, cmd->push_constants, compute);
+   } else {
+      if (compute) {
+         tu6_emit_user_consts(&cs, pipeline, MESA_SHADER_COMPUTE, cmd->push_constants);
+      } else {
+         for (uint32_t type = MESA_SHADER_VERTEX; type <= MESA_SHADER_FRAGMENT; type++)
+            tu6_emit_user_consts(&cs, pipeline, type, cmd->push_constants);
+      }
+   }
 
    return tu_cs_end_draw_state(&cmd->sub_cs, &cs);
 }
@@ -3968,12 +4010,8 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
       tu_cs_emit_regs(&cs, A6XX_RB_STENCIL_CONTROL(.dword = cmd->state.rb_stencil_cntl));
    }
 
-   if (cmd->state.dirty & TU_CMD_DIRTY_SHADER_CONSTS) {
-      cmd->state.shader_const[0] =
-         tu6_emit_consts_geom(cmd, pipeline);
-      cmd->state.shader_const[1] =
-         tu6_emit_consts(cmd, pipeline, MESA_SHADER_FRAGMENT);
-   }
+   if (cmd->state.dirty & TU_CMD_DIRTY_SHADER_CONSTS)
+      cmd->state.shader_const = tu6_emit_consts(cmd, pipeline, false);
 
    if (cmd->state.dirty & TU_CMD_DIRTY_VIEWPORTS) {
      struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_VIEWPORT, 8 + 10 * cmd->state.max_viewport);
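[note] The draw and dispatch paths now funnel through a single
tu6_emit_consts(); the key difference between the two loaders above is the
unit of DST_OFF/NUM_UNIT — vec4s for regular consts, dwords for shared
consts — while tu_push_constant_range always stores dwords. A sketch of
that conversion (names here are illustrative, not driver code):

    #include <stdint.h>
    #include <assert.h>

    struct fields { uint32_t dst_off, num_unit; };

    static struct fields consts_fields(uint32_t lo, uint32_t dwords, int shared)
    {
       if (shared) /* SHARED_CONSTS: already in dword units */
          return (struct fields) { lo, dwords };
       /* regular consts: dwords -> vec4s; the range must be vec4 aligned */
       assert(lo % 4 == 0 && dwords % 4 == 0);
       return (struct fields) { lo / 4, dwords / 4 };
    }

    int main(void)
    {
       struct fields f = consts_fields(16, 32, 0);
       assert(f.dst_off == 4 && f.num_unit == 8);  /* vec4 units */
       f = consts_fields(0, 32, 1);
       assert(f.dst_off == 0 && f.num_unit == 32); /* dword units */
       return 0;
    }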
@@ -4008,8 +4046,7 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
       tu_cs_emit_draw_state(cs, TU_DRAW_STATE_RAST, pipeline->rast_state);
       tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PRIM_MODE_SYSMEM, pipeline->prim_order_state_sysmem);
       tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PRIM_MODE_GMEM, pipeline->prim_order_state_gmem);
-      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_SHADER_GEOM_CONST, cmd->state.shader_const[0]);
-      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_FS_CONST, cmd->state.shader_const[1]);
+      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_CONST, cmd->state.shader_const);
       tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DESC_SETS, cmd->state.desc_sets);
       tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DESC_SETS_LOAD, pipeline->load_state);
       tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VB, cmd->state.vertex_buffers);
@@ -4028,7 +4065,7 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
     */
    bool emit_binding_stride = false, emit_blend = false;
    uint32_t draw_state_count =
-      ((cmd->state.dirty & TU_CMD_DIRTY_SHADER_CONSTS) ? 2 : 0) +
+      ((cmd->state.dirty & TU_CMD_DIRTY_SHADER_CONSTS) ? 1 : 0) +
       ((cmd->state.dirty & TU_CMD_DIRTY_DESC_SETS_LOAD) ? 1 : 0) +
       ((cmd->state.dirty & TU_CMD_DIRTY_VERTEX_BUFFERS) ? 1 : 0) +
       ((cmd->state.dirty & TU_CMD_DIRTY_VS_PARAMS) ? 1 : 0) +
@@ -4049,10 +4086,8 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
    if (draw_state_count > 0)
       tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * draw_state_count);
 
-   if (cmd->state.dirty & TU_CMD_DIRTY_SHADER_CONSTS) {
-      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_SHADER_GEOM_CONST, cmd->state.shader_const[0]);
-      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_FS_CONST, cmd->state.shader_const[1]);
-   }
+   if (cmd->state.dirty & TU_CMD_DIRTY_SHADER_CONSTS)
+      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_CONST, cmd->state.shader_const);
    if (cmd->state.dirty & TU_CMD_DIRTY_DESC_SETS_LOAD)
       tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DESC_SETS_LOAD, pipeline->load_state);
    if (cmd->state.dirty & TU_CMD_DIRTY_VERTEX_BUFFERS)
@@ -4572,8 +4607,7 @@ tu_dispatch(struct tu_cmd_buffer *cmd,
 
    tu_emit_cache_flush(cmd, cs);
 
    /* note: no reason to have this in a separate IB */
-   tu_cs_emit_state_ib(cs,
-                       tu6_emit_consts(cmd, pipeline, MESA_SHADER_COMPUTE));
+   tu_cs_emit_state_ib(cs, tu6_emit_consts(cmd, pipeline, true));
 
    tu_emit_compute_driver_params(cmd, cs, pipeline, info);
 
diff --git a/src/freedreno/vulkan/tu_pipeline.c b/src/freedreno/vulkan/tu_pipeline.c
index e405079938f..726fb2d9fbf 100644
--- a/src/freedreno/vulkan/tu_pipeline.c
+++ b/src/freedreno/vulkan/tu_pipeline.c
@@ -662,15 +662,29 @@ tu6_emit_xs(struct tu_cs *cs,
    }
 }
 
+static void
+tu6_emit_shared_consts_enable(struct tu_cs *cs, bool enable)
+{
+   /* Enable/disable shared constants */
+   tu_cs_emit_regs(cs, A6XX_HLSQ_SHARED_CONSTS(.enable = enable));
+   tu_cs_emit_regs(cs, A6XX_SP_MODE_CONTROL(.constant_demotion_enable = true,
+                                            .isammode = ISAMMODE_GL,
+                                            .shared_consts_enable = enable));
+}
+
 static void
 tu6_emit_cs_config(struct tu_cs *cs,
                    const struct ir3_shader_variant *v,
                    const struct tu_pvtmem_config *pvtmem,
                    uint64_t binary_iova)
 {
+   bool shared_consts_enable = ir3_const_state(v)->shared_consts_enable;
+   tu6_emit_shared_consts_enable(cs, shared_consts_enable);
+
    tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(
          .cs_state = true,
-         .cs_ibo = true));
+         .cs_ibo = true,
+         .cs_shared_const = shared_consts_enable));
 
    tu6_emit_xs_config(cs, MESA_SHADER_COMPUTE, v);
    tu6_emit_xs(cs, MESA_SHADER_COMPUTE, v, pvtmem, binary_iova);
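[note] tu6_emit_shared_consts_enable() above always writes the HLSQ and SP
toggles together, so the two blocks can never disagree about whether the
shared-const space is live. A toy model of that invariant (the structs are
hypothetical simplifications, not the real register layout):

    #include <stdbool.h>
    #include <assert.h>

    struct hw_state {
       bool hlsq_shared_consts;  /* HLSQ_SHARED_CONSTS.ENABLE */
       bool sp_shared_consts;    /* SP_MODE_CONTROL.SHARED_CONSTS_ENABLE */
    };

    /* mirrors tu6_emit_shared_consts_enable(): one entry point flips both */
    static void set_shared_consts(struct hw_state *hw, bool enable)
    {
       hw->hlsq_shared_consts = enable;
       hw->sp_shared_consts = enable;
    }

    int main(void)
    {
       struct hw_state hw = { false, false };
       set_shared_consts(&hw, true);
       assert(hw.hlsq_shared_consts == hw.sp_shared_consts);
       return 0;
    }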
@@ -1678,13 +1692,17 @@ tu6_emit_program_config(struct tu_cs *cs,
 
    STATIC_ASSERT(MESA_SHADER_VERTEX == 0);
 
+   bool shared_consts_enable = builder->layout->push_constant_size > 0;
+   tu6_emit_shared_consts_enable(cs, shared_consts_enable);
+
    tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(
          .vs_state = true,
          .hs_state = true,
          .ds_state = true,
          .gs_state = true,
          .fs_state = true,
-         .gfx_ibo = true));
+         .gfx_ibo = true,
+         .gfx_shared_const = shared_consts_enable));
 
    for (; stage < ARRAY_SIZE(builder->shader_iova); stage++) {
       tu6_emit_xs_config(cs, stage, builder->shaders->variants[stage]);
    }
@@ -2793,6 +2811,13 @@ tu_pipeline_builder_compile_shaders(struct tu_pipeline_builder *builder,
       stage_infos[stage] = &builder->create_info->pStages[i];
    }
 
+   if (builder->layout->push_constant_size > 0) {
+      pipeline->shared_consts = (struct tu_push_constant_range) {
+         .lo = 0,
+         .dwords = builder->layout->push_constant_size / 4,
+      };
+   }
+
    struct tu_shader_key keys[ARRAY_SIZE(stage_infos)] = { };
    for (gl_shader_stage stage = MESA_SHADER_VERTEX;
         stage < ARRAY_SIZE(keys); stage++) {
@@ -2952,7 +2977,7 @@ tu_pipeline_builder_compile_shaders(struct tu_pipeline_builder *builder,
         stage < ARRAY_SIZE(shaders); stage++) {
       if (!shaders[stage])
          continue;
-      
+
       int64_t stage_start = os_time_get_nano();
 
       compiled_shaders->variants[stage] =
@@ -2965,6 +2990,7 @@ tu_pipeline_builder_compile_shaders(struct tu_pipeline_builder *builder,
       stage_feedbacks[stage].duration += os_time_get_nano() - stage_start;
    }
 
+   compiled_shaders->shared_consts = pipeline->shared_consts;
 
    uint32_t safe_constlens = ir3_trim_constlen(compiled_shaders->variants, compiler);
@@ -4038,6 +4064,13 @@ tu_compute_pipeline_create(VkDevice device,
          VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT;
    }
 
+   if (layout->push_constant_size > 0) {
+      pipeline->shared_consts = (struct tu_push_constant_range) {
+         .lo = 0,
+         .dwords = layout->push_constant_size / 4,
+      };
+   }
+
    char *nir_initial_disasm = NULL;
 
    if (!compiled) {
@@ -4071,6 +4104,7 @@ tu_compute_pipeline_create(VkDevice device,
 
    compiled->active_desc_sets = shader->active_desc_sets;
    compiled->push_consts[MESA_SHADER_COMPUTE] = shader->push_consts;
+   compiled->shared_consts = pipeline->shared_consts;
 
    struct ir3_shader_variant *v =
       ir3_shader_create_variant(shader->ir3_shader, &ir3_key, executable_info);
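[note] The tu_private.h changes below rename tu_push_constant_range's
"count" (vec4s) to "dwords", and the pipeline-side shared_consts range
always covers the whole VkPipelineLayout push constant block. A minimal
sketch of that initialization (the layout struct is simplified here; only
push_constant_size, in bytes from the API, is assumed):

    #include <stdint.h>
    #include <assert.h>

    struct tu_push_constant_range { uint32_t lo, dwords; };
    struct layout { uint32_t push_constant_size; /* bytes */ };

    static struct tu_push_constant_range
    shared_consts_range(const struct layout *l)
    {
       /* lo == 0: every stage reads the same block from the start */
       return (struct tu_push_constant_range) {
          .lo = 0,
          .dwords = l->push_constant_size / 4,
       };
    }

    int main(void)
    {
       struct layout l = { .push_constant_size = 128 };
       assert(shared_consts_range(&l).dwords == 32);
       return 0;
    }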
diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h
index c3dbd7d795b..deca65f9452 100644
--- a/src/freedreno/vulkan/tu_private.h
+++ b/src/freedreno/vulkan/tu_private.h
@@ -720,8 +720,7 @@ enum tu_draw_state_group_id
    TU_DRAW_STATE_VI,
    TU_DRAW_STATE_VI_BINNING,
    TU_DRAW_STATE_RAST,
-   TU_DRAW_STATE_SHADER_GEOM_CONST,
-   TU_DRAW_STATE_FS_CONST,
+   TU_DRAW_STATE_CONST,
    TU_DRAW_STATE_DESC_SETS,
    TU_DRAW_STATE_DESC_SETS_LOAD,
    TU_DRAW_STATE_VS_PARAMS,
@@ -1189,7 +1188,7 @@ struct tu_cmd_state
    /* saved states to re-emit in TU_CMD_DIRTY_DRAW_STATE case */
    struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT];
    struct tu_draw_state vertex_buffers;
-   struct tu_draw_state shader_const[2];
+   struct tu_draw_state shader_const;
    struct tu_draw_state desc_sets;
 
    struct tu_draw_state vs_params;
@@ -1377,7 +1376,7 @@ struct tu_event
 struct tu_push_constant_range
 {
    uint32_t lo;
-   uint32_t count;
+   uint32_t dwords;
 };
 
 struct tu_shader
@@ -1399,6 +1398,7 @@ struct tu_compiled_shaders
 {
    struct vk_pipeline_cache_object base;
 
+   struct tu_push_constant_range shared_consts;
    struct tu_push_constant_range push_consts[MESA_SHADER_STAGES];
    uint8_t active_desc_sets;
    bool multi_pos_output;
@@ -1498,6 +1498,8 @@ struct tu_pipeline
    /* for vertex buffers state */
    uint32_t num_vbs;
 
+   struct tu_push_constant_range shared_consts;
+
    struct
    {
       struct tu_draw_state config_state;
diff --git a/src/freedreno/vulkan/tu_shader.c b/src/freedreno/vulkan/tu_shader.c
index eb5a91cd2fc..7880e4a3e8e 100644
--- a/src/freedreno/vulkan/tu_shader.c
+++ b/src/freedreno/vulkan/tu_shader.c
@@ -138,18 +138,21 @@ tu_spirv_to_nir(struct tu_device *dev,
 }
 
 static void
-lower_load_push_constant(nir_builder *b, nir_intrinsic_instr *instr,
+lower_load_push_constant(struct tu_device *dev,
+                         nir_builder *b,
+                         nir_intrinsic_instr *instr,
                          struct tu_shader *shader)
 {
    uint32_t base = nir_intrinsic_base(instr);
    assert(base % 4 == 0);
-   assert(base >= shader->push_consts.lo * 16);
-   base -= shader->push_consts.lo * 16;
+   assert(base >= shader->push_consts.lo * 4);
+   base -= shader->push_consts.lo * 4;
 
    nir_ssa_def *load =
-      nir_load_uniform(b, instr->num_components, instr->dest.ssa.bit_size,
-                       nir_ushr(b, instr->src[0].ssa, nir_imm_int(b, 2)),
-                       .base = base / 4);
+      nir_load_uniform(b, instr->num_components,
+                       instr->dest.ssa.bit_size,
+                       nir_ushr(b, instr->src[0].ssa, nir_imm_int(b, 2)),
+                       .base = base / 4 + dev->compiler->shared_consts_base_offset * 4);
 
    nir_ssa_def_rewrite_uses(&instr->dest.ssa, load);
 
@@ -398,7 +401,7 @@ lower_intrinsic(nir_builder *b, nir_intrinsic_instr *instr,
 {
    switch (instr->intrinsic) {
    case nir_intrinsic_load_push_constant:
-      lower_load_push_constant(b, instr, shader);
+      lower_load_push_constant(dev, b, instr, shader);
       return true;
 
    case nir_intrinsic_load_vulkan_descriptor:
@@ -610,17 +613,21 @@ gather_push_constants(nir_shader *shader, struct tu_shader *tu_shader)
 
    if (min >= max) {
       tu_shader->push_consts.lo = 0;
-      tu_shader->push_consts.count = 0;
+      tu_shader->push_consts.dwords = 0;
       return;
    }
 
-   /* CP_LOAD_STATE OFFSET and NUM_UNIT are in units of vec4 (4 dwords),
-    * however there's an alignment requirement of 4 on OFFSET. Expand the
-    * range and change units accordingly.
+   /* CP_LOAD_STATE OFFSET and NUM_UNIT are in dword units for
+    * SHARED_CONSTS but in vec4 units for regular consts, so unify on
+    * dwords in tu_push_constant_range and convert to the right unit at
+    * emit time.
+    *
+    * Note there's an alignment requirement of 16 dwords on OFFSET. Expand
+    * the range and change units accordingly.
     */
-   tu_shader->push_consts.lo = (min / 16) / 4 * 4;
-   tu_shader->push_consts.count =
-      align(max, 16) / 16 - tu_shader->push_consts.lo;
+   tu_shader->push_consts.lo = (min / 4) / 16 * 16;
+   tu_shader->push_consts.dwords =
+      align(max, 16) / 4 - tu_shader->push_consts.lo;
 }
 
 static bool
@@ -822,7 +829,8 @@ tu_shader_create(struct tu_device *dev,
 
    shader->ir3_shader =
      ir3_shader_from_nir(dev->compiler, nir, &(struct ir3_shader_options) {
-                          .reserved_user_consts = align(shader->push_consts.count, 4),
+                          .reserved_user_consts = 0,
+                          .shared_consts_enable = layout->push_constant_size > 0,
                           .api_wavesize = key->api_wavesize,
                           .real_wavesize = key->real_wavesize,
                          }, &so_info);
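[note] Two fixes were applied to the tu_shader.c hunks above: the lowered
uniform base keeps the old "/ 4" byte-to-dword conversion before adding the
shared-consts base (the compiler's shared_consts_base_offset is in vec4
units, so "* 4" yields dwords), and the range start is aligned down to 16
dwords as the rewritten comment requires, matching the old code's 4-vec4
OFFSET alignment. A standalone check of the gather_push_constants()
arithmetic (illustrative names; align() follows Mesa's macro semantics):

    #include <stdint.h>
    #include <assert.h>

    #define align(v, a) (((v) + (a) - 1) / (a) * (a))

    struct range { uint32_t lo, dwords; };

    static struct range push_const_range(uint32_t min_bytes, uint32_t max_bytes)
    {
       struct range r;
       r.lo = (min_bytes / 4) / 16 * 16;           /* dwords, 16-dword aligned */
       r.dwords = align(max_bytes, 16) / 4 - r.lo; /* multiple of 4 dwords */
       return r;
    }

    int main(void)
    {
       /* reads in bytes [68, 96) -> load 8 dwords starting at dword 16 */
       struct range r = push_const_range(68, 96);
       assert(r.lo == 16 && r.dwords == 8);
       return 0;
    }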