From dfedeccc1395e7a43d41165dc09d9ab4e5f16c3c Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Fri, 7 Jun 2019 18:17:36 -0500 Subject: [PATCH] intel: Only set VectorMaskEnable when needed For cases with lots of very small primitives, this may improve performance because we're not executing those dead channels all the time. Shader-db reports no instruction or cycle-count changes. However, by hacking up the driver to report when this optimization triggers, it appears to affect about 10% of shader-db. v2 (Kenneth Graunke): Always enable VMask prior to XeHP for now, because using VMask on those platforms allows us to perform the eliminate_find_live_channel() optimization. However, XeHP doesn't seem to have packed fragment shader dispatch, so we lose that optimization regardless, and there's no reason not to avoid vmask. Reviewed-by: Kenneth Graunke Part-of: --- src/gallium/drivers/crocus/crocus_state.c | 2 +- src/gallium/drivers/iris/iris_state.c | 2 +- src/intel/compiler/brw_compiler.h | 5 ++++- src/intel/compiler/brw_fs.cpp | 14 ++++++++++++-- src/intel/compiler/brw_fs_generator.cpp | 12 +++++++++--- src/intel/vulkan/genX_pipeline.c | 3 ++- 6 files changed, 29 insertions(+), 9 deletions(-) diff --git a/src/gallium/drivers/crocus/crocus_state.c b/src/gallium/drivers/crocus/crocus_state.c index 440a7870652..74549291a81 100644 --- a/src/gallium/drivers/crocus/crocus_state.c +++ b/src/gallium/drivers/crocus/crocus_state.c @@ -6441,7 +6441,7 @@ crocus_upload_dirty_render_state(struct crocus_context *ice, * incorrect for subspans where some of the pixels are unlit. We believe * the bit just didn't take effect in previous generations. 
*/ - ps.VectorMaskEnable = GFX_VER >= 8; + ps.VectorMaskEnable = GFX_VER >= 8 && wm_prog_data->uses_vmask; ps._8PixelDispatchEnable = wm_prog_data->dispatch_8; ps._16PixelDispatchEnable = wm_prog_data->dispatch_16; diff --git a/src/gallium/drivers/iris/iris_state.c b/src/gallium/drivers/iris/iris_state.c index 24dae802611..00114113062 100644 --- a/src/gallium/drivers/iris/iris_state.c +++ b/src/gallium/drivers/iris/iris_state.c @@ -4654,7 +4654,7 @@ iris_store_fs_state(const struct intel_device_info *devinfo, uint32_t *psx_state = ps_state + GENX(3DSTATE_PS_length); iris_pack_command(GENX(3DSTATE_PS), ps_state, ps) { - ps.VectorMaskEnable = true; + ps.VectorMaskEnable = wm_prog_data->uses_vmask; ps.BindingTableEntryCount = shader->bt.size_bytes / 4; ps.FloatingPointMode = prog_data->use_alt_mode; ps.MaximumNumberofThreadsPerPSD = diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h index ef4c8650ede..62433320107 100644 --- a/src/intel/compiler/brw_compiler.h +++ b/src/intel/compiler/brw_compiler.h @@ -887,6 +887,7 @@ struct brw_wm_prog_data { bool uses_src_w; bool uses_depth_w_coefficients; bool uses_sample_mask; + bool uses_vmask; bool has_render_target_reads; bool has_side_effects; bool pulls_bary; @@ -1967,7 +1968,9 @@ brw_stage_has_packed_dispatch(ASSERTED const struct intel_device_info *devinfo, */ const struct brw_wm_prog_data *wm_prog_data = (const struct brw_wm_prog_data *)prog_data; - return devinfo->verx10 < 125 && !wm_prog_data->persample_dispatch; + return devinfo->verx10 < 125 && + !wm_prog_data->persample_dispatch && + wm_prog_data->uses_vmask; } case MESA_SHADER_COMPUTE: /* Compute shaders will be spawned with either a fully enabled dispatch diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index c02faf65021..84167496b99 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -9824,6 +9824,14 @@ brw_nir_populate_wm_prog_data(const nir_shader *shader, 
(prog_data->computed_depth_mode == BRW_PSCDEPTH_OFF) && !prog_data->computed_stencil; + /* We choose to always enable VMask prior to XeHP, as disabling it would + * cause us to lose out on the eliminate_find_live_channel() optimization. + */ + prog_data->uses_vmask = devinfo->verx10 < 125 || + shader->info.fs.needs_quad_helper_invocations || + shader->info.fs.needs_all_helper_invocations || + prog_data->per_coarse_pixel_dispatch; + prog_data->uses_src_w = BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD); prog_data->uses_src_depth = @@ -10569,13 +10577,15 @@ static UNUSED void brw_fs_test_dispatch_packing(const fs_builder &bld) { const gl_shader_stage stage = bld.shader->stage; + const bool uses_vmask = + stage == MESA_SHADER_FRAGMENT && + brw_wm_prog_data(bld.shader->stage_prog_data)->uses_vmask; if (brw_stage_has_packed_dispatch(bld.shader->devinfo, stage, bld.shader->stage_prog_data)) { const fs_builder ubld = bld.exec_all().group(1, 0); const fs_reg tmp = component(bld.vgrf(BRW_REGISTER_TYPE_UD), 0); - const fs_reg mask = (stage == MESA_SHADER_FRAGMENT ? brw_vmask_reg() : - brw_dmask_reg()); + const fs_reg mask = uses_vmask ? brw_vmask_reg() : brw_dmask_reg(); ubld.ADD(tmp, mask, brw_imm_ud(1)); ubld.AND(tmp, mask, tmp); diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp index 54db3b6721b..5a4b2fb35e6 100644 --- a/src/intel/compiler/brw_fs_generator.cpp +++ b/src/intel/compiler/brw_fs_generator.cpp @@ -2411,21 +2411,27 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width, break; case SHADER_OPCODE_FIND_LIVE_CHANNEL: { + const bool uses_vmask = + stage == MESA_SHADER_FRAGMENT && + brw_wm_prog_data(this->prog_data)->uses_vmask; const struct brw_reg mask = brw_stage_has_packed_dispatch(devinfo, stage, prog_data) ? brw_imm_ud(~0u) : - stage == MESA_SHADER_FRAGMENT ? brw_vmask_reg() : - brw_dmask_reg(); + uses_vmask ? 
brw_vmask_reg() : brw_dmask_reg(); brw_find_live_channel(p, dst, mask, false); break; } case SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL: { + const bool uses_vmask = + stage == MESA_SHADER_FRAGMENT && + brw_wm_prog_data(this->prog_data)->uses_vmask; + /* ce0 doesn't consider the thread dispatch mask, so if we want * to find the true last enabled channel, we need to apply that too. */ const struct brw_reg mask = - stage == MESA_SHADER_FRAGMENT ? brw_vmask_reg() : brw_dmask_reg(); + uses_vmask ? brw_vmask_reg() : brw_dmask_reg(); brw_find_live_channel(p, dst, mask, true); break; diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c index 0fa95ed86b6..192fec6b6e3 100644 --- a/src/intel/vulkan/genX_pipeline.c +++ b/src/intel/vulkan/genX_pipeline.c @@ -2351,7 +2351,8 @@ emit_3dstate_ps(struct anv_graphics_pipeline *pipeline, brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2); ps.SingleProgramFlow = false; - ps.VectorMaskEnable = GFX_VER >= 8; + ps.VectorMaskEnable = GFX_VER >= 8 && + wm_prog_data->uses_vmask; /* Wa_1606682166 */ ps.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(fs_bin); ps.BindingTableEntryCount = fs_bin->bind_map.surface_count;