diff --git a/src/amd/common/ac_nir.h b/src/amd/common/ac_nir.h
index f77104ecb61..dd95139a266 100644
--- a/src/amd/common/ac_nir.h
+++ b/src/amd/common/ac_nir.h
@@ -94,7 +94,6 @@ ac_nir_lower_indirect_derefs(nir_shader *shader,
 typedef struct
 {
    unsigned lds_bytes_if_culling_off;
-   bool can_cull;
    bool passthrough;
    bool early_prim_export;
    uint64_t nggc_inputs_read_by_pos;
@@ -107,7 +106,7 @@ ac_nir_lower_ngg_nogs(nir_shader *shader,
                       unsigned num_vertices_per_primitive,
                       unsigned max_workgroup_size,
                       unsigned wave_size,
-                      bool consider_culling,
+                      bool can_cull,
                       bool consider_passthrough,
                       bool export_prim_id,
                       bool provoking_vtx_last,
diff --git a/src/amd/common/ac_nir_lower_ngg.c b/src/amd/common/ac_nir_lower_ngg.c
index 88d6865198d..228ebba1cbc 100644
--- a/src/amd/common/ac_nir_lower_ngg.c
+++ b/src/amd/common/ac_nir_lower_ngg.c
@@ -1254,34 +1254,13 @@ add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_c
       unreachable("Should be VS or TES.");
 }
 
-static bool
-can_use_deferred_attribute_culling(nir_shader *shader)
-{
-   /* When the shader writes memory, it is difficult to guarantee correctness.
-    * Future work:
-    * - if only write-only SSBOs are used
-    * - if we can prove that non-position outputs don't rely on memory stores
-    *   then may be okay to keep the memory stores in the 1st shader part, and delete them from the 2nd.
-    */
-   if (shader->info.writes_memory)
-      return false;
-
-   /* When the shader relies on the subgroup invocation ID, we'd break it, because the ID changes after the culling.
-    * Future work: try to save this to LDS and reload, but it can still be broken in subtle ways.
-    */
-   if (BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SUBGROUP_INVOCATION))
-      return false;
-
-   return true;
-}
-
 ac_nir_ngg_config
 ac_nir_lower_ngg_nogs(nir_shader *shader,
                       unsigned max_num_es_vertices,
                       unsigned num_vertices_per_primitives,
                       unsigned max_workgroup_size,
                       unsigned wave_size,
-                      bool consider_culling,
+                      bool can_cull,
                       bool consider_passthrough,
                       bool export_prim_id,
                       bool provoking_vtx_last,
@@ -1292,8 +1271,6 @@ ac_nir_lower_ngg_nogs(nir_shader *shader,
    assert(impl);
    assert(max_num_es_vertices && max_workgroup_size && wave_size);
 
-   bool can_cull = consider_culling && (num_vertices_per_primitives == 3) &&
-                   can_use_deferred_attribute_culling(shader);
    bool passthrough = consider_passthrough && !can_cull &&
                       !(shader->info.stage == MESA_SHADER_VERTEX && export_prim_id);
 
@@ -1441,7 +1418,6 @@ ac_nir_lower_ngg_nogs(nir_shader *shader,
 
    ac_nir_ngg_config ret = {
       .lds_bytes_if_culling_off = lds_bytes_if_culling_off,
-      .can_cull = can_cull,
       .passthrough = passthrough,
       .early_prim_export = state.early_prim_export,
       .nggc_inputs_read_by_pos = state.inputs_needed_by_pos,
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index 3eea082571c..d9e28280827 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -2763,6 +2763,29 @@ radv_get_ballot_bit_size(struct radv_device *device, const VkPipelineShaderStage
    return 64;
 }
 
+static void
+radv_determine_ngg_settings(struct radv_pipeline *pipeline,
+                            const struct radv_pipeline_key *pipeline_key,
+                            struct radv_shader_info *infos, nir_shader **nir)
+{
+   struct radv_device *device = pipeline->device;
+
+   if (!nir[MESA_SHADER_GEOMETRY] && pipeline->graphics.last_vgt_api_stage != MESA_SHADER_NONE) {
+      uint64_t ps_inputs_read =
+         nir[MESA_SHADER_FRAGMENT] ? nir[MESA_SHADER_FRAGMENT]->info.inputs_read : 0;
+      gl_shader_stage es_stage = pipeline->graphics.last_vgt_api_stage;
+
+      unsigned num_vertices_per_prim = si_conv_prim_to_gs_out(pipeline_key->vs.topology) + 1;
+      if (es_stage == MESA_SHADER_TESS_EVAL)
+         num_vertices_per_prim = nir[es_stage]->info.tess.point_mode ? 1
+                                 : nir[es_stage]->info.tess.primitive_mode == GL_ISOLINES ? 2
+                                                                                          : 3;
+
+      infos[es_stage].has_ngg_culling =
+         radv_consider_culling(device, nir[es_stage], ps_inputs_read, num_vertices_per_prim);
+   }
+}
+
 static void
 radv_fill_shader_info(struct radv_pipeline *pipeline,
                       const VkPipelineShaderStageCreateInfo **pStages,
@@ -3454,6 +3477,8 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_device *device,
          infos[hw_vs_api_stage].workgroup_size = infos[hw_vs_api_stage].wave_size;
    }
 
+   radv_determine_ngg_settings(pipeline, pipeline_key, infos, nir);
+
    for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
       if (nir[i]) {
         radv_start_feedback(stage_feedbacks[i]);
@@ -3518,11 +3543,8 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_device *device,
         bool io_to_mem = radv_lower_io_to_mem(device, nir[i], &infos[i], pipeline_key);
         bool lowered_ngg = pipeline_has_ngg && i == pipeline->graphics.last_vgt_api_stage &&
                            !radv_use_llvm_for_stage(device, i);
-        if (lowered_ngg) {
-           uint64_t ps_inputs_read = nir[MESA_SHADER_FRAGMENT] ? nir[MESA_SHADER_FRAGMENT]->info.inputs_read : 0;
-           bool consider_culling = radv_consider_culling(device, nir[i], ps_inputs_read);
-           radv_lower_ngg(device, nir[i], &infos[i], pipeline_key, consider_culling);
-        }
+        if (lowered_ngg)
+           radv_lower_ngg(device, nir[i], &infos[i], pipeline_key);
 
         radv_optimize_nir_algebraic(nir[i], io_to_mem || lowered_ngg || i == MESA_SHADER_COMPUTE);
 
diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
index 7319c4f59aa..1786370b7b8 100644
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -886,7 +886,7 @@ radv_lower_io_to_mem(struct radv_device *device, struct nir_shader *nir,
 }
 
 bool radv_consider_culling(struct radv_device *device, struct nir_shader *nir,
-                           uint64_t ps_inputs_read)
+                           uint64_t ps_inputs_read, unsigned num_vertices_per_primitive)
 {
    /* Culling doesn't make sense for meta shaders. */
    if (!!nir->info.name)
@@ -917,14 +917,34 @@ radv_consider_culling(struct radv_device *device, struct nir_shader *nir,
       max_ps_params = 4; /* Navi 1x. */
 
    /* TODO: consider other heuristics here, such as PS execution time */
+   if (util_bitcount64(ps_inputs_read & ~VARYING_BIT_POS) > max_ps_params)
+      return false;
 
-   return util_bitcount64(ps_inputs_read & ~VARYING_BIT_POS) <= max_ps_params;
+   /* Only triangle culling is supported. */
+   if (num_vertices_per_primitive != 3)
+      return false;
+
+   /* When the shader writes memory, it is difficult to guarantee correctness.
+    * Future work:
+    * - if only write-only SSBOs are used
+    * - if we can prove that non-position outputs don't rely on memory stores
+    *   then may be okay to keep the memory stores in the 1st shader part, and delete them from the 2nd.
+    */
+   if (nir->info.writes_memory)
+      return false;
+
+   /* When the shader relies on the subgroup invocation ID, we'd break it, because the ID changes after the culling.
+    * Future work: try to save this to LDS and reload, but it can still be broken in subtle ways.
+    */
+   if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_SUBGROUP_INVOCATION))
+      return false;
+
+   return true;
 }
 
 void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir,
                     struct radv_shader_info *info,
-                    const struct radv_pipeline_key *pl_key,
-                    bool consider_culling)
+                    const struct radv_pipeline_key *pl_key)
 {
    /* TODO: support the LLVM backend with the NIR lowering */
    assert(!radv_use_llvm_for_stage(device, nir->info.stage));
@@ -971,7 +991,7 @@ void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir,
 
    assert(info->is_ngg);
 
-   if (consider_culling)
+   if (info->has_ngg_culling)
       radv_optimize_nir_algebraic(nir, false);
 
    if (nir->info.stage == MESA_SHADER_VERTEX) {
@@ -987,14 +1007,13 @@ void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir,
                             num_vertices_per_prim,
                             info->workgroup_size,
                             info->wave_size,
-                            consider_culling,
+                            info->has_ngg_culling,
                             info->is_ngg_passthrough,
                             export_prim_id,
                             pl_key->vs.provoking_vtx_last,
                             false,
                             pl_key->vs.instance_rate_inputs);
 
-   info->has_ngg_culling = out_conf.can_cull;
    info->has_ngg_early_prim_export = out_conf.early_prim_export;
    info->num_lds_blocks_when_not_culling = DIV_ROUND_UP(out_conf.lds_bytes_if_culling_off, device->physical_device->rad_info.lds_encode_granularity);
    info->is_ngg_passthrough = out_conf.passthrough;
diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h
index a26b1c993dc..158aea71e83 100644
--- a/src/amd/vulkan/radv_shader.h
+++ b/src/amd/vulkan/radv_shader.h
@@ -530,11 +530,10 @@ bool radv_lower_io_to_mem(struct radv_device *device, struct nir_shader *nir,
 
 void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir,
                     struct radv_shader_info *info,
-                    const struct radv_pipeline_key *pl_key,
-                    bool consider_culling);
+                    const struct radv_pipeline_key *pl_key);
 
 bool radv_consider_culling(struct radv_device *device, struct nir_shader *nir,
-                           uint64_t ps_inputs_read);
+                           uint64_t ps_inputs_read, unsigned num_vertices_per_primitive);
 
 void radv_get_nir_options(struct radv_physical_device *device);
 
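Reviewer note: the gist of the reordering above is that the culling decision is now taken once in radv_create_shaders(), before the per-stage lowering loop, so infos[].has_ngg_culling is already final when radv_lower_ngg() runs, instead of only materializing as out_conf.can_cull after lowering. A minimal standalone sketch of that flow, with hypothetical stand-ins (stage_info, consider_culling, lower_ngg and create_shaders are made up for illustration, not Mesa API):

#include <stdbool.h>

enum stage { STAGE_VERTEX, STAGE_TESS_EVAL, STAGE_FRAGMENT, STAGE_COUNT };

/* Hypothetical stand-in for radv_shader_info. */
struct stage_info {
   bool has_ngg_culling;
};

/* Trivial stand-ins for the real driver entry points. */
static bool consider_culling(enum stage es_stage) { (void)es_stage; return true; }
static void lower_ngg(enum stage s, const struct stage_info *info) { (void)s; (void)info; }

static void
create_shaders(struct stage_info infos[STAGE_COUNT], enum stage last_vgt_stage)
{
   /* The culling decision is taken up front, while nothing has been lowered yet... */
   infos[last_vgt_stage].has_ngg_culling = consider_culling(last_vgt_stage);

   /* ...and the per-stage NGG lowering merely consumes the decision. */
   for (int s = 0; s < STAGE_COUNT; s++)
      lower_ngg((enum stage)s, &infos[s]);
}

int main(void)
{
   struct stage_info infos[STAGE_COUNT] = {0};
   create_shaders(infos, STAGE_TESS_EVAL);
   return infos[STAGE_TESS_EVAL].has_ngg_culling ? 0 : 1;
}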
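The checks that used to live in can_use_deferred_attribute_culling() now sit in radv_consider_culling() next to the PS-parameter and triangle checks. Below is a compilable sketch of the combined heuristic; shader_profile, gpu_generation and should_use_ngg_culling are illustrative stand-ins, __builtin_popcountll (GCC/Clang) stands in for util_bitcount64(), and the non-Navi-1x parameter limit is a placeholder since only the Navi 1x value of 4 appears in this patch:

/* Standalone sketch of the consolidated radv_consider_culling() logic.
 * Only the individual checks come from the patch; all names are stand-ins.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define VARYING_BIT_POS (1ull << 0) /* position slot, as in Mesa's VARYING_BIT_POS */

enum gpu_generation { NAVI_1X, NEWER };

struct shader_profile {
   bool is_meta_shader;            /* internal meta shaders are never culled */
   uint64_t ps_inputs_read;        /* varyings consumed by the fragment shader */
   unsigned num_vertices_per_prim; /* 1 = points, 2 = lines, 3 = triangles */
   bool writes_memory;             /* SSBO/image stores etc. */
   bool reads_subgroup_invocation; /* uses the subgroup invocation ID */
};

static bool
should_use_ngg_culling(const struct shader_profile *p, enum gpu_generation gen)
{
   /* Culling doesn't make sense for meta shaders. */
   if (p->is_meta_shader)
      return false;

   /* Culling only pays off when the PS reads few parameters; 4 is the
    * Navi 1x limit from the patch, the other value is a placeholder. */
   unsigned max_ps_params = (gen == NAVI_1X) ? 4 : 8;
   if (__builtin_popcountll(p->ps_inputs_read & ~VARYING_BIT_POS) > max_ps_params)
      return false;

   /* Only triangle culling is supported. */
   if (p->num_vertices_per_prim != 3)
      return false;

   /* When the shader writes memory, correctness is hard to guarantee
    * because the lowering splits the shader into two parts. */
   if (p->writes_memory)
      return false;

   /* The subgroup invocation ID changes after culling, which would break
    * any shader that depends on it. */
   if (p->reads_subgroup_invocation)
      return false;

   return true;
}

int main(void)
{
   /* A triangle pipeline whose PS reads position plus two varyings. */
   struct shader_profile p = {
      .ps_inputs_read = VARYING_BIT_POS | (1ull << 31) | (1ull << 32),
      .num_vertices_per_prim = 3,
   };
   printf("use NGG culling: %d\n", should_use_ngg_culling(&p, NEWER));
   return 0;
}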