radv: move ngg culling determination earlier
Co-Authored-by: Samuel Pitoiset <samuel.pitoiset@gmail.com> Signed-off-by: Rhys Perry <pendingchaos02@gmail.com> Reviewed-by: Timur Kristóf <timur.kristof@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13134>
This commit is contained in:
parent
5896bf41ca
commit
24501b5452
|
@ -94,7 +94,6 @@ ac_nir_lower_indirect_derefs(nir_shader *shader,
|
||||||
typedef struct
|
typedef struct
|
||||||
{
|
{
|
||||||
unsigned lds_bytes_if_culling_off;
|
unsigned lds_bytes_if_culling_off;
|
||||||
bool can_cull;
|
|
||||||
bool passthrough;
|
bool passthrough;
|
||||||
bool early_prim_export;
|
bool early_prim_export;
|
||||||
uint64_t nggc_inputs_read_by_pos;
|
uint64_t nggc_inputs_read_by_pos;
|
||||||
|
@ -107,7 +106,7 @@ ac_nir_lower_ngg_nogs(nir_shader *shader,
|
||||||
unsigned num_vertices_per_primitive,
|
unsigned num_vertices_per_primitive,
|
||||||
unsigned max_workgroup_size,
|
unsigned max_workgroup_size,
|
||||||
unsigned wave_size,
|
unsigned wave_size,
|
||||||
bool consider_culling,
|
bool can_cull,
|
||||||
bool consider_passthrough,
|
bool consider_passthrough,
|
||||||
bool export_prim_id,
|
bool export_prim_id,
|
||||||
bool provoking_vtx_last,
|
bool provoking_vtx_last,
|
||||||
|
|
|
@ -1254,34 +1254,13 @@ add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_c
|
||||||
unreachable("Should be VS or TES.");
|
unreachable("Should be VS or TES.");
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool
|
|
||||||
can_use_deferred_attribute_culling(nir_shader *shader)
|
|
||||||
{
|
|
||||||
/* When the shader writes memory, it is difficult to guarantee correctness.
|
|
||||||
* Future work:
|
|
||||||
* - if only write-only SSBOs are used
|
|
||||||
* - if we can prove that non-position outputs don't rely on memory stores
|
|
||||||
* then may be okay to keep the memory stores in the 1st shader part, and delete them from the 2nd.
|
|
||||||
*/
|
|
||||||
if (shader->info.writes_memory)
|
|
||||||
return false;
|
|
||||||
|
|
||||||
/* When the shader relies on the subgroup invocation ID, we'd break it, because the ID changes after the culling.
|
|
||||||
* Future work: try to save this to LDS and reload, but it can still be broken in subtle ways.
|
|
||||||
*/
|
|
||||||
if (BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SUBGROUP_INVOCATION))
|
|
||||||
return false;
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
ac_nir_ngg_config
|
ac_nir_ngg_config
|
||||||
ac_nir_lower_ngg_nogs(nir_shader *shader,
|
ac_nir_lower_ngg_nogs(nir_shader *shader,
|
||||||
unsigned max_num_es_vertices,
|
unsigned max_num_es_vertices,
|
||||||
unsigned num_vertices_per_primitives,
|
unsigned num_vertices_per_primitives,
|
||||||
unsigned max_workgroup_size,
|
unsigned max_workgroup_size,
|
||||||
unsigned wave_size,
|
unsigned wave_size,
|
||||||
bool consider_culling,
|
bool can_cull,
|
||||||
bool consider_passthrough,
|
bool consider_passthrough,
|
||||||
bool export_prim_id,
|
bool export_prim_id,
|
||||||
bool provoking_vtx_last,
|
bool provoking_vtx_last,
|
||||||
|
@ -1292,8 +1271,6 @@ ac_nir_lower_ngg_nogs(nir_shader *shader,
|
||||||
assert(impl);
|
assert(impl);
|
||||||
assert(max_num_es_vertices && max_workgroup_size && wave_size);
|
assert(max_num_es_vertices && max_workgroup_size && wave_size);
|
||||||
|
|
||||||
bool can_cull = consider_culling && (num_vertices_per_primitives == 3) &&
|
|
||||||
can_use_deferred_attribute_culling(shader);
|
|
||||||
bool passthrough = consider_passthrough && !can_cull &&
|
bool passthrough = consider_passthrough && !can_cull &&
|
||||||
!(shader->info.stage == MESA_SHADER_VERTEX && export_prim_id);
|
!(shader->info.stage == MESA_SHADER_VERTEX && export_prim_id);
|
||||||
|
|
||||||
|
@ -1441,7 +1418,6 @@ ac_nir_lower_ngg_nogs(nir_shader *shader,
|
||||||
|
|
||||||
ac_nir_ngg_config ret = {
|
ac_nir_ngg_config ret = {
|
||||||
.lds_bytes_if_culling_off = lds_bytes_if_culling_off,
|
.lds_bytes_if_culling_off = lds_bytes_if_culling_off,
|
||||||
.can_cull = can_cull,
|
|
||||||
.passthrough = passthrough,
|
.passthrough = passthrough,
|
||||||
.early_prim_export = state.early_prim_export,
|
.early_prim_export = state.early_prim_export,
|
||||||
.nggc_inputs_read_by_pos = state.inputs_needed_by_pos,
|
.nggc_inputs_read_by_pos = state.inputs_needed_by_pos,
|
||||||
|
|
|
@ -2763,6 +2763,29 @@ radv_get_ballot_bit_size(struct radv_device *device, const VkPipelineShaderStage
|
||||||
return 64;
|
return 64;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
radv_determine_ngg_settings(struct radv_pipeline *pipeline,
|
||||||
|
const struct radv_pipeline_key *pipeline_key,
|
||||||
|
struct radv_shader_info *infos, nir_shader **nir)
|
||||||
|
{
|
||||||
|
struct radv_device *device = pipeline->device;
|
||||||
|
|
||||||
|
if (!nir[MESA_SHADER_GEOMETRY] && pipeline->graphics.last_vgt_api_stage != MESA_SHADER_NONE) {
|
||||||
|
uint64_t ps_inputs_read =
|
||||||
|
nir[MESA_SHADER_FRAGMENT] ? nir[MESA_SHADER_FRAGMENT]->info.inputs_read : 0;
|
||||||
|
gl_shader_stage es_stage = pipeline->graphics.last_vgt_api_stage;
|
||||||
|
|
||||||
|
unsigned num_vertices_per_prim = si_conv_prim_to_gs_out(pipeline_key->vs.topology) + 1;
|
||||||
|
if (es_stage == MESA_SHADER_TESS_EVAL)
|
||||||
|
num_vertices_per_prim = nir[es_stage]->info.tess.point_mode ? 1
|
||||||
|
: nir[es_stage]->info.tess.primitive_mode == GL_ISOLINES ? 2
|
||||||
|
: 3;
|
||||||
|
|
||||||
|
infos[es_stage].has_ngg_culling =
|
||||||
|
radv_consider_culling(device, nir[es_stage], ps_inputs_read, num_vertices_per_prim);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
radv_fill_shader_info(struct radv_pipeline *pipeline,
|
radv_fill_shader_info(struct radv_pipeline *pipeline,
|
||||||
const VkPipelineShaderStageCreateInfo **pStages,
|
const VkPipelineShaderStageCreateInfo **pStages,
|
||||||
|
@ -3454,6 +3477,8 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_device *device,
|
||||||
infos[hw_vs_api_stage].workgroup_size = infos[hw_vs_api_stage].wave_size;
|
infos[hw_vs_api_stage].workgroup_size = infos[hw_vs_api_stage].wave_size;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
radv_determine_ngg_settings(pipeline, pipeline_key, infos, nir);
|
||||||
|
|
||||||
for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
|
for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
|
||||||
if (nir[i]) {
|
if (nir[i]) {
|
||||||
radv_start_feedback(stage_feedbacks[i]);
|
radv_start_feedback(stage_feedbacks[i]);
|
||||||
|
@ -3518,11 +3543,8 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_device *device,
|
||||||
bool io_to_mem = radv_lower_io_to_mem(device, nir[i], &infos[i], pipeline_key);
|
bool io_to_mem = radv_lower_io_to_mem(device, nir[i], &infos[i], pipeline_key);
|
||||||
bool lowered_ngg = pipeline_has_ngg && i == pipeline->graphics.last_vgt_api_stage &&
|
bool lowered_ngg = pipeline_has_ngg && i == pipeline->graphics.last_vgt_api_stage &&
|
||||||
!radv_use_llvm_for_stage(device, i);
|
!radv_use_llvm_for_stage(device, i);
|
||||||
if (lowered_ngg) {
|
if (lowered_ngg)
|
||||||
uint64_t ps_inputs_read = nir[MESA_SHADER_FRAGMENT] ? nir[MESA_SHADER_FRAGMENT]->info.inputs_read : 0;
|
radv_lower_ngg(device, nir[i], &infos[i], pipeline_key);
|
||||||
bool consider_culling = radv_consider_culling(device, nir[i], ps_inputs_read);
|
|
||||||
radv_lower_ngg(device, nir[i], &infos[i], pipeline_key, consider_culling);
|
|
||||||
}
|
|
||||||
|
|
||||||
radv_optimize_nir_algebraic(nir[i], io_to_mem || lowered_ngg || i == MESA_SHADER_COMPUTE);
|
radv_optimize_nir_algebraic(nir[i], io_to_mem || lowered_ngg || i == MESA_SHADER_COMPUTE);
|
||||||
|
|
||||||
|
|
|
@ -886,7 +886,7 @@ radv_lower_io_to_mem(struct radv_device *device, struct nir_shader *nir,
|
||||||
|
|
||||||
bool
|
bool
|
||||||
radv_consider_culling(struct radv_device *device, struct nir_shader *nir,
|
radv_consider_culling(struct radv_device *device, struct nir_shader *nir,
|
||||||
uint64_t ps_inputs_read)
|
uint64_t ps_inputs_read, unsigned num_vertices_per_primitive)
|
||||||
{
|
{
|
||||||
/* Culling doesn't make sense for meta shaders. */
|
/* Culling doesn't make sense for meta shaders. */
|
||||||
if (!!nir->info.name)
|
if (!!nir->info.name)
|
||||||
|
@ -917,14 +917,34 @@ radv_consider_culling(struct radv_device *device, struct nir_shader *nir,
|
||||||
max_ps_params = 4; /* Navi 1x. */
|
max_ps_params = 4; /* Navi 1x. */
|
||||||
|
|
||||||
/* TODO: consider other heuristics here, such as PS execution time */
|
/* TODO: consider other heuristics here, such as PS execution time */
|
||||||
|
if (util_bitcount64(ps_inputs_read & ~VARYING_BIT_POS) > max_ps_params)
|
||||||
|
return false;
|
||||||
|
|
||||||
return util_bitcount64(ps_inputs_read & ~VARYING_BIT_POS) <= max_ps_params;
|
/* Only triangle culling is supported. */
|
||||||
|
if (num_vertices_per_primitive != 3)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
/* When the shader writes memory, it is difficult to guarantee correctness.
|
||||||
|
* Future work:
|
||||||
|
* - if only write-only SSBOs are used
|
||||||
|
* - if we can prove that non-position outputs don't rely on memory stores
|
||||||
|
* then may be okay to keep the memory stores in the 1st shader part, and delete them from the 2nd.
|
||||||
|
*/
|
||||||
|
if (nir->info.writes_memory)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
/* When the shader relies on the subgroup invocation ID, we'd break it, because the ID changes after the culling.
|
||||||
|
* Future work: try to save this to LDS and reload, but it can still be broken in subtle ways.
|
||||||
|
*/
|
||||||
|
if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_SUBGROUP_INVOCATION))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir,
|
void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir,
|
||||||
struct radv_shader_info *info,
|
struct radv_shader_info *info,
|
||||||
const struct radv_pipeline_key *pl_key,
|
const struct radv_pipeline_key *pl_key)
|
||||||
bool consider_culling)
|
|
||||||
{
|
{
|
||||||
/* TODO: support the LLVM backend with the NIR lowering */
|
/* TODO: support the LLVM backend with the NIR lowering */
|
||||||
assert(!radv_use_llvm_for_stage(device, nir->info.stage));
|
assert(!radv_use_llvm_for_stage(device, nir->info.stage));
|
||||||
|
@ -971,7 +991,7 @@ void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir,
|
||||||
|
|
||||||
assert(info->is_ngg);
|
assert(info->is_ngg);
|
||||||
|
|
||||||
if (consider_culling)
|
if (info->has_ngg_culling)
|
||||||
radv_optimize_nir_algebraic(nir, false);
|
radv_optimize_nir_algebraic(nir, false);
|
||||||
|
|
||||||
if (nir->info.stage == MESA_SHADER_VERTEX) {
|
if (nir->info.stage == MESA_SHADER_VERTEX) {
|
||||||
|
@ -987,14 +1007,13 @@ void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir,
|
||||||
num_vertices_per_prim,
|
num_vertices_per_prim,
|
||||||
info->workgroup_size,
|
info->workgroup_size,
|
||||||
info->wave_size,
|
info->wave_size,
|
||||||
consider_culling,
|
info->has_ngg_culling,
|
||||||
info->is_ngg_passthrough,
|
info->is_ngg_passthrough,
|
||||||
export_prim_id,
|
export_prim_id,
|
||||||
pl_key->vs.provoking_vtx_last,
|
pl_key->vs.provoking_vtx_last,
|
||||||
false,
|
false,
|
||||||
pl_key->vs.instance_rate_inputs);
|
pl_key->vs.instance_rate_inputs);
|
||||||
|
|
||||||
info->has_ngg_culling = out_conf.can_cull;
|
|
||||||
info->has_ngg_early_prim_export = out_conf.early_prim_export;
|
info->has_ngg_early_prim_export = out_conf.early_prim_export;
|
||||||
info->num_lds_blocks_when_not_culling = DIV_ROUND_UP(out_conf.lds_bytes_if_culling_off, device->physical_device->rad_info.lds_encode_granularity);
|
info->num_lds_blocks_when_not_culling = DIV_ROUND_UP(out_conf.lds_bytes_if_culling_off, device->physical_device->rad_info.lds_encode_granularity);
|
||||||
info->is_ngg_passthrough = out_conf.passthrough;
|
info->is_ngg_passthrough = out_conf.passthrough;
|
||||||
|
|
|
@ -530,11 +530,10 @@ bool radv_lower_io_to_mem(struct radv_device *device, struct nir_shader *nir,
|
||||||
|
|
||||||
void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir,
|
void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir,
|
||||||
struct radv_shader_info *info,
|
struct radv_shader_info *info,
|
||||||
const struct radv_pipeline_key *pl_key,
|
const struct radv_pipeline_key *pl_key);
|
||||||
bool consider_culling);
|
|
||||||
|
|
||||||
bool radv_consider_culling(struct radv_device *device, struct nir_shader *nir,
|
bool radv_consider_culling(struct radv_device *device, struct nir_shader *nir,
|
||||||
uint64_t ps_inputs_read);
|
uint64_t ps_inputs_read, unsigned num_vertices_per_primitive);
|
||||||
|
|
||||||
void radv_get_nir_options(struct radv_physical_device *device);
|
void radv_get_nir_options(struct radv_physical_device *device);
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue