radv: Support NGG culling with new perftest environment variable.

Currently we don't enable it on any chip by default, but
we plan to enable it soon on GFX10.3 when we are comfortable
with its performance.

RADV_PERFTEST=nggc environment variable enables it on GFX10+ GPUs.

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10525>
This commit is contained in:
Timur Kristóf 2021-06-08 12:32:35 +02:00 committed by Marge Bot
parent 182d9b1e60
commit f30e4351de
8 changed files with 64 additions and 5 deletions

View File

@ -647,6 +647,8 @@ RADV driver environment variables
disable optimizations that get enabled when all VRAM is CPU visible.
``pswave32``
enable wave32 for pixel shaders (GFX10+)
``nggc``
enable NGG culling on GFX10+ GPUs.
``rt``
enable rt extensions whose implementation is still experimental.
``sam``

View File

@ -15,6 +15,7 @@ VK_EXT_multi_draw on ANV, lavapipe, and RADV
VK_KHR_separate_depth_stencil_layouts on lavapipe
VK_EXT_separate_stencil_usage on lavapipe
VK_EXT_extended_dynamic_state2 on lavapipe
NGG shader based primitive culling is now supported by RADV.
Panfrost supports OpenGL ES 3.1
New Asahi driver for the Apple M1
GL_ARB_sample_locations on zink

View File

@ -74,6 +74,7 @@ enum {
RADV_PERFTEST_NO_SAM = 1u << 6,
RADV_PERFTEST_SAM = 1u << 7,
RADV_PERFTEST_RT = 1u << 8,
RADV_PERFTEST_NGGC = 1u << 9,
};
bool radv_init_trace(struct radv_device *device);

View File

@ -830,6 +830,7 @@ static const struct debug_control radv_perftest_options[] = {{"localbos", RADV_P
{"nosam", RADV_PERFTEST_NO_SAM},
{"sam", RADV_PERFTEST_SAM},
{"rt", RADV_PERFTEST_RT},
{"nggc", RADV_PERFTEST_NGGC},
{NULL, 0}};
const char *

View File

@ -211,6 +211,8 @@ get_hash_flags(const struct radv_device *device, bool stats)
if (device->instance->debug_flags & RADV_DEBUG_NO_NGG)
hash_flags |= RADV_HASH_SHADER_NO_NGG;
if (device->instance->perftest_flags & RADV_PERFTEST_NGGC)
hash_flags |= RADV_HASH_SHADER_FORCE_NGG_CULLING;
if (device->physical_device->cs_wave_size == 32)
hash_flags |= RADV_HASH_SHADER_CS_WAVE32;
if (device->physical_device->ps_wave_size == 32)
@ -3451,8 +3453,11 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_device *device,
bool io_to_mem = radv_lower_io_to_mem(device, nir[i], &infos[i], pipeline_key);
bool lowered_ngg = pipeline_has_ngg && i == pipeline->graphics.last_vgt_api_stage &&
!radv_use_llvm_for_stage(device, i);
if (lowered_ngg)
radv_lower_ngg(device, nir[i], &infos[i], pipeline_key, &keys[i]);
if (lowered_ngg) {
uint64_t ps_inputs_read = nir[MESA_SHADER_FRAGMENT] ? nir[MESA_SHADER_FRAGMENT]->info.inputs_read : 0;
bool consider_culling = radv_consider_culling(device, nir[i], ps_inputs_read);
radv_lower_ngg(device, nir[i], &infos[i], pipeline_key, &keys[i], consider_culling);
}
radv_optimize_nir_algebraic(nir[i], io_to_mem || lowered_ngg || i == MESA_SHADER_COMPUTE);

View File

@ -1672,6 +1672,7 @@ struct radv_event {
#define RADV_HASH_SHADER_FORCE_VRS_2x2 (1 << 9)
#define RADV_HASH_SHADER_FORCE_VRS_2x1 (1 << 10)
#define RADV_HASH_SHADER_FORCE_VRS_1x2 (1 << 11)
#define RADV_HASH_SHADER_FORCE_NGG_CULLING (1 << 13)
void radv_hash_shaders(unsigned char *hash, const VkPipelineShaderStageCreateInfo **stages,
const struct radv_pipeline_layout *layout,

View File

@ -907,10 +907,44 @@ radv_lower_io_to_mem(struct radv_device *device, struct nir_shader *nir,
return false;
}
bool
radv_consider_culling(struct radv_device *device, struct nir_shader *nir,
uint64_t ps_inputs_read)
{
/* Culling doesn't make sense for meta shaders. */
if (!!nir->info.name)
return false;
/* TODO: enable by default on GFX10.3 when we're confident about performance. */
bool culling_enabled = device->instance->perftest_flags & RADV_PERFTEST_NGGC;
if (!culling_enabled)
return false;
/* Shader based culling efficiency can depend on PS throughput.
* Estimate an upper limit for PS input param count based on GPU info.
*/
unsigned max_ps_params;
unsigned max_render_backends = device->physical_device->rad_info.max_render_backends;
unsigned max_se = device->physical_device->rad_info.max_se;
if (max_render_backends < 2)
return false; /* Don't use NGG culling on 1 RB chips. */
else if (max_render_backends / max_se == 4)
max_ps_params = 6; /* Sienna Cichlid and other GFX10.3 dGPUs. */
else
max_ps_params = 4; /* Navi 1x. */
/* TODO: consider other heuristics here, such as PS execution time */
return util_bitcount64(ps_inputs_read & ~VARYING_BIT_POS) <= max_ps_params;
}
void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir,
struct radv_shader_info *info,
const struct radv_pipeline_key *pl_key,
struct radv_shader_variant_key *key)
struct radv_shader_variant_key *key,
bool consider_culling)
{
/* TODO: support the LLVM backend with the NIR lowering */
assert(!radv_use_llvm_for_stage(device, nir->info.stage));
@ -930,9 +964,19 @@ void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir,
num_vertices_per_prim = 1;
else if (nir->info.tess.primitive_mode == GL_ISOLINES)
num_vertices_per_prim = 2;
/* Manually mark the primitive ID used, so the shader can repack it. */
if (key->vs_common_out.export_prim_id)
BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);
} else if (nir->info.stage == MESA_SHADER_VERTEX) {
/* Need to add 1, because: V_028A6C_POINTLIST=0, V_028A6C_LINESTRIP=1, V_028A6C_TRISTRIP=2, etc. */
num_vertices_per_prim = key->vs.outprim + 1;
/* Manually mark the instance ID used, so the shader can repack it. */
if (key->vs.instance_rate_inputs)
BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID);
} else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
num_vertices_per_prim = nir->info.gs.vertices_in;
} else {
@ -964,7 +1008,7 @@ void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir,
num_vertices_per_prim,
max_workgroup_size,
info->wave_size,
false,
consider_culling,
key->vs_common_out.as_ngg_passthrough,
key->vs_common_out.export_prim_id,
key->vs.provoking_vtx_last);

View File

@ -569,6 +569,10 @@ bool radv_lower_io_to_mem(struct radv_device *device, struct nir_shader *nir,
void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir,
struct radv_shader_info *info,
const struct radv_pipeline_key *pl_key,
struct radv_shader_variant_key *key);
struct radv_shader_variant_key *key,
bool consider_culling);
bool radv_consider_culling(struct radv_device *device, struct nir_shader *nir,
uint64_t ps_inputs_read);
#endif