radeonsi: implement GS fast launch for indexed triangle strips
This increases performance for indexed triangle strips by up to 100%. In practice, the gain is limited by memory bandwidth and compute power, so a 256-bit memory bus and a lot of CUs are recommended.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7681>
This commit is contained in:
parent
4cce4d22a7
commit
aaed7a29be
|
@ -1582,6 +1582,8 @@ static void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num
|
||||||
!!(shader_out->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST);
|
!!(shader_out->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST);
|
||||||
key->vs_prolog.gs_fast_launch_tri_strip =
|
key->vs_prolog.gs_fast_launch_tri_strip =
|
||||||
!!(shader_out->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP);
|
!!(shader_out->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP);
|
||||||
|
key->vs_prolog.gs_fast_launch_index_size_packed =
|
||||||
|
SI_GET_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(shader_out->key.opt.ngg_culling);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (shader_out->selector->info.stage == MESA_SHADER_TESS_CTRL) {
|
if (shader_out->selector->info.stage == MESA_SHADER_TESS_CTRL) {
|
||||||
|
@ -2056,7 +2058,8 @@ si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list,
|
||||||
shader.key.as_ngg = key->vs_prolog.as_ngg;
|
shader.key.as_ngg = key->vs_prolog.as_ngg;
|
||||||
shader.key.opt.ngg_culling =
|
shader.key.opt.ngg_culling =
|
||||||
(key->vs_prolog.gs_fast_launch_tri_list ? SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST : 0) |
|
(key->vs_prolog.gs_fast_launch_tri_list ? SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST : 0) |
|
||||||
(key->vs_prolog.gs_fast_launch_tri_strip ? SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP : 0);
|
(key->vs_prolog.gs_fast_launch_tri_strip ? SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP : 0) |
|
||||||
|
SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(key->vs_prolog.gs_fast_launch_index_size_packed);
|
||||||
shader.key.opt.vs_as_prim_discard_cs = key->vs_prolog.as_prim_discard_cs;
|
shader.key.opt.vs_as_prim_discard_cs = key->vs_prolog.as_prim_discard_cs;
|
||||||
break;
|
break;
|
||||||
case MESA_SHADER_TESS_CTRL:
|
case MESA_SHADER_TESS_CTRL:
|
||||||
|
|
|
@ -278,7 +278,9 @@ enum
|
||||||
#define SI_NGG_CULL_FRONT_FACE (1 << 2) /* front faces */
|
#define SI_NGG_CULL_FRONT_FACE (1 << 2) /* front faces */
|
||||||
#define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST (1 << 3) /* GS fast launch: triangles */
|
#define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST (1 << 3) /* GS fast launch: triangles */
|
||||||
#define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP (1 << 4) /* GS fast launch: triangle strip */
|
#define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP (1 << 4) /* GS fast launch: triangle strip */
|
||||||
#define SI_NGG_CULL_GS_FAST_LAUNCH_ALL (0x3 << 3) /* GS fast launch (both prim types) */
|
#define SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(x) (((x) & 0x3) << 5) /* 2-bit field in bits 5-6; packed value -> index size in bytes: 0->0 (not indexed), 1->1, 2->2, 3->4 */
|
||||||
|
#define SI_GET_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(x) (((x) >> 5) & 0x3) /* extract the 2-bit packed index size (bits 5-6) from an SI_NGG_CULL_* mask */
|
||||||
|
#define SI_NGG_CULL_GS_FAST_LAUNCH_ALL (0xf << 3) /* all GS fast launch bits (3-6): both prim types + packed index size */
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* For VS shader keys, describe any fixups required for vertex fetch.
|
* For VS shader keys, describe any fixups required for vertex fetch.
|
||||||
|
@ -559,6 +561,7 @@ union si_shader_part_key {
|
||||||
unsigned as_prim_discard_cs : 1;
|
unsigned as_prim_discard_cs : 1;
|
||||||
unsigned gs_fast_launch_tri_list : 1; /* for NGG culling */
|
unsigned gs_fast_launch_tri_list : 1; /* for NGG culling */
|
||||||
unsigned gs_fast_launch_tri_strip : 1; /* for NGG culling */
|
unsigned gs_fast_launch_tri_strip : 1; /* for NGG culling */
|
||||||
|
unsigned gs_fast_launch_index_size_packed : 2;
|
||||||
/* Prologs for monolithic shaders shouldn't set EXEC. */
|
/* Prologs for monolithic shaders shouldn't set EXEC. */
|
||||||
unsigned is_monolithic : 1;
|
unsigned is_monolithic : 1;
|
||||||
} vs_prolog;
|
} vs_prolog;
|
||||||
|
@ -652,7 +655,7 @@ struct si_shader_key {
|
||||||
unsigned kill_pointsize : 1;
|
unsigned kill_pointsize : 1;
|
||||||
|
|
||||||
/* For NGG VS and TES. */
|
/* For NGG VS and TES. */
|
||||||
unsigned ngg_culling : 5; /* SI_NGG_CULL_* */
|
unsigned ngg_culling : 7; /* SI_NGG_CULL_* */
|
||||||
|
|
||||||
/* For shaders where monolithic variants have better code.
|
/* For shaders where monolithic variants have better code.
|
||||||
*
|
*
|
||||||
|
|
|
@ -923,6 +923,48 @@ void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part
|
||||||
input_vgprs[5] =
|
input_vgprs[5] =
|
||||||
LLVMBuildAdd(ctx->ac.builder, input_vgprs[5], thread_id_in_tg, ""); /* VertexID */
|
LLVMBuildAdd(ctx->ac.builder, input_vgprs[5], thread_id_in_tg, ""); /* VertexID */
|
||||||
input_vgprs[8] = input_vgprs[6]; /* InstanceID */
|
input_vgprs[8] = input_vgprs[6]; /* InstanceID */
|
||||||
|
|
||||||
|
if (key->vs_prolog.gs_fast_launch_index_size_packed) {
|
||||||
|
LLVMTypeRef index_type = ctx->ac.voidt;
|
||||||
|
|
||||||
|
switch (key->vs_prolog.gs_fast_launch_index_size_packed) {
|
||||||
|
case 1:
|
||||||
|
index_type = ctx->ac.i8;
|
||||||
|
break;
|
||||||
|
case 2:
|
||||||
|
index_type = ctx->ac.i16;
|
||||||
|
break;
|
||||||
|
case 3:
|
||||||
|
index_type = ctx->ac.i32;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
unreachable("invalid gs_fast_launch_index_size_packed");
|
||||||
|
}
|
||||||
|
|
||||||
|
LLVMValueRef sgprs[2] = {
|
||||||
|
ac_get_arg(&ctx->ac, input_sgpr_param[0]),
|
||||||
|
ac_get_arg(&ctx->ac, input_sgpr_param[1]),
|
||||||
|
};
|
||||||
|
LLVMValueRef indices = ac_build_gather_values(&ctx->ac, sgprs, 2);
|
||||||
|
indices = LLVMBuildBitCast(ctx->ac.builder, indices, ctx->ac.i64, "");
|
||||||
|
indices = LLVMBuildIntToPtr(ctx->ac.builder, indices,
|
||||||
|
LLVMPointerType(index_type, AC_ADDR_SPACE_CONST), "");
|
||||||
|
|
||||||
|
LLVMValueRef vertex_id = ac_build_alloca_init(&ctx->ac, input_vgprs[5], "");
|
||||||
|
|
||||||
|
/* if (is ES thread...) */
|
||||||
|
ac_build_ifcc(&ctx->ac,
|
||||||
|
LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, ac_get_thread_id(&ctx->ac),
|
||||||
|
si_unpack_param(ctx, merged_wave_info, 0, 8), ""), 0);
|
||||||
|
/* VertexID = indexBufferLoad(VertexID); */
|
||||||
|
LLVMValueRef index = LLVMBuildGEP(ctx->ac.builder, indices, &input_vgprs[5], 1, "");
|
||||||
|
index = LLVMBuildLoad(ctx->ac.builder, index, "");
|
||||||
|
index = LLVMBuildZExt(ctx->ac.builder, index, ctx->ac.i32, "");
|
||||||
|
LLVMBuildStore(ctx->ac.builder, index, vertex_id);
|
||||||
|
ac_build_endif(&ctx->ac, 0);
|
||||||
|
|
||||||
|
input_vgprs[5] = LLVMBuildLoad(ctx->ac.builder, vertex_id, "");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned vertex_id_vgpr = first_vs_vgpr;
|
unsigned vertex_id_vgpr = first_vs_vgpr;
|
||||||
|
|
|
@ -1020,6 +1020,38 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
|
||||||
!(sctx->ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL)));
|
!(sctx->ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL)));
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
/* Set the index buffer for fast launch. The VS prolog will load the indices. */
|
||||||
|
if (sctx->ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(~0)) {
|
||||||
|
index_max_size = (indexbuf->width0 - index_offset) >> util_logbase2(original_index_size);
|
||||||
|
|
||||||
|
radeon_add_to_buffer_list(sctx, sctx->gfx_cs, si_resource(indexbuf),
|
||||||
|
RADEON_USAGE_READ, RADEON_PRIO_INDEX_BUFFER);
|
||||||
|
uint64_t base_index_va = si_resource(indexbuf)->gpu_address + index_offset;
|
||||||
|
|
||||||
|
for (unsigned i = 0; i < num_draws; i++) {
|
||||||
|
uint64_t index_va = base_index_va + draws[i].start * original_index_size;
|
||||||
|
|
||||||
|
radeon_set_sh_reg_seq(cs, R_00B208_SPI_SHADER_USER_DATA_ADDR_LO_GS, 2);
|
||||||
|
radeon_emit(cs, index_va);
|
||||||
|
radeon_emit(cs, index_va >> 32);
|
||||||
|
|
||||||
|
if (i > 0) {
|
||||||
|
if (info->increment_draw_id) {
|
||||||
|
unsigned draw_id = info->drawid + i;
|
||||||
|
|
||||||
|
radeon_set_sh_reg(cs, sh_base_reg + SI_SGPR_DRAWID * 4, draw_id);
|
||||||
|
sctx->last_drawid = draw_id;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* TODO: Do index buffer bounds checking? We don't do it in this case. */
|
||||||
|
radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, render_cond_bit));
|
||||||
|
radeon_emit(cs, draws[i].count);
|
||||||
|
radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX);
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
for (unsigned i = 0; i < num_draws; i++) {
|
for (unsigned i = 0; i < num_draws; i++) {
|
||||||
if (i > 0) {
|
if (i > 0) {
|
||||||
if (info->increment_draw_id) {
|
if (info->increment_draw_id) {
|
||||||
|
@ -2131,15 +2163,19 @@ static void si_draw_vbo(struct pipe_context *ctx,
|
||||||
ngg_culling |= SI_NGG_CULL_BACK_FACE;
|
ngg_culling |= SI_NGG_CULL_BACK_FACE;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Use NGG fast launch for certain non-indexed primitive types.
|
/* Use NGG fast launch for certain primitive types.
|
||||||
* A draw must have at least 1 full primitive.
|
* A draw must have at least 1 full primitive.
|
||||||
*/
|
*/
|
||||||
if (ngg_culling && !index_size && min_direct_count >= 3 && !sctx->tes_shader.cso &&
|
if (ngg_culling && min_direct_count >= 3 && !sctx->tes_shader.cso &&
|
||||||
!sctx->gs_shader.cso) {
|
!sctx->gs_shader.cso) {
|
||||||
if (prim == PIPE_PRIM_TRIANGLES)
|
if (prim == PIPE_PRIM_TRIANGLES && !index_size) {
|
||||||
ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST;
|
ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST;
|
||||||
else if (prim == PIPE_PRIM_TRIANGLE_STRIP)
|
} else if (prim == PIPE_PRIM_TRIANGLE_STRIP && !primitive_restart) {
|
||||||
ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP;
|
ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP |
|
||||||
|
SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(MIN2(index_size, 3));
|
||||||
|
/* The index buffer will be emulated. */
|
||||||
|
index_size = 0;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ngg_culling != old_ngg_culling) {
|
if (ngg_culling != old_ngg_culling) {
|
||||||
|
@ -2175,6 +2211,15 @@ static void si_draw_vbo(struct pipe_context *ctx,
|
||||||
ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL)
|
ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL)
|
||||||
sctx->flags |= SI_CONTEXT_VGT_FLUSH;
|
sctx->flags |= SI_CONTEXT_VGT_FLUSH;
|
||||||
|
|
||||||
|
if (old_ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(~0) &&
|
||||||
|
!(ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(~0))) {
|
||||||
|
/* Need to re-set these, because we have bound an index buffer there. */
|
||||||
|
sctx->shader_pointers_dirty |=
|
||||||
|
(1u << si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_GEOMETRY)) |
|
||||||
|
(1u << si_sampler_and_image_descriptors_idx(PIPE_SHADER_GEOMETRY));
|
||||||
|
si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
|
||||||
|
}
|
||||||
|
|
||||||
/* Set this to the correct value determined by si_update_shaders. */
|
/* Set this to the correct value determined by si_update_shaders. */
|
||||||
sctx->ngg_culling = ngg_culling;
|
sctx->ngg_culling = ngg_culling;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue