ac/nir: replace SI.buffer.load.dword with amdgcn.buffer.load

The old intrinsic generates useless instructions. This was found while
comparing geometry shaders between RadeonSI and RADV.

This improves all Vulkan demos that use geometry shaders, +4%
for deferredshadows, +9% for viewportarray, +7% for
geometryshader on Polaris10.

This seems to also improve DOW3 a little bit (+1%).

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
This commit is contained in:
Samuel Pitoiset 2018-02-01 16:37:15 +01:00
parent f9c121c420
commit df1d5174fc
1 changed file with 19 additions and 31 deletions

View File

@ -3047,7 +3047,6 @@ load_gs_input(struct ac_shader_abi *abi,
{ {
struct nir_to_llvm_context *ctx = nir_to_llvm_context_from_abi(abi); struct nir_to_llvm_context *ctx = nir_to_llvm_context_from_abi(abi);
LLVMValueRef vtx_offset; LLVMValueRef vtx_offset;
LLVMValueRef args[9];
unsigned param, vtx_offset_param; unsigned param, vtx_offset_param;
LLVMValueRef value[4], result; LLVMValueRef value[4], result;
@ -3065,20 +3064,16 @@ load_gs_input(struct ac_shader_abi *abi,
LLVMConstInt(ctx->ac.i32, param * 4 + i + const_index, 0), ""); LLVMConstInt(ctx->ac.i32, param * 4 + i + const_index, 0), "");
value[i] = ac_lds_load(&ctx->ac, dw_addr); value[i] = ac_lds_load(&ctx->ac, dw_addr);
} else { } else {
args[0] = ctx->esgs_ring; LLVMValueRef soffset =
args[1] = vtx_offset; LLVMConstInt(ctx->ac.i32,
args[2] = LLVMConstInt(ctx->ac.i32, (param * 4 + i + const_index) * 256, false); (param * 4 + i + const_index) * 256,
args[3] = ctx->ac.i32_0; false);
args[4] = ctx->ac.i32_1; /* OFFEN */
args[5] = ctx->ac.i32_0; /* IDXEN */
args[6] = ctx->ac.i32_1; /* GLC */
args[7] = ctx->ac.i32_0; /* SLC */
args[8] = ctx->ac.i32_0; /* TFE */
value[i] = ac_build_intrinsic(&ctx->ac, "llvm.SI.buffer.load.dword.i32.i32", value[i] = ac_build_buffer_load(&ctx->ac,
ctx->ac.i32, args, 9, ctx->esgs_ring, 1,
AC_FUNC_ATTR_READONLY | ctx->ac.i32_0,
AC_FUNC_ATTR_LEGACY); vtx_offset, soffset,
0, 1, 0, true, false);
} }
} }
result = ac_build_varying_gather_values(&ctx->ac, value, num_components, component); result = ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
@ -7166,16 +7161,9 @@ void ac_compile_nir_shader(LLVMTargetMachineRef tm,
static void static void
ac_gs_copy_shader_emit(struct nir_to_llvm_context *ctx) ac_gs_copy_shader_emit(struct nir_to_llvm_context *ctx)
{ {
LLVMValueRef args[9]; LLVMValueRef vtx_offset =
args[0] = ctx->gsvs_ring; LLVMBuildMul(ctx->builder, ctx->abi.vertex_id,
args[1] = LLVMBuildMul(ctx->builder, ctx->abi.vertex_id, LLVMConstInt(ctx->ac.i32, 4, false), ""); LLVMConstInt(ctx->ac.i32, 4, false), "");
args[3] = ctx->ac.i32_0;
args[4] = ctx->ac.i32_1; /* OFFEN */
args[5] = ctx->ac.i32_0; /* IDXEN */
args[6] = ctx->ac.i32_1; /* GLC */
args[7] = ctx->ac.i32_1; /* SLC */
args[8] = ctx->ac.i32_0; /* TFE */
int idx = 0; int idx = 0;
for (unsigned i = 0; i < RADEON_LLVM_MAX_OUTPUTS; ++i) { for (unsigned i = 0; i < RADEON_LLVM_MAX_OUTPUTS; ++i) {
@ -7193,16 +7181,16 @@ ac_gs_copy_shader_emit(struct nir_to_llvm_context *ctx)
} }
for (unsigned j = 0; j < length; j++) { for (unsigned j = 0; j < length; j++) {
LLVMValueRef value; LLVMValueRef value, soffset;
args[2] = LLVMConstInt(ctx->ac.i32,
soffset = LLVMConstInt(ctx->ac.i32,
(slot * 4 + j) * (slot * 4 + j) *
ctx->gs_max_out_vertices * 16 * 4, false); ctx->gs_max_out_vertices * 16 * 4, false);
value = ac_build_intrinsic(&ctx->ac, value = ac_build_buffer_load(&ctx->ac, ctx->gsvs_ring,
"llvm.SI.buffer.load.dword.i32.i32", 1, ctx->ac.i32_0,
ctx->ac.i32, args, 9, vtx_offset, soffset,
AC_FUNC_ATTR_READONLY | 0, 1, 1, true, false);
AC_FUNC_ATTR_LEGACY);
LLVMBuildStore(ctx->builder, LLVMBuildStore(ctx->builder,
ac_to_float(&ctx->ac, value), ctx->nir->outputs[radeon_llvm_reg_index_soa(i, j)]); ac_to_float(&ctx->ac, value), ctx->nir->outputs[radeon_llvm_reg_index_soa(i, j)]);