From d45958f82e4526f809dcb03ff6b3b0b438803ecb Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Mon, 17 May 2021 17:56:28 +0100 Subject: [PATCH] aco: implement VS input loads with prologs Signed-off-by: Rhys Perry Reviewed-by: Samuel Pitoiset Part-of: --- src/amd/compiler/aco_insert_waitcnt.cpp | 7 +++ .../compiler/aco_instruction_selection.cpp | 46 ++++++++++++++++++- src/amd/compiler/aco_ir.h | 2 + src/amd/compiler/aco_statistics.cpp | 9 ++++ 4 files changed, 62 insertions(+), 2 deletions(-) diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp index cb6c2a60804..2934c71c087 100644 --- a/src/amd/compiler/aco_insert_waitcnt.cpp +++ b/src/amd/compiler/aco_insert_waitcnt.cpp @@ -770,6 +770,13 @@ insert_wait_states(Program* program) std::stack> loop_header_indices; unsigned loop_progress = 0; + if (program->stage.has(SWStage::VS) && program->info->vs.dynamic_inputs) { + for (Definition def : program->vs_inputs) { + update_counters(in_ctx[0], event_vmem); + insert_wait_entry(in_ctx[0], def, event_vmem); + } + } + for (unsigned i = 0; i < program->blocks.size();) { Block& current = program->blocks[i++]; wait_ctx ctx = in_ctx[current.index]; diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 7d65e7855ff..b4ab24dd80e 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -5004,7 +5004,36 @@ visit_load_input(isel_context* ctx, nir_intrinsic_instr* instr) Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); nir_src offset = *nir_get_io_offset_src(instr); - if (ctx->shader->info.stage == MESA_SHADER_VERTEX) { + if (ctx->shader->info.stage == MESA_SHADER_VERTEX && ctx->args->shader_info->vs.dynamic_inputs) { + if (!nir_src_is_const(offset) || nir_src_as_uint(offset)) + isel_err(offset.ssa->parent_instr, + "Unimplemented non-zero nir_intrinsic_load_input offset"); + + unsigned location = nir_intrinsic_base(instr) - VERT_ATTRIB_GENERIC0; + unsigned component = nir_intrinsic_component(instr); + unsigned bitsize = instr->dest.ssa.bit_size; + unsigned num_components = instr->dest.ssa.num_components; + + Temp input = get_arg(ctx, ctx->args->vs_inputs[location]); + + aco_ptr vec{create_instruction( + aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)}; + std::array elems; + for (unsigned i = 0; i < num_components; i++) { + elems[i] = emit_extract_vector(ctx, input, component + i, bitsize == 64 ? v2 : v1); + if (bitsize == 16) { + if (nir_alu_type_get_base_type(nir_intrinsic_dest_type(instr)) == nir_type_float) + elems[i] = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), elems[i]); + else + elems[i] = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), elems[i], + Operand::c32(0u)); + } + vec->operands[i] = Operand(elems[i]); + } + vec->definitions[0] = Definition(dst); + ctx->block->instructions.emplace_back(std::move(vec)); + ctx->allocated_vec.emplace(dst.id(), elems); + } else if (ctx->shader->info.stage == MESA_SHADER_VERTEX) { if (!nir_src_is_const(offset) || nir_src_as_uint(offset)) isel_err(offset.ssa->parent_instr, @@ -11273,6 +11302,18 @@ add_startpgm(struct isel_context* ctx) ctx->program->private_segment_buffer = get_arg(ctx, ctx->args->ring_offsets); ctx->program->scratch_offset = get_arg(ctx, ctx->args->ac.scratch_offset); + if (ctx->stage.has(SWStage::VS) && ctx->program->info->vs.dynamic_inputs) { + unsigned num_attributes = util_last_bit(ctx->program->info->vs.vb_desc_usage_mask); + for (unsigned i = 0; i < num_attributes; i++) { + Definition def(get_arg(ctx, ctx->args->vs_inputs[i])); + + unsigned idx = ctx->args->vs_inputs[i].arg_index; + def.setFixed(PhysReg(256 + ctx->args->ac.args[idx].offset)); + + ctx->program->vs_inputs.push_back(def); + } + } + return instr; } @@ -11571,7 +11612,8 @@ select_program(Program* program, unsigned shader_count, struct nir_shader* const split_arguments(&ctx, startpgm); - if (program->stage.has(SWStage::VS) || program->stage.has(SWStage::TES)) { + if (!args->shader_info->vs.has_prolog && + (program->stage.has(SWStage::VS) || program->stage.has(SWStage::TES))) { Builder(ctx.program, ctx.block).sopp(aco_opcode::s_setprio, -1u, 0x3u); } } diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 66081d9db45..8de4f455aec 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -2073,6 +2073,8 @@ public: unsigned next_divergent_if_logical_depth = 0; unsigned next_uniform_if_depth = 0; + std::vector vs_inputs; + struct { FILE* output = stderr; bool shorten_messages = false; diff --git a/src/amd/compiler/aco_statistics.cpp b/src/amd/compiler/aco_statistics.cpp index ce114e3f879..8ccb5198b01 100644 --- a/src/amd/compiler/aco_statistics.cpp +++ b/src/amd/compiler/aco_statistics.cpp @@ -473,6 +473,15 @@ collect_preasm_stats(Program* program) double usage[(int)BlockCycleEstimator::resource_count] = {0}; std::vector blocks(program->blocks.size(), program); + if (program->stage.has(SWStage::VS) && program->info->vs.has_prolog) { + unsigned vs_input_latency = 320; + for (Definition def : program->vs_inputs) { + blocks[0].vm.push_back(vs_input_latency); + for (unsigned i = 0; i < def.size(); i++) + blocks[0].reg_available[def.physReg().reg() + i] = vs_input_latency; + } + } + for (Block& block : program->blocks) { BlockCycleEstimator& block_est = blocks[block.index]; for (unsigned pred : block.linear_preds)