diff --git a/src/amd/compiler/aco_assembler.cpp b/src/amd/compiler/aco_assembler.cpp index 94c41173d6e..7683c8f45af 100644 --- a/src/amd/compiler/aco_assembler.cpp +++ b/src/amd/compiler/aco_assembler.cpp @@ -329,10 +329,10 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* } encoding |= instr->operands[2].physReg() << 24; encoding |= (mubuf->tfe ? 1 : 0) << 23; - encoding |= (instr->operands[1].physReg() >> 2) << 16; + encoding |= (instr->operands[0].physReg() >> 2) << 16; unsigned reg = instr->operands.size() > 3 ? instr->operands[3].physReg() : instr->definitions[0].physReg(); encoding |= (0xFF & reg) << 8; - encoding |= (0xFF & instr->operands[0].physReg()); + encoding |= (0xFF & instr->operands[1].physReg()); out.push_back(encoding); break; } @@ -362,10 +362,10 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* encoding |= instr->operands[2].physReg() << 24; encoding |= (mtbuf->tfe ? 1 : 0) << 23; encoding |= (mtbuf->slc ? 1 : 0) << 22; - encoding |= (instr->operands[1].physReg() >> 2) << 16; + encoding |= (instr->operands[0].physReg() >> 2) << 16; unsigned reg = instr->operands.size() > 3 ? 
instr->operands[3].physReg() : instr->definitions[0].physReg(); encoding |= (0xFF & reg) << 8; - encoding |= (0xFF & instr->operands[0].physReg()); + encoding |= (0xFF & instr->operands[1].physReg()); if (ctx.chip_class >= GFX10) { encoding |= (((opcode & 0x08) >> 4) << 21); /* MSB of 4-bit OPCODE */ @@ -395,15 +395,15 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* } encoding |= (0xF & mimg->dmask) << 8; out.push_back(encoding); - encoding = (0xFF & instr->operands[0].physReg()); /* VADDR */ + encoding = (0xFF & instr->operands[2].physReg()); /* VADDR */ if (!instr->definitions.empty()) { encoding |= (0xFF & instr->definitions[0].physReg()) << 8; /* VDATA */ - } else if (instr->operands.size() == 4) { - encoding |= (0xFF & instr->operands[3].physReg()) << 8; /* VDATA */ + } else if (instr->operands[1].regClass().type() == RegType::vgpr) { + encoding |= (0xFF & instr->operands[1].physReg()) << 8; /* VDATA */ } - encoding |= (0x1F & (instr->operands[1].physReg() >> 2)) << 16; /* T# (resource) */ - if (instr->operands.size() > 2) - encoding |= (0x1F & (instr->operands[2].physReg() >> 2)) << 21; /* sampler */ + encoding |= (0x1F & (instr->operands[0].physReg() >> 2)) << 16; /* T# (resource) */ + if (instr->operands[1].regClass().type() == RegType::sgpr) + encoding |= (0x1F & (instr->operands[1].physReg() >> 2)) << 21; /* sampler */ assert(!mimg->d16 || ctx.chip_class >= GFX9); encoding |= mimg->d16 ? 
1 << 15 : 0; diff --git a/src/amd/compiler/aco_builder_h.py b/src/amd/compiler/aco_builder_h.py index 4e1d6f72b63..97c03ac8adf 100644 --- a/src/amd/compiler/aco_builder_h.py +++ b/src/amd/compiler/aco_builder_h.py @@ -473,7 +473,7 @@ formats = [("pseudo", [Format.PSEUDO], 'Pseudo_instruction', list(itertools.prod ("ds", [Format.DS], 'DS_instruction', [(1, 1), (1, 2), (0, 3), (0, 4)]), ("mubuf", [Format.MUBUF], 'MUBUF_instruction', [(0, 4), (1, 3)]), ("mtbuf", [Format.MTBUF], 'MTBUF_instruction', [(0, 4), (1, 3)]), - ("mimg", [Format.MIMG], 'MIMG_instruction', [(0, 4), (1, 3), (0, 3), (1, 2)]), #TODO(pendingchaos): less shapes? + ("mimg", [Format.MIMG], 'MIMG_instruction', [(0, 3), (1, 3)]), ("exp", [Format.EXP], 'Export_instruction', [(0, 4)]), ("branch", [Format.PSEUDO_BRANCH], 'Pseudo_branch_instruction', itertools.product([0], [0, 1])), ("barrier", [Format.PSEUDO_BARRIER], 'Pseudo_barrier_instruction', [(0, 0)]), diff --git a/src/amd/compiler/aco_insert_NOPs.cpp b/src/amd/compiler/aco_insert_NOPs.cpp index 605ed8a2081..9ff3d580736 100644 --- a/src/amd/compiler/aco_insert_NOPs.cpp +++ b/src/amd/compiler/aco_insert_NOPs.cpp @@ -325,9 +325,9 @@ int handle_instruction_gfx8_9(NOP_ctx_gfx8_9& ctx, aco_ptr& instr, pred->operands[2].physReg() >= 128; /* MIMG store with a 128-bit T# with more than two bits set in dmask (making it a >64-bit store) */ bool consider_mimg = pred->format == Format::MIMG && - pred->operands.size() == 4 && - pred->operands[3].size() > 2 && - pred->operands[1].size() != 8; + pred->operands[1].regClass().type() == RegType::vgpr && + pred->operands[1].size() > 2 && + pred->operands[0].size() == 4; /* FLAT/GLOBAL/SCRATCH store with >64-bit data */ bool consider_flat = (pred->isFlatOrGlobal() || pred->format == Format::SCRATCH) && pred->operands.size() == 3 && @@ -376,6 +376,7 @@ int handle_instruction_gfx8_9(NOP_ctx_gfx8_9& ctx, aco_ptr& instr, /* If the VALU writes the SGPR that is used by a VMEM, the user must add five wait states. 
*/ for (int pred_idx = new_idx - 1; pred_idx >= 0 && pred_idx >= new_idx - 5; pred_idx--) { aco_ptr& pred = new_instructions[pred_idx]; + // TODO: break if something else writes the SGPR if (!(pred->isVALU() && VALU_writes_sgpr(pred))) continue; @@ -383,16 +384,10 @@ int handle_instruction_gfx8_9(NOP_ctx_gfx8_9& ctx, aco_ptr& instr, if (def.physReg() > 102) continue; - if (instr->operands.size() > 1 && - regs_intersect(instr->operands[1].physReg(), instr->operands[1].size(), - def.physReg(), def.size())) { + for (const Operand& op : instr->operands) { + if (regs_intersect(op.physReg(), op.size(), def.physReg(), def.size())) return 5 + pred_idx - new_idx + 1; - } - if (instr->operands.size() > 2 && - regs_intersect(instr->operands[2].physReg(), instr->operands[2].size(), - def.physReg(), def.size())) { - return 5 + pred_idx - new_idx + 1; } } } diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp index 8e1b64bfcd0..5ec9636752d 100644 --- a/src/amd/compiler/aco_insert_waitcnt.cpp +++ b/src/amd/compiler/aco_insert_waitcnt.cpp @@ -689,11 +689,20 @@ void gen(Instruction* instr, wait_ctx& ctx) if (!instr->definitions.empty()) insert_wait_entry(ctx, instr->definitions[0], ev); - if (instr->operands.size() == 4 && ctx.chip_class == GFX6) { + if (ctx.chip_class == GFX6 && + instr->format != Format::MIMG && + instr->operands.size() == 4) { ctx.exp_cnt++; update_counters(ctx, event_vmem_gpr_lock); insert_wait_entry(ctx, instr->operands[3], event_vmem_gpr_lock); + } else if (ctx.chip_class == GFX6 && + instr->format == Format::MIMG && + instr->operands[1].regClass().type() == RegType::vgpr) { + ctx.exp_cnt++; + update_counters(ctx, event_vmem_gpr_lock); + insert_wait_entry(ctx, instr->operands[1], event_vmem_gpr_lock); } + break; } case Format::SOPP: { diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 6734b9c98d3..8d08c416ef0 100644 --- 
a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -2929,8 +2929,8 @@ void visit_store_vsgs_output(isel_context *ctx, nir_intrinsic_instr *instr) } aco_ptr mtbuf{create_instruction(aco_opcode::tbuffer_store_format_x, Format::MTBUF, 4, 0)}; - mtbuf->operands[0] = vaddr_offset; - mtbuf->operands[1] = Operand(esgs_ring); + mtbuf->operands[0] = Operand(esgs_ring); + mtbuf->operands[1] = vaddr_offset; mtbuf->operands[2] = Operand(get_arg(ctx, ctx->args->es2gs_offset)); mtbuf->operands[3] = Operand(elem); mtbuf->offen = !vaddr_offset.isUndefined(); @@ -3288,12 +3288,12 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) if (use_mubuf) { Instruction *mubuf = bld.mubuf(opcode, - Definition(fetch_dst), fetch_index, list, soffset, + Definition(fetch_dst), list, fetch_index, soffset, fetch_offset, false, true).instr; static_cast(mubuf)->can_reorder = true; } else { Instruction *mtbuf = bld.mtbuf(opcode, - Definition(fetch_dst), fetch_index, list, soffset, + Definition(fetch_dst), list, fetch_index, soffset, fetch_dfmt, nfmt, fetch_offset, false, true).instr; static_cast(mtbuf)->can_reorder = true; } @@ -3487,8 +3487,8 @@ void visit_load_per_vertex_input(isel_context *ctx, nir_intrinsic_instr *instr) aco_ptr mubuf{create_instruction(aco_opcode::buffer_load_dword, Format::MUBUF, 3, 1)}; mubuf->definitions[0] = bld.def(v1); subelems[j] = mubuf->definitions[0].getTemp(); - mubuf->operands[0] = Operand(offset); - mubuf->operands[1] = Operand(esgs_ring); + mubuf->operands[0] = Operand(esgs_ring); + mubuf->operands[1] = Operand(offset); mubuf->operands[2] = Operand(soffset); mubuf->offen = true; mubuf->offset = const_offset % 4096u; @@ -3616,8 +3616,8 @@ void load_buffer(isel_context *ctx, unsigned num_components, Temp dst, lower = bld.tmp(v4); aco_ptr mubuf{create_instruction(op, Format::MUBUF, 3, 1)}; mubuf->definitions[0] = Definition(lower); - mubuf->operands[0] = vaddr; - mubuf->operands[1] = 
Operand(rsrc); + mubuf->operands[0] = Operand(rsrc); + mubuf->operands[1] = vaddr; mubuf->operands[2] = soffset; mubuf->offen = (offset.type() == RegType::vgpr); mubuf->glc = glc; @@ -3651,8 +3651,8 @@ void load_buffer(isel_context *ctx, unsigned num_components, Temp dst, unreachable("Load SSBO not implemented for this size."); } aco_ptr mubuf{create_instruction(op, Format::MUBUF, 3, 1)}; - mubuf->operands[0] = vaddr; - mubuf->operands[1] = Operand(rsrc); + mubuf->operands[0] = Operand(rsrc); + mubuf->operands[1] = vaddr; mubuf->operands[2] = soffset; mubuf->offen = (offset.type() == RegType::vgpr); mubuf->glc = glc; @@ -4228,9 +4228,10 @@ static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, Temp coo ? ac_get_sampler_dim(ctx->options->chip_class, GLSL_SAMPLER_DIM_2D, da) : 0; - aco_ptr load{create_instruction(aco_opcode::image_load, Format::MIMG, 2, 1)}; - load->operands[0] = Operand(coords); - load->operands[1] = Operand(fmask_desc_ptr); + aco_ptr load{create_instruction(aco_opcode::image_load, Format::MIMG, 3, 1)}; + load->operands[0] = Operand(fmask_desc_ptr); + load->operands[1] = Operand(s4); /* no sampler */ + load->operands[2] = Operand(coords); load->definitions[0] = Definition(fmask); load->glc = false; load->dlc = false; @@ -4374,8 +4375,8 @@ void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr) unreachable(">4 channel buffer image load"); } aco_ptr load{create_instruction(opcode, Format::MUBUF, 3, 1)}; - load->operands[0] = Operand(vindex); - load->operands[1] = Operand(rsrc); + load->operands[0] = Operand(rsrc); + load->operands[1] = Operand(vindex); load->operands[2] = Operand((uint32_t) 0); Temp tmp; if (num_channels == instr->dest.ssa.num_components && dst.type() == RegType::vgpr) @@ -4407,9 +4408,10 @@ void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr) bool level_zero = nir_src_is_const(instr->src[3]) && nir_src_as_uint(instr->src[3]) == 0; aco_opcode opcode = level_zero ? 
aco_opcode::image_load : aco_opcode::image_load_mip; - aco_ptr load{create_instruction(opcode, Format::MIMG, 2, 1)}; - load->operands[0] = Operand(coords); - load->operands[1] = Operand(resource); + aco_ptr load{create_instruction(opcode, Format::MIMG, 3, 1)}; + load->operands[0] = Operand(resource); + load->operands[1] = Operand(s4); /* no sampler */ + load->operands[2] = Operand(coords); load->definitions[0] = Definition(tmp); load->glc = var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0; load->dlc = load->glc && ctx->options->chip_class >= GFX10; @@ -4455,8 +4457,8 @@ void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr) unreachable(">4 channel buffer image store"); } aco_ptr store{create_instruction(opcode, Format::MUBUF, 4, 0)}; - store->operands[0] = Operand(vindex); - store->operands[1] = Operand(rsrc); + store->operands[0] = Operand(rsrc); + store->operands[1] = Operand(vindex); store->operands[2] = Operand((uint32_t) 0); store->operands[3] = Operand(data); store->idxen = true; @@ -4476,11 +4478,10 @@ void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr) bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0; aco_opcode opcode = level_zero ? 
aco_opcode::image_store : aco_opcode::image_store_mip; - aco_ptr store{create_instruction(opcode, Format::MIMG, 4, 0)}; - store->operands[0] = Operand(coords); - store->operands[1] = Operand(resource); - store->operands[2] = Operand(s4); - store->operands[3] = Operand(data); + aco_ptr store{create_instruction(opcode, Format::MIMG, 3, 0)}; + store->operands[0] = Operand(resource); + store->operands[1] = Operand(data); + store->operands[2] = Operand(coords); store->glc = glc; store->dlc = false; store->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array); @@ -4572,8 +4573,8 @@ void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr) Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true); //assert(ctx->options->chip_class < GFX9 && "GFX9 stride size workaround not yet implemented."); aco_ptr mubuf{create_instruction(buf_op, Format::MUBUF, 4, return_previous ? 1 : 0)}; - mubuf->operands[0] = Operand(vindex); - mubuf->operands[1] = Operand(resource); + mubuf->operands[0] = Operand(resource); + mubuf->operands[1] = Operand(vindex); mubuf->operands[2] = Operand((uint32_t)0); mubuf->operands[3] = Operand(data); if (return_previous) @@ -4591,11 +4592,10 @@ void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr) Temp coords = get_image_coords(ctx, instr, type); Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true); - aco_ptr mimg{create_instruction(image_op, Format::MIMG, 4, return_previous ? 1 : 0)}; - mimg->operands[0] = Operand(coords); - mimg->operands[1] = Operand(resource); - mimg->operands[2] = Operand(s4); /* no sampler */ - mimg->operands[3] = Operand(data); + aco_ptr mimg{create_instruction(image_op, Format::MIMG, 3, return_previous ? 
1 : 0)}; + mimg->operands[0] = Operand(resource); + mimg->operands[1] = Operand(data); + mimg->operands[2] = Operand(coords); if (return_previous) mimg->definitions[0] = Definition(dst); mimg->glc = return_previous; @@ -4661,9 +4661,10 @@ void visit_image_size(isel_context *ctx, nir_intrinsic_instr *instr) Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - aco_ptr mimg{create_instruction(aco_opcode::image_get_resinfo, Format::MIMG, 2, 1)}; - mimg->operands[0] = Operand(lod); - mimg->operands[1] = Operand(resource); + aco_ptr mimg{create_instruction(aco_opcode::image_get_resinfo, Format::MIMG, 3, 1)}; + mimg->operands[0] = Operand(resource); + mimg->operands[1] = Operand(s4); /* no sampler */ + mimg->operands[2] = Operand(lod); uint8_t& dmask = mimg->dmask; mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array); mimg->dmask = (1 << instr->dest.ssa.num_components) - 1; @@ -4823,8 +4824,8 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) } } else { aco_ptr store{create_instruction(vmem_op, Format::MUBUF, 4, 0)}; - store->operands[0] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1); - store->operands[1] = Operand(rsrc); + store->operands[0] = Operand(rsrc); + store->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1); store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0); store->operands[3] = Operand(write_data); store->offset = start * elem_size_bytes; @@ -4912,8 +4913,8 @@ void visit_atomic_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) } aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64; aco_ptr mubuf{create_instruction(op, Format::MUBUF, 4, return_previous ? 1 : 0)}; - mubuf->operands[0] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1); - mubuf->operands[1] = Operand(rsrc); + mubuf->operands[0] = Operand(rsrc); + mubuf->operands[1] = offset.type() == RegType::vgpr ? 
Operand(offset) : Operand(v1); mubuf->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0); mubuf->operands[3] = Operand(data); if (return_previous) @@ -5021,8 +5022,8 @@ void visit_load_global(isel_context *ctx, nir_intrinsic_instr *instr) Temp rsrc = get_gfx6_global_rsrc(bld, addr); aco_ptr mubuf{create_instruction(op, Format::MUBUF, 3, 1)}; - mubuf->operands[0] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1); - mubuf->operands[1] = Operand(rsrc); + mubuf->operands[0] = Operand(rsrc); + mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1); mubuf->operands[2] = Operand(0u); mubuf->glc = glc; mubuf->dlc = false; @@ -5202,8 +5203,8 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr) Temp rsrc = get_gfx6_global_rsrc(bld, addr); aco_ptr mubuf{create_instruction(op, Format::MUBUF, 4, 0)}; - mubuf->operands[0] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1); - mubuf->operands[1] = Operand(rsrc); + mubuf->operands[0] = Operand(rsrc); + mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1); mubuf->operands[2] = Operand(0u); mubuf->operands[3] = Operand(write_data); mubuf->glc = glc; @@ -5360,8 +5361,8 @@ void visit_global_atomic(isel_context *ctx, nir_intrinsic_instr *instr) aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64; aco_ptr mubuf{create_instruction(op, Format::MUBUF, 4, return_previous ? 1 : 0)}; - mubuf->operands[0] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1); - mubuf->operands[1] = Operand(rsrc); + mubuf->operands[0] = Operand(rsrc); + mubuf->operands[1] = addr.type() == RegType::vgpr ? 
Operand(addr) : Operand(v1); mubuf->operands[2] = Operand(0u); mubuf->operands[3] = Operand(data); if (return_previous) @@ -5589,12 +5590,12 @@ void visit_load_scratch(isel_context *ctx, nir_intrinsic_instr *instr) { case 8: { std::array elems; Temp lower = bld.mubuf(aco_opcode::buffer_load_dwordx4, - bld.def(v4), offset, rsrc, + bld.def(v4), rsrc, offset, ctx->program->scratch_offset, 0, true); Temp upper = bld.mubuf(dst.size() == 6 ? aco_opcode::buffer_load_dwordx2 : aco_opcode::buffer_load_dwordx4, dst.size() == 6 ? bld.def(v2) : bld.def(v4), - offset, rsrc, ctx->program->scratch_offset, 16, true); + rsrc, offset, ctx->program->scratch_offset, 16, true); emit_split_vector(ctx, lower, 2); elems[0] = emit_extract_vector(ctx, lower, 0, v2); elems[1] = emit_extract_vector(ctx, lower, 1, v2); @@ -5619,7 +5620,7 @@ void visit_load_scratch(isel_context *ctx, nir_intrinsic_instr *instr) { unreachable("Wrong dst size for nir_intrinsic_load_scratch"); } - bld.mubuf(op, Definition(dst), offset, rsrc, ctx->program->scratch_offset, 0, true); + bld.mubuf(op, Definition(dst), rsrc, offset, ctx->program->scratch_offset, 0, true); emit_split_vector(ctx, dst, instr->num_components); } @@ -5680,7 +5681,7 @@ void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) { unreachable("Invalid data size for nir_intrinsic_store_scratch."); } - bld.mubuf(op, offset, rsrc, ctx->program->scratch_offset, write_data, start * elem_size_bytes, true); + bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_data, start * elem_size_bytes, true); } } @@ -5784,8 +5785,8 @@ void visit_emit_vertex_with_counter(isel_context *ctx, nir_intrinsic_instr *inst } aco_ptr mtbuf{create_instruction(aco_opcode::tbuffer_store_format_x, Format::MTBUF, 4, 0)}; - mtbuf->operands[0] = vaddr_offset; - mtbuf->operands[1] = Operand(gsvs_ring); + mtbuf->operands[0] = Operand(gsvs_ring); + mtbuf->operands[1] = vaddr_offset; mtbuf->operands[2] = Operand(get_arg(ctx, ctx->args->gs2vs_offset)); 
mtbuf->operands[3] = Operand(ctx->outputs.outputs[i][j]); mtbuf->offen = !vaddr_offset.isUndefined(); @@ -6110,8 +6111,8 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) aco_ptr load{create_instruction(aco_opcode::buffer_load_dwordx2, Format::MUBUF, 3, 1)}; load->definitions[0] = Definition(sample_pos); - load->operands[0] = Operand(addr); - load->operands[1] = Operand(rsrc); + load->operands[0] = Operand(rsrc); + load->operands[1] = Operand(addr); load->operands[2] = Operand(0u); load->offset = sample_pos_offset; load->offen = 0; @@ -7340,9 +7341,10 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) if (tmp_dst.id() == dst.id() && div_by_6) tmp_dst = bld.tmp(tmp_dst.regClass()); - tex.reset(create_instruction(aco_opcode::image_get_resinfo, Format::MIMG, 2, 1)); - tex->operands[0] = Operand(as_vgpr(ctx,lod)); - tex->operands[1] = Operand(resource); + tex.reset(create_instruction(aco_opcode::image_get_resinfo, Format::MIMG, 3, 1)); + tex->operands[0] = Operand(resource); + tex->operands[1] = Operand(s4); /* no sampler */ + tex->operands[2] = Operand(as_vgpr(ctx,lod)); if (ctx->options->chip_class == GFX9 && instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_1D && @@ -7380,9 +7382,10 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) Temp tg4_compare_cube_wa64 = Temp(); if (tg4_integer_workarounds) { - tex.reset(create_instruction(aco_opcode::image_get_resinfo, Format::MIMG, 2, 1)); - tex->operands[0] = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u)); - tex->operands[1] = Operand(resource); + tex.reset(create_instruction(aco_opcode::image_get_resinfo, Format::MIMG, 3, 1)); + tex->operands[0] = Operand(resource); + tex->operands[1] = Operand(s4); /* no sampler */ + tex->operands[2] = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u)); tex->dim = dim; tex->dmask = 0x3; tex->da = da; @@ -7537,8 +7540,8 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) tmp_dst = bld.tmp(RegType::vgpr, 
last_bit); aco_ptr mubuf{create_instruction(op, Format::MUBUF, 3, 1)}; - mubuf->operands[0] = Operand(coords); - mubuf->operands[1] = Operand(resource); + mubuf->operands[0] = Operand(resource); + mubuf->operands[1] = Operand(coords); mubuf->operands[2] = Operand((uint32_t) 0); mubuf->definitions[0] = Definition(tmp_dst); mubuf->idxen = true; @@ -7556,9 +7559,10 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) instr->op == nir_texop_fragment_fetch || instr->op == nir_texop_fragment_mask_fetch) { aco_opcode op = level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS || instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS ? aco_opcode::image_load : aco_opcode::image_load_mip; - tex.reset(create_instruction(op, Format::MIMG, 2, 1)); - tex->operands[0] = Operand(arg); - tex->operands[1] = Operand(resource); + tex.reset(create_instruction(op, Format::MIMG, 3, 1)); + tex->operands[0] = Operand(resource); + tex->operands[1] = Operand(s4); /* no sampler */ + tex->operands[2] = Operand(arg); tex->dim = dim; tex->dmask = dmask; tex->unrm = true; @@ -7644,9 +7648,9 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) } tex.reset(create_instruction(opcode, Format::MIMG, 3, 1)); - tex->operands[0] = Operand(arg); - tex->operands[1] = Operand(resource); - tex->operands[2] = Operand(sampler); + tex->operands[0] = Operand(resource); + tex->operands[1] = Operand(sampler); + tex->operands[2] = Operand(arg); tex->dim = dim; tex->dmask = dmask; tex->da = da; @@ -8753,8 +8757,8 @@ static void emit_stream_output(isel_context *ctx, } aco_ptr store{create_instruction(opcode, Format::MUBUF, 4, 0)}; - store->operands[0] = Operand(so_write_offset[buf]); - store->operands[1] = Operand(so_buffers[buf]); + store->operands[0] = Operand(so_buffers[buf]); + store->operands[1] = Operand(so_write_offset[buf]); store->operands[2] = Operand((uint32_t) 0); store->operands[3] = Operand(write_data); if (offset > 4095) { @@ -9118,8 +9122,8 @@ void select_gs_copy_shader(Program *program, 
struct nir_shader *gs_shader, aco_ptr mubuf{create_instruction(aco_opcode::buffer_load_dword, Format::MUBUF, 3, 1)}; mubuf->definitions[0] = bld.def(v1); - mubuf->operands[0] = Operand(voffset); - mubuf->operands[1] = Operand(gsvs_ring); + mubuf->operands[0] = Operand(gsvs_ring); + mubuf->operands[1] = Operand(voffset); mubuf->operands[2] = Operand(0u); mubuf->offen = true; mubuf->offset = const_offset; diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 3f38e6aadae..1ccaf2a0158 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -781,8 +781,8 @@ struct DS_instruction : public Instruction { /** * Vector Memory Untyped-buffer Instructions - * Operand(0): VADDR - Address source. Can carry an index and/or offset - * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant) + * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant) + * Operand(1): VADDR - Address source. Can carry an index and/or offset * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant) * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data * @@ -804,8 +804,8 @@ struct MUBUF_instruction : public Instruction { /** * Vector Memory Typed-buffer Instructions - * Operand(0): VADDR - Address source. Can carry an index and/or offset - * Operand(1): SRSRC - Specifies which SGPR supplies T# (resource constant) + * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant) + * Operand(1): VADDR - Address source. Can carry an index and/or offset * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant) * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data * @@ -827,10 +827,11 @@ struct MTBUF_instruction : public Instruction { /** * Vector Memory Image Instructions - * Operand(0): VADDR - Address source. Can carry an offset or an index. 
- * Operand(1): SRSRC - Scalar GPR that specifies the resource constant. - * Operand(2): SSAMP - Scalar GPR that specifies sampler constant. - * Operand(3) / Definition(0): VDATA - Vector GPR for read / write result. + * Operand(0): SRSRC - Scalar GPR that specifies the resource constant. + * Operand(1): SSAMP - Scalar GPR that specifies sampler constant. + * or VDATA - Vector GPR for write data. + * Operand(2): VADDR - Address source. Can carry an offset or an index. + * Definition(0): VDATA - Vector GPR for read result. * */ struct MIMG_instruction : public Instruction { diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index 9e606c880f4..9ef94d4f697 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -528,9 +528,9 @@ void to_VOP3(opt_ctx& ctx, aco_ptr& instr) } /* only covers special cases */ -bool can_accept_constant(aco_ptr& instr, unsigned operand) +bool alu_can_accept_constant(aco_opcode opcode, unsigned operand) { - switch (instr->opcode) { + switch (opcode) { case aco_opcode::v_interp_p2_f32: case aco_opcode::v_mac_f32: case aco_opcode::v_writelane_b32: @@ -547,12 +547,6 @@ bool can_accept_constant(aco_ptr& instr, unsigned operand) case aco_opcode::v_readfirstlane_b32: return operand != 0; default: - if ((instr->format == Format::MUBUF || - instr->format == Format::MIMG) && - instr->definitions.size() == 1 && - instr->operands.size() == 4) { - return operand != 3; - } return true; } } @@ -719,7 +713,8 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) break; } } - if ((info.is_constant() || info.is_constant_64bit() || (info.is_literal() && instr->format == Format::PSEUDO)) && !instr->operands[i].isFixed() && can_accept_constant(instr, i)) { + if ((info.is_constant() || info.is_constant_64bit() || (info.is_literal() && instr->format == Format::PSEUDO)) && + !instr->operands[i].isFixed() && alu_can_accept_constant(instr->opcode, i)) { instr->operands[i] = 
get_constant_op(ctx, info.val, info.is_constant_64bit()); continue; } @@ -754,7 +749,7 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) static_cast(instr.get())->neg[i] = true; continue; } - if ((info.is_constant() || info.is_constant_64bit()) && can_accept_constant(instr, i)) { + if ((info.is_constant() || info.is_constant_64bit()) && alu_can_accept_constant(instr->opcode, i)) { Operand op = get_constant_op(ctx, info.val, info.is_constant_64bit()); perfwarn(instr->opcode == aco_opcode::v_cndmask_b32 && i == 2, "v_cndmask_b32 with a constant selector", instr.get()); if (i == 0 || instr->opcode == aco_opcode::v_readlane_b32 || instr->opcode == aco_opcode::v_writelane_b32) { @@ -780,9 +775,9 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) while (info.is_temp()) info = ctx.info[info.temp.id()]; - if (mubuf->offen && i == 0 && info.is_constant_or_literal() && mubuf->offset + info.val < 4096) { + if (mubuf->offen && i == 1 && info.is_constant_or_literal() && mubuf->offset + info.val < 4096) { assert(!mubuf->idxen); - instr->operands[i] = Operand(v1); + instr->operands[1] = Operand(v1); mubuf->offset += info.val; mubuf->offen = false; continue; @@ -790,9 +785,9 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) instr->operands[2] = Operand((uint32_t) 0); mubuf->offset += info.val; continue; - } else if (mubuf->offen && i == 0 && parse_base_offset(ctx, instr.get(), i, &base, &offset) && base.regClass() == v1 && mubuf->offset + offset < 4096) { + } else if (mubuf->offen && i == 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset) && base.regClass() == v1 && mubuf->offset + offset < 4096) { assert(!mubuf->idxen); - instr->operands[i].setTemp(base); + instr->operands[1].setTemp(base); mubuf->offset += offset; continue; } else if (i == 2 && parse_base_offset(ctx, instr.get(), i, &base, &offset) && base.regClass() == s1 && mubuf->offset + offset < 4096) { @@ -2698,7 +2693,7 @@ void select_instruction(opt_ctx 
&ctx, aco_ptr& instr) continue; } - if (!can_accept_constant(instr, i)) + if (!alu_can_accept_constant(instr->opcode, i)) continue; if (ctx.uses[op.tempId()] < literal_uses) { diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp index 8370effdbc3..55c41dcdb06 100644 --- a/src/amd/compiler/aco_register_allocation.cpp +++ b/src/amd/compiler/aco_register_allocation.cpp @@ -1532,11 +1532,14 @@ void register_allocation(Program *program, std::vector> live_out_ } else if (instr->opcode == aco_opcode::s_addk_i32 || instr->opcode == aco_opcode::s_mulk_i32) { instr->definitions[0].setFixed(instr->operands[0].physReg()); - } else if ((instr->format == Format::MUBUF || - instr->format == Format::MIMG) && - instr->definitions.size() == 1 && - instr->operands.size() == 4) { + } else if (instr->format == Format::MUBUF && + instr->definitions.size() == 1 && + instr->operands.size() == 4) { instr->definitions[0].setFixed(instr->operands[3].physReg()); + } else if (instr->format == Format::MIMG && + instr->definitions.size() == 1 && + instr->operands[1].regClass() == instr->definitions[0].regClass()) { + instr->definitions[0].setFixed(instr->operands[1].physReg()); } ctx.defs_done.reset(); diff --git a/src/amd/compiler/aco_scheduler.cpp b/src/amd/compiler/aco_scheduler.cpp index d5f2d913a65..5634a55766c 100644 --- a/src/amd/compiler/aco_scheduler.cpp +++ b/src/amd/compiler/aco_scheduler.cpp @@ -548,7 +548,7 @@ void schedule_VMEM(sched_ctx& ctx, Block* block, if (current->isVMEM() == candidate->isVMEM()) { bool same_resource = true; if (current->isVMEM()) - same_resource = candidate->operands[1].tempId() == current->operands[1].tempId(); + same_resource = candidate->operands[0].tempId() == current->operands[0].tempId(); bool can_reorder = can_reorder_vmem || can_reorder_candidate; int grab_dist = clause_insert_idx - candidate_idx; /* We can't easily tell how much this will decrease the def-to-use diff --git 
a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp index bfa3c266a76..dfcdab2f003 100644 --- a/src/amd/compiler/aco_spill.cpp +++ b/src/amd/compiler/aco_spill.cpp @@ -1575,9 +1575,9 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) { split->definitions[i] = bld.def(v1); bld.insert(split); for (unsigned i = 0; i < temp.size(); i++) - bld.mubuf(opcode, Operand(), scratch_rsrc, scratch_offset, split->definitions[i].getTemp(), offset + i * 4, false); + bld.mubuf(opcode, scratch_rsrc, Operand(), scratch_offset, split->definitions[i].getTemp(), offset + i * 4, false); } else { - bld.mubuf(opcode, Operand(), scratch_rsrc, scratch_offset, temp, offset, false); + bld.mubuf(opcode, scratch_rsrc, Operand(), scratch_offset, temp, offset, false); } } else if (sgpr_slot.find(spill_id) != sgpr_slot.end()) { ctx.program->config->spilled_sgprs += (*it)->operands[0].size(); @@ -1641,11 +1641,11 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) { for (unsigned i = 0; i < def.size(); i++) { Temp tmp = bld.tmp(v1); vec->operands[i] = Operand(tmp); - bld.mubuf(opcode, Definition(tmp), Operand(), scratch_rsrc, scratch_offset, offset + i * 4, false); + bld.mubuf(opcode, Definition(tmp), scratch_rsrc, Operand(), scratch_offset, offset + i * 4, false); } bld.insert(vec); } else { - bld.mubuf(opcode, def, Operand(), scratch_rsrc, scratch_offset, offset, false); + bld.mubuf(opcode, def, scratch_rsrc, Operand(), scratch_offset, offset, false); } } else if (sgpr_slot.find(spill_id) != sgpr_slot.end()) { uint32_t spill_slot = sgpr_slot[spill_id]; diff --git a/src/amd/compiler/aco_validate.cpp b/src/amd/compiler/aco_validate.cpp index 8d69952c811..293ec32a330 100644 --- a/src/amd/compiler/aco_validate.cpp +++ b/src/amd/compiler/aco_validate.cpp @@ -99,8 +99,8 @@ void validate(Program* program, FILE * output) bool flat = instr->format == Format::FLAT || instr->format == Format::SCRATCH || instr->format == Format::GLOBAL; bool can_be_undef = 
is_phi(instr) || instr->format == Format::EXP || instr->format == Format::PSEUDO_REDUCTION || - (flat && i == 1) || (instr->format == Format::MIMG && i == 2) || - ((instr->format == Format::MUBUF || instr->format == Format::MTBUF) && i == 0); + (flat && i == 1) || (instr->format == Format::MIMG && i == 1) || + ((instr->format == Format::MUBUF || instr->format == Format::MTBUF) && i == 1); check(can_be_undef, "Undefs can only be used in certain operands", instr.get()); } } @@ -229,15 +229,29 @@ void validate(Program* program, FILE * output) break; } case Format::MTBUF: - case Format::MUBUF: - case Format::MIMG: { + case Format::MUBUF: { check(instr->operands.size() > 1, "VMEM instructions must have at least one operand", instr.get()); - check(instr->operands[0].hasRegClass() && instr->operands[0].regClass().type() == RegType::vgpr, + check(instr->operands[1].hasRegClass() && instr->operands[1].regClass().type() == RegType::vgpr, "VADDR must be in vgpr for VMEM instructions", instr.get()); - check(instr->operands[1].isTemp() && instr->operands[1].regClass().type() == RegType::sgpr, "VMEM resource constant must be sgpr", instr.get()); + check(instr->operands[0].isTemp() && instr->operands[0].regClass().type() == RegType::sgpr, "VMEM resource constant must be sgpr", instr.get()); check(instr->operands.size() < 4 || (instr->operands[3].isTemp() && instr->operands[3].regClass().type() == RegType::vgpr), "VMEM write data must be vgpr", instr.get()); break; } + case Format::MIMG: { + check(instr->operands.size() == 3, "MIMG instructions must have exactly 3 operands", instr.get()); + check(instr->operands[0].hasRegClass() && (instr->operands[0].regClass() == s4 || instr->operands[0].regClass() == s8), + "MIMG operands[0] (resource constant) must be in 4 or 8 SGPRs", instr.get()); + if (instr->operands[1].hasRegClass() && instr->operands[1].regClass().type() == RegType::sgpr) + check(instr->operands[1].regClass() == s4, "MIMG operands[1] (sampler constant) must be 4 SGPRs", 
instr.get()); + else if (instr->operands[1].hasRegClass() && instr->operands[1].regClass().type() == RegType::vgpr) + check(instr->definitions.empty() || instr->definitions[0].regClass() == instr->operands[1].regClass(), + "MIMG operands[1] (VDATA) must be the same as definitions[0] for atomics", instr.get()); + check(instr->operands[2].hasRegClass() && instr->operands[2].regClass().type() == RegType::vgpr, + "MIMG operands[2] (VADDR) must be VGPR", instr.get()); + check(instr->definitions.empty() || (instr->definitions[0].isTemp() && instr->definitions[0].regClass().type() == RegType::vgpr), + "MIMG definitions[0] (VDATA) must be VGPR", instr.get()); + break; + } case Format::DS: { for (const Operand& op : instr->operands) { check((op.isTemp() && op.regClass().type() == RegType::vgpr) || op.physReg() == m0,