diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 11981fbe57d..3b1d8ab5799 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -2938,6 +2938,7 @@ struct LoadEmitInfo { unsigned align_offset = 0; bool glc = false; + bool slc = false; unsigned swizzle_component_size = 0; memory_sync_info sync; Temp soffset = Temp(0, s1); @@ -3358,6 +3359,7 @@ Temp mubuf_load_callback(Builder& bld, const LoadEmitInfo &info, mubuf->offen = (offset.type() == RegType::vgpr); mubuf->glc = info.glc; mubuf->dlc = info.glc && bld.program->chip_class >= GFX10; + mubuf->slc = info.slc; mubuf->sync = info.sync; mubuf->offset = const_offset; mubuf->swizzled = info.swizzle_component_size != 0; @@ -3862,7 +3864,8 @@ void store_vmem_mubuf(isel_context *ctx, Temp src, Temp descriptor, Temp voffset void load_vmem_mubuf(isel_context *ctx, Temp dst, Temp descriptor, Temp voffset, Temp soffset, unsigned base_const_offset, unsigned elem_size_bytes, unsigned num_components, - unsigned stride = 0u, bool allow_combining = true, bool allow_reorder = true) + unsigned stride = 0u, bool allow_combining = true, bool allow_reorder = true, + bool slc = false) { assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8); assert((num_components * elem_size_bytes) == dst.bytes()); @@ -3873,6 +3876,7 @@ void load_vmem_mubuf(isel_context *ctx, Temp dst, Temp descriptor, Temp voffset, LoadEmitInfo info = {Operand(voffset), dst, num_components, elem_size_bytes, descriptor}; info.component_stride = allow_combining ? 0 : stride; info.glc = true; + info.slc = slc; info.swizzle_component_size = allow_combining ? 0 : 4; info.align_mul = MIN2(elem_size_bytes, 4); info.align_offset = 0; @@ -11750,28 +11754,13 @@ void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader, if (!(output_usage_mask & (1 << j))) continue; + Temp val = bld.tmp(v1); unsigned const_offset = offset * args->shader_info->gs.vertices_out * 16 * 4; - Temp voffset = vtx_offset; - if (const_offset >= 4096u) { - voffset = bld.vadd32(bld.def(v1), Operand(const_offset / 4096u * 4096u), voffset); - const_offset %= 4096u; - } - - aco_ptr mubuf{create_instruction(aco_opcode::buffer_load_dword, Format::MUBUF, 3, 1)}; - mubuf->definitions[0] = bld.def(v1); - mubuf->operands[0] = Operand(gsvs_ring); - mubuf->operands[1] = Operand(voffset); - mubuf->operands[2] = Operand(0u); - mubuf->offen = true; - mubuf->offset = const_offset; - mubuf->glc = true; - mubuf->slc = true; - mubuf->dlc = args->options->chip_class >= GFX10; + load_vmem_mubuf(&ctx, val, gsvs_ring, vtx_offset, Temp(), const_offset, 4, 1, + 0u, true, true, true); ctx.outputs.mask[i] |= 1 << j; - ctx.outputs.temps[i * 4u + j] = mubuf->definitions[0].getTemp(); - - bld.insert(std::move(mubuf)); + ctx.outputs.temps[i * 4u + j] = val; offset++; }