diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 11981fbe57d..3b1d8ab5799 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -2938,6 +2938,7 @@ struct LoadEmitInfo {
    unsigned align_offset = 0;
 
    bool glc = false;
+   bool slc = false;
    unsigned swizzle_component_size = 0;
    memory_sync_info sync;
    Temp soffset = Temp(0, s1);
@@ -3358,6 +3359,7 @@ Temp mubuf_load_callback(Builder& bld, const LoadEmitInfo &info,
    mubuf->offen = (offset.type() == RegType::vgpr);
    mubuf->glc = info.glc;
    mubuf->dlc = info.glc && bld.program->chip_class >= GFX10;
+   mubuf->slc = info.slc;
    mubuf->sync = info.sync;
    mubuf->offset = const_offset;
    mubuf->swizzled = info.swizzle_component_size != 0;
@@ -3862,7 +3864,8 @@ void store_vmem_mubuf(isel_context *ctx, Temp src, Temp descriptor, Temp voffset
 
 void load_vmem_mubuf(isel_context *ctx, Temp dst, Temp descriptor, Temp voffset, Temp soffset,
                      unsigned base_const_offset, unsigned elem_size_bytes, unsigned num_components,
-                     unsigned stride = 0u, bool allow_combining = true, bool allow_reorder = true)
+                     unsigned stride = 0u, bool allow_combining = true, bool allow_reorder = true,
+                     bool slc = false)
 {
    assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8);
    assert((num_components * elem_size_bytes) == dst.bytes());
@@ -3873,6 +3876,7 @@ void load_vmem_mubuf(isel_context *ctx, Temp dst, Temp descriptor, Temp voffset,
    LoadEmitInfo info = {Operand(voffset), dst, num_components, elem_size_bytes, descriptor};
    info.component_stride = allow_combining ? 0 : stride;
    info.glc = true;
+   info.slc = slc;
    info.swizzle_component_size = allow_combining ? 0 : 4;
    info.align_mul = MIN2(elem_size_bytes, 4);
    info.align_offset = 0;
@@ -11750,28 +11754,13 @@ void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
             if (!(output_usage_mask & (1 << j)))
                continue;
 
+            Temp val = bld.tmp(v1);
             unsigned const_offset = offset * args->shader_info->gs.vertices_out * 16 * 4;
-            Temp voffset = vtx_offset;
-            if (const_offset >= 4096u) {
-               voffset = bld.vadd32(bld.def(v1), Operand(const_offset / 4096u * 4096u), voffset);
-               const_offset %= 4096u;
-            }
-
-            aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(aco_opcode::buffer_load_dword, Format::MUBUF, 3, 1)};
-            mubuf->definitions[0] = bld.def(v1);
-            mubuf->operands[0] = Operand(gsvs_ring);
-            mubuf->operands[1] = Operand(voffset);
-            mubuf->operands[2] = Operand(0u);
-            mubuf->offen = true;
-            mubuf->offset = const_offset;
-            mubuf->glc = true;
-            mubuf->slc = true;
-            mubuf->dlc = args->options->chip_class >= GFX10;
+            load_vmem_mubuf(&ctx, val, gsvs_ring, vtx_offset, Temp(), const_offset, 4, 1,
+                            0u, true, true, true);
 
             ctx.outputs.mask[i] |= 1 << j;
-            ctx.outputs.temps[i * 4u + j] = mubuf->definitions[0].getTemp();
-
-            bld.insert(std::move(mubuf));
+            ctx.outputs.temps[i * 4u + j] = val;
 
             offset++;
          }