mesa/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c

/*
* Copyright 2020 Advanced Micro Devices, Inc.
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* on the rights to use, copy, modify, merge, publish, distribute, sub
* license, and/or sell copies of the Software, and to permit persons to whom
* the Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
* THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

#include "si_pipe.h"
#include "si_shader_internal.h"
#include "sid.h"
#include "util/u_memory.h"
#include "ac_nir.h"
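
/* Return the low (index 0) or high (index 1) signed 16-bit half of a 32-bit
 * integer, sign-extended to 32 bits.
 */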
static LLVMValueRef unpack_sint16(struct si_shader_context *ctx, LLVMValueRef i32, unsigned index)
{
assert(index <= 1);
if (index == 1)
return LLVMBuildAShr(ctx->ac.builder, i32, LLVMConstInt(ctx->ac.i32, 16, 0), "");
return LLVMBuildSExt(ctx->ac.builder, LLVMBuildTrunc(ctx->ac.builder, i32, ctx->ac.i16, ""),
ctx->ac.i32, "");
}
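
/**
 * Load one vertex shader input into out[0..3]. The blit VS reads its inputs
 * from SGPRs; regular vertex shaders fetch them from the vertex buffer,
 * applying format fixups where needed.
 */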
static void load_input_vs(struct si_shader_context *ctx, unsigned input_index, LLVMValueRef out[4])
{
const struct si_shader_info *info = &ctx->shader->selector->info;
unsigned vs_blit_property = info->base.vs.blit_sgprs_amd;
if (vs_blit_property) {
LLVMValueRef vertex_id = ctx->abi.vertex_id;
LLVMValueRef sel_x1 =
LLVMBuildICmp(ctx->ac.builder, LLVMIntULE, vertex_id, ctx->ac.i32_1, "");
/* Use LLVMIntNE, because we have 3 vertices and only
* the middle one should use y2.
*/
LLVMValueRef sel_y1 = LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, vertex_id, ctx->ac.i32_1, "");
unsigned param_vs_blit_inputs = ctx->vs_blit_inputs.arg_index;
if (input_index == 0) {
/* Position: */
LLVMValueRef x1y1 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs);
LLVMValueRef x2y2 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 1);
LLVMValueRef x1 = unpack_sint16(ctx, x1y1, 0);
LLVMValueRef y1 = unpack_sint16(ctx, x1y1, 1);
LLVMValueRef x2 = unpack_sint16(ctx, x2y2, 0);
LLVMValueRef y2 = unpack_sint16(ctx, x2y2, 1);
LLVMValueRef x = LLVMBuildSelect(ctx->ac.builder, sel_x1, x1, x2, "");
LLVMValueRef y = LLVMBuildSelect(ctx->ac.builder, sel_y1, y1, y2, "");
out[0] = LLVMBuildSIToFP(ctx->ac.builder, x, ctx->ac.f32, "");
out[1] = LLVMBuildSIToFP(ctx->ac.builder, y, ctx->ac.f32, "");
out[2] = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 2);
out[3] = ctx->ac.f32_1;
return;
}
/* Color or texture coordinates: */
assert(input_index == 1);
if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
for (int i = 0; i < 4; i++) {
out[i] = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 3 + i);
}
} else {
assert(vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD);
LLVMValueRef x1 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 3);
LLVMValueRef y1 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 4);
LLVMValueRef x2 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 5);
LLVMValueRef y2 = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 6);
out[0] = LLVMBuildSelect(ctx->ac.builder, sel_x1, x1, x2, "");
out[1] = LLVMBuildSelect(ctx->ac.builder, sel_y1, y1, y2, "");
out[2] = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 7);
out[3] = LLVMGetParam(ctx->main_fn, param_vs_blit_inputs + 8);
}
return;
}
/* Set can_speculate=false to help keep all loads grouped together
* for better latency hiding. If it was true, LLVM could move the loads forward
* and accidentally double memory latency by doing:
*
* buffer_load_dword_xyzw
* s_waitcnt vmcnt(0)
* buffer_load_dword_xyzw
* s_waitcnt vmcnt(0)
*
* ... which is what we must prevent at all cost.
*/
const bool can_speculate = false;
unsigned bit_size = info->input[input_index].fp16_lo_hi_valid & 0x1 ? 16 : 32;
LLVMTypeRef int_type = bit_size == 16 ? ctx->ac.i16 : ctx->ac.i32;
LLVMTypeRef float_type = bit_size == 16 ? ctx->ac.f16 : ctx->ac.f32;
unsigned num_vbos_in_user_sgprs = ctx->shader->selector->info.num_vbos_in_user_sgprs;
union si_vs_fix_fetch fix_fetch;
LLVMValueRef vb_desc;
LLVMValueRef vertex_index;
LLVMValueRef tmp;
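/* Select the vertex buffer descriptor: the first few are preloaded into user
 * SGPRs, the rest are loaded from the vertex_buffers descriptor list.
 */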
if (input_index < num_vbos_in_user_sgprs) {
vb_desc = ac_get_arg(&ctx->ac, ctx->vb_descriptors[input_index]);
} else {
unsigned index = input_index - num_vbos_in_user_sgprs;
vb_desc = ac_build_load_to_sgpr(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args.vertex_buffers),
LLVMConstInt(ctx->ac.i32, index, 0));
}
vertex_index = LLVMGetParam(ctx->main_fn, ctx->vertex_index0.arg_index + input_index);
/* Use the open-coded implementation for all loads of doubles and
* of dword-sized data that needs fixups. We need to insert conversion
* code anyway, and the amd/common code does it for us.
*/
bool opencode = ctx->shader->key.ge.mono.vs_fetch_opencode & (1 << input_index);
fix_fetch.bits = ctx->shader->key.ge.mono.vs_fix_fetch[input_index].bits;
if (opencode || (fix_fetch.u.log_size == 3 && fix_fetch.u.format == AC_FETCH_FORMAT_FLOAT) ||
(fix_fetch.u.log_size == 2)) {
tmp = ac_build_opencoded_load_format(&ctx->ac, fix_fetch.u.log_size,
fix_fetch.u.num_channels_m1 + 1, fix_fetch.u.format,
fix_fetch.u.reverse, !opencode, vb_desc, vertex_index,
ctx->ac.i32_0, ctx->ac.i32_0, 0, can_speculate);
for (unsigned i = 0; i < 4; ++i)
out[i] =
LLVMBuildExtractElement(ctx->ac.builder, tmp, LLVMConstInt(ctx->ac.i32, i, false), "");
if (bit_size == 16) {
if (fix_fetch.u.format == AC_FETCH_FORMAT_UINT ||
fix_fetch.u.format == AC_FETCH_FORMAT_SINT) {
for (unsigned i = 0; i < 4; i++)
out[i] = LLVMBuildTrunc(ctx->ac.builder, out[i], ctx->ac.i16, "");
} else {
for (unsigned i = 0; i < 4; i++) {
out[i] = ac_to_float(&ctx->ac, out[i]);
out[i] = LLVMBuildFPTrunc(ctx->ac.builder, out[i], ctx->ac.f16, "");
}
}
}
return;
}
unsigned required_channels = util_last_bit(info->input[input_index].usage_mask);
if (required_channels == 0) {
for (unsigned i = 0; i < 4; ++i)
out[i] = LLVMGetUndef(ctx->ac.f32);
return;
}
/* Do multiple loads for special formats. */
LLVMValueRef fetches[4];
unsigned num_fetches;
unsigned fetch_stride;
unsigned channels_per_fetch;
if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2) {
num_fetches = MIN2(required_channels, 3);
fetch_stride = 1 << fix_fetch.u.log_size;
channels_per_fetch = 1;
} else {
num_fetches = 1;
fetch_stride = 0;
channels_per_fetch = required_channels;
}
for (unsigned i = 0; i < num_fetches; ++i) {
LLVMValueRef voffset = LLVMConstInt(ctx->ac.i32, fetch_stride * i, 0);
fetches[i] = ac_build_buffer_load_format(&ctx->ac, vb_desc, vertex_index, voffset,
channels_per_fetch, 0, can_speculate,
bit_size == 16, false);
}
if (num_fetches == 1 && channels_per_fetch > 1) {
LLVMValueRef fetch = fetches[0];
for (unsigned i = 0; i < channels_per_fetch; ++i) {
tmp = LLVMConstInt(ctx->ac.i32, i, false);
fetches[i] = LLVMBuildExtractElement(ctx->ac.builder, fetch, tmp, "");
}
num_fetches = channels_per_fetch;
channels_per_fetch = 1;
}
for (unsigned i = num_fetches; i < 4; ++i)
fetches[i] = LLVMGetUndef(float_type);
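/* 3-channel formats with 8 or 16 bits per channel were fetched one channel at
 * a time above, so if the shader reads all 4 components, fill W with 1
 * (integer or float depending on the format).
 */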
if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2 && required_channels == 4) {
if (fix_fetch.u.format == AC_FETCH_FORMAT_UINT || fix_fetch.u.format == AC_FETCH_FORMAT_SINT)
fetches[3] = LLVMConstInt(int_type, 1, 0);
else
fetches[3] = LLVMConstReal(float_type, 1);
} else if (fix_fetch.u.log_size == 3 &&
(fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ||
fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED ||
fix_fetch.u.format == AC_FETCH_FORMAT_SINT) &&
required_channels == 4) {
/* For 2_10_10_10, the hardware returns an unsigned value;
* convert it to a signed one.
*/
LLVMValueRef tmp = fetches[3];
LLVMValueRef c30 = LLVMConstInt(int_type, 30, 0);
/* First, recover the sign-extended signed integer value. */
if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED)
tmp = LLVMBuildFPToUI(ctx->ac.builder, tmp, int_type, "");
else
tmp = ac_to_integer(&ctx->ac, tmp);
/* For the integer-like cases, do a natural sign extension.
*
* For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
* and happen to contain 0, 1, 2, 3 as the two LSBs of the
* exponent.
*/
tmp = LLVMBuildShl(
ctx->ac.builder, tmp,
fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ? LLVMConstInt(int_type, 7, 0) : c30, "");
tmp = LLVMBuildAShr(ctx->ac.builder, tmp, c30, "");
/* Convert back to the right type. */
if (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM) {
LLVMValueRef clamp;
LLVMValueRef neg_one = LLVMConstReal(float_type, -1.0);
tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, float_type, "");
clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, tmp, neg_one, "");
tmp = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, tmp, "");
} else if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED) {
tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, float_type, "");
}
fetches[3] = tmp;
}
for (unsigned i = 0; i < 4; ++i)
out[i] = ac_to_float(&ctx->ac, fetches[i]);
}
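
/* ac_shader_abi callback: load a vertex input and return the requested
 * components bitcast to the given type.
 */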
static LLVMValueRef si_load_vs_input(struct ac_shader_abi *abi, unsigned driver_location,
unsigned component, unsigned num_components,
unsigned vertex_index, LLVMTypeRef type)
{
struct si_shader_context *ctx = si_shader_context_from_abi(abi);
LLVMValueRef values[4];
load_input_vs(ctx, driver_location, values);
for (unsigned i = 0; i < 4; i++)
values[i] = LLVMBuildBitCast(ctx->ac.builder, values[i], type, "");
return ac_build_varying_gather_values(&ctx->ac, values, num_components, component);
}
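
/**
 * Store one streamout (transform feedback) output, 1-4 dwords, to its buffer
 * at the given write offset.
 */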
void si_llvm_streamout_store_output(struct si_shader_context *ctx, LLVMValueRef const *so_buffers,
LLVMValueRef const *so_write_offsets,
struct pipe_stream_output *stream_out,
struct si_shader_output_values *shader_out)
{
unsigned buf_idx = stream_out->output_buffer;
unsigned start = stream_out->start_component;
unsigned num_comps = stream_out->num_components;
LLVMValueRef out[4];
assert(num_comps && num_comps <= 4);
if (!num_comps || num_comps > 4)
return;
/* Load the output as int. */
for (int j = 0; j < num_comps; j++) {
assert(stream_out->stream == ((shader_out->vertex_streams >> ((start + j) * 2)) & 0x3));
out[j] = ac_to_integer(&ctx->ac, shader_out->values[start + j]);
}
/* Pack the output. */
LLVMValueRef vdata = NULL;
switch (num_comps) {
case 1: /* as i32 */
vdata = out[0];
break;
case 2: /* as v2i32 */
case 3: /* as v3i32 */
case 4: /* as v4i32 */
vdata = ac_build_gather_values(&ctx->ac, out, num_comps);
break;
}
ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx], vdata, NULL,
LLVMBuildAdd(ctx->ac.builder, so_write_offsets[buf_idx],
LLVMConstInt(ctx->ac.i32, stream_out->dst_offset * 4, 0), ""),
ctx->ac.i32_0, ac_glc | ac_slc);
}

/**
* Write streamout data to buffers for vertex stream @p stream (different
* vertex streams can occur for GS copy shaders).
*/
void si_llvm_emit_streamout(struct si_shader_context *ctx, struct si_shader_output_values *outputs,
unsigned noutput, unsigned stream)
{
struct pipe_stream_output_info *so = &ctx->so;
LLVMBuilderRef builder = ctx->ac.builder;
int i;
/* Get bits [22:16], i.e. (so_param >> 16) & 127; */
LLVMValueRef so_vtx_count = si_unpack_param(ctx, ctx->args.streamout_config, 16, 7);
LLVMValueRef tid = ac_get_thread_id(&ctx->ac);
/* can_emit = tid < so_vtx_count; */
LLVMValueRef can_emit = LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");
/* Emit the streamout code conditionally. This actually avoids
* out-of-bounds buffer access. The hw tells us via the SGPR
* (so_vtx_count) which threads are allowed to emit streamout data. */
ac_build_ifcc(&ctx->ac, can_emit, 6501);
{
/* The buffer offset is computed as follows:
* ByteOffset = streamout_offset[buffer_id]*4 +
* (streamout_write_index + thread_id)*stride[buffer_id] +
* attrib_offset
*/
LLVMValueRef so_write_index = ac_get_arg(&ctx->ac, ctx->args.streamout_write_index);
/* Compute (streamout_write_index + thread_id). */
so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");
/* Load the descriptor and compute the write offset for each
* enabled buffer. */
LLVMValueRef so_write_offset[4] = {};
LLVMValueRef so_buffers[4];
LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->internal_bindings);
for (i = 0; i < 4; i++) {
if (!so->stride[i])
continue;
LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, SI_VS_STREAMOUT_BUF0 + i, 0);
so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
LLVMValueRef so_offset = ac_get_arg(&ctx->ac, ctx->args.streamout_offset[i]);
so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->ac.i32, 4, 0), "");
so_write_offset[i] = ac_build_imad(
&ctx->ac, so_write_index, LLVMConstInt(ctx->ac.i32, so->stride[i] * 4, 0), so_offset);
}
/* Write streamout data. */
for (i = 0; i < so->num_outputs; i++) {
unsigned reg = so->output[i].register_index;
if (reg >= noutput)
continue;
if (stream != so->output[i].stream)
continue;
si_llvm_streamout_store_output(ctx, so_buffers, so_write_offset, &so->output[i],
&outputs[reg]);
}
}
ac_build_endif(&ctx->ac, 6501);
}
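
/**
 * Turn a CLIP_VERTEX output into clip distances by computing dot products
 * with the user clip planes from the internal constant buffer, and set up
 * the corresponding position exports.
 */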
void si_llvm_clipvertex_to_clipdist(struct si_shader_context *ctx,
struct ac_export_args clipdist[2], LLVMValueRef clipvertex[4])
{
unsigned reg_index;
unsigned chan;
unsigned const_chan;
LLVMValueRef base_elt;
LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->internal_bindings);
LLVMValueRef constbuf_index = LLVMConstInt(ctx->ac.i32, SI_VS_CONST_CLIP_PLANES, 0);
LLVMValueRef const_resource = ac_build_load_to_sgpr(&ctx->ac, ptr, constbuf_index);
unsigned clipdist_mask = ctx->shader->selector->info.clipdist_mask &
~ctx->shader->key.ge.opt.kill_clip_distances;
for (reg_index = 0; reg_index < 2; reg_index++) {
struct ac_export_args *args = &clipdist[reg_index];
if (!(clipdist_mask & BITFIELD_RANGE(reg_index * 4, 4)))
continue;
args->out[0] = args->out[1] = args->out[2] = args->out[3] = LLVMGetUndef(ctx->ac.f32);
/* Compute dot products of position and user clip plane vectors */
for (chan = 0; chan < 4; chan++) {
if (!(clipdist_mask & BITFIELD_BIT(reg_index * 4 + chan)))
continue;
for (const_chan = 0; const_chan < 4; const_chan++) {
LLVMValueRef addr =
LLVMConstInt(ctx->ac.i32, ((reg_index * 4 + chan) * 4 + const_chan) * 4, 0);
base_elt = si_buffer_load_const(ctx, const_resource, addr);
args->out[chan] =
ac_build_fmad(&ctx->ac, base_elt, clipvertex[const_chan],
const_chan == 0 ? ctx->ac.f32_0 : args->out[chan]);
}
}
args->enabled_channels = 0xf;
args->valid_mask = 0;
args->done = 0;
args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index;
args->compr = 0;
}
}

/* Initialize arguments for the shader export intrinsic */
static void si_llvm_init_vs_export_args(struct si_shader_context *ctx, const LLVMValueRef *values,
unsigned target, struct ac_export_args *args)
{
args->enabled_channels = 0xf; /* writemask - default is 0xf */
args->valid_mask = 0; /* Specify whether the EXEC mask represents the valid mask */
args->done = 0; /* Specify whether this is the last export */
args->target = target; /* Specify the target we are exporting */
args->compr = false;
memcpy(&args->out[0], values, sizeof(values[0]) * 4);
}

/**
* Vertex color clamping.
*
* This uses a state constant loaded in a user data SGPR and adds an IF
* statement that clamps all colors if the constant is true.
*/
static void si_vertex_color_clamping(struct si_shader_context *ctx,
struct si_shader_output_values *outputs, unsigned noutput)
{
LLVMValueRef addr[SI_MAX_VS_OUTPUTS][4];
bool has_colors = false;
/* Store original colors to alloca variables. */
for (unsigned i = 0; i < noutput; i++) {
if (outputs[i].semantic != VARYING_SLOT_COL0 &&
outputs[i].semantic != VARYING_SLOT_COL1 &&
outputs[i].semantic != VARYING_SLOT_BFC0 &&
outputs[i].semantic != VARYING_SLOT_BFC1)
continue;
for (unsigned j = 0; j < 4; j++)
addr[i][j] = ac_build_alloca_init(&ctx->ac, outputs[i].values[j], "");
has_colors = true;
}
if (!has_colors)
return;
/* The state is in the first bit of the user SGPR. */
LLVMValueRef cond = GET_FIELD(ctx, VS_STATE_CLAMP_VERTEX_COLOR);
cond = LLVMBuildTrunc(ctx->ac.builder, cond, ctx->ac.i1, "");
ac_build_ifcc(&ctx->ac, cond, 6502);
/* Store clamped colors to alloca variables within the conditional block. */
for (unsigned i = 0; i < noutput; i++) {
if (outputs[i].semantic != VARYING_SLOT_COL0 &&
outputs[i].semantic != VARYING_SLOT_COL1 &&
outputs[i].semantic != VARYING_SLOT_BFC0 &&
outputs[i].semantic != VARYING_SLOT_BFC1)
continue;
for (unsigned j = 0; j < 4; j++) {
LLVMBuildStore(ctx->ac.builder, ac_build_clamp(&ctx->ac, outputs[i].values[j]),
addr[i][j]);
}
}
ac_build_endif(&ctx->ac, 6502);
/* Load clamped colors */
for (unsigned i = 0; i < noutput; i++) {
if (outputs[i].semantic != VARYING_SLOT_COL0 &&
outputs[i].semantic != VARYING_SLOT_COL1 &&
outputs[i].semantic != VARYING_SLOT_BFC0 &&
outputs[i].semantic != VARYING_SLOT_BFC1)
continue;
for (unsigned j = 0; j < 4; j++) {
outputs[i].values[j] = LLVMBuildLoad2(ctx->ac.builder, ctx->ac.f32, addr[i][j], "");
}
}
}

/**
* Generate export instructions for hardware VS shader stage or NGG GS stage
* (position and parameter data only).
*
* \param num_export_threads The number of threads that are active for exports. Only used by gfx11.
*/
void si_llvm_build_vs_exports(struct si_shader_context *ctx, LLVMValueRef num_export_threads,
struct si_shader_output_values *outputs, unsigned noutput)
{
struct si_shader *shader = ctx->shader;
struct ac_export_args pos_args[4] = {};
LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL,
viewport_index_value = NULL;
unsigned pos_idx, index;
unsigned clipdist_mask = (shader->selector->info.clipdist_mask &
~shader->key.ge.opt.kill_clip_distances) |
shader->selector->info.culldist_mask;
int i;
si_vertex_color_clamping(ctx, outputs, noutput);
/* Build position exports. */
for (i = 0; i < noutput; i++) {
switch (outputs[i].semantic) {
case VARYING_SLOT_POS:
si_llvm_init_vs_export_args(ctx, outputs[i].values, V_008DFC_SQ_EXP_POS, &pos_args[0]);
break;
case VARYING_SLOT_PSIZ:
psize_value = outputs[i].values[0];
break;
case VARYING_SLOT_LAYER:
layer_value = outputs[i].values[0];
break;
case VARYING_SLOT_VIEWPORT:
viewport_index_value = outputs[i].values[0];
break;
case VARYING_SLOT_EDGE:
edgeflag_value = outputs[i].values[0];
break;
case VARYING_SLOT_CLIP_DIST0:
case VARYING_SLOT_CLIP_DIST1:
index = outputs[i].semantic - VARYING_SLOT_CLIP_DIST0;
if (clipdist_mask & BITFIELD_RANGE(index * 4, 4)) {
si_llvm_init_vs_export_args(ctx, outputs[i].values, V_008DFC_SQ_EXP_POS + 2 + index,
&pos_args[2 + index]);
}
break;
case VARYING_SLOT_CLIP_VERTEX:
si_llvm_clipvertex_to_clipdist(ctx, pos_args + 2, outputs[i].values);
break;
}
}
/* We need to add the position output manually if it's missing. */
if (!pos_args[0].out[0]) {
pos_args[0].enabled_channels = 0xf; /* writemask */
pos_args[0].valid_mask = 0; /* EXEC mask */
pos_args[0].done = 0; /* last export? */
pos_args[0].target = V_008DFC_SQ_EXP_POS;
pos_args[0].compr = 0; /* COMPR flag */
pos_args[0].out[0] = ctx->ac.f32_0; /* X */
pos_args[0].out[1] = ctx->ac.f32_0; /* Y */
pos_args[0].out[2] = ctx->ac.f32_0; /* Z */
pos_args[0].out[3] = ctx->ac.f32_1; /* W */
}
bool writes_psize = shader->selector->info.writes_psize && !shader->key.ge.opt.kill_pointsize;
bool pos_writes_edgeflag = shader->selector->info.writes_edgeflag && !shader->key.ge.as_ngg;
bool writes_vrs = ctx->screen->options.vrs2x2;
/* Write the misc vector (point size, edgeflag, layer, viewport). */
if (writes_psize || pos_writes_edgeflag || writes_vrs ||
shader->selector->info.writes_viewport_index || shader->selector->info.writes_layer) {
pos_args[1].enabled_channels = writes_psize |
((pos_writes_edgeflag | writes_vrs) << 1) |
(shader->selector->info.writes_layer << 2);
pos_args[1].valid_mask = 0; /* EXEC mask */
pos_args[1].done = 0; /* last export? */
pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
pos_args[1].compr = 0; /* COMPR flag */
pos_args[1].out[0] = ctx->ac.f32_0; /* X */
pos_args[1].out[1] = ctx->ac.f32_0; /* Y */
pos_args[1].out[2] = ctx->ac.f32_0; /* Z */
pos_args[1].out[3] = ctx->ac.f32_0; /* W */
if (writes_psize)
pos_args[1].out[0] = psize_value;
if (pos_writes_edgeflag) {
/* The output is a float, but the hw expects an integer
* with the first bit containing the edge flag. */
edgeflag_value = LLVMBuildFPToUI(ctx->ac.builder, edgeflag_value, ctx->ac.i32, "");
edgeflag_value = ac_build_umin(&ctx->ac, edgeflag_value, ctx->ac.i32_1);
/* The LLVM intrinsic expects a float. */
pos_args[1].out[1] = ac_to_float(&ctx->ac, edgeflag_value);
}
if (writes_vrs) {
LLVMValueRef rates;
if (ctx->screen->info.gfx_level >= GFX11) {
/* Bits [2:5] = VRS rate
*
* The range is [0, 15].
*
* If the hw doesn't support VRS 4x4, it will silently use 2x2 instead.
*/
rates = LLVMConstInt(ctx->ac.i32, (V_0283D0_VRS_SHADING_RATE_4X4 << 2), 0);
} else {
/* Bits [2:3] = VRS rate X
* Bits [4:5] = VRS rate Y
*
* The range is [-2, 1]. Values:
* 1: 2x coarser shading rate in that direction.
* 0: normal shading rate
* -1: 2x finer shading rate (sample shading, not directional)
* -2: 4x finer shading rate (sample shading, not directional)
*
* Sample shading can't go above 8 samples, so both numbers can't be -2
* at the same time.
*/
rates = LLVMConstInt(ctx->ac.i32, (1 << 2) | (1 << 4), 0);
}
/* If Pos.W != 1 (typical for non-GUI elements), use 2x2 coarse shading. */
rates = LLVMBuildSelect(ctx->ac.builder,
LLVMBuildFCmp(ctx->ac.builder, LLVMRealUNE,
pos_args[0].out[3], ctx->ac.f32_1, ""),
rates, ctx->ac.i32_0, "");
LLVMValueRef v = ac_to_integer(&ctx->ac, pos_args[1].out[1]);
v = LLVMBuildOr(ctx->ac.builder, v, rates, "");
pos_args[1].out[1] = ac_to_float(&ctx->ac, v);
}
if (ctx->screen->info.gfx_level >= GFX9) {
/* GFX9 has the layer in out.z[10:0] and the viewport
* index in out.z[19:16].
*/
if (shader->selector->info.writes_layer)
pos_args[1].out[2] = layer_value;
if (shader->selector->info.writes_viewport_index) {
LLVMValueRef v = viewport_index_value;
v = ac_to_integer(&ctx->ac, v);
v = LLVMBuildShl(ctx->ac.builder, v, LLVMConstInt(ctx->ac.i32, 16, 0), "");
v = LLVMBuildOr(ctx->ac.builder, v, ac_to_integer(&ctx->ac, pos_args[1].out[2]), "");
pos_args[1].out[2] = ac_to_float(&ctx->ac, v);
pos_args[1].enabled_channels |= 1 << 2;
}
} else {
if (shader->selector->info.writes_layer)
pos_args[1].out[2] = layer_value;
if (shader->selector->info.writes_viewport_index) {
pos_args[1].out[3] = viewport_index_value;
pos_args[1].enabled_channels |= 1 << 3;
}
}
}
for (i = 0; i < 4; i++)
if (pos_args[i].out[0])
shader->info.nr_pos_exports++;
/* GFX10 (Navi1x) skips POS0 exports if EXEC=0 and DONE=0, causing a hang.
* Setting valid_mask=1 prevents it and has no other effect.
*/
if (ctx->screen->info.gfx_level == GFX10)
pos_args[0].valid_mask = 1;
pos_idx = 0;
for (i = 0; i < 4; i++) {
if (!pos_args[i].out[0])
continue;
/* Specify the target we are exporting */
pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;
if (pos_idx == shader->info.nr_pos_exports) {
/* Specify that this is the last export */
pos_args[i].done = 1;
/* If a shader has no param exports, rasterization can start before
* the shader finishes and thus memory stores might not finish before
* the pixel shader starts.
*
* VLOAD is for atomics with return.
*/
if (ctx->screen->info.gfx_level >= GFX10 &&
!shader->info.nr_param_exports &&
shader->selector->info.base.writes_memory)
ac_build_waitcnt(&ctx->ac, AC_WAIT_VLOAD | AC_WAIT_VSTORE);
}
ac_build_export(&ctx->ac, &pos_args[i]);
}
if (!shader->info.nr_param_exports)
return;
/* Build parameter exports. Use 2 loops to export params in ascending order.
* 32 is the maximum number of parameter exports.
*/
struct ac_export_args param_exports[32] = {};
uint64_t vs_output_param_mask = shader->info.vs_output_param_mask;
while (vs_output_param_mask) {
unsigned i = u_bit_scan64(&vs_output_param_mask);
unsigned offset = shader->info.vs_output_param_offset[outputs[i].semantic];
assert(offset <= AC_EXP_PARAM_OFFSET_31);
assert(!param_exports[offset].enabled_channels);
si_llvm_init_vs_export_args(ctx, outputs[i].values, V_008DFC_SQ_EXP_PARAM + offset,
&param_exports[offset]);
}
if (ctx->screen->info.gfx_level >= GFX11) {
/* Store parameter exports to alloca variables, so that we can read them outside this branch. */
for (unsigned i = 0; i < shader->info.nr_param_exports; i++) {
for (unsigned chan = 0; chan < 4; chan++) {
param_exports[i].out[chan] =
ac_build_alloca_init(&ctx->ac, param_exports[i].out[chan], "");
}
}
ac_build_endif(&ctx->ac, 0);
if (!num_export_threads)
num_export_threads = si_unpack_param(ctx, ctx->args.merged_wave_info, 0, 8);
/* We should always store full vec4s in groups of 8 lanes for the best performance even if
* some of them are garbage or have unused components, so align the number of export threads
* to 8.
*/
num_export_threads = LLVMBuildAdd(ctx->ac.builder, num_export_threads,
LLVMConstInt(ctx->ac.i32, 7, 0), "");
num_export_threads = LLVMBuildAnd(ctx->ac.builder, num_export_threads,
LLVMConstInt(ctx->ac.i32, ~7, 0), "");
ac_build_ifcc(&ctx->ac,
LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
ac_get_thread_id(&ctx->ac), num_export_threads, ""), 0);
/* Get the attribute ring address and descriptor. */
LLVMValueRef attr_address;
if (ctx->stage == MESA_SHADER_VERTEX && shader->selector->info.base.vs.blit_sgprs_amd) {
LLVMValueRef ptr =
LLVMBuildPointerCast(ctx->ac.builder,
ac_get_arg(&ctx->ac, ctx->internal_bindings),
LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_CONST_32BIT), "");
attr_address = ac_build_load_to_sgpr(&ctx->ac, ptr,
LLVMConstInt(ctx->ac.i32, SI_GS_ATTRIBUTE_RING * 4, 0));
} else {
attr_address = ac_get_arg(&ctx->ac, ctx->gs_attr_address);
}
unsigned stride = 16 * shader->info.nr_param_exports;
LLVMValueRef attr_desc[4] = {
attr_address,
LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi) |
S_008F04_STRIDE(stride) |
S_008F04_SWIZZLE_ENABLE_GFX11(3) /* 16B */, 0),
LLVMConstInt(ctx->ac.i32, 0xffffffff, 0),
LLVMConstInt(ctx->ac.i32, S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
S_008F0C_FORMAT(V_008F0C_GFX11_FORMAT_32_32_32_32_FLOAT) |
S_008F0C_INDEX_STRIDE(2) /* 32 elements */, 0),
};
LLVMValueRef attr_rsrc = ac_build_gather_values(&ctx->ac, attr_desc, 4);
LLVMValueRef attr_offset = LLVMBuildShl(ctx->ac.builder,
si_unpack_param(ctx, ctx->args.gs_attr_offset, 0, 15),
LLVMConstInt(ctx->ac.i32, 9, 0), ""); /* 512B increments */
LLVMValueRef vindex = gfx10_get_thread_id_in_tg(ctx);
LLVMValueRef soffset[32];
/* Compute scalar offsets first. */
for (unsigned i = 0; i < shader->info.nr_param_exports; i++) {
soffset[i] = LLVMBuildAdd(ctx->ac.builder, attr_offset,
LLVMConstInt(ctx->ac.i32, 32 * i * 16, 0), "");
}
/* Write attributes to the attribute ring buffer. */
for (unsigned i = 0; i < shader->info.nr_param_exports; i++) {
for (unsigned chan = 0; chan < 4; chan++) {
param_exports[i].out[chan] =
LLVMBuildLoad2(ctx->ac.builder, ctx->ac.f32, param_exports[i].out[chan], "");
}
LLVMValueRef vdata = ac_build_gather_values_extended(&ctx->ac, param_exports[i].out,
4, 1, false);
ac_build_buffer_store_dword(&ctx->ac, attr_rsrc, vdata, vindex,
ctx->ac.i32_0, soffset[i], ac_swizzled);
}
} else {
/* Export attributes using parameter exports. */
for (unsigned i = 0; i < shader->info.nr_param_exports; i++)
ac_build_export(&ctx->ac, &param_exports[i]);
}
}
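
/**
 * Finish the VS main function: read back the outputs stored during NIR
 * translation, emit legacy (non-NGG) streamout, optionally append the
 * PrimitiveID output, and build all position/parameter exports.
 */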
void si_llvm_vs_build_end(struct si_shader_context *ctx)
{
struct si_shader_info *info = &ctx->shader->selector->info;
struct si_shader_output_values *outputs = NULL;
LLVMValueRef *addrs = ctx->abi.outputs;
int i, j;
assert(!ctx->shader->is_gs_copy_shader);
assert(info->num_outputs <= AC_LLVM_MAX_OUTPUTS);
outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));
for (i = 0; i < info->num_outputs; i++) {
outputs[i].semantic = info->output_semantic[i];
for (j = 0; j < 4; j++) {
outputs[i].values[j] = LLVMBuildLoad2(ctx->ac.builder, ctx->ac.f32, addrs[4 * i + j], "");
outputs[i].vertex_streams = info->output_streams[i];
}
}
if (!ctx->screen->use_ngg_streamout && ctx->so.num_outputs)
si_llvm_emit_streamout(ctx, outputs, i, 0);
/* Export PrimitiveID. */
if (ctx->shader->key.ge.mono.u.vs_export_prim_id) {
outputs[i].semantic = VARYING_SLOT_PRIMITIVE_ID;
outputs[i].vertex_streams = 0;
outputs[i].values[0] = ac_to_float(&ctx->ac, si_get_primitive_id(ctx, 0));
for (j = 1; j < 4; j++)
outputs[i].values[j] = LLVMConstReal(ctx->ac.f32, 0);
i++;
}
si_llvm_build_vs_exports(ctx, NULL, outputs, i);
FREE(outputs);
}

/**
* Build the vertex shader prolog function.
*
* The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
* All inputs are returned unmodified. The vertex load indices are
* stored after them, which will be used by the API VS for fetching inputs.
*
* For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
* input_v0,
* input_v1,
* input_v2,
* input_v3,
* (VertexID + BaseVertex),
* (InstanceID + StartInstance),
* (InstanceID / 2 + StartInstance)
*/
void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key)
{
LLVMTypeRef *returns;
LLVMValueRef ret, func;
int num_returns, i;
unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs;
unsigned num_input_vgprs =
key->vs_prolog.num_merged_next_stage_vgprs + 4;
struct ac_arg input_sgpr_param[key->vs_prolog.num_input_sgprs];
struct ac_arg input_vgpr_param[10];
LLVMValueRef input_vgprs[10];
unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs + num_input_vgprs;
unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;
memset(&ctx->args, 0, sizeof(ctx->args));
/* 4 preloaded VGPRs + vertex load indices as prolog outputs */
returns = alloca((num_all_input_regs + key->vs_prolog.num_inputs) * sizeof(LLVMTypeRef));
num_returns = 0;
/* Declare input and output SGPRs. */
for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &input_sgpr_param[i]);
returns[num_returns++] = ctx->ac.i32;
}
/* Preloaded VGPRs (outputs must be floats) */
for (i = 0; i < num_input_vgprs; i++) {
ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &input_vgpr_param[i]);
returns[num_returns++] = ctx->ac.f32;
}
/* Vertex load indices. */
for (i = 0; i < key->vs_prolog.num_inputs; i++)
returns[num_returns++] = ctx->ac.f32;
/* Create the function. */
si_llvm_create_func(ctx, "vs_prolog", returns, num_returns, 0);
func = ctx->main_fn;
for (i = 0; i < num_input_vgprs; i++) {
input_vgprs[i] = ac_get_arg(&ctx->ac, input_vgpr_param[i]);
}
if (key->vs_prolog.num_merged_next_stage_vgprs) {
if (!key->vs_prolog.is_monolithic)
ac_init_exec_full_mask(&ctx->ac);
if (key->vs_prolog.as_ls && ctx->screen->info.has_ls_vgpr_init_bug) {
/* If there are no HS threads, SPI loads the LS VGPRs
* starting at VGPR 0. Shift them back to where they
* belong.
*/
LLVMValueRef has_hs_threads =
LLVMBuildICmp(ctx->ac.builder, LLVMIntNE,
si_unpack_param(ctx, input_sgpr_param[3], 8, 8), ctx->ac.i32_0, "");
for (i = 4; i > 0; --i) {
input_vgprs[i + 1] = LLVMBuildSelect(ctx->ac.builder, has_hs_threads,
input_vgprs[i + 1], input_vgprs[i - 1], "");
}
}
}
/* The culling code stored the LDS addresses of the VGPRs into those VGPRs. Load them. */
if (key->vs_prolog.load_vgprs_after_culling) {
for (i = 5; i <= 8; i++) {
bool is_tes_rel_patch_id = i == 7;
LLVMTypeRef t = is_tes_rel_patch_id ? ctx->ac.i8 : ctx->ac.i32;
input_vgprs[i] = LLVMBuildIntToPtr(ctx->ac.builder, input_vgprs[i], LLVMPointerType(t, AC_ADDR_SPACE_LDS), "");
input_vgprs[i] = LLVMBuildLoad2(ctx->ac.builder, t, input_vgprs[i], "");
if (is_tes_rel_patch_id)
input_vgprs[i] = LLVMBuildZExt(ctx->ac.builder, input_vgprs[i], ctx->ac.i32, "");
}
}
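/* VertexID is always the first VS input VGPR; the InstanceID VGPR depends on
 * the gfx level and on whether the shader runs as LS.
 */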
unsigned vertex_id_vgpr = first_vs_vgpr;
unsigned instance_id_vgpr = ctx->screen->info.gfx_level >= GFX10
? first_vs_vgpr + 3
: first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1);
ctx->abi.vertex_id = input_vgprs[vertex_id_vgpr];
ctx->abi.instance_id = input_vgprs[instance_id_vgpr];
/* Copy inputs to outputs. This should be a no-op, as the registers match,
* but it will prevent the compiler from overwriting them unintentionally.
*/
ret = ctx->return_value;
for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
LLVMValueRef p = LLVMGetParam(func, i);
ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, "");
}
for (i = 0; i < num_input_vgprs; i++) {
LLVMValueRef p = input_vgprs[i];
if (i == vertex_id_vgpr)
p = ctx->abi.vertex_id;
else if (i == instance_id_vgpr)
p = ctx->abi.instance_id;
p = ac_to_float(&ctx->ac, p);
ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, key->vs_prolog.num_input_sgprs + i, "");
}
/* Compute vertex load indices from instance divisors. */
LLVMValueRef instance_divisor_constbuf = NULL;
if (key->vs_prolog.states.instance_divisor_is_fetched) {
LLVMValueRef list = si_prolog_get_internal_bindings(ctx);
LLVMValueRef buf_index = LLVMConstInt(ctx->ac.i32, SI_VS_CONST_INSTANCE_DIVISORS, 0);
instance_divisor_constbuf = ac_build_load_to_sgpr(&ctx->ac, list, buf_index);
}
for (i = 0; i < key->vs_prolog.num_inputs; i++) {
bool divisor_is_one = key->vs_prolog.states.instance_divisor_is_one & (1u << i);
bool divisor_is_fetched = key->vs_prolog.states.instance_divisor_is_fetched & (1u << i);
LLVMValueRef index = NULL;
if (divisor_is_one) {
index = ctx->abi.instance_id;
} else if (divisor_is_fetched) {
LLVMValueRef udiv_factors[4];
for (unsigned j = 0; j < 4; j++) {
udiv_factors[j] = si_buffer_load_const(ctx, instance_divisor_constbuf,
LLVMConstInt(ctx->ac.i32, i * 16 + j * 4, 0));
udiv_factors[j] = ac_to_integer(&ctx->ac, udiv_factors[j]);
}
/* The faster NUW version doesn't work when InstanceID == UINT_MAX.
* Such InstanceID might not be achievable in a reasonable time though.
*/
index = ac_build_fast_udiv_nuw(&ctx->ac, ctx->abi.instance_id, udiv_factors[0],
udiv_factors[1], udiv_factors[2], udiv_factors[3]);
}
if (divisor_is_one || divisor_is_fetched) {
/* Add StartInstance. */
index =
LLVMBuildAdd(ctx->ac.builder, index,
LLVMGetParam(ctx->main_fn, user_sgpr_base + SI_SGPR_START_INSTANCE), "");
} else {
/* VertexID + BaseVertex */
index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.vertex_id,
LLVMGetParam(func, user_sgpr_base + SI_SGPR_BASE_VERTEX), "");
}
index = ac_to_float(&ctx->ac, index);
ret = LLVMBuildInsertValue(ctx->ac.builder, ret, index, ctx->args.arg_count + i, "");
}
si_llvm_build_ret(ctx, ret);
}
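
/* Register the VS input-loading callback used by the common NIR-to-LLVM code. */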
void si_llvm_init_vs_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader)
{
ctx->abi.load_inputs = si_load_vs_input;
}