mesa/src/amd/vulkan/radv_device_generated_comma...

/*
 * Copyright © 2021 Google
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "radv_meta.h"
#include "radv_private.h"

#include "nir_builder.h"

static void
radv_get_sequence_size(const struct radv_indirect_command_layout *layout,
                       const struct radv_graphics_pipeline *pipeline, uint32_t *cmd_size,
                       uint32_t *upload_size)
{
   *cmd_size = 0;
   *upload_size = 0;

   if (layout->bind_vbo_mask) {
      *upload_size += 16 * util_bitcount(pipeline->vb_desc_usage_mask);

     /* One PKT3_SET_SH_REG for emitting VBO pointer (32-bit) */
      *cmd_size += 3 * 4;
   }

   if (layout->push_constant_mask) {
      bool need_copy = false;

      for (unsigned i = 0; i < ARRAY_SIZE(pipeline->base.shaders); ++i) {
         if (!pipeline->base.shaders[i])
            continue;

         struct radv_userdata_locations *locs = &pipeline->base.shaders[i]->info.user_sgprs_locs;
         if (locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx >= 0) {
            /* One PKT3_SET_SH_REG for emitting push constants pointer (32-bit) */
            *cmd_size += 3 * 4;
            need_copy = true;
         }
         if (locs->shader_data[AC_UD_INLINE_PUSH_CONSTANTS].sgpr_idx >= 0)
            /* One PKT3_SET_SH_REG writing all inline push constants. */
            *cmd_size += (2 + locs->shader_data[AC_UD_INLINE_PUSH_CONSTANTS].num_sgprs) * 4;
      }
      if (need_copy)
         *upload_size +=
            align(pipeline->base.push_constant_size + 16 * pipeline->base.dynamic_offset_count, 16);
   }

   if (layout->binds_index_buffer) {
      /* Index type write (normal reg write) + index buffer base write (64-bits, but special packet
       * so only 1 word overhead) + index buffer size (again, special packet so only 1 word
       * overhead)
       */
      *cmd_size += (3 + 3 + 2) * 4;
   }

   if (layout->indexed) {
      /* userdata writes + instance count + indexed draw */
      *cmd_size += (5 + 2 + 5) * 4;
   } else {
      /* userdata writes + instance count + non-indexed draw */
      *cmd_size += (5 + 2 + 3) * 4;
   }

   if (layout->binds_state) {
      /* One PKT3_SET_CONTEXT_REG (PA_SU_SC_MODE_CNTL) */
      *cmd_size += 3 * 4;

      if (pipeline->base.device->physical_device->rad_info.has_gfx9_scissor_bug) {
         /* 1 reg write of 4 regs + 1 reg write of 2 regs per scissor */
         *cmd_size += (8 + 2 * MAX_SCISSORS) * 4;
      }
   }
}

static uint32_t
radv_align_cmdbuf_size(uint32_t size)
{
   return align(MAX2(1, size), 256);
}

uint32_t
radv_get_indirect_cmdbuf_size(const VkGeneratedCommandsInfoNV *cmd_info)
{
   VK_FROM_HANDLE(radv_indirect_command_layout, layout, cmd_info->indirectCommandsLayout);
   VK_FROM_HANDLE(radv_pipeline, pipeline, cmd_info->pipeline);
   struct radv_graphics_pipeline *graphics_pipeline = radv_pipeline_to_graphics(pipeline);

   uint32_t cmd_size, upload_size;
   radv_get_sequence_size(layout, graphics_pipeline, &cmd_size, &upload_size);
   return radv_align_cmdbuf_size(cmd_size * cmd_info->sequencesCount);
}

enum radv_dgc_token_type {
   RADV_DGC_INDEX_BUFFER,
   RADV_DGC_DRAW,
   RADV_DGC_INDEXED_DRAW,
};

struct radv_dgc_token {
   uint16_t type; /* enum radv_dgc_token_type, but making the size explicit */
   uint16_t offset; /* offset in the input stream */
   union {
      struct {
         uint16_t vtx_base_sgpr;
      } draw;
      struct {
         uint16_t index_size;
         uint16_t vtx_base_sgpr;
         uint32_t max_index_count;
      } indexed_draw;
   };
};

struct radv_dgc_params {
   uint32_t cmd_buf_stride;
   uint32_t cmd_buf_size;
   uint32_t upload_stride;
   uint32_t upload_addr;
   uint32_t sequence_count;
   uint32_t stream_stride;

   /* draw info */
   uint16_t draw_indexed;
   uint16_t draw_params_offset;
   uint16_t base_index_size;
   uint16_t vtx_base_sgpr;
   uint32_t max_index_count;

   /* bind index buffer info. Valid if base_index_size == 0 && draw_indexed */
   uint16_t index_buffer_offset;

   uint8_t vbo_cnt;

   uint8_t const_copy;

   /* Which VBOs are set in this indirect layout. */
   uint32_t vbo_bind_mask;

   uint16_t vbo_reg;
   uint16_t const_copy_size;

   uint64_t push_constant_mask;

   uint32_t ibo_type_32;
   uint32_t ibo_type_8;

   uint16_t push_constant_shader_cnt;

   uint16_t emit_state;
   uint32_t pa_su_sc_mode_cntl_base;
   uint16_t state_offset;
   uint16_t scissor_count;
   uint16_t scissor_offset; /* in parameter buffer. */
};

enum {
   DGC_USES_DRAWID = 1u << 14,
   DGC_USES_BASEINSTANCE = 1u << 15,
};

enum {
   DGC_DYNAMIC_STRIDE = 1u << 15,
};

enum {
   DGC_DESC_STREAM,
   DGC_DESC_PREPARE,
   DGC_DESC_PARAMS,
   DGC_DESC_COUNT,
   DGC_NUM_DESCS,
};

struct dgc_cmdbuf {
   nir_ssa_def *descriptor;
   nir_variable *offset;
};

static void
dgc_emit(nir_builder *b, struct dgc_cmdbuf *cs, nir_ssa_def *value)
{
   assert(value->bit_size >= 32);
   nir_ssa_def *offset = nir_load_var(b, cs->offset);
   nir_store_ssbo(b, value, cs->descriptor, offset,.access = ACCESS_NON_READABLE);
   nir_store_var(b, cs->offset, nir_iadd_imm(b, offset, value->num_components * value->bit_size / 8), 0x1);
}


#define load_param32(b, field)                                                                     \
   nir_load_push_constant((b), 1, 32, nir_imm_int((b), 0),                                         \
                          .base = offsetof(struct radv_dgc_params, field), .range = 4)

#define load_param16(b, field)                                                                     \
   nir_ubfe_imm(                                                                                   \
      (b),                                                                                         \
      nir_load_push_constant((b), 1, 32, nir_imm_int((b), 0),                                      \
                             .base = (offsetof(struct radv_dgc_params, field) & ~3), .range = 4),  \
      (offsetof(struct radv_dgc_params, field) & 2) * 8, 16)

#define load_param8(b, field)                                                                      \
   nir_ubfe_imm(                                                                                   \
      (b),                                                                                         \
      nir_load_push_constant((b), 1, 32, nir_imm_int((b), 0),                                      \
                             .base = (offsetof(struct radv_dgc_params, field) & ~3), .range = 4),  \
      (offsetof(struct radv_dgc_params, field) & 3) * 8, 8)

#define load_param64(b, field)                                                                     \
   nir_pack_64_2x32((b), nir_load_push_constant((b), 2, 32, nir_imm_int((b), 0),                   \
                          .base = offsetof(struct radv_dgc_params, field), .range = 8))

static nir_ssa_def *
nir_pkt3(nir_builder *b, unsigned op, nir_ssa_def *len)
{
   len = nir_iand_imm(b, len, 0x3fff);
   return nir_ior_imm(b, nir_ishl_imm(b, len, 16), PKT_TYPE_S(3) | PKT3_IT_OPCODE_S(op));
}

static void
dgc_emit_userdata_vertex(nir_builder *b, struct dgc_cmdbuf *cs, nir_ssa_def *vtx_base_sgpr,
                         nir_ssa_def *first_vertex, nir_ssa_def *first_instance, nir_ssa_def *drawid)
{
   vtx_base_sgpr = nir_u2u32(b, vtx_base_sgpr);
   nir_ssa_def *has_drawid =
      nir_test_mask(b, vtx_base_sgpr, DGC_USES_DRAWID);
   nir_ssa_def *has_baseinstance =
      nir_test_mask(b, vtx_base_sgpr, DGC_USES_BASEINSTANCE);

   nir_ssa_def *pkt_cnt = nir_imm_int(b, 1);
   pkt_cnt = nir_bcsel(b, has_drawid, nir_iadd_imm(b, pkt_cnt, 1), pkt_cnt);
   pkt_cnt = nir_bcsel(b, has_baseinstance, nir_iadd_imm(b, pkt_cnt, 1), pkt_cnt);

   nir_ssa_def *values[5] = {
      nir_pkt3(b, PKT3_SET_SH_REG, pkt_cnt), nir_iand_imm(b, vtx_base_sgpr, 0x3FFF), first_vertex,
      nir_imm_int(b, PKT3_NOP_PAD),          nir_imm_int(b, PKT3_NOP_PAD),
   };

   values[3] = nir_bcsel(b, nir_ior(b, has_drawid, has_baseinstance),
                         nir_bcsel(b, has_drawid, drawid, first_instance), values[4]);
   values[4] = nir_bcsel(b, nir_iand(b, has_drawid, has_baseinstance), first_instance, values[4]);

   dgc_emit(b, cs, nir_vec(b, values, 5));
}

static void
dgc_emit_instance_count(nir_builder *b, struct dgc_cmdbuf *cs, nir_ssa_def *instance_count)
{
   nir_ssa_def *values[2] = {nir_imm_int(b, PKT3(PKT3_NUM_INSTANCES, 0, false)), instance_count};

   dgc_emit(b, cs, nir_vec(b, values, 2));
}

static void
dgc_emit_draw_indexed(nir_builder *b, struct dgc_cmdbuf *cs, nir_ssa_def *index_offset,
                      nir_ssa_def *index_count, nir_ssa_def *max_index_count)
{
   nir_ssa_def *values[5] = {nir_imm_int(b, PKT3(PKT3_DRAW_INDEX_OFFSET_2, 3, false)),
                             max_index_count, index_offset, index_count,
                             nir_imm_int(b, V_0287F0_DI_SRC_SEL_DMA)};

   dgc_emit(b, cs, nir_vec(b, values, 5));
}

static void
dgc_emit_draw(nir_builder *b, struct dgc_cmdbuf *cs, nir_ssa_def *vertex_count)
{
   nir_ssa_def *values[3] = {nir_imm_int(b, PKT3(PKT3_DRAW_INDEX_AUTO, 1, false)), vertex_count,
                             nir_imm_int(b, V_0287F0_DI_SRC_SEL_AUTO_INDEX)};

   dgc_emit(b, cs, nir_vec(b, values, 3));
}

static void
build_dgc_buffer_tail(nir_builder *b, nir_ssa_def *sequence_count)
{
   nir_ssa_def *global_id = get_global_ids(b, 1);

   nir_ssa_def *cmd_buf_stride = load_param32(b, cmd_buf_stride);
   nir_ssa_def *cmd_buf_size = load_param32(b, cmd_buf_size);

   nir_push_if(b, nir_ieq_imm(b, global_id, 0));
   {
      nir_ssa_def *cmd_buf_tail_start = nir_imul(b, cmd_buf_stride, sequence_count);

      nir_variable *offset =
         nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "offset");
      nir_store_var(b, offset, cmd_buf_tail_start, 0x1);

      nir_ssa_def *dst_buf = radv_meta_load_descriptor(b, 0, DGC_DESC_PREPARE);
      nir_push_loop(b);
      {
         nir_ssa_def *curr_offset = nir_load_var(b, offset);
         const unsigned MAX_PACKET_WORDS = 0x3FFC;

         nir_push_if(b, nir_ieq(b, curr_offset, cmd_buf_size));
         {
            nir_jump(b, nir_jump_break);
         }
         nir_pop_if(b, NULL);

         nir_ssa_def *packet_size = nir_isub(b, cmd_buf_size, curr_offset);
         packet_size = nir_umin(b, packet_size, nir_imm_int(b, MAX_PACKET_WORDS * 4));

         nir_ssa_def *len = nir_ushr_imm(b, packet_size, 2);
         len = nir_iadd_imm(b, len, -2);
         nir_ssa_def *packet = nir_pkt3(b, PKT3_NOP, len);

         nir_store_ssbo(b, packet, dst_buf, curr_offset, .access = ACCESS_NON_READABLE);
         nir_store_var(b, offset, nir_iadd(b, curr_offset, packet_size), 0x1);
      }
      nir_pop_loop(b, NULL);
   }
   nir_pop_if(b, NULL);
}

static nir_shader *
build_dgc_prepare_shader(struct radv_device *dev)
{
   nir_builder b = radv_meta_init_shader(dev, MESA_SHADER_COMPUTE, "meta_dgc_prepare");
   b.shader->info.workgroup_size[0] = 64;

   nir_ssa_def *global_id = get_global_ids(&b, 1);

   nir_ssa_def *sequence_id = global_id;

   nir_ssa_def *cmd_buf_stride = load_param32(&b, cmd_buf_stride);
   nir_ssa_def *sequence_count = load_param32(&b, sequence_count);
   nir_ssa_def *stream_stride = load_param32(&b, stream_stride);

   nir_variable *count_var = nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "sequence_count");
   nir_store_var(&b, count_var, sequence_count, 0x1);

   nir_push_if(&b, nir_ieq_imm(&b, sequence_count, UINT32_MAX));
   {
      nir_ssa_def *count_buf = radv_meta_load_descriptor(&b, 0, DGC_DESC_COUNT);
      nir_ssa_def *cnt = nir_load_ssbo(&b, 1, 32, count_buf, nir_imm_int(&b, 0));
      nir_store_var(&b, count_var, cnt, 0x1);
   }
   nir_pop_if(&b, NULL);

   sequence_count = nir_load_var(&b, count_var);

   nir_push_if(&b, nir_ult(&b, sequence_id, sequence_count));
   {
      struct dgc_cmdbuf cmd_buf = {
         .descriptor = radv_meta_load_descriptor(&b, 0, DGC_DESC_PREPARE),
         .offset = nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "cmd_buf_offset"),
      };
      nir_store_var(&b, cmd_buf.offset, nir_imul(&b, global_id, cmd_buf_stride), 1);
      nir_ssa_def *cmd_buf_end = nir_iadd(&b, nir_load_var(&b, cmd_buf.offset), cmd_buf_stride);

      nir_ssa_def *stream_buf = radv_meta_load_descriptor(&b, 0, DGC_DESC_STREAM);
      nir_ssa_def *stream_base = nir_imul(&b, sequence_id, stream_stride);

      nir_variable *upload_offset =
         nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "upload_offset");
      nir_store_var(&b, upload_offset,
                    nir_iadd(&b, load_param32(&b, cmd_buf_size),
                             nir_imul(&b, load_param32(&b, upload_stride), sequence_id)),
                    0x1);

      nir_ssa_def *vbo_bind_mask = load_param32(&b, vbo_bind_mask);
      nir_ssa_def *vbo_cnt = load_param8(&b, vbo_cnt);
      nir_push_if(&b, nir_ine_imm(&b, vbo_bind_mask, 0));
      {
         nir_variable *vbo_idx =
            nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "vbo_idx");
         nir_store_var(&b, vbo_idx, nir_imm_int(&b, 0), 0x1);

         nir_push_loop(&b);
         {
            nir_push_if(&b, nir_uge(&b, nir_load_var(&b, vbo_idx), vbo_cnt));
            {
               nir_jump(&b, nir_jump_break);
            }
            nir_pop_if(&b, NULL);

            nir_ssa_def *vbo_offset = nir_imul_imm(&b, nir_load_var(&b, vbo_idx), 16);
            nir_variable *vbo_data =
               nir_variable_create(b.shader, nir_var_shader_temp, glsl_uvec4_type(), "vbo_data");

            nir_ssa_def *param_buf = radv_meta_load_descriptor(&b, 0, DGC_DESC_PARAMS);
            nir_store_var(&b, vbo_data,
                          nir_load_ssbo(&b, 4, 32, param_buf, vbo_offset), 0xf);

            nir_ssa_def *vbo_override =
               nir_ine_imm(&b,
                       nir_iand(&b, vbo_bind_mask,
                                nir_ishl(&b, nir_imm_int(&b, 1), nir_load_var(&b, vbo_idx))),
                       0);
            nir_push_if(&b, vbo_override);
            {
               nir_ssa_def *vbo_offset_offset =
                  nir_iadd(&b, nir_imul_imm(&b, vbo_cnt, 16),
                           nir_imul_imm(&b, nir_load_var(&b, vbo_idx), 8));
               nir_ssa_def *vbo_over_data =
                  nir_load_ssbo(&b, 2, 32, param_buf, vbo_offset_offset);
               nir_ssa_def *stream_offset = nir_iadd(
                  &b, stream_base, nir_iand_imm(&b, nir_channel(&b, vbo_over_data, 0), 0x7FFF));
               nir_ssa_def *stream_data =
                  nir_load_ssbo(&b, 4, 32, stream_buf, stream_offset);

               nir_ssa_def *va = nir_pack_64_2x32(&b, nir_channels(&b, stream_data, 0x3));
               nir_ssa_def *size = nir_channel(&b, stream_data, 2);
               nir_ssa_def *stride = nir_channel(&b, stream_data, 3);

               nir_ssa_def *dyn_stride = nir_test_mask(&b, nir_channel(&b, vbo_over_data, 0), DGC_DYNAMIC_STRIDE);
               nir_ssa_def *old_stride =
                  nir_ubfe_imm(&b, nir_channel(&b, nir_load_var(&b, vbo_data), 1), 16, 14);
               stride = nir_bcsel(&b, dyn_stride, stride, old_stride);

               nir_ssa_def *use_per_attribute_vb_descs =
                  nir_test_mask(&b, nir_channel(&b, vbo_over_data, 0), 1u << 31);
               nir_variable *num_records = nir_variable_create(b.shader, nir_var_shader_temp,
                                                               glsl_uint_type(), "num_records");
               nir_store_var(&b, num_records, size, 0x1);

               nir_push_if(&b, use_per_attribute_vb_descs);
               {
                  nir_ssa_def *attrib_end = nir_ubfe_imm(&b, nir_channel(&b, vbo_over_data, 1), 16,
                                                         16);
                  nir_ssa_def *attrib_index_offset =
                     nir_ubfe_imm(&b, nir_channel(&b, vbo_over_data, 1), 0, 16);

                  nir_push_if(&b, nir_ult(&b, nir_load_var(&b, num_records), attrib_end));
                  {
                     nir_store_var(&b, num_records, nir_imm_int(&b, 0), 0x1);
                  }
                  nir_push_else(&b, NULL);
                  nir_push_if(&b, nir_ieq_imm(&b, stride, 0));
                  {
                     nir_store_var(&b, num_records, nir_imm_int(&b, 1), 0x1);
                  }
                  nir_push_else(&b, NULL);
                  {
                     nir_ssa_def *r = nir_iadd(
                        &b,
                        nir_iadd_imm(
                           &b,
                           nir_udiv(&b, nir_isub(&b, nir_load_var(&b, num_records), attrib_end),
                                    stride),
                           1),
                        attrib_index_offset);
                     nir_store_var(&b, num_records, r, 0x1);
                  }
                  nir_pop_if(&b, NULL);
                  nir_pop_if(&b, NULL);

                  nir_ssa_def *convert_cond =
                     nir_ine_imm(&b, nir_load_var(&b, num_records), 0);
                  if (dev->physical_device->rad_info.gfx_level == GFX9)
                     convert_cond = nir_imm_bool(&b, false);
                  else if (dev->physical_device->rad_info.gfx_level != GFX8)
                     convert_cond =
                        nir_iand(&b, convert_cond, nir_ieq_imm(&b, stride, 0));

                  nir_ssa_def *new_records = nir_iadd(
                     &b, nir_imul(&b, nir_iadd_imm(&b, nir_load_var(&b, num_records), -1), stride),
                     attrib_end);
                  new_records =
                     nir_bcsel(&b, convert_cond, new_records, nir_load_var(&b, num_records));
                  nir_store_var(&b, num_records, new_records, 0x1);
               }
               nir_push_else(&b, NULL);
               {
                  if (dev->physical_device->rad_info.gfx_level != GFX8) {
                     nir_push_if(&b, nir_ine_imm(&b, stride, 0));
                     {
                        nir_ssa_def *r = nir_iadd(&b, nir_load_var(&b, num_records),
                                                  nir_iadd_imm(&b, stride, -1));
                        nir_store_var(&b, num_records, nir_udiv(&b, r, stride), 0x1);
                     }
                     nir_pop_if(&b, NULL);
                  }
               }
               nir_pop_if(&b, NULL);

               nir_ssa_def *rsrc_word3 = nir_channel(&b, nir_load_var(&b, vbo_data), 3);
               if (dev->physical_device->rad_info.gfx_level >= GFX10) {
                  nir_ssa_def *oob_select = nir_bcsel(
                     &b, nir_ieq_imm(&b, stride, 0), nir_imm_int(&b, V_008F0C_OOB_SELECT_RAW),
                     nir_imm_int(&b, V_008F0C_OOB_SELECT_STRUCTURED));
                  rsrc_word3 = nir_iand_imm(&b, rsrc_word3, C_008F0C_OOB_SELECT);
                  rsrc_word3 = nir_ior(&b, rsrc_word3, nir_ishl_imm(&b, oob_select, 28));
               }

               nir_ssa_def *va_hi = nir_iand_imm(&b, nir_unpack_64_2x32_split_y(&b, va), 0xFFFF);
               stride = nir_iand_imm(&b, stride, 0x3FFF);
               nir_ssa_def *new_vbo_data[4] = {nir_unpack_64_2x32_split_x(&b, va),
                                               nir_ior(&b, nir_ishl_imm(&b, stride, 16), va_hi),
                                               nir_load_var(&b, num_records), rsrc_word3};
               nir_store_var(&b, vbo_data, nir_vec(&b, new_vbo_data, 4), 0xf);
            }
            nir_pop_if(&b, NULL);

            /* On GFX9, it seems bounds checking is disabled if both
             * num_records and stride are zero. This doesn't seem necessary on GFX8, GFX10 and
             * GFX10.3 but it doesn't hurt.
             */
            nir_ssa_def *num_records = nir_channel(&b, nir_load_var(&b, vbo_data), 2);
            nir_ssa_def *buf_va = nir_iand_imm(
               &b, nir_pack_64_2x32(&b, nir_channels(&b, nir_load_var(&b, vbo_data), 0x3)),
               (1ull << 48) - 1ull);
            nir_push_if(&b,
                        nir_ior(&b, nir_ieq_imm(&b, num_records, 0), nir_ieq_imm(&b, buf_va, 0)));
            {
               nir_ssa_def *new_vbo_data[4] = {nir_imm_int(&b, 0), nir_imm_int(&b, 0),
                                               nir_imm_int(&b, 0), nir_imm_int(&b, 0)};
               nir_store_var(&b, vbo_data, nir_vec(&b, new_vbo_data, 4), 0xf);
            }
            nir_pop_if(&b, NULL);

            nir_ssa_def *upload_off = nir_iadd(&b, nir_load_var(&b, upload_offset), vbo_offset);
            nir_store_ssbo(&b, nir_load_var(&b, vbo_data), cmd_buf.descriptor, upload_off, .access = ACCESS_NON_READABLE);
            nir_store_var(&b, vbo_idx, nir_iadd_imm(&b, nir_load_var(&b, vbo_idx), 1), 0x1);
         }
         nir_pop_loop(&b, NULL);
         nir_ssa_def *packet[3] = {
            nir_imm_int(&b, PKT3(PKT3_SET_SH_REG, 1, 0)), load_param16(&b, vbo_reg),
            nir_iadd(&b, load_param32(&b, upload_addr), nir_load_var(&b, upload_offset))};

         dgc_emit(&b, &cmd_buf, nir_vec(&b, packet, 3));

         nir_store_var(&b, upload_offset,
                       nir_iadd(&b, nir_load_var(&b, upload_offset), nir_imul_imm(&b, vbo_cnt, 16)),
                       0x1);
      }
      nir_pop_if(&b, NULL);


      nir_ssa_def *push_const_mask = load_param64(&b, push_constant_mask);
      nir_push_if(&b, nir_ine_imm(&b, push_const_mask, 0));
      {
         nir_ssa_def *const_copy = nir_ine_imm(&b, load_param8(&b, const_copy), 0);
         nir_ssa_def *const_copy_size = load_param16(&b, const_copy_size);
         nir_ssa_def *const_copy_words = nir_ushr_imm(&b, const_copy_size, 2);
         const_copy_words = nir_bcsel(&b, const_copy, const_copy_words, nir_imm_int(&b, 0));

         nir_variable *idx =
            nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "const_copy_idx");
         nir_store_var(&b, idx, nir_imm_int(&b, 0), 0x1);

         nir_ssa_def *param_buf = radv_meta_load_descriptor(&b, 0, DGC_DESC_PARAMS);
         nir_ssa_def *param_offset = nir_imul_imm(&b, vbo_cnt, 24);
         nir_ssa_def *param_offset_offset = nir_iadd_imm(&b, param_offset, MESA_VULKAN_SHADER_STAGES * 12);
         nir_ssa_def *param_const_offset = nir_iadd_imm(&b, param_offset, MAX_PUSH_CONSTANTS_SIZE + MESA_VULKAN_SHADER_STAGES * 12);
         nir_push_loop(&b);
         {
            nir_ssa_def *cur_idx = nir_load_var(&b, idx);
            nir_push_if(&b, nir_uge(&b, cur_idx, const_copy_words));
            {
               nir_jump(&b, nir_jump_break);
            }
            nir_pop_if(&b, NULL);

            nir_variable *data = nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "copy_data");

            nir_ssa_def *update = nir_iand(&b, push_const_mask, nir_ishl(&b, nir_imm_int64(&b, 1), cur_idx));
            update = nir_bcsel(
               &b, nir_ult(&b, cur_idx, nir_imm_int(&b, 64 /* bits in push_const_mask */)), update,
               nir_imm_int64(&b, 0));

            nir_push_if(&b, nir_ine_imm(&b, update, 0));
            {
               nir_ssa_def *stream_offset = nir_load_ssbo(
                  &b, 1, 32, param_buf,
                  nir_iadd(&b, param_offset_offset, nir_ishl_imm(&b, cur_idx, 2)));
               nir_ssa_def *new_data = nir_load_ssbo(&b, 1, 32, stream_buf, nir_iadd(&b, stream_base, stream_offset));
               nir_store_var(&b, data, new_data, 0x1);
            }
            nir_push_else(&b, NULL);
            {
               nir_store_var(
                  &b, data,
                  nir_load_ssbo(&b, 1, 32, param_buf,
                                nir_iadd(&b, param_const_offset, nir_ishl_imm(&b, cur_idx, 2))),
                  0x1);
            }
            nir_pop_if(&b, NULL);

            nir_store_ssbo(
               &b, nir_load_var(&b, data), cmd_buf.descriptor,
               nir_iadd(&b, nir_load_var(&b, upload_offset), nir_ishl_imm(&b, cur_idx, 2)),
               .access = ACCESS_NON_READABLE);

            nir_store_var(&b, idx, nir_iadd_imm(&b, cur_idx, 1), 0x1);
         }
         nir_pop_loop(&b, NULL);

         nir_variable *shader_idx =
            nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "shader_idx");
         nir_store_var(&b, shader_idx, nir_imm_int(&b, 0), 0x1);
         nir_ssa_def *shader_cnt = load_param16(&b, push_constant_shader_cnt);

         nir_push_loop(&b);
         {
            nir_ssa_def *cur_shader_idx = nir_load_var(&b, shader_idx);
            nir_push_if(&b, nir_uge(&b, cur_shader_idx, shader_cnt));
            {
               nir_jump(&b, nir_jump_break);
            }
            nir_pop_if(&b, NULL);

            nir_ssa_def *reg_info = nir_load_ssbo(&b, 3, 32, param_buf, nir_iadd(&b, param_offset, nir_imul_imm(&b, cur_shader_idx, 12)));
            nir_ssa_def *upload_sgpr = nir_ubfe_imm(&b, nir_channel(&b, reg_info, 0), 0, 16);
            nir_ssa_def *inline_sgpr = nir_ubfe_imm(&b, nir_channel(&b, reg_info, 0), 16, 16);
            nir_ssa_def *inline_mask = nir_pack_64_2x32(&b, nir_channels(&b, reg_info, 0x6));

            nir_push_if(&b, nir_ine_imm(&b, upload_sgpr, 0));
            {
               nir_ssa_def *pkt[3] = {
                  nir_imm_int(&b, PKT3(PKT3_SET_SH_REG, 1, 0)),
                  upload_sgpr,
                  nir_iadd(&b, load_param32(&b, upload_addr), nir_load_var(&b, upload_offset))
               };

               dgc_emit(&b, &cmd_buf, nir_vec(&b, pkt, 3));
            }
            nir_pop_if(&b, NULL);

            nir_push_if(&b, nir_ine_imm(&b, inline_sgpr, 0));
            {
               nir_ssa_def *inline_len = nir_bit_count(&b, inline_mask);
               nir_store_var(&b, idx, nir_imm_int(&b, 0), 0x1);

               nir_ssa_def *pkt[2] = {
                  nir_pkt3(&b, PKT3_SET_SH_REG, inline_len),
                  inline_sgpr
               };

               dgc_emit(&b, &cmd_buf, nir_vec(&b, pkt, 2));

               nir_push_loop(&b);
               {
                  nir_ssa_def *cur_idx = nir_load_var(&b, idx);
                  nir_push_if(&b,
                              nir_uge(&b, cur_idx, nir_imm_int(&b, 64 /* bits in inline_mask */)));
                  {
                     nir_jump(&b, nir_jump_break);
                  }
                  nir_pop_if(&b, NULL);

                  nir_ssa_def *l = nir_ishl(&b, nir_imm_int64(&b, 1), cur_idx);
                  nir_push_if(&b, nir_ieq_imm(&b, nir_iand(&b, l, inline_mask), 0));
                  {
                     nir_store_var(&b, idx, nir_iadd_imm(&b, cur_idx, 1), 0x1);
                     nir_jump(&b, nir_jump_continue);
                  }
                  nir_pop_if(&b, NULL);

                  nir_variable *data = nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "copy_data");

                  nir_ssa_def *update = nir_iand(&b, push_const_mask, nir_ishl(&b, nir_imm_int64(&b, 1), cur_idx));
                  update = nir_bcsel(
                     &b, nir_ult(&b, cur_idx, nir_imm_int(&b, 64 /* bits in push_const_mask */)),
                     update, nir_imm_int64(&b, 0));

                  nir_push_if(&b, nir_ine_imm(&b, update, 0));
                  {
                     nir_ssa_def *stream_offset = nir_load_ssbo(
                        &b, 1, 32, param_buf,
                        nir_iadd(&b, param_offset_offset, nir_ishl_imm(&b, cur_idx, 2)));
                     nir_ssa_def *new_data = nir_load_ssbo(&b, 1, 32, stream_buf, nir_iadd(&b, stream_base, stream_offset));
                     nir_store_var(&b, data, new_data, 0x1);
                  }
                  nir_push_else(&b, NULL);
                  {
                     nir_store_var(&b, data,
                                   nir_load_ssbo(&b, 1, 32, param_buf,
                                                 nir_iadd(&b, param_const_offset,
                                                          nir_ishl_imm(&b, cur_idx, 2))),
                                   0x1);
                  }
                  nir_pop_if(&b, NULL);

                  dgc_emit(&b, &cmd_buf, nir_load_var(&b, data));

                  nir_store_var(&b, idx, nir_iadd_imm(&b, cur_idx, 1), 0x1);
               }
               nir_pop_loop(&b, NULL);
            }
            nir_pop_if(&b, NULL);
            nir_store_var(&b, shader_idx, nir_iadd_imm(&b, cur_shader_idx, 1), 0x1);
         }
         nir_pop_loop(&b, NULL);
      }
      nir_pop_if(&b, 0);

      nir_push_if(&b, nir_ieq_imm(&b, load_param16(&b, emit_state), 1));
      {
         nir_ssa_def *stream_offset = nir_iadd(&b, load_param16(&b, state_offset), stream_base);
         nir_ssa_def *state = nir_load_ssbo(&b, 1, 32, stream_buf, stream_offset);
         state = nir_iand_imm(&b, state, 1);

         nir_ssa_def *reg =
            nir_ior(&b, load_param32(&b, pa_su_sc_mode_cntl_base), nir_ishl_imm(&b, state, 2));

         nir_ssa_def *cmd_values[3] = {
            nir_imm_int(&b, PKT3(PKT3_SET_CONTEXT_REG, 1, 0)),
            nir_imm_int(&b, (R_028814_PA_SU_SC_MODE_CNTL - SI_CONTEXT_REG_OFFSET) >> 2), reg};

         dgc_emit(&b, &cmd_buf, nir_vec(&b, cmd_values, 3));
      }
      nir_pop_if(&b, NULL);

      nir_ssa_def *scissor_count = load_param16(&b, scissor_count);
      nir_push_if(&b, nir_ine_imm(&b, scissor_count, 0));
      {
         nir_ssa_def *scissor_offset = load_param16(&b, scissor_offset);
         nir_variable *idx = nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(),
                                                 "scissor_copy_idx");
         nir_store_var(&b, idx, nir_imm_int(&b, 0), 1);

         nir_push_loop(&b);
         {
            nir_ssa_def *cur_idx = nir_load_var(&b, idx);
            nir_push_if(&b, nir_uge(&b, cur_idx, scissor_count));
            {
               nir_jump(&b, nir_jump_break);
            }
            nir_pop_if(&b, NULL);

            nir_ssa_def *param_buf = radv_meta_load_descriptor(&b, 0, DGC_DESC_PARAMS);
            nir_ssa_def *param_offset = nir_iadd(&b, scissor_offset, nir_imul_imm(&b, cur_idx, 4));
            nir_ssa_def *value = nir_load_ssbo(&b, 1, 32, param_buf, param_offset);

            dgc_emit(&b, &cmd_buf, value);

            nir_store_var(&b, idx, nir_iadd_imm(&b, cur_idx, 1), 1);
         }
         nir_pop_loop(&b, NULL);
      }
      nir_pop_if(&b, NULL);

      nir_push_if(&b, nir_ieq_imm(&b, load_param16(&b, draw_indexed), 0));
      {
         nir_ssa_def *vtx_base_sgpr = load_param16(&b, vtx_base_sgpr);
         nir_ssa_def *stream_offset =
            nir_iadd(&b, load_param16(&b, draw_params_offset), stream_base);

         nir_ssa_def *draw_data0 =
            nir_load_ssbo(&b, 4, 32, stream_buf, stream_offset);
         nir_ssa_def *vertex_count = nir_channel(&b, draw_data0, 0);
         nir_ssa_def *instance_count = nir_channel(&b, draw_data0, 1);
         nir_ssa_def *vertex_offset = nir_channel(&b, draw_data0, 2);
         nir_ssa_def *first_instance = nir_channel(&b, draw_data0, 3);

         nir_push_if(&b, nir_iand(&b, nir_ine_imm(&b, vertex_count, 0), nir_ine_imm(&b, instance_count, 0)));
         {
            dgc_emit_userdata_vertex(&b, &cmd_buf, vtx_base_sgpr, vertex_offset, first_instance, sequence_id);
            dgc_emit_instance_count(&b, &cmd_buf, instance_count);
            dgc_emit_draw(&b, &cmd_buf, vertex_count);
         }
         nir_pop_if(&b, 0);
      }
      nir_push_else(&b, NULL);
      {
         nir_variable *index_size_var =
            nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "index_size");
         nir_store_var(&b, index_size_var, load_param16(&b, base_index_size), 0x1);
         nir_variable *max_index_count_var =
            nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "max_index_count");
         nir_store_var(&b, max_index_count_var, load_param32(&b, max_index_count), 0x1);

         nir_ssa_def *bind_index_buffer = nir_ieq_imm(&b, nir_load_var(&b, index_size_var), 0);
         nir_push_if(&b, bind_index_buffer);
         {
            nir_ssa_def *index_stream_offset =
               nir_iadd(&b, load_param16(&b, index_buffer_offset), stream_base);
            nir_ssa_def *data =
               nir_load_ssbo(&b, 4, 32, stream_buf, index_stream_offset);

            nir_ssa_def *vk_index_type = nir_channel(&b, data, 3);
            nir_ssa_def *index_type = nir_bcsel(
               &b, nir_ieq(&b, vk_index_type, load_param32(&b, ibo_type_32)),
               nir_imm_int(&b, V_028A7C_VGT_INDEX_32), nir_imm_int(&b, V_028A7C_VGT_INDEX_16));
            index_type = nir_bcsel(&b, nir_ieq(&b, vk_index_type, load_param32(&b, ibo_type_8)),
                                   nir_imm_int(&b, V_028A7C_VGT_INDEX_8), index_type);

            nir_ssa_def *index_size = nir_iand_imm(
               &b, nir_ushr(&b, nir_imm_int(&b, 0x142), nir_imul_imm(&b, index_type, 4)), 0xf);
            nir_store_var(&b, index_size_var, index_size, 0x1);

            nir_ssa_def *max_index_count = nir_udiv(&b, nir_channel(&b, data, 2), index_size);
            nir_store_var(&b, max_index_count_var, max_index_count, 0x1);

            nir_ssa_def *cmd_values[3 + 2 + 3];

            if (dev->physical_device->rad_info.gfx_level >= GFX9) {
               unsigned opcode = PKT3_SET_UCONFIG_REG_INDEX;
               if (dev->physical_device->rad_info.gfx_level < GFX9 ||
                   (dev->physical_device->rad_info.gfx_level == GFX9 &&
                    dev->physical_device->rad_info.me_fw_version < 26))
                  opcode = PKT3_SET_UCONFIG_REG;
               cmd_values[0] = nir_imm_int(&b, PKT3(opcode, 1, 0));
               cmd_values[1] = nir_imm_int(
                  &b, (R_03090C_VGT_INDEX_TYPE - CIK_UCONFIG_REG_OFFSET) >> 2 | (2u << 28));
               cmd_values[2] = index_type;
            } else {
               cmd_values[0] = nir_imm_int(&b, PKT3(PKT3_INDEX_TYPE, 0, 0));
               cmd_values[1] = index_type;
               cmd_values[2] = nir_imm_int(&b, PKT3_NOP_PAD);
            }

            nir_ssa_def *addr_upper = nir_channel(&b, data, 1);
            addr_upper = nir_ishr_imm(&b, nir_ishl_imm(&b, addr_upper, 16), 16);

            cmd_values[3] = nir_imm_int(&b, PKT3(PKT3_INDEX_BASE, 1, 0));
            cmd_values[4] = nir_channel(&b, data, 0);
            cmd_values[5] = addr_upper;
            cmd_values[6] = nir_imm_int(&b, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0));
            cmd_values[7] = max_index_count;

            dgc_emit(&b, &cmd_buf, nir_vec(&b, cmd_values, 8));
         }
         nir_pop_if(&b, NULL);

         nir_ssa_def *index_size = nir_load_var(&b, index_size_var);
         nir_ssa_def *max_index_count = nir_load_var(&b, max_index_count_var);
         nir_ssa_def *vtx_base_sgpr = load_param16(&b, vtx_base_sgpr);
         nir_ssa_def *stream_offset =
            nir_iadd(&b, load_param16(&b, draw_params_offset), stream_base);

         index_size =
            nir_bcsel(&b, bind_index_buffer, nir_load_var(&b, index_size_var), index_size);
         max_index_count = nir_bcsel(&b, bind_index_buffer, nir_load_var(&b, max_index_count_var),
                                     max_index_count);
         nir_ssa_def *draw_data0 =
            nir_load_ssbo(&b, 4, 32, stream_buf, stream_offset);
         nir_ssa_def *draw_data1 = nir_load_ssbo(
            &b, 1, 32, stream_buf, nir_iadd_imm(&b, stream_offset, 16));
         nir_ssa_def *index_count = nir_channel(&b, draw_data0, 0);
         nir_ssa_def *instance_count = nir_channel(&b, draw_data0, 1);
         nir_ssa_def *first_index = nir_channel(&b, draw_data0, 2);
         nir_ssa_def *vertex_offset = nir_channel(&b, draw_data0, 3);
         nir_ssa_def *first_instance = nir_channel(&b, draw_data1, 0);

         nir_push_if(&b, nir_iand(&b, nir_ine_imm(&b, index_count, 0), nir_ine_imm(&b, instance_count, 0)));
         {
            dgc_emit_userdata_vertex(&b, &cmd_buf, vtx_base_sgpr, vertex_offset, first_instance, sequence_id);
            dgc_emit_instance_count(&b, &cmd_buf, instance_count);
            dgc_emit_draw_indexed(&b, &cmd_buf, first_index, index_count,
                                       max_index_count);
         }
         nir_pop_if(&b, 0);
      }
      nir_pop_if(&b, NULL);

      /* Pad the cmdbuffer if we did not use the whole stride */
      nir_push_if(&b, nir_ine(&b, nir_load_var(&b, cmd_buf.offset), cmd_buf_end));
      {
         nir_ssa_def *cnt = nir_isub(&b, cmd_buf_end, nir_load_var(&b, cmd_buf.offset));
         cnt = nir_ushr_imm(&b, cnt, 2);
         cnt = nir_iadd_imm(&b, cnt, -2);
         nir_ssa_def *pkt = nir_pkt3(&b, PKT3_NOP, cnt);

         dgc_emit(&b, &cmd_buf, pkt);
      }
      nir_pop_if(&b, NULL);
   }
   nir_pop_if(&b, NULL);

   build_dgc_buffer_tail(&b, sequence_count);
   return b.shader;
}

void
radv_device_finish_dgc_prepare_state(struct radv_device *device)
{
   radv_DestroyPipeline(radv_device_to_handle(device), device->meta_state.dgc_prepare.pipeline,
                        &device->meta_state.alloc);
   radv_DestroyPipelineLayout(radv_device_to_handle(device),
                              device->meta_state.dgc_prepare.p_layout, &device->meta_state.alloc);
   device->vk.dispatch_table.DestroyDescriptorSetLayout(radv_device_to_handle(device),
                                                        device->meta_state.dgc_prepare.ds_layout,
                                                        &device->meta_state.alloc);
}

VkResult
radv_device_init_dgc_prepare_state(struct radv_device *device)
{
   VkResult result;
   nir_shader *cs = build_dgc_prepare_shader(device);

   VkDescriptorSetLayoutCreateInfo ds_create_info = {
      .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
      .flags = VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR,
      .bindingCount = DGC_NUM_DESCS,
      .pBindings = (VkDescriptorSetLayoutBinding[]){
         {.binding = DGC_DESC_STREAM,
          .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
          .descriptorCount = 1,
          .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
          .pImmutableSamplers = NULL},
         {.binding = DGC_DESC_PREPARE,
          .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
          .descriptorCount = 1,
          .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
          .pImmutableSamplers = NULL},
         {.binding = DGC_DESC_PARAMS,
          .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
          .descriptorCount = 1,
          .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
          .pImmutableSamplers = NULL},
         {.binding = DGC_DESC_COUNT,
          .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
          .descriptorCount = 1,
          .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
          .pImmutableSamplers = NULL},
      }};

   result = radv_CreateDescriptorSetLayout(radv_device_to_handle(device), &ds_create_info,
                                           &device->meta_state.alloc,
                                           &device->meta_state.dgc_prepare.ds_layout);
   if (result != VK_SUCCESS)
      goto cleanup;

   const VkPipelineLayoutCreateInfo leaf_pl_create_info = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
      .setLayoutCount = 1,
      .pSetLayouts = &device->meta_state.dgc_prepare.ds_layout,
      .pushConstantRangeCount = 1,
      .pPushConstantRanges =
         &(VkPushConstantRange){VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(struct radv_dgc_params)},
   };

   result = radv_CreatePipelineLayout(radv_device_to_handle(device), &leaf_pl_create_info,
                                      &device->meta_state.alloc,
                                      &device->meta_state.dgc_prepare.p_layout);
   if (result != VK_SUCCESS)
      goto cleanup;

   VkPipelineShaderStageCreateInfo shader_stage = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
      .stage = VK_SHADER_STAGE_COMPUTE_BIT,
      .module = vk_shader_module_handle_from_nir(cs),
      .pName = "main",
      .pSpecializationInfo = NULL,
   };

   VkComputePipelineCreateInfo pipeline_info = {
      .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
      .stage = shader_stage,
      .flags = 0,
      .layout = device->meta_state.dgc_prepare.p_layout,
   };

   result = radv_CreateComputePipelines(
      radv_device_to_handle(device), device->meta_state.cache, 1,
      &pipeline_info, &device->meta_state.alloc, &device->meta_state.dgc_prepare.pipeline);
   if (result != VK_SUCCESS)
      goto cleanup;

cleanup:
   ralloc_free(cs);
   return result;
}

VKAPI_ATTR VkResult VKAPI_CALL
radv_CreateIndirectCommandsLayoutNV(VkDevice _device,
                                    const VkIndirectCommandsLayoutCreateInfoNV *pCreateInfo,
                                    const VkAllocationCallbacks *pAllocator,
                                    VkIndirectCommandsLayoutNV *pIndirectCommandsLayout)
{
   RADV_FROM_HANDLE(radv_device, device, _device);
   struct radv_indirect_command_layout *layout;

   size_t size =
      sizeof(*layout) + pCreateInfo->tokenCount * sizeof(VkIndirectCommandsLayoutTokenNV);

   layout =
      vk_zalloc2(&device->vk.alloc, pAllocator, size, alignof(struct radv_indirect_command_layout),
                VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (!layout)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   vk_object_base_init(&device->vk, &layout->base, VK_OBJECT_TYPE_INDIRECT_COMMANDS_LAYOUT_NV);

   layout->input_stride = pCreateInfo->pStreamStrides[0];
   layout->token_count = pCreateInfo->tokenCount;
   typed_memcpy(layout->tokens, pCreateInfo->pTokens, pCreateInfo->tokenCount);

   layout->ibo_type_32 = VK_INDEX_TYPE_UINT32;
   layout->ibo_type_8 = VK_INDEX_TYPE_UINT8_EXT;

   for (unsigned i = 0; i < pCreateInfo->tokenCount; ++i) {
      switch (pCreateInfo->pTokens[i].tokenType) {
      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_NV:
         layout->draw_params_offset = pCreateInfo->pTokens[i].offset;
         break;
      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_INDEXED_NV:
         layout->indexed = true;
         layout->draw_params_offset = pCreateInfo->pTokens[i].offset;
         break;
      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_INDEX_BUFFER_NV:
         layout->binds_index_buffer = true;
         layout->index_buffer_offset = pCreateInfo->pTokens[i].offset;
         /* 16-bit is implied if we find no match. */
         for (unsigned j = 0; j < pCreateInfo->pTokens[i].indexTypeCount; j++) {
            if (pCreateInfo->pTokens[i].pIndexTypes[j] == VK_INDEX_TYPE_UINT32)
               layout->ibo_type_32 = pCreateInfo->pTokens[i].pIndexTypeValues[j];
            else if (pCreateInfo->pTokens[i].pIndexTypes[j] == VK_INDEX_TYPE_UINT8_EXT)
               layout->ibo_type_8 = pCreateInfo->pTokens[i].pIndexTypeValues[j];
         }
         break;
      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_VERTEX_BUFFER_NV:
         layout->bind_vbo_mask |= 1u << pCreateInfo->pTokens[i].vertexBindingUnit;
         layout->vbo_offsets[pCreateInfo->pTokens[i].vertexBindingUnit] =
            pCreateInfo->pTokens[i].offset;
         if (pCreateInfo->pTokens[i].vertexDynamicStride)
            layout->vbo_offsets[pCreateInfo->pTokens[i].vertexBindingUnit] |= DGC_DYNAMIC_STRIDE;
         break;
      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_PUSH_CONSTANT_NV:
         for (unsigned j = pCreateInfo->pTokens[i].pushconstantOffset / 4, k = 0;
              k < pCreateInfo->pTokens[i].pushconstantSize / 4; ++j, ++k) {
            layout->push_constant_mask |= 1ull << j;
            layout->push_constant_offsets[j] = pCreateInfo->pTokens[i].offset + k * 4;
         }
         break;
      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_STATE_FLAGS_NV:
         layout->binds_state = true;
         layout->state_offset = pCreateInfo->pTokens[i].offset;
         break;
      default:
         unreachable("Unhandled token type");
      }
   }
   if (!layout->indexed)
      layout->binds_index_buffer = false;

   *pIndirectCommandsLayout = radv_indirect_command_layout_to_handle(layout);
   return VK_SUCCESS;
}

VKAPI_ATTR void VKAPI_CALL
radv_DestroyIndirectCommandsLayoutNV(VkDevice _device,
                                     VkIndirectCommandsLayoutNV indirectCommandsLayout,
                                     const VkAllocationCallbacks *pAllocator)
{
   RADV_FROM_HANDLE(radv_device, device, _device);
   VK_FROM_HANDLE(radv_indirect_command_layout, layout, indirectCommandsLayout);

   if (!layout)
      return;

   vk_object_base_finish(&layout->base);
   vk_free2(&device->vk.alloc, pAllocator, layout);
}

VKAPI_ATTR void VKAPI_CALL
radv_GetGeneratedCommandsMemoryRequirementsNV(
   VkDevice _device, const VkGeneratedCommandsMemoryRequirementsInfoNV *pInfo,
   VkMemoryRequirements2 *pMemoryRequirements)
{
   RADV_FROM_HANDLE(radv_device, device, _device);
   VK_FROM_HANDLE(radv_indirect_command_layout, layout, pInfo->indirectCommandsLayout);
   VK_FROM_HANDLE(radv_pipeline, pipeline, pInfo->pipeline);
   struct radv_graphics_pipeline *graphics_pipeline = radv_pipeline_to_graphics(pipeline);

   uint32_t cmd_stride, upload_stride;
   radv_get_sequence_size(layout, graphics_pipeline, &cmd_stride, &upload_stride);

   VkDeviceSize cmd_buf_size = radv_align_cmdbuf_size(cmd_stride * pInfo->maxSequencesCount);
   VkDeviceSize upload_buf_size = upload_stride * pInfo->maxSequencesCount;

   pMemoryRequirements->memoryRequirements.memoryTypeBits =
      device->physical_device->memory_types_32bit;
   pMemoryRequirements->memoryRequirements.alignment = 256;
   pMemoryRequirements->memoryRequirements.size =
      align(cmd_buf_size + upload_buf_size, pMemoryRequirements->memoryRequirements.alignment);
}

VKAPI_ATTR void VKAPI_CALL
radv_CmdPreprocessGeneratedCommandsNV(VkCommandBuffer commandBuffer,
                                      const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo)
{
   /* Can't do anything here as we depend on some dynamic state in some cases that we only know
    * at draw time. */
}

/* Always need to call this directly before draw due to dependence on bound state. */
void
radv_prepare_dgc(struct radv_cmd_buffer *cmd_buffer,
                 const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo)
{
   VK_FROM_HANDLE(radv_indirect_command_layout, layout,
                  pGeneratedCommandsInfo->indirectCommandsLayout);
   VK_FROM_HANDLE(radv_pipeline, pipeline, pGeneratedCommandsInfo->pipeline);
   VK_FROM_HANDLE(radv_buffer, prep_buffer, pGeneratedCommandsInfo->preprocessBuffer);
   struct radv_graphics_pipeline *graphics_pipeline = radv_pipeline_to_graphics(pipeline);
   struct radv_meta_saved_state saved_state;
   struct radv_buffer token_buffer;

   uint32_t cmd_stride, upload_stride;
   radv_get_sequence_size(layout, graphics_pipeline, &cmd_stride, &upload_stride);

   unsigned cmd_buf_size =
      radv_align_cmdbuf_size(cmd_stride * pGeneratedCommandsInfo->sequencesCount);

   unsigned vb_size = layout->bind_vbo_mask ? util_bitcount(graphics_pipeline->vb_desc_usage_mask) * 24 : 0;
   unsigned const_size = graphics_pipeline->base.push_constant_size +
                         16 * graphics_pipeline->base.dynamic_offset_count +
                         sizeof(layout->push_constant_offsets) + ARRAY_SIZE(graphics_pipeline->base.shaders) * 12;
   if (!layout->push_constant_mask)
      const_size = 0;

   unsigned scissor_size = (8 + 2 * cmd_buffer->state.dynamic.scissor.count) * 4;
   if (!layout->binds_state || !cmd_buffer->state.dynamic.scissor.count ||
       !cmd_buffer->device->physical_device->rad_info.has_gfx9_scissor_bug)
      scissor_size = 0;

   unsigned upload_size = MAX2(vb_size + const_size + scissor_size, 16);

   void *upload_data;
   unsigned upload_offset;
   if (!radv_cmd_buffer_upload_alloc(cmd_buffer, upload_size, &upload_offset, &upload_data)) {
      vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
      return;
   }

   void *upload_data_base = upload_data;

   radv_buffer_init(&token_buffer, cmd_buffer->device, cmd_buffer->upload.upload_bo, upload_size,
                    upload_offset);

   uint64_t upload_addr = radv_buffer_get_va(prep_buffer->bo) + prep_buffer->offset +
                          pGeneratedCommandsInfo->preprocessOffset;

   uint16_t vtx_base_sgpr =
      (cmd_buffer->state.graphics_pipeline->vtx_base_sgpr - SI_SH_REG_OFFSET) >> 2;
   if (cmd_buffer->state.graphics_pipeline->uses_drawid)
      vtx_base_sgpr |= DGC_USES_DRAWID;
   if (cmd_buffer->state.graphics_pipeline->uses_baseinstance)
      vtx_base_sgpr |= DGC_USES_BASEINSTANCE;

   uint16_t vbo_sgpr =
      ((radv_lookup_user_sgpr(&graphics_pipeline->base, MESA_SHADER_VERTEX, AC_UD_VS_VERTEX_BUFFERS)->sgpr_idx * 4 +
        graphics_pipeline->base.user_data_0[MESA_SHADER_VERTEX]) -
       SI_SH_REG_OFFSET) >>
      2;
   struct radv_dgc_params params = {
      .cmd_buf_stride = cmd_stride,
      .cmd_buf_size = cmd_buf_size,
      .upload_addr = (uint32_t)upload_addr,
      .upload_stride = upload_stride,
      .sequence_count = pGeneratedCommandsInfo->sequencesCount,
      .stream_stride = layout->input_stride,
      .draw_indexed = layout->indexed,
      .draw_params_offset = layout->draw_params_offset,
      .base_index_size =
         layout->binds_index_buffer ? 0 : radv_get_vgt_index_size(cmd_buffer->state.index_type),
      .vtx_base_sgpr = vtx_base_sgpr,
      .max_index_count = cmd_buffer->state.max_index_count,
      .index_buffer_offset = layout->index_buffer_offset,
      .vbo_reg = vbo_sgpr,
      .ibo_type_32 = layout->ibo_type_32,
      .ibo_type_8 = layout->ibo_type_8,
      .emit_state = layout->binds_state,
      .pa_su_sc_mode_cntl_base = radv_get_pa_su_sc_mode_cntl(cmd_buffer) & C_028814_FACE,
      .state_offset = layout->state_offset,
   };

   if (layout->bind_vbo_mask) {
      radv_write_vertex_descriptors(cmd_buffer, graphics_pipeline, true, upload_data);

      uint32_t *vbo_info = (uint32_t *)((char *)upload_data + graphics_pipeline->vb_desc_alloc_size);

      uint32_t mask = graphics_pipeline->vb_desc_usage_mask;
      unsigned idx = 0;
      while (mask) {
         unsigned i = u_bit_scan(&mask);
         unsigned binding =
            graphics_pipeline->use_per_attribute_vb_descs ? graphics_pipeline->attrib_bindings[i] : i;
         uint32_t attrib_end = graphics_pipeline->attrib_ends[i];

         params.vbo_bind_mask |= ((layout->bind_vbo_mask >> binding) & 1u) << idx;
         vbo_info[2 * idx] = ((graphics_pipeline->use_per_attribute_vb_descs ? 1u : 0u) << 31) |
                             layout->vbo_offsets[binding];
         vbo_info[2 * idx + 1] = graphics_pipeline->attrib_index_offset[i] | (attrib_end << 16);
         ++idx;
      }
      params.vbo_cnt = idx;
      upload_data = (char *)upload_data + vb_size;
   }

   if (layout->push_constant_mask) {
      uint32_t *desc = upload_data;
      upload_data = (char *)upload_data + ARRAY_SIZE(graphics_pipeline->base.shaders) * 12;

      unsigned idx = 0;
      for (unsigned i = 0; i < ARRAY_SIZE(graphics_pipeline->base.shaders); ++i) {
         if (!graphics_pipeline->base.shaders[i])
            continue;

         struct radv_userdata_locations *locs = &graphics_pipeline->base.shaders[i]->info.user_sgprs_locs;
         if (locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx >= 0)
            params.const_copy = 1;

         if (locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx >= 0 ||
             locs->shader_data[AC_UD_INLINE_PUSH_CONSTANTS].sgpr_idx >= 0) {
            unsigned upload_sgpr = 0;
            unsigned inline_sgpr = 0;

            if (locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx >= 0) {
               upload_sgpr =
                  (graphics_pipeline->base.user_data_0[i] + 4 * locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx -
                   SI_SH_REG_OFFSET) >>
                  2;
            }

            if (locs->shader_data[AC_UD_INLINE_PUSH_CONSTANTS].sgpr_idx >= 0) {
               inline_sgpr = (graphics_pipeline->base.user_data_0[i] +
                              4 * locs->shader_data[AC_UD_INLINE_PUSH_CONSTANTS].sgpr_idx -
                              SI_SH_REG_OFFSET) >>
                             2;
               desc[idx * 3 + 1] = graphics_pipeline->base.shaders[i]->info.inline_push_constant_mask;
               desc[idx * 3 + 2] = graphics_pipeline->base.shaders[i]->info.inline_push_constant_mask >> 32;
            }
            desc[idx * 3] = upload_sgpr | (inline_sgpr << 16);
            ++idx;
         }
      }

      params.push_constant_shader_cnt = idx;

      params.const_copy_size = graphics_pipeline->base.push_constant_size +
                               16 * graphics_pipeline->base.dynamic_offset_count;
      params.push_constant_mask = layout->push_constant_mask;

      memcpy(upload_data, layout->push_constant_offsets, sizeof(layout->push_constant_offsets));
      upload_data = (char *)upload_data + sizeof(layout->push_constant_offsets);

      memcpy(upload_data, cmd_buffer->push_constants, graphics_pipeline->base.push_constant_size);
      upload_data = (char *)upload_data + graphics_pipeline->base.push_constant_size;

      struct radv_descriptor_state *descriptors_state =
         radv_get_descriptors_state(cmd_buffer, pGeneratedCommandsInfo->pipelineBindPoint);
      memcpy(upload_data, descriptors_state->dynamic_buffers, 16 * graphics_pipeline->base.dynamic_offset_count);
      upload_data = (char *)upload_data + 16 * graphics_pipeline->base.dynamic_offset_count;
   }

   if (scissor_size) {
      params.scissor_offset = (char*)upload_data - (char*)upload_data_base;
      params.scissor_count = scissor_size / 4;

      struct radeon_cmdbuf scissor_cs = {
         .buf = upload_data,
         .cdw = 0,
         .max_dw = scissor_size / 4
      };

      radv_write_scissors(cmd_buffer, &scissor_cs);
      assert(scissor_cs.cdw * 4 == scissor_size);
      upload_data = (char *)upload_data + scissor_size;
   }

   VkWriteDescriptorSet ds_writes[5];
   VkDescriptorBufferInfo buf_info[ARRAY_SIZE(ds_writes)];
   int ds_cnt = 0;
   buf_info[ds_cnt] = (VkDescriptorBufferInfo){.buffer = radv_buffer_to_handle(&token_buffer),
                                               .offset = 0,
                                               .range = upload_size};
   ds_writes[ds_cnt] = (VkWriteDescriptorSet){.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
                                              .dstBinding = DGC_DESC_PARAMS,
                                              .dstArrayElement = 0,
                                              .descriptorCount = 1,
                                              .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
                                              .pBufferInfo = &buf_info[ds_cnt]};
   ++ds_cnt;

   buf_info[ds_cnt] = (VkDescriptorBufferInfo){.buffer = pGeneratedCommandsInfo->preprocessBuffer,
                                               .offset = pGeneratedCommandsInfo->preprocessOffset,
                                               .range = pGeneratedCommandsInfo->preprocessSize};
   ds_writes[ds_cnt] = (VkWriteDescriptorSet){.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
                                              .dstBinding = DGC_DESC_PREPARE,
                                              .dstArrayElement = 0,
                                              .descriptorCount = 1,
                                              .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
                                              .pBufferInfo = &buf_info[ds_cnt]};
   ++ds_cnt;

   if (pGeneratedCommandsInfo->streamCount > 0) {
      buf_info[ds_cnt] =
         (VkDescriptorBufferInfo){.buffer = pGeneratedCommandsInfo->pStreams[0].buffer,
                                  .offset = pGeneratedCommandsInfo->pStreams[0].offset,
                                  .range = VK_WHOLE_SIZE};
      ds_writes[ds_cnt] =
         (VkWriteDescriptorSet){.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
                                .dstBinding = DGC_DESC_STREAM,
                                .dstArrayElement = 0,
                                .descriptorCount = 1,
                                .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
                                .pBufferInfo = &buf_info[ds_cnt]};
      ++ds_cnt;
   }

   if (pGeneratedCommandsInfo->sequencesCountBuffer != VK_NULL_HANDLE) {
      buf_info[ds_cnt] =
         (VkDescriptorBufferInfo){.buffer = pGeneratedCommandsInfo->sequencesCountBuffer,
                                  .offset = pGeneratedCommandsInfo->sequencesCountOffset,
                                  .range = VK_WHOLE_SIZE};
      ds_writes[ds_cnt] =
         (VkWriteDescriptorSet){.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
                                .dstBinding = DGC_DESC_COUNT,
                                .dstArrayElement = 0,
                                .descriptorCount = 1,
                                .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
                                .pBufferInfo = &buf_info[ds_cnt]};
      ++ds_cnt;
      params.sequence_count = UINT32_MAX;
   }

   radv_meta_save(
      &saved_state, cmd_buffer,
      RADV_META_SAVE_COMPUTE_PIPELINE | RADV_META_SAVE_DESCRIPTORS | RADV_META_SAVE_CONSTANTS);

   radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE,
                        cmd_buffer->device->meta_state.dgc_prepare.pipeline);

   radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer),
                         cmd_buffer->device->meta_state.dgc_prepare.p_layout,
                         VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(params), &params);

   radv_meta_push_descriptor_set(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE,
                                 cmd_buffer->device->meta_state.dgc_prepare.p_layout, 0, ds_cnt,
                                 ds_writes);

   unsigned block_count = MAX2(1, round_up_u32(pGeneratedCommandsInfo->sequencesCount, 64));
   radv_CmdDispatch(radv_cmd_buffer_to_handle(cmd_buffer), block_count, 1, 1);

   radv_buffer_finish(&token_buffer);
   radv_meta_restore(&saved_state, cmd_buffer);

   cmd_buffer->state.flush_bits |=
      RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_L2;
}