/*
 * Copyright © 2022 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "ac_nir.h"
#include "nir_builder.h"
#include "amdgfxregs.h"
#include "util/u_math.h"

/*
 * These NIR passes are used to lower NIR cross-stage I/O intrinsics
 * between task and mesh shader stages into the memory accesses
 * that actually happen on the HW.
 *
 */

typedef struct {
   unsigned payload_entry_bytes;
   unsigned draw_entry_bytes;
   unsigned num_entries;
} lower_tsms_io_state;

typedef struct {
   nir_ssa_def *hw_workgroup_id;
   nir_ssa_def *api_workgroup_id;
} add_first_task_to_workgroup_id_state;

static bool filter_workgroup_id(const nir_instr *instr,
                                UNUSED const void *state)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   return intrin->intrinsic == nir_intrinsic_load_workgroup_id;
}

static nir_ssa_def *
replace_workgroup_id_use_first_task(nir_builder *b,
                                    nir_instr *instr,
                                    void *state)
{
   add_first_task_to_workgroup_id_state *s = (add_first_task_to_workgroup_id_state *) state;
   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

   assert(s->hw_workgroup_id);

   if (s->hw_workgroup_id == &intrin->dest.ssa)
      return NULL;

   return s->api_workgroup_id;
}

void
ac_nir_apply_first_task_to_task_shader(nir_shader *shader)
{
   /* The draw packets on RDNA2 GPUs don't support adding an offset to the task shader
    * workgroups, so we have to emulate the firstTask feature for NV_mesh_shader.
    *
    * 1. Pass the address of the IB (indirect buffer) from the NV_mesh_shader draw call
    *    to the shader in an SGPR argument (2 SGPRs for address, 1 SGPR for stride).
    * 2. Create a descriptor for the IB in the shader.
    * 3. Load the firstTask value from the IB
    * 4. Add the firstTask value the workgroup ID and use the result instead of the
    *    workgroup ID generated by the HW.
    *
    * NOTE: This pass must run _before_ lowering the task shader outputs to memory
    *       accesses. The lowering uses the workgroup ID and that must be unchanged
    *       because it has to be the real HW workgroup ID.
    */

   /* If the shader doesn't use workgroup ID, nothing to do here. */
   if (!BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_WORKGROUP_ID))
      return;

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   assert(impl);

   nir_builder builder;
   nir_builder *b = &builder; /* This is to avoid the & */
   nir_builder_init(b, impl);
   b->cursor = nir_before_cf_list(&impl->body);

   /* This is the stride passed to vkCmdDrawMeshTasksIndirectNV */
   nir_ssa_def *ib_stride = nir_load_task_ib_stride(b);
   nir_ssa_def *zero = nir_imm_int(b, 0);
   nir_ssa_def *first_task = NULL;

   /* If the stride is zero, we assume that firstTask is also 0. */
   nir_if *if_stride = nir_push_if(b, nir_ine(b, ib_stride, zero));
   {
      /* Address of the IB (indirect buffer) used by the current draw call. */
      nir_ssa_def *ib_addr = nir_load_task_ib_addr(b);

      /* Compose a 64-bit address from the IB address. */
      nir_ssa_def *addr = nir_pack_64_2x32_split(b, nir_channel(b, ib_addr, 0), nir_channel(b, ib_addr, 1));
      /* The IB needs to be addressed by draw ID * stride. */
      addr = nir_iadd(b, addr, nir_u2u64(b, nir_imul(b, nir_load_draw_id(b), ib_stride)));
      /* Byte offset of the firstTask field in VkDrawMeshTasksIndirectCommandNV. */
      addr = nir_iadd_imm(b, addr, 4);

      first_task = nir_build_load_global(b, 1, 32, addr, .access = ACCESS_NON_WRITEABLE | ACCESS_COHERENT);
   }
   nir_pop_if(b, if_stride);
   first_task = nir_if_phi(b, first_task, zero);

   /* NV_mesh_shader workgroups are 1 dimensional.
    * Apply firstTask to the X dimension, but leave Y and Z intact.
    */
   nir_ssa_def *hw_workgroup_id = nir_load_workgroup_id(b, 32);
   nir_ssa_def *api_workgroup_id_x = nir_iadd(b, nir_channel(b, hw_workgroup_id, 0), first_task);
   nir_ssa_def *api_workgroup_id = nir_vector_insert_imm(b, hw_workgroup_id, api_workgroup_id_x, 0);

   add_first_task_to_workgroup_id_state state = {
      .hw_workgroup_id = hw_workgroup_id,
      .api_workgroup_id = api_workgroup_id,
   };
   nir_shader_lower_instructions(shader,
                                 filter_workgroup_id,
                                 replace_workgroup_id_use_first_task,
                                 &state);

   nir_validate_shader(shader, "after including firstTask in the task shader workgroup ID");
}

static nir_ssa_def *
task_workgroup_index(nir_builder *b,
                     lower_tsms_io_state *s)
{
   nir_ssa_def *id = nir_load_workgroup_id(b, 32);

   nir_ssa_def *x = nir_channel(b, id, 0);
   nir_ssa_def *y = nir_channel(b, id, 1);
   nir_ssa_def *z = nir_channel(b, id, 2);

   nir_ssa_def *grid_size = nir_load_num_workgroups(b, 32);
   nir_ssa_def *grid_size_x = nir_channel(b, grid_size, 0);
   nir_ssa_def *grid_size_y = nir_channel(b, grid_size, 1);

   return nir_iadd(b, nir_imul(b, nir_imul(b, grid_size_x, grid_size_y), z),
                      nir_iadd(b, nir_imul(b, grid_size_x, y), x));
}

static nir_ssa_def *
task_ring_entry_index(nir_builder *b,
                      lower_tsms_io_state *s)
{
   /* Task shader ring_entry shader argument:
    *
    * - It's a copy of write_ptr[31:0] from the task control buffer.
    * - The same value (which is the initial value at dispatch)
    *   seems to be copied to all workgroups in the same dispatch,
    *   therefore a workgroup index needs to be added.
    * - write_ptr must be initialized to num_entries so ring_entry needs
    *   AND with num_entries - 1 to get the correct meaning.
    *   Note that num_entries must be a power of two.
    */
   nir_ssa_def *ring_entry = nir_load_task_ring_entry_amd(b);
   nir_ssa_def *idx = nir_iadd_nuw(b, ring_entry, task_workgroup_index(b, s));
   return nir_iand_imm(b, idx, s->num_entries - 1);
}

static nir_ssa_def *
task_draw_ready_bit(nir_builder *b,
                    lower_tsms_io_state *s)
{
   /* Value of the ready bit is 1 for odd and 0 for even passes through the draw ring.
    *
    * The ring_entry is a copy of the write_ptr. We use that to determine whether
    * the current pass through the draw ring is odd or even, so we can write the
    * correct value to the draw ready bit.
    *
    * This tells the firmware that it can now start launching mesh shader workgroups.
    * The encoding of the last dword of the draw ring entry is:
    * - bit 0: Draw ready bit.
    *          Its meaning flips on every pass through the entry.
    * - bit 1: Packet end bit.
    *          The firmware uses this to mark the entry after the last one
    *          used by the current task dispatch.
    * - bits [2:31] unused.
    *
    * Task shaders MUST write the draw ready bit to the draw ring
    * before they finish. The firmware waits for the shader to write
    * this bit before it reads the mesh dispatch size to launch the
    * mesh shader workgroups.
    *
    * If the task shader doesn't write this bit, the HW hangs.
    */

   nir_ssa_def *ring_entry = nir_load_task_ring_entry_amd(b);
   nir_ssa_def *workgroup_index = task_workgroup_index(b, s);

   nir_ssa_def *idx = nir_iadd_nuw(b, ring_entry, workgroup_index);
   return nir_ubfe(b, idx, nir_imm_int(b, util_bitcount(s->num_entries - 1)), nir_imm_int(b, 1));
}

static nir_ssa_def *
mesh_ring_entry_index(nir_builder *b,
                      lower_tsms_io_state *s)
{
   /* Mesh shader ring_entry shader argument:
    *
    * - It's a copy of the read_ptr[31:0] from the task control buffer.
    * - All workgroups in the same task->mesh dispatch get the same value,
    *   which is fine because they need to read the same entry.
    * - read_ptr must be initialized to num_entries so ring_entry needs
    *   AND with num_entries - 1 to get the correct meaning.
    *   Note that num_entries must be a power of two.
    */
   return nir_iand_imm(b, nir_load_task_ring_entry_amd(b), s->num_entries - 1);
}

static void
task_write_draw_ring(nir_builder *b,
                     nir_ssa_def *store_val,
                     unsigned const_off,
                     lower_tsms_io_state *s)
{
   nir_ssa_def *ptr = task_ring_entry_index(b, s);
   nir_ssa_def *ring = nir_load_ring_task_draw_amd(b);
   nir_ssa_def *scalar_off = nir_imul_imm(b, ptr, s->draw_entry_bytes);
   nir_ssa_def *vector_off = nir_imm_int(b, 0);
   nir_ssa_def *zero = nir_imm_int(b, 0);

   nir_store_buffer_amd(b, store_val, ring, vector_off, scalar_off, zero,
                        .base = const_off, .memory_modes = nir_var_shader_out,
                        .access = ACCESS_COHERENT);
}

static bool
filter_task_intrinsics(const nir_instr *instr,
                       UNUSED const void *state)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   return intrin->intrinsic == nir_intrinsic_launch_mesh_workgroups ||
          intrin->intrinsic == nir_intrinsic_store_task_payload ||
          intrin->intrinsic == nir_intrinsic_load_task_payload;
}

static nir_ssa_def *
lower_task_launch_mesh_workgroups(nir_builder *b,
                                  nir_intrinsic_instr *intrin,
                                  lower_tsms_io_state *s)
{
   /* This intrinsic must be always in uniform control flow,
    * so we assume that all invocations are active here.
    */

   /* Wait for all necessary stores to finish. */
   nir_scoped_barrier(b, .execution_scope = NIR_SCOPE_WORKGROUP,
                         .memory_scope = NIR_SCOPE_WORKGROUP,
                         .memory_semantics = NIR_MEMORY_ACQ_REL,
                         .memory_modes = nir_var_mem_task_payload | nir_var_shader_out |
                                         nir_var_mem_ssbo | nir_var_mem_global);

   /* On the first invocation, write the full draw ring entry. */
   nir_ssa_def *invocation_index = nir_load_local_invocation_index(b);
   nir_if *if_invocation_index_zero = nir_push_if(b, nir_ieq_imm(b, invocation_index, 0));
   {
      nir_ssa_def *dimensions = intrin->src[0].ssa;
      nir_ssa_def *x = nir_channel(b, dimensions, 0);
      nir_ssa_def *y = nir_channel(b, dimensions, 1);
      nir_ssa_def *z = nir_channel(b, dimensions, 2);
      nir_ssa_def *rdy = task_draw_ready_bit(b, s);
      nir_ssa_def *store_val = nir_vec4(b, x, y, z, rdy);
      task_write_draw_ring(b, store_val, 0, s);
   }
   nir_pop_if(b, if_invocation_index_zero);

   return NIR_LOWER_INSTR_PROGRESS_REPLACE;
}

static nir_ssa_def *
lower_task_payload_store(nir_builder *b,
                         nir_intrinsic_instr *intrin,
                         lower_tsms_io_state *s)
{
   unsigned write_mask = nir_intrinsic_write_mask(intrin);
   unsigned base = nir_intrinsic_base(intrin);

   nir_ssa_def *store_val = intrin->src[0].ssa;
   nir_ssa_def *addr = intrin->src[1].ssa;
   nir_ssa_def *ring = nir_load_ring_task_payload_amd(b);
   nir_ssa_def *ptr = task_ring_entry_index(b, s);
   nir_ssa_def *ring_off = nir_imul_imm(b, ptr, s->payload_entry_bytes);
   nir_ssa_def *zero = nir_imm_int(b, 0);

   nir_store_buffer_amd(b, store_val, ring, addr, ring_off, zero, .base = base,
                        .write_mask = write_mask,
                        .memory_modes = nir_var_mem_task_payload,
                        .access = ACCESS_COHERENT);

   return NIR_LOWER_INSTR_PROGRESS_REPLACE;
}

static nir_ssa_def *
lower_taskmesh_payload_load(nir_builder *b,
                            nir_intrinsic_instr *intrin,
                            lower_tsms_io_state *s)
{
   unsigned base = nir_intrinsic_base(intrin);
   unsigned num_components = intrin->dest.ssa.num_components;
   unsigned bit_size = intrin->dest.ssa.bit_size;

   nir_ssa_def *ptr =
      b->shader->info.stage == MESA_SHADER_TASK ?
      task_ring_entry_index(b, s) :
      mesh_ring_entry_index(b, s);

   nir_ssa_def *addr = intrin->src[0].ssa;
   nir_ssa_def *ring = nir_load_ring_task_payload_amd(b);
   nir_ssa_def *ring_off = nir_imul_imm(b, ptr, s->payload_entry_bytes);
   nir_ssa_def *zero = nir_imm_int(b, 0);

   return nir_load_buffer_amd(b, num_components, bit_size, ring, addr, ring_off, zero, .base = base,
                              .memory_modes = nir_var_mem_task_payload,
                              .access = ACCESS_COHERENT);
}

static nir_ssa_def *
lower_task_intrinsics(nir_builder *b,
                      nir_instr *instr,
                      void *state)
{
   assert(instr->type == nir_instr_type_intrinsic);
   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   lower_tsms_io_state *s = (lower_tsms_io_state *)state;

   switch (intrin->intrinsic) {
      case nir_intrinsic_store_task_payload:
         return lower_task_payload_store(b, intrin, s);
      case nir_intrinsic_load_task_payload:
         return lower_taskmesh_payload_load(b, intrin, s);
      case nir_intrinsic_launch_mesh_workgroups:
         return lower_task_launch_mesh_workgroups(b, intrin, s);
      default:
         unreachable("unsupported task shader intrinsic");
   }
}

void
ac_nir_lower_task_outputs_to_mem(nir_shader *shader,
                                 unsigned task_payload_entry_bytes,
                                 unsigned task_num_entries)
{
   assert(util_is_power_of_two_nonzero(task_num_entries));

   nir_lower_task_shader_options lower_ts_opt = {
      .payload_to_shared_for_atomics = true,
   };
   nir_lower_task_shader(shader, lower_ts_opt);

   lower_tsms_io_state state = {
      .draw_entry_bytes = 16,
      .payload_entry_bytes = task_payload_entry_bytes,
      .num_entries = task_num_entries,
   };

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   nir_builder builder;
   nir_builder *b = &builder; /* This is to avoid the & */
   nir_builder_init(b, impl);

   nir_shader_lower_instructions(shader,
                                 filter_task_intrinsics,
                                 lower_task_intrinsics,
                                 &state);

   nir_metadata_preserve(impl, nir_metadata_none);
   nir_validate_shader(shader, "after lowering task shader outputs to memory stores");
}

static bool
filter_mesh_input_load(const nir_instr *instr,
                       UNUSED const void *state)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   return intrin->intrinsic == nir_intrinsic_load_task_payload;
}

static nir_ssa_def *
lower_mesh_intrinsics(nir_builder *b,
                      nir_instr *instr,
                      void *state)
{
   assert(instr->type == nir_instr_type_intrinsic);
   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   lower_tsms_io_state *s = (lower_tsms_io_state *)state;

   if (intrin->intrinsic == nir_intrinsic_load_task_payload)
      return lower_taskmesh_payload_load(b, intrin, s);
   else
      unreachable("unsupported mesh shader intrinsic");
}

void
ac_nir_lower_mesh_inputs_to_mem(nir_shader *shader,
                                unsigned task_payload_entry_bytes,
                                unsigned task_num_entries)
{
   assert(util_is_power_of_two_nonzero(task_num_entries));

   lower_tsms_io_state state = {
      .draw_entry_bytes = 16,
      .payload_entry_bytes = task_payload_entry_bytes,
      .num_entries = task_num_entries,
   };

   nir_shader_lower_instructions(shader,
                                 filter_mesh_input_load,
                                 lower_mesh_intrinsics,
                                 &state);
}