/*
 * Copyright © 2021 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_private.h"
#include "compiler/shader_info.h"
#include "intel/dev/intel_debug.h"
#include "intel/dev/intel_device_info.h"
#include "util/ralloc.h"

unsigned
brw_required_dispatch_width(const struct shader_info *info)
{
   if ((int)info->subgroup_size >= (int)SUBGROUP_SIZE_REQUIRE_8) {
      assert(gl_shader_stage_uses_workgroup(info->stage));
      /* These enum values are expressly chosen to be equal to the subgroup
       * size that they require.
       */
      return (unsigned)info->subgroup_size;
   } else {
      return 0;
   }
}

static inline bool
test_bit(unsigned mask, unsigned bit)
{
   return mask & (1u << bit);
}

bool
brw_simd_should_compile(void *mem_ctx,
                        unsigned simd,
                        const struct intel_device_info *devinfo,
                        struct brw_cs_prog_data *prog_data,
                        unsigned required,
                        const char **error)
{
   assert(!test_bit(prog_data->prog_mask, simd));
   assert(error);

   const unsigned width = 8u << simd;

   /* For shaders with a variable workgroup size, we always compile all the
    * variants, since the choice can only happen at dispatch time.
    */
   const bool workgroup_size_variable = prog_data->local_size[0] == 0;

   if (!workgroup_size_variable) {
      if (test_bit(prog_data->prog_spilled, simd)) {
         *error = ralloc_asprintf(
            mem_ctx, "SIMD%u skipped because it would spill", width);
         return false;
      }

      const unsigned workgroup_size = prog_data->local_size[0] *
                                      prog_data->local_size[1] *
                                      prog_data->local_size[2];

      unsigned max_threads = devinfo->max_cs_workgroup_threads;

      if (required && required != width) {
         *error = ralloc_asprintf(
            mem_ctx, "SIMD%u skipped because required dispatch width is %u",
            width, required);
         return false;
      }

      if (simd > 0 && test_bit(prog_data->prog_mask, simd - 1) &&
          workgroup_size <= (width / 2)) {
         *error = ralloc_asprintf(
            mem_ctx, "SIMD%u skipped because workgroup size %u already fits in SIMD%u",
            width, workgroup_size, width / 2);
         return false;
      }

      if (DIV_ROUND_UP(workgroup_size, width) > max_threads) {
         *error = ralloc_asprintf(
            mem_ctx, "SIMD%u can't fit all %u invocations in %u threads",
            width, workgroup_size, max_threads);
         return false;
      }
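      /* Worked example of the check above (illustrative numbers only, not
       * taken from this file): a 16x16x4 workgroup has 1024 invocations, so
       * SIMD8 needs DIV_ROUND_UP(1024, 8) = 128 threads. On a device where
       * max_cs_workgroup_threads is 64, SIMD8 is rejected here, while SIMD16
       * (64 threads) and SIMD32 (32 threads) still fit.
       */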
      /* SIMD32 is only enabled when it is actually needed, unless forced.
       *
       * TODO: Use performance_analysis and drop this rule.
       */
      if (width == 32) {
         if (!INTEL_DEBUG(DEBUG_DO32) && prog_data->prog_mask) {
            *error = ralloc_strdup(
               mem_ctx, "SIMD32 skipped because not required");
            return false;
         }
      }
   }

   const bool env_skip[3] = {
      INTEL_DEBUG(DEBUG_NO8),
      INTEL_DEBUG(DEBUG_NO16),
      INTEL_DEBUG(DEBUG_NO32),
   };

   if (unlikely(env_skip[simd])) {
      *error = ralloc_asprintf(
         mem_ctx, "SIMD%u skipped because INTEL_DEBUG=no%u",
         width, width);
      return false;
   }

   return true;
}

void
brw_simd_mark_compiled(unsigned simd, struct brw_cs_prog_data *prog_data, bool spilled)
{
   assert(!test_bit(prog_data->prog_mask, simd));

   prog_data->prog_mask |= 1u << simd;

   /* If a SIMD variant spilled, all the wider ones would spill too. */
   if (spilled) {
      for (unsigned i = simd; i < 3; i++)
         prog_data->prog_spilled |= 1u << i;
   }
}

int
brw_simd_select(const struct brw_cs_prog_data *prog_data)
{
   assert((prog_data->prog_mask & ~0x7u) == 0);

   const unsigned not_spilled_mask =
      prog_data->prog_mask & ~prog_data->prog_spilled;

   /* Util functions index bits from 1 instead of 0, so adjust before
    * returning.
    */
   if (not_spilled_mask)
      return util_last_bit(not_spilled_mask) - 1;
   else if (prog_data->prog_mask)
      return ffs(prog_data->prog_mask) - 1;
   else
      return -1;
}

int
brw_simd_select_for_workgroup_size(const struct intel_device_info *devinfo,
                                   const struct brw_cs_prog_data *prog_data,
                                   const unsigned *sizes)
{
   assert(sizes);

   if (prog_data->local_size[0] == sizes[0] &&
       prog_data->local_size[1] == sizes[1] &&
       prog_data->local_size[2] == sizes[2])
      return brw_simd_select(prog_data);

   void *mem_ctx = ralloc_context(NULL);

   struct brw_cs_prog_data cloned = *prog_data;
   for (unsigned i = 0; i < 3; i++)
      cloned.local_size[i] = sizes[i];
   cloned.prog_mask = 0;
   cloned.prog_spilled = 0;

   const char *error[3] = {0};

   for (unsigned simd = 0; simd < 3; simd++) {
      /* We are not recompiling, so reuse the original prog_mask and
       * prog_spilled results, since they already cover all possible
       * compilations.
       */
      if (brw_simd_should_compile(mem_ctx, simd, devinfo, &cloned,
                                  0 /* required_dispatch_width */,
                                  &error[simd]) &&
          test_bit(prog_data->prog_mask, simd)) {
         brw_simd_mark_compiled(simd, &cloned,
                                test_bit(prog_data->prog_spilled, simd));
      }
   }

   ralloc_free(mem_ctx);

   return brw_simd_select(&cloned);
}
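
/* A minimal usage sketch of the API above, kept as a comment so the file
 * stays unchanged functionally. The compile_cs_variant() helper is
 * hypothetical and stands in for the actual back-end compile; everything
 * else is this file's API. The intended flow is: gate each width with
 * brw_simd_should_compile(), record outcomes with brw_simd_mark_compiled(),
 * then pick the variant to dispatch with brw_simd_select():
 *
 *    const char *error[3] = {0};
 *    for (unsigned simd = 0; simd < 3; simd++) {
 *       if (!brw_simd_should_compile(mem_ctx, simd, devinfo, prog_data,
 *                                    required_width, &error[simd]))
 *          continue;
 *       // Hypothetical: compile the SIMD(8 << simd) variant and report
 *       // whether it spilled; only mark it if compilation succeeded.
 *       bool spilled = compile_cs_variant(simd);
 *       brw_simd_mark_compiled(simd, prog_data, spilled);
 *    }
 *    int simd = brw_simd_select(prog_data);   // -1 if nothing compiled
 */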