mesa/src/gallium/drivers/etnaviv/etnaviv_shader.c

/*
 * Copyright (c) 2012-2015 Etnaviv Project
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sub license,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *    Wladimir J. van der Laan <laanwj@gmail.com>
 */

#include "etnaviv_shader.h"

#include "etnaviv_compiler.h"
#include "etnaviv_context.h"
#include "etnaviv_debug.h"
#include "etnaviv_disasm.h"
#include "etnaviv_disk_cache.h"
#include "etnaviv_screen.h"
#include "etnaviv_util.h"

#include "tgsi/tgsi_parse.h"
#include "nir/tgsi_to_nir.h"
#include "util/u_atomic.h"
#include "util/u_cpu_detect.h"
#include "util/u_math.h"
#include "util/u_memory.h"

/* Upload shader code to bo, if not already done */
static bool etna_icache_upload_shader(struct etna_context *ctx, struct etna_shader_variant *v)
{
   if (v->bo)
      return true;
   v->bo = etna_bo_new(ctx->screen->dev, v->code_size*4, DRM_ETNA_GEM_CACHE_WC);
   if (!v->bo)
      return false;

   void *buf = etna_bo_map(v->bo);
   etna_bo_cpu_prep(v->bo, DRM_ETNA_PREP_WRITE);
   memcpy(buf, v->code, v->code_size*4);
   etna_bo_cpu_fini(v->bo);
   DBG("Uploaded %s of %u words to bo %p", v->stage == MESA_SHADER_FRAGMENT ? "fs":"vs", v->code_size, v->bo);
   return true;
}

extern const char *tgsi_swizzle_names[];
void
etna_dump_shader(const struct etna_shader_variant *shader)
{
   if (shader->stage == MESA_SHADER_VERTEX)
      printf("VERT\n");
   else
      printf("FRAG\n");

   etna_disasm(shader->code, shader->code_size, PRINT_RAW);

   printf("num loops: %i\n", shader->num_loops);
   printf("num temps: %i\n", shader->num_temps);
   printf("immediates:\n");
   for (int idx = 0; idx < shader->uniforms.count; ++idx) {
      printf(" [%i].%s = %f (0x%08x) (%d)\n",
             idx / 4,
             tgsi_swizzle_names[idx % 4],
             *((float *)&shader->uniforms.data[idx]),
             shader->uniforms.data[idx],
             shader->uniforms.contents[idx]);
   }
   printf("inputs:\n");
   for (int idx = 0; idx < shader->infile.num_reg; ++idx) {
      printf(" [%i] name=%s comps=%i\n", shader->infile.reg[idx].reg,
               (shader->stage == MESA_SHADER_VERTEX) ?
               gl_vert_attrib_name(shader->infile.reg[idx].slot) :
               gl_varying_slot_name_for_stage(shader->infile.reg[idx].slot, shader->stage),
               shader->infile.reg[idx].num_components);
   }
   printf("outputs:\n");
   for (int idx = 0; idx < shader->outfile.num_reg; ++idx) {
      printf(" [%i] name=%s comps=%i\n", shader->outfile.reg[idx].reg,
               (shader->stage == MESA_SHADER_VERTEX) ?
               gl_varying_slot_name_for_stage(shader->outfile.reg[idx].slot, shader->stage) :
               gl_frag_result_name(shader->outfile.reg[idx].slot),
               shader->outfile.reg[idx].num_components);
   }
   printf("special:\n");
   if (shader->stage == MESA_SHADER_VERTEX) {
      printf("  vs_pos_out_reg=%i\n", shader->vs_pos_out_reg);
      printf("  vs_pointsize_out_reg=%i\n", shader->vs_pointsize_out_reg);
      printf("  vs_load_balancing=0x%08x\n", shader->vs_load_balancing);
   } else {
      printf("  ps_color_out_reg=%i\n", shader->ps_color_out_reg);
      printf("  ps_depth_out_reg=%i\n", shader->ps_depth_out_reg);
   }
   printf("  input_count_unk8=0x%08x\n", shader->input_count_unk8);
}

/* Link vs and fs together: fill in shader_state from vs and fs
 * as this function is called every time a new fs or vs is bound, the goal is to
 * do little processing as possible here, and to precompute as much as possible in
 * the vs/fs shader_object.
 *
 * XXX we could cache the link result for a certain set of VS/PS; usually a pair
 * of VS and PS will be used together anyway.
 */
static bool
etna_link_shaders(struct etna_context *ctx, struct compiled_shader_state *cs,
                  struct etna_shader_variant *vs, struct etna_shader_variant *fs)
{
   struct etna_shader_link_info link = { };
   bool failed;

   assert(vs->stage == MESA_SHADER_VERTEX);
   assert(fs->stage == MESA_SHADER_FRAGMENT);

#ifdef DEBUG
   if (DBG_ENABLED(ETNA_DBG_DUMP_SHADERS)) {
      etna_dump_shader(vs);
      etna_dump_shader(fs);
   }
#endif

   failed = etna_link_shader(&link, vs, fs);

   if (failed) {
      /* linking failed: some fs inputs do not have corresponding
       * vs outputs */
      assert(0);

      return false;
   }

   if (DBG_ENABLED(ETNA_DBG_LINKER_MSGS)) {
      debug_printf("link result:\n");
      debug_printf("  vs  -> fs  comps use     pa_attr\n");

      for (int idx = 0; idx < link.num_varyings; ++idx)
         debug_printf("  t%-2u -> t%-2u %-5.*s %u,%u,%u,%u 0x%08x\n",
                      link.varyings[idx].reg, idx + 1,
                      link.varyings[idx].num_components, "xyzw",
                      link.varyings[idx].use[0], link.varyings[idx].use[1],
                      link.varyings[idx].use[2], link.varyings[idx].use[3],
                      link.varyings[idx].pa_attributes);
   }

   /* set last_varying_2x flag if the last varying has 1 or 2 components */
   bool last_varying_2x = false;
   if (link.num_varyings > 0 && link.varyings[link.num_varyings - 1].num_components <= 2)
      last_varying_2x = true;

   cs->RA_CONTROL = VIVS_RA_CONTROL_UNK0 |
                    COND(last_varying_2x, VIVS_RA_CONTROL_LAST_VARYING_2X);

   cs->PA_ATTRIBUTE_ELEMENT_COUNT = VIVS_PA_ATTRIBUTE_ELEMENT_COUNT_COUNT(link.num_varyings);
   for (int idx = 0; idx < link.num_varyings; ++idx)
      cs->PA_SHADER_ATTRIBUTES[idx] = link.varyings[idx].pa_attributes;

   cs->VS_END_PC = vs->code_size / 4;
   cs->VS_OUTPUT_COUNT = 1 + link.num_varyings; /* position + varyings */

   /* vs outputs (varyings) */
   DEFINE_ETNA_BITARRAY(vs_output, 16, 8) = {0};
   int varid = 0;
   etna_bitarray_set(vs_output, 8, varid++, vs->vs_pos_out_reg);
   for (int idx = 0; idx < link.num_varyings; ++idx)
      etna_bitarray_set(vs_output, 8, varid++, link.varyings[idx].reg);
   if (vs->vs_pointsize_out_reg >= 0)
      etna_bitarray_set(vs_output, 8, varid++, vs->vs_pointsize_out_reg); /* pointsize is last */

   for (int idx = 0; idx < ARRAY_SIZE(cs->VS_OUTPUT); ++idx)
      cs->VS_OUTPUT[idx] = vs_output[idx];

   if (vs->vs_pointsize_out_reg != -1) {
      /* vertex shader outputs point coordinate, provide extra output and make
       * sure PA config is
       * not masked */
      cs->PA_CONFIG = ~0;
      cs->VS_OUTPUT_COUNT_PSIZE = cs->VS_OUTPUT_COUNT + 1;
   } else {
      /* vertex shader does not output point coordinate, make sure thate
       * POINT_SIZE_ENABLE is masked
       * and no extra output is given */
      cs->PA_CONFIG = ~VIVS_PA_CONFIG_POINT_SIZE_ENABLE;
      cs->VS_OUTPUT_COUNT_PSIZE = cs->VS_OUTPUT_COUNT;
   }

   /* if fragment shader doesn't read pointcoord, disable it */
   if (link.pcoord_varying_comp_ofs == -1)
      cs->PA_CONFIG &= ~VIVS_PA_CONFIG_POINT_SPRITE_ENABLE;

   cs->VS_LOAD_BALANCING = vs->vs_load_balancing;
   cs->VS_START_PC = 0;

   cs->PS_END_PC = fs->code_size / 4;
   cs->PS_OUTPUT_REG = fs->ps_color_out_reg;
   cs->PS_INPUT_COUNT =
      VIVS_PS_INPUT_COUNT_COUNT(link.num_varyings + 1) | /* Number of inputs plus position */
      VIVS_PS_INPUT_COUNT_UNK8(fs->input_count_unk8);
   cs->PS_TEMP_REGISTER_CONTROL =
      VIVS_PS_TEMP_REGISTER_CONTROL_NUM_TEMPS(MAX2(fs->num_temps, link.num_varyings + 1));
   cs->PS_START_PC = 0;

   /* Precompute PS_INPUT_COUNT and TEMP_REGISTER_CONTROL in the case of MSAA
    * mode, avoids some fumbling in sync_context. */
   cs->PS_INPUT_COUNT_MSAA =
      VIVS_PS_INPUT_COUNT_COUNT(link.num_varyings + 2) | /* MSAA adds another input */
      VIVS_PS_INPUT_COUNT_UNK8(fs->input_count_unk8);
   cs->PS_TEMP_REGISTER_CONTROL_MSAA =
      VIVS_PS_TEMP_REGISTER_CONTROL_NUM_TEMPS(MAX2(fs->num_temps, link.num_varyings + 2));

   uint32_t total_components = 0;
   DEFINE_ETNA_BITARRAY(num_components, ETNA_NUM_VARYINGS, 4) = {0};
   DEFINE_ETNA_BITARRAY(component_use, 4 * ETNA_NUM_VARYINGS, 2) = {0};
   for (int idx = 0; idx < link.num_varyings; ++idx) {
      const struct etna_varying *varying = &link.varyings[idx];

      etna_bitarray_set(num_components, 4, idx, varying->num_components);
      for (int comp = 0; comp < varying->num_components; ++comp) {
         etna_bitarray_set(component_use, 2, total_components, varying->use[comp]);
         total_components += 1;
      }
   }

   cs->GL_VARYING_TOTAL_COMPONENTS =
      VIVS_GL_VARYING_TOTAL_COMPONENTS_NUM(align(total_components, 2));
   cs->GL_VARYING_NUM_COMPONENTS[0] = num_components[0];
   cs->GL_VARYING_NUM_COMPONENTS[1] = num_components[1];
   cs->GL_VARYING_COMPONENT_USE[0] = component_use[0];
   cs->GL_VARYING_COMPONENT_USE[1] = component_use[1];

   cs->GL_HALTI5_SH_SPECIALS =
      0x7f7f0000 | /* unknown bits, probably other PS inputs */
      /* pointsize is last (see above) */
      VIVS_GL_HALTI5_SH_SPECIALS_VS_PSIZE_OUT((vs->vs_pointsize_out_reg != -1) ?
                                              cs->VS_OUTPUT_COUNT * 4 : 0x00) |
      VIVS_GL_HALTI5_SH_SPECIALS_PS_PCOORD_IN((link.pcoord_varying_comp_ofs != -1) ?
                                              link.pcoord_varying_comp_ofs : 0x7f);

   cs->writes_z = fs->ps_depth_out_reg >= 0;
   cs->uses_discard = fs->uses_discard;

   /* reference instruction memory */
   cs->vs_inst_mem_size = vs->code_size;
   cs->VS_INST_MEM = vs->code;

   cs->ps_inst_mem_size = fs->code_size;
   cs->PS_INST_MEM = fs->code;

   if (vs->needs_icache || fs->needs_icache) {
      /* If either of the shaders needs ICACHE, we use it for both. It is
       * either switched on or off for the entire shader processor.
       */
      if (!etna_icache_upload_shader(ctx, vs) ||
          !etna_icache_upload_shader(ctx, fs)) {
         assert(0);
         return false;
      }

      cs->VS_INST_ADDR.bo = vs->bo;
      cs->VS_INST_ADDR.offset = 0;
      cs->VS_INST_ADDR.flags = ETNA_RELOC_READ;
      cs->PS_INST_ADDR.bo = fs->bo;
      cs->PS_INST_ADDR.offset = 0;
      cs->PS_INST_ADDR.flags = ETNA_RELOC_READ;
   } else {
      /* clear relocs */
      memset(&cs->VS_INST_ADDR, 0, sizeof(cs->VS_INST_ADDR));
      memset(&cs->PS_INST_ADDR, 0, sizeof(cs->PS_INST_ADDR));
   }

   return true;
}

bool
etna_shader_link(struct etna_context *ctx)
{
   if (!ctx->shader.vs || !ctx->shader.fs)
      return false;

   /* re-link vs and fs if needed */
   return etna_link_shaders(ctx, &ctx->shader_state, ctx->shader.vs, ctx->shader.fs);
}

void
etna_destroy_shader(struct etna_shader_variant *shader)
{
   assert(shader);

   FREE(shader->code);
   FREE(shader->uniforms.data);
   FREE(shader->uniforms.contents);
   FREE(shader);
}

static bool
etna_shader_update_vs_inputs(struct compiled_shader_state *cs,
                             const struct etna_shader_variant *vs,
                             const struct compiled_vertex_elements_state *ves)
{
   unsigned num_temps, cur_temp, num_vs_inputs;

   if (!vs)
      return false;

   /* Number of vertex elements determines number of VS inputs. Otherwise,
    * the GPU crashes. Allocate any unused vertex elements to VS temporary
    * registers. */
   num_vs_inputs = MAX2(ves->num_elements, vs->infile.num_reg);
   if (num_vs_inputs != ves->num_elements) {
      BUG("Number of elements %u does not match the number of VS inputs %zu",
          ves->num_elements, vs->infile.num_reg);
      return false;
   }

   cur_temp = vs->num_temps;
   num_temps = num_vs_inputs - vs->infile.num_reg + cur_temp;

   cs->VS_INPUT_COUNT = VIVS_VS_INPUT_COUNT_COUNT(num_vs_inputs) |
                        VIVS_VS_INPUT_COUNT_UNK8(vs->input_count_unk8);
   cs->VS_TEMP_REGISTER_CONTROL =
      VIVS_VS_TEMP_REGISTER_CONTROL_NUM_TEMPS(num_temps);

   /* vs inputs (attributes) */
   DEFINE_ETNA_BITARRAY(vs_input, 16, 8) = {0};
   for (int idx = 0; idx < num_vs_inputs; ++idx) {
      if (idx < vs->infile.num_reg)
         etna_bitarray_set(vs_input, 8, idx, vs->infile.reg[idx].reg);
      else
         etna_bitarray_set(vs_input, 8, idx, cur_temp++);
   }

   if (vs->vs_id_in_reg >= 0) {
      cs->VS_INPUT_COUNT = VIVS_VS_INPUT_COUNT_COUNT(num_vs_inputs + 1) |
                           VIVS_VS_INPUT_COUNT_UNK8(vs->input_count_unk8) |
                           VIVS_VS_INPUT_COUNT_ID_ENABLE;

      etna_bitarray_set(vs_input, 8, num_vs_inputs, vs->vs_id_in_reg);

      cs->FE_HALTI5_ID_CONFIG =
         VIVS_FE_HALTI5_ID_CONFIG_VERTEX_ID_ENABLE |
         VIVS_FE_HALTI5_ID_CONFIG_INSTANCE_ID_ENABLE |
         VIVS_FE_HALTI5_ID_CONFIG_VERTEX_ID_REG(vs->vs_id_in_reg * 4) |
         VIVS_FE_HALTI5_ID_CONFIG_INSTANCE_ID_REG(vs->vs_id_in_reg * 4 + 1);
   }

   for (int idx = 0; idx < ARRAY_SIZE(cs->VS_INPUT); ++idx)
      cs->VS_INPUT[idx] = vs_input[idx];

   return true;
}

static inline const char *
etna_shader_stage(struct etna_shader_variant *shader)
{
   switch (shader->stage) {
   case MESA_SHADER_VERTEX:     return "VERT";
   case MESA_SHADER_FRAGMENT:   return "FRAG";
   case MESA_SHADER_COMPUTE:    return "CL";
   default:
      unreachable("invalid type");
      return NULL;
   }
}

static void
dump_shader_info(struct etna_shader_variant *v, struct util_debug_callback *debug)
{
   if (!unlikely(etna_mesa_debug & ETNA_DBG_SHADERDB))
      return;

   util_debug_message(debug, SHADER_INFO,
         "%s shader: %u instructions, %u temps, "
         "%u immediates, %u loops",
         etna_shader_stage(v),
         v->code_size,
         v->num_temps,
         v->uniforms.count,
         v->num_loops);
}

bool
etna_shader_update_vertex(struct etna_context *ctx)
{
   return etna_shader_update_vs_inputs(&ctx->shader_state, ctx->shader.vs,
                                       ctx->vertex_elements);
}

static struct etna_shader_variant *
create_variant(struct etna_shader *shader, struct etna_shader_key key)
{
   struct etna_shader_variant *v = CALLOC_STRUCT(etna_shader_variant);
   int ret;

   if (!v)
      return NULL;

   v->shader = shader;
   v->key = key;
   v->id = ++shader->variant_count;

   if (etna_disk_cache_retrieve(shader->compiler, v))
      return v;

   ret = etna_compile_shader(v);
   if (!ret) {
      debug_error("compile failed!");
      goto fail;
   }

   etna_disk_cache_store(shader->compiler, v);

   return v;

fail:
   FREE(v);
   return NULL;
}

struct etna_shader_variant *
etna_shader_variant(struct etna_shader *shader, struct etna_shader_key key,
                   struct util_debug_callback *debug)
{
   struct etna_shader_variant *v;

   for (v = shader->variants; v; v = v->next)
      if (etna_shader_key_equal(&key, &v->key))
         return v;

   /* compile new variant if it doesn't exist already */
   v = create_variant(shader, key);
   if (v) {
      v->next = shader->variants;
      shader->variants = v;
      dump_shader_info(v, debug);
   }

   return v;
}

/**
 * Should initial variants be compiled synchronously?
 *
 * The only case where pipe_debug_message() is used in the initial-variants
 * path is with ETNA_MESA_DEBUG=shaderdb. So if either debug is disabled (ie.
 * debug.debug_message==NULL), or shaderdb stats are not enabled, we can
 * compile the initial shader variant asynchronously.
 */
static inline bool
initial_variants_synchronous(struct etna_context *ctx)
{
   return unlikely(ctx->debug.debug_message) || (etna_mesa_debug & ETNA_DBG_SHADERDB);
}

static void
create_initial_variants_async(void *job, void *gdata, int thread_index)
{
   struct etna_shader *shader = job;
   struct util_debug_callback debug = {};
   static struct etna_shader_key key;

   etna_shader_variant(shader, key, &debug);
}

static void *
etna_create_shader_state(struct pipe_context *pctx,
                         const struct pipe_shader_state *pss)
{
   struct etna_context *ctx = etna_context(pctx);
   struct etna_screen *screen = ctx->screen;
   struct etna_compiler *compiler = screen->compiler;
   struct etna_shader *shader = CALLOC_STRUCT(etna_shader);

   if (!shader)
      return NULL;

   shader->id = p_atomic_inc_return(&compiler->shader_count);
   shader->specs = &screen->specs;
   shader->compiler = screen->compiler;
   util_queue_fence_init(&shader->ready);

   shader->nir = (pss->type == PIPE_SHADER_IR_NIR) ? pss->ir.nir :
                  tgsi_to_nir(pss->tokens, pctx->screen, false);

   etna_disk_cache_init_shader_key(compiler, shader);

   if (initial_variants_synchronous(ctx)) {
      struct etna_shader_key key = {};
      etna_shader_variant(shader, key, &ctx->debug);
   } else {
      struct etna_screen *screen = ctx->screen;
      util_queue_add_job(&screen->shader_compiler_queue, shader, &shader->ready,
                         create_initial_variants_async, NULL, 0);
   }

   return shader;
}

static void
etna_delete_shader_state(struct pipe_context *pctx, void *ss)
{
   struct etna_shader *shader = ss;
   struct etna_shader_variant *v, *t;

   v = shader->variants;
   while (v) {
      t = v;
      v = v->next;
      if (t->bo)
         etna_bo_del(t->bo);

      etna_destroy_shader(t);
   }

   tgsi_free_tokens(shader->tokens);
   ralloc_free(shader->nir);
   FREE(shader);
}

static void
etna_bind_fs_state(struct pipe_context *pctx, void *hwcso)
{
   struct etna_context *ctx = etna_context(pctx);

   ctx->shader.bind_fs = hwcso;
   ctx->dirty |= ETNA_DIRTY_SHADER;
}

static void
etna_bind_vs_state(struct pipe_context *pctx, void *hwcso)
{
   struct etna_context *ctx = etna_context(pctx);

   ctx->shader.bind_vs = hwcso;
   ctx->dirty |= ETNA_DIRTY_SHADER;
}

static void
etna_set_max_shader_compiler_threads(struct pipe_screen *pscreen,
                                     unsigned max_threads)
{
   struct etna_screen *screen = etna_screen(pscreen);

   util_queue_adjust_num_threads(&screen->shader_compiler_queue, max_threads);
}

static bool
etna_is_parallel_shader_compilation_finished(struct pipe_screen *pscreen,
                                             void *hwcso,
                                             enum pipe_shader_type shader_type)
{
   struct etna_shader *shader = (struct etna_shader *)hwcso;

   return util_queue_fence_is_signalled(&shader->ready);
}

void
etna_shader_init(struct pipe_context *pctx)
{
   pctx->create_fs_state = etna_create_shader_state;
   pctx->bind_fs_state = etna_bind_fs_state;
   pctx->delete_fs_state = etna_delete_shader_state;
   pctx->create_vs_state = etna_create_shader_state;
   pctx->bind_vs_state = etna_bind_vs_state;
   pctx->delete_vs_state = etna_delete_shader_state;
}

bool
etna_shader_screen_init(struct pipe_screen *pscreen)
{
   struct etna_screen *screen = etna_screen(pscreen);
   unsigned num_threads = util_get_cpu_caps()->nr_cpus - 1;

   /* Create at least one thread - even on single core CPU systems. */
   num_threads = MAX2(1, num_threads);

   screen->compiler = etna_compiler_create(pscreen->get_name(pscreen));
   if (!screen->compiler)
      return false;

   pscreen->set_max_shader_compiler_threads = etna_set_max_shader_compiler_threads;
   pscreen->is_parallel_shader_compilation_finished = etna_is_parallel_shader_compilation_finished;

   return util_queue_init(&screen->shader_compiler_queue, "sh", 64, num_threads,
                          UTIL_QUEUE_INIT_RESIZE_IF_FULL | UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY,
                          NULL);
}

void
etna_shader_screen_fini(struct pipe_screen *pscreen)
{
   struct etna_screen *screen = etna_screen(pscreen);

   util_queue_destroy(&screen->shader_compiler_queue);
   etna_compiler_destroy(screen->compiler);
}