/* mesa/src/gallium/drivers/zink/zink_compiler.c */
/*
* Copyright 2018 Collabora Ltd.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* on the rights to use, copy, modify, merge, publish, distribute, sub
* license, and/or sell copies of the Software, and to permit persons to whom
* the Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
* THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "nir_opcodes.h"
#include "zink_context.h"
#include "zink_compiler.h"
#include "zink_descriptors.h"
#include "zink_program.h"
#include "zink_screen.h"
#include "nir_to_spirv/nir_to_spirv.h"
#include "pipe/p_state.h"
#include "nir.h"
#include "nir_xfb_info.h"
#include "nir/nir_draw_helpers.h"
#include "compiler/nir/nir_builder.h"
#include "compiler/nir/nir_serialize.h"
#include "compiler/nir/nir_builtin_builder.h"
#include "nir/tgsi_to_nir.h"
#include "tgsi/tgsi_dump.h"
#include "util/u_memory.h"
#include "compiler/spirv/nir_spirv.h"
#include "vk_util.h"
bool
zink_lower_cubemap_to_array(nir_shader *s, uint32_t nonseamless_cube_mask);
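/* Recursively copy the value behind one deref chain to another.
* Structs, arrays and matrices are split member-by-member until a
* scalar or vector leaf is reached, which is copied with a plain
* load_deref/store_deref pair. Both derefs must share the same bare type.
*/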
static void
copy_vars(nir_builder *b, nir_deref_instr *dst, nir_deref_instr *src)
{
assert(glsl_get_bare_type(dst->type) == glsl_get_bare_type(src->type));
if (glsl_type_is_struct_or_ifc(dst->type)) {
for (unsigned i = 0; i < glsl_get_length(dst->type); ++i) {
copy_vars(b, nir_build_deref_struct(b, dst, i), nir_build_deref_struct(b, src, i));
}
} else if (glsl_type_is_array_or_matrix(dst->type)) {
unsigned count = glsl_type_is_array(dst->type) ? glsl_array_size(dst->type) : glsl_get_matrix_columns(dst->type);
for (unsigned i = 0; i < count; i++) {
copy_vars(b, nir_build_deref_array_imm(b, dst, i), nir_build_deref_array_imm(b, src, i));
}
} else {
nir_def *load = nir_load_deref(b, src);
nir_store_deref(b, dst, load, BITFIELD_MASK(load->num_components));
}
}
static bool
is_clipcull_dist(int location)
{
switch (location) {
case VARYING_SLOT_CLIP_DIST0:
case VARYING_SLOT_CLIP_DIST1:
case VARYING_SLOT_CULL_DIST0:
case VARYING_SLOT_CULL_DIST1:
return true;
default: break;
}
return false;
}
#define SIZEOF_FIELD(type, field) sizeof(((type *)0)->field)
static void
create_gfx_pushconst(nir_shader *nir)
{
#define PUSHCONST_MEMBER(member_idx, field) \
fields[member_idx].type = \
glsl_array_type(glsl_uint_type(), SIZEOF_FIELD(struct zink_gfx_push_constant, field) / sizeof(uint32_t), 0); \
fields[member_idx].name = ralloc_asprintf(nir, #field); \
fields[member_idx].offset = offsetof(struct zink_gfx_push_constant, field);
nir_variable *pushconst;
/* create compatible layout for the ntv push constant loader */
struct glsl_struct_field *fields = rzalloc_array(nir, struct glsl_struct_field, ZINK_GFX_PUSHCONST_MAX);
PUSHCONST_MEMBER(ZINK_GFX_PUSHCONST_DRAW_MODE_IS_INDEXED, draw_mode_is_indexed);
PUSHCONST_MEMBER(ZINK_GFX_PUSHCONST_DRAW_ID, draw_id);
PUSHCONST_MEMBER(ZINK_GFX_PUSHCONST_FRAMEBUFFER_IS_LAYERED, framebuffer_is_layered);
PUSHCONST_MEMBER(ZINK_GFX_PUSHCONST_DEFAULT_INNER_LEVEL, default_inner_level);
PUSHCONST_MEMBER(ZINK_GFX_PUSHCONST_DEFAULT_OUTER_LEVEL, default_outer_level);
PUSHCONST_MEMBER(ZINK_GFX_PUSHCONST_LINE_STIPPLE_PATTERN, line_stipple_pattern);
PUSHCONST_MEMBER(ZINK_GFX_PUSHCONST_VIEWPORT_SCALE, viewport_scale);
PUSHCONST_MEMBER(ZINK_GFX_PUSHCONST_LINE_WIDTH, line_width);
pushconst = nir_variable_create(nir, nir_var_mem_push_const,
glsl_struct_type(fields, ZINK_GFX_PUSHCONST_MAX, "struct", false),
"gfx_pushconst");
pushconst->data.location = INT_MAX; //doesn't really matter
#undef PUSHCONST_MEMBER
}
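/* Lower load_base_vertex so it only returns the real base vertex for indexed
* draws. The draw_mode_is_indexed push constant selects the result, roughly:
*    base_vertex = (draw_mode_is_indexed == 1) ? load_base_vertex : 0
* matching GL's rule that gl_BaseVertex is zero for non-indexed draws.
*/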
static bool
lower_basevertex_instr(nir_builder *b, nir_intrinsic_instr *instr, void *data)
{
if (instr->intrinsic != nir_intrinsic_load_base_vertex)
return false;
b->cursor = nir_after_instr(&instr->instr);
nir_intrinsic_instr *load = nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_push_constant_zink);
load->src[0] = nir_src_for_ssa(nir_imm_int(b, ZINK_GFX_PUSHCONST_DRAW_MODE_IS_INDEXED));
load->num_components = 1;
nir_def_init(&load->instr, &load->def, 1, 32);
nir_builder_instr_insert(b, &load->instr);
nir_def *composite = nir_build_alu(b, nir_op_bcsel,
nir_build_alu(b, nir_op_ieq, &load->def, nir_imm_int(b, 1), NULL, NULL),
&instr->def,
nir_imm_int(b, 0),
NULL);
nir_def_rewrite_uses_after(&instr->def, composite,
composite->parent_instr);
return true;
}
static bool
lower_basevertex(nir_shader *shader)
{
if (shader->info.stage != MESA_SHADER_VERTEX)
return false;
if (!BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_BASE_VERTEX))
return false;
return nir_shader_intrinsics_pass(shader, lower_basevertex_instr,
nir_metadata_dominance, NULL);
}
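/* Lower load_draw_id to a read of the draw_id member of the gfx push
* constant block.
*/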
static bool
lower_drawid_instr(nir_builder *b, nir_intrinsic_instr *instr, void *data)
{
if (instr->intrinsic != nir_intrinsic_load_draw_id)
return false;
b->cursor = nir_before_instr(&instr->instr);
nir_intrinsic_instr *load = nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_push_constant_zink);
load->src[0] = nir_src_for_ssa(nir_imm_int(b, ZINK_GFX_PUSHCONST_DRAW_ID));
load->num_components = 1;
nir_def_init(&load->instr, &load->def, 1, 32);
nir_builder_instr_insert(b, &load->instr);
nir_def_rewrite_uses(&instr->def, &load->def);
return true;
}
static bool
lower_drawid(nir_shader *shader)
{
if (shader->info.stage != MESA_SHADER_VERTEX)
return false;
if (!BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_DRAW_ID))
return false;
return nir_shader_intrinsics_pass(shader, lower_drawid_instr,
nir_metadata_dominance, NULL);
}
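/* GS lowering that emulates wide points: every point emitted on stream 0 is
* replaced by a 4-vertex triangle-strip quad centered on gl_Position. The
* half-extent along each axis comes from gl_PointSize, the viewport scale
* push constant and the position's w component, roughly:
*    half_w = 0.5 * gl_PointSize / viewport_scale.x * gl_Position.w
* and likewise for the height.
*/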
struct lower_gl_point_state {
nir_variable *gl_pos_out;
nir_variable *gl_point_size;
};
static bool
lower_gl_point_gs_instr(nir_builder *b, nir_instr *instr, void *data)
{
struct lower_gl_point_state *state = data;
nir_def *vp_scale, *pos;
if (instr->type != nir_instr_type_intrinsic)
return false;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
if (intrin->intrinsic != nir_intrinsic_emit_vertex_with_counter &&
intrin->intrinsic != nir_intrinsic_emit_vertex &&
intrin->intrinsic != nir_intrinsic_end_primitive &&
intrin->intrinsic != nir_intrinsic_end_primitive_with_counter)
return false;
if (nir_intrinsic_stream_id(intrin) != 0)
return false;
if (intrin->intrinsic == nir_intrinsic_end_primitive_with_counter ||
intrin->intrinsic == nir_intrinsic_end_primitive) {
nir_instr_remove(&intrin->instr);
return true;
}
b->cursor = nir_before_instr(instr);
// load the viewport scale from the push constants
nir_def *vp_const_pos = nir_imm_int(b, ZINK_GFX_PUSHCONST_VIEWPORT_SCALE);
vp_scale = nir_load_push_constant_zink(b, 2, 32, vp_const_pos);
// Load point info values
nir_def *point_size = nir_load_var(b, state->gl_point_size);
nir_def *point_pos = nir_load_var(b, state->gl_pos_out);
// w_delta = gl_point_size / width_viewport_size_scale * gl_Position.w
nir_def *w_delta = nir_fdiv(b, point_size, nir_channel(b, vp_scale, 0));
w_delta = nir_fmul(b, w_delta, nir_channel(b, point_pos, 3));
// half_w_delta = w_delta / 2
nir_def *half_w_delta = nir_fmul_imm(b, w_delta, 0.5);
// h_delta = gl_point_size / height_viewport_size_scale * gl_Position.w
nir_def *h_delta = nir_fdiv(b, point_size, nir_channel(b, vp_scale, 1));
h_delta = nir_fmul(b, h_delta, nir_channel(b, point_pos, 3));
// half_h_delta = h_delta / 2
nir_def *half_h_delta = nir_fmul_imm(b, h_delta, 0.5);
nir_def *point_dir[4][2] = {
{ nir_imm_float(b, -1), nir_imm_float(b, -1) },
{ nir_imm_float(b, -1), nir_imm_float(b, 1) },
{ nir_imm_float(b, 1), nir_imm_float(b, -1) },
{ nir_imm_float(b, 1), nir_imm_float(b, 1) }
};
nir_def *point_pos_x = nir_channel(b, point_pos, 0);
nir_def *point_pos_y = nir_channel(b, point_pos, 1);
for (size_t i = 0; i < 4; i++) {
pos = nir_vec4(b,
nir_ffma(b, half_w_delta, point_dir[i][0], point_pos_x),
nir_ffma(b, half_h_delta, point_dir[i][1], point_pos_y),
nir_channel(b, point_pos, 2),
nir_channel(b, point_pos, 3));
nir_store_var(b, state->gl_pos_out, pos, 0xf);
nir_emit_vertex(b);
}
nir_end_primitive(b);
nir_instr_remove(&intrin->instr);
return true;
}
static bool
lower_gl_point_gs(nir_shader *shader)
{
struct lower_gl_point_state state;
shader->info.gs.output_primitive = MESA_PRIM_TRIANGLE_STRIP;
shader->info.gs.vertices_out *= 4;
// find the gl_Position and gl_PointSize outputs
state.gl_pos_out =
nir_find_variable_with_location(shader, nir_var_shader_out,
VARYING_SLOT_POS);
state.gl_point_size =
nir_find_variable_with_location(shader, nir_var_shader_out,
VARYING_SLOT_PSIZ);
// if gl_Position or gl_PointSize isn't written, we have nothing to do
if (!state.gl_pos_out || !state.gl_point_size)
return false;
return nir_shader_instructions_pass(shader, lower_gl_point_gs_instr,
nir_metadata_dominance, &state);
}
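/* GS lowering used to emulate a non-default provoking vertex convention.
* Every output written by the user GS is captured into a per-component ring
* array (ring_size == vertices_out entries) instead of being emitted
* directly; at each EndPrimitive the buffered vertices are replayed in a
* rotated order so the intended vertex becomes the provoking one. Ring slots
* are addressed as (index + ring_offset) % ring_size.
*/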
struct lower_pv_mode_state {
nir_variable *varyings[VARYING_SLOT_MAX][4];
nir_variable *pos_counter;
nir_variable *out_pos_counter;
nir_variable *ring_offset;
unsigned ring_size;
unsigned primitive_vert_count;
unsigned prim;
};
static nir_def*
lower_pv_mode_gs_ring_index(nir_builder *b,
struct lower_pv_mode_state *state,
nir_def *index)
{
nir_def *ring_offset = nir_load_var(b, state->ring_offset);
return nir_imod_imm(b, nir_iadd(b, index, ring_offset),
state->ring_size);
}
/* Given the final deref of a chain of derefs, this function will walk up the chain
* until it finds a var deref.
*
* It will then recreate an identical chain that ends with the provided deref.
*/
static nir_deref_instr*
replicate_derefs(nir_builder *b, nir_deref_instr *old, nir_deref_instr *new)
{
nir_deref_instr *parent = nir_deref_instr_parent(old);
if (!parent)
return new;
switch(old->deref_type) {
case nir_deref_type_var:
return new;
case nir_deref_type_array:
return nir_build_deref_array(b, replicate_derefs(b, parent, new), old->arr.index.ssa);
case nir_deref_type_struct:
return nir_build_deref_struct(b, replicate_derefs(b, parent, new), old->strct.index);
case nir_deref_type_array_wildcard:
case nir_deref_type_ptr_as_array:
case nir_deref_type_cast:
unreachable("unexpected deref type");
}
unreachable("impossible deref type");
}
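/* Redirect stores to shader outputs into the ring arrays. Illustratively, a
* store to gl_Position becomes a store to
* __tmp_primverts_0_0[(pos_counter + ring_offset) % ring_size], with any
* array/struct derefs below the variable replicated on top of the ring slot.
*/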
static bool
lower_pv_mode_gs_store(nir_builder *b,
nir_intrinsic_instr *intrin,
struct lower_pv_mode_state *state)
{
b->cursor = nir_before_instr(&intrin->instr);
nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
if (nir_deref_mode_is(deref, nir_var_shader_out)) {
nir_variable *var = nir_deref_instr_get_variable(deref);
gl_varying_slot location = var->data.location;
unsigned location_frac = var->data.location_frac;
assert(state->varyings[location][location_frac]);
nir_def *pos_counter = nir_load_var(b, state->pos_counter);
nir_def *index = lower_pv_mode_gs_ring_index(b, state, pos_counter);
nir_deref_instr *varying_deref = nir_build_deref_var(b, state->varyings[location][location_frac]);
nir_deref_instr *ring_deref = nir_build_deref_array(b, varying_deref, index);
// recreate the chain of derefs that led to the store.
nir_deref_instr *new_top_deref = replicate_derefs(b, deref, ring_deref);
nir_store_deref(b, new_top_deref, intrin->src[1].ssa, nir_intrinsic_write_mask(intrin));
nir_instr_remove(&intrin->instr);
return true;
}
return false;
}
static void
lower_pv_mode_emit_rotated_prim(nir_builder *b,
struct lower_pv_mode_state *state,
nir_def *current_vertex)
{
nir_def *two = nir_imm_int(b, 2);
nir_def *three = nir_imm_int(b, 3);
bool is_triangle = state->primitive_vert_count == 3;
/* This shader will always see the last three vertices emitted by the user gs.
* The following table is used to rotate primitives within a strip generated
* by the user gs such that the last vertex becomes the first.
*
* [lines, tris][even/odd index][vertex mod 3]
*/
static const unsigned vert_maps[2][2][3] = {
{{1, 0, 0}, {1, 0, 0}},
{{2, 0, 1}, {2, 1, 0}}
};
/* When the primitive supplied to the gs comes from a strip, the last provoking vertex
* is either the last or the second, depending on whether the triangle is at an odd
* or even position within the strip.
*
* odd or even primitive within draw
*/
nir_def *odd_prim = nir_imod(b, nir_load_primitive_id(b), two);
for (unsigned i = 0; i < state->primitive_vert_count; i++) {
/* odd or even triangle within strip emitted by user GS
* this is handled using the table
*/
nir_def *odd_user_prim = nir_imod(b, current_vertex, two);
unsigned offset_even = vert_maps[is_triangle][0][i];
unsigned offset_odd = vert_maps[is_triangle][1][i];
nir_def *offset_even_value = nir_imm_int(b, offset_even);
nir_def *offset_odd_value = nir_imm_int(b, offset_odd);
nir_def *rotated_i = nir_bcsel(b, nir_b2b1(b, odd_user_prim),
offset_odd_value, offset_even_value);
/* Here we account for how triangles are provided to the gs from a strip.
* For even primitives we rotate by 3, meaning we do nothing.
* For odd primitives we rotate by 2, combined with the previous rotation this
* means the second vertex becomes the last.
*/
if (state->prim == ZINK_PVE_PRIMITIVE_TRISTRIP)
rotated_i = nir_imod(b, nir_iadd(b, rotated_i,
nir_isub(b, three,
odd_prim)),
three);
/* Triangles that come from fans are provided to the gs the same way as
* odd triangles from a strip so always rotate by 2.
*/
else if (state->prim == ZINK_PVE_PRIMITIVE_FAN)
rotated_i = nir_imod(b, nir_iadd_imm(b, rotated_i, 2),
three);
rotated_i = nir_iadd(b, rotated_i, current_vertex);
nir_foreach_variable_with_modes(var, b->shader, nir_var_shader_out) {
gl_varying_slot location = var->data.location;
unsigned location_frac = var->data.location_frac;
if (state->varyings[location][location_frac]) {
nir_def *index = lower_pv_mode_gs_ring_index(b, state, rotated_i);
nir_deref_instr *value = nir_build_deref_array(b, nir_build_deref_var(b, state->varyings[location][location_frac]), index);
copy_vars(b, nir_build_deref_var(b, var), value);
}
}
nir_emit_vertex(b);
}
}
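/* EmitVertex only needs to advance pos_counter here: the actual output
* values were already captured into the ring arrays by the store rewrite.
*/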
static bool
lower_pv_mode_gs_emit_vertex(nir_builder *b,
nir_intrinsic_instr *intrin,
struct lower_pv_mode_state *state)
{
b->cursor = nir_before_instr(&intrin->instr);
// increment pos_counter
nir_def *pos_counter = nir_load_var(b, state->pos_counter);
nir_store_var(b, state->pos_counter, nir_iadd_imm(b, pos_counter, 1), 1);
nir_instr_remove(&intrin->instr);
return true;
}
static bool
lower_pv_mode_gs_end_primitive(nir_builder *b,
nir_intrinsic_instr *intrin,
struct lower_pv_mode_state *state)
{
b->cursor = nir_before_instr(&intrin->instr);
nir_def *pos_counter = nir_load_var(b, state->pos_counter);
nir_push_loop(b);
{
nir_def *out_pos_counter = nir_load_var(b, state->out_pos_counter);
nir_push_if(b, nir_ilt(b, nir_isub(b, pos_counter, out_pos_counter),
nir_imm_int(b, state->primitive_vert_count)));
nir_jump(b, nir_jump_break);
nir_pop_if(b, NULL);
lower_pv_mode_emit_rotated_prim(b, state, out_pos_counter);
nir_end_primitive(b);
nir_store_var(b, state->out_pos_counter, nir_iadd_imm(b, out_pos_counter, 1), 1);
}
nir_pop_loop(b, NULL);
/* Set the ring offset such that when position 0 is
* read we get the last value written
*/
nir_store_var(b, state->ring_offset, pos_counter, 1);
nir_store_var(b, state->pos_counter, nir_imm_int(b, 0), 1);
nir_store_var(b, state->out_pos_counter, nir_imm_int(b, 0), 1);
nir_instr_remove(&intrin->instr);
return true;
}
static bool
lower_pv_mode_gs_instr(nir_builder *b, nir_instr *instr, void *data)
{
if (instr->type != nir_instr_type_intrinsic)
return false;
struct lower_pv_mode_state *state = data;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
switch (intrin->intrinsic) {
case nir_intrinsic_store_deref:
return lower_pv_mode_gs_store(b, intrin, state);
case nir_intrinsic_copy_deref:
unreachable("should be lowered");
case nir_intrinsic_emit_vertex_with_counter:
case nir_intrinsic_emit_vertex:
return lower_pv_mode_gs_emit_vertex(b, intrin, state);
case nir_intrinsic_end_primitive:
case nir_intrinsic_end_primitive_with_counter:
return lower_pv_mode_gs_end_primitive(b, intrin, state);
default:
return false;
}
}
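/* Note on vertices_out below: with the ring buffering above, a user GS that
* emits V vertices of N-vertex strip primitives produces at most V - (N - 1)
* complete primitives, and each one is replayed as N independent vertices.
*/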
static bool
lower_pv_mode_gs(nir_shader *shader, unsigned prim)
{
nir_builder b;
struct lower_pv_mode_state state;
memset(state.varyings, 0, sizeof(state.varyings));
nir_function_impl *entry = nir_shader_get_entrypoint(shader);
b = nir_builder_at(nir_before_impl(entry));
state.primitive_vert_count =
mesa_vertices_per_prim(shader->info.gs.output_primitive);
state.ring_size = shader->info.gs.vertices_out;
nir_foreach_variable_with_modes(var, shader, nir_var_shader_out) {
gl_varying_slot location = var->data.location;
unsigned location_frac = var->data.location_frac;
char name[100];
snprintf(name, sizeof(name), "__tmp_primverts_%d_%d", location, location_frac);
state.varyings[location][location_frac] =
nir_local_variable_create(entry,
glsl_array_type(var->type,
state.ring_size,
false),
name);
}
state.pos_counter = nir_local_variable_create(entry,
glsl_uint_type(),
"__pos_counter");
state.out_pos_counter = nir_local_variable_create(entry,
glsl_uint_type(),
"__out_pos_counter");
state.ring_offset = nir_local_variable_create(entry,
glsl_uint_type(),
"__ring_offset");
state.prim = prim;
// initialize pos_counter and out_pos_counter
nir_store_var(&b, state.pos_counter, nir_imm_int(&b, 0), 1);
nir_store_var(&b, state.out_pos_counter, nir_imm_int(&b, 0), 1);
nir_store_var(&b, state.ring_offset, nir_imm_int(&b, 0), 1);
shader->info.gs.vertices_out = (shader->info.gs.vertices_out -
(state.primitive_vert_count - 1)) *
state.primitive_vert_count;
return nir_shader_instructions_pass(shader, lower_pv_mode_gs_instr,
nir_metadata_dominance, &state);
}
struct lower_line_stipple_state {
nir_variable *pos_out;
nir_variable *stipple_out;
nir_variable *prev_pos;
nir_variable *pos_counter;
nir_variable *stipple_counter;
bool line_rectangular;
};
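/* Map a clip-space position into viewport-scaled 2D coordinates:
*    result = vert.xy / vert.w * scale
* where scale is the viewport scale push constant. The line lowerings below
* use this to measure screen-space distances.
*/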
static nir_def *
viewport_map(nir_builder *b, nir_def *vert,
nir_def *scale)
{
nir_def *w_recip = nir_frcp(b, nir_channel(b, vert, 3));
nir_def *ndc_point = nir_fmul(b, nir_trim_vector(b, vert, 2),
w_recip);
return nir_fmul(b, ndc_point, scale);
}
static bool
lower_line_stipple_gs_instr(nir_builder *b, nir_instr *instr, void *data)
{
struct lower_line_stipple_state *state = data;
if (instr->type != nir_instr_type_intrinsic)
return false;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
if (intrin->intrinsic != nir_intrinsic_emit_vertex_with_counter &&
intrin->intrinsic != nir_intrinsic_emit_vertex)
return false;
b->cursor = nir_before_instr(instr);
nir_push_if(b, nir_ine_imm(b, nir_load_var(b, state->pos_counter), 0));
// viewport-map endpoints
nir_def *vp_scale = nir_load_push_constant_zink(b, 2, 32,
nir_imm_int(b, ZINK_GFX_PUSHCONST_VIEWPORT_SCALE));
nir_def *prev = nir_load_var(b, state->prev_pos);
nir_def *curr = nir_load_var(b, state->pos_out);
prev = viewport_map(b, prev, vp_scale);
curr = viewport_map(b, curr, vp_scale);
// calculate length of line
nir_def *len;
if (state->line_rectangular)
len = nir_fast_distance(b, prev, curr);
else {
nir_def *diff = nir_fabs(b, nir_fsub(b, prev, curr));
len = nir_fmax(b, nir_channel(b, diff, 0), nir_channel(b, diff, 1));
}
// update stipple_counter
nir_store_var(b, state->stipple_counter,
nir_fadd(b, nir_load_var(b, state->stipple_counter),
len), 1);
nir_pop_if(b, NULL);
// emit stipple out
nir_copy_var(b, state->stipple_out, state->stipple_counter);
nir_copy_var(b, state->prev_pos, state->pos_out);
// update prev_pos and pos_counter for next vertex
b->cursor = nir_after_instr(instr);
nir_store_var(b, state->pos_counter,
nir_iadd_imm(b, nir_load_var(b, state->pos_counter),
1), 1);
return true;
}
static bool
lower_line_stipple_gs(nir_shader *shader, bool line_rectangular)
{
nir_builder b;
struct lower_line_stipple_state state;
state.pos_out =
nir_find_variable_with_location(shader, nir_var_shader_out,
VARYING_SLOT_POS);
// if position isn't written, we have nothing to do
if (!state.pos_out)
return false;
state.stipple_out = nir_variable_create(shader, nir_var_shader_out,
glsl_float_type(),
"__stipple");
state.stipple_out->data.interpolation = INTERP_MODE_NOPERSPECTIVE;
state.stipple_out->data.driver_location = shader->num_outputs++;
state.stipple_out->data.location = MAX2(util_last_bit64(shader->info.outputs_written), VARYING_SLOT_VAR0);
shader->info.outputs_written |= BITFIELD64_BIT(state.stipple_out->data.location);
// create temp variables
state.prev_pos = nir_variable_create(shader, nir_var_shader_temp,
glsl_vec4_type(),
"__prev_pos");
state.pos_counter = nir_variable_create(shader, nir_var_shader_temp,
glsl_uint_type(),
"__pos_counter");
state.stipple_counter = nir_variable_create(shader, nir_var_shader_temp,
glsl_float_type(),
"__stipple_counter");
state.line_rectangular = line_rectangular;
// initialize pos_counter and stipple_counter
nir_function_impl *entry = nir_shader_get_entrypoint(shader);
b = nir_builder_at(nir_before_impl(entry));
nir_store_var(&b, state.pos_counter, nir_imm_int(&b, 0), 1);
nir_store_var(&b, state.stipple_counter, nir_imm_float(&b, 0), 1);
return nir_shader_instructions_pass(shader, lower_line_stipple_gs_instr,
nir_metadata_dominance, &state);
}
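/* FS half of stippled-line emulation. The line-stipple push constant packs
* the repeat factor in its upper 16 bits and the 16-bit pattern in the low
* bits. For every sample in the input coverage, the interpolated stipple
* coordinate is evaluated at that sample and tested roughly as:
*    bit = (pattern >> (int)fmod(stipple_pos / factor, 16)) & 1
* Samples whose bit is 0 are cleared from the output sample mask.
*/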
static bool
lower_line_stipple_fs(nir_shader *shader)
{
nir_builder b;
nir_function_impl *entry = nir_shader_get_entrypoint(shader);
b = nir_builder_at(nir_after_impl(entry));
// create stipple counter
nir_variable *stipple = nir_variable_create(shader, nir_var_shader_in,
glsl_float_type(),
"__stipple");
stipple->data.interpolation = INTERP_MODE_NOPERSPECTIVE;
stipple->data.driver_location = shader->num_inputs++;
stipple->data.location = MAX2(util_last_bit64(shader->info.inputs_read), VARYING_SLOT_VAR0);
shader->info.inputs_read |= BITFIELD64_BIT(stipple->data.location);
nir_variable *sample_mask_out =
nir_find_variable_with_location(shader, nir_var_shader_out,
FRAG_RESULT_SAMPLE_MASK);
if (!sample_mask_out) {
sample_mask_out = nir_variable_create(shader, nir_var_shader_out,
glsl_uint_type(), "sample_mask");
sample_mask_out->data.driver_location = shader->num_outputs++;
sample_mask_out->data.location = FRAG_RESULT_SAMPLE_MASK;
}
nir_def *pattern = nir_load_push_constant_zink(&b, 1, 32,
nir_imm_int(&b, ZINK_GFX_PUSHCONST_LINE_STIPPLE_PATTERN));
nir_def *factor = nir_i2f32(&b, nir_ishr_imm(&b, pattern, 16));
pattern = nir_iand_imm(&b, pattern, 0xffff);
nir_def *sample_mask_in = nir_load_sample_mask_in(&b);
nir_variable *v = nir_local_variable_create(entry, glsl_uint_type(), NULL);
nir_variable *sample_mask = nir_local_variable_create(entry, glsl_uint_type(), NULL);
nir_store_var(&b, v, sample_mask_in, 1);
nir_store_var(&b, sample_mask, sample_mask_in, 1);
nir_push_loop(&b);
{
nir_def *value = nir_load_var(&b, v);
nir_def *index = nir_ufind_msb(&b, value);
nir_def *index_mask = nir_ishl(&b, nir_imm_int(&b, 1), index);
nir_def *new_value = nir_ixor(&b, value, index_mask);
nir_store_var(&b, v, new_value, 1);
nir_push_if(&b, nir_ieq_imm(&b, value, 0));
nir_jump(&b, nir_jump_break);
nir_pop_if(&b, NULL);
nir_def *stipple_pos =
nir_interp_deref_at_sample(&b, 1, 32,
&nir_build_deref_var(&b, stipple)->def, index);
stipple_pos = nir_fmod(&b, nir_fdiv(&b, stipple_pos, factor),
nir_imm_float(&b, 16.0));
stipple_pos = nir_f2i32(&b, stipple_pos);
nir_def *bit =
nir_iand_imm(&b, nir_ishr(&b, pattern, stipple_pos), 1);
nir_push_if(&b, nir_ieq_imm(&b, bit, 0));
{
nir_def *sample_mask_value = nir_load_var(&b, sample_mask);
sample_mask_value = nir_ixor(&b, sample_mask_value, index_mask);
nir_store_var(&b, sample_mask, sample_mask_value, 1);
}
nir_pop_if(&b, NULL);
}
nir_pop_loop(&b, NULL);
nir_store_var(&b, sample_mask_out, nir_load_var(&b, sample_mask), 1);
return true;
}
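/* GS lowering for smooth (antialiased) lines. Each input line segment is
* expanded into an 8-vertex triangle strip: four vertices form the entry
* end-cap and line start, four form the line end and exit end-cap. Offsets
* are computed from the line width and direction in viewport space, and a
* noperspective "__line_coord" output carries coordinates scaled by the half
* width/length for use by the fragment-side lowering.
*/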
struct lower_line_smooth_state {
nir_variable *pos_out;
nir_variable *line_coord_out;
nir_variable *prev_pos;
nir_variable *pos_counter;
nir_variable *prev_varyings[VARYING_SLOT_MAX][4],
*varyings[VARYING_SLOT_MAX][4]; // location_frac
};
static bool
lower_line_smooth_gs_store(nir_builder *b,
nir_intrinsic_instr *intrin,
struct lower_line_smooth_state *state)
{
b->cursor = nir_before_instr(&intrin->instr);
nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
if (nir_deref_mode_is(deref, nir_var_shader_out)) {
nir_variable *var = nir_deref_instr_get_variable(deref);
// we take care of position elsewhere
gl_varying_slot location = var->data.location;
unsigned location_frac = var->data.location_frac;
if (location != VARYING_SLOT_POS) {
assert(state->varyings[location]);
nir_store_var(b, state->varyings[location][location_frac],
intrin->src[1].ssa,
nir_intrinsic_write_mask(intrin));
nir_instr_remove(&intrin->instr);
return true;
}
}
return false;
}
static bool
lower_line_smooth_gs_emit_vertex(nir_builder *b,
nir_intrinsic_instr *intrin,
struct lower_line_smooth_state *state)
{
b->cursor = nir_before_instr(&intrin->instr);
nir_push_if(b, nir_ine_imm(b, nir_load_var(b, state->pos_counter), 0));
nir_def *vp_scale = nir_load_push_constant_zink(b, 2, 32,
nir_imm_int(b, ZINK_GFX_PUSHCONST_VIEWPORT_SCALE));
nir_def *prev = nir_load_var(b, state->prev_pos);
nir_def *curr = nir_load_var(b, state->pos_out);
nir_def *prev_vp = viewport_map(b, prev, vp_scale);
nir_def *curr_vp = viewport_map(b, curr, vp_scale);
nir_def *width = nir_load_push_constant_zink(b, 1, 32,
nir_imm_int(b, ZINK_GFX_PUSHCONST_LINE_WIDTH));
nir_def *half_width = nir_fadd_imm(b, nir_fmul_imm(b, width, 0.5), 0.5);
const unsigned yx[2] = { 1, 0 };
nir_def *vec = nir_fsub(b, curr_vp, prev_vp);
nir_def *len = nir_fast_length(b, vec);
nir_def *dir = nir_normalize(b, vec);
nir_def *half_length = nir_fmul_imm(b, len, 0.5);
half_length = nir_fadd_imm(b, half_length, 0.5);
nir_def *vp_scale_rcp = nir_frcp(b, vp_scale);
nir_def *tangent =
nir_fmul(b,
nir_fmul(b,
nir_swizzle(b, dir, yx, 2),
nir_imm_vec2(b, 1.0, -1.0)),
vp_scale_rcp);
tangent = nir_fmul(b, tangent, half_width);
tangent = nir_pad_vector_imm_int(b, tangent, 0, 4);
dir = nir_fmul_imm(b, nir_fmul(b, dir, vp_scale_rcp), 0.5);
nir_def *line_offets[8] = {
nir_fadd(b, tangent, nir_fneg(b, dir)),
nir_fadd(b, nir_fneg(b, tangent), nir_fneg(b, dir)),
tangent,
nir_fneg(b, tangent),
tangent,
nir_fneg(b, tangent),
nir_fadd(b, tangent, dir),
nir_fadd(b, nir_fneg(b, tangent), dir),
};
nir_def *line_coord =
nir_vec4(b, half_width, half_width, half_length, half_length);
nir_def *line_coords[8] = {
nir_fmul(b, line_coord, nir_imm_vec4(b, -1, 1, -1, 1)),
nir_fmul(b, line_coord, nir_imm_vec4(b, 1, 1, -1, 1)),
nir_fmul(b, line_coord, nir_imm_vec4(b, -1, 1, 0, 1)),
nir_fmul(b, line_coord, nir_imm_vec4(b, 1, 1, 0, 1)),
nir_fmul(b, line_coord, nir_imm_vec4(b, -1, 1, 0, 1)),
nir_fmul(b, line_coord, nir_imm_vec4(b, 1, 1, 0, 1)),
nir_fmul(b, line_coord, nir_imm_vec4(b, -1, 1, 1, 1)),
nir_fmul(b, line_coord, nir_imm_vec4(b, 1, 1, 1, 1)),
};
/* emit first end-cap, and start line */
for (int i = 0; i < 4; ++i) {
nir_foreach_variable_with_modes(var, b->shader, nir_var_shader_out) {
gl_varying_slot location = var->data.location;
unsigned location_frac = var->data.location_frac;
if (state->prev_varyings[location][location_frac])
nir_copy_var(b, var, state->prev_varyings[location][location_frac]);
}
nir_store_var(b, state->pos_out,
nir_fadd(b, prev, nir_fmul(b, line_offets[i],
nir_channel(b, prev, 3))), 0xf);
nir_store_var(b, state->line_coord_out, line_coords[i], 0xf);
nir_emit_vertex(b);
}
/* finish line and emit last end-cap */
for (int i = 4; i < 8; ++i) {
nir_foreach_variable_with_modes(var, b->shader, nir_var_shader_out) {
gl_varying_slot location = var->data.location;
unsigned location_frac = var->data.location_frac;
if (state->varyings[location][location_frac])
nir_copy_var(b, var, state->varyings[location][location_frac]);
}
nir_store_var(b, state->pos_out,
nir_fadd(b, curr, nir_fmul(b, line_offets[i],
nir_channel(b, curr, 3))), 0xf);
nir_store_var(b, state->line_coord_out, line_coords[i], 0xf);
nir_emit_vertex(b);
}
nir_end_primitive(b);
nir_pop_if(b, NULL);
nir_copy_var(b, state->prev_pos, state->pos_out);
nir_foreach_variable_with_modes(var, b->shader, nir_var_shader_out) {
gl_varying_slot location = var->data.location;
unsigned location_frac = var->data.location_frac;
if (state->varyings[location][location_frac])
nir_copy_var(b, state->prev_varyings[location][location_frac], state->varyings[location][location_frac]);
}
// update prev_pos and pos_counter for next vertex
b->cursor = nir_after_instr(&intrin->instr);
nir_store_var(b, state->pos_counter,
nir_iadd_imm(b, nir_load_var(b, state->pos_counter),
1), 1);
nir_instr_remove(&intrin->instr);
return true;
}
static bool
lower_line_smooth_gs_end_primitive(nir_builder *b,
nir_intrinsic_instr *intrin,
struct lower_line_smooth_state *state)
{
b->cursor = nir_before_instr(&intrin->instr);
// reset line counter
nir_store_var(b, state->pos_counter, nir_imm_int(b, 0), 1);
nir_instr_remove(&intrin->instr);
return true;
}
static bool
lower_line_smooth_gs_instr(nir_builder *b, nir_instr *instr, void *data)
{
if (instr->type != nir_instr_type_intrinsic)
return false;
struct lower_line_smooth_state *state = data;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
switch (intrin->intrinsic) {
case nir_intrinsic_store_deref:
return lower_line_smooth_gs_store(b, intrin, state);
case nir_intrinsic_copy_deref:
unreachable("should be lowered");
case nir_intrinsic_emit_vertex_with_counter:
case nir_intrinsic_emit_vertex:
return lower_line_smooth_gs_emit_vertex(b, intrin, state);
case nir_intrinsic_end_primitive:
case nir_intrinsic_end_primitive_with_counter:
return lower_line_smooth_gs_end_primitive(b, intrin, state);
default:
return false;
}
}
static bool
lower_line_smooth_gs(nir_shader *shader)
{
nir_builder b;
struct lower_line_smooth_state state;
memset(state.varyings, 0, sizeof(state.varyings));
memset(state.prev_varyings, 0, sizeof(state.prev_varyings));
nir_foreach_variable_with_modes(var, shader, nir_var_shader_out) {
gl_varying_slot location = var->data.location;
unsigned location_frac = var->data.location_frac;
if (location == VARYING_SLOT_POS)
continue;
char name[100];
snprintf(name, sizeof(name), "__tmp_%d_%d", location, location_frac);
state.varyings[location][location_frac] =
nir_variable_create(shader, nir_var_shader_temp,
var->type, name);
snprintf(name, sizeof(name), "__tmp_prev_%d_%d", location, location_frac);
state.prev_varyings[location][location_frac] =
nir_variable_create(shader, nir_var_shader_temp,
var->type, name);
}
state.pos_out =
nir_find_variable_with_location(shader, nir_var_shader_out,
VARYING_SLOT_POS);
// if position isn't written, we have nothing to do
if (!state.pos_out)
return false;
unsigned location = 0;
nir_foreach_shader_out_variable(var, shader) {
if (var->data.driver_location >= location)
location = var->data.driver_location + 1;
}
state.line_coord_out =
nir_variable_create(shader, nir_var_shader_out, glsl_vec4_type(),
"__line_coord");
state.line_coord_out->data.interpolation = INTERP_MODE_NOPERSPECTIVE;
state.line_coord_out->data.driver_location = location;
state.line_coord_out->data.location = MAX2(util_last_bit64(shader->info.outputs_written), VARYING_SLOT_VAR0);
shader->info.outputs_written |= BITFIELD64_BIT(state.line_coord_out->data.location);
shader->num_outputs++;
// create temp variables
state.prev_pos = nir_variable_create(shader, nir_var_shader_temp,
glsl_vec4_type(),
"__prev_pos");
state.pos_counter = nir_variable_create(shader, nir_var_shader_temp,
glsl_uint_type(),
"__pos_counter");
// initialize pos_counter
nir_function_impl *entry = nir_shader_get_entrypoint(shader);
b = nir_builder_at(nir_before_impl(entry));
nir_store_var(&b, state.pos_counter, nir_imm_int(&b, 0), 1);
shader->info.gs.vertices_out = 8 * shader->info.gs.vertices_out;
shader->info.gs.output_primitive = MESA_PRIM_TRIANGLE_STRIP;
return nir_shader_instructions_pass(shader, lower_line_smooth_gs_instr,
nir_metadata_dominance, &state);
}
static bool
lower_line_smooth_fs(nir_shader *shader, bool lower_stipple)
{
int dummy;
nir_builder b;
nir_variable *stipple_counter = NULL, *stipple_pattern = NULL;
if (lower_stipple) {
stipple_counter = nir_variable_create(shader, nir_var_shader_in,
glsl_float_type(),
"__stipple");
stipple_counter->data.interpolation = INTERP_MODE_NOPERSPECTIVE;
stipple_counter->data.driver_location = shader->num_inputs++;
stipple_counter->data.location =
MAX2(util_last_bit64(shader->info.inputs_read), VARYING_SLOT_VAR0);
shader->info.inputs_read |= BITFIELD64_BIT(stipple_counter->data.location);
stipple_pattern = nir_variable_create(shader, nir_var_shader_temp,
glsl_uint_type(),
"stipple_pattern");
// initialize stipple_pattern
nir_function_impl *entry = nir_shader_get_entrypoint(shader);
b = nir_builder_at(nir_before_impl(entry));
nir_def *pattern = nir_load_push_constant_zink(&b, 1, 32,
nir_imm_int(&b, ZINK_GFX_PUSHCONST_LINE_STIPPLE_PATTERN));
nir_store_var(&b, stipple_pattern, pattern, 1);
}
nir_lower_aaline_fs(shader, &dummy, stipple_counter, stipple_pattern);
return true;
}
static bool
lower_dual_blend(nir_shader *shader)
{
bool progress = false;
nir_variable *var = nir_find_variable_with_location(shader, nir_var_shader_out, FRAG_RESULT_DATA1);
if (var) {
var->data.location = FRAG_RESULT_DATA0;
var->data.index = 1;
progress = true;
}
nir_shader_preserve_all_metadata(shader);
return progress;
}
static bool
lower_64bit_pack_instr(nir_builder *b, nir_instr *instr, void *data)
{
if (instr->type != nir_instr_type_alu)
return false;
nir_alu_instr *alu_instr = (nir_alu_instr *) instr;
if (alu_instr->op != nir_op_pack_64_2x32 &&
alu_instr->op != nir_op_unpack_64_2x32)
return false;
b->cursor = nir_before_instr(&alu_instr->instr);
nir_def *src = nir_ssa_for_alu_src(b, alu_instr, 0);
nir_def *dest;
switch (alu_instr->op) {
case nir_op_pack_64_2x32:
dest = nir_pack_64_2x32_split(b, nir_channel(b, src, 0), nir_channel(b, src, 1));
break;
case nir_op_unpack_64_2x32:
dest = nir_vec2(b, nir_unpack_64_2x32_split_x(b, src), nir_unpack_64_2x32_split_y(b, src));
break;
default:
unreachable("Impossible opcode");
}
nir_def_rewrite_uses(&alu_instr->def, dest);
nir_instr_remove(&alu_instr->instr);
return true;
}
static bool
lower_64bit_pack(nir_shader *shader)
{
return nir_shader_instructions_pass(shader, lower_64bit_pack_instr,
nir_metadata_block_index | nir_metadata_dominance, NULL);
}
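/* Build a passthrough GS that emulates quads: it consumes 4-vertex
* LINES_ADJACENCY primitives and emits two triangles (6 vertices), copying
* every varying through. The vertex index mapping is selected with
* load_provoking_last so the original quad's provoking vertex is preserved
* for both the "first" and "last" conventions.
*/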
nir_shader *
zink_create_quads_emulation_gs(const nir_shader_compiler_options *options,
const nir_shader *prev_stage)
{
nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_GEOMETRY,
options,
"filled quad gs");
nir_shader *nir = b.shader;
nir->info.gs.input_primitive = MESA_PRIM_LINES_ADJACENCY;
nir->info.gs.output_primitive = MESA_PRIM_TRIANGLE_STRIP;
nir->info.gs.vertices_in = 4;
nir->info.gs.vertices_out = 6;
nir->info.gs.invocations = 1;
nir->info.gs.active_stream_mask = 1;
nir->info.has_transform_feedback_varyings = prev_stage->info.has_transform_feedback_varyings;
memcpy(nir->info.xfb_stride, prev_stage->info.xfb_stride, sizeof(prev_stage->info.xfb_stride));
if (prev_stage->xfb_info) {
size_t size = nir_xfb_info_size(prev_stage->xfb_info->output_count);
nir->xfb_info = ralloc_memdup(nir, prev_stage->xfb_info, size);
}
nir_variable *in_vars[VARYING_SLOT_MAX];
nir_variable *out_vars[VARYING_SLOT_MAX];
unsigned num_vars = 0;
/* Create input/output variables. */
nir_foreach_shader_out_variable(var, prev_stage) {
assert(!var->data.patch);
/* input vars can't be created for those */
if (var->data.location == VARYING_SLOT_LAYER ||
var->data.location == VARYING_SLOT_VIEW_INDEX ||
/* psiz not needed for quads */
var->data.location == VARYING_SLOT_PSIZ)
continue;
char name[100];
if (var->name)
snprintf(name, sizeof(name), "in_%s", var->name);
else
snprintf(name, sizeof(name), "in_%d", var->data.driver_location);
nir_variable *in = nir_variable_clone(var, nir);
ralloc_free(in->name);
in->name = ralloc_strdup(in, name);
in->type = glsl_array_type(var->type, 4, false);
in->data.mode = nir_var_shader_in;
nir_shader_add_variable(nir, in);
if (var->name)
snprintf(name, sizeof(name), "out_%s", var->name);
else
snprintf(name, sizeof(name), "out_%d", var->data.driver_location);
nir_variable *out = nir_variable_clone(var, nir);
ralloc_free(out->name);
out->name = ralloc_strdup(out, name);
out->data.mode = nir_var_shader_out;
nir_shader_add_variable(nir, out);
in_vars[num_vars] = in;
out_vars[num_vars++] = out;
}
int mapping_first[] = {0, 1, 2, 0, 2, 3};
int mapping_last[] = {0, 1, 3, 1, 2, 3};
nir_def *last_pv_vert_def = nir_load_provoking_last(&b);
last_pv_vert_def = nir_ine_imm(&b, last_pv_vert_def, 0);
for (unsigned i = 0; i < 6; ++i) {
/* swap indices 2 and 3 */
nir_def *idx = nir_bcsel(&b, last_pv_vert_def,
nir_imm_int(&b, mapping_last[i]),
nir_imm_int(&b, mapping_first[i]));
/* Copy inputs to outputs. */
for (unsigned j = 0; j < num_vars; ++j) {
if (in_vars[j]->data.location == VARYING_SLOT_EDGE) {
continue;
}
nir_deref_instr *in_value = nir_build_deref_array(&b, nir_build_deref_var(&b, in_vars[j]), idx);
copy_vars(&b, nir_build_deref_var(&b, out_vars[j]), in_value);
}
nir_emit_vertex(&b, 0);
if (i == 2)
nir_end_primitive(&b, 0);
}
nir_end_primitive(&b, 0);
nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
nir_validate_shader(nir, "in zink_create_quads_emulation_gs");
return nir;
}
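/* Replace load_flat_mask / load_provoking_last with UBO loads from binding 0
* at fixed dword offsets so nir_inline_uniforms can fold them to constants.
* 64-bit values are loaded as two 32-bit dwords and repacked, since
* nir_inline_uniforms only handles 32-bit loads.
*/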
static bool
lower_system_values_to_inlined_uniforms_instr(nir_builder *b,
nir_intrinsic_instr *intrin,
void *data)
{
int inlined_uniform_offset;
switch (intrin->intrinsic) {
case nir_intrinsic_load_flat_mask:
inlined_uniform_offset = ZINK_INLINE_VAL_FLAT_MASK * sizeof(uint32_t);
break;
case nir_intrinsic_load_provoking_last:
inlined_uniform_offset = ZINK_INLINE_VAL_PV_LAST_VERT * sizeof(uint32_t);
break;
default:
return false;
}
b->cursor = nir_before_instr(&intrin->instr);
assert(intrin->def.bit_size == 32 || intrin->def.bit_size == 64);
/* nir_inline_uniforms can't handle bit_size != 32 (it will silently ignore
* anything with a different bit_size) so we need to split the load. */
int num_dwords = intrin->def.bit_size / 32;
nir_def *dwords[2] = {NULL};
for (unsigned i = 0; i < num_dwords; i++)
dwords[i] = nir_load_ubo(b, 1, 32, nir_imm_int(b, 0),
nir_imm_int(b, inlined_uniform_offset + i),
.align_mul = intrin->def.bit_size / 8,
.align_offset = 0,
.range_base = 0, .range = ~0);
nir_def *new_dest_def;
if (intrin->def.bit_size == 32)
new_dest_def = dwords[0];
else
new_dest_def = nir_pack_64_2x32_split(b, dwords[0], dwords[1]);
nir_def_rewrite_uses(&intrin->def, new_dest_def);
nir_instr_remove(&intrin->instr);
return true;
}
bool
zink_lower_system_values_to_inlined_uniforms(nir_shader *nir)
{
return nir_shader_intrinsics_pass(nir,
lower_system_values_to_inlined_uniforms_instr,
nir_metadata_dominance, NULL);
}
/* from radeonsi */
static unsigned
amd_varying_expression_max_cost(nir_shader *producer, nir_shader *consumer)
{
/* TODO: maybe implement shader profiles to disable, cf. 39804ebf1766d38004259085e1fec4ed8db86f1c */
switch (consumer->info.stage) {
case MESA_SHADER_TESS_CTRL: /* VS->TCS */
/* Non-amplifying shaders can always have their varying expressions
* moved into later shaders.
*/
return UINT_MAX;
case MESA_SHADER_GEOMETRY: /* VS->GS, TES->GS */
return consumer->info.gs.vertices_in == 1 ? UINT_MAX :
consumer->info.gs.vertices_in == 2 ? 20 : 14;
case MESA_SHADER_TESS_EVAL: /* VS->TES, TCS->TES */
case MESA_SHADER_FRAGMENT:
/* Up to 3 uniforms and 5 ALUs. */
return 14;
default:
unreachable("unexpected shader stage");
}
}
/* from radeonsi */
static unsigned
amd_varying_estimate_instr_cost(nir_instr *instr)
{
unsigned dst_bit_size, src_bit_size, num_dst_dwords;
nir_op alu_op;
/* This is a very loose approximation based on gfx10. */
switch (instr->type) {
case nir_instr_type_alu:
dst_bit_size = nir_instr_as_alu(instr)->def.bit_size;
src_bit_size = nir_instr_as_alu(instr)->src[0].src.ssa->bit_size;
alu_op = nir_instr_as_alu(instr)->op;
num_dst_dwords = DIV_ROUND_UP(dst_bit_size, 32);
switch (alu_op) {
case nir_op_mov:
case nir_op_vec2:
case nir_op_vec3:
case nir_op_vec4:
case nir_op_vec5:
case nir_op_vec8:
case nir_op_vec16:
case nir_op_fabs:
case nir_op_fneg:
case nir_op_fsat:
return 0;
case nir_op_imul:
case nir_op_umul_low:
return dst_bit_size <= 16 ? 1 : 4 * num_dst_dwords;
case nir_op_imul_high:
case nir_op_umul_high:
case nir_op_imul_2x32_64:
case nir_op_umul_2x32_64:
return 4;
case nir_op_fexp2:
case nir_op_flog2:
case nir_op_frcp:
case nir_op_frsq:
case nir_op_fsqrt:
case nir_op_fsin:
case nir_op_fcos:
case nir_op_fsin_amd:
case nir_op_fcos_amd:
return 4; /* FP16 & FP32. */
case nir_op_fpow:
return 4 + 1 + 4; /* log2 + mul + exp2 */
case nir_op_fsign:
return dst_bit_size == 64 ? 4 : 3; /* See ac_build_fsign. */
case nir_op_idiv:
case nir_op_udiv:
case nir_op_imod:
case nir_op_umod:
case nir_op_irem:
return dst_bit_size == 64 ? 80 : 40;
case nir_op_fdiv:
return dst_bit_size == 64 ? 80 : 5; /* FP16 & FP32: rcp + mul */
case nir_op_fmod:
case nir_op_frem:
return dst_bit_size == 64 ? 80 : 8;
default:
/* Double opcodes. Comparisons always have full performance. */
if ((dst_bit_size == 64 &&
nir_op_infos[alu_op].output_type & nir_type_float) ||
(dst_bit_size >= 8 && src_bit_size == 64 &&
nir_op_infos[alu_op].input_types[0] & nir_type_float))
return 16;
return DIV_ROUND_UP(MAX2(dst_bit_size, src_bit_size), 32);
}
case nir_instr_type_intrinsic:
dst_bit_size = nir_instr_as_intrinsic(instr)->def.bit_size;
num_dst_dwords = DIV_ROUND_UP(dst_bit_size, 32);
switch (nir_instr_as_intrinsic(instr)->intrinsic) {
case nir_intrinsic_load_deref:
/* Uniform or UBO load.
* Set a low cost to balance the number of scalar loads and ALUs.
*/
return 3 * num_dst_dwords;
default:
unreachable("unexpected intrinsic");
}
default:
unreachable("unexpected instr type");
}
}
void
zink_screen_init_compiler(struct zink_screen *screen)
{
static const struct nir_shader_compiler_options
default_options = {
.io_options = nir_io_glsl_lower_derefs,
.lower_ffma16 = true,
.lower_ffma32 = true,
.lower_ffma64 = true,
.lower_scmp = true,
.lower_fdph = true,
.lower_flrp32 = true,
.lower_fpow = true,
.lower_fsat = true,
.lower_hadd = true,
.lower_iadd_sat = true,
.lower_fisnormal = true,
.lower_extract_byte = true,
.lower_extract_word = true,
.lower_insert_byte = true,
.lower_insert_word = true,
/* We can only support 32-bit ldexp, but NIR doesn't have a flag
* distinguishing 64-bit ldexp support (radeonsi *does* support 64-bit
* ldexp, so we don't just always lower it in NIR). Given that ldexp is
* effectively unused (no instances in shader-db), it's not worth the
* effort to do so.
* */
.lower_ldexp = true,
.lower_mul_high = true,
.lower_to_scalar = true,
.lower_uadd_carry = true,
.compact_arrays = true,
.lower_usub_borrow = true,
.lower_uadd_sat = true,
.lower_usub_sat = true,
.lower_vector_cmp = true,
.lower_int64_options = 0,
.lower_doubles_options = nir_lower_dround_even,
.lower_uniforms_to_ubo = true,
.has_fsub = true,
.has_isub = true,
.lower_mul_2x32_64 = true,
.support_16bit_alu = true, /* not quite what it sounds like */
.support_indirect_inputs = BITFIELD_MASK(MESA_SHADER_COMPUTE),
.support_indirect_outputs = BITFIELD_MASK(MESA_SHADER_COMPUTE),
.max_unroll_iterations = 0,
.use_interpolated_input_intrinsics = true,
};
screen->nir_options = default_options;
if (!screen->info.feats.features.shaderInt64)
screen->nir_options.lower_int64_options = ~0;
if (!screen->info.feats.features.shaderFloat64) {
screen->nir_options.lower_doubles_options = ~0;
screen->nir_options.lower_flrp64 = true;
screen->nir_options.lower_ffma64 = true;
/* soft fp64 function inlining will blow up loop bodies and effectively
* stop Vulkan drivers from unrolling the loops.
*/
screen->nir_options.max_unroll_iterations_fp64 = 32;
}
if (screen->driver_workarounds.io_opt) {
screen->nir_options.io_options |= nir_io_glsl_opt_varyings;
switch (screen->info.driver_props.driverID) {
case VK_DRIVER_ID_MESA_RADV:
case VK_DRIVER_ID_AMD_OPEN_SOURCE:
case VK_DRIVER_ID_AMD_PROPRIETARY:
screen->nir_options.varying_expression_max_cost = amd_varying_expression_max_cost;
screen->nir_options.varying_estimate_instr_cost = amd_varying_estimate_instr_cost;
break;
default:
mesa_logw("zink: instruction costs not implemented for this implementation!");
screen->nir_options.varying_expression_max_cost = amd_varying_expression_max_cost;
screen->nir_options.varying_estimate_instr_cost = amd_varying_estimate_instr_cost;
}
}
/*
The OpFRem and OpFMod instructions use cheap approximations of remainder,
and the error can be large due to the discontinuity in trunc() and floor().
This can produce mathematically unexpected results in some cases, such as
FMod(x,x) computing x rather than 0, and can also cause the result to have
a different sign than the infinitely precise result.
-Table 84. Precision of core SPIR-V Instructions
* for drivers that are known to have imprecise fmod for doubles, lower dmod
*/
if (screen->info.driver_props.driverID == VK_DRIVER_ID_MESA_RADV ||
screen->info.driver_props.driverID == VK_DRIVER_ID_AMD_OPEN_SOURCE ||
screen->info.driver_props.driverID == VK_DRIVER_ID_AMD_PROPRIETARY)
screen->nir_options.lower_doubles_options = nir_lower_dmod;
}
const void *
zink_get_compiler_options(struct pipe_screen *pscreen,
enum pipe_shader_ir ir,
gl_shader_stage shader)
{
assert(ir == PIPE_SHADER_IR_NIR);
return &zink_screen(pscreen)->nir_options;
}
struct nir_shader *
zink_tgsi_to_nir(struct pipe_screen *screen, const struct tgsi_token *tokens)
{
if (zink_debug & ZINK_DEBUG_TGSI) {
fprintf(stderr, "TGSI shader:\n---8<---\n");
tgsi_dump_to_file(tokens, 0, stderr);
fprintf(stderr, "---8<---\n\n");
}
return tgsi_to_nir(tokens, screen, false);
}
static bool
def_is_64bit(nir_def *def, void *state)
{
bool *lower = (bool *)state;
if (def && (def->bit_size == 64)) {
*lower = true;
return false;
}
return true;
}
static bool
src_is_64bit(nir_src *src, void *state)
{
bool *lower = (bool *)state;
if (src && (nir_src_bit_size(*src) == 64)) {
*lower = true;
return false;
}
return true;
}
static bool
filter_64_bit_instr(const nir_instr *const_instr, UNUSED const void *data)
{
bool lower = false;
/* lower_alu_to_scalar requires nir_instr to be const, but nir_foreach_*
* doesn't have const variants, so do the ugly const_cast here. */
nir_instr *instr = (nir_instr *)const_instr;
nir_foreach_def(instr, def_is_64bit, &lower);
if (lower)
return true;
nir_foreach_src(instr, src_is_64bit, &lower);
return lower;
}
static bool
filter_pack_instr(const nir_instr *const_instr, UNUSED const void *data)
{
nir_instr *instr = (nir_instr *)const_instr;
nir_alu_instr *alu = nir_instr_as_alu(instr);
switch (alu->op) {
case nir_op_pack_64_2x32_split:
case nir_op_pack_32_2x16_split:
case nir_op_unpack_32_2x16_split_x:
case nir_op_unpack_32_2x16_split_y:
case nir_op_unpack_64_2x32_split_x:
case nir_op_unpack_64_2x32_split_y:
return true;
default:
break;
}
return false;
}
struct bo_vars {
nir_variable *uniforms[5];
nir_variable *ubo[5];
nir_variable *ssbo[5];
uint32_t first_ubo;
uint32_t first_ssbo;
};
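/* The bo_vars arrays are indexed by access bit size divided by 16
* (16-bit -> 1, 32-bit -> 2, 64-bit -> 4). get_bo_vars recovers that index
* from the explicit stride of each variable's array member, which is
* bit_size / 8, hence the ">> 1" below.
*/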
static struct bo_vars
get_bo_vars(struct zink_shader *zs, nir_shader *shader)
{
struct bo_vars bo;
memset(&bo, 0, sizeof(bo));
if (zs->ubos_used)
bo.first_ubo = ffs(zs->ubos_used & ~BITFIELD_BIT(0)) - 2;
assert(bo.first_ubo < PIPE_MAX_CONSTANT_BUFFERS);
if (zs->ssbos_used)
bo.first_ssbo = ffs(zs->ssbos_used) - 1;
assert(bo.first_ssbo < PIPE_MAX_SHADER_BUFFERS);
nir_foreach_variable_with_modes(var, shader, nir_var_mem_ssbo | nir_var_mem_ubo) {
unsigned idx = glsl_get_explicit_stride(glsl_get_struct_field(glsl_without_array(var->type), 0)) >> 1;
if (var->data.mode == nir_var_mem_ssbo) {
assert(!bo.ssbo[idx]);
bo.ssbo[idx] = var;
} else {
if (var->data.driver_location) {
assert(!bo.ubo[idx]);
bo.ubo[idx] = var;
} else {
assert(!bo.uniforms[idx]);
bo.uniforms[idx] = var;
}
}
}
return bo;
}
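/* Rewrite constant-offset UBO/SSBO accesses that are statically past the end
* of the bound variable's sized array: out-of-bounds loads become zero and
* out-of-bounds stores are dropped. Variables whose last member is an unsized
* array are left alone since their bound size isn't known at compile time.
*/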
static bool
bound_bo_access_instr(nir_builder *b, nir_instr *instr, void *data)
{
struct bo_vars *bo = data;
if (instr->type != nir_instr_type_intrinsic)
return false;
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
nir_variable *var = NULL;
nir_def *offset = NULL;
bool is_load = true;
b->cursor = nir_before_instr(instr);
switch (intr->intrinsic) {
case nir_intrinsic_store_ssbo:
var = bo->ssbo[intr->src[0].ssa->bit_size >> 4];
offset = intr->src[2].ssa;
is_load = false;
break;
case nir_intrinsic_load_ssbo:
var = bo->ssbo[intr->def.bit_size >> 4];
offset = intr->src[1].ssa;
break;
case nir_intrinsic_load_ubo:
if (nir_src_is_const(intr->src[0]) && nir_src_as_const_value(intr->src[0])->u32 == 0)
var = bo->uniforms[intr->def.bit_size >> 4];
else
var = bo->ubo[intr->def.bit_size >> 4];
offset = intr->src[1].ssa;
break;
default:
return false;
}
nir_src offset_src = nir_src_for_ssa(offset);
if (!nir_src_is_const(offset_src))
return false;
unsigned offset_bytes = nir_src_as_const_value(offset_src)->u32;
const struct glsl_type *strct_type = glsl_get_array_element(var->type);
unsigned size = glsl_array_size(glsl_get_struct_field(strct_type, 0));
bool has_unsized = glsl_array_size(glsl_get_struct_field(strct_type, glsl_get_length(strct_type) - 1)) == 0;
if (has_unsized || offset_bytes + intr->num_components - 1 < size)
return false;
unsigned rewrites = 0;
nir_def *result[2];
for (unsigned i = 0; i < intr->num_components; i++) {
if (offset_bytes + i >= size) {
rewrites++;
if (is_load)
result[i] = nir_imm_zero(b, 1, intr->def.bit_size);
}
}
assert(rewrites == intr->num_components);
if (is_load) {
nir_def *load = nir_vec(b, result, intr->num_components);
nir_def_rewrite_uses(&intr->def, load);
}
nir_instr_remove(instr);
return true;
}
static bool
bound_bo_access(nir_shader *shader, struct zink_shader *zs)
{
struct bo_vars bo = get_bo_vars(zs, shader);
return nir_shader_instructions_pass(shader, bound_bo_access_instr, nir_metadata_dominance, &bo);
}
static void
optimize_nir(struct nir_shader *s, struct zink_shader *zs, bool can_shrink)
{
bool progress;
do {
progress = false;
if (s->options->lower_int64_options)
NIR_PASS_V(s, nir_lower_int64);
if (s->options->lower_doubles_options & nir_lower_fp64_full_software)
NIR_PASS_V(s, lower_64bit_pack);
NIR_PASS_V(s, nir_lower_vars_to_ssa);
NIR_PASS(progress, s, nir_lower_alu_to_scalar, filter_pack_instr, NULL);
NIR_PASS(progress, s, nir_opt_copy_prop_vars);
NIR_PASS(progress, s, nir_copy_prop);
NIR_PASS(progress, s, nir_opt_remove_phis);
if (s->options->lower_int64_options) {
NIR_PASS(progress, s, nir_lower_64bit_phis);
NIR_PASS(progress, s, nir_lower_alu_to_scalar, filter_64_bit_instr, NULL);
}
NIR_PASS(progress, s, nir_opt_dce);
NIR_PASS(progress, s, nir_opt_dead_cf);
NIR_PASS(progress, s, nir_lower_phis_to_scalar, false);
NIR_PASS(progress, s, nir_opt_cse);
NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true);
NIR_PASS(progress, s, nir_opt_algebraic);
NIR_PASS(progress, s, nir_opt_constant_folding);
NIR_PASS(progress, s, nir_opt_undef);
NIR_PASS(progress, s, zink_nir_lower_b2b);
if (zs)
NIR_PASS(progress, s, bound_bo_access, zs);
if (can_shrink)
NIR_PASS(progress, s, nir_opt_shrink_vectors, false);
} while (progress);
do {
progress = false;
NIR_PASS(progress, s, nir_opt_algebraic_late);
if (progress) {
NIR_PASS_V(s, nir_copy_prop);
NIR_PASS_V(s, nir_opt_dce);
NIR_PASS_V(s, nir_opt_cse);
}
} while (progress);
}
/* - copy the lowered fbfetch variable
* - set the new one up as an input attachment for descriptor 0.6
* - load it as an image
* - overwrite the previous load
*/
static bool
lower_fbfetch_instr(nir_builder *b, nir_instr *instr, void *data)
{
bool ms = data != NULL;
if (instr->type != nir_instr_type_intrinsic)
return false;
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
if (intr->intrinsic != nir_intrinsic_load_deref)
return false;
nir_variable *var = nir_intrinsic_get_var(intr, 0);
if (!var->data.fb_fetch_output)
return false;
b->cursor = nir_after_instr(instr);
nir_variable *fbfetch = nir_variable_clone(var, b->shader);
/* If Dim is SubpassData, ... Image Format must be Unknown
* - SPIRV OpTypeImage specification
*/
fbfetch->data.image.format = 0;
fbfetch->data.index = 0; /* fix this if more than 1 fbfetch target is supported */
fbfetch->data.mode = nir_var_uniform;
fbfetch->data.binding = ZINK_FBFETCH_BINDING;
fbfetch->data.sample = ms;
enum glsl_sampler_dim dim = ms ? GLSL_SAMPLER_DIM_SUBPASS_MS : GLSL_SAMPLER_DIM_SUBPASS;
fbfetch->type = glsl_image_type(dim, false, GLSL_TYPE_FLOAT);
nir_shader_add_variable(b->shader, fbfetch);
nir_def *deref = &nir_build_deref_var(b, fbfetch)->def;
nir_def *sample = ms ? nir_load_sample_id(b) : nir_undef(b, 1, 32);
nir_def *load = nir_image_deref_load(b, 4, 32, deref, nir_imm_vec4(b, 0, 0, 0, 1), sample, nir_imm_int(b, 0));
nir_def_rewrite_uses(&intr->def, load);
return true;
}
static bool
lower_fbfetch(nir_shader *shader, nir_variable **fbfetch, bool ms)
{
nir_foreach_shader_out_variable(var, shader) {
if (var->data.fb_fetch_output) {
*fbfetch = var;
break;
}
}
assert(*fbfetch);
if (!*fbfetch)
return false;
return nir_shader_instructions_pass(shader, lower_fbfetch_instr, nir_metadata_dominance, (void*)ms);
}
/*
* Add a check for out of bounds LOD for every texel fetch op
* It boils down to:
* - if (lod < query_levels(tex))
* - res = txf(tex)
* - else
* - res = (0, 0, 0, 1)
*/
static bool
lower_txf_lod_robustness_instr(nir_builder *b, nir_instr *in, void *data)
{
if (in->type != nir_instr_type_tex)
return false;
nir_tex_instr *txf = nir_instr_as_tex(in);
if (txf->op != nir_texop_txf)
return false;
b->cursor = nir_before_instr(in);
int lod_idx = nir_tex_instr_src_index(txf, nir_tex_src_lod);
assert(lod_idx >= 0);
nir_src lod_src = txf->src[lod_idx].src;
if (nir_src_is_const(lod_src) && nir_src_as_const_value(lod_src)->u32 == 0)
return false;
nir_def *lod = lod_src.ssa;
int offset_idx = nir_tex_instr_src_index(txf, nir_tex_src_texture_offset);
int handle_idx = nir_tex_instr_src_index(txf, nir_tex_src_texture_handle);
nir_tex_instr *levels = nir_tex_instr_create(b->shader,
!!(offset_idx >= 0) + !!(handle_idx >= 0));
levels->op = nir_texop_query_levels;
levels->texture_index = txf->texture_index;
levels->dest_type = nir_type_int | lod->bit_size;
if (offset_idx >= 0) {
levels->src[0].src_type = nir_tex_src_texture_offset;
levels->src[0].src = nir_src_for_ssa(txf->src[offset_idx].src.ssa);
}
if (handle_idx >= 0) {
levels->src[!!(offset_idx >= 0)].src_type = nir_tex_src_texture_handle;
levels->src[!!(offset_idx >= 0)].src = nir_src_for_ssa(txf->src[handle_idx].src.ssa);
}
nir_def_init(&levels->instr, &levels->def,
nir_tex_instr_dest_size(levels), 32);
nir_builder_instr_insert(b, &levels->instr);
nir_if *lod_oob_if = nir_push_if(b, nir_ilt(b, lod, &levels->def));
nir_tex_instr *new_txf = nir_instr_as_tex(nir_instr_clone(b->shader, in));
nir_builder_instr_insert(b, &new_txf->instr);
nir_if *lod_oob_else = nir_push_else(b, lod_oob_if);
nir_const_value oob_values[4] = {0};
unsigned bit_size = nir_alu_type_get_type_size(txf->dest_type);
oob_values[3] = (txf->dest_type & nir_type_float) ?
nir_const_value_for_float(1.0, bit_size) : nir_const_value_for_uint(1, bit_size);
nir_def *oob_val = nir_build_imm(b, nir_tex_instr_dest_size(txf), bit_size, oob_values);
nir_pop_if(b, lod_oob_else);
nir_def *robust_txf = nir_if_phi(b, &new_txf->def, oob_val);
nir_def_rewrite_uses(&txf->def, robust_txf);
nir_instr_remove_v(in);
return true;
}
/* This pass is used to work around the lack of out-of-bounds LOD robustness
* for texel fetch ops in VK_EXT_image_robustness.
*/
static bool
lower_txf_lod_robustness(nir_shader *shader)
{
return nir_shader_instructions_pass(shader, lower_txf_lod_robustness_instr, nir_metadata_none, NULL);
}
/* check for a genuine gl_PointSize output vs one from nir_lower_point_size_mov */
static bool
check_psiz(struct nir_shader *s)
{
bool have_psiz = false;
nir_foreach_shader_out_variable(var, s) {
if (var->data.location == VARYING_SLOT_PSIZ) {
/* genuine PSIZ outputs will have this set */
have_psiz |= !!var->data.explicit_location;
}
}
return have_psiz;
}
static nir_variable *
find_var_with_location_frac(nir_shader *nir, unsigned location, unsigned location_frac, bool have_psiz, nir_variable_mode mode)
{
assert((int)location >= 0);
nir_foreach_variable_with_modes(var, nir, mode) {
if (var->data.location == location && (location != VARYING_SLOT_PSIZ || !have_psiz || var->data.explicit_location)) {
unsigned num_components = glsl_get_vector_elements(var->type);
if (glsl_type_is_64bit(glsl_without_array(var->type)))
num_components *= 2;
if (is_clipcull_dist(var->data.location))
num_components = glsl_get_aoa_size(var->type);
if (var->data.location_frac <= location_frac &&
var->data.location_frac + num_components > location_frac)
return var;
}
}
return NULL;
}
static bool
is_inlined(const bool *inlined, const nir_xfb_output_info *output)
{
unsigned num_components = util_bitcount(output->component_mask);
for (unsigned i = 0; i < num_components; i++)
if (!inlined[output->component_offset + i])
return false;
return true;
}
static void
update_psiz_location(nir_shader *nir, nir_variable *psiz)
{
uint32_t last_output = util_last_bit64(nir->info.outputs_written);
if (last_output < VARYING_SLOT_VAR0)
last_output = VARYING_SLOT_VAR0;
else
last_output++;
/* this should get fixed up by slot remapping */
psiz->data.location = last_output;
}
static const struct glsl_type *
clamp_slot_type(const struct glsl_type *type, unsigned slot)
{
/* could be dvec/dmat/mat: each member is the same */
const struct glsl_type *plain = glsl_without_array_or_matrix(type);
/* determine size of each member type */
unsigned slot_count = glsl_count_vec4_slots(plain, false, false);
/* normalize slot idx to current type's size */
slot %= slot_count;
unsigned slot_components = glsl_get_components(plain);
if (glsl_base_type_is_64bit(glsl_get_base_type(plain)))
slot_components *= 2;
/* create a vec4 mask of the selected slot's components out of all the components */
uint32_t mask = BITFIELD_MASK(slot_components) & BITFIELD_RANGE(slot * 4, 4);
/* return a vecN of the selected components */
slot_components = util_bitcount(mask);
return glsl_vec_type(slot_components);
}
static const struct glsl_type *
unroll_struct_type(const struct glsl_type *slot_type, unsigned *slot_idx)
{
const struct glsl_type *type = slot_type;
unsigned slot_count = 0;
unsigned cur_slot = 0;
/* iterate over all the members in the struct, stopping once the slot idx is reached */
for (unsigned i = 0; i < glsl_get_length(slot_type) && cur_slot <= *slot_idx; i++, cur_slot += slot_count) {
/* use array type for slot counting but return array member type for unroll */
const struct glsl_type *arraytype = glsl_get_struct_field(slot_type, i);
type = glsl_without_array(arraytype);
slot_count = glsl_count_vec4_slots(arraytype, false, false);
}
*slot_idx -= (cur_slot - slot_count);
if (!glsl_type_is_struct_or_ifc(type))
/* this is a fully unrolled struct: find the number of vec components to output */
type = clamp_slot_type(type, *slot_idx);
return type;
}
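/* calculate how many 32bit components 'var' exports to the vec4 slot 'slot' */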
static unsigned
get_slot_components(nir_variable *var, unsigned slot, unsigned so_slot)
{
assert(var && slot < var->data.location + glsl_count_vec4_slots(var->type, false, false));
const struct glsl_type *orig_type = var->type;
const struct glsl_type *type = glsl_without_array(var->type);
unsigned slot_idx = slot - so_slot;
if (type != orig_type)
slot_idx %= glsl_count_vec4_slots(type, false, false);
/* need to find the vec4 that's being exported by this slot */
while (glsl_type_is_struct_or_ifc(type))
type = unroll_struct_type(type, &slot_idx);
/* arrays here are already fully unrolled from their structs, so slot handling is implicit */
unsigned num_components = glsl_get_components(glsl_without_array(type));
/* special handling: clip/cull distance are arrays with vector semantics */
if (is_clipcull_dist(var->data.location)) {
num_components = glsl_array_size(type);
if (slot_idx)
/* this is the second vec4 */
num_components %= 4;
else
/* this is the first vec4 */
num_components = MIN2(num_components, 4);
}
assert(num_components);
/* gallium handles xfb in terms of 32bit units */
if (glsl_base_type_is_64bit(glsl_get_base_type(glsl_without_array(type))))
num_components *= 2;
return num_components;
}
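/* number of consecutive io slots consumed by this variable */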
static unsigned
get_var_slot_count(nir_shader *nir, nir_variable *var)
{
assert(var->data.mode == nir_var_shader_in || var->data.mode == nir_var_shader_out);
const struct glsl_type *type = var->type;
if (nir_is_arrayed_io(var, nir->info.stage))
type = glsl_get_array_element(type);
unsigned slot_count = 0;
if ((nir->info.stage == MESA_SHADER_VERTEX && var->data.mode == nir_var_shader_in && var->data.location >= VERT_ATTRIB_GENERIC0) ||
var->data.location >= VARYING_SLOT_VAR0)
slot_count = glsl_count_vec4_slots(type, false, false);
else if (glsl_type_is_array(type))
slot_count = DIV_ROUND_UP(glsl_get_aoa_size(type), 4);
else
slot_count = 1;
return slot_count;
}
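/* find the xfb output that captures the given slot, if any */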
static const nir_xfb_output_info *
find_packed_output(const nir_xfb_info *xfb_info, unsigned slot)
{
for (unsigned i = 0; i < xfb_info->output_count; i++) {
const nir_xfb_output_info *packed_output = &xfb_info->outputs[i];
if (packed_output->location == slot)
return packed_output;
}
return NULL;
}
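/* map gallium xfb info onto the shader's output variables: inline xfb data onto each
 * variable where possible, then consolidate packed outputs so that every captured
 * slot ends up inlined
 */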
static void
update_so_info(struct zink_shader *zs, nir_shader *nir, uint64_t outputs_written, bool have_psiz)
{
bool inlined[VARYING_SLOT_MAX][4] = {0};
uint64_t packed = 0;
uint8_t packed_components[VARYING_SLOT_MAX] = {0};
uint8_t packed_streams[VARYING_SLOT_MAX] = {0};
uint8_t packed_buffers[VARYING_SLOT_MAX] = {0};
uint16_t packed_offsets[VARYING_SLOT_MAX][4] = {0};
for (unsigned i = 0; i < nir->xfb_info->output_count; i++) {
const nir_xfb_output_info *output = &nir->xfb_info->outputs[i];
unsigned xfb_components = util_bitcount(output->component_mask);
/* always set stride to be used during draw */
zs->sinfo.stride[output->buffer] = nir->xfb_info->buffers[output->buffer].stride;
if (zs->info.stage != MESA_SHADER_GEOMETRY || util_bitcount(zs->info.gs.active_stream_mask) == 1) {
for (unsigned c = 0; !is_inlined(inlined[output->location], output) && c < xfb_components; c++) {
unsigned slot = output->location;
if (inlined[slot][output->component_offset + c])
continue;
nir_variable *var = NULL;
while (!var && slot < VARYING_SLOT_TESS_MAX)
var = find_var_with_location_frac(nir, slot--, output->component_offset + c, have_psiz, nir_var_shader_out);
slot = output->location;
unsigned slot_count = var ? get_var_slot_count(nir, var) : 0;
if (!var || var->data.location > slot || var->data.location + slot_count <= slot) {
/* if no variable is found for the xfb output, no output exists */
inlined[slot][c + output->component_offset] = true;
continue;
}
if (var->data.explicit_xfb_buffer) {
/* handle dvec3 where gallium splits streamout over 2 registers */
for (unsigned j = 0; j < xfb_components; j++)
inlined[slot][c + output->component_offset + j] = true;
}
if (is_inlined(inlined[slot], output))
continue;
assert(!glsl_type_is_array(var->type) || is_clipcull_dist(var->data.location));
assert(!glsl_type_is_struct_or_ifc(var->type));
unsigned num_components = glsl_type_is_array(var->type) ? glsl_get_aoa_size(var->type) : glsl_get_vector_elements(var->type);
if (glsl_type_is_64bit(glsl_without_array(var->type)))
num_components *= 2;
/* if this is the entire variable, try to blast it out during the initial declaration
* structs must be handled later to ensure accurate analysis
*/
if ((num_components == xfb_components ||
num_components < xfb_components ||
(num_components > xfb_components && xfb_components == 4))) {
var->data.explicit_xfb_buffer = 1;
var->data.xfb.buffer = output->buffer;
var->data.xfb.stride = zs->sinfo.stride[output->buffer];
var->data.offset = (output->offset + c * sizeof(uint32_t));
var->data.stream = nir->xfb_info->buffer_to_stream[output->buffer];
for (unsigned j = 0; j < MIN2(num_components, xfb_components); j++)
inlined[slot][c + output->component_offset + j] = true;
} else {
/* otherwise store some metadata for later */
packed |= BITFIELD64_BIT(slot);
packed_components[slot] += xfb_components;
packed_streams[slot] |= BITFIELD_BIT(nir->xfb_info->buffer_to_stream[output->buffer]);
packed_buffers[slot] |= BITFIELD_BIT(output->buffer);
for (unsigned j = 0; j < xfb_components; j++)
packed_offsets[output->location][j + output->component_offset + c] = output->offset + j * sizeof(uint32_t);
}
}
}
}
/* if this was flagged as a packed output before, and if all the components are
* being output with the same stream on the same buffer with increasing offsets, this entire variable
* can be consolidated into a single output to conserve locations
*/
for (unsigned i = 0; i < nir->xfb_info->output_count; i++) {
const nir_xfb_output_info *output = &nir->xfb_info->outputs[i];
unsigned slot = output->location;
if (is_inlined(inlined[slot], output))
continue;
if (zs->info.stage != MESA_SHADER_GEOMETRY || util_bitcount(zs->info.gs.active_stream_mask) == 1) {
nir_variable *var = NULL;
while (!var)
var = find_var_with_location_frac(nir, slot--, output->component_offset, have_psiz, nir_var_shader_out);
slot = output->location;
unsigned slot_count = var ? get_var_slot_count(nir, var) : 0;
if (!var || var->data.location > slot || var->data.location + slot_count <= slot)
continue;
/* this is a lowered 64bit variable that can't be exported due to packing */
if (var->data.is_xfb)
goto out;
unsigned num_slots = is_clipcull_dist(var->data.location) ?
glsl_array_size(var->type) / 4 :
glsl_count_vec4_slots(var->type, false, false);
/* for each variable, iterate over all the variable's slots and inline the outputs */
for (unsigned j = 0; j < num_slots; j++) {
slot = var->data.location + j;
const nir_xfb_output_info *packed_output = find_packed_output(nir->xfb_info, slot);
if (!packed_output)
goto out;
/* if this slot wasn't packed or isn't in the same stream/buffer, skip consolidation */
if (!(packed & BITFIELD64_BIT(slot)) ||
util_bitcount(packed_streams[slot]) != 1 ||
util_bitcount(packed_buffers[slot]) != 1)
goto out;
/* if all the components the variable exports to this slot aren't captured, skip consolidation */
unsigned num_components = get_slot_components(var, slot, var->data.location);
if (num_components != packed_components[slot])
goto out;
/* in order to pack the xfb output, all the offsets must be sequentially incrementing */
uint32_t prev_offset = packed_offsets[packed_output->location][0];
for (unsigned k = 1; k < num_components; k++) {
/* if the offsets are not incrementing as expected, skip consolidation */
if (packed_offsets[packed_output->location][k] != prev_offset + sizeof(uint32_t))
goto out;
prev_offset = packed_offsets[packed_output->location][k + packed_output->component_offset];
}
}
/* this output can be consolidated: blast out all the data inlined */
var->data.explicit_xfb_buffer = 1;
var->data.xfb.buffer = output->buffer;
var->data.xfb.stride = zs->sinfo.stride[output->buffer];
var->data.offset = output->offset;
var->data.stream = nir->xfb_info->buffer_to_stream[output->buffer];
/* mark all slot components inlined to skip subsequent loop iterations */
for (unsigned j = 0; j < num_slots; j++) {
slot = var->data.location + j;
for (unsigned k = 0; k < packed_components[slot]; k++)
inlined[slot][k] = true;
packed &= ~BITFIELD64_BIT(slot);
}
continue;
}
out:
unreachable("xfb should be inlined by now!");
}
}
struct decompose_state {
nir_variable **split;
bool needs_w;
};
static bool
lower_attrib(nir_builder *b, nir_instr *instr, void *data)
{
struct decompose_state *state = data;
nir_variable **split = state->split;
if (instr->type != nir_instr_type_intrinsic)
return false;
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
if (intr->intrinsic != nir_intrinsic_load_deref)
return false;
nir_deref_instr *deref = nir_src_as_deref(intr->src[0]);
nir_variable *var = nir_deref_instr_get_variable(deref);
if (var != split[0])
return false;
unsigned num_components = glsl_get_vector_elements(split[0]->type);
b->cursor = nir_after_instr(instr);
nir_def *loads[4];
for (unsigned i = 0; i < (state->needs_w ? num_components - 1 : num_components); i++)
loads[i] = nir_load_deref(b, nir_build_deref_var(b, split[i+1]));
if (state->needs_w) {
      /* oob load w component to get correct value for int/float */
loads[3] = nir_channel(b, loads[0], 3);
loads[0] = nir_channel(b, loads[0], 0);
}
nir_def *new_load = nir_vec(b, loads, num_components);
nir_def_rewrite_uses(&intr->def, new_load);
nir_instr_remove_v(instr);
return true;
}
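/* split the given vertex input attributes into per-component variables and rewrite their
 * loads via lower_attrib; for attrs without a w component, w is reconstructed from an
 * oob read of the first split
 */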
static bool
decompose_attribs(nir_shader *nir, uint32_t decomposed_attrs, uint32_t decomposed_attrs_without_w)
{
uint32_t bits = 0;
nir_foreach_variable_with_modes(var, nir, nir_var_shader_in)
bits |= BITFIELD_BIT(var->data.driver_location);
bits = ~bits;
u_foreach_bit(location, decomposed_attrs | decomposed_attrs_without_w) {
nir_variable *split[5];
struct decompose_state state;
state.split = split;
nir_variable *var = nir_find_variable_with_driver_location(nir, nir_var_shader_in, location);
assert(var);
split[0] = var;
bits |= BITFIELD_BIT(var->data.driver_location);
const struct glsl_type *new_type = glsl_type_is_scalar(var->type) ? var->type : glsl_get_array_element(var->type);
unsigned num_components = glsl_get_vector_elements(var->type);
state.needs_w = (decomposed_attrs_without_w & BITFIELD_BIT(location)) != 0 && num_components == 4;
for (unsigned i = 0; i < (state.needs_w ? num_components - 1 : num_components); i++) {
split[i+1] = nir_variable_clone(var, nir);
split[i+1]->name = ralloc_asprintf(nir, "%s_split%u", var->name, i);
if (decomposed_attrs_without_w & BITFIELD_BIT(location))
split[i+1]->type = !i && num_components == 4 ? var->type : new_type;
else
split[i+1]->type = new_type;
split[i+1]->data.driver_location = ffs(bits) - 1;
bits &= ~BITFIELD_BIT(split[i+1]->data.driver_location);
nir_shader_add_variable(nir, split[i+1]);
}
var->data.mode = nir_var_shader_temp;
nir_shader_instructions_pass(nir, lower_attrib, nir_metadata_dominance, &state);
}
nir_fixup_deref_modes(nir);
NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_shader_temp, NULL);
optimize_nir(nir, NULL, true);
return true;
}
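/* convert ubo/ssbo/shared/scratch byte offsets into array indices sized to the access,
 * splitting 64bit loads/stores into 2x32 when shaderInt64 isn't available
 */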
static bool
rewrite_bo_access_instr(nir_builder *b, nir_instr *instr, void *data)
{
struct zink_screen *screen = data;
const bool has_int64 = screen->info.feats.features.shaderInt64;
if (instr->type != nir_instr_type_intrinsic)
return false;
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
b->cursor = nir_before_instr(instr);
switch (intr->intrinsic) {
case nir_intrinsic_ssbo_atomic:
case nir_intrinsic_ssbo_atomic_swap: {
/* convert offset to uintN_t[idx] */
nir_def *offset = nir_udiv_imm(b, intr->src[1].ssa, intr->def.bit_size / 8);
nir_src_rewrite(&intr->src[1], offset);
return true;
}
case nir_intrinsic_load_ssbo:
case nir_intrinsic_load_ubo: {
/* ubo0 can have unaligned 64bit loads, particularly for bindless texture ids */
bool force_2x32 = intr->intrinsic == nir_intrinsic_load_ubo &&
nir_src_is_const(intr->src[0]) &&
nir_src_as_uint(intr->src[0]) == 0 &&
intr->def.bit_size == 64 &&
nir_intrinsic_align_offset(intr) % 8 != 0;
force_2x32 |= intr->def.bit_size == 64 && !has_int64;
nir_def *offset = nir_udiv_imm(b, intr->src[1].ssa, (force_2x32 ? 32 : intr->def.bit_size) / 8);
nir_src_rewrite(&intr->src[1], offset);
/* if 64bit isn't supported, 64bit loads definitely aren't supported, so rewrite as 2x32 with cast and pray */
if (force_2x32) {
/* this is always scalarized */
assert(intr->def.num_components == 1);
/* rewrite as 2x32 */
nir_def *load[2];
for (unsigned i = 0; i < 2; i++) {
if (intr->intrinsic == nir_intrinsic_load_ssbo)
load[i] = nir_load_ssbo(b, 1, 32, intr->src[0].ssa, nir_iadd_imm(b, intr->src[1].ssa, i), .align_mul = 4, .align_offset = 0);
else
load[i] = nir_load_ubo(b, 1, 32, intr->src[0].ssa, nir_iadd_imm(b, intr->src[1].ssa, i), .align_mul = 4, .align_offset = 0, .range = 4);
nir_intrinsic_set_access(nir_instr_as_intrinsic(load[i]->parent_instr), nir_intrinsic_access(intr));
}
/* cast back to 64bit */
nir_def *casted = nir_pack_64_2x32_split(b, load[0], load[1]);
nir_def_rewrite_uses(&intr->def, casted);
nir_instr_remove(instr);
}
return true;
}
case nir_intrinsic_load_scratch:
case nir_intrinsic_load_shared: {
b->cursor = nir_before_instr(instr);
bool force_2x32 = intr->def.bit_size == 64 && !has_int64;
nir_def *offset = nir_udiv_imm(b, intr->src[0].ssa, (force_2x32 ? 32 : intr->def.bit_size) / 8);
nir_src_rewrite(&intr->src[0], offset);
/* if 64bit isn't supported, 64bit loads definitely aren't supported, so rewrite as 2x32 with cast and pray */
if (force_2x32) {
/* this is always scalarized */
assert(intr->def.num_components == 1);
/* rewrite as 2x32 */
nir_def *load[2];
for (unsigned i = 0; i < 2; i++)
load[i] = nir_load_shared(b, 1, 32, nir_iadd_imm(b, intr->src[0].ssa, i), .align_mul = 4, .align_offset = 0);
/* cast back to 64bit */
nir_def *casted = nir_pack_64_2x32_split(b, load[0], load[1]);
nir_def_rewrite_uses(&intr->def, casted);
nir_instr_remove(instr);
return true;
}
break;
}
case nir_intrinsic_store_ssbo: {
b->cursor = nir_before_instr(instr);
bool force_2x32 = nir_src_bit_size(intr->src[0]) == 64 && !has_int64;
nir_def *offset = nir_udiv_imm(b, intr->src[2].ssa, (force_2x32 ? 32 : nir_src_bit_size(intr->src[0])) / 8);
nir_src_rewrite(&intr->src[2], offset);
      /* if 64bit isn't supported, 64bit stores definitely aren't supported, so rewrite as 2x32 and pray */
if (force_2x32) {
/* this is always scalarized */
assert(intr->src[0].ssa->num_components == 1);
nir_def *vals[2] = {nir_unpack_64_2x32_split_x(b, intr->src[0].ssa), nir_unpack_64_2x32_split_y(b, intr->src[0].ssa)};
for (unsigned i = 0; i < 2; i++)
nir_store_ssbo(b, vals[i], intr->src[1].ssa, nir_iadd_imm(b, intr->src[2].ssa, i), .align_mul = 4, .align_offset = 0);
nir_instr_remove(instr);
}
return true;
}
case nir_intrinsic_store_scratch:
case nir_intrinsic_store_shared: {
b->cursor = nir_before_instr(instr);
bool force_2x32 = nir_src_bit_size(intr->src[0]) == 64 && !has_int64;
nir_def *offset = nir_udiv_imm(b, intr->src[1].ssa, (force_2x32 ? 32 : nir_src_bit_size(intr->src[0])) / 8);
nir_src_rewrite(&intr->src[1], offset);
      /* if 64bit isn't supported, 64bit stores definitely aren't supported, so rewrite as 2x32 and pray */
      if (force_2x32) {
/* this is always scalarized */
assert(intr->src[0].ssa->num_components == 1);
nir_def *vals[2] = {nir_unpack_64_2x32_split_x(b, intr->src[0].ssa), nir_unpack_64_2x32_split_y(b, intr->src[0].ssa)};
for (unsigned i = 0; i < 2; i++)
nir_store_shared(b, vals[i], nir_iadd_imm(b, intr->src[1].ssa, i), .align_mul = 4, .align_offset = 0);
nir_instr_remove(instr);
}
return true;
}
default:
break;
}
return false;
}
static bool
rewrite_bo_access(nir_shader *shader, struct zink_screen *screen)
{
return nir_shader_instructions_pass(shader, rewrite_bo_access_instr, nir_metadata_dominance, screen);
}
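/* get (or lazily create) the bo variable matching this access's bit size by cloning the
 * 32bit variant and rewriting its array type accordingly
 */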
static nir_variable *
get_bo_var(nir_shader *shader, struct bo_vars *bo, bool ssbo, nir_src *src, unsigned bit_size)
{
nir_variable *var, **ptr;
unsigned idx = ssbo || (nir_src_is_const(*src) && !nir_src_as_uint(*src)) ? 0 : 1;
if (ssbo)
ptr = &bo->ssbo[bit_size >> 4];
else {
if (!idx) {
ptr = &bo->uniforms[bit_size >> 4];
} else
ptr = &bo->ubo[bit_size >> 4];
}
var = *ptr;
if (!var) {
if (ssbo)
var = bo->ssbo[32 >> 4];
else {
if (!idx)
var = bo->uniforms[32 >> 4];
else
var = bo->ubo[32 >> 4];
}
var = nir_variable_clone(var, shader);
if (ssbo)
var->name = ralloc_asprintf(shader, "%s@%u", "ssbos", bit_size);
else
var->name = ralloc_asprintf(shader, "%s@%u", idx ? "ubos" : "uniform_0", bit_size);
*ptr = var;
nir_shader_add_variable(shader, var);
struct glsl_struct_field *fields = rzalloc_array(shader, struct glsl_struct_field, 2);
fields[0].name = ralloc_strdup(shader, "base");
fields[1].name = ralloc_strdup(shader, "unsized");
unsigned array_size = glsl_get_length(var->type);
const struct glsl_type *bare_type = glsl_without_array(var->type);
const struct glsl_type *array_type = glsl_get_struct_field(bare_type, 0);
unsigned length = glsl_get_length(array_type);
const struct glsl_type *type;
const struct glsl_type *unsized = glsl_array_type(glsl_uintN_t_type(bit_size), 0, bit_size / 8);
if (bit_size > 32) {
assert(bit_size == 64);
type = glsl_array_type(glsl_uintN_t_type(bit_size), length / 2, bit_size / 8);
} else {
type = glsl_array_type(glsl_uintN_t_type(bit_size), length * (32 / bit_size), bit_size / 8);
}
fields[0].type = type;
fields[1].type = unsized;
var->type = glsl_array_type(glsl_struct_type(fields, glsl_get_length(bare_type), "struct", false), array_size, 0);
var->data.driver_location = idx;
}
return var;
}
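/* rewrite an ssbo atomic as per-component deref atomics on the corresponding bo variable */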
static void
rewrite_atomic_ssbo_instr(nir_builder *b, nir_instr *instr, struct bo_vars *bo)
{
nir_intrinsic_op op;
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
if (intr->intrinsic == nir_intrinsic_ssbo_atomic)
op = nir_intrinsic_deref_atomic;
else if (intr->intrinsic == nir_intrinsic_ssbo_atomic_swap)
op = nir_intrinsic_deref_atomic_swap;
else
unreachable("unknown intrinsic");
nir_def *offset = intr->src[1].ssa;
nir_src *src = &intr->src[0];
nir_variable *var = get_bo_var(b->shader, bo, true, src,
intr->def.bit_size);
nir_deref_instr *deref_var = nir_build_deref_var(b, var);
nir_def *idx = src->ssa;
if (bo->first_ssbo)
idx = nir_iadd_imm(b, idx, -bo->first_ssbo);
nir_deref_instr *deref_array = nir_build_deref_array(b, deref_var, idx);
nir_deref_instr *deref_struct = nir_build_deref_struct(b, deref_array, 0);
/* generate new atomic deref ops for every component */
nir_def *result[4];
unsigned num_components = intr->def.num_components;
for (unsigned i = 0; i < num_components; i++) {
nir_deref_instr *deref_arr = nir_build_deref_array(b, deref_struct, offset);
nir_intrinsic_instr *new_instr = nir_intrinsic_instr_create(b->shader, op);
nir_def_init(&new_instr->instr, &new_instr->def, 1,
intr->def.bit_size);
nir_intrinsic_set_atomic_op(new_instr, nir_intrinsic_atomic_op(intr));
new_instr->src[0] = nir_src_for_ssa(&deref_arr->def);
/* deref ops have no offset src, so copy the srcs after it */
for (unsigned j = 2; j < nir_intrinsic_infos[intr->intrinsic].num_srcs; j++)
new_instr->src[j - 1] = nir_src_for_ssa(intr->src[j].ssa);
nir_builder_instr_insert(b, &new_instr->instr);
result[i] = &new_instr->def;
offset = nir_iadd_imm(b, offset, 1);
}
nir_def *load = nir_vec(b, result, num_components);
nir_def_rewrite_uses(&intr->def, load);
nir_instr_remove(instr);
}
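/* rewrite buffer intrinsics (load_ubo/load_ssbo/store_ssbo/atomics) as deref-based access
 * on the bo variables created by get_bo_var
 */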
static bool
remove_bo_access_instr(nir_builder *b, nir_instr *instr, void *data)
{
struct bo_vars *bo = data;
if (instr->type != nir_instr_type_intrinsic)
return false;
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
nir_variable *var = NULL;
nir_def *offset = NULL;
bool is_load = true;
b->cursor = nir_before_instr(instr);
nir_src *src;
bool ssbo = true;
switch (intr->intrinsic) {
case nir_intrinsic_ssbo_atomic:
case nir_intrinsic_ssbo_atomic_swap:
rewrite_atomic_ssbo_instr(b, instr, bo);
return true;
case nir_intrinsic_store_ssbo:
src = &intr->src[1];
var = get_bo_var(b->shader, bo, true, src, nir_src_bit_size(intr->src[0]));
offset = intr->src[2].ssa;
is_load = false;
break;
case nir_intrinsic_load_ssbo:
src = &intr->src[0];
var = get_bo_var(b->shader, bo, true, src, intr->def.bit_size);
offset = intr->src[1].ssa;
break;
case nir_intrinsic_load_ubo:
src = &intr->src[0];
var = get_bo_var(b->shader, bo, false, src, intr->def.bit_size);
offset = intr->src[1].ssa;
ssbo = false;
break;
default:
return false;
}
assert(var);
assert(offset);
nir_deref_instr *deref_var = nir_build_deref_var(b, var);
nir_def *idx = !ssbo && var->data.driver_location ? nir_iadd_imm(b, src->ssa, -1) : src->ssa;
if (!ssbo && bo->first_ubo && var->data.driver_location)
idx = nir_iadd_imm(b, idx, -bo->first_ubo);
else if (ssbo && bo->first_ssbo)
idx = nir_iadd_imm(b, idx, -bo->first_ssbo);
nir_deref_instr *deref_array = nir_build_deref_array(b, deref_var,
nir_i2iN(b, idx, deref_var->def.bit_size));
nir_deref_instr *deref_struct = nir_build_deref_struct(b, deref_array, 0);
assert(intr->num_components <= 2);
if (is_load) {
nir_def *result[2];
for (unsigned i = 0; i < intr->num_components; i++) {
nir_deref_instr *deref_arr = nir_build_deref_array(b, deref_struct,
nir_i2iN(b, offset, deref_struct->def.bit_size));
result[i] = nir_load_deref(b, deref_arr);
if (intr->intrinsic == nir_intrinsic_load_ssbo)
nir_intrinsic_set_access(nir_instr_as_intrinsic(result[i]->parent_instr), nir_intrinsic_access(intr));
offset = nir_iadd_imm(b, offset, 1);
}
nir_def *load = nir_vec(b, result, intr->num_components);
nir_def_rewrite_uses(&intr->def, load);
} else {
nir_deref_instr *deref_arr = nir_build_deref_array(b, deref_struct,
nir_i2iN(b, offset, deref_struct->def.bit_size));
nir_build_store_deref(b, &deref_arr->def, intr->src[0].ssa, BITFIELD_MASK(intr->num_components), nir_intrinsic_access(intr));
}
nir_instr_remove(instr);
return true;
}
static bool
remove_bo_access(nir_shader *shader, struct zink_shader *zs)
{
struct bo_vars bo = get_bo_vars(zs, shader);
return nir_shader_instructions_pass(shader, remove_bo_access_instr, nir_metadata_dominance, &bo);
}
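/* classify a lowered io intrinsic; returns false for anything that isn't an io load/store */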
static bool
filter_io_instr(nir_intrinsic_instr *intr, bool *is_load, bool *is_input, bool *is_interp)
{
switch (intr->intrinsic) {
case nir_intrinsic_load_interpolated_input:
*is_interp = true;
FALLTHROUGH;
case nir_intrinsic_load_input:
case nir_intrinsic_load_per_vertex_input:
*is_input = true;
FALLTHROUGH;
case nir_intrinsic_load_output:
case nir_intrinsic_load_per_vertex_output:
case nir_intrinsic_load_per_primitive_output:
*is_load = true;
FALLTHROUGH;
case nir_intrinsic_store_output:
case nir_intrinsic_store_per_primitive_output:
case nir_intrinsic_store_per_vertex_output:
break;
default:
return false;
}
return true;
}
static bool
io_instr_is_arrayed(nir_intrinsic_instr *intr)
{
switch (intr->intrinsic) {
case nir_intrinsic_load_per_vertex_input:
case nir_intrinsic_load_per_vertex_output:
case nir_intrinsic_load_per_primitive_output:
case nir_intrinsic_store_per_primitive_output:
case nir_intrinsic_store_per_vertex_output:
return true;
default:
break;
}
return false;
}
static bool
find_var_deref(nir_shader *nir, nir_variable *var)
{
nir_foreach_function_impl(impl, nir) {
nir_foreach_block(block, impl) {
nir_foreach_instr(instr, block) {
if (instr->type != nir_instr_type_deref)
continue;
nir_deref_instr *deref = nir_instr_as_deref(instr);
if (deref->deref_type == nir_deref_type_var && deref->var == var)
return true;
}
}
}
return false;
}
static bool
find_var_io(nir_shader *nir, nir_variable *var)
{
nir_foreach_function(function, nir) {
if (!function->impl)
continue;
nir_foreach_block(block, function->impl) {
nir_foreach_instr(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
bool is_load = false;
bool is_input = false;
bool is_interp = false;
if (!filter_io_instr(intr, &is_load, &is_input, &is_interp))
continue;
if (var->data.mode == nir_var_shader_in && !is_input)
continue;
if (var->data.mode == nir_var_shader_out && is_input)
continue;
unsigned slot_offset = 0;
if (var->data.fb_fetch_output && !is_load)
continue;
if (nir->info.stage == MESA_SHADER_FRAGMENT && !is_load && !is_input && nir_intrinsic_io_semantics(intr).dual_source_blend_index != var->data.index)
continue;
nir_src *src_offset = nir_get_io_offset_src(intr);
if (src_offset && nir_src_is_const(*src_offset))
slot_offset = nir_src_as_uint(*src_offset);
unsigned slot_count = get_var_slot_count(nir, var);
if (var->data.mode & (nir_var_shader_out | nir_var_shader_in) &&
var->data.fb_fetch_output == nir_intrinsic_io_semantics(intr).fb_fetch_output &&
var->data.location <= nir_intrinsic_io_semantics(intr).location + slot_offset &&
var->data.location + slot_count > nir_intrinsic_io_semantics(intr).location + slot_offset)
return true;
}
}
}
return false;
}
struct clamp_layer_output_state {
nir_variable *original;
nir_variable *clamped;
};
static void
clamp_layer_output_emit(nir_builder *b, struct clamp_layer_output_state *state)
{
nir_def *is_layered = nir_load_push_constant_zink(b, 1, 32,
nir_imm_int(b, ZINK_GFX_PUSHCONST_FRAMEBUFFER_IS_LAYERED));
nir_deref_instr *original_deref = nir_build_deref_var(b, state->original);
nir_deref_instr *clamped_deref = nir_build_deref_var(b, state->clamped);
nir_def *layer = nir_bcsel(b, nir_ieq_imm(b, is_layered, 1),
nir_load_deref(b, original_deref),
nir_imm_int(b, 0));
nir_store_deref(b, clamped_deref, layer, 0);
}
static bool
clamp_layer_output_instr(nir_builder *b, nir_instr *instr, void *data)
{
struct clamp_layer_output_state *state = data;
switch (instr->type) {
case nir_instr_type_intrinsic: {
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
if (intr->intrinsic != nir_intrinsic_emit_vertex_with_counter &&
intr->intrinsic != nir_intrinsic_emit_vertex)
return false;
b->cursor = nir_before_instr(instr);
clamp_layer_output_emit(b, state);
return true;
}
default: return false;
}
}
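/* needs_sanitised_layer workaround: route the layer output through a clamped copy that
 * writes 0 whenever the framebuffer isn't layered (see ZINK_GFX_PUSHCONST_FRAMEBUFFER_IS_LAYERED)
 */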
static bool
clamp_layer_output(nir_shader *vs, nir_shader *fs, unsigned *next_location)
{
switch (vs->info.stage) {
case MESA_SHADER_VERTEX:
case MESA_SHADER_GEOMETRY:
case MESA_SHADER_TESS_EVAL:
break;
default:
unreachable("invalid last vertex stage!");
}
struct clamp_layer_output_state state = {0};
state.original = nir_find_variable_with_location(vs, nir_var_shader_out, VARYING_SLOT_LAYER);
if (!state.original || (!find_var_deref(vs, state.original) && !find_var_io(vs, state.original)))
return false;
state.clamped = nir_variable_create(vs, nir_var_shader_out, glsl_int_type(), "layer_clamped");
state.clamped->data.location = VARYING_SLOT_LAYER;
nir_variable *fs_var = nir_find_variable_with_location(fs, nir_var_shader_in, VARYING_SLOT_LAYER);
if ((state.original->data.explicit_xfb_buffer || fs_var) && *next_location < MAX_VARYING) {
state.original->data.location = VARYING_SLOT_VAR0; // Anything but a built-in slot
state.original->data.driver_location = (*next_location)++;
if (fs_var) {
fs_var->data.location = state.original->data.location;
fs_var->data.driver_location = state.original->data.driver_location;
}
} else {
if (state.original->data.explicit_xfb_buffer) {
/* Will xfb the clamped output but still better than nothing */
state.clamped->data.explicit_xfb_buffer = state.original->data.explicit_xfb_buffer;
state.clamped->data.xfb.buffer = state.original->data.xfb.buffer;
state.clamped->data.xfb.stride = state.original->data.xfb.stride;
state.clamped->data.offset = state.original->data.offset;
state.clamped->data.stream = state.original->data.stream;
}
state.original->data.mode = nir_var_shader_temp;
nir_fixup_deref_modes(vs);
}
if (vs->info.stage == MESA_SHADER_GEOMETRY) {
nir_shader_instructions_pass(vs, clamp_layer_output_instr, nir_metadata_dominance, &state);
} else {
nir_builder b;
nir_function_impl *impl = nir_shader_get_entrypoint(vs);
b = nir_builder_at(nir_after_impl(impl));
assert(impl->end_block->predecessors->entries == 1);
clamp_layer_output_emit(&b, &state);
nir_metadata_preserve(impl, nir_metadata_dominance);
}
optimize_nir(vs, NULL, true);
NIR_PASS_V(vs, nir_remove_dead_variables, nir_var_shader_temp, NULL);
return true;
}
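/* assign driver_locations for producer outputs: built-ins get a sentinel value, everything
 * else is packed into the shared slot_map consumed by assign_consumer_var_io
 */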
static void
assign_producer_var_io(gl_shader_stage stage, nir_variable *var, unsigned *reserved, unsigned char *slot_map)
{
unsigned slot = var->data.location;
switch (slot) {
case -1:
case VARYING_SLOT_POS:
case VARYING_SLOT_PSIZ:
case VARYING_SLOT_LAYER:
case VARYING_SLOT_PRIMITIVE_ID:
case VARYING_SLOT_CLIP_DIST0:
case VARYING_SLOT_CULL_DIST0:
case VARYING_SLOT_VIEWPORT:
case VARYING_SLOT_FACE:
case VARYING_SLOT_TESS_LEVEL_OUTER:
case VARYING_SLOT_TESS_LEVEL_INNER:
/* use a sentinel value to avoid counting later */
var->data.driver_location = UINT_MAX;
break;
default:
if (var->data.patch) {
assert(slot >= VARYING_SLOT_PATCH0);
slot -= VARYING_SLOT_PATCH0;
}
if (slot_map[slot] == 0xff) {
assert(*reserved < MAX_VARYING);
unsigned num_slots;
if (nir_is_arrayed_io(var, stage))
num_slots = glsl_count_vec4_slots(glsl_get_array_element(var->type), false, false);
else
num_slots = glsl_count_vec4_slots(var->type, false, false);
assert(*reserved + num_slots <= MAX_VARYING);
for (unsigned i = 0; i < num_slots; i++)
slot_map[slot + i] = (*reserved)++;
}
slot = slot_map[slot];
assert(slot < MAX_VARYING);
var->data.driver_location = slot;
}
}
ALWAYS_INLINE static bool
is_texcoord(gl_shader_stage stage, const nir_variable *var)
{
if (stage != MESA_SHADER_FRAGMENT)
return false;
return var->data.location >= VARYING_SLOT_TEX0 &&
var->data.location <= VARYING_SLOT_TEX7;
}
static bool
assign_consumer_var_io(gl_shader_stage stage, nir_variable *var, unsigned *reserved, unsigned char *slot_map)
{
unsigned slot = var->data.location;
switch (slot) {
case VARYING_SLOT_POS:
case VARYING_SLOT_PSIZ:
case VARYING_SLOT_LAYER:
case VARYING_SLOT_PRIMITIVE_ID:
case VARYING_SLOT_CLIP_DIST0:
case VARYING_SLOT_CULL_DIST0:
case VARYING_SLOT_VIEWPORT:
case VARYING_SLOT_FACE:
case VARYING_SLOT_TESS_LEVEL_OUTER:
case VARYING_SLOT_TESS_LEVEL_INNER:
/* use a sentinel value to avoid counting later */
var->data.driver_location = UINT_MAX;
break;
default:
if (var->data.patch) {
assert(slot >= VARYING_SLOT_PATCH0);
slot -= VARYING_SLOT_PATCH0;
}
if (slot_map[slot] == (unsigned char)-1) {
/* texcoords can't be eliminated in fs due to GL_COORD_REPLACE,
* so keep for now and eliminate later
*/
if (is_texcoord(stage, var)) {
var->data.driver_location = -1;
return true;
}
/* patch variables may be read in the workgroup */
if (stage != MESA_SHADER_TESS_CTRL)
/* dead io */
return false;
unsigned num_slots;
if (nir_is_arrayed_io(var, stage))
num_slots = glsl_count_vec4_slots(glsl_get_array_element(var->type), false, false);
else
num_slots = glsl_count_vec4_slots(var->type, false, false);
assert(*reserved + num_slots <= MAX_VARYING);
for (unsigned i = 0; i < num_slots; i++)
slot_map[slot + i] = (*reserved)++;
}
var->data.driver_location = slot_map[slot];
}
return true;
}
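/* replace loads of a dead input with zero (color slots default to 0,0,0,1) */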
static bool
rewrite_read_as_0(nir_builder *b, nir_instr *instr, void *data)
{
nir_variable *var = data;
if (instr->type != nir_instr_type_intrinsic)
return false;
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
bool is_load = false;
bool is_input = false;
bool is_interp = false;
if (!filter_io_instr(intr, &is_load, &is_input, &is_interp))
return false;
if (!is_load)
return false;
unsigned location = nir_intrinsic_io_semantics(intr).location;
if (location != var->data.location)
return false;
b->cursor = nir_before_instr(instr);
nir_def *zero = nir_imm_zero(b, intr->def.num_components,
intr->def.bit_size);
if (b->shader->info.stage == MESA_SHADER_FRAGMENT) {
switch (location) {
case VARYING_SLOT_COL0:
case VARYING_SLOT_COL1:
case VARYING_SLOT_BFC0:
case VARYING_SLOT_BFC1:
/* default color is 0,0,0,1 */
if (intr->def.num_components == 4)
zero = nir_vector_insert_imm(b, zero, nir_imm_float(b, 1.0), 3);
break;
default:
break;
}
}
nir_def_rewrite_uses(&intr->def, zero);
nir_instr_remove(instr);
return true;
}
static bool
delete_psiz_store_instr(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
switch (intr->intrinsic) {
case nir_intrinsic_store_output:
case nir_intrinsic_store_per_primitive_output:
case nir_intrinsic_store_per_vertex_output:
break;
default:
return false;
}
if (nir_intrinsic_io_semantics(intr).location != VARYING_SLOT_PSIZ)
return false;
if (!data || (nir_src_is_const(intr->src[0]) && fabs(nir_src_as_float(intr->src[0]) - 1.0) < FLT_EPSILON)) {
nir_instr_remove(&intr->instr);
return true;
}
return false;
}
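/* delete psiz stores: all of them, or only stores of the constant 1.0 when 'one' is set */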
static bool
delete_psiz_store(nir_shader *nir, bool one)
{
bool progress = nir_shader_intrinsics_pass(nir, delete_psiz_store_instr,
nir_metadata_dominance, one ? nir : NULL);
if (progress)
nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
return progress;
}
struct write_components {
unsigned slot;
uint32_t component_mask;
};
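/* zero-fill components of a consumer read that the producer never writes
 * (color slots default to 0,0,0,1)
 */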
static bool
fill_zero_reads(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
struct write_components *wc = data;
bool is_load = false;
bool is_input = false;
bool is_interp = false;
if (!filter_io_instr(intr, &is_load, &is_input, &is_interp))
return false;
if (!is_input)
return false;
nir_io_semantics s = nir_intrinsic_io_semantics(intr);
if (wc->slot < s.location || wc->slot >= s.location + s.num_slots)
return false;
unsigned num_components = intr->num_components;
unsigned c = nir_intrinsic_component(intr);
if (intr->def.bit_size == 64)
num_components *= 2;
nir_src *src_offset = nir_get_io_offset_src(intr);
if (!nir_src_is_const(*src_offset))
return false;
unsigned slot_offset = nir_src_as_uint(*src_offset);
if (s.location + slot_offset != wc->slot)
return false;
uint32_t readmask = BITFIELD_MASK(intr->num_components) << c;
if (intr->def.bit_size == 64)
readmask |= readmask << (intr->num_components + c);
/* handle dvec3/dvec4 */
if (num_components + c > 4)
readmask >>= 4;
if ((wc->component_mask & readmask) == readmask)
return false;
uint32_t rewrite_mask = readmask & ~wc->component_mask;
if (!rewrite_mask)
return false;
b->cursor = nir_after_instr(&intr->instr);
nir_def *zero = nir_imm_zero(b, intr->def.num_components, intr->def.bit_size);
if (b->shader->info.stage == MESA_SHADER_FRAGMENT) {
switch (wc->slot) {
case VARYING_SLOT_COL0:
case VARYING_SLOT_COL1:
case VARYING_SLOT_BFC0:
case VARYING_SLOT_BFC1:
/* default color is 0,0,0,1 */
if (intr->def.num_components == 4)
zero = nir_vector_insert_imm(b, zero, nir_imm_float(b, 1.0), 3);
break;
default:
break;
}
}
rewrite_mask >>= c;
nir_def *dest = &intr->def;
u_foreach_bit(component, rewrite_mask)
dest = nir_vector_insert_imm(b, dest, nir_channel(b, zero, component), component);
nir_def_rewrite_uses_after(&intr->def, dest, dest->parent_instr);
return true;
}
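/* accumulate the mask of components the producer writes for wc->slot */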
static bool
find_max_write_components(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
struct write_components *wc = data;
bool is_load = false;
bool is_input = false;
bool is_interp = false;
if (!filter_io_instr(intr, &is_load, &is_input, &is_interp))
return false;
if (is_input || is_load)
return false;
nir_io_semantics s = nir_intrinsic_io_semantics(intr);
if (wc->slot < s.location || wc->slot >= s.location + s.num_slots)
return false;
unsigned location = s.location;
unsigned c = nir_intrinsic_component(intr);
uint32_t wrmask = nir_intrinsic_write_mask(intr) << c;
if ((nir_intrinsic_src_type(intr) & NIR_ALU_TYPE_SIZE_MASK) == 64) {
unsigned num_components = intr->num_components * 2;
nir_src *src_offset = nir_get_io_offset_src(intr);
if (nir_src_is_const(*src_offset)) {
if (location + nir_src_as_uint(*src_offset) != wc->slot && num_components + c < 4)
return false;
}
wrmask |= wrmask << intr->num_components;
/* handle dvec3/dvec4 */
if (num_components + c > 4)
wrmask >>= 4;
}
wc->component_mask |= wrmask;
return false;
}
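/* match producer outputs to consumer inputs: drop injected pointsize and xfb state that
 * doesn't belong to this stage pair, assign packed driver locations for both stages, and
 * zero-fill consumer reads of components the producer never writes
 */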
void
zink_compiler_assign_io(struct zink_screen *screen, nir_shader *producer, nir_shader *consumer)
{
unsigned reserved = 0;
unsigned char slot_map[VARYING_SLOT_MAX];
memset(slot_map, -1, sizeof(slot_map));
bool do_fixup = false;
nir_shader *nir = producer->info.stage == MESA_SHADER_TESS_CTRL ? producer : consumer;
nir_variable *var = nir_find_variable_with_location(producer, nir_var_shader_out, VARYING_SLOT_PSIZ);
if (var) {
bool can_remove = false;
if (!nir_find_variable_with_location(consumer, nir_var_shader_in, VARYING_SLOT_PSIZ)) {
/* maintenance5 guarantees "A default size of 1.0 is used if PointSize is not written" */
if (screen->info.have_KHR_maintenance5 && !var->data.explicit_xfb_buffer && delete_psiz_store(producer, true))
can_remove = !(producer->info.outputs_written & VARYING_BIT_PSIZ);
else if (consumer->info.stage != MESA_SHADER_FRAGMENT)
can_remove = !var->data.explicit_location;
}
/* remove injected pointsize from all but the last vertex stage */
if (can_remove) {
var->data.mode = nir_var_shader_temp;
nir_fixup_deref_modes(producer);
delete_psiz_store(producer, false);
NIR_PASS_V(producer, nir_remove_dead_variables, nir_var_shader_temp, NULL);
optimize_nir(producer, NULL, true);
}
}
if (consumer->info.stage != MESA_SHADER_FRAGMENT) {
producer->info.has_transform_feedback_varyings = false;
nir_foreach_shader_out_variable(var_out, producer)
var_out->data.explicit_xfb_buffer = false;
}
if (producer->info.stage == MESA_SHADER_TESS_CTRL) {
/* never assign from tcs -> tes, always invert */
nir_foreach_variable_with_modes(var_in, consumer, nir_var_shader_in)
assign_producer_var_io(consumer->info.stage, var_in, &reserved, slot_map);
nir_foreach_variable_with_modes_safe(var_out, producer, nir_var_shader_out) {
if (!assign_consumer_var_io(producer->info.stage, var_out, &reserved, slot_map))
/* this is an output, nothing more needs to be done for it to be dropped */
do_fixup = true;
}
} else {
nir_foreach_variable_with_modes(var_out, producer, nir_var_shader_out)
assign_producer_var_io(producer->info.stage, var_out, &reserved, slot_map);
nir_foreach_variable_with_modes_safe(var_in, consumer, nir_var_shader_in) {
if (!assign_consumer_var_io(consumer->info.stage, var_in, &reserved, slot_map)) {
do_fixup = true;
/* input needs to be rewritten */
nir_shader_instructions_pass(consumer, rewrite_read_as_0, nir_metadata_dominance, var_in);
}
}
if (consumer->info.stage == MESA_SHADER_FRAGMENT && screen->driver_workarounds.needs_sanitised_layer)
do_fixup |= clamp_layer_output(producer, consumer, &reserved);
}
nir_shader_gather_info(producer, nir_shader_get_entrypoint(producer));
if (producer->info.io_lowered && consumer->info.io_lowered) {
u_foreach_bit64(slot, producer->info.outputs_written & BITFIELD64_RANGE(VARYING_SLOT_VAR0, 31)) {
struct write_components wc = {slot, 0};
nir_shader_intrinsics_pass(producer, find_max_write_components, nir_metadata_all, &wc);
assert(wc.component_mask);
if (wc.component_mask != BITFIELD_MASK(4))
do_fixup |= nir_shader_intrinsics_pass(consumer, fill_zero_reads, nir_metadata_dominance, &wc);
}
}
if (!do_fixup)
return;
nir_fixup_deref_modes(nir);
NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_shader_temp, NULL);
optimize_nir(nir, NULL, true);
}
/* all types that hit this function contain something that is 64bit */
static const struct glsl_type *
rewrite_64bit_type(nir_shader *nir, const struct glsl_type *type, nir_variable *var, bool doubles_only)
{
if (glsl_type_is_array(type)) {
const struct glsl_type *child = glsl_get_array_element(type);
unsigned elements = glsl_array_size(type);
unsigned stride = glsl_get_explicit_stride(type);
return glsl_array_type(rewrite_64bit_type(nir, child, var, doubles_only), elements, stride);
}
/* rewrite structs recursively */
if (glsl_type_is_struct_or_ifc(type)) {
unsigned nmembers = glsl_get_length(type);
struct glsl_struct_field *fields = rzalloc_array(nir, struct glsl_struct_field, nmembers * 2);
unsigned xfb_offset = 0;
for (unsigned i = 0; i < nmembers; i++) {
const struct glsl_struct_field *f = glsl_get_struct_field_data(type, i);
fields[i] = *f;
xfb_offset += glsl_get_component_slots(fields[i].type) * 4;
if (i < nmembers - 1 && xfb_offset % 8 &&
(glsl_contains_double(glsl_get_struct_field(type, i + 1)) ||
(glsl_type_contains_64bit(glsl_get_struct_field(type, i + 1)) && !doubles_only))) {
var->data.is_xfb = true;
}
fields[i].type = rewrite_64bit_type(nir, f->type, var, doubles_only);
}
return glsl_struct_type(fields, nmembers, glsl_get_type_name(type), glsl_struct_type_is_packed(type));
}
if (!glsl_type_is_64bit(type) || (!glsl_contains_double(type) && doubles_only))
return type;
if (doubles_only && glsl_type_is_vector_or_scalar(type))
return glsl_vector_type(GLSL_TYPE_UINT64, glsl_get_vector_elements(type));
enum glsl_base_type base_type;
switch (glsl_get_base_type(type)) {
case GLSL_TYPE_UINT64:
base_type = GLSL_TYPE_UINT;
break;
case GLSL_TYPE_INT64:
base_type = GLSL_TYPE_INT;
break;
case GLSL_TYPE_DOUBLE:
base_type = GLSL_TYPE_FLOAT;
break;
default:
unreachable("unknown 64-bit vertex attribute format!");
}
if (glsl_type_is_scalar(type))
return glsl_vector_type(base_type, 2);
unsigned num_components;
if (glsl_type_is_matrix(type)) {
/* align to vec4 size: dvec3-composed arrays are arrays of dvec3s */
unsigned vec_components = glsl_get_vector_elements(type);
if (vec_components == 3)
vec_components = 4;
num_components = vec_components * 2 * glsl_get_matrix_columns(type);
} else {
num_components = glsl_get_vector_elements(type) * 2;
if (num_components <= 4)
return glsl_vector_type(base_type, num_components);
}
/* dvec3/dvec4/dmatX: rewrite as struct { vec4, vec4, vec4, ... [vec2] } */
struct glsl_struct_field fields[8] = {0};
unsigned remaining = num_components;
unsigned nfields = 0;
for (unsigned i = 0; remaining; i++, remaining -= MIN2(4, remaining), nfields++) {
assert(i < ARRAY_SIZE(fields));
fields[i].name = "";
fields[i].offset = i * 16;
fields[i].type = glsl_vector_type(base_type, MIN2(4, remaining));
}
char buf[64];
snprintf(buf, sizeof(buf), "struct(%s)", glsl_get_type_name(type));
return glsl_struct_type(fields, nfields, buf, true);
}
static const struct glsl_type *
deref_is_matrix(nir_deref_instr *deref)
{
if (glsl_type_is_matrix(deref->type))
return deref->type;
nir_deref_instr *parent = nir_deref_instr_parent(deref);
if (parent)
return deref_is_matrix(parent);
return NULL;
}
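/* rewrite loads/stores of a variable whose 64bit type has been rewritten by
 * rewrite_64bit_type: values are packed/unpacked via 2x32, and (possibly indirect)
 * matrix row accesses are expanded into per-column branches
 */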
static bool
lower_64bit_vars_function(nir_shader *shader, nir_function_impl *impl, nir_variable *var,
struct hash_table *derefs, struct set *deletes, bool doubles_only)
{
bool func_progress = false;
nir_builder b = nir_builder_create(impl);
nir_foreach_block(block, impl) {
nir_foreach_instr_safe(instr, block) {
switch (instr->type) {
case nir_instr_type_deref: {
nir_deref_instr *deref = nir_instr_as_deref(instr);
if (!(deref->modes & var->data.mode))
continue;
if (nir_deref_instr_get_variable(deref) != var)
continue;
/* matrix types are special: store the original deref type for later use */
const struct glsl_type *matrix = deref_is_matrix(deref);
nir_deref_instr *parent = nir_deref_instr_parent(deref);
if (!matrix) {
/* if this isn't a direct matrix deref, it's maybe a matrix row deref */
hash_table_foreach(derefs, he) {
/* propagate parent matrix type to row deref */
if (he->key == parent)
matrix = he->data;
}
}
if (matrix)
_mesa_hash_table_insert(derefs, deref, (void*)matrix);
if (deref->deref_type == nir_deref_type_var)
deref->type = var->type;
else
deref->type = rewrite_64bit_type(shader, deref->type, var, doubles_only);
}
break;
case nir_instr_type_intrinsic: {
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
if (intr->intrinsic != nir_intrinsic_store_deref &&
intr->intrinsic != nir_intrinsic_load_deref)
break;
if (nir_intrinsic_get_var(intr, 0) != var)
break;
if ((intr->intrinsic == nir_intrinsic_store_deref && intr->src[1].ssa->bit_size != 64) ||
(intr->intrinsic == nir_intrinsic_load_deref && intr->def.bit_size != 64))
break;
b.cursor = nir_before_instr(instr);
nir_deref_instr *deref = nir_src_as_deref(intr->src[0]);
unsigned num_components = intr->num_components * 2;
nir_def *comp[NIR_MAX_VEC_COMPONENTS];
/* this is the stored matrix type from the deref */
struct hash_entry *he = _mesa_hash_table_search(derefs, deref);
const struct glsl_type *matrix = he ? he->data : NULL;
if (doubles_only && !matrix)
break;
func_progress = true;
if (intr->intrinsic == nir_intrinsic_store_deref) {
/* first, unpack the src data to 32bit vec2 components */
for (unsigned i = 0; i < intr->num_components; i++) {
nir_def *ssa = nir_unpack_64_2x32(&b, nir_channel(&b, intr->src[1].ssa, i));
comp[i * 2] = nir_channel(&b, ssa, 0);
comp[i * 2 + 1] = nir_channel(&b, ssa, 1);
}
unsigned wrmask = nir_intrinsic_write_mask(intr);
unsigned mask = 0;
/* expand writemask for doubled components */
for (unsigned i = 0; i < intr->num_components; i++) {
if (wrmask & BITFIELD_BIT(i))
mask |= BITFIELD_BIT(i * 2) | BITFIELD_BIT(i * 2 + 1);
}
if (matrix) {
/* matrix types always come from array (row) derefs */
assert(deref->deref_type == nir_deref_type_array);
nir_deref_instr *var_deref = nir_deref_instr_parent(deref);
/* let optimization clean up consts later */
nir_def *index = deref->arr.index.ssa;
/* this might be an indirect array index:
* - iterate over matrix columns
* - add if blocks for each column
* - perform the store in the block
*/
for (unsigned idx = 0; idx < glsl_get_matrix_columns(matrix); idx++) {
nir_push_if(&b, nir_ieq_imm(&b, index, idx));
unsigned vec_components = glsl_get_vector_elements(matrix);
/* always clamp dvec3 to 4 components */
if (vec_components == 3)
vec_components = 4;
unsigned start_component = idx * vec_components * 2;
/* struct member */
unsigned member = start_component / 4;
/* number of components remaining */
unsigned remaining = num_components;
for (unsigned i = 0; i < num_components; member++) {
if (!(mask & BITFIELD_BIT(i)))
continue;
assert(member < glsl_get_length(var_deref->type));
/* deref the rewritten struct to the appropriate vec4/vec2 */
nir_deref_instr *strct = nir_build_deref_struct(&b, var_deref, member);
unsigned incr = MIN2(remaining, 4);
/* assemble the write component vec */
nir_def *val = nir_vec(&b, &comp[i], incr);
/* use the number of components being written as the writemask */
if (glsl_get_vector_elements(strct->type) > val->num_components)
val = nir_pad_vector(&b, val, glsl_get_vector_elements(strct->type));
nir_store_deref(&b, strct, val, BITFIELD_MASK(incr));
remaining -= incr;
i += incr;
}
nir_pop_if(&b, NULL);
}
_mesa_set_add(deletes, &deref->instr);
} else if (num_components <= 4) {
/* simple store case: just write out the components */
nir_def *dest = nir_vec(&b, comp, num_components);
nir_store_deref(&b, deref, dest, mask);
} else {
/* writing > 4 components: access the struct and write to the appropriate vec4 members */
for (unsigned i = 0; num_components; i++, num_components -= MIN2(num_components, 4)) {
if (!(mask & BITFIELD_MASK(4)))
continue;
nir_deref_instr *strct = nir_build_deref_struct(&b, deref, i);
nir_def *dest = nir_vec(&b, &comp[i * 4], MIN2(num_components, 4));
if (glsl_get_vector_elements(strct->type) > dest->num_components)
dest = nir_pad_vector(&b, dest, glsl_get_vector_elements(strct->type));
nir_store_deref(&b, strct, dest, mask & BITFIELD_MASK(4));
mask >>= 4;
}
}
} else {
nir_def *dest = NULL;
if (matrix) {
/* matrix types always come from array (row) derefs */
assert(deref->deref_type == nir_deref_type_array);
nir_deref_instr *var_deref = nir_deref_instr_parent(deref);
/* let optimization clean up consts later */
nir_def *index = deref->arr.index.ssa;
/* this might be an indirect array index:
* - iterate over matrix columns
* - add if blocks for each column
* - phi the loads using the array index
*/
unsigned cols = glsl_get_matrix_columns(matrix);
nir_def *dests[4];
for (unsigned idx = 0; idx < cols; idx++) {
/* don't add an if for the final row: this will be handled in the else */
if (idx < cols - 1)
nir_push_if(&b, nir_ieq_imm(&b, index, idx));
unsigned vec_components = glsl_get_vector_elements(matrix);
/* always clamp dvec3 to 4 components */
if (vec_components == 3)
vec_components = 4;
unsigned start_component = idx * vec_components * 2;
/* struct member */
unsigned member = start_component / 4;
/* number of components remaining */
unsigned remaining = num_components;
/* component index */
unsigned comp_idx = 0;
for (unsigned i = 0; i < num_components; member++) {
assert(member < glsl_get_length(var_deref->type));
nir_deref_instr *strct = nir_build_deref_struct(&b, var_deref, member);
nir_def *load = nir_load_deref(&b, strct);
unsigned incr = MIN2(remaining, 4);
/* repack the loads to 64bit */
for (unsigned c = 0; c < incr / 2; c++, comp_idx++)
comp[comp_idx] = nir_pack_64_2x32(&b, nir_channels(&b, load, BITFIELD_RANGE(c * 2, 2)));
remaining -= incr;
i += incr;
}
dest = dests[idx] = nir_vec(&b, comp, intr->num_components);
if (idx < cols - 1)
nir_push_else(&b, NULL);
}
/* loop over all the if blocks that were made, pop them, and phi the loaded+packed results */
for (unsigned idx = cols - 1; idx >= 1; idx--) {
nir_pop_if(&b, NULL);
dest = nir_if_phi(&b, dests[idx - 1], dest);
}
_mesa_set_add(deletes, &deref->instr);
} else if (num_components <= 4) {
/* simple load case */
nir_def *load = nir_load_deref(&b, deref);
/* pack 32bit loads into 64bit: this will automagically get optimized out later */
for (unsigned i = 0; i < intr->num_components; i++) {
comp[i] = nir_pack_64_2x32(&b, nir_channels(&b, load, BITFIELD_RANGE(i * 2, 2)));
}
dest = nir_vec(&b, comp, intr->num_components);
} else {
                  /* reading > 4 components: access the struct and load the appropriate vec4 members */
for (unsigned i = 0; i < 2; i++, num_components -= 4) {
nir_deref_instr *strct = nir_build_deref_struct(&b, deref, i);
nir_def *load = nir_load_deref(&b, strct);
comp[i * 2] = nir_pack_64_2x32(&b,
nir_trim_vector(&b, load, 2));
if (num_components > 2)
comp[i * 2 + 1] = nir_pack_64_2x32(&b, nir_channels(&b, load, BITFIELD_RANGE(2, 2)));
}
dest = nir_vec(&b, comp, intr->num_components);
}
nir_def_rewrite_uses_after(&intr->def, dest, instr);
}
_mesa_set_add(deletes, instr);
break;
}
default: break;
}
}
}
if (func_progress)
nir_metadata_preserve(impl, nir_metadata_none);
/* derefs must be queued for deletion to avoid deleting the same deref repeatedly */
set_foreach_remove(deletes, he)
nir_instr_remove((void*)he->key);
return func_progress;
}
static bool
lower_64bit_vars_loop(nir_shader *shader, nir_variable *var, struct hash_table *derefs,
struct set *deletes, bool doubles_only)
{
if (!glsl_type_contains_64bit(var->type) || (doubles_only && !glsl_contains_double(var->type)))
return false;
var->type = rewrite_64bit_type(shader, var->type, var, doubles_only);
/* once type is rewritten, rewrite all loads and stores */
nir_foreach_function_impl(impl, shader)
lower_64bit_vars_function(shader, impl, var, derefs, deletes, doubles_only);
return true;
}
/* rewrite all input/output variables using 32bit types and load/stores */
static bool
lower_64bit_vars(nir_shader *shader, bool doubles_only)
{
bool progress = false;
struct hash_table *derefs = _mesa_hash_table_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);
struct set *deletes = _mesa_set_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);
nir_foreach_function_impl(impl, shader) {
nir_foreach_function_temp_variable(var, impl) {
if (!glsl_type_contains_64bit(var->type) || (doubles_only && !glsl_contains_double(var->type)))
continue;
var->type = rewrite_64bit_type(shader, var->type, var, doubles_only);
progress |= lower_64bit_vars_function(shader, impl, var, derefs, deletes, doubles_only);
}
}
ralloc_free(deletes);
ralloc_free(derefs);
if (progress) {
nir_lower_alu_to_scalar(shader, filter_64_bit_instr, NULL);
nir_lower_phis_to_scalar(shader, false);
optimize_nir(shader, NULL, true);
}
return progress;
}
static void
zink_shader_dump(const struct zink_shader *zs, void *words, size_t size, const char *file)
{
FILE *fp = fopen(file, "wb");
if (fp) {
fwrite(words, 1, size, fp);
fclose(fp);
fprintf(stderr, "wrote %s shader '%s'...\n", _mesa_shader_stage_to_string(zs->info.stage), file);
}
}
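/* stages allowed to follow 'stage', used for VkShaderCreateInfoEXT::nextStage */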
static VkShaderStageFlagBits
zink_get_next_stage(gl_shader_stage stage)
{
switch (stage) {
case MESA_SHADER_VERTEX:
return VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT |
VK_SHADER_STAGE_GEOMETRY_BIT |
VK_SHADER_STAGE_FRAGMENT_BIT;
case MESA_SHADER_TESS_CTRL:
return VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT;
case MESA_SHADER_TESS_EVAL:
return VK_SHADER_STAGE_GEOMETRY_BIT |
VK_SHADER_STAGE_FRAGMENT_BIT;
case MESA_SHADER_GEOMETRY:
return VK_SHADER_STAGE_FRAGMENT_BIT;
case MESA_SHADER_FRAGMENT:
case MESA_SHADER_COMPUTE:
case MESA_SHADER_KERNEL:
return 0;
default:
unreachable("invalid shader stage");
}
}
struct zink_shader_object
zink_shader_spirv_compile(struct zink_screen *screen, struct zink_shader *zs, struct spirv_shader *spirv, bool can_shobj, struct zink_program *pg)
{
VkShaderModuleCreateInfo smci = {0};
VkShaderCreateInfoEXT sci = {0};
if (!spirv)
spirv = zs->spirv;
if (zink_debug & ZINK_DEBUG_SPIRV) {
char buf[256];
static int i;
snprintf(buf, sizeof(buf), "dump%02d.spv", i++);
zink_shader_dump(zs, spirv->words, spirv->num_words * sizeof(uint32_t), buf);
}
sci.sType = VK_STRUCTURE_TYPE_SHADER_CREATE_INFO_EXT;
sci.stage = mesa_to_vk_shader_stage(zs->info.stage);
sci.nextStage = zink_get_next_stage(zs->info.stage);
sci.codeType = VK_SHADER_CODE_TYPE_SPIRV_EXT;
sci.codeSize = spirv->num_words * sizeof(uint32_t);
sci.pCode = spirv->words;
sci.pName = "main";
VkDescriptorSetLayout dsl[ZINK_GFX_SHADER_COUNT] = {0};
if (pg) {
sci.setLayoutCount = pg->num_dsl;
sci.pSetLayouts = pg->dsl;
} else {
sci.setLayoutCount = zs->info.stage + 1;
      dsl[zs->info.stage] = zs->precompile.dsl;
sci.pSetLayouts = dsl;
}
VkPushConstantRange pcr;
pcr.stageFlags = VK_SHADER_STAGE_ALL_GRAPHICS;
pcr.offset = 0;
pcr.size = sizeof(struct zink_gfx_push_constant);
sci.pushConstantRangeCount = 1;
sci.pPushConstantRanges = &pcr;
smci.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO;
smci.codeSize = spirv->num_words * sizeof(uint32_t);
smci.pCode = spirv->words;
#ifndef NDEBUG
if (zink_debug & ZINK_DEBUG_VALIDATION) {
static const struct spirv_to_nir_options spirv_options = {
.environment = NIR_SPIRV_VULKAN,
.caps = {
.float64 = true,
.int16 = true,
.int64 = true,
.tessellation = true,
.float_controls = true,
.image_ms_array = true,
.image_read_without_format = true,
.image_write_without_format = true,
.storage_image_ms = true,
.geometry_streams = true,
.storage_8bit = true,
.storage_16bit = true,
.variable_pointers = true,
.stencil_export = true,
.post_depth_coverage = true,
.transform_feedback = true,
.device_group = true,
.draw_parameters = true,
.shader_viewport_index_layer = true,
.multiview = true,
.physical_storage_buffer_address = true,
.int64_atomics = true,
.subgroup_arithmetic = true,
.subgroup_basic = true,
.subgroup_ballot = true,
.subgroup_quad = true,
.subgroup_shuffle = true,
.subgroup_vote = true,
.vk_memory_model = true,
.vk_memory_model_device_scope = true,
.int8 = true,
.float16 = true,
.demote_to_helper_invocation = true,
.sparse_residency = true,
.min_lod = true,
.workgroup_memory_explicit_layout = true,
},
.ubo_addr_format = nir_address_format_32bit_index_offset,
.ssbo_addr_format = nir_address_format_32bit_index_offset,
.phys_ssbo_addr_format = nir_address_format_64bit_global,
.push_const_addr_format = nir_address_format_logical,
.shared_addr_format = nir_address_format_32bit_offset,
};
uint32_t num_spec_entries = 0;
struct nir_spirv_specialization *spec_entries = NULL;
VkSpecializationInfo sinfo = {0};
VkSpecializationMapEntry me[3];
uint32_t size[3] = {1,1,1};
if (!zs->info.workgroup_size[0]) {
sinfo.mapEntryCount = 3;
sinfo.pMapEntries = &me[0];
sinfo.dataSize = sizeof(uint32_t) * 3;
sinfo.pData = size;
uint32_t ids[] = {ZINK_WORKGROUP_SIZE_X, ZINK_WORKGROUP_SIZE_Y, ZINK_WORKGROUP_SIZE_Z};
for (int i = 0; i < 3; i++) {
me[i].size = sizeof(uint32_t);
me[i].constantID = ids[i];
me[i].offset = i * sizeof(uint32_t);
}
spec_entries = vk_spec_info_to_nir_spirv(&sinfo, &num_spec_entries);
}
nir_shader *nir = spirv_to_nir(spirv->words, spirv->num_words,
spec_entries, num_spec_entries,
clamp_stage(&zs->info), "main", &spirv_options, &screen->nir_options);
assert(nir);
ralloc_free(nir);
free(spec_entries);
}
#endif
VkResult ret;
struct zink_shader_object obj = {0};
if (!can_shobj || !screen->info.have_EXT_shader_object)
ret = VKSCR(CreateShaderModule)(screen->dev, &smci, NULL, &obj.mod);
else
ret = VKSCR(CreateShadersEXT)(screen->dev, 1, &sci, NULL, &obj.obj);
ASSERTED bool success = zink_screen_handle_vkresult(screen, ret);
assert(success);
return obj;
}
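/* demote i/o variables that are never accessed by derefs or i/o intrinsics so they can be removed */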
static void
prune_io(nir_shader *nir)
{
nir_foreach_shader_in_variable_safe(var, nir) {
if (!find_var_deref(nir, var) && !find_var_io(nir, var))
var->data.mode = nir_var_shader_temp;
}
nir_foreach_shader_out_variable_safe(var, nir) {
if (!find_var_deref(nir, var) && !find_var_io(nir, var))
var->data.mode = nir_var_shader_temp;
}
NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_shader_temp, NULL);
}
static void
flag_shadow_tex(nir_variable *var, struct zink_shader *zs)
{
/* unconvert from zink_binding() */
uint32_t sampler_id = var->data.binding - (PIPE_MAX_SAMPLERS * MESA_SHADER_FRAGMENT);
assert(sampler_id < 32); //bitfield size for tracking
zs->fs.legacy_shadow_mask |= BITFIELD_BIT(sampler_id);
}
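/* adjust a tex instr's destination to match the sampler's result type/bit size;
 * legacy (vec4) shadow sampling is converted to new-style scalar shadow sampling,
 * flagging a recompile if more than the first component is actually read
 */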
static nir_def *
rewrite_tex_dest(nir_builder *b, nir_tex_instr *tex, nir_variable *var, struct zink_shader *zs)
{
assert(var);
const struct glsl_type *type = glsl_without_array(var->type);
enum glsl_base_type ret_type = glsl_get_sampler_result_type(type);
bool is_int = glsl_base_type_is_integer(ret_type);
unsigned bit_size = glsl_base_type_get_bit_size(ret_type);
unsigned dest_size = tex->def.bit_size;
b->cursor = nir_after_instr(&tex->instr);
unsigned num_components = tex->def.num_components;
bool rewrite_depth = tex->is_shadow && num_components > 1 && tex->op != nir_texop_tg4 && !tex->is_sparse;
if (bit_size == dest_size && !rewrite_depth)
return NULL;
nir_def *dest = &tex->def;
if (rewrite_depth && zs) {
if (nir_def_components_read(dest) & ~1) {
/* this needs recompiles */
if (b->shader->info.stage == MESA_SHADER_FRAGMENT)
flag_shadow_tex(var, zs);
else
mesa_loge("unhandled old-style shadow sampler in non-fragment stage!");
return NULL;
}
/* If only .x is used in the NIR, then it's effectively not a legacy depth
* sample anyway and we don't want to ask for shader recompiles. This is
* the typical path, since GL_DEPTH_TEXTURE_MODE defaults to either RED or
* LUMINANCE, so apps just use the first channel.
*/
tex->def.num_components = 1;
tex->is_new_style_shadow = true;
}
if (bit_size != dest_size) {
tex->def.bit_size = bit_size;
tex->dest_type = nir_get_nir_type_for_glsl_base_type(ret_type);
if (is_int) {
if (glsl_unsigned_base_type_of(ret_type) == ret_type)
dest = nir_u2uN(b, &tex->def, dest_size);
else
dest = nir_i2iN(b, &tex->def, dest_size);
} else {
dest = nir_f2fN(b, &tex->def, dest_size);
}
if (rewrite_depth)
return dest;
nir_def_rewrite_uses_after(&tex->def, dest, dest->parent_instr);
} else if (rewrite_depth) {
return dest;
}
return dest;
}
struct lower_zs_swizzle_state {
bool shadow_only;
unsigned base_sampler_id;
const struct zink_zs_swizzle_key *swizzle;
};
static bool
lower_zs_swizzle_tex_instr(nir_builder *b, nir_instr *instr, void *data)
{
struct lower_zs_swizzle_state *state = data;
const struct zink_zs_swizzle_key *swizzle_key = state->swizzle;
assert(state->shadow_only || swizzle_key);
if (instr->type != nir_instr_type_tex)
return false;
nir_tex_instr *tex = nir_instr_as_tex(instr);
if (tex->op == nir_texop_txs || tex->op == nir_texop_lod ||
(!tex->is_shadow && state->shadow_only) || tex->is_new_style_shadow)
return false;
if (tex->is_shadow && tex->op == nir_texop_tg4)
/* Will not even try to emulate the shadow comparison */
return false;
int handle = nir_tex_instr_src_index(tex, nir_tex_src_texture_handle);
nir_variable *var = NULL;
if (handle != -1)
/* gtfo bindless depth texture mode */
return false;
nir_foreach_variable_with_modes(img, b->shader, nir_var_uniform) {
if (glsl_type_is_sampler(glsl_without_array(img->type))) {
unsigned size = glsl_type_is_array(img->type) ? glsl_get_aoa_size(img->type) : 1;
if (tex->texture_index >= img->data.driver_location &&
tex->texture_index < img->data.driver_location + size) {
var = img;
break;
}
}
}
assert(var);
uint32_t sampler_id = var->data.binding - state->base_sampler_id;
const struct glsl_type *type = glsl_without_array(var->type);
enum glsl_base_type ret_type = glsl_get_sampler_result_type(type);
bool is_int = glsl_base_type_is_integer(ret_type);
unsigned num_components = tex->def.num_components;
if (tex->is_shadow)
tex->is_new_style_shadow = true;
nir_def *dest = rewrite_tex_dest(b, tex, var, NULL);
assert(dest || !state->shadow_only);
if (!dest && !(swizzle_key->mask & BITFIELD_BIT(sampler_id)))
return false;
else if (!dest)
dest = &tex->def;
else
tex->def.num_components = 1;
if (swizzle_key && (swizzle_key->mask & BITFIELD_BIT(sampler_id))) {
/* these require manual swizzles */
if (tex->op == nir_texop_tg4) {
assert(!tex->is_shadow);
nir_def *swizzle;
switch (swizzle_key->swizzle[sampler_id].s[tex->component]) {
case PIPE_SWIZZLE_0:
swizzle = nir_imm_zero(b, 4, tex->def.bit_size);
break;
         case PIPE_SWIZZLE_1: {
            /* PIPE_SWIZZLE_1 reads as a constant 1 in every gathered component */
            nir_def *one = is_int ? nir_imm_intN_t(b, 1, tex->def.bit_size) :
                                    nir_imm_floatN_t(b, 1, tex->def.bit_size);
            nir_def *ones[4] = {one, one, one, one};
            swizzle = nir_vec(b, ones, 4);
            break;
         }
default:
if (!tex->component)
return false;
tex->component = 0;
return true;
}
nir_def_rewrite_uses_after(dest, swizzle, swizzle->parent_instr);
return true;
}
nir_def *vec[4];
for (unsigned i = 0; i < ARRAY_SIZE(vec); i++) {
switch (swizzle_key->swizzle[sampler_id].s[i]) {
case PIPE_SWIZZLE_0:
vec[i] = nir_imm_zero(b, 1, tex->def.bit_size);
break;
case PIPE_SWIZZLE_1:
if (is_int)
vec[i] = nir_imm_intN_t(b, 1, tex->def.bit_size);
else
vec[i] = nir_imm_floatN_t(b, 1, tex->def.bit_size);
break;
default:
vec[i] = dest->num_components == 1 ? dest : nir_channel(b, dest, i);
break;
}
}
nir_def *swizzle = nir_vec(b, vec, num_components);
nir_def_rewrite_uses_after(dest, swizzle, swizzle->parent_instr);
} else {
assert(tex->is_shadow);
nir_def *vec[4] = {dest, dest, dest, dest};
nir_def *splat = nir_vec(b, vec, num_components);
nir_def_rewrite_uses_after(dest, splat, splat->parent_instr);
}
return true;
}
/* Applies in-shader swizzles when necessary for depth/shadow sampling.
*
* SPIRV only has new-style (scalar result) shadow sampling, so to emulate
* !is_new_style_shadow (vec4 result) shadow sampling we lower to a
* new-style-shadow sample, and apply GL_DEPTH_TEXTURE_MODE swizzles in the NIR
* shader to expand out to vec4. Since this depends on sampler state, it's a
* draw-time shader recompile to do so.
*
* We may also need to apply shader swizzles for
* driver_workarounds.needs_zs_shader_swizzle.
*/
static bool
lower_zs_swizzle_tex(nir_shader *nir, const void *swizzle, bool shadow_only)
{
/* We don't use nir_lower_tex to do our swizzling, because of this base_sampler_id. */
unsigned base_sampler_id = gl_shader_stage_is_compute(nir->info.stage) ? 0 : PIPE_MAX_SAMPLERS * nir->info.stage;
struct lower_zs_swizzle_state state = {shadow_only, base_sampler_id, swizzle};
return nir_shader_instructions_pass(nir, lower_zs_swizzle_tex_instr, nir_metadata_dominance | nir_metadata_block_index, (void*)&state);
}
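/* flip gl_PointCoord.y; used when the fs key sets point_coord_yinvert */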
static bool
invert_point_coord_instr(nir_builder *b, nir_intrinsic_instr *intr,
void *data)
{
if (intr->intrinsic != nir_intrinsic_load_point_coord)
return false;
b->cursor = nir_after_instr(&intr->instr);
nir_def *def = nir_vec2(b, nir_channel(b, &intr->def, 0),
nir_fsub_imm(b, 1.0, nir_channel(b, &intr->def, 1)));
nir_def_rewrite_uses_after(&intr->def, def, def->parent_instr);
return true;
}
static bool
invert_point_coord(nir_shader *nir)
{
if (!BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_POINT_COORD))
return false;
return nir_shader_intrinsics_pass(nir, invert_point_coord_instr,
nir_metadata_dominance, NULL);
}
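/* lower sparse residency handling: residency results become integer codes via
 * is_sparse_resident_zink, and residency query intrinsics are lowered to plain ALU ops
 */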
static bool
lower_sparse_instr(nir_builder *b, nir_instr *instr, void *data)
{
b->cursor = nir_after_instr(instr);
switch (instr->type) {
case nir_instr_type_tex: {
nir_tex_instr *tex = nir_instr_as_tex(instr);
if (!tex->is_sparse)
return false;
nir_def *res = nir_b2i32(b, nir_is_sparse_resident_zink(b, &tex->def));
nir_def *vec = nir_vector_insert_imm(b, &tex->def, res,
tex->def.num_components - 1);
nir_def_rewrite_uses_after(&tex->def, vec, vec->parent_instr);
return true;
}
case nir_instr_type_intrinsic: {
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
switch (intrin->intrinsic) {
case nir_intrinsic_image_deref_sparse_load: {
nir_def *res = nir_b2i32(b, nir_is_sparse_resident_zink(b, &intrin->def));
nir_def *vec = nir_vector_insert_imm(b, &intrin->def, res, 4);
nir_def_rewrite_uses_after(&intrin->def, vec, vec->parent_instr);
return true;
}
case nir_intrinsic_sparse_residency_code_and: {
nir_def *res = nir_iand(b, intrin->src[0].ssa, intrin->src[1].ssa);
nir_def_rewrite_uses(&intrin->def, res);
return true;
}
case nir_intrinsic_is_sparse_texels_resident: {
nir_def *res = nir_i2b(b, intrin->src[0].ssa);
nir_def_rewrite_uses(&intrin->def, res);
return true;
}
default:
return false;
}
}
default:
return false;
}
}
static bool
lower_sparse(nir_shader *shader)
{
return nir_shader_instructions_pass(shader, lower_sparse_instr,
nir_metadata_dominance, NULL);
}
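/* rewrite lowered i/o intrinsics (load_input/store_output/interp) back into variable derefs
 * by matching their location/component against the shader's i/o variables
 */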
static bool
add_derefs_instr(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
bool is_load = false;
bool is_input = false;
bool is_interp = false;
if (!filter_io_instr(intr, &is_load, &is_input, &is_interp))
return false;
bool is_special_io = (b->shader->info.stage == MESA_SHADER_VERTEX && is_input) ||
(b->shader->info.stage == MESA_SHADER_FRAGMENT && !is_input);
unsigned loc = nir_intrinsic_io_semantics(intr).location;
nir_src *src_offset = nir_get_io_offset_src(intr);
const unsigned slot_offset = src_offset && nir_src_is_const(*src_offset) ? nir_src_as_uint(*src_offset) : 0;
unsigned location = loc + slot_offset;
unsigned frac = nir_intrinsic_component(intr);
unsigned bit_size = is_load ? intr->def.bit_size : nir_src_bit_size(intr->src[0]);
/* set c aligned/rounded down to dword */
unsigned c = frac;
if (frac && bit_size < 32)
c = frac * bit_size / 32;
/* loop over all the variables and rewrite corresponding access */
nir_foreach_variable_with_modes(var, b->shader, is_input ? nir_var_shader_in : nir_var_shader_out) {
const struct glsl_type *type = var->type;
if (nir_is_arrayed_io(var, b->shader->info.stage))
type = glsl_get_array_element(type);
unsigned slot_count = get_var_slot_count(b->shader, var);
/* filter access that isn't specific to this variable */
if (var->data.location > location || var->data.location + slot_count <= location)
continue;
if (var->data.fb_fetch_output != nir_intrinsic_io_semantics(intr).fb_fetch_output)
continue;
if (b->shader->info.stage == MESA_SHADER_FRAGMENT && !is_load && nir_intrinsic_io_semantics(intr).dual_source_blend_index != var->data.index)
continue;
unsigned size = 0;
bool is_struct = glsl_type_is_struct(glsl_without_array(type));
if (is_struct)
size = get_slot_components(var, var->data.location + slot_offset, var->data.location);
else if (!is_special_io && var->data.compact)
size = glsl_get_aoa_size(type);
else
size = glsl_get_vector_elements(glsl_without_array(type));
assert(size);
if (glsl_type_is_64bit(glsl_without_array(var->type)))
size *= 2;
if (var->data.location != location && size > 4 && size % 4 && !is_struct) {
/* adjust for dvec3-type slot overflow */
assert(location > var->data.location);
size -= (location - var->data.location) * 4;
}
assert(size);
if (var->data.location_frac + size <= c || var->data.location_frac > c)
continue;
b->cursor = nir_before_instr(&intr->instr);
nir_deref_instr *deref = nir_build_deref_var(b, var);
if (nir_is_arrayed_io(var, b->shader->info.stage)) {
assert(intr->intrinsic != nir_intrinsic_store_output);
deref = nir_build_deref_array(b, deref, intr->src[!is_load].ssa);
}
if (glsl_type_is_array(type)) {
/* unroll array derefs */
unsigned idx = var->data.compact ? (frac - var->data.location_frac) : 0;
assert(src_offset);
if (var->data.location < VARYING_SLOT_VAR0) {
if (src_offset) {
/* clip/cull dist and tess levels use different array offset semantics */
bool is_clipdist = (b->shader->info.stage != MESA_SHADER_VERTEX || var->data.mode == nir_var_shader_out) &&
is_clipcull_dist(var->data.location);
               bool is_tess_level = b->shader->info.stage == MESA_SHADER_TESS_CTRL &&
                                    (var->data.location == VARYING_SLOT_TESS_LEVEL_INNER ||
                                     var->data.location == VARYING_SLOT_TESS_LEVEL_OUTER);
               bool is_builtin_array = is_clipdist || is_tess_level;
               /* this is explicit for ease of debugging but could be collapsed at some point in the future */
if (nir_src_is_const(*src_offset)) {
unsigned offset = slot_offset;
if (is_builtin_array)
offset *= 4;
if (is_clipdist) {
if (loc == VARYING_SLOT_CLIP_DIST1 || loc == VARYING_SLOT_CULL_DIST1)
offset += 4;
}
deref = nir_build_deref_array_imm(b, deref, offset + idx);
} else {
                  nir_def *offset = src_offset->ssa;
                  if (is_builtin_array)
                     offset = nir_imul_imm(b, offset, 4);
                  deref = nir_build_deref_array(b, deref, idx ? nir_iadd_imm(b, offset, idx) : offset);
}
} else {
deref = nir_build_deref_array_imm(b, deref, idx);
}
type = glsl_get_array_element(type);
} else {
idx += location - var->data.location;
/* need to convert possible N*M to [N][M] */
nir_def *nm = idx ? nir_iadd_imm(b, src_offset->ssa, idx) : src_offset->ssa;
while (glsl_type_is_array(type)) {
const struct glsl_type *elem = glsl_get_array_element(type);
unsigned type_size = glsl_count_vec4_slots(elem, false, false);
nir_def *n = glsl_type_is_array(elem) ? nir_udiv_imm(b, nm, type_size) : nm;
if (glsl_type_is_vector_or_scalar(elem) && glsl_type_is_64bit(elem) && glsl_get_vector_elements(elem) > 2)
n = nir_udiv_imm(b, n, 2);
deref = nir_build_deref_array(b, deref, n);
nm = nir_umod_imm(b, nm, type_size);
type = glsl_get_array_element(type);
}
}
} else if (glsl_type_is_struct(type)) {
deref = nir_build_deref_struct(b, deref, slot_offset);
}
assert(!glsl_type_is_array(type));
unsigned num_components = glsl_get_vector_elements(type);
if (is_load) {
nir_def *load;
if (is_interp) {
nir_def *interp = intr->src[0].ssa;
nir_intrinsic_instr *interp_intr = nir_instr_as_intrinsic(interp->parent_instr);
assert(interp_intr);
var->data.interpolation = nir_intrinsic_interp_mode(interp_intr);
switch (interp_intr->intrinsic) {
case nir_intrinsic_load_barycentric_centroid:
load = nir_interp_deref_at_centroid(b, num_components, bit_size, &deref->def);
break;
case nir_intrinsic_load_barycentric_sample:
var->data.sample = 1;
load = nir_load_deref(b, deref);
break;
case nir_intrinsic_load_barycentric_pixel:
load = nir_load_deref(b, deref);
break;
case nir_intrinsic_load_barycentric_at_sample:
load = nir_interp_deref_at_sample(b, num_components, bit_size, &deref->def, interp_intr->src[0].ssa);
break;
case nir_intrinsic_load_barycentric_at_offset:
load = nir_interp_deref_at_offset(b, num_components, bit_size, &deref->def, interp_intr->src[0].ssa);
break;
default:
unreachable("unhandled interp!");
}
} else {
load = nir_load_deref(b, deref);
}
/* filter needed components */
if (intr->num_components < load->num_components)
load = nir_channels(b, load, BITFIELD_MASK(intr->num_components) << (c - var->data.location_frac));
nir_def_rewrite_uses(&intr->def, load);
} else {
nir_def *store = intr->src[0].ssa;
/* pad/filter components to match deref type */
if (intr->num_components < num_components) {
nir_def *zero = nir_imm_zero(b, 1, bit_size);
nir_def *vec[4] = {zero, zero, zero, zero};
u_foreach_bit(i, nir_intrinsic_write_mask(intr))
vec[c - var->data.location_frac + i] = nir_channel(b, store, i);
store = nir_vec(b, vec, num_components);
         } else if (store->num_components > num_components) {
store = nir_channels(b, store, nir_intrinsic_write_mask(intr));
}
if (store->bit_size != glsl_get_bit_size(type)) {
/* this should be some weird bindless io conversion */
assert(store->bit_size == 64 && glsl_get_bit_size(type) == 32);
assert(num_components != store->num_components);
store = nir_unpack_64_2x32(b, store);
}
nir_store_deref(b, deref, store, BITFIELD_RANGE(c - var->data.location_frac, intr->num_components));
}
nir_instr_remove(&intr->instr);
return true;
}
unreachable("failed to find variable for explicit io!");
return true;
}
static bool
add_derefs(nir_shader *nir)
{
return nir_shader_intrinsics_pass(nir, add_derefs_instr,
nir_metadata_dominance, NULL);
}
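/* final shared compile step: prune unused i/o, run divergence analysis where needed,
 * convert out of SSA, and hand the NIR to nir_to_spirv before creating the Vulkan shader object
 */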
static struct zink_shader_object
compile_module(struct zink_screen *screen, struct zink_shader *zs, nir_shader *nir, bool can_shobj, struct zink_program *pg)
{
struct zink_shader_info *sinfo = &zs->sinfo;
prune_io(nir);
switch (nir->info.stage) {
case MESA_SHADER_VERTEX:
case MESA_SHADER_TESS_EVAL:
case MESA_SHADER_GEOMETRY:
NIR_PASS_V(nir, nir_divergence_analysis);
break;
default: break;
}
NIR_PASS_V(nir, nir_convert_from_ssa, true);
if (zink_debug & (ZINK_DEBUG_NIR | ZINK_DEBUG_SPIRV))
nir_index_ssa_defs(nir_shader_get_entrypoint(nir));
if (zink_debug & ZINK_DEBUG_NIR) {
fprintf(stderr, "NIR shader:\n---8<---\n");
nir_print_shader(nir, stderr);
fprintf(stderr, "---8<---\n");
}
struct zink_shader_object obj = {0};
struct spirv_shader *spirv = nir_to_spirv(nir, sinfo, screen->spirv_version);
if (spirv)
obj = zink_shader_spirv_compile(screen, zs, spirv, can_shobj, pg);
/* TODO: determine if there's any reason to cache spirv output? */
if (zs->info.stage == MESA_SHADER_TESS_CTRL && zs->non_fs.is_generated)
zs->spirv = spirv;
else
obj.spirv = spirv;
return obj;
}
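/* interpolateAtSample() collapses to a plain load; used when the fs key reports zero samples */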
static bool
remove_interpolate_at_sample(struct nir_builder *b, nir_intrinsic_instr *interp, void *data)
{
if (interp->intrinsic != nir_intrinsic_interp_deref_at_sample)
return false;
b->cursor = nir_before_instr(&interp->instr);
nir_def *res = nir_load_deref(b, nir_src_as_deref(interp->src[0]));
nir_def_rewrite_uses(&interp->def, res);
return true;
}
struct zink_shader_object
zink_shader_compile(struct zink_screen *screen, bool can_shobj, struct zink_shader *zs,
nir_shader *nir, const struct zink_shader_key *key, const void *extra_data, struct zink_program *pg)
{
bool need_optimize = true;
bool inlined_uniforms = false;
NIR_PASS_V(nir, add_derefs);
NIR_PASS_V(nir, nir_lower_fragcolor, nir->info.fs.color_is_dual_source ? 1 : 8);
if (key) {
if (key->inline_uniforms) {
NIR_PASS_V(nir, nir_inline_uniforms,
nir->info.num_inlinable_uniforms,
key->base.inlined_uniform_values,
nir->info.inlinable_uniform_dw_offsets);
inlined_uniforms = true;
}
/* TODO: use a separate mem ctx here for ralloc */
if (!screen->optimal_keys) {
switch (zs->info.stage) {
case MESA_SHADER_VERTEX: {
uint32_t decomposed_attrs = 0, decomposed_attrs_without_w = 0;
const struct zink_vs_key *vs_key = zink_vs_key(key);
switch (vs_key->size) {
case 4:
decomposed_attrs = vs_key->u32.decomposed_attrs;
decomposed_attrs_without_w = vs_key->u32.decomposed_attrs_without_w;
break;
case 2:
decomposed_attrs = vs_key->u16.decomposed_attrs;
decomposed_attrs_without_w = vs_key->u16.decomposed_attrs_without_w;
break;
case 1:
decomposed_attrs = vs_key->u8.decomposed_attrs;
decomposed_attrs_without_w = vs_key->u8.decomposed_attrs_without_w;
break;
default: break;
}
if (decomposed_attrs || decomposed_attrs_without_w)
NIR_PASS_V(nir, decompose_attribs, decomposed_attrs, decomposed_attrs_without_w);
break;
}
case MESA_SHADER_GEOMETRY:
if (zink_gs_key(key)->lower_line_stipple) {
NIR_PASS_V(nir, lower_line_stipple_gs, zink_gs_key(key)->line_rectangular);
NIR_PASS_V(nir, nir_lower_var_copies);
need_optimize = true;
}
if (zink_gs_key(key)->lower_line_smooth) {
NIR_PASS_V(nir, lower_line_smooth_gs);
NIR_PASS_V(nir, nir_lower_var_copies);
need_optimize = true;
}
if (zink_gs_key(key)->lower_gl_point) {
NIR_PASS_V(nir, lower_gl_point_gs);
need_optimize = true;
}
if (zink_gs_key(key)->lower_pv_mode) {
NIR_PASS_V(nir, lower_pv_mode_gs, zink_gs_key(key)->lower_pv_mode);
need_optimize = true; //TODO verify that this is required
}
break;
default:
break;
}
}
switch (zs->info.stage) {
case MESA_SHADER_VERTEX:
case MESA_SHADER_TESS_EVAL:
case MESA_SHADER_GEOMETRY:
if (zink_vs_key_base(key)->last_vertex_stage) {
if (!zink_vs_key_base(key)->clip_halfz && !screen->info.have_EXT_depth_clip_control) {
NIR_PASS_V(nir, nir_lower_clip_halfz);
}
if (zink_vs_key_base(key)->push_drawid) {
NIR_PASS_V(nir, lower_drawid);
}
} else {
nir->xfb_info = NULL;
}
if (zink_vs_key_base(key)->robust_access)
NIR_PASS(need_optimize, nir, lower_txf_lod_robustness);
break;
case MESA_SHADER_FRAGMENT:
if (zink_fs_key(key)->lower_line_smooth) {
NIR_PASS_V(nir, lower_line_smooth_fs,
zink_fs_key(key)->lower_line_stipple);
need_optimize = true;
} else if (zink_fs_key(key)->lower_line_stipple)
NIR_PASS_V(nir, lower_line_stipple_fs);
if (zink_fs_key(key)->lower_point_smooth) {
NIR_PASS_V(nir, nir_lower_point_smooth);
NIR_PASS_V(nir, nir_lower_discard_if, nir_lower_discard_if_to_cf);
nir->info.fs.uses_discard = true;
need_optimize = true;
}
if (zink_fs_key(key)->robust_access)
NIR_PASS(need_optimize, nir, lower_txf_lod_robustness);
if (!zink_fs_key_base(key)->samples && zink_shader_uses_samples(zs)) {
/* VK will always use gl_SampleMask[] values even if sample count is 0,
* so we need to skip this write here to mimic GL's behavior of ignoring it
*/
nir_foreach_shader_out_variable(var, nir) {
if (var->data.location == FRAG_RESULT_SAMPLE_MASK)
var->data.mode = nir_var_shader_temp;
}
nir_fixup_deref_modes(nir);
NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_shader_temp, NULL);
NIR_PASS_V(nir, nir_shader_intrinsics_pass, remove_interpolate_at_sample,
nir_metadata_dominance | nir_metadata_block_index, NULL);
need_optimize = true;
}
if (zink_fs_key_base(key)->force_dual_color_blend && nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DATA1)) {
NIR_PASS_V(nir, lower_dual_blend);
}
if (zink_fs_key_base(key)->coord_replace_bits)
NIR_PASS_V(nir, nir_lower_texcoord_replace, zink_fs_key_base(key)->coord_replace_bits, true, false);
if (zink_fs_key_base(key)->point_coord_yinvert)
NIR_PASS_V(nir, invert_point_coord);
if (zink_fs_key_base(key)->force_persample_interp || zink_fs_key_base(key)->fbfetch_ms) {
nir_foreach_shader_in_variable(var, nir)
var->data.sample = true;
nir->info.fs.uses_sample_qualifier = true;
nir->info.fs.uses_sample_shading = true;
}
if (zs->fs.legacy_shadow_mask && !key->base.needs_zs_shader_swizzle)
NIR_PASS(need_optimize, nir, lower_zs_swizzle_tex, zink_fs_key_base(key)->shadow_needs_shader_swizzle ? extra_data : NULL, true);
if (nir->info.fs.uses_fbfetch_output) {
nir_variable *fbfetch = NULL;
NIR_PASS_V(nir, lower_fbfetch, &fbfetch, zink_fs_key_base(key)->fbfetch_ms);
/* old variable must be deleted to avoid spirv errors */
fbfetch->data.mode = nir_var_shader_temp;
nir_fixup_deref_modes(nir);
NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_shader_temp, NULL);
need_optimize = true;
}
nir_foreach_shader_in_variable_safe(var, nir) {
if (!is_texcoord(MESA_SHADER_FRAGMENT, var) || var->data.driver_location != -1)
continue;
nir_shader_instructions_pass(nir, rewrite_read_as_0, nir_metadata_dominance, var);
var->data.mode = nir_var_shader_temp;
nir_fixup_deref_modes(nir);
NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_shader_temp, NULL);
need_optimize = true;
}
break;
case MESA_SHADER_COMPUTE:
if (zink_cs_key(key)->robust_access)
NIR_PASS(need_optimize, nir, lower_txf_lod_robustness);
break;
default: break;
}
if (key->base.needs_zs_shader_swizzle) {
assert(extra_data);
NIR_PASS(need_optimize, nir, lower_zs_swizzle_tex, extra_data, false);
}
if (key->base.nonseamless_cube_mask) {
NIR_PASS_V(nir, zink_lower_cubemap_to_array, key->base.nonseamless_cube_mask);
need_optimize = true;
}
}
if (screen->driconf.inline_uniforms) {
NIR_PASS_V(nir, nir_lower_io_to_scalar, nir_var_mem_global | nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_mem_shared, NULL, NULL);
NIR_PASS_V(nir, rewrite_bo_access, screen);
NIR_PASS_V(nir, remove_bo_access, zs);
need_optimize = true;
}
if (inlined_uniforms) {
optimize_nir(nir, zs, true);
/* This must be done again. */
NIR_PASS_V(nir, nir_io_add_const_offset_to_base, nir_var_shader_in |
nir_var_shader_out);
nir_function_impl *impl = nir_shader_get_entrypoint(nir);
if (impl->ssa_alloc > ZINK_ALWAYS_INLINE_LIMIT)
zs->can_inline = false;
} else if (need_optimize)
optimize_nir(nir, zs, true);
bool has_sparse = false;
NIR_PASS(has_sparse, nir, lower_sparse);
if (has_sparse)
optimize_nir(nir, zs, false);
struct zink_shader_object obj = compile_module(screen, zs, nir, can_shobj, pg);
ralloc_free(nir);
return obj;
}
struct zink_shader_object
zink_shader_compile_separate(struct zink_screen *screen, struct zink_shader *zs)
{
nir_shader *nir = zink_shader_deserialize(screen, zs);
/* TODO: maybe compile multiple variants for different set counts for compact mode? */
int set = zs->info.stage == MESA_SHADER_FRAGMENT;
if (screen->info.have_EXT_shader_object)
set = zs->info.stage;
unsigned offsets[4];
zink_descriptor_shader_get_binding_offsets(zs, offsets);
nir_foreach_variable_with_modes(var, nir, nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_uniform | nir_var_image) {
if (var->data.descriptor_set == screen->desc_set_id[ZINK_DESCRIPTOR_BINDLESS])
continue;
var->data.descriptor_set = set;
switch (var->data.mode) {
case nir_var_mem_ubo:
var->data.binding = !!var->data.driver_location;
break;
case nir_var_uniform:
if (glsl_type_is_sampler(glsl_without_array(var->type)))
var->data.binding += offsets[1];
break;
case nir_var_mem_ssbo:
var->data.binding += offsets[2];
break;
case nir_var_image:
var->data.binding += offsets[3];
break;
default: break;
}
}
NIR_PASS_V(nir, add_derefs);
NIR_PASS_V(nir, nir_lower_fragcolor, nir->info.fs.color_is_dual_source ? 1 : 8);
if (screen->driconf.inline_uniforms) {
NIR_PASS_V(nir, nir_lower_io_to_scalar, nir_var_mem_global | nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_mem_shared, NULL, NULL);
NIR_PASS_V(nir, rewrite_bo_access, screen);
NIR_PASS_V(nir, remove_bo_access, zs);
}
optimize_nir(nir, zs, true);
zink_descriptor_shader_init(screen, zs);
nir_shader *nir_clone = NULL;
if (screen->info.have_EXT_shader_object)
nir_clone = nir_shader_clone(nir, nir);
struct zink_shader_object obj = compile_module(screen, zs, nir, true, NULL);
if (screen->info.have_EXT_shader_object && !zs->info.internal) {
/* always try to pre-generate a tcs in case it's needed */
if (zs->info.stage == MESA_SHADER_TESS_EVAL) {
nir_shader *nir_tcs = NULL;
/* use max pcp for compat */
zs->non_fs.generated_tcs = zink_shader_tcs_create(screen, nir_clone, 32, &nir_tcs);
nir_tcs->info.separate_shader = true;
zs->non_fs.generated_tcs->precompile.obj = zink_shader_compile_separate(screen, zs->non_fs.generated_tcs);
ralloc_free(nir_tcs);
}
}
ralloc_free(nir);
spirv_shader_delete(obj.spirv);
obj.spirv = NULL;
return obj;
}
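/* GL's gl_InstanceID does not include the base instance while Vulkan's InstanceIndex does,
 * so subtract load_base_instance back out
 */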
static bool
lower_baseinstance_instr(nir_builder *b, nir_intrinsic_instr *intr,
void *data)
{
if (intr->intrinsic != nir_intrinsic_load_instance_id)
return false;
b->cursor = nir_after_instr(&intr->instr);
nir_def *def = nir_isub(b, &intr->def, nir_load_base_instance(b));
nir_def_rewrite_uses_after(&intr->def, def, def->parent_instr);
return true;
}
static bool
lower_baseinstance(nir_shader *shader)
{
if (shader->info.stage != MESA_SHADER_VERTEX)
return false;
return nir_shader_intrinsics_pass(shader, lower_baseinstance_instr,
nir_metadata_dominance, NULL);
}
/* gl_nir_lower_buffers makes variables unusable for all UBO/SSBO access
* so instead we delete all those broken variables and just make new ones
*/
static bool
unbreak_bos(nir_shader *shader, struct zink_shader *zs, bool needs_size)
{
uint64_t max_ssbo_size = 0;
uint64_t max_ubo_size = 0;
uint64_t max_uniform_size = 0;
if (!shader->info.num_ssbos && !shader->info.num_ubos)
return false;
nir_foreach_variable_with_modes(var, shader, nir_var_mem_ssbo | nir_var_mem_ubo) {
const struct glsl_type *type = glsl_without_array(var->type);
if (type_is_counter(type))
continue;
/* be conservative: use the bigger of the interface and variable types to ensure in-bounds access */
unsigned size = glsl_count_attribute_slots(glsl_type_is_array(var->type) ? var->type : type, false);
const struct glsl_type *interface_type = var->interface_type ? glsl_without_array(var->interface_type) : NULL;
if (interface_type) {
unsigned block_size = glsl_get_explicit_size(interface_type, true);
if (glsl_get_length(interface_type) == 1) {
/* handle bare unsized ssbo arrays: glsl_get_explicit_size always returns type-aligned sizes */
const struct glsl_type *f = glsl_get_struct_field(interface_type, 0);
if (glsl_type_is_array(f) && !glsl_array_size(f))
block_size = 0;
}
if (block_size) {
block_size = DIV_ROUND_UP(block_size, sizeof(float) * 4);
size = MAX2(size, block_size);
}
}
if (var->data.mode == nir_var_mem_ubo) {
if (var->data.driver_location)
max_ubo_size = MAX2(max_ubo_size, size);
else
max_uniform_size = MAX2(max_uniform_size, size);
} else {
max_ssbo_size = MAX2(max_ssbo_size, size);
if (interface_type) {
if (glsl_type_is_unsized_array(glsl_get_struct_field(interface_type, glsl_get_length(interface_type) - 1)))
needs_size = true;
}
}
var->data.mode = nir_var_shader_temp;
}
nir_fixup_deref_modes(shader);
NIR_PASS_V(shader, nir_remove_dead_variables, nir_var_shader_temp, NULL);
optimize_nir(shader, NULL, true);
struct glsl_struct_field field = {0};
field.name = ralloc_strdup(shader, "base");
if (shader->info.num_ubos) {
if (shader->num_uniforms && zs->ubos_used & BITFIELD_BIT(0)) {
field.type = glsl_array_type(glsl_uint_type(), max_uniform_size * 4, 4);
nir_variable *var = nir_variable_create(shader, nir_var_mem_ubo,
glsl_array_type(glsl_interface_type(&field, 1, GLSL_INTERFACE_PACKING_STD430, false, "struct"), 1, 0),
"uniform_0@32");
var->interface_type = var->type;
var->data.mode = nir_var_mem_ubo;
var->data.driver_location = 0;
}
unsigned num_ubos = shader->info.num_ubos - !!shader->info.first_ubo_is_default_ubo;
uint32_t ubos_used = zs->ubos_used & ~BITFIELD_BIT(0);
if (num_ubos && ubos_used) {
field.type = glsl_array_type(glsl_uint_type(), max_ubo_size * 4, 4);
/* shrink array as much as possible */
unsigned first_ubo = ffs(ubos_used) - 2;
assert(first_ubo < PIPE_MAX_CONSTANT_BUFFERS);
num_ubos -= first_ubo;
assert(num_ubos);
nir_variable *var = nir_variable_create(shader, nir_var_mem_ubo,
glsl_array_type(glsl_struct_type(&field, 1, "struct", false), num_ubos, 0),
"ubos@32");
var->interface_type = var->type;
var->data.mode = nir_var_mem_ubo;
var->data.driver_location = first_ubo + !!shader->info.first_ubo_is_default_ubo;
}
}
if (shader->info.num_ssbos && zs->ssbos_used) {
/* shrink array as much as possible */
unsigned first_ssbo = ffs(zs->ssbos_used) - 1;
assert(first_ssbo < PIPE_MAX_SHADER_BUFFERS);
unsigned num_ssbos = shader->info.num_ssbos - first_ssbo;
assert(num_ssbos);
const struct glsl_type *ssbo_type = glsl_array_type(glsl_uint_type(), needs_size ? 0 : max_ssbo_size * 4, 4);
field.type = ssbo_type;
nir_variable *var = nir_variable_create(shader, nir_var_mem_ssbo,
glsl_array_type(glsl_struct_type(&field, 1, "struct", false), num_ssbos, 0),
"ssbos@32");
var->interface_type = var->type;
var->data.mode = nir_var_mem_ssbo;
var->data.driver_location = first_ssbo;
}
return true;
}
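/* helpers to compute which ssbo/ubo slots an access may touch:
 * a single bit for constant indices, otherwise a conservative mask of all slots
 */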
static uint32_t
get_src_mask_ssbo(unsigned total, nir_src src)
{
if (nir_src_is_const(src))
return BITFIELD_BIT(nir_src_as_uint(src));
return BITFIELD_MASK(total);
}
static uint32_t
get_src_mask_ubo(unsigned total, nir_src src)
{
if (nir_src_is_const(src))
return BITFIELD_BIT(nir_src_as_uint(src));
return BITFIELD_MASK(total) & ~BITFIELD_BIT(0);
}
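/* scan buffer and texture access to populate the ssbos_used/ubos_used masks and textures_used;
 * returns whether get_ssbo_size is used
 */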
static bool
analyze_io(struct zink_shader *zs, nir_shader *shader)
{
bool ret = false;
nir_function_impl *impl = nir_shader_get_entrypoint(shader);
nir_foreach_block(block, impl) {
nir_foreach_instr(instr, block) {
if (shader->info.stage != MESA_SHADER_KERNEL && instr->type == nir_instr_type_tex) {
/* gl_nir_lower_samplers_as_deref is where this would normally be set, but zink doesn't use it */
nir_tex_instr *tex = nir_instr_as_tex(instr);
nir_foreach_variable_with_modes(img, shader, nir_var_uniform) {
if (glsl_type_is_sampler(glsl_without_array(img->type))) {
unsigned size = glsl_type_is_array(img->type) ? glsl_get_aoa_size(img->type) : 1;
if (tex->texture_index >= img->data.driver_location &&
tex->texture_index < img->data.driver_location + size) {
BITSET_SET_RANGE(shader->info.textures_used, img->data.driver_location, img->data.driver_location + (size - 1));
break;
}
}
}
continue;
}
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
switch (intrin->intrinsic) {
case nir_intrinsic_store_ssbo:
zs->ssbos_used |= get_src_mask_ssbo(shader->info.num_ssbos, intrin->src[1]);
break;
case nir_intrinsic_get_ssbo_size: {
zs->ssbos_used |= get_src_mask_ssbo(shader->info.num_ssbos, intrin->src[0]);
ret = true;
break;
}
case nir_intrinsic_ssbo_atomic:
case nir_intrinsic_ssbo_atomic_swap:
case nir_intrinsic_load_ssbo:
zs->ssbos_used |= get_src_mask_ssbo(shader->info.num_ssbos, intrin->src[0]);
break;
case nir_intrinsic_load_ubo:
case nir_intrinsic_load_ubo_vec4:
zs->ubos_used |= get_src_mask_ubo(shader->info.num_ubos, intrin->src[0]);
break;
default:
break;
}
}
}
return ret;
}
struct zink_bindless_info {
nir_variable *bindless[4];
unsigned bindless_set;
};
/* this is a "default" bindless texture used if the shader has no texture variables */
static nir_variable *
create_bindless_texture(nir_shader *nir, nir_tex_instr *tex, unsigned descriptor_set)
{
unsigned binding = tex->sampler_dim == GLSL_SAMPLER_DIM_BUF ? 1 : 0;
nir_variable *var;
const struct glsl_type *sampler_type = glsl_sampler_type(tex->sampler_dim, tex->is_shadow, tex->is_array, GLSL_TYPE_FLOAT);
var = nir_variable_create(nir, nir_var_uniform, glsl_array_type(sampler_type, ZINK_MAX_BINDLESS_HANDLES, 0), "bindless_texture");
var->data.descriptor_set = descriptor_set;
var->data.driver_location = var->data.binding = binding;
return var;
}
/* this is a "default" bindless image used if the shader has no image variables */
static nir_variable *
create_bindless_image(nir_shader *nir, enum glsl_sampler_dim dim, unsigned descriptor_set)
{
unsigned binding = dim == GLSL_SAMPLER_DIM_BUF ? 3 : 2;
nir_variable *var;
const struct glsl_type *image_type = glsl_image_type(dim, false, GLSL_TYPE_FLOAT);
var = nir_variable_create(nir, nir_var_image, glsl_array_type(image_type, ZINK_MAX_BINDLESS_HANDLES, 0), "bindless_image");
var->data.descriptor_set = descriptor_set;
var->data.driver_location = var->data.binding = binding;
var->data.image.format = PIPE_FORMAT_R8G8B8A8_UNORM;
return var;
}
/* rewrite bindless instructions as array deref instructions */
static bool
lower_bindless_instr(nir_builder *b, nir_instr *in, void *data)
{
struct zink_bindless_info *bindless = data;
if (in->type == nir_instr_type_tex) {
nir_tex_instr *tex = nir_instr_as_tex(in);
int idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_handle);
if (idx == -1)
return false;
nir_variable *var = tex->sampler_dim == GLSL_SAMPLER_DIM_BUF ? bindless->bindless[1] : bindless->bindless[0];
if (!var) {
var = create_bindless_texture(b->shader, tex, bindless->bindless_set);
if (tex->sampler_dim == GLSL_SAMPLER_DIM_BUF)
bindless->bindless[1] = var;
else
bindless->bindless[0] = var;
}
b->cursor = nir_before_instr(in);
nir_deref_instr *deref = nir_build_deref_var(b, var);
if (glsl_type_is_array(var->type))
deref = nir_build_deref_array(b, deref, nir_u2uN(b, tex->src[idx].src.ssa, 32));
nir_src_rewrite(&tex->src[idx].src, &deref->def);
/* bindless sampling uses the variable type directly, which means the tex instr has to exactly
* match up with it in contrast to normal sampler ops where things are a bit more flexible;
* this results in cases where a shader is passed with sampler2DArray but the tex instr only has
* 2 components, which explodes spirv compilation even though it doesn't trigger validation errors
*
* to fix this, pad the coord src here and fix the tex instr so that ntv will do the "right" thing
* - Warhammer 40k: Dawn of War III
*/
unsigned needed_components = glsl_get_sampler_coordinate_components(glsl_without_array(var->type));
unsigned c = nir_tex_instr_src_index(tex, nir_tex_src_coord);
unsigned coord_components = nir_src_num_components(tex->src[c].src);
if (coord_components < needed_components) {
nir_def *def = nir_pad_vector(b, tex->src[c].src.ssa, needed_components);
nir_src_rewrite(&tex->src[c].src, def);
tex->coord_components = needed_components;
}
return true;
}
if (in->type != nir_instr_type_intrinsic)
return false;
nir_intrinsic_instr *instr = nir_instr_as_intrinsic(in);
nir_intrinsic_op op;
#define OP_SWAP(OP) \
case nir_intrinsic_bindless_image_##OP: \
op = nir_intrinsic_image_deref_##OP; \
break;
/* convert bindless intrinsics to deref intrinsics */
switch (instr->intrinsic) {
OP_SWAP(atomic)
OP_SWAP(atomic_swap)
OP_SWAP(format)
OP_SWAP(load)
OP_SWAP(order)
OP_SWAP(samples)
OP_SWAP(size)
OP_SWAP(store)
default:
return false;
}
enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
   /* cache the created variable like the texture path above to avoid creating duplicates */
   nir_variable **slot = dim == GLSL_SAMPLER_DIM_BUF ? &bindless->bindless[3] : &bindless->bindless[2];
   nir_variable *var = *slot;
   if (!var)
      var = *slot = create_bindless_image(b->shader, dim, bindless->bindless_set);
instr->intrinsic = op;
b->cursor = nir_before_instr(in);
nir_deref_instr *deref = nir_build_deref_var(b, var);
if (glsl_type_is_array(var->type))
deref = nir_build_deref_array(b, deref, nir_u2uN(b, instr->src[0].ssa, 32));
nir_src_rewrite(&instr->src[0], &deref->def);
return true;
}
static bool
lower_bindless(nir_shader *shader, struct zink_bindless_info *bindless)
{
if (!nir_shader_instructions_pass(shader, lower_bindless_instr, nir_metadata_dominance, bindless))
return false;
nir_fixup_deref_modes(shader);
NIR_PASS_V(shader, nir_remove_dead_variables, nir_var_shader_temp, NULL);
optimize_nir(shader, NULL, true);
return true;
}
/* convert shader image/texture io variables to int64 handles for bindless indexing */
static bool
lower_bindless_io_instr(nir_builder *b, nir_intrinsic_instr *instr,
void *data)
{
bool is_load = false;
bool is_input = false;
bool is_interp = false;
if (!filter_io_instr(instr, &is_load, &is_input, &is_interp))
return false;
nir_variable *var = find_var_with_location_frac(b->shader, nir_intrinsic_io_semantics(instr).location, nir_intrinsic_component(instr), false, is_input ? nir_var_shader_in : nir_var_shader_out);
if (var->data.bindless)
return false;
if (var->data.mode != nir_var_shader_in && var->data.mode != nir_var_shader_out)
return false;
if (!glsl_type_is_image(var->type) && !glsl_type_is_sampler(var->type))
return false;
var->type = glsl_vector_type(GLSL_TYPE_INT, 2);
var->data.bindless = 1;
return true;
}
static bool
lower_bindless_io(nir_shader *shader)
{
return nir_shader_intrinsics_pass(shader, lower_bindless_io_instr,
nir_metadata_dominance, NULL);
}
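/* flatten (stage, descriptor type, index) into the binding index used by zink's descriptor set layouts */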
static uint32_t
zink_binding(gl_shader_stage stage, VkDescriptorType type, int index, bool compact_descriptors)
{
if (stage == MESA_SHADER_NONE) {
unreachable("not supported");
} else {
unsigned base = stage;
/* clamp compute bindings for better driver efficiency */
if (gl_shader_stage_is_compute(stage))
base = 0;
switch (type) {
case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
return base * 2 + !!index;
case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
assert(stage == MESA_SHADER_KERNEL);
FALLTHROUGH;
case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
if (stage == MESA_SHADER_KERNEL) {
assert(index < PIPE_MAX_SHADER_SAMPLER_VIEWS);
return index + PIPE_MAX_SAMPLERS;
}
FALLTHROUGH;
case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
assert(index < PIPE_MAX_SAMPLERS);
assert(stage != MESA_SHADER_KERNEL);
return (base * PIPE_MAX_SAMPLERS) + index;
case VK_DESCRIPTOR_TYPE_SAMPLER:
assert(index < PIPE_MAX_SAMPLERS);
assert(stage == MESA_SHADER_KERNEL);
return index;
case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
return base + (compact_descriptors * (ZINK_GFX_SHADER_COUNT * 2));
case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
assert(index < ZINK_MAX_SHADER_IMAGES);
if (stage == MESA_SHADER_KERNEL)
return index + (compact_descriptors ? (PIPE_MAX_SAMPLERS + PIPE_MAX_SHADER_SAMPLER_VIEWS) : 0);
return (base * ZINK_MAX_SHADER_IMAGES) + index + (compact_descriptors * (ZINK_GFX_SHADER_COUNT * PIPE_MAX_SAMPLERS));
default:
unreachable("unexpected type");
}
}
}
static void
handle_bindless_var(nir_shader *nir, nir_variable *var, const struct glsl_type *type, struct zink_bindless_info *bindless)
{
if (glsl_type_is_struct(type)) {
for (unsigned i = 0; i < glsl_get_length(type); i++)
handle_bindless_var(nir, var, glsl_get_struct_field(type, i), bindless);
return;
}
/* just a random scalar in a struct */
if (!glsl_type_is_image(type) && !glsl_type_is_sampler(type))
return;
VkDescriptorType vktype = glsl_type_is_image(type) ? zink_image_type(type) : zink_sampler_type(type);
unsigned binding;
switch (vktype) {
case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
binding = 0;
break;
case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
binding = 1;
break;
case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
binding = 2;
break;
case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
binding = 3;
break;
default:
unreachable("unknown");
}
if (!bindless->bindless[binding]) {
bindless->bindless[binding] = nir_variable_clone(var, nir);
bindless->bindless[binding]->data.bindless = 0;
bindless->bindless[binding]->data.descriptor_set = bindless->bindless_set;
bindless->bindless[binding]->type = glsl_array_type(type, ZINK_MAX_BINDLESS_HANDLES, 0);
bindless->bindless[binding]->data.driver_location = bindless->bindless[binding]->data.binding = binding;
if (!bindless->bindless[binding]->data.image.format)
bindless->bindless[binding]->data.image.format = PIPE_FORMAT_R8G8B8A8_UNORM;
nir_shader_add_variable(nir, bindless->bindless[binding]);
} else {
assert(glsl_get_sampler_dim(glsl_without_array(bindless->bindless[binding]->type)) == glsl_get_sampler_dim(glsl_without_array(var->type)));
}
var->data.mode = nir_var_shader_temp;
}
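/* promote 1D shadow samples to 2D by padding coord/offset/derivative sources with a zero y component;
 * used by lower_1d_shadow() below
 */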
static bool
convert_1d_shadow_tex(nir_builder *b, nir_instr *instr, void *data)
{
struct zink_screen *screen = data;
if (instr->type != nir_instr_type_tex)
return false;
nir_tex_instr *tex = nir_instr_as_tex(instr);
if (tex->sampler_dim != GLSL_SAMPLER_DIM_1D || !tex->is_shadow)
return false;
if (tex->is_sparse && screen->need_2D_sparse) {
/* no known case of this exists: only nvidia can hit it, and nothing uses it */
mesa_loge("unhandled/unsupported 1D sparse texture!");
abort();
}
tex->sampler_dim = GLSL_SAMPLER_DIM_2D;
b->cursor = nir_before_instr(instr);
tex->coord_components++;
unsigned srcs[] = {
nir_tex_src_coord,
nir_tex_src_offset,
nir_tex_src_ddx,
nir_tex_src_ddy,
};
for (unsigned i = 0; i < ARRAY_SIZE(srcs); i++) {
unsigned c = nir_tex_instr_src_index(tex, srcs[i]);
if (c == -1)
continue;
if (tex->src[c].src.ssa->num_components == tex->coord_components)
continue;
nir_def *def;
nir_def *zero = nir_imm_zero(b, 1, tex->src[c].src.ssa->bit_size);
if (tex->src[c].src.ssa->num_components == 1)
def = nir_vec2(b, tex->src[c].src.ssa, zero);
else
def = nir_vec3(b, nir_channel(b, tex->src[c].src.ssa, 0), zero, nir_channel(b, tex->src[c].src.ssa, 1));
nir_src_rewrite(&tex->src[c].src, def);
}
b->cursor = nir_after_instr(instr);
unsigned needed_components = nir_tex_instr_dest_size(tex);
unsigned num_components = tex->def.num_components;
if (needed_components > num_components) {
tex->def.num_components = needed_components;
assert(num_components < 3);
/* take either xz or just x since this is promoted to 2D from 1D */
uint32_t mask = num_components == 2 ? (1|4) : 1;
nir_def *dst = nir_channels(b, &tex->def, mask);
nir_def_rewrite_uses_after(&tex->def, dst, dst->parent_instr);
}
return true;
}
static bool
lower_1d_shadow(nir_shader *shader, struct zink_screen *screen)
{
bool found = false;
nir_foreach_variable_with_modes(var, shader, nir_var_uniform | nir_var_image) {
const struct glsl_type *type = glsl_without_array(var->type);
unsigned length = glsl_get_length(var->type);
if (!glsl_type_is_sampler(type) || !glsl_sampler_type_is_shadow(type) || glsl_get_sampler_dim(type) != GLSL_SAMPLER_DIM_1D)
continue;
const struct glsl_type *sampler = glsl_sampler_type(GLSL_SAMPLER_DIM_2D, true, glsl_sampler_type_is_array(type), glsl_get_sampler_result_type(type));
var->type = type != var->type ? glsl_array_type(sampler, length, glsl_get_explicit_stride(var->type)) : sampler;
found = true;
}
if (found)
nir_shader_instructions_pass(shader, convert_1d_shadow_tex, nir_metadata_dominance, screen);
return found;
}
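/* pre-lowering scan to collect info zink needs: sparse usage, images_used ranges,
 * arrayed i/o locations, sample-rate usage, and clip/cull distance array sizes
 */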
static void
scan_nir(struct zink_screen *screen, nir_shader *shader, struct zink_shader *zs)
{
nir_foreach_function_impl(impl, shader) {
nir_foreach_block_safe(block, impl) {
nir_foreach_instr_safe(instr, block) {
if (instr->type == nir_instr_type_tex) {
nir_tex_instr *tex = nir_instr_as_tex(instr);
zs->sinfo.have_sparse |= tex->is_sparse;
}
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
if (intr->intrinsic == nir_intrinsic_image_deref_load ||
intr->intrinsic == nir_intrinsic_image_deref_sparse_load ||
intr->intrinsic == nir_intrinsic_image_deref_store ||
intr->intrinsic == nir_intrinsic_image_deref_atomic ||
intr->intrinsic == nir_intrinsic_image_deref_atomic_swap ||
intr->intrinsic == nir_intrinsic_image_deref_size ||
intr->intrinsic == nir_intrinsic_image_deref_samples ||
intr->intrinsic == nir_intrinsic_image_deref_format ||
intr->intrinsic == nir_intrinsic_image_deref_order) {
nir_variable *var = nir_intrinsic_get_var(intr, 0);
/* Structs have been lowered already, so get_aoa_size is sufficient. */
const unsigned size =
glsl_type_is_array(var->type) ? glsl_get_aoa_size(var->type) : 1;
BITSET_SET_RANGE(shader->info.images_used, var->data.binding,
var->data.binding + (MAX2(size, 1) - 1));
}
if (intr->intrinsic == nir_intrinsic_load_barycentric_at_sample)
zs->uses_sample = true;
if (intr->intrinsic == nir_intrinsic_is_sparse_texels_resident ||
intr->intrinsic == nir_intrinsic_image_deref_sparse_load)
zs->sinfo.have_sparse = true;
bool is_load = false;
bool is_input = false;
bool is_interp = false;
if (filter_io_instr(intr, &is_load, &is_input, &is_interp)) {
nir_io_semantics s = nir_intrinsic_io_semantics(intr);
if (io_instr_is_arrayed(intr) && s.location < VARYING_SLOT_PATCH0) {
if (is_input)
zs->arrayed_inputs |= BITFIELD64_BIT(s.location);
else
zs->arrayed_outputs |= BITFIELD64_BIT(s.location);
}
/* TODO: delete this once #10826 is fixed */
if (!(is_input && shader->info.stage == MESA_SHADER_VERTEX)) {
if (is_clipcull_dist(s.location)) {
unsigned frac = nir_intrinsic_component(intr) + 1;
if (s.location < VARYING_SLOT_CULL_DIST0) {
if (s.location == VARYING_SLOT_CLIP_DIST1)
frac += 4;
shader->info.clip_distance_array_size = MAX3(shader->info.clip_distance_array_size, frac, s.num_slots);
} else {
if (s.location == VARYING_SLOT_CULL_DIST1)
frac += 4;
shader->info.cull_distance_array_size = MAX3(shader->info.cull_distance_array_size, frac, s.num_slots);
}
}
}
}
static bool warned = false;
if (!screen->info.have_EXT_shader_atomic_float && !screen->is_cpu && !warned) {
switch (intr->intrinsic) {
case nir_intrinsic_image_deref_atomic: {
nir_variable *var = nir_intrinsic_get_var(intr, 0);
                  if (nir_intrinsic_atomic_op(intr) == nir_atomic_op_iadd &&
                      util_format_is_float(var->data.image.format)) {
                     fprintf(stderr, "zink: Vulkan driver missing VK_EXT_shader_atomic_float but attempting to do atomic ops!\n");
                     warned = true;
                  }
break;
}
default:
break;
}
}
}
}
}
}
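/* find the sampler variable feeding each tex instr and fix up its destination via rewrite_tex_dest() */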
static bool
match_tex_dests_instr(nir_builder *b, nir_instr *in, void *data)
{
if (in->type != nir_instr_type_tex)
return false;
nir_tex_instr *tex = nir_instr_as_tex(in);
if (tex->op == nir_texop_txs || tex->op == nir_texop_lod)
return false;
int handle = nir_tex_instr_src_index(tex, nir_tex_src_texture_handle);
nir_variable *var = NULL;
if (handle != -1) {
var = nir_deref_instr_get_variable(nir_src_as_deref(tex->src[handle].src));
} else {
nir_foreach_variable_with_modes(img, b->shader, nir_var_uniform) {
if (glsl_type_is_sampler(glsl_without_array(img->type))) {
unsigned size = glsl_type_is_array(img->type) ? glsl_get_aoa_size(img->type) : 1;
if (tex->texture_index >= img->data.driver_location &&
tex->texture_index < img->data.driver_location + size) {
var = img;
break;
}
}
}
}
return !!rewrite_tex_dest(b, tex, var, data);
}
static bool
match_tex_dests(nir_shader *shader, struct zink_shader *zs)
{
return nir_shader_instructions_pass(shader, match_tex_dests_instr, nir_metadata_dominance, zs);
}
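/* split vector bitfield_insert/ubitfield_extract/ibitfield_extract into per-component scalar ops */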
static bool
split_bitfields_instr(nir_builder *b, nir_instr *in, void *data)
{
if (in->type != nir_instr_type_alu)
return false;
nir_alu_instr *alu = nir_instr_as_alu(in);
switch (alu->op) {
case nir_op_ubitfield_extract:
case nir_op_ibitfield_extract:
case nir_op_bitfield_insert:
break;
default:
return false;
}
unsigned num_components = alu->def.num_components;
if (num_components == 1)
return false;
b->cursor = nir_before_instr(in);
nir_def *dests[NIR_MAX_VEC_COMPONENTS];
for (unsigned i = 0; i < num_components; i++) {
if (alu->op == nir_op_bitfield_insert)
dests[i] = nir_bitfield_insert(b,
nir_channel(b, alu->src[0].src.ssa, alu->src[0].swizzle[i]),
nir_channel(b, alu->src[1].src.ssa, alu->src[1].swizzle[i]),
nir_channel(b, alu->src[2].src.ssa, alu->src[2].swizzle[i]),
nir_channel(b, alu->src[3].src.ssa, alu->src[3].swizzle[i]));
else if (alu->op == nir_op_ubitfield_extract)
dests[i] = nir_ubitfield_extract(b,
nir_channel(b, alu->src[0].src.ssa, alu->src[0].swizzle[i]),
nir_channel(b, alu->src[1].src.ssa, alu->src[1].swizzle[i]),
nir_channel(b, alu->src[2].src.ssa, alu->src[2].swizzle[i]));
else
dests[i] = nir_ibitfield_extract(b,
nir_channel(b, alu->src[0].src.ssa, alu->src[0].swizzle[i]),
nir_channel(b, alu->src[1].src.ssa, alu->src[1].swizzle[i]),
nir_channel(b, alu->src[2].src.ssa, alu->src[2].swizzle[i]));
}
nir_def *dest = nir_vec(b, dests, num_components);
nir_def_rewrite_uses_after(&alu->def, dest, in);
nir_instr_remove(in);
return true;
}
static bool
split_bitfields(nir_shader *shader)
{
return nir_shader_instructions_pass(shader, split_bitfields_instr, nir_metadata_dominance, NULL);
}
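/* companion to strip_tex_ms(): once MS image variables have been retyped to 2D,
 * retag the corresponding image intrinsics and fold samples queries to a constant zero
 */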
static bool
strip_tex_ms_instr(nir_builder *b, nir_instr *in, void *data)
{
if (in->type != nir_instr_type_intrinsic)
return false;
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(in);
switch (intr->intrinsic) {
case nir_intrinsic_image_deref_samples:
b->cursor = nir_before_instr(in);
nir_def_rewrite_uses_after(&intr->def, nir_imm_zero(b, 1, intr->def.bit_size), in);
nir_instr_remove(in);
break;
case nir_intrinsic_image_deref_store:
case nir_intrinsic_image_deref_load:
break;
default:
return false;
}
enum glsl_sampler_dim dim = nir_intrinsic_image_dim(intr);
if (dim != GLSL_SAMPLER_DIM_MS)
return false;
nir_deref_instr *deref = nir_src_as_deref(intr->src[0]);
nir_variable *var = nir_deref_instr_get_variable(deref);
nir_deref_instr *parent = nir_deref_instr_parent(deref);
if (parent) {
parent->type = var->type;
deref->type = glsl_without_array(var->type);
} else {
deref->type = var->type;
}
nir_intrinsic_set_image_dim(intr, GLSL_SAMPLER_DIM_2D);
return true;
}
static bool
strip_tex_ms(nir_shader *shader)
{
bool progress = false;
nir_foreach_image_variable(var, shader) {
const struct glsl_type *bare_type = glsl_without_array(var->type);
if (glsl_get_sampler_dim(bare_type) != GLSL_SAMPLER_DIM_MS)
continue;
unsigned array_size = 0;
if (glsl_type_is_array(var->type))
array_size = glsl_array_size(var->type);
const struct glsl_type *new_type = glsl_image_type(GLSL_SAMPLER_DIM_2D, glsl_sampler_type_is_array(bare_type), glsl_get_sampler_result_type(bare_type));
if (array_size)
new_type = glsl_array_type(new_type, array_size, glsl_get_explicit_stride(var->type));
var->type = new_type;
progress = true;
}
if (!progress)
return false;
return nir_shader_instructions_pass(shader, strip_tex_ms_instr, nir_metadata_all, NULL);
}
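/* update every deref of a retyped image variable so the deref chain matches the new type */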
static void
rewrite_cl_derefs(nir_shader *nir, nir_variable *var)
{
nir_foreach_function_impl(impl, nir) {
nir_foreach_block(block, impl) {
nir_foreach_instr_safe(instr, block) {
if (instr->type != nir_instr_type_deref)
continue;
nir_deref_instr *deref = nir_instr_as_deref(instr);
nir_variable *img = nir_deref_instr_get_variable(deref);
if (img != var)
continue;
if (glsl_type_is_array(var->type)) {
if (deref->deref_type == nir_deref_type_array)
deref->type = glsl_without_array(var->type);
else
deref->type = var->type;
} else {
deref->type = var->type;
}
}
}
}
}
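/* infer a concrete sampled type for an image variable with a void result type (as seen with CL kernels)
 * from the intrinsics that access it; if the image is never accessed, demote it entirely
 */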
static void
type_image(nir_shader *nir, nir_variable *var)
{
nir_foreach_function_impl(impl, nir) {
nir_foreach_block(block, impl) {
nir_foreach_instr_safe(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
if (intr->intrinsic == nir_intrinsic_image_deref_load ||
intr->intrinsic == nir_intrinsic_image_deref_sparse_load ||
intr->intrinsic == nir_intrinsic_image_deref_store ||
intr->intrinsic == nir_intrinsic_image_deref_atomic ||
intr->intrinsic == nir_intrinsic_image_deref_atomic_swap ||
intr->intrinsic == nir_intrinsic_image_deref_samples ||
intr->intrinsic == nir_intrinsic_image_deref_format ||
intr->intrinsic == nir_intrinsic_image_deref_order) {
nir_deref_instr *deref = nir_src_as_deref(intr->src[0]);
nir_variable *img = nir_deref_instr_get_variable(deref);
if (img != var)
continue;
nir_alu_type alu_type = nir_intrinsic_src_type(intr);
const struct glsl_type *type = glsl_without_array(var->type);
if (glsl_get_sampler_result_type(type) != GLSL_TYPE_VOID) {
assert(glsl_get_sampler_result_type(type) == nir_get_glsl_base_type_for_nir_type(alu_type));
continue;
}
const struct glsl_type *img_type = glsl_image_type(glsl_get_sampler_dim(type), glsl_sampler_type_is_array(type), nir_get_glsl_base_type_for_nir_type(alu_type));
if (glsl_type_is_array(var->type))
img_type = glsl_array_type(img_type, glsl_array_size(var->type), glsl_get_explicit_stride(var->type));
var->type = img_type;
rewrite_cl_derefs(nir, var);
return;
}
}
}
}
nir_foreach_function_impl(impl, nir) {
nir_foreach_block(block, impl) {
nir_foreach_instr_safe(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
if (intr->intrinsic != nir_intrinsic_image_deref_size)
continue;
nir_deref_instr *deref = nir_src_as_deref(intr->src[0]);
nir_variable *img = nir_deref_instr_get_variable(deref);
if (img != var)
continue;
nir_alu_type alu_type = nir_type_uint32;
const struct glsl_type *type = glsl_without_array(var->type);
if (glsl_get_sampler_result_type(type) != GLSL_TYPE_VOID) {
continue;
}
const struct glsl_type *img_type = glsl_image_type(glsl_get_sampler_dim(type), glsl_sampler_type_is_array(type), nir_get_glsl_base_type_for_nir_type(alu_type));
if (glsl_type_is_array(var->type))
img_type = glsl_array_type(img_type, glsl_array_size(var->type), glsl_get_explicit_stride(var->type));
var->type = img_type;
rewrite_cl_derefs(nir, var);
return;
}
}
}
var->data.mode = nir_var_shader_temp;
}
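/* give sampler variables a concrete result type based on the tex instrs that use them,
 * and record which samplers actually require a sampler object in *sampler_mask
 */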
static bool
type_sampler_vars(nir_shader *nir, unsigned *sampler_mask)
{
bool progress = false;
nir_foreach_function_impl(impl, nir) {
nir_foreach_block(block, impl) {
nir_foreach_instr(instr, block) {
if (instr->type != nir_instr_type_tex)
continue;
nir_tex_instr *tex = nir_instr_as_tex(instr);
if (nir_tex_instr_need_sampler(tex))
*sampler_mask |= BITFIELD_BIT(tex->sampler_index);
nir_variable *var = nir_find_sampler_variable_with_tex_index(nir, tex->texture_index);
assert(var);
if (glsl_get_sampler_result_type(glsl_without_array(var->type)) != GLSL_TYPE_VOID &&
nir_tex_instr_is_query(tex))
continue;
const struct glsl_type *img_type = glsl_sampler_type(glsl_get_sampler_dim(glsl_without_array(var->type)), tex->is_shadow, tex->is_array, nir_get_glsl_base_type_for_nir_type(tex->dest_type));
unsigned size = glsl_type_is_array(var->type) ? glsl_array_size(var->type) : 1;
if (size > 1)
img_type = glsl_array_type(img_type, size, 0);
var->type = img_type;
progress = true;
}
}
}
return progress;
}
static bool
delete_samplers(nir_shader *nir)
{
bool progress = false;
nir_foreach_variable_with_modes(var, nir, nir_var_uniform) {
if (glsl_type_is_sampler(glsl_without_array(var->type))) {
var->data.mode = nir_var_shader_temp;
progress = true;
}
}
return progress;
}
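/* assign concrete types to typeless (kernel-style) image and sampler vars based on usage */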
static bool
type_images(nir_shader *nir, unsigned *sampler_mask)
{
bool progress = false;
progress |= delete_samplers(nir);
progress |= type_sampler_vars(nir, sampler_mask);
nir_foreach_variable_with_modes(var, nir, nir_var_image) {
type_image(nir, var);
progress = true;
}
return progress;
}
/* attempt to assign io for separate shaders */
static bool
fixup_io_locations(nir_shader *nir)
{
nir_variable_mode modes;
if (nir->info.stage != MESA_SHADER_FRAGMENT && nir->info.stage != MESA_SHADER_VERTEX)
modes = nir_var_shader_in | nir_var_shader_out;
else
modes = nir->info.stage == MESA_SHADER_FRAGMENT ? nir_var_shader_in : nir_var_shader_out;
u_foreach_bit(mode, modes) {
nir_variable_mode m = BITFIELD_BIT(mode);
if ((m == nir_var_shader_in && ((nir->info.inputs_read & BITFIELD64_MASK(VARYING_SLOT_VAR1)) == nir->info.inputs_read)) ||
(m == nir_var_shader_out && ((nir->info.outputs_written | nir->info.outputs_read) & BITFIELD64_MASK(VARYING_SLOT_VAR1)) == (nir->info.outputs_written | nir->info.outputs_read))) {
/* this is a special heuristic to catch ARB/fixedfunc shaders which have different rules:
* - i/o interface blocks don't need to match
* - any location can be present or not
* - it just has to work
*
* VAR0 is the only user varying that mesa can produce in this case, so overwrite POS
* since it's a builtin and yolo it with all the other legacy crap
*/
nir_foreach_variable_with_modes(var, nir, m) {
if (nir_slot_is_sysval_output(var->data.location, MESA_SHADER_NONE))
continue;
if (var->data.location == VARYING_SLOT_VAR0)
var->data.driver_location = 0;
else if (var->data.patch)
var->data.driver_location = var->data.location - VARYING_SLOT_PATCH0;
else
var->data.driver_location = var->data.location;
}
continue;
}
/* i/o interface blocks are required to be EXACT matches between stages:
* iterate over all locations and set locations incrementally
*/
unsigned slot = 0;
for (unsigned i = 0; i < VARYING_SLOT_TESS_MAX; i++) {
if (nir_slot_is_sysval_output(i, MESA_SHADER_NONE))
continue;
bool found = false;
unsigned size = 0;
nir_foreach_variable_with_modes(var, nir, m) {
if (var->data.location != i)
continue;
/* only add slots for non-component vars or first-time component vars */
if (!var->data.location_frac || !size) {
/* ensure variable is given enough slots */
if (nir_is_arrayed_io(var, nir->info.stage))
size += glsl_count_vec4_slots(glsl_get_array_element(var->type), false, false);
else
size += glsl_count_vec4_slots(var->type, false, false);
}
if (var->data.patch)
var->data.driver_location = var->data.location - VARYING_SLOT_PATCH0;
else
var->data.driver_location = slot;
found = true;
}
slot += size;
if (found) {
            /* ensure the consumed slots aren't iterated again */
i += size - 1;
} else {
/* locations used between stages are not required to be contiguous */
if (i >= VARYING_SLOT_VAR0)
slot++;
}
}
}
return true;
}
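/* collect a bitmask of flat-interpolated fs inputs */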
static uint64_t
zink_flat_flags(struct nir_shader *shader)
{
uint64_t flat_flags = 0;
nir_foreach_shader_in_variable(var, shader) {
if (var->data.interpolation == INTERP_MODE_FLAT)
flat_flags |= BITFIELD64_BIT(var->data.location);
}
return flat_flags;
}
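/* accumulated scan state used to recreate i/o variables from lowered i/o intrinsics */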
struct rework_io_state {
/* these are search criteria */
bool indirect_only;
unsigned location;
nir_variable_mode mode;
gl_shader_stage stage;
nir_shader *nir;
const char *name;
/* these are found by scanning */
bool arrayed_io;
bool medium_precision;
bool fb_fetch_output;
bool dual_source_blend_index;
uint32_t component_mask;
uint32_t ignored_component_mask;
unsigned array_size;
unsigned bit_size;
unsigned base;
nir_alu_type type;
/* must be last */
char *newname;
};
/* match an existing variable against the rework state */
static nir_variable *
find_rework_var(nir_shader *nir, struct rework_io_state *ris)
{
nir_foreach_variable_with_modes(var, nir, ris->mode) {
const struct glsl_type *type = var->type;
if (nir_is_arrayed_io(var, nir->info.stage))
type = glsl_get_array_element(type);
if (var->data.fb_fetch_output != ris->fb_fetch_output)
continue;
if (nir->info.stage == MESA_SHADER_FRAGMENT && ris->mode == nir_var_shader_out && ris->dual_source_blend_index != var->data.index)
continue;
unsigned num_slots = var->data.compact ? DIV_ROUND_UP(glsl_array_size(type), 4) : glsl_count_attribute_slots(type, false);
if (var->data.location > ris->location + ris->array_size || var->data.location + num_slots <= ris->location)
continue;
unsigned num_components = glsl_get_vector_elements(glsl_without_array(type));
assert(!glsl_type_contains_64bit(type));
uint32_t component_mask = ris->component_mask ? ris->component_mask : BITFIELD_MASK(4);
if (BITFIELD_RANGE(var->data.location_frac, num_components) & component_mask)
return var;
}
return NULL;
}
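/* build up a readable name for a recreated variable; only used with NIR/SPIRV debug output */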
static void
update_io_var_name(struct rework_io_state *ris, const char *name)
{
if (!(zink_debug & (ZINK_DEBUG_NIR | ZINK_DEBUG_SPIRV)))
return;
if (!name)
return;
if (ris->name && !strcmp(ris->name, name))
return;
if (ris->newname && !strcmp(ris->newname, name))
return;
if (ris->newname) {
ris->newname = ralloc_asprintf(ris->nir, "%s_%s", ris->newname, name);
} else if (ris->name) {
ris->newname = ralloc_asprintf(ris->nir, "%s_%s", ris->name, name);
} else {
ris->newname = ralloc_strdup(ris->nir, name);
}
}
/* check/update tracking state for variable info */
static void
update_io_var_state(nir_intrinsic_instr *intr, struct rework_io_state *ris)
{
bool is_load = false;
bool is_input = false;
bool is_interp = false;
filter_io_instr(intr, &is_load, &is_input, &is_interp);
nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
unsigned frac = nir_intrinsic_component(intr);
/* the mask of components for the instruction */
uint32_t cmask = is_load ? BITFIELD_RANGE(frac, intr->num_components) : (nir_intrinsic_write_mask(intr) << frac);
/* always check for existing variables first */
struct rework_io_state test = {
.location = ris->location,
.mode = ris->mode,
.stage = ris->stage,
.arrayed_io = io_instr_is_arrayed(intr),
.medium_precision = sem.medium_precision,
.fb_fetch_output = sem.fb_fetch_output,
.dual_source_blend_index = sem.dual_source_blend_index,
.component_mask = cmask,
.array_size = sem.num_slots > 1 ? sem.num_slots : 0,
};
if (find_rework_var(ris->nir, &test))
return;
   /* set aside components that can't merge into this variable and rescan them later:
    * - components that don't overlap the current mask
    * - fbfetch and dual-src-blend flags must always match
    */
if (ris->component_mask &&
(!(ris->component_mask & cmask) || ris->fb_fetch_output != sem.fb_fetch_output || ris->dual_source_blend_index != sem.dual_source_blend_index)) {
ris->ignored_component_mask |= cmask;
return;
}
assert(!ris->indirect_only || sem.num_slots > 1);
if (sem.num_slots > 1)
ris->array_size = MAX2(ris->array_size, sem.num_slots);
assert(!ris->component_mask || ris->arrayed_io == io_instr_is_arrayed(intr));
ris->arrayed_io = io_instr_is_arrayed(intr);
ris->component_mask |= cmask;
unsigned bit_size = is_load ? intr->def.bit_size : nir_src_bit_size(intr->src[0]);
assert(!ris->bit_size || ris->bit_size == bit_size);
ris->bit_size = bit_size;
nir_alu_type type = is_load ? nir_intrinsic_dest_type(intr) : nir_intrinsic_src_type(intr);
if (ris->type) {
/* in the case of clashing types, this heuristic guarantees some semblance of a match */
if (ris->type & nir_type_float || type & nir_type_float) {
ris->type = nir_type_float | bit_size;
} else if (ris->type & nir_type_int || type & nir_type_int) {
ris->type = nir_type_int | bit_size;
} else if (ris->type & nir_type_uint || type & nir_type_uint) {
ris->type = nir_type_uint | bit_size;
} else {
assert(bit_size == 1);
ris->type = nir_type_bool;
}
} else {
ris->type = type;
}
update_io_var_name(ris, intr->name);
ris->medium_precision |= sem.medium_precision;
ris->fb_fetch_output |= sem.fb_fetch_output;
ris->dual_source_blend_index |= sem.dual_source_blend_index;
if (ris->stage == MESA_SHADER_VERTEX && ris->mode == nir_var_shader_in)
ris->base = nir_intrinsic_base(intr);
}
/* instruction-level scanning for variable data */
static bool
scan_io_var_usage(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
struct rework_io_state *ris = data;
bool is_load = false;
bool is_input = false;
bool is_interp = false;
/* mode-based filtering */
if (!filter_io_instr(intr, &is_load, &is_input, &is_interp))
return false;
if (ris->mode == nir_var_shader_in) {
if (!is_input)
return false;
} else {
if (is_input)
return false;
}
/* location-based filtering */
nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
if (sem.location != ris->location && (ris->location > sem.location || ris->location + ris->array_size <= sem.location))
return false;
/* only scan indirect i/o when indirect_only is set */
nir_src *src_offset = nir_get_io_offset_src(intr);
if (!nir_src_is_const(*src_offset)) {
if (!ris->indirect_only)
return false;
update_io_var_state(intr, ris);
return false;
}
/* don't scan direct i/o when indirect_only is set */
if (ris->indirect_only)
return false;
update_io_var_state(intr, ris);
return false;
}
/* scan a given i/o slot for state info */
static struct rework_io_state
scan_io_var_slot(nir_shader *nir, nir_variable_mode mode, unsigned location, bool scan_indirects)
{
struct rework_io_state ris = {
.location = location,
.mode = mode,
.stage = nir->info.stage,
.nir = nir,
};
struct rework_io_state test;
do {
update_io_var_name(&test, ris.newname ? ris.newname : ris.name);
test = ris;
/* always run indirect scan first to detect potential overlaps */
if (scan_indirects) {
ris.indirect_only = true;
nir_shader_intrinsics_pass(nir, scan_io_var_usage, nir_metadata_all, &ris);
}
ris.indirect_only = false;
nir_shader_intrinsics_pass(nir, scan_io_var_usage, nir_metadata_all, &ris);
/* keep scanning until no changes found */
} while (memcmp(&ris, &test, offsetof(struct rework_io_state, newname)));
return ris;
}
/* create a variable using explicit/scan info */
static void
create_io_var(nir_shader *nir, struct rework_io_state *ris)
{
char name[1024];
assert(ris->component_mask);
if (ris->newname || ris->name) {
snprintf(name, sizeof(name), "%s", ris->newname ? ris->newname : ris->name);
/* always use builtin name where possible */
} else if (nir->info.stage == MESA_SHADER_VERTEX && ris->mode == nir_var_shader_in) {
snprintf(name, sizeof(name), "%s", gl_vert_attrib_name(ris->location));
} else if (nir->info.stage == MESA_SHADER_FRAGMENT && ris->mode == nir_var_shader_out) {
snprintf(name, sizeof(name), "%s", gl_frag_result_name(ris->location));
} else if (nir_slot_is_sysval_output(ris->location, nir->info.stage)) {
snprintf(name, sizeof(name), "%s", gl_varying_slot_name_for_stage(ris->location, nir->info.stage));
} else {
int c = ffs(ris->component_mask) - 1;
if (c)
snprintf(name, sizeof(name), "slot_%u_c%u", ris->location, c);
else
snprintf(name, sizeof(name), "slot_%u", ris->location);
}
/* calculate vec/array type */
int frac = ffs(ris->component_mask) - 1;
int num_components = util_last_bit(ris->component_mask) - frac;
assert(ris->component_mask == BITFIELD_RANGE(frac, num_components));
const struct glsl_type *vec_type = glsl_vector_type(nir_get_glsl_base_type_for_nir_type(ris->type), num_components);
if (ris->array_size)
vec_type = glsl_array_type(vec_type, ris->array_size, glsl_get_explicit_stride(vec_type));
if (ris->arrayed_io) {
/* tess size may be unknown with generated tcs */
unsigned arrayed = nir->info.stage == MESA_SHADER_GEOMETRY ?
nir->info.gs.vertices_in : 32 /* MAX_PATCH_VERTICES */;
vec_type = glsl_array_type(vec_type, arrayed, glsl_get_explicit_stride(vec_type));
}
nir_variable *var = nir_variable_create(nir, ris->mode, vec_type, name);
var->data.location_frac = frac;
var->data.location = ris->location;
/* gallium vertex inputs use intrinsic 'base' indexing */
if (nir->info.stage == MESA_SHADER_VERTEX && ris->mode == nir_var_shader_in)
var->data.driver_location = ris->base;
var->data.patch = ris->location >= VARYING_SLOT_PATCH0 ||
((nir->info.stage == MESA_SHADER_TESS_CTRL || nir->info.stage == MESA_SHADER_TESS_EVAL) &&
(ris->location == VARYING_SLOT_TESS_LEVEL_INNER || ris->location == VARYING_SLOT_TESS_LEVEL_OUTER));
/* set flat by default: add_derefs will fill this in later after more shader passes */
if (nir->info.stage == MESA_SHADER_FRAGMENT && ris->mode == nir_var_shader_in)
var->data.interpolation = INTERP_MODE_FLAT;
var->data.fb_fetch_output = ris->fb_fetch_output;
var->data.index = ris->dual_source_blend_index;
var->data.precision = ris->medium_precision;
/* only clip/cull dist and tess levels are compact */
if (nir->info.stage != MESA_SHADER_VERTEX || ris->mode != nir_var_shader_in)
var->data.compact = is_clipcull_dist(ris->location) || (ris->location == VARYING_SLOT_TESS_LEVEL_INNER || ris->location == VARYING_SLOT_TESS_LEVEL_OUTER);
}
/* loop the i/o mask and generate variables for specified locations */
static void
loop_io_var_mask(nir_shader *nir, nir_variable_mode mode, bool indirect, bool patch, uint64_t mask)
{
bool is_vertex_input = nir->info.stage == MESA_SHADER_VERTEX && mode == nir_var_shader_in;
u_foreach_bit64(slot, mask) {
if (patch)
slot += VARYING_SLOT_PATCH0;
/* this should've been handled explicitly */
assert(is_vertex_input || !is_clipcull_dist(slot));
unsigned remaining = 0;
do {
/* scan the slot for usage */
struct rework_io_state ris = scan_io_var_slot(nir, mode, slot, indirect);
/* one of these must be true or things have gone very wrong */
assert(indirect || ris.component_mask || find_rework_var(nir, &ris) || remaining);
/* release builds only */
if (!ris.component_mask)
break;
/* whatever reaches this point is either enough info to create a variable or an existing variable */
if (!find_rework_var(nir, &ris))
create_io_var(nir, &ris);
/* scanning may detect multiple potential variables per location at component offsets: process again */
remaining = ris.ignored_component_mask;
} while (remaining);
}
}
/* for a given mode, generate variables */
static void
rework_io_vars(nir_shader *nir, nir_variable_mode mode, struct zink_shader *zs)
{
assert(mode == nir_var_shader_out || mode == nir_var_shader_in);
assert(util_bitcount(mode) == 1);
bool found = false;
/* if no i/o, skip */
if (mode == nir_var_shader_out)
found = nir->info.outputs_written || nir->info.outputs_read || nir->info.patch_outputs_written || nir->info.patch_outputs_read;
else
found = nir->info.inputs_read || nir->info.patch_inputs_read;
if (!found)
return;
/* use local copies to enable incremental processing */
uint64_t inputs_read = nir->info.inputs_read;
uint64_t inputs_read_indirectly = nir->info.inputs_read_indirectly;
uint64_t outputs_accessed = nir->info.outputs_written | nir->info.outputs_read;
uint64_t outputs_accessed_indirectly = nir->info.outputs_accessed_indirectly;
/* fragment outputs are special: handle separately */
if (mode == nir_var_shader_out && nir->info.stage == MESA_SHADER_FRAGMENT) {
assert(!outputs_accessed_indirectly);
u_foreach_bit64(slot, outputs_accessed) {
struct rework_io_state ris = {
.location = slot,
.mode = mode,
.stage = nir->info.stage,
};
/* explicitly handle builtins */
switch (slot) {
case FRAG_RESULT_DEPTH:
case FRAG_RESULT_STENCIL:
case FRAG_RESULT_SAMPLE_MASK:
ris.bit_size = 32;
ris.component_mask = 0x1;
ris.type = slot == FRAG_RESULT_DEPTH ? nir_type_float32 : nir_type_uint32;
create_io_var(nir, &ris);
outputs_accessed &= ~BITFIELD64_BIT(slot);
break;
default:
break;
}
}
/* the rest of the outputs can be generated normally */
loop_io_var_mask(nir, mode, false, false, outputs_accessed);
return;
}
/* vertex inputs are special: handle separately */
if (nir->info.stage == MESA_SHADER_VERTEX && mode == nir_var_shader_in) {
assert(!inputs_read_indirectly);
u_foreach_bit64(slot, inputs_read) {
/* explicitly handle builtins */
if (slot != VERT_ATTRIB_POS && slot != VERT_ATTRIB_POINT_SIZE)
continue;
uint32_t component_mask = slot == VERT_ATTRIB_POINT_SIZE ? 0x1 : 0xf;
struct rework_io_state ris = {
.location = slot,
.mode = mode,
.stage = nir->info.stage,
.bit_size = 32,
.component_mask = component_mask,
.type = nir_type_float32,
.newname = scan_io_var_slot(nir, nir_var_shader_in, slot, false).newname,
};
create_io_var(nir, &ris);
inputs_read &= ~BITFIELD64_BIT(slot);
}
/* the rest of the inputs can be generated normally */
loop_io_var_mask(nir, mode, false, false, inputs_read);
return;
}
/* these are the masks to process based on the mode: nothing "special" as above */
uint64_t mask = mode == nir_var_shader_in ? inputs_read : outputs_accessed;
uint64_t indirect_mask = mode == nir_var_shader_in ? inputs_read_indirectly : outputs_accessed_indirectly;
u_foreach_bit64(slot, mask) {
struct rework_io_state ris = {
.location = slot,
.mode = mode,
.stage = nir->info.stage,
.arrayed_io = (mode == nir_var_shader_in ? zs->arrayed_inputs : zs->arrayed_outputs) & BITFIELD64_BIT(slot),
};
/* explicitly handle builtins */
unsigned max_components = 0;
switch (slot) {
case VARYING_SLOT_FOGC:
/* use intr components */
break;
case VARYING_SLOT_POS:
case VARYING_SLOT_CLIP_VERTEX:
case VARYING_SLOT_PNTC:
case VARYING_SLOT_BOUNDING_BOX0:
case VARYING_SLOT_BOUNDING_BOX1:
max_components = 4;
ris.type = nir_type_float32;
break;
case VARYING_SLOT_CLIP_DIST0:
max_components = nir->info.clip_distance_array_size;
assert(max_components);
ris.type = nir_type_float32;
break;
case VARYING_SLOT_CULL_DIST0:
max_components = nir->info.cull_distance_array_size;
assert(max_components);
ris.type = nir_type_float32;
break;
case VARYING_SLOT_CLIP_DIST1:
case VARYING_SLOT_CULL_DIST1:
mask &= ~BITFIELD64_BIT(slot);
indirect_mask &= ~BITFIELD64_BIT(slot);
continue;
case VARYING_SLOT_TESS_LEVEL_OUTER:
max_components = 4;
ris.type = nir_type_float32;
break;
case VARYING_SLOT_TESS_LEVEL_INNER:
max_components = 2;
ris.type = nir_type_float32;
break;
case VARYING_SLOT_PRIMITIVE_ID:
case VARYING_SLOT_LAYER:
case VARYING_SLOT_VIEWPORT:
case VARYING_SLOT_FACE:
case VARYING_SLOT_VIEW_INDEX:
case VARYING_SLOT_VIEWPORT_MASK:
ris.type = nir_type_int32;
max_components = 1;
break;
case VARYING_SLOT_PSIZ:
max_components = 1;
ris.type = nir_type_float32;
break;
default:
break;
}
if (!max_components)
continue;
switch (slot) {
case VARYING_SLOT_CLIP_DIST0:
case VARYING_SLOT_CLIP_DIST1:
case VARYING_SLOT_CULL_DIST0:
case VARYING_SLOT_CULL_DIST1:
case VARYING_SLOT_TESS_LEVEL_OUTER:
case VARYING_SLOT_TESS_LEVEL_INNER:
/* compact arrays */
ris.component_mask = 0x1;
ris.array_size = max_components;
break;
default:
ris.component_mask = BITFIELD_MASK(max_components);
break;
}
ris.bit_size = 32;
create_io_var(nir, &ris);
mask &= ~BITFIELD64_BIT(slot);
/* eliminate clip/cull distance scanning early */
indirect_mask &= ~BITFIELD64_BIT(slot);
}
/* patch i/o */
if ((nir->info.stage == MESA_SHADER_TESS_CTRL && mode == nir_var_shader_out) ||
(nir->info.stage == MESA_SHADER_TESS_EVAL && mode == nir_var_shader_in)) {
uint64_t patch_outputs_accessed = nir->info.patch_outputs_read | nir->info.patch_outputs_written;
uint64_t indirect_patch_mask = mode == nir_var_shader_in ? nir->info.patch_inputs_read_indirectly : nir->info.patch_outputs_accessed_indirectly;
uint64_t patch_mask = mode == nir_var_shader_in ? nir->info.patch_inputs_read : patch_outputs_accessed;
loop_io_var_mask(nir, mode, true, true, indirect_patch_mask);
loop_io_var_mask(nir, mode, false, true, patch_mask);
}
/* regular i/o */
loop_io_var_mask(nir, mode, true, false, indirect_mask);
loop_io_var_mask(nir, mode, false, false, mask);
}
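/* type size callback: i/o is counted in vec4 attribute slots */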
static int
zink_type_size(const struct glsl_type *type, bool bindless)
{
return glsl_count_attribute_slots(type, false);
}
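/* callback for nir_lower_mem_access_bit_sizes: unaligned loads/stores are split down to their alignment */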
static nir_mem_access_size_align
mem_access_size_align_cb(nir_intrinsic_op intrin, uint8_t bytes,
uint8_t bit_size, uint32_t align,
uint32_t align_offset, bool offset_is_const,
const void *cb_data)
{
align = nir_combined_align(align, align_offset);
assert(util_is_power_of_two_nonzero(align));
/* simply drop the bit_size for unaligned load/stores */
if (align < (bit_size / 8)) {
return (nir_mem_access_size_align){
.num_components = MIN2(bytes / align, 4),
.bit_size = align * 8,
.align = align,
};
} else {
return (nir_mem_access_size_align){
.num_components = MIN2(bytes / (bit_size / 8), 4),
.bit_size = bit_size,
.align = bit_size / 8,
};
}
}
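/* scratch variant: force every access to the single bit size found by the scan below (passed via cb_data) */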
static nir_mem_access_size_align
mem_access_scratch_size_align_cb(nir_intrinsic_op intrin, uint8_t bytes,
uint8_t bit_size, uint32_t align,
uint32_t align_offset, bool offset_is_const,
const void *cb_data)
{
bit_size = *(const uint8_t *)cb_data;
align = nir_combined_align(align, align_offset);
assert(util_is_power_of_two_nonzero(align));
return (nir_mem_access_size_align){
.num_components = MIN2(bytes / (bit_size / 8), 4),
.bit_size = bit_size,
.align = bit_size / 8,
};
}
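/* find the smallest bit size used by scratch loads/stores */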
static bool
alias_scratch_memory_scan_bit_size(struct nir_builder *b, nir_intrinsic_instr *instr, void *data)
{
uint8_t *bit_size = data;
switch (instr->intrinsic) {
case nir_intrinsic_load_scratch:
*bit_size = MIN2(*bit_size, instr->def.bit_size);
return false;
case nir_intrinsic_store_scratch:
*bit_size = MIN2(*bit_size, instr->src[0].ssa->bit_size);
return false;
default:
return false;
}
}
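/* rewrite all scratch access to the smallest bit size in use so scratch slots can alias */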
static bool
alias_scratch_memory(nir_shader *nir)
{
uint8_t bit_size = 64;
nir_shader_intrinsics_pass(nir, alias_scratch_memory_scan_bit_size, nir_metadata_all, &bit_size);
nir_lower_mem_access_bit_sizes_options lower_scratch_mem_access_options = {
.modes = nir_var_function_temp,
.may_lower_unaligned_stores_to_atomics = true,
.callback = mem_access_scratch_size_align_cb,
.cb_data = &bit_size,
};
return nir_lower_mem_access_bit_sizes(nir, &lower_scratch_mem_access_options);
}
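/* nir_lower_alu_width callback: split wide (vec8/vec16) alu ops down to vec4 */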
static uint8_t
lower_vec816_alu(const nir_instr *instr, const void *cb_data)
{
return 4;
}
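/* rewrite generic vertex attrib locations to match the intrinsic 'base', i.e., the gallium vertex element index */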
static bool
fix_vertex_input_locations_instr(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
bool is_load = false;
bool is_input = false;
bool is_interp = false;
if (!filter_io_instr(intr, &is_load, &is_input, &is_interp) || !is_input)
return false;
nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
if (sem.location < VERT_ATTRIB_GENERIC0)
return false;
sem.location = VERT_ATTRIB_GENERIC0 + nir_intrinsic_base(intr);
nir_intrinsic_set_io_semantics(intr, sem);
return true;
}
static bool
fix_vertex_input_locations(nir_shader *nir)
{
if (nir->info.stage != MESA_SHADER_VERTEX)
return false;
return nir_shader_intrinsics_pass(nir, fix_vertex_input_locations_instr, nir_metadata_all, NULL);
}
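/* per-location scan state for merging scalarized i/o back into vectors */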
struct trivial_revectorize_state {
bool has_xfb;
uint32_t component_mask;
nir_intrinsic_instr *base;
nir_intrinsic_instr *next_emit_vertex;
nir_intrinsic_instr *merge[NIR_MAX_VEC_COMPONENTS];
struct set *deletions;
};
/* always skip xfb; scalarized xfb is preferred */
static bool
intr_has_xfb(nir_intrinsic_instr *intr)
{
if (!nir_intrinsic_has_io_xfb(intr))
return false;
for (unsigned i = 0; i < 2; i++) {
if (nir_intrinsic_io_xfb(intr).out[i].num_components || nir_intrinsic_io_xfb2(intr).out[i].num_components) {
return true;
}
}
return false;
}
/* helper to avoid vectorizing i/o for different vertices */
static nir_intrinsic_instr *
find_next_emit_vertex(nir_intrinsic_instr *intr)
{
bool found = false;
nir_foreach_instr_safe(instr, intr->instr.block) {
if (instr->type == nir_instr_type_intrinsic) {
nir_intrinsic_instr *test_intr = nir_instr_as_intrinsic(instr);
if (!found && test_intr != intr)
continue;
if (!found) {
assert(intr == test_intr);
found = true;
continue;
}
if (test_intr->intrinsic == nir_intrinsic_emit_vertex)
return test_intr;
}
}
return NULL;
}
/* scan for vectorizable instrs on a given location */
static bool
trivial_revectorize_intr_scan(nir_shader *nir, nir_intrinsic_instr *intr, struct trivial_revectorize_state *state)
{
nir_intrinsic_instr *base = state->base;
if (intr == base)
return false;
if (intr->intrinsic != base->intrinsic)
return false;
if (_mesa_set_search(state->deletions, intr))
return false;
bool is_load = false;
bool is_input = false;
bool is_interp = false;
filter_io_instr(intr, &is_load, &is_input, &is_interp);
nir_io_semantics base_sem = nir_intrinsic_io_semantics(base);
nir_io_semantics test_sem = nir_intrinsic_io_semantics(intr);
nir_alu_type base_type = is_load ? nir_intrinsic_dest_type(base) : nir_intrinsic_src_type(base);
nir_alu_type test_type = is_load ? nir_intrinsic_dest_type(intr) : nir_intrinsic_src_type(intr);
int c = nir_intrinsic_component(intr);
/* already detected */
if (state->component_mask & BITFIELD_BIT(c))
return false;
/* not a match */
if (base_sem.location != test_sem.location || base_sem.num_slots != test_sem.num_slots || base_type != test_type)
return false;
/* only vectorize when all srcs match */
for (unsigned i = !is_input; i < nir_intrinsic_infos[intr->intrinsic].num_srcs; i++) {
if (!nir_srcs_equal(intr->src[i], base->src[i]))
return false;
}
/* never match xfb */
state->has_xfb |= intr_has_xfb(intr);
if (state->has_xfb)
return false;
if (nir->info.stage == MESA_SHADER_GEOMETRY) {
/* only match same vertex */
if (state->next_emit_vertex != find_next_emit_vertex(intr))
return false;
}
uint32_t mask = is_load ? BITFIELD_RANGE(c, intr->num_components) : (nir_intrinsic_write_mask(intr) << c);
state->component_mask |= mask;
u_foreach_bit(component, mask)
state->merge[component] = intr;
return true;
}
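/* for a scalar i/o instr, find matching instrs in the block and merge contiguous components into a single vectorized load/store */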
static bool
trivial_revectorize_scan(struct nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
bool is_load = false;
bool is_input = false;
bool is_interp = false;
if (!filter_io_instr(intr, &is_load, &is_input, &is_interp))
return false;
if (intr->num_components != 1)
return false;
nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
if (!is_input || b->shader->info.stage != MESA_SHADER_VERTEX) {
/* always ignore compact arrays */
switch (sem.location) {
case VARYING_SLOT_CLIP_DIST0:
case VARYING_SLOT_CLIP_DIST1:
case VARYING_SLOT_CULL_DIST0:
case VARYING_SLOT_CULL_DIST1:
case VARYING_SLOT_TESS_LEVEL_INNER:
case VARYING_SLOT_TESS_LEVEL_OUTER:
return false;
default: break;
}
}
/* always ignore to-be-deleted instrs */
if (_mesa_set_search(data, intr))
return false;
/* never vectorize xfb */
if (intr_has_xfb(intr))
return false;
int ic = nir_intrinsic_component(intr);
uint32_t mask = is_load ? BITFIELD_RANGE(ic, intr->num_components) : (nir_intrinsic_write_mask(intr) << ic);
/* already vectorized */
if (util_bitcount(mask) == 4)
return false;
struct trivial_revectorize_state state = {
.component_mask = mask,
.base = intr,
/* avoid clobbering i/o for different vertices */
.next_emit_vertex = b->shader->info.stage == MESA_SHADER_GEOMETRY ? find_next_emit_vertex(intr) : NULL,
.deletions = data,
};
u_foreach_bit(bit, mask)
state.merge[bit] = intr;
bool progress = false;
nir_foreach_instr(instr, intr->instr.block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *test_intr = nir_instr_as_intrinsic(instr);
/* no matching across vertex emission */
if (test_intr->intrinsic == nir_intrinsic_emit_vertex)
break;
progress |= trivial_revectorize_intr_scan(b->shader, test_intr, &state);
}
if (!progress || state.has_xfb)
return false;
/* verify nothing crazy happened */
assert(state.component_mask);
for (unsigned i = 0; i < 4; i++) {
assert(!state.merge[i] || !intr_has_xfb(state.merge[i]));
}
unsigned first_component = ffs(state.component_mask) - 1;
unsigned num_components = util_bitcount(state.component_mask);
unsigned num_contiguous = 0;
uint32_t contiguous_mask = 0;
for (unsigned i = 0; i < num_components; i++) {
unsigned c = i + first_component;
/* calc mask of contiguous components to vectorize */
if (state.component_mask & BITFIELD_BIT(c)) {
num_contiguous++;
contiguous_mask |= BITFIELD_BIT(c);
}
      /* on the first gap or at the last component, vectorize */
if (!(state.component_mask & BITFIELD_BIT(c)) || i == num_components - 1) {
if (num_contiguous > 1) {
/* reindex to enable easy src/dest index comparison */
nir_index_ssa_defs(nir_shader_get_entrypoint(b->shader));
/* determine the first/last instr to use for the base (vectorized) load/store */
unsigned first_c = ffs(contiguous_mask) - 1;
nir_intrinsic_instr *base = NULL;
unsigned test_idx = is_load ? UINT32_MAX : 0;
for (unsigned j = 0; j < num_contiguous; j++) {
unsigned merge_c = j + first_c;
nir_intrinsic_instr *merge_intr = state.merge[merge_c];
/* avoid breaking ssa ordering by using:
* - first instr for vectorized load
* - last instr for vectorized store
* this guarantees all srcs have been seen
*/
if ((is_load && merge_intr->def.index < test_idx) ||
(!is_load && merge_intr->src[0].ssa->index >= test_idx)) {
test_idx = is_load ? merge_intr->def.index : merge_intr->src[0].ssa->index;
base = merge_intr;
}
}
assert(base);
/* update instr components */
nir_intrinsic_set_component(base, nir_intrinsic_component(state.merge[first_c]));
unsigned orig_components = base->num_components;
base->num_components = num_contiguous;
/* do rewrites after loads and before stores */
b->cursor = is_load ? nir_after_instr(&base->instr) : nir_before_instr(&base->instr);
if (is_load) {
base->def.num_components = num_contiguous;
/* iterate the contiguous loaded components and rewrite merged dests */
for (unsigned j = 0; j < num_contiguous; j++) {
unsigned merge_c = j + first_c;
nir_intrinsic_instr *merge_intr = state.merge[merge_c];
/* detect if the merged instr loaded multiple components and use swizzle mask for rewrite */
unsigned use_components = merge_intr == base ? orig_components : merge_intr->def.num_components;
nir_def *swiz = nir_channels(b, &base->def, BITFIELD_RANGE(j, use_components));
nir_def_rewrite_uses_after(&merge_intr->def, swiz, merge_intr == base ? swiz->parent_instr : &merge_intr->instr);
j += use_components - 1;
}
} else {
nir_def *comp[NIR_MAX_VEC_COMPONENTS];
/* generate swizzled vec of store components and rewrite store src */
for (unsigned j = 0; j < num_contiguous; j++) {
unsigned merge_c = j + first_c;
nir_intrinsic_instr *merge_intr = state.merge[merge_c];
/* detect if the merged instr stored multiple components and extract them for rewrite */
unsigned use_components = merge_intr == base ? orig_components : merge_intr->num_components;
for (unsigned k = 0; k < use_components; k++)
comp[j + k] = nir_channel(b, merge_intr->src[0].ssa, k);
j += use_components - 1;
}
nir_def *val = nir_vec(b, comp, num_contiguous);
nir_src_rewrite(&base->src[0], val);
nir_intrinsic_set_write_mask(base, BITFIELD_MASK(num_contiguous));
}
/* deleting instructions during a foreach explodes the compiler, so delete later */
for (unsigned j = 0; j < num_contiguous; j++) {
unsigned merge_c = j + first_c;
nir_intrinsic_instr *merge_intr = state.merge[merge_c];
if (merge_intr != base)
_mesa_set_add(data, &merge_intr->instr);
}
}
contiguous_mask = 0;
num_contiguous = 0;
}
}
return true;
}
/* attempt to revectorize scalar i/o, ignoring xfb and "hard stuff" */
static bool
trivial_revectorize(nir_shader *nir)
{
struct set deletions;
if (nir->info.stage > MESA_SHADER_FRAGMENT)
return false;
_mesa_set_init(&deletions, NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);
bool progress = nir_shader_intrinsics_pass(nir, trivial_revectorize_scan, nir_metadata_dominance, &deletions);
/* now it's safe to delete */
set_foreach_remove(&deletions, entry) {
nir_instr *instr = (void*)entry->key;
nir_instr_remove(instr);
}
ralloc_free(deletions.table);
return progress;
}
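/* create a zink_shader from gallium-supplied nir: run lowering passes, rebuild i/o variables,
 * assign descriptor bindings, and serialize the result for later compiles
 */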
struct zink_shader *
zink_shader_create(struct zink_screen *screen, struct nir_shader *nir)
{
struct zink_shader *ret = rzalloc(NULL, struct zink_shader);
bool have_psiz = false;
ret->has_edgeflags = nir->info.stage == MESA_SHADER_VERTEX &&
nir->info.outputs_written & VARYING_BIT_EDGE;
ret->sinfo.have_vulkan_memory_model = screen->info.have_KHR_vulkan_memory_model;
ret->sinfo.have_workgroup_memory_explicit_layout = screen->info.have_KHR_workgroup_memory_explicit_layout;
if (screen->info.have_KHR_shader_float_controls) {
if (screen->info.props12.shaderDenormFlushToZeroFloat16)
ret->sinfo.float_controls.flush_denorms |= 0x1;
if (screen->info.props12.shaderDenormFlushToZeroFloat32)
ret->sinfo.float_controls.flush_denorms |= 0x2;
if (screen->info.props12.shaderDenormFlushToZeroFloat64)
ret->sinfo.float_controls.flush_denorms |= 0x4;
if (screen->info.props12.shaderDenormPreserveFloat16)
ret->sinfo.float_controls.preserve_denorms |= 0x1;
if (screen->info.props12.shaderDenormPreserveFloat32)
ret->sinfo.float_controls.preserve_denorms |= 0x2;
if (screen->info.props12.shaderDenormPreserveFloat64)
ret->sinfo.float_controls.preserve_denorms |= 0x4;
ret->sinfo.float_controls.denorms_all_independence =
screen->info.props12.denormBehaviorIndependence == VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_ALL;
ret->sinfo.float_controls.denorms_32_bit_independence =
ret->sinfo.float_controls.denorms_all_independence ||
screen->info.props12.denormBehaviorIndependence == VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_32_BIT_ONLY;
}
ret->sinfo.bindless_set_idx = screen->desc_set_id[ZINK_DESCRIPTOR_BINDLESS];
util_queue_fence_init(&ret->precompile.fence);
util_dynarray_init(&ret->pipeline_libs, ret);
ret->hash = _mesa_hash_pointer(ret);
ret->programs = _mesa_pointer_set_create(NULL);
simple_mtx_init(&ret->lock, mtx_plain);
if (nir->info.stage == MESA_SHADER_KERNEL) {
nir_lower_mem_access_bit_sizes_options lower_mem_access_options = {
.modes = nir_var_all ^ nir_var_function_temp,
.may_lower_unaligned_stores_to_atomics = true,
.callback = mem_access_size_align_cb,
.cb_data = screen,
};
NIR_PASS_V(nir, nir_lower_mem_access_bit_sizes, &lower_mem_access_options);
NIR_PASS_V(nir, alias_scratch_memory);
NIR_PASS_V(nir, nir_lower_alu_width, lower_vec816_alu, NULL);
NIR_PASS_V(nir, nir_lower_alu_vec8_16_srcs);
}
NIR_PASS_V(nir, nir_lower_io_to_scalar, nir_var_shader_in | nir_var_shader_out, NULL, NULL);
optimize_nir(nir, NULL, true);
nir_foreach_variable_with_modes(var, nir, nir_var_shader_in | nir_var_shader_out) {
if (glsl_type_is_image(var->type) || glsl_type_is_sampler(var->type)) {
NIR_PASS_V(nir, lower_bindless_io);
break;
}
}
if (nir->info.stage < MESA_SHADER_FRAGMENT)
nir_gather_xfb_info_from_intrinsics(nir);
NIR_PASS_V(nir, fix_vertex_input_locations);
nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
scan_nir(screen, nir, ret);
NIR_PASS_V(nir, nir_opt_vectorize, NULL, NULL);
NIR_PASS_V(nir, trivial_revectorize);
if (nir->info.io_lowered) {
rework_io_vars(nir, nir_var_shader_in, ret);
rework_io_vars(nir, nir_var_shader_out, ret);
nir_sort_variables_by_location(nir, nir_var_shader_in);
nir_sort_variables_by_location(nir, nir_var_shader_out);
}
if (nir->info.stage < MESA_SHADER_COMPUTE)
create_gfx_pushconst(nir);
if (nir->info.stage == MESA_SHADER_TESS_CTRL ||
nir->info.stage == MESA_SHADER_TESS_EVAL)
NIR_PASS_V(nir, nir_lower_io_arrays_to_elements_no_indirects, false);
if (nir->info.stage < MESA_SHADER_FRAGMENT)
have_psiz = check_psiz(nir);
if (nir->info.stage == MESA_SHADER_FRAGMENT)
ret->flat_flags = zink_flat_flags(nir);
if (!gl_shader_stage_is_compute(nir->info.stage) && nir->info.separate_shader)
NIR_PASS_V(nir, fixup_io_locations);
NIR_PASS_V(nir, lower_basevertex);
NIR_PASS_V(nir, lower_baseinstance);
NIR_PASS_V(nir, split_bitfields);
if (!screen->info.feats.features.shaderStorageImageMultisample)
NIR_PASS_V(nir, strip_tex_ms);
NIR_PASS_V(nir, nir_lower_frexp); /* TODO: Use the spirv instructions for this. */
if (screen->info.have_EXT_shader_demote_to_helper_invocation) {
NIR_PASS_V(nir, nir_lower_discard_or_demote, true);
}
if (screen->need_2D_zs)
NIR_PASS_V(nir, lower_1d_shadow, screen);
{
nir_lower_subgroups_options subgroup_options = {0};
subgroup_options.lower_to_scalar = true;
subgroup_options.subgroup_size = screen->info.props11.subgroupSize;
subgroup_options.ballot_bit_size = 32;
subgroup_options.ballot_components = 4;
subgroup_options.lower_subgroup_masks = true;
if (!(screen->info.subgroup.supportedStages & mesa_to_vk_shader_stage(clamp_stage(&nir->info)))) {
subgroup_options.subgroup_size = 1;
subgroup_options.lower_vote_trivial = true;
}
subgroup_options.lower_inverse_ballot = true;
NIR_PASS_V(nir, nir_lower_subgroups, &subgroup_options);
}
optimize_nir(nir, NULL, true);
NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp, NULL);
NIR_PASS_V(nir, nir_lower_discard_if, (nir_lower_discard_if_to_cf |
nir_lower_demote_if_to_cf |
nir_lower_terminate_if_to_cf));
bool needs_size = analyze_io(ret, nir);
NIR_PASS_V(nir, unbreak_bos, ret, needs_size);
/* run in compile if there could be inlined uniforms */
if (!screen->driconf.inline_uniforms && !nir->info.num_inlinable_uniforms) {
NIR_PASS_V(nir, nir_lower_io_to_scalar, nir_var_mem_global | nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_mem_shared, NULL, NULL);
NIR_PASS_V(nir, rewrite_bo_access, screen);
NIR_PASS_V(nir, remove_bo_access, ret);
}
struct zink_bindless_info bindless = {0};
bindless.bindless_set = screen->desc_set_id[ZINK_DESCRIPTOR_BINDLESS];
nir_foreach_variable_with_modes(var, nir, nir_var_shader_in | nir_var_shader_out)
var->data.is_xfb = false;
optimize_nir(nir, NULL, true);
prune_io(nir);
unsigned sampler_mask = 0;
if (nir->info.stage == MESA_SHADER_KERNEL) {
NIR_PASS_V(nir, type_images, &sampler_mask);
enum zink_descriptor_type ztype = ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW;
VkDescriptorType vktype = VK_DESCRIPTOR_TYPE_SAMPLER;
u_foreach_bit(s, sampler_mask) {
ret->bindings[ztype][ret->num_bindings[ztype]].index = s;
ret->bindings[ztype][ret->num_bindings[ztype]].binding = zink_binding(MESA_SHADER_KERNEL, vktype, s, screen->compact_descriptors);
ret->bindings[ztype][ret->num_bindings[ztype]].type = vktype;
ret->bindings[ztype][ret->num_bindings[ztype]].size = 1;
ret->num_bindings[ztype]++;
}
ret->sinfo.sampler_mask = sampler_mask;
}
unsigned ubo_binding_mask = 0;
unsigned ssbo_binding_mask = 0;
foreach_list_typed_reverse_safe(nir_variable, var, node, &nir->variables) {
if (_nir_shader_variable_has_mode(var, nir_var_uniform |
nir_var_image |
nir_var_mem_ubo |
nir_var_mem_ssbo)) {
enum zink_descriptor_type ztype;
const struct glsl_type *type = glsl_without_array(var->type);
if (var->data.mode == nir_var_mem_ubo) {
ztype = ZINK_DESCRIPTOR_TYPE_UBO;
/* buffer 0 is a push descriptor */
var->data.descriptor_set = !!var->data.driver_location;
var->data.binding = !var->data.driver_location ? clamp_stage(&nir->info) :
zink_binding(nir->info.stage,
VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
var->data.driver_location,
screen->compact_descriptors);
assert(var->data.driver_location || var->data.binding < 10);
VkDescriptorType vktype = !var->data.driver_location ? VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC : VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER;
int binding = var->data.binding;
if (!var->data.driver_location) {
ret->has_uniforms = true;
} else if (!(ubo_binding_mask & BITFIELD_BIT(binding))) {
ret->bindings[ztype][ret->num_bindings[ztype]].index = var->data.driver_location;
ret->bindings[ztype][ret->num_bindings[ztype]].binding = binding;
ret->bindings[ztype][ret->num_bindings[ztype]].type = vktype;
ret->bindings[ztype][ret->num_bindings[ztype]].size = glsl_get_length(var->type);
assert(ret->bindings[ztype][ret->num_bindings[ztype]].size);
ret->num_bindings[ztype]++;
ubo_binding_mask |= BITFIELD_BIT(binding);
}
} else if (var->data.mode == nir_var_mem_ssbo) {
ztype = ZINK_DESCRIPTOR_TYPE_SSBO;
var->data.descriptor_set = screen->desc_set_id[ztype];
var->data.binding = zink_binding(clamp_stage(&nir->info),
VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
var->data.driver_location,
screen->compact_descriptors);
if (!(ssbo_binding_mask & BITFIELD_BIT(var->data.binding))) {
ret->bindings[ztype][ret->num_bindings[ztype]].index = var->data.driver_location;
ret->bindings[ztype][ret->num_bindings[ztype]].binding = var->data.binding;
ret->bindings[ztype][ret->num_bindings[ztype]].type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
ret->bindings[ztype][ret->num_bindings[ztype]].size = glsl_get_length(var->type);
assert(ret->bindings[ztype][ret->num_bindings[ztype]].size);
ret->num_bindings[ztype]++;
ssbo_binding_mask |= BITFIELD_BIT(var->data.binding);
}
} else {
assert(var->data.mode == nir_var_uniform ||
var->data.mode == nir_var_image);
if (var->data.bindless) {
ret->bindless = true;
handle_bindless_var(nir, var, type, &bindless);
} else if (glsl_type_is_sampler(type) || glsl_type_is_image(type)) {
VkDescriptorType vktype = glsl_type_is_image(type) ? zink_image_type(type) : zink_sampler_type(type);
if (nir->info.stage == MESA_SHADER_KERNEL && vktype == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
vktype = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE;
ztype = zink_desc_type_from_vktype(vktype);
var->data.driver_location = var->data.binding;
var->data.descriptor_set = screen->desc_set_id[ztype];
var->data.binding = zink_binding(nir->info.stage, vktype, var->data.driver_location, screen->compact_descriptors);
ret->bindings[ztype][ret->num_bindings[ztype]].index = var->data.driver_location;
ret->bindings[ztype][ret->num_bindings[ztype]].binding = var->data.binding;
ret->bindings[ztype][ret->num_bindings[ztype]].type = vktype;
if (glsl_type_is_array(var->type))
ret->bindings[ztype][ret->num_bindings[ztype]].size = glsl_get_aoa_size(var->type);
else
ret->bindings[ztype][ret->num_bindings[ztype]].size = 1;
ret->num_bindings[ztype]++;
} else if (var->data.mode == nir_var_uniform) {
/* this is a dead uniform */
var->data.mode = 0;
exec_node_remove(&var->node);
}
}
}
}
bool bindless_lowered = false;
NIR_PASS(bindless_lowered, nir, lower_bindless, &bindless);
ret->bindless |= bindless_lowered;
if (!screen->info.feats.features.shaderInt64 || !screen->info.feats.features.shaderFloat64)
NIR_PASS_V(nir, lower_64bit_vars, screen->info.feats.features.shaderInt64);
if (nir->info.stage != MESA_SHADER_KERNEL)
NIR_PASS_V(nir, match_tex_dests, ret);
if (!nir->info.internal)
nir_foreach_shader_out_variable(var, nir)
var->data.explicit_xfb_buffer = 0;
if (nir->xfb_info && nir->xfb_info->output_count && nir->info.outputs_written)
update_so_info(ret, nir, nir->info.outputs_written, have_psiz);
zink_shader_serialize_blob(nir, &ret->blob);
memcpy(&ret->info, &nir->info, sizeof(nir->info));
ret->info.name = ralloc_strdup(ret, nir->info.name);
ret->can_inline = true;
return ret;
}
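/* screen-level nir finalization: lower unsupported tex ops, optimize, and flag inlinable uniforms */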
char *
zink_shader_finalize(struct pipe_screen *pscreen, void *nirptr)
{
struct zink_screen *screen = zink_screen(pscreen);
nir_shader *nir = nirptr;
nir_lower_tex_options tex_opts = {
.lower_invalid_implicit_lod = true,
};
/*
Sampled Image must be an object whose type is OpTypeSampledImage.
The Dim operand of the underlying OpTypeImage must be 1D, 2D, 3D,
or Rect, and the Arrayed and MS operands must be 0.
- SPIRV, OpImageSampleProj* opcodes
*/
tex_opts.lower_txp = BITFIELD_BIT(GLSL_SAMPLER_DIM_CUBE) |
BITFIELD_BIT(GLSL_SAMPLER_DIM_MS);
tex_opts.lower_txp_array = true;
if (!screen->info.feats.features.shaderImageGatherExtended)
tex_opts.lower_tg4_offsets = true;
NIR_PASS_V(nir, nir_lower_tex, &tex_opts);
optimize_nir(nir, NULL, false);
if (nir->info.stage == MESA_SHADER_VERTEX)
nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
if (screen->driconf.inline_uniforms)
nir_find_inlinable_uniforms(nir);
return NULL;
}
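/* destroy precompile objects and descriptor state, then free the shader */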
void
zink_shader_free(struct zink_screen *screen, struct zink_shader *shader)
{
_mesa_set_destroy(shader->programs, NULL);
util_queue_fence_wait(&shader->precompile.fence);
util_queue_fence_destroy(&shader->precompile.fence);
zink_descriptor_shader_deinit(screen, shader);
if (screen->info.have_EXT_shader_object) {
VKSCR(DestroyShaderEXT)(screen->dev, shader->precompile.obj.obj, NULL);
} else {
if (shader->precompile.obj.mod)
VKSCR(DestroyShaderModule)(screen->dev, shader->precompile.obj.mod, NULL);
if (shader->precompile.gpl)
VKSCR(DestroyPipeline)(screen->dev, shader->precompile.gpl, NULL);
}
blob_finish(&shader->blob);
ralloc_free(shader->spirv);
free(shader->precompile.bindings);
ralloc_free(shader);
}
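/* unlink a gfx shader from every program and pipeline lib that references it,
 * destroy any shaders generated from it, then free it
 */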
void
zink_gfx_shader_free(struct zink_screen *screen, struct zink_shader *shader)
{
assert(shader->info.stage != MESA_SHADER_COMPUTE);
util_queue_fence_wait(&shader->precompile.fence);
set_foreach(shader->programs, entry) {
struct zink_gfx_program *prog = (void*)entry->key;
gl_shader_stage stage = shader->info.stage;
assert(stage < ZINK_GFX_SHADER_COUNT);
unsigned stages_present = prog->stages_present;
if (prog->shaders[MESA_SHADER_TESS_CTRL] &&
prog->shaders[MESA_SHADER_TESS_CTRL]->non_fs.is_generated)
stages_present &= ~BITFIELD_BIT(MESA_SHADER_TESS_CTRL);
unsigned idx = zink_program_cache_stages(stages_present);
if (!prog->base.removed && prog->stages_present == prog->stages_remaining &&
(stage == MESA_SHADER_FRAGMENT || !shader->non_fs.is_generated)) {
struct hash_table *ht = &prog->base.ctx->program_cache[idx];
simple_mtx_lock(&prog->base.ctx->program_lock[idx]);
struct hash_entry *he = _mesa_hash_table_search(ht, prog->shaders);
assert(he && he->data == prog);
_mesa_hash_table_remove(ht, he);
prog->base.removed = true;
simple_mtx_unlock(&prog->base.ctx->program_lock[idx]);
util_queue_fence_wait(&prog->base.cache_fence);
for (unsigned r = 0; r < ARRAY_SIZE(prog->pipelines); r++) {
for (int i = 0; i < ARRAY_SIZE(prog->pipelines[0]); ++i) {
hash_table_foreach(&prog->pipelines[r][i], table_entry) {
struct zink_gfx_pipeline_cache_entry *pc_entry = table_entry->data;
util_queue_fence_wait(&pc_entry->fence);
}
}
}
}
if (stage == MESA_SHADER_FRAGMENT || !shader->non_fs.is_generated) {
prog->shaders[stage] = NULL;
prog->stages_remaining &= ~BITFIELD_BIT(stage);
}
/* only remove generated tcs during parent tes destruction */
if (stage == MESA_SHADER_TESS_EVAL && shader->non_fs.generated_tcs)
prog->shaders[MESA_SHADER_TESS_CTRL] = NULL;
if (stage != MESA_SHADER_FRAGMENT &&
prog->shaders[MESA_SHADER_GEOMETRY] &&
prog->shaders[MESA_SHADER_GEOMETRY]->non_fs.parent ==
shader) {
prog->shaders[MESA_SHADER_GEOMETRY] = NULL;
}
zink_gfx_program_reference(screen, &prog, NULL);
}
while (util_dynarray_contains(&shader->pipeline_libs, struct zink_gfx_lib_cache*)) {
struct zink_gfx_lib_cache *libs = util_dynarray_pop(&shader->pipeline_libs, struct zink_gfx_lib_cache*);
if (!libs->removed) {
libs->removed = true;
unsigned idx = zink_program_cache_stages(libs->stages_present);
simple_mtx_lock(&screen->pipeline_libs_lock[idx]);
_mesa_set_remove_key(&screen->pipeline_libs[idx], libs);
simple_mtx_unlock(&screen->pipeline_libs_lock[idx]);
}
zink_gfx_lib_cache_unref(screen, libs);
}
if (shader->info.stage == MESA_SHADER_TESS_EVAL &&
shader->non_fs.generated_tcs) {
/* automatically destroy generated tcs shaders when tes is destroyed */
zink_gfx_shader_free(screen, shader->non_fs.generated_tcs);
shader->non_fs.generated_tcs = NULL;
}
if (shader->info.stage != MESA_SHADER_FRAGMENT) {
for (unsigned int i = 0; i < ARRAY_SIZE(shader->non_fs.generated_gs); i++) {
for (int j = 0; j < ARRAY_SIZE(shader->non_fs.generated_gs[0]); j++) {
if (shader->non_fs.generated_gs[i][j]) {
/* automatically destroy generated gs shaders when owner is destroyed */
zink_gfx_shader_free(screen, shader->non_fs.generated_gs[i][j]);
shader->non_fs.generated_gs[i][j] = NULL;
}
}
}
}
zink_shader_free(screen, shader);
}
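/* compile a tcs by patching the patch vertex count directly into its stored spirv */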
struct zink_shader_object
zink_shader_tcs_compile(struct zink_screen *screen, struct zink_shader *zs, unsigned patch_vertices, bool can_shobj, struct zink_program *pg)
{
assert(zs->info.stage == MESA_SHADER_TESS_CTRL);
/* shortcut all the nir passes since we just have to change this one word */
zs->spirv->words[zs->spirv->tcs_vertices_out_word] = patch_vertices;
return zink_shader_spirv_compile(screen, zs, NULL, can_shobj, pg);
}
/* creating a passthrough tcs shader that's roughly:
#version 150
#extension GL_ARB_tessellation_shader : require
in vec4 some_var[gl_MaxPatchVertices];
out vec4 some_var_out;
layout(push_constant) uniform tcsPushConstants {
layout(offset = 0) float TessLevelInner[2];
layout(offset = 8) float TessLevelOuter[4];
} u_tcsPushConstants;
layout(vertices = $vertices_per_patch) out;
void main()
{
gl_TessLevelInner = u_tcsPushConstants.TessLevelInner;
gl_TessLevelOuter = u_tcsPushConstants.TessLevelOuter;
some_var_out = some_var[gl_InvocationID];
}
*/
struct zink_shader *
zink_shader_tcs_create(struct zink_screen *screen, nir_shader *tes, unsigned vertices_per_patch, nir_shader **nir_ret)
{
struct zink_shader *ret = rzalloc(NULL, struct zink_shader);
util_queue_fence_init(&ret->precompile.fence);
ret->hash = _mesa_hash_pointer(ret);
ret->programs = _mesa_pointer_set_create(NULL);
simple_mtx_init(&ret->lock, mtx_plain);
nir_shader *nir = nir_shader_create(NULL, MESA_SHADER_TESS_CTRL, &screen->nir_options, NULL);
nir_function *fn = nir_function_create(nir, "main");
fn->is_entrypoint = true;
nir_function_impl *impl = nir_function_impl_create(fn);
nir_builder b = nir_builder_at(nir_before_impl(impl));
nir_def *invocation_id = nir_load_invocation_id(&b);
nir_foreach_shader_in_variable(var, tes) {
if (var->data.location == VARYING_SLOT_TESS_LEVEL_INNER || var->data.location == VARYING_SLOT_TESS_LEVEL_OUTER)
continue;
const struct glsl_type *in_type = var->type;
const struct glsl_type *out_type = var->type;
char buf[1024];
snprintf(buf, sizeof(buf), "%s_out", var->name);
if (!nir_is_arrayed_io(var, MESA_SHADER_TESS_EVAL)) {
const struct glsl_type *type = var->type;
in_type = glsl_array_type(type, 32 /* MAX_PATCH_VERTICES */, 0);
out_type = glsl_array_type(type, vertices_per_patch, 0);
}
nir_variable *in = nir_variable_create(nir, nir_var_shader_in, in_type, var->name);
nir_variable *out = nir_variable_create(nir, nir_var_shader_out, out_type, buf);
out->data.location = in->data.location = var->data.location;
out->data.location_frac = in->data.location_frac = var->data.location_frac;
/* gl_in[] receives values from equivalent built-in output
variables written by the vertex shader (section 2.14.7). Each array
element of gl_in[] is a structure holding values for a specific vertex of
the input patch. The length of gl_in[] is equal to the
implementation-dependent maximum patch size (gl_MaxPatchVertices).
- ARB_tessellation_shader
*/
      /* we need to load the invocation-specific value of the vertex output and then store it to the matching element of the tcs output array */
nir_deref_instr *in_value = nir_build_deref_array(&b, nir_build_deref_var(&b, in), invocation_id);
nir_deref_instr *out_value = nir_build_deref_array(&b, nir_build_deref_var(&b, out), invocation_id);
copy_vars(&b, out_value, in_value);
}
nir_variable *gl_TessLevelInner = nir_variable_create(nir, nir_var_shader_out, glsl_array_type(glsl_float_type(), 2, 0), "gl_TessLevelInner");
gl_TessLevelInner->data.location = VARYING_SLOT_TESS_LEVEL_INNER;
gl_TessLevelInner->data.patch = 1;
nir_variable *gl_TessLevelOuter = nir_variable_create(nir, nir_var_shader_out, glsl_array_type(glsl_float_type(), 4, 0), "gl_TessLevelOuter");
gl_TessLevelOuter->data.location = VARYING_SLOT_TESS_LEVEL_OUTER;
gl_TessLevelOuter->data.patch = 1;
create_gfx_pushconst(nir);
nir_def *load_inner = nir_load_push_constant_zink(&b, 2, 32,
nir_imm_int(&b, ZINK_GFX_PUSHCONST_DEFAULT_INNER_LEVEL));
nir_def *load_outer = nir_load_push_constant_zink(&b, 4, 32,
nir_imm_int(&b, ZINK_GFX_PUSHCONST_DEFAULT_OUTER_LEVEL));
for (unsigned i = 0; i < 2; i++) {
nir_deref_instr *store_idx = nir_build_deref_array_imm(&b, nir_build_deref_var(&b, gl_TessLevelInner), i);
nir_store_deref(&b, store_idx, nir_channel(&b, load_inner, i), 0xff);
}
for (unsigned i = 0; i < 4; i++) {
nir_deref_instr *store_idx = nir_build_deref_array_imm(&b, nir_build_deref_var(&b, gl_TessLevelOuter), i);
nir_store_deref(&b, store_idx, nir_channel(&b, load_outer, i), 0xff);
}
nir->info.tess.tcs_vertices_out = vertices_per_patch;
nir_validate_shader(nir, "created");
optimize_nir(nir, NULL, true);
NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp, NULL);
NIR_PASS_V(nir, nir_convert_from_ssa, true);
*nir_ret = nir;
zink_shader_serialize_blob(nir, &ret->blob);
memcpy(&ret->info, &nir->info, sizeof(nir->info));
ret->non_fs.is_generated = true;
return ret;
}
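/* check whether any sampler uniform is a cube */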
bool
zink_shader_has_cubes(nir_shader *nir)
{
nir_foreach_variable_with_modes(var, nir, nir_var_uniform) {
const struct glsl_type *type = glsl_without_array(var->type);
if (glsl_type_is_sampler(type) && glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_CUBE)
return true;
}
return false;
}
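/* shaders are stored as serialized nir blobs; these helpers convert back to nir_shader form */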
nir_shader *
zink_shader_blob_deserialize(struct zink_screen *screen, struct blob *blob)
{
struct blob_reader blob_reader;
blob_reader_init(&blob_reader, blob->data, blob->size);
return nir_deserialize(NULL, &screen->nir_options, &blob_reader);
}
nir_shader *
zink_shader_deserialize(struct zink_screen *screen, struct zink_shader *zs)
{
return zink_shader_blob_deserialize(screen, &zs->blob);
}
void
zink_shader_serialize_blob(nir_shader *nir, struct blob *blob)
{
blob_init(blob);
#ifndef NDEBUG
bool strip = !(zink_debug & (ZINK_DEBUG_NIR | ZINK_DEBUG_SPIRV | ZINK_DEBUG_TGSI));
#else
bool strip = false;
#endif
nir_serialize(blob, nir, strip);
}
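/* debug helper: deserialize and print a shader's nir */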
void
zink_print_shader(struct zink_screen *screen, struct zink_shader *zs, FILE *fp)
{
nir_shader *nir = zink_shader_deserialize(screen, zs);
nir_print_shader(nir, fp);
ralloc_free(nir);
}