anv: Completely rework shader compilation

Now that we have a decent interface in upstream mesa, we can get rid of all
our hacks.  As of this commit, we no longer use any fake GL state objects
and all of shader compilation is moved into anv_pipeline.c.  This should
make way for actually implementing a shader cache one of these days.

As a nice side-benefit, this commit also gains us an extra 300 passing CTS
tests because we're actually filling out the texture swizzle information
for vertex shaders.
This commit is contained in:
Jason Ekstrand 2015-10-19 22:06:59 -07:00
parent 2d9e899e35
commit a71e614d33
6 changed files with 661 additions and 942 deletions

View File

@ -55,14 +55,10 @@ libvulkan_la_CFLAGS = \
-Wall -Wno-unused-parameter -fvisibility=hidden -O0 -g \
-Wstrict-prototypes -Wmissing-prototypes -Wno-override-init
libvulkan_la_CXXFLAGS = \
-Wall -Wno-unused-parameter -fvisibility=hidden -O0 -g
VULKAN_SOURCES = \
anv_allocator.c \
anv_cmd_buffer.c \
anv_batch_chain.c \
anv_compiler.cpp \
anv_device.c \
anv_dump.c \
anv_entrypoints.c \
@ -124,7 +120,7 @@ libvulkan_la_LIBADD = $(WAYLAND_LIBS) -lxcb -lxcb-dri3 \
$(top_builddir)/src/mesa/drivers/dri/i965/libi965_compiler.la \
../mesa/libmesa.la \
../mesa/drivers/dri/common/libdri_test_stubs.la \
-lpthread -ldl
-lpthread -ldl -lstdc++
# Libvulkan with dummy gem. Used for unit tests.
@ -133,7 +129,6 @@ libvulkan_test_la_SOURCES = \
anv_gem_stubs.c
libvulkan_test_la_CFLAGS = $(libvulkan_la_CFLAGS)
libvulkan_test_la_CXXFLAGS = $(libvulkan_la_CXXFLAGS)
libvulkan_test_la_LIBADD = $(libvulkan_la_LIBADD)
include $(top_srcdir)/install-lib-links.mk

View File

@ -1,891 +0,0 @@
/*
* Copyright © 2015 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <sys/stat.h>
#include <unistd.h>
#include <fcntl.h>
#include "anv_private.h"
#include "anv_nir.h"
#include <brw_context.h>
#include <brw_wm.h> /* brw_new_shader_program is here */
#include <brw_nir.h>
#include <brw_vs.h>
#include <brw_gs.h>
#include <brw_cs.h>
#include "brw_vec4_gs_visitor.h"
#include <brw_compiler.h>
#include <mesa/main/shaderobj.h>
#include <mesa/main/fbobject.h>
#include <mesa/main/context.h>
#include <mesa/program/program.h>
#include <glsl/program.h>
/* XXX: We need this to keep symbols in nir.h from conflicting with the
* generated GEN command packing headers. We need to fix *both* to not
* define something as generic as LOAD.
*/
#undef LOAD
#include <glsl/nir/nir_spirv.h>
#define SPIR_V_MAGIC_NUMBER 0x07230203
/* Abort the process with a formatted message on stderr when `cond` is
 * non-zero.  Used as a crude error path for compiler-setup failures that
 * have no way to propagate a VkResult.  Does nothing when `cond` is zero.
 */
static void
fail_if(int cond, const char *format, ...)
{
   if (!cond)
      return;

   va_list args;
   va_start(args, format);
   vfprintf(stderr, format, args);
   va_end(args);

   exit(1);
}
/* Lay out the binding table for one shader stage.
 *
 * The fragment stage reserves the first MAX_RTS entries for render targets,
 * so its texture/UBO/image sections start at that bias; all other stages
 * start at zero.  Always returns VK_SUCCESS (callers currently ignore the
 * result).
 */
static VkResult
set_binding_table_layout(struct brw_stage_prog_data *prog_data,
                         struct anv_pipeline *pipeline, uint32_t stage)
{
   const unsigned bias = (stage == VK_SHADER_STAGE_FRAGMENT) ? MAX_RTS : 0;

   prog_data->binding_table.size_bytes = 0;
   prog_data->binding_table.texture_start = bias;
   prog_data->binding_table.ubo_start = bias;
   prog_data->binding_table.image_start = bias;

   return VK_SUCCESS;
}
/* Copy a compiled shader kernel into the pipeline's program stream and
 * return its byte offset within that stream.
 */
static uint32_t
upload_kernel(struct anv_pipeline *pipeline, const void *data, size_t size)
{
   struct anv_state state =
      anv_state_stream_alloc(&pipeline->program_stream, size, 64);

   /* NOTE(review): this size check runs after the allocation above; it only
    * documents the assumption that a kernel always fits in one block.
    */
   assert(size < pipeline->program_stream.block_pool->block_size);

   memcpy(state.map, data, size);

   return state.offset;
}
/* Allocate the push-constant `param` array for one stage's prog_data.
 *
 * If the shader uses any push constants at all we reserve the maximum
 * possible count, plus room for dynamic buffer offsets when the pipeline
 * layout uses them.  The pointers stored in the array are never
 * dereferenced by the compiler, so they are encoded as byte offsets into
 * struct anv_push_constants instead of real addresses.
 */
static void
create_params_array(struct anv_pipeline *pipeline,
                    struct gl_shader *shader,
                    struct brw_stage_prog_data *prog_data)
{
   /* Robustness fix: callers (e.g. really_do_vs_prog) pass the linked
    * gl_shader, which is NULL when no shader program was provided.  Bail
    * out instead of dereferencing it.
    */
   if (shader == NULL)
      return;

   VkShaderStage stage = anv_vk_shader_stage_for_mesa_stage(shader->Stage);
   unsigned num_params = 0;

   if (shader->num_uniform_components) {
      /* If the shader uses any push constants at all, we'll just give
       * them the maximum possible number
       */
      num_params += MAX_PUSH_CONSTANTS_SIZE / sizeof(float);
   }

   if (pipeline->layout && pipeline->layout->stage[stage].has_dynamic_offsets)
      num_params += MAX_DYNAMIC_BUFFERS;

   if (num_params == 0)
      return;

   prog_data->param = (const gl_constant_value **)
      anv_device_alloc(pipeline->device,
                       num_params * sizeof(gl_constant_value *),
                       8, VK_SYSTEM_ALLOC_TYPE_INTERNAL_SHADER);

   /* We now set the param values to be offsets into a
    * anv_push_constant_data structure.  Since the compiler doesn't
    * actually dereference any of the gl_constant_value pointers in the
    * params array, it doesn't really matter what we put here.
    * (Offset-from-NULL is the usual offsetof-style idiom here.)
    */
   struct anv_push_constants *null_data = NULL;
   for (unsigned i = 0; i < num_params; i++)
      prog_data->param[i] =
         (const gl_constant_value *)&null_data->client_data[i * sizeof(float)];
}
/* Build the VS program key.  Currently the key is simply zeroed; the XXX
 * items below are known gaps carried over from the GL driver.
 */
static void
brw_vs_populate_key(struct brw_context *brw,
                    struct brw_vertex_program *vp,
                    struct brw_vs_prog_key *key)
{
   memset(key, 0, sizeof(*key));

   /* XXX: Handle vertex input work-arounds */

   /* XXX: Handle sampler_prog_key */
}
/* Compile the vertex shader for `pipeline` and upload the resulting kernel.
 *
 * Fills pipeline->vs_prog_data and sets exactly one of vs_simd8/vs_vec4 to
 * the uploaded kernel offset (the other gets NO_KERNEL), depending on the
 * dispatch mode chosen by the backend.  Returns false if code generation
 * fails.
 */
static bool
really_do_vs_prog(struct brw_context *brw,
                  struct gl_shader_program *prog,
                  struct brw_vertex_program *vp,
                  struct brw_vs_prog_key *key, struct anv_pipeline *pipeline)
{
   GLuint program_size;
   const GLuint *program;
   struct brw_vs_prog_data *prog_data = &pipeline->vs_prog_data;
   void *mem_ctx;
   struct gl_shader *vs = NULL;

   if (prog)
      vs = prog->_LinkedShaders[MESA_SHADER_VERTEX];

   memset(prog_data, 0, sizeof(*prog_data));

   mem_ctx = ralloc_context(NULL);

   /* NOTE(review): `vs` is guarded by `if (prog)` above but dereferenced
    * unconditionally below — this relies on callers always passing a
    * program with a linked vertex shader.  TODO confirm.
    */
   create_params_array(pipeline, vs, &prog_data->base.base);
   anv_nir_apply_dynamic_offsets(pipeline, vs->Program->nir,
                                 &prog_data->base.base);
   anv_nir_apply_pipeline_layout(vs->Program->nir, pipeline->layout);

   prog_data->inputs_read = vp->program.Base.InputsRead;

   brw_compute_vue_map(brw->intelScreen->devinfo,
                       &prog_data->base.vue_map,
                       vp->program.Base.OutputsWritten,
                       prog ? prog->SeparateShader : false);

   set_binding_table_layout(&prog_data->base.base, pipeline,
                            VK_SHADER_STAGE_VERTEX);

   /* Emit GEN4 code.
    */
   program = brw_compile_vs(brw->intelScreen->compiler, brw, mem_ctx,
                            key, prog_data, vs->Program->nir, NULL, false, -1,
                            &program_size, NULL);
   if (program == NULL) {
      ralloc_free(mem_ctx);
      return false;
   }

   /* Exactly one of the two VS kernel slots is populated. */
   const uint32_t offset = upload_kernel(pipeline, program, program_size);
   if (prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8) {
      pipeline->vs_simd8 = offset;
      pipeline->vs_vec4 = NO_KERNEL;
   } else {
      pipeline->vs_simd8 = NO_KERNEL;
      pipeline->vs_vec4 = offset;
   }

   ralloc_free(mem_ctx);

   return true;
}
/* Build the FS (WM) program key.
 *
 * Since there is no real GL context here, a gl_framebuffer is faked on the
 * stack and hung off ctx->DrawBuffer so the key-population logic inherited
 * from the GL driver can read it; the pointer is cleared again before
 * returning.
 *
 * Fix: the `draw_buffer._NumColorDrawBuffers = 1;` assignment was
 * duplicated; one copy removed.
 */
void brw_wm_populate_key(struct brw_context *brw,
                         struct brw_fragment_program *fp,
                         struct brw_wm_prog_key *key)
{
   struct gl_context *ctx = &brw->ctx;
   bool program_uses_dfdy = fp->program.UsesDFdy;
   struct gl_framebuffer draw_buffer;
   bool multisample_fbo;

   memset(key, 0, sizeof(*key));

   for (int i = 0; i < MAX_SAMPLERS; i++) {
      /* Assume color sampler, no swizzling. */
      key->tex.swizzles[i] = SWIZZLE_XYZW;
   }

   /* A non-zero framebuffer name indicates that the framebuffer was created by
    * the user rather than the window system. */
   draw_buffer.Name = 1;
   draw_buffer.Visual.samples = 1;
   draw_buffer._NumColorDrawBuffers = 1;
   draw_buffer.Width = 400;
   draw_buffer.Height = 400;
   ctx->DrawBuffer = &draw_buffer;

   multisample_fbo = ctx->DrawBuffer->Visual.samples > 1;

   /* _NEW_HINT */
   key->high_quality_derivatives =
      ctx->Hint.FragmentShaderDerivative == GL_NICEST;

   /* _NEW_FRAG_CLAMP | _NEW_BUFFERS */
   key->clamp_fragment_color = ctx->Color._ClampFragmentColor;

   /* _NEW_BUFFERS */
   /*
    * Include the draw buffer origin and height so that we can calculate
    * fragment position values relative to the bottom left of the drawable,
    * from the incoming screen origin relative position we get as part of our
    * payload.
    *
    * This is only needed for the WM_WPOSXY opcode when the fragment program
    * uses the gl_FragCoord input.
    *
    * We could avoid recompiling by including this as a constant referenced by
    * our program, but if we were to do that it would also be nice to handle
    * getting that constant updated at batchbuffer submit time (when we
    * hold the lock and know where the buffer really is) rather than at emit
    * time when we don't hold the lock and are just guessing.  We could also
    * just avoid using this as key data if the program doesn't use
    * fragment.position.
    *
    * For DRI2 the origin_x/y will always be (0,0) but we still need the
    * drawable height in order to invert the Y axis.
    */
   if (fp->program.Base.InputsRead & VARYING_BIT_POS) {
      key->drawable_height = ctx->DrawBuffer->Height;
   }

   if ((fp->program.Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
      key->render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
   }

   /* _NEW_BUFFERS */
   key->nr_color_regions = ctx->DrawBuffer->_NumColorDrawBuffers;

   /* _NEW_MULTISAMPLE, _NEW_COLOR, _NEW_BUFFERS */
   key->replicate_alpha = ctx->DrawBuffer->_NumColorDrawBuffers > 1 &&
      (ctx->Multisample.SampleAlphaToCoverage || ctx->Color.AlphaEnabled);

   /* _NEW_BUFFERS _NEW_MULTISAMPLE */
   /* Ignore sample qualifier while computing this flag. */
   key->persample_shading =
      _mesa_get_min_invocations_per_fragment(ctx, &fp->program, true) > 1;
   if (key->persample_shading)
      key->persample_2x = ctx->DrawBuffer->Visual.samples == 2;

   key->compute_pos_offset =
      _mesa_get_min_invocations_per_fragment(ctx, &fp->program, false) > 1 &&
      fp->program.Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_POS;

   key->compute_sample_id =
      multisample_fbo &&
      ctx->Multisample.Enabled &&
      (fp->program.Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_ID);

   /* The unique fragment program ID */
   key->program_string_id = fp->id;

   /* Drop the pointer to the stack framebuffer before returning. */
   ctx->DrawBuffer = NULL;
}
/* Compile the fragment shader for `pipeline` and upload the kernel(s).
 *
 * Fills pipeline->wm_prog_data and sets ps_simd8/ps_simd16 to uploaded
 * kernel offsets (or NO_KERNEL) based on the no_8/prog_offset_16 flags
 * produced by the backend.  Returns false if code generation fails.
 */
static bool
really_do_wm_prog(struct brw_context *brw,
                  struct gl_shader_program *prog,
                  struct brw_fragment_program *fp,
                  struct brw_wm_prog_key *key, struct anv_pipeline *pipeline)
{
   void *mem_ctx = ralloc_context(NULL);
   struct brw_wm_prog_data *prog_data = &pipeline->wm_prog_data;
   struct gl_shader *fs = NULL;
   unsigned int program_size;
   const uint32_t *program;

   if (prog)
      fs = prog->_LinkedShaders[MESA_SHADER_FRAGMENT];

   memset(prog_data, 0, sizeof(*prog_data));

   /* NOTE(review): `fs` may be NULL when `prog` is NULL yet is dereferenced
    * below — relies on callers always passing a linked fragment shader.
    */
   create_params_array(pipeline, fs, &prog_data->base);
   anv_nir_apply_dynamic_offsets(pipeline, fs->Program->nir, &prog_data->base);
   anv_nir_apply_pipeline_layout(fs->Program->nir, pipeline->layout);

   set_binding_table_layout(&prog_data->base, pipeline,
                            VK_SHADER_STAGE_FRAGMENT);

   /* This needs to come after shader time and pull constant entries, but we
    * don't have those set up now, so just put it after the layout entries.
    */
   prog_data->binding_table.render_target_start = 0;

   program = brw_compile_fs(brw->intelScreen->compiler, brw, mem_ctx, key,
                            prog_data, fp->program.Base.nir, fs->Program,
                            -1, -1, brw->use_rep_send, &program_size, NULL);
   if (program == NULL) {
      ralloc_free(mem_ctx);
      return false;
   }

   /* The SIMD8 and SIMD16 kernels share one upload; SIMD16 starts at
    * prog_offset_16 within it.
    */
   uint32_t offset = upload_kernel(pipeline, program, program_size);
   if (prog_data->no_8)
      pipeline->ps_simd8 = NO_KERNEL;
   else
      pipeline->ps_simd8 = offset;

   if (prog_data->no_8 || prog_data->prog_offset_16) {
      pipeline->ps_simd16 = offset + prog_data->prog_offset_16;
   } else {
      pipeline->ps_simd16 = NO_KERNEL;
   }

   ralloc_free(mem_ctx);

   return true;
}
/* Compile the compute shader for `pipeline` and upload the kernel.
 *
 * Unlike the VS/FS paths, a compute shader is required here (`prog` must
 * have a linked MESA_SHADER_COMPUTE stage).  Returns false if code
 * generation fails.
 */
static bool
brw_codegen_cs_prog(struct brw_context *brw,
                    struct gl_shader_program *prog,
                    struct brw_compute_program *cp,
                    struct brw_cs_prog_key *key, struct anv_pipeline *pipeline)
{
   const GLuint *program;
   void *mem_ctx = ralloc_context(NULL);
   GLuint program_size;
   struct brw_cs_prog_data *prog_data = &pipeline->cs_prog_data;

   struct gl_shader *cs = prog->_LinkedShaders[MESA_SHADER_COMPUTE];
   assert (cs);

   memset(prog_data, 0, sizeof(*prog_data));

   set_binding_table_layout(&prog_data->base, pipeline, VK_SHADER_STAGE_COMPUTE);

   create_params_array(pipeline, cs, &prog_data->base);
   anv_nir_apply_dynamic_offsets(pipeline, cs->Program->nir, &prog_data->base);
   anv_nir_apply_pipeline_layout(cs->Program->nir, pipeline->layout);

   program = brw_compile_cs(brw->intelScreen->compiler, brw, mem_ctx, key,
                            prog_data, cs->Program->nir, -1,
                            &program_size, NULL);
   if (program == NULL) {
      ralloc_free(mem_ctx);
      return false;
   }

   if (unlikely(INTEL_DEBUG & DEBUG_CS))
      fprintf(stderr, "\n");

   pipeline->cs_simd = upload_kernel(pipeline, program, program_size);

   ralloc_free(mem_ctx);

   return true;
}
/* Build the CS program key: zeroed except for the program's unique ID. */
static void
brw_cs_populate_key(struct brw_context *brw,
                    struct brw_compute_program *bcp, struct brw_cs_prog_key *key)
{
   memset(key, 0, sizeof(*key));

   /* The unique compute program ID */
   key->program_string_id = bcp->id;
}
/* Per-device compiler context: fake i965 screen/context objects that let us
 * reuse the GL driver's compilation paths without a real GL context.
 */
struct anv_compiler {
   struct anv_device *device;          /* owning Vulkan device */
   struct intel_screen *screen;        /* faked screen; ralloc'd off this */
   struct brw_context *brw;            /* faked brw context; ralloc'd off this */
   struct gl_pipeline_object pipeline; /* dummy; see anv_compiler_create() */
};
extern "C" {
/* Create the per-device compiler context.
 *
 * Allocates fake intel_screen and brw_context objects (ralloc'd off the
 * compiler so a single ralloc_free releases everything) and copies just
 * enough device limits into them for the backend compiler to run.
 * Returns NULL on allocation failure.
 */
struct anv_compiler *
anv_compiler_create(struct anv_device *device)
{
   const struct brw_device_info *devinfo = &device->info;
   struct anv_compiler *compiler;
   struct gl_context *ctx;

   compiler = rzalloc(NULL, struct anv_compiler);
   if (compiler == NULL)
      return NULL;

   compiler->screen = rzalloc(compiler, struct intel_screen);
   if (compiler->screen == NULL)
      goto fail;

   compiler->brw = rzalloc(compiler, struct brw_context);
   if (compiler->brw == NULL)
      goto fail;

   compiler->device = device;

   /* Mirror the device's generation flags into the fake brw_context. */
   compiler->brw->gen = devinfo->gen;
   compiler->brw->is_g4x = devinfo->is_g4x;
   compiler->brw->is_baytrail = devinfo->is_baytrail;
   compiler->brw->is_haswell = devinfo->is_haswell;
   compiler->brw->is_cherryview = devinfo->is_cherryview;

   /* We need this at least for CS, which will check brw->max_cs_threads
    * against the work group size. */
   compiler->brw->max_vs_threads = devinfo->max_vs_threads;
   compiler->brw->max_hs_threads = devinfo->max_hs_threads;
   compiler->brw->max_ds_threads = devinfo->max_ds_threads;
   compiler->brw->max_gs_threads = devinfo->max_gs_threads;
   compiler->brw->max_wm_threads = devinfo->max_wm_threads;
   compiler->brw->max_cs_threads = devinfo->max_cs_threads;
   compiler->brw->urb.size = devinfo->urb.size;
   compiler->brw->urb.min_vs_entries = devinfo->urb.min_vs_entries;
   compiler->brw->urb.max_vs_entries = devinfo->urb.max_vs_entries;
   compiler->brw->urb.max_hs_entries = devinfo->urb.max_hs_entries;
   compiler->brw->urb.max_ds_entries = devinfo->urb.max_ds_entries;
   compiler->brw->urb.max_gs_entries = devinfo->urb.max_gs_entries;

   compiler->brw->intelScreen = compiler->screen;
   compiler->screen->devinfo = &device->info;

   brw_process_intel_debug_variable();

   /* The real backend compiler is owned by the physical device; borrow it. */
   compiler->screen->compiler = device->instance->physicalDevice.compiler;

   ctx = &compiler->brw->ctx;

   _mesa_init_shader_object_functions(&ctx->Driver);

   /* brw_select_clip_planes() needs this for bogus reasons. */
   ctx->_Shader = &compiler->pipeline;

   return compiler;

 fail:
   /* ralloc parenting frees screen/brw along with the compiler. */
   ralloc_free(compiler);
   return NULL;
}
/* Tear down a compiler context created by anv_compiler_create().
 * Error-state data hangs off the fake gl_context, so free that first;
 * ralloc parenting then releases the screen and brw_context too.
 */
void
anv_compiler_destroy(struct anv_compiler *compiler)
{
   _mesa_free_errors_data(&compiler->brw->ctx);
   ralloc_free(compiler);
}
/* From gen7_urb.c */
/* FIXME: Add to struct intel_device_info */
static const int gen8_push_size = 32 * 1024;
/* Partition the URB between push constants, VS, and GS for this pipeline.
 *
 * Adapted from gen7_urb.c in the i965 driver.  Each present stage gets its
 * minimum allocation first; remaining chunks are handed out in proportion
 * to how much extra each stage could use.  Results land in pipeline->urb.
 *
 * Fix: a vertex shader compiled to the vec4 backend sets
 * vs_simd8 == NO_KERNEL and stores its kernel in vs_vec4 (see
 * really_do_vs_prog), so vs_present must consider both slots.
 */
static void
gen7_compute_urb_partition(struct anv_pipeline *pipeline)
{
   const struct brw_device_info *devinfo = &pipeline->device->info;
   bool vs_present = pipeline->vs_simd8 != NO_KERNEL ||
                     pipeline->vs_vec4 != NO_KERNEL;
   unsigned vs_size = vs_present ? pipeline->vs_prog_data.base.urb_entry_size : 1;
   unsigned vs_entry_size_bytes = vs_size * 64;
   bool gs_present = pipeline->gs_vec4 != NO_KERNEL;
   unsigned gs_size = gs_present ? pipeline->gs_prog_data.base.urb_entry_size : 1;
   unsigned gs_entry_size_bytes = gs_size * 64;

   /* From p35 of the Ivy Bridge PRM (section 1.7.1: 3DSTATE_URB_GS):
    *
    *     VS Number of URB Entries must be divisible by 8 if the VS URB Entry
    *     Allocation Size is less than 9 512-bit URB entries.
    *
    * Similar text exists for GS.
    */
   unsigned vs_granularity = (vs_size < 9) ? 8 : 1;
   unsigned gs_granularity = (gs_size < 9) ? 8 : 1;

   /* URB allocations must be done in 8k chunks. */
   unsigned chunk_size_bytes = 8192;

   /* Determine the size of the URB in chunks. */
   unsigned urb_chunks = devinfo->urb.size * 1024 / chunk_size_bytes;

   /* Reserve space for push constants */
   unsigned push_constant_bytes = gen8_push_size;
   unsigned push_constant_chunks =
      push_constant_bytes / chunk_size_bytes;

   /* Initially, assign each stage the minimum amount of URB space it needs,
    * and make a note of how much additional space it "wants" (the amount of
    * additional space it could actually make use of).
    */

   /* VS has a lower limit on the number of URB entries */
   unsigned vs_chunks =
      ALIGN(devinfo->urb.min_vs_entries * vs_entry_size_bytes,
            chunk_size_bytes) / chunk_size_bytes;
   unsigned vs_wants =
      ALIGN(devinfo->urb.max_vs_entries * vs_entry_size_bytes,
            chunk_size_bytes) / chunk_size_bytes - vs_chunks;

   unsigned gs_chunks = 0;
   unsigned gs_wants = 0;
   if (gs_present) {
      /* There are two constraints on the minimum amount of URB space we can
       * allocate:
       *
       * (1) We need room for at least 2 URB entries, since we always operate
       * the GS in DUAL_OBJECT mode.
       *
       * (2) We can't allocate less than nr_gs_entries_granularity.
       */
      gs_chunks = ALIGN(MAX2(gs_granularity, 2) * gs_entry_size_bytes,
                        chunk_size_bytes) / chunk_size_bytes;
      gs_wants =
         ALIGN(devinfo->urb.max_gs_entries * gs_entry_size_bytes,
               chunk_size_bytes) / chunk_size_bytes - gs_chunks;
   }

   /* There should always be enough URB space to satisfy the minimum
    * requirements of each stage.
    */
   unsigned total_needs = push_constant_chunks + vs_chunks + gs_chunks;
   assert(total_needs <= urb_chunks);

   /* Mete out remaining space (if any) in proportion to "wants". */
   unsigned total_wants = vs_wants + gs_wants;
   unsigned remaining_space = urb_chunks - total_needs;
   if (remaining_space > total_wants)
      remaining_space = total_wants;
   if (remaining_space > 0) {
      unsigned vs_additional = (unsigned)
         round(vs_wants * (((double) remaining_space) / total_wants));
      vs_chunks += vs_additional;
      remaining_space -= vs_additional;
      gs_chunks += remaining_space;
   }

   /* Sanity check that we haven't over-allocated. */
   assert(push_constant_chunks + vs_chunks + gs_chunks <= urb_chunks);

   /* Finally, compute the number of entries that can fit in the space
    * allocated to each stage.
    */
   unsigned nr_vs_entries = vs_chunks * chunk_size_bytes / vs_entry_size_bytes;
   unsigned nr_gs_entries = gs_chunks * chunk_size_bytes / gs_entry_size_bytes;

   /* Since we rounded up when computing *_wants, this may be slightly more
    * than the maximum allowed amount, so correct for that.
    */
   nr_vs_entries = MIN2(nr_vs_entries, devinfo->urb.max_vs_entries);
   nr_gs_entries = MIN2(nr_gs_entries, devinfo->urb.max_gs_entries);

   /* Ensure that we program a multiple of the granularity. */
   nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, vs_granularity);
   nr_gs_entries = ROUND_DOWN_TO(nr_gs_entries, gs_granularity);

   /* Finally, sanity check to make sure we have at least the minimum number
    * of entries needed for each stage.
    */
   assert(nr_vs_entries >= devinfo->urb.min_vs_entries);
   if (gs_present)
      assert(nr_gs_entries >= 2);

   /* Lay out the URB in the following order:
    * - push constants
    * - VS
    * - GS
    */
   pipeline->urb.vs_start = push_constant_chunks;
   pipeline->urb.vs_size = vs_size;
   pipeline->urb.nr_vs_entries = nr_vs_entries;

   pipeline->urb.gs_start = push_constant_chunks + vs_chunks;
   pipeline->urb.gs_size = gs_size;
   pipeline->urb.nr_gs_entries = nr_gs_entries;
}
/* Per-stage lookup table, indexed by VkShaderStage: GL shader-type token,
 * mesa gl_shader_stage, and a human-readable name for error messages.
 * Tessellation stages are unsupported; their mesa stage is -1.
 */
static const struct {
   uint32_t token;
   gl_shader_stage stage;
   const char *name;
} stage_info[] = {
   { GL_VERTEX_SHADER, MESA_SHADER_VERTEX, "vertex" },
   { GL_TESS_CONTROL_SHADER, (gl_shader_stage)-1, "tess control" },
   { GL_TESS_EVALUATION_SHADER, (gl_shader_stage)-1, "tess evaluation" },
   { GL_GEOMETRY_SHADER, MESA_SHADER_GEOMETRY, "geometry" },
   { GL_FRAGMENT_SHADER, MESA_SHADER_FRAGMENT, "fragment" },
   { GL_COMPUTE_SHADER, MESA_SHADER_COMPUTE, "compute" },
};
/* First three words of a SPIR-V binary, as laid out by the spec. */
struct spirv_header{
   uint32_t magic;     /* expected to equal SPIR_V_MAGIC_NUMBER */
   uint32_t version;
   uint32_t gen_magic; /* generator's magic number */
};
/* Derive the gl_program / nir_shader_info IO bitfields from the NIR
 * variable lists, since we never ran the GLSL linker that normally fills
 * them in.  For fragment shaders this also records per-input interpolation
 * qualifiers and centroid/sample flags.
 */
static void
setup_nir_io(struct gl_shader *mesa_shader,
             nir_shader *shader)
{
   struct gl_program *prog = mesa_shader->Program;

   foreach_list_typed(nir_variable, var, node, &shader->inputs) {
      prog->InputsRead |= BITFIELD64_BIT(var->data.location);
      if (shader->stage == MESA_SHADER_FRAGMENT) {
         struct gl_fragment_program *fprog = (struct gl_fragment_program *)prog;
         fprog->InterpQualifier[var->data.location] =
            (glsl_interp_qualifier)var->data.interpolation;
         if (var->data.centroid)
            fprog->IsCentroid |= BITFIELD64_BIT(var->data.location);
         if (var->data.sample)
            fprog->IsSample |= BITFIELD64_BIT(var->data.location);
      }
   }

   foreach_list_typed(nir_variable, var, node, &shader->outputs) {
      prog->OutputsWritten |= BITFIELD64_BIT(var->data.location);
   }

   /* Rebuild system_values_read from scratch rather than OR-ing in. */
   shader->info.system_values_read = 0;
   foreach_list_typed(nir_variable, var, node, &shader->system_values) {
      shader->info.system_values_read |= BITFIELD64_BIT(var->data.location);
   }

   shader->info.inputs_read = prog->InputsRead;
   shader->info.outputs_written = prog->OutputsWritten;
}
/* Translate one pipeline shader (SPIR-V or pre-built NIR) into a mesa
 * gl_shader with an attached NIR program, run the brw NIR passes on it,
 * and append it to `program`.  Aborts via fail_if() on failure.
 *
 * Fix: the NULL-NIR check previously ran *after* nir_validate_shader() and
 * the brw passes had already dereferenced the shader; it now runs
 * immediately after translation so a failure is reported instead of
 * crashing.
 */
static void
anv_compile_shader_spirv(struct anv_compiler *compiler,
                         struct gl_shader_program *program,
                         struct anv_pipeline *pipeline, uint32_t stage)
{
   struct brw_context *brw = compiler->brw;
   struct anv_shader *shader = pipeline->shaders[stage];
   struct gl_shader *mesa_shader;
   int name = 0;

   mesa_shader = brw_new_shader(&brw->ctx, name, stage_info[stage].token);
   fail_if(mesa_shader == NULL,
           "failed to create %s shader\n", stage_info[stage].name);

#define CREATE_PROGRAM(stage) \
   &ralloc(mesa_shader, struct brw_##stage##_program)->program.Base

   bool is_scalar;
   struct gl_program *prog;
   switch (stage) {
   case VK_SHADER_STAGE_VERTEX:
      prog = CREATE_PROGRAM(vertex);
      is_scalar = compiler->screen->compiler->scalar_vs;
      break;
   case VK_SHADER_STAGE_GEOMETRY:
      prog = CREATE_PROGRAM(geometry);
      is_scalar = false;
      break;
   case VK_SHADER_STAGE_FRAGMENT:
      prog = CREATE_PROGRAM(fragment);
      is_scalar = true;
      break;
   case VK_SHADER_STAGE_COMPUTE:
      prog = CREATE_PROGRAM(compute);
      is_scalar = true;
      break;
   default:
      unreachable("Unsupported shader stage");
   }
   _mesa_init_gl_program(prog, 0, 0);
   _mesa_reference_program(&brw->ctx, &mesa_shader->Program, prog);

   mesa_shader->Program->Parameters =
      rzalloc(mesa_shader, struct gl_program_parameter_list);

   mesa_shader->Type = stage_info[stage].token;
   mesa_shader->Stage = stage_info[stage].stage;

   struct gl_shader_compiler_options *glsl_options =
      &compiler->screen->compiler->glsl_compiler_options[stage_info[stage].stage];

   if (shader->module->nir) {
      /* Some things such as our meta clear/blit code will give us a NIR
       * shader directly.  In that case, we just ignore the SPIR-V entirely
       * and just use the NIR shader */
      mesa_shader->Program->nir = shader->module->nir;
      mesa_shader->Program->nir->options = glsl_options->NirOptions;
   } else {
      uint32_t *spirv = (uint32_t *) shader->module->data;
      assert(spirv[0] == SPIR_V_MAGIC_NUMBER);
      assert(shader->module->size % 4 == 0);

      mesa_shader->Program->nir =
         spirv_to_nir(spirv, shader->module->size / 4,
                      stage_info[stage].stage, glsl_options->NirOptions);
   }

   /* Report translation failure before the NIR is used anywhere. */
   fail_if(mesa_shader->Program->nir == NULL,
           "failed to translate SPIR-V to NIR\n");

   nir_validate_shader(mesa_shader->Program->nir);

   brw_preprocess_nir(mesa_shader->Program->nir,
                      compiler->screen->devinfo, is_scalar);
   setup_nir_io(mesa_shader, mesa_shader->Program->nir);
   brw_postprocess_nir(mesa_shader->Program->nir,
                       compiler->screen->devinfo, is_scalar);

   mesa_shader->num_uniform_components =
      mesa_shader->Program->nir->num_uniforms;

   _mesa_reference_shader(&brw->ctx, &program->Shaders[program->NumShaders],
                          mesa_shader);
   program->NumShaders++;
}
/* Record a successfully compiled stage on the pipeline: stash its prog_data
 * pointer, mark the stage active, and reserve scratch space for it.
 * Scratch is laid out sequentially, with each stage's region starting at a
 * 1 KiB-aligned offset and sized as per-thread scratch times the device's
 * maximum thread count for that stage.
 */
static void
add_compiled_stage(struct anv_pipeline *pipeline, uint32_t stage,
                   struct brw_stage_prog_data *prog_data)
{
   struct brw_device_info *devinfo = &pipeline->device->info;
   uint32_t max_threads[] = {
      [VK_SHADER_STAGE_VERTEX]                  = devinfo->max_vs_threads,
      [VK_SHADER_STAGE_TESS_CONTROL]            = 0,
      [VK_SHADER_STAGE_TESS_EVALUATION]         = 0,
      [VK_SHADER_STAGE_GEOMETRY]                = devinfo->max_gs_threads,
      [VK_SHADER_STAGE_FRAGMENT]                = devinfo->max_wm_threads,
      [VK_SHADER_STAGE_COMPUTE]                 = devinfo->max_cs_threads,
   };

   pipeline->prog_data[stage] = prog_data;
   pipeline->active_stages |= 1 << stage;
   pipeline->scratch_start[stage] = pipeline->total_scratch;
   pipeline->total_scratch =
      align_u32(pipeline->total_scratch, 1024) +
      prog_data->total_scratch * max_threads[stage];
}
/* Compile every shader attached to `pipeline`, upload the kernels, size the
 * device scratch pool, and partition the URB.  Aborts via fail_if() on any
 * compilation failure; returns 0 on success.
 *
 * Fixes:
 *  - OutputsWritten is a bitfield, so the point-size check must test
 *    VARYING_BIT_PSIZ; testing the slot index VARYING_SLOT_PSIZ actually
 *    examined the POS bit.
 *  - The VS failure message wrongly said "do_wm_prog failed".
 */
int
anv_compiler_run(struct anv_compiler *compiler, struct anv_pipeline *pipeline)
{
   struct gl_shader_program *program;
   int name = 0;
   struct brw_context *brw = compiler->brw;

   pipeline->writes_point_size = false;

   /* When we free the pipeline, we detect stages based on the NULL status
    * of various prog_data pointers.  Make them NULL by default.
    */
   memset(pipeline->prog_data, 0, sizeof(pipeline->prog_data));
   memset(pipeline->scratch_start, 0, sizeof(pipeline->scratch_start));

   brw->use_rep_send = pipeline->use_repclear;
   brw->no_simd8 = pipeline->use_repclear;

   program = _mesa_new_shader_program(name);
   program->Shaders = (struct gl_shader **)
      calloc(VK_SHADER_STAGE_NUM, sizeof(struct gl_shader *));
   fail_if(program == NULL || program->Shaders == NULL,
           "failed to create program\n");

   /* Translate each provided shader to NIR, then link by stage. */
   for (unsigned i = 0; i < VK_SHADER_STAGE_NUM; i++) {
      if (pipeline->shaders[i])
         anv_compile_shader_spirv(compiler, program, pipeline, i);
   }

   for (unsigned i = 0; i < program->NumShaders; i++) {
      struct gl_shader *shader = program->Shaders[i];
      program->_LinkedShaders[shader->Stage] = shader;
   }

   bool success;
   pipeline->active_stages = 0;
   pipeline->total_scratch = 0;

   if (pipeline->shaders[VK_SHADER_STAGE_VERTEX]) {
      struct brw_vs_prog_key vs_key;
      struct gl_vertex_program *vp = (struct gl_vertex_program *)
         program->_LinkedShaders[MESA_SHADER_VERTEX]->Program;
      struct brw_vertex_program *bvp = brw_vertex_program(vp);

      brw_vs_populate_key(brw, bvp, &vs_key);

      success = really_do_vs_prog(brw, program, bvp, &vs_key, pipeline);
      fail_if(!success, "do_vs_prog failed\n");
      add_compiled_stage(pipeline, VK_SHADER_STAGE_VERTEX,
                         &pipeline->vs_prog_data.base.base);

      /* Test the PSIZ *bit* of the bitfield, not the slot index. */
      if (vp->Base.OutputsWritten & VARYING_BIT_PSIZ)
         pipeline->writes_point_size = true;
   } else {
      memset(&pipeline->vs_prog_data, 0, sizeof(pipeline->vs_prog_data));
      pipeline->vs_simd8 = NO_KERNEL;
      pipeline->vs_vec4 = NO_KERNEL;
   }

   /* Geometry shaders not yet supported */
   anv_assert(pipeline->shaders[VK_SHADER_STAGE_GEOMETRY] == NULL);
   pipeline->gs_vec4 = NO_KERNEL;

   if (pipeline->shaders[VK_SHADER_STAGE_FRAGMENT]) {
      struct brw_wm_prog_key wm_key;
      struct gl_fragment_program *fp = (struct gl_fragment_program *)
         program->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
      struct brw_fragment_program *bfp = brw_fragment_program(fp);

      brw_wm_populate_key(brw, bfp, &wm_key);

      success = really_do_wm_prog(brw, program, bfp, &wm_key, pipeline);
      fail_if(!success, "do_wm_prog failed\n");
      add_compiled_stage(pipeline, VK_SHADER_STAGE_FRAGMENT,
                         &pipeline->wm_prog_data.base);
   }

   if (pipeline->shaders[VK_SHADER_STAGE_COMPUTE]) {
      struct brw_cs_prog_key cs_key;
      struct gl_compute_program *cp = (struct gl_compute_program *)
         program->_LinkedShaders[MESA_SHADER_COMPUTE]->Program;
      struct brw_compute_program *bcp = brw_compute_program(cp);

      brw_cs_populate_key(brw, bcp, &cs_key);

      success = brw_codegen_cs_prog(brw, program, bcp, &cs_key, pipeline);
      fail_if(!success, "brw_codegen_cs_prog failed\n");
      add_compiled_stage(pipeline, VK_SHADER_STAGE_COMPUTE,
                         &pipeline->cs_prog_data.base);
   }

   _mesa_delete_shader_program(&brw->ctx, program);

   /* Grow the device's scratch pool until it covers this pipeline's needs. */
   struct anv_device *device = compiler->device;
   while (device->scratch_block_pool.bo.size < pipeline->total_scratch)
      anv_block_pool_alloc(&device->scratch_block_pool);

   gen7_compute_urb_partition(pipeline);

   return 0;
}
/* This badly named function frees the struct anv_pipeline data that the compiler
* allocates. Currently just the prog_data structs.
*/
void
anv_compiler_free(struct anv_pipeline *pipeline)
{
for (uint32_t stage = 0; stage < VK_SHADER_STAGE_NUM; stage++) {
if (pipeline->prog_data[stage]) {
/* We only ever set up the params array because we don't do
* non-UBO pull constants
*/
anv_device_free(pipeline->device, pipeline->prog_data[stage]->param);
}
}
}
}

View File

@ -33,6 +33,22 @@
struct anv_dispatch_table dtable;
/* Intentionally-empty sink for brw_compiler shader debug messages. */
static void
compiler_debug_log(void *data, const char *fmt, ...)
{ }
/* brw_compiler perf-log callback: forwards the message to stderr, but only
 * when INTEL_DEBUG=perf is set.  The va_list is always started/ended so the
 * varargs are consumed cleanly either way.
 */
static void
compiler_perf_log(void *data, const char *fmt, ...)
{
   va_list args;
   va_start(args, fmt);

   if (unlikely(INTEL_DEBUG & DEBUG_PERF))
      vfprintf(stderr, fmt, args);

   va_end(args);
}
static VkResult
anv_physical_device_init(struct anv_physical_device *device,
struct anv_instance *instance,
@ -91,11 +107,15 @@ anv_physical_device_init(struct anv_physical_device *device,
close(fd);
brw_process_intel_debug_variable();
device->compiler = brw_compiler_create(NULL, device->info);
if (device->compiler == NULL) {
result = vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
goto fail;
}
device->compiler->shader_debug_log = compiler_debug_log;
device->compiler->shader_perf_log = compiler_perf_log;
return VK_SUCCESS;
@ -146,7 +166,6 @@ static const VkExtensionProperties device_extensions[] = {
},
};
VkResult anv_CreateInstance(
const VkInstanceCreateInfo* pCreateInfo,
VkInstance* pInstance)
@ -633,8 +652,6 @@ VkResult anv_CreateDevice(
device->info = *physical_device->info;
device->compiler = anv_compiler_create(device);
anv_queue_init(device, &device->queue);
anv_device_init_meta(device);
@ -658,8 +675,6 @@ void anv_DestroyDevice(
{
ANV_FROM_HANDLE(anv_device, device, _device);
anv_compiler_destroy(device->compiler);
anv_queue_finish(&device->queue);
anv_device_finish_meta(device);

View File

@ -28,6 +28,12 @@
#include <fcntl.h>
#include "anv_private.h"
#include "brw_nir.h"
#include "anv_nir.h"
#include "glsl/nir/nir_spirv.h"
/* Needed for SWIZZLE macros */
#include "program/prog_instruction.h"
// Shader functions
@ -81,16 +87,12 @@ VkResult anv_CreateShader(
const char *name = pCreateInfo->pName ? pCreateInfo->pName : "main";
size_t name_len = strlen(name);
if (strcmp(name, "main") != 0) {
anv_finishme("Multiple shaders per module not really supported");
}
shader = anv_device_alloc(device, sizeof(*shader) + name_len + 1, 8,
VK_SYSTEM_ALLOC_TYPE_API_OBJECT);
if (shader == NULL)
return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
shader->module = module;
shader->module = module,
memcpy(shader->entrypoint, name, name_len + 1);
*pShader = anv_shader_to_handle(shader);
@ -108,6 +110,86 @@ void anv_DestroyShader(
anv_device_free(device, shader);
}
#define SPIR_V_MAGIC_NUMBER 0x07230203
/* Map VkShaderStage to mesa's gl_shader_stage.  Tessellation stages are not
 * supported and map to -1; callers must not index with them.
 */
static const gl_shader_stage vk_shader_stage_to_mesa_stage[] = {
   [VK_SHADER_STAGE_VERTEX] = MESA_SHADER_VERTEX,
   [VK_SHADER_STAGE_TESS_CONTROL] = -1,
   [VK_SHADER_STAGE_TESS_EVALUATION] = -1,
   [VK_SHADER_STAGE_GEOMETRY] = MESA_SHADER_GEOMETRY,
   [VK_SHADER_STAGE_FRAGMENT] = MESA_SHADER_FRAGMENT,
   [VK_SHADER_STAGE_COMPUTE] = MESA_SHADER_COMPUTE,
};
/* Report whether the backend compiles this stage with the scalar (FS-style)
 * backend.  FS and CS are always scalar, GS is always vec4, and VS follows
 * the compiler's scalar_vs setting.
 */
static bool
is_scalar_shader_stage(const struct brw_compiler *compiler, VkShaderStage stage)
{
   if (stage == VK_SHADER_STAGE_VERTEX)
      return compiler->scalar_vs;

   if (stage == VK_SHADER_STAGE_GEOMETRY)
      return false;

   if (stage == VK_SHADER_STAGE_FRAGMENT ||
       stage == VK_SHADER_STAGE_COMPUTE)
      return true;

   unreachable("Unsupported shader stage");
}
/* Eventually, this will become part of anv_CreateShader.  Unfortunately,
 * we can't do that yet because we don't have the ability to copy nir.
 *
 * Turns an anv_shader (SPIR-V module or pre-built NIR) into a validated,
 * brw-preprocessed nir_shader with its entrypoint's info gathered.
 */
static nir_shader *
anv_shader_compile_to_nir(struct anv_device *device,
                          struct anv_shader *shader, VkShaderStage vk_stage)
{
   if (strcmp(shader->entrypoint, "main") != 0) {
      anv_finishme("Multiple shaders per module not really supported");
   }

   gl_shader_stage stage = vk_shader_stage_to_mesa_stage[vk_stage];
   const struct brw_compiler *compiler =
      device->instance->physicalDevice.compiler;
   const nir_shader_compiler_options *nir_options =
      compiler->glsl_compiler_options[stage].NirOptions;

   nir_shader *nir;
   if (shader->module->nir) {
      /* Some things such as our meta clear/blit code will give us a NIR
       * shader directly.  In that case, we just ignore the SPIR-V entirely
       * and just use the NIR shader */
      nir = shader->module->nir;
      nir->options = nir_options;
   } else {
      uint32_t *spirv = (uint32_t *) shader->module->data;
      assert(spirv[0] == SPIR_V_MAGIC_NUMBER);
      assert(shader->module->size % 4 == 0);

      nir = spirv_to_nir(spirv, shader->module->size / 4, stage, nir_options);
   }
   nir_validate_shader(nir);

   /* Make sure the provided shader has exactly one entrypoint and that the
    * name matches the name that came in from the VkShader.
    */
   nir_function_impl *entrypoint = NULL;
   nir_foreach_overload(nir, overload) {
      if (strcmp(shader->entrypoint, overload->function->name) == 0 &&
          overload->impl) {
         assert(entrypoint == NULL);
         entrypoint = overload->impl;
      }
   }
   assert(entrypoint != NULL);

   brw_preprocess_nir(nir, &device->info,
                      is_scalar_shader_stage(compiler, vk_stage));

   nir_shader_gather_info(nir, entrypoint);

   return nir;
}
VkResult anv_CreatePipelineCache(
VkDevice device,
@ -156,7 +238,6 @@ void anv_DestroyPipeline(
ANV_FROM_HANDLE(anv_device, device, _device);
ANV_FROM_HANDLE(anv_pipeline, pipeline, _pipeline);
anv_compiler_free(pipeline);
anv_reloc_list_finish(&pipeline->batch_relocs, pipeline->device);
anv_state_stream_finish(&pipeline->program_stream);
anv_state_pool_free(&device->dynamic_state_pool, pipeline->blend_state);
@ -177,6 +258,506 @@ static const uint32_t vk_to_gen_primitive_type[] = {
[VK_PRIMITIVE_TOPOLOGY_PATCH] = _3DPRIM_PATCHLIST_1
};
/* Initialize the texture/sampler portion of a program key.  Every sampler
 * slot gets an identity swizzle, which is only correct on gens where the
 * hardware handles texture swizzle itself (BDW+).
 */
static void
populate_sampler_prog_key(const struct brw_device_info *devinfo,
                          struct brw_sampler_prog_key_data *key)
{
   /* XXX: Handle texture swizzle on HSW- */
   unsigned s = 0;
   while (s < MAX_SAMPLERS) {
      /* Assume color sampler, no swizzling. (Works for BDW+) */
      key->swizzles[s] = SWIZZLE_XYZW;
      s++;
   }
}
/* Fill out the vertex-shader program key.  Currently only sampler state is
 * populated; the XXX items below are known gaps.
 */
static void
populate_vs_prog_key(const struct brw_device_info *devinfo,
                     struct brw_vs_prog_key *key)
{
   memset(key, 0, sizeof(*key));

   populate_sampler_prog_key(devinfo, &key->tex);

   /* XXX: Handle vertex input work-arounds */

   /* XXX: Handle sampler_prog_key */
}
/* Fill out the fragment-shader ("WM") program key from the graphics
 * pipeline create info and the targeted render-pass subpass.
 */
static void
populate_wm_prog_key(const struct brw_device_info *devinfo,
                     const VkGraphicsPipelineCreateInfo *info,
                     struct brw_wm_prog_key *key)
{
   ANV_FROM_HANDLE(anv_render_pass, render_pass, info->renderPass);

   memset(key, 0, sizeof(*key));
   populate_sampler_prog_key(devinfo, &key->tex);

   /* Vulkan doesn't specify a default */
   key->high_quality_derivatives = false;

   /* XXX Vulkan doesn't appear to specify */
   key->clamp_fragment_color = false;

   /* XXX: These are needed for flipping the coordinates. Do we need to do
    * this in Vulkan?
    */
   key->drawable_height = 0;
   key->render_to_fbo = true; /* XXX really? */

   /* One color region per color attachment of the targeted subpass */
   key->nr_color_regions = render_pass->subpasses[info->subpass].color_count;

   /* NOTE(review): pColorBlendState is dereferenced unconditionally here
    * while pMultisampleState is NULL-checked below — confirm callers always
    * provide a color-blend state for these pipelines.
    */
   key->replicate_alpha = key->nr_color_regions > 1 &&
                          info->pColorBlendState->alphaToCoverageEnable;

   if (info->pMultisampleState && info->pMultisampleState->rasterSamples > 1) {
      /* We should probably pull this out of the shader, but it's fairly
       * harmless to compute it and then let dead-code take care of it.
       */
      key->compute_sample_id = true;

      key->persample_shading = info->pMultisampleState->sampleShadingEnable;
      if (key->persample_shading)
         key->persample_2x = info->pMultisampleState->rasterSamples == 2;

      key->compute_pos_offset = info->pMultisampleState->sampleShadingEnable;
      /* NOTE(review): this overwrites the unconditional
       * `key->compute_sample_id = true` above, making that store dead —
       * confirm which value is actually intended.
       */
      key->compute_sample_id = info->pMultisampleState->sampleShadingEnable;
   }
}
/* Fill out the compute-shader program key; nothing beyond sampler state is
 * needed at present.
 */
static void
populate_cs_prog_key(const struct brw_device_info *devinfo,
                     struct brw_cs_prog_key *key)
{
   memset(key, 0, sizeof(*key));

   populate_sampler_prog_key(devinfo, &key->tex);
}
/* Shared front end for every per-stage compile: lower the shader module to
 * NIR, size and fill the push-constant "param" array, apply the pipeline
 * layout so the backend sees final binding-table indices, and run the
 * key-independent backend NIR passes.
 *
 * Returns the (possibly module-owned — see anv_shader_compile_to_nir) NIR
 * shader, or NULL on failure.
 */
static nir_shader *
anv_pipeline_compile(struct anv_pipeline *pipeline,
                     struct anv_shader *shader,
                     VkShaderStage stage,
                     struct brw_stage_prog_data *prog_data)
{
   const struct brw_compiler *compiler =
      pipeline->device->instance->physicalDevice.compiler;

   nir_shader *nir = anv_shader_compile_to_nir(pipeline->device, shader, stage);
   if (nir == NULL)
      return NULL;

   /* Any non-sampler uniform means the shader consumes push constants */
   bool have_push_constants = false;
   nir_foreach_variable(var, &nir->uniforms) {
      if (!glsl_type_is_sampler(var->type)) {
         have_push_constants = true;
         break;
      }
   }

   /* Figure out the number of parameters */
   prog_data->nr_params = 0;

   if (have_push_constants) {
      /* If the shader uses any push constants at all, we'll just give
       * them the maximum possible number
       */
      prog_data->nr_params += MAX_PUSH_CONSTANTS_SIZE / sizeof(float);
   }

   if (pipeline->layout && pipeline->layout->stage[stage].has_dynamic_offsets)
      prog_data->nr_params += MAX_DYNAMIC_BUFFERS;

   if (prog_data->nr_params > 0) {
      /* NOTE(review): the allocation result is not checked for NULL before
       * the writes below. */
      prog_data->param = (const gl_constant_value **)
         anv_device_alloc(pipeline->device,
                          prog_data->nr_params * sizeof(gl_constant_value *),
                          8, VK_SYSTEM_ALLOC_TYPE_INTERNAL_SHADER);

      /* We now set the param values to be offsets into a
       * anv_push_constant_data structure.  Since the compiler doesn't
       * actually dereference any of the gl_constant_value pointers in the
       * params array, it doesn't really matter what we put here.
       */
      struct anv_push_constants *null_data = NULL;
      if (have_push_constants) {
         /* Fill out the push constants section of the param array */
         for (unsigned i = 0; i < MAX_PUSH_CONSTANTS_SIZE / sizeof(float); i++)
            prog_data->param[i] = (const gl_constant_value *)
               &null_data->client_data[i * sizeof(float)];
      }
   }

   /* Set up dynamic offsets */
   anv_nir_apply_dynamic_offsets(pipeline, nir, prog_data);

   /* Apply the actual pipeline layout to UBOs, SSBOs, and textures */
   anv_nir_apply_pipeline_layout(nir, pipeline->layout);

   /* All binding table offsets provided by apply_pipeline_layout() are
    * relative to the start of the binding table, biased by MAX_RTS for the
    * fragment stage so its render targets can occupy the first entries.
    */
   unsigned bias = stage == VK_SHADER_STAGE_FRAGMENT ? MAX_RTS : 0;
   prog_data->binding_table.size_bytes = 0;
   prog_data->binding_table.texture_start = bias;
   prog_data->binding_table.ubo_start = bias;
   prog_data->binding_table.image_start = bias;

   /* Finish the optimization and compilation process */
   brw_postprocess_nir(nir, &pipeline->device->info,
                       is_scalar_shader_stage(compiler, stage));

   /* nir_lower_io will only handle the push constants; we need to set this
    * to the full number of possible uniforms.
    */
   nir->num_uniforms = prog_data->nr_params;

   return nir;
}
/* Copy compiled shader code into the pipeline's program stream and return
 * its offset, used as a kernel start pointer in pipeline state packets.
 *
 * Fix: validate the size *before* allocating and copying.  The original
 * code ran the assert only after the memcpy, by which point an oversized
 * kernel would already have overrun the stream's block.
 */
static uint32_t
anv_pipeline_upload_kernel(struct anv_pipeline *pipeline,
                           const void *data, size_t size)
{
   assert(size < pipeline->program_stream.block_pool->block_size);

   struct anv_state state =
      anv_state_stream_alloc(&pipeline->program_stream, size, 64);

   memcpy(state.map, data, size);

   return state.offset;
}
/* Record a successfully-compiled stage on the pipeline: stash its
 * prog_data pointer, mark the stage active, and reserve scratch space for
 * it at the end of the pipeline's scratch region.
 */
static void
anv_pipeline_add_compiled_stage(struct anv_pipeline *pipeline,
                                VkShaderStage stage,
                                struct brw_stage_prog_data *prog_data)
{
   struct brw_device_info *devinfo = &pipeline->device->info;
   /* Maximum number of HW threads that may run this stage at once; scratch
    * must be sized per-thread times this count.  Tessellation stages are
    * not supported yet, hence 0. */
   uint32_t max_threads[] = {
      [VK_SHADER_STAGE_VERTEX] = devinfo->max_vs_threads,
      [VK_SHADER_STAGE_TESS_CONTROL] = 0,
      [VK_SHADER_STAGE_TESS_EVALUATION] = 0,
      [VK_SHADER_STAGE_GEOMETRY] = devinfo->max_gs_threads,
      [VK_SHADER_STAGE_FRAGMENT] = devinfo->max_wm_threads,
      [VK_SHADER_STAGE_COMPUTE] = devinfo->max_cs_threads,
   };

   pipeline->prog_data[stage] = prog_data;
   pipeline->active_stages |= 1 << stage;
   /* This stage's scratch starts where the previous stage's ended; the
    * running total is 1 KiB-aligned before adding this stage's needs. */
   pipeline->scratch_start[stage] = pipeline->total_scratch;
   pipeline->total_scratch =
      align_u32(pipeline->total_scratch, 1024) +
      prog_data->total_scratch * max_threads[stage];
}
/* Compile the pipeline's vertex shader and upload the resulting kernel.
 *
 * Bug fix: writes_point_size previously tested
 * `outputs_written & VARYING_SLOT_PSIZ`, masking the bitfield with the
 * slot *index* (a small integer) rather than with the slot's bit, so it
 * reported whether unrelated low varying slots were written.  Mask with
 * (1ull << VARYING_SLOT_PSIZ) instead.
 */
static VkResult
anv_pipeline_compile_vs(struct anv_pipeline *pipeline,
                        const VkGraphicsPipelineCreateInfo *info,
                        struct anv_shader *shader)
{
   const struct brw_compiler *compiler =
      pipeline->device->instance->physicalDevice.compiler;
   struct brw_vs_prog_data *prog_data = &pipeline->vs_prog_data;
   struct brw_vs_prog_key key;

   populate_vs_prog_key(&pipeline->device->info, &key);

   /* TODO: Look up shader in cache */

   memset(prog_data, 0, sizeof(*prog_data));

   nir_shader *nir = anv_pipeline_compile(pipeline, shader,
                                          VK_SHADER_STAGE_VERTEX,
                                          &prog_data->base.base);
   if (nir == NULL)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   /* NIR produced from SPIR-V is ours to free via mem_ctx; NIR handed in
    * directly by a meta module stays owned by the module. */
   void *mem_ctx = ralloc_context(NULL);
   if (shader->module->nir == NULL)
      ralloc_steal(mem_ctx, nir);

   prog_data->inputs_read = nir->info.inputs_read;

   /* Test the PSIZ *bit* of the outputs_written bitfield */
   pipeline->writes_point_size =
      (nir->info.outputs_written & (1ull << VARYING_SLOT_PSIZ)) != 0;

   brw_compute_vue_map(&pipeline->device->info,
                       &prog_data->base.vue_map,
                       nir->info.outputs_written,
                       false /* XXX: Do SSO? */);

   unsigned code_size;
   const unsigned *shader_code =
      brw_compile_vs(compiler, NULL, mem_ctx, &key, prog_data, nir,
                     NULL, false, -1, &code_size, NULL);
   if (shader_code == NULL) {
      ralloc_free(mem_ctx);
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
   }

   /* The backend produced either a SIMD8 or a vec4 program; record the
    * kernel offset under the matching slot and mark the other unused. */
   const uint32_t offset =
      anv_pipeline_upload_kernel(pipeline, shader_code, code_size);
   if (prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8) {
      pipeline->vs_simd8 = offset;
      pipeline->vs_vec4 = NO_KERNEL;
   } else {
      pipeline->vs_simd8 = NO_KERNEL;
      pipeline->vs_vec4 = offset;
   }

   ralloc_free(mem_ctx);

   anv_pipeline_add_compiled_stage(pipeline, VK_SHADER_STAGE_VERTEX,
                                   &prog_data->base.base);

   return VK_SUCCESS;
}
/* Compile the pipeline's fragment shader, upload the kernel, and derive
 * the SIMD8/SIMD16 kernel start pointers and GRF start registers consumed
 * when emitting PS state.
 */
static VkResult
anv_pipeline_compile_fs(struct anv_pipeline *pipeline,
                        const VkGraphicsPipelineCreateInfo *info,
                        struct anv_shader *shader)
{
   const struct brw_compiler *compiler =
      pipeline->device->instance->physicalDevice.compiler;
   struct brw_wm_prog_data *prog_data = &pipeline->wm_prog_data;
   struct brw_wm_prog_key key;

   populate_wm_prog_key(&pipeline->device->info, info, &key);

   /* Replicated-clear shaders write a single color to every target */
   if (pipeline->use_repclear)
      key.nr_color_regions = 1;

   /* TODO: Look up shader in cache */

   memset(prog_data, 0, sizeof(*prog_data));

   /* Render targets occupy the start of the binding table; see the MAX_RTS
    * bias applied to the other sections in anv_pipeline_compile(). */
   prog_data->binding_table.render_target_start = 0;

   nir_shader *nir = anv_pipeline_compile(pipeline, shader,
                                          VK_SHADER_STAGE_FRAGMENT,
                                          &prog_data->base);
   if (nir == NULL)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   /* NIR produced from SPIR-V is ours to free via mem_ctx; NIR handed in
    * directly by a meta module stays owned by the module. */
   void *mem_ctx = ralloc_context(NULL);
   if (shader->module->nir == NULL)
      ralloc_steal(mem_ctx, nir);

   unsigned code_size;
   const unsigned *shader_code =
      brw_compile_fs(compiler, NULL, mem_ctx, &key, prog_data, nir,
                     NULL, -1, -1, pipeline->use_repclear, &code_size, NULL);
   if (shader_code == NULL) {
      ralloc_free(mem_ctx);
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
   }

   /* The backend may emit a SIMD8 program, a SIMD16 program located at
    * prog_offset_16 within the same kernel blob, or both. */
   uint32_t offset = anv_pipeline_upload_kernel(pipeline,
                                                shader_code, code_size);
   if (prog_data->no_8)
      pipeline->ps_simd8 = NO_KERNEL;
   else
      pipeline->ps_simd8 = offset;

   if (prog_data->no_8 || prog_data->prog_offset_16) {
      pipeline->ps_simd16 = offset + prog_data->prog_offset_16;
   } else {
      pipeline->ps_simd16 = NO_KERNEL;
   }

   /* KSP0 gets the primary program (SIMD8 when available, else SIMD16);
    * KSP2 is used for the SIMD16 variant only when both exist. */
   pipeline->ps_ksp2 = 0;
   pipeline->ps_grf_start2 = 0;
   if (pipeline->ps_simd8 != NO_KERNEL) {
      pipeline->ps_ksp0 = pipeline->ps_simd8;
      pipeline->ps_grf_start0 = prog_data->base.dispatch_grf_start_reg;
      if (pipeline->ps_simd16 != NO_KERNEL) {
         pipeline->ps_ksp2 = pipeline->ps_simd16;
         pipeline->ps_grf_start2 = prog_data->dispatch_grf_start_reg_16;
      }
   } else if (pipeline->ps_simd16 != NO_KERNEL) {
      pipeline->ps_ksp0 = pipeline->ps_simd16;
      pipeline->ps_grf_start0 = prog_data->dispatch_grf_start_reg_16;
   }

   ralloc_free(mem_ctx);

   anv_pipeline_add_compiled_stage(pipeline, VK_SHADER_STAGE_FRAGMENT,
                                   &prog_data->base);

   return VK_SUCCESS;
}
/* Compile the pipeline's compute shader, upload the kernel, and register
 * the stage on the pipeline.
 */
VkResult
anv_pipeline_compile_cs(struct anv_pipeline *pipeline,
                        const VkComputePipelineCreateInfo *info,
                        struct anv_shader *shader)
{
   const struct brw_compiler *compiler =
      pipeline->device->instance->physicalDevice.compiler;
   struct brw_cs_prog_key key;
   struct brw_cs_prog_data *prog_data = &pipeline->cs_prog_data;

   populate_cs_prog_key(&pipeline->device->info, &key);

   /* TODO: Look up shader in cache */

   memset(prog_data, 0, sizeof(*prog_data));

   nir_shader *nir = anv_pipeline_compile(pipeline, shader,
                                          VK_SHADER_STAGE_COMPUTE,
                                          &prog_data->base);
   if (nir == NULL)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   /* NIR lowered from SPIR-V is ours to free; meta-provided NIR belongs
    * to the module. */
   void *ralloc_ctx = ralloc_context(NULL);
   if (shader->module->nir == NULL)
      ralloc_steal(ralloc_ctx, nir);

   unsigned kernel_size;
   const unsigned *kernel =
      brw_compile_cs(compiler, NULL, ralloc_ctx, &key, prog_data, nir,
                     -1, &kernel_size, NULL);
   if (kernel == NULL) {
      ralloc_free(ralloc_ctx);
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
   }

   pipeline->cs_simd = anv_pipeline_upload_kernel(pipeline, kernel,
                                                  kernel_size);
   ralloc_free(ralloc_ctx);

   anv_pipeline_add_compiled_stage(pipeline, VK_SHADER_STAGE_COMPUTE,
                                   &prog_data->base);

   return VK_SUCCESS;
}
/* Amount of URB space reserved for push constants, in bytes */
static const int gen8_push_size = 32 * 1024;

/* Partition the URB between push constants, the VS, and the GS: give each
 * consumer its hardware minimum, then hand out the remainder in proportion
 * to how much extra each could use, respecting per-gen granularity rules.
 * Results are stored in pipeline->urb.
 */
static void
gen7_compute_urb_partition(struct anv_pipeline *pipeline)
{
   const struct brw_device_info *devinfo = &pipeline->device->info;
   bool vs_present = pipeline->active_stages & VK_SHADER_STAGE_VERTEX_BIT;
   /* URB entry sizes are in 64-byte units; use 1 as a placeholder when the
    * stage is absent so the arithmetic below stays well-defined. */
   unsigned vs_size = vs_present ? pipeline->vs_prog_data.base.urb_entry_size : 1;
   unsigned vs_entry_size_bytes = vs_size * 64;
   bool gs_present = pipeline->active_stages & VK_SHADER_STAGE_GEOMETRY_BIT;
   unsigned gs_size = gs_present ? pipeline->gs_prog_data.base.urb_entry_size : 1;
   unsigned gs_entry_size_bytes = gs_size * 64;

   /* From p35 of the Ivy Bridge PRM (section 1.7.1: 3DSTATE_URB_GS):
    *
    *     VS Number of URB Entries must be divisible by 8 if the VS URB Entry
    *     Allocation Size is less than 9 512-bit URB entries.
    *
    * Similar text exists for GS.
    */
   unsigned vs_granularity = (vs_size < 9) ? 8 : 1;
   unsigned gs_granularity = (gs_size < 9) ? 8 : 1;

   /* URB allocations must be done in 8k chunks. */
   unsigned chunk_size_bytes = 8192;

   /* Determine the size of the URB in chunks. */
   unsigned urb_chunks = devinfo->urb.size * 1024 / chunk_size_bytes;

   /* Reserve space for push constants */
   unsigned push_constant_bytes = gen8_push_size;
   unsigned push_constant_chunks =
      push_constant_bytes / chunk_size_bytes;

   /* Initially, assign each stage the minimum amount of URB space it needs,
    * and make a note of how much additional space it "wants" (the amount of
    * additional space it could actually make use of).
    */

   /* VS has a lower limit on the number of URB entries */
   unsigned vs_chunks =
      ALIGN(devinfo->urb.min_vs_entries * vs_entry_size_bytes,
            chunk_size_bytes) / chunk_size_bytes;
   unsigned vs_wants =
      ALIGN(devinfo->urb.max_vs_entries * vs_entry_size_bytes,
            chunk_size_bytes) / chunk_size_bytes - vs_chunks;

   unsigned gs_chunks = 0;
   unsigned gs_wants = 0;
   if (gs_present) {
      /* There are two constraints on the minimum amount of URB space we can
       * allocate:
       *
       * (1) We need room for at least 2 URB entries, since we always operate
       * the GS in DUAL_OBJECT mode.
       *
       * (2) We can't allocate less than nr_gs_entries_granularity.
       */
      gs_chunks = ALIGN(MAX2(gs_granularity, 2) * gs_entry_size_bytes,
                        chunk_size_bytes) / chunk_size_bytes;
      gs_wants =
         ALIGN(devinfo->urb.max_gs_entries * gs_entry_size_bytes,
               chunk_size_bytes) / chunk_size_bytes - gs_chunks;
   }

   /* There should always be enough URB space to satisfy the minimum
    * requirements of each stage.
    */
   unsigned total_needs = push_constant_chunks + vs_chunks + gs_chunks;
   assert(total_needs <= urb_chunks);

   /* Mete out remaining space (if any) in proportion to "wants". */
   unsigned total_wants = vs_wants + gs_wants;
   unsigned remaining_space = urb_chunks - total_needs;
   if (remaining_space > total_wants)
      remaining_space = total_wants;
   if (remaining_space > 0) {
      unsigned vs_additional = (unsigned)
         round(vs_wants * (((double) remaining_space) / total_wants));
      vs_chunks += vs_additional;
      remaining_space -= vs_additional;
      gs_chunks += remaining_space;
   }

   /* Sanity check that we haven't over-allocated. */
   assert(push_constant_chunks + vs_chunks + gs_chunks <= urb_chunks);

   /* Finally, compute the number of entries that can fit in the space
    * allocated to each stage.
    */
   unsigned nr_vs_entries = vs_chunks * chunk_size_bytes / vs_entry_size_bytes;
   unsigned nr_gs_entries = gs_chunks * chunk_size_bytes / gs_entry_size_bytes;

   /* Since we rounded up when computing *_wants, this may be slightly more
    * than the maximum allowed amount, so correct for that.
    */
   nr_vs_entries = MIN2(nr_vs_entries, devinfo->urb.max_vs_entries);
   nr_gs_entries = MIN2(nr_gs_entries, devinfo->urb.max_gs_entries);

   /* Ensure that we program a multiple of the granularity. */
   nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, vs_granularity);
   nr_gs_entries = ROUND_DOWN_TO(nr_gs_entries, gs_granularity);

   /* Finally, sanity check to make sure we have at least the minimum number
    * of entries needed for each stage.
    */
   assert(nr_vs_entries >= devinfo->urb.min_vs_entries);
   if (gs_present)
      assert(nr_gs_entries >= 2);

   /* Lay out the URB in the following order:
    * - push constants
    * - VS
    * - GS
    */
   pipeline->urb.vs_start = push_constant_chunks;
   pipeline->urb.vs_size = vs_size;
   pipeline->urb.nr_vs_entries = nr_vs_entries;

   pipeline->urb.gs_start = push_constant_chunks + vs_chunks;
   pipeline->urb.gs_size = gs_size;
   pipeline->urb.nr_gs_entries = nr_gs_entries;
}
static void
anv_pipeline_init_dynamic_state(struct anv_pipeline *pipeline,
const VkGraphicsPipelineCreateInfo *pCreateInfo)
@ -335,7 +916,6 @@ anv_pipeline_init(struct anv_pipeline *pipeline, struct anv_device *device,
pipeline->device = device;
pipeline->layout = anv_pipeline_layout_from_handle(pCreateInfo->layout);
memset(pipeline->shaders, 0, sizeof(pipeline->shaders));
result = anv_reloc_list_init(&pipeline->batch_relocs, device);
if (result != VK_SUCCESS) {
@ -349,11 +929,6 @@ anv_pipeline_init(struct anv_pipeline *pipeline, struct anv_device *device,
anv_state_stream_init(&pipeline->program_stream,
&device->instruction_block_pool);
for (uint32_t i = 0; i < pCreateInfo->stageCount; i++) {
pipeline->shaders[pCreateInfo->pStages[i].stage] =
anv_shader_from_handle(pCreateInfo->pStages[i].shader);
}
anv_pipeline_init_dynamic_state(pipeline, pCreateInfo);
if (pCreateInfo->pTessellationState)
@ -363,27 +938,44 @@ anv_pipeline_init(struct anv_pipeline *pipeline, struct anv_device *device,
anv_finishme("VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO");
pipeline->use_repclear = extra && extra->use_repclear;
pipeline->writes_point_size = false;
anv_compiler_run(device->compiler, pipeline);
/* When we free the pipeline, we detect stages based on the NULL status
* of various prog_data pointers. Make them NULL by default.
*/
memset(pipeline->prog_data, 0, sizeof(pipeline->prog_data));
memset(pipeline->scratch_start, 0, sizeof(pipeline->scratch_start));
const struct brw_wm_prog_data *wm_prog_data = &pipeline->wm_prog_data;
pipeline->vs_simd8 = NO_KERNEL;
pipeline->vs_vec4 = NO_KERNEL;
pipeline->gs_vec4 = NO_KERNEL;
pipeline->ps_ksp2 = 0;
pipeline->ps_grf_start2 = 0;
if (pipeline->ps_simd8 != NO_KERNEL) {
pipeline->ps_ksp0 = pipeline->ps_simd8;
pipeline->ps_grf_start0 = wm_prog_data->base.dispatch_grf_start_reg;
if (pipeline->ps_simd16 != NO_KERNEL) {
pipeline->ps_ksp2 = pipeline->ps_simd16;
pipeline->ps_grf_start2 = wm_prog_data->dispatch_grf_start_reg_16;
pipeline->active_stages = 0;
pipeline->total_scratch = 0;
for (uint32_t i = 0; i < pCreateInfo->stageCount; i++) {
ANV_FROM_HANDLE(anv_shader, shader, pCreateInfo->pStages[i].shader);
switch (pCreateInfo->pStages[i].stage) {
case VK_SHADER_STAGE_VERTEX:
anv_pipeline_compile_vs(pipeline, pCreateInfo, shader);
break;
case VK_SHADER_STAGE_FRAGMENT:
anv_pipeline_compile_fs(pipeline, pCreateInfo, shader);
break;
default:
anv_finishme("Unsupported shader stage");
}
} else if (pipeline->ps_simd16 != NO_KERNEL) {
pipeline->ps_ksp0 = pipeline->ps_simd16;
pipeline->ps_grf_start0 = wm_prog_data->dispatch_grf_start_reg_16;
} else {
unreachable("no ps shader");
}
if (!(pipeline->active_stages & VK_SHADER_STAGE_VERTEX_BIT)) {
/* Vertex is only optional if disable_vs is set */
assert(extra->disable_vs);
memset(&pipeline->vs_prog_data, 0, sizeof(pipeline->vs_prog_data));
}
gen7_compute_urb_partition(pipeline);
const VkPipelineVertexInputStateCreateInfo *vi_info =
pCreateInfo->pVertexInputState;
pipeline->vb_used = 0;

View File

@ -499,7 +499,6 @@ struct anv_device {
struct anv_block_pool scratch_block_pool;
struct anv_compiler * compiler;
pthread_mutex_t mutex;
};
@ -1089,7 +1088,6 @@ struct anv_pipeline {
uint32_t dynamic_state_mask;
struct anv_dynamic_state dynamic_state;
struct anv_shader * shaders[VK_SHADER_STAGE_NUM];
struct anv_pipeline_layout * layout;
bool use_repclear;
@ -1160,6 +1158,11 @@ anv_pipeline_init(struct anv_pipeline *pipeline, struct anv_device *device,
const VkGraphicsPipelineCreateInfo *pCreateInfo,
const struct anv_graphics_pipeline_create_info *extra);
VkResult
anv_pipeline_compile_cs(struct anv_pipeline *pipeline,
const VkComputePipelineCreateInfo *info,
struct anv_shader *shader);
VkResult
anv_graphics_pipeline_create(VkDevice device,
const VkGraphicsPipelineCreateInfo *pCreateInfo,
@ -1187,11 +1190,6 @@ gen8_compute_pipeline_create(VkDevice _device,
const VkComputePipelineCreateInfo *pCreateInfo,
VkPipeline *pPipeline);
struct anv_compiler *anv_compiler_create(struct anv_device *device);
void anv_compiler_destroy(struct anv_compiler *compiler);
int anv_compiler_run(struct anv_compiler *compiler, struct anv_pipeline *pipeline);
void anv_compiler_free(struct anv_pipeline *pipeline);
struct anv_format {
const VkFormat vk_format;
const char *name;

View File

@ -568,19 +568,29 @@ VkResult gen8_compute_pipeline_create(
anv_state_stream_init(&pipeline->program_stream,
&device->instruction_block_pool);
memset(pipeline->shaders, 0, sizeof(pipeline->shaders));
/* When we free the pipeline, we detect stages based on the NULL status
* of various prog_data pointers. Make them NULL by default.
*/
memset(pipeline->prog_data, 0, sizeof(pipeline->prog_data));
memset(pipeline->scratch_start, 0, sizeof(pipeline->scratch_start));
pipeline->shaders[VK_SHADER_STAGE_COMPUTE] =
anv_shader_from_handle(pCreateInfo->stage.shader);
pipeline->vs_simd8 = NO_KERNEL;
pipeline->vs_vec4 = NO_KERNEL;
pipeline->gs_vec4 = NO_KERNEL;
pipeline->active_stages = 0;
pipeline->total_scratch = 0;
assert(pCreateInfo->stage.stage == VK_SHADER_STAGE_COMPUTE);
ANV_FROM_HANDLE(anv_shader, shader, pCreateInfo->stage.shader);
anv_pipeline_compile_cs(pipeline, pCreateInfo, shader);
pipeline->use_repclear = false;
anv_compiler_run(device->compiler, pipeline);
const struct brw_cs_prog_data *cs_prog_data = &pipeline->cs_prog_data;
anv_batch_emit(&pipeline->batch, GEN8_MEDIA_VFE_STATE,
.ScratchSpaceBasePointer = pipeline->scratch_start[VK_SHADER_STAGE_FRAGMENT],
.ScratchSpaceBasePointer = pipeline->scratch_start[VK_SHADER_STAGE_COMPUTE],
.PerThreadScratchSpace = ffs(cs_prog_data->base.total_scratch / 2048),
.ScratchSpaceBasePointerHigh = 0,
.StackSize = 0,