i965/fs: Organize prog_data by ksp number rather than SIMD width
The hardware packets organize kernel pointers and GRF start by slots that don't map directly to dispatch width. This means that all of the state setup code has to re-arrange the data from prog_data into these slots. This logic has been duplicated 4 times in the GL driver and one more time in the Vulkan driver. Let's just put it all in brw_fs.cpp. Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
This commit is contained in:
parent
7be100ac9a
commit
bee160b31b
|
@ -585,17 +585,17 @@ anv_pipeline_compile_fs(struct anv_pipeline *pipeline,
|
|||
const struct brw_stage_prog_data *stage_prog_data;
|
||||
struct anv_pipeline_bind_map map;
|
||||
struct brw_wm_prog_key key;
|
||||
uint32_t kernel = NO_KERNEL;
|
||||
unsigned char sha1[20];
|
||||
|
||||
populate_wm_prog_key(&pipeline->device->info, info, extra, &key);
|
||||
|
||||
if (module->size > 0) {
|
||||
anv_hash_shader(sha1, &key, sizeof(key), module, entrypoint, spec_info);
|
||||
kernel = anv_pipeline_cache_search(cache, sha1, &stage_prog_data, &map);
|
||||
pipeline->ps_ksp0 =
|
||||
anv_pipeline_cache_search(cache, sha1, &stage_prog_data, &map);
|
||||
}
|
||||
|
||||
if (kernel == NO_KERNEL) {
|
||||
if (pipeline->ps_ksp0 == NO_KERNEL) {
|
||||
struct brw_wm_prog_data prog_data = { 0, };
|
||||
struct anv_pipeline_binding surface_to_descriptor[256];
|
||||
struct anv_pipeline_binding sampler_to_descriptor[256];
|
||||
|
@ -682,7 +682,8 @@ anv_pipeline_compile_fs(struct anv_pipeline *pipeline,
|
|||
}
|
||||
|
||||
stage_prog_data = &prog_data.base;
|
||||
kernel = anv_pipeline_cache_upload_kernel(cache,
|
||||
pipeline->ps_ksp0 =
|
||||
anv_pipeline_cache_upload_kernel(cache,
|
||||
module->size > 0 ? sha1 : NULL,
|
||||
shader_code, code_size,
|
||||
&stage_prog_data, sizeof(prog_data),
|
||||
|
@ -691,34 +692,6 @@ anv_pipeline_compile_fs(struct anv_pipeline *pipeline,
|
|||
ralloc_free(mem_ctx);
|
||||
}
|
||||
|
||||
const struct brw_wm_prog_data *wm_prog_data =
|
||||
(const struct brw_wm_prog_data *) stage_prog_data;
|
||||
|
||||
if (wm_prog_data->no_8)
|
||||
pipeline->ps_simd8 = NO_KERNEL;
|
||||
else
|
||||
pipeline->ps_simd8 = kernel;
|
||||
|
||||
if (wm_prog_data->no_8 || wm_prog_data->prog_offset_16) {
|
||||
pipeline->ps_simd16 = kernel + wm_prog_data->prog_offset_16;
|
||||
} else {
|
||||
pipeline->ps_simd16 = NO_KERNEL;
|
||||
}
|
||||
|
||||
pipeline->ps_ksp2 = 0;
|
||||
pipeline->ps_grf_start2 = 0;
|
||||
if (pipeline->ps_simd8 != NO_KERNEL) {
|
||||
pipeline->ps_ksp0 = pipeline->ps_simd8;
|
||||
pipeline->ps_grf_start0 = wm_prog_data->base.dispatch_grf_start_reg;
|
||||
if (pipeline->ps_simd16 != NO_KERNEL) {
|
||||
pipeline->ps_ksp2 = pipeline->ps_simd16;
|
||||
pipeline->ps_grf_start2 = wm_prog_data->dispatch_grf_start_reg_16;
|
||||
}
|
||||
} else if (pipeline->ps_simd16 != NO_KERNEL) {
|
||||
pipeline->ps_ksp0 = pipeline->ps_simd16;
|
||||
pipeline->ps_grf_start0 = wm_prog_data->dispatch_grf_start_reg_16;
|
||||
}
|
||||
|
||||
anv_pipeline_add_compiled_stage(pipeline, MESA_SHADER_FRAGMENT,
|
||||
stage_prog_data, &map);
|
||||
|
||||
|
|
|
@ -1418,12 +1418,7 @@ struct anv_pipeline {
|
|||
struct anv_state blend_state;
|
||||
uint32_t vs_simd8;
|
||||
uint32_t vs_vec4;
|
||||
uint32_t ps_simd8;
|
||||
uint32_t ps_simd16;
|
||||
uint32_t ps_ksp0;
|
||||
uint32_t ps_ksp2;
|
||||
uint32_t ps_grf_start0;
|
||||
uint32_t ps_grf_start2;
|
||||
uint32_t gs_kernel;
|
||||
uint32_t cs_simd;
|
||||
|
||||
|
|
|
@ -375,19 +375,21 @@ genX(graphics_pipeline_create)(
|
|||
POSOFFSET_SAMPLE : POSOFFSET_NONE;
|
||||
|
||||
ps._32PixelDispatchEnable = false;
|
||||
ps._16PixelDispatchEnable = pipeline->ps_simd16 != NO_KERNEL;
|
||||
ps._8PixelDispatchEnable = pipeline->ps_simd8 != NO_KERNEL;
|
||||
ps._16PixelDispatchEnable = wm_prog_data->dispatch_16;
|
||||
ps._8PixelDispatchEnable = wm_prog_data->dispatch_8;
|
||||
|
||||
ps.DispatchGRFStartRegisterforConstantSetupData0 = pipeline->ps_grf_start0,
|
||||
ps.DispatchGRFStartRegisterforConstantSetupData0 =
|
||||
wm_prog_data->base.dispatch_grf_start_reg,
|
||||
ps.DispatchGRFStartRegisterforConstantSetupData1 = 0,
|
||||
ps.DispatchGRFStartRegisterforConstantSetupData2 = pipeline->ps_grf_start2,
|
||||
ps.DispatchGRFStartRegisterforConstantSetupData2 =
|
||||
wm_prog_data->dispatch_grf_start_reg_2,
|
||||
|
||||
/* Haswell requires the sample mask to be set in this packet as well as
|
||||
* in 3DSTATE_SAMPLE_MASK; the values should match. */
|
||||
/* _NEW_BUFFERS, _NEW_MULTISAMPLE */
|
||||
|
||||
ps.KernelStartPointer1 = 0;
|
||||
ps.KernelStartPointer2 = pipeline->ps_ksp2;
|
||||
ps.KernelStartPointer2 = pipeline->ps_ksp0 + wm_prog_data->prog_offset_2;
|
||||
}
|
||||
|
||||
/* FIXME-GEN7: This needs a lot more work, cf gen7 upload_wm_state(). */
|
||||
|
|
|
@ -502,9 +502,9 @@ genX(graphics_pipeline_create)(
|
|||
anv_batch_emit(&pipeline->batch, GENX(3DSTATE_PS), ps) {
|
||||
ps.KernelStartPointer0 = pipeline->ps_ksp0;
|
||||
ps.KernelStartPointer1 = 0;
|
||||
ps.KernelStartPointer2 = pipeline->ps_ksp2;
|
||||
ps._8PixelDispatchEnable = pipeline->ps_simd8 != NO_KERNEL;
|
||||
ps._16PixelDispatchEnable = pipeline->ps_simd16 != NO_KERNEL;
|
||||
ps.KernelStartPointer2 = pipeline->ps_ksp0 + wm_prog_data->prog_offset_2;
|
||||
ps._8PixelDispatchEnable = wm_prog_data->dispatch_8;
|
||||
ps._16PixelDispatchEnable = wm_prog_data->dispatch_16;
|
||||
ps._32PixelDispatchEnable = false;
|
||||
ps.SingleProgramFlow = false;
|
||||
ps.VectorMaskEnable = true;
|
||||
|
@ -518,9 +518,11 @@ genX(graphics_pipeline_create)(
|
|||
ps.ScratchSpaceBasePointer = pipeline->scratch_start[MESA_SHADER_FRAGMENT];
|
||||
ps.PerThreadScratchSpace = scratch_space(&wm_prog_data->base);
|
||||
|
||||
ps.DispatchGRFStartRegisterForConstantSetupData0 = pipeline->ps_grf_start0;
|
||||
ps.DispatchGRFStartRegisterForConstantSetupData0 =
|
||||
wm_prog_data->base.dispatch_grf_start_reg;
|
||||
ps.DispatchGRFStartRegisterForConstantSetupData1 = 0;
|
||||
ps.DispatchGRFStartRegisterForConstantSetupData2 = pipeline->ps_grf_start2;
|
||||
ps.DispatchGRFStartRegisterForConstantSetupData2 =
|
||||
wm_prog_data->dispatch_grf_start_reg_2;
|
||||
}
|
||||
|
||||
bool per_sample_ps = pCreateInfo->pMultisampleState &&
|
||||
|
|
|
@ -367,9 +367,11 @@ struct brw_wm_prog_data {
|
|||
|
||||
GLuint num_varying_inputs;
|
||||
|
||||
GLuint dispatch_grf_start_reg_16;
|
||||
GLuint reg_blocks;
|
||||
GLuint reg_blocks_16;
|
||||
uint8_t reg_blocks_0;
|
||||
uint8_t reg_blocks_2;
|
||||
|
||||
uint8_t dispatch_grf_start_reg_2;
|
||||
uint32_t prog_offset_2;
|
||||
|
||||
struct {
|
||||
/** @{
|
||||
|
@ -383,7 +385,8 @@ struct brw_wm_prog_data {
|
|||
bool computed_stencil;
|
||||
|
||||
bool early_fragment_tests;
|
||||
bool no_8;
|
||||
bool dispatch_8;
|
||||
bool dispatch_16;
|
||||
bool dual_src_blend;
|
||||
bool persample_dispatch;
|
||||
bool uses_pos_offset;
|
||||
|
@ -393,7 +396,6 @@ struct brw_wm_prog_data {
|
|||
bool uses_src_w;
|
||||
bool uses_sample_mask;
|
||||
bool pulls_bary;
|
||||
uint32_t prog_offset_16;
|
||||
|
||||
/**
|
||||
* Mask of which interpolation modes are required by the fragment shader.
|
||||
|
|
|
@ -5800,11 +5800,6 @@ fs_visitor::run_fs(bool do_rep_send)
|
|||
return false;
|
||||
}
|
||||
|
||||
if (dispatch_width == 8)
|
||||
wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
|
||||
else
|
||||
wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
|
||||
|
||||
return !failed;
|
||||
}
|
||||
|
||||
|
@ -6004,6 +5999,8 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
|
|||
shader);
|
||||
|
||||
cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL;
|
||||
uint8_t simd8_grf_start, simd16_grf_start;
|
||||
unsigned simd8_grf_used, simd16_grf_used;
|
||||
|
||||
fs_visitor v8(compiler, log_data, mem_ctx, key,
|
||||
&prog_data->base, prog, shader, 8,
|
||||
|
@ -6015,7 +6012,8 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
|
|||
return NULL;
|
||||
} else if (likely(!(INTEL_DEBUG & DEBUG_NO8))) {
|
||||
simd8_cfg = v8.cfg;
|
||||
prog_data->base.dispatch_grf_start_reg = v8.payload.num_regs;
|
||||
simd8_grf_start = v8.payload.num_regs;
|
||||
simd8_grf_used = v8.grf_used;
|
||||
}
|
||||
|
||||
if (!v8.simd16_unsupported &&
|
||||
|
@ -6031,7 +6029,8 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
|
|||
v16.fail_msg);
|
||||
} else {
|
||||
simd16_cfg = v16.cfg;
|
||||
prog_data->dispatch_grf_start_reg_16 = v16.payload.num_regs;
|
||||
simd16_grf_start = v16.payload.num_regs;
|
||||
simd16_grf_used = v16.grf_used;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -6047,6 +6046,24 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
|
|||
if (compiler->devinfo->gen < 5 && simd16_cfg)
|
||||
simd8_cfg = NULL;
|
||||
|
||||
if (prog_data->persample_dispatch) {
|
||||
/* Starting with SandyBridge (where we first get MSAA), the different
|
||||
* pixel dispatch combinations are grouped into classifications A
|
||||
* through F (SNB PRM Vol. 2 Part 1 Section 7.7.1). On all hardware
|
||||
* generations, the only configurations supporting persample dispatch
|
||||
* are those in which only one dispatch width is enabled.
|
||||
*
|
||||
* If computed depth is enabled, SNB only allows SIMD8 while IVB+
|
||||
* allow SIMD8 or SIMD16 so we choose SIMD16 if available.
|
||||
*/
|
||||
if (compiler->devinfo->gen == 6 &&
|
||||
prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF) {
|
||||
simd16_cfg = NULL;
|
||||
} else if (simd16_cfg) {
|
||||
simd8_cfg = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
/* We have to compute the flat inputs after the visitor is finished running
|
||||
* because it relies on prog_data->urb_setup which is computed in
|
||||
* fs_visitor::calculate_urb_setup().
|
||||
|
@ -6065,14 +6082,23 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
|
|||
}
|
||||
|
||||
if (simd8_cfg) {
|
||||
prog_data->dispatch_8 = true;
|
||||
g.generate_code(simd8_cfg, 8);
|
||||
prog_data->no_8 = false;
|
||||
} else {
|
||||
prog_data->no_8 = true;
|
||||
}
|
||||
prog_data->base.dispatch_grf_start_reg = simd8_grf_start;
|
||||
prog_data->reg_blocks_0 = brw_register_blocks(simd8_grf_used);
|
||||
|
||||
if (simd16_cfg)
|
||||
prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
|
||||
if (simd16_cfg) {
|
||||
prog_data->dispatch_16 = true;
|
||||
prog_data->prog_offset_2 = g.generate_code(simd16_cfg, 16);
|
||||
prog_data->dispatch_grf_start_reg_2 = simd16_grf_start;
|
||||
prog_data->reg_blocks_2 = brw_register_blocks(simd16_grf_used);
|
||||
}
|
||||
} else if (simd16_cfg) {
|
||||
prog_data->dispatch_16 = true;
|
||||
g.generate_code(simd16_cfg, 16);
|
||||
prog_data->base.dispatch_grf_start_reg = simd16_grf_start;
|
||||
prog_data->reg_blocks_0 = brw_register_blocks(simd16_grf_used);
|
||||
}
|
||||
|
||||
return g.get_assembly(final_assembly_size);
|
||||
}
|
||||
|
|
|
@ -169,7 +169,7 @@ fs_visitor::emit_dummy_fs()
|
|||
stage_prog_data->nr_pull_params = 0;
|
||||
stage_prog_data->curb_read_length = 0;
|
||||
stage_prog_data->dispatch_grf_start_reg = 2;
|
||||
wm_prog_data->dispatch_grf_start_reg_16 = 2;
|
||||
wm_prog_data->dispatch_grf_start_reg_2 = 2;
|
||||
grf_used = 1; /* Gen4-5 don't allow zero GRF blocks */
|
||||
|
||||
calculate_cfg();
|
||||
|
|
|
@ -86,48 +86,37 @@ brw_upload_wm_unit(struct brw_context *brw)
|
|||
sizeof(*wm), 32, &brw->wm.base.state_offset);
|
||||
memset(wm, 0, sizeof(*wm));
|
||||
|
||||
if (prog_data->prog_offset_16) {
|
||||
if (prog_data->dispatch_8 && prog_data->dispatch_16) {
|
||||
/* These two fields should be the same pre-gen6, which is why we
|
||||
* only have one hardware field to program for both dispatch
|
||||
* widths.
|
||||
*/
|
||||
assert(prog_data->base.dispatch_grf_start_reg ==
|
||||
prog_data->dispatch_grf_start_reg_16);
|
||||
prog_data->dispatch_grf_start_reg_2);
|
||||
}
|
||||
|
||||
/* BRW_NEW_PROGRAM_CACHE | BRW_NEW_FS_PROG_DATA */
|
||||
if (prog_data->no_8) {
|
||||
wm->wm5.enable_16_pix = 1;
|
||||
wm->thread0.grf_reg_count = prog_data->reg_blocks_16;
|
||||
wm->thread0.kernel_start_pointer =
|
||||
brw_program_reloc(brw,
|
||||
brw->wm.base.state_offset +
|
||||
offsetof(struct brw_wm_unit_state, thread0),
|
||||
brw->wm.base.prog_offset +
|
||||
prog_data->prog_offset_16 +
|
||||
(prog_data->reg_blocks_16 << 1)) >> 6;
|
||||
|
||||
} else {
|
||||
wm->thread0.grf_reg_count = prog_data->reg_blocks;
|
||||
wm->wm9.grf_reg_count_2 = prog_data->reg_blocks_16;
|
||||
|
||||
wm->wm5.enable_8_pix = 1;
|
||||
if (prog_data->prog_offset_16)
|
||||
wm->wm5.enable_16_pix = 1;
|
||||
wm->wm5.enable_8_pix = prog_data->dispatch_8;
|
||||
wm->wm5.enable_16_pix = prog_data->dispatch_16;
|
||||
|
||||
if (prog_data->dispatch_8 || prog_data->dispatch_16) {
|
||||
wm->thread0.grf_reg_count = prog_data->reg_blocks_0;
|
||||
wm->thread0.kernel_start_pointer =
|
||||
brw_program_reloc(brw,
|
||||
brw->wm.base.state_offset +
|
||||
offsetof(struct brw_wm_unit_state, thread0),
|
||||
brw->wm.base.prog_offset +
|
||||
(wm->thread0.grf_reg_count << 1)) >> 6;
|
||||
}
|
||||
|
||||
if (prog_data->prog_offset_2) {
|
||||
wm->wm9.grf_reg_count_2 = prog_data->reg_blocks_2;
|
||||
wm->wm9.kernel_start_pointer_2 =
|
||||
brw_program_reloc(brw,
|
||||
brw->wm.base.state_offset +
|
||||
offsetof(struct brw_wm_unit_state, wm9),
|
||||
brw->wm.base.prog_offset +
|
||||
prog_data->prog_offset_16 +
|
||||
prog_data->prog_offset_2 +
|
||||
(wm->wm9.grf_reg_count_2 << 1)) >> 6;
|
||||
}
|
||||
|
||||
|
|
|
@ -129,29 +129,19 @@ gen6_upload_wm_state(struct brw_context *brw,
|
|||
|
||||
dw5 |= (brw->max_wm_threads - 1) << GEN6_WM_MAX_THREADS_SHIFT;
|
||||
|
||||
if (prog_data->prog_offset_16 || prog_data->no_8) {
|
||||
if (prog_data->dispatch_8)
|
||||
dw5 |= GEN6_WM_8_DISPATCH_ENABLE;
|
||||
|
||||
if (prog_data->dispatch_16)
|
||||
dw5 |= GEN6_WM_16_DISPATCH_ENABLE;
|
||||
|
||||
if (!prog_data->no_8 && !prog_data->persample_dispatch) {
|
||||
dw5 |= GEN6_WM_8_DISPATCH_ENABLE;
|
||||
dw4 |= (prog_data->base.dispatch_grf_start_reg <<
|
||||
GEN6_WM_DISPATCH_START_GRF_SHIFT_0);
|
||||
dw4 |= (prog_data->dispatch_grf_start_reg_16 <<
|
||||
GEN6_WM_DISPATCH_START_GRF_SHIFT_2);
|
||||
dw4 |= prog_data->base.dispatch_grf_start_reg <<
|
||||
GEN6_WM_DISPATCH_START_GRF_SHIFT_0;
|
||||
dw4 |= prog_data->dispatch_grf_start_reg_2 <<
|
||||
GEN6_WM_DISPATCH_START_GRF_SHIFT_2;
|
||||
|
||||
ksp0 = stage_state->prog_offset;
|
||||
ksp2 = stage_state->prog_offset + prog_data->prog_offset_16;
|
||||
} else {
|
||||
dw4 |= (prog_data->dispatch_grf_start_reg_16 <<
|
||||
GEN6_WM_DISPATCH_START_GRF_SHIFT_0);
|
||||
ksp0 = stage_state->prog_offset + prog_data->prog_offset_16;
|
||||
}
|
||||
}
|
||||
else {
|
||||
dw5 |= GEN6_WM_8_DISPATCH_ENABLE;
|
||||
dw4 |= (prog_data->base.dispatch_grf_start_reg <<
|
||||
GEN6_WM_DISPATCH_START_GRF_SHIFT_0);
|
||||
ksp0 = stage_state->prog_offset;
|
||||
}
|
||||
ksp2 = stage_state->prog_offset + prog_data->prog_offset_2;
|
||||
|
||||
if (dual_source_blend_enable)
|
||||
dw5 |= GEN6_WM_DUAL_SOURCE_BLEND_ENABLE;
|
||||
|
@ -200,37 +190,6 @@ gen6_upload_wm_state(struct brw_context *brw,
|
|||
dw6 |= GEN6_WM_MSDISPMODE_PERSAMPLE;
|
||||
else {
|
||||
dw6 |= GEN6_WM_MSDISPMODE_PERPIXEL;
|
||||
|
||||
/* From the Sandy Bridge PRM, Vol 2 part 1, 7.7.1 ("Pixel Grouping
|
||||
* (Dispatch Size) Control"), p.334:
|
||||
*
|
||||
* Note: in the table below, the Valid column indicates which
|
||||
* products that combination is supported on. Combinations of
|
||||
* dispatch enables not listed in the table are not available on
|
||||
* any product.
|
||||
*
|
||||
* A: Valid on all products
|
||||
*
|
||||
* B: Not valid on [DevSNB] if 4x PERPIXEL mode with pixel shader
|
||||
* computed depth.
|
||||
*
|
||||
* D: Valid on all products, except when in non-1x PERSAMPLE mode
|
||||
* (applies to [DevSNB+] only). Not valid on [DevSNB] if 4x
|
||||
* PERPIXEL mode with pixel shader computed depth.
|
||||
*
|
||||
* E: Not valid on [DevSNB] if 4x PERPIXEL mode with pixel shader
|
||||
* computed depth.
|
||||
*
|
||||
* F: Valid on all products, except not valid on [DevSNB] if 4x
|
||||
* PERPIXEL mode with pixel shader computed depth.
|
||||
*
|
||||
* In the table that follows, the only entry with "A" in the Valid
|
||||
* column is the entry where only 8 pixel dispatch is enabled.
|
||||
* Therefore, when we are in PERPIXEL mode with pixel shader computed
|
||||
* depth, we need to disable SIMD16 dispatch.
|
||||
*/
|
||||
if (dw5 & GEN6_WM_COMPUTED_DEPTH)
|
||||
dw5 &= ~GEN6_WM_16_DISPATCH_ENABLE;
|
||||
}
|
||||
} else {
|
||||
dw6 |= GEN6_WM_MSRAST_OFF_PIXEL;
|
||||
|
|
|
@ -216,34 +216,19 @@ gen7_upload_ps_state(struct brw_context *brw,
|
|||
|
||||
dw4 |= fast_clear_op;
|
||||
|
||||
if (prog_data->prog_offset_16 || prog_data->no_8) {
|
||||
if (prog_data->dispatch_16)
|
||||
dw4 |= GEN7_PS_16_DISPATCH_ENABLE;
|
||||
|
||||
/* In case of non 1x per sample shading, only one of SIMD8 and SIMD16
|
||||
* should be enabled. We do 'SIMD16 only' dispatch if a SIMD16 shader
|
||||
* is successfully compiled. In majority of the cases that bring us
|
||||
* better performance than 'SIMD8 only' dispatch.
|
||||
*/
|
||||
if (!prog_data->no_8 && !prog_data->persample_dispatch) {
|
||||
if (prog_data->dispatch_8)
|
||||
dw4 |= GEN7_PS_8_DISPATCH_ENABLE;
|
||||
dw5 |= (prog_data->base.dispatch_grf_start_reg <<
|
||||
GEN7_PS_DISPATCH_START_GRF_SHIFT_0);
|
||||
dw5 |= (prog_data->dispatch_grf_start_reg_16 <<
|
||||
GEN7_PS_DISPATCH_START_GRF_SHIFT_2);
|
||||
|
||||
dw5 |= prog_data->base.dispatch_grf_start_reg <<
|
||||
GEN7_PS_DISPATCH_START_GRF_SHIFT_0;
|
||||
dw5 |= prog_data->dispatch_grf_start_reg_2 <<
|
||||
GEN7_PS_DISPATCH_START_GRF_SHIFT_2;
|
||||
|
||||
ksp0 = stage_state->prog_offset;
|
||||
ksp2 = stage_state->prog_offset + prog_data->prog_offset_16;
|
||||
} else {
|
||||
dw5 |= (prog_data->dispatch_grf_start_reg_16 <<
|
||||
GEN7_PS_DISPATCH_START_GRF_SHIFT_0);
|
||||
ksp0 = stage_state->prog_offset + prog_data->prog_offset_16;
|
||||
}
|
||||
}
|
||||
else {
|
||||
dw4 |= GEN7_PS_8_DISPATCH_ENABLE;
|
||||
dw5 |= (prog_data->base.dispatch_grf_start_reg <<
|
||||
GEN7_PS_DISPATCH_START_GRF_SHIFT_0);
|
||||
ksp0 = stage_state->prog_offset;
|
||||
}
|
||||
ksp2 = stage_state->prog_offset + prog_data->prog_offset_2;
|
||||
|
||||
BEGIN_BATCH(8);
|
||||
OUT_BATCH(_3DSTATE_PS << 16 | (8 - 2));
|
||||
|
|
|
@ -234,34 +234,19 @@ gen8_upload_ps_state(struct brw_context *brw,
|
|||
|
||||
dw6 |= fast_clear_op;
|
||||
|
||||
if (prog_data->prog_offset_16 || prog_data->no_8) {
|
||||
if (prog_data->dispatch_8)
|
||||
dw6 |= GEN7_PS_8_DISPATCH_ENABLE;
|
||||
|
||||
if (prog_data->dispatch_16)
|
||||
dw6 |= GEN7_PS_16_DISPATCH_ENABLE;
|
||||
|
||||
/* In case of non 1x per sample shading, only one of SIMD8 and SIMD16
|
||||
* should be enabled. We do 'SIMD16 only' dispatch if a SIMD16 shader
|
||||
* is successfully compiled. In majority of the cases that bring us
|
||||
* better performance than 'SIMD8 only' dispatch.
|
||||
*/
|
||||
if (!prog_data->no_8 && !prog_data->persample_dispatch) {
|
||||
dw6 |= GEN7_PS_8_DISPATCH_ENABLE;
|
||||
dw7 |= (prog_data->base.dispatch_grf_start_reg <<
|
||||
GEN7_PS_DISPATCH_START_GRF_SHIFT_0);
|
||||
dw7 |= (prog_data->dispatch_grf_start_reg_16 <<
|
||||
GEN7_PS_DISPATCH_START_GRF_SHIFT_2);
|
||||
ksp0 = stage_state->prog_offset;
|
||||
ksp2 = stage_state->prog_offset + prog_data->prog_offset_16;
|
||||
} else {
|
||||
dw7 |= (prog_data->dispatch_grf_start_reg_16 <<
|
||||
GEN7_PS_DISPATCH_START_GRF_SHIFT_0);
|
||||
dw7 |= prog_data->base.dispatch_grf_start_reg <<
|
||||
GEN7_PS_DISPATCH_START_GRF_SHIFT_0;
|
||||
dw7 |= prog_data->dispatch_grf_start_reg_2 <<
|
||||
GEN7_PS_DISPATCH_START_GRF_SHIFT_2;
|
||||
|
||||
ksp0 = stage_state->prog_offset + prog_data->prog_offset_16;
|
||||
}
|
||||
} else {
|
||||
dw6 |= GEN7_PS_8_DISPATCH_ENABLE;
|
||||
dw7 |= (prog_data->base.dispatch_grf_start_reg <<
|
||||
GEN7_PS_DISPATCH_START_GRF_SHIFT_0);
|
||||
ksp0 = stage_state->prog_offset;
|
||||
}
|
||||
ksp2 = stage_state->prog_offset + prog_data->prog_offset_2;
|
||||
|
||||
BEGIN_BATCH(12);
|
||||
OUT_BATCH(_3DSTATE_PS << 16 | (12 - 2));
|
||||
|
|
Loading…
Reference in New Issue