diff --git a/src/mesa/drivers/dri/i965/gen8_ps_state.c b/src/mesa/drivers/dri/i965/gen8_ps_state.c index c25b82757c1..c2810bb21b9 100644 --- a/src/mesa/drivers/dri/i965/gen8_ps_state.c +++ b/src/mesa/drivers/dri/i965/gen8_ps_state.c @@ -180,10 +180,6 @@ upload_ps_state(struct brw_context *brw) if (brw->wm.prog_data->base.nr_params > 0) dw6 |= GEN7_PS_PUSH_CONSTANT_ENABLE; - dw6 |= GEN7_PS_8_DISPATCH_ENABLE; - if (brw->wm.prog_data->prog_offset_16) - dw6 |= GEN7_PS_16_DISPATCH_ENABLE; - /* From the documentation for this packet: * "If the PS kernel does not need the Position XY Offsets to * compute a Position Value, then this field should be programmed @@ -202,13 +198,40 @@ upload_ps_state(struct brw_context *brw) else dw6 |= GEN7_PS_POSOFFSET_NONE; - dw7 |= - brw->wm.prog_data->first_curbe_grf << GEN7_PS_DISPATCH_START_GRF_SHIFT_0 | - brw->wm.prog_data->first_curbe_grf_16<< GEN7_PS_DISPATCH_START_GRF_SHIFT_2; + /* _NEW_MULTISAMPLE + * In case of non 1x per sample shading, only one of SIMD8 and SIMD16 + * should be enabled. We do 'SIMD16 only' dispatch if a SIMD16 shader + * is successfully compiled. In majority of the cases that bring us + * better performance than 'SIMD8 only' dispatch. + */ + int min_invocations_per_fragment = + _mesa_get_min_invocations_per_fragment(ctx, brw->fragment_program, false); + assert(min_invocations_per_fragment >= 1); + + if (brw->wm.prog_data->prog_offset_16) { + dw6 |= GEN7_PS_16_DISPATCH_ENABLE; + if (min_invocations_per_fragment == 1) { + dw6 |= GEN7_PS_8_DISPATCH_ENABLE; + dw7 |= (brw->wm.prog_data->first_curbe_grf << + GEN7_PS_DISPATCH_START_GRF_SHIFT_0); + dw7 |= (brw->wm.prog_data->first_curbe_grf_16 << + GEN7_PS_DISPATCH_START_GRF_SHIFT_2); + } else { + dw7 |= (brw->wm.prog_data->first_curbe_grf_16 << + GEN7_PS_DISPATCH_START_GRF_SHIFT_0); + } + } else { + dw6 |= GEN7_PS_8_DISPATCH_ENABLE; + dw7 |= (brw->wm.prog_data->first_curbe_grf << + GEN7_PS_DISPATCH_START_GRF_SHIFT_0); + } BEGIN_BATCH(12); OUT_BATCH(_3DSTATE_PS << 16 | (12 - 2)); - OUT_BATCH(brw->wm.base.prog_offset); + if (brw->wm.prog_data->prog_offset_16 && min_invocations_per_fragment > 1) + OUT_BATCH(brw->wm.base.prog_offset + brw->wm.prog_data->prog_offset_16); + else + OUT_BATCH(brw->wm.base.prog_offset); OUT_BATCH(0); OUT_BATCH(dw3); if (brw->wm.prog_data->total_scratch) { @@ -230,7 +253,7 @@ upload_ps_state(struct brw_context *brw) const struct brw_tracked_state gen8_ps_state = { .dirty = { - .mesa = _NEW_PROGRAM_CONSTANTS, + .mesa = _NEW_PROGRAM_CONSTANTS | _NEW_MULTISAMPLE, .brw = BRW_NEW_FRAGMENT_PROGRAM | BRW_NEW_PS_BINDING_TABLE | BRW_NEW_BATCH |