i965: Do Sandybridge workaround flushes before each primitive.

Sandybridge requires the post-sync non-zero workaround in a ton of
places, and if you ever miss one, the GPU usually hangs.

Currently, we try to track exactly when a workaround flush is
necessary (via the brw->batch.need_workaround_flush flag).  This is
tricky to get right, and we've botched it several times in the past.

This patch unconditionally performs the post-sync non-zero flush at the
start of each primitive's state upload (including BLORP).  We drop the
need_workaround_flush flag and remove all the other calls to
intel_emit_post_sync_nonzero_flush(), as the flush has already been
performed.

We have no data to indicate that simply flushing all the time will
hurt performance, and it has the potential to help stability.
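
For reference, the flush itself is just two back-to-back PIPE_CONTROLs.
A minimal sketch of what intel_emit_post_sync_nonzero_flush() boils down
to after this patch (the same code appears in the intel_batchbuffer.c
hunk below; the helpers are the driver's existing PIPE_CONTROL wrappers):

   void
   intel_emit_post_sync_nonzero_flush(struct brw_context *brw)
   {
      /* First, a PIPE_CONTROL with CS stall and stall at scoreboard... */
      brw_emit_pipe_control_flush(brw,
                                  PIPE_CONTROL_CS_STALL |
                                  PIPE_CONTROL_STALL_AT_SCOREBOARD);

      /* ...then a PIPE_CONTROL with a non-zero post-sync operation:
       * an immediate write to a scratch BO kept just for this purpose.
       */
      brw_emit_pipe_control_write(brw, PIPE_CONTROL_WRITE_IMMEDIATE,
                                  brw->batch.workaround_bo, 0, 0, 0);
   }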

v2: Add post-sync workaround to initial GPU state upload to be extra
    cautious (suggested by Chad Versace).

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Chad Versace <chad.versace@linux.intel.com>
commit 0b499abb51
parent 92163482bd
Author: Kenneth Graunke
Date:   2015-01-09 23:07:56 -08:00

11 changed files with 20 additions and 84 deletions

@@ -861,7 +861,6 @@ struct intel_batchbuffer {
    drm_intel_bo *last_bo;
    /** BO for post-sync nonzero writes for gen6 workaround. */
    drm_intel_bo *workaround_bo;
-   bool need_workaround_flush;
 
    uint16_t emit, total;
    uint16_t used, reserved_space;

@@ -276,9 +276,6 @@ static void brw_emit_prim(struct brw_context *brw,
    OUT_BATCH(base_vertex_location);
    ADVANCE_BATCH();
 
-   /* Only used on Sandybridge; harmless to set elsewhere. */
-   brw->batch.need_workaround_flush = true;
-
    if (brw->always_flush_cache) {
       intel_batchbuffer_emit_mi_flush(brw);
    }

@@ -47,10 +47,6 @@ static void upload_drawing_rect(struct brw_context *brw)
 {
    struct gl_context *ctx = &brw->ctx;
 
-   /* 3DSTATE_DRAWING_RECTANGLE is non-pipelined. */
-   if (brw->gen == 6)
-      intel_emit_post_sync_nonzero_flush(brw);
-
    BEGIN_BATCH(4);
    OUT_BATCH(_3DSTATE_DRAWING_RECTANGLE << 16 | (4 - 2));
    OUT_BATCH(0); /* xmin, ymin */
@@ -581,7 +577,6 @@ brw_emit_depth_stencil_hiz(struct brw_context *brw,
     * non-pipelined state that will need the PIPE_CONTROL workaround.
     */
    if (brw->gen == 6) {
-      intel_emit_post_sync_nonzero_flush(brw);
       intel_emit_depth_stall_flushes(brw);
    }
@@ -685,9 +680,6 @@ brw_emit_depth_stencil_hiz(struct brw_context *brw,
     * when HiZ is enabled and the DEPTH_BUFFER_STATE changes.
     */
    if (brw->gen >= 6 || hiz) {
-      if (brw->gen == 6)
-         intel_emit_post_sync_nonzero_flush(brw);
-
       BEGIN_BATCH(2);
       OUT_BATCH(_3DSTATE_CLEAR_PARAMS << 16 |
                 GEN5_DEPTH_CLEAR_VALID |
@@ -720,9 +712,6 @@ static void upload_polygon_stipple(struct brw_context *brw)
    if (!ctx->Polygon.StippleFlag)
       return;
 
-   if (brw->gen == 6)
-      intel_emit_post_sync_nonzero_flush(brw);
-
    BEGIN_BATCH(33);
    OUT_BATCH(_3DSTATE_POLY_STIPPLE_PATTERN << 16 | (33 - 2));
@@ -766,9 +755,6 @@ static void upload_polygon_stipple_offset(struct brw_context *brw)
    if (!ctx->Polygon.StippleFlag)
       return;
 
-   if (brw->gen == 6)
-      intel_emit_post_sync_nonzero_flush(brw);
-
    BEGIN_BATCH(2);
    OUT_BATCH(_3DSTATE_POLY_STIPPLE_OFFSET << 16 | (2-2));
@@ -810,9 +796,6 @@ static void upload_aa_line_parameters(struct brw_context *brw)
    if (brw->gen == 4 && !brw->is_g4x)
       return;
 
-   if (brw->gen == 6)
-      intel_emit_post_sync_nonzero_flush(brw);
-
    BEGIN_BATCH(3);
    OUT_BATCH(_3DSTATE_AA_LINE_PARAMETERS << 16 | (3 - 2));
    /* use legacy aa line coverage computation */
@@ -842,9 +825,6 @@ static void upload_line_stipple(struct brw_context *brw)
    if (!ctx->Line.StippleFlag)
       return;
 
-   if (brw->gen == 6)
-      intel_emit_post_sync_nonzero_flush(brw);
-
    BEGIN_BATCH(3);
    OUT_BATCH(_3DSTATE_LINE_STIPPLE_PATTERN << 16 | (3 - 2));
    OUT_BATCH(ctx->Line.StipplePattern);
@@ -883,10 +863,6 @@ brw_upload_invariant_state(struct brw_context *brw)
 {
    const bool is_965 = brw->gen == 4 && !brw->is_g4x;
 
-   /* 3DSTATE_SIP, 3DSTATE_MULTISAMPLE, etc. are nonpipelined. */
-   if (brw->gen == 6)
-      intel_emit_post_sync_nonzero_flush(brw);
-
    /* Select the 3D pipeline (as opposed to media) */
    const uint32_t _3DSTATE_PIPELINE_SELECT =
       is_965 ? CMD_PIPELINE_SELECT_965 : CMD_PIPELINE_SELECT_GM45;
@@ -954,9 +930,6 @@ static void upload_state_base_address( struct brw_context *brw )
    if (brw->gen >= 6) {
       uint8_t mocs = brw->gen == 7 ? GEN7_MOCS_L3 : 0;
 
-      if (brw->gen == 6)
-         intel_emit_post_sync_nonzero_flush(brw);
-
       BEGIN_BATCH(10);
       OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (10 - 2));
       OUT_BATCH(mocs << 8 | /* General State Memory Object Control State */

@@ -66,10 +66,6 @@ brw_write_timestamp(struct brw_context *brw, drm_intel_bo *query_bo, int idx)
 void
 brw_write_depth_count(struct brw_context *brw, drm_intel_bo *query_bo, int idx)
 {
-   /* Emit Sandybridge workaround flush: */
-   if (brw->gen == 6)
-      intel_emit_post_sync_nonzero_flush(brw);
-
    brw_emit_pipe_control_write(brw,
                                PIPE_CONTROL_WRITE_DEPTH_COUNT
                                | PIPE_CONTROL_DEPTH_STALL,

@@ -344,6 +344,9 @@ brw_upload_initial_gpu_state(struct brw_context *brw)
    if (!brw->hw_ctx)
       return;
 
+   if (brw->gen == 6)
+      intel_emit_post_sync_nonzero_flush(brw);
+
    brw_upload_invariant_state(brw);
 
    if (brw->gen >= 8) {
@@ -612,6 +615,10 @@ void brw_upload_state(struct brw_context *brw)
    if ((state->mesa | state->brw) == 0)
       return;
 
+   /* Emit Sandybridge workaround flushes on every primitive, for safety. */
+   if (brw->gen == 6)
+      intel_emit_post_sync_nonzero_flush(brw);
+
    if (unlikely(INTEL_DEBUG)) {
       /* Debug version which enforces various sanity checks on the
        * state flags which are generated and checked to help ensure
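
With these two hunks, the draw-time flush has a single call site at the
top of brw_upload_state().  An illustrative sketch of the resulting
per-draw ordering on gen6 (not code from this patch; the function name
below is purely for illustration):

   /* Illustrative only: what each gen6 draw now implies. */
   static void
   gen6_draw_one_prim(struct brw_context *brw)
   {
      /* brw_upload_state() opens with the workaround flush, so every
       * non-pipelined packet emitted by the state atoms it runs
       * (3DSTATE_MULTISAMPLE, 3DSTATE_DRAWING_RECTANGLE, ...) is
       * already covered.
       */
      brw_upload_state(brw);

      /* brw_emit_prim() then emits 3DPRIMITIVE; there is no
       * need_workaround_flush bookkeeping left to do.
       */
   }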

@@ -517,17 +517,16 @@ void
 gen6_blorp_emit_vs_disable(struct brw_context *brw,
                            const brw_blorp_params *params)
 {
-   if (brw->gen == 6) {
-      /* From the BSpec, 3D Pipeline > Geometry > Vertex Shader > State,
-       * 3DSTATE_VS, Dword 5.0 "VS Function Enable":
-       *
-       * [DevSNB] A pipeline flush must be programmed prior to a
-       * 3DSTATE_VS command that causes the VS Function Enable to
-       * toggle. Pipeline flush can be executed by sending a PIPE_CONTROL
-       * command with CS stall bit set and a post sync operation.
-       */
-      intel_emit_post_sync_nonzero_flush(brw);
-   }
+   /* From the BSpec, 3D Pipeline > Geometry > Vertex Shader > State,
+    * 3DSTATE_VS, Dword 5.0 "VS Function Enable":
+    *
+    * [DevSNB] A pipeline flush must be programmed prior to a
+    * 3DSTATE_VS command that causes the VS Function Enable to
+    * toggle. Pipeline flush can be executed by sending a PIPE_CONTROL
+    * command with CS stall bit set and a post sync operation.
+    *
+    * We've already done one at the start of the BLORP operation.
+    */
 
    /* Disable the push constant buffers. */
    BEGIN_BATCH(5);
@@ -817,7 +816,6 @@ gen6_blorp_emit_depth_stencil_config(struct brw_context *brw,
    /* 3DSTATE_DEPTH_BUFFER */
    {
-      intel_emit_post_sync_nonzero_flush(brw);
       intel_emit_depth_stall_flushes(brw);
 
       BEGIN_BATCH(7);
@@ -893,7 +891,6 @@ static void
 gen6_blorp_emit_depth_disable(struct brw_context *brw,
                               const brw_blorp_params *params)
 {
-   intel_emit_post_sync_nonzero_flush(brw);
    intel_emit_depth_stall_flushes(brw);
 
    BEGIN_BATCH(7);
@@ -945,9 +942,6 @@ void
 gen6_blorp_emit_drawing_rectangle(struct brw_context *brw,
                                   const brw_blorp_params *params)
 {
-   if (brw->gen == 6)
-      intel_emit_post_sync_nonzero_flush(brw);
-
    BEGIN_BATCH(4);
    OUT_BATCH(_3DSTATE_DRAWING_RECTANGLE << 16 | (4 - 2));
    OUT_BATCH(0);
@@ -997,9 +991,6 @@ gen6_blorp_emit_primitive(struct brw_context *brw,
    OUT_BATCH(0);
    OUT_BATCH(0);
    ADVANCE_BATCH();
-
-   /* Only used on Sandybridge; harmless to set elsewhere. */
-   brw->batch.need_workaround_flush = true;
 }
 
 /**
@@ -1025,7 +1016,7 @@ gen6_blorp_exec(struct brw_context *brw,
    uint32_t prog_offset = params->get_wm_prog(brw, &prog_data);
 
    /* Emit workaround flushes when we switch from drawing to blorping. */
-   brw->batch.need_workaround_flush = true;
+   intel_emit_post_sync_nonzero_flush(brw);
 
    gen6_emit_3dstate_multisample(brw, params->dst.num_samples);
    gen6_emit_3dstate_sample_mask(brw,

@@ -65,11 +65,6 @@ gen6_emit_depth_stencil_hiz(struct brw_context *brw,
     */
    bool enable_hiz_ss = hiz || separate_stencil;
 
-   /* 3DSTATE_DEPTH_BUFFER, 3DSTATE_STENCIL_BUFFER are both
-    * non-pipelined state that will need the PIPE_CONTROL workaround.
-    */
-   intel_emit_post_sync_nonzero_flush(brw);
-
    intel_emit_depth_stall_flushes(brw);
 
    irb = intel_get_renderbuffer(fb, BUFFER_DEPTH);
@@ -238,8 +233,6 @@ gen6_emit_depth_stencil_hiz(struct brw_context *brw,
     * 3DSTATE_CLEAR_PARAMS packet must follow the DEPTH_BUFFER_STATE packet
     * when HiZ is enabled and the DEPTH_BUFFER_STATE changes.
     */
-   intel_emit_post_sync_nonzero_flush(brw);
-
    BEGIN_BATCH(2);
    OUT_BATCH(_3DSTATE_CLEAR_PARAMS << 16 |
              GEN5_DEPTH_CLEAR_VALID |

@@ -132,9 +132,6 @@ gen6_emit_3dstate_multisample(struct brw_context *brw,
       unreachable("Unrecognized num_samples in gen6_emit_3dstate_multisample");
    }
 
-   /* 3DSTATE_MULTISAMPLE is nonpipelined. */
-   intel_emit_post_sync_nonzero_flush(brw);
-
    int len = brw->gen >= 7 ? 4 : 3;
    BEGIN_BATCH(len);
    OUT_BATCH(_3DSTATE_MULTISAMPLE << 16 | (len - 2));

@@ -259,9 +259,6 @@ brw_begin_transform_feedback(struct gl_context *ctx, GLenum mode,
       = _mesa_compute_max_transform_feedback_vertices(xfb_obj,
                                                       linked_xfb_info);
 
-   /* 3DSTATE_GS_SVB_INDEX is non-pipelined. */
-   intel_emit_post_sync_nonzero_flush(brw);
-
    /* Initialize the SVBI 0 register to zero and set the maximum index. */
    BEGIN_BATCH(4);
    OUT_BATCH(_3DSTATE_GS_SVB_INDEX << 16 | (4 - 2));

@@ -171,10 +171,9 @@ upload_vs_state(struct brw_context *brw)
     * flush can be executed by sending a PIPE_CONTROL command with CS
     * stall bit set and a post sync operation.
     *
-    * Although we don't disable the VS during normal drawing, BLORP sometimes
-    * disables it. To be safe, do the flush here just in case.
+    * We've already done such a flush at the start of state upload, so we
+    * don't need to do another one here.
     */
-   intel_emit_post_sync_nonzero_flush(brw);
 
    if (stage_state->push_const_size == 0) {
       /* Disable the push constant buffers. */
@@ -247,7 +246,6 @@ upload_vs_state(struct brw_context *brw)
     * bug reports that led to this workaround, and may be more than
     * what is strictly required to avoid the issue.
     */
-   intel_emit_post_sync_nonzero_flush(brw);
    brw_emit_pipe_control_flush(brw,
                                PIPE_CONTROL_DEPTH_STALL |
                                PIPE_CONTROL_INSTRUCTION_FLUSH |

@@ -51,8 +51,6 @@ intel_batchbuffer_init(struct brw_context *brw)
                                             4096, 4096);
    }
 
-   brw->batch.need_workaround_flush = true;
-
    if (!brw->has_llc) {
       brw->batch.cpu_map = malloc(BATCH_SZ);
       brw->batch.map = brw->batch.cpu_map;
@@ -183,11 +181,6 @@ brw_new_batch(struct brw_context *brw)
    brw->state.dirty.brw |= BRW_NEW_BATCH;
 
-   /* Assume that the last command before the start of our batch was a
-    * primitive, for safety.
-    */
-   brw->batch.need_workaround_flush = true;
-
    brw->state_batch_count = 0;
 
    brw->ib.type = -1;
@@ -647,17 +640,12 @@ gen7_emit_cs_stall_flush(struct brw_context *brw)
 void
 intel_emit_post_sync_nonzero_flush(struct brw_context *brw)
 {
-   if (!brw->batch.need_workaround_flush)
-      return;
-
    brw_emit_pipe_control_flush(brw,
                                PIPE_CONTROL_CS_STALL |
                                PIPE_CONTROL_STALL_AT_SCOREBOARD);
 
    brw_emit_pipe_control_write(brw, PIPE_CONTROL_WRITE_IMMEDIATE,
                                brw->batch.workaround_bo, 0, 0, 0);
-
-   brw->batch.need_workaround_flush = false;
 }
 
 /* Emit a pipelined flush to either flush render and texture cache for
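
A final note on the helper's shape: the pair of packets is not arbitrary.
On Sandybridge, a PIPE_CONTROL with a post-sync operation needs to be
preceded by a CS-stall PIPE_CONTROL, which is why the function always
emits the two in this order (paraphrased from the hardware documentation,
not quoted):

   /* Gen6 PIPE_CONTROL ordering constraint (paraphrased):
    *
    *   1. PIPE_CONTROL with CS Stall and Stall at Scoreboard set.
    *   2. PIPE_CONTROL with a non-zero post-sync operation, e.g. an
    *      immediate write to a scratch BO (brw->batch.workaround_bo).
    *
    * intel_emit_post_sync_nonzero_flush() emits exactly this pair.
    */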