i965: Accurately bail on SIMD16 compiles.
Ideally, we'd like to never even attempt the SIMD16 compile if we could know ahead of time that it won't succeed---it's purely a waste of time. This is especially important for state-based recompiles, which happen at draw time. The fragment shader compiler has a number of checks like: if (dispatch_width == 16) fail("...some reason..."); This patch introduces a new no16() function which replaces the above pattern. In the SIMD8 compile, it sets a "SIMD16 will never work" flag. Then, brw_wm_fs_emit can check that flag, skip the SIMD16 compile, and issue a helpful performance warning if INTEL_DEBUG=perf is set. (In SIMD16 mode, no16() calls fail(), for safety's sake.) The great part is that this is not a heuristic---if the flag is set, we know with 100% certainty that the SIMD16 compile would fail. (It might fail anyway if we run out of registers, but it's always worth trying.) v2: Fix missing va_end in early-return case (caught by Ilia Mirkin). Signed-off-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Chris Forbes <chrisf@ijw.co.nz> [v1] Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> [v1] Reviewed-by: Eric Anholt <eric@anholt.net>
This commit is contained in:
parent
b207e88b25
commit
de7ad2c88f
|
@ -647,9 +647,8 @@ fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
|
|||
}
|
||||
|
||||
void
|
||||
fs_visitor::fail(const char *format, ...)
|
||||
fs_visitor::vfail(const char *format, va_list va)
|
||||
{
|
||||
va_list va;
|
||||
char *msg;
|
||||
|
||||
if (failed)
|
||||
|
@ -657,9 +656,7 @@ fs_visitor::fail(const char *format, ...)
|
|||
|
||||
failed = true;
|
||||
|
||||
va_start(va, format);
|
||||
msg = ralloc_vasprintf(mem_ctx, format, va);
|
||||
va_end(va);
|
||||
msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
|
||||
|
||||
this->fail_msg = msg;
|
||||
|
@ -669,6 +666,48 @@ fs_visitor::fail(const char *format, ...)
|
|||
}
|
||||
}
|
||||
|
||||
void
|
||||
fs_visitor::fail(const char *format, ...)
|
||||
{
|
||||
va_list va;
|
||||
|
||||
va_start(va, format);
|
||||
vfail(format, va);
|
||||
va_end(va);
|
||||
}
|
||||
|
||||
/**
|
||||
* Mark this program as impossible to compile in SIMD16 mode.
|
||||
*
|
||||
* During the SIMD8 compile (which happens first), we can detect and flag
|
||||
* things that are unsupported in SIMD16 mode, so the compiler can skip
|
||||
* the SIMD16 compile altogether.
|
||||
*
|
||||
* During a SIMD16 compile (if one happens anyway), this just calls fail().
|
||||
*/
|
||||
void
|
||||
fs_visitor::no16(const char *format, ...)
|
||||
{
|
||||
va_list va;
|
||||
|
||||
va_start(va, format);
|
||||
|
||||
if (dispatch_width == 16) {
|
||||
vfail(format, va);
|
||||
} else {
|
||||
simd16_unsupported = true;
|
||||
|
||||
if (INTEL_DEBUG & DEBUG_PERF) {
|
||||
if (no16_msg)
|
||||
ralloc_vasprintf_append(&no16_msg, format, va);
|
||||
else
|
||||
no16_msg = ralloc_vasprintf(mem_ctx, format, va);
|
||||
}
|
||||
}
|
||||
|
||||
va_end(va);
|
||||
}
|
||||
|
||||
fs_inst *
|
||||
fs_visitor::emit(enum opcode opcode)
|
||||
{
|
||||
|
@ -1356,8 +1395,8 @@ fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
|
|||
switch (opcode) {
|
||||
case SHADER_OPCODE_INT_QUOTIENT:
|
||||
case SHADER_OPCODE_INT_REMAINDER:
|
||||
if (brw->gen >= 7 && dispatch_width == 16)
|
||||
fail("SIMD16 INTDIV unsupported\n");
|
||||
if (brw->gen >= 7)
|
||||
no16("SIMD16 INTDIV unsupported\n");
|
||||
break;
|
||||
case SHADER_OPCODE_POW:
|
||||
break;
|
||||
|
@ -3505,13 +3544,18 @@ brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
|
|||
exec_list *simd16_instructions = NULL;
|
||||
fs_visitor v2(brw, c, prog, fp, 16);
|
||||
if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
|
||||
/* Try a SIMD16 compile */
|
||||
v2.import_uniforms(&v);
|
||||
if (!v2.run()) {
|
||||
perf_debug("SIMD16 shader failed to compile, falling back to "
|
||||
"SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
|
||||
if (!v.simd16_unsupported) {
|
||||
/* Try a SIMD16 compile */
|
||||
v2.import_uniforms(&v);
|
||||
if (!v2.run()) {
|
||||
perf_debug("SIMD16 shader failed to compile, falling back to "
|
||||
"SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
|
||||
} else {
|
||||
simd16_instructions = &v2.instructions;
|
||||
}
|
||||
} else {
|
||||
simd16_instructions = &v2.instructions;
|
||||
perf_debug("SIMD16 shader unsupported, falling back to "
|
||||
"SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -381,7 +381,9 @@ public:
|
|||
void insert_gen4_send_dependency_workarounds();
|
||||
void insert_gen4_pre_send_dependency_workarounds(fs_inst *inst);
|
||||
void insert_gen4_post_send_dependency_workarounds(fs_inst *inst);
|
||||
void vfail(const char *msg, va_list args);
|
||||
void fail(const char *msg, ...);
|
||||
void no16(const char *msg, ...);
|
||||
void lower_uniform_pull_constant_loads();
|
||||
|
||||
void push_force_uncompressed();
|
||||
|
@ -541,6 +543,8 @@ public:
|
|||
|
||||
bool failed;
|
||||
char *fail_msg;
|
||||
bool simd16_unsupported;
|
||||
char *no16_msg;
|
||||
|
||||
/* Result of last visit() method. */
|
||||
fs_reg result;
|
||||
|
|
|
@ -462,8 +462,8 @@ fs_visitor::visit(ir_expression *ir)
|
|||
* FINISHME: Emit just the MUL if we know an operand is small
|
||||
* enough.
|
||||
*/
|
||||
if (brw->gen >= 7 && dispatch_width == 16)
|
||||
fail("SIMD16 explicit accumulator operands unsupported\n");
|
||||
if (brw->gen >= 7)
|
||||
no16("SIMD16 explicit accumulator operands unsupported\n");
|
||||
|
||||
struct brw_reg acc = retype(brw_acc_reg(), this->result.type);
|
||||
|
||||
|
@ -475,8 +475,8 @@ fs_visitor::visit(ir_expression *ir)
|
|||
}
|
||||
break;
|
||||
case ir_binop_imul_high: {
|
||||
if (brw->gen >= 7 && dispatch_width == 16)
|
||||
fail("SIMD16 explicit accumulator operands unsupported\n");
|
||||
if (brw->gen >= 7)
|
||||
no16("SIMD16 explicit accumulator operands unsupported\n");
|
||||
|
||||
struct brw_reg acc = retype(brw_acc_reg(), this->result.type);
|
||||
|
||||
|
@ -490,8 +490,8 @@ fs_visitor::visit(ir_expression *ir)
|
|||
emit_math(SHADER_OPCODE_INT_QUOTIENT, this->result, op[0], op[1]);
|
||||
break;
|
||||
case ir_binop_carry: {
|
||||
if (brw->gen >= 7 && dispatch_width == 16)
|
||||
fail("SIMD16 explicit accumulator operands unsupported\n");
|
||||
if (brw->gen >= 7)
|
||||
no16("SIMD16 explicit accumulator operands unsupported\n");
|
||||
|
||||
struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
|
||||
|
||||
|
@ -500,8 +500,8 @@ fs_visitor::visit(ir_expression *ir)
|
|||
break;
|
||||
}
|
||||
case ir_binop_borrow: {
|
||||
if (brw->gen >= 7 && dispatch_width == 16)
|
||||
fail("SIMD16 explicit accumulator operands unsupported\n");
|
||||
if (brw->gen >= 7)
|
||||
no16("SIMD16 explicit accumulator operands unsupported\n");
|
||||
|
||||
struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
|
||||
|
||||
|
@ -1290,8 +1290,7 @@ fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
|
|||
next.reg_offset++;
|
||||
break;
|
||||
case ir_txd: {
|
||||
if (dispatch_width == 16)
|
||||
fail("Gen7 does not support sample_d/sample_d_c in SIMD16 mode.");
|
||||
no16("Gen7 does not support sample_d/sample_d_c in SIMD16 mode.");
|
||||
|
||||
/* Load dPdx and the coordinate together:
|
||||
* [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
|
||||
|
@ -1364,8 +1363,8 @@ fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
|
|||
break;
|
||||
case ir_tg4:
|
||||
if (has_nonconstant_offset) {
|
||||
if (ir->shadow_comparitor && dispatch_width == 16)
|
||||
fail("Gen7 does not support gather4_po_c in SIMD16 mode.");
|
||||
if (ir->shadow_comparitor)
|
||||
no16("Gen7 does not support gather4_po_c in SIMD16 mode.");
|
||||
|
||||
/* More crazy intermixing */
|
||||
ir->offset->accept(this);
|
||||
|
@ -1464,8 +1463,8 @@ fs_visitor::rescale_texcoord(ir_texture *ir, fs_reg coordinate,
|
|||
0
|
||||
};
|
||||
|
||||
no16("rectangle scale uniform setup not supported on SIMD16\n");
|
||||
if (dispatch_width == 16) {
|
||||
fail("rectangle scale uniform setup not supported on SIMD16\n");
|
||||
return coordinate;
|
||||
}
|
||||
|
||||
|
@ -2183,8 +2182,8 @@ fs_visitor::try_replace_with_sel()
|
|||
void
|
||||
fs_visitor::visit(ir_if *ir)
|
||||
{
|
||||
if (brw->gen < 6 && dispatch_width == 16) {
|
||||
fail("Can't support (non-uniform) control flow on SIMD16\n");
|
||||
if (brw->gen < 6) {
|
||||
no16("Can't support (non-uniform) control flow on SIMD16\n");
|
||||
}
|
||||
|
||||
/* Don't point the annotation at the if statement, because then it plus
|
||||
|
@ -2226,8 +2225,8 @@ fs_visitor::visit(ir_if *ir)
|
|||
void
|
||||
fs_visitor::visit(ir_loop *ir)
|
||||
{
|
||||
if (brw->gen < 6 && dispatch_width == 16) {
|
||||
fail("Can't support (non-uniform) control flow on SIMD16\n");
|
||||
if (brw->gen < 6) {
|
||||
no16("Can't support (non-uniform) control flow on SIMD16\n");
|
||||
}
|
||||
|
||||
this->base_ir = NULL;
|
||||
|
@ -2725,9 +2724,10 @@ fs_visitor::emit_fb_writes()
|
|||
bool do_dual_src = this->dual_src_output.file != BAD_FILE;
|
||||
bool src0_alpha_to_render_target = false;
|
||||
|
||||
if (dispatch_width == 16 && do_dual_src) {
|
||||
fail("GL_ARB_blend_func_extended not yet supported in SIMD16.");
|
||||
do_dual_src = false;
|
||||
if (do_dual_src) {
|
||||
no16("GL_ARB_blend_func_extended not yet supported in SIMD16.");
|
||||
if (dispatch_width == 16)
|
||||
do_dual_src = false;
|
||||
}
|
||||
|
||||
/* From the Sandy Bridge PRM, volume 4, page 198:
|
||||
|
@ -2778,13 +2778,13 @@ fs_visitor::emit_fb_writes()
|
|||
nr += reg_width;
|
||||
|
||||
if (c->source_depth_to_render_target) {
|
||||
if (brw->gen == 6 && dispatch_width == 16) {
|
||||
if (brw->gen == 6) {
|
||||
/* For outputting oDepth on gen6, SIMD8 writes have to be
|
||||
* used. This would require SIMD8 moves of each half to
|
||||
* message regs, kind of like pre-gen5 SIMD16 FB writes.
|
||||
* Just bail on doing so for now.
|
||||
*/
|
||||
fail("Missing support for simd16 depth writes on gen6\n");
|
||||
no16("Missing support for simd16 depth writes on gen6\n");
|
||||
}
|
||||
|
||||
if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
|
||||
|
|
Loading…
Reference in New Issue