i965/fs: Improve performance of varying-index uniform loads on IVB.
Like we have done for the VS and for constant-index uniform loads, we use the sampler engine to get caching in front of the L3 to avoid tickling the IVB L3 bug. This is also a bit of a functional change, as we're now loading a vec4 instead of a single dword, though we're not taking advantage of the other 3 components of the vec4 (yet). With the driver hacked to always take the varying-index path for all uniforms, this improves performance of my old GLSL demo by 315% +/- 2% (n=4). This is a major fix for some blur shaders in compositors from the varying-index uniforms support I introduced in 9.1. v2: Move old offset computation into the pre-gen7 path. Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=61554 NOTE: This is a candidate for the 9.1 branch.
This commit is contained in:
parent
bc0e1591f6
commit
dca5fc1435
|
@ -235,14 +235,33 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
|
|||
exec_list instructions;
|
||||
fs_inst *inst;
|
||||
|
||||
fs_reg offset = fs_reg(this, glsl_type::uint_type);
|
||||
instructions.push_tail(ADD(offset, varying_offset, fs_reg(const_offset)));
|
||||
|
||||
if (intel->gen >= 7) {
|
||||
/* We have our constant surface use a pitch of 4 bytes, so our index can
|
||||
* be any component of a vector, and then we load 4 contiguous
|
||||
* components starting from that.
|
||||
*
|
||||
* We break down the const_offset to a portion added to the variable
|
||||
* offset and a portion done using reg_offset, which means that if you
|
||||
* have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
|
||||
* a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
|
||||
* CSE can later notice that those loads are all the same and eliminate
|
||||
* the redundant ones.
|
||||
*/
|
||||
fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
|
||||
instructions.push_tail(ADD(vec4_offset,
|
||||
varying_offset, const_offset & ~3));
|
||||
|
||||
fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4), dst.type);
|
||||
inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
|
||||
dst, surf_index, offset);
|
||||
vec4_result, surf_index, vec4_offset);
|
||||
instructions.push_tail(inst);
|
||||
|
||||
vec4_result.reg_offset += const_offset & 3;
|
||||
instructions.push_tail(MOV(dst, vec4_result));
|
||||
} else {
|
||||
fs_reg offset = fs_reg(this, glsl_type::uint_type);
|
||||
instructions.push_tail(ADD(offset, varying_offset, fs_reg(const_offset)));
|
||||
|
||||
int base_mrf = 13;
|
||||
bool header_present = true;
|
||||
|
||||
|
@ -313,7 +332,7 @@ fs_inst::equals(fs_inst *inst)
|
|||
int
|
||||
fs_inst::regs_written()
|
||||
{
|
||||
if (is_tex())
|
||||
if (is_tex() || opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7)
|
||||
return 4;
|
||||
|
||||
/* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
|
||||
|
|
|
@ -734,28 +734,29 @@ fs_generator::generate_varying_pull_constant_load_gen7(fs_inst *inst,
|
|||
index.type == BRW_REGISTER_TYPE_UD);
|
||||
uint32_t surf_index = index.dw1.ud;
|
||||
|
||||
uint32_t msg_control, rlen, mlen;
|
||||
uint32_t simd_mode, rlen, mlen;
|
||||
if (dispatch_width == 16) {
|
||||
msg_control = BRW_DATAPORT_DWORD_SCATTERED_BLOCK_16DWORDS;
|
||||
mlen = rlen = 2;
|
||||
mlen = 2;
|
||||
rlen = 8;
|
||||
simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
|
||||
} else {
|
||||
msg_control = BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS;
|
||||
mlen = rlen = 1;
|
||||
mlen = 1;
|
||||
rlen = 4;
|
||||
simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
|
||||
}
|
||||
|
||||
struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
|
||||
brw_set_dest(p, send, dst);
|
||||
brw_set_src0(p, send, offset);
|
||||
if (intel->gen < 6)
|
||||
send->header.destreg__conditionalmod = inst->base_mrf;
|
||||
brw_set_dp_read_message(p, send,
|
||||
brw_set_sampler_message(p, send,
|
||||
surf_index,
|
||||
msg_control,
|
||||
GEN7_DATAPORT_DC_DWORD_SCATTERED_READ,
|
||||
BRW_DATAPORT_READ_TARGET_DATA_CACHE,
|
||||
0, /* LD message ignores sampler unit */
|
||||
GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
|
||||
rlen,
|
||||
mlen,
|
||||
inst->header_present,
|
||||
rlen);
|
||||
false, /* no header */
|
||||
simd_mode,
|
||||
0);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
Loading…
Reference in New Issue