mirror of https://gitlab.freedesktop.org/mesa/mesa
vc4: Add THRSW nodes after each tex sample setup in multithreaded mode.
This is a suboptimal implementation, but Jonas Pfeil found that it was still a massive performance gain.
This commit is contained in:
parent
e3c620e868
commit
67f72c5f5d
|
@ -65,6 +65,23 @@ resize_qreg_array(struct vc4_compile *c,
|
|||
(*regs)[i] = c->undef;
|
||||
}
|
||||
|
||||
/**
 * Emits a QOP_THRSW (thread switch) instruction into the program being
 * compiled, but only when c->fs_threaded is set; otherwise this is a no-op.
 *
 * After emitting the instruction, c->last_thrsw_at_top_level is updated to
 * record whether c->execute.file was QFILE_NULL at that point.
 *
 * Callers visible in this change invoke it between a texture setup
 * (qir_TEX_DIRECT / qir_TEX_S) and the matching qir_TEX_RESULT.
 */
static void
|
||||
ntq_emit_thrsw(struct vc4_compile *c)
|
||||
{
|
||||
/* Nothing to emit unless c->fs_threaded is set. */
if (!c->fs_threaded)
|
||||
return;
|
||||
|
||||
/* Always thread switch after each texture operation for now.
|
||||
*
|
||||
* We could do better by batching a bunch of texture fetches up and
|
||||
* then doing one thread switch and collecting all their results
|
||||
* afterward.
|
||||
*/
|
||||
/* All three register operands passed to qir_inst() are c->undef. */
qir_emit_nondef(c, qir_inst(QOP_THRSW, c->undef,
|
||||
c->undef, c->undef));
|
||||
/* NOTE(review): execute.file == QFILE_NULL presumably means no
 * conditional-execution register is active, i.e. this THRSW was emitted
 * outside of non-uniform control flow -- confirm against how c->execute
 * is maintained elsewhere in the file.
 */
c->last_thrsw_at_top_level = (c->execute.file == QFILE_NULL);
|
||||
}
|
||||
|
||||
static struct qreg
|
||||
indirect_uniform_load(struct vc4_compile *c, nir_intrinsic_instr *intr)
|
||||
{
|
||||
|
@ -105,6 +122,9 @@ indirect_uniform_load(struct vc4_compile *c, nir_intrinsic_instr *intr)
|
|||
|
||||
qir_TEX_DIRECT(c, indirect_offset, qir_uniform(c, QUNIFORM_UBO_ADDR, 0));
|
||||
c->num_texture_samples++;
|
||||
|
||||
ntq_emit_thrsw(c);
|
||||
|
||||
return qir_TEX_RESULT(c);
|
||||
}
|
||||
|
||||
|
@ -363,6 +383,8 @@ ntq_emit_txf(struct vc4_compile *c, nir_tex_instr *instr)
|
|||
|
||||
qir_TEX_DIRECT(c, addr, qir_uniform(c, QUNIFORM_TEXTURE_MSAA_ADDR, unit));
|
||||
|
||||
ntq_emit_thrsw(c);
|
||||
|
||||
struct qreg tex = qir_TEX_RESULT(c);
|
||||
c->num_texture_samples++;
|
||||
|
||||
|
@ -483,6 +505,9 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)
|
|||
qir_TEX_S(c, s, texture_u[next_texture_u++]);
|
||||
|
||||
c->num_texture_samples++;
|
||||
|
||||
ntq_emit_thrsw(c);
|
||||
|
||||
struct qreg tex = qir_TEX_RESULT(c);
|
||||
|
||||
enum pipe_format format = c->key->tex[unit].format;
|
||||
|
|
|
@ -229,6 +229,30 @@ calculate_deps(struct schedule_setup_state *state, struct schedule_node *n)
|
|||
add_write_dep(dir, &state->last_tex_result, n);
|
||||
break;
|
||||
|
||||
case QOP_THRSW:
|
||||
/* After a new THRSW, one must collect all texture samples
|
||||
* queued since the previous THRSW/program start. For now, we
|
||||
* have one THRSW in between each texture setup and its
|
||||
* results collection as our input, and we just make sure that
|
||||
* that ordering is maintained.
|
||||
*/
|
||||
add_write_dep(dir, &state->last_tex_coord, n);
|
||||
add_write_dep(dir, &state->last_tex_result, n);
|
||||
|
||||
/* accumulators and flags are lost across thread switches. */
|
||||
add_write_dep(dir, &state->last_sf, n);
|
||||
|
||||
/* Setup, like the varyings, will need to be drained before we
|
||||
* thread switch.
|
||||
*/
|
||||
add_write_dep(dir, &state->last_vary_read, n);
|
||||
|
||||
/* The TLB-locking operations have to stay after the last
|
||||
* thread switch.
|
||||
*/
|
||||
add_write_dep(dir, &state->last_tlb, n);
|
||||
break;
|
||||
|
||||
case QOP_TLB_COLOR_READ:
|
||||
case QOP_MS_MASK:
|
||||
add_write_dep(dir, &state->last_tlb, n);
|
||||
|
|
Loading…
Reference in New Issue