diff --git a/src/gallium/drivers/vc4/Makefile.sources b/src/gallium/drivers/vc4/Makefile.sources index 76e46f54089..76e52ce142d 100644 --- a/src/gallium/drivers/vc4/Makefile.sources +++ b/src/gallium/drivers/vc4/Makefile.sources @@ -31,6 +31,7 @@ C_SOURCES := \ vc4_opt_vpm.c \ vc4_program.c \ vc4_qir.c \ + vc4_qir_emit_uniform_stream_resets.c \ vc4_qir_live_variables.c \ vc4_qir_lower_uniforms.c \ vc4_qir_schedule.c \ diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c index 465e052053e..521f971272a 100644 --- a/src/gallium/drivers/vc4/vc4_program.c +++ b/src/gallium/drivers/vc4/vc4_program.c @@ -2114,6 +2114,7 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage, qir_lower_uniforms(c); qir_schedule_instructions(c); + qir_emit_uniform_stream_resets(c); if (vc4_debug & VC4_DEBUG_QIR) { fprintf(stderr, "%s prog %d/%d QIR:\n", diff --git a/src/gallium/drivers/vc4/vc4_qir.c b/src/gallium/drivers/vc4/vc4_qir.c index e1d663dd3a7..9ff15611ef9 100644 --- a/src/gallium/drivers/vc4/vc4_qir.c +++ b/src/gallium/drivers/vc4/vc4_qir.c @@ -85,6 +85,7 @@ static const struct qir_op_info qir_op_info[] = { [QOP_LOAD_IMM] = { "load_imm", 0, 1 }, [QOP_BRANCH] = { "branch", 0, 0, true }, + [QOP_UNIFORMS_RESET] = { "uniforms_reset", 0, 2, true }, }; static const char * diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h index e7ddfaa1fcb..88eda225d80 100644 --- a/src/gallium/drivers/vc4/vc4_qir.h +++ b/src/gallium/drivers/vc4/vc4_qir.h @@ -162,6 +162,12 @@ enum qop { * that block->successor[1] may be unset if the condition is ALWAYS. */ QOP_BRANCH, + + /* Emits an ADD from src[0] to src[1], where src[0] must be a + * QOP_LOAD_IMM result and src[1] is a QUNIFORM_UNIFORMS_ADDRESS, + * required by the kernel as part of its branch validation. 
+ */ + QOP_UNIFORMS_RESET, }; struct queued_qpu_inst { @@ -260,6 +266,11 @@ enum quniform_contents { QUNIFORM_ALPHA_REF, QUNIFORM_SAMPLE_MASK, + + /* Placeholder uniform that will be updated by the kernel when used by + * an instruction writing to QPU_W_UNIFORMS_ADDRESS. + */ + QUNIFORM_UNIFORMS_ADDRESS, }; struct vc4_varying_slot { @@ -521,6 +532,7 @@ struct qreg qir_uniform(struct vc4_compile *c, uint32_t data); void qir_schedule_instructions(struct vc4_compile *c); void qir_reorder_uniforms(struct vc4_compile *c); +void qir_emit_uniform_stream_resets(struct vc4_compile *c); struct qreg qir_emit_def(struct vc4_compile *c, struct qinst *inst); struct qinst *qir_emit_nondef(struct vc4_compile *c, struct qinst *inst); diff --git a/src/gallium/drivers/vc4/vc4_qir_emit_uniform_stream_resets.c b/src/gallium/drivers/vc4/vc4_qir_emit_uniform_stream_resets.c new file mode 100644 index 00000000000..3fd6358e3d3 --- /dev/null +++ b/src/gallium/drivers/vc4/vc4_qir_emit_uniform_stream_resets.c @@ -0,0 +1,101 @@ +/* + * Copyright © 2014 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** + * @file vc4_qir_emit_uniform_stream_resets.c + * + * Adds updates to the uniform stream address at the start of each basic block + * that uses uniforms. + * + * This will be done just before the translation to QPU instructions, once we + * have performed optimization and know how many uniforms are used in each block. + */ + +#include "vc4_qir.h" +#include "util/hash_table.h" +#include "util/u_math.h" + +static bool +inst_reads_a_uniform(struct qinst *inst) +{ + if (qir_is_tex(inst)) + return true; + + for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { + if (inst->src[i].file == QFILE_UNIF) + return true; + } + + return false; +} + +static bool +block_reads_any_uniform(struct qblock *block) +{ + qir_for_each_inst(inst, block) { + if (inst_reads_a_uniform(inst)) + return true; + } + + return false; +} + +void +qir_emit_uniform_stream_resets(struct vc4_compile *c) +{ + uint32_t uniform_count = 0; + + qir_for_each_block(block, c) { + if (block != qir_entry_block(c) && + (block_reads_any_uniform(block) || + block == qir_exit_block(c))) { + struct qreg t = qir_get_temp(c); + struct qreg uni_addr = + qir_uniform(c, QUNIFORM_UNIFORMS_ADDRESS, 0); + + /* Load the offset of the next uniform in the stream + * after the one we're generating here. + */ + struct qinst *load_imm = + qir_inst(QOP_LOAD_IMM, + t, + qir_reg(QFILE_LOAD_IMM, + (uniform_count + 1) * 4), + c->undef); + struct qinst *add = + qir_inst(QOP_UNIFORMS_RESET, c->undef, + t, uni_addr); + + /* Pushes to the top of the block, so in reverse + * order. 
+ */ + list_add(&add->link, &block->instructions); + list_add(&load_imm->link, &block->instructions); + } + + qir_for_each_inst(inst, block) { + if (inst_reads_a_uniform(inst)) + uniform_count++; + } + } +} diff --git a/src/gallium/drivers/vc4/vc4_qir_schedule.c b/src/gallium/drivers/vc4/vc4_qir_schedule.c index 903c6108824..69bd0dd623e 100644 --- a/src/gallium/drivers/vc4/vc4_qir_schedule.c +++ b/src/gallium/drivers/vc4/vc4_qir_schedule.c @@ -138,6 +138,7 @@ struct schedule_setup_state { struct schedule_node *last_tex_coord; struct schedule_node *last_tex_result; struct schedule_node *last_tlb; + struct schedule_node *last_uniforms_reset; enum direction dir; /** @@ -280,6 +281,16 @@ calculate_forward_deps(struct vc4_compile *c, void *mem_ctx, calculate_deps(&state, n); + for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { + switch (inst->src[i].file) { + case QFILE_UNIF: + add_dep(state.dir, state.last_uniforms_reset, n); + break; + default: + break; + } + } + switch (inst->op) { case QOP_TEX_S: case QOP_TEX_T: @@ -324,6 +335,11 @@ calculate_forward_deps(struct vc4_compile *c, void *mem_ctx, memset(&state.tex_fifo[state.tex_fifo_pos], 0, sizeof(state.tex_fifo[0])); break; + + case QOP_UNIFORMS_RESET: + add_write_dep(state.dir, &state.last_uniforms_reset, n); + break; + default: assert(!qir_is_tex(inst)); break; diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c index 9001643507e..6a10e1b68de 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_emit.c +++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c @@ -427,6 +427,14 @@ vc4_generate_code_block(struct vc4_compile *c, handled_qinst_cond = true; break; + case QOP_UNIFORMS_RESET: + fixup_raddr_conflict(block, dst, &src[0], &src[1], + qinst, &unpack); + + queue(block, qpu_a_ADD(qpu_ra(QPU_W_UNIFORMS_ADDRESS), + src[0], src[1])); + break; + default: assert(qinst->op < ARRAY_SIZE(translate)); assert(translate[qinst->op].op != 0); /* NOPs */ diff --git 
a/src/gallium/drivers/vc4/vc4_qpu_schedule.c b/src/gallium/drivers/vc4/vc4_qpu_schedule.c index a55b0351402..1caee51a581 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_schedule.c +++ b/src/gallium/drivers/vc4/vc4_qpu_schedule.c @@ -92,6 +92,7 @@ struct schedule_state { struct schedule_node *last_tmu_write; struct schedule_node *last_tlb; struct schedule_node *last_vpm; + struct schedule_node *last_uniforms_reset; enum direction dir; /* Estimated cycle when the current instruction would start. */ uint32_t time; @@ -184,6 +185,9 @@ process_raddr_deps(struct schedule_state *state, struct schedule_node *n, break; case QPU_R_UNIF: + add_read_dep(state, state->last_uniforms_reset, n); + break; + case QPU_R_NOP: case QPU_R_ELEM_QPU: case QPU_R_XY_PIXEL_COORD: @@ -259,6 +263,7 @@ process_waddr_deps(struct schedule_state *state, struct schedule_node *n, } } else if (is_tmu_write(waddr)) { add_write_dep(state, &state->last_tmu_write, n); + add_read_dep(state, state->last_uniforms_reset, n); } else if (qpu_waddr_is_tlb(waddr) || waddr == QPU_W_MS_FLAGS) { add_write_dep(state, &state->last_tlb, n); @@ -305,6 +310,10 @@ process_waddr_deps(struct schedule_state *state, struct schedule_node *n, add_write_dep(state, &state->last_tlb, n); break; + case QPU_W_UNIFORMS_ADDRESS: + add_write_dep(state, &state->last_uniforms_reset, n); + break; + case QPU_W_NOP: break; @@ -442,6 +451,7 @@ calculate_reverse_deps(struct vc4_compile *c, struct list_head *schedule_list) struct choose_scoreboard { int tick; int last_sfu_write_tick; + int last_uniforms_reset_tick; uint32_t last_waddr_a, last_waddr_b; }; @@ -476,6 +486,11 @@ reads_too_soon_after_write(struct choose_scoreboard *scoreboard, uint64_t inst) } } + if (reads_uniform(inst) && + scoreboard->tick - scoreboard->last_uniforms_reset_tick <= 2) { + return true; + } + return false; } @@ -614,6 +629,11 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, (waddr_mul >= QPU_W_SFU_RECIP && waddr_mul <= QPU_W_SFU_LOG)) { 
scoreboard->last_sfu_write_tick = scoreboard->tick; } + + if (waddr_add == QPU_W_UNIFORMS_ADDRESS || + waddr_mul == QPU_W_UNIFORMS_ADDRESS) { + scoreboard->last_uniforms_reset_tick = scoreboard->tick; + } } static void @@ -971,6 +991,7 @@ qpu_schedule_instructions(struct vc4_compile *c) scoreboard.last_waddr_a = ~0; scoreboard.last_waddr_b = ~0; scoreboard.last_sfu_write_tick = -10; + scoreboard.last_uniforms_reset_tick = -10; if (debug) { fprintf(stderr, "Pre-schedule instructions\n"); diff --git a/src/gallium/drivers/vc4/vc4_uniforms.c b/src/gallium/drivers/vc4/vc4_uniforms.c index 4715a7fffd5..ee21771dd89 100644 --- a/src/gallium/drivers/vc4/vc4_uniforms.c +++ b/src/gallium/drivers/vc4/vc4_uniforms.c @@ -324,6 +324,11 @@ vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader, case QUNIFORM_SAMPLE_MASK: cl_aligned_u32(&uniforms, vc4->sample_mask); break; + + case QUNIFORM_UNIFORMS_ADDRESS: + /* This will be filled in by the kernel. */ + cl_aligned_u32(&uniforms, 0xd0d0d0d0); + break; } #if 0 uint32_t written_val = *((uint32_t *)uniforms - 1); @@ -345,6 +350,7 @@ vc4_set_shader_uniform_dirty_flags(struct vc4_compiled_shader *shader) for (int i = 0; i < shader->uniforms.count; i++) { switch (shader->uniforms.contents[i]) { case QUNIFORM_CONSTANT: + case QUNIFORM_UNIFORMS_ADDRESS: break; case QUNIFORM_UNIFORM: case QUNIFORM_UBO_ADDR: