diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index 2bf591fbac5..2ed84146735 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -206,8 +206,10 @@ v3d_general_tmu_op(nir_intrinsic_instr *instr)
  * Checks if pipelining a new TMU operation requiring 'components' LDTMUs and
  * 'writes' TMU register writes would overflow any of the TMU fifos.
  */
-static bool
-tmu_fifo_overflow(struct v3d_compile *c, uint32_t components, uint32_t writes)
+bool
+ntq_tmu_fifo_overflow(struct v3d_compile *c,
+                      uint32_t components,
+                      uint32_t writes)
 {
         if (c->tmu.input_fifo_size + writes > 16 / c->threads)
                 return true;
@@ -236,13 +238,15 @@ ntq_flush_tmu(struct v3d_compile *c)

         bool emitted_tmuwt = false;
         for (int i = 0; i < c->tmu.flush_count; i++) {
-                if (c->tmu.flush[i].num_components > 0) {
+                if (c->tmu.flush[i].component_mask > 0) {
                         nir_dest *dest = c->tmu.flush[i].dest;
                         assert(dest);

-                        for (int j = 0; j < c->tmu.flush[i].num_components; j++) {
-                                ntq_store_dest(c, dest, j,
-                                               vir_MOV(c, vir_LDTMU(c)));
+                        for (int j = 0; j < 4; j++) {
+                                if (c->tmu.flush[i].component_mask & (1 << j)) {
+                                        ntq_store_dest(c, dest, j,
+                                                       vir_MOV(c, vir_LDTMU(c)));
+                                }
                         }
                 } else if (!emitted_tmuwt) {
                         vir_TMUWT(c);
@@ -262,13 +266,14 @@ ntq_flush_tmu(struct v3d_compile *c)
  * is responsible for ensuring that doing this doesn't overflow the TMU fifos,
  * and more specifically, the output fifo, since that can't stall.
  */
-static void
+void
 ntq_add_pending_tmu_flush(struct v3d_compile *c,
                           nir_dest *dest,
-                          uint32_t num_components,
+                          uint32_t component_mask,
                           uint32_t tmu_writes)
 {
-        assert(!tmu_fifo_overflow(c, num_components, tmu_writes));
+        const uint32_t num_components = util_bitcount(component_mask);
+        assert(!ntq_tmu_fifo_overflow(c, num_components, tmu_writes));

         c->tmu.input_fifo_size += tmu_writes;
         if (num_components > 0) {
@@ -279,7 +284,7 @@ ntq_add_pending_tmu_flush(struct v3d_compile *c,
         }

         c->tmu.flush[c->tmu.flush_count].dest = dest;
-        c->tmu.flush[c->tmu.flush_count].num_components = num_components;
+        c->tmu.flush[c->tmu.flush_count].component_mask = component_mask;
         c->tmu.flush_count++;
 }

@@ -615,15 +620,17 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
                         /* If pipelining this TMU operation would
                          * overflow TMU fifos, we need to flush.
                          */
-                        if (tmu_fifo_overflow(c, dest_components, tmu_writes))
+                        if (ntq_tmu_fifo_overflow(c, dest_components, tmu_writes))
                                 ntq_flush_tmu(c);
                 } else {
                         /* Delay emission of the thread switch and
                          * LDTMU/TMUWT until we really need to do it to
                          * improve pipelining.
                          */
+                        const uint32_t component_mask =
+                                (1 << dest_components) - 1;
                         ntq_add_pending_tmu_flush(c, &instr->dest,
-                                                  dest_components,
+                                                  component_mask,
                                                   tmu_writes);
                 }
         }
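The overflow check is plain fifo accounting: the 16-slot input fifo is shared across all threads, so each shader gets a budget of 16 / threads register writes. Only the input-fifo half of the check is visible in the hunk above; the sketch below is a self-contained illustration of that bookkeeping, with stand-in names and an assumed output-fifo rule of the same shape:

#include <stdbool.h>
#include <stdint.h>

/* Standalone sketch of the fifo bookkeeping behind ntq_tmu_fifo_overflow().
 * The struct and the output-fifo rule are assumptions for illustration; the
 * hunk above only shows the input-fifo half of the real check.
 */
struct tmu_state {
        uint32_t input_fifo_size;  /* register writes already in flight */
        uint32_t output_fifo_size; /* result words already in flight */
        uint32_t threads;          /* 1, 2 or 4 */
};

static bool
would_overflow(const struct tmu_state *tmu, uint32_t components,
               uint32_t writes)
{
        /* The 16 input-fifo slots are split evenly across threads. */
        if (tmu->input_fifo_size + writes > 16 / tmu->threads)
                return true;
        /* Assume the output fifo is budgeted the same way. */
        return tmu->output_fifo_size + components > 16 / tmu->threads;
}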
diff --git a/src/broadcom/compiler/v3d33_tex.c b/src/broadcom/compiler/v3d33_tex.c
index 386453289c3..b933635f6fe 100644
--- a/src/broadcom/compiler/v3d33_tex.c
+++ b/src/broadcom/compiler/v3d33_tex.c
@@ -33,7 +33,11 @@ void
 v3d33_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
 {
-        /* FIXME: allow tex pipelining */
+        /* FIXME: We don't bother implementing pipelining for texture reads
+         * on pre-4.x hardware. It should be straightforward to do, but we
+         * are not really testing or even targeting this hardware at
+         * present.
+         */
         ntq_flush_tmu(c);

         unsigned unit = instr->texture_index;
diff --git a/src/broadcom/compiler/v3d40_tex.c b/src/broadcom/compiler/v3d40_tex.c
index f999c8b8619..73a1d539aab 100644
--- a/src/broadcom/compiler/v3d40_tex.c
+++ b/src/broadcom/compiler/v3d40_tex.c
@@ -39,7 +39,8 @@ vir_TMU_WRITE(struct v3d_compile *c, enum v3d_qpu_waddr waddr, struct qreg val,
  */
         vir_MOV_dest(c, vir_reg(QFILE_MAGIC, waddr), val);

-        (*tmu_writes)++;
+        if (tmu_writes)
+                (*tmu_writes)++;
 }

 static void
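Making 'tmu_writes' optional turns the helper into dual-mode code: callers pass a counter for a dry-run pass that only tallies writes, or NULL for an emit pass. A minimal runnable toy of that pattern, with stand-in names (emit_write() plays the role of vir_TMU_WRITE()):

#include <stdio.h>

/* emit_write() stands in for vir_TMU_WRITE(); handle_src() mirrors the
 * count-vs-emit split used by the texture source handlers below.
 */
static void
emit_write(int value)
{
        printf("TMU write <- %d\n", value);
}

static void
handle_src(int value, unsigned *count)
{
        if (count)
                (*count)++;        /* dry run: only tally the write */
        else
                emit_write(value); /* emit pass: actually perform it */
}

int main(void)
{
        unsigned writes = 0;
        handle_src(42, &writes); /* counts, emits nothing */
        handle_src(42, NULL);    /* emits */
        printf("operation needs %u write(s)\n", writes);
        return 0;
}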
@@ -58,125 +59,185 @@ static const struct V3D41_TMU_CONFIG_PARAMETER_2 p2_unpacked_default = {
         .op = V3D_TMU_OP_REGULAR,
 };

+/**
+ * If 'tmu_writes' is not NULL, this only counts the required register
+ * writes; otherwise, it emits the actual register writes.
+ *
+ * Note that emitting the register writes for the current TMU operation may
+ * trigger a TMU flush, since any of the inputs required for the register
+ * writes may be the result of a pending TMU operation. If that happens, we
+ * need to make sure the flush doesn't land in the middle of the register
+ * writes for the current TMU operation, which is why we always call
+ * ntq_get_src() even if we are only interested in register write counts.
+ */
+static void
+handle_tex_src(struct v3d_compile *c,
+               nir_tex_instr *instr,
+               unsigned src_idx,
+               unsigned non_array_components,
+               struct V3D41_TMU_CONFIG_PARAMETER_2 *p2_unpacked,
+               struct qreg *s_out,
+               unsigned *tmu_writes)
+{
+        /* Either we are calling this just to count required TMU writes, or
+         * we are calling this to emit the actual TMU writes.
+         */
+        assert(tmu_writes || (s_out && p2_unpacked));
+
+        struct qreg s;
+        switch (instr->src[src_idx].src_type) {
+        case nir_tex_src_coord:
+                /* S triggers the lookup, so save it for the end. */
+                s = ntq_get_src(c, instr->src[src_idx].src, 0);
+                if (tmu_writes)
+                        (*tmu_writes)++;
+                else
+                        *s_out = s;
+
+                if (non_array_components > 1) {
+                        struct qreg src =
+                                ntq_get_src(c, instr->src[src_idx].src, 1);
+                        if (tmu_writes)
+                                (*tmu_writes)++;
+                        else
+                                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUT, src, NULL);
+                }
+
+                if (non_array_components > 2) {
+                        struct qreg src =
+                                ntq_get_src(c, instr->src[src_idx].src, 2);
+                        if (tmu_writes)
+                                (*tmu_writes)++;
+                        else
+                                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUR, src, NULL);
+                }
+
+                if (instr->is_array) {
+                        struct qreg src =
+                                ntq_get_src(c, instr->src[src_idx].src,
+                                            instr->coord_components - 1);
+                        if (tmu_writes)
+                                (*tmu_writes)++;
+                        else
+                                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUI, src, NULL);
+                }
+                break;
+
+        case nir_tex_src_bias: {
+                struct qreg src = ntq_get_src(c, instr->src[src_idx].src, 0);
+                if (tmu_writes)
+                        (*tmu_writes)++;
+                else
+                        vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUB, src, NULL);
+                break;
+        }
+
+        case nir_tex_src_lod: {
+                struct qreg src = ntq_get_src(c, instr->src[src_idx].src, 0);
+                if (tmu_writes) {
+                        (*tmu_writes)++;
+                } else {
+                        vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUB, src, NULL);
+
+                        /* With texel fetch automatic LOD is already disabled,
+                         * and disable_autolod must not be enabled. For
+                         * non-cubes we can use the register TMUSLOD, that
+                         * implicitly sets disable_autolod.
+                         */
+                        if (instr->op != nir_texop_txf &&
+                            instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
+                                p2_unpacked->disable_autolod = true;
+                        }
+                }
+                break;
+        }
+
+        case nir_tex_src_comparator: {
+                struct qreg src = ntq_get_src(c, instr->src[src_idx].src, 0);
+                if (tmu_writes)
+                        (*tmu_writes)++;
+                else
+                        vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUDREF, src, NULL);
+                break;
+        }
+
+        case nir_tex_src_offset: {
+                bool is_const_offset = nir_src_is_const(instr->src[src_idx].src);
+                if (is_const_offset) {
+                        if (!tmu_writes) {
+                                p2_unpacked->offset_s =
+                                        nir_src_comp_as_int(instr->src[src_idx].src, 0);
+                                if (non_array_components >= 2)
+                                        p2_unpacked->offset_t =
+                                                nir_src_comp_as_int(instr->src[src_idx].src, 1);
+                                if (non_array_components >= 3)
+                                        p2_unpacked->offset_r =
+                                                nir_src_comp_as_int(instr->src[src_idx].src, 2);
+                        }
+                } else {
+                        struct qreg src_0 =
+                                ntq_get_src(c, instr->src[src_idx].src, 0);
+                        struct qreg src_1 =
+                                ntq_get_src(c, instr->src[src_idx].src, 1);
+                        if (!tmu_writes) {
+                                struct qreg mask = vir_uniform_ui(c, 0xf);
+                                struct qreg x, y, offset;
+
+                                x = vir_AND(c, src_0, mask);
+                                y = vir_AND(c, src_1, mask);
+                                offset = vir_OR(c, x,
+                                                vir_SHL(c, y, vir_uniform_ui(c, 4)));
+
+                                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUOFF, offset, NULL);
+                        } else {
+                                (*tmu_writes)++;
+                        }
+                }
+                break;
+        }
+
+        default:
+                unreachable("unknown texture source");
+        }
+}
+
+static void
+vir_tex_handle_srcs(struct v3d_compile *c,
+                    nir_tex_instr *instr,
+                    struct V3D41_TMU_CONFIG_PARAMETER_2 *p2_unpacked,
+                    struct qreg *s,
+                    unsigned *tmu_writes)
+{
+        unsigned non_array_components = instr->op != nir_texop_lod ?
+                instr->coord_components - instr->is_array :
+                instr->coord_components;
+
+        for (unsigned i = 0; i < instr->num_srcs; i++) {
+                handle_tex_src(c, instr, i, non_array_components,
+                               p2_unpacked, s, tmu_writes);
+        }
+}
+
+static unsigned
+get_required_tmu_writes(struct v3d_compile *c, nir_tex_instr *instr)
+{
+        unsigned tmu_writes = 0;
+        vir_tex_handle_srcs(c, instr, NULL, NULL, &tmu_writes);
+        return tmu_writes;
+}
+
 void
 v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
 {
-        /* FIXME: allow tex pipelining */
-        ntq_flush_tmu(c);
+        assert(instr->op != nir_texop_lod || c->devinfo->ver >= 42);

         unsigned texture_idx = instr->texture_index;
         unsigned sampler_idx = instr->sampler_index;
-        int tmu_writes = 0;

         struct V3D41_TMU_CONFIG_PARAMETER_0 p0_unpacked = {
         };

-        assert(instr->op != nir_texop_lod || c->devinfo->ver >= 42);
-
-        struct V3D41_TMU_CONFIG_PARAMETER_2 p2_unpacked = {
-                .op = V3D_TMU_OP_REGULAR,
-
-                .gather_mode = instr->op == nir_texop_tg4,
-                .gather_component = instr->component,
-
-                .coefficient_mode = instr->op == nir_texop_txd,
-
-                .disable_autolod = instr->op == nir_texop_tg4
-        };
-
-        int non_array_components =
-                instr->op != nir_texop_lod ?
-                instr->coord_components - instr->is_array :
-                instr->coord_components;
-
-        struct qreg s;
-
-        for (unsigned i = 0; i < instr->num_srcs; i++) {
-                switch (instr->src[i].src_type) {
-                case nir_tex_src_coord:
-                        /* S triggers the lookup, so save it for the end. */
-                        s = ntq_get_src(c, instr->src[i].src, 0);
-
-                        if (non_array_components > 1) {
-                                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUT,
-                                              ntq_get_src(c, instr->src[i].src,
-                                                          1), &tmu_writes);
-                        }
-                        if (non_array_components > 2) {
-                                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUR,
-                                              ntq_get_src(c, instr->src[i].src,
-                                                          2), &tmu_writes);
-                        }
-
-                        if (instr->is_array) {
-                                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUI,
-                                              ntq_get_src(c, instr->src[i].src,
-                                                          instr->coord_components - 1),
-                                              &tmu_writes);
-                        }
-                        break;
-
-                case nir_tex_src_bias:
-                        vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUB,
-                                      ntq_get_src(c, instr->src[i].src, 0),
-                                      &tmu_writes);
-                        break;
-
-                case nir_tex_src_lod:
-                        vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUB,
-                                      ntq_get_src(c, instr->src[i].src, 0),
-                                      &tmu_writes);
-
-                        /* With texel fetch automatic LOD is already disabled,
-                         * and disable_autolod must not be enabled. For
-                         * non-cubes we can use the register TMUSLOD, that
-                         * implicitly sets disable_autolod.
-                         */
-                        if (instr->op != nir_texop_txf &&
-                            instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
-                                p2_unpacked.disable_autolod = true;
-                        }
-                        break;
-
-                case nir_tex_src_comparator:
-                        vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUDREF,
-                                      ntq_get_src(c, instr->src[i].src, 0),
-                                      &tmu_writes);
-                        break;
-
-                case nir_tex_src_offset: {
-                        if (nir_src_is_const(instr->src[i].src)) {
-                                p2_unpacked.offset_s = nir_src_comp_as_int(instr->src[i].src, 0);
-                                if (non_array_components >= 2)
-                                        p2_unpacked.offset_t =
-                                                nir_src_comp_as_int(instr->src[i].src, 1);
-                                if (non_array_components >= 3)
-                                        p2_unpacked.offset_r =
-                                                nir_src_comp_as_int(instr->src[i].src, 2);
-                        } else {
-                                struct qreg mask = vir_uniform_ui(c, 0xf);
-                                struct qreg x, y, offset;
-
-                                x = vir_AND(c, ntq_get_src(c, instr->src[i].src,
-                                                           0), mask);
-                                y = vir_AND(c, ntq_get_src(c, instr->src[i].src,
-                                                           1), mask);
-                                offset = vir_OR(c, x,
-                                                vir_SHL(c, y,
-                                                        vir_uniform_ui(c, 4)));
-
-                                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUOFF,
-                                              offset, &tmu_writes);
-                        }
-                        break;
-                }
-
-                default:
-                        unreachable("unknown texture source");
-                }
-        }
-
         /* Limit the number of channels returned to both how many the NIR
          * instruction writes and how many the instruction could produce.
          */
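The subtle point in handle_tex_src() above is that ntq_get_src() is called in both modes: resolving a source may itself flush pending TMU work, which is harmless while counting but must never land between the register writes of the emit pass. A runnable toy of the two-pass walk, with resolve_src() standing in for ntq_get_src():

#include <stdio.h>

/* Two-pass sketch of vir_tex_handle_srcs(): pass 1 resolves every source
 * and counts writes, pass 2 resolves them again (by then there is nothing
 * left to flush) and emits. Names are illustrative only.
 */
static int
resolve_src(int i)
{
        printf("resolve src %d (may flush pending TMU work)\n", i);
        return i * 10;
}

static void
handle_srcs(int num_srcs, unsigned *count)
{
        for (int i = 0; i < num_srcs; i++) {
                int val = resolve_src(i);
                if (count)
                        (*count)++;
                else
                        printf("TMU write <- %d\n", val);
        }
}

int main(void)
{
        unsigned writes = 0;
        handle_srcs(3, &writes); /* pass 1: count; flushes may happen here */
        printf("need %u writes\n", writes);
        handle_srcs(3, NULL);    /* pass 2: emit, uninterrupted */
        return 0;
}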
@@ -184,9 +245,36 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
                 instr->dest.is_ssa ?
                 nir_ssa_def_components_read(&instr->dest.ssa) :
                 (1 << instr->dest.reg.reg->num_components) - 1;
-
         assert(p0_unpacked.return_words_of_texture_data != 0);

+        struct V3D41_TMU_CONFIG_PARAMETER_2 p2_unpacked = {
+                .op = V3D_TMU_OP_REGULAR,
+                .gather_mode = instr->op == nir_texop_tg4,
+                .gather_component = instr->component,
+                .coefficient_mode = instr->op == nir_texop_txd,
+                .disable_autolod = instr->op == nir_texop_tg4
+        };
+
+        const unsigned tmu_writes = get_required_tmu_writes(c, instr);
+
+        /* The input FIFO has 16 slots across all threads, so if we require
+         * more than that we need to lower the thread count.
+         */
+        while (tmu_writes > 16 / c->threads)
+                c->threads /= 2;
+
+        /* If pipelining this TMU operation would overflow TMU fifos, we need
+         * to flush any outstanding TMU operations.
+         */
+        const unsigned dest_components =
+                util_bitcount(p0_unpacked.return_words_of_texture_data);
+        if (ntq_tmu_fifo_overflow(c, dest_components, tmu_writes))
+                ntq_flush_tmu(c);
+
+        /* Process tex sources, emitting the corresponding TMU writes. */
+        struct qreg s = { };
+        vir_tex_handle_srcs(c, instr, &p2_unpacked, &s, NULL);
+
         uint32_t p0_packed;
         V3D41_TMU_CONFIG_PARAMETER_0_pack(NULL,
                                           (uint8_t *)&p0_packed,
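The thread-lowering loop trades parallelism for fifo space: at 4 threads each shader gets 16 / 4 = 4 input-fifo slots, at 2 threads 8, and at 1 thread all 16. A runnable trace of the same loop for an operation that needs 10 register writes:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        const uint32_t tmu_writes = 10; /* cost of the texture operation */
        uint32_t threads = 4;

        /* Same loop as above: 10 > 16/4, halve to 2; 10 > 16/2, halve to 1;
         * 10 <= 16/1, done.
         */
        while (tmu_writes > 16 / threads)
                threads /= 2;

        printf("threads lowered to %u\n", threads); /* prints 1 */
        return 0;
}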
@@ -216,15 +304,15 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
          * itself, we still need to add the sampler configuration
          * parameter if the output is 32 bit
          */
-        bool output_type_32_bit = (c->key->sampler[sampler_idx].return_size == 32 &&
-                                   !instr->is_shadow);
+        bool output_type_32_bit =
+                c->key->sampler[sampler_idx].return_size == 32 &&
+                !instr->is_shadow;

-        /*
-         * p1 is optional, but we can skip it only if p2 can be skipped too
-         */
+        /* p1 is optional, but we can skip it only if p2 can be skipped too */
         bool needs_p2_config =
                 (instr->op == nir_texop_lod ||
-                 memcmp(&p2_unpacked, &p2_unpacked_default, sizeof(p2_unpacked)) != 0);
+                 memcmp(&p2_unpacked, &p2_unpacked_default,
+                        sizeof(p2_unpacked)) != 0);

         /* To handle the cases where we can't just use p1_unpacked_default */
         bool non_default_p1_config = nir_tex_instr_need_sampler(instr) ||
@@ -285,29 +373,21 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
         if (needs_p2_config)
                 vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed);

+        /* Emit the retiring TMU write. */
         if (instr->op == nir_texop_txf) {
                 assert(instr->sampler_dim != GLSL_SAMPLER_DIM_CUBE);
-                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSF, s, &tmu_writes);
+                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSF, s, NULL);
         } else if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
-                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSCM, s, &tmu_writes);
+                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSCM, s, NULL);
         } else if (instr->op == nir_texop_txl) {
-                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSLOD, s, &tmu_writes);
+                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSLOD, s, NULL);
         } else {
-                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUS, s, &tmu_writes);
+                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUS, s, NULL);
         }

-        vir_emit_thrsw(c);
-
-        /* The input FIFO has 16 slots across all threads, so make sure we
-         * don't overfill our allocation.
-         */
-        while (tmu_writes > 16 / c->threads)
-                c->threads /= 2;
-
-        for (int i = 0; i < 4; i++) {
-                if (p0_unpacked.return_words_of_texture_data & (1 << i))
-                        ntq_store_dest(c, &instr->dest, i, vir_LDTMU(c));
-        }
+        ntq_add_pending_tmu_flush(c, &instr->dest,
+                                  p0_unpacked.return_words_of_texture_data,
+                                  tmu_writes);
 }

 static uint32_t
diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
index d617168ddd5..d75a6203ba9 100644
--- a/src/broadcom/compiler/v3d_compiler.h
+++ b/src/broadcom/compiler/v3d_compiler.h
@@ -579,7 +579,8 @@ struct v3d_compile {

                 struct {
                         nir_dest *dest;
-                        uint32_t num_components;
+                        uint8_t num_components;
+                        uint8_t component_mask;
                 } flush[8]; /* 16 entries / 2 threads for input/output fifos */
                 uint32_t flush_count;
         } tmu;
@@ -936,6 +937,9 @@ uint8_t vir_channels_written(struct qinst *inst);
 struct qreg ntq_get_src(struct v3d_compile *c, nir_src src, int i);
 void ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan,
                     struct qreg result);
+bool ntq_tmu_fifo_overflow(struct v3d_compile *c, uint32_t components, uint32_t writes);
+void ntq_add_pending_tmu_flush(struct v3d_compile *c, nir_dest *dest,
+                               uint32_t component_mask, uint32_t tmu_writes);
 void ntq_flush_tmu(struct v3d_compile *c);
 void vir_emit_thrsw(struct v3d_compile *c);
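Taken together, the new exports fix the protocol for any TMU emitter: size the operation first, make room, emit the register writes, and queue the readback instead of performing it inline. The fragment below is only a schematic of v3d40_vir_emit_tex()'s flow collected from the hunks above, not standalone code:

/* 1. Size the operation (dry-run pass over the sources). */
const unsigned tmu_writes = get_required_tmu_writes(c, instr);

/* 2. Make room: lower the thread count and/or flush outstanding TMU work. */
while (tmu_writes > 16 / c->threads)
        c->threads /= 2;
if (ntq_tmu_fifo_overflow(c, dest_components, tmu_writes))
        ntq_flush_tmu(c);

/* 3. Emit the register writes; the retiring write (TMUS/TMUSLOD/...)
 *    goes last and actually triggers the lookup.
 */
vir_tex_handle_srcs(c, instr, &p2_unpacked, &s, NULL);
vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUS, s, NULL);

/* 4. Queue the LDTMU readback; ntq_flush_tmu() later stores the masked
 *    components into the NIR dest when the results are actually needed.
 */
ntq_add_pending_tmu_flush(c, &instr->dest,
                          p0_unpacked.return_words_of_texture_data,
                          tmu_writes);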