diff --git a/src/freedreno/ir3/ir3_ra.c b/src/freedreno/ir3/ir3_ra.c index 44dc6f1b8e4..c9a1b679116 100644 --- a/src/freedreno/ir3/ir3_ra.c +++ b/src/freedreno/ir3/ir3_ra.c @@ -563,6 +563,9 @@ ra_init(struct ir3_ra_ctx *ctx) ctx->hr0_xyz_nodes = ctx->alloc_count; ctx->alloc_count += 3; + /* Add vreg name for prefetch-exclusion range: */ + ctx->prefetch_exclude_node = ctx->alloc_count++; + ctx->g = ra_alloc_interference_graph(ctx->set->regs, ctx->alloc_count); ralloc_steal(ctx->g, ctx->instrd); ctx->def = rzalloc_array(ctx->g, unsigned, ctx->alloc_count); @@ -711,11 +714,20 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block) */ if (is_tex_or_prefetch(instr)) { int writemask_skipped_regs = ffs(instr->regs[0]->wrmask) - 1; - int r0_xyz = (instr->regs[0]->flags & IR3_REG_HALF) ? + int r0_xyz = is_half(instr) ? ctx->hr0_xyz_nodes : ctx->r0_xyz_nodes; for (int i = 0; i < writemask_skipped_regs; i++) ra_add_node_interference(ctx->g, name, r0_xyz + i); } + + /* Pre-fetched textures have a lower limit for bits to encode dst + * register, so add additional interference with registers above + * that limit. + */ + if (instr->opc == OPC_META_TEX_PREFETCH) { + ra_add_node_interference(ctx->g, name, + ctx->prefetch_exclude_node); + } } foreach_use (name, ctx, instr) { @@ -1011,7 +1023,6 @@ ra_add_interference(struct ir3_ra_ctx *ctx) arr->end_ip = 0; } - /* set up the r0.xyz precolor regs. 
*/ for (int i = 0; i < 3; i++) { ra_set_node_reg(ctx->g, ctx->r0_xyz_nodes + i, i); @@ -1019,6 +1030,12 @@ ra_add_interference(struct ir3_ra_ctx *ctx) ctx->set->first_half_reg + i); } + /* pre-color the node that conflicts with half/full regs higher than what + * can be encoded for tex-prefetch: + */ + ra_set_node_reg(ctx->g, ctx->prefetch_exclude_node, + ctx->set->prefetch_exclude_reg); + /* compute live ranges (use/def) on a block level, also updating * block's def/use bitmasks (used below to calculate per-block * livein/liveout): diff --git a/src/freedreno/ir3/ir3_ra.h b/src/freedreno/ir3/ir3_ra.h index 437223bd1de..aa703ae645f 100644 --- a/src/freedreno/ir3/ir3_ra.h +++ b/src/freedreno/ir3/ir3_ra.h @@ -89,6 +89,14 @@ struct ir3_ra_reg_set { unsigned int half_classes[half_class_count]; unsigned int high_classes[high_class_count]; + /* pre-fetched tex dst is limited, on current gens, to regs + * 0x3f and below. An additional register class, with one + * vreg, is set up to conflict with any regs above that + * limit. 
+ */ + unsigned prefetch_exclude_class; + unsigned prefetch_exclude_reg; + /* The virtual register space flattens out all the classes, * starting with full, followed by half and then high, ie: * @@ -145,7 +153,8 @@ struct ir3_ra_ctx { unsigned alloc_count; unsigned r0_xyz_nodes; /* ra node numbers for r0.[xyz] precolors */ - unsigned hr0_xyz_nodes; /* ra node numbers for hr0.[xyz] precolors pre-a6xx */ + unsigned hr0_xyz_nodes; /* ra node numbers for hr0.[xyz] precolors */ + unsigned prefetch_exclude_node; /* one per class, plus one slot for arrays: */ unsigned class_alloc_count[total_class_count + 1]; unsigned class_base[total_class_count + 1]; diff --git a/src/freedreno/ir3/ir3_ra_regset.c b/src/freedreno/ir3/ir3_ra_regset.c index f5acc8f87ac..48fd9f106e8 100644 --- a/src/freedreno/ir3/ir3_ra_regset.c +++ b/src/freedreno/ir3/ir3_ra_regset.c @@ -70,6 +70,21 @@ setup_conflicts(struct ir3_ra_reg_set *set) reg++; } } + + /* + * Setup conflicts with registers over 0x3f for the special vreg + * that exists to use as interference for tex-prefetch: + */ + + for (unsigned i = 0x40; i < CLASS_REGS(0); i++) { + ra_add_transitive_reg_conflict(set->regs, i, + set->prefetch_exclude_reg); + } + + for (unsigned i = 0x40; i < HALF_CLASS_REGS(0); i++) { + ra_add_transitive_reg_conflict(set->regs, i + set->first_half_reg, + set->prefetch_exclude_reg); + } } /* One-time setup of RA register-set, which describes all the possible @@ -104,6 +119,8 @@ ir3_ra_alloc_reg_set(struct ir3_compiler *compiler) for (unsigned i = 0; i < high_class_count; i++) ra_reg_count += HIGH_CLASS_REGS(i); + ra_reg_count += 1; /* for tex-prefetch excludes */ + /* allocate the reg-set.. 
*/ set->regs = ra_alloc_reg_set(set, ra_reg_count, true); set->ra_reg_to_gpr = ralloc_array(set, uint16_t, ra_reg_count); @@ -164,7 +181,20 @@ ir3_ra_alloc_reg_set(struct ir3_compiler *compiler) } } - /* starting a6xx, half precision regs conflict w/ full precision regs: */ + /* + * Setup an additional class, with one vreg, to simply conflict + * with registers that are too high to encode tex-prefetch. This + * vreg is only used to setup additional conflicts so that RA + * knows to allocate prefetch dst regs below the limit: + */ + set->prefetch_exclude_class = ra_alloc_reg_class(set->regs); + ra_class_add_reg(set->regs, set->prefetch_exclude_class, reg); + set->prefetch_exclude_reg = reg++; + + /* + * And finally setup conflicts. Starting a6xx, half precision regs + * conflict w/ full precision regs (when using MERGEDREGS): + */ if (compiler->gpu_id >= 600) { for (unsigned i = 0; i < CLASS_REGS(0) / 2; i++) { unsigned freg = set->gpr_to_ra_reg[0][i];