ir3: Reformat source with clang-format

Generated using:

cd src/freedreno/ir3 && clang-format -i {**,.}/*.c {**,.}/*.h -style=file

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11801>
Authored by Connor Abbott on 2021-07-09 14:50:05 +02:00, committed by Marge Bot
parent 082871bb35
commit 177138d8cb
52 changed files with 18722 additions and 18389 deletions


@@ -21,15 +21,15 @@
* SOFTWARE.
*/
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <util/log.h>
#include <util/u_debug.h>
#include "isa/isa.h"
@@ -39,125 +39,120 @@
static enum debug_t debug;
static const char *levels[] = {
"",
"\t",
"\t\t",
"\t\t\t",
"\t\t\t\t",
"\t\t\t\t\t",
"\t\t\t\t\t\t",
"\t\t\t\t\t\t\t",
"\t\t\t\t\t\t\t\t",
"\t\t\t\t\t\t\t\t\t",
"x",
"x",
"x",
"x",
"x",
"x",
"",
"\t",
"\t\t",
"\t\t\t",
"\t\t\t\t",
"\t\t\t\t\t",
"\t\t\t\t\t\t",
"\t\t\t\t\t\t\t",
"\t\t\t\t\t\t\t\t",
"\t\t\t\t\t\t\t\t\t",
"x",
"x",
"x",
"x",
"x",
"x",
};
struct disasm_ctx {
   FILE *out;
   struct isa_decode_options *options;
   unsigned level;
   unsigned extra_cycles;

   /**
    * nop_count/has_end used to detect the real end of shader. Since
    * in some cases there can be a epilogue following an `end` we look
    * for a sequence of `nop`s following the `end`
    */
   int nop_count; /* number of nop's since non-nop instruction: */
   bool has_end;  /* have we seen end instruction */

   int cur_n;       /* current instr # */
   int cur_opc_cat; /* current opc_cat */

   int sfu_delay;

   /**
    * State accumulated decoding fields of the current instruction,
    * handled after decoding is complete (ie. at start of next instr)
    */
   struct {
      bool ss;
      uint8_t nop;
      uint8_t repeat;
   } last;

   /**
    * State accumulated decoding fields of src or dst register
    */
   struct {
      bool half;
      bool r;
      enum {
         FILE_GPR = 1,
         FILE_CONST = 2,
      } file;
      unsigned num;
   } reg;

   struct shader_stats *stats;
};
static void
print_stats(struct disasm_ctx *ctx)
{
   if (ctx->options->gpu_id >= 600) {
      /* handle MERGEREGS case.. this isn't *entirely* accurate, as
       * you can have shader stages not using merged register file,
       * but it is good enough for a guestimate:
       */
      unsigned n = (ctx->stats->halfreg + 1) / 2;

      ctx->stats->halfreg = 0;
      ctx->stats->fullreg = MAX2(ctx->stats->fullreg, n);
   }

   unsigned instructions = ctx->cur_n + ctx->extra_cycles + 1;

   fprintf(ctx->out, "%sStats:\n", levels[ctx->level]);
   fprintf(ctx->out,
           "%s- shaderdb: %u instr, %u nops, %u non-nops, %u mov, %u cov\n",
           levels[ctx->level], instructions, ctx->stats->nops,
           instructions - ctx->stats->nops, ctx->stats->mov_count,
           ctx->stats->cov_count);
   fprintf(ctx->out,
           "%s- shaderdb: %u last-baryf, %d half, %d full, %u constlen\n",
           levels[ctx->level], ctx->stats->last_baryf,
           DIV_ROUND_UP(ctx->stats->halfreg, 4),
           DIV_ROUND_UP(ctx->stats->fullreg, 4),
           DIV_ROUND_UP(ctx->stats->constlen, 4));
   fprintf(
      ctx->out,
      "%s- shaderdb: %u cat0, %u cat1, %u cat2, %u cat3, %u cat4, %u cat5, %u cat6, %u cat7\n",
      levels[ctx->level], ctx->stats->instrs_per_cat[0],
      ctx->stats->instrs_per_cat[1], ctx->stats->instrs_per_cat[2],
      ctx->stats->instrs_per_cat[3], ctx->stats->instrs_per_cat[4],
      ctx->stats->instrs_per_cat[5], ctx->stats->instrs_per_cat[6],
      ctx->stats->instrs_per_cat[7]);
   fprintf(ctx->out, "%s- shaderdb: %u sstall, %u (ss), %u (sy)\n",
           levels[ctx->level], ctx->stats->sstall, ctx->stats->ss,
           ctx->stats->sy);
}
/* size of largest OPC field of all the instruction categories: */
#define NOPC_BITS 6
static const struct opc_info {
   const char *name;
} opcs[1 << (3 + NOPC_BITS)] = {
#define OPC(cat, opc, name) [(opc)] = {#name}
/* clang-format off */
/* category 0: */
OPC(0, OPC_NOP, nop),
@@ -359,96 +354,96 @@ static const struct opc_info {
#undef OPC
};
#define GETINFO(instr)                                                         \
   (&(opcs[((instr)->opc_cat << NOPC_BITS) | instr_opc(instr, ctx->gpu_id)]))

const char *
disasm_a3xx_instr_name(opc_t opc)
{
   if (opc_cat(opc) == -1)
      return "??meta??";
   return opcs[opc].name;
}
static void
disasm_field_cb(void *d, const char *field_name, struct isa_decode_value *val)
{
   struct disasm_ctx *ctx = d;

   if (!strcmp(field_name, "NAME")) {
      if (!strcmp("nop", val->str)) {
         if (ctx->has_end) {
            ctx->nop_count++;
            if (ctx->nop_count > 3) {
               ctx->options->stop = true;
            }
         }
         ctx->stats->nops += 1 + ctx->last.repeat;
      } else {
         ctx->nop_count = 0;
      }

      if (!strcmp("end", val->str)) {
         ctx->has_end = true;
         ctx->nop_count = 0;
      } else if (!strcmp("chsh", val->str)) {
         ctx->options->stop = true;
      } else if (!strcmp("bary.f", val->str)) {
         ctx->stats->last_baryf = ctx->cur_n;
      }
   } else if (!strcmp(field_name, "REPEAT")) {
      ctx->extra_cycles += val->num;
      ctx->stats->instrs_per_cat[ctx->cur_opc_cat] += val->num;
      ctx->last.repeat = val->num;
   } else if (!strcmp(field_name, "NOP")) {
      ctx->extra_cycles += val->num;
      ctx->stats->instrs_per_cat[0] += val->num;
      ctx->stats->nops += val->num;
      ctx->last.nop = val->num;
   } else if (!strcmp(field_name, "SY")) {
      ctx->stats->sy += val->num;
   } else if (!strcmp(field_name, "SS")) {
      ctx->stats->ss += val->num;
      ctx->last.ss = !!val->num;
   } else if (!strcmp(field_name, "CONST")) {
      ctx->reg.num = val->num;
      ctx->reg.file = FILE_CONST;
   } else if (!strcmp(field_name, "GPR")) {
      /* don't count GPR regs r48.x (shared) or higher: */
      if (val->num < 48) {
         ctx->reg.num = val->num;
         ctx->reg.file = FILE_GPR;
      }
   } else if (!strcmp(field_name, "SRC_R") || !strcmp(field_name, "SRC1_R") ||
              !strcmp(field_name, "SRC2_R") || !strcmp(field_name, "SRC3_R")) {
      ctx->reg.r = val->num;
   } else if (!strcmp(field_name, "DST")) {
      /* Dest register is always repeated
       *
       * Note that this doesn't really properly handle instructions
       * that write multiple components.. the old disasm didn't handle
       * that case either.
       */
      ctx->reg.r = true;
   } else if (strstr(field_name, "HALF")) {
      ctx->reg.half = val->num;
   } else if (!strcmp(field_name, "SWIZ")) {
      unsigned num = (ctx->reg.num << 2) | val->num;
      if (ctx->reg.r)
         num += ctx->last.repeat;

      if (ctx->reg.file == FILE_CONST) {
         ctx->stats->constlen = MAX2(ctx->stats->constlen, num);
      } else if (ctx->reg.file == FILE_GPR) {
         if (ctx->reg.half) {
            ctx->stats->halfreg = MAX2(ctx->stats->halfreg, num);
         } else {
            ctx->stats->fullreg = MAX2(ctx->stats->fullreg, num);
         }
      }

      memset(&ctx->reg, 0, sizeof(ctx->reg));
   }
}
/**
@@ -458,103 +453,105 @@ disasm_field_cb(void *d, const char *field_name, struct isa_decode_value *val)
static void
disasm_handle_last(struct disasm_ctx *ctx)
{
   if (ctx->last.ss) {
      ctx->stats->sstall += ctx->sfu_delay;
      ctx->sfu_delay = 0;
   }

   if (ctx->cur_opc_cat == 4) {
      ctx->sfu_delay = 10;
   } else {
      int n = MIN2(ctx->sfu_delay, 1 + ctx->last.repeat + ctx->last.nop);
      ctx->sfu_delay -= n;
   }

   memset(&ctx->last, 0, sizeof(ctx->last));
}
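/*
 * Illustrative only (not part of the original source): a rough walk-through
 * of the (ss) stall accounting above, assuming each instruction costs
 * 1 + repeat + nop cycles.  Suppose a cat4 (SFU) instruction is followed by
 * a cat2 instruction with (nop2), and then by an instruction flagged (ss):
 *
 *    after the cat4 op is handled:          sfu_delay = 10
 *    after the cat2 op (1 + 0 + 2 cycles):  sfu_delay = 10 - 3 = 7
 *    when the (ss) op is handled:           stats->sstall += 7, sfu_delay = 0
 */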
static void
disasm_instr_cb(void *d, unsigned n, uint64_t instr)
{
   struct disasm_ctx *ctx = d;
   uint32_t *dwords = (uint32_t *)&instr;
   unsigned opc_cat = instr >> 61;

   /* There are some cases where we can get instr_cb called multiple
    * times per instruction (like when we need an extra line for branch
    * target labels), don't update stats in these cases:
    */
   if (n != ctx->cur_n) {
      if (n > 0) {
         disasm_handle_last(ctx);
      }
      ctx->stats->instrs_per_cat[opc_cat]++;
      ctx->cur_n = n;

      /* mov vs cov stats are a bit harder to fish out of the field
       * names, because current ir3-cat1.xml doesn't use {NAME} for
       * this distinction. So for now just handle this case with
       * some hand-coded parsing:
       */
      if (opc_cat == 1) {
         unsigned opc = (instr >> 57) & 0x3;
         unsigned src_type = (instr >> 50) & 0x7;
         unsigned dst_type = (instr >> 46) & 0x7;

         if (opc == 0) {
            if (src_type == dst_type) {
               ctx->stats->mov_count++;
            } else {
               ctx->stats->cov_count++;
            }
         }
      }
   }

   ctx->cur_opc_cat = opc_cat;

   if (debug & PRINT_RAW) {
      fprintf(ctx->out, "%s:%d:%04d:%04d[%08xx_%08xx] ", levels[ctx->level],
              opc_cat, n, ctx->extra_cycles + n, dwords[1], dwords[0]);
   }
}
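/*
 * Illustrative only (not from the original source): with the hand-coded cat1
 * parsing above, a same-type move such as "mov.f32f32" (src_type == dst_type)
 * bumps stats->mov_count, while a type-converting "cov.f32f16"
 * (src_type != dst_type) bumps stats->cov_count instead.
 */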
int
disasm_a3xx_stat(uint32_t *dwords, int sizedwords, int level, FILE *out,
                 unsigned gpu_id, struct shader_stats *stats)
{
   struct isa_decode_options decode_options = {
      .gpu_id = gpu_id,
      .show_errors = true,
      .max_errors = 5,
      .branch_labels = true,
      .field_cb = disasm_field_cb,
      .instr_cb = disasm_instr_cb,
   };
   struct disasm_ctx ctx = {
      .out = out,
      .level = level,
      .options = &decode_options,
      .stats = stats,
      .cur_n = -1,
   };

   memset(stats, 0, sizeof(*stats));

   decode_options.cbdata = &ctx;

   isa_decode(dwords, sizedwords * 4, out, &decode_options);

   disasm_handle_last(&ctx);

   if (debug & PRINT_STATS)
      print_stats(&ctx);

   return 0;
}
void
disasm_a3xx_set_debug(enum debug_t d)
{
   debug = d;
}
#include <setjmp.h>
@@ -564,34 +561,38 @@ static jmp_buf jmp_env;
void
ir3_assert_handler(const char *expr, const char *file, int line,
                   const char *func)
{
   mesa_loge("%s:%u: %s: Assertion `%s' failed.", file, line, func, expr);
   if (jmp_env_valid)
      longjmp(jmp_env, 1);
   abort();
}
#define TRY(x)                                                                 \
   do {                                                                        \
      assert(!jmp_env_valid);                                                  \
      if (setjmp(jmp_env) == 0) {                                              \
         jmp_env_valid = true;                                                 \
         x;                                                                    \
      }                                                                        \
      jmp_env_valid = false;                                                   \
   } while (0)
int
disasm_a3xx(uint32_t *dwords, int sizedwords, int level, FILE *out,
            unsigned gpu_id)
{
   struct shader_stats stats;
   return disasm_a3xx_stat(dwords, sizedwords, level, out, gpu_id, &stats);
}
int
try_disasm_a3xx(uint32_t *dwords, int sizedwords, int level, FILE *out,
                unsigned gpu_id)
{
   struct shader_stats stats;
   int ret = -1;
   TRY(ret = disasm_a3xx_stat(dwords, sizedwords, level, out, gpu_id, &stats));
   return ret;
}
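/*
 * A minimal usage sketch (not part of this change); the shader binary, size,
 * and gpu_id below are hypothetical placeholders:
 */
#if 0
static void
dump_shader_example(uint32_t *bin, int sizedwords)
{
   struct shader_stats stats;

   /* collect per-shader stats while disassembling to stdout: */
   disasm_a3xx_stat(bin, sizedwords, 0, stdout, 630, &stats);

   /* or, when the input may be malformed, use the setjmp-guarded variant: */
   if (try_disasm_a3xx(bin, sizedwords, 0, stdout, 630) != 0)
      fprintf(stderr, "disassembly failed\n");
}
#endif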

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -33,67 +33,65 @@
* Handlers for instructions changed/added in a4xx:
*/
/* src[] = { buffer_index, offset }. No const_index */
static void
emit_intrinsic_load_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
                         struct ir3_instruction **dst)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *ldgb, *src0, *src1, *byte_offset, *offset;

   struct ir3_instruction *ssbo = ir3_ssbo_to_ibo(ctx, intr->src[0]);

   byte_offset = ir3_get_src(ctx, &intr->src[1])[0];
   offset = ir3_get_src(ctx, &intr->src[2])[0];

   /* src0 is uvec2(offset*4, 0), src1 is offset.. nir already *= 4: */
   src0 = ir3_collect(ctx, byte_offset, create_immed(b, 0));
   src1 = offset;

   ldgb = ir3_LDGB(b, ssbo, 0, src0, 0, src1, 0);
   ldgb->dsts[0]->wrmask = MASK(intr->num_components);
   ldgb->cat6.iim_val = intr->num_components;
   ldgb->cat6.d = 4;
   ldgb->cat6.type = TYPE_U32;
   ldgb->barrier_class = IR3_BARRIER_BUFFER_R;
   ldgb->barrier_conflict = IR3_BARRIER_BUFFER_W;

   ir3_split_dest(b, dst, ldgb, 0, intr->num_components);
}
/* src[] = { value, block_index, offset }. const_index[] = { write_mask } */
static void
emit_intrinsic_store_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *stgb, *src0, *src1, *src2, *byte_offset, *offset;
   unsigned wrmask = nir_intrinsic_write_mask(intr);
   unsigned ncomp = ffs(~wrmask) - 1;

   assert(wrmask == BITFIELD_MASK(intr->num_components));

   struct ir3_instruction *ssbo = ir3_ssbo_to_ibo(ctx, intr->src[1]);

   byte_offset = ir3_get_src(ctx, &intr->src[2])[0];
   offset = ir3_get_src(ctx, &intr->src[3])[0];

   /* src0 is value, src1 is offset, src2 is uvec2(offset*4, 0)..
    * nir already *= 4:
    */
   src0 = ir3_create_collect(ctx, ir3_get_src(ctx, &intr->src[0]), ncomp);
   src1 = offset;
   src2 = ir3_collect(ctx, byte_offset, create_immed(b, 0));

   stgb = ir3_STGB(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
   stgb->cat6.iim_val = ncomp;
   stgb->cat6.d = 4;
   stgb->cat6.type = TYPE_U32;
   stgb->barrier_class = IR3_BARRIER_BUFFER_W;
   stgb->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;

   array_insert(b, b->keeps, stgb);
}
/*
@@ -116,229 +114,228 @@ emit_intrinsic_store_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
static struct ir3_instruction *
emit_intrinsic_atomic_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *atomic, *ssbo, *src0, *src1, *src2, *byte_offset,
      *offset;
   type_t type = TYPE_U32;

   ssbo = ir3_ssbo_to_ibo(ctx, intr->src[0]);

   byte_offset = ir3_get_src(ctx, &intr->src[1])[0];
   offset = ir3_get_src(ctx, &intr->src[3])[0];

   /* src0 is data (or uvec2(data, compare))
    * src1 is offset
    * src2 is uvec2(offset*4, 0) (appears to be 64b byte offset)
    *
    * Note that nir already multiplies the offset by four
    */
   src0 = ir3_get_src(ctx, &intr->src[2])[0];
   src1 = offset;
   src2 = ir3_collect(ctx, byte_offset, create_immed(b, 0));

   switch (intr->intrinsic) {
   case nir_intrinsic_ssbo_atomic_add_ir3:
      atomic = ir3_ATOMIC_ADD_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
      break;
   case nir_intrinsic_ssbo_atomic_imin_ir3:
      atomic = ir3_ATOMIC_MIN_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
      type = TYPE_S32;
      break;
   case nir_intrinsic_ssbo_atomic_umin_ir3:
      atomic = ir3_ATOMIC_MIN_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
      break;
   case nir_intrinsic_ssbo_atomic_imax_ir3:
      atomic = ir3_ATOMIC_MAX_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
      type = TYPE_S32;
      break;
   case nir_intrinsic_ssbo_atomic_umax_ir3:
      atomic = ir3_ATOMIC_MAX_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
      break;
   case nir_intrinsic_ssbo_atomic_and_ir3:
      atomic = ir3_ATOMIC_AND_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
      break;
   case nir_intrinsic_ssbo_atomic_or_ir3:
      atomic = ir3_ATOMIC_OR_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
      break;
   case nir_intrinsic_ssbo_atomic_xor_ir3:
      atomic = ir3_ATOMIC_XOR_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
      break;
   case nir_intrinsic_ssbo_atomic_exchange_ir3:
      atomic = ir3_ATOMIC_XCHG_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
      break;
   case nir_intrinsic_ssbo_atomic_comp_swap_ir3:
      /* for cmpxchg, src0 is [ui]vec2(data, compare): */
      src0 = ir3_collect(ctx, ir3_get_src(ctx, &intr->src[3])[0], src0);
      src1 = ir3_get_src(ctx, &intr->src[4])[0];
      atomic = ir3_ATOMIC_CMPXCHG_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
      break;
   default:
      unreachable("boo");
   }

   atomic->cat6.iim_val = 1;
   atomic->cat6.d = 4;
   atomic->cat6.type = type;
   atomic->barrier_class = IR3_BARRIER_BUFFER_W;
   atomic->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;

   /* even if nothing consume the result, we can't DCE the instruction: */
   array_insert(b, b->keeps, atomic);

   return atomic;
}
static struct ir3_instruction *
get_image_offset(struct ir3_context *ctx, const nir_intrinsic_instr *instr,
                 struct ir3_instruction *const *coords, bool byteoff)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *offset;
   unsigned index = nir_src_as_uint(instr->src[0]);
   unsigned ncoords = ir3_get_image_coords(instr, NULL);

   /* to calculate the byte offset (yes, uggg) we need (up to) three
    * const values to know the bytes per pixel, and y and z stride:
    */
   const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
   unsigned cb = regid(const_state->offsets.image_dims, 0) +
                 const_state->image_dims.off[index];

   debug_assert(const_state->image_dims.mask & (1 << index));

   /* offset = coords.x * bytes_per_pixel: */
   offset = ir3_MUL_S24(b, coords[0], 0, create_uniform(b, cb + 0), 0);
   if (ncoords > 1) {
      /* offset += coords.y * y_pitch: */
      offset =
         ir3_MAD_S24(b, create_uniform(b, cb + 1), 0, coords[1], 0, offset, 0);
   }
   if (ncoords > 2) {
      /* offset += coords.z * z_pitch: */
      offset =
         ir3_MAD_S24(b, create_uniform(b, cb + 2), 0, coords[2], 0, offset, 0);
   }

   if (!byteoff) {
      /* Some cases, like atomics, seem to use dword offset instead
       * of byte offsets.. blob just puts an extra shr.b in there
       * in those cases:
       */
      offset = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0);
   }

   return ir3_collect(ctx, offset, create_immed(b, 0));
}
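/*
 * Illustrative only (hypothetical image_dims constants): for a 2D RGBA8
 * image with bytes_per_pixel = 4 and y_pitch = 256, coords (3, 2) give
 *
 *    offset = 3 * 4 + 2 * 256 = 524 bytes
 *
 * and with byteoff == false the extra shr.b yields 524 >> 2 = 131, the
 * dword offset used by the atomic paths.
 */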
/* src[] = { index, coord, sample_index, value }. const_index[] = {} */
static void
emit_intrinsic_store_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *stib, *offset;
   struct ir3_instruction *const *value = ir3_get_src(ctx, &intr->src[3]);
   struct ir3_instruction *const *coords = ir3_get_src(ctx, &intr->src[1]);
   struct ir3_instruction *ibo = ir3_image_to_ibo(ctx, intr->src[0]);
   unsigned ncoords = ir3_get_image_coords(intr, NULL);
   unsigned ncomp =
      ir3_get_num_components_for_image_format(nir_intrinsic_format(intr));

   /* src0 is value
    * src1 is coords
    * src2 is 64b byte offset
    */

   offset = get_image_offset(ctx, intr, coords, true);

   /* NOTE: stib seems to take byte offset, but stgb.typed can be used
    * too and takes a dword offset.. not quite sure yet why blob uses
    * one over the other in various cases.
    */

   stib = ir3_STIB(b, ibo, 0, ir3_create_collect(ctx, value, ncomp), 0,
                   ir3_create_collect(ctx, coords, ncoords), 0, offset, 0);
   stib->cat6.iim_val = ncomp;
   stib->cat6.d = ncoords;
   stib->cat6.type = ir3_get_type_for_image_intrinsic(intr);
   stib->cat6.typed = true;
   stib->barrier_class = IR3_BARRIER_IMAGE_W;
   stib->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;

   array_insert(b, b->keeps, stib);
}
/* src[] = { deref, coord, sample_index, value, compare }. const_index[] = {} */
static struct ir3_instruction *
emit_intrinsic_atomic_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *atomic, *src0, *src1, *src2;
   struct ir3_instruction *const *coords = ir3_get_src(ctx, &intr->src[1]);
   struct ir3_instruction *image = ir3_image_to_ibo(ctx, intr->src[0]);
   unsigned ncoords = ir3_get_image_coords(intr, NULL);

   /* src0 is value (or uvec2(value, compare))
    * src1 is coords
    * src2 is 64b byte offset
    */
   src0 = ir3_get_src(ctx, &intr->src[3])[0];
   src1 = ir3_create_collect(ctx, coords, ncoords);
   src2 = get_image_offset(ctx, intr, coords, false);

   switch (intr->intrinsic) {
   case nir_intrinsic_image_atomic_add:
      atomic = ir3_ATOMIC_ADD_G(b, image, 0, src0, 0, src1, 0, src2, 0);
      break;
   case nir_intrinsic_image_atomic_imin:
   case nir_intrinsic_image_atomic_umin:
      atomic = ir3_ATOMIC_MIN_G(b, image, 0, src0, 0, src1, 0, src2, 0);
      break;
   case nir_intrinsic_image_atomic_imax:
   case nir_intrinsic_image_atomic_umax:
      atomic = ir3_ATOMIC_MAX_G(b, image, 0, src0, 0, src1, 0, src2, 0);
      break;
   case nir_intrinsic_image_atomic_and:
      atomic = ir3_ATOMIC_AND_G(b, image, 0, src0, 0, src1, 0, src2, 0);
      break;
   case nir_intrinsic_image_atomic_or:
      atomic = ir3_ATOMIC_OR_G(b, image, 0, src0, 0, src1, 0, src2, 0);
      break;
   case nir_intrinsic_image_atomic_xor:
      atomic = ir3_ATOMIC_XOR_G(b, image, 0, src0, 0, src1, 0, src2, 0);
      break;
   case nir_intrinsic_image_atomic_exchange:
      atomic = ir3_ATOMIC_XCHG_G(b, image, 0, src0, 0, src1, 0, src2, 0);
      break;
   case nir_intrinsic_image_atomic_comp_swap:
      /* for cmpxchg, src0 is [ui]vec2(data, compare): */
      src0 = ir3_collect(ctx, ir3_get_src(ctx, &intr->src[4])[0], src0);
      atomic = ir3_ATOMIC_CMPXCHG_G(b, image, 0, src0, 0, src1, 0, src2, 0);
      break;
   default:
      unreachable("boo");
   }

   atomic->cat6.iim_val = 1;
   atomic->cat6.d = ncoords;
   atomic->cat6.type = ir3_get_type_for_image_intrinsic(intr);
   atomic->cat6.typed = true;
   atomic->barrier_class = IR3_BARRIER_IMAGE_W;
   atomic->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;

   /* even if nothing consume the result, we can't DCE the instruction: */
   array_insert(b, b->keeps, atomic);

   return atomic;
}
const struct ir3_context_funcs ir3_a4xx_funcs = {
   .emit_intrinsic_load_ssbo = emit_intrinsic_load_ssbo,
   .emit_intrinsic_store_ssbo = emit_intrinsic_store_ssbo,
   .emit_intrinsic_atomic_ssbo = emit_intrinsic_atomic_ssbo,
   .emit_intrinsic_store_image = emit_intrinsic_store_image,
   .emit_intrinsic_atomic_image = emit_intrinsic_atomic_image,
   .emit_intrinsic_image_size = emit_intrinsic_image_size_tex,
   .emit_intrinsic_load_global_ir3 = NULL,
   .emit_intrinsic_store_global_ir3 = NULL,
};


@@ -40,53 +40,53 @@
/* src[] = { buffer_index, offset }. No const_index */
static void
emit_intrinsic_load_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
                         struct ir3_instruction **dst)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *offset;
   struct ir3_instruction *ldib;

   offset = ir3_get_src(ctx, &intr->src[2])[0];

   ldib = ir3_LDIB(b, ir3_ssbo_to_ibo(ctx, intr->src[0]), 0, offset, 0);
   ldib->dsts[0]->wrmask = MASK(intr->num_components);
   ldib->cat6.iim_val = intr->num_components;
   ldib->cat6.d = 1;
   ldib->cat6.type = intr->dest.ssa.bit_size == 16 ? TYPE_U16 : TYPE_U32;
   ldib->barrier_class = IR3_BARRIER_BUFFER_R;
   ldib->barrier_conflict = IR3_BARRIER_BUFFER_W;
   ir3_handle_bindless_cat6(ldib, intr->src[0]);
   ir3_handle_nonuniform(ldib, intr);

   ir3_split_dest(b, dst, ldib, 0, intr->num_components);
}
/* src[] = { value, block_index, offset }. const_index[] = { write_mask } */
static void
emit_intrinsic_store_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *stib, *val, *offset;
   unsigned wrmask = nir_intrinsic_write_mask(intr);
   unsigned ncomp = ffs(~wrmask) - 1;

   assert(wrmask == BITFIELD_MASK(intr->num_components));

   /* src0 is offset, src1 is value:
    */
   val = ir3_create_collect(ctx, ir3_get_src(ctx, &intr->src[0]), ncomp);
   offset = ir3_get_src(ctx, &intr->src[3])[0];

   stib = ir3_STIB(b, ir3_ssbo_to_ibo(ctx, intr->src[1]), 0, offset, 0, val, 0);
   stib->cat6.iim_val = ncomp;
   stib->cat6.d = 1;
   stib->cat6.type = intr->src[0].ssa->bit_size == 16 ? TYPE_U16 : TYPE_U32;
   stib->barrier_class = IR3_BARRIER_BUFFER_W;
   stib->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
   ir3_handle_bindless_cat6(stib, intr->src[1]);
   ir3_handle_nonuniform(stib, intr);

   array_insert(b, b->keeps, stib);
}
/*
@@ -109,329 +109,321 @@ emit_intrinsic_store_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
static struct ir3_instruction *
emit_intrinsic_atomic_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *atomic, *ibo, *src0, *src1, *data, *dummy;
   type_t type = TYPE_U32;

   ibo = ir3_ssbo_to_ibo(ctx, intr->src[0]);

   data = ir3_get_src(ctx, &intr->src[2])[0];

   /* So this gets a bit creative:
    *
    * src0 - vecN offset/coords
    * src1.x - is actually destination register
    * src1.y - is 'data' except for cmpxchg where src2.y is 'compare'
    * src1.z - is 'data' for cmpxchg
    *
    * The combining src and dest kinda doesn't work out so well with how
    * scheduling and RA work. So we create a dummy src2 which is tied to the
    * destination in RA (i.e. must be allocated to the same vec2/vec3
    * register) and then immediately extract the first component.
    *
    * Note that nir already multiplies the offset by four
    */
   dummy = create_immed(b, 0);

   if (intr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap_ir3) {
      src0 = ir3_get_src(ctx, &intr->src[4])[0];
      struct ir3_instruction *compare = ir3_get_src(ctx, &intr->src[3])[0];
      src1 = ir3_collect(ctx, dummy, compare, data);
   } else {
      src0 = ir3_get_src(ctx, &intr->src[3])[0];
      src1 = ir3_collect(ctx, dummy, data);
   }

   switch (intr->intrinsic) {
   case nir_intrinsic_ssbo_atomic_add_ir3:
      atomic = ir3_ATOMIC_ADD_G(b, ibo, 0, src0, 0, src1, 0);
      break;
   case nir_intrinsic_ssbo_atomic_imin_ir3:
      atomic = ir3_ATOMIC_MIN_G(b, ibo, 0, src0, 0, src1, 0);
      type = TYPE_S32;
      break;
   case nir_intrinsic_ssbo_atomic_umin_ir3:
      atomic = ir3_ATOMIC_MIN_G(b, ibo, 0, src0, 0, src1, 0);
      break;
   case nir_intrinsic_ssbo_atomic_imax_ir3:
      atomic = ir3_ATOMIC_MAX_G(b, ibo, 0, src0, 0, src1, 0);
      type = TYPE_S32;
      break;
   case nir_intrinsic_ssbo_atomic_umax_ir3:
      atomic = ir3_ATOMIC_MAX_G(b, ibo, 0, src0, 0, src1, 0);
      break;
   case nir_intrinsic_ssbo_atomic_and_ir3:
      atomic = ir3_ATOMIC_AND_G(b, ibo, 0, src0, 0, src1, 0);
      break;
   case nir_intrinsic_ssbo_atomic_or_ir3:
      atomic = ir3_ATOMIC_OR_G(b, ibo, 0, src0, 0, src1, 0);
      break;
   case nir_intrinsic_ssbo_atomic_xor_ir3:
      atomic = ir3_ATOMIC_XOR_G(b, ibo, 0, src0, 0, src1, 0);
      break;
   case nir_intrinsic_ssbo_atomic_exchange_ir3:
      atomic = ir3_ATOMIC_XCHG_G(b, ibo, 0, src0, 0, src1, 0);
      break;
   case nir_intrinsic_ssbo_atomic_comp_swap_ir3:
      atomic = ir3_ATOMIC_CMPXCHG_G(b, ibo, 0, src0, 0, src1, 0);
      break;
   default:
      unreachable("boo");
   }

   atomic->cat6.iim_val = 1;
   atomic->cat6.d = 1;
   atomic->cat6.type = type;
   atomic->barrier_class = IR3_BARRIER_BUFFER_W;
   atomic->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
   ir3_handle_bindless_cat6(atomic, intr->src[0]);

   /* even if nothing consume the result, we can't DCE the instruction: */
   array_insert(b, b->keeps, atomic);

   atomic->dsts[0]->wrmask = src1->dsts[0]->wrmask;
   ir3_reg_tie(atomic->dsts[0], atomic->srcs[2]);
   struct ir3_instruction *split;
   ir3_split_dest(b, &split, atomic, 0, 1);
   return split;
}
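/*
 * Illustrative only (not from the original source): for a plain ssbo atomic
 * add of value v at offset off, the sequence built above is roughly
 *
 *    src1   = collect(dummy_immed, v)      // dummy occupies component .x
 *    atomic = ATOMIC_ADD_G(ibo, off, src1)
 *    result = split(atomic, component 0)
 *
 * with atomic->dsts[0] register-tied to the collect so RA assigns them the
 * same vec2, which is what lets src1.x stand in for the destination.
 */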
/* src[] = { deref, coord, sample_index }. const_index[] = {} */
static void
emit_intrinsic_load_image(struct ir3_context *ctx, nir_intrinsic_instr *intr,
                          struct ir3_instruction **dst)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *ldib;
   struct ir3_instruction *const *coords = ir3_get_src(ctx, &intr->src[1]);
   unsigned ncoords = ir3_get_image_coords(intr, NULL);

   ldib = ir3_LDIB(b, ir3_image_to_ibo(ctx, intr->src[0]), 0,
                   ir3_create_collect(ctx, coords, ncoords), 0);
   ldib->dsts[0]->wrmask = MASK(intr->num_components);
   ldib->cat6.iim_val = intr->num_components;
   ldib->cat6.d = ncoords;
   ldib->cat6.type = ir3_get_type_for_image_intrinsic(intr);
   ldib->cat6.typed = true;
   ldib->barrier_class = IR3_BARRIER_IMAGE_R;
   ldib->barrier_conflict = IR3_BARRIER_IMAGE_W;
   ir3_handle_bindless_cat6(ldib, intr->src[0]);
   ir3_handle_nonuniform(ldib, intr);

   ir3_split_dest(b, dst, ldib, 0, intr->num_components);
}
/* src[] = { deref, coord, sample_index, value }. const_index[] = {} */
static void
emit_intrinsic_store_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *stib;
   struct ir3_instruction *const *value = ir3_get_src(ctx, &intr->src[3]);
   struct ir3_instruction *const *coords = ir3_get_src(ctx, &intr->src[1]);
   unsigned ncoords = ir3_get_image_coords(intr, NULL);
   enum pipe_format format = nir_intrinsic_format(intr);
   unsigned ncomp = ir3_get_num_components_for_image_format(format);

   /* src0 is offset, src1 is value:
    */
   stib = ir3_STIB(b, ir3_image_to_ibo(ctx, intr->src[0]), 0,
                   ir3_create_collect(ctx, coords, ncoords), 0,
                   ir3_create_collect(ctx, value, ncomp), 0);
   stib->cat6.iim_val = ncomp;
   stib->cat6.d = ncoords;
   stib->cat6.type = ir3_get_type_for_image_intrinsic(intr);
   stib->cat6.typed = true;
   stib->barrier_class = IR3_BARRIER_IMAGE_W;
   stib->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
   ir3_handle_bindless_cat6(stib, intr->src[0]);
   ir3_handle_nonuniform(stib, intr);

   array_insert(b, b->keeps, stib);
}
/* src[] = { deref, coord, sample_index, value, compare }. const_index[] = {} */
static struct ir3_instruction *
emit_intrinsic_atomic_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *atomic, *ibo, *src0, *src1, *dummy;
   struct ir3_instruction *const *coords = ir3_get_src(ctx, &intr->src[1]);
   struct ir3_instruction *value = ir3_get_src(ctx, &intr->src[3])[0];
   unsigned ncoords = ir3_get_image_coords(intr, NULL);

   ibo = ir3_image_to_ibo(ctx, intr->src[0]);

   /* So this gets a bit creative:
    *
    * src0 - vecN offset/coords
    * src1.x - is actually destination register
    * src1.y - is 'value' except for cmpxchg where src2.y is 'compare'
    * src1.z - is 'value' for cmpxchg
    *
    * The combining src and dest kinda doesn't work out so well with how
    * scheduling and RA work. So we create a dummy src2 which is tied to the
    * destination in RA (i.e. must be allocated to the same vec2/vec3
    * register) and then immediately extract the first component.
    */
   dummy = create_immed(b, 0);
   src0 = ir3_create_collect(ctx, coords, ncoords);

   if (intr->intrinsic == nir_intrinsic_image_atomic_comp_swap ||
       intr->intrinsic == nir_intrinsic_bindless_image_atomic_comp_swap) {
      struct ir3_instruction *compare = ir3_get_src(ctx, &intr->src[4])[0];
      src1 = ir3_collect(ctx, dummy, compare, value);
   } else {
      src1 = ir3_collect(ctx, dummy, value);
   }

   switch (intr->intrinsic) {
   case nir_intrinsic_image_atomic_add:
   case nir_intrinsic_bindless_image_atomic_add:
      atomic = ir3_ATOMIC_ADD_G(b, ibo, 0, src0, 0, src1, 0);
      break;
   case nir_intrinsic_image_atomic_imin:
   case nir_intrinsic_image_atomic_umin:
   case nir_intrinsic_bindless_image_atomic_imin:
   case nir_intrinsic_bindless_image_atomic_umin:
      atomic = ir3_ATOMIC_MIN_G(b, ibo, 0, src0, 0, src1, 0);
      break;
   case nir_intrinsic_image_atomic_imax:
   case nir_intrinsic_image_atomic_umax:
   case nir_intrinsic_bindless_image_atomic_imax:
   case nir_intrinsic_bindless_image_atomic_umax:
      atomic = ir3_ATOMIC_MAX_G(b, ibo, 0, src0, 0, src1, 0);
      break;
   case nir_intrinsic_image_atomic_and:
   case nir_intrinsic_bindless_image_atomic_and:
      atomic = ir3_ATOMIC_AND_G(b, ibo, 0, src0, 0, src1, 0);
      break;
   case nir_intrinsic_image_atomic_or:
   case nir_intrinsic_bindless_image_atomic_or:
      atomic = ir3_ATOMIC_OR_G(b, ibo, 0, src0, 0, src1, 0);
      break;
   case nir_intrinsic_image_atomic_xor:
   case nir_intrinsic_bindless_image_atomic_xor:
      atomic = ir3_ATOMIC_XOR_G(b, ibo, 0, src0, 0, src1, 0);
      break;
   case nir_intrinsic_image_atomic_exchange:
   case nir_intrinsic_bindless_image_atomic_exchange:
      atomic = ir3_ATOMIC_XCHG_G(b, ibo, 0, src0, 0, src1, 0);
      break;
   case nir_intrinsic_image_atomic_comp_swap:
   case nir_intrinsic_bindless_image_atomic_comp_swap:
      atomic = ir3_ATOMIC_CMPXCHG_G(b, ibo, 0, src0, 0, src1, 0);
      break;
   default:
      unreachable("boo");
   }

   atomic->cat6.iim_val = 1;
   atomic->cat6.d = ncoords;
   atomic->cat6.type = ir3_get_type_for_image_intrinsic(intr);
   atomic->cat6.typed = true;
   atomic->barrier_class = IR3_BARRIER_IMAGE_W;
   atomic->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
   ir3_handle_bindless_cat6(atomic, intr->src[0]);

   /* even if nothing consume the result, we can't DCE the instruction: */
   array_insert(b, b->keeps, atomic);

   atomic->dsts[0]->wrmask = src1->dsts[0]->wrmask;
   ir3_reg_tie(atomic->dsts[0], atomic->srcs[2]);
   struct ir3_instruction *split;
   ir3_split_dest(b, &split, atomic, 0, 1);
   return split;
}
static void
emit_intrinsic_image_size(struct ir3_context *ctx, nir_intrinsic_instr *intr,
                          struct ir3_instruction **dst)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *ibo = ir3_image_to_ibo(ctx, intr->src[0]);
   struct ir3_instruction *resinfo = ir3_RESINFO(b, ibo, 0);
   resinfo->cat6.iim_val = 1;
   resinfo->cat6.d = intr->num_components;
   resinfo->cat6.type = TYPE_U32;
   resinfo->cat6.typed = false;
   /* resinfo has no writemask and always writes out 3 components: */
   compile_assert(ctx, intr->num_components <= 3);
   resinfo->dsts[0]->wrmask = MASK(3);
   ir3_handle_bindless_cat6(resinfo, intr->src[0]);

   ir3_split_dest(b, dst, resinfo, 0, intr->num_components);
}
static void
emit_intrinsic_load_global_ir3(struct ir3_context *ctx,
                               nir_intrinsic_instr *intr,
                               struct ir3_instruction **dst)
{
   struct ir3_block *b = ctx->block;
   unsigned dest_components = nir_intrinsic_dest_components(intr);
   struct ir3_instruction *addr, *offset;

   addr = ir3_collect(ctx, ir3_get_src(ctx, &intr->src[0])[0],
                      ir3_get_src(ctx, &intr->src[0])[1]);

   offset = ir3_get_src(ctx, &intr->src[1])[0];

   struct ir3_instruction *load =
      ir3_LDG_A(b, addr, 0, offset, 0, create_immed(b, 0), 0,
                create_immed(b, 0), 0, create_immed(b, dest_components), 0);
   load->cat6.type = TYPE_U32;
   load->dsts[0]->wrmask = MASK(dest_components);

   load->barrier_class = IR3_BARRIER_BUFFER_R;
   load->barrier_conflict = IR3_BARRIER_BUFFER_W;

   ir3_split_dest(b, dst, load, 0, dest_components);
}
static void
emit_intrinsic_store_global_ir3(struct ir3_context *ctx, nir_intrinsic_instr *intr)
emit_intrinsic_store_global_ir3(struct ir3_context *ctx,
nir_intrinsic_instr *intr)
{
struct ir3_block *b = ctx->block;
struct ir3_instruction *value, *addr, *offset;
unsigned ncomp = nir_intrinsic_src_components(intr, 0);
struct ir3_block *b = ctx->block;
struct ir3_instruction *value, *addr, *offset;
unsigned ncomp = nir_intrinsic_src_components(intr, 0);
addr = ir3_collect(ctx,
ir3_get_src(ctx, &intr->src[1])[0],
ir3_get_src(ctx, &intr->src[1])[1]);
addr = ir3_collect(ctx, ir3_get_src(ctx, &intr->src[1])[0],
ir3_get_src(ctx, &intr->src[1])[1]);
offset = ir3_get_src(ctx, &intr->src[2])[0];
offset = ir3_get_src(ctx, &intr->src[2])[0];
value = ir3_create_collect(ctx, ir3_get_src(ctx, &intr->src[0]), ncomp);
value = ir3_create_collect(ctx, ir3_get_src(ctx, &intr->src[0]), ncomp);
struct ir3_instruction *stg =
ir3_STG_A(b,
addr, 0,
offset, 0,
create_immed(b, 0), 0,
create_immed(b, 0), 0,
value, 0,
create_immed(b, ncomp), 0);
stg->cat6.type = TYPE_U32;
stg->cat6.iim_val = 1;
struct ir3_instruction *stg =
ir3_STG_A(b, addr, 0, offset, 0, create_immed(b, 0), 0,
create_immed(b, 0), 0, value, 0, create_immed(b, ncomp), 0);
stg->cat6.type = TYPE_U32;
stg->cat6.iim_val = 1;
array_insert(b, b->keeps, stg);
array_insert(b, b->keeps, stg);
stg->barrier_class = IR3_BARRIER_BUFFER_W;
stg->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
stg->barrier_class = IR3_BARRIER_BUFFER_W;
stg->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
}
const struct ir3_context_funcs ir3_a6xx_funcs = {
.emit_intrinsic_load_ssbo = emit_intrinsic_load_ssbo,
.emit_intrinsic_store_ssbo = emit_intrinsic_store_ssbo,
.emit_intrinsic_atomic_ssbo = emit_intrinsic_atomic_ssbo,
.emit_intrinsic_load_image = emit_intrinsic_load_image,
.emit_intrinsic_store_image = emit_intrinsic_store_image,
.emit_intrinsic_atomic_image = emit_intrinsic_atomic_image,
.emit_intrinsic_image_size = emit_intrinsic_image_size,
.emit_intrinsic_load_global_ir3 = emit_intrinsic_load_global_ir3,
.emit_intrinsic_store_global_ir3 = emit_intrinsic_store_global_ir3,
.emit_intrinsic_load_ssbo = emit_intrinsic_load_ssbo,
.emit_intrinsic_store_ssbo = emit_intrinsic_store_ssbo,
.emit_intrinsic_atomic_ssbo = emit_intrinsic_atomic_ssbo,
.emit_intrinsic_load_image = emit_intrinsic_load_image,
.emit_intrinsic_store_image = emit_intrinsic_store_image,
.emit_intrinsic_atomic_image = emit_intrinsic_atomic_image,
.emit_intrinsic_image_size = emit_intrinsic_image_size,
.emit_intrinsic_load_global_ir3 = emit_intrinsic_load_global_ir3,
.emit_intrinsic_store_global_ir3 = emit_intrinsic_store_global_ir3,
};


@@ -42,270 +42,274 @@
* so that we don't have to rewrite (and keep track of) users.
*/
#include "ir3.h"
#include <stdlib.h>
#include "ir3.h"
struct array_state {
struct ir3_register *live_in_definition;
struct ir3_register *live_out_definition;
bool constructed;
bool optimized;
struct ir3_register *live_in_definition;
struct ir3_register *live_out_definition;
bool constructed;
bool optimized;
};
struct array_ctx {
struct array_state *states;
struct ir3 *ir;
unsigned array_count;
struct array_state *states;
struct ir3 *ir;
unsigned array_count;
};
static struct array_state *
get_state(struct array_ctx *ctx, struct ir3_block *block, unsigned id)
{
return &ctx->states[ctx->array_count * block->index + id];
return &ctx->states[ctx->array_count * block->index + id];
}
static struct ir3_register *
read_value_beginning(struct array_ctx *ctx, struct ir3_block *block, struct ir3_array *arr);
static struct ir3_register *read_value_beginning(struct array_ctx *ctx,
struct ir3_block *block,
struct ir3_array *arr);
static struct ir3_register *
read_value_end(struct array_ctx *ctx, struct ir3_block *block, struct ir3_array *arr)
read_value_end(struct array_ctx *ctx, struct ir3_block *block,
struct ir3_array *arr)
{
struct array_state *state = get_state(ctx, block, arr->id);
if (state->live_out_definition)
return state->live_out_definition;
struct array_state *state = get_state(ctx, block, arr->id);
if (state->live_out_definition)
return state->live_out_definition;
state->live_out_definition = read_value_beginning(ctx, block, arr);
return state->live_out_definition;
state->live_out_definition = read_value_beginning(ctx, block, arr);
return state->live_out_definition;
}
/* Roughly equivalent to readValueRecursive from the paper: */
static struct ir3_register *
read_value_beginning(struct array_ctx *ctx, struct ir3_block *block, struct ir3_array *arr)
read_value_beginning(struct array_ctx *ctx, struct ir3_block *block,
struct ir3_array *arr)
{
struct array_state *state = get_state(ctx, block, arr->id);
struct array_state *state = get_state(ctx, block, arr->id);
if (state->constructed)
return state->live_in_definition;
if (state->constructed)
return state->live_in_definition;
if (block->predecessors_count == 0) {
state->constructed = true;
return NULL;
}
if (block->predecessors_count == 0) {
state->constructed = true;
return NULL;
}
if (block->predecessors_count == 1) {
state->live_in_definition = read_value_end(ctx, block->predecessors[0], arr);
state->constructed = true;
return state->live_in_definition;
}
if (block->predecessors_count == 1) {
state->live_in_definition =
read_value_end(ctx, block->predecessors[0], arr);
state->constructed = true;
return state->live_in_definition;
}
unsigned flags = IR3_REG_ARRAY | (arr->half ? IR3_REG_HALF : 0);
struct ir3_instruction *phi =
ir3_instr_create(block, OPC_META_PHI, 1, block->predecessors_count);
list_del(&phi->node);
list_add(&phi->node, &block->instr_list);
unsigned flags = IR3_REG_ARRAY | (arr->half ? IR3_REG_HALF : 0);
struct ir3_instruction *phi =
ir3_instr_create(block, OPC_META_PHI, 1, block->predecessors_count);
list_del(&phi->node);
list_add(&phi->node, &block->instr_list);
struct ir3_register *dst = __ssa_dst(phi);
dst->flags |= flags;
dst->array.id = arr->id;
dst->size = arr->length;
struct ir3_register *dst = __ssa_dst(phi);
dst->flags |= flags;
dst->array.id = arr->id;
dst->size = arr->length;
state->live_in_definition = phi->dsts[0];
state->constructed = true;
state->live_in_definition = phi->dsts[0];
state->constructed = true;
for (unsigned i = 0; i < block->predecessors_count; i++) {
struct ir3_register *src = read_value_end(ctx, block->predecessors[i], arr);
struct ir3_register *src_reg;
if (src) {
src_reg = __ssa_src(phi, src->instr, flags);
} else {
src_reg = ir3_src_create(phi, INVALID_REG, flags | IR3_REG_SSA);
}
src_reg->array.id = arr->id;
src_reg->size = arr->length;
}
return phi->dsts[0];
for (unsigned i = 0; i < block->predecessors_count; i++) {
struct ir3_register *src =
read_value_end(ctx, block->predecessors[i], arr);
struct ir3_register *src_reg;
if (src) {
src_reg = __ssa_src(phi, src->instr, flags);
} else {
src_reg = ir3_src_create(phi, INVALID_REG, flags | IR3_REG_SSA);
}
src_reg->array.id = arr->id;
src_reg->size = arr->length;
}
return phi->dsts[0];
}
static struct ir3_register *
remove_trivial_phi(struct ir3_instruction *phi)
{
/* Break cycles */
if (phi->data)
return phi->data;
/* Break cycles */
if (phi->data)
return phi->data;
phi->data = phi->dsts[0];
phi->data = phi->dsts[0];
struct ir3_register *unique_def = NULL;
bool unique = true;
for (unsigned i = 0; i < phi->block->predecessors_count; i++) {
struct ir3_register *src = phi->srcs[i];
struct ir3_register *unique_def = NULL;
bool unique = true;
for (unsigned i = 0; i < phi->block->predecessors_count; i++) {
struct ir3_register *src = phi->srcs[i];
/* If there are any undef sources, then the remaining sources may not
* dominate the phi node, even if they are all equal. So we need to
* bail out in this case.
*
* This seems to be a bug in the original paper.
*/
if (!src->def) {
unique = false;
break;
}
/* If there are any undef sources, then the remaining sources may not
* dominate the phi node, even if they are all equal. So we need to
* bail out in this case.
*
* This seems to be a bug in the original paper.
*/
if (!src->def) {
unique = false;
break;
}
struct ir3_instruction *src_instr = src->def->instr;
struct ir3_instruction *src_instr = src->def->instr;
/* phi sources which point to the phi itself don't count for
* figuring out if the phi is trivial
*/
if (src_instr == phi)
continue;
/* phi sources which point to the phi itself don't count for
* figuring out if the phi is trivial
*/
if (src_instr == phi)
continue;
if (src_instr->opc == OPC_META_PHI) {
src->def = remove_trivial_phi(src->def->instr);
}
if (src_instr->opc == OPC_META_PHI) {
src->def = remove_trivial_phi(src->def->instr);
}
if (unique_def) {
if (unique_def != src->def) {
unique = false;
break;
}
} else {
unique_def = src->def;
}
}
if (unique_def) {
if (unique_def != src->def) {
unique = false;
break;
}
} else {
unique_def = src->def;
}
}
if (unique) {
phi->data = unique_def;
return unique_def;
} else {
return phi->dsts[0];
}
if (unique) {
phi->data = unique_def;
return unique_def;
} else {
return phi->dsts[0];
}
}
static struct ir3_register *
lookup_value(struct ir3_register *reg)
{
if (reg->instr->opc == OPC_META_PHI)
return reg->instr->data;
return reg;
if (reg->instr->opc == OPC_META_PHI)
return reg->instr->data;
return reg;
}
static struct ir3_register *
lookup_live_in(struct array_ctx *ctx, struct ir3_block *block, unsigned id)
{
struct array_state *state = get_state(ctx, block, id);
if (state->live_in_definition)
return lookup_value(state->live_in_definition);
struct array_state *state = get_state(ctx, block, id);
if (state->live_in_definition)
return lookup_value(state->live_in_definition);
return NULL;
return NULL;
}
bool
ir3_array_to_ssa(struct ir3 *ir)
{
struct array_ctx ctx = {};
struct array_ctx ctx = {};
foreach_array (array, &ir->array_list) {
ctx.array_count = MAX2(ctx.array_count, array->id + 1);
}
foreach_array (array, &ir->array_list) {
ctx.array_count = MAX2(ctx.array_count, array->id + 1);
}
if (ctx.array_count == 0)
return false;
if (ctx.array_count == 0)
return false;
unsigned i = 0;
foreach_block (block, &ir->block_list) {
block->index = i++;
}
unsigned i = 0;
foreach_block (block, &ir->block_list) {
block->index = i++;
}
ctx.ir = ir;
ctx.states = calloc(ctx.array_count * i, sizeof(struct array_state));
ctx.ir = ir;
ctx.states = calloc(ctx.array_count * i, sizeof(struct array_state));
foreach_block (block, &ir->block_list) {
foreach_instr (instr, &block->instr_list) {
foreach_dst (dst, instr) {
if (dst->flags & IR3_REG_ARRAY) {
struct array_state *state =
get_state(&ctx, block, dst->array.id);
state->live_out_definition = dst;
}
}
}
}
foreach_block (block, &ir->block_list) {
foreach_instr (instr, &block->instr_list) {
foreach_dst (dst, instr) {
if (dst->flags & IR3_REG_ARRAY) {
struct array_state *state =
get_state(&ctx, block, dst->array.id);
state->live_out_definition = dst;
}
}
}
}
foreach_block (block, &ir->block_list) {
foreach_instr (instr, &block->instr_list) {
if (instr->opc == OPC_META_PHI)
continue;
foreach_block (block, &ir->block_list) {
foreach_instr (instr, &block->instr_list) {
if (instr->opc == OPC_META_PHI)
continue;
foreach_dst (reg, instr) {
if ((reg->flags & IR3_REG_ARRAY) && !reg->tied) {
struct ir3_array *arr = ir3_lookup_array(ir, reg->array.id);
foreach_dst (reg, instr) {
if ((reg->flags & IR3_REG_ARRAY) && !reg->tied) {
struct ir3_array *arr = ir3_lookup_array(ir, reg->array.id);
/* Construct any phi nodes necessary to read this value */
read_value_beginning(&ctx, block, arr);
}
}
foreach_src (reg, instr) {
if ((reg->flags & IR3_REG_ARRAY) && !reg->def) {
struct ir3_array *arr = ir3_lookup_array(ir, reg->array.id);
/* Construct any phi nodes necessary to read this value */
read_value_beginning(&ctx, block, arr);
}
}
foreach_src (reg, instr) {
if ((reg->flags & IR3_REG_ARRAY) && !reg->def) {
struct ir3_array *arr = ir3_lookup_array(ir, reg->array.id);
/* Construct any phi nodes necessary to read this value */
read_value_beginning(&ctx, block, arr);
}
}
}
}
/* Construct any phi nodes necessary to read this value */
read_value_beginning(&ctx, block, arr);
}
}
}
}
foreach_block (block, &ir->block_list) {
foreach_instr_safe (instr, &block->instr_list) {
if (instr->opc == OPC_META_PHI)
remove_trivial_phi(instr);
else
break;
}
}
foreach_block (block, &ir->block_list) {
foreach_instr_safe (instr, &block->instr_list) {
if (instr->opc == OPC_META_PHI)
remove_trivial_phi(instr);
else
break;
}
}
foreach_block (block, &ir->block_list) {
foreach_instr_safe (instr, &block->instr_list) {
if (instr->opc == OPC_META_PHI) {
if (!(instr->flags & IR3_REG_ARRAY))
continue;
if (instr->data != instr->dsts[0]) {
list_del(&instr->node);
continue;
}
for (unsigned i = 0; i < instr->srcs_count; i++) {
instr->srcs[i] = lookup_value(instr->srcs[i]);
}
} else {
foreach_dst (reg, instr) {
if ((reg->flags & IR3_REG_ARRAY)) {
if (!reg->tied) {
struct ir3_register *def =
lookup_live_in(&ctx, block, reg->array.id);
if (def)
ir3_reg_set_last_array(instr, reg, def);
}
reg->flags |= IR3_REG_SSA;
}
}
foreach_src (reg, instr) {
if ((reg->flags & IR3_REG_ARRAY)) {
/* It is assumed that before calling
* ir3_array_to_ssa(), reg->def was set to the
* previous writer of the array within the current
* block or NULL if none.
*/
if (!reg->def) {
reg->def = lookup_live_in(&ctx, block, reg->array.id);
}
reg->flags |= IR3_REG_SSA;
}
}
}
}
}
foreach_block (block, &ir->block_list) {
foreach_instr_safe (instr, &block->instr_list) {
if (instr->opc == OPC_META_PHI) {
if (!(instr->flags & IR3_REG_ARRAY))
continue;
if (instr->data != instr->dsts[0]) {
list_del(&instr->node);
continue;
}
for (unsigned i = 0; i < instr->srcs_count; i++) {
instr->srcs[i] = lookup_value(instr->srcs[i]);
}
} else {
foreach_dst (reg, instr) {
if ((reg->flags & IR3_REG_ARRAY)) {
if (!reg->tied) {
struct ir3_register *def =
lookup_live_in(&ctx, block, reg->array.id);
if (def)
ir3_reg_set_last_array(instr, reg, def);
}
reg->flags |= IR3_REG_SSA;
}
}
foreach_src (reg, instr) {
if ((reg->flags & IR3_REG_ARRAY)) {
/* It is assumed that before calling
* ir3_array_to_ssa(), reg->def was set to the
* previous writer of the array within the current
* block or NULL if none.
*/
if (!reg->def) {
reg->def = lookup_live_in(&ctx, block, reg->array.id);
}
reg->flags |= IR3_REG_SSA;
}
}
}
}
}
free(ctx.states);
return true;
free(ctx.states);
return true;
}


@@ -22,8 +22,8 @@
*/
#include "ir3_assembler.h"
#include "ir3_shader.h"
#include "ir3_parser.h"
#include "ir3_shader.h"
/**
* A helper to go from ir3 assembly to assembled shader. The shader has a
@@ -32,43 +32,43 @@
struct ir3_shader *
ir3_parse_asm(struct ir3_compiler *c, struct ir3_kernel_info *info, FILE *in)
{
struct ir3_shader *shader = rzalloc_size(NULL, sizeof(*shader));
shader->compiler = c;
shader->type = MESA_SHADER_COMPUTE;
mtx_init(&shader->variants_lock, mtx_plain);
struct ir3_shader *shader = rzalloc_size(NULL, sizeof(*shader));
shader->compiler = c;
shader->type = MESA_SHADER_COMPUTE;
mtx_init(&shader->variants_lock, mtx_plain);
struct ir3_shader_variant *v = rzalloc_size(shader, sizeof(*v));
v->type = MESA_SHADER_COMPUTE;
v->shader = shader;
v->const_state = rzalloc_size(v, sizeof(*v->const_state));
struct ir3_shader_variant *v = rzalloc_size(shader, sizeof(*v));
v->type = MESA_SHADER_COMPUTE;
v->shader = shader;
v->const_state = rzalloc_size(v, sizeof(*v->const_state));
shader->variants = v;
shader->variant_count = 1;
shader->variants = v;
shader->variant_count = 1;
info->numwg = INVALID_REG;
info->numwg = INVALID_REG;
for (int i = 0; i < MAX_BUFS; i++) {
info->buf_addr_regs[i] = INVALID_REG;
}
for (int i = 0; i < MAX_BUFS; i++) {
info->buf_addr_regs[i] = INVALID_REG;
}
/* Provide a default local_size in case the shader doesn't set it, so that
* at least we don't crash.
*/
v->local_size[0] = v->local_size[1] = v->local_size[2] = 1;
/* Provide a default local_size in case the shader doesn't set it, so that
* at least we don't crash.
*/
v->local_size[0] = v->local_size[1] = v->local_size[2] = 1;
v->ir = ir3_parse(v, info, in);
if (!v->ir)
goto error;
v->ir = ir3_parse(v, info, in);
if (!v->ir)
goto error;
ir3_debug_print(v->ir, "AFTER PARSING");
ir3_debug_print(v->ir, "AFTER PARSING");
v->bin = ir3_shader_assemble(v);
if (!v->bin)
goto error;
v->bin = ir3_shader_assemble(v);
if (!v->bin)
goto error;
return shader;
return shader;
error:
ralloc_free(shader);
return NULL;
ralloc_free(shader);
return NULL;
}


@@ -30,17 +30,18 @@
#define MAX_BUFS 4
struct ir3_kernel_info {
uint32_t num_bufs;
uint32_t buf_sizes[MAX_BUFS]; /* size in dwords */
uint32_t buf_addr_regs[MAX_BUFS];
uint32_t num_bufs;
uint32_t buf_sizes[MAX_BUFS]; /* size in dwords */
uint32_t buf_addr_regs[MAX_BUFS];
/* driver-param uniforms: */
unsigned numwg;
/* driver-param uniforms: */
unsigned numwg;
};
struct ir3_shader;
struct ir3_compiler;
struct ir3_shader * ir3_parse_asm(struct ir3_compiler *c, struct ir3_kernel_info *info, FILE *in);
struct ir3_shader *ir3_parse_asm(struct ir3_compiler *c,
struct ir3_kernel_info *info, FILE *in);
#endif /* __IR3_ASSEMBLER_H__ */


@@ -26,75 +26,74 @@
#include "ir3.h"
static bool
is_safe_conv(struct ir3_instruction *instr, type_t src_type,
opc_t *src_opc)
is_safe_conv(struct ir3_instruction *instr, type_t src_type, opc_t *src_opc)
{
if (instr->opc != OPC_MOV)
return false;
if (instr->opc != OPC_MOV)
return false;
/* Only allow half->full or full->half without any type conversion (like
* int to float).
*/
if (type_size(instr->cat1.src_type) == type_size(instr->cat1.dst_type) ||
full_type(instr->cat1.src_type) != full_type(instr->cat1.dst_type))
return false;
/* Only allow half->full or full->half without any type conversion (like
* int to float).
*/
if (type_size(instr->cat1.src_type) == type_size(instr->cat1.dst_type) ||
full_type(instr->cat1.src_type) != full_type(instr->cat1.dst_type))
return false;
struct ir3_register *dst = instr->dsts[0];
struct ir3_register *src = instr->srcs[0];
struct ir3_register *dst = instr->dsts[0];
struct ir3_register *src = instr->srcs[0];
/* disallow conversions that cannot be folded into
* alu instructions:
*/
if (instr->cat1.round != ROUND_ZERO)
return false;
/* disallow conversions that cannot be folded into
* alu instructions:
*/
if (instr->cat1.round != ROUND_ZERO)
return false;
if (dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY))
return false;
if (src->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY))
return false;
if (dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY))
return false;
if (src->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY))
return false;
/* Check that the source of the conv matches the type of the src
* instruction.
*/
if (src_type == instr->cat1.src_type)
return true;
/* Check that the source of the conv matches the type of the src
* instruction.
*/
if (src_type == instr->cat1.src_type)
return true;
/* We can handle mismatches with integer types by converting the opcode
* but not when an integer is reinterpreted as a float or vice-versa.
*/
if (type_float(src_type) != type_float(instr->cat1.src_type))
return false;
/* We can handle mismatches with integer types by converting the opcode
* but not when an integer is reinterpreted as a float or vice-versa.
*/
if (type_float(src_type) != type_float(instr->cat1.src_type))
return false;
/* We have types with mismatched signedness. Mismatches on the signedness
* don't matter when narrowing:
*/
if (type_size(instr->cat1.dst_type) < type_size(instr->cat1.src_type))
return true;
/* We have types with mismatched signedness. Mismatches on the signedness
* don't matter when narrowing:
*/
if (type_size(instr->cat1.dst_type) < type_size(instr->cat1.src_type))
return true;
/* Try swapping the opcode: */
bool can_swap = true;
*src_opc = ir3_try_swap_signedness(*src_opc, &can_swap);
return can_swap;
/* Try swapping the opcode: */
bool can_swap = true;
*src_opc = ir3_try_swap_signedness(*src_opc, &can_swap);
return can_swap;
}
static bool
all_uses_safe_conv(struct ir3_instruction *conv_src, type_t src_type)
{
opc_t opc = conv_src->opc;
bool first = true;
foreach_ssa_use (use, conv_src) {
opc_t new_opc = opc;
if (!is_safe_conv(use, src_type, &new_opc))
return false;
/* Check if multiple uses have conflicting requirements on the opcode.
*/
if (!first && opc != new_opc)
return false;
first = false;
opc = new_opc;
}
conv_src->opc = opc;
return true;
opc_t opc = conv_src->opc;
bool first = true;
foreach_ssa_use (use, conv_src) {
opc_t new_opc = opc;
if (!is_safe_conv(use, src_type, &new_opc))
return false;
/* Check if multiple uses have conflicting requirements on the opcode.
*/
if (!first && opc != new_opc)
return false;
first = false;
opc = new_opc;
}
conv_src->opc = opc;
return true;
}
/* For an instruction which has a conversion folded in, re-write the
@@ -105,74 +104,74 @@ all_uses_safe_conv(struct ir3_instruction *conv_src, type_t src_type)
static void
rewrite_src_uses(struct ir3_instruction *src)
{
foreach_ssa_use (use, src) {
assert(use->opc == OPC_MOV);
foreach_ssa_use (use, src) {
assert(use->opc == OPC_MOV);
if (is_half(src)) {
use->srcs[0]->flags |= IR3_REG_HALF;
} else {
use->srcs[0]->flags &= ~IR3_REG_HALF;
}
if (is_half(src)) {
use->srcs[0]->flags |= IR3_REG_HALF;
} else {
use->srcs[0]->flags &= ~IR3_REG_HALF;
}
use->cat1.src_type = use->cat1.dst_type;
}
use->cat1.src_type = use->cat1.dst_type;
}
}
static bool
try_conversion_folding(struct ir3_instruction *conv)
{
struct ir3_instruction *src;
struct ir3_instruction *src;
if (conv->opc != OPC_MOV)
return false;
if (conv->opc != OPC_MOV)
return false;
/* NOTE: we can have non-ssa srcs after copy propagation: */
src = ssa(conv->srcs[0]);
if (!src)
return false;
/* NOTE: we can have non-ssa srcs after copy propagation: */
src = ssa(conv->srcs[0]);
if (!src)
return false;
if (!is_alu(src))
return false;
if (!is_alu(src))
return false;
bool can_fold;
type_t base_type = ir3_output_conv_type(src, &can_fold);
if (!can_fold)
return false;
bool can_fold;
type_t base_type = ir3_output_conv_type(src, &can_fold);
if (!can_fold)
return false;
type_t src_type = ir3_output_conv_src_type(src, base_type);
type_t dst_type = ir3_output_conv_dst_type(src, base_type);
type_t src_type = ir3_output_conv_src_type(src, base_type);
type_t dst_type = ir3_output_conv_dst_type(src, base_type);
/* Avoid cases where we've already folded in a conversion. We assume that
* if there is a chain of conversions that's foldable then it's been
* folded in NIR already.
*/
if (src_type != dst_type)
return false;
/* Avoid cases where we've already folded in a conversion. We assume that
* if there is a chain of conversions that's foldable then it's been
* folded in NIR already.
*/
if (src_type != dst_type)
return false;
if (!all_uses_safe_conv(src, src_type))
return false;
if (!all_uses_safe_conv(src, src_type))
return false;
ir3_set_dst_type(src, is_half(conv));
rewrite_src_uses(src);
ir3_set_dst_type(src, is_half(conv));
rewrite_src_uses(src);
return true;
return true;
}
bool
ir3_cf(struct ir3 *ir)
{
void *mem_ctx = ralloc_context(NULL);
bool progress = false;
void *mem_ctx = ralloc_context(NULL);
bool progress = false;
ir3_find_ssa_uses(ir, mem_ctx, false);
ir3_find_ssa_uses(ir, mem_ctx, false);
foreach_block (block, &ir->block_list) {
foreach_instr (instr, &block->instr_list) {
progress |= try_conversion_folding(instr);
}
}
foreach_block (block, &ir->block_list) {
foreach_instr (instr, &block->instr_list) {
progress |= try_conversion_folding(instr);
}
}
ralloc_free(mem_ctx);
ralloc_free(mem_ctx);
return progress;
return progress;
}


@@ -51,8 +51,10 @@ static const struct debug_named_value shader_debug_options[] = {
/* clang-format on */
};
DEBUG_GET_ONCE_FLAGS_OPTION(ir3_shader_debug, "IR3_SHADER_DEBUG", shader_debug_options, 0)
DEBUG_GET_ONCE_OPTION(ir3_shader_override_path, "IR3_SHADER_OVERRIDE_PATH", NULL)
DEBUG_GET_ONCE_FLAGS_OPTION(ir3_shader_debug, "IR3_SHADER_DEBUG",
shader_debug_options, 0)
DEBUG_GET_ONCE_OPTION(ir3_shader_override_path, "IR3_SHADER_OVERRIDE_PATH",
NULL)
enum ir3_shader_debug ir3_shader_debug = 0;
const char *ir3_shader_override_path = NULL;
@@ -60,126 +62,127 @@ const char *ir3_shader_override_path = NULL;
void
ir3_compiler_destroy(struct ir3_compiler *compiler)
{
disk_cache_destroy(compiler->disk_cache);
ralloc_free(compiler);
disk_cache_destroy(compiler->disk_cache);
ralloc_free(compiler);
}
struct ir3_compiler *
ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id, bool robust_ubo_access)
ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id,
bool robust_ubo_access)
{
struct ir3_compiler *compiler = rzalloc(NULL, struct ir3_compiler);
struct ir3_compiler *compiler = rzalloc(NULL, struct ir3_compiler);
ir3_shader_debug = debug_get_option_ir3_shader_debug();
ir3_shader_override_path =
!__check_suid() ? debug_get_option_ir3_shader_override_path() : NULL;
ir3_shader_debug = debug_get_option_ir3_shader_debug();
ir3_shader_override_path =
!__check_suid() ? debug_get_option_ir3_shader_override_path() : NULL;
if (ir3_shader_override_path) {
ir3_shader_debug |= IR3_DBG_NOCACHE;
}
if (ir3_shader_override_path) {
ir3_shader_debug |= IR3_DBG_NOCACHE;
}
compiler->dev = dev;
compiler->gpu_id = gpu_id;
compiler->robust_ubo_access = robust_ubo_access;
compiler->dev = dev;
compiler->gpu_id = gpu_id;
compiler->robust_ubo_access = robust_ubo_access;
/* All known GPU's have 32k local memory (aka shared) */
compiler->local_mem_size = 32 * 1024;
/* TODO see if older GPU's were different here */
compiler->branchstack_size = 64;
compiler->wave_granularity = 2;
compiler->max_waves = 16;
/* All known GPU's have 32k local memory (aka shared) */
compiler->local_mem_size = 32 * 1024;
/* TODO see if older GPU's were different here */
compiler->branchstack_size = 64;
compiler->wave_granularity = 2;
compiler->max_waves = 16;
if (compiler->gpu_id >= 600) {
compiler->samgq_workaround = true;
/* a6xx split the pipeline state into geometry and fragment state, in
* order to let the VS run ahead of the FS. As a result there are now
* separate const files for the fragment shader and everything
* else, and separate limits. There seems to be a shared limit, but
* it's higher than the vert or frag limits.
*
* TODO: The shared limit seems to be different on different models.
*/
compiler->max_const_pipeline = 640;
compiler->max_const_frag = 512;
compiler->max_const_geom = 512;
compiler->max_const_safe = 128;
if (compiler->gpu_id >= 600) {
compiler->samgq_workaround = true;
/* a6xx split the pipeline state into geometry and fragment state, in
* order to let the VS run ahead of the FS. As a result there are now
* separate const files for the fragment shader and everything
* else, and separate limits. There seems to be a shared limit, but
* it's higher than the vert or frag limits.
*
* TODO: The shared limit seems to be different on different models.
*/
compiler->max_const_pipeline = 640;
compiler->max_const_frag = 512;
compiler->max_const_geom = 512;
compiler->max_const_safe = 128;
/* Compute shaders don't share a const file with the FS. Instead they
* have their own file, which is smaller than the FS one.
*
* TODO: is this true on earlier gen's?
*/
compiler->max_const_compute = 256;
/* Compute shaders don't share a const file with the FS. Instead they
* have their own file, which is smaller than the FS one.
*
* TODO: is this true on earlier gen's?
*/
compiler->max_const_compute = 256;
/* TODO: implement clip+cull distances on earlier gen's */
compiler->has_clip_cull = true;
/* TODO: implement clip+cull distances on earlier gen's */
compiler->has_clip_cull = true;
/* TODO: implement private memory on earlier gen's */
compiler->has_pvtmem = true;
/* TODO: implement private memory on earlier gen's */
compiler->has_pvtmem = true;
if (compiler->gpu_id == 650)
compiler->tess_use_shared = true;
} else {
compiler->max_const_pipeline = 512;
compiler->max_const_geom = 512;
compiler->max_const_frag = 512;
compiler->max_const_compute = 512;
if (compiler->gpu_id == 650)
compiler->tess_use_shared = true;
} else {
compiler->max_const_pipeline = 512;
compiler->max_const_geom = 512;
compiler->max_const_frag = 512;
compiler->max_const_compute = 512;
/* Note: this will have to change if/when we support tess+GS on
* earlier gen's.
*/
compiler->max_const_safe = 256;
}
/* Note: this will have to change if/when we support tess+GS on
* earlier gen's.
*/
compiler->max_const_safe = 256;
}
if (compiler->gpu_id == 650) {
/* This changed mid-generation for a650, so that using r32.x and above
* requires using the smallest threadsize.
*/
compiler->reg_size_vec4 = 64;
} else if (compiler->gpu_id >= 600) {
compiler->reg_size_vec4 = 96;
} else if (compiler->gpu_id >= 400) {
/* On a4xx-a5xx, using r24.x and above requires using the smallest
* threadsize.
*/
compiler->reg_size_vec4 = 48;
} else {
/* TODO: confirm this */
compiler->reg_size_vec4 = 96;
}
if (compiler->gpu_id == 650) {
/* This changed mid-generation for a650, so that using r32.x and above
* requires using the smallest threadsize.
*/
compiler->reg_size_vec4 = 64;
} else if (compiler->gpu_id >= 600) {
compiler->reg_size_vec4 = 96;
} else if (compiler->gpu_id >= 400) {
/* On a4xx-a5xx, using r24.x and above requires using the smallest
* threadsize.
*/
compiler->reg_size_vec4 = 48;
} else {
/* TODO: confirm this */
compiler->reg_size_vec4 = 96;
}
if (compiler->gpu_id >= 600) {
compiler->threadsize_base = 64;
} else if (compiler->gpu_id >= 400) {
/* TODO: Confirm this for a4xx. For a5xx this is based on the Vulkan
* 1.1 subgroupSize which is 32.
*/
compiler->threadsize_base = 32;
} else {
compiler->threadsize_base = 8;
}
if (compiler->gpu_id >= 600) {
compiler->threadsize_base = 64;
} else if (compiler->gpu_id >= 400) {
/* TODO: Confirm this for a4xx. For a5xx this is based on the Vulkan
* 1.1 subgroupSize which is 32.
*/
compiler->threadsize_base = 32;
} else {
compiler->threadsize_base = 8;
}
if (compiler->gpu_id >= 400) {
/* need special handling for "flat" */
compiler->flat_bypass = true;
compiler->levels_add_one = false;
compiler->unminify_coords = false;
compiler->txf_ms_with_isaml = false;
compiler->array_index_add_half = true;
compiler->instr_align = 16;
compiler->const_upload_unit = 4;
} else {
/* no special handling for "flat" */
compiler->flat_bypass = false;
compiler->levels_add_one = true;
compiler->unminify_coords = true;
compiler->txf_ms_with_isaml = true;
compiler->array_index_add_half = false;
compiler->instr_align = 4;
compiler->const_upload_unit = 8;
}
if (compiler->gpu_id >= 400) {
/* need special handling for "flat" */
compiler->flat_bypass = true;
compiler->levels_add_one = false;
compiler->unminify_coords = false;
compiler->txf_ms_with_isaml = false;
compiler->array_index_add_half = true;
compiler->instr_align = 16;
compiler->const_upload_unit = 4;
} else {
/* no special handling for "flat" */
compiler->flat_bypass = false;
compiler->levels_add_one = true;
compiler->unminify_coords = true;
compiler->txf_ms_with_isaml = true;
compiler->array_index_add_half = false;
compiler->instr_align = 4;
compiler->const_upload_unit = 8;
}
ir3_disk_cache_init(compiler);
ir3_disk_cache_init(compiler);
return compiler;
return compiler;
}


@@ -36,167 +36,167 @@ struct ir3_ra_reg_set;
struct ir3_shader;
struct ir3_compiler {
struct fd_device *dev;
uint32_t gpu_id;
uint32_t shader_count;
struct fd_device *dev;
uint32_t gpu_id;
uint32_t shader_count;
struct disk_cache *disk_cache;
struct disk_cache *disk_cache;
/* If true, UBO accesses are assumed to be bounds-checked as defined by
* VK_EXT_robustness2 and optimizations may have to be more conservative.
*/
bool robust_ubo_access;
/* If true, UBO accesses are assumed to be bounds-checked as defined by
* VK_EXT_robustness2 and optimizations may have to be more conservative.
*/
bool robust_ubo_access;
/*
* Configuration options for things that are handled differently on
* different generations:
*/
/*
* Configuration options for things that are handled differently on
* different generations:
*/
/* a4xx (and later) drops SP_FS_FLAT_SHAD_MODE_REG_* for flat-interpolate
* so we need to use ldlv.u32 to load the varying directly:
*/
bool flat_bypass;
/* a4xx (and later) drops SP_FS_FLAT_SHAD_MODE_REG_* for flat-interpolate
* so we need to use ldlv.u32 to load the varying directly:
*/
bool flat_bypass;
/* on a3xx, we need to add one to # of array levels:
*/
bool levels_add_one;
/* on a3xx, we need to add one to # of array levels:
*/
bool levels_add_one;
/* on a3xx, we need to scale up integer coords for isaml based
* on LoD:
*/
bool unminify_coords;
/* on a3xx, we need to scale up integer coords for isaml based
* on LoD:
*/
bool unminify_coords;
/* on a3xx do txf_ms w/ isaml and scaled coords: */
bool txf_ms_with_isaml;
/* on a3xx do txf_ms w/ isaml and scaled coords: */
bool txf_ms_with_isaml;
/* on a4xx, for array textures we need to add 0.5 to the array
* index coordinate:
*/
bool array_index_add_half;
/* on a4xx, for array textures we need to add 0.5 to the array
* index coordinate:
*/
bool array_index_add_half;
/* on a6xx, rewrite samgp to sequence of samgq0-3 in vertex shaders:
*/
bool samgq_workaround;
/* on a6xx, rewrite samgp to sequence of samgq0-3 in vertex shaders:
*/
bool samgq_workaround;
/* on a650, vertex shader <-> tess control io uses LDL/STL */
bool tess_use_shared;
/* on a650, vertex shader <-> tess control io uses LDL/STL */
bool tess_use_shared;
/* The maximum number of constants, in vec4's, across the entire graphics
* pipeline.
*/
uint16_t max_const_pipeline;
/* The maximum number of constants, in vec4's, across the entire graphics
* pipeline.
*/
uint16_t max_const_pipeline;
/* The maximum number of constants, in vec4's, for VS+HS+DS+GS. */
uint16_t max_const_geom;
/* The maximum number of constants, in vec4's, for VS+HS+DS+GS. */
uint16_t max_const_geom;
/* The maximum number of constants, in vec4's, for FS. */
uint16_t max_const_frag;
/* The maximum number of constants, in vec4's, for FS. */
uint16_t max_const_frag;
/* A "safe" max constlen that can be applied to each shader in the
* pipeline which we guarantee will never exceed any combined limits.
*/
uint16_t max_const_safe;
/* A "safe" max constlen that can be applied to each shader in the
* pipeline which we guarantee will never exceed any combined limits.
*/
uint16_t max_const_safe;
/* The maximum number of constants, in vec4's, for compute shaders. */
uint16_t max_const_compute;
/* The maximum number of constants, in vec4's, for compute shaders. */
uint16_t max_const_compute;
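/* Illustrative sketch only (not in the original source): these limits are in
 * vec4 units, so e.g. the a6xx max_const_pipeline of 640 vec4 set above
 * corresponds to 640 * 4 = 2560 dwords, i.e. 10 KiB of constants across the
 * whole graphics pipeline.
 */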
/* Number of instructions that the shader's base address and length
* (instrlen divides instruction count by this) must be aligned to.
*/
uint32_t instr_align;
/* Number of instructions that the shader's base address and length
* (instrlen divides instruction count by this) must be aligned to.
*/
uint32_t instr_align;
/* on a3xx, the unit of indirect const load is higher than later gens (in
* vec4 units):
*/
uint32_t const_upload_unit;
/* on a3xx, the unit of indirect const load is higher than later gens (in
* vec4 units):
*/
uint32_t const_upload_unit;
/* The base number of threads per wave. Some stages may be able to double
* this.
*/
uint32_t threadsize_base;
/* The base number of threads per wave. Some stages may be able to double
* this.
*/
uint32_t threadsize_base;
/* On at least a6xx, waves are always launched in pairs. In calculations
* about occupancy, we pretend that each wave pair is actually one wave,
* which simplifies many of the calculations, but means we have to
* multiply threadsize_base by this number.
*/
uint32_t wave_granularity;
/* On at least a6xx, waves are always launched in pairs. In calculations
* about occupancy, we pretend that each wave pair is actually one wave,
* which simplifies many of the calculations, but means we have to
* multiply threadsize_base by this number.
*/
uint32_t wave_granularity;
/* The maximum number of simultaneous waves per core. */
uint32_t max_waves;
/* The maximum number of simultaneous waves per core. */
uint32_t max_waves;
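/* Illustrative sketch only, assuming the a6xx defaults set in
 * ir3_compiler_create() above: max_waves = 16, wave_granularity = 2 and
 * threadsize_base = 64 give 16 * 2 * 64 = 2048 invocations in flight per core
 * at the base threadsize.
 */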
/* This is theoretical maximum number of vec4 registers that one wave of
* the base threadsize could use. To get the actual size of the register
* file in bytes one would need to compute:
*
* reg_size_vec4 * threadsize_base * wave_granularity * 16 (bytes per vec4)
*
* However this number is more often what we actually need. For example, a
* max_reg more than half of this will result in a doubled threadsize
* being impossible (because double-sized waves take up twice as many
* registers). Also, the formula for the occupancy given a particular
* register footprint is simpler.
*
* It is in vec4 units because the register file is allocated
* with vec4 granularity, so it's in the same units as max_reg.
*/
uint32_t reg_size_vec4;
/* This is theoretical maximum number of vec4 registers that one wave of
* the base threadsize could use. To get the actual size of the register
* file in bytes one would need to compute:
*
* reg_size_vec4 * threadsize_base * wave_granularity * 16 (bytes per vec4)
*
* However this number is more often what we actually need. For example, a
* max_reg more than half of this will result in a doubled threadsize
* being impossible (because double-sized waves take up twice as many
* registers). Also, the formula for the occupancy given a particular
* register footprint is simpler.
*
* It is in vec4 units because the register file is allocated
* with vec4 granularity, so it's in the same units as max_reg.
*/
uint32_t reg_size_vec4;
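/* Illustrative sketch only, plugging the a6xx defaults set above into the
 * formula from the comment (reg_size_vec4 = 96, threadsize_base = 64,
 * wave_granularity = 2):
 *
 *    96 * 64 * 2 * 16 bytes = 196608 bytes = 192 KiB of register file.
 */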
/* The size of local memory in bytes */
uint32_t local_mem_size;
/* The size of local memory in bytes */
uint32_t local_mem_size;
/* The number of total branch stack entries, divided by wave_granularity. */
uint32_t branchstack_size;
/* The number of total branch stack entries, divided by wave_granularity. */
uint32_t branchstack_size;
/* Whether clip+cull distances are supported */
bool has_clip_cull;
/* Whether clip+cull distances are supported */
bool has_clip_cull;
/* Whether private memory is supported */
bool has_pvtmem;
/* Whether private memory is supported */
bool has_pvtmem;
};
void ir3_compiler_destroy(struct ir3_compiler *compiler);
struct ir3_compiler * ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id,
bool robust_ubo_access);
struct ir3_compiler *ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id,
bool robust_ubo_access);
void ir3_disk_cache_init(struct ir3_compiler *compiler);
void ir3_disk_cache_init_shader_key(struct ir3_compiler *compiler,
struct ir3_shader *shader);
struct ir3_shader *shader);
bool ir3_disk_cache_retrieve(struct ir3_compiler *compiler,
struct ir3_shader_variant *v);
struct ir3_shader_variant *v);
void ir3_disk_cache_store(struct ir3_compiler *compiler,
struct ir3_shader_variant *v);
struct ir3_shader_variant *v);
int ir3_compile_shader_nir(struct ir3_compiler *compiler,
struct ir3_shader_variant *so);
struct ir3_shader_variant *so);
/* gpu pointer size in units of 32bit registers/slots */
static inline
unsigned ir3_pointer_size(struct ir3_compiler *compiler)
static inline unsigned
ir3_pointer_size(struct ir3_compiler *compiler)
{
return (compiler->gpu_id >= 500) ? 2 : 1;
return (compiler->gpu_id >= 500) ? 2 : 1;
}
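/* Illustrative note (assumption, not stated in the source): the two register
 * slots returned for gpu_id >= 500 correspond to a 64-bit GPU address held in
 * a pair of 32-bit registers, while earlier gens use a single 32-bit slot.
 */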
enum ir3_shader_debug {
IR3_DBG_SHADER_VS = BITFIELD_BIT(0),
IR3_DBG_SHADER_TCS = BITFIELD_BIT(1),
IR3_DBG_SHADER_TES = BITFIELD_BIT(2),
IR3_DBG_SHADER_GS = BITFIELD_BIT(3),
IR3_DBG_SHADER_FS = BITFIELD_BIT(4),
IR3_DBG_SHADER_CS = BITFIELD_BIT(5),
IR3_DBG_DISASM = BITFIELD_BIT(6),
IR3_DBG_OPTMSGS = BITFIELD_BIT(7),
IR3_DBG_FORCES2EN = BITFIELD_BIT(8),
IR3_DBG_NOUBOOPT = BITFIELD_BIT(9),
IR3_DBG_NOFP16 = BITFIELD_BIT(10),
IR3_DBG_NOCACHE = BITFIELD_BIT(11),
IR3_DBG_SHADER_VS = BITFIELD_BIT(0),
IR3_DBG_SHADER_TCS = BITFIELD_BIT(1),
IR3_DBG_SHADER_TES = BITFIELD_BIT(2),
IR3_DBG_SHADER_GS = BITFIELD_BIT(3),
IR3_DBG_SHADER_FS = BITFIELD_BIT(4),
IR3_DBG_SHADER_CS = BITFIELD_BIT(5),
IR3_DBG_DISASM = BITFIELD_BIT(6),
IR3_DBG_OPTMSGS = BITFIELD_BIT(7),
IR3_DBG_FORCES2EN = BITFIELD_BIT(8),
IR3_DBG_NOUBOOPT = BITFIELD_BIT(9),
IR3_DBG_NOFP16 = BITFIELD_BIT(10),
IR3_DBG_NOCACHE = BITFIELD_BIT(11),
/* DEBUG-only options: */
IR3_DBG_SCHEDMSGS = BITFIELD_BIT(20),
IR3_DBG_RAMSGS = BITFIELD_BIT(21),
/* DEBUG-only options: */
IR3_DBG_SCHEDMSGS = BITFIELD_BIT(20),
IR3_DBG_RAMSGS = BITFIELD_BIT(21),
/* Only used for the disk-caching logic: */
IR3_DBG_ROBUST_UBO_ACCESS = BITFIELD_BIT(30),
/* Only used for the disk-caching logic: */
IR3_DBG_ROBUST_UBO_ACCESS = BITFIELD_BIT(30),
};
extern enum ir3_shader_debug ir3_shader_debug;
@@ -205,29 +205,35 @@ extern const char *ir3_shader_override_path;
static inline bool
shader_debug_enabled(gl_shader_stage type)
{
if (ir3_shader_debug & IR3_DBG_DISASM)
return true;
if (ir3_shader_debug & IR3_DBG_DISASM)
return true;
switch (type) {
case MESA_SHADER_VERTEX: return !!(ir3_shader_debug & IR3_DBG_SHADER_VS);
case MESA_SHADER_TESS_CTRL: return !!(ir3_shader_debug & IR3_DBG_SHADER_TCS);
case MESA_SHADER_TESS_EVAL: return !!(ir3_shader_debug & IR3_DBG_SHADER_TES);
case MESA_SHADER_GEOMETRY: return !!(ir3_shader_debug & IR3_DBG_SHADER_GS);
case MESA_SHADER_FRAGMENT: return !!(ir3_shader_debug & IR3_DBG_SHADER_FS);
case MESA_SHADER_COMPUTE: return !!(ir3_shader_debug & IR3_DBG_SHADER_CS);
default:
debug_assert(0);
return false;
}
switch (type) {
case MESA_SHADER_VERTEX:
return !!(ir3_shader_debug & IR3_DBG_SHADER_VS);
case MESA_SHADER_TESS_CTRL:
return !!(ir3_shader_debug & IR3_DBG_SHADER_TCS);
case MESA_SHADER_TESS_EVAL:
return !!(ir3_shader_debug & IR3_DBG_SHADER_TES);
case MESA_SHADER_GEOMETRY:
return !!(ir3_shader_debug & IR3_DBG_SHADER_GS);
case MESA_SHADER_FRAGMENT:
return !!(ir3_shader_debug & IR3_DBG_SHADER_FS);
case MESA_SHADER_COMPUTE:
return !!(ir3_shader_debug & IR3_DBG_SHADER_CS);
default:
debug_assert(0);
return false;
}
}
static inline void
ir3_debug_print(struct ir3 *ir, const char *when)
{
if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
mesa_logi("%s:", when);
ir3_print(ir);
}
if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
mesa_logi("%s:", when);
ir3_print(ir);
}
}
#endif /* IR3_COMPILER_H_ */

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -27,215 +27,250 @@
#ifndef IR3_CONTEXT_H_
#define IR3_CONTEXT_H_
#include "ir3.h"
#include "ir3_compiler.h"
#include "ir3_nir.h"
#include "ir3.h"
/* for conditionally setting boolean flag(s): */
#define COND(bool, val) ((bool) ? (val) : 0)
#define DBG(fmt, ...) \
do { mesa_logd("%s:%d: "fmt, \
__FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0)
#define DBG(fmt, ...) \
do { \
mesa_logd("%s:%d: " fmt, __FUNCTION__, __LINE__, ##__VA_ARGS__); \
} while (0)
/**
* The context for compilation of a single shader.
*/
struct ir3_context {
struct ir3_compiler *compiler;
const struct ir3_context_funcs *funcs;
struct ir3_compiler *compiler;
const struct ir3_context_funcs *funcs;
struct nir_shader *s;
struct nir_shader *s;
struct nir_instr *cur_instr; /* current instruction, just for debug */
struct nir_instr *cur_instr; /* current instruction, just for debug */
struct ir3 *ir;
struct ir3_shader_variant *so;
struct ir3 *ir;
struct ir3_shader_variant *so;
/* Tables of scalar inputs/outputs. Because of the way varying packing
* works, we could have inputs w/ fractional location, which is a bit
* awkward to deal with unless we keep track of the split scalar in/
* out components.
*
* These *only* have inputs/outputs that are touched by load_*input and
* store_output.
*/
unsigned ninputs, noutputs;
struct ir3_instruction **inputs;
struct ir3_instruction **outputs;
/* Tables of scalar inputs/outputs. Because of the way varying packing
* works, we could have inputs w/ fractional location, which is a bit
* awkward to deal with unless we keep track of the split scalar in/
* out components.
*
* These *only* have inputs/outputs that are touched by load_*input and
* store_output.
*/
unsigned ninputs, noutputs;
struct ir3_instruction **inputs;
struct ir3_instruction **outputs;
struct ir3_block *block; /* the current block */
struct ir3_block *in_block; /* block created for shader inputs */
struct ir3_block *block; /* the current block */
struct ir3_block *in_block; /* block created for shader inputs */
nir_function_impl *impl;
nir_function_impl *impl;
/* For fragment shaders, varyings are not actual shader inputs,
* instead the hw passes an ij coord which is used with
* bary.f.
*
* But NIR doesn't know that, it still declares varyings as
* inputs. So we do all the input tracking normally and fix
* things up after compile_instructions()
*/
struct ir3_instruction *ij[IJ_COUNT];
/* For fragment shaders, varyings are not actual shader inputs,
* instead the hw passes an ij coord which is used with
* bary.f.
*
* But NIR doesn't know that, it still declares varyings as
* inputs. So we do all the input tracking normally and fix
* things up after compile_instructions()
*/
struct ir3_instruction *ij[IJ_COUNT];
/* for fragment shaders, for gl_FrontFacing and gl_FragCoord: */
struct ir3_instruction *frag_face, *frag_coord;
/* for fragment shaders, for gl_FrontFacing and gl_FragCoord: */
struct ir3_instruction *frag_face, *frag_coord;
/* For vertex shaders, keep track of the system values sources */
struct ir3_instruction *vertex_id, *basevertex, *instance_id, *base_instance, *draw_id, *view_index;
/* For vertex shaders, keep track of the system values sources */
struct ir3_instruction *vertex_id, *basevertex, *instance_id, *base_instance,
*draw_id, *view_index;
/* For fragment shaders: */
struct ir3_instruction *samp_id, *samp_mask_in;
/* For fragment shaders: */
struct ir3_instruction *samp_id, *samp_mask_in;
/* For geometry shaders: */
struct ir3_instruction *primitive_id;
struct ir3_instruction *gs_header;
/* For geometry shaders: */
struct ir3_instruction *primitive_id;
struct ir3_instruction *gs_header;
/* For tessellation shaders: */
struct ir3_instruction *patch_vertices_in;
struct ir3_instruction *tcs_header;
struct ir3_instruction *tess_coord;
/* For tessellation shaders: */
struct ir3_instruction *patch_vertices_in;
struct ir3_instruction *tcs_header;
struct ir3_instruction *tess_coord;
/* Compute shader inputs: */
struct ir3_instruction *local_invocation_id, *work_group_id;
/* Compute shader inputs: */
struct ir3_instruction *local_invocation_id, *work_group_id;
/* mapping from nir_register to defining instruction: */
struct hash_table *def_ht;
/* mapping from nir_register to defining instruction: */
struct hash_table *def_ht;
unsigned num_arrays;
unsigned num_arrays;
/* Tracking for max level of flowcontrol (branchstack) needed
* by a5xx+:
*/
unsigned stack, max_stack;
/* Tracking for max level of flowcontrol (branchstack) needed
* by a5xx+:
*/
unsigned stack, max_stack;
unsigned loop_id;
unsigned loop_id;
/* a common pattern for indirect addressing is to request the
* same address register multiple times. To avoid generating
* duplicate instruction sequences (which our backend does not
* try to clean up, since that should be done at the NIR stage)
* we cache the address value generated for a given src value:
*
* Note that we have to cache these per alignment, since the same
* src used for an array of vec1 cannot also be used for an
* array of vec4.
*/
struct hash_table *addr0_ht[4];
/* a common pattern for indirect addressing is to request the
* same address register multiple times. To avoid generating
* duplicate instruction sequences (which our backend does not
* try to clean up, since that should be done at the NIR stage)
* we cache the address value generated for a given src value:
*
* Note that we have to cache these per alignment, since the same
* src used for an array of vec1 cannot also be used for an
* array of vec4.
*/
struct hash_table *addr0_ht[4];
/* The same for a1.x. We only support immediate values for a1.x, as this
* is the only use so far.
*/
struct hash_table_u64 *addr1_ht;
/* The same for a1.x. We only support immediate values for a1.x, as this
* is the only use so far.
*/
struct hash_table_u64 *addr1_ht;
struct hash_table *sel_cond_conversions;
struct hash_table *sel_cond_conversions;
/* last dst array, for indirect we need to insert a var-store.
*/
struct ir3_instruction **last_dst;
unsigned last_dst_n;
/* last dst array, for indirect we need to insert a var-store.
*/
struct ir3_instruction **last_dst;
unsigned last_dst_n;
/* maps nir_block to ir3_block, mostly for the purposes of
* figuring out the block's successors
*/
struct hash_table *block_ht;
/* maps nir_block to ir3_block, mostly for the purposes of
* figuring out the block's successors
*/
struct hash_table *block_ht;
/* maps nir_block at the top of a loop to ir3_block collecting continue
* edges.
*/
struct hash_table *continue_block_ht;
/* maps nir_block at the top of a loop to ir3_block collecting continue
* edges.
*/
struct hash_table *continue_block_ht;
/* on a4xx, bitmask of samplers which need astc+srgb workaround: */
unsigned astc_srgb;
/* on a4xx, bitmask of samplers which need astc+srgb workaround: */
unsigned astc_srgb;
unsigned samples; /* bitmask of x,y sample shifts */
unsigned samples; /* bitmask of x,y sample shifts */
unsigned max_texture_index;
unsigned max_texture_index;
unsigned prefetch_limit;
unsigned prefetch_limit;
/* set if we encounter something we can't handle yet, so we
* can bail cleanly and fall back to TGSI compiler f/e
*/
bool error;
/* set if we encounter something we can't handle yet, so we
* can bail cleanly and fall back to TGSI compiler f/e
*/
bool error;
};
struct ir3_context_funcs {
void (*emit_intrinsic_load_ssbo)(struct ir3_context *ctx, nir_intrinsic_instr *intr,
struct ir3_instruction **dst);
void (*emit_intrinsic_store_ssbo)(struct ir3_context *ctx, nir_intrinsic_instr *intr);
struct ir3_instruction * (*emit_intrinsic_atomic_ssbo)(struct ir3_context *ctx, nir_intrinsic_instr *intr);
void (*emit_intrinsic_load_image)(struct ir3_context *ctx, nir_intrinsic_instr *intr,
struct ir3_instruction **dst);
void (*emit_intrinsic_store_image)(struct ir3_context *ctx, nir_intrinsic_instr *intr);
struct ir3_instruction * (*emit_intrinsic_atomic_image)(struct ir3_context *ctx, nir_intrinsic_instr *intr);
void (*emit_intrinsic_image_size)(struct ir3_context *ctx, nir_intrinsic_instr *intr,
struct ir3_instruction **dst);
void (*emit_intrinsic_load_global_ir3)(struct ir3_context *ctx, nir_intrinsic_instr *intr,
struct ir3_instruction **dst);
void (*emit_intrinsic_store_global_ir3)(struct ir3_context *ctx, nir_intrinsic_instr *intr);
void (*emit_intrinsic_load_ssbo)(struct ir3_context *ctx,
nir_intrinsic_instr *intr,
struct ir3_instruction **dst);
void (*emit_intrinsic_store_ssbo)(struct ir3_context *ctx,
nir_intrinsic_instr *intr);
struct ir3_instruction *(*emit_intrinsic_atomic_ssbo)(
struct ir3_context *ctx, nir_intrinsic_instr *intr);
void (*emit_intrinsic_load_image)(struct ir3_context *ctx,
nir_intrinsic_instr *intr,
struct ir3_instruction **dst);
void (*emit_intrinsic_store_image)(struct ir3_context *ctx,
nir_intrinsic_instr *intr);
struct ir3_instruction *(*emit_intrinsic_atomic_image)(
struct ir3_context *ctx, nir_intrinsic_instr *intr);
void (*emit_intrinsic_image_size)(struct ir3_context *ctx,
nir_intrinsic_instr *intr,
struct ir3_instruction **dst);
void (*emit_intrinsic_load_global_ir3)(struct ir3_context *ctx,
nir_intrinsic_instr *intr,
struct ir3_instruction **dst);
void (*emit_intrinsic_store_global_ir3)(struct ir3_context *ctx,
nir_intrinsic_instr *intr);
};
extern const struct ir3_context_funcs ir3_a4xx_funcs;
extern const struct ir3_context_funcs ir3_a6xx_funcs;
struct ir3_context * ir3_context_init(struct ir3_compiler *compiler,
struct ir3_shader_variant *so);
struct ir3_context *ir3_context_init(struct ir3_compiler *compiler,
struct ir3_shader_variant *so);
void ir3_context_free(struct ir3_context *ctx);
struct ir3_instruction ** ir3_get_dst_ssa(struct ir3_context *ctx, nir_ssa_def *dst, unsigned n);
struct ir3_instruction ** ir3_get_dst(struct ir3_context *ctx, nir_dest *dst, unsigned n);
struct ir3_instruction * const * ir3_get_src(struct ir3_context *ctx, nir_src *src);
struct ir3_instruction **ir3_get_dst_ssa(struct ir3_context *ctx,
nir_ssa_def *dst, unsigned n);
struct ir3_instruction **ir3_get_dst(struct ir3_context *ctx, nir_dest *dst,
unsigned n);
struct ir3_instruction *const *ir3_get_src(struct ir3_context *ctx,
nir_src *src);
void ir3_put_dst(struct ir3_context *ctx, nir_dest *dst);
struct ir3_instruction * ir3_create_collect(struct ir3_context *ctx,
struct ir3_instruction *const *arr, unsigned arrsz);
struct ir3_instruction *ir3_create_collect(struct ir3_context *ctx,
struct ir3_instruction *const *arr,
unsigned arrsz);
void ir3_split_dest(struct ir3_block *block, struct ir3_instruction **dst,
struct ir3_instruction *src, unsigned base, unsigned n);
struct ir3_instruction *src, unsigned base, unsigned n);
void ir3_handle_bindless_cat6(struct ir3_instruction *instr, nir_src rsrc);
void ir3_handle_nonuniform(struct ir3_instruction *instr, nir_intrinsic_instr *intrin);
void emit_intrinsic_image_size_tex(struct ir3_context *ctx, nir_intrinsic_instr *intr,
struct ir3_instruction **dst);
void ir3_handle_nonuniform(struct ir3_instruction *instr,
nir_intrinsic_instr *intrin);
void emit_intrinsic_image_size_tex(struct ir3_context *ctx,
nir_intrinsic_instr *intr,
struct ir3_instruction **dst);
#define ir3_collect(ctx, ...) ({ \
struct ir3_instruction *__arr[] = { __VA_ARGS__ }; \
ir3_create_collect(ctx, __arr, ARRAY_SIZE(__arr)); \
})
#define ir3_collect(ctx, ...) \
({ \
struct ir3_instruction *__arr[] = {__VA_ARGS__}; \
ir3_create_collect(ctx, __arr, ARRAY_SIZE(__arr)); \
})
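/* Usage sketch, mirroring how the a6xx backend above calls it (lo/hi are
 * placeholder names for the two halves of a 64-bit address):
 *
 *    struct ir3_instruction *addr = ir3_collect(ctx, lo, hi);
 *
 * which gathers the two scalar values into a single collect (vec2) source.
 */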
NORETURN void ir3_context_error(struct ir3_context *ctx, const char *format, ...);
NORETURN void ir3_context_error(struct ir3_context *ctx, const char *format,
...);
#define compile_assert(ctx, cond) do { \
if (!(cond)) ir3_context_error((ctx), "failed assert: "#cond"\n"); \
} while (0)
#define compile_assert(ctx, cond) \
do { \
if (!(cond)) \
ir3_context_error((ctx), "failed assert: " #cond "\n"); \
} while (0)
struct ir3_instruction * ir3_get_addr0(struct ir3_context *ctx,
struct ir3_instruction *src, int align);
struct ir3_instruction * ir3_get_addr1(struct ir3_context *ctx,
unsigned const_val);
struct ir3_instruction * ir3_get_predicate(struct ir3_context *ctx,
struct ir3_instruction *src);
struct ir3_instruction *ir3_get_addr0(struct ir3_context *ctx,
struct ir3_instruction *src, int align);
struct ir3_instruction *ir3_get_addr1(struct ir3_context *ctx,
unsigned const_val);
struct ir3_instruction *ir3_get_predicate(struct ir3_context *ctx,
struct ir3_instruction *src);
void ir3_declare_array(struct ir3_context *ctx, nir_register *reg);
struct ir3_array * ir3_get_array(struct ir3_context *ctx, nir_register *reg);
struct ir3_array *ir3_get_array(struct ir3_context *ctx, nir_register *reg);
struct ir3_instruction *ir3_create_array_load(struct ir3_context *ctx,
struct ir3_array *arr, int n, struct ir3_instruction *address);
void ir3_create_array_store(struct ir3_context *ctx, struct ir3_array *arr, int n,
struct ir3_instruction *src, struct ir3_instruction *address);
struct ir3_array *arr, int n,
struct ir3_instruction *address);
void ir3_create_array_store(struct ir3_context *ctx, struct ir3_array *arr,
int n, struct ir3_instruction *src,
struct ir3_instruction *address);
static inline type_t utype_for_size(unsigned bit_size)
static inline type_t
utype_for_size(unsigned bit_size)
{
switch (bit_size) {
case 32: return TYPE_U32;
case 16: return TYPE_U16;
case 8: return TYPE_U8;
default: unreachable("bad bitsize"); return ~0;
}
switch (bit_size) {
case 32:
return TYPE_U32;
case 16:
return TYPE_U16;
case 8:
return TYPE_U8;
default:
unreachable("bad bitsize");
return ~0;
}
}
static inline type_t utype_src(nir_src src)
{ return utype_for_size(nir_src_bit_size(src)); }
static inline type_t
utype_src(nir_src src)
{
return utype_for_size(nir_src_bit_size(src));
}
static inline type_t utype_dst(nir_dest dst)
{ return utype_for_size(nir_dest_bit_size(dst)); }
static inline type_t
utype_dst(nir_dest dst)
{
return utype_for_size(nir_dest_bit_size(dst));
}
#endif /* IR3_CONTEXT_H_ */

File diff suppressed because it is too large


@ -36,7 +36,6 @@
* one. It is basically anything that is not SSA.
*/
/**
* Check if any instruction before `use` and after `src` writes to the
* specified array. If `offset` is negative, it is a relative (a0.x)
@ -48,186 +47,184 @@
* the correct array write.
*/
static bool
has_conflicting_write(struct ir3_instruction *src,
struct ir3_instruction *use,
struct ir3_register **def,
unsigned id, int offset)
has_conflicting_write(struct ir3_instruction *src, struct ir3_instruction *use,
struct ir3_register **def, unsigned id, int offset)
{
assert(src->block == use->block);
bool last_write = true;
assert(src->block == use->block);
bool last_write = true;
/* NOTE that since src and use are in the same block, src by
* definition appears in the block's instr_list before use:
*/
foreach_instr_rev (instr, &use->node) {
if (instr == src)
break;
/* NOTE that since src and use are in the same block, src by
* definition appears in the block's instr_list before use:
*/
foreach_instr_rev (instr, &use->node) {
if (instr == src)
break;
/* if we are looking at a RELATIV read, we can't move
* it past an a0.x write:
*/
if ((offset < 0) && (dest_regs(instr) > 0) &&
(instr->dsts[0]->num == regid(REG_A0, 0)))
return true;
/* if we are looking at a RELATIV read, we can't move
* it past an a0.x write:
*/
if ((offset < 0) && (dest_regs(instr) > 0) &&
(instr->dsts[0]->num == regid(REG_A0, 0)))
return true;
if (!writes_gpr(instr))
continue;
if (!writes_gpr(instr))
continue;
struct ir3_register *dst = instr->dsts[0];
if (!(dst->flags & IR3_REG_ARRAY))
continue;
struct ir3_register *dst = instr->dsts[0];
if (!(dst->flags & IR3_REG_ARRAY))
continue;
if (dst->array.id != id)
continue;
if (dst->array.id != id)
continue;
/*
* At this point, we have narrowed down an instruction
* that writes to the same array.. check if the write
* is to an array element that we care about:
*/
/*
* At this point, we have narrowed down an instruction
* that writes to the same array.. check if the write
* is to an array element that we care about:
*/
/* is write to an unknown array element? */
if (dst->flags & IR3_REG_RELATIV)
return true;
/* is write to an unknown array element? */
if (dst->flags & IR3_REG_RELATIV)
return true;
/* is read from an unknown array element? */
if (offset < 0)
return true;
/* is read from an unknown array element? */
if (offset < 0)
return true;
/* is write to same array element? */
if (dst->array.offset == offset)
return true;
/* is write to same array element? */
if (dst->array.offset == offset)
return true;
if (last_write)
*def = dst;
if (last_write)
*def = dst;
last_write = false;
}
last_write = false;
}
return false;
return false;
}
/* Can we fold the mov src into use without invalid flags? */
static bool
valid_flags(struct ir3_instruction *use, struct ir3_instruction *mov)
{
struct ir3_register *src = mov->srcs[0];
struct ir3_register *src = mov->srcs[0];
foreach_src_n (reg, n, use) {
if (ssa(reg) != mov)
continue;
foreach_src_n (reg, n, use) {
if (ssa(reg) != mov)
continue;
if (!ir3_valid_flags(use, n, reg->flags | src->flags))
return false;
}
if (!ir3_valid_flags(use, n, reg->flags | src->flags))
return false;
}
return true;
return true;
}
static bool
instr_cp_postsched(struct ir3_instruction *mov)
{
struct ir3_register *src = mov->srcs[0];
struct ir3_register *src = mov->srcs[0];
/* only consider mov's from "arrays", other cases we have
* already considered:
*/
if (!(src->flags & IR3_REG_ARRAY))
return false;
/* only consider mov's from "arrays", other cases we have
* already considered:
*/
if (!(src->flags & IR3_REG_ARRAY))
return false;
int offset = (src->flags & IR3_REG_RELATIV) ? -1 : src->array.offset;
int offset = (src->flags & IR3_REG_RELATIV) ? -1 : src->array.offset;
/* Once we move the array read directly into the consuming
* instruction(s), we will also need to update instructions
* that had a false-dep on the original mov to have deps
* on the consuming instructions:
*/
struct util_dynarray newdeps;
util_dynarray_init(&newdeps, mov->uses);
/* Once we move the array read directly into the consuming
* instruction(s), we will also need to update instructions
* that had a false-dep on the original mov to have deps
* on the consuming instructions:
*/
struct util_dynarray newdeps;
util_dynarray_init(&newdeps, mov->uses);
foreach_ssa_use (use, mov) {
if (use->block != mov->block)
continue;
foreach_ssa_use (use, mov) {
if (use->block != mov->block)
continue;
if (is_meta(use))
continue;
if (is_meta(use))
continue;
struct ir3_register *def = src->def;
if (has_conflicting_write(mov, use, &def, src->array.id, offset))
continue;
struct ir3_register *def = src->def;
if (has_conflicting_write(mov, use, &def, src->array.id, offset))
continue;
if (conflicts(mov->address, use->address))
continue;
if (conflicts(mov->address, use->address))
continue;
if (!valid_flags(use, mov))
continue;
if (!valid_flags(use, mov))
continue;
/* Ok, we've established that it is safe to remove this copy: */
/* Ok, we've established that it is safe to remove this copy: */
bool removed = false;
foreach_src_n (reg, n, use) {
if (ssa(reg) != mov)
continue;
bool removed = false;
foreach_src_n (reg, n, use) {
if (ssa(reg) != mov)
continue;
use->srcs[n] = ir3_reg_clone(mov->block->shader, src);
use->srcs[n] = ir3_reg_clone(mov->block->shader, src);
/* preserve (abs)/etc modifiers: */
use->srcs[n]->flags |= reg->flags;
/* preserve (abs)/etc modifiers: */
use->srcs[n]->flags |= reg->flags;
/* If we're sinking the array read past any writes, make
* sure to update it to point to the new previous write:
*/
use->srcs[n]->def = def;
/* If we're sinking the array read past any writes, make
* sure to update it to point to the new previous write:
*/
use->srcs[n]->def = def;
removed = true;
}
removed = true;
}
/* the use could have been only a false-dep, so only add to the newdeps
* array and update the address if we've actually updated a real src
* reg for the use:
*/
if (removed) {
if (src->flags & IR3_REG_RELATIV)
ir3_instr_set_address(use, mov->address->def->instr);
/* the use could have been only a false-dep, so only add to the newdeps
* array and update the address if we've actually updated a real src
* reg for the use:
*/
if (removed) {
if (src->flags & IR3_REG_RELATIV)
ir3_instr_set_address(use, mov->address->def->instr);
util_dynarray_append(&newdeps, struct ir3_instruction *, use);
util_dynarray_append(&newdeps, struct ir3_instruction *, use);
/* Remove the use from the src instruction: */
_mesa_set_remove_key(mov->uses, use);
}
}
/* Remove the use from the src instruction: */
_mesa_set_remove_key(mov->uses, use);
}
}
/* Once we have the complete set of instruction(s) that are now
* directly reading from the array, update any false-dep uses to
* now depend on these instructions. The only remaining uses at
* this point should be false-deps:
*/
foreach_ssa_use (use, mov) {
util_dynarray_foreach(&newdeps, struct ir3_instruction *, instrp) {
struct ir3_instruction *newdep = *instrp;
ir3_instr_add_dep(use, newdep);
}
}
/* Once we have the complete set of instruction(s) that are now
* directly reading from the array, update any false-dep uses to
* now depend on these instructions. The only remaining uses at
* this point should be false-deps:
*/
foreach_ssa_use (use, mov) {
util_dynarray_foreach (&newdeps, struct ir3_instruction *, instrp) {
struct ir3_instruction *newdep = *instrp;
ir3_instr_add_dep(use, newdep);
}
}
return util_dynarray_num_elements(&newdeps, struct ir3_instruction **) > 0;
return util_dynarray_num_elements(&newdeps, struct ir3_instruction **) > 0;
}
bool
ir3_cp_postsched(struct ir3 *ir)
{
void *mem_ctx = ralloc_context(NULL);
bool progress = false;
void *mem_ctx = ralloc_context(NULL);
bool progress = false;
ir3_find_ssa_uses(ir, mem_ctx, false);
ir3_find_ssa_uses(ir, mem_ctx, false);
foreach_block (block, &ir->block_list) {
foreach_instr_safe (instr, &block->instr_list) {
if (is_same_type_mov(instr))
progress |= instr_cp_postsched(instr);
}
}
foreach_block (block, &ir->block_list) {
foreach_instr_safe (instr, &block->instr_list) {
if (is_same_type_mov(instr))
progress |= instr_cp_postsched(instr);
}
}
ralloc_free(mem_ctx);
ralloc_free(mem_ctx);
return progress;
return progress;
}


@ -37,112 +37,109 @@
static uint32_t
hash_instr(const void *data)
{
const struct ir3_instruction *instr = data;
uint32_t hash = 0;
const struct ir3_instruction *instr = data;
uint32_t hash = 0;
hash = HASH(hash, instr->opc);
hash = HASH(hash, instr->dsts[0]->flags);
foreach_src (src, (struct ir3_instruction *) instr) {
if (src->flags & IR3_REG_CONST)
hash = HASH(hash, src->num);
else if (src->flags & IR3_REG_IMMED)
hash = HASH(hash, src->uim_val);
else
hash = HASH(hash, src->def);
}
hash = HASH(hash, instr->opc);
hash = HASH(hash, instr->dsts[0]->flags);
foreach_src (src, (struct ir3_instruction *)instr) {
if (src->flags & IR3_REG_CONST)
hash = HASH(hash, src->num);
else if (src->flags & IR3_REG_IMMED)
hash = HASH(hash, src->uim_val);
else
hash = HASH(hash, src->def);
}
return hash;
return hash;
}
static bool
instrs_equal(const struct ir3_instruction *i1, const struct ir3_instruction *i2)
{
if (i1->opc != i2->opc)
return false;
if (i1->opc != i2->opc)
return false;
if (i1->dsts_count != i2->dsts_count)
return false;
if (i1->dsts_count != i2->dsts_count)
return false;
if (i1->srcs_count != i2->srcs_count)
return false;
if (i1->srcs_count != i2->srcs_count)
return false;
if (i1->dsts[0]->flags != i2->dsts[0]->flags)
return false;
if (i1->dsts[0]->flags != i2->dsts[0]->flags)
return false;
for (unsigned i = 0; i < i1->srcs_count; i++) {
const struct ir3_register *i1_reg = i1->srcs[i], *i2_reg = i2->srcs[i];
for (unsigned i = 0; i < i1->srcs_count; i++) {
const struct ir3_register *i1_reg = i1->srcs[i], *i2_reg = i2->srcs[i];
if (i1_reg->flags != i2_reg->flags)
return false;
if (i1_reg->flags != i2_reg->flags)
return false;
if (i1_reg->flags & IR3_REG_CONST) {
if (i1_reg->num != i2_reg->num)
return false;
} else if (i1_reg->flags & IR3_REG_IMMED) {
if (i1_reg->uim_val != i2_reg->uim_val)
return false;
} else {
if (i1_reg->def != i2_reg->def)
return false;
}
}
if (i1_reg->flags & IR3_REG_CONST) {
if (i1_reg->num != i2_reg->num)
return false;
} else if (i1_reg->flags & IR3_REG_IMMED) {
if (i1_reg->uim_val != i2_reg->uim_val)
return false;
} else {
if (i1_reg->def != i2_reg->def)
return false;
}
}
return true;
return true;
}
static bool
instr_can_cse(const struct ir3_instruction *instr)
{
if (instr->opc != OPC_META_COLLECT)
return false;
if (instr->opc != OPC_META_COLLECT)
return false;
return true;
return true;
}
static bool
cmp_func(const void *data1, const void *data2)
{
return instrs_equal(data1, data2);
return instrs_equal(data1, data2);
}
bool
ir3_cse(struct ir3 *ir)
{
struct set *instr_set = _mesa_set_create(NULL, hash_instr, cmp_func);
foreach_block (block, &ir->block_list) {
_mesa_set_clear(instr_set, NULL);
struct set *instr_set = _mesa_set_create(NULL, hash_instr, cmp_func);
foreach_block (block, &ir->block_list) {
_mesa_set_clear(instr_set, NULL);
foreach_instr (instr, &block->instr_list) {
instr->data = NULL;
foreach_instr (instr, &block->instr_list) {
instr->data = NULL;
if (!instr_can_cse(instr))
continue;
if (!instr_can_cse(instr))
continue;
bool found;
struct set_entry *entry =
_mesa_set_search_or_add(instr_set, instr, &found);
if (found)
instr->data = (void *) entry->key;
}
}
bool found;
struct set_entry *entry =
_mesa_set_search_or_add(instr_set, instr, &found);
if (found)
instr->data = (void *)entry->key;
}
}
bool progress = false;
foreach_block (block, &ir->block_list) {
foreach_instr (instr, &block->instr_list) {
foreach_src (src, instr) {
if ((src->flags & IR3_REG_SSA) && src->def &&
src->def->instr->data) {
progress = true;
struct ir3_instruction *instr = src->def->instr->data;
src->def = instr->dsts[0];
}
}
}
}
bool progress = false;
foreach_block (block, &ir->block_list) {
foreach_instr (instr, &block->instr_list) {
foreach_src(src, instr) {
if ((src->flags & IR3_REG_SSA) &&
src->def &&
src->def->instr->data) {
progress = true;
struct ir3_instruction *instr = src->def->instr->data;
src->def = instr->dsts[0];
}
}
}
}
_mesa_set_destroy(instr_set, NULL);
return progress;
_mesa_set_destroy(instr_set, NULL);
return progress;
}
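/* Worked example (hypothetical values, NIR-style names used only for
 * illustration): two collects in one block with identical sources,
 *
 *    ssa_3 = collect ssa_1, ssa_2
 *    ...
 *    ssa_7 = collect ssa_1, ssa_2
 *
 * hash identically and compare equal, so the second one's instr->data ends
 * up pointing at the first. The final loop then redirects every source that
 * referenced ssa_7 to ssa_3's dst, leaving the duplicate collect for a later
 * DCE pass to drop.
 */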


@ -36,168 +36,168 @@
static void
mark_array_use(struct ir3_instruction *instr, struct ir3_register *reg)
{
if (reg->flags & IR3_REG_ARRAY) {
struct ir3_array *arr =
ir3_lookup_array(instr->block->shader, reg->array.id);
arr->unused = false;
}
if (reg->flags & IR3_REG_ARRAY) {
struct ir3_array *arr =
ir3_lookup_array(instr->block->shader, reg->array.id);
arr->unused = false;
}
}
static void
instr_dce(struct ir3_instruction *instr, bool falsedep)
{
/* don't mark falsedep's as used, but otherwise process them normally: */
if (!falsedep)
instr->flags &= ~IR3_INSTR_UNUSED;
/* don't mark falsedep's as used, but otherwise process them normally: */
if (!falsedep)
instr->flags &= ~IR3_INSTR_UNUSED;
if (ir3_instr_check_mark(instr))
return;
if (ir3_instr_check_mark(instr))
return;
if (writes_gpr(instr))
mark_array_use(instr, instr->dsts[0]); /* dst */
if (writes_gpr(instr))
mark_array_use(instr, instr->dsts[0]); /* dst */
foreach_src (reg, instr)
mark_array_use(instr, reg); /* src */
foreach_src (reg, instr)
mark_array_use(instr, reg); /* src */
foreach_ssa_src_n (src, i, instr) {
instr_dce(src, __is_false_dep(instr, i));
}
foreach_ssa_src_n (src, i, instr) {
instr_dce(src, __is_false_dep(instr, i));
}
}
static bool
remove_unused_by_block(struct ir3_block *block)
{
bool progress = false;
foreach_instr_safe (instr, &block->instr_list) {
if (instr->opc == OPC_END || instr->opc == OPC_CHSH || instr->opc == OPC_CHMASK)
continue;
if (instr->flags & IR3_INSTR_UNUSED) {
if (instr->opc == OPC_META_SPLIT) {
struct ir3_instruction *src = ssa(instr->srcs[0]);
/* tex (cat5) instructions have a writemask, so we can
* mask off unused components. Other instructions do not.
*/
if (src && is_tex_or_prefetch(src) && (src->dsts[0]->wrmask > 1)) {
src->dsts[0]->wrmask &= ~(1 << instr->split.off);
}
}
bool progress = false;
foreach_instr_safe (instr, &block->instr_list) {
if (instr->opc == OPC_END || instr->opc == OPC_CHSH ||
instr->opc == OPC_CHMASK)
continue;
if (instr->flags & IR3_INSTR_UNUSED) {
if (instr->opc == OPC_META_SPLIT) {
struct ir3_instruction *src = ssa(instr->srcs[0]);
/* tex (cat5) instructions have a writemask, so we can
* mask off unused components. Other instructions do not.
*/
if (src && is_tex_or_prefetch(src) && (src->dsts[0]->wrmask > 1)) {
src->dsts[0]->wrmask &= ~(1 << instr->split.off);
}
}
/* prune false-deps, etc: */
foreach_ssa_use (use, instr)
foreach_ssa_srcp_n (srcp, n, use)
if (*srcp == instr)
*srcp = NULL;
/* prune false-deps, etc: */
foreach_ssa_use (use, instr)
foreach_ssa_srcp_n (srcp, n, use)
if (*srcp == instr)
*srcp = NULL;
list_delinit(&instr->node);
progress = true;
}
}
return progress;
list_delinit(&instr->node);
progress = true;
}
}
return progress;
}
static bool
find_and_remove_unused(struct ir3 *ir, struct ir3_shader_variant *so)
{
unsigned i;
bool progress = false;
unsigned i;
bool progress = false;
ir3_clear_mark(ir);
ir3_clear_mark(ir);
/* initially mark everything as unused, we'll clear the flag as we
* visit the instructions:
*/
foreach_block (block, &ir->block_list) {
foreach_instr (instr, &block->instr_list) {
/* special case, if pre-fs texture fetch used, we cannot
* eliminate the barycentric i/j input
*/
if (so->num_sampler_prefetch &&
(instr->opc == OPC_META_INPUT) &&
(instr->input.sysval == SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL))
continue;
instr->flags |= IR3_INSTR_UNUSED;
}
}
/* initially mark everything as unused, we'll clear the flag as we
* visit the instructions:
*/
foreach_block (block, &ir->block_list) {
foreach_instr (instr, &block->instr_list) {
/* special case, if pre-fs texture fetch used, we cannot
* eliminate the barycentric i/j input
*/
if (so->num_sampler_prefetch && (instr->opc == OPC_META_INPUT) &&
(instr->input.sysval == SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL))
continue;
instr->flags |= IR3_INSTR_UNUSED;
}
}
foreach_array (arr, &ir->array_list)
arr->unused = true;
foreach_array (arr, &ir->array_list)
arr->unused = true;
foreach_block (block, &ir->block_list) {
for (i = 0; i < block->keeps_count; i++)
instr_dce(block->keeps[i], false);
foreach_block (block, &ir->block_list) {
for (i = 0; i < block->keeps_count; i++)
instr_dce(block->keeps[i], false);
/* We also need to account for if-condition: */
if (block->condition)
instr_dce(block->condition, false);
}
/* We also need to account for if-condition: */
if (block->condition)
instr_dce(block->condition, false);
}
/* remove un-used instructions: */
foreach_block (block, &ir->block_list) {
progress |= remove_unused_by_block(block);
}
/* remove un-used instructions: */
foreach_block (block, &ir->block_list) {
progress |= remove_unused_by_block(block);
}
/* remove un-used arrays: */
foreach_array_safe (arr, &ir->array_list) {
if (arr->unused)
list_delinit(&arr->node);
}
/* remove un-used arrays: */
foreach_array_safe (arr, &ir->array_list) {
if (arr->unused)
list_delinit(&arr->node);
}
/* fixup wrmask of split instructions to account for adjusted tex
* wrmask's:
*/
foreach_block (block, &ir->block_list) {
foreach_instr (instr, &block->instr_list) {
if (instr->opc != OPC_META_SPLIT)
continue;
/* fixup wrmask of split instructions to account for adjusted tex
* wrmask's:
*/
foreach_block (block, &ir->block_list) {
foreach_instr (instr, &block->instr_list) {
if (instr->opc != OPC_META_SPLIT)
continue;
struct ir3_instruction *src = ssa(instr->srcs[0]);
if (!is_tex_or_prefetch(src))
continue;
struct ir3_instruction *src = ssa(instr->srcs[0]);
if (!is_tex_or_prefetch(src))
continue;
instr->srcs[0]->wrmask = src->dsts[0]->wrmask;
}
}
instr->srcs[0]->wrmask = src->dsts[0]->wrmask;
}
}
for (i = 0; i < ir->a0_users_count; i++) {
struct ir3_instruction *instr = ir->a0_users[i];
if (instr && (instr->flags & IR3_INSTR_UNUSED))
ir->a0_users[i] = NULL;
}
for (i = 0; i < ir->a0_users_count; i++) {
struct ir3_instruction *instr = ir->a0_users[i];
if (instr && (instr->flags & IR3_INSTR_UNUSED))
ir->a0_users[i] = NULL;
}
for (i = 0; i < ir->a1_users_count; i++) {
struct ir3_instruction *instr = ir->a1_users[i];
if (instr && (instr->flags & IR3_INSTR_UNUSED))
ir->a1_users[i] = NULL;
}
for (i = 0; i < ir->a1_users_count; i++) {
struct ir3_instruction *instr = ir->a1_users[i];
if (instr && (instr->flags & IR3_INSTR_UNUSED))
ir->a1_users[i] = NULL;
}
for (i = 0; i < ir->predicates_count; i++) {
struct ir3_instruction *instr = ir->predicates[i];
if (instr && (instr->flags & IR3_INSTR_UNUSED))
ir->predicates[i] = NULL;
}
for (i = 0; i < ir->predicates_count; i++) {
struct ir3_instruction *instr = ir->predicates[i];
if (instr && (instr->flags & IR3_INSTR_UNUSED))
ir->predicates[i] = NULL;
}
/* cleanup unused inputs: */
foreach_input_n (in, n, ir)
if (in->flags & IR3_INSTR_UNUSED)
ir->inputs[n] = NULL;
/* cleanup unused inputs: */
foreach_input_n (in, n, ir)
if (in->flags & IR3_INSTR_UNUSED)
ir->inputs[n] = NULL;
return progress;
return progress;
}
bool
ir3_dce(struct ir3 *ir, struct ir3_shader_variant *so)
{
void *mem_ctx = ralloc_context(NULL);
bool progress, made_progress = false;
void *mem_ctx = ralloc_context(NULL);
bool progress, made_progress = false;
ir3_find_ssa_uses(ir, mem_ctx, true);
ir3_find_ssa_uses(ir, mem_ctx, true);
do {
progress = find_and_remove_unused(ir, so);
made_progress |= progress;
} while (progress);
do {
progress = find_and_remove_unused(ir, so);
made_progress |= progress;
} while (progress);
ralloc_free(mem_ctx);
ralloc_free(mem_ctx);
return made_progress;
return made_progress;
}


@ -57,116 +57,112 @@
*/
int
ir3_delayslots(struct ir3_instruction *assigner,
struct ir3_instruction *consumer, unsigned n, bool soft)
struct ir3_instruction *consumer, unsigned n, bool soft)
{
/* generally don't count false dependencies, since this can just be
* something like a barrier, or SSBO store.
*/
if (__is_false_dep(consumer, n))
return 0;
/* generally don't count false dependencies, since this can just be
* something like a barrier, or SSBO store.
*/
if (__is_false_dep(consumer, n))
return 0;
/* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal
* alu -> alu needs 3 cycles, cat4 -> alu and texture fetch
* handled with sync bits
*/
/* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal
* alu -> alu needs 3 cycles, cat4 -> alu and texture fetch
* handled with sync bits
*/
if (is_meta(assigner) || is_meta(consumer))
return 0;
if (is_meta(assigner) || is_meta(consumer))
return 0;
if (writes_addr0(assigner) || writes_addr1(assigner))
return 6;
if (writes_addr0(assigner) || writes_addr1(assigner))
return 6;
if (soft && is_sfu(assigner))
return SOFT_SS_NOPS;
if (soft && is_sfu(assigner))
return SOFT_SS_NOPS;
/* handled via sync flags: */
if (is_sfu(assigner) || is_tex(assigner) || is_mem(assigner))
return 0;
/* handled via sync flags: */
if (is_sfu(assigner) || is_tex(assigner) || is_mem(assigner))
return 0;
/* As far as we know, shader outputs don't need any delay. */
if (consumer->opc == OPC_END || consumer->opc == OPC_CHMASK)
return 0;
/* As far as we know, shader outputs don't need any delay. */
if (consumer->opc == OPC_END || consumer->opc == OPC_CHMASK)
return 0;
/* assigner must be alu: */
if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) ||
is_mem(consumer) || (assigner->dsts[0]->flags & IR3_REG_SHARED)) {
return 6;
} else {
/* In mergedregs mode, there is an extra 2-cycle penalty when half of
* a full-reg is read as a half-reg or when a half-reg is read as a
* full-reg.
*/
bool mismatched_half =
(assigner->dsts[0]->flags & IR3_REG_HALF) !=
(consumer->srcs[n]->flags & IR3_REG_HALF);
unsigned penalty = mismatched_half ? 2 : 0;
if ((is_mad(consumer->opc) || is_madsh(consumer->opc)) &&
(n == 2)) {
/* special case, 3rd src to cat3 not required on first cycle */
return 1 + penalty;
} else {
return 3 + penalty;
}
}
/* assigner must be alu: */
if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) ||
is_mem(consumer) || (assigner->dsts[0]->flags & IR3_REG_SHARED)) {
return 6;
} else {
/* In mergedregs mode, there is an extra 2-cycle penalty when half of
* a full-reg is read as a half-reg or when a half-reg is read as a
* full-reg.
*/
bool mismatched_half = (assigner->dsts[0]->flags & IR3_REG_HALF) !=
(consumer->srcs[n]->flags & IR3_REG_HALF);
unsigned penalty = mismatched_half ? 2 : 0;
if ((is_mad(consumer->opc) || is_madsh(consumer->opc)) && (n == 2)) {
/* special case, 3rd src to cat3 not required on first cycle */
return 1 + penalty;
} else {
return 3 + penalty;
}
}
}
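/* A few illustrative results (hypothetical instructions, assuming real data
 * dependencies on full, non-shared GPRs and soft == false):
 *
 *    alu add -> alu mul, src 0:   3   (generic alu -> alu)
 *    alu add -> cat3 mad, src 2:  1   (3rd src not needed on the first cycle)
 *    alu add -> cat5 sam, src 0:  6   (consumer is tex, worst case)
 *
 * A half<->full width mismatch on the alu -> alu case adds the 2-cycle
 * penalty, giving 5 instead of 3.
 */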
static bool
count_instruction(struct ir3_instruction *n)
{
/* NOTE: don't count branch/jump since we don't know yet if they will
* be eliminated later in resolve_jumps().. really should do that
* earlier so we don't have this constraint.
*/
return is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_B));
/* NOTE: don't count branch/jump since we don't know yet if they will
* be eliminated later in resolve_jumps().. really should do that
* earlier so we don't have this constraint.
*/
return is_alu(n) ||
(is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_B));
}
static unsigned
distance(struct ir3_block *block, struct ir3_instruction *instr,
unsigned maxd)
distance(struct ir3_block *block, struct ir3_instruction *instr, unsigned maxd)
{
unsigned d = 0;
unsigned d = 0;
/* Note that this relies on incrementally building up the block's
* instruction list.. but this is how scheduling and nopsched
* work.
*/
foreach_instr_rev (n, &block->instr_list) {
if ((n == instr) || (d >= maxd))
return MIN2(maxd, d + n->nop);
if (count_instruction(n))
d = MIN2(maxd, d + 1 + n->repeat + n->nop);
}
/* Note that this relies on incrementally building up the block's
* instruction list.. but this is how scheduling and nopsched
* work.
*/
foreach_instr_rev (n, &block->instr_list) {
if ((n == instr) || (d >= maxd))
return MIN2(maxd, d + n->nop);
if (count_instruction(n))
d = MIN2(maxd, d + 1 + n->repeat + n->nop);
}
return maxd;
return maxd;
}
static unsigned
delay_calc_srcn_prera(struct ir3_block *block,
struct ir3_instruction *assigner,
struct ir3_instruction *consumer,
unsigned srcn)
delay_calc_srcn_prera(struct ir3_block *block, struct ir3_instruction *assigner,
struct ir3_instruction *consumer, unsigned srcn)
{
unsigned delay = 0;
unsigned delay = 0;
if (assigner->opc == OPC_META_PHI)
return 0;
if (assigner->opc == OPC_META_PHI)
return 0;
if (is_meta(assigner)) {
foreach_src_n (src, n, assigner) {
unsigned d;
if (is_meta(assigner)) {
foreach_src_n (src, n, assigner) {
unsigned d;
if (!src->def)
continue;
if (!src->def)
continue;
d = delay_calc_srcn_prera(block, src->def->instr, consumer, srcn);
delay = MAX2(delay, d);
}
} else {
delay = ir3_delayslots(assigner, consumer, srcn, false);
delay -= distance(block, assigner, delay);
}
d = delay_calc_srcn_prera(block, src->def->instr, consumer, srcn);
delay = MAX2(delay, d);
}
} else {
delay = ir3_delayslots(assigner, consumer, srcn, false);
delay -= distance(block, assigner, delay);
}
return delay;
return delay;
}
/**
@ -176,19 +172,19 @@ delay_calc_srcn_prera(struct ir3_block *block,
unsigned
ir3_delay_calc_prera(struct ir3_block *block, struct ir3_instruction *instr)
{
unsigned delay = 0;
unsigned delay = 0;
foreach_src_n (src, i, instr) {
unsigned d = 0;
foreach_src_n (src, i, instr) {
unsigned d = 0;
if (src->def && src->def->instr->block == block) {
d = delay_calc_srcn_prera(block, src->def->instr, instr, i);
}
if (src->def && src->def->instr->block == block) {
d = delay_calc_srcn_prera(block, src->def->instr, instr, i);
}
delay = MAX2(delay, d);
}
delay = MAX2(delay, d);
}
return delay;
return delay;
}
/* Post-RA, we don't have arrays any more, so we have to be a bit careful here
@ -198,185 +194,186 @@ ir3_delay_calc_prera(struct ir3_block *block, struct ir3_instruction *instr)
static unsigned
post_ra_reg_elems(struct ir3_register *reg)
{
if (reg->flags & IR3_REG_RELATIV)
return reg->size;
return reg_elems(reg);
if (reg->flags & IR3_REG_RELATIV)
return reg->size;
return reg_elems(reg);
}
static unsigned
post_ra_reg_num(struct ir3_register *reg)
{
if (reg->flags & IR3_REG_RELATIV)
return reg->array.base;
return reg->num;
if (reg->flags & IR3_REG_RELATIV)
return reg->array.base;
return reg->num;
}
static unsigned
delay_calc_srcn_postra(struct ir3_instruction *assigner, struct ir3_instruction *consumer,
unsigned assigner_n, unsigned consumer_n, bool soft, bool mergedregs)
delay_calc_srcn_postra(struct ir3_instruction *assigner,
struct ir3_instruction *consumer, unsigned assigner_n,
unsigned consumer_n, bool soft, bool mergedregs)
{
struct ir3_register *src = consumer->srcs[consumer_n];
struct ir3_register *dst = assigner->dsts[assigner_n];
bool mismatched_half =
(src->flags & IR3_REG_HALF) != (dst->flags & IR3_REG_HALF);
struct ir3_register *src = consumer->srcs[consumer_n];
struct ir3_register *dst = assigner->dsts[assigner_n];
bool mismatched_half =
(src->flags & IR3_REG_HALF) != (dst->flags & IR3_REG_HALF);
/* In the mergedregs case or when the register is a special register,
* half-registers do not alias with full registers.
*/
if ((!mergedregs || is_reg_special(src) || is_reg_special(dst)) &&
mismatched_half)
return 0;
/* In the mergedregs case or when the register is a special register,
* half-registers do not alias with full registers.
*/
if ((!mergedregs || is_reg_special(src) || is_reg_special(dst)) &&
mismatched_half)
return 0;
unsigned src_start = post_ra_reg_num(src) * reg_elem_size(src);
unsigned src_end = src_start + post_ra_reg_elems(src) * reg_elem_size(src);
unsigned dst_start = post_ra_reg_num(dst) * reg_elem_size(dst);
unsigned dst_end = dst_start + post_ra_reg_elems(dst) * reg_elem_size(dst);
unsigned src_start = post_ra_reg_num(src) * reg_elem_size(src);
unsigned src_end = src_start + post_ra_reg_elems(src) * reg_elem_size(src);
unsigned dst_start = post_ra_reg_num(dst) * reg_elem_size(dst);
unsigned dst_end = dst_start + post_ra_reg_elems(dst) * reg_elem_size(dst);
if (dst_start >= src_end || src_start >= dst_end)
return 0;
if (dst_start >= src_end || src_start >= dst_end)
return 0;
unsigned delay = ir3_delayslots(assigner, consumer, consumer_n, soft);
unsigned delay = ir3_delayslots(assigner, consumer, consumer_n, soft);
if (assigner->repeat == 0 && consumer->repeat == 0)
return delay;
if (assigner->repeat == 0 && consumer->repeat == 0)
return delay;
/* If either side is a relative access, we can't really apply most of the
* reasoning below because we don't know which component aliases which.
* Just bail in this case.
*/
if ((src->flags & IR3_REG_RELATIV) || (dst->flags & IR3_REG_RELATIV))
return delay;
/* If either side is a relative access, we can't really apply most of the
* reasoning below because we don't know which component aliases which.
* Just bail in this case.
*/
if ((src->flags & IR3_REG_RELATIV) || (dst->flags & IR3_REG_RELATIV))
return delay;
/* MOVMSK seems to require that all users wait until the entire
* instruction is finished, so just bail here.
*/
if (assigner->opc == OPC_MOVMSK)
return delay;
/* MOVMSK seems to require that all users wait until the entire
* instruction is finished, so just bail here.
*/
if (assigner->opc == OPC_MOVMSK)
return delay;
/* TODO: Handle the combination of (rpt) and different component sizes
* better like below. This complicates things significantly because the
* components don't line up.
*/
if (mismatched_half)
return delay;
/* TODO: Handle the combination of (rpt) and different component sizes
* better like below. This complicates things significantly because the
* components don't line up.
*/
if (mismatched_half)
return delay;
/* If an instruction has a (rpt), then it acts as a sequence of
* instructions, reading its non-(r) sources at each cycle. First, get the
* register num for the first instruction where they interfere:
*/
/* If an instruction has a (rpt), then it acts as a sequence of
* instructions, reading its non-(r) sources at each cycle. First, get the
* register num for the first instruction where they interfere:
*/
unsigned first_num = MAX2(src_start, dst_start) / reg_elem_size(dst);
unsigned first_num = MAX2(src_start, dst_start) / reg_elem_size(dst);
/* Now, for that first conflicting half/full register, figure out the
* sub-instruction within assigner/consumer it corresponds to. For (r)
* sources, this should already return the correct answer of 0. However we
* have to special-case the multi-mov instructions, where the
* sub-instructions sometimes come from the src/dst indices instead.
*/
unsigned first_src_instr;
if (consumer->opc == OPC_SWZ || consumer->opc == OPC_GAT)
first_src_instr = consumer_n;
else
first_src_instr = first_num - src->num;
/* Now, for that first conflicting half/full register, figure out the
* sub-instruction within assigner/consumer it corresponds to. For (r)
* sources, this should already return the correct answer of 0. However we
* have to special-case the multi-mov instructions, where the
* sub-instructions sometimes come from the src/dst indices instead.
*/
unsigned first_src_instr;
if (consumer->opc == OPC_SWZ || consumer->opc == OPC_GAT)
first_src_instr = consumer_n;
else
first_src_instr = first_num - src->num;
unsigned first_dst_instr;
if (assigner->opc == OPC_SWZ || assigner->opc == OPC_SCT)
first_dst_instr = assigner_n;
else
first_dst_instr = first_num - dst->num;
unsigned first_dst_instr;
if (assigner->opc == OPC_SWZ || assigner->opc == OPC_SCT)
first_dst_instr = assigner_n;
else
first_dst_instr = first_num - dst->num;
/* The delay we return is relative to the *end* of assigner and the
* *beginning* of consumer, because it's the number of nops (or other
* things) needed between them. Any instructions after first_dst_instr
* subtract from the delay, and so do any instructions before
* first_src_instr. Calculate an offset to subtract from the non-rpt-aware
* delay to account for that.
*
* Now, a priori, we need to go through this process for every
* conflicting regnum and take the minimum of the offsets to make sure
* that the appropriate number of nop's is inserted for every conflicting
* pair of sub-instructions. However, as we go to the next conflicting
* regnum (if any), the number of instructions after first_dst_instr
* decreases by 1 and the number of source instructions before
* first_src_instr correspondingly increases by 1, so the offset stays the
* same for all conflicting registers.
*/
unsigned offset = first_src_instr + (assigner->repeat - first_dst_instr);
return offset > delay ? 0 : delay - offset;
/* The delay we return is relative to the *end* of assigner and the
* *beginning* of consumer, because it's the number of nops (or other
* things) needed between them. Any instructions after first_dst_instr
* subtract from the delay, and so do any instructions before
* first_src_instr. Calculate an offset to subtract from the non-rpt-aware
* delay to account for that.
*
* Now, a priori, we need to go through this process for every
* conflicting regnum and take the minimum of the offsets to make sure
* that the appropriate number of nop's is inserted for every conflicting
* pair of sub-instructions. However, as we go to the next conflicting
* regnum (if any), the number of instructions after first_dst_instr
* decreases by 1 and the number of source instructions before
* first_src_instr correspondingly increases by 1, so the offset stays the
* same for all conflicting registers.
*/
unsigned offset = first_src_instr + (assigner->repeat - first_dst_instr);
return offset > delay ? 0 : delay - offset;
}
static unsigned
delay_calc_postra(struct ir3_block *block,
struct ir3_instruction *start,
struct ir3_instruction *consumer,
unsigned distance, bool soft, bool pred, bool mergedregs)
delay_calc_postra(struct ir3_block *block, struct ir3_instruction *start,
struct ir3_instruction *consumer, unsigned distance,
bool soft, bool pred, bool mergedregs)
{
unsigned delay = 0;
/* Search backwards starting at the instruction before start, unless it's
* NULL then search backwards from the block end.
*/
struct list_head *start_list = start ? start->node.prev : block->instr_list.prev;
list_for_each_entry_from_rev(struct ir3_instruction, assigner, start_list, &block->instr_list, node) {
if (count_instruction(assigner))
distance += assigner->nop;
unsigned delay = 0;
/* Search backwards starting at the instruction before start, unless it's
* NULL then search backwards from the block end.
*/
struct list_head *start_list =
start ? start->node.prev : block->instr_list.prev;
list_for_each_entry_from_rev (struct ir3_instruction, assigner, start_list,
&block->instr_list, node) {
if (count_instruction(assigner))
distance += assigner->nop;
if (distance + delay >= (soft ? SOFT_SS_NOPS : MAX_NOPS))
return delay;
if (distance + delay >= (soft ? SOFT_SS_NOPS : MAX_NOPS))
return delay;
if (is_meta(assigner))
continue;
if (is_meta(assigner))
continue;
unsigned new_delay = 0;
unsigned new_delay = 0;
foreach_dst_n (dst, dst_n, assigner) {
if (dst->wrmask == 0)
continue;
foreach_src_n (src, src_n, consumer) {
if (src->flags & (IR3_REG_IMMED | IR3_REG_CONST))
continue;
foreach_dst_n (dst, dst_n, assigner) {
if (dst->wrmask == 0)
continue;
foreach_src_n (src, src_n, consumer) {
if (src->flags & (IR3_REG_IMMED | IR3_REG_CONST))
continue;
unsigned src_delay =
delay_calc_srcn_postra(assigner, consumer, dst_n,
src_n, soft, mergedregs);
new_delay = MAX2(new_delay, src_delay);
}
}
unsigned src_delay = delay_calc_srcn_postra(
assigner, consumer, dst_n, src_n, soft, mergedregs);
new_delay = MAX2(new_delay, src_delay);
}
}
new_delay = new_delay > distance ? new_delay - distance : 0;
delay = MAX2(delay, new_delay);
new_delay = new_delay > distance ? new_delay - distance : 0;
delay = MAX2(delay, new_delay);
if (count_instruction(assigner))
distance += 1 + assigner->repeat;
}
if (count_instruction(assigner))
distance += 1 + assigner->repeat;
}
/* Note: this allows recursion into "block" if it has already been
* visited, but *not* recursion into its predecessors. We may have to
* visit the original block twice, for the loop case where we have to
* consider definitions in an earlier iteration of the same loop:
*
* while (...) {
* mov.u32u32 ..., r0.x
* ...
* mov.u32u32 r0.x, ...
* }
*
* However any other recursion would be unnecessary.
*/
/* Note: this allows recursion into "block" if it has already been
* visited, but *not* recursion into its predecessors. We may have to
* visit the original block twice, for the loop case where we have to
* consider definitions in an earlier iteration of the same loop:
*
* while (...) {
* mov.u32u32 ..., r0.x
* ...
* mov.u32u32 r0.x, ...
* }
*
* However any other recursion would be unnecessary.
*/
if (pred && block->data != block) {
block->data = block;
if (pred && block->data != block) {
block->data = block;
for (unsigned i = 0; i < block->predecessors_count; i++) {
struct ir3_block *pred = block->predecessors[i];
unsigned pred_delay =
delay_calc_postra(pred, NULL, consumer, distance, soft, pred, mergedregs);
delay = MAX2(delay, pred_delay);
}
for (unsigned i = 0; i < block->predecessors_count; i++) {
struct ir3_block *pred = block->predecessors[i];
unsigned pred_delay = delay_calc_postra(pred, NULL, consumer, distance,
soft, pred, mergedregs);
delay = MAX2(delay, pred_delay);
}
block->data = NULL;
}
block->data = NULL;
}
return delay;
return delay;
}
/**
@ -392,9 +389,9 @@ delay_calc_postra(struct ir3_block *block,
*/
unsigned
ir3_delay_calc_postra(struct ir3_block *block, struct ir3_instruction *instr,
bool soft, bool mergedregs)
bool soft, bool mergedregs)
{
return delay_calc_postra(block, NULL, instr, 0, soft, false, mergedregs);
return delay_calc_postra(block, NULL, instr, 0, soft, false, mergedregs);
}
/**
@ -403,9 +400,9 @@ ir3_delay_calc_postra(struct ir3_block *block, struct ir3_instruction *instr,
*/
unsigned
ir3_delay_calc_exact(struct ir3_block *block, struct ir3_instruction *instr,
bool mergedregs)
bool mergedregs)
{
return delay_calc_postra(block, NULL, instr, 0, false, true, mergedregs);
return delay_calc_postra(block, NULL, instr, 0, false, true, mergedregs);
}
/**
@ -419,12 +416,11 @@ ir3_delay_calc_exact(struct ir3_block *block, struct ir3_instruction *instr,
void
ir3_remove_nops(struct ir3 *ir)
{
foreach_block (block, &ir->block_list) {
foreach_instr_safe (instr, &block->instr_list) {
if (instr->opc == OPC_NOP) {
list_del(&instr->node);
}
}
}
foreach_block (block, &ir->block_list) {
foreach_instr_safe (instr, &block->instr_list) {
if (instr->opc == OPC_NOP) {
list_del(&instr->node);
}
}
}
}


@ -48,183 +48,185 @@
void
ir3_disk_cache_init(struct ir3_compiler *compiler)
{
if (ir3_shader_debug & IR3_DBG_NOCACHE)
return;
if (ir3_shader_debug & IR3_DBG_NOCACHE)
return;
/* array length = print length + nul char + 1 extra to verify it's unused */
char renderer[7];
ASSERTED int len =
snprintf(renderer, sizeof(renderer), "FD%03d", compiler->gpu_id);
assert(len == sizeof(renderer) - 2);
/* array length = print length + nul char + 1 extra to verify it's unused */
char renderer[7];
ASSERTED int len =
snprintf(renderer, sizeof(renderer), "FD%03d", compiler->gpu_id);
assert(len == sizeof(renderer) - 2);
const struct build_id_note *note =
build_id_find_nhdr_for_addr(ir3_disk_cache_init);
assert(note && build_id_length(note) == 20); /* sha1 */
const struct build_id_note *note =
build_id_find_nhdr_for_addr(ir3_disk_cache_init);
assert(note && build_id_length(note) == 20); /* sha1 */
const uint8_t *id_sha1 = build_id_data(note);
assert(id_sha1);
const uint8_t *id_sha1 = build_id_data(note);
assert(id_sha1);
char timestamp[41];
_mesa_sha1_format(timestamp, id_sha1);
char timestamp[41];
_mesa_sha1_format(timestamp, id_sha1);
uint64_t driver_flags = ir3_shader_debug;
if (compiler->robust_ubo_access)
driver_flags |= IR3_DBG_ROBUST_UBO_ACCESS;
compiler->disk_cache = disk_cache_create(renderer, timestamp, driver_flags);
uint64_t driver_flags = ir3_shader_debug;
if (compiler->robust_ubo_access)
driver_flags |= IR3_DBG_ROBUST_UBO_ACCESS;
compiler->disk_cache = disk_cache_create(renderer, timestamp, driver_flags);
}
void
ir3_disk_cache_init_shader_key(struct ir3_compiler *compiler,
struct ir3_shader *shader)
struct ir3_shader *shader)
{
if (!compiler->disk_cache)
return;
if (!compiler->disk_cache)
return;
struct mesa_sha1 ctx;
struct mesa_sha1 ctx;
_mesa_sha1_init(&ctx);
_mesa_sha1_init(&ctx);
/* Serialize the NIR to a binary blob that we can hash for the disk
* cache. Drop unnecessary information (like variable names)
* so the serialized NIR is smaller, and also to let us detect more
* isomorphic shaders when hashing, increasing cache hits.
*/
struct blob blob;
blob_init(&blob);
nir_serialize(&blob, shader->nir, true);
_mesa_sha1_update(&ctx, blob.data, blob.size);
blob_finish(&blob);
/* Serialize the NIR to a binary blob that we can hash for the disk
* cache. Drop unnecessary information (like variable names)
* so the serialized NIR is smaller, and also to let us detect more
* isomorphic shaders when hashing, increasing cache hits.
*/
struct blob blob;
blob_init(&blob);
nir_serialize(&blob, shader->nir, true);
_mesa_sha1_update(&ctx, blob.data, blob.size);
blob_finish(&blob);
/* Note that on some gens stream-out is lowered in ir3 to stg. For later
* gens we maybe don't need to include stream-out in the cache key.
*/
_mesa_sha1_update(&ctx, &shader->stream_output, sizeof(shader->stream_output));
/* Note that on some gens stream-out is lowered in ir3 to stg. For later
* gens we maybe don't need to include stream-out in the cache key.
*/
_mesa_sha1_update(&ctx, &shader->stream_output,
sizeof(shader->stream_output));
_mesa_sha1_final(&ctx, shader->cache_key);
_mesa_sha1_final(&ctx, shader->cache_key);
}
static void
compute_variant_key(struct ir3_compiler *compiler,
struct ir3_shader_variant *v, cache_key cache_key)
compute_variant_key(struct ir3_compiler *compiler, struct ir3_shader_variant *v,
cache_key cache_key)
{
struct blob blob;
blob_init(&blob);
struct blob blob;
blob_init(&blob);
blob_write_bytes(&blob, &v->shader->cache_key, sizeof(v->shader->cache_key));
blob_write_bytes(&blob, &v->key, sizeof(v->key));
blob_write_uint8(&blob, v->binning_pass);
blob_write_bytes(&blob, &v->shader->cache_key, sizeof(v->shader->cache_key));
blob_write_bytes(&blob, &v->key, sizeof(v->key));
blob_write_uint8(&blob, v->binning_pass);
disk_cache_compute_key(compiler->disk_cache, blob.data, blob.size, cache_key);
disk_cache_compute_key(compiler->disk_cache, blob.data, blob.size,
cache_key);
blob_finish(&blob);
blob_finish(&blob);
}
static void
retrieve_variant(struct blob_reader *blob, struct ir3_shader_variant *v)
{
blob_copy_bytes(blob, VARIANT_CACHE_PTR(v), VARIANT_CACHE_SIZE);
blob_copy_bytes(blob, VARIANT_CACHE_PTR(v), VARIANT_CACHE_SIZE);
/*
* pointers need special handling:
*/
/*
* pointers need special handling:
*/
v->bin = rzalloc_size(v, v->info.size);
blob_copy_bytes(blob, v->bin, v->info.size);
v->bin = rzalloc_size(v, v->info.size);
blob_copy_bytes(blob, v->bin, v->info.size);
if (!v->binning_pass) {
blob_copy_bytes(blob, v->const_state, sizeof(*v->const_state));
unsigned immeds_sz = v->const_state->immediates_size *
sizeof(v->const_state->immediates[0]);
v->const_state->immediates = ralloc_size(v->const_state, immeds_sz);
blob_copy_bytes(blob, v->const_state->immediates, immeds_sz);
}
if (!v->binning_pass) {
blob_copy_bytes(blob, v->const_state, sizeof(*v->const_state));
unsigned immeds_sz = v->const_state->immediates_size *
sizeof(v->const_state->immediates[0]);
v->const_state->immediates = ralloc_size(v->const_state, immeds_sz);
blob_copy_bytes(blob, v->const_state->immediates, immeds_sz);
}
}
static void
store_variant(struct blob *blob, struct ir3_shader_variant *v)
{
blob_write_bytes(blob, VARIANT_CACHE_PTR(v), VARIANT_CACHE_SIZE);
blob_write_bytes(blob, VARIANT_CACHE_PTR(v), VARIANT_CACHE_SIZE);
/*
* pointers need special handling:
*/
/*
* pointers need special handling:
*/
blob_write_bytes(blob, v->bin, v->info.size);
blob_write_bytes(blob, v->bin, v->info.size);
/* No saving constant_data, it's already baked into bin at this point. */
/* No saving constant_data, it's already baked into bin at this point. */
if (!v->binning_pass) {
blob_write_bytes(blob, v->const_state, sizeof(*v->const_state));
unsigned immeds_sz = v->const_state->immediates_size *
sizeof(v->const_state->immediates[0]);
blob_write_bytes(blob, v->const_state->immediates, immeds_sz);
}
if (!v->binning_pass) {
blob_write_bytes(blob, v->const_state, sizeof(*v->const_state));
unsigned immeds_sz = v->const_state->immediates_size *
sizeof(v->const_state->immediates[0]);
blob_write_bytes(blob, v->const_state->immediates, immeds_sz);
}
}
bool
ir3_disk_cache_retrieve(struct ir3_compiler *compiler,
struct ir3_shader_variant *v)
struct ir3_shader_variant *v)
{
if (!compiler->disk_cache)
return false;
if (!compiler->disk_cache)
return false;
cache_key cache_key;
cache_key cache_key;
compute_variant_key(compiler, v, cache_key);
compute_variant_key(compiler, v, cache_key);
if (debug) {
char sha1[41];
_mesa_sha1_format(sha1, cache_key);
fprintf(stderr, "[mesa disk cache] retrieving variant %s: ", sha1);
}
if (debug) {
char sha1[41];
_mesa_sha1_format(sha1, cache_key);
fprintf(stderr, "[mesa disk cache] retrieving variant %s: ", sha1);
}
size_t size;
void *buffer = disk_cache_get(compiler->disk_cache, cache_key, &size);
size_t size;
void *buffer = disk_cache_get(compiler->disk_cache, cache_key, &size);
if (debug)
fprintf(stderr, "%s\n", buffer ? "found" : "missing");
if (debug)
fprintf(stderr, "%s\n", buffer ? "found" : "missing");
if (!buffer)
return false;
if (!buffer)
return false;
struct blob_reader blob;
blob_reader_init(&blob, buffer, size);
struct blob_reader blob;
blob_reader_init(&blob, buffer, size);
retrieve_variant(&blob, v);
retrieve_variant(&blob, v);
if (v->binning)
retrieve_variant(&blob, v->binning);
if (v->binning)
retrieve_variant(&blob, v->binning);
free(buffer);
free(buffer);
return true;
return true;
}
void
ir3_disk_cache_store(struct ir3_compiler *compiler,
struct ir3_shader_variant *v)
struct ir3_shader_variant *v)
{
if (!compiler->disk_cache)
return;
if (!compiler->disk_cache)
return;
cache_key cache_key;
cache_key cache_key;
compute_variant_key(compiler, v, cache_key);
compute_variant_key(compiler, v, cache_key);
if (debug) {
char sha1[41];
_mesa_sha1_format(sha1, cache_key);
fprintf(stderr, "[mesa disk cache] storing variant %s\n", sha1);
}
if (debug) {
char sha1[41];
_mesa_sha1_format(sha1, cache_key);
fprintf(stderr, "[mesa disk cache] storing variant %s\n", sha1);
}
struct blob blob;
blob_init(&blob);
struct blob blob;
blob_init(&blob);
store_variant(&blob, v);
store_variant(&blob, v);
if (v->binning)
store_variant(&blob, v->binning);
if (v->binning)
store_variant(&blob, v->binning);
disk_cache_put(compiler->disk_cache, cache_key, blob.data, blob.size, NULL);
blob_finish(&blob);
disk_cache_put(compiler->disk_cache, cache_key, blob.data, blob.size, NULL);
blob_finish(&blob);
}
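/* Minimal sketch of the intended call pattern; the wrapper and
 * do_compile_variant() are hypothetical, shown only to illustrate the
 * retrieve-before-compile / store-after-compile ordering:
 */
static void
compile_variant_with_cache(struct ir3_compiler *compiler,
                           struct ir3_shader_variant *v)
{
   if (ir3_disk_cache_retrieve(compiler, v))
      return; /* cache hit: v->bin, const_state, etc. already filled in */

   do_compile_variant(v); /* placeholder for the real compile path */

   ir3_disk_cache_store(compiler, v); /* publish the freshly compiled variant */
}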


@ -35,92 +35,91 @@
static struct ir3_block *
intersect(struct ir3_block *b1, struct ir3_block *b2)
{
while (b1 != b2) {
/*
* Note, the comparisons here are the opposite of what the paper says
* because we index blocks from beginning -> end (i.e. reverse
* post-order) instead of post-order like they assume.
*/
while (b1->index > b2->index)
b1 = b1->imm_dom;
while (b2->index > b1->index)
b2 = b2->imm_dom;
}
while (b1 != b2) {
/*
* Note, the comparisons here are the opposite of what the paper says
* because we index blocks from beginning -> end (i.e. reverse
* post-order) instead of post-order like they assume.
*/
while (b1->index > b2->index)
b1 = b1->imm_dom;
while (b2->index > b1->index)
b2 = b2->imm_dom;
}
return b1;
return b1;
}
static bool
calc_dominance(struct ir3_block *block)
{
struct ir3_block *new_idom = NULL;
for (unsigned i = 0; i < block->predecessors_count; i++) {
struct ir3_block *pred = block->predecessors[i];
struct ir3_block *new_idom = NULL;
for (unsigned i = 0; i < block->predecessors_count; i++) {
struct ir3_block *pred = block->predecessors[i];
if (pred->imm_dom) {
if (new_idom)
new_idom = intersect(pred, new_idom);
else
new_idom = pred;
}
}
if (pred->imm_dom) {
if (new_idom)
new_idom = intersect(pred, new_idom);
else
new_idom = pred;
}
}
if (block->imm_dom != new_idom) {
block->imm_dom = new_idom;
return true;
}
if (block->imm_dom != new_idom) {
block->imm_dom = new_idom;
return true;
}
return false;
return false;
}
static unsigned
calc_dfs_indices(struct ir3_block *block, unsigned index)
{
block->dom_pre_index = index++;
for (unsigned i = 0; i < block->dom_children_count; i++)
index = calc_dfs_indices(block->dom_children[i], index);
block->dom_post_index = index++;
return index;
block->dom_pre_index = index++;
for (unsigned i = 0; i < block->dom_children_count; i++)
index = calc_dfs_indices(block->dom_children[i], index);
block->dom_post_index = index++;
return index;
}
void
ir3_calc_dominance(struct ir3 *ir)
{
unsigned i = 0;
foreach_block (block, &ir->block_list) {
block->index = i++;
if (block == ir3_start_block(ir))
block->imm_dom = block;
else
block->imm_dom = NULL;
block->dom_children = NULL;
block->dom_children_count = block->dom_children_sz = 0;
}
unsigned i = 0;
foreach_block (block, &ir->block_list) {
block->index = i++;
if (block == ir3_start_block(ir))
block->imm_dom = block;
else
block->imm_dom = NULL;
block->dom_children = NULL;
block->dom_children_count = block->dom_children_sz = 0;
}
bool progress = true;
while (progress) {
progress = false;
foreach_block (block, &ir->block_list) {
if (block != ir3_start_block(ir))
progress |= calc_dominance(block);
}
}
bool progress = true;
while (progress) {
progress = false;
foreach_block (block, &ir->block_list) {
if (block != ir3_start_block(ir))
progress |= calc_dominance(block);
}
}
ir3_start_block(ir)->imm_dom = NULL;
ir3_start_block(ir)->imm_dom = NULL;
foreach_block (block, &ir->block_list) {
if (block->imm_dom)
array_insert(block->imm_dom, block->imm_dom->dom_children, block);
}
foreach_block (block, &ir->block_list) {
if (block->imm_dom)
array_insert(block->imm_dom, block->imm_dom->dom_children, block);
}
calc_dfs_indices(ir3_start_block(ir), 0);
calc_dfs_indices(ir3_start_block(ir), 0);
}
/* Return true if a dominates b. This includes if a == b. */
bool ir3_block_dominates(struct ir3_block *a, struct ir3_block *b)
bool
ir3_block_dominates(struct ir3_block *a, struct ir3_block *b)
{
return a->dom_pre_index <= b->dom_pre_index &&
a->dom_post_index >= b->dom_post_index;
return a->dom_pre_index <= b->dom_pre_index &&
a->dom_post_index >= b->dom_post_index;
}
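/* Worked example on a hypothetical dominator tree with start block A and
 * two children B and C:
 *
 *        A
 *       / \
 *      B   C
 *
 * calc_dfs_indices(A, 0) assigns:
 *    A: pre = 0, post = 5
 *    B: pre = 1, post = 2
 *    C: pre = 3, post = 4
 *
 * ir3_block_dominates(A, B): 0 <= 1 && 5 >= 2 -> true
 * ir3_block_dominates(B, C): 1 <= 3 && 2 >= 4 -> false
 * ir3_block_dominates(B, B): 1 <= 1 && 2 >= 2 -> true (a == b counts)
 */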


@ -26,7 +26,6 @@
#include "ir3_image.h"
/*
* SSBO/Image to/from IBO/tex hw mapping table:
*/
@ -34,57 +33,57 @@
void
ir3_ibo_mapping_init(struct ir3_ibo_mapping *mapping, unsigned num_textures)
{
memset(mapping, IBO_INVALID, sizeof(*mapping));
mapping->num_tex = 0;
mapping->tex_base = num_textures;
memset(mapping, IBO_INVALID, sizeof(*mapping));
mapping->num_tex = 0;
mapping->tex_base = num_textures;
}
struct ir3_instruction *
ir3_ssbo_to_ibo(struct ir3_context *ctx, nir_src src)
{
if (ir3_bindless_resource(src)) {
ctx->so->bindless_ibo = true;
return ir3_get_src(ctx, &src)[0];
} else {
/* can this be non-const buffer_index? how do we handle that? */
int ssbo_idx = nir_src_as_uint(src);
return create_immed(ctx->block, ssbo_idx);
}
if (ir3_bindless_resource(src)) {
ctx->so->bindless_ibo = true;
return ir3_get_src(ctx, &src)[0];
} else {
/* can this be non-const buffer_index? how do we handle that? */
int ssbo_idx = nir_src_as_uint(src);
return create_immed(ctx->block, ssbo_idx);
}
}
unsigned
ir3_ssbo_to_tex(struct ir3_ibo_mapping *mapping, unsigned ssbo)
{
if (mapping->ssbo_to_tex[ssbo] == IBO_INVALID) {
unsigned tex = mapping->num_tex++;
mapping->ssbo_to_tex[ssbo] = tex;
mapping->tex_to_image[tex] = IBO_SSBO | ssbo;
}
return mapping->ssbo_to_tex[ssbo] + mapping->tex_base;
if (mapping->ssbo_to_tex[ssbo] == IBO_INVALID) {
unsigned tex = mapping->num_tex++;
mapping->ssbo_to_tex[ssbo] = tex;
mapping->tex_to_image[tex] = IBO_SSBO | ssbo;
}
return mapping->ssbo_to_tex[ssbo] + mapping->tex_base;
}
struct ir3_instruction *
ir3_image_to_ibo(struct ir3_context *ctx, nir_src src)
{
if (ir3_bindless_resource(src)) {
ctx->so->bindless_ibo = true;
return ir3_get_src(ctx, &src)[0];
} else {
/* can this be non-const buffer_index? how do we handle that? */
int image_idx = nir_src_as_uint(src);
return create_immed(ctx->block, ctx->s->info.num_ssbos + image_idx);
}
if (ir3_bindless_resource(src)) {
ctx->so->bindless_ibo = true;
return ir3_get_src(ctx, &src)[0];
} else {
/* can this be non-const buffer_index? how do we handle that? */
int image_idx = nir_src_as_uint(src);
return create_immed(ctx->block, ctx->s->info.num_ssbos + image_idx);
}
}
unsigned
ir3_image_to_tex(struct ir3_ibo_mapping *mapping, unsigned image)
{
if (mapping->image_to_tex[image] == IBO_INVALID) {
unsigned tex = mapping->num_tex++;
mapping->image_to_tex[image] = tex;
mapping->tex_to_image[tex] = image;
}
return mapping->image_to_tex[image] + mapping->tex_base;
if (mapping->image_to_tex[image] == IBO_INVALID) {
unsigned tex = mapping->num_tex++;
mapping->image_to_tex[image] = tex;
mapping->tex_to_image[tex] = image;
}
return mapping->image_to_tex[image] + mapping->tex_base;
}
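/* Small usage sketch (illustrative values; assumes a shader with 4 regular
 * textures that then references SSBO 0 and image 0, in that order):
 */
struct ir3_ibo_mapping m;
ir3_ibo_mapping_init(&m, 4); /* tex_base = 4, no IBO slots allocated yet */

unsigned a = ir3_ssbo_to_tex(&m, 0);  /* first IBO user -> slot 0 + tex_base = 4 */
unsigned b = ir3_image_to_tex(&m, 0); /* next distinct user -> slot 1 + tex_base = 5 */
unsigned c = ir3_ssbo_to_tex(&m, 0);  /* repeated lookup is stable -> still 4 */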
/* see tex_info() for equiv logic for texture instructions.. it would be
@ -93,87 +92,87 @@ ir3_image_to_tex(struct ir3_ibo_mapping *mapping, unsigned image)
unsigned
ir3_get_image_coords(const nir_intrinsic_instr *instr, unsigned *flagsp)
{
unsigned coords = nir_image_intrinsic_coord_components(instr);
unsigned flags = 0;
unsigned coords = nir_image_intrinsic_coord_components(instr);
unsigned flags = 0;
if (coords == 3)
flags |= IR3_INSTR_3D;
if (coords == 3)
flags |= IR3_INSTR_3D;
if (nir_intrinsic_image_array(instr))
flags |= IR3_INSTR_A;
if (nir_intrinsic_image_array(instr))
flags |= IR3_INSTR_A;
if (flagsp)
*flagsp = flags;
if (flagsp)
*flagsp = flags;
return coords;
return coords;
}
type_t
ir3_get_type_for_image_intrinsic(const nir_intrinsic_instr *instr)
{
const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
int bit_size = info->has_dest ? nir_dest_bit_size(instr->dest) : 32;
const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
int bit_size = info->has_dest ? nir_dest_bit_size(instr->dest) : 32;
nir_alu_type type = nir_type_uint;
switch (instr->intrinsic) {
case nir_intrinsic_image_load:
case nir_intrinsic_bindless_image_load:
type = nir_alu_type_get_base_type(nir_intrinsic_dest_type(instr));
/* SpvOpAtomicLoad doesn't have dest type */
if (type == nir_type_invalid)
type = nir_type_uint;
break;
nir_alu_type type = nir_type_uint;
switch (instr->intrinsic) {
case nir_intrinsic_image_load:
case nir_intrinsic_bindless_image_load:
type = nir_alu_type_get_base_type(nir_intrinsic_dest_type(instr));
/* SpvOpAtomicLoad doesn't have dest type */
if (type == nir_type_invalid)
type = nir_type_uint;
break;
case nir_intrinsic_image_store:
case nir_intrinsic_bindless_image_store:
type = nir_alu_type_get_base_type(nir_intrinsic_src_type(instr));
/* SpvOpAtomicStore doesn't have src type */
if (type == nir_type_invalid)
type = nir_type_uint;
break;
case nir_intrinsic_image_store:
case nir_intrinsic_bindless_image_store:
type = nir_alu_type_get_base_type(nir_intrinsic_src_type(instr));
/* SpvOpAtomicStore doesn't have src type */
if (type == nir_type_invalid)
type = nir_type_uint;
break;
case nir_intrinsic_image_atomic_add:
case nir_intrinsic_bindless_image_atomic_add:
case nir_intrinsic_image_atomic_umin:
case nir_intrinsic_bindless_image_atomic_umin:
case nir_intrinsic_image_atomic_umax:
case nir_intrinsic_bindless_image_atomic_umax:
case nir_intrinsic_image_atomic_and:
case nir_intrinsic_bindless_image_atomic_and:
case nir_intrinsic_image_atomic_or:
case nir_intrinsic_bindless_image_atomic_or:
case nir_intrinsic_image_atomic_xor:
case nir_intrinsic_bindless_image_atomic_xor:
case nir_intrinsic_image_atomic_exchange:
case nir_intrinsic_bindless_image_atomic_exchange:
case nir_intrinsic_image_atomic_comp_swap:
case nir_intrinsic_bindless_image_atomic_comp_swap:
case nir_intrinsic_image_atomic_inc_wrap:
case nir_intrinsic_bindless_image_atomic_inc_wrap:
type = nir_type_uint;
break;
case nir_intrinsic_image_atomic_add:
case nir_intrinsic_bindless_image_atomic_add:
case nir_intrinsic_image_atomic_umin:
case nir_intrinsic_bindless_image_atomic_umin:
case nir_intrinsic_image_atomic_umax:
case nir_intrinsic_bindless_image_atomic_umax:
case nir_intrinsic_image_atomic_and:
case nir_intrinsic_bindless_image_atomic_and:
case nir_intrinsic_image_atomic_or:
case nir_intrinsic_bindless_image_atomic_or:
case nir_intrinsic_image_atomic_xor:
case nir_intrinsic_bindless_image_atomic_xor:
case nir_intrinsic_image_atomic_exchange:
case nir_intrinsic_bindless_image_atomic_exchange:
case nir_intrinsic_image_atomic_comp_swap:
case nir_intrinsic_bindless_image_atomic_comp_swap:
case nir_intrinsic_image_atomic_inc_wrap:
case nir_intrinsic_bindless_image_atomic_inc_wrap:
type = nir_type_uint;
break;
case nir_intrinsic_image_atomic_imin:
case nir_intrinsic_bindless_image_atomic_imin:
case nir_intrinsic_image_atomic_imax:
case nir_intrinsic_bindless_image_atomic_imax:
type = nir_type_int;
break;
case nir_intrinsic_image_atomic_imin:
case nir_intrinsic_bindless_image_atomic_imin:
case nir_intrinsic_image_atomic_imax:
case nir_intrinsic_bindless_image_atomic_imax:
type = nir_type_int;
break;
default:
unreachable("Unhandled NIR image intrinsic");
}
default:
unreachable("Unhandled NIR image intrinsic");
}
switch (type) {
case nir_type_uint:
return bit_size == 16 ? TYPE_U16 : TYPE_U32;
case nir_type_int:
return bit_size == 16 ? TYPE_S16 : TYPE_S32;
case nir_type_float:
return bit_size == 16 ? TYPE_F16 : TYPE_F32;
default:
unreachable("bad type");
}
switch (type) {
case nir_type_uint:
return bit_size == 16 ? TYPE_U16 : TYPE_U32;
case nir_type_int:
return bit_size == 16 ? TYPE_S16 : TYPE_S32;
case nir_type_float:
return bit_size == 16 ? TYPE_F16 : TYPE_F32;
default:
unreachable("bad type");
}
}
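A small standalone sketch of the mapping above, using plain C strings rather than the real ir3 type_t values (illustrative only): the base type comes from the intrinsic, the width from the NIR bit size, and the atomics always resolve to an integer base type.

#include <assert.h>
#include <string.h>

/* Toy model only: mirrors the base-type/bit-size selection above. */
static const char *
image_type_name(char base /* 'u', 'i' or 'f' */, int bit_size)
{
   if (base == 'u')
      return bit_size == 16 ? "TYPE_U16" : "TYPE_U32";
   if (base == 'i')
      return bit_size == 16 ? "TYPE_S16" : "TYPE_S32";
   return bit_size == 16 ? "TYPE_F16" : "TYPE_F32";
}

int
main(void)
{
   assert(!strcmp(image_type_name('f', 16), "TYPE_F16")); /* fp16 image load  */
   assert(!strcmp(image_type_name('u', 32), "TYPE_U32")); /* image atomic add */
   return 0;
}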
/* Returns the number of components for the different image formats
@ -183,8 +182,8 @@ ir3_get_type_for_image_intrinsic(const nir_intrinsic_instr *instr)
unsigned
ir3_get_num_components_for_image_format(enum pipe_format format)
{
if (format == PIPE_FORMAT_NONE)
return 4;
else
return util_format_get_nr_components(format);
if (format == PIPE_FORMAT_NONE)
return 4;
else
return util_format_get_nr_components(format);
}

@ -29,14 +29,15 @@
#include "ir3_context.h"
void ir3_ibo_mapping_init(struct ir3_ibo_mapping *mapping, unsigned num_textures);
void ir3_ibo_mapping_init(struct ir3_ibo_mapping *mapping,
unsigned num_textures);
struct ir3_instruction *ir3_ssbo_to_ibo(struct ir3_context *ctx, nir_src src);
unsigned ir3_ssbo_to_tex(struct ir3_ibo_mapping *mapping, unsigned ssbo);
struct ir3_instruction *ir3_image_to_ibo(struct ir3_context *ctx, nir_src src);
unsigned ir3_image_to_tex(struct ir3_ibo_mapping *mapping, unsigned image);
unsigned ir3_get_image_coords(const nir_intrinsic_instr *instr, unsigned *flagsp);
unsigned ir3_get_image_coords(const nir_intrinsic_instr *instr,
unsigned *flagsp);
type_t ir3_get_type_for_image_intrinsic(const nir_intrinsic_instr *instr);
unsigned ir3_get_num_components_for_image_format(enum pipe_format);

File diff suppressed because it is too large
@ -37,127 +37,130 @@
static bool
compute_block_liveness(struct ir3_liveness *live, struct ir3_block *block,
BITSET_WORD *tmp_live, unsigned bitset_words)
BITSET_WORD *tmp_live, unsigned bitset_words)
{
memcpy(tmp_live, live->live_out[block->index], bitset_words *
sizeof(BITSET_WORD));
memcpy(tmp_live, live->live_out[block->index],
bitset_words * sizeof(BITSET_WORD));
/* Process instructions */
foreach_instr_rev (instr, &block->instr_list) {
ra_foreach_dst(dst, instr) {
if (BITSET_TEST(tmp_live, dst->name))
dst->flags &= ~IR3_REG_UNUSED;
else
dst->flags |= IR3_REG_UNUSED;
BITSET_CLEAR(tmp_live, dst->name);
}
/* Process instructions */
foreach_instr_rev (instr, &block->instr_list) {
ra_foreach_dst (dst, instr) {
if (BITSET_TEST(tmp_live, dst->name))
dst->flags &= ~IR3_REG_UNUSED;
else
dst->flags |= IR3_REG_UNUSED;
BITSET_CLEAR(tmp_live, dst->name);
}
/* Phi node uses occur after the predecessor block */
if (instr->opc != OPC_META_PHI) {
ra_foreach_src(src, instr) {
if (BITSET_TEST(tmp_live, src->def->name))
src->flags &= ~IR3_REG_KILL;
else
src->flags |= IR3_REG_KILL;
}
/* Phi node uses occur after the predecessor block */
if (instr->opc != OPC_META_PHI) {
ra_foreach_src (src, instr) {
if (BITSET_TEST(tmp_live, src->def->name))
src->flags &= ~IR3_REG_KILL;
else
src->flags |= IR3_REG_KILL;
}
ra_foreach_src(src, instr) {
if (BITSET_TEST(tmp_live, src->def->name))
src->flags &= ~IR3_REG_FIRST_KILL;
else
src->flags |= IR3_REG_FIRST_KILL;
BITSET_SET(tmp_live, src->def->name);
}
}
}
ra_foreach_src (src, instr) {
if (BITSET_TEST(tmp_live, src->def->name))
src->flags &= ~IR3_REG_FIRST_KILL;
else
src->flags |= IR3_REG_FIRST_KILL;
BITSET_SET(tmp_live, src->def->name);
}
}
}
memcpy(live->live_in[block->index], tmp_live,
bitset_words * sizeof(BITSET_WORD));
memcpy(live->live_in[block->index], tmp_live,
bitset_words * sizeof(BITSET_WORD));
bool progress = false;
for (unsigned i = 0; i < block->predecessors_count; i++) {
const struct ir3_block *pred = block->predecessors[i];
for (unsigned j = 0; j < bitset_words; j++) {
if (tmp_live[j] & ~live->live_out[pred->index][j])
progress = true;
live->live_out[pred->index][j] |= tmp_live[j];
}
bool progress = false;
for (unsigned i = 0; i < block->predecessors_count; i++) {
const struct ir3_block *pred = block->predecessors[i];
for (unsigned j = 0; j < bitset_words; j++) {
if (tmp_live[j] & ~live->live_out[pred->index][j])
progress = true;
live->live_out[pred->index][j] |= tmp_live[j];
}
/* Process phi sources. */
foreach_instr (phi, &block->instr_list) {
if (phi->opc != OPC_META_PHI)
break;
if (!phi->srcs[i]->def)
continue;
unsigned name = phi->srcs[i]->def->name;
if (!BITSET_TEST(live->live_out[pred->index], name)) {
progress = true;
BITSET_SET(live->live_out[pred->index], name);
}
}
}
/* Process phi sources. */
foreach_instr (phi, &block->instr_list) {
if (phi->opc != OPC_META_PHI)
break;
if (!phi->srcs[i]->def)
continue;
unsigned name = phi->srcs[i]->def->name;
if (!BITSET_TEST(live->live_out[pred->index], name)) {
progress = true;
BITSET_SET(live->live_out[pred->index], name);
}
}
}
for (unsigned i = 0; i < block->physical_predecessors_count; i++) {
const struct ir3_block *pred = block->physical_predecessors[i];
unsigned name;
BITSET_FOREACH_SET(name, tmp_live, live->definitions_count) {
struct ir3_register *reg = live->definitions[name];
if (!(reg->flags & IR3_REG_SHARED))
continue;
if (!BITSET_TEST(live->live_out[pred->index], name)) {
progress = true;
BITSET_SET(live->live_out[pred->index], name);
}
}
}
for (unsigned i = 0; i < block->physical_predecessors_count; i++) {
const struct ir3_block *pred = block->physical_predecessors[i];
unsigned name;
BITSET_FOREACH_SET (name, tmp_live, live->definitions_count) {
struct ir3_register *reg = live->definitions[name];
if (!(reg->flags & IR3_REG_SHARED))
continue;
if (!BITSET_TEST(live->live_out[pred->index], name)) {
progress = true;
BITSET_SET(live->live_out[pred->index], name);
}
}
}
return progress;
return progress;
}
struct ir3_liveness *ir3_calc_liveness(struct ir3_shader_variant *v)
struct ir3_liveness *
ir3_calc_liveness(struct ir3_shader_variant *v)
{
struct ir3_liveness *live = rzalloc(NULL, struct ir3_liveness);
struct ir3_liveness *live = rzalloc(NULL, struct ir3_liveness);
/* Reserve name 0 to mean "doesn't have a name yet" to make the debug
* output nicer.
*/
array_insert(live, live->definitions, NULL);
/* Reserve name 0 to mean "doesn't have a name yet" to make the debug
* output nicer.
*/
array_insert(live, live->definitions, NULL);
/* Build definition <-> name mapping */
unsigned block_count = 0;
foreach_block (block, &v->ir->block_list) {
block->index = block_count++;
foreach_instr (instr, &block->instr_list) {
ra_foreach_dst(dst, instr) {
dst->name = live->definitions_count;
array_insert(live, live->definitions, dst);
}
}
}
/* Build definition <-> name mapping */
unsigned block_count = 0;
foreach_block (block, &v->ir->block_list) {
block->index = block_count++;
foreach_instr (instr, &block->instr_list) {
ra_foreach_dst (dst, instr) {
dst->name = live->definitions_count;
array_insert(live, live->definitions, dst);
}
}
}
live->block_count = block_count;
live->block_count = block_count;
unsigned bitset_words = BITSET_WORDS(live->definitions_count);
BITSET_WORD *tmp_live = ralloc_array(live, BITSET_WORD, bitset_words);
live->live_in = ralloc_array(live, BITSET_WORD *, block_count);
live->live_out = ralloc_array(live, BITSET_WORD *, block_count);
unsigned i = 0;
foreach_block (block, &v->ir->block_list) {
block->index = i++;
live->live_in[block->index] = rzalloc_array(live, BITSET_WORD, bitset_words);
live->live_out[block->index] = rzalloc_array(live, BITSET_WORD, bitset_words);
}
unsigned bitset_words = BITSET_WORDS(live->definitions_count);
BITSET_WORD *tmp_live = ralloc_array(live, BITSET_WORD, bitset_words);
live->live_in = ralloc_array(live, BITSET_WORD *, block_count);
live->live_out = ralloc_array(live, BITSET_WORD *, block_count);
unsigned i = 0;
foreach_block (block, &v->ir->block_list) {
block->index = i++;
live->live_in[block->index] =
rzalloc_array(live, BITSET_WORD, bitset_words);
live->live_out[block->index] =
rzalloc_array(live, BITSET_WORD, bitset_words);
}
bool progress = true;
while (progress) {
progress = false;
foreach_block_rev (block, &v->ir->block_list) {
progress |=
compute_block_liveness(live, block, tmp_live, bitset_words);
}
}
bool progress = true;
while (progress) {
progress = false;
foreach_block_rev (block, &v->ir->block_list) {
progress |=
compute_block_liveness(live, block, tmp_live, bitset_words);
}
}
return live;
return live;
}
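A minimal standalone sketch of the same backward fixed point on a toy three-block CFG, assuming plain C and ignoring phis, physical edges and shared registers (values are bits in a uint32_t; this pulls live-out from successors, which is the equivalent formulation of pushing live-in into predecessors as above).

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NBLOCKS 3

struct toy_block {
   uint32_t def, use;     /* values defined / used in this block */
   int succ[2], nsucc;    /* successor block indices */
   uint32_t live_in, live_out;
};

static void
toy_liveness(struct toy_block *b)
{
   bool progress = true;
   while (progress) {
      progress = false;
      for (int i = NBLOCKS - 1; i >= 0; i--) {
         uint32_t out = 0;
         for (int s = 0; s < b[i].nsucc; s++)
            out |= b[b[i].succ[s]].live_in;
         uint32_t in = b[i].use | (out & ~b[i].def);
         if (in != b[i].live_in || out != b[i].live_out)
            progress = true;
         b[i].live_in = in;
         b[i].live_out = out;
      }
   }
}

int
main(void)
{
   /* 0 -> 1 -> 2, value 0 defined in block 0 and used in block 2 */
   struct toy_block b[NBLOCKS] = {
      {.def = 1u << 0, .use = 0, .succ = {1}, .nsucc = 1},
      {.def = 0, .use = 0, .succ = {2}, .nsucc = 1},
      {.def = 0, .use = 1u << 0, .nsucc = 0},
   };
   toy_liveness(b);
   printf("live_out(b0)=0x%x live_in(b2)=0x%x\n",
          (unsigned)b[0].live_out, (unsigned)b[2].live_in);
   return 0;
}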
/* Return true if "def" is live after "instr". It's assumed that "def"
@ -165,32 +168,31 @@ struct ir3_liveness *ir3_calc_liveness(struct ir3_shader_variant *v)
*/
bool
ir3_def_live_after(struct ir3_liveness *live, struct ir3_register *def,
struct ir3_instruction *instr)
struct ir3_instruction *instr)
{
/* If it's live out then it's definitely live at the instruction. */
if (BITSET_TEST(live->live_out[instr->block->index], def->name))
return true;
/* If it's live out then it's definitely live at the instruction. */
if (BITSET_TEST(live->live_out[instr->block->index], def->name))
return true;
/* If it's not live in and not defined in the same block then the live
* range can't extend to the instruction.
*/
if (def->instr->block != instr->block &&
!BITSET_TEST(live->live_in[instr->block->index], def->name))
return false;
/* If it's not live in and not defined in the same block then the live
* range can't extend to the instruction.
*/
if (def->instr->block != instr->block &&
!BITSET_TEST(live->live_in[instr->block->index], def->name))
return false;
/* Ok, now comes the tricky case, where "def" is killed somewhere in
* "instr"'s block and we have to check if it's before or after.
*/
foreach_instr_rev (test_instr, &instr->block->instr_list) {
if (test_instr == instr)
break;
/* Ok, now comes the tricky case, where "def" is killed somewhere in
* "instr"'s block and we have to check if it's before or after.
*/
foreach_instr_rev (test_instr, &instr->block->instr_list) {
if (test_instr == instr)
break;
for (unsigned i = 0; i < test_instr->srcs_count; i++) {
if (test_instr->srcs[i]->def == def)
return true;
}
}
for (unsigned i = 0; i < test_instr->srcs_count; i++) {
if (test_instr->srcs[i]->def == def)
return true;
}
}
return false;
return false;
}

@ -25,524 +25,542 @@
#include "ir3_shader.h"
struct copy_src {
unsigned flags;
union {
uint32_t imm;
physreg_t reg;
unsigned const_num;
};
unsigned flags;
union {
uint32_t imm;
physreg_t reg;
unsigned const_num;
};
};
struct copy_entry {
physreg_t dst;
unsigned flags;
bool done;
physreg_t dst;
unsigned flags;
bool done;
struct copy_src src;
struct copy_src src;
};
static unsigned
copy_entry_size(const struct copy_entry *entry)
{
return (entry->flags & IR3_REG_HALF) ? 1 : 2;
return (entry->flags & IR3_REG_HALF) ? 1 : 2;
}
static struct copy_src
get_copy_src(const struct ir3_register *reg, unsigned offset)
{
if (reg->flags & IR3_REG_IMMED) {
return (struct copy_src) {
.flags = IR3_REG_IMMED,
.imm = reg->uim_val,
};
} else if (reg->flags & IR3_REG_CONST) {
return (struct copy_src) {
.flags = IR3_REG_CONST,
.const_num = reg->num,
};
} else {
return (struct copy_src) {
.flags = 0,
.reg = ra_reg_get_physreg(reg) + offset,
};
}
if (reg->flags & IR3_REG_IMMED) {
return (struct copy_src){
.flags = IR3_REG_IMMED,
.imm = reg->uim_val,
};
} else if (reg->flags & IR3_REG_CONST) {
return (struct copy_src){
.flags = IR3_REG_CONST,
.const_num = reg->num,
};
} else {
return (struct copy_src){
.flags = 0,
.reg = ra_reg_get_physreg(reg) + offset,
};
}
}
static void
do_xor(struct ir3_instruction *instr, unsigned dst_num, unsigned src1_num, unsigned src2_num, unsigned flags)
do_xor(struct ir3_instruction *instr, unsigned dst_num, unsigned src1_num,
unsigned src2_num, unsigned flags)
{
struct ir3_instruction *xor = ir3_instr_create(instr->block, OPC_XOR_B, 1, 2);
ir3_dst_create(xor, dst_num, flags);
ir3_src_create(xor, src1_num, flags);
ir3_src_create(xor, src2_num, flags);
struct ir3_instruction * xor
= ir3_instr_create(instr->block, OPC_XOR_B, 1, 2);
ir3_dst_create(xor, dst_num, flags);
ir3_src_create(xor, src1_num, flags);
ir3_src_create(xor, src2_num, flags);
ir3_instr_move_before(xor, instr);
ir3_instr_move_before(xor, instr);
}
static void
do_swap(struct ir3_compiler *compiler, struct ir3_instruction *instr,
const struct copy_entry *entry)
const struct copy_entry *entry)
{
assert(!entry->src.flags);
assert(!entry->src.flags);
if (entry->flags & IR3_REG_HALF) {
/* We currently make sure to never emit parallel copies where the
* source/destination is a half-reg above the range accessible to half
* registers. However, when a full-reg source overlaps a half-reg
* destination or vice versa, it can be very, very complicated to come
* up with a series of "legal" swaps and copies to resolve the
* parallel copy. So here we provide a fallback to implement the
* "illegal" swap instead. This may also be useful for implementing
* "spilling" half-regs to the inaccessible space.
*/
if (entry->src.reg >= RA_HALF_SIZE) {
/* Choose a temporary that doesn't overlap src or dst */
physreg_t tmp = entry->dst < 2 ? 2 : 0;
if (entry->flags & IR3_REG_HALF) {
/* We currently make sure to never emit parallel copies where the
* source/destination is a half-reg above the range accessible to half
* registers. However, when a full-reg source overlaps a half-reg
* destination or vice versa, it can be very, very complicated to come
* up with a series of "legal" swaps and copies to resolve the
* parallel copy. So here we provide a fallback to implement the
* "illegal" swap instead. This may also be useful for implementing
* "spilling" half-regs to the inaccessible space.
*/
if (entry->src.reg >= RA_HALF_SIZE) {
/* Choose a temporary that doesn't overlap src or dst */
physreg_t tmp = entry->dst < 2 ? 2 : 0;
/* Swap src and the temporary */
do_swap(compiler, instr, &(struct copy_entry) {
.src = { .reg = entry->src.reg & ~1u },
.dst = tmp,
.flags = entry->flags & ~IR3_REG_HALF,
});
/* Swap src and the temporary */
do_swap(compiler, instr,
&(struct copy_entry){
.src = {.reg = entry->src.reg & ~1u},
.dst = tmp,
.flags = entry->flags & ~IR3_REG_HALF,
});
/* Do the original swap with src replaced with tmp */
do_swap(compiler, instr, &(struct copy_entry) {
.src = { .reg = tmp + (entry->src.reg & 1) },
.dst = entry->dst,
.flags = entry->flags,
});
/* Do the original swap with src replaced with tmp */
do_swap(compiler, instr,
&(struct copy_entry){
.src = {.reg = tmp + (entry->src.reg & 1)},
.dst = entry->dst,
.flags = entry->flags,
});
/* Swap src and the temporary back */
do_swap(compiler, instr, &(struct copy_entry) {
.src = { .reg = entry->src.reg & ~1u },
.dst = tmp,
.flags = entry->flags & ~IR3_REG_HALF,
});
return;
}
/* Swap src and the temporary back */
do_swap(compiler, instr,
&(struct copy_entry){
.src = {.reg = entry->src.reg & ~1u},
.dst = tmp,
.flags = entry->flags & ~IR3_REG_HALF,
});
return;
}
/* If dst is not addressable, we only need to swap the arguments and
* let the case above handle it.
*/
if (entry->dst >= RA_HALF_SIZE) {
do_swap(compiler, instr, &(struct copy_entry) {
.src = { .reg = entry->dst },
.dst = entry->src.reg,
.flags = entry->flags,
});
return;
}
}
/* If dst is not addressable, we only need to swap the arguments and
* let the case above handle it.
*/
if (entry->dst >= RA_HALF_SIZE) {
do_swap(compiler, instr,
&(struct copy_entry){
.src = {.reg = entry->dst},
.dst = entry->src.reg,
.flags = entry->flags,
});
return;
}
}
unsigned src_num = ra_physreg_to_num(entry->src.reg, entry->flags);
unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);
unsigned src_num = ra_physreg_to_num(entry->src.reg, entry->flags);
unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);
/* a5xx+ is known to support swz, which enables us to swap two registers
* in-place. If unsupported we emulate it using the xor trick.
*/
if (compiler->gpu_id < 500) {
/* Shared regs only exist since a5xx, so we don't have to provide a
* fallback path for them.
*/
assert(!(entry->flags & IR3_REG_SHARED));
do_xor(instr, dst_num, dst_num, src_num, entry->flags);
do_xor(instr, src_num, src_num, dst_num, entry->flags);
do_xor(instr, dst_num, dst_num, src_num, entry->flags);
} else {
/* Use a macro for shared regs because any shared reg writes need to
* be wrapped in a getone block to work correctly. Writing shared regs
* with multiple threads active does not work, even if they all return
* the same value.
*/
unsigned opc = (entry->flags & IR3_REG_SHARED) ? OPC_SWZ_SHARED_MACRO : OPC_SWZ;
struct ir3_instruction *swz = ir3_instr_create(instr->block, opc, 2, 2);
ir3_dst_create(swz, dst_num, entry->flags);
ir3_dst_create(swz, src_num, entry->flags);
ir3_src_create(swz, src_num, entry->flags);
ir3_src_create(swz, dst_num, entry->flags);
swz->cat1.dst_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
swz->cat1.src_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
swz->repeat = 1;
ir3_instr_move_before(swz, instr);
}
/* a5xx+ is known to support swz, which enables us to swap two registers
* in-place. If unsupported we emulate it using the xor trick.
*/
if (compiler->gpu_id < 500) {
/* Shared regs only exist since a5xx, so we don't have to provide a
* fallback path for them.
*/
assert(!(entry->flags & IR3_REG_SHARED));
do_xor(instr, dst_num, dst_num, src_num, entry->flags);
do_xor(instr, src_num, src_num, dst_num, entry->flags);
do_xor(instr, dst_num, dst_num, src_num, entry->flags);
} else {
/* Use a macro for shared regs because any shared reg writes need to
* be wrapped in a getone block to work correctly. Writing shared regs
* with multiple threads active does not work, even if they all return
* the same value.
*/
unsigned opc =
(entry->flags & IR3_REG_SHARED) ? OPC_SWZ_SHARED_MACRO : OPC_SWZ;
struct ir3_instruction *swz = ir3_instr_create(instr->block, opc, 2, 2);
ir3_dst_create(swz, dst_num, entry->flags);
ir3_dst_create(swz, src_num, entry->flags);
ir3_src_create(swz, src_num, entry->flags);
ir3_src_create(swz, dst_num, entry->flags);
swz->cat1.dst_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
swz->cat1.src_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
swz->repeat = 1;
ir3_instr_move_before(swz, instr);
}
}
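For the pre-a5xx path, the three do_xor() calls are the classic in-place xor swap; a standalone check of the identity, assuming the two registers are distinct (which the parallel-copy entries guarantee, since destinations never overlap):

#include <assert.h>
#include <stdint.h>

static void
xor_swap(uint32_t *dst, uint32_t *src)
{
   *dst ^= *src;   /* dst = d0 ^ s0             */
   *src ^= *dst;   /* src = s0 ^ (d0 ^ s0) = d0 */
   *dst ^= *src;   /* dst = (d0 ^ s0) ^ d0 = s0 */
}

int
main(void)
{
   uint32_t d = 0x1234, s = 0xabcd;
   xor_swap(&d, &s);
   assert(d == 0xabcd && s == 0x1234);
   return 0;
}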
static void
do_copy(struct ir3_compiler *compiler, struct ir3_instruction *instr,
const struct copy_entry *entry)
const struct copy_entry *entry)
{
if (entry->flags & IR3_REG_HALF) {
/* See do_swap() for why this is here. */
if (entry->dst >= RA_HALF_SIZE) {
/* TODO: is there a hw instruction we can use for this case? */
physreg_t tmp = !entry->src.flags && entry->src.reg < 2 ? 2 : 0;
if (entry->flags & IR3_REG_HALF) {
/* See do_swap() for why this is here. */
if (entry->dst >= RA_HALF_SIZE) {
/* TODO: is there a hw instruction we can use for this case? */
physreg_t tmp = !entry->src.flags && entry->src.reg < 2 ? 2 : 0;
do_swap(compiler, instr, &(struct copy_entry) {
.src = { .reg = entry->dst & ~1u },
.dst = tmp,
.flags = entry->flags & ~IR3_REG_HALF,
});
do_swap(compiler, instr,
&(struct copy_entry){
.src = {.reg = entry->dst & ~1u},
.dst = tmp,
.flags = entry->flags & ~IR3_REG_HALF,
});
do_copy(compiler, instr, &(struct copy_entry) {
.src = entry->src,
.dst = tmp + (entry->dst & 1),
.flags = entry->flags,
});
do_copy(compiler, instr,
&(struct copy_entry){
.src = entry->src,
.dst = tmp + (entry->dst & 1),
.flags = entry->flags,
});
do_swap(compiler, instr, &(struct copy_entry) {
.src = { .reg = entry->dst & ~1u },
.dst = tmp,
.flags = entry->flags & ~IR3_REG_HALF,
});
return;
}
do_swap(compiler, instr,
&(struct copy_entry){
.src = {.reg = entry->dst & ~1u},
.dst = tmp,
.flags = entry->flags & ~IR3_REG_HALF,
});
return;
}
if (!entry->src.flags && entry->src.reg >= RA_HALF_SIZE) {
unsigned src_num =
ra_physreg_to_num(entry->src.reg & ~1u, entry->flags & ~IR3_REG_HALF);
unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);
if (!entry->src.flags && entry->src.reg >= RA_HALF_SIZE) {
unsigned src_num = ra_physreg_to_num(entry->src.reg & ~1u,
entry->flags & ~IR3_REG_HALF);
unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);
if (entry->src.reg % 2 == 0) {
/* cov.u32u16 dst, src */
struct ir3_instruction *cov = ir3_instr_create(instr->block, OPC_MOV, 1, 1);
ir3_dst_create(cov, dst_num, entry->flags);
ir3_src_create(cov, src_num, entry->flags & ~IR3_REG_HALF);
cov->cat1.dst_type = TYPE_U16;
cov->cat1.src_type = TYPE_U32;
ir3_instr_move_before(cov, instr);
} else {
/* shr.b dst, src, h(16) */
struct ir3_instruction *shr = ir3_instr_create(instr->block, OPC_SHR_B, 1, 2);
ir3_dst_create(shr, dst_num, entry->flags);
ir3_src_create(shr, src_num, entry->flags & ~IR3_REG_HALF);
ir3_src_create(shr, 0, entry->flags | IR3_REG_IMMED)->uim_val = 16;
ir3_instr_move_before(shr, instr);
}
return;
}
}
if (entry->src.reg % 2 == 0) {
/* cov.u32u16 dst, src */
struct ir3_instruction *cov =
ir3_instr_create(instr->block, OPC_MOV, 1, 1);
ir3_dst_create(cov, dst_num, entry->flags);
ir3_src_create(cov, src_num, entry->flags & ~IR3_REG_HALF);
cov->cat1.dst_type = TYPE_U16;
cov->cat1.src_type = TYPE_U32;
ir3_instr_move_before(cov, instr);
} else {
/* shr.b dst, src, h(16) */
struct ir3_instruction *shr =
ir3_instr_create(instr->block, OPC_SHR_B, 1, 2);
ir3_dst_create(shr, dst_num, entry->flags);
ir3_src_create(shr, src_num, entry->flags & ~IR3_REG_HALF);
ir3_src_create(shr, 0, entry->flags | IR3_REG_IMMED)->uim_val = 16;
ir3_instr_move_before(shr, instr);
}
return;
}
}
unsigned src_num = ra_physreg_to_num(entry->src.reg, entry->flags);
unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);
unsigned src_num = ra_physreg_to_num(entry->src.reg, entry->flags);
unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);
/* Similar to the swap case, we have to use a macro for shared regs. */
unsigned opc = (entry->flags & IR3_REG_SHARED) ? OPC_READ_FIRST_MACRO : OPC_MOV;
struct ir3_instruction *mov = ir3_instr_create(instr->block, opc, 1, 1);
ir3_dst_create(mov, dst_num, entry->flags);
ir3_src_create(mov, src_num, entry->flags | entry->src.flags);
mov->cat1.dst_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
mov->cat1.src_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
if (entry->src.flags & IR3_REG_IMMED)
mov->srcs[0]->uim_val = entry->src.imm;
else if (entry->src.flags & IR3_REG_CONST)
mov->srcs[0]->num = entry->src.const_num;
ir3_instr_move_before(mov, instr);
/* Similar to the swap case, we have to use a macro for shared regs. */
unsigned opc =
(entry->flags & IR3_REG_SHARED) ? OPC_READ_FIRST_MACRO : OPC_MOV;
struct ir3_instruction *mov = ir3_instr_create(instr->block, opc, 1, 1);
ir3_dst_create(mov, dst_num, entry->flags);
ir3_src_create(mov, src_num, entry->flags | entry->src.flags);
mov->cat1.dst_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
mov->cat1.src_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
if (entry->src.flags & IR3_REG_IMMED)
mov->srcs[0]->uim_val = entry->src.imm;
else if (entry->src.flags & IR3_REG_CONST)
mov->srcs[0]->num = entry->src.const_num;
ir3_instr_move_before(mov, instr);
}
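What the cov.u32u16 / shr.b pair above computes, on plain integers: an even half register is the low 16 bits of the containing full register and an odd one is the high 16 bits (standalone sketch, not the hardware encoding):

#include <assert.h>
#include <stdint.h>

int
main(void)
{
   uint32_t full = 0xbeefcafe;
   uint16_t even_half = (uint16_t)full;           /* cov.u32u16 dst, src */
   uint16_t odd_half = (uint16_t)(full >> 16);    /* shr.b dst, src, 16  */
   assert(even_half == 0xcafe && odd_half == 0xbeef);
   return 0;
}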
struct copy_ctx {
/* For each physreg, the number of pending copy entries that use it as a
* source. Once this drops to zero, then the physreg is unblocked and can
* be moved to.
*/
unsigned physreg_use_count[RA_MAX_FILE_SIZE];
/* For each physreg, the number of pending copy entries that use it as a
* source. Once this drops to zero, then the physreg is unblocked and can
* be moved to.
*/
unsigned physreg_use_count[RA_MAX_FILE_SIZE];
/* For each physreg, the pending copy_entry that uses it as a dest. */
struct copy_entry *physreg_dst[RA_MAX_FILE_SIZE];
/* For each physreg, the pending copy_entry that uses it as a dest. */
struct copy_entry *physreg_dst[RA_MAX_FILE_SIZE];
struct copy_entry entries[RA_MAX_FILE_SIZE];
unsigned entry_count;
struct copy_entry entries[RA_MAX_FILE_SIZE];
unsigned entry_count;
};
static bool
entry_blocked(struct copy_entry *entry, struct copy_ctx *ctx)
{
for (unsigned i = 0; i < copy_entry_size(entry); i++) {
if (ctx->physreg_use_count[entry->dst + i] != 0)
return true;
}
for (unsigned i = 0; i < copy_entry_size(entry); i++) {
if (ctx->physreg_use_count[entry->dst + i] != 0)
return true;
}
return false;
return false;
}
static void
split_32bit_copy(struct copy_ctx *ctx, struct copy_entry *entry)
{
assert(!entry->done);
assert(!(entry->flags & (IR3_REG_IMMED | IR3_REG_CONST)));
assert(copy_entry_size(entry) == 2);
struct copy_entry *new_entry = &ctx->entries[ctx->entry_count++];
assert(!entry->done);
assert(!(entry->flags & (IR3_REG_IMMED | IR3_REG_CONST)));
assert(copy_entry_size(entry) == 2);
struct copy_entry *new_entry = &ctx->entries[ctx->entry_count++];
new_entry->dst = entry->dst + 1;
new_entry->src.flags = entry->src.flags;
new_entry->src.reg = entry->src.reg + 1;
new_entry->done = false;
entry->flags |= IR3_REG_HALF;
new_entry->flags = entry->flags;
ctx->physreg_dst[entry->dst + 1] = new_entry;
new_entry->dst = entry->dst + 1;
new_entry->src.flags = entry->src.flags;
new_entry->src.reg = entry->src.reg + 1;
new_entry->done = false;
entry->flags |= IR3_REG_HALF;
new_entry->flags = entry->flags;
ctx->physreg_dst[entry->dst + 1] = new_entry;
}
static void
_handle_copies(struct ir3_compiler *compiler, struct ir3_instruction *instr,
struct copy_ctx *ctx)
struct copy_ctx *ctx)
{
/* Set up the bookkeeping */
memset(ctx->physreg_dst, 0, sizeof(ctx->physreg_dst));
memset(ctx->physreg_use_count, 0, sizeof(ctx->physreg_use_count));
/* Set up the bookkeeping */
memset(ctx->physreg_dst, 0, sizeof(ctx->physreg_dst));
memset(ctx->physreg_use_count, 0, sizeof(ctx->physreg_use_count));
for (unsigned i = 0; i < ctx->entry_count; i++) {
struct copy_entry *entry = &ctx->entries[i];
for (unsigned j = 0; j < copy_entry_size(entry); j++) {
if (!entry->src.flags)
ctx->physreg_use_count[entry->src.reg + j]++;
for (unsigned i = 0; i < ctx->entry_count; i++) {
struct copy_entry *entry = &ctx->entries[i];
for (unsigned j = 0; j < copy_entry_size(entry); j++) {
if (!entry->src.flags)
ctx->physreg_use_count[entry->src.reg + j]++;
/* Copies should not have overlapping destinations. */
assert(!ctx->physreg_dst[entry->dst + j]);
ctx->physreg_dst[entry->dst + j] = entry;
}
}
/* Copies should not have overlapping destinations. */
assert(!ctx->physreg_dst[entry->dst + j]);
ctx->physreg_dst[entry->dst + j] = entry;
}
}
bool progress = true;
while (progress) {
progress = false;
bool progress = true;
while (progress) {
progress = false;
/* Step 1: resolve paths in the transfer graph. This means finding
* copies whose destinations aren't blocked by something else and then
* emitting them, continuing this process until every copy is blocked
* and there are only cycles left.
*
* TODO: We should note that src is also available in dst to unblock
* cycles that src is involved in.
*/
/* Step 1: resolve paths in the transfer graph. This means finding
* copies whose destinations aren't blocked by something else and then
* emitting them, continuing this process until every copy is blocked
* and there are only cycles left.
*
* TODO: We should note that src is also available in dst to unblock
* cycles that src is involved in.
*/
for (unsigned i = 0; i < ctx->entry_count; i++) {
struct copy_entry *entry = &ctx->entries[i];
if (!entry->done && !entry_blocked(entry, ctx)) {
entry->done = true;
progress = true;
do_copy(compiler, instr, entry);
for (unsigned j = 0; j < copy_entry_size(entry); j++) {
if (!entry->src.flags)
ctx->physreg_use_count[entry->src.reg + j]--;
ctx->physreg_dst[entry->dst + j] = NULL;
}
}
}
for (unsigned i = 0; i < ctx->entry_count; i++) {
struct copy_entry *entry = &ctx->entries[i];
if (!entry->done && !entry_blocked(entry, ctx)) {
entry->done = true;
progress = true;
do_copy(compiler, instr, entry);
for (unsigned j = 0; j < copy_entry_size(entry); j++) {
if (!entry->src.flags)
ctx->physreg_use_count[entry->src.reg + j]--;
ctx->physreg_dst[entry->dst + j] = NULL;
}
}
}
if (progress)
continue;
if (progress)
continue;
/* Step 2: Find partially blocked copies and split them. In the
* mergedregs case, we can split 32-bit copies that are only blocked on one
* 16-bit half, and splitting them helps get things moving.
*
* We can skip splitting copies if the source isn't a register,
* however, because it does not unblock anything and therefore doesn't
* contribute to making forward progress with step 1. These copies
* should still be resolved eventually in step 1 because they can't be
* part of a cycle.
*/
for (unsigned i = 0; i < ctx->entry_count; i++) {
struct copy_entry *entry = &ctx->entries[i];
if (entry->done || entry->flags & IR3_REG_HALF)
continue;
/* Step 2: Find partially blocked copies and split them. In the
* mergedregs case, we can split 32-bit copies that are only blocked on one
* 16-bit half, and splitting them helps get things moving.
*
* We can skip splitting copies if the source isn't a register,
* however, because it does not unblock anything and therefore doesn't
* contribute to making forward progress with step 1. These copies
* should still be resolved eventually in step 1 because they can't be
* part of a cycle.
*/
for (unsigned i = 0; i < ctx->entry_count; i++) {
struct copy_entry *entry = &ctx->entries[i];
if (entry->done || entry->flags & IR3_REG_HALF)
continue;
if (((ctx->physreg_use_count[entry->dst] == 0 ||
ctx->physreg_use_count[entry->dst + 1] == 0)) &&
!(entry->flags & (IR3_REG_IMMED | IR3_REG_CONST))) {
split_32bit_copy(ctx, entry);
progress = true;
}
}
}
if (((ctx->physreg_use_count[entry->dst] == 0 ||
ctx->physreg_use_count[entry->dst + 1] == 0)) &&
!(entry->flags & (IR3_REG_IMMED | IR3_REG_CONST))) {
split_32bit_copy(ctx, entry);
progress = true;
}
}
}
/* Step 3: resolve cycles through swapping.
*
* At this point, the transfer graph should consist of only cycles.
* The reason is that, given any physreg n_1 that's the source of a
* remaining entry, it has a destination n_2, which (because every
* copy is blocked) is the source of some other copy whose destination
* is n_3, and so we can follow the chain until we get a cycle. If we
* reached some other node than n_1:
*
* n_1 -> n_2 -> ... -> n_i
* ^ |
* |-------------|
*
* then n_2 would be the destination of 2 copies, which is illegal
* (checked above in an assert). So n_1 must be part of a cycle:
*
* n_1 -> n_2 -> ... -> n_i
* ^ |
* |---------------------|
*
* and this must be the only cycle n_1 is involved in, because any other
* path starting from n_1 would also have to end in n_1, resulting in
* a node somewhere along the way being the destination of 2 copies
* when the 2 paths merge.
*
* The way we resolve the cycle is through picking a copy (n_1, n_2)
* and swapping n_1 and n_2. This moves n_1 to n_2, so n_2 is taken
* out of the cycle:
*
* n_1 -> ... -> n_i
* ^ |
* |--------------|
*
* and we can keep repeating this until the cycle is empty.
*/
/* Step 3: resolve cycles through swapping.
*
* At this point, the transfer graph should consist of only cycles.
* The reason is that, given any physreg n_1 that's the source of a
* remaining entry, it has a destination n_2, which (because every
* copy is blocked) is the source of some other copy whose destination
* is n_3, and so we can follow the chain until we get a cycle. If we
* reached some other node than n_1:
*
* n_1 -> n_2 -> ... -> n_i
* ^ |
* |-------------|
*
* then n_2 would be the destination of 2 copies, which is illegal
* (checked above in an assert). So n_1 must be part of a cycle:
*
* n_1 -> n_2 -> ... -> n_i
* ^ |
* |---------------------|
*
* and this must be the only cycle n_1 is involved in, because any other
* path starting from n_1 would also have to end in n_1, resulting in
* a node somewhere along the way being the destination of 2 copies
* when the 2 paths merge.
*
* The way we resolve the cycle is through picking a copy (n_1, n_2)
* and swapping n_1 and n_2. This moves n_1 to n_2, so n_2 is taken
* out of the cycle:
*
* n_1 -> ... -> n_i
* ^ |
* |--------------|
*
* and we can keep repeating this until the cycle is empty.
*/
for (unsigned i = 0; i < ctx->entry_count; i++) {
struct copy_entry *entry = &ctx->entries[i];
if (entry->done)
continue;
for (unsigned i = 0; i < ctx->entry_count; i++) {
struct copy_entry *entry = &ctx->entries[i];
if (entry->done)
continue;
assert(!entry->src.flags);
assert(!entry->src.flags);
/* catch trivial copies */
if (entry->dst == entry->src.reg) {
entry->done = true;
continue;
}
/* catch trivial copies */
if (entry->dst == entry->src.reg) {
entry->done = true;
continue;
}
do_swap(compiler, instr, entry);
do_swap(compiler, instr, entry);
/* Split any blocking copies whose sources are only partially
* contained within our destination.
*/
if (entry->flags & IR3_REG_HALF) {
for (unsigned j = 0; j < ctx->entry_count; j++) {
struct copy_entry *blocking = &ctx->entries[j];
/* Split any blocking copies whose sources are only partially
* contained within our destination.
*/
if (entry->flags & IR3_REG_HALF) {
for (unsigned j = 0; j < ctx->entry_count; j++) {
struct copy_entry *blocking = &ctx->entries[j];
if (blocking->done)
continue;
if (blocking->done)
continue;
if (blocking->src.reg <= entry->dst &&
blocking->src.reg + 1 >= entry->dst &&
!(blocking->flags & IR3_REG_HALF)) {
split_32bit_copy(ctx, blocking);
}
}
}
if (blocking->src.reg <= entry->dst &&
blocking->src.reg + 1 >= entry->dst &&
!(blocking->flags & IR3_REG_HALF)) {
split_32bit_copy(ctx, blocking);
}
}
}
/* Update sources of blocking copies.
*
* Note: at this point, every blocking copy's source should be
* contained within our destination.
*/
for (unsigned j = 0; j < ctx->entry_count; j++) {
struct copy_entry *blocking = &ctx->entries[j];
if (blocking->src.reg >= entry->dst &&
blocking->src.reg < entry->dst + copy_entry_size(entry)) {
blocking->src.reg = entry->src.reg + (blocking->src.reg - entry->dst);
}
}
}
/* Update sources of blocking copies.
*
* Note: at this point, every blocking copy's source should be
* contained within our destination.
*/
for (unsigned j = 0; j < ctx->entry_count; j++) {
struct copy_entry *blocking = &ctx->entries[j];
if (blocking->src.reg >= entry->dst &&
blocking->src.reg < entry->dst + copy_entry_size(entry)) {
blocking->src.reg =
entry->src.reg + (blocking->src.reg - entry->dst);
}
}
}
}
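A standalone model of step 3 for the pure-permutation case: each swap satisfies one destination, and redirecting the remaining sources shrinks the cycle by one until only trivial copies are left. Toy registers hold plain ints; immediates, constants and half/full splitting are ignored.

#include <assert.h>
#include <stdio.h>

#define NREGS 4

static void
swap(int *regs, int a, int b)
{
   int t = regs[a];
   regs[a] = regs[b];
   regs[b] = t;
}

int
main(void)
{
   /* dst i must end up with the value currently in regs[src[i]] */
   int src[NREGS] = {2, 0, 1, 3};   /* r0<-r2, r1<-r0, r2<-r1, r3<-r3 */
   int regs[NREGS] = {100, 101, 102, 103};
   int expect[NREGS];
   for (int i = 0; i < NREGS; i++)
      expect[i] = regs[src[i]];

   for (int i = 0; i < NREGS; i++) {
      /* trivial copy, nothing to do */
      if (src[i] == i)
         continue;
      /* swap the wanted value into place ... */
      swap(regs, i, src[i]);
      /* ... and redirect any pending copy that read from r_i to where
       * that value now lives (mirrors the "update sources" loop above).
       */
      for (int j = i + 1; j < NREGS; j++) {
         if (src[j] == i)
            src[j] = src[i];
      }
   }

   for (int i = 0; i < NREGS; i++)
      assert(regs[i] == expect[i]);
   printf("r0..r3 = %d %d %d %d\n", regs[0], regs[1], regs[2], regs[3]);
   return 0;
}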
static void
handle_copies(struct ir3_shader_variant *v, struct ir3_instruction *instr,
struct copy_entry *entries, unsigned entry_count)
struct copy_entry *entries, unsigned entry_count)
{
struct copy_ctx ctx;
struct copy_ctx ctx;
/* handle shared copies first */
ctx.entry_count = 0;
for (unsigned i = 0; i < entry_count; i++) {
if (entries[i].flags & IR3_REG_SHARED)
ctx.entries[ctx.entry_count++] = entries[i];
}
_handle_copies(v->shader->compiler, instr, &ctx);
/* handle shared copies first */
ctx.entry_count = 0;
for (unsigned i = 0; i < entry_count; i++) {
if (entries[i].flags & IR3_REG_SHARED)
ctx.entries[ctx.entry_count++] = entries[i];
}
_handle_copies(v->shader->compiler, instr, &ctx);
if (v->mergedregs) {
/* Half regs and full regs are in the same file, so handle everything
* at once.
*/
ctx.entry_count = 0;
for (unsigned i = 0; i < entry_count; i++) {
if (!(entries[i].flags & IR3_REG_SHARED))
ctx.entries[ctx.entry_count++] = entries[i];
}
_handle_copies(v->shader->compiler, instr, &ctx);
} else {
/* There may be both half copies and full copies, so we have to split
* them up since they don't interfere.
*/
ctx.entry_count = 0;
for (unsigned i = 0; i < entry_count; i++) {
if (entries[i].flags & IR3_REG_HALF)
ctx.entries[ctx.entry_count++] = entries[i];
}
_handle_copies(v->shader->compiler, instr, &ctx);
if (v->mergedregs) {
/* Half regs and full regs are in the same file, so handle everything
* at once.
*/
ctx.entry_count = 0;
for (unsigned i = 0; i < entry_count; i++) {
if (!(entries[i].flags & IR3_REG_SHARED))
ctx.entries[ctx.entry_count++] = entries[i];
}
_handle_copies(v->shader->compiler, instr, &ctx);
} else {
/* There may be both half copies and full copies, so we have to split
* them up since they don't interfere.
*/
ctx.entry_count = 0;
for (unsigned i = 0; i < entry_count; i++) {
if (entries[i].flags & IR3_REG_HALF)
ctx.entries[ctx.entry_count++] = entries[i];
}
_handle_copies(v->shader->compiler, instr, &ctx);
ctx.entry_count = 0;
for (unsigned i = 0; i < entry_count; i++) {
if (!(entries[i].flags & (IR3_REG_HALF | IR3_REG_SHARED)))
ctx.entries[ctx.entry_count++] = entries[i];
}
_handle_copies(v->shader->compiler, instr, &ctx);
}
ctx.entry_count = 0;
for (unsigned i = 0; i < entry_count; i++) {
if (!(entries[i].flags & (IR3_REG_HALF | IR3_REG_SHARED)))
ctx.entries[ctx.entry_count++] = entries[i];
}
_handle_copies(v->shader->compiler, instr, &ctx);
}
}
void
ir3_lower_copies(struct ir3_shader_variant *v)
{
DECLARE_ARRAY(struct copy_entry, copies);
copies_count = copies_sz = 0;
copies = NULL;
DECLARE_ARRAY(struct copy_entry, copies);
copies_count = copies_sz = 0;
copies = NULL;
foreach_block (block, &v->ir->block_list) {
foreach_instr_safe (instr, &block->instr_list) {
if (instr->opc == OPC_META_PARALLEL_COPY) {
copies_count = 0;
for (unsigned i = 0; i < instr->dsts_count; i++) {
struct ir3_register *dst = instr->dsts[i];
struct ir3_register *src = instr->srcs[i];
unsigned flags = src->flags & (IR3_REG_HALF | IR3_REG_SHARED);
unsigned dst_physreg = ra_reg_get_physreg(dst);
for (unsigned j = 0; j < reg_elems(dst); j++) {
array_insert(NULL, copies, (struct copy_entry) {
.dst = dst_physreg + j * reg_elem_size(dst),
.src = get_copy_src(src, j * reg_elem_size(dst)),
.flags = flags,
});
}
}
handle_copies(v, instr, copies, copies_count);
list_del(&instr->node);
} else if (instr->opc == OPC_META_COLLECT) {
copies_count = 0;
struct ir3_register *dst = instr->dsts[0];
unsigned flags = dst->flags & (IR3_REG_HALF | IR3_REG_SHARED);
for (unsigned i = 0; i < instr->srcs_count; i++) {
struct ir3_register *src = instr->srcs[i];
array_insert(NULL, copies, (struct copy_entry) {
.dst = ra_num_to_physreg(dst->num + i, flags),
.src = get_copy_src(src, 0),
.flags = flags,
});
}
handle_copies(v, instr, copies, copies_count);
list_del(&instr->node);
} else if (instr->opc == OPC_META_SPLIT) {
copies_count = 0;
struct ir3_register *dst = instr->dsts[0];
struct ir3_register *src = instr->srcs[0];
unsigned flags = src->flags & (IR3_REG_HALF | IR3_REG_SHARED);
array_insert(NULL, copies, (struct copy_entry) {
.dst = ra_reg_get_physreg(dst),
.src = get_copy_src(src, instr->split.off * reg_elem_size(dst)),
.flags = flags,
});
handle_copies(v, instr, copies, copies_count);
list_del(&instr->node);
} else if (instr->opc == OPC_META_PHI) {
list_del(&instr->node);
}
}
}
foreach_block (block, &v->ir->block_list) {
foreach_instr_safe (instr, &block->instr_list) {
if (instr->opc == OPC_META_PARALLEL_COPY) {
copies_count = 0;
for (unsigned i = 0; i < instr->dsts_count; i++) {
struct ir3_register *dst = instr->dsts[i];
struct ir3_register *src = instr->srcs[i];
unsigned flags = src->flags & (IR3_REG_HALF | IR3_REG_SHARED);
unsigned dst_physreg = ra_reg_get_physreg(dst);
for (unsigned j = 0; j < reg_elems(dst); j++) {
array_insert(
NULL, copies,
(struct copy_entry){
.dst = dst_physreg + j * reg_elem_size(dst),
.src = get_copy_src(src, j * reg_elem_size(dst)),
.flags = flags,
});
}
}
handle_copies(v, instr, copies, copies_count);
list_del(&instr->node);
} else if (instr->opc == OPC_META_COLLECT) {
copies_count = 0;
struct ir3_register *dst = instr->dsts[0];
unsigned flags = dst->flags & (IR3_REG_HALF | IR3_REG_SHARED);
for (unsigned i = 0; i < instr->srcs_count; i++) {
struct ir3_register *src = instr->srcs[i];
array_insert(NULL, copies,
(struct copy_entry){
.dst = ra_num_to_physreg(dst->num + i, flags),
.src = get_copy_src(src, 0),
.flags = flags,
});
}
handle_copies(v, instr, copies, copies_count);
list_del(&instr->node);
} else if (instr->opc == OPC_META_SPLIT) {
copies_count = 0;
struct ir3_register *dst = instr->dsts[0];
struct ir3_register *src = instr->srcs[0];
unsigned flags = src->flags & (IR3_REG_HALF | IR3_REG_SHARED);
array_insert(NULL, copies,
(struct copy_entry){
.dst = ra_reg_get_physreg(dst),
.src = get_copy_src(
src, instr->split.off * reg_elem_size(dst)),
.flags = flags,
});
handle_copies(v, instr, copies, copies_count);
list_del(&instr->node);
} else if (instr->opc == OPC_META_PHI) {
list_del(&instr->node);
}
}
}
if (copies)
ralloc_free(copies);
if (copies)
ralloc_free(copies);
}

@ -35,220 +35,224 @@
static void
replace_pred(struct ir3_block *block, struct ir3_block *old_pred,
struct ir3_block *new_pred)
struct ir3_block *new_pred)
{
for (unsigned i = 0; i < block->predecessors_count; i++) {
if (block->predecessors[i] == old_pred) {
block->predecessors[i] = new_pred;
return;
}
}
for (unsigned i = 0; i < block->predecessors_count; i++) {
if (block->predecessors[i] == old_pred) {
block->predecessors[i] = new_pred;
return;
}
}
}
static void
replace_physical_pred(struct ir3_block *block, struct ir3_block *old_pred,
struct ir3_block *new_pred)
struct ir3_block *new_pred)
{
for (unsigned i = 0; i < block->physical_predecessors_count; i++) {
if (block->physical_predecessors[i] == old_pred) {
block->physical_predecessors[i] = new_pred;
return;
}
}
for (unsigned i = 0; i < block->physical_predecessors_count; i++) {
if (block->physical_predecessors[i] == old_pred) {
block->physical_predecessors[i] = new_pred;
return;
}
}
}
static void
mov_immed(struct ir3_register *dst, struct ir3_block *block, unsigned immed)
{
struct ir3_instruction *mov = ir3_instr_create(block, OPC_MOV, 1, 1);
struct ir3_register *mov_dst = ir3_dst_create(mov, dst->num, dst->flags);
mov_dst->wrmask = dst->wrmask;
struct ir3_register *src =
ir3_src_create(mov, INVALID_REG, (dst->flags & IR3_REG_HALF) | IR3_REG_IMMED);
src->uim_val = immed;
mov->cat1.dst_type = (dst->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
mov->cat1.src_type = mov->cat1.dst_type;
mov->repeat = util_last_bit(mov_dst->wrmask) - 1;
struct ir3_instruction *mov = ir3_instr_create(block, OPC_MOV, 1, 1);
struct ir3_register *mov_dst = ir3_dst_create(mov, dst->num, dst->flags);
mov_dst->wrmask = dst->wrmask;
struct ir3_register *src = ir3_src_create(
mov, INVALID_REG, (dst->flags & IR3_REG_HALF) | IR3_REG_IMMED);
src->uim_val = immed;
mov->cat1.dst_type = (dst->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
mov->cat1.src_type = mov->cat1.dst_type;
mov->repeat = util_last_bit(mov_dst->wrmask) - 1;
}
static struct ir3_block *
split_block(struct ir3 *ir, struct ir3_block *before_block,
struct ir3_instruction *instr, struct ir3_block **then)
struct ir3_instruction *instr, struct ir3_block **then)
{
struct ir3_block *then_block = ir3_block_create(ir);
struct ir3_block *after_block = ir3_block_create(ir);
list_add(&then_block->node, &before_block->node);
list_add(&after_block->node, &then_block->node);
struct ir3_block *then_block = ir3_block_create(ir);
struct ir3_block *after_block = ir3_block_create(ir);
list_add(&then_block->node, &before_block->node);
list_add(&after_block->node, &then_block->node);
for (unsigned i = 0; i < ARRAY_SIZE(before_block->successors); i++) {
after_block->successors[i] = before_block->successors[i];
if (after_block->successors[i])
replace_pred(after_block->successors[i], before_block, after_block);
}
for (unsigned i = 0; i < ARRAY_SIZE(before_block->successors); i++) {
after_block->successors[i] = before_block->successors[i];
if (after_block->successors[i])
replace_pred(after_block->successors[i], before_block, after_block);
}
for (unsigned i = 0; i < ARRAY_SIZE(before_block->physical_successors); i++) {
after_block->physical_successors[i] = before_block->physical_successors[i];
if (after_block->physical_successors[i]) {
replace_physical_pred(after_block->physical_successors[i],
before_block, after_block);
}
}
for (unsigned i = 0; i < ARRAY_SIZE(before_block->physical_successors);
i++) {
after_block->physical_successors[i] =
before_block->physical_successors[i];
if (after_block->physical_successors[i]) {
replace_physical_pred(after_block->physical_successors[i],
before_block, after_block);
}
}
before_block->successors[0] = then_block;
before_block->successors[1] = after_block;
before_block->physical_successors[0] = then_block;
before_block->physical_successors[1] = after_block;
ir3_block_add_predecessor(then_block, before_block);
ir3_block_add_predecessor(after_block, before_block);
ir3_block_add_physical_predecessor(then_block, before_block);
ir3_block_add_physical_predecessor(after_block, before_block);
before_block->successors[0] = then_block;
before_block->successors[1] = after_block;
before_block->physical_successors[0] = then_block;
before_block->physical_successors[1] = after_block;
ir3_block_add_predecessor(then_block, before_block);
ir3_block_add_predecessor(after_block, before_block);
ir3_block_add_physical_predecessor(then_block, before_block);
ir3_block_add_physical_predecessor(after_block, before_block);
then_block->successors[0] = after_block;
then_block->physical_successors[0] = after_block;
ir3_block_add_predecessor(after_block, then_block);
ir3_block_add_physical_predecessor(after_block, then_block);
then_block->successors[0] = after_block;
then_block->physical_successors[0] = after_block;
ir3_block_add_predecessor(after_block, then_block);
ir3_block_add_physical_predecessor(after_block, then_block);
foreach_instr_from_safe (rem_instr, &instr->node, &before_block->instr_list) {
list_del(&rem_instr->node);
list_addtail(&rem_instr->node, &after_block->instr_list);
rem_instr->block = after_block;
}
foreach_instr_from_safe (rem_instr, &instr->node,
&before_block->instr_list) {
list_del(&rem_instr->node);
list_addtail(&rem_instr->node, &after_block->instr_list);
rem_instr->block = after_block;
}
after_block->brtype = before_block->brtype;
after_block->condition = before_block->condition;
after_block->brtype = before_block->brtype;
after_block->condition = before_block->condition;
*then = then_block;
return after_block;
*then = then_block;
return after_block;
}
static bool
lower_block(struct ir3 *ir, struct ir3_block **block)
{
bool progress = false;
bool progress = false;
foreach_instr_safe (instr, &(*block)->instr_list) {
switch (instr->opc) {
case OPC_BALLOT_MACRO:
case OPC_ANY_MACRO:
case OPC_ALL_MACRO:
case OPC_ELECT_MACRO:
case OPC_READ_COND_MACRO:
case OPC_READ_FIRST_MACRO:
case OPC_SWZ_SHARED_MACRO:
break;
default:
continue;
}
foreach_instr_safe (instr, &(*block)->instr_list) {
switch (instr->opc) {
case OPC_BALLOT_MACRO:
case OPC_ANY_MACRO:
case OPC_ALL_MACRO:
case OPC_ELECT_MACRO:
case OPC_READ_COND_MACRO:
case OPC_READ_FIRST_MACRO:
case OPC_SWZ_SHARED_MACRO:
break;
default:
continue;
}
struct ir3_block *before_block = *block;
struct ir3_block *then_block;
struct ir3_block *after_block =
split_block(ir, before_block, instr, &then_block);
struct ir3_block *before_block = *block;
struct ir3_block *then_block;
struct ir3_block *after_block =
split_block(ir, before_block, instr, &then_block);
/* For ballot, the destination must be initialized to 0 before we do
* the movmsk because the condition may be 0 and then the movmsk will
* be skipped. Because it's a shared register we have to wrap the
* initialization in a getone block.
*/
if (instr->opc == OPC_BALLOT_MACRO) {
before_block->brtype = IR3_BRANCH_GETONE;
before_block->condition = NULL;
mov_immed(instr->dsts[0], then_block, 0);
before_block = after_block;
after_block = split_block(ir, before_block, instr, &then_block);
}
/* For ballot, the destination must be initialized to 0 before we do
* the movmsk because the condition may be 0 and then the movmsk will
* be skipped. Because it's a shared register we have to wrap the
* initialization in a getone block.
*/
if (instr->opc == OPC_BALLOT_MACRO) {
before_block->brtype = IR3_BRANCH_GETONE;
before_block->condition = NULL;
mov_immed(instr->dsts[0], then_block, 0);
before_block = after_block;
after_block = split_block(ir, before_block, instr, &then_block);
}
switch (instr->opc) {
case OPC_BALLOT_MACRO:
case OPC_READ_COND_MACRO:
case OPC_ANY_MACRO:
case OPC_ALL_MACRO:
before_block->condition = instr->srcs[0]->def->instr;
break;
default:
before_block->condition = NULL;
break;
}
switch (instr->opc) {
case OPC_BALLOT_MACRO:
case OPC_READ_COND_MACRO:
case OPC_ANY_MACRO:
case OPC_ALL_MACRO:
before_block->condition = instr->srcs[0]->def->instr;
break;
default:
before_block->condition = NULL;
break;
}
switch (instr->opc) {
case OPC_BALLOT_MACRO:
case OPC_READ_COND_MACRO:
before_block->brtype = IR3_BRANCH_COND;
break;
case OPC_ANY_MACRO:
before_block->brtype = IR3_BRANCH_ANY;
break;
case OPC_ALL_MACRO:
before_block->brtype = IR3_BRANCH_ALL;
break;
case OPC_ELECT_MACRO:
case OPC_READ_FIRST_MACRO:
case OPC_SWZ_SHARED_MACRO:
before_block->brtype = IR3_BRANCH_GETONE;
break;
default:
unreachable("bad opcode");
}
switch (instr->opc) {
case OPC_BALLOT_MACRO:
case OPC_READ_COND_MACRO:
before_block->brtype = IR3_BRANCH_COND;
break;
case OPC_ANY_MACRO:
before_block->brtype = IR3_BRANCH_ANY;
break;
case OPC_ALL_MACRO:
before_block->brtype = IR3_BRANCH_ALL;
break;
case OPC_ELECT_MACRO:
case OPC_READ_FIRST_MACRO:
case OPC_SWZ_SHARED_MACRO:
before_block->brtype = IR3_BRANCH_GETONE;
break;
default:
unreachable("bad opcode");
}
switch (instr->opc) {
case OPC_ALL_MACRO:
case OPC_ANY_MACRO:
case OPC_ELECT_MACRO:
mov_immed(instr->dsts[0], then_block, 1);
mov_immed(instr->dsts[0], before_block, 0);
break;
switch (instr->opc) {
case OPC_ALL_MACRO:
case OPC_ANY_MACRO:
case OPC_ELECT_MACRO:
mov_immed(instr->dsts[0], then_block, 1);
mov_immed(instr->dsts[0], before_block, 0);
break;
case OPC_BALLOT_MACRO: {
unsigned comp_count = util_last_bit(instr->dsts[0]->wrmask);
struct ir3_instruction *movmsk = ir3_instr_create(then_block, OPC_MOVMSK, 1, 0);
ir3_dst_create(movmsk, instr->dsts[0]->num, instr->dsts[0]->flags);
movmsk->repeat = comp_count - 1;
break;
}
case OPC_BALLOT_MACRO: {
unsigned comp_count = util_last_bit(instr->dsts[0]->wrmask);
struct ir3_instruction *movmsk =
ir3_instr_create(then_block, OPC_MOVMSK, 1, 0);
ir3_dst_create(movmsk, instr->dsts[0]->num, instr->dsts[0]->flags);
movmsk->repeat = comp_count - 1;
break;
}
case OPC_READ_COND_MACRO:
case OPC_READ_FIRST_MACRO: {
struct ir3_instruction *mov = ir3_instr_create(then_block, OPC_MOV, 1, 1);
unsigned src = instr->opc == OPC_READ_COND_MACRO ? 1 : 0;
ir3_dst_create(mov, instr->dsts[0]->num, instr->dsts[0]->flags);
struct ir3_register *new_src = ir3_src_create(mov, 0, 0);
*new_src = *instr->srcs[src];
mov->cat1.dst_type = mov->cat1.src_type = TYPE_U32;
break;
}
case OPC_READ_COND_MACRO:
case OPC_READ_FIRST_MACRO: {
struct ir3_instruction *mov =
ir3_instr_create(then_block, OPC_MOV, 1, 1);
unsigned src = instr->opc == OPC_READ_COND_MACRO ? 1 : 0;
ir3_dst_create(mov, instr->dsts[0]->num, instr->dsts[0]->flags);
struct ir3_register *new_src = ir3_src_create(mov, 0, 0);
*new_src = *instr->srcs[src];
mov->cat1.dst_type = mov->cat1.src_type = TYPE_U32;
break;
}
case OPC_SWZ_SHARED_MACRO: {
struct ir3_instruction *swz =
ir3_instr_create(then_block, OPC_SWZ, 2, 2);
ir3_dst_create(swz, instr->dsts[0]->num, instr->dsts[0]->flags);
ir3_dst_create(swz, instr->dsts[1]->num, instr->dsts[1]->flags);
ir3_src_create(swz, instr->srcs[0]->num, instr->srcs[0]->flags);
ir3_src_create(swz, instr->srcs[1]->num, instr->srcs[1]->flags);
swz->cat1.dst_type = swz->cat1.src_type = TYPE_U32;
swz->repeat = 1;
break;
}
case OPC_SWZ_SHARED_MACRO: {
struct ir3_instruction *swz =
ir3_instr_create(then_block, OPC_SWZ, 2, 2);
ir3_dst_create(swz, instr->dsts[0]->num, instr->dsts[0]->flags);
ir3_dst_create(swz, instr->dsts[1]->num, instr->dsts[1]->flags);
ir3_src_create(swz, instr->srcs[0]->num, instr->srcs[0]->flags);
ir3_src_create(swz, instr->srcs[1]->num, instr->srcs[1]->flags);
swz->cat1.dst_type = swz->cat1.src_type = TYPE_U32;
swz->repeat = 1;
break;
}
default:
unreachable("bad opcode");
}
default:
unreachable("bad opcode");
}
*block = after_block;
list_delinit(&instr->node);
progress = true;
}
*block = after_block;
list_delinit(&instr->node);
progress = true;
}
return progress;
return progress;
}
bool
ir3_lower_subgroups(struct ir3 *ir)
{
bool progress = false;
bool progress = false;
foreach_block (block, &ir->block_list)
progress |= lower_block(ir, &block);
foreach_block (block, &ir->block_list)
progress |= lower_block(ir, &block);
return progress;
return progress;
}
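A scalar model of the control flow the elect-style macros expand into, from a single fibre's point of view: the destination is pre-initialized to 0 before the getone branch and overwritten with 1 only in the then block (illustrative sketch, not the hardware encoding):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static uint32_t
elect_model(bool elected)
{
   uint32_t dst = 0;   /* mov_immed(dst, before_block, 0), seen by all fibres */
   if (elected)        /* IR3_BRANCH_GETONE -> then_block                     */
      dst = 1;         /* mov_immed(dst, then_block, 1), elected fibre only   */
   return dst;
}

int
main(void)
{
   assert(elect_model(true) == 1);
   assert(elect_model(false) == 0);
   return 0;
}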

@ -21,8 +21,8 @@
* SOFTWARE.
*/
#include "ir3_ra.h"
#include "ir3_compiler.h"
#include "ir3_ra.h"
#include "ralloc.h"
/* This pass "merges" compatible phi-web SSA values. First, we insert a bunch
@ -71,13 +71,13 @@
static unsigned
index_instrs(struct ir3_block *block, unsigned index)
{
foreach_instr (instr, &block->instr_list)
instr->ip = index++;
foreach_instr (instr, &block->instr_list)
instr->ip = index++;
for (unsigned i = 0; i < block->dom_children_count; i++)
index = index_instrs(block->dom_children[i], index);
for (unsigned i = 0; i < block->dom_children_count; i++)
index = index_instrs(block->dom_children[i], index);
return index;
return index;
}
/* Definitions within a merge set are ordered by instr->ip as set above: */
@ -85,27 +85,27 @@ index_instrs(struct ir3_block *block, unsigned index)
static bool
def_after(struct ir3_register *a, struct ir3_register *b)
{
return a->instr->ip > b->instr->ip;
return a->instr->ip > b->instr->ip;
}
static bool
def_dominates(struct ir3_register *a, struct ir3_register *b)
{
if (def_after(a, b)) {
return false;
} else if (a->instr->block == b->instr->block) {
return def_after(b, a);
} else {
return ir3_block_dominates(a->instr->block, b->instr->block);
}
if (def_after(a, b)) {
return false;
} else if (a->instr->block == b->instr->block) {
return def_after(b, a);
} else {
return ir3_block_dominates(a->instr->block, b->instr->block);
}
}
/* This represents a region inside a register. The offset is relative to the
* start of the register, and offset + size <= size(reg).
*/
struct def_value {
struct ir3_register *reg;
unsigned offset, size;
struct ir3_register *reg;
unsigned offset, size;
};
/* Chase any copies to get the source of a region inside a register. This is
@ -114,456 +114,452 @@ struct def_value {
static struct def_value
chase_copies(struct def_value value)
{
while (true) {
struct ir3_instruction *instr = value.reg->instr;
if (instr->opc == OPC_META_SPLIT) {
value.offset += instr->split.off * reg_elem_size(value.reg);
value.reg = instr->srcs[0]->def;
} else if (instr->opc == OPC_META_COLLECT) {
if (value.offset % reg_elem_size(value.reg) != 0 ||
value.size > reg_elem_size(value.reg) ||
value.offset + value.size > reg_size(value.reg))
break;
struct ir3_register *src = instr->srcs[value.offset / reg_elem_size(value.reg)];
if (!src->def)
break;
value.offset = 0;
value.reg = src->def;
} else {
/* TODO: parallelcopy */
break;
}
}
while (true) {
struct ir3_instruction *instr = value.reg->instr;
if (instr->opc == OPC_META_SPLIT) {
value.offset += instr->split.off * reg_elem_size(value.reg);
value.reg = instr->srcs[0]->def;
} else if (instr->opc == OPC_META_COLLECT) {
if (value.offset % reg_elem_size(value.reg) != 0 ||
value.size > reg_elem_size(value.reg) ||
value.offset + value.size > reg_size(value.reg))
break;
struct ir3_register *src =
instr->srcs[value.offset / reg_elem_size(value.reg)];
if (!src->def)
break;
value.offset = 0;
value.reg = src->def;
} else {
/* TODO: parallelcopy */
break;
}
}
return value;
return value;
}
/* This represents an entry in the merge set, and consists of a register +
* offset from the merge set base.
*/
struct merge_def {
struct ir3_register *reg;
unsigned offset;
struct ir3_register *reg;
unsigned offset;
};
static bool
can_skip_interference(const struct merge_def *a, const struct merge_def *b)
{
unsigned a_start = a->offset;
unsigned b_start = b->offset;
unsigned a_end = a_start + reg_size(a->reg);
unsigned b_end = b_start + reg_size(b->reg);
unsigned a_start = a->offset;
unsigned b_start = b->offset;
unsigned a_end = a_start + reg_size(a->reg);
unsigned b_end = b_start + reg_size(b->reg);
/* Registers that don't overlap never interfere */
if (a_end <= b_start || b_end <= a_start)
return true;
/* Registers that don't overlap never interfere */
if (a_end <= b_start || b_end <= a_start)
return true;
/* Disallow skipping interference unless one definition contains the
* other. This restriction is important for register allocation, because
* it means that at any given point in the program, the live values in a
* given merge set will form a tree. If they didn't, then one live value
* would partially overlap another, and they would have overlapping live
* ranges because they're live at the same point. This simplifies register
* allocation and spilling.
*/
if (!((a_start <= b_start && a_end >= b_end) ||
(b_start <= a_start && b_end >= a_end)))
return false;
/* Disallow skipping interference unless one definition contains the
* other. This restriction is important for register allocation, because
* it means that at any given point in the program, the live values in a
* given merge set will form a tree. If they didn't, then one live value
* would partially overlap another, and they would have overlapping live
* ranges because they're live at the same point. This simplifies register
* allocation and spilling.
*/
if (!((a_start <= b_start && a_end >= b_end) ||
(b_start <= a_start && b_end >= a_end)))
return false;
/* For each register, chase the intersection of a and b to find the
* ultimate source.
*/
unsigned start = MAX2(a_start, b_start);
unsigned end = MIN2(a_end, b_end);
struct def_value a_value =
chase_copies((struct def_value) {
.reg = a->reg,
.offset = start - a_start,
.size = end - start,
});
struct def_value b_value =
chase_copies((struct def_value) {
.reg = b->reg,
.offset = start - b_start,
.size = end - start,
});
return a_value.reg == b_value.reg && a_value.offset == b_value.offset;
/* For each register, chase the intersection of a and b to find the
* ultimate source.
*/
unsigned start = MAX2(a_start, b_start);
unsigned end = MIN2(a_end, b_end);
struct def_value a_value = chase_copies((struct def_value){
.reg = a->reg,
.offset = start - a_start,
.size = end - start,
});
struct def_value b_value = chase_copies((struct def_value){
.reg = b->reg,
.offset = start - b_start,
.size = end - start,
});
return a_value.reg == b_value.reg && a_value.offset == b_value.offset;
}
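As a hedged aside (not part of this commit), the containment rule described in the comment above can be sketched as a standalone check over [start, end) byte regions; the helper name is made up for illustration:

#include <stdbool.h>

/* Two regions of a merge set may share storage only when they do not
 * overlap at all, or when one fully contains the other, so that the live
 * values in a set always nest like a tree. */
static bool
regions_nest_or_disjoint(unsigned a_start, unsigned a_end,
                         unsigned b_start, unsigned b_end)
{
   if (a_end <= b_start || b_end <= a_start)
      return true; /* disjoint: never interfere */

   return (a_start <= b_start && a_end >= b_end) ||
          (b_start <= a_start && b_end >= a_end); /* containment */
}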
static struct ir3_merge_set *
get_merge_set(struct ir3_register *def)
{
if (def->merge_set)
return def->merge_set;
if (def->merge_set)
return def->merge_set;
struct ir3_merge_set *set = ralloc(def, struct ir3_merge_set);
set->preferred_reg = ~0;
set->interval_start = ~0;
set->size = reg_size(def);
set->alignment = (def->flags & IR3_REG_HALF) ? 1 : 2;
set->regs_count = 1;
set->regs = ralloc(set, struct ir3_register *);
set->regs[0] = def;
struct ir3_merge_set *set = ralloc(def, struct ir3_merge_set);
set->preferred_reg = ~0;
set->interval_start = ~0;
set->size = reg_size(def);
set->alignment = (def->flags & IR3_REG_HALF) ? 1 : 2;
set->regs_count = 1;
set->regs = ralloc(set, struct ir3_register *);
set->regs[0] = def;
return set;
return set;
}
/* Merges b into a */
static struct ir3_merge_set *
merge_merge_sets(struct ir3_merge_set *a, struct ir3_merge_set *b,
int b_offset)
merge_merge_sets(struct ir3_merge_set *a, struct ir3_merge_set *b, int b_offset)
{
if (b_offset < 0)
return merge_merge_sets(b, a, -b_offset);
if (b_offset < 0)
return merge_merge_sets(b, a, -b_offset);
struct ir3_register **new_regs =
rzalloc_array(a, struct ir3_register *, a->regs_count + b->regs_count);
struct ir3_register **new_regs =
rzalloc_array(a, struct ir3_register *, a->regs_count + b->regs_count);
unsigned a_index = 0, b_index = 0, new_index = 0;
for (; a_index < a->regs_count || b_index < b->regs_count; new_index++) {
if (b_index < b->regs_count &&
(a_index == a->regs_count ||
def_after(a->regs[a_index], b->regs[b_index]))) {
new_regs[new_index] = b->regs[b_index++];
new_regs[new_index]->merge_set_offset += b_offset;
} else {
new_regs[new_index] = a->regs[a_index++];
}
new_regs[new_index]->merge_set = a;
}
unsigned a_index = 0, b_index = 0, new_index = 0;
for (; a_index < a->regs_count || b_index < b->regs_count; new_index++) {
if (b_index < b->regs_count &&
(a_index == a->regs_count ||
def_after(a->regs[a_index], b->regs[b_index]))) {
new_regs[new_index] = b->regs[b_index++];
new_regs[new_index]->merge_set_offset += b_offset;
} else {
new_regs[new_index] = a->regs[a_index++];
}
new_regs[new_index]->merge_set = a;
}
assert(new_index == a->regs_count + b->regs_count);
assert(new_index == a->regs_count + b->regs_count);
/* Technically this should be the lcm, but because alignment is only 1 or
* 2 so far this should be ok.
*/
a->alignment = MAX2(a->alignment, b->alignment);
a->regs_count += b->regs_count;
ralloc_free(a->regs);
a->regs = new_regs;
a->size = MAX2(a->size, b->size + b_offset);
/* Technically this should be the lcm, but because alignment is only 1 or
* 2 so far this should be ok.
*/
a->alignment = MAX2(a->alignment, b->alignment);
a->regs_count += b->regs_count;
ralloc_free(a->regs);
a->regs = new_regs;
a->size = MAX2(a->size, b->size + b_offset);
return a;
return a;
}
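The alignment note above ("technically this should be the lcm") holds because alignments are currently only 1 or 2; a small self-contained check (helper names are illustrative only, not driver code) confirms MAX2 and the lcm agree for those values:

#include <assert.h>

static unsigned
lcm_u(unsigned a, unsigned b)
{
   unsigned m = a;
   while (m % b)
      m += a;
   return m;
}

static void
check_alignment_merge(void)
{
   static const unsigned vals[] = {1, 2};
   for (unsigned i = 0; i < 2; i++) {
      for (unsigned j = 0; j < 2; j++) {
         unsigned max = vals[i] > vals[j] ? vals[i] : vals[j];
         assert(lcm_u(vals[i], vals[j]) == max);
      }
   }
}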
static bool
merge_sets_interfere(struct ir3_liveness *live,
struct ir3_merge_set *a, struct ir3_merge_set *b,
int b_offset)
merge_sets_interfere(struct ir3_liveness *live, struct ir3_merge_set *a,
struct ir3_merge_set *b, int b_offset)
{
if (b_offset < 0)
return merge_sets_interfere(live, b, a, -b_offset);
if (b_offset < 0)
return merge_sets_interfere(live, b, a, -b_offset);
struct merge_def dom[a->regs_count + b->regs_count];
unsigned a_index = 0, b_index = 0;
int dom_index = -1;
struct merge_def dom[a->regs_count + b->regs_count];
unsigned a_index = 0, b_index = 0;
int dom_index = -1;
/* Reject trying to merge the sets if the alignment doesn't work out */
if (b_offset % a->alignment != 0)
return true;
/* Reject trying to merge the sets if the alignment doesn't work out */
if (b_offset % a->alignment != 0)
return true;
while (a_index < a->regs_count || b_index < b->regs_count) {
struct merge_def current;
if (a_index == a->regs_count) {
current.reg = b->regs[b_index];
current.offset = current.reg->merge_set_offset + b_offset;
b_index++;
} else if (b_index == b->regs_count) {
current.reg = a->regs[a_index];
current.offset = current.reg->merge_set_offset;
a_index++;
} else {
if (def_after(b->regs[b_index], a->regs[a_index])) {
current.reg = a->regs[a_index];
current.offset = current.reg->merge_set_offset;
a_index++;
} else {
current.reg = b->regs[b_index];
current.offset = current.reg->merge_set_offset + b_offset;
b_index++;
}
}
while (a_index < a->regs_count || b_index < b->regs_count) {
struct merge_def current;
if (a_index == a->regs_count) {
current.reg = b->regs[b_index];
current.offset = current.reg->merge_set_offset + b_offset;
b_index++;
} else if (b_index == b->regs_count) {
current.reg = a->regs[a_index];
current.offset = current.reg->merge_set_offset;
a_index++;
} else {
if (def_after(b->regs[b_index], a->regs[a_index])) {
current.reg = a->regs[a_index];
current.offset = current.reg->merge_set_offset;
a_index++;
} else {
current.reg = b->regs[b_index];
current.offset = current.reg->merge_set_offset + b_offset;
b_index++;
}
}
while (dom_index >= 0 &&
!def_dominates(dom[dom_index].reg, current.reg)) {
dom_index--;
}
while (dom_index >= 0 &&
!def_dominates(dom[dom_index].reg, current.reg)) {
dom_index--;
}
/* TODO: in the original paper, just dom[dom_index] needs to be
* checked for interference. We implement the value-chasing extension
* as well as support for sub-registers, which complicates this
* significantly because it's no longer the case that if a dominates b
* dominates c and a and b don't interfere then we only need to check
* interference between b and c to be sure a and c don't interfere --
* this means we may have to check for interference against values
* higher in the stack than dom[dom_index]. In the paper there's a
* description of a way to do less interference tests with the
* value-chasing extension, but we'd have to come up with something
* ourselves for handling the similar problems that come up with
* allowing values to contain subregisters. For now we just test
* everything in the stack.
*/
for (int i = 0; i <= dom_index; i++) {
if (can_skip_interference(&current, &dom[i]))
continue;
/* TODO: in the original paper, just dom[dom_index] needs to be
* checked for interference. We implement the value-chasing extension
* as well as support for sub-registers, which complicates this
* significantly because it's no longer the case that if a dominates b
* dominates c and a and b don't interfere then we only need to check
* interference between b and c to be sure a and c don't interfere --
* this means we may have to check for interference against values
* higher in the stack than dom[dom_index]. In the paper there's a
* description of a way to do less interference tests with the
* value-chasing extension, but we'd have to come up with something
* ourselves for handling the similar problems that come up with
* allowing values to contain subregisters. For now we just test
* everything in the stack.
*/
for (int i = 0; i <= dom_index; i++) {
if (can_skip_interference(&current, &dom[i]))
continue;
/* Ok, now we actually have to check interference. Since we know
* that dom[i] dominates current, this boils down to checking
* whether dom[i] is live after current.
*/
if (ir3_def_live_after(live, dom[i].reg, current.reg->instr))
return true;
}
/* Ok, now we actually have to check interference. Since we know
* that dom[i] dominates current, this boils down to checking
* whether dom[i] is live after current.
*/
if (ir3_def_live_after(live, dom[i].reg, current.reg->instr))
return true;
}
dom[++dom_index] = current;
}
dom[++dom_index] = current;
}
return false;
return false;
}
static void
try_merge_defs(struct ir3_liveness *live,
struct ir3_register *a, struct ir3_register *b,
unsigned b_offset)
try_merge_defs(struct ir3_liveness *live, struct ir3_register *a,
struct ir3_register *b, unsigned b_offset)
{
struct ir3_merge_set *a_set = get_merge_set(a);
struct ir3_merge_set *b_set = get_merge_set(b);
struct ir3_merge_set *a_set = get_merge_set(a);
struct ir3_merge_set *b_set = get_merge_set(b);
if (a_set == b_set) {
/* Note: Even in this case we may not always successfully be able to
* coalesce this copy, if the offsets don't line up. But in any
* case, we can't do anything.
*/
return;
}
if (a_set == b_set) {
/* Note: Even in this case we may not always successfully be able to
* coalesce this copy, if the offsets don't line up. But in any
* case, we can't do anything.
*/
return;
}
int b_set_offset = a->merge_set_offset + b_offset - b->merge_set_offset;
int b_set_offset = a->merge_set_offset + b_offset - b->merge_set_offset;
if (!merge_sets_interfere(live, a_set, b_set, b_set_offset))
merge_merge_sets(a_set, b_set, b_set_offset);
if (!merge_sets_interfere(live, a_set, b_set, b_set_offset))
merge_merge_sets(a_set, b_set, b_set_offset);
}
static void
coalesce_phi(struct ir3_liveness *live,
struct ir3_instruction *phi)
coalesce_phi(struct ir3_liveness *live, struct ir3_instruction *phi)
{
for (unsigned i = 0; i < phi->srcs_count; i++) {
if (phi->srcs[i]->def)
try_merge_defs(live, phi->dsts[0], phi->srcs[i]->def, 0);
}
for (unsigned i = 0; i < phi->srcs_count; i++) {
if (phi->srcs[i]->def)
try_merge_defs(live, phi->dsts[0], phi->srcs[i]->def, 0);
}
}
static void
aggressive_coalesce_parallel_copy(struct ir3_liveness *live,
struct ir3_instruction *pcopy)
struct ir3_instruction *pcopy)
{
for (unsigned i = 0; i < pcopy->dsts_count; i++) {
if (!(pcopy->srcs[i]->flags & IR3_REG_SSA))
continue;
try_merge_defs(live, pcopy->dsts[i], pcopy->srcs[i]->def, 0);
}
for (unsigned i = 0; i < pcopy->dsts_count; i++) {
if (!(pcopy->srcs[i]->flags & IR3_REG_SSA))
continue;
try_merge_defs(live, pcopy->dsts[i], pcopy->srcs[i]->def, 0);
}
}
static void
aggressive_coalesce_split(struct ir3_liveness *live,
struct ir3_instruction *split)
struct ir3_instruction *split)
{
try_merge_defs(live, split->srcs[0]->def, split->dsts[0],
split->split.off * reg_elem_size(split->dsts[0]));
try_merge_defs(live, split->srcs[0]->def, split->dsts[0],
split->split.off * reg_elem_size(split->dsts[0]));
}
static void
aggressive_coalesce_collect(struct ir3_liveness *live,
struct ir3_instruction *collect)
struct ir3_instruction *collect)
{
for (unsigned i = 0, offset = 0; i < collect->srcs_count;
offset += reg_elem_size(collect->srcs[i]), i++) {
if (!(collect->srcs[i]->flags & IR3_REG_SSA))
continue;
try_merge_defs(live, collect->dsts[0], collect->srcs[i]->def, offset);
}
for (unsigned i = 0, offset = 0; i < collect->srcs_count;
offset += reg_elem_size(collect->srcs[i]), i++) {
if (!(collect->srcs[i]->flags & IR3_REG_SSA))
continue;
try_merge_defs(live, collect->dsts[0], collect->srcs[i]->def, offset);
}
}
static void
create_parallel_copy(struct ir3_block *block)
{
for (unsigned i = 0; i < 2; i++) {
if (!block->successors[i])
continue;
for (unsigned i = 0; i < 2; i++) {
if (!block->successors[i])
continue;
struct ir3_block *succ = block->successors[i];
struct ir3_block *succ = block->successors[i];
unsigned pred_idx = ir3_block_get_pred_index(succ, block);
unsigned pred_idx = ir3_block_get_pred_index(succ, block);
unsigned phi_count = 0;
foreach_instr (phi, &succ->instr_list) {
if (phi->opc != OPC_META_PHI)
break;
unsigned phi_count = 0;
foreach_instr (phi, &succ->instr_list) {
if (phi->opc != OPC_META_PHI)
break;
/* Avoid undef */
if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
!phi->srcs[pred_idx]->def)
continue;
/* Avoid undef */
if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
!phi->srcs[pred_idx]->def)
continue;
/* We don't support critical edges. If we were to support them,
* we'd need to insert parallel copies after the phi node to solve
* the lost-copy problem.
*/
assert(i == 0 && !block->successors[1]);
phi_count++;
}
/* We don't support critical edges. If we were to support them,
* we'd need to insert parallel copies after the phi node to solve
* the lost-copy problem.
*/
assert(i == 0 && !block->successors[1]);
phi_count++;
}
if (phi_count == 0)
continue;
if (phi_count == 0)
continue;
struct ir3_register *src[phi_count];
unsigned j = 0;
foreach_instr (phi, &succ->instr_list) {
if (phi->opc != OPC_META_PHI)
break;
if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
!phi->srcs[pred_idx]->def)
continue;
src[j++] = phi->srcs[pred_idx];
}
assert(j == phi_count);
struct ir3_register *src[phi_count];
unsigned j = 0;
foreach_instr (phi, &succ->instr_list) {
if (phi->opc != OPC_META_PHI)
break;
if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
!phi->srcs[pred_idx]->def)
continue;
src[j++] = phi->srcs[pred_idx];
}
assert(j == phi_count);
struct ir3_instruction *pcopy =
ir3_instr_create(block, OPC_META_PARALLEL_COPY, phi_count, phi_count);
struct ir3_instruction *pcopy =
ir3_instr_create(block, OPC_META_PARALLEL_COPY, phi_count, phi_count);
for (j = 0; j < phi_count; j++) {
struct ir3_register *reg = __ssa_dst(pcopy);
reg->flags |= src[j]->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
reg->size = reg_elems(src[j]);
}
for (j = 0; j < phi_count; j++) {
struct ir3_register *reg = __ssa_dst(pcopy);
reg->flags |= src[j]->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
reg->size = reg_elems(src[j]);
}
for (j = 0; j < phi_count; j++) {
pcopy->srcs[pcopy->srcs_count++] = ir3_reg_clone(block->shader, src[j]);
}
for (j = 0; j < phi_count; j++) {
pcopy->srcs[pcopy->srcs_count++] =
ir3_reg_clone(block->shader, src[j]);
}
j = 0;
foreach_instr (phi, &succ->instr_list) {
if (phi->opc != OPC_META_PHI)
break;
if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
!phi->srcs[pred_idx]->def)
continue;
phi->srcs[pred_idx]->def = pcopy->dsts[j];
phi->srcs[pred_idx]->flags = pcopy->dsts[j]->flags;
j++;
}
assert(j == phi_count);
}
j = 0;
foreach_instr (phi, &succ->instr_list) {
if (phi->opc != OPC_META_PHI)
break;
if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
!phi->srcs[pred_idx]->def)
continue;
phi->srcs[pred_idx]->def = pcopy->dsts[j];
phi->srcs[pred_idx]->flags = pcopy->dsts[j]->flags;
j++;
}
assert(j == phi_count);
}
}
void
ir3_create_parallel_copies(struct ir3 *ir)
{
foreach_block (block, &ir->block_list) {
create_parallel_copy(block);
}
foreach_block (block, &ir->block_list) {
create_parallel_copy(block);
}
}
static void
index_merge_sets(struct ir3 *ir)
{
unsigned offset = 0;
foreach_block (block, &ir->block_list) {
foreach_instr (instr, &block->instr_list) {
for (unsigned i = 0; i < instr->dsts_count; i++) {
struct ir3_register *dst = instr->dsts[i];
unsigned offset = 0;
foreach_block (block, &ir->block_list) {
foreach_instr (instr, &block->instr_list) {
for (unsigned i = 0; i < instr->dsts_count; i++) {
struct ir3_register *dst = instr->dsts[i];
unsigned dst_offset;
struct ir3_merge_set *merge_set = dst->merge_set;
unsigned size = reg_size(dst);
if (merge_set) {
if (merge_set->interval_start == ~0) {
merge_set->interval_start = offset;
offset += merge_set->size;
}
dst_offset = merge_set->interval_start + dst->merge_set_offset;
} else {
dst_offset = offset;
offset += size;
}
unsigned dst_offset;
struct ir3_merge_set *merge_set = dst->merge_set;
unsigned size = reg_size(dst);
if (merge_set) {
if (merge_set->interval_start == ~0) {
merge_set->interval_start = offset;
offset += merge_set->size;
}
dst_offset = merge_set->interval_start + dst->merge_set_offset;
} else {
dst_offset = offset;
offset += size;
}
dst->interval_start = dst_offset;
dst->interval_end = dst_offset + size;
}
}
}
dst->interval_start = dst_offset;
dst->interval_end = dst_offset + size;
}
}
}
}
#define RESET "\x1b[0m"
#define BLUE "\x1b[0;34m"
#define SYN_SSA(x) BLUE x RESET
#define RESET "\x1b[0m"
#define BLUE "\x1b[0;34m"
#define SYN_SSA(x) BLUE x RESET
static void
dump_merge_sets(struct ir3 *ir)
{
printf("merge sets:\n");
struct set *merge_sets = _mesa_pointer_set_create(NULL);
foreach_block (block, &ir->block_list) {
foreach_instr (instr, &block->instr_list) {
for (unsigned i = 0; i < instr->dsts_count; i++) {
struct ir3_register *dst = instr->dsts[i];
printf("merge sets:\n");
struct set *merge_sets = _mesa_pointer_set_create(NULL);
foreach_block (block, &ir->block_list) {
foreach_instr (instr, &block->instr_list) {
for (unsigned i = 0; i < instr->dsts_count; i++) {
struct ir3_register *dst = instr->dsts[i];
struct ir3_merge_set *merge_set = dst->merge_set;
if (!merge_set || _mesa_set_search(merge_sets, merge_set))
continue;
struct ir3_merge_set *merge_set = dst->merge_set;
if (!merge_set || _mesa_set_search(merge_sets, merge_set))
continue;
printf("merge set, size %u, align %u:\n", merge_set->size, merge_set->alignment);
for (unsigned j = 0; j < merge_set->regs_count; j++) {
struct ir3_register *reg = merge_set->regs[j];
printf("\t"SYN_SSA("ssa_%u")":%u, offset %u\n", reg->instr->serialno,
reg->name, reg->merge_set_offset);
}
printf("merge set, size %u, align %u:\n", merge_set->size,
merge_set->alignment);
for (unsigned j = 0; j < merge_set->regs_count; j++) {
struct ir3_register *reg = merge_set->regs[j];
printf("\t" SYN_SSA("ssa_%u") ":%u, offset %u\n",
reg->instr->serialno, reg->name, reg->merge_set_offset);
}
_mesa_set_add(merge_sets, merge_set);
}
}
}
_mesa_set_add(merge_sets, merge_set);
}
}
}
ralloc_free(merge_sets);
ralloc_free(merge_sets);
}
void
ir3_merge_regs(struct ir3_liveness *live, struct ir3 *ir)
{
index_instrs(ir3_start_block(ir), 0);
index_instrs(ir3_start_block(ir), 0);
/* First pass: coalesce phis, which must be together. */
foreach_block (block, &ir->block_list) {
foreach_instr (instr, &block->instr_list) {
if (instr->opc != OPC_META_PHI)
break;
/* First pass: coalesce phis, which must be together. */
foreach_block (block, &ir->block_list) {
foreach_instr (instr, &block->instr_list) {
if (instr->opc != OPC_META_PHI)
break;
coalesce_phi(live, instr);
}
}
coalesce_phi(live, instr);
}
}
/* Second pass: aggressively coalesce parallelcopy, split, collect */
foreach_block (block, &ir->block_list) {
foreach_instr (instr, &block->instr_list) {
switch (instr->opc) {
case OPC_META_SPLIT:
aggressive_coalesce_split(live, instr);
break;
case OPC_META_COLLECT:
aggressive_coalesce_collect(live, instr);
break;
case OPC_META_PARALLEL_COPY:
aggressive_coalesce_parallel_copy(live, instr);
break;
default:
break;
}
}
}
/* Second pass: aggressively coalesce parallelcopy, split, collect */
foreach_block (block, &ir->block_list) {
foreach_instr (instr, &block->instr_list) {
switch (instr->opc) {
case OPC_META_SPLIT:
aggressive_coalesce_split(live, instr);
break;
case OPC_META_COLLECT:
aggressive_coalesce_collect(live, instr);
break;
case OPC_META_PARALLEL_COPY:
aggressive_coalesce_parallel_copy(live, instr);
break;
default:
break;
}
}
}
index_merge_sets(ir);
index_merge_sets(ir);
if (ir3_shader_debug & IR3_DBG_RAMSGS)
dump_merge_sets(ir);
if (ir3_shader_debug & IR3_DBG_RAMSGS)
dump_merge_sets(ir);
}

File diff suppressed because it is too large


@ -43,15 +43,19 @@ bool ir3_nir_move_varying_inputs(nir_shader *shader);
int ir3_nir_coord_offset(nir_ssa_def *ssa);
bool ir3_nir_lower_tex_prefetch(nir_shader *shader);
void ir3_nir_lower_to_explicit_output(nir_shader *shader,
struct ir3_shader_variant *v, unsigned topology);
void ir3_nir_lower_to_explicit_input(nir_shader *shader, struct ir3_shader_variant *v);
void ir3_nir_lower_tess_ctrl(nir_shader *shader, struct ir3_shader_variant *v, unsigned topology);
void ir3_nir_lower_tess_eval(nir_shader *shader, struct ir3_shader_variant *v, unsigned topology);
struct ir3_shader_variant *v,
unsigned topology);
void ir3_nir_lower_to_explicit_input(nir_shader *shader,
struct ir3_shader_variant *v);
void ir3_nir_lower_tess_ctrl(nir_shader *shader, struct ir3_shader_variant *v,
unsigned topology);
void ir3_nir_lower_tess_eval(nir_shader *shader, struct ir3_shader_variant *v,
unsigned topology);
void ir3_nir_lower_gs(nir_shader *shader);
const nir_shader_compiler_options * ir3_get_compiler_options(struct ir3_compiler *compiler);
const nir_shader_compiler_options *
ir3_get_compiler_options(struct ir3_compiler *compiler);
void ir3_optimize_loop(struct ir3_compiler *compiler, nir_shader *s);
void ir3_nir_lower_io_to_temporaries(nir_shader *s);
void ir3_finalize_nir(struct ir3_compiler *compiler, nir_shader *s);
@ -59,29 +63,30 @@ void ir3_nir_post_finalize(struct ir3_compiler *compiler, nir_shader *s);
void ir3_nir_lower_variant(struct ir3_shader_variant *so, nir_shader *s);
void ir3_setup_const_state(nir_shader *nir, struct ir3_shader_variant *v,
struct ir3_const_state *const_state);
struct ir3_const_state *const_state);
bool ir3_nir_lower_load_constant(nir_shader *nir, struct ir3_shader_variant *v);
void ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader_variant *v);
bool ir3_nir_lower_ubo_loads(nir_shader *nir, struct ir3_shader_variant *v);
bool ir3_nir_fixup_load_uniform(nir_shader *nir);
nir_ssa_def *
ir3_nir_try_propagate_bit_shift(nir_builder *b, nir_ssa_def *offset, int32_t shift);
nir_ssa_def *ir3_nir_try_propagate_bit_shift(nir_builder *b,
nir_ssa_def *offset,
int32_t shift);
static inline nir_intrinsic_instr *
ir3_bindless_resource(nir_src src)
{
if (!src.is_ssa)
return NULL;
if (!src.is_ssa)
return NULL;
if (src.ssa->parent_instr->type != nir_instr_type_intrinsic)
return NULL;
if (src.ssa->parent_instr->type != nir_instr_type_intrinsic)
return NULL;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(src.ssa->parent_instr);
if (intrin->intrinsic != nir_intrinsic_bindless_resource_ir3)
return NULL;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(src.ssa->parent_instr);
if (intrin->intrinsic != nir_intrinsic_bindless_resource_ir3)
return NULL;
return intrin;
return intrin;
}
#endif /* IR3_NIR_H_ */


@ -21,54 +21,55 @@
* SOFTWARE.
*/
#include "ir3_nir.h"
#include "ir3_compiler.h"
#include "compiler/nir/nir.h"
#include "compiler/nir/nir_builder.h"
#include "util/u_math.h"
#include "ir3_compiler.h"
#include "ir3_nir.h"
static inline bool
get_ubo_load_range(nir_shader *nir, nir_intrinsic_instr *instr, uint32_t alignment, struct ir3_ubo_range *r)
get_ubo_load_range(nir_shader *nir, nir_intrinsic_instr *instr,
uint32_t alignment, struct ir3_ubo_range *r)
{
uint32_t offset = nir_intrinsic_range_base(instr);
uint32_t size = nir_intrinsic_range(instr);
uint32_t offset = nir_intrinsic_range_base(instr);
uint32_t size = nir_intrinsic_range(instr);
/* If the offset is constant, the range is trivial (and NIR may not have
* figured it out).
*/
if (nir_src_is_const(instr->src[1])) {
offset = nir_src_as_uint(instr->src[1]);
size = nir_intrinsic_dest_components(instr) * 4;
}
/* If the offset is constant, the range is trivial (and NIR may not have
* figured it out).
*/
if (nir_src_is_const(instr->src[1])) {
offset = nir_src_as_uint(instr->src[1]);
size = nir_intrinsic_dest_components(instr) * 4;
}
/* If we haven't figured out the range accessed in the UBO, bail. */
if (size == ~0)
return false;
/* If we haven't figured out the range accessed in the UBO, bail. */
if (size == ~0)
return false;
r->start = ROUND_DOWN_TO(offset, alignment * 16);
r->end = ALIGN(offset + size, alignment * 16);
r->start = ROUND_DOWN_TO(offset, alignment * 16);
r->end = ALIGN(offset + size, alignment * 16);
return true;
return true;
}
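As a quick worked example of the rounding above (values chosen purely for illustration; const_upload_unit is assumed to be 1, i.e. 16-byte units): an 8-byte load at byte offset 36 produces the range [32, 48).

#include <assert.h>

static void
example_ubo_range(void)
{
   const unsigned alignment = 1;        /* assumed const_upload_unit */
   const unsigned unit = alignment * 16;
   const unsigned offset = 36, size = 8;

   unsigned start = (offset / unit) * unit;                    /* ROUND_DOWN_TO */
   unsigned end = ((offset + size + unit - 1) / unit) * unit;  /* ALIGN */

   assert(start == 32 && end == 48);
}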
static bool
get_ubo_info(nir_intrinsic_instr *instr, struct ir3_ubo_info *ubo)
{
if (nir_src_is_const(instr->src[0])) {
ubo->block = nir_src_as_uint(instr->src[0]);
ubo->bindless_base = 0;
ubo->bindless = false;
return true;
} else {
nir_intrinsic_instr *rsrc = ir3_bindless_resource(instr->src[0]);
if (rsrc && nir_src_is_const(rsrc->src[0])) {
ubo->block = nir_src_as_uint(rsrc->src[0]);
ubo->bindless_base = nir_intrinsic_desc_set(rsrc);
ubo->bindless = true;
return true;
}
}
return false;
if (nir_src_is_const(instr->src[0])) {
ubo->block = nir_src_as_uint(instr->src[0]);
ubo->bindless_base = 0;
ubo->bindless = false;
return true;
} else {
nir_intrinsic_instr *rsrc = ir3_bindless_resource(instr->src[0]);
if (rsrc && nir_src_is_const(rsrc->src[0])) {
ubo->block = nir_src_as_uint(rsrc->src[0]);
ubo->bindless_base = nir_intrinsic_desc_set(rsrc);
ubo->bindless = true;
return true;
}
}
return false;
}
/**
@ -76,24 +77,23 @@ get_ubo_info(nir_intrinsic_instr *instr, struct ir3_ubo_info *ubo)
*/
static const struct ir3_ubo_range *
get_existing_range(nir_intrinsic_instr *instr,
const struct ir3_ubo_analysis_state *state,
struct ir3_ubo_range *r)
const struct ir3_ubo_analysis_state *state,
struct ir3_ubo_range *r)
{
struct ir3_ubo_info ubo = {};
struct ir3_ubo_info ubo = {};
if (!get_ubo_info(instr, &ubo))
return NULL;
if (!get_ubo_info(instr, &ubo))
return NULL;
for (int i = 0; i < state->num_enabled; i++) {
const struct ir3_ubo_range *range = &state->range[i];
if (!memcmp(&range->ubo, &ubo, sizeof(ubo)) &&
r->start >= range->start &&
r->end <= range->end) {
return range;
}
}
for (int i = 0; i < state->num_enabled; i++) {
const struct ir3_ubo_range *range = &state->range[i];
if (!memcmp(&range->ubo, &ubo, sizeof(ubo)) && r->start >= range->start &&
r->end <= range->end) {
return range;
}
}
return NULL;
return NULL;
}
/**
@ -103,26 +103,26 @@ get_existing_range(nir_intrinsic_instr *instr,
static void
merge_neighbors(struct ir3_ubo_analysis_state *state, int index)
{
struct ir3_ubo_range *a = &state->range[index];
struct ir3_ubo_range *a = &state->range[index];
/* index is always the first slot that would have neighbored/overlapped with
* the new range.
*/
for (int i = index + 1; i < state->num_enabled; i++) {
struct ir3_ubo_range *b = &state->range[i];
if (memcmp(&a->ubo, &b->ubo, sizeof(a->ubo)))
continue;
/* index is always the first slot that would have neighbored/overlapped with
* the new range.
*/
for (int i = index + 1; i < state->num_enabled; i++) {
struct ir3_ubo_range *b = &state->range[i];
if (memcmp(&a->ubo, &b->ubo, sizeof(a->ubo)))
continue;
if (a->start > b->end || a->end < b->start)
continue;
if (a->start > b->end || a->end < b->start)
continue;
/* Merge B into A. */
a->start = MIN2(a->start, b->start);
a->end = MAX2(a->end, b->end);
/* Merge B into A. */
a->start = MIN2(a->start, b->start);
a->end = MAX2(a->end, b->end);
/* Swap the last enabled range into B's now unused slot */
*b = state->range[--state->num_enabled];
}
/* Swap the last enabled range into B's now unused slot */
*b = state->range[--state->num_enabled];
}
}
/**
@ -134,59 +134,59 @@ merge_neighbors(struct ir3_ubo_analysis_state *state, int index)
*/
static void
gather_ubo_ranges(nir_shader *nir, nir_intrinsic_instr *instr,
struct ir3_ubo_analysis_state *state, uint32_t alignment,
uint32_t *upload_remaining)
struct ir3_ubo_analysis_state *state, uint32_t alignment,
uint32_t *upload_remaining)
{
if (ir3_shader_debug & IR3_DBG_NOUBOOPT)
return;
if (ir3_shader_debug & IR3_DBG_NOUBOOPT)
return;
struct ir3_ubo_info ubo = {};
if (!get_ubo_info(instr, &ubo))
return;
struct ir3_ubo_info ubo = {};
if (!get_ubo_info(instr, &ubo))
return;
struct ir3_ubo_range r;
if (!get_ubo_load_range(nir, instr, alignment, &r))
return;
struct ir3_ubo_range r;
if (!get_ubo_load_range(nir, instr, alignment, &r))
return;
/* See if there's an existing range for this UBO we want to merge into. */
for (int i = 0; i < state->num_enabled; i++) {
struct ir3_ubo_range *plan_r = &state->range[i];
if (memcmp(&plan_r->ubo, &ubo, sizeof(ubo)))
continue;
/* See if there's an existing range for this UBO we want to merge into. */
for (int i = 0; i < state->num_enabled; i++) {
struct ir3_ubo_range *plan_r = &state->range[i];
if (memcmp(&plan_r->ubo, &ubo, sizeof(ubo)))
continue;
/* Don't extend existing uploads unless they're
* neighboring/overlapping.
*/
if (r.start > plan_r->end || r.end < plan_r->start)
continue;
/* Don't extend existing uploads unless they're
* neighboring/overlapping.
*/
if (r.start > plan_r->end || r.end < plan_r->start)
continue;
r.start = MIN2(r.start, plan_r->start);
r.end = MAX2(r.end, plan_r->end);
r.start = MIN2(r.start, plan_r->start);
r.end = MAX2(r.end, plan_r->end);
uint32_t added = (plan_r->start - r.start) + (r.end - plan_r->end);
if (added >= *upload_remaining)
return;
uint32_t added = (plan_r->start - r.start) + (r.end - plan_r->end);
if (added >= *upload_remaining)
return;
plan_r->start = r.start;
plan_r->end = r.end;
*upload_remaining -= added;
plan_r->start = r.start;
plan_r->end = r.end;
*upload_remaining -= added;
merge_neighbors(state, i);
return;
}
merge_neighbors(state, i);
return;
}
if (state->num_enabled == ARRAY_SIZE(state->range))
return;
if (state->num_enabled == ARRAY_SIZE(state->range))
return;
uint32_t added = r.end - r.start;
if (added >= *upload_remaining)
return;
uint32_t added = r.end - r.start;
if (added >= *upload_remaining)
return;
struct ir3_ubo_range *plan_r = &state->range[state->num_enabled++];
plan_r->ubo = ubo;
plan_r->start = r.start;
plan_r->end = r.end;
*upload_remaining -= added;
struct ir3_ubo_range *plan_r = &state->range[state->num_enabled++];
plan_r->ubo = ubo;
plan_r->start = r.start;
plan_r->end = r.end;
*upload_remaining -= added;
}
/* For indirect offset, it is common to see a pattern of multiple
@ -197,7 +197,8 @@ gather_ubo_ranges(nir_shader *nir, nir_intrinsic_instr *instr,
*
* Detect this, and peel out the const_offset part, to end up with:
*
* vec4 32 ssa_34 = intrinsic load_uniform (ssa_base) (base=N+const_offset, 0, 0)
* vec4 32 ssa_34 = intrinsic load_uniform (ssa_base) (base=N+const_offset,
* 0, 0)
*
* Or similarly:
*
@ -207,7 +208,8 @@ gather_ubo_ranges(nir_shader *nir, nir_intrinsic_instr *instr,
* Can be converted to:
*
* vec1 32 ssa_base = imul24 a, b
* vec4 32 ssa_34 = intrinsic load_uniform (ssa_base) (base=N+const_offset, 0, 0)
* vec4 32 ssa_34 = intrinsic load_uniform (ssa_base) (base=N+const_offset,
* 0, 0)
*
* This gives the other opt passes something much easier to work
* with (ie. not requiring value range tracking)
@ -215,38 +217,38 @@ gather_ubo_ranges(nir_shader *nir, nir_intrinsic_instr *instr,
static void
handle_partial_const(nir_builder *b, nir_ssa_def **srcp, int *offp)
{
if ((*srcp)->parent_instr->type != nir_instr_type_alu)
return;
if ((*srcp)->parent_instr->type != nir_instr_type_alu)
return;
nir_alu_instr *alu = nir_instr_as_alu((*srcp)->parent_instr);
nir_alu_instr *alu = nir_instr_as_alu((*srcp)->parent_instr);
if (alu->op == nir_op_imad24_ir3) {
/* This case is slightly more complicated as we need to
* replace the imad24_ir3 with an imul24:
*/
if (!nir_src_is_const(alu->src[2].src))
return;
if (alu->op == nir_op_imad24_ir3) {
/* This case is slightly more complicated as we need to
* replace the imad24_ir3 with an imul24:
*/
if (!nir_src_is_const(alu->src[2].src))
return;
*offp += nir_src_as_uint(alu->src[2].src);
*srcp = nir_imul24(b, nir_ssa_for_alu_src(b, alu, 0),
nir_ssa_for_alu_src(b, alu, 1));
*offp += nir_src_as_uint(alu->src[2].src);
*srcp = nir_imul24(b, nir_ssa_for_alu_src(b, alu, 0),
nir_ssa_for_alu_src(b, alu, 1));
return;
}
return;
}
if (alu->op != nir_op_iadd)
return;
if (alu->op != nir_op_iadd)
return;
if (!(alu->src[0].src.is_ssa && alu->src[1].src.is_ssa))
return;
if (!(alu->src[0].src.is_ssa && alu->src[1].src.is_ssa))
return;
if (nir_src_is_const(alu->src[0].src)) {
*offp += nir_src_as_uint(alu->src[0].src);
*srcp = alu->src[1].src.ssa;
} else if (nir_src_is_const(alu->src[1].src)) {
*srcp = alu->src[0].src.ssa;
*offp += nir_src_as_uint(alu->src[1].src);
}
if (nir_src_is_const(alu->src[0].src)) {
*offp += nir_src_as_uint(alu->src[0].src);
*srcp = alu->src[1].src.ssa;
} else if (nir_src_is_const(alu->src[1].src)) {
*srcp = alu->src[0].src.ssa;
*offp += nir_src_as_uint(alu->src[1].src);
}
}
/* Tracks the maximum bindful UBO accessed so that we reduce the UBO
@ -255,258 +257,256 @@ handle_partial_const(nir_builder *b, nir_ssa_def **srcp, int *offp)
static void
track_ubo_use(nir_intrinsic_instr *instr, nir_builder *b, int *num_ubos)
{
if (ir3_bindless_resource(instr->src[0])) {
assert(!b->shader->info.first_ubo_is_default_ubo); /* only set for GL */
return;
}
if (ir3_bindless_resource(instr->src[0])) {
assert(!b->shader->info.first_ubo_is_default_ubo); /* only set for GL */
return;
}
if (nir_src_is_const(instr->src[0])) {
int block = nir_src_as_uint(instr->src[0]);
*num_ubos = MAX2(*num_ubos, block + 1);
} else {
*num_ubos = b->shader->info.num_ubos;
}
if (nir_src_is_const(instr->src[0])) {
int block = nir_src_as_uint(instr->src[0]);
*num_ubos = MAX2(*num_ubos, block + 1);
} else {
*num_ubos = b->shader->info.num_ubos;
}
}
static bool
lower_ubo_load_to_uniform(nir_intrinsic_instr *instr, nir_builder *b,
const struct ir3_ubo_analysis_state *state,
int *num_ubos, uint32_t alignment)
const struct ir3_ubo_analysis_state *state,
int *num_ubos, uint32_t alignment)
{
b->cursor = nir_before_instr(&instr->instr);
b->cursor = nir_before_instr(&instr->instr);
struct ir3_ubo_range r;
if (!get_ubo_load_range(b->shader, instr, alignment, &r)) {
track_ubo_use(instr, b, num_ubos);
return false;
}
struct ir3_ubo_range r;
if (!get_ubo_load_range(b->shader, instr, alignment, &r)) {
track_ubo_use(instr, b, num_ubos);
return false;
}
/* We don't lower dynamic block index UBO loads to load_uniform, but we
* could probably with some effort determine a block stride in number of
* registers.
*/
const struct ir3_ubo_range *range = get_existing_range(instr, state, &r);
if (!range) {
track_ubo_use(instr, b, num_ubos);
return false;
}
/* We don't lower dynamic block index UBO loads to load_uniform, but we
* could probably with some effort determine a block stride in number of
* registers.
*/
const struct ir3_ubo_range *range = get_existing_range(instr, state, &r);
if (!range) {
track_ubo_use(instr, b, num_ubos);
return false;
}
nir_ssa_def *ubo_offset = nir_ssa_for_src(b, instr->src[1], 1);
int const_offset = 0;
nir_ssa_def *ubo_offset = nir_ssa_for_src(b, instr->src[1], 1);
int const_offset = 0;
handle_partial_const(b, &ubo_offset, &const_offset);
handle_partial_const(b, &ubo_offset, &const_offset);
/* UBO offset is in bytes, but uniform offset is in units of
* dwords, so we need to divide by 4 (right-shift by 2). For ldc the
* offset is in units of 16 bytes, so we need to multiply by 4. And
* also the same for the constant part of the offset:
*/
const int shift = -2;
nir_ssa_def *new_offset = ir3_nir_try_propagate_bit_shift(b, ubo_offset, -2);
nir_ssa_def *uniform_offset = NULL;
if (new_offset) {
uniform_offset = new_offset;
} else {
uniform_offset = shift > 0 ?
nir_ishl(b, ubo_offset, nir_imm_int(b, shift)) :
nir_ushr(b, ubo_offset, nir_imm_int(b, -shift));
}
/* UBO offset is in bytes, but uniform offset is in units of
* dwords, so we need to divide by 4 (right-shift by 2). For ldc the
* offset is in units of 16 bytes, so we need to multiply by 4. And
* also the same for the constant part of the offset:
*/
const int shift = -2;
nir_ssa_def *new_offset = ir3_nir_try_propagate_bit_shift(b, ubo_offset, -2);
nir_ssa_def *uniform_offset = NULL;
if (new_offset) {
uniform_offset = new_offset;
} else {
uniform_offset = shift > 0
? nir_ishl(b, ubo_offset, nir_imm_int(b, shift))
: nir_ushr(b, ubo_offset, nir_imm_int(b, -shift));
}
debug_assert(!(const_offset & 0x3));
const_offset >>= 2;
debug_assert(!(const_offset & 0x3));
const_offset >>= 2;
const int range_offset = ((int)range->offset - (int)range->start) / 4;
const_offset += range_offset;
const int range_offset = ((int)range->offset - (int)range->start) / 4;
const_offset += range_offset;
/* The range_offset could be negative: if only part of the UBO
* block is accessed, range->start can be greater than range->offset.
* But we can't underflow const_offset. If necessary we need to
* insert nir instructions to compensate (which can hopefully be
* optimized away)
*/
if (const_offset < 0) {
uniform_offset = nir_iadd_imm(b, uniform_offset, const_offset);
const_offset = 0;
}
/* The range_offset could be negative: if only part of the UBO
* block is accessed, range->start can be greater than range->offset.
* But we can't underflow const_offset. If necessary we need to
* insert nir instructions to compensate (which can hopefully be
* optimized away)
*/
if (const_offset < 0) {
uniform_offset = nir_iadd_imm(b, uniform_offset, const_offset);
const_offset = 0;
}
nir_ssa_def *uniform =
nir_load_uniform(b, instr->num_components, instr->dest.ssa.bit_size, uniform_offset, .base = const_offset);
nir_ssa_def *uniform =
nir_load_uniform(b, instr->num_components, instr->dest.ssa.bit_size,
uniform_offset, .base = const_offset);
nir_ssa_def_rewrite_uses(&instr->dest.ssa,
uniform);
nir_ssa_def_rewrite_uses(&instr->dest.ssa, uniform);
nir_instr_remove(&instr->instr);
nir_instr_remove(&instr->instr);
return true;
return true;
}
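As a small worked example of the byte-to-dword conversion above (numbers invented for illustration): a constant byte offset of 48 becomes 12 dwords, and if the range mapping would make the constant part negative, the remainder is folded back into the indirect offset instead.

#include <assert.h>

static void
example_offset_conversion(void)
{
   int const_offset = 48 >> 2;          /* bytes -> dwords = 12 */
   int range_offset = (16 - 64) / 4;    /* offset 16, start 64 -> -12 */

   const_offset += range_offset;        /* 0 here; if negative, the    */
   assert(const_offset == 0);           /* remainder would be iadd'ed  */
}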
static bool
instr_is_load_ubo(nir_instr *instr)
{
if (instr->type != nir_instr_type_intrinsic)
return false;
if (instr->type != nir_instr_type_intrinsic)
return false;
nir_intrinsic_op op = nir_instr_as_intrinsic(instr)->intrinsic;
nir_intrinsic_op op = nir_instr_as_intrinsic(instr)->intrinsic;
/* nir_lower_ubo_vec4 happens after this pass. */
assert(op != nir_intrinsic_load_ubo_vec4);
/* nir_lower_ubo_vec4 happens after this pass. */
assert(op != nir_intrinsic_load_ubo_vec4);
return op == nir_intrinsic_load_ubo;
return op == nir_intrinsic_load_ubo;
}
void
ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader_variant *v)
{
struct ir3_const_state *const_state = ir3_const_state(v);
struct ir3_ubo_analysis_state *state = &const_state->ubo_state;
struct ir3_compiler *compiler = v->shader->compiler;
struct ir3_const_state *const_state = ir3_const_state(v);
struct ir3_ubo_analysis_state *state = &const_state->ubo_state;
struct ir3_compiler *compiler = v->shader->compiler;
/* Limit our uploads to the amount of constant buffer space available in
* the hardware, minus what the shader compiler may need for various
* driver params. We do this UBO-to-push-constant before the real
* allocation of the driver params' const space, because UBO pointers can
* be driver params but this pass usually eliminates them.
*/
struct ir3_const_state worst_case_const_state = { };
ir3_setup_const_state(nir, v, &worst_case_const_state);
const uint32_t max_upload = (ir3_max_const(v) -
worst_case_const_state.offsets.immediate) * 16;
/* Limit our uploads to the amount of constant buffer space available in
* the hardware, minus what the shader compiler may need for various
* driver params. We do this UBO-to-push-constant before the real
* allocation of the driver params' const space, because UBO pointers can
* be driver params but this pass usually eliminates them.
*/
struct ir3_const_state worst_case_const_state = {};
ir3_setup_const_state(nir, v, &worst_case_const_state);
const uint32_t max_upload =
(ir3_max_const(v) - worst_case_const_state.offsets.immediate) * 16;
memset(state, 0, sizeof(*state));
memset(state, 0, sizeof(*state));
uint32_t upload_remaining = max_upload;
nir_foreach_function (function, nir) {
if (function->impl) {
nir_foreach_block (block, function->impl) {
nir_foreach_instr (instr, block) {
if (instr_is_load_ubo(instr))
gather_ubo_ranges(nir, nir_instr_as_intrinsic(instr),
state, compiler->const_upload_unit,
&upload_remaining);
}
}
}
}
uint32_t upload_remaining = max_upload;
nir_foreach_function (function, nir) {
if (function->impl) {
nir_foreach_block (block, function->impl) {
nir_foreach_instr (instr, block) {
if (instr_is_load_ubo(instr))
gather_ubo_ranges(nir, nir_instr_as_intrinsic(instr), state,
compiler->const_upload_unit,
&upload_remaining);
}
}
}
}
/* For now, everything we upload is accessed statically and thus will be
* used by the shader. Once we can upload dynamically indexed data, we may
* upload sparsely accessed arrays, at which point we probably want to
* give priority to smaller UBOs, on the assumption that big UBOs will be
* accessed dynamically. Alternatively, we can track statically and
* dynamically accessed ranges separately and upload static ranges
* first.
*/
/* For now, everything we upload is accessed statically and thus will be
* used by the shader. Once we can upload dynamically indexed data, we may
* upload sparsely accessed arrays, at which point we probably want to
* give priority to smaller UBOs, on the assumption that big UBOs will be
* accessed dynamically. Alternatively, we can track statically and
* dynamically accessed ranges separately and upload static ranges
* first.
*/
uint32_t offset = v->shader->num_reserved_user_consts * 16;
for (uint32_t i = 0; i < state->num_enabled; i++) {
uint32_t range_size = state->range[i].end - state->range[i].start;
uint32_t offset = v->shader->num_reserved_user_consts * 16;
for (uint32_t i = 0; i < state->num_enabled; i++) {
uint32_t range_size = state->range[i].end - state->range[i].start;
debug_assert(offset <= max_upload);
state->range[i].offset = offset;
assert(offset <= max_upload);
offset += range_size;
}
state->size = offset;
debug_assert(offset <= max_upload);
state->range[i].offset = offset;
assert(offset <= max_upload);
offset += range_size;
}
state->size = offset;
}
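For a rough feel of the budget computed above (all numbers are assumptions for the sake of the example, not real hardware values): with 256 vec4 constants available and 8 vec4s reserved for worst-case immediates, the pass may promote up to (256 - 8) * 16 = 3968 bytes of UBO data to push constants.

#include <assert.h>

static void
example_upload_budget(void)
{
   const unsigned max_const_vec4 = 256;   /* assumed ir3_max_const() value */
   const unsigned immediates_vec4 = 8;    /* assumed worst-case immediates */

   unsigned max_upload = (max_const_vec4 - immediates_vec4) * 16; /* bytes */
   assert(max_upload == 3968);
}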
bool
ir3_nir_lower_ubo_loads(nir_shader *nir, struct ir3_shader_variant *v)
{
struct ir3_compiler *compiler = v->shader->compiler;
/* For the binning pass variant, we re-use the corresponding draw-pass
* variants const_state and ubo state. To make these clear, in this
* pass it is const (read-only)
*/
const struct ir3_const_state *const_state = ir3_const_state(v);
const struct ir3_ubo_analysis_state *state = &const_state->ubo_state;
struct ir3_compiler *compiler = v->shader->compiler;
/* For the binning pass variant, we re-use the corresponding draw-pass
* variants const_state and ubo state. To make these clear, in this
* pass it is const (read-only)
*/
const struct ir3_const_state *const_state = ir3_const_state(v);
const struct ir3_ubo_analysis_state *state = &const_state->ubo_state;
int num_ubos = 0;
bool progress = false;
nir_foreach_function (function, nir) {
if (function->impl) {
nir_builder builder;
nir_builder_init(&builder, function->impl);
nir_foreach_block (block, function->impl) {
nir_foreach_instr_safe (instr, block) {
if (!instr_is_load_ubo(instr))
continue;
progress |=
lower_ubo_load_to_uniform(nir_instr_as_intrinsic(instr),
&builder, state, &num_ubos,
compiler->const_upload_unit);
}
}
int num_ubos = 0;
bool progress = false;
nir_foreach_function (function, nir) {
if (function->impl) {
nir_builder builder;
nir_builder_init(&builder, function->impl);
nir_foreach_block (block, function->impl) {
nir_foreach_instr_safe (instr, block) {
if (!instr_is_load_ubo(instr))
continue;
progress |= lower_ubo_load_to_uniform(
nir_instr_as_intrinsic(instr), &builder, state, &num_ubos,
compiler->const_upload_unit);
}
}
nir_metadata_preserve(function->impl, nir_metadata_block_index |
nir_metadata_dominance);
}
}
/* Update the num_ubos field for GL (first_ubo_is_default_ubo). With
* Vulkan's bindless, we don't use the num_ubos field, so we can leave it
* incremented.
*/
if (nir->info.first_ubo_is_default_ubo)
nir->info.num_ubos = num_ubos;
nir_metadata_preserve(
function->impl, nir_metadata_block_index | nir_metadata_dominance);
}
}
/* Update the num_ubos field for GL (first_ubo_is_default_ubo). With
* Vulkan's bindless, we don't use the num_ubos field, so we can leave it
* incremented.
*/
if (nir->info.first_ubo_is_default_ubo)
nir->info.num_ubos = num_ubos;
return progress;
return progress;
}
static bool
fixup_load_uniform_filter(const nir_instr *instr, const void *arg)
{
if (instr->type != nir_instr_type_intrinsic)
return false;
return nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_uniform;
if (instr->type != nir_instr_type_intrinsic)
return false;
return nir_instr_as_intrinsic(instr)->intrinsic ==
nir_intrinsic_load_uniform;
}
static nir_ssa_def *
fixup_load_uniform_instr(struct nir_builder *b, nir_instr *instr, void *arg)
{
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
/* We don't need to worry about non-indirect case: */
if (nir_src_is_const(intr->src[0]))
return NULL;
/* We don't need to worry about non-indirect case: */
if (nir_src_is_const(intr->src[0]))
return NULL;
const unsigned base_offset_limit = (1 << 9); /* 9 bits */
unsigned base_offset = nir_intrinsic_base(intr);
const unsigned base_offset_limit = (1 << 9); /* 9 bits */
unsigned base_offset = nir_intrinsic_base(intr);
/* Or cases where base offset is lower than the hw limit: */
if (base_offset < base_offset_limit)
return NULL;
/* Or cases where base offset is lower than the hw limit: */
if (base_offset < base_offset_limit)
return NULL;
b->cursor = nir_before_instr(instr);
b->cursor = nir_before_instr(instr);
nir_ssa_def *offset = nir_ssa_for_src(b, intr->src[0], 1);
nir_ssa_def *offset = nir_ssa_for_src(b, intr->src[0], 1);
/* We'd like to avoid a sequence like:
*
* vec4 32 ssa_18 = intrinsic load_uniform (ssa_4) (1024, 0, 0)
* vec4 32 ssa_19 = intrinsic load_uniform (ssa_4) (1072, 0, 0)
* vec4 32 ssa_20 = intrinsic load_uniform (ssa_4) (1120, 0, 0)
*
* From turning into a unique offset value (which requires reloading
* a0.x for each instruction). So instead of just adding the constant
* base_offset to the non-const offset, be a bit more clever and only
* extract the part that cannot be encoded. Afterwards CSE should
* turn the result into:
*
* vec1 32 ssa_5 = load_const (1024)
* vec4 32 ssa_6 = iadd ssa4_, ssa_5
* vec4 32 ssa_18 = intrinsic load_uniform (ssa_5) (0, 0, 0)
* vec4 32 ssa_19 = intrinsic load_uniform (ssa_5) (48, 0, 0)
* vec4 32 ssa_20 = intrinsic load_uniform (ssa_5) (96, 0, 0)
*/
unsigned new_base_offset = base_offset % base_offset_limit;
/* We'd like to avoid a sequence like:
*
* vec4 32 ssa_18 = intrinsic load_uniform (ssa_4) (1024, 0, 0)
* vec4 32 ssa_19 = intrinsic load_uniform (ssa_4) (1072, 0, 0)
* vec4 32 ssa_20 = intrinsic load_uniform (ssa_4) (1120, 0, 0)
*
* From turning into a unique offset value (which requires reloading
* a0.x for each instruction). So instead of just adding the constant
* base_offset to the non-const offset, be a bit more clever and only
* extract the part that cannot be encoded. Afterwards CSE should
* turn the result into:
*
* vec1 32 ssa_5 = load_const (1024)
* vec4 32 ssa_6 = iadd ssa4_, ssa_5
* vec4 32 ssa_18 = intrinsic load_uniform (ssa_5) (0, 0, 0)
* vec4 32 ssa_19 = intrinsic load_uniform (ssa_5) (48, 0, 0)
* vec4 32 ssa_20 = intrinsic load_uniform (ssa_5) (96, 0, 0)
*/
unsigned new_base_offset = base_offset % base_offset_limit;
nir_intrinsic_set_base(intr, new_base_offset);
offset = nir_iadd_imm(b, offset, base_offset - new_base_offset);
nir_intrinsic_set_base(intr, new_base_offset);
offset = nir_iadd_imm(b, offset, base_offset - new_base_offset);
nir_instr_rewrite_src(instr, &intr->src[0], nir_src_for_ssa(offset));
nir_instr_rewrite_src(instr, &intr->src[0], nir_src_for_ssa(offset));
return NIR_LOWER_INSTR_PROGRESS;
return NIR_LOWER_INSTR_PROGRESS;
}
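Plugging the comment's numbers into the split above (base_offset_limit is 1 << 9 = 512, as in the code): bases 1024, 1072 and 1120 all fold the same 1024 into the indirect offset, leaving encodable bases 0, 48 and 96, so CSE can share a single iadd.

#include <assert.h>

static void
example_base_split(void)
{
   const unsigned limit = 1u << 9;   /* base_offset_limit */
   const unsigned bases[] = {1024, 1072, 1120};

   for (unsigned i = 0; i < 3; i++) {
      unsigned new_base = bases[i] % limit;    /* 0, 48, 96 */
      unsigned folded = bases[i] - new_base;   /* always 1024 */
      assert(folded == 1024 && new_base < limit);
   }
}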
/**
@ -520,59 +520,59 @@ fixup_load_uniform_instr(struct nir_builder *b, nir_instr *instr, void *arg)
bool
ir3_nir_fixup_load_uniform(nir_shader *nir)
{
return nir_shader_lower_instructions(nir,
fixup_load_uniform_filter, fixup_load_uniform_instr,
NULL);
return nir_shader_lower_instructions(nir, fixup_load_uniform_filter,
fixup_load_uniform_instr, NULL);
}
static nir_ssa_def *
ir3_nir_lower_load_const_instr(nir_builder *b, nir_instr *in_instr, void *data)
{
struct ir3_const_state *const_state = data;
nir_intrinsic_instr *instr = nir_instr_as_intrinsic(in_instr);
struct ir3_const_state *const_state = data;
nir_intrinsic_instr *instr = nir_instr_as_intrinsic(in_instr);
/* Pick a UBO index to use as our constant data. Skip UBO 0 since that's
* reserved for gallium's cb0.
*/
if (const_state->constant_data_ubo == -1) {
if (b->shader->info.num_ubos == 0)
b->shader->info.num_ubos++;
const_state->constant_data_ubo = b->shader->info.num_ubos++;
}
/* Pick a UBO index to use as our constant data. Skip UBO 0 since that's
* reserved for gallium's cb0.
*/
if (const_state->constant_data_ubo == -1) {
if (b->shader->info.num_ubos == 0)
b->shader->info.num_ubos++;
const_state->constant_data_ubo = b->shader->info.num_ubos++;
}
unsigned num_components = instr->num_components;
if (nir_dest_bit_size(instr->dest) == 16) {
/* We can't do 16b loads -- either from LDC (32-bit only in any of our
* traces, and disasm that doesn't look like it really supports it) or
* from the constant file (where CONSTANT_DEMOTION_ENABLE means we get
* automatic 32b-to-16b conversions when we ask for 16b from it).
* Instead, we'll load 32b from a UBO and unpack from there.
*/
num_components = DIV_ROUND_UP(num_components, 2);
}
unsigned base = nir_intrinsic_base(instr);
nir_ssa_def *index = nir_imm_int(b, const_state->constant_data_ubo);
nir_ssa_def *offset = nir_iadd_imm(b, nir_ssa_for_src(b, instr->src[0], 1), base);
unsigned num_components = instr->num_components;
if (nir_dest_bit_size(instr->dest) == 16) {
/* We can't do 16b loads -- either from LDC (32-bit only in any of our
* traces, and disasm that doesn't look like it really supports it) or
* from the constant file (where CONSTANT_DEMOTION_ENABLE means we get
* automatic 32b-to-16b conversions when we ask for 16b from it).
* Instead, we'll load 32b from a UBO and unpack from there.
*/
num_components = DIV_ROUND_UP(num_components, 2);
}
unsigned base = nir_intrinsic_base(instr);
nir_ssa_def *index = nir_imm_int(b, const_state->constant_data_ubo);
nir_ssa_def *offset =
nir_iadd_imm(b, nir_ssa_for_src(b, instr->src[0], 1), base);
nir_ssa_def *result =
nir_load_ubo(b, num_components, 32, index, offset,
.align_mul = nir_intrinsic_align_mul(instr),
.align_offset = nir_intrinsic_align_offset(instr),
.range_base = base,
.range = nir_intrinsic_range(instr));
nir_ssa_def *result =
nir_load_ubo(b, num_components, 32, index, offset,
.align_mul = nir_intrinsic_align_mul(instr),
.align_offset = nir_intrinsic_align_offset(instr),
.range_base = base, .range = nir_intrinsic_range(instr));
if (nir_dest_bit_size(instr->dest) == 16) {
result = nir_bitcast_vector(b, result, 16);
result = nir_channels(b, result, BITSET_MASK(instr->num_components));
}
if (nir_dest_bit_size(instr->dest) == 16) {
result = nir_bitcast_vector(b, result, 16);
result = nir_channels(b, result, BITSET_MASK(instr->num_components));
}
return result;
return result;
}
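A quick check of the 16-bit path above: a 3-component 16-bit destination is fetched as DIV_ROUND_UP(3, 2) = 2 components of 32 bits, then bitcast to 16 bits (giving 4 channels) and masked down to the original 3.

#include <assert.h>

static void
example_16bit_packing(void)
{
   const unsigned requested = 3;               /* 16-bit components */
   unsigned fetched32 = (requested + 1) / 2;   /* DIV_ROUND_UP(3, 2) = 2 */
   unsigned unpacked16 = fetched32 * 2;        /* after bitcast: 4 */

   assert(fetched32 == 2 && unpacked16 >= requested);
}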
static bool
ir3_lower_load_const_filter(const nir_instr *instr, const void *data)
{
return (instr->type == nir_instr_type_intrinsic &&
nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_constant);
return (instr->type == nir_instr_type_intrinsic &&
nir_instr_as_intrinsic(instr)->intrinsic ==
nir_intrinsic_load_constant);
}
/* Lowers load_constant intrinsics to UBO accesses so we can run them through
@ -581,26 +581,26 @@ ir3_lower_load_const_filter(const nir_instr *instr, const void *data)
bool
ir3_nir_lower_load_constant(nir_shader *nir, struct ir3_shader_variant *v)
{
struct ir3_const_state *const_state = ir3_const_state(v);
struct ir3_const_state *const_state = ir3_const_state(v);
const_state->constant_data_ubo = -1;
const_state->constant_data_ubo = -1;
bool progress = nir_shader_lower_instructions(nir,
ir3_lower_load_const_filter, ir3_nir_lower_load_const_instr,
const_state);
bool progress = nir_shader_lower_instructions(
nir, ir3_lower_load_const_filter, ir3_nir_lower_load_const_instr,
const_state);
if (progress) {
struct ir3_compiler *compiler = v->shader->compiler;
if (progress) {
struct ir3_compiler *compiler = v->shader->compiler;
/* Save a copy of the NIR constant data to the variant for
* inclusion in the final assembly.
*/
v->constant_data_size = align(nir->constant_data_size,
compiler->const_upload_unit * 4 * sizeof(uint32_t));
v->constant_data = rzalloc_size(v, v->constant_data_size);
memcpy(v->constant_data, nir->constant_data,
nir->constant_data_size);
}
/* Save a copy of the NIR constant data to the variant for
* inclusion in the final assembly.
*/
v->constant_data_size =
align(nir->constant_data_size,
compiler->const_upload_unit * 4 * sizeof(uint32_t));
v->constant_data = rzalloc_size(v, v->constant_data_size);
memcpy(v->constant_data, nir->constant_data, nir->constant_data_size);
}
return progress;
return progress;
}


@ -21,8 +21,8 @@
* IN THE SOFTWARE.
*/
#include "ir3_nir.h"
#include "compiler/nir/nir_builder.h"
#include "ir3_nir.h"
/**
* This pass moves to NIR certain offset computations for different I/O
@ -34,7 +34,6 @@
* holds the result of the original byte-offset source divided by 4.
*/
/* Returns the ir3-specific intrinsic opcode corresponding to an SSBO
* instruction that is handled by this pass. It also conveniently returns
* the offset source index in @offset_src_idx.
@ -44,269 +43,269 @@
*/
static int
get_ir3_intrinsic_for_ssbo_intrinsic(unsigned intrinsic,
uint8_t *offset_src_idx)
uint8_t *offset_src_idx)
{
debug_assert(offset_src_idx);
debug_assert(offset_src_idx);
*offset_src_idx = 1;
*offset_src_idx = 1;
switch (intrinsic) {
case nir_intrinsic_store_ssbo:
*offset_src_idx = 2;
return nir_intrinsic_store_ssbo_ir3;
case nir_intrinsic_load_ssbo:
return nir_intrinsic_load_ssbo_ir3;
case nir_intrinsic_ssbo_atomic_add:
return nir_intrinsic_ssbo_atomic_add_ir3;
case nir_intrinsic_ssbo_atomic_imin:
return nir_intrinsic_ssbo_atomic_imin_ir3;
case nir_intrinsic_ssbo_atomic_umin:
return nir_intrinsic_ssbo_atomic_umin_ir3;
case nir_intrinsic_ssbo_atomic_imax:
return nir_intrinsic_ssbo_atomic_imax_ir3;
case nir_intrinsic_ssbo_atomic_umax:
return nir_intrinsic_ssbo_atomic_umax_ir3;
case nir_intrinsic_ssbo_atomic_and:
return nir_intrinsic_ssbo_atomic_and_ir3;
case nir_intrinsic_ssbo_atomic_or:
return nir_intrinsic_ssbo_atomic_or_ir3;
case nir_intrinsic_ssbo_atomic_xor:
return nir_intrinsic_ssbo_atomic_xor_ir3;
case nir_intrinsic_ssbo_atomic_exchange:
return nir_intrinsic_ssbo_atomic_exchange_ir3;
case nir_intrinsic_ssbo_atomic_comp_swap:
return nir_intrinsic_ssbo_atomic_comp_swap_ir3;
default:
break;
}
switch (intrinsic) {
case nir_intrinsic_store_ssbo:
*offset_src_idx = 2;
return nir_intrinsic_store_ssbo_ir3;
case nir_intrinsic_load_ssbo:
return nir_intrinsic_load_ssbo_ir3;
case nir_intrinsic_ssbo_atomic_add:
return nir_intrinsic_ssbo_atomic_add_ir3;
case nir_intrinsic_ssbo_atomic_imin:
return nir_intrinsic_ssbo_atomic_imin_ir3;
case nir_intrinsic_ssbo_atomic_umin:
return nir_intrinsic_ssbo_atomic_umin_ir3;
case nir_intrinsic_ssbo_atomic_imax:
return nir_intrinsic_ssbo_atomic_imax_ir3;
case nir_intrinsic_ssbo_atomic_umax:
return nir_intrinsic_ssbo_atomic_umax_ir3;
case nir_intrinsic_ssbo_atomic_and:
return nir_intrinsic_ssbo_atomic_and_ir3;
case nir_intrinsic_ssbo_atomic_or:
return nir_intrinsic_ssbo_atomic_or_ir3;
case nir_intrinsic_ssbo_atomic_xor:
return nir_intrinsic_ssbo_atomic_xor_ir3;
case nir_intrinsic_ssbo_atomic_exchange:
return nir_intrinsic_ssbo_atomic_exchange_ir3;
case nir_intrinsic_ssbo_atomic_comp_swap:
return nir_intrinsic_ssbo_atomic_comp_swap_ir3;
default:
break;
}
return -1;
return -1;
}
static nir_ssa_def *
check_and_propagate_bit_shift32(nir_builder *b, nir_alu_instr *alu_instr,
int32_t direction, int32_t shift)
int32_t direction, int32_t shift)
{
debug_assert(alu_instr->src[1].src.is_ssa);
nir_ssa_def *shift_ssa = alu_instr->src[1].src.ssa;
debug_assert(alu_instr->src[1].src.is_ssa);
nir_ssa_def *shift_ssa = alu_instr->src[1].src.ssa;
/* Only propagate if the shift is a const value so we can check value range
* statically.
*/
nir_const_value *const_val = nir_src_as_const_value(alu_instr->src[1].src);
if (!const_val)
return NULL;
/* Only propagate if the shift is a const value so we can check value range
* statically.
*/
nir_const_value *const_val = nir_src_as_const_value(alu_instr->src[1].src);
if (!const_val)
return NULL;
int32_t current_shift = const_val[0].i32 * direction;
int32_t new_shift = current_shift + shift;
int32_t current_shift = const_val[0].i32 * direction;
int32_t new_shift = current_shift + shift;
/* If the merge would reverse the direction, bail out.
* e.g, 'x << 2' then 'x >> 4' is not 'x >> 2'.
*/
if (current_shift * new_shift < 0)
return NULL;
/* If the merge would reverse the direction, bail out.
* e.g, 'x << 2' then 'x >> 4' is not 'x >> 2'.
*/
if (current_shift * new_shift < 0)
return NULL;
/* If the propagation would overflow an int32_t, bail out too to be on the
* safe side.
*/
if (new_shift < -31 || new_shift > 31)
return NULL;
/* If the propagation would overflow an int32_t, bail out too to be on the
* safe side.
*/
if (new_shift < -31 || new_shift > 31)
return NULL;
/* Add or subtract shift depending on the final direction (SHR vs. SHL). */
if (shift * direction < 0)
shift_ssa = nir_isub(b, shift_ssa, nir_imm_int(b, abs(shift)));
else
shift_ssa = nir_iadd(b, shift_ssa, nir_imm_int(b, abs(shift)));
/* Add or subtract shift depending on the final direction (SHR vs. SHL). */
if (shift * direction < 0)
shift_ssa = nir_isub(b, shift_ssa, nir_imm_int(b, abs(shift)));
else
shift_ssa = nir_iadd(b, shift_ssa, nir_imm_int(b, abs(shift)));
return shift_ssa;
return shift_ssa;
}
nir_ssa_def *
ir3_nir_try_propagate_bit_shift(nir_builder *b, nir_ssa_def *offset, int32_t shift)
ir3_nir_try_propagate_bit_shift(nir_builder *b, nir_ssa_def *offset,
int32_t shift)
{
nir_instr *offset_instr = offset->parent_instr;
if (offset_instr->type != nir_instr_type_alu)
return NULL;
nir_instr *offset_instr = offset->parent_instr;
if (offset_instr->type != nir_instr_type_alu)
return NULL;
nir_alu_instr *alu = nir_instr_as_alu(offset_instr);
nir_ssa_def *shift_ssa;
nir_ssa_def *new_offset = NULL;
nir_alu_instr *alu = nir_instr_as_alu(offset_instr);
nir_ssa_def *shift_ssa;
nir_ssa_def *new_offset = NULL;
/* the first src could be something like ssa_18.x, but we only want
* the single component. Otherwise the ishl/ishr/ushr could turn
* into a vec4 operation:
*/
nir_ssa_def *src0 = nir_mov_alu(b, alu->src[0], 1);
/* the first src could be something like ssa_18.x, but we only want
* the single component. Otherwise the ishl/ishr/ushr could turn
* into a vec4 operation:
*/
nir_ssa_def *src0 = nir_mov_alu(b, alu->src[0], 1);
switch (alu->op) {
case nir_op_ishl:
shift_ssa = check_and_propagate_bit_shift32(b, alu, 1, shift);
if (shift_ssa)
new_offset = nir_ishl(b, src0, shift_ssa);
break;
case nir_op_ishr:
shift_ssa = check_and_propagate_bit_shift32(b, alu, -1, shift);
if (shift_ssa)
new_offset = nir_ishr(b, src0, shift_ssa);
break;
case nir_op_ushr:
shift_ssa = check_and_propagate_bit_shift32(b, alu, -1, shift);
if (shift_ssa)
new_offset = nir_ushr(b, src0, shift_ssa);
break;
default:
return NULL;
}
switch (alu->op) {
case nir_op_ishl:
shift_ssa = check_and_propagate_bit_shift32(b, alu, 1, shift);
if (shift_ssa)
new_offset = nir_ishl(b, src0, shift_ssa);
break;
case nir_op_ishr:
shift_ssa = check_and_propagate_bit_shift32(b, alu, -1, shift);
if (shift_ssa)
new_offset = nir_ishr(b, src0, shift_ssa);
break;
case nir_op_ushr:
shift_ssa = check_and_propagate_bit_shift32(b, alu, -1, shift);
if (shift_ssa)
new_offset = nir_ushr(b, src0, shift_ssa);
break;
default:
return NULL;
}
return new_offset;
return new_offset;
}
static bool
lower_offset_for_ssbo(nir_intrinsic_instr *intrinsic, nir_builder *b,
unsigned ir3_ssbo_opcode, uint8_t offset_src_idx)
unsigned ir3_ssbo_opcode, uint8_t offset_src_idx)
{
unsigned num_srcs = nir_intrinsic_infos[intrinsic->intrinsic].num_srcs;
int shift = 2;
unsigned num_srcs = nir_intrinsic_infos[intrinsic->intrinsic].num_srcs;
int shift = 2;
bool has_dest = nir_intrinsic_infos[intrinsic->intrinsic].has_dest;
nir_ssa_def *new_dest = NULL;
bool has_dest = nir_intrinsic_infos[intrinsic->intrinsic].has_dest;
nir_ssa_def *new_dest = NULL;
/* for 16-bit ssbo access, offset is in 16-bit words instead of dwords */
if ((has_dest && intrinsic->dest.ssa.bit_size == 16) ||
(!has_dest && intrinsic->src[0].ssa->bit_size == 16))
shift = 1;
/* for 16-bit ssbo access, offset is in 16-bit words instead of dwords */
if ((has_dest && intrinsic->dest.ssa.bit_size == 16) ||
(!has_dest && intrinsic->src[0].ssa->bit_size == 16))
shift = 1;
/* Here we create a new intrinsic and copy over all contents from the old one. */
/* Here we create a new intrinsic and copy over all contents from the old
* one. */
nir_intrinsic_instr *new_intrinsic;
nir_src *target_src;
nir_intrinsic_instr *new_intrinsic;
nir_src *target_src;
b->cursor = nir_before_instr(&intrinsic->instr);
b->cursor = nir_before_instr(&intrinsic->instr);
/* 'offset_src_idx' holds the index of the source that represents the offset. */
new_intrinsic =
nir_intrinsic_instr_create(b->shader, ir3_ssbo_opcode);
/* 'offset_src_idx' holds the index of the source that represents the offset. */
new_intrinsic = nir_intrinsic_instr_create(b->shader, ir3_ssbo_opcode);
debug_assert(intrinsic->src[offset_src_idx].is_ssa);
nir_ssa_def *offset = intrinsic->src[offset_src_idx].ssa;
debug_assert(intrinsic->src[offset_src_idx].is_ssa);
nir_ssa_def *offset = intrinsic->src[offset_src_idx].ssa;
/* Since we don't have value range checking, we first try to propagate
* the division by 4 ('offset >> 2') into another bit-shift instruction that
* possibly defines the offset. If that's the case, we emit a similar
* instruction adjusting (merging) the shift value.
*
* Here we use the convention that shifting right is negative while shifting
* left is positive. So 'x / 4' ~ 'x >> 2' or 'x << -2'.
*/
nir_ssa_def *new_offset = ir3_nir_try_propagate_bit_shift(b, offset, -shift);
/* Since we don't have value range checking, we first try to propagate
* the division by 4 ('offset >> 2') into another bit-shift instruction that
* possibly defines the offset. If that's the case, we emit a similar
* instruction adjusting (merging) the shift value.
*
* Here we use the convention that shifting right is negative while shifting
* left is positive. So 'x / 4' ~ 'x >> 2' or 'x << -2'.
*/
nir_ssa_def *new_offset = ir3_nir_try_propagate_bit_shift(b, offset, -shift);
/* The new source that will hold the dword-offset is always the last
* one for every intrinsic.
*/
target_src = &new_intrinsic->src[num_srcs];
*target_src = nir_src_for_ssa(offset);
/* The new source that will hold the dword-offset is always the last
* one for every intrinsic.
*/
target_src = &new_intrinsic->src[num_srcs];
*target_src = nir_src_for_ssa(offset);
if (has_dest) {
debug_assert(intrinsic->dest.is_ssa);
nir_ssa_def *dest = &intrinsic->dest.ssa;
nir_ssa_dest_init(&new_intrinsic->instr, &new_intrinsic->dest,
dest->num_components, dest->bit_size, NULL);
new_dest = &new_intrinsic->dest.ssa;
}
if (has_dest) {
debug_assert(intrinsic->dest.is_ssa);
nir_ssa_def *dest = &intrinsic->dest.ssa;
nir_ssa_dest_init(&new_intrinsic->instr, &new_intrinsic->dest,
dest->num_components, dest->bit_size, NULL);
new_dest = &new_intrinsic->dest.ssa;
}
for (unsigned i = 0; i < num_srcs; i++)
new_intrinsic->src[i] = nir_src_for_ssa(intrinsic->src[i].ssa);
for (unsigned i = 0; i < num_srcs; i++)
new_intrinsic->src[i] = nir_src_for_ssa(intrinsic->src[i].ssa);
nir_intrinsic_copy_const_indices(new_intrinsic, intrinsic);
nir_intrinsic_copy_const_indices(new_intrinsic, intrinsic);
new_intrinsic->num_components = intrinsic->num_components;
new_intrinsic->num_components = intrinsic->num_components;
/* If we managed to propagate the division by 4, just use the new offset
* register and don't emit the SHR.
*/
if (new_offset)
offset = new_offset;
else
offset = nir_ushr(b, offset, nir_imm_int(b, shift));
/* If we managed to propagate the division by 4, just use the new offset
* register and don't emit the SHR.
*/
if (new_offset)
offset = new_offset;
else
offset = nir_ushr(b, offset, nir_imm_int(b, shift));
/* Insert the new intrinsic right before the old one. */
nir_builder_instr_insert(b, &new_intrinsic->instr);
/* Insert the new intrinsic right before the old one. */
nir_builder_instr_insert(b, &new_intrinsic->instr);
/* Replace the last source of the new intrinsic by the result of
* the offset divided by 4.
*/
nir_instr_rewrite_src(&new_intrinsic->instr,
target_src,
nir_src_for_ssa(offset));
/* Replace the last source of the new intrinsic by the result of
* the offset divided by 4.
*/
nir_instr_rewrite_src(&new_intrinsic->instr, target_src,
nir_src_for_ssa(offset));
if (has_dest) {
/* Replace the uses of the original destination by that
* of the new intrinsic.
*/
nir_ssa_def_rewrite_uses(&intrinsic->dest.ssa,
new_dest);
}
if (has_dest) {
/* Replace the uses of the original destination by that
* of the new intrinsic.
*/
nir_ssa_def_rewrite_uses(&intrinsic->dest.ssa, new_dest);
}
/* Finally remove the original intrinsic. */
nir_instr_remove(&intrinsic->instr);
/* Finally remove the original intrinsic. */
nir_instr_remove(&intrinsic->instr);
return true;
return true;
}
static bool
lower_io_offsets_block(nir_block *block, nir_builder *b, void *mem_ctx, int gpu_id)
lower_io_offsets_block(nir_block *block, nir_builder *b, void *mem_ctx,
int gpu_id)
{
bool progress = false;
bool progress = false;
nir_foreach_instr_safe (instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_foreach_instr_safe (instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
/* SSBO */
int ir3_intrinsic;
uint8_t offset_src_idx;
ir3_intrinsic = get_ir3_intrinsic_for_ssbo_intrinsic(intr->intrinsic,
&offset_src_idx);
if (ir3_intrinsic != -1) {
progress |= lower_offset_for_ssbo(intr, b, (unsigned) ir3_intrinsic,
offset_src_idx);
}
}
/* SSBO */
int ir3_intrinsic;
uint8_t offset_src_idx;
ir3_intrinsic =
get_ir3_intrinsic_for_ssbo_intrinsic(intr->intrinsic, &offset_src_idx);
if (ir3_intrinsic != -1) {
progress |= lower_offset_for_ssbo(intr, b, (unsigned)ir3_intrinsic,
offset_src_idx);
}
}
return progress;
return progress;
}
static bool
lower_io_offsets_func(nir_function_impl *impl, int gpu_id)
{
void *mem_ctx = ralloc_parent(impl);
nir_builder b;
nir_builder_init(&b, impl);
void *mem_ctx = ralloc_parent(impl);
nir_builder b;
nir_builder_init(&b, impl);
bool progress = false;
nir_foreach_block_safe (block, impl) {
progress |= lower_io_offsets_block(block, &b, mem_ctx, gpu_id);
}
bool progress = false;
nir_foreach_block_safe (block, impl) {
progress |= lower_io_offsets_block(block, &b, mem_ctx, gpu_id);
}
if (progress) {
nir_metadata_preserve(impl, nir_metadata_block_index |
nir_metadata_dominance);
}
if (progress) {
nir_metadata_preserve(impl,
nir_metadata_block_index | nir_metadata_dominance);
}
return progress;
return progress;
}
bool
ir3_nir_lower_io_offsets(nir_shader *shader, int gpu_id)
{
bool progress = false;
bool progress = false;
nir_foreach_function (function, shader) {
if (function->impl)
progress |= lower_io_offsets_func(function->impl, gpu_id);
}
nir_foreach_function (function, shader) {
if (function->impl)
progress |= lower_io_offsets_func(function->impl, gpu_id);
}
return progress;
return progress;
}
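
The shift-merging rule used by ir3_nir_try_propagate_bit_shift can be illustrated outside of NIR with plain integers. This is a minimal standalone sketch of the idea only, with made-up helper names, assuming left shifts are encoded as positive amounts and right shifts as negative ones (the 'x / 4' ~ 'x >> 2' ~ 'x << -2' convention from the comments above):

#include <assert.h>
#include <stdint.h>

/* Encode "x << n" as +n and "x >> n" as -n.  Two shifts can be merged
 * only if the merged amount keeps the same direction and stays in range.
 */
static int
merge_shift(int32_t current, int32_t extra, int32_t *merged)
{
   int32_t new_shift = current + extra;
   if (current * new_shift < 0)      /* direction would flip, e.g. <<2 then >>4 */
      return 0;
   if (new_shift < -31 || new_shift > 31)
      return 0;
   *merged = new_shift;
   return 1;
}

int
main(void)
{
   int32_t m;
   /* (x << 4) followed by "divide by 4" (i.e. -2) merges into x << 2 */
   assert(merge_shift(4, -2, &m) && m == 2);
   /* (x << 2) followed by >> 4 would reverse the direction: not merged */
   assert(!merge_shift(2, -4, &m));
   return 0;
}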


@@ -21,8 +21,8 @@
* IN THE SOFTWARE.
*/
#include "ir3_nir.h"
#include "compiler/nir/nir_builder.h"
#include "ir3_nir.h"
/**
* This pass lowers load_barycentric_at_offset to dsx.3d/dsy.3d and alu
@@ -32,75 +32,72 @@
static nir_ssa_def *
load(nir_builder *b, unsigned ncomp, nir_intrinsic_op op)
{
nir_intrinsic_instr *load_size = nir_intrinsic_instr_create(b->shader, op);
nir_ssa_dest_init(&load_size->instr, &load_size->dest, ncomp, 32, NULL);
nir_builder_instr_insert(b, &load_size->instr);
nir_intrinsic_instr *load_size = nir_intrinsic_instr_create(b->shader, op);
nir_ssa_dest_init(&load_size->instr, &load_size->dest, ncomp, 32, NULL);
nir_builder_instr_insert(b, &load_size->instr);
return &load_size->dest.ssa;
return &load_size->dest.ssa;
}
static nir_ssa_def *
ir3_nir_lower_load_barycentric_at_offset_instr(nir_builder *b,
nir_instr *instr, void *data)
ir3_nir_lower_load_barycentric_at_offset_instr(nir_builder *b, nir_instr *instr,
void *data)
{
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
#define chan(var, c) nir_channel(b, var, c)
nir_ssa_def *off = intr->src[0].ssa;
nir_ssa_def *ij = load(b, 2, nir_intrinsic_load_barycentric_pixel);
nir_ssa_def *s = load(b, 1, nir_intrinsic_load_size_ir3);
nir_ssa_def *off = intr->src[0].ssa;
nir_ssa_def *ij = load(b, 2, nir_intrinsic_load_barycentric_pixel);
nir_ssa_def *s = load(b, 1, nir_intrinsic_load_size_ir3);
s = nir_frcp(b, s);
s = nir_frcp(b, s);
/* scaled ij with s as 3rd component: */
nir_ssa_def *sij = nir_vec3(b,
nir_fmul(b, chan(ij, 0), s),
nir_fmul(b, chan(ij, 1), s),
s);
/* scaled ij with s as 3rd component: */
nir_ssa_def *sij =
nir_vec3(b, nir_fmul(b, chan(ij, 0), s), nir_fmul(b, chan(ij, 1), s), s);
nir_ssa_def *foo = nir_fddx(b, sij);
nir_ssa_def *bar = nir_fddy(b, sij);
nir_ssa_def *foo = nir_fddx(b, sij);
nir_ssa_def *bar = nir_fddy(b, sij);
if (b->shader->info.stage == MESA_SHADER_FRAGMENT)
b->shader->info.fs.needs_quad_helper_invocations = true;
if (b->shader->info.stage == MESA_SHADER_FRAGMENT)
b->shader->info.fs.needs_quad_helper_invocations = true;
nir_ssa_def *x, *y, *z, *i, *j;
nir_ssa_def *x, *y, *z, *i, *j;
x = nir_ffma(b, chan(off, 0), chan(foo, 0), chan(sij, 0));
y = nir_ffma(b, chan(off, 0), chan(foo, 1), chan(sij, 1));
z = nir_ffma(b, chan(off, 0), chan(foo, 2), chan(sij, 2));
x = nir_ffma(b, chan(off, 0), chan(foo, 0), chan(sij, 0));
y = nir_ffma(b, chan(off, 0), chan(foo, 1), chan(sij, 1));
z = nir_ffma(b, chan(off, 0), chan(foo, 2), chan(sij, 2));
x = nir_ffma(b, chan(off, 1), chan(bar, 0), x);
y = nir_ffma(b, chan(off, 1), chan(bar, 1), y);
z = nir_ffma(b, chan(off, 1), chan(bar, 2), z);
x = nir_ffma(b, chan(off, 1), chan(bar, 0), x);
y = nir_ffma(b, chan(off, 1), chan(bar, 1), y);
z = nir_ffma(b, chan(off, 1), chan(bar, 2), z);
/* convert back into primitive space: */
z = nir_frcp(b, z);
i = nir_fmul(b, z, x);
j = nir_fmul(b, z, y);
/* convert back into primitive space: */
z = nir_frcp(b, z);
i = nir_fmul(b, z, x);
j = nir_fmul(b, z, y);
ij = nir_vec2(b, i, j);
ij = nir_vec2(b, i, j);
return ij;
return ij;
}
static bool
ir3_nir_lower_load_barycentric_at_offset_filter(const nir_instr *instr,
const void *data)
const void *data)
{
return (instr->type == nir_instr_type_intrinsic &&
nir_instr_as_intrinsic(instr)->intrinsic ==
nir_intrinsic_load_barycentric_at_offset);
return (instr->type == nir_instr_type_intrinsic &&
nir_instr_as_intrinsic(instr)->intrinsic ==
nir_intrinsic_load_barycentric_at_offset);
}
bool
ir3_nir_lower_load_barycentric_at_offset(nir_shader *shader)
{
debug_assert(shader->info.stage == MESA_SHADER_FRAGMENT);
debug_assert(shader->info.stage == MESA_SHADER_FRAGMENT);
return nir_shader_lower_instructions(shader,
ir3_nir_lower_load_barycentric_at_offset_filter,
ir3_nir_lower_load_barycentric_at_offset_instr,
NULL);
return nir_shader_lower_instructions(
shader, ir3_nir_lower_load_barycentric_at_offset_filter,
ir3_nir_lower_load_barycentric_at_offset_instr, NULL);
}
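
For reference, the arithmetic this lowering builds (scale the barycentrics, offset them along their screen-space derivatives, divide the scale factor back out) can be sketched on the CPU with plain floats. The helper below is a rough illustration only; the derivative values are simply passed in as parameters, since outside a fragment shader there is nothing to take dsx/dsy of:

#include <stdio.h>

struct vec3 { float x, y, z; };

/* Adjust scaled barycentrics by a pixel offset using their screen-space
 * derivatives, then divide the scale component back out, mirroring the
 * ffma/rcp chain emitted by the pass.
 */
static void
interp_at_offset(struct vec3 sij, struct vec3 ddx, struct vec3 ddy,
                 float off_x, float off_y, float *i, float *j)
{
   float x = sij.x + off_x * ddx.x + off_y * ddy.x;
   float y = sij.y + off_x * ddx.y + off_y * ddy.y;
   float z = sij.z + off_x * ddx.z + off_y * ddy.z;
   *i = x / z;   /* convert back into primitive space */
   *j = y / z;
}

int
main(void)
{
   struct vec3 sij = {0.25f, 0.25f, 1.0f};
   struct vec3 ddx = {0.01f, 0.0f,  0.0f};
   struct vec3 ddy = {0.0f,  0.01f, 0.0f};
   float i, j;
   interp_at_offset(sij, ddx, ddy, 0.5f, 0.5f, &i, &j);
   printf("i=%f j=%f\n", i, j);
   return 0;
}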


@@ -21,8 +21,8 @@
* IN THE SOFTWARE.
*/
#include "ir3_nir.h"
#include "compiler/nir/nir_builder.h"
#include "ir3_nir.h"
/**
* This pass lowers load_barycentric_at_sample to load_sample_pos_from_id
@@ -35,61 +35,60 @@
static nir_ssa_def *
load_sample_pos(nir_builder *b, nir_ssa_def *samp_id)
{
return nir_load_sample_pos_from_id(b, 32, samp_id);
return nir_load_sample_pos_from_id(b, 32, samp_id);
}
static nir_ssa_def *
lower_load_barycentric_at_sample(nir_builder *b, nir_intrinsic_instr *intr)
{
nir_ssa_def *pos = load_sample_pos(b, intr->src[0].ssa);
nir_ssa_def *pos = load_sample_pos(b, intr->src[0].ssa);
return nir_load_barycentric_at_offset(b, 32, pos);
return nir_load_barycentric_at_offset(b, 32, pos);
}
static nir_ssa_def *
lower_load_sample_pos(nir_builder *b, nir_intrinsic_instr *intr)
{
nir_ssa_def *pos = load_sample_pos(b, nir_load_sample_id(b));
nir_ssa_def *pos = load_sample_pos(b, nir_load_sample_id(b));
/* Note that gl_SamplePosition is offset by +vec2(0.5, 0.5) vs the
* offset passed to interpolateAtOffset(). See
* dEQP-GLES31.functional.shaders.multisample_interpolation.interpolate_at_offset.at_sample_position.default_framebuffer
* for example.
*/
nir_ssa_def *half = nir_imm_float(b, 0.5);
return nir_fadd(b, pos, nir_vec2(b, half, half));
/* Note that gl_SamplePosition is offset by +vec2(0.5, 0.5) vs the
* offset passed to interpolateAtOffset(). See
* dEQP-GLES31.functional.shaders.multisample_interpolation.interpolate_at_offset.at_sample_position.default_framebuffer
* for example.
*/
nir_ssa_def *half = nir_imm_float(b, 0.5);
return nir_fadd(b, pos, nir_vec2(b, half, half));
}
static nir_ssa_def *
ir3_nir_lower_load_barycentric_at_sample_instr(nir_builder *b,
nir_instr *instr, void *data)
ir3_nir_lower_load_barycentric_at_sample_instr(nir_builder *b, nir_instr *instr,
void *data)
{
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
if (intr->intrinsic == nir_intrinsic_load_sample_pos)
return lower_load_sample_pos(b, intr);
else
return lower_load_barycentric_at_sample(b, intr);
if (intr->intrinsic == nir_intrinsic_load_sample_pos)
return lower_load_sample_pos(b, intr);
else
return lower_load_barycentric_at_sample(b, intr);
}
static bool
ir3_nir_lower_load_barycentric_at_sample_filter(const nir_instr *instr,
const void *data)
const void *data)
{
if (instr->type != nir_instr_type_intrinsic)
return false;
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
return (intr->intrinsic == nir_intrinsic_load_barycentric_at_sample ||
intr->intrinsic == nir_intrinsic_load_sample_pos);
if (instr->type != nir_instr_type_intrinsic)
return false;
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
return (intr->intrinsic == nir_intrinsic_load_barycentric_at_sample ||
intr->intrinsic == nir_intrinsic_load_sample_pos);
}
bool
ir3_nir_lower_load_barycentric_at_sample(nir_shader *shader)
{
debug_assert(shader->info.stage == MESA_SHADER_FRAGMENT);
debug_assert(shader->info.stage == MESA_SHADER_FRAGMENT);
return nir_shader_lower_instructions(shader,
ir3_nir_lower_load_barycentric_at_sample_filter,
ir3_nir_lower_load_barycentric_at_sample_instr,
NULL);
return nir_shader_lower_instructions(
shader, ir3_nir_lower_load_barycentric_at_sample_filter,
ir3_nir_lower_load_barycentric_at_sample_instr, NULL);
}
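
The +0.5 adjustment is the only non-obvious part of this lowering. A tiny sketch of the convention, assuming (as the pass implies) that load_sample_pos_from_id returns positions relative to the pixel center while gl_SamplePosition is defined on [0, 1]:

#include <stdio.h>

/* gl_SamplePosition lives in [0,1] within the pixel, while the offset fed
 * to interpolateAtOffset() is relative to the pixel center, so the two
 * differ by +0.5 in each component.
 */
int
main(void)
{
   float sample_pos_from_id[2] = {-0.125f, 0.375f}; /* center-relative */
   float gl_sample_position[2];
   for (int c = 0; c < 2; c++)
      gl_sample_position[c] = sample_pos_from_id[c] + 0.5f;
   printf("gl_SamplePosition = (%f, %f)\n",
          gl_sample_position[0], gl_sample_position[1]);
   return 0;
}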

File diff suppressed because it is too large.


@@ -31,97 +31,97 @@
static int
coord_offset(nir_ssa_def *ssa)
{
nir_instr *parent_instr = ssa->parent_instr;
nir_instr *parent_instr = ssa->parent_instr;
/* The coordinate of a texture sampling instruction eligible for
* pre-fetch is either going to be a load_interpolated_input/
* load_input, or a vec2 assembling non-swizzled components of
* a load_interpolated_input/load_input (due to varying packing)
*/
/* The coordinate of a texture sampling instruction eligible for
* pre-fetch is either going to be a load_interpolated_input/
* load_input, or a vec2 assembling non-swizzled components of
* a load_interpolated_input/load_input (due to varying packing)
*/
if (parent_instr->type == nir_instr_type_alu) {
nir_alu_instr *alu = nir_instr_as_alu(parent_instr);
if (parent_instr->type == nir_instr_type_alu) {
nir_alu_instr *alu = nir_instr_as_alu(parent_instr);
if (alu->op != nir_op_vec2)
return -1;
if (alu->op != nir_op_vec2)
return -1;
if (!alu->src[0].src.is_ssa)
return -1;
if (!alu->src[0].src.is_ssa)
return -1;
int base_offset = coord_offset(alu->src[0].src.ssa) +
alu->src[0].swizzle[0];
int base_offset =
coord_offset(alu->src[0].src.ssa) + alu->src[0].swizzle[0];
/* NOTE it might be possible to support more than 2D? */
for (int i = 1; i < 2; i++) {
if (!alu->src[i].src.is_ssa)
return -1;
/* NOTE it might be possible to support more than 2D? */
for (int i = 1; i < 2; i++) {
if (!alu->src[i].src.is_ssa)
return -1;
int nth_offset = coord_offset(alu->src[i].src.ssa) +
alu->src[i].swizzle[0];
int nth_offset =
coord_offset(alu->src[i].src.ssa) + alu->src[i].swizzle[0];
if (nth_offset != (base_offset + i))
return -1;
}
if (nth_offset != (base_offset + i))
return -1;
}
return base_offset;
}
return base_offset;
}
if (parent_instr->type != nir_instr_type_intrinsic)
return -1;
if (parent_instr->type != nir_instr_type_intrinsic)
return -1;
nir_intrinsic_instr *input = nir_instr_as_intrinsic(parent_instr);
nir_intrinsic_instr *input = nir_instr_as_intrinsic(parent_instr);
if (input->intrinsic != nir_intrinsic_load_interpolated_input)
return -1;
if (input->intrinsic != nir_intrinsic_load_interpolated_input)
return -1;
/* limit to load_barycentric_pixel, other interpolation modes don't seem
* to be supported:
*/
if (!input->src[0].is_ssa)
return -1;
/* limit to load_barycentric_pixel, other interpolation modes don't seem
* to be supported:
*/
if (!input->src[0].is_ssa)
return -1;
nir_intrinsic_instr *interp =
nir_instr_as_intrinsic(input->src[0].ssa->parent_instr);
nir_intrinsic_instr *interp =
nir_instr_as_intrinsic(input->src[0].ssa->parent_instr);
if (interp->intrinsic != nir_intrinsic_load_barycentric_pixel)
return -1;
if (interp->intrinsic != nir_intrinsic_load_barycentric_pixel)
return -1;
/* we also need a const input offset: */
if (!nir_src_is_const(input->src[1]))
return -1;
/* we also need a const input offset: */
if (!nir_src_is_const(input->src[1]))
return -1;
unsigned base = nir_src_as_uint(input->src[1]) + nir_intrinsic_base(input);
unsigned comp = nir_intrinsic_component(input);
unsigned base = nir_src_as_uint(input->src[1]) + nir_intrinsic_base(input);
unsigned comp = nir_intrinsic_component(input);
return (4 * base) + comp;
return (4 * base) + comp;
}
int
ir3_nir_coord_offset(nir_ssa_def *ssa)
{
assert (ssa->num_components == 2);
return coord_offset(ssa);
assert(ssa->num_components == 2);
return coord_offset(ssa);
}
static bool
has_src(nir_tex_instr *tex, nir_tex_src_type type)
{
return nir_tex_instr_src_index(tex, type) >= 0;
return nir_tex_instr_src_index(tex, type) >= 0;
}
static bool
ok_bindless_src(nir_tex_instr *tex, nir_tex_src_type type)
{
int idx = nir_tex_instr_src_index(tex, type);
assert(idx >= 0);
nir_intrinsic_instr *bindless = ir3_bindless_resource(tex->src[idx].src);
int idx = nir_tex_instr_src_index(tex, type);
assert(idx >= 0);
nir_intrinsic_instr *bindless = ir3_bindless_resource(tex->src[idx].src);
/* TODO from SP_FS_BINDLESS_PREFETCH[n] it looks like this limit should
* be 1<<8 ?
*/
return nir_src_is_const(bindless->src[0]) &&
(nir_src_as_uint(bindless->src[0]) < (1 << 16));
/* TODO from SP_FS_BINDLESS_PREFETCH[n] it looks like this limit should
* be 1<<8 ?
*/
return nir_src_is_const(bindless->src[0]) &&
(nir_src_as_uint(bindless->src[0]) < (1 << 16));
}
/**
@@ -134,107 +134,103 @@ ok_bindless_src(nir_tex_instr *tex, nir_tex_src_type type)
static bool
ok_tex_samp(nir_tex_instr *tex)
{
if (has_src(tex, nir_tex_src_texture_handle)) {
/* bindless case: */
if (has_src(tex, nir_tex_src_texture_handle)) {
/* bindless case: */
assert(has_src(tex, nir_tex_src_sampler_handle));
assert(has_src(tex, nir_tex_src_sampler_handle));
return ok_bindless_src(tex, nir_tex_src_texture_handle) &&
ok_bindless_src(tex, nir_tex_src_sampler_handle);
} else {
assert(!has_src(tex, nir_tex_src_texture_offset));
assert(!has_src(tex, nir_tex_src_sampler_offset));
return ok_bindless_src(tex, nir_tex_src_texture_handle) &&
ok_bindless_src(tex, nir_tex_src_sampler_handle);
} else {
assert(!has_src(tex, nir_tex_src_texture_offset));
assert(!has_src(tex, nir_tex_src_sampler_offset));
return (tex->texture_index <= 0x1f) &&
(tex->sampler_index <= 0xf);
}
return (tex->texture_index <= 0x1f) && (tex->sampler_index <= 0xf);
}
}
static bool
lower_tex_prefetch_block(nir_block *block)
{
bool progress = false;
bool progress = false;
nir_foreach_instr_safe (instr, block) {
if (instr->type != nir_instr_type_tex)
continue;
nir_foreach_instr_safe (instr, block) {
if (instr->type != nir_instr_type_tex)
continue;
nir_tex_instr *tex = nir_instr_as_tex(instr);
if (tex->op != nir_texop_tex)
continue;
nir_tex_instr *tex = nir_instr_as_tex(instr);
if (tex->op != nir_texop_tex)
continue;
if (has_src(tex, nir_tex_src_bias) ||
has_src(tex, nir_tex_src_lod) ||
has_src(tex, nir_tex_src_comparator) ||
has_src(tex, nir_tex_src_projector) ||
has_src(tex, nir_tex_src_offset) ||
has_src(tex, nir_tex_src_ddx) ||
has_src(tex, nir_tex_src_ddy) ||
has_src(tex, nir_tex_src_ms_index) ||
has_src(tex, nir_tex_src_texture_offset) ||
has_src(tex, nir_tex_src_sampler_offset))
continue;
if (has_src(tex, nir_tex_src_bias) || has_src(tex, nir_tex_src_lod) ||
has_src(tex, nir_tex_src_comparator) ||
has_src(tex, nir_tex_src_projector) ||
has_src(tex, nir_tex_src_offset) || has_src(tex, nir_tex_src_ddx) ||
has_src(tex, nir_tex_src_ddy) || has_src(tex, nir_tex_src_ms_index) ||
has_src(tex, nir_tex_src_texture_offset) ||
has_src(tex, nir_tex_src_sampler_offset))
continue;
/* only prefetch for simple 2d tex fetch case */
if (tex->sampler_dim != GLSL_SAMPLER_DIM_2D || tex->is_array)
continue;
/* only prefetch for simple 2d tex fetch case */
if (tex->sampler_dim != GLSL_SAMPLER_DIM_2D || tex->is_array)
continue;
if (!ok_tex_samp(tex))
continue;
if (!ok_tex_samp(tex))
continue;
int idx = nir_tex_instr_src_index(tex, nir_tex_src_coord);
/* First source should be the sampling coordinate. */
nir_tex_src *coord = &tex->src[idx];
debug_assert(coord->src.is_ssa);
int idx = nir_tex_instr_src_index(tex, nir_tex_src_coord);
/* First source should be the sampling coordinate. */
nir_tex_src *coord = &tex->src[idx];
debug_assert(coord->src.is_ssa);
if (ir3_nir_coord_offset(coord->src.ssa) >= 0) {
tex->op = nir_texop_tex_prefetch;
if (ir3_nir_coord_offset(coord->src.ssa) >= 0) {
tex->op = nir_texop_tex_prefetch;
progress |= true;
}
}
progress |= true;
}
}
return progress;
return progress;
}
static bool
lower_tex_prefetch_func(nir_function_impl *impl)
{
/* Only instructions in the outer-most block are considered
* eligible for pre-dispatch, because they need to be move-able
* to the beginning of the shader to avoid locking down the
* register holding the pre-fetched result for too long.
*/
nir_block *block = nir_start_block(impl);
if (!block)
return false;
/* Only instructions in the outer-most block are considered
* eligible for pre-dispatch, because they need to be move-able
* to the beginning of the shader to avoid locking down the
* register holding the pre-fetched result for too long.
*/
nir_block *block = nir_start_block(impl);
if (!block)
return false;
bool progress = lower_tex_prefetch_block(block);
bool progress = lower_tex_prefetch_block(block);
if (progress) {
nir_metadata_preserve(impl, nir_metadata_block_index |
nir_metadata_dominance);
}
if (progress) {
nir_metadata_preserve(impl,
nir_metadata_block_index | nir_metadata_dominance);
}
return progress;
return progress;
}
bool
ir3_nir_lower_tex_prefetch(nir_shader *shader)
{
bool progress = false;
bool progress = false;
assert(shader->info.stage == MESA_SHADER_FRAGMENT);
assert(shader->info.stage == MESA_SHADER_FRAGMENT);
nir_foreach_function (function, shader) {
/* Only texture sampling instructions inside the main function
* are eligible for pre-dispatch.
*/
if (!function->impl || !function->is_entrypoint)
continue;
nir_foreach_function (function, shader) {
/* Only texture sampling instructions inside the main function
* are eligible for pre-dispatch.
*/
if (!function->impl || !function->is_entrypoint)
continue;
progress |= lower_tex_prefetch_func(function->impl);
}
progress |= lower_tex_prefetch_func(function->impl);
}
return progress;
return progress;
}
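
The flat offset returned by coord_offset() is just the vec4 varying slot index times four plus the component; a trivial sketch:

#include <assert.h>

/* A varying input is addressed by a vec4 slot ("base") plus a component
 * within that slot; the prefetch encoding wants a flat scalar offset.
 */
static int
flat_coord_offset(unsigned base, unsigned comp)
{
   return 4 * base + comp;
}

int
main(void)
{
   assert(flat_coord_offset(0, 0) == 0); /* first slot, .x */
   assert(flat_coord_offset(2, 1) == 9); /* third slot, .y */
   return 0;
}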


@@ -21,8 +21,8 @@
* IN THE SOFTWARE.
*/
#include "ir3_nir.h"
#include "compiler/nir/nir_builder.h"
#include "ir3_nir.h"
/* A4XX has a broken GATHER4 operation. It performs the texture swizzle on the
* gather results, rather than before. As a result, it must be emulated with
@@ -32,70 +32,68 @@
static nir_ssa_def *
ir3_nir_lower_tg4_to_tex_instr(nir_builder *b, nir_instr *instr, void *data)
{
nir_tex_instr *tg4 = nir_instr_as_tex(instr);
static const int offsets[3][2] = { {0, 1}, {1, 1}, {1, 0} };
nir_tex_instr *tg4 = nir_instr_as_tex(instr);
static const int offsets[3][2] = {{0, 1}, {1, 1}, {1, 0}};
nir_ssa_def *results[4];
int offset_index = nir_tex_instr_src_index(tg4, nir_tex_src_offset);
for (int i = 0; i < 4; i++) {
int num_srcs = tg4->num_srcs + 1 /* lod */;
if (offset_index < 0 && i < 3)
num_srcs++;
nir_ssa_def *results[4];
int offset_index = nir_tex_instr_src_index(tg4, nir_tex_src_offset);
for (int i = 0; i < 4; i++) {
int num_srcs = tg4->num_srcs + 1 /* lod */;
if (offset_index < 0 && i < 3)
num_srcs++;
nir_tex_instr *tex = nir_tex_instr_create(b->shader, num_srcs);
tex->op = nir_texop_txl;
tex->sampler_dim = tg4->sampler_dim;
tex->coord_components = tg4->coord_components;
tex->is_array = tg4->is_array;
tex->is_shadow = tg4->is_shadow;
tex->is_new_style_shadow = tg4->is_new_style_shadow;
tex->texture_index = tg4->texture_index;
tex->sampler_index = tg4->sampler_index;
tex->dest_type = tg4->dest_type;
nir_tex_instr *tex = nir_tex_instr_create(b->shader, num_srcs);
tex->op = nir_texop_txl;
tex->sampler_dim = tg4->sampler_dim;
tex->coord_components = tg4->coord_components;
tex->is_array = tg4->is_array;
tex->is_shadow = tg4->is_shadow;
tex->is_new_style_shadow = tg4->is_new_style_shadow;
tex->texture_index = tg4->texture_index;
tex->sampler_index = tg4->sampler_index;
tex->dest_type = tg4->dest_type;
for (int j = 0; j < tg4->num_srcs; j++) {
nir_src_copy(&tex->src[j].src, &tg4->src[j].src, tex);
tex->src[j].src_type = tg4->src[j].src_type;
}
if (i != 3) {
nir_ssa_def *offset =
nir_vec2(b, nir_imm_int(b, offsets[i][0]),
nir_imm_int(b, offsets[i][1]));
if (offset_index < 0) {
tex->src[tg4->num_srcs].src = nir_src_for_ssa(offset);
tex->src[tg4->num_srcs].src_type = nir_tex_src_offset;
} else {
assert(nir_tex_instr_src_size(tex, offset_index) == 2);
nir_ssa_def *orig = nir_ssa_for_src(
b, tex->src[offset_index].src, 2);
tex->src[offset_index].src =
nir_src_for_ssa(nir_iadd(b, orig, offset));
}
}
tex->src[num_srcs - 1].src = nir_src_for_ssa(nir_imm_float(b, 0));
tex->src[num_srcs - 1].src_type = nir_tex_src_lod;
for (int j = 0; j < tg4->num_srcs; j++) {
nir_src_copy(&tex->src[j].src, &tg4->src[j].src, tex);
tex->src[j].src_type = tg4->src[j].src_type;
}
if (i != 3) {
nir_ssa_def *offset = nir_vec2(b, nir_imm_int(b, offsets[i][0]),
nir_imm_int(b, offsets[i][1]));
if (offset_index < 0) {
tex->src[tg4->num_srcs].src = nir_src_for_ssa(offset);
tex->src[tg4->num_srcs].src_type = nir_tex_src_offset;
} else {
assert(nir_tex_instr_src_size(tex, offset_index) == 2);
nir_ssa_def *orig =
nir_ssa_for_src(b, tex->src[offset_index].src, 2);
tex->src[offset_index].src =
nir_src_for_ssa(nir_iadd(b, orig, offset));
}
}
tex->src[num_srcs - 1].src = nir_src_for_ssa(nir_imm_float(b, 0));
tex->src[num_srcs - 1].src_type = nir_tex_src_lod;
nir_ssa_dest_init(&tex->instr, &tex->dest,
nir_tex_instr_dest_size(tex), 32, NULL);
nir_builder_instr_insert(b, &tex->instr);
nir_ssa_dest_init(&tex->instr, &tex->dest, nir_tex_instr_dest_size(tex),
32, NULL);
nir_builder_instr_insert(b, &tex->instr);
results[i] = nir_channel(b, &tex->dest.ssa, tg4->component);
}
results[i] = nir_channel(b, &tex->dest.ssa, tg4->component);
}
return nir_vec(b, results, 4);
return nir_vec(b, results, 4);
}
static bool
ir3_nir_lower_tg4_to_tex_filter(const nir_instr *instr, const void *data)
{
return (instr->type == nir_instr_type_tex &&
nir_instr_as_tex(instr)->op == nir_texop_tg4);
return (instr->type == nir_instr_type_tex &&
nir_instr_as_tex(instr)->op == nir_texop_tg4);
}
bool
ir3_nir_lower_tg4_to_tex(nir_shader *shader)
{
return nir_shader_lower_instructions(shader,
ir3_nir_lower_tg4_to_tex_filter,
ir3_nir_lower_tg4_to_tex_instr, NULL);
return nir_shader_lower_instructions(shader, ir3_nir_lower_tg4_to_tex_filter,
ir3_nir_lower_tg4_to_tex_instr, NULL);
}
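
The offsets table matches the usual textureGather() ordering of the 2x2 footprint: (0,1), (1,1), (1,0) and finally the un-offset (0,0) tap. A small CPU-side sketch of what the four emitted txl instructions collectively fetch, using a toy image and a hypothetical layout:

#include <stdio.h>

/* textureGather() returns the 2x2 footprint in the order
 * (0,1), (1,1), (1,0), (0,0) relative to the base texel; the lowering
 * emulates that with four point-sampled fetches.
 */
static const int gather_offsets[4][2] = {{0, 1}, {1, 1}, {1, 0}, {0, 0}};

int
main(void)
{
   float img[2][2] = {{1.0f, 2.0f},   /* img[y][x] */
                      {3.0f, 4.0f}};
   int bx = 0, by = 0;                /* base texel of the footprint */
   float gathered[4];
   for (int i = 0; i < 4; i++)
      gathered[i] = img[by + gather_offsets[i][1]][bx + gather_offsets[i][0]];
   for (int i = 0; i < 4; i++)
      printf("component %d = %f\n", i, gathered[i]);
   return 0;
}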


@@ -21,8 +21,8 @@
* IN THE SOFTWARE.
*/
#include "ir3_nir.h"
#include "compiler/nir/nir_builder.h"
#include "ir3_nir.h"
/**
* This pass moves varying fetches (and the instructions they depend on
@@ -46,25 +46,23 @@
*/
typedef struct {
nir_block *start_block;
bool precondition_failed;
nir_block *start_block;
bool precondition_failed;
} precond_state;
typedef struct {
nir_shader *shader;
nir_block *start_block;
nir_shader *shader;
nir_block *start_block;
} state;
static void check_precondition_instr(precond_state *state, nir_instr *instr);
static void move_instruction_to_start_block(state *state, nir_instr *instr);
static bool
check_precondition_src(nir_src *src, void *state)
{
check_precondition_instr(state, src->ssa->parent_instr);
return true;
check_precondition_instr(state, src->ssa->parent_instr);
return true;
}
/* Recursively check if there is even a single dependency which
@@ -73,163 +71,163 @@ check_precondition_src(nir_src *src, void *state)
static void
check_precondition_instr(precond_state *state, nir_instr *instr)
{
if (instr->block == state->start_block)
return;
if (instr->block == state->start_block)
return;
switch (instr->type) {
case nir_instr_type_alu:
case nir_instr_type_deref:
case nir_instr_type_load_const:
case nir_instr_type_ssa_undef:
/* These could be safely moved around */
break;
case nir_instr_type_intrinsic: {
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
if (!nir_intrinsic_can_reorder(intr)) {
state->precondition_failed = true;
return;
}
break;
}
default:
state->precondition_failed = true;
return;
}
switch (instr->type) {
case nir_instr_type_alu:
case nir_instr_type_deref:
case nir_instr_type_load_const:
case nir_instr_type_ssa_undef:
/* These could be safely moved around */
break;
case nir_instr_type_intrinsic: {
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
if (!nir_intrinsic_can_reorder(intr)) {
state->precondition_failed = true;
return;
}
break;
}
default:
state->precondition_failed = true;
return;
}
nir_foreach_src(instr, check_precondition_src, state);
nir_foreach_src(instr, check_precondition_src, state);
}
static void
check_precondition_block(precond_state *state, nir_block *block)
{
nir_foreach_instr_safe (instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_foreach_instr_safe (instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
switch (intr->intrinsic) {
case nir_intrinsic_load_interpolated_input:
case nir_intrinsic_load_input:
break;
default:
continue;
}
switch (intr->intrinsic) {
case nir_intrinsic_load_interpolated_input:
case nir_intrinsic_load_input:
break;
default:
continue;
}
check_precondition_instr(state, instr);
check_precondition_instr(state, instr);
if (state->precondition_failed)
return;
}
if (state->precondition_failed)
return;
}
}
static bool
move_src(nir_src *src, void *state)
{
/* At this point we shouldn't have any non-ssa src: */
debug_assert(src->is_ssa);
move_instruction_to_start_block(state, src->ssa->parent_instr);
return true;
/* At this point we shouldn't have any non-ssa src: */
debug_assert(src->is_ssa);
move_instruction_to_start_block(state, src->ssa->parent_instr);
return true;
}
static void
move_instruction_to_start_block(state *state, nir_instr *instr)
{
/* nothing to do if the instruction is already in the start block */
if (instr->block == state->start_block)
return;
/* nothing to do if the instruction is already in the start block */
if (instr->block == state->start_block)
return;
/* first move (recursively) all src's to ensure they appear before
* load*_input that we are trying to move:
*/
nir_foreach_src(instr, move_src, state);
/* first move (recursively) all src's to ensure they appear before
* load*_input that we are trying to move:
*/
nir_foreach_src(instr, move_src, state);
/* and then move the instruction itself:
*/
exec_node_remove(&instr->node);
exec_list_push_tail(&state->start_block->instr_list, &instr->node);
instr->block = state->start_block;
/* and then move the instruction itself:
*/
exec_node_remove(&instr->node);
exec_list_push_tail(&state->start_block->instr_list, &instr->node);
instr->block = state->start_block;
}
static bool
move_varying_inputs_block(state *state, nir_block *block)
{
bool progress = false;
bool progress = false;
nir_foreach_instr_safe (instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_foreach_instr_safe (instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
switch (intr->intrinsic) {
case nir_intrinsic_load_interpolated_input:
case nir_intrinsic_load_input:
/* TODO any others to handle? */
break;
default:
continue;
}
switch (intr->intrinsic) {
case nir_intrinsic_load_interpolated_input:
case nir_intrinsic_load_input:
/* TODO any others to handle? */
break;
default:
continue;
}
debug_assert(intr->dest.is_ssa);
debug_assert(intr->dest.is_ssa);
move_instruction_to_start_block(state, instr);
move_instruction_to_start_block(state, instr);
progress = true;
}
progress = true;
}
return progress;
return progress;
}
bool
ir3_nir_move_varying_inputs(nir_shader *shader)
{
bool progress = false;
bool progress = false;
debug_assert(shader->info.stage == MESA_SHADER_FRAGMENT);
debug_assert(shader->info.stage == MESA_SHADER_FRAGMENT);
nir_foreach_function (function, shader) {
precond_state state;
nir_foreach_function (function, shader) {
precond_state state;
if (!function->impl)
continue;
if (!function->impl)
continue;
state.precondition_failed = false;
state.start_block = nir_start_block(function->impl);
state.precondition_failed = false;
state.start_block = nir_start_block(function->impl);
nir_foreach_block (block, function->impl) {
if (block == state.start_block)
continue;
nir_foreach_block (block, function->impl) {
if (block == state.start_block)
continue;
check_precondition_block(&state, block);
check_precondition_block(&state, block);
if (state.precondition_failed)
return false;
}
}
if (state.precondition_failed)
return false;
}
}
nir_foreach_function (function, shader) {
state state;
nir_foreach_function (function, shader) {
state state;
if (!function->impl)
continue;
if (!function->impl)
continue;
state.shader = shader;
state.start_block = nir_start_block(function->impl);
state.shader = shader;
state.start_block = nir_start_block(function->impl);
bool progress = false;
nir_foreach_block (block, function->impl) {
/* don't need to move anything that is already in the first block */
if (block == state.start_block)
continue;
progress |= move_varying_inputs_block(&state, block);
}
bool progress = false;
nir_foreach_block (block, function->impl) {
/* don't need to move anything that is already in the first block */
if (block == state.start_block)
continue;
progress |= move_varying_inputs_block(&state, block);
}
if (progress) {
nir_metadata_preserve(function->impl,
nir_metadata_block_index | nir_metadata_dominance);
}
}
if (progress) {
nir_metadata_preserve(
function->impl, nir_metadata_block_index | nir_metadata_dominance);
}
}
return progress;
return progress;
}
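
The move itself is a plain post-order walk: an instruction is only placed in the start block after all of its sources have been placed there, so sources always end up ahead of their users. A toy sketch of that ordering, with a hand-built three-node DAG standing in for load_interpolated_input and its sources:

#include <stdio.h>

struct node {
   const char *name;
   struct node *src[2];
   int moved;
};

/* Move sources first, recursively, then the node itself. */
static void
move_to_start(struct node *n)
{
   if (!n || n->moved)
      return;
   for (int i = 0; i < 2; i++)
      move_to_start(n->src[i]);
   n->moved = 1;
   printf("moved %s\n", n->name);
}

int
main(void)
{
   struct node bary = {"load_barycentric_pixel",   {0, 0}, 0};
   struct node off  = {"const_offset",             {0, 0}, 0};
   struct node in   = {"load_interpolated_input",  {&bary, &off}, 0};
   move_to_start(&in);  /* prints the sources first, then the load */
   return 0;
}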

File diff suppressed because it is too large.


@@ -33,425 +33,452 @@
#define PTRID(x) ((unsigned long)(x))
/* ansi escape sequences: */
#define RESET "\x1b[0m"
#define RED "\x1b[0;31m"
#define GREEN "\x1b[0;32m"
#define BLUE "\x1b[0;34m"
#define MAGENTA "\x1b[0;35m"
#define RESET "\x1b[0m"
#define RED "\x1b[0;31m"
#define GREEN "\x1b[0;32m"
#define BLUE "\x1b[0;34m"
#define MAGENTA "\x1b[0;35m"
/* syntax coloring, mostly to make it easier to see different sorts of
* srcs (immediate, constant, ssa, array, ...)
*/
#define SYN_REG(x) RED x RESET
#define SYN_IMMED(x) GREEN x RESET
#define SYN_CONST(x) GREEN x RESET
#define SYN_SSA(x) BLUE x RESET
#define SYN_ARRAY(x) MAGENTA x RESET
#define SYN_REG(x) RED x RESET
#define SYN_IMMED(x) GREEN x RESET
#define SYN_CONST(x) GREEN x RESET
#define SYN_SSA(x) BLUE x RESET
#define SYN_ARRAY(x) MAGENTA x RESET
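
The SYN_* macros rely on C string-literal concatenation to paste the ANSI escape codes around the format string at compile time; a minimal sketch:

#include <stdio.h>

#define RESET "\x1b[0m"
#define RED   "\x1b[0;31m"
/* RED x RESET concatenates into a single literal with the color codes
 * wrapped around the caller's format string.
 */
#define SYN_REG(x) RED x RESET

int
main(void)
{
   printf(SYN_REG("r%u.%c") "\n", 0u, 'x');  /* prints "r0.x" in red */
   return 0;
}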
static const char *
type_name(type_t type)
{
static const char *type_names[] = {
[TYPE_F16] = "f16",
[TYPE_F32] = "f32",
[TYPE_U16] = "u16",
[TYPE_U32] = "u32",
[TYPE_S16] = "s16",
[TYPE_S32] = "s32",
[TYPE_U8] = "u8",
[TYPE_S8] = "s8",
};
return type_names[type];
static const char *type_names[] = {
[TYPE_F16] = "f16", [TYPE_F32] = "f32", [TYPE_U16] = "u16",
[TYPE_U32] = "u32", [TYPE_S16] = "s16", [TYPE_S32] = "s32",
[TYPE_U8] = "u8", [TYPE_S8] = "s8",
};
return type_names[type];
}
static void print_instr_name(struct log_stream *stream, struct ir3_instruction *instr, bool flags)
static void
print_instr_name(struct log_stream *stream, struct ir3_instruction *instr,
bool flags)
{
if (!instr)
return;
if (!instr)
return;
#ifdef DEBUG
mesa_log_stream_printf(stream, "%04u:", instr->serialno);
mesa_log_stream_printf(stream, "%04u:", instr->serialno);
#endif
mesa_log_stream_printf(stream, "%04u:", instr->name);
mesa_log_stream_printf(stream, "%04u:", instr->ip);
if (instr->flags & IR3_INSTR_UNUSED) {
mesa_log_stream_printf(stream, "XXX: ");
} else {
mesa_log_stream_printf(stream, "%03u: ", instr->use_count);
}
mesa_log_stream_printf(stream, "%04u:", instr->name);
mesa_log_stream_printf(stream, "%04u:", instr->ip);
if (instr->flags & IR3_INSTR_UNUSED) {
mesa_log_stream_printf(stream, "XXX: ");
} else {
mesa_log_stream_printf(stream, "%03u: ", instr->use_count);
}
if (flags) {
mesa_log_stream_printf(stream, "\t");
if (instr->flags & IR3_INSTR_SY)
mesa_log_stream_printf(stream, "(sy)");
if (instr->flags & IR3_INSTR_SS)
mesa_log_stream_printf(stream, "(ss)");
if (instr->flags & IR3_INSTR_JP)
mesa_log_stream_printf(stream, "(jp)");
if (instr->repeat)
mesa_log_stream_printf(stream, "(rpt%d)", instr->repeat);
if (instr->nop)
mesa_log_stream_printf(stream, "(nop%d)", instr->nop);
if (instr->flags & IR3_INSTR_UL)
mesa_log_stream_printf(stream, "(ul)");
} else {
mesa_log_stream_printf(stream, " ");
}
if (flags) {
mesa_log_stream_printf(stream, "\t");
if (instr->flags & IR3_INSTR_SY)
mesa_log_stream_printf(stream, "(sy)");
if (instr->flags & IR3_INSTR_SS)
mesa_log_stream_printf(stream, "(ss)");
if (instr->flags & IR3_INSTR_JP)
mesa_log_stream_printf(stream, "(jp)");
if (instr->repeat)
mesa_log_stream_printf(stream, "(rpt%d)", instr->repeat);
if (instr->nop)
mesa_log_stream_printf(stream, "(nop%d)", instr->nop);
if (instr->flags & IR3_INSTR_UL)
mesa_log_stream_printf(stream, "(ul)");
} else {
mesa_log_stream_printf(stream, " ");
}
if (is_meta(instr)) {
switch (instr->opc) {
case OPC_META_INPUT: mesa_log_stream_printf(stream, "_meta:in"); break;
case OPC_META_SPLIT: mesa_log_stream_printf(stream, "_meta:split"); break;
case OPC_META_COLLECT: mesa_log_stream_printf(stream, "_meta:collect"); break;
case OPC_META_TEX_PREFETCH: mesa_log_stream_printf(stream, "_meta:tex_prefetch"); break;
case OPC_META_PARALLEL_COPY: mesa_log_stream_printf(stream, "_meta:parallel_copy"); break;
case OPC_META_PHI: mesa_log_stream_printf(stream, "_meta:phi"); break;
if (is_meta(instr)) {
switch (instr->opc) {
case OPC_META_INPUT:
mesa_log_stream_printf(stream, "_meta:in");
break;
case OPC_META_SPLIT:
mesa_log_stream_printf(stream, "_meta:split");
break;
case OPC_META_COLLECT:
mesa_log_stream_printf(stream, "_meta:collect");
break;
case OPC_META_TEX_PREFETCH:
mesa_log_stream_printf(stream, "_meta:tex_prefetch");
break;
case OPC_META_PARALLEL_COPY:
mesa_log_stream_printf(stream, "_meta:parallel_copy");
break;
case OPC_META_PHI:
mesa_log_stream_printf(stream, "_meta:phi");
break;
/* shouldn't hit here.. just for debugging: */
default: mesa_log_stream_printf(stream, "_meta:%d", instr->opc); break;
}
} else if (opc_cat(instr->opc) == 1) {
if (instr->opc == OPC_MOV) {
if (instr->cat1.src_type == instr->cat1.dst_type)
mesa_log_stream_printf(stream, "mov");
else
mesa_log_stream_printf(stream, "cov");
} else {
mesa_log_stream_printf(stream, "%s", disasm_a3xx_instr_name(instr->opc));
}
/* shouldn't hit here.. just for debugging: */
default:
mesa_log_stream_printf(stream, "_meta:%d", instr->opc);
break;
}
} else if (opc_cat(instr->opc) == 1) {
if (instr->opc == OPC_MOV) {
if (instr->cat1.src_type == instr->cat1.dst_type)
mesa_log_stream_printf(stream, "mov");
else
mesa_log_stream_printf(stream, "cov");
} else {
mesa_log_stream_printf(stream, "%s",
disasm_a3xx_instr_name(instr->opc));
}
if (instr->opc != OPC_MOVMSK) {
mesa_log_stream_printf(stream, ".%s%s", type_name(instr->cat1.src_type),
type_name(instr->cat1.dst_type));
}
} else if (instr->opc == OPC_B) {
const char *name[8] = {
[BRANCH_PLAIN] = "br",
[BRANCH_OR] = "brao",
[BRANCH_AND] = "braa",
[BRANCH_CONST] = "brac",
[BRANCH_ANY] = "bany",
[BRANCH_ALL] = "ball",
[BRANCH_X] = "brax",
};
mesa_log_stream_printf(stream, "%s", name[instr->cat0.brtype]);
} else {
mesa_log_stream_printf(stream, "%s", disasm_a3xx_instr_name(instr->opc));
if (instr->flags & IR3_INSTR_3D)
mesa_log_stream_printf(stream, ".3d");
if (instr->flags & IR3_INSTR_A)
mesa_log_stream_printf(stream, ".a");
if (instr->flags & IR3_INSTR_O)
mesa_log_stream_printf(stream, ".o");
if (instr->flags & IR3_INSTR_P)
mesa_log_stream_printf(stream, ".p");
if (instr->flags & IR3_INSTR_S)
mesa_log_stream_printf(stream, ".s");
if (instr->flags & IR3_INSTR_A1EN)
mesa_log_stream_printf(stream, ".a1en");
if (instr->opc == OPC_LDC)
mesa_log_stream_printf(stream, ".offset%d", instr->cat6.d);
if (instr->flags & IR3_INSTR_B) {
mesa_log_stream_printf(stream, ".base%d",
is_tex(instr) ? instr->cat5.tex_base : instr->cat6.base);
}
if (instr->flags & IR3_INSTR_S2EN)
mesa_log_stream_printf(stream, ".s2en");
if (instr->opc != OPC_MOVMSK) {
mesa_log_stream_printf(stream, ".%s%s",
type_name(instr->cat1.src_type),
type_name(instr->cat1.dst_type));
}
} else if (instr->opc == OPC_B) {
const char *name[8] = {
[BRANCH_PLAIN] = "br", [BRANCH_OR] = "brao", [BRANCH_AND] = "braa",
[BRANCH_CONST] = "brac", [BRANCH_ANY] = "bany", [BRANCH_ALL] = "ball",
[BRANCH_X] = "brax",
};
mesa_log_stream_printf(stream, "%s", name[instr->cat0.brtype]);
} else {
mesa_log_stream_printf(stream, "%s", disasm_a3xx_instr_name(instr->opc));
if (instr->flags & IR3_INSTR_3D)
mesa_log_stream_printf(stream, ".3d");
if (instr->flags & IR3_INSTR_A)
mesa_log_stream_printf(stream, ".a");
if (instr->flags & IR3_INSTR_O)
mesa_log_stream_printf(stream, ".o");
if (instr->flags & IR3_INSTR_P)
mesa_log_stream_printf(stream, ".p");
if (instr->flags & IR3_INSTR_S)
mesa_log_stream_printf(stream, ".s");
if (instr->flags & IR3_INSTR_A1EN)
mesa_log_stream_printf(stream, ".a1en");
if (instr->opc == OPC_LDC)
mesa_log_stream_printf(stream, ".offset%d", instr->cat6.d);
if (instr->flags & IR3_INSTR_B) {
mesa_log_stream_printf(
stream, ".base%d",
is_tex(instr) ? instr->cat5.tex_base : instr->cat6.base);
}
if (instr->flags & IR3_INSTR_S2EN)
mesa_log_stream_printf(stream, ".s2en");
static const char *cond[0x7] = {
"lt",
"le",
"gt",
"ge",
"eq",
"ne",
};
static const char *cond[0x7] = {
"lt", "le", "gt", "ge", "eq", "ne",
};
switch (instr->opc) {
case OPC_CMPS_F:
case OPC_CMPS_U:
case OPC_CMPS_S:
case OPC_CMPV_F:
case OPC_CMPV_U:
case OPC_CMPV_S:
mesa_log_stream_printf(stream, ".%s", cond[instr->cat2.condition & 0x7]);
break;
default:
break;
}
}
switch (instr->opc) {
case OPC_CMPS_F:
case OPC_CMPS_U:
case OPC_CMPS_S:
case OPC_CMPV_F:
case OPC_CMPV_U:
case OPC_CMPV_S:
mesa_log_stream_printf(stream, ".%s",
cond[instr->cat2.condition & 0x7]);
break;
default:
break;
}
}
}
static void print_ssa_def_name(struct log_stream *stream, struct ir3_register *reg)
static void
print_ssa_def_name(struct log_stream *stream, struct ir3_register *reg)
{
mesa_log_stream_printf(stream, SYN_SSA("ssa_%u"), reg->instr->serialno);
if (reg->name != 0)
mesa_log_stream_printf(stream, ":%u", reg->name);
mesa_log_stream_printf(stream, SYN_SSA("ssa_%u"), reg->instr->serialno);
if (reg->name != 0)
mesa_log_stream_printf(stream, ":%u", reg->name);
}
static void print_ssa_name(struct log_stream *stream, struct ir3_register *reg, bool dst)
static void
print_ssa_name(struct log_stream *stream, struct ir3_register *reg, bool dst)
{
if (!dst) {
if (!reg->def)
mesa_log_stream_printf(stream, SYN_SSA("undef"));
else
print_ssa_def_name(stream, reg->def);
} else {
print_ssa_def_name(stream, reg);
}
if (!dst) {
if (!reg->def)
mesa_log_stream_printf(stream, SYN_SSA("undef"));
else
print_ssa_def_name(stream, reg->def);
} else {
print_ssa_def_name(stream, reg);
}
if (reg->num != INVALID_REG && !(reg->flags & IR3_REG_ARRAY))
mesa_log_stream_printf(stream, "("SYN_REG("r%u.%c")")", reg_num(reg), "xyzw"[reg_comp(reg)]);
if (reg->num != INVALID_REG && !(reg->flags & IR3_REG_ARRAY))
mesa_log_stream_printf(stream, "(" SYN_REG("r%u.%c") ")", reg_num(reg),
"xyzw"[reg_comp(reg)]);
}
static void print_reg_name(struct log_stream *stream, struct ir3_instruction *instr,
struct ir3_register *reg, bool dest)
static void
print_reg_name(struct log_stream *stream, struct ir3_instruction *instr,
struct ir3_register *reg, bool dest)
{
if ((reg->flags & (IR3_REG_FABS | IR3_REG_SABS)) &&
(reg->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT)))
mesa_log_stream_printf(stream, "(absneg)");
else if (reg->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT))
mesa_log_stream_printf(stream, "(neg)");
else if (reg->flags & (IR3_REG_FABS | IR3_REG_SABS))
mesa_log_stream_printf(stream, "(abs)");
if ((reg->flags & (IR3_REG_FABS | IR3_REG_SABS)) &&
(reg->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT)))
mesa_log_stream_printf(stream, "(absneg)");
else if (reg->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT))
mesa_log_stream_printf(stream, "(neg)");
else if (reg->flags & (IR3_REG_FABS | IR3_REG_SABS))
mesa_log_stream_printf(stream, "(abs)");
if (reg->flags & IR3_REG_FIRST_KILL)
mesa_log_stream_printf(stream, "(kill)");
if (reg->flags & IR3_REG_UNUSED)
mesa_log_stream_printf(stream, "(unused)");
if (reg->flags & IR3_REG_FIRST_KILL)
mesa_log_stream_printf(stream, "(kill)");
if (reg->flags & IR3_REG_UNUSED)
mesa_log_stream_printf(stream, "(unused)");
if (reg->flags & IR3_REG_R)
mesa_log_stream_printf(stream, "(r)");
if (reg->flags & IR3_REG_R)
mesa_log_stream_printf(stream, "(r)");
/* Right now all instructions that use tied registers only have one
* destination register, so we can just print (tied) as if it's a flag,
* although it's more convenient for RA if it's a pointer.
*/
if (reg->tied)
printf("(tied)");
/* Right now all instructions that use tied registers only have one
* destination register, so we can just print (tied) as if it's a flag,
* although it's more convenient for RA if it's a pointer.
*/
if (reg->tied)
printf("(tied)");
if (reg->flags & IR3_REG_SHARED)
mesa_log_stream_printf(stream, "s");
if (reg->flags & IR3_REG_HALF)
mesa_log_stream_printf(stream, "h");
if (reg->flags & IR3_REG_SHARED)
mesa_log_stream_printf(stream, "s");
if (reg->flags & IR3_REG_HALF)
mesa_log_stream_printf(stream, "h");
if (reg->flags & IR3_REG_IMMED) {
mesa_log_stream_printf(stream, SYN_IMMED("imm[%f,%d,0x%x]"), reg->fim_val, reg->iim_val, reg->iim_val);
} else if (reg->flags & IR3_REG_ARRAY) {
if (reg->flags & IR3_REG_SSA) {
print_ssa_name(stream, reg, dest);
mesa_log_stream_printf(stream, ":");
}
mesa_log_stream_printf(stream, SYN_ARRAY("arr[id=%u, offset=%d, size=%u]"), reg->array.id,
reg->array.offset, reg->size);
if (reg->array.base != INVALID_REG)
mesa_log_stream_printf(stream, "("SYN_REG("r%u.%c")")", reg->array.base >> 2,
"xyzw"[reg->array.base & 0x3]);
} else if (reg->flags & IR3_REG_SSA) {
print_ssa_name(stream, reg, dest);
} else if (reg->flags & IR3_REG_RELATIV) {
if (reg->flags & IR3_REG_CONST)
mesa_log_stream_printf(stream, SYN_CONST("c<a0.x + %d>"), reg->array.offset);
else
mesa_log_stream_printf(stream, SYN_REG("r<a0.x + %d>")" (%u)", reg->array.offset, reg->size);
} else {
if (reg->flags & IR3_REG_CONST)
mesa_log_stream_printf(stream, SYN_CONST("c%u.%c"), reg_num(reg), "xyzw"[reg_comp(reg)]);
else
mesa_log_stream_printf(stream, SYN_REG("r%u.%c"), reg_num(reg), "xyzw"[reg_comp(reg)]);
}
if (reg->flags & IR3_REG_IMMED) {
mesa_log_stream_printf(stream, SYN_IMMED("imm[%f,%d,0x%x]"), reg->fim_val,
reg->iim_val, reg->iim_val);
} else if (reg->flags & IR3_REG_ARRAY) {
if (reg->flags & IR3_REG_SSA) {
print_ssa_name(stream, reg, dest);
mesa_log_stream_printf(stream, ":");
}
mesa_log_stream_printf(stream,
SYN_ARRAY("arr[id=%u, offset=%d, size=%u]"),
reg->array.id, reg->array.offset, reg->size);
if (reg->array.base != INVALID_REG)
mesa_log_stream_printf(stream, "(" SYN_REG("r%u.%c") ")",
reg->array.base >> 2,
"xyzw"[reg->array.base & 0x3]);
} else if (reg->flags & IR3_REG_SSA) {
print_ssa_name(stream, reg, dest);
} else if (reg->flags & IR3_REG_RELATIV) {
if (reg->flags & IR3_REG_CONST)
mesa_log_stream_printf(stream, SYN_CONST("c<a0.x + %d>"),
reg->array.offset);
else
mesa_log_stream_printf(stream, SYN_REG("r<a0.x + %d>") " (%u)",
reg->array.offset, reg->size);
} else {
if (reg->flags & IR3_REG_CONST)
mesa_log_stream_printf(stream, SYN_CONST("c%u.%c"), reg_num(reg),
"xyzw"[reg_comp(reg)]);
else
mesa_log_stream_printf(stream, SYN_REG("r%u.%c"), reg_num(reg),
"xyzw"[reg_comp(reg)]);
}
if (reg->wrmask > 0x1)
mesa_log_stream_printf(stream, " (wrmask=0x%x)", reg->wrmask);
if (reg->wrmask > 0x1)
mesa_log_stream_printf(stream, " (wrmask=0x%x)", reg->wrmask);
}
static void
tab(struct log_stream *stream, int lvl)
{
for (int i = 0; i < lvl; i++)
mesa_log_stream_printf(stream, "\t");
for (int i = 0; i < lvl; i++)
mesa_log_stream_printf(stream, "\t");
}
static void
print_instr(struct log_stream *stream, struct ir3_instruction *instr, int lvl)
{
tab(stream, lvl);
tab(stream, lvl);
print_instr_name(stream, instr, true);
print_instr_name(stream, instr, true);
if (is_tex(instr)) {
mesa_log_stream_printf(stream, " (%s)(", type_name(instr->cat5.type));
for (unsigned i = 0; i < 4; i++)
if (instr->dsts[0]->wrmask & (1 << i))
mesa_log_stream_printf(stream, "%c", "xyzw"[i]);
mesa_log_stream_printf(stream, ")");
} else if ((instr->srcs_count > 0 || instr->dsts_count > 0) && (instr->opc != OPC_B)) {
/* NOTE the b(ranch) instruction has a suffix, which is
* handled below
*/
mesa_log_stream_printf(stream, " ");
}
if (is_tex(instr)) {
mesa_log_stream_printf(stream, " (%s)(", type_name(instr->cat5.type));
for (unsigned i = 0; i < 4; i++)
if (instr->dsts[0]->wrmask & (1 << i))
mesa_log_stream_printf(stream, "%c", "xyzw"[i]);
mesa_log_stream_printf(stream, ")");
} else if ((instr->srcs_count > 0 || instr->dsts_count > 0) &&
(instr->opc != OPC_B)) {
/* NOTE the b(ranch) instruction has a suffix, which is
* handled below
*/
mesa_log_stream_printf(stream, " ");
}
if (!is_flow(instr) || instr->opc == OPC_END || instr->opc == OPC_CHMASK) {
bool first = true;
foreach_dst (reg, instr) {
if (reg->wrmask == 0)
continue;
if (!first)
mesa_log_stream_printf(stream, ", ");
print_reg_name(stream, instr, reg, true);
first = false;
}
foreach_src (reg, instr) {
if (!first)
mesa_log_stream_printf(stream, ", ");
print_reg_name(stream, instr, reg, false);
first = false;
}
}
if (!is_flow(instr) || instr->opc == OPC_END || instr->opc == OPC_CHMASK) {
bool first = true;
foreach_dst (reg, instr) {
if (reg->wrmask == 0)
continue;
if (!first)
mesa_log_stream_printf(stream, ", ");
print_reg_name(stream, instr, reg, true);
first = false;
}
foreach_src (reg, instr) {
if (!first)
mesa_log_stream_printf(stream, ", ");
print_reg_name(stream, instr, reg, false);
first = false;
}
}
if (is_tex(instr) && !(instr->flags & IR3_INSTR_S2EN)) {
if (!!(instr->flags & IR3_INSTR_B)) {
if (!!(instr->flags & IR3_INSTR_A1EN)) {
mesa_log_stream_printf(stream, ", s#%d", instr->cat5.samp);
} else {
mesa_log_stream_printf(stream, ", s#%d, t#%d", instr->cat5.samp & 0xf,
instr->cat5.samp >> 4);
}
} else {
mesa_log_stream_printf(stream, ", s#%d, t#%d", instr->cat5.samp, instr->cat5.tex);
}
}
if (is_tex(instr) && !(instr->flags & IR3_INSTR_S2EN)) {
if (!!(instr->flags & IR3_INSTR_B)) {
if (!!(instr->flags & IR3_INSTR_A1EN)) {
mesa_log_stream_printf(stream, ", s#%d", instr->cat5.samp);
} else {
mesa_log_stream_printf(stream, ", s#%d, t#%d",
instr->cat5.samp & 0xf,
instr->cat5.samp >> 4);
}
} else {
mesa_log_stream_printf(stream, ", s#%d, t#%d", instr->cat5.samp,
instr->cat5.tex);
}
}
if (instr->opc == OPC_META_SPLIT) {
mesa_log_stream_printf(stream, ", off=%d", instr->split.off);
} else if (instr->opc == OPC_META_TEX_PREFETCH) {
mesa_log_stream_printf(stream, ", tex=%d, samp=%d, input_offset=%d", instr->prefetch.tex,
instr->prefetch.samp, instr->prefetch.input_offset);
}
if (instr->opc == OPC_META_SPLIT) {
mesa_log_stream_printf(stream, ", off=%d", instr->split.off);
} else if (instr->opc == OPC_META_TEX_PREFETCH) {
mesa_log_stream_printf(stream, ", tex=%d, samp=%d, input_offset=%d",
instr->prefetch.tex, instr->prefetch.samp,
instr->prefetch.input_offset);
}
if (is_flow(instr) && instr->cat0.target) {
/* the predicate register src is implied: */
if (instr->opc == OPC_B) {
static const struct {
const char *suffix;
int nsrc;
bool idx;
} brinfo[7] = {
[BRANCH_PLAIN] = { "r", 1, false },
[BRANCH_OR] = { "rao", 2, false },
[BRANCH_AND] = { "raa", 2, false },
[BRANCH_CONST] = { "rac", 0, true },
[BRANCH_ANY] = { "any", 1, false },
[BRANCH_ALL] = { "all", 1, false },
[BRANCH_X] = { "rax", 0, false },
};
if (is_flow(instr) && instr->cat0.target) {
/* the predicate register src is implied: */
if (instr->opc == OPC_B) {
static const struct {
const char *suffix;
int nsrc;
bool idx;
} brinfo[7] = {
[BRANCH_PLAIN] = {"r", 1, false}, [BRANCH_OR] = {"rao", 2, false},
[BRANCH_AND] = {"raa", 2, false}, [BRANCH_CONST] = {"rac", 0, true},
[BRANCH_ANY] = {"any", 1, false}, [BRANCH_ALL] = {"all", 1, false},
[BRANCH_X] = {"rax", 0, false},
};
mesa_log_stream_printf(stream, "%s", brinfo[instr->cat0.brtype].suffix);
if (brinfo[instr->cat0.brtype].idx) {
mesa_log_stream_printf(stream, ".%u", instr->cat0.idx);
}
if (brinfo[instr->cat0.brtype].nsrc >= 1) {
mesa_log_stream_printf(stream, " %sp0.%c (",
instr->cat0.inv1 ? "!" : "",
"xyzw"[instr->cat0.comp1 & 0x3]);
print_reg_name(stream, instr, instr->srcs[0], false);
mesa_log_stream_printf(stream, "), ");
}
if (brinfo[instr->cat0.brtype].nsrc >= 2) {
mesa_log_stream_printf(stream, " %sp0.%c (",
instr->cat0.inv2 ? "!" : "",
"xyzw"[instr->cat0.comp2 & 0x3]);
print_reg_name(stream, instr, instr->srcs[1], false);
mesa_log_stream_printf(stream, "), ");
}
}
mesa_log_stream_printf(stream, " target=block%u", block_id(instr->cat0.target));
}
mesa_log_stream_printf(stream, "%s",
brinfo[instr->cat0.brtype].suffix);
if (brinfo[instr->cat0.brtype].idx) {
mesa_log_stream_printf(stream, ".%u", instr->cat0.idx);
}
if (brinfo[instr->cat0.brtype].nsrc >= 1) {
mesa_log_stream_printf(stream, " %sp0.%c (",
instr->cat0.inv1 ? "!" : "",
"xyzw"[instr->cat0.comp1 & 0x3]);
print_reg_name(stream, instr, instr->srcs[0], false);
mesa_log_stream_printf(stream, "), ");
}
if (brinfo[instr->cat0.brtype].nsrc >= 2) {
mesa_log_stream_printf(stream, " %sp0.%c (",
instr->cat0.inv2 ? "!" : "",
"xyzw"[instr->cat0.comp2 & 0x3]);
print_reg_name(stream, instr, instr->srcs[1], false);
mesa_log_stream_printf(stream, "), ");
}
}
mesa_log_stream_printf(stream, " target=block%u",
block_id(instr->cat0.target));
}
if (instr->deps_count) {
mesa_log_stream_printf(stream, ", false-deps:");
unsigned n = 0;
for (unsigned i = 0; i < instr->deps_count; i++) {
if (!instr->deps[i])
continue;
if (n++ > 0)
mesa_log_stream_printf(stream, ", ");
mesa_log_stream_printf(stream, SYN_SSA("ssa_%u"), instr->deps[i]->serialno);
}
}
if (instr->deps_count) {
mesa_log_stream_printf(stream, ", false-deps:");
unsigned n = 0;
for (unsigned i = 0; i < instr->deps_count; i++) {
if (!instr->deps[i])
continue;
if (n++ > 0)
mesa_log_stream_printf(stream, ", ");
mesa_log_stream_printf(stream, SYN_SSA("ssa_%u"),
instr->deps[i]->serialno);
}
}
mesa_log_stream_printf(stream, "\n");
mesa_log_stream_printf(stream, "\n");
}
void ir3_print_instr(struct ir3_instruction *instr)
void
ir3_print_instr(struct ir3_instruction *instr)
{
struct log_stream *stream = mesa_log_streami();
print_instr(stream, instr, 0);
mesa_log_stream_destroy(stream);
struct log_stream *stream = mesa_log_streami();
print_instr(stream, instr, 0);
mesa_log_stream_destroy(stream);
}
static void
print_block(struct ir3_block *block, int lvl)
{
struct log_stream *stream = mesa_log_streami();
struct log_stream *stream = mesa_log_streami();
tab(stream, lvl); mesa_log_stream_printf(stream, "block%u {\n", block_id(block));
tab(stream, lvl);
mesa_log_stream_printf(stream, "block%u {\n", block_id(block));
if (block->predecessors_count > 0) {
tab(stream, lvl+1);
mesa_log_stream_printf(stream, "pred: ");
for (unsigned i = 0; i < block->predecessors_count; i++) {
struct ir3_block *pred = block->predecessors[i];
if (i != 0)
mesa_log_stream_printf(stream, ", ");
mesa_log_stream_printf(stream, "block%u", block_id(pred));
}
mesa_log_stream_printf(stream, "\n");
}
if (block->predecessors_count > 0) {
tab(stream, lvl + 1);
mesa_log_stream_printf(stream, "pred: ");
for (unsigned i = 0; i < block->predecessors_count; i++) {
struct ir3_block *pred = block->predecessors[i];
if (i != 0)
mesa_log_stream_printf(stream, ", ");
mesa_log_stream_printf(stream, "block%u", block_id(pred));
}
mesa_log_stream_printf(stream, "\n");
}
foreach_instr (instr, &block->instr_list) {
print_instr(stream, instr, lvl+1);
}
foreach_instr (instr, &block->instr_list) {
print_instr(stream, instr, lvl + 1);
}
tab(stream, lvl+1); mesa_log_stream_printf(stream, "/* keeps:\n");
for (unsigned i = 0; i < block->keeps_count; i++) {
print_instr(stream, block->keeps[i], lvl+2);
}
tab(stream, lvl+1); mesa_log_stream_printf(stream, " */\n");
tab(stream, lvl + 1);
mesa_log_stream_printf(stream, "/* keeps:\n");
for (unsigned i = 0; i < block->keeps_count; i++) {
print_instr(stream, block->keeps[i], lvl + 2);
}
tab(stream, lvl + 1);
mesa_log_stream_printf(stream, " */\n");
if (block->successors[1]) {
/* leading into if/else: */
tab(stream, lvl+1);
mesa_log_stream_printf(stream, "/* succs: if ");
switch (block->brtype) {
case IR3_BRANCH_COND:
break;
case IR3_BRANCH_ANY:
printf("any ");
break;
case IR3_BRANCH_ALL:
printf("all ");
break;
case IR3_BRANCH_GETONE:
printf("getone ");
break;
}
if (block->condition)
mesa_log_stream_printf(stream, SYN_SSA("ssa_%u")" ", block->condition->serialno);
mesa_log_stream_printf(stream, "block%u; else block%u; */\n",
block_id(block->successors[0]),
block_id(block->successors[1]));
} else if (block->successors[0]) {
tab(stream, lvl+1);
mesa_log_stream_printf(stream, "/* succs: block%u; */\n",
block_id(block->successors[0]));
}
tab(stream, lvl); mesa_log_stream_printf(stream, "}\n");
if (block->successors[1]) {
/* leading into if/else: */
tab(stream, lvl + 1);
mesa_log_stream_printf(stream, "/* succs: if ");
switch (block->brtype) {
case IR3_BRANCH_COND:
break;
case IR3_BRANCH_ANY:
printf("any ");
break;
case IR3_BRANCH_ALL:
printf("all ");
break;
case IR3_BRANCH_GETONE:
printf("getone ");
break;
}
if (block->condition)
mesa_log_stream_printf(stream, SYN_SSA("ssa_%u") " ",
block->condition->serialno);
mesa_log_stream_printf(stream, "block%u; else block%u; */\n",
block_id(block->successors[0]),
block_id(block->successors[1]));
} else if (block->successors[0]) {
tab(stream, lvl + 1);
mesa_log_stream_printf(stream, "/* succs: block%u; */\n",
block_id(block->successors[0]));
}
tab(stream, lvl);
mesa_log_stream_printf(stream, "}\n");
}
void
ir3_print(struct ir3 *ir)
{
foreach_block (block, &ir->block_list)
print_block(block, 0);
foreach_block (block, &ir->block_list)
print_block(block, 0);
}


@@ -24,62 +24,68 @@
#ifndef _IR3_RA_H
#define _IR3_RA_H
#include "util/rb_tree.h"
#include "ir3.h"
#include "ir3_compiler.h"
#include "util/rb_tree.h"
#ifdef DEBUG
#define RA_DEBUG (ir3_shader_debug & IR3_DBG_RAMSGS)
#else
#define RA_DEBUG 0
#endif
#define d(fmt, ...) do { if (RA_DEBUG) { \
printf("RA: "fmt"\n", ##__VA_ARGS__); \
} } while (0)
#define d(fmt, ...) \
do { \
if (RA_DEBUG) { \
printf("RA: " fmt "\n", ##__VA_ARGS__); \
} \
} while (0)
#define di(instr, fmt, ...) do { if (RA_DEBUG) { \
printf("RA: "fmt": ", ##__VA_ARGS__); \
ir3_print_instr(instr); \
} } while (0)
#define di(instr, fmt, ...) \
do { \
if (RA_DEBUG) { \
printf("RA: " fmt ": ", ##__VA_ARGS__); \
ir3_print_instr(instr); \
} \
} while (0)
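
Both macros are no-ops unless the build defines DEBUG and the IR3_DBG_RAMSGS shader-debug
flag is set at runtime; a minimal usage sketch (debug_assignment and its arguments are
hypothetical, not part of this change):

static void
debug_assignment(struct ir3_instruction *instr, unsigned physreg)
{
   d("assigning physreg %u", physreg); /* "RA: assigning physreg N"              */
   di(instr, "processing");            /* "RA: processing: " then the instr dump */
}
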
typedef uint16_t physreg_t;
static inline unsigned
ra_physreg_to_num(physreg_t physreg, unsigned flags)
{
if (!(flags & IR3_REG_HALF))
physreg /= 2;
if (flags & IR3_REG_SHARED)
physreg += 48 * 4;
return physreg;
if (!(flags & IR3_REG_HALF))
physreg /= 2;
if (flags & IR3_REG_SHARED)
physreg += 48 * 4;
return physreg;
}
static inline physreg_t
ra_num_to_physreg(unsigned num, unsigned flags)
{
if (flags & IR3_REG_SHARED)
num -= 48 * 4;
if (!(flags & IR3_REG_HALF))
num *= 2;
return num;
if (flags & IR3_REG_SHARED)
num -= 48 * 4;
if (!(flags & IR3_REG_HALF))
num *= 2;
return num;
}
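
A physreg thus indexes half-register-sized slots: a full register occupies two consecutive
slots, and shared registers live 48 * 4 above the ordinary register numbers. A few
illustrative round-trips (physreg_mapping_examples and the concrete values are made up for
illustration):

static void
physreg_mapping_examples(void)
{
   assert(ra_num_to_physreg(5, 0) == 10);           /* full r1.y -> slots 10,11 */
   assert(ra_num_to_physreg(5, IR3_REG_HALF) == 5); /* half hr1.y -> slot 5     */
   assert(ra_physreg_to_num(10, 0) == 5);           /* and back again           */
   assert(ra_num_to_physreg(48 * 4 + 1, IR3_REG_SHARED) == 2); /* shared r48.y  */
}
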
static inline unsigned
ra_reg_get_num(const struct ir3_register *reg)
{
return (reg->flags & IR3_REG_ARRAY) ? reg->array.base : reg->num;
return (reg->flags & IR3_REG_ARRAY) ? reg->array.base : reg->num;
}
static inline physreg_t
ra_reg_get_physreg(const struct ir3_register *reg)
{
return ra_num_to_physreg(ra_reg_get_num(reg), reg->flags);
return ra_num_to_physreg(ra_reg_get_num(reg), reg->flags);
}
static inline bool
def_is_gpr(const struct ir3_register *reg)
{
return reg_num(reg) != REG_A0 && reg_num(reg) != REG_P0;
return reg_num(reg) != REG_A0 && reg_num(reg) != REG_P0;
}
/* Note: don't count undef as a source.
@@ -87,16 +93,14 @@ def_is_gpr(const struct ir3_register *reg)
static inline bool
ra_reg_is_src(const struct ir3_register *reg)
{
return (reg->flags & IR3_REG_SSA) && reg->def &&
def_is_gpr(reg->def);
return (reg->flags & IR3_REG_SSA) && reg->def && def_is_gpr(reg->def);
}
static inline bool
ra_reg_is_dst(const struct ir3_register *reg)
{
return (reg->flags & IR3_REG_SSA) &&
def_is_gpr(reg) &&
((reg->flags & IR3_REG_ARRAY) || reg->wrmask);
return (reg->flags & IR3_REG_SSA) && def_is_gpr(reg) &&
((reg->flags & IR3_REG_ARRAY) || reg->wrmask);
}
/* Iterators for sources and destinations which:
@@ -105,53 +109,54 @@ ra_reg_is_dst(const struct ir3_register *reg)
* - Consider array destinations as both a source and a destination
*/
#define ra_foreach_src(__srcreg, __instr) \
for (struct ir3_register *__srcreg = (void *)~0; __srcreg; __srcreg = NULL) \
for (unsigned __cnt = (__instr)->srcs_count, __i = 0; __i < __cnt; __i++) \
if (ra_reg_is_src((__srcreg = (__instr)->srcs[__i])))
#define ra_foreach_src(__srcreg, __instr) \
for (struct ir3_register *__srcreg = (void *)~0; __srcreg; __srcreg = NULL) \
for (unsigned __cnt = (__instr)->srcs_count, __i = 0; __i < __cnt; \
__i++) \
if (ra_reg_is_src((__srcreg = (__instr)->srcs[__i])))
#define ra_foreach_src_rev(__srcreg, __instr) \
for (struct ir3_register *__srcreg = (void *)~0; __srcreg; __srcreg = NULL) \
for (int __cnt = (__instr)->srcs_count, __i = __cnt - 1; __i >= 0; __i--) \
if (ra_reg_is_src((__srcreg = (__instr)->srcs[__i])))
#define ra_foreach_src_rev(__srcreg, __instr) \
for (struct ir3_register *__srcreg = (void *)~0; __srcreg; __srcreg = NULL) \
for (int __cnt = (__instr)->srcs_count, __i = __cnt - 1; __i >= 0; \
__i--) \
if (ra_reg_is_src((__srcreg = (__instr)->srcs[__i])))
#define ra_foreach_dst(__dstreg, __instr) \
for (struct ir3_register *__dstreg = (void *)~0; __dstreg; __dstreg = NULL) \
for (unsigned __cnt = (__instr)->dsts_count, __i = 0; __i < __cnt; __i++) \
if (ra_reg_is_dst((__dstreg = (__instr)->dsts[__i])))
#define ra_foreach_dst(__dstreg, __instr) \
for (struct ir3_register *__dstreg = (void *)~0; __dstreg; __dstreg = NULL) \
for (unsigned __cnt = (__instr)->dsts_count, __i = 0; __i < __cnt; \
__i++) \
if (ra_reg_is_dst((__dstreg = (__instr)->dsts[__i])))
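
As a usage sketch (count_gpr_operands is hypothetical, not part of this commit), a
consumer of these iterators only ever sees SSA sources whose def is a GPR and GPR
destinations with a non-empty write-mask:

static void
count_gpr_operands(struct ir3_instruction *instr, unsigned *nsrc, unsigned *ndst)
{
   ra_foreach_src (src, instr)
      (*nsrc)++; /* undef and non-GPR sources are skipped     */
   ra_foreach_dst (dst, instr)
      (*ndst)++; /* non-GPR or zero-wrmask dests are skipped  */
}
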
#define RA_HALF_SIZE (4 * 48)
#define RA_FULL_SIZE (4 * 48 * 2)
#define RA_SHARED_SIZE (2 * 4 * 8)
#define RA_HALF_SIZE (4 * 48)
#define RA_FULL_SIZE (4 * 48 * 2)
#define RA_SHARED_SIZE (2 * 4 * 8)
#define RA_MAX_FILE_SIZE RA_FULL_SIZE
struct ir3_liveness {
unsigned block_count;
DECLARE_ARRAY(struct ir3_register *, definitions);
DECLARE_ARRAY(BITSET_WORD *, live_out);
DECLARE_ARRAY(BITSET_WORD *, live_in);
unsigned block_count;
DECLARE_ARRAY(struct ir3_register *, definitions);
DECLARE_ARRAY(BITSET_WORD *, live_out);
DECLARE_ARRAY(BITSET_WORD *, live_in);
};
struct ir3_liveness *ir3_calc_liveness(struct ir3_shader_variant *v);
bool ir3_def_live_after(struct ir3_liveness *live, struct ir3_register *def,
struct ir3_instruction *instr);
struct ir3_instruction *instr);
void ir3_create_parallel_copies(struct ir3 *ir);
void ir3_merge_regs(struct ir3_liveness *live, struct ir3 *ir);
struct ir3_pressure {
unsigned full, half, shared;
unsigned full, half, shared;
};
void ir3_calc_pressure(struct ir3_shader_variant *v,
struct ir3_liveness *live,
struct ir3_pressure *max_pressure);
void ir3_calc_pressure(struct ir3_shader_variant *v, struct ir3_liveness *live,
struct ir3_pressure *max_pressure);
void ir3_ra_validate(struct ir3_shader_variant *v,
unsigned full_size, unsigned half_size, unsigned block_count);
void ir3_ra_validate(struct ir3_shader_variant *v, unsigned full_size,
unsigned half_size, unsigned block_count);
void ir3_lower_copies(struct ir3_shader_variant *v);
@@ -176,91 +181,90 @@ void ir3_lower_copies(struct ir3_shader_variant *v);
*/
struct ir3_reg_interval {
struct rb_node node;
struct rb_node node;
struct rb_tree children;
struct rb_tree children;
struct ir3_reg_interval *parent;
struct ir3_reg_interval *parent;
struct ir3_register *reg;
struct ir3_register *reg;
bool inserted;
bool inserted;
};
struct ir3_reg_ctx {
/* The tree of top-level intervals in the forest. */
struct rb_tree intervals;
/* The tree of top-level intervals in the forest. */
struct rb_tree intervals;
/* Users of ir3_reg_ctx need to keep around additional state that is
* modified when top-level intervals are added or removed. For register
* pressure tracking, this is just the register pressure, but for RA we
* need to keep track of the physreg of each top-level interval. These
* callbacks provide a place to let users deriving from ir3_reg_ctx update
* their state when top-level intervals are inserted/removed.
*/
/* Users of ir3_reg_ctx need to keep around additional state that is
* modified when top-level intervals are added or removed. For register
* pressure tracking, this is just the register pressure, but for RA we
* need to keep track of the physreg of each top-level interval. These
* callbacks provide a place to let users deriving from ir3_reg_ctx update
* their state when top-level intervals are inserted/removed.
*/
/* Called when an interval is added and it turns out to be at the top
* level.
*/
void (*interval_add)(struct ir3_reg_ctx *ctx,
struct ir3_reg_interval *interval);
/* Called when an interval is added and it turns out to be at the top
* level.
*/
void (*interval_add)(struct ir3_reg_ctx *ctx,
struct ir3_reg_interval *interval);
/* Called when an interval is deleted from the top level. */
void (*interval_delete)(struct ir3_reg_ctx *ctx,
struct ir3_reg_interval *interval);
/* Called when an interval is deleted from the top level. */
void (*interval_delete)(struct ir3_reg_ctx *ctx,
struct ir3_reg_interval *interval);
/* Called when an interval is deleted and its child becomes top-level.
*/
void (*interval_readd)(struct ir3_reg_ctx *ctx,
struct ir3_reg_interval *parent,
struct ir3_reg_interval *child);
/* Called when an interval is deleted and its child becomes top-level.
*/
void (*interval_readd)(struct ir3_reg_ctx *ctx,
struct ir3_reg_interval *parent,
struct ir3_reg_interval *child);
};
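
Concretely, a user embeds ir3_reg_ctx in its own context and recovers the outer struct
inside the callbacks with the usual container-of pattern; a minimal sketch (my_ctx and
my_interval_add are hypothetical, the pressure-tracking pass later in this change wires
itself up the same way):

struct my_ctx {
   struct ir3_reg_ctx reg_ctx;
   unsigned cur_size; /* example per-user state kept in sync by the callbacks */
};

static void
my_interval_add(struct ir3_reg_ctx *_ctx, struct ir3_reg_interval *interval)
{
   struct my_ctx *ctx = rb_node_data(struct my_ctx, _ctx, reg_ctx);
   ctx->cur_size += reg_size(interval->reg);
}
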
static inline struct ir3_reg_interval *
ir3_rb_node_to_interval(struct rb_node *node)
{
return rb_node_data(struct ir3_reg_interval, node, node);
return rb_node_data(struct ir3_reg_interval, node, node);
}
static inline const struct ir3_reg_interval *
ir3_rb_node_to_interval_const(const struct rb_node *node)
{
return rb_node_data(struct ir3_reg_interval, node, node);
return rb_node_data(struct ir3_reg_interval, node, node);
}
static inline struct ir3_reg_interval *
ir3_reg_interval_next(struct ir3_reg_interval *interval)
{
struct rb_node *next = rb_node_next(&interval->node);
return next ? ir3_rb_node_to_interval(next) : NULL;
struct rb_node *next = rb_node_next(&interval->node);
return next ? ir3_rb_node_to_interval(next) : NULL;
}
static inline struct ir3_reg_interval *
ir3_reg_interval_next_or_null(struct ir3_reg_interval *interval)
{
return interval ? ir3_reg_interval_next(interval) : NULL;
return interval ? ir3_reg_interval_next(interval) : NULL;
}
static inline void
ir3_reg_interval_init(struct ir3_reg_interval *interval, struct ir3_register *reg)
ir3_reg_interval_init(struct ir3_reg_interval *interval,
struct ir3_register *reg)
{
rb_tree_init(&interval->children);
interval->reg = reg;
interval->parent = NULL;
interval->inserted = false;
rb_tree_init(&interval->children);
interval->reg = reg;
interval->parent = NULL;
interval->inserted = false;
}
void
ir3_reg_interval_dump(struct ir3_reg_interval *interval);
void ir3_reg_interval_dump(struct ir3_reg_interval *interval);
void ir3_reg_interval_insert(struct ir3_reg_ctx *ctx,
struct ir3_reg_interval *interval);
struct ir3_reg_interval *interval);
void ir3_reg_interval_remove(struct ir3_reg_ctx *ctx,
struct ir3_reg_interval *interval);
struct ir3_reg_interval *interval);
void ir3_reg_interval_remove_all(struct ir3_reg_ctx *ctx,
struct ir3_reg_interval *interval);
struct ir3_reg_interval *interval);
#endif


@@ -73,59 +73,61 @@
*/
#define UNKNOWN ((struct ir3_register *)NULL)
#define UNDEF ((struct ir3_register *)(uintptr_t)1)
#define UNDEF ((struct ir3_register *)(uintptr_t)1)
#define OVERDEF ((struct ir3_register *)(uintptr_t)2)
struct reg_state {
struct ir3_register *def;
unsigned offset;
struct ir3_register *def;
unsigned offset;
};
struct file_state {
struct reg_state regs[RA_MAX_FILE_SIZE];
struct reg_state regs[RA_MAX_FILE_SIZE];
};
struct reaching_state {
struct file_state half, full, shared;
struct file_state half, full, shared;
};
struct ra_val_ctx {
struct ir3_instruction *current_instr;
struct ir3_instruction *current_instr;
struct reaching_state reaching;
struct reaching_state *block_reaching;
unsigned block_count;
struct reaching_state reaching;
struct reaching_state *block_reaching;
unsigned block_count;
unsigned full_size, half_size;
unsigned full_size, half_size;
bool merged_regs;
bool merged_regs;
bool failed;
bool failed;
};
static void
validate_error(struct ra_val_ctx *ctx, const char *condstr)
{
fprintf(stderr, "ra validation fail: %s\n", condstr);
fprintf(stderr, " -> for instruction: ");
ir3_print_instr(ctx->current_instr);
abort();
fprintf(stderr, "ra validation fail: %s\n", condstr);
fprintf(stderr, " -> for instruction: ");
ir3_print_instr(ctx->current_instr);
abort();
}
#define validate_assert(ctx, cond) do { \
if (!(cond)) { \
validate_error(ctx, #cond); \
} } while (0)
#define validate_assert(ctx, cond) \
do { \
if (!(cond)) { \
validate_error(ctx, #cond); \
} \
} while (0)
static unsigned
get_file_size(struct ra_val_ctx *ctx, struct ir3_register *reg)
{
if (reg->flags & IR3_REG_SHARED)
return RA_SHARED_SIZE;
else if (ctx->merged_regs || !(reg->flags & IR3_REG_HALF))
return ctx->full_size;
else
return ctx->half_size;
if (reg->flags & IR3_REG_SHARED)
return RA_SHARED_SIZE;
else if (ctx->merged_regs || !(reg->flags & IR3_REG_HALF))
return ctx->full_size;
else
return ctx->half_size;
}
/* Validate simple things, like the registers being in-bounds. This way we
@@ -135,438 +137,434 @@ get_file_size(struct ra_val_ctx *ctx, struct ir3_register *reg)
static void
validate_simple(struct ra_val_ctx *ctx, struct ir3_instruction *instr)
{
ctx->current_instr = instr;
ra_foreach_dst (dst, instr) {
unsigned dst_max = ra_reg_get_physreg(dst) + reg_size(dst);
validate_assert(ctx, dst_max <= get_file_size(ctx, dst));
if (dst->tied)
validate_assert(ctx, ra_reg_get_num(dst) == ra_reg_get_num(dst->tied));
}
ctx->current_instr = instr;
ra_foreach_dst (dst, instr) {
unsigned dst_max = ra_reg_get_physreg(dst) + reg_size(dst);
validate_assert(ctx, dst_max <= get_file_size(ctx, dst));
if (dst->tied)
validate_assert(ctx, ra_reg_get_num(dst) == ra_reg_get_num(dst->tied));
}
ra_foreach_src (src, instr) {
unsigned src_max = ra_reg_get_physreg(src) + reg_size(src);
validate_assert(ctx, src_max <= get_file_size(ctx, src));
}
ra_foreach_src (src, instr) {
unsigned src_max = ra_reg_get_physreg(src) + reg_size(src);
validate_assert(ctx, src_max <= get_file_size(ctx, src));
}
}
/* This is the lattice operator. */
static bool
merge_reg(struct reg_state *dst, const struct reg_state *src)
{
if (dst->def == UNKNOWN) {
*dst = *src;
return src->def != UNKNOWN;
} else if (dst->def == OVERDEF) {
return false;
} else {
if (src->def == UNKNOWN)
return false;
else if (src->def == OVERDEF) {
*dst = *src;
return true;
} else {
if (dst->def != src->def || dst->offset != src->offset) {
dst->def = OVERDEF;
dst->offset = 0;
return true;
} else {
return false;
}
}
}
if (dst->def == UNKNOWN) {
*dst = *src;
return src->def != UNKNOWN;
} else if (dst->def == OVERDEF) {
return false;
} else {
if (src->def == UNKNOWN)
return false;
else if (src->def == OVERDEF) {
*dst = *src;
return true;
} else {
if (dst->def != src->def || dst->offset != src->offset) {
dst->def = OVERDEF;
dst->offset = 0;
return true;
} else {
return false;
}
}
}
}
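
The states order as UNKNOWN below any single (def, offset) below OVERDEF: merging into an
unknown slot copies the incoming state, two distinct reaching definitions (or the same
definition at different offsets) collapse to OVERDEF, and OVERDEF absorbs everything. A
sketch of the call pattern (merge_reg_example and its two defs are hypothetical):

static void
merge_reg_example(struct ir3_register *def_a, struct ir3_register *def_b)
{
   struct reg_state dst = {.def = UNKNOWN};
   struct reg_state src = {.def = def_a, .offset = 0};

   merge_reg(&dst, &src); /* dst becomes (def_a, 0), returns true (progress) */
   merge_reg(&dst, &src); /* same state merged again, returns false          */

   src.def = def_b;
   merge_reg(&dst, &src); /* a second def reaches -> OVERDEF, returns true   */
}
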
static bool
merge_file(struct file_state *dst, const struct file_state *src, unsigned size)
{
bool progress = false;
for (unsigned i = 0; i < size; i++)
progress |= merge_reg(&dst->regs[i], &src->regs[i]);
return progress;
bool progress = false;
for (unsigned i = 0; i < size; i++)
progress |= merge_reg(&dst->regs[i], &src->regs[i]);
return progress;
}
static bool
merge_state(struct ra_val_ctx *ctx, struct reaching_state *dst,
const struct reaching_state *src)
const struct reaching_state *src)
{
bool progress = false;
progress |= merge_file(&dst->full, &src->full, ctx->full_size);
progress |= merge_file(&dst->half, &src->half, ctx->half_size);
return progress;
bool progress = false;
progress |= merge_file(&dst->full, &src->full, ctx->full_size);
progress |= merge_file(&dst->half, &src->half, ctx->half_size);
return progress;
}
static bool
merge_state_physical(struct ra_val_ctx *ctx, struct reaching_state *dst,
const struct reaching_state *src)
const struct reaching_state *src)
{
return merge_file(&dst->shared, &src->shared, RA_SHARED_SIZE);
return merge_file(&dst->shared, &src->shared, RA_SHARED_SIZE);
}
static struct file_state *
ra_val_get_file(struct ra_val_ctx *ctx, struct ir3_register *reg)
{
if (reg->flags & IR3_REG_SHARED)
return &ctx->reaching.shared;
else if (ctx->merged_regs || !(reg->flags & IR3_REG_HALF))
return &ctx->reaching.full;
else
return &ctx->reaching.half;
if (reg->flags & IR3_REG_SHARED)
return &ctx->reaching.shared;
else if (ctx->merged_regs || !(reg->flags & IR3_REG_HALF))
return &ctx->reaching.full;
else
return &ctx->reaching.half;
}
static void
propagate_normal_instr(struct ra_val_ctx *ctx, struct ir3_instruction *instr)
{
ra_foreach_dst (dst, instr) {
struct file_state *file = ra_val_get_file(ctx, dst);
physreg_t physreg = ra_reg_get_physreg(dst);
for (unsigned i = 0; i < reg_size(dst); i++) {
file->regs[physreg + i] = (struct reg_state) {
.def = dst,
.offset = i,
};
}
}
ra_foreach_dst (dst, instr) {
struct file_state *file = ra_val_get_file(ctx, dst);
physreg_t physreg = ra_reg_get_physreg(dst);
for (unsigned i = 0; i < reg_size(dst); i++) {
file->regs[physreg + i] = (struct reg_state){
.def = dst,
.offset = i,
};
}
}
}
static void
propagate_split(struct ra_val_ctx *ctx, struct ir3_instruction *split)
{
struct ir3_register *dst = split->dsts[0];
struct ir3_register *src = split->srcs[0];
physreg_t dst_physreg = ra_reg_get_physreg(dst);
physreg_t src_physreg = ra_reg_get_physreg(src);
struct file_state *file = ra_val_get_file(ctx, dst);
struct ir3_register *dst = split->dsts[0];
struct ir3_register *src = split->srcs[0];
physreg_t dst_physreg = ra_reg_get_physreg(dst);
physreg_t src_physreg = ra_reg_get_physreg(src);
struct file_state *file = ra_val_get_file(ctx, dst);
unsigned offset = split->split.off * reg_elem_size(src);
for (unsigned i = 0; i < reg_elem_size(src); i++) {
file->regs[dst_physreg + i] = file->regs[src_physreg + offset + i];
}
unsigned offset = split->split.off * reg_elem_size(src);
for (unsigned i = 0; i < reg_elem_size(src); i++) {
file->regs[dst_physreg + i] = file->regs[src_physreg + offset + i];
}
}
static void
propagate_collect(struct ra_val_ctx *ctx, struct ir3_instruction *collect)
{
struct ir3_register *dst = collect->dsts[0];
physreg_t dst_physreg = ra_reg_get_physreg(dst);
struct file_state *file = ra_val_get_file(ctx, dst);
struct ir3_register *dst = collect->dsts[0];
physreg_t dst_physreg = ra_reg_get_physreg(dst);
struct file_state *file = ra_val_get_file(ctx, dst);
unsigned size = reg_size(dst);
struct reg_state srcs[size];
unsigned size = reg_size(dst);
struct reg_state srcs[size];
for (unsigned i = 0; i < collect->srcs_count; i++) {
struct ir3_register *src = collect->srcs[i];
unsigned dst_offset = i * reg_elem_size(dst);
for (unsigned j = 0; j < reg_elem_size(dst); j++) {
if (!ra_reg_is_src(src)) {
srcs[dst_offset + j] = (struct reg_state) {
.def = dst,
.offset = dst_offset + j,
};
} else {
physreg_t src_physreg = ra_reg_get_physreg(src);
srcs[dst_offset + j] = file->regs[src_physreg + j];
}
}
}
for (unsigned i = 0; i < collect->srcs_count; i++) {
struct ir3_register *src = collect->srcs[i];
unsigned dst_offset = i * reg_elem_size(dst);
for (unsigned j = 0; j < reg_elem_size(dst); j++) {
if (!ra_reg_is_src(src)) {
srcs[dst_offset + j] = (struct reg_state){
.def = dst,
.offset = dst_offset + j,
};
} else {
physreg_t src_physreg = ra_reg_get_physreg(src);
srcs[dst_offset + j] = file->regs[src_physreg + j];
}
}
}
for (unsigned i = 0; i < size; i++)
file->regs[dst_physreg + i] = srcs[i];
for (unsigned i = 0; i < size; i++)
file->regs[dst_physreg + i] = srcs[i];
}
static void
propagate_parallelcopy(struct ra_val_ctx *ctx, struct ir3_instruction *pcopy)
{
unsigned size = 0;
for (unsigned i = 0; i < pcopy->dsts_count; i++) {
size += reg_size(pcopy->srcs[i]);
}
unsigned size = 0;
for (unsigned i = 0; i < pcopy->dsts_count; i++) {
size += reg_size(pcopy->srcs[i]);
}
struct reg_state srcs[size];
struct reg_state srcs[size];
unsigned offset = 0;
for (unsigned i = 0; i < pcopy->srcs_count; i++) {
struct ir3_register *dst = pcopy->dsts[i];
struct ir3_register *src = pcopy->srcs[i];
struct file_state *file = ra_val_get_file(ctx, dst);
unsigned offset = 0;
for (unsigned i = 0; i < pcopy->srcs_count; i++) {
struct ir3_register *dst = pcopy->dsts[i];
struct ir3_register *src = pcopy->srcs[i];
struct file_state *file = ra_val_get_file(ctx, dst);
for (unsigned j = 0; j < reg_size(dst); j++) {
if (src->flags & (IR3_REG_IMMED | IR3_REG_CONST)) {
srcs[offset + j] = (struct reg_state) {
.def = dst,
.offset = j,
};
} else {
physreg_t src_physreg = ra_reg_get_physreg(src);
srcs[offset + j] = file->regs[src_physreg + j];
}
}
for (unsigned j = 0; j < reg_size(dst); j++) {
if (src->flags & (IR3_REG_IMMED | IR3_REG_CONST)) {
srcs[offset + j] = (struct reg_state){
.def = dst,
.offset = j,
};
} else {
physreg_t src_physreg = ra_reg_get_physreg(src);
srcs[offset + j] = file->regs[src_physreg + j];
}
}
offset += reg_size(dst);
}
assert(offset == size);
offset += reg_size(dst);
}
assert(offset == size);
offset = 0;
for (unsigned i = 0; i < pcopy->dsts_count; i++) {
struct ir3_register *dst = pcopy->dsts[i];
physreg_t dst_physreg = ra_reg_get_physreg(dst);
struct file_state *file = ra_val_get_file(ctx, dst);
offset = 0;
for (unsigned i = 0; i < pcopy->dsts_count; i++) {
struct ir3_register *dst = pcopy->dsts[i];
physreg_t dst_physreg = ra_reg_get_physreg(dst);
struct file_state *file = ra_val_get_file(ctx, dst);
for (unsigned j = 0; j < reg_size(dst); j++)
file->regs[dst_physreg + j] = srcs[offset + j];
for (unsigned j = 0; j < reg_size(dst); j++)
file->regs[dst_physreg + j] = srcs[offset + j];
offset += reg_size(dst);
}
assert(offset == size);
offset += reg_size(dst);
}
assert(offset == size);
}
static void
propagate_instr(struct ra_val_ctx *ctx, struct ir3_instruction *instr)
{
if (instr->opc == OPC_META_SPLIT)
propagate_split(ctx, instr);
else if (instr->opc == OPC_META_COLLECT)
propagate_collect(ctx, instr);
else if (instr->opc == OPC_META_PARALLEL_COPY)
propagate_parallelcopy(ctx, instr);
else
propagate_normal_instr(ctx, instr);
if (instr->opc == OPC_META_SPLIT)
propagate_split(ctx, instr);
else if (instr->opc == OPC_META_COLLECT)
propagate_collect(ctx, instr);
else if (instr->opc == OPC_META_PARALLEL_COPY)
propagate_parallelcopy(ctx, instr);
else
propagate_normal_instr(ctx, instr);
}
static bool
propagate_block(struct ra_val_ctx *ctx, struct ir3_block *block)
{
ctx->reaching = ctx->block_reaching[block->index];
ctx->reaching = ctx->block_reaching[block->index];
foreach_instr (instr, &block->instr_list) {
propagate_instr(ctx, instr);
}
foreach_instr (instr, &block->instr_list) {
propagate_instr(ctx, instr);
}
bool progress = false;
for (unsigned i = 0; i < 2; i++) {
struct ir3_block *succ = block->successors[i];
if (!succ)
continue;
progress |= merge_state(ctx,
&ctx->block_reaching[succ->index],
&ctx->reaching);
}
for (unsigned i = 0; i < 2; i++) {
struct ir3_block *succ = block->physical_successors[i];
if (!succ)
continue;
progress |= merge_state_physical(ctx,
&ctx->block_reaching[succ->index],
&ctx->reaching);
}
return progress;
bool progress = false;
for (unsigned i = 0; i < 2; i++) {
struct ir3_block *succ = block->successors[i];
if (!succ)
continue;
progress |=
merge_state(ctx, &ctx->block_reaching[succ->index], &ctx->reaching);
}
for (unsigned i = 0; i < 2; i++) {
struct ir3_block *succ = block->physical_successors[i];
if (!succ)
continue;
progress |= merge_state_physical(ctx, &ctx->block_reaching[succ->index],
&ctx->reaching);
}
return progress;
}
static void
chase_definition(struct reg_state *state)
{
while (true) {
struct ir3_instruction *instr = state->def->instr;
switch (instr->opc) {
case OPC_META_SPLIT: {
struct ir3_register *new_def = instr->srcs[0]->def;
unsigned offset = instr->split.off * reg_elem_size(new_def);
*state = (struct reg_state) {
.def = new_def,
.offset = state->offset + offset,
};
break;
}
case OPC_META_COLLECT: {
unsigned src_idx = state->offset / reg_elem_size(state->def);
unsigned src_offset = state->offset % reg_elem_size(state->def);
struct ir3_register *new_def = instr->srcs[src_idx]->def;
if (new_def) {
*state = (struct reg_state) {
.def = new_def,
.offset = src_offset,
};
} else {
/* Bail on immed/const */
return;
}
break;
}
case OPC_META_PARALLEL_COPY: {
unsigned dst_idx = ~0;
for (unsigned i = 0; i < instr->dsts_count; i++) {
if (instr->dsts[i] == state->def) {
dst_idx = i;
break;
}
}
assert(dst_idx != ~0);
while (true) {
struct ir3_instruction *instr = state->def->instr;
switch (instr->opc) {
case OPC_META_SPLIT: {
struct ir3_register *new_def = instr->srcs[0]->def;
unsigned offset = instr->split.off * reg_elem_size(new_def);
*state = (struct reg_state){
.def = new_def,
.offset = state->offset + offset,
};
break;
}
case OPC_META_COLLECT: {
unsigned src_idx = state->offset / reg_elem_size(state->def);
unsigned src_offset = state->offset % reg_elem_size(state->def);
struct ir3_register *new_def = instr->srcs[src_idx]->def;
if (new_def) {
*state = (struct reg_state){
.def = new_def,
.offset = src_offset,
};
} else {
/* Bail on immed/const */
return;
}
break;
}
case OPC_META_PARALLEL_COPY: {
unsigned dst_idx = ~0;
for (unsigned i = 0; i < instr->dsts_count; i++) {
if (instr->dsts[i] == state->def) {
dst_idx = i;
break;
}
}
assert(dst_idx != ~0);
struct ir3_register *new_def = instr->srcs[dst_idx]->def;
if (new_def) {
state->def = new_def;
} else {
/* Bail on immed/const */
return;
}
break;
}
default:
return;
}
}
struct ir3_register *new_def = instr->srcs[dst_idx]->def;
if (new_def) {
state->def = new_def;
} else {
/* Bail on immed/const */
return;
}
break;
}
default:
return;
}
}
}
static void
dump_reg_state(struct reg_state *state)
{
if (state->def == UNDEF) {
fprintf(stderr, "no reaching definition");
} else if (state->def == OVERDEF) {
fprintf(stderr, "more than one reaching definition or partial definition");
} else {
/* The analysis should always remove UNKNOWN eventually. */
assert(state->def != UNKNOWN);
if (state->def == UNDEF) {
fprintf(stderr, "no reaching definition");
} else if (state->def == OVERDEF) {
fprintf(stderr,
"more than one reaching definition or partial definition");
} else {
/* The analysis should always remove UNKNOWN eventually. */
assert(state->def != UNKNOWN);
fprintf(stderr, "ssa_%u:%u(%sr%u.%c) + %u",
state->def->instr->serialno, state->def->name,
(state->def->flags & IR3_REG_HALF) ? "h" : "",
state->def->num / 4, "xyzw"[state->def->num % 4],
state->offset);
}
fprintf(stderr, "ssa_%u:%u(%sr%u.%c) + %u", state->def->instr->serialno,
state->def->name, (state->def->flags & IR3_REG_HALF) ? "h" : "",
state->def->num / 4, "xyzw"[state->def->num % 4],
            state->offset);
}
}
static void
check_reaching_src(struct ra_val_ctx *ctx, struct ir3_instruction *instr,
struct ir3_register *src)
struct ir3_register *src)
{
struct file_state *file = ra_val_get_file(ctx, src);
physreg_t physreg = ra_reg_get_physreg(src);
for (unsigned i = 0; i < reg_size(src); i++) {
struct reg_state expected = (struct reg_state) {
.def = src->def,
.offset = i,
};
chase_definition(&expected);
struct file_state *file = ra_val_get_file(ctx, src);
physreg_t physreg = ra_reg_get_physreg(src);
for (unsigned i = 0; i < reg_size(src); i++) {
struct reg_state expected = (struct reg_state){
.def = src->def,
.offset = i,
};
chase_definition(&expected);
struct reg_state actual = file->regs[physreg + i];
struct reg_state actual = file->regs[physreg + i];
if (expected.def != actual.def ||
expected.offset != actual.offset) {
fprintf(stderr, "ra validation fail: wrong definition reaches source ssa_%u:%u + %u\n",
src->def->instr->serialno, src->def->name, i);
fprintf(stderr, "expected: ");
dump_reg_state(&expected);
fprintf(stderr, "\n");
fprintf(stderr, "actual: ");
dump_reg_state(&actual);
fprintf(stderr, "\n");
fprintf(stderr, "-> for instruction: ");
ir3_print_instr(instr);
ctx->failed = true;
}
}
if (expected.def != actual.def || expected.offset != actual.offset) {
fprintf(
stderr,
"ra validation fail: wrong definition reaches source ssa_%u:%u + %u\n",
src->def->instr->serialno, src->def->name, i);
fprintf(stderr, "expected: ");
dump_reg_state(&expected);
fprintf(stderr, "\n");
fprintf(stderr, "actual: ");
dump_reg_state(&actual);
fprintf(stderr, "\n");
fprintf(stderr, "-> for instruction: ");
ir3_print_instr(instr);
ctx->failed = true;
}
}
}
static void
check_reaching_instr(struct ra_val_ctx *ctx, struct ir3_instruction *instr)
{
if (instr->opc == OPC_META_SPLIT ||
instr->opc == OPC_META_COLLECT ||
instr->opc == OPC_META_PARALLEL_COPY ||
instr->opc == OPC_META_PHI) {
return;
}
if (instr->opc == OPC_META_SPLIT || instr->opc == OPC_META_COLLECT ||
instr->opc == OPC_META_PARALLEL_COPY || instr->opc == OPC_META_PHI) {
return;
}
ra_foreach_src (src, instr) {
check_reaching_src(ctx, instr, src);
}
ra_foreach_src (src, instr) {
check_reaching_src(ctx, instr, src);
}
}
static void
check_reaching_block(struct ra_val_ctx *ctx, struct ir3_block *block)
{
ctx->reaching = ctx->block_reaching[block->index];
ctx->reaching = ctx->block_reaching[block->index];
foreach_instr (instr, &block->instr_list) {
check_reaching_instr(ctx, instr);
propagate_instr(ctx, instr);
}
foreach_instr (instr, &block->instr_list) {
check_reaching_instr(ctx, instr);
propagate_instr(ctx, instr);
}
for (unsigned i = 0; i < 2; i++) {
struct ir3_block *succ = block->successors[i];
if (!succ)
continue;
for (unsigned i = 0; i < 2; i++) {
struct ir3_block *succ = block->successors[i];
if (!succ)
continue;
unsigned pred_idx = ir3_block_get_pred_index(succ, block);
foreach_instr (instr, &succ->instr_list) {
if (instr->opc != OPC_META_PHI)
break;
if (instr->srcs[pred_idx]->def)
check_reaching_src(ctx, instr, instr->srcs[pred_idx]);
}
}
unsigned pred_idx = ir3_block_get_pred_index(succ, block);
foreach_instr (instr, &succ->instr_list) {
if (instr->opc != OPC_META_PHI)
break;
if (instr->srcs[pred_idx]->def)
check_reaching_src(ctx, instr, instr->srcs[pred_idx]);
}
}
}
static void
check_reaching_defs(struct ra_val_ctx *ctx, struct ir3 *ir)
{
ctx->block_reaching =
rzalloc_array(ctx, struct reaching_state, ctx->block_count);
ctx->block_reaching =
rzalloc_array(ctx, struct reaching_state, ctx->block_count);
struct reaching_state *start = &ctx->block_reaching[0];
for (unsigned i = 0; i < ctx->full_size; i++)
start->full.regs[i].def = UNDEF;
for (unsigned i = 0; i < ctx->half_size; i++)
start->half.regs[i].def = UNDEF;
for (unsigned i = 0; i < RA_SHARED_SIZE; i++)
start->shared.regs[i].def = UNDEF;
struct reaching_state *start = &ctx->block_reaching[0];
for (unsigned i = 0; i < ctx->full_size; i++)
start->full.regs[i].def = UNDEF;
for (unsigned i = 0; i < ctx->half_size; i++)
start->half.regs[i].def = UNDEF;
for (unsigned i = 0; i < RA_SHARED_SIZE; i++)
start->shared.regs[i].def = UNDEF;
bool progress;
do {
progress = false;
foreach_block (block, &ir->block_list) {
progress |= propagate_block(ctx, block);
}
} while (progress);
bool progress;
do {
progress = false;
foreach_block (block, &ir->block_list) {
progress |= propagate_block(ctx, block);
}
} while (progress);
foreach_block (block, &ir->block_list) {
check_reaching_block(ctx, block);
}
foreach_block (block, &ir->block_list) {
check_reaching_block(ctx, block);
}
if (ctx->failed) {
fprintf(stderr, "failing shader:\n");
ir3_print(ir);
abort();
}
if (ctx->failed) {
fprintf(stderr, "failing shader:\n");
ir3_print(ir);
abort();
}
}
void
ir3_ra_validate(struct ir3_shader_variant *v,
unsigned full_size, unsigned half_size, unsigned block_count)
ir3_ra_validate(struct ir3_shader_variant *v, unsigned full_size,
unsigned half_size, unsigned block_count)
{
#ifdef NDEBUG
# define VALIDATE 0
#define VALIDATE 0
#else
# define VALIDATE 1
#define VALIDATE 1
#endif
if (!VALIDATE)
return;
if (!VALIDATE)
return;
struct ra_val_ctx *ctx = rzalloc(NULL, struct ra_val_ctx);
ctx->merged_regs = v->mergedregs;
ctx->full_size = full_size;
ctx->half_size = half_size;
ctx->block_count = block_count;
struct ra_val_ctx *ctx = rzalloc(NULL, struct ra_val_ctx);
ctx->merged_regs = v->mergedregs;
ctx->full_size = full_size;
ctx->half_size = half_size;
ctx->block_count = block_count;
foreach_block (block, &v->ir->block_list) {
foreach_instr (instr, &block->instr_list) {
validate_simple(ctx, instr);
}
}
foreach_block (block, &v->ir->block_list) {
foreach_instr (instr, &block->instr_list) {
validate_simple(ctx, instr);
}
}
check_reaching_defs(ctx, v->ir);
check_reaching_defs(ctx, v->ir);
ralloc_free(ctx);
ralloc_free(ctx);
}


@@ -21,9 +21,9 @@
* SOFTWARE.
*/
#include "util/rb_tree.h"
#include "ir3_ra.h"
#include "ir3_shader.h"
#include "util/rb_tree.h"
/*
* This pass does one thing so far:
@@ -36,326 +36,330 @@
*/
struct ra_spill_interval {
struct ir3_reg_interval interval;
struct ir3_reg_interval interval;
};
struct ra_spill_ctx {
struct ir3_reg_ctx reg_ctx;
struct ir3_reg_ctx reg_ctx;
struct ra_spill_interval *intervals;
struct ra_spill_interval *intervals;
struct ir3_pressure cur_pressure, max_pressure;
struct ir3_pressure cur_pressure, max_pressure;
struct ir3_liveness *live;
struct ir3_liveness *live;
const struct ir3_compiler *compiler;
const struct ir3_compiler *compiler;
};
static void
ra_spill_interval_init(struct ra_spill_interval *interval, struct ir3_register *reg)
ra_spill_interval_init(struct ra_spill_interval *interval,
struct ir3_register *reg)
{
ir3_reg_interval_init(&interval->interval, reg);
ir3_reg_interval_init(&interval->interval, reg);
}
static void
ra_pressure_add(struct ir3_pressure *pressure, struct ra_spill_interval *interval)
ra_pressure_add(struct ir3_pressure *pressure,
struct ra_spill_interval *interval)
{
unsigned size = reg_size(interval->interval.reg);
if (interval->interval.reg->flags & IR3_REG_SHARED)
pressure->shared += size;
else if (interval->interval.reg->flags & IR3_REG_HALF)
pressure->half += size;
else
pressure->full += size;
unsigned size = reg_size(interval->interval.reg);
if (interval->interval.reg->flags & IR3_REG_SHARED)
pressure->shared += size;
else if (interval->interval.reg->flags & IR3_REG_HALF)
pressure->half += size;
else
pressure->full += size;
}
static void
ra_pressure_sub(struct ir3_pressure *pressure, struct ra_spill_interval *interval)
ra_pressure_sub(struct ir3_pressure *pressure,
struct ra_spill_interval *interval)
{
unsigned size = reg_size(interval->interval.reg);
if (interval->interval.reg->flags & IR3_REG_SHARED)
pressure->shared -= size;
else if (interval->interval.reg->flags & IR3_REG_HALF)
pressure->half -= size;
else
pressure->full -= size;
unsigned size = reg_size(interval->interval.reg);
if (interval->interval.reg->flags & IR3_REG_SHARED)
pressure->shared -= size;
else if (interval->interval.reg->flags & IR3_REG_HALF)
pressure->half -= size;
else
pressure->full -= size;
}
static struct ra_spill_interval *
ir3_reg_interval_to_interval(struct ir3_reg_interval *interval)
{
return rb_node_data(struct ra_spill_interval, interval, interval);
return rb_node_data(struct ra_spill_interval, interval, interval);
}
static struct ra_spill_ctx *
ir3_reg_ctx_to_ctx(struct ir3_reg_ctx *ctx)
{
return rb_node_data(struct ra_spill_ctx, ctx, reg_ctx);
return rb_node_data(struct ra_spill_ctx, ctx, reg_ctx);
}
static void
interval_add(struct ir3_reg_ctx *_ctx, struct ir3_reg_interval *_interval)
{
struct ra_spill_interval *interval = ir3_reg_interval_to_interval(_interval);
struct ra_spill_ctx *ctx = ir3_reg_ctx_to_ctx(_ctx);
struct ra_spill_interval *interval = ir3_reg_interval_to_interval(_interval);
struct ra_spill_ctx *ctx = ir3_reg_ctx_to_ctx(_ctx);
ra_pressure_add(&ctx->cur_pressure, interval);
ra_pressure_add(&ctx->cur_pressure, interval);
}
static void
interval_delete(struct ir3_reg_ctx *_ctx, struct ir3_reg_interval *_interval)
{
struct ra_spill_interval *interval = ir3_reg_interval_to_interval(_interval);
struct ra_spill_ctx *ctx = ir3_reg_ctx_to_ctx(_ctx);
struct ra_spill_interval *interval = ir3_reg_interval_to_interval(_interval);
struct ra_spill_ctx *ctx = ir3_reg_ctx_to_ctx(_ctx);
ra_pressure_sub(&ctx->cur_pressure, interval);
ra_pressure_sub(&ctx->cur_pressure, interval);
}
static void
interval_readd(struct ir3_reg_ctx *_ctx, struct ir3_reg_interval *_parent,
struct ir3_reg_interval *_child)
struct ir3_reg_interval *_child)
{
interval_add(_ctx, _child);
interval_add(_ctx, _child);
}
static void
spill_ctx_init(struct ra_spill_ctx *ctx)
{
rb_tree_init(&ctx->reg_ctx.intervals);
ctx->reg_ctx.interval_add = interval_add;
ctx->reg_ctx.interval_delete = interval_delete;
ctx->reg_ctx.interval_readd = interval_readd;
rb_tree_init(&ctx->reg_ctx.intervals);
ctx->reg_ctx.interval_add = interval_add;
ctx->reg_ctx.interval_delete = interval_delete;
ctx->reg_ctx.interval_readd = interval_readd;
}
static void
ra_spill_ctx_insert(struct ra_spill_ctx *ctx, struct ra_spill_interval *interval)
ra_spill_ctx_insert(struct ra_spill_ctx *ctx,
struct ra_spill_interval *interval)
{
ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval);
ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval);
}
static void
ra_spill_ctx_remove(struct ra_spill_ctx *ctx, struct ra_spill_interval *interval)
ra_spill_ctx_remove(struct ra_spill_ctx *ctx,
struct ra_spill_interval *interval)
{
ir3_reg_interval_remove(&ctx->reg_ctx, &interval->interval);
ir3_reg_interval_remove(&ctx->reg_ctx, &interval->interval);
}
static void
init_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst)
{
struct ra_spill_interval *interval = &ctx->intervals[dst->name];
ra_spill_interval_init(interval, dst);
struct ra_spill_interval *interval = &ctx->intervals[dst->name];
ra_spill_interval_init(interval, dst);
}
static void
insert_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst)
{
struct ra_spill_interval *interval = &ctx->intervals[dst->name];
if (interval->interval.inserted)
return;
struct ra_spill_interval *interval = &ctx->intervals[dst->name];
if (interval->interval.inserted)
return;
ra_spill_ctx_insert(ctx, interval);
ra_spill_ctx_insert(ctx, interval);
/* For precolored inputs, make sure we leave enough registers to allow for
* holes in the inputs. It can happen that the binning shader has a lower
* register pressure than the main shader, but the main shader decided to
* add holes between the inputs which means that the binning shader has a
* higher register demand.
*/
if (dst->instr->opc == OPC_META_INPUT &&
dst->num != INVALID_REG) {
physreg_t physreg = ra_reg_get_physreg(dst);
physreg_t max = physreg + reg_size(dst);
/* For precolored inputs, make sure we leave enough registers to allow for
* holes in the inputs. It can happen that the binning shader has a lower
* register pressure than the main shader, but the main shader decided to
* add holes between the inputs which means that the binning shader has a
* higher register demand.
*/
if (dst->instr->opc == OPC_META_INPUT && dst->num != INVALID_REG) {
physreg_t physreg = ra_reg_get_physreg(dst);
physreg_t max = physreg + reg_size(dst);
if (interval->interval.reg->flags & IR3_REG_SHARED)
ctx->max_pressure.shared = MAX2(ctx->max_pressure.shared, max);
else if (interval->interval.reg->flags & IR3_REG_HALF)
ctx->max_pressure.half = MAX2(ctx->max_pressure.half, max);
else
ctx->max_pressure.full = MAX2(ctx->max_pressure.full, max);
}
if (interval->interval.reg->flags & IR3_REG_SHARED)
ctx->max_pressure.shared = MAX2(ctx->max_pressure.shared, max);
else if (interval->interval.reg->flags & IR3_REG_HALF)
ctx->max_pressure.half = MAX2(ctx->max_pressure.half, max);
else
ctx->max_pressure.full = MAX2(ctx->max_pressure.full, max);
}
}
static void
remove_src_early(struct ra_spill_ctx *ctx, struct ir3_instruction *instr, struct ir3_register *src)
remove_src_early(struct ra_spill_ctx *ctx, struct ir3_instruction *instr,
struct ir3_register *src)
{
if (!(src->flags & IR3_REG_FIRST_KILL))
return;
if (!(src->flags & IR3_REG_FIRST_KILL))
return;
struct ra_spill_interval *interval = &ctx->intervals[src->def->name];
struct ra_spill_interval *interval = &ctx->intervals[src->def->name];
if (!interval->interval.inserted || interval->interval.parent ||
!rb_tree_is_empty(&interval->interval.children))
return;
if (!interval->interval.inserted || interval->interval.parent ||
!rb_tree_is_empty(&interval->interval.children))
return;
ra_spill_ctx_remove(ctx, interval);
ra_spill_ctx_remove(ctx, interval);
}
static void
remove_src(struct ra_spill_ctx *ctx, struct ir3_instruction *instr, struct ir3_register *src)
remove_src(struct ra_spill_ctx *ctx, struct ir3_instruction *instr,
struct ir3_register *src)
{
if (!(src->flags & IR3_REG_FIRST_KILL))
return;
if (!(src->flags & IR3_REG_FIRST_KILL))
return;
struct ra_spill_interval *interval = &ctx->intervals[src->def->name];
struct ra_spill_interval *interval = &ctx->intervals[src->def->name];
if (!interval->interval.inserted)
return;
if (!interval->interval.inserted)
return;
ra_spill_ctx_remove(ctx, interval);
ra_spill_ctx_remove(ctx, interval);
}
static void
remove_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst)
{
struct ra_spill_interval *interval = &ctx->intervals[dst->name];
struct ra_spill_interval *interval = &ctx->intervals[dst->name];
if (!interval->interval.inserted)
return;
if (!interval->interval.inserted)
return;
ra_spill_ctx_remove(ctx, interval);
ra_spill_ctx_remove(ctx, interval);
}
static void
update_max_pressure(struct ra_spill_ctx *ctx)
{
d("pressure:");
d("\tfull: %u", ctx->cur_pressure.full);
d("\thalf: %u", ctx->cur_pressure.half);
d("\tshared: %u", ctx->cur_pressure.shared);
d("pressure:");
d("\tfull: %u", ctx->cur_pressure.full);
d("\thalf: %u", ctx->cur_pressure.half);
d("\tshared: %u", ctx->cur_pressure.shared);
ctx->max_pressure.full =
MAX2(ctx->max_pressure.full, ctx->cur_pressure.full);
ctx->max_pressure.half =
MAX2(ctx->max_pressure.half, ctx->cur_pressure.half);
ctx->max_pressure.shared =
MAX2(ctx->max_pressure.shared, ctx->cur_pressure.shared);
ctx->max_pressure.full =
MAX2(ctx->max_pressure.full, ctx->cur_pressure.full);
ctx->max_pressure.half =
MAX2(ctx->max_pressure.half, ctx->cur_pressure.half);
ctx->max_pressure.shared =
MAX2(ctx->max_pressure.shared, ctx->cur_pressure.shared);
}
static void
handle_instr(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
{
if (RA_DEBUG) {
printf("processing: ");
ir3_print_instr(instr);
}
if (RA_DEBUG) {
printf("processing: ");
ir3_print_instr(instr);
}
ra_foreach_dst(dst, instr) {
init_dst(ctx, dst);
}
ra_foreach_dst (dst, instr) {
init_dst(ctx, dst);
}
/* Handle tied destinations. If a destination is tied to a source and that
* source is live-through, then we need to allocate a new register for the
* destination which is live-through itself and cannot overlap the
* sources.
*/
/* Handle tied destinations. If a destination is tied to a source and that
* source is live-through, then we need to allocate a new register for the
* destination which is live-through itself and cannot overlap the
* sources.
*/
ra_foreach_dst(dst, instr) {
struct ir3_register *tied_src = dst->tied;
if (tied_src && !(tied_src->flags & IR3_REG_FIRST_KILL))
insert_dst(ctx, dst);
}
ra_foreach_dst (dst, instr) {
struct ir3_register *tied_src = dst->tied;
if (tied_src && !(tied_src->flags & IR3_REG_FIRST_KILL))
insert_dst(ctx, dst);
}
update_max_pressure(ctx);
update_max_pressure(ctx);
ra_foreach_src(src, instr) {
if (src->flags & IR3_REG_FIRST_KILL)
remove_src_early(ctx, instr, src);
}
ra_foreach_src (src, instr) {
if (src->flags & IR3_REG_FIRST_KILL)
remove_src_early(ctx, instr, src);
}
ra_foreach_dst (dst, instr) {
insert_dst(ctx, dst);
}
ra_foreach_dst(dst, instr) {
insert_dst(ctx, dst);
}
update_max_pressure(ctx);
update_max_pressure(ctx);
for (unsigned i = 0; i < instr->srcs_count; i++) {
if (ra_reg_is_src(instr->srcs[i]) &&
(instr->srcs[i]->flags & IR3_REG_FIRST_KILL))
remove_src(ctx, instr, instr->srcs[i]);
}
for (unsigned i = 0; i < instr->dsts_count; i++) {
if (ra_reg_is_dst(instr->dsts[i]) &&
(instr->dsts[i]->flags & IR3_REG_UNUSED))
remove_dst(ctx, instr->dsts[i]);
}
for (unsigned i = 0; i < instr->srcs_count; i++) {
if (ra_reg_is_src(instr->srcs[i]) &&
(instr->srcs[i]->flags & IR3_REG_FIRST_KILL))
remove_src(ctx, instr, instr->srcs[i]);
}
for (unsigned i = 0; i < instr->dsts_count; i++) {
if (ra_reg_is_dst(instr->dsts[i]) &&
(instr->dsts[i]->flags & IR3_REG_UNUSED))
remove_dst(ctx, instr->dsts[i]);
}
}
static void
handle_input_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
{
init_dst(ctx, instr->dsts[0]);
insert_dst(ctx, instr->dsts[0]);
init_dst(ctx, instr->dsts[0]);
insert_dst(ctx, instr->dsts[0]);
}
static void
remove_input_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
{
ra_foreach_src(src, instr)
remove_src(ctx, instr, src);
if (instr->dsts[0]->flags & IR3_REG_UNUSED)
remove_dst(ctx, instr->dsts[0]);
ra_foreach_src (src, instr)
remove_src(ctx, instr, src);
if (instr->dsts[0]->flags & IR3_REG_UNUSED)
remove_dst(ctx, instr->dsts[0]);
}
static void
handle_live_in(struct ra_spill_ctx *ctx, struct ir3_register *def)
{
struct ra_spill_interval *interval = &ctx->intervals[def->name];
ra_spill_interval_init(interval, def);
insert_dst(ctx, def);
struct ra_spill_interval *interval = &ctx->intervals[def->name];
ra_spill_interval_init(interval, def);
insert_dst(ctx, def);
}
static void
handle_block(struct ra_spill_ctx *ctx, struct ir3_block *block)
{
memset(&ctx->cur_pressure, 0, sizeof(ctx->cur_pressure));
rb_tree_init(&ctx->reg_ctx.intervals);
memset(&ctx->cur_pressure, 0, sizeof(ctx->cur_pressure));
rb_tree_init(&ctx->reg_ctx.intervals);
unsigned name;
BITSET_FOREACH_SET(name, ctx->live->live_in[block->index],
ctx->live->definitions_count) {
struct ir3_register *reg = ctx->live->definitions[name];
handle_live_in(ctx, reg);
}
unsigned name;
BITSET_FOREACH_SET (name, ctx->live->live_in[block->index],
ctx->live->definitions_count) {
struct ir3_register *reg = ctx->live->definitions[name];
handle_live_in(ctx, reg);
}
foreach_instr (instr, &block->instr_list) {
if (instr->opc != OPC_META_PHI && instr->opc != OPC_META_INPUT &&
instr->opc != OPC_META_TEX_PREFETCH)
break;
handle_input_phi(ctx, instr);
}
foreach_instr (instr, &block->instr_list) {
if (instr->opc != OPC_META_PHI && instr->opc != OPC_META_INPUT &&
instr->opc != OPC_META_TEX_PREFETCH)
break;
handle_input_phi(ctx, instr);
}
update_max_pressure(ctx);
update_max_pressure(ctx);
foreach_instr (instr, &block->instr_list) {
if (instr->opc == OPC_META_PHI || instr->opc == OPC_META_INPUT ||
instr->opc == OPC_META_TEX_PREFETCH)
remove_input_phi(ctx, instr);
else
handle_instr(ctx, instr);
}
foreach_instr (instr, &block->instr_list) {
if (instr->opc == OPC_META_PHI || instr->opc == OPC_META_INPUT ||
instr->opc == OPC_META_TEX_PREFETCH)
remove_input_phi(ctx, instr);
else
handle_instr(ctx, instr);
}
}
void
ir3_calc_pressure(struct ir3_shader_variant *v, struct ir3_liveness *live,
struct ir3_pressure *max_pressure)
struct ir3_pressure *max_pressure)
{
struct ra_spill_ctx ctx = {};
ctx.live = live;
ctx.intervals = calloc(live->definitions_count, sizeof(*ctx.intervals));
ctx.compiler = v->shader->compiler;
spill_ctx_init(&ctx);
struct ra_spill_ctx ctx = {};
ctx.live = live;
ctx.intervals = calloc(live->definitions_count, sizeof(*ctx.intervals));
ctx.compiler = v->shader->compiler;
spill_ctx_init(&ctx);
foreach_block (block, &v->ir->block_list) {
handle_block(&ctx, block);
}
foreach_block (block, &v->ir->block_list) {
handle_block(&ctx, block);
}
assert(ctx.cur_pressure.full == 0);
assert(ctx.cur_pressure.half == 0);
assert(ctx.cur_pressure.shared == 0);
assert(ctx.cur_pressure.full == 0);
assert(ctx.cur_pressure.half == 0);
assert(ctx.cur_pressure.shared == 0);
free(ctx.intervals);
free(ctx.intervals);
*max_pressure = ctx.max_pressure;
*max_pressure = ctx.max_pressure;
}
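
Putting the pass together, a caller computes liveness first and then asks for the
worst-case pressure; roughly (a hypothetical call sequence for some shader variant v):

   struct ir3_liveness *live = ir3_calc_liveness(v);
   struct ir3_pressure max_pressure;
   ir3_calc_pressure(v, live, &max_pressure);
   /* max_pressure.full / .half / .shared now hold the per-file maxima */
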


@@ -28,61 +28,64 @@
#include "ir3.h"
struct ir3_validate_ctx {
struct ir3 *ir;
struct ir3 *ir;
/* Current instruction being validated: */
struct ir3_instruction *current_instr;
/* Current instruction being validated: */
struct ir3_instruction *current_instr;
/* Set of instructions found so far, used to validate that we
* don't have SSA uses that occur before def's
*/
struct set *defs;
/* Set of instructions found so far, used to validate that we
* don't have SSA uses that occur before def's
*/
struct set *defs;
};
static void
validate_error(struct ir3_validate_ctx *ctx, const char *condstr)
{
fprintf(stderr, "validation fail: %s\n", condstr);
fprintf(stderr, " -> for instruction: ");
ir3_print_instr(ctx->current_instr);
abort();
fprintf(stderr, "validation fail: %s\n", condstr);
fprintf(stderr, " -> for instruction: ");
ir3_print_instr(ctx->current_instr);
abort();
}
#define validate_assert(ctx, cond) do { \
if (!(cond)) { \
validate_error(ctx, #cond); \
} } while (0)
#define validate_assert(ctx, cond) \
do { \
if (!(cond)) { \
validate_error(ctx, #cond); \
} \
} while (0)
static unsigned
reg_class_flags(struct ir3_register *reg)
{
return reg->flags & (IR3_REG_HALF | IR3_REG_SHARED);
return reg->flags & (IR3_REG_HALF | IR3_REG_SHARED);
}
static void
validate_src(struct ir3_validate_ctx *ctx, struct ir3_instruction *instr,
struct ir3_register *reg)
struct ir3_register *reg)
{
if (!(reg->flags & IR3_REG_SSA) || !reg->def)
return;
if (!(reg->flags & IR3_REG_SSA) || !reg->def)
return;
struct ir3_register *src = reg->def;
struct ir3_register *src = reg->def;
validate_assert(ctx, _mesa_set_search(ctx->defs, src->instr));
validate_assert(ctx, src->wrmask == reg->wrmask);
validate_assert(ctx, reg_class_flags(src) == reg_class_flags(reg));
validate_assert(ctx, _mesa_set_search(ctx->defs, src->instr));
validate_assert(ctx, src->wrmask == reg->wrmask);
validate_assert(ctx, reg_class_flags(src) == reg_class_flags(reg));
if (reg->tied) {
validate_assert(ctx, reg->tied->tied == reg);
bool found = false;
foreach_dst (dst, instr) {
if (dst == reg->tied) {
found = true;
break;
}
}
validate_assert(ctx, found && "tied register not in the same instruction");
}
if (reg->tied) {
validate_assert(ctx, reg->tied->tied == reg);
bool found = false;
foreach_dst (dst, instr) {
if (dst == reg->tied) {
found = true;
break;
}
}
validate_assert(ctx,
found && "tied register not in the same instruction");
}
}
/* phi sources are logically read at the end of the predecessor basic block,
@@ -90,275 +93,280 @@ validate_src(struct ir3_validate_ctx *ctx, struct ir3_instruction *instr,
* use comes after the definition for loop phis.
*/
static void
validate_phi_src(struct ir3_validate_ctx *ctx, struct ir3_block *block, struct ir3_block *pred)
validate_phi_src(struct ir3_validate_ctx *ctx, struct ir3_block *block,
struct ir3_block *pred)
{
unsigned pred_idx = ir3_block_get_pred_index(block, pred);
unsigned pred_idx = ir3_block_get_pred_index(block, pred);
foreach_instr (phi, &block->instr_list) {
if (phi->opc != OPC_META_PHI)
break;
foreach_instr (phi, &block->instr_list) {
if (phi->opc != OPC_META_PHI)
break;
ctx->current_instr = phi;
validate_assert(ctx, phi->srcs_count == block->predecessors_count);
validate_src(ctx, phi, phi->srcs[pred_idx]);
}
ctx->current_instr = phi;
validate_assert(ctx, phi->srcs_count == block->predecessors_count);
validate_src(ctx, phi, phi->srcs[pred_idx]);
}
}
static void
validate_phi(struct ir3_validate_ctx *ctx, struct ir3_instruction *phi)
{
_mesa_set_add(ctx->defs, phi);
validate_assert(ctx, phi->dsts_count == 1);
validate_assert(ctx, is_dest_gpr(phi->dsts[0]));
_mesa_set_add(ctx->defs, phi);
validate_assert(ctx, phi->dsts_count == 1);
validate_assert(ctx, is_dest_gpr(phi->dsts[0]));
}
static void
validate_dst(struct ir3_validate_ctx *ctx, struct ir3_instruction *instr,
struct ir3_register *reg)
struct ir3_register *reg)
{
if (reg->tied) {
validate_assert(ctx, reg->tied->tied == reg);
validate_assert(ctx, reg_class_flags(reg->tied) == reg_class_flags(reg));
validate_assert(ctx, reg->tied->wrmask == reg->wrmask);
if (reg->flags & IR3_REG_ARRAY) {
validate_assert(ctx, reg->tied->array.base == reg->array.base);
validate_assert(ctx, reg->tied->size == reg->size);
}
bool found = false;
foreach_src (src, instr) {
if (src == reg->tied) {
found = true;
break;
}
}
validate_assert(ctx, found && "tied register not in the same instruction");
}
if (reg->tied) {
validate_assert(ctx, reg->tied->tied == reg);
validate_assert(ctx, reg_class_flags(reg->tied) == reg_class_flags(reg));
validate_assert(ctx, reg->tied->wrmask == reg->wrmask);
if (reg->flags & IR3_REG_ARRAY) {
validate_assert(ctx, reg->tied->array.base == reg->array.base);
validate_assert(ctx, reg->tied->size == reg->size);
}
bool found = false;
foreach_src (src, instr) {
if (src == reg->tied) {
found = true;
break;
}
}
validate_assert(ctx,
found && "tied register not in the same instruction");
}
if (reg->flags & IR3_REG_SSA)
validate_assert(ctx, reg->instr == instr);
if (reg->flags & IR3_REG_SSA)
validate_assert(ctx, reg->instr == instr);
if (reg->flags & IR3_REG_RELATIV)
validate_assert(ctx, instr->address);
if (reg->flags & IR3_REG_RELATIV)
validate_assert(ctx, instr->address);
}
#define validate_reg_size(ctx, reg, type) \
validate_assert(ctx, type_size(type) == (((reg)->flags & IR3_REG_HALF) ? 16 : 32))
#define validate_reg_size(ctx, reg, type) \
validate_assert( \
ctx, type_size(type) == (((reg)->flags & IR3_REG_HALF) ? 16 : 32))
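validate_reg_size simply ties a register's IR3_REG_HALF flag to the bit width of the declared type: half registers go with 16-bit types, full registers with 32-bit types. Restated as a tiny standalone predicate (assuming type_size() yields the width in bits, which is what the macro compares against):

#include <stdbool.h>

/* standalone restatement of the macro's condition */
static bool
reg_size_matches(bool reg_is_half, unsigned type_bits)
{
   return type_bits == (reg_is_half ? 16 : 32);
}

/* e.g. a dst carrying IR3_REG_HALF must come with a 16-bit cat1 dst_type:
 *   reg_size_matches(true, 16)  -> true
 *   reg_size_matches(true, 32)  -> false (validate_assert would fire)
 */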
static void
validate_instr(struct ir3_validate_ctx *ctx, struct ir3_instruction *instr)
{
struct ir3_register *last_reg = NULL;
struct ir3_register *last_reg = NULL;
foreach_src_n (reg, n, instr) {
if (reg->flags & IR3_REG_RELATIV)
validate_assert(ctx, instr->address);
foreach_src_n (reg, n, instr) {
if (reg->flags & IR3_REG_RELATIV)
validate_assert(ctx, instr->address);
validate_src(ctx, instr, reg);
validate_src(ctx, instr, reg);
/* Validate that all src's are either half or full.
*
* Note: tex instructions w/ .s2en are a bit special in that the
* tex/samp src reg is half-reg for non-bindless and full for
* bindless, irrespective of the precision of other srcs. The
* tex/samp src is the first src reg when .s2en is set
*/
if (reg->tied) {
/* must have the same size as the destination, handled in
* validate_reg().
*/
} else if (reg == instr->address) {
validate_assert(ctx, reg->flags & IR3_REG_HALF);
} else if ((instr->flags & IR3_INSTR_S2EN) && (n < 2)) {
if (n == 0) {
if (instr->flags & IR3_INSTR_B)
validate_assert(ctx, !(reg->flags & IR3_REG_HALF));
else
validate_assert(ctx, reg->flags & IR3_REG_HALF);
}
} else if (opc_cat(instr->opc) == 6) {
/* handled below */
} else if (opc_cat(instr->opc) == 0) {
/* end/chmask/etc are allowed to have different size sources */
} else if (n > 0) {
validate_assert(ctx, (last_reg->flags & IR3_REG_HALF) == (reg->flags & IR3_REG_HALF));
}
/* Validate that all src's are either half or full.
*
* Note: tex instructions w/ .s2en are a bit special in that the
* tex/samp src reg is half-reg for non-bindless and full for
* bindless, irrespective of the precision of other srcs. The
* tex/samp src is the first src reg when .s2en is set
*/
if (reg->tied) {
/* must have the same size as the destination, handled in
* validate_reg().
*/
} else if (reg == instr->address) {
validate_assert(ctx, reg->flags & IR3_REG_HALF);
} else if ((instr->flags & IR3_INSTR_S2EN) && (n < 2)) {
if (n == 0) {
if (instr->flags & IR3_INSTR_B)
validate_assert(ctx, !(reg->flags & IR3_REG_HALF));
else
validate_assert(ctx, reg->flags & IR3_REG_HALF);
}
} else if (opc_cat(instr->opc) == 6) {
/* handled below */
} else if (opc_cat(instr->opc) == 0) {
/* end/chmask/etc are allowed to have different size sources */
} else if (n > 0) {
validate_assert(ctx, (last_reg->flags & IR3_REG_HALF) ==
(reg->flags & IR3_REG_HALF));
}
last_reg = reg;
}
last_reg = reg;
}
for (unsigned i = 0; i < instr->dsts_count; i++) {
struct ir3_register *reg = instr->dsts[i];
for (unsigned i = 0; i < instr->dsts_count; i++) {
struct ir3_register *reg = instr->dsts[i];
validate_dst(ctx, instr, reg);
}
validate_dst(ctx, instr, reg);
}
_mesa_set_add(ctx->defs, instr);
_mesa_set_add(ctx->defs, instr);
/* Check that src/dst types match the register types, and for
* instructions that have different opcodes depending on type,
* that the opcodes are correct.
*/
switch (opc_cat(instr->opc)) {
case 1: /* move instructions */
if (instr->opc == OPC_MOVMSK || instr->opc == OPC_BALLOT_MACRO) {
validate_assert(ctx, instr->dsts_count == 1);
validate_assert(ctx, instr->dsts[0]->flags & IR3_REG_SHARED);
validate_assert(ctx, !(instr->dsts[0]->flags & IR3_REG_HALF));
validate_assert(ctx, util_is_power_of_two_or_zero(instr->dsts[0]->wrmask + 1));
} else if (instr->opc == OPC_ANY_MACRO || instr->opc == OPC_ALL_MACRO ||
instr->opc == OPC_READ_FIRST_MACRO ||
instr->opc == OPC_READ_COND_MACRO) {
/* nothing yet */
} else if (instr->opc == OPC_ELECT_MACRO) {
validate_assert(ctx, instr->dsts_count == 1);
validate_assert(ctx, !(instr->dsts[0]->flags & IR3_REG_SHARED));
} else {
foreach_dst (dst, instr)
validate_reg_size(ctx, dst, instr->cat1.dst_type);
foreach_src (src, instr) {
if (!src->tied && src != instr->address)
validate_reg_size(ctx, src, instr->cat1.src_type);
}
/* Check that src/dst types match the register types, and for
* instructions that have different opcodes depending on type,
* that the opcodes are correct.
*/
switch (opc_cat(instr->opc)) {
case 1: /* move instructions */
if (instr->opc == OPC_MOVMSK || instr->opc == OPC_BALLOT_MACRO) {
validate_assert(ctx, instr->dsts_count == 1);
validate_assert(ctx, instr->dsts[0]->flags & IR3_REG_SHARED);
validate_assert(ctx, !(instr->dsts[0]->flags & IR3_REG_HALF));
validate_assert(
ctx, util_is_power_of_two_or_zero(instr->dsts[0]->wrmask + 1));
} else if (instr->opc == OPC_ANY_MACRO || instr->opc == OPC_ALL_MACRO ||
instr->opc == OPC_READ_FIRST_MACRO ||
instr->opc == OPC_READ_COND_MACRO) {
/* nothing yet */
} else if (instr->opc == OPC_ELECT_MACRO) {
validate_assert(ctx, instr->dsts_count == 1);
validate_assert(ctx, !(instr->dsts[0]->flags & IR3_REG_SHARED));
} else {
foreach_dst (dst, instr)
validate_reg_size(ctx, dst, instr->cat1.dst_type);
foreach_src (src, instr) {
if (!src->tied && src != instr->address)
validate_reg_size(ctx, src, instr->cat1.src_type);
}
switch (instr->opc) {
case OPC_SWZ:
validate_assert(ctx, instr->srcs_count == 2);
validate_assert(ctx, instr->dsts_count == 2);
break;
case OPC_GAT:
validate_assert(ctx, instr->srcs_count == 4);
validate_assert(ctx, instr->dsts_count == 1);
break;
case OPC_SCT:
validate_assert(ctx, instr->srcs_count == 1);
validate_assert(ctx, instr->dsts_count == 4);
break;
default:
break;
}
}
switch (instr->opc) {
case OPC_SWZ:
validate_assert(ctx, instr->srcs_count == 2);
validate_assert(ctx, instr->dsts_count == 2);
break;
case OPC_GAT:
validate_assert(ctx, instr->srcs_count == 4);
validate_assert(ctx, instr->dsts_count == 1);
break;
case OPC_SCT:
validate_assert(ctx, instr->srcs_count == 1);
validate_assert(ctx, instr->dsts_count == 4);
break;
default:
break;
}
}
if (instr->opc != OPC_MOV)
validate_assert(ctx, !instr->address);
if (instr->opc != OPC_MOV)
validate_assert(ctx, !instr->address);
break;
case 3:
/* Validate that cat3 opc matches the src type. We've already checked that all
* the src regs are same type
*/
if (instr->srcs[0]->flags & IR3_REG_HALF) {
validate_assert(ctx, instr->opc == cat3_half_opc(instr->opc));
} else {
validate_assert(ctx, instr->opc == cat3_full_opc(instr->opc));
}
break;
case 4:
/* Validate that cat4 opc matches the dst type: */
if (instr->dsts[0]->flags & IR3_REG_HALF) {
validate_assert(ctx, instr->opc == cat4_half_opc(instr->opc));
} else {
validate_assert(ctx, instr->opc == cat4_full_opc(instr->opc));
}
break;
case 5:
validate_reg_size(ctx, instr->dsts[0], instr->cat5.type);
break;
case 6:
switch (instr->opc) {
case OPC_RESINFO:
case OPC_RESFMT:
validate_reg_size(ctx, instr->dsts[0], instr->cat6.type);
validate_reg_size(ctx, instr->srcs[0], instr->cat6.type);
break;
case OPC_L2G:
case OPC_G2L:
validate_assert(ctx, !(instr->dsts[0]->flags & IR3_REG_HALF));
validate_assert(ctx, !(instr->srcs[0]->flags & IR3_REG_HALF));
break;
case OPC_STG:
validate_assert(ctx, !(instr->srcs[0]->flags & IR3_REG_HALF));
validate_assert(ctx, !(instr->srcs[1]->flags & IR3_REG_HALF));
validate_reg_size(ctx, instr->srcs[2], instr->cat6.type);
validate_assert(ctx, !(instr->srcs[3]->flags & IR3_REG_HALF));
break;
case OPC_STG_A:
validate_assert(ctx, !(instr->srcs[0]->flags & IR3_REG_HALF));
validate_assert(ctx, !(instr->srcs[2]->flags & IR3_REG_HALF));
validate_assert(ctx, !(instr->srcs[3]->flags & IR3_REG_HALF));
validate_reg_size(ctx, instr->srcs[4], instr->cat6.type);
validate_assert(ctx, !(instr->srcs[5]->flags & IR3_REG_HALF));
break;
case OPC_STL:
case OPC_STP:
case OPC_STLW:
validate_assert(ctx, !(instr->srcs[0]->flags & IR3_REG_HALF));
validate_reg_size(ctx, instr->srcs[1], instr->cat6.type);
validate_assert(ctx, !(instr->srcs[2]->flags & IR3_REG_HALF));
break;
case OPC_STIB:
if (instr->flags & IR3_INSTR_B) {
validate_assert(ctx, !(instr->srcs[0]->flags & IR3_REG_HALF));
validate_assert(ctx, !(instr->srcs[1]->flags & IR3_REG_HALF));
validate_reg_size(ctx, instr->srcs[2], instr->cat6.type);
} else {
validate_assert(ctx, !(instr->srcs[0]->flags & IR3_REG_HALF));
validate_reg_size(ctx, instr->srcs[1], instr->cat6.type);
validate_assert(ctx, !(instr->srcs[2]->flags & IR3_REG_HALF));
}
break;
default:
validate_reg_size(ctx, instr->dsts[0], instr->cat6.type);
validate_assert(ctx, !(instr->srcs[0]->flags & IR3_REG_HALF));
if (instr->srcs_count > 1)
validate_assert(ctx, !(instr->srcs[1]->flags & IR3_REG_HALF));
break;
}
}
break;
case 3:
/* Validate that cat3 opc matches the src type. We've already checked
* that all the src regs are same type
*/
if (instr->srcs[0]->flags & IR3_REG_HALF) {
validate_assert(ctx, instr->opc == cat3_half_opc(instr->opc));
} else {
validate_assert(ctx, instr->opc == cat3_full_opc(instr->opc));
}
break;
case 4:
/* Validate that cat4 opc matches the dst type: */
if (instr->dsts[0]->flags & IR3_REG_HALF) {
validate_assert(ctx, instr->opc == cat4_half_opc(instr->opc));
} else {
validate_assert(ctx, instr->opc == cat4_full_opc(instr->opc));
}
break;
case 5:
validate_reg_size(ctx, instr->dsts[0], instr->cat5.type);
break;
case 6:
switch (instr->opc) {
case OPC_RESINFO:
case OPC_RESFMT:
validate_reg_size(ctx, instr->dsts[0], instr->cat6.type);
validate_reg_size(ctx, instr->srcs[0], instr->cat6.type);
break;
case OPC_L2G:
case OPC_G2L:
validate_assert(ctx, !(instr->dsts[0]->flags & IR3_REG_HALF));
validate_assert(ctx, !(instr->srcs[0]->flags & IR3_REG_HALF));
break;
case OPC_STG:
validate_assert(ctx, !(instr->srcs[0]->flags & IR3_REG_HALF));
validate_assert(ctx, !(instr->srcs[1]->flags & IR3_REG_HALF));
validate_reg_size(ctx, instr->srcs[2], instr->cat6.type);
validate_assert(ctx, !(instr->srcs[3]->flags & IR3_REG_HALF));
break;
case OPC_STG_A:
validate_assert(ctx, !(instr->srcs[0]->flags & IR3_REG_HALF));
validate_assert(ctx, !(instr->srcs[2]->flags & IR3_REG_HALF));
validate_assert(ctx, !(instr->srcs[3]->flags & IR3_REG_HALF));
validate_reg_size(ctx, instr->srcs[4], instr->cat6.type);
validate_assert(ctx, !(instr->srcs[5]->flags & IR3_REG_HALF));
break;
case OPC_STL:
case OPC_STP:
case OPC_STLW:
validate_assert(ctx, !(instr->srcs[0]->flags & IR3_REG_HALF));
validate_reg_size(ctx, instr->srcs[1], instr->cat6.type);
validate_assert(ctx, !(instr->srcs[2]->flags & IR3_REG_HALF));
break;
case OPC_STIB:
if (instr->flags & IR3_INSTR_B) {
validate_assert(ctx, !(instr->srcs[0]->flags & IR3_REG_HALF));
validate_assert(ctx, !(instr->srcs[1]->flags & IR3_REG_HALF));
validate_reg_size(ctx, instr->srcs[2], instr->cat6.type);
} else {
validate_assert(ctx, !(instr->srcs[0]->flags & IR3_REG_HALF));
validate_reg_size(ctx, instr->srcs[1], instr->cat6.type);
validate_assert(ctx, !(instr->srcs[2]->flags & IR3_REG_HALF));
}
break;
default:
validate_reg_size(ctx, instr->dsts[0], instr->cat6.type);
validate_assert(ctx, !(instr->srcs[0]->flags & IR3_REG_HALF));
if (instr->srcs_count > 1)
validate_assert(ctx, !(instr->srcs[1]->flags & IR3_REG_HALF));
break;
}
}
}
void
ir3_validate(struct ir3 *ir)
{
#ifdef NDEBUG
# define VALIDATE 0
#define VALIDATE 0
#else
# define VALIDATE 1
#define VALIDATE 1
#endif
if (!VALIDATE)
return;
if (!VALIDATE)
return;
struct ir3_validate_ctx *ctx = ralloc_size(NULL, sizeof(*ctx));
struct ir3_validate_ctx *ctx = ralloc_size(NULL, sizeof(*ctx));
ctx->ir = ir;
ctx->defs = _mesa_pointer_set_create(ctx);
ctx->ir = ir;
ctx->defs = _mesa_pointer_set_create(ctx);
foreach_block (block, &ir->block_list) {
/* We require that the first block does not have any predecessors,
* which allows us to assume that phi nodes and meta:input's do not
* appear in the same basic block.
*/
validate_assert(ctx,
block != ir3_start_block(ir) || block->predecessors_count == 0);
foreach_block (block, &ir->block_list) {
/* We require that the first block does not have any predecessors,
* which allows us to assume that phi nodes and meta:input's do not
* appear in the same basic block.
*/
validate_assert(
ctx, block != ir3_start_block(ir) || block->predecessors_count == 0);
struct ir3_instruction *prev = NULL;
foreach_instr (instr, &block->instr_list) {
ctx->current_instr = instr;
if (instr->opc == OPC_META_PHI) {
/* phis must be the first in the block */
validate_assert(ctx, prev == NULL || prev->opc == OPC_META_PHI);
validate_phi(ctx, instr);
} else {
validate_instr(ctx, instr);
}
prev = instr;
}
struct ir3_instruction *prev = NULL;
foreach_instr (instr, &block->instr_list) {
ctx->current_instr = instr;
if (instr->opc == OPC_META_PHI) {
/* phis must be the first in the block */
validate_assert(ctx, prev == NULL || prev->opc == OPC_META_PHI);
validate_phi(ctx, instr);
} else {
validate_instr(ctx, instr);
}
prev = instr;
}
for (unsigned i = 0; i < 2; i++) {
if (block->successors[i])
validate_phi_src(ctx, block->successors[i], block);
}
}
for (unsigned i = 0; i < 2; i++) {
if (block->successors[i])
validate_phi_src(ctx, block->successors[i], block);
}
}
ralloc_free(ctx);
ralloc_free(ctx);
}
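Because VALIDATE collapses to 0 under NDEBUG, ir3_validate() is free in release builds and can be called after every IR-mutating step. A minimal usage sketch, assuming ir3.h from this tree; the pass names are placeholders, not real ir3 passes:

static void
run_passes(struct ir3 *ir)
{
   lower_something(ir);      /* hypothetical pass */
   ir3_validate(ir);         /* aborts and prints the offending instruction on failure */

   schedule_something(ir);   /* hypothetical pass */
   ir3_validate(ir);
}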

@ -32,100 +32,100 @@
typedef BITSET_DECLARE(regmaskstate_t, 2 * MAX_REG);
typedef struct {
bool mergedregs;
regmaskstate_t mask;
bool mergedregs;
regmaskstate_t mask;
} regmask_t;
static inline bool
__regmask_get(regmask_t *regmask, bool half, unsigned n)
{
if (regmask->mergedregs) {
/* a6xx+ case, with merged register file, we track things in terms
* of half-precision registers, with a full-precision register
* using two half-precision slots:
*/
if (half) {
return BITSET_TEST(regmask->mask, n);
} else {
n *= 2;
return BITSET_TEST(regmask->mask, n) ||
BITSET_TEST(regmask->mask, n+1);
}
} else {
/* pre a6xx case, with separate register file for half and full
* precision:
*/
if (half)
n += MAX_REG;
return BITSET_TEST(regmask->mask, n);
}
if (regmask->mergedregs) {
/* a6xx+ case, with merged register file, we track things in terms
* of half-precision registers, with a full-precision register
* using two half-precision slots:
*/
if (half) {
return BITSET_TEST(regmask->mask, n);
} else {
n *= 2;
return BITSET_TEST(regmask->mask, n) ||
BITSET_TEST(regmask->mask, n + 1);
}
} else {
/* pre a6xx case, with separate register file for half and full
* precision:
*/
if (half)
n += MAX_REG;
return BITSET_TEST(regmask->mask, n);
}
}
static inline void
__regmask_set(regmask_t *regmask, bool half, unsigned n)
{
if (regmask->mergedregs) {
/* a6xx+ case, with merged register file, we track things in terms
* of half-precision registers, with a full-precision register
* using two half-precision slots:
*/
if (half) {
BITSET_SET(regmask->mask, n);
} else {
n *= 2;
BITSET_SET(regmask->mask, n);
BITSET_SET(regmask->mask, n+1);
}
} else {
/* pre a6xx case, with separate register file for half and full
* precision:
*/
if (half)
n += MAX_REG;
BITSET_SET(regmask->mask, n);
}
if (regmask->mergedregs) {
/* a6xx+ case, with merged register file, we track things in terms
* of half-precision registers, with a full-precision register
* using two half-precision slots:
*/
if (half) {
BITSET_SET(regmask->mask, n);
} else {
n *= 2;
BITSET_SET(regmask->mask, n);
BITSET_SET(regmask->mask, n + 1);
}
} else {
/* pre a6xx case, with separate register file for half and full
* precision:
*/
if (half)
n += MAX_REG;
BITSET_SET(regmask->mask, n);
}
}
static inline void
__regmask_clear(regmask_t *regmask, bool half, unsigned n)
{
if (regmask->mergedregs) {
/* a6xx+ case, with merged register file, we track things in terms
* of half-precision registers, with a full-precision register
* using two half-precision slots:
*/
if (half) {
BITSET_CLEAR(regmask->mask, n);
} else {
n *= 2;
BITSET_CLEAR(regmask->mask, n);
BITSET_CLEAR(regmask->mask, n+1);
}
} else {
/* pre a6xx case, with separate register file for half and full
* precision:
*/
if (half)
n += MAX_REG;
BITSET_CLEAR(regmask->mask, n);
}
if (regmask->mergedregs) {
/* a6xx+ case, with merged register file, we track things in terms
* of half-precision registers, with a full-precision register
* using two half-precision slots:
*/
if (half) {
BITSET_CLEAR(regmask->mask, n);
} else {
n *= 2;
BITSET_CLEAR(regmask->mask, n);
BITSET_CLEAR(regmask->mask, n + 1);
}
} else {
/* pre a6xx case, with separate register file for half and full
* precision:
*/
if (half)
n += MAX_REG;
BITSET_CLEAR(regmask->mask, n);
}
}
static inline void
regmask_init(regmask_t *regmask, bool mergedregs)
{
memset(&regmask->mask, 0, sizeof(regmask->mask));
regmask->mergedregs = mergedregs;
memset(&regmask->mask, 0, sizeof(regmask->mask));
regmask->mergedregs = mergedregs;
}
static inline void
regmask_or(regmask_t *dst, regmask_t *a, regmask_t *b)
{
assert(dst->mergedregs == a->mergedregs);
assert(dst->mergedregs == b->mergedregs);
assert(dst->mergedregs == a->mergedregs);
assert(dst->mergedregs == b->mergedregs);
for (unsigned i = 0; i < ARRAY_SIZE(dst->mask); i++)
dst->mask[i] = a->mask[i] | b->mask[i];
for (unsigned i = 0; i < ARRAY_SIZE(dst->mask); i++)
dst->mask[i] = a->mask[i] | b->mask[i];
}
#endif /* REGMASK_H_ */
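With mergedregs set (a6xx and later), a full register n occupies the two half-register slots 2n and 2n+1, so a write of full r1 must conflict with hr2 and hr3. A small usage sketch of the helpers above, assuming this header ("regmask.h") and the MAX_REG/BITSET definitions it depends on:

#include <assert.h>
#include "regmask.h"

static void
regmask_example(void)
{
   regmask_t live;
   regmask_init(&live, true /* mergedregs: a6xx+ merged register file */);

   __regmask_set(&live, false /* full */, 1);   /* mark full r1 as written */

   assert(__regmask_get(&live, true /* half */, 2));    /* aliases hr2 ... */
   assert(__regmask_get(&live, true /* half */, 3));    /* ... and hr3 */
   assert(!__regmask_get(&live, true /* half */, 0));   /* hr0 is untouched */
}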

@ -42,8 +42,8 @@
/* clang-format on */
static const struct test {
const char *asmstr;
unsigned expected_delay;
const char *asmstr;
unsigned expected_delay;
} tests[] = {
/* clang-format off */
TEST(6,
@ -101,16 +101,16 @@ static const struct test {
static struct ir3_shader *
parse_asm(struct ir3_compiler *c, const char *asmstr)
{
struct ir3_kernel_info info = {};
FILE *in = fmemopen((void *)asmstr, strlen(asmstr), "r");
struct ir3_shader *shader = ir3_parse_asm(c, &info, in);
struct ir3_kernel_info info = {};
FILE *in = fmemopen((void *)asmstr, strlen(asmstr), "r");
struct ir3_shader *shader = ir3_parse_asm(c, &info, in);
fclose(in);
fclose(in);
if (!shader)
errx(-1, "assembler failed");
if (!shader)
errx(-1, "assembler failed");
return shader;
return shader;
}
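parse_asm leans on fmemopen(3) to feed a string constant to the FILE*-based assembler; the same POSIX call is handy whenever a parser only accepts a stream. A self-contained illustration, independent of ir3:

#define _POSIX_C_SOURCE 200809L
#include <stdio.h>
#include <string.h>

int
main(void)
{
   const char *src = "add.f r0.x, r0.y, r0.z\nend\n";
   FILE *in = fmemopen((void *)src, strlen(src), "r");   /* read the string as a stream */

   char line[128];
   while (fgets(line, sizeof(line), in))
      fputs(line, stdout);

   fclose(in);
   return 0;
}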
/**
@ -124,71 +124,70 @@ parse_asm(struct ir3_compiler *c, const char *asmstr)
static void
fixup_wrmask(struct ir3 *ir)
{
struct ir3_block *block = ir3_start_block(ir);
struct ir3_block *block = ir3_start_block(ir);
foreach_instr_safe (instr, &block->instr_list) {
instr->dsts[0]->wrmask = MASK(instr->repeat + 1);
foreach_src (reg, instr) {
if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
continue;
foreach_instr_safe (instr, &block->instr_list) {
instr->dsts[0]->wrmask = MASK(instr->repeat + 1);
foreach_src (reg, instr) {
if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
continue;
if (reg->flags & IR3_REG_R)
reg->wrmask = MASK(instr->repeat + 1);
else
reg->wrmask = 1;
}
}
if (reg->flags & IR3_REG_R)
reg->wrmask = MASK(instr->repeat + 1);
else
reg->wrmask = 1;
}
}
}
int
main(int argc, char **argv)
{
struct ir3_compiler *c;
int result = 0;
struct ir3_compiler *c;
int result = 0;
c = ir3_compiler_create(NULL, 630, false);
c = ir3_compiler_create(NULL, 630, false);
for (int i = 0; i < ARRAY_SIZE(tests); i++) {
const struct test *test = &tests[i];
struct ir3_shader *shader = parse_asm(c, test->asmstr);
struct ir3 *ir = shader->variants->ir;
for (int i = 0; i < ARRAY_SIZE(tests); i++) {
const struct test *test = &tests[i];
struct ir3_shader *shader = parse_asm(c, test->asmstr);
struct ir3 *ir = shader->variants->ir;
fixup_wrmask(ir);
fixup_wrmask(ir);
ir3_debug_print(ir, "AFTER fixup_wrmask");
ir3_debug_print(ir, "AFTER fixup_wrmask");
struct ir3_block *block =
list_first_entry(&ir->block_list, struct ir3_block, node);
struct ir3_instruction *last = NULL;
struct ir3_block *block =
list_first_entry(&ir->block_list, struct ir3_block, node);
struct ir3_instruction *last = NULL;
foreach_instr_rev (instr, &block->instr_list) {
if (is_meta(instr))
continue;
last = instr;
break;
}
foreach_instr_rev (instr, &block->instr_list) {
if (is_meta(instr))
continue;
last = instr;
break;
}
/* The delay calc is expecting the instr to not yet be added to the
* block, so remove it from the block so that it doesn't get counted
* in the distance from assigner:
*/
list_delinit(&last->node);
/* The delay calc is expecting the instr to not yet be added to the
* block, so remove it from the block so that it doesn't get counted
* in the distance from assigner:
*/
list_delinit(&last->node);
unsigned n = ir3_delay_calc_exact(block, last, true);
unsigned n = ir3_delay_calc_exact(block, last, true);
if (n != test->expected_delay) {
printf("%d: FAIL: Expected delay %u, but got %u, for:\n%s\n",
i, test->expected_delay, n, test->asmstr);
result = -1;
} else {
printf("%d: PASS\n", i);
}
if (n != test->expected_delay) {
printf("%d: FAIL: Expected delay %u, but got %u, for:\n%s\n", i,
test->expected_delay, n, test->asmstr);
result = -1;
} else {
printf("%d: PASS\n", i);
}
ir3_shader_destroy(shader);
}
ir3_shader_destroy(shader);
}
ir3_compiler_destroy(c);
ir3_compiler_destroy(c);
return result;
return result;
}

@ -48,15 +48,16 @@
/* clang-format on */
static const struct test {
int gpu_id;
const char *instr;
const char *expected;
/**
* Do we expect asm parse fail (ie. for things not (yet) supported by ir3_parser.y)
*/
bool parse_fail;
int gpu_id;
const char *instr;
const char *expected;
/**
* Do we expect asm parse fail (ie. for things not (yet) supported by
* ir3_parser.y)
*/
bool parse_fail;
} tests[] = {
/* clang-format off */
/* clang-format off */
/* cat0 */
INSTR_6XX(00000000_00000000, "nop"),
INSTR_6XX(00000200_00000000, "(rpt2)nop"),
@ -351,128 +352,132 @@ static const struct test {
INSTR_6XX(e0fa0000_00000000, "fence.g.l.r.w"),
INSTR_6XX(e09a0000_00000000, "fence.r.w"),
INSTR_6XX(f0420000_00000000, "(sy)bar.g"),
/* clang-format on */
/* clang-format on */
};
static void
trim(char *string)
{
for (int len = strlen(string); len > 0 && string[len - 1] == '\n'; len--)
string[len - 1] = 0;
for (int len = strlen(string); len > 0 && string[len - 1] == '\n'; len--)
string[len - 1] = 0;
}
int
main(int argc, char **argv)
{
int retval = 0;
int decode_fails = 0, asm_fails = 0, encode_fails = 0;
const int output_size = 4096;
char *disasm_output = malloc(output_size);
FILE *fdisasm = fmemopen(disasm_output, output_size, "w+");
if (!fdisasm) {
fprintf(stderr, "failed to fmemopen\n");
return 1;
}
int retval = 0;
int decode_fails = 0, asm_fails = 0, encode_fails = 0;
const int output_size = 4096;
char *disasm_output = malloc(output_size);
FILE *fdisasm = fmemopen(disasm_output, output_size, "w+");
if (!fdisasm) {
fprintf(stderr, "failed to fmemopen\n");
return 1;
}
struct ir3_compiler *compilers[10] = {};
struct ir3_compiler *compilers[10] = {};
for (int i = 0; i < ARRAY_SIZE(tests); i++) {
const struct test *test = &tests[i];
printf("Testing a%d %s: \"%s\"...\n",
test->gpu_id, test->instr, test->expected);
for (int i = 0; i < ARRAY_SIZE(tests); i++) {
const struct test *test = &tests[i];
printf("Testing a%d %s: \"%s\"...\n", test->gpu_id, test->instr,
test->expected);
rewind(fdisasm);
memset(disasm_output, 0, output_size);
rewind(fdisasm);
memset(disasm_output, 0, output_size);
/*
* Test disassembly:
*/
/*
* Test disassembly:
*/
uint32_t code[2] = {
strtoll(&test->instr[9], NULL, 16),
strtoll(&test->instr[0], NULL, 16),
};
isa_decode(code, 8, fdisasm, &(struct isa_decode_options){
.gpu_id = test->gpu_id,
.show_errors = true,
});
fflush(fdisasm);
uint32_t code[2] = {
strtoll(&test->instr[9], NULL, 16),
strtoll(&test->instr[0], NULL, 16),
};
isa_decode(code, 8, fdisasm,
&(struct isa_decode_options){
.gpu_id = test->gpu_id,
.show_errors = true,
});
fflush(fdisasm);
trim(disasm_output);
trim(disasm_output);
if (strcmp(disasm_output, test->expected) != 0) {
printf("FAIL: disasm\n");
printf(" Expected: \"%s\"\n", test->expected);
printf(" Got: \"%s\"\n", disasm_output);
retval = 1;
decode_fails++;
continue;
}
if (strcmp(disasm_output, test->expected) != 0) {
printf("FAIL: disasm\n");
printf(" Expected: \"%s\"\n", test->expected);
printf(" Got: \"%s\"\n", disasm_output);
retval = 1;
decode_fails++;
continue;
}
/*
* Test assembly, which should result in the identical binary:
*/
/*
* Test assembly, which should result in the identical binary:
*/
unsigned gen = test->gpu_id / 100;
if (!compilers[gen]) {
compilers[gen] = ir3_compiler_create(NULL, test->gpu_id, false);
}
unsigned gen = test->gpu_id / 100;
if (!compilers[gen]) {
compilers[gen] = ir3_compiler_create(NULL, test->gpu_id, false);
}
FILE *fasm = fmemopen((void *)test->expected, strlen(test->expected), "r");
FILE *fasm =
fmemopen((void *)test->expected, strlen(test->expected), "r");
struct ir3_kernel_info info = {};
struct ir3_shader *shader = ir3_parse_asm(compilers[gen], &info, fasm);
fclose(fasm);
if (!shader) {
printf("FAIL: %sexpected assembler fail\n", test->parse_fail ? "" : "un");
asm_fails++;
/* If this is an instruction that the asm parser is not expected
* to handle, don't count it as a fail.
*/
if (!test->parse_fail)
retval = 1;
continue;
} else if (test->parse_fail) {
/* If asm parse starts passing, and we don't expect that, flag
* it as a fail so we don't forget to update the test vector:
*/
printf("FAIL: unexpected parse success, please remove '.parse_fail=true'\n");
retval = 1;
}
struct ir3_kernel_info info = {};
struct ir3_shader *shader = ir3_parse_asm(compilers[gen], &info, fasm);
fclose(fasm);
if (!shader) {
printf("FAIL: %sexpected assembler fail\n",
test->parse_fail ? "" : "un");
asm_fails++;
/* If this is an instruction that the asm parser is not expected
* to handle, don't count it as a fail.
*/
if (!test->parse_fail)
retval = 1;
continue;
} else if (test->parse_fail) {
/* If asm parse starts passing, and we don't expect that, flag
* it as a fail so we don't forget to update the test vector:
*/
printf(
"FAIL: unexpected parse success, please remove '.parse_fail=true'\n");
retval = 1;
}
struct ir3_shader_variant *v = shader->variants;
if (memcmp(v->bin, code, sizeof(code))) {
printf("FAIL: assembler\n");
printf(" Expected: %08x_%08x\n", code[1], code[0]);
printf(" Got: %08x_%08x\n", v->bin[1], v->bin[0]);
retval = 1;
encode_fails++;
}
struct ir3_shader_variant *v = shader->variants;
if (memcmp(v->bin, code, sizeof(code))) {
printf("FAIL: assembler\n");
printf(" Expected: %08x_%08x\n", code[1], code[0]);
printf(" Got: %08x_%08x\n", v->bin[1], v->bin[0]);
retval = 1;
encode_fails++;
}
ir3_shader_destroy(shader);
}
ir3_shader_destroy(shader);
}
if (decode_fails)
printf("%d/%d decode fails\n", decode_fails, (int)ARRAY_SIZE(tests));
if (asm_fails)
printf("%d/%d assembler fails\n", asm_fails, (int)ARRAY_SIZE(tests));
if (encode_fails)
printf("%d/%d encode fails\n", encode_fails, (int)ARRAY_SIZE(tests));
if (decode_fails)
printf("%d/%d decode fails\n", decode_fails, (int)ARRAY_SIZE(tests));
if (asm_fails)
printf("%d/%d assembler fails\n", asm_fails, (int)ARRAY_SIZE(tests));
if (encode_fails)
printf("%d/%d encode fails\n", encode_fails, (int)ARRAY_SIZE(tests));
if (retval) {
printf("FAILED!\n");
} else {
printf("PASSED!\n");
}
if (retval) {
printf("FAILED!\n");
} else {
printf("PASSED!\n");
}
for (unsigned i = 0; i < ARRAY_SIZE(compilers); i++) {
if (!compilers[i])
continue;
ir3_compiler_destroy(compilers[i]);
}
for (unsigned i = 0; i < ARRAY_SIZE(compilers); i++) {
if (!compilers[i])
continue;
ir3_compiler_destroy(compilers[i]);
}
fclose(fdisasm);
free(disasm_output);
fclose(fdisasm);
free(disasm_output);
return retval;
return retval;
}
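For reference, each test vector's "hiword_loword" string is split back into the two 32-bit dwords exactly as the decode path above consumes them: strtoll stops at the underscore for the high word, and the low word starts at offset 9. A standalone sketch of that split:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
   const char *instr = "00000200_00000000";   /* "(rpt2)nop" from the table above */

   uint32_t code[2] = {
      strtoll(&instr[9], NULL, 16),   /* low dword:  0x00000000 */
      strtoll(&instr[0], NULL, 16),   /* high dword: 0x00000200, parsing stops at '_' */
   };

   printf("%08x_%08x\n", code[1], code[0]);   /* prints the original form back */
   return 0;
}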