ir3: Reformat source with clang-format
Generated using: cd src/freedreno/ir3 && clang-format -i {**,.}/*.c {**,.}/*.h -style=file
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11801>
This commit is contained in:
parent
082871bb35
commit
177138d8cb
|
@ -21,15 +21,15 @@
|
|||
* SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
#include <string.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include <util/u_debug.h>
|
||||
#include <util/log.h>
|
||||
#include <util/u_debug.h>
|
||||
|
||||
#include "isa/isa.h"
|
||||
|
||||
|
@ -39,125 +39,120 @@
|
|||
static enum debug_t debug;
|
||||
|
||||
static const char *levels[] = {
|
||||
"",
|
||||
"\t",
|
||||
"\t\t",
|
||||
"\t\t\t",
|
||||
"\t\t\t\t",
|
||||
"\t\t\t\t\t",
|
||||
"\t\t\t\t\t\t",
|
||||
"\t\t\t\t\t\t\t",
|
||||
"\t\t\t\t\t\t\t\t",
|
||||
"\t\t\t\t\t\t\t\t\t",
|
||||
"x",
|
||||
"x",
|
||||
"x",
|
||||
"x",
|
||||
"x",
|
||||
"x",
|
||||
"",
|
||||
"\t",
|
||||
"\t\t",
|
||||
"\t\t\t",
|
||||
"\t\t\t\t",
|
||||
"\t\t\t\t\t",
|
||||
"\t\t\t\t\t\t",
|
||||
"\t\t\t\t\t\t\t",
|
||||
"\t\t\t\t\t\t\t\t",
|
||||
"\t\t\t\t\t\t\t\t\t",
|
||||
"x",
|
||||
"x",
|
||||
"x",
|
||||
"x",
|
||||
"x",
|
||||
"x",
|
||||
};
|
||||
|
||||
struct disasm_ctx {
|
||||
FILE *out;
|
||||
struct isa_decode_options *options;
|
||||
unsigned level;
|
||||
unsigned extra_cycles;
|
||||
FILE *out;
|
||||
struct isa_decode_options *options;
|
||||
unsigned level;
|
||||
unsigned extra_cycles;
|
||||
|
||||
/**
|
||||
* nop_count/has_end used to detect the real end of shader. Since
|
||||
* in some cases there can be an epilogue following an `end` we look
|
||||
* for a sequence of `nop`s following the `end`
|
||||
*/
|
||||
int nop_count; /* number of nop's since non-nop instruction: */
|
||||
bool has_end; /* have we seen end instruction */
|
||||
/**
|
||||
* nop_count/has_end used to detect the real end of shader. Since
|
||||
* in some cases there can be an epilogue following an `end` we look
|
||||
* for a sequence of `nop`s following the `end`
|
||||
*/
|
||||
int nop_count; /* number of nop's since non-nop instruction: */
|
||||
bool has_end; /* have we seen end instruction */
|
||||
|
||||
int cur_n; /* current instr # */
|
||||
int cur_opc_cat; /* current opc_cat */
|
||||
int cur_n; /* current instr # */
|
||||
int cur_opc_cat; /* current opc_cat */
|
||||
|
||||
int sfu_delay;
|
||||
int sfu_delay;
|
||||
|
||||
/**
|
||||
* State accumulated decoding fields of the current instruction,
|
||||
* handled after decoding is complete (ie. at start of next instr)
|
||||
*/
|
||||
struct {
|
||||
bool ss;
|
||||
uint8_t nop;
|
||||
uint8_t repeat;
|
||||
} last;
|
||||
/**
|
||||
* State accumulated decoding fields of the current instruction,
|
||||
* handled after decoding is complete (ie. at start of next instr)
|
||||
*/
|
||||
struct {
|
||||
bool ss;
|
||||
uint8_t nop;
|
||||
uint8_t repeat;
|
||||
} last;
|
||||
|
||||
/**
|
||||
* State accumulated decoding fields of src or dst register
|
||||
*/
|
||||
struct {
|
||||
bool half;
|
||||
bool r;
|
||||
enum {
|
||||
FILE_GPR = 1,
|
||||
FILE_CONST = 2,
|
||||
} file;
|
||||
unsigned num;
|
||||
} reg;
|
||||
/**
|
||||
* State accumulated decoding fields of src or dst register
|
||||
*/
|
||||
struct {
|
||||
bool half;
|
||||
bool r;
|
||||
enum {
|
||||
FILE_GPR = 1,
|
||||
FILE_CONST = 2,
|
||||
} file;
|
||||
unsigned num;
|
||||
} reg;
|
||||
|
||||
struct shader_stats *stats;
|
||||
struct shader_stats *stats;
|
||||
};
|
||||
|
||||
static void print_stats(struct disasm_ctx *ctx)
|
||||
static void
|
||||
print_stats(struct disasm_ctx *ctx)
|
||||
{
|
||||
if (ctx->options->gpu_id >= 600) {
|
||||
/* handle MERGEREGS case.. this isn't *entirely* accurate, as
|
||||
* you can have shader stages not using merged register file,
|
||||
* but it is good enough for a guestimate:
|
||||
*/
|
||||
unsigned n = (ctx->stats->halfreg + 1) / 2;
|
||||
if (ctx->options->gpu_id >= 600) {
|
||||
/* handle MERGEREGS case.. this isn't *entirely* accurate, as
|
||||
* you can have shader stages not using merged register file,
|
||||
* but it is good enough for a guestimate:
|
||||
*/
|
||||
unsigned n = (ctx->stats->halfreg + 1) / 2;
|
||||
|
||||
ctx->stats->halfreg = 0;
|
||||
ctx->stats->fullreg = MAX2(ctx->stats->fullreg, n);
|
||||
}
|
||||
ctx->stats->halfreg = 0;
|
||||
ctx->stats->fullreg = MAX2(ctx->stats->fullreg, n);
|
||||
}
|
||||
|
||||
unsigned instructions = ctx->cur_n + ctx->extra_cycles + 1;
|
||||
unsigned instructions = ctx->cur_n + ctx->extra_cycles + 1;
|
||||
|
||||
fprintf(ctx->out, "%sStats:\n", levels[ctx->level]);
|
||||
fprintf(ctx->out, "%s- shaderdb: %u instr, %u nops, %u non-nops, %u mov, %u cov\n",
|
||||
levels[ctx->level],
|
||||
instructions,
|
||||
ctx->stats->nops,
|
||||
instructions - ctx->stats->nops,
|
||||
ctx->stats->mov_count,
|
||||
ctx->stats->cov_count);
|
||||
fprintf(ctx->out, "%sStats:\n", levels[ctx->level]);
|
||||
fprintf(ctx->out,
|
||||
"%s- shaderdb: %u instr, %u nops, %u non-nops, %u mov, %u cov\n",
|
||||
levels[ctx->level], instructions, ctx->stats->nops,
|
||||
instructions - ctx->stats->nops, ctx->stats->mov_count,
|
||||
ctx->stats->cov_count);
|
||||
|
||||
fprintf(ctx->out, "%s- shaderdb: %u last-baryf, %d half, %d full, %u constlen\n",
|
||||
levels[ctx->level],
|
||||
ctx->stats->last_baryf,
|
||||
DIV_ROUND_UP(ctx->stats->halfreg, 4),
|
||||
DIV_ROUND_UP(ctx->stats->fullreg, 4),
|
||||
DIV_ROUND_UP(ctx->stats->constlen, 4));
|
||||
fprintf(ctx->out,
|
||||
"%s- shaderdb: %u last-baryf, %d half, %d full, %u constlen\n",
|
||||
levels[ctx->level], ctx->stats->last_baryf,
|
||||
DIV_ROUND_UP(ctx->stats->halfreg, 4),
|
||||
DIV_ROUND_UP(ctx->stats->fullreg, 4),
|
||||
DIV_ROUND_UP(ctx->stats->constlen, 4));
|
||||
|
||||
fprintf(ctx->out, "%s- shaderdb: %u cat0, %u cat1, %u cat2, %u cat3, %u cat4, %u cat5, %u cat6, %u cat7\n",
|
||||
levels[ctx->level],
|
||||
ctx->stats->instrs_per_cat[0],
|
||||
ctx->stats->instrs_per_cat[1],
|
||||
ctx->stats->instrs_per_cat[2],
|
||||
ctx->stats->instrs_per_cat[3],
|
||||
ctx->stats->instrs_per_cat[4],
|
||||
ctx->stats->instrs_per_cat[5],
|
||||
ctx->stats->instrs_per_cat[6],
|
||||
ctx->stats->instrs_per_cat[7]);
|
||||
fprintf(
|
||||
ctx->out,
|
||||
"%s- shaderdb: %u cat0, %u cat1, %u cat2, %u cat3, %u cat4, %u cat5, %u cat6, %u cat7\n",
|
||||
levels[ctx->level], ctx->stats->instrs_per_cat[0],
|
||||
ctx->stats->instrs_per_cat[1], ctx->stats->instrs_per_cat[2],
|
||||
ctx->stats->instrs_per_cat[3], ctx->stats->instrs_per_cat[4],
|
||||
ctx->stats->instrs_per_cat[5], ctx->stats->instrs_per_cat[6],
|
||||
ctx->stats->instrs_per_cat[7]);
|
||||
|
||||
fprintf(ctx->out, "%s- shaderdb: %u sstall, %u (ss), %u (sy)\n",
|
||||
levels[ctx->level],
|
||||
ctx->stats->sstall,
|
||||
ctx->stats->ss,
|
||||
ctx->stats->sy);
|
||||
fprintf(ctx->out, "%s- shaderdb: %u sstall, %u (ss), %u (sy)\n",
|
||||
levels[ctx->level], ctx->stats->sstall, ctx->stats->ss,
|
||||
ctx->stats->sy);
|
||||
}
|
||||
|
||||
/* size of largest OPC field of all the instruction categories: */
|
||||
#define NOPC_BITS 6
|
||||
|
||||
static const struct opc_info {
|
||||
const char *name;
|
||||
} opcs[1 << (3+NOPC_BITS)] = {
|
||||
#define OPC(cat, opc, name) [(opc)] = { #name }
|
||||
const char *name;
|
||||
} opcs[1 << (3 + NOPC_BITS)] = {
|
||||
#define OPC(cat, opc, name) [(opc)] = {#name}
|
||||
/* clang-format off */
|
||||
/* category 0: */
|
||||
OPC(0, OPC_NOP, nop),
|
||||
|
@ -359,96 +354,96 @@ static const struct opc_info {
|
|||
#undef OPC
|
||||
};
|
||||
|
||||
#define GETINFO(instr) (&(opcs[((instr)->opc_cat << NOPC_BITS) | instr_opc(instr, ctx->gpu_id)]))
|
||||
#define GETINFO(instr) \
|
||||
(&(opcs[((instr)->opc_cat << NOPC_BITS) | instr_opc(instr, ctx->gpu_id)]))
|
||||
|
||||
const char *disasm_a3xx_instr_name(opc_t opc)
|
||||
const char *
|
||||
disasm_a3xx_instr_name(opc_t opc)
|
||||
{
|
||||
if (opc_cat(opc) == -1) return "??meta??";
|
||||
return opcs[opc].name;
|
||||
if (opc_cat(opc) == -1)
|
||||
return "??meta??";
|
||||
return opcs[opc].name;
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
disasm_field_cb(void *d, const char *field_name, struct isa_decode_value *val)
|
||||
{
|
||||
struct disasm_ctx *ctx = d;
|
||||
struct disasm_ctx *ctx = d;
|
||||
|
||||
if (!strcmp(field_name, "NAME")) {
|
||||
if (!strcmp("nop", val->str)) {
|
||||
if (ctx->has_end) {
|
||||
ctx->nop_count++;
|
||||
if (ctx->nop_count > 3) {
|
||||
ctx->options->stop = true;
|
||||
}
|
||||
}
|
||||
ctx->stats->nops += 1 + ctx->last.repeat;
|
||||
} else {
|
||||
ctx->nop_count = 0;
|
||||
}
|
||||
if (!strcmp(field_name, "NAME")) {
|
||||
if (!strcmp("nop", val->str)) {
|
||||
if (ctx->has_end) {
|
||||
ctx->nop_count++;
|
||||
if (ctx->nop_count > 3) {
|
||||
ctx->options->stop = true;
|
||||
}
|
||||
}
|
||||
ctx->stats->nops += 1 + ctx->last.repeat;
|
||||
} else {
|
||||
ctx->nop_count = 0;
|
||||
}
|
||||
|
||||
if (!strcmp("end", val->str)) {
|
||||
ctx->has_end = true;
|
||||
ctx->nop_count = 0;
|
||||
} else if (!strcmp("chsh", val->str)) {
|
||||
ctx->options->stop = true;
|
||||
} else if (!strcmp("bary.f", val->str)) {
|
||||
ctx->stats->last_baryf = ctx->cur_n;
|
||||
}
|
||||
} else if (!strcmp(field_name, "REPEAT")) {
|
||||
ctx->extra_cycles += val->num;
|
||||
ctx->stats->instrs_per_cat[ctx->cur_opc_cat] += val->num;
|
||||
ctx->last.repeat = val->num;
|
||||
} else if (!strcmp(field_name, "NOP")) {
|
||||
ctx->extra_cycles += val->num;
|
||||
ctx->stats->instrs_per_cat[0] += val->num;
|
||||
ctx->stats->nops += val->num;
|
||||
ctx->last.nop = val->num;
|
||||
} else if (!strcmp(field_name, "SY")) {
|
||||
ctx->stats->sy += val->num;
|
||||
} else if (!strcmp(field_name, "SS")) {
|
||||
ctx->stats->ss += val->num;
|
||||
ctx->last.ss = !!val->num;
|
||||
} else if (!strcmp(field_name, "CONST")) {
|
||||
ctx->reg.num = val->num;
|
||||
ctx->reg.file = FILE_CONST;
|
||||
} else if (!strcmp(field_name, "GPR")) {
|
||||
/* don't count GPR regs r48.x (shared) or higher: */
|
||||
if (val->num < 48) {
|
||||
ctx->reg.num = val->num;
|
||||
ctx->reg.file = FILE_GPR;
|
||||
}
|
||||
} else if (!strcmp(field_name, "SRC_R") ||
|
||||
!strcmp(field_name, "SRC1_R") ||
|
||||
!strcmp(field_name, "SRC2_R") ||
|
||||
!strcmp(field_name, "SRC3_R")) {
|
||||
ctx->reg.r = val->num;
|
||||
} else if (!strcmp(field_name, "DST")) {
|
||||
/* Dest register is always repeated
|
||||
*
|
||||
* Note that this doesn't really properly handle instructions
|
||||
* that write multiple components.. the old disasm didn't handle
|
||||
* that case either.
|
||||
*/
|
||||
ctx->reg.r = true;
|
||||
} else if (strstr(field_name, "HALF")) {
|
||||
ctx->reg.half = val->num;
|
||||
} else if (!strcmp(field_name, "SWIZ")) {
|
||||
unsigned num = (ctx->reg.num << 2) | val->num;
|
||||
if (ctx->reg.r)
|
||||
num += ctx->last.repeat;
|
||||
if (!strcmp("end", val->str)) {
|
||||
ctx->has_end = true;
|
||||
ctx->nop_count = 0;
|
||||
} else if (!strcmp("chsh", val->str)) {
|
||||
ctx->options->stop = true;
|
||||
} else if (!strcmp("bary.f", val->str)) {
|
||||
ctx->stats->last_baryf = ctx->cur_n;
|
||||
}
|
||||
} else if (!strcmp(field_name, "REPEAT")) {
|
||||
ctx->extra_cycles += val->num;
|
||||
ctx->stats->instrs_per_cat[ctx->cur_opc_cat] += val->num;
|
||||
ctx->last.repeat = val->num;
|
||||
} else if (!strcmp(field_name, "NOP")) {
|
||||
ctx->extra_cycles += val->num;
|
||||
ctx->stats->instrs_per_cat[0] += val->num;
|
||||
ctx->stats->nops += val->num;
|
||||
ctx->last.nop = val->num;
|
||||
} else if (!strcmp(field_name, "SY")) {
|
||||
ctx->stats->sy += val->num;
|
||||
} else if (!strcmp(field_name, "SS")) {
|
||||
ctx->stats->ss += val->num;
|
||||
ctx->last.ss = !!val->num;
|
||||
} else if (!strcmp(field_name, "CONST")) {
|
||||
ctx->reg.num = val->num;
|
||||
ctx->reg.file = FILE_CONST;
|
||||
} else if (!strcmp(field_name, "GPR")) {
|
||||
/* don't count GPR regs r48.x (shared) or higher: */
|
||||
if (val->num < 48) {
|
||||
ctx->reg.num = val->num;
|
||||
ctx->reg.file = FILE_GPR;
|
||||
}
|
||||
} else if (!strcmp(field_name, "SRC_R") || !strcmp(field_name, "SRC1_R") ||
|
||||
!strcmp(field_name, "SRC2_R") || !strcmp(field_name, "SRC3_R")) {
|
||||
ctx->reg.r = val->num;
|
||||
} else if (!strcmp(field_name, "DST")) {
|
||||
/* Dest register is always repeated
|
||||
*
|
||||
* Note that this doesn't really properly handle instructions
|
||||
* that write multiple components.. the old disasm didn't handle
|
||||
* that case either.
|
||||
*/
|
||||
ctx->reg.r = true;
|
||||
} else if (strstr(field_name, "HALF")) {
|
||||
ctx->reg.half = val->num;
|
||||
} else if (!strcmp(field_name, "SWIZ")) {
|
||||
unsigned num = (ctx->reg.num << 2) | val->num;
|
||||
if (ctx->reg.r)
|
||||
num += ctx->last.repeat;
|
||||
|
||||
if (ctx->reg.file == FILE_CONST) {
|
||||
ctx->stats->constlen = MAX2(ctx->stats->constlen, num);
|
||||
} else if (ctx->reg.file == FILE_GPR) {
|
||||
if (ctx->reg.half) {
|
||||
ctx->stats->halfreg = MAX2(ctx->stats->halfreg, num);
|
||||
} else {
|
||||
ctx->stats->fullreg = MAX2(ctx->stats->fullreg, num);
|
||||
}
|
||||
}
|
||||
if (ctx->reg.file == FILE_CONST) {
|
||||
ctx->stats->constlen = MAX2(ctx->stats->constlen, num);
|
||||
} else if (ctx->reg.file == FILE_GPR) {
|
||||
if (ctx->reg.half) {
|
||||
ctx->stats->halfreg = MAX2(ctx->stats->halfreg, num);
|
||||
} else {
|
||||
ctx->stats->fullreg = MAX2(ctx->stats->fullreg, num);
|
||||
}
|
||||
}
|
||||
|
||||
memset(&ctx->reg, 0, sizeof(ctx->reg));
|
||||
}
|
||||
memset(&ctx->reg, 0, sizeof(ctx->reg));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -458,103 +453,105 @@ disasm_field_cb(void *d, const char *field_name, struct isa_decode_value *val)
|
|||
static void
|
||||
disasm_handle_last(struct disasm_ctx *ctx)
|
||||
{
|
||||
if (ctx->last.ss) {
|
||||
ctx->stats->sstall += ctx->sfu_delay;
|
||||
ctx->sfu_delay = 0;
|
||||
}
|
||||
if (ctx->last.ss) {
|
||||
ctx->stats->sstall += ctx->sfu_delay;
|
||||
ctx->sfu_delay = 0;
|
||||
}
|
||||
|
||||
if (ctx->cur_opc_cat == 4) {
|
||||
ctx->sfu_delay = 10;
|
||||
} else {
|
||||
int n = MIN2(ctx->sfu_delay, 1 + ctx->last.repeat + ctx->last.nop);
|
||||
ctx->sfu_delay -= n;
|
||||
}
|
||||
if (ctx->cur_opc_cat == 4) {
|
||||
ctx->sfu_delay = 10;
|
||||
} else {
|
||||
int n = MIN2(ctx->sfu_delay, 1 + ctx->last.repeat + ctx->last.nop);
|
||||
ctx->sfu_delay -= n;
|
||||
}
|
||||
|
||||
memset(&ctx->last, 0, sizeof(ctx->last));
|
||||
memset(&ctx->last, 0, sizeof(ctx->last));
|
||||
}
|
||||
|
||||
static void
|
||||
disasm_instr_cb(void *d, unsigned n, uint64_t instr)
|
||||
{
|
||||
struct disasm_ctx *ctx = d;
|
||||
uint32_t *dwords = (uint32_t *)&instr;
|
||||
unsigned opc_cat = instr >> 61;
|
||||
struct disasm_ctx *ctx = d;
|
||||
uint32_t *dwords = (uint32_t *)&instr;
|
||||
unsigned opc_cat = instr >> 61;
|
||||
|
||||
/* There are some cases where we can get instr_cb called multiple
|
||||
* times per instruction (like when we need an extra line for branch
|
||||
* target labels), don't update stats in these cases:
|
||||
*/
|
||||
if (n != ctx->cur_n) {
|
||||
if (n > 0) {
|
||||
disasm_handle_last(ctx);
|
||||
}
|
||||
ctx->stats->instrs_per_cat[opc_cat]++;
|
||||
ctx->cur_n = n;
|
||||
/* There are some cases where we can get instr_cb called multiple
|
||||
* times per instruction (like when we need an extra line for branch
|
||||
* target labels), don't update stats in these cases:
|
||||
*/
|
||||
if (n != ctx->cur_n) {
|
||||
if (n > 0) {
|
||||
disasm_handle_last(ctx);
|
||||
}
|
||||
ctx->stats->instrs_per_cat[opc_cat]++;
|
||||
ctx->cur_n = n;
|
||||
|
||||
/* mov vs cov stats are a bit harder to fish out of the field
|
||||
* names, because current ir3-cat1.xml doesn't use {NAME} for
|
||||
* this distinction. So for now just handle this case with
|
||||
* some hand-coded parsing:
|
||||
*/
|
||||
if (opc_cat == 1) {
|
||||
unsigned opc = (instr >> 57) & 0x3;
|
||||
unsigned src_type = (instr >> 50) & 0x7;
|
||||
unsigned dst_type = (instr >> 46) & 0x7;
|
||||
/* mov vs cov stats are a bit harder to fish out of the field
|
||||
* names, because current ir3-cat1.xml doesn't use {NAME} for
|
||||
* this distinction. So for now just handle this case with
|
||||
* some hand-coded parsing:
|
||||
*/
|
||||
if (opc_cat == 1) {
|
||||
unsigned opc = (instr >> 57) & 0x3;
|
||||
unsigned src_type = (instr >> 50) & 0x7;
|
||||
unsigned dst_type = (instr >> 46) & 0x7;
|
||||
|
||||
if (opc == 0) {
|
||||
if (src_type == dst_type) {
|
||||
ctx->stats->mov_count++;
|
||||
} else {
|
||||
ctx->stats->cov_count++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (opc == 0) {
|
||||
if (src_type == dst_type) {
|
||||
ctx->stats->mov_count++;
|
||||
} else {
|
||||
ctx->stats->cov_count++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ctx->cur_opc_cat = opc_cat;
|
||||
ctx->cur_opc_cat = opc_cat;
|
||||
|
||||
if (debug & PRINT_RAW) {
|
||||
fprintf(ctx->out, "%s:%d:%04d:%04d[%08xx_%08xx] ", levels[ctx->level],
|
||||
opc_cat, n, ctx->extra_cycles + n, dwords[1], dwords[0]);
|
||||
}
|
||||
if (debug & PRINT_RAW) {
|
||||
fprintf(ctx->out, "%s:%d:%04d:%04d[%08xx_%08xx] ", levels[ctx->level],
|
||||
opc_cat, n, ctx->extra_cycles + n, dwords[1], dwords[0]);
|
||||
}
|
||||
}
|
||||
|
||||
int disasm_a3xx_stat(uint32_t *dwords, int sizedwords, int level, FILE *out,
|
||||
unsigned gpu_id, struct shader_stats *stats)
|
||||
int
|
||||
disasm_a3xx_stat(uint32_t *dwords, int sizedwords, int level, FILE *out,
|
||||
unsigned gpu_id, struct shader_stats *stats)
|
||||
{
|
||||
struct isa_decode_options decode_options = {
|
||||
.gpu_id = gpu_id,
|
||||
.show_errors = true,
|
||||
.max_errors = 5,
|
||||
.branch_labels = true,
|
||||
.field_cb = disasm_field_cb,
|
||||
.instr_cb = disasm_instr_cb,
|
||||
};
|
||||
struct disasm_ctx ctx = {
|
||||
.out = out,
|
||||
.level = level,
|
||||
.options = &decode_options,
|
||||
.stats = stats,
|
||||
.cur_n = -1,
|
||||
};
|
||||
struct isa_decode_options decode_options = {
|
||||
.gpu_id = gpu_id,
|
||||
.show_errors = true,
|
||||
.max_errors = 5,
|
||||
.branch_labels = true,
|
||||
.field_cb = disasm_field_cb,
|
||||
.instr_cb = disasm_instr_cb,
|
||||
};
|
||||
struct disasm_ctx ctx = {
|
||||
.out = out,
|
||||
.level = level,
|
||||
.options = &decode_options,
|
||||
.stats = stats,
|
||||
.cur_n = -1,
|
||||
};
|
||||
|
||||
memset(stats, 0, sizeof(*stats));
|
||||
memset(stats, 0, sizeof(*stats));
|
||||
|
||||
decode_options.cbdata = &ctx;
|
||||
decode_options.cbdata = &ctx;
|
||||
|
||||
isa_decode(dwords, sizedwords * 4, out, &decode_options);
|
||||
isa_decode(dwords, sizedwords * 4, out, &decode_options);
|
||||
|
||||
disasm_handle_last(&ctx);
|
||||
disasm_handle_last(&ctx);
|
||||
|
||||
if (debug & PRINT_STATS)
|
||||
print_stats(&ctx);
|
||||
if (debug & PRINT_STATS)
|
||||
print_stats(&ctx);
|
||||
|
||||
return 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
void disasm_a3xx_set_debug(enum debug_t d)
|
||||
void
|
||||
disasm_a3xx_set_debug(enum debug_t d)
|
||||
{
|
||||
debug = d;
|
||||
debug = d;
|
||||
}
|
||||
|
||||
#include <setjmp.h>
|
||||
|
@ -564,34 +561,38 @@ static jmp_buf jmp_env;
|
|||
|
||||
void
|
||||
ir3_assert_handler(const char *expr, const char *file, int line,
|
||||
const char *func)
|
||||
const char *func)
|
||||
{
|
||||
mesa_loge("%s:%u: %s: Assertion `%s' failed.", file, line, func, expr);
|
||||
if (jmp_env_valid)
|
||||
longjmp(jmp_env, 1);
|
||||
abort();
|
||||
mesa_loge("%s:%u: %s: Assertion `%s' failed.", file, line, func, expr);
|
||||
if (jmp_env_valid)
|
||||
longjmp(jmp_env, 1);
|
||||
abort();
|
||||
}
|
||||
|
||||
#define TRY(x) do { \
|
||||
assert(!jmp_env_valid); \
|
||||
if (setjmp(jmp_env) == 0) { \
|
||||
jmp_env_valid = true; \
|
||||
x; \
|
||||
} \
|
||||
jmp_env_valid = false; \
|
||||
} while (0)
|
||||
#define TRY(x) \
|
||||
do { \
|
||||
assert(!jmp_env_valid); \
|
||||
if (setjmp(jmp_env) == 0) { \
|
||||
jmp_env_valid = true; \
|
||||
x; \
|
||||
} \
|
||||
jmp_env_valid = false; \
|
||||
} while (0)
|
||||
|
||||
|
||||
int disasm_a3xx(uint32_t *dwords, int sizedwords, int level, FILE *out, unsigned gpu_id)
|
||||
int
|
||||
disasm_a3xx(uint32_t *dwords, int sizedwords, int level, FILE *out,
|
||||
unsigned gpu_id)
|
||||
{
|
||||
struct shader_stats stats;
|
||||
return disasm_a3xx_stat(dwords, sizedwords, level, out, gpu_id, &stats);
|
||||
struct shader_stats stats;
|
||||
return disasm_a3xx_stat(dwords, sizedwords, level, out, gpu_id, &stats);
|
||||
}
|
||||
|
||||
int try_disasm_a3xx(uint32_t *dwords, int sizedwords, int level, FILE *out, unsigned gpu_id)
|
||||
int
|
||||
try_disasm_a3xx(uint32_t *dwords, int sizedwords, int level, FILE *out,
|
||||
unsigned gpu_id)
|
||||
{
|
||||
struct shader_stats stats;
|
||||
int ret = -1;
|
||||
TRY(ret = disasm_a3xx_stat(dwords, sizedwords, level, out, gpu_id, &stats));
|
||||
return ret;
|
||||
struct shader_stats stats;
|
||||
int ret = -1;
|
||||
TRY(ret = disasm_a3xx_stat(dwords, sizedwords, level, out, gpu_id, &stats));
|
||||
return ret;
|
||||
}
|
||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -33,67 +33,65 @@
|
|||
* Handlers for instructions changed/added in a4xx:
|
||||
*/
|
||||
|
||||
|
||||
/* src[] = { buffer_index, offset }. No const_index */
|
||||
static void
|
||||
emit_intrinsic_load_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
|
||||
struct ir3_instruction **dst)
|
||||
struct ir3_instruction **dst)
|
||||
{
|
||||
struct ir3_block *b = ctx->block;
|
||||
struct ir3_instruction *ldgb, *src0, *src1, *byte_offset, *offset;
|
||||
struct ir3_block *b = ctx->block;
|
||||
struct ir3_instruction *ldgb, *src0, *src1, *byte_offset, *offset;
|
||||
|
||||
struct ir3_instruction *ssbo = ir3_ssbo_to_ibo(ctx, intr->src[0]);
|
||||
struct ir3_instruction *ssbo = ir3_ssbo_to_ibo(ctx, intr->src[0]);
|
||||
|
||||
byte_offset = ir3_get_src(ctx, &intr->src[1])[0];
|
||||
offset = ir3_get_src(ctx, &intr->src[2])[0];
|
||||
byte_offset = ir3_get_src(ctx, &intr->src[1])[0];
|
||||
offset = ir3_get_src(ctx, &intr->src[2])[0];
|
||||
|
||||
/* src0 is uvec2(offset*4, 0), src1 is offset.. nir already *= 4: */
|
||||
src0 = ir3_collect(ctx, byte_offset, create_immed(b, 0));
|
||||
src1 = offset;
|
||||
/* src0 is uvec2(offset*4, 0), src1 is offset.. nir already *= 4: */
|
||||
src0 = ir3_collect(ctx, byte_offset, create_immed(b, 0));
|
||||
src1 = offset;
|
||||
|
||||
ldgb = ir3_LDGB(b, ssbo, 0,
|
||||
src0, 0, src1, 0);
|
||||
ldgb->dsts[0]->wrmask = MASK(intr->num_components);
|
||||
ldgb->cat6.iim_val = intr->num_components;
|
||||
ldgb->cat6.d = 4;
|
||||
ldgb->cat6.type = TYPE_U32;
|
||||
ldgb->barrier_class = IR3_BARRIER_BUFFER_R;
|
||||
ldgb->barrier_conflict = IR3_BARRIER_BUFFER_W;
|
||||
ldgb = ir3_LDGB(b, ssbo, 0, src0, 0, src1, 0);
|
||||
ldgb->dsts[0]->wrmask = MASK(intr->num_components);
|
||||
ldgb->cat6.iim_val = intr->num_components;
|
||||
ldgb->cat6.d = 4;
|
||||
ldgb->cat6.type = TYPE_U32;
|
||||
ldgb->barrier_class = IR3_BARRIER_BUFFER_R;
|
||||
ldgb->barrier_conflict = IR3_BARRIER_BUFFER_W;
|
||||
|
||||
ir3_split_dest(b, dst, ldgb, 0, intr->num_components);
|
||||
ir3_split_dest(b, dst, ldgb, 0, intr->num_components);
|
||||
}
|
||||
|
||||
/* src[] = { value, block_index, offset }. const_index[] = { write_mask } */
|
||||
static void
|
||||
emit_intrinsic_store_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
|
||||
{
|
||||
struct ir3_block *b = ctx->block;
|
||||
struct ir3_instruction *stgb, *src0, *src1, *src2, *byte_offset, *offset;
|
||||
unsigned wrmask = nir_intrinsic_write_mask(intr);
|
||||
unsigned ncomp = ffs(~wrmask) - 1;
|
||||
struct ir3_block *b = ctx->block;
|
||||
struct ir3_instruction *stgb, *src0, *src1, *src2, *byte_offset, *offset;
|
||||
unsigned wrmask = nir_intrinsic_write_mask(intr);
|
||||
unsigned ncomp = ffs(~wrmask) - 1;
|
||||
|
||||
assert(wrmask == BITFIELD_MASK(intr->num_components));
|
||||
assert(wrmask == BITFIELD_MASK(intr->num_components));
|
||||
|
||||
struct ir3_instruction *ssbo = ir3_ssbo_to_ibo(ctx, intr->src[1]);
|
||||
struct ir3_instruction *ssbo = ir3_ssbo_to_ibo(ctx, intr->src[1]);
|
||||
|
||||
byte_offset = ir3_get_src(ctx, &intr->src[2])[0];
|
||||
offset = ir3_get_src(ctx, &intr->src[3])[0];
|
||||
byte_offset = ir3_get_src(ctx, &intr->src[2])[0];
|
||||
offset = ir3_get_src(ctx, &intr->src[3])[0];
|
||||
|
||||
/* src0 is value, src1 is offset, src2 is uvec2(offset*4, 0)..
|
||||
* nir already *= 4:
|
||||
*/
|
||||
src0 = ir3_create_collect(ctx, ir3_get_src(ctx, &intr->src[0]), ncomp);
|
||||
src1 = offset;
|
||||
src2 = ir3_collect(ctx, byte_offset, create_immed(b, 0));
|
||||
/* src0 is value, src1 is offset, src2 is uvec2(offset*4, 0)..
|
||||
* nir already *= 4:
|
||||
*/
|
||||
src0 = ir3_create_collect(ctx, ir3_get_src(ctx, &intr->src[0]), ncomp);
|
||||
src1 = offset;
|
||||
src2 = ir3_collect(ctx, byte_offset, create_immed(b, 0));
|
||||
|
||||
stgb = ir3_STGB(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
|
||||
stgb->cat6.iim_val = ncomp;
|
||||
stgb->cat6.d = 4;
|
||||
stgb->cat6.type = TYPE_U32;
|
||||
stgb->barrier_class = IR3_BARRIER_BUFFER_W;
|
||||
stgb->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
|
||||
stgb = ir3_STGB(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
|
||||
stgb->cat6.iim_val = ncomp;
|
||||
stgb->cat6.d = 4;
|
||||
stgb->cat6.type = TYPE_U32;
|
||||
stgb->barrier_class = IR3_BARRIER_BUFFER_W;
|
||||
stgb->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
|
||||
|
||||
array_insert(b, b->keeps, stgb);
|
||||
array_insert(b, b->keeps, stgb);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -116,229 +114,228 @@ emit_intrinsic_store_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
|
|||
static struct ir3_instruction *
|
||||
emit_intrinsic_atomic_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
|
||||
{
|
||||
struct ir3_block *b = ctx->block;
|
||||
struct ir3_instruction *atomic, *ssbo, *src0, *src1, *src2, *byte_offset,
|
||||
*offset;
|
||||
type_t type = TYPE_U32;
|
||||
struct ir3_block *b = ctx->block;
|
||||
struct ir3_instruction *atomic, *ssbo, *src0, *src1, *src2, *byte_offset,
|
||||
*offset;
|
||||
type_t type = TYPE_U32;
|
||||
|
||||
ssbo = ir3_ssbo_to_ibo(ctx, intr->src[0]);
|
||||
ssbo = ir3_ssbo_to_ibo(ctx, intr->src[0]);
|
||||
|
||||
byte_offset = ir3_get_src(ctx, &intr->src[1])[0];
|
||||
offset = ir3_get_src(ctx, &intr->src[3])[0];
|
||||
byte_offset = ir3_get_src(ctx, &intr->src[1])[0];
|
||||
offset = ir3_get_src(ctx, &intr->src[3])[0];
|
||||
|
||||
/* src0 is data (or uvec2(data, compare))
|
||||
* src1 is offset
|
||||
* src2 is uvec2(offset*4, 0) (appears to be 64b byte offset)
|
||||
*
|
||||
* Note that nir already multiplies the offset by four
|
||||
*/
|
||||
src0 = ir3_get_src(ctx, &intr->src[2])[0];
|
||||
src1 = offset;
|
||||
src2 = ir3_collect(ctx, byte_offset, create_immed(b, 0));
|
||||
/* src0 is data (or uvec2(data, compare))
|
||||
* src1 is offset
|
||||
* src2 is uvec2(offset*4, 0) (appears to be 64b byte offset)
|
||||
*
|
||||
* Note that nir already multiplies the offset by four
|
||||
*/
|
||||
src0 = ir3_get_src(ctx, &intr->src[2])[0];
|
||||
src1 = offset;
|
||||
src2 = ir3_collect(ctx, byte_offset, create_immed(b, 0));
|
||||
|
||||
switch (intr->intrinsic) {
|
||||
case nir_intrinsic_ssbo_atomic_add_ir3:
|
||||
atomic = ir3_ATOMIC_ADD_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
|
||||
break;
|
||||
case nir_intrinsic_ssbo_atomic_imin_ir3:
|
||||
atomic = ir3_ATOMIC_MIN_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
|
||||
type = TYPE_S32;
|
||||
break;
|
||||
case nir_intrinsic_ssbo_atomic_umin_ir3:
|
||||
atomic = ir3_ATOMIC_MIN_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
|
||||
break;
|
||||
case nir_intrinsic_ssbo_atomic_imax_ir3:
|
||||
atomic = ir3_ATOMIC_MAX_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
|
||||
type = TYPE_S32;
|
||||
break;
|
||||
case nir_intrinsic_ssbo_atomic_umax_ir3:
|
||||
atomic = ir3_ATOMIC_MAX_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
|
||||
break;
|
||||
case nir_intrinsic_ssbo_atomic_and_ir3:
|
||||
atomic = ir3_ATOMIC_AND_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
|
||||
break;
|
||||
case nir_intrinsic_ssbo_atomic_or_ir3:
|
||||
atomic = ir3_ATOMIC_OR_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
|
||||
break;
|
||||
case nir_intrinsic_ssbo_atomic_xor_ir3:
|
||||
atomic = ir3_ATOMIC_XOR_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
|
||||
break;
|
||||
case nir_intrinsic_ssbo_atomic_exchange_ir3:
|
||||
atomic = ir3_ATOMIC_XCHG_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
|
||||
break;
|
||||
case nir_intrinsic_ssbo_atomic_comp_swap_ir3:
|
||||
/* for cmpxchg, src0 is [ui]vec2(data, compare): */
|
||||
src0 = ir3_collect(ctx, ir3_get_src(ctx, &intr->src[3])[0], src0);
|
||||
src1 = ir3_get_src(ctx, &intr->src[4])[0];
|
||||
atomic = ir3_ATOMIC_CMPXCHG_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
|
||||
break;
|
||||
default:
|
||||
unreachable("boo");
|
||||
}
|
||||
switch (intr->intrinsic) {
|
||||
case nir_intrinsic_ssbo_atomic_add_ir3:
|
||||
atomic = ir3_ATOMIC_ADD_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
|
||||
break;
|
||||
case nir_intrinsic_ssbo_atomic_imin_ir3:
|
||||
atomic = ir3_ATOMIC_MIN_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
|
||||
type = TYPE_S32;
|
||||
break;
|
||||
case nir_intrinsic_ssbo_atomic_umin_ir3:
|
||||
atomic = ir3_ATOMIC_MIN_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
|
||||
break;
|
||||
case nir_intrinsic_ssbo_atomic_imax_ir3:
|
||||
atomic = ir3_ATOMIC_MAX_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
|
||||
type = TYPE_S32;
|
||||
break;
|
||||
case nir_intrinsic_ssbo_atomic_umax_ir3:
|
||||
atomic = ir3_ATOMIC_MAX_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
|
||||
break;
|
||||
case nir_intrinsic_ssbo_atomic_and_ir3:
|
||||
atomic = ir3_ATOMIC_AND_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
|
||||
break;
|
||||
case nir_intrinsic_ssbo_atomic_or_ir3:
|
||||
atomic = ir3_ATOMIC_OR_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
|
||||
break;
|
||||
case nir_intrinsic_ssbo_atomic_xor_ir3:
|
||||
atomic = ir3_ATOMIC_XOR_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
|
||||
break;
|
||||
case nir_intrinsic_ssbo_atomic_exchange_ir3:
|
||||
atomic = ir3_ATOMIC_XCHG_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
|
||||
break;
|
||||
case nir_intrinsic_ssbo_atomic_comp_swap_ir3:
|
||||
/* for cmpxchg, src0 is [ui]vec2(data, compare): */
|
||||
src0 = ir3_collect(ctx, ir3_get_src(ctx, &intr->src[3])[0], src0);
|
||||
src1 = ir3_get_src(ctx, &intr->src[4])[0];
|
||||
atomic = ir3_ATOMIC_CMPXCHG_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0);
|
||||
break;
|
||||
default:
|
||||
unreachable("boo");
|
||||
}
|
||||
|
||||
atomic->cat6.iim_val = 1;
|
||||
atomic->cat6.d = 4;
|
||||
atomic->cat6.type = type;
|
||||
atomic->barrier_class = IR3_BARRIER_BUFFER_W;
|
||||
atomic->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
|
||||
atomic->cat6.iim_val = 1;
|
||||
atomic->cat6.d = 4;
|
||||
atomic->cat6.type = type;
|
||||
atomic->barrier_class = IR3_BARRIER_BUFFER_W;
|
||||
atomic->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
|
||||
|
||||
/* even if nothing consumes the result, we can't DCE the instruction: */
|
||||
array_insert(b, b->keeps, atomic);
|
||||
/* even if nothing consumes the result, we can't DCE the instruction: */
|
||||
array_insert(b, b->keeps, atomic);
|
||||
|
||||
return atomic;
|
||||
return atomic;
|
||||
}
|
||||
|
||||
static struct ir3_instruction *
|
||||
get_image_offset(struct ir3_context *ctx, const nir_intrinsic_instr *instr,
|
||||
struct ir3_instruction * const *coords, bool byteoff)
|
||||
struct ir3_instruction *const *coords, bool byteoff)
|
||||
{
|
||||
struct ir3_block *b = ctx->block;
|
||||
struct ir3_instruction *offset;
|
||||
unsigned index = nir_src_as_uint(instr->src[0]);
|
||||
unsigned ncoords = ir3_get_image_coords(instr, NULL);
|
||||
struct ir3_block *b = ctx->block;
|
||||
struct ir3_instruction *offset;
|
||||
unsigned index = nir_src_as_uint(instr->src[0]);
|
||||
unsigned ncoords = ir3_get_image_coords(instr, NULL);
|
||||
|
||||
/* to calculate the byte offset (yes, uggg) we need (up to) three
|
||||
* const values to know the bytes per pixel, and y and z stride:
|
||||
*/
|
||||
const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
|
||||
unsigned cb = regid(const_state->offsets.image_dims, 0) +
|
||||
const_state->image_dims.off[index];
|
||||
/* to calculate the byte offset (yes, uggg) we need (up to) three
|
||||
* const values to know the bytes per pixel, and y and z stride:
|
||||
*/
|
||||
const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
|
||||
unsigned cb = regid(const_state->offsets.image_dims, 0) +
|
||||
const_state->image_dims.off[index];
|
||||
|
||||
debug_assert(const_state->image_dims.mask & (1 << index));
|
||||
debug_assert(const_state->image_dims.mask & (1 << index));
|
||||
|
||||
/* offset = coords.x * bytes_per_pixel: */
|
||||
offset = ir3_MUL_S24(b, coords[0], 0, create_uniform(b, cb + 0), 0);
|
||||
if (ncoords > 1) {
|
||||
/* offset += coords.y * y_pitch: */
|
||||
offset = ir3_MAD_S24(b, create_uniform(b, cb + 1), 0,
|
||||
coords[1], 0, offset, 0);
|
||||
}
|
||||
if (ncoords > 2) {
|
||||
/* offset += coords.z * z_pitch: */
|
||||
offset = ir3_MAD_S24(b, create_uniform(b, cb + 2), 0,
|
||||
coords[2], 0, offset, 0);
|
||||
}
|
||||
/* offset = coords.x * bytes_per_pixel: */
|
||||
offset = ir3_MUL_S24(b, coords[0], 0, create_uniform(b, cb + 0), 0);
|
||||
if (ncoords > 1) {
|
||||
/* offset += coords.y * y_pitch: */
|
||||
offset =
|
||||
ir3_MAD_S24(b, create_uniform(b, cb + 1), 0, coords[1], 0, offset, 0);
|
||||
}
|
||||
if (ncoords > 2) {
|
||||
/* offset += coords.z * z_pitch: */
|
||||
offset =
|
||||
ir3_MAD_S24(b, create_uniform(b, cb + 2), 0, coords[2], 0, offset, 0);
|
||||
}
|
||||
|
||||
if (!byteoff) {
|
||||
/* Some cases, like atomics, seem to use dword offset instead
|
||||
* of byte offsets.. blob just puts an extra shr.b in there
|
||||
* in those cases:
|
||||
*/
|
||||
offset = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0);
|
||||
}
|
||||
if (!byteoff) {
|
||||
/* Some cases, like atomics, seem to use dword offset instead
|
||||
* of byte offsets.. blob just puts an extra shr.b in there
|
||||
* in those cases:
|
||||
*/
|
||||
offset = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0);
|
||||
}
|
||||
|
||||
return ir3_collect(ctx, offset, create_immed(b, 0));
|
||||
return ir3_collect(ctx, offset, create_immed(b, 0));
|
||||
}
|
||||
|
||||
/* src[] = { index, coord, sample_index, value }. const_index[] = {} */
|
||||
static void
|
||||
emit_intrinsic_store_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)
|
||||
{
|
||||
struct ir3_block *b = ctx->block;
|
||||
struct ir3_instruction *stib, *offset;
|
||||
struct ir3_instruction * const *value = ir3_get_src(ctx, &intr->src[3]);
|
||||
struct ir3_instruction * const *coords = ir3_get_src(ctx, &intr->src[1]);
|
||||
struct ir3_instruction * ibo = ir3_image_to_ibo(ctx, intr->src[0]);
|
||||
unsigned ncoords = ir3_get_image_coords(intr, NULL);
|
||||
unsigned ncomp = ir3_get_num_components_for_image_format(nir_intrinsic_format(intr));
|
||||
struct ir3_block *b = ctx->block;
|
||||
struct ir3_instruction *stib, *offset;
|
||||
struct ir3_instruction *const *value = ir3_get_src(ctx, &intr->src[3]);
|
||||
struct ir3_instruction *const *coords = ir3_get_src(ctx, &intr->src[1]);
|
||||
struct ir3_instruction *ibo = ir3_image_to_ibo(ctx, intr->src[0]);
|
||||
unsigned ncoords = ir3_get_image_coords(intr, NULL);
|
||||
unsigned ncomp =
|
||||
ir3_get_num_components_for_image_format(nir_intrinsic_format(intr));
|
||||
|
||||
/* src0 is value
|
||||
* src1 is coords
|
||||
* src2 is 64b byte offset
|
||||
*/
|
||||
/* src0 is value
|
||||
* src1 is coords
|
||||
* src2 is 64b byte offset
|
||||
*/
|
||||
|
||||
offset = get_image_offset(ctx, intr, coords, true);
|
||||
offset = get_image_offset(ctx, intr, coords, true);
|
||||
|
||||
/* NOTE: stib seems to take byte offset, but stgb.typed can be used
|
||||
* too and takes a dword offset.. not quite sure yet why blob uses
|
||||
* one over the other in various cases.
|
||||
*/
|
||||
/* NOTE: stib seems to take byte offset, but stgb.typed can be used
|
||||
* too and takes a dword offset.. not quite sure yet why blob uses
|
||||
* one over the other in various cases.
|
||||
*/
|
||||
|
||||
stib = ir3_STIB(b, ibo, 0,
|
||||
ir3_create_collect(ctx, value, ncomp), 0,
|
||||
ir3_create_collect(ctx, coords, ncoords), 0,
|
||||
offset, 0);
|
||||
stib->cat6.iim_val = ncomp;
|
||||
stib->cat6.d = ncoords;
|
||||
stib->cat6.type = ir3_get_type_for_image_intrinsic(intr);
|
||||
stib->cat6.typed = true;
|
||||
stib->barrier_class = IR3_BARRIER_IMAGE_W;
|
||||
stib->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
|
||||
stib = ir3_STIB(b, ibo, 0, ir3_create_collect(ctx, value, ncomp), 0,
|
||||
ir3_create_collect(ctx, coords, ncoords), 0, offset, 0);
|
||||
stib->cat6.iim_val = ncomp;
|
||||
stib->cat6.d = ncoords;
|
||||
stib->cat6.type = ir3_get_type_for_image_intrinsic(intr);
|
||||
stib->cat6.typed = true;
|
||||
stib->barrier_class = IR3_BARRIER_IMAGE_W;
|
||||
stib->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
|
||||
|
||||
array_insert(b, b->keeps, stib);
|
||||
array_insert(b, b->keeps, stib);
|
||||
}
|
||||
|
||||
/* src[] = { deref, coord, sample_index, value, compare }. const_index[] = {} */
|
||||
static struct ir3_instruction *
|
||||
emit_intrinsic_atomic_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)
|
||||
{
|
||||
struct ir3_block *b = ctx->block;
|
||||
struct ir3_instruction *atomic, *src0, *src1, *src2;
|
||||
struct ir3_instruction * const *coords = ir3_get_src(ctx, &intr->src[1]);
|
||||
struct ir3_instruction * image = ir3_image_to_ibo(ctx, intr->src[0]);
|
||||
unsigned ncoords = ir3_get_image_coords(intr, NULL);
|
||||
struct ir3_block *b = ctx->block;
|
||||
struct ir3_instruction *atomic, *src0, *src1, *src2;
|
||||
struct ir3_instruction *const *coords = ir3_get_src(ctx, &intr->src[1]);
|
||||
struct ir3_instruction *image = ir3_image_to_ibo(ctx, intr->src[0]);
|
||||
unsigned ncoords = ir3_get_image_coords(intr, NULL);
|
||||
|
||||
/* src0 is value (or uvec2(value, compare))
|
||||
* src1 is coords
|
||||
* src2 is 64b byte offset
|
||||
*/
|
||||
src0 = ir3_get_src(ctx, &intr->src[3])[0];
|
||||
src1 = ir3_create_collect(ctx, coords, ncoords);
|
||||
src2 = get_image_offset(ctx, intr, coords, false);
|
||||
/* src0 is value (or uvec2(value, compare))
|
||||
* src1 is coords
|
||||
* src2 is 64b byte offset
|
||||
*/
|
||||
src0 = ir3_get_src(ctx, &intr->src[3])[0];
|
||||
src1 = ir3_create_collect(ctx, coords, ncoords);
|
||||
src2 = get_image_offset(ctx, intr, coords, false);
|
||||
|
||||
switch (intr->intrinsic) {
|
||||
case nir_intrinsic_image_atomic_add:
|
||||
atomic = ir3_ATOMIC_ADD_G(b, image, 0, src0, 0, src1, 0, src2, 0);
|
||||
break;
|
||||
case nir_intrinsic_image_atomic_imin:
|
||||
case nir_intrinsic_image_atomic_umin:
|
||||
atomic = ir3_ATOMIC_MIN_G(b, image, 0, src0, 0, src1, 0, src2, 0);
|
||||
break;
|
||||
case nir_intrinsic_image_atomic_imax:
|
||||
case nir_intrinsic_image_atomic_umax:
|
||||
atomic = ir3_ATOMIC_MAX_G(b, image, 0, src0, 0, src1, 0, src2, 0);
|
||||
break;
|
||||
case nir_intrinsic_image_atomic_and:
|
||||
atomic = ir3_ATOMIC_AND_G(b, image, 0, src0, 0, src1, 0, src2, 0);
|
||||
break;
|
||||
case nir_intrinsic_image_atomic_or:
|
||||
atomic = ir3_ATOMIC_OR_G(b, image, 0, src0, 0, src1, 0, src2, 0);
|
||||
break;
|
||||
case nir_intrinsic_image_atomic_xor:
|
||||
atomic = ir3_ATOMIC_XOR_G(b, image, 0, src0, 0, src1, 0, src2, 0);
|
||||
break;
|
||||
case nir_intrinsic_image_atomic_exchange:
|
||||
atomic = ir3_ATOMIC_XCHG_G(b, image, 0, src0, 0, src1, 0, src2, 0);
|
||||
break;
|
||||
case nir_intrinsic_image_atomic_comp_swap:
|
||||
/* for cmpxchg, src0 is [ui]vec2(data, compare): */
|
||||
src0 = ir3_collect(ctx, ir3_get_src(ctx, &intr->src[4])[0], src0);
|
||||
atomic = ir3_ATOMIC_CMPXCHG_G(b, image, 0, src0, 0, src1, 0, src2, 0);
|
||||
break;
|
||||
default:
|
||||
unreachable("boo");
|
||||
}
|
||||
switch (intr->intrinsic) {
|
||||
case nir_intrinsic_image_atomic_add:
|
||||
atomic = ir3_ATOMIC_ADD_G(b, image, 0, src0, 0, src1, 0, src2, 0);
|
||||
break;
|
||||
case nir_intrinsic_image_atomic_imin:
|
||||
case nir_intrinsic_image_atomic_umin:
|
||||
atomic = ir3_ATOMIC_MIN_G(b, image, 0, src0, 0, src1, 0, src2, 0);
|
||||
break;
|
||||
case nir_intrinsic_image_atomic_imax:
|
||||
case nir_intrinsic_image_atomic_umax:
|
||||
atomic = ir3_ATOMIC_MAX_G(b, image, 0, src0, 0, src1, 0, src2, 0);
|
||||
break;
|
||||
case nir_intrinsic_image_atomic_and:
|
||||
atomic = ir3_ATOMIC_AND_G(b, image, 0, src0, 0, src1, 0, src2, 0);
|
||||
break;
|
||||
case nir_intrinsic_image_atomic_or:
|
||||
atomic = ir3_ATOMIC_OR_G(b, image, 0, src0, 0, src1, 0, src2, 0);
|
||||
break;
|
||||
case nir_intrinsic_image_atomic_xor:
|
||||
atomic = ir3_ATOMIC_XOR_G(b, image, 0, src0, 0, src1, 0, src2, 0);
|
||||
break;
|
||||
case nir_intrinsic_image_atomic_exchange:
|
||||
atomic = ir3_ATOMIC_XCHG_G(b, image, 0, src0, 0, src1, 0, src2, 0);
|
||||
break;
|
||||
case nir_intrinsic_image_atomic_comp_swap:
|
||||
/* for cmpxchg, src0 is [ui]vec2(data, compare): */
|
||||
src0 = ir3_collect(ctx, ir3_get_src(ctx, &intr->src[4])[0], src0);
|
||||
atomic = ir3_ATOMIC_CMPXCHG_G(b, image, 0, src0, 0, src1, 0, src2, 0);
|
||||
break;
|
||||
default:
|
||||
unreachable("boo");
|
||||
}
|
||||
|
||||
atomic->cat6.iim_val = 1;
|
||||
atomic->cat6.d = ncoords;
|
||||
atomic->cat6.type = ir3_get_type_for_image_intrinsic(intr);
|
||||
atomic->cat6.typed = true;
|
||||
atomic->barrier_class = IR3_BARRIER_IMAGE_W;
|
||||
atomic->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
|
||||
atomic->cat6.iim_val = 1;
|
||||
atomic->cat6.d = ncoords;
|
||||
atomic->cat6.type = ir3_get_type_for_image_intrinsic(intr);
|
||||
atomic->cat6.typed = true;
|
||||
atomic->barrier_class = IR3_BARRIER_IMAGE_W;
|
||||
atomic->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
|
||||
|
||||
/* even if nothing consume the result, we can't DCE the instruction: */
|
||||
array_insert(b, b->keeps, atomic);
|
||||
/* even if nothing consume the result, we can't DCE the instruction: */
|
||||
array_insert(b, b->keeps, atomic);
|
||||
|
||||
return atomic;
|
||||
return atomic;
|
||||
}
|
||||
|
||||
const struct ir3_context_funcs ir3_a4xx_funcs = {
|
||||
.emit_intrinsic_load_ssbo = emit_intrinsic_load_ssbo,
|
||||
.emit_intrinsic_store_ssbo = emit_intrinsic_store_ssbo,
|
||||
.emit_intrinsic_atomic_ssbo = emit_intrinsic_atomic_ssbo,
|
||||
.emit_intrinsic_store_image = emit_intrinsic_store_image,
|
||||
.emit_intrinsic_atomic_image = emit_intrinsic_atomic_image,
|
||||
.emit_intrinsic_image_size = emit_intrinsic_image_size_tex,
|
||||
.emit_intrinsic_load_global_ir3 = NULL,
|
||||
.emit_intrinsic_store_global_ir3 = NULL,
|
||||
.emit_intrinsic_load_ssbo = emit_intrinsic_load_ssbo,
|
||||
.emit_intrinsic_store_ssbo = emit_intrinsic_store_ssbo,
|
||||
.emit_intrinsic_atomic_ssbo = emit_intrinsic_atomic_ssbo,
|
||||
.emit_intrinsic_store_image = emit_intrinsic_store_image,
|
||||
.emit_intrinsic_atomic_image = emit_intrinsic_atomic_image,
|
||||
.emit_intrinsic_image_size = emit_intrinsic_image_size_tex,
|
||||
.emit_intrinsic_load_global_ir3 = NULL,
|
||||
.emit_intrinsic_store_global_ir3 = NULL,
|
||||
};
|
||||
|
|
|
@ -40,53 +40,53 @@
|
|||
/* src[] = { buffer_index, offset }. No const_index */
|
||||
static void
|
||||
emit_intrinsic_load_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
|
||||
struct ir3_instruction **dst)
|
||||
struct ir3_instruction **dst)
|
||||
{
|
||||
struct ir3_block *b = ctx->block;
|
||||
struct ir3_instruction *offset;
|
||||
struct ir3_instruction *ldib;
|
||||
struct ir3_block *b = ctx->block;
|
||||
struct ir3_instruction *offset;
|
||||
struct ir3_instruction *ldib;
|
||||
|
||||
offset = ir3_get_src(ctx, &intr->src[2])[0];
|
||||
offset = ir3_get_src(ctx, &intr->src[2])[0];
|
||||
|
||||
ldib = ir3_LDIB(b, ir3_ssbo_to_ibo(ctx, intr->src[0]), 0, offset, 0);
|
||||
ldib->dsts[0]->wrmask = MASK(intr->num_components);
|
||||
ldib->cat6.iim_val = intr->num_components;
|
||||
ldib->cat6.d = 1;
|
||||
ldib->cat6.type = intr->dest.ssa.bit_size == 16 ? TYPE_U16 : TYPE_U32;
|
||||
ldib->barrier_class = IR3_BARRIER_BUFFER_R;
|
||||
ldib->barrier_conflict = IR3_BARRIER_BUFFER_W;
|
||||
ir3_handle_bindless_cat6(ldib, intr->src[0]);
|
||||
ir3_handle_nonuniform(ldib, intr);
|
||||
ldib = ir3_LDIB(b, ir3_ssbo_to_ibo(ctx, intr->src[0]), 0, offset, 0);
|
||||
ldib->dsts[0]->wrmask = MASK(intr->num_components);
|
||||
ldib->cat6.iim_val = intr->num_components;
|
||||
ldib->cat6.d = 1;
|
||||
ldib->cat6.type = intr->dest.ssa.bit_size == 16 ? TYPE_U16 : TYPE_U32;
|
||||
ldib->barrier_class = IR3_BARRIER_BUFFER_R;
|
||||
ldib->barrier_conflict = IR3_BARRIER_BUFFER_W;
|
||||
ir3_handle_bindless_cat6(ldib, intr->src[0]);
|
||||
ir3_handle_nonuniform(ldib, intr);
|
||||
|
||||
ir3_split_dest(b, dst, ldib, 0, intr->num_components);
|
||||
ir3_split_dest(b, dst, ldib, 0, intr->num_components);
|
||||
}
|
||||
|
||||
/* src[] = { value, block_index, offset }. const_index[] = { write_mask } */
|
||||
static void
|
||||
emit_intrinsic_store_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
|
||||
{
|
||||
struct ir3_block *b = ctx->block;
|
||||
struct ir3_instruction *stib, *val, *offset;
|
||||
unsigned wrmask = nir_intrinsic_write_mask(intr);
|
||||
unsigned ncomp = ffs(~wrmask) - 1;
|
||||
struct ir3_block *b = ctx->block;
|
||||
struct ir3_instruction *stib, *val, *offset;
|
||||
unsigned wrmask = nir_intrinsic_write_mask(intr);
|
||||
unsigned ncomp = ffs(~wrmask) - 1;
|
||||
|
||||
assert(wrmask == BITFIELD_MASK(intr->num_components));
|
||||
assert(wrmask == BITFIELD_MASK(intr->num_components));
|
||||
|
||||
/* src0 is offset, src1 is value:
|
||||
*/
|
||||
val = ir3_create_collect(ctx, ir3_get_src(ctx, &intr->src[0]), ncomp);
|
||||
offset = ir3_get_src(ctx, &intr->src[3])[0];
|
||||
/* src0 is offset, src1 is value:
|
||||
*/
|
||||
val = ir3_create_collect(ctx, ir3_get_src(ctx, &intr->src[0]), ncomp);
|
||||
offset = ir3_get_src(ctx, &intr->src[3])[0];
|
||||
|
||||
stib = ir3_STIB(b, ir3_ssbo_to_ibo(ctx, intr->src[1]), 0, offset, 0, val, 0);
|
||||
stib->cat6.iim_val = ncomp;
|
||||
stib->cat6.d = 1;
|
||||
stib->cat6.type = intr->src[0].ssa->bit_size == 16 ? TYPE_U16 : TYPE_U32;
|
||||
stib->barrier_class = IR3_BARRIER_BUFFER_W;
|
||||
stib->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
|
||||
ir3_handle_bindless_cat6(stib, intr->src[1]);
|
||||
ir3_handle_nonuniform(stib, intr);
|
||||
stib = ir3_STIB(b, ir3_ssbo_to_ibo(ctx, intr->src[1]), 0, offset, 0, val, 0);
|
||||
stib->cat6.iim_val = ncomp;
|
||||
stib->cat6.d = 1;
|
||||
stib->cat6.type = intr->src[0].ssa->bit_size == 16 ? TYPE_U16 : TYPE_U32;
|
||||
stib->barrier_class = IR3_BARRIER_BUFFER_W;
|
||||
stib->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
|
||||
ir3_handle_bindless_cat6(stib, intr->src[1]);
|
||||
ir3_handle_nonuniform(stib, intr);
|
||||
|
||||
array_insert(b, b->keeps, stib);
|
||||
array_insert(b, b->keeps, stib);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -109,329 +109,321 @@ emit_intrinsic_store_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
|
|||
static struct ir3_instruction *
|
||||
emit_intrinsic_atomic_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
|
||||
{
|
||||
struct ir3_block *b = ctx->block;
|
||||
struct ir3_instruction *atomic, *ibo, *src0, *src1, *data, *dummy;
|
||||
type_t type = TYPE_U32;
|
||||
struct ir3_block *b = ctx->block;
|
||||
struct ir3_instruction *atomic, *ibo, *src0, *src1, *data, *dummy;
|
||||
type_t type = TYPE_U32;
|
||||
|
||||
ibo = ir3_ssbo_to_ibo(ctx, intr->src[0]);
|
||||
ibo = ir3_ssbo_to_ibo(ctx, intr->src[0]);
|
||||
|
||||
data = ir3_get_src(ctx, &intr->src[2])[0];
|
||||
data = ir3_get_src(ctx, &intr->src[2])[0];
|
||||
|
||||
/* So this gets a bit creative:
|
||||
*
|
||||
* src0 - vecN offset/coords
|
||||
* src1.x - is actually destination register
|
||||
* src1.y - is 'data' except for cmpxchg where src2.y is 'compare'
|
||||
* src1.z - is 'data' for cmpxchg
|
||||
*
|
||||
* The combining src and dest kinda doesn't work out so well with how
|
||||
* scheduling and RA work. So we create a dummy src2 which is tied to the
|
||||
* destination in RA (i.e. must be allocated to the same vec2/vec3
|
||||
* register) and then immediately extract the first component.
|
||||
*
|
||||
* Note that nir already multiplies the offset by four
|
||||
*/
|
||||
dummy = create_immed(b, 0);
|
||||
/* So this gets a bit creative:
|
||||
*
|
||||
* src0 - vecN offset/coords
|
||||
* src1.x - is actually destination register
|
||||
* src1.y - is 'data' except for cmpxchg where src2.y is 'compare'
|
||||
* src1.z - is 'data' for cmpxchg
|
||||
*
|
||||
* The combining src and dest kinda doesn't work out so well with how
|
||||
* scheduling and RA work. So we create a dummy src2 which is tied to the
|
||||
* destination in RA (i.e. must be allocated to the same vec2/vec3
|
||||
* register) and then immediately extract the first component.
|
||||
*
|
||||
* Note that nir already multiplies the offset by four
|
||||
*/
|
||||
dummy = create_immed(b, 0);
|
||||
|
||||
if (intr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap_ir3) {
|
||||
src0 = ir3_get_src(ctx, &intr->src[4])[0];
|
||||
struct ir3_instruction *compare = ir3_get_src(ctx, &intr->src[3])[0];
|
||||
src1 = ir3_collect(ctx, dummy, compare, data);
|
||||
} else {
|
||||
src0 = ir3_get_src(ctx, &intr->src[3])[0];
|
||||
src1 = ir3_collect(ctx, dummy, data);
|
||||
}
|
||||
if (intr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap_ir3) {
|
||||
src0 = ir3_get_src(ctx, &intr->src[4])[0];
|
||||
struct ir3_instruction *compare = ir3_get_src(ctx, &intr->src[3])[0];
|
||||
src1 = ir3_collect(ctx, dummy, compare, data);
|
||||
} else {
|
||||
src0 = ir3_get_src(ctx, &intr->src[3])[0];
|
||||
src1 = ir3_collect(ctx, dummy, data);
|
||||
}
|
||||
|
||||
switch (intr->intrinsic) {
|
||||
case nir_intrinsic_ssbo_atomic_add_ir3:
|
||||
atomic = ir3_ATOMIC_ADD_G(b, ibo, 0, src0, 0, src1, 0);
|
||||
break;
|
||||
case nir_intrinsic_ssbo_atomic_imin_ir3:
|
||||
atomic = ir3_ATOMIC_MIN_G(b, ibo, 0, src0, 0, src1, 0);
|
||||
type = TYPE_S32;
|
||||
break;
|
||||
case nir_intrinsic_ssbo_atomic_umin_ir3:
|
||||
atomic = ir3_ATOMIC_MIN_G(b, ibo, 0, src0, 0, src1, 0);
|
||||
break;
|
||||
case nir_intrinsic_ssbo_atomic_imax_ir3:
|
||||
atomic = ir3_ATOMIC_MAX_G(b, ibo, 0, src0, 0, src1, 0);
|
||||
type = TYPE_S32;
|
||||
break;
|
||||
case nir_intrinsic_ssbo_atomic_umax_ir3:
|
||||
atomic = ir3_ATOMIC_MAX_G(b, ibo, 0, src0, 0, src1, 0);
|
||||
break;
|
||||
case nir_intrinsic_ssbo_atomic_and_ir3:
|
||||
atomic = ir3_ATOMIC_AND_G(b, ibo, 0, src0, 0, src1, 0);
|
||||
break;
|
||||
case nir_intrinsic_ssbo_atomic_or_ir3:
|
||||
atomic = ir3_ATOMIC_OR_G(b, ibo, 0, src0, 0, src1, 0);
|
||||
break;
|
||||
case nir_intrinsic_ssbo_atomic_xor_ir3:
|
||||
atomic = ir3_ATOMIC_XOR_G(b, ibo, 0, src0, 0, src1, 0);
|
||||
break;
|
||||
case nir_intrinsic_ssbo_atomic_exchange_ir3:
|
||||
atomic = ir3_ATOMIC_XCHG_G(b, ibo, 0, src0, 0, src1, 0);
|
||||
break;
|
||||
case nir_intrinsic_ssbo_atomic_comp_swap_ir3:
|
||||
atomic = ir3_ATOMIC_CMPXCHG_G(b, ibo, 0, src0, 0, src1, 0);
|
||||
break;
|
||||
default:
|
||||
unreachable("boo");
|
||||
}
|
||||
switch (intr->intrinsic) {
|
||||
case nir_intrinsic_ssbo_atomic_add_ir3:
|
||||
atomic = ir3_ATOMIC_ADD_G(b, ibo, 0, src0, 0, src1, 0);
|
||||
break;
|
||||
case nir_intrinsic_ssbo_atomic_imin_ir3:
|
||||
atomic = ir3_ATOMIC_MIN_G(b, ibo, 0, src0, 0, src1, 0);
|
||||
type = TYPE_S32;
|
||||
break;
|
||||
case nir_intrinsic_ssbo_atomic_umin_ir3:
|
||||
atomic = ir3_ATOMIC_MIN_G(b, ibo, 0, src0, 0, src1, 0);
|
||||
break;
|
||||
case nir_intrinsic_ssbo_atomic_imax_ir3:
|
||||
atomic = ir3_ATOMIC_MAX_G(b, ibo, 0, src0, 0, src1, 0);
|
||||
type = TYPE_S32;
|
||||
break;
|
||||
case nir_intrinsic_ssbo_atomic_umax_ir3:
|
||||
atomic = ir3_ATOMIC_MAX_G(b, ibo, 0, src0, 0, src1, 0);
|
||||
break;
|
||||
case nir_intrinsic_ssbo_atomic_and_ir3:
|
||||
atomic = ir3_ATOMIC_AND_G(b, ibo, 0, src0, 0, src1, 0);
|
||||
break;
|
||||
case nir_intrinsic_ssbo_atomic_or_ir3:
|
||||
atomic = ir3_ATOMIC_OR_G(b, ibo, 0, src0, 0, src1, 0);
|
||||
break;
|
||||
case nir_intrinsic_ssbo_atomic_xor_ir3:
|
||||
atomic = ir3_ATOMIC_XOR_G(b, ibo, 0, src0, 0, src1, 0);
|
||||
break;
|
||||
case nir_intrinsic_ssbo_atomic_exchange_ir3:
|
||||
atomic = ir3_ATOMIC_XCHG_G(b, ibo, 0, src0, 0, src1, 0);
|
||||
break;
|
||||
case nir_intrinsic_ssbo_atomic_comp_swap_ir3:
|
||||
atomic = ir3_ATOMIC_CMPXCHG_G(b, ibo, 0, src0, 0, src1, 0);
|
||||
break;
|
||||
default:
|
||||
unreachable("boo");
|
||||
}
|
||||
|
||||
atomic->cat6.iim_val = 1;
|
||||
atomic->cat6.d = 1;
|
||||
atomic->cat6.type = type;
|
||||
atomic->barrier_class = IR3_BARRIER_BUFFER_W;
|
||||
atomic->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
|
||||
ir3_handle_bindless_cat6(atomic, intr->src[0]);
|
||||
atomic->cat6.iim_val = 1;
|
||||
atomic->cat6.d = 1;
|
||||
atomic->cat6.type = type;
|
||||
atomic->barrier_class = IR3_BARRIER_BUFFER_W;
|
||||
atomic->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
|
||||
ir3_handle_bindless_cat6(atomic, intr->src[0]);
|
||||
|
||||
/* even if nothing consume the result, we can't DCE the instruction: */
|
||||
array_insert(b, b->keeps, atomic);
|
||||
/* even if nothing consume the result, we can't DCE the instruction: */
|
||||
array_insert(b, b->keeps, atomic);
|
||||
|
||||
atomic->dsts[0]->wrmask = src1->dsts[0]->wrmask;
|
||||
ir3_reg_tie(atomic->dsts[0], atomic->srcs[2]);
|
||||
struct ir3_instruction *split;
|
||||
ir3_split_dest(b, &split, atomic, 0, 1);
|
||||
return split;
|
||||
atomic->dsts[0]->wrmask = src1->dsts[0]->wrmask;
|
||||
ir3_reg_tie(atomic->dsts[0], atomic->srcs[2]);
|
||||
struct ir3_instruction *split;
|
||||
ir3_split_dest(b, &split, atomic, 0, 1);
|
||||
return split;
|
||||
}
|
||||
|
||||
/* src[] = { deref, coord, sample_index }. const_index[] = {} */
|
||||
static void
|
||||
emit_intrinsic_load_image(struct ir3_context *ctx, nir_intrinsic_instr *intr,
|
||||
struct ir3_instruction **dst)
|
||||
struct ir3_instruction **dst)
|
||||
{
|
||||
struct ir3_block *b = ctx->block;
|
||||
struct ir3_instruction *ldib;
|
||||
struct ir3_instruction * const *coords = ir3_get_src(ctx, &intr->src[1]);
|
||||
unsigned ncoords = ir3_get_image_coords(intr, NULL);
|
||||
struct ir3_block *b = ctx->block;
|
||||
struct ir3_instruction *ldib;
|
||||
struct ir3_instruction *const *coords = ir3_get_src(ctx, &intr->src[1]);
|
||||
unsigned ncoords = ir3_get_image_coords(intr, NULL);
|
||||
|
||||
ldib = ir3_LDIB(b, ir3_image_to_ibo(ctx, intr->src[0]), 0,
|
||||
ir3_create_collect(ctx, coords, ncoords), 0);
|
||||
ldib->dsts[0]->wrmask = MASK(intr->num_components);
|
||||
ldib->cat6.iim_val = intr->num_components;
|
||||
ldib->cat6.d = ncoords;
|
||||
ldib->cat6.type = ir3_get_type_for_image_intrinsic(intr);
|
||||
ldib->cat6.typed = true;
|
||||
ldib->barrier_class = IR3_BARRIER_IMAGE_R;
|
||||
ldib->barrier_conflict = IR3_BARRIER_IMAGE_W;
|
||||
ir3_handle_bindless_cat6(ldib, intr->src[0]);
|
||||
ir3_handle_nonuniform(ldib, intr);
|
||||
ldib = ir3_LDIB(b, ir3_image_to_ibo(ctx, intr->src[0]), 0,
|
||||
ir3_create_collect(ctx, coords, ncoords), 0);
|
||||
ldib->dsts[0]->wrmask = MASK(intr->num_components);
|
||||
ldib->cat6.iim_val = intr->num_components;
|
||||
ldib->cat6.d = ncoords;
|
||||
ldib->cat6.type = ir3_get_type_for_image_intrinsic(intr);
|
||||
ldib->cat6.typed = true;
|
||||
ldib->barrier_class = IR3_BARRIER_IMAGE_R;
|
||||
ldib->barrier_conflict = IR3_BARRIER_IMAGE_W;
|
||||
ir3_handle_bindless_cat6(ldib, intr->src[0]);
|
||||
ir3_handle_nonuniform(ldib, intr);
|
||||
|
||||
ir3_split_dest(b, dst, ldib, 0, intr->num_components);
|
||||
ir3_split_dest(b, dst, ldib, 0, intr->num_components);
|
||||
}
|
||||
|
||||
/* src[] = { deref, coord, sample_index, value }. const_index[] = {} */
|
||||
static void
|
||||
emit_intrinsic_store_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)
|
||||
{
|
||||
struct ir3_block *b = ctx->block;
|
||||
struct ir3_instruction *stib;
|
||||
struct ir3_instruction * const *value = ir3_get_src(ctx, &intr->src[3]);
|
||||
struct ir3_instruction * const *coords = ir3_get_src(ctx, &intr->src[1]);
|
||||
unsigned ncoords = ir3_get_image_coords(intr, NULL);
|
||||
enum pipe_format format = nir_intrinsic_format(intr);
|
||||
unsigned ncomp = ir3_get_num_components_for_image_format(format);
|
||||
struct ir3_block *b = ctx->block;
|
||||
struct ir3_instruction *stib;
|
||||
struct ir3_instruction *const *value = ir3_get_src(ctx, &intr->src[3]);
|
||||
struct ir3_instruction *const *coords = ir3_get_src(ctx, &intr->src[1]);
|
||||
unsigned ncoords = ir3_get_image_coords(intr, NULL);
|
||||
enum pipe_format format = nir_intrinsic_format(intr);
|
||||
unsigned ncomp = ir3_get_num_components_for_image_format(format);
|
||||
|
||||
/* src0 is offset, src1 is value:
|
||||
*/
|
||||
stib = ir3_STIB(b, ir3_image_to_ibo(ctx, intr->src[0]), 0,
|
||||
ir3_create_collect(ctx, coords, ncoords), 0,
|
||||
ir3_create_collect(ctx, value, ncomp), 0);
|
||||
stib->cat6.iim_val = ncomp;
|
||||
stib->cat6.d = ncoords;
|
||||
stib->cat6.type = ir3_get_type_for_image_intrinsic(intr);
|
||||
stib->cat6.typed = true;
|
||||
stib->barrier_class = IR3_BARRIER_IMAGE_W;
|
||||
stib->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
|
||||
ir3_handle_bindless_cat6(stib, intr->src[0]);
|
||||
ir3_handle_nonuniform(stib, intr);
|
||||
/* src0 is offset, src1 is value:
|
||||
*/
|
||||
stib = ir3_STIB(b, ir3_image_to_ibo(ctx, intr->src[0]), 0,
|
||||
ir3_create_collect(ctx, coords, ncoords), 0,
|
||||
ir3_create_collect(ctx, value, ncomp), 0);
|
||||
stib->cat6.iim_val = ncomp;
|
||||
stib->cat6.d = ncoords;
|
||||
stib->cat6.type = ir3_get_type_for_image_intrinsic(intr);
|
||||
stib->cat6.typed = true;
|
||||
stib->barrier_class = IR3_BARRIER_IMAGE_W;
|
||||
stib->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
|
||||
ir3_handle_bindless_cat6(stib, intr->src[0]);
|
||||
ir3_handle_nonuniform(stib, intr);
|
||||
|
||||
array_insert(b, b->keeps, stib);
|
||||
array_insert(b, b->keeps, stib);
|
||||
}
|
||||
|
||||
/* src[] = { deref, coord, sample_index, value, compare }. const_index[] = {} */
|
||||
static struct ir3_instruction *
|
||||
emit_intrinsic_atomic_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)
|
||||
{
|
||||
struct ir3_block *b = ctx->block;
|
||||
struct ir3_instruction *atomic, *ibo, *src0, *src1, *dummy;
|
||||
struct ir3_instruction * const *coords = ir3_get_src(ctx, &intr->src[1]);
|
||||
struct ir3_instruction *value = ir3_get_src(ctx, &intr->src[3])[0];
|
||||
unsigned ncoords = ir3_get_image_coords(intr, NULL);
|
||||
struct ir3_block *b = ctx->block;
|
||||
struct ir3_instruction *atomic, *ibo, *src0, *src1, *dummy;
|
||||
struct ir3_instruction *const *coords = ir3_get_src(ctx, &intr->src[1]);
|
||||
struct ir3_instruction *value = ir3_get_src(ctx, &intr->src[3])[0];
|
||||
unsigned ncoords = ir3_get_image_coords(intr, NULL);
|
||||
|
||||
ibo = ir3_image_to_ibo(ctx, intr->src[0]);
|
||||
ibo = ir3_image_to_ibo(ctx, intr->src[0]);
|
||||
|
||||
/* So this gets a bit creative:
|
||||
*
|
||||
* src0 - vecN offset/coords
|
||||
* src1.x - is actually destination register
|
||||
* src1.y - is 'value' except for cmpxchg where src2.y is 'compare'
|
||||
* src1.z - is 'value' for cmpxchg
|
||||
*
|
||||
* The combining src and dest kinda doesn't work out so well with how
|
||||
* scheduling and RA work. So we create a dummy src2 which is tied to the
|
||||
* destination in RA (i.e. must be allocated to the same vec2/vec3
|
||||
* register) and then immediately extract the first component.
|
||||
*/
|
||||
dummy = create_immed(b, 0);
|
||||
src0 = ir3_create_collect(ctx, coords, ncoords);
|
||||
/* So this gets a bit creative:
|
||||
*
|
||||
* src0 - vecN offset/coords
|
||||
* src1.x - is actually destination register
|
||||
* src1.y - is 'value' except for cmpxchg where src2.y is 'compare'
|
||||
* src1.z - is 'value' for cmpxchg
|
||||
*
|
||||
* The combining src and dest kinda doesn't work out so well with how
|
||||
* scheduling and RA work. So we create a dummy src2 which is tied to the
|
||||
* destination in RA (i.e. must be allocated to the same vec2/vec3
|
||||
* register) and then immediately extract the first component.
|
||||
*/
|
||||
dummy = create_immed(b, 0);
|
||||
src0 = ir3_create_collect(ctx, coords, ncoords);
|
||||
|
||||
if (intr->intrinsic == nir_intrinsic_image_atomic_comp_swap ||
|
||||
intr->intrinsic == nir_intrinsic_bindless_image_atomic_comp_swap) {
|
||||
struct ir3_instruction *compare = ir3_get_src(ctx, &intr->src[4])[0];
|
||||
src1 = ir3_collect(ctx, dummy, compare, value);
|
||||
} else {
|
||||
src1 = ir3_collect(ctx, dummy, value);
|
||||
}
|
||||
if (intr->intrinsic == nir_intrinsic_image_atomic_comp_swap ||
|
||||
intr->intrinsic == nir_intrinsic_bindless_image_atomic_comp_swap) {
|
||||
struct ir3_instruction *compare = ir3_get_src(ctx, &intr->src[4])[0];
|
||||
src1 = ir3_collect(ctx, dummy, compare, value);
|
||||
} else {
|
||||
src1 = ir3_collect(ctx, dummy, value);
|
||||
}
|
||||
|
||||
switch (intr->intrinsic) {
|
||||
case nir_intrinsic_image_atomic_add:
|
||||
case nir_intrinsic_bindless_image_atomic_add:
|
||||
atomic = ir3_ATOMIC_ADD_G(b, ibo, 0, src0, 0, src1, 0);
|
||||
break;
|
||||
case nir_intrinsic_image_atomic_imin:
|
||||
case nir_intrinsic_image_atomic_umin:
|
||||
case nir_intrinsic_bindless_image_atomic_imin:
|
||||
case nir_intrinsic_bindless_image_atomic_umin:
|
||||
atomic = ir3_ATOMIC_MIN_G(b, ibo, 0, src0, 0, src1, 0);
|
||||
break;
|
||||
case nir_intrinsic_image_atomic_imax:
|
||||
case nir_intrinsic_image_atomic_umax:
|
||||
case nir_intrinsic_bindless_image_atomic_imax:
|
||||
case nir_intrinsic_bindless_image_atomic_umax:
|
||||
atomic = ir3_ATOMIC_MAX_G(b, ibo, 0, src0, 0, src1, 0);
|
||||
break;
|
||||
case nir_intrinsic_image_atomic_and:
|
||||
case nir_intrinsic_bindless_image_atomic_and:
|
||||
atomic = ir3_ATOMIC_AND_G(b, ibo, 0, src0, 0, src1, 0);
|
||||
break;
|
||||
case nir_intrinsic_image_atomic_or:
|
||||
case nir_intrinsic_bindless_image_atomic_or:
|
||||
atomic = ir3_ATOMIC_OR_G(b, ibo, 0, src0, 0, src1, 0);
|
||||
break;
|
||||
case nir_intrinsic_image_atomic_xor:
|
||||
case nir_intrinsic_bindless_image_atomic_xor:
|
||||
atomic = ir3_ATOMIC_XOR_G(b, ibo, 0, src0, 0, src1, 0);
|
||||
break;
|
||||
case nir_intrinsic_image_atomic_exchange:
|
||||
case nir_intrinsic_bindless_image_atomic_exchange:
|
||||
atomic = ir3_ATOMIC_XCHG_G(b, ibo, 0, src0, 0, src1, 0);
|
||||
break;
|
||||
case nir_intrinsic_image_atomic_comp_swap:
|
||||
case nir_intrinsic_bindless_image_atomic_comp_swap:
|
||||
atomic = ir3_ATOMIC_CMPXCHG_G(b, ibo, 0, src0, 0, src1, 0);
|
||||
break;
|
||||
default:
|
||||
unreachable("boo");
|
||||
}
|
||||
switch (intr->intrinsic) {
|
||||
case nir_intrinsic_image_atomic_add:
|
||||
case nir_intrinsic_bindless_image_atomic_add:
|
||||
atomic = ir3_ATOMIC_ADD_G(b, ibo, 0, src0, 0, src1, 0);
|
||||
break;
|
||||
case nir_intrinsic_image_atomic_imin:
|
||||
case nir_intrinsic_image_atomic_umin:
|
||||
case nir_intrinsic_bindless_image_atomic_imin:
|
||||
case nir_intrinsic_bindless_image_atomic_umin:
|
||||
atomic = ir3_ATOMIC_MIN_G(b, ibo, 0, src0, 0, src1, 0);
|
||||
break;
|
||||
case nir_intrinsic_image_atomic_imax:
|
||||
case nir_intrinsic_image_atomic_umax:
|
||||
case nir_intrinsic_bindless_image_atomic_imax:
|
||||
case nir_intrinsic_bindless_image_atomic_umax:
|
||||
atomic = ir3_ATOMIC_MAX_G(b, ibo, 0, src0, 0, src1, 0);
|
||||
break;
|
||||
case nir_intrinsic_image_atomic_and:
|
||||
case nir_intrinsic_bindless_image_atomic_and:
|
||||
atomic = ir3_ATOMIC_AND_G(b, ibo, 0, src0, 0, src1, 0);
|
||||
break;
|
||||
case nir_intrinsic_image_atomic_or:
|
||||
case nir_intrinsic_bindless_image_atomic_or:
|
||||
atomic = ir3_ATOMIC_OR_G(b, ibo, 0, src0, 0, src1, 0);
|
||||
break;
|
||||
case nir_intrinsic_image_atomic_xor:
|
||||
case nir_intrinsic_bindless_image_atomic_xor:
|
||||
atomic = ir3_ATOMIC_XOR_G(b, ibo, 0, src0, 0, src1, 0);
|
||||
break;
|
||||
case nir_intrinsic_image_atomic_exchange:
|
||||
case nir_intrinsic_bindless_image_atomic_exchange:
|
||||
atomic = ir3_ATOMIC_XCHG_G(b, ibo, 0, src0, 0, src1, 0);
|
||||
break;
|
||||
case nir_intrinsic_image_atomic_comp_swap:
|
||||
case nir_intrinsic_bindless_image_atomic_comp_swap:
|
||||
atomic = ir3_ATOMIC_CMPXCHG_G(b, ibo, 0, src0, 0, src1, 0);
|
||||
break;
|
||||
default:
|
||||
unreachable("boo");
|
||||
}
|
||||
|
||||
atomic->cat6.iim_val = 1;
|
||||
atomic->cat6.d = ncoords;
|
||||
atomic->cat6.type = ir3_get_type_for_image_intrinsic(intr);
|
||||
atomic->cat6.typed = true;
|
||||
atomic->barrier_class = IR3_BARRIER_IMAGE_W;
|
||||
atomic->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
|
||||
ir3_handle_bindless_cat6(atomic, intr->src[0]);
|
||||
atomic->cat6.iim_val = 1;
|
||||
atomic->cat6.d = ncoords;
|
||||
atomic->cat6.type = ir3_get_type_for_image_intrinsic(intr);
|
||||
atomic->cat6.typed = true;
|
||||
atomic->barrier_class = IR3_BARRIER_IMAGE_W;
|
||||
atomic->barrier_conflict = IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W;
|
||||
ir3_handle_bindless_cat6(atomic, intr->src[0]);
|
||||
|
||||
/* even if nothing consume the result, we can't DCE the instruction: */
|
||||
array_insert(b, b->keeps, atomic);
|
||||
/* even if nothing consume the result, we can't DCE the instruction: */
|
||||
array_insert(b, b->keeps, atomic);
|
||||
|
||||
atomic->dsts[0]->wrmask = src1->dsts[0]->wrmask;
|
||||
ir3_reg_tie(atomic->dsts[0], atomic->srcs[2]);
|
||||
struct ir3_instruction *split;
|
||||
ir3_split_dest(b, &split, atomic, 0, 1);
|
||||
return split;
|
||||
atomic->dsts[0]->wrmask = src1->dsts[0]->wrmask;
|
||||
ir3_reg_tie(atomic->dsts[0], atomic->srcs[2]);
|
||||
struct ir3_instruction *split;
|
||||
ir3_split_dest(b, &split, atomic, 0, 1);
|
||||
return split;
|
||||
}
|
||||
|
||||
static void
|
||||
emit_intrinsic_image_size(struct ir3_context *ctx, nir_intrinsic_instr *intr,
|
||||
struct ir3_instruction **dst)
|
||||
struct ir3_instruction **dst)
|
||||
{
|
||||
struct ir3_block *b = ctx->block;
|
||||
struct ir3_instruction *ibo = ir3_image_to_ibo(ctx, intr->src[0]);
|
||||
struct ir3_instruction *resinfo = ir3_RESINFO(b, ibo, 0);
|
||||
resinfo->cat6.iim_val = 1;
|
||||
resinfo->cat6.d = intr->num_components;
|
||||
resinfo->cat6.type = TYPE_U32;
|
||||
resinfo->cat6.typed = false;
|
||||
/* resinfo has no writemask and always writes out 3 components: */
|
||||
compile_assert(ctx, intr->num_components <= 3);
|
||||
resinfo->dsts[0]->wrmask = MASK(3);
|
||||
ir3_handle_bindless_cat6(resinfo, intr->src[0]);
|
||||
struct ir3_block *b = ctx->block;
|
||||
struct ir3_instruction *ibo = ir3_image_to_ibo(ctx, intr->src[0]);
|
||||
struct ir3_instruction *resinfo = ir3_RESINFO(b, ibo, 0);
|
||||
resinfo->cat6.iim_val = 1;
|
||||
resinfo->cat6.d = intr->num_components;
|
||||
resinfo->cat6.type = TYPE_U32;
|
||||
resinfo->cat6.typed = false;
|
||||
/* resinfo has no writemask and always writes out 3 components: */
|
||||
compile_assert(ctx, intr->num_components <= 3);
|
||||
resinfo->dsts[0]->wrmask = MASK(3);
|
||||
ir3_handle_bindless_cat6(resinfo, intr->src[0]);
|
||||
|
||||
ir3_split_dest(b, dst, resinfo, 0, intr->num_components);
|
||||
ir3_split_dest(b, dst, resinfo, 0, intr->num_components);
|
||||
}
|
||||
|
||||
static void
|
||||
emit_intrinsic_load_global_ir3(struct ir3_context *ctx, nir_intrinsic_instr *intr,
|
||||
struct ir3_instruction **dst)
|
||||
emit_intrinsic_load_global_ir3(struct ir3_context *ctx,
|
||||
nir_intrinsic_instr *intr,
|
||||
struct ir3_instruction **dst)
|
||||
{
|
||||
struct ir3_block *b = ctx->block;
|
||||
unsigned dest_components = nir_intrinsic_dest_components(intr);
|
||||
struct ir3_instruction *addr, *offset;
|
||||
struct ir3_block *b = ctx->block;
|
||||
unsigned dest_components = nir_intrinsic_dest_components(intr);
|
||||
struct ir3_instruction *addr, *offset;
|
||||
|
||||
addr = ir3_collect(ctx,
|
||||
ir3_get_src(ctx, &intr->src[0])[0],
|
||||
ir3_get_src(ctx, &intr->src[0])[1]);
|
||||
addr = ir3_collect(ctx, ir3_get_src(ctx, &intr->src[0])[0],
|
||||
ir3_get_src(ctx, &intr->src[0])[1]);
|
||||
|
||||
offset = ir3_get_src(ctx, &intr->src[1])[0];
|
||||
offset = ir3_get_src(ctx, &intr->src[1])[0];
|
||||
|
||||
struct ir3_instruction *load =
|
||||
ir3_LDG_A(b, addr, 0, offset, 0,
|
||||
create_immed(b, 0), 0,
|
||||
create_immed(b, 0), 0,
|
||||
create_immed(b, dest_components), 0);
|
||||
load->cat6.type = TYPE_U32;
|
||||
load->dsts[0]->wrmask = MASK(dest_components);
|
||||
struct ir3_instruction *load =
|
||||
ir3_LDG_A(b, addr, 0, offset, 0, create_immed(b, 0), 0,
|
||||
create_immed(b, 0), 0, create_immed(b, dest_components), 0);
|
||||
load->cat6.type = TYPE_U32;
|
||||
load->dsts[0]->wrmask = MASK(dest_components);
|
||||
|
||||
load->barrier_class = IR3_BARRIER_BUFFER_R;
|
||||
load->barrier_conflict = IR3_BARRIER_BUFFER_W;
|
||||
load->barrier_class = IR3_BARRIER_BUFFER_R;
|
||||
load->barrier_conflict = IR3_BARRIER_BUFFER_W;
|
||||
|
||||
ir3_split_dest(b, dst, load, 0, dest_components);
|
||||
ir3_split_dest(b, dst, load, 0, dest_components);
|
||||
}
|
||||
|
||||
static void
|
||||
emit_intrinsic_store_global_ir3(struct ir3_context *ctx, nir_intrinsic_instr *intr)
|
||||
emit_intrinsic_store_global_ir3(struct ir3_context *ctx,
|
||||
nir_intrinsic_instr *intr)
|
||||
{
|
||||
struct ir3_block *b = ctx->block;
|
||||
struct ir3_instruction *value, *addr, *offset;
|
||||
unsigned ncomp = nir_intrinsic_src_components(intr, 0);
|
||||
struct ir3_block *b = ctx->block;
|
||||
struct ir3_instruction *value, *addr, *offset;
|
||||
unsigned ncomp = nir_intrinsic_src_components(intr, 0);
|
||||
|
||||
addr = ir3_collect(ctx,
|
||||
ir3_get_src(ctx, &intr->src[1])[0],
|
||||
ir3_get_src(ctx, &intr->src[1])[1]);
|
||||
addr = ir3_collect(ctx, ir3_get_src(ctx, &intr->src[1])[0],
|
||||
ir3_get_src(ctx, &intr->src[1])[1]);
|
||||
|
||||
offset = ir3_get_src(ctx, &intr->src[2])[0];
|
||||
offset = ir3_get_src(ctx, &intr->src[2])[0];
|
||||
|
||||
value = ir3_create_collect(ctx, ir3_get_src(ctx, &intr->src[0]), ncomp);
|
||||
value = ir3_create_collect(ctx, ir3_get_src(ctx, &intr->src[0]), ncomp);
|
||||
|
||||
struct ir3_instruction *stg =
|
||||
ir3_STG_A(b,
|
||||
addr, 0,
|
||||
offset, 0,
|
||||
create_immed(b, 0), 0,
|
||||
create_immed(b, 0), 0,
|
||||
value, 0,
|
||||
create_immed(b, ncomp), 0);
|
||||
stg->cat6.type = TYPE_U32;
|
||||
stg->cat6.iim_val = 1;
|
||||
struct ir3_instruction *stg =
|
||||
ir3_STG_A(b, addr, 0, offset, 0, create_immed(b, 0), 0,
|
||||
create_immed(b, 0), 0, value, 0, create_immed(b, ncomp), 0);
|
||||
stg->cat6.type = TYPE_U32;
|
||||
stg->cat6.iim_val = 1;
|
||||
|
||||
array_insert(b, b->keeps, stg);
|
||||
array_insert(b, b->keeps, stg);
|
||||
|
||||
stg->barrier_class = IR3_BARRIER_BUFFER_W;
|
||||
stg->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
|
||||
stg->barrier_class = IR3_BARRIER_BUFFER_W;
|
||||
stg->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
|
||||
}
|
||||
|
||||
const struct ir3_context_funcs ir3_a6xx_funcs = {
|
||||
.emit_intrinsic_load_ssbo = emit_intrinsic_load_ssbo,
|
||||
.emit_intrinsic_store_ssbo = emit_intrinsic_store_ssbo,
|
||||
.emit_intrinsic_atomic_ssbo = emit_intrinsic_atomic_ssbo,
|
||||
.emit_intrinsic_load_image = emit_intrinsic_load_image,
|
||||
.emit_intrinsic_store_image = emit_intrinsic_store_image,
|
||||
.emit_intrinsic_atomic_image = emit_intrinsic_atomic_image,
|
||||
.emit_intrinsic_image_size = emit_intrinsic_image_size,
|
||||
.emit_intrinsic_load_global_ir3 = emit_intrinsic_load_global_ir3,
|
||||
.emit_intrinsic_store_global_ir3 = emit_intrinsic_store_global_ir3,
|
||||
.emit_intrinsic_load_ssbo = emit_intrinsic_load_ssbo,
|
||||
.emit_intrinsic_store_ssbo = emit_intrinsic_store_ssbo,
|
||||
.emit_intrinsic_atomic_ssbo = emit_intrinsic_atomic_ssbo,
|
||||
.emit_intrinsic_load_image = emit_intrinsic_load_image,
|
||||
.emit_intrinsic_store_image = emit_intrinsic_store_image,
|
||||
.emit_intrinsic_atomic_image = emit_intrinsic_atomic_image,
|
||||
.emit_intrinsic_image_size = emit_intrinsic_image_size,
|
||||
.emit_intrinsic_load_global_ir3 = emit_intrinsic_load_global_ir3,
|
||||
.emit_intrinsic_store_global_ir3 = emit_intrinsic_store_global_ir3,
|
||||
};
|
||||
|
||||
|
|
|
@ -42,270 +42,274 @@
|
|||
* so that we don't have to rewrite (and keep track of) users.
|
||||
*/
|
||||
|
||||
#include "ir3.h"
|
||||
#include <stdlib.h>
|
||||
#include "ir3.h"
|
||||
|
||||
struct array_state {
|
||||
struct ir3_register *live_in_definition;
|
||||
struct ir3_register *live_out_definition;
|
||||
bool constructed;
|
||||
bool optimized;
|
||||
struct ir3_register *live_in_definition;
|
||||
struct ir3_register *live_out_definition;
|
||||
bool constructed;
|
||||
bool optimized;
|
||||
};
|
||||
|
||||
struct array_ctx {
|
||||
struct array_state *states;
|
||||
struct ir3 *ir;
|
||||
unsigned array_count;
|
||||
struct array_state *states;
|
||||
struct ir3 *ir;
|
||||
unsigned array_count;
|
||||
};
|
||||
|
||||
static struct array_state *
|
||||
get_state(struct array_ctx *ctx, struct ir3_block *block, unsigned id)
|
||||
{
|
||||
return &ctx->states[ctx->array_count * block->index + id];
|
||||
return &ctx->states[ctx->array_count * block->index + id];
|
||||
}
|
||||
|
||||
static struct ir3_register *
|
||||
read_value_beginning(struct array_ctx *ctx, struct ir3_block *block, struct ir3_array *arr);
|
||||
static struct ir3_register *read_value_beginning(struct array_ctx *ctx,
|
||||
struct ir3_block *block,
|
||||
struct ir3_array *arr);
|
||||
|
||||
static struct ir3_register *
|
||||
read_value_end(struct array_ctx *ctx, struct ir3_block *block, struct ir3_array *arr)
|
||||
read_value_end(struct array_ctx *ctx, struct ir3_block *block,
|
||||
struct ir3_array *arr)
|
||||
{
|
||||
struct array_state *state = get_state(ctx, block, arr->id);
|
||||
if (state->live_out_definition)
|
||||
return state->live_out_definition;
|
||||
struct array_state *state = get_state(ctx, block, arr->id);
|
||||
if (state->live_out_definition)
|
||||
return state->live_out_definition;
|
||||
|
||||
state->live_out_definition = read_value_beginning(ctx, block, arr);
|
||||
return state->live_out_definition;
|
||||
state->live_out_definition = read_value_beginning(ctx, block, arr);
|
||||
return state->live_out_definition;
|
||||
}
|
||||
|
||||
/* Roughly equivalent to readValueRecursive from the paper: */
|
||||
static struct ir3_register *
|
||||
read_value_beginning(struct array_ctx *ctx, struct ir3_block *block, struct ir3_array *arr)
|
||||
read_value_beginning(struct array_ctx *ctx, struct ir3_block *block,
|
||||
struct ir3_array *arr)
|
||||
{
|
||||
struct array_state *state = get_state(ctx, block, arr->id);
|
||||
struct array_state *state = get_state(ctx, block, arr->id);
|
||||
|
||||
if (state->constructed)
|
||||
return state->live_in_definition;
|
||||
if (state->constructed)
|
||||
return state->live_in_definition;
|
||||
|
||||
if (block->predecessors_count == 0) {
|
||||
state->constructed = true;
|
||||
return NULL;
|
||||
}
|
||||
if (block->predecessors_count == 0) {
|
||||
state->constructed = true;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (block->predecessors_count == 1) {
|
||||
state->live_in_definition = read_value_end(ctx, block->predecessors[0], arr);
|
||||
state->constructed = true;
|
||||
return state->live_in_definition;
|
||||
}
|
||||
if (block->predecessors_count == 1) {
|
||||
state->live_in_definition =
|
||||
read_value_end(ctx, block->predecessors[0], arr);
|
||||
state->constructed = true;
|
||||
return state->live_in_definition;
|
||||
}
|
||||
|
||||
unsigned flags = IR3_REG_ARRAY | (arr->half ? IR3_REG_HALF : 0);
|
||||
struct ir3_instruction *phi =
|
||||
ir3_instr_create(block, OPC_META_PHI, 1, block->predecessors_count);
|
||||
list_del(&phi->node);
|
||||
list_add(&phi->node, &block->instr_list);
|
||||
unsigned flags = IR3_REG_ARRAY | (arr->half ? IR3_REG_HALF : 0);
|
||||
struct ir3_instruction *phi =
|
||||
ir3_instr_create(block, OPC_META_PHI, 1, block->predecessors_count);
|
||||
list_del(&phi->node);
|
||||
list_add(&phi->node, &block->instr_list);
|
||||
|
||||
struct ir3_register *dst = __ssa_dst(phi);
|
||||
dst->flags |= flags;
|
||||
dst->array.id = arr->id;
|
||||
dst->size = arr->length;
|
||||
struct ir3_register *dst = __ssa_dst(phi);
|
||||
dst->flags |= flags;
|
||||
dst->array.id = arr->id;
|
||||
dst->size = arr->length;
|
||||
|
||||
state->live_in_definition = phi->dsts[0];
|
||||
state->constructed = true;
|
||||
state->live_in_definition = phi->dsts[0];
|
||||
state->constructed = true;
|
||||
|
||||
for (unsigned i = 0; i < block->predecessors_count; i++) {
|
||||
struct ir3_register *src = read_value_end(ctx, block->predecessors[i], arr);
|
||||
struct ir3_register *src_reg;
|
||||
if (src) {
|
||||
src_reg = __ssa_src(phi, src->instr, flags);
|
||||
} else {
|
||||
src_reg = ir3_src_create(phi, INVALID_REG, flags | IR3_REG_SSA);
|
||||
}
|
||||
src_reg->array.id = arr->id;
|
||||
src_reg->size = arr->length;
|
||||
}
|
||||
return phi->dsts[0];
|
||||
for (unsigned i = 0; i < block->predecessors_count; i++) {
|
||||
struct ir3_register *src =
|
||||
read_value_end(ctx, block->predecessors[i], arr);
|
||||
struct ir3_register *src_reg;
|
||||
if (src) {
|
||||
src_reg = __ssa_src(phi, src->instr, flags);
|
||||
} else {
|
||||
src_reg = ir3_src_create(phi, INVALID_REG, flags | IR3_REG_SSA);
|
||||
}
|
||||
src_reg->array.id = arr->id;
|
||||
src_reg->size = arr->length;
|
||||
}
|
||||
return phi->dsts[0];
|
||||
}
|
||||
|
||||
static struct ir3_register *
|
||||
remove_trivial_phi(struct ir3_instruction *phi)
|
||||
{
|
||||
/* Break cycles */
|
||||
if (phi->data)
|
||||
return phi->data;
|
||||
|
||||
phi->data = phi->dsts[0];
|
||||
/* Break cycles */
|
||||
if (phi->data)
|
||||
return phi->data;
|
||||
|
||||
struct ir3_register *unique_def = NULL;
|
||||
bool unique = true;
|
||||
for (unsigned i = 0; i < phi->block->predecessors_count; i++) {
|
||||
struct ir3_register *src = phi->srcs[i];
|
||||
phi->data = phi->dsts[0];
|
||||
|
||||
/* If there are any undef sources, then the remaining sources may not
|
||||
* dominate the phi node, even if they are all equal. So we need to
|
||||
* bail out in this case.
|
||||
*
|
||||
* This seems to be a bug in the original paper.
|
||||
*/
|
||||
if (!src->def) {
|
||||
unique = false;
|
||||
break;
|
||||
}
|
||||
struct ir3_register *unique_def = NULL;
|
||||
bool unique = true;
|
||||
for (unsigned i = 0; i < phi->block->predecessors_count; i++) {
|
||||
struct ir3_register *src = phi->srcs[i];
|
||||
|
||||
struct ir3_instruction *src_instr = src->def->instr;
|
||||
|
||||
/* phi sources which point to the phi itself don't count for
|
||||
* figuring out if the phi is trivial
|
||||
*/
|
||||
if (src_instr == phi)
|
||||
continue;
|
||||
/* If there are any undef sources, then the remaining sources may not
|
||||
* dominate the phi node, even if they are all equal. So we need to
|
||||
* bail out in this case.
|
||||
*
|
||||
* This seems to be a bug in the original paper.
|
||||
*/
|
||||
if (!src->def) {
|
||||
unique = false;
|
||||
break;
|
||||
}
|
||||
|
||||
if (src_instr->opc == OPC_META_PHI) {
|
||||
src->def = remove_trivial_phi(src->def->instr);
|
||||
}
|
||||
struct ir3_instruction *src_instr = src->def->instr;
|
||||
|
||||
if (unique_def) {
|
||||
if (unique_def != src->def) {
|
||||
unique = false;
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
unique_def = src->def;
|
||||
}
|
||||
}
|
||||
/* phi sources which point to the phi itself don't count for
|
||||
* figuring out if the phi is trivial
|
||||
*/
|
||||
if (src_instr == phi)
|
||||
continue;
|
||||
|
||||
if (unique) {
|
||||
phi->data = unique_def;
|
||||
return unique_def;
|
||||
} else {
|
||||
return phi->dsts[0];
|
||||
}
|
||||
if (src_instr->opc == OPC_META_PHI) {
|
||||
src->def = remove_trivial_phi(src->def->instr);
|
||||
}
|
||||
|
||||
if (unique_def) {
|
||||
if (unique_def != src->def) {
|
||||
unique = false;
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
unique_def = src->def;
|
||||
}
|
||||
}
|
||||
|
||||
if (unique) {
|
||||
phi->data = unique_def;
|
||||
return unique_def;
|
||||
} else {
|
||||
return phi->dsts[0];
|
||||
}
|
||||
}
|
||||
|
||||
static struct ir3_register *
|
||||
lookup_value(struct ir3_register *reg)
|
||||
{
|
||||
if (reg->instr->opc == OPC_META_PHI)
|
||||
return reg->instr->data;
|
||||
return reg;
|
||||
if (reg->instr->opc == OPC_META_PHI)
|
||||
return reg->instr->data;
|
||||
return reg;
|
||||
}
|
||||
|
||||
static struct ir3_register *
|
||||
lookup_live_in(struct array_ctx *ctx, struct ir3_block *block, unsigned id)
|
||||
{
|
||||
struct array_state *state = get_state(ctx, block, id);
|
||||
if (state->live_in_definition)
|
||||
return lookup_value(state->live_in_definition);
|
||||
struct array_state *state = get_state(ctx, block, id);
|
||||
if (state->live_in_definition)
|
||||
return lookup_value(state->live_in_definition);
|
||||
|
||||
return NULL;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
bool
|
||||
ir3_array_to_ssa(struct ir3 *ir)
|
||||
{
|
||||
struct array_ctx ctx = {};
|
||||
struct array_ctx ctx = {};
|
||||
|
||||
foreach_array (array, &ir->array_list) {
|
||||
ctx.array_count = MAX2(ctx.array_count, array->id + 1);
|
||||
}
|
||||
foreach_array (array, &ir->array_list) {
|
||||
ctx.array_count = MAX2(ctx.array_count, array->id + 1);
|
||||
}
|
||||
|
||||
if (ctx.array_count == 0)
|
||||
return false;
|
||||
if (ctx.array_count == 0)
|
||||
return false;
|
||||
|
||||
unsigned i = 0;
|
||||
foreach_block (block, &ir->block_list) {
|
||||
block->index = i++;
|
||||
}
|
||||
unsigned i = 0;
|
||||
foreach_block (block, &ir->block_list) {
|
||||
block->index = i++;
|
||||
}
|
||||
|
||||
ctx.ir = ir;
|
||||
ctx.states = calloc(ctx.array_count * i, sizeof(struct array_state));
|
||||
ctx.ir = ir;
|
||||
ctx.states = calloc(ctx.array_count * i, sizeof(struct array_state));
|
||||
|
||||
foreach_block (block, &ir->block_list) {
|
||||
foreach_instr (instr, &block->instr_list) {
|
||||
foreach_dst (dst, instr) {
|
||||
if (dst->flags & IR3_REG_ARRAY) {
|
||||
struct array_state *state =
|
||||
get_state(&ctx, block, dst->array.id);
|
||||
state->live_out_definition = dst;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
foreach_block (block, &ir->block_list) {
|
||||
foreach_instr (instr, &block->instr_list) {
|
||||
foreach_dst (dst, instr) {
|
||||
if (dst->flags & IR3_REG_ARRAY) {
|
||||
struct array_state *state =
|
||||
get_state(&ctx, block, dst->array.id);
|
||||
state->live_out_definition = dst;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
foreach_block (block, &ir->block_list) {
|
||||
foreach_instr (instr, &block->instr_list) {
|
||||
if (instr->opc == OPC_META_PHI)
|
||||
continue;
|
||||
foreach_block (block, &ir->block_list) {
|
||||
foreach_instr (instr, &block->instr_list) {
|
||||
if (instr->opc == OPC_META_PHI)
|
||||
continue;
|
||||
|
||||
foreach_dst (reg, instr) {
|
||||
if ((reg->flags & IR3_REG_ARRAY) && !reg->tied) {
|
||||
struct ir3_array *arr = ir3_lookup_array(ir, reg->array.id);
|
||||
foreach_dst (reg, instr) {
|
||||
if ((reg->flags & IR3_REG_ARRAY) && !reg->tied) {
|
||||
struct ir3_array *arr = ir3_lookup_array(ir, reg->array.id);
|
||||
|
||||
/* Construct any phi nodes necessary to read this value */
|
||||
read_value_beginning(&ctx, block, arr);
|
||||
}
|
||||
}
|
||||
foreach_src (reg, instr) {
|
||||
if ((reg->flags & IR3_REG_ARRAY) && !reg->def) {
|
||||
struct ir3_array *arr = ir3_lookup_array(ir, reg->array.id);
|
||||
/* Construct any phi nodes necessary to read this value */
|
||||
read_value_beginning(&ctx, block, arr);
|
||||
}
|
||||
}
|
||||
foreach_src (reg, instr) {
|
||||
if ((reg->flags & IR3_REG_ARRAY) && !reg->def) {
|
||||
struct ir3_array *arr = ir3_lookup_array(ir, reg->array.id);
|
||||
|
||||
/* Construct any phi nodes necessary to read this value */
|
||||
read_value_beginning(&ctx, block, arr);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
/* Construct any phi nodes necessary to read this value */
|
||||
read_value_beginning(&ctx, block, arr);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
foreach_block (block, &ir->block_list) {
|
||||
foreach_instr_safe (instr, &block->instr_list) {
|
||||
if (instr->opc == OPC_META_PHI)
|
||||
remove_trivial_phi(instr);
|
||||
else
|
||||
break;
|
||||
}
|
||||
}
|
||||
foreach_block (block, &ir->block_list) {
|
||||
foreach_instr_safe (instr, &block->instr_list) {
|
||||
if (instr->opc == OPC_META_PHI)
|
||||
remove_trivial_phi(instr);
|
||||
else
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
foreach_block (block, &ir->block_list) {
|
||||
foreach_instr_safe (instr, &block->instr_list) {
|
||||
if (instr->opc == OPC_META_PHI) {
|
||||
if (!(instr->flags & IR3_REG_ARRAY))
|
||||
continue;
|
||||
if (instr->data != instr->dsts[0]) {
|
||||
list_del(&instr->node);
|
||||
continue;
|
||||
}
|
||||
for (unsigned i = 0; i < instr->srcs_count; i++) {
|
||||
instr->srcs[i] = lookup_value(instr->srcs[i]);
|
||||
}
|
||||
} else {
|
||||
foreach_dst (reg, instr) {
|
||||
if ((reg->flags & IR3_REG_ARRAY)) {
|
||||
if (!reg->tied) {
|
||||
struct ir3_register *def =
|
||||
lookup_live_in(&ctx, block, reg->array.id);
|
||||
if (def)
|
||||
ir3_reg_set_last_array(instr, reg, def);
|
||||
}
|
||||
reg->flags |= IR3_REG_SSA;
|
||||
}
|
||||
}
|
||||
foreach_src (reg, instr) {
|
||||
if ((reg->flags & IR3_REG_ARRAY)) {
|
||||
/* It is assumed that before calling
|
||||
* ir3_array_to_ssa(), reg->def was set to the
|
||||
* previous writer of the array within the current
|
||||
* block or NULL if none.
|
||||
*/
|
||||
if (!reg->def) {
|
||||
reg->def = lookup_live_in(&ctx, block, reg->array.id);
|
||||
}
|
||||
reg->flags |= IR3_REG_SSA;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
foreach_block (block, &ir->block_list) {
|
||||
foreach_instr_safe (instr, &block->instr_list) {
|
||||
if (instr->opc == OPC_META_PHI) {
|
||||
if (!(instr->flags & IR3_REG_ARRAY))
|
||||
continue;
|
||||
if (instr->data != instr->dsts[0]) {
|
||||
list_del(&instr->node);
|
||||
continue;
|
||||
}
|
||||
for (unsigned i = 0; i < instr->srcs_count; i++) {
|
||||
instr->srcs[i] = lookup_value(instr->srcs[i]);
|
||||
}
|
||||
} else {
|
||||
foreach_dst (reg, instr) {
|
||||
if ((reg->flags & IR3_REG_ARRAY)) {
|
||||
if (!reg->tied) {
|
||||
struct ir3_register *def =
|
||||
lookup_live_in(&ctx, block, reg->array.id);
|
||||
if (def)
|
||||
ir3_reg_set_last_array(instr, reg, def);
|
||||
}
|
||||
reg->flags |= IR3_REG_SSA;
|
||||
}
|
||||
}
|
||||
foreach_src (reg, instr) {
|
||||
if ((reg->flags & IR3_REG_ARRAY)) {
|
||||
/* It is assumed that before calling
|
||||
* ir3_array_to_ssa(), reg->def was set to the
|
||||
* previous writer of the array within the current
|
||||
* block or NULL if none.
|
||||
*/
|
||||
if (!reg->def) {
|
||||
reg->def = lookup_live_in(&ctx, block, reg->array.id);
|
||||
}
|
||||
reg->flags |= IR3_REG_SSA;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
free(ctx.states);
|
||||
return true;
|
||||
free(ctx.states);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
@ -22,8 +22,8 @@
|
|||
*/
|
||||
|
||||
#include "ir3_assembler.h"
|
||||
#include "ir3_shader.h"
|
||||
#include "ir3_parser.h"
|
||||
#include "ir3_shader.h"
|
||||
|
||||
/**
|
||||
* A helper to go from ir3 assembly to assembled shader. The shader has a
|
||||
|
@ -32,43 +32,43 @@
|
|||
struct ir3_shader *
|
||||
ir3_parse_asm(struct ir3_compiler *c, struct ir3_kernel_info *info, FILE *in)
|
||||
{
|
||||
struct ir3_shader *shader = rzalloc_size(NULL, sizeof(*shader));
|
||||
shader->compiler = c;
|
||||
shader->type = MESA_SHADER_COMPUTE;
|
||||
mtx_init(&shader->variants_lock, mtx_plain);
|
||||
struct ir3_shader *shader = rzalloc_size(NULL, sizeof(*shader));
|
||||
shader->compiler = c;
|
||||
shader->type = MESA_SHADER_COMPUTE;
|
||||
mtx_init(&shader->variants_lock, mtx_plain);
|
||||
|
||||
struct ir3_shader_variant *v = rzalloc_size(shader, sizeof(*v));
|
||||
v->type = MESA_SHADER_COMPUTE;
|
||||
v->shader = shader;
|
||||
v->const_state = rzalloc_size(v, sizeof(*v->const_state));
|
||||
struct ir3_shader_variant *v = rzalloc_size(shader, sizeof(*v));
|
||||
v->type = MESA_SHADER_COMPUTE;
|
||||
v->shader = shader;
|
||||
v->const_state = rzalloc_size(v, sizeof(*v->const_state));
|
||||
|
||||
shader->variants = v;
|
||||
shader->variant_count = 1;
|
||||
shader->variants = v;
|
||||
shader->variant_count = 1;
|
||||
|
||||
info->numwg = INVALID_REG;
|
||||
info->numwg = INVALID_REG;
|
||||
|
||||
for (int i = 0; i < MAX_BUFS; i++) {
|
||||
info->buf_addr_regs[i] = INVALID_REG;
|
||||
}
|
||||
for (int i = 0; i < MAX_BUFS; i++) {
|
||||
info->buf_addr_regs[i] = INVALID_REG;
|
||||
}
|
||||
|
||||
/* Provide a default local_size in case the shader doesn't set it, so that
|
||||
* we don't crash at least.
|
||||
*/
|
||||
v->local_size[0] = v->local_size[1] = v->local_size[2] = 1;
|
||||
/* Provide a default local_size in case the shader doesn't set it, so that
|
||||
* we don't crash at least.
|
||||
*/
|
||||
v->local_size[0] = v->local_size[1] = v->local_size[2] = 1;
|
||||
|
||||
v->ir = ir3_parse(v, info, in);
|
||||
if (!v->ir)
|
||||
goto error;
|
||||
v->ir = ir3_parse(v, info, in);
|
||||
if (!v->ir)
|
||||
goto error;
|
||||
|
||||
ir3_debug_print(v->ir, "AFTER PARSING");
|
||||
ir3_debug_print(v->ir, "AFTER PARSING");
|
||||
|
||||
v->bin = ir3_shader_assemble(v);
|
||||
if (!v->bin)
|
||||
goto error;
|
||||
v->bin = ir3_shader_assemble(v);
|
||||
if (!v->bin)
|
||||
goto error;
|
||||
|
||||
return shader;
|
||||
return shader;
|
||||
|
||||
error:
|
||||
ralloc_free(shader);
|
||||
return NULL;
|
||||
ralloc_free(shader);
|
||||
return NULL;
|
||||
}
|
||||
|
|
|
@ -30,17 +30,18 @@
|
|||
#define MAX_BUFS 4
|
||||
|
||||
struct ir3_kernel_info {
|
||||
uint32_t num_bufs;
|
||||
uint32_t buf_sizes[MAX_BUFS]; /* size in dwords */
|
||||
uint32_t buf_addr_regs[MAX_BUFS];
|
||||
uint32_t num_bufs;
|
||||
uint32_t buf_sizes[MAX_BUFS]; /* size in dwords */
|
||||
uint32_t buf_addr_regs[MAX_BUFS];
|
||||
|
||||
/* driver-param uniforms: */
|
||||
unsigned numwg;
|
||||
/* driver-param uniforms: */
|
||||
unsigned numwg;
|
||||
};
|
||||
|
||||
struct ir3_shader;
|
||||
struct ir3_compiler;
|
||||
|
||||
struct ir3_shader * ir3_parse_asm(struct ir3_compiler *c, struct ir3_kernel_info *info, FILE *in);
|
||||
struct ir3_shader *ir3_parse_asm(struct ir3_compiler *c,
|
||||
struct ir3_kernel_info *info, FILE *in);
|
||||
|
||||
#endif /* __IR3_ASSEMBLER_H__ */
|
||||
|
|
|
@ -26,75 +26,74 @@
|
|||
#include "ir3.h"
|
||||
|
||||
static bool
|
||||
is_safe_conv(struct ir3_instruction *instr, type_t src_type,
|
||||
opc_t *src_opc)
|
||||
is_safe_conv(struct ir3_instruction *instr, type_t src_type, opc_t *src_opc)
|
||||
{
|
||||
if (instr->opc != OPC_MOV)
|
||||
return false;
|
||||
if (instr->opc != OPC_MOV)
|
||||
return false;
|
||||
|
||||
/* Only allow half->full or full->half without any type conversion (like
|
||||
* int to float).
|
||||
*/
|
||||
if (type_size(instr->cat1.src_type) == type_size(instr->cat1.dst_type) ||
|
||||
full_type(instr->cat1.src_type) != full_type(instr->cat1.dst_type))
|
||||
return false;
|
||||
/* Only allow half->full or full->half without any type conversion (like
|
||||
* int to float).
|
||||
*/
|
||||
if (type_size(instr->cat1.src_type) == type_size(instr->cat1.dst_type) ||
|
||||
full_type(instr->cat1.src_type) != full_type(instr->cat1.dst_type))
|
||||
return false;
|
||||
|
||||
struct ir3_register *dst = instr->dsts[0];
|
||||
struct ir3_register *src = instr->srcs[0];
|
||||
struct ir3_register *dst = instr->dsts[0];
|
||||
struct ir3_register *src = instr->srcs[0];
|
||||
|
||||
/* disallow conversions that cannot be folded into
|
||||
* alu instructions:
|
||||
*/
|
||||
if (instr->cat1.round != ROUND_ZERO)
|
||||
return false;
|
||||
/* disallow conversions that cannot be folded into
|
||||
* alu instructions:
|
||||
*/
|
||||
if (instr->cat1.round != ROUND_ZERO)
|
||||
return false;
|
||||
|
||||
if (dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY))
|
||||
return false;
|
||||
if (src->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY))
|
||||
return false;
|
||||
if (dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY))
|
||||
return false;
|
||||
if (src->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY))
|
||||
return false;
|
||||
|
||||
/* Check that the source of the conv matches the type of the src
|
||||
* instruction.
|
||||
*/
|
||||
if (src_type == instr->cat1.src_type)
|
||||
return true;
|
||||
/* Check that the source of the conv matches the type of the src
|
||||
* instruction.
|
||||
*/
|
||||
if (src_type == instr->cat1.src_type)
|
||||
return true;
|
||||
|
||||
/* We can handle mismatches with integer types by converting the opcode
|
||||
* but not when an integer is reinterpreted as a float or vice-versa.
|
||||
*/
|
||||
if (type_float(src_type) != type_float(instr->cat1.src_type))
|
||||
return false;
|
||||
/* We can handle mismatches with integer types by converting the opcode
|
||||
* but not when an integer is reinterpreted as a float or vice-versa.
|
||||
*/
|
||||
if (type_float(src_type) != type_float(instr->cat1.src_type))
|
||||
return false;
|
||||
|
||||
/* We have types with mismatched signedness. Mismatches on the signedness
|
||||
* don't matter when narrowing:
|
||||
*/
|
||||
if (type_size(instr->cat1.dst_type) < type_size(instr->cat1.src_type))
|
||||
return true;
|
||||
/* We have types with mismatched signedness. Mismatches on the signedness
|
||||
* don't matter when narrowing:
|
||||
*/
|
||||
if (type_size(instr->cat1.dst_type) < type_size(instr->cat1.src_type))
|
||||
return true;
|
||||
|
||||
/* Try swapping the opcode: */
|
||||
bool can_swap = true;
|
||||
*src_opc = ir3_try_swap_signedness(*src_opc, &can_swap);
|
||||
return can_swap;
|
||||
/* Try swapping the opcode: */
|
||||
bool can_swap = true;
|
||||
*src_opc = ir3_try_swap_signedness(*src_opc, &can_swap);
|
||||
return can_swap;
|
||||
}
|
||||
|
||||
static bool
|
||||
all_uses_safe_conv(struct ir3_instruction *conv_src, type_t src_type)
|
||||
{
|
||||
opc_t opc = conv_src->opc;
|
||||
bool first = true;
|
||||
foreach_ssa_use (use, conv_src) {
|
||||
opc_t new_opc = opc;
|
||||
if (!is_safe_conv(use, src_type, &new_opc))
|
||||
return false;
|
||||
/* Check if multiple uses have conflicting requirements on the opcode.
|
||||
*/
|
||||
if (!first && opc != new_opc)
|
||||
return false;
|
||||
first = false;
|
||||
opc = new_opc;
|
||||
}
|
||||
conv_src->opc = opc;
|
||||
return true;
|
||||
opc_t opc = conv_src->opc;
|
||||
bool first = true;
|
||||
foreach_ssa_use (use, conv_src) {
|
||||
opc_t new_opc = opc;
|
||||
if (!is_safe_conv(use, src_type, &new_opc))
|
||||
return false;
|
||||
/* Check if multiple uses have conflicting requirements on the opcode.
|
||||
*/
|
||||
if (!first && opc != new_opc)
|
||||
return false;
|
||||
first = false;
|
||||
opc = new_opc;
|
||||
}
|
||||
conv_src->opc = opc;
|
||||
return true;
|
||||
}
|
||||
|
||||
/* For an instruction which has a conversion folded in, re-write the
|
||||
|
@ -105,74 +104,74 @@ all_uses_safe_conv(struct ir3_instruction *conv_src, type_t src_type)
|
|||
static void
|
||||
rewrite_src_uses(struct ir3_instruction *src)
|
||||
{
|
||||
foreach_ssa_use (use, src) {
|
||||
assert(use->opc == OPC_MOV);
|
||||
foreach_ssa_use (use, src) {
|
||||
assert(use->opc == OPC_MOV);
|
||||
|
||||
if (is_half(src)) {
|
||||
use->srcs[0]->flags |= IR3_REG_HALF;
|
||||
} else {
|
||||
use->srcs[0]->flags &= ~IR3_REG_HALF;
|
||||
}
|
||||
if (is_half(src)) {
|
||||
use->srcs[0]->flags |= IR3_REG_HALF;
|
||||
} else {
|
||||
use->srcs[0]->flags &= ~IR3_REG_HALF;
|
||||
}
|
||||
|
||||
use->cat1.src_type = use->cat1.dst_type;
|
||||
}
|
||||
use->cat1.src_type = use->cat1.dst_type;
|
||||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
try_conversion_folding(struct ir3_instruction *conv)
|
||||
{
|
||||
struct ir3_instruction *src;
|
||||
struct ir3_instruction *src;
|
||||
|
||||
if (conv->opc != OPC_MOV)
|
||||
return false;
|
||||
if (conv->opc != OPC_MOV)
|
||||
return false;
|
||||
|
||||
/* NOTE: we can have non-ssa srcs after copy propagation: */
|
||||
src = ssa(conv->srcs[0]);
|
||||
if (!src)
|
||||
return false;
|
||||
/* NOTE: we can have non-ssa srcs after copy propagation: */
|
||||
src = ssa(conv->srcs[0]);
|
||||
if (!src)
|
||||
return false;
|
||||
|
||||
if (!is_alu(src))
|
||||
return false;
|
||||
if (!is_alu(src))
|
||||
return false;
|
||||
|
||||
bool can_fold;
|
||||
type_t base_type = ir3_output_conv_type(src, &can_fold);
|
||||
if (!can_fold)
|
||||
return false;
|
||||
bool can_fold;
|
||||
type_t base_type = ir3_output_conv_type(src, &can_fold);
|
||||
if (!can_fold)
|
||||
return false;
|
||||
|
||||
type_t src_type = ir3_output_conv_src_type(src, base_type);
|
||||
type_t dst_type = ir3_output_conv_dst_type(src, base_type);
|
||||
type_t src_type = ir3_output_conv_src_type(src, base_type);
|
||||
type_t dst_type = ir3_output_conv_dst_type(src, base_type);
|
||||
|
||||
/* Avoid cases where we've already folded in a conversion. We assume that
|
||||
* if there is a chain of conversions that's foldable then it's been
|
||||
* folded in NIR already.
|
||||
*/
|
||||
if (src_type != dst_type)
|
||||
return false;
|
||||
/* Avoid cases where we've already folded in a conversion. We assume that
|
||||
* if there is a chain of conversions that's foldable then it's been
|
||||
* folded in NIR already.
|
||||
*/
|
||||
if (src_type != dst_type)
|
||||
return false;
|
||||
|
||||
if (!all_uses_safe_conv(src, src_type))
|
||||
return false;
|
||||
if (!all_uses_safe_conv(src, src_type))
|
||||
return false;
|
||||
|
||||
ir3_set_dst_type(src, is_half(conv));
|
||||
rewrite_src_uses(src);
|
||||
ir3_set_dst_type(src, is_half(conv));
|
||||
rewrite_src_uses(src);
|
||||
|
||||
return true;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool
|
||||
ir3_cf(struct ir3 *ir)
|
||||
{
|
||||
void *mem_ctx = ralloc_context(NULL);
|
||||
bool progress = false;
|
||||
void *mem_ctx = ralloc_context(NULL);
|
||||
bool progress = false;
|
||||
|
||||
ir3_find_ssa_uses(ir, mem_ctx, false);
|
||||
ir3_find_ssa_uses(ir, mem_ctx, false);
|
||||
|
||||
foreach_block (block, &ir->block_list) {
|
||||
foreach_instr (instr, &block->instr_list) {
|
||||
progress |= try_conversion_folding(instr);
|
||||
}
|
||||
}
|
||||
foreach_block (block, &ir->block_list) {
|
||||
foreach_instr (instr, &block->instr_list) {
|
||||
progress |= try_conversion_folding(instr);
|
||||
}
|
||||
}
|
||||
|
||||
ralloc_free(mem_ctx);
|
||||
ralloc_free(mem_ctx);
|
||||
|
||||
return progress;
|
||||
return progress;
|
||||
}
|
||||
|
|
|
@ -51,8 +51,10 @@ static const struct debug_named_value shader_debug_options[] = {
|
|||
/* clang-format on */
|
||||
};
|
||||
|
||||
DEBUG_GET_ONCE_FLAGS_OPTION(ir3_shader_debug, "IR3_SHADER_DEBUG", shader_debug_options, 0)
|
||||
DEBUG_GET_ONCE_OPTION(ir3_shader_override_path, "IR3_SHADER_OVERRIDE_PATH", NULL)
|
||||
DEBUG_GET_ONCE_FLAGS_OPTION(ir3_shader_debug, "IR3_SHADER_DEBUG",
|
||||
shader_debug_options, 0)
|
||||
DEBUG_GET_ONCE_OPTION(ir3_shader_override_path, "IR3_SHADER_OVERRIDE_PATH",
|
||||
NULL)
|
||||
|
||||
enum ir3_shader_debug ir3_shader_debug = 0;
|
||||
const char *ir3_shader_override_path = NULL;
|
||||
|
@ -60,126 +62,127 @@ const char *ir3_shader_override_path = NULL;
|
|||
void
|
||||
ir3_compiler_destroy(struct ir3_compiler *compiler)
|
||||
{
|
||||
disk_cache_destroy(compiler->disk_cache);
|
||||
ralloc_free(compiler);
|
||||
disk_cache_destroy(compiler->disk_cache);
|
||||
ralloc_free(compiler);
|
||||
}
|
||||
|
||||
struct ir3_compiler *
|
||||
ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id, bool robust_ubo_access)
|
||||
ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id,
|
||||
bool robust_ubo_access)
|
||||
{
|
||||
struct ir3_compiler *compiler = rzalloc(NULL, struct ir3_compiler);
|
||||
struct ir3_compiler *compiler = rzalloc(NULL, struct ir3_compiler);
|
||||
|
||||
ir3_shader_debug = debug_get_option_ir3_shader_debug();
|
||||
ir3_shader_override_path =
|
||||
!__check_suid() ? debug_get_option_ir3_shader_override_path() : NULL;
|
||||
ir3_shader_debug = debug_get_option_ir3_shader_debug();
|
||||
ir3_shader_override_path =
|
||||
!__check_suid() ? debug_get_option_ir3_shader_override_path() : NULL;
|
||||
|
||||
if (ir3_shader_override_path) {
|
||||
ir3_shader_debug |= IR3_DBG_NOCACHE;
|
||||
}
|
||||
if (ir3_shader_override_path) {
|
||||
ir3_shader_debug |= IR3_DBG_NOCACHE;
|
||||
}
|
||||
|
||||
compiler->dev = dev;
|
||||
compiler->gpu_id = gpu_id;
|
||||
compiler->robust_ubo_access = robust_ubo_access;
|
||||
compiler->dev = dev;
|
||||
compiler->gpu_id = gpu_id;
|
||||
compiler->robust_ubo_access = robust_ubo_access;
|
||||
|
||||
/* All known GPU's have 32k local memory (aka shared) */
|
||||
compiler->local_mem_size = 32 * 1024;
|
||||
/* TODO see if older GPU's were different here */
|
||||
compiler->branchstack_size = 64;
|
||||
compiler->wave_granularity = 2;
|
||||
compiler->max_waves = 16;
|
||||
/* All known GPU's have 32k local memory (aka shared) */
|
||||
compiler->local_mem_size = 32 * 1024;
|
||||
/* TODO see if older GPU's were different here */
|
||||
compiler->branchstack_size = 64;
|
||||
compiler->wave_granularity = 2;
|
||||
compiler->max_waves = 16;
|
||||
|
||||
if (compiler->gpu_id >= 600) {
|
||||
compiler->samgq_workaround = true;
|
||||
/* a6xx split the pipeline state into geometry and fragment state, in
|
||||
* order to let the VS run ahead of the FS. As a result there are now
|
||||
* separate const files for the fragment shader and everything
|
||||
* else, and separate limits. There seems to be a shared limit, but
|
||||
* it's higher than the vert or frag limits.
|
||||
*
|
||||
* TODO: The shared limit seems to be different on
|
||||
* different models.
|
||||
*/
|
||||
compiler->max_const_pipeline = 640;
|
||||
compiler->max_const_frag = 512;
|
||||
compiler->max_const_geom = 512;
|
||||
compiler->max_const_safe = 128;
|
||||
if (compiler->gpu_id >= 600) {
|
||||
compiler->samgq_workaround = true;
|
||||
/* a6xx split the pipeline state into geometry and fragment state, in
|
||||
* order to let the VS run ahead of the FS. As a result there are now
|
||||
* separate const files for the fragment shader and everything
|
||||
* else, and separate limits. There seems to be a shared limit, but
|
||||
* it's higher than the vert or frag limits.
|
||||
*
|
||||
* TODO: The shared limit seems to be different on
|
||||
* different models.
|
||||
*/
|
||||
compiler->max_const_pipeline = 640;
|
||||
compiler->max_const_frag = 512;
|
||||
compiler->max_const_geom = 512;
|
||||
compiler->max_const_safe = 128;
|
||||
|
||||
/* Compute shaders don't share a const file with the FS. Instead they
|
||||
* have their own file, which is smaller than the FS one.
|
||||
*
|
||||
* TODO: is this true on earlier gen's?
|
||||
*/
|
||||
compiler->max_const_compute = 256;
|
||||
/* Compute shaders don't share a const file with the FS. Instead they
|
||||
* have their own file, which is smaller than the FS one.
|
||||
*
|
||||
* TODO: is this true on earlier gen's?
|
||||
*/
|
||||
compiler->max_const_compute = 256;
|
||||
|
||||
/* TODO: implement clip+cull distances on earlier gen's */
|
||||
compiler->has_clip_cull = true;
|
||||
/* TODO: implement clip+cull distances on earlier gen's */
|
||||
compiler->has_clip_cull = true;
|
||||
|
||||
/* TODO: implement private memory on earlier gen's */
|
||||
compiler->has_pvtmem = true;
|
||||
/* TODO: implement private memory on earlier gen's */
|
||||
compiler->has_pvtmem = true;
|
||||
|
||||
if (compiler->gpu_id == 650)
|
||||
compiler->tess_use_shared = true;
|
||||
} else {
|
||||
compiler->max_const_pipeline = 512;
|
||||
compiler->max_const_geom = 512;
|
||||
compiler->max_const_frag = 512;
|
||||
compiler->max_const_compute = 512;
|
||||
if (compiler->gpu_id == 650)
|
||||
compiler->tess_use_shared = true;
|
||||
} else {
|
||||
compiler->max_const_pipeline = 512;
|
||||
compiler->max_const_geom = 512;
|
||||
compiler->max_const_frag = 512;
|
||||
compiler->max_const_compute = 512;
|
||||
|
||||
/* Note: this will have to change if/when we support tess+GS on
|
||||
* earlier gen's.
|
||||
*/
|
||||
compiler->max_const_safe = 256;
|
||||
}
|
||||
/* Note: this will have to change if/when we support tess+GS on
|
||||
* earlier gen's.
|
||||
*/
|
||||
compiler->max_const_safe = 256;
|
||||
}
|
||||
|
||||
if (compiler->gpu_id == 650) {
|
||||
/* This changed mid-generation for a650, so that using r32.x and above
|
||||
* requires using the smallest threadsize.
|
||||
*/
|
||||
compiler->reg_size_vec4 = 64;
|
||||
} else if (compiler->gpu_id >= 600) {
|
||||
compiler->reg_size_vec4 = 96;
|
||||
} else if (compiler->gpu_id >= 400) {
|
||||
/* On a4xx-a5xx, using r24.x and above requires using the smallest
|
||||
* threadsize.
|
||||
*/
|
||||
compiler->reg_size_vec4 = 48;
|
||||
} else {
|
||||
/* TODO: confirm this */
|
||||
compiler->reg_size_vec4 = 96;
|
||||
}
|
||||
if (compiler->gpu_id == 650) {
|
||||
/* This changed mid-generation for a650, so that using r32.x and above
|
||||
* requires using the smallest threadsize.
|
||||
*/
|
||||
compiler->reg_size_vec4 = 64;
|
||||
} else if (compiler->gpu_id >= 600) {
|
||||
compiler->reg_size_vec4 = 96;
|
||||
} else if (compiler->gpu_id >= 400) {
|
||||
/* On a4xx-a5xx, using r24.x and above requires using the smallest
|
||||
* threadsize.
|
||||
*/
|
||||
compiler->reg_size_vec4 = 48;
|
||||
} else {
|
||||
/* TODO: confirm this */
|
||||
compiler->reg_size_vec4 = 96;
|
||||
}
|
||||
|
||||
if (compiler->gpu_id >= 600) {
|
||||
compiler->threadsize_base = 64;
|
||||
} else if (compiler->gpu_id >= 400) {
|
||||
/* TODO: Confirm this for a4xx. For a5xx this is based on the Vulkan
|
||||
* 1.1 subgroupSize which is 32.
|
||||
*/
|
||||
compiler->threadsize_base = 32;
|
||||
} else {
|
||||
compiler->threadsize_base = 8;
|
||||
}
|
||||
if (compiler->gpu_id >= 600) {
|
||||
compiler->threadsize_base = 64;
|
||||
} else if (compiler->gpu_id >= 400) {
|
||||
/* TODO: Confirm this for a4xx. For a5xx this is based on the Vulkan
|
||||
* 1.1 subgroupSize which is 32.
|
||||
*/
|
||||
compiler->threadsize_base = 32;
|
||||
} else {
|
||||
compiler->threadsize_base = 8;
|
||||
}
|
||||
|
||||
if (compiler->gpu_id >= 400) {
|
||||
/* need special handling for "flat" */
|
||||
compiler->flat_bypass = true;
|
||||
compiler->levels_add_one = false;
|
||||
compiler->unminify_coords = false;
|
||||
compiler->txf_ms_with_isaml = false;
|
||||
compiler->array_index_add_half = true;
|
||||
compiler->instr_align = 16;
|
||||
compiler->const_upload_unit = 4;
|
||||
} else {
|
||||
/* no special handling for "flat" */
|
||||
compiler->flat_bypass = false;
|
||||
compiler->levels_add_one = true;
|
||||
compiler->unminify_coords = true;
|
||||
compiler->txf_ms_with_isaml = true;
|
||||
compiler->array_index_add_half = false;
|
||||
compiler->instr_align = 4;
|
||||
compiler->const_upload_unit = 8;
|
||||
}
|
||||
if (compiler->gpu_id >= 400) {
|
||||
/* need special handling for "flat" */
|
||||
compiler->flat_bypass = true;
|
||||
compiler->levels_add_one = false;
|
||||
compiler->unminify_coords = false;
|
||||
compiler->txf_ms_with_isaml = false;
|
||||
compiler->array_index_add_half = true;
|
||||
compiler->instr_align = 16;
|
||||
compiler->const_upload_unit = 4;
|
||||
} else {
|
||||
/* no special handling for "flat" */
|
||||
compiler->flat_bypass = false;
|
||||
compiler->levels_add_one = true;
|
||||
compiler->unminify_coords = true;
|
||||
compiler->txf_ms_with_isaml = true;
|
||||
compiler->array_index_add_half = false;
|
||||
compiler->instr_align = 4;
|
||||
compiler->const_upload_unit = 8;
|
||||
}
|
||||
|
||||
ir3_disk_cache_init(compiler);
|
||||
ir3_disk_cache_init(compiler);
|
||||
|
||||
return compiler;
|
||||
return compiler;
|
||||
}
|
||||
|
|
|
@ -36,167 +36,167 @@ struct ir3_ra_reg_set;
|
|||
struct ir3_shader;
|
||||
|
||||
struct ir3_compiler {
|
||||
struct fd_device *dev;
|
||||
uint32_t gpu_id;
|
||||
uint32_t shader_count;
|
||||
struct fd_device *dev;
|
||||
uint32_t gpu_id;
|
||||
uint32_t shader_count;
|
||||
|
||||
struct disk_cache *disk_cache;
|
||||
struct disk_cache *disk_cache;
|
||||
|
||||
/* If true, UBO accesses are assumed to be bounds-checked as defined by
|
||||
* VK_EXT_robustness2 and optimizations may have to be more conservative.
|
||||
*/
|
||||
bool robust_ubo_access;
|
||||
/* If true, UBO accesses are assumed to be bounds-checked as defined by
|
||||
* VK_EXT_robustness2 and optimizations may have to be more conservative.
|
||||
*/
|
||||
bool robust_ubo_access;
|
||||
|
||||
/*
|
||||
* Configuration options for things that are handled differently on
|
||||
* different generations:
|
||||
*/
|
||||
/*
|
||||
* Configuration options for things that are handled differently on
|
||||
* different generations:
|
||||
*/
|
||||
|
||||
/* a4xx (and later) drops SP_FS_FLAT_SHAD_MODE_REG_* for flat-interpolate
|
||||
* so we need to use ldlv.u32 to load the varying directly:
|
||||
*/
|
||||
bool flat_bypass;
|
||||
/* a4xx (and later) drops SP_FS_FLAT_SHAD_MODE_REG_* for flat-interpolate
|
||||
* so we need to use ldlv.u32 to load the varying directly:
|
||||
*/
|
||||
bool flat_bypass;
|
||||
|
||||
/* on a3xx, we need to add one to # of array levels:
|
||||
*/
|
||||
bool levels_add_one;
|
||||
/* on a3xx, we need to add one to # of array levels:
|
||||
*/
|
||||
bool levels_add_one;
|
||||
|
||||
/* on a3xx, we need to scale up integer coords for isaml based
|
||||
* on LoD:
|
||||
*/
|
||||
bool unminify_coords;
|
||||
/* on a3xx, we need to scale up integer coords for isaml based
|
||||
* on LoD:
|
||||
*/
|
||||
bool unminify_coords;
|
||||
|
||||
/* on a3xx do txf_ms w/ isaml and scaled coords: */
|
||||
bool txf_ms_with_isaml;
|
||||
/* on a3xx do txf_ms w/ isaml and scaled coords: */
|
||||
bool txf_ms_with_isaml;
|
||||
|
||||
/* on a4xx, for array textures we need to add 0.5 to the array
|
||||
* index coordinate:
|
||||
*/
|
||||
bool array_index_add_half;
|
||||
/* on a4xx, for array textures we need to add 0.5 to the array
|
||||
* index coordinate:
|
||||
*/
|
||||
bool array_index_add_half;
|
||||
|
||||
/* on a6xx, rewrite samgp to sequence of samgq0-3 in vertex shaders:
|
||||
*/
|
||||
bool samgq_workaround;
|
||||
/* on a6xx, rewrite samgp to sequence of samgq0-3 in vertex shaders:
|
||||
*/
|
||||
bool samgq_workaround;
|
||||
|
||||
/* on a650, vertex shader <-> tess control io uses LDL/STL */
|
||||
bool tess_use_shared;
|
||||
/* on a650, vertex shader <-> tess control io uses LDL/STL */
|
||||
bool tess_use_shared;
|
||||
|
||||
/* The maximum number of constants, in vec4's, across the entire graphics
|
||||
* pipeline.
|
||||
*/
|
||||
uint16_t max_const_pipeline;
|
||||
/* The maximum number of constants, in vec4's, across the entire graphics
|
||||
* pipeline.
|
||||
*/
|
||||
uint16_t max_const_pipeline;
|
||||
|
||||
/* The maximum number of constants, in vec4's, for VS+HS+DS+GS. */
|
||||
uint16_t max_const_geom;
|
||||
/* The maximum number of constants, in vec4's, for VS+HS+DS+GS. */
|
||||
uint16_t max_const_geom;
|
||||
|
||||
/* The maximum number of constants, in vec4's, for FS. */
|
||||
uint16_t max_const_frag;
|
||||
/* The maximum number of constants, in vec4's, for FS. */
|
||||
uint16_t max_const_frag;
|
||||
|
||||
/* A "safe" max constlen that can be applied to each shader in the
|
||||
* pipeline which we guarantee will never exceed any combined limits.
|
||||
*/
|
||||
uint16_t max_const_safe;
|
||||
/* A "safe" max constlen that can be applied to each shader in the
|
||||
* pipeline which we guarantee will never exceed any combined limits.
|
||||
*/
|
||||
uint16_t max_const_safe;
|
||||
|
||||
/* The maximum number of constants, in vec4's, for compute shaders. */
|
||||
uint16_t max_const_compute;
|
||||
/* The maximum number of constants, in vec4's, for compute shaders. */
|
||||
uint16_t max_const_compute;
|
||||
|
||||
/* Number of instructions that the shader's base address and length
|
||||
* (instrlen divides instruction count by this) must be aligned to.
|
||||
*/
|
||||
uint32_t instr_align;
|
||||
/* Number of instructions that the shader's base address and length
|
||||
* (instrlen divides instruction count by this) must be aligned to.
|
||||
*/
|
||||
uint32_t instr_align;
|
||||
|
||||
/* on a3xx, the unit of indirect const load is higher than later gens (in
|
||||
* vec4 units):
|
||||
*/
|
||||
uint32_t const_upload_unit;
|
||||
/* on a3xx, the unit of indirect const load is higher than later gens (in
|
||||
* vec4 units):
|
||||
*/
|
||||
uint32_t const_upload_unit;
|
||||
|
||||
/* The base number of threads per wave. Some stages may be able to double
|
||||
* this.
|
||||
*/
|
||||
uint32_t threadsize_base;
|
||||
/* The base number of threads per wave. Some stages may be able to double
|
||||
* this.
|
||||
*/
|
||||
uint32_t threadsize_base;
|
||||
|
||||
/* On at least a6xx, waves are always launched in pairs. In calculations
|
||||
* about occupancy, we pretend that each wave pair is actually one wave,
|
||||
* which simplifies many of the calculations, but means we have to
|
||||
* multiply threadsize_base by this number.
|
||||
*/
|
||||
uint32_t wave_granularity;
|
||||
/* On at least a6xx, waves are always launched in pairs. In calculations
|
||||
* about occupancy, we pretend that each wave pair is actually one wave,
|
||||
* which simplifies many of the calculations, but means we have to
|
||||
* multiply threadsize_base by this number.
|
||||
*/
|
||||
uint32_t wave_granularity;
|
||||
|
||||
/* The maximum number of simultaneous waves per core. */
|
||||
uint32_t max_waves;
|
||||
/* The maximum number of simultaneous waves per core. */
|
||||
uint32_t max_waves;
|
||||
|
||||
/* This is the theoretical maximum number of vec4 registers that one wave of
|
||||
* the base threadsize could use. To get the actual size of the register
|
||||
* file in bytes one would need to compute:
|
||||
*
|
||||
* reg_size_vec4 * threadsize_base * wave_granularity * 16 (bytes per vec4)
|
||||
*
|
||||
* However this number is more often what we actually need. For example, a
|
||||
* max_reg more than half of this will result in a doubled threadsize
|
||||
* being impossible (because double-sized waves take up twice as many
|
||||
* registers). Also, the formula for the occupancy given a particular
|
||||
* register footprint is simpler.
|
||||
*
|
||||
* It is in vec4 units because the register file is allocated
|
||||
* with vec4 granularity, so it's in the same units as max_reg.
|
||||
*/
|
||||
uint32_t reg_size_vec4;
|
||||
/* This is the theoretical maximum number of vec4 registers that one wave of
|
||||
* the base threadsize could use. To get the actual size of the register
|
||||
* file in bytes one would need to compute:
|
||||
*
|
||||
* reg_size_vec4 * threadsize_base * wave_granularity * 16 (bytes per vec4)
|
||||
*
|
||||
* However this number is more often what we actually need. For example, a
|
||||
* max_reg more than half of this will result in a doubled threadsize
|
||||
* being impossible (because double-sized waves take up twice as many
|
||||
* registers). Also, the formula for the occupancy given a particular
|
||||
* register footprint is simpler.
|
||||
*
|
||||
* It is in vec4 units because the register file is allocated
|
||||
* with vec4 granularity, so it's in the same units as max_reg.
|
||||
*/
|
||||
uint32_t reg_size_vec4;
|
||||
|
||||
/* The size of local memory in bytes */
|
||||
uint32_t local_mem_size;
|
||||
/* The size of local memory in bytes */
|
||||
uint32_t local_mem_size;
|
||||
|
||||
/* The number of total branch stack entries, divided by wave_granularity. */
|
||||
uint32_t branchstack_size;
|
||||
/* The number of total branch stack entries, divided by wave_granularity. */
|
||||
uint32_t branchstack_size;
|
||||
|
||||
/* Whether clip+cull distances are supported */
|
||||
bool has_clip_cull;
|
||||
/* Whether clip+cull distances are supported */
|
||||
bool has_clip_cull;
|
||||
|
||||
/* Whether private memory is supported */
|
||||
bool has_pvtmem;
|
||||
/* Whether private memory is supported */
|
||||
bool has_pvtmem;
|
||||
};
|
||||
|
||||
void ir3_compiler_destroy(struct ir3_compiler *compiler);
|
||||
struct ir3_compiler * ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id,
|
||||
bool robust_ubo_access);
|
||||
struct ir3_compiler *ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id,
|
||||
bool robust_ubo_access);
|
||||
|
||||
void ir3_disk_cache_init(struct ir3_compiler *compiler);
|
||||
void ir3_disk_cache_init_shader_key(struct ir3_compiler *compiler,
|
||||
struct ir3_shader *shader);
|
||||
struct ir3_shader *shader);
|
||||
bool ir3_disk_cache_retrieve(struct ir3_compiler *compiler,
|
||||
struct ir3_shader_variant *v);
|
||||
struct ir3_shader_variant *v);
|
||||
void ir3_disk_cache_store(struct ir3_compiler *compiler,
|
||||
struct ir3_shader_variant *v);
|
||||
struct ir3_shader_variant *v);
|
||||
|
||||
int ir3_compile_shader_nir(struct ir3_compiler *compiler,
|
||||
struct ir3_shader_variant *so);
|
||||
struct ir3_shader_variant *so);
|
||||
|
||||
/* gpu pointer size in units of 32bit registers/slots */
|
||||
static inline
|
||||
unsigned ir3_pointer_size(struct ir3_compiler *compiler)
|
||||
static inline unsigned
|
||||
ir3_pointer_size(struct ir3_compiler *compiler)
|
||||
{
|
||||
return (compiler->gpu_id >= 500) ? 2 : 1;
|
||||
return (compiler->gpu_id >= 500) ? 2 : 1;
|
||||
}
|
||||
|
||||
enum ir3_shader_debug {
|
||||
IR3_DBG_SHADER_VS = BITFIELD_BIT(0),
|
||||
IR3_DBG_SHADER_TCS = BITFIELD_BIT(1),
|
||||
IR3_DBG_SHADER_TES = BITFIELD_BIT(2),
|
||||
IR3_DBG_SHADER_GS = BITFIELD_BIT(3),
|
||||
IR3_DBG_SHADER_FS = BITFIELD_BIT(4),
|
||||
IR3_DBG_SHADER_CS = BITFIELD_BIT(5),
|
||||
IR3_DBG_DISASM = BITFIELD_BIT(6),
|
||||
IR3_DBG_OPTMSGS = BITFIELD_BIT(7),
|
||||
IR3_DBG_FORCES2EN = BITFIELD_BIT(8),
|
||||
IR3_DBG_NOUBOOPT = BITFIELD_BIT(9),
|
||||
IR3_DBG_NOFP16 = BITFIELD_BIT(10),
|
||||
IR3_DBG_NOCACHE = BITFIELD_BIT(11),
|
||||
IR3_DBG_SHADER_VS = BITFIELD_BIT(0),
|
||||
IR3_DBG_SHADER_TCS = BITFIELD_BIT(1),
|
||||
IR3_DBG_SHADER_TES = BITFIELD_BIT(2),
|
||||
IR3_DBG_SHADER_GS = BITFIELD_BIT(3),
|
||||
IR3_DBG_SHADER_FS = BITFIELD_BIT(4),
|
||||
IR3_DBG_SHADER_CS = BITFIELD_BIT(5),
|
||||
IR3_DBG_DISASM = BITFIELD_BIT(6),
|
||||
IR3_DBG_OPTMSGS = BITFIELD_BIT(7),
|
||||
IR3_DBG_FORCES2EN = BITFIELD_BIT(8),
|
||||
IR3_DBG_NOUBOOPT = BITFIELD_BIT(9),
|
||||
IR3_DBG_NOFP16 = BITFIELD_BIT(10),
|
||||
IR3_DBG_NOCACHE = BITFIELD_BIT(11),
|
||||
|
||||
/* DEBUG-only options: */
|
||||
IR3_DBG_SCHEDMSGS = BITFIELD_BIT(20),
|
||||
IR3_DBG_RAMSGS = BITFIELD_BIT(21),
|
||||
/* DEBUG-only options: */
|
||||
IR3_DBG_SCHEDMSGS = BITFIELD_BIT(20),
|
||||
IR3_DBG_RAMSGS = BITFIELD_BIT(21),
|
||||
|
||||
/* Only used for the disk-caching logic: */
|
||||
IR3_DBG_ROBUST_UBO_ACCESS = BITFIELD_BIT(30),
|
||||
/* Only used for the disk-caching logic: */
|
||||
IR3_DBG_ROBUST_UBO_ACCESS = BITFIELD_BIT(30),
|
||||
};
|
||||
|
||||
extern enum ir3_shader_debug ir3_shader_debug;
|
||||
|
@ -205,29 +205,35 @@ extern const char *ir3_shader_override_path;
|
|||
static inline bool
|
||||
shader_debug_enabled(gl_shader_stage type)
|
||||
{
|
||||
if (ir3_shader_debug & IR3_DBG_DISASM)
|
||||
return true;
|
||||
if (ir3_shader_debug & IR3_DBG_DISASM)
|
||||
return true;
|
||||
|
||||
switch (type) {
|
||||
case MESA_SHADER_VERTEX: return !!(ir3_shader_debug & IR3_DBG_SHADER_VS);
|
||||
case MESA_SHADER_TESS_CTRL: return !!(ir3_shader_debug & IR3_DBG_SHADER_TCS);
|
||||
case MESA_SHADER_TESS_EVAL: return !!(ir3_shader_debug & IR3_DBG_SHADER_TES);
|
||||
case MESA_SHADER_GEOMETRY: return !!(ir3_shader_debug & IR3_DBG_SHADER_GS);
|
||||
case MESA_SHADER_FRAGMENT: return !!(ir3_shader_debug & IR3_DBG_SHADER_FS);
|
||||
case MESA_SHADER_COMPUTE: return !!(ir3_shader_debug & IR3_DBG_SHADER_CS);
|
||||
default:
|
||||
debug_assert(0);
|
||||
return false;
|
||||
}
|
||||
switch (type) {
|
||||
case MESA_SHADER_VERTEX:
|
||||
return !!(ir3_shader_debug & IR3_DBG_SHADER_VS);
|
||||
case MESA_SHADER_TESS_CTRL:
|
||||
return !!(ir3_shader_debug & IR3_DBG_SHADER_TCS);
|
||||
case MESA_SHADER_TESS_EVAL:
|
||||
return !!(ir3_shader_debug & IR3_DBG_SHADER_TES);
|
||||
case MESA_SHADER_GEOMETRY:
|
||||
return !!(ir3_shader_debug & IR3_DBG_SHADER_GS);
|
||||
case MESA_SHADER_FRAGMENT:
|
||||
return !!(ir3_shader_debug & IR3_DBG_SHADER_FS);
|
||||
case MESA_SHADER_COMPUTE:
|
||||
return !!(ir3_shader_debug & IR3_DBG_SHADER_CS);
|
||||
default:
|
||||
debug_assert(0);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
static inline void
|
||||
ir3_debug_print(struct ir3 *ir, const char *when)
|
||||
{
|
||||
if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
|
||||
mesa_logi("%s:", when);
|
||||
ir3_print(ir);
|
||||
}
|
||||
if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
|
||||
mesa_logi("%s:", when);
|
||||
ir3_print(ir);
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* IR3_COMPILER_H_ */
|
||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -27,215 +27,250 @@
|
|||
#ifndef IR3_CONTEXT_H_
|
||||
#define IR3_CONTEXT_H_
|
||||
|
||||
#include "ir3.h"
|
||||
#include "ir3_compiler.h"
|
||||
#include "ir3_nir.h"
|
||||
#include "ir3.h"
|
||||
|
||||
/* for conditionally setting boolean flag(s): */
|
||||
#define COND(bool, val) ((bool) ? (val) : 0)
|
||||
|
||||
#define DBG(fmt, ...) \
|
||||
do { mesa_logd("%s:%d: "fmt, \
|
||||
__FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0)
|
||||
#define DBG(fmt, ...) \
|
||||
do { \
|
||||
mesa_logd("%s:%d: " fmt, __FUNCTION__, __LINE__, ##__VA_ARGS__); \
|
||||
} while (0)
|
||||
|
||||
/**
|
||||
* The context for compilation of a single shader.
|
||||
*/
|
||||
struct ir3_context {
|
||||
struct ir3_compiler *compiler;
|
||||
const struct ir3_context_funcs *funcs;
|
||||
struct ir3_compiler *compiler;
|
||||
const struct ir3_context_funcs *funcs;
|
||||
|
||||
struct nir_shader *s;
|
||||
struct nir_shader *s;
|
||||
|
||||
struct nir_instr *cur_instr; /* current instruction, just for debug */
|
||||
struct nir_instr *cur_instr; /* current instruction, just for debug */
|
||||
|
||||
struct ir3 *ir;
|
||||
struct ir3_shader_variant *so;
|
||||
struct ir3 *ir;
|
||||
struct ir3_shader_variant *so;
|
||||
|
||||
/* Tables of scalar inputs/outputs. Because of the way varying packing
|
||||
* works, we could have inputs w/ fractional location, which is a bit
|
||||
* awkward to deal with unless we keep track of the split scalar in/
|
||||
* out components.
|
||||
*
|
||||
* These *only* have inputs/outputs that are touched by load_*input and
|
||||
* store_output.
|
||||
*/
|
||||
unsigned ninputs, noutputs;
|
||||
struct ir3_instruction **inputs;
|
||||
struct ir3_instruction **outputs;
|
||||
/* Tables of scalar inputs/outputs. Because of the way varying packing
|
||||
* works, we could have inputs w/ fractional location, which is a bit
|
||||
* awkward to deal with unless we keep track of the split scalar in/
|
||||
* out components.
|
||||
*
|
||||
* These *only* have inputs/outputs that are touched by load_*input and
|
||||
* store_output.
|
||||
*/
|
||||
unsigned ninputs, noutputs;
|
||||
struct ir3_instruction **inputs;
|
||||
struct ir3_instruction **outputs;
|
||||
|
||||
struct ir3_block *block; /* the current block */
|
||||
struct ir3_block *in_block; /* block created for shader inputs */
|
||||
struct ir3_block *block; /* the current block */
|
||||
struct ir3_block *in_block; /* block created for shader inputs */
|
||||
|
||||
nir_function_impl *impl;
|
||||
nir_function_impl *impl;
|
||||
|
||||
/* For fragment shaders, varyings are not actual shader inputs,
|
||||
* instead the hw passes an ij coord which is used with
|
||||
* bary.f.
|
||||
*
|
||||
* But NIR doesn't know that, it still declares varyings as
|
||||
* inputs. So we do all the input tracking normally and fix
|
||||
* things up after compile_instructions()
|
||||
*/
|
||||
struct ir3_instruction *ij[IJ_COUNT];
|
||||
/* For fragment shaders, varyings are not actual shader inputs,
|
||||
* instead the hw passes an ij coord which is used with
|
||||
* bary.f.
|
||||
*
|
||||
* But NIR doesn't know that, it still declares varyings as
|
||||
* inputs. So we do all the input tracking normally and fix
|
||||
* things up after compile_instructions()
|
||||
*/
|
||||
struct ir3_instruction *ij[IJ_COUNT];
|
||||
|
||||
/* for fragment shaders, for gl_FrontFacing and gl_FragCoord: */
|
||||
struct ir3_instruction *frag_face, *frag_coord;
|
||||
/* for fragment shaders, for gl_FrontFacing and gl_FragCoord: */
|
||||
struct ir3_instruction *frag_face, *frag_coord;
|
||||
|
||||
/* For vertex shaders, keep track of the system values sources */
|
||||
struct ir3_instruction *vertex_id, *basevertex, *instance_id, *base_instance, *draw_id, *view_index;
|
||||
/* For vertex shaders, keep track of the system values sources */
|
||||
struct ir3_instruction *vertex_id, *basevertex, *instance_id, *base_instance,
|
||||
*draw_id, *view_index;
|
||||
|
||||
/* For fragment shaders: */
|
||||
struct ir3_instruction *samp_id, *samp_mask_in;
|
||||
/* For fragment shaders: */
|
||||
struct ir3_instruction *samp_id, *samp_mask_in;
|
||||
|
||||
/* For geometry shaders: */
|
||||
struct ir3_instruction *primitive_id;
|
||||
struct ir3_instruction *gs_header;
|
||||
/* For geometry shaders: */
|
||||
struct ir3_instruction *primitive_id;
|
||||
struct ir3_instruction *gs_header;
|
||||
|
||||
/* For tessellation shaders: */
|
||||
struct ir3_instruction *patch_vertices_in;
|
||||
struct ir3_instruction *tcs_header;
|
||||
struct ir3_instruction *tess_coord;
|
||||
/* For tessellation shaders: */
|
||||
struct ir3_instruction *patch_vertices_in;
|
||||
struct ir3_instruction *tcs_header;
|
||||
struct ir3_instruction *tess_coord;
|
||||
|
||||
/* Compute shader inputs: */
|
||||
struct ir3_instruction *local_invocation_id, *work_group_id;
|
||||
/* Compute shader inputs: */
|
||||
struct ir3_instruction *local_invocation_id, *work_group_id;
|
||||
|
||||
/* mapping from nir_register to defining instruction: */
|
||||
struct hash_table *def_ht;
|
||||
/* mapping from nir_register to defining instruction: */
|
||||
struct hash_table *def_ht;
|
||||
|
||||
unsigned num_arrays;
|
||||
unsigned num_arrays;
|
||||
|
||||
/* Tracking for max level of flowcontrol (branchstack) needed
|
||||
* by a5xx+:
|
||||
*/
|
||||
unsigned stack, max_stack;
|
||||
/* Tracking for max level of flowcontrol (branchstack) needed
|
||||
* by a5xx+:
|
||||
*/
|
||||
unsigned stack, max_stack;
|
||||
|
||||
unsigned loop_id;
|
||||
unsigned loop_id;
|
||||
|
||||
/* a common pattern for indirect addressing is to request the
|
||||
* same address register multiple times. To avoid generating
|
||||
* duplicate instruction sequences (which our backend does not
|
||||
* try to clean up, since that should be done at the NIR stage)
|
||||
* we cache the address value generated for a given src value:
|
||||
*
|
||||
* Note that we have to cache these per alignment, since same
|
||||
* src used for an array of vec1 cannot be also used for an
|
||||
* array of vec4.
|
||||
*/
|
||||
struct hash_table *addr0_ht[4];
|
||||
/* a common pattern for indirect addressing is to request the
|
||||
* same address register multiple times. To avoid generating
|
||||
* duplicate instruction sequences (which our backend does not
|
||||
* try to clean up, since that should be done at the NIR stage)
|
||||
* we cache the address value generated for a given src value:
|
||||
*
|
||||
* Note that we have to cache these per alignment, since same
|
||||
* src used for an array of vec1 cannot be also used for an
|
||||
* array of vec4.
|
||||
*/
|
||||
struct hash_table *addr0_ht[4];
|
||||
|
||||
/* The same for a1.x. We only support immediate values for a1.x, as this
|
||||
* is the only use so far.
|
||||
*/
|
||||
struct hash_table_u64 *addr1_ht;
|
||||
/* The same for a1.x. We only support immediate values for a1.x, as this
|
||||
* is the only use so far.
|
||||
*/
|
||||
struct hash_table_u64 *addr1_ht;
|
||||
|
||||
struct hash_table *sel_cond_conversions;
|
||||
struct hash_table *sel_cond_conversions;
|
||||
|
||||
/* last dst array, for indirect we need to insert a var-store.
|
||||
*/
|
||||
struct ir3_instruction **last_dst;
|
||||
unsigned last_dst_n;
|
||||
/* last dst array, for indirect we need to insert a var-store.
|
||||
*/
|
||||
struct ir3_instruction **last_dst;
|
||||
unsigned last_dst_n;
|
||||
|
||||
/* maps nir_block to ir3_block, mostly for the purposes of
|
||||
* figuring out the block's successors
|
||||
*/
|
||||
struct hash_table *block_ht;
|
||||
/* maps nir_block to ir3_block, mostly for the purposes of
|
||||
* figuring out the block's successors
|
||||
*/
|
||||
struct hash_table *block_ht;
|
||||
|
||||
/* maps nir_block at the top of a loop to ir3_block collecting continue
|
||||
* edges.
|
||||
*/
|
||||
struct hash_table *continue_block_ht;
|
||||
/* maps nir_block at the top of a loop to ir3_block collecting continue
|
||||
* edges.
|
||||
*/
|
||||
struct hash_table *continue_block_ht;
|
||||
|
||||
/* on a4xx, bitmask of samplers which need astc+srgb workaround: */
|
||||
unsigned astc_srgb;
|
||||
/* on a4xx, bitmask of samplers which need astc+srgb workaround: */
|
||||
unsigned astc_srgb;
|
||||
|
||||
unsigned samples; /* bitmask of x,y sample shifts */
|
||||
unsigned samples; /* bitmask of x,y sample shifts */
|
||||
|
||||
unsigned max_texture_index;
|
||||
unsigned max_texture_index;
|
||||
|
||||
unsigned prefetch_limit;
|
||||
unsigned prefetch_limit;
|
||||
|
||||
/* set if we encounter something we can't handle yet, so we
|
||||
* can bail cleanly and fall back to TGSI compiler f/e
|
||||
*/
|
||||
bool error;
|
||||
/* set if we encounter something we can't handle yet, so we
|
||||
* can bail cleanly and fall back to TGSI compiler f/e
|
||||
*/
|
||||
bool error;
|
||||
};
|
||||
|
||||
struct ir3_context_funcs {
|
||||
void (*emit_intrinsic_load_ssbo)(struct ir3_context *ctx, nir_intrinsic_instr *intr,
|
||||
struct ir3_instruction **dst);
|
||||
void (*emit_intrinsic_store_ssbo)(struct ir3_context *ctx, nir_intrinsic_instr *intr);
|
||||
struct ir3_instruction * (*emit_intrinsic_atomic_ssbo)(struct ir3_context *ctx, nir_intrinsic_instr *intr);
|
||||
void (*emit_intrinsic_load_image)(struct ir3_context *ctx, nir_intrinsic_instr *intr,
|
||||
struct ir3_instruction **dst);
|
||||
void (*emit_intrinsic_store_image)(struct ir3_context *ctx, nir_intrinsic_instr *intr);
|
||||
struct ir3_instruction * (*emit_intrinsic_atomic_image)(struct ir3_context *ctx, nir_intrinsic_instr *intr);
|
||||
void (*emit_intrinsic_image_size)(struct ir3_context *ctx, nir_intrinsic_instr *intr,
|
||||
struct ir3_instruction **dst);
|
||||
void (*emit_intrinsic_load_global_ir3)(struct ir3_context *ctx, nir_intrinsic_instr *intr,
|
||||
struct ir3_instruction **dst);
|
||||
void (*emit_intrinsic_store_global_ir3)(struct ir3_context *ctx, nir_intrinsic_instr *intr);
|
||||
void (*emit_intrinsic_load_ssbo)(struct ir3_context *ctx,
|
||||
nir_intrinsic_instr *intr,
|
||||
struct ir3_instruction **dst);
|
||||
void (*emit_intrinsic_store_ssbo)(struct ir3_context *ctx,
|
||||
nir_intrinsic_instr *intr);
|
||||
struct ir3_instruction *(*emit_intrinsic_atomic_ssbo)(
|
||||
struct ir3_context *ctx, nir_intrinsic_instr *intr);
|
||||
void (*emit_intrinsic_load_image)(struct ir3_context *ctx,
|
||||
nir_intrinsic_instr *intr,
|
||||
struct ir3_instruction **dst);
|
||||
void (*emit_intrinsic_store_image)(struct ir3_context *ctx,
|
||||
nir_intrinsic_instr *intr);
|
||||
struct ir3_instruction *(*emit_intrinsic_atomic_image)(
|
||||
struct ir3_context *ctx, nir_intrinsic_instr *intr);
|
||||
void (*emit_intrinsic_image_size)(struct ir3_context *ctx,
|
||||
nir_intrinsic_instr *intr,
|
||||
struct ir3_instruction **dst);
|
||||
void (*emit_intrinsic_load_global_ir3)(struct ir3_context *ctx,
|
||||
nir_intrinsic_instr *intr,
|
||||
struct ir3_instruction **dst);
|
||||
void (*emit_intrinsic_store_global_ir3)(struct ir3_context *ctx,
|
||||
nir_intrinsic_instr *intr);
|
||||
};
|
||||
|
||||
extern const struct ir3_context_funcs ir3_a4xx_funcs;
|
||||
extern const struct ir3_context_funcs ir3_a6xx_funcs;
|
||||
|
||||
struct ir3_context * ir3_context_init(struct ir3_compiler *compiler,
|
||||
struct ir3_shader_variant *so);
|
||||
struct ir3_context *ir3_context_init(struct ir3_compiler *compiler,
|
||||
struct ir3_shader_variant *so);
|
||||
void ir3_context_free(struct ir3_context *ctx);
|
||||
|
||||
struct ir3_instruction ** ir3_get_dst_ssa(struct ir3_context *ctx, nir_ssa_def *dst, unsigned n);
|
||||
struct ir3_instruction ** ir3_get_dst(struct ir3_context *ctx, nir_dest *dst, unsigned n);
|
||||
struct ir3_instruction * const * ir3_get_src(struct ir3_context *ctx, nir_src *src);
|
||||
struct ir3_instruction **ir3_get_dst_ssa(struct ir3_context *ctx,
|
||||
nir_ssa_def *dst, unsigned n);
|
||||
struct ir3_instruction **ir3_get_dst(struct ir3_context *ctx, nir_dest *dst,
|
||||
unsigned n);
|
||||
struct ir3_instruction *const *ir3_get_src(struct ir3_context *ctx,
|
||||
nir_src *src);
|
||||
void ir3_put_dst(struct ir3_context *ctx, nir_dest *dst);
|
||||
struct ir3_instruction * ir3_create_collect(struct ir3_context *ctx,
|
||||
struct ir3_instruction *const *arr, unsigned arrsz);
|
||||
struct ir3_instruction *ir3_create_collect(struct ir3_context *ctx,
|
||||
struct ir3_instruction *const *arr,
|
||||
unsigned arrsz);
|
||||
void ir3_split_dest(struct ir3_block *block, struct ir3_instruction **dst,
|
||||
struct ir3_instruction *src, unsigned base, unsigned n);
|
||||
struct ir3_instruction *src, unsigned base, unsigned n);
|
||||
void ir3_handle_bindless_cat6(struct ir3_instruction *instr, nir_src rsrc);
|
||||
void ir3_handle_nonuniform(struct ir3_instruction *instr, nir_intrinsic_instr *intrin);
|
||||
void emit_intrinsic_image_size_tex(struct ir3_context *ctx, nir_intrinsic_instr *intr,
|
||||
struct ir3_instruction **dst);
|
||||
void ir3_handle_nonuniform(struct ir3_instruction *instr,
|
||||
nir_intrinsic_instr *intrin);
|
||||
void emit_intrinsic_image_size_tex(struct ir3_context *ctx,
|
||||
nir_intrinsic_instr *intr,
|
||||
struct ir3_instruction **dst);
|
||||
|
||||
#define ir3_collect(ctx, ...) ({ \
|
||||
struct ir3_instruction *__arr[] = { __VA_ARGS__ }; \
|
||||
ir3_create_collect(ctx, __arr, ARRAY_SIZE(__arr)); \
|
||||
})
|
||||
#define ir3_collect(ctx, ...) \
|
||||
({ \
|
||||
struct ir3_instruction *__arr[] = {__VA_ARGS__}; \
|
||||
ir3_create_collect(ctx, __arr, ARRAY_SIZE(__arr)); \
|
||||
})
|
||||
|
||||
NORETURN void ir3_context_error(struct ir3_context *ctx, const char *format, ...);
|
||||
NORETURN void ir3_context_error(struct ir3_context *ctx, const char *format,
|
||||
...);
|
||||
|
||||
#define compile_assert(ctx, cond) do { \
|
||||
if (!(cond)) ir3_context_error((ctx), "failed assert: "#cond"\n"); \
|
||||
} while (0)
|
||||
#define compile_assert(ctx, cond) \
|
||||
do { \
|
||||
if (!(cond)) \
|
||||
ir3_context_error((ctx), "failed assert: " #cond "\n"); \
|
||||
} while (0)
|
||||
|
||||
struct ir3_instruction * ir3_get_addr0(struct ir3_context *ctx,
|
||||
struct ir3_instruction *src, int align);
|
||||
struct ir3_instruction * ir3_get_addr1(struct ir3_context *ctx,
|
||||
unsigned const_val);
|
||||
struct ir3_instruction * ir3_get_predicate(struct ir3_context *ctx,
|
||||
struct ir3_instruction *src);
|
||||
struct ir3_instruction *ir3_get_addr0(struct ir3_context *ctx,
|
||||
struct ir3_instruction *src, int align);
|
||||
struct ir3_instruction *ir3_get_addr1(struct ir3_context *ctx,
|
||||
unsigned const_val);
|
||||
struct ir3_instruction *ir3_get_predicate(struct ir3_context *ctx,
|
||||
struct ir3_instruction *src);
|
||||
|
||||
void ir3_declare_array(struct ir3_context *ctx, nir_register *reg);
|
||||
struct ir3_array * ir3_get_array(struct ir3_context *ctx, nir_register *reg);
|
||||
struct ir3_array *ir3_get_array(struct ir3_context *ctx, nir_register *reg);
|
||||
struct ir3_instruction *ir3_create_array_load(struct ir3_context *ctx,
|
||||
struct ir3_array *arr, int n, struct ir3_instruction *address);
|
||||
void ir3_create_array_store(struct ir3_context *ctx, struct ir3_array *arr, int n,
|
||||
struct ir3_instruction *src, struct ir3_instruction *address);
|
||||
struct ir3_array *arr, int n,
|
||||
struct ir3_instruction *address);
|
||||
void ir3_create_array_store(struct ir3_context *ctx, struct ir3_array *arr,
|
||||
int n, struct ir3_instruction *src,
|
||||
struct ir3_instruction *address);
|
||||
|
||||
static inline type_t utype_for_size(unsigned bit_size)
|
||||
static inline type_t
|
||||
utype_for_size(unsigned bit_size)
|
||||
{
|
||||
switch (bit_size) {
|
||||
case 32: return TYPE_U32;
|
||||
case 16: return TYPE_U16;
|
||||
case 8: return TYPE_U8;
|
||||
default: unreachable("bad bitsize"); return ~0;
|
||||
}
|
||||
switch (bit_size) {
|
||||
case 32:
|
||||
return TYPE_U32;
|
||||
case 16:
|
||||
return TYPE_U16;
|
||||
case 8:
|
||||
return TYPE_U8;
|
||||
default:
|
||||
unreachable("bad bitsize");
|
||||
return ~0;
|
||||
}
|
||||
}
|
||||
|
||||
static inline type_t utype_src(nir_src src)
|
||||
{ return utype_for_size(nir_src_bit_size(src)); }
|
||||
static inline type_t
|
||||
utype_src(nir_src src)
|
||||
{
|
||||
return utype_for_size(nir_src_bit_size(src));
|
||||
}
|
||||
|
||||
static inline type_t utype_dst(nir_dest dst)
|
||||
{ return utype_for_size(nir_dest_bit_size(dst)); }
|
||||
static inline type_t
|
||||
utype_dst(nir_dest dst)
|
||||
{
|
||||
return utype_for_size(nir_dest_bit_size(dst));
|
||||
}
|
||||
|
||||
#endif /* IR3_CONTEXT_H_ */
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -36,7 +36,6 @@
|
|||
* one. It is basically anything that is not SSA.
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
* Check if any instruction before `use` and after `src` writes to the
|
||||
* specified array. If `offset` is negative, it is a relative (a0.x)
|
||||
|
@ -48,186 +47,184 @@
|
|||
* the correct array write.
|
||||
*/
|
||||
static bool
|
||||
has_conflicting_write(struct ir3_instruction *src,
|
||||
struct ir3_instruction *use,
|
||||
struct ir3_register **def,
|
||||
unsigned id, int offset)
|
||||
has_conflicting_write(struct ir3_instruction *src, struct ir3_instruction *use,
|
||||
struct ir3_register **def, unsigned id, int offset)
|
||||
{
|
||||
assert(src->block == use->block);
|
||||
bool last_write = true;
|
||||
assert(src->block == use->block);
|
||||
bool last_write = true;
|
||||
|
||||
/* NOTE that since src and use are in the same block, src by
|
||||
* definition appears in the block's instr_list before use:
|
||||
*/
|
||||
foreach_instr_rev (instr, &use->node) {
|
||||
if (instr == src)
|
||||
break;
|
||||
/* NOTE that since src and use are in the same block, src by
|
||||
* definition appears in the block's instr_list before use:
|
||||
*/
|
||||
foreach_instr_rev (instr, &use->node) {
|
||||
if (instr == src)
|
||||
break;
|
||||
|
||||
/* if we are looking at a RELATIV read, we can't move
|
||||
* it past an a0.x write:
|
||||
*/
|
||||
if ((offset < 0) && (dest_regs(instr) > 0) &&
|
||||
(instr->dsts[0]->num == regid(REG_A0, 0)))
|
||||
return true;
|
||||
/* if we are looking at a RELATIV read, we can't move
|
||||
* it past an a0.x write:
|
||||
*/
|
||||
if ((offset < 0) && (dest_regs(instr) > 0) &&
|
||||
(instr->dsts[0]->num == regid(REG_A0, 0)))
|
||||
return true;
|
||||
|
||||
if (!writes_gpr(instr))
|
||||
continue;
|
||||
if (!writes_gpr(instr))
|
||||
continue;
|
||||
|
||||
struct ir3_register *dst = instr->dsts[0];
|
||||
if (!(dst->flags & IR3_REG_ARRAY))
|
||||
continue;
|
||||
struct ir3_register *dst = instr->dsts[0];
|
||||
if (!(dst->flags & IR3_REG_ARRAY))
|
||||
continue;
|
||||
|
||||
if (dst->array.id != id)
|
||||
continue;
|
||||
if (dst->array.id != id)
|
||||
continue;
|
||||
|
||||
/*
|
||||
* At this point, we have narrowed down an instruction
|
||||
* that writes to the same array.. check if the write
|
||||
* is to an array element that we care about:
|
||||
*/
|
||||
/*
|
||||
* At this point, we have narrowed down an instruction
|
||||
* that writes to the same array.. check if the write
|
||||
* is to an array element that we care about:
|
||||
*/
|
||||
|
||||
/* is write to an unknown array element? */
|
||||
if (dst->flags & IR3_REG_RELATIV)
|
||||
return true;
|
||||
/* is write to an unknown array element? */
|
||||
if (dst->flags & IR3_REG_RELATIV)
|
||||
return true;
|
||||
|
||||
/* is read from an unknown array element? */
|
||||
if (offset < 0)
|
||||
return true;
|
||||
/* is read from an unknown array element? */
|
||||
if (offset < 0)
|
||||
return true;
|
||||
|
||||
/* is write to same array element? */
|
||||
if (dst->array.offset == offset)
|
||||
return true;
|
||||
/* is write to same array element? */
|
||||
if (dst->array.offset == offset)
|
||||
return true;
|
||||
|
||||
if (last_write)
|
||||
*def = dst;
|
||||
if (last_write)
|
||||
*def = dst;
|
||||
|
||||
last_write = false;
|
||||
}
|
||||
last_write = false;
|
||||
}
|
||||
|
||||
return false;
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Can we fold the mov src into use without invalid flags? */
|
||||
static bool
|
||||
valid_flags(struct ir3_instruction *use, struct ir3_instruction *mov)
|
||||
{
|
||||
struct ir3_register *src = mov->srcs[0];
|
||||
struct ir3_register *src = mov->srcs[0];
|
||||
|
||||
foreach_src_n (reg, n, use) {
|
||||
if (ssa(reg) != mov)
|
||||
continue;
|
||||
foreach_src_n (reg, n, use) {
|
||||
if (ssa(reg) != mov)
|
||||
continue;
|
||||
|
||||
if (!ir3_valid_flags(use, n, reg->flags | src->flags))
|
||||
return false;
|
||||
}
|
||||
if (!ir3_valid_flags(use, n, reg->flags | src->flags))
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
instr_cp_postsched(struct ir3_instruction *mov)
|
||||
{
|
||||
struct ir3_register *src = mov->srcs[0];
|
||||
struct ir3_register *src = mov->srcs[0];
|
||||
|
||||
/* only consider mov's from "arrays", other cases we have
|
||||
* already considered:
|
||||
*/
|
||||
if (!(src->flags & IR3_REG_ARRAY))
|
||||
return false;
|
||||
/* only consider mov's from "arrays", other cases we have
|
||||
* already considered:
|
||||
*/
|
||||
if (!(src->flags & IR3_REG_ARRAY))
|
||||
return false;
|
||||
|
||||
int offset = (src->flags & IR3_REG_RELATIV) ? -1 : src->array.offset;
|
||||
int offset = (src->flags & IR3_REG_RELATIV) ? -1 : src->array.offset;
|
||||
|
||||
/* Once we move the array read directly into the consuming
|
||||
* instruction(s), we will also need to update instructions
|
||||
* that had a false-dep on the original mov to have deps
|
||||
* on the consuming instructions:
|
||||
*/
|
||||
struct util_dynarray newdeps;
|
||||
util_dynarray_init(&newdeps, mov->uses);
|
||||
/* Once we move the array read directly into the consuming
|
||||
* instruction(s), we will also need to update instructions
|
||||
* that had a false-dep on the original mov to have deps
|
||||
* on the consuming instructions:
|
||||
*/
|
||||
struct util_dynarray newdeps;
|
||||
util_dynarray_init(&newdeps, mov->uses);
|
||||
|
||||
foreach_ssa_use (use, mov) {
|
||||
if (use->block != mov->block)
|
||||
continue;
|
||||
foreach_ssa_use (use, mov) {
|
||||
if (use->block != mov->block)
|
||||
continue;
|
||||
|
||||
if (is_meta(use))
|
||||
continue;
|
||||
if (is_meta(use))
|
||||
continue;
|
||||
|
||||
struct ir3_register *def = src->def;
|
||||
if (has_conflicting_write(mov, use, &def, src->array.id, offset))
|
||||
continue;
|
||||
struct ir3_register *def = src->def;
|
||||
if (has_conflicting_write(mov, use, &def, src->array.id, offset))
|
||||
continue;
|
||||
|
||||
if (conflicts(mov->address, use->address))
|
||||
continue;
|
||||
if (conflicts(mov->address, use->address))
|
||||
continue;
|
||||
|
||||
if (!valid_flags(use, mov))
|
||||
continue;
|
||||
if (!valid_flags(use, mov))
|
||||
continue;
|
||||
|
||||
/* Ok, we've established that it is safe to remove this copy: */
|
||||
/* Ok, we've established that it is safe to remove this copy: */
|
||||
|
||||
bool removed = false;
|
||||
foreach_src_n (reg, n, use) {
|
||||
if (ssa(reg) != mov)
|
||||
continue;
|
||||
bool removed = false;
|
||||
foreach_src_n (reg, n, use) {
|
||||
if (ssa(reg) != mov)
|
||||
continue;
|
||||
|
||||
use->srcs[n] = ir3_reg_clone(mov->block->shader, src);
|
||||
use->srcs[n] = ir3_reg_clone(mov->block->shader, src);
|
||||
|
||||
/* preserve (abs)/etc modifiers: */
|
||||
use->srcs[n]-> flags |= reg->flags;
|
||||
/* preserve (abs)/etc modifiers: */
|
||||
use->srcs[n]->flags |= reg->flags;
|
||||
|
||||
/* If we're sinking the array read past any writes, make
|
||||
* sure to update it to point to the new previous write:
|
||||
*/
|
||||
use->srcs[n]->def = def;
|
||||
/* If we're sinking the array read past any writes, make
|
||||
* sure to update it to point to the new previous write:
|
||||
*/
|
||||
use->srcs[n]->def = def;
|
||||
|
||||
removed = true;
|
||||
}
|
||||
removed = true;
|
||||
}
|
||||
|
||||
/* the use could have been only a false-dep, only add to the newdeps
|
||||
* array and update the address if we've actually updated a real src
|
||||
* reg for the use:
|
||||
*/
|
||||
if (removed) {
|
||||
if (src->flags & IR3_REG_RELATIV)
|
||||
ir3_instr_set_address(use, mov->address->def->instr);
|
||||
/* the use could have been only a false-dep, only add to the newdeps
|
||||
* array and update the address if we've actually updated a real src
|
||||
* reg for the use:
|
||||
*/
|
||||
if (removed) {
|
||||
if (src->flags & IR3_REG_RELATIV)
|
||||
ir3_instr_set_address(use, mov->address->def->instr);
|
||||
|
||||
util_dynarray_append(&newdeps, struct ir3_instruction *, use);
|
||||
util_dynarray_append(&newdeps, struct ir3_instruction *, use);
|
||||
|
||||
/* Remove the use from the src instruction: */
|
||||
_mesa_set_remove_key(mov->uses, use);
|
||||
}
|
||||
}
|
||||
/* Remove the use from the src instruction: */
|
||||
_mesa_set_remove_key(mov->uses, use);
|
||||
}
|
||||
}
|
||||
|
||||
/* Once we have the complete set of instruction(s) that are now
|
||||
* directly reading from the array, update any false-dep uses to
|
||||
* now depend on these instructions. The only remaining uses at
|
||||
* this point should be false-deps:
|
||||
*/
|
||||
foreach_ssa_use (use, mov) {
|
||||
util_dynarray_foreach(&newdeps, struct ir3_instruction *, instrp) {
|
||||
struct ir3_instruction *newdep = *instrp;
|
||||
ir3_instr_add_dep(use, newdep);
|
||||
}
|
||||
}
|
||||
/* Once we have the complete set of instruction(s) that are now
|
||||
* directly reading from the array, update any false-dep uses to
|
||||
* now depend on these instructions. The only remaining uses at
|
||||
* this point should be false-deps:
|
||||
*/
|
||||
foreach_ssa_use (use, mov) {
|
||||
util_dynarray_foreach (&newdeps, struct ir3_instruction *, instrp) {
|
||||
struct ir3_instruction *newdep = *instrp;
|
||||
ir3_instr_add_dep(use, newdep);
|
||||
}
|
||||
}
|
||||
|
||||
return util_dynarray_num_elements(&newdeps, struct ir3_instruction **) > 0;
|
||||
return util_dynarray_num_elements(&newdeps, struct ir3_instruction **) > 0;
|
||||
}
|
||||
|
||||
bool
|
||||
ir3_cp_postsched(struct ir3 *ir)
|
||||
{
|
||||
void *mem_ctx = ralloc_context(NULL);
|
||||
bool progress = false;
|
||||
void *mem_ctx = ralloc_context(NULL);
|
||||
bool progress = false;
|
||||
|
||||
ir3_find_ssa_uses(ir, mem_ctx, false);
|
||||
ir3_find_ssa_uses(ir, mem_ctx, false);
|
||||
|
||||
foreach_block (block, &ir->block_list) {
|
||||
foreach_instr_safe (instr, &block->instr_list) {
|
||||
if (is_same_type_mov(instr))
|
||||
progress |= instr_cp_postsched(instr);
|
||||
}
|
||||
}
|
||||
foreach_block (block, &ir->block_list) {
|
||||
foreach_instr_safe (instr, &block->instr_list) {
|
||||
if (is_same_type_mov(instr))
|
||||
progress |= instr_cp_postsched(instr);
|
||||
}
|
||||
}
|
||||
|
||||
ralloc_free(mem_ctx);
|
||||
ralloc_free(mem_ctx);
|
||||
|
||||
return progress;
|
||||
return progress;
|
||||
}
|
||||
|
|
|
@ -37,112 +37,109 @@
|
|||
static uint32_t
|
||||
hash_instr(const void *data)
|
||||
{
|
||||
const struct ir3_instruction *instr = data;
|
||||
uint32_t hash = 0;
|
||||
const struct ir3_instruction *instr = data;
|
||||
uint32_t hash = 0;
|
||||
|
||||
hash = HASH(hash, instr->opc);
|
||||
hash = HASH(hash, instr->dsts[0]->flags);
|
||||
foreach_src (src, (struct ir3_instruction *) instr) {
|
||||
if (src->flags & IR3_REG_CONST)
|
||||
hash = HASH(hash, src->num);
|
||||
else if (src->flags & IR3_REG_IMMED)
|
||||
hash = HASH(hash, src->uim_val);
|
||||
else
|
||||
hash = HASH(hash, src->def);
|
||||
}
|
||||
hash = HASH(hash, instr->opc);
|
||||
hash = HASH(hash, instr->dsts[0]->flags);
|
||||
foreach_src (src, (struct ir3_instruction *)instr) {
|
||||
if (src->flags & IR3_REG_CONST)
|
||||
hash = HASH(hash, src->num);
|
||||
else if (src->flags & IR3_REG_IMMED)
|
||||
hash = HASH(hash, src->uim_val);
|
||||
else
|
||||
hash = HASH(hash, src->def);
|
||||
}
|
||||
|
||||
return hash;
|
||||
return hash;
|
||||
}
|
||||
|
||||
static bool
|
||||
instrs_equal(const struct ir3_instruction *i1, const struct ir3_instruction *i2)
|
||||
{
|
||||
if (i1->opc != i2->opc)
|
||||
return false;
|
||||
if (i1->opc != i2->opc)
|
||||
return false;
|
||||
|
||||
if (i1->dsts_count != i2->dsts_count)
|
||||
return false;
|
||||
if (i1->dsts_count != i2->dsts_count)
|
||||
return false;
|
||||
|
||||
if (i1->srcs_count != i2->srcs_count)
|
||||
return false;
|
||||
if (i1->srcs_count != i2->srcs_count)
|
||||
return false;
|
||||
|
||||
if (i1->dsts[0]->flags != i2->dsts[0]->flags)
|
||||
return false;
|
||||
if (i1->dsts[0]->flags != i2->dsts[0]->flags)
|
||||
return false;
|
||||
|
||||
for (unsigned i = 0; i < i1->srcs_count; i++) {
|
||||
const struct ir3_register *i1_reg = i1->srcs[i], *i2_reg = i2->srcs[i];
|
||||
for (unsigned i = 0; i < i1->srcs_count; i++) {
|
||||
const struct ir3_register *i1_reg = i1->srcs[i], *i2_reg = i2->srcs[i];
|
||||
|
||||
if (i1_reg->flags != i2_reg->flags)
|
||||
return false;
|
||||
if (i1_reg->flags != i2_reg->flags)
|
||||
return false;
|
||||
|
||||
if (i1_reg->flags & IR3_REG_CONST) {
|
||||
if (i1_reg->num != i2_reg->num)
|
||||
return false;
|
||||
} else if (i1_reg->flags & IR3_REG_IMMED) {
|
||||
if (i1_reg->uim_val != i2_reg->uim_val)
|
||||
return false;
|
||||
} else {
|
||||
if (i1_reg->def != i2_reg->def)
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (i1_reg->flags & IR3_REG_CONST) {
|
||||
if (i1_reg->num != i2_reg->num)
|
||||
return false;
|
||||
} else if (i1_reg->flags & IR3_REG_IMMED) {
|
||||
if (i1_reg->uim_val != i2_reg->uim_val)
|
||||
return false;
|
||||
} else {
|
||||
if (i1_reg->def != i2_reg->def)
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
instr_can_cse(const struct ir3_instruction *instr)
|
||||
{
|
||||
if (instr->opc != OPC_META_COLLECT)
|
||||
return false;
|
||||
if (instr->opc != OPC_META_COLLECT)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
cmp_func(const void *data1, const void *data2)
|
||||
{
|
||||
return instrs_equal(data1, data2);
|
||||
return instrs_equal(data1, data2);
|
||||
}
|
||||
|
||||
bool
|
||||
ir3_cse(struct ir3 *ir)
|
||||
{
|
||||
struct set *instr_set = _mesa_set_create(NULL, hash_instr, cmp_func);
|
||||
foreach_block (block, &ir->block_list) {
|
||||
_mesa_set_clear(instr_set, NULL);
|
||||
struct set *instr_set = _mesa_set_create(NULL, hash_instr, cmp_func);
|
||||
foreach_block (block, &ir->block_list) {
|
||||
_mesa_set_clear(instr_set, NULL);
|
||||
|
||||
foreach_instr (instr, &block->instr_list) {
|
||||
instr->data = NULL;
|
||||
foreach_instr (instr, &block->instr_list) {
|
||||
instr->data = NULL;
|
||||
|
||||
if (!instr_can_cse(instr))
|
||||
continue;
|
||||
if (!instr_can_cse(instr))
|
||||
continue;
|
||||
|
||||
bool found;
|
||||
struct set_entry *entry =
|
||||
_mesa_set_search_or_add(instr_set, instr, &found);
|
||||
if (found)
|
||||
instr->data = (void *) entry->key;
|
||||
}
|
||||
}
|
||||
bool found;
|
||||
struct set_entry *entry =
|
||||
_mesa_set_search_or_add(instr_set, instr, &found);
|
||||
if (found)
|
||||
instr->data = (void *)entry->key;
|
||||
}
|
||||
}
|
||||
|
||||
bool progress = false;
|
||||
foreach_block (block, &ir->block_list) {
|
||||
foreach_instr (instr, &block->instr_list) {
|
||||
foreach_src (src, instr) {
|
||||
if ((src->flags & IR3_REG_SSA) && src->def &&
|
||||
src->def->instr->data) {
|
||||
progress = true;
|
||||
struct ir3_instruction *instr = src->def->instr->data;
|
||||
src->def = instr->dsts[0];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool progress = false;
|
||||
foreach_block (block, &ir->block_list) {
|
||||
foreach_instr (instr, &block->instr_list) {
|
||||
foreach_src(src, instr) {
|
||||
if ((src->flags & IR3_REG_SSA) &&
|
||||
src->def &&
|
||||
src->def->instr->data) {
|
||||
progress = true;
|
||||
struct ir3_instruction *instr = src->def->instr->data;
|
||||
src->def = instr->dsts[0];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
_mesa_set_destroy(instr_set, NULL);
|
||||
return progress;
|
||||
_mesa_set_destroy(instr_set, NULL);
|
||||
return progress;
|
||||
}
|
||||
|
||||
|
|
|
@ -36,168 +36,168 @@
|
|||
static void
|
||||
mark_array_use(struct ir3_instruction *instr, struct ir3_register *reg)
|
||||
{
|
||||
if (reg->flags & IR3_REG_ARRAY) {
|
||||
struct ir3_array *arr =
|
||||
ir3_lookup_array(instr->block->shader, reg->array.id);
|
||||
arr->unused = false;
|
||||
}
|
||||
if (reg->flags & IR3_REG_ARRAY) {
|
||||
struct ir3_array *arr =
|
||||
ir3_lookup_array(instr->block->shader, reg->array.id);
|
||||
arr->unused = false;
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
instr_dce(struct ir3_instruction *instr, bool falsedep)
|
||||
{
|
||||
/* don't mark falsedep's as used, but otherwise process them normally: */
|
||||
if (!falsedep)
|
||||
instr->flags &= ~IR3_INSTR_UNUSED;
|
||||
/* don't mark falsedep's as used, but otherwise process them normally: */
|
||||
if (!falsedep)
|
||||
instr->flags &= ~IR3_INSTR_UNUSED;
|
||||
|
||||
if (ir3_instr_check_mark(instr))
|
||||
return;
|
||||
if (ir3_instr_check_mark(instr))
|
||||
return;
|
||||
|
||||
if (writes_gpr(instr))
|
||||
mark_array_use(instr, instr->dsts[0]); /* dst */
|
||||
if (writes_gpr(instr))
|
||||
mark_array_use(instr, instr->dsts[0]); /* dst */
|
||||
|
||||
foreach_src (reg, instr)
|
||||
mark_array_use(instr, reg); /* src */
|
||||
foreach_src (reg, instr)
|
||||
mark_array_use(instr, reg); /* src */
|
||||
|
||||
foreach_ssa_src_n (src, i, instr) {
|
||||
instr_dce(src, __is_false_dep(instr, i));
|
||||
}
|
||||
foreach_ssa_src_n (src, i, instr) {
|
||||
instr_dce(src, __is_false_dep(instr, i));
|
||||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
remove_unused_by_block(struct ir3_block *block)
|
||||
{
|
||||
bool progress = false;
|
||||
foreach_instr_safe (instr, &block->instr_list) {
|
||||
if (instr->opc == OPC_END || instr->opc == OPC_CHSH || instr->opc == OPC_CHMASK)
|
||||
continue;
|
||||
if (instr->flags & IR3_INSTR_UNUSED) {
|
||||
if (instr->opc == OPC_META_SPLIT) {
|
||||
struct ir3_instruction *src = ssa(instr->srcs[0]);
|
||||
/* tex (cat5) instructions have a writemask, so we can
|
||||
* mask off unused components. Other instructions do not.
|
||||
*/
|
||||
if (src && is_tex_or_prefetch(src) && (src->dsts[0]->wrmask > 1)) {
|
||||
src->dsts[0]->wrmask &= ~(1 << instr->split.off);
|
||||
}
|
||||
}
|
||||
bool progress = false;
|
||||
foreach_instr_safe (instr, &block->instr_list) {
|
||||
if (instr->opc == OPC_END || instr->opc == OPC_CHSH ||
|
||||
instr->opc == OPC_CHMASK)
|
||||
continue;
|
||||
if (instr->flags & IR3_INSTR_UNUSED) {
|
||||
if (instr->opc == OPC_META_SPLIT) {
|
||||
struct ir3_instruction *src = ssa(instr->srcs[0]);
|
||||
/* tex (cat5) instructions have a writemask, so we can
|
||||
* mask off unused components. Other instructions do not.
|
||||
*/
|
||||
if (src && is_tex_or_prefetch(src) && (src->dsts[0]->wrmask > 1)) {
|
||||
src->dsts[0]->wrmask &= ~(1 << instr->split.off);
|
||||
}
|
||||
}
|
||||
|
||||
/* prune false-deps, etc: */
|
||||
foreach_ssa_use (use, instr)
|
||||
foreach_ssa_srcp_n (srcp, n, use)
|
||||
if (*srcp == instr)
|
||||
*srcp = NULL;
|
||||
/* prune false-deps, etc: */
|
||||
foreach_ssa_use (use, instr)
|
||||
foreach_ssa_srcp_n (srcp, n, use)
|
||||
if (*srcp == instr)
|
||||
*srcp = NULL;
|
||||
|
||||
list_delinit(&instr->node);
|
||||
progress = true;
|
||||
}
|
||||
}
|
||||
return progress;
|
||||
list_delinit(&instr->node);
|
||||
progress = true;
|
||||
}
|
||||
}
|
||||
return progress;
|
||||
}
|
||||
|
||||
static bool
|
||||
find_and_remove_unused(struct ir3 *ir, struct ir3_shader_variant *so)
|
||||
{
|
||||
unsigned i;
|
||||
bool progress = false;
|
||||
unsigned i;
|
||||
bool progress = false;
|
||||
|
||||
ir3_clear_mark(ir);
|
||||
ir3_clear_mark(ir);
|
||||
|
||||
/* initially mark everything as unused, we'll clear the flag as we
|
||||
* visit the instructions:
|
||||
*/
|
||||
foreach_block (block, &ir->block_list) {
|
||||
foreach_instr (instr, &block->instr_list) {
|
||||
/* special case, if pre-fs texture fetch used, we cannot
|
||||
* eliminate the barycentric i/j input
|
||||
*/
|
||||
if (so->num_sampler_prefetch &&
|
||||
(instr->opc == OPC_META_INPUT) &&
|
||||
(instr->input.sysval == SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL))
|
||||
continue;
|
||||
instr->flags |= IR3_INSTR_UNUSED;
|
||||
}
|
||||
}
|
||||
/* initially mark everything as unused, we'll clear the flag as we
|
||||
* visit the instructions:
|
||||
*/
|
||||
foreach_block (block, &ir->block_list) {
|
||||
foreach_instr (instr, &block->instr_list) {
|
||||
/* special case, if pre-fs texture fetch used, we cannot
|
||||
* eliminate the barycentric i/j input
|
||||
*/
|
||||
if (so->num_sampler_prefetch && (instr->opc == OPC_META_INPUT) &&
|
||||
(instr->input.sysval == SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL))
|
||||
continue;
|
||||
instr->flags |= IR3_INSTR_UNUSED;
|
||||
}
|
||||
}
|
||||
|
||||
foreach_array (arr, &ir->array_list)
|
||||
arr->unused = true;
|
||||
foreach_array (arr, &ir->array_list)
|
||||
arr->unused = true;
|
||||
|
||||
foreach_block (block, &ir->block_list) {
|
||||
for (i = 0; i < block->keeps_count; i++)
|
||||
instr_dce(block->keeps[i], false);
|
||||
foreach_block (block, &ir->block_list) {
|
||||
for (i = 0; i < block->keeps_count; i++)
|
||||
instr_dce(block->keeps[i], false);
|
||||
|
||||
/* We also need to account for if-condition: */
|
||||
if (block->condition)
|
||||
instr_dce(block->condition, false);
|
||||
}
|
||||
/* We also need to account for if-condition: */
|
||||
if (block->condition)
|
||||
instr_dce(block->condition, false);
|
||||
}
|
||||
|
||||
/* remove un-used instructions: */
|
||||
foreach_block (block, &ir->block_list) {
|
||||
progress |= remove_unused_by_block(block);
|
||||
}
|
||||
/* remove un-used instructions: */
|
||||
foreach_block (block, &ir->block_list) {
|
||||
progress |= remove_unused_by_block(block);
|
||||
}
|
||||
|
||||
/* remove un-used arrays: */
|
||||
foreach_array_safe (arr, &ir->array_list) {
|
||||
if (arr->unused)
|
||||
list_delinit(&arr->node);
|
||||
}
|
||||
/* remove un-used arrays: */
|
||||
foreach_array_safe (arr, &ir->array_list) {
|
||||
if (arr->unused)
|
||||
list_delinit(&arr->node);
|
||||
}
|
||||
|
||||
/* fixup wrmask of split instructions to account for adjusted tex
|
||||
* wrmask's:
|
||||
*/
|
||||
foreach_block (block, &ir->block_list) {
|
||||
foreach_instr (instr, &block->instr_list) {
|
||||
if (instr->opc != OPC_META_SPLIT)
|
||||
continue;
|
||||
/* fixup wrmask of split instructions to account for adjusted tex
|
||||
* wrmask's:
|
||||
*/
|
||||
foreach_block (block, &ir->block_list) {
|
||||
foreach_instr (instr, &block->instr_list) {
|
||||
if (instr->opc != OPC_META_SPLIT)
|
||||
continue;
|
||||
|
||||
struct ir3_instruction *src = ssa(instr->srcs[0]);
|
||||
if (!is_tex_or_prefetch(src))
|
||||
continue;
|
||||
struct ir3_instruction *src = ssa(instr->srcs[0]);
|
||||
if (!is_tex_or_prefetch(src))
|
||||
continue;
|
||||
|
||||
instr->srcs[0]->wrmask = src->dsts[0]->wrmask;
|
||||
}
|
||||
}
|
||||
instr->srcs[0]->wrmask = src->dsts[0]->wrmask;
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 0; i < ir->a0_users_count; i++) {
|
||||
struct ir3_instruction *instr = ir->a0_users[i];
|
||||
if (instr && (instr->flags & IR3_INSTR_UNUSED))
|
||||
ir->a0_users[i] = NULL;
|
||||
}
|
||||
for (i = 0; i < ir->a0_users_count; i++) {
|
||||
struct ir3_instruction *instr = ir->a0_users[i];
|
||||
if (instr && (instr->flags & IR3_INSTR_UNUSED))
|
||||
ir->a0_users[i] = NULL;
|
||||
}
|
||||
|
||||
for (i = 0; i < ir->a1_users_count; i++) {
|
||||
struct ir3_instruction *instr = ir->a1_users[i];
|
||||
if (instr && (instr->flags & IR3_INSTR_UNUSED))
|
||||
ir->a1_users[i] = NULL;
|
||||
}
|
||||
for (i = 0; i < ir->a1_users_count; i++) {
|
||||
struct ir3_instruction *instr = ir->a1_users[i];
|
||||
if (instr && (instr->flags & IR3_INSTR_UNUSED))
|
||||
ir->a1_users[i] = NULL;
|
||||
}
|
||||
|
||||
for (i = 0; i < ir->predicates_count; i++) {
|
||||
struct ir3_instruction *instr = ir->predicates[i];
|
||||
if (instr && (instr->flags & IR3_INSTR_UNUSED))
|
||||
ir->predicates[i] = NULL;
|
||||
}
|
||||
for (i = 0; i < ir->predicates_count; i++) {
|
||||
struct ir3_instruction *instr = ir->predicates[i];
|
||||
if (instr && (instr->flags & IR3_INSTR_UNUSED))
|
||||
ir->predicates[i] = NULL;
|
||||
}
|
||||
|
||||
/* cleanup unused inputs: */
|
||||
foreach_input_n (in, n, ir)
|
||||
if (in->flags & IR3_INSTR_UNUSED)
|
||||
ir->inputs[n] = NULL;
|
||||
/* cleanup unused inputs: */
|
||||
foreach_input_n (in, n, ir)
|
||||
if (in->flags & IR3_INSTR_UNUSED)
|
||||
ir->inputs[n] = NULL;
|
||||
|
||||
return progress;
|
||||
return progress;
|
||||
}
|
||||
|
||||
/**
 * Dead-code elimination entry point: repeats find_and_remove_unused()
 * until a pass reports no further progress.
 *
 * @return true if any pass made progress
 */
bool
ir3_dce(struct ir3 *ir, struct ir3_shader_variant *so)
{
   bool any_progress = false;
   void *uses_ctx = ralloc_context(NULL);

   ir3_find_ssa_uses(ir, uses_ctx, true);

   /* Always runs at least once; stops after the first pass that reports
    * nothing removed.
    */
   while (find_and_remove_unused(ir, so))
      any_progress = true;

   ralloc_free(uses_ctx);

   return any_progress;
}
|
||||
|
|
|
@ -57,116 +57,112 @@
|
|||
*/
|
||||
int
|
||||
ir3_delayslots(struct ir3_instruction *assigner,
|
||||
struct ir3_instruction *consumer, unsigned n, bool soft)
|
||||
struct ir3_instruction *consumer, unsigned n, bool soft)
|
||||
{
|
||||
/* generally don't count false dependencies, since this can just be
|
||||
* something like a barrier, or SSBO store.
|
||||
*/
|
||||
if (__is_false_dep(consumer, n))
|
||||
return 0;
|
||||
/* generally don't count false dependencies, since this can just be
|
||||
* something like a barrier, or SSBO store.
|
||||
*/
|
||||
if (__is_false_dep(consumer, n))
|
||||
return 0;
|
||||
|
||||
/* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal
|
||||
* alu -> alu needs 3 cycles, cat4 -> alu and texture fetch
|
||||
* handled with sync bits
|
||||
*/
|
||||
/* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal
|
||||
* alu -> alu needs 3 cycles, cat4 -> alu and texture fetch
|
||||
* handled with sync bits
|
||||
*/
|
||||
|
||||
if (is_meta(assigner) || is_meta(consumer))
|
||||
return 0;
|
||||
if (is_meta(assigner) || is_meta(consumer))
|
||||
return 0;
|
||||
|
||||
if (writes_addr0(assigner) || writes_addr1(assigner))
|
||||
return 6;
|
||||
if (writes_addr0(assigner) || writes_addr1(assigner))
|
||||
return 6;
|
||||
|
||||
if (soft && is_sfu(assigner))
|
||||
return SOFT_SS_NOPS;
|
||||
if (soft && is_sfu(assigner))
|
||||
return SOFT_SS_NOPS;
|
||||
|
||||
/* handled via sync flags: */
|
||||
if (is_sfu(assigner) || is_tex(assigner) || is_mem(assigner))
|
||||
return 0;
|
||||
/* handled via sync flags: */
|
||||
if (is_sfu(assigner) || is_tex(assigner) || is_mem(assigner))
|
||||
return 0;
|
||||
|
||||
/* As far as we know, shader outputs don't need any delay. */
|
||||
if (consumer->opc == OPC_END || consumer->opc == OPC_CHMASK)
|
||||
return 0;
|
||||
/* As far as we know, shader outputs don't need any delay. */
|
||||
if (consumer->opc == OPC_END || consumer->opc == OPC_CHMASK)
|
||||
return 0;
|
||||
|
||||
/* assigner must be alu: */
|
||||
if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) ||
|
||||
is_mem(consumer) || (assigner->dsts[0]->flags & IR3_REG_SHARED)) {
|
||||
return 6;
|
||||
} else {
|
||||
/* In mergedregs mode, there is an extra 2-cycle penalty when half of
|
||||
* a full-reg is read as a half-reg or when a half-reg is read as a
|
||||
* full-reg.
|
||||
*/
|
||||
bool mismatched_half =
|
||||
(assigner->dsts[0]->flags & IR3_REG_HALF) !=
|
||||
(consumer->srcs[n]->flags & IR3_REG_HALF);
|
||||
unsigned penalty = mismatched_half ? 2 : 0;
|
||||
if ((is_mad(consumer->opc) || is_madsh(consumer->opc)) &&
|
||||
(n == 2)) {
|
||||
/* special case, 3rd src to cat3 not required on first cycle */
|
||||
return 1 + penalty;
|
||||
} else {
|
||||
return 3 + penalty;
|
||||
}
|
||||
}
|
||||
/* assigner must be alu: */
|
||||
if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) ||
|
||||
is_mem(consumer) || (assigner->dsts[0]->flags & IR3_REG_SHARED)) {
|
||||
return 6;
|
||||
} else {
|
||||
/* In mergedregs mode, there is an extra 2-cycle penalty when half of
|
||||
* a full-reg is read as a half-reg or when a half-reg is read as a
|
||||
* full-reg.
|
||||
*/
|
||||
bool mismatched_half = (assigner->dsts[0]->flags & IR3_REG_HALF) !=
|
||||
(consumer->srcs[n]->flags & IR3_REG_HALF);
|
||||
unsigned penalty = mismatched_half ? 2 : 0;
|
||||
if ((is_mad(consumer->opc) || is_madsh(consumer->opc)) && (n == 2)) {
|
||||
/* special case, 3rd src to cat3 not required on first cycle */
|
||||
return 1 + penalty;
|
||||
} else {
|
||||
return 3 + penalty;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
count_instruction(struct ir3_instruction *n)
|
||||
{
|
||||
/* NOTE: don't count branch/jump since we don't know yet if they will
|
||||
* be eliminated later in resolve_jumps().. really should do that
|
||||
* earlier so we don't have this constraint.
|
||||
*/
|
||||
return is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_B));
|
||||
/* NOTE: don't count branch/jump since we don't know yet if they will
|
||||
* be eliminated later in resolve_jumps().. really should do that
|
||||
* earlier so we don't have this constraint.
|
||||
*/
|
||||
return is_alu(n) ||
|
||||
(is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_B));
|
||||
}
|
||||
|
||||
static unsigned
|
||||
distance(struct ir3_block *block, struct ir3_instruction *instr,
|
||||
unsigned maxd)
|
||||
distance(struct ir3_block *block, struct ir3_instruction *instr, unsigned maxd)
|
||||
{
|
||||
unsigned d = 0;
|
||||
unsigned d = 0;
|
||||
|
||||
/* Note that this relies on incrementally building up the block's
|
||||
* instruction list.. but this is how scheduling and nopsched
|
||||
* work.
|
||||
*/
|
||||
foreach_instr_rev (n, &block->instr_list) {
|
||||
if ((n == instr) || (d >= maxd))
|
||||
return MIN2(maxd, d + n->nop);
|
||||
if (count_instruction(n))
|
||||
d = MIN2(maxd, d + 1 + n->repeat + n->nop);
|
||||
}
|
||||
/* Note that this relies on incrementally building up the block's
|
||||
* instruction list.. but this is how scheduling and nopsched
|
||||
* work.
|
||||
*/
|
||||
foreach_instr_rev (n, &block->instr_list) {
|
||||
if ((n == instr) || (d >= maxd))
|
||||
return MIN2(maxd, d + n->nop);
|
||||
if (count_instruction(n))
|
||||
d = MIN2(maxd, d + 1 + n->repeat + n->nop);
|
||||
}
|
||||
|
||||
return maxd;
|
||||
return maxd;
|
||||
}
|
||||
|
||||
static unsigned
|
||||
delay_calc_srcn_prera(struct ir3_block *block,
|
||||
struct ir3_instruction *assigner,
|
||||
struct ir3_instruction *consumer,
|
||||
unsigned srcn)
|
||||
delay_calc_srcn_prera(struct ir3_block *block, struct ir3_instruction *assigner,
|
||||
struct ir3_instruction *consumer, unsigned srcn)
|
||||
{
|
||||
unsigned delay = 0;
|
||||
unsigned delay = 0;
|
||||
|
||||
if (assigner->opc == OPC_META_PHI)
|
||||
return 0;
|
||||
if (assigner->opc == OPC_META_PHI)
|
||||
return 0;
|
||||
|
||||
if (is_meta(assigner)) {
|
||||
foreach_src_n (src, n, assigner) {
|
||||
unsigned d;
|
||||
if (is_meta(assigner)) {
|
||||
foreach_src_n (src, n, assigner) {
|
||||
unsigned d;
|
||||
|
||||
if (!src->def)
|
||||
continue;
|
||||
if (!src->def)
|
||||
continue;
|
||||
|
||||
d = delay_calc_srcn_prera(block, src->def->instr, consumer, srcn);
|
||||
delay = MAX2(delay, d);
|
||||
}
|
||||
} else {
|
||||
delay = ir3_delayslots(assigner, consumer, srcn, false);
|
||||
delay -= distance(block, assigner, delay);
|
||||
}
|
||||
d = delay_calc_srcn_prera(block, src->def->instr, consumer, srcn);
|
||||
delay = MAX2(delay, d);
|
||||
}
|
||||
} else {
|
||||
delay = ir3_delayslots(assigner, consumer, srcn, false);
|
||||
delay -= distance(block, assigner, delay);
|
||||
}
|
||||
|
||||
return delay;
|
||||
return delay;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -176,19 +172,19 @@ delay_calc_srcn_prera(struct ir3_block *block,
|
|||
unsigned
|
||||
ir3_delay_calc_prera(struct ir3_block *block, struct ir3_instruction *instr)
|
||||
{
|
||||
unsigned delay = 0;
|
||||
unsigned delay = 0;
|
||||
|
||||
foreach_src_n (src, i, instr) {
|
||||
unsigned d = 0;
|
||||
foreach_src_n (src, i, instr) {
|
||||
unsigned d = 0;
|
||||
|
||||
if (src->def && src->def->instr->block == block) {
|
||||
d = delay_calc_srcn_prera(block, src->def->instr, instr, i);
|
||||
}
|
||||
if (src->def && src->def->instr->block == block) {
|
||||
d = delay_calc_srcn_prera(block, src->def->instr, instr, i);
|
||||
}
|
||||
|
||||
delay = MAX2(delay, d);
|
||||
}
|
||||
delay = MAX2(delay, d);
|
||||
}
|
||||
|
||||
return delay;
|
||||
return delay;
|
||||
}
|
||||
|
||||
/* Post-RA, we don't have arrays any more, so we have to be a bit careful here
|
||||
|
@ -198,185 +194,186 @@ ir3_delay_calc_prera(struct ir3_block *block, struct ir3_instruction *instr)
|
|||
static unsigned
|
||||
post_ra_reg_elems(struct ir3_register *reg)
|
||||
{
|
||||
if (reg->flags & IR3_REG_RELATIV)
|
||||
return reg->size;
|
||||
return reg_elems(reg);
|
||||
if (reg->flags & IR3_REG_RELATIV)
|
||||
return reg->size;
|
||||
return reg_elems(reg);
|
||||
}
|
||||
|
||||
static unsigned
|
||||
post_ra_reg_num(struct ir3_register *reg)
|
||||
{
|
||||
if (reg->flags & IR3_REG_RELATIV)
|
||||
return reg->array.base;
|
||||
return reg->num;
|
||||
if (reg->flags & IR3_REG_RELATIV)
|
||||
return reg->array.base;
|
||||
return reg->num;
|
||||
}
|
||||
|
||||
static unsigned
|
||||
delay_calc_srcn_postra(struct ir3_instruction *assigner, struct ir3_instruction *consumer,
|
||||
unsigned assigner_n, unsigned consumer_n, bool soft, bool mergedregs)
|
||||
delay_calc_srcn_postra(struct ir3_instruction *assigner,
|
||||
struct ir3_instruction *consumer, unsigned assigner_n,
|
||||
unsigned consumer_n, bool soft, bool mergedregs)
|
||||
{
|
||||
struct ir3_register *src = consumer->srcs[consumer_n];
|
||||
struct ir3_register *dst = assigner->dsts[assigner_n];
|
||||
bool mismatched_half =
|
||||
(src->flags & IR3_REG_HALF) != (dst->flags & IR3_REG_HALF);
|
||||
struct ir3_register *src = consumer->srcs[consumer_n];
|
||||
struct ir3_register *dst = assigner->dsts[assigner_n];
|
||||
bool mismatched_half =
|
||||
(src->flags & IR3_REG_HALF) != (dst->flags & IR3_REG_HALF);
|
||||
|
||||
/* In the mergedregs case or when the register is a special register,
|
||||
* half-registers do not alias with full registers.
|
||||
*/
|
||||
if ((!mergedregs || is_reg_special(src) || is_reg_special(dst)) &&
|
||||
mismatched_half)
|
||||
return 0;
|
||||
/* In the mergedregs case or when the register is a special register,
|
||||
* half-registers do not alias with full registers.
|
||||
*/
|
||||
if ((!mergedregs || is_reg_special(src) || is_reg_special(dst)) &&
|
||||
mismatched_half)
|
||||
return 0;
|
||||
|
||||
unsigned src_start = post_ra_reg_num(src) * reg_elem_size(src);
|
||||
unsigned src_end = src_start + post_ra_reg_elems(src) * reg_elem_size(src);
|
||||
unsigned dst_start = post_ra_reg_num(dst) * reg_elem_size(dst);
|
||||
unsigned dst_end = dst_start + post_ra_reg_elems(dst) * reg_elem_size(dst);
|
||||
unsigned src_start = post_ra_reg_num(src) * reg_elem_size(src);
|
||||
unsigned src_end = src_start + post_ra_reg_elems(src) * reg_elem_size(src);
|
||||
unsigned dst_start = post_ra_reg_num(dst) * reg_elem_size(dst);
|
||||
unsigned dst_end = dst_start + post_ra_reg_elems(dst) * reg_elem_size(dst);
|
||||
|
||||
if (dst_start >= src_end || src_start >= dst_end)
|
||||
return 0;
|
||||
if (dst_start >= src_end || src_start >= dst_end)
|
||||
return 0;
|
||||
|
||||
unsigned delay = ir3_delayslots(assigner, consumer, consumer_n, soft);
|
||||
unsigned delay = ir3_delayslots(assigner, consumer, consumer_n, soft);
|
||||
|
||||
if (assigner->repeat == 0 && consumer->repeat == 0)
|
||||
return delay;
|
||||
if (assigner->repeat == 0 && consumer->repeat == 0)
|
||||
return delay;
|
||||
|
||||
/* If either side is a relative access, we can't really apply most of the
|
||||
* reasoning below because we don't know which component aliases which.
|
||||
* Just bail in this case.
|
||||
*/
|
||||
if ((src->flags & IR3_REG_RELATIV) || (dst->flags & IR3_REG_RELATIV))
|
||||
return delay;
|
||||
/* If either side is a relative access, we can't really apply most of the
|
||||
* reasoning below because we don't know which component aliases which.
|
||||
* Just bail in this case.
|
||||
*/
|
||||
if ((src->flags & IR3_REG_RELATIV) || (dst->flags & IR3_REG_RELATIV))
|
||||
return delay;
|
||||
|
||||
/* MOVMSK seems to require that all users wait until the entire
|
||||
* instruction is finished, so just bail here.
|
||||
*/
|
||||
if (assigner->opc == OPC_MOVMSK)
|
||||
return delay;
|
||||
/* MOVMSK seems to require that all users wait until the entire
|
||||
* instruction is finished, so just bail here.
|
||||
*/
|
||||
if (assigner->opc == OPC_MOVMSK)
|
||||
return delay;
|
||||
|
||||
/* TODO: Handle the combination of (rpt) and different component sizes
|
||||
* better like below. This complicates things significantly because the
|
||||
* components don't line up.
|
||||
*/
|
||||
if (mismatched_half)
|
||||
return delay;
|
||||
/* TODO: Handle the combination of (rpt) and different component sizes
|
||||
* better like below. This complicates things significantly because the
|
||||
* components don't line up.
|
||||
*/
|
||||
if (mismatched_half)
|
||||
return delay;
|
||||
|
||||
/* If an instruction has a (rpt), then it acts as a sequence of
|
||||
* instructions, reading its non-(r) sources at each cycle. First, get the
|
||||
* register num for the first instruction where they interfere:
|
||||
*/
|
||||
/* If an instruction has a (rpt), then it acts as a sequence of
|
||||
* instructions, reading its non-(r) sources at each cycle. First, get the
|
||||
* register num for the first instruction where they interfere:
|
||||
*/
|
||||
|
||||
unsigned first_num = MAX2(src_start, dst_start) / reg_elem_size(dst);
|
||||
unsigned first_num = MAX2(src_start, dst_start) / reg_elem_size(dst);
|
||||
|
||||
/* Now, for that first conflicting half/full register, figure out the
|
||||
* sub-instruction within assigner/consumer it corresponds to. For (r)
|
||||
* sources, this should already return the correct answer of 0. However we
|
||||
* have to special-case the multi-mov instructions, where the
|
||||
* sub-instructions sometimes come from the src/dst indices instead.
|
||||
*/
|
||||
unsigned first_src_instr;
|
||||
if (consumer->opc == OPC_SWZ || consumer->opc == OPC_GAT)
|
||||
first_src_instr = consumer_n;
|
||||
else
|
||||
first_src_instr = first_num - src->num;
|
||||
/* Now, for that first conflicting half/full register, figure out the
|
||||
* sub-instruction within assigner/consumer it corresponds to. For (r)
|
||||
* sources, this should already return the correct answer of 0. However we
|
||||
* have to special-case the multi-mov instructions, where the
|
||||
* sub-instructions sometimes come from the src/dst indices instead.
|
||||
*/
|
||||
unsigned first_src_instr;
|
||||
if (consumer->opc == OPC_SWZ || consumer->opc == OPC_GAT)
|
||||
first_src_instr = consumer_n;
|
||||
else
|
||||
first_src_instr = first_num - src->num;
|
||||
|
||||
unsigned first_dst_instr;
|
||||
if (assigner->opc == OPC_SWZ || assigner->opc == OPC_SCT)
|
||||
first_dst_instr = assigner_n;
|
||||
else
|
||||
first_dst_instr = first_num - dst->num;
|
||||
unsigned first_dst_instr;
|
||||
if (assigner->opc == OPC_SWZ || assigner->opc == OPC_SCT)
|
||||
first_dst_instr = assigner_n;
|
||||
else
|
||||
first_dst_instr = first_num - dst->num;
|
||||
|
||||
/* The delay we return is relative to the *end* of assigner and the
|
||||
* *beginning* of consumer, because it's the number of nops (or other
|
||||
* things) needed between them. Any instructions after first_dst_instr
|
||||
* subtract from the delay, and so do any instructions before
|
||||
* first_src_instr. Calculate an offset to subtract from the non-rpt-aware
|
||||
* delay to account for that.
|
||||
*
|
||||
* Now, a priori, we need to go through this process for every
|
||||
* conflicting regnum and take the minimum of the offsets to make sure
|
||||
* that the appropriate number of nop's is inserted for every conflicting
|
||||
* pair of sub-instructions. However, as we go to the next conflicting
|
||||
* regnum (if any), the number of instructions after first_dst_instr
|
||||
* decreases by 1 and the number of source instructions before
|
||||
* first_src_instr correspondingly increases by 1, so the offset stays the
|
||||
* same for all conflicting registers.
|
||||
*/
|
||||
unsigned offset = first_src_instr + (assigner->repeat - first_dst_instr);
|
||||
return offset > delay ? 0 : delay - offset;
|
||||
/* The delay we return is relative to the *end* of assigner and the
|
||||
* *beginning* of consumer, because it's the number of nops (or other
|
||||
* things) needed between them. Any instructions after first_dst_instr
|
||||
* subtract from the delay, and so do any instructions before
|
||||
* first_src_instr. Calculate an offset to subtract from the non-rpt-aware
|
||||
* delay to account for that.
|
||||
*
|
||||
* Now, a priori, we need to go through this process for every
|
||||
* conflicting regnum and take the minimum of the offsets to make sure
|
||||
* that the appropriate number of nop's is inserted for every conflicting
|
||||
* pair of sub-instructions. However, as we go to the next conflicting
|
||||
* regnum (if any), the number of instructions after first_dst_instr
|
||||
* decreases by 1 and the number of source instructions before
|
||||
* first_src_instr correspondingly increases by 1, so the offset stays the
|
||||
* same for all conflicting registers.
|
||||
*/
|
||||
unsigned offset = first_src_instr + (assigner->repeat - first_dst_instr);
|
||||
return offset > delay ? 0 : delay - offset;
|
||||
}
|
||||
|
||||
static unsigned
|
||||
delay_calc_postra(struct ir3_block *block,
|
||||
struct ir3_instruction *start,
|
||||
struct ir3_instruction *consumer,
|
||||
unsigned distance, bool soft, bool pred, bool mergedregs)
|
||||
delay_calc_postra(struct ir3_block *block, struct ir3_instruction *start,
|
||||
struct ir3_instruction *consumer, unsigned distance,
|
||||
bool soft, bool pred, bool mergedregs)
|
||||
{
|
||||
unsigned delay = 0;
|
||||
/* Search backwards starting at the instruction before start, unless it's
|
||||
* NULL then search backwards from the block end.
|
||||
*/
|
||||
struct list_head *start_list = start ? start->node.prev : block->instr_list.prev;
|
||||
list_for_each_entry_from_rev(struct ir3_instruction, assigner, start_list, &block->instr_list, node) {
|
||||
if (count_instruction(assigner))
|
||||
distance += assigner->nop;
|
||||
unsigned delay = 0;
|
||||
/* Search backwards starting at the instruction before start, unless it's
|
||||
* NULL then search backwards from the block end.
|
||||
*/
|
||||
struct list_head *start_list =
|
||||
start ? start->node.prev : block->instr_list.prev;
|
||||
list_for_each_entry_from_rev (struct ir3_instruction, assigner, start_list,
|
||||
&block->instr_list, node) {
|
||||
if (count_instruction(assigner))
|
||||
distance += assigner->nop;
|
||||
|
||||
if (distance + delay >= (soft ? SOFT_SS_NOPS : MAX_NOPS))
|
||||
return delay;
|
||||
if (distance + delay >= (soft ? SOFT_SS_NOPS : MAX_NOPS))
|
||||
return delay;
|
||||
|
||||
if (is_meta(assigner))
|
||||
continue;
|
||||
if (is_meta(assigner))
|
||||
continue;
|
||||
|
||||
unsigned new_delay = 0;
|
||||
unsigned new_delay = 0;
|
||||
|
||||
foreach_dst_n (dst, dst_n, assigner) {
|
||||
if (dst->wrmask == 0)
|
||||
continue;
|
||||
foreach_src_n (src, src_n, consumer) {
|
||||
if (src->flags & (IR3_REG_IMMED | IR3_REG_CONST))
|
||||
continue;
|
||||
foreach_dst_n (dst, dst_n, assigner) {
|
||||
if (dst->wrmask == 0)
|
||||
continue;
|
||||
foreach_src_n (src, src_n, consumer) {
|
||||
if (src->flags & (IR3_REG_IMMED | IR3_REG_CONST))
|
||||
continue;
|
||||
|
||||
unsigned src_delay =
|
||||
delay_calc_srcn_postra(assigner, consumer, dst_n,
|
||||
src_n, soft, mergedregs);
|
||||
new_delay = MAX2(new_delay, src_delay);
|
||||
}
|
||||
}
|
||||
unsigned src_delay = delay_calc_srcn_postra(
|
||||
assigner, consumer, dst_n, src_n, soft, mergedregs);
|
||||
new_delay = MAX2(new_delay, src_delay);
|
||||
}
|
||||
}
|
||||
|
||||
new_delay = new_delay > distance ? new_delay - distance : 0;
|
||||
delay = MAX2(delay, new_delay);
|
||||
new_delay = new_delay > distance ? new_delay - distance : 0;
|
||||
delay = MAX2(delay, new_delay);
|
||||
|
||||
if (count_instruction(assigner))
|
||||
distance += 1 + assigner->repeat;
|
||||
}
|
||||
if (count_instruction(assigner))
|
||||
distance += 1 + assigner->repeat;
|
||||
}
|
||||
|
||||
/* Note: this allows recursion into "block" if it has already been
|
||||
* visited, but *not* recursion into its predecessors. We may have to
|
||||
* visit the original block twice, for the loop case where we have to
|
||||
* consider definititons in an earlier iterations of the same loop:
|
||||
*
|
||||
* while (...) {
|
||||
* mov.u32u32 ..., r0.x
|
||||
* ...
|
||||
* mov.u32u32 r0.x, ...
|
||||
* }
|
||||
*
|
||||
* However any other recursion would be unnecessary.
|
||||
*/
|
||||
/* Note: this allows recursion into "block" if it has already been
|
||||
* visited, but *not* recursion into its predecessors. We may have to
|
||||
* visit the original block twice, for the loop case where we have to
|
||||
* consider definititons in an earlier iterations of the same loop:
|
||||
*
|
||||
* while (...) {
|
||||
* mov.u32u32 ..., r0.x
|
||||
* ...
|
||||
* mov.u32u32 r0.x, ...
|
||||
* }
|
||||
*
|
||||
* However any other recursion would be unnecessary.
|
||||
*/
|
||||
|
||||
if (pred && block->data != block) {
|
||||
block->data = block;
|
||||
if (pred && block->data != block) {
|
||||
block->data = block;
|
||||
|
||||
for (unsigned i = 0; i < block->predecessors_count; i++) {
|
||||
struct ir3_block *pred = block->predecessors[i];
|
||||
unsigned pred_delay =
|
||||
delay_calc_postra(pred, NULL, consumer, distance, soft, pred, mergedregs);
|
||||
delay = MAX2(delay, pred_delay);
|
||||
}
|
||||
for (unsigned i = 0; i < block->predecessors_count; i++) {
|
||||
struct ir3_block *pred = block->predecessors[i];
|
||||
unsigned pred_delay = delay_calc_postra(pred, NULL, consumer, distance,
|
||||
soft, pred, mergedregs);
|
||||
delay = MAX2(delay, pred_delay);
|
||||
}
|
||||
|
||||
block->data = NULL;
|
||||
}
|
||||
block->data = NULL;
|
||||
}
|
||||
|
||||
return delay;
|
||||
return delay;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -392,9 +389,9 @@ delay_calc_postra(struct ir3_block *block,
|
|||
*/
|
||||
unsigned
ir3_delay_calc_postra(struct ir3_block *block, struct ir3_instruction *instr,
                      bool soft, bool mergedregs)
{
   /* Scan from the end of the block, starting at distance 0, without
    * following predecessor blocks.
    */
   const bool follow_preds = false;
   const unsigned initial_distance = 0;

   return delay_calc_postra(block, NULL, instr, initial_distance, soft,
                            follow_preds, mergedregs);
}
|
||||
|
||||
/**
|
||||
|
@ -403,9 +400,9 @@ ir3_delay_calc_postra(struct ir3_block *block, struct ir3_instruction *instr,
|
|||
*/
|
||||
unsigned
ir3_delay_calc_exact(struct ir3_block *block, struct ir3_instruction *instr,
                     bool mergedregs)
{
   /* Hard (non-soft) delays, scanning from the end of the block at
    * distance 0 and continuing into predecessor blocks.
    */
   const bool soft = false;
   const bool follow_preds = true;
   const unsigned initial_distance = 0;

   return delay_calc_postra(block, NULL, instr, initial_distance, soft,
                            follow_preds, mergedregs);
}
|
||||
|
||||
/**
|
||||
|
@ -419,12 +416,11 @@ ir3_delay_calc_exact(struct ir3_block *block, struct ir3_instruction *instr,
|
|||
void
|
||||
ir3_remove_nops(struct ir3 *ir)
|
||||
{
|
||||
foreach_block (block, &ir->block_list) {
|
||||
foreach_instr_safe (instr, &block->instr_list) {
|
||||
if (instr->opc == OPC_NOP) {
|
||||
list_del(&instr->node);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
foreach_block (block, &ir->block_list) {
|
||||
foreach_instr_safe (instr, &block->instr_list) {
|
||||
if (instr->opc == OPC_NOP) {
|
||||
list_del(&instr->node);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -48,183 +48,185 @@
|
|||
void
|
||||
ir3_disk_cache_init(struct ir3_compiler *compiler)
|
||||
{
|
||||
if (ir3_shader_debug & IR3_DBG_NOCACHE)
|
||||
return;
|
||||
if (ir3_shader_debug & IR3_DBG_NOCACHE)
|
||||
return;
|
||||
|
||||
/* array length = print length + nul char + 1 extra to verify it's unused */
|
||||
char renderer[7];
|
||||
ASSERTED int len =
|
||||
snprintf(renderer, sizeof(renderer), "FD%03d", compiler->gpu_id);
|
||||
assert(len == sizeof(renderer) - 2);
|
||||
/* array length = print length + nul char + 1 extra to verify it's unused */
|
||||
char renderer[7];
|
||||
ASSERTED int len =
|
||||
snprintf(renderer, sizeof(renderer), "FD%03d", compiler->gpu_id);
|
||||
assert(len == sizeof(renderer) - 2);
|
||||
|
||||
const struct build_id_note *note =
|
||||
build_id_find_nhdr_for_addr(ir3_disk_cache_init);
|
||||
assert(note && build_id_length(note) == 20); /* sha1 */
|
||||
const struct build_id_note *note =
|
||||
build_id_find_nhdr_for_addr(ir3_disk_cache_init);
|
||||
assert(note && build_id_length(note) == 20); /* sha1 */
|
||||
|
||||
const uint8_t *id_sha1 = build_id_data(note);
|
||||
assert(id_sha1);
|
||||
const uint8_t *id_sha1 = build_id_data(note);
|
||||
assert(id_sha1);
|
||||
|
||||
char timestamp[41];
|
||||
_mesa_sha1_format(timestamp, id_sha1);
|
||||
char timestamp[41];
|
||||
_mesa_sha1_format(timestamp, id_sha1);
|
||||
|
||||
uint64_t driver_flags = ir3_shader_debug;
|
||||
if (compiler->robust_ubo_access)
|
||||
driver_flags |= IR3_DBG_ROBUST_UBO_ACCESS;
|
||||
compiler->disk_cache = disk_cache_create(renderer, timestamp, driver_flags);
|
||||
uint64_t driver_flags = ir3_shader_debug;
|
||||
if (compiler->robust_ubo_access)
|
||||
driver_flags |= IR3_DBG_ROBUST_UBO_ACCESS;
|
||||
compiler->disk_cache = disk_cache_create(renderer, timestamp, driver_flags);
|
||||
}
|
||||
|
||||
void
|
||||
ir3_disk_cache_init_shader_key(struct ir3_compiler *compiler,
|
||||
struct ir3_shader *shader)
|
||||
struct ir3_shader *shader)
|
||||
{
|
||||
if (!compiler->disk_cache)
|
||||
return;
|
||||
if (!compiler->disk_cache)
|
||||
return;
|
||||
|
||||
struct mesa_sha1 ctx;
|
||||
struct mesa_sha1 ctx;
|
||||
|
||||
_mesa_sha1_init(&ctx);
|
||||
_mesa_sha1_init(&ctx);
|
||||
|
||||
/* Serialize the NIR to a binary blob that we can hash for the disk
|
||||
* cache. Drop unnecessary information (like variable names)
|
||||
* so the serialized NIR is smaller, and also to let us detect more
|
||||
* isomorphic shaders when hashing, increasing cache hits.
|
||||
*/
|
||||
struct blob blob;
|
||||
blob_init(&blob);
|
||||
nir_serialize(&blob, shader->nir, true);
|
||||
_mesa_sha1_update(&ctx, blob.data, blob.size);
|
||||
blob_finish(&blob);
|
||||
/* Serialize the NIR to a binary blob that we can hash for the disk
|
||||
* cache. Drop unnecessary information (like variable names)
|
||||
* so the serialized NIR is smaller, and also to let us detect more
|
||||
* isomorphic shaders when hashing, increasing cache hits.
|
||||
*/
|
||||
struct blob blob;
|
||||
blob_init(&blob);
|
||||
nir_serialize(&blob, shader->nir, true);
|
||||
_mesa_sha1_update(&ctx, blob.data, blob.size);
|
||||
blob_finish(&blob);
|
||||
|
||||
/* Note that on some gens stream-out is lowered in ir3 to stg. For later
|
||||
* gens we maybe don't need to include stream-out in the cache key.
|
||||
*/
|
||||
_mesa_sha1_update(&ctx, &shader->stream_output, sizeof(shader->stream_output));
|
||||
/* Note that on some gens stream-out is lowered in ir3 to stg. For later
|
||||
* gens we maybe don't need to include stream-out in the cache key.
|
||||
*/
|
||||
_mesa_sha1_update(&ctx, &shader->stream_output,
|
||||
sizeof(shader->stream_output));
|
||||
|
||||
_mesa_sha1_final(&ctx, shader->cache_key);
|
||||
_mesa_sha1_final(&ctx, shader->cache_key);
|
||||
}
|
||||
|
||||
static void
|
||||
compute_variant_key(struct ir3_compiler *compiler,
|
||||
struct ir3_shader_variant *v, cache_key cache_key)
|
||||
compute_variant_key(struct ir3_compiler *compiler, struct ir3_shader_variant *v,
|
||||
cache_key cache_key)
|
||||
{
|
||||
struct blob blob;
|
||||
blob_init(&blob);
|
||||
struct blob blob;
|
||||
blob_init(&blob);
|
||||
|
||||
blob_write_bytes(&blob, &v->shader->cache_key, sizeof(v->shader->cache_key));
|
||||
blob_write_bytes(&blob, &v->key, sizeof(v->key));
|
||||
blob_write_uint8(&blob, v->binning_pass);
|
||||
blob_write_bytes(&blob, &v->shader->cache_key, sizeof(v->shader->cache_key));
|
||||
blob_write_bytes(&blob, &v->key, sizeof(v->key));
|
||||
blob_write_uint8(&blob, v->binning_pass);
|
||||
|
||||
disk_cache_compute_key(compiler->disk_cache, blob.data, blob.size, cache_key);
|
||||
disk_cache_compute_key(compiler->disk_cache, blob.data, blob.size,
|
||||
cache_key);
|
||||
|
||||
blob_finish(&blob);
|
||||
blob_finish(&blob);
|
||||
}
|
||||
|
||||
static void
|
||||
retrieve_variant(struct blob_reader *blob, struct ir3_shader_variant *v)
|
||||
{
|
||||
blob_copy_bytes(blob, VARIANT_CACHE_PTR(v), VARIANT_CACHE_SIZE);
|
||||
blob_copy_bytes(blob, VARIANT_CACHE_PTR(v), VARIANT_CACHE_SIZE);
|
||||
|
||||
/*
|
||||
* pointers need special handling:
|
||||
*/
|
||||
/*
|
||||
* pointers need special handling:
|
||||
*/
|
||||
|
||||
v->bin = rzalloc_size(v, v->info.size);
|
||||
blob_copy_bytes(blob, v->bin, v->info.size);
|
||||
v->bin = rzalloc_size(v, v->info.size);
|
||||
blob_copy_bytes(blob, v->bin, v->info.size);
|
||||
|
||||
if (!v->binning_pass) {
|
||||
blob_copy_bytes(blob, v->const_state, sizeof(*v->const_state));
|
||||
unsigned immeds_sz = v->const_state->immediates_size *
|
||||
sizeof(v->const_state->immediates[0]);
|
||||
v->const_state->immediates = ralloc_size(v->const_state, immeds_sz);
|
||||
blob_copy_bytes(blob, v->const_state->immediates, immeds_sz);
|
||||
}
|
||||
if (!v->binning_pass) {
|
||||
blob_copy_bytes(blob, v->const_state, sizeof(*v->const_state));
|
||||
unsigned immeds_sz = v->const_state->immediates_size *
|
||||
sizeof(v->const_state->immediates[0]);
|
||||
v->const_state->immediates = ralloc_size(v->const_state, immeds_sz);
|
||||
blob_copy_bytes(blob, v->const_state->immediates, immeds_sz);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
store_variant(struct blob *blob, struct ir3_shader_variant *v)
|
||||
{
|
||||
blob_write_bytes(blob, VARIANT_CACHE_PTR(v), VARIANT_CACHE_SIZE);
|
||||
blob_write_bytes(blob, VARIANT_CACHE_PTR(v), VARIANT_CACHE_SIZE);
|
||||
|
||||
/*
|
||||
* pointers need special handling:
|
||||
*/
|
||||
/*
|
||||
* pointers need special handling:
|
||||
*/
|
||||
|
||||
blob_write_bytes(blob, v->bin, v->info.size);
|
||||
blob_write_bytes(blob, v->bin, v->info.size);
|
||||
|
||||
/* No saving constant_data, it's already baked into bin at this point. */
|
||||
/* No saving constant_data, it's already baked into bin at this point. */
|
||||
|
||||
if (!v->binning_pass) {
|
||||
blob_write_bytes(blob, v->const_state, sizeof(*v->const_state));
|
||||
unsigned immeds_sz = v->const_state->immediates_size *
|
||||
sizeof(v->const_state->immediates[0]);
|
||||
blob_write_bytes(blob, v->const_state->immediates, immeds_sz);
|
||||
}
|
||||
if (!v->binning_pass) {
|
||||
blob_write_bytes(blob, v->const_state, sizeof(*v->const_state));
|
||||
unsigned immeds_sz = v->const_state->immediates_size *
|
||||
sizeof(v->const_state->immediates[0]);
|
||||
blob_write_bytes(blob, v->const_state->immediates, immeds_sz);
|
||||
}
|
||||
}
|
||||
|
||||
bool
|
||||
ir3_disk_cache_retrieve(struct ir3_compiler *compiler,
|
||||
struct ir3_shader_variant *v)
|
||||
struct ir3_shader_variant *v)
|
||||
{
|
||||
if (!compiler->disk_cache)
|
||||
return false;
|
||||
if (!compiler->disk_cache)
|
||||
return false;
|
||||
|
||||
cache_key cache_key;
|
||||
cache_key cache_key;
|
||||
|
||||
compute_variant_key(compiler, v, cache_key);
|
||||
compute_variant_key(compiler, v, cache_key);
|
||||
|
||||
if (debug) {
|
||||
char sha1[41];
|
||||
_mesa_sha1_format(sha1, cache_key);
|
||||
fprintf(stderr, "[mesa disk cache] retrieving variant %s: ", sha1);
|
||||
}
|
||||
if (debug) {
|
||||
char sha1[41];
|
||||
_mesa_sha1_format(sha1, cache_key);
|
||||
fprintf(stderr, "[mesa disk cache] retrieving variant %s: ", sha1);
|
||||
}
|
||||
|
||||
size_t size;
|
||||
void *buffer = disk_cache_get(compiler->disk_cache, cache_key, &size);
|
||||
size_t size;
|
||||
void *buffer = disk_cache_get(compiler->disk_cache, cache_key, &size);
|
||||
|
||||
if (debug)
|
||||
fprintf(stderr, "%s\n", buffer ? "found" : "missing");
|
||||
if (debug)
|
||||
fprintf(stderr, "%s\n", buffer ? "found" : "missing");
|
||||
|
||||
if (!buffer)
|
||||
return false;
|
||||
if (!buffer)
|
||||
return false;
|
||||
|
||||
struct blob_reader blob;
|
||||
blob_reader_init(&blob, buffer, size);
|
||||
struct blob_reader blob;
|
||||
blob_reader_init(&blob, buffer, size);
|
||||
|
||||
retrieve_variant(&blob, v);
|
||||
retrieve_variant(&blob, v);
|
||||
|
||||
if (v->binning)
|
||||
retrieve_variant(&blob, v->binning);
|
||||
if (v->binning)
|
||||
retrieve_variant(&blob, v->binning);
|
||||
|
||||
free(buffer);
|
||||
free(buffer);
|
||||
|
||||
return true;
|
||||
return true;
|
||||
}
|
||||
|
||||
void
|
||||
ir3_disk_cache_store(struct ir3_compiler *compiler,
|
||||
struct ir3_shader_variant *v)
|
||||
struct ir3_shader_variant *v)
|
||||
{
|
||||
if (!compiler->disk_cache)
|
||||
return;
|
||||
if (!compiler->disk_cache)
|
||||
return;
|
||||
|
||||
cache_key cache_key;
|
||||
cache_key cache_key;
|
||||
|
||||
compute_variant_key(compiler, v, cache_key);
|
||||
compute_variant_key(compiler, v, cache_key);
|
||||
|
||||
if (debug) {
|
||||
char sha1[41];
|
||||
_mesa_sha1_format(sha1, cache_key);
|
||||
fprintf(stderr, "[mesa disk cache] storing variant %s\n", sha1);
|
||||
}
|
||||
if (debug) {
|
||||
char sha1[41];
|
||||
_mesa_sha1_format(sha1, cache_key);
|
||||
fprintf(stderr, "[mesa disk cache] storing variant %s\n", sha1);
|
||||
}
|
||||
|
||||
struct blob blob;
|
||||
blob_init(&blob);
|
||||
struct blob blob;
|
||||
blob_init(&blob);
|
||||
|
||||
store_variant(&blob, v);
|
||||
store_variant(&blob, v);
|
||||
|
||||
if (v->binning)
|
||||
store_variant(&blob, v->binning);
|
||||
if (v->binning)
|
||||
store_variant(&blob, v->binning);
|
||||
|
||||
disk_cache_put(compiler->disk_cache, cache_key, blob.data, blob.size, NULL);
|
||||
blob_finish(&blob);
|
||||
disk_cache_put(compiler->disk_cache, cache_key, blob.data, blob.size, NULL);
|
||||
blob_finish(&blob);
|
||||
}
|
||||
|
|
|
@ -35,92 +35,91 @@
|
|||
static struct ir3_block *
|
||||
intersect(struct ir3_block *b1, struct ir3_block *b2)
|
||||
{
|
||||
while (b1 != b2) {
|
||||
/*
|
||||
* Note, the comparisons here are the opposite of what the paper says
|
||||
* because we index blocks from beginning -> end (i.e. reverse
|
||||
* post-order) instead of post-order like they assume.
|
||||
*/
|
||||
while (b1->index > b2->index)
|
||||
b1 = b1->imm_dom;
|
||||
while (b2->index > b1->index)
|
||||
b2 = b2->imm_dom;
|
||||
}
|
||||
while (b1 != b2) {
|
||||
/*
|
||||
* Note, the comparisons here are the opposite of what the paper says
|
||||
* because we index blocks from beginning -> end (i.e. reverse
|
||||
* post-order) instead of post-order like they assume.
|
||||
*/
|
||||
while (b1->index > b2->index)
|
||||
b1 = b1->imm_dom;
|
||||
while (b2->index > b1->index)
|
||||
b2 = b2->imm_dom;
|
||||
}
|
||||
|
||||
return b1;
|
||||
return b1;
|
||||
}
|
||||
|
||||
|
||||
static bool
|
||||
calc_dominance(struct ir3_block *block)
|
||||
{
|
||||
struct ir3_block *new_idom = NULL;
|
||||
for (unsigned i = 0; i < block->predecessors_count; i++) {
|
||||
struct ir3_block *pred = block->predecessors[i];
|
||||
struct ir3_block *new_idom = NULL;
|
||||
for (unsigned i = 0; i < block->predecessors_count; i++) {
|
||||
struct ir3_block *pred = block->predecessors[i];
|
||||
|
||||
if (pred->imm_dom) {
|
||||
if (new_idom)
|
||||
new_idom = intersect(pred, new_idom);
|
||||
else
|
||||
new_idom = pred;
|
||||
}
|
||||
}
|
||||
if (pred->imm_dom) {
|
||||
if (new_idom)
|
||||
new_idom = intersect(pred, new_idom);
|
||||
else
|
||||
new_idom = pred;
|
||||
}
|
||||
}
|
||||
|
||||
if (block->imm_dom != new_idom) {
|
||||
block->imm_dom = new_idom;
|
||||
return true;
|
||||
}
|
||||
if (block->imm_dom != new_idom) {
|
||||
block->imm_dom = new_idom;
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
return false;
|
||||
}
|
||||
|
||||
static unsigned
|
||||
calc_dfs_indices(struct ir3_block *block, unsigned index)
|
||||
{
|
||||
block->dom_pre_index = index++;
|
||||
for (unsigned i = 0; i < block->dom_children_count; i++)
|
||||
index = calc_dfs_indices(block->dom_children[i], index);
|
||||
block->dom_post_index = index++;
|
||||
return index;
|
||||
block->dom_pre_index = index++;
|
||||
for (unsigned i = 0; i < block->dom_children_count; i++)
|
||||
index = calc_dfs_indices(block->dom_children[i], index);
|
||||
block->dom_post_index = index++;
|
||||
return index;
|
||||
}
|
||||
|
||||
void
|
||||
ir3_calc_dominance(struct ir3 *ir)
|
||||
{
|
||||
unsigned i = 0;
|
||||
foreach_block (block, &ir->block_list) {
|
||||
block->index = i++;
|
||||
if (block == ir3_start_block(ir))
|
||||
block->imm_dom = block;
|
||||
else
|
||||
block->imm_dom = NULL;
|
||||
block->dom_children = NULL;
|
||||
block->dom_children_count = block->dom_children_sz = 0;
|
||||
}
|
||||
unsigned i = 0;
|
||||
foreach_block (block, &ir->block_list) {
|
||||
block->index = i++;
|
||||
if (block == ir3_start_block(ir))
|
||||
block->imm_dom = block;
|
||||
else
|
||||
block->imm_dom = NULL;
|
||||
block->dom_children = NULL;
|
||||
block->dom_children_count = block->dom_children_sz = 0;
|
||||
}
|
||||
|
||||
bool progress = true;
|
||||
while (progress) {
|
||||
progress = false;
|
||||
foreach_block (block, &ir->block_list) {
|
||||
if (block != ir3_start_block(ir))
|
||||
progress |= calc_dominance(block);
|
||||
}
|
||||
}
|
||||
bool progress = true;
|
||||
while (progress) {
|
||||
progress = false;
|
||||
foreach_block (block, &ir->block_list) {
|
||||
if (block != ir3_start_block(ir))
|
||||
progress |= calc_dominance(block);
|
||||
}
|
||||
}
|
||||
|
||||
ir3_start_block(ir)->imm_dom = NULL;
|
||||
ir3_start_block(ir)->imm_dom = NULL;
|
||||
|
||||
foreach_block (block, &ir->block_list) {
|
||||
if (block->imm_dom)
|
||||
array_insert(block->imm_dom, block->imm_dom->dom_children, block);
|
||||
}
|
||||
foreach_block (block, &ir->block_list) {
|
||||
if (block->imm_dom)
|
||||
array_insert(block->imm_dom, block->imm_dom->dom_children, block);
|
||||
}
|
||||
|
||||
calc_dfs_indices(ir3_start_block(ir), 0);
|
||||
calc_dfs_indices(ir3_start_block(ir), 0);
|
||||
}
|
||||
|
||||
/* Return true if a dominates b. This includes if a == b. */
|
||||
bool ir3_block_dominates(struct ir3_block *a, struct ir3_block *b)
|
||||
bool
|
||||
ir3_block_dominates(struct ir3_block *a, struct ir3_block *b)
|
||||
{
|
||||
return a->dom_pre_index <= b->dom_pre_index &&
|
||||
a->dom_post_index >= b->dom_post_index;
|
||||
return a->dom_pre_index <= b->dom_pre_index &&
|
||||
a->dom_post_index >= b->dom_post_index;
|
||||
}
|
||||
|
||||
|
|
|
@ -26,7 +26,6 @@
|
|||
|
||||
#include "ir3_image.h"
|
||||
|
||||
|
||||
/*
|
||||
* SSBO/Image to/from IBO/tex hw mapping table:
|
||||
*/
|
||||
|
@ -34,57 +33,57 @@
|
|||
void
|
||||
ir3_ibo_mapping_init(struct ir3_ibo_mapping *mapping, unsigned num_textures)
|
||||
{
|
||||
memset(mapping, IBO_INVALID, sizeof(*mapping));
|
||||
mapping->num_tex = 0;
|
||||
mapping->tex_base = num_textures;
|
||||
memset(mapping, IBO_INVALID, sizeof(*mapping));
|
||||
mapping->num_tex = 0;
|
||||
mapping->tex_base = num_textures;
|
||||
}
|
||||
|
||||
struct ir3_instruction *
|
||||
ir3_ssbo_to_ibo(struct ir3_context *ctx, nir_src src)
|
||||
{
|
||||
if (ir3_bindless_resource(src)) {
|
||||
ctx->so->bindless_ibo = true;
|
||||
return ir3_get_src(ctx, &src)[0];
|
||||
} else {
|
||||
/* can this be non-const buffer_index? how do we handle that? */
|
||||
int ssbo_idx = nir_src_as_uint(src);
|
||||
return create_immed(ctx->block, ssbo_idx);
|
||||
}
|
||||
if (ir3_bindless_resource(src)) {
|
||||
ctx->so->bindless_ibo = true;
|
||||
return ir3_get_src(ctx, &src)[0];
|
||||
} else {
|
||||
/* can this be non-const buffer_index? how do we handle that? */
|
||||
int ssbo_idx = nir_src_as_uint(src);
|
||||
return create_immed(ctx->block, ssbo_idx);
|
||||
}
|
||||
}
|
||||
|
||||
unsigned
|
||||
ir3_ssbo_to_tex(struct ir3_ibo_mapping *mapping, unsigned ssbo)
|
||||
{
|
||||
if (mapping->ssbo_to_tex[ssbo] == IBO_INVALID) {
|
||||
unsigned tex = mapping->num_tex++;
|
||||
mapping->ssbo_to_tex[ssbo] = tex;
|
||||
mapping->tex_to_image[tex] = IBO_SSBO | ssbo;
|
||||
}
|
||||
return mapping->ssbo_to_tex[ssbo] + mapping->tex_base;
|
||||
if (mapping->ssbo_to_tex[ssbo] == IBO_INVALID) {
|
||||
unsigned tex = mapping->num_tex++;
|
||||
mapping->ssbo_to_tex[ssbo] = tex;
|
||||
mapping->tex_to_image[tex] = IBO_SSBO | ssbo;
|
||||
}
|
||||
return mapping->ssbo_to_tex[ssbo] + mapping->tex_base;
|
||||
}
|
||||
|
||||
struct ir3_instruction *
|
||||
ir3_image_to_ibo(struct ir3_context *ctx, nir_src src)
|
||||
{
|
||||
if (ir3_bindless_resource(src)) {
|
||||
ctx->so->bindless_ibo = true;
|
||||
return ir3_get_src(ctx, &src)[0];
|
||||
} else {
|
||||
/* can this be non-const buffer_index? how do we handle that? */
|
||||
int image_idx = nir_src_as_uint(src);
|
||||
return create_immed(ctx->block, ctx->s->info.num_ssbos + image_idx);
|
||||
}
|
||||
if (ir3_bindless_resource(src)) {
|
||||
ctx->so->bindless_ibo = true;
|
||||
return ir3_get_src(ctx, &src)[0];
|
||||
} else {
|
||||
/* can this be non-const buffer_index? how do we handle that? */
|
||||
int image_idx = nir_src_as_uint(src);
|
||||
return create_immed(ctx->block, ctx->s->info.num_ssbos + image_idx);
|
||||
}
|
||||
}
|
||||
|
||||
unsigned
|
||||
ir3_image_to_tex(struct ir3_ibo_mapping *mapping, unsigned image)
|
||||
{
|
||||
if (mapping->image_to_tex[image] == IBO_INVALID) {
|
||||
unsigned tex = mapping->num_tex++;
|
||||
mapping->image_to_tex[image] = tex;
|
||||
mapping->tex_to_image[tex] = image;
|
||||
}
|
||||
return mapping->image_to_tex[image] + mapping->tex_base;
|
||||
if (mapping->image_to_tex[image] == IBO_INVALID) {
|
||||
unsigned tex = mapping->num_tex++;
|
||||
mapping->image_to_tex[image] = tex;
|
||||
mapping->tex_to_image[tex] = image;
|
||||
}
|
||||
return mapping->image_to_tex[image] + mapping->tex_base;
|
||||
}
|
||||
|
||||
/* see tex_info() for equiv logic for texture instructions.. it would be
|
||||
|
@ -93,87 +92,87 @@ ir3_image_to_tex(struct ir3_ibo_mapping *mapping, unsigned image)
|
|||
unsigned
|
||||
ir3_get_image_coords(const nir_intrinsic_instr *instr, unsigned *flagsp)
|
||||
{
|
||||
unsigned coords = nir_image_intrinsic_coord_components(instr);
|
||||
unsigned flags = 0;
|
||||
unsigned coords = nir_image_intrinsic_coord_components(instr);
|
||||
unsigned flags = 0;
|
||||
|
||||
if (coords == 3)
|
||||
flags |= IR3_INSTR_3D;
|
||||
if (coords == 3)
|
||||
flags |= IR3_INSTR_3D;
|
||||
|
||||
if (nir_intrinsic_image_array(instr))
|
||||
flags |= IR3_INSTR_A;
|
||||
if (nir_intrinsic_image_array(instr))
|
||||
flags |= IR3_INSTR_A;
|
||||
|
||||
if (flagsp)
|
||||
*flagsp = flags;
|
||||
if (flagsp)
|
||||
*flagsp = flags;
|
||||
|
||||
return coords;
|
||||
return coords;
|
||||
}
|
||||
|
||||
type_t
|
||||
ir3_get_type_for_image_intrinsic(const nir_intrinsic_instr *instr)
|
||||
{
|
||||
const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
|
||||
int bit_size = info->has_dest ? nir_dest_bit_size(instr->dest) : 32;
|
||||
const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
|
||||
int bit_size = info->has_dest ? nir_dest_bit_size(instr->dest) : 32;
|
||||
|
||||
nir_alu_type type = nir_type_uint;
|
||||
switch (instr->intrinsic) {
|
||||
case nir_intrinsic_image_load:
|
||||
case nir_intrinsic_bindless_image_load:
|
||||
type = nir_alu_type_get_base_type(nir_intrinsic_dest_type(instr));
|
||||
/* SpvOpAtomicLoad doesn't have dest type */
|
||||
if (type == nir_type_invalid)
|
||||
type = nir_type_uint;
|
||||
break;
|
||||
nir_alu_type type = nir_type_uint;
|
||||
switch (instr->intrinsic) {
|
||||
case nir_intrinsic_image_load:
|
||||
case nir_intrinsic_bindless_image_load:
|
||||
type = nir_alu_type_get_base_type(nir_intrinsic_dest_type(instr));
|
||||
/* SpvOpAtomicLoad doesn't have dest type */
|
||||
if (type == nir_type_invalid)
|
||||
type = nir_type_uint;
|
||||
break;
|
||||
|
||||
case nir_intrinsic_image_store:
|
||||
case nir_intrinsic_bindless_image_store:
|
||||
type = nir_alu_type_get_base_type(nir_intrinsic_src_type(instr));
|
||||
/* SpvOpAtomicStore doesn't have src type */
|
||||
if (type == nir_type_invalid)
|
||||
type = nir_type_uint;
|
||||
break;
|
||||
case nir_intrinsic_image_store:
|
||||
case nir_intrinsic_bindless_image_store:
|
||||
type = nir_alu_type_get_base_type(nir_intrinsic_src_type(instr));
|
||||
/* SpvOpAtomicStore doesn't have src type */
|
||||
if (type == nir_type_invalid)
|
||||
type = nir_type_uint;
|
||||
break;
|
||||
|
||||
case nir_intrinsic_image_atomic_add:
|
||||
case nir_intrinsic_bindless_image_atomic_add:
|
||||
case nir_intrinsic_image_atomic_umin:
|
||||
case nir_intrinsic_bindless_image_atomic_umin:
|
||||
case nir_intrinsic_image_atomic_umax:
|
||||
case nir_intrinsic_bindless_image_atomic_umax:
|
||||
case nir_intrinsic_image_atomic_and:
|
||||
case nir_intrinsic_bindless_image_atomic_and:
|
||||
case nir_intrinsic_image_atomic_or:
|
||||
case nir_intrinsic_bindless_image_atomic_or:
|
||||
case nir_intrinsic_image_atomic_xor:
|
||||
case nir_intrinsic_bindless_image_atomic_xor:
|
||||
case nir_intrinsic_image_atomic_exchange:
|
||||
case nir_intrinsic_bindless_image_atomic_exchange:
|
||||
case nir_intrinsic_image_atomic_comp_swap:
|
||||
case nir_intrinsic_bindless_image_atomic_comp_swap:
|
||||
case nir_intrinsic_image_atomic_inc_wrap:
|
||||
case nir_intrinsic_bindless_image_atomic_inc_wrap:
|
||||
type = nir_type_uint;
|
||||
break;
|
||||
case nir_intrinsic_image_atomic_add:
|
||||
case nir_intrinsic_bindless_image_atomic_add:
|
||||
case nir_intrinsic_image_atomic_umin:
|
||||
case nir_intrinsic_bindless_image_atomic_umin:
|
||||
case nir_intrinsic_image_atomic_umax:
|
||||
case nir_intrinsic_bindless_image_atomic_umax:
|
||||
case nir_intrinsic_image_atomic_and:
|
||||
case nir_intrinsic_bindless_image_atomic_and:
|
||||
case nir_intrinsic_image_atomic_or:
|
||||
case nir_intrinsic_bindless_image_atomic_or:
|
||||
case nir_intrinsic_image_atomic_xor:
|
||||
case nir_intrinsic_bindless_image_atomic_xor:
|
||||
case nir_intrinsic_image_atomic_exchange:
|
||||
case nir_intrinsic_bindless_image_atomic_exchange:
|
||||
case nir_intrinsic_image_atomic_comp_swap:
|
||||
case nir_intrinsic_bindless_image_atomic_comp_swap:
|
||||
case nir_intrinsic_image_atomic_inc_wrap:
|
||||
case nir_intrinsic_bindless_image_atomic_inc_wrap:
|
||||
type = nir_type_uint;
|
||||
break;
|
||||
|
||||
case nir_intrinsic_image_atomic_imin:
|
||||
case nir_intrinsic_bindless_image_atomic_imin:
|
||||
case nir_intrinsic_image_atomic_imax:
|
||||
case nir_intrinsic_bindless_image_atomic_imax:
|
||||
type = nir_type_int;
|
||||
break;
|
||||
case nir_intrinsic_image_atomic_imin:
|
||||
case nir_intrinsic_bindless_image_atomic_imin:
|
||||
case nir_intrinsic_image_atomic_imax:
|
||||
case nir_intrinsic_bindless_image_atomic_imax:
|
||||
type = nir_type_int;
|
||||
break;
|
||||
|
||||
default:
|
||||
unreachable("Unhandled NIR image intrinsic");
|
||||
}
|
||||
default:
|
||||
unreachable("Unhandled NIR image intrinsic");
|
||||
}
|
||||
|
||||
switch (type) {
|
||||
case nir_type_uint:
|
||||
return bit_size == 16 ? TYPE_U16 : TYPE_U32;
|
||||
case nir_type_int:
|
||||
return bit_size == 16 ? TYPE_S16 : TYPE_S32;
|
||||
case nir_type_float:
|
||||
return bit_size == 16 ? TYPE_F16 : TYPE_F32;
|
||||
default:
|
||||
unreachable("bad type");
|
||||
}
|
||||
switch (type) {
|
||||
case nir_type_uint:
|
||||
return bit_size == 16 ? TYPE_U16 : TYPE_U32;
|
||||
case nir_type_int:
|
||||
return bit_size == 16 ? TYPE_S16 : TYPE_S32;
|
||||
case nir_type_float:
|
||||
return bit_size == 16 ? TYPE_F16 : TYPE_F32;
|
||||
default:
|
||||
unreachable("bad type");
|
||||
}
|
||||
}
|
||||
|
||||
/* Returns the number of components for the different image formats
|
||||
|
@ -183,8 +182,8 @@ ir3_get_type_for_image_intrinsic(const nir_intrinsic_instr *instr)
|
|||
unsigned
|
||||
ir3_get_num_components_for_image_format(enum pipe_format format)
|
||||
{
|
||||
if (format == PIPE_FORMAT_NONE)
|
||||
return 4;
|
||||
else
|
||||
return util_format_get_nr_components(format);
|
||||
if (format == PIPE_FORMAT_NONE)
|
||||
return 4;
|
||||
else
|
||||
return util_format_get_nr_components(format);
|
||||
}
|
||||
|
|
|
@ -29,14 +29,15 @@
|
|||
|
||||
#include "ir3_context.h"
|
||||
|
||||
|
||||
void ir3_ibo_mapping_init(struct ir3_ibo_mapping *mapping, unsigned num_textures);
|
||||
void ir3_ibo_mapping_init(struct ir3_ibo_mapping *mapping,
|
||||
unsigned num_textures);
|
||||
struct ir3_instruction *ir3_ssbo_to_ibo(struct ir3_context *ctx, nir_src src);
|
||||
unsigned ir3_ssbo_to_tex(struct ir3_ibo_mapping *mapping, unsigned ssbo);
|
||||
struct ir3_instruction *ir3_image_to_ibo(struct ir3_context *ctx, nir_src src);
|
||||
unsigned ir3_image_to_tex(struct ir3_ibo_mapping *mapping, unsigned image);
|
||||
|
||||
unsigned ir3_get_image_coords(const nir_intrinsic_instr *instr, unsigned *flagsp);
|
||||
unsigned ir3_get_image_coords(const nir_intrinsic_instr *instr,
|
||||
unsigned *flagsp);
|
||||
type_t ir3_get_type_for_image_intrinsic(const nir_intrinsic_instr *instr);
|
||||
unsigned ir3_get_num_components_for_image_format(enum pipe_format);
|
||||
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -37,127 +37,130 @@
|
|||
|
||||
static bool
|
||||
compute_block_liveness(struct ir3_liveness *live, struct ir3_block *block,
|
||||
BITSET_WORD *tmp_live, unsigned bitset_words)
|
||||
BITSET_WORD *tmp_live, unsigned bitset_words)
|
||||
{
|
||||
memcpy(tmp_live, live->live_out[block->index], bitset_words *
|
||||
sizeof(BITSET_WORD));
|
||||
memcpy(tmp_live, live->live_out[block->index],
|
||||
bitset_words * sizeof(BITSET_WORD));
|
||||
|
||||
/* Process instructions */
|
||||
foreach_instr_rev (instr, &block->instr_list) {
|
||||
ra_foreach_dst(dst, instr) {
|
||||
if (BITSET_TEST(tmp_live, dst->name))
|
||||
dst->flags &= ~IR3_REG_UNUSED;
|
||||
else
|
||||
dst->flags |= IR3_REG_UNUSED;
|
||||
BITSET_CLEAR(tmp_live, dst->name);
|
||||
}
|
||||
/* Process instructions */
|
||||
foreach_instr_rev (instr, &block->instr_list) {
|
||||
ra_foreach_dst (dst, instr) {
|
||||
if (BITSET_TEST(tmp_live, dst->name))
|
||||
dst->flags &= ~IR3_REG_UNUSED;
|
||||
else
|
||||
dst->flags |= IR3_REG_UNUSED;
|
||||
BITSET_CLEAR(tmp_live, dst->name);
|
||||
}
|
||||
|
||||
/* Phi node uses occur after the predecessor block */
|
||||
if (instr->opc != OPC_META_PHI) {
|
||||
ra_foreach_src(src, instr) {
|
||||
if (BITSET_TEST(tmp_live, src->def->name))
|
||||
src->flags &= ~IR3_REG_KILL;
|
||||
else
|
||||
src->flags |= IR3_REG_KILL;
|
||||
}
|
||||
/* Phi node uses occur after the predecessor block */
|
||||
if (instr->opc != OPC_META_PHI) {
|
||||
ra_foreach_src (src, instr) {
|
||||
if (BITSET_TEST(tmp_live, src->def->name))
|
||||
src->flags &= ~IR3_REG_KILL;
|
||||
else
|
||||
src->flags |= IR3_REG_KILL;
|
||||
}
|
||||
|
||||
ra_foreach_src(src, instr) {
|
||||
if (BITSET_TEST(tmp_live, src->def->name))
|
||||
src->flags &= ~IR3_REG_FIRST_KILL;
|
||||
else
|
||||
src->flags |= IR3_REG_FIRST_KILL;
|
||||
BITSET_SET(tmp_live, src->def->name);
|
||||
}
|
||||
}
|
||||
}
|
||||
ra_foreach_src (src, instr) {
|
||||
if (BITSET_TEST(tmp_live, src->def->name))
|
||||
src->flags &= ~IR3_REG_FIRST_KILL;
|
||||
else
|
||||
src->flags |= IR3_REG_FIRST_KILL;
|
||||
BITSET_SET(tmp_live, src->def->name);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
memcpy(live->live_in[block->index], tmp_live,
|
||||
bitset_words * sizeof(BITSET_WORD));
|
||||
memcpy(live->live_in[block->index], tmp_live,
|
||||
bitset_words * sizeof(BITSET_WORD));
|
||||
|
||||
bool progress = false;
|
||||
for (unsigned i = 0; i < block->predecessors_count; i++) {
|
||||
const struct ir3_block *pred = block->predecessors[i];
|
||||
for (unsigned j = 0; j < bitset_words; j++) {
|
||||
if (tmp_live[j] & ~live->live_out[pred->index][j])
|
||||
progress = true;
|
||||
live->live_out[pred->index][j] |= tmp_live[j];
|
||||
}
|
||||
bool progress = false;
|
||||
for (unsigned i = 0; i < block->predecessors_count; i++) {
|
||||
const struct ir3_block *pred = block->predecessors[i];
|
||||
for (unsigned j = 0; j < bitset_words; j++) {
|
||||
if (tmp_live[j] & ~live->live_out[pred->index][j])
|
||||
progress = true;
|
||||
live->live_out[pred->index][j] |= tmp_live[j];
|
||||
}
|
||||
|
||||
/* Process phi sources. */
|
||||
foreach_instr (phi, &block->instr_list) {
|
||||
if (phi->opc != OPC_META_PHI)
|
||||
break;
|
||||
if (!phi->srcs[i]->def)
|
||||
continue;
|
||||
unsigned name = phi->srcs[i]->def->name;
|
||||
if (!BITSET_TEST(live->live_out[pred->index], name)) {
|
||||
progress = true;
|
||||
BITSET_SET(live->live_out[pred->index], name);
|
||||
}
|
||||
}
|
||||
}
|
||||
/* Process phi sources. */
|
||||
foreach_instr (phi, &block->instr_list) {
|
||||
if (phi->opc != OPC_META_PHI)
|
||||
break;
|
||||
if (!phi->srcs[i]->def)
|
||||
continue;
|
||||
unsigned name = phi->srcs[i]->def->name;
|
||||
if (!BITSET_TEST(live->live_out[pred->index], name)) {
|
||||
progress = true;
|
||||
BITSET_SET(live->live_out[pred->index], name);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < block->physical_predecessors_count; i++) {
|
||||
const struct ir3_block *pred = block->physical_predecessors[i];
|
||||
unsigned name;
|
||||
BITSET_FOREACH_SET(name, tmp_live, live->definitions_count) {
|
||||
struct ir3_register *reg = live->definitions[name];
|
||||
if (!(reg->flags & IR3_REG_SHARED))
|
||||
continue;
|
||||
if (!BITSET_TEST(live->live_out[pred->index], name)) {
|
||||
progress = true;
|
||||
BITSET_SET(live->live_out[pred->index], name);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return progress;
|
||||
for (unsigned i = 0; i < block->physical_predecessors_count; i++) {
|
||||
const struct ir3_block *pred = block->physical_predecessors[i];
|
||||
unsigned name;
|
||||
BITSET_FOREACH_SET (name, tmp_live, live->definitions_count) {
|
||||
struct ir3_register *reg = live->definitions[name];
|
||||
if (!(reg->flags & IR3_REG_SHARED))
|
||||
continue;
|
||||
if (!BITSET_TEST(live->live_out[pred->index], name)) {
|
||||
progress = true;
|
||||
BITSET_SET(live->live_out[pred->index], name);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return progress;
|
||||
}
|
||||
|
||||
struct ir3_liveness *ir3_calc_liveness(struct ir3_shader_variant *v)
|
||||
struct ir3_liveness *
|
||||
ir3_calc_liveness(struct ir3_shader_variant *v)
|
||||
{
|
||||
struct ir3_liveness *live = rzalloc(NULL, struct ir3_liveness);
|
||||
struct ir3_liveness *live = rzalloc(NULL, struct ir3_liveness);
|
||||
|
||||
/* Reserve name 0 to mean "doesn't have a name yet" to make the debug
|
||||
* output nicer.
|
||||
*/
|
||||
array_insert(live, live->definitions, NULL);
|
||||
/* Reserve name 0 to mean "doesn't have a name yet" to make the debug
|
||||
* output nicer.
|
||||
*/
|
||||
array_insert(live, live->definitions, NULL);
|
||||
|
||||
/* Build definition <-> name mapping */
|
||||
unsigned block_count = 0;
|
||||
foreach_block (block, &v->ir->block_list) {
|
||||
block->index = block_count++;
|
||||
foreach_instr (instr, &block->instr_list) {
|
||||
ra_foreach_dst(dst, instr) {
|
||||
dst->name = live->definitions_count;
|
||||
array_insert(live, live->definitions, dst);
|
||||
}
|
||||
}
|
||||
}
|
||||
/* Build definition <-> name mapping */
|
||||
unsigned block_count = 0;
|
||||
foreach_block (block, &v->ir->block_list) {
|
||||
block->index = block_count++;
|
||||
foreach_instr (instr, &block->instr_list) {
|
||||
ra_foreach_dst (dst, instr) {
|
||||
dst->name = live->definitions_count;
|
||||
array_insert(live, live->definitions, dst);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
live->block_count = block_count;
|
||||
live->block_count = block_count;
|
||||
|
||||
unsigned bitset_words = BITSET_WORDS(live->definitions_count);
|
||||
BITSET_WORD *tmp_live = ralloc_array(live, BITSET_WORD, bitset_words);
|
||||
live->live_in = ralloc_array(live, BITSET_WORD *, block_count);
|
||||
live->live_out = ralloc_array(live, BITSET_WORD *, block_count);
|
||||
unsigned i = 0;
|
||||
foreach_block (block, &v->ir->block_list) {
|
||||
block->index = i++;
|
||||
live->live_in[block->index] = rzalloc_array(live, BITSET_WORD, bitset_words);
|
||||
live->live_out[block->index] = rzalloc_array(live, BITSET_WORD, bitset_words);
|
||||
}
|
||||
unsigned bitset_words = BITSET_WORDS(live->definitions_count);
|
||||
BITSET_WORD *tmp_live = ralloc_array(live, BITSET_WORD, bitset_words);
|
||||
live->live_in = ralloc_array(live, BITSET_WORD *, block_count);
|
||||
live->live_out = ralloc_array(live, BITSET_WORD *, block_count);
|
||||
unsigned i = 0;
|
||||
foreach_block (block, &v->ir->block_list) {
|
||||
block->index = i++;
|
||||
live->live_in[block->index] =
|
||||
rzalloc_array(live, BITSET_WORD, bitset_words);
|
||||
live->live_out[block->index] =
|
||||
rzalloc_array(live, BITSET_WORD, bitset_words);
|
||||
}
|
||||
|
||||
bool progress = true;
|
||||
while (progress) {
|
||||
progress = false;
|
||||
foreach_block_rev (block, &v->ir->block_list) {
|
||||
progress |=
|
||||
compute_block_liveness(live, block, tmp_live, bitset_words);
|
||||
}
|
||||
}
|
||||
bool progress = true;
|
||||
while (progress) {
|
||||
progress = false;
|
||||
foreach_block_rev (block, &v->ir->block_list) {
|
||||
progress |=
|
||||
compute_block_liveness(live, block, tmp_live, bitset_words);
|
||||
}
|
||||
}
|
||||
|
||||
return live;
|
||||
return live;
|
||||
}
|
||||
|
||||
/* Return true if "def" is live after "instr". It's assumed that "def"
|
||||
|
@ -165,32 +168,31 @@ struct ir3_liveness *ir3_calc_liveness(struct ir3_shader_variant *v)
|
|||
*/
|
||||
bool
|
||||
ir3_def_live_after(struct ir3_liveness *live, struct ir3_register *def,
|
||||
struct ir3_instruction *instr)
|
||||
struct ir3_instruction *instr)
|
||||
{
|
||||
/* If it's live out then it's definitely live at the instruction. */
|
||||
if (BITSET_TEST(live->live_out[instr->block->index], def->name))
|
||||
return true;
|
||||
/* If it's live out then it's definitely live at the instruction. */
|
||||
if (BITSET_TEST(live->live_out[instr->block->index], def->name))
|
||||
return true;
|
||||
|
||||
/* If it's not live in and not defined in the same block then the live
|
||||
* range can't extend to the instruction.
|
||||
*/
|
||||
if (def->instr->block != instr->block &&
|
||||
!BITSET_TEST(live->live_in[instr->block->index], def->name))
|
||||
return false;
|
||||
/* If it's not live in and not defined in the same block then the live
|
||||
* range can't extend to the instruction.
|
||||
*/
|
||||
if (def->instr->block != instr->block &&
|
||||
!BITSET_TEST(live->live_in[instr->block->index], def->name))
|
||||
return false;
|
||||
|
||||
/* Ok, now comes the tricky case, where "def" is killed somewhere in
|
||||
* "instr"'s block and we have to check if it's before or after.
|
||||
*/
|
||||
foreach_instr_rev (test_instr, &instr->block->instr_list) {
|
||||
if (test_instr == instr)
|
||||
break;
|
||||
/* Ok, now comes the tricky case, where "def" is killed somewhere in
|
||||
* "instr"'s block and we have to check if it's before or after.
|
||||
*/
|
||||
foreach_instr_rev (test_instr, &instr->block->instr_list) {
|
||||
if (test_instr == instr)
|
||||
break;
|
||||
|
||||
for (unsigned i = 0; i < test_instr->srcs_count; i++) {
|
||||
if (test_instr->srcs[i]->def == def)
|
||||
return true;
|
||||
}
|
||||
}
|
||||
for (unsigned i = 0; i < test_instr->srcs_count; i++) {
|
||||
if (test_instr->srcs[i]->def == def)
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
|
@ -25,524 +25,542 @@
|
|||
#include "ir3_shader.h"
|
||||
|
||||
struct copy_src {
|
||||
unsigned flags;
|
||||
union {
|
||||
uint32_t imm;
|
||||
physreg_t reg;
|
||||
unsigned const_num;
|
||||
};
|
||||
unsigned flags;
|
||||
union {
|
||||
uint32_t imm;
|
||||
physreg_t reg;
|
||||
unsigned const_num;
|
||||
};
|
||||
};
|
||||
|
||||
struct copy_entry {
|
||||
physreg_t dst;
|
||||
unsigned flags;
|
||||
bool done;
|
||||
physreg_t dst;
|
||||
unsigned flags;
|
||||
bool done;
|
||||
|
||||
struct copy_src src;
|
||||
struct copy_src src;
|
||||
};
|
||||
|
||||
static unsigned
|
||||
copy_entry_size(const struct copy_entry *entry)
|
||||
{
|
||||
return (entry->flags & IR3_REG_HALF) ? 1 : 2;
|
||||
return (entry->flags & IR3_REG_HALF) ? 1 : 2;
|
||||
}
|
||||
|
||||
static struct copy_src
|
||||
get_copy_src(const struct ir3_register *reg, unsigned offset)
|
||||
{
|
||||
if (reg->flags & IR3_REG_IMMED) {
|
||||
return (struct copy_src) {
|
||||
.flags = IR3_REG_IMMED,
|
||||
.imm = reg->uim_val,
|
||||
};
|
||||
} else if (reg->flags & IR3_REG_CONST) {
|
||||
return (struct copy_src) {
|
||||
.flags = IR3_REG_CONST,
|
||||
.const_num = reg->num,
|
||||
};
|
||||
} else {
|
||||
return (struct copy_src) {
|
||||
.flags = 0,
|
||||
.reg = ra_reg_get_physreg(reg) + offset,
|
||||
};
|
||||
}
|
||||
if (reg->flags & IR3_REG_IMMED) {
|
||||
return (struct copy_src){
|
||||
.flags = IR3_REG_IMMED,
|
||||
.imm = reg->uim_val,
|
||||
};
|
||||
} else if (reg->flags & IR3_REG_CONST) {
|
||||
return (struct copy_src){
|
||||
.flags = IR3_REG_CONST,
|
||||
.const_num = reg->num,
|
||||
};
|
||||
} else {
|
||||
return (struct copy_src){
|
||||
.flags = 0,
|
||||
.reg = ra_reg_get_physreg(reg) + offset,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
do_xor(struct ir3_instruction *instr, unsigned dst_num, unsigned src1_num, unsigned src2_num, unsigned flags)
|
||||
do_xor(struct ir3_instruction *instr, unsigned dst_num, unsigned src1_num,
|
||||
unsigned src2_num, unsigned flags)
|
||||
{
|
||||
struct ir3_instruction *xor = ir3_instr_create(instr->block, OPC_XOR_B, 1, 2);
|
||||
ir3_dst_create(xor, dst_num, flags);
|
||||
ir3_src_create(xor, src1_num, flags);
|
||||
ir3_src_create(xor, src2_num, flags);
|
||||
struct ir3_instruction * xor
|
||||
= ir3_instr_create(instr->block, OPC_XOR_B, 1, 2);
|
||||
ir3_dst_create(xor, dst_num, flags);
|
||||
ir3_src_create(xor, src1_num, flags);
|
||||
ir3_src_create(xor, src2_num, flags);
|
||||
|
||||
ir3_instr_move_before(xor, instr);
|
||||
ir3_instr_move_before(xor, instr);
|
||||
}
|
||||
|
||||
static void
|
||||
do_swap(struct ir3_compiler *compiler, struct ir3_instruction *instr,
|
||||
const struct copy_entry *entry)
|
||||
const struct copy_entry *entry)
|
||||
{
|
||||
assert(!entry->src.flags);
|
||||
assert(!entry->src.flags);
|
||||
|
||||
if (entry->flags & IR3_REG_HALF) {
|
||||
/* We currently make sure to never emit parallel copies where the
|
||||
* source/destination is a half-reg above the range accessible to half
|
||||
* registers. However, when a full-reg source overlaps a half-reg
|
||||
* destination or vice versa, it can be very, very complicated to come
|
||||
* up with a series of "legal" swaps and copies to resolve the
|
||||
* parallel copy. So here we provide a fallback to implement the
|
||||
* "illegal" swap instead. This may also be useful for implementing
|
||||
* "spilling" half-regs to the inaccessible space.
|
||||
*/
|
||||
if (entry->src.reg >= RA_HALF_SIZE) {
|
||||
/* Choose a temporary that doesn't overlap src or dst */
|
||||
physreg_t tmp = entry->dst < 2 ? 2 : 0;
|
||||
if (entry->flags & IR3_REG_HALF) {
|
||||
/* We currently make sure to never emit parallel copies where the
|
||||
* source/destination is a half-reg above the range accessible to half
|
||||
* registers. However, when a full-reg source overlaps a half-reg
|
||||
* destination or vice versa, it can be very, very complicated to come
|
||||
* up with a series of "legal" swaps and copies to resolve the
|
||||
* parallel copy. So here we provide a fallback to implement the
|
||||
* "illegal" swap instead. This may also be useful for implementing
|
||||
* "spilling" half-regs to the inaccessible space.
|
||||
*/
|
||||
if (entry->src.reg >= RA_HALF_SIZE) {
|
||||
/* Choose a temporary that doesn't overlap src or dst */
|
||||
physreg_t tmp = entry->dst < 2 ? 2 : 0;
|
||||
|
||||
/* Swap src and the temporary */
|
||||
do_swap(compiler, instr, &(struct copy_entry) {
|
||||
.src = { .reg = entry->src.reg & ~1u },
|
||||
.dst = tmp,
|
||||
.flags = entry->flags & ~IR3_REG_HALF,
|
||||
});
|
||||
/* Swap src and the temporary */
|
||||
do_swap(compiler, instr,
|
||||
&(struct copy_entry){
|
||||
.src = {.reg = entry->src.reg & ~1u},
|
||||
.dst = tmp,
|
||||
.flags = entry->flags & ~IR3_REG_HALF,
|
||||
});
|
||||
|
||||
/* Do the original swap with src replaced with tmp */
|
||||
do_swap(compiler, instr, &(struct copy_entry) {
|
||||
.src = { .reg = tmp + (entry->src.reg & 1) },
|
||||
.dst = entry->dst,
|
||||
.flags = entry->flags,
|
||||
});
|
||||
/* Do the original swap with src replaced with tmp */
|
||||
do_swap(compiler, instr,
|
||||
&(struct copy_entry){
|
||||
.src = {.reg = tmp + (entry->src.reg & 1)},
|
||||
.dst = entry->dst,
|
||||
.flags = entry->flags,
|
||||
});
|
||||
|
||||
/* Swap src and the temporary back */
|
||||
do_swap(compiler, instr, &(struct copy_entry) {
|
||||
.src = { .reg = entry->src.reg & ~1u },
|
||||
.dst = tmp,
|
||||
.flags = entry->flags & ~IR3_REG_HALF,
|
||||
});
|
||||
return;
|
||||
}
|
||||
/* Swap src and the temporary back */
|
||||
do_swap(compiler, instr,
|
||||
&(struct copy_entry){
|
||||
.src = {.reg = entry->src.reg & ~1u},
|
||||
.dst = tmp,
|
||||
.flags = entry->flags & ~IR3_REG_HALF,
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
/* If dst is not addressable, we only need to swap the arguments and
|
||||
* let the case above handle it.
|
||||
*/
|
||||
if (entry->dst >= RA_HALF_SIZE) {
|
||||
do_swap(compiler, instr, &(struct copy_entry) {
|
||||
.src = { .reg = entry->dst },
|
||||
.dst = entry->src.reg,
|
||||
.flags = entry->flags,
|
||||
});
|
||||
return;
|
||||
}
|
||||
}
|
||||
/* If dst is not addressable, we only need to swap the arguments and
|
||||
* let the case above handle it.
|
||||
*/
|
||||
if (entry->dst >= RA_HALF_SIZE) {
|
||||
do_swap(compiler, instr,
|
||||
&(struct copy_entry){
|
||||
.src = {.reg = entry->dst},
|
||||
.dst = entry->src.reg,
|
||||
.flags = entry->flags,
|
||||
});
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
unsigned src_num = ra_physreg_to_num(entry->src.reg, entry->flags);
|
||||
unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);
|
||||
unsigned src_num = ra_physreg_to_num(entry->src.reg, entry->flags);
|
||||
unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);
|
||||
|
||||
/* a5xx+ is known to support swz, which enables us to swap two registers
|
||||
* in-place. If unsupported we emulate it using the xor trick.
|
||||
*/
|
||||
if (compiler->gpu_id < 500) {
|
||||
/* Shared regs only exist since a5xx, so we don't have to provide a
|
||||
* fallback path for them.
|
||||
*/
|
||||
assert(!(entry->flags & IR3_REG_SHARED));
|
||||
do_xor(instr, dst_num, dst_num, src_num, entry->flags);
|
||||
do_xor(instr, src_num, src_num, dst_num, entry->flags);
|
||||
do_xor(instr, dst_num, dst_num, src_num, entry->flags);
|
||||
} else {
|
||||
/* Use a macro for shared regs because any shared reg writes need to
|
||||
* be wrapped in a getone block to work correctly. Writing shared regs
|
||||
* with multiple threads active does not work, even if they all return
|
||||
* the same value.
|
||||
*/
|
||||
unsigned opc = (entry->flags & IR3_REG_SHARED) ? OPC_SWZ_SHARED_MACRO : OPC_SWZ;
|
||||
struct ir3_instruction *swz = ir3_instr_create(instr->block, opc, 2, 2);
|
||||
ir3_dst_create(swz, dst_num, entry->flags);
|
||||
ir3_dst_create(swz, src_num, entry->flags);
|
||||
ir3_src_create(swz, src_num, entry->flags);
|
||||
ir3_src_create(swz, dst_num, entry->flags);
|
||||
swz->cat1.dst_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
|
||||
swz->cat1.src_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
|
||||
swz->repeat = 1;
|
||||
ir3_instr_move_before(swz, instr);
|
||||
}
|
||||
/* a5xx+ is known to support swz, which enables us to swap two registers
|
||||
* in-place. If unsupported we emulate it using the xor trick.
|
||||
*/
|
||||
if (compiler->gpu_id < 500) {
|
||||
/* Shared regs only exist since a5xx, so we don't have to provide a
|
||||
* fallback path for them.
|
||||
*/
|
||||
assert(!(entry->flags & IR3_REG_SHARED));
|
||||
do_xor(instr, dst_num, dst_num, src_num, entry->flags);
|
||||
do_xor(instr, src_num, src_num, dst_num, entry->flags);
|
||||
do_xor(instr, dst_num, dst_num, src_num, entry->flags);
|
||||
} else {
|
||||
/* Use a macro for shared regs because any shared reg writes need to
|
||||
* be wrapped in a getone block to work correctly. Writing shared regs
|
||||
* with multiple threads active does not work, even if they all return
|
||||
* the same value.
|
||||
*/
|
||||
unsigned opc =
|
||||
(entry->flags & IR3_REG_SHARED) ? OPC_SWZ_SHARED_MACRO : OPC_SWZ;
|
||||
struct ir3_instruction *swz = ir3_instr_create(instr->block, opc, 2, 2);
|
||||
ir3_dst_create(swz, dst_num, entry->flags);
|
||||
ir3_dst_create(swz, src_num, entry->flags);
|
||||
ir3_src_create(swz, src_num, entry->flags);
|
||||
ir3_src_create(swz, dst_num, entry->flags);
|
||||
swz->cat1.dst_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
|
||||
swz->cat1.src_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
|
||||
swz->repeat = 1;
|
||||
ir3_instr_move_before(swz, instr);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
do_copy(struct ir3_compiler *compiler, struct ir3_instruction *instr,
|
||||
const struct copy_entry *entry)
|
||||
const struct copy_entry *entry)
|
||||
{
|
||||
if (entry->flags & IR3_REG_HALF) {
|
||||
/* See do_swap() for why this is here. */
|
||||
if (entry->dst >= RA_HALF_SIZE) {
|
||||
/* TODO: is there a hw instruction we can use for this case? */
|
||||
physreg_t tmp = !entry->src.flags && entry->src.reg < 2 ? 2 : 0;
|
||||
if (entry->flags & IR3_REG_HALF) {
|
||||
/* See do_swap() for why this is here. */
|
||||
if (entry->dst >= RA_HALF_SIZE) {
|
||||
/* TODO: is there a hw instruction we can use for this case? */
|
||||
physreg_t tmp = !entry->src.flags && entry->src.reg < 2 ? 2 : 0;
|
||||
|
||||
do_swap(compiler, instr, &(struct copy_entry) {
|
||||
.src = { .reg = entry->dst & ~1u },
|
||||
.dst = tmp,
|
||||
.flags = entry->flags & ~IR3_REG_HALF,
|
||||
});
|
||||
do_swap(compiler, instr,
|
||||
&(struct copy_entry){
|
||||
.src = {.reg = entry->dst & ~1u},
|
||||
.dst = tmp,
|
||||
.flags = entry->flags & ~IR3_REG_HALF,
|
||||
});
|
||||
|
||||
do_copy(compiler, instr, &(struct copy_entry) {
|
||||
.src = entry->src,
|
||||
.dst = tmp + (entry->dst & 1),
|
||||
.flags = entry->flags,
|
||||
});
|
||||
do_copy(compiler, instr,
|
||||
&(struct copy_entry){
|
||||
.src = entry->src,
|
||||
.dst = tmp + (entry->dst & 1),
|
||||
.flags = entry->flags,
|
||||
});
|
||||
|
||||
do_swap(compiler, instr, &(struct copy_entry) {
|
||||
.src = { .reg = entry->dst & ~1u },
|
||||
.dst = tmp,
|
||||
.flags = entry->flags & ~IR3_REG_HALF,
|
||||
});
|
||||
return;
|
||||
}
|
||||
do_swap(compiler, instr,
|
||||
&(struct copy_entry){
|
||||
.src = {.reg = entry->dst & ~1u},
|
||||
.dst = tmp,
|
||||
.flags = entry->flags & ~IR3_REG_HALF,
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
if (!entry->src.flags && entry->src.reg >= RA_HALF_SIZE) {
|
||||
unsigned src_num =
|
||||
ra_physreg_to_num(entry->src.reg & ~1u, entry->flags & ~IR3_REG_HALF);
|
||||
unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);
|
||||
|
||||
if (entry->src.reg % 2 == 0) {
|
||||
/* cov.u32u16 dst, src */
|
||||
struct ir3_instruction *cov = ir3_instr_create(instr->block, OPC_MOV, 1, 1);
|
||||
ir3_dst_create(cov, dst_num, entry->flags);
|
||||
ir3_src_create(cov, src_num, entry->flags & ~IR3_REG_HALF);
|
||||
cov->cat1.dst_type = TYPE_U16;
|
||||
cov->cat1.src_type = TYPE_U32;
|
||||
ir3_instr_move_before(cov, instr);
|
||||
} else {
|
||||
/* shr.b dst, src, h(16) */
|
||||
struct ir3_instruction *shr = ir3_instr_create(instr->block, OPC_SHR_B, 1, 2);
|
||||
ir3_dst_create(shr, dst_num, entry->flags);
|
||||
ir3_src_create(shr, src_num, entry->flags & ~IR3_REG_HALF);
|
||||
ir3_src_create(shr, 0, entry->flags | IR3_REG_IMMED)->uim_val = 16;
|
||||
ir3_instr_move_before(shr, instr);
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
if (!entry->src.flags && entry->src.reg >= RA_HALF_SIZE) {
|
||||
unsigned src_num = ra_physreg_to_num(entry->src.reg & ~1u,
|
||||
entry->flags & ~IR3_REG_HALF);
|
||||
unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);
|
||||
|
||||
unsigned src_num = ra_physreg_to_num(entry->src.reg, entry->flags);
|
||||
unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);
|
||||
if (entry->src.reg % 2 == 0) {
|
||||
/* cov.u32u16 dst, src */
|
||||
struct ir3_instruction *cov =
|
||||
ir3_instr_create(instr->block, OPC_MOV, 1, 1);
|
||||
ir3_dst_create(cov, dst_num, entry->flags);
|
||||
ir3_src_create(cov, src_num, entry->flags & ~IR3_REG_HALF);
|
||||
cov->cat1.dst_type = TYPE_U16;
|
||||
cov->cat1.src_type = TYPE_U32;
|
||||
ir3_instr_move_before(cov, instr);
|
||||
} else {
|
||||
/* shr.b dst, src, h(16) */
|
||||
struct ir3_instruction *shr =
|
||||
ir3_instr_create(instr->block, OPC_SHR_B, 1, 2);
|
||||
ir3_dst_create(shr, dst_num, entry->flags);
|
||||
ir3_src_create(shr, src_num, entry->flags & ~IR3_REG_HALF);
|
||||
ir3_src_create(shr, 0, entry->flags | IR3_REG_IMMED)->uim_val = 16;
|
||||
ir3_instr_move_before(shr, instr);
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/* Similar to the swap case, we have to use a macro for shared regs. */
|
||||
unsigned opc = (entry->flags & IR3_REG_SHARED) ? OPC_READ_FIRST_MACRO : OPC_MOV;
|
||||
struct ir3_instruction *mov = ir3_instr_create(instr->block, opc, 1, 1);
|
||||
ir3_dst_create(mov, dst_num, entry->flags);
|
||||
ir3_src_create(mov, src_num, entry->flags | entry->src.flags);
|
||||
mov->cat1.dst_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
|
||||
mov->cat1.src_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
|
||||
if (entry->src.flags & IR3_REG_IMMED)
|
||||
mov->srcs[0]->uim_val = entry->src.imm;
|
||||
else if (entry->src.flags & IR3_REG_CONST)
|
||||
mov->srcs[0]->num = entry->src.const_num;
|
||||
ir3_instr_move_before(mov, instr);
|
||||
unsigned src_num = ra_physreg_to_num(entry->src.reg, entry->flags);
|
||||
unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);
|
||||
|
||||
/* Similar to the swap case, we have to use a macro for shared regs. */
|
||||
unsigned opc =
|
||||
(entry->flags & IR3_REG_SHARED) ? OPC_READ_FIRST_MACRO : OPC_MOV;
|
||||
struct ir3_instruction *mov = ir3_instr_create(instr->block, opc, 1, 1);
|
||||
ir3_dst_create(mov, dst_num, entry->flags);
|
||||
ir3_src_create(mov, src_num, entry->flags | entry->src.flags);
|
||||
mov->cat1.dst_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
|
||||
mov->cat1.src_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
|
||||
if (entry->src.flags & IR3_REG_IMMED)
|
||||
mov->srcs[0]->uim_val = entry->src.imm;
|
||||
else if (entry->src.flags & IR3_REG_CONST)
|
||||
mov->srcs[0]->num = entry->src.const_num;
|
||||
ir3_instr_move_before(mov, instr);
|
||||
}
|
||||
|
||||
struct copy_ctx {
|
||||
/* For each physreg, the number of pending copy entries that use it as a
|
||||
* source. Once this drops to zero, then the physreg is unblocked and can
|
||||
* be moved to.
|
||||
*/
|
||||
unsigned physreg_use_count[RA_MAX_FILE_SIZE];
|
||||
/* For each physreg, the number of pending copy entries that use it as a
|
||||
* source. Once this drops to zero, then the physreg is unblocked and can
|
||||
* be moved to.
|
||||
*/
|
||||
unsigned physreg_use_count[RA_MAX_FILE_SIZE];
|
||||
|
||||
/* For each physreg, the pending copy_entry that uses it as a dest. */
|
||||
struct copy_entry *physreg_dst[RA_MAX_FILE_SIZE];
|
||||
/* For each physreg, the pending copy_entry that uses it as a dest. */
|
||||
struct copy_entry *physreg_dst[RA_MAX_FILE_SIZE];
|
||||
|
||||
struct copy_entry entries[RA_MAX_FILE_SIZE];
|
||||
unsigned entry_count;
|
||||
struct copy_entry entries[RA_MAX_FILE_SIZE];
|
||||
unsigned entry_count;
|
||||
};
|
||||
|
||||
static bool
|
||||
entry_blocked(struct copy_entry *entry, struct copy_ctx *ctx)
|
||||
{
|
||||
for (unsigned i = 0; i < copy_entry_size(entry); i++) {
|
||||
if (ctx->physreg_use_count[entry->dst + i] != 0)
|
||||
return true;
|
||||
}
|
||||
for (unsigned i = 0; i < copy_entry_size(entry); i++) {
|
||||
if (ctx->physreg_use_count[entry->dst + i] != 0)
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
return false;
|
||||
}
|
||||
|
||||
static void
|
||||
split_32bit_copy(struct copy_ctx *ctx, struct copy_entry *entry)
|
||||
{
|
||||
assert(!entry->done);
|
||||
assert(!(entry->flags & (IR3_REG_IMMED | IR3_REG_CONST)));
|
||||
assert(copy_entry_size(entry) == 2);
|
||||
struct copy_entry *new_entry = &ctx->entries[ctx->entry_count++];
|
||||
assert(!entry->done);
|
||||
assert(!(entry->flags & (IR3_REG_IMMED | IR3_REG_CONST)));
|
||||
assert(copy_entry_size(entry) == 2);
|
||||
struct copy_entry *new_entry = &ctx->entries[ctx->entry_count++];
|
||||
|
||||
new_entry->dst = entry->dst + 1;
|
||||
new_entry->src.flags = entry->src.flags;
|
||||
new_entry->src.reg = entry->src.reg + 1;
|
||||
new_entry->done = false;
|
||||
entry->flags |= IR3_REG_HALF;
|
||||
new_entry->flags = entry->flags;
|
||||
ctx->physreg_dst[entry->dst + 1] = new_entry;
|
||||
new_entry->dst = entry->dst + 1;
|
||||
new_entry->src.flags = entry->src.flags;
|
||||
new_entry->src.reg = entry->src.reg + 1;
|
||||
new_entry->done = false;
|
||||
entry->flags |= IR3_REG_HALF;
|
||||
new_entry->flags = entry->flags;
|
||||
ctx->physreg_dst[entry->dst + 1] = new_entry;
|
||||
}
|
||||
|
||||
static void
|
||||
_handle_copies(struct ir3_compiler *compiler, struct ir3_instruction *instr,
|
||||
struct copy_ctx *ctx)
|
||||
struct copy_ctx *ctx)
|
||||
{
|
||||
/* Set up the bookkeeping */
|
||||
memset(ctx->physreg_dst, 0, sizeof(ctx->physreg_dst));
|
||||
memset(ctx->physreg_use_count, 0, sizeof(ctx->physreg_use_count));
|
||||
/* Set up the bookkeeping */
|
||||
memset(ctx->physreg_dst, 0, sizeof(ctx->physreg_dst));
|
||||
memset(ctx->physreg_use_count, 0, sizeof(ctx->physreg_use_count));
|
||||
|
||||
for (unsigned i = 0; i < ctx->entry_count; i++) {
|
||||
struct copy_entry *entry = &ctx->entries[i];
|
||||
for (unsigned j = 0; j < copy_entry_size(entry); j++) {
|
||||
if (!entry->src.flags)
|
||||
ctx->physreg_use_count[entry->src.reg + j]++;
|
||||
for (unsigned i = 0; i < ctx->entry_count; i++) {
|
||||
struct copy_entry *entry = &ctx->entries[i];
|
||||
for (unsigned j = 0; j < copy_entry_size(entry); j++) {
|
||||
if (!entry->src.flags)
|
||||
ctx->physreg_use_count[entry->src.reg + j]++;
|
||||
|
||||
/* Copies should not have overlapping destinations. */
|
||||
assert(!ctx->physreg_dst[entry->dst + j]);
|
||||
ctx->physreg_dst[entry->dst + j] = entry;
|
||||
}
|
||||
}
|
||||
/* Copies should not have overlapping destinations. */
|
||||
assert(!ctx->physreg_dst[entry->dst + j]);
|
||||
ctx->physreg_dst[entry->dst + j] = entry;
|
||||
}
|
||||
}
|
||||
|
||||
bool progress = true;
|
||||
while (progress) {
|
||||
progress = false;
|
||||
bool progress = true;
|
||||
while (progress) {
|
||||
progress = false;
|
||||
|
||||
/* Step 1: resolve paths in the transfer graph. This means finding
|
||||
* copies whose destinations aren't blocked by something else and then
|
||||
* emitting them, continuing this process until every copy is blocked
|
||||
* and there are only cycles left.
|
||||
*
|
||||
* TODO: We should note that src is also available in dst to unblock
|
||||
* cycles that src is involved in.
|
||||
*/
|
||||
/* Step 1: resolve paths in the transfer graph. This means finding
|
||||
* copies whose destinations aren't blocked by something else and then
|
||||
* emitting them, continuing this process until every copy is blocked
|
||||
* and there are only cycles left.
|
||||
*
|
||||
* TODO: We should note that src is also available in dst to unblock
|
||||
* cycles that src is involved in.
|
||||
*/
|
||||
|
||||
for (unsigned i = 0; i < ctx->entry_count; i++) {
|
||||
struct copy_entry *entry = &ctx->entries[i];
|
||||
if (!entry->done && !entry_blocked(entry, ctx)) {
|
||||
entry->done = true;
|
||||
progress = true;
|
||||
do_copy(compiler, instr, entry);
|
||||
for (unsigned j = 0; j < copy_entry_size(entry); j++) {
|
||||
if (!entry->src.flags)
|
||||
ctx->physreg_use_count[entry->src.reg + j]--;
|
||||
ctx->physreg_dst[entry->dst + j] = NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
for (unsigned i = 0; i < ctx->entry_count; i++) {
|
||||
struct copy_entry *entry = &ctx->entries[i];
|
||||
if (!entry->done && !entry_blocked(entry, ctx)) {
|
||||
entry->done = true;
|
||||
progress = true;
|
||||
do_copy(compiler, instr, entry);
|
||||
for (unsigned j = 0; j < copy_entry_size(entry); j++) {
|
||||
if (!entry->src.flags)
|
||||
ctx->physreg_use_count[entry->src.reg + j]--;
|
||||
ctx->physreg_dst[entry->dst + j] = NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (progress)
|
||||
continue;
|
||||
if (progress)
|
||||
continue;
|
||||
|
||||
/* Step 2: Find partially blocked copies and split them. In the
|
||||
* mergedregs case, we can have 32-bit copies which are only blocked on one
|
||||
* 16-bit half, and splitting them helps get things moving.
|
||||
*
|
||||
* We can skip splitting copies if the source isn't a register,
|
||||
* however, because it does not unblock anything and therefore doesn't
|
||||
* contribute to making forward progress with step 1. These copies
|
||||
* should still be resolved eventually in step 1 because they can't be
|
||||
* part of a cycle.
|
||||
*/
|
||||
for (unsigned i = 0; i < ctx->entry_count; i++) {
|
||||
struct copy_entry *entry = &ctx->entries[i];
|
||||
if (entry->done || entry->flags & IR3_REG_HALF)
|
||||
continue;
|
||||
/* Step 2: Find partially blocked copies and split them. In the
|
||||
* mergedregs case, we can have 32-bit copies which are only blocked on one
|
||||
* 16-bit half, and splitting them helps get things moving.
|
||||
*
|
||||
* We can skip splitting copies if the source isn't a register,
|
||||
* however, because it does not unblock anything and therefore doesn't
|
||||
* contribute to making forward progress with step 1. These copies
|
||||
* should still be resolved eventually in step 1 because they can't be
|
||||
* part of a cycle.
|
||||
*/
|
||||
for (unsigned i = 0; i < ctx->entry_count; i++) {
|
||||
struct copy_entry *entry = &ctx->entries[i];
|
||||
if (entry->done || entry->flags & IR3_REG_HALF)
|
||||
continue;
|
||||
|
||||
if (((ctx->physreg_use_count[entry->dst] == 0 ||
|
||||
ctx->physreg_use_count[entry->dst + 1] == 0)) &&
|
||||
!(entry->flags & (IR3_REG_IMMED | IR3_REG_CONST))) {
|
||||
split_32bit_copy(ctx, entry);
|
||||
progress = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (((ctx->physreg_use_count[entry->dst] == 0 ||
|
||||
ctx->physreg_use_count[entry->dst + 1] == 0)) &&
|
||||
!(entry->flags & (IR3_REG_IMMED | IR3_REG_CONST))) {
|
||||
split_32bit_copy(ctx, entry);
|
||||
progress = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Step 3: resolve cycles through swapping.
|
||||
*
|
||||
* At this point, the transfer graph should consist of only cycles.
|
||||
* The reason is that, given any physreg n_1 that's the source of a
|
||||
* remaining entry, it has a destination n_2, which (because every
|
||||
* copy is blocked) is the source of some other copy whose destination
|
||||
* is n_3, and so we can follow the chain until we get a cycle. If we
|
||||
* reached some other node than n_1:
|
||||
*
|
||||
* n_1 -> n_2 -> ... -> n_i
|
||||
* ^ |
|
||||
* |-------------|
|
||||
*
|
||||
* then n_2 would be the destination of 2 copies, which is illegal
|
||||
* (checked above in an assert). So n_1 must be part of a cycle:
|
||||
*
|
||||
* n_1 -> n_2 -> ... -> n_i
|
||||
* ^ |
|
||||
* |---------------------|
|
||||
*
|
||||
* and this must be the only cycle n_1 is involved in, because any other
|
||||
* path starting from n_1 would also have to end in n_1, resulting in
|
||||
* a node somewhere along the way being the destination of 2 copies
|
||||
* when the 2 paths merge.
|
||||
*
|
||||
* The way we resolve the cycle is through picking a copy (n_1, n_2)
|
||||
* and swapping n_1 and n_2. This moves n_1 to n_2, so n_2 is taken
|
||||
* out of the cycle:
|
||||
*
|
||||
* n_1 -> ... -> n_i
|
||||
* ^ |
|
||||
* |--------------|
|
||||
*
|
||||
* and we can keep repeating this until the cycle is empty.
|
||||
*/
|
||||
/* Step 3: resolve cycles through swapping.
|
||||
*
|
||||
* At this point, the transfer graph should consist of only cycles.
|
||||
* The reason is that, given any physreg n_1 that's the source of a
|
||||
* remaining entry, it has a destination n_2, which (because every
|
||||
* copy is blocked) is the source of some other copy whose destination
|
||||
* is n_3, and so we can follow the chain until we get a cycle. If we
|
||||
* reached some other node than n_1:
|
||||
*
|
||||
* n_1 -> n_2 -> ... -> n_i
|
||||
* ^ |
|
||||
* |-------------|
|
||||
*
|
||||
* then n_2 would be the destination of 2 copies, which is illegal
|
||||
* (checked above in an assert). So n_1 must be part of a cycle:
|
||||
*
|
||||
* n_1 -> n_2 -> ... -> n_i
|
||||
* ^ |
|
||||
* |---------------------|
|
||||
*
|
||||
* and this must be the only cycle n_1 is involved in, because any other
|
||||
* path starting from n_1 would also have to end in n_1, resulting in
|
||||
* a node somewhere along the way being the destination of 2 copies
|
||||
* when the 2 paths merge.
|
||||
*
|
||||
* The way we resolve the cycle is through picking a copy (n_1, n_2)
|
||||
* and swapping n_1 and n_2. This moves n_1 to n_2, so n_2 is taken
|
||||
* out of the cycle:
|
||||
*
|
||||
* n_1 -> ... -> n_i
|
||||
* ^ |
|
||||
* |--------------|
|
||||
*
|
||||
* and we can keep repeating this until the cycle is empty.
|
||||
*/
|
||||
|
||||
for (unsigned i = 0; i < ctx->entry_count; i++) {
|
||||
struct copy_entry *entry = &ctx->entries[i];
|
||||
if (entry->done)
|
||||
continue;
|
||||
for (unsigned i = 0; i < ctx->entry_count; i++) {
|
||||
struct copy_entry *entry = &ctx->entries[i];
|
||||
if (entry->done)
|
||||
continue;
|
||||
|
||||
assert(!entry->src.flags);
|
||||
assert(!entry->src.flags);
|
||||
|
||||
/* catch trivial copies */
|
||||
if (entry->dst == entry->src.reg) {
|
||||
entry->done = true;
|
||||
continue;
|
||||
}
|
||||
/* catch trivial copies */
|
||||
if (entry->dst == entry->src.reg) {
|
||||
entry->done = true;
|
||||
continue;
|
||||
}
|
||||
|
||||
do_swap(compiler, instr, entry);
|
||||
do_swap(compiler, instr, entry);
|
||||
|
||||
/* Split any blocking copies whose sources are only partially
|
||||
* contained within our destination.
|
||||
*/
|
||||
if (entry->flags & IR3_REG_HALF) {
|
||||
for (unsigned j = 0; j < ctx->entry_count; j++) {
|
||||
struct copy_entry *blocking = &ctx->entries[j];
|
||||
/* Split any blocking copies whose sources are only partially
|
||||
* contained within our destination.
|
||||
*/
|
||||
if (entry->flags & IR3_REG_HALF) {
|
||||
for (unsigned j = 0; j < ctx->entry_count; j++) {
|
||||
struct copy_entry *blocking = &ctx->entries[j];
|
||||
|
||||
if (blocking->done)
|
||||
continue;
|
||||
if (blocking->done)
|
||||
continue;
|
||||
|
||||
if (blocking->src.reg <= entry->dst &&
|
||||
blocking->src.reg + 1 >= entry->dst &&
|
||||
!(blocking->flags & IR3_REG_HALF)) {
|
||||
split_32bit_copy(ctx, blocking);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (blocking->src.reg <= entry->dst &&
|
||||
blocking->src.reg + 1 >= entry->dst &&
|
||||
!(blocking->flags & IR3_REG_HALF)) {
|
||||
split_32bit_copy(ctx, blocking);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Update sources of blocking copies.
|
||||
*
|
||||
* Note: at this point, every blocking copy's source should be
|
||||
* contained within our destination.
|
||||
*/
|
||||
for (unsigned j = 0; j < ctx->entry_count; j++) {
|
||||
struct copy_entry *blocking = &ctx->entries[j];
|
||||
if (blocking->src.reg >= entry->dst &&
|
||||
blocking->src.reg < entry->dst + copy_entry_size(entry)) {
|
||||
blocking->src.reg = entry->src.reg + (blocking->src.reg - entry->dst);
|
||||
}
|
||||
}
|
||||
}
|
||||
/* Update sources of blocking copies.
|
||||
*
|
||||
* Note: at this point, every blocking copy's source should be
|
||||
* contained within our destination.
|
||||
*/
|
||||
for (unsigned j = 0; j < ctx->entry_count; j++) {
|
||||
struct copy_entry *blocking = &ctx->entries[j];
|
||||
if (blocking->src.reg >= entry->dst &&
|
||||
blocking->src.reg < entry->dst + copy_entry_size(entry)) {
|
||||
blocking->src.reg =
|
||||
entry->src.reg + (blocking->src.reg - entry->dst);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
handle_copies(struct ir3_shader_variant *v, struct ir3_instruction *instr,
|
||||
struct copy_entry *entries, unsigned entry_count)
|
||||
struct copy_entry *entries, unsigned entry_count)
|
||||
{
|
||||
struct copy_ctx ctx;
|
||||
struct copy_ctx ctx;
|
||||
|
||||
/* handle shared copies first */
|
||||
ctx.entry_count = 0;
|
||||
for (unsigned i = 0; i < entry_count; i++) {
|
||||
if (entries[i].flags & IR3_REG_SHARED)
|
||||
ctx.entries[ctx.entry_count++] = entries[i];
|
||||
}
|
||||
_handle_copies(v->shader->compiler, instr, &ctx);
|
||||
/* handle shared copies first */
|
||||
ctx.entry_count = 0;
|
||||
for (unsigned i = 0; i < entry_count; i++) {
|
||||
if (entries[i].flags & IR3_REG_SHARED)
|
||||
ctx.entries[ctx.entry_count++] = entries[i];
|
||||
}
|
||||
_handle_copies(v->shader->compiler, instr, &ctx);
|
||||
|
||||
if (v->mergedregs) {
|
||||
/* Half regs and full regs are in the same file, so handle everything
|
||||
* at once.
|
||||
*/
|
||||
ctx.entry_count = 0;
|
||||
for (unsigned i = 0; i < entry_count; i++) {
|
||||
if (!(entries[i].flags & IR3_REG_SHARED))
|
||||
ctx.entries[ctx.entry_count++] = entries[i];
|
||||
}
|
||||
_handle_copies(v->shader->compiler, instr, &ctx);
|
||||
} else {
|
||||
/* There may be both half copies and full copies, so we have to split
|
||||
* them up since they don't interfere.
|
||||
*/
|
||||
ctx.entry_count = 0;
|
||||
for (unsigned i = 0; i < entry_count; i++) {
|
||||
if (entries[i].flags & IR3_REG_HALF)
|
||||
ctx.entries[ctx.entry_count++] = entries[i];
|
||||
}
|
||||
_handle_copies(v->shader->compiler, instr, &ctx);
|
||||
if (v->mergedregs) {
|
||||
/* Half regs and full regs are in the same file, so handle everything
|
||||
* at once.
|
||||
*/
|
||||
ctx.entry_count = 0;
|
||||
for (unsigned i = 0; i < entry_count; i++) {
|
||||
if (!(entries[i].flags & IR3_REG_SHARED))
|
||||
ctx.entries[ctx.entry_count++] = entries[i];
|
||||
}
|
||||
_handle_copies(v->shader->compiler, instr, &ctx);
|
||||
} else {
|
||||
/* There may be both half copies and full copies, so we have to split
|
||||
* them up since they don't interfere.
|
||||
*/
|
||||
ctx.entry_count = 0;
|
||||
for (unsigned i = 0; i < entry_count; i++) {
|
||||
if (entries[i].flags & IR3_REG_HALF)
|
||||
ctx.entries[ctx.entry_count++] = entries[i];
|
||||
}
|
||||
_handle_copies(v->shader->compiler, instr, &ctx);
|
||||
|
||||
ctx.entry_count = 0;
|
||||
for (unsigned i = 0; i < entry_count; i++) {
|
||||
if (!(entries[i].flags & (IR3_REG_HALF | IR3_REG_SHARED)))
|
||||
ctx.entries[ctx.entry_count++] = entries[i];
|
||||
}
|
||||
_handle_copies(v->shader->compiler, instr, &ctx);
|
||||
}
|
||||
ctx.entry_count = 0;
|
||||
for (unsigned i = 0; i < entry_count; i++) {
|
||||
if (!(entries[i].flags & (IR3_REG_HALF | IR3_REG_SHARED)))
|
||||
ctx.entries[ctx.entry_count++] = entries[i];
|
||||
}
|
||||
_handle_copies(v->shader->compiler, instr, &ctx);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
ir3_lower_copies(struct ir3_shader_variant *v)
|
||||
{
|
||||
DECLARE_ARRAY(struct copy_entry, copies);
|
||||
copies_count = copies_sz = 0;
|
||||
copies = NULL;
|
||||
DECLARE_ARRAY(struct copy_entry, copies);
|
||||
copies_count = copies_sz = 0;
|
||||
copies = NULL;
|
||||
|
||||
foreach_block (block, &v->ir->block_list) {
|
||||
foreach_instr_safe (instr, &block->instr_list) {
|
||||
if (instr->opc == OPC_META_PARALLEL_COPY) {
|
||||
copies_count = 0;
|
||||
for (unsigned i = 0; i < instr->dsts_count; i++) {
|
||||
struct ir3_register *dst = instr->dsts[i];
|
||||
struct ir3_register *src = instr->srcs[i];
|
||||
unsigned flags = src->flags & (IR3_REG_HALF | IR3_REG_SHARED);
|
||||
unsigned dst_physreg = ra_reg_get_physreg(dst);
|
||||
for (unsigned j = 0; j < reg_elems(dst); j++) {
|
||||
array_insert(NULL, copies, (struct copy_entry) {
|
||||
.dst = dst_physreg + j * reg_elem_size(dst),
|
||||
.src = get_copy_src(src, j * reg_elem_size(dst)),
|
||||
.flags = flags,
|
||||
});
|
||||
}
|
||||
}
|
||||
handle_copies(v, instr, copies, copies_count);
|
||||
list_del(&instr->node);
|
||||
} else if (instr->opc == OPC_META_COLLECT) {
|
||||
copies_count = 0;
|
||||
struct ir3_register *dst = instr->dsts[0];
|
||||
unsigned flags = dst->flags & (IR3_REG_HALF | IR3_REG_SHARED);
|
||||
for (unsigned i = 0; i < instr->srcs_count; i++) {
|
||||
struct ir3_register *src = instr->srcs[i];
|
||||
array_insert(NULL, copies, (struct copy_entry) {
|
||||
.dst = ra_num_to_physreg(dst->num + i, flags),
|
||||
.src = get_copy_src(src, 0),
|
||||
.flags = flags,
|
||||
});
|
||||
}
|
||||
handle_copies(v, instr, copies, copies_count);
|
||||
list_del(&instr->node);
|
||||
} else if (instr->opc == OPC_META_SPLIT) {
|
||||
copies_count = 0;
|
||||
struct ir3_register *dst = instr->dsts[0];
|
||||
struct ir3_register *src = instr->srcs[0];
|
||||
unsigned flags = src->flags & (IR3_REG_HALF | IR3_REG_SHARED);
|
||||
array_insert(NULL, copies, (struct copy_entry) {
|
||||
.dst = ra_reg_get_physreg(dst),
|
||||
.src = get_copy_src(src, instr->split.off * reg_elem_size(dst)),
|
||||
.flags = flags,
|
||||
});
|
||||
handle_copies(v, instr, copies, copies_count);
|
||||
list_del(&instr->node);
|
||||
} else if (instr->opc == OPC_META_PHI) {
|
||||
list_del(&instr->node);
|
||||
}
|
||||
}
|
||||
}
|
||||
foreach_block (block, &v->ir->block_list) {
|
||||
foreach_instr_safe (instr, &block->instr_list) {
|
||||
if (instr->opc == OPC_META_PARALLEL_COPY) {
|
||||
copies_count = 0;
|
||||
for (unsigned i = 0; i < instr->dsts_count; i++) {
|
||||
struct ir3_register *dst = instr->dsts[i];
|
||||
struct ir3_register *src = instr->srcs[i];
|
||||
unsigned flags = src->flags & (IR3_REG_HALF | IR3_REG_SHARED);
|
||||
unsigned dst_physreg = ra_reg_get_physreg(dst);
|
||||
for (unsigned j = 0; j < reg_elems(dst); j++) {
|
||||
array_insert(
|
||||
NULL, copies,
|
||||
(struct copy_entry){
|
||||
.dst = dst_physreg + j * reg_elem_size(dst),
|
||||
.src = get_copy_src(src, j * reg_elem_size(dst)),
|
||||
.flags = flags,
|
||||
});
|
||||
}
|
||||
}
|
||||
handle_copies(v, instr, copies, copies_count);
|
||||
list_del(&instr->node);
|
||||
} else if (instr->opc == OPC_META_COLLECT) {
|
||||
copies_count = 0;
|
||||
struct ir3_register *dst = instr->dsts[0];
|
||||
unsigned flags = dst->flags & (IR3_REG_HALF | IR3_REG_SHARED);
|
||||
for (unsigned i = 0; i < instr->srcs_count; i++) {
|
||||
struct ir3_register *src = instr->srcs[i];
|
||||
array_insert(NULL, copies,
|
||||
(struct copy_entry){
|
||||
.dst = ra_num_to_physreg(dst->num + i, flags),
|
||||
.src = get_copy_src(src, 0),
|
||||
.flags = flags,
|
||||
});
|
||||
}
|
||||
handle_copies(v, instr, copies, copies_count);
|
||||
list_del(&instr->node);
|
||||
} else if (instr->opc == OPC_META_SPLIT) {
|
||||
copies_count = 0;
|
||||
struct ir3_register *dst = instr->dsts[0];
|
||||
struct ir3_register *src = instr->srcs[0];
|
||||
unsigned flags = src->flags & (IR3_REG_HALF | IR3_REG_SHARED);
|
||||
array_insert(NULL, copies,
|
||||
(struct copy_entry){
|
||||
.dst = ra_reg_get_physreg(dst),
|
||||
.src = get_copy_src(
|
||||
src, instr->split.off * reg_elem_size(dst)),
|
||||
.flags = flags,
|
||||
});
|
||||
handle_copies(v, instr, copies, copies_count);
|
||||
list_del(&instr->node);
|
||||
} else if (instr->opc == OPC_META_PHI) {
|
||||
list_del(&instr->node);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (copies)
|
||||
ralloc_free(copies);
|
||||
if (copies)
|
||||
ralloc_free(copies);
|
||||
}
|
||||
|
||||
|
|
|
@ -35,220 +35,224 @@
|
|||
|
||||
static void
|
||||
replace_pred(struct ir3_block *block, struct ir3_block *old_pred,
|
||||
struct ir3_block *new_pred)
|
||||
struct ir3_block *new_pred)
|
||||
{
|
||||
for (unsigned i = 0; i < block->predecessors_count; i++) {
|
||||
if (block->predecessors[i] == old_pred) {
|
||||
block->predecessors[i] = new_pred;
|
||||
return;
|
||||
}
|
||||
}
|
||||
for (unsigned i = 0; i < block->predecessors_count; i++) {
|
||||
if (block->predecessors[i] == old_pred) {
|
||||
block->predecessors[i] = new_pred;
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
replace_physical_pred(struct ir3_block *block, struct ir3_block *old_pred,
|
||||
struct ir3_block *new_pred)
|
||||
struct ir3_block *new_pred)
|
||||
{
|
||||
for (unsigned i = 0; i < block->physical_predecessors_count; i++) {
|
||||
if (block->physical_predecessors[i] == old_pred) {
|
||||
block->physical_predecessors[i] = new_pred;
|
||||
return;
|
||||
}
|
||||
}
|
||||
for (unsigned i = 0; i < block->physical_predecessors_count; i++) {
|
||||
if (block->physical_predecessors[i] == old_pred) {
|
||||
block->physical_predecessors[i] = new_pred;
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
mov_immed(struct ir3_register *dst, struct ir3_block *block, unsigned immed)
|
||||
{
|
||||
struct ir3_instruction *mov = ir3_instr_create(block, OPC_MOV, 1, 1);
|
||||
struct ir3_register *mov_dst = ir3_dst_create(mov, dst->num, dst->flags);
|
||||
mov_dst->wrmask = dst->wrmask;
|
||||
struct ir3_register *src =
|
||||
ir3_src_create(mov, INVALID_REG, (dst->flags & IR3_REG_HALF) | IR3_REG_IMMED);
|
||||
src->uim_val = immed;
|
||||
mov->cat1.dst_type = (dst->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
|
||||
mov->cat1.src_type = mov->cat1.dst_type;
|
||||
mov->repeat = util_last_bit(mov_dst->wrmask) - 1;
|
||||
struct ir3_instruction *mov = ir3_instr_create(block, OPC_MOV, 1, 1);
|
||||
struct ir3_register *mov_dst = ir3_dst_create(mov, dst->num, dst->flags);
|
||||
mov_dst->wrmask = dst->wrmask;
|
||||
struct ir3_register *src = ir3_src_create(
|
||||
mov, INVALID_REG, (dst->flags & IR3_REG_HALF) | IR3_REG_IMMED);
|
||||
src->uim_val = immed;
|
||||
mov->cat1.dst_type = (dst->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
|
||||
mov->cat1.src_type = mov->cat1.dst_type;
|
||||
mov->repeat = util_last_bit(mov_dst->wrmask) - 1;
|
||||
}
|
||||
|
||||
static struct ir3_block *
|
||||
split_block(struct ir3 *ir, struct ir3_block *before_block,
|
||||
struct ir3_instruction *instr, struct ir3_block **then)
|
||||
struct ir3_instruction *instr, struct ir3_block **then)
|
||||
{
|
||||
struct ir3_block *then_block = ir3_block_create(ir);
|
||||
struct ir3_block *after_block = ir3_block_create(ir);
|
||||
list_add(&then_block->node, &before_block->node);
|
||||
list_add(&after_block->node, &then_block->node);
|
||||
struct ir3_block *then_block = ir3_block_create(ir);
|
||||
struct ir3_block *after_block = ir3_block_create(ir);
|
||||
list_add(&then_block->node, &before_block->node);
|
||||
list_add(&after_block->node, &then_block->node);
|
||||
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(before_block->successors); i++) {
|
||||
after_block->successors[i] = before_block->successors[i];
|
||||
if (after_block->successors[i])
|
||||
replace_pred(after_block->successors[i], before_block, after_block);
|
||||
}
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(before_block->successors); i++) {
|
||||
after_block->successors[i] = before_block->successors[i];
|
||||
if (after_block->successors[i])
|
||||
replace_pred(after_block->successors[i], before_block, after_block);
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(before_block->physical_successors); i++) {
|
||||
after_block->physical_successors[i] = before_block->physical_successors[i];
|
||||
if (after_block->physical_successors[i]) {
|
||||
replace_physical_pred(after_block->physical_successors[i],
|
||||
before_block, after_block);
|
||||
}
|
||||
}
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(before_block->physical_successors);
|
||||
i++) {
|
||||
after_block->physical_successors[i] =
|
||||
before_block->physical_successors[i];
|
||||
if (after_block->physical_successors[i]) {
|
||||
replace_physical_pred(after_block->physical_successors[i],
|
||||
before_block, after_block);
|
||||
}
|
||||
}
|
||||
|
||||
before_block->successors[0] = then_block;
|
||||
before_block->successors[1] = after_block;
|
||||
before_block->physical_successors[0] = then_block;
|
||||
before_block->physical_successors[1] = after_block;
|
||||
ir3_block_add_predecessor(then_block, before_block);
|
||||
ir3_block_add_predecessor(after_block, before_block);
|
||||
ir3_block_add_physical_predecessor(then_block, before_block);
|
||||
ir3_block_add_physical_predecessor(after_block, before_block);
|
||||
before_block->successors[0] = then_block;
|
||||
before_block->successors[1] = after_block;
|
||||
before_block->physical_successors[0] = then_block;
|
||||
before_block->physical_successors[1] = after_block;
|
||||
ir3_block_add_predecessor(then_block, before_block);
|
||||
ir3_block_add_predecessor(after_block, before_block);
|
||||
ir3_block_add_physical_predecessor(then_block, before_block);
|
||||
ir3_block_add_physical_predecessor(after_block, before_block);
|
||||
|
||||
then_block->successors[0] = after_block;
|
||||
then_block->physical_successors[0] = after_block;
|
||||
ir3_block_add_predecessor(after_block, then_block);
|
||||
ir3_block_add_physical_predecessor(after_block, then_block);
|
||||
|
||||
foreach_instr_from_safe (rem_instr, &instr->node, &before_block->instr_list) {
|
||||
list_del(&rem_instr->node);
|
||||
list_addtail(&rem_instr->node, &after_block->instr_list);
|
||||
rem_instr->block = after_block;
|
||||
}
|
||||
then_block->successors[0] = after_block;
|
||||
then_block->physical_successors[0] = after_block;
|
||||
ir3_block_add_predecessor(after_block, then_block);
|
||||
ir3_block_add_physical_predecessor(after_block, then_block);
|
||||
|
||||
after_block->brtype = before_block->brtype;
|
||||
after_block->condition = before_block->condition;
|
||||
foreach_instr_from_safe (rem_instr, &instr->node,
|
||||
&before_block->instr_list) {
|
||||
list_del(&rem_instr->node);
|
||||
list_addtail(&rem_instr->node, &after_block->instr_list);
|
||||
rem_instr->block = after_block;
|
||||
}
|
||||
|
||||
*then = then_block;
|
||||
return after_block;
|
||||
after_block->brtype = before_block->brtype;
|
||||
after_block->condition = before_block->condition;
|
||||
|
||||
*then = then_block;
|
||||
return after_block;
|
||||
}
|
||||
|
||||
static bool
|
||||
lower_block(struct ir3 *ir, struct ir3_block **block)
|
||||
{
|
||||
bool progress = false;
|
||||
bool progress = false;
|
||||
|
||||
foreach_instr_safe (instr, &(*block)->instr_list) {
|
||||
switch (instr->opc) {
|
||||
case OPC_BALLOT_MACRO:
|
||||
case OPC_ANY_MACRO:
|
||||
case OPC_ALL_MACRO:
|
||||
case OPC_ELECT_MACRO:
|
||||
case OPC_READ_COND_MACRO:
|
||||
case OPC_READ_FIRST_MACRO:
|
||||
case OPC_SWZ_SHARED_MACRO:
|
||||
break;
|
||||
default:
|
||||
continue;
|
||||
}
|
||||
foreach_instr_safe (instr, &(*block)->instr_list) {
|
||||
switch (instr->opc) {
|
||||
case OPC_BALLOT_MACRO:
|
||||
case OPC_ANY_MACRO:
|
||||
case OPC_ALL_MACRO:
|
||||
case OPC_ELECT_MACRO:
|
||||
case OPC_READ_COND_MACRO:
|
||||
case OPC_READ_FIRST_MACRO:
|
||||
case OPC_SWZ_SHARED_MACRO:
|
||||
break;
|
||||
default:
|
||||
continue;
|
||||
}
|
||||
|
||||
struct ir3_block *before_block = *block;
|
||||
struct ir3_block *then_block;
|
||||
struct ir3_block *after_block =
|
||||
split_block(ir, before_block, instr, &then_block);
|
||||
struct ir3_block *before_block = *block;
|
||||
struct ir3_block *then_block;
|
||||
struct ir3_block *after_block =
|
||||
split_block(ir, before_block, instr, &then_block);
|
||||
|
||||
/* For ballot, the destination must be initialized to 0 before we do
|
||||
* the movmsk because the condition may be 0 and then the movmsk will
|
||||
* be skipped. Because it's a shared register we have to wrap the
|
||||
* initialization in a getone block.
|
||||
*/
|
||||
if (instr->opc == OPC_BALLOT_MACRO) {
|
||||
before_block->brtype = IR3_BRANCH_GETONE;
|
||||
before_block->condition = NULL;
|
||||
mov_immed(instr->dsts[0], then_block, 0);
|
||||
before_block = after_block;
|
||||
after_block = split_block(ir, before_block, instr, &then_block);
|
||||
}
|
||||
/* For ballot, the destination must be initialized to 0 before we do
|
||||
* the movmsk because the condition may be 0 and then the movmsk will
|
||||
* be skipped. Because it's a shared register we have to wrap the
|
||||
* initialization in a getone block.
|
||||
*/
|
||||
if (instr->opc == OPC_BALLOT_MACRO) {
|
||||
before_block->brtype = IR3_BRANCH_GETONE;
|
||||
before_block->condition = NULL;
|
||||
mov_immed(instr->dsts[0], then_block, 0);
|
||||
before_block = after_block;
|
||||
after_block = split_block(ir, before_block, instr, &then_block);
|
||||
}
|
||||
|
||||
switch (instr->opc) {
|
||||
case OPC_BALLOT_MACRO:
|
||||
case OPC_READ_COND_MACRO:
|
||||
case OPC_ANY_MACRO:
|
||||
case OPC_ALL_MACRO:
|
||||
before_block->condition = instr->srcs[0]->def->instr;
|
||||
break;
|
||||
default:
|
||||
before_block->condition = NULL;
|
||||
break;
|
||||
}
|
||||
switch (instr->opc) {
|
||||
case OPC_BALLOT_MACRO:
|
||||
case OPC_READ_COND_MACRO:
|
||||
case OPC_ANY_MACRO:
|
||||
case OPC_ALL_MACRO:
|
||||
before_block->condition = instr->srcs[0]->def->instr;
|
||||
break;
|
||||
default:
|
||||
before_block->condition = NULL;
|
||||
break;
|
||||
}
|
||||
|
||||
switch (instr->opc) {
|
||||
case OPC_BALLOT_MACRO:
|
||||
case OPC_READ_COND_MACRO:
|
||||
before_block->brtype = IR3_BRANCH_COND;
|
||||
break;
|
||||
case OPC_ANY_MACRO:
|
||||
before_block->brtype = IR3_BRANCH_ANY;
|
||||
break;
|
||||
case OPC_ALL_MACRO:
|
||||
before_block->brtype = IR3_BRANCH_ALL;
|
||||
break;
|
||||
case OPC_ELECT_MACRO:
|
||||
case OPC_READ_FIRST_MACRO:
|
||||
case OPC_SWZ_SHARED_MACRO:
|
||||
before_block->brtype = IR3_BRANCH_GETONE;
|
||||
break;
|
||||
default:
|
||||
unreachable("bad opcode");
|
||||
}
|
||||
switch (instr->opc) {
|
||||
case OPC_BALLOT_MACRO:
|
||||
case OPC_READ_COND_MACRO:
|
||||
before_block->brtype = IR3_BRANCH_COND;
|
||||
break;
|
||||
case OPC_ANY_MACRO:
|
||||
before_block->brtype = IR3_BRANCH_ANY;
|
||||
break;
|
||||
case OPC_ALL_MACRO:
|
||||
before_block->brtype = IR3_BRANCH_ALL;
|
||||
break;
|
||||
case OPC_ELECT_MACRO:
|
||||
case OPC_READ_FIRST_MACRO:
|
||||
case OPC_SWZ_SHARED_MACRO:
|
||||
before_block->brtype = IR3_BRANCH_GETONE;
|
||||
break;
|
||||
default:
|
||||
unreachable("bad opcode");
|
||||
}
|
||||
|
||||
switch (instr->opc) {
|
||||
case OPC_ALL_MACRO:
|
||||
case OPC_ANY_MACRO:
|
||||
case OPC_ELECT_MACRO:
|
||||
mov_immed(instr->dsts[0], then_block, 1);
|
||||
mov_immed(instr->dsts[0], before_block, 0);
|
||||
break;
|
||||
switch (instr->opc) {
|
||||
case OPC_ALL_MACRO:
|
||||
case OPC_ANY_MACRO:
|
||||
case OPC_ELECT_MACRO:
|
||||
mov_immed(instr->dsts[0], then_block, 1);
|
||||
mov_immed(instr->dsts[0], before_block, 0);
|
||||
break;
|
||||
|
||||
case OPC_BALLOT_MACRO: {
|
||||
unsigned comp_count = util_last_bit(instr->dsts[0]->wrmask);
|
||||
struct ir3_instruction *movmsk = ir3_instr_create(then_block, OPC_MOVMSK, 1, 0);
|
||||
ir3_dst_create(movmsk, instr->dsts[0]->num, instr->dsts[0]->flags);
|
||||
movmsk->repeat = comp_count - 1;
|
||||
break;
|
||||
}
|
||||
case OPC_BALLOT_MACRO: {
|
||||
unsigned comp_count = util_last_bit(instr->dsts[0]->wrmask);
|
||||
struct ir3_instruction *movmsk =
|
||||
ir3_instr_create(then_block, OPC_MOVMSK, 1, 0);
|
||||
ir3_dst_create(movmsk, instr->dsts[0]->num, instr->dsts[0]->flags);
|
||||
movmsk->repeat = comp_count - 1;
|
||||
break;
|
||||
}
|
||||
|
||||
case OPC_READ_COND_MACRO:
|
||||
case OPC_READ_FIRST_MACRO: {
|
||||
struct ir3_instruction *mov = ir3_instr_create(then_block, OPC_MOV, 1, 1);
|
||||
unsigned src = instr->opc == OPC_READ_COND_MACRO ? 1 : 0;
|
||||
ir3_dst_create(mov, instr->dsts[0]->num, instr->dsts[0]->flags);
|
||||
struct ir3_register *new_src = ir3_src_create(mov, 0, 0);
|
||||
*new_src = *instr->srcs[src];
|
||||
mov->cat1.dst_type = mov->cat1.src_type = TYPE_U32;
|
||||
break;
|
||||
}
|
||||
case OPC_READ_COND_MACRO:
|
||||
case OPC_READ_FIRST_MACRO: {
|
||||
struct ir3_instruction *mov =
|
||||
ir3_instr_create(then_block, OPC_MOV, 1, 1);
|
||||
unsigned src = instr->opc == OPC_READ_COND_MACRO ? 1 : 0;
|
||||
ir3_dst_create(mov, instr->dsts[0]->num, instr->dsts[0]->flags);
|
||||
struct ir3_register *new_src = ir3_src_create(mov, 0, 0);
|
||||
*new_src = *instr->srcs[src];
|
||||
mov->cat1.dst_type = mov->cat1.src_type = TYPE_U32;
|
||||
break;
|
||||
}
|
||||
|
||||
case OPC_SWZ_SHARED_MACRO: {
|
||||
struct ir3_instruction *swz =
|
||||
ir3_instr_create(then_block, OPC_SWZ, 2, 2);
|
||||
ir3_dst_create(swz, instr->dsts[0]->num, instr->dsts[0]->flags);
|
||||
ir3_dst_create(swz, instr->dsts[1]->num, instr->dsts[1]->flags);
|
||||
ir3_src_create(swz, instr->srcs[0]->num, instr->srcs[0]->flags);
|
||||
ir3_src_create(swz, instr->srcs[1]->num, instr->srcs[1]->flags);
|
||||
swz->cat1.dst_type = swz->cat1.src_type = TYPE_U32;
|
||||
swz->repeat = 1;
|
||||
break;
|
||||
}
|
||||
case OPC_SWZ_SHARED_MACRO: {
|
||||
struct ir3_instruction *swz =
|
||||
ir3_instr_create(then_block, OPC_SWZ, 2, 2);
|
||||
ir3_dst_create(swz, instr->dsts[0]->num, instr->dsts[0]->flags);
|
||||
ir3_dst_create(swz, instr->dsts[1]->num, instr->dsts[1]->flags);
|
||||
ir3_src_create(swz, instr->srcs[0]->num, instr->srcs[0]->flags);
|
||||
ir3_src_create(swz, instr->srcs[1]->num, instr->srcs[1]->flags);
|
||||
swz->cat1.dst_type = swz->cat1.src_type = TYPE_U32;
|
||||
swz->repeat = 1;
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
unreachable("bad opcode");
|
||||
}
|
||||
default:
|
||||
unreachable("bad opcode");
|
||||
}
|
||||
|
||||
*block = after_block;
|
||||
list_delinit(&instr->node);
|
||||
progress = true;
|
||||
}
|
||||
*block = after_block;
|
||||
list_delinit(&instr->node);
|
||||
progress = true;
|
||||
}
|
||||
|
||||
return progress;
|
||||
return progress;
|
||||
}
|
||||
|
||||
bool
|
||||
ir3_lower_subgroups(struct ir3 *ir)
|
||||
{
|
||||
bool progress = false;
|
||||
bool progress = false;
|
||||
|
||||
foreach_block (block, &ir->block_list)
|
||||
progress |= lower_block(ir, &block);
|
||||
foreach_block (block, &ir->block_list)
|
||||
progress |= lower_block(ir, &block);
|
||||
|
||||
return progress;
|
||||
return progress;
|
||||
}
|
||||
|
||||
|
|
|
@ -21,8 +21,8 @@
|
|||
* SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "ir3_ra.h"
|
||||
#include "ir3_compiler.h"
|
||||
#include "ir3_ra.h"
|
||||
#include "ralloc.h"
|
||||
|
||||
/* This pass "merges" compatible phi-web SSA values. First, we insert a bunch
|
||||
|
@ -49,7 +49,7 @@
|
|||
* try to allocate all the definitions in the same merge set to the
|
||||
* same/compatible registers. This helps us e.g. allocate sources of a collect
|
||||
* to contiguous registers without too much special code in RA.
|
||||
*
|
||||
*
|
||||
* In a "normal" register allocator, or when spilling, we'd just merge
|
||||
* registers in the same merge set to the same register, but with SSA-based
|
||||
* register allocation we may have to split the live interval.
|
||||
|
@ -71,13 +71,13 @@
|
|||
static unsigned
|
||||
index_instrs(struct ir3_block *block, unsigned index)
|
||||
{
|
||||
foreach_instr (instr, &block->instr_list)
|
||||
instr->ip = index++;
|
||||
foreach_instr (instr, &block->instr_list)
|
||||
instr->ip = index++;
|
||||
|
||||
for (unsigned i = 0; i < block->dom_children_count; i++)
|
||||
index = index_instrs(block->dom_children[i], index);
|
||||
for (unsigned i = 0; i < block->dom_children_count; i++)
|
||||
index = index_instrs(block->dom_children[i], index);
|
||||
|
||||
return index;
|
||||
return index;
|
||||
}
|
||||
|
||||
/* Definitions within a merge set are ordered by instr->ip as set above: */
|
||||
|
@ -85,27 +85,27 @@ index_instrs(struct ir3_block *block, unsigned index)
|
|||
static bool
|
||||
def_after(struct ir3_register *a, struct ir3_register *b)
|
||||
{
|
||||
return a->instr->ip > b->instr->ip;
|
||||
return a->instr->ip > b->instr->ip;
|
||||
}
|
||||
|
||||
static bool
|
||||
def_dominates(struct ir3_register *a, struct ir3_register *b)
|
||||
{
|
||||
if (def_after(a, b)) {
|
||||
return false;
|
||||
} else if (a->instr->block == b->instr->block) {
|
||||
return def_after(b, a);
|
||||
} else {
|
||||
return ir3_block_dominates(a->instr->block, b->instr->block);
|
||||
}
|
||||
if (def_after(a, b)) {
|
||||
return false;
|
||||
} else if (a->instr->block == b->instr->block) {
|
||||
return def_after(b, a);
|
||||
} else {
|
||||
return ir3_block_dominates(a->instr->block, b->instr->block);
|
||||
}
|
||||
}
|
||||
|
||||
/* This represents a region inside a register. The offset is relative to the
|
||||
* start of the register, and offset + size <= size(reg).
|
||||
*/
|
||||
struct def_value {
|
||||
struct ir3_register *reg;
|
||||
unsigned offset, size;
|
||||
struct ir3_register *reg;
|
||||
unsigned offset, size;
|
||||
};
|
||||
|
||||
/* Chase any copies to get the source of a region inside a register. This is
|
||||
|
@ -114,456 +114,452 @@ struct def_value {
|
|||
static struct def_value
|
||||
chase_copies(struct def_value value)
|
||||
{
|
||||
while (true) {
|
||||
struct ir3_instruction *instr = value.reg->instr;
|
||||
if (instr->opc == OPC_META_SPLIT) {
|
||||
value.offset += instr->split.off * reg_elem_size(value.reg);
|
||||
value.reg = instr->srcs[0]->def;
|
||||
} else if (instr->opc == OPC_META_COLLECT) {
|
||||
if (value.offset % reg_elem_size(value.reg) != 0 ||
|
||||
value.size > reg_elem_size(value.reg) ||
|
||||
value.offset + value.size > reg_size(value.reg))
|
||||
break;
|
||||
struct ir3_register *src = instr->srcs[value.offset / reg_elem_size(value.reg)];
|
||||
if (!src->def)
|
||||
break;
|
||||
value.offset = 0;
|
||||
value.reg = src->def;
|
||||
} else {
|
||||
/* TODO: parallelcopy */
|
||||
break;
|
||||
}
|
||||
}
|
||||
while (true) {
|
||||
struct ir3_instruction *instr = value.reg->instr;
|
||||
if (instr->opc == OPC_META_SPLIT) {
|
||||
value.offset += instr->split.off * reg_elem_size(value.reg);
|
||||
value.reg = instr->srcs[0]->def;
|
||||
} else if (instr->opc == OPC_META_COLLECT) {
|
||||
if (value.offset % reg_elem_size(value.reg) != 0 ||
|
||||
value.size > reg_elem_size(value.reg) ||
|
||||
value.offset + value.size > reg_size(value.reg))
|
||||
break;
|
||||
struct ir3_register *src =
|
||||
instr->srcs[value.offset / reg_elem_size(value.reg)];
|
||||
if (!src->def)
|
||||
break;
|
||||
value.offset = 0;
|
||||
value.reg = src->def;
|
||||
} else {
|
||||
/* TODO: parallelcopy */
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return value;
|
||||
return value;
|
||||
}
|
||||
|
||||
/* This represents an entry in the merge set, and consists of a register +
|
||||
* offset from the merge set base.
|
||||
*/
|
||||
struct merge_def {
|
||||
struct ir3_register *reg;
|
||||
unsigned offset;
|
||||
struct ir3_register *reg;
|
||||
unsigned offset;
|
||||
};
|
||||
|
||||
static bool
|
||||
can_skip_interference(const struct merge_def *a, const struct merge_def *b)
|
||||
{
|
||||
unsigned a_start = a->offset;
|
||||
unsigned b_start = b->offset;
|
||||
unsigned a_end = a_start + reg_size(a->reg);
|
||||
unsigned b_end = b_start + reg_size(b->reg);
|
||||
unsigned a_start = a->offset;
|
||||
unsigned b_start = b->offset;
|
||||
unsigned a_end = a_start + reg_size(a->reg);
|
||||
unsigned b_end = b_start + reg_size(b->reg);
|
||||
|
||||
/* Registers that don't overlap never interfere */
|
||||
if (a_end <= b_start || b_end <= a_start)
|
||||
return true;
|
||||
/* Registers that don't overlap never interfere */
|
||||
if (a_end <= b_start || b_end <= a_start)
|
||||
return true;
|
||||
|
||||
/* Disallow skipping interference unless one definition contains the
|
||||
* other. This restriction is important for register allocation, because
|
||||
* it means that at any given point in the program, the live values in a
|
||||
* given merge set will form a tree. If they didn't, then one live value
|
||||
* would partially overlap another, and they would have overlapping live
|
||||
* ranges because they're live at the same point. This simplifies register
|
||||
* allocation and spilling.
|
||||
*/
|
||||
if (!((a_start <= b_start && a_end >= b_end) ||
|
||||
(b_start <= a_start && b_end >= a_end)))
|
||||
return false;
|
||||
/* Disallow skipping interference unless one definition contains the
|
||||
* other. This restriction is important for register allocation, because
|
||||
* it means that at any given point in the program, the live values in a
|
||||
* given merge set will form a tree. If they didn't, then one live value
|
||||
* would partially overlap another, and they would have overlapping live
|
||||
* ranges because they're live at the same point. This simplifies register
|
||||
* allocation and spilling.
|
||||
*/
|
||||
if (!((a_start <= b_start && a_end >= b_end) ||
|
||||
(b_start <= a_start && b_end >= a_end)))
|
||||
return false;
|
||||
|
||||
/* For each register, chase the intersection of a and b to find the
|
||||
* ultimate source.
|
||||
*/
|
||||
unsigned start = MAX2(a_start, b_start);
|
||||
unsigned end = MIN2(a_end, b_end);
|
||||
struct def_value a_value =
|
||||
chase_copies((struct def_value) {
|
||||
.reg = a->reg,
|
||||
.offset = start - a_start,
|
||||
.size = end - start,
|
||||
});
|
||||
struct def_value b_value =
|
||||
chase_copies((struct def_value) {
|
||||
.reg = b->reg,
|
||||
.offset = start - b_start,
|
||||
.size = end - start,
|
||||
});
|
||||
return a_value.reg == b_value.reg && a_value.offset == b_value.offset;
|
||||
/* For each register, chase the intersection of a and b to find the
|
||||
* ultimate source.
|
||||
*/
|
||||
unsigned start = MAX2(a_start, b_start);
|
||||
unsigned end = MIN2(a_end, b_end);
|
||||
struct def_value a_value = chase_copies((struct def_value){
|
||||
.reg = a->reg,
|
||||
.offset = start - a_start,
|
||||
.size = end - start,
|
||||
});
|
||||
struct def_value b_value = chase_copies((struct def_value){
|
||||
.reg = b->reg,
|
||||
.offset = start - b_start,
|
||||
.size = end - start,
|
||||
});
|
||||
return a_value.reg == b_value.reg && a_value.offset == b_value.offset;
|
||||
}
|
||||
|
||||
static struct ir3_merge_set *
|
||||
get_merge_set(struct ir3_register *def)
|
||||
{
|
||||
if (def->merge_set)
|
||||
return def->merge_set;
|
||||
if (def->merge_set)
|
||||
return def->merge_set;
|
||||
|
||||
struct ir3_merge_set *set = ralloc(def, struct ir3_merge_set);
|
||||
set->preferred_reg = ~0;
|
||||
set->interval_start = ~0;
|
||||
set->size = reg_size(def);
|
||||
set->alignment = (def->flags & IR3_REG_HALF) ? 1 : 2;
|
||||
set->regs_count = 1;
|
||||
set->regs = ralloc(set, struct ir3_register *);
|
||||
set->regs[0] = def;
|
||||
struct ir3_merge_set *set = ralloc(def, struct ir3_merge_set);
|
||||
set->preferred_reg = ~0;
|
||||
set->interval_start = ~0;
|
||||
set->size = reg_size(def);
|
||||
set->alignment = (def->flags & IR3_REG_HALF) ? 1 : 2;
|
||||
set->regs_count = 1;
|
||||
set->regs = ralloc(set, struct ir3_register *);
|
||||
set->regs[0] = def;
|
||||
|
||||
return set;
|
||||
return set;
|
||||
}
|
||||
|
||||
/* Merges b into a */
|
||||
static struct ir3_merge_set *
|
||||
merge_merge_sets(struct ir3_merge_set *a, struct ir3_merge_set *b,
|
||||
int b_offset)
|
||||
merge_merge_sets(struct ir3_merge_set *a, struct ir3_merge_set *b, int b_offset)
|
||||
{
|
||||
if (b_offset < 0)
|
||||
return merge_merge_sets(b, a, -b_offset);
|
||||
if (b_offset < 0)
|
||||
return merge_merge_sets(b, a, -b_offset);
|
||||
|
||||
struct ir3_register **new_regs =
|
||||
rzalloc_array(a, struct ir3_register *, a->regs_count + b->regs_count);
|
||||
struct ir3_register **new_regs =
|
||||
rzalloc_array(a, struct ir3_register *, a->regs_count + b->regs_count);
|
||||
|
||||
unsigned a_index = 0, b_index = 0, new_index = 0;
|
||||
for (; a_index < a->regs_count || b_index < b->regs_count; new_index++) {
|
||||
if (b_index < b->regs_count &&
|
||||
(a_index == a->regs_count ||
|
||||
def_after(a->regs[a_index], b->regs[b_index]))) {
|
||||
new_regs[new_index] = b->regs[b_index++];
|
||||
new_regs[new_index]->merge_set_offset += b_offset;
|
||||
} else {
|
||||
new_regs[new_index] = a->regs[a_index++];
|
||||
}
|
||||
new_regs[new_index]->merge_set = a;
|
||||
}
|
||||
unsigned a_index = 0, b_index = 0, new_index = 0;
|
||||
for (; a_index < a->regs_count || b_index < b->regs_count; new_index++) {
|
||||
if (b_index < b->regs_count &&
|
||||
(a_index == a->regs_count ||
|
||||
def_after(a->regs[a_index], b->regs[b_index]))) {
|
||||
new_regs[new_index] = b->regs[b_index++];
|
||||
new_regs[new_index]->merge_set_offset += b_offset;
|
||||
} else {
|
||||
new_regs[new_index] = a->regs[a_index++];
|
||||
}
|
||||
new_regs[new_index]->merge_set = a;
|
||||
}
|
||||
|
||||
assert(new_index == a->regs_count + b->regs_count);
|
||||
assert(new_index == a->regs_count + b->regs_count);
|
||||
|
||||
/* Technically this should be the lcm, but because alignment is only 1 or
|
||||
* 2 so far this should be ok.
|
||||
*/
|
||||
a->alignment = MAX2(a->alignment, b->alignment);
|
||||
a->regs_count += b->regs_count;
|
||||
ralloc_free(a->regs);
|
||||
a->regs = new_regs;
|
||||
a->size = MAX2(a->size, b->size + b_offset);
|
||||
/* Technically this should be the lcm, but because alignment is only 1 or
|
||||
* 2 so far this should be ok.
|
||||
*/
|
||||
a->alignment = MAX2(a->alignment, b->alignment);
|
||||
a->regs_count += b->regs_count;
|
||||
ralloc_free(a->regs);
|
||||
a->regs = new_regs;
|
||||
a->size = MAX2(a->size, b->size + b_offset);
|
||||
|
||||
return a;
|
||||
return a;
|
||||
}
|
||||
|
||||
static bool
|
||||
merge_sets_interfere(struct ir3_liveness *live,
|
||||
struct ir3_merge_set *a, struct ir3_merge_set *b,
|
||||
int b_offset)
|
||||
merge_sets_interfere(struct ir3_liveness *live, struct ir3_merge_set *a,
|
||||
struct ir3_merge_set *b, int b_offset)
|
||||
{
|
||||
if (b_offset < 0)
|
||||
return merge_sets_interfere(live, b, a, -b_offset);
|
||||
if (b_offset < 0)
|
||||
return merge_sets_interfere(live, b, a, -b_offset);
|
||||
|
||||
struct merge_def dom[a->regs_count + b->regs_count];
|
||||
unsigned a_index = 0, b_index = 0;
|
||||
int dom_index = -1;
|
||||
struct merge_def dom[a->regs_count + b->regs_count];
|
||||
unsigned a_index = 0, b_index = 0;
|
||||
int dom_index = -1;
|
||||
|
||||
/* Reject trying to merge the sets if the alignment doesn't work out */
|
||||
if (b_offset % a->alignment != 0)
|
||||
return true;
|
||||
/* Reject trying to merge the sets if the alignment doesn't work out */
|
||||
if (b_offset % a->alignment != 0)
|
||||
return true;
|
||||
|
||||
while (a_index < a->regs_count || b_index < b->regs_count) {
|
||||
struct merge_def current;
|
||||
if (a_index == a->regs_count) {
|
||||
current.reg = b->regs[b_index];
|
||||
current.offset = current.reg->merge_set_offset + b_offset;
|
||||
b_index++;
|
||||
} else if (b_index == b->regs_count) {
|
||||
current.reg = a->regs[a_index];
|
||||
current.offset = current.reg->merge_set_offset;
|
||||
a_index++;
|
||||
} else {
|
||||
if (def_after(b->regs[b_index], a->regs[a_index])) {
|
||||
current.reg = a->regs[a_index];
|
||||
current.offset = current.reg->merge_set_offset;
|
||||
a_index++;
|
||||
} else {
|
||||
current.reg = b->regs[b_index];
|
||||
current.offset = current.reg->merge_set_offset + b_offset;
|
||||
b_index++;
|
||||
}
|
||||
}
|
||||
while (a_index < a->regs_count || b_index < b->regs_count) {
|
||||
struct merge_def current;
|
||||
if (a_index == a->regs_count) {
|
||||
current.reg = b->regs[b_index];
|
||||
current.offset = current.reg->merge_set_offset + b_offset;
|
||||
b_index++;
|
||||
} else if (b_index == b->regs_count) {
|
||||
current.reg = a->regs[a_index];
|
||||
current.offset = current.reg->merge_set_offset;
|
||||
a_index++;
|
||||
} else {
|
||||
if (def_after(b->regs[b_index], a->regs[a_index])) {
|
||||
current.reg = a->regs[a_index];
|
||||
current.offset = current.reg->merge_set_offset;
|
||||
a_index++;
|
||||
} else {
|
||||
current.reg = b->regs[b_index];
|
||||
current.offset = current.reg->merge_set_offset + b_offset;
|
||||
b_index++;
|
||||
}
|
||||
}
|
||||
|
||||
while (dom_index >= 0 &&
|
||||
!def_dominates(dom[dom_index].reg, current.reg)) {
|
||||
dom_index--;
|
||||
}
|
||||
while (dom_index >= 0 &&
|
||||
!def_dominates(dom[dom_index].reg, current.reg)) {
|
||||
dom_index--;
|
||||
}
|
||||
|
||||
/* TODO: in the original paper, just dom[dom_index] needs to be
|
||||
* checked for interference. We implement the value-chasing extension
|
||||
* as well as support for sub-registers, which complicates this
|
||||
* significantly because it's no longer the case that if a dominates b
|
||||
* dominates c and a and b don't interfere then we only need to check
|
||||
* interference between b and c to be sure a and c don't interfere --
|
||||
* this means we may have to check for interference against values
|
||||
* higher in the stack than dom[dom_index]. In the paper there's a
|
||||
* description of a way to do less interference tests with the
|
||||
* value-chasing extension, but we'd have to come up with something
|
||||
* ourselves for handling the similar problems that come up with
|
||||
* allowing values to contain subregisters. For now we just test
|
||||
* everything in the stack.
|
||||
*/
|
||||
for (int i = 0; i <= dom_index; i++) {
|
||||
if (can_skip_interference(&current, &dom[i]))
|
||||
continue;
|
||||
/* TODO: in the original paper, just dom[dom_index] needs to be
|
||||
* checked for interference. We implement the value-chasing extension
|
||||
* as well as support for sub-registers, which complicates this
|
||||
* significantly because it's no longer the case that if a dominates b
|
||||
* dominates c and a and b don't interfere then we only need to check
|
||||
* interference between b and c to be sure a and c don't interfere --
|
||||
* this means we may have to check for interference against values
|
||||
* higher in the stack than dom[dom_index]. In the paper there's a
|
||||
* description of a way to do less interference tests with the
|
||||
* value-chasing extension, but we'd have to come up with something
|
||||
* ourselves for handling the similar problems that come up with
|
||||
* allowing values to contain subregisters. For now we just test
|
||||
* everything in the stack.
|
||||
*/
|
||||
for (int i = 0; i <= dom_index; i++) {
|
||||
if (can_skip_interference(&current, &dom[i]))
|
||||
continue;
|
||||
|
||||
/* Ok, now we actually have to check interference. Since we know
|
||||
* that dom[i] dominates current, this boils down to checking
|
||||
* whether dom[i] is live after current.
|
||||
*/
|
||||
if (ir3_def_live_after(live, dom[i].reg, current.reg->instr))
|
||||
return true;
|
||||
}
|
||||
/* Ok, now we actually have to check interference. Since we know
|
||||
* that dom[i] dominates current, this boils down to checking
|
||||
* whether dom[i] is live after current.
|
||||
*/
|
||||
if (ir3_def_live_after(live, dom[i].reg, current.reg->instr))
|
||||
return true;
|
||||
}
|
||||
|
||||
dom[++dom_index] = current;
|
||||
}
|
||||
dom[++dom_index] = current;
|
||||
}
|
||||
|
||||
return false;
|
||||
return false;
|
||||
}
|
||||
|
||||
static void
|
||||
try_merge_defs(struct ir3_liveness *live,
|
||||
struct ir3_register *a, struct ir3_register *b,
|
||||
unsigned b_offset)
|
||||
try_merge_defs(struct ir3_liveness *live, struct ir3_register *a,
|
||||
struct ir3_register *b, unsigned b_offset)
|
||||
{
|
||||
struct ir3_merge_set *a_set = get_merge_set(a);
|
||||
struct ir3_merge_set *b_set = get_merge_set(b);
|
||||
struct ir3_merge_set *a_set = get_merge_set(a);
|
||||
struct ir3_merge_set *b_set = get_merge_set(b);
|
||||
|
||||
if (a_set == b_set) {
|
||||
/* Note: Even in this case we may not always successfully be able to
|
||||
* coalesce this copy, if the offsets don't line up. But in any
|
||||
* case, we can't do anything.
|
||||
*/
|
||||
return;
|
||||
}
|
||||
if (a_set == b_set) {
|
||||
/* Note: Even in this case we may not always successfully be able to
|
||||
* coalesce this copy, if the offsets don't line up. But in any
|
||||
* case, we can't do anything.
|
||||
*/
|
||||
return;
|
||||
}
|
||||
|
||||
int b_set_offset = a->merge_set_offset + b_offset - b->merge_set_offset;
|
||||
int b_set_offset = a->merge_set_offset + b_offset - b->merge_set_offset;
|
||||
|
||||
if (!merge_sets_interfere(live, a_set, b_set, b_set_offset))
|
||||
merge_merge_sets(a_set, b_set, b_set_offset);
|
||||
if (!merge_sets_interfere(live, a_set, b_set, b_set_offset))
|
||||
merge_merge_sets(a_set, b_set, b_set_offset);
|
||||
}
|
||||
|
||||
static void
|
||||
coalesce_phi(struct ir3_liveness *live,
|
||||
struct ir3_instruction *phi)
|
||||
coalesce_phi(struct ir3_liveness *live, struct ir3_instruction *phi)
|
||||
{
|
||||
for (unsigned i = 0; i < phi->srcs_count; i++) {
|
||||
if (phi->srcs[i]->def)
|
||||
try_merge_defs(live, phi->dsts[0], phi->srcs[i]->def, 0);
|
||||
}
|
||||
for (unsigned i = 0; i < phi->srcs_count; i++) {
|
||||
if (phi->srcs[i]->def)
|
||||
try_merge_defs(live, phi->dsts[0], phi->srcs[i]->def, 0);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
aggressive_coalesce_parallel_copy(struct ir3_liveness *live,
|
||||
struct ir3_instruction *pcopy)
|
||||
struct ir3_instruction *pcopy)
|
||||
{
|
||||
for (unsigned i = 0; i < pcopy->dsts_count; i++) {
|
||||
if (!(pcopy->srcs[i]->flags & IR3_REG_SSA))
|
||||
continue;
|
||||
try_merge_defs(live, pcopy->dsts[i], pcopy->srcs[i]->def, 0);
|
||||
}
|
||||
for (unsigned i = 0; i < pcopy->dsts_count; i++) {
|
||||
if (!(pcopy->srcs[i]->flags & IR3_REG_SSA))
|
||||
continue;
|
||||
try_merge_defs(live, pcopy->dsts[i], pcopy->srcs[i]->def, 0);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
aggressive_coalesce_split(struct ir3_liveness *live,
|
||||
struct ir3_instruction *split)
|
||||
struct ir3_instruction *split)
|
||||
{
|
||||
try_merge_defs(live, split->srcs[0]->def, split->dsts[0],
|
||||
split->split.off * reg_elem_size(split->dsts[0]));
|
||||
try_merge_defs(live, split->srcs[0]->def, split->dsts[0],
|
||||
split->split.off * reg_elem_size(split->dsts[0]));
|
||||
}
|
||||
|
||||
static void
|
||||
aggressive_coalesce_collect(struct ir3_liveness *live,
|
||||
struct ir3_instruction *collect)
|
||||
struct ir3_instruction *collect)
|
||||
{
|
||||
for (unsigned i = 0, offset = 0; i < collect->srcs_count;
|
||||
offset += reg_elem_size(collect->srcs[i]), i++) {
|
||||
if (!(collect->srcs[i]->flags & IR3_REG_SSA))
|
||||
continue;
|
||||
try_merge_defs(live, collect->dsts[0], collect->srcs[i]->def, offset);
|
||||
}
|
||||
for (unsigned i = 0, offset = 0; i < collect->srcs_count;
|
||||
offset += reg_elem_size(collect->srcs[i]), i++) {
|
||||
if (!(collect->srcs[i]->flags & IR3_REG_SSA))
|
||||
continue;
|
||||
try_merge_defs(live, collect->dsts[0], collect->srcs[i]->def, offset);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
create_parallel_copy(struct ir3_block *block)
|
||||
{
|
||||
for (unsigned i = 0; i < 2; i++) {
|
||||
if (!block->successors[i])
|
||||
continue;
|
||||
for (unsigned i = 0; i < 2; i++) {
|
||||
if (!block->successors[i])
|
||||
continue;
|
||||
|
||||
struct ir3_block *succ = block->successors[i];
|
||||
struct ir3_block *succ = block->successors[i];
|
||||
|
||||
unsigned pred_idx = ir3_block_get_pred_index(succ, block);
|
||||
unsigned pred_idx = ir3_block_get_pred_index(succ, block);
|
||||
|
||||
unsigned phi_count = 0;
|
||||
foreach_instr (phi, &succ->instr_list) {
|
||||
if (phi->opc != OPC_META_PHI)
|
||||
break;
|
||||
unsigned phi_count = 0;
|
||||
foreach_instr (phi, &succ->instr_list) {
|
||||
if (phi->opc != OPC_META_PHI)
|
||||
break;
|
||||
|
||||
/* Avoid undef */
|
||||
if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
|
||||
!phi->srcs[pred_idx]->def)
|
||||
continue;
|
||||
/* Avoid undef */
|
||||
if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
|
||||
!phi->srcs[pred_idx]->def)
|
||||
continue;
|
||||
|
||||
/* We don't support critical edges. If we were to support them,
|
||||
* we'd need to insert parallel copies after the phi node to solve
|
||||
* the lost-copy problem.
|
||||
*/
|
||||
assert(i == 0 && !block->successors[1]);
|
||||
phi_count++;
|
||||
}
|
||||
/* We don't support critical edges. If we were to support them,
|
||||
* we'd need to insert parallel copies after the phi node to solve
|
||||
* the lost-copy problem.
|
||||
*/
|
||||
assert(i == 0 && !block->successors[1]);
|
||||
phi_count++;
|
||||
}
|
||||
|
||||
if (phi_count == 0)
|
||||
continue;
|
||||
if (phi_count == 0)
|
||||
continue;
|
||||
|
||||
struct ir3_register *src[phi_count];
|
||||
unsigned j = 0;
|
||||
foreach_instr (phi, &succ->instr_list) {
|
||||
if (phi->opc != OPC_META_PHI)
|
||||
break;
|
||||
if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
|
||||
!phi->srcs[pred_idx]->def)
|
||||
continue;
|
||||
src[j++] = phi->srcs[pred_idx];
|
||||
}
|
||||
assert(j == phi_count);
|
||||
struct ir3_register *src[phi_count];
|
||||
unsigned j = 0;
|
||||
foreach_instr (phi, &succ->instr_list) {
|
||||
if (phi->opc != OPC_META_PHI)
|
||||
break;
|
||||
if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
|
||||
!phi->srcs[pred_idx]->def)
|
||||
continue;
|
||||
src[j++] = phi->srcs[pred_idx];
|
||||
}
|
||||
assert(j == phi_count);
|
||||
|
||||
struct ir3_instruction *pcopy =
|
||||
ir3_instr_create(block, OPC_META_PARALLEL_COPY, phi_count, phi_count);
|
||||
|
||||
for (j = 0; j < phi_count; j++) {
|
||||
struct ir3_register *reg = __ssa_dst(pcopy);
|
||||
reg->flags |= src[j]->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
|
||||
reg->size = reg_elems(src[j]);
|
||||
}
|
||||
struct ir3_instruction *pcopy =
|
||||
ir3_instr_create(block, OPC_META_PARALLEL_COPY, phi_count, phi_count);
|
||||
|
||||
for (j = 0; j < phi_count; j++) {
|
||||
pcopy->srcs[pcopy->srcs_count++] = ir3_reg_clone(block->shader, src[j]);
|
||||
}
|
||||
for (j = 0; j < phi_count; j++) {
|
||||
struct ir3_register *reg = __ssa_dst(pcopy);
|
||||
reg->flags |= src[j]->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
|
||||
reg->size = reg_elems(src[j]);
|
||||
}
|
||||
|
||||
j = 0;
|
||||
foreach_instr (phi, &succ->instr_list) {
|
||||
if (phi->opc != OPC_META_PHI)
|
||||
break;
|
||||
if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
|
||||
!phi->srcs[pred_idx]->def)
|
||||
continue;
|
||||
phi->srcs[pred_idx]->def = pcopy->dsts[j];
|
||||
phi->srcs[pred_idx]->flags = pcopy->dsts[j]->flags;
|
||||
j++;
|
||||
}
|
||||
assert(j == phi_count);
|
||||
}
|
||||
for (j = 0; j < phi_count; j++) {
|
||||
pcopy->srcs[pcopy->srcs_count++] =
|
||||
ir3_reg_clone(block->shader, src[j]);
|
||||
}
|
||||
|
||||
j = 0;
|
||||
foreach_instr (phi, &succ->instr_list) {
|
||||
if (phi->opc != OPC_META_PHI)
|
||||
break;
|
||||
if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
|
||||
!phi->srcs[pred_idx]->def)
|
||||
continue;
|
||||
phi->srcs[pred_idx]->def = pcopy->dsts[j];
|
||||
phi->srcs[pred_idx]->flags = pcopy->dsts[j]->flags;
|
||||
j++;
|
||||
}
|
||||
assert(j == phi_count);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
ir3_create_parallel_copies(struct ir3 *ir)
|
||||
{
|
||||
foreach_block (block, &ir->block_list) {
|
||||
create_parallel_copy(block);
|
||||
}
|
||||
foreach_block (block, &ir->block_list) {
|
||||
create_parallel_copy(block);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
index_merge_sets(struct ir3 *ir)
|
||||
{
|
||||
unsigned offset = 0;
|
||||
foreach_block (block, &ir->block_list) {
|
||||
foreach_instr (instr, &block->instr_list) {
|
||||
for (unsigned i = 0; i < instr->dsts_count; i++) {
|
||||
struct ir3_register *dst = instr->dsts[i];
|
||||
unsigned offset = 0;
|
||||
foreach_block (block, &ir->block_list) {
|
||||
foreach_instr (instr, &block->instr_list) {
|
||||
for (unsigned i = 0; i < instr->dsts_count; i++) {
|
||||
struct ir3_register *dst = instr->dsts[i];
|
||||
|
||||
unsigned dst_offset;
|
||||
struct ir3_merge_set *merge_set = dst->merge_set;
|
||||
unsigned size = reg_size(dst);
|
||||
if (merge_set) {
|
||||
if (merge_set->interval_start == ~0) {
|
||||
merge_set->interval_start = offset;
|
||||
offset += merge_set->size;
|
||||
}
|
||||
dst_offset = merge_set->interval_start + dst->merge_set_offset;
|
||||
} else {
|
||||
dst_offset = offset;
|
||||
offset += size;
|
||||
}
|
||||
unsigned dst_offset;
|
||||
struct ir3_merge_set *merge_set = dst->merge_set;
|
||||
unsigned size = reg_size(dst);
|
||||
if (merge_set) {
|
||||
if (merge_set->interval_start == ~0) {
|
||||
merge_set->interval_start = offset;
|
||||
offset += merge_set->size;
|
||||
}
|
||||
dst_offset = merge_set->interval_start + dst->merge_set_offset;
|
||||
} else {
|
||||
dst_offset = offset;
|
||||
offset += size;
|
||||
}
|
||||
|
||||
dst->interval_start = dst_offset;
|
||||
dst->interval_end = dst_offset + size;
|
||||
}
|
||||
}
|
||||
}
|
||||
dst->interval_start = dst_offset;
|
||||
dst->interval_end = dst_offset + size;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#define RESET "\x1b[0m"
|
||||
#define BLUE "\x1b[0;34m"
|
||||
#define SYN_SSA(x) BLUE x RESET
|
||||
#define RESET "\x1b[0m"
|
||||
#define BLUE "\x1b[0;34m"
|
||||
#define SYN_SSA(x) BLUE x RESET
|
||||
|
||||
static void
|
||||
dump_merge_sets(struct ir3 *ir)
|
||||
{
|
||||
printf("merge sets:\n");
|
||||
struct set *merge_sets = _mesa_pointer_set_create(NULL);
|
||||
foreach_block (block, &ir->block_list) {
|
||||
foreach_instr (instr, &block->instr_list) {
|
||||
for (unsigned i = 0; i < instr->dsts_count; i++) {
|
||||
struct ir3_register *dst = instr->dsts[i];
|
||||
printf("merge sets:\n");
|
||||
struct set *merge_sets = _mesa_pointer_set_create(NULL);
|
||||
foreach_block (block, &ir->block_list) {
|
||||
foreach_instr (instr, &block->instr_list) {
|
||||
for (unsigned i = 0; i < instr->dsts_count; i++) {
|
||||
struct ir3_register *dst = instr->dsts[i];
|
||||
|
||||
struct ir3_merge_set *merge_set = dst->merge_set;
|
||||
if (!merge_set || _mesa_set_search(merge_sets, merge_set))
|
||||
continue;
|
||||
struct ir3_merge_set *merge_set = dst->merge_set;
|
||||
if (!merge_set || _mesa_set_search(merge_sets, merge_set))
|
||||
continue;
|
||||
|
||||
printf("merge set, size %u, align %u:\n", merge_set->size, merge_set->alignment);
|
||||
for (unsigned j = 0; j < merge_set->regs_count; j++) {
|
||||
struct ir3_register *reg = merge_set->regs[j];
|
||||
printf("\t"SYN_SSA("ssa_%u")":%u, offset %u\n", reg->instr->serialno,
|
||||
reg->name, reg->merge_set_offset);
|
||||
}
|
||||
printf("merge set, size %u, align %u:\n", merge_set->size,
|
||||
merge_set->alignment);
|
||||
for (unsigned j = 0; j < merge_set->regs_count; j++) {
|
||||
struct ir3_register *reg = merge_set->regs[j];
|
||||
printf("\t" SYN_SSA("ssa_%u") ":%u, offset %u\n",
|
||||
reg->instr->serialno, reg->name, reg->merge_set_offset);
|
||||
}
|
||||
|
||||
_mesa_set_add(merge_sets, merge_set);
|
||||
}
|
||||
}
|
||||
}
|
||||
_mesa_set_add(merge_sets, merge_set);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ralloc_free(merge_sets);
|
||||
ralloc_free(merge_sets);
|
||||
}
|
||||
|
||||
void
|
||||
ir3_merge_regs(struct ir3_liveness *live, struct ir3 *ir)
|
||||
{
|
||||
index_instrs(ir3_start_block(ir), 0);
|
||||
index_instrs(ir3_start_block(ir), 0);
|
||||
|
||||
/* First pass: coalesce phis, which must be together. */
|
||||
foreach_block (block, &ir->block_list) {
|
||||
foreach_instr (instr, &block->instr_list) {
|
||||
if (instr->opc != OPC_META_PHI)
|
||||
break;
|
||||
/* First pass: coalesce phis, which must be together. */
|
||||
foreach_block (block, &ir->block_list) {
|
||||
foreach_instr (instr, &block->instr_list) {
|
||||
if (instr->opc != OPC_META_PHI)
|
||||
break;
|
||||
|
||||
coalesce_phi(live, instr);
|
||||
}
|
||||
}
|
||||
coalesce_phi(live, instr);
|
||||
}
|
||||
}
|
||||
|
||||
/* Second pass: aggressively coalesce parallelcopy, split, collect */
|
||||
foreach_block (block, &ir->block_list) {
|
||||
foreach_instr (instr, &block->instr_list) {
|
||||
switch (instr->opc) {
|
||||
case OPC_META_SPLIT:
|
||||
aggressive_coalesce_split(live, instr);
|
||||
break;
|
||||
case OPC_META_COLLECT:
|
||||
aggressive_coalesce_collect(live, instr);
|
||||
break;
|
||||
case OPC_META_PARALLEL_COPY:
|
||||
aggressive_coalesce_parallel_copy(live, instr);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
/* Second pass: aggressively coalesce parallelcopy, split, collect */
|
||||
foreach_block (block, &ir->block_list) {
|
||||
foreach_instr (instr, &block->instr_list) {
|
||||
switch (instr->opc) {
|
||||
case OPC_META_SPLIT:
|
||||
aggressive_coalesce_split(live, instr);
|
||||
break;
|
||||
case OPC_META_COLLECT:
|
||||
aggressive_coalesce_collect(live, instr);
|
||||
break;
|
||||
case OPC_META_PARALLEL_COPY:
|
||||
aggressive_coalesce_parallel_copy(live, instr);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
index_merge_sets(ir);
|
||||
index_merge_sets(ir);
|
||||
|
||||
if (ir3_shader_debug & IR3_DBG_RAMSGS)
|
||||
dump_merge_sets(ir);
|
||||
if (ir3_shader_debug & IR3_DBG_RAMSGS)
|
||||
dump_merge_sets(ir);
|
||||
}
|
||||
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -43,15 +43,19 @@ bool ir3_nir_move_varying_inputs(nir_shader *shader);
|
|||
int ir3_nir_coord_offset(nir_ssa_def *ssa);
|
||||
bool ir3_nir_lower_tex_prefetch(nir_shader *shader);
|
||||
|
||||
|
||||
void ir3_nir_lower_to_explicit_output(nir_shader *shader,
|
||||
struct ir3_shader_variant *v, unsigned topology);
|
||||
void ir3_nir_lower_to_explicit_input(nir_shader *shader, struct ir3_shader_variant *v);
|
||||
void ir3_nir_lower_tess_ctrl(nir_shader *shader, struct ir3_shader_variant *v, unsigned topology);
|
||||
void ir3_nir_lower_tess_eval(nir_shader *shader, struct ir3_shader_variant *v, unsigned topology);
|
||||
struct ir3_shader_variant *v,
|
||||
unsigned topology);
|
||||
void ir3_nir_lower_to_explicit_input(nir_shader *shader,
|
||||
struct ir3_shader_variant *v);
|
||||
void ir3_nir_lower_tess_ctrl(nir_shader *shader, struct ir3_shader_variant *v,
|
||||
unsigned topology);
|
||||
void ir3_nir_lower_tess_eval(nir_shader *shader, struct ir3_shader_variant *v,
|
||||
unsigned topology);
|
||||
void ir3_nir_lower_gs(nir_shader *shader);
|
||||
|
||||
const nir_shader_compiler_options * ir3_get_compiler_options(struct ir3_compiler *compiler);
|
||||
const nir_shader_compiler_options *
|
||||
ir3_get_compiler_options(struct ir3_compiler *compiler);
|
||||
void ir3_optimize_loop(struct ir3_compiler *compiler, nir_shader *s);
|
||||
void ir3_nir_lower_io_to_temporaries(nir_shader *s);
|
||||
void ir3_finalize_nir(struct ir3_compiler *compiler, nir_shader *s);
|
||||
|
@ -59,29 +63,30 @@ void ir3_nir_post_finalize(struct ir3_compiler *compiler, nir_shader *s);
|
|||
void ir3_nir_lower_variant(struct ir3_shader_variant *so, nir_shader *s);
|
||||
|
||||
void ir3_setup_const_state(nir_shader *nir, struct ir3_shader_variant *v,
|
||||
struct ir3_const_state *const_state);
|
||||
struct ir3_const_state *const_state);
|
||||
bool ir3_nir_lower_load_constant(nir_shader *nir, struct ir3_shader_variant *v);
|
||||
void ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader_variant *v);
|
||||
bool ir3_nir_lower_ubo_loads(nir_shader *nir, struct ir3_shader_variant *v);
|
||||
bool ir3_nir_fixup_load_uniform(nir_shader *nir);
|
||||
|
||||
nir_ssa_def *
|
||||
ir3_nir_try_propagate_bit_shift(nir_builder *b, nir_ssa_def *offset, int32_t shift);
|
||||
nir_ssa_def *ir3_nir_try_propagate_bit_shift(nir_builder *b,
|
||||
nir_ssa_def *offset,
|
||||
int32_t shift);
|
||||
|
||||
static inline nir_intrinsic_instr *
|
||||
ir3_bindless_resource(nir_src src)
|
||||
{
|
||||
if (!src.is_ssa)
|
||||
return NULL;
|
||||
if (!src.is_ssa)
|
||||
return NULL;
|
||||
|
||||
if (src.ssa->parent_instr->type != nir_instr_type_intrinsic)
|
||||
return NULL;
|
||||
if (src.ssa->parent_instr->type != nir_instr_type_intrinsic)
|
||||
return NULL;
|
||||
|
||||
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(src.ssa->parent_instr);
|
||||
if (intrin->intrinsic != nir_intrinsic_bindless_resource_ir3)
|
||||
return NULL;
|
||||
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(src.ssa->parent_instr);
|
||||
if (intrin->intrinsic != nir_intrinsic_bindless_resource_ir3)
|
||||
return NULL;
|
||||
|
||||
return intrin;
|
||||
return intrin;
|
||||
}
|
||||
|
||||
#endif /* IR3_NIR_H_ */
|
||||
|
|
|
@ -21,54 +21,55 @@
|
|||
* SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "ir3_nir.h"
|
||||
#include "ir3_compiler.h"
|
||||
#include "compiler/nir/nir.h"
|
||||
#include "compiler/nir/nir_builder.h"
|
||||
#include "util/u_math.h"
|
||||
#include "ir3_compiler.h"
|
||||
#include "ir3_nir.h"
|
||||
|
||||
static inline bool
|
||||
get_ubo_load_range(nir_shader *nir, nir_intrinsic_instr *instr, uint32_t alignment, struct ir3_ubo_range *r)
|
||||
get_ubo_load_range(nir_shader *nir, nir_intrinsic_instr *instr,
|
||||
uint32_t alignment, struct ir3_ubo_range *r)
|
||||
{
|
||||
uint32_t offset = nir_intrinsic_range_base(instr);
|
||||
uint32_t size = nir_intrinsic_range(instr);
|
||||
uint32_t offset = nir_intrinsic_range_base(instr);
|
||||
uint32_t size = nir_intrinsic_range(instr);
|
||||
|
||||
/* If the offset is constant, the range is trivial (and NIR may not have
|
||||
* figured it out).
|
||||
*/
|
||||
if (nir_src_is_const(instr->src[1])) {
|
||||
offset = nir_src_as_uint(instr->src[1]);
|
||||
size = nir_intrinsic_dest_components(instr) * 4;
|
||||
}
|
||||
/* If the offset is constant, the range is trivial (and NIR may not have
|
||||
* figured it out).
|
||||
*/
|
||||
if (nir_src_is_const(instr->src[1])) {
|
||||
offset = nir_src_as_uint(instr->src[1]);
|
||||
size = nir_intrinsic_dest_components(instr) * 4;
|
||||
}
|
||||
|
||||
/* If we haven't figured out the range accessed in the UBO, bail. */
|
||||
if (size == ~0)
|
||||
return false;
|
||||
/* If we haven't figured out the range accessed in the UBO, bail. */
|
||||
if (size == ~0)
|
||||
return false;
|
||||
|
||||
r->start = ROUND_DOWN_TO(offset, alignment * 16);
|
||||
r->end = ALIGN(offset + size, alignment * 16);
|
||||
r->start = ROUND_DOWN_TO(offset, alignment * 16);
|
||||
r->end = ALIGN(offset + size, alignment * 16);
|
||||
|
||||
return true;
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
get_ubo_info(nir_intrinsic_instr *instr, struct ir3_ubo_info *ubo)
|
||||
{
|
||||
if (nir_src_is_const(instr->src[0])) {
|
||||
ubo->block = nir_src_as_uint(instr->src[0]);
|
||||
ubo->bindless_base = 0;
|
||||
ubo->bindless = false;
|
||||
return true;
|
||||
} else {
|
||||
nir_intrinsic_instr *rsrc = ir3_bindless_resource(instr->src[0]);
|
||||
if (rsrc && nir_src_is_const(rsrc->src[0])) {
|
||||
ubo->block = nir_src_as_uint(rsrc->src[0]);
|
||||
ubo->bindless_base = nir_intrinsic_desc_set(rsrc);
|
||||
ubo->bindless = true;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
if (nir_src_is_const(instr->src[0])) {
|
||||
ubo->block = nir_src_as_uint(instr->src[0]);
|
||||
ubo->bindless_base = 0;
|
||||
ubo->bindless = false;
|
||||
return true;
|
||||
} else {
|
||||
nir_intrinsic_instr *rsrc = ir3_bindless_resource(instr->src[0]);
|
||||
if (rsrc && nir_src_is_const(rsrc->src[0])) {
|
||||
ubo->block = nir_src_as_uint(rsrc->src[0]);
|
||||
ubo->bindless_base = nir_intrinsic_desc_set(rsrc);
|
||||
ubo->bindless = true;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -76,24 +77,23 @@ get_ubo_info(nir_intrinsic_instr *instr, struct ir3_ubo_info *ubo)
|
|||
*/
|
||||
static const struct ir3_ubo_range *
|
||||
get_existing_range(nir_intrinsic_instr *instr,
|
||||
const struct ir3_ubo_analysis_state *state,
|
||||
struct ir3_ubo_range *r)
|
||||
const struct ir3_ubo_analysis_state *state,
|
||||
struct ir3_ubo_range *r)
|
||||
{
|
||||
struct ir3_ubo_info ubo = {};
|
||||
struct ir3_ubo_info ubo = {};
|
||||
|
||||
if (!get_ubo_info(instr, &ubo))
|
||||
return NULL;
|
||||
if (!get_ubo_info(instr, &ubo))
|
||||
return NULL;
|
||||
|
||||
for (int i = 0; i < state->num_enabled; i++) {
|
||||
const struct ir3_ubo_range *range = &state->range[i];
|
||||
if (!memcmp(&range->ubo, &ubo, sizeof(ubo)) &&
|
||||
r->start >= range->start &&
|
||||
r->end <= range->end) {
|
||||
return range;
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < state->num_enabled; i++) {
|
||||
const struct ir3_ubo_range *range = &state->range[i];
|
||||
if (!memcmp(&range->ubo, &ubo, sizeof(ubo)) && r->start >= range->start &&
|
||||
r->end <= range->end) {
|
||||
return range;
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -103,26 +103,26 @@ get_existing_range(nir_intrinsic_instr *instr,
|
|||
static void
|
||||
merge_neighbors(struct ir3_ubo_analysis_state *state, int index)
|
||||
{
|
||||
struct ir3_ubo_range *a = &state->range[index];
|
||||
struct ir3_ubo_range *a = &state->range[index];
|
||||
|
||||
/* index is always the first slot that would have neighbored/overlapped with
|
||||
* the new range.
|
||||
*/
|
||||
for (int i = index + 1; i < state->num_enabled; i++) {
|
||||
struct ir3_ubo_range *b = &state->range[i];
|
||||
if (memcmp(&a->ubo, &b->ubo, sizeof(a->ubo)))
|
||||
continue;
|
||||
/* index is always the first slot that would have neighbored/overlapped with
|
||||
* the new range.
|
||||
*/
|
||||
for (int i = index + 1; i < state->num_enabled; i++) {
|
||||
struct ir3_ubo_range *b = &state->range[i];
|
||||
if (memcmp(&a->ubo, &b->ubo, sizeof(a->ubo)))
|
||||
continue;
|
||||
|
||||
if (a->start > b->end || a->end < b->start)
|
||||
continue;
|
||||
if (a->start > b->end || a->end < b->start)
|
||||
continue;
|
||||
|
||||
/* Merge B into A. */
|
||||
a->start = MIN2(a->start, b->start);
|
||||
a->end = MAX2(a->end, b->end);
|
||||
/* Merge B into A. */
|
||||
a->start = MIN2(a->start, b->start);
|
||||
a->end = MAX2(a->end, b->end);
|
||||
|
||||
/* Swap the last enabled range into B's now unused slot */
|
||||
*b = state->range[--state->num_enabled];
|
||||
}
|
||||
/* Swap the last enabled range into B's now unused slot */
|
||||
*b = state->range[--state->num_enabled];
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -134,59 +134,59 @@ merge_neighbors(struct ir3_ubo_analysis_state *state, int index)
|
|||
*/
|
||||
static void
|
||||
gather_ubo_ranges(nir_shader *nir, nir_intrinsic_instr *instr,
|
||||
struct ir3_ubo_analysis_state *state, uint32_t alignment,
|
||||
uint32_t *upload_remaining)
|
||||
struct ir3_ubo_analysis_state *state, uint32_t alignment,
|
||||
uint32_t *upload_remaining)
|
||||
{
|
||||
if (ir3_shader_debug & IR3_DBG_NOUBOOPT)
|
||||
return;
|
||||
if (ir3_shader_debug & IR3_DBG_NOUBOOPT)
|
||||
return;
|
||||
|
||||
struct ir3_ubo_info ubo = {};
|
||||
if (!get_ubo_info(instr, &ubo))
|
||||
return;
|
||||
struct ir3_ubo_info ubo = {};
|
||||
if (!get_ubo_info(instr, &ubo))
|
||||
return;
|
||||
|
||||
struct ir3_ubo_range r;
|
||||
if (!get_ubo_load_range(nir, instr, alignment, &r))
|
||||
return;
|
||||
struct ir3_ubo_range r;
|
||||
if (!get_ubo_load_range(nir, instr, alignment, &r))
|
||||
return;
|
||||
|
||||
/* See if there's an existing range for this UBO we want to merge into. */
|
||||
for (int i = 0; i < state->num_enabled; i++) {
|
||||
struct ir3_ubo_range *plan_r = &state->range[i];
|
||||
if (memcmp(&plan_r->ubo, &ubo, sizeof(ubo)))
|
||||
continue;
|
||||
/* See if there's an existing range for this UBO we want to merge into. */
|
||||
for (int i = 0; i < state->num_enabled; i++) {
|
||||
struct ir3_ubo_range *plan_r = &state->range[i];
|
||||
if (memcmp(&plan_r->ubo, &ubo, sizeof(ubo)))
|
||||
continue;
|
||||
|
||||
/* Don't extend existing uploads unless they're
|
||||
* neighboring/overlapping.
|
||||
*/
|
||||
if (r.start > plan_r->end || r.end < plan_r->start)
|
||||
continue;
|
||||
/* Don't extend existing uploads unless they're
|
||||
* neighboring/overlapping.
|
||||
*/
|
||||
if (r.start > plan_r->end || r.end < plan_r->start)
|
||||
continue;
|
||||
|
||||
r.start = MIN2(r.start, plan_r->start);
|
||||
r.end = MAX2(r.end, plan_r->end);
|
||||
r.start = MIN2(r.start, plan_r->start);
|
||||
r.end = MAX2(r.end, plan_r->end);
|
||||
|
||||
uint32_t added = (plan_r->start - r.start) + (r.end - plan_r->end);
|
||||
if (added >= *upload_remaining)
|
||||
return;
|
||||
uint32_t added = (plan_r->start - r.start) + (r.end - plan_r->end);
|
||||
if (added >= *upload_remaining)
|
||||
return;
|
||||
|
||||
plan_r->start = r.start;
|
||||
plan_r->end = r.end;
|
||||
*upload_remaining -= added;
|
||||
plan_r->start = r.start;
|
||||
plan_r->end = r.end;
|
||||
*upload_remaining -= added;
|
||||
|
||||
merge_neighbors(state, i);
|
||||
return;
|
||||
}
|
||||
merge_neighbors(state, i);
|
||||
return;
|
||||
}
|
||||
|
||||
if (state->num_enabled == ARRAY_SIZE(state->range))
|
||||
return;
|
||||
if (state->num_enabled == ARRAY_SIZE(state->range))
|
||||
return;
|
||||
|
||||
uint32_t added = r.end - r.start;
|
||||
if (added >= *upload_remaining)
|
||||
return;
|
||||
uint32_t added = r.end - r.start;
|
||||
if (added >= *upload_remaining)
|
||||
return;
|
||||
|
||||
struct ir3_ubo_range *plan_r = &state->range[state->num_enabled++];
|
||||
plan_r->ubo = ubo;
|
||||
plan_r->start = r.start;
|
||||
plan_r->end = r.end;
|
||||
*upload_remaining -= added;
|
||||
struct ir3_ubo_range *plan_r = &state->range[state->num_enabled++];
|
||||
plan_r->ubo = ubo;
|
||||
plan_r->start = r.start;
|
||||
plan_r->end = r.end;
|
||||
*upload_remaining -= added;
|
||||
}
|
||||
|
||||
/* For indirect offset, it is common to see a pattern of multiple
|
||||
|
@ -197,7 +197,8 @@ gather_ubo_ranges(nir_shader *nir, nir_intrinsic_instr *instr,
|
|||
*
|
||||
* Detect this, and peel out the const_offset part, to end up with:
|
||||
*
|
||||
* vec4 32 ssa_34 = intrinsic load_uniform (ssa_base) (base=N+const_offset, 0, 0)
|
||||
* vec4 32 ssa_34 = intrinsic load_uniform (ssa_base) (base=N+const_offset,
|
||||
* 0, 0)
|
||||
*
|
||||
* Or similarly:
|
||||
*
|
||||
|
@ -207,7 +208,8 @@ gather_ubo_ranges(nir_shader *nir, nir_intrinsic_instr *instr,
|
|||
* Can be converted to:
|
||||
*
|
||||
* vec1 32 ssa_base = imul24 a, b
|
||||
* vec4 32 ssa_34 = intrinsic load_uniform (ssa_base) (base=N+const_offset, 0, 0)
|
||||
* vec4 32 ssa_34 = intrinsic load_uniform (ssa_base) (base=N+const_offset,
|
||||
* 0, 0)
|
||||
*
|
||||
* This gives the other opt passes something much easier to work
|
||||
* with (ie. not requiring value range tracking)
|
||||
|
@ -215,38 +217,38 @@ gather_ubo_ranges(nir_shader *nir, nir_intrinsic_instr *instr,
|
|||
static void
|
||||
handle_partial_const(nir_builder *b, nir_ssa_def **srcp, int *offp)
|
||||
{
|
||||
if ((*srcp)->parent_instr->type != nir_instr_type_alu)
|
||||
return;
|
||||
if ((*srcp)->parent_instr->type != nir_instr_type_alu)
|
||||
return;
|
||||
|
||||
nir_alu_instr *alu = nir_instr_as_alu((*srcp)->parent_instr);
|
||||
nir_alu_instr *alu = nir_instr_as_alu((*srcp)->parent_instr);
|
||||
|
||||
if (alu->op == nir_op_imad24_ir3) {
|
||||
/* This case is slightly more complicated as we need to
|
||||
* replace the imad24_ir3 with an imul24:
|
||||
*/
|
||||
if (!nir_src_is_const(alu->src[2].src))
|
||||
return;
|
||||
if (alu->op == nir_op_imad24_ir3) {
|
||||
/* This case is slightly more complicated as we need to
|
||||
* replace the imad24_ir3 with an imul24:
|
||||
*/
|
||||
if (!nir_src_is_const(alu->src[2].src))
|
||||
return;
|
||||
|
||||
*offp += nir_src_as_uint(alu->src[2].src);
|
||||
*srcp = nir_imul24(b, nir_ssa_for_alu_src(b, alu, 0),
|
||||
nir_ssa_for_alu_src(b, alu, 1));
|
||||
*offp += nir_src_as_uint(alu->src[2].src);
|
||||
*srcp = nir_imul24(b, nir_ssa_for_alu_src(b, alu, 0),
|
||||
nir_ssa_for_alu_src(b, alu, 1));
|
||||
|
||||
return;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (alu->op != nir_op_iadd)
|
||||
return;
|
||||
if (alu->op != nir_op_iadd)
|
||||
return;
|
||||
|
||||
if (!(alu->src[0].src.is_ssa && alu->src[1].src.is_ssa))
|
||||
return;
|
||||
if (!(alu->src[0].src.is_ssa && alu->src[1].src.is_ssa))
|
||||
return;
|
||||
|
||||
if (nir_src_is_const(alu->src[0].src)) {
|
||||
*offp += nir_src_as_uint(alu->src[0].src);
|
||||
*srcp = alu->src[1].src.ssa;
|
||||
} else if (nir_src_is_const(alu->src[1].src)) {
|
||||
*srcp = alu->src[0].src.ssa;
|
||||
*offp += nir_src_as_uint(alu->src[1].src);
|
||||
}
|
||||
if (nir_src_is_const(alu->src[0].src)) {
|
||||
*offp += nir_src_as_uint(alu->src[0].src);
|
||||
*srcp = alu->src[1].src.ssa;
|
||||
} else if (nir_src_is_const(alu->src[1].src)) {
|
||||
*srcp = alu->src[0].src.ssa;
|
||||
*offp += nir_src_as_uint(alu->src[1].src);
|
||||
}
|
||||
}
|
||||
|
||||
/* Tracks the maximum bindful UBO accessed so that we reduce the UBO
|
||||
|
@ -255,258 +257,256 @@ handle_partial_const(nir_builder *b, nir_ssa_def **srcp, int *offp)
|
|||
static void
|
||||
track_ubo_use(nir_intrinsic_instr *instr, nir_builder *b, int *num_ubos)
|
||||
{
|
||||
if (ir3_bindless_resource(instr->src[0])) {
|
||||
assert(!b->shader->info.first_ubo_is_default_ubo); /* only set for GL */
|
||||
return;
|
||||
}
|
||||
if (ir3_bindless_resource(instr->src[0])) {
|
||||
assert(!b->shader->info.first_ubo_is_default_ubo); /* only set for GL */
|
||||
return;
|
||||
}
|
||||
|
||||
if (nir_src_is_const(instr->src[0])) {
|
||||
int block = nir_src_as_uint(instr->src[0]);
|
||||
*num_ubos = MAX2(*num_ubos, block + 1);
|
||||
} else {
|
||||
*num_ubos = b->shader->info.num_ubos;
|
||||
}
|
||||
if (nir_src_is_const(instr->src[0])) {
|
||||
int block = nir_src_as_uint(instr->src[0]);
|
||||
*num_ubos = MAX2(*num_ubos, block + 1);
|
||||
} else {
|
||||
*num_ubos = b->shader->info.num_ubos;
|
||||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
lower_ubo_load_to_uniform(nir_intrinsic_instr *instr, nir_builder *b,
|
||||
const struct ir3_ubo_analysis_state *state,
|
||||
int *num_ubos, uint32_t alignment)
|
||||
const struct ir3_ubo_analysis_state *state,
|
||||
int *num_ubos, uint32_t alignment)
|
||||
{
|
||||
b->cursor = nir_before_instr(&instr->instr);
|
||||
b->cursor = nir_before_instr(&instr->instr);
|
||||
|
||||
struct ir3_ubo_range r;
|
||||
if (!get_ubo_load_range(b->shader, instr, alignment, &r)) {
|
||||
track_ubo_use(instr, b, num_ubos);
|
||||
return false;
|
||||
}
|
||||
struct ir3_ubo_range r;
|
||||
if (!get_ubo_load_range(b->shader, instr, alignment, &r)) {
|
||||
track_ubo_use(instr, b, num_ubos);
|
||||
return false;
|
||||
}
|
||||
|
||||
/* We don't lower dynamic block index UBO loads to load_uniform, but we
|
||||
* could probably with some effort determine a block stride in number of
|
||||
* registers.
|
||||
*/
|
||||
const struct ir3_ubo_range *range = get_existing_range(instr, state, &r);
|
||||
if (!range) {
|
||||
track_ubo_use(instr, b, num_ubos);
|
||||
return false;
|
||||
}
|
||||
/* We don't lower dynamic block index UBO loads to load_uniform, but we
|
||||
* could probably with some effort determine a block stride in number of
|
||||
* registers.
|
||||
*/
|
||||
const struct ir3_ubo_range *range = get_existing_range(instr, state, &r);
|
||||
if (!range) {
|
||||
track_ubo_use(instr, b, num_ubos);
|
||||
return false;
|
||||
}
|
||||
|
||||
nir_ssa_def *ubo_offset = nir_ssa_for_src(b, instr->src[1], 1);
|
||||
int const_offset = 0;
|
||||
nir_ssa_def *ubo_offset = nir_ssa_for_src(b, instr->src[1], 1);
|
||||
int const_offset = 0;
|
||||
|
||||
handle_partial_const(b, &ubo_offset, &const_offset);
|
||||
handle_partial_const(b, &ubo_offset, &const_offset);
|
||||
|
||||
/* UBO offset is in bytes, but uniform offset is in units of
|
||||
* dwords, so we need to divide by 4 (right-shift by 2). For ldc the
|
||||
* offset is in units of 16 bytes, so we need to multiply by 4. And
|
||||
* also the same for the constant part of the offset:
|
||||
*/
|
||||
const int shift = -2;
|
||||
nir_ssa_def *new_offset = ir3_nir_try_propagate_bit_shift(b, ubo_offset, -2);
|
||||
nir_ssa_def *uniform_offset = NULL;
|
||||
if (new_offset) {
|
||||
uniform_offset = new_offset;
|
||||
} else {
|
||||
uniform_offset = shift > 0 ?
|
||||
nir_ishl(b, ubo_offset, nir_imm_int(b, shift)) :
|
||||
nir_ushr(b, ubo_offset, nir_imm_int(b, -shift));
|
||||
}
|
||||
/* UBO offset is in bytes, but uniform offset is in units of
|
||||
* dwords, so we need to divide by 4 (right-shift by 2). For ldc the
|
||||
* offset is in units of 16 bytes, so we need to multiply by 4. And
|
||||
* also the same for the constant part of the offset:
|
||||
*/
|
||||
const int shift = -2;
|
||||
nir_ssa_def *new_offset = ir3_nir_try_propagate_bit_shift(b, ubo_offset, -2);
|
||||
nir_ssa_def *uniform_offset = NULL;
|
||||
if (new_offset) {
|
||||
uniform_offset = new_offset;
|
||||
} else {
|
||||
uniform_offset = shift > 0
|
||||
? nir_ishl(b, ubo_offset, nir_imm_int(b, shift))
|
||||
: nir_ushr(b, ubo_offset, nir_imm_int(b, -shift));
|
||||
}
|
||||
|
||||
debug_assert(!(const_offset & 0x3));
|
||||
const_offset >>= 2;
|
||||
debug_assert(!(const_offset & 0x3));
|
||||
const_offset >>= 2;
|
||||
|
||||
const int range_offset = ((int)range->offset - (int)range->start) / 4;
|
||||
const_offset += range_offset;
|
||||
const int range_offset = ((int)range->offset - (int)range->start) / 4;
|
||||
const_offset += range_offset;
|
||||
|
||||
/* The range_offset could be negative; if only part of the UBO
|
||||
* block is accessed, range->start can be greater than range->offset.
|
||||
* But we can't underflow const_offset. If necessary we need to
|
||||
* insert nir instructions to compensate (which can hopefully be
|
||||
* optimized away)
|
||||
*/
|
||||
if (const_offset < 0) {
|
||||
uniform_offset = nir_iadd_imm(b, uniform_offset, const_offset);
|
||||
const_offset = 0;
|
||||
}
|
||||
/* The range_offset could be negative; if only part of the UBO
|
||||
* block is accessed, range->start can be greater than range->offset.
|
||||
* But we can't underflow const_offset. If necessary we need to
|
||||
* insert nir instructions to compensate (which can hopefully be
|
||||
* optimized away)
|
||||
*/
|
||||
if (const_offset < 0) {
|
||||
uniform_offset = nir_iadd_imm(b, uniform_offset, const_offset);
|
||||
const_offset = 0;
|
||||
}
|
||||
|
||||
nir_ssa_def *uniform =
|
||||
nir_load_uniform(b, instr->num_components, instr->dest.ssa.bit_size, uniform_offset, .base = const_offset);
|
||||
nir_ssa_def *uniform =
|
||||
nir_load_uniform(b, instr->num_components, instr->dest.ssa.bit_size,
|
||||
uniform_offset, .base = const_offset);
|
||||
|
||||
nir_ssa_def_rewrite_uses(&instr->dest.ssa,
|
||||
uniform);
|
||||
nir_ssa_def_rewrite_uses(&instr->dest.ssa, uniform);
|
||||
|
||||
nir_instr_remove(&instr->instr);
|
||||
nir_instr_remove(&instr->instr);
|
||||
|
||||
return true;
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
instr_is_load_ubo(nir_instr *instr)
|
||||
{
|
||||
if (instr->type != nir_instr_type_intrinsic)
|
||||
return false;
|
||||
if (instr->type != nir_instr_type_intrinsic)
|
||||
return false;
|
||||
|
||||
nir_intrinsic_op op = nir_instr_as_intrinsic(instr)->intrinsic;
|
||||
nir_intrinsic_op op = nir_instr_as_intrinsic(instr)->intrinsic;
|
||||
|
||||
/* nir_lower_ubo_vec4 happens after this pass. */
|
||||
assert(op != nir_intrinsic_load_ubo_vec4);
|
||||
/* nir_lower_ubo_vec4 happens after this pass. */
|
||||
assert(op != nir_intrinsic_load_ubo_vec4);
|
||||
|
||||
return op == nir_intrinsic_load_ubo;
|
||||
return op == nir_intrinsic_load_ubo;
|
||||
}
|
||||
|
||||
void
|
||||
ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader_variant *v)
|
||||
{
|
||||
struct ir3_const_state *const_state = ir3_const_state(v);
|
||||
struct ir3_ubo_analysis_state *state = &const_state->ubo_state;
|
||||
struct ir3_compiler *compiler = v->shader->compiler;
|
||||
struct ir3_const_state *const_state = ir3_const_state(v);
|
||||
struct ir3_ubo_analysis_state *state = &const_state->ubo_state;
|
||||
struct ir3_compiler *compiler = v->shader->compiler;
|
||||
|
||||
/* Limit our uploads to the amount of constant buffer space available in
|
||||
* the hardware, minus what the shader compiler may need for various
|
||||
* driver params. We do this UBO-to-push-constant before the real
|
||||
* allocation of the driver params' const space, because UBO pointers can
|
||||
* be driver params but this pass usually eliminates them.
|
||||
*/
|
||||
struct ir3_const_state worst_case_const_state = { };
|
||||
ir3_setup_const_state(nir, v, &worst_case_const_state);
|
||||
const uint32_t max_upload = (ir3_max_const(v) -
|
||||
worst_case_const_state.offsets.immediate) * 16;
|
||||
/* Limit our uploads to the amount of constant buffer space available in
|
||||
* the hardware, minus what the shader compiler may need for various
|
||||
* driver params. We do this UBO-to-push-constant before the real
|
||||
* allocation of the driver params' const space, because UBO pointers can
|
||||
* be driver params but this pass usually eliminates them.
|
||||
*/
|
||||
struct ir3_const_state worst_case_const_state = {};
|
||||
ir3_setup_const_state(nir, v, &worst_case_const_state);
|
||||
const uint32_t max_upload =
|
||||
(ir3_max_const(v) - worst_case_const_state.offsets.immediate) * 16;
|
||||
|
||||
memset(state, 0, sizeof(*state));
|
||||
memset(state, 0, sizeof(*state));
|
||||
|
||||
uint32_t upload_remaining = max_upload;
|
||||
nir_foreach_function (function, nir) {
|
||||
if (function->impl) {
|
||||
nir_foreach_block (block, function->impl) {
|
||||
nir_foreach_instr (instr, block) {
|
||||
if (instr_is_load_ubo(instr))
|
||||
gather_ubo_ranges(nir, nir_instr_as_intrinsic(instr),
|
||||
state, compiler->const_upload_unit,
|
||||
&upload_remaining);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
uint32_t upload_remaining = max_upload;
|
||||
nir_foreach_function (function, nir) {
|
||||
if (function->impl) {
|
||||
nir_foreach_block (block, function->impl) {
|
||||
nir_foreach_instr (instr, block) {
|
||||
if (instr_is_load_ubo(instr))
|
||||
gather_ubo_ranges(nir, nir_instr_as_intrinsic(instr), state,
|
||||
compiler->const_upload_unit,
|
||||
&upload_remaining);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* For now, everything we upload is accessed statically and thus will be
|
||||
* used by the shader. Once we can upload dynamically indexed data, we may
|
||||
* upload sparsely accessed arrays, at which point we probably want to
|
||||
* give priority to smaller UBOs, on the assumption that big UBOs will be
|
||||
* accessed dynamically. Alternatively, we can track statically and
|
||||
* dynamically accessed ranges separately and upload static ranges
|
||||
* first.
|
||||
*/
|
||||
/* For now, everything we upload is accessed statically and thus will be
|
||||
* used by the shader. Once we can upload dynamically indexed data, we may
|
||||
* upload sparsely accessed arrays, at which point we probably want to
|
||||
* give priority to smaller UBOs, on the assumption that big UBOs will be
|
||||
* accessed dynamically. Alternatively, we can track statically and
|
||||
* dynamically accessed ranges separately and upload static ranges
|
||||
* first.
|
||||
*/
|
||||
|
||||
uint32_t offset = v->shader->num_reserved_user_consts * 16;
|
||||
for (uint32_t i = 0; i < state->num_enabled; i++) {
|
||||
uint32_t range_size = state->range[i].end - state->range[i].start;
|
||||
uint32_t offset = v->shader->num_reserved_user_consts * 16;
|
||||
for (uint32_t i = 0; i < state->num_enabled; i++) {
|
||||
uint32_t range_size = state->range[i].end - state->range[i].start;
|
||||
|
||||
debug_assert(offset <= max_upload);
|
||||
state->range[i].offset = offset;
|
||||
assert(offset <= max_upload);
|
||||
offset += range_size;
|
||||
|
||||
}
|
||||
state->size = offset;
|
||||
debug_assert(offset <= max_upload);
|
||||
state->range[i].offset = offset;
|
||||
assert(offset <= max_upload);
|
||||
offset += range_size;
|
||||
}
|
||||
state->size = offset;
|
||||
}
|
||||
|
||||
bool
|
||||
ir3_nir_lower_ubo_loads(nir_shader *nir, struct ir3_shader_variant *v)
|
||||
{
|
||||
struct ir3_compiler *compiler = v->shader->compiler;
|
||||
/* For the binning pass variant, we re-use the corresponding draw-pass
|
||||
* variant's const_state and ubo state. To make this clear, in this
|
||||
* pass it is const (read-only)
|
||||
*/
|
||||
const struct ir3_const_state *const_state = ir3_const_state(v);
|
||||
const struct ir3_ubo_analysis_state *state = &const_state->ubo_state;
|
||||
struct ir3_compiler *compiler = v->shader->compiler;
|
||||
/* For the binning pass variant, we re-use the corresponding draw-pass
|
||||
* variant's const_state and ubo state. To make this clear, in this
|
||||
* pass it is const (read-only)
|
||||
*/
|
||||
const struct ir3_const_state *const_state = ir3_const_state(v);
|
||||
const struct ir3_ubo_analysis_state *state = &const_state->ubo_state;
|
||||
|
||||
int num_ubos = 0;
|
||||
bool progress = false;
|
||||
nir_foreach_function (function, nir) {
|
||||
if (function->impl) {
|
||||
nir_builder builder;
|
||||
nir_builder_init(&builder, function->impl);
|
||||
nir_foreach_block (block, function->impl) {
|
||||
nir_foreach_instr_safe (instr, block) {
|
||||
if (!instr_is_load_ubo(instr))
|
||||
continue;
|
||||
progress |=
|
||||
lower_ubo_load_to_uniform(nir_instr_as_intrinsic(instr),
|
||||
&builder, state, &num_ubos,
|
||||
compiler->const_upload_unit);
|
||||
}
|
||||
}
|
||||
int num_ubos = 0;
|
||||
bool progress = false;
|
||||
nir_foreach_function (function, nir) {
|
||||
if (function->impl) {
|
||||
nir_builder builder;
|
||||
nir_builder_init(&builder, function->impl);
|
||||
nir_foreach_block (block, function->impl) {
|
||||
nir_foreach_instr_safe (instr, block) {
|
||||
if (!instr_is_load_ubo(instr))
|
||||
continue;
|
||||
progress |= lower_ubo_load_to_uniform(
|
||||
nir_instr_as_intrinsic(instr), &builder, state, &num_ubos,
|
||||
compiler->const_upload_unit);
|
||||
}
|
||||
}
|
||||
|
||||
nir_metadata_preserve(function->impl, nir_metadata_block_index |
|
||||
nir_metadata_dominance);
|
||||
}
|
||||
}
|
||||
/* Update the num_ubos field for GL (first_ubo_is_default_ubo). With
|
||||
* Vulkan's bindless, we don't use the num_ubos field, so we can leave it
|
||||
* incremented.
|
||||
*/
|
||||
if (nir->info.first_ubo_is_default_ubo)
|
||||
nir->info.num_ubos = num_ubos;
|
||||
nir_metadata_preserve(
|
||||
function->impl, nir_metadata_block_index | nir_metadata_dominance);
|
||||
}
|
||||
}
|
||||
/* Update the num_ubos field for GL (first_ubo_is_default_ubo). With
|
||||
* Vulkan's bindless, we don't use the num_ubos field, so we can leave it
|
||||
* incremented.
|
||||
*/
|
||||
if (nir->info.first_ubo_is_default_ubo)
|
||||
nir->info.num_ubos = num_ubos;
|
||||
|
||||
return progress;
|
||||
return progress;
|
||||
}
|
||||
|
||||
|
||||
static bool
|
||||
fixup_load_uniform_filter(const nir_instr *instr, const void *arg)
|
||||
{
|
||||
if (instr->type != nir_instr_type_intrinsic)
|
||||
return false;
|
||||
return nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_uniform;
|
||||
if (instr->type != nir_instr_type_intrinsic)
|
||||
return false;
|
||||
return nir_instr_as_intrinsic(instr)->intrinsic ==
|
||||
nir_intrinsic_load_uniform;
|
||||
}
|
||||
|
||||
static nir_ssa_def *
|
||||
fixup_load_uniform_instr(struct nir_builder *b, nir_instr *instr, void *arg)
|
||||
{
|
||||
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
|
||||
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
|
||||
|
||||
/* We don't need to worry about non-indirect case: */
|
||||
if (nir_src_is_const(intr->src[0]))
|
||||
return NULL;
|
||||
/* We don't need to worry about non-indirect case: */
|
||||
if (nir_src_is_const(intr->src[0]))
|
||||
return NULL;
|
||||
|
||||
const unsigned base_offset_limit = (1 << 9); /* 9 bits */
|
||||
unsigned base_offset = nir_intrinsic_base(intr);
|
||||
const unsigned base_offset_limit = (1 << 9); /* 9 bits */
|
||||
unsigned base_offset = nir_intrinsic_base(intr);
|
||||
|
||||
/* Or cases where base offset is lower than the hw limit: */
|
||||
if (base_offset < base_offset_limit)
|
||||
return NULL;
|
||||
/* Or cases where base offset is lower than the hw limit: */
|
||||
if (base_offset < base_offset_limit)
|
||||
return NULL;
|
||||
|
||||
b->cursor = nir_before_instr(instr);
|
||||
b->cursor = nir_before_instr(instr);
|
||||
|
||||
nir_ssa_def *offset = nir_ssa_for_src(b, intr->src[0], 1);
|
||||
nir_ssa_def *offset = nir_ssa_for_src(b, intr->src[0], 1);
|
||||
|
||||
/* We'd like to avoid a sequence like:
|
||||
*
|
||||
* vec4 32 ssa_18 = intrinsic load_uniform (ssa_4) (1024, 0, 0)
|
||||
* vec4 32 ssa_19 = intrinsic load_uniform (ssa_4) (1072, 0, 0)
|
||||
* vec4 32 ssa_20 = intrinsic load_uniform (ssa_4) (1120, 0, 0)
|
||||
*
|
||||
* From turning into a unique offset value (which requires reloading
|
||||
* a0.x for each instruction). So instead of just adding the constant
|
||||
* base_offset to the non-const offset, be a bit more clever and only
|
||||
* extract the part that cannot be encoded. Afterwards CSE should
|
||||
* turn the result into:
|
||||
*
|
||||
* vec1 32 ssa_5 = load_const (1024)
|
||||
* vec1 32 ssa_6 = iadd ssa_4, ssa_5
|
||||
* vec4 32 ssa_18 = intrinsic load_uniform (ssa_6) (0, 0, 0)
|
||||
* vec4 32 ssa_19 = intrinsic load_uniform (ssa_6) (48, 0, 0)
|
||||
* vec4 32 ssa_20 = intrinsic load_uniform (ssa_6) (96, 0, 0)
|
||||
*/
|
||||
unsigned new_base_offset = base_offset % base_offset_limit;
|
||||
/* We'd like to avoid a sequence like:
|
||||
*
|
||||
* vec4 32 ssa_18 = intrinsic load_uniform (ssa_4) (1024, 0, 0)
|
||||
* vec4 32 ssa_19 = intrinsic load_uniform (ssa_4) (1072, 0, 0)
|
||||
* vec4 32 ssa_20 = intrinsic load_uniform (ssa_4) (1120, 0, 0)
|
||||
*
|
||||
* From turning into a unique offset value (which requires reloading
|
||||
* a0.x for each instruction). So instead of just adding the constant
|
||||
* base_offset to the non-const offset, be a bit more clever and only
|
||||
* extract the part that cannot be encoded. Afterwards CSE should
|
||||
* turn the result into:
|
||||
*
|
||||
* vec1 32 ssa_5 = load_const (1024)
|
||||
* vec1 32 ssa_6 = iadd ssa_4, ssa_5
|
||||
* vec4 32 ssa_18 = intrinsic load_uniform (ssa_6) (0, 0, 0)
|
||||
* vec4 32 ssa_19 = intrinsic load_uniform (ssa_6) (48, 0, 0)
|
||||
* vec4 32 ssa_20 = intrinsic load_uniform (ssa_6) (96, 0, 0)
|
||||
*/
|
||||
unsigned new_base_offset = base_offset % base_offset_limit;
|
||||
|
||||
nir_intrinsic_set_base(intr, new_base_offset);
|
||||
offset = nir_iadd_imm(b, offset, base_offset - new_base_offset);
|
||||
nir_intrinsic_set_base(intr, new_base_offset);
|
||||
offset = nir_iadd_imm(b, offset, base_offset - new_base_offset);
|
||||
|
||||
nir_instr_rewrite_src(instr, &intr->src[0], nir_src_for_ssa(offset));
|
||||
nir_instr_rewrite_src(instr, &intr->src[0], nir_src_for_ssa(offset));
|
||||
|
||||
return NIR_LOWER_INSTR_PROGRESS;
|
||||
return NIR_LOWER_INSTR_PROGRESS;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -520,59 +520,59 @@ fixup_load_uniform_instr(struct nir_builder *b, nir_instr *instr, void *arg)
|
|||
bool
|
||||
ir3_nir_fixup_load_uniform(nir_shader *nir)
|
||||
{
|
||||
return nir_shader_lower_instructions(nir,
|
||||
fixup_load_uniform_filter, fixup_load_uniform_instr,
|
||||
NULL);
|
||||
return nir_shader_lower_instructions(nir, fixup_load_uniform_filter,
|
||||
fixup_load_uniform_instr, NULL);
|
||||
}
|
||||
static nir_ssa_def *
|
||||
ir3_nir_lower_load_const_instr(nir_builder *b, nir_instr *in_instr, void *data)
|
||||
{
|
||||
struct ir3_const_state *const_state = data;
|
||||
nir_intrinsic_instr *instr = nir_instr_as_intrinsic(in_instr);
|
||||
struct ir3_const_state *const_state = data;
|
||||
nir_intrinsic_instr *instr = nir_instr_as_intrinsic(in_instr);
|
||||
|
||||
/* Pick a UBO index to use as our constant data. Skip UBO 0 since that's
|
||||
* reserved for gallium's cb0.
|
||||
*/
|
||||
if (const_state->constant_data_ubo == -1) {
|
||||
if (b->shader->info.num_ubos == 0)
|
||||
b->shader->info.num_ubos++;
|
||||
const_state->constant_data_ubo = b->shader->info.num_ubos++;
|
||||
}
|
||||
/* Pick a UBO index to use as our constant data. Skip UBO 0 since that's
|
||||
* reserved for gallium's cb0.
|
||||
*/
|
||||
if (const_state->constant_data_ubo == -1) {
|
||||
if (b->shader->info.num_ubos == 0)
|
||||
b->shader->info.num_ubos++;
|
||||
const_state->constant_data_ubo = b->shader->info.num_ubos++;
|
||||
}
|
||||
|
||||
unsigned num_components = instr->num_components;
|
||||
if (nir_dest_bit_size(instr->dest) == 16) {
|
||||
/* We can't do 16b loads -- either from LDC (32-bit only in any of our
|
||||
* traces, and disasm that doesn't look like it really supports it) or
|
||||
* from the constant file (where CONSTANT_DEMOTION_ENABLE means we get
|
||||
* automatic 32b-to-16b conversions when we ask for 16b from it).
|
||||
* Instead, we'll load 32b from a UBO and unpack from there.
|
||||
*/
|
||||
num_components = DIV_ROUND_UP(num_components, 2);
|
||||
}
|
||||
unsigned base = nir_intrinsic_base(instr);
|
||||
nir_ssa_def *index = nir_imm_int(b, const_state->constant_data_ubo);
|
||||
nir_ssa_def *offset = nir_iadd_imm(b, nir_ssa_for_src(b, instr->src[0], 1), base);
|
||||
unsigned num_components = instr->num_components;
|
||||
if (nir_dest_bit_size(instr->dest) == 16) {
|
||||
/* We can't do 16b loads -- either from LDC (32-bit only in any of our
|
||||
* traces, and disasm that doesn't look like it really supports it) or
|
||||
* from the constant file (where CONSTANT_DEMOTION_ENABLE means we get
|
||||
* automatic 32b-to-16b conversions when we ask for 16b from it).
|
||||
* Instead, we'll load 32b from a UBO and unpack from there.
|
||||
*/
|
||||
num_components = DIV_ROUND_UP(num_components, 2);
|
||||
}
|
||||
unsigned base = nir_intrinsic_base(instr);
|
||||
nir_ssa_def *index = nir_imm_int(b, const_state->constant_data_ubo);
|
||||
nir_ssa_def *offset =
|
||||
nir_iadd_imm(b, nir_ssa_for_src(b, instr->src[0], 1), base);
|
||||
|
||||
nir_ssa_def *result =
|
||||
nir_load_ubo(b, num_components, 32, index, offset,
|
||||
.align_mul = nir_intrinsic_align_mul(instr),
|
||||
.align_offset = nir_intrinsic_align_offset(instr),
|
||||
.range_base = base,
|
||||
.range = nir_intrinsic_range(instr));
|
||||
nir_ssa_def *result =
|
||||
nir_load_ubo(b, num_components, 32, index, offset,
|
||||
.align_mul = nir_intrinsic_align_mul(instr),
|
||||
.align_offset = nir_intrinsic_align_offset(instr),
|
||||
.range_base = base, .range = nir_intrinsic_range(instr));
|
||||
|
||||
if (nir_dest_bit_size(instr->dest) == 16) {
|
||||
result = nir_bitcast_vector(b, result, 16);
|
||||
result = nir_channels(b, result, BITSET_MASK(instr->num_components));
|
||||
}
|
||||
if (nir_dest_bit_size(instr->dest) == 16) {
|
||||
result = nir_bitcast_vector(b, result, 16);
|
||||
result = nir_channels(b, result, BITSET_MASK(instr->num_components));
|
||||
}
|
||||
|
||||
return result;
|
||||
return result;
|
||||
}
|
||||
|
||||
static bool
|
||||
ir3_lower_load_const_filter(const nir_instr *instr, const void *data)
|
||||
{
|
||||
return (instr->type == nir_instr_type_intrinsic &&
|
||||
nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_constant);
|
||||
return (instr->type == nir_instr_type_intrinsic &&
|
||||
nir_instr_as_intrinsic(instr)->intrinsic ==
|
||||
nir_intrinsic_load_constant);
|
||||
}
|
||||
|
||||
/* Lowers load_constant intrinsics to UBO accesses so we can run them through
|
||||
|
@ -581,26 +581,26 @@ ir3_lower_load_const_filter(const nir_instr *instr, const void *data)
|
|||
bool
|
||||
ir3_nir_lower_load_constant(nir_shader *nir, struct ir3_shader_variant *v)
|
||||
{
|
||||
struct ir3_const_state *const_state = ir3_const_state(v);
|
||||
struct ir3_const_state *const_state = ir3_const_state(v);
|
||||
|
||||
const_state->constant_data_ubo = -1;
|
||||
const_state->constant_data_ubo = -1;
|
||||
|
||||
bool progress = nir_shader_lower_instructions(nir,
|
||||
ir3_lower_load_const_filter, ir3_nir_lower_load_const_instr,
|
||||
const_state);
|
||||
bool progress = nir_shader_lower_instructions(
|
||||
nir, ir3_lower_load_const_filter, ir3_nir_lower_load_const_instr,
|
||||
const_state);
|
||||
|
||||
if (progress) {
|
||||
struct ir3_compiler *compiler = v->shader->compiler;
|
||||
if (progress) {
|
||||
struct ir3_compiler *compiler = v->shader->compiler;
|
||||
|
||||
/* Save a copy of the NIR constant data to the variant for
|
||||
* inclusion in the final assembly.
|
||||
*/
|
||||
v->constant_data_size = align(nir->constant_data_size,
|
||||
compiler->const_upload_unit * 4 * sizeof(uint32_t));
|
||||
v->constant_data = rzalloc_size(v, v->constant_data_size);
|
||||
memcpy(v->constant_data, nir->constant_data,
|
||||
nir->constant_data_size);
|
||||
}
|
||||
/* Save a copy of the NIR constant data to the variant for
|
||||
* inclusion in the final assembly.
|
||||
*/
|
||||
v->constant_data_size =
|
||||
align(nir->constant_data_size,
|
||||
compiler->const_upload_unit * 4 * sizeof(uint32_t));
|
||||
v->constant_data = rzalloc_size(v, v->constant_data_size);
|
||||
memcpy(v->constant_data, nir->constant_data, nir->constant_data_size);
|
||||
}
|
||||
|
||||
return progress;
|
||||
return progress;
|
||||
}
|
||||
|
|
|
@ -21,8 +21,8 @@
|
|||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "ir3_nir.h"
|
||||
#include "compiler/nir/nir_builder.h"
|
||||
#include "ir3_nir.h"
|
||||
|
||||
/**
|
||||
* This pass moves to NIR certain offset computations for different I/O
|
||||
|
@ -34,7 +34,6 @@
|
|||
* holds the result of the original byte-offset source divided by 4.
|
||||
*/
|
||||
|
||||
|
||||
/* Returns the ir3-specific intrinsic opcode corresponding to an SSBO
|
||||
* instruction that is handled by this pass. It also conveniently returns
|
||||
* the offset source index in @offset_src_idx.
|
||||
|
@ -44,269 +43,269 @@
|
|||
*/
|
||||
static int
|
||||
get_ir3_intrinsic_for_ssbo_intrinsic(unsigned intrinsic,
|
||||
uint8_t *offset_src_idx)
|
||||
uint8_t *offset_src_idx)
|
||||
{
|
||||
debug_assert(offset_src_idx);
|
||||
debug_assert(offset_src_idx);
|
||||
|
||||
*offset_src_idx = 1;
|
||||
*offset_src_idx = 1;
|
||||
|
||||
switch (intrinsic) {
|
||||
case nir_intrinsic_store_ssbo:
|
||||
*offset_src_idx = 2;
|
||||
return nir_intrinsic_store_ssbo_ir3;
|
||||
case nir_intrinsic_load_ssbo:
|
||||
return nir_intrinsic_load_ssbo_ir3;
|
||||
case nir_intrinsic_ssbo_atomic_add:
|
||||
return nir_intrinsic_ssbo_atomic_add_ir3;
|
||||
case nir_intrinsic_ssbo_atomic_imin:
|
||||
return nir_intrinsic_ssbo_atomic_imin_ir3;
|
||||
case nir_intrinsic_ssbo_atomic_umin:
|
||||
return nir_intrinsic_ssbo_atomic_umin_ir3;
|
||||
case nir_intrinsic_ssbo_atomic_imax:
|
||||
return nir_intrinsic_ssbo_atomic_imax_ir3;
|
||||
case nir_intrinsic_ssbo_atomic_umax:
|
||||
return nir_intrinsic_ssbo_atomic_umax_ir3;
|
||||
case nir_intrinsic_ssbo_atomic_and:
|
||||
return nir_intrinsic_ssbo_atomic_and_ir3;
|
||||
case nir_intrinsic_ssbo_atomic_or:
|
||||
return nir_intrinsic_ssbo_atomic_or_ir3;
|
||||
case nir_intrinsic_ssbo_atomic_xor:
|
||||
return nir_intrinsic_ssbo_atomic_xor_ir3;
|
||||
case nir_intrinsic_ssbo_atomic_exchange:
|
||||
return nir_intrinsic_ssbo_atomic_exchange_ir3;
|
||||
case nir_intrinsic_ssbo_atomic_comp_swap:
|
||||
return nir_intrinsic_ssbo_atomic_comp_swap_ir3;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
switch (intrinsic) {
|
||||
case nir_intrinsic_store_ssbo:
|
||||
*offset_src_idx = 2;
|
||||
return nir_intrinsic_store_ssbo_ir3;
|
||||
case nir_intrinsic_load_ssbo:
|
||||
return nir_intrinsic_load_ssbo_ir3;
|
||||
case nir_intrinsic_ssbo_atomic_add:
|
||||
return nir_intrinsic_ssbo_atomic_add_ir3;
|
||||
case nir_intrinsic_ssbo_atomic_imin:
|
||||
return nir_intrinsic_ssbo_atomic_imin_ir3;
|
||||
case nir_intrinsic_ssbo_atomic_umin:
|
||||
return nir_intrinsic_ssbo_atomic_umin_ir3;
|
||||
case nir_intrinsic_ssbo_atomic_imax:
|
||||
return nir_intrinsic_ssbo_atomic_imax_ir3;
|
||||
case nir_intrinsic_ssbo_atomic_umax:
|
||||
return nir_intrinsic_ssbo_atomic_umax_ir3;
|
||||
case nir_intrinsic_ssbo_atomic_and:
|
||||
return nir_intrinsic_ssbo_atomic_and_ir3;
|
||||
case nir_intrinsic_ssbo_atomic_or:
|
||||
return nir_intrinsic_ssbo_atomic_or_ir3;
|
||||
case nir_intrinsic_ssbo_atomic_xor:
|
||||
return nir_intrinsic_ssbo_atomic_xor_ir3;
|
||||
case nir_intrinsic_ssbo_atomic_exchange:
|
||||
return nir_intrinsic_ssbo_atomic_exchange_ir3;
|
||||
case nir_intrinsic_ssbo_atomic_comp_swap:
|
||||
return nir_intrinsic_ssbo_atomic_comp_swap_ir3;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
return -1;
|
||||
return -1;
|
||||
}
|
||||
|
||||
static nir_ssa_def *
|
||||
check_and_propagate_bit_shift32(nir_builder *b, nir_alu_instr *alu_instr,
|
||||
int32_t direction, int32_t shift)
|
||||
int32_t direction, int32_t shift)
|
||||
{
|
||||
debug_assert(alu_instr->src[1].src.is_ssa);
|
||||
nir_ssa_def *shift_ssa = alu_instr->src[1].src.ssa;
|
||||
debug_assert(alu_instr->src[1].src.is_ssa);
|
||||
nir_ssa_def *shift_ssa = alu_instr->src[1].src.ssa;
|
||||
|
||||
/* Only propagate if the shift is a const value so we can check value range
|
||||
* statically.
|
||||
*/
|
||||
nir_const_value *const_val = nir_src_as_const_value(alu_instr->src[1].src);
|
||||
if (!const_val)
|
||||
return NULL;
|
||||
/* Only propagate if the shift is a const value so we can check value range
|
||||
* statically.
|
||||
*/
|
||||
nir_const_value *const_val = nir_src_as_const_value(alu_instr->src[1].src);
|
||||
if (!const_val)
|
||||
return NULL;
|
||||
|
||||
int32_t current_shift = const_val[0].i32 * direction;
|
||||
int32_t new_shift = current_shift + shift;
|
||||
int32_t current_shift = const_val[0].i32 * direction;
|
||||
int32_t new_shift = current_shift + shift;
|
||||
|
||||
/* If the merge would reverse the direction, bail out.
|
||||
* e.g., 'x << 2' then 'x >> 4' is not 'x >> 2'.
|
||||
*/
|
||||
if (current_shift * new_shift < 0)
|
||||
return NULL;
|
||||
/* If the merge would reverse the direction, bail out.
|
||||
* e.g., 'x << 2' then 'x >> 4' is not 'x >> 2'.
|
||||
*/
|
||||
if (current_shift * new_shift < 0)
|
||||
return NULL;
|
||||
|
||||
/* If the propagation would overflow an int32_t, bail out too to be on the
|
||||
* safe side.
|
||||
*/
|
||||
if (new_shift < -31 || new_shift > 31)
|
||||
return NULL;
|
||||
/* If the propagation would overflow an int32_t, bail out too to be on the
|
||||
* safe side.
|
||||
*/
|
||||
if (new_shift < -31 || new_shift > 31)
|
||||
return NULL;
|
||||
|
||||
/* Add or subtract shift depending on the final direction (SHR vs. SHL). */
|
||||
if (shift * direction < 0)
|
||||
shift_ssa = nir_isub(b, shift_ssa, nir_imm_int(b, abs(shift)));
|
||||
else
|
||||
shift_ssa = nir_iadd(b, shift_ssa, nir_imm_int(b, abs(shift)));
|
||||
/* Add or subtract shift depending on the final direction (SHR vs. SHL). */
|
||||
if (shift * direction < 0)
|
||||
shift_ssa = nir_isub(b, shift_ssa, nir_imm_int(b, abs(shift)));
|
||||
else
|
||||
shift_ssa = nir_iadd(b, shift_ssa, nir_imm_int(b, abs(shift)));
|
||||
|
||||
return shift_ssa;
|
||||
return shift_ssa;
|
||||
}
|
||||
|
||||
nir_ssa_def *
|
||||
ir3_nir_try_propagate_bit_shift(nir_builder *b, nir_ssa_def *offset, int32_t shift)
|
||||
ir3_nir_try_propagate_bit_shift(nir_builder *b, nir_ssa_def *offset,
|
||||
int32_t shift)
|
||||
{
|
||||
nir_instr *offset_instr = offset->parent_instr;
|
||||
if (offset_instr->type != nir_instr_type_alu)
|
||||
return NULL;
|
||||
nir_instr *offset_instr = offset->parent_instr;
|
||||
if (offset_instr->type != nir_instr_type_alu)
|
||||
return NULL;
|
||||
|
||||
nir_alu_instr *alu = nir_instr_as_alu(offset_instr);
|
||||
nir_ssa_def *shift_ssa;
|
||||
nir_ssa_def *new_offset = NULL;
|
||||
nir_alu_instr *alu = nir_instr_as_alu(offset_instr);
|
||||
nir_ssa_def *shift_ssa;
|
||||
nir_ssa_def *new_offset = NULL;
|
||||
|
||||
/* the first src could be something like ssa_18.x, but we only want
|
||||
* the single component. Otherwise the ishl/ishr/ushr could turn
|
||||
* into a vec4 operation:
|
||||
*/
|
||||
nir_ssa_def *src0 = nir_mov_alu(b, alu->src[0], 1);
|
||||
/* the first src could be something like ssa_18.x, but we only want
|
||||
* the single component. Otherwise the ishl/ishr/ushr could turn
|
||||
* into a vec4 operation:
|
||||
*/
|
||||
nir_ssa_def *src0 = nir_mov_alu(b, alu->src[0], 1);
|
||||
|
||||
switch (alu->op) {
|
||||
case nir_op_ishl:
|
||||
shift_ssa = check_and_propagate_bit_shift32(b, alu, 1, shift);
|
||||
if (shift_ssa)
|
||||
new_offset = nir_ishl(b, src0, shift_ssa);
|
||||
break;
|
||||
case nir_op_ishr:
|
||||
shift_ssa = check_and_propagate_bit_shift32(b, alu, -1, shift);
|
||||
if (shift_ssa)
|
||||
new_offset = nir_ishr(b, src0, shift_ssa);
|
||||
break;
|
||||
case nir_op_ushr:
|
||||
shift_ssa = check_and_propagate_bit_shift32(b, alu, -1, shift);
|
||||
if (shift_ssa)
|
||||
new_offset = nir_ushr(b, src0, shift_ssa);
|
||||
break;
|
||||
default:
|
||||
return NULL;
|
||||
}
|
||||
switch (alu->op) {
|
||||
case nir_op_ishl:
|
||||
shift_ssa = check_and_propagate_bit_shift32(b, alu, 1, shift);
|
||||
if (shift_ssa)
|
||||
new_offset = nir_ishl(b, src0, shift_ssa);
|
||||
break;
|
||||
case nir_op_ishr:
|
||||
shift_ssa = check_and_propagate_bit_shift32(b, alu, -1, shift);
|
||||
if (shift_ssa)
|
||||
new_offset = nir_ishr(b, src0, shift_ssa);
|
||||
break;
|
||||
case nir_op_ushr:
|
||||
shift_ssa = check_and_propagate_bit_shift32(b, alu, -1, shift);
|
||||
if (shift_ssa)
|
||||
new_offset = nir_ushr(b, src0, shift_ssa);
|
||||
break;
|
||||
default:
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return new_offset;
|
||||
return new_offset;
|
||||
}
|
||||
|
||||
static bool
|
||||
lower_offset_for_ssbo(nir_intrinsic_instr *intrinsic, nir_builder *b,
|
||||
unsigned ir3_ssbo_opcode, uint8_t offset_src_idx)
|
||||
unsigned ir3_ssbo_opcode, uint8_t offset_src_idx)
|
||||
{
|
||||
unsigned num_srcs = nir_intrinsic_infos[intrinsic->intrinsic].num_srcs;
|
||||
int shift = 2;
|
||||
unsigned num_srcs = nir_intrinsic_infos[intrinsic->intrinsic].num_srcs;
|
||||
int shift = 2;
|
||||
|
||||
bool has_dest = nir_intrinsic_infos[intrinsic->intrinsic].has_dest;
|
||||
nir_ssa_def *new_dest = NULL;
|
||||
bool has_dest = nir_intrinsic_infos[intrinsic->intrinsic].has_dest;
|
||||
nir_ssa_def *new_dest = NULL;
|
||||
|
||||
/* for 16-bit ssbo access, offset is in 16-bit words instead of dwords */
|
||||
if ((has_dest && intrinsic->dest.ssa.bit_size == 16) ||
|
||||
(!has_dest && intrinsic->src[0].ssa->bit_size == 16))
|
||||
shift = 1;
|
||||
/* for 16-bit ssbo access, offset is in 16-bit words instead of dwords */
|
||||
if ((has_dest && intrinsic->dest.ssa.bit_size == 16) ||
|
||||
(!has_dest && intrinsic->src[0].ssa->bit_size == 16))
|
||||
shift = 1;
|
||||
|
||||
/* Here we create a new intrinsic and copy over all contents from the old one. */
|
||||
/* Here we create a new intrinsic and copy over all contents from the old
|
||||
* one. */
|
||||
|
||||
nir_intrinsic_instr *new_intrinsic;
|
||||
nir_src *target_src;
|
||||
nir_intrinsic_instr *new_intrinsic;
|
||||
nir_src *target_src;
|
||||
|
||||
b->cursor = nir_before_instr(&intrinsic->instr);
|
||||
b->cursor = nir_before_instr(&intrinsic->instr);
|
||||
|
||||
/* 'offset_src_idx' holds the index of the source that represents the offset. */
|
||||
new_intrinsic =
|
||||
nir_intrinsic_instr_create(b->shader, ir3_ssbo_opcode);
|
||||
/* 'offset_src_idx' holds the index of the source that represents the offset. */
|
||||
new_intrinsic = nir_intrinsic_instr_create(b->shader, ir3_ssbo_opcode);
|
||||
|
||||
debug_assert(intrinsic->src[offset_src_idx].is_ssa);
|
||||
nir_ssa_def *offset = intrinsic->src[offset_src_idx].ssa;
|
||||
debug_assert(intrinsic->src[offset_src_idx].is_ssa);
|
||||
nir_ssa_def *offset = intrinsic->src[offset_src_idx].ssa;
|
||||
|
||||
/* Since we don't have value range checking, we first try to propagate
|
||||
* the division by 4 ('offset >> 2') into another bit-shift instruction that
|
||||
* possibly defines the offset. If that's the case, we emit a similar
|
||||
* instruction adjusting (merging) the shift value.
|
||||
*
|
||||
* Here we use the convention that shifting right is negative while shifting
|
||||
* left is positive. So 'x / 4' ~ 'x >> 2' or 'x << -2'.
|
||||
*/
|
||||
nir_ssa_def *new_offset = ir3_nir_try_propagate_bit_shift(b, offset, -shift);
|
||||
/* Since we don't have value range checking, we first try to propagate
|
||||
* the division by 4 ('offset >> 2') into another bit-shift instruction that
|
||||
* possibly defines the offset. If that's the case, we emit a similar
|
||||
* instruction adjusting (merging) the shift value.
|
||||
*
|
||||
* Here we use the convention that shifting right is negative while shifting
|
||||
* left is positive. So 'x / 4' ~ 'x >> 2' or 'x << -2'.
|
||||
*/
|
||||
nir_ssa_def *new_offset = ir3_nir_try_propagate_bit_shift(b, offset, -shift);
|
||||
|
||||
/* The new source that will hold the dword-offset is always the last
|
||||
* one for every intrinsic.
|
||||
*/
|
||||
target_src = &new_intrinsic->src[num_srcs];
|
||||
*target_src = nir_src_for_ssa(offset);
|
||||
/* The new source that will hold the dword-offset is always the last
|
||||
* one for every intrinsic.
|
||||
*/
|
||||
target_src = &new_intrinsic->src[num_srcs];
|
||||
*target_src = nir_src_for_ssa(offset);
|
||||
|
||||
if (has_dest) {
|
||||
debug_assert(intrinsic->dest.is_ssa);
|
||||
nir_ssa_def *dest = &intrinsic->dest.ssa;
|
||||
nir_ssa_dest_init(&new_intrinsic->instr, &new_intrinsic->dest,
|
||||
dest->num_components, dest->bit_size, NULL);
|
||||
new_dest = &new_intrinsic->dest.ssa;
|
||||
}
|
||||
if (has_dest) {
|
||||
debug_assert(intrinsic->dest.is_ssa);
|
||||
nir_ssa_def *dest = &intrinsic->dest.ssa;
|
||||
nir_ssa_dest_init(&new_intrinsic->instr, &new_intrinsic->dest,
|
||||
dest->num_components, dest->bit_size, NULL);
|
||||
new_dest = &new_intrinsic->dest.ssa;
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < num_srcs; i++)
|
||||
new_intrinsic->src[i] = nir_src_for_ssa(intrinsic->src[i].ssa);
|
||||
for (unsigned i = 0; i < num_srcs; i++)
|
||||
new_intrinsic->src[i] = nir_src_for_ssa(intrinsic->src[i].ssa);
|
||||
|
||||
nir_intrinsic_copy_const_indices(new_intrinsic, intrinsic);
|
||||
nir_intrinsic_copy_const_indices(new_intrinsic, intrinsic);
|
||||
|
||||
new_intrinsic->num_components = intrinsic->num_components;
|
||||
new_intrinsic->num_components = intrinsic->num_components;
|
||||
|
||||
/* If we managed to propagate the division by 4, just use the new offset
|
||||
* register and don't emit the SHR.
|
||||
*/
|
||||
if (new_offset)
|
||||
offset = new_offset;
|
||||
else
|
||||
offset = nir_ushr(b, offset, nir_imm_int(b, shift));
|
||||
/* If we managed to propagate the division by 4, just use the new offset
|
||||
* register and don't emit the SHR.
|
||||
*/
|
||||
if (new_offset)
|
||||
offset = new_offset;
|
||||
else
|
||||
offset = nir_ushr(b, offset, nir_imm_int(b, shift));
|
||||
|
||||
/* Insert the new intrinsic right before the old one. */
|
||||
nir_builder_instr_insert(b, &new_intrinsic->instr);
|
||||
/* Insert the new intrinsic right before the old one. */
|
||||
nir_builder_instr_insert(b, &new_intrinsic->instr);
|
||||
|
||||
/* Replace the last source of the new intrinsic by the result of
|
||||
* the offset divided by 4.
|
||||
*/
|
||||
nir_instr_rewrite_src(&new_intrinsic->instr,
|
||||
target_src,
|
||||
nir_src_for_ssa(offset));
|
||||
/* Replace the last source of the new intrinsic by the result of
|
||||
* the offset divided by 4.
|
||||
*/
|
||||
nir_instr_rewrite_src(&new_intrinsic->instr, target_src,
|
||||
nir_src_for_ssa(offset));
|
||||
|
||||
if (has_dest) {
|
||||
/* Replace the uses of the original destination by that
|
||||
* of the new intrinsic.
|
||||
*/
|
||||
nir_ssa_def_rewrite_uses(&intrinsic->dest.ssa,
|
||||
new_dest);
|
||||
}
|
||||
if (has_dest) {
|
||||
/* Replace the uses of the original destination by that
|
||||
* of the new intrinsic.
|
||||
*/
|
||||
nir_ssa_def_rewrite_uses(&intrinsic->dest.ssa, new_dest);
|
||||
}
|
||||
|
||||
/* Finally remove the original intrinsic. */
|
||||
nir_instr_remove(&intrinsic->instr);
|
||||
/* Finally remove the original intrinsic. */
|
||||
nir_instr_remove(&intrinsic->instr);
|
||||
|
||||
return true;
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
lower_io_offsets_block(nir_block *block, nir_builder *b, void *mem_ctx, int gpu_id)
|
||||
lower_io_offsets_block(nir_block *block, nir_builder *b, void *mem_ctx,
|
||||
int gpu_id)
|
||||
{
|
||||
bool progress = false;
|
||||
bool progress = false;
|
||||
|
||||
nir_foreach_instr_safe (instr, block) {
|
||||
if (instr->type != nir_instr_type_intrinsic)
|
||||
continue;
|
||||
nir_foreach_instr_safe (instr, block) {
|
||||
if (instr->type != nir_instr_type_intrinsic)
|
||||
continue;
|
||||
|
||||
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
|
||||
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
|
||||
|
||||
/* SSBO */
|
||||
int ir3_intrinsic;
|
||||
uint8_t offset_src_idx;
|
||||
ir3_intrinsic = get_ir3_intrinsic_for_ssbo_intrinsic(intr->intrinsic,
|
||||
&offset_src_idx);
|
||||
if (ir3_intrinsic != -1) {
|
||||
progress |= lower_offset_for_ssbo(intr, b, (unsigned) ir3_intrinsic,
|
||||
offset_src_idx);
|
||||
}
|
||||
}
|
||||
/* SSBO */
|
||||
int ir3_intrinsic;
|
||||
uint8_t offset_src_idx;
|
||||
ir3_intrinsic =
|
||||
get_ir3_intrinsic_for_ssbo_intrinsic(intr->intrinsic, &offset_src_idx);
|
||||
if (ir3_intrinsic != -1) {
|
||||
progress |= lower_offset_for_ssbo(intr, b, (unsigned)ir3_intrinsic,
|
||||
offset_src_idx);
|
||||
}
|
||||
}
|
||||
|
||||
return progress;
|
||||
return progress;
|
||||
}
|
||||
|
||||
static bool
|
||||
lower_io_offsets_func(nir_function_impl *impl, int gpu_id)
|
||||
{
|
||||
void *mem_ctx = ralloc_parent(impl);
|
||||
nir_builder b;
|
||||
nir_builder_init(&b, impl);
|
||||
void *mem_ctx = ralloc_parent(impl);
|
||||
nir_builder b;
|
||||
nir_builder_init(&b, impl);
|
||||
|
||||
bool progress = false;
|
||||
nir_foreach_block_safe (block, impl) {
|
||||
progress |= lower_io_offsets_block(block, &b, mem_ctx, gpu_id);
|
||||
}
|
||||
bool progress = false;
|
||||
nir_foreach_block_safe (block, impl) {
|
||||
progress |= lower_io_offsets_block(block, &b, mem_ctx, gpu_id);
|
||||
}
|
||||
|
||||
if (progress) {
|
||||
nir_metadata_preserve(impl, nir_metadata_block_index |
|
||||
nir_metadata_dominance);
|
||||
}
|
||||
if (progress) {
|
||||
nir_metadata_preserve(impl,
|
||||
nir_metadata_block_index | nir_metadata_dominance);
|
||||
}
|
||||
|
||||
return progress;
|
||||
return progress;
|
||||
}
|
||||
|
||||
bool
|
||||
ir3_nir_lower_io_offsets(nir_shader *shader, int gpu_id)
|
||||
{
|
||||
bool progress = false;
|
||||
bool progress = false;
|
||||
|
||||
nir_foreach_function (function, shader) {
|
||||
if (function->impl)
|
||||
progress |= lower_io_offsets_func(function->impl, gpu_id);
|
||||
}
|
||||
nir_foreach_function (function, shader) {
|
||||
if (function->impl)
|
||||
progress |= lower_io_offsets_func(function->impl, gpu_id);
|
||||
}
|
||||
|
||||
return progress;
|
||||
return progress;
|
||||
}
|
||||
|
|
|
@ -21,8 +21,8 @@
|
|||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "ir3_nir.h"
|
||||
#include "compiler/nir/nir_builder.h"
|
||||
#include "ir3_nir.h"
|
||||
|
||||
/**
|
||||
* This pass lowers load_barycentric_at_offset to dsx.3d/dsy.3d and alu
|
||||
|
@ -32,75 +32,72 @@
|
|||
static nir_ssa_def *
|
||||
load(nir_builder *b, unsigned ncomp, nir_intrinsic_op op)
|
||||
{
|
||||
nir_intrinsic_instr *load_size = nir_intrinsic_instr_create(b->shader, op);
|
||||
nir_ssa_dest_init(&load_size->instr, &load_size->dest, ncomp, 32, NULL);
|
||||
nir_builder_instr_insert(b, &load_size->instr);
|
||||
nir_intrinsic_instr *load_size = nir_intrinsic_instr_create(b->shader, op);
|
||||
nir_ssa_dest_init(&load_size->instr, &load_size->dest, ncomp, 32, NULL);
|
||||
nir_builder_instr_insert(b, &load_size->instr);
|
||||
|
||||
return &load_size->dest.ssa;
|
||||
return &load_size->dest.ssa;
|
||||
}
|
||||
|
||||
static nir_ssa_def *
|
||||
ir3_nir_lower_load_barycentric_at_offset_instr(nir_builder *b,
|
||||
nir_instr *instr, void *data)
|
||||
ir3_nir_lower_load_barycentric_at_offset_instr(nir_builder *b, nir_instr *instr,
|
||||
void *data)
|
||||
{
|
||||
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
|
||||
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
|
||||
|
||||
#define chan(var, c) nir_channel(b, var, c)
|
||||
|
||||
nir_ssa_def *off = intr->src[0].ssa;
|
||||
nir_ssa_def *ij = load(b, 2, nir_intrinsic_load_barycentric_pixel);
|
||||
nir_ssa_def *s = load(b, 1, nir_intrinsic_load_size_ir3);
|
||||
nir_ssa_def *off = intr->src[0].ssa;
|
||||
nir_ssa_def *ij = load(b, 2, nir_intrinsic_load_barycentric_pixel);
|
||||
nir_ssa_def *s = load(b, 1, nir_intrinsic_load_size_ir3);
|
||||
|
||||
s = nir_frcp(b, s);
|
||||
s = nir_frcp(b, s);
|
||||
|
||||
/* scaled ij with s as 3rd component: */
|
||||
nir_ssa_def *sij = nir_vec3(b,
|
||||
nir_fmul(b, chan(ij, 0), s),
|
||||
nir_fmul(b, chan(ij, 1), s),
|
||||
s);
|
||||
/* scaled ij with s as 3rd component: */
|
||||
nir_ssa_def *sij =
|
||||
nir_vec3(b, nir_fmul(b, chan(ij, 0), s), nir_fmul(b, chan(ij, 1), s), s);
|
||||
|
||||
nir_ssa_def *foo = nir_fddx(b, sij);
|
||||
nir_ssa_def *bar = nir_fddy(b, sij);
|
||||
nir_ssa_def *foo = nir_fddx(b, sij);
|
||||
nir_ssa_def *bar = nir_fddy(b, sij);
|
||||
|
||||
if (b->shader->info.stage == MESA_SHADER_FRAGMENT)
|
||||
b->shader->info.fs.needs_quad_helper_invocations = true;
|
||||
if (b->shader->info.stage == MESA_SHADER_FRAGMENT)
|
||||
b->shader->info.fs.needs_quad_helper_invocations = true;
|
||||
|
||||
nir_ssa_def *x, *y, *z, *i, *j;
|
||||
nir_ssa_def *x, *y, *z, *i, *j;
|
||||
|
||||
x = nir_ffma(b, chan(off, 0), chan(foo, 0), chan(sij, 0));
|
||||
y = nir_ffma(b, chan(off, 0), chan(foo, 1), chan(sij, 1));
|
||||
z = nir_ffma(b, chan(off, 0), chan(foo, 2), chan(sij, 2));
|
||||
x = nir_ffma(b, chan(off, 0), chan(foo, 0), chan(sij, 0));
|
||||
y = nir_ffma(b, chan(off, 0), chan(foo, 1), chan(sij, 1));
|
||||
z = nir_ffma(b, chan(off, 0), chan(foo, 2), chan(sij, 2));
|
||||
|
||||
x = nir_ffma(b, chan(off, 1), chan(bar, 0), x);
|
||||
y = nir_ffma(b, chan(off, 1), chan(bar, 1), y);
|
||||
z = nir_ffma(b, chan(off, 1), chan(bar, 2), z);
|
||||
x = nir_ffma(b, chan(off, 1), chan(bar, 0), x);
|
||||
y = nir_ffma(b, chan(off, 1), chan(bar, 1), y);
|
||||
z = nir_ffma(b, chan(off, 1), chan(bar, 2), z);
|
||||
|
||||
/* convert back into primitive space: */
|
||||
z = nir_frcp(b, z);
|
||||
i = nir_fmul(b, z, x);
|
||||
j = nir_fmul(b, z, y);
|
||||
/* convert back into primitive space: */
|
||||
z = nir_frcp(b, z);
|
||||
i = nir_fmul(b, z, x);
|
||||
j = nir_fmul(b, z, y);
|
||||
|
||||
ij = nir_vec2(b, i, j);
|
||||
ij = nir_vec2(b, i, j);
|
||||
|
||||
return ij;
|
||||
return ij;
|
||||
}
|
||||
|
||||
static bool
|
||||
ir3_nir_lower_load_barycentric_at_offset_filter(const nir_instr *instr,
|
||||
const void *data)
|
||||
const void *data)
|
||||
{
|
||||
return (instr->type == nir_instr_type_intrinsic &&
|
||||
nir_instr_as_intrinsic(instr)->intrinsic ==
|
||||
nir_intrinsic_load_barycentric_at_offset);
|
||||
return (instr->type == nir_instr_type_intrinsic &&
|
||||
nir_instr_as_intrinsic(instr)->intrinsic ==
|
||||
nir_intrinsic_load_barycentric_at_offset);
|
||||
}
|
||||
|
||||
bool
|
||||
ir3_nir_lower_load_barycentric_at_offset(nir_shader *shader)
|
||||
{
|
||||
debug_assert(shader->info.stage == MESA_SHADER_FRAGMENT);
|
||||
debug_assert(shader->info.stage == MESA_SHADER_FRAGMENT);
|
||||
|
||||
return nir_shader_lower_instructions(shader,
|
||||
ir3_nir_lower_load_barycentric_at_offset_filter,
|
||||
ir3_nir_lower_load_barycentric_at_offset_instr,
|
||||
NULL);
|
||||
return nir_shader_lower_instructions(
|
||||
shader, ir3_nir_lower_load_barycentric_at_offset_filter,
|
||||
ir3_nir_lower_load_barycentric_at_offset_instr, NULL);
|
||||
}
|
||||
|
|
|
@ -21,8 +21,8 @@
|
|||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "ir3_nir.h"
|
||||
#include "compiler/nir/nir_builder.h"
|
||||
#include "ir3_nir.h"
|
||||
|
||||
/**
|
||||
* This pass lowers load_barycentric_at_sample to load_sample_pos_from_id
|
||||
|
@ -35,61 +35,60 @@
|
|||
static nir_ssa_def *
|
||||
load_sample_pos(nir_builder *b, nir_ssa_def *samp_id)
|
||||
{
|
||||
return nir_load_sample_pos_from_id(b, 32, samp_id);
|
||||
return nir_load_sample_pos_from_id(b, 32, samp_id);
|
||||
}
|
||||
|
||||
static nir_ssa_def *
|
||||
lower_load_barycentric_at_sample(nir_builder *b, nir_intrinsic_instr *intr)
|
||||
{
|
||||
nir_ssa_def *pos = load_sample_pos(b, intr->src[0].ssa);
|
||||
nir_ssa_def *pos = load_sample_pos(b, intr->src[0].ssa);
|
||||
|
||||
return nir_load_barycentric_at_offset(b, 32, pos);
|
||||
return nir_load_barycentric_at_offset(b, 32, pos);
|
||||
}
|
||||
|
||||
static nir_ssa_def *
|
||||
lower_load_sample_pos(nir_builder *b, nir_intrinsic_instr *intr)
|
||||
{
|
||||
nir_ssa_def *pos = load_sample_pos(b, nir_load_sample_id(b));
|
||||
nir_ssa_def *pos = load_sample_pos(b, nir_load_sample_id(b));
|
||||
|
||||
/* Note that gl_SamplePosition is offset by +vec2(0.5, 0.5) vs the
|
||||
* offset passed to interpolateAtOffset(). See
|
||||
* dEQP-GLES31.functional.shaders.multisample_interpolation.interpolate_at_offset.at_sample_position.default_framebuffer
|
||||
* for example.
|
||||
*/
|
||||
nir_ssa_def *half = nir_imm_float(b, 0.5);
|
||||
return nir_fadd(b, pos, nir_vec2(b, half, half));
|
||||
/* Note that gl_SamplePosition is offset by +vec2(0.5, 0.5) vs the
|
||||
* offset passed to interpolateAtOffset(). See
|
||||
* dEQP-GLES31.functional.shaders.multisample_interpolation.interpolate_at_offset.at_sample_position.default_framebuffer
|
||||
* for example.
|
||||
*/
|
||||
nir_ssa_def *half = nir_imm_float(b, 0.5);
|
||||
return nir_fadd(b, pos, nir_vec2(b, half, half));
|
||||
}
|
||||
|
||||
static nir_ssa_def *
|
||||
ir3_nir_lower_load_barycentric_at_sample_instr(nir_builder *b,
|
||||
nir_instr *instr, void *data)
|
||||
ir3_nir_lower_load_barycentric_at_sample_instr(nir_builder *b, nir_instr *instr,
|
||||
void *data)
|
||||
{
|
||||
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
|
||||
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
|
||||
|
||||
if (intr->intrinsic == nir_intrinsic_load_sample_pos)
|
||||
return lower_load_sample_pos(b, intr);
|
||||
else
|
||||
return lower_load_barycentric_at_sample(b, intr);
|
||||
if (intr->intrinsic == nir_intrinsic_load_sample_pos)
|
||||
return lower_load_sample_pos(b, intr);
|
||||
else
|
||||
return lower_load_barycentric_at_sample(b, intr);
|
||||
}
|
||||
|
||||
static bool
|
||||
ir3_nir_lower_load_barycentric_at_sample_filter(const nir_instr *instr,
|
||||
const void *data)
|
||||
const void *data)
|
||||
{
|
||||
if (instr->type != nir_instr_type_intrinsic)
|
||||
return false;
|
||||
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
|
||||
return (intr->intrinsic == nir_intrinsic_load_barycentric_at_sample ||
|
||||
intr->intrinsic == nir_intrinsic_load_sample_pos);
|
||||
if (instr->type != nir_instr_type_intrinsic)
|
||||
return false;
|
||||
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
|
||||
return (intr->intrinsic == nir_intrinsic_load_barycentric_at_sample ||
|
||||
intr->intrinsic == nir_intrinsic_load_sample_pos);
|
||||
}
|
||||
|
||||
bool
|
||||
ir3_nir_lower_load_barycentric_at_sample(nir_shader *shader)
|
||||
{
|
||||
debug_assert(shader->info.stage == MESA_SHADER_FRAGMENT);
|
||||
debug_assert(shader->info.stage == MESA_SHADER_FRAGMENT);
|
||||
|
||||
return nir_shader_lower_instructions(shader,
|
||||
ir3_nir_lower_load_barycentric_at_sample_filter,
|
||||
ir3_nir_lower_load_barycentric_at_sample_instr,
|
||||
NULL);
|
||||
return nir_shader_lower_instructions(
|
||||
shader, ir3_nir_lower_load_barycentric_at_sample_filter,
|
||||
ir3_nir_lower_load_barycentric_at_sample_instr, NULL);
|
||||
}
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -31,97 +31,97 @@
|
|||
static int
|
||||
coord_offset(nir_ssa_def *ssa)
|
||||
{
|
||||
nir_instr *parent_instr = ssa->parent_instr;
|
||||
nir_instr *parent_instr = ssa->parent_instr;
|
||||
|
||||
/* The coordinate of a texture sampling instruction eligible for
|
||||
* pre-fetch is either going to be a load_interpolated_input/
|
||||
* load_input, or a vec2 assembling non-swizzled components of
|
||||
* a load_interpolated_input/load_input (due to varying packing)
|
||||
*/
|
||||
/* The coordinate of a texture sampling instruction eligible for
|
||||
* pre-fetch is either going to be a load_interpolated_input/
|
||||
* load_input, or a vec2 assembling non-swizzled components of
|
||||
* a load_interpolated_input/load_input (due to varying packing)
|
||||
*/
|
||||
|
||||
if (parent_instr->type == nir_instr_type_alu) {
|
||||
nir_alu_instr *alu = nir_instr_as_alu(parent_instr);
|
||||
if (parent_instr->type == nir_instr_type_alu) {
|
||||
nir_alu_instr *alu = nir_instr_as_alu(parent_instr);
|
||||
|
||||
if (alu->op != nir_op_vec2)
|
||||
return -1;
|
||||
if (alu->op != nir_op_vec2)
|
||||
return -1;
|
||||
|
||||
if (!alu->src[0].src.is_ssa)
|
||||
return -1;
|
||||
if (!alu->src[0].src.is_ssa)
|
||||
return -1;
|
||||
|
||||
int base_offset = coord_offset(alu->src[0].src.ssa) +
|
||||
alu->src[0].swizzle[0];
|
||||
int base_offset =
|
||||
coord_offset(alu->src[0].src.ssa) + alu->src[0].swizzle[0];
|
||||
|
||||
/* NOTE it might be possible to support more than 2D? */
|
||||
for (int i = 1; i < 2; i++) {
|
||||
if (!alu->src[i].src.is_ssa)
|
||||
return -1;
|
||||
/* NOTE it might be possible to support more than 2D? */
|
||||
for (int i = 1; i < 2; i++) {
|
||||
if (!alu->src[i].src.is_ssa)
|
||||
return -1;
|
||||
|
||||
int nth_offset = coord_offset(alu->src[i].src.ssa) +
|
||||
alu->src[i].swizzle[0];
|
||||
int nth_offset =
|
||||
coord_offset(alu->src[i].src.ssa) + alu->src[i].swizzle[0];
|
||||
|
||||
if (nth_offset != (base_offset + i))
|
||||
return -1;
|
||||
}
|
||||
if (nth_offset != (base_offset + i))
|
||||
return -1;
|
||||
}
|
||||
|
||||
return base_offset;
|
||||
}
|
||||
return base_offset;
|
||||
}
|
||||
|
||||
if (parent_instr->type != nir_instr_type_intrinsic)
|
||||
return -1;
|
||||
if (parent_instr->type != nir_instr_type_intrinsic)
|
||||
return -1;
|
||||
|
||||
nir_intrinsic_instr *input = nir_instr_as_intrinsic(parent_instr);
|
||||
nir_intrinsic_instr *input = nir_instr_as_intrinsic(parent_instr);
|
||||
|
||||
if (input->intrinsic != nir_intrinsic_load_interpolated_input)
|
||||
return -1;
|
||||
if (input->intrinsic != nir_intrinsic_load_interpolated_input)
|
||||
return -1;
|
||||
|
||||
/* limit to load_barycentric_pixel, other interpolation modes don't seem
|
||||
* to be supported:
|
||||
*/
|
||||
if (!input->src[0].is_ssa)
|
||||
return -1;
|
||||
/* limit to load_barycentric_pixel, other interpolation modes don't seem
|
||||
* to be supported:
|
||||
*/
|
||||
if (!input->src[0].is_ssa)
|
||||
return -1;
|
||||
|
||||
nir_intrinsic_instr *interp =
|
||||
nir_instr_as_intrinsic(input->src[0].ssa->parent_instr);
|
||||
nir_intrinsic_instr *interp =
|
||||
nir_instr_as_intrinsic(input->src[0].ssa->parent_instr);
|
||||
|
||||
if (interp->intrinsic != nir_intrinsic_load_barycentric_pixel)
|
||||
return -1;
|
||||
if (interp->intrinsic != nir_intrinsic_load_barycentric_pixel)
|
||||
return -1;
|
||||
|
||||
/* we also need a const input offset: */
|
||||
if (!nir_src_is_const(input->src[1]))
|
||||
return -1;
|
||||
/* we also need a const input offset: */
|
||||
if (!nir_src_is_const(input->src[1]))
|
||||
return -1;
|
||||
|
||||
unsigned base = nir_src_as_uint(input->src[1]) + nir_intrinsic_base(input);
|
||||
unsigned comp = nir_intrinsic_component(input);
|
||||
unsigned base = nir_src_as_uint(input->src[1]) + nir_intrinsic_base(input);
|
||||
unsigned comp = nir_intrinsic_component(input);
|
||||
|
||||
return (4 * base) + comp;
|
||||
return (4 * base) + comp;
|
||||
}
|
||||
|
||||
int
|
||||
ir3_nir_coord_offset(nir_ssa_def *ssa)
|
||||
{
|
||||
|
||||
assert (ssa->num_components == 2);
|
||||
return coord_offset(ssa);
|
||||
assert(ssa->num_components == 2);
|
||||
return coord_offset(ssa);
|
||||
}
|
||||
|
||||
static bool
|
||||
has_src(nir_tex_instr *tex, nir_tex_src_type type)
|
||||
{
|
||||
return nir_tex_instr_src_index(tex, type) >= 0;
|
||||
return nir_tex_instr_src_index(tex, type) >= 0;
|
||||
}
|
||||
|
||||
static bool
|
||||
ok_bindless_src(nir_tex_instr *tex, nir_tex_src_type type)
|
||||
{
|
||||
int idx = nir_tex_instr_src_index(tex, type);
|
||||
assert(idx >= 0);
|
||||
nir_intrinsic_instr *bindless = ir3_bindless_resource(tex->src[idx].src);
|
||||
int idx = nir_tex_instr_src_index(tex, type);
|
||||
assert(idx >= 0);
|
||||
nir_intrinsic_instr *bindless = ir3_bindless_resource(tex->src[idx].src);
|
||||
|
||||
/* TODO from SP_FS_BINDLESS_PREFETCH[n] it looks like this limit should
|
||||
* be 1<<8 ?
|
||||
*/
|
||||
return nir_src_is_const(bindless->src[0]) &&
|
||||
(nir_src_as_uint(bindless->src[0]) < (1 << 16));
|
||||
/* TODO from SP_FS_BINDLESS_PREFETCH[n] it looks like this limit should
|
||||
* be 1<<8 ?
|
||||
*/
|
||||
return nir_src_is_const(bindless->src[0]) &&
|
||||
(nir_src_as_uint(bindless->src[0]) < (1 << 16));
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -134,107 +134,103 @@ ok_bindless_src(nir_tex_instr *tex, nir_tex_src_type type)
|
|||
static bool
|
||||
ok_tex_samp(nir_tex_instr *tex)
|
||||
{
|
||||
if (has_src(tex, nir_tex_src_texture_handle)) {
|
||||
/* bindless case: */
|
||||
if (has_src(tex, nir_tex_src_texture_handle)) {
|
||||
/* bindless case: */
|
||||
|
||||
assert(has_src(tex, nir_tex_src_sampler_handle));
|
||||
assert(has_src(tex, nir_tex_src_sampler_handle));
|
||||
|
||||
return ok_bindless_src(tex, nir_tex_src_texture_handle) &&
|
||||
ok_bindless_src(tex, nir_tex_src_sampler_handle);
|
||||
} else {
|
||||
assert(!has_src(tex, nir_tex_src_texture_offset));
|
||||
assert(!has_src(tex, nir_tex_src_sampler_offset));
|
||||
return ok_bindless_src(tex, nir_tex_src_texture_handle) &&
|
||||
ok_bindless_src(tex, nir_tex_src_sampler_handle);
|
||||
} else {
|
||||
assert(!has_src(tex, nir_tex_src_texture_offset));
|
||||
assert(!has_src(tex, nir_tex_src_sampler_offset));
|
||||
|
||||
return (tex->texture_index <= 0x1f) &&
|
||||
(tex->sampler_index <= 0xf);
|
||||
}
|
||||
return (tex->texture_index <= 0x1f) && (tex->sampler_index <= 0xf);
|
||||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
lower_tex_prefetch_block(nir_block *block)
|
||||
{
|
||||
bool progress = false;
|
||||
bool progress = false;
|
||||
|
||||
nir_foreach_instr_safe (instr, block) {
|
||||
if (instr->type != nir_instr_type_tex)
|
||||
continue;
|
||||
nir_foreach_instr_safe (instr, block) {
|
||||
if (instr->type != nir_instr_type_tex)
|
||||
continue;
|
||||
|
||||
nir_tex_instr *tex = nir_instr_as_tex(instr);
|
||||
if (tex->op != nir_texop_tex)
|
||||
continue;
|
||||
nir_tex_instr *tex = nir_instr_as_tex(instr);
|
||||
if (tex->op != nir_texop_tex)
|
||||
continue;
|
||||
|
||||
if (has_src(tex, nir_tex_src_bias) ||
|
||||
has_src(tex, nir_tex_src_lod) ||
|
||||
has_src(tex, nir_tex_src_comparator) ||
|
||||
has_src(tex, nir_tex_src_projector) ||
|
||||
has_src(tex, nir_tex_src_offset) ||
|
||||
has_src(tex, nir_tex_src_ddx) ||
|
||||
has_src(tex, nir_tex_src_ddy) ||
|
||||
has_src(tex, nir_tex_src_ms_index) ||
|
||||
has_src(tex, nir_tex_src_texture_offset) ||
|
||||
has_src(tex, nir_tex_src_sampler_offset))
|
||||
continue;
|
||||
if (has_src(tex, nir_tex_src_bias) || has_src(tex, nir_tex_src_lod) ||
|
||||
has_src(tex, nir_tex_src_comparator) ||
|
||||
has_src(tex, nir_tex_src_projector) ||
|
||||
has_src(tex, nir_tex_src_offset) || has_src(tex, nir_tex_src_ddx) ||
|
||||
has_src(tex, nir_tex_src_ddy) || has_src(tex, nir_tex_src_ms_index) ||
|
||||
has_src(tex, nir_tex_src_texture_offset) ||
|
||||
has_src(tex, nir_tex_src_sampler_offset))
|
||||
continue;
|
||||
|
||||
/* only prefetch for simple 2d tex fetch case */
|
||||
if (tex->sampler_dim != GLSL_SAMPLER_DIM_2D || tex->is_array)
|
||||
continue;
|
||||
/* only prefetch for simple 2d tex fetch case */
|
||||
if (tex->sampler_dim != GLSL_SAMPLER_DIM_2D || tex->is_array)
|
||||
continue;
|
||||
|
||||
if (!ok_tex_samp(tex))
|
||||
continue;
|
||||
if (!ok_tex_samp(tex))
|
||||
continue;
|
||||
|
||||
int idx = nir_tex_instr_src_index(tex, nir_tex_src_coord);
|
||||
/* First source should be the sampling coordinate. */
|
||||
nir_tex_src *coord = &tex->src[idx];
|
||||
debug_assert(coord->src.is_ssa);
|
||||
int idx = nir_tex_instr_src_index(tex, nir_tex_src_coord);
|
||||
/* First source should be the sampling coordinate. */
|
||||
nir_tex_src *coord = &tex->src[idx];
|
||||
debug_assert(coord->src.is_ssa);
|
||||
|
||||
if (ir3_nir_coord_offset(coord->src.ssa) >= 0) {
|
||||
tex->op = nir_texop_tex_prefetch;
|
||||
if (ir3_nir_coord_offset(coord->src.ssa) >= 0) {
|
||||
tex->op = nir_texop_tex_prefetch;
|
||||
|
||||
progress |= true;
|
||||
}
|
||||
}
|
||||
progress |= true;
|
||||
}
|
||||
}
|
||||
|
||||
return progress;
|
||||
return progress;
|
||||
}
|
||||
|
||||
static bool
|
||||
lower_tex_prefetch_func(nir_function_impl *impl)
|
||||
{
|
||||
/* Only instructions in the outer-most block are considered
|
||||
* eligible for pre-dispatch, because they need to be move-able
|
||||
* to the beginning of the shader to avoid locking down the
|
||||
* register holding the pre-fetched result for too long.
|
||||
*/
|
||||
nir_block *block = nir_start_block(impl);
|
||||
if (!block)
|
||||
return false;
|
||||
/* Only instructions in the outer-most block are considered
|
||||
* eligible for pre-dispatch, because they need to be move-able
|
||||
* to the beginning of the shader to avoid locking down the
|
||||
* register holding the pre-fetched result for too long.
|
||||
*/
|
||||
nir_block *block = nir_start_block(impl);
|
||||
if (!block)
|
||||
return false;
|
||||
|
||||
bool progress = lower_tex_prefetch_block(block);
|
||||
bool progress = lower_tex_prefetch_block(block);
|
||||
|
||||
if (progress) {
|
||||
nir_metadata_preserve(impl, nir_metadata_block_index |
|
||||
nir_metadata_dominance);
|
||||
}
|
||||
if (progress) {
|
||||
nir_metadata_preserve(impl,
|
||||
nir_metadata_block_index | nir_metadata_dominance);
|
||||
}
|
||||
|
||||
return progress;
|
||||
return progress;
|
||||
}
|
||||
|
||||
bool
|
||||
ir3_nir_lower_tex_prefetch(nir_shader *shader)
|
||||
{
|
||||
bool progress = false;
|
||||
bool progress = false;
|
||||
|
||||
assert(shader->info.stage == MESA_SHADER_FRAGMENT);
|
||||
assert(shader->info.stage == MESA_SHADER_FRAGMENT);
|
||||
|
||||
nir_foreach_function (function, shader) {
|
||||
/* Only texture sampling instructions inside the main function
|
||||
* are eligible for pre-dispatch.
|
||||
*/
|
||||
if (!function->impl || !function->is_entrypoint)
|
||||
continue;
|
||||
nir_foreach_function (function, shader) {
|
||||
/* Only texture sampling instructions inside the main function
|
||||
* are eligible for pre-dispatch.
|
||||
*/
|
||||
if (!function->impl || !function->is_entrypoint)
|
||||
continue;
|
||||
|
||||
progress |= lower_tex_prefetch_func(function->impl);
|
||||
}
|
||||
progress |= lower_tex_prefetch_func(function->impl);
|
||||
}
|
||||
|
||||
return progress;
|
||||
return progress;
|
||||
}
|
||||
|
|
|
@ -21,8 +21,8 @@
|
|||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "ir3_nir.h"
|
||||
#include "compiler/nir/nir_builder.h"
|
||||
#include "ir3_nir.h"
|
||||
|
||||
/* A4XX has a broken GATHER4 operation. It performs the texture swizzle on the
|
||||
* gather results, rather than before. As a result, it must be emulated with
|
||||
|
@ -32,70 +32,68 @@
|
|||
static nir_ssa_def *
|
||||
ir3_nir_lower_tg4_to_tex_instr(nir_builder *b, nir_instr *instr, void *data)
|
||||
{
|
||||
nir_tex_instr *tg4 = nir_instr_as_tex(instr);
|
||||
static const int offsets[3][2] = { {0, 1}, {1, 1}, {1, 0} };
|
||||
nir_tex_instr *tg4 = nir_instr_as_tex(instr);
|
||||
static const int offsets[3][2] = {{0, 1}, {1, 1}, {1, 0}};
|
||||
|
||||
nir_ssa_def *results[4];
|
||||
int offset_index = nir_tex_instr_src_index(tg4, nir_tex_src_offset);
|
||||
for (int i = 0; i < 4; i++) {
|
||||
int num_srcs = tg4->num_srcs + 1 /* lod */;
|
||||
if (offset_index < 0 && i < 3)
|
||||
num_srcs++;
|
||||
nir_ssa_def *results[4];
|
||||
int offset_index = nir_tex_instr_src_index(tg4, nir_tex_src_offset);
|
||||
for (int i = 0; i < 4; i++) {
|
||||
int num_srcs = tg4->num_srcs + 1 /* lod */;
|
||||
if (offset_index < 0 && i < 3)
|
||||
num_srcs++;
|
||||
|
||||
nir_tex_instr *tex = nir_tex_instr_create(b->shader, num_srcs);
|
||||
tex->op = nir_texop_txl;
|
||||
tex->sampler_dim = tg4->sampler_dim;
|
||||
tex->coord_components = tg4->coord_components;
|
||||
tex->is_array = tg4->is_array;
|
||||
tex->is_shadow = tg4->is_shadow;
|
||||
tex->is_new_style_shadow = tg4->is_new_style_shadow;
|
||||
tex->texture_index = tg4->texture_index;
|
||||
tex->sampler_index = tg4->sampler_index;
|
||||
tex->dest_type = tg4->dest_type;
|
||||
nir_tex_instr *tex = nir_tex_instr_create(b->shader, num_srcs);
|
||||
tex->op = nir_texop_txl;
|
||||
tex->sampler_dim = tg4->sampler_dim;
|
||||
tex->coord_components = tg4->coord_components;
|
||||
tex->is_array = tg4->is_array;
|
||||
tex->is_shadow = tg4->is_shadow;
|
||||
tex->is_new_style_shadow = tg4->is_new_style_shadow;
|
||||
tex->texture_index = tg4->texture_index;
|
||||
tex->sampler_index = tg4->sampler_index;
|
||||
tex->dest_type = tg4->dest_type;
|
||||
|
||||
for (int j = 0; j < tg4->num_srcs; j++) {
|
||||
nir_src_copy(&tex->src[j].src, &tg4->src[j].src, tex);
|
||||
tex->src[j].src_type = tg4->src[j].src_type;
|
||||
}
|
||||
if (i != 3) {
|
||||
nir_ssa_def *offset =
|
||||
nir_vec2(b, nir_imm_int(b, offsets[i][0]),
|
||||
nir_imm_int(b, offsets[i][1]));
|
||||
if (offset_index < 0) {
|
||||
tex->src[tg4->num_srcs].src = nir_src_for_ssa(offset);
|
||||
tex->src[tg4->num_srcs].src_type = nir_tex_src_offset;
|
||||
} else {
|
||||
assert(nir_tex_instr_src_size(tex, offset_index) == 2);
|
||||
nir_ssa_def *orig = nir_ssa_for_src(
|
||||
b, tex->src[offset_index].src, 2);
|
||||
tex->src[offset_index].src =
|
||||
nir_src_for_ssa(nir_iadd(b, orig, offset));
|
||||
}
|
||||
}
|
||||
tex->src[num_srcs - 1].src = nir_src_for_ssa(nir_imm_float(b, 0));
|
||||
tex->src[num_srcs - 1].src_type = nir_tex_src_lod;
|
||||
for (int j = 0; j < tg4->num_srcs; j++) {
|
||||
nir_src_copy(&tex->src[j].src, &tg4->src[j].src, tex);
|
||||
tex->src[j].src_type = tg4->src[j].src_type;
|
||||
}
|
||||
if (i != 3) {
|
||||
nir_ssa_def *offset = nir_vec2(b, nir_imm_int(b, offsets[i][0]),
|
||||
nir_imm_int(b, offsets[i][1]));
|
||||
if (offset_index < 0) {
|
||||
tex->src[tg4->num_srcs].src = nir_src_for_ssa(offset);
|
||||
tex->src[tg4->num_srcs].src_type = nir_tex_src_offset;
|
||||
} else {
|
||||
assert(nir_tex_instr_src_size(tex, offset_index) == 2);
|
||||
nir_ssa_def *orig =
|
||||
nir_ssa_for_src(b, tex->src[offset_index].src, 2);
|
||||
tex->src[offset_index].src =
|
||||
nir_src_for_ssa(nir_iadd(b, orig, offset));
|
||||
}
|
||||
}
|
||||
tex->src[num_srcs - 1].src = nir_src_for_ssa(nir_imm_float(b, 0));
|
||||
tex->src[num_srcs - 1].src_type = nir_tex_src_lod;
|
||||
|
||||
nir_ssa_dest_init(&tex->instr, &tex->dest,
|
||||
nir_tex_instr_dest_size(tex), 32, NULL);
|
||||
nir_builder_instr_insert(b, &tex->instr);
|
||||
nir_ssa_dest_init(&tex->instr, &tex->dest, nir_tex_instr_dest_size(tex),
|
||||
32, NULL);
|
||||
nir_builder_instr_insert(b, &tex->instr);
|
||||
|
||||
results[i] = nir_channel(b, &tex->dest.ssa, tg4->component);
|
||||
}
|
||||
results[i] = nir_channel(b, &tex->dest.ssa, tg4->component);
|
||||
}
|
||||
|
||||
return nir_vec(b, results, 4);
|
||||
return nir_vec(b, results, 4);
|
||||
}
|
||||
|
||||
static bool
|
||||
ir3_nir_lower_tg4_to_tex_filter(const nir_instr *instr, const void *data)
|
||||
{
|
||||
return (instr->type == nir_instr_type_tex &&
|
||||
nir_instr_as_tex(instr)->op == nir_texop_tg4);
|
||||
return (instr->type == nir_instr_type_tex &&
|
||||
nir_instr_as_tex(instr)->op == nir_texop_tg4);
|
||||
}
|
||||
|
||||
bool
|
||||
ir3_nir_lower_tg4_to_tex(nir_shader *shader)
|
||||
{
|
||||
return nir_shader_lower_instructions(shader,
|
||||
ir3_nir_lower_tg4_to_tex_filter,
|
||||
ir3_nir_lower_tg4_to_tex_instr, NULL);
|
||||
return nir_shader_lower_instructions(shader, ir3_nir_lower_tg4_to_tex_filter,
|
||||
ir3_nir_lower_tg4_to_tex_instr, NULL);
|
||||
}
|
||||
|
|
|
@ -21,8 +21,8 @@
|
|||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "ir3_nir.h"
|
||||
#include "compiler/nir/nir_builder.h"
|
||||
#include "ir3_nir.h"
|
||||
|
||||
/**
|
||||
* This pass moves varying fetches (and the instructions they depend on
|
||||
|
@ -46,25 +46,23 @@
|
|||
*/
|
||||
|
||||
typedef struct {
|
||||
nir_block *start_block;
|
||||
bool precondition_failed;
|
||||
nir_block *start_block;
|
||||
bool precondition_failed;
|
||||
} precond_state;
|
||||
|
||||
typedef struct {
|
||||
nir_shader *shader;
|
||||
nir_block *start_block;
|
||||
nir_shader *shader;
|
||||
nir_block *start_block;
|
||||
} state;
|
||||
|
||||
|
||||
|
||||
static void check_precondition_instr(precond_state *state, nir_instr *instr);
|
||||
static void move_instruction_to_start_block(state *state, nir_instr *instr);
|
||||
|
||||
static bool
|
||||
check_precondition_src(nir_src *src, void *state)
|
||||
{
|
||||
check_precondition_instr(state, src->ssa->parent_instr);
|
||||
return true;
|
||||
check_precondition_instr(state, src->ssa->parent_instr);
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Recursively check if there is even a single dependency which
|
||||
|
@ -73,163 +71,163 @@ check_precondition_src(nir_src *src, void *state)
|
|||
static void
|
||||
check_precondition_instr(precond_state *state, nir_instr *instr)
|
||||
{
|
||||
if (instr->block == state->start_block)
|
||||
return;
|
||||
if (instr->block == state->start_block)
|
||||
return;
|
||||
|
||||
switch (instr->type) {
|
||||
case nir_instr_type_alu:
|
||||
case nir_instr_type_deref:
|
||||
case nir_instr_type_load_const:
|
||||
case nir_instr_type_ssa_undef:
|
||||
/* These could be safely moved around */
|
||||
break;
|
||||
case nir_instr_type_intrinsic: {
|
||||
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
|
||||
if (!nir_intrinsic_can_reorder(intr)) {
|
||||
state->precondition_failed = true;
|
||||
return;
|
||||
}
|
||||
break;
|
||||
}
|
||||
default:
|
||||
state->precondition_failed = true;
|
||||
return;
|
||||
}
|
||||
switch (instr->type) {
|
||||
case nir_instr_type_alu:
|
||||
case nir_instr_type_deref:
|
||||
case nir_instr_type_load_const:
|
||||
case nir_instr_type_ssa_undef:
|
||||
/* These could be safely moved around */
|
||||
break;
|
||||
case nir_instr_type_intrinsic: {
|
||||
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
|
||||
if (!nir_intrinsic_can_reorder(intr)) {
|
||||
state->precondition_failed = true;
|
||||
return;
|
||||
}
|
||||
break;
|
||||
}
|
||||
default:
|
||||
state->precondition_failed = true;
|
||||
return;
|
||||
}
|
||||
|
||||
nir_foreach_src(instr, check_precondition_src, state);
|
||||
nir_foreach_src(instr, check_precondition_src, state);
|
||||
}
|
||||
|
||||
static void
|
||||
check_precondition_block(precond_state *state, nir_block *block)
|
||||
{
|
||||
nir_foreach_instr_safe (instr, block) {
|
||||
if (instr->type != nir_instr_type_intrinsic)
|
||||
continue;
|
||||
nir_foreach_instr_safe (instr, block) {
|
||||
if (instr->type != nir_instr_type_intrinsic)
|
||||
continue;
|
||||
|
||||
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
|
||||
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
|
||||
|
||||
switch (intr->intrinsic) {
|
||||
case nir_intrinsic_load_interpolated_input:
|
||||
case nir_intrinsic_load_input:
|
||||
break;
|
||||
default:
|
||||
continue;
|
||||
}
|
||||
switch (intr->intrinsic) {
|
||||
case nir_intrinsic_load_interpolated_input:
|
||||
case nir_intrinsic_load_input:
|
||||
break;
|
||||
default:
|
||||
continue;
|
||||
}
|
||||
|
||||
check_precondition_instr(state, instr);
|
||||
check_precondition_instr(state, instr);
|
||||
|
||||
if (state->precondition_failed)
|
||||
return;
|
||||
}
|
||||
if (state->precondition_failed)
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
move_src(nir_src *src, void *state)
|
||||
{
|
||||
/* At this point we shouldn't have any non-ssa src: */
|
||||
debug_assert(src->is_ssa);
|
||||
move_instruction_to_start_block(state, src->ssa->parent_instr);
|
||||
return true;
|
||||
/* At this point we shouldn't have any non-ssa src: */
|
||||
debug_assert(src->is_ssa);
|
||||
move_instruction_to_start_block(state, src->ssa->parent_instr);
|
||||
return true;
|
||||
}
|
||||
|
||||
static void
|
||||
move_instruction_to_start_block(state *state, nir_instr *instr)
|
||||
{
|
||||
/* nothing to do if the instruction is already in the start block */
|
||||
if (instr->block == state->start_block)
|
||||
return;
|
||||
/* nothing to do if the instruction is already in the start block */
|
||||
if (instr->block == state->start_block)
|
||||
return;
|
||||
|
||||
/* first move (recursively) all src's to ensure they appear before
|
||||
* load*_input that we are trying to move:
|
||||
*/
|
||||
nir_foreach_src(instr, move_src, state);
|
||||
/* first move (recursively) all src's to ensure they appear before
|
||||
* load*_input that we are trying to move:
|
||||
*/
|
||||
nir_foreach_src(instr, move_src, state);
|
||||
|
||||
/* and then move the instruction itself:
|
||||
*/
|
||||
exec_node_remove(&instr->node);
|
||||
exec_list_push_tail(&state->start_block->instr_list, &instr->node);
|
||||
instr->block = state->start_block;
|
||||
/* and then move the instruction itself:
|
||||
*/
|
||||
exec_node_remove(&instr->node);
|
||||
exec_list_push_tail(&state->start_block->instr_list, &instr->node);
|
||||
instr->block = state->start_block;
|
||||
}
|
||||
|
||||
static bool
|
||||
move_varying_inputs_block(state *state, nir_block *block)
|
||||
{
|
||||
bool progress = false;
|
||||
bool progress = false;
|
||||
|
||||
nir_foreach_instr_safe (instr, block) {
|
||||
if (instr->type != nir_instr_type_intrinsic)
|
||||
continue;
|
||||
nir_foreach_instr_safe (instr, block) {
|
||||
if (instr->type != nir_instr_type_intrinsic)
|
||||
continue;
|
||||
|
||||
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
|
||||
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
|
||||
|
||||
switch (intr->intrinsic) {
|
||||
case nir_intrinsic_load_interpolated_input:
|
||||
case nir_intrinsic_load_input:
|
||||
/* TODO any others to handle? */
|
||||
break;
|
||||
default:
|
||||
continue;
|
||||
}
|
||||
switch (intr->intrinsic) {
|
||||
case nir_intrinsic_load_interpolated_input:
|
||||
case nir_intrinsic_load_input:
|
||||
/* TODO any others to handle? */
|
||||
break;
|
||||
default:
|
||||
continue;
|
||||
}
|
||||
|
||||
debug_assert(intr->dest.is_ssa);
|
||||
debug_assert(intr->dest.is_ssa);
|
||||
|
||||
move_instruction_to_start_block(state, instr);
|
||||
move_instruction_to_start_block(state, instr);
|
||||
|
||||
progress = true;
|
||||
}
|
||||
progress = true;
|
||||
}
|
||||
|
||||
return progress;
|
||||
return progress;
|
||||
}
|
||||
|
||||
bool
|
||||
ir3_nir_move_varying_inputs(nir_shader *shader)
|
||||
{
|
||||
bool progress = false;
|
||||
bool progress = false;
|
||||
|
||||
debug_assert(shader->info.stage == MESA_SHADER_FRAGMENT);
|
||||
debug_assert(shader->info.stage == MESA_SHADER_FRAGMENT);
|
||||
|
||||
nir_foreach_function (function, shader) {
|
||||
precond_state state;
|
||||
nir_foreach_function (function, shader) {
|
||||
precond_state state;
|
||||
|
||||
if (!function->impl)
|
||||
continue;
|
||||
if (!function->impl)
|
||||
continue;
|
||||
|
||||
state.precondition_failed = false;
|
||||
state.start_block = nir_start_block(function->impl);
|
||||
state.precondition_failed = false;
|
||||
state.start_block = nir_start_block(function->impl);
|
||||
|
||||
nir_foreach_block (block, function->impl) {
|
||||
if (block == state.start_block)
|
||||
continue;
|
||||
nir_foreach_block (block, function->impl) {
|
||||
if (block == state.start_block)
|
||||
continue;
|
||||
|
||||
check_precondition_block(&state, block);
|
||||
check_precondition_block(&state, block);
|
||||
|
||||
if (state.precondition_failed)
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (state.precondition_failed)
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
nir_foreach_function (function, shader) {
|
||||
state state;
|
||||
nir_foreach_function (function, shader) {
|
||||
state state;
|
||||
|
||||
if (!function->impl)
|
||||
continue;
|
||||
if (!function->impl)
|
||||
continue;
|
||||
|
||||
state.shader = shader;
|
||||
state.start_block = nir_start_block(function->impl);
|
||||
state.shader = shader;
|
||||
state.start_block = nir_start_block(function->impl);
|
||||
|
||||
bool progress = false;
|
||||
nir_foreach_block (block, function->impl) {
|
||||
/* don't need to move anything that is already in the first block */
|
||||
if (block == state.start_block)
|
||||
continue;
|
||||
progress |= move_varying_inputs_block(&state, block);
|
||||
}
|
||||
bool progress = false;
|
||||
nir_foreach_block (block, function->impl) {
|
||||
/* don't need to move anything that is already in the first block */
|
||||
if (block == state.start_block)
|
||||
continue;
|
||||
progress |= move_varying_inputs_block(&state, block);
|
||||
}
|
||||
|
||||
if (progress) {
|
||||
nir_metadata_preserve(function->impl,
|
||||
nir_metadata_block_index | nir_metadata_dominance);
|
||||
}
|
||||
}
|
||||
if (progress) {
|
||||
nir_metadata_preserve(
|
||||
function->impl, nir_metadata_block_index | nir_metadata_dominance);
|
||||
}
|
||||
}
|
||||
|
||||
return progress;
|
||||
return progress;
|
||||
}
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -33,425 +33,452 @@
|
|||
#define PTRID(x) ((unsigned long)(x))
|
||||
|
||||
/* ansi escape sequences: */
|
||||
#define RESET "\x1b[0m"
|
||||
#define RED "\x1b[0;31m"
|
||||
#define GREEN "\x1b[0;32m"
|
||||
#define BLUE "\x1b[0;34m"
|
||||
#define MAGENTA "\x1b[0;35m"
|
||||
#define RESET "\x1b[0m"
|
||||
#define RED "\x1b[0;31m"
|
||||
#define GREEN "\x1b[0;32m"
|
||||
#define BLUE "\x1b[0;34m"
|
||||
#define MAGENTA "\x1b[0;35m"
|
||||
|
||||
/* syntax coloring, mostly to make it easier to see different sorts of
|
||||
* srcs (immediate, constant, ssa, array, ...)
|
||||
*/
|
||||
#define SYN_REG(x) RED x RESET
|
||||
#define SYN_IMMED(x) GREEN x RESET
|
||||
#define SYN_CONST(x) GREEN x RESET
|
||||
#define SYN_SSA(x) BLUE x RESET
|
||||
#define SYN_ARRAY(x) MAGENTA x RESET
|
||||
#define SYN_REG(x) RED x RESET
|
||||
#define SYN_IMMED(x) GREEN x RESET
|
||||
#define SYN_CONST(x) GREEN x RESET
|
||||
#define SYN_SSA(x) BLUE x RESET
|
||||
#define SYN_ARRAY(x) MAGENTA x RESET
|
||||
|
||||
static const char *
|
||||
type_name(type_t type)
|
||||
{
|
||||
static const char *type_names[] = {
|
||||
[TYPE_F16] = "f16",
|
||||
[TYPE_F32] = "f32",
|
||||
[TYPE_U16] = "u16",
|
||||
[TYPE_U32] = "u32",
|
||||
[TYPE_S16] = "s16",
|
||||
[TYPE_S32] = "s32",
|
||||
[TYPE_U8] = "u8",
|
||||
[TYPE_S8] = "s8",
|
||||
};
|
||||
return type_names[type];
|
||||
static const char *type_names[] = {
|
||||
[TYPE_F16] = "f16", [TYPE_F32] = "f32", [TYPE_U16] = "u16",
|
||||
[TYPE_U32] = "u32", [TYPE_S16] = "s16", [TYPE_S32] = "s32",
|
||||
[TYPE_U8] = "u8", [TYPE_S8] = "s8",
|
||||
};
|
||||
return type_names[type];
|
||||
}
|
||||
|
||||
static void print_instr_name(struct log_stream *stream, struct ir3_instruction *instr, bool flags)
|
||||
static void
|
||||
print_instr_name(struct log_stream *stream, struct ir3_instruction *instr,
|
||||
bool flags)
|
||||
{
|
||||
if (!instr)
|
||||
return;
|
||||
if (!instr)
|
||||
return;
|
||||
#ifdef DEBUG
|
||||
mesa_log_stream_printf(stream, "%04u:", instr->serialno);
|
||||
mesa_log_stream_printf(stream, "%04u:", instr->serialno);
|
||||
#endif
|
||||
mesa_log_stream_printf(stream, "%04u:", instr->name);
|
||||
mesa_log_stream_printf(stream, "%04u:", instr->ip);
|
||||
if (instr->flags & IR3_INSTR_UNUSED) {
|
||||
mesa_log_stream_printf(stream, "XXX: ");
|
||||
} else {
|
||||
mesa_log_stream_printf(stream, "%03u: ", instr->use_count);
|
||||
}
|
||||
mesa_log_stream_printf(stream, "%04u:", instr->name);
|
||||
mesa_log_stream_printf(stream, "%04u:", instr->ip);
|
||||
if (instr->flags & IR3_INSTR_UNUSED) {
|
||||
mesa_log_stream_printf(stream, "XXX: ");
|
||||
} else {
|
||||
mesa_log_stream_printf(stream, "%03u: ", instr->use_count);
|
||||
}
|
||||
|
||||
if (flags) {
|
||||
mesa_log_stream_printf(stream, "\t");
|
||||
if (instr->flags & IR3_INSTR_SY)
|
||||
mesa_log_stream_printf(stream, "(sy)");
|
||||
if (instr->flags & IR3_INSTR_SS)
|
||||
mesa_log_stream_printf(stream, "(ss)");
|
||||
if (instr->flags & IR3_INSTR_JP)
|
||||
mesa_log_stream_printf(stream, "(jp)");
|
||||
if (instr->repeat)
|
||||
mesa_log_stream_printf(stream, "(rpt%d)", instr->repeat);
|
||||
if (instr->nop)
|
||||
mesa_log_stream_printf(stream, "(nop%d)", instr->nop);
|
||||
if (instr->flags & IR3_INSTR_UL)
|
||||
mesa_log_stream_printf(stream, "(ul)");
|
||||
} else {
|
||||
mesa_log_stream_printf(stream, " ");
|
||||
}
|
||||
if (flags) {
|
||||
mesa_log_stream_printf(stream, "\t");
|
||||
if (instr->flags & IR3_INSTR_SY)
|
||||
mesa_log_stream_printf(stream, "(sy)");
|
||||
if (instr->flags & IR3_INSTR_SS)
|
||||
mesa_log_stream_printf(stream, "(ss)");
|
||||
if (instr->flags & IR3_INSTR_JP)
|
||||
mesa_log_stream_printf(stream, "(jp)");
|
||||
if (instr->repeat)
|
||||
mesa_log_stream_printf(stream, "(rpt%d)", instr->repeat);
|
||||
if (instr->nop)
|
||||
mesa_log_stream_printf(stream, "(nop%d)", instr->nop);
|
||||
if (instr->flags & IR3_INSTR_UL)
|
||||
mesa_log_stream_printf(stream, "(ul)");
|
||||
} else {
|
||||
mesa_log_stream_printf(stream, " ");
|
||||
}
|
||||
|
||||
if (is_meta(instr)) {
|
||||
switch (instr->opc) {
|
||||
case OPC_META_INPUT: mesa_log_stream_printf(stream, "_meta:in"); break;
|
||||
case OPC_META_SPLIT: mesa_log_stream_printf(stream, "_meta:split"); break;
|
||||
case OPC_META_COLLECT: mesa_log_stream_printf(stream, "_meta:collect"); break;
|
||||
case OPC_META_TEX_PREFETCH: mesa_log_stream_printf(stream, "_meta:tex_prefetch"); break;
|
||||
case OPC_META_PARALLEL_COPY: mesa_log_stream_printf(stream, "_meta:parallel_copy"); break;
|
||||
case OPC_META_PHI: mesa_log_stream_printf(stream, "_meta:phi"); break;
|
||||
if (is_meta(instr)) {
|
||||
switch (instr->opc) {
|
||||
case OPC_META_INPUT:
|
||||
mesa_log_stream_printf(stream, "_meta:in");
|
||||
break;
|
||||
case OPC_META_SPLIT:
|
||||
mesa_log_stream_printf(stream, "_meta:split");
|
||||
break;
|
||||
case OPC_META_COLLECT:
|
||||
mesa_log_stream_printf(stream, "_meta:collect");
|
||||
break;
|
||||
case OPC_META_TEX_PREFETCH:
|
||||
mesa_log_stream_printf(stream, "_meta:tex_prefetch");
|
||||
break;
|
||||
case OPC_META_PARALLEL_COPY:
|
||||
mesa_log_stream_printf(stream, "_meta:parallel_copy");
|
||||
break;
|
||||
case OPC_META_PHI:
|
||||
mesa_log_stream_printf(stream, "_meta:phi");
|
||||
break;
|
||||
|
||||
/* shouldn't hit here.. just for debugging: */
|
||||
default: mesa_log_stream_printf(stream, "_meta:%d", instr->opc); break;
|
||||
}
|
||||
} else if (opc_cat(instr->opc) == 1) {
|
||||
if (instr->opc == OPC_MOV) {
|
||||
if (instr->cat1.src_type == instr->cat1.dst_type)
|
||||
mesa_log_stream_printf(stream, "mov");
|
||||
else
|
||||
mesa_log_stream_printf(stream, "cov");
|
||||
} else {
|
||||
mesa_log_stream_printf(stream, "%s", disasm_a3xx_instr_name(instr->opc));
|
||||
}
|
||||
/* shouldn't hit here.. just for debugging: */
|
||||
default:
|
||||
mesa_log_stream_printf(stream, "_meta:%d", instr->opc);
|
||||
break;
|
||||
}
|
||||
} else if (opc_cat(instr->opc) == 1) {
|
||||
if (instr->opc == OPC_MOV) {
|
||||
if (instr->cat1.src_type == instr->cat1.dst_type)
|
||||
mesa_log_stream_printf(stream, "mov");
|
||||
else
|
||||
mesa_log_stream_printf(stream, "cov");
|
||||
} else {
|
||||
mesa_log_stream_printf(stream, "%s",
|
||||
disasm_a3xx_instr_name(instr->opc));
|
||||
}
|
||||
|
||||
if (instr->opc != OPC_MOVMSK) {
|
||||
mesa_log_stream_printf(stream, ".%s%s", type_name(instr->cat1.src_type),
|
||||
type_name(instr->cat1.dst_type));
|
||||
}
|
||||
} else if (instr->opc == OPC_B) {
|
||||
const char *name[8] = {
|
||||
[BRANCH_PLAIN] = "br",
|
||||
[BRANCH_OR] = "brao",
|
||||
[BRANCH_AND] = "braa",
|
||||
[BRANCH_CONST] = "brac",
|
||||
[BRANCH_ANY] = "bany",
|
||||
[BRANCH_ALL] = "ball",
|
||||
[BRANCH_X] = "brax",
|
||||
};
|
||||
mesa_log_stream_printf(stream, "%s", name[instr->cat0.brtype]);
|
||||
} else {
|
||||
mesa_log_stream_printf(stream, "%s", disasm_a3xx_instr_name(instr->opc));
|
||||
if (instr->flags & IR3_INSTR_3D)
|
||||
mesa_log_stream_printf(stream, ".3d");
|
||||
if (instr->flags & IR3_INSTR_A)
|
||||
mesa_log_stream_printf(stream, ".a");
|
||||
if (instr->flags & IR3_INSTR_O)
|
||||
mesa_log_stream_printf(stream, ".o");
|
||||
if (instr->flags & IR3_INSTR_P)
|
||||
mesa_log_stream_printf(stream, ".p");
|
||||
if (instr->flags & IR3_INSTR_S)
|
||||
mesa_log_stream_printf(stream, ".s");
|
||||
if (instr->flags & IR3_INSTR_A1EN)
|
||||
mesa_log_stream_printf(stream, ".a1en");
|
||||
if (instr->opc == OPC_LDC)
|
||||
mesa_log_stream_printf(stream, ".offset%d", instr->cat6.d);
|
||||
if (instr->flags & IR3_INSTR_B) {
|
||||
mesa_log_stream_printf(stream, ".base%d",
|
||||
is_tex(instr) ? instr->cat5.tex_base : instr->cat6.base);
|
||||
}
|
||||
if (instr->flags & IR3_INSTR_S2EN)
|
||||
mesa_log_stream_printf(stream, ".s2en");
|
||||
if (instr->opc != OPC_MOVMSK) {
|
||||
mesa_log_stream_printf(stream, ".%s%s",
|
||||
type_name(instr->cat1.src_type),
|
||||
type_name(instr->cat1.dst_type));
|
||||
}
|
||||
} else if (instr->opc == OPC_B) {
|
||||
const char *name[8] = {
|
||||
[BRANCH_PLAIN] = "br", [BRANCH_OR] = "brao", [BRANCH_AND] = "braa",
|
||||
[BRANCH_CONST] = "brac", [BRANCH_ANY] = "bany", [BRANCH_ALL] = "ball",
|
||||
[BRANCH_X] = "brax",
|
||||
};
|
||||
mesa_log_stream_printf(stream, "%s", name[instr->cat0.brtype]);
|
||||
} else {
|
||||
mesa_log_stream_printf(stream, "%s", disasm_a3xx_instr_name(instr->opc));
|
||||
if (instr->flags & IR3_INSTR_3D)
|
||||
mesa_log_stream_printf(stream, ".3d");
|
||||
if (instr->flags & IR3_INSTR_A)
|
||||
mesa_log_stream_printf(stream, ".a");
|
||||
if (instr->flags & IR3_INSTR_O)
|
||||
mesa_log_stream_printf(stream, ".o");
|
||||
if (instr->flags & IR3_INSTR_P)
|
||||
mesa_log_stream_printf(stream, ".p");
|
||||
if (instr->flags & IR3_INSTR_S)
|
||||
mesa_log_stream_printf(stream, ".s");
|
||||
if (instr->flags & IR3_INSTR_A1EN)
|
||||
mesa_log_stream_printf(stream, ".a1en");
|
||||
if (instr->opc == OPC_LDC)
|
||||
mesa_log_stream_printf(stream, ".offset%d", instr->cat6.d);
|
||||
if (instr->flags & IR3_INSTR_B) {
|
||||
mesa_log_stream_printf(
|
||||
stream, ".base%d",
|
||||
is_tex(instr) ? instr->cat5.tex_base : instr->cat6.base);
|
||||
}
|
||||
if (instr->flags & IR3_INSTR_S2EN)
|
||||
mesa_log_stream_printf(stream, ".s2en");
|
||||
|
||||
static const char *cond[0x7] = {
|
||||
"lt",
|
||||
"le",
|
||||
"gt",
|
||||
"ge",
|
||||
"eq",
|
||||
"ne",
|
||||
};
|
||||
static const char *cond[0x7] = {
|
||||
"lt", "le", "gt", "ge", "eq", "ne",
|
||||
};
|
||||
|
||||
switch (instr->opc) {
|
||||
case OPC_CMPS_F:
|
||||
case OPC_CMPS_U:
|
||||
case OPC_CMPS_S:
|
||||
case OPC_CMPV_F:
|
||||
case OPC_CMPV_U:
|
||||
case OPC_CMPV_S:
|
||||
mesa_log_stream_printf(stream, ".%s", cond[instr->cat2.condition & 0x7]);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
switch (instr->opc) {
|
||||
case OPC_CMPS_F:
|
||||
case OPC_CMPS_U:
|
||||
case OPC_CMPS_S:
|
||||
case OPC_CMPV_F:
|
||||
case OPC_CMPV_U:
|
||||
case OPC_CMPV_S:
|
||||
mesa_log_stream_printf(stream, ".%s",
|
||||
cond[instr->cat2.condition & 0x7]);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void print_ssa_def_name(struct log_stream *stream, struct ir3_register *reg)
|
||||
static void
|
||||
print_ssa_def_name(struct log_stream *stream, struct ir3_register *reg)
|
||||
{
|
||||
mesa_log_stream_printf(stream, SYN_SSA("ssa_%u"), reg->instr->serialno);
|
||||
if (reg->name != 0)
|
||||
mesa_log_stream_printf(stream, ":%u", reg->name);
|
||||
mesa_log_stream_printf(stream, SYN_SSA("ssa_%u"), reg->instr->serialno);
|
||||
if (reg->name != 0)
|
||||
mesa_log_stream_printf(stream, ":%u", reg->name);
|
||||
}
|
||||
|
||||
static void print_ssa_name(struct log_stream *stream, struct ir3_register *reg, bool dst)
|
||||
static void
|
||||
print_ssa_name(struct log_stream *stream, struct ir3_register *reg, bool dst)
|
||||
{
|
||||
if (!dst) {
|
||||
if (!reg->def)
|
||||
mesa_log_stream_printf(stream, SYN_SSA("undef"));
|
||||
else
|
||||
print_ssa_def_name(stream, reg->def);
|
||||
} else {
|
||||
print_ssa_def_name(stream, reg);
|
||||
}
|
||||
if (!dst) {
|
||||
if (!reg->def)
|
||||
mesa_log_stream_printf(stream, SYN_SSA("undef"));
|
||||
else
|
||||
print_ssa_def_name(stream, reg->def);
|
||||
} else {
|
||||
print_ssa_def_name(stream, reg);
|
||||
}
|
||||
|
||||
if (reg->num != INVALID_REG && !(reg->flags & IR3_REG_ARRAY))
|
||||
mesa_log_stream_printf(stream, "("SYN_REG("r%u.%c")")", reg_num(reg), "xyzw"[reg_comp(reg)]);
|
||||
if (reg->num != INVALID_REG && !(reg->flags & IR3_REG_ARRAY))
|
||||
mesa_log_stream_printf(stream, "(" SYN_REG("r%u.%c") ")", reg_num(reg),
|
||||
"xyzw"[reg_comp(reg)]);
|
||||
}
|
||||
|
||||
static void print_reg_name(struct log_stream *stream, struct ir3_instruction *instr,
|
||||
struct ir3_register *reg, bool dest)
|
||||
static void
|
||||
print_reg_name(struct log_stream *stream, struct ir3_instruction *instr,
|
||||
struct ir3_register *reg, bool dest)
|
||||
{
|
||||
if ((reg->flags & (IR3_REG_FABS | IR3_REG_SABS)) &&
|
||||
(reg->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT)))
|
||||
mesa_log_stream_printf(stream, "(absneg)");
|
||||
else if (reg->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT))
|
||||
mesa_log_stream_printf(stream, "(neg)");
|
||||
else if (reg->flags & (IR3_REG_FABS | IR3_REG_SABS))
|
||||
mesa_log_stream_printf(stream, "(abs)");
|
||||
if ((reg->flags & (IR3_REG_FABS | IR3_REG_SABS)) &&
|
||||
(reg->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT)))
|
||||
mesa_log_stream_printf(stream, "(absneg)");
|
||||
else if (reg->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT))
|
||||
mesa_log_stream_printf(stream, "(neg)");
|
||||
else if (reg->flags & (IR3_REG_FABS | IR3_REG_SABS))
|
||||
mesa_log_stream_printf(stream, "(abs)");
|
||||
|
||||
if (reg->flags & IR3_REG_FIRST_KILL)
|
||||
mesa_log_stream_printf(stream, "(kill)");
|
||||
if (reg->flags & IR3_REG_UNUSED)
|
||||
mesa_log_stream_printf(stream, "(unused)");
|
||||
if (reg->flags & IR3_REG_FIRST_KILL)
|
||||
mesa_log_stream_printf(stream, "(kill)");
|
||||
if (reg->flags & IR3_REG_UNUSED)
|
||||
mesa_log_stream_printf(stream, "(unused)");
|
||||
|
||||
if (reg->flags & IR3_REG_R)
|
||||
mesa_log_stream_printf(stream, "(r)");
|
||||
if (reg->flags & IR3_REG_R)
|
||||
mesa_log_stream_printf(stream, "(r)");
|
||||
|
||||
/* Right now all instructions that use tied registers only have one
|
||||
* destination register, so we can just print (tied) as if it's a flag,
|
||||
* although it's more convenient for RA if it's a pointer.
|
||||
*/
|
||||
if (reg->tied)
|
||||
printf("(tied)");
|
||||
/* Right now all instructions that use tied registers only have one
|
||||
* destination register, so we can just print (tied) as if it's a flag,
|
||||
* although it's more convenient for RA if it's a pointer.
|
||||
*/
|
||||
if (reg->tied)
|
||||
printf("(tied)");
|
||||
|
||||
if (reg->flags & IR3_REG_SHARED)
|
||||
mesa_log_stream_printf(stream, "s");
|
||||
if (reg->flags & IR3_REG_HALF)
|
||||
mesa_log_stream_printf(stream, "h");
|
||||
if (reg->flags & IR3_REG_SHARED)
|
||||
mesa_log_stream_printf(stream, "s");
|
||||
if (reg->flags & IR3_REG_HALF)
|
||||
mesa_log_stream_printf(stream, "h");
|
||||
|
||||
if (reg->flags & IR3_REG_IMMED) {
|
||||
mesa_log_stream_printf(stream, SYN_IMMED("imm[%f,%d,0x%x]"), reg->fim_val, reg->iim_val, reg->iim_val);
|
||||
} else if (reg->flags & IR3_REG_ARRAY) {
|
||||
if (reg->flags & IR3_REG_SSA) {
|
||||
print_ssa_name(stream, reg, dest);
|
||||
mesa_log_stream_printf(stream, ":");
|
||||
}
|
||||
mesa_log_stream_printf(stream, SYN_ARRAY("arr[id=%u, offset=%d, size=%u]"), reg->array.id,
|
||||
reg->array.offset, reg->size);
|
||||
if (reg->array.base != INVALID_REG)
|
||||
mesa_log_stream_printf(stream, "("SYN_REG("r%u.%c")")", reg->array.base >> 2,
|
||||
"xyzw"[reg->array.base & 0x3]);
|
||||
} else if (reg->flags & IR3_REG_SSA) {
|
||||
print_ssa_name(stream, reg, dest);
|
||||
} else if (reg->flags & IR3_REG_RELATIV) {
|
||||
if (reg->flags & IR3_REG_CONST)
|
||||
mesa_log_stream_printf(stream, SYN_CONST("c<a0.x + %d>"), reg->array.offset);
|
||||
else
|
||||
mesa_log_stream_printf(stream, SYN_REG("r<a0.x + %d>")" (%u)", reg->array.offset, reg->size);
|
||||
} else {
|
||||
if (reg->flags & IR3_REG_CONST)
|
||||
mesa_log_stream_printf(stream, SYN_CONST("c%u.%c"), reg_num(reg), "xyzw"[reg_comp(reg)]);
|
||||
else
|
||||
mesa_log_stream_printf(stream, SYN_REG("r%u.%c"), reg_num(reg), "xyzw"[reg_comp(reg)]);
|
||||
}
|
||||
if (reg->flags & IR3_REG_IMMED) {
|
||||
mesa_log_stream_printf(stream, SYN_IMMED("imm[%f,%d,0x%x]"), reg->fim_val,
|
||||
reg->iim_val, reg->iim_val);
|
||||
} else if (reg->flags & IR3_REG_ARRAY) {
|
||||
if (reg->flags & IR3_REG_SSA) {
|
||||
print_ssa_name(stream, reg, dest);
|
||||
mesa_log_stream_printf(stream, ":");
|
||||
}
|
||||
mesa_log_stream_printf(stream,
|
||||
SYN_ARRAY("arr[id=%u, offset=%d, size=%u]"),
|
||||
reg->array.id, reg->array.offset, reg->size);
|
||||
if (reg->array.base != INVALID_REG)
|
||||
mesa_log_stream_printf(stream, "(" SYN_REG("r%u.%c") ")",
|
||||
reg->array.base >> 2,
|
||||
"xyzw"[reg->array.base & 0x3]);
|
||||
} else if (reg->flags & IR3_REG_SSA) {
|
||||
print_ssa_name(stream, reg, dest);
|
||||
} else if (reg->flags & IR3_REG_RELATIV) {
|
||||
if (reg->flags & IR3_REG_CONST)
|
||||
mesa_log_stream_printf(stream, SYN_CONST("c<a0.x + %d>"),
|
||||
reg->array.offset);
|
||||
else
|
||||
mesa_log_stream_printf(stream, SYN_REG("r<a0.x + %d>") " (%u)",
|
||||
reg->array.offset, reg->size);
|
||||
} else {
|
||||
if (reg->flags & IR3_REG_CONST)
|
||||
mesa_log_stream_printf(stream, SYN_CONST("c%u.%c"), reg_num(reg),
|
||||
"xyzw"[reg_comp(reg)]);
|
||||
else
|
||||
mesa_log_stream_printf(stream, SYN_REG("r%u.%c"), reg_num(reg),
|
||||
"xyzw"[reg_comp(reg)]);
|
||||
}
|
||||
|
||||
if (reg->wrmask > 0x1)
|
||||
mesa_log_stream_printf(stream, " (wrmask=0x%x)", reg->wrmask);
|
||||
if (reg->wrmask > 0x1)
|
||||
mesa_log_stream_printf(stream, " (wrmask=0x%x)", reg->wrmask);
|
||||
}
|
||||
|
||||
/* Emit lvl tab characters to the stream (nothing when lvl <= 0). */
static void
tab(struct log_stream *stream, int lvl)
{
   for (int remaining = lvl; remaining > 0; remaining--) {
      mesa_log_stream_printf(stream, "\t");
   }
}
|
||||
|
||||
static void
|
||||
print_instr(struct log_stream *stream, struct ir3_instruction *instr, int lvl)
|
||||
{
|
||||
tab(stream, lvl);
|
||||
tab(stream, lvl);
|
||||
|
||||
print_instr_name(stream, instr, true);
|
||||
print_instr_name(stream, instr, true);
|
||||
|
||||
if (is_tex(instr)) {
|
||||
mesa_log_stream_printf(stream, " (%s)(", type_name(instr->cat5.type));
|
||||
for (unsigned i = 0; i < 4; i++)
|
||||
if (instr->dsts[0]->wrmask & (1 << i))
|
||||
mesa_log_stream_printf(stream, "%c", "xyzw"[i]);
|
||||
mesa_log_stream_printf(stream, ")");
|
||||
} else if ((instr->srcs_count > 0 || instr->dsts_count > 0) && (instr->opc != OPC_B)) {
|
||||
/* NOTE the b(ranch) instruction has a suffix, which is
|
||||
* handled below
|
||||
*/
|
||||
mesa_log_stream_printf(stream, " ");
|
||||
}
|
||||
if (is_tex(instr)) {
|
||||
mesa_log_stream_printf(stream, " (%s)(", type_name(instr->cat5.type));
|
||||
for (unsigned i = 0; i < 4; i++)
|
||||
if (instr->dsts[0]->wrmask & (1 << i))
|
||||
mesa_log_stream_printf(stream, "%c", "xyzw"[i]);
|
||||
mesa_log_stream_printf(stream, ")");
|
||||
} else if ((instr->srcs_count > 0 || instr->dsts_count > 0) &&
|
||||
(instr->opc != OPC_B)) {
|
||||
/* NOTE the b(ranch) instruction has a suffix, which is
|
||||
* handled below
|
||||
*/
|
||||
mesa_log_stream_printf(stream, " ");
|
||||
}
|
||||
|
||||
if (!is_flow(instr) || instr->opc == OPC_END || instr->opc == OPC_CHMASK) {
|
||||
bool first = true;
|
||||
foreach_dst (reg, instr) {
|
||||
if (reg->wrmask == 0)
|
||||
continue;
|
||||
if (!first)
|
||||
mesa_log_stream_printf(stream, ", ");
|
||||
print_reg_name(stream, instr, reg, true);
|
||||
first = false;
|
||||
}
|
||||
foreach_src (reg, instr) {
|
||||
if (!first)
|
||||
mesa_log_stream_printf(stream, ", ");
|
||||
print_reg_name(stream, instr, reg, false);
|
||||
first = false;
|
||||
}
|
||||
}
|
||||
if (!is_flow(instr) || instr->opc == OPC_END || instr->opc == OPC_CHMASK) {
|
||||
bool first = true;
|
||||
foreach_dst (reg, instr) {
|
||||
if (reg->wrmask == 0)
|
||||
continue;
|
||||
if (!first)
|
||||
mesa_log_stream_printf(stream, ", ");
|
||||
print_reg_name(stream, instr, reg, true);
|
||||
first = false;
|
||||
}
|
||||
foreach_src (reg, instr) {
|
||||
if (!first)
|
||||
mesa_log_stream_printf(stream, ", ");
|
||||
print_reg_name(stream, instr, reg, false);
|
||||
first = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (is_tex(instr) && !(instr->flags & IR3_INSTR_S2EN)) {
|
||||
if (!!(instr->flags & IR3_INSTR_B)) {
|
||||
if (!!(instr->flags & IR3_INSTR_A1EN)) {
|
||||
mesa_log_stream_printf(stream, ", s#%d", instr->cat5.samp);
|
||||
} else {
|
||||
mesa_log_stream_printf(stream, ", s#%d, t#%d", instr->cat5.samp & 0xf,
|
||||
instr->cat5.samp >> 4);
|
||||
}
|
||||
} else {
|
||||
mesa_log_stream_printf(stream, ", s#%d, t#%d", instr->cat5.samp, instr->cat5.tex);
|
||||
}
|
||||
}
|
||||
if (is_tex(instr) && !(instr->flags & IR3_INSTR_S2EN)) {
|
||||
if (!!(instr->flags & IR3_INSTR_B)) {
|
||||
if (!!(instr->flags & IR3_INSTR_A1EN)) {
|
||||
mesa_log_stream_printf(stream, ", s#%d", instr->cat5.samp);
|
||||
} else {
|
||||
mesa_log_stream_printf(stream, ", s#%d, t#%d",
|
||||
instr->cat5.samp & 0xf,
|
||||
instr->cat5.samp >> 4);
|
||||
}
|
||||
} else {
|
||||
mesa_log_stream_printf(stream, ", s#%d, t#%d", instr->cat5.samp,
|
||||
instr->cat5.tex);
|
||||
}
|
||||
}
|
||||
|
||||
if (instr->opc == OPC_META_SPLIT) {
|
||||
mesa_log_stream_printf(stream, ", off=%d", instr->split.off);
|
||||
} else if (instr->opc == OPC_META_TEX_PREFETCH) {
|
||||
mesa_log_stream_printf(stream, ", tex=%d, samp=%d, input_offset=%d", instr->prefetch.tex,
|
||||
instr->prefetch.samp, instr->prefetch.input_offset);
|
||||
}
|
||||
if (instr->opc == OPC_META_SPLIT) {
|
||||
mesa_log_stream_printf(stream, ", off=%d", instr->split.off);
|
||||
} else if (instr->opc == OPC_META_TEX_PREFETCH) {
|
||||
mesa_log_stream_printf(stream, ", tex=%d, samp=%d, input_offset=%d",
|
||||
instr->prefetch.tex, instr->prefetch.samp,
|
||||
instr->prefetch.input_offset);
|
||||
}
|
||||
|
||||
if (is_flow(instr) && instr->cat0.target) {
|
||||
/* the predicate register src is implied: */
|
||||
if (instr->opc == OPC_B) {
|
||||
static const struct {
|
||||
const char *suffix;
|
||||
int nsrc;
|
||||
bool idx;
|
||||
} brinfo[7] = {
|
||||
[BRANCH_PLAIN] = { "r", 1, false },
|
||||
[BRANCH_OR] = { "rao", 2, false },
|
||||
[BRANCH_AND] = { "raa", 2, false },
|
||||
[BRANCH_CONST] = { "rac", 0, true },
|
||||
[BRANCH_ANY] = { "any", 1, false },
|
||||
[BRANCH_ALL] = { "all", 1, false },
|
||||
[BRANCH_X] = { "rax", 0, false },
|
||||
};
|
||||
if (is_flow(instr) && instr->cat0.target) {
|
||||
/* the predicate register src is implied: */
|
||||
if (instr->opc == OPC_B) {
|
||||
static const struct {
|
||||
const char *suffix;
|
||||
int nsrc;
|
||||
bool idx;
|
||||
} brinfo[7] = {
|
||||
[BRANCH_PLAIN] = {"r", 1, false}, [BRANCH_OR] = {"rao", 2, false},
|
||||
[BRANCH_AND] = {"raa", 2, false}, [BRANCH_CONST] = {"rac", 0, true},
|
||||
[BRANCH_ANY] = {"any", 1, false}, [BRANCH_ALL] = {"all", 1, false},
|
||||
[BRANCH_X] = {"rax", 0, false},
|
||||
};
|
||||
|
||||
mesa_log_stream_printf(stream, "%s", brinfo[instr->cat0.brtype].suffix);
|
||||
if (brinfo[instr->cat0.brtype].idx) {
|
||||
mesa_log_stream_printf(stream, ".%u", instr->cat0.idx);
|
||||
}
|
||||
if (brinfo[instr->cat0.brtype].nsrc >= 1) {
|
||||
mesa_log_stream_printf(stream, " %sp0.%c (",
|
||||
instr->cat0.inv1 ? "!" : "",
|
||||
"xyzw"[instr->cat0.comp1 & 0x3]);
|
||||
print_reg_name(stream, instr, instr->srcs[0], false);
|
||||
mesa_log_stream_printf(stream, "), ");
|
||||
}
|
||||
if (brinfo[instr->cat0.brtype].nsrc >= 2) {
|
||||
mesa_log_stream_printf(stream, " %sp0.%c (",
|
||||
instr->cat0.inv2 ? "!" : "",
|
||||
"xyzw"[instr->cat0.comp2 & 0x3]);
|
||||
print_reg_name(stream, instr, instr->srcs[1], false);
|
||||
mesa_log_stream_printf(stream, "), ");
|
||||
}
|
||||
}
|
||||
mesa_log_stream_printf(stream, " target=block%u", block_id(instr->cat0.target));
|
||||
}
|
||||
mesa_log_stream_printf(stream, "%s",
|
||||
brinfo[instr->cat0.brtype].suffix);
|
||||
if (brinfo[instr->cat0.brtype].idx) {
|
||||
mesa_log_stream_printf(stream, ".%u", instr->cat0.idx);
|
||||
}
|
||||
if (brinfo[instr->cat0.brtype].nsrc >= 1) {
|
||||
mesa_log_stream_printf(stream, " %sp0.%c (",
|
||||
instr->cat0.inv1 ? "!" : "",
|
||||
"xyzw"[instr->cat0.comp1 & 0x3]);
|
||||
print_reg_name(stream, instr, instr->srcs[0], false);
|
||||
mesa_log_stream_printf(stream, "), ");
|
||||
}
|
||||
if (brinfo[instr->cat0.brtype].nsrc >= 2) {
|
||||
mesa_log_stream_printf(stream, " %sp0.%c (",
|
||||
instr->cat0.inv2 ? "!" : "",
|
||||
"xyzw"[instr->cat0.comp2 & 0x3]);
|
||||
print_reg_name(stream, instr, instr->srcs[1], false);
|
||||
mesa_log_stream_printf(stream, "), ");
|
||||
}
|
||||
}
|
||||
mesa_log_stream_printf(stream, " target=block%u",
|
||||
block_id(instr->cat0.target));
|
||||
}
|
||||
|
||||
if (instr->deps_count) {
|
||||
mesa_log_stream_printf(stream, ", false-deps:");
|
||||
unsigned n = 0;
|
||||
for (unsigned i = 0; i < instr->deps_count; i++) {
|
||||
if (!instr->deps[i])
|
||||
continue;
|
||||
if (n++ > 0)
|
||||
mesa_log_stream_printf(stream, ", ");
|
||||
mesa_log_stream_printf(stream, SYN_SSA("ssa_%u"), instr->deps[i]->serialno);
|
||||
}
|
||||
}
|
||||
if (instr->deps_count) {
|
||||
mesa_log_stream_printf(stream, ", false-deps:");
|
||||
unsigned n = 0;
|
||||
for (unsigned i = 0; i < instr->deps_count; i++) {
|
||||
if (!instr->deps[i])
|
||||
continue;
|
||||
if (n++ > 0)
|
||||
mesa_log_stream_printf(stream, ", ");
|
||||
mesa_log_stream_printf(stream, SYN_SSA("ssa_%u"),
|
||||
instr->deps[i]->serialno);
|
||||
}
|
||||
}
|
||||
|
||||
mesa_log_stream_printf(stream, "\n");
|
||||
mesa_log_stream_printf(stream, "\n");
|
||||
}
|
||||
|
||||
/* Print one instruction (no indentation) through a temporary info-level
 * log stream, which is destroyed again before returning.
 */
void
ir3_print_instr(struct ir3_instruction *instr)
{
   struct log_stream *out = mesa_log_streami();

   print_instr(out, instr, 0);

   mesa_log_stream_destroy(out);
}
|
||||
|
||||
static void
|
||||
print_block(struct ir3_block *block, int lvl)
|
||||
{
|
||||
struct log_stream *stream = mesa_log_streami();
|
||||
struct log_stream *stream = mesa_log_streami();
|
||||
|
||||
tab(stream, lvl); mesa_log_stream_printf(stream, "block%u {\n", block_id(block));
|
||||
tab(stream, lvl);
|
||||
mesa_log_stream_printf(stream, "block%u {\n", block_id(block));
|
||||
|
||||
if (block->predecessors_count > 0) {
|
||||
tab(stream, lvl+1);
|
||||
mesa_log_stream_printf(stream, "pred: ");
|
||||
for (unsigned i = 0; i < block->predecessors_count; i++) {
|
||||
struct ir3_block *pred = block->predecessors[i];
|
||||
if (i != 0)
|
||||
mesa_log_stream_printf(stream, ", ");
|
||||
mesa_log_stream_printf(stream, "block%u", block_id(pred));
|
||||
}
|
||||
mesa_log_stream_printf(stream, "\n");
|
||||
}
|
||||
if (block->predecessors_count > 0) {
|
||||
tab(stream, lvl + 1);
|
||||
mesa_log_stream_printf(stream, "pred: ");
|
||||
for (unsigned i = 0; i < block->predecessors_count; i++) {
|
||||
struct ir3_block *pred = block->predecessors[i];
|
||||
if (i != 0)
|
||||
mesa_log_stream_printf(stream, ", ");
|
||||
mesa_log_stream_printf(stream, "block%u", block_id(pred));
|
||||
}
|
||||
mesa_log_stream_printf(stream, "\n");
|
||||
}
|
||||
|
||||
foreach_instr (instr, &block->instr_list) {
|
||||
print_instr(stream, instr, lvl+1);
|
||||
}
|
||||
foreach_instr (instr, &block->instr_list) {
|
||||
print_instr(stream, instr, lvl + 1);
|
||||
}
|
||||
|
||||
tab(stream, lvl+1); mesa_log_stream_printf(stream, "/* keeps:\n");
|
||||
for (unsigned i = 0; i < block->keeps_count; i++) {
|
||||
print_instr(stream, block->keeps[i], lvl+2);
|
||||
}
|
||||
tab(stream, lvl+1); mesa_log_stream_printf(stream, " */\n");
|
||||
tab(stream, lvl + 1);
|
||||
mesa_log_stream_printf(stream, "/* keeps:\n");
|
||||
for (unsigned i = 0; i < block->keeps_count; i++) {
|
||||
print_instr(stream, block->keeps[i], lvl + 2);
|
||||
}
|
||||
tab(stream, lvl + 1);
|
||||
mesa_log_stream_printf(stream, " */\n");
|
||||
|
||||
if (block->successors[1]) {
|
||||
/* leading into if/else: */
|
||||
tab(stream, lvl+1);
|
||||
mesa_log_stream_printf(stream, "/* succs: if ");
|
||||
switch (block->brtype) {
|
||||
case IR3_BRANCH_COND:
|
||||
break;
|
||||
case IR3_BRANCH_ANY:
|
||||
printf("any ");
|
||||
break;
|
||||
case IR3_BRANCH_ALL:
|
||||
printf("all ");
|
||||
break;
|
||||
case IR3_BRANCH_GETONE:
|
||||
printf("getone ");
|
||||
break;
|
||||
}
|
||||
if (block->condition)
|
||||
mesa_log_stream_printf(stream, SYN_SSA("ssa_%u")" ", block->condition->serialno);
|
||||
mesa_log_stream_printf(stream, "block%u; else block%u; */\n",
|
||||
block_id(block->successors[0]),
|
||||
block_id(block->successors[1]));
|
||||
} else if (block->successors[0]) {
|
||||
tab(stream, lvl+1);
|
||||
mesa_log_stream_printf(stream, "/* succs: block%u; */\n",
|
||||
block_id(block->successors[0]));
|
||||
}
|
||||
tab(stream, lvl); mesa_log_stream_printf(stream, "}\n");
|
||||
if (block->successors[1]) {
|
||||
/* leading into if/else: */
|
||||
tab(stream, lvl + 1);
|
||||
mesa_log_stream_printf(stream, "/* succs: if ");
|
||||
switch (block->brtype) {
|
||||
case IR3_BRANCH_COND:
|
||||
break;
|
||||
case IR3_BRANCH_ANY:
|
||||
printf("any ");
|
||||
break;
|
||||
case IR3_BRANCH_ALL:
|
||||
printf("all ");
|
||||
break;
|
||||
case IR3_BRANCH_GETONE:
|
||||
printf("getone ");
|
||||
break;
|
||||
}
|
||||
if (block->condition)
|
||||
mesa_log_stream_printf(stream, SYN_SSA("ssa_%u") " ",
|
||||
block->condition->serialno);
|
||||
mesa_log_stream_printf(stream, "block%u; else block%u; */\n",
|
||||
block_id(block->successors[0]),
|
||||
block_id(block->successors[1]));
|
||||
} else if (block->successors[0]) {
|
||||
tab(stream, lvl + 1);
|
||||
mesa_log_stream_printf(stream, "/* succs: block%u; */\n",
|
||||
block_id(block->successors[0]));
|
||||
}
|
||||
tab(stream, lvl);
|
||||
mesa_log_stream_printf(stream, "}\n");
|
||||
}
|
||||
|
||||
void
|
||||
ir3_print(struct ir3 *ir)
|
||||
{
|
||||
foreach_block (block, &ir->block_list)
|
||||
print_block(block, 0);
|
||||
foreach_block (block, &ir->block_list)
|
||||
print_block(block, 0);
|
||||
}
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -24,62 +24,68 @@
|
|||
#ifndef _IR3_RA_H
|
||||
#define _IR3_RA_H
|
||||
|
||||
#include "util/rb_tree.h"
|
||||
#include "ir3.h"
|
||||
#include "ir3_compiler.h"
|
||||
#include "util/rb_tree.h"
|
||||
|
||||
/* RA_DEBUG is non-zero only in DEBUG builds with the IR3_DBG_RAMSGS bit set
 * in ir3_shader_debug; otherwise it is the constant 0.
 */
#ifndef DEBUG
#define RA_DEBUG 0
#else
#define RA_DEBUG (ir3_shader_debug & IR3_DBG_RAMSGS)
#endif
|
||||
/* Debug print: when RA_DEBUG is set, printf() the message prefixed with
 * "RA: " and followed by a newline.  fmt must be a string literal because it
 * is concatenated with the prefix and suffix.
 */
#define d(fmt, ...) \
   do { \
      if (RA_DEBUG) { \
         printf("RA: " fmt "\n", ##__VA_ARGS__); \
      } \
   } while (0)
|
||||
|
||||
/* Debug print with an instruction: when RA_DEBUG is set, printf() the message
 * prefixed with "RA: " and followed by ": ", then print instr with
 * ir3_print_instr().  fmt must be a string literal.
 */
#define di(instr, fmt, ...) \
   do { \
      if (RA_DEBUG) { \
         printf("RA: " fmt ": ", ##__VA_ARGS__); \
         ir3_print_instr(instr); \
      } \
   } while (0)
|
||||
|
||||
/* Index into a register file as used by RA.
 * NOTE(review): ra_physreg_to_num() halves this value for non-half registers,
 * so it appears to be counted in half-register units -- confirm.
 */
typedef uint16_t physreg_t;
|
||||
|
||||
static inline unsigned
|
||||
ra_physreg_to_num(physreg_t physreg, unsigned flags)
|
||||
{
|
||||
if (!(flags & IR3_REG_HALF))
|
||||
physreg /= 2;
|
||||
if (flags & IR3_REG_SHARED)
|
||||
physreg += 48 * 4;
|
||||
return physreg;
|
||||
if (!(flags & IR3_REG_HALF))
|
||||
physreg /= 2;
|
||||
if (flags & IR3_REG_SHARED)
|
||||
physreg += 48 * 4;
|
||||
return physreg;
|
||||
}
|
||||
|
||||
static inline physreg_t
|
||||
ra_num_to_physreg(unsigned num, unsigned flags)
|
||||
{
|
||||
if (flags & IR3_REG_SHARED)
|
||||
num -= 48 * 4;
|
||||
if (!(flags & IR3_REG_HALF))
|
||||
num *= 2;
|
||||
return num;
|
||||
if (flags & IR3_REG_SHARED)
|
||||
num -= 48 * 4;
|
||||
if (!(flags & IR3_REG_HALF))
|
||||
num *= 2;
|
||||
return num;
|
||||
}
|
||||
|
||||
static inline unsigned
|
||||
ra_reg_get_num(const struct ir3_register *reg)
|
||||
{
|
||||
return (reg->flags & IR3_REG_ARRAY) ? reg->array.base : reg->num;
|
||||
return (reg->flags & IR3_REG_ARRAY) ? reg->array.base : reg->num;
|
||||
}
|
||||
|
||||
static inline physreg_t
|
||||
ra_reg_get_physreg(const struct ir3_register *reg)
|
||||
{
|
||||
return ra_num_to_physreg(ra_reg_get_num(reg), reg->flags);
|
||||
return ra_num_to_physreg(ra_reg_get_num(reg), reg->flags);
|
||||
}
|
||||
|
||||
static inline bool
|
||||
def_is_gpr(const struct ir3_register *reg)
|
||||
{
|
||||
return reg_num(reg) != REG_A0 && reg_num(reg) != REG_P0;
|
||||
return reg_num(reg) != REG_A0 && reg_num(reg) != REG_P0;
|
||||
}
|
||||
|
||||
/* Note: don't count undef as a source.
|
||||
|
@ -87,16 +93,14 @@ def_is_gpr(const struct ir3_register *reg)
|
|||
static inline bool
|
||||
ra_reg_is_src(const struct ir3_register *reg)
|
||||
{
|
||||
return (reg->flags & IR3_REG_SSA) && reg->def &&
|
||||
def_is_gpr(reg->def);
|
||||
return (reg->flags & IR3_REG_SSA) && reg->def && def_is_gpr(reg->def);
|
||||
}
|
||||
|
||||
static inline bool
|
||||
ra_reg_is_dst(const struct ir3_register *reg)
|
||||
{
|
||||
return (reg->flags & IR3_REG_SSA) &&
|
||||
def_is_gpr(reg) &&
|
||||
((reg->flags & IR3_REG_ARRAY) || reg->wrmask);
|
||||
return (reg->flags & IR3_REG_SSA) && def_is_gpr(reg) &&
|
||||
((reg->flags & IR3_REG_ARRAY) || reg->wrmask);
|
||||
}
|
||||
|
||||
/* Iterators for sources and destinations which:
|
||||
|
@ -105,53 +109,54 @@ ra_reg_is_dst(const struct ir3_register *reg)
|
|||
* - Consider array destinations as both a source and a destination
|
||||
*/
|
||||
|
||||
/* Iterate, in order, over the sources of __instr for which ra_reg_is_src()
 * holds.  The outer single-pass loop only exists to declare __srcreg.
 */
#define ra_foreach_src(__srcreg, __instr) \
   for (struct ir3_register *__srcreg = (void *)~0; __srcreg; __srcreg = NULL) \
      for (unsigned __cnt = (__instr)->srcs_count, __i = 0; __i < __cnt; \
           __i++) \
         if (ra_reg_is_src((__srcreg = (__instr)->srcs[__i])))
|
||||
|
||||
/* Same as ra_foreach_src, but walks the sources from last to first. */
#define ra_foreach_src_rev(__srcreg, __instr) \
   for (struct ir3_register *__srcreg = (void *)~0; __srcreg; __srcreg = NULL) \
      for (int __cnt = (__instr)->srcs_count, __i = __cnt - 1; __i >= 0; \
           __i--) \
         if (ra_reg_is_src((__srcreg = (__instr)->srcs[__i])))
|
||||
|
||||
/* Iterate, in order, over the destinations of __instr for which
 * ra_reg_is_dst() holds.
 */
#define ra_foreach_dst(__dstreg, __instr) \
   for (struct ir3_register *__dstreg = (void *)~0; __dstreg; __dstreg = NULL) \
      for (unsigned __cnt = (__instr)->dsts_count, __i = 0; __i < __cnt; \
           __i++) \
         if (ra_reg_is_dst((__dstreg = (__instr)->dsts[__i])))
|
||||
|
||||
|
||||
/* Sizes of the register files seen by RA; RA_MAX_FILE_SIZE is the largest
 * of them and is used to size per-file arrays.
 */
#define RA_HALF_SIZE     (4 * 48)
#define RA_FULL_SIZE     (4 * 48 * 2)
#define RA_SHARED_SIZE   (2 * 4 * 8)
#define RA_MAX_FILE_SIZE RA_FULL_SIZE
|
||||
|
||||
struct ir3_liveness {
|
||||
unsigned block_count;
|
||||
DECLARE_ARRAY(struct ir3_register *, definitions);
|
||||
DECLARE_ARRAY(BITSET_WORD *, live_out);
|
||||
DECLARE_ARRAY(BITSET_WORD *, live_in);
|
||||
unsigned block_count;
|
||||
DECLARE_ARRAY(struct ir3_register *, definitions);
|
||||
DECLARE_ARRAY(BITSET_WORD *, live_out);
|
||||
DECLARE_ARRAY(BITSET_WORD *, live_in);
|
||||
};
|
||||
|
||||
/* Declarations only; the definitions are not part of this header. */
struct ir3_liveness *ir3_calc_liveness(struct ir3_shader_variant *v);

bool ir3_def_live_after(struct ir3_liveness *live, struct ir3_register *def,
                        struct ir3_instruction *instr);

void ir3_create_parallel_copies(struct ir3 *ir);

void ir3_merge_regs(struct ir3_liveness *live, struct ir3 *ir);
|
||||
|
||||
/* Register pressure, tracked separately for the full, half and shared
 * register files.
 */
struct ir3_pressure {
   unsigned full, half, shared;
};
|
||||
|
||||
/* Declarations only; the definitions are not part of this header. */
void ir3_calc_pressure(struct ir3_shader_variant *v, struct ir3_liveness *live,
                       struct ir3_pressure *max_pressure);

void ir3_ra_validate(struct ir3_shader_variant *v, unsigned full_size,
                     unsigned half_size, unsigned block_count);

void ir3_lower_copies(struct ir3_shader_variant *v);
|
||||
|
||||
|
@ -176,91 +181,90 @@ void ir3_lower_copies(struct ir3_shader_variant *v);
|
|||
*/
|
||||
|
||||
struct ir3_reg_interval {
|
||||
struct rb_node node;
|
||||
struct rb_node node;
|
||||
|
||||
struct rb_tree children;
|
||||
struct rb_tree children;
|
||||
|
||||
struct ir3_reg_interval *parent;
|
||||
struct ir3_reg_interval *parent;
|
||||
|
||||
struct ir3_register *reg;
|
||||
struct ir3_register *reg;
|
||||
|
||||
bool inserted;
|
||||
bool inserted;
|
||||
};
|
||||
|
||||
struct ir3_reg_ctx {
|
||||
/* The tree of top-level intervals in the forest. */
|
||||
struct rb_tree intervals;
|
||||
/* The tree of top-level intervals in the forest. */
|
||||
struct rb_tree intervals;
|
||||
|
||||
/* Users of ir3_reg_ctx need to keep around additional state that is
|
||||
* modified when top-level intervals are added or removed. For register
|
||||
* pressure tracking, this is just the register pressure, but for RA we
|
||||
* need to keep track of the physreg of each top-level interval. These
|
||||
* callbacks provide a place to let users deriving from ir3_reg_ctx update
|
||||
* their state when top-level intervals are inserted/removed.
|
||||
*/
|
||||
/* Users of ir3_reg_ctx need to keep around additional state that is
|
||||
* modified when top-level intervals are added or removed. For register
|
||||
* pressure tracking, this is just the register pressure, but for RA we
|
||||
* need to keep track of the physreg of each top-level interval. These
|
||||
* callbacks provide a place to let users deriving from ir3_reg_ctx update
|
||||
* their state when top-level intervals are inserted/removed.
|
||||
*/
|
||||
|
||||
/* Called when an interval is added and it turns out to be at the top
|
||||
* level.
|
||||
*/
|
||||
void (*interval_add)(struct ir3_reg_ctx *ctx,
|
||||
struct ir3_reg_interval *interval);
|
||||
/* Called when an interval is added and it turns out to be at the top
|
||||
* level.
|
||||
*/
|
||||
void (*interval_add)(struct ir3_reg_ctx *ctx,
|
||||
struct ir3_reg_interval *interval);
|
||||
|
||||
/* Called when an interval is deleted from the top level. */
|
||||
void (*interval_delete)(struct ir3_reg_ctx *ctx,
|
||||
struct ir3_reg_interval *interval);
|
||||
/* Called when an interval is deleted from the top level. */
|
||||
void (*interval_delete)(struct ir3_reg_ctx *ctx,
|
||||
struct ir3_reg_interval *interval);
|
||||
|
||||
/* Called when an interval is deleted and its child becomes top-level.
|
||||
*/
|
||||
void (*interval_readd)(struct ir3_reg_ctx *ctx,
|
||||
struct ir3_reg_interval *parent,
|
||||
struct ir3_reg_interval *child);
|
||||
/* Called when an interval is deleted and its child becomes top-level.
|
||||
*/
|
||||
void (*interval_readd)(struct ir3_reg_ctx *ctx,
|
||||
struct ir3_reg_interval *parent,
|
||||
struct ir3_reg_interval *child);
|
||||
};
|
||||
|
||||
static inline struct ir3_reg_interval *
|
||||
ir3_rb_node_to_interval(struct rb_node *node)
|
||||
{
|
||||
return rb_node_data(struct ir3_reg_interval, node, node);
|
||||
return rb_node_data(struct ir3_reg_interval, node, node);
|
||||
}
|
||||
|
||||
static inline const struct ir3_reg_interval *
|
||||
ir3_rb_node_to_interval_const(const struct rb_node *node)
|
||||
{
|
||||
return rb_node_data(struct ir3_reg_interval, node, node);
|
||||
return rb_node_data(struct ir3_reg_interval, node, node);
|
||||
}
|
||||
|
||||
static inline struct ir3_reg_interval *
|
||||
ir3_reg_interval_next(struct ir3_reg_interval *interval)
|
||||
{
|
||||
struct rb_node *next = rb_node_next(&interval->node);
|
||||
return next ? ir3_rb_node_to_interval(next) : NULL;
|
||||
struct rb_node *next = rb_node_next(&interval->node);
|
||||
return next ? ir3_rb_node_to_interval(next) : NULL;
|
||||
}
|
||||
|
||||
/* NULL-tolerant wrapper around ir3_reg_interval_next(). */
static inline struct ir3_reg_interval *
ir3_reg_interval_next_or_null(struct ir3_reg_interval *interval)
{
   if (!interval)
      return NULL;

   return ir3_reg_interval_next(interval);
}
|
||||
|
||||
static inline void
|
||||
ir3_reg_interval_init(struct ir3_reg_interval *interval, struct ir3_register *reg)
|
||||
ir3_reg_interval_init(struct ir3_reg_interval *interval,
|
||||
struct ir3_register *reg)
|
||||
{
|
||||
rb_tree_init(&interval->children);
|
||||
interval->reg = reg;
|
||||
interval->parent = NULL;
|
||||
interval->inserted = false;
|
||||
rb_tree_init(&interval->children);
|
||||
interval->reg = reg;
|
||||
interval->parent = NULL;
|
||||
interval->inserted = false;
|
||||
}
|
||||
|
||||
/* Declarations only; the definitions are not part of this header. */
void ir3_reg_interval_dump(struct ir3_reg_interval *interval);

void ir3_reg_interval_insert(struct ir3_reg_ctx *ctx,
                             struct ir3_reg_interval *interval);

void ir3_reg_interval_remove(struct ir3_reg_ctx *ctx,
                             struct ir3_reg_interval *interval);

void ir3_reg_interval_remove_all(struct ir3_reg_ctx *ctx,
                                 struct ir3_reg_interval *interval);
|
||||
|
||||
#endif
|
||||
|
||||
|
|
|
@ -73,59 +73,61 @@
|
|||
*/
|
||||
|
||||
/* Sentinel "definitions": NULL and the small integers 1 and 2 cast to a
 * register pointer.
 */
#define UNKNOWN ((struct ir3_register *)NULL)
#define UNDEF   ((struct ir3_register *)(uintptr_t)1)
#define OVERDEF ((struct ir3_register *)(uintptr_t)2)
|
||||
|
||||
/* State of one physical register slot: a definition (or one of the
 * UNKNOWN/UNDEF/OVERDEF sentinels) plus an offset.
 */
struct reg_state {
   struct ir3_register *def;
   unsigned offset;
};
|
||||
|
||||
struct file_state {
|
||||
struct reg_state regs[RA_MAX_FILE_SIZE];
|
||||
struct reg_state regs[RA_MAX_FILE_SIZE];
|
||||
};
|
||||
|
||||
struct reaching_state {
|
||||
struct file_state half, full, shared;
|
||||
struct file_state half, full, shared;
|
||||
};
|
||||
|
||||
struct ra_val_ctx {
|
||||
struct ir3_instruction *current_instr;
|
||||
struct ir3_instruction *current_instr;
|
||||
|
||||
struct reaching_state reaching;
|
||||
struct reaching_state *block_reaching;
|
||||
unsigned block_count;
|
||||
struct reaching_state reaching;
|
||||
struct reaching_state *block_reaching;
|
||||
unsigned block_count;
|
||||
|
||||
unsigned full_size, half_size;
|
||||
unsigned full_size, half_size;
|
||||
|
||||
bool merged_regs;
|
||||
bool merged_regs;
|
||||
|
||||
bool failed;
|
||||
bool failed;
|
||||
};
|
||||
|
||||
static void
|
||||
validate_error(struct ra_val_ctx *ctx, const char *condstr)
|
||||
{
|
||||
fprintf(stderr, "ra validation fail: %s\n", condstr);
|
||||
fprintf(stderr, " -> for instruction: ");
|
||||
ir3_print_instr(ctx->current_instr);
|
||||
abort();
|
||||
fprintf(stderr, "ra validation fail: %s\n", condstr);
|
||||
fprintf(stderr, " -> for instruction: ");
|
||||
ir3_print_instr(ctx->current_instr);
|
||||
abort();
|
||||
}
|
||||
|
||||
/* Assert-like check: when cond is false, call validate_error() with the
 * condition's source text as the message.
 */
#define validate_assert(ctx, cond) \
   do { \
      if (!(cond)) { \
         validate_error(ctx, #cond); \
      } \
   } while (0)
|
||||
|
||||
static unsigned
|
||||
get_file_size(struct ra_val_ctx *ctx, struct ir3_register *reg)
|
||||
{
|
||||
if (reg->flags & IR3_REG_SHARED)
|
||||
return RA_SHARED_SIZE;
|
||||
else if (ctx->merged_regs || !(reg->flags & IR3_REG_HALF))
|
||||
return ctx->full_size;
|
||||
else
|
||||
return ctx->half_size;
|
||||
if (reg->flags & IR3_REG_SHARED)
|
||||
return RA_SHARED_SIZE;
|
||||
else if (ctx->merged_regs || !(reg->flags & IR3_REG_HALF))
|
||||
return ctx->full_size;
|
||||
else
|
||||
return ctx->half_size;
|
||||
}
|
||||
|
||||
/* Validate simple things, like the registers being in-bounds. This way we
|
||||
|
@ -135,438 +137,434 @@ get_file_size(struct ra_val_ctx *ctx, struct ir3_register *reg)
|
|||
static void
|
||||
validate_simple(struct ra_val_ctx *ctx, struct ir3_instruction *instr)
|
||||
{
|
||||
ctx->current_instr = instr;
|
||||
ra_foreach_dst (dst, instr) {
|
||||
unsigned dst_max = ra_reg_get_physreg(dst) + reg_size(dst);
|
||||
validate_assert(ctx, dst_max <= get_file_size(ctx, dst));
|
||||
if (dst->tied)
|
||||
validate_assert(ctx, ra_reg_get_num(dst) == ra_reg_get_num(dst->tied));
|
||||
}
|
||||
ctx->current_instr = instr;
|
||||
ra_foreach_dst (dst, instr) {
|
||||
unsigned dst_max = ra_reg_get_physreg(dst) + reg_size(dst);
|
||||
validate_assert(ctx, dst_max <= get_file_size(ctx, dst));
|
||||
if (dst->tied)
|
||||
validate_assert(ctx, ra_reg_get_num(dst) == ra_reg_get_num(dst->tied));
|
||||
}
|
||||
|
||||
ra_foreach_src (src, instr) {
|
||||
unsigned src_max = ra_reg_get_physreg(src) + reg_size(src);
|
||||
validate_assert(ctx, src_max <= get_file_size(ctx, src));
|
||||
}
|
||||
ra_foreach_src (src, instr) {
|
||||
unsigned src_max = ra_reg_get_physreg(src) + reg_size(src);
|
||||
validate_assert(ctx, src_max <= get_file_size(ctx, src));
|
||||
}
|
||||
}
|
||||
|
||||
/* This is the lattice operator. */
|
||||
static bool
|
||||
merge_reg(struct reg_state *dst, const struct reg_state *src)
|
||||
{
|
||||
if (dst->def == UNKNOWN) {
|
||||
*dst = *src;
|
||||
return src->def != UNKNOWN;
|
||||
} else if (dst->def == OVERDEF) {
|
||||
return false;
|
||||
} else {
|
||||
if (src->def == UNKNOWN)
|
||||
return false;
|
||||
else if (src->def == OVERDEF) {
|
||||
*dst = *src;
|
||||
return true;
|
||||
} else {
|
||||
if (dst->def != src->def || dst->offset != src->offset) {
|
||||
dst->def = OVERDEF;
|
||||
dst->offset = 0;
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (dst->def == UNKNOWN) {
|
||||
*dst = *src;
|
||||
return src->def != UNKNOWN;
|
||||
} else if (dst->def == OVERDEF) {
|
||||
return false;
|
||||
} else {
|
||||
if (src->def == UNKNOWN)
|
||||
return false;
|
||||
else if (src->def == OVERDEF) {
|
||||
*dst = *src;
|
||||
return true;
|
||||
} else {
|
||||
if (dst->def != src->def || dst->offset != src->offset) {
|
||||
dst->def = OVERDEF;
|
||||
dst->offset = 0;
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
merge_file(struct file_state *dst, const struct file_state *src, unsigned size)
|
||||
{
|
||||
bool progress = false;
|
||||
for (unsigned i = 0; i < size; i++)
|
||||
progress |= merge_reg(&dst->regs[i], &src->regs[i]);
|
||||
return progress;
|
||||
bool progress = false;
|
||||
for (unsigned i = 0; i < size; i++)
|
||||
progress |= merge_reg(&dst->regs[i], &src->regs[i]);
|
||||
return progress;
|
||||
}
|
||||
|
||||
static bool
|
||||
merge_state(struct ra_val_ctx *ctx, struct reaching_state *dst,
|
||||
const struct reaching_state *src)
|
||||
const struct reaching_state *src)
|
||||
{
|
||||
bool progress = false;
|
||||
progress |= merge_file(&dst->full, &src->full, ctx->full_size);
|
||||
progress |= merge_file(&dst->half, &src->half, ctx->half_size);
|
||||
return progress;
|
||||
bool progress = false;
|
||||
progress |= merge_file(&dst->full, &src->full, ctx->full_size);
|
||||
progress |= merge_file(&dst->half, &src->half, ctx->half_size);
|
||||
return progress;
|
||||
}
|
||||
|
||||
static bool
|
||||
merge_state_physical(struct ra_val_ctx *ctx, struct reaching_state *dst,
|
||||
const struct reaching_state *src)
|
||||
const struct reaching_state *src)
|
||||
{
|
||||
return merge_file(&dst->shared, &src->shared, RA_SHARED_SIZE);
|
||||
return merge_file(&dst->shared, &src->shared, RA_SHARED_SIZE);
|
||||
}
|
||||
|
||||
static struct file_state *
|
||||
ra_val_get_file(struct ra_val_ctx *ctx, struct ir3_register *reg)
|
||||
{
|
||||
if (reg->flags & IR3_REG_SHARED)
|
||||
return &ctx->reaching.shared;
|
||||
else if (ctx->merged_regs || !(reg->flags & IR3_REG_HALF))
|
||||
return &ctx->reaching.full;
|
||||
else
|
||||
return &ctx->reaching.half;
|
||||
if (reg->flags & IR3_REG_SHARED)
|
||||
return &ctx->reaching.shared;
|
||||
else if (ctx->merged_regs || !(reg->flags & IR3_REG_HALF))
|
||||
return &ctx->reaching.full;
|
||||
else
|
||||
return &ctx->reaching.half;
|
||||
}
|
||||
|
||||
static void
|
||||
propagate_normal_instr(struct ra_val_ctx *ctx, struct ir3_instruction *instr)
|
||||
{
|
||||
ra_foreach_dst (dst, instr) {
|
||||
struct file_state *file = ra_val_get_file(ctx, dst);
|
||||
physreg_t physreg = ra_reg_get_physreg(dst);
|
||||
for (unsigned i = 0; i < reg_size(dst); i++) {
|
||||
file->regs[physreg + i] = (struct reg_state) {
|
||||
.def = dst,
|
||||
.offset = i,
|
||||
};
|
||||
}
|
||||
}
|
||||
ra_foreach_dst (dst, instr) {
|
||||
struct file_state *file = ra_val_get_file(ctx, dst);
|
||||
physreg_t physreg = ra_reg_get_physreg(dst);
|
||||
for (unsigned i = 0; i < reg_size(dst); i++) {
|
||||
file->regs[physreg + i] = (struct reg_state){
|
||||
.def = dst,
|
||||
.offset = i,
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
propagate_split(struct ra_val_ctx *ctx, struct ir3_instruction *split)
|
||||
{
|
||||
struct ir3_register *dst = split->dsts[0];
|
||||
struct ir3_register *src = split->srcs[0];
|
||||
physreg_t dst_physreg = ra_reg_get_physreg(dst);
|
||||
physreg_t src_physreg = ra_reg_get_physreg(src);
|
||||
struct file_state *file = ra_val_get_file(ctx, dst);
|
||||
struct ir3_register *dst = split->dsts[0];
|
||||
struct ir3_register *src = split->srcs[0];
|
||||
physreg_t dst_physreg = ra_reg_get_physreg(dst);
|
||||
physreg_t src_physreg = ra_reg_get_physreg(src);
|
||||
struct file_state *file = ra_val_get_file(ctx, dst);
|
||||
|
||||
unsigned offset = split->split.off * reg_elem_size(src);
|
||||
for (unsigned i = 0; i < reg_elem_size(src); i++) {
|
||||
file->regs[dst_physreg + i] = file->regs[src_physreg + offset + i];
|
||||
}
|
||||
unsigned offset = split->split.off * reg_elem_size(src);
|
||||
for (unsigned i = 0; i < reg_elem_size(src); i++) {
|
||||
file->regs[dst_physreg + i] = file->regs[src_physreg + offset + i];
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
propagate_collect(struct ra_val_ctx *ctx, struct ir3_instruction *collect)
|
||||
{
|
||||
struct ir3_register *dst = collect->dsts[0];
|
||||
physreg_t dst_physreg = ra_reg_get_physreg(dst);
|
||||
struct file_state *file = ra_val_get_file(ctx, dst);
|
||||
struct ir3_register *dst = collect->dsts[0];
|
||||
physreg_t dst_physreg = ra_reg_get_physreg(dst);
|
||||
struct file_state *file = ra_val_get_file(ctx, dst);
|
||||
|
||||
unsigned size = reg_size(dst);
|
||||
struct reg_state srcs[size];
|
||||
unsigned size = reg_size(dst);
|
||||
struct reg_state srcs[size];
|
||||
|
||||
for (unsigned i = 0; i < collect->srcs_count; i++) {
|
||||
struct ir3_register *src = collect->srcs[i];
|
||||
unsigned dst_offset = i * reg_elem_size(dst);
|
||||
for (unsigned j = 0; j < reg_elem_size(dst); j++) {
|
||||
if (!ra_reg_is_src(src)) {
|
||||
srcs[dst_offset + j] = (struct reg_state) {
|
||||
.def = dst,
|
||||
.offset = dst_offset + j,
|
||||
};
|
||||
} else {
|
||||
physreg_t src_physreg = ra_reg_get_physreg(src);
|
||||
srcs[dst_offset + j] = file->regs[src_physreg + j];
|
||||
}
|
||||
}
|
||||
}
|
||||
for (unsigned i = 0; i < collect->srcs_count; i++) {
|
||||
struct ir3_register *src = collect->srcs[i];
|
||||
unsigned dst_offset = i * reg_elem_size(dst);
|
||||
for (unsigned j = 0; j < reg_elem_size(dst); j++) {
|
||||
if (!ra_reg_is_src(src)) {
|
||||
srcs[dst_offset + j] = (struct reg_state){
|
||||
.def = dst,
|
||||
.offset = dst_offset + j,
|
||||
};
|
||||
} else {
|
||||
physreg_t src_physreg = ra_reg_get_physreg(src);
|
||||
srcs[dst_offset + j] = file->regs[src_physreg + j];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < size; i++)
|
||||
file->regs[dst_physreg + i] = srcs[i];
|
||||
for (unsigned i = 0; i < size; i++)
|
||||
file->regs[dst_physreg + i] = srcs[i];
|
||||
}
|
||||
|
||||
static void
|
||||
propagate_parallelcopy(struct ra_val_ctx *ctx, struct ir3_instruction *pcopy)
|
||||
{
|
||||
unsigned size = 0;
|
||||
for (unsigned i = 0; i < pcopy->dsts_count; i++) {
|
||||
size += reg_size(pcopy->srcs[i]);
|
||||
}
|
||||
unsigned size = 0;
|
||||
for (unsigned i = 0; i < pcopy->dsts_count; i++) {
|
||||
size += reg_size(pcopy->srcs[i]);
|
||||
}
|
||||
|
||||
struct reg_state srcs[size];
|
||||
struct reg_state srcs[size];
|
||||
|
||||
unsigned offset = 0;
|
||||
for (unsigned i = 0; i < pcopy->srcs_count; i++) {
|
||||
struct ir3_register *dst = pcopy->dsts[i];
|
||||
struct ir3_register *src = pcopy->srcs[i];
|
||||
struct file_state *file = ra_val_get_file(ctx, dst);
|
||||
unsigned offset = 0;
|
||||
for (unsigned i = 0; i < pcopy->srcs_count; i++) {
|
||||
struct ir3_register *dst = pcopy->dsts[i];
|
||||
struct ir3_register *src = pcopy->srcs[i];
|
||||
struct file_state *file = ra_val_get_file(ctx, dst);
|
||||
|
||||
for (unsigned j = 0; j < reg_size(dst); j++) {
|
||||
if (src->flags & (IR3_REG_IMMED | IR3_REG_CONST)) {
|
||||
srcs[offset + j] = (struct reg_state) {
|
||||
.def = dst,
|
||||
.offset = j,
|
||||
};
|
||||
} else {
|
||||
physreg_t src_physreg = ra_reg_get_physreg(src);
|
||||
srcs[offset + j] = file->regs[src_physreg + j];
|
||||
}
|
||||
}
|
||||
for (unsigned j = 0; j < reg_size(dst); j++) {
|
||||
if (src->flags & (IR3_REG_IMMED | IR3_REG_CONST)) {
|
||||
srcs[offset + j] = (struct reg_state){
|
||||
.def = dst,
|
||||
.offset = j,
|
||||
};
|
||||
} else {
|
||||
physreg_t src_physreg = ra_reg_get_physreg(src);
|
||||
srcs[offset + j] = file->regs[src_physreg + j];
|
||||
}
|
||||
}
|
||||
|
||||
offset += reg_size(dst);
|
||||
}
|
||||
assert(offset == size);
|
||||
offset += reg_size(dst);
|
||||
}
|
||||
assert(offset == size);
|
||||
|
||||
offset = 0;
|
||||
for (unsigned i = 0; i < pcopy->dsts_count; i++) {
|
||||
struct ir3_register *dst = pcopy->dsts[i];
|
||||
physreg_t dst_physreg = ra_reg_get_physreg(dst);
|
||||
struct file_state *file = ra_val_get_file(ctx, dst);
|
||||
offset = 0;
|
||||
for (unsigned i = 0; i < pcopy->dsts_count; i++) {
|
||||
struct ir3_register *dst = pcopy->dsts[i];
|
||||
physreg_t dst_physreg = ra_reg_get_physreg(dst);
|
||||
struct file_state *file = ra_val_get_file(ctx, dst);
|
||||
|
||||
for (unsigned j = 0; j < reg_size(dst); j++)
|
||||
file->regs[dst_physreg + j] = srcs[offset + j];
|
||||
for (unsigned j = 0; j < reg_size(dst); j++)
|
||||
file->regs[dst_physreg + j] = srcs[offset + j];
|
||||
|
||||
offset += reg_size(dst);
|
||||
}
|
||||
assert(offset == size);
|
||||
offset += reg_size(dst);
|
||||
}
|
||||
assert(offset == size);
|
||||
}
|
||||
|
||||
static void
|
||||
propagate_instr(struct ra_val_ctx *ctx, struct ir3_instruction *instr)
|
||||
{
|
||||
if (instr->opc == OPC_META_SPLIT)
|
||||
propagate_split(ctx, instr);
|
||||
else if (instr->opc == OPC_META_COLLECT)
|
||||
propagate_collect(ctx, instr);
|
||||
else if (instr->opc == OPC_META_PARALLEL_COPY)
|
||||
propagate_parallelcopy(ctx, instr);
|
||||
else
|
||||
propagate_normal_instr(ctx, instr);
|
||||
if (instr->opc == OPC_META_SPLIT)
|
||||
propagate_split(ctx, instr);
|
||||
else if (instr->opc == OPC_META_COLLECT)
|
||||
propagate_collect(ctx, instr);
|
||||
else if (instr->opc == OPC_META_PARALLEL_COPY)
|
||||
propagate_parallelcopy(ctx, instr);
|
||||
else
|
||||
propagate_normal_instr(ctx, instr);
|
||||
}
|
||||
|
||||
static bool
|
||||
propagate_block(struct ra_val_ctx *ctx, struct ir3_block *block)
|
||||
{
|
||||
ctx->reaching = ctx->block_reaching[block->index];
|
||||
ctx->reaching = ctx->block_reaching[block->index];
|
||||
|
||||
foreach_instr (instr, &block->instr_list) {
|
||||
propagate_instr(ctx, instr);
|
||||
}
|
||||
foreach_instr (instr, &block->instr_list) {
|
||||
propagate_instr(ctx, instr);
|
||||
}
|
||||
|
||||
bool progress = false;
|
||||
for (unsigned i = 0; i < 2; i++) {
|
||||
struct ir3_block *succ = block->successors[i];
|
||||
if (!succ)
|
||||
continue;
|
||||
progress |= merge_state(ctx,
|
||||
&ctx->block_reaching[succ->index],
|
||||
&ctx->reaching);
|
||||
}
|
||||
for (unsigned i = 0; i < 2; i++) {
|
||||
struct ir3_block *succ = block->physical_successors[i];
|
||||
if (!succ)
|
||||
continue;
|
||||
progress |= merge_state_physical(ctx,
|
||||
&ctx->block_reaching[succ->index],
|
||||
&ctx->reaching);
|
||||
}
|
||||
return progress;
|
||||
bool progress = false;
|
||||
for (unsigned i = 0; i < 2; i++) {
|
||||
struct ir3_block *succ = block->successors[i];
|
||||
if (!succ)
|
||||
continue;
|
||||
progress |=
|
||||
merge_state(ctx, &ctx->block_reaching[succ->index], &ctx->reaching);
|
||||
}
|
||||
for (unsigned i = 0; i < 2; i++) {
|
||||
struct ir3_block *succ = block->physical_successors[i];
|
||||
if (!succ)
|
||||
continue;
|
||||
progress |= merge_state_physical(ctx, &ctx->block_reaching[succ->index],
|
||||
&ctx->reaching);
|
||||
}
|
||||
return progress;
|
||||
}
|
||||
|
||||
static void
|
||||
chase_definition(struct reg_state *state)
|
||||
{
|
||||
while (true) {
|
||||
struct ir3_instruction *instr = state->def->instr;
|
||||
switch (instr->opc) {
|
||||
case OPC_META_SPLIT: {
|
||||
struct ir3_register *new_def = instr->srcs[0]->def;
|
||||
unsigned offset = instr->split.off * reg_elem_size(new_def);
|
||||
*state = (struct reg_state) {
|
||||
.def = new_def,
|
||||
.offset = state->offset + offset,
|
||||
};
|
||||
break;
|
||||
}
|
||||
case OPC_META_COLLECT: {
|
||||
unsigned src_idx = state->offset / reg_elem_size(state->def);
|
||||
unsigned src_offset = state->offset % reg_elem_size(state->def);
|
||||
struct ir3_register *new_def = instr->srcs[src_idx]->def;
|
||||
if (new_def) {
|
||||
*state = (struct reg_state) {
|
||||
.def = new_def,
|
||||
.offset = src_offset,
|
||||
};
|
||||
} else {
|
||||
/* Bail on immed/const */
|
||||
return;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case OPC_META_PARALLEL_COPY: {
|
||||
unsigned dst_idx = ~0;
|
||||
for (unsigned i = 0; i < instr->dsts_count; i++) {
|
||||
if (instr->dsts[i] == state->def) {
|
||||
dst_idx = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
assert(dst_idx != ~0);
|
||||
while (true) {
|
||||
struct ir3_instruction *instr = state->def->instr;
|
||||
switch (instr->opc) {
|
||||
case OPC_META_SPLIT: {
|
||||
struct ir3_register *new_def = instr->srcs[0]->def;
|
||||
unsigned offset = instr->split.off * reg_elem_size(new_def);
|
||||
*state = (struct reg_state){
|
||||
.def = new_def,
|
||||
.offset = state->offset + offset,
|
||||
};
|
||||
break;
|
||||
}
|
||||
case OPC_META_COLLECT: {
|
||||
unsigned src_idx = state->offset / reg_elem_size(state->def);
|
||||
unsigned src_offset = state->offset % reg_elem_size(state->def);
|
||||
struct ir3_register *new_def = instr->srcs[src_idx]->def;
|
||||
if (new_def) {
|
||||
*state = (struct reg_state){
|
||||
.def = new_def,
|
||||
.offset = src_offset,
|
||||
};
|
||||
} else {
|
||||
/* Bail on immed/const */
|
||||
return;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case OPC_META_PARALLEL_COPY: {
|
||||
unsigned dst_idx = ~0;
|
||||
for (unsigned i = 0; i < instr->dsts_count; i++) {
|
||||
if (instr->dsts[i] == state->def) {
|
||||
dst_idx = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
assert(dst_idx != ~0);
|
||||
|
||||
struct ir3_register *new_def = instr->srcs[dst_idx]->def;
|
||||
if (new_def) {
|
||||
state->def = new_def;
|
||||
} else {
|
||||
/* Bail on immed/const */
|
||||
return;
|
||||
}
|
||||
break;
|
||||
}
|
||||
default:
|
||||
return;
|
||||
}
|
||||
}
|
||||
struct ir3_register *new_def = instr->srcs[dst_idx]->def;
|
||||
if (new_def) {
|
||||
state->def = new_def;
|
||||
} else {
|
||||
/* Bail on immed/const */
|
||||
return;
|
||||
}
|
||||
break;
|
||||
}
|
||||
default:
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
dump_reg_state(struct reg_state *state)
|
||||
{
|
||||
if (state->def == UNDEF) {
|
||||
fprintf(stderr, "no reaching definition");
|
||||
} else if (state->def == OVERDEF) {
|
||||
fprintf(stderr, "more than one reaching definition or partial definition");
|
||||
} else {
|
||||
/* The analysis should always remove UNKNOWN eventually. */
|
||||
assert(state->def != UNKNOWN);
|
||||
if (state->def == UNDEF) {
|
||||
fprintf(stderr, "no reaching definition");
|
||||
} else if (state->def == OVERDEF) {
|
||||
fprintf(stderr,
|
||||
"more than one reaching definition or partial definition");
|
||||
} else {
|
||||
/* The analysis should always remove UNKNOWN eventually. */
|
||||
assert(state->def != UNKNOWN);
|
||||
|
||||
fprintf(stderr, "ssa_%u:%u(%sr%u.%c) + %u",
|
||||
state->def->instr->serialno, state->def->name,
|
||||
(state->def->flags & IR3_REG_HALF) ? "h" : "",
|
||||
state->def->num / 4, "xyzw"[state->def->num % 4],
|
||||
state->offset);
|
||||
}
|
||||
fprintf(stderr, "ssa_%u:%u(%sr%u.%c) + %u", state->def->instr->serialno,
|
||||
state->def->name, (state->def->flags & IR3_REG_HALF) ? "h" : "",
|
||||
state->def->num / 4, "xyzw"[state->def->num % 4],
|
||||
state -> offset);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
check_reaching_src(struct ra_val_ctx *ctx, struct ir3_instruction *instr,
|
||||
struct ir3_register *src)
|
||||
struct ir3_register *src)
|
||||
{
|
||||
struct file_state *file = ra_val_get_file(ctx, src);
|
||||
physreg_t physreg = ra_reg_get_physreg(src);
|
||||
for (unsigned i = 0; i < reg_size(src); i++) {
|
||||
struct reg_state expected = (struct reg_state) {
|
||||
.def = src->def,
|
||||
.offset = i,
|
||||
};
|
||||
chase_definition(&expected);
|
||||
struct file_state *file = ra_val_get_file(ctx, src);
|
||||
physreg_t physreg = ra_reg_get_physreg(src);
|
||||
for (unsigned i = 0; i < reg_size(src); i++) {
|
||||
struct reg_state expected = (struct reg_state){
|
||||
.def = src->def,
|
||||
.offset = i,
|
||||
};
|
||||
chase_definition(&expected);
|
||||
|
||||
struct reg_state actual = file->regs[physreg + i];
|
||||
struct reg_state actual = file->regs[physreg + i];
|
||||
|
||||
if (expected.def != actual.def ||
|
||||
expected.offset != actual.offset) {
|
||||
fprintf(stderr, "ra validation fail: wrong definition reaches source ssa_%u:%u + %u\n",
|
||||
src->def->instr->serialno, src->def->name, i);
|
||||
fprintf(stderr, "expected: ");
|
||||
dump_reg_state(&expected);
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "actual: ");
|
||||
dump_reg_state(&actual);
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "-> for instruction: ");
|
||||
ir3_print_instr(instr);
|
||||
ctx->failed = true;
|
||||
}
|
||||
}
|
||||
if (expected.def != actual.def || expected.offset != actual.offset) {
|
||||
fprintf(
|
||||
stderr,
|
||||
"ra validation fail: wrong definition reaches source ssa_%u:%u + %u\n",
|
||||
src->def->instr->serialno, src->def->name, i);
|
||||
fprintf(stderr, "expected: ");
|
||||
dump_reg_state(&expected);
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "actual: ");
|
||||
dump_reg_state(&actual);
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "-> for instruction: ");
|
||||
ir3_print_instr(instr);
|
||||
ctx->failed = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
check_reaching_instr(struct ra_val_ctx *ctx, struct ir3_instruction *instr)
|
||||
{
|
||||
if (instr->opc == OPC_META_SPLIT ||
|
||||
instr->opc == OPC_META_COLLECT ||
|
||||
instr->opc == OPC_META_PARALLEL_COPY ||
|
||||
instr->opc == OPC_META_PHI) {
|
||||
return;
|
||||
}
|
||||
if (instr->opc == OPC_META_SPLIT || instr->opc == OPC_META_COLLECT ||
|
||||
instr->opc == OPC_META_PARALLEL_COPY || instr->opc == OPC_META_PHI) {
|
||||
return;
|
||||
}
|
||||
|
||||
ra_foreach_src (src, instr) {
|
||||
check_reaching_src(ctx, instr, src);
|
||||
}
|
||||
ra_foreach_src (src, instr) {
|
||||
check_reaching_src(ctx, instr, src);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
check_reaching_block(struct ra_val_ctx *ctx, struct ir3_block *block)
|
||||
{
|
||||
ctx->reaching = ctx->block_reaching[block->index];
|
||||
ctx->reaching = ctx->block_reaching[block->index];
|
||||
|
||||
foreach_instr (instr, &block->instr_list) {
|
||||
check_reaching_instr(ctx, instr);
|
||||
propagate_instr(ctx, instr);
|
||||
}
|
||||
foreach_instr (instr, &block->instr_list) {
|
||||
check_reaching_instr(ctx, instr);
|
||||
propagate_instr(ctx, instr);
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < 2; i++) {
|
||||
struct ir3_block *succ = block->successors[i];
|
||||
if (!succ)
|
||||
continue;
|
||||
for (unsigned i = 0; i < 2; i++) {
|
||||
struct ir3_block *succ = block->successors[i];
|
||||
if (!succ)
|
||||
continue;
|
||||
|
||||
unsigned pred_idx = ir3_block_get_pred_index(succ, block);
|
||||
foreach_instr (instr, &succ->instr_list) {
|
||||
if (instr->opc != OPC_META_PHI)
|
||||
break;
|
||||
if (instr->srcs[pred_idx]->def)
|
||||
check_reaching_src(ctx, instr, instr->srcs[pred_idx]);
|
||||
}
|
||||
}
|
||||
unsigned pred_idx = ir3_block_get_pred_index(succ, block);
|
||||
foreach_instr (instr, &succ->instr_list) {
|
||||
if (instr->opc != OPC_META_PHI)
|
||||
break;
|
||||
if (instr->srcs[pred_idx]->def)
|
||||
check_reaching_src(ctx, instr, instr->srcs[pred_idx]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
check_reaching_defs(struct ra_val_ctx *ctx, struct ir3 *ir)
|
||||
{
|
||||
ctx->block_reaching =
|
||||
rzalloc_array(ctx, struct reaching_state, ctx->block_count);
|
||||
ctx->block_reaching =
|
||||
rzalloc_array(ctx, struct reaching_state, ctx->block_count);
|
||||
|
||||
struct reaching_state *start = &ctx->block_reaching[0];
|
||||
for (unsigned i = 0; i < ctx->full_size; i++)
|
||||
start->full.regs[i].def = UNDEF;
|
||||
for (unsigned i = 0; i < ctx->half_size; i++)
|
||||
start->half.regs[i].def = UNDEF;
|
||||
for (unsigned i = 0; i < RA_SHARED_SIZE; i++)
|
||||
start->shared.regs[i].def = UNDEF;
|
||||
struct reaching_state *start = &ctx->block_reaching[0];
|
||||
for (unsigned i = 0; i < ctx->full_size; i++)
|
||||
start->full.regs[i].def = UNDEF;
|
||||
for (unsigned i = 0; i < ctx->half_size; i++)
|
||||
start->half.regs[i].def = UNDEF;
|
||||
for (unsigned i = 0; i < RA_SHARED_SIZE; i++)
|
||||
start->shared.regs[i].def = UNDEF;
|
||||
|
||||
bool progress;
|
||||
do {
|
||||
progress = false;
|
||||
foreach_block (block, &ir->block_list) {
|
||||
progress |= propagate_block(ctx, block);
|
||||
}
|
||||
} while (progress);
|
||||
|
||||
foreach_block (block, &ir->block_list) {
|
||||
check_reaching_block(ctx, block);
|
||||
}
|
||||
bool progress;
|
||||
do {
|
||||
progress = false;
|
||||
foreach_block (block, &ir->block_list) {
|
||||
progress |= propagate_block(ctx, block);
|
||||
}
|
||||
} while (progress);
|
||||
|
||||
if (ctx->failed) {
|
||||
fprintf(stderr, "failing shader:\n");
|
||||
ir3_print(ir);
|
||||
abort();
|
||||
}
|
||||
foreach_block (block, &ir->block_list) {
|
||||
check_reaching_block(ctx, block);
|
||||
}
|
||||
|
||||
if (ctx->failed) {
|
||||
fprintf(stderr, "failing shader:\n");
|
||||
ir3_print(ir);
|
||||
abort();
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
ir3_ra_validate(struct ir3_shader_variant *v,
|
||||
unsigned full_size, unsigned half_size, unsigned block_count)
|
||||
ir3_ra_validate(struct ir3_shader_variant *v, unsigned full_size,
|
||||
unsigned half_size, unsigned block_count)
|
||||
{
|
||||
#ifdef NDEBUG
|
||||
# define VALIDATE 0
|
||||
#define VALIDATE 0
|
||||
#else
|
||||
# define VALIDATE 1
|
||||
#define VALIDATE 1
|
||||
#endif
|
||||
|
||||
if (!VALIDATE)
|
||||
return;
|
||||
if (!VALIDATE)
|
||||
return;
|
||||
|
||||
struct ra_val_ctx *ctx = rzalloc(NULL, struct ra_val_ctx);
|
||||
ctx->merged_regs = v->mergedregs;
|
||||
ctx->full_size = full_size;
|
||||
ctx->half_size = half_size;
|
||||
ctx->block_count = block_count;
|
||||
struct ra_val_ctx *ctx = rzalloc(NULL, struct ra_val_ctx);
|
||||
ctx->merged_regs = v->mergedregs;
|
||||
ctx->full_size = full_size;
|
||||
ctx->half_size = half_size;
|
||||
ctx->block_count = block_count;
|
||||
|
||||
foreach_block (block, &v->ir->block_list) {
|
||||
foreach_instr (instr, &block->instr_list) {
|
||||
validate_simple(ctx, instr);
|
||||
}
|
||||
}
|
||||
foreach_block (block, &v->ir->block_list) {
|
||||
foreach_instr (instr, &block->instr_list) {
|
||||
validate_simple(ctx, instr);
|
||||
}
|
||||
}
|
||||
|
||||
check_reaching_defs(ctx, v->ir);
|
||||
check_reaching_defs(ctx, v->ir);
|
||||
|
||||
ralloc_free(ctx);
|
||||
ralloc_free(ctx);
|
||||
}
|
||||
|
||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -21,9 +21,9 @@
|
|||
* SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "util/rb_tree.h"
|
||||
#include "ir3_ra.h"
|
||||
#include "ir3_shader.h"
|
||||
#include "util/rb_tree.h"
|
||||
|
||||
/*
|
||||
* This pass does one thing so far:
|
||||
|
@ -36,326 +36,330 @@
|
|||
*/
|
||||
|
||||
struct ra_spill_interval {
|
||||
struct ir3_reg_interval interval;
|
||||
struct ir3_reg_interval interval;
|
||||
};
|
||||
|
||||
struct ra_spill_ctx {
|
||||
struct ir3_reg_ctx reg_ctx;
|
||||
struct ir3_reg_ctx reg_ctx;
|
||||
|
||||
struct ra_spill_interval *intervals;
|
||||
struct ra_spill_interval *intervals;
|
||||
|
||||
struct ir3_pressure cur_pressure, max_pressure;
|
||||
struct ir3_pressure cur_pressure, max_pressure;
|
||||
|
||||
struct ir3_liveness *live;
|
||||
struct ir3_liveness *live;
|
||||
|
||||
const struct ir3_compiler *compiler;
|
||||
const struct ir3_compiler *compiler;
|
||||
};
|
||||
|
||||
static void
|
||||
ra_spill_interval_init(struct ra_spill_interval *interval, struct ir3_register *reg)
|
||||
ra_spill_interval_init(struct ra_spill_interval *interval,
|
||||
struct ir3_register *reg)
|
||||
{
|
||||
ir3_reg_interval_init(&interval->interval, reg);
|
||||
ir3_reg_interval_init(&interval->interval, reg);
|
||||
}
|
||||
|
||||
static void
|
||||
ra_pressure_add(struct ir3_pressure *pressure, struct ra_spill_interval *interval)
|
||||
ra_pressure_add(struct ir3_pressure *pressure,
|
||||
struct ra_spill_interval *interval)
|
||||
{
|
||||
unsigned size = reg_size(interval->interval.reg);
|
||||
if (interval->interval.reg->flags & IR3_REG_SHARED)
|
||||
pressure->shared += size;
|
||||
else if (interval->interval.reg->flags & IR3_REG_HALF)
|
||||
pressure->half += size;
|
||||
else
|
||||
pressure->full += size;
|
||||
unsigned size = reg_size(interval->interval.reg);
|
||||
if (interval->interval.reg->flags & IR3_REG_SHARED)
|
||||
pressure->shared += size;
|
||||
else if (interval->interval.reg->flags & IR3_REG_HALF)
|
||||
pressure->half += size;
|
||||
else
|
||||
pressure->full += size;
|
||||
}
|
||||
|
||||
static void
|
||||
ra_pressure_sub(struct ir3_pressure *pressure, struct ra_spill_interval *interval)
|
||||
ra_pressure_sub(struct ir3_pressure *pressure,
|
||||
struct ra_spill_interval *interval)
|
||||
{
|
||||
unsigned size = reg_size(interval->interval.reg);
|
||||
if (interval->interval.reg->flags & IR3_REG_SHARED)
|
||||
pressure->shared -= size;
|
||||
else if (interval->interval.reg->flags & IR3_REG_HALF)
|
||||
pressure->half -= size;
|
||||
else
|
||||
pressure->full -= size;
|
||||
unsigned size = reg_size(interval->interval.reg);
|
||||
if (interval->interval.reg->flags & IR3_REG_SHARED)
|
||||
pressure->shared -= size;
|
||||
else if (interval->interval.reg->flags & IR3_REG_HALF)
|
||||
pressure->half -= size;
|
||||
else
|
||||
pressure->full -= size;
|
||||
}
|
||||
|
||||
static struct ra_spill_interval *
|
||||
ir3_reg_interval_to_interval(struct ir3_reg_interval *interval)
|
||||
{
|
||||
return rb_node_data(struct ra_spill_interval, interval, interval);
|
||||
return rb_node_data(struct ra_spill_interval, interval, interval);
|
||||
}
|
||||
|
||||
static struct ra_spill_ctx *
|
||||
ir3_reg_ctx_to_ctx(struct ir3_reg_ctx *ctx)
|
||||
{
|
||||
return rb_node_data(struct ra_spill_ctx, ctx, reg_ctx);
|
||||
return rb_node_data(struct ra_spill_ctx, ctx, reg_ctx);
|
||||
}
|
||||
|
||||
static void
|
||||
interval_add(struct ir3_reg_ctx *_ctx, struct ir3_reg_interval *_interval)
|
||||
{
|
||||
struct ra_spill_interval *interval = ir3_reg_interval_to_interval(_interval);
|
||||
struct ra_spill_ctx *ctx = ir3_reg_ctx_to_ctx(_ctx);
|
||||
struct ra_spill_interval *interval = ir3_reg_interval_to_interval(_interval);
|
||||
struct ra_spill_ctx *ctx = ir3_reg_ctx_to_ctx(_ctx);
|
||||
|
||||
ra_pressure_add(&ctx->cur_pressure, interval);
|
||||
ra_pressure_add(&ctx->cur_pressure, interval);
|
||||
}
|
||||
|
||||
static void
|
||||
interval_delete(struct ir3_reg_ctx *_ctx, struct ir3_reg_interval *_interval)
|
||||
{
|
||||
struct ra_spill_interval *interval = ir3_reg_interval_to_interval(_interval);
|
||||
struct ra_spill_ctx *ctx = ir3_reg_ctx_to_ctx(_ctx);
|
||||
struct ra_spill_interval *interval = ir3_reg_interval_to_interval(_interval);
|
||||
struct ra_spill_ctx *ctx = ir3_reg_ctx_to_ctx(_ctx);
|
||||
|
||||
ra_pressure_sub(&ctx->cur_pressure, interval);
|
||||
ra_pressure_sub(&ctx->cur_pressure, interval);
|
||||
}
|
||||
|
||||
static void
|
||||
interval_readd(struct ir3_reg_ctx *_ctx, struct ir3_reg_interval *_parent,
|
||||
struct ir3_reg_interval *_child)
|
||||
struct ir3_reg_interval *_child)
|
||||
{
|
||||
interval_add(_ctx, _child);
|
||||
interval_add(_ctx, _child);
|
||||
}
|
||||
|
||||
static void
|
||||
spill_ctx_init(struct ra_spill_ctx *ctx)
|
||||
{
|
||||
rb_tree_init(&ctx->reg_ctx.intervals);
|
||||
ctx->reg_ctx.interval_add = interval_add;
|
||||
ctx->reg_ctx.interval_delete = interval_delete;
|
||||
ctx->reg_ctx.interval_readd = interval_readd;
|
||||
rb_tree_init(&ctx->reg_ctx.intervals);
|
||||
ctx->reg_ctx.interval_add = interval_add;
|
||||
ctx->reg_ctx.interval_delete = interval_delete;
|
||||
ctx->reg_ctx.interval_readd = interval_readd;
|
||||
}
|
||||
|
||||
static void
|
||||
ra_spill_ctx_insert(struct ra_spill_ctx *ctx, struct ra_spill_interval *interval)
|
||||
ra_spill_ctx_insert(struct ra_spill_ctx *ctx,
|
||||
struct ra_spill_interval *interval)
|
||||
{
|
||||
ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval);
|
||||
ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval);
|
||||
}
|
||||
|
||||
static void
|
||||
ra_spill_ctx_remove(struct ra_spill_ctx *ctx, struct ra_spill_interval *interval)
|
||||
ra_spill_ctx_remove(struct ra_spill_ctx *ctx,
|
||||
struct ra_spill_interval *interval)
|
||||
{
|
||||
ir3_reg_interval_remove(&ctx->reg_ctx, &interval->interval);
|
||||
ir3_reg_interval_remove(&ctx->reg_ctx, &interval->interval);
|
||||
}
|
||||
|
||||
static void
|
||||
init_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst)
|
||||
{
|
||||
struct ra_spill_interval *interval = &ctx->intervals[dst->name];
|
||||
ra_spill_interval_init(interval, dst);
|
||||
struct ra_spill_interval *interval = &ctx->intervals[dst->name];
|
||||
ra_spill_interval_init(interval, dst);
|
||||
}
|
||||
|
||||
static void
|
||||
insert_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst)
|
||||
{
|
||||
struct ra_spill_interval *interval = &ctx->intervals[dst->name];
|
||||
if (interval->interval.inserted)
|
||||
return;
|
||||
struct ra_spill_interval *interval = &ctx->intervals[dst->name];
|
||||
if (interval->interval.inserted)
|
||||
return;
|
||||
|
||||
ra_spill_ctx_insert(ctx, interval);
|
||||
ra_spill_ctx_insert(ctx, interval);
|
||||
|
||||
/* For precolored inputs, make sure we leave enough registers to allow for
|
||||
* holes in the inputs. It can happen that the binning shader has a lower
|
||||
* register pressure than the main shader, but the main shader decided to
|
||||
* add holes between the inputs which means that the binning shader has a
|
||||
* higher register demand.
|
||||
*/
|
||||
if (dst->instr->opc == OPC_META_INPUT &&
|
||||
dst->num != INVALID_REG) {
|
||||
physreg_t physreg = ra_reg_get_physreg(dst);
|
||||
physreg_t max = physreg + reg_size(dst);
|
||||
/* For precolored inputs, make sure we leave enough registers to allow for
|
||||
* holes in the inputs. It can happen that the binning shader has a lower
|
||||
* register pressure than the main shader, but the main shader decided to
|
||||
* add holes between the inputs which means that the binning shader has a
|
||||
* higher register demand.
|
||||
*/
|
||||
if (dst->instr->opc == OPC_META_INPUT && dst->num != INVALID_REG) {
|
||||
physreg_t physreg = ra_reg_get_physreg(dst);
|
||||
physreg_t max = physreg + reg_size(dst);
|
||||
|
||||
if (interval->interval.reg->flags & IR3_REG_SHARED)
|
||||
ctx->max_pressure.shared = MAX2(ctx->max_pressure.shared, max);
|
||||
else if (interval->interval.reg->flags & IR3_REG_HALF)
|
||||
ctx->max_pressure.half = MAX2(ctx->max_pressure.half, max);
|
||||
else
|
||||
ctx->max_pressure.full = MAX2(ctx->max_pressure.full, max);
|
||||
}
|
||||
if (interval->interval.reg->flags & IR3_REG_SHARED)
|
||||
ctx->max_pressure.shared = MAX2(ctx->max_pressure.shared, max);
|
||||
else if (interval->interval.reg->flags & IR3_REG_HALF)
|
||||
ctx->max_pressure.half = MAX2(ctx->max_pressure.half, max);
|
||||
else
|
||||
ctx->max_pressure.full = MAX2(ctx->max_pressure.full, max);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
remove_src_early(struct ra_spill_ctx *ctx, struct ir3_instruction *instr, struct ir3_register *src)
|
||||
remove_src_early(struct ra_spill_ctx *ctx, struct ir3_instruction *instr,
|
||||
struct ir3_register *src)
|
||||
{
|
||||
if (!(src->flags & IR3_REG_FIRST_KILL))
|
||||
return;
|
||||
if (!(src->flags & IR3_REG_FIRST_KILL))
|
||||
return;
|
||||
|
||||
struct ra_spill_interval *interval = &ctx->intervals[src->def->name];
|
||||
struct ra_spill_interval *interval = &ctx->intervals[src->def->name];
|
||||
|
||||
if (!interval->interval.inserted || interval->interval.parent ||
|
||||
!rb_tree_is_empty(&interval->interval.children))
|
||||
return;
|
||||
if (!interval->interval.inserted || interval->interval.parent ||
|
||||
!rb_tree_is_empty(&interval->interval.children))
|
||||
return;
|
||||
|
||||
ra_spill_ctx_remove(ctx, interval);
|
||||
ra_spill_ctx_remove(ctx, interval);
|
||||
}
|
||||
|
||||
static void
|
||||
remove_src(struct ra_spill_ctx *ctx, struct ir3_instruction *instr, struct ir3_register *src)
|
||||
remove_src(struct ra_spill_ctx *ctx, struct ir3_instruction *instr,
|
||||
struct ir3_register *src)
|
||||
{
|
||||
if (!(src->flags & IR3_REG_FIRST_KILL))
|
||||
return;
|
||||
if (!(src->flags & IR3_REG_FIRST_KILL))
|
||||
return;
|
||||
|
||||
struct ra_spill_interval *interval = &ctx->intervals[src->def->name];
|
||||
struct ra_spill_interval *interval = &ctx->intervals[src->def->name];
|
||||
|
||||
if (!interval->interval.inserted)
|
||||
return;
|
||||
if (!interval->interval.inserted)
|
||||
return;
|
||||
|
||||
ra_spill_ctx_remove(ctx, interval);
|
||||
ra_spill_ctx_remove(ctx, interval);
|
||||
}
|
||||
|
||||
static void
|
||||
remove_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst)
|
||||
{
|
||||
struct ra_spill_interval *interval = &ctx->intervals[dst->name];
|
||||
struct ra_spill_interval *interval = &ctx->intervals[dst->name];
|
||||
|
||||
if (!interval->interval.inserted)
|
||||
return;
|
||||
if (!interval->interval.inserted)
|
||||
return;
|
||||
|
||||
ra_spill_ctx_remove(ctx, interval);
|
||||
ra_spill_ctx_remove(ctx, interval);
|
||||
}
|
||||
|
||||
static void
|
||||
update_max_pressure(struct ra_spill_ctx *ctx)
|
||||
{
|
||||
d("pressure:");
|
||||
d("\tfull: %u", ctx->cur_pressure.full);
|
||||
d("\thalf: %u", ctx->cur_pressure.half);
|
||||
d("\tshared: %u", ctx->cur_pressure.shared);
|
||||
d("pressure:");
|
||||
d("\tfull: %u", ctx->cur_pressure.full);
|
||||
d("\thalf: %u", ctx->cur_pressure.half);
|
||||
d("\tshared: %u", ctx->cur_pressure.shared);
|
||||
|
||||
ctx->max_pressure.full =
|
||||
MAX2(ctx->max_pressure.full, ctx->cur_pressure.full);
|
||||
ctx->max_pressure.half =
|
||||
MAX2(ctx->max_pressure.half, ctx->cur_pressure.half);
|
||||
ctx->max_pressure.shared =
|
||||
MAX2(ctx->max_pressure.shared, ctx->cur_pressure.shared);
|
||||
ctx->max_pressure.full =
|
||||
MAX2(ctx->max_pressure.full, ctx->cur_pressure.full);
|
||||
ctx->max_pressure.half =
|
||||
MAX2(ctx->max_pressure.half, ctx->cur_pressure.half);
|
||||
ctx->max_pressure.shared =
|
||||
MAX2(ctx->max_pressure.shared, ctx->cur_pressure.shared);
|
||||
}
|
||||
|
||||
static void
|
||||
handle_instr(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
|
||||
{
|
||||
if (RA_DEBUG) {
|
||||
printf("processing: ");
|
||||
ir3_print_instr(instr);
|
||||
}
|
||||
if (RA_DEBUG) {
|
||||
printf("processing: ");
|
||||
ir3_print_instr(instr);
|
||||
}
|
||||
|
||||
ra_foreach_dst(dst, instr) {
|
||||
init_dst(ctx, dst);
|
||||
}
|
||||
ra_foreach_dst (dst, instr) {
|
||||
init_dst(ctx, dst);
|
||||
}
|
||||
|
||||
/* Handle tied destinations. If a destination is tied to a source and that
|
||||
* source is live-through, then we need to allocate a new register for the
|
||||
* destination which is live-through itself and cannot overlap the
|
||||
* sources.
|
||||
*/
|
||||
/* Handle tied destinations. If a destination is tied to a source and that
|
||||
* source is live-through, then we need to allocate a new register for the
|
||||
* destination which is live-through itself and cannot overlap the
|
||||
* sources.
|
||||
*/
|
||||
|
||||
ra_foreach_dst(dst, instr) {
|
||||
struct ir3_register *tied_src = dst->tied;
|
||||
if (tied_src && !(tied_src->flags & IR3_REG_FIRST_KILL))
|
||||
insert_dst(ctx, dst);
|
||||
}
|
||||
ra_foreach_dst (dst, instr) {
|
||||
struct ir3_register *tied_src = dst->tied;
|
||||
if (tied_src && !(tied_src->flags & IR3_REG_FIRST_KILL))
|
||||
insert_dst(ctx, dst);
|
||||
}
|
||||
|
||||
update_max_pressure(ctx);
|
||||
update_max_pressure(ctx);
|
||||
|
||||
ra_foreach_src(src, instr) {
|
||||
if (src->flags & IR3_REG_FIRST_KILL)
|
||||
remove_src_early(ctx, instr, src);
|
||||
}
|
||||
ra_foreach_src (src, instr) {
|
||||
if (src->flags & IR3_REG_FIRST_KILL)
|
||||
remove_src_early(ctx, instr, src);
|
||||
}
|
||||
|
||||
ra_foreach_dst (dst, instr) {
|
||||
insert_dst(ctx, dst);
|
||||
}
|
||||
|
||||
ra_foreach_dst(dst, instr) {
|
||||
insert_dst(ctx, dst);
|
||||
}
|
||||
update_max_pressure(ctx);
|
||||
|
||||
update_max_pressure(ctx);
|
||||
|
||||
for (unsigned i = 0; i < instr->srcs_count; i++) {
|
||||
if (ra_reg_is_src(instr->srcs[i]) &&
|
||||
(instr->srcs[i]->flags & IR3_REG_FIRST_KILL))
|
||||
remove_src(ctx, instr, instr->srcs[i]);
|
||||
}
|
||||
for (unsigned i = 0; i < instr->dsts_count; i++) {
|
||||
if (ra_reg_is_dst(instr->dsts[i]) &&
|
||||
(instr->dsts[i]->flags & IR3_REG_UNUSED))
|
||||
remove_dst(ctx, instr->dsts[i]);
|
||||
}
|
||||
for (unsigned i = 0; i < instr->srcs_count; i++) {
|
||||
if (ra_reg_is_src(instr->srcs[i]) &&
|
||||
(instr->srcs[i]->flags & IR3_REG_FIRST_KILL))
|
||||
remove_src(ctx, instr, instr->srcs[i]);
|
||||
}
|
||||
for (unsigned i = 0; i < instr->dsts_count; i++) {
|
||||
if (ra_reg_is_dst(instr->dsts[i]) &&
|
||||
(instr->dsts[i]->flags & IR3_REG_UNUSED))
|
||||
remove_dst(ctx, instr->dsts[i]);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
handle_input_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
|
||||
{
|
||||
init_dst(ctx, instr->dsts[0]);
|
||||
insert_dst(ctx, instr->dsts[0]);
|
||||
init_dst(ctx, instr->dsts[0]);
|
||||
insert_dst(ctx, instr->dsts[0]);
|
||||
}
|
||||
|
||||
static void
|
||||
remove_input_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
|
||||
{
|
||||
ra_foreach_src(src, instr)
|
||||
remove_src(ctx, instr, src);
|
||||
if (instr->dsts[0]->flags & IR3_REG_UNUSED)
|
||||
remove_dst(ctx, instr->dsts[0]);
|
||||
ra_foreach_src (src, instr)
|
||||
remove_src(ctx, instr, src);
|
||||
if (instr->dsts[0]->flags & IR3_REG_UNUSED)
|
||||
remove_dst(ctx, instr->dsts[0]);
|
||||
}
|
||||
|
||||
static void
|
||||
handle_live_in(struct ra_spill_ctx *ctx, struct ir3_register *def)
|
||||
{
|
||||
struct ra_spill_interval *interval = &ctx->intervals[def->name];
|
||||
ra_spill_interval_init(interval, def);
|
||||
insert_dst(ctx, def);
|
||||
struct ra_spill_interval *interval = &ctx->intervals[def->name];
|
||||
ra_spill_interval_init(interval, def);
|
||||
insert_dst(ctx, def);
|
||||
}
|
||||
|
||||
static void
|
||||
handle_block(struct ra_spill_ctx *ctx, struct ir3_block *block)
|
||||
{
|
||||
memset(&ctx->cur_pressure, 0, sizeof(ctx->cur_pressure));
|
||||
rb_tree_init(&ctx->reg_ctx.intervals);
|
||||
memset(&ctx->cur_pressure, 0, sizeof(ctx->cur_pressure));
|
||||
rb_tree_init(&ctx->reg_ctx.intervals);
|
||||
|
||||
unsigned name;
|
||||
BITSET_FOREACH_SET(name, ctx->live->live_in[block->index],
|
||||
ctx->live->definitions_count) {
|
||||
struct ir3_register *reg = ctx->live->definitions[name];
|
||||
handle_live_in(ctx, reg);
|
||||
}
|
||||
unsigned name;
|
||||
BITSET_FOREACH_SET (name, ctx->live->live_in[block->index],
|
||||
ctx->live->definitions_count) {
|
||||
struct ir3_register *reg = ctx->live->definitions[name];
|
||||
handle_live_in(ctx, reg);
|
||||
}
|
||||
|
||||
foreach_instr (instr, &block->instr_list) {
|
||||
if (instr->opc != OPC_META_PHI && instr->opc != OPC_META_INPUT &&
|
||||
instr->opc != OPC_META_TEX_PREFETCH)
|
||||
break;
|
||||
handle_input_phi(ctx, instr);
|
||||
}
|
||||
foreach_instr (instr, &block->instr_list) {
|
||||
if (instr->opc != OPC_META_PHI && instr->opc != OPC_META_INPUT &&
|
||||
instr->opc != OPC_META_TEX_PREFETCH)
|
||||
break;
|
||||
handle_input_phi(ctx, instr);
|
||||
}
|
||||
|
||||
update_max_pressure(ctx);
|
||||
update_max_pressure(ctx);
|
||||
|
||||
foreach_instr (instr, &block->instr_list) {
|
||||
if (instr->opc == OPC_META_PHI || instr->opc == OPC_META_INPUT ||
|
||||
instr->opc == OPC_META_TEX_PREFETCH)
|
||||
remove_input_phi(ctx, instr);
|
||||
else
|
||||
handle_instr(ctx, instr);
|
||||
}
|
||||
foreach_instr (instr, &block->instr_list) {
|
||||
if (instr->opc == OPC_META_PHI || instr->opc == OPC_META_INPUT ||
|
||||
instr->opc == OPC_META_TEX_PREFETCH)
|
||||
remove_input_phi(ctx, instr);
|
||||
else
|
||||
handle_instr(ctx, instr);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
ir3_calc_pressure(struct ir3_shader_variant *v, struct ir3_liveness *live,
|
||||
struct ir3_pressure *max_pressure)
|
||||
struct ir3_pressure *max_pressure)
|
||||
{
|
||||
struct ra_spill_ctx ctx = {};
|
||||
ctx.live = live;
|
||||
ctx.intervals = calloc(live->definitions_count, sizeof(*ctx.intervals));
|
||||
ctx.compiler = v->shader->compiler;
|
||||
spill_ctx_init(&ctx);
|
||||
struct ra_spill_ctx ctx = {};
|
||||
ctx.live = live;
|
||||
ctx.intervals = calloc(live->definitions_count, sizeof(*ctx.intervals));
|
||||
ctx.compiler = v->shader->compiler;
|
||||
spill_ctx_init(&ctx);
|
||||
|
||||
foreach_block (block, &v->ir->block_list) {
|
||||
handle_block(&ctx, block);
|
||||
}
|
||||
foreach_block (block, &v->ir->block_list) {
|
||||
handle_block(&ctx, block);
|
||||
}
|
||||
|
||||
assert(ctx.cur_pressure.full == 0);
|
||||
assert(ctx.cur_pressure.half == 0);
|
||||
assert(ctx.cur_pressure.shared == 0);
|
||||
assert(ctx.cur_pressure.full == 0);
|
||||
assert(ctx.cur_pressure.half == 0);
|
||||
assert(ctx.cur_pressure.shared == 0);
|
||||
|
||||
free(ctx.intervals);
|
||||
free(ctx.intervals);
|
||||
|
||||
*max_pressure = ctx.max_pressure;
|
||||
*max_pressure = ctx.max_pressure;
|
||||
}
|
||||
|
||||
|
|
|
@ -28,61 +28,64 @@
|
|||
#include "ir3.h"
|
||||
|
||||
struct ir3_validate_ctx {
|
||||
struct ir3 *ir;
|
||||
struct ir3 *ir;
|
||||
|
||||
/* Current instruction being validated: */
|
||||
struct ir3_instruction *current_instr;
|
||||
/* Current instruction being validated: */
|
||||
struct ir3_instruction *current_instr;
|
||||
|
||||
/* Set of instructions found so far, used to validate that we
|
||||
* don't have SSA uses that occur before def's
|
||||
*/
|
||||
struct set *defs;
|
||||
/* Set of instructions found so far, used to validate that we
|
||||
* don't have SSA uses that occur before def's
|
||||
*/
|
||||
struct set *defs;
|
||||
};
|
||||
|
||||
static void
|
||||
validate_error(struct ir3_validate_ctx *ctx, const char *condstr)
|
||||
{
|
||||
fprintf(stderr, "validation fail: %s\n", condstr);
|
||||
fprintf(stderr, " -> for instruction: ");
|
||||
ir3_print_instr(ctx->current_instr);
|
||||
abort();
|
||||
fprintf(stderr, "validation fail: %s\n", condstr);
|
||||
fprintf(stderr, " -> for instruction: ");
|
||||
ir3_print_instr(ctx->current_instr);
|
||||
abort();
|
||||
}
|
||||
|
||||
#define validate_assert(ctx, cond) do { \
|
||||
if (!(cond)) { \
|
||||
validate_error(ctx, #cond); \
|
||||
} } while (0)
|
||||
#define validate_assert(ctx, cond) \
|
||||
do { \
|
||||
if (!(cond)) { \
|
||||
validate_error(ctx, #cond); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
static unsigned
|
||||
reg_class_flags(struct ir3_register *reg)
|
||||
{
|
||||
return reg->flags & (IR3_REG_HALF | IR3_REG_SHARED);
|
||||
return reg->flags & (IR3_REG_HALF | IR3_REG_SHARED);
|
||||
}
|
||||
|
||||
static void
|
||||
validate_src(struct ir3_validate_ctx *ctx, struct ir3_instruction *instr,
|
||||
struct ir3_register *reg)
|
||||
struct ir3_register *reg)
|
||||
{
|
||||
if (!(reg->flags & IR3_REG_SSA) || !reg->def)
|
||||
return;
|
||||
if (!(reg->flags & IR3_REG_SSA) || !reg->def)
|
||||
return;
|
||||
|
||||
struct ir3_register *src = reg->def;
|
||||
struct ir3_register *src = reg->def;
|
||||
|
||||
validate_assert(ctx, _mesa_set_search(ctx->defs, src->instr));
|
||||
validate_assert(ctx, src->wrmask == reg->wrmask);
|
||||
validate_assert(ctx, reg_class_flags(src) == reg_class_flags(reg));
|
||||
validate_assert(ctx, _mesa_set_search(ctx->defs, src->instr));
|
||||
validate_assert(ctx, src->wrmask == reg->wrmask);
|
||||
validate_assert(ctx, reg_class_flags(src) == reg_class_flags(reg));
|
||||
|
||||
if (reg->tied) {
|
||||
validate_assert(ctx, reg->tied->tied == reg);
|
||||
bool found = false;
|
||||
foreach_dst (dst, instr) {
|
||||
if (dst == reg->tied) {
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
validate_assert(ctx, found && "tied register not in the same instruction");
|
||||
}
|
||||
if (reg->tied) {
|
||||
validate_assert(ctx, reg->tied->tied == reg);
|
||||
bool found = false;
|
||||
foreach_dst (dst, instr) {
|
||||
if (dst == reg->tied) {
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
validate_assert(ctx,
|
||||
found && "tied register not in the same instruction");
|
||||
}
|
||||
}
|
||||
|
||||
/* phi sources are logically read at the end of the predecessor basic block,
|
||||
|
@ -90,275 +93,280 @@ validate_src(struct ir3_validate_ctx *ctx, struct ir3_instruction *instr,
|
|||
* use comes after the definition for loop phis.
|
||||
*/
|
||||
static void
|
||||
validate_phi_src(struct ir3_validate_ctx *ctx, struct ir3_block *block, struct ir3_block *pred)
|
||||
validate_phi_src(struct ir3_validate_ctx *ctx, struct ir3_block *block,
|
||||
struct ir3_block *pred)
|
||||
{
|
||||
unsigned pred_idx = ir3_block_get_pred_index(block, pred);
|
||||
unsigned pred_idx = ir3_block_get_pred_index(block, pred);
|
||||
|
||||
foreach_instr (phi, &block->instr_list) {
|
||||
if (phi->opc != OPC_META_PHI)
|
||||
break;
|
||||
foreach_instr (phi, &block->instr_list) {
|
||||
if (phi->opc != OPC_META_PHI)
|
||||
break;
|
||||
|
||||
ctx->current_instr = phi;
|
||||
validate_assert(ctx, phi->srcs_count == block->predecessors_count);
|
||||
validate_src(ctx, phi, phi->srcs[pred_idx]);
|
||||
}
|
||||
ctx->current_instr = phi;
|
||||
validate_assert(ctx, phi->srcs_count == block->predecessors_count);
|
||||
validate_src(ctx, phi, phi->srcs[pred_idx]);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
validate_phi(struct ir3_validate_ctx *ctx, struct ir3_instruction *phi)
|
||||
{
|
||||
_mesa_set_add(ctx->defs, phi);
|
||||
validate_assert(ctx, phi->dsts_count == 1);
|
||||
validate_assert(ctx, is_dest_gpr(phi->dsts[0]));
|
||||
_mesa_set_add(ctx->defs, phi);
|
||||
validate_assert(ctx, phi->dsts_count == 1);
|
||||
validate_assert(ctx, is_dest_gpr(phi->dsts[0]));
|
||||
}
|
||||
|
||||
static void
|
||||
validate_dst(struct ir3_validate_ctx *ctx, struct ir3_instruction *instr,
|
||||
struct ir3_register *reg)
|
||||
struct ir3_register *reg)
|
||||
{
|
||||
if (reg->tied) {
|
||||
validate_assert(ctx, reg->tied->tied == reg);
|
||||
validate_assert(ctx, reg_class_flags(reg->tied) == reg_class_flags(reg));
|
||||
validate_assert(ctx, reg->tied->wrmask == reg->wrmask);
|
||||
if (reg->flags & IR3_REG_ARRAY) {
|
||||
validate_assert(ctx, reg->tied->array.base == reg->array.base);
|
||||
validate_assert(ctx, reg->tied->size == reg->size);
|
||||
}
|
||||
bool found = false;
|
||||
foreach_src (src, instr) {
|
||||
if (src == reg->tied) {
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
validate_assert(ctx, found && "tied register not in the same instruction");
|
||||
}
|
||||
if (reg->tied) {
|
||||
validate_assert(ctx, reg->tied->tied == reg);
|
||||
validate_assert(ctx, reg_class_flags(reg->tied) == reg_class_flags(reg));
|
||||
validate_assert(ctx, reg->tied->wrmask == reg->wrmask);
|
||||
if (reg->flags & IR3_REG_ARRAY) {
|
||||
validate_assert(ctx, reg->tied->array.base == reg->array.base);
|
||||
validate_assert(ctx, reg->tied->size == reg->size);
|
||||
}
|
||||
bool found = false;
|
||||
foreach_src (src, instr) {
|
||||
if (src == reg->tied) {
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
validate_assert(ctx,
|
||||
found && "tied register not in the same instruction");
|
||||
}
|
||||
|
||||
if (reg->flags & IR3_REG_SSA)
|
||||
validate_assert(ctx, reg->instr == instr);
|
||||
if (reg->flags & IR3_REG_SSA)
|
||||
validate_assert(ctx, reg->instr == instr);
|
||||
|
||||
if (reg->flags & IR3_REG_RELATIV)
|
||||
validate_assert(ctx, instr->address);
|
||||
if (reg->flags & IR3_REG_RELATIV)
|
||||
validate_assert(ctx, instr->address);
|
||||
}
|
||||
|
||||
#define validate_reg_size(ctx, reg, type) \
|
||||
validate_assert(ctx, type_size(type) == (((reg)->flags & IR3_REG_HALF) ? 16 : 32))
|
||||
#define validate_reg_size(ctx, reg, type) \
|
||||
validate_assert( \
|
||||
ctx, type_size(type) == (((reg)->flags & IR3_REG_HALF) ? 16 : 32))
|
||||
|
||||
static void
|
||||
validate_instr(struct ir3_validate_ctx *ctx, struct ir3_instruction *instr)
|
||||
{
|
||||
struct ir3_register *last_reg = NULL;
|
||||
struct ir3_register *last_reg = NULL;
|
||||
|
||||
foreach_src_n (reg, n, instr) {
|
||||
if (reg->flags & IR3_REG_RELATIV)
|
||||
validate_assert(ctx, instr->address);
|
||||
foreach_src_n (reg, n, instr) {
|
||||
if (reg->flags & IR3_REG_RELATIV)
|
||||
validate_assert(ctx, instr->address);
|
||||
|
||||
validate_src(ctx, instr, reg);
|
||||
validate_src(ctx, instr, reg);
|
||||
|
||||
/* Validate that all src's are either half or full.
|
||||
*
|
||||
* Note: tex instructions w/ .s2en are a bit special in that the
|
||||
* tex/samp src reg is half-reg for non-bindless and full for
|
||||
* bindless, irrespective of the precision of other srcs. The
|
||||
* tex/samp src is the first src reg when .s2en is set
|
||||
*/
|
||||
if (reg->tied) {
|
||||
/* must have the same size as the destination, handled in
|
||||
* validate_dst().
|
||||
*/
|
||||
} else if (reg == instr->address) {
|
||||
validate_assert(ctx, reg->flags & IR3_REG_HALF);
|
||||
} else if ((instr->flags & IR3_INSTR_S2EN) && (n < 2)) {
|
||||
if (n == 0) {
|
||||
if (instr->flags & IR3_INSTR_B)
|
||||
validate_assert(ctx, !(reg->flags & IR3_REG_HALF));
|
||||
else
|
||||
validate_assert(ctx, reg->flags & IR3_REG_HALF);
|
||||
}
|
||||
} else if (opc_cat(instr->opc) == 6) {
|
||||
/* handled below */
|
||||
} else if (opc_cat(instr->opc) == 0) {
|
||||
/* end/chmask/etc are allowed to have different size sources */
|
||||
} else if (n > 0) {
|
||||
validate_assert(ctx, (last_reg->flags & IR3_REG_HALF) == (reg->flags & IR3_REG_HALF));
|
||||
}
|
||||
/* Validate that all src's are either half or full.
|
||||
*
|
||||
* Note: tex instructions w/ .s2en are a bit special in that the
|
||||
* tex/samp src reg is half-reg for non-bindless and full for
|
||||
* bindless, irrespective of the precision of other srcs. The
|
||||
* tex/samp src is the first src reg when .s2en is set
|
||||
*/
|
||||
if (reg->tied) {
|
||||
/* must have the same size as the destination, handled in
|
||||
* validate_dst().
|
||||
*/
|
||||
} else if (reg == instr->address) {
|
||||
validate_assert(ctx, reg->flags & IR3_REG_HALF);
|
||||
} else if ((instr->flags & IR3_INSTR_S2EN) && (n < 2)) {
|
||||
if (n == 0) {
|
||||
if (instr->flags & IR3_INSTR_B)
|
||||
validate_assert(ctx, !(reg->flags & IR3_REG_HALF));
|
||||
else
|
||||
validate_assert(ctx, reg->flags & IR3_REG_HALF);
|
||||
}
|
||||
} else if (opc_cat(instr->opc) == 6) {
|
||||
/* handled below */
|
||||
} else if (opc_cat(instr->opc) == 0) {
|
||||
/* end/chmask/etc are allowed to have different size sources */
|
||||
} else if (n > 0) {
|
||||
validate_assert(ctx, (last_reg->flags & IR3_REG_HALF) ==
|
||||
(reg->flags & IR3_REG_HALF));
|
||||
}
|
||||
|
||||
last_reg = reg;
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < instr->dsts_count; i++) {
|
||||
struct ir3_register *reg = instr->dsts[i];
|
||||
last_reg = reg;
|
||||
}
|
||||
|
||||
validate_dst(ctx, instr, reg);
|
||||
}
|
||||
for (unsigned i = 0; i < instr->dsts_count; i++) {
|
||||
struct ir3_register *reg = instr->dsts[i];
|
||||
|
||||
_mesa_set_add(ctx->defs, instr);
|
||||
validate_dst(ctx, instr, reg);
|
||||
}
|
||||
|
||||
/* Check that src/dst types match the register types, and for
|
||||
* instructions that have different opcodes depending on type,
|
||||
* that the opcodes are correct.
|
||||
*/
|
||||
switch (opc_cat(instr->opc)) {
|
||||
case 1: /* move instructions */
|
||||
if (instr->opc == OPC_MOVMSK || instr->opc == OPC_BALLOT_MACRO) {
|
||||
validate_assert(ctx, instr->dsts_count == 1);
|
||||
validate_assert(ctx, instr->dsts[0]->flags & IR3_REG_SHARED);
|
||||
validate_assert(ctx, !(instr->dsts[0]->flags & IR3_REG_HALF));
|
||||
validate_assert(ctx, util_is_power_of_two_or_zero(instr->dsts[0]->wrmask + 1));
|
||||
} else if (instr->opc == OPC_ANY_MACRO || instr->opc == OPC_ALL_MACRO ||
|
||||
instr->opc == OPC_READ_FIRST_MACRO ||
|
||||
instr->opc == OPC_READ_COND_MACRO) {
|
||||
/* nothing yet */
|
||||
} else if (instr->opc == OPC_ELECT_MACRO) {
|
||||
validate_assert(ctx, instr->dsts_count == 1);
|
||||
validate_assert(ctx, !(instr->dsts[0]->flags & IR3_REG_SHARED));
|
||||
} else {
|
||||
foreach_dst (dst, instr)
|
||||
validate_reg_size(ctx, dst, instr->cat1.dst_type);
|
||||
foreach_src (src, instr) {
|
||||
if (!src->tied && src != instr->address)
|
||||
validate_reg_size(ctx, src, instr->cat1.src_type);
|
||||
}
|
||||
_mesa_set_add(ctx->defs, instr);
|
||||
|
||||
switch (instr->opc) {
|
||||
case OPC_SWZ:
|
||||
validate_assert(ctx, instr->srcs_count == 2);
|
||||
validate_assert(ctx, instr->dsts_count == 2);
|
||||
break;
|
||||
case OPC_GAT:
|
||||
validate_assert(ctx, instr->srcs_count == 4);
|
||||
validate_assert(ctx, instr->dsts_count == 1);
|
||||
break;
|
||||
case OPC_SCT:
|
||||
validate_assert(ctx, instr->srcs_count == 1);
|
||||
validate_assert(ctx, instr->dsts_count == 4);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
/* Check that src/dst types match the register types, and for
|
||||
* instructions that have different opcodes depending on type,
|
||||
* that the opcodes are correct.
|
||||
*/
|
||||
switch (opc_cat(instr->opc)) {
|
||||
case 1: /* move instructions */
|
||||
if (instr->opc == OPC_MOVMSK || instr->opc == OPC_BALLOT_MACRO) {
|
||||
validate_assert(ctx, instr->dsts_count == 1);
|
||||
validate_assert(ctx, instr->dsts[0]->flags & IR3_REG_SHARED);
|
||||
validate_assert(ctx, !(instr->dsts[0]->flags & IR3_REG_HALF));
|
||||
validate_assert(
|
||||
ctx, util_is_power_of_two_or_zero(instr->dsts[0]->wrmask + 1));
|
||||
} else if (instr->opc == OPC_ANY_MACRO || instr->opc == OPC_ALL_MACRO ||
|
||||
instr->opc == OPC_READ_FIRST_MACRO ||
|
||||
instr->opc == OPC_READ_COND_MACRO) {
|
||||
/* nothing yet */
|
||||
} else if (instr->opc == OPC_ELECT_MACRO) {
|
||||
validate_assert(ctx, instr->dsts_count == 1);
|
||||
validate_assert(ctx, !(instr->dsts[0]->flags & IR3_REG_SHARED));
|
||||
} else {
|
||||
foreach_dst (dst, instr)
|
||||
validate_reg_size(ctx, dst, instr->cat1.dst_type);
|
||||
foreach_src (src, instr) {
|
||||
if (!src->tied && src != instr->address)
|
||||
validate_reg_size(ctx, src, instr->cat1.src_type);
|
||||
}
|
||||
|
||||
if (instr->opc != OPC_MOV)
|
||||
validate_assert(ctx, !instr->address);
|
||||
switch (instr->opc) {
|
||||
case OPC_SWZ:
|
||||
validate_assert(ctx, instr->srcs_count == 2);
|
||||
validate_assert(ctx, instr->dsts_count == 2);
|
||||
break;
|
||||
case OPC_GAT:
|
||||
validate_assert(ctx, instr->srcs_count == 4);
|
||||
validate_assert(ctx, instr->dsts_count == 1);
|
||||
break;
|
||||
case OPC_SCT:
|
||||
validate_assert(ctx, instr->srcs_count == 1);
|
||||
validate_assert(ctx, instr->dsts_count == 4);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
break;
|
||||
case 3:
|
||||
/* Validate that cat3 opc matches the src type. We've already checked that all
|
||||
* the src regs are same type
|
||||
*/
|
||||
if (instr->srcs[0]->flags & IR3_REG_HALF) {
|
||||
validate_assert(ctx, instr->opc == cat3_half_opc(instr->opc));
|
||||
} else {
|
||||
validate_assert(ctx, instr->opc == cat3_full_opc(instr->opc));
|
||||
}
|
||||
break;
|
||||
case 4:
|
||||
/* Validate that cat4 opc matches the dst type: */
|
||||
if (instr->dsts[0]->flags & IR3_REG_HALF) {
|
||||
validate_assert(ctx, instr->opc == cat4_half_opc(instr->opc));
|
||||
} else {
|
||||
validate_assert(ctx, instr->opc == cat4_full_opc(instr->opc));
|
||||
}
|
||||
break;
|
||||
case 5:
|
||||
validate_reg_size(ctx, instr->dsts[0], instr->cat5.type);
|
||||
break;
|
||||
case 6:
|
||||
switch (instr->opc) {
|
||||
case OPC_RESINFO:
|
||||
case OPC_RESFMT:
|
||||
validate_reg_size(ctx, instr->dsts[0], instr->cat6.type);
|
||||
validate_reg_size(ctx, instr->srcs[0], instr->cat6.type);
|
||||
break;
|
||||
case OPC_L2G:
|
||||
case OPC_G2L:
|
||||
validate_assert(ctx, !(instr->dsts[0]->flags & IR3_REG_HALF));
|
||||
validate_assert(ctx, !(instr->srcs[0]->flags & IR3_REG_HALF));
|
||||
break;
|
||||
case OPC_STG:
|
||||
validate_assert(ctx, !(instr->srcs[0]->flags & IR3_REG_HALF));
|
||||
validate_assert(ctx, !(instr->srcs[1]->flags & IR3_REG_HALF));
|
||||
validate_reg_size(ctx, instr->srcs[2], instr->cat6.type);
|
||||
validate_assert(ctx, !(instr->srcs[3]->flags & IR3_REG_HALF));
|
||||
break;
|
||||
case OPC_STG_A:
|
||||
validate_assert(ctx, !(instr->srcs[0]->flags & IR3_REG_HALF));
|
||||
validate_assert(ctx, !(instr->srcs[2]->flags & IR3_REG_HALF));
|
||||
validate_assert(ctx, !(instr->srcs[3]->flags & IR3_REG_HALF));
|
||||
validate_reg_size(ctx, instr->srcs[4], instr->cat6.type);
|
||||
validate_assert(ctx, !(instr->srcs[5]->flags & IR3_REG_HALF));
|
||||
break;
|
||||
case OPC_STL:
|
||||
case OPC_STP:
|
||||
case OPC_STLW:
|
||||
validate_assert(ctx, !(instr->srcs[0]->flags & IR3_REG_HALF));
|
||||
validate_reg_size(ctx, instr->srcs[1], instr->cat6.type);
|
||||
validate_assert(ctx, !(instr->srcs[2]->flags & IR3_REG_HALF));
|
||||
break;
|
||||
case OPC_STIB:
|
||||
if (instr->flags & IR3_INSTR_B) {
|
||||
validate_assert(ctx, !(instr->srcs[0]->flags & IR3_REG_HALF));
|
||||
validate_assert(ctx, !(instr->srcs[1]->flags & IR3_REG_HALF));
|
||||
validate_reg_size(ctx, instr->srcs[2], instr->cat6.type);
|
||||
} else {
|
||||
validate_assert(ctx, !(instr->srcs[0]->flags & IR3_REG_HALF));
|
||||
validate_reg_size(ctx, instr->srcs[1], instr->cat6.type);
|
||||
validate_assert(ctx, !(instr->srcs[2]->flags & IR3_REG_HALF));
|
||||
}
|
||||
break;
|
||||
default:
|
||||
validate_reg_size(ctx, instr->dsts[0], instr->cat6.type);
|
||||
validate_assert(ctx, !(instr->srcs[0]->flags & IR3_REG_HALF));
|
||||
if (instr->srcs_count > 1)
|
||||
validate_assert(ctx, !(instr->srcs[1]->flags & IR3_REG_HALF));
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (instr->opc != OPC_MOV)
|
||||
validate_assert(ctx, !instr->address);
|
||||
|
||||
break;
|
||||
case 3:
|
||||
/* Validate that cat3 opc matches the src type. We've already checked
|
||||
* that all the src regs are same type
|
||||
*/
|
||||
if (instr->srcs[0]->flags & IR3_REG_HALF) {
|
||||
validate_assert(ctx, instr->opc == cat3_half_opc(instr->opc));
|
||||
} else {
|
||||
validate_assert(ctx, instr->opc == cat3_full_opc(instr->opc));
|
||||
}
|
||||
break;
|
||||
case 4:
|
||||
/* Validate that cat4 opc matches the dst type: */
|
||||
if (instr->dsts[0]->flags & IR3_REG_HALF) {
|
||||
validate_assert(ctx, instr->opc == cat4_half_opc(instr->opc));
|
||||
} else {
|
||||
validate_assert(ctx, instr->opc == cat4_full_opc(instr->opc));
|
||||
}
|
||||
break;
|
||||
case 5:
|
||||
validate_reg_size(ctx, instr->dsts[0], instr->cat5.type);
|
||||
break;
|
||||
case 6:
|
||||
switch (instr->opc) {
|
||||
case OPC_RESINFO:
|
||||
case OPC_RESFMT:
|
||||
validate_reg_size(ctx, instr->dsts[0], instr->cat6.type);
|
||||
validate_reg_size(ctx, instr->srcs[0], instr->cat6.type);
|
||||
break;
|
||||
case OPC_L2G:
|
||||
case OPC_G2L:
|
||||
validate_assert(ctx, !(instr->dsts[0]->flags & IR3_REG_HALF));
|
||||
validate_assert(ctx, !(instr->srcs[0]->flags & IR3_REG_HALF));
|
||||
break;
|
||||
case OPC_STG:
|
||||
validate_assert(ctx, !(instr->srcs[0]->flags & IR3_REG_HALF));
|
||||
validate_assert(ctx, !(instr->srcs[1]->flags & IR3_REG_HALF));
|
||||
validate_reg_size(ctx, instr->srcs[2], instr->cat6.type);
|
||||
validate_assert(ctx, !(instr->srcs[3]->flags & IR3_REG_HALF));
|
||||
break;
|
||||
case OPC_STG_A:
|
||||
validate_assert(ctx, !(instr->srcs[0]->flags & IR3_REG_HALF));
|
||||
validate_assert(ctx, !(instr->srcs[2]->flags & IR3_REG_HALF));
|
||||
validate_assert(ctx, !(instr->srcs[3]->flags & IR3_REG_HALF));
|
||||
validate_reg_size(ctx, instr->srcs[4], instr->cat6.type);
|
||||
validate_assert(ctx, !(instr->srcs[5]->flags & IR3_REG_HALF));
|
||||
break;
|
||||
case OPC_STL:
|
||||
case OPC_STP:
|
||||
case OPC_STLW:
|
||||
validate_assert(ctx, !(instr->srcs[0]->flags & IR3_REG_HALF));
|
||||
validate_reg_size(ctx, instr->srcs[1], instr->cat6.type);
|
||||
validate_assert(ctx, !(instr->srcs[2]->flags & IR3_REG_HALF));
|
||||
break;
|
||||
case OPC_STIB:
|
||||
if (instr->flags & IR3_INSTR_B) {
|
||||
validate_assert(ctx, !(instr->srcs[0]->flags & IR3_REG_HALF));
|
||||
validate_assert(ctx, !(instr->srcs[1]->flags & IR3_REG_HALF));
|
||||
validate_reg_size(ctx, instr->srcs[2], instr->cat6.type);
|
||||
} else {
|
||||
validate_assert(ctx, !(instr->srcs[0]->flags & IR3_REG_HALF));
|
||||
validate_reg_size(ctx, instr->srcs[1], instr->cat6.type);
|
||||
validate_assert(ctx, !(instr->srcs[2]->flags & IR3_REG_HALF));
|
||||
}
|
||||
break;
|
||||
default:
|
||||
validate_reg_size(ctx, instr->dsts[0], instr->cat6.type);
|
||||
validate_assert(ctx, !(instr->srcs[0]->flags & IR3_REG_HALF));
|
||||
if (instr->srcs_count > 1)
|
||||
validate_assert(ctx, !(instr->srcs[1]->flags & IR3_REG_HALF));
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
ir3_validate(struct ir3 *ir)
|
||||
{
|
||||
#ifdef NDEBUG
|
||||
# define VALIDATE 0
|
||||
#define VALIDATE 0
|
||||
#else
|
||||
# define VALIDATE 1
|
||||
#define VALIDATE 1
|
||||
#endif
|
||||
|
||||
if (!VALIDATE)
|
||||
return;
|
||||
if (!VALIDATE)
|
||||
return;
|
||||
|
||||
struct ir3_validate_ctx *ctx = ralloc_size(NULL, sizeof(*ctx));
|
||||
struct ir3_validate_ctx *ctx = ralloc_size(NULL, sizeof(*ctx));
|
||||
|
||||
ctx->ir = ir;
|
||||
ctx->defs = _mesa_pointer_set_create(ctx);
|
||||
ctx->ir = ir;
|
||||
ctx->defs = _mesa_pointer_set_create(ctx);
|
||||
|
||||
foreach_block (block, &ir->block_list) {
|
||||
/* We require that the first block does not have any predecessors,
|
||||
* which allows us to assume that phi nodes and meta:input's do not
|
||||
* appear in the same basic block.
|
||||
*/
|
||||
validate_assert(ctx,
|
||||
block != ir3_start_block(ir) || block->predecessors_count == 0);
|
||||
foreach_block (block, &ir->block_list) {
|
||||
/* We require that the first block does not have any predecessors,
|
||||
* which allows us to assume that phi nodes and meta:input's do not
|
||||
* appear in the same basic block.
|
||||
*/
|
||||
validate_assert(
|
||||
ctx, block != ir3_start_block(ir) || block->predecessors_count == 0);
|
||||
|
||||
struct ir3_instruction *prev = NULL;
|
||||
foreach_instr (instr, &block->instr_list) {
|
||||
ctx->current_instr = instr;
|
||||
if (instr->opc == OPC_META_PHI) {
|
||||
/* phis must be the first in the block */
|
||||
validate_assert(ctx, prev == NULL || prev->opc == OPC_META_PHI);
|
||||
validate_phi(ctx, instr);
|
||||
} else {
|
||||
validate_instr(ctx, instr);
|
||||
}
|
||||
prev = instr;
|
||||
}
|
||||
struct ir3_instruction *prev = NULL;
|
||||
foreach_instr (instr, &block->instr_list) {
|
||||
ctx->current_instr = instr;
|
||||
if (instr->opc == OPC_META_PHI) {
|
||||
/* phis must be the first in the block */
|
||||
validate_assert(ctx, prev == NULL || prev->opc == OPC_META_PHI);
|
||||
validate_phi(ctx, instr);
|
||||
} else {
|
||||
validate_instr(ctx, instr);
|
||||
}
|
||||
prev = instr;
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < 2; i++) {
|
||||
if (block->successors[i])
|
||||
validate_phi_src(ctx, block->successors[i], block);
|
||||
}
|
||||
}
|
||||
for (unsigned i = 0; i < 2; i++) {
|
||||
if (block->successors[i])
|
||||
validate_phi_src(ctx, block->successors[i], block);
|
||||
}
|
||||
}
|
||||
|
||||
ralloc_free(ctx);
|
||||
ralloc_free(ctx);
|
||||
}
|
||||
|
|
|
@ -32,100 +32,100 @@
|
|||
typedef BITSET_DECLARE(regmaskstate_t, 2 * MAX_REG);
|
||||
|
||||
typedef struct {
|
||||
bool mergedregs;
|
||||
regmaskstate_t mask;
|
||||
bool mergedregs;
|
||||
regmaskstate_t mask;
|
||||
} regmask_t;
|
||||
|
||||
static inline bool
|
||||
__regmask_get(regmask_t *regmask, bool half, unsigned n)
|
||||
{
|
||||
if (regmask->mergedregs) {
|
||||
/* a6xx+ case, with merged register file, we track things in terms
|
||||
* of half-precision registers, with a full precisions register
|
||||
* using two half-precision slots:
|
||||
*/
|
||||
if (half) {
|
||||
return BITSET_TEST(regmask->mask, n);
|
||||
} else {
|
||||
n *= 2;
|
||||
return BITSET_TEST(regmask->mask, n) ||
|
||||
BITSET_TEST(regmask->mask, n+1);
|
||||
}
|
||||
} else {
|
||||
/* pre a6xx case, with separate register file for half and full
|
||||
* precision:
|
||||
*/
|
||||
if (half)
|
||||
n += MAX_REG;
|
||||
return BITSET_TEST(regmask->mask, n);
|
||||
}
|
||||
if (regmask->mergedregs) {
|
||||
/* a6xx+ case, with merged register file, we track things in terms
|
||||
* of half-precision registers, with a full precisions register
|
||||
* using two half-precision slots:
|
||||
*/
|
||||
if (half) {
|
||||
return BITSET_TEST(regmask->mask, n);
|
||||
} else {
|
||||
n *= 2;
|
||||
return BITSET_TEST(regmask->mask, n) ||
|
||||
BITSET_TEST(regmask->mask, n + 1);
|
||||
}
|
||||
} else {
|
||||
/* pre a6xx case, with separate register file for half and full
|
||||
* precision:
|
||||
*/
|
||||
if (half)
|
||||
n += MAX_REG;
|
||||
return BITSET_TEST(regmask->mask, n);
|
||||
}
|
||||
}
|
||||
|
||||
static inline void
|
||||
__regmask_set(regmask_t *regmask, bool half, unsigned n)
|
||||
{
|
||||
if (regmask->mergedregs) {
|
||||
/* a6xx+ case, with merged register file, we track things in terms
|
||||
* of half-precision registers, with a full precisions register
|
||||
* using two half-precision slots:
|
||||
*/
|
||||
if (half) {
|
||||
BITSET_SET(regmask->mask, n);
|
||||
} else {
|
||||
n *= 2;
|
||||
BITSET_SET(regmask->mask, n);
|
||||
BITSET_SET(regmask->mask, n+1);
|
||||
}
|
||||
} else {
|
||||
/* pre a6xx case, with separate register file for half and full
|
||||
* precision:
|
||||
*/
|
||||
if (half)
|
||||
n += MAX_REG;
|
||||
BITSET_SET(regmask->mask, n);
|
||||
}
|
||||
if (regmask->mergedregs) {
|
||||
/* a6xx+ case, with merged register file, we track things in terms
|
||||
* of half-precision registers, with a full precisions register
|
||||
* using two half-precision slots:
|
||||
*/
|
||||
if (half) {
|
||||
BITSET_SET(regmask->mask, n);
|
||||
} else {
|
||||
n *= 2;
|
||||
BITSET_SET(regmask->mask, n);
|
||||
BITSET_SET(regmask->mask, n + 1);
|
||||
}
|
||||
} else {
|
||||
/* pre a6xx case, with separate register file for half and full
|
||||
* precision:
|
||||
*/
|
||||
if (half)
|
||||
n += MAX_REG;
|
||||
BITSET_SET(regmask->mask, n);
|
||||
}
|
||||
}
|
||||
|
||||
static inline void
|
||||
__regmask_clear(regmask_t *regmask, bool half, unsigned n)
|
||||
{
|
||||
if (regmask->mergedregs) {
|
||||
/* a6xx+ case, with merged register file, we track things in terms
|
||||
* of half-precision registers, with a full precisions register
|
||||
* using two half-precision slots:
|
||||
*/
|
||||
if (half) {
|
||||
BITSET_CLEAR(regmask->mask, n);
|
||||
} else {
|
||||
n *= 2;
|
||||
BITSET_CLEAR(regmask->mask, n);
|
||||
BITSET_CLEAR(regmask->mask, n+1);
|
||||
}
|
||||
} else {
|
||||
/* pre a6xx case, with separate register file for half and full
|
||||
* precision:
|
||||
*/
|
||||
if (half)
|
||||
n += MAX_REG;
|
||||
BITSET_CLEAR(regmask->mask, n);
|
||||
}
|
||||
if (regmask->mergedregs) {
|
||||
/* a6xx+ case, with merged register file, we track things in terms
|
||||
* of half-precision registers, with a full precisions register
|
||||
* using two half-precision slots:
|
||||
*/
|
||||
if (half) {
|
||||
BITSET_CLEAR(regmask->mask, n);
|
||||
} else {
|
||||
n *= 2;
|
||||
BITSET_CLEAR(regmask->mask, n);
|
||||
BITSET_CLEAR(regmask->mask, n + 1);
|
||||
}
|
||||
} else {
|
||||
/* pre a6xx case, with separate register file for half and full
|
||||
* precision:
|
||||
*/
|
||||
if (half)
|
||||
n += MAX_REG;
|
||||
BITSET_CLEAR(regmask->mask, n);
|
||||
}
|
||||
}
|
||||
|
||||
static inline void
|
||||
regmask_init(regmask_t *regmask, bool mergedregs)
|
||||
{
|
||||
memset(&regmask->mask, 0, sizeof(regmask->mask));
|
||||
regmask->mergedregs = mergedregs;
|
||||
memset(&regmask->mask, 0, sizeof(regmask->mask));
|
||||
regmask->mergedregs = mergedregs;
|
||||
}
|
||||
|
||||
static inline void
|
||||
regmask_or(regmask_t *dst, regmask_t *a, regmask_t *b)
|
||||
{
|
||||
assert(dst->mergedregs == a->mergedregs);
|
||||
assert(dst->mergedregs == b->mergedregs);
|
||||
assert(dst->mergedregs == a->mergedregs);
|
||||
assert(dst->mergedregs == b->mergedregs);
|
||||
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(dst->mask); i++)
|
||||
dst->mask[i] = a->mask[i] | b->mask[i];
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(dst->mask); i++)
|
||||
dst->mask[i] = a->mask[i] | b->mask[i];
|
||||
}
|
||||
|
||||
#endif /* REGMASK_H_ */
|
||||
|
|
|
@ -42,8 +42,8 @@
|
|||
/* clang-format on */
|
||||
|
||||
static const struct test {
|
||||
const char *asmstr;
|
||||
unsigned expected_delay;
|
||||
const char *asmstr;
|
||||
unsigned expected_delay;
|
||||
} tests[] = {
|
||||
/* clang-format off */
|
||||
TEST(6,
|
||||
|
@ -101,16 +101,16 @@ static const struct test {
|
|||
static struct ir3_shader *
|
||||
parse_asm(struct ir3_compiler *c, const char *asmstr)
|
||||
{
|
||||
struct ir3_kernel_info info = {};
|
||||
FILE *in = fmemopen((void *)asmstr, strlen(asmstr), "r");
|
||||
struct ir3_shader *shader = ir3_parse_asm(c, &info, in);
|
||||
struct ir3_kernel_info info = {};
|
||||
FILE *in = fmemopen((void *)asmstr, strlen(asmstr), "r");
|
||||
struct ir3_shader *shader = ir3_parse_asm(c, &info, in);
|
||||
|
||||
fclose(in);
|
||||
fclose(in);
|
||||
|
||||
if (!shader)
|
||||
errx(-1, "assembler failed");
|
||||
if (!shader)
|
||||
errx(-1, "assembler failed");
|
||||
|
||||
return shader;
|
||||
return shader;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -124,71 +124,70 @@ parse_asm(struct ir3_compiler *c, const char *asmstr)
|
|||
static void
|
||||
fixup_wrmask(struct ir3 *ir)
|
||||
{
|
||||
struct ir3_block *block = ir3_start_block(ir);
|
||||
struct ir3_block *block = ir3_start_block(ir);
|
||||
|
||||
foreach_instr_safe (instr, &block->instr_list) {
|
||||
instr->dsts[0]->wrmask = MASK(instr->repeat + 1);
|
||||
foreach_src (reg, instr) {
|
||||
if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
|
||||
continue;
|
||||
foreach_instr_safe (instr, &block->instr_list) {
|
||||
instr->dsts[0]->wrmask = MASK(instr->repeat + 1);
|
||||
foreach_src (reg, instr) {
|
||||
if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
|
||||
continue;
|
||||
|
||||
if (reg->flags & IR3_REG_R)
|
||||
reg->wrmask = MASK(instr->repeat + 1);
|
||||
else
|
||||
reg->wrmask = 1;
|
||||
}
|
||||
}
|
||||
if (reg->flags & IR3_REG_R)
|
||||
reg->wrmask = MASK(instr->repeat + 1);
|
||||
else
|
||||
reg->wrmask = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
main(int argc, char **argv)
|
||||
{
|
||||
struct ir3_compiler *c;
|
||||
int result = 0;
|
||||
struct ir3_compiler *c;
|
||||
int result = 0;
|
||||
|
||||
c = ir3_compiler_create(NULL, 630, false);
|
||||
c = ir3_compiler_create(NULL, 630, false);
|
||||
|
||||
for (int i = 0; i < ARRAY_SIZE(tests); i++) {
|
||||
const struct test *test = &tests[i];
|
||||
struct ir3_shader *shader = parse_asm(c, test->asmstr);
|
||||
struct ir3 *ir = shader->variants->ir;
|
||||
for (int i = 0; i < ARRAY_SIZE(tests); i++) {
|
||||
const struct test *test = &tests[i];
|
||||
struct ir3_shader *shader = parse_asm(c, test->asmstr);
|
||||
struct ir3 *ir = shader->variants->ir;
|
||||
|
||||
fixup_wrmask(ir);
|
||||
fixup_wrmask(ir);
|
||||
|
||||
ir3_debug_print(ir, "AFTER fixup_wrmask");
|
||||
ir3_debug_print(ir, "AFTER fixup_wrmask");
|
||||
|
||||
struct ir3_block *block =
|
||||
list_first_entry(&ir->block_list, struct ir3_block, node);
|
||||
struct ir3_instruction *last = NULL;
|
||||
struct ir3_block *block =
|
||||
list_first_entry(&ir->block_list, struct ir3_block, node);
|
||||
struct ir3_instruction *last = NULL;
|
||||
|
||||
foreach_instr_rev (instr, &block->instr_list) {
|
||||
if (is_meta(instr))
|
||||
continue;
|
||||
last = instr;
|
||||
break;
|
||||
}
|
||||
foreach_instr_rev (instr, &block->instr_list) {
|
||||
if (is_meta(instr))
|
||||
continue;
|
||||
last = instr;
|
||||
break;
|
||||
}
|
||||
|
||||
/* The delay calc is expecting the instr to not yet be added to the
|
||||
* block, so remove it from the block so that it doesn't get counted
|
||||
* in the distance from assigner:
|
||||
*/
|
||||
list_delinit(&last->node);
|
||||
/* The delay calc is expecting the instr to not yet be added to the
|
||||
* block, so remove it from the block so that it doesn't get counted
|
||||
* in the distance from assigner:
|
||||
*/
|
||||
list_delinit(&last->node);
|
||||
|
||||
unsigned n = ir3_delay_calc_exact(block, last, true);
|
||||
unsigned n = ir3_delay_calc_exact(block, last, true);
|
||||
|
||||
if (n != test->expected_delay) {
|
||||
printf("%d: FAIL: Expected delay %u, but got %u, for:\n%s\n",
|
||||
i, test->expected_delay, n, test->asmstr);
|
||||
result = -1;
|
||||
} else {
|
||||
printf("%d: PASS\n", i);
|
||||
}
|
||||
if (n != test->expected_delay) {
|
||||
printf("%d: FAIL: Expected delay %u, but got %u, for:\n%s\n", i,
|
||||
test->expected_delay, n, test->asmstr);
|
||||
result = -1;
|
||||
} else {
|
||||
printf("%d: PASS\n", i);
|
||||
}
|
||||
|
||||
ir3_shader_destroy(shader);
|
||||
}
|
||||
ir3_shader_destroy(shader);
|
||||
}
|
||||
|
||||
ir3_compiler_destroy(c);
|
||||
ir3_compiler_destroy(c);
|
||||
|
||||
return result;
|
||||
return result;
|
||||
}
|
||||
|
|
|
@ -48,15 +48,16 @@
|
|||
/* clang-format on */
|
||||
|
||||
static const struct test {
|
||||
int gpu_id;
|
||||
const char *instr;
|
||||
const char *expected;
|
||||
/**
|
||||
* Do we expect asm parse fail (ie. for things not (yet) supported by ir3_parser.y)
|
||||
*/
|
||||
bool parse_fail;
|
||||
int gpu_id;
|
||||
const char *instr;
|
||||
const char *expected;
|
||||
/**
|
||||
* Do we expect asm parse fail (ie. for things not (yet) supported by
|
||||
* ir3_parser.y)
|
||||
*/
|
||||
bool parse_fail;
|
||||
} tests[] = {
|
||||
/* clang-format off */
|
||||
/* clang-format off */
|
||||
/* cat0 */
|
||||
INSTR_6XX(00000000_00000000, "nop"),
|
||||
INSTR_6XX(00000200_00000000, "(rpt2)nop"),
|
||||
|
@ -351,128 +352,132 @@ static const struct test {
|
|||
INSTR_6XX(e0fa0000_00000000, "fence.g.l.r.w"),
|
||||
INSTR_6XX(e09a0000_00000000, "fence.r.w"),
|
||||
INSTR_6XX(f0420000_00000000, "(sy)bar.g"),
|
||||
/* clang-format on */
|
||||
/* clang-format on */
|
||||
};
|
||||
|
||||
static void
|
||||
trim(char *string)
|
||||
{
|
||||
for (int len = strlen(string); len > 0 && string[len - 1] == '\n'; len--)
|
||||
string[len - 1] = 0;
|
||||
for (int len = strlen(string); len > 0 && string[len - 1] == '\n'; len--)
|
||||
string[len - 1] = 0;
|
||||
}
|
||||
|
||||
int
|
||||
main(int argc, char **argv)
|
||||
{
|
||||
int retval = 0;
|
||||
int decode_fails = 0, asm_fails = 0, encode_fails = 0;
|
||||
const int output_size = 4096;
|
||||
char *disasm_output = malloc(output_size);
|
||||
FILE *fdisasm = fmemopen(disasm_output, output_size, "w+");
|
||||
if (!fdisasm) {
|
||||
fprintf(stderr, "failed to fmemopen\n");
|
||||
return 1;
|
||||
}
|
||||
int retval = 0;
|
||||
int decode_fails = 0, asm_fails = 0, encode_fails = 0;
|
||||
const int output_size = 4096;
|
||||
char *disasm_output = malloc(output_size);
|
||||
FILE *fdisasm = fmemopen(disasm_output, output_size, "w+");
|
||||
if (!fdisasm) {
|
||||
fprintf(stderr, "failed to fmemopen\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
struct ir3_compiler *compilers[10] = {};
|
||||
struct ir3_compiler *compilers[10] = {};
|
||||
|
||||
for (int i = 0; i < ARRAY_SIZE(tests); i++) {
|
||||
const struct test *test = &tests[i];
|
||||
printf("Testing a%d %s: \"%s\"...\n",
|
||||
test->gpu_id, test->instr, test->expected);
|
||||
for (int i = 0; i < ARRAY_SIZE(tests); i++) {
|
||||
const struct test *test = &tests[i];
|
||||
printf("Testing a%d %s: \"%s\"...\n", test->gpu_id, test->instr,
|
||||
test->expected);
|
||||
|
||||
rewind(fdisasm);
|
||||
memset(disasm_output, 0, output_size);
|
||||
rewind(fdisasm);
|
||||
memset(disasm_output, 0, output_size);
|
||||
|
||||
/*
|
||||
* Test disassembly:
|
||||
*/
|
||||
/*
|
||||
* Test disassembly:
|
||||
*/
|
||||
|
||||
uint32_t code[2] = {
|
||||
strtoll(&test->instr[9], NULL, 16),
|
||||
strtoll(&test->instr[0], NULL, 16),
|
||||
};
|
||||
isa_decode(code, 8, fdisasm, &(struct isa_decode_options){
|
||||
.gpu_id = test->gpu_id,
|
||||
.show_errors = true,
|
||||
});
|
||||
fflush(fdisasm);
|
||||
uint32_t code[2] = {
|
||||
strtoll(&test->instr[9], NULL, 16),
|
||||
strtoll(&test->instr[0], NULL, 16),
|
||||
};
|
||||
isa_decode(code, 8, fdisasm,
|
||||
&(struct isa_decode_options){
|
||||
.gpu_id = test->gpu_id,
|
||||
.show_errors = true,
|
||||
});
|
||||
fflush(fdisasm);
|
||||
|
||||
trim(disasm_output);
|
||||
trim(disasm_output);
|
||||
|
||||
if (strcmp(disasm_output, test->expected) != 0) {
|
||||
printf("FAIL: disasm\n");
|
||||
printf(" Expected: \"%s\"\n", test->expected);
|
||||
printf(" Got: \"%s\"\n", disasm_output);
|
||||
retval = 1;
|
||||
decode_fails++;
|
||||
continue;
|
||||
}
|
||||
if (strcmp(disasm_output, test->expected) != 0) {
|
||||
printf("FAIL: disasm\n");
|
||||
printf(" Expected: \"%s\"\n", test->expected);
|
||||
printf(" Got: \"%s\"\n", disasm_output);
|
||||
retval = 1;
|
||||
decode_fails++;
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* Test assembly, which should result in the identical binary:
|
||||
*/
|
||||
/*
|
||||
* Test assembly, which should result in the identical binary:
|
||||
*/
|
||||
|
||||
unsigned gen = test->gpu_id / 100;
|
||||
if (!compilers[gen]) {
|
||||
compilers[gen] = ir3_compiler_create(NULL, test->gpu_id, false);
|
||||
}
|
||||
unsigned gen = test->gpu_id / 100;
|
||||
if (!compilers[gen]) {
|
||||
compilers[gen] = ir3_compiler_create(NULL, test->gpu_id, false);
|
||||
}
|
||||
|
||||
FILE *fasm = fmemopen((void *)test->expected, strlen(test->expected), "r");
|
||||
FILE *fasm =
|
||||
fmemopen((void *)test->expected, strlen(test->expected), "r");
|
||||
|
||||
struct ir3_kernel_info info = {};
|
||||
struct ir3_shader *shader = ir3_parse_asm(compilers[gen], &info, fasm);
|
||||
fclose(fasm);
|
||||
if (!shader) {
|
||||
printf("FAIL: %sexpected assembler fail\n", test->parse_fail ? "" : "un");
|
||||
asm_fails++;
|
||||
/* If this is an instruction that the asm parser is not expected
|
||||
* to handle, don't count it as a fail.
|
||||
*/
|
||||
if (!test->parse_fail)
|
||||
retval = 1;
|
||||
continue;
|
||||
} else if (test->parse_fail) {
|
||||
/* If asm parse starts passing, and we don't expect that, flag
|
||||
* it as a fail so we don't forget to update the test vector:
|
||||
*/
|
||||
printf("FAIL: unexpected parse success, please remove '.parse_fail=true'\n");
|
||||
retval = 1;
|
||||
}
|
||||
struct ir3_kernel_info info = {};
|
||||
struct ir3_shader *shader = ir3_parse_asm(compilers[gen], &info, fasm);
|
||||
fclose(fasm);
|
||||
if (!shader) {
|
||||
printf("FAIL: %sexpected assembler fail\n",
|
||||
test->parse_fail ? "" : "un");
|
||||
asm_fails++;
|
||||
/* If this is an instruction that the asm parser is not expected
|
||||
* to handle, don't count it as a fail.
|
||||
*/
|
||||
if (!test->parse_fail)
|
||||
retval = 1;
|
||||
continue;
|
||||
} else if (test->parse_fail) {
|
||||
/* If asm parse starts passing, and we don't expect that, flag
|
||||
* it as a fail so we don't forget to update the test vector:
|
||||
*/
|
||||
printf(
|
||||
"FAIL: unexpected parse success, please remove '.parse_fail=true'\n");
|
||||
retval = 1;
|
||||
}
|
||||
|
||||
struct ir3_shader_variant *v = shader->variants;
|
||||
if (memcmp(v->bin, code, sizeof(code))) {
|
||||
printf("FAIL: assembler\n");
|
||||
printf(" Expected: %08x_%08x\n", code[1], code[0]);
|
||||
printf(" Got: %08x_%08x\n", v->bin[1], v->bin[0]);
|
||||
retval = 1;
|
||||
encode_fails++;
|
||||
}
|
||||
struct ir3_shader_variant *v = shader->variants;
|
||||
if (memcmp(v->bin, code, sizeof(code))) {
|
||||
printf("FAIL: assembler\n");
|
||||
printf(" Expected: %08x_%08x\n", code[1], code[0]);
|
||||
printf(" Got: %08x_%08x\n", v->bin[1], v->bin[0]);
|
||||
retval = 1;
|
||||
encode_fails++;
|
||||
}
|
||||
|
||||
ir3_shader_destroy(shader);
|
||||
}
|
||||
ir3_shader_destroy(shader);
|
||||
}
|
||||
|
||||
if (decode_fails)
|
||||
printf("%d/%d decode fails\n", decode_fails, (int)ARRAY_SIZE(tests));
|
||||
if (asm_fails)
|
||||
printf("%d/%d assembler fails\n", asm_fails, (int)ARRAY_SIZE(tests));
|
||||
if (encode_fails)
|
||||
printf("%d/%d encode fails\n", encode_fails, (int)ARRAY_SIZE(tests));
|
||||
if (decode_fails)
|
||||
printf("%d/%d decode fails\n", decode_fails, (int)ARRAY_SIZE(tests));
|
||||
if (asm_fails)
|
||||
printf("%d/%d assembler fails\n", asm_fails, (int)ARRAY_SIZE(tests));
|
||||
if (encode_fails)
|
||||
printf("%d/%d encode fails\n", encode_fails, (int)ARRAY_SIZE(tests));
|
||||
|
||||
if (retval) {
|
||||
printf("FAILED!\n");
|
||||
} else {
|
||||
printf("PASSED!\n");
|
||||
}
|
||||
if (retval) {
|
||||
printf("FAILED!\n");
|
||||
} else {
|
||||
printf("PASSED!\n");
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(compilers); i++) {
|
||||
if (!compilers[i])
|
||||
continue;
|
||||
ir3_compiler_destroy(compilers[i]);
|
||||
}
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(compilers); i++) {
|
||||
if (!compilers[i])
|
||||
continue;
|
||||
ir3_compiler_destroy(compilers[i]);
|
||||
}
|
||||
|
||||
fclose(fdisasm);
|
||||
free(disasm_output);
|
||||
fclose(fdisasm);
|
||||
free(disasm_output);
|
||||
|
||||
return retval;
|
||||
return retval;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue