i965/fs: Rework the fs_visitor LOAD_PAYLOAD instruction
The newly reworked instruction is far more straightforward than the original. Before, the LOAD_PAYLOAD instruction was lowered by a complicated and broken-by-design pile of heuristics to try and guess force_writemask_all, exec_size, and a number of other factors on the sources. Instead, we use the header_size on the instruction to denote which sources are "header sources". Header sources are required to be a single physical hardware register that is copied verbatim. The registers that follow are considered the actual payload registers and have a width that corresponds to the LOAD_PAYLOAD's exec_size and are treated as being per-channel. This gives us a fairly straightforward lowering: 1) All header sources are copied directly using force_writemask_all and, since they are guaranteed to be a single register, there are no force_sechalf issues. 2) All non-header sources are copied using the exact same force_sechalf and force_writemask_all modifiers as the LOAD_PAYLOAD operation itself. 3) In order to accommodate older gens that need interleaved colors, lower_load_payload detects when the destination is a COMPR4 register and automatically interleaves the non-header sources. The lower_load_payload pass does the right thing here regardless of whether or not the hardware actually supports COMPR4. This commit itself is made up of a bunch of smaller changes squashed together. Individual change descriptions follow: i965/fs: Rework fs_visitor::LOAD_PAYLOAD We rework LOAD_PAYLOAD to verify that all of the sources that count as headers are, indeed, exactly one register and that all of the non-header sources match the destination width. We then take the exec_size for LOAD_PAYLOAD directly from the destination width. 
i965/fs: Make destinations of load_payload have the appropriate width i965/fs: Rework fs_visitor::lower_load_payload v2: Don't allow the saturate flag on LOAD_PAYLOAD instructions i965/fs_cse: Support the new-style LOAD_PAYLOAD i965/fs_inst::is_copy_payload: Support the new-style LOAD_PAYLOAD i965/fs: Simplify setup_color_payload Previously, setup_color_payload was a big helper function that did a lot of gen-specific special casing for setting up the color sources of the LOAD_PAYLOAD instruction. Now that lower_load_payload is much more sane, most of that complexity isn't needed anymore. Instead, we can do a simple fixup pass for color clamps and then just stash sources directly in the LOAD_PAYLOAD. We can trust lower_load_payload to do the right thing with respect to COMPR4. Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
This commit is contained in:
parent
94ee908448
commit
41868bb682
|
@ -358,28 +358,18 @@ fs_inst *
|
||||||
fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources,
|
fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources,
|
||||||
int header_size)
|
int header_size)
|
||||||
{
|
{
|
||||||
|
assert(dst.width % 8 == 0);
|
||||||
|
fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, dst.width,
|
||||||
|
dst, src, sources);
|
||||||
|
inst->header_size = header_size;
|
||||||
|
|
||||||
for (int i = 0; i < header_size; i++)
|
for (int i = 0; i < header_size; i++)
|
||||||
assert(src[i].file != GRF || src[i].width * type_sz(src[i].type) == 32);
|
assert(src[i].file != GRF || src[i].width * type_sz(src[i].type) == 32);
|
||||||
|
inst->regs_written = header_size;
|
||||||
|
|
||||||
uint8_t exec_size = dst.width;
|
for (int i = header_size; i < sources; ++i)
|
||||||
for (int i = 0; i < sources; ++i) {
|
assert(src[i].file != GRF || src[i].width == dst.width);
|
||||||
assert(src[i].width % dst.width == 0);
|
inst->regs_written += (sources - header_size) * (dst.width / 8);
|
||||||
if (src[i].width > exec_size)
|
|
||||||
exec_size = src[i].width;
|
|
||||||
}
|
|
||||||
|
|
||||||
fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
|
|
||||||
dst, src, sources);
|
|
||||||
inst->regs_written = 0;
|
|
||||||
for (int i = 0; i < sources; ++i) {
|
|
||||||
/* The LOAD_PAYLOAD instruction only really makes sense if we are
|
|
||||||
* dealing with whole registers. If this ever changes, we can deal
|
|
||||||
* with it later.
|
|
||||||
*/
|
|
||||||
int size = inst->src[i].effective_width * type_sz(src[i].type);
|
|
||||||
assert(size % 32 == 0);
|
|
||||||
inst->regs_written += (size + 31) / 32;
|
|
||||||
}
|
|
||||||
|
|
||||||
return inst;
|
return inst;
|
||||||
}
|
}
|
||||||
|
@ -537,9 +527,13 @@ fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
|
||||||
if (grf_alloc.sizes[reg.reg] != this->regs_written)
|
if (grf_alloc.sizes[reg.reg] != this->regs_written)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
for (int i = 1; i < this->sources; i++)
|
for (int i = 0; i < this->sources; i++) {
|
||||||
if (!this->src[i].equals(::offset(reg, i)))
|
reg.type = this->src[i].type;
|
||||||
|
reg.width = this->src[i].width;
|
||||||
|
if (!this->src[i].equals(reg))
|
||||||
return false;
|
return false;
|
||||||
|
reg = ::offset(reg, 1);
|
||||||
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -3429,100 +3423,109 @@ fs_visitor::lower_load_payload()
|
||||||
{
|
{
|
||||||
bool progress = false;
|
bool progress = false;
|
||||||
|
|
||||||
int vgrf_to_reg[alloc.count];
|
|
||||||
int reg_count = 0;
|
|
||||||
for (unsigned i = 0; i < alloc.count; ++i) {
|
|
||||||
vgrf_to_reg[i] = reg_count;
|
|
||||||
reg_count += alloc.sizes[i];
|
|
||||||
}
|
|
||||||
|
|
||||||
struct {
|
|
||||||
bool written:1; /* Whether this register has ever been written */
|
|
||||||
bool force_writemask_all:1;
|
|
||||||
bool force_sechalf:1;
|
|
||||||
} metadata[reg_count];
|
|
||||||
memset(metadata, 0, sizeof(metadata));
|
|
||||||
|
|
||||||
foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
|
foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
|
||||||
if (inst->dst.file == GRF) {
|
if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
|
||||||
const int dst_reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
|
continue;
|
||||||
bool force_sechalf = inst->force_sechalf &&
|
|
||||||
!inst->force_writemask_all;
|
|
||||||
bool toggle_sechalf = inst->dst.width == 16 &&
|
|
||||||
type_sz(inst->dst.type) == 4 &&
|
|
||||||
!inst->force_writemask_all;
|
|
||||||
for (int i = 0; i < inst->regs_written; ++i) {
|
|
||||||
metadata[dst_reg + i].written = true;
|
|
||||||
metadata[dst_reg + i].force_sechalf = force_sechalf;
|
|
||||||
metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
|
|
||||||
force_sechalf = (toggle_sechalf != force_sechalf);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
|
|
||||||
assert(inst->dst.file == MRF || inst->dst.file == GRF);
|
assert(inst->dst.file == MRF || inst->dst.file == GRF);
|
||||||
|
assert(inst->saturate == false);
|
||||||
|
|
||||||
fs_reg dst = inst->dst;
|
fs_reg dst = inst->dst;
|
||||||
|
|
||||||
for (int i = 0; i < inst->sources; i++) {
|
/* Get rid of COMPR4. We'll add it back in if we need it */
|
||||||
dst.width = inst->src[i].effective_width;
|
if (dst.file == MRF)
|
||||||
dst.type = inst->src[i].type;
|
dst.reg = dst.reg & ~BRW_MRF_COMPR4;
|
||||||
|
|
||||||
if (inst->src[i].file == BAD_FILE) {
|
dst.width = 8;
|
||||||
/* Do nothing but otherwise increment as normal */
|
for (uint8_t i = 0; i < inst->header_size; i++) {
|
||||||
} else if (dst.file == MRF &&
|
if (inst->src[i].file != BAD_FILE) {
|
||||||
dst.width == 8 &&
|
fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
|
||||||
devinfo->has_compr4 &&
|
fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
|
||||||
i + 4 < inst->sources &&
|
mov_src.width = 8;
|
||||||
inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
|
fs_inst *mov = MOV(mov_dst, mov_src);
|
||||||
fs_reg compr4_dst = dst;
|
|
||||||
compr4_dst.reg += BRW_MRF_COMPR4;
|
|
||||||
compr4_dst.width = 16;
|
|
||||||
fs_reg compr4_src = inst->src[i];
|
|
||||||
compr4_src.width = 16;
|
|
||||||
fs_inst *mov = MOV(compr4_dst, compr4_src);
|
|
||||||
mov->force_writemask_all = true;
|
mov->force_writemask_all = true;
|
||||||
inst->insert_before(block, mov);
|
inst->insert_before(block, mov);
|
||||||
/* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
|
}
|
||||||
inst->src[i + 4].file = BAD_FILE;
|
dst = offset(dst, 1);
|
||||||
} else {
|
}
|
||||||
fs_inst *mov = MOV(dst, inst->src[i]);
|
|
||||||
if (inst->src[i].file == GRF) {
|
dst.width = inst->exec_size;
|
||||||
int src_reg = vgrf_to_reg[inst->src[i].reg] +
|
if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
|
||||||
inst->src[i].reg_offset;
|
inst->exec_size > 8) {
|
||||||
mov->force_sechalf = metadata[src_reg].force_sechalf;
|
/* In this case, the payload portion of the LOAD_PAYLOAD isn't
|
||||||
mov->force_writemask_all = metadata[src_reg].force_writemask_all;
|
* a straightforward copy. Instead, the result of the
|
||||||
} else {
|
* LOAD_PAYLOAD is treated as interleaved and the first four
|
||||||
/* We don't have any useful metadata for immediates or
|
* non-header sources are unpacked as:
|
||||||
* uniforms. Assume that any of the channels of the
|
*
|
||||||
* destination may be used.
|
* m + 0: r0
|
||||||
|
* m + 1: g0
|
||||||
|
* m + 2: b0
|
||||||
|
* m + 3: a0
|
||||||
|
* m + 4: r1
|
||||||
|
* m + 5: g1
|
||||||
|
* m + 6: b1
|
||||||
|
* m + 7: a1
|
||||||
|
*
|
||||||
|
* This is used for gen <= 5 fb writes.
|
||||||
*/
|
*/
|
||||||
assert(inst->src[i].file == IMM ||
|
assert(inst->exec_size == 16);
|
||||||
inst->src[i].file == UNIFORM);
|
assert(inst->header_size + 4 <= inst->sources);
|
||||||
mov->force_writemask_all = true;
|
for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
|
||||||
}
|
if (inst->src[i].file != BAD_FILE) {
|
||||||
|
if (devinfo->has_compr4) {
|
||||||
|
fs_reg compr4_dst = retype(dst, inst->src[i].type);
|
||||||
|
compr4_dst.reg |= BRW_MRF_COMPR4;
|
||||||
|
|
||||||
if (dst.file == GRF) {
|
fs_inst *mov = MOV(compr4_dst, inst->src[i]);
|
||||||
const int dst_reg = vgrf_to_reg[dst.reg] + dst.reg_offset;
|
mov->force_writemask_all = inst->force_writemask_all;
|
||||||
const bool force_writemask = mov->force_writemask_all;
|
inst->insert_before(block, mov);
|
||||||
metadata[dst_reg].force_writemask_all = force_writemask;
|
} else {
|
||||||
metadata[dst_reg].force_sechalf = mov->force_sechalf;
|
/* Platform doesn't have COMPR4. We have to fake it */
|
||||||
if (dst.width * type_sz(dst.type) > 32) {
|
fs_reg mov_dst = retype(dst, inst->src[i].type);
|
||||||
assert(!mov->force_sechalf);
|
mov_dst.width = 8;
|
||||||
metadata[dst_reg + 1].force_writemask_all = force_writemask;
|
|
||||||
metadata[dst_reg + 1].force_sechalf = !force_writemask;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
fs_inst *mov = MOV(mov_dst, half(inst->src[i], 0));
|
||||||
|
mov->force_writemask_all = inst->force_writemask_all;
|
||||||
|
inst->insert_before(block, mov);
|
||||||
|
|
||||||
|
mov = MOV(offset(mov_dst, 4), half(inst->src[i], 1));
|
||||||
|
mov->force_writemask_all = inst->force_writemask_all;
|
||||||
|
mov->force_sechalf = true;
|
||||||
inst->insert_before(block, mov);
|
inst->insert_before(block, mov);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
dst.reg++;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* The loop above only ever incremented us through the first set
|
||||||
|
* of 4 registers. However, thanks to the magic of COMPR4, we
|
||||||
|
* actually wrote to the first 8 registers, so we need to take
|
||||||
|
* that into account now.
|
||||||
|
*/
|
||||||
|
dst.reg += 4;
|
||||||
|
|
||||||
|
/* The COMPR4 code took care of the first 4 sources. We'll let
|
||||||
|
* the regular path handle any remaining sources. Yes, we are
|
||||||
|
* modifying the instruction but we're about to delete it so
|
||||||
|
* this really doesn't hurt anything.
|
||||||
|
*/
|
||||||
|
inst->header_size += 4;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (uint8_t i = inst->header_size; i < inst->sources; i++) {
|
||||||
|
if (inst->src[i].file != BAD_FILE) {
|
||||||
|
fs_inst *mov = MOV(retype(dst, inst->src[i].type),
|
||||||
|
inst->src[i]);
|
||||||
|
mov->force_writemask_all = inst->force_writemask_all;
|
||||||
|
inst->insert_before(block, mov);
|
||||||
|
}
|
||||||
dst = offset(dst, 1);
|
dst = offset(dst, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
inst->remove(block);
|
inst->remove(block);
|
||||||
progress = true;
|
progress = true;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
if (progress)
|
if (progress)
|
||||||
invalidate_live_intervals();
|
invalidate_live_intervals();
|
||||||
|
|
|
@ -394,8 +394,8 @@ public:
|
||||||
bool optimize_frontfacing_ternary(nir_alu_instr *instr,
|
bool optimize_frontfacing_ternary(nir_alu_instr *instr,
|
||||||
const fs_reg &result);
|
const fs_reg &result);
|
||||||
|
|
||||||
int setup_color_payload(fs_reg *dst, fs_reg color, unsigned components,
|
void setup_color_payload(fs_reg *dst, fs_reg color, unsigned components,
|
||||||
bool use_2nd_half);
|
unsigned exec_size, bool use_2nd_half);
|
||||||
void emit_alpha_test();
|
void emit_alpha_test();
|
||||||
fs_inst *emit_single_fb_write(fs_reg color1, fs_reg color2,
|
fs_inst *emit_single_fb_write(fs_reg color1, fs_reg color2,
|
||||||
fs_reg src0_alpha, unsigned components,
|
fs_reg src0_alpha, unsigned components,
|
||||||
|
|
|
@ -174,17 +174,34 @@ create_copy_instr(fs_visitor *v, fs_inst *inst, fs_reg src, bool negate)
|
||||||
{
|
{
|
||||||
int written = inst->regs_written;
|
int written = inst->regs_written;
|
||||||
int dst_width = inst->dst.width / 8;
|
int dst_width = inst->dst.width / 8;
|
||||||
fs_reg dst = inst->dst;
|
|
||||||
fs_inst *copy;
|
fs_inst *copy;
|
||||||
|
|
||||||
if (written > dst_width) {
|
if (written > dst_width) {
|
||||||
fs_reg *sources = ralloc_array(v->mem_ctx, fs_reg, written / dst_width);
|
fs_reg *payload;
|
||||||
for (int i = 0; i < written / dst_width; i++)
|
int sources, header_size;
|
||||||
sources[i] = offset(src, i);
|
if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
|
||||||
copy = v->LOAD_PAYLOAD(dst, sources, written / dst_width,
|
sources = inst->sources;
|
||||||
inst->header_size);
|
header_size = inst->header_size;
|
||||||
} else {
|
} else {
|
||||||
copy = v->MOV(dst, src);
|
assert(written % dst_width == 0);
|
||||||
|
sources = written / dst_width;
|
||||||
|
header_size = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
assert(src.file == GRF);
|
||||||
|
payload = ralloc_array(v->mem_ctx, fs_reg, sources);
|
||||||
|
for (int i = 0; i < header_size; i++) {
|
||||||
|
payload[i] = src;
|
||||||
|
payload[i].width = 8;
|
||||||
|
src.reg_offset++;
|
||||||
|
}
|
||||||
|
for (int i = header_size; i < sources; i++) {
|
||||||
|
payload[i] = src;
|
||||||
|
src = offset(src, 1);
|
||||||
|
}
|
||||||
|
copy = v->LOAD_PAYLOAD(inst->dst, payload, sources, header_size);
|
||||||
|
} else {
|
||||||
|
copy = v->MOV(inst->dst, src);
|
||||||
copy->force_writemask_all = inst->force_writemask_all;
|
copy->force_writemask_all = inst->force_writemask_all;
|
||||||
copy->src[0].negate = negate;
|
copy->src[0].negate = negate;
|
||||||
}
|
}
|
||||||
|
|
|
@ -2002,7 +2002,7 @@ fs_visitor::emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
|
||||||
mlen = length * reg_width;
|
mlen = length * reg_width;
|
||||||
|
|
||||||
fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
|
fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
|
||||||
BRW_REGISTER_TYPE_F);
|
BRW_REGISTER_TYPE_F, dispatch_width);
|
||||||
emit(LOAD_PAYLOAD(src_payload, sources, length, header_size));
|
emit(LOAD_PAYLOAD(src_payload, sources, length, header_size));
|
||||||
|
|
||||||
/* Generate the SEND */
|
/* Generate the SEND */
|
||||||
|
@ -2159,7 +2159,7 @@ fs_visitor::emit_mcs_fetch(fs_reg coordinate, int components, fs_reg sampler)
|
||||||
{
|
{
|
||||||
int reg_width = dispatch_width / 8;
|
int reg_width = dispatch_width / 8;
|
||||||
fs_reg payload = fs_reg(GRF, alloc.allocate(components * reg_width),
|
fs_reg payload = fs_reg(GRF, alloc.allocate(components * reg_width),
|
||||||
BRW_REGISTER_TYPE_F);
|
BRW_REGISTER_TYPE_F, dispatch_width);
|
||||||
fs_reg dest = vgrf(glsl_type::uvec4_type);
|
fs_reg dest = vgrf(glsl_type::uvec4_type);
|
||||||
fs_reg *sources = ralloc_array(mem_ctx, fs_reg, components);
|
fs_reg *sources = ralloc_array(mem_ctx, fs_reg, components);
|
||||||
|
|
||||||
|
@ -3295,7 +3295,7 @@ fs_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
|
||||||
|
|
||||||
int mlen = 1 + (length - 1) * reg_width;
|
int mlen = 1 + (length - 1) * reg_width;
|
||||||
fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
|
fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
|
||||||
BRW_REGISTER_TYPE_UD);
|
BRW_REGISTER_TYPE_UD, dispatch_width);
|
||||||
emit(LOAD_PAYLOAD(src_payload, sources, length, 1));
|
emit(LOAD_PAYLOAD(src_payload, sources, length, 1));
|
||||||
|
|
||||||
/* Emit the instruction. */
|
/* Emit the instruction. */
|
||||||
|
@ -3343,7 +3343,7 @@ fs_visitor::emit_untyped_surface_read(unsigned surf_index, fs_reg dst,
|
||||||
|
|
||||||
int mlen = 1 + reg_width;
|
int mlen = 1 + reg_width;
|
||||||
fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
|
fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
|
||||||
BRW_REGISTER_TYPE_UD);
|
BRW_REGISTER_TYPE_UD, dispatch_width);
|
||||||
fs_inst *inst = emit(LOAD_PAYLOAD(src_payload, sources, 2, 1));
|
fs_inst *inst = emit(LOAD_PAYLOAD(src_payload, sources, 2, 1));
|
||||||
|
|
||||||
/* Emit the instruction. */
|
/* Emit the instruction. */
|
||||||
|
@ -3558,108 +3558,30 @@ fs_visitor::emit_interpolation_setup_gen6()
|
||||||
this->current_annotation = NULL;
|
this->current_annotation = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
int
|
void
|
||||||
fs_visitor::setup_color_payload(fs_reg *dst, fs_reg color, unsigned components,
|
fs_visitor::setup_color_payload(fs_reg *dst, fs_reg color, unsigned components,
|
||||||
bool use_2nd_half)
|
unsigned exec_size, bool use_2nd_half)
|
||||||
{
|
{
|
||||||
brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
|
brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
|
||||||
fs_inst *inst;
|
fs_inst *inst;
|
||||||
|
|
||||||
if (color.file == BAD_FILE) {
|
if (key->clamp_fragment_color) {
|
||||||
return 4 * (dispatch_width / 8);
|
fs_reg tmp = vgrf(glsl_type::vec4_type);
|
||||||
|
assert(color.type == BRW_REGISTER_TYPE_F);
|
||||||
|
for (unsigned i = 0; i < components; i++) {
|
||||||
|
inst = emit(MOV(offset(tmp, i), offset(color, i)));
|
||||||
|
inst->saturate = true;
|
||||||
|
}
|
||||||
|
color = tmp;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint8_t colors_enabled;
|
if (exec_size < dispatch_width) {
|
||||||
if (components == 0) {
|
unsigned half_idx = use_2nd_half ? 1 : 0;
|
||||||
/* We want to write one component to the alpha channel */
|
for (unsigned i = 0; i < components; i++)
|
||||||
colors_enabled = 0x8;
|
dst[i] = half(offset(color, i), half_idx);
|
||||||
} else {
|
} else {
|
||||||
/* Enable the first components-many channels */
|
for (unsigned i = 0; i < components; i++)
|
||||||
colors_enabled = (1 << components) - 1;
|
dst[i] = offset(color, i);
|
||||||
}
|
|
||||||
|
|
||||||
if (dispatch_width == 8 || (devinfo->gen >= 6 && !do_dual_src)) {
|
|
||||||
/* SIMD8 write looks like:
|
|
||||||
* m + 0: r0
|
|
||||||
* m + 1: r1
|
|
||||||
* m + 2: g0
|
|
||||||
* m + 3: g1
|
|
||||||
*
|
|
||||||
* gen6 SIMD16 DP write looks like:
|
|
||||||
* m + 0: r0
|
|
||||||
* m + 1: r1
|
|
||||||
* m + 2: g0
|
|
||||||
* m + 3: g1
|
|
||||||
* m + 4: b0
|
|
||||||
* m + 5: b1
|
|
||||||
* m + 6: a0
|
|
||||||
* m + 7: a1
|
|
||||||
*/
|
|
||||||
int len = 0;
|
|
||||||
for (unsigned i = 0; i < 4; ++i) {
|
|
||||||
if (colors_enabled & (1 << i)) {
|
|
||||||
dst[len] = fs_reg(GRF, alloc.allocate(color.width / 8),
|
|
||||||
color.type, color.width);
|
|
||||||
inst = emit(MOV(dst[len], offset(color, i)));
|
|
||||||
inst->saturate = key->clamp_fragment_color;
|
|
||||||
} else if (color.width == 16) {
|
|
||||||
/* We need two BAD_FILE slots for a 16-wide color */
|
|
||||||
len++;
|
|
||||||
}
|
|
||||||
len++;
|
|
||||||
}
|
|
||||||
return len;
|
|
||||||
} else if (devinfo->gen >= 6 && do_dual_src) {
|
|
||||||
/* SIMD16 dual source blending for gen6+.
|
|
||||||
*
|
|
||||||
* From the SNB PRM, volume 4, part 1, page 193:
|
|
||||||
*
|
|
||||||
* "The dual source render target messages only have SIMD8 forms due to
|
|
||||||
* maximum message length limitations. SIMD16 pixel shaders must send two
|
|
||||||
* of these messages to cover all of the pixels. Each message contains
|
|
||||||
* two colors (4 channels each) for each pixel in the message payload."
|
|
||||||
*
|
|
||||||
* So in SIMD16 dual source blending we will send 2 SIMD8 messages,
|
|
||||||
* each one will call this function twice (one for each color involved),
|
|
||||||
* so in each pass we only write 4 registers. Notice that the second
|
|
||||||
* SIMD8 message needs to read color data from the 2nd half of the color
|
|
||||||
* registers, so it needs to call this with use_2nd_half = true.
|
|
||||||
*/
|
|
||||||
for (unsigned i = 0; i < 4; ++i) {
|
|
||||||
if (colors_enabled & (1 << i)) {
|
|
||||||
dst[i] = fs_reg(GRF, alloc.allocate(1), color.type);
|
|
||||||
inst = emit(MOV(dst[i], half(offset(color, i),
|
|
||||||
use_2nd_half ? 1 : 0)));
|
|
||||||
inst->saturate = key->clamp_fragment_color;
|
|
||||||
if (use_2nd_half)
|
|
||||||
inst->force_sechalf = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return 4;
|
|
||||||
} else {
|
|
||||||
/* pre-gen6 SIMD16 single source DP write looks like:
|
|
||||||
* m + 0: r0
|
|
||||||
* m + 1: g0
|
|
||||||
* m + 2: b0
|
|
||||||
* m + 3: a0
|
|
||||||
* m + 4: r1
|
|
||||||
* m + 5: g1
|
|
||||||
* m + 6: b1
|
|
||||||
* m + 7: a1
|
|
||||||
*/
|
|
||||||
for (unsigned i = 0; i < 4; ++i) {
|
|
||||||
if (colors_enabled & (1 << i)) {
|
|
||||||
dst[i] = fs_reg(GRF, alloc.allocate(1), color.type);
|
|
||||||
inst = emit(MOV(dst[i], half(offset(color, i), 0)));
|
|
||||||
inst->saturate = key->clamp_fragment_color;
|
|
||||||
|
|
||||||
dst[i + 4] = fs_reg(GRF, alloc.allocate(1), color.type);
|
|
||||||
inst = emit(MOV(dst[i + 4], half(offset(color, i), 1)));
|
|
||||||
inst->saturate = key->clamp_fragment_color;
|
|
||||||
inst->force_sechalf = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return 8;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -3728,7 +3650,6 @@ fs_visitor::emit_single_fb_write(fs_reg color0, fs_reg color1,
|
||||||
|
|
||||||
this->current_annotation = "FB write header";
|
this->current_annotation = "FB write header";
|
||||||
int header_size = 2, payload_header_size;
|
int header_size = 2, payload_header_size;
|
||||||
int reg_size = exec_size / 8;
|
|
||||||
|
|
||||||
/* We can potentially have a message length of up to 15, so we have to set
|
/* We can potentially have a message length of up to 15, so we have to set
|
||||||
* base_mrf to either 0 or 1 in order to fit in m0..m15.
|
* base_mrf to either 0 or 1 in order to fit in m0..m15.
|
||||||
|
@ -3784,24 +3705,26 @@ fs_visitor::emit_single_fb_write(fs_reg color0, fs_reg color1,
|
||||||
* alpha out the pipeline to our null renderbuffer to support
|
* alpha out the pipeline to our null renderbuffer to support
|
||||||
* alpha-testing, alpha-to-coverage, and so on.
|
* alpha-testing, alpha-to-coverage, and so on.
|
||||||
*/
|
*/
|
||||||
length += setup_color_payload(sources + length, this->outputs[0], 0,
|
if (this->outputs[0].file != BAD_FILE)
|
||||||
false);
|
setup_color_payload(&sources[length + 3], offset(this->outputs[0], 3),
|
||||||
|
1, exec_size, false);
|
||||||
|
length += 4;
|
||||||
} else if (color1.file == BAD_FILE) {
|
} else if (color1.file == BAD_FILE) {
|
||||||
if (src0_alpha.file != BAD_FILE) {
|
if (src0_alpha.file != BAD_FILE) {
|
||||||
sources[length] = fs_reg(GRF, alloc.allocate(reg_size),
|
setup_color_payload(&sources[length], src0_alpha, 1, exec_size, false);
|
||||||
src0_alpha.type, src0_alpha.width);
|
|
||||||
fs_inst *inst = emit(MOV(sources[length], src0_alpha));
|
|
||||||
inst->saturate = key->clamp_fragment_color;
|
|
||||||
length++;
|
length++;
|
||||||
}
|
}
|
||||||
|
|
||||||
length += setup_color_payload(sources + length, color0, components,
|
setup_color_payload(&sources[length], color0, components,
|
||||||
false);
|
exec_size, use_2nd_half);
|
||||||
|
length += 4;
|
||||||
} else {
|
} else {
|
||||||
length += setup_color_payload(sources + length, color0, components,
|
setup_color_payload(&sources[length], color0, components,
|
||||||
use_2nd_half);
|
exec_size, use_2nd_half);
|
||||||
length += setup_color_payload(sources + length, color1, components,
|
length += 4;
|
||||||
use_2nd_half);
|
setup_color_payload(&sources[length], color1, components,
|
||||||
|
exec_size, use_2nd_half);
|
||||||
|
length += 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (source_depth_to_render_target) {
|
if (source_depth_to_render_target) {
|
||||||
|
@ -3814,41 +3737,41 @@ fs_visitor::emit_single_fb_write(fs_reg color0, fs_reg color1,
|
||||||
no16("Missing support for simd16 depth writes on gen6\n");
|
no16("Missing support for simd16 depth writes on gen6\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
sources[length] = vgrf(glsl_type::float_type);
|
|
||||||
if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
|
if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
|
||||||
/* Hand over gl_FragDepth. */
|
/* Hand over gl_FragDepth. */
|
||||||
assert(this->frag_depth.file != BAD_FILE);
|
assert(this->frag_depth.file != BAD_FILE);
|
||||||
emit(MOV(sources[length], this->frag_depth));
|
sources[length] = this->frag_depth;
|
||||||
} else {
|
} else {
|
||||||
/* Pass through the payload depth. */
|
/* Pass through the payload depth. */
|
||||||
emit(MOV(sources[length],
|
sources[length] = fs_reg(brw_vec8_grf(payload.source_depth_reg, 0));
|
||||||
fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
|
|
||||||
}
|
}
|
||||||
length++;
|
length++;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (payload.dest_depth_reg) {
|
if (payload.dest_depth_reg)
|
||||||
sources[length] = vgrf(glsl_type::float_type);
|
sources[length++] = fs_reg(brw_vec8_grf(payload.dest_depth_reg, 0));
|
||||||
emit(MOV(sources[length],
|
|
||||||
fs_reg(brw_vec8_grf(payload.dest_depth_reg, 0))));
|
|
||||||
length++;
|
|
||||||
}
|
|
||||||
|
|
||||||
fs_inst *load;
|
fs_inst *load;
|
||||||
fs_inst *write;
|
fs_inst *write;
|
||||||
if (devinfo->gen >= 7) {
|
if (devinfo->gen >= 7) {
|
||||||
/* Send from the GRF */
|
/* Send from the GRF */
|
||||||
fs_reg payload = fs_reg(GRF, -1, BRW_REGISTER_TYPE_F);
|
fs_reg payload = fs_reg(GRF, -1, BRW_REGISTER_TYPE_F, exec_size);
|
||||||
load = emit(LOAD_PAYLOAD(payload, sources, length, payload_header_size));
|
load = emit(LOAD_PAYLOAD(payload, sources, length, payload_header_size));
|
||||||
payload.reg = alloc.allocate(load->regs_written);
|
payload.reg = alloc.allocate(load->regs_written);
|
||||||
payload.width = dispatch_width;
|
|
||||||
load->dst = payload;
|
load->dst = payload;
|
||||||
write = emit(FS_OPCODE_FB_WRITE, reg_undef, payload);
|
write = emit(FS_OPCODE_FB_WRITE, reg_undef, payload);
|
||||||
write->base_mrf = -1;
|
write->base_mrf = -1;
|
||||||
} else {
|
} else {
|
||||||
/* Send from the MRF */
|
/* Send from the MRF */
|
||||||
load = emit(LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F),
|
load = emit(LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F, exec_size),
|
||||||
sources, length, payload_header_size));
|
sources, length, payload_header_size));
|
||||||
|
|
||||||
|
/* On pre-SNB, we have to interlace the color values. LOAD_PAYLOAD
|
||||||
|
* will do this for us if we just give it a COMPR4 destination.
|
||||||
|
*/
|
||||||
|
if (brw->gen < 6 && exec_size == 16)
|
||||||
|
load->dst.reg |= BRW_MRF_COMPR4;
|
||||||
|
|
||||||
write = emit(FS_OPCODE_FB_WRITE);
|
write = emit(FS_OPCODE_FB_WRITE);
|
||||||
write->exec_size = exec_size;
|
write->exec_size = exec_size;
|
||||||
write->base_mrf = 1;
|
write->base_mrf = 1;
|
||||||
|
@ -4137,7 +4060,7 @@ fs_visitor::emit_urb_writes()
|
||||||
if (flush) {
|
if (flush) {
|
||||||
fs_reg *payload_sources = ralloc_array(mem_ctx, fs_reg, length + 1);
|
fs_reg *payload_sources = ralloc_array(mem_ctx, fs_reg, length + 1);
|
||||||
fs_reg payload = fs_reg(GRF, alloc.allocate(length + 1),
|
fs_reg payload = fs_reg(GRF, alloc.allocate(length + 1),
|
||||||
BRW_REGISTER_TYPE_F);
|
BRW_REGISTER_TYPE_F, dispatch_width);
|
||||||
|
|
||||||
/* We need WE_all on the MOV for the message header (the URB handles)
|
/* We need WE_all on the MOV for the message header (the URB handles)
|
||||||
* so do a MOV to a dummy register and set force_writemask_all on the
|
* so do a MOV to a dummy register and set force_writemask_all on the
|
||||||
|
|
Loading…
Reference in New Issue