diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index b4528eb9999..d0a3bdd9264 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -358,28 +358,18 @@ fs_inst * fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources, int header_size) { + assert(dst.width % 8 == 0); + fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, dst.width, + dst, src, sources); + inst->header_size = header_size; + for (int i = 0; i < header_size; i++) assert(src[i].file != GRF || src[i].width * type_sz(src[i].type) == 32); + inst->regs_written = header_size; - uint8_t exec_size = dst.width; - for (int i = 0; i < sources; ++i) { - assert(src[i].width % dst.width == 0); - if (src[i].width > exec_size) - exec_size = src[i].width; - } - - fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size, - dst, src, sources); - inst->regs_written = 0; - for (int i = 0; i < sources; ++i) { - /* The LOAD_PAYLOAD instruction only really makes sense if we are - * dealing with whole registers. If this ever changes, we can deal - * with it later. - */ - int size = inst->src[i].effective_width * type_sz(src[i].type); - assert(size % 32 == 0); - inst->regs_written += (size + 31) / 32; - } + for (int i = header_size; i < sources; ++i) + assert(src[i].file != GRF || src[i].width == dst.width); + inst->regs_written += (sources - header_size) * (dst.width / 8); return inst; } @@ -537,9 +527,13 @@ fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const if (grf_alloc.sizes[reg.reg] != this->regs_written) return false; - for (int i = 1; i < this->sources; i++) - if (!this->src[i].equals(::offset(reg, i))) + for (int i = 0; i < this->sources; i++) { + reg.type = this->src[i].type; + reg.width = this->src[i].width; + if (!this->src[i].equals(reg)) return false; + reg = ::offset(reg, 1); + } return true; } @@ -3429,99 +3423,108 @@ fs_visitor::lower_load_payload() { bool progress = false; - int vgrf_to_reg[alloc.count]; - int reg_count = 0; - for (unsigned i = 0; i < alloc.count; ++i) { - vgrf_to_reg[i] = reg_count; - reg_count += alloc.sizes[i]; - } - - struct { - bool written:1; /* Whether this register has ever been written */ - bool force_writemask_all:1; - bool force_sechalf:1; - } metadata[reg_count]; - memset(metadata, 0, sizeof(metadata)); - foreach_block_and_inst_safe (block, fs_inst, inst, cfg) { - if (inst->dst.file == GRF) { - const int dst_reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset; - bool force_sechalf = inst->force_sechalf && - !inst->force_writemask_all; - bool toggle_sechalf = inst->dst.width == 16 && - type_sz(inst->dst.type) == 4 && - !inst->force_writemask_all; - for (int i = 0; i < inst->regs_written; ++i) { - metadata[dst_reg + i].written = true; - metadata[dst_reg + i].force_sechalf = force_sechalf; - metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all; - force_sechalf = (toggle_sechalf != force_sechalf); + if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD) + continue; + + assert(inst->dst.file == MRF || inst->dst.file == GRF); + assert(inst->saturate == false); + + fs_reg dst = inst->dst; + + /* Get rid of COMPR4. We'll add it back in if we need it */ + if (dst.file == MRF) + dst.reg = dst.reg & ~BRW_MRF_COMPR4; + + dst.width = 8; + for (uint8_t i = 0; i < inst->header_size; i++) { + if (inst->src[i].file != BAD_FILE) { + fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD); + fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD); + mov_src.width = 8; + fs_inst *mov = MOV(mov_dst, mov_src); + mov->force_writemask_all = true; + inst->insert_before(block, mov); } + dst = offset(dst, 1); } - if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) { - assert(inst->dst.file == MRF || inst->dst.file == GRF); - fs_reg dst = inst->dst; + dst.width = inst->exec_size; + if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) && + inst->exec_size > 8) { + /* In this case, the payload portion of the LOAD_PAYLOAD isn't + * a straightforward copy. Instead, the result of the + * LOAD_PAYLOAD is treated as interleaved and the first four + * non-header sources are unpacked as: + * + * m + 0: r0 + * m + 1: g0 + * m + 2: b0 + * m + 3: a0 + * m + 4: r1 + * m + 5: g1 + * m + 6: b1 + * m + 7: a1 + * + * This is used for gen <= 5 fb writes. + */ + assert(inst->exec_size == 16); + assert(inst->header_size + 4 <= inst->sources); + for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) { + if (inst->src[i].file != BAD_FILE) { + if (devinfo->has_compr4) { + fs_reg compr4_dst = retype(dst, inst->src[i].type); + compr4_dst.reg |= BRW_MRF_COMPR4; - for (int i = 0; i < inst->sources; i++) { - dst.width = inst->src[i].effective_width; - dst.type = inst->src[i].type; - - if (inst->src[i].file == BAD_FILE) { - /* Do nothing but otherwise increment as normal */ - } else if (dst.file == MRF && - dst.width == 8 && - devinfo->has_compr4 && - i + 4 < inst->sources && - inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) { - fs_reg compr4_dst = dst; - compr4_dst.reg += BRW_MRF_COMPR4; - compr4_dst.width = 16; - fs_reg compr4_src = inst->src[i]; - compr4_src.width = 16; - fs_inst *mov = MOV(compr4_dst, compr4_src); - mov->force_writemask_all = true; - inst->insert_before(block, mov); - /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */ - inst->src[i + 4].file = BAD_FILE; - } else { - fs_inst *mov = MOV(dst, inst->src[i]); - if (inst->src[i].file == GRF) { - int src_reg = vgrf_to_reg[inst->src[i].reg] + - inst->src[i].reg_offset; - mov->force_sechalf = metadata[src_reg].force_sechalf; - mov->force_writemask_all = metadata[src_reg].force_writemask_all; + fs_inst *mov = MOV(compr4_dst, inst->src[i]); + mov->force_writemask_all = inst->force_writemask_all; + inst->insert_before(block, mov); } else { - /* We don't have any useful metadata for immediates or - * uniforms. Assume that any of the channels of the - * destination may be used. - */ - assert(inst->src[i].file == IMM || - inst->src[i].file == UNIFORM); - mov->force_writemask_all = true; - } + /* Platform doesn't have COMPR4. We have to fake it */ + fs_reg mov_dst = retype(dst, inst->src[i].type); + mov_dst.width = 8; - if (dst.file == GRF) { - const int dst_reg = vgrf_to_reg[dst.reg] + dst.reg_offset; - const bool force_writemask = mov->force_writemask_all; - metadata[dst_reg].force_writemask_all = force_writemask; - metadata[dst_reg].force_sechalf = mov->force_sechalf; - if (dst.width * type_sz(dst.type) > 32) { - assert(!mov->force_sechalf); - metadata[dst_reg + 1].force_writemask_all = force_writemask; - metadata[dst_reg + 1].force_sechalf = !force_writemask; - } - } + fs_inst *mov = MOV(mov_dst, half(inst->src[i], 0)); + mov->force_writemask_all = inst->force_writemask_all; + inst->insert_before(block, mov); - inst->insert_before(block, mov); + mov = MOV(offset(mov_dst, 4), half(inst->src[i], 1)); + mov->force_writemask_all = inst->force_writemask_all; + mov->force_sechalf = true; + inst->insert_before(block, mov); + } } - dst = offset(dst, 1); + dst.reg++; } - inst->remove(block); - progress = true; + /* The loop above only ever incremented us through the first set + * of 4 registers. However, thanks to the magic of COMPR4, we + * actually wrote to the first 8 registers, so we need to take + * that into account now. + */ + dst.reg += 4; + + /* The COMPR4 code took care of the first 4 sources. We'll let + * the regular path handle any remaining sources. Yes, we are + * modifying the instruction but we're about to delete it so + * this really doesn't hurt anything. + */ + inst->header_size += 4; } + + for (uint8_t i = inst->header_size; i < inst->sources; i++) { + if (inst->src[i].file != BAD_FILE) { + fs_inst *mov = MOV(retype(dst, inst->src[i].type), + inst->src[i]); + mov->force_writemask_all = inst->force_writemask_all; + inst->insert_before(block, mov); + } + dst = offset(dst, 1); + } + + inst->remove(block); + progress = true; } if (progress) diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index 30cefe4f2a8..1d7de2effbd 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -394,8 +394,8 @@ public: bool optimize_frontfacing_ternary(nir_alu_instr *instr, const fs_reg &result); - int setup_color_payload(fs_reg *dst, fs_reg color, unsigned components, - bool use_2nd_half); + void setup_color_payload(fs_reg *dst, fs_reg color, unsigned components, + unsigned exec_size, bool use_2nd_half); void emit_alpha_test(); fs_inst *emit_single_fb_write(fs_reg color1, fs_reg color2, fs_reg src0_alpha, unsigned components, diff --git a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp index a582e6a8e4b..db01f8cf7ab 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp @@ -174,17 +174,34 @@ create_copy_instr(fs_visitor *v, fs_inst *inst, fs_reg src, bool negate) { int written = inst->regs_written; int dst_width = inst->dst.width / 8; - fs_reg dst = inst->dst; fs_inst *copy; if (written > dst_width) { - fs_reg *sources = ralloc_array(v->mem_ctx, fs_reg, written / dst_width); - for (int i = 0; i < written / dst_width; i++) - sources[i] = offset(src, i); - copy = v->LOAD_PAYLOAD(dst, sources, written / dst_width, - inst->header_size); + fs_reg *payload; + int sources, header_size; + if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) { + sources = inst->sources; + header_size = inst->header_size; + } else { + assert(written % dst_width == 0); + sources = written / dst_width; + header_size = 0; + } + + assert(src.file == GRF); + payload = ralloc_array(v->mem_ctx, fs_reg, sources); + for (int i = 0; i < header_size; i++) { + payload[i] = src; + payload[i].width = 8; + src.reg_offset++; + } + for (int i = header_size; i < sources; i++) { + payload[i] = src; + src = offset(src, 1); + } + copy = v->LOAD_PAYLOAD(inst->dst, payload, sources, header_size); } else { - copy = v->MOV(dst, src); + copy = v->MOV(inst->dst, src); copy->force_writemask_all = inst->force_writemask_all; copy->src[0].negate = negate; } diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp index 813df22c9c7..80ca1b750f8 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp @@ -2002,7 +2002,7 @@ fs_visitor::emit_texture_gen7(ir_texture_opcode op, fs_reg dst, mlen = length * reg_width; fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen), - BRW_REGISTER_TYPE_F); + BRW_REGISTER_TYPE_F, dispatch_width); emit(LOAD_PAYLOAD(src_payload, sources, length, header_size)); /* Generate the SEND */ @@ -2159,7 +2159,7 @@ fs_visitor::emit_mcs_fetch(fs_reg coordinate, int components, fs_reg sampler) { int reg_width = dispatch_width / 8; fs_reg payload = fs_reg(GRF, alloc.allocate(components * reg_width), - BRW_REGISTER_TYPE_F); + BRW_REGISTER_TYPE_F, dispatch_width); fs_reg dest = vgrf(glsl_type::uvec4_type); fs_reg *sources = ralloc_array(mem_ctx, fs_reg, components); @@ -3295,7 +3295,7 @@ fs_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index, int mlen = 1 + (length - 1) * reg_width; fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen), - BRW_REGISTER_TYPE_UD); + BRW_REGISTER_TYPE_UD, dispatch_width); emit(LOAD_PAYLOAD(src_payload, sources, length, 1)); /* Emit the instruction. */ @@ -3343,7 +3343,7 @@ fs_visitor::emit_untyped_surface_read(unsigned surf_index, fs_reg dst, int mlen = 1 + reg_width; fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen), - BRW_REGISTER_TYPE_UD); + BRW_REGISTER_TYPE_UD, dispatch_width); fs_inst *inst = emit(LOAD_PAYLOAD(src_payload, sources, 2, 1)); /* Emit the instruction. */ @@ -3558,108 +3558,30 @@ fs_visitor::emit_interpolation_setup_gen6() this->current_annotation = NULL; } -int +void fs_visitor::setup_color_payload(fs_reg *dst, fs_reg color, unsigned components, - bool use_2nd_half) + unsigned exec_size, bool use_2nd_half) { brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; fs_inst *inst; - if (color.file == BAD_FILE) { - return 4 * (dispatch_width / 8); + if (key->clamp_fragment_color) { + fs_reg tmp = vgrf(glsl_type::vec4_type); + assert(color.type == BRW_REGISTER_TYPE_F); + for (unsigned i = 0; i < components; i++) { + inst = emit(MOV(offset(tmp, i), offset(color, i))); + inst->saturate = true; + } + color = tmp; } - uint8_t colors_enabled; - if (components == 0) { - /* We want to write one component to the alpha channel */ - colors_enabled = 0x8; + if (exec_size < dispatch_width) { + unsigned half_idx = use_2nd_half ? 1 : 0; + for (unsigned i = 0; i < components; i++) + dst[i] = half(offset(color, i), half_idx); } else { - /* Enable the first components-many channels */ - colors_enabled = (1 << components) - 1; - } - - if (dispatch_width == 8 || (devinfo->gen >= 6 && !do_dual_src)) { - /* SIMD8 write looks like: - * m + 0: r0 - * m + 1: r1 - * m + 2: g0 - * m + 3: g1 - * - * gen6 SIMD16 DP write looks like: - * m + 0: r0 - * m + 1: r1 - * m + 2: g0 - * m + 3: g1 - * m + 4: b0 - * m + 5: b1 - * m + 6: a0 - * m + 7: a1 - */ - int len = 0; - for (unsigned i = 0; i < 4; ++i) { - if (colors_enabled & (1 << i)) { - dst[len] = fs_reg(GRF, alloc.allocate(color.width / 8), - color.type, color.width); - inst = emit(MOV(dst[len], offset(color, i))); - inst->saturate = key->clamp_fragment_color; - } else if (color.width == 16) { - /* We need two BAD_FILE slots for a 16-wide color */ - len++; - } - len++; - } - return len; - } else if (devinfo->gen >= 6 && do_dual_src) { - /* SIMD16 dual source blending for gen6+. - * - * From the SNB PRM, volume 4, part 1, page 193: - * - * "The dual source render target messages only have SIMD8 forms due to - * maximum message length limitations. SIMD16 pixel shaders must send two - * of these messages to cover all of the pixels. Each message contains - * two colors (4 channels each) for each pixel in the message payload." - * - * So in SIMD16 dual source blending we will send 2 SIMD8 messages, - * each one will call this function twice (one for each color involved), - * so in each pass we only write 4 registers. Notice that the second - * SIMD8 message needs to read color data from the 2nd half of the color - * registers, so it needs to call this with use_2nd_half = true. - */ - for (unsigned i = 0; i < 4; ++i) { - if (colors_enabled & (1 << i)) { - dst[i] = fs_reg(GRF, alloc.allocate(1), color.type); - inst = emit(MOV(dst[i], half(offset(color, i), - use_2nd_half ? 1 : 0))); - inst->saturate = key->clamp_fragment_color; - if (use_2nd_half) - inst->force_sechalf = true; - } - } - return 4; - } else { - /* pre-gen6 SIMD16 single source DP write looks like: - * m + 0: r0 - * m + 1: g0 - * m + 2: b0 - * m + 3: a0 - * m + 4: r1 - * m + 5: g1 - * m + 6: b1 - * m + 7: a1 - */ - for (unsigned i = 0; i < 4; ++i) { - if (colors_enabled & (1 << i)) { - dst[i] = fs_reg(GRF, alloc.allocate(1), color.type); - inst = emit(MOV(dst[i], half(offset(color, i), 0))); - inst->saturate = key->clamp_fragment_color; - - dst[i + 4] = fs_reg(GRF, alloc.allocate(1), color.type); - inst = emit(MOV(dst[i + 4], half(offset(color, i), 1))); - inst->saturate = key->clamp_fragment_color; - inst->force_sechalf = true; - } - } - return 8; + for (unsigned i = 0; i < components; i++) + dst[i] = offset(color, i); } } @@ -3728,7 +3650,6 @@ fs_visitor::emit_single_fb_write(fs_reg color0, fs_reg color1, this->current_annotation = "FB write header"; int header_size = 2, payload_header_size; - int reg_size = exec_size / 8; /* We can potentially have a message length of up to 15, so we have to set * base_mrf to either 0 or 1 in order to fit in m0..m15. @@ -3784,24 +3705,26 @@ fs_visitor::emit_single_fb_write(fs_reg color0, fs_reg color1, * alpha out the pipeline to our null renderbuffer to support * alpha-testing, alpha-to-coverage, and so on. */ - length += setup_color_payload(sources + length, this->outputs[0], 0, - false); + if (this->outputs[0].file != BAD_FILE) + setup_color_payload(&sources[length + 3], offset(this->outputs[0], 3), + 1, exec_size, false); + length += 4; } else if (color1.file == BAD_FILE) { if (src0_alpha.file != BAD_FILE) { - sources[length] = fs_reg(GRF, alloc.allocate(reg_size), - src0_alpha.type, src0_alpha.width); - fs_inst *inst = emit(MOV(sources[length], src0_alpha)); - inst->saturate = key->clamp_fragment_color; + setup_color_payload(&sources[length], src0_alpha, 1, exec_size, false); length++; } - length += setup_color_payload(sources + length, color0, components, - false); + setup_color_payload(&sources[length], color0, components, + exec_size, use_2nd_half); + length += 4; } else { - length += setup_color_payload(sources + length, color0, components, - use_2nd_half); - length += setup_color_payload(sources + length, color1, components, - use_2nd_half); + setup_color_payload(&sources[length], color0, components, + exec_size, use_2nd_half); + length += 4; + setup_color_payload(&sources[length], color1, components, + exec_size, use_2nd_half); + length += 4; } if (source_depth_to_render_target) { @@ -3814,41 +3737,41 @@ fs_visitor::emit_single_fb_write(fs_reg color0, fs_reg color1, no16("Missing support for simd16 depth writes on gen6\n"); } - sources[length] = vgrf(glsl_type::float_type); if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) { /* Hand over gl_FragDepth. */ assert(this->frag_depth.file != BAD_FILE); - emit(MOV(sources[length], this->frag_depth)); + sources[length] = this->frag_depth; } else { /* Pass through the payload depth. */ - emit(MOV(sources[length], - fs_reg(brw_vec8_grf(payload.source_depth_reg, 0)))); + sources[length] = fs_reg(brw_vec8_grf(payload.source_depth_reg, 0)); } length++; } - if (payload.dest_depth_reg) { - sources[length] = vgrf(glsl_type::float_type); - emit(MOV(sources[length], - fs_reg(brw_vec8_grf(payload.dest_depth_reg, 0)))); - length++; - } + if (payload.dest_depth_reg) + sources[length++] = fs_reg(brw_vec8_grf(payload.dest_depth_reg, 0)); fs_inst *load; fs_inst *write; if (devinfo->gen >= 7) { /* Send from the GRF */ - fs_reg payload = fs_reg(GRF, -1, BRW_REGISTER_TYPE_F); + fs_reg payload = fs_reg(GRF, -1, BRW_REGISTER_TYPE_F, exec_size); load = emit(LOAD_PAYLOAD(payload, sources, length, payload_header_size)); payload.reg = alloc.allocate(load->regs_written); - payload.width = dispatch_width; load->dst = payload; write = emit(FS_OPCODE_FB_WRITE, reg_undef, payload); write->base_mrf = -1; } else { /* Send from the MRF */ - load = emit(LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F), + load = emit(LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F, exec_size), sources, length, payload_header_size)); + + /* On pre-SNB, we have to interlace the color values. LOAD_PAYLOAD + * will do this for us if we just give it a COMPR4 destination. + */ + if (brw->gen < 6 && exec_size == 16) + load->dst.reg |= BRW_MRF_COMPR4; + write = emit(FS_OPCODE_FB_WRITE); write->exec_size = exec_size; write->base_mrf = 1; @@ -4137,7 +4060,7 @@ fs_visitor::emit_urb_writes() if (flush) { fs_reg *payload_sources = ralloc_array(mem_ctx, fs_reg, length + 1); fs_reg payload = fs_reg(GRF, alloc.allocate(length + 1), - BRW_REGISTER_TYPE_F); + BRW_REGISTER_TYPE_F, dispatch_width); /* We need WE_all on the MOV for the message header (the URB handles) * so do a MOV to a dummy register and set force_writemask_all on the