diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index 8dba7706ebd..e585d382b23 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -4990,6 +4990,81 @@ get_lowered_simd_width(const struct brw_device_info *devinfo, } } +/** + * Extract the data that would be consumed by the channel group given by + * lbld.group() from the i-th source region of instruction \p inst and return + * it as result in packed form. If any copy instructions are required they + * will be emitted before the given \p inst in \p block. + */ +static fs_reg +emit_unzip(const fs_builder &lbld, bblock_t *block, fs_inst *inst, + unsigned i) +{ + /* Specified channel group from the source region. */ + const fs_reg src = horiz_offset(inst->src[i], lbld.group()); + + if (!is_periodic(inst->src[i], lbld.dispatch_width())) { + /* Builder of the right width to perform the copy avoiding uninitialized + * data if the lowered execution size is greater than the original + * execution size of the instruction. + */ + const fs_builder cbld = lbld.group(MIN2(lbld.dispatch_width(), + inst->exec_size), 0); + const fs_reg tmp = lbld.vgrf(inst->src[i].type, inst->components_read(i)); + + for (unsigned k = 0; k < inst->components_read(i); ++k) + cbld.at(block, inst) + .MOV(offset(tmp, lbld, k), offset(src, inst->exec_size, k)); + + return tmp; + + } else { + /* The source is invariant for all dispatch_width-wide groups of the + * original region. + */ + return inst->src[i]; + } +} + +/** + * Insert data from a packed temporary into the channel group given by + * lbld.group() of the destination region of instruction \p inst and return + * the temporary as result. If any copy instructions are required they will + * be emitted around the given \p inst in \p block. + */ +static fs_reg +emit_zip(const fs_builder &lbld, bblock_t *block, fs_inst *inst) +{ + /* Builder of the right width to perform the copy avoiding uninitialized + * data if the lowered execution size is greater than the original + * execution size of the instruction. + */ + const fs_builder cbld = lbld.group(MIN2(lbld.dispatch_width(), + inst->exec_size), 0); + + /* Specified channel group from the destination region. */ + const fs_reg dst = horiz_offset(inst->dst, lbld.group()); + const unsigned dst_size = inst->regs_written * REG_SIZE / + inst->dst.component_size(inst->exec_size); + const fs_reg tmp = lbld.vgrf(inst->dst.type, dst_size); + + if (inst->predicate) { + /* Handle predication by copying the original contents of the + * destination into the temporary before emitting the lowered + * instruction. + */ + for (unsigned k = 0; k < dst_size; ++k) + cbld.at(block, inst) + .MOV(offset(tmp, lbld, k), offset(dst, inst->exec_size, k)); + } + + for (unsigned k = 0; k < dst_size; ++k) + cbld.at(block, inst->next) + .MOV(offset(dst, inst->exec_size, k), offset(tmp, lbld, k)); + + return tmp; +} + bool fs_visitor::lower_simd_width() { @@ -5012,14 +5087,11 @@ fs_visitor::lower_simd_width() /* Split the copies in chunks of the execution width of either the * original or the lowered instruction, whichever is lower. */ - const unsigned copy_width = MIN2(lower_width, inst->exec_size); - const unsigned n = inst->exec_size / copy_width; + const unsigned n = DIV_ROUND_UP(inst->exec_size, lower_width); const unsigned dst_size = inst->regs_written * REG_SIZE / inst->dst.component_size(inst->exec_size); - fs_reg dsts[4]; - assert(n > 0 && n <= ARRAY_SIZE(dsts) && - !inst->writes_accumulator && !inst->mlen); + assert(!inst->writes_accumulator && !inst->mlen); for (unsigned i = 0; i < n; i++) { /* Emit a copy of the original instruction with the lowered width. @@ -5035,66 +5107,18 @@ fs_visitor::lower_simd_width() * instruction. */ const fs_builder lbld = ibld.group(lower_width, i); - const fs_builder cbld = lbld.group(copy_width, 0); - for (unsigned j = 0; j < inst->sources; j++) { - if (inst->src[j].file != BAD_FILE && - !is_periodic(inst->src[j], lower_width)) { - /* Get the i-th copy_width-wide chunk of the source. */ - const fs_reg src = offset(inst->src[j], cbld, i); - const unsigned src_size = inst->components_read(j); + for (unsigned j = 0; j < inst->sources; j++) + split_inst.src[j] = emit_unzip(lbld, block, inst, j); - /* Copy one every n copy_width-wide components of the - * register into a temporary passed as source to the lowered - * instruction. - */ - split_inst.src[j] = lbld.vgrf(inst->src[j].type, src_size); - - for (unsigned k = 0; k < src_size; ++k) - cbld.MOV(offset(split_inst.src[j], lbld, k), - offset(src, cbld, n * k)); - } - } - - if (inst->regs_written) { - /* Allocate enough space to hold the result of the lowered - * instruction and fix up the number of registers written. - */ - split_inst.dst = dsts[i] = - lbld.vgrf(inst->dst.type, dst_size); - split_inst.regs_written = - DIV_ROUND_UP(type_sz(inst->dst.type) * dst_size * lower_width, - REG_SIZE); - - if (inst->predicate) { - /* Handle predication by copying the original contents of - * the destination into the temporary before emitting the - * lowered instruction. - */ - for (unsigned k = 0; k < dst_size; ++k) - cbld.MOV(offset(split_inst.dst, lbld, k), - offset(inst->dst, cbld, n * k + i)); - } - } + split_inst.dst = emit_zip(lbld, block, inst); + split_inst.regs_written = + DIV_ROUND_UP(type_sz(inst->dst.type) * dst_size * lower_width, + REG_SIZE); lbld.emit(split_inst); } - if (inst->regs_written) { - const fs_builder lbld = ibld.group(lower_width, 0); - - /* Interleave the components of the result from the lowered - * instructions. - */ - for (unsigned i = 0; i < dst_size; ++i) { - for (unsigned j = 0; j < n; ++j) { - const fs_builder cbld = ibld.group(copy_width, j); - cbld.MOV(offset(inst->dst, cbld, n * i + j), - offset(dsts[j], lbld, i)); - } - } - } - inst->remove(block); progress = true; }