i965/fs: Switch to using sampler LD messages for uniform pull constants.

When forcing the compiler to always generate pull constants instead of push constants (in order to have an easy-to-use testcase), this improves the performance of my old GLSL demo by 23.3553% +/- 1.42968% (n=7).

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=60866
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2013-03-06 14:47:22 -08:00 · 2013-03-06 14:47:22 -08:00 · 4c1fdae0a0
parent 1323772543
commit 4c1fdae0a0
4 changed files with 49 additions and 51 deletions
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@ -727,7 +727,7 @@ enum opcode {
   FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
   FS_OPCODE_MOV_DISPATCH_TO_FLAGS,
   FS_OPCODE_DISCARD_JUMP,
-   FS_OPCODE_SET_GLOBAL_OFFSET,
+   FS_OPCODE_SET_SIMD4X2_OFFSET,
   FS_OPCODE_PACK_HALF_2x16_SPLIT,
   FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X,
   FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y,
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@ -2461,6 +2461,11 @@ fs_visitor::insert_gen4_send_dependency_workarounds()
 * scheduling full flexibility, while the conversion to native instructions
 * allows the post-register-allocation scheduler the best information
 * possible.
 *
 * Note that execution masking for setting up pull constant loads is special:
 * the channels that need to be written are unrelated to the current execution
 * mask, since a later instruction will use one of the result channels as a
 * source operand for all 8 or 16 of its channels.
 */
 void
 fs_visitor::lower_uniform_pull_constant_loads()
@ -2477,26 +2482,24 @@ fs_visitor::lower_uniform_pull_constant_loads()
                const_offset_reg.type == BRW_REGISTER_TYPE_UD);
         const_offset_reg.imm.u /= 16;
         fs_reg payload = fs_reg(this, glsl_type::uint_type);
         struct brw_reg g0 = retype(brw_vec8_grf(0, 0),
                                    BRW_REGISTER_TYPE_UD);
-         fs_inst *setup1 = MOV(payload, fs_reg(g0));
+         /* This is actually going to be a MOV, but since only the first dword
-         setup1->force_writemask_all = true;
+          * is accessed, we have a special opcode to do just that one.  Note
-         /* We don't need the second half of this vgrf to be filled with g1
+          * that this needs to be an operation that will be considered a def
-          * in the 16-wide case, but if we use force_uncompressed then live
+          * by live variable analysis, or register allocation will explode.
          * variable analysis won't consider this a def!
          */
         fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
                                               payload, const_offset_reg);
         setup->force_writemask_all = true;
-         fs_inst *setup2 = new(mem_ctx) fs_inst(FS_OPCODE_SET_GLOBAL_OFFSET,
+         setup->ir = inst->ir;
-                                                payload, payload,
+         setup->annotation = inst->annotation;
-                                                const_offset_reg);
+         inst->insert_before(setup);
-         setup1->ir = inst->ir;
+         /* Similarly, this will only populate the first 4 channels of the
-         setup1->annotation = inst->annotation;
+          * result register (since we only use smear values from 0-3), but we
-         inst->insert_before(setup1);
+          * don't tell the optimizer.
-         setup2->ir = inst->ir;
+          */
         setup2->annotation = inst->annotation;
         inst->insert_before(setup2);
         inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
         inst->src[1] = payload;
@ -2533,7 +2536,7 @@ fs_visitor::dump_instruction(fs_inst *inst)
      case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
         printf("uniform_pull_const_gen7");
         break;
-      case FS_OPCODE_SET_GLOBAL_OFFSET:
+      case FS_OPCODE_SET_SIMD4X2_OFFSET:
         printf("set_global_offset");
         break;
      default:
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@ -546,10 +546,9 @@ private:
                                                 struct brw_reg index,
                                                 struct brw_reg offset);
   void generate_mov_dispatch_to_flags(fs_inst *inst);
-   void generate_set_global_offset(fs_inst *inst,
+   void generate_set_simd4x2_offset(fs_inst *inst,
-                                   struct brw_reg dst,
+                                    struct brw_reg dst,
-                                   struct brw_reg src,
+                                    struct brw_reg offset);
                                   struct brw_reg offset);
   void generate_discard_jump(fs_inst *inst);
   void generate_pack_half_2x16_split(fs_inst *inst,
--- a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
@ -647,6 +647,8 @@ fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
   uint32_t surf_index = index.dw1.ud;
   assert(offset.file == BRW_GENERAL_REGISTER_FILE);
   /* Reference just the dword we need, to avoid angering validate_reg(). */
   offset = brw_vec1_grf(offset.nr, 0);
   brw_push_insn_state(p);
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
@ -654,20 +656,22 @@ fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
   struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_pop_insn_state(p);
   /* We use the SIMD4x2 mode because we want to end up with 4 components in
    * the destination loaded consecutively from the same offset (which appears
    * in the first component, and the rest are ignored).
    */
   dst.width = BRW_WIDTH_4;
   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, offset);
-
+   brw_set_sampler_message(p, send,
   uint32_t msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
   uint32_t msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ;
   bool header_present = true;
   brw_set_dp_read_message(p, send,
                           surf_index,
-                           msg_control,
+                           0, /* LD message ignores sampler unit */
-                           msg_type,
+                           GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
-                           BRW_DATAPORT_READ_TARGET_DATA_CACHE,
+                           1, /* rlen */
-                           1,
+                           1, /* mlen */
-                           header_present,
+                           false, /* no header */
-                           1);
+                           BRW_SAMPLER_SIMD_MODE_SIMD4X2,
                           0);
 }
 void
@ -858,31 +862,23 @@ brw_reg_from_fs_reg(fs_reg *reg)
 }
 /**
- * Sets the second dword of a vgrf for gen7+ message setup.
+ * Sets the first dword of a vgrf for gen7+ simd4x2 uniform pull constant
 * sampler LD messages.
 *
- * For setting up gen7 messages in VGRFs, we need to be able to set the second
+ * We don't want to bake it into the send message's code generation because
- * dword for some payloads where in the MRF world we'd have just used
+ * that means we don't get a chance to schedule the instructions.
 * brw_message_reg().  We don't want to bake it into the send message's code
 * generation because that means we don't get a chance to schedule the
 * instructions.
 */
 void
-fs_generator::generate_set_global_offset(fs_inst *inst,
+fs_generator::generate_set_simd4x2_offset(fs_inst *inst,
-                                         struct brw_reg dst,
+                                          struct brw_reg dst,
-                                         struct brw_reg src,
+                                          struct brw_reg value)
                                         struct brw_reg value)
 {
   /* We use a matching src and dst to get the information on how this
    * instruction works exposed to various optimization passes that would
    * otherwise treat it as completely overwriting the dst.
    */
   assert(src.file == dst.file && src.nr == dst.nr);
   assert(value.file == BRW_IMMEDIATE_VALUE);
   brw_push_insn_state(p);
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
-   brw_MOV(p, retype(brw_vec1_reg(dst.file, dst.nr, 2), value.type), value);
+   brw_MOV(p, retype(brw_vec1_reg(dst.file, dst.nr, 0), value.type), value);
   brw_pop_insn_state(p);
 }
@ -1298,8 +1294,8 @@ fs_generator::generate_code(exec_list *instructions)
         brw_shader_time_add(p, inst->base_mrf, SURF_INDEX_WM_SHADER_TIME);
         break;
-      case FS_OPCODE_SET_GLOBAL_OFFSET:
+      case FS_OPCODE_SET_SIMD4X2_OFFSET:
-         generate_set_global_offset(inst, dst, src[0], src[1]);
+         generate_set_simd4x2_offset(inst, dst, src[0]);
         break;
      case FS_OPCODE_PACK_HALF_2x16_SPLIT: