i965: Optimize batchbuffer macros.

Previously OUT_BATCH was just a macro around an inline function which
does

   brw->batch.map[brw->batch.used++] = dword;

When making consecutive calls to intel_batchbuffer_emit_dword() the
compiler isn't able to recognize that we're writing consecutive memory
locations or that it doesn't need to write batch.used back to memory
each time.

We can avoid both of these problems by making a local pointer to the
next location in the batch in BEGIN_BATCH().

Cuts 18k from the .text size.

   text     data      bss      dec      hex  filename
4946956   195152    26192  5168300   4edcac  i965_dri.so before
4928956   195152    26192  5150300   4e965c  i965_dri.so after

This series (including commit c0433948) improves performance of Synmark
OglBatch7 by 8.01389% +/- 0.63922% (n=83) on Ivybridge.

Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
This commit is contained in:
Matt Turner 2015-07-08 19:00:48 -07:00
parent 131573df7a
commit f11c6f09cf
6 changed files with 71 additions and 43 deletions

View File

@@ -873,7 +873,8 @@ struct intel_batchbuffer {
#ifdef DEBUG
uint16_t emit, total;
#endif
uint16_t used, reserved_space;
uint16_t reserved_space;
uint32_t *map_next;
uint32_t *map;
uint32_t *cpu_map;
#define BATCH_SZ (8192*sizeof(uint32_t))
@@ -883,7 +884,7 @@ struct intel_batchbuffer {
bool needs_sol_reset;
struct {
uint16_t used;
uint32_t *map_next;
int reloc_count;
} saved;
};

View File

@@ -604,14 +604,15 @@ brw_prepare_shader_draw_parameters(struct brw_context *brw)
/**
* Emit a VERTEX_BUFFER_STATE entry (part of 3DSTATE_VERTEX_BUFFERS).
*/
static void
static uint32_t *
emit_vertex_buffer_state(struct brw_context *brw,
unsigned buffer_nr,
drm_intel_bo *bo,
unsigned bo_ending_address,
unsigned bo_offset,
unsigned stride,
unsigned step_rate)
unsigned step_rate,
uint32_t *__map)
{
struct gl_context *ctx = &brw->ctx;
uint32_t dw0;
@@ -643,7 +644,10 @@ emit_vertex_buffer_state(struct brw_context *brw,
OUT_BATCH(0);
}
OUT_BATCH(step_rate);
return __map;
}
#define EMIT_VERTEX_BUFFER_STATE(...) __map = emit_vertex_buffer_state(__VA_ARGS__, __map)
static void brw_emit_vertices(struct brw_context *brw)
{
@@ -704,14 +708,14 @@ static void brw_emit_vertices(struct brw_context *brw)
OUT_BATCH((_3DSTATE_VERTEX_BUFFERS << 16) | (4 * nr_buffers - 1));
for (i = 0; i < brw->vb.nr_buffers; i++) {
struct brw_vertex_buffer *buffer = &brw->vb.buffers[i];
emit_vertex_buffer_state(brw, i, buffer->bo, buffer->bo->size - 1,
EMIT_VERTEX_BUFFER_STATE(brw, i, buffer->bo, buffer->bo->size - 1,
buffer->offset, buffer->stride,
buffer->step_rate);
}
if (brw->vs.prog_data->uses_vertexid) {
emit_vertex_buffer_state(brw, brw->vb.nr_buffers,
EMIT_VERTEX_BUFFER_STATE(brw, brw->vb.nr_buffers,
brw->draw.draw_params_bo,
brw->draw.draw_params_bo->size - 1,
brw->draw.draw_params_offset,

View File

@@ -252,7 +252,7 @@ void brw_upload_urb_fence(struct brw_context *brw)
if ((USED_BATCH(brw->batch) & 15) > 12) {
int pad = 16 - (USED_BATCH(brw->batch) & 15);
do
brw->batch.map[brw->batch.used++] = MI_NOOP;
*brw->batch.map_next++ = MI_NOOP;
while (--pad);
}

View File

@@ -48,6 +48,7 @@ intel_batchbuffer_init(struct brw_context *brw)
if (!brw->has_llc) {
brw->batch.cpu_map = malloc(BATCH_SZ);
brw->batch.map = brw->batch.cpu_map;
brw->batch.map_next = brw->batch.cpu_map;
}
}
@@ -68,10 +69,10 @@ intel_batchbuffer_reset(struct brw_context *brw)
drm_intel_bo_map(brw->batch.bo, true);
brw->batch.map = brw->batch.bo->virtual;
}
brw->batch.map_next = brw->batch.map;
brw->batch.reserved_space = BATCH_RESERVED;
brw->batch.state_batch_offset = brw->batch.bo->size;
brw->batch.used = 0;
brw->batch.needs_sol_reset = false;
/* We don't know what ring the new batch will be sent to until we see the
@@ -83,7 +84,7 @@ intel_batchbuffer_reset(struct brw_context *brw)
void
intel_batchbuffer_save_state(struct brw_context *brw)
{
brw->batch.saved.used = brw->batch.used;
brw->batch.saved.map_next = brw->batch.map_next;
brw->batch.saved.reloc_count =
drm_intel_gem_bo_get_reloc_count(brw->batch.bo);
}
@@ -93,7 +94,7 @@ intel_batchbuffer_reset_to_saved(struct brw_context *brw)
{
drm_intel_gem_bo_clear_relocs(brw->batch.bo, brw->batch.saved.reloc_count);
brw->batch.used = brw->batch.saved.used;
brw->batch.map_next = brw->batch.saved.map_next;
if (USED_BATCH(brw->batch) == 0)
brw->batch.ring = UNKNOWN_RING;
}
@@ -395,13 +396,13 @@ _intel_batchbuffer_flush(struct brw_context *brw,
*/
uint32_t
intel_batchbuffer_reloc(struct brw_context *brw,
drm_intel_bo *buffer,
drm_intel_bo *buffer, uint32_t offset,
uint32_t read_domains, uint32_t write_domain,
uint32_t delta)
{
int ret;
ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
ret = drm_intel_bo_emit_reloc(brw->batch.bo, offset,
buffer, delta,
read_domains, write_domain);
assert(ret == 0);
@@ -416,11 +417,11 @@ intel_batchbuffer_reloc(struct brw_context *brw,
uint64_t
intel_batchbuffer_reloc64(struct brw_context *brw,
drm_intel_bo *buffer,
drm_intel_bo *buffer, uint32_t offset,
uint32_t read_domains, uint32_t write_domain,
uint32_t delta)
{
int ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
int ret = drm_intel_bo_emit_reloc(brw->batch.bo, offset,
buffer, delta,
read_domains, write_domain);
assert(ret == 0);
@@ -440,8 +441,8 @@ intel_batchbuffer_data(struct brw_context *brw,
{
assert((bytes & 3) == 0);
intel_batchbuffer_require_space(brw, bytes, ring);
memcpy(brw->batch.map + brw->batch.used, data, bytes);
brw->batch.used += bytes >> 2;
memcpy(brw->batch.map_next, data, bytes);
brw->batch.map_next += bytes >> 2;
}
static void

View File

@@ -59,16 +59,18 @@ void intel_batchbuffer_data(struct brw_context *brw,
uint32_t intel_batchbuffer_reloc(struct brw_context *brw,
drm_intel_bo *buffer,
uint32_t offset,
uint32_t read_domains,
uint32_t write_domain,
uint32_t offset);
uint32_t delta);
uint64_t intel_batchbuffer_reloc64(struct brw_context *brw,
drm_intel_bo *buffer,
uint32_t offset,
uint32_t read_domains,
uint32_t write_domain,
uint32_t offset);
uint32_t delta);
#define USED_BATCH(batch) ((batch).used)
#define USED_BATCH(batch) ((uintptr_t)((batch).map_next - (batch).map))
static inline uint32_t float_as_int(float f)
{
@@ -100,7 +102,7 @@ intel_batchbuffer_emit_dword(struct brw_context *brw, GLuint dword)
#ifdef DEBUG
assert(intel_batchbuffer_space(brw) >= 4);
#endif
brw->batch.map[brw->batch.used++] = dword;
*brw->batch.map_next++ = dword;
assert(brw->batch.ring != UNKNOWN_RING);
}
@@ -163,23 +165,42 @@ intel_batchbuffer_advance(struct brw_context *brw)
#endif
}
#define BEGIN_BATCH(n) intel_batchbuffer_begin(brw, n, RENDER_RING)
#define BEGIN_BATCH_BLT(n) intel_batchbuffer_begin(brw, n, BLT_RING)
#define OUT_BATCH(d) intel_batchbuffer_emit_dword(brw, d)
#define OUT_BATCH_F(f) intel_batchbuffer_emit_float(brw, f)
#define OUT_RELOC(buf, read_domains, write_domain, delta) \
OUT_BATCH(intel_batchbuffer_reloc(brw, buf, read_domains, write_domain, \
delta))
#define BEGIN_BATCH(n) do { \
intel_batchbuffer_begin(brw, (n), RENDER_RING); \
uint32_t *__map = brw->batch.map_next; \
brw->batch.map_next += (n)
/* Handle 48-bit address relocations for Gen8+ */
#define OUT_RELOC64(buf, read_domains, write_domain, delta) do { \
uint64_t reloc64 = intel_batchbuffer_reloc64(brw, buf, read_domains, \
write_domain, delta); \
OUT_BATCH(reloc64); \
OUT_BATCH(reloc64 >> 32); \
#define BEGIN_BATCH_BLT(n) do { \
intel_batchbuffer_begin(brw, (n), BLT_RING); \
uint32_t *__map = brw->batch.map_next; \
brw->batch.map_next += (n)
#define OUT_BATCH(d) *__map++ = (d)
#define OUT_BATCH_F(f) OUT_BATCH(float_as_int((f)))
#define OUT_RELOC(buf, read_domains, write_domain, delta) do { \
uint32_t __offset = (__map - brw->batch.map) * 4; \
OUT_BATCH(intel_batchbuffer_reloc(brw, (buf), __offset, \
(read_domains), \
(write_domain), \
(delta))); \
} while (0)
#define ADVANCE_BATCH() intel_batchbuffer_advance(brw);
/* Handle 48-bit address relocations for Gen8+ */
#define OUT_RELOC64(buf, read_domains, write_domain, delta) do { \
uint32_t __offset = (__map - brw->batch.map) * 4; \
uint64_t reloc64 = intel_batchbuffer_reloc64(brw, (buf), __offset, \
(read_domains), \
(write_domain), \
(delta)); \
OUT_BATCH(reloc64); \
OUT_BATCH(reloc64 >> 32); \
} while (0)
#define ADVANCE_BATCH() \
assert(__map == brw->batch.map_next); \
intel_batchbuffer_advance(brw); \
} while (0)
#ifdef __cplusplus
}

View File

@@ -176,9 +176,10 @@ get_tr_vertical_align(uint32_t tr_mode, uint32_t cpp, bool is_src) {
* tiling state would leak into other unsuspecting applications (like the X
* server).
*/
static void
static uint32_t *
set_blitter_tiling(struct brw_context *brw,
bool dst_y_tiled, bool src_y_tiled)
bool dst_y_tiled, bool src_y_tiled,
uint32_t *__map)
{
assert(brw->gen >= 6);
@@ -193,19 +194,19 @@ set_blitter_tiling(struct brw_context *brw,
OUT_BATCH((BCS_SWCTRL_DST_Y | BCS_SWCTRL_SRC_Y) << 16 |
(dst_y_tiled ? BCS_SWCTRL_DST_Y : 0) |
(src_y_tiled ? BCS_SWCTRL_SRC_Y : 0));
return __map;
}
#define SET_BLITTER_TILING(...) __map = set_blitter_tiling(__VA_ARGS__, __map)
#define BEGIN_BATCH_BLT_TILED(n, dst_y_tiled, src_y_tiled) do { \
#define BEGIN_BATCH_BLT_TILED(n, dst_y_tiled, src_y_tiled) \
BEGIN_BATCH_BLT(n + ((dst_y_tiled || src_y_tiled) ? 14 : 0)); \
if (dst_y_tiled || src_y_tiled) \
set_blitter_tiling(brw, dst_y_tiled, src_y_tiled); \
} while (0)
SET_BLITTER_TILING(brw, dst_y_tiled, src_y_tiled)
#define ADVANCE_BATCH_TILED(dst_y_tiled, src_y_tiled) do { \
#define ADVANCE_BATCH_TILED(dst_y_tiled, src_y_tiled) \
if (dst_y_tiled || src_y_tiled) \
set_blitter_tiling(brw, false, false); \
ADVANCE_BATCH(); \
} while (0)
SET_BLITTER_TILING(brw, false, false); \
ADVANCE_BATCH()
static int
blt_pitch(struct intel_mipmap_tree *mt)