From f11c6f09cf36909ff399353b20195a31cf0f1907 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Wed, 8 Jul 2015 19:00:48 -0700
Subject: [PATCH] i965: Optimize batchbuffer macros.

Previously OUT_BATCH was just a macro around an inline function which
does

   brw->batch.map[brw->batch.used++] = dword;

When making consecutive calls to intel_batchbuffer_emit_dword() the
compiler isn't able to recognize that we're writing consecutive memory
locations or that it doesn't need to write batch.used back to memory
each time.

We can avoid both of these problems by making a local pointer to the
next location in the batch in BEGIN_BATCH().

Cuts 18k from the .text size.

   text     data      bss      dec      hex  filename
4946956   195152    26192  5168300   4edcac  i965_dri.so before
4928956   195152    26192  5150300   4e965c  i965_dri.so after

This series (including commit c0433948) improves performance of Synmark
OglBatch7 by 8.01389% +/- 0.63922% (n=83) on Ivybridge.

Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 src/mesa/drivers/dri/i965/brw_context.h       |  5 +-
 src/mesa/drivers/dri/i965/brw_draw_upload.c   | 12 ++--
 src/mesa/drivers/dri/i965/brw_urb.c           |  2 +-
 src/mesa/drivers/dri/i965/intel_batchbuffer.c | 19 ++++---
 src/mesa/drivers/dri/i965/intel_batchbuffer.h | 57 +++++++++++++------
 src/mesa/drivers/dri/i965/intel_blit.c        | 19 ++++---
 6 files changed, 71 insertions(+), 43 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 44d1aeaf54a..34a49b2abdc 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -873,7 +873,8 @@ struct intel_batchbuffer {
 #ifdef DEBUG
    uint16_t emit, total;
 #endif
-   uint16_t used, reserved_space;
+   uint16_t reserved_space;
+   uint32_t *map_next;
    uint32_t *map;
    uint32_t *cpu_map;
 #define BATCH_SZ (8192*sizeof(uint32_t))
@@ -883,7 +884,7 @@ struct intel_batchbuffer {
    bool needs_sol_reset;
 
    struct {
-      uint16_t used;
+      uint32_t *map_next;
       int reloc_count;
    } saved;
 };
diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c b/src/mesa/drivers/dri/i965/brw_draw_upload.c
index e0b55c6b620..c95f0c37f89 100644
--- a/src/mesa/drivers/dri/i965/brw_draw_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c
@@ -604,14 +604,15 @@ brw_prepare_shader_draw_parameters(struct brw_context *brw)
 /**
  * Emit a VERTEX_BUFFER_STATE entry (part of 3DSTATE_VERTEX_BUFFERS).
  */
-static void
+static uint32_t *
 emit_vertex_buffer_state(struct brw_context *brw,
                          unsigned buffer_nr,
                          drm_intel_bo *bo,
                          unsigned bo_ending_address,
                          unsigned bo_offset,
                          unsigned stride,
-                         unsigned step_rate)
+                         unsigned step_rate,
+                         uint32_t *__map)
 {
    struct gl_context *ctx = &brw->ctx;
    uint32_t dw0;
@@ -643,7 +644,10 @@ emit_vertex_buffer_state(struct brw_context *brw,
       OUT_BATCH(0);
    }
    OUT_BATCH(step_rate);
+
+   return __map;
 }
+#define EMIT_VERTEX_BUFFER_STATE(...) __map = emit_vertex_buffer_state(__VA_ARGS__, __map)
 
 static void brw_emit_vertices(struct brw_context *brw)
 {
@@ -704,14 +708,14 @@ static void brw_emit_vertices(struct brw_context *brw)
       OUT_BATCH((_3DSTATE_VERTEX_BUFFERS << 16) | (4 * nr_buffers - 1));
       for (i = 0; i < brw->vb.nr_buffers; i++) {
 	 struct brw_vertex_buffer *buffer = &brw->vb.buffers[i];
-         emit_vertex_buffer_state(brw, i, buffer->bo, buffer->bo->size - 1,
+         EMIT_VERTEX_BUFFER_STATE(brw, i, buffer->bo, buffer->bo->size - 1,
                                   buffer->offset, buffer->stride,
                                   buffer->step_rate);
 
       }
 
       if (brw->vs.prog_data->uses_vertexid) {
-         emit_vertex_buffer_state(brw, brw->vb.nr_buffers,
+         EMIT_VERTEX_BUFFER_STATE(brw, brw->vb.nr_buffers,
                                   brw->draw.draw_params_bo,
                                   brw->draw.draw_params_bo->size - 1,
                                   brw->draw.draw_params_offset,
diff --git a/src/mesa/drivers/dri/i965/brw_urb.c b/src/mesa/drivers/dri/i965/brw_urb.c
index 8fc06ba7cd9..6078c3810d4 100644
--- a/src/mesa/drivers/dri/i965/brw_urb.c
+++ b/src/mesa/drivers/dri/i965/brw_urb.c
@@ -252,7 +252,7 @@ void brw_upload_urb_fence(struct brw_context *brw)
    if ((USED_BATCH(brw->batch) & 15) > 12) {
       int pad = 16 - (USED_BATCH(brw->batch) & 15);
       do
-	 brw->batch.map[brw->batch.used++] = MI_NOOP;
+         *brw->batch.map_next++ = MI_NOOP;
       while (--pad);
    }
 
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index 628a7b774b2..088ffd276b4 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -48,6 +48,7 @@ intel_batchbuffer_init(struct brw_context *brw)
    if (!brw->has_llc) {
       brw->batch.cpu_map = malloc(BATCH_SZ);
       brw->batch.map = brw->batch.cpu_map;
+      brw->batch.map_next = brw->batch.cpu_map;
    }
 }
 
@@ -68,10 +69,10 @@ intel_batchbuffer_reset(struct brw_context *brw)
       drm_intel_bo_map(brw->batch.bo, true);
       brw->batch.map = brw->batch.bo->virtual;
    }
+   brw->batch.map_next = brw->batch.map;
 
    brw->batch.reserved_space = BATCH_RESERVED;
    brw->batch.state_batch_offset = brw->batch.bo->size;
-   brw->batch.used = 0;
    brw->batch.needs_sol_reset = false;
 
    /* We don't know what ring the new batch will be sent to until we see the
@@ -83,7 +84,7 @@ intel_batchbuffer_reset(struct brw_context *brw)
 void
 intel_batchbuffer_save_state(struct brw_context *brw)
 {
-   brw->batch.saved.used = brw->batch.used;
+   brw->batch.saved.map_next = brw->batch.map_next;
    brw->batch.saved.reloc_count =
       drm_intel_gem_bo_get_reloc_count(brw->batch.bo);
 }
@@ -93,7 +94,7 @@ intel_batchbuffer_reset_to_saved(struct brw_context *brw)
 {
    drm_intel_gem_bo_clear_relocs(brw->batch.bo, brw->batch.saved.reloc_count);
 
-   brw->batch.used = brw->batch.saved.used;
+   brw->batch.map_next = brw->batch.saved.map_next;
    if (USED_BATCH(brw->batch) == 0)
       brw->batch.ring = UNKNOWN_RING;
 }
@@ -395,13 +396,13 @@ _intel_batchbuffer_flush(struct brw_context *brw,
  */
 uint32_t
 intel_batchbuffer_reloc(struct brw_context *brw,
-                        drm_intel_bo *buffer,
+                        drm_intel_bo *buffer, uint32_t offset,
                         uint32_t read_domains, uint32_t write_domain,
                         uint32_t delta)
 {
    int ret;
 
-   ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
+   ret = drm_intel_bo_emit_reloc(brw->batch.bo, offset,
 				 buffer, delta,
 				 read_domains, write_domain);
    assert(ret == 0);
@@ -416,11 +417,11 @@ intel_batchbuffer_reloc(struct brw_context *brw,
 
 uint64_t
 intel_batchbuffer_reloc64(struct brw_context *brw,
-                          drm_intel_bo *buffer,
+                          drm_intel_bo *buffer, uint32_t offset,
                           uint32_t read_domains, uint32_t write_domain,
                           uint32_t delta)
 {
-   int ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
+   int ret = drm_intel_bo_emit_reloc(brw->batch.bo, offset,
                                      buffer, delta,
                                      read_domains, write_domain);
    assert(ret == 0);
@@ -440,8 +441,8 @@ intel_batchbuffer_data(struct brw_context *brw,
 {
    assert((bytes & 3) == 0);
    intel_batchbuffer_require_space(brw, bytes, ring);
-   memcpy(brw->batch.map + brw->batch.used, data, bytes);
-   brw->batch.used += bytes >> 2;
+   memcpy(brw->batch.map_next, data, bytes);
+   brw->batch.map_next += bytes >> 2;
 }
 
 static void
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.h b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
index 12d20d363c3..84add927c9a 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.h
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
@@ -59,16 +59,18 @@ void intel_batchbuffer_data(struct brw_context *brw,
 
 uint32_t intel_batchbuffer_reloc(struct brw_context *brw,
                                  drm_intel_bo *buffer,
+                                 uint32_t offset,
                                  uint32_t read_domains,
                                  uint32_t write_domain,
-                                 uint32_t offset);
+                                 uint32_t delta);
 uint64_t intel_batchbuffer_reloc64(struct brw_context *brw,
                                    drm_intel_bo *buffer,
+                                   uint32_t offset,
                                    uint32_t read_domains,
                                    uint32_t write_domain,
-                                   uint32_t offset);
+                                   uint32_t delta);
 
-#define USED_BATCH(batch) ((batch).used)
+#define USED_BATCH(batch) ((uintptr_t)((batch).map_next - (batch).map))
 
 static inline uint32_t float_as_int(float f)
 {
@@ -100,7 +102,7 @@ intel_batchbuffer_emit_dword(struct brw_context *brw, GLuint dword)
 #ifdef DEBUG
    assert(intel_batchbuffer_space(brw) >= 4);
 #endif
-   brw->batch.map[brw->batch.used++] = dword;
+   *brw->batch.map_next++ = dword;
    assert(brw->batch.ring != UNKNOWN_RING);
 }
 
@@ -163,23 +165,42 @@ intel_batchbuffer_advance(struct brw_context *brw)
 #endif
 }
 
-#define BEGIN_BATCH(n) intel_batchbuffer_begin(brw, n, RENDER_RING)
-#define BEGIN_BATCH_BLT(n) intel_batchbuffer_begin(brw, n, BLT_RING)
-#define OUT_BATCH(d) intel_batchbuffer_emit_dword(brw, d)
-#define OUT_BATCH_F(f) intel_batchbuffer_emit_float(brw, f)
-#define OUT_RELOC(buf, read_domains, write_domain, delta)                  \
-   OUT_BATCH(intel_batchbuffer_reloc(brw, buf, read_domains, write_domain, \
-                                     delta))
+#define BEGIN_BATCH(n) do {                            \
+   intel_batchbuffer_begin(brw, (n), RENDER_RING);     \
+   uint32_t *__map = brw->batch.map_next;              \
+   brw->batch.map_next += (n)
 
-/* Handle 48-bit address relocations for Gen8+ */
-#define OUT_RELOC64(buf, read_domains, write_domain, delta) do {        \
-   uint64_t reloc64 = intel_batchbuffer_reloc64(brw, buf, read_domains, \
-                                                write_domain, delta);   \
-   OUT_BATCH(reloc64);                                                  \
-   OUT_BATCH(reloc64 >> 32);                                            \
+#define BEGIN_BATCH_BLT(n) do {                        \
+   intel_batchbuffer_begin(brw, (n), BLT_RING);        \
+   uint32_t *__map = brw->batch.map_next;              \
+   brw->batch.map_next += (n)
+
+#define OUT_BATCH(d) *__map++ = (d)
+#define OUT_BATCH_F(f) OUT_BATCH(float_as_int((f)))
+
+#define OUT_RELOC(buf, read_domains, write_domain, delta) do { \
+   uint32_t __offset = (__map - brw->batch.map) * 4;           \
+   OUT_BATCH(intel_batchbuffer_reloc(brw, (buf), __offset,     \
+                                     (read_domains),           \
+                                     (write_domain),           \
+                                     (delta)));                \
 } while (0)
 
-#define ADVANCE_BATCH() intel_batchbuffer_advance(brw);
+/* Handle 48-bit address relocations for Gen8+ */
+#define OUT_RELOC64(buf, read_domains, write_domain, delta) do {      \
+   uint32_t __offset = (__map - brw->batch.map) * 4;                  \
+   uint64_t reloc64 = intel_batchbuffer_reloc64(brw, (buf), __offset, \
+                                                (read_domains),       \
+                                                (write_domain),       \
+                                                (delta));             \
+   OUT_BATCH(reloc64);                                                \
+   OUT_BATCH(reloc64 >> 32);                                          \
+} while (0)
+
+#define ADVANCE_BATCH()                  \
+   assert(__map == brw->batch.map_next); \
+   intel_batchbuffer_advance(brw);       \
+} while (0)
 
 #ifdef __cplusplus
 }
diff --git a/src/mesa/drivers/dri/i965/intel_blit.c b/src/mesa/drivers/dri/i965/intel_blit.c
index bc390535c86..4fc3fa803cb 100644
--- a/src/mesa/drivers/dri/i965/intel_blit.c
+++ b/src/mesa/drivers/dri/i965/intel_blit.c
@@ -176,9 +176,10 @@ get_tr_vertical_align(uint32_t tr_mode, uint32_t cpp, bool is_src) {
  * tiling state would leak into other unsuspecting applications (like the X
  * server).
  */
-static void
+static uint32_t *
 set_blitter_tiling(struct brw_context *brw,
-                   bool dst_y_tiled, bool src_y_tiled)
+                   bool dst_y_tiled, bool src_y_tiled,
+                   uint32_t *__map)
 {
    assert(brw->gen >= 6);
 
@@ -193,19 +194,19 @@ set_blitter_tiling(struct brw_context *brw,
    OUT_BATCH((BCS_SWCTRL_DST_Y | BCS_SWCTRL_SRC_Y) << 16 |
              (dst_y_tiled ? BCS_SWCTRL_DST_Y : 0) |
              (src_y_tiled ? BCS_SWCTRL_SRC_Y : 0));
+   return __map;
 }
+#define SET_BLITTER_TILING(...) __map = set_blitter_tiling(__VA_ARGS__, __map)
 
-#define BEGIN_BATCH_BLT_TILED(n, dst_y_tiled, src_y_tiled) do {         \
+#define BEGIN_BATCH_BLT_TILED(n, dst_y_tiled, src_y_tiled)              \
       BEGIN_BATCH_BLT(n + ((dst_y_tiled || src_y_tiled) ? 14 : 0));     \
       if (dst_y_tiled || src_y_tiled)                                   \
-         set_blitter_tiling(brw, dst_y_tiled, src_y_tiled);             \
-   } while (0)
+         SET_BLITTER_TILING(brw, dst_y_tiled, src_y_tiled)
 
-#define ADVANCE_BATCH_TILED(dst_y_tiled, src_y_tiled) do {              \
+#define ADVANCE_BATCH_TILED(dst_y_tiled, src_y_tiled)                   \
       if (dst_y_tiled || src_y_tiled)                                   \
-         set_blitter_tiling(brw, false, false);                         \
-      ADVANCE_BATCH();                                                  \
-   } while (0)
+         SET_BLITTER_TILING(brw, false, false);                         \
+      ADVANCE_BATCH()
 
 static int
 blt_pitch(struct intel_mipmap_tree *mt)