i965: Optimize batchbuffer macros.

Previously OUT_BATCH was just a macro around an inline function which
does

   brw->batch.map[brw->batch.used++] = dword;

When making consecutive calls to intel_batchbuffer_emit_dword() the
compiler isn't able to recognize that we're writing consecutive memory
locations or that it doesn't need to write batch.used back to memory
each time.

We can avoid both of these problems by making a local pointer to the
next location in the batch in BEGIN_BATCH().

Cuts 18k from the .text size.

   text     data      bss      dec      hex  filename
4946956   195152    26192  5168300   4edcac  i965_dri.so before
4928956   195152    26192  5150300   4e965c  i965_dri.so after

This series (including commit c0433948) improves performance of Synmark
OglBatch7 by 8.01389% +/- 0.63922% (n=83) on Ivybridge.

Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
This commit is contained in:
Matt Turner 2015-07-08 19:00:48 -07:00
parent 131573df7a
commit f11c6f09cf
6 changed files with 71 additions and 43 deletions

View File

@@ -873,7 +873,8 @@ struct intel_batchbuffer {
#ifdef DEBUG
uint16_t emit, total;
#endif
uint16_t used, reserved_space;
uint16_t reserved_space;
uint32_t *map_next;
uint32_t *map;
uint32_t *cpu_map;
#define BATCH_SZ (8192*sizeof(uint32_t))
@@ -883,7 +884,7 @@ struct intel_batchbuffer {
bool needs_sol_reset;
struct {
uint16_t used;
uint32_t *map_next;
int reloc_count;
} saved;
};

View File

@@ -604,14 +604,15 @@ brw_prepare_shader_draw_parameters(struct brw_context *brw)
/**
* Emit a VERTEX_BUFFER_STATE entry (part of 3DSTATE_VERTEX_BUFFERS).
*/
static void
static uint32_t *
emit_vertex_buffer_state(struct brw_context *brw,
unsigned buffer_nr,
drm_intel_bo *bo,
unsigned bo_ending_address,
unsigned bo_offset,
unsigned stride,
unsigned step_rate)
unsigned step_rate,
uint32_t *__map)
{
struct gl_context *ctx = &brw->ctx;
uint32_t dw0;
@@ -643,7 +644,10 @@ emit_vertex_buffer_state(struct brw_context *brw,
OUT_BATCH(0);
}
OUT_BATCH(step_rate);
return __map;
}
#define EMIT_VERTEX_BUFFER_STATE(...) __map = emit_vertex_buffer_state(__VA_ARGS__, __map)
static void brw_emit_vertices(struct brw_context *brw)
{
@@ -704,14 +708,14 @@ static void brw_emit_vertices(struct brw_context *brw)
OUT_BATCH((_3DSTATE_VERTEX_BUFFERS << 16) | (4 * nr_buffers - 1));
for (i = 0; i < brw->vb.nr_buffers; i++) {
struct brw_vertex_buffer *buffer = &brw->vb.buffers[i];
emit_vertex_buffer_state(brw, i, buffer->bo, buffer->bo->size - 1,
EMIT_VERTEX_BUFFER_STATE(brw, i, buffer->bo, buffer->bo->size - 1,
buffer->offset, buffer->stride,
buffer->step_rate);
}
if (brw->vs.prog_data->uses_vertexid) {
emit_vertex_buffer_state(brw, brw->vb.nr_buffers,
EMIT_VERTEX_BUFFER_STATE(brw, brw->vb.nr_buffers,
brw->draw.draw_params_bo,
brw->draw.draw_params_bo->size - 1,
brw->draw.draw_params_offset,

View File

@@ -252,7 +252,7 @@ void brw_upload_urb_fence(struct brw_context *brw)
if ((USED_BATCH(brw->batch) & 15) > 12) {
int pad = 16 - (USED_BATCH(brw->batch) & 15);
do
brw->batch.map[brw->batch.used++] = MI_NOOP;
*brw->batch.map_next++ = MI_NOOP;
while (--pad);
}

View File

@@ -48,6 +48,7 @@ intel_batchbuffer_init(struct brw_context *brw)
if (!brw->has_llc) {
brw->batch.cpu_map = malloc(BATCH_SZ);
brw->batch.map = brw->batch.cpu_map;
brw->batch.map_next = brw->batch.cpu_map;
}
}
@@ -68,10 +69,10 @@ intel_batchbuffer_reset(struct brw_context *brw)
drm_intel_bo_map(brw->batch.bo, true);
brw->batch.map = brw->batch.bo->virtual;
}
brw->batch.map_next = brw->batch.map;
brw->batch.reserved_space = BATCH_RESERVED;
brw->batch.state_batch_offset = brw->batch.bo->size;
brw->batch.used = 0;
brw->batch.needs_sol_reset = false;
/* We don't know what ring the new batch will be sent to until we see the
@@ -83,7 +84,7 @@ intel_batchbuffer_reset(struct brw_context *brw)
void
intel_batchbuffer_save_state(struct brw_context *brw)
{
brw->batch.saved.used = brw->batch.used;
brw->batch.saved.map_next = brw->batch.map_next;
brw->batch.saved.reloc_count =
drm_intel_gem_bo_get_reloc_count(brw->batch.bo);
}
@@ -93,7 +94,7 @@ intel_batchbuffer_reset_to_saved(struct brw_context *brw)
{
drm_intel_gem_bo_clear_relocs(brw->batch.bo, brw->batch.saved.reloc_count);
brw->batch.used = brw->batch.saved.used;
brw->batch.map_next = brw->batch.saved.map_next;
if (USED_BATCH(brw->batch) == 0)
brw->batch.ring = UNKNOWN_RING;
}
@@ -395,13 +396,13 @@ _intel_batchbuffer_flush(struct brw_context *brw,
*/
uint32_t
intel_batchbuffer_reloc(struct brw_context *brw,
drm_intel_bo *buffer,
drm_intel_bo *buffer, uint32_t offset,
uint32_t read_domains, uint32_t write_domain,
uint32_t delta)
{
int ret;
ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
ret = drm_intel_bo_emit_reloc(brw->batch.bo, offset,
buffer, delta,
read_domains, write_domain);
assert(ret == 0);
@@ -416,11 +417,11 @@ intel_batchbuffer_reloc(struct brw_context *brw,
uint64_t
intel_batchbuffer_reloc64(struct brw_context *brw,
drm_intel_bo *buffer,
drm_intel_bo *buffer, uint32_t offset,
uint32_t read_domains, uint32_t write_domain,
uint32_t delta)
{
int ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
int ret = drm_intel_bo_emit_reloc(brw->batch.bo, offset,
buffer, delta,
read_domains, write_domain);
assert(ret == 0);
@@ -440,8 +441,8 @@ intel_batchbuffer_data(struct brw_context *brw,
{
assert((bytes & 3) == 0);
intel_batchbuffer_require_space(brw, bytes, ring);
memcpy(brw->batch.map + brw->batch.used, data, bytes);
brw->batch.used += bytes >> 2;
memcpy(brw->batch.map_next, data, bytes);
brw->batch.map_next += bytes >> 2;
}
static void

View File

@@ -59,16 +59,18 @@ void intel_batchbuffer_data(struct brw_context *brw,
uint32_t intel_batchbuffer_reloc(struct brw_context *brw,
drm_intel_bo *buffer,
uint32_t offset,
uint32_t read_domains,
uint32_t write_domain,
uint32_t offset);
uint32_t delta);
uint64_t intel_batchbuffer_reloc64(struct brw_context *brw,
drm_intel_bo *buffer,
uint32_t offset,
uint32_t read_domains,
uint32_t write_domain,
uint32_t offset);
uint32_t delta);
#define USED_BATCH(batch) ((batch).used)
#define USED_BATCH(batch) ((uintptr_t)((batch).map_next - (batch).map))
static inline uint32_t float_as_int(float f)
{
@@ -100,7 +102,7 @@ intel_batchbuffer_emit_dword(struct brw_context *brw, GLuint dword)
#ifdef DEBUG
assert(intel_batchbuffer_space(brw) >= 4);
#endif
brw->batch.map[brw->batch.used++] = dword;
*brw->batch.map_next++ = dword;
assert(brw->batch.ring != UNKNOWN_RING);
}
@@ -163,23 +165,42 @@ intel_batchbuffer_advance(struct brw_context *brw)
#endif
}
#define BEGIN_BATCH(n) intel_batchbuffer_begin(brw, n, RENDER_RING)
#define BEGIN_BATCH_BLT(n) intel_batchbuffer_begin(brw, n, BLT_RING)
#define OUT_BATCH(d) intel_batchbuffer_emit_dword(brw, d)
#define OUT_BATCH_F(f) intel_batchbuffer_emit_float(brw, f)
#define OUT_RELOC(buf, read_domains, write_domain, delta) \
OUT_BATCH(intel_batchbuffer_reloc(brw, buf, read_domains, write_domain, \
delta))
#define BEGIN_BATCH(n) do { \
intel_batchbuffer_begin(brw, (n), RENDER_RING); \
uint32_t *__map = brw->batch.map_next; \
brw->batch.map_next += (n)
/* Handle 48-bit address relocations for Gen8+ */
#define OUT_RELOC64(buf, read_domains, write_domain, delta) do { \
uint64_t reloc64 = intel_batchbuffer_reloc64(brw, buf, read_domains, \
write_domain, delta); \
OUT_BATCH(reloc64); \
OUT_BATCH(reloc64 >> 32); \
#define BEGIN_BATCH_BLT(n) do { \
intel_batchbuffer_begin(brw, (n), BLT_RING); \
uint32_t *__map = brw->batch.map_next; \
brw->batch.map_next += (n)
#define OUT_BATCH(d) *__map++ = (d)
#define OUT_BATCH_F(f) OUT_BATCH(float_as_int((f)))
#define OUT_RELOC(buf, read_domains, write_domain, delta) do { \
uint32_t __offset = (__map - brw->batch.map) * 4; \
OUT_BATCH(intel_batchbuffer_reloc(brw, (buf), __offset, \
(read_domains), \
(write_domain), \
(delta))); \
} while (0)
#define ADVANCE_BATCH() intel_batchbuffer_advance(brw);
/* Handle 48-bit address relocations for Gen8+ */
#define OUT_RELOC64(buf, read_domains, write_domain, delta) do { \
uint32_t __offset = (__map - brw->batch.map) * 4; \
uint64_t reloc64 = intel_batchbuffer_reloc64(brw, (buf), __offset, \
(read_domains), \
(write_domain), \
(delta)); \
OUT_BATCH(reloc64); \
OUT_BATCH(reloc64 >> 32); \
} while (0)
#define ADVANCE_BATCH() \
assert(__map == brw->batch.map_next); \
intel_batchbuffer_advance(brw); \
} while (0)
#ifdef __cplusplus
}

View File

@@ -176,9 +176,10 @@ get_tr_vertical_align(uint32_t tr_mode, uint32_t cpp, bool is_src) {
* tiling state would leak into other unsuspecting applications (like the X
* server).
*/
static void
static uint32_t *
set_blitter_tiling(struct brw_context *brw,
bool dst_y_tiled, bool src_y_tiled)
bool dst_y_tiled, bool src_y_tiled,
uint32_t *__map)
{
assert(brw->gen >= 6);
@@ -193,19 +194,19 @@ set_blitter_tiling(struct brw_context *brw,
OUT_BATCH((BCS_SWCTRL_DST_Y | BCS_SWCTRL_SRC_Y) << 16 |
(dst_y_tiled ? BCS_SWCTRL_DST_Y : 0) |
(src_y_tiled ? BCS_SWCTRL_SRC_Y : 0));
return __map;
}
#define SET_BLITTER_TILING(...) __map = set_blitter_tiling(__VA_ARGS__, __map)
#define BEGIN_BATCH_BLT_TILED(n, dst_y_tiled, src_y_tiled) do { \
#define BEGIN_BATCH_BLT_TILED(n, dst_y_tiled, src_y_tiled) \
BEGIN_BATCH_BLT(n + ((dst_y_tiled || src_y_tiled) ? 14 : 0)); \
if (dst_y_tiled || src_y_tiled) \
set_blitter_tiling(brw, dst_y_tiled, src_y_tiled); \
} while (0)
SET_BLITTER_TILING(brw, dst_y_tiled, src_y_tiled)
#define ADVANCE_BATCH_TILED(dst_y_tiled, src_y_tiled) do { \
#define ADVANCE_BATCH_TILED(dst_y_tiled, src_y_tiled) \
if (dst_y_tiled || src_y_tiled) \
set_blitter_tiling(brw, false, false); \
ADVANCE_BATCH(); \
} while (0)
SET_BLITTER_TILING(brw, false, false); \
ADVANCE_BATCH()
static int
blt_pitch(struct intel_mipmap_tree *mt)