i965: Optimize batchbuffer macros.
Previously OUT_BATCH was just a macro around an inline function which
does
brw->batch.map[brw->batch.used++] = dword;
When making consecutive calls to intel_batchbuffer_emit_dword(), the
compiler isn't able to recognize that we're writing consecutive memory
locations or that it doesn't need to write batch.used back to memory
each time.
We can avoid both of these problems by making a local pointer to the
next location in the batch in BEGIN_BATCH().
Cuts 18k from the .text size.
text data bss dec hex filename
4946956 195152 26192 5168300 4edcac i965_dri.so before
4928956 195152 26192 5150300 4e965c i965_dri.so after
This series (including commit c0433948) improves performance of Synmark
OglBatch7 by 8.01389% +/- 0.63922% (n=83) on Ivybridge.
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
This commit is contained in:
parent
131573df7a
commit
f11c6f09cf
|
@ -873,7 +873,8 @@ struct intel_batchbuffer {
|
|||
#ifdef DEBUG
|
||||
uint16_t emit, total;
|
||||
#endif
|
||||
uint16_t used, reserved_space;
|
||||
uint16_t reserved_space;
|
||||
uint32_t *map_next;
|
||||
uint32_t *map;
|
||||
uint32_t *cpu_map;
|
||||
#define BATCH_SZ (8192*sizeof(uint32_t))
|
||||
|
@ -883,7 +884,7 @@ struct intel_batchbuffer {
|
|||
bool needs_sol_reset;
|
||||
|
||||
struct {
|
||||
uint16_t used;
|
||||
uint32_t *map_next;
|
||||
int reloc_count;
|
||||
} saved;
|
||||
};
|
||||
|
|
|
@ -604,14 +604,15 @@ brw_prepare_shader_draw_parameters(struct brw_context *brw)
|
|||
/**
|
||||
* Emit a VERTEX_BUFFER_STATE entry (part of 3DSTATE_VERTEX_BUFFERS).
|
||||
*/
|
||||
static void
|
||||
static uint32_t *
|
||||
emit_vertex_buffer_state(struct brw_context *brw,
|
||||
unsigned buffer_nr,
|
||||
drm_intel_bo *bo,
|
||||
unsigned bo_ending_address,
|
||||
unsigned bo_offset,
|
||||
unsigned stride,
|
||||
unsigned step_rate)
|
||||
unsigned step_rate,
|
||||
uint32_t *__map)
|
||||
{
|
||||
struct gl_context *ctx = &brw->ctx;
|
||||
uint32_t dw0;
|
||||
|
@ -643,7 +644,10 @@ emit_vertex_buffer_state(struct brw_context *brw,
|
|||
OUT_BATCH(0);
|
||||
}
|
||||
OUT_BATCH(step_rate);
|
||||
|
||||
return __map;
|
||||
}
|
||||
#define EMIT_VERTEX_BUFFER_STATE(...) __map = emit_vertex_buffer_state(__VA_ARGS__, __map)
|
||||
|
||||
static void brw_emit_vertices(struct brw_context *brw)
|
||||
{
|
||||
|
@ -704,14 +708,14 @@ static void brw_emit_vertices(struct brw_context *brw)
|
|||
OUT_BATCH((_3DSTATE_VERTEX_BUFFERS << 16) | (4 * nr_buffers - 1));
|
||||
for (i = 0; i < brw->vb.nr_buffers; i++) {
|
||||
struct brw_vertex_buffer *buffer = &brw->vb.buffers[i];
|
||||
emit_vertex_buffer_state(brw, i, buffer->bo, buffer->bo->size - 1,
|
||||
EMIT_VERTEX_BUFFER_STATE(brw, i, buffer->bo, buffer->bo->size - 1,
|
||||
buffer->offset, buffer->stride,
|
||||
buffer->step_rate);
|
||||
|
||||
}
|
||||
|
||||
if (brw->vs.prog_data->uses_vertexid) {
|
||||
emit_vertex_buffer_state(brw, brw->vb.nr_buffers,
|
||||
EMIT_VERTEX_BUFFER_STATE(brw, brw->vb.nr_buffers,
|
||||
brw->draw.draw_params_bo,
|
||||
brw->draw.draw_params_bo->size - 1,
|
||||
brw->draw.draw_params_offset,
|
||||
|
|
|
@ -252,7 +252,7 @@ void brw_upload_urb_fence(struct brw_context *brw)
|
|||
if ((USED_BATCH(brw->batch) & 15) > 12) {
|
||||
int pad = 16 - (USED_BATCH(brw->batch) & 15);
|
||||
do
|
||||
brw->batch.map[brw->batch.used++] = MI_NOOP;
|
||||
*brw->batch.map_next++ = MI_NOOP;
|
||||
while (--pad);
|
||||
}
|
||||
|
||||
|
|
|
@ -48,6 +48,7 @@ intel_batchbuffer_init(struct brw_context *brw)
|
|||
if (!brw->has_llc) {
|
||||
brw->batch.cpu_map = malloc(BATCH_SZ);
|
||||
brw->batch.map = brw->batch.cpu_map;
|
||||
brw->batch.map_next = brw->batch.cpu_map;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -68,10 +69,10 @@ intel_batchbuffer_reset(struct brw_context *brw)
|
|||
drm_intel_bo_map(brw->batch.bo, true);
|
||||
brw->batch.map = brw->batch.bo->virtual;
|
||||
}
|
||||
brw->batch.map_next = brw->batch.map;
|
||||
|
||||
brw->batch.reserved_space = BATCH_RESERVED;
|
||||
brw->batch.state_batch_offset = brw->batch.bo->size;
|
||||
brw->batch.used = 0;
|
||||
brw->batch.needs_sol_reset = false;
|
||||
|
||||
/* We don't know what ring the new batch will be sent to until we see the
|
||||
|
@ -83,7 +84,7 @@ intel_batchbuffer_reset(struct brw_context *brw)
|
|||
void
|
||||
intel_batchbuffer_save_state(struct brw_context *brw)
|
||||
{
|
||||
brw->batch.saved.used = brw->batch.used;
|
||||
brw->batch.saved.map_next = brw->batch.map_next;
|
||||
brw->batch.saved.reloc_count =
|
||||
drm_intel_gem_bo_get_reloc_count(brw->batch.bo);
|
||||
}
|
||||
|
@ -93,7 +94,7 @@ intel_batchbuffer_reset_to_saved(struct brw_context *brw)
|
|||
{
|
||||
drm_intel_gem_bo_clear_relocs(brw->batch.bo, brw->batch.saved.reloc_count);
|
||||
|
||||
brw->batch.used = brw->batch.saved.used;
|
||||
brw->batch.map_next = brw->batch.saved.map_next;
|
||||
if (USED_BATCH(brw->batch) == 0)
|
||||
brw->batch.ring = UNKNOWN_RING;
|
||||
}
|
||||
|
@ -395,13 +396,13 @@ _intel_batchbuffer_flush(struct brw_context *brw,
|
|||
*/
|
||||
uint32_t
|
||||
intel_batchbuffer_reloc(struct brw_context *brw,
|
||||
drm_intel_bo *buffer,
|
||||
drm_intel_bo *buffer, uint32_t offset,
|
||||
uint32_t read_domains, uint32_t write_domain,
|
||||
uint32_t delta)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
|
||||
ret = drm_intel_bo_emit_reloc(brw->batch.bo, offset,
|
||||
buffer, delta,
|
||||
read_domains, write_domain);
|
||||
assert(ret == 0);
|
||||
|
@ -416,11 +417,11 @@ intel_batchbuffer_reloc(struct brw_context *brw,
|
|||
|
||||
uint64_t
|
||||
intel_batchbuffer_reloc64(struct brw_context *brw,
|
||||
drm_intel_bo *buffer,
|
||||
drm_intel_bo *buffer, uint32_t offset,
|
||||
uint32_t read_domains, uint32_t write_domain,
|
||||
uint32_t delta)
|
||||
{
|
||||
int ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
|
||||
int ret = drm_intel_bo_emit_reloc(brw->batch.bo, offset,
|
||||
buffer, delta,
|
||||
read_domains, write_domain);
|
||||
assert(ret == 0);
|
||||
|
@ -440,8 +441,8 @@ intel_batchbuffer_data(struct brw_context *brw,
|
|||
{
|
||||
assert((bytes & 3) == 0);
|
||||
intel_batchbuffer_require_space(brw, bytes, ring);
|
||||
memcpy(brw->batch.map + brw->batch.used, data, bytes);
|
||||
brw->batch.used += bytes >> 2;
|
||||
memcpy(brw->batch.map_next, data, bytes);
|
||||
brw->batch.map_next += bytes >> 2;
|
||||
}
|
||||
|
||||
static void
|
||||
|
|
|
@ -59,16 +59,18 @@ void intel_batchbuffer_data(struct brw_context *brw,
|
|||
|
||||
uint32_t intel_batchbuffer_reloc(struct brw_context *brw,
|
||||
drm_intel_bo *buffer,
|
||||
uint32_t offset,
|
||||
uint32_t read_domains,
|
||||
uint32_t write_domain,
|
||||
uint32_t offset);
|
||||
uint32_t delta);
|
||||
uint64_t intel_batchbuffer_reloc64(struct brw_context *brw,
|
||||
drm_intel_bo *buffer,
|
||||
uint32_t offset,
|
||||
uint32_t read_domains,
|
||||
uint32_t write_domain,
|
||||
uint32_t offset);
|
||||
uint32_t delta);
|
||||
|
||||
#define USED_BATCH(batch) ((batch).used)
|
||||
#define USED_BATCH(batch) ((uintptr_t)((batch).map_next - (batch).map))
|
||||
|
||||
static inline uint32_t float_as_int(float f)
|
||||
{
|
||||
|
@ -100,7 +102,7 @@ intel_batchbuffer_emit_dword(struct brw_context *brw, GLuint dword)
|
|||
#ifdef DEBUG
|
||||
assert(intel_batchbuffer_space(brw) >= 4);
|
||||
#endif
|
||||
brw->batch.map[brw->batch.used++] = dword;
|
||||
*brw->batch.map_next++ = dword;
|
||||
assert(brw->batch.ring != UNKNOWN_RING);
|
||||
}
|
||||
|
||||
|
@ -163,23 +165,42 @@ intel_batchbuffer_advance(struct brw_context *brw)
|
|||
#endif
|
||||
}
|
||||
|
||||
#define BEGIN_BATCH(n) intel_batchbuffer_begin(brw, n, RENDER_RING)
|
||||
#define BEGIN_BATCH_BLT(n) intel_batchbuffer_begin(brw, n, BLT_RING)
|
||||
#define OUT_BATCH(d) intel_batchbuffer_emit_dword(brw, d)
|
||||
#define OUT_BATCH_F(f) intel_batchbuffer_emit_float(brw, f)
|
||||
#define OUT_RELOC(buf, read_domains, write_domain, delta) \
|
||||
OUT_BATCH(intel_batchbuffer_reloc(brw, buf, read_domains, write_domain, \
|
||||
delta))
|
||||
#define BEGIN_BATCH(n) do { \
|
||||
intel_batchbuffer_begin(brw, (n), RENDER_RING); \
|
||||
uint32_t *__map = brw->batch.map_next; \
|
||||
brw->batch.map_next += (n)
|
||||
|
||||
/* Handle 48-bit address relocations for Gen8+ */
|
||||
#define OUT_RELOC64(buf, read_domains, write_domain, delta) do { \
|
||||
uint64_t reloc64 = intel_batchbuffer_reloc64(brw, buf, read_domains, \
|
||||
write_domain, delta); \
|
||||
OUT_BATCH(reloc64); \
|
||||
OUT_BATCH(reloc64 >> 32); \
|
||||
#define BEGIN_BATCH_BLT(n) do { \
|
||||
intel_batchbuffer_begin(brw, (n), BLT_RING); \
|
||||
uint32_t *__map = brw->batch.map_next; \
|
||||
brw->batch.map_next += (n)
|
||||
|
||||
#define OUT_BATCH(d) *__map++ = (d)
|
||||
#define OUT_BATCH_F(f) OUT_BATCH(float_as_int((f)))
|
||||
|
||||
#define OUT_RELOC(buf, read_domains, write_domain, delta) do { \
|
||||
uint32_t __offset = (__map - brw->batch.map) * 4; \
|
||||
OUT_BATCH(intel_batchbuffer_reloc(brw, (buf), __offset, \
|
||||
(read_domains), \
|
||||
(write_domain), \
|
||||
(delta))); \
|
||||
} while (0)
|
||||
|
||||
#define ADVANCE_BATCH() intel_batchbuffer_advance(brw);
|
||||
/* Handle 48-bit address relocations for Gen8+ */
|
||||
#define OUT_RELOC64(buf, read_domains, write_domain, delta) do { \
|
||||
uint32_t __offset = (__map - brw->batch.map) * 4; \
|
||||
uint64_t reloc64 = intel_batchbuffer_reloc64(brw, (buf), __offset, \
|
||||
(read_domains), \
|
||||
(write_domain), \
|
||||
(delta)); \
|
||||
OUT_BATCH(reloc64); \
|
||||
OUT_BATCH(reloc64 >> 32); \
|
||||
} while (0)
|
||||
|
||||
#define ADVANCE_BATCH() \
|
||||
assert(__map == brw->batch.map_next); \
|
||||
intel_batchbuffer_advance(brw); \
|
||||
} while (0)
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
@ -176,9 +176,10 @@ get_tr_vertical_align(uint32_t tr_mode, uint32_t cpp, bool is_src) {
|
|||
* tiling state would leak into other unsuspecting applications (like the X
|
||||
* server).
|
||||
*/
|
||||
static void
|
||||
static uint32_t *
|
||||
set_blitter_tiling(struct brw_context *brw,
|
||||
bool dst_y_tiled, bool src_y_tiled)
|
||||
bool dst_y_tiled, bool src_y_tiled,
|
||||
uint32_t *__map)
|
||||
{
|
||||
assert(brw->gen >= 6);
|
||||
|
||||
|
@ -193,19 +194,19 @@ set_blitter_tiling(struct brw_context *brw,
|
|||
OUT_BATCH((BCS_SWCTRL_DST_Y | BCS_SWCTRL_SRC_Y) << 16 |
|
||||
(dst_y_tiled ? BCS_SWCTRL_DST_Y : 0) |
|
||||
(src_y_tiled ? BCS_SWCTRL_SRC_Y : 0));
|
||||
return __map;
|
||||
}
|
||||
#define SET_BLITTER_TILING(...) __map = set_blitter_tiling(__VA_ARGS__, __map)
|
||||
|
||||
#define BEGIN_BATCH_BLT_TILED(n, dst_y_tiled, src_y_tiled) do { \
|
||||
#define BEGIN_BATCH_BLT_TILED(n, dst_y_tiled, src_y_tiled) \
|
||||
BEGIN_BATCH_BLT(n + ((dst_y_tiled || src_y_tiled) ? 14 : 0)); \
|
||||
if (dst_y_tiled || src_y_tiled) \
|
||||
set_blitter_tiling(brw, dst_y_tiled, src_y_tiled); \
|
||||
} while (0)
|
||||
SET_BLITTER_TILING(brw, dst_y_tiled, src_y_tiled)
|
||||
|
||||
#define ADVANCE_BATCH_TILED(dst_y_tiled, src_y_tiled) do { \
|
||||
#define ADVANCE_BATCH_TILED(dst_y_tiled, src_y_tiled) \
|
||||
if (dst_y_tiled || src_y_tiled) \
|
||||
set_blitter_tiling(brw, false, false); \
|
||||
ADVANCE_BATCH(); \
|
||||
} while (0)
|
||||
SET_BLITTER_TILING(brw, false, false); \
|
||||
ADVANCE_BATCH()
|
||||
|
||||
static int
|
||||
blt_pitch(struct intel_mipmap_tree *mt)
|
||||
|
|
Loading…
Reference in New Issue