vc4: Optimize CL emits by doing size checks up front.

The optimizer obviously doesn't have the ability to rewrite these to skip
the size checks per call, so we have to do it manually.

Improves a norast benchmark on simulation by 0.779706% +/- 0.405838%
(n=6087).
This commit is contained in:
Eric Anholt 2014-12-22 10:09:10 -08:00
parent 20e3a2430e
commit 229bf4475f
5 changed files with 66 additions and 16 deletions

View File

@ -29,17 +29,21 @@ void
/* Initialize a command list: allocate a minimal 1-byte ralloc'd buffer
 * (parented to the context, grown later by cl_ensure_space()) and reset
 * the write cursor.
 *
 * NOTE(review): this is a rendered diff with the +/- markers stripped;
 * the "cl->end" line is the pre-patch version and the two lines after it
 * are its post-patch replacement.
 */
vc4_init_cl(struct vc4_context *vc4, struct vc4_cl *cl)
{
cl->base = ralloc_size(vc4, 1);
/* pre-patch: tracked an explicit 'end' pointer */
cl->end = cl->next = cl->base;
/* post-patch: track a byte size instead of an end pointer */
cl->next = cl->base;
cl->size = 0;
}
void
/* Ensure at least 'space' more bytes can be written to the CL without
 * per-emit bounds checks, growing the ralloc'd buffer if needed.
 *
 * NOTE(review): rendered diff with +/- markers stripped — vc4_grow_cl()
 * is the pre-patch name and cl_ensure_space() its post-patch
 * replacement; the two 'size' declarations below are likewise an
 * old/new pair.  The missing space in "cl->next -cl->base" is a render
 * artifact preserved verbatim.
 */
vc4_grow_cl(struct vc4_cl *cl)
cl_ensure_space(struct vc4_cl *cl, uint32_t space)
{
/* pre-patch growth policy: double the current extent, 4k floor */
uint32_t size = MAX2((cl->end - cl->base) * 2, 4096);
/* Fast path: the current allocation already has room. */
if ((cl->next - cl->base) + space <= cl->size)
return;
/* Grow to at least the requested total, and at least double. */
uint32_t size = MAX2(cl->size + space, cl->size * 2);
uint32_t offset = cl->next -cl->base;
/* Reallocate under the same ralloc parent, then rebase the write
 * cursor at the preserved offset into the (possibly moved) buffer.
 */
cl->base = reralloc(ralloc_parent(cl->base), cl->base, uint8_t, size);
cl->end = cl->base + size;
cl->size = size;
cl->next = cl->base + offset;
}

View File

@ -35,13 +35,12 @@ struct vc4_bo;
/* Growable command-list buffer: base..base+size is ralloc'd storage and
 * next is the current write cursor.
 *
 * NOTE(review): rendered diff with +/- markers stripped — 'end' is the
 * pre-patch field that the patch replaces with the byte count 'size'.
 */
struct vc4_cl {
void *base; /* start of the ralloc'd buffer */
void *next; /* current write cursor */
void *end; /* pre-patch: one past the end of the allocation */
uint32_t size; /* post-patch: allocation size in bytes */
uint32_t reloc_next;
uint32_t reloc_count;
};
void vc4_init_cl(struct vc4_context *vc4, struct vc4_cl *cl);
void vc4_grow_cl(struct vc4_cl *cl);
void vc4_reset_cl(struct vc4_cl *cl);
void vc4_dump_cl(void *cl, uint32_t size, bool is_render);
uint32_t vc4_gem_hindex(struct vc4_context *vc4, struct vc4_bo *bo);
@ -49,8 +48,7 @@ uint32_t vc4_gem_hindex(struct vc4_context *vc4, struct vc4_bo *bo);
static inline void
cl_u8(struct vc4_cl *cl, uint8_t n)
{
if (cl->next + 1 > cl->end)
vc4_grow_cl(cl);
assert((cl->next - cl->base) + 1 <= cl->size);
*(uint8_t *)cl->next = n;
cl->next++;
@ -59,8 +57,7 @@ cl_u8(struct vc4_cl *cl, uint8_t n)
static inline void
cl_u16(struct vc4_cl *cl, uint32_t n)
{
if (cl->next + 2 > cl->end)
vc4_grow_cl(cl);
assert((cl->next - cl->base) + 2 <= cl->size);
*(uint16_t *)cl->next = n;
cl->next += 2;
@ -69,8 +66,7 @@ cl_u16(struct vc4_cl *cl, uint32_t n)
static inline void
cl_u32(struct vc4_cl *cl, uint32_t n)
{
if (cl->next + 4 > cl->end)
vc4_grow_cl(cl);
assert((cl->next - cl->base) + 4 <= cl->size);
*(uint32_t *)cl->next = n;
cl->next += 4;
@ -79,8 +75,7 @@ cl_u32(struct vc4_cl *cl, uint32_t n)
static inline void
cl_ptr(struct vc4_cl *cl, void *ptr)
{
if (cl->next + sizeof(void *) > cl->end)
vc4_grow_cl(cl);
assert((cl->next - cl->base) + sizeof(void *) <= cl->size);
*(void **)cl->next = ptr;
cl->next += sizeof(void *);
@ -134,4 +129,6 @@ cl_reloc(struct vc4_context *vc4, struct vc4_cl *cl,
cl_reloc_hindex(cl, vc4_gem_hindex(vc4, bo), offset);
}
void cl_ensure_space(struct vc4_cl *cl, uint32_t size);
#endif /* VC4_CL_H */

View File

@ -104,6 +104,22 @@ vc4_setup_rcl(struct vc4_context *vc4)
resolve_uncleared);
#endif
uint32_t reloc_size = 9;
uint32_t clear_size = 14;
uint32_t config_size = 11 + reloc_size;
uint32_t loadstore_size = 7 + reloc_size;
uint32_t tilecoords_size = 3;
uint32_t branch_size = 5 + reloc_size;
uint32_t color_store_size = 1;
cl_ensure_space(&vc4->rcl,
clear_size +
config_size +
loadstore_size +
xtiles * ytiles * (loadstore_size * 4 +
tilecoords_size * 3 +
branch_size +
color_store_size));
cl_u8(&vc4->rcl, VC4_PACKET_CLEAR_COLORS);
cl_u32(&vc4->rcl, vc4->clear_color[0]);
cl_u32(&vc4->rcl, vc4->clear_color[1]);
@ -290,9 +306,9 @@ vc4_flush(struct pipe_context *pctx)
if (vc4_debug & VC4_DEBUG_CL) {
fprintf(stderr, "BCL:\n");
vc4_dump_cl(vc4->bcl.base, vc4->bcl.end - vc4->bcl.base, false);
vc4_dump_cl(vc4->bcl.base, vc4->bcl.size, false);
fprintf(stderr, "RCL:\n");
vc4_dump_cl(vc4->rcl.base, vc4->rcl.end - vc4->rcl.base, true);
vc4_dump_cl(vc4->rcl.base, vc4->rcl.size, true);
}
struct drm_vc4_submit_cl submit;

View File

@ -29,6 +29,32 @@
#include "vc4_context.h"
#include "vc4_resource.h"
static void
vc4_get_draw_cl_space(struct vc4_context *vc4)
{
/* Binner gets our packet state -- vc4_emit.c contents,
* and the primitive itself.
*/
cl_ensure_space(&vc4->bcl, 256);
/* Nothing for rcl -- that's covered by vc4_context.c */
/* shader_rec gets up to 12 dwords of reloc handles plus a maximally
* sized shader_rec (104 bytes base for 8 vattrs plus 32 bytes of
* vattr stride).
*/
cl_ensure_space(&vc4->shader_rec, 12 * sizeof(uint32_t) + 104 + 8 * 32);
/* Uniforms are covered by vc4_write_uniforms(). */
/* There could be up to 16 textures per stage, plus misc other
* pointers.
*/
cl_ensure_space(&vc4->bo_handles, (2 * 16 + 20) * sizeof(uint32_t));
cl_ensure_space(&vc4->bo_pointers,
(2 * 16 + 20) * sizeof(struct vc4_bo *));
}
/**
* Does the initial binning command list setup for drawing to a given FBO.
*/
@ -38,6 +64,8 @@ vc4_start_draw(struct vc4_context *vc4)
if (vc4->needs_flush)
return;
vc4_get_draw_cl_space(vc4);
uint32_t width = vc4->framebuffer.width;
uint32_t height = vc4->framebuffer.height;
uint32_t tilew = align(width, 64) / 64;
@ -114,6 +142,8 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
return;
}
vc4_get_draw_cl_space(vc4);
struct vc4_vertex_stateobj *vtx = vc4->vtx;
struct vc4_vertexbuf_stateobj *vertexbuf = &vc4->vertexbuf;

View File

@ -2729,6 +2729,9 @@ vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader,
const uint32_t *gallium_uniforms = cb->cb[0].user_buffer;
struct vc4_bo *ubo = vc4_upload_ubo(vc4, shader, gallium_uniforms);
cl_ensure_space(&vc4->uniforms, (uinfo->count +
uinfo->num_texture_samples) * 4);
cl_start_shader_reloc(&vc4->uniforms, uinfo->num_texture_samples);
for (int i = 0; i < uinfo->count; i++) {