vbo: restructure vbo_save_vertex_list to get more cache hits
- Move more stuff into the cold structure.
- Reorder fields for better packing.
- Flatten the gallium and merged nested structures.

Since we have tens of thousands of these, decreasing the size improves performance by 13%.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13506>
This commit is contained in:
parent
3835205a0e
commit
76892c4e46
|
@ -761,18 +761,18 @@ static void
|
|||
vbo_destroy_vertex_list(struct gl_context *ctx, struct vbo_save_vertex_list *node)
|
||||
{
|
||||
for (gl_vertex_processing_mode mode = VP_MODE_FF; mode < VP_MODE_MAX; ++mode) {
|
||||
_mesa_reference_vao(ctx, &node->VAO[mode], NULL);
|
||||
if (node->merged.gallium.private_refcount[mode]) {
|
||||
assert(node->merged.gallium.private_refcount[mode] > 0);
|
||||
p_atomic_add(&node->merged.gallium.state[mode]->reference.count,
|
||||
-node->merged.gallium.private_refcount[mode]);
|
||||
_mesa_reference_vao(ctx, &node->cold->VAO[mode], NULL);
|
||||
if (node->private_refcount[mode]) {
|
||||
assert(node->private_refcount[mode] > 0);
|
||||
p_atomic_add(&node->state[mode]->reference.count,
|
||||
-node->private_refcount[mode]);
|
||||
}
|
||||
pipe_vertex_state_reference(&node->merged.gallium.state[mode], NULL);
|
||||
pipe_vertex_state_reference(&node->state[mode], NULL);
|
||||
}
|
||||
|
||||
if (node->merged.mode) {
|
||||
free(node->merged.mode);
|
||||
free(node->merged.start_counts);
|
||||
if (node->modes) {
|
||||
free(node->modes);
|
||||
free(node->start_counts);
|
||||
}
|
||||
|
||||
_mesa_reference_buffer_object(ctx, &node->cold->ib.obj, NULL);
|
||||
|
@ -786,7 +786,7 @@ static void
|
|||
vbo_print_vertex_list(struct gl_context *ctx, struct vbo_save_vertex_list *node, OpCode op, FILE *f)
|
||||
{
|
||||
GLuint i;
|
||||
struct gl_buffer_object *buffer = node->VAO[0]->BufferBinding[0].BufferObj;
|
||||
struct gl_buffer_object *buffer = node->cold->VAO[0]->BufferBinding[0].BufferObj;
|
||||
const GLuint vertex_size = _vbo_save_get_stride(node)/sizeof(GLfloat);
|
||||
(void) ctx;
|
||||
|
||||
|
|
|
@ -57,30 +57,30 @@ struct vbo_save_vertex_list {
|
|||
union gl_dlist_node header;
|
||||
|
||||
/* Data used in vbo_save_playback_vertex_list */
|
||||
struct gl_vertex_array_object *VAO[VP_MODE_MAX];
|
||||
unsigned num_draws;
|
||||
uint8_t *modes;
|
||||
union {
|
||||
struct pipe_draw_start_count_bias *start_counts;
|
||||
struct pipe_draw_start_count_bias start_count;
|
||||
};
|
||||
uint8_t mode;
|
||||
|
||||
int16_t private_refcount[VP_MODE_MAX];
|
||||
struct gl_context *ctx;
|
||||
struct pipe_vertex_state *state[VP_MODE_MAX];
|
||||
GLbitfield enabled_attribs[VP_MODE_MAX];
|
||||
|
||||
/* Cold: used during construction or to handle edge-cases.
|
||||
* It's not part of the structure because we want display list nodes
|
||||
* to be tightly packed to get cache hits. Without this, performance would
|
||||
* decrease by an order of magnitude with 10k display lists.
|
||||
*/
|
||||
struct {
|
||||
struct pipe_draw_info info;
|
||||
unsigned char *mode;
|
||||
union {
|
||||
struct pipe_draw_start_count_bias *start_counts;
|
||||
struct pipe_draw_start_count_bias start_count;
|
||||
};
|
||||
unsigned num_draws;
|
||||
|
||||
struct {
|
||||
struct gl_context *ctx;
|
||||
struct pipe_vertex_state *state[VP_MODE_MAX];
|
||||
int16_t private_refcount[VP_MODE_MAX];
|
||||
GLbitfield enabled_attribs[VP_MODE_MAX];
|
||||
struct pipe_draw_vertex_state_info info;
|
||||
} gallium;
|
||||
} merged;
|
||||
|
||||
/* Cold: used during construction or to handle edge-cases */
|
||||
struct {
|
||||
struct gl_vertex_array_object *VAO[VP_MODE_MAX];
|
||||
struct _mesa_index_buffer ib;
|
||||
|
||||
struct pipe_draw_info info;
|
||||
|
||||
/* Copy of the final vertex from node->vertex_store->bufferobj.
|
||||
* Keep this in regular (non-VBO) memory to avoid repeated
|
||||
* map/unmap of the VBO when updating GL current data.
|
||||
|
@ -103,7 +103,7 @@ struct vbo_save_vertex_list {
|
|||
static inline GLsizei
|
||||
_vbo_save_get_stride(const struct vbo_save_vertex_list *node)
|
||||
{
|
||||
return node->VAO[0]->BufferBinding[0].Stride;
|
||||
return node->cold->VAO[0]->BufferBinding[0].Stride;
|
||||
}
|
||||
|
||||
/* Default size for the buffer holding the vertices and the indices.
|
||||
|
|
|
@ -830,38 +830,38 @@ compile_vertex_list(struct gl_context *ctx)
|
|||
}
|
||||
|
||||
/* Prepare for DrawGallium */
|
||||
memset(&node->merged.info, 0, sizeof(struct pipe_draw_info));
|
||||
memset(&node->cold->info, 0, sizeof(struct pipe_draw_info));
|
||||
/* The other info fields will be updated in vbo_save_playback_vertex_list */
|
||||
node->merged.info.index_size = 4;
|
||||
node->merged.info.instance_count = 1;
|
||||
node->merged.info.index.gl_bo = node->cold->ib.obj;
|
||||
node->cold->info.index_size = 4;
|
||||
node->cold->info.instance_count = 1;
|
||||
node->cold->info.index.gl_bo = node->cold->ib.obj;
|
||||
if (merged_prim_count == 1) {
|
||||
node->merged.info.mode = merged_prims[0].mode;
|
||||
node->merged.start_count.start = merged_prims[0].start;
|
||||
node->merged.start_count.count = merged_prims[0].count;
|
||||
node->merged.start_count.index_bias = 0;
|
||||
node->merged.mode = NULL;
|
||||
node->cold->info.mode = merged_prims[0].mode;
|
||||
node->start_count.start = merged_prims[0].start;
|
||||
node->start_count.count = merged_prims[0].count;
|
||||
node->start_count.index_bias = 0;
|
||||
node->modes = NULL;
|
||||
} else {
|
||||
node->merged.mode = malloc(merged_prim_count * sizeof(unsigned char));
|
||||
node->merged.start_counts = malloc(merged_prim_count * sizeof(struct pipe_draw_start_count_bias));
|
||||
node->modes = malloc(merged_prim_count * sizeof(unsigned char));
|
||||
node->start_counts = malloc(merged_prim_count * sizeof(struct pipe_draw_start_count_bias));
|
||||
for (unsigned i = 0; i < merged_prim_count; i++) {
|
||||
node->merged.start_counts[i].start = merged_prims[i].start;
|
||||
node->merged.start_counts[i].count = merged_prims[i].count;
|
||||
node->merged.start_counts[i].index_bias = 0;
|
||||
node->merged.mode[i] = merged_prims[i].mode;
|
||||
node->start_counts[i].start = merged_prims[i].start;
|
||||
node->start_counts[i].count = merged_prims[i].count;
|
||||
node->start_counts[i].index_bias = 0;
|
||||
node->modes[i] = merged_prims[i].mode;
|
||||
}
|
||||
}
|
||||
node->merged.num_draws = merged_prim_count;
|
||||
if (node->merged.num_draws > 1) {
|
||||
node->num_draws = merged_prim_count;
|
||||
if (node->num_draws > 1) {
|
||||
bool same_mode = true;
|
||||
for (unsigned i = 1; i < node->merged.num_draws && same_mode; i++) {
|
||||
same_mode = node->merged.mode[i] == node->merged.mode[0];
|
||||
for (unsigned i = 1; i < node->num_draws && same_mode; i++) {
|
||||
same_mode = node->modes[i] == node->modes[0];
|
||||
}
|
||||
if (same_mode) {
|
||||
/* All primitives use the same mode, so we can simplify a bit */
|
||||
node->merged.info.mode = node->merged.mode[0];
|
||||
free(node->merged.mode);
|
||||
node->merged.mode = NULL;
|
||||
node->cold->info.mode = node->modes[0];
|
||||
free(node->modes);
|
||||
node->modes = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -897,28 +897,27 @@ end:
|
|||
save->current_bo, buffer_offset, stride,
|
||||
save->enabled, save->attrsz, save->attrtype, offsets);
|
||||
/* Reference the vao in the dlist */
|
||||
node->VAO[vpm] = NULL;
|
||||
_mesa_reference_vao(ctx, &node->VAO[vpm], save->VAO[vpm]);
|
||||
node->cold->VAO[vpm] = NULL;
|
||||
_mesa_reference_vao(ctx, &node->cold->VAO[vpm], save->VAO[vpm]);
|
||||
}
|
||||
|
||||
/* Prepare for DrawGalliumVertexState */
|
||||
if (node->merged.num_draws && ctx->Driver.DrawGalliumVertexState) {
|
||||
if (node->num_draws && ctx->Driver.DrawGalliumVertexState) {
|
||||
for (unsigned i = 0; i < VP_MODE_MAX; i++) {
|
||||
uint32_t enabled_attribs = _vbo_get_vao_filter(i) &
|
||||
node->VAO[i]->_EnabledWithMapMode;
|
||||
node->cold->VAO[i]->_EnabledWithMapMode;
|
||||
|
||||
node->merged.gallium.state[i] =
|
||||
ctx->Driver.CreateGalliumVertexState(ctx, node->VAO[i],
|
||||
node->state[i] =
|
||||
ctx->Driver.CreateGalliumVertexState(ctx, node->cold->VAO[i],
|
||||
node->cold->ib.obj,
|
||||
enabled_attribs);
|
||||
node->merged.gallium.private_refcount[i] = 0;
|
||||
node->merged.gallium.enabled_attribs[i] = enabled_attribs;
|
||||
node->private_refcount[i] = 0;
|
||||
node->enabled_attribs[i] = enabled_attribs;
|
||||
}
|
||||
|
||||
node->merged.gallium.ctx = ctx;
|
||||
node->merged.gallium.info.mode = node->merged.info.mode;
|
||||
node->merged.gallium.info.take_vertex_state_ownership = false;
|
||||
assert(node->merged.info.index_size == 4);
|
||||
node->ctx = ctx;
|
||||
node->mode = node->cold->info.mode;
|
||||
assert(node->cold->info.index_size == 4);
|
||||
}
|
||||
|
||||
/* Deal with GL_COMPILE_AND_EXECUTE:
|
||||
|
@ -935,7 +934,7 @@ end:
|
|||
* The problem is that the VAO offset is based on current_bo's layout,
|
||||
* so we have to use a temp value.
|
||||
*/
|
||||
struct gl_vertex_array_object *vao = node->VAO[VP_MODE_SHADER];
|
||||
struct gl_vertex_array_object *vao = node->cold->VAO[VP_MODE_SHADER];
|
||||
GLintptr original = vao->BufferBinding[0].Offset;
|
||||
if (!ctx->ListState.Current.UseLoopback) {
|
||||
GLintptr new_offset = 0;
|
||||
|
|
|
@ -106,10 +106,10 @@ playback_copy_to_current(struct gl_context *ctx,
|
|||
bool color0_changed = false;
|
||||
|
||||
/* Copy conventional attribs and generics except pos */
|
||||
copy_vao(ctx, node->VAO[VP_MODE_SHADER], ~VERT_BIT_POS & VERT_BIT_ALL,
|
||||
copy_vao(ctx, node->cold->VAO[VP_MODE_SHADER], ~VERT_BIT_POS & VERT_BIT_ALL,
|
||||
_NEW_CURRENT_ATTRIB, GL_CURRENT_BIT, 0, &data, &color0_changed);
|
||||
/* Copy materials */
|
||||
copy_vao(ctx, node->VAO[VP_MODE_FF], VERT_BIT_MAT_ALL,
|
||||
copy_vao(ctx, node->cold->VAO[VP_MODE_FF], VERT_BIT_MAT_ALL,
|
||||
_NEW_MATERIAL, GL_LIGHTING_BIT,
|
||||
VBO_MATERIAL_SHIFT, &data, &color0_changed);
|
||||
|
||||
|
@ -138,7 +138,7 @@ bind_vertex_list(struct gl_context *ctx,
|
|||
const struct vbo_save_vertex_list *node)
|
||||
{
|
||||
const gl_vertex_processing_mode mode = ctx->VertexProgram._VPMode;
|
||||
_mesa_set_draw_vao(ctx, node->VAO[mode], _vbo_get_vao_filter(mode));
|
||||
_mesa_set_draw_vao(ctx, node->cold->VAO[mode], _vbo_get_vao_filter(mode));
|
||||
}
|
||||
|
||||
|
||||
|
@ -146,7 +146,7 @@ static void
|
|||
loopback_vertex_list(struct gl_context *ctx,
|
||||
const struct vbo_save_vertex_list *list)
|
||||
{
|
||||
struct gl_buffer_object *bo = list->VAO[0]->BufferBinding[0].BufferObj;
|
||||
struct gl_buffer_object *bo = list->cold->VAO[0]->BufferBinding[0].BufferObj;
|
||||
void *buffer = ctx->Driver.MapBufferRange(ctx, 0, bo->Size, GL_MAP_READ_BIT, /* ? */
|
||||
bo, MAP_INTERNAL);
|
||||
|
||||
|
@ -201,7 +201,7 @@ vbo_save_playback_vertex_list_gallium(struct gl_context *ctx,
|
|||
/* This sets which vertex arrays are enabled, which determines
|
||||
* which attribs have stride = 0 and whether edge flags are enabled.
|
||||
*/
|
||||
const GLbitfield enabled = node->merged.gallium.enabled_attribs[mode];
|
||||
const GLbitfield enabled = node->enabled_attribs[mode];
|
||||
ctx->Array._DrawVAOEnabledAttribs = enabled;
|
||||
_mesa_set_varying_vp_inputs(ctx, enabled);
|
||||
|
||||
|
@ -228,10 +228,13 @@ vbo_save_playback_vertex_list_gallium(struct gl_context *ctx,
|
|||
if (vp->info.inputs_read & ~enabled || vp->DualSlotInputs)
|
||||
return USE_SLOW_PATH;
|
||||
|
||||
struct pipe_vertex_state *state = node->merged.gallium.state[mode];
|
||||
struct pipe_draw_vertex_state_info info = node->merged.gallium.info;
|
||||
struct pipe_vertex_state *state = node->state[mode];
|
||||
struct pipe_draw_vertex_state_info info;
|
||||
|
||||
if (node->merged.gallium.ctx == ctx) {
|
||||
info.mode = node->mode;
|
||||
info.take_vertex_state_ownership = false;
|
||||
|
||||
if (node->ctx == ctx) {
|
||||
/* This mechanism allows passing references to the driver without
|
||||
* using atomics to increase the reference count.
|
||||
*
|
||||
|
@ -248,7 +251,7 @@ vbo_save_playback_vertex_list_gallium(struct gl_context *ctx,
|
|||
* possibly turn a million atomic increments into 1 add and 1 subtract
|
||||
* atomic op over the whole lifetime of an app.
|
||||
*/
|
||||
int16_t * const private_refcount = (int16_t*)&node->merged.gallium.private_refcount[mode];
|
||||
int16_t * const private_refcount = (int16_t*)&node->private_refcount[mode];
|
||||
assert(*private_refcount >= 0);
|
||||
|
||||
if (unlikely(*private_refcount == 0)) {
|
||||
|
@ -270,15 +273,15 @@ vbo_save_playback_vertex_list_gallium(struct gl_context *ctx,
|
|||
}
|
||||
|
||||
/* Fast path using a pre-built gallium vertex buffer state. */
|
||||
if (node->merged.mode || node->merged.num_draws > 1) {
|
||||
if (node->modes || node->num_draws > 1) {
|
||||
ctx->Driver.DrawGalliumVertexState(ctx, state, info,
|
||||
node->merged.start_counts,
|
||||
node->merged.mode,
|
||||
node->merged.num_draws,
|
||||
node->start_counts,
|
||||
node->modes,
|
||||
node->num_draws,
|
||||
enabled & VERT_ATTRIB_EDGEFLAG);
|
||||
} else if (node->merged.num_draws) {
|
||||
} else if (node->num_draws) {
|
||||
ctx->Driver.DrawGalliumVertexState(ctx, state, info,
|
||||
&node->merged.start_count,
|
||||
&node->start_count,
|
||||
NULL, 1,
|
||||
enabled & VERT_ATTRIB_EDGEFLAG);
|
||||
}
|
||||
|
@ -327,18 +330,18 @@ vbo_save_playback_vertex_list(struct gl_context *ctx, void *data, bool copy_to_c
|
|||
|
||||
assert(ctx->NewState == 0);
|
||||
|
||||
struct pipe_draw_info *info = (struct pipe_draw_info *) &node->merged.info;
|
||||
struct pipe_draw_info *info = (struct pipe_draw_info *) &node->cold->info;
|
||||
void *gl_bo = info->index.gl_bo;
|
||||
if (node->merged.mode) {
|
||||
if (node->modes) {
|
||||
ctx->Driver.DrawGalliumMultiMode(ctx, info,
|
||||
node->merged.start_counts,
|
||||
node->merged.mode,
|
||||
node->merged.num_draws);
|
||||
} else if (node->merged.num_draws == 1) {
|
||||
ctx->Driver.DrawGallium(ctx, info, 0, &node->merged.start_count, 1);
|
||||
} else if (node->merged.num_draws) {
|
||||
ctx->Driver.DrawGallium(ctx, info, 0, node->merged.start_counts,
|
||||
node->merged.num_draws);
|
||||
node->start_counts,
|
||||
node->modes,
|
||||
node->num_draws);
|
||||
} else if (node->num_draws == 1) {
|
||||
ctx->Driver.DrawGallium(ctx, info, 0, &node->start_count, 1);
|
||||
} else if (node->num_draws) {
|
||||
ctx->Driver.DrawGallium(ctx, info, 0, node->start_counts,
|
||||
node->num_draws);
|
||||
}
|
||||
info->index.gl_bo = gl_bo;
|
||||
|
||||
|
|
|
@ -155,14 +155,14 @@ _vbo_loopback_vertex_list(struct gl_context *ctx,
|
|||
/* All Legacy, NV, ARB and Material attributes are routed through
|
||||
* the NV attributes entrypoints:
|
||||
*/
|
||||
const struct gl_vertex_array_object *vao = node->VAO[VP_MODE_FF];
|
||||
const struct gl_vertex_array_object *vao = node->cold->VAO[VP_MODE_FF];
|
||||
GLbitfield mask = vao->Enabled & VERT_BIT_MAT_ALL;
|
||||
while (mask) {
|
||||
const int i = u_bit_scan(&mask);
|
||||
append_attr(&nr, la, i, VBO_MATERIAL_SHIFT, vao);
|
||||
}
|
||||
|
||||
vao = node->VAO[VP_MODE_SHADER];
|
||||
vao = node->cold->VAO[VP_MODE_SHADER];
|
||||
mask = vao->Enabled & ~(VERT_BIT_POS | VERT_BIT_GENERIC0);
|
||||
while (mask) {
|
||||
const int i = u_bit_scan(&mask);
|
||||
|
|
Loading…
Reference in New Issue