vbo: restructure vbo_save_vertex_list to get more cache hits

- Move more fields (the VAO array and pipe_draw_info) into the cold structure.
- Reorder fields for better packing.
- Flatten the nested "merged" and "merged.gallium" structures into the top-level struct.

Since we have tens of thousands of these, decreasing the size improves
performance by 13%.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13506>
This commit is contained in:
Marek Olšák 2021-10-23 01:19:23 -04:00 committed by Marge Bot
parent 3835205a0e
commit 76892c4e46
5 changed files with 95 additions and 93 deletions

View File

@ -761,18 +761,18 @@ static void
vbo_destroy_vertex_list(struct gl_context *ctx, struct vbo_save_vertex_list *node)
{
for (gl_vertex_processing_mode mode = VP_MODE_FF; mode < VP_MODE_MAX; ++mode) {
_mesa_reference_vao(ctx, &node->VAO[mode], NULL);
if (node->merged.gallium.private_refcount[mode]) {
assert(node->merged.gallium.private_refcount[mode] > 0);
p_atomic_add(&node->merged.gallium.state[mode]->reference.count,
-node->merged.gallium.private_refcount[mode]);
_mesa_reference_vao(ctx, &node->cold->VAO[mode], NULL);
if (node->private_refcount[mode]) {
assert(node->private_refcount[mode] > 0);
p_atomic_add(&node->state[mode]->reference.count,
-node->private_refcount[mode]);
}
pipe_vertex_state_reference(&node->merged.gallium.state[mode], NULL);
pipe_vertex_state_reference(&node->state[mode], NULL);
}
if (node->merged.mode) {
free(node->merged.mode);
free(node->merged.start_counts);
if (node->modes) {
free(node->modes);
free(node->start_counts);
}
_mesa_reference_buffer_object(ctx, &node->cold->ib.obj, NULL);
@ -786,7 +786,7 @@ static void
vbo_print_vertex_list(struct gl_context *ctx, struct vbo_save_vertex_list *node, OpCode op, FILE *f)
{
GLuint i;
struct gl_buffer_object *buffer = node->VAO[0]->BufferBinding[0].BufferObj;
struct gl_buffer_object *buffer = node->cold->VAO[0]->BufferBinding[0].BufferObj;
const GLuint vertex_size = _vbo_save_get_stride(node)/sizeof(GLfloat);
(void) ctx;

View File

@ -57,30 +57,30 @@ struct vbo_save_vertex_list {
union gl_dlist_node header;
/* Data used in vbo_save_playback_vertex_list */
struct gl_vertex_array_object *VAO[VP_MODE_MAX];
unsigned num_draws;
uint8_t *modes;
union {
struct pipe_draw_start_count_bias *start_counts;
struct pipe_draw_start_count_bias start_count;
};
uint8_t mode;
int16_t private_refcount[VP_MODE_MAX];
struct gl_context *ctx;
struct pipe_vertex_state *state[VP_MODE_MAX];
GLbitfield enabled_attribs[VP_MODE_MAX];
/* Cold: used during construction or to handle edge-cases.
* It's not part of the structure because we want display list nodes
* to be tightly packed to get cache hits. Without this, performance would
* decrease by an order of magnitude with 10k display lists.
*/
struct {
struct pipe_draw_info info;
unsigned char *mode;
union {
struct pipe_draw_start_count_bias *start_counts;
struct pipe_draw_start_count_bias start_count;
};
unsigned num_draws;
struct {
struct gl_context *ctx;
struct pipe_vertex_state *state[VP_MODE_MAX];
int16_t private_refcount[VP_MODE_MAX];
GLbitfield enabled_attribs[VP_MODE_MAX];
struct pipe_draw_vertex_state_info info;
} gallium;
} merged;
/* Cold: used during construction or to handle egde-cases */
struct {
struct gl_vertex_array_object *VAO[VP_MODE_MAX];
struct _mesa_index_buffer ib;
struct pipe_draw_info info;
/* Copy of the final vertex from node->vertex_store->bufferobj.
* Keep this in regular (non-VBO) memory to avoid repeated
* map/unmap of the VBO when updating GL current data.
@ -103,7 +103,7 @@ struct vbo_save_vertex_list {
static inline GLsizei
_vbo_save_get_stride(const struct vbo_save_vertex_list *node)
{
return node->VAO[0]->BufferBinding[0].Stride;
return node->cold->VAO[0]->BufferBinding[0].Stride;
}
/* Default size for the buffer holding the vertices and the indices.

View File

@ -830,38 +830,38 @@ compile_vertex_list(struct gl_context *ctx)
}
/* Prepare for DrawGallium */
memset(&node->merged.info, 0, sizeof(struct pipe_draw_info));
memset(&node->cold->info, 0, sizeof(struct pipe_draw_info));
/* The other info fields will be updated in vbo_save_playback_vertex_list */
node->merged.info.index_size = 4;
node->merged.info.instance_count = 1;
node->merged.info.index.gl_bo = node->cold->ib.obj;
node->cold->info.index_size = 4;
node->cold->info.instance_count = 1;
node->cold->info.index.gl_bo = node->cold->ib.obj;
if (merged_prim_count == 1) {
node->merged.info.mode = merged_prims[0].mode;
node->merged.start_count.start = merged_prims[0].start;
node->merged.start_count.count = merged_prims[0].count;
node->merged.start_count.index_bias = 0;
node->merged.mode = NULL;
node->cold->info.mode = merged_prims[0].mode;
node->start_count.start = merged_prims[0].start;
node->start_count.count = merged_prims[0].count;
node->start_count.index_bias = 0;
node->modes = NULL;
} else {
node->merged.mode = malloc(merged_prim_count * sizeof(unsigned char));
node->merged.start_counts = malloc(merged_prim_count * sizeof(struct pipe_draw_start_count_bias));
node->modes = malloc(merged_prim_count * sizeof(unsigned char));
node->start_counts = malloc(merged_prim_count * sizeof(struct pipe_draw_start_count_bias));
for (unsigned i = 0; i < merged_prim_count; i++) {
node->merged.start_counts[i].start = merged_prims[i].start;
node->merged.start_counts[i].count = merged_prims[i].count;
node->merged.start_counts[i].index_bias = 0;
node->merged.mode[i] = merged_prims[i].mode;
node->start_counts[i].start = merged_prims[i].start;
node->start_counts[i].count = merged_prims[i].count;
node->start_counts[i].index_bias = 0;
node->modes[i] = merged_prims[i].mode;
}
}
node->merged.num_draws = merged_prim_count;
if (node->merged.num_draws > 1) {
node->num_draws = merged_prim_count;
if (node->num_draws > 1) {
bool same_mode = true;
for (unsigned i = 1; i < node->merged.num_draws && same_mode; i++) {
same_mode = node->merged.mode[i] == node->merged.mode[0];
for (unsigned i = 1; i < node->num_draws && same_mode; i++) {
same_mode = node->modes[i] == node->modes[0];
}
if (same_mode) {
/* All primitives use the same mode, so we can simplify a bit */
node->merged.info.mode = node->merged.mode[0];
free(node->merged.mode);
node->merged.mode = NULL;
node->cold->info.mode = node->modes[0];
free(node->modes);
node->modes = NULL;
}
}
@ -897,28 +897,27 @@ end:
save->current_bo, buffer_offset, stride,
save->enabled, save->attrsz, save->attrtype, offsets);
/* Reference the vao in the dlist */
node->VAO[vpm] = NULL;
_mesa_reference_vao(ctx, &node->VAO[vpm], save->VAO[vpm]);
node->cold->VAO[vpm] = NULL;
_mesa_reference_vao(ctx, &node->cold->VAO[vpm], save->VAO[vpm]);
}
/* Prepare for DrawGalliumVertexState */
if (node->merged.num_draws && ctx->Driver.DrawGalliumVertexState) {
if (node->num_draws && ctx->Driver.DrawGalliumVertexState) {
for (unsigned i = 0; i < VP_MODE_MAX; i++) {
uint32_t enabled_attribs = _vbo_get_vao_filter(i) &
node->VAO[i]->_EnabledWithMapMode;
node->cold->VAO[i]->_EnabledWithMapMode;
node->merged.gallium.state[i] =
ctx->Driver.CreateGalliumVertexState(ctx, node->VAO[i],
node->state[i] =
ctx->Driver.CreateGalliumVertexState(ctx, node->cold->VAO[i],
node->cold->ib.obj,
enabled_attribs);
node->merged.gallium.private_refcount[i] = 0;
node->merged.gallium.enabled_attribs[i] = enabled_attribs;
node->private_refcount[i] = 0;
node->enabled_attribs[i] = enabled_attribs;
}
node->merged.gallium.ctx = ctx;
node->merged.gallium.info.mode = node->merged.info.mode;
node->merged.gallium.info.take_vertex_state_ownership = false;
assert(node->merged.info.index_size == 4);
node->ctx = ctx;
node->mode = node->cold->info.mode;
assert(node->cold->info.index_size == 4);
}
/* Deal with GL_COMPILE_AND_EXECUTE:
@ -935,7 +934,7 @@ end:
* The problem is that the VAO offset is based on current_bo's layout,
* so we have to use a temp value.
*/
struct gl_vertex_array_object *vao = node->VAO[VP_MODE_SHADER];
struct gl_vertex_array_object *vao = node->cold->VAO[VP_MODE_SHADER];
GLintptr original = vao->BufferBinding[0].Offset;
if (!ctx->ListState.Current.UseLoopback) {
GLintptr new_offset = 0;

View File

@ -106,10 +106,10 @@ playback_copy_to_current(struct gl_context *ctx,
bool color0_changed = false;
/* Copy conventional attribs and generics except pos */
copy_vao(ctx, node->VAO[VP_MODE_SHADER], ~VERT_BIT_POS & VERT_BIT_ALL,
copy_vao(ctx, node->cold->VAO[VP_MODE_SHADER], ~VERT_BIT_POS & VERT_BIT_ALL,
_NEW_CURRENT_ATTRIB, GL_CURRENT_BIT, 0, &data, &color0_changed);
/* Copy materials */
copy_vao(ctx, node->VAO[VP_MODE_FF], VERT_BIT_MAT_ALL,
copy_vao(ctx, node->cold->VAO[VP_MODE_FF], VERT_BIT_MAT_ALL,
_NEW_MATERIAL, GL_LIGHTING_BIT,
VBO_MATERIAL_SHIFT, &data, &color0_changed);
@ -138,7 +138,7 @@ bind_vertex_list(struct gl_context *ctx,
const struct vbo_save_vertex_list *node)
{
const gl_vertex_processing_mode mode = ctx->VertexProgram._VPMode;
_mesa_set_draw_vao(ctx, node->VAO[mode], _vbo_get_vao_filter(mode));
_mesa_set_draw_vao(ctx, node->cold->VAO[mode], _vbo_get_vao_filter(mode));
}
@ -146,7 +146,7 @@ static void
loopback_vertex_list(struct gl_context *ctx,
const struct vbo_save_vertex_list *list)
{
struct gl_buffer_object *bo = list->VAO[0]->BufferBinding[0].BufferObj;
struct gl_buffer_object *bo = list->cold->VAO[0]->BufferBinding[0].BufferObj;
void *buffer = ctx->Driver.MapBufferRange(ctx, 0, bo->Size, GL_MAP_READ_BIT, /* ? */
bo, MAP_INTERNAL);
@ -201,7 +201,7 @@ vbo_save_playback_vertex_list_gallium(struct gl_context *ctx,
/* This sets which vertex arrays are enabled, which determines
* which attribs have stride = 0 and whether edge flags are enabled.
*/
const GLbitfield enabled = node->merged.gallium.enabled_attribs[mode];
const GLbitfield enabled = node->enabled_attribs[mode];
ctx->Array._DrawVAOEnabledAttribs = enabled;
_mesa_set_varying_vp_inputs(ctx, enabled);
@ -228,10 +228,13 @@ vbo_save_playback_vertex_list_gallium(struct gl_context *ctx,
if (vp->info.inputs_read & ~enabled || vp->DualSlotInputs)
return USE_SLOW_PATH;
struct pipe_vertex_state *state = node->merged.gallium.state[mode];
struct pipe_draw_vertex_state_info info = node->merged.gallium.info;
struct pipe_vertex_state *state = node->state[mode];
struct pipe_draw_vertex_state_info info;
if (node->merged.gallium.ctx == ctx) {
info.mode = node->mode;
info.take_vertex_state_ownership = false;
if (node->ctx == ctx) {
/* This mechanism allows passing references to the driver without
* using atomics to increase the reference count.
*
@ -248,7 +251,7 @@ vbo_save_playback_vertex_list_gallium(struct gl_context *ctx,
* possibly turn a million atomic increments into 1 add and 1 subtract
* atomic op over the whole lifetime of an app.
*/
int16_t * const private_refcount = (int16_t*)&node->merged.gallium.private_refcount[mode];
int16_t * const private_refcount = (int16_t*)&node->private_refcount[mode];
assert(*private_refcount >= 0);
if (unlikely(*private_refcount == 0)) {
@ -270,15 +273,15 @@ vbo_save_playback_vertex_list_gallium(struct gl_context *ctx,
}
/* Fast path using a pre-built gallium vertex buffer state. */
if (node->merged.mode || node->merged.num_draws > 1) {
if (node->modes || node->num_draws > 1) {
ctx->Driver.DrawGalliumVertexState(ctx, state, info,
node->merged.start_counts,
node->merged.mode,
node->merged.num_draws,
node->start_counts,
node->modes,
node->num_draws,
enabled & VERT_ATTRIB_EDGEFLAG);
} else if (node->merged.num_draws) {
} else if (node->num_draws) {
ctx->Driver.DrawGalliumVertexState(ctx, state, info,
&node->merged.start_count,
&node->start_count,
NULL, 1,
enabled & VERT_ATTRIB_EDGEFLAG);
}
@ -327,18 +330,18 @@ vbo_save_playback_vertex_list(struct gl_context *ctx, void *data, bool copy_to_c
assert(ctx->NewState == 0);
struct pipe_draw_info *info = (struct pipe_draw_info *) &node->merged.info;
struct pipe_draw_info *info = (struct pipe_draw_info *) &node->cold->info;
void *gl_bo = info->index.gl_bo;
if (node->merged.mode) {
if (node->modes) {
ctx->Driver.DrawGalliumMultiMode(ctx, info,
node->merged.start_counts,
node->merged.mode,
node->merged.num_draws);
} else if (node->merged.num_draws == 1) {
ctx->Driver.DrawGallium(ctx, info, 0, &node->merged.start_count, 1);
} else if (node->merged.num_draws) {
ctx->Driver.DrawGallium(ctx, info, 0, node->merged.start_counts,
node->merged.num_draws);
node->start_counts,
node->modes,
node->num_draws);
} else if (node->num_draws == 1) {
ctx->Driver.DrawGallium(ctx, info, 0, &node->start_count, 1);
} else if (node->num_draws) {
ctx->Driver.DrawGallium(ctx, info, 0, node->start_counts,
node->num_draws);
}
info->index.gl_bo = gl_bo;

View File

@ -155,14 +155,14 @@ _vbo_loopback_vertex_list(struct gl_context *ctx,
/* All Legacy, NV, ARB and Material attributes are routed through
* the NV attributes entrypoints:
*/
const struct gl_vertex_array_object *vao = node->VAO[VP_MODE_FF];
const struct gl_vertex_array_object *vao = node->cold->VAO[VP_MODE_FF];
GLbitfield mask = vao->Enabled & VERT_BIT_MAT_ALL;
while (mask) {
const int i = u_bit_scan(&mask);
append_attr(&nr, la, i, VBO_MATERIAL_SHIFT, vao);
}
vao = node->VAO[VP_MODE_SHADER];
vao = node->cold->VAO[VP_MODE_SHADER];
mask = vao->Enabled & ~(VERT_BIT_POS | VERT_BIT_GENERIC0);
while (mask) {
const int i = u_bit_scan(&mask);