vbo/dlist: implement primitive merging

Merge consecutive primitives using the same mode while constructing the index buffer.

This improves performance a lot (x3 - x10) SPECviewperf13 snx-03 test by reducing the
number of draw calls per frame.
Here are some numbers for 4 of the tests:

      | Num draw calls |     GPU-load    |
------|----------------|-----------------|
      | Before | After |  Before | After |
------|--------|-------|---------|-------|
test1 |  390k  |  16k  |   68%   |  90%  |
test2 |  370k  |  16k  |   40%   |  90%  |
test3 |  1.2M  |  35k  |   38%   |  78%  |
test10|  3.5M  |  35k  |   36%   |  58%  |

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7078>
This commit is contained in:
Pierre-Eric Pelloux-Prayer 2020-10-08 14:49:29 +02:00
parent ac3d4c7635
commit 310991415e
1 changed files with 74 additions and 15 deletions

View File

@ -605,25 +605,67 @@ compile_vertex_list(struct gl_context *ctx)
int end = node->prims[node->prim_count - 1].start +
node->prims[node->prim_count - 1].count;
int total_vert_count = end - node->prims[0].start;
int max_indices_count = total_vert_count * 2;
/* Estimate for the worst case: all prims are line strips (the +1 is because
* wrap_buffers may call use but the last primitive may not be complete) */
int max_indices_count = MAX2(total_vert_count * 2 - (node->prim_count * 2) + 1,
total_vert_count);
int size = max_indices_count * sizeof(uint32_t);
uint32_t* indices = (uint32_t*) malloc(size);
uint32_t max_index = 0, min_index = 0xFFFFFFFF;
int idx = 0;
int last_valid_prim = -1;
/* Construct indices array. */
for (unsigned i = 0; i < node->prim_count; i++) {
assert(node->prims[i].basevertex == 0);
GLubyte mode = node->prims[i].mode;
int vertex_count = node->prims[i].count;
if (!vertex_count) {
continue;
}
/* Line strips get converted to lines */
if (mode == GL_LINE_STRIP)
mode = GL_LINES;
/* If 2 consecutive prims use the same mode => merge them. */
bool merge_prims = last_valid_prim >= 0 &&
mode == node->prims[last_valid_prim].mode &&
mode != GL_LINE_LOOP && mode != GL_TRIANGLE_FAN &&
mode != GL_QUAD_STRIP && mode != GL_POLYGON &&
mode != GL_PATCHES;
/* To be able to merge consecutive triangle strips we need to insert
* a degenerate triangle.
*/
if (merge_prims &&
mode == GL_TRIANGLE_STRIP) {
/* Insert a degenerate triangle */
assert(node->prims[last_valid_prim].mode == GL_TRIANGLE_STRIP);
unsigned tri_count = node->prims[last_valid_prim].count - 2;
indices[idx] = indices[idx - 1];
indices[idx + 1] = node->prims[i].start;
idx += 2;
node->prims[last_valid_prim].count += 2;
if (tri_count % 2) {
/* Add another index to preserve winding order */
indices[idx++] = node->prims[i].start;
node->prims[last_valid_prim].count++;
}
}
int start = idx;
/* Convert line strips to lines if it'll allow if the previous
* prim mode is GL_LINES or if the next primitive mode is
* GL_LINES or GL_LINE_LOOP.
* prim mode is GL_LINES (so merge_prims is true) or if the next
* primitive mode is GL_LINES or GL_LINE_LOOP.
*/
if (node->prims[i].mode == GL_LINE_STRIP &&
((i > 0 && node->prims[i - 1].mode == GL_LINES) ||
(merge_prims ||
(i < node->prim_count - 1 &&
(node->prims[i + 1].mode == GL_LINE_STRIP ||
node->prims[i + 1].mode == GL_LINES)))) {
@ -635,7 +677,7 @@ compile_vertex_list(struct gl_context *ctx)
node->prims[i].count++;
}
}
node->prims[i].mode = GL_LINES;
node->prims[i].mode = mode;
} else {
for (unsigned j = 0; j < vertex_count; j++) {
indices[idx++] = node->prims[i].start + j;
@ -645,15 +687,27 @@ compile_vertex_list(struct gl_context *ctx)
min_index = MIN2(min_index, indices[start]);
max_index = MAX2(max_index, indices[idx - 1]);
node->prims[i].start = start;
if (merge_prims) {
/* Update vertex count. */
node->prims[last_valid_prim].count += idx - start;
} else {
/* Keep this primitive */
last_valid_prim += 1;
assert(last_valid_prim <= i);
node->prims[i].start = start;
node->prims[last_valid_prim] = node->prims[i];
}
}
if (idx == 0)
goto skip_node;
assert(idx <= max_indices_count);
node->prim_count = last_valid_prim + 1;
node->ib.ptr = NULL;
node->ib.count = idx;
node->ib.index_size_shift = (GL_UNSIGNED_INT - GL_UNSIGNED_BYTE) >> 1;
node->min_index = min_index;
node->max_index = max_index;
@ -663,14 +717,19 @@ compile_vertex_list(struct gl_context *ctx)
idx * sizeof(uint32_t), indices,
GL_STATIC_DRAW_ARB, GL_MAP_WRITE_BIT,
node->ib.obj);
assert(success);
if (!success) {
node->min_index = node->max_index = 0;
ctx->Driver.DeleteBuffer(ctx, node->ib.obj);
node->ib.obj = NULL;
node->vertex_count = 0;
_mesa_error(ctx, GL_OUT_OF_MEMORY, "IB allocation");
}
if (success)
goto out;
ctx->Driver.DeleteBuffer(ctx, node->ib.obj);
_mesa_error(ctx, GL_OUT_OF_MEMORY, "IB allocation");
skip_node:
node->ib.obj = NULL;
node->vertex_count = 0;
node->prim_count = 0;
out:
free(indices);
} else {
node->ib.obj = NULL;