llvmpipe: rework do_block_16 to use bitmasks and ffs

Some nice speedups:
   gears:  547 -> 597
   isosurf: 83 -> 98

Others like gloss unchanged.  Could do further work in this direction.
This commit is contained in:
Keith Whitwell 2010-02-17 11:07:45 +00:00
parent cb66e9f2b4
commit 31c816731e
1 changed files with 41 additions and 27 deletions

View File

@ -146,42 +146,56 @@ static void
do_block_16( struct lp_rasterizer_task *rast_task,
const struct lp_rast_triangle *tri,
int x, int y,
int c0,
int c1,
int c2,
int c3 )
int c2 )
{
const int eo1 = tri->eo1 * 4;
const int eo2 = tri->eo2 * 4;
const int eo3 = tri->eo3 * 4;
const int *step0 = tri->inputs.step[0];
const int *step1 = tri->inputs.step[1];
const int *step2 = tri->inputs.step[2];
int i;
unsigned mask = 0;
int eo[3];
int c[3];
int i, j;
assert(x % 16 == 0);
assert(y % 16 == 0);
for (i = 0; i < 16; i++) {
int cx1 = c1 + step0[i] * 4;
int cx2 = c2 + step1[i] * 4;
int cx3 = c3 + step2[i] * 4;
eo[0] = tri->eo1 * 4;
eo[1] = tri->eo2 * 4;
eo[2] = tri->eo3 * 4;
if (cx1 + eo1 < 0 ||
cx2 + eo2 < 0 ||
cx3 + eo3 < 0) {
/* the block is completely outside the triangle - nop */
LP_COUNT(nr_empty_4);
}
else {
int px = x + pos_table4[i][0];
int py = y + pos_table4[i][1];
/* Don't bother testing if the 4x4 block is entirely in/out of
* the triangle. It's a little faster to do it in the jit code.
*/
LP_COUNT(nr_non_empty_4);
do_block_4(rast_task, tri, px, py, cx1, cx2, cx3);
c[0] = c0;
c[1] = c1;
c[2] = c2;
for (j = 0; j < 3; j++) {
const int *step = tri->inputs.step[j];
int cx = c[j];
int eox = eo[j];
/* Mask has bits set whenever we are outside any of the edges.
*/
for (i = 0; i < 16; i++) {
int out = cx + step[i] * 4 + eox;
mask |= (out >> 31) & (1 << i);
}
}
mask = ~mask & 0xffff;
while (mask) {
int i = ffs(mask) - 1;
int px = x + pos_table4[i][0];
int py = y + pos_table4[i][1];
int cx1 = c0 + tri->inputs.step[0][i] * 4;
int cx2 = c1 + tri->inputs.step[1][i] * 4;
int cx3 = c2 + tri->inputs.step[2][i] * 4;
mask &= ~(1 << i);
/* Don't bother testing if the 4x4 block is entirely in/out of
* the triangle. It's a little faster to do it in the jit code.
*/
LP_COUNT(nr_non_empty_4);
do_block_4(rast_task, tri, px, py, cx1, cx2, cx3);
}
}