llvmpipe: try to do more of rast_tri_3_16 with intrinsics

There was actually a large quantity of scalar code in these functions
previously.  This tries to move more into intrinsics.

Introduce an sse2 mm_mullo_epi32 replacement to avoid sse4 dependency
in the new rasterization code.
Keith Whitwell 2010-10-11 16:30:14 +01:00
parent 4cb3b4ced8
commit 2cf98d5a6d
2 changed files with 271 additions and 9 deletions
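In outline, the new code evaluates an edge function for four pixels at once and reads the result out of the lane sign bits; a minimal sketch of the pattern (values and names hypothetical, <emmintrin.h> assumed, mm_mullo_epi32() is the SSE2 helper introduced below):

   int c = 5, dcdx = -3;      /* edge value at the block origin, and its x-step */
   unsigned px, mask = 0;

   /* scalar: one edge test per pixel */
   for (px = 0; px < 4; px++)
      mask |= ((c + (int)px * dcdx) < 0) << px;

   /* vector: the same four tests at once; the sign bits form the mask */
   __m128i vc = _mm_add_epi32(_mm_set1_epi32(c),
                              mm_mullo_epi32(_mm_set1_epi32(dcdx),
                                             _mm_setr_epi32(0, 1, 2, 3)));
   unsigned vmask = _mm_movemask_ps(_mm_castsi128_ps(vc));   /* == mask */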

src/gallium/drivers/llvmpipe/lp_rast_priv.h

@@ -89,19 +89,21 @@ struct lp_rast_shader_inputs {
    const struct lp_rast_state *state;
 };
 
+/* Note: the order of these values is important as they are loaded by
+ * sse code in rasterization:
+ */
 struct lp_rast_plane {
-   /* one-pixel sized trivial accept offsets for each plane */
-   int ei;
-   /* one-pixel sized trivial reject offsets for each plane */
-   int eo;
    /* edge function values at minx,miny ?? */
    int c;
    int dcdx;
    int dcdy;
+   /* one-pixel sized trivial reject offsets for each plane */
+   int eo;
+   /* one-pixel sized trivial accept offsets for each plane */
+   int ei;
 };
 
 /**

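The field order matters because the rasterization code below fetches an entire plane with a single 16-byte vector load, which picks up only the first four ints of the struct; eo therefore has to precede ei. A minimal sketch of the dependency (the same load appears verbatim in lp_rast_tri.c below):

   /* one unaligned 16-byte load yields c, dcdx, dcdy, eo in lane order */
   __m128i p0 = _mm_loadu_si128((__m128i *)&plane[0]);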
src/gallium/drivers/llvmpipe/lp_rast_tri.c

@@ -32,6 +32,7 @@
#include <limits.h>
#include "util/u_math.h"
#include "lp_debug.h"
#include "lp_debug_intrin.h"
#include "lp_perf.h"
#include "lp_rast_priv.h"
#include "lp_tile_soa.h"
@@ -254,8 +255,8 @@ sign_bits4(const __m128i *cstep, int cdiff)
 #define TAG(x) x##_3
 #define NR_PLANES 3
-#define TRI_4 lp_rast_triangle_3_4
-#define TRI_16 lp_rast_triangle_3_16
+/*#define TRI_4 lp_rast_triangle_3_4*/
+/*#define TRI_16 lp_rast_triangle_3_16*/
 #include "lp_rast_tri_tmp.h"
 
 #define TAG(x) x##_4
@@ -279,3 +280,262 @@ sign_bits4(const __m128i *cstep, int cdiff)
#define NR_PLANES 8
#include "lp_rast_tri_tmp.h"
static INLINE void
transpose4_epi32(__m128i a,
                 __m128i b,
                 __m128i c,
                 __m128i d,
                 __m128i *o,
                 __m128i *p,
                 __m128i *q,
                 __m128i *r)
{
   __m128i t0 = _mm_unpacklo_epi32(a, b);
   __m128i t1 = _mm_unpacklo_epi32(c, d);
   __m128i t2 = _mm_unpackhi_epi32(a, b);
   __m128i t3 = _mm_unpackhi_epi32(c, d);

   *o = _mm_unpacklo_epi64(t0, t1);
   *p = _mm_unpackhi_epi64(t0, t1);
   *q = _mm_unpacklo_epi64(t2, t3);
   *r = _mm_unpackhi_epi64(t2, t3);
}
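/* A worked example of the transpose (hypothetical lane values): with rows
 * a = {a0,a1,a2,a3} through d = {d0,d1,d2,d3},
 *
 *    t0 = _mm_unpacklo_epi32(a, b)   = {a0,b0,a1,b1}
 *    t1 = _mm_unpacklo_epi32(c, d)   = {c0,d0,c1,d1}
 *    *o = _mm_unpacklo_epi64(t0, t1) = {a0,b0,c0,d0}
 *
 * so four plane loads {c,dcdx,dcdy,eo} come out as per-field vectors,
 * one lane per plane.
 */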
#define SCALAR_EPI32(m, i) _mm_shuffle_epi32((m), _MM_SHUFFLE(i,i,i,i))
#define NR_PLANES 3
/* Provide an SSE2 implementation of _mm_mullo_epi32() in terms of
 * _mm_mul_epu32().
 *
 * I suspect this works fine for us because one of our operands is
 * always positive, but not sure that this can be used for general
 * signed integer multiplication.
 *
 * This seems close enough to the speed of SSE4 and the real
 * _mm_mullo_epi32() intrinsic as to not justify adding an sse4
 * dependency at this point.
 */
static INLINE __m128i mm_mullo_epi32(const __m128i a, const __m128i b)
{
   __m128i a4   = _mm_srli_si128(a, 4);  /* shift by one dword */
   __m128i b4   = _mm_srli_si128(b, 4);  /* shift by one dword */
   __m128i ba   = _mm_mul_epu32(b, a);   /* multiply dwords 0, 2 */
   __m128i b4a4 = _mm_mul_epu32(b4, a4); /* multiply dwords 1, 3 */

   /* Interleave the results, either with shuffles or (slightly
    * faster) direct bit operations:
    */
#if 0
   __m128i ba8    = _mm_shuffle_epi32(ba, 8);
   __m128i b4a48  = _mm_shuffle_epi32(b4a4, 8);
   __m128i result = _mm_unpacklo_epi32(ba8, b4a48);
#else
   __m128i mask            = _mm_setr_epi32(~0,0,~0,0);
   __m128i ba_mask         = _mm_and_si128(ba, mask);
   __m128i b4a4_mask       = _mm_and_si128(b4a4, mask);
   __m128i b4a4_mask_shift = _mm_slli_si128(b4a4_mask, 4);
   __m128i result          = _mm_or_si128(ba_mask, b4a4_mask_shift);
#endif

   return result;
}
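/* A quick sanity check of mm_mullo_epi32 (hypothetical values):
 *
 *    __m128i a = _mm_setr_epi32(-3, 5, -7, 11);
 *    __m128i b = _mm_setr_epi32( 2, 4,  6,  8);
 *    mm_mullo_epi32(a, b);            // yields {-6, 20, -42, 88}
 *
 * The low 32 bits of each unsigned 64-bit product are congruent to the
 * signed product mod 2^32, so the low halves recovered here are exact.
 */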
void
lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
                      const union lp_rast_cmd_arg arg)
{
   const struct lp_rast_triangle *tri = arg.triangle.tri;
   const struct lp_rast_plane *plane = tri->plane;
   /* The low/high byte of plane_mask carries this 16x16 block's offset
    * within the tile:
    */
   int x = (arg.triangle.plane_mask & 0xff) + task->x;
   int y = (arg.triangle.plane_mask >> 8) + task->y;
   unsigned i, j;

   struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
   unsigned nr = 0;

   __m128i p0 = _mm_loadu_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */
   __m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */
   __m128i p2 = _mm_loadu_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */
   __m128i zero = _mm_setzero_si128();

   __m128i c;
   __m128i dcdx;
   __m128i dcdy;
   __m128i rej4;

   __m128i dcdx2;
   __m128i dcdx3;

   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
   __m128i unused;

   transpose4_epi32(p0, p1, p2, zero,
                    &c, &dcdx, &dcdy, &rej4);

   /* Negate dcdx:
    */
   dcdx = _mm_sub_epi32(zero, dcdx);

   /* Move the evaluation point to this block's position:
    */
   c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
   c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));

   /* Scale eo by 4 to get the trivial-reject offset for a 4x4 block:
    */
   rej4 = _mm_slli_epi32(rej4, 2);

   dcdx2 = _mm_add_epi32(dcdx, dcdx);
   dcdx3 = _mm_add_epi32(dcdx2, dcdx);

   transpose4_epi32(zero, dcdx, dcdx2, dcdx3,
                    &span_0, &span_1, &span_2, &unused);

   /* Walk the 16x16 area as a 4x4 grid of 4x4 pixel blocks:
    */
   for (i = 0; i < 4; i++) {
      __m128i cx = c;

      for (j = 0; j < 4; j++) {
         __m128i c4rej = _mm_add_epi32(cx, rej4);
         __m128i rej_masks = _mm_srai_epi32(c4rej, 31);

         /* Visit the block only if no plane trivially rejects it.
          *
          * if (is_zero(rej_masks))
          */
         if (_mm_movemask_epi8(rej_masks) == 0) {
            __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(cx, 0), span_0);
            __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(cx, 1), span_1);
            __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(cx, 2), span_2);

            __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);

            __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
            __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
            __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));

            __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
            __m128i c_01 = _mm_packs_epi32(c_0, c_1);

            __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
            __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
            __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));

            __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);

            __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
            __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
            __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));

            __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
            __m128i c_23 = _mm_packs_epi32(c_2, c_3);
            __m128i c_0123 = _mm_packs_epi16(c_01, c_23);

            /* Gather the 16 sign bits; a set bit marks a pixel outside
             * some edge.  Record the block unless it is entirely outside:
             */
            unsigned mask = _mm_movemask_epi8(c_0123);

            out[nr].i = i;
            out[nr].j = j;
            out[nr].mask = mask;
            if (mask != 0xffff)
               nr++;
         }

         cx = _mm_add_epi32(cx, _mm_slli_epi32(dcdx, 2));
      }

      c = _mm_add_epi32(c, _mm_slli_epi32(dcdy, 2));
   }

   for (i = 0; i < nr; i++)
      lp_rast_shade_quads_mask(task,
                               &tri->inputs,
                               x + 4 * out[i].j,
                               y + 4 * out[i].i,
                               0xffff & ~out[i].mask);
}
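/* The mask convention used above (hypothetical usage sketch): each bit of
 * out[i].mask is the sign bit of a packed edge value, so a set bit marks a
 * pixel outside the triangle, and the complement selects what to shade:
 *
 *    unsigned inside = 0xffff & ~out[i].mask;   // one bit per pixel
 */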
void
lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
                     const union lp_rast_cmd_arg arg)
{
   const struct lp_rast_triangle *tri = arg.triangle.tri;
   const struct lp_rast_plane *plane = tri->plane;
   int x = (arg.triangle.plane_mask & 0xff) + task->x;
   int y = (arg.triangle.plane_mask >> 8) + task->y;

   __m128i p0 = _mm_loadu_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */
   __m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */
   __m128i p2 = _mm_loadu_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */
   __m128i zero = _mm_setzero_si128();

   __m128i c;
   __m128i dcdx;
   __m128i dcdy;

   __m128i dcdx2;
   __m128i dcdx3;

   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
   __m128i unused;

   transpose4_epi32(p0, p1, p2, zero,
                    &c, &dcdx, &dcdy, &unused);

   /* Negate dcdx:
    */
   dcdx = _mm_sub_epi32(zero, dcdx);

   /* Move the evaluation point to this block's position, using the SSE2
    * mm_mullo_epi32() replacement here too so this path carries no sse4
    * dependency:
    */
   c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
   c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));

   dcdx2 = _mm_add_epi32(dcdx, dcdx);
   dcdx3 = _mm_add_epi32(dcdx2, dcdx);

   transpose4_epi32(zero, dcdx, dcdx2, dcdx3,
                    &span_0, &span_1, &span_2, &unused);

   {
      /* Evaluate all three edge functions over the 4x4 block, one row at
       * a time, OR-ing the results so a sign bit means "outside at least
       * one edge":
       */
      __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(c, 0), span_0);
      __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(c, 1), span_1);
      __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(c, 2), span_2);

      __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);

      __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
      __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
      __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));

      __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
      __m128i c_01 = _mm_packs_epi32(c_0, c_1);

      __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
      __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
      __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));

      __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);

      __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
      __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
      __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));

      __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
      __m128i c_23 = _mm_packs_epi32(c_2, c_3);
      __m128i c_0123 = _mm_packs_epi16(c_01, c_23);

      unsigned mask = _mm_movemask_epi8(c_0123);

      if (mask != 0xffff)
         lp_rast_shade_quads_mask(task,
                                  &tri->inputs,
                                  x,
                                  y,
                                  0xffff & ~mask);
   }
}