r300/compiler: make lowering passes possibly use up to two less temps

CMP may now use two less temps, other non-native instructions may end up
using one less temp, except for SIN/COS/SCS, which I am leaving unchanged
for now.

This may reduce register pressure inside loops, because the register
allocator doesn't do a very good job there.
This commit is contained in:
Marek Olšák 2010-12-07 21:57:18 +01:00
parent 93f2df0760
commit 2ff9d4474b
1 changed files with 86 additions and 63 deletions

View File

@ -85,16 +85,6 @@ static struct rc_instruction *emit3(
return fpi;
}
static struct rc_dst_register dstreg(int file, int index)
{
struct rc_dst_register dst;
dst.File = file;
dst.Index = index;
dst.WriteMask = RC_MASK_XYZW;
dst.RelAddr = 0;
return dst;
}
static struct rc_dst_register dstregtmpmask(int index, int mask)
{
struct rc_dst_register dst = {0};
@ -187,6 +177,38 @@ static struct rc_src_register swizzle_wwww(struct rc_src_register reg)
return swizzle_smear(reg, RC_SWIZZLE_W);
}
static int is_dst_safe_to_reuse(struct rc_instruction *inst)
{
const struct rc_opcode_info *info = rc_get_opcode_info(inst->U.I.Opcode);
unsigned i;
assert(info->HasDstReg);
if (inst->U.I.DstReg.File != RC_FILE_TEMPORARY)
return 0;
for (i = 0; i < info->NumSrcRegs; i++) {
if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY &&
inst->U.I.SrcReg[i].Index == inst->U.I.DstReg.Index)
return 0;
}
return 1;
}
static struct rc_dst_register try_to_reuse_dst(struct radeon_compiler *c,
struct rc_instruction *inst)
{
unsigned tmp;
if (is_dst_safe_to_reuse(inst))
tmp = inst->U.I.DstReg.Index;
else
tmp = rc_find_free_temporary(c);
return dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask);
}
static void transform_ABS(struct radeon_compiler* c,
struct rc_instruction* inst)
{
@ -210,10 +232,10 @@ static void transform_CEIL(struct radeon_compiler* c,
* ceil(x) = x+frac(-x)
*/
int tempreg = rc_find_free_temporary(c);
emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstreg(RC_FILE_TEMPORARY, tempreg), negate(inst->U.I.SrcReg[0]));
struct rc_dst_register dst = try_to_reuse_dst(c, inst);
emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, negate(inst->U.I.SrcReg[0]));
emit2(c, inst->Prev, RC_OPCODE_ADD, inst->U.I.SaturateMode, inst->U.I.DstReg,
inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, tempreg));
inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, dst.Index));
rc_remove_instruction(inst);
}
@ -225,11 +247,11 @@ static void transform_CLAMP(struct radeon_compiler *c,
* MIN tmp, src, max
* MAX dst, tmp, min
*/
int tempreg = rc_find_free_temporary(c);
emit2(c, inst->Prev, RC_OPCODE_MIN, 0, dstreg(RC_FILE_TEMPORARY, tempreg),
struct rc_dst_register dst = try_to_reuse_dst(c, inst);
emit2(c, inst->Prev, RC_OPCODE_MIN, 0, dst,
inst->U.I.SrcReg[0], inst->U.I.SrcReg[2]);
emit2(c, inst->Prev, RC_OPCODE_MAX, inst->U.I.SaturateMode, inst->U.I.DstReg,
srcreg(RC_FILE_TEMPORARY, tempreg), inst->U.I.SrcReg[1]);
srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[1]);
rc_remove_instruction(inst);
}
@ -275,10 +297,10 @@ static void transform_DST(struct radeon_compiler* c,
static void transform_FLR(struct radeon_compiler* c,
struct rc_instruction* inst)
{
int tempreg = rc_find_free_temporary(c);
emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstreg(RC_FILE_TEMPORARY, tempreg), inst->U.I.SrcReg[0]);
struct rc_dst_register dst = try_to_reuse_dst(c, inst);
emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, inst->U.I.SrcReg[0]);
emit2(c, inst->Prev, RC_OPCODE_ADD, inst->U.I.SaturateMode, inst->U.I.DstReg,
inst->U.I.SrcReg[0], negate(srcreg(RC_FILE_TEMPORARY, tempreg)));
inst->U.I.SrcReg[0], negate(srcreg(RC_FILE_TEMPORARY, dst.Index)));
rc_remove_instruction(inst);
}
@ -368,14 +390,14 @@ static void transform_LIT(struct radeon_compiler* c,
static void transform_LRP(struct radeon_compiler* c,
struct rc_instruction* inst)
{
int tempreg = rc_find_free_temporary(c);
struct rc_dst_register dst = try_to_reuse_dst(c, inst);
emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
dstreg(RC_FILE_TEMPORARY, tempreg),
dst,
inst->U.I.SrcReg[1], negate(inst->U.I.SrcReg[2]));
emit3(c, inst->Prev, RC_OPCODE_MAD, inst->U.I.SaturateMode,
inst->U.I.DstReg,
inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, tempreg), inst->U.I.SrcReg[2]);
inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[2]);
rc_remove_instruction(inst);
}
@ -383,9 +405,8 @@ static void transform_LRP(struct radeon_compiler* c,
static void transform_POW(struct radeon_compiler* c,
struct rc_instruction* inst)
{
int tempreg = rc_find_free_temporary(c);
struct rc_dst_register tempdst = dstreg(RC_FILE_TEMPORARY, tempreg);
struct rc_src_register tempsrc = srcreg(RC_FILE_TEMPORARY, tempreg);
struct rc_dst_register tempdst = try_to_reuse_dst(c, inst);
struct rc_src_register tempsrc = srcreg(RC_FILE_TEMPORARY, tempdst.Index);
tempdst.WriteMask = RC_MASK_W;
tempsrc.Swizzle = RC_SWIZZLE_WWWW;
@ -405,11 +426,11 @@ static void transform_RSQ(struct radeon_compiler* c,
static void transform_SEQ(struct radeon_compiler* c,
struct rc_instruction* inst)
{
int tempreg = rc_find_free_temporary(c);
struct rc_dst_register dst = try_to_reuse_dst(c, inst);
emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dstreg(RC_FILE_TEMPORARY, tempreg), inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
negate(absolute(srcreg(RC_FILE_TEMPORARY, tempreg))), builtin_zero, builtin_one);
negate(absolute(srcreg(RC_FILE_TEMPORARY, dst.Index))), builtin_zero, builtin_one);
rc_remove_instruction(inst);
}
@ -424,11 +445,11 @@ static void transform_SFL(struct radeon_compiler* c,
static void transform_SGE(struct radeon_compiler* c,
struct rc_instruction* inst)
{
int tempreg = rc_find_free_temporary(c);
struct rc_dst_register dst = try_to_reuse_dst(c, inst);
emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dstreg(RC_FILE_TEMPORARY, tempreg), inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
srcreg(RC_FILE_TEMPORARY, tempreg), builtin_zero, builtin_one);
srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_zero, builtin_one);
rc_remove_instruction(inst);
}
@ -436,11 +457,11 @@ static void transform_SGE(struct radeon_compiler* c,
static void transform_SGT(struct radeon_compiler* c,
struct rc_instruction* inst)
{
int tempreg = rc_find_free_temporary(c);
struct rc_dst_register dst = try_to_reuse_dst(c, inst);
emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dstreg(RC_FILE_TEMPORARY, tempreg), negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);
emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);
emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
srcreg(RC_FILE_TEMPORARY, tempreg), builtin_one, builtin_zero);
srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_one, builtin_zero);
rc_remove_instruction(inst);
}
@ -448,11 +469,11 @@ static void transform_SGT(struct radeon_compiler* c,
static void transform_SLE(struct radeon_compiler* c,
struct rc_instruction* inst)
{
int tempreg = rc_find_free_temporary(c);
struct rc_dst_register dst = try_to_reuse_dst(c, inst);
emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dstreg(RC_FILE_TEMPORARY, tempreg), negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);
emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);
emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
srcreg(RC_FILE_TEMPORARY, tempreg), builtin_zero, builtin_one);
srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_zero, builtin_one);
rc_remove_instruction(inst);
}
@ -460,11 +481,11 @@ static void transform_SLE(struct radeon_compiler* c,
static void transform_SLT(struct radeon_compiler* c,
struct rc_instruction* inst)
{
int tempreg = rc_find_free_temporary(c);
struct rc_dst_register dst = try_to_reuse_dst(c, inst);
emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dstreg(RC_FILE_TEMPORARY, tempreg), inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
srcreg(RC_FILE_TEMPORARY, tempreg), builtin_one, builtin_zero);
srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_one, builtin_zero);
rc_remove_instruction(inst);
}
@ -472,11 +493,11 @@ static void transform_SLT(struct radeon_compiler* c,
static void transform_SNE(struct radeon_compiler* c,
struct rc_instruction* inst)
{
int tempreg = rc_find_free_temporary(c);
struct rc_dst_register dst = try_to_reuse_dst(c, inst);
emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dstreg(RC_FILE_TEMPORARY, tempreg), inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
negate(absolute(srcreg(RC_FILE_TEMPORARY, tempreg))), builtin_one, builtin_zero);
negate(absolute(srcreg(RC_FILE_TEMPORARY, dst.Index))), builtin_one, builtin_zero);
rc_remove_instruction(inst);
}
@ -490,12 +511,13 @@ static void transform_SSG(struct radeon_compiler* c,
* CMP tmp1, x, 1, 0
* ADD result, tmp0, -tmp1;
*/
unsigned tmp0, tmp1;
struct rc_dst_register dst0;
unsigned tmp1;
/* 0 < x */
tmp0 = rc_find_free_temporary(c);
dst0 = try_to_reuse_dst(c, inst);
emit3(c, inst->Prev, RC_OPCODE_CMP, 0,
dstregtmpmask(tmp0, inst->U.I.DstReg.WriteMask),
dst0,
negate(inst->U.I.SrcReg[0]),
builtin_one,
builtin_zero);
@ -512,7 +534,7 @@ static void transform_SSG(struct radeon_compiler* c,
/* result = tmp0 - tmp1 */
emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
inst->U.I.DstReg,
srcreg(RC_FILE_TEMPORARY, tmp0),
srcreg(RC_FILE_TEMPORARY, dst0.Index),
negate(srcreg(RC_FILE_TEMPORARY, tmp1)));
rc_remove_instruction(inst);
@ -534,15 +556,15 @@ static void transform_SWZ(struct radeon_compiler* c,
static void transform_XPD(struct radeon_compiler* c,
struct rc_instruction* inst)
{
int tempreg = rc_find_free_temporary(c);
struct rc_dst_register dst = try_to_reuse_dst(c, inst);
emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dstreg(RC_FILE_TEMPORARY, tempreg),
emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dst,
swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_W),
swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_W));
emit3(c, inst->Prev, RC_OPCODE_MAD, inst->U.I.SaturateMode, inst->U.I.DstReg,
swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_W),
swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_W),
negate(srcreg(RC_FILE_TEMPORARY, tempreg)));
negate(srcreg(RC_FILE_TEMPORARY, dst.Index)));
rc_remove_instruction(inst);
}
@ -610,7 +632,7 @@ static void transform_r300_vertex_CMP(struct radeon_compiler* c,
{
/* There is no decent CMP available, so let's rig one up.
* CMP is defined as dst = src0 < 0.0 ? src1 : src2
* The following sequence consumes two temps and two extra slots
* The following sequence consumes zero to two temps and two extra slots
* (the second temp and the second slot is consumed by transform_LRP),
* but should be equivalent:
*
@ -618,18 +640,18 @@ static void transform_r300_vertex_CMP(struct radeon_compiler* c,
* LRP dst, tmp0, src1, src2
*
* Yes, I know, I'm a mad scientist. ~ C. & M. */
int tempreg0 = rc_find_free_temporary(c);
struct rc_dst_register dst = try_to_reuse_dst(c, inst);
/* SLT tmp0, src0, 0.0 */
emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
dstreg(RC_FILE_TEMPORARY, tempreg0),
dst,
inst->U.I.SrcReg[0], builtin_zero);
/* LRP dst, tmp0, src1, src2 */
transform_LRP(c,
emit3(c, inst->Prev, RC_OPCODE_LRP, 0,
inst->U.I.DstReg,
srcreg(RC_FILE_TEMPORARY, tempreg0), inst->U.I.SrcReg[1], inst->U.I.SrcReg[2]));
srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[1], inst->U.I.SrcReg[2]));
rc_remove_instruction(inst);
}
@ -660,7 +682,7 @@ static void transform_r300_vertex_DP3(struct radeon_compiler* c,
static void transform_r300_vertex_fix_LIT(struct radeon_compiler* c,
struct rc_instruction* inst)
{
int tempreg = rc_find_free_temporary(c);
struct rc_dst_register dst = try_to_reuse_dst(c, inst);
unsigned constant_swizzle;
int constant = rc_constants_add_immediate_scalar(&c->Program.Constants,
0.0000000000000000001,
@ -668,16 +690,16 @@ static void transform_r300_vertex_fix_LIT(struct radeon_compiler* c,
/* MOV dst, src */
emit1(c, inst->Prev, RC_OPCODE_MOV, 0,
dstreg(RC_FILE_TEMPORARY, tempreg),
dst,
inst->U.I.SrcReg[0]);
/* MAX dst.z, src, 0.00...001 */
emit2(c, inst->Prev, RC_OPCODE_MAX, 0,
dstregtmpmask(tempreg, RC_MASK_Y),
srcreg(RC_FILE_TEMPORARY, tempreg),
dstregtmpmask(dst.Index, RC_MASK_Y),
srcreg(RC_FILE_TEMPORARY, dst.Index),
srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle));
inst->U.I.SrcReg[0] = srcreg(RC_FILE_TEMPORARY, tempreg);
inst->U.I.SrcReg[0] = srcreg(RC_FILE_TEMPORARY, dst.Index);
}
static void transform_r300_vertex_SEQ(struct radeon_compiler *c,
@ -761,12 +783,13 @@ static void transform_r300_vertex_SSG(struct radeon_compiler* c,
* SLT tmp1, x, 0;
* ADD result, tmp0, -tmp1;
*/
unsigned tmp0, tmp1;
struct rc_dst_register dst0 = try_to_reuse_dst(c, inst);
unsigned tmp1;
/* 0 < x */
tmp0 = rc_find_free_temporary(c);
dst0 = try_to_reuse_dst(c, inst);
emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
dstregtmpmask(tmp0, inst->U.I.DstReg.WriteMask),
dst0,
builtin_zero,
inst->U.I.SrcReg[0]);
@ -781,7 +804,7 @@ static void transform_r300_vertex_SSG(struct radeon_compiler* c,
/* result = tmp0 - tmp1 */
emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
inst->U.I.DstReg,
srcreg(RC_FILE_TEMPORARY, tmp0),
srcreg(RC_FILE_TEMPORARY, dst0.Index),
negate(srcreg(RC_FILE_TEMPORARY, tmp1)));
rc_remove_instruction(inst);