i965/fs: Improve accuracy of dFdy() to match dFdx().
Previously, we computed dFdy() using the following instruction: add(8) dst<1>F src<4,4,0)F -src.2<4,4,0>F { align1 1Q } That had the disadvantage that it computed the same value for all 4 pixels of a 2x2 subspan, which meant that it was less accurate than dFdx(). This patch changes it to the following instruction when c->key.high_quality_derivatives is set: add(8) dst<1>F src<4,4,1>.xyxyF -src<4,4,1>.zwzwF { align16 1Q } This gives it comparable accuracy to dFdx(). Unfortunately, align16 instructions can't be compressed, so in SIMD16 shaders, instead of emitting this instruction: add(16) dst<1>F src<4,4,1>.xyxyF -src<4,4,1>.zwzwF { align16 1H } We need to unroll to two instructions: add(8) dst<1>F src<4,4,1>.xyxyF -src<4,4,1>.zwzwF { align16 1Q } add(8) (dst+1)<1>F (src+1)<4,4,1>.xyxyF -(src+1)<4,4,1>.zwzwF { align16 2Q } Fixes piglit test spec/glsl-1.10/execution/fs-dfdy-accuracy. Acked-by: Chris Forbes <chrisf@ijw.co.nz> Reviewed-by: Eric Anholt <eric@anholt.net>
This commit is contained in:
parent
9267565ee4
commit
800610f9eb
|
@ -568,10 +568,8 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
|
|||
* sample_d. On at least Haswell, sample_d instruction does some
|
||||
* optimizations if the same LOD is used for all pixels in the subspan.
|
||||
*
|
||||
* For DDY, it's harder, as we want to produce the pairs swizzled between each
|
||||
* other. We could probably do it like ddx and swizzle the right order later,
|
||||
* but bail for now and just produce
|
||||
* ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
|
||||
* For DDY, we need to use ALIGN16 mode since it's capable of doing the
|
||||
* appropriate swizzling.
|
||||
*/
|
||||
void
|
||||
fs_generator::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
|
||||
|
@ -612,22 +610,65 @@ void
|
|||
fs_generator::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src,
|
||||
bool negate_value)
|
||||
{
|
||||
struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
|
||||
BRW_REGISTER_TYPE_F,
|
||||
BRW_VERTICAL_STRIDE_4,
|
||||
BRW_WIDTH_4,
|
||||
BRW_HORIZONTAL_STRIDE_0,
|
||||
BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
|
||||
struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
|
||||
BRW_REGISTER_TYPE_F,
|
||||
BRW_VERTICAL_STRIDE_4,
|
||||
BRW_WIDTH_4,
|
||||
BRW_HORIZONTAL_STRIDE_0,
|
||||
BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
|
||||
if (negate_value)
|
||||
brw_ADD(p, dst, src1, negate(src0));
|
||||
else
|
||||
brw_ADD(p, dst, src0, negate(src1));
|
||||
if (c->key.high_quality_derivatives) {
|
||||
/* produce accurate derivatives */
|
||||
struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
|
||||
BRW_REGISTER_TYPE_F,
|
||||
BRW_VERTICAL_STRIDE_4,
|
||||
BRW_WIDTH_4,
|
||||
BRW_HORIZONTAL_STRIDE_1,
|
||||
BRW_SWIZZLE_XYXY, WRITEMASK_XYZW);
|
||||
struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
|
||||
BRW_REGISTER_TYPE_F,
|
||||
BRW_VERTICAL_STRIDE_4,
|
||||
BRW_WIDTH_4,
|
||||
BRW_HORIZONTAL_STRIDE_1,
|
||||
BRW_SWIZZLE_ZWZW, WRITEMASK_XYZW);
|
||||
brw_push_insn_state(p);
|
||||
brw_set_access_mode(p, BRW_ALIGN_16);
|
||||
brw_set_compression_control(p, BRW_COMPRESSION_NONE);
|
||||
if (negate_value)
|
||||
brw_ADD(p, dst, src1, negate(src0));
|
||||
else
|
||||
brw_ADD(p, dst, src0, negate(src1));
|
||||
if (dispatch_width == 16) {
|
||||
/* From page 340 of the i965 PRM:
|
||||
*
|
||||
* "A compressed instruction must be in Align1 access
|
||||
* mode. Align16 mode instructions cannot be compressed."
|
||||
*
|
||||
* Therefore, when doing a 16-wide dispatch, we need to manually
|
||||
* unroll to two instructions.
|
||||
*/
|
||||
brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
|
||||
src0 = sechalf(src0);
|
||||
src1 = sechalf(src1);
|
||||
dst = sechalf(dst);
|
||||
if (negate_value)
|
||||
brw_ADD(p, dst, src1, negate(src0));
|
||||
else
|
||||
brw_ADD(p, dst, src0, negate(src1));
|
||||
}
|
||||
brw_pop_insn_state(p);
|
||||
} else {
|
||||
/* replicate the derivative at the top-left pixel to other pixels */
|
||||
struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
|
||||
BRW_REGISTER_TYPE_F,
|
||||
BRW_VERTICAL_STRIDE_4,
|
||||
BRW_WIDTH_4,
|
||||
BRW_HORIZONTAL_STRIDE_0,
|
||||
BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
|
||||
struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
|
||||
BRW_REGISTER_TYPE_F,
|
||||
BRW_VERTICAL_STRIDE_4,
|
||||
BRW_WIDTH_4,
|
||||
BRW_HORIZONTAL_STRIDE_0,
|
||||
BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
|
||||
if (negate_value)
|
||||
brw_ADD(p, dst, src1, negate(src0));
|
||||
else
|
||||
brw_ADD(p, dst, src0, negate(src1));
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
|
|
|
@ -77,6 +77,7 @@ extern "C" {
|
|||
#define BRW_SWIZZLE_ZZZZ BRW_SWIZZLE4(2,2,2,2)
|
||||
#define BRW_SWIZZLE_WWWW BRW_SWIZZLE4(3,3,3,3)
|
||||
#define BRW_SWIZZLE_XYXY BRW_SWIZZLE4(0,1,0,1)
|
||||
#define BRW_SWIZZLE_ZWZW BRW_SWIZZLE4(2,3,2,3)
|
||||
|
||||
static inline bool
|
||||
brw_is_single_value_swizzle(int swiz)
|
||||
|
|
Loading…
Reference in New Issue