i965/fs/gen7: Emit code for GLSL 3.00 pack/unpack operations (v4)

v2: Remove lewd comment. [for idr]
v3: - Optimize away tmp register for packHalf2x16. [for anholt, paul]
    - Improve comments. [for anholt, paul]
    - Reduce near-duplicate code by removing vec4_visitor emit_pack/unpack
      methods. [for chadv]
v4: Factor our UD/W register conversion into helper function. [for anholt]

Reviewed-by: Eric Anholt <eric@anholt.net>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> (v2)
Signed-off-by: Chad Versace <chad.versace@linux.intel.com>
This commit is contained in:
Chad Versace 2013-01-09 11:46:42 -08:00
parent 203c12b18f
commit 20dfa501b3
5 changed files with 144 additions and 3 deletions

View File

@ -726,6 +726,9 @@ enum opcode {
FS_OPCODE_MOV_DISPATCH_TO_FLAGS,
FS_OPCODE_DISCARD_JUMP,
FS_OPCODE_SET_GLOBAL_OFFSET,
FS_OPCODE_PACK_HALF_2x16_SPLIT,
FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X,
FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y,
VS_OPCODE_URB_WRITE,
VS_OPCODE_SCRATCH_READ,

View File

@ -542,6 +542,14 @@ private:
struct brw_reg offset);
void generate_discard_jump(fs_inst *inst);
void generate_pack_half_2x16_split(fs_inst *inst,
struct brw_reg dst,
struct brw_reg x,
struct brw_reg y);
void generate_unpack_half_2x16_split(fs_inst *inst,
struct brw_reg dst,
struct brw_reg src);
void patch_discard_jumps_to_fb_writes();
struct brw_context *brw;

View File

@ -342,9 +342,21 @@ ir_channel_expressions_visitor::visit_leave(ir_assignment *ir)
assert(!"not yet supported");
break;
case ir_unop_pack_snorm_2x16:
case ir_unop_pack_unorm_2x16:
case ir_unop_pack_half_2x16:
case ir_unop_unpack_snorm_2x16:
case ir_unop_unpack_unorm_2x16:
case ir_unop_unpack_half_2x16:
case ir_quadop_vector:
assert(!"should have been lowered");
break;
case ir_unop_unpack_half_2x16_split_x:
case ir_unop_unpack_half_2x16_split_y:
case ir_binop_pack_half_2x16_split:
assert("!not reached: expression operates on scalars only");
break;
}
ir->remove();

View File

@ -922,6 +922,95 @@ fs_generator::generate_set_global_offset(fs_inst *inst,
brw_pop_insn_state(p);
}
/**
* Change the register's data type from UD to W, doubling the strides in order
* to compensate for halving the data type width.
*/
static struct brw_reg
ud_reg_to_w(struct brw_reg r)
{
assert(r.type == BRW_REGISTER_TYPE_UD);
r.type = BRW_REGISTER_TYPE_W;
/* The BRW_*_STRIDE enums are defined so that incrementing the field
* doubles the real stride.
*/
if (r.hstride != 0)
++r.hstride;
if (r.vstride != 0)
++r.vstride;
return r;
}
void
fs_generator::generate_pack_half_2x16_split(fs_inst *inst,
struct brw_reg dst,
struct brw_reg x,
struct brw_reg y)
{
assert(intel->gen >= 7);
assert(dst.type == BRW_REGISTER_TYPE_UD);
assert(x.type = BRW_REGISTER_TYPE_F);
assert(y.type = BRW_REGISTER_TYPE_F);
/* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
*
* Because this instruction does not have a 16-bit floating-point type,
* the destination data type must be Word (W).
*
* The destination must be DWord-aligned and specify a horizontal stride
* (HorzStride) of 2. The 16-bit result is stored in the lower word of
* each destination channel and the upper word is not modified.
*/
struct brw_reg dst_w = ud_reg_to_w(dst);
/* Give each 32-bit channel of dst the form below , where "." means
* unchanged.
* 0x....hhhh
*/
brw_F32TO16(p, dst_w, y);
/* Now the form:
* 0xhhhh0000
*/
brw_SHL(p, dst, dst, brw_imm_ud(16u));
/* And, finally the form of packHalf2x16's output:
* 0xhhhhllll
*/
brw_F32TO16(p, dst_w, x);
}
void
fs_generator::generate_unpack_half_2x16_split(fs_inst *inst,
struct brw_reg dst,
struct brw_reg src)
{
assert(intel->gen >= 7);
assert(dst.type == BRW_REGISTER_TYPE_F);
assert(src.type == BRW_REGISTER_TYPE_UD);
/* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
*
* Because this instruction does not have a 16-bit floating-point type,
* the source data type must be Word (W). The destination type must be
* F (Float).
*/
struct brw_reg src_w = ud_reg_to_w(src);
/* Each channel of src has the form of unpackHalf2x16's input: 0xhhhhllll.
* For the Y case, we wish to access only the upper word; therefore
* a 16-bit subregister offset is needed.
*/
assert(inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X ||
inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y);
if (inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y)
src.subnr += 2;
brw_F16TO32(p, dst, src_w);
}
void
fs_generator::generate_code(exec_list *instructions)
{
@ -1082,7 +1171,12 @@ fs_generator::generate_code(exec_list *instructions)
case BRW_OPCODE_SHL:
brw_SHL(p, dst, src[0], src[1]);
break;
case BRW_OPCODE_F32TO16:
brw_F32TO16(p, dst, src[0]);
break;
case BRW_OPCODE_F16TO32:
brw_F16TO32(p, dst, src[0]);
break;
case BRW_OPCODE_CMP:
brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
break;
@ -1229,6 +1323,15 @@ fs_generator::generate_code(exec_list *instructions)
generate_set_global_offset(inst, dst, src[0], src[1]);
break;
case FS_OPCODE_PACK_HALF_2x16_SPLIT:
generate_pack_half_2x16_split(inst, dst, src[0], src[1]);
break;
case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X:
case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y:
generate_unpack_half_2x16_split(inst, dst, src[0]);
break;
default:
if (inst->opcode < (int) ARRAY_SIZE(opcode_descs)) {
_mesa_problem(ctx, "Unsupported opcode `%s' in FS",

View File

@ -538,7 +538,20 @@ fs_visitor::visit(ir_expression *ir)
BRW_CONDITIONAL_L : BRW_CONDITIONAL_GE,
this->result, op[0], op[1]);
break;
case ir_unop_pack_snorm_2x16:
case ir_unop_pack_unorm_2x16:
case ir_unop_unpack_snorm_2x16:
case ir_unop_unpack_unorm_2x16:
case ir_unop_unpack_half_2x16:
case ir_unop_pack_half_2x16:
assert(!"not reached: should be handled by lower_packing_builtins");
break;
case ir_unop_unpack_half_2x16_split_x:
emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X, this->result, op[0]);
break;
case ir_unop_unpack_half_2x16_split_y:
emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y, this->result, op[0]);
break;
case ir_binop_pow:
emit_math(SHADER_OPCODE_POW, this->result, op[0], op[1]);
break;
@ -566,7 +579,9 @@ fs_visitor::visit(ir_expression *ir)
else
inst = emit(SHR(this->result, op[0], op[1]));
break;
case ir_binop_pack_half_2x16_split:
emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, this->result, op[0], op[1]);
break;
case ir_binop_ubo_load:
/* This IR node takes a constant uniform block and a constant or
* variable byte offset within the block and loads a vector from that.