nir/glsl: Add another way of doing lower_imul64 for gen8+
On Gen 8 and 9, the "mul" instruction supports a 64-bit destination type. We can reduce our 64x64 int multiplication from 4 instructions to 3. Also, instead of emitting two mul instructions, we can emit a single mul instruction and extract the low/high 32 bits from the 64-bit result for [i/u]mulExtended. v2: 1) Allow lower_mul_high64 to use new opcode (Jason Ekstrand) 2) Add lower_mul_2x32_64 flag (Matt Turner) 3) Remove associative property as bit size is different (Connor Abbott) v3: Fix indentation and variable naming convention (Jason Ekstrand) Signed-off-by: Sagar Ghuge <sagar.ghuge@intel.com> Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
This commit is contained in:
parent
1d363d440f
commit
e551040c60
|
@ -2118,6 +2118,7 @@ typedef enum {
|
|||
nir_lower_logic64 = (1 << 9),
|
||||
nir_lower_minmax64 = (1 << 10),
|
||||
nir_lower_shift64 = (1 << 11),
|
||||
nir_lower_imul_2x32_64 = (1 << 12),
|
||||
} nir_lower_int64_options;
|
||||
|
||||
typedef enum {
|
||||
|
@ -2259,6 +2260,9 @@ typedef struct nir_shader_compiler_options {
|
|||
*/
|
||||
bool use_interpolated_input_intrinsics;
|
||||
|
||||
/* Lower imul_2x32_64/umul_2x32_64 when 32x32->64 bit multiplication is not supported */
|
||||
bool lower_mul_2x32_64;
|
||||
|
||||
unsigned max_unroll_iterations;
|
||||
|
||||
nir_lower_int64_options lower_int64_options;
|
||||
|
|
|
@ -383,6 +383,16 @@ lower_imin64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
|
|||
return nir_bcsel(b, lower_int64_compare(b, nir_op_ilt, x, y), x, y);
|
||||
}
|
||||
|
||||
/* Build the 64-bit result of an imul_2x32_64/umul_2x32_64 out of two
 * separate multiplies: nir_imul() supplies the low half and
 * nir_imul_high() (when sign_extend is true) or nir_umul_high() (when it is
 * false) supplies the high half.  The two halves are then combined with
 * nir_pack_64_2x32_split(low, high).
 */
static nir_ssa_def *
|
||||
lower_mul_2x32_64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y,
|
||||
bool sign_extend)
|
||||
{
|
||||
nir_ssa_def *res_hi = sign_extend ? nir_imul_high(b, x, y)
|
||||
: nir_umul_high(b, x, y);
|
||||
|
||||
return nir_pack_64_2x32_split(b, nir_imul(b, x, y), res_hi);
|
||||
}
|
||||
|
||||
static nir_ssa_def *
|
||||
lower_imul64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
|
||||
{
|
||||
|
@ -391,12 +401,13 @@ lower_imul64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
|
|||
nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
|
||||
nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);
|
||||
|
||||
nir_ssa_def *res_lo = nir_imul(b, x_lo, y_lo);
|
||||
nir_ssa_def *res_hi = nir_iadd(b, nir_umul_high(b, x_lo, y_lo),
|
||||
nir_ssa_def *mul_lo = nir_umul_2x32_64(b, x_lo, y_lo);
|
||||
nir_ssa_def *res_hi = nir_iadd(b, nir_unpack_64_2x32_split_y(b, mul_lo),
|
||||
nir_iadd(b, nir_imul(b, x_lo, y_hi),
|
||||
nir_imul(b, x_hi, y_lo)));
|
||||
|
||||
return nir_pack_64_2x32_split(b, res_lo, res_hi);
|
||||
return nir_pack_64_2x32_split(b, nir_unpack_64_2x32_split_x(b, mul_lo),
|
||||
res_hi);
|
||||
}
|
||||
|
||||
static nir_ssa_def *
|
||||
|
@ -441,9 +452,8 @@ lower_mul_high64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y,
|
|||
* so we're guaranteed that we can add in two more 32-bit values
|
||||
* without overflowing tmp.
|
||||
*/
|
||||
nir_ssa_def *tmp =
|
||||
nir_pack_64_2x32_split(b, nir_imul(b, x32[i], y32[j]),
|
||||
nir_umul_high(b, x32[i], y32[j]));
|
||||
nir_ssa_def *tmp = nir_umul_2x32_64(b, x32[i], y32[i]);
|
||||
|
||||
if (res[i + j])
|
||||
tmp = nir_iadd(b, tmp, nir_u2u64(b, res[i + j]));
|
||||
if (carry)
|
||||
|
@ -626,6 +636,9 @@ opcode_to_options_mask(nir_op opcode)
|
|||
switch (opcode) {
|
||||
case nir_op_imul:
|
||||
return nir_lower_imul64;
|
||||
case nir_op_imul_2x32_64:
|
||||
case nir_op_umul_2x32_64:
|
||||
return nir_lower_imul_2x32_64;
|
||||
case nir_op_imul_high:
|
||||
case nir_op_umul_high:
|
||||
return nir_lower_imul_high64;
|
||||
|
@ -688,6 +701,10 @@ lower_int64_alu_instr(nir_builder *b, nir_alu_instr *alu)
|
|||
switch (alu->op) {
|
||||
case nir_op_imul:
|
||||
return lower_imul64(b, src[0], src[1]);
|
||||
case nir_op_imul_2x32_64:
|
||||
return lower_mul_2x32_64(b, src[0], src[1], true);
|
||||
case nir_op_umul_2x32_64:
|
||||
return lower_mul_2x32_64(b, src[0], src[1], false);
|
||||
case nir_op_imul_high:
|
||||
return lower_mul_high64(b, src[0], src[1], true);
|
||||
case nir_op_umul_high:
|
||||
|
|
|
@ -475,6 +475,12 @@ binop("fmul", tfloat, commutative + associative, "src0 * src1")
|
|||
# low 32-bits of signed/unsigned integer multiply
|
||||
binop("imul", tint, commutative + associative, "src0 * src1")
|
||||
|
||||
# Generate a 64-bit result from two 32-bit quantities
|
||||
binop_convert("imul_2x32_64", tint64, tint32, commutative,
|
||||
"(int64_t)src0 * (int64_t)src1")
|
||||
binop_convert("umul_2x32_64", tuint64, tuint32, commutative,
|
||||
"(uint64_t)src0 * (uint64_t)src1")
|
||||
|
||||
# high 32-bits of signed integer multiply
|
||||
binop("imul_high", tint, commutative, """
|
||||
if (bit_size == 64) {
|
||||
|
|
|
@ -70,6 +70,8 @@ optimizations = [
|
|||
|
||||
(('imul', a, '#b@32(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b))),
|
||||
(('imul', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b))))),
|
||||
(('imul_2x32_64', a, b), ('pack_64_2x32_split', ('imul', a, b), ('imul_high', a, b)), 'options->lower_mul_2x32_64'),
|
||||
(('umul_2x32_64', a, b), ('pack_64_2x32_split', ('imul', a, b), ('umul_high', a, b)), 'options->lower_mul_2x32_64'),
|
||||
(('udiv', a, 1), a),
|
||||
(('idiv', a, 1), a),
|
||||
(('umod', a, 1), 0),
|
||||
|
|
|
@ -171,6 +171,13 @@ brw_compiler_create(void *mem_ctx, const struct gen_device_info *devinfo)
|
|||
fp64_options |= nir_lower_fp64_full_software;
|
||||
}
|
||||
|
||||
/* The Bspec's section titled "Instruction_multiply[DevBDW+]" claims that
|
||||
* destination type can be Quadword and source type Doubleword for Gen8 and
|
||||
* Gen9. So, lower 64 bit multiply instruction on rest of the platforms.
|
||||
*/
|
||||
if (devinfo->gen < 8 || devinfo->gen > 9)
|
||||
int64_options |= nir_lower_imul_2x32_64;
|
||||
|
||||
/* We want the GLSL compiler to emit code that uses condition codes */
|
||||
for (int i = 0; i < MESA_SHADER_STAGES; i++) {
|
||||
compiler->glsl_compiler_options[i].MaxUnrollIterations = 0;
|
||||
|
|
|
@ -1055,6 +1055,11 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
|
|||
inst->saturate = instr->dest.saturate;
|
||||
break;
|
||||
|
||||
case nir_op_imul_2x32_64:
|
||||
case nir_op_umul_2x32_64:
|
||||
bld.MUL(result, op[0], op[1]);
|
||||
break;
|
||||
|
||||
case nir_op_imul:
|
||||
assert(nir_dest_bit_size(instr->dest.dest) < 64);
|
||||
bld.MUL(result, op[0], op[1]);
|
||||
|
|
Loading…
Reference in New Issue