microsoft/compiler: Handle bitfield_insert

This *almost* matches what GLSL wants, except for the handling of large widths. You can see this in the lowering algorithm: (('bitfield_insert', 'base', 'insert', 'offset', 'bits'), ('bcsel', ('ult', 31, 'bits'), 'insert', ('bfi', ('bfm', 'bits', 'offset'), 'insert', 'base')), 'options->lower_bitfield_insert'), DXIL's 'bfi' instruction is an inseparable pairing of NIR's 'bfi(bfm(...), ...)', so we just apply the additional bcsel in the backend. Reviewed-by: Sil Vilerino <sivileri@microsoft.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14624>
2022-01-17 13:20:53 -08:00 · 2022-01-17 13:20:53 -08:00 · 80e782d5ed
parent 8938c6e032
commit 80e782d5ed
2 changed files with 53 additions and 0 deletions
--- a/src/microsoft/compiler/dxil_function.c
+++ b/src/microsoft/compiler/dxil_function.c
@ -41,6 +41,7 @@ static struct  predefined_func_descr predefined_funcs[] = {
 {"dx.op.storeOutput", "v", "iiicO", DXIL_ATTR_KIND_NO_UNWIND},
 {"dx.op.loadInput", "O", "iiici", DXIL_ATTR_KIND_READ_NONE},
 {"dx.op.tertiary", "O", "iOOO", DXIL_ATTR_KIND_READ_NONE},
+{"dx.op.quaternary", "O", "iOOOO", DXIL_ATTR_KIND_READ_NONE},
 {"dx.op.threadId", "i", "ii", DXIL_ATTR_KIND_READ_NONE},
 {"dx.op.threadIdInGroup", "i", "ii", DXIL_ATTR_KIND_READ_NONE},
 {"dx.op.flattenedThreadIdInGroup", "i", "i", DXIL_ATTR_KIND_READ_NONE},
--- a/src/microsoft/compiler/nir_to_dxil.c
+++ b/src/microsoft/compiler/nir_to_dxil.c
@ -224,6 +224,8 @@ enum dxil_intr {

   DXIL_INTR_FMA = 47,

+   DXIL_INTR_BFI = 53,
+
   DXIL_INTR_CREATE_HANDLE = 57,
   DXIL_INTR_CBUFFER_LOAD_LEGACY = 59,

@ -540,6 +542,33 @@ emit_tertiary_call(struct ntd_context *ctx, enum overload_type overload,
   return dxil_emit_call(&ctx->mod, func, args, ARRAY_SIZE(args));
 }

+/*
+ * Emit a call to the "dx.op.quaternary" DXIL function for the given overload.
+ *
+ * The call arguments are the int32 constant for `intr` followed by op0..op3,
+ * in that order. Returns NULL if either the function or the opcode constant
+ * cannot be obtained; otherwise returns whatever dxil_emit_call() yields.
+ */
+static const struct dxil_value *
+emit_quaternary_call(struct ntd_context *ctx, enum overload_type overload,
+                     enum dxil_intr intr,
+                     const struct dxil_value *op0,
+                     const struct dxil_value *op1,
+                     const struct dxil_value *op2,
+                     const struct dxil_value *op3)
+{
+   const struct dxil_func *callee =
+      dxil_get_function(&ctx->mod, "dx.op.quaternary", overload);
+
+   /* Only ask for the opcode constant once the callee lookup succeeded. */
+   const struct dxil_value *intr_id =
+      callee ? dxil_module_get_int32_const(&ctx->mod, intr) : NULL;
+   if (!intr_id)
+      return NULL;
+
+   const struct dxil_value *call_args[] = { intr_id, op0, op1, op2, op3 };
+
+   return dxil_emit_call(&ctx->mod, callee, call_args, ARRAY_SIZE(call_args));
+}
+
 static const struct dxil_value *
 emit_threadid_call(struct ntd_context *ctx, const struct dxil_value *comp)
 {
@ -1874,6 +1903,27 @@ emit_tertiary_intin(struct ntd_context *ctx, nir_alu_instr *alu,
   return true;
 }

+/*
+ * Emit DXIL for NIR's bitfield_insert(base, insert, offset, width) and store
+ * the result as the destination of `alu`.
+ *
+ * The DXIL_INTR_BFI call is followed by a select that picks `insert` directly
+ * when width >= 32, since the intrinsic alone does not cover that case.
+ *
+ * Returns true on success, false if any DXIL value could not be emitted.
+ */
+static bool
+emit_bitfield_insert(struct ntd_context *ctx, nir_alu_instr *alu,
+                     const struct dxil_value *base,
+                     const struct dxil_value *insert,
+                     const struct dxil_value *offset,
+                     const struct dxil_value *width)
+{
+   /* DXIL is width, offset, insert, base, NIR is base, insert, offset, width */
+   const struct dxil_value *v = emit_quaternary_call(ctx, DXIL_I32, DXIL_INTR_BFI,
+                                                     width, offset, insert, base);
+   if (!v)
+      return false;
+
+   /* DXIL uses the 5 LSB from width/offset. Special-case width >= 32 == copy insert. */
+   const struct dxil_value *max_width = dxil_module_get_int32_const(&ctx->mod, 32);
+   if (!max_width)
+      return false;
+
+   const struct dxil_value *compare_width = dxil_emit_cmp(&ctx->mod, DXIL_ICMP_SGE,
+                                                          width, max_width);
+   if (!compare_width)
+      return false;
+
+   /* Reuse v: from here on it is the final, width-corrected result. */
+   v = dxil_emit_select(&ctx->mod, compare_width, insert, v);
+   if (!v)
+      return false;
+
+   store_alu_dest(ctx, alu, 0, v);
+   return true;
+}
+
 static bool emit_select(struct ntd_context *ctx, nir_alu_instr *alu,
                        const struct dxil_value *sel,
                        const struct dxil_value *val_true,
@ -2236,6 +2286,8 @@ emit_alu(struct ntd_context *ctx, nir_alu_instr *alu)
   case nir_op_fmin: return emit_binary_intin(ctx, alu, DXIL_INTR_FMIN, src[0], src[1]);
   case nir_op_ffma: return emit_tertiary_intin(ctx, alu, DXIL_INTR_FMA, src[0], src[1], src[2]);

+   case nir_op_bitfield_insert: return emit_bitfield_insert(ctx, alu, src[0], src[1], src[2], src[3]);
+
   case nir_op_unpack_half_2x16_split_x: return emit_f16tof32(ctx, alu, src[0], false);
   case nir_op_unpack_half_2x16_split_y: return emit_f16tof32(ctx, alu, src[0], true);
   case nir_op_pack_half_2x16_split: return emit_f32tof16(ctx, alu, src[0], src[1]);