nvc0: add maxwell (sm50) compiler backend

The big missing part here is proper sched data calculations, but hopefully the chosen placeholder will be sufficient for now. Passes piglit as well as GK107 does. Signed-off-by: Ben Skeggs <bskeggs@redhat.com> Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
2014-05-09 15:56:05 +10:00 · 2014-05-09 15:56:05 +10:00 · d548d47edf
parent 7b9475fa65
commit d548d47edf
16 changed files with 3588 additions and 5 deletions
--- a/src/gallium/drivers/nouveau/Makefile.sources
+++ b/src/gallium/drivers/nouveau/Makefile.sources
@ -72,10 +72,13 @@ NV50_CODEGEN_SOURCES := \
 	codegen/nv50_ir_util.cpp

 NVC0_CODEGEN_SOURCES := \
-	codegen/nv50_ir_emit_gk110.cpp \
 	codegen/nv50_ir_emit_nvc0.cpp \
+	codegen/nv50_ir_emit_gk110.cpp \
+	codegen/nv50_ir_emit_gm107.cpp \
 	codegen/nv50_ir_lowering_nvc0.cpp \
-	codegen/nv50_ir_target_nvc0.cpp
+	codegen/nv50_ir_lowering_gm107.cpp \
+	codegen/nv50_ir_target_nvc0.cpp \
+	codegen/nv50_ir_target_gm107.cpp

 NVC0_C_SOURCES := \
 	nvc0/nvc0_compute.c \
--- a/src/gallium/drivers/nouveau/codegen/lib/Makefile
+++ b/src/gallium/drivers/nouveau/codegen/lib/Makefile
@ -1,6 +1,6 @@
 ENVYAS ?= envyas

-all: gf100.asm.h gk104.asm.h gk110.asm.h
+all: gf100.asm.h gk104.asm.h gk110.asm.h gm107.asm.h

 gf100.asm.h: %.asm.h: %.asm
 	$(ENVYAS) -a -W -mnvc0 -Vnvc0 $< -o $@
@ -8,3 +8,5 @@ gk104.asm.h: %.asm.h: %.asm
 	$(ENVYAS) -a -W -mnvc0 -Vnve4 $< -o $@
 gk110.asm.h: %.asm.h: %.asm
 	$(ENVYAS) -a -W -mgk110 $< -o $@
+gm107.asm.h: %.asm.h: %.asm
+	$(ENVYAS) -a -W -mgm107 $< -o $@
--- a/src/gallium/drivers/nouveau/codegen/lib/gm107.asm
+++ b/src/gallium/drivers/nouveau/codegen/lib/gm107.asm
@ -0,0 +1,115 @@
+.section #gm107_builtin_code
+// DIV U32
+//
+// UNR recurrence (q = a / b):
+// look for z such that 2^32 - b <= b * z < 2^32
+// then q - 1 <= (a * z) / 2^32 <= q
+//
+// INPUT:   $r0: dividend, $r1: divisor
+// OUTPUT:  $r0: result, $r1: modulus
+// CLOBBER: $r2 - $r3, $p0 - $p1
+// SIZE:    36 * 8 bytes (27 instructions + 9 sched words)
+//
+gm107_div_u32:
+   sched 0x7e0 0x7e0 0x7e0
+   flo u32 $r2 $r1
+   lop xor 1 $r2 $r2 0x1f
+   mov $r3 0x1 0xf
+   sched 0x7e0 0x7e0 0x7e0
+   shl $r2 $r3 $r2
+   i2i u32 u32 $r1 neg $r1
+   imul u32 u32 $r3 $r1 $r2
+   sched 0x7e0 0x7e0 0x7e0
+   imad u32 u32 hi $r2 $r2 $r3 $r2
+   imul u32 u32 $r3 $r1 $r2
+   imad u32 u32 hi $r2 $r2 $r3 $r2
+   sched 0x7e0 0x7e0 0x7e0
+   imul u32 u32 $r3 $r1 $r2
+   imad u32 u32 hi $r2 $r2 $r3 $r2
+   imul u32 u32 $r3 $r1 $r2
+   sched 0x7e0 0x7e0 0x7e0
+   imad u32 u32 hi $r2 $r2 $r3 $r2
+   imul u32 u32 $r3 $r1 $r2
+   imad u32 u32 hi $r2 $r2 $r3 $r2
+   sched 0x7e0 0x7e0 0x7e0
+   mov $r3 $r0 0xf
+   imul u32 u32 hi $r0 $r0 $r2
+   i2i u32 u32 $r2 neg $r1
+   sched 0x7e0 0x7e0 0x7e0
+   imad u32 u32 $r1 $r1 $r0 $r3
+   isetp ge u32 and $p0 1 $r1 $r2 1
+   $p0 iadd $r1 $r1 neg $r2
+   sched 0x7e0 0x7e0 0x7e0
+   $p0 iadd $r0 $r0 0x1
+   $p0 isetp ge u32 and $p0 1 $r1 $r2 1
+   $p0 iadd $r1 $r1 neg $r2
+   sched 0x7e0 0x7e0 0x7e0
+   $p0 iadd $r0 $r0 0x1
+   ret
+   nop 0
+
+// DIV S32, like DIV U32 after taking ABS(inputs)
+//
+// INPUT:   $r0: dividend, $r1: divisor
+// OUTPUT:  $r0: result, $r1: modulus
+// CLOBBER: $r2 - $r3, $p0 - $p3
+//
+gm107_div_s32:
+   sched 0x7e0 0x7e0 0x7e0
+   isetp lt and $p2 0x1 $r0 0 1
+   isetp lt xor $p3 1 $r1 0 $p2
+   i2i s32 s32 $r0 abs $r0
+   sched 0x7e0 0x7e0 0x7e0
+   i2i s32 s32 $r1 abs $r1
+   flo u32 $r2 $r1
+   lop xor 1 $r2 $r2 0x1f
+   sched 0x7e0 0x7e0 0x7e0
+   mov $r3 0x1 0xf
+   shl $r2 $r3 $r2
+   i2i u32 u32 $r1 neg $r1
+   sched 0x7e0 0x7e0 0x7e0
+   imul u32 u32 $r3 $r1 $r2
+   imad u32 u32 hi $r2 $r2 $r3 $r2
+   imul u32 u32 $r3 $r1 $r2
+   sched 0x7e0 0x7e0 0x7e0
+   imad u32 u32 hi $r2 $r2 $r3 $r2
+   imul u32 u32 $r3 $r1 $r2
+   imad u32 u32 hi $r2 $r2 $r3 $r2
+   sched 0x7e0 0x7e0 0x7e0
+   imul u32 u32 $r3 $r1 $r2
+   imad u32 u32 hi $r2 $r2 $r3 $r2
+   imul u32 u32 $r3 $r1 $r2
+   sched 0x7e0 0x7e0 0x7e0
+   imad u32 u32 hi $r2 $r2 $r3 $r2
+   mov $r3 $r0 0xf
+   imul u32 u32 hi $r0 $r0 $r2
+   sched 0x7e0 0x7e0 0x7e0
+   i2i u32 u32 $r2 neg $r1
+   imad u32 u32 $r1 $r1 $r0 $r3
+   isetp ge u32 and $p0 1 $r1 $r2 1
+   sched 0x7e0 0x7e0 0x7e0
+   $p0 iadd $r1 $r1 neg $r2
+   $p0 iadd $r0 $r0 0x1
+   $p0 isetp ge u32 and $p0 1 $r1 $r2 1
+   sched 0x7e0 0x7e0 0x7e0
+   $p0 iadd $r1 $r1 neg $r2
+   $p0 iadd $r0 $r0 0x1
+   $p3 i2i s32 s32 $r0 neg $r0
+   sched 0x7e0 0x7e0 0x7e0
+   $p2 i2i s32 s32 $r1 neg $r1
+   ret
+   nop 0
+
+// STUB
+gm107_rcp_f64:
+gm107_rsq_f64:
+   sched 0x7e0 0x7e0 0x7e0
+   ret
+   nop 0
+   nop 0
+
+.section #gm107_builtin_offsets
+.b64 #gm107_div_u32
+.b64 #gm107_div_s32
+.b64 #gm107_rcp_f64
+.b64 #gm107_rsq_f64
--- a/src/gallium/drivers/nouveau/codegen/lib/gm107.asm.h
+++ b/src/gallium/drivers/nouveau/codegen/lib/gm107.asm.h
@ -0,0 +1,97 @@
+uint64_t gm107_builtin_code[] = {
+/* 0x0000: gm107_div_u32 */
+	0x001f8000fc0007e0,
+	0x5c30000000170002,
+	0x3847040001f70202,
+	0x3898078000170003,
+	0x001f8000fc0007e0,
+	0x5c48000000270302,
+	0x5ce0200000170a01,
+	0x5c38000000270103,
+	0x001f8000fc0007e0,
+	0x5a40010000370202,
+	0x5c38000000270103,
+	0x5a40010000370202,
+	0x001f8000fc0007e0,
+	0x5c38000000270103,
+	0x5a40010000370202,
+	0x5c38000000270103,
+	0x001f8000fc0007e0,
+	0x5a40010000370202,
+	0x5c38000000270103,
+	0x5a40010000370202,
+	0x001f8000fc0007e0,
+	0x5c98078000070003,
+	0x5c38008000270000,
+	0x5ce0200000170a02,
+	0x001f8000fc0007e0,
+	0x5a00018000070101,
+	0x5b6c038000270107,
+	0x5c11000000200101,
+	0x001f8000fc0007e0,
+	0x3810000000100000,
+	0x5b6c038000200107,
+	0x5c11000000200101,
+	0x001f8000fc0007e0,
+	0x3810000000100000,
+	0xe32000000007000f,
+	0x50b0000000070f00,
+/* 0x0120: gm107_div_s32 */
+	0x001f8000fc0007e0,
+	0x5b6303800ff70017,
+	0x5b6341000ff7011f,
+	0x5ce2000000073a00,
+	0x001f8000fc0007e0,
+	0x5ce2000000173a01,
+	0x5c30000000170002,
+	0x3847040001f70202,
+	0x001f8000fc0007e0,
+	0x3898078000170003,
+	0x5c48000000270302,
+	0x5ce0200000170a01,
+	0x001f8000fc0007e0,
+	0x5c38000000270103,
+	0x5a40010000370202,
+	0x5c38000000270103,
+	0x001f8000fc0007e0,
+	0x5a40010000370202,
+	0x5c38000000270103,
+	0x5a40010000370202,
+	0x001f8000fc0007e0,
+	0x5c38000000270103,
+	0x5a40010000370202,
+	0x5c38000000270103,
+	0x001f8000fc0007e0,
+	0x5a40010000370202,
+	0x5c98078000070003,
+	0x5c38008000270000,
+	0x001f8000fc0007e0,
+	0x5ce0200000170a02,
+	0x5a00018000070101,
+	0x5b6c038000270107,
+	0x001f8000fc0007e0,
+	0x5c11000000200101,
+	0x3810000000100000,
+	0x5b6c038000200107,
+	0x001f8000fc0007e0,
+	0x5c11000000200101,
+	0x3810000000100000,
+	0x5ce0200000033a00,
+	0x001f8000fc0007e0,
+	0x5ce0200000123a01,
+	0xe32000000007000f,
+	0x50b0000000070f00,
+/* 0x0280: gm107_rcp_f64 */
+/* 0x0280: gm107_rsq_f64 */
+	0x001f8000fc0007e0,
+	0xe32000000007000f,
+	0x50b0000000070f00,
+	0x50b0000000070f00,
+};
+
+uint64_t gm107_builtin_offsets[] = {
+	0x0000000000000000,
+	0x0000000000000120,
+	0x0000000000000280,
+	0x0000000000000280,
+};
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
@ -157,6 +157,7 @@ enum operation
   OP_VSHL,
   OP_VSEL,
   OP_CCTL, // cache control
+   OP_SHFL, // warp shuffle
   OP_LAST
 };

@ -223,6 +224,10 @@ enum operation
 #define NV50_IR_SUBOP_PIXLD_OFFSET      3
 #define NV50_IR_SUBOP_PIXLD_CENT_OFFSET 4
 #define NV50_IR_SUBOP_PIXLD_SAMPLEID    5
+#define NV50_IR_SUBOP_SHFL_IDX  0
+#define NV50_IR_SUBOP_SHFL_UP   1
+#define NV50_IR_SUBOP_SHFL_DOWN 2
+#define NV50_IR_SUBOP_SHFL_BFLY 3
 #define NV50_IR_SUBOP_MADSP_SD     0xffff
 // Yes, we could represent those with DataType.
 // Or put the type into operation and have a couple 1000 values in that enum.
@ -379,6 +384,7 @@ enum SVSemantic
   SV_LBASE,
   SV_SBASE,
   SV_VERTEX_STRIDE,
+   SV_INVOCATION_INFO,
   SV_UNDEFINED,
   SV_LAST
 };
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
@ -92,6 +92,7 @@ struct nv50_ir_prog_symbol
 #define NVISA_GF100_CHIPSET_D0 0xd0
 #define NVISA_GK104_CHIPSET    0xe0
 #define NVISA_GK110_CHIPSET    0xf0
+#define NVISA_GM107_CHIPSET    0x110

 struct nv50_ir_prog_info
 {
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
@ -0,0 +1,273 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *           2014 Red Hat Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "codegen/nv50_ir.h"
+#include "codegen/nv50_ir_build_util.h"
+
+#include "codegen/nv50_ir_target_nvc0.h"
+#include "codegen/nv50_ir_lowering_gm107.h"
+
+#include <limits>
+
+namespace nv50_ir {
+
+#define QOP_ADD  0
+#define QOP_SUBR 1
+#define QOP_SUB  2
+#define QOP_MOV2 3
+
+//             UL UR LL LR
+#define QUADOP(q, r, s, t)                      \
+   ((QOP_##q << 6) | (QOP_##r << 4) |           \
+    (QOP_##s << 2) | (QOP_##t << 0))
+/*
+ * Lower a TXD (texture fetch with explicit derivatives) into four plain
+ * OP_TEX fetches, one per quad lane.
+ *
+ * For every lane l (0..3) the emitted sequence, bracketed by
+ * OP_QUADON / OP_QUADPOP, is:
+ *   1. OP_SHFL each coordinate source with immediate l into crd[c], then a
+ *      OP_QUADOP with subOp 0 against a zero immediate;
+ *   2. OP_SHFL each dPdx component into tmp and OP_QUADOP it with crd[c]
+ *      using qOps[l][0];
+ *   3. the same for dPdy using qOps[l][1];
+ *   4. a clone of the original instruction whose first `dim` sources are
+ *      replaced by crd[];
+ *   5. one fixed MOV per existing def, restricted to lane l (lanes = 1 << l).
+ * Afterwards one OP_UNION per def merges the four per-lane values into the
+ * original destination, and the original instruction is removed from its
+ * basic block.
+ *
+ * i: the TXD instruction; its op is rewritten to OP_TEX before cloning.
+ * Always returns true.
+ */
+bool
+GM107LoweringPass::handleManualTXD(TexInstruction *i)
+{
+   // One row per source lane l; column 0 is used for the dPdx step and
+   // column 1 for the dPdy step. Each entry packs four 2-bit QOP_* codes in
+   // the order UL UR LL LR (see the QUADOP macro).
+   static const uint8_t qOps[4][2] =
+   {
+      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(MOV2, MOV2, ADD,  ADD) }, // l0
+      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD,  ADD) }, // l1
+      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
+      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
+   };
+   Value *def[4][4]; // indexed [def component][lane]
+   Value *crd[3]; // per-dimension coordinate temporaries, reused for every lane
+   Value *tmp; // holds the shuffled derivative component
+   Instruction *tex, *add;
+   Value *zero = bld.loadImm(bld.getSSA(), 0);
+   int l, c;
+   const int dim = i->tex.target.getDim();
+
+   i->op = OP_TEX; // no need to clone dPdx/dPdy later
+
+   for (c = 0; c < dim; ++c)
+      crd[c] = bld.getScratch();
+   tmp = bld.getScratch();
+
+   for (l = 0; l < 4; ++l) {
+      // mov coordinates from lane l to all lanes
+      bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
+      for (c = 0; c < dim; ++c) {
+         bld.mkOp2(OP_SHFL, TYPE_F32, crd[c], i->getSrc(c), bld.mkImm(l));
+         add = bld.mkOp2(OP_QUADOP, TYPE_F32, crd[c], crd[c], zero);
+         add->subOp = 0x00;
+         add->lanes = 1; /* abused for .ndv */
+      }
+
+      // add dPdx from lane l to lanes dx
+      for (c = 0; c < dim; ++c) {
+         bld.mkOp2(OP_SHFL, TYPE_F32, tmp, i->dPdx[c].get(), bld.mkImm(l));
+         add = bld.mkOp2(OP_QUADOP, TYPE_F32, crd[c], tmp, crd[c]);
+         add->subOp = qOps[l][0];
+         add->lanes = 1; /* abused for .ndv */
+      }
+
+      // add dPdy from lane l to lanes dy
+      for (c = 0; c < dim; ++c) {
+         bld.mkOp2(OP_SHFL, TYPE_F32, tmp, i->dPdy[c].get(), bld.mkImm(l));
+         add = bld.mkOp2(OP_QUADOP, TYPE_F32, crd[c], tmp, crd[c]);
+         add->subOp = qOps[l][1];
+         add->lanes = 1; /* abused for .ndv */
+      }
+
+      // texture: fetch with a clone of i that reads the adjusted coordinates
+      bld.insert(tex = cloneForward(func, i));
+      for (c = 0; c < dim; ++c)
+         tex->setSrc(c, crd[c]);
+      bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);
+
+      // save results: copy each def of the clone, restricted to lane l
+      for (c = 0; i->defExists(c); ++c) {
+         Instruction *mov;
+         def[c][l] = bld.getSSA();
+         mov = bld.mkMov(def[c][l], tex->getDef(c));
+         mov->fixed = 1;
+         mov->lanes = 1 << l;
+      }
+   }
+
+   // merge the four per-lane copies back into the original destinations
+   for (c = 0; i->defExists(c); ++c) {
+      Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
+      for (l = 0; l < 4; ++l)
+         u->setSrc(l, def[c][l]);
+   }
+
+   i->bb->remove(i); // the original TXD is fully replaced by the sequence above
+   return true;
+}
+
+// Rewrite OP_DFDX / OP_DFDY in place as a OP_SHFL (butterfly sub-op)
+// followed by a OP_QUADOP.
+//
+// The shuffle reads the instruction's source 0 with immediate 1 (DFDX) or
+// 2 (DFDY) into a scratch value. The instruction itself then becomes a
+// QUADOP whose source 0 is the shuffled value and whose source 1 is the
+// original operand. Any other opcode trips the assert and, in release
+// builds, proceeds with quad-op 0 and shuffle immediate 0.
+// Always returns true.
+bool
+GM107LoweringPass::handleDFDX(Instruction *insn)
+{
+   int quadOp = 0;
+   int shflArg = 0;
+
+   if (insn->op == OP_DFDX) {
+      quadOp = QUADOP(SUB, SUBR, SUB, SUBR);
+      shflArg = 1;
+   } else if (insn->op == OP_DFDY) {
+      quadOp = QUADOP(SUB, SUB, SUBR, SUBR);
+      shflArg = 2;
+   } else {
+      assert(!"invalid dfdx opcode");
+   }
+
+   Instruction *const shuffle =
+      bld.mkOp2(OP_SHFL, TYPE_F32, bld.getScratch(),
+                insn->getSrc(0), bld.mkImm(shflArg));
+   shuffle->subOp = NV50_IR_SUBOP_SHFL_BFLY;
+
+   // Turn the derivative op itself into the quad operation.
+   insn->op = OP_QUADOP;
+   insn->subOp = quadOp;
+   insn->lanes = 0; /* abused for !.ndv */
+
+   // Original operand moves to slot 1; the shuffled value takes slot 0.
+   insn->setSrc(1, insn->getSrc(0));
+   insn->setSrc(0, shuffle->getDef(0));
+   return true;
+}
+
+// Replace the immediate operand of a PFETCH with a computed value.
+//
+// Emits: read SV_INVOCATION_INFO, extract its bits 0..7 and bits 16..23,
+// then MAD them together with the immediate taken from the instruction's
+// source 0 (reg.data.u32). The MAD result becomes the new source 0 and
+// source 1 is cleared. Always returns true.
+bool
+GM107LoweringPass::handlePFETCH(Instruction *i)
+{
+   Value *acc = bld.getScratch();
+   Value *field = bld.getScratch();
+   Value *base = bld.getScratch();
+
+   // acc = invocation info; field = acc >> 16
+   bld.mkOp1(OP_RDSV, TYPE_U32, acc, bld.mkSysVal(SV_INVOCATION_INFO, 0));
+   bld.mkOp2(OP_SHR, TYPE_U32, field, acc, bld.mkImm(16));
+
+   // keep only the low byte of each
+   bld.mkOp2(OP_AND, TYPE_U32, acc, acc, bld.mkImm(0xff));
+   bld.mkOp2(OP_AND, TYPE_U32, field, field, bld.mkImm(0xff));
+
+   // base = original immediate operand; acc = acc * field + base
+   const uint32_t immediate = i->getSrc(0)->reg.data.u32;
+   bld.mkOp1(OP_MOV, TYPE_U32, base, bld.mkImm(immediate));
+   bld.mkOp3(OP_MAD, TYPE_U32, acc, acc, field, base);
+
+   i->setSrc(0, acc);
+   i->setSrc(1, NULL);
+   return true;
+}
+
+// Fold the two-operand POPCNT form into a single-operand one.
+//
+// Emits an OP_AND of the instruction's sources 0 and 1 (in the
+// instruction's source type) into a scratch value, makes that the only
+// source of the POPCNT and clears source 1. Always returns true.
+bool
+GM107LoweringPass::handlePOPCNT(Instruction *i)
+{
+   Value *tmp = bld.mkOp2v(OP_AND, i->sType, bld.getScratch(),
+                           i->getSrc(0), i->getSrc(1));
+   i->setSrc(0, tmp);
+   i->setSrc(1, NULL);
+   // C++ literal rather than the C-style TRUE macro, matching the other
+   // handlers in this file.
+   return true;
+}
+
+// Per-instruction entry point of the GM107 lowering pass. Sets the builder
+// position from i, calls checkPredicate() when i is conditional, and then
+// dispatches on the opcode:
+// - add quadop dance for texturing / put FP outputs in GPRs (original note)
+// - convert instruction sequences; unhandled opcodes are left untouched.
+bool
+GM107LoweringPass::visit(Instruction *i)
+{
+   bld.setPosition(i, false);
+
+   if (i->cc != CC_ALWAYS)
+      checkPredicate(i);
+
+   switch (i->op) {
+   case OP_TEX:
+   case OP_TXB:
+   case OP_TXL:
+   case OP_TXF:
+   case OP_TXG:
+      return handleTEX(i->asTex());
+   case OP_TXD:
+      return handleTXD(i->asTex());
+   case OP_TXLQ:
+      return handleTXLQ(i->asTex());
+   case OP_TXQ:
+      return handleTXQ(i->asTex());
+   case OP_EX2:
+      // insert a PREEX2 writing i's def, then let EX2 read that same value
+      bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
+      i->setSrc(0, i->getDef(0));
+      break;
+   case OP_POW:
+      return handlePOW(i);
+   case OP_DIV:
+      return handleDIV(i);
+   case OP_MOD:
+      return handleMOD(i);
+   case OP_SQRT:
+      return handleSQRT(i);
+   case OP_EXPORT:
+      return handleEXPORT(i);
+   case OP_PFETCH:
+      return handlePFETCH(i);
+   case OP_EMIT:
+   case OP_RESTART:
+      return handleOUT(i);
+   case OP_RDSV:
+      return handleRDSV(i);
+   case OP_WRSV:
+      return handleWRSV(i);
+   case OP_LOAD:
+      if (i->src(0).getFile() == FILE_SHADER_INPUT) {
+         if (prog->getType() == Program::TYPE_COMPUTE) {
+            // compute programs: retarget the input load to constant file 0
+            i->getSrc(0)->reg.file = FILE_MEMORY_CONST;
+            i->getSrc(0)->reg.fileIndex = 0;
+         } else
+         if (prog->getType() == Program::TYPE_GEOMETRY &&
+             i->src(0).isIndirect(0)) {
+            // XXX: this assumes vec4 units
+            Value *ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
+                                    i->getIndirect(0, 0), bld.mkImm(4));
+            i->setIndirect(0, 0, ptr);
+         } else {
+            i->op = OP_VFETCH; // every other input load becomes a VFETCH
+            assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP
+         }
+      }
+      break;
+   case OP_ATOM:
+   {
+      // sample the address file first, before handleATOM() runs on i
+      const bool cctl = i->src(0).getFile() == FILE_MEMORY_GLOBAL;
+      handleATOM(i);
+      handleCasExch(i, cctl);
+   }
+      break;
+   case OP_SULDB:
+   case OP_SULDP:
+   case OP_SUSTB:
+   case OP_SUSTP:
+   case OP_SUREDB:
+   case OP_SUREDP:
+      handleSurfaceOpNVE4(i->asTex());
+      break;
+   case OP_DFDX:
+   case OP_DFDY:
+      handleDFDX(i); // return value ignored; falls through to return true
+      break;
+   case OP_POPCNT:
+      handlePOPCNT(i); // return value ignored; falls through to return true
+      break;
+   default:
+      break;
+   }
+   return true;
+}
+
+} // namespace nv50_ir
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.h
@ -0,0 +1,18 @@
+#ifndef NV50_IR_LOWERING_GM107_H
+#define NV50_IR_LOWERING_GM107_H
+
+#include "codegen/nv50_ir_lowering_nvc0.h"
+
+namespace nv50_ir {
+
+// Lowering pass for GM107: derives from NVC0LoweringPass, replacing the
+// per-instruction visit() and the manual TXD expansion, and adding
+// GM107-specific handlers for derivatives, PFETCH and POPCNT.
+class GM107LoweringPass : public NVC0LoweringPass
+{
+public:
+   GM107LoweringPass(Program *p) : NVC0LoweringPass(p) {}
+private:
+   // Dispatches one instruction to the matching handler.
+   virtual bool visit(Instruction *);
+
+   // Expands TXD into per-lane texture fetches using SHFL/QUADOP.
+   virtual bool handleManualTXD(TexInstruction *);
+   // Rewrites DFDX/DFDY as SHFL + QUADOP.
+   bool handleDFDX(Instruction *);
+   // Computes the PFETCH operand from SV_INVOCATION_INFO.
+   bool handlePFETCH(Instruction *);
+   // Folds POPCNT's two sources into one via AND.
+   bool handlePOPCNT(Instruction *);
+};
+
+} // namespace nv50_ir
+
+#endif // NV50_IR_LOWERING_GM107_H
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
@ -99,7 +99,7 @@ protected:
   bool handleTEX(TexInstruction *);
   bool handleTXD(TexInstruction *);
   bool handleTXQ(TexInstruction *);
-   bool handleManualTXD(TexInstruction *);
+   virtual bool handleManualTXD(TexInstruction *);
   bool handleTXLQ(TexInstruction *);
   bool handleATOM(Instruction *);
   bool handleCasExch(Instruction *, bool needCctl);
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp
@ -187,6 +187,7 @@ const char *operationStr[OP_LAST + 1] =
   "vshl",
   "vsel",
   "cctl",
+   "shfl",
   "(invalid)"
 };

@ -271,6 +272,7 @@ static const char *SemanticStr[SV_LAST + 1] =
   "LBASE",
   "SBASE",
   "VERTEX_STRIDE",
+   "INVOCATION_INFO",
   "?",
   "(INVALID)"
 };
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
@ -256,6 +256,7 @@ private:
      void texConstraintNV50(TexInstruction *);
      void texConstraintNVC0(TexInstruction *);
      void texConstraintNVE0(TexInstruction *);
+      void texConstraintGM107(TexInstruction *);

      std::list<Instruction *> constrList;

@ -855,6 +856,7 @@ GCRA::coalesce(ArrayList& insns)
   case 0xe0:
   case 0xf0:
   case 0x100:
+   case 0x110:
      ret = doCoalesce(insns, JOIN_MASK_UNION);
      break;
   default:
@ -1880,6 +1882,34 @@ RegAlloc::InsertConstraintsPass::condenseSrcs(Instruction *insn,
   constrList.push_back(merge);
 }

+// Insert register-allocation constraints for a GM107 texture/surface op.
+//
+// Texture ops first get textureMask() applied; defs are always condensed.
+// SUSTB/SUSTP condense sources 3 .. 3 + typeSizeof(dType) / 4 - 1. Other
+// texture ops split their sources into a leading group (all sources for
+// TXQ, otherwise getArgCount() - isMS()) and the remainder, and condense
+// each group that has more than one member.
+void
+RegAlloc::InsertConstraintsPass::texConstraintGM107(TexInstruction *tex)
+{
+   if (isTextureOp(tex->op))
+      textureMask(tex);
+   condenseDefs(tex);
+
+   if (tex->op == OP_SUSTB || tex->op == OP_SUSTP) {
+      condenseSrcs(tex, 3, (3 + typeSizeof(tex->dType) / 4) - 1);
+      return;
+   }
+
+   if (!isTextureOp(tex->op))
+      return;
+
+   int leading;
+   int trailing;
+   if (tex->op == OP_TXQ) {
+      leading = tex->srcCount(0xff);
+      trailing = 0;
+   } else {
+      leading = tex->tex.target.getArgCount() - tex->tex.target.isMS();
+      trailing = tex->srcCount(0xff) - leading;
+   }
+
+   if (leading > 1)
+      condenseSrcs(tex, 0, leading - 1);
+   if (trailing > 1) // NOTE: first call modified positions already
+      condenseSrcs(tex, 1, trailing);
+}
+
 void
 RegAlloc::InsertConstraintsPass::texConstraintNVE0(TexInstruction *tex)
 {
@ -1987,6 +2017,9 @@ RegAlloc::InsertConstraintsPass::visit(BasicBlock *bb)
         case 0x100:
            texConstraintNVE0(tex);
            break;
+         case 0x110:
+            texConstraintGM107(tex);
+            break;
         default:
            break;
         }
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp
@ -54,6 +54,7 @@ const uint8_t Target::operationSrcNr[] =
   2, 2,                   // ATOM, BAR
   2, 2, 2, 2, 3, 2,       // VADD, VAVG, VMIN, VMAX, VSAD, VSET,
   2, 2, 2, 1,             // VSHR, VSHL, VSEL, CCTL
+   3,                      // SHFL
   0
 };

@ -126,10 +127,13 @@ const OpClass Target::operationClass[] =
   OPCLASS_VECTOR, OPCLASS_VECTOR, OPCLASS_VECTOR, OPCLASS_VECTOR,
   // VSEL, CCTL
   OPCLASS_VECTOR, OPCLASS_CONTROL,
+   // SHFL
+   OPCLASS_OTHER,
   OPCLASS_PSEUDO // LAST
 };


+extern Target *getTargetGM107(unsigned int chipset);
 extern Target *getTargetNVC0(unsigned int chipset);
 extern Target *getTargetNV50(unsigned int chipset);

@ -138,6 +142,8 @@ Target *Target::create(unsigned int chipset)
   STATIC_ASSERT(Elements(operationSrcNr) == OP_LAST + 1);
   STATIC_ASSERT(Elements(operationClass) == OP_LAST + 1);
   switch (chipset & ~0xf) {
+   case 0x110:
+      return getTargetGM107(chipset);
   case 0xc0:
   case 0xd0:
   case 0xe0:
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.h
@ -78,7 +78,7 @@ public:

   inline void *getRelocInfo() const { return relocInfo; }

-   void prepareEmission(Program *);
+   virtual void prepareEmission(Program *);
   virtual void prepareEmission(Function *);
   virtual void prepareEmission(BasicBlock *);

--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp
@ -0,0 +1,100 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *           2014 Red Hat Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "codegen/nv50_ir_target_gm107.h"
+#include "codegen/nv50_ir_lowering_gm107.h"
+
+namespace nv50_ir {
+
+// Factory for the GM107 target: returns a newly allocated TargetGM107 for
+// the given chipset id.
+Target *getTargetGM107(unsigned int chipset)
+{
+   Target *const target = new TargetGM107(chipset);
+   return target;
+}
+
+// BUILTINS / LIBRARY FUNCTIONS:
+
+// laziness -> will just hardcode everything for the time being
+
+#include "lib/gm107.asm.h"
+
+// Hand out the pre-assembled builtin library.
+//
+// code: receives a pointer to gm107_builtin_code viewed as 32-bit words.
+// size: receives the size of that table in bytes.
+void
+TargetGM107::getBuiltinCode(const uint32_t **code, uint32_t *size) const
+{
+   *size = sizeof(gm107_builtin_code);
+   *code = reinterpret_cast<const uint32_t *>(gm107_builtin_code);
+}
+
+// Look up the offset of a builtin routine in gm107_builtin_offsets.
+//
+// builtin: index of the routine; must lie in [0, NVC0_BUILTIN_COUNT).
+// Returns the table entry narrowed to 32 bits.
+uint32_t
+TargetGM107::getBuiltinOffset(int builtin) const
+{
+   // The index is a signed int, so guard the lower bound as well: a
+   // negative value would otherwise read before the start of the table.
+   assert(builtin >= 0 && builtin < NVC0_BUILTIN_COUNT);
+   return static_cast<uint32_t>(gm107_builtin_offsets[builtin]);
+}
+
+// Report whether the target supports `op` on data type `ty`.
+//
+// MAD and FMA are accepted for TYPE_F32 only; SAD, POW, SQRT, DIV and MOD
+// are never accepted; every other operation is accepted.
+bool
+TargetGM107::isOpSupported(operation op, DataType ty) const
+{
+   switch (op) {
+   case OP_MAD:
+   case OP_FMA:
+      return ty == TYPE_F32;
+   case OP_SAD:
+   case OP_POW:
+   case OP_SQRT:
+   case OP_DIV:
+   case OP_MOD:
+      return false;
+   default:
+      return true;
+   }
+}
+
+// Run the legalization pass that belongs to the given codegen stage.
+//
+// PRE_SSA uses GM107LoweringPass, SSA uses NVC0LegalizeSSA and POST_RA uses
+// NVC0LegalizePostRA. Returns that pass's result, or false for any other
+// stage.
+bool
+TargetGM107::runLegalizePass(Program *prog, CGStage stage) const
+{
+   switch (stage) {
+   case CG_STAGE_PRE_SSA: {
+      GM107LoweringPass lowering(prog);
+      return lowering.run(prog, false, true);
+   }
+   case CG_STAGE_POST_RA: {
+      NVC0LegalizePostRA postRA(prog);
+      return postRA.run(prog, false, true);
+   }
+   case CG_STAGE_SSA: {
+      NVC0LegalizeSSA ssa;
+      return ssa.run(prog, false, true);
+   }
+   default:
+      return false;
+   }
+}
+
+// Return the code emitter for the given program type; simply forwards to
+// createCodeEmitterGM107().
+CodeEmitter *
+TargetGM107::getCodeEmitter(Program::Type type)
+{
+   CodeEmitter *const emitter = createCodeEmitterGM107(type);
+   return emitter;
+}
+
+} // namespace nv50_ir
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.h
@ -0,0 +1,21 @@
+#ifndef NV50_IR_TARGET_GM107_H
+#define NV50_IR_TARGET_GM107_H
+
+#include "codegen/nv50_ir_target_nvc0.h"
+
+namespace nv50_ir {
+
+// Target description for GM107: derives from TargetNVC0 and overrides code
+// emitter creation, legalization, the builtin library and the supported
+// operation query.
+class TargetGM107 : public TargetNVC0
+{
+public:
+   TargetGM107(unsigned int chipset) : TargetNVC0(chipset) {}
+
+   // Emitter selection; getCodeEmitter() forwards to the GM107 factory.
+   virtual CodeEmitter *getCodeEmitter(Program::Type);
+   CodeEmitter *createCodeEmitterGM107(Program::Type);
+
+   // Runs the legalization pass matching the codegen stage.
+   virtual bool runLegalizePass(Program *, CGStage) const;
+
+   // Access to the pre-assembled builtin routines and their offsets.
+   virtual void getBuiltinCode(const uint32_t **, uint32_t *) const;
+   virtual uint32_t getBuiltinOffset(int) const;
+
+   // Whether an operation is supported for a given data type.
+   virtual bool isOpSupported(operation, DataType) const;
+};
+
+} // namespace nv50_ir
+
+#endif // NV50_IR_TARGET_GM107_H