nv50/ir: add surface op lowering

This handles BUFQ and SUQ, as well as all the various texture types and
formats, driven by data supplied by the driver (and by the shader itself).

TODO:
 - 2d linear surfaces
 - format via key for writeonly

These will be included in a later change. ES3.1 doesn't require
writeonly, and it's very hard to generate a 2d linear surface.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Acked-by: Pierre Moreau <dev@pmoreau.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10164>
Ilia Mirkin, 2021-02-24 22:23:48 -05:00 (committed by Marge Bot)
parent 67f98497af
commit 6b1a526ac5
1 changed file with 578 additions and 0 deletions

@@ -25,6 +25,24 @@
#include "codegen/nv50_ir_target_nv50.h"
#define NV50_SU_INFO_SIZE_X 0x00
#define NV50_SU_INFO_SIZE_Y 0x04
#define NV50_SU_INFO_SIZE_Z 0x08
#define NV50_SU_INFO_BSIZE 0x0c
#define NV50_SU_INFO_STRIDE_Y 0x10
#define NV50_SU_INFO_MS_X 0x18
#define NV50_SU_INFO_MS_Y 0x1c
#define NV50_SU_INFO_TILE_SHIFT_X 0x20
#define NV50_SU_INFO_TILE_SHIFT_Y 0x24
#define NV50_SU_INFO_TILE_SHIFT_Z 0x28
#define NV50_SU_INFO_OFFSET_Z 0x2c
#define NV50_SU_INFO__STRIDE 0x30
#define NV50_SU_INFO_SIZE(i) (0x00 + (i) * 4)
#define NV50_SU_INFO_MS(i) (0x18 + (i) * 4)
#define NV50_SU_INFO_TILE_SHIFT(i) (0x20 + (i) * 4)
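// The indexed forms alias the scalar defines above (e.g.
// NV50_SU_INFO_SIZE(1) == NV50_SU_INFO_SIZE_Y); each bound surface gets
// one NV50_SU_INFO__STRIDE-sized record in the driver-supplied buffer.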
namespace nv50_ir {
// nv50 doesn't support 32 bit integer multiplication
@@ -215,6 +233,8 @@ private:
void handlePRERET(FlowInstruction *);
void replaceZero(Instruction *);
BuildUtil bld;
LValue *r63;
};
@@ -627,6 +647,10 @@ private:
bool handleEXPORT(Instruction *);
bool handleLOAD(Instruction *);
bool handleLDST(Instruction *);
bool handleSULDP(TexInstruction *);
bool handleSUREDP(TexInstruction *);
bool handleSUSTP(TexInstruction *);
Value *processSurfaceCoords(TexInstruction *);
bool handleDIV(Instruction *);
bool handleSQRT(Instruction *);
@@ -642,6 +666,8 @@ private:
bool handleTXD(TexInstruction *); // these 3
bool handleTXLQ(TexInstruction *);
bool handleTXQ(TexInstruction *);
bool handleSUQ(TexInstruction *);
bool handleBUFQ(Instruction *);
bool handleCALL(Instruction *);
bool handlePRECONT(Instruction *);
@@ -650,6 +676,8 @@ private:
void checkPredicate(Instruction *);
void loadTexMsInfo(uint32_t off, Value **ms, Value **ms_x, Value **ms_y);
void loadMsInfo(Value *ms, Value *s, Value **dx, Value **dy);
Value *loadSuInfo(int slot, uint32_t off);
Value *loadSuInfo16(int slot, uint32_t off);
private:
const Target *const targ;
@@ -724,6 +752,24 @@ void NV50LoweringPreSSA::loadMsInfo(Value *ms, Value *s, Value **dx, Value **dy)
prog->driver->io.msInfoBase + 4), off);
}
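// Load one dword of a surface's info record from the aux constant buffer,
// where the driver uploads it (see the NV50_SU_INFO_* offsets above).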
Value *
NV50LoweringPreSSA::loadSuInfo(int slot, uint32_t off)
{
uint8_t b = prog->driver->io.auxCBSlot;
off += prog->driver->io.bufInfoBase + slot * NV50_SU_INFO__STRIDE;
return bld.mkLoadv(TYPE_U32, bld.mkSymbol(
FILE_MEMORY_CONST, b, TYPE_U32, off), NULL);
}
Value *
NV50LoweringPreSSA::loadSuInfo16(int slot, uint32_t off)
{
uint8_t b = prog->driver->io.auxCBSlot;
off += prog->driver->io.bufInfoBase + slot * NV50_SU_INFO__STRIDE;
return bld.mkLoadv(TYPE_U16, bld.mkSymbol(
FILE_MEMORY_CONST, b, TYPE_U16, off), NULL);
}
bool
NV50LoweringPreSSA::handleTEX(TexInstruction *i)
{
@@ -1064,6 +1110,56 @@ NV50LoweringPreSSA::handleTXQ(TexInstruction *i)
return true;
}
bool
NV50LoweringPreSSA::handleSUQ(TexInstruction *suq)
{
const int dim = suq->tex.target.getDim();
const int arg = dim + (suq->tex.target.isArray() || suq->tex.target.isCube());
int mask = suq->tex.mask;
int slot = suq->tex.r + 7;
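// Image units live at slots 7 and up of the surface info table; buffers
// (see handleBUFQ) use their file index directly.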
int c, d;
for (c = 0, d = 0; c < 3; ++c, mask >>= 1) {
if (c >= arg || !(mask & 1))
continue;
int offset;
if (c == 1 && suq->tex.target == TEX_TARGET_1D_ARRAY) {
offset = NV50_SU_INFO_SIZE(2);
} else {
offset = NV50_SU_INFO_SIZE(c);
}
bld.mkMov(suq->getDef(d++), loadSuInfo(slot, offset));
if (c == 2 && suq->tex.target.isCube())
bld.mkOp2(OP_DIV, TYPE_U32, suq->getDef(d - 1), suq->getDef(d - 1),
bld.loadImm(NULL, 6));
}
if (mask & 1) {
if (suq->tex.target.isMS()) {
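// MS_X/MS_Y hold log2 of the sample grid dimensions, so the sample count
// is 1 << (ms_x + ms_y).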
Value *ms_x = loadSuInfo(slot, NV50_SU_INFO_MS(0));
Value *ms_y = loadSuInfo(slot, NV50_SU_INFO_MS(1));
Value *ms = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(), ms_x, ms_y);
bld.mkOp2(OP_SHL, TYPE_U32, suq->getDef(d++), bld.loadImm(NULL, 1), ms);
} else {
bld.mkMov(suq->getDef(d++), bld.loadImm(NULL, 1));
}
}
bld.remove(suq);
return true;
}
bool
NV50LoweringPreSSA::handleBUFQ(Instruction *bufq)
{
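// A buffer's size in bytes is stored in the SIZE_X field of its info
// record, so BUFQ reduces to a move from that field.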
bufq->op = OP_MOV;
bufq->setSrc(0, loadSuInfo(bufq->getSrc(0)->reg.fileIndex, NV50_SU_INFO_SIZE_X));
bufq->setIndirect(0, 0, NULL);
bufq->setIndirect(0, 1, NULL);
return true;
}
bool
NV50LoweringPreSSA::handleSET(Instruction *i)
@@ -1407,6 +1503,478 @@ NV50LoweringPreSSA::handleLDST(Instruction *i)
return true;
}
// The type that best represents how each component can be stored when packed.
static DataType
getPackedType(const TexInstruction::ImgFormatDesc *t, int c)
{
switch (t->type) {
case FLOAT: return t->bits[c] == 16 ? TYPE_F16 : TYPE_F32;
case UNORM: return t->bits[c] == 8 ? TYPE_U8 : TYPE_U16;
case SNORM: return t->bits[c] == 8 ? TYPE_S8 : TYPE_S16;
case UINT:
return (t->bits[c] == 8 ? TYPE_U8 :
(t->bits[c] <= 16 ? TYPE_U16 : TYPE_U32));
case SINT:
return (t->bits[c] == 8 ? TYPE_S8 :
(t->bits[c] <= 16 ? TYPE_S16 : TYPE_S32));
}
return TYPE_NONE;
}
// The type that the rest of the shader expects to process this image type in.
static DataType
getShaderType(const ImgType type) {
switch (type) {
case FLOAT:
case UNORM:
case SNORM:
return TYPE_F32;
case UINT:
return TYPE_U32;
case SINT:
return TYPE_S32;
default:
assert(!"Impossible type");
return TYPE_NONE;
}
}
// Reads the raw coordinates out of the input instruction, and returns a
// single-value coordinate which is what the hardware expects to receive in a
// ld/st op.
Value *
NV50LoweringPreSSA::processSurfaceCoords(TexInstruction *su)
{
const int slot = su->tex.r + 7;
const int dim = su->tex.target.getDim();
const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
const TexInstruction::ImgFormatDesc *format = su->tex.format;
const uint16_t bytes = (format->bits[0] + format->bits[1] +
format->bits[2] + format->bits[3]) / 8;
uint16_t shift = ffs(bytes) - 1;
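// Supported formats have power-of-two texel sizes, so this is just
// log2(bytes), e.g. shift == 2 for a 4-byte RGBA8 texel.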
// Buffer sizes don't necessarily fit in 16-bit values
if (su->tex.target == TEX_TARGET_BUFFER) {
return bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
su->getSrc(0), bld.loadImm(NULL, (uint32_t)shift));
}
// Buffers (handled above) only needed the byte offset. For images, split
// each 32-bit source into 16-bit halves, with the x coordinate converted
// to a byte offset as well.
Value *coords[3] = {};
for (int i = 0; i < arg; i++) {
Value *src[2];
bld.mkSplit(src, 2, su->getSrc(i));
coords[i] = src[0];
// For 1d-images, we want the y coord to be 0, which it will be here.
if (i == 0)
coords[1] = src[1];
}
coords[0] = bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2),
coords[0], bld.loadImm(NULL, shift));
if (su->tex.target.isMS()) {
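// MS surfaces are laid out as larger 2d surfaces; scale x/y up by the
// log2 sample-grid dimensions to address the individual samples.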
Value *ms_x = loadSuInfo16(slot, NV50_SU_INFO_MS(0));
Value *ms_y = loadSuInfo16(slot, NV50_SU_INFO_MS(1));
coords[0] = bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2), coords[0], ms_x);
coords[1] = bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2), coords[1], ms_y);
}
// If there are more dimensions, we just want the y-offset. But that needs
// to be adjusted up by the y-stride for array images.
if (su->tex.target.isArray() || su->tex.target.isCube()) {
Value *index = coords[dim];
Value *height = loadSuInfo16(slot, NV50_SU_INFO_STRIDE_Y);
Instruction *mul = bld.mkOp2(OP_MUL, TYPE_U32, bld.getSSA(4), index, height);
mul->sType = TYPE_U16;
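// Setting sType to U16 makes this a 16x16 -> 32 widening multiply, since
// nv50 has no 32-bit integer multiply (see the note above).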
Value *muls[2];
bld.mkSplit(muls, 2, mul->getDef(0));
if (dim > 1)
coords[1] = bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2), coords[1], muls[0]);
else
coords[1] = muls[0];
}
// 3d is special-cased. Note that a single "slice" of a 3d image may
// also be attached as 2d, so we have to do the same 3d processing for
// 2d as well, just in case. In order to remap a 3d image onto a 2d
// image, we have to retile it "by hand".
if (su->tex.target == TEX_TARGET_3D || su->tex.target == TEX_TARGET_2D) {
Value *z = loadSuInfo16(slot, NV50_SU_INFO_OFFSET_Z);
Value *y_size_aligned = loadSuInfo16(slot, NV50_SU_INFO_STRIDE_Y);
// Add the z coordinate for actual 3d-images
if (dim > 2)
coords[2] = bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2), z, coords[2]);
else
coords[2] = z;
// Compute the surface parameters from tile shifts
Value *tile_shift[3];
Value *tile_size[3];
Value *tile_mask[3];
// We only ever use one kind of X-tiling.
tile_shift[0] = bld.loadImm(NULL, (uint16_t)6);
tile_size[0] = bld.loadImm(NULL, (uint16_t)64);
tile_mask[0] = bld.loadImm(NULL, (uint16_t)63);
// Fetch the "real" tiling parameters of the underlying surface
for (int i = 1; i < 3; i++) {
tile_shift[i] = loadSuInfo16(slot, NV50_SU_INFO_TILE_SHIFT(i));
tile_size[i] = bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2), bld.loadImm(NULL, (uint16_t)1), tile_shift[i]);
tile_mask[i] = bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2), tile_size[i], bld.loadImm(NULL, (uint16_t)-1));
}
// Compute the location of given coordinate, both inside the tile as
// well as which (linearly-laid out) tile it's in.
Value *coord_in_tile[3];
Value *tile[3];
for (int i = 0; i < 3; i++) {
coord_in_tile[i] = bld.mkOp2v(OP_AND, TYPE_U16, bld.getSSA(2), coords[i], tile_mask[i]);
tile[i] = bld.mkOp2v(OP_SHR, TYPE_U16, bld.getSSA(2), coords[i], tile_shift[i]);
}
// Based on the "real" tiling parameters, compute x/y coordinates in the
// larger surface with 2d tiling that was supplied to the hardware. This
// was determined and verified with the help of the tiling pseudocode in
// the envytools docs.
//
// adj_x = x_coord_in_tile + x_tile * x_tile_size * z_tile_size +
// z_coord_in_tile * x_tile_size
// adj_y = y_coord_in_tile + y_tile * y_tile_size +
// z_tile * y_tile_size * y_tiles
//
// Note: STRIDE_Y = y_tile_size * y_tiles
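// The shl/shr below implement these multiplies, e.g. with 64-byte x
// tiles and a z tile shift of 5, x_tile * x_tile_size * z_tile_size
// becomes tile[0] << (6 + 5).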
coords[0] = bld.mkOp2v(
OP_ADD, TYPE_U16, bld.getSSA(2),
bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2),
coord_in_tile[0],
bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2),
tile[0],
bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2),
tile_shift[2], tile_shift[0]))),
bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2),
coord_in_tile[2], tile_shift[0]));
Instruction *mul = bld.mkOp2(OP_MUL, TYPE_U32, bld.getSSA(4),
tile[2], y_size_aligned);
mul->sType = TYPE_U16;
Value *muls[2];
bld.mkSplit(muls, 2, mul->getDef(0));
coords[1] = bld.mkOp2v(
OP_ADD, TYPE_U16, bld.getSSA(2),
muls[0],
bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2),
coord_in_tile[1],
bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2),
tile[1], tile_shift[1])));
}
return bld.mkOp2v(OP_MERGE, TYPE_U32, bld.getSSA(), coords[0], coords[1]);
}
// This is largely a copy of NVC0LoweringPass::convertSurfaceFormat, but
// adjusted to make use of 16-bit math where possible.
bool
NV50LoweringPreSSA::handleSULDP(TexInstruction *su)
{
const int slot = su->tex.r + 7;
assert(!su->getIndirectR());
bld.setPosition(su, false);
const TexInstruction::ImgFormatDesc *format = su->tex.format;
const int bytes = (su->tex.format->bits[0] +
su->tex.format->bits[1] +
su->tex.format->bits[2] +
su->tex.format->bits[3]) / 8;
DataType ty = typeOfSize(bytes);
Value *coord = processSurfaceCoords(su);
Value *untypedDst[4] = {};
Value *typedDst[4] = {};
int i;
for (i = 0; i < bytes / 4; i++)
untypedDst[i] = bld.getSSA();
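// Sub-dword formats still need one destination register for the raw load.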
if (bytes < 4)
untypedDst[0] = bld.getSSA();
for (i = 0; i < 4; i++)
typedDst[i] = su->getDef(i);
// Swap the B/R defs up front, so that the unpacking below writes each
// component to the right place.
if (format->bgra) {
std::swap(typedDst[0], typedDst[2]);
}
Instruction *load = bld.mkLoad(ty, NULL, bld.mkSymbol(FILE_MEMORY_GLOBAL, slot, ty, 0), coord);
for (i = 0; i < 4 && untypedDst[i]; i++)
load->setDef(i, untypedDst[i]);
// Unpack each component into the typed dsts
int bits = 0;
for (int i = 0; i < 4; bits += format->bits[i], i++) {
if (!typedDst[i])
continue;
if (i >= format->components) {
if (format->type == FLOAT ||
format->type == UNORM ||
format->type == SNORM)
bld.loadImm(typedDst[i], i == 3 ? 1.0f : 0.0f);
else
bld.loadImm(typedDst[i], i == 3 ? 1 : 0);
continue;
}
// Get just that component's data into the relevant place
if (format->bits[i] == 32)
bld.mkMov(typedDst[i], untypedDst[i]);
else if (format->bits[i] == 16) {
// We can always convert directly from the appropriate half of the
// loaded value into the typed result.
Value *src[2];
bld.mkSplit(src, 2, untypedDst[i / 2]);
bld.mkCvt(OP_CVT, getShaderType(format->type), typedDst[i],
getPackedType(format, i), src[i & 1]);
}
else if (format->bits[i] == 8) {
// Same approach as for 16 bits, but we have to massage the value a
// bit more, since we have to get the appropriate 8 bits from the
// half-register. In all cases, we can CVT from a 8-bit source, so we
// only have to shift when we want the upper 8 bits.
Value *src[2], *shifted;
bld.mkSplit(src, 2, untypedDst[0]);
DataType packedType = getPackedType(format, i);
if (i & 1)
shifted = bld.mkOp2v(OP_SHR, TYPE_U16, bld.getSSA(2), src[!!(i & 2)], bld.loadImm(NULL, (uint16_t)8));
else
shifted = src[!!(i & 2)];
bld.mkCvt(OP_CVT, getShaderType(format->type), typedDst[i],
packedType, shifted);
}
else {
// The options are 10, 11, and 2. Get it into a 32-bit reg, then
// shift/mask. That's where it'll have to end up anyways. For signed,
// we have to make sure to get sign-extension, so we actually have to
// shift *up* first, and then shift down. There's no advantage to
// AND'ing, so we don't.
DataType ty = TYPE_U32;
if (format->type == SNORM || format->type == SINT) {
ty = TYPE_S32;
}
// Poor man's EXTBF
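// e.g. the G channel of a 10/10/10/2 format (bits == 10) is extracted
// with shl 12 followed by shr 22.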
bld.mkOp2(
OP_SHR, ty, typedDst[i],
bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), untypedDst[0], bld.loadImm(NULL, 32 - bits - format->bits[i])),
bld.loadImm(NULL, 32 - format->bits[i]));
// If the stored data is already in the appropriate type, we don't
// have to do anything. Convert to float for the *NORM formats.
if (format->type == UNORM || format->type == SNORM)
bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], ty, typedDst[i]);
}
// Normalize / convert as necessary
if (format->type == UNORM)
bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << format->bits[i]) - 1)));
else if (format->type == SNORM)
bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << (format->bits[i] - 1)) - 1)));
else if (format->type == FLOAT && format->bits[i] < 16) {
// We expect the value to be in the low bits of the register, so we
// have to shift back up.
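// Shifting left by 15 - bits lines the 5-bit exponent up with F16's
// (bits 10..14), leaving the sign bit clear.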
bld.mkOp2(OP_SHL, TYPE_U32, typedDst[i], typedDst[i], bld.loadImm(NULL, 15 - format->bits[i]));
Value *src[2];
bld.mkSplit(src, 2, typedDst[i]);
bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], TYPE_F16, src[0]);
}
}
bld.getBB()->remove(su);
return true;
}
bool
NV50LoweringPreSSA::handleSUREDP(TexInstruction *su)
{
const int slot = su->tex.r + 7;
const int dim = su->tex.target.getDim();
const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
assert(!su->getIndirectR());
bld.setPosition(su, false);
Value *coord = processSurfaceCoords(su);
// This is guaranteed to be a 32-bit format. So there's nothing to
// pack/unpack.
Instruction *atom = bld.mkOp2(
OP_ATOM, su->dType, su->getDef(0),
bld.mkSymbol(FILE_MEMORY_GLOBAL, slot, TYPE_U32, 0), su->getSrc(arg));
if (su->subOp == NV50_IR_SUBOP_ATOM_CAS)
atom->setSrc(2, su->getSrc(arg + 1));
atom->setIndirect(0, 0, coord);
atom->subOp = su->subOp;
bld.getBB()->remove(su);
return true;
}
bool
NV50LoweringPreSSA::handleSUSTP(TexInstruction *su)
{
const int slot = su->tex.r + 7;
const int dim = su->tex.target.getDim();
const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
assert(!su->getIndirectR());
bld.setPosition(su, false);
const TexInstruction::ImgFormatDesc *format = su->tex.format;
const int bytes = (su->tex.format->bits[0] +
su->tex.format->bits[1] +
su->tex.format->bits[2] +
su->tex.format->bits[3]) / 8;
DataType ty = typeOfSize(bytes);
Value *coord = processSurfaceCoords(su);
// The packed values we will eventually store into memory
Value *untypedDst[4] = {};
// Each component's packed representation, in 16-bit registers (only used
// where appropriate)
Value *untypedDst16[4] = {};
// The original values that are being packed
Value *typedDst[4] = {};
int i;
for (i = 0; i < bytes / 4; i++)
untypedDst[i] = bld.getSSA();
for (i = 0; i < format->components; i++)
untypedDst16[i] = bld.getSSA(2);
// Make sure we get at least one of each value allocated for the
// super-narrow formats.
if (bytes < 4)
untypedDst[0] = bld.getSSA();
if (bytes < 2)
untypedDst16[0] = bld.getSSA(2);
for (i = 0; i < 4; i++) {
typedDst[i] = bld.getSSA();
bld.mkMov(typedDst[i], su->getSrc(arg + i));
}
if (format->bgra) {
std::swap(typedDst[0], typedDst[2]);
}
// Pack each component into the untyped dsts.
int bits = 0;
for (int i = 0; i < format->components; bits += format->bits[i], i++) {
// Un-normalize / convert as necessary
if (format->type == UNORM)
bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f * ((1 << format->bits[i]) - 1)));
else if (format->type == SNORM)
bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f * ((1 << (format->bits[i] - 1)) - 1)));
// There is nothing to convert/pack for 32-bit values
if (format->bits[i] == 32) {
bld.mkMov(untypedDst[i], typedDst[i]);
continue;
}
// The remainder of the cases will naturally want to deal in 16-bit
// registers. We will put these into untypedDst16 and then merge them
// together later.
if (format->type == FLOAT && format->bits[i] < 16) {
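// Inverse of the load path: convert to F16, then shift the (sign-less)
// exponent and mantissa down into the low bits.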
bld.mkCvt(OP_CVT, TYPE_F16, untypedDst16[i], TYPE_F32, typedDst[i]);
bld.mkOp2(OP_SHR, TYPE_U16, untypedDst16[i], untypedDst16[i], bld.loadImm(NULL, (uint16_t)(15 - format->bits[i])));
// For odd bit sizes, it's easier to pack it into the final
// destination directly.
Value *tmp = bld.getSSA();
bld.mkCvt(OP_CVT, TYPE_U32, tmp, TYPE_U16, untypedDst16[i]);
if (i == 0) {
untypedDst[0] = tmp;
} else {
bld.mkOp2(OP_SHL, TYPE_U32, tmp, tmp, bld.loadImm(NULL, bits));
bld.mkOp2(OP_OR, TYPE_U32, untypedDst[0], untypedDst[0], tmp);
}
} else if (format->bits[i] == 16) {
// We can always convert the shader value into the packed value
// directly here
bld.mkCvt(OP_CVT, getPackedType(format, i), untypedDst16[i],
getShaderType(format->type), typedDst[i]);
} else if (format->bits[i] < 16) {
DataType packedType = getPackedType(format, i);
DataType shaderType = getShaderType(format->type);
// We can't convert F32 to U8/S8 directly, so go to U16/S16 first.
if (shaderType == TYPE_F32 && typeSizeof(packedType) == 1) {
packedType = format->type == SNORM ? TYPE_S16 : TYPE_U16;
}
bld.mkCvt(OP_CVT, packedType, untypedDst16[i], shaderType, typedDst[i]);
// TODO: clamp for 10- and 2-bit sizes. Also, due to the oddness of
// the size, it's easier to dump them into a 32-bit value and OR
// everything later.
if (format->bits[i] != 8) {
// Restrict value to the appropriate bits (although maybe supposed
// to clamp instead?)
bld.mkOp2(OP_AND, TYPE_U16, untypedDst16[i], untypedDst16[i], bld.loadImm(NULL, (uint16_t)((1 << format->bits[i]) - 1)));
// And merge into final packed value
Value *tmp = bld.getSSA();
bld.mkCvt(OP_CVT, TYPE_U32, tmp, TYPE_U16, untypedDst16[i]);
if (i == 0) {
untypedDst[0] = tmp;
} else {
bld.mkOp2(OP_SHL, TYPE_U32, tmp, tmp, bld.loadImm(NULL, bits));
bld.mkOp2(OP_OR, TYPE_U32, untypedDst[0], untypedDst[0], tmp);
}
} else if (i & 1) {
// Shift the 8-bit value up (so that it can be OR'd later)
bld.mkOp2(OP_SHL, TYPE_U16, untypedDst16[i], untypedDst16[i], bld.loadImm(NULL, (uint16_t)(bits % 16)));
} else if (packedType != TYPE_U8) {
// S8 (or the *16 if converted from float) will all have high bits
// set, so AND them out.
bld.mkOp2(OP_AND, TYPE_U16, untypedDst16[i], untypedDst16[i], bld.loadImm(NULL, (uint16_t)0xff));
}
}
}
// OR pairs of 8-bit values together (into the even value)
if (format->bits[0] == 8) {
for (i = 0; i < 2 && untypedDst16[2 * i] && untypedDst16[2 * i + 1]; i++)
bld.mkOp2(OP_OR, TYPE_U16, untypedDst16[2 * i], untypedDst16[2 * i], untypedDst16[2 * i + 1]);
}
// We'll always want to have at least a 32-bit source register for the store
Instruction *merge = bld.mkOp(OP_MERGE, bytes < 4 ? TYPE_U32 : ty, bld.getSSA(bytes < 4 ? 4 : bytes));
if (format->bits[0] == 32) {
for (i = 0; i < 4 && untypedDst[i]; i++)
merge->setSrc(i, untypedDst[i]);
} else if (format->bits[0] == 16) {
for (i = 0; i < 4 && untypedDst16[i]; i++)
merge->setSrc(i, untypedDst16[i]);
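// If only one 16-bit value was produced, pad the merge with an undefined
// half-register so it still yields a full dword.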
if (i == 1)
merge->setSrc(i, bld.getSSA(2));
} else if (format->bits[0] == 8) {
for (i = 0; i < 2 && untypedDst16[2 * i]; i++)
merge->setSrc(i, untypedDst16[2 * i]);
if (i == 1)
merge->setSrc(i, bld.getSSA(2));
} else {
merge->setSrc(0, untypedDst[0]);
}
bld.mkStore(OP_STORE, ty, bld.mkSymbol(FILE_MEMORY_GLOBAL, slot, TYPE_U32, 0), coord, merge->getDef(0));
bld.getBB()->remove(su);
return true;
}
bool
NV50LoweringPreSSA::handlePFETCH(Instruction *i)
{
@@ -1507,6 +2075,16 @@ NV50LoweringPreSSA::visit(Instruction *i)
case OP_ATOM:
case OP_STORE:
return handleLDST(i);
case OP_SULDP:
return handleSULDP(i->asTex());
case OP_SUSTP:
return handleSUSTP(i->asTex());
case OP_SUREDP:
return handleSUREDP(i->asTex());
case OP_SUQ:
return handleSUQ(i->asTex());
case OP_BUFQ:
return handleBUFQ(i);
case OP_RDSV:
return handleRDSV(i);
case OP_WRSV: