nv50: add remapping of buffers/images into unified space

This allows us to use up to 15 images or buffers in total (not 15 of each), since both kinds now share one set of global memory slots. GL supports the concept of combined resource maximums though.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Acked-by: Pierre Moreau <dev@pmoreau.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10164>
2021-03-19 20:10:24 -04:00 · 2021-03-19 20:10:24 -04:00 · f451854f39
parent 58d47ca324
commit f451854f39
8 changed files with 124 additions and 66 deletions
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
@ -26,7 +26,6 @@

 extern "C" {
 #include "nouveau_debug.h"
-#include "nv50/nv50_program.h"
 }

 namespace nv50_ir {
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
@ -184,6 +184,13 @@ struct nv50_ir_prog_info_out
         bool readsSampleLocations  : 1;
         bool separateFragData      : 1;
      } fp;
+      struct { /* compute-shader properties; (de)serialized for PIPE_SHADER_COMPUTE */
+         struct {
+            unsigned valid : 1; /* set for each slot handed out while scanning declarations */
+            unsigned image : 1; /* 1 = TGSI image declaration, 0 = TGSI buffer declaration */
+            unsigned slot  : 6; /* TGSI declaration index of the image/buffer using this entry */
+         } gmem[16]; /* nv50 only */
+      } cp;
   } prop;

   struct {
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@ -1012,6 +1012,9 @@ public:
   std::map<int, std::pair<int, int> > tempArrayInfo;
   std::vector<int> tempArrayId;

+   std::map<int, int> bufferIds;
+   std::map<int, int> imageIds;
+
   int clipVertexOutput;

   struct TextureView {
@ -1041,6 +1044,7 @@ public:
   } immd;

 private:
+   int gmemSlot;
   nv50_ir::Program *prog;
   int inferSysValDirection(unsigned sn) const;
   bool scanDeclaration(const struct tgsi_full_declaration *);
@ -1056,7 +1060,8 @@ private:

 Source::Source(struct nv50_ir_prog_info *info, struct nv50_ir_prog_info_out *info_out,
               nv50_ir::Program *prog)
-:  insns(NULL), info(info), info_out(info_out), clipVertexOutput(-1), prog(prog)
+:  insns(NULL), info(info), info_out(info_out), clipVertexOutput(-1),
+   gmemSlot(0), prog(prog)
 {
   tokens = (const struct tgsi_token *)info->bin.source;

@ -1437,12 +1442,27 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
   case TGSI_FILE_BUFFER:
      for (i = first; i <= last; ++i)
         bufferAtomics[i] = decl->Declaration.Atomic;
+      if (info->type == PIPE_SHADER_COMPUTE && info->target < NVISA_GF100_CHIPSET) {
+         for (i = first; i <= last; i++) {
+            bufferIds.insert(std::make_pair(i, gmemSlot));
+            info_out->prop.cp.gmem[gmemSlot++] = {.valid = 1, .slot = i};
+            assert(gmemSlot < 16);
+         }
+      }
+      break;
+   case TGSI_FILE_IMAGE:
+      if (info->type == PIPE_SHADER_COMPUTE && info->target < NVISA_GF100_CHIPSET) {
+         for (i = first; i <= last; i++) {
+            imageIds.insert(std::make_pair(i, gmemSlot));
+            info_out->prop.cp.gmem[gmemSlot++] = {.valid = 1, .image = 1, .slot = i};
+            assert(gmemSlot < 16);
+         }
+      }
      break;
   case TGSI_FILE_ADDRESS:
   case TGSI_FILE_CONSTANT:
   case TGSI_FILE_IMMEDIATE:
   case TGSI_FILE_SAMPLER:
-   case TGSI_FILE_IMAGE:
      break;
   default:
      ERROR("unhandled TGSI_FILE %d\n", decl->Declaration.File);
@ -1677,6 +1697,8 @@ private:

   // Symbol *getResourceBase(int r);
   void getImageCoords(std::vector<Value *>&, int s);
+   int remapImageId(int);
+   int remapBufferId(int);

   void handleLOAD(Value *dst0[4]);
   void handleSTORE();
@ -2610,12 +2632,30 @@ Converter::getImageCoords(std::vector<Value *> &coords, int s)
      coords.push_back(fetchSrc(s, 3));
 }

+int
+Converter::remapBufferId(int id) /* translate a TGSI buffer index to its unified gmem slot */
+{
+   std::map<int, int>::const_iterator it = code->bufferIds.find(id);
+   if (it != code->bufferIds.end())
+      return it->second; /* slot recorded by Source::scanDeclaration */
+   return id; /* no remap entry for this index: pass it through unchanged */
+}
+
+int
+Converter::remapImageId(int id) /* translate a TGSI image index to its unified gmem slot */
+{
+   std::map<int, int>::const_iterator it = code->imageIds.find(id);
+   if (it != code->imageIds.end())
+      return it->second; /* slot recorded by Source::scanDeclaration */
+   return id; /* no remap entry for this index: pass it through unchanged */
+}
+
 // For raw loads, granularity is 4 byte.
 // Usage of the texture read mask on OP_SULDP is not allowed.
 void
 Converter::handleLOAD(Value *dst0[4])
 {
-   const int r = tgsi.getSrc(0).getIndex(0);
+   int r = tgsi.getSrc(0).getIndex(0);
   int c;
   std::vector<Value *> off, src, ldv, def;
   Value *ind = NULL;
@ -2625,6 +2665,8 @@ Converter::handleLOAD(Value *dst0[4])

   switch (tgsi.getSrc(0).getFile()) {
   case TGSI_FILE_BUFFER:
+      r = remapBufferId(r);
+      /* fallthrough */
   case TGSI_FILE_MEMORY:
      for (c = 0; c < 4; ++c) {
         if (!dst0[c])
@ -2648,7 +2690,7 @@ Converter::handleLOAD(Value *dst0[4])

         Instruction *ld = mkLoad(TYPE_U32, dst0[c], sym, off);
         if (tgsi.getSrc(0).getFile() == TGSI_FILE_BUFFER &&
-             code->bufferAtomics[r])
+             code->bufferAtomics[tgsi.getSrc(0).getIndex(0)])
            ld->cache = nv50_ir::CACHE_CG;
         else
            ld->cache = tgsi.getCacheMode();
@ -2657,6 +2699,7 @@ Converter::handleLOAD(Value *dst0[4])
      }
      break;
   default: {
+      r = remapImageId(r);
      getImageCoords(off, 1);
      def.resize(4);

@ -2764,7 +2807,7 @@ Converter::handleLOAD(Value *dst0[4])
 void
 Converter::handleSTORE()
 {
-   const int r = tgsi.getDst(0).getIndex(0);
+   int r = tgsi.getDst(0).getIndex(0);
   int c;
   std::vector<Value *> off, src, dummy;
   Value *ind = NULL;
@ -2774,6 +2817,8 @@ Converter::handleSTORE()

   switch (tgsi.getDst(0).getFile()) {
   case TGSI_FILE_BUFFER:
+      r = remapBufferId(r);
+      /* fallthrough */
   case TGSI_FILE_MEMORY:
      for (c = 0; c < 4; ++c) {
         if (!(tgsi.getDst(0).getMask() & (1 << c)))
@ -2798,6 +2843,7 @@ Converter::handleSTORE()
      }
      break;
   default: {
+      r = remapImageId(r);
      getImageCoords(off, 0);
      src = off;

@ -2881,7 +2927,7 @@ Converter::handleSTORE()
 void
 Converter::handleATOM(Value *dst0[4], DataType ty, uint16_t subOp)
 {
-   const int r = tgsi.getSrc(0).getIndex(0);
+   int r = tgsi.getSrc(0).getIndex(0);
   std::vector<Value *> srcv;
   std::vector<Value *> defv;
   LValue *dst = getScratch();
@ -2892,6 +2938,8 @@ Converter::handleATOM(Value *dst0[4], DataType ty, uint16_t subOp)

   switch (tgsi.getSrc(0).getFile()) {
   case TGSI_FILE_BUFFER:
+      r = remapBufferId(r);
+      /* fallthrough */
   case TGSI_FILE_MEMORY:
      for (int c = 0; c < 4; ++c) {
         if (!dst0[c])
@ -2920,6 +2968,7 @@ Converter::handleATOM(Value *dst0[4], DataType ty, uint16_t subOp)
            dst0[c] = dst; // not equal to rDst so handleInstruction will do mkMov
      break;
   default: {
+      r = remapImageId(r);
      getImageCoords(srcv, 1);
      defv.push_back(dst);
      srcv.push_back(fetchSrc(2, 0));
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
@ -1117,7 +1117,7 @@ NV50LoweringPreSSA::handleSUQ(TexInstruction *suq)
   const int dim = suq->tex.target.getDim();
   const int arg = dim + (suq->tex.target.isArray() || suq->tex.target.isCube());
   int mask = suq->tex.mask;
-   int slot = suq->tex.r + 7;
+   int slot = suq->tex.r;
   int c, d;

   for (c = 0, d = 0; c < 3; ++c, mask >>= 1) {
@ -1661,7 +1661,7 @@ getShaderType(const ImgType type) {
 Value *
 NV50LoweringPreSSA::processSurfaceCoords(TexInstruction *su)
 {
-   const int slot = su->tex.r + 7;
+   const int slot = su->tex.r;
   const int dim = su->tex.target.getDim();
   const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());

@ -1796,7 +1796,7 @@ NV50LoweringPreSSA::processSurfaceCoords(TexInstruction *su)
 bool
 NV50LoweringPreSSA::handleSULDP(TexInstruction *su)
 {
-   const int slot = su->tex.r + 7;
+   const int slot = su->tex.r;
   assert(!su->getIndirectR());

   bld.setPosition(su, false);
@ -1917,7 +1917,7 @@ NV50LoweringPreSSA::handleSULDP(TexInstruction *su)
 bool
 NV50LoweringPreSSA::handleSUREDP(TexInstruction *su)
 {
-   const int slot = su->tex.r + 7;
+   const int slot = su->tex.r;
   const int dim = su->tex.target.getDim();
   const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
   assert(!su->getIndirectR());
@ -1943,7 +1943,7 @@ NV50LoweringPreSSA::handleSUREDP(TexInstruction *su)
 bool
 NV50LoweringPreSSA::handleSUSTP(TexInstruction *su)
 {
-   const int slot = su->tex.r + 7;
+   const int slot = su->tex.r;
   const int dim = su->tex.target.getDim();
   const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
   assert(!su->getIndirectR());
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_serialize.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_serialize.cpp
@ -142,6 +142,9 @@ nv50_ir_prog_info_out_serialize(struct blob *blob,
      case PIPE_SHADER_FRAGMENT:
         blob_write_bytes(blob, &info_out->prop.fp, sizeof(info_out->prop.fp));
         break;
+      case PIPE_SHADER_COMPUTE:
+         blob_write_bytes(blob, &info_out->prop.cp, sizeof(info_out->prop.cp));
+         break;
      default:
         break;
   }
@ -259,6 +262,9 @@ nv50_ir_prog_info_out_deserialize(void *data, size_t size, size_t offset,
      case PIPE_SHADER_FRAGMENT:
         blob_copy_bytes(&reader, &info_out->prop.fp, sizeof(info_out->prop.fp));
         break;
+      case PIPE_SHADER_COMPUTE:
+         blob_copy_bytes(&reader, &info_out->prop.cp, sizeof(info_out->prop.cp));
+         break;
      default:
         break;
   }
--- a/src/gallium/drivers/nouveau/nv50/nv50_compute.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_compute.c
@ -275,46 +275,6 @@ nv50_compute_validate_constbufs(struct nv50_context *nv50)
   nv50_compute_invalidate_constbufs(nv50);
 }

-static void
-nv50_compute_validate_buffers(struct nv50_context *nv50)
-{
-   struct nouveau_pushbuf *push = nv50->base.pushbuf;
-   int i;
-
-   for (i = 0; i < 7; i++) {
-      BEGIN_NV04(push, NV50_CP(GLOBAL(i)), 5);
-      unsigned width;
-      if (nv50->buffers[i].buffer) {
-         struct nv04_resource *res =
-            nv04_resource(nv50->buffers[i].buffer);
-         PUSH_DATAh(push, res->address + nv50->buffers[i].buffer_offset);
-         PUSH_DATA (push, res->address + nv50->buffers[i].buffer_offset);
-         PUSH_DATA (push, 0); /* pitch? */
-         PUSH_DATA (push, ALIGN(nv50->buffers[i].buffer_size, 256) - 1);
-         PUSH_DATA (push, NV50_COMPUTE_GLOBAL_MODE_LINEAR);
-         BCTX_REFN(nv50->bufctx_cp, CP_BUF, res, RDWR);
-         util_range_add(&res->base, &res->valid_buffer_range,
-                        nv50->buffers[i].buffer_offset,
-                        nv50->buffers[i].buffer_offset +
-                        nv50->buffers[i].buffer_size);
-         width = nv50->buffers[i].buffer_size;
-      } else {
-         PUSH_DATA (push, 0);
-         PUSH_DATA (push, 0);
-         PUSH_DATA (push, 0);
-         PUSH_DATA (push, 0);
-         PUSH_DATA (push, 0);
-         width = 0;
-      }
-
-      PUSH_SPACE(push, 1 + 3);
-      BEGIN_NV04(push, NV50_CP(CB_ADDR), 1);
-      PUSH_DATA (push, NV50_CB_AUX_BUF_INFO(i) << (8 - 2) | NV50_CB_AUX);
-      BEGIN_NI04(push, NV50_CP(CB_DATA(0)), 1);
-      PUSH_DATA (push, width);
-   }
-}
-
 static void
 nv50_get_surface_dims(const struct pipe_image_view *view,
                      int *width, int *height, int *depth)
@ -416,13 +376,34 @@ nv50_compute_validate_surfaces(struct nv50_context *nv50)
   struct nouveau_pushbuf *push = nv50->base.pushbuf;
   int i;

-   for (i = 0; i < 8; i++) {
-      struct pipe_image_view *view = &nv50->images[i];
+   for (i = 0; i < NV50_MAX_GLOBALS - 1; i++) {
+      struct nv50_gmem_state *gmem = &nv50->compprog->cp.gmem[i];
      int width, height, depth;
      uint64_t address = 0;

-      BEGIN_NV04(push, NV50_CP(GLOBAL(7 + i)), 5);
-      if (view->resource) {
+      BEGIN_NV04(push, NV50_CP(GLOBAL(i)), 5);
+
+      if (gmem->valid && !gmem->image && nv50->buffers[gmem->slot].buffer) {
+         struct pipe_shader_buffer *buffer = &nv50->buffers[gmem->slot];
+         struct nv04_resource *res = nv04_resource(buffer->buffer);
+         PUSH_DATAh(push, res->address + buffer->buffer_offset);
+         PUSH_DATA (push, res->address + buffer->buffer_offset);
+         PUSH_DATA (push, 0); /* pitch? */
+         PUSH_DATA (push, ALIGN(buffer->buffer_size, 256) - 1);
+         PUSH_DATA (push, NV50_COMPUTE_GLOBAL_MODE_LINEAR);
+         BCTX_REFN(nv50->bufctx_cp, CP_BUF, res, RDWR);
+         util_range_add(&res->base, &res->valid_buffer_range,
+                        buffer->buffer_offset,
+                        buffer->buffer_offset +
+                        buffer->buffer_size);
+
+         PUSH_SPACE(push, 1 + 3);
+         BEGIN_NV04(push, NV50_CP(CB_ADDR), 1);
+         PUSH_DATA (push, NV50_CB_AUX_BUF_INFO(i) << (8 - 2) | NV50_CB_AUX);
+         BEGIN_NI04(push, NV50_CP(CB_DATA(0)), 1);
+         PUSH_DATA (push, buffer->buffer_size);
+      } else if (gmem->valid && gmem->image && nv50->images[gmem->slot].resource) {
+         struct pipe_image_view *view = &nv50->images[gmem->slot];
         struct nv04_resource *res = nv04_resource(view->resource);

         /* get surface dimensions based on the target. */
@ -483,6 +464,12 @@ nv50_compute_validate_surfaces(struct nv50_context *nv50)
         }

         BCTX_REFN(nv50->bufctx_cp, CP_SUF, res, RDWR);
+
+         PUSH_SPACE(push, 12 + 3);
+         BEGIN_NV04(push, NV50_CP(CB_ADDR), 1);
+         PUSH_DATA (push, NV50_CB_AUX_BUF_INFO(i) << (8 - 2) | NV50_CB_AUX);
+         BEGIN_NI04(push, NV50_CP(CB_DATA(0)), 12);
+         nv50_set_surface_info(push, view, width, height, depth);
      } else {
         PUSH_DATA (push, 0);
         PUSH_DATA (push, 0);
@ -490,12 +477,6 @@ nv50_compute_validate_surfaces(struct nv50_context *nv50)
         PUSH_DATA (push, 0);
         PUSH_DATA (push, 0);
      }
-
-      PUSH_SPACE(push, 12 + 3);
-      BEGIN_NV04(push, NV50_CP(CB_ADDR), 1);
-      PUSH_DATA (push, NV50_CB_AUX_BUF_INFO(7 + i) << (8 - 2) | NV50_CB_AUX);
-      BEGIN_NI04(push, NV50_CP(CB_DATA(0)), 12);
-      nv50_set_surface_info(push, view, width, height, depth);
   }
 }

@ -518,8 +499,9 @@ static struct nv50_state_validate
 validate_list_cp[] = {
   { nv50_compprog_validate,              NV50_NEW_CP_PROGRAM     },
   { nv50_compute_validate_constbufs,     NV50_NEW_CP_CONSTBUF    },
-   { nv50_compute_validate_buffers,       NV50_NEW_CP_BUFFERS     },
-   { nv50_compute_validate_surfaces,      NV50_NEW_CP_SURFACES    },
+   { nv50_compute_validate_surfaces,      NV50_NEW_CP_SURFACES |
+                                          NV50_NEW_CP_BUFFERS  |
+                                          NV50_NEW_CP_PROGRAM     },
   { nv50_compute_validate_textures,      NV50_NEW_CP_TEXTURES    },
   { nv50_compute_validate_samplers,      NV50_NEW_CP_SAMPLERS    },
   { nv50_compute_validate_globals,       NV50_NEW_CP_GLOBALS     },
--- a/src/gallium/drivers/nouveau/nv50/nv50_program.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_program.c
@ -24,8 +24,8 @@

 #include "compiler/nir/nir.h"

-#include "nv50/nv50_program.h"
 #include "nv50/nv50_context.h"
+#include "nv50/nv50_program.h"

 #include "codegen/nv50_ir_driver.h"

@ -434,6 +434,15 @@ nv50_program_translate(struct nv50_program *prog, uint16_t chipset,
         break;
      }
      prog->gp.vert_count = CLAMP(info_out.prop.gp.maxVertices, 1, 1024);
+   } else
+   if (prog->type == PIPE_SHADER_COMPUTE) {
+      for (i = 0; i < NV50_MAX_GLOBALS; i++) {
+         prog->cp.gmem[i] = (struct nv50_gmem_state){
+            .valid = info_out.prop.cp.gmem[i].valid,
+            .image = info_out.prop.cp.gmem[i].image,
+            .slot  = info_out.prop.cp.gmem[i].slot
+         };
+      }
   }

   if (prog->pipe.stream_output.num_outputs)
--- a/src/gallium/drivers/nouveau/nv50/nv50_program.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_program.h
@ -26,7 +26,6 @@
 struct nv50_context;

 #include "pipe/p_state.h"
-#include "pipe/p_shader_tokens.h"

 struct nv50_varying {
   uint8_t id; /* tgsi index */
@ -49,6 +48,12 @@ struct nv50_stream_output_state
   uint8_t map[128];
 };

+struct nv50_gmem_state { /* element type of nv50_program::cp.gmem[], one per global slot */
+   unsigned valid : 1; /* whether there's something there */
+   unsigned image : 1; /* buffer or image */
+   unsigned slot  : 6; /* slot in the relevant resource arrays */
+};
+
 struct nv50_program {
   struct pipe_shader_state pipe;

@ -104,6 +109,7 @@ struct nv50_program {
   struct {
      uint32_t lmem_size; /* local memory (TGSI PRIVATE resource) size */
      uint32_t smem_size; /* shared memory (TGSI LOCAL resource) size */
+      struct nv50_gmem_state gmem[NV50_MAX_GLOBALS];
   } cp;

   bool mul_zero_wins;