nv50: add compute invocations counter

This is a purely software counter alongside the other hardware counters, for ease of use and consistency. However, we have to make room for it in the allocated query space. Use this opportunity to make the nv50 queries work like the nvc0 ones in terms of space allocation.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Pierre Moreau <dev@pmoreau.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10164>
2021-03-02 00:18:07 -05:00 · 2021-03-02 00:18:07 -05:00 · 58d47ca324
parent bd2f14a5ea
commit 58d47ca324
3 changed files with 29 additions and 10 deletions
--- a/src/gallium/drivers/nouveau/nv50/nv50_compute.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_compute.c
@@ -628,4 +628,7 @@ nv50_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)

   /* bind a compute shader clobbers fragment shader state */
   nv50->dirty_3d |= NV50_NEW_3D_FRAGPROG;
+
+   nv50->compute_invocations += info->block[0] * info->block[1] * info->block[2] *
+      info->grid[0] * info->grid[1] * info->grid[2];
 }
--- a/src/gallium/drivers/nouveau/nv50/nv50_context.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_context.h
@@ -225,6 +225,8 @@ struct nv50_context {
   uint16_t images_valid;

   struct util_dynarray global_residents;
+
+   uint64_t compute_invocations;
 };

 static inline struct nv50_context *
--- a/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c
@@ -174,14 +174,15 @@ nv50_hw_begin_query(struct nv50_context *nv50, struct nv50_query *q)
      nv50_hw_query_get(push, q, 0x30, 0x06805002);
      break;
   case PIPE_QUERY_PIPELINE_STATISTICS:
-      nv50_hw_query_get(push, q, 0x80, 0x00801002); /* VFETCH, VERTICES */
-      nv50_hw_query_get(push, q, 0x90, 0x01801002); /* VFETCH, PRIMS */
-      nv50_hw_query_get(push, q, 0xa0, 0x02802002); /* VP, LAUNCHES */
-      nv50_hw_query_get(push, q, 0xb0, 0x03806002); /* GP, LAUNCHES */
-      nv50_hw_query_get(push, q, 0xc0, 0x04806002); /* GP, PRIMS_OUT */
-      nv50_hw_query_get(push, q, 0xd0, 0x07804002); /* RAST, PRIMS_IN */
-      nv50_hw_query_get(push, q, 0xe0, 0x08804002); /* RAST, PRIMS_OUT */
-      nv50_hw_query_get(push, q, 0xf0, 0x0980a002); /* ROP, PIXELS */
+      nv50_hw_query_get(push, q, 0x90, 0x00801002); /* VFETCH, VERTICES */
+      nv50_hw_query_get(push, q, 0xa0, 0x01801002); /* VFETCH, PRIMS */
+      nv50_hw_query_get(push, q, 0xb0, 0x02802002); /* VP, LAUNCHES */
+      nv50_hw_query_get(push, q, 0xc0, 0x03806002); /* GP, LAUNCHES */
+      nv50_hw_query_get(push, q, 0xd0, 0x04806002); /* GP, PRIMS_OUT */
+      nv50_hw_query_get(push, q, 0xe0, 0x07804002); /* RAST, PRIMS_IN */
+      nv50_hw_query_get(push, q, 0xf0, 0x08804002); /* RAST, PRIMS_OUT */
+      nv50_hw_query_get(push, q, 0x100, 0x0980a002); /* ROP, PIXELS */
+      ((uint64_t *)hq->data)[2 * 0x11] = nv50->compute_invocations;
      break;
   case PIPE_QUERY_TIME_ELAPSED:
      nv50_hw_query_get(push, q, 0x10, 0x00005002);
@@ -237,6 +238,7 @@ nv50_hw_end_query(struct nv50_context *nv50, struct nv50_query *q)
      nv50_hw_query_get(push, q, 0x50, 0x07804002); /* RAST, PRIMS_IN */
      nv50_hw_query_get(push, q, 0x60, 0x08804002); /* RAST, PRIMS_OUT */
      nv50_hw_query_get(push, q, 0x70, 0x0980a002); /* ROP, PIXELS */
+      ((uint64_t *)hq->data)[2 * 0x8] = nv50->compute_invocations;
      break;
   case PIPE_QUERY_TIMESTAMP:
      hq->sequence++;
@@ -316,7 +318,8 @@ nv50_hw_get_query_result(struct nv50_context *nv50, struct nv50_query *q,
      break;
   case PIPE_QUERY_PIPELINE_STATISTICS:
      for (i = 0; i < 8; ++i)
-         res64[i] = data64[i * 2] - data64[16 + i * 2];
+         res64[i] = data64[i * 2] - data64[18 + i * 2];
+      result->pipeline_statistics.cs_invocations = data64[i * 2] - data64[18 + i * 2];
      break;
   case PIPE_QUERY_TIMESTAMP:
      res64[0] = data64[1];
@@ -351,6 +354,7 @@ nv50_hw_create_query(struct nv50_context *nv50, unsigned type, unsigned index)
 {
   struct nv50_hw_query *hq;
   struct nv50_query *q;
+   unsigned space = NV50_HW_QUERY_ALLOC_SPACE;

   hq = nv50_hw_sm_create_query(nv50, type);
   if (hq) {
@@ -380,15 +384,25 @@ nv50_hw_create_query(struct nv50_context *nv50, unsigned type, unsigned index)
      break;
   case PIPE_QUERY_PRIMITIVES_GENERATED:
   case PIPE_QUERY_PRIMITIVES_EMITTED:
+      hq->is64bit = true;
+      space = 32;
+      break;
   case PIPE_QUERY_SO_STATISTICS:
+      hq->is64bit = true;
+      space = 64;
+      break;
   case PIPE_QUERY_PIPELINE_STATISTICS:
      hq->is64bit = true;
+      space = 9 * 2 * 16; /* 9 values, start/end, 16-bytes each */
      break;
   case PIPE_QUERY_TIME_ELAPSED:
   case PIPE_QUERY_TIMESTAMP:
   case PIPE_QUERY_TIMESTAMP_DISJOINT:
   case PIPE_QUERY_GPU_FINISHED:
+      space = 32;
+      break;
   case NVA0_HW_QUERY_STREAM_OUTPUT_BUFFER_OFFSET:
+      space = 16;
      break;
   default:
      debug_printf("invalid query type: %u\n", type);
@@ -396,7 +410,7 @@ nv50_hw_create_query(struct nv50_context *nv50, unsigned type, unsigned index)
      return NULL;
   }

-   if (!nv50_hw_query_allocate(nv50, q, NV50_HW_QUERY_ALLOC_SPACE)) {
+   if (!nv50_hw_query_allocate(nv50, q, space)) {
      FREE(hq);
      return NULL;
   }