radeonsi/gfx10: fix overflow and primitive queries

This aligns the offsets to match the memory layout of the query buffer
defined by gfx10_sh_query_buffer_mem and calls si_launch_grid_internal
to flush caches and wait for completion of shaders prior to retrieving
results.
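
For reference, the per-query slot layout these offsets follow looks roughly like the sketch below. It is reconstructed from the offsets used in the patch; field names are illustrative, not the verbatim Mesa definition (see gfx10_sh_query_buffer_mem in gfx10_query.c for the authoritative one).

    #include <stdint.h>

    /* Sketch of one query slot, inferred from the offsets used in this patch;
     * field names are illustrative, not the verbatim Mesa definition. */
    struct query_slot_sketch {
       struct {
          uint64_t start[2];   /* first two u64s per stream; not read by the result path */
          uint64_t generated;  /* at 2 * sizeof(uint64_t) within the stream entry */
          uint64_t emitted;    /* at 3 * sizeof(uint64_t) within the stream entry */
       } stream[4];            /* 32 bytes per stream, selected by 4 * sizeof(uint64_t) * stream */
       uint32_t fence;         /* at byte 128 == sizeof(stream), polled before results are read */
       uint32_t pad[31];       /* slot stride: 256 bytes */
    };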

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7181>
Author:    Indrajit Kumar Das
Date:      2020-10-16 10:09:02 +05:30
Committed: Marge Bot
Parent:    fd4016f978
Commit:    5d14562da8
4 changed files with 28 additions and 23 deletions


@@ -360,11 +360,11 @@ static void gfx10_sh_query_get_result_resource(struct si_context *sctx, struct s
if (index >= 0) {
switch (query->b.type) {
case PIPE_QUERY_PRIMITIVES_GENERATED:
-consts.offset = sizeof(uint32_t) * query->stream;
+consts.offset = 4 * sizeof(uint64_t) * query->stream + 2 * sizeof(uint64_t);
consts.config = 0;
break;
case PIPE_QUERY_PRIMITIVES_EMITTED:
-consts.offset = sizeof(uint32_t) * (4 + query->stream);
+consts.offset = 4 * sizeof(uint64_t) * query->stream + 3 * sizeof(uint64_t);
consts.config = 0;
break;
case PIPE_QUERY_SO_STATISTICS:
@@ -372,7 +372,7 @@ static void gfx10_sh_query_get_result_resource(struct si_context *sctx, struct s
consts.config = 0;
break;
case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
-consts.offset = sizeof(uint32_t) * query->stream;
+consts.offset = 4 * sizeof(uint64_t) * query->stream;
consts.config = 2;
break;
case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
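
As a quick sanity check of the new expressions (illustrative arithmetic only, not code from the patch): each stream entry holds four 64-bit counters, so for stream 1 the byte offsets work out as below.

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
       unsigned stream = 1; /* example stream index */
       /* PIPE_QUERY_PRIMITIVES_GENERATED: stream base plus two u64 counters */
       printf("generated: %zu\n", 4 * sizeof(uint64_t) * stream + 2 * sizeof(uint64_t)); /* 48 */
       /* PIPE_QUERY_PRIMITIVES_EMITTED: one u64 further */
       printf("emitted:   %zu\n", 4 * sizeof(uint64_t) * stream + 3 * sizeof(uint64_t)); /* 56 */
       /* PIPE_QUERY_SO_OVERFLOW_PREDICATE: start of the whole stream entry */
       printf("overflow:  %zu\n", 4 * sizeof(uint64_t) * stream);                        /* 32 */
       return 0;
    }
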
@@ -454,8 +454,9 @@ static void gfx10_sh_query_get_result_resource(struct si_context *sctx, struct s
si_cp_wait_mem(sctx, sctx->gfx_cs, va, 0x00000001, 0x00000001, 0);
}
-sctx->b.launch_grid(&sctx->b, &grid);
-sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
+void *saved_cs = sctx->cs_shader_state.program;
+si_launch_grid_internal((struct si_context *)&sctx->b, &grid, saved_cs,
+   SI_CS_WAIT_FOR_IDLE | SI_CS_PARTIAL_FLUSH_DISABLE);
if (qbuf == query->last)
break;


@@ -60,15 +60,13 @@ unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher,
}
}
-#define SI_CS_IMAGE_OP (1 << 0)
-#define SI_CS_WAIT_FOR_IDLE (1 << 1)
-#define SI_CS_RENDER_COND_ENABLE (1 << 2)
-static void si_launch_grid_internal(struct si_context *sctx, struct pipe_grid_info *info,
+void si_launch_grid_internal(struct si_context *sctx, struct pipe_grid_info *info,
void *restore_cs, unsigned flags)
{
/* Wait for previous shaders to finish. */
-sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_PS_PARTIAL_FLUSH;
+sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
+if (!(flags & SI_CS_PARTIAL_FLUSH_DISABLE))
+   sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
/* Invalidate L0-L1 caches. */
/* sL0 is never invalidated, because src resources don't use it. */
sctx->flags |= SI_CONTEXT_INV_VCACHE;
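
In other words, the helper still issues the PS partial flush and cache invalidation unconditionally, but callers can now skip the CS partial flush before the dispatch. A minimal sketch of how the query path above combines the flags (flag values copied from the si_pipe.h hunk below; the rationale is an interpretation, not quoted driver code):

    /* Flag values as declared in the si_pipe.h hunk below. */
    #define SI_CS_IMAGE_OP              (1 << 0)
    #define SI_CS_WAIT_FOR_IDLE         (1 << 1)
    #define SI_CS_RENDER_COND_ENABLE    (1 << 2)
    #define SI_CS_PARTIAL_FLUSH_DISABLE (1 << 3)

    /* The query result path asks the helper to wait for the dispatched shader
     * and flush caches before results are read back, but skips the pre-dispatch
     * CS partial flush, since it already waited on the query fence via
     * si_cp_wait_mem() (interpretation, not quoted driver code). */
    unsigned query_flags = SI_CS_WAIT_FOR_IDLE | SI_CS_PARTIAL_FLUSH_DISABLE;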


@@ -1339,8 +1339,15 @@ bool vi_dcc_clear_level(struct si_context *sctx, struct si_texture *tex, unsigne
void si_init_clear_functions(struct si_context *sctx);
/* si_compute_blit.c */
+#define SI_CS_IMAGE_OP (1 << 0)
+#define SI_CS_WAIT_FOR_IDLE (1 << 1)
+#define SI_CS_RENDER_COND_ENABLE (1 << 2)
+#define SI_CS_PARTIAL_FLUSH_DISABLE (1 << 3)
unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher,
enum si_cache_policy cache_policy);
+void si_launch_grid_internal(struct si_context *sctx, struct pipe_grid_info *info,
+   void *restore_cs, unsigned flags);
void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, uint64_t offset,
uint64_t size, uint32_t *clear_value, uint32_t clear_value_size,
enum si_coherency coher, bool force_cpdma);


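The immediates in the updated compute shader below double as layout constants for the 256-byte query slot; the values used by the loads and address increments map to byte offsets roughly as follows (an annotated reading of the TGSI, with hypothetical names):

    /* Hypothetical names for the immediates used by the shader below. */
    enum {
       GENERATED_OFFSET  = 16,  /* IMM[2].x: 2 * sizeof(uint64_t) into a stream entry */
       STREAM_STRIDE     = 32,  /* IMM[2].y: sizeof(stream[0]), added per stream iteration */
       FENCE_OFFSET      = 128, /* IMM[2].w: sizeof(stream), where the fence is read */
       QUERY_SLOT_STRIDE = 256, /* IMM[0].z: sizeof(gfx10_sh_query_buffer_mem), per-slot step */
    };
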
@@ -816,7 +816,7 @@ void *gfx10_create_sh_query_result_cs(struct si_context *sctx)
"DCL BUFFER[2]\n"
"DCL CONST[0][0..0]\n"
"DCL TEMP[0..5]\n"
-"IMM[0] UINT32 {0, 7, 0, 4294967295}\n"
+"IMM[0] UINT32 {0, 7, 256, 4294967295}\n"
"IMM[1] UINT32 {1, 2, 4, 8}\n"
"IMM[2] UINT32 {16, 32, 64, 128}\n"
@@ -855,13 +855,13 @@ void *gfx10_create_sh_query_result_cs(struct si_context *sctx)
"UADD TEMP[1].x, TEMP[1].xxxx, IMM[0].wwww\n"
/*
-fence = buffer[0]@(base_offset + 32);
+fence = buffer[0]@(base_offset + sizeof(gfx10_sh_query_buffer_mem.stream));
if (!fence) {
acc_missing = ~0u;
break;
}
*/
-"UADD TEMP[5].x, TEMP[1].yyyy, IMM[2].yyyy\n"
+"UADD TEMP[5].x, TEMP[1].yyyy, IMM[2].wwww\n"
"LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n"
"USEQ TEMP[5], TEMP[5].xxxx, IMM[0].xxxx\n"
"UIF TEMP[5]\n"
@@ -897,22 +897,21 @@ void *gfx10_create_sh_query_result_cs(struct si_context *sctx)
/*
do {
-generated = buffer[0]@stream_offset;
-emitted = buffer[0]@(stream_offset + 16);
+generated = buffer[0]@(stream_offset + 2 * sizeof(uint64_t));
+emitted = buffer[0]@(stream_offset + 3 * sizeof(uint64_t));
if (generated != emitted) {
acc_result = 1;
result_remaining = 0;
break;
}
-stream_offset += 4;
+stream_offset += sizeof(gfx10_sh_query_buffer_mem.stream[0]);
} while (--count);
*/
"BGNLOOP\n"
"UADD TEMP[5].x, TEMP[2].xxxx, IMM[2].xxxx\n"
-"LOAD TEMP[4].x, BUFFER[0], TEMP[2].xxxx\n"
-"LOAD TEMP[4].y, BUFFER[0], TEMP[5].xxxx\n"
-"USNE TEMP[5], TEMP[4].xxxx, TEMP[4].yyyy\n"
+"LOAD TEMP[4].xyzw, BUFFER[0], TEMP[5].xxxx\n"
+"USNE TEMP[5], TEMP[4].xyxy, TEMP[4].zwzw\n"
"UIF TEMP[5]\n"
"MOV TEMP[0].x, IMM[1].xxxx\n"
"MOV TEMP[1].y, IMM[0].xxxx\n"
@@ -924,15 +923,15 @@ void *gfx10_create_sh_query_result_cs(struct si_context *sctx)
"UIF TEMP[5]\n"
"BRK\n"
"ENDIF\n"
-"UADD TEMP[2].x, TEMP[2].xxxx, IMM[1].zzzz\n"
+"UADD TEMP[2].x, TEMP[2].xxxx, IMM[2].yyyy\n"
"ENDLOOP\n"
"ENDIF\n"
/*
-base_offset += 64;
+base_offset += sizeof(gfx10_sh_query_buffer_mem);
} // end outer loop
*/
-"UADD TEMP[1].y, TEMP[1].yyyy, IMM[2].zzzz\n"
+"UADD TEMP[1].y, TEMP[1].yyyy, IMM[0].zzzz\n"
"ENDLOOP\n"
/*