i965/drm: Add stall warnings when mapping or waiting on BOs.

This restores the performance warnings removed in:

    i965: Drop brw_bo_map[_gtt] wrappers which issue perf warnings.

but adds them for nearly all BO mapping, and also for wait_rendering.

Because we add this to the core bufmgr, we automatically get stall
warnings in all callers, unlike before where only a few callsites used
the wrappers that gave stall warnings.

We also do it a bit differently: we simply measure how long set_domain
takes (the part that stalls), and complain if it's more than 0.01 ms.
We don't bother calling brw_bo_busy(), and we don't measure the mmap
time (which doesn't stall).  This should be more accurate.
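
Condensed, the pattern from the set_domain() hunk below looks like this
(the warning only fires when perf_debug output is enabled, e.g. with
INTEL_DEBUG=perf; the BO name in the sample message is hypothetical):

    /* Negating the start time lets a later += get_time() produce the
     * elapsed time.  Only the ioctl is timed; it is the part that stalls.
     */
    double elapsed = unlikely(brw && brw->perf_debug) ? -get_time() : 0.0;
    drmIoctl(bo->bufmgr->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &sd);
    if (unlikely(brw && brw->perf_debug)) {
       elapsed += get_time();
       if (elapsed > 1e-5) /* 0.01ms */
          perf_debug("%s a busy \"%s\" BO stalled and took %.03f ms.\n",
                     action, bo->name, elapsed * 1000);
    }
    /* e.g.: CPU mapping a busy "miptree" BO stalled and took 1.500 ms. */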

Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Author: Kenneth Graunke
Date:   2017-04-09 23:14:56 -07:00
Parent: f053ee78ed
Commit: bd84252be6

17 changed files with 68 additions and 55 deletions

src/mesa/drivers/dri/i965/brw_bufmgr.c

@@ -64,6 +64,7 @@
 #include "util/hash_table.h"
 #include "util/list.h"
 #include "brw_bufmgr.h"
+#include "brw_context.h"
 #include "string.h"
 #include "i915_drm.h"
@@ -640,7 +641,8 @@ brw_bo_unreference(struct brw_bo *bo)
 }
 
 static void
-set_domain(struct brw_bo *bo, uint32_t read_domains, uint32_t write_domain)
+set_domain(struct brw_context *brw, const char *action,
+           struct brw_bo *bo, uint32_t read_domains, uint32_t write_domain)
 {
    struct drm_i915_gem_set_domain sd = {
       .handle = bo->gem_handle,
@@ -648,15 +650,24 @@ set_domain(struct brw_bo *bo, uint32_t read_domains, uint32_t write_domain)
       .write_domain = write_domain,
    };
 
+   double elapsed = unlikely(brw && brw->perf_debug) ? -get_time() : 0.0;
+
    if (drmIoctl(bo->bufmgr->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &sd) != 0) {
       DBG("%s:%d: Error setting memory domains %d (%08x %08x): %s.\n",
           __FILE__, __LINE__, bo->gem_handle, read_domains, write_domain,
           strerror(errno));
    }
+
+   if (unlikely(brw && brw->perf_debug)) {
+      elapsed += get_time();
+      if (elapsed > 1e-5) /* 0.01ms */
+         perf_debug("%s a busy \"%s\" BO stalled and took %.03f ms.\n",
+                    action, bo->name, elapsed * 1000);
+   }
 }
 
 int
-brw_bo_map(struct brw_bo *bo, int write_enable)
+brw_bo_map(struct brw_context *brw, struct brw_bo *bo, int write_enable)
 {
    struct brw_bufmgr *bufmgr = bo->bufmgr;
    int ret;
@@ -687,7 +698,7 @@ brw_bo_map(struct brw_bo *bo, int write_enable)
    DBG("bo_map: %d (%s) -> %p\n", bo->gem_handle, bo->name, bo->mem_virtual);
    bo->virtual = bo->mem_virtual;
 
-   set_domain(bo, I915_GEM_DOMAIN_CPU,
+   set_domain(brw, "CPU mapping", bo, I915_GEM_DOMAIN_CPU,
               write_enable ? I915_GEM_DOMAIN_CPU : 0);
 
    bo_mark_mmaps_incoherent(bo);
@@ -744,7 +755,7 @@ map_gtt(struct brw_bo *bo)
 }
 
 int
-brw_bo_map_gtt(struct brw_bo *bo)
+brw_bo_map_gtt(struct brw_context *brw, struct brw_bo *bo)
 {
    struct brw_bufmgr *bufmgr = bo->bufmgr;
    int ret;
@@ -766,7 +777,8 @@ brw_bo_map_gtt(struct brw_bo *bo)
     * tell it when we're about to use things if we had done
     * rendering and it still happens to be bound to the GTT.
     */
-   set_domain(bo, I915_GEM_DOMAIN_GTT, I915_GEM_DOMAIN_GTT);
+   set_domain(brw, "GTT mapping", bo,
+              I915_GEM_DOMAIN_GTT, I915_GEM_DOMAIN_GTT);
 
    bo_mark_mmaps_incoherent(bo);
    VG(VALGRIND_MAKE_MEM_DEFINED(bo->gtt_virtual, bo->size));
@@ -790,7 +802,7 @@ brw_bo_map_gtt(struct brw_bo *bo)
  */
 int
-brw_bo_map_unsynchronized(struct brw_bo *bo)
+brw_bo_map_unsynchronized(struct brw_context *brw, struct brw_bo *bo)
 {
    struct brw_bufmgr *bufmgr = bo->bufmgr;
    int ret;
@@ -803,7 +815,7 @@ brw_bo_map_unsynchronized(struct brw_bo *bo)
     * does reasonable things.
     */
    if (!bufmgr->has_llc)
-      return brw_bo_map_gtt(bo);
+      return brw_bo_map_gtt(brw, bo);
 
    pthread_mutex_lock(&bufmgr->lock);
@@ -897,9 +909,10 @@ brw_bo_get_subdata(struct brw_bo *bo, unsigned long offset,
 
 /** Waits for all GPU rendering with the object to have completed. */
 void
-brw_bo_wait_rendering(struct brw_bo *bo)
+brw_bo_wait_rendering(struct brw_context *brw, struct brw_bo *bo)
 {
-   set_domain(bo, I915_GEM_DOMAIN_GTT, I915_GEM_DOMAIN_GTT);
+   set_domain(brw, "waiting for",
+              bo, I915_GEM_DOMAIN_GTT, I915_GEM_DOMAIN_GTT);
 }
 
 /**

src/mesa/drivers/dri/i965/brw_bufmgr.h

@@ -44,6 +44,7 @@ extern "C" {
 #endif
 
 struct gen_device_info;
+struct brw_context;
 
 struct brw_bo {
    /**
@@ -179,7 +180,7 @@ void brw_bo_unreference(struct brw_bo *bo);
  * buffer to complete, first. The resulting mapping is available at
  * buf->virtual.
  */
-int brw_bo_map(struct brw_bo *bo, int write_enable);
+int brw_bo_map(struct brw_context *brw, struct brw_bo *bo, int write_enable);
 
 /**
  * Reduces the refcount on the userspace mapping of the buffer
@@ -200,7 +201,7 @@ int brw_bo_get_subdata(struct brw_bo *bo, unsigned long offset,
  * bo_subdata, etc. It is merely a way for the driver to implement
  * glFinish.
  */
-void brw_bo_wait_rendering(struct brw_bo *bo);
+void brw_bo_wait_rendering(struct brw_context *brw, struct brw_bo *bo);
 
 /**
  * Tears down the buffer manager instance.
@@ -252,8 +253,8 @@ struct brw_bo *brw_bo_gem_create_from_name(struct brw_bufmgr *bufmgr,
                                            const char *name,
                                            unsigned int handle);
 void brw_bufmgr_enable_reuse(struct brw_bufmgr *bufmgr);
-int brw_bo_map_unsynchronized(struct brw_bo *bo);
-int brw_bo_map_gtt(struct brw_bo *bo);
+int brw_bo_map_unsynchronized(struct brw_context *brw, struct brw_bo *bo);
+int brw_bo_map_gtt(struct brw_context *brw, struct brw_bo *bo);
 
 void *brw_bo_map__cpu(struct brw_bo *bo);
 void *brw_bo_map__gtt(struct brw_bo *bo);
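
Callers of the revised interface thread their context through, and may pass
NULL to opt out of the warning where a stall is expected; a minimal sketch,
assuming an already-allocated bo (dest and size are hypothetical):

    /* Map for reading; any stall is attributed to this brw_context. */
    if (brw_bo_map(brw, bo, false /* write enable */) == 0) {
       memcpy(dest, bo->virtual, size);
       brw_bo_unmap(bo);
    }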

src/mesa/drivers/dri/i965/brw_context.c

@@ -413,7 +413,7 @@ intel_finish(struct gl_context * ctx)
    intel_glFlush(ctx);
 
    if (brw->batch.last_bo)
-      brw_bo_wait_rendering(brw->batch.last_bo);
+      brw_bo_wait_rendering(brw, brw->batch.last_bo);
 }
 
 static void

src/mesa/drivers/dri/i965/brw_performance_query.c

@@ -713,7 +713,7 @@ accumulate_oa_reports(struct brw_context *brw,
    if (!read_oa_samples(brw))
       goto error;
 
-   brw_bo_map(obj->oa.bo, false);
+   brw_bo_map(brw, obj->oa.bo, false);
    query_buffer = obj->oa.bo->virtual;
 
    start = last = query_buffer;
@@ -993,7 +993,7 @@ brw_begin_perf_query(struct gl_context *ctx,
                                  MI_RPC_BO_SIZE, 64);
 #ifdef DEBUG
       /* Pre-filling the BO helps debug whether writes landed. */
-      brw_bo_map(obj->oa.bo, true);
+      brw_bo_map(brw, obj->oa.bo, true);
       memset((char *) obj->oa.bo->virtual, 0x80, MI_RPC_BO_SIZE);
       brw_bo_unmap(obj->oa.bo);
 #endif
@@ -1131,12 +1131,7 @@ brw_wait_perf_query(struct gl_context *ctx, struct gl_perf_query_object *o)
    if (brw_batch_references(&brw->batch, bo))
       intel_batchbuffer_flush(brw);
 
-   if (unlikely(brw->perf_debug)) {
-      if (brw_bo_busy(bo))
-         perf_debug("Stalling GPU waiting for a performance query object.\n");
-   }
-
-   brw_bo_wait_rendering(bo);
+   brw_bo_wait_rendering(brw, bo);
 }
 
 static bool
@@ -1220,7 +1215,7 @@ get_pipeline_stats_data(struct brw_context *brw,
    int n_counters = obj->query->n_counters;
    uint8_t *p = data;
 
-   brw_bo_map(obj->pipeline_stats.bo, false);
+   brw_bo_map(brw, obj->pipeline_stats.bo, false);
    uint64_t *start = obj->pipeline_stats.bo->virtual;
    uint64_t *end = start + (STATS_BO_END_OFFSET_BYTES / sizeof(uint64_t));

src/mesa/drivers/dri/i965/brw_program.c

@@ -580,7 +580,7 @@ brw_collect_shader_time(struct brw_context *brw)
     * delaying reading the reports, but it doesn't look like it's a big
     * overhead compared to the cost of tracking the time in the first place.
     */
-   brw_bo_map(brw->shader_time.bo, true);
+   brw_bo_map(brw, brw->shader_time.bo, true);
    void *bo_map = brw->shader_time.bo->virtual;
 
    for (int i = 0; i < brw->shader_time.num_entries; i++) {

src/mesa/drivers/dri/i965/brw_program_cache.c

@@ -217,14 +217,14 @@ brw_cache_new_bo(struct brw_cache *cache, uint32_t new_size)
    new_bo = brw_bo_alloc(brw->bufmgr, "program cache", new_size, 64);
    if (brw->has_llc)
-      brw_bo_map_unsynchronized(new_bo);
+      brw_bo_map_unsynchronized(brw, new_bo);
 
    /* Copy any existing data that needs to be saved. */
    if (cache->next_offset != 0) {
       if (brw->has_llc) {
          memcpy(new_bo->virtual, cache->bo->virtual, cache->next_offset);
       } else {
-         brw_bo_map(cache->bo, false);
+         brw_bo_map(brw, cache->bo, false);
          brw_bo_subdata(new_bo, 0, cache->next_offset,
                         cache->bo->virtual);
          brw_bo_unmap(cache->bo);
@@ -252,7 +252,7 @@ brw_lookup_prog(const struct brw_cache *cache,
                 enum brw_cache_id cache_id,
                 const void *data, unsigned data_size)
 {
-   const struct brw_context *brw = cache->brw;
+   struct brw_context *brw = cache->brw;
    unsigned i;
    const struct brw_cache_item *item;
@@ -264,7 +264,7 @@ brw_lookup_prog(const struct brw_cache *cache,
            continue;
 
         if (!brw->has_llc)
-           brw_bo_map(cache->bo, false);
+           brw_bo_map(brw, cache->bo, false);
         ret = memcmp(cache->bo->virtual + item->offset, data, item->size);
         if (!brw->has_llc)
            brw_bo_unmap(cache->bo);
@@ -408,7 +408,7 @@ brw_init_caches(struct brw_context *brw)
    cache->bo = brw_bo_alloc(brw->bufmgr, "program cache", 4096, 64);
    if (brw->has_llc)
-      brw_bo_map_unsynchronized(cache->bo);
+      brw_bo_map_unsynchronized(brw, cache->bo);
 }
 
 static void
@@ -536,7 +536,7 @@ brw_print_program_cache(struct brw_context *brw)
    struct brw_cache_item *item;
 
    if (!brw->has_llc)
-      brw_bo_map(cache->bo, false);
+      brw_bo_map(brw, cache->bo, false);
 
    for (unsigned i = 0; i < cache->size; i++) {
       for (item = cache->items[i]; item; item = item->next) {

src/mesa/drivers/dri/i965/brw_queryobj.c

@@ -146,7 +146,7 @@ brw_queryobj_get_results(struct gl_context *ctx,
       }
    }
 
-   brw_bo_map(query->bo, false);
+   brw_bo_map(brw, query->bo, false);
    results = query->bo->virtual;
    switch (query->Base.Target) {
    case GL_TIME_ELAPSED_EXT:

src/mesa/drivers/dri/i965/gen6_queryobj.c

@@ -212,7 +212,7 @@ gen6_queryobj_get_results(struct gl_context *ctx,
    if (query->bo == NULL)
       return;
 
-   brw_bo_map(query->bo, false);
+   brw_bo_map(brw, query->bo, false);
    uint64_t *results = query->bo->virtual;
    switch (query->Base.Target) {
    case GL_TIME_ELAPSED:

src/mesa/drivers/dri/i965/gen6_sol.c

@@ -247,7 +247,7 @@ tally_prims_generated(struct brw_context *brw,
    if (unlikely(brw->perf_debug && brw_bo_busy(obj->prim_count_bo)))
       perf_debug("Stalling for # of transform feedback primitives written.\n");
 
-   brw_bo_map(obj->prim_count_bo, false);
+   brw_bo_map(brw, obj->prim_count_bo, false);
    uint64_t *prim_counts = obj->prim_count_bo->virtual;
 
    assert(obj->prim_count_buffer_index % (2 * streams) == 0);

src/mesa/drivers/dri/i965/intel_batchbuffer.c

@@ -100,7 +100,7 @@ intel_batchbuffer_reset(struct intel_batchbuffer *batch,
    batch->bo = brw_bo_alloc(bufmgr, "batchbuffer", BATCH_SZ, 4096);
    if (has_llc) {
-      brw_bo_map(batch->bo, true);
+      brw_bo_map(NULL, batch->bo, true);
       batch->map = batch->bo->virtual;
    }
    batch->map_next = batch->map;
@@ -240,7 +240,7 @@ do_batch_dump(struct brw_context *brw)
    if (batch->ring != RENDER_RING)
       return;
 
-   int ret = brw_bo_map(batch->bo, false);
+   int ret = brw_bo_map(brw, batch->bo, false);
    if (ret != 0) {
       fprintf(stderr,
               "WARNING: failed to map batchbuffer (%s), "
@@ -474,8 +474,12 @@ throttle(struct brw_context *brw)
     */
    if (brw->need_swap_throttle && brw->throttle_batch[0]) {
       if (brw->throttle_batch[1]) {
-         if (!brw->disable_throttling)
-            brw_bo_wait_rendering(brw->throttle_batch[1]);
+         if (!brw->disable_throttling) {
+            /* Pass NULL rather than brw so we avoid perf_debug warnings;
+             * stalling is common and expected here...
+             */
+            brw_bo_wait_rendering(NULL, brw->throttle_batch[1]);
+         }
          brw_bo_unreference(brw->throttle_batch[1]);
       }
       brw->throttle_batch[1] = brw->throttle_batch[0];
@@ -700,7 +704,7 @@ _intel_batchbuffer_flush_fence(struct brw_context *brw,
    if (unlikely(INTEL_DEBUG & DEBUG_SYNC)) {
       fprintf(stderr, "waiting for idle\n");
-      brw_bo_wait_rendering(brw->batch.bo);
+      brw_bo_wait_rendering(brw, brw->batch.bo);
    }
 
    /* Start a new batch buffer. */

src/mesa/drivers/dri/i965/intel_buffer_objects.c

@@ -217,7 +217,7 @@ brw_buffer_subdata(struct gl_context *ctx,
    if (offset + size <= intel_obj->gpu_active_start ||
        intel_obj->gpu_active_end <= offset) {
       if (brw->has_llc) {
-         brw_bo_map_unsynchronized(intel_obj->buffer);
+         brw_bo_map_unsynchronized(brw, intel_obj->buffer);
          memcpy(intel_obj->buffer->virtual + offset, data, size);
          brw_bo_unmap(intel_obj->buffer);
@@ -389,10 +389,10 @@ brw_map_buffer_range(struct gl_context *ctx,
                                                    intel_obj->map_extra[index],
                                                    alignment);
       if (brw->has_llc) {
-         brw_bo_map(intel_obj->range_map_bo[index],
-                    (access & GL_MAP_WRITE_BIT) != 0);
+         brw_bo_map(brw, intel_obj->range_map_bo[index],
+                    (access & GL_MAP_WRITE_BIT) != 0);
       } else {
-         brw_bo_map_gtt(intel_obj->range_map_bo[index]);
+         brw_bo_map_gtt(brw, intel_obj->range_map_bo[index]);
       }
       obj->Mappings[index].Pointer =
          intel_obj->range_map_bo[index]->virtual + intel_obj->map_extra[index];
@@ -404,13 +404,13 @@ brw_map_buffer_range(struct gl_context *ctx,
           brw_bo_busy(intel_obj->buffer)) {
          perf_debug("MapBufferRange with GL_MAP_UNSYNCHRONIZED_BIT stalling (it's actually synchronized on non-LLC platforms)\n");
       }
-      brw_bo_map_unsynchronized(intel_obj->buffer);
+      brw_bo_map_unsynchronized(brw, intel_obj->buffer);
    } else if (!brw->has_llc && (!(access & GL_MAP_READ_BIT) ||
                                 (access & GL_MAP_PERSISTENT_BIT))) {
-      brw_bo_map_gtt(intel_obj->buffer);
+      brw_bo_map_gtt(brw, intel_obj->buffer);
       mark_buffer_inactive(intel_obj);
    } else {
-      brw_bo_map(intel_obj->buffer, (access & GL_MAP_WRITE_BIT) != 0);
+      brw_bo_map(brw, intel_obj->buffer, (access & GL_MAP_WRITE_BIT) != 0);
       mark_buffer_inactive(intel_obj);
    }

src/mesa/drivers/dri/i965/intel_mipmap_tree.c

@@ -1386,7 +1386,7 @@ intel_miptree_init_mcs(struct brw_context *brw,
     *
     * Note: the clear value for MCS buffers is all 1's, so we memset to 0xff.
     */
-   const int ret = brw_bo_map_gtt(mt->mcs_buf->bo);
+   const int ret = brw_bo_map_gtt(brw, mt->mcs_buf->bo);
    if (unlikely(ret)) {
       fprintf(stderr, "Failed to map mcs buffer into GTT\n");
       brw_bo_unreference(mt->mcs_buf->bo);
@@ -2473,9 +2473,9 @@ intel_miptree_map_raw(struct brw_context *brw, struct intel_mipmap_tree *mt)
     * long as cache consistency is maintained).
     */
    if (mt->tiling != I915_TILING_NONE || mt->is_scanout)
-      brw_bo_map_gtt(bo);
+      brw_bo_map_gtt(brw, bo);
    else
-      brw_bo_map(bo, true);
+      brw_bo_map(brw, bo, true);
 
    return bo->virtual;
 }

src/mesa/drivers/dri/i965/intel_pixel_read.c

@@ -147,7 +147,7 @@ intel_readpixels_tiled_memcpy(struct gl_context * ctx,
       intel_batchbuffer_flush(brw);
    }
 
-   error = brw_bo_map(bo, false /* write enable */);
+   error = brw_bo_map(brw, bo, false /* write enable */);
    if (error) {
       DBG("%s: failed to map bo\n", __func__);
       return false;

src/mesa/drivers/dri/i965/intel_screen.c

@@ -1384,7 +1384,7 @@ intel_detect_pipelined_register(struct intel_screen *screen,
    if (bo == NULL)
      goto err_results;
 
-   if (brw_bo_map(bo, 1))
+   if (brw_bo_map(NULL, bo, 1))
      goto err_batch;
 
    batch = bo->virtual;
@@ -1440,7 +1440,7 @@ intel_detect_pipelined_register(struct intel_screen *screen,
    drmIoctl(dri_screen->fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf);
 
    /* Check whether the value got written. */
-   if (brw_bo_map(results, false) == 0) {
+   if (brw_bo_map(NULL, results, false) == 0) {
       success = *((uint32_t *)results->virtual + offset) == expected_value;
       brw_bo_unmap(results);
    }

src/mesa/drivers/dri/i965/intel_tex_image.c

@@ -532,7 +532,7 @@ intel_gettexsubimage_tiled_memcpy(struct gl_context *ctx,
       intel_batchbuffer_flush(brw);
    }
 
-   error = brw_bo_map(bo, false /* write enable */);
+   error = brw_bo_map(brw, bo, false /* write enable */);
    if (error) {
       DBG("%s: failed to map bo\n", __func__);
       return false;

src/mesa/drivers/dri/i965/intel_tex_subimage.c

@@ -148,7 +148,7 @@ intel_texsubimage_tiled_memcpy(struct gl_context * ctx,
       intel_batchbuffer_flush(brw);
    }
 
-   error = brw_bo_map(bo, true /* write enable */);
+   error = brw_bo_map(brw, bo, true /* write enable */);
    if (error || bo->virtual == NULL) {
       DBG("%s: failed to map bo\n", __func__);
       return false;

src/mesa/drivers/dri/i965/intel_upload.c

@@ -98,9 +98,9 @@ intel_upload_space(struct brw_context *brw,
       brw->upload.bo = brw_bo_alloc(brw->bufmgr, "streamed data",
                                     MAX2(INTEL_UPLOAD_SIZE, size), 4096);
       if (brw->has_llc)
-         brw_bo_map(brw->upload.bo, true);
+         brw_bo_map(brw, brw->upload.bo, true);
       else
-         brw_bo_map_gtt(brw->upload.bo);
+         brw_bo_map_gtt(brw, brw->upload.bo);
    }
 
    brw->upload.next_offset = offset + size;