i965/drm: Add stall warnings when mapping or waiting on BOs.

This restores the performance warnings removed in:

    i965: Drop brw_bo_map[_gtt] wrappers which issue perf warnings.

but adds them for nearly all BO mapping, and also for wait_rendering.

Because we add this to the core bufmgr, we automatically get stall
warnings in all callers, unlike before where only a few callsites used
the wrappers that gave stall warnings.

We also do it a bit differently: we simply measure how long set_domain
takes (the part that stalls), and complain if it's more than 0.01 ms.
We don't bother calling brw_bo_busy(), and we don't measure the mmap
time (which doesn't stall).  This should be more accurate.
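
Condensed, the pattern from the set_domain() hunk below looks like this
(the warning only fires when perf_debug output is enabled, e.g. with
INTEL_DEBUG=perf; the BO name in the sample message is hypothetical):

    /* Negating the start time lets a later += get_time() produce the
     * elapsed time.  Only the ioctl is timed; it is the part that stalls.
     */
    double elapsed = unlikely(brw && brw->perf_debug) ? -get_time() : 0.0;
    drmIoctl(bo->bufmgr->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &sd);
    if (unlikely(brw && brw->perf_debug)) {
       elapsed += get_time();
       if (elapsed > 1e-5) /* 0.01ms */
          perf_debug("%s a busy \"%s\" BO stalled and took %.03f ms.\n",
                     action, bo->name, elapsed * 1000);
    }
    /* e.g.: CPU mapping a busy "miptree" BO stalled and took 1.500 ms. */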

Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Author: Kenneth Graunke
Date:   2017-04-09 23:14:56 -07:00
Parent: f053ee78ed
Commit: bd84252be6

17 changed files with 68 additions and 55 deletions

src/mesa/drivers/dri/i965/brw_bufmgr.c

@@ -64,6 +64,7 @@
 #include "util/hash_table.h"
 #include "util/list.h"
 #include "brw_bufmgr.h"
+#include "brw_context.h"
 #include "string.h"
 #include "i915_drm.h"
@@ -640,7 +641,8 @@ brw_bo_unreference(struct brw_bo *bo)
 }
 
 static void
-set_domain(struct brw_bo *bo, uint32_t read_domains, uint32_t write_domain)
+set_domain(struct brw_context *brw, const char *action,
+           struct brw_bo *bo, uint32_t read_domains, uint32_t write_domain)
 {
    struct drm_i915_gem_set_domain sd = {
       .handle = bo->gem_handle,
@@ -648,15 +650,24 @@ set_domain(struct brw_bo *bo, uint32_t read_domains, uint32_t write_domain)
       .write_domain = write_domain,
    };
 
+   double elapsed = unlikely(brw && brw->perf_debug) ? -get_time() : 0.0;
+
    if (drmIoctl(bo->bufmgr->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &sd) != 0) {
       DBG("%s:%d: Error setting memory domains %d (%08x %08x): %s.\n",
           __FILE__, __LINE__, bo->gem_handle, read_domains, write_domain,
           strerror(errno));
    }
+
+   if (unlikely(brw && brw->perf_debug)) {
+      elapsed += get_time();
+      if (elapsed > 1e-5) /* 0.01ms */
+         perf_debug("%s a busy \"%s\" BO stalled and took %.03f ms.\n",
+                    action, bo->name, elapsed * 1000);
+   }
 }
 
 int
-brw_bo_map(struct brw_bo *bo, int write_enable)
+brw_bo_map(struct brw_context *brw, struct brw_bo *bo, int write_enable)
 {
    struct brw_bufmgr *bufmgr = bo->bufmgr;
    int ret;
@@ -687,7 +698,7 @@ brw_bo_map(struct brw_bo *bo, int write_enable)
    DBG("bo_map: %d (%s) -> %p\n", bo->gem_handle, bo->name, bo->mem_virtual);
    bo->virtual = bo->mem_virtual;
 
-   set_domain(bo, I915_GEM_DOMAIN_CPU,
+   set_domain(brw, "CPU mapping", bo, I915_GEM_DOMAIN_CPU,
               write_enable ? I915_GEM_DOMAIN_CPU : 0);
 
    bo_mark_mmaps_incoherent(bo);
@@ -744,7 +755,7 @@ map_gtt(struct brw_bo *bo)
 }
 
 int
-brw_bo_map_gtt(struct brw_bo *bo)
+brw_bo_map_gtt(struct brw_context *brw, struct brw_bo *bo)
 {
    struct brw_bufmgr *bufmgr = bo->bufmgr;
    int ret;
@@ -766,7 +777,8 @@ brw_bo_map_gtt(struct brw_bo *bo)
     * tell it when we're about to use things if we had done
     * rendering and it still happens to be bound to the GTT.
     */
-   set_domain(bo, I915_GEM_DOMAIN_GTT, I915_GEM_DOMAIN_GTT);
+   set_domain(brw, "GTT mapping", bo,
+              I915_GEM_DOMAIN_GTT, I915_GEM_DOMAIN_GTT);
 
    bo_mark_mmaps_incoherent(bo);
    VG(VALGRIND_MAKE_MEM_DEFINED(bo->gtt_virtual, bo->size));
@@ -790,7 +802,7 @@ brw_bo_map_gtt(struct brw_bo *bo)
  */
 int
-brw_bo_map_unsynchronized(struct brw_bo *bo)
+brw_bo_map_unsynchronized(struct brw_context *brw, struct brw_bo *bo)
 {
    struct brw_bufmgr *bufmgr = bo->bufmgr;
    int ret;
@@ -803,7 +815,7 @@ brw_bo_map_unsynchronized(struct brw_bo *bo)
     * does reasonable things.
     */
    if (!bufmgr->has_llc)
-      return brw_bo_map_gtt(bo);
+      return brw_bo_map_gtt(brw, bo);
 
    pthread_mutex_lock(&bufmgr->lock);
@@ -897,9 +909,10 @@ brw_bo_get_subdata(struct brw_bo *bo, unsigned long offset,
 
 /** Waits for all GPU rendering with the object to have completed. */
 void
-brw_bo_wait_rendering(struct brw_bo *bo)
+brw_bo_wait_rendering(struct brw_context *brw, struct brw_bo *bo)
 {
-   set_domain(bo, I915_GEM_DOMAIN_GTT, I915_GEM_DOMAIN_GTT);
+   set_domain(brw, "waiting for",
+              bo, I915_GEM_DOMAIN_GTT, I915_GEM_DOMAIN_GTT);
 }
 
 /**

src/mesa/drivers/dri/i965/brw_bufmgr.h

@@ -44,6 +44,7 @@ extern "C" {
 #endif
 
 struct gen_device_info;
+struct brw_context;
 
 struct brw_bo {
    /**
@@ -179,7 +180,7 @@ void brw_bo_unreference(struct brw_bo *bo);
  * buffer to complete, first. The resulting mapping is available at
  * buf->virtual.
  */
-int brw_bo_map(struct brw_bo *bo, int write_enable);
+int brw_bo_map(struct brw_context *brw, struct brw_bo *bo, int write_enable);
 
 /**
  * Reduces the refcount on the userspace mapping of the buffer
@@ -200,7 +201,7 @@ int brw_bo_get_subdata(struct brw_bo *bo, unsigned long offset,
  * bo_subdata, etc. It is merely a way for the driver to implement
  * glFinish.
  */
-void brw_bo_wait_rendering(struct brw_bo *bo);
+void brw_bo_wait_rendering(struct brw_context *brw, struct brw_bo *bo);
 
 /**
  * Tears down the buffer manager instance.
@@ -252,8 +253,8 @@ struct brw_bo *brw_bo_gem_create_from_name(struct brw_bufmgr *bufmgr,
                                            const char *name,
                                            unsigned int handle);
 void brw_bufmgr_enable_reuse(struct brw_bufmgr *bufmgr);
-int brw_bo_map_unsynchronized(struct brw_bo *bo);
-int brw_bo_map_gtt(struct brw_bo *bo);
+int brw_bo_map_unsynchronized(struct brw_context *brw, struct brw_bo *bo);
+int brw_bo_map_gtt(struct brw_context *brw, struct brw_bo *bo);
 
 void *brw_bo_map__cpu(struct brw_bo *bo);
 void *brw_bo_map__gtt(struct brw_bo *bo);
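
Callers of the revised interface thread their context through, and may pass
NULL to opt out of the warning where a stall is expected; a minimal sketch,
assuming an already-allocated bo (dest and size are hypothetical):

    /* Map for reading; any stall is attributed to this brw_context. */
    if (brw_bo_map(brw, bo, false /* write enable */) == 0) {
       memcpy(dest, bo->virtual, size);
       brw_bo_unmap(bo);
    }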

src/mesa/drivers/dri/i965/brw_context.c

@@ -413,7 +413,7 @@ intel_finish(struct gl_context * ctx)
    intel_glFlush(ctx);
 
    if (brw->batch.last_bo)
-      brw_bo_wait_rendering(brw->batch.last_bo);
+      brw_bo_wait_rendering(brw, brw->batch.last_bo);
 }
 
 static void

src/mesa/drivers/dri/i965/brw_performance_query.c

@@ -713,7 +713,7 @@ accumulate_oa_reports(struct brw_context *brw,
    if (!read_oa_samples(brw))
       goto error;
 
-   brw_bo_map(obj->oa.bo, false);
+   brw_bo_map(brw, obj->oa.bo, false);
    query_buffer = obj->oa.bo->virtual;
 
    start = last = query_buffer;
@@ -993,7 +993,7 @@ brw_begin_perf_query(struct gl_context *ctx,
                                  MI_RPC_BO_SIZE, 64);
 #ifdef DEBUG
       /* Pre-filling the BO helps debug whether writes landed. */
-      brw_bo_map(obj->oa.bo, true);
+      brw_bo_map(brw, obj->oa.bo, true);
       memset((char *) obj->oa.bo->virtual, 0x80, MI_RPC_BO_SIZE);
       brw_bo_unmap(obj->oa.bo);
 #endif
@@ -1131,12 +1131,7 @@ brw_wait_perf_query(struct gl_context *ctx, struct gl_perf_query_object *o)
    if (brw_batch_references(&brw->batch, bo))
       intel_batchbuffer_flush(brw);
 
-   if (unlikely(brw->perf_debug)) {
-      if (brw_bo_busy(bo))
-         perf_debug("Stalling GPU waiting for a performance query object.\n");
-   }
-
-   brw_bo_wait_rendering(bo);
+   brw_bo_wait_rendering(brw, bo);
 }
 
 static bool
@@ -1220,7 +1215,7 @@ get_pipeline_stats_data(struct brw_context *brw,
    int n_counters = obj->query->n_counters;
    uint8_t *p = data;
 
-   brw_bo_map(obj->pipeline_stats.bo, false);
+   brw_bo_map(brw, obj->pipeline_stats.bo, false);
    uint64_t *start = obj->pipeline_stats.bo->virtual;
    uint64_t *end = start + (STATS_BO_END_OFFSET_BYTES / sizeof(uint64_t));

src/mesa/drivers/dri/i965/brw_program.c

@@ -580,7 +580,7 @@ brw_collect_shader_time(struct brw_context *brw)
     * delaying reading the reports, but it doesn't look like it's a big
     * overhead compared to the cost of tracking the time in the first place.
     */
-   brw_bo_map(brw->shader_time.bo, true);
+   brw_bo_map(brw, brw->shader_time.bo, true);
    void *bo_map = brw->shader_time.bo->virtual;
 
    for (int i = 0; i < brw->shader_time.num_entries; i++) {

src/mesa/drivers/dri/i965/brw_program_cache.c

@@ -217,14 +217,14 @@ brw_cache_new_bo(struct brw_cache *cache, uint32_t new_size)
    new_bo = brw_bo_alloc(brw->bufmgr, "program cache", new_size, 64);
    if (brw->has_llc)
-      brw_bo_map_unsynchronized(new_bo);
+      brw_bo_map_unsynchronized(brw, new_bo);
 
    /* Copy any existing data that needs to be saved. */
    if (cache->next_offset != 0) {
       if (brw->has_llc) {
          memcpy(new_bo->virtual, cache->bo->virtual, cache->next_offset);
       } else {
-         brw_bo_map(cache->bo, false);
+         brw_bo_map(brw, cache->bo, false);
          brw_bo_subdata(new_bo, 0, cache->next_offset,
                         cache->bo->virtual);
          brw_bo_unmap(cache->bo);
@@ -252,7 +252,7 @@ brw_lookup_prog(const struct brw_cache *cache,
                 enum brw_cache_id cache_id,
                 const void *data, unsigned data_size)
 {
-   const struct brw_context *brw = cache->brw;
+   struct brw_context *brw = cache->brw;
    unsigned i;
    const struct brw_cache_item *item;
@@ -264,7 +264,7 @@ brw_lookup_prog(const struct brw_cache *cache,
            continue;
 
         if (!brw->has_llc)
-           brw_bo_map(cache->bo, false);
+           brw_bo_map(brw, cache->bo, false);
         ret = memcmp(cache->bo->virtual + item->offset, data, item->size);
         if (!brw->has_llc)
            brw_bo_unmap(cache->bo);
@@ -408,7 +408,7 @@ brw_init_caches(struct brw_context *brw)
    cache->bo = brw_bo_alloc(brw->bufmgr, "program cache", 4096, 64);
    if (brw->has_llc)
-      brw_bo_map_unsynchronized(cache->bo);
+      brw_bo_map_unsynchronized(brw, cache->bo);
 }
 
 static void
@@ -536,7 +536,7 @@ brw_print_program_cache(struct brw_context *brw)
    struct brw_cache_item *item;
 
    if (!brw->has_llc)
-      brw_bo_map(cache->bo, false);
+      brw_bo_map(brw, cache->bo, false);
 
    for (unsigned i = 0; i < cache->size; i++) {
       for (item = cache->items[i]; item; item = item->next) {

src/mesa/drivers/dri/i965/brw_queryobj.c

@@ -146,7 +146,7 @@ brw_queryobj_get_results(struct gl_context *ctx,
       }
    }
 
-   brw_bo_map(query->bo, false);
+   brw_bo_map(brw, query->bo, false);
    results = query->bo->virtual;
    switch (query->Base.Target) {
    case GL_TIME_ELAPSED_EXT:

src/mesa/drivers/dri/i965/gen6_queryobj.c

@@ -212,7 +212,7 @@ gen6_queryobj_get_results(struct gl_context *ctx,
    if (query->bo == NULL)
       return;
 
-   brw_bo_map(query->bo, false);
+   brw_bo_map(brw, query->bo, false);
    uint64_t *results = query->bo->virtual;
    switch (query->Base.Target) {
    case GL_TIME_ELAPSED:

src/mesa/drivers/dri/i965/gen6_sol.c

@@ -247,7 +247,7 @@ tally_prims_generated(struct brw_context *brw,
    if (unlikely(brw->perf_debug && brw_bo_busy(obj->prim_count_bo)))
       perf_debug("Stalling for # of transform feedback primitives written.\n");
 
-   brw_bo_map(obj->prim_count_bo, false);
+   brw_bo_map(brw, obj->prim_count_bo, false);
    uint64_t *prim_counts = obj->prim_count_bo->virtual;
 
    assert(obj->prim_count_buffer_index % (2 * streams) == 0);

src/mesa/drivers/dri/i965/intel_batchbuffer.c

@@ -100,7 +100,7 @@ intel_batchbuffer_reset(struct intel_batchbuffer *batch,
    batch->bo = brw_bo_alloc(bufmgr, "batchbuffer", BATCH_SZ, 4096);
    if (has_llc) {
-      brw_bo_map(batch->bo, true);
+      brw_bo_map(NULL, batch->bo, true);
       batch->map = batch->bo->virtual;
    }
    batch->map_next = batch->map;
@@ -240,7 +240,7 @@ do_batch_dump(struct brw_context *brw)
    if (batch->ring != RENDER_RING)
       return;
 
-   int ret = brw_bo_map(batch->bo, false);
+   int ret = brw_bo_map(brw, batch->bo, false);
    if (ret != 0) {
       fprintf(stderr,
               "WARNING: failed to map batchbuffer (%s), "
@@ -474,8 +474,12 @@ throttle(struct brw_context *brw)
     */
    if (brw->need_swap_throttle && brw->throttle_batch[0]) {
       if (brw->throttle_batch[1]) {
-         if (!brw->disable_throttling)
-            brw_bo_wait_rendering(brw->throttle_batch[1]);
+         if (!brw->disable_throttling) {
+            /* Pass NULL rather than brw so we avoid perf_debug warnings;
+             * stalling is common and expected here...
+             */
+            brw_bo_wait_rendering(NULL, brw->throttle_batch[1]);
+         }
          brw_bo_unreference(brw->throttle_batch[1]);
       }
       brw->throttle_batch[1] = brw->throttle_batch[0];
@@ -700,7 +704,7 @@ _intel_batchbuffer_flush_fence(struct brw_context *brw,
    if (unlikely(INTEL_DEBUG & DEBUG_SYNC)) {
       fprintf(stderr, "waiting for idle\n");
-      brw_bo_wait_rendering(brw->batch.bo);
+      brw_bo_wait_rendering(brw, brw->batch.bo);
    }
 
    /* Start a new batch buffer. */

src/mesa/drivers/dri/i965/intel_buffer_objects.c

@@ -217,7 +217,7 @@ brw_buffer_subdata(struct gl_context *ctx,
    if (offset + size <= intel_obj->gpu_active_start ||
        intel_obj->gpu_active_end <= offset) {
       if (brw->has_llc) {
-         brw_bo_map_unsynchronized(intel_obj->buffer);
+         brw_bo_map_unsynchronized(brw, intel_obj->buffer);
          memcpy(intel_obj->buffer->virtual + offset, data, size);
          brw_bo_unmap(intel_obj->buffer);
@@ -389,10 +389,10 @@ brw_map_buffer_range(struct gl_context *ctx,
                                                    intel_obj->map_extra[index],
                                                    alignment);
       if (brw->has_llc) {
-         brw_bo_map(intel_obj->range_map_bo[index],
-                    (access & GL_MAP_WRITE_BIT) != 0);
+         brw_bo_map(brw, intel_obj->range_map_bo[index],
+                    (access & GL_MAP_WRITE_BIT) != 0);
       } else {
-         brw_bo_map_gtt(intel_obj->range_map_bo[index]);
+         brw_bo_map_gtt(brw, intel_obj->range_map_bo[index]);
       }
       obj->Mappings[index].Pointer =
          intel_obj->range_map_bo[index]->virtual + intel_obj->map_extra[index];
@@ -404,13 +404,13 @@ brw_map_buffer_range(struct gl_context *ctx,
           brw_bo_busy(intel_obj->buffer)) {
          perf_debug("MapBufferRange with GL_MAP_UNSYNCHRONIZED_BIT stalling (it's actually synchronized on non-LLC platforms)\n");
       }
-      brw_bo_map_unsynchronized(intel_obj->buffer);
+      brw_bo_map_unsynchronized(brw, intel_obj->buffer);
    } else if (!brw->has_llc && (!(access & GL_MAP_READ_BIT) ||
                                 (access & GL_MAP_PERSISTENT_BIT))) {
-      brw_bo_map_gtt(intel_obj->buffer);
+      brw_bo_map_gtt(brw, intel_obj->buffer);
       mark_buffer_inactive(intel_obj);
    } else {
-      brw_bo_map(intel_obj->buffer, (access & GL_MAP_WRITE_BIT) != 0);
+      brw_bo_map(brw, intel_obj->buffer, (access & GL_MAP_WRITE_BIT) != 0);
       mark_buffer_inactive(intel_obj);
    }

src/mesa/drivers/dri/i965/intel_mipmap_tree.c

@@ -1386,7 +1386,7 @@ intel_miptree_init_mcs(struct brw_context *brw,
     *
     * Note: the clear value for MCS buffers is all 1's, so we memset to 0xff.
     */
-   const int ret = brw_bo_map_gtt(mt->mcs_buf->bo);
+   const int ret = brw_bo_map_gtt(brw, mt->mcs_buf->bo);
    if (unlikely(ret)) {
       fprintf(stderr, "Failed to map mcs buffer into GTT\n");
       brw_bo_unreference(mt->mcs_buf->bo);
@@ -2473,9 +2473,9 @@ intel_miptree_map_raw(struct brw_context *brw, struct intel_mipmap_tree *mt)
     * long as cache consistency is maintained).
     */
    if (mt->tiling != I915_TILING_NONE || mt->is_scanout)
-      brw_bo_map_gtt(bo);
+      brw_bo_map_gtt(brw, bo);
    else
-      brw_bo_map(bo, true);
+      brw_bo_map(brw, bo, true);
 
    return bo->virtual;
 }

src/mesa/drivers/dri/i965/intel_pixel_read.c

@@ -147,7 +147,7 @@ intel_readpixels_tiled_memcpy(struct gl_context * ctx,
       intel_batchbuffer_flush(brw);
    }
 
-   error = brw_bo_map(bo, false /* write enable */);
+   error = brw_bo_map(brw, bo, false /* write enable */);
    if (error) {
       DBG("%s: failed to map bo\n", __func__);
       return false;

src/mesa/drivers/dri/i965/intel_screen.c

@@ -1384,7 +1384,7 @@ intel_detect_pipelined_register(struct intel_screen *screen,
    if (bo == NULL)
      goto err_results;
 
-   if (brw_bo_map(bo, 1))
+   if (brw_bo_map(NULL, bo, 1))
      goto err_batch;
 
    batch = bo->virtual;
@@ -1440,7 +1440,7 @@ intel_detect_pipelined_register(struct intel_screen *screen,
    drmIoctl(dri_screen->fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf);
 
    /* Check whether the value got written. */
-   if (brw_bo_map(results, false) == 0) {
+   if (brw_bo_map(NULL, results, false) == 0) {
       success = *((uint32_t *)results->virtual + offset) == expected_value;
       brw_bo_unmap(results);
    }

src/mesa/drivers/dri/i965/intel_tex_image.c

@@ -532,7 +532,7 @@ intel_gettexsubimage_tiled_memcpy(struct gl_context *ctx,
       intel_batchbuffer_flush(brw);
    }
 
-   error = brw_bo_map(bo, false /* write enable */);
+   error = brw_bo_map(brw, bo, false /* write enable */);
    if (error) {
       DBG("%s: failed to map bo\n", __func__);
       return false;

src/mesa/drivers/dri/i965/intel_tex_subimage.c

@@ -148,7 +148,7 @@ intel_texsubimage_tiled_memcpy(struct gl_context * ctx,
       intel_batchbuffer_flush(brw);
    }
 
-   error = brw_bo_map(bo, true /* write enable */);
+   error = brw_bo_map(brw, bo, true /* write enable */);
    if (error || bo->virtual == NULL) {
       DBG("%s: failed to map bo\n", __func__);
       return false;

src/mesa/drivers/dri/i965/intel_upload.c

@@ -98,9 +98,9 @@ intel_upload_space(struct brw_context *brw,
       brw->upload.bo = brw_bo_alloc(brw->bufmgr, "streamed data",
                                     MAX2(INTEL_UPLOAD_SIZE, size), 4096);
       if (brw->has_llc)
-         brw_bo_map(brw->upload.bo, true);
+         brw_bo_map(brw, brw->upload.bo, true);
       else
-         brw_bo_map_gtt(brw->upload.bo);
+         brw_bo_map_gtt(brw, brw->upload.bo);
    }
 
    brw->upload.next_offset = offset + size;