i965: Try to avoid stalls on the GPU when doing glBufferSubData().
On DOTA2, framerate on dota2-de1.dem in windowed mode on my laptop
improves by 7.69854% +/- 0.909163% (n=3).  In a microbenchmark hitting
this code path (wall time of piglit vbo-subdata-many), runtime decreases
from 0.8 to 0.05 seconds.

v2: Use an out-of-range start/end instead of a separate bool for the
    active flag (suggestion by Jordan), and fix the double upload in the
    stalling path.

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
parent 3b58e0ed64
commit c298f5ff56
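In outline, the scheme the diff below implements is: every place that hands a
BO to the GPU records the byte range the GPU may touch, and glBufferSubData()
writes that cannot overlap that range go through an unsynchronized CPU map
instead of a blit or a stall. A minimal standalone sketch of that bookkeeping
follows; the names mirror the driver code below, but this is illustration
only (the libdrm mapping calls and brw context are elided):

/* Sketch of the range tracking added by this patch. */
#include <stdbool.h>
#include <stdint.h>

struct sketch_buffer_object {
   uint32_t gpu_active_start;  /* ~0 when no range is active (start > end) */
   uint32_t gpu_active_end;    /*  0 when no range is active */
   bool prefer_stall_to_blit;
};

/* Called wherever the pipeline references the BO (vertex data, transform
 * feedback output, buffer textures, ...): grow the conservative busy range. */
static void mark_gpu_usage(struct sketch_buffer_object *obj,
                           uint32_t offset, uint32_t size)
{
   if (offset < obj->gpu_active_start)
      obj->gpu_active_start = offset;
   if (offset + size > obj->gpu_active_end)
      obj->gpu_active_end = offset + size;
}

/* Called once the GPU is known to be done with the BO (fresh allocation, a
 * wait, a synchronized map).  start > end encodes "inactive" with no
 * separate flag, which is the v2 change suggested by Jordan. */
static void mark_inactive(struct sketch_buffer_object *obj)
{
   obj->gpu_active_start = ~0;
   obj->gpu_active_end = 0;
}

/* glBufferSubData() may write through an unsynchronized map whenever the
 * written range cannot overlap what the GPU may still be using. */
static bool can_write_unsynchronized(const struct sketch_buffer_object *obj,
                                     uint32_t offset, uint32_t size)
{
   return offset + size <= obj->gpu_active_start ||
          obj->gpu_active_end <= offset;
}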
@@ -472,12 +472,30 @@ static void brw_prepare_vertices(struct brw_context *brw)
          struct brw_vertex_buffer *buffer = &brw->vb.buffers[j];
 
          /* Named buffer object: Just reference its contents directly. */
-         buffer->bo = intel_bufferobj_buffer(brw, intel_buffer, INTEL_READ);
-         drm_intel_bo_reference(buffer->bo);
          buffer->offset = (uintptr_t)glarray->Ptr;
          buffer->stride = glarray->StrideB;
          buffer->step_rate = glarray->InstanceDivisor;
+
+         uint32_t offset, size;
+         if (glarray->InstanceDivisor) {
+            offset = buffer->offset;
+            size = (buffer->stride * ((brw->num_instances /
+                                       glarray->InstanceDivisor) - 1) +
+                    glarray->_ElementSize);
+         } else {
+            if (min_index == -1) {
+               offset = 0;
+               size = intel_buffer->Base.Size;
+            } else {
+               offset = buffer->offset + min_index * buffer->stride;
+               size = (buffer->stride * (max_index - min_index) +
+                       glarray->_ElementSize);
+            }
+         }
+         buffer->bo = intel_bufferobj_buffer(brw, intel_buffer,
+                                             offset, size);
+         drm_intel_bo_reference(buffer->bo);
 
          input->buffer = j++;
          input->offset = 0;
       }
@@ -850,7 +868,7 @@ static void brw_upload_indices(struct brw_context *brw)
       brw->ib.start_vertex_offset = offset / ib_type_size;
 
       bo = intel_bufferobj_buffer(brw, intel_buffer_object(bufferobj),
-                                  INTEL_READ);
+                                  offset, ib_size);
       drm_intel_bo_reference(bo);
    }
 }
@@ -62,10 +62,7 @@ intel_buffer_object_purgeable(struct gl_context * ctx,
       return GL_RELEASED_APPLE;
    } else {
       /* XXX Create the buffer and madvise(MADV_DONTNEED)? */
-      struct brw_context *brw = brw_context(ctx);
-      drm_intel_bo *bo = intel_bufferobj_buffer(brw, intel_obj, INTEL_READ);
-
-      return intel_buffer_purgeable(bo);
+      return intel_buffer_purgeable(intel_obj->buffer);
    }
 }
@@ -239,8 +239,8 @@ brw_update_buffer_texture_surface(struct gl_context *ctx,
    int texel_size = _mesa_get_format_bytes(format);
 
    if (intel_obj) {
-      bo = intel_obj->buffer;
       size = MIN2(size, intel_obj->Base.Size);
+      bo = intel_bufferobj_buffer(brw, intel_obj, tObj->BufferOffset, size);
    }
 
    if (brw_format == 0 && format != MESA_FORMAT_RGBA_FLOAT32) {
@@ -345,11 +345,13 @@ brw_update_sol_surface(struct brw_context *brw,
                        unsigned stride_dwords, unsigned offset_dwords)
 {
    struct intel_buffer_object *intel_bo = intel_buffer_object(buffer_obj);
-   drm_intel_bo *bo = intel_bufferobj_buffer(brw, intel_bo, INTEL_WRITE_PART);
+   uint32_t offset_bytes = 4 * offset_dwords;
+   drm_intel_bo *bo = intel_bufferobj_buffer(brw, intel_bo,
+                                             offset_bytes,
+                                             buffer_obj->Size - offset_bytes);
    uint32_t *surf = brw_state_batch(brw, AUB_TRACE_SURFACE_STATE, 6 * 4, 32,
                                     out_offset);
    uint32_t pitch_minus_1 = 4*stride_dwords - 1;
-   uint32_t offset_bytes = 4 * offset_dwords;
    size_t size_dwords = buffer_obj->Size / 4;
    uint32_t buffer_size_minus_1, width, height, depth, surface_format;
 
@@ -824,7 +826,10 @@ brw_upload_ubo_surfaces(struct brw_context *brw,
 
       binding = &ctx->UniformBufferBindings[shader->UniformBlocks[i].Binding];
       intel_bo = intel_buffer_object(binding->BufferObject);
-      drm_intel_bo *bo = intel_bufferobj_buffer(brw, intel_bo, INTEL_READ);
+      drm_intel_bo *bo =
+         intel_bufferobj_buffer(brw, intel_bo,
+                                binding->Offset,
+                                binding->BufferObject->Size - binding->Offset);
 
       /* Because behavior for referencing outside of the binding's size in the
        * glBindBufferRange case is undefined, we can just bind the whole buffer
@@ -73,12 +73,12 @@ upload_3dstate_so_buffers(struct brw_context *brw)
          continue;
       }
 
-      bo = intel_bufferobj_buffer(brw, bufferobj, INTEL_WRITE_PART);
       stride = linked_xfb_info->BufferStride[i] * 4;
 
       start = xfb_obj->Offset[i];
       assert(start % 4 == 0);
       end = ALIGN(start + xfb_obj->Size[i], 4);
+      bo = intel_bufferobj_buffer(brw, bufferobj, start, end - start);
       assert(end <= bo->size);
 
       BEGIN_BATCH(4);
@@ -278,8 +278,8 @@ gen7_update_buffer_texture_surface(struct gl_context *ctx,
    drm_intel_bo *bo = NULL;
 
    if (intel_obj) {
-      bo = intel_obj->buffer;
       size = MIN2(size, intel_obj->Base.Size);
+      bo = intel_bufferobj_buffer(brw, intel_obj, tObj->BufferOffset, size);
    }
 
    gl_format format = tObj->_BufferObjectFormat;
@@ -44,6 +44,21 @@
 static GLboolean
 intel_bufferobj_unmap(struct gl_context * ctx, struct gl_buffer_object *obj);
 
+static void
+intel_bufferobj_mark_gpu_usage(struct intel_buffer_object *intel_obj,
+                               uint32_t offset, uint32_t size)
+{
+   intel_obj->gpu_active_start = MIN2(intel_obj->gpu_active_start, offset);
+   intel_obj->gpu_active_end = MAX2(intel_obj->gpu_active_end, offset + size);
+}
+
+static void
+intel_bufferobj_mark_inactive(struct intel_buffer_object *intel_obj)
+{
+   intel_obj->gpu_active_start = ~0;
+   intel_obj->gpu_active_end = 0;
+}
+
 /** Allocates a new drm_intel_bo to store the data for the buffer object. */
 static void
 intel_bufferobj_alloc_buffer(struct brw_context *brw,
@@ -55,6 +70,8 @@ intel_bufferobj_alloc_buffer(struct brw_context *brw,
    /* the buffer might be bound as a uniform buffer, need to update it
     */
    brw->state.dirty.brw |= BRW_NEW_UNIFORM_BUFFER;
+
+   intel_bufferobj_mark_inactive(intel_obj);
 }
 
 static void
@@ -179,20 +196,44 @@ intel_bufferobj_subdata(struct gl_context * ctx,
 
    assert(intel_obj);
 
+   /* See if we can write the data into the user's BO without stalling,
+    * via an unsynchronized map.  This avoids GPU stalls in unfortunately
+    * common user patterns (uploading sequentially into a BO, with draw
+    * calls in between each upload).
+    *
+    * Once we've hit this path, we mark this GL BO as preferring stalling
+    * to blits, so that we can hopefully hit this path again in the future
+    * (otherwise, an app that might occasionally stall but mostly not will
+    * end up with blitting all the time, at the cost of bandwidth). */
+   if (brw->has_llc) {
+      if (offset + size <= intel_obj->gpu_active_start ||
+          intel_obj->gpu_active_end <= offset) {
+         drm_intel_gem_bo_map_unsynchronized(intel_obj->buffer);
+         memcpy(intel_obj->buffer->virtual + offset, data, size);
+         drm_intel_bo_unmap(intel_obj->buffer);
+
+         if (intel_obj->gpu_active_end > intel_obj->gpu_active_start)
+            intel_obj->prefer_stall_to_blit = true;
+         return;
+      }
+   }
+
    busy =
       drm_intel_bo_busy(intel_obj->buffer) ||
       drm_intel_bo_references(brw->batch.bo, intel_obj->buffer);
 
    if (busy) {
       if (size == intel_obj->Base.Size) {
-         /* Replace the current busy bo with fresh data. */
+         /* Replace the current busy bo so the subdata doesn't stall. */
         drm_intel_bo_unreference(intel_obj->buffer);
         intel_bufferobj_alloc_buffer(brw, intel_obj);
-         drm_intel_bo_subdata(intel_obj->buffer, 0, size, data);
-      } else {
-         perf_debug("Using a blit copy to avoid stalling on %ldb "
-                    "glBufferSubData() to a busy buffer object.\n",
-                    (long)size);
+      } else if (!intel_obj->prefer_stall_to_blit) {
+         perf_debug("Using a blit copy to avoid stalling on "
+                    "glBufferSubData(%ld, %ld) (%ldkb) to a busy "
+                    "(%d-%d) buffer object.\n",
+                    (long)offset, (long)offset + size, (long)(size/1024),
+                    intel_obj->gpu_active_start,
+                    intel_obj->gpu_active_end);
         drm_intel_bo *temp_bo =
            drm_intel_bo_alloc(brw->bufmgr, "subdata temp", size, 64);
@@ -204,10 +245,20 @@ intel_bufferobj_subdata(struct gl_context * ctx,
                                 size);
 
         drm_intel_bo_unreference(temp_bo);
+         return;
+      } else {
+         perf_debug("Stalling on glBufferSubData(%ld, %ld) (%ldkb) to a busy "
+                    "(%d-%d) buffer object.  Use glMapBufferRange() to "
+                    "avoid this.\n",
+                    (long)offset, (long)offset + size, (long)(size/1024),
+                    intel_obj->gpu_active_start,
+                    intel_obj->gpu_active_end);
+         intel_batchbuffer_flush(brw);
       }
-   } else {
-      drm_intel_bo_subdata(intel_obj->buffer, offset, size, data);
    }
+
+   drm_intel_bo_subdata(intel_obj->buffer, offset, size, data);
+   intel_bufferobj_mark_inactive(intel_obj);
 }
 
@@ -231,6 +282,8 @@ intel_bufferobj_get_subdata(struct gl_context * ctx,
       intel_batchbuffer_flush(brw);
    }
    drm_intel_bo_get_subdata(intel_obj->buffer, offset, size, data);
+
+   intel_bufferobj_mark_inactive(intel_obj);
 }
 
@@ -328,8 +381,10 @@ intel_bufferobj_map_range(struct gl_context * ctx,
       drm_intel_gem_bo_map_unsynchronized(intel_obj->buffer);
    else if (!(access & GL_MAP_READ_BIT)) {
       drm_intel_gem_bo_map_gtt(intel_obj->buffer);
+      intel_bufferobj_mark_inactive(intel_obj);
    } else {
       drm_intel_bo_map(intel_obj->buffer, (access & GL_MAP_WRITE_BIT) != 0);
+      intel_bufferobj_mark_inactive(intel_obj);
    }
 
    obj->Pointer = intel_obj->buffer->virtual + offset;
@@ -375,6 +430,7 @@ intel_bufferobj_flush_mapped_range(struct gl_context *ctx,
                           intel_obj->buffer, obj->Offset + offset,
                           temp_bo, 0,
                           length);
+   intel_bufferobj_mark_gpu_usage(intel_obj, obj->Offset + offset, length);
 
    drm_intel_bo_unreference(temp_bo);
 }
@@ -409,6 +465,7 @@ intel_bufferobj_unmap(struct gl_context * ctx, struct gl_buffer_object *obj)
                              intel_obj->buffer, obj->Offset,
                              intel_obj->range_map_bo, 0,
                              obj->Length);
+      intel_bufferobj_mark_gpu_usage(intel_obj, obj->Offset, obj->Length);
 
       /* Since we've emitted some blits to buffers that will (likely) be used
        * in rendering operations in other cache domains in this batch, emit a
@@ -429,10 +486,17 @@ intel_bufferobj_unmap(struct gl_context * ctx, struct gl_buffer_object *obj)
    return true;
 }
 
+/**
+ * Gets a pointer to the object's BO, and marks the given range as being used
+ * on the GPU.
+ *
+ * Anywhere that uses buffer objects in the pipeline should be using this to
+ * mark the range of the buffer that is being accessed by the pipeline.
+ */
 drm_intel_bo *
 intel_bufferobj_buffer(struct brw_context *brw,
                        struct intel_buffer_object *intel_obj,
-                       GLuint flag)
+                       uint32_t offset, uint32_t size)
 {
    /* This is needed so that things like transform feedback and texture buffer
     * objects that need a BO but don't want to check that they exist for
@@ -441,6 +505,8 @@ intel_bufferobj_buffer(struct brw_context *brw,
    if (intel_obj->buffer == NULL)
       intel_bufferobj_alloc_buffer(brw, intel_obj);
 
+   intel_bufferobj_mark_gpu_usage(intel_obj, offset, size);
+
    return intel_obj->buffer;
 }
 
@@ -466,8 +532,8 @@ intel_bufferobj_copy_subdata(struct gl_context *ctx,
    if (size == 0)
       return;
 
-   dst_bo = intel_bufferobj_buffer(brw, intel_dst, INTEL_WRITE_PART);
-   src_bo = intel_bufferobj_buffer(brw, intel_src, INTEL_READ);
+   dst_bo = intel_bufferobj_buffer(brw, intel_dst, write_offset, size);
+   src_bo = intel_bufferobj_buffer(brw, intel_src, read_offset, size);
 
    intel_emit_linear_blit(brw,
                           dst_bo, write_offset,
@@ -45,14 +45,45 @@ struct intel_buffer_object
    drm_intel_bo *range_map_bo;
    void *range_map_buffer;
    unsigned int range_map_offset;
+
+   /** @{
+    * Tracking for what range of the BO may currently be in use by the GPU.
+    *
+    * Users often want to either glBufferSubData() or glMapBufferRange() a
+    * buffer object where some subset of it is busy on the GPU, without either
+    * stalling or doing an extra blit (since our blits are extra expensive,
+    * given that we have to reupload most of the 3D state when switching
+    * rings).  We wish they'd just use glMapBufferRange() with the
+    * UNSYNC|INVALIDATE_RANGE flag or the INVALIDATE_BUFFER flag, but lots
+    * don't.
+    *
+    * To work around apps, we track what range of the BO we might have used on
+    * the GPU as vertex data, transform feedback output, buffer textures,
+    * etc., and just do glBufferSubData() with an unsynchronized map when
+    * they're outside of that range.
+    *
+    * If gpu_active_start > gpu_active_end, then the GPU is not currently
+    * accessing the BO (and we can map it without synchronization).
+    */
+   uint32_t gpu_active_start;
+   uint32_t gpu_active_end;
+
+   /**
+    * If we've avoided stalls/blits using the active tracking, flag the buffer
+    * for (occasional) stalling in the future to avoid getting stuck in a
+    * cycle of blitting on buffer wraparound.
+    */
+   bool prefer_stall_to_blit;
+   /** @} */
 };
 
 
 /* Get the bm buffer associated with a GL bufferobject:
  */
 drm_intel_bo *intel_bufferobj_buffer(struct brw_context *brw,
-                                     struct intel_buffer_object *obj,
-                                     GLuint flag);
+                                     struct intel_buffer_object *obj,
+                                     uint32_t offset,
+                                     uint32_t size);
 
 void intel_upload_data(struct brw_context *brw,
                        const void *ptr, GLuint size, GLuint align,
@@ -79,7 +79,6 @@ do_blit_readpixels(struct gl_context * ctx,
    struct intel_buffer_object *dst = intel_buffer_object(pack->BufferObj);
    GLuint dst_offset;
    drm_intel_bo *dst_buffer;
-   bool all;
    GLint dst_x, dst_y;
    GLuint dirty;
 
@@ -127,12 +126,9 @@ do_blit_readpixels(struct gl_context * ctx,
    intel_prepare_render(brw);
    brw->front_buffer_dirty = dirty;
 
-   all = (width * height * irb->mt->cpp == dst->Base.Size &&
-          x == 0 && dst_offset == 0);
-
    dst_buffer = intel_bufferobj_buffer(brw, dst,
-                                       all ? INTEL_WRITE_FULL :
-                                       INTEL_WRITE_PART);
+                                       dst_offset, width * height *
+                                       irb->mt->cpp);
 
    struct intel_mipmap_tree *pbo_mt =
       intel_miptree_create_for_bo(brw,
@@ -126,13 +126,14 @@ try_pbo_upload(struct gl_context *ctx,
       return false;
    }
 
-   src_buffer = intel_bufferobj_buffer(brw, pbo, INTEL_READ);
-   /* note: potential 64-bit ptr to 32-bit int cast */
-   src_offset = (GLuint) (unsigned long) pixels;
-
    int src_stride =
       _mesa_image_row_stride(unpack, image->Width, format, type);
 
+   /* note: potential 64-bit ptr to 32-bit int cast */
+   src_offset = (GLuint) (unsigned long) pixels;
+   src_buffer = intel_bufferobj_buffer(brw, pbo,
+                                       src_offset, src_stride * image->Height);
+
    struct intel_mipmap_tree *pbo_mt =
       intel_miptree_create_for_bo(brw,
                                   src_buffer,
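For context, the access pattern the new subdata fast path targets (per the
comment added to intel_bufferobj_subdata() above) looks roughly like the
hypothetical GL-side sketch below. The buffer, sizes, and counts are made up
for illustration, and GL 1.5+ prototypes (via GL_GLEXT_PROTOTYPES) are
assumed:

/* Hypothetical app loop: sequential glBufferSubData() uploads with draws in
 * between.  Each write lands past the range still in use by earlier draws,
 * so with this patch the driver can memcpy through an unsynchronized map
 * instead of blitting or stalling. */
#define GL_GLEXT_PROTOTYPES
#include <GL/gl.h>

static void draw_sprites(GLuint vbo, const unsigned char *data,
                         int num_sprites, GLsizeiptr sprite_bytes,
                         GLsizei verts_per_sprite)
{
   glBindBuffer(GL_ARRAY_BUFFER, vbo);
   for (int i = 0; i < num_sprites; i++) {
      glBufferSubData(GL_ARRAY_BUFFER, i * sprite_bytes, sprite_bytes,
                      data + i * sprite_bytes);
      glDrawArrays(GL_TRIANGLES, i * verts_per_sprite, verts_per_sprite);
   }
}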