st/nine: Optimize dynamic systemmem buffers

Some apps use DYNAMIC SYSTEMMEM buffers and fill them in a
dynamic fashion with discard and nooverwrite locking flags.

To prevent uploading the whole buffer every draw call,
track the region needed for the draw call, and
upload only that region (or a bit more in order
to ease valid region tracking).

Try to aggressively upload with discard/unsynchronized.

Signed-off-by: Axel Davy <davyaxel0@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9451>
This commit is contained in:
Axel Davy 2021-03-06 18:27:42 +01:00 committed by Marge Bot
parent 01c8071f93
commit 380c2bf887
7 changed files with 248 additions and 27 deletions

View File

@ -87,7 +87,9 @@ NineBuffer9_ctor( struct NineBuffer9 *This,
* some small behavior differences between vendors). Implementing exactly as MANAGED should
* be fine.
*/
if (Pool != D3DPOOL_DEFAULT)
if (Pool == D3DPOOL_SYSTEMMEM && Usage & D3DUSAGE_DYNAMIC)
info->usage = PIPE_USAGE_STREAM;
else if (Pool != D3DPOOL_DEFAULT)
info->usage = PIPE_USAGE_DEFAULT;
else if (Usage & D3DUSAGE_DYNAMIC && Usage & D3DUSAGE_WRITEONLY)
info->usage = PIPE_USAGE_STREAM;
@ -140,6 +142,10 @@ NineBuffer9_ctor( struct NineBuffer9 *This,
memset(This->managed.data, 0, Size);
This->managed.dirty = TRUE;
u_box_1d(0, Size, &This->managed.dirty_box);
u_box_1d(0, 0, &This->managed.valid_region);
u_box_1d(0, 0, &This->managed.required_valid_region);
u_box_1d(0, 0, &This->managed.filled_region);
This->managed.can_unsynchronized = true;
list_inithead(&This->managed.list);
list_inithead(&This->managed.list2);
list_add(&This->managed.list2, &pParams->device->managed_buffers);
@ -245,28 +251,50 @@ NineBuffer9_Lock( struct NineBuffer9 *This,
u_box_1d(OffsetToLock, SizeToLock, &box);
if (This->base.pool != D3DPOOL_DEFAULT) {
/* Systemmem takes into account writes outside the locked region on AMD/NVidia */
if (This->base.pool == D3DPOOL_SYSTEMMEM)
u_box_1d(0, This->size, &box);
/* READONLY doesn't dirty the buffer */
/* Tests on Win: READONLY doesn't wait for the upload */
if (!(Flags & D3DLOCK_READONLY)) {
if (!This->managed.dirty) {
assert(list_is_empty(&This->managed.list));
This->managed.dirty = TRUE;
This->managed.dirty_box = box;
/* Flush if regions pending to be uploaded would be dirtied */
if (p_atomic_read(&This->managed.pending_upload)) {
u_box_intersect_1d(&box, &box, &This->managed.upload_pending_regions);
if (box.width != 0)
nine_csmt_process(This->base.base.device);
}
} else
u_box_union_1d(&This->managed.dirty_box, &This->managed.dirty_box, &box);
/* Tests trying to draw while the buffer is locked show that
* MANAGED buffers are made dirty at Lock time */
/* MANAGED: READONLY doesn't dirty the buffer, nor
* waits for the upload in the worker thread
* SYSTEMMEM: AMD/NVidia: All locks dirty the full buffer. Not on Intel
* For Nvidia, SYSTEMMEM behaves as if there is no worker thread.
* On AMD, READONLY and NOOVERWRITE do dirty the buffer, but do not sync the previous uploads
* in the worker thread. On Intel only NOOVERWRITE has that effect.
* We implement the AMD behaviour. */
if (This->base.pool == D3DPOOL_MANAGED) {
if (!(Flags & D3DLOCK_READONLY)) {
if (!This->managed.dirty) {
assert(list_is_empty(&This->managed.list));
This->managed.dirty = TRUE;
This->managed.dirty_box = box;
/* Flush if regions pending to be uploaded would be dirtied */
if (p_atomic_read(&This->managed.pending_upload)) {
u_box_intersect_1d(&box, &box, &This->managed.upload_pending_regions);
if (box.width != 0)
nine_csmt_process(This->base.base.device);
}
} else
u_box_union_1d(&This->managed.dirty_box, &This->managed.dirty_box, &box);
/* Tests trying to draw while the buffer is locked show that
* SYSTEMMEM/MANAGED buffers are made dirty at Lock time */
BASEBUF_REGISTER_UPDATE(This);
}
} else {
if (!(Flags & (D3DLOCK_READONLY|D3DLOCK_NOOVERWRITE)) &&
p_atomic_read(&This->managed.pending_upload)) {
nine_csmt_process(This->base.base.device);
/* Note: As DISCARD is not relevant for SYSTEMMEM,
* NOOVERWRITE might have a similar meaning as what is
* in D3D7 doc. Basically that data from previous draws
* OF THIS FRAME are unaffected. As we flush csmt in Present(),
* we should be correct. In some parts of the doc, the notion
* of frame is implied to be related to Begin/EndScene(),
* but tests show NOOVERWRITE after EndScene() doesn't flush
* the csmt thread. */
}
This->managed.dirty = true;
u_box_1d(0, This->size, &This->managed.dirty_box); /* systemmem non-dynamic */
u_box_1d(0, 0, &This->managed.valid_region); /* systemmem dynamic */
BASEBUF_REGISTER_UPDATE(This);
}
*ppbData = (char *)This->managed.data + OffsetToLock;
DBG("returning pointer %p\n", *ppbData);
This->nlocks++;

View File

@ -29,6 +29,7 @@
#include "nine_state.h"
#include "resource9.h"
#include "pipe/p_context.h"
#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "util/list.h"
#include "util/u_box.h"
@ -69,6 +70,12 @@ struct NineBuffer9
struct list_head list; /* for update_buffers */
struct list_head list2; /* for managed_buffers */
unsigned pending_upload; /* for uploads */
/* SYSTEMMEM DYNAMIC */
bool can_unsynchronized; /* Whether the upload can use nooverwrite */
struct pipe_box valid_region; /* Region in the GPU buffer with valid content */
struct pipe_box required_valid_region; /* Region that needs to be valid right now. */
struct pipe_box filled_region; /* Region in the GPU buffer filled since last discard */
unsigned frame_count_last_discard;
} managed;
};
static inline struct NineBuffer9 *
@ -101,25 +108,146 @@ NineBuffer9_Lock( struct NineBuffer9 *This,
HRESULT NINE_WINAPI
NineBuffer9_Unlock( struct NineBuffer9 *This );
/* Try to remove b from a, supposed to include b.
 * A 1d box cannot represent a hole, so removal only succeeds when b
 * touches one end of a; otherwise a is copied unchanged. */
static void u_box_try_remove_region_1d(struct pipe_box *dst,
                                       const struct pipe_box *a,
                                       const struct pipe_box *b)
{
    if (b->x == a->x) {
        /* b is a prefix of a: keep the tail. */
        dst->x = a->x + b->width;
        dst->width = a->width - b->width;
    } else if ((b->x + b->width) == (a->x + a->width)) {
        /* b is a suffix of a: keep the head. */
        dst->x = a->x;
        dst->width = a->width - b->width;
    } else {
        /* b sits strictly inside a: nothing removable, return a as-is. */
        dst->x = a->x;
        dst->width = a->width;
    }
}
static inline void
NineBuffer9_Upload( struct NineBuffer9 *This )
{
struct NineDevice9 *device = This->base.base.device;
unsigned upload_flags = 0;
struct pipe_box box_upload;
assert(This->base.pool != D3DPOOL_DEFAULT && This->managed.dirty);
if (This->base.pool == D3DPOOL_SYSTEMMEM && This->base.usage & D3DUSAGE_DYNAMIC) {
struct pipe_box region_already_valid;
struct pipe_box conflicting_region;
struct pipe_box *valid_region = &This->managed.valid_region;
struct pipe_box *required_valid_region = &This->managed.required_valid_region;
struct pipe_box *filled_region = &This->managed.filled_region;
/* Try to upload SYSTEMMEM DYNAMIC in an efficient fashion.
* Unlike non-dynamic for which we upload the whole dirty region, try to
* only upload the data needed for the draw. The draw call preparation
* fills This->managed.required_valid_region for that */
u_box_intersect_1d(&region_already_valid,
valid_region,
required_valid_region);
/* If the required valid region is already valid, nothing to do */
if (region_already_valid.x == required_valid_region->x &&
region_already_valid.width == required_valid_region->width) {
u_box_1d(0, 0, required_valid_region);
return;
}
/* (Try to) Remove valid areas from the region to upload */
u_box_try_remove_region_1d(&box_upload,
required_valid_region,
&region_already_valid);
assert(box_upload.width > 0);
/* To maintain correctly the valid region, as we will do union later with
* box_upload, we must ensure box_upload is consecutive with valid_region */
if (box_upload.x > valid_region->x + valid_region->width && valid_region->width > 0) {
box_upload.width = box_upload.x + box_upload.width - (valid_region->x + valid_region->width);
box_upload.x = valid_region->x + valid_region->width;
} else if (box_upload.x + box_upload.width < valid_region->x && valid_region->width > 0) {
box_upload.width = valid_region->x - box_upload.x;
}
/* There is conflict if some areas, that are not valid but are filled for previous draw calls,
* intersect with the region we plan to upload. Note by construction valid_region IS
* included in filled_region, thus so is region_already_valid. */
u_box_intersect_1d(&conflicting_region, &box_upload, filled_region);
/* As box_upload could still contain region_already_valid, check the intersection
* doesn't happen to be exactly region_already_valid (it cannot be smaller, see above) */
if (This->managed.can_unsynchronized && (conflicting_region.width == 0 ||
(conflicting_region.x == region_already_valid.x &&
conflicting_region.width == region_already_valid.width))) {
/* No conflicts. */
upload_flags |= PIPE_MAP_UNSYNCHRONIZED;
} else {
/* We cannot use PIPE_MAP_UNSYNCHRONIZED. We must choose between no flag and DISCARD.
* Criterias to discard:
* . Most of the resource was filled (but some apps do allocate a big buffer
* to only use a small part in a round fashion)
* . The region to upload is very small compared to the filled region and
* at the start of the buffer (hints at round usage starting again)
* . The region to upload is very big compared to the required region
* . We have not discarded yet this frame */
if (filled_region->width > (This->size / 2) ||
(10 * box_upload.width < filled_region->width &&
box_upload.x < (filled_region->x + filled_region->width)/2) ||
box_upload.width > 2 * required_valid_region->width ||
This->managed.frame_count_last_discard != device->frame_count) {
/* Avoid DISCARDING too much by discarding only if most of the buffer
* has been used */
DBG_FLAG(DBG_INDEXBUFFER|DBG_VERTEXBUFFER,
"Uploading %p DISCARD: valid %d %d, filled %d %d, required %d %d, box_upload %d %d, required already_valid %d %d, conficting %d %d\n",
This, valid_region->x, valid_region->width, filled_region->x, filled_region->width,
required_valid_region->x, required_valid_region->width, box_upload.x, box_upload.width,
region_already_valid.x, region_already_valid.width, conflicting_region.x, conflicting_region.width
);
upload_flags |= PIPE_MAP_DISCARD_WHOLE_RESOURCE;
u_box_1d(0, 0, filled_region);
u_box_1d(0, 0, valid_region);
box_upload = This->managed.required_valid_region;
This->managed.can_unsynchronized = true;
This->managed.frame_count_last_discard = device->frame_count;
} else {
/* Once we use without UNSYNCHRONIZED, we cannot use it anymore.
* TODO: For SYSTEMMEM resources which hit this,
* it would probably be better to use stream_uploader */
This->managed.can_unsynchronized = false;
}
}
u_box_union_1d(filled_region,
filled_region,
&box_upload);
u_box_union_1d(valid_region,
valid_region,
&box_upload);
u_box_1d(0, 0, required_valid_region);
} else
box_upload = This->managed.dirty_box;
if (box_upload.x == 0 && box_upload.width == This->size) {
upload_flags |= PIPE_MAP_DISCARD_WHOLE_RESOURCE;
}
if (This->managed.pending_upload) {
u_box_union_1d(&This->managed.upload_pending_regions,
&This->managed.upload_pending_regions,
&This->managed.dirty_box);
&box_upload);
} else {
This->managed.upload_pending_regions = This->managed.dirty_box;
This->managed.upload_pending_regions = box_upload;
}
DBG_FLAG(DBG_INDEXBUFFER|DBG_VERTEXBUFFER,
"Uploading %p, offset=%d, size=%d, Flags=0x%x\n",
This, box_upload.x, box_upload.width, upload_flags);
nine_context_range_upload(device, &This->managed.pending_upload,
(struct NineUnknown *)This,
This->base.resource,
This->managed.dirty_box.x,
This->managed.dirty_box.width,
(char *)This->managed.data + This->managed.dirty_box.x);
box_upload.x,
box_upload.width,
upload_flags,
(char *)This->managed.data + box_upload.x);
This->managed.dirty = FALSE;
}

View File

@ -249,6 +249,8 @@ NineDevice9_ctor( struct NineDevice9 *This,
* still succeeds when texture allocation fails. */
This->available_texture_limit = This->available_texture_mem * 5LL / 100LL;
This->frame_count = 0; /* Used to check if events occur the same frame */
/* create implicit swapchains */
This->nswapchains = ID3DPresentGroup_GetMultiheadCount(This->present);
This->swapchains = CALLOC(This->nswapchains,
@ -2912,15 +2914,50 @@ NineAfterDraw( struct NineDevice9 *This )
}
}
#define IS_SYSTEMMEM_DYNAMIC(t) ((t) && (t)->base.pool == D3DPOOL_SYSTEMMEM && (t)->base.usage & D3DUSAGE_DYNAMIC)
/* Indicates the region needed right now for these buffers and add them to the list
* of buffers to process in NineBeforeDraw.
* The reason we don't call the upload right now is to generate smaller code (no
* duplication of the NineBuffer9_Upload inline) and to have one upload (of the correct size)
* if a vertex buffer is twice input of the draw call. */
static void
NineTrackSystemmemDynamic( struct NineBuffer9 *This, unsigned start, unsigned width )
{
struct pipe_box box;
u_box_1d(start, width, &box);
u_box_union_1d(&This->managed.required_valid_region,
&This->managed.required_valid_region,
&box);
This->managed.dirty = TRUE;
BASEBUF_REGISTER_UPDATE(This);
}
HRESULT NINE_WINAPI
NineDevice9_DrawPrimitive( struct NineDevice9 *This,
D3DPRIMITIVETYPE PrimitiveType,
UINT StartVertex,
UINT PrimitiveCount )
{
unsigned i;
DBG("iface %p, PrimitiveType %u, StartVertex %u, PrimitiveCount %u\n",
This, PrimitiveType, StartVertex, PrimitiveCount);
/* Tracking for dynamic SYSTEMMEM */
for (i = 0; i < This->caps.MaxStreams; i++) {
unsigned stride = This->state.vtxbuf[i].stride;
if (IS_SYSTEMMEM_DYNAMIC((struct NineBuffer9*)This->state.stream[i])) {
unsigned start = This->state.vtxbuf[i].buffer_offset + StartVertex * stride;
unsigned full_size = This->state.stream[i]->base.size;
unsigned num_vertices = prim_count_to_vertex_count(PrimitiveType, PrimitiveCount);
unsigned size = MIN2(full_size-start, num_vertices * stride);
if (!stride) /* Instancing. Not sure what to do. Require all */
size = full_size;
NineTrackSystemmemDynamic(&This->state.stream[i]->base, start, size);
}
}
NineBeforeDraw(This);
nine_context_draw_primitive(This, PrimitiveType, StartVertex, PrimitiveCount);
NineAfterDraw(This);
@ -2937,6 +2974,7 @@ NineDevice9_DrawIndexedPrimitive( struct NineDevice9 *This,
UINT StartIndex,
UINT PrimitiveCount )
{
unsigned i, num_indices;
DBG("iface %p, PrimitiveType %u, BaseVertexIndex %u, MinVertexIndex %u "
"NumVertices %u, StartIndex %u, PrimitiveCount %u\n",
This, PrimitiveType, BaseVertexIndex, MinVertexIndex, NumVertices,
@ -2945,6 +2983,28 @@ NineDevice9_DrawIndexedPrimitive( struct NineDevice9 *This,
user_assert(This->state.idxbuf, D3DERR_INVALIDCALL);
user_assert(This->state.vdecl, D3DERR_INVALIDCALL);
num_indices = prim_count_to_vertex_count(PrimitiveType, PrimitiveCount);
/* Tracking for dynamic SYSTEMMEM */
if (IS_SYSTEMMEM_DYNAMIC(&This->state.idxbuf->base))
NineTrackSystemmemDynamic(&This->state.idxbuf->base,
StartIndex * This->state.idxbuf->index_size,
num_indices * This->state.idxbuf->index_size);
for (i = 0; i < This->caps.MaxStreams; i++) {
if (IS_SYSTEMMEM_DYNAMIC((struct NineBuffer9*)This->state.stream[i])) {
uint32_t stride = This->state.vtxbuf[i].stride;
uint32_t full_size = This->state.stream[i]->base.size;
uint32_t start, stop;
start = MAX2(0, This->state.vtxbuf[i].buffer_offset+(MinVertexIndex+BaseVertexIndex)*stride);
stop = This->state.vtxbuf[i].buffer_offset+(MinVertexIndex+NumVertices+BaseVertexIndex)*stride;
stop = MIN2(stop, full_size);
NineTrackSystemmemDynamic(&This->state.stream[i]->base,
start, stop-start);
}
}
NineBeforeDraw(This);
nine_context_draw_indexed_primitive(This, PrimitiveType, BaseVertexIndex,
MinVertexIndex, NumVertices, StartIndex,

View File

@ -163,6 +163,8 @@ struct NineDevice9
boolean swvp;
/* pure device */
boolean pure;
unsigned frame_count; /* It's ok if we overflow */
};
static inline struct NineDevice9 *
NineDevice9( void *data )

View File

@ -2522,6 +2522,7 @@ CSMT_ITEM_NO_WAIT_WITH_COUNTER(nine_context_range_upload,
ARG_BIND_RES(struct pipe_resource, res),
ARG_VAL(unsigned, offset),
ARG_VAL(unsigned, size),
ARG_VAL(unsigned, usage),
ARG_VAL(const void *, data))
{
struct nine_context *context = &device->context;
@ -2529,7 +2530,7 @@ CSMT_ITEM_NO_WAIT_WITH_COUNTER(nine_context_range_upload,
/* Binding src_ref avoids release before upload */
(void)src_ref;
context->pipe->buffer_subdata(context->pipe, res, 0, offset, size, data);
context->pipe->buffer_subdata(context->pipe, res, usage, offset, size, data);
}
CSMT_ITEM_NO_WAIT_WITH_COUNTER(nine_context_box_upload,

View File

@ -574,6 +574,7 @@ nine_context_range_upload(struct NineDevice9 *device,
struct pipe_resource *res,
unsigned offset,
unsigned size,
unsigned usage,
const void *data);
void

View File

@ -931,6 +931,7 @@ bypass_rendering:
}
This->base.device->end_scene_since_present = 0;
This->base.device->frame_count++;
return D3D_OK;
}