From 205201c968cceeef1e8d3685851dcfd7e9e806c3 Mon Sep 17 00:00:00 2001
From: Axel Davy
Date: Sat, 13 Mar 2021 11:06:43 +0100
Subject: [PATCH] st/nine: Use stream_uploader for bad cases of systemmem

Using stream_uploader for the cases where we cannot use an
unsynchronized mapping improves performance.

Signed-off-by: Axel Davy
Part-of:
---
 src/gallium/frontends/nine/buffer9.c    | 196 ++++++++++++++++++++++--
 src/gallium/frontends/nine/buffer9.h    | 145 +-----------------
 src/gallium/frontends/nine/nine_state.c |   6 -
 src/gallium/frontends/nine/nine_state.h |   6 +
 4 files changed, 196 insertions(+), 157 deletions(-)

diff --git a/src/gallium/frontends/nine/buffer9.c b/src/gallium/frontends/nine/buffer9.c
index d63480af9ce..92d95a27358 100644
--- a/src/gallium/frontends/nine/buffer9.c
+++ b/src/gallium/frontends/nine/buffer9.c
@@ -23,6 +23,7 @@
 
 #include "buffer9.h"
 #include "device9.h"
+#include "indexbuffer9.h"
 #include "nine_buffer_upload.h"
 #include "nine_helpers.h"
 #include "nine_pipe.h"
@@ -218,7 +219,9 @@ NineBuffer9_GetResource( struct NineBuffer9 *This, unsigned *offset )
 
 static void
 NineBuffer9_RebindIfRequired( struct NineBuffer9 *This,
-                              struct NineDevice9 *device )
+                              struct NineDevice9 *device,
+                              struct pipe_resource *resource,
+                              unsigned offset )
 {
     int i;
 
@@ -226,13 +229,15 @@ NineBuffer9_RebindIfRequired( struct NineBuffer9 *This,
         return;
     for (i = 0; i < device->caps.MaxStreams; i++) {
         if (device->state.stream[i] == (struct NineVertexBuffer9 *)This)
-            nine_context_set_stream_source(device, i,
-                                           (struct NineVertexBuffer9 *)This,
-                                           device->state.vtxbuf[i].buffer_offset,
-                                           device->state.vtxbuf[i].stride);
+            nine_context_set_stream_source_apply(device, i,
+                                                 resource,
+                                                 device->state.vtxbuf[i].buffer_offset + offset,
+                                                 device->state.vtxbuf[i].stride);
     }
     if (device->state.idxbuf == (struct NineIndexBuffer9 *)This)
-        nine_context_set_indices(device, (struct NineIndexBuffer9 *)This);
+        nine_context_set_indices_apply(device, resource,
+                                       ((struct NineIndexBuffer9 *)This)->index_size,
+                                       offset);
 }
 
 HRESULT NINE_WINAPI
@@ -387,7 +392,7 @@ NineBuffer9_Lock( struct NineBuffer9 *This,
             nine_upload_release_buffer(device->buffer_upload, This->buf);
             This->buf = NULL;
             /* Rebind buffer */
-            NineBuffer9_RebindIfRequired(This, device);
+            NineBuffer9_RebindIfRequired(This, device, This->base.resource, 0);
         }
         This->maps[This->nmaps].transfer = NULL;
@@ -406,8 +411,11 @@ NineBuffer9_Lock( struct NineBuffer9 *This,
         }
 
         if (!This->buf) {
+            unsigned offset;
+            struct pipe_resource *res;
             This->buf = nine_upload_create_buffer(device->buffer_upload, This->base.info.width0);
-            NineBuffer9_RebindIfRequired(This, device);
+            res = nine_upload_buffer_resource_and_offset(This->buf, &offset);
+            NineBuffer9_RebindIfRequired(This, device, res, offset);
         }
 
         if (This->buf) {
@@ -451,7 +459,7 @@ NineBuffer9_Lock( struct NineBuffer9 *This,
                 pipe_resource_reference(&This->base.resource, new_res);
                 pipe_resource_reference(&new_res, NULL);
                 usage = PIPE_MAP_WRITE | PIPE_MAP_UNSYNCHRONIZED;
-                NineBuffer9_RebindIfRequired(This, device);
+                NineBuffer9_RebindIfRequired(This, device, This->base.resource, 0);
                 This->maps[This->nmaps].is_pipe_secondary = TRUE;
             }
         } else if (Flags & D3DLOCK_NOOVERWRITE && device->csmt_active)
@@ -527,3 +535,173 @@ NineBuffer9_SetDirty( struct NineBuffer9 *This )
     u_box_1d(0, This->size, &This->managed.dirty_box);
     BASEBUF_REGISTER_UPDATE(This);
 }
+
+/* Try to remove b from a; a is assumed to include b */
+static void u_box_try_remove_region_1d(struct pipe_box *dst,
+                                       const struct pipe_box *a,
+                                       const struct pipe_box *b)
+{
+    int x, width;
+    if 
(a->x == b->x) {
+        x = a->x + b->width;
+        width = a->width - b->width;
+    } else if ((a->x + a->width) == (b->x + b->width)) {
+        x = a->x;
+        width = a->width - b->width;
+    } else {
+        x = a->x;
+        width = a->width;
+    }
+    dst->x = x;
+    dst->width = width;
+}
+
+void
+NineBuffer9_Upload( struct NineBuffer9 *This )
+{
+    struct NineDevice9 *device = This->base.base.device;
+    unsigned upload_flags = 0;
+    struct pipe_box box_upload;
+
+    assert(This->base.pool != D3DPOOL_DEFAULT && This->managed.dirty);
+
+    if (This->base.pool == D3DPOOL_SYSTEMMEM && This->base.usage & D3DUSAGE_DYNAMIC) {
+        struct pipe_box region_already_valid;
+        struct pipe_box conflicting_region;
+        struct pipe_box *valid_region = &This->managed.valid_region;
+        struct pipe_box *required_valid_region = &This->managed.required_valid_region;
+        struct pipe_box *filled_region = &This->managed.filled_region;
+        /* Try to upload SYSTEMMEM DYNAMIC in an efficient fashion.
+         * Unlike the non-dynamic case, for which we upload the whole dirty region,
+         * try to upload only the data needed for the draw. The draw call preparation
+         * fills This->managed.required_valid_region for that */
+        u_box_intersect_1d(&region_already_valid,
+                           valid_region,
+                           required_valid_region);
+        /* If the required valid region is already valid, nothing to do */
+        if (region_already_valid.x == required_valid_region->x &&
+            region_already_valid.width == required_valid_region->width) {
+            /* Rebind if the region happens to be valid in the original buffer
+             * but we have since used vertex_uploader */
+            if (!This->managed.can_unsynchronized)
+                NineBuffer9_RebindIfRequired(This, device, This->base.resource, 0);
+            u_box_1d(0, 0, required_valid_region);
+            return;
+        }
+        /* (Try to) Remove valid areas from the region to upload */
+        u_box_try_remove_region_1d(&box_upload,
+                                   required_valid_region,
+                                   &region_already_valid);
+        assert(box_upload.width > 0);
+        /* To correctly maintain the valid region, since we will union it with
+         * box_upload later, we must ensure box_upload is contiguous with valid_region */
+        if (box_upload.x > valid_region->x + valid_region->width && valid_region->width > 0) {
+            box_upload.width = box_upload.x + box_upload.width - (valid_region->x + valid_region->width);
+            box_upload.x = valid_region->x + valid_region->width;
+        } else if (box_upload.x + box_upload.width < valid_region->x && valid_region->width > 0) {
+            box_upload.width = valid_region->x - box_upload.x;
+        }
+        /* There is a conflict if areas that are not valid, but were filled by previous
+         * draw calls, intersect the region we plan to upload. Note that by construction
+         * valid_region IS included in filled_region, and thus so is region_already_valid. */
+        u_box_intersect_1d(&conflicting_region, &box_upload, filled_region);
+        /* As box_upload could still contain region_already_valid, check the intersection
+         * doesn't happen to be exactly region_already_valid (it cannot be smaller, see above) */
+        if (This->managed.can_unsynchronized && (conflicting_region.width == 0 ||
+            (conflicting_region.x == region_already_valid.x &&
+             conflicting_region.width == region_already_valid.width))) {
+            /* No conflicts. */
+            upload_flags |= PIPE_MAP_UNSYNCHRONIZED;
+        } else {
+            /* We cannot use PIPE_MAP_UNSYNCHRONIZED. We must choose between no flag and DISCARD.
+             * Criteria for discarding:
+             * . Most of the resource was filled (but some apps do allocate a big buffer
+             * and use only a small part of it in a round-robin fashion)
+             * . 
The region to upload is very small compared to the filled region and
+             * at the start of the buffer (hints that round-robin usage is starting again)
+             * . The region to upload is very big compared to the required region
+             * . We have not yet discarded this frame */
+            if (filled_region->width > (This->size / 2) ||
+                (10 * box_upload.width < filled_region->width &&
+                 box_upload.x < (filled_region->x + filled_region->width)/2) ||
+                box_upload.width > 2 * required_valid_region->width ||
+                This->managed.frame_count_last_discard != device->frame_count) {
+                /* Avoid discarding too often by discarding only when most of the
+                 * buffer has been used */
+                DBG_FLAG(DBG_INDEXBUFFER|DBG_VERTEXBUFFER,
+                    "Uploading %p DISCARD: valid %d %d, filled %d %d, required %d %d, box_upload %d %d, required already_valid %d %d, conflicting %d %d\n",
+                    This, valid_region->x, valid_region->width, filled_region->x, filled_region->width,
+                    required_valid_region->x, required_valid_region->width, box_upload.x, box_upload.width,
+                    region_already_valid.x, region_already_valid.width, conflicting_region.x, conflicting_region.width
+                );
+                upload_flags |= PIPE_MAP_DISCARD_WHOLE_RESOURCE;
+                u_box_1d(0, 0, filled_region);
+                u_box_1d(0, 0, valid_region);
+                box_upload = This->managed.required_valid_region;
+                /* Rebind the buffer if we previously used an intermediate alternative buffer */
+                if (!This->managed.can_unsynchronized)
+                    NineBuffer9_RebindIfRequired(This, device, This->base.resource, 0);
+                This->managed.can_unsynchronized = true;
+                This->managed.frame_count_last_discard = device->frame_count;
+            } else {
+                /* Once we have used the buffer without UNSYNCHRONIZED, we cannot
+                 * use UNSYNCHRONIZED anymore. Upload to a different buffer instead. */
+                unsigned buffer_offset = 0;
+                struct pipe_resource *resource = NULL;
+                This->managed.can_unsynchronized = false;
+                u_upload_data(device->vertex_uploader,
+                              required_valid_region->x,
+                              required_valid_region->width,
+                              64,
+                              This->managed.data + required_valid_region->x,
+                              &buffer_offset,
+                              &resource);
+                buffer_offset -= required_valid_region->x;
+                u_upload_unmap(device->vertex_uploader);
+                if (resource) {
+                    NineBuffer9_RebindIfRequired(This, device, resource, buffer_offset);
+                    /* Note: this only works because for these types of buffers this function
+                     * is called before every draw call. Otherwise it wouldn't work when the
+                     * app rebinds buffers. In addition, it requires this function to be called
+                     * only once per buffer, even if it is bound several times, which we do. 
*/ + u_box_1d(0, 0, required_valid_region); + pipe_resource_reference(&resource, NULL); + return; + } + } + } + + u_box_union_1d(filled_region, + filled_region, + &box_upload); + u_box_union_1d(valid_region, + valid_region, + &box_upload); + u_box_1d(0, 0, required_valid_region); + } else + box_upload = This->managed.dirty_box; + + if (box_upload.x == 0 && box_upload.width == This->size) { + upload_flags |= PIPE_MAP_DISCARD_WHOLE_RESOURCE; + } + + if (This->managed.pending_upload) { + u_box_union_1d(&This->managed.upload_pending_regions, + &This->managed.upload_pending_regions, + &box_upload); + } else { + This->managed.upload_pending_regions = box_upload; + } + + DBG_FLAG(DBG_INDEXBUFFER|DBG_VERTEXBUFFER, + "Uploading %p, offset=%d, size=%d, Flags=0x%x\n", + This, box_upload.x, box_upload.width, upload_flags); + nine_context_range_upload(device, &This->managed.pending_upload, + (struct NineUnknown *)This, + This->base.resource, + box_upload.x, + box_upload.width, + upload_flags, + (char *)This->managed.data + box_upload.x); + This->managed.dirty = FALSE; +} diff --git a/src/gallium/frontends/nine/buffer9.h b/src/gallium/frontends/nine/buffer9.h index 039ad1a2cda..275b03bf1b3 100644 --- a/src/gallium/frontends/nine/buffer9.h +++ b/src/gallium/frontends/nine/buffer9.h @@ -33,6 +33,7 @@ #include "pipe/p_state.h" #include "util/list.h" #include "util/u_box.h" +#include "util/u_upload_mgr.h" struct pipe_screen; struct pipe_context; @@ -108,148 +109,8 @@ NineBuffer9_Lock( struct NineBuffer9 *This, HRESULT NINE_WINAPI NineBuffer9_Unlock( struct NineBuffer9 *This ); -/* Try to remove b from a, supposed to include b */ -static void u_box_try_remove_region_1d(struct pipe_box *dst, - const struct pipe_box *a, - const struct pipe_box *b) -{ - int x, width; - if (a->x == b->x) { - x = a->x + b->width; - width = a->width - b->width; - } else if ((a->x + a->width) == (b->x + b->width)) { - x = a->x; - width = a->width - b->width; - } else { - x = a->x; - width = a->width; - } - dst->x = x; - dst->width = width; -} - -static inline void -NineBuffer9_Upload( struct NineBuffer9 *This ) -{ - struct NineDevice9 *device = This->base.base.device; - unsigned upload_flags = 0; - struct pipe_box box_upload; - - assert(This->base.pool != D3DPOOL_DEFAULT && This->managed.dirty); - - if (This->base.pool == D3DPOOL_SYSTEMMEM && This->base.usage & D3DUSAGE_DYNAMIC) { - struct pipe_box region_already_valid; - struct pipe_box conflicting_region; - struct pipe_box *valid_region = &This->managed.valid_region; - struct pipe_box *required_valid_region = &This->managed.required_valid_region; - struct pipe_box *filled_region = &This->managed.filled_region; - /* Try to upload SYSTEMMEM DYNAMIC in an efficient fashion. - * Unlike non-dynamic for which we upload the whole dirty region, try to - * only upload the data needed for the draw. 
The draw call preparation
-     * fills This->managed.required_valid_region for that */
-    u_box_intersect_1d(&region_already_valid,
-                       valid_region,
-                       required_valid_region);
-    /* If the required valid region is already valid, nothing to do */
-    if (region_already_valid.x == required_valid_region->x &&
-        region_already_valid.width == required_valid_region->width) {
-        u_box_1d(0, 0, required_valid_region);
-        return;
-    }
-    /* (Try to) Remove valid areas from the region to upload */
-    u_box_try_remove_region_1d(&box_upload,
-                               required_valid_region,
-                               &region_already_valid);
-    assert(box_upload.width > 0);
-    /* To maintain correctly the valid region, as we will do union later with
-     * box_upload, we must ensure box_upload is consecutive with valid_region */
-    if (box_upload.x > valid_region->x + valid_region->width && valid_region->width > 0) {
-        box_upload.width = box_upload.x + box_upload.width - (valid_region->x + valid_region->width);
-        box_upload.x = valid_region->x + valid_region->width;
-    } else if (box_upload.x + box_upload.width < valid_region->x && valid_region->width > 0) {
-        box_upload.width = valid_region->x - box_upload.x;
-    }
-    /* There is conflict if some areas, that are not valid but are filled for previous draw calls,
-     * intersect with the region we plan to upload. Note by construction valid_region IS
-     * included in filled_region, thus so is region_already_valid. */
-    u_box_intersect_1d(&conflicting_region, &box_upload, filled_region);
-    /* As box_upload could still contain region_already_valid, check the intersection
-     * doesn't happen to be exactly region_already_valid (it cannot be smaller, see above) */
-    if (This->managed.can_unsynchronized && (conflicting_region.width == 0 ||
-        (conflicting_region.x == region_already_valid.x &&
-         conflicting_region.width == region_already_valid.width))) {
-        /* No conflicts. */
-        upload_flags |= PIPE_MAP_UNSYNCHRONIZED;
-    } else {
-        /* We cannot use PIPE_MAP_UNSYNCHRONIZED. We must choose between no flag and DISCARD.
-         * Criterias to discard:
-         * . Most of the resource was filled (but some apps do allocate a big buffer
-         * to only use a small part in a round fashion)
-         * . The region to upload is very small compared to the filled region and
-         * at the start of the buffer (hints at round usage starting again)
-         * . The region to upload is very big compared to the required region
-         * . 
We have not discarded yet this frame */ - if (filled_region->width > (This->size / 2) || - (10 * box_upload.width < filled_region->width && - box_upload.x < (filled_region->x + filled_region->width)/2) || - box_upload.width > 2 * required_valid_region->width || - This->managed.frame_count_last_discard != device->frame_count) { - /* Avoid DISCARDING too much by discarding only if most of the buffer - * has been used */ - DBG_FLAG(DBG_INDEXBUFFER|DBG_VERTEXBUFFER, - "Uploading %p DISCARD: valid %d %d, filled %d %d, required %d %d, box_upload %d %d, required already_valid %d %d, conficting %d %d\n", - This, valid_region->x, valid_region->width, filled_region->x, filled_region->width, - required_valid_region->x, required_valid_region->width, box_upload.x, box_upload.width, - region_already_valid.x, region_already_valid.width, conflicting_region.x, conflicting_region.width - ); - upload_flags |= PIPE_MAP_DISCARD_WHOLE_RESOURCE; - u_box_1d(0, 0, filled_region); - u_box_1d(0, 0, valid_region); - box_upload = This->managed.required_valid_region; - This->managed.can_unsynchronized = true; - This->managed.frame_count_last_discard = device->frame_count; - } else { - /* Once we use without UNSYNCHRONIZED, we cannot use it anymore. - * TODO: For SYSTEMMEM resources which hit this, - * it would probably be better to use stream_uploader */ - This->managed.can_unsynchronized = false; - } - } - - u_box_union_1d(filled_region, - filled_region, - &box_upload); - u_box_union_1d(valid_region, - valid_region, - &box_upload); - u_box_1d(0, 0, required_valid_region); - } else - box_upload = This->managed.dirty_box; - - if (box_upload.x == 0 && box_upload.width == This->size) { - upload_flags |= PIPE_MAP_DISCARD_WHOLE_RESOURCE; - } - - if (This->managed.pending_upload) { - u_box_union_1d(&This->managed.upload_pending_regions, - &This->managed.upload_pending_regions, - &box_upload); - } else { - This->managed.upload_pending_regions = box_upload; - } - - DBG_FLAG(DBG_INDEXBUFFER|DBG_VERTEXBUFFER, - "Uploading %p, offset=%d, size=%d, Flags=0x%x\n", - This, box_upload.x, box_upload.width, upload_flags); - nine_context_range_upload(device, &This->managed.pending_upload, - (struct NineUnknown *)This, - This->base.resource, - box_upload.x, - box_upload.width, - upload_flags, - (char *)This->managed.data + box_upload.x); - This->managed.dirty = FALSE; -} +void +NineBuffer9_Upload( struct NineBuffer9 *This ); static void inline NineBindBufferToDevice( struct NineDevice9 *device, diff --git a/src/gallium/frontends/nine/nine_state.c b/src/gallium/frontends/nine/nine_state.c index 136e994aafc..ad84a6e0093 100644 --- a/src/gallium/frontends/nine/nine_state.c +++ b/src/gallium/frontends/nine/nine_state.c @@ -1391,12 +1391,6 @@ nine_context_set_texture_apply(struct NineDevice9 *device, struct pipe_sampler_view *view0, struct pipe_sampler_view *view1); -static void -nine_context_set_indices_apply(struct NineDevice9 *device, - struct pipe_resource *res, - UINT IndexSize, - UINT OffsetInBytes); - static void nine_context_set_pixel_shader_constant_i_transformed(struct NineDevice9 *device, UINT StartRegister, diff --git a/src/gallium/frontends/nine/nine_state.h b/src/gallium/frontends/nine/nine_state.h index d5dfa1065a4..dcb66ca87f8 100644 --- a/src/gallium/frontends/nine/nine_state.h +++ b/src/gallium/frontends/nine/nine_state.h @@ -360,6 +360,12 @@ nine_context_set_stream_source_apply(struct NineDevice9 *device, UINT OffsetInBytes, UINT Stride); +void +nine_context_set_indices_apply(struct NineDevice9 *device, + struct 
pipe_resource *res, + UINT IndexSize, + UINT OffsetInBytes); + void nine_context_set_render_state(struct NineDevice9 *device, D3DRENDERSTATETYPE State,
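
Note (commentary below the diff; text after the final hunk is ignored by
git am): the SYSTEMMEM|DYNAMIC path added to NineBuffer9_Upload picks one
of three upload strategies before each draw. The sketch below restates
that decision logic in a minimal standalone form, for readability only:
box1d, upload_path, box1d_intersect and choose_path are hypothetical
stand-ins for pipe_box, the PIPE_MAP_* flags and the in-tree helpers, and
the box_upload/required_valid_region distinction is collapsed into a
single 'required' box, so only the shape of the heuristics mirrors the
patch, not its exact arithmetic.

#include <stdbool.h>
#include <stdio.h>

struct box1d { int x, width; };

/* 1D intersection, in the spirit of u_box_intersect_1d(); an empty
 * result has width 0. */
static struct box1d
box1d_intersect(struct box1d a, struct box1d b)
{
    int lo = a.x > b.x ? a.x : b.x;
    int hi = (a.x + a.width < b.x + b.width) ? a.x + a.width : b.x + b.width;
    struct box1d r = { lo, hi > lo ? hi - lo : 0 };
    return r;
}

enum upload_path {
    PATH_NOOP,           /* required region already valid, skip upload  */
    PATH_UNSYNCHRONIZED, /* map with PIPE_MAP_UNSYNCHRONIZED            */
    PATH_DISCARD,        /* map with PIPE_MAP_DISCARD_WHOLE_RESOURCE    */
    PATH_STREAM_UPLOAD   /* route the data through the stream uploader */
};

static enum upload_path
choose_path(struct box1d valid, struct box1d filled, struct box1d required,
            int buffer_size, bool can_unsynchronized, bool discarded_this_frame)
{
    /* Part of the required region that previous uploads already made valid. */
    struct box1d already_valid = box1d_intersect(valid, required);
    if (already_valid.width == required.width)
        return PATH_NOOP;

    /* Conflict: the upload overlaps data filled by previous draw calls
     * beyond what is already valid. */
    struct box1d conflict = box1d_intersect(required, filled);
    bool overlap_is_valid_only = conflict.x == already_valid.x &&
                                 conflict.width == already_valid.width;
    if (can_unsynchronized && (conflict.width == 0 || overlap_is_valid_only))
        return PATH_UNSYNCHRONIZED;

    /* Discard heuristics, echoing the patch: most of the buffer is in
     * use, a small write near the start hints at round-robin reuse
     * starting over, or we have not yet discarded this frame. */
    if (filled.width > buffer_size / 2 ||
        (10 * required.width < filled.width &&
         required.x < (filled.x + filled.width) / 2) ||
        !discarded_this_frame)
        return PATH_DISCARD;

    /* Otherwise upload through the stream uploader; the real code then
     * permanently clears can_unsynchronized for this buffer. */
    return PATH_STREAM_UPLOAD;
}

int main(void)
{
    /* Appending fresh data right after the filled half of the buffer:
     * no conflict, so an unsynchronized map is chosen (prints 1). */
    struct box1d valid = { 0, 2048 }, filled = { 0, 2048 }, required = { 2048, 512 };
    printf("%d\n", choose_path(valid, filled, required, 8192, true, true));
    return 0;
}

The ordering is the point: the cheap paths are tried first, and once the
stream-upload fallback has been taken the patch never returns to the
unsynchronized path for that buffer, which is why it must rebind via
NineBuffer9_RebindIfRequired before every draw that uses it.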