radeonsi: use SDMA for uploading data through const_uploader
v2: use tc.stream_uploader in si_buffer_transfer_map if not called from
    the driver thread

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com> (v1)
Tested-by: Dieter Nützel <Dieter@nuetzel-hh.de>
commit edbd2c1ff5
parent 54f7545cd7
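For context, a minimal sketch (not part of this commit) of how constant data typically reaches pipe_context::const_uploader. The helper name, alignment and sizes are illustrative only; with this change, radeonsi backs const_uploader with a GTT staging buffer and replays the written ranges to VRAM over SDMA when the GFX IB is flushed.

#include "pipe/p_context.h"
#include "util/u_upload_mgr.h"

/* Illustrative only: how a driver or state tracker pushes constants
 * through const_uploader. */
static void upload_constants_sketch(struct pipe_context *ctx,
                                    const void *data, unsigned size,
                                    unsigned *out_offset,
                                    struct pipe_resource **out_buf)
{
	/* u_upload_data() allocates a range in the (persistently mapped)
	 * upload buffer, memcpy's the data and returns buffer + offset. */
	u_upload_data(ctx->const_uploader, 0, size, 256, data,
	              out_offset, out_buf);
}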
@@ -440,7 +440,15 @@ static void *si_buffer_transfer_map(struct pipe_context *ctx,
 		}
 	}
 
-	if ((usage & PIPE_TRANSFER_DISCARD_RANGE) &&
+	if (usage & PIPE_TRANSFER_FLUSH_EXPLICIT &&
+	    buf->b.b.flags & SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA) {
+		usage &= ~(PIPE_TRANSFER_UNSYNCHRONIZED |
+			   PIPE_TRANSFER_PERSISTENT);
+		usage |= PIPE_TRANSFER_DISCARD_RANGE;
+		force_discard_range = true;
+	}
+
+	if (usage & PIPE_TRANSFER_DISCARD_RANGE &&
 	    ((!(usage & (PIPE_TRANSFER_UNSYNCHRONIZED |
 			 PIPE_TRANSFER_PERSISTENT))) ||
 	     (buf->flags & RADEON_FLAG_SPARSE))) {
@@ -453,10 +461,20 @@ static void *si_buffer_transfer_map(struct pipe_context *ctx,
 		     si_rings_is_buffer_referenced(sctx, buf->buf, RADEON_USAGE_READWRITE) ||
 		     !sctx->ws->buffer_wait(buf->buf, 0, RADEON_USAGE_READWRITE)) {
 			/* Do a wait-free write-only transfer using a temporary buffer. */
-			unsigned offset;
+			struct u_upload_mgr *uploader;
 			struct si_resource *staging = NULL;
+			unsigned offset;
 
-			u_upload_alloc(ctx->stream_uploader, 0,
+			/* If we are not called from the driver thread, we have
+			 * to use the uploader from u_threaded_context, which is
+			 * local to the calling thread.
+			 */
+			if (usage & TC_TRANSFER_MAP_THREADED_UNSYNC)
+				uploader = sctx->tc->base.stream_uploader;
+			else
+				uploader = sctx->b.stream_uploader;
+
+			u_upload_alloc(uploader, 0,
 				       box->width + (box->x % SI_MAP_BUFFER_ALIGNMENT),
 				       sctx->screen->info.tcc_cache_line_size,
 				       &offset, (struct pipe_resource**)&staging,
@@ -521,6 +539,7 @@ static void si_buffer_do_flush_region(struct pipe_context *ctx,
 				      struct pipe_transfer *transfer,
 				      const struct pipe_box *box)
 {
+	struct si_context *sctx = (struct si_context*)ctx;
 	struct si_transfer *stransfer = (struct si_transfer*)transfer;
 	struct si_resource *buf = si_resource(transfer->resource);
 
@@ -529,10 +548,49 @@ static void si_buffer_do_flush_region(struct pipe_context *ctx,
 				      transfer->box.x % SI_MAP_BUFFER_ALIGNMENT +
 				      (box->x - transfer->box.x);
 
+		if (buf->b.b.flags & SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA) {
+			/* This should be true for all uploaders. */
+			assert(transfer->box.x == 0);
+
+			/* Find a previous upload and extend its range. The last
+			 * upload is likely to be at the end of the list.
+			 */
+			for (int i = sctx->num_sdma_uploads - 1; i >= 0; i--) {
+				struct si_sdma_upload *up = &sctx->sdma_uploads[i];
+
+				if (up->dst != buf)
+					continue;
+
+				assert(up->src == stransfer->staging);
+				assert(box->x > up->dst_offset);
+				up->size = box->x + box->width - up->dst_offset;
+				return;
+			}
+
+			/* Enlarge the array if it's full. */
+			if (sctx->num_sdma_uploads == sctx->max_sdma_uploads) {
+				unsigned size;
+
+				sctx->max_sdma_uploads += 4;
+				size = sctx->max_sdma_uploads * sizeof(sctx->sdma_uploads[0]);
+				sctx->sdma_uploads = realloc(sctx->sdma_uploads, size);
+			}
+
+			/* Add a new upload. */
+			struct si_sdma_upload *up =
+				&sctx->sdma_uploads[sctx->num_sdma_uploads++];
+			up->dst = up->src = NULL;
+			si_resource_reference(&up->dst, buf);
+			si_resource_reference(&up->src, stransfer->staging);
+			up->dst_offset = box->x;
+			up->src_offset = src_offset;
+			up->size = box->width;
+			return;
+		}
+
 		/* Copy the staging buffer into the original one. */
-		si_copy_buffer((struct si_context*)ctx, transfer->resource,
-			       &stransfer->staging->b.b, box->x, src_offset,
-			       box->width);
+		si_copy_buffer(sctx, transfer->resource, &stransfer->staging->b.b,
+			       box->x, src_offset, box->width);
 	}
 
 	util_range_add(&buf->valid_buffer_range, box->x,

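Aside (not part of the patch): si_buffer_do_flush_region is reached through gallium's explicit-flush mapping protocol. A hedged sketch of that consumer-side pattern, with illustrative sizes and a hypothetical helper name:

#include <string.h>
#include "util/u_inlines.h"

/* Hypothetical caller: map a buffer with explicit flushing, write a
 * sub-range, then tell the driver exactly which bytes were touched.
 * The flush_mapped_range call is what ends up in
 * si_buffer_do_flush_region(). */
static void write_range_sketch(struct pipe_context *ctx,
                               struct pipe_resource *buf,
                               const void *data, unsigned size)
{
	struct pipe_transfer *t;
	void *map = pipe_buffer_map_range(ctx, buf, 0, size,
	                                  PIPE_TRANSFER_WRITE |
	                                  PIPE_TRANSFER_FLUSH_EXPLICIT, &t);
	memcpy(map, data, size);
	pipe_buffer_flush_mapped_range(ctx, t, 0, size); /* explicit flush */
	pipe_buffer_unmap(ctx, t);
}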
@@ -140,7 +140,8 @@ void si_need_dma_space(struct si_context *ctx, unsigned num_dw,
 	}
 
 	/* Flush the GFX IB if DMA depends on it. */
-	if (radeon_emitted(ctx->gfx_cs, ctx->initial_gfx_cs_size) &&
+	if (!ctx->sdma_uploads_in_progress &&
+	    radeon_emitted(ctx->gfx_cs, ctx->initial_gfx_cs_size) &&
 	    ((dst &&
 	      ws->cs_is_buffer_referenced(ctx->gfx_cs, dst->buf,
 					  RADEON_USAGE_READWRITE)) ||
@@ -162,9 +163,10 @@ void si_need_dma_space(struct si_context *ctx, unsigned num_dw,
 	 * engine busy while uploads are being submitted.
 	 */
 	num_dw++; /* for emit_wait_idle below */
-	if (!ws->cs_check_space(ctx->dma_cs, num_dw) ||
-	    ctx->dma_cs->used_vram + ctx->dma_cs->used_gart > 64 * 1024 * 1024 ||
-	    !radeon_cs_memory_below_limit(ctx->screen, ctx->dma_cs, vram, gtt)) {
+	if (!ctx->sdma_uploads_in_progress &&
+	    (!ws->cs_check_space(ctx->dma_cs, num_dw) ||
+	     ctx->dma_cs->used_vram + ctx->dma_cs->used_gart > 64 * 1024 * 1024 ||
+	     !radeon_cs_memory_below_limit(ctx->screen, ctx->dma_cs, vram, gtt))) {
 		si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
 		assert((num_dw + ctx->dma_cs->current.cdw) <= ctx->dma_cs->current.max_dw);
 	}
@@ -180,13 +182,14 @@ void si_need_dma_space(struct si_context *ctx, unsigned num_dw,
 			   RADEON_USAGE_WRITE)))
 		si_dma_emit_wait_idle(ctx);
 
+	unsigned sync = ctx->sdma_uploads_in_progress ? 0 : RADEON_USAGE_SYNCHRONIZED;
 	if (dst) {
-		radeon_add_to_buffer_list(ctx, ctx->dma_cs, dst,
-					  RADEON_USAGE_WRITE, 0);
+		ws->cs_add_buffer(ctx->dma_cs, dst->buf, RADEON_USAGE_WRITE | sync,
+				  dst->domains, 0);
 	}
 	if (src) {
-		radeon_add_to_buffer_list(ctx, ctx->dma_cs, src,
-					  RADEON_USAGE_READ, 0);
+		ws->cs_add_buffer(ctx->dma_cs, src->buf, RADEON_USAGE_READ | sync,
+				  src->domains, 0);
 	}
 
 	/* this function is called before all DMA calls, so increment this. */

@@ -26,6 +26,7 @@
 #include "si_pipe.h"
 
 #include "util/os_time.h"
+#include "util/u_upload_mgr.h"
 
 /* initialize */
 void si_need_gfx_cs_space(struct si_context *ctx)
@@ -64,6 +65,15 @@ void si_need_gfx_cs_space(struct si_context *ctx)
 		si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
 }
 
+void si_unref_sdma_uploads(struct si_context *sctx)
+{
+	for (unsigned i = 0; i < sctx->num_sdma_uploads; i++) {
+		si_resource_reference(&sctx->sdma_uploads[i].dst, NULL);
+		si_resource_reference(&sctx->sdma_uploads[i].src, NULL);
+	}
+	sctx->num_sdma_uploads = 0;
+}
+
 void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,
 		     struct pipe_fence_handle **fence)
 {
@@ -98,17 +108,37 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,
 	if (ctx->screen->debug_flags & DBG(CHECK_VM))
 		flags &= ~PIPE_FLUSH_ASYNC;
 
-	ctx->gfx_flush_in_progress = true;
-
 	/* If the state tracker is flushing the GFX IB, si_flush_from_st is
 	 * responsible for flushing the DMA IB and merging the fences from both.
-	 * This code is only needed when the driver flushes the GFX IB
-	 * internally, and it never asks for a fence handle.
+	 * If the driver flushes the GFX IB internally, and it should never ask
+	 * for a fence handle.
 	 */
-	if (radeon_emitted(ctx->dma_cs, 0)) {
-		assert(fence == NULL); /* internal flushes only */
-		si_flush_dma_cs(ctx, flags, NULL);
-	}
+	assert(!radeon_emitted(ctx->dma_cs, 0) || fence == NULL);
+
+	ctx->gfx_flush_in_progress = true;
+
+	/* Update the sdma_uploads list by flushing the uploader. */
+	u_upload_unmap(ctx->b.const_uploader);
+
+	/* Execute SDMA uploads. */
+	ctx->sdma_uploads_in_progress = true;
+	for (unsigned i = 0; i < ctx->num_sdma_uploads; i++) {
+		struct si_sdma_upload *up = &ctx->sdma_uploads[i];
+		struct pipe_box box;
+
+		assert(up->src_offset % 4 == 0 && up->dst_offset % 4 == 0 &&
+		       up->size % 4 == 0);
+
+		u_box_1d(up->src_offset, up->size, &box);
+		ctx->dma_copy(&ctx->b, &up->dst->b.b, 0, up->dst_offset, 0, 0,
+			      &up->src->b.b, 0, &box);
+	}
+	ctx->sdma_uploads_in_progress = false;
+	si_unref_sdma_uploads(ctx);
+
+	/* Flush SDMA (preamble IB). */
+	if (radeon_emitted(ctx->dma_cs, 0))
+		si_flush_dma_cs(ctx, flags, NULL);
 
 	if (!LIST_IS_EMPTY(&ctx->active_queries))
 		si_suspend_queries(ctx);

@@ -264,6 +264,7 @@ static void si_destroy_context(struct pipe_context *context)
 	util_dynarray_fini(&sctx->resident_tex_needs_color_decompress);
 	util_dynarray_fini(&sctx->resident_img_needs_color_decompress);
 	util_dynarray_fini(&sctx->resident_tex_needs_depth_decompress);
+	si_unref_sdma_uploads(sctx);
 	FREE(sctx);
 }
 
@@ -443,14 +444,6 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
 	if (!sctx->b.stream_uploader)
 		goto fail;
 
-	sctx->b.const_uploader = u_upload_create(&sctx->b, 128 * 1024,
-						 0, PIPE_USAGE_DEFAULT,
-						 SI_RESOURCE_FLAG_32BIT |
-						 (sscreen->cpdma_prefetch_writes_memory ?
-						  0 : SI_RESOURCE_FLAG_READ_ONLY));
-	if (!sctx->b.const_uploader)
-		goto fail;
-
 	sctx->cached_gtt_allocator = u_upload_create(&sctx->b, 16 * 1024,
 						     0, PIPE_USAGE_STAGING, 0);
 	if (!sctx->cached_gtt_allocator)
@@ -466,6 +459,20 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
 					   sctx, stop_exec_on_failure);
 	}
 
+	bool use_sdma_upload = sscreen->info.has_dedicated_vram && sctx->dma_cs;
+	sctx->b.const_uploader = u_upload_create(&sctx->b, 256 * 1024,
+						 0, PIPE_USAGE_DEFAULT,
+						 SI_RESOURCE_FLAG_32BIT |
+						 (use_sdma_upload ?
+						  SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA :
+						  (sscreen->cpdma_prefetch_writes_memory ?
+						   0 : SI_RESOURCE_FLAG_READ_ONLY)));
+	if (!sctx->b.const_uploader)
+		goto fail;
+
+	if (use_sdma_upload)
+		u_upload_enable_flush_explicit(sctx->b.const_uploader);
+
 	si_init_buffer_functions(sctx);
 	si_init_clear_functions(sctx);
 	si_init_blit_functions(sctx);

@@ -110,6 +110,8 @@
 #define SI_RESOURCE_FLAG_READ_ONLY	(PIPE_RESOURCE_FLAG_DRV_PRIV << 5)
 #define SI_RESOURCE_FLAG_32BIT		(PIPE_RESOURCE_FLAG_DRV_PRIV << 6)
 #define SI_RESOURCE_FLAG_CLEAR		(PIPE_RESOURCE_FLAG_DRV_PRIV << 7)
+/* For const_uploader, upload data via GTT and copy to VRAM on context flush via SDMA. */
+#define SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA	(PIPE_RESOURCE_FLAG_DRV_PRIV << 8)
 
 enum si_clear_code
 {
@@ -776,6 +778,14 @@ struct si_saved_cs {
 	int64_t			time_flush;
 };
 
+struct si_sdma_upload {
+	struct si_resource	*dst;
+	struct si_resource	*src;
+	unsigned		src_offset;
+	unsigned		dst_offset;
+	unsigned		size;
+};
+
 struct si_context {
 	struct pipe_context		b; /* base class */
 
@@ -1081,6 +1091,12 @@ struct si_context {
 	bool			render_cond_invert;
 	bool			render_cond_force_off; /* for u_blitter */
 
+	/* For uploading data via GTT and copy to VRAM on context flush via SDMA. */
+	bool			sdma_uploads_in_progress;
+	struct si_sdma_upload	*sdma_uploads;
+	unsigned		num_sdma_uploads;
+	unsigned		max_sdma_uploads;
+
 	/* Statistics gathering for the DCC enablement heuristic. It can't be
 	 * in si_texture because si_texture can be shared by multiple
 	 * contexts. This is for back buffers only. We shouldn't get too many
@@ -1280,6 +1296,7 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,
 		     struct pipe_fence_handle **fence);
 void si_begin_new_gfx_cs(struct si_context *ctx);
 void si_need_gfx_cs_space(struct si_context *ctx);
+void si_unref_sdma_uploads(struct si_context *sctx);
 
 /* si_gpu_load.c */
 void si_gpu_load_kill_thread(struct si_screen *sscreen);