gallium/u_threaded: implement pipelined partial buffer uploads using CPU storage

This removes the use of resource_copy_region for BufferSubData. Drivers have to opt in
to use this.

See the comment in the header file.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Reviewed-By: Mike Blumenkrantz <michael.blumenkrantz@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13399>
Author: Marek Olšák
Date: 2021-10-17 17:16:09 -04:00
Committed by: Marge Bot
Parent: cc2f3a0168
Commit: 3df9d8ed80
9 changed files with 138 additions and 15 deletions
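
For context, drivers opt in to the CPU storage per resource through the new threaded_resource_init() signature shown in the diff below. A minimal sketch of what an opted-in resource-creation path might look like; the example_* names and the 256-byte map alignment are illustrative assumptions, only the threaded_resource_init() call and its allow_cpu_storage rule come from this change:

#include "util/u_memory.h"
#include "util/u_inlines.h"
#include "util/u_threaded_context.h"

struct example_resource {
   struct threaded_resource b;
};

/* Hypothetical driver hook; everything except the threaded_resource_init()
 * call is a placeholder. */
static struct pipe_resource *
example_resource_create(struct pipe_screen *screen,
                        const struct pipe_resource *templ)
{
   struct example_resource *res = CALLOC_STRUCT(example_resource);
   if (!res)
      return NULL;

   res->b.b = *templ;
   pipe_reference_init(&res->b.b.reference, 1);
   res->b.b.screen = screen;

   /* Opt in only for ordinary buffers; user memory and imported buffers
    * must pass false. 256 is an assumed map alignment for illustration. */
   threaded_resource_init(&res->b.b,
                          templ->target == PIPE_BUFFER, 256);
   return &res->b.b;
}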


@@ -120,7 +120,7 @@ static struct pipe_resource *noop_resource_create(struct pipe_screen *screen,
FREE(nresource);
return NULL;
}
threaded_resource_init(&nresource->b.b);
threaded_resource_init(&nresource->b.b, false, 0);
return &nresource->b.b;
}


@@ -71,6 +71,12 @@ typedef uint16_t (*tc_execute)(struct pipe_context *pipe, void *call, uint64_t *
static const tc_execute execute_func[TC_NUM_CALLS];
static void
tc_buffer_subdata(struct pipe_context *_pipe,
struct pipe_resource *resource,
unsigned usage, unsigned offset,
unsigned size, const void *data);
static void
tc_batch_check(UNUSED struct tc_batch *batch)
{
@@ -669,18 +675,32 @@ tc_is_buffer_busy(struct threaded_context *tc, struct threaded_resource *tbuf,
return tc->options.is_resource_busy(tc->pipe->screen, tbuf->latest, map_usage);
}
/**
* allow_cpu_storage should be false for user memory and imported buffers.
*/
void
threaded_resource_init(struct pipe_resource *res)
threaded_resource_init(struct pipe_resource *res, bool allow_cpu_storage,
unsigned map_buffer_alignment)
{
struct threaded_resource *tres = threaded_resource(res);
tres->latest = &tres->b;
tres->cpu_storage = NULL;
util_range_init(&tres->valid_buffer_range);
tres->is_shared = false;
tres->is_user_ptr = false;
tres->buffer_id_unique = 0;
tres->pending_staging_uploads = 0;
util_range_init(&tres->pending_staging_uploads_range);
if (allow_cpu_storage &&
!(res->flags & (PIPE_RESOURCE_FLAG_MAP_PERSISTENT |
PIPE_RESOURCE_FLAG_SPARSE |
PIPE_RESOURCE_FLAG_ENCRYPTED)) &&
/* We need buffer invalidation and buffer busyness tracking for the CPU
* storage, which aren't supported with pipe_vertex_state. */
!(res->bind & PIPE_BIND_VERTEX_STATE))
tres->cpu_storage = align_malloc(res->width0, map_buffer_alignment);
}
void
@@ -692,6 +712,7 @@ threaded_resource_deinit(struct pipe_resource *res)
pipe_resource_reference(&tres->latest, NULL);
util_range_destroy(&tres->valid_buffer_range);
util_range_destroy(&tres->pending_staging_uploads_range);
align_free(tres->cpu_storage);
}
struct pipe_context *
@@ -907,10 +928,12 @@ tc_get_query_result_resource(struct pipe_context *_pipe,
struct pipe_resource *resource, unsigned offset)
{
struct threaded_context *tc = threaded_context(_pipe);
tc_buffer_disable_cpu_storage(resource);
struct tc_query_result_resource *p =
tc_add_call(tc, TC_CALL_get_query_result_resource,
tc_query_result_resource);
p->query = query;
p->wait = wait;
p->result_type = result_type;
@@ -1499,6 +1522,7 @@ tc_set_shader_images(struct pipe_context *_pipe,
if (images[i].access & PIPE_IMAGE_ACCESS_WRITE) {
struct threaded_resource *tres = threaded_resource(resource);
tc_buffer_disable_cpu_storage(resource);
util_range_add(&tres->b, &tres->valid_buffer_range,
images[i].u.buf.offset,
images[i].u.buf.offset + images[i].u.buf.size);
@@ -1591,6 +1615,7 @@ tc_set_shader_buffers(struct pipe_context *_pipe,
tc_bind_buffer(&tc->shader_buffers[shader][start + i], next, &tres->b);
if (writable_bitmask & BITFIELD_BIT(i)) {
tc_buffer_disable_cpu_storage(src->buffer);
util_range_add(&tres->b, &tres->valid_buffer_range,
src->buffer_offset,
src->buffer_offset + src->buffer_size);
@@ -1737,6 +1762,7 @@ tc_set_stream_output_targets(struct pipe_context *_pipe,
p->targets[i] = NULL;
pipe_so_target_reference(&p->targets[i], tgs[i]);
if (tgs[i]) {
tc_buffer_disable_cpu_storage(tgs[i]->buffer);
tc_bind_buffer(&tc->streamout_buffers[i], next, tgs[i]->buffer);
} else {
tc_unbind_buffer(&tc->streamout_buffers[i]);
@@ -1906,6 +1932,9 @@ tc_create_image_handle(struct pipe_context *_pipe,
struct threaded_context *tc = threaded_context(_pipe);
struct pipe_context *pipe = tc->pipe;
if (image->resource->target == PIPE_BUFFER)
tc_buffer_disable_cpu_storage(image->resource);
tc_sync(tc);
return pipe->create_image_handle(pipe, image);
}
@@ -2142,8 +2171,35 @@ tc_buffer_map(struct pipe_context *_pipe,
struct threaded_resource *tres = threaded_resource(resource);
struct pipe_context *pipe = tc->pipe;
/* PIPE_MAP_THREAD_SAFE is for glthread, which shouldn't use the CPU storage and
* this shouldn't normally be necessary because glthread only uses large buffers.
*/
if (usage & PIPE_MAP_THREAD_SAFE)
tc_buffer_disable_cpu_storage(resource);
usage = tc_improve_map_buffer_flags(tc, tres, usage, box->x, box->width);
/* If the CPU storage is enabled, return it directly. */
if (tres->cpu_storage && !(usage & TC_TRANSFER_MAP_UPLOAD_CPU_STORAGE)) {
/* We can't let resource_copy_region disable the CPU storage. */
assert(!(tres->b.flags & PIPE_RESOURCE_FLAG_DONT_MAP_DIRECTLY));
struct threaded_transfer *ttrans = slab_alloc(&tc->pool_transfers);
ttrans->b.resource = resource;
ttrans->b.level = 0;
ttrans->b.usage = usage;
ttrans->b.box = *box;
ttrans->b.stride = 0;
ttrans->b.layer_stride = 0;
ttrans->b.offset = 0;
ttrans->staging = NULL;
ttrans->valid_buffer_range = &tres->valid_buffer_range;
ttrans->cpu_storage_mapped = true;
*transfer = &ttrans->b;
return (uint8_t*)tres->cpu_storage + box->x;
}
/* Do a staging transfer within the threaded context. The driver should
* only get resource_copy_region.
*/
@@ -2169,6 +2225,7 @@ tc_buffer_map(struct pipe_context *_pipe,
ttrans->b.stride = 0;
ttrans->b.layer_stride = 0;
ttrans->valid_buffer_range = &tres->valid_buffer_range;
ttrans->cpu_storage_mapped = false;
*transfer = &ttrans->b;
p_atomic_inc(&tres->pending_staging_uploads);
@@ -2203,6 +2260,7 @@ tc_buffer_map(struct pipe_context *_pipe,
void *ret = pipe->buffer_map(pipe, tres->latest ? tres->latest : resource,
level, usage, box, transfer);
threaded_transfer(*transfer)->valid_buffer_range = &tres->valid_buffer_range;
threaded_transfer(*transfer)->cpu_storage_mapped = false;
if (!(usage & TC_TRANSFER_MAP_THREADED_UNSYNC))
tc_clear_driver_thread(tc);
@@ -2285,8 +2343,13 @@ tc_buffer_do_flush_region(struct threaded_context *tc,
ttrans->staging, 0, &src_box);
}
util_range_add(&tres->b, ttrans->valid_buffer_range,
box->x, box->x + box->width);
/* Don't update the valid range when we're uploading the CPU storage
* because it includes the uninitialized range too.
*/
if (!(ttrans->b.usage & TC_TRANSFER_MAP_UPLOAD_CPU_STORAGE)) {
util_range_add(&tres->b, ttrans->valid_buffer_range,
box->x, box->x + box->width);
}
}
static void
@@ -2373,12 +2436,37 @@ tc_buffer_unmap(struct pipe_context *_pipe, struct pipe_transfer *transfer)
return;
}
bool was_staging_transfer = false;
if (transfer->usage & PIPE_MAP_WRITE &&
!(transfer->usage & PIPE_MAP_FLUSH_EXPLICIT))
tc_buffer_do_flush_region(tc, ttrans, &transfer->box);
if (ttrans->cpu_storage_mapped) {
/* GL allows simultaneous GPU stores with mapped buffers as long as GPU stores don't
* touch the mapped range. That's a problem because GPU stores free the CPU storage.
* If that happens, we just ignore the unmap call and don't upload anything to prevent
* a crash.
*
* Disallow the CPU storage in the driver to work around this.
*/
assert(tres->cpu_storage);
if (tres->cpu_storage) {
tc_invalidate_buffer(tc, tres);
tc_buffer_subdata(&tc->base, &tres->b,
PIPE_MAP_UNSYNCHRONIZED |
TC_TRANSFER_MAP_UPLOAD_CPU_STORAGE,
0, tres->b.width0, tres->cpu_storage);
/* This shouldn't have been freed by buffer_subdata. */
assert(tres->cpu_storage);
}
tc_drop_resource_reference(ttrans->staging);
slab_free(&tc->pool_transfers, ttrans);
return;
}
bool was_staging_transfer = false;
if (ttrans->staging) {
was_staging_transfer = true;
@@ -2483,7 +2571,8 @@ tc_buffer_subdata(struct pipe_context *_pipe,
*/
if (usage & (PIPE_MAP_UNSYNCHRONIZED |
PIPE_MAP_DISCARD_WHOLE_RESOURCE) ||
size > TC_MAX_SUBDATA_BYTES) {
size > TC_MAX_SUBDATA_BYTES ||
tres->cpu_storage) {
struct pipe_transfer *transfer;
struct pipe_box box;
uint8_t *map = NULL;
@@ -3560,6 +3649,9 @@ tc_resource_copy_region(struct pipe_context *_pipe,
tc_add_call(tc, TC_CALL_resource_copy_region,
tc_resource_copy_region);
if (dst->target == PIPE_BUFFER)
tc_buffer_disable_cpu_storage(dst);
tc_set_resource_reference(&p->dst, dst);
p->dst_level = dst_level;
p->dstx = dstx;
@@ -3874,6 +3966,8 @@ tc_clear_buffer(struct pipe_context *_pipe, struct pipe_resource *res,
struct tc_clear_buffer *p =
tc_add_call(tc, TC_CALL_clear_buffer, tc_clear_buffer);
tc_buffer_disable_cpu_storage(res);
tc_set_resource_reference(&p->res, res);
tc_add_to_buffer_list(&tc->buffer_lists[tc->next_buf_list], res);
p->offset = offset;
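
For reference, the application pattern these map/unmap changes are meant to pipeline (see also the cpu_storage comment in the header below) is a small partial buffer update immediately followed by a draw that uses the buffer. An illustrative GL-level sketch, not part of the diff; the loader header and parameters are assumptions:

#include <epoxy/gl.h>  /* any GL loader works; assumed for the sketch */

/* Illustrative only. Before this change, a partial update of a busy buffer
 * went through a staging buffer plus a recorded resource_copy_region. With
 * the CPU storage, tc_buffer_map hands out the CPU copy, and tc_buffer_unmap
 * invalidates the buffer and re-uploads the whole copy, so the driver never
 * sees a buffer copy between draws. */
static void
update_and_draw(GLuint vbo, GLintptr offset, GLsizeiptr size,
                const void *data, GLsizei vertex_count)
{
   glBindBuffer(GL_ARRAY_BUFFER, vbo);
   glBufferSubData(GL_ARRAY_BUFFER, offset, size, data); /* partial update */
   glDrawArrays(GL_TRIANGLES, 0, vertex_count);          /* consumes the buffer */
}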


@@ -208,6 +208,8 @@ struct tc_unflushed_batch_token;
/* 0 = disabled, 1 = assertions, 2 = printfs, 3 = logging */
#define TC_DEBUG 0
/* This is an internal flag not sent to the driver. */
#define TC_TRANSFER_MAP_UPLOAD_CPU_STORAGE (1u << 28)
/* These are map flags sent to drivers. */
/* Never infer whether it's safe to use unsynchronized mappings: */
#define TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED (1u << 29)
@@ -313,6 +315,13 @@ struct threaded_resource {
*/
struct pipe_resource *latest;
/* Optional CPU storage of the buffer. When we get partial glBufferSubData(implemented by
* copy_buffer) + glDrawElements, we don't want to drain the gfx pipeline before executing
* the copy. For ideal pipelining, we upload to this CPU storage and then reallocate
* the GPU storage completely and reupload everything without copy_buffer.
*/
void *cpu_storage;
/* The buffer range which is initialized (with a write transfer, streamout,
* or writable shader resources). The remainder of the buffer is considered
* invalid and can be mapped unsynchronized.
@@ -360,6 +369,8 @@ struct threaded_transfer {
* the base instance. Initially it's set to &b.resource->valid_buffer_range.
*/
struct util_range *valid_buffer_range;
bool cpu_storage_mapped;
};
struct threaded_query {
@@ -503,7 +514,8 @@ struct threaded_context {
struct tc_buffer_list buffer_lists[TC_MAX_BUFFER_LISTS];
};
void threaded_resource_init(struct pipe_resource *res);
void threaded_resource_init(struct pipe_resource *res, bool allow_cpu_storage,
unsigned map_buffer_alignment);
void threaded_resource_deinit(struct pipe_resource *res);
struct pipe_context *threaded_context_unwrap_sync(struct pipe_context *pipe);
void tc_driver_internal_flush_notify(struct threaded_context *tc);
@@ -579,4 +591,21 @@ tc_assert_driver_thread(struct threaded_context *tc)
#endif
}
/**
* This is called before GPU stores to disable the CPU storage because
* the CPU storage doesn't mirror the GPU storage.
*
* Drivers should also call it before exporting a DMABUF of a buffer.
*/
static inline void
tc_buffer_disable_cpu_storage(struct pipe_resource *buf)
{
struct threaded_resource *tres = threaded_resource(buf);
if (tres->cpu_storage) {
align_free(tres->cpu_storage);
tres->cpu_storage = NULL;
}
}
#endif
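
As the comment above notes, drivers should also drop the CPU storage before exporting a buffer as a DMABUF, since external users access the GPU storage directly and the CPU copy no longer reliably mirrors it. A hedged sketch of where such a call might sit; the example_* names are placeholders, only tc_buffer_disable_cpu_storage() comes from this change:

/* Hypothetical pipe_screen::resource_get_handle implementation. */
static bool
example_resource_get_handle(struct pipe_screen *screen,
                            struct pipe_context *ctx,
                            struct pipe_resource *resource,
                            struct winsys_handle *whandle,
                            unsigned usage)
{
   /* Once the buffer is shared externally, the CPU copy can no longer be
    * assumed to match the GPU contents, so free it before exporting. */
   if (resource->target == PIPE_BUFFER)
      tc_buffer_disable_cpu_storage(resource);

   /* Placeholder for the driver's actual winsys export. */
   return example_export_dmabuf(screen, resource, whandle);
}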


@@ -356,7 +356,7 @@ crocus_alloc_resource(struct pipe_screen *pscreen,
res->base.b.screen = pscreen;
res->orig_screen = crocus_pscreen_ref(pscreen);
pipe_reference_init(&res->base.b.reference, 1);
threaded_resource_init(&res->base.b);
threaded_resource_init(&res->base.b, false, 0);
if (templ->target == PIPE_BUFFER)
util_range_init(&res->valid_buffer_range);


@@ -1109,7 +1109,7 @@ alloc_resource_struct(struct pipe_screen *pscreen,
pipe_reference_init(&rsc->track->reference, 1);
threaded_resource_init(prsc);
threaded_resource_init(prsc, false, 0);
if (tmpl->target == PIPE_BUFFER)
rsc->b.buffer_id_unique = util_idalloc_mt_alloc(&screen->buffer_ids);


@@ -479,7 +479,7 @@ iris_alloc_resource(struct pipe_screen *pscreen,
res->base.b.screen = pscreen;
res->orig_screen = iris_pscreen_ref(pscreen);
pipe_reference_init(&res->base.b.reference, 1);
threaded_resource_init(&res->base.b);
threaded_resource_init(&res->base.b, false, 0);
res->aux.possible_usages = 1 << ISL_AUX_USAGE_NONE;
res->aux.sampler_usages = 1 << ISL_AUX_USAGE_NONE;


@@ -584,7 +584,7 @@ r600_alloc_buffer_struct(struct pipe_screen *screen,
pipe_reference_init(&rbuffer->b.b.reference, 1);
rbuffer->b.b.screen = screen;
threaded_resource_init(&rbuffer->b.b);
threaded_resource_init(&rbuffer->b.b, false, 0);
rbuffer->buf = NULL;
rbuffer->bind_history = 0;


@@ -567,7 +567,7 @@ static struct si_resource *si_alloc_buffer_struct(struct pipe_screen *screen,
pipe_reference_init(&buf->b.b.reference, 1);
buf->b.b.screen = screen;
threaded_resource_init(&buf->b.b);
threaded_resource_init(&buf->b.b, false, 0);
buf->buf = NULL;
buf->bind_history = 0;


@@ -760,7 +760,7 @@ resource_create(struct pipe_screen *pscreen,
res->base.b = *templ;
threaded_resource_init(&res->base.b);
threaded_resource_init(&res->base.b, false, 0);
pipe_reference_init(&res->base.b.reference, 1);
res->base.b.screen = pscreen;