/*
 * Copyright © 2020 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/* Draw function marshalling for glthread.
 *
 * The purpose of these glDraw wrappers is to upload non-VBO vertex and
 * index data, so that glthread doesn't have to execute synchronously.
 */

#include "c99_alloca.h"

#include "main/glthread_marshal.h"
#include "main/dispatch.h"
#include "main/varray.h"

/* Return the size in bytes (1, 2 or 4) of one index of the given type.
 *
 * The result is only meaningful for GL_UNSIGNED_BYTE, GL_UNSIGNED_SHORT and
 * GL_UNSIGNED_INT; see is_index_type_valid().
 */
static inline unsigned
get_index_size(GLenum type)
{
   /* The three index enums are spaced two apart:
    *    GL_UNSIGNED_BYTE  + 0
    *    GL_UNSIGNED_SHORT + 2
    *    GL_UNSIGNED_INT   + 4
    *
    * Halving the distance from GL_UNSIGNED_BYTE therefore gives log2 of the
    * index size (0, 1, 2).
    */
   const unsigned log2_size = (type - GL_UNSIGNED_BYTE) >> 1;

   return 1 << log2_size;
}

/* Return true if "type" is GL_UNSIGNED_BYTE, GL_UNSIGNED_SHORT or
 * GL_UNSIGNED_INT.
 */
static inline bool
is_index_type_valid(GLenum type)
{
   /* GL_UNSIGNED_BYTE  = 0x1401
    * GL_UNSIGNED_SHORT = 0x1403
    * GL_UNSIGNED_INT   = 0x1405
    *
    * USHORT differs from UBYTE only in bit 1, and UINT only in bit 2.
    * Masking both bits away (~6) must therefore leave exactly UBYTE.
    * The upper-bound test rejects the value with both bits set, which
    * would be greater than UINT.
    */
   if (type > GL_UNSIGNED_INT)
      return false;

   return (type & ~6) == GL_UNSIGNED_BYTE;
}

/* Upload "count" user indices of "index_size" bytes each through
 * _mesa_glthread_upload().
 *
 * On return, *indices no longer holds the user pointer; it holds the offset
 * reported by the upload, cast to a pointer. The buffer object reported by
 * the upload is returned.
 */
static ALWAYS_INLINE struct gl_buffer_object *
upload_indices(struct gl_context *ctx, unsigned count, unsigned index_size,
               const GLvoid **indices)
{
   struct gl_buffer_object *buf = NULL;
   unsigned buf_offset = 0;

   assert(count);

   _mesa_glthread_upload(ctx, *indices, index_size * count,
                         &buf_offset, &buf, NULL);
   assert(buf);

   /* Replace the user pointer with the offset into the upload buffer. */
   *indices = (const GLvoid*)(intptr_t)buf_offset;
   return buf;
}

/* Upload the user index arrays of a multi draw into one allocation.
 *
 * A single region of index_size * total_count bytes is requested from
 * _mesa_glthread_upload(), and the indices of every draw with a non-zero
 * count are copied into it back to back. out_indices[i] receives the offset
 * of draw i (cast to a pointer); entries of draws with a zero count are not
 * written.
 *
 * Returns the buffer object reported by the upload.
 */
static ALWAYS_INLINE struct gl_buffer_object *
upload_multi_indices(struct gl_context *ctx, unsigned total_count,
                     unsigned index_size, unsigned draw_count,
                     const GLsizei *count, const GLvoid *const *indices,
                     const GLvoid **out_indices)
{
   struct gl_buffer_object *buf = NULL;
   unsigned buf_offset = 0;
   uint8_t *dst = NULL;

   assert(total_count);

   /* No source pointer: only reserve space and get a pointer to write to. */
   _mesa_glthread_upload(ctx, NULL, index_size * total_count,
                         &buf_offset, &buf, &dst);
   assert(buf);

   unsigned written = 0;

   for (unsigned draw = 0; draw < draw_count; draw++) {
      if (count[draw] != 0) {
         const unsigned num_bytes = count[draw] * index_size;

         memcpy(dst + written, indices[draw], num_bytes);
         out_indices[draw] = (const GLvoid*)(intptr_t)(buf_offset + written);
         written += num_bytes;
      }
   }

   return buf;
}

static ALWAYS_INLINE bool
upload_vertices(struct gl_context *ctx, unsigned user_buffer_mask,
                unsigned start_vertex, unsigned num_vertices,
                unsigned start_instance, unsigned num_instances,
                struct glthread_attrib_binding *buffers)
{
   struct glthread_vao *vao = ctx->GLThread.CurrentVAO;
   unsigned attrib_mask_iter = vao->Enabled;
   unsigned num_buffers = 0;

   assert((num_vertices || !(user_buffer_mask & ~vao->NonZeroDivisorMask)) &&
          (num_instances || !(user_buffer_mask & vao->NonZeroDivisorMask)));

   if (unlikely(vao->BufferInterleaved & user_buffer_mask)) {
      /* Slower upload path where some buffers reference multiple attribs,
       * so we have to use 2 while loops instead of 1.
       */
      unsigned start_offset[VERT_ATTRIB_MAX];
      unsigned end_offset[VERT_ATTRIB_MAX];
      uint32_t buffer_mask = 0;

      while (attrib_mask_iter) {
         unsigned i = u_bit_scan(&attrib_mask_iter);
         unsigned binding_index = vao->Attrib[i].BufferIndex;

         if (!(user_buffer_mask & (1 << binding_index)))
            continue;

         unsigned stride = vao->Attrib[binding_index].Stride;
         unsigned instance_div = vao->Attrib[binding_index].Divisor;
         unsigned element_size = vao->Attrib[i].ElementSize;
         unsigned offset = vao->Attrib[i].RelativeOffset;
         unsigned size;

         if (instance_div) {
            /* Per-instance attrib. */

            /* Figure out how many instances we'll render given instance_div.  We
             * can't use the typical div_round_up() pattern because the CTS uses
             * instance_div = ~0 for a test, which overflows div_round_up()'s
             * addition.
             */
            unsigned count = num_instances / instance_div;
            if (count * instance_div != num_instances)
               count++;

            offset += stride * start_instance;
            size = stride * (count - 1) + element_size;
         } else {
            /* Per-vertex attrib. */
            offset += stride * start_vertex;
            size = stride * (num_vertices - 1) + element_size;
         }

         unsigned binding_index_bit = 1u << binding_index;

         /* Update upload offsets. */
         if (!(buffer_mask & binding_index_bit)) {
            start_offset[binding_index] = offset;
            end_offset[binding_index] = offset + size;
         } else {
            if (offset < start_offset[binding_index])
               start_offset[binding_index] = offset;
            if (offset + size > end_offset[binding_index])
               end_offset[binding_index] = offset + size;
         }

         buffer_mask |= binding_index_bit;
      }

      /* Upload buffers. */
      while (buffer_mask) {
         struct gl_buffer_object *upload_buffer = NULL;
         unsigned upload_offset = 0;
         unsigned start, end;

         unsigned binding_index = u_bit_scan(&buffer_mask);

         start = start_offset[binding_index];
         end = end_offset[binding_index];
         assert(start < end);

         const void *ptr = vao->Attrib[binding_index].Pointer;
         _mesa_glthread_upload(ctx, (uint8_t*)ptr + start,
                               end - start, &upload_offset,
                               &upload_buffer, NULL);
         assert(upload_buffer);

         buffers[num_buffers].buffer = upload_buffer;
         buffers[num_buffers].offset = upload_offset - start;
         buffers[num_buffers].original_pointer = ptr;
         num_buffers++;
      }

      return true;
   }

   /* Faster path where all attribs are separate. */
   while (attrib_mask_iter) {
      unsigned i = u_bit_scan(&attrib_mask_iter);
      unsigned binding_index = vao->Attrib[i].BufferIndex;

      if (!(user_buffer_mask & (1 << binding_index)))
         continue;

      struct gl_buffer_object *upload_buffer = NULL;
      unsigned upload_offset = 0;
      unsigned stride = vao->Attrib[binding_index].Stride;
      unsigned instance_div = vao->Attrib[binding_index].Divisor;
      unsigned element_size = vao->Attrib[i].ElementSize;
      unsigned offset = vao->Attrib[i].RelativeOffset;
      unsigned size;

      if (instance_div) {
         /* Per-instance attrib. */

         /* Figure out how many instances we'll render given instance_div.  We
          * can't use the typical div_round_up() pattern because the CTS uses
          * instance_div = ~0 for a test, which overflows div_round_up()'s
          * addition.
          */
         unsigned count = num_instances / instance_div;
         if (count * instance_div != num_instances)
            count++;

         offset += stride * start_instance;
         size = stride * (count - 1) + element_size;
      } else {
         /* Per-vertex attrib. */
         offset += stride * start_vertex;
         size = stride * (num_vertices - 1) + element_size;
      }

      const void *ptr = vao->Attrib[binding_index].Pointer;
      _mesa_glthread_upload(ctx, (uint8_t*)ptr + offset,
                            size, &upload_offset, &upload_buffer, NULL);
      assert(upload_buffer);

      buffers[num_buffers].buffer = upload_buffer;
      buffers[num_buffers].offset = upload_offset - offset;
      buffers[num_buffers].original_pointer = ptr;
      num_buffers++;
   }

   return true;
}

/* Generic DrawArrays structure NOT supporting user buffers. Ignore the name.
 *
 * Filled by draw_arrays_async() and replayed by _mesa_unmarshal_DrawArrays()
 * as a DrawArraysInstancedBaseInstance call with these parameters.
 */
struct marshal_cmd_DrawArrays
{
   struct marshal_cmd_base cmd_base; /* common header; holds cmd_size */
   GLenum mode;
   GLint first;
   GLsizei count;
   GLsizei instance_count;
   GLuint baseinstance;
};

/* Replay a queued marshal_cmd_DrawArrays on the server dispatch table.
 *
 * Returns the command size stored in the command header. "last" is unused.
 */
uint32_t
_mesa_unmarshal_DrawArrays(struct gl_context *ctx,
                           const struct marshal_cmd_DrawArrays *cmd,
                           const uint64_t *last)
{
   /* Ignore the function name. We use DISPATCH_CMD_DrawArrays
    * for all DrawArrays variants without user buffers, and
    * DISPATCH_CMD_DrawArraysInstancedBaseInstance for all DrawArrays
    * variants with user buffers.
    */
   CALL_DrawArraysInstancedBaseInstance(ctx->CurrentServerDispatch,
                                        (cmd->mode, cmd->first, cmd->count,
                                         cmd->instance_count,
                                         cmd->baseinstance));

   return cmd->cmd_base.cmd_size;
}

/* Queue a DrawArrays-style draw that needs no user-buffer uploads. */
static ALWAYS_INLINE void
draw_arrays_async(struct gl_context *ctx, GLenum mode, GLint first,
                  GLsizei count, GLsizei instance_count, GLuint baseinstance)
{
   struct marshal_cmd_DrawArrays *queued;
   int queued_size = sizeof(struct marshal_cmd_DrawArrays);

   queued = _mesa_glthread_allocate_command(ctx, DISPATCH_CMD_DrawArrays,
                                            queued_size);

   /* Record the draw parameters for _mesa_unmarshal_DrawArrays(). */
   queued->mode = mode;
   queued->first = first;
   queued->count = count;
   queued->instance_count = instance_count;
   queued->baseinstance = baseinstance;
}

/* Generic DrawArrays structure supporting user buffers. Ignore the name.
 *
 * Filled by draw_arrays_async_user(). When user_buffer_mask is non-zero, the
 * structure is followed in the command stream by
 * util_bitcount(user_buffer_mask) glthread_attrib_binding entries.
 */
struct marshal_cmd_DrawArraysInstancedBaseInstance
{
   struct marshal_cmd_base cmd_base; /* common header; holds cmd_size */
   GLenum mode;
   GLint first;
   GLsizei count;
   GLsizei instance_count;
   GLuint baseinstance;
   GLuint user_buffer_mask; /* buffer bindings that were uploaded */
};

/* Replay a queued DrawArrays-style draw that may carry uploaded user buffers.
 *
 * The bindings stored right after the command are passed to
 * _mesa_InternalBindVertexBuffers() before the draw and once more, with the
 * last argument set to true, after it.
 *
 * Returns the command size stored in the command header. "last" is unused.
 */
uint32_t
_mesa_unmarshal_DrawArraysInstancedBaseInstance(struct gl_context *ctx,
                                                const struct marshal_cmd_DrawArraysInstancedBaseInstance *cmd,
                                                const uint64_t *last)
{
   /* Ignore the function name. We use DISPATCH_CMD_DrawArrays
    * for all DrawArrays variants without user buffers, and
    * DISPATCH_CMD_DrawArraysInstancedBaseInstance for all DrawArrays
    * variants with user buffers.
    */
   const GLuint mask = cmd->user_buffer_mask;
   const struct glthread_attrib_binding *bindings =
      (const struct glthread_attrib_binding *)(cmd + 1);
   const bool has_user_buffers = mask != 0;

   /* Bind uploaded buffers if needed. */
   if (has_user_buffers)
      _mesa_InternalBindVertexBuffers(ctx, bindings, mask, false);

   CALL_DrawArraysInstancedBaseInstance(ctx->CurrentServerDispatch,
                                        (cmd->mode, cmd->first, cmd->count,
                                         cmd->instance_count,
                                         cmd->baseinstance));

   /* Restore states. */
   if (has_user_buffers)
      _mesa_InternalBindVertexBuffers(ctx, bindings, mask, true);

   return cmd->cmd_base.cmd_size;
}

/* Queue a DrawArrays-style draw together with its uploaded user buffers.
 *
 * "buffers" must hold one entry per bit set in user_buffer_mask; the entries
 * are copied into the command stream right after the fixed-size command.
 */
static ALWAYS_INLINE void
draw_arrays_async_user(struct gl_context *ctx, GLenum mode, GLint first,
                       GLsizei count, GLsizei instance_count, GLuint baseinstance,
                       unsigned user_buffer_mask,
                       const struct glthread_attrib_binding *buffers)
{
   int bindings_bytes = util_bitcount(user_buffer_mask) * sizeof(buffers[0]);
   int total_bytes =
      sizeof(struct marshal_cmd_DrawArraysInstancedBaseInstance) +
      bindings_bytes;
   struct marshal_cmd_DrawArraysInstancedBaseInstance *queued =
      _mesa_glthread_allocate_command(ctx, DISPATCH_CMD_DrawArraysInstancedBaseInstance,
                                      total_bytes);

   queued->user_buffer_mask = user_buffer_mask;
   queued->mode = mode;
   queued->first = first;
   queued->count = count;
   queued->instance_count = instance_count;
   queued->baseinstance = baseinstance;

   /* Variable-size tail: the bindings of the uploaded buffers. */
   if (user_buffer_mask)
      memcpy(queued + 1, buffers, bindings_bytes);
}

/* Common implementation of the DrawArrays variants for glthread.
 *
 * Depending on the state, the draw is either queued as is, queued together
 * with uploaded user vertex buffers, or executed synchronously after waiting
 * for the queue.
 *
 * \param compiled_into_dlist  when true and a display list is being
 *                             compiled (GLThread.ListMode), plain DrawArrays
 *                             is called synchronously instead
 */
static ALWAYS_INLINE void
draw_arrays(GLenum mode, GLint first, GLsizei count, GLsizei instance_count,
            GLuint baseinstance, bool compiled_into_dlist)
{
   GET_CURRENT_CONTEXT(ctx);

   struct glthread_vao *vao = ctx->GLThread.CurrentVAO;
   unsigned user_buffer_mask = vao->UserPointerMask & vao->BufferEnabled;

   if (compiled_into_dlist && ctx->GLThread.ListMode) {
      _mesa_glthread_finish_before(ctx, "DrawArrays");
      /* Use the function that's compiled into a display list. */
      CALL_DrawArrays(ctx->CurrentServerDispatch, (mode, first, count));
      return;
   }

   /* Fast path when nothing needs to be done.
    *
    * This is also an error path. Zero counts should still call the driver
    * for possible GL errors.
    */
   const bool nothing_to_upload =
      ctx->API == API_OPENGL_CORE || !user_buffer_mask ||
      count <= 0 || instance_count <= 0;

   if (nothing_to_upload) {
      draw_arrays_async(ctx, mode, first, count, instance_count, baseinstance);
      return;
   }

   /* Upload and draw. The upload is only attempted when supported. */
   struct glthread_attrib_binding buffers[VERT_ATTRIB_MAX];
   const bool uploaded =
      ctx->GLThread.SupportsNonVBOUploads &&
      upload_vertices(ctx, user_buffer_mask, first, count, baseinstance,
                      instance_count, buffers);

   if (uploaded) {
      draw_arrays_async_user(ctx, mode, first, count, instance_count,
                             baseinstance, user_buffer_mask, buffers);
      return;
   }

   /* Synchronous fallback. */
   _mesa_glthread_finish_before(ctx, "DrawArrays");
   CALL_DrawArraysInstancedBaseInstance(ctx->CurrentServerDispatch,
                                        (mode, first, count, instance_count,
                                         baseinstance));
}

/* Queued MultiDrawArrays command, filled by multi_draw_arrays_async().
 *
 * The structure is followed in the command stream by:
 *    GLint   first[draw_count]
 *    GLsizei count[draw_count]
 *    util_bitcount(user_buffer_mask) glthread_attrib_binding entries
 */
struct marshal_cmd_MultiDrawArrays
{
   struct marshal_cmd_base cmd_base; /* common header; holds cmd_size */
   GLenum mode;
   GLsizei draw_count;
   GLuint user_buffer_mask; /* buffer bindings that were uploaded */
};

/* Replay a queued MultiDrawArrays command.
 *
 * The "first" array, the "count" array and the uploaded buffer bindings are
 * read from the variable-size data stored right after the command.
 *
 * Returns the command size stored in the command header. "last" is unused.
 */
uint32_t
_mesa_unmarshal_MultiDrawArrays(struct gl_context *ctx,
                                const struct marshal_cmd_MultiDrawArrays *cmd,
                                const uint64_t *last)
{
   const GLsizei num_draws = cmd->draw_count;
   const GLuint mask = cmd->user_buffer_mask;

   /* Layout after the command: first[], count[], bindings[]. */
   const GLint *first = (const GLint *)(cmd + 1);
   const GLsizei *count = (const GLsizei *)(first + num_draws);
   const struct glthread_attrib_binding *bindings =
      (const struct glthread_attrib_binding *)(count + num_draws);

   /* Bind uploaded buffers if needed. */
   if (mask)
      _mesa_InternalBindVertexBuffers(ctx, bindings, mask, false);

   CALL_MultiDrawArrays(ctx->CurrentServerDispatch,
                        (cmd->mode, first, count, num_draws));

   /* Restore states. */
   if (mask)
      _mesa_InternalBindVertexBuffers(ctx, bindings, mask, true);

   return cmd->cmd_base.cmd_size;
}

/* Queue a MultiDrawArrays command.
 *
 * The "first" and "count" arrays (draw_count elements each) are copied into
 * the command stream, followed by one binding per bit set in
 * user_buffer_mask. "buffers" is not read when user_buffer_mask is 0.
 */
static ALWAYS_INLINE void
multi_draw_arrays_async(struct gl_context *ctx, GLenum mode,
                        const GLint *first, const GLsizei *count,
                        GLsizei draw_count, unsigned user_buffer_mask,
                        const struct glthread_attrib_binding *buffers)
{
   int first_bytes = sizeof(GLint) * draw_count;
   int count_bytes = sizeof(GLsizei) * draw_count;
   int bindings_bytes = util_bitcount(user_buffer_mask) * sizeof(buffers[0]);
   int total_bytes = sizeof(struct marshal_cmd_MultiDrawArrays) +
                     first_bytes + count_bytes + bindings_bytes;
   struct marshal_cmd_MultiDrawArrays *queued =
      _mesa_glthread_allocate_command(ctx, DISPATCH_CMD_MultiDrawArrays,
                                      total_bytes);

   queued->mode = mode;
   queued->draw_count = draw_count;
   queued->user_buffer_mask = user_buffer_mask;

   /* Variable-size tail: first[], count[], then the bindings. */
   char *tail = (char*)(queued + 1);

   memcpy(tail, first, first_bytes);
   memcpy(tail + first_bytes, count, count_bytes);

   if (user_buffer_mask)
      memcpy(tail + first_bytes + count_bytes, buffers, bindings_bytes);
}

/* glthread entry point for glMultiDrawArrays.
 *
 * The draw is queued when possible; user vertex arrays are uploaded first
 * when needed. The call is executed synchronously when a display list is
 * being compiled, when draw_count is negative or too large for a queued
 * command, or when user-memory uploads are required but unsupported.
 */
void GLAPIENTRY
_mesa_marshal_MultiDrawArrays(GLenum mode, const GLint *first,
                              const GLsizei *count, GLsizei draw_count)
{
   GET_CURRENT_CONTEXT(ctx);

   struct glthread_vao *vao = ctx->GLThread.CurrentVAO;
   unsigned user_buffer_mask = vao->UserPointerMask & vao->BufferEnabled;

   /* If the draw count is too high or negative, the queue can't be used.
    *
    * This must be checked before ANY command is queued (including the
    * no-upload path below), because the first/count arrays are copied into
    * the command and its size grows with draw_count.
    */
   if (ctx->GLThread.ListMode ||
       draw_count < 0 || draw_count > MARSHAL_MAX_CMD_SIZE / 16)
      goto sync;

   /* Fast path when no user vertex arrays have to be uploaded. */
   if (ctx->API == API_OPENGL_CORE || !user_buffer_mask) {
      multi_draw_arrays_async(ctx, mode, first, count, draw_count, 0, NULL);
      return;
   }

   if (!ctx->GLThread.SupportsNonVBOUploads)
      goto sync;

   /* Compute the vertex range covered by all draws together. */
   unsigned min_index = ~0;
   unsigned max_index_exclusive = 0;

   for (unsigned i = 0; i < (unsigned)draw_count; i++) {
      GLsizei vertex_count = count[i];

      if (vertex_count < 0) {
         /* Just call the driver to set the error. */
         multi_draw_arrays_async(ctx, mode, first, count, draw_count, 0, NULL);
         return;
      }
      if (vertex_count == 0)
         continue;

      /* Do the arithmetic in unsigned so that first + count can't overflow
       * a signed int (which would be undefined behavior).
       */
      unsigned draw_start = (unsigned)first[i];
      unsigned draw_end = draw_start + (unsigned)vertex_count;

      min_index = MIN2(min_index, draw_start);
      max_index_exclusive = MAX2(max_index_exclusive, draw_end);
   }

   unsigned num_vertices = max_index_exclusive - min_index;
   if (num_vertices == 0) {
      /* Nothing to do, but call the driver to set possible GL errors. */
      multi_draw_arrays_async(ctx, mode, first, count, draw_count, 0, NULL);
      return;
   }

   /* Upload and draw. */
   struct glthread_attrib_binding buffers[VERT_ATTRIB_MAX];
   if (!upload_vertices(ctx, user_buffer_mask, min_index, num_vertices,
                        0, 1, buffers))
      goto sync;

   multi_draw_arrays_async(ctx, mode, first, count, draw_count,
                           user_buffer_mask, buffers);
   return;

sync:
   _mesa_glthread_finish_before(ctx, "MultiDrawArrays");
   CALL_MultiDrawArrays(ctx->CurrentServerDispatch,
                        (mode, first, count, draw_count));
}

/* DrawElementsInstancedBaseVertexBaseInstance not supporting user buffers.
 * Ignore the name.
 *
 * Filled by draw_elements_async() and replayed by
 * _mesa_unmarshal_DrawElementsInstancedARB().
 */
struct marshal_cmd_DrawElementsInstancedARB
{
   struct marshal_cmd_base cmd_base; /* common header; holds cmd_size */
   GLenum mode;
   GLenum type;
   GLsizei count;
   GLsizei instance_count;
   GLint basevertex;
   GLuint baseinstance;
   const GLvoid *indices; /* passed through to the draw call unchanged */
};

/* Replay a queued marshal_cmd_DrawElementsInstancedARB on the server
 * dispatch table.
 *
 * Returns the command size stored in the command header. "last" is unused.
 */
uint32_t
_mesa_unmarshal_DrawElementsInstancedARB(struct gl_context *ctx,
                                         const struct marshal_cmd_DrawElementsInstancedARB *cmd,
                                         const uint64_t *last)
{
   /* Ignore the function name. We use DISPATCH_CMD_DrawElementsInstanced-
    * BaseVertexBaseInstance for all DrawElements variants with user buffers,
    * and both DISPATCH_CMD_DrawElementsInstancedARB and DISPATCH_CMD_Draw-
    * RangeElementsBaseVertex for all draw elements variants without user
    * buffers.
    */
   CALL_DrawElementsInstancedBaseVertexBaseInstance(ctx->CurrentServerDispatch,
                                                    (cmd->mode, cmd->count,
                                                     cmd->type, cmd->indices,
                                                     cmd->instance_count,
                                                     cmd->basevertex,
                                                     cmd->baseinstance));

   return cmd->cmd_base.cmd_size;
}

/* Queued DrawRangeElementsBaseVertex command without user buffers.
 *
 * Filled by draw_elements_async() and replayed by
 * _mesa_unmarshal_DrawRangeElementsBaseVertex().
 */
struct marshal_cmd_DrawRangeElementsBaseVertex
{
   struct marshal_cmd_base cmd_base; /* common header; holds cmd_size */
   GLenum mode;
   GLenum type;
   GLsizei count;
   GLint basevertex;
   GLuint min_index;
   GLuint max_index;
   const GLvoid *indices; /* passed through to the draw call unchanged */
};

/* Replay a queued marshal_cmd_DrawRangeElementsBaseVertex on the server
 * dispatch table.
 *
 * Returns the command size stored in the command header. "last" is unused.
 */
uint32_t
_mesa_unmarshal_DrawRangeElementsBaseVertex(struct gl_context *ctx,
                                            const struct marshal_cmd_DrawRangeElementsBaseVertex *cmd,
                                            const uint64_t *last)
{
   CALL_DrawRangeElementsBaseVertex(ctx->CurrentServerDispatch,
                                    (cmd->mode, cmd->min_index,
                                     cmd->max_index, cmd->count,
                                     cmd->type, cmd->indices,
                                     cmd->basevertex));

   return cmd->cmd_base.cmd_size;
}

/* Queue a DrawElements-style draw that needs no uploads.
 *
 * One of two commands is queued:
 *  - DrawRangeElementsBaseVertex when the index bounds are valid AND the
 *    draw is not instanced (instance_count == 1, baseinstance == 0). That
 *    command has no fields for instance_count and baseinstance, so it must
 *    not be used for any other draw. This is the same condition that
 *    _mesa_unmarshal_DrawElementsInstancedBaseVertexBaseInstance() and the
 *    synchronous path of draw_elements() use.
 *  - DrawElementsInstancedBaseVertexBaseInstance otherwise; min_index and
 *    max_index are not recorded in that case.
 */
static ALWAYS_INLINE void
draw_elements_async(struct gl_context *ctx, GLenum mode, GLsizei count,
                    GLenum type, const GLvoid *indices, GLsizei instance_count,
                    GLint basevertex, GLuint baseinstance,
                    bool index_bounds_valid, GLuint min_index, GLuint max_index)
{
   if (index_bounds_valid && instance_count == 1 && baseinstance == 0) {
      int cmd_size = sizeof(struct marshal_cmd_DrawRangeElementsBaseVertex);
      struct marshal_cmd_DrawRangeElementsBaseVertex *cmd =
         _mesa_glthread_allocate_command(ctx, DISPATCH_CMD_DrawRangeElementsBaseVertex, cmd_size);

      cmd->mode = mode;
      cmd->count = count;
      cmd->type = type;
      cmd->indices = indices;
      cmd->basevertex = basevertex;
      cmd->min_index = min_index;
      cmd->max_index = max_index;
   } else {
      int cmd_size = sizeof(struct marshal_cmd_DrawElementsInstancedARB);
      struct marshal_cmd_DrawElementsInstancedARB *cmd =
         _mesa_glthread_allocate_command(ctx, DISPATCH_CMD_DrawElementsInstancedARB, cmd_size);

      cmd->mode = mode;
      cmd->count = count;
      cmd->type = type;
      cmd->indices = indices;
      cmd->instance_count = instance_count;
      cmd->basevertex = basevertex;
      cmd->baseinstance = baseinstance;
   }
}

/* Generic DrawElements structure supporting user buffers and user indices.
 *
 * Filled by draw_elements_async_user(). When user_buffer_mask is non-zero,
 * the structure is followed in the command stream by
 * util_bitcount(user_buffer_mask) glthread_attrib_binding entries.
 */
struct marshal_cmd_DrawElementsInstancedBaseVertexBaseInstance
{
   struct marshal_cmd_base cmd_base; /* common header; holds cmd_size */
   bool index_bounds_valid; /* whether min_index/max_index may be used */
   GLenum mode;
   GLenum type;
   GLsizei count;
   GLsizei instance_count;
   GLint basevertex;
   GLuint baseinstance;
   GLuint min_index;
   GLuint max_index;
   GLuint user_buffer_mask; /* buffer bindings that were uploaded */
   const GLvoid *indices;
   /* If non-NULL, bound as the element buffer for the duration of the draw
    * by the unmarshal function.
    */
   struct gl_buffer_object *index_buffer;
};

/* Replay a queued DrawElements-style draw that may carry uploaded vertex
 * buffers and/or an uploaded index buffer.
 *
 * Uploaded buffers are bound before the draw; afterwards the element buffer
 * binding is reset to NULL and the vertex buffer bindings are passed to
 * _mesa_InternalBindVertexBuffers() again with the last argument set to true.
 *
 * Returns the command size stored in the command header. "last" is unused.
 */
uint32_t
_mesa_unmarshal_DrawElementsInstancedBaseVertexBaseInstance(struct gl_context *ctx,
                                                            const struct marshal_cmd_DrawElementsInstancedBaseVertexBaseInstance *cmd,
                                                            const uint64_t *last)
{
   /* Ignore the function name. We use DISPATCH_CMD_DrawElementsInstanced-
    * BaseVertexBaseInstance for all DrawElements variants with user buffers,
    * and both DISPATCH_CMD_DrawElementsInstancedARB and DISPATCH_CMD_Draw-
    * RangeElementsBaseVertex for all draw elements variants without user
    * buffers.
    */
   const GLuint mask = cmd->user_buffer_mask;
   struct gl_buffer_object *uploaded_indices = cmd->index_buffer;
   const struct glthread_attrib_binding *bindings =
      (const struct glthread_attrib_binding *)(cmd + 1);

   /* The range variant can only express non-instanced draws. */
   const bool use_range_draw = cmd->index_bounds_valid &&
                               cmd->instance_count == 1 &&
                               cmd->baseinstance == 0;

   /* Bind uploaded buffers if needed. */
   if (mask)
      _mesa_InternalBindVertexBuffers(ctx, bindings, mask, false);
   if (uploaded_indices)
      _mesa_InternalBindElementBuffer(ctx, uploaded_indices);

   /* Draw. */
   if (use_range_draw) {
      CALL_DrawRangeElementsBaseVertex(ctx->CurrentServerDispatch,
                                       (cmd->mode, cmd->min_index,
                                        cmd->max_index, cmd->count,
                                        cmd->type, cmd->indices,
                                        cmd->basevertex));
   } else {
      CALL_DrawElementsInstancedBaseVertexBaseInstance(ctx->CurrentServerDispatch,
                                                       (cmd->mode, cmd->count,
                                                        cmd->type, cmd->indices,
                                                        cmd->instance_count,
                                                        cmd->basevertex,
                                                        cmd->baseinstance));
   }

   /* Restore states. */
   if (uploaded_indices)
      _mesa_InternalBindElementBuffer(ctx, NULL);
   if (mask)
      _mesa_InternalBindVertexBuffers(ctx, bindings, mask, true);

   return cmd->cmd_base.cmd_size;
}

/* Queue a DrawElements-style draw together with its uploaded buffers.
 *
 * "buffers" must hold one entry per bit set in user_buffer_mask; the entries
 * are copied into the command stream right after the fixed-size command.
 * "index_buffer" is stored as is and may be NULL.
 */
static ALWAYS_INLINE void
draw_elements_async_user(struct gl_context *ctx, GLenum mode, GLsizei count,
                         GLenum type, const GLvoid *indices, GLsizei instance_count,
                         GLint basevertex, GLuint baseinstance,
                         bool index_bounds_valid, GLuint min_index, GLuint max_index,
                         struct gl_buffer_object *index_buffer,
                         unsigned user_buffer_mask,
                         const struct glthread_attrib_binding *buffers)
{
   int bindings_bytes = util_bitcount(user_buffer_mask) * sizeof(buffers[0]);
   int total_bytes =
      sizeof(struct marshal_cmd_DrawElementsInstancedBaseVertexBaseInstance) +
      bindings_bytes;
   struct marshal_cmd_DrawElementsInstancedBaseVertexBaseInstance *queued =
      _mesa_glthread_allocate_command(ctx, DISPATCH_CMD_DrawElementsInstancedBaseVertexBaseInstance, total_bytes);

   /* Draw parameters. */
   queued->mode = mode;
   queued->type = type;
   queued->count = count;
   queued->instance_count = instance_count;
   queued->basevertex = basevertex;
   queued->baseinstance = baseinstance;
   queued->indices = indices;

   /* Index bounds. */
   queued->index_bounds_valid = index_bounds_valid;
   queued->min_index = min_index;
   queued->max_index = max_index;

   /* Uploaded buffers. */
   queued->index_buffer = index_buffer;
   queued->user_buffer_mask = user_buffer_mask;

   if (user_buffer_mask)
      memcpy(queued + 1, buffers, bindings_bytes);
}

/* Common implementation of the DrawElements variants for glthread.
 *
 * Depending on the state, the draw is either queued as is, queued together
 * with uploaded user vertex buffers and/or user indices, or executed
 * synchronously after waiting for the queue (the "sync" label).
 *
 * \param index_bounds_valid   whether min_index/max_index describe the range
 *                             of indices used by the draw; may be set to
 *                             true here after computing the bounds
 * \param compiled_into_dlist  when true and a display list is being
 *                             compiled (GLThread.ListMode), only the
 *                             non-instanced entry points are called,
 *                             synchronously
 */
static void
draw_elements(GLenum mode, GLsizei count, GLenum type, const GLvoid *indices,
              GLsizei instance_count, GLint basevertex, GLuint baseinstance,
              bool index_bounds_valid, GLuint min_index, GLuint max_index,
              bool compiled_into_dlist)
{
   GET_CURRENT_CONTEXT(ctx);

   struct glthread_vao *vao = ctx->GLThread.CurrentVAO;
   unsigned user_buffer_mask = vao->UserPointerMask & vao->BufferEnabled;
   /* No element buffer is bound, so "indices" is treated as user memory. */
   bool has_user_indices = vao->CurrentElementBufferName == 0;

   if (compiled_into_dlist && ctx->GLThread.ListMode)
      goto sync;

   /* Fast path when nothing needs to be done.
    *
    * This is also an error path. Zero counts should still call the driver
    * for possible GL errors.
    */
   if (ctx->API == API_OPENGL_CORE ||
       count <= 0 || instance_count <= 0 || max_index < min_index ||
       !is_index_type_valid(type) ||
       (!user_buffer_mask && !has_user_indices)) {
      draw_elements_async(ctx, mode, count, type, indices, instance_count,
                          basevertex, baseinstance, index_bounds_valid,
                          min_index, max_index);
      return;
   }

   if (!ctx->GLThread.SupportsNonVBOUploads)
      goto sync;

   /* Index bounds are only needed to size uploads of per-vertex (zero
    * divisor) user buffers.
    */
   bool need_index_bounds = user_buffer_mask & ~vao->NonZeroDivisorMask;
   unsigned index_size = get_index_size(type);

   if (need_index_bounds && !index_bounds_valid) {
      /* Sync if indices come from a buffer and vertices come from memory
       * and index bounds are not valid.
       *
       * We would have to map the indices to compute the index bounds, and
       * for that we would have to sync anyway.
       */
      if (!has_user_indices)
         goto sync;

      /* Compute the index bounds. */
      min_index = ~0;
      max_index = 0;
      vbo_get_minmax_index_mapped(count, index_size,
                                  ctx->GLThread._RestartIndex[index_size - 1],
                                  ctx->GLThread._PrimitiveRestart, indices,
                                  &min_index, &max_index);
      index_bounds_valid = true;
   }

   /* Vertex range to upload; basevertex shifts the first vertex read. */
   unsigned start_vertex = min_index + basevertex;
   unsigned num_vertices = max_index + 1 - min_index;

   /* If there is too much data to upload, sync and let the driver unroll
    * indices. */
   if (util_is_vbo_upload_ratio_too_large(count, num_vertices))
      goto sync;

   /* Upload user vertex buffers, if any. */
   struct glthread_attrib_binding buffers[VERT_ATTRIB_MAX];
   if (user_buffer_mask &&
       !upload_vertices(ctx, user_buffer_mask, start_vertex, num_vertices,
                        baseinstance, instance_count, buffers))
      goto sync;

   /* Upload indices. */
   struct gl_buffer_object *index_buffer = NULL;
   if (has_user_indices)
      index_buffer = upload_indices(ctx, count, index_size, &indices);

   /* Draw asynchronously. */
   draw_elements_async_user(ctx, mode, count, type, indices, instance_count,
                            basevertex, baseinstance, index_bounds_valid,
                            min_index, max_index, index_buffer,
                            user_buffer_mask, buffers);
   return;

sync:
   /* Synchronous fallback: wait for the queue, then call directly. Note that
    * min_index, max_index and index_bounds_valid may have been updated above
    * before jumping here.
    */
   _mesa_glthread_finish_before(ctx, "DrawElements");

   if (compiled_into_dlist && ctx->GLThread.ListMode) {
      /* Only use the ones that are compiled into display lists. */
      if (basevertex) {
         CALL_DrawElementsBaseVertex(ctx->CurrentServerDispatch,
                                     (mode, count, type, indices, basevertex));
      } else if (index_bounds_valid) {
         CALL_DrawRangeElements(ctx->CurrentServerDispatch,
                                (mode, min_index, max_index, count, type, indices));
      } else {
         CALL_DrawElements(ctx->CurrentServerDispatch, (mode, count, type, indices));
      }
   } else if (index_bounds_valid && instance_count == 1 && baseinstance == 0) {
      /* The range variant can only express non-instanced draws. */
      CALL_DrawRangeElementsBaseVertex(ctx->CurrentServerDispatch,
                                       (mode, min_index, max_index, count,
                                        type, indices, basevertex));
   } else {
      CALL_DrawElementsInstancedBaseVertexBaseInstance(ctx->CurrentServerDispatch,
                                                       (mode, count, type, indices,
                                                        instance_count, basevertex,
                                                        baseinstance));
   }
}

/* Fixed-size header of the queued MultiDrawElements[BaseVertex] command.
 *
 * The header is followed by variable-length data, packed in this order:
 *    GLsizei count[draw_count];
 *    const GLvoid *indices[draw_count];
 *    GLsizei basevertex[draw_count];     (present only if has_base_vertex)
 *    struct glthread_attrib_binding buffers[util_bitcount(user_buffer_mask)];
 */
struct marshal_cmd_MultiDrawElementsBaseVertex
{
   struct marshal_cmd_base cmd_base;
   bool has_base_vertex;   /* true if a basevertex array follows the indices */
   GLenum mode;
   GLenum type;
   GLsizei draw_count;     /* number of elements in each per-draw array */
   GLuint user_buffer_mask; /* one bit per entry in the trailing buffers array */
   struct gl_buffer_object *index_buffer; /* bound as the element buffer around
                                           * the draw when non-NULL */
};

uint32_t
_mesa_unmarshal_MultiDrawElementsBaseVertex(struct gl_context *ctx,
                                            const struct marshal_cmd_MultiDrawElementsBaseVertex *cmd,
                                            const uint64_t *last)
{
   const GLenum mode = cmd->mode;
   const GLenum type = cmd->type;
   const GLsizei draw_count = cmd->draw_count;
   const GLuint user_buffer_mask = cmd->user_buffer_mask;
   struct gl_buffer_object *index_buffer = cmd->index_buffer;
   const bool has_base_vertex = cmd->has_base_vertex;

   const char *variable_data = (const char *)(cmd + 1);
   const GLsizei *count = (GLsizei *)variable_data;
   variable_data += sizeof(GLsizei) * draw_count;
   const GLvoid *const *indices = (const GLvoid *const *)variable_data;
   variable_data += sizeof(const GLvoid *const *) * draw_count;
   const GLsizei *basevertex = NULL;
   if (has_base_vertex) {
      basevertex = (GLsizei *)variable_data;
      variable_data += sizeof(GLsizei) * draw_count;
   }
   const struct glthread_attrib_binding *buffers =
      (const struct glthread_attrib_binding *)variable_data;

   /* Bind uploaded buffers if needed. */
   if (user_buffer_mask) {
      _mesa_InternalBindVertexBuffers(ctx, buffers, user_buffer_mask,
                                      false);
   }
   if (index_buffer) {
      _mesa_InternalBindElementBuffer(ctx, index_buffer);
   }

   /* Draw. */
   if (has_base_vertex) {
      CALL_MultiDrawElementsBaseVertex(ctx->CurrentServerDispatch,
                                       (mode, count, type, indices, draw_count,
                                        basevertex));
   } else {
      CALL_MultiDrawElementsEXT(ctx->CurrentServerDispatch,
                                (mode, count, type, indices, draw_count));
   }

   /* Restore states. */
   if (index_buffer) {
      _mesa_InternalBindElementBuffer(ctx, NULL);
   }
   if (user_buffer_mask) {
      _mesa_InternalBindVertexBuffers(ctx, buffers, user_buffer_mask,
                                      true);
   }
   return cmd->cmd_base.cmd_size;
}

/* Queue a MultiDrawElements[BaseVertex] command for asynchronous execution.
 *
 * The count and indices arrays (and basevertex, when non-NULL) are copied
 * into the command, followed by one glthread_attrib_binding per bit set in
 * user_buffer_mask. index_buffer is stored in the header as is.
 */
static ALWAYS_INLINE void
multi_draw_elements_async(struct gl_context *ctx, GLenum mode,
                          const GLsizei *count, GLenum type,
                          const GLvoid *const *indices, GLsizei draw_count,
                          const GLsizei *basevertex,
                          struct gl_buffer_object *index_buffer,
                          unsigned user_buffer_mask,
                          const struct glthread_attrib_binding *buffers)
{
   const bool with_base_vertex = basevertex != NULL;

   /* Byte sizes of the arrays appended after the fixed-size header. */
   const int counts_bytes = sizeof(GLsizei) * draw_count;
   const int index_ptrs_bytes = sizeof(indices[0]) * draw_count;
   const int base_vertices_bytes =
      with_base_vertex ? sizeof(GLsizei) * draw_count : 0;
   const int bindings_bytes =
      util_bitcount(user_buffer_mask) * sizeof(buffers[0]);
   const int total_bytes =
      sizeof(struct marshal_cmd_MultiDrawElementsBaseVertex) +
      counts_bytes + index_ptrs_bytes + base_vertices_bytes + bindings_bytes;

   struct marshal_cmd_MultiDrawElementsBaseVertex *cmd =
      _mesa_glthread_allocate_command(ctx,
                                      DISPATCH_CMD_MultiDrawElementsBaseVertex,
                                      total_bytes);

   /* Fixed-size part. */
   cmd->has_base_vertex = with_base_vertex;
   cmd->mode = mode;
   cmd->type = type;
   cmd->draw_count = draw_count;
   cmd->user_buffer_mask = user_buffer_mask;
   cmd->index_buffer = index_buffer;

   /* Variable-size part, packed back to back after the header. */
   char *dst = (char *)(cmd + 1);

   memcpy(dst, count, counts_bytes);
   dst += counts_bytes;

   memcpy(dst, indices, index_ptrs_bytes);
   dst += index_ptrs_bytes;

   if (with_base_vertex) {
      memcpy(dst, basevertex, base_vertices_bytes);
      dst += base_vertices_bytes;
   }

   if (user_buffer_mask)
      memcpy(dst, buffers, bindings_bytes);
}

/**
 * glthread marshalling for glMultiDrawElementsBaseVertex.
 *
 * \param mode        primitive mode, passed through unchanged
 * \param count       array of draw_count index counts
 * \param type        index type
 * \param indices     array of draw_count index pointers / offsets
 * \param draw_count  number of draws
 * \param basevertex  array of draw_count base vertices, or NULL to execute
 *                    the call as MultiDrawElementsEXT
 *
 * The call is queued unchanged when no user data has to be uploaded.
 * Otherwise the user vertex arrays and/or user indices are uploaded first
 * and the queued command references the uploaded buffers. Whenever the
 * queue can't be used (display list compilation, unsupported uploads,
 * a negative or too large draw count, ...), glthread is synchronized and
 * the call is executed directly.
 */
void GLAPIENTRY
_mesa_marshal_MultiDrawElementsBaseVertex(GLenum mode, const GLsizei *count,
                                          GLenum type,
                                          const GLvoid *const *indices,
                                          GLsizei draw_count,
                                          const GLsizei *basevertex)
{
   GET_CURRENT_CONTEXT(ctx);

   struct glthread_vao *vao = ctx->GLThread.CurrentVAO;
   unsigned user_buffer_mask = vao->UserPointerMask & vao->BufferEnabled;
   bool has_user_indices = vao->CurrentElementBufferName == 0;

   if (ctx->GLThread.ListMode)
      goto sync;

   /* Fast path when nothing needs to be done.
    *
    * The per-draw arrays are copied into the queued command, so the draw
    * count must also be small enough for the command to fit the queue.
    * Calls with a larger draw count fall through and take the sync path
    * below, which applies the same limit.
    */
   if (draw_count >= 0 && draw_count <= MARSHAL_MAX_CMD_SIZE / 32 &&
       (ctx->API == API_OPENGL_CORE ||
        !is_index_type_valid(type) ||
        (!user_buffer_mask && !has_user_indices))) {
      multi_draw_elements_async(ctx, mode, count, type, indices, draw_count,
                                basevertex, NULL, 0, NULL);
      return;
   }

   /* Index bounds are only needed for user arrays whose bit is not set in
    * NonZeroDivisorMask.
    */
   bool need_index_bounds = user_buffer_mask & ~vao->NonZeroDivisorMask;

   /* If the draw count is too high or negative, the queue can't be used.
    *
    * Sync if indices come from a buffer and vertices come from memory
    * and index bounds are not valid. We would have to map the indices
    * to compute the index bounds, and for that we would have to sync anyway.
    */
   if (!ctx->GLThread.SupportsNonVBOUploads ||
       draw_count < 0 || draw_count > MARSHAL_MAX_CMD_SIZE / 32 ||
       (need_index_bounds && !has_user_indices))
      goto sync;

   unsigned index_size = get_index_size(type);
   unsigned min_index = ~0;
   unsigned max_index = 0;
   unsigned total_count = 0;
   unsigned num_vertices = 0;

   /* This is always true if there is per-vertex data that needs to be
    * uploaded.
    */
   if (need_index_bounds) {
      /* Compute the index bounds over all draws. */
      for (unsigned i = 0; i < draw_count; i++) {
         GLsizei vertex_count = count[i];

         if (vertex_count < 0) {
            /* Just call the driver to set the error. */
            multi_draw_elements_async(ctx, mode, count, type, indices, draw_count,
                                      basevertex, NULL, 0, NULL);
            return;
         }
         if (vertex_count == 0)
            continue;

         unsigned min = ~0, max = 0;
         vbo_get_minmax_index_mapped(vertex_count, index_size,
                                     ctx->GLThread._RestartIndex[index_size - 1],
                                     ctx->GLThread._PrimitiveRestart, indices[i],
                                     &min, &max);
         /* The bounds of each draw are shifted by its own base vertex. */
         if (basevertex) {
            min += basevertex[i];
            max += basevertex[i];
         }
         min_index = MIN2(min_index, min);
         max_index = MAX2(max_index, max);
         total_count += vertex_count;
      }

      num_vertices = max_index + 1 - min_index;

      if (total_count == 0 || num_vertices == 0) {
         /* Nothing to do, but call the driver to set possible GL errors. */
         multi_draw_elements_async(ctx, mode, count, type, indices, draw_count,
                                   basevertex, NULL, 0, NULL);
         return;
      }

      /* If there is too much data to upload, sync and let the driver unroll
       * indices. */
      if (util_is_vbo_upload_ratio_too_large(total_count, num_vertices))
         goto sync;
   } else if (has_user_indices) {
      /* Only compute total_count for the upload of indices. */
      for (unsigned i = 0; i < draw_count; i++) {
         GLsizei vertex_count = count[i];

         if (vertex_count < 0) {
            /* Just call the driver to set the error. */
            multi_draw_elements_async(ctx, mode, count, type, indices, draw_count,
                                      basevertex, NULL, 0, NULL);
            return;
         }
         if (vertex_count == 0)
            continue;

         total_count += vertex_count;
      }

      if (total_count == 0) {
         /* Nothing to do, but call the driver to set possible GL errors. */
         multi_draw_elements_async(ctx, mode, count, type, indices, draw_count,
                                   basevertex, NULL, 0, NULL);
         return;
      }
   }

   /* Upload vertices. */
   struct glthread_attrib_binding buffers[VERT_ATTRIB_MAX];
   if (user_buffer_mask &&
       !upload_vertices(ctx, user_buffer_mask, min_index, num_vertices,
                        0, 1, buffers))
      goto sync;

   /* Upload indices. */
   struct gl_buffer_object *index_buffer = NULL;
   if (has_user_indices) {
      /* draw_count is bounded by the check above, which keeps this stack
       * allocation small.
       */
      const GLvoid **out_indices = alloca(sizeof(indices[0]) * draw_count);

      index_buffer = upload_multi_indices(ctx, total_count, index_size,
                                          draw_count, count, indices,
                                          out_indices);
      /* The queued command uses the index pointers written by the upload. */
      indices = out_indices;
   }

   /* Draw asynchronously. */
   multi_draw_elements_async(ctx, mode, count, type, indices, draw_count,
                             basevertex, index_buffer, user_buffer_mask,
                             buffers);
   return;

sync:
   _mesa_glthread_finish_before(ctx, "DrawElements");

   if (basevertex) {
      CALL_MultiDrawElementsBaseVertex(ctx->CurrentServerDispatch,
                                       (mode, count, type, indices, draw_count,
                                        basevertex));
   } else {
      CALL_MultiDrawElementsEXT(ctx->CurrentServerDispatch,
                                (mode, count, type, indices, draw_count));
   }
}

/* glDrawArrays: forwarded to draw_arrays() as a single-instance draw with
 * base instance 0.
 */
void GLAPIENTRY
_mesa_marshal_DrawArrays(GLenum mode, GLint first, GLsizei count)
{
   const GLsizei instance_count = 1;
   const GLuint baseinstance = 0;
   /* NOTE(review): last draw_arrays() argument; presumably the "compiled into
    * display lists" flag, as in draw_elements() - confirm. */
   const bool dlist_flag = true;

   draw_arrays(mode, first, count, instance_count, baseinstance, dlist_flag);
}

/* glDrawArraysInstanced: forwarded to draw_arrays() with base instance 0. */
void GLAPIENTRY
_mesa_marshal_DrawArraysInstancedARB(GLenum mode, GLint first, GLsizei count,
                                     GLsizei instance_count)
{
   const GLuint baseinstance = 0;
   /* NOTE(review): last draw_arrays() argument; presumably the "compiled into
    * display lists" flag, as in draw_elements() - confirm. */
   const bool dlist_flag = false;

   draw_arrays(mode, first, count, instance_count, baseinstance, dlist_flag);
}

/* glDrawArraysInstancedBaseInstance: all arguments are forwarded to
 * draw_arrays() unchanged.
 */
void GLAPIENTRY
_mesa_marshal_DrawArraysInstancedBaseInstance(GLenum mode, GLint first,
                                              GLsizei count, GLsizei instance_count,
                                              GLuint baseinstance)
{
   /* NOTE(review): last draw_arrays() argument; presumably the "compiled into
    * display lists" flag, as in draw_elements() - confirm. */
   const bool dlist_flag = false;

   draw_arrays(mode, first, count, instance_count, baseinstance, dlist_flag);
}

/* glDrawElements: forwarded to draw_elements() as a single-instance draw
 * with no base vertex, no base instance and no known index bounds.
 */
void GLAPIENTRY
_mesa_marshal_DrawElements(GLenum mode, GLsizei count, GLenum type,
                           const GLvoid *indices)
{
   const GLsizei instance_count = 1;
   const GLint basevertex = 0;
   const GLuint baseinstance = 0;
   const bool index_bounds_valid = false;
   const GLuint min_index = 0;
   const GLuint max_index = 0;
   const bool compiled_into_dlist = true;

   draw_elements(mode, count, type, indices, instance_count, basevertex,
                 baseinstance, index_bounds_valid, min_index, max_index,
                 compiled_into_dlist);
}

/* glDrawRangeElements: forwarded to draw_elements() as a single-instance
 * draw whose index bounds [start, end] are supplied by the caller.
 */
void GLAPIENTRY
_mesa_marshal_DrawRangeElements(GLenum mode, GLuint start, GLuint end,
                                GLsizei count, GLenum type,
                                const GLvoid *indices)
{
   const GLsizei instance_count = 1;
   const GLint basevertex = 0;
   const GLuint baseinstance = 0;
   const bool index_bounds_valid = true;
   const bool compiled_into_dlist = true;

   draw_elements(mode, count, type, indices, instance_count, basevertex,
                 baseinstance, index_bounds_valid, start, end,
                 compiled_into_dlist);
}

/* glDrawElementsInstanced: forwarded to draw_elements() with no base vertex,
 * no base instance and no known index bounds.
 */
void GLAPIENTRY
_mesa_marshal_DrawElementsInstancedARB(GLenum mode, GLsizei count, GLenum type,
                                       const GLvoid *indices, GLsizei instance_count)
{
   const GLint basevertex = 0;
   const GLuint baseinstance = 0;
   const bool index_bounds_valid = false;
   const GLuint min_index = 0;
   const GLuint max_index = 0;
   const bool compiled_into_dlist = false;

   draw_elements(mode, count, type, indices, instance_count, basevertex,
                 baseinstance, index_bounds_valid, min_index, max_index,
                 compiled_into_dlist);
}

/* glDrawElementsBaseVertex: forwarded to draw_elements() as a single-instance
 * draw with no base instance and no known index bounds.
 */
void GLAPIENTRY
_mesa_marshal_DrawElementsBaseVertex(GLenum mode, GLsizei count, GLenum type,
                                     const GLvoid *indices, GLint basevertex)
{
   const GLsizei instance_count = 1;
   const GLuint baseinstance = 0;
   const bool index_bounds_valid = false;
   const GLuint min_index = 0;
   const GLuint max_index = 0;
   const bool compiled_into_dlist = true;

   draw_elements(mode, count, type, indices, instance_count, basevertex,
                 baseinstance, index_bounds_valid, min_index, max_index,
                 compiled_into_dlist);
}

/* glDrawRangeElementsBaseVertex: forwarded to draw_elements() as a
 * single-instance draw whose index bounds [start, end] are supplied by the
 * caller.
 */
void GLAPIENTRY
_mesa_marshal_DrawRangeElementsBaseVertex(GLenum mode, GLuint start, GLuint end,
                                          GLsizei count, GLenum type,
                                          const GLvoid *indices, GLint basevertex)
{
   const GLsizei instance_count = 1;
   const GLuint baseinstance = 0;
   const bool index_bounds_valid = true;
   const bool compiled_into_dlist = false;

   draw_elements(mode, count, type, indices, instance_count, basevertex,
                 baseinstance, index_bounds_valid, start, end,
                 compiled_into_dlist);
}

/* glDrawElementsInstancedBaseVertex: forwarded to draw_elements() with no
 * base instance and no known index bounds.
 */
void GLAPIENTRY
_mesa_marshal_DrawElementsInstancedBaseVertex(GLenum mode, GLsizei count,
                                              GLenum type, const GLvoid *indices,
                                              GLsizei instance_count, GLint basevertex)
{
   const GLuint baseinstance = 0;
   const bool index_bounds_valid = false;
   const GLuint min_index = 0;
   const GLuint max_index = 0;
   const bool compiled_into_dlist = false;

   draw_elements(mode, count, type, indices, instance_count, basevertex,
                 baseinstance, index_bounds_valid, min_index, max_index,
                 compiled_into_dlist);
}

/* glDrawElementsInstancedBaseInstance: forwarded to draw_elements() with no
 * base vertex and no known index bounds.
 */
void GLAPIENTRY
_mesa_marshal_DrawElementsInstancedBaseInstance(GLenum mode, GLsizei count,
                                                GLenum type, const GLvoid *indices,
                                                GLsizei instance_count, GLuint baseinstance)
{
   const GLint basevertex = 0;
   const bool index_bounds_valid = false;
   const GLuint min_index = 0;
   const GLuint max_index = 0;
   const bool compiled_into_dlist = false;

   draw_elements(mode, count, type, indices, instance_count, basevertex,
                 baseinstance, index_bounds_valid, min_index, max_index,
                 compiled_into_dlist);
}

/* glDrawElementsInstancedBaseVertexBaseInstance: forwarded to draw_elements()
 * with no known index bounds.
 */
void GLAPIENTRY
_mesa_marshal_DrawElementsInstancedBaseVertexBaseInstance(GLenum mode, GLsizei count,
                                                          GLenum type, const GLvoid *indices,
                                                          GLsizei instance_count, GLint basevertex,
                                                          GLuint baseinstance)
{
   const bool index_bounds_valid = false;
   const GLuint min_index = 0;
   const GLuint max_index = 0;
   const bool compiled_into_dlist = false;

   draw_elements(mode, count, type, indices, instance_count, basevertex,
                 baseinstance, index_bounds_valid, min_index, max_index,
                 compiled_into_dlist);
}

/* glMultiDrawElements: handled by the BaseVertex variant, which treats a
 * NULL basevertex array as "no base vertex".
 */
void GLAPIENTRY
_mesa_marshal_MultiDrawElementsEXT(GLenum mode, const GLsizei *count,
                                   GLenum type, const GLvoid *const *indices,
                                   GLsizei draw_count)
{
   const GLsizei *no_basevertex = NULL;

   _mesa_marshal_MultiDrawElementsBaseVertex(mode, count, type, indices,
                                             draw_count, no_basevertex);
}

/* Stub: this command is never expected to be executed (see the message). */
uint32_t
_mesa_unmarshal_DrawArraysInstancedARB(struct gl_context *ctx,
                                       const struct marshal_cmd_DrawArraysInstancedARB *cmd,
                                       const uint64_t *last)
{
   (void)ctx;
   (void)cmd;
   (void)last;

   unreachable("never used - DrawArraysInstancedBaseInstance is used instead");
   return 0;
}

/* Stub: this command is never expected to be executed (see the message). */
uint32_t
_mesa_unmarshal_DrawElements(struct gl_context *ctx,
                             const struct marshal_cmd_DrawElements *cmd,
                             const uint64_t *last)
{
   (void)ctx;
   (void)cmd;
   (void)last;

   unreachable("never used - DrawElementsInstancedBaseVertexBaseInstance is used instead");
   return 0;
}

/* Stub: this command is never expected to be executed (see the message). */
uint32_t
_mesa_unmarshal_DrawRangeElements(struct gl_context *ctx,
                                  const struct marshal_cmd_DrawRangeElements *cmd,
                                  const uint64_t *last)
{
   (void)ctx;
   (void)cmd;
   (void)last;

   unreachable("never used - DrawElementsInstancedBaseVertexBaseInstance is used instead");
   return 0;
}

/* Stub: this command is never expected to be executed (see the message). */
uint32_t
_mesa_unmarshal_DrawElementsBaseVertex(struct gl_context *ctx,
                                       const struct marshal_cmd_DrawElementsBaseVertex *cmd,
                                       const uint64_t *last)
{
   (void)ctx;
   (void)cmd;
   (void)last;

   unreachable("never used - DrawElementsInstancedBaseVertexBaseInstance is used instead");
   return 0;
}

/* Stub: this command is never expected to be executed (see the message). */
uint32_t
_mesa_unmarshal_DrawElementsInstancedBaseVertex(struct gl_context *ctx,
                                                const struct marshal_cmd_DrawElementsInstancedBaseVertex *cmd,
                                                const uint64_t *last)
{
   (void)ctx;
   (void)cmd;
   (void)last;

   unreachable("never used - DrawElementsInstancedBaseVertexBaseInstance is used instead");
   return 0;
}

/* Stub: this command is never expected to be executed (see the message). */
uint32_t
_mesa_unmarshal_DrawElementsInstancedBaseInstance(struct gl_context *ctx,
                                                  const struct marshal_cmd_DrawElementsInstancedBaseInstance *cmd,
                                                  const uint64_t *last)
{
   (void)ctx;
   (void)cmd;
   (void)last;

   unreachable("never used - DrawElementsInstancedBaseVertexBaseInstance is used instead");
   return 0;
}

/* Stub: this command is never expected to be executed (see the message). */
uint32_t
_mesa_unmarshal_MultiDrawElementsEXT(struct gl_context *ctx,
                                     const struct marshal_cmd_MultiDrawElementsEXT *cmd,
                                     const uint64_t *last)
{
   (void)ctx;
   (void)cmd;
   (void)last;

   unreachable("never used - MultiDrawElementsBaseVertex is used instead");
   return 0;
}