svga: Add GL4.1(compatibility profile) support in svga driver

This patch is a squash commit of a very long in-house patch series.

Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Charmaine Lee <charmainel@vmware.com>
Signed-off-by: Neha Bhende <bhenden@vmware.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5317>
This commit is contained in:
Neha Bhende 2020-05-26 21:26:42 +05:30 committed by Marge Bot
parent 52ce25be87
commit ccb4ea5a43
61 changed files with 8067 additions and 1555 deletions

View File

@ -201,7 +201,7 @@ typedef enum {
VGPU10_OPCODE_DCL_GLOBAL_FLAGS = 106,
/* GL guest */
VGPU10_OPCODE_IDIV = 107,
VGPU10_OPCODE_VMWARE = 107,
/* DX10.1 */
VGPU10_OPCODE_LOD = 108,

View File

@ -436,8 +436,9 @@ typedef uint32 SVGA3dSurfaceFlags;
* mob-backing to store all the samples.
*/
#define SVGA3D_SURFACE_MULTISAMPLE (CONST64U(1) << 32)
#define SVGA3D_SURFACE_DRAWINDIRECT_ARGS (CONST64U(1) << 38)
#define SVGA3D_SURFACE_FLAG_MAX (CONST64U(1) << 33)
#define SVGA3D_SURFACE_FLAG_MAX (CONST64U(1) << 42)
/*
* Surface flags types:
@ -464,7 +465,8 @@ typedef uint64 SVGA3dSurfaceAllFlags;
SVGA3D_SURFACE_HINT_INDIRECT_UPDATE | \
SVGA3D_SURFACE_TRANSFER_FROM_BUFFER | \
SVGA3D_SURFACE_VADECODE | \
SVGA3D_SURFACE_MULTISAMPLE \
SVGA3D_SURFACE_MULTISAMPLE | \
SVGA3D_SURFACE_DRAWINDIRECT_ARGS \
)
#define SVGA3D_SURFACE_2D_DISALLOWED_MASK \
@ -480,7 +482,8 @@ typedef uint64 SVGA3dSurfaceAllFlags;
SVGA3D_SURFACE_BIND_STREAM_OUTPUT | \
SVGA3D_SURFACE_TRANSFER_FROM_BUFFER | \
SVGA3D_SURFACE_VADECODE | \
SVGA3D_SURFACE_MULTISAMPLE \
SVGA3D_SURFACE_MULTISAMPLE | \
SVGA3D_SURFACE_DRAWINDIRECT_ARGS \
)
#define SVGA3D_SURFACE_BASICOPS_DISALLOWED_MASK \
@ -508,7 +511,8 @@ typedef uint64 SVGA3dSurfaceAllFlags;
SVGA3D_SURFACE_HINT_INDIRECT_UPDATE | \
SVGA3D_SURFACE_TRANSFER_FROM_BUFFER | \
SVGA3D_SURFACE_VADECODE | \
SVGA3D_SURFACE_MULTISAMPLE \
SVGA3D_SURFACE_MULTISAMPLE | \
SVGA3D_SURFACE_DRAWINDIRECT_ARGS \
)
#define SVGA3D_SURFACE_BUFFER_DISALLOWED_MASK \
@ -527,7 +531,8 @@ typedef uint64 SVGA3dSurfaceAllFlags;
SVGA3D_SURFACE_VOLUME | \
SVGA3D_SURFACE_1D | \
SVGA3D_SURFACE_SCREENTARGET | \
SVGA3D_SURFACE_MOB_PITCH \
SVGA3D_SURFACE_MOB_PITCH | \
SVGA3D_SURFACE_DRAWINDIRECT_ARGS \
)
#define SVGA3D_SURFACE_DX_ONLY_MASK \
@ -636,7 +641,8 @@ typedef uint64 SVGA3dSurfaceAllFlags;
SVGA3D_SURFACE_BIND_STREAM_OUTPUT | \
SVGA3D_SURFACE_TRANSFER_FROM_BUFFER | \
SVGA3D_SURFACE_VADECODE | \
SVGA3D_SURFACE_MULTISAMPLE \
SVGA3D_SURFACE_MULTISAMPLE | \
SVGA3D_SURFACE_DRAWINDIRECT_ARGS \
)

View File

@ -36,6 +36,7 @@ files_svga = files(
'svga_pipe_flush.c',
'svga_pipe_fs.c',
'svga_pipe_gs.c',
'svga_pipe_ts.c',
'svga_pipe_misc.c',
'svga_pipe_query.c',
'svga_pipe_rasterizer.c',
@ -56,6 +57,7 @@ files_svga = files(
'svga_state_framebuffer.c',
'svga_state_fs.c',
'svga_state_gs.c',
'svga_state_ts.c',
'svga_state_need_swtnl.c',
'svga_state_rss.c',
'svga_state_sampler.c',

View File

@ -697,4 +697,33 @@ SVGA3D_vgpu10_ResolveCopy(struct svga_winsys_context *swc,
struct svga_winsys_surface *src,
const SVGA3dSurfaceFormat copyFormat);
enum pipe_error
SVGA3D_sm5_DrawIndexedInstancedIndirect(struct svga_winsys_context *swc,
struct svga_winsys_surface *argBuffer,
unsigned argOffset);
enum pipe_error
SVGA3D_sm5_DrawInstancedIndirect(struct svga_winsys_context *swc,
struct svga_winsys_surface *argBuffer,
unsigned argOffset);
enum pipe_error
SVGA3D_sm5_Dispatch(struct svga_winsys_context *swc,
const uint32 threadGroupCount[3]);
enum pipe_error
SVGA3D_sm5_DispatchIndirect(struct svga_winsys_context *swc,
struct svga_winsys_surface *argBuffer,
uint32 argOffset);
enum pipe_error
SVGA3D_sm5_DefineAndBindStreamOutput(struct svga_winsys_context *swc,
SVGA3dStreamOutputId soid,
uint32 numOutputStreamEntries,
uint32 numOutputStreamStrides,
uint32 streamOutputStrideInBytes[SVGA3D_DX_MAX_SOTARGETS],
struct svga_winsys_buffer *declBuf,
uint32 rasterizedStream,
uint32 sizeInBytes);
#endif /* __SVGA3D_H__ */

View File

@ -1130,7 +1130,7 @@ SVGA3D_vgpu10_DefineStreamOutput(struct svga_winsys_context *swc,
memcpy(cmd->decl, decl,
sizeof(SVGA3dStreamOutputDeclarationEntry)
* SVGA3D_MAX_STREAMOUT_DECLS);
* SVGA3D_MAX_DX10_STREAMOUT_DECLS);
cmd->rasterizedStream = 0;
swc->commit(swc);
@ -1432,3 +1432,159 @@ SVGA3D_vgpu10_ResolveCopy(struct svga_winsys_context *swc,
return PIPE_OK;
}
/**
 * Emit an SM5 DrawIndexedInstancedIndirect command.  The draw arguments
 * (index counts, instance counts, etc.) are read by the device from
 * \p argBuffer starting at byte offset \p argOffset.
 *
 * \return PIPE_OK on success, or PIPE_ERROR_OUT_OF_MEMORY if command
 *         buffer space could not be reserved.
 */
enum pipe_error
SVGA3D_sm5_DrawIndexedInstancedIndirect(struct svga_winsys_context *swc,
struct svga_winsys_surface *argBuffer,
unsigned argOffset)
{
SVGA3dCmdDXDrawIndexedInstancedIndirect *cmd =
SVGA3D_FIFOReserve(swc,
SVGA_3D_CMD_DX_DRAW_INDEXED_INSTANCED_INDIRECT,
sizeof(SVGA3dCmdDXDrawIndexedInstancedIndirect),
1); /* one relocation */
if (!cmd)
return PIPE_ERROR_OUT_OF_MEMORY;
/* Patch the args-buffer surface id via a read relocation. */
swc->surface_relocation(swc, &cmd->argsBufferSid, NULL, argBuffer,
SVGA_RELOC_READ);
cmd->byteOffsetForArgs = argOffset;
swc->commit(swc);
return PIPE_OK;
}
/**
 * Emit an SM5 DrawInstancedIndirect (non-indexed) command.  The draw
 * arguments are read by the device from \p argBuffer starting at byte
 * offset \p argOffset.
 *
 * \return PIPE_OK on success, or PIPE_ERROR_OUT_OF_MEMORY if command
 *         buffer space could not be reserved.
 */
enum pipe_error
SVGA3D_sm5_DrawInstancedIndirect(struct svga_winsys_context *swc,
struct svga_winsys_surface *argBuffer,
unsigned argOffset)
{
SVGA3dCmdDXDrawInstancedIndirect *cmd =
SVGA3D_FIFOReserve(swc,
SVGA_3D_CMD_DX_DRAW_INSTANCED_INDIRECT,
sizeof(SVGA3dCmdDXDrawInstancedIndirect),
1); /* one relocation */
if (!cmd)
return PIPE_ERROR_OUT_OF_MEMORY;
/* Patch the args-buffer surface id via a read relocation. */
swc->surface_relocation(swc, &cmd->argsBufferSid, NULL, argBuffer,
SVGA_RELOC_READ);
cmd->byteOffsetForArgs = argOffset;
swc->commit(swc);
return PIPE_OK;
}
/**
 * Emit an SM5 compute Dispatch command with an explicit grid size.
 *
 * \param threadGroupCount  number of thread groups in X, Y, Z
 * \return PIPE_OK on success, or PIPE_ERROR_OUT_OF_MEMORY if command
 *         buffer space could not be reserved.
 */
enum pipe_error
SVGA3D_sm5_Dispatch(struct svga_winsys_context *swc,
const uint32 threadGroupCount[3])
{
SVGA3dCmdDXDispatch *cmd;
cmd = SVGA3D_FIFOReserve(swc,
SVGA_3D_CMD_DX_DISPATCH,
sizeof(SVGA3dCmdDXDispatch),
0);
if (!cmd)
return PIPE_ERROR_OUT_OF_MEMORY;
cmd->threadGroupCountX = threadGroupCount[0];
cmd->threadGroupCountY = threadGroupCount[1];
cmd->threadGroupCountZ = threadGroupCount[2];
swc->commit(swc);
return PIPE_OK;
}
/**
 * Emit an SM5 DispatchIndirect command.  The thread-group counts are
 * read by the device from \p argBuffer starting at byte offset
 * \p argOffset.
 *
 * \return PIPE_OK on success, or PIPE_ERROR_OUT_OF_MEMORY if command
 *         buffer space could not be reserved.
 */
enum pipe_error
SVGA3D_sm5_DispatchIndirect(struct svga_winsys_context *swc,
struct svga_winsys_surface *argBuffer,
uint32 argOffset)
{
SVGA3dCmdDXDispatchIndirect *cmd;
cmd = SVGA3D_FIFOReserve(swc,
SVGA_3D_CMD_DX_DISPATCH_INDIRECT,
sizeof(SVGA3dCmdDXDispatchIndirect),
1);
if (!cmd)
return PIPE_ERROR_OUT_OF_MEMORY;
/* Patch the args-buffer surface id via a read relocation. */
swc->surface_relocation(swc, &cmd->argsBufferSid, NULL, argBuffer,
SVGA_RELOC_READ);
cmd->byteOffsetForArgs = argOffset;
swc->commit(swc);
return PIPE_OK;
}
/**
 * We don't want any flush between DefineStreamOutputWithMob and
 * BindStreamOutput because it would leave partial state in the command
 * buffer.  This function makes sure there is enough room for
 * both commands before issuing them as a single reservation.
 */
enum pipe_error
SVGA3D_sm5_DefineAndBindStreamOutput(struct svga_winsys_context *swc,
SVGA3dStreamOutputId soid,
uint32 numOutputStreamEntries,
uint32 numOutputStreamStrides,
uint32 streamOutputStrideInBytes[SVGA3D_DX_MAX_SOTARGETS],
struct svga_winsys_buffer *declBuf,
uint32 rasterizedStream,
uint32 sizeInBytes)
{
unsigned i;
SVGA3dCmdHeader *header;
SVGA3dCmdDXDefineStreamOutputWithMob *dcmd;
SVGA3dCmdDXBindStreamOutput *bcmd;
/* Two command headers plus both command payloads, reserved at once so
 * that no flush can split the pair.
 */
unsigned totalSize = 2 * sizeof(*header) +
sizeof(*dcmd) + sizeof(*bcmd);
/* Make sure there is room for both commands */
/* NOTE(review): two relocation slots are reserved here but only one
 * mob_relocation is issued below -- confirm the winsys accepts an
 * over-estimate of the relocation count.
 */
header = swc->reserve(swc, totalSize, 2);
if (!header)
return PIPE_ERROR_OUT_OF_MEMORY;
/* DXDefineStreamOutputWithMob command */
header->id = SVGA_3D_CMD_DX_DEFINE_STREAMOUTPUT_WITH_MOB;
header->size = sizeof(*dcmd);
dcmd = (SVGA3dCmdDXDefineStreamOutputWithMob *)(header + 1);
dcmd->soid= soid;
dcmd->numOutputStreamEntries = numOutputStreamEntries;
dcmd->numOutputStreamStrides = numOutputStreamStrides;
dcmd->rasterizedStream = rasterizedStream;
for (i = 0; i < ARRAY_SIZE(dcmd->streamOutputStrideInBytes); i++)
dcmd->streamOutputStrideInBytes[i] = streamOutputStrideInBytes[i];
/* DXBindStreamOutput command, packed immediately after the first */
header = (SVGA3dCmdHeader *)(dcmd + 1);
header->id = SVGA_3D_CMD_DX_BIND_STREAMOUTPUT;
header->size = sizeof(*bcmd);
bcmd = (SVGA3dCmdDXBindStreamOutput *)(header + 1);
bcmd->soid = soid;
bcmd->offsetInBytes = 0;
/* The declaration buffer (mob) holding the SO declarations; the winsys
 * patches mobid/offsetInBytes via this relocation.
 */
swc->mob_relocation(swc, &bcmd->mobid,
&bcmd->offsetInBytes, declBuf, 0,
SVGA_RELOC_WRITE);
bcmd->sizeInBytes = sizeInBytes;
/* NOTE(review): offsetInBytes is reset to 0 again after its address was
 * handed to mob_relocation -- this looks redundant and could clobber a
 * value the relocation wrote; verify against the winsys contract.
 */
bcmd->offsetInBytes = 0;
swc->commit(swc);
return PIPE_OK;
}

View File

@ -44,6 +44,7 @@
#include "svga_debug.h"
#include "svga_state.h"
#include "svga_winsys.h"
#include "svga_streamout.h"
#define CONST0_UPLOAD_DEFAULT_SIZE 65536
@ -79,6 +80,9 @@ svga_destroy(struct pipe_context *pipe)
pipe->delete_blend_state(pipe, svga->noop_blend);
/* destroy stream output statistics queries */
svga_destroy_stream_output_queries(svga);
/* free query gb object */
if (svga->gb_query) {
pipe->destroy_query(pipe, NULL);
@ -91,6 +95,7 @@ svga_destroy(struct pipe_context *pipe)
svga_cleanup_framebuffer(svga);
svga_cleanup_tss_binding(svga);
svga_cleanup_vertex_state(svga);
svga_cleanup_tcs_state(svga);
svga_destroy_swtnl(svga);
svga_hwtnl_destroy(svga->hwtnl);
@ -174,12 +179,14 @@ svga_context_create(struct pipe_screen *screen, void *priv, unsigned flags)
svga_init_fs_functions(svga);
svga_init_vs_functions(svga);
svga_init_gs_functions(svga);
svga_init_ts_functions(svga);
svga_init_vertex_functions(svga);
svga_init_constbuffer_functions(svga);
svga_init_query_functions(svga);
svga_init_surface_functions(svga);
svga_init_stream_output_functions(svga);
svga_init_clear_functions(svga);
svga_init_tracked_state(svga);
/* init misc state */
svga->curr.sample_mask = ~0;
@ -250,6 +257,7 @@ svga_context_create(struct pipe_screen *screen, void *priv, unsigned flags)
memset(&svga->state.hw_clear, 0xcd, sizeof(svga->state.hw_clear));
memset(&svga->state.hw_clear.framebuffer, 0x0,
sizeof(svga->state.hw_clear.framebuffer));
memset(&svga->state.hw_clear.rtv, 0, sizeof(svga->state.hw_clear.rtv));
svga->state.hw_clear.num_rendertargets = 0;
svga->state.hw_clear.dsv = NULL;
@ -269,6 +277,8 @@ svga_context_create(struct pipe_screen *screen, void *priv, unsigned flags)
svga->state.hw_draw.vs = NULL;
svga->state.hw_draw.gs = NULL;
svga->state.hw_draw.fs = NULL;
svga->state.hw_draw.tcs = NULL;
svga->state.hw_draw.tes = NULL;
/* Initialize the currently bound buffer resources */
memset(svga->state.hw_draw.constbuf, 0,
@ -303,10 +313,16 @@ svga_context_create(struct pipe_screen *screen, void *priv, unsigned flags)
svga->noop_blend = svga->pipe.create_blend_state(&svga->pipe, &noop_tmpl);
}
svga->dirty = ~0;
svga->dirty = SVGA_NEW_ALL;
svga->pred.query_id = SVGA3D_INVALID_ID;
svga->disable_rasterizer = FALSE;
/**
* Create stream output statistics queries used in the workaround for auto
* draw with stream instancing.
*/
svga_create_stream_output_queries(svga);
goto done;
cleanup:
@ -398,6 +414,11 @@ svga_context_flush(struct svga_context *svga,
svga->rebind.flags.fs = TRUE;
svga->rebind.flags.gs = TRUE;
if (svga_have_sm5(svga)) {
svga->rebind.flags.tcs = TRUE;
svga->rebind.flags.tes = TRUE;
}
if (svga_need_to_rebind_resources(svga)) {
svga->rebind.flags.query = TRUE;
}
@ -447,12 +468,7 @@ svga_hwtnl_flush_retry(struct svga_context *svga)
{
enum pipe_error ret = PIPE_OK;
ret = svga_hwtnl_flush(svga->hwtnl);
if (ret == PIPE_ERROR_OUT_OF_MEMORY) {
svga_context_flush(svga, NULL);
ret = svga_hwtnl_flush(svga->hwtnl);
}
SVGA_RETRY_OOM(svga, ret, svga_hwtnl_flush(svga->hwtnl));
assert(ret == PIPE_OK);
}

View File

@ -43,7 +43,7 @@
#include "svga_winsys.h"
#include "svga_hw_reg.h"
#include "svga3d_shaderdefs.h"
#include "svga_debug.h"
/** Non-GPU queries for gallium HUD */
enum svga_hud {
@ -56,6 +56,7 @@ enum svga_hud {
SVGA_QUERY_NUM_BUFFERS_MAPPED,
SVGA_QUERY_NUM_TEXTURES_MAPPED,
SVGA_QUERY_NUM_BYTES_UPLOADED,
SVGA_QUERY_NUM_COMMAND_BUFFERS,
SVGA_QUERY_COMMAND_BUFFER_SIZE,
SVGA_QUERY_FLUSH_TIME,
SVGA_QUERY_SURFACE_WRITE_FLUSHES,
@ -64,6 +65,8 @@ enum svga_hud {
SVGA_QUERY_NUM_BUFFER_UPLOADS,
SVGA_QUERY_NUM_CONST_BUF_UPDATES,
SVGA_QUERY_NUM_CONST_UPDATES,
SVGA_QUERY_NUM_SHADER_RELOCATIONS,
SVGA_QUERY_NUM_SURFACE_RELOCATIONS,
/* running total counters */
SVGA_QUERY_MEMORY_USED,
@ -74,6 +77,7 @@ enum svga_hud {
SVGA_QUERY_NUM_GENERATE_MIPMAP,
SVGA_QUERY_NUM_FAILED_ALLOCATIONS,
SVGA_QUERY_NUM_COMMANDS_PER_DRAW,
SVGA_QUERY_SHADER_MEM_USED,
/*SVGA_QUERY_MAX has to be last because it is size of an array*/
SVGA_QUERY_MAX
@ -109,6 +113,8 @@ struct svga_blend_state {
unsigned alpha_to_coverage:1;
unsigned alpha_to_one:1;
unsigned blend_color_alpha:1; /**< set blend color to alpha value */
unsigned logicop_enabled:1;
unsigned logicop_mode:5;
/** Per-render target state */
struct {
@ -269,6 +275,11 @@ struct svga_state
struct svga_vertex_shader *vs;
struct svga_geometry_shader *user_gs; /* user-specified GS */
struct svga_geometry_shader *gs; /* derived GS */
/* derived tessellation control shader */
struct svga_tcs_shader *tcs;
/* derived tessellation evaluation shader */
struct svga_tes_shader *tes;
struct svga_compute_shader *cs;
struct pipe_vertex_buffer vb[PIPE_MAX_ATTRIBS];
/** Constant buffers for each shader.
@ -286,11 +297,11 @@ struct svga_state
int nr_fbs;
struct pipe_poly_stipple poly_stipple;
struct pipe_scissor_state scissor;
struct pipe_scissor_state scissor[SVGA3D_DX_MAX_VIEWPORTS];
struct pipe_blend_color blend_color;
struct pipe_stencil_ref stencil_ref;
struct pipe_clip_state clip;
struct pipe_viewport_state viewport;
struct pipe_viewport_state viewport[SVGA3D_DX_MAX_VIEWPORTS];
unsigned num_samplers[PIPE_SHADER_TYPES];
unsigned num_sampler_views[PIPE_SHADER_TYPES];
@ -303,6 +314,14 @@ struct svga_state
} tex_flags;
unsigned sample_mask;
unsigned vertices_per_patch;
float default_tesslevels[6]; /* tessellation (outer[4] + inner[2]) levels */
struct {
/* Determine the layout of the grid (in block units) to be used. */
unsigned size[3];
/* If DispatchIndirect is used, this will hold the grid size info */
struct pipe_resource *indirect;
} grid_info;
};
struct svga_prescale {
@ -311,21 +330,27 @@ struct svga_prescale {
boolean enabled;
};
struct svga_depthrange {
float zmin;
float zmax;
};
/* Updated by calling svga_update_state( SVGA_STATE_HW_CLEAR )
*/
struct svga_hw_clear_state
{
SVGA3dRect viewport;
struct {
float zmin, zmax;
} depthrange;
struct pipe_framebuffer_state framebuffer;
struct svga_prescale prescale;
/* VGPU9 only */
SVGA3dRect viewport;
struct svga_depthrange depthrange;
/* VGPU10 state */
SVGA3dViewport viewports[SVGA3D_DX_MAX_VIEWPORTS];
struct svga_prescale prescale[SVGA3D_DX_MAX_VIEWPORTS];
struct pipe_scissor_state scissors[SVGA3D_DX_MAX_VIEWPORTS];
unsigned num_prescale;
unsigned num_rendertargets;
struct pipe_surface *rtv[SVGA3D_MAX_RENDER_TARGETS];
struct pipe_surface *dsv;
@ -361,6 +386,9 @@ struct svga_hw_draw_state
struct svga_shader_variant *fs;
struct svga_shader_variant *vs;
struct svga_shader_variant *gs;
struct svga_shader_variant *tcs;
struct svga_shader_variant *tes;
struct svga_shader_variant *cs;
/** Currently bound constant buffer, per shader stage */
struct pipe_resource *constbuf[PIPE_SHADER_TYPES];
@ -495,7 +523,7 @@ struct svga_context
struct util_bitmask *query_id_bm;
struct {
unsigned dirty[SVGA_STATE_MAX];
uint64_t dirty[SVGA_STATE_MAX];
/** bitmasks of which const buffers are changed */
unsigned dirty_constbufs[PIPE_SHADER_TYPES];
@ -508,7 +536,7 @@ struct svga_context
} state;
struct svga_state curr; /* state from the gallium frontend */
unsigned dirty; /* statechanges since last update_state() */
uint64_t dirty; /* statechanges since last update_state() */
union {
struct {
@ -518,6 +546,9 @@ struct svga_context
unsigned vs:1;
unsigned fs:1;
unsigned gs:1;
unsigned tcs:1;
unsigned tes:1;
unsigned cs:1;
unsigned query:1;
} flags;
unsigned val;
@ -531,7 +562,10 @@ struct svga_context
struct util_bitmask *gb_query_alloc_mask; /**< gb query object allocation mask */
struct svga_qmem_alloc_entry *gb_query_map[SVGA_QUERY_MAX];
/**< query mem block mapping */
struct svga_query *sq[SVGA_QUERY_MAX]; /**< queries currently in progress */
struct svga_query *sq[SVGA_QUERY_MAX+12]; /**< queries currently in progress */
/* The last 12 entries are for streamout
* queries for stream 0..3
*/
/** List of buffers with queued transfers */
struct list_head dirty_buffers;
@ -545,6 +579,7 @@ struct svga_context
uint64_t map_buffer_time; /**< SVGA_QUERY_MAP_BUFFER_TIME */
uint64_t num_buffers_mapped; /**< SVGA_QUERY_NUM_BUFFERS_MAPPED */
uint64_t num_textures_mapped; /**< SVGA_QUERY_NUM_TEXTURES_MAPPED */
uint64_t num_command_buffers; /**< SVGA_QUERY_NUM_COMMAND_BUFFERS */
uint64_t command_buffer_size; /**< SVGA_QUERY_COMMAND_BUFFER_SIZE */
uint64_t flush_time; /**< SVGA_QUERY_FLUSH_TIME */
uint64_t surface_write_flushes; /**< SVGA_QUERY_SURFACE_WRITE_FLUSHES */
@ -566,16 +601,28 @@ struct svga_context
uint64_t num_surface_views; /**< SVGA_QUERY_NUM_SURFACE_VIEWS */
uint64_t num_bytes_uploaded; /**< SVGA_QUERY_NUM_BYTES_UPLOADED */
uint64_t num_generate_mipmap; /**< SVGA_QUERY_NUM_GENERATE_MIPMAP */
uint64_t shader_mem_used; /**< SVGA_QUERY_SHADER_MEM_USED */
boolean uses_time; /**< os_time_get() calls needed? */
} hud;
/** The currently bound stream output targets */
boolean in_streamout; /* Set if streamout is active */
unsigned num_so_targets;
struct svga_winsys_surface *so_surfaces[SVGA3D_DX_MAX_SOTARGETS];
struct pipe_stream_output_target *so_targets[SVGA3D_DX_MAX_SOTARGETS];
struct svga_stream_output *current_so;
/**
* The following states are used in the workaround for auto draw with
* stream instancing.
*/
/* Last bound SO targets that can be used to get vertex count */
struct pipe_stream_output_target *vcount_so_targets[SVGA3D_DX_MAX_SOTARGETS];
unsigned vcount_buffer_stream; /* SO buffer to stream index mask */
struct pipe_query *so_queries[4]; /* SO stat queries for each stream */
/** A blend state with blending disabled, for falling back to when blending
* is illegal (e.g. an integer texture is bound)
*/
@ -601,41 +648,58 @@ struct svga_context
boolean render_condition;
boolean disable_rasterizer; /* Set if to disable rasterization */
struct {
struct svga_tcs_shader *passthrough_tcs;
struct svga_vertex_shader *vs;
struct svga_tes_shader *tes;
unsigned vertices_per_patch;
boolean passthrough;
} tcs;
};
/* A flag for each frontend state object:
*/
#define SVGA_NEW_BLEND 0x1
#define SVGA_NEW_DEPTH_STENCIL_ALPHA 0x2
#define SVGA_NEW_RAST 0x4
#define SVGA_NEW_SAMPLER 0x8
#define SVGA_NEW_TEXTURE 0x10
#define SVGA_NEW_VBUFFER 0x20
#define SVGA_NEW_VELEMENT 0x40
#define SVGA_NEW_FS 0x80
#define SVGA_NEW_VS 0x100
#define SVGA_NEW_FS_CONST_BUFFER 0x200
#define SVGA_NEW_VS_CONST_BUFFER 0x400
#define SVGA_NEW_FRAME_BUFFER 0x800
#define SVGA_NEW_STIPPLE 0x1000
#define SVGA_NEW_SCISSOR 0x2000
#define SVGA_NEW_BLEND_COLOR 0x4000
#define SVGA_NEW_CLIP 0x8000
#define SVGA_NEW_VIEWPORT 0x10000
#define SVGA_NEW_PRESCALE 0x20000
#define SVGA_NEW_REDUCED_PRIMITIVE 0x40000
#define SVGA_NEW_TEXTURE_BINDING 0x80000
#define SVGA_NEW_NEED_PIPELINE 0x100000
#define SVGA_NEW_NEED_SWVFETCH 0x200000
#define SVGA_NEW_NEED_SWTNL 0x400000
#define SVGA_NEW_FS_VARIANT 0x800000
#define SVGA_NEW_VS_VARIANT 0x1000000
#define SVGA_NEW_TEXTURE_FLAGS 0x4000000
#define SVGA_NEW_STENCIL_REF 0x8000000
#define SVGA_NEW_GS 0x10000000
#define SVGA_NEW_GS_CONST_BUFFER 0x20000000
#define SVGA_NEW_GS_VARIANT 0x40000000
#define SVGA_NEW_TEXTURE_CONSTS 0x80000000
#define SVGA_NEW_BLEND ((uint64_t) 0x1)
#define SVGA_NEW_DEPTH_STENCIL_ALPHA ((uint64_t) 0x2)
#define SVGA_NEW_RAST ((uint64_t) 0x4)
#define SVGA_NEW_SAMPLER ((uint64_t) 0x8)
#define SVGA_NEW_TEXTURE ((uint64_t) 0x10)
#define SVGA_NEW_VBUFFER ((uint64_t) 0x20)
#define SVGA_NEW_VELEMENT ((uint64_t) 0x40)
#define SVGA_NEW_FS ((uint64_t) 0x80)
#define SVGA_NEW_VS ((uint64_t) 0x100)
#define SVGA_NEW_FS_CONST_BUFFER ((uint64_t) 0x200)
#define SVGA_NEW_VS_CONST_BUFFER ((uint64_t) 0x400)
#define SVGA_NEW_FRAME_BUFFER ((uint64_t) 0x800)
#define SVGA_NEW_STIPPLE ((uint64_t) 0x1000)
#define SVGA_NEW_SCISSOR ((uint64_t) 0x2000)
#define SVGA_NEW_BLEND_COLOR ((uint64_t) 0x4000)
#define SVGA_NEW_CLIP ((uint64_t) 0x8000)
#define SVGA_NEW_VIEWPORT ((uint64_t) 0x10000)
#define SVGA_NEW_PRESCALE ((uint64_t) 0x20000)
#define SVGA_NEW_REDUCED_PRIMITIVE ((uint64_t) 0x40000)
#define SVGA_NEW_TEXTURE_BINDING ((uint64_t) 0x80000)
#define SVGA_NEW_NEED_PIPELINE ((uint64_t) 0x100000)
#define SVGA_NEW_NEED_SWVFETCH ((uint64_t) 0x200000)
#define SVGA_NEW_NEED_SWTNL ((uint64_t) 0x400000)
#define SVGA_NEW_FS_VARIANT ((uint64_t) 0x800000)
#define SVGA_NEW_VS_VARIANT ((uint64_t) 0x1000000)
#define SVGA_NEW_TEXTURE_FLAGS ((uint64_t) 0x4000000)
#define SVGA_NEW_STENCIL_REF ((uint64_t) 0x8000000)
#define SVGA_NEW_GS ((uint64_t) 0x10000000)
#define SVGA_NEW_GS_CONST_BUFFER ((uint64_t) 0x20000000)
#define SVGA_NEW_GS_VARIANT ((uint64_t) 0x40000000)
#define SVGA_NEW_TEXTURE_CONSTS ((uint64_t) 0x80000000)
#define SVGA_NEW_TCS ((uint64_t) 0x100000000)
#define SVGA_NEW_TES ((uint64_t) 0x200000000)
#define SVGA_NEW_TCS_VARIANT ((uint64_t) 0x400000000)
#define SVGA_NEW_TES_VARIANT ((uint64_t) 0x800000000)
#define SVGA_NEW_TCS_CONST_BUFFER ((uint64_t) 0x1000000000)
#define SVGA_NEW_TES_CONST_BUFFER ((uint64_t) 0x2000000000)
#define SVGA_NEW_TCS_PARAM ((uint64_t) 0x4000000000)
#define SVGA_NEW_ALL ((uint64_t) 0xFFFFFFFFFFFFFFFF)
void svga_init_state_functions( struct svga_context *svga );
@ -648,9 +712,11 @@ void svga_init_depth_stencil_functions( struct svga_context *svga );
void svga_init_misc_functions( struct svga_context *svga );
void svga_init_rasterizer_functions( struct svga_context *svga );
void svga_init_sampler_functions( struct svga_context *svga );
void svga_init_cs_functions( struct svga_context *svga );
void svga_init_fs_functions( struct svga_context *svga );
void svga_init_vs_functions( struct svga_context *svga );
void svga_init_gs_functions( struct svga_context *svga );
void svga_init_ts_functions( struct svga_context *svga );
void svga_init_vertex_functions( struct svga_context *svga );
void svga_init_constbuffer_functions( struct svga_context *svga );
void svga_init_draw_functions( struct svga_context *svga );
@ -663,6 +729,7 @@ void svga_cleanup_vertex_state( struct svga_context *svga );
void svga_cleanup_sampler_state( struct svga_context *svga );
void svga_cleanup_tss_binding( struct svga_context *svga );
void svga_cleanup_framebuffer( struct svga_context *svga );
void svga_cleanup_tcs_state( struct svga_context *svga );
void svga_context_flush( struct svga_context *svga,
struct pipe_fence_handle **pfence );
@ -723,6 +790,12 @@ svga_have_sm4_1(const struct svga_context *svga)
return svga_screen(svga->pipe.screen)->sws->have_sm4_1;
}
/** Returns TRUE if the winsys reports shader model 5 (SM5) support. */
static inline boolean
svga_have_sm5(const struct svga_context *svga)
{
return svga_screen(svga->pipe.screen)->sws->have_sm5;
}
static inline boolean
svga_need_to_rebind_resources(const struct svga_context *svga)
{
@ -745,5 +818,107 @@ svga_get_time(struct svga_context *svga)
return svga->hud.uses_time ? os_time_get() : 0;
}
/*
 * The SVGA_TRY_XX family of macros can be used to optionally replace a
 * function call with an error value, the purpose is to trigger and test
 * retry path handling.  In non-debug builds they are all identity
 * wrappers with zero overhead.
 */
#ifdef DEBUG
/*
 * Optionally replace a function call with a PIPE_ERROR_OUT_OF_MEMORY
 * return value
 */
#define SVGA_TRY(_func) \
((SVGA_DEBUG & DEBUG_RETRY) ? PIPE_ERROR_OUT_OF_MEMORY : (_func))
/* Optionally replace a function call with a NULL return value */
#define SVGA_TRY_PTR(_func) \
((SVGA_DEBUG & DEBUG_RETRY) ? NULL : (_func))
/*
 * Optionally replace a function call with a NULL return value, and set
 * the _retry parameter to TRUE.
 */
#define SVGA_TRY_MAP(_func, _retry) \
((SVGA_DEBUG & DEBUG_RETRY) ? (_retry) = TRUE, NULL : (_func))
#else
#define SVGA_TRY(_func) (_func)
#define SVGA_TRY_PTR(_func) (_func)
#define SVGA_TRY_MAP(_func, _retry) (_func)
#endif
/**
 * Enter retry processing after hitting out-of-command space.
 * Increments the winsys context's retry-nesting counter and, in debug
 * builds, warns when retries nest.
 */
static inline void
svga_retry_enter(struct svga_context *svga)
{
/* We shouldn't nest retries, but currently we do. */
if ((SVGA_DEBUG & DEBUG_RETRY) && svga->swc->in_retry) {
debug_printf("WARNING: Recursive retry. Level: %u.\n",
svga->swc->in_retry);
}
svga->swc->in_retry++;
}
/**
 * Exit retry processing after hitting out-of-command space.
 * Must balance a prior svga_retry_enter() call.
 */
static inline void
svga_retry_exit(struct svga_context *svga)
{
assert(svga->swc->in_retry > 0);
svga->swc->in_retry--;
}
/**
 * Perform a function call, and on failure flush the context and retry,
 * asserting that the retry succeeded. On return, the boolean argument
 * _retried indicates whether the function call was retried or not.
 *
 * NOTE: _func is evaluated twice on the retry path, so it must be safe
 * to re-issue.  The internal `ret` local would shadow any caller
 * variable of the same name -- avoid passing expressions involving a
 * caller-side `ret`.
 */
#define SVGA_RETRY_CHECK(_svga, _func, _retried) \
do { \
enum pipe_error ret; \
\
ret = SVGA_TRY(_func); \
(_retried) = (ret != PIPE_OK); \
if (_retried) { \
svga_retry_enter(_svga); \
svga_context_flush(_svga, NULL); \
ret = (_func); \
assert(ret == PIPE_OK); \
svga_retry_exit(_svga); \
} \
} while(0)
/**
 * Perform a function call, and on failure flush the context and retry,
 * asserting that the retry succeeded.  Convenience wrapper around
 * SVGA_RETRY_CHECK for callers that don't care whether a retry happened.
 */
#define SVGA_RETRY(_svga, _func) \
do { \
UNUSED boolean retried; \
\
SVGA_RETRY_CHECK(_svga, _func, retried); \
} while(0)
/**
 * Perform a function call, and on out-of-memory, flush the context and
 * retry. The retry return value is stored in _ret for reuse.
 *
 * NOTE: _func is evaluated twice on the retry path, so it must be safe
 * to re-issue.
 */
#define SVGA_RETRY_OOM(_svga, _ret, _func)                   \
   do {                                                      \
      (_ret) = SVGA_TRY(_func);                              \
      if ((_ret) == PIPE_ERROR_OUT_OF_MEMORY) {              \
         svga_retry_enter(_svga);                            \
         svga_context_flush(_svga, NULL);                    \
         (_ret) = (_func);                                   \
         svga_retry_exit(_svga);                             \
      }                                                      \
   } while (0)  /* no trailing ';' -- callers supply it; a semicolon here \
                 * would break `if (...) SVGA_RETRY_OOM(...); else ...` */
#endif

View File

@ -46,6 +46,7 @@
#define DEBUG_CACHE 0x8000
#define DEBUG_STREAMOUT 0x10000
#define DEBUG_SAMPLERS 0x20000
#define DEBUG_RETRY 0x100000
#ifdef DEBUG
extern int SVGA_DEBUG;

View File

@ -349,7 +349,7 @@ validate_sampler_resources(struct svga_context *svga)
assert(svga_have_vgpu10(svga));
for (shader = PIPE_SHADER_VERTEX; shader <= PIPE_SHADER_GEOMETRY; shader++) {
for (shader = PIPE_SHADER_VERTEX; shader <= PIPE_SHADER_COMPUTE; shader++) {
unsigned count = svga->curr.num_sampler_views[shader];
unsigned i;
struct svga_winsys_surface *surfaces[PIPE_MAX_SAMPLERS];
@ -379,7 +379,8 @@ validate_sampler_resources(struct svga_context *svga)
if (shader == PIPE_SHADER_FRAGMENT &&
svga->curr.rast->templ.poly_stipple_enable) {
const unsigned unit = svga->state.hw_draw.fs->pstipple_sampler_unit;
const unsigned unit =
svga_fs_variant(svga->state.hw_draw.fs)->pstipple_sampler_unit;
struct svga_pipe_sampler_view *sv =
svga->polygon_stipple.sampler_view;
@ -415,7 +416,7 @@ validate_constant_buffers(struct svga_context *svga)
assert(svga_have_vgpu10(svga));
for (shader = PIPE_SHADER_VERTEX; shader <= PIPE_SHADER_GEOMETRY; shader++) {
for (shader = PIPE_SHADER_VERTEX; shader <= PIPE_SHADER_COMPUTE; shader++) {
enum pipe_error ret;
struct svga_buffer *buffer;
struct svga_winsys_surface *handle;
@ -482,6 +483,8 @@ last_command_was_draw(const struct svga_context *svga)
case SVGA_3D_CMD_DX_DRAW_INSTANCED:
case SVGA_3D_CMD_DX_DRAW_INDEXED_INSTANCED:
case SVGA_3D_CMD_DX_DRAW_AUTO:
case SVGA_3D_CMD_DX_DRAW_INDEXED_INSTANCED_INDIRECT:
case SVGA_3D_CMD_DX_DRAW_INSTANCED_INDIRECT:
return true;
default:
return false;
@ -511,17 +514,51 @@ vertex_buffers_equal(unsigned count,
* Prepare the vertex buffers for a drawing command.
*/
static enum pipe_error
validate_vertex_buffers(struct svga_hwtnl *hwtnl)
validate_vertex_buffers(struct svga_hwtnl *hwtnl,
const struct pipe_stream_output_target *so_vertex_count)
{
struct svga_context *svga = hwtnl->svga;
struct pipe_resource *vbuffers[SVGA3D_INPUTREG_MAX];
struct svga_winsys_surface *vbuffer_handles[SVGA3D_INPUTREG_MAX];
const unsigned vbuf_count = hwtnl->cmd.vbuf_count;
struct svga_winsys_surface *so_vertex_count_handle;
const unsigned vbuf_count = so_vertex_count ? 1 : hwtnl->cmd.vbuf_count;
int last_vbuf = -1;
unsigned i;
assert(svga_have_vgpu10(svga));
/* Get handle for each referenced vertex buffer, unless we're using a
* stream-out buffer to specify the drawing information (DrawAuto).
*/
if (so_vertex_count) {
i = 0;
}
else {
for (i = 0; i < vbuf_count; i++) {
struct svga_buffer *sbuf =
svga_buffer(hwtnl->cmd.vbufs[i].buffer.resource);
if (sbuf) {
vbuffer_handles[i] = svga_buffer_handle(svga, &sbuf->b.b,
PIPE_BIND_VERTEX_BUFFER);
assert(sbuf->key.flags & SVGA3D_SURFACE_BIND_VERTEX_BUFFER);
if (vbuffer_handles[i] == NULL)
return PIPE_ERROR_OUT_OF_MEMORY;
vbuffers[i] = &sbuf->b.b;
last_vbuf = i;
}
else {
vbuffers[i] = NULL;
vbuffer_handles[i] = NULL;
}
}
}
for (; i < svga->state.hw_draw.num_vbuffers; i++) {
vbuffers[i] = NULL;
vbuffer_handles[i] = NULL;
}
/* Get handle for each referenced vertex buffer */
for (i = 0; i < vbuf_count; i++) {
struct svga_buffer *sbuf =
@ -558,14 +595,38 @@ validate_vertex_buffers(struct svga_hwtnl *hwtnl)
svga->state.hw_draw.layout_id = hwtnl->cmd.vdecl_layout_id;
}
/* Get handle for the stream out buffer */
if (so_vertex_count) {
so_vertex_count_handle = svga_buffer_handle(svga,
so_vertex_count->buffer,
(PIPE_BIND_VERTEX_BUFFER |
PIPE_BIND_STREAM_OUTPUT));
if (!so_vertex_count_handle)
return PIPE_ERROR_OUT_OF_MEMORY;
}
else {
so_vertex_count_handle = NULL;
}
/* setup vertex buffers */
{
SVGA3dVertexBuffer vbuffer_attrs[PIPE_MAX_ATTRIBS];
for (i = 0; i < vbuf_count; i++) {
vbuffer_attrs[i].stride = hwtnl->cmd.vbufs[i].stride;
vbuffer_attrs[i].offset = hwtnl->cmd.vbufs[i].buffer_offset;
vbuffer_attrs[i].sid = 0;
if (so_vertex_count) {
/* Set IA slot0 input buffer to the SO buffer */
assert(vbuf_count == 1);
vbuffer_attrs[0].stride = hwtnl->cmd.vbufs[0].stride;
vbuffer_attrs[0].offset = hwtnl->cmd.vbufs[0].buffer_offset;
vbuffer_attrs[0].sid = 0;
vbuffers[0] = so_vertex_count->buffer;
vbuffer_handles[0] = so_vertex_count_handle;
}
else {
for (i = 0; i < vbuf_count; i++) {
vbuffer_attrs[i].stride = hwtnl->cmd.vbufs[i].stride;
vbuffer_attrs[i].offset = hwtnl->cmd.vbufs[i].buffer_offset;
vbuffer_attrs[i].sid = 0;
}
}
/* If any of the vertex buffer state has changed, issue
@ -736,10 +797,14 @@ static enum pipe_error
draw_vgpu10(struct svga_hwtnl *hwtnl,
const SVGA3dPrimitiveRange *range,
unsigned vcount,
unsigned min_index, unsigned max_index,
struct pipe_resource *ib,
unsigned start_instance, unsigned instance_count)
unsigned start_instance, unsigned instance_count,
const struct pipe_draw_indirect_info *indirect,
const struct pipe_stream_output_target *so_vertex_count)
{
struct svga_context *svga = hwtnl->svga;
struct svga_winsys_surface *indirect_handle;
enum pipe_error ret;
assert(svga_have_vgpu10(svga));
@ -779,7 +844,7 @@ draw_vgpu10(struct svga_hwtnl *hwtnl,
if (ret != PIPE_OK)
return ret;
ret = validate_vertex_buffers(hwtnl);
ret = validate_vertex_buffers(hwtnl, so_vertex_count);
if (ret != PIPE_OK)
return ret;
@ -789,6 +854,16 @@ draw_vgpu10(struct svga_hwtnl *hwtnl,
return ret;
}
if (indirect) {
indirect_handle = svga_buffer_handle(svga, indirect->buffer,
PIPE_BIND_COMMAND_ARGS_BUFFER);
if (!indirect_handle)
return PIPE_ERROR_OUT_OF_MEMORY;
}
else {
indirect_handle = NULL;
}
/* Set primitive type (line, tri, etc) */
if (svga->state.hw_draw.topology != range->primType) {
ret = SVGA3D_vgpu10_SetTopology(svga->swc, range->primType);
@ -800,15 +875,18 @@ draw_vgpu10(struct svga_hwtnl *hwtnl,
if (ib) {
/* indexed drawing */
if (instance_count > 1) {
if (indirect) {
ret = SVGA3D_sm5_DrawIndexedInstancedIndirect(svga->swc,
indirect_handle,
indirect->offset);
}
else if (instance_count > 1) {
ret = SVGA3D_vgpu10_DrawIndexedInstanced(svga->swc,
vcount,
instance_count,
0, /* startIndexLocation */
range->indexBias,
start_instance);
if (ret != PIPE_OK)
return ret;
}
else {
/* non-instanced drawing */
@ -816,8 +894,9 @@ draw_vgpu10(struct svga_hwtnl *hwtnl,
vcount,
0, /* startIndexLocation */
range->indexBias);
if (ret != PIPE_OK)
return ret;
}
if (ret != PIPE_OK) {
return ret;
}
}
else {
@ -835,22 +914,30 @@ draw_vgpu10(struct svga_hwtnl *hwtnl,
assert(svga->state.hw_draw.ib == NULL);
if (instance_count > 1) {
if (so_vertex_count) {
/* Stream-output drawing */
ret = SVGA3D_vgpu10_DrawAuto(svga->swc);
}
else if (indirect) {
ret = SVGA3D_sm5_DrawInstancedIndirect(svga->swc,
indirect_handle,
indirect->offset);
}
else if (instance_count > 1) {
ret = SVGA3D_vgpu10_DrawInstanced(svga->swc,
vcount,
instance_count,
range->indexBias,
start_instance);
if (ret != PIPE_OK)
return ret;
}
else {
/* non-instanced */
ret = SVGA3D_vgpu10_Draw(svga->swc,
vcount,
range->indexBias);
if (ret != PIPE_OK)
return ret;
}
if (ret != PIPE_OK) {
return ret;
}
}
@ -1044,14 +1131,20 @@ check_draw_params(struct svga_hwtnl *hwtnl,
/**
* All drawing filters down into this function, either directly
* on the hardware path or after doing software vertex processing.
* \param indirect if non-null, get the vertex count, first vertex, etc.
* from a buffer.
* \param so_vertex_count if non-null, get the vertex count from a
* stream-output target.
*/
enum pipe_error
svga_hwtnl_prim(struct svga_hwtnl *hwtnl,
const SVGA3dPrimitiveRange * range,
const SVGA3dPrimitiveRange *range,
unsigned vcount,
unsigned min_index,
unsigned max_index, struct pipe_resource *ib,
unsigned start_instance, unsigned instance_count)
unsigned min_index, unsigned max_index,
struct pipe_resource *ib,
unsigned start_instance, unsigned instance_count,
const struct pipe_draw_indirect_info *indirect,
const struct pipe_stream_output_target *so_vertex_count)
{
enum pipe_error ret = PIPE_OK;
@ -1059,17 +1152,14 @@ svga_hwtnl_prim(struct svga_hwtnl *hwtnl,
if (svga_have_vgpu10(hwtnl->svga)) {
/* draw immediately */
ret = draw_vgpu10(hwtnl, range, vcount, ib,
start_instance, instance_count);
if (ret != PIPE_OK) {
svga_context_flush(hwtnl->svga, NULL);
ret = draw_vgpu10(hwtnl, range, vcount, ib,
start_instance, instance_count);
assert(ret == PIPE_OK);
}
SVGA_RETRY(hwtnl->svga, draw_vgpu10(hwtnl, range, vcount, min_index,
max_index, ib, start_instance,
instance_count, indirect,
so_vertex_count));
}
else {
/* batch up drawing commands */
assert(indirect == NULL);
#ifdef DEBUG
check_draw_params(hwtnl, range, min_index, max_index, ib);
assert(start_instance == 0);

View File

@ -60,7 +60,8 @@ svga_hwtnl_vertex_buffers(struct svga_hwtnl *hwtnl,
enum pipe_error
svga_hwtnl_draw_arrays(struct svga_hwtnl *hwtnl,
enum pipe_prim_type prim, unsigned start, unsigned count,
unsigned start_instance, unsigned instance_count);
unsigned start_instance, unsigned instance_count,
ubyte vertices_per_patch);
enum pipe_error
svga_hwtnl_draw_range_elements(struct svga_hwtnl *hwtnl,

View File

@ -175,13 +175,14 @@ done:
static enum pipe_error
simple_draw_arrays(struct svga_hwtnl *hwtnl,
enum pipe_prim_type prim, unsigned start, unsigned count,
unsigned start_instance, unsigned instance_count)
unsigned start_instance, unsigned instance_count,
ubyte vertices_per_patch)
{
SVGA3dPrimitiveRange range;
unsigned hw_prim;
unsigned hw_count;
hw_prim = svga_translate_prim(prim, count, &hw_count);
hw_prim = svga_translate_prim(prim, count, &hw_count, vertices_per_patch);
if (hw_count == 0)
return PIPE_ERROR_BAD_INPUT;
@ -200,14 +201,16 @@ simple_draw_arrays(struct svga_hwtnl *hwtnl,
*/
return svga_hwtnl_prim(hwtnl, &range, count,
0, count - 1, NULL,
start_instance, instance_count);
start_instance, instance_count,
NULL, NULL);
}
enum pipe_error
svga_hwtnl_draw_arrays(struct svga_hwtnl *hwtnl,
enum pipe_prim_type prim, unsigned start, unsigned count,
unsigned start_instance, unsigned instance_count)
unsigned start_instance, unsigned instance_count,
ubyte vertices_per_patch)
{
enum pipe_prim_type gen_prim;
unsigned gen_size, gen_nr;
@ -225,7 +228,7 @@ svga_hwtnl_draw_arrays(struct svga_hwtnl *hwtnl,
}
if (svga->curr.rast->templ.flatshade &&
svga->state.hw_draw.fs->constant_color_output) {
svga_fs_variant(svga->state.hw_draw.fs)->constant_color_output) {
/* The fragment color is a constant, not per-vertex so the whole
* primitive will be the same color (except for possible blending).
* We can ignore the current provoking vertex state and use whatever
@ -273,7 +276,8 @@ svga_hwtnl_draw_arrays(struct svga_hwtnl *hwtnl,
if (gen_type == U_GENERATE_LINEAR) {
ret = simple_draw_arrays(hwtnl, gen_prim, start, count,
start_instance, instance_count);
start_instance, instance_count,
vertices_per_patch);
}
else {
struct pipe_resource *gen_buf = NULL;
@ -299,7 +303,8 @@ svga_hwtnl_draw_arrays(struct svga_hwtnl *hwtnl,
count - 1,
gen_prim, 0, gen_nr,
start_instance,
instance_count);
instance_count,
vertices_per_patch);
}
if (gen_buf) {

View File

@ -186,14 +186,15 @@ svga_hwtnl_simple_draw_range_elements(struct svga_hwtnl *hwtnl,
enum pipe_prim_type prim, unsigned start,
unsigned count,
unsigned start_instance,
unsigned instance_count)
unsigned instance_count,
ubyte vertices_per_patch)
{
SVGA3dPrimitiveRange range;
unsigned hw_prim;
unsigned hw_count;
unsigned index_offset = start * index_size;
hw_prim = svga_translate_prim(prim, count, &hw_count);
hw_prim = svga_translate_prim(prim, count, &hw_count, vertices_per_patch);
if (hw_count == 0)
return PIPE_OK; /* nothing to draw */
@ -206,7 +207,8 @@ svga_hwtnl_simple_draw_range_elements(struct svga_hwtnl *hwtnl,
return svga_hwtnl_prim(hwtnl, &range, count,
min_index, max_index, index_buffer,
start_instance, instance_count);
start_instance, instance_count,
NULL, NULL);
}
@ -234,12 +236,20 @@ svga_hwtnl_draw_range_elements(struct svga_hwtnl *hwtnl,
&gen_size, &gen_nr, &gen_func);
}
else {
unsigned hw_pv;
/* There is no geometry ordering with PATCH, so no need to
* consider provoking vertex mode for the translation.
* So use the same api_pv as the hw_pv.
*/
hw_pv = info->mode == PIPE_PRIM_PATCHES ? hwtnl->api_pv :
hwtnl->hw_pv;
gen_type = u_index_translator(svga_hw_prims,
info->mode,
info->index_size,
count,
hwtnl->api_pv,
hwtnl->hw_pv,
hw_pv,
PR_DISABLE,
&gen_prim, &gen_size, &gen_nr, &gen_func);
}
@ -271,7 +281,8 @@ svga_hwtnl_draw_range_elements(struct svga_hwtnl *hwtnl,
info->max_index,
gen_prim, index_offset, count,
info->start_instance,
info->instance_count);
info->instance_count,
info->vertices_per_patch);
pipe_resource_reference(&index_buffer, NULL);
}
else {
@ -299,7 +310,8 @@ svga_hwtnl_draw_range_elements(struct svga_hwtnl *hwtnl,
gen_prim, gen_offset,
gen_nr,
info->start_instance,
info->instance_count);
info->instance_count,
info->vertices_per_patch);
}
if (gen_buf) {

View File

@ -52,7 +52,8 @@ static const unsigned svga_hw_prims =
(1 << PIPE_PRIM_LINES_ADJACENCY) |
(1 << PIPE_PRIM_LINE_STRIP_ADJACENCY) |
(1 << PIPE_PRIM_TRIANGLES_ADJACENCY) |
(1 << PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY));
(1 << PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY) |
(1 << PIPE_PRIM_PATCHES));
/**
@ -64,7 +65,8 @@ static const unsigned svga_hw_prims =
* those to other types of primitives with index/translation code.
*/
static inline SVGA3dPrimitiveType
svga_translate_prim(unsigned mode, unsigned vcount, unsigned *prim_count)
svga_translate_prim(unsigned mode, unsigned vcount, unsigned *prim_count,
ubyte vertices_per_patch)
{
switch (mode) {
case PIPE_PRIM_POINTS:
@ -107,6 +109,13 @@ svga_translate_prim(unsigned mode, unsigned vcount, unsigned *prim_count)
*prim_count = vcount / 2 - 2 ;
return SVGA3D_PRIMITIVE_TRIANGLESTRIP_ADJ;
case PIPE_PRIM_PATCHES:
*prim_count = vcount / vertices_per_patch ;
assert(vertices_per_patch >= 1);
assert(vertices_per_patch <= 32);
return (SVGA3D_PRIMITIVE_1_CONTROL_POINT_PATCH - 1)
+ vertices_per_patch;
default:
assert(0);
*prim_count = 0;
@ -218,7 +227,9 @@ svga_hwtnl_prim(struct svga_hwtnl *hwtnl,
unsigned min_index,
unsigned max_index,
struct pipe_resource *ib,
unsigned start_instance, unsigned instance_count);
unsigned start_instance, unsigned instance_count,
const struct pipe_draw_indirect_info *indirect,
const struct pipe_stream_output_target *so_vertex_count);
enum pipe_error
svga_hwtnl_simple_draw_range_elements(struct svga_hwtnl *hwtnl,
@ -231,6 +242,7 @@ svga_hwtnl_simple_draw_range_elements(struct svga_hwtnl *hwtnl,
unsigned start,
unsigned count,
unsigned start_instance,
unsigned instance_count);
unsigned instance_count,
ubyte vertices_per_patch);
#endif

View File

@ -71,10 +71,10 @@ static const struct vgpu10_format_entry format_conversion_table[] =
[ PIPE_FORMAT_Z32_FLOAT ] = { SVGA3D_FORMAT_INVALID, SVGA3D_D32_FLOAT, SVGA3D_D32_FLOAT, 0 },
[ PIPE_FORMAT_Z24_UNORM_S8_UINT ] = { SVGA3D_FORMAT_INVALID, SVGA3D_D24_UNORM_S8_UINT, SVGA3D_D24_UNORM_S8_UINT, 0 },
[ PIPE_FORMAT_Z24X8_UNORM ] = { SVGA3D_FORMAT_INVALID, SVGA3D_D24_UNORM_S8_UINT, SVGA3D_D24_UNORM_S8_UINT, 0 },
[ PIPE_FORMAT_R32_FLOAT ] = { SVGA3D_R32_FLOAT, SVGA3D_R32_FLOAT, SVGA3D_R32_FLOAT, TF_GEN_MIPS },
[ PIPE_FORMAT_R32G32_FLOAT ] = { SVGA3D_R32G32_FLOAT, SVGA3D_R32G32_FLOAT, SVGA3D_R32G32_FLOAT, TF_GEN_MIPS },
[ PIPE_FORMAT_R32_FLOAT ] = { SVGA3D_R32_FLOAT, SVGA3D_R32_FLOAT, SVGA3D_R32_FLOAT, TF_GEN_MIPS },
[ PIPE_FORMAT_R32G32_FLOAT ] = { SVGA3D_R32G32_FLOAT, SVGA3D_R32G32_FLOAT, SVGA3D_R32G32_FLOAT, TF_GEN_MIPS },
[ PIPE_FORMAT_R32G32B32_FLOAT ] = { SVGA3D_R32G32B32_FLOAT, SVGA3D_R32G32B32_FLOAT, SVGA3D_R32G32B32_FLOAT, TF_GEN_MIPS },
[ PIPE_FORMAT_R32G32B32A32_FLOAT ] = { SVGA3D_R32G32B32A32_FLOAT, SVGA3D_R32G32B32A32_FLOAT, SVGA3D_R32G32B32A32_FLOAT, TF_GEN_MIPS },
[ PIPE_FORMAT_R32G32B32A32_FLOAT ] = { SVGA3D_R32G32B32A32_FLOAT, SVGA3D_R32G32B32A32_FLOAT, SVGA3D_R32G32B32A32_FLOAT, TF_GEN_MIPS },
[ PIPE_FORMAT_R32_USCALED ] = { SVGA3D_R32_UINT, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, VF_U_TO_F_CAST },
[ PIPE_FORMAT_R32G32_USCALED ] = { SVGA3D_R32G32_UINT, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, VF_U_TO_F_CAST },
[ PIPE_FORMAT_R32G32B32_USCALED ] = { SVGA3D_R32G32B32_UINT, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, VF_U_TO_F_CAST },
@ -176,11 +176,11 @@ static const struct vgpu10_format_entry format_conversion_table[] =
[ PIPE_FORMAT_R16G16B16A16_SINT ] = { SVGA3D_R16G16B16A16_SINT, SVGA3D_R16G16B16A16_SINT, SVGA3D_R16G16B16A16_SINT, 0 },
[ PIPE_FORMAT_R32_UINT ] = { SVGA3D_R32_UINT, SVGA3D_R32_UINT, SVGA3D_R32_UINT, 0 },
[ PIPE_FORMAT_R32G32_UINT ] = { SVGA3D_R32G32_UINT, SVGA3D_R32G32_UINT, SVGA3D_R32G32_UINT, 0 },
[ PIPE_FORMAT_R32G32B32_UINT ] = { SVGA3D_R32G32B32_UINT, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
[ PIPE_FORMAT_R32G32B32_UINT ] = { SVGA3D_R32G32B32_UINT, SVGA3D_R32G32B32_UINT, SVGA3D_R32G32B32_UINT, 0 },
[ PIPE_FORMAT_R32G32B32A32_UINT ] = { SVGA3D_R32G32B32A32_UINT, SVGA3D_R32G32B32A32_UINT, SVGA3D_R32G32B32A32_UINT, 0 },
[ PIPE_FORMAT_R32_SINT ] = { SVGA3D_R32_SINT, SVGA3D_R32_SINT, SVGA3D_R32_SINT, 0 },
[ PIPE_FORMAT_R32G32_SINT ] = { SVGA3D_R32G32_SINT, SVGA3D_R32G32_SINT, SVGA3D_R32G32_SINT, 0 },
[ PIPE_FORMAT_R32G32B32_SINT ] = { SVGA3D_R32G32B32_SINT, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
[ PIPE_FORMAT_R32G32B32_SINT ] = { SVGA3D_R32G32B32_SINT, SVGA3D_R32G32B32_SINT, SVGA3D_R32G32B32_SINT, 0 },
[ PIPE_FORMAT_R32G32B32A32_SINT ] = { SVGA3D_R32G32B32A32_SINT, SVGA3D_R32G32B32A32_SINT, SVGA3D_R32G32B32A32_SINT, 0 },
[ PIPE_FORMAT_A8_UINT ] = { SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, SVGA3D_R8_UINT, TF_000X },
[ PIPE_FORMAT_I8_UINT ] = { SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, SVGA3D_R8_UINT, TF_XXXX },
@ -2137,7 +2137,7 @@ svga_is_format_supported(struct pipe_screen *screen,
}
if (util_format_is_srgb(format) &&
(bindings & PIPE_BIND_DISPLAY_TARGET)) {
(bindings & (PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_RENDER_TARGET))) {
/* We only support sRGB rendering with vgpu10 */
return false;
}
@ -2252,6 +2252,12 @@ svga_is_dx_format_supported(struct pipe_screen *screen,
return svga_format != SVGA3D_FORMAT_INVALID;
}
if (bindings & PIPE_BIND_SAMPLER_VIEW && target == PIPE_BUFFER) {
unsigned flags;
svga_translate_texture_buffer_view_format(format, &svga_format, &flags);
return svga_format != SVGA3D_FORMAT_INVALID;
}
svga_format = svga_translate_format(ss, format, bindings);
if (svga_format == SVGA3D_FORMAT_INVALID) {
return false;

View File

@ -87,6 +87,15 @@ svga_link_shaders(const struct tgsi_shader_info *outshader_info,
}
}
/* Find the index for position */
linkage->position_index = 0;
for (i = 0; i < outshader_info->num_outputs; i++) {
if (outshader_info->output_semantic_name[i] == TGSI_SEMANTIC_POSITION) {
linkage->position_index = i;
break;
}
}
linkage->num_inputs = inshader_info->num_inputs;
/* Things like the front-face register are handled here */
@ -100,7 +109,8 @@ svga_link_shaders(const struct tgsi_shader_info *outshader_info,
/* Debug */
if (SVGA_DEBUG & DEBUG_TGSI) {
unsigned reg = 0;
uint64_t reg = 0;
uint64_t one = 1;
debug_printf("### linkage info: num_inputs=%d input_map_max=%d\n",
linkage->num_inputs, linkage->input_map_max);
@ -116,10 +126,8 @@ svga_link_shaders(const struct tgsi_shader_info *outshader_info,
tgsi_interpolate_names[inshader_info->input_interpolate[i]]);
/* make sure no repeating register index */
if (reg & 1 << linkage->input_map[i]) {
assert(0);
}
reg |= 1 << linkage->input_map[i];
assert((reg & (one << linkage->input_map[i])) == 0);
reg |= one << linkage->input_map[i];
}
}
}

View File

@ -9,6 +9,7 @@ struct svga_context;
struct shader_linkage
{
unsigned num_inputs;
unsigned position_index; /* position register index */
unsigned input_map_max; /* highest index of mapped inputs */
ubyte input_map[PIPE_MAX_SHADER_INPUTS];
};

View File

@ -91,6 +91,51 @@ svga_translate_blend_func(unsigned mode)
}
/**
* Translate gallium logicop mode to SVGA3D logicop mode.
*/
static int
translate_logicop(enum pipe_logicop op)
{
switch (op) {
case PIPE_LOGICOP_CLEAR:
return SVGA3D_DX11_LOGICOP_CLEAR;
case PIPE_LOGICOP_NOR:
return SVGA3D_DX11_LOGICOP_NOR;
case PIPE_LOGICOP_AND_INVERTED:
return SVGA3D_DX11_LOGICOP_AND_INVERTED;
case PIPE_LOGICOP_COPY_INVERTED:
return SVGA3D_DX11_LOGICOP_COPY_INVERTED;
case PIPE_LOGICOP_AND_REVERSE:
return SVGA3D_DX11_LOGICOP_AND_REVERSE;
case PIPE_LOGICOP_INVERT:
return SVGA3D_DX11_LOGICOP_INVERT;
case PIPE_LOGICOP_XOR:
return SVGA3D_DX11_LOGICOP_XOR;
case PIPE_LOGICOP_NAND:
return SVGA3D_DX11_LOGICOP_NAND;
case PIPE_LOGICOP_AND:
return SVGA3D_DX11_LOGICOP_AND;
case PIPE_LOGICOP_EQUIV:
return SVGA3D_DX11_LOGICOP_EQUIV;
case PIPE_LOGICOP_NOOP:
return SVGA3D_DX11_LOGICOP_NOOP;
case PIPE_LOGICOP_OR_INVERTED:
return SVGA3D_DX11_LOGICOP_OR_INVERTED;
case PIPE_LOGICOP_COPY:
return SVGA3D_DX11_LOGICOP_COPY;
case PIPE_LOGICOP_OR_REVERSE:
return SVGA3D_DX11_LOGICOP_OR_REVERSE;
case PIPE_LOGICOP_OR:
return SVGA3D_DX11_LOGICOP_OR;
case PIPE_LOGICOP_SET:
return SVGA3D_DX11_LOGICOP_SET;
default:
return SVGA3D_DX11_LOGICOP_COPY;
}
};
/**
* Define a vgpu10 blend state object for the given
* svga blend state.
@ -100,7 +145,6 @@ define_blend_state_object(struct svga_context *svga,
struct svga_blend_state *bs)
{
SVGA3dDXBlendStatePerRT perRT[SVGA3D_MAX_RENDER_TARGETS];
unsigned try;
int i;
assert(svga_have_vgpu10(svga));
@ -116,31 +160,141 @@ define_blend_state_object(struct svga_context *svga,
perRT[i].destBlendAlpha = bs->rt[i].dstblend_alpha;
perRT[i].blendOpAlpha = bs->rt[i].blendeq_alpha;
perRT[i].renderTargetWriteMask = bs->rt[i].writemask;
perRT[i].logicOpEnable = 0;
perRT[i].logicOp = SVGA3D_LOGICOP_COPY;
perRT[i].logicOpEnable = bs->logicop_enabled;
perRT[i].logicOp = bs->logicop_mode;
}
/* Loop in case command buffer is full and we need to flush and retry */
for (try = 0; try < 2; try++) {
enum pipe_error ret;
SVGA_RETRY(svga, SVGA3D_vgpu10_DefineBlendState(svga->swc,
bs->id,
bs->alpha_to_coverage,
bs->independent_blend_enable,
perRT));
}
ret = SVGA3D_vgpu10_DefineBlendState(svga->swc,
bs->id,
bs->alpha_to_coverage,
bs->independent_blend_enable,
perRT);
if (ret == PIPE_OK)
return;
svga_context_flush(svga, NULL);
/**
* If SVGA3D_DEVCAP_LOGIC_BLENDOPS is false, we can't directly implement
* GL's logicops. But we can emulate some of them. We set up the blending
* state for that here.
*/
static void
emulate_logicop(struct svga_context *svga,
unsigned logicop_func,
struct svga_blend_state *blend,
unsigned buffer)
{
switch (logicop_func) {
case PIPE_LOGICOP_XOR:
case PIPE_LOGICOP_INVERT:
blend->need_white_fragments = TRUE;
blend->rt[buffer].blend_enable = TRUE;
blend->rt[buffer].srcblend = SVGA3D_BLENDOP_ONE;
blend->rt[buffer].dstblend = SVGA3D_BLENDOP_ONE;
blend->rt[buffer].blendeq = SVGA3D_BLENDEQ_SUBTRACT;
break;
case PIPE_LOGICOP_CLEAR:
blend->rt[buffer].blend_enable = TRUE;
blend->rt[buffer].srcblend = SVGA3D_BLENDOP_ZERO;
blend->rt[buffer].dstblend = SVGA3D_BLENDOP_ZERO;
blend->rt[buffer].blendeq = SVGA3D_BLENDEQ_MINIMUM;
break;
case PIPE_LOGICOP_COPY:
blend->rt[buffer].blend_enable = FALSE;
blend->rt[buffer].srcblend = SVGA3D_BLENDOP_ONE;
blend->rt[buffer].dstblend = SVGA3D_BLENDOP_ZERO;
blend->rt[buffer].blendeq = SVGA3D_BLENDEQ_ADD;
break;
case PIPE_LOGICOP_COPY_INVERTED:
blend->rt[buffer].blend_enable = TRUE;
blend->rt[buffer].srcblend = SVGA3D_BLENDOP_INVSRCCOLOR;
blend->rt[buffer].dstblend = SVGA3D_BLENDOP_ZERO;
blend->rt[buffer].blendeq = SVGA3D_BLENDEQ_ADD;
break;
case PIPE_LOGICOP_NOOP:
blend->rt[buffer].blend_enable = TRUE;
blend->rt[buffer].srcblend = SVGA3D_BLENDOP_ZERO;
blend->rt[buffer].dstblend = SVGA3D_BLENDOP_DESTCOLOR;
blend->rt[buffer].blendeq = SVGA3D_BLENDEQ_ADD;
break;
case PIPE_LOGICOP_SET:
blend->rt[buffer].blend_enable = TRUE;
blend->rt[buffer].srcblend = SVGA3D_BLENDOP_ONE;
blend->rt[buffer].dstblend = SVGA3D_BLENDOP_ONE;
blend->rt[buffer].blendeq = SVGA3D_BLENDEQ_MAXIMUM;
break;
case PIPE_LOGICOP_AND:
/* Approximate with minimum - works for the 0 & anything case: */
blend->rt[buffer].blend_enable = TRUE;
blend->rt[buffer].srcblend = SVGA3D_BLENDOP_SRCCOLOR;
blend->rt[buffer].dstblend = SVGA3D_BLENDOP_DESTCOLOR;
blend->rt[buffer].blendeq = SVGA3D_BLENDEQ_MINIMUM;
break;
case PIPE_LOGICOP_AND_REVERSE:
blend->rt[buffer].blend_enable = TRUE;
blend->rt[buffer].srcblend = SVGA3D_BLENDOP_SRCCOLOR;
blend->rt[buffer].dstblend = SVGA3D_BLENDOP_INVDESTCOLOR;
blend->rt[buffer].blendeq = SVGA3D_BLENDEQ_MINIMUM;
break;
case PIPE_LOGICOP_AND_INVERTED:
blend->rt[buffer].blend_enable = TRUE;
blend->rt[buffer].srcblend = SVGA3D_BLENDOP_INVSRCCOLOR;
blend->rt[buffer].dstblend = SVGA3D_BLENDOP_DESTCOLOR;
blend->rt[buffer].blendeq = SVGA3D_BLENDEQ_MINIMUM;
break;
case PIPE_LOGICOP_OR:
/* Approximate with maximum - works for the 1 | anything case: */
blend->rt[buffer].blend_enable = TRUE;
blend->rt[buffer].srcblend = SVGA3D_BLENDOP_SRCCOLOR;
blend->rt[buffer].dstblend = SVGA3D_BLENDOP_DESTCOLOR;
blend->rt[buffer].blendeq = SVGA3D_BLENDEQ_MAXIMUM;
break;
case PIPE_LOGICOP_OR_REVERSE:
blend->rt[buffer].blend_enable = TRUE;
blend->rt[buffer].srcblend = SVGA3D_BLENDOP_SRCCOLOR;
blend->rt[buffer].dstblend = SVGA3D_BLENDOP_INVDESTCOLOR;
blend->rt[buffer].blendeq = SVGA3D_BLENDEQ_MAXIMUM;
break;
case PIPE_LOGICOP_OR_INVERTED:
blend->rt[buffer].blend_enable = TRUE;
blend->rt[buffer].srcblend = SVGA3D_BLENDOP_INVSRCCOLOR;
blend->rt[buffer].dstblend = SVGA3D_BLENDOP_DESTCOLOR;
blend->rt[buffer].blendeq = SVGA3D_BLENDEQ_MAXIMUM;
break;
case PIPE_LOGICOP_NAND:
case PIPE_LOGICOP_NOR:
case PIPE_LOGICOP_EQUIV:
/* Fill these in with plausible values */
blend->rt[buffer].blend_enable = FALSE;
blend->rt[buffer].srcblend = SVGA3D_BLENDOP_ONE;
blend->rt[buffer].dstblend = SVGA3D_BLENDOP_ZERO;
blend->rt[buffer].blendeq = SVGA3D_BLENDEQ_ADD;
break;
default:
assert(0);
break;
}
blend->rt[buffer].srcblend_alpha = blend->rt[buffer].srcblend;
blend->rt[buffer].dstblend_alpha = blend->rt[buffer].dstblend;
blend->rt[buffer].blendeq_alpha = blend->rt[buffer].blendeq;
if (logicop_func == PIPE_LOGICOP_XOR) {
pipe_debug_message(&svga->debug.callback, CONFORMANCE,
"XOR logicop mode has limited support");
}
else if (logicop_func != PIPE_LOGICOP_COPY) {
pipe_debug_message(&svga->debug.callback, CONFORMANCE,
"general logicops are not supported");
}
}
static void *
svga_create_blend_state(struct pipe_context *pipe,
const struct pipe_blend_state *templ)
{
struct svga_context *svga = svga_context(pipe);
struct svga_screen *ss = svga_screen(pipe->screen);
struct svga_blend_state *blend = CALLOC_STRUCT( svga_blend_state );
unsigned i;
@ -166,107 +320,18 @@ svga_create_blend_state(struct pipe_context *pipe,
* top of D3D9 API. Instead we try to simulate with various blend modes.
*/
if (templ->logicop_enable) {
switch (templ->logicop_func) {
case PIPE_LOGICOP_XOR:
case PIPE_LOGICOP_INVERT:
blend->need_white_fragments = TRUE;
blend->rt[i].blend_enable = TRUE;
blend->rt[i].srcblend = SVGA3D_BLENDOP_ONE;
blend->rt[i].dstblend = SVGA3D_BLENDOP_ONE;
blend->rt[i].blendeq = SVGA3D_BLENDEQ_SUBTRACT;
break;
case PIPE_LOGICOP_CLEAR:
blend->rt[i].blend_enable = TRUE;
blend->rt[i].srcblend = SVGA3D_BLENDOP_ZERO;
blend->rt[i].dstblend = SVGA3D_BLENDOP_ZERO;
blend->rt[i].blendeq = SVGA3D_BLENDEQ_MINIMUM;
break;
case PIPE_LOGICOP_COPY:
blend->rt[i].blend_enable = FALSE;
blend->rt[i].srcblend = SVGA3D_BLENDOP_ONE;
blend->rt[i].dstblend = SVGA3D_BLENDOP_ZERO;
blend->rt[i].blendeq = SVGA3D_BLENDEQ_ADD;
break;
case PIPE_LOGICOP_COPY_INVERTED:
blend->rt[i].blend_enable = TRUE;
blend->rt[i].srcblend = SVGA3D_BLENDOP_INVSRCCOLOR;
blend->rt[i].dstblend = SVGA3D_BLENDOP_ZERO;
blend->rt[i].blendeq = SVGA3D_BLENDEQ_ADD;
break;
case PIPE_LOGICOP_NOOP:
blend->rt[i].blend_enable = TRUE;
blend->rt[i].srcblend = SVGA3D_BLENDOP_ZERO;
blend->rt[i].dstblend = SVGA3D_BLENDOP_DESTCOLOR;
blend->rt[i].blendeq = SVGA3D_BLENDEQ_ADD;
break;
case PIPE_LOGICOP_SET:
blend->rt[i].blend_enable = TRUE;
blend->rt[i].srcblend = SVGA3D_BLENDOP_ONE;
blend->rt[i].dstblend = SVGA3D_BLENDOP_ONE;
blend->rt[i].blendeq = SVGA3D_BLENDEQ_MAXIMUM;
break;
case PIPE_LOGICOP_AND:
/* Approximate with minimum - works for the 0 & anything case: */
blend->rt[i].blend_enable = TRUE;
blend->rt[i].srcblend = SVGA3D_BLENDOP_SRCCOLOR;
blend->rt[i].dstblend = SVGA3D_BLENDOP_DESTCOLOR;
blend->rt[i].blendeq = SVGA3D_BLENDEQ_MINIMUM;
break;
case PIPE_LOGICOP_AND_REVERSE:
blend->rt[i].blend_enable = TRUE;
blend->rt[i].srcblend = SVGA3D_BLENDOP_SRCCOLOR;
blend->rt[i].dstblend = SVGA3D_BLENDOP_INVDESTCOLOR;
blend->rt[i].blendeq = SVGA3D_BLENDEQ_MINIMUM;
break;
case PIPE_LOGICOP_AND_INVERTED:
blend->rt[i].blend_enable = TRUE;
blend->rt[i].srcblend = SVGA3D_BLENDOP_INVSRCCOLOR;
blend->rt[i].dstblend = SVGA3D_BLENDOP_DESTCOLOR;
blend->rt[i].blendeq = SVGA3D_BLENDEQ_MINIMUM;
break;
case PIPE_LOGICOP_OR:
/* Approximate with maximum - works for the 1 | anything case: */
blend->rt[i].blend_enable = TRUE;
blend->rt[i].srcblend = SVGA3D_BLENDOP_SRCCOLOR;
blend->rt[i].dstblend = SVGA3D_BLENDOP_DESTCOLOR;
blend->rt[i].blendeq = SVGA3D_BLENDEQ_MAXIMUM;
break;
case PIPE_LOGICOP_OR_REVERSE:
blend->rt[i].blend_enable = TRUE;
blend->rt[i].srcblend = SVGA3D_BLENDOP_SRCCOLOR;
blend->rt[i].dstblend = SVGA3D_BLENDOP_INVDESTCOLOR;
blend->rt[i].blendeq = SVGA3D_BLENDEQ_MAXIMUM;
break;
case PIPE_LOGICOP_OR_INVERTED:
blend->rt[i].blend_enable = TRUE;
blend->rt[i].srcblend = SVGA3D_BLENDOP_INVSRCCOLOR;
blend->rt[i].dstblend = SVGA3D_BLENDOP_DESTCOLOR;
blend->rt[i].blendeq = SVGA3D_BLENDEQ_MAXIMUM;
break;
case PIPE_LOGICOP_NAND:
case PIPE_LOGICOP_NOR:
case PIPE_LOGICOP_EQUIV:
/* Fill these in with plausible values */
blend->rt[i].blend_enable = FALSE;
blend->rt[i].srcblend = SVGA3D_BLENDOP_ONE;
blend->rt[i].dstblend = SVGA3D_BLENDOP_ZERO;
blend->rt[i].blendeq = SVGA3D_BLENDEQ_ADD;
break;
default:
assert(0);
break;
if (ss->haveBlendLogicops) {
blend->logicop_enabled = TRUE;
blend->logicop_mode = translate_logicop(templ->logicop_func);
blend->rt[i].blendeq = SVGA3D_BLENDEQ_ADD;
blend->rt[i].blendeq_alpha = SVGA3D_BLENDEQ_ADD;
blend->rt[i].srcblend = SVGA3D_BLENDOP_ZERO;
blend->rt[i].dstblend = SVGA3D_BLENDOP_ZERO;
blend->rt[i].srcblend_alpha = SVGA3D_BLENDOP_ZERO;
blend->rt[i].dstblend_alpha = SVGA3D_BLENDOP_ZERO;
}
blend->rt[i].srcblend_alpha = blend->rt[i].srcblend;
blend->rt[i].dstblend_alpha = blend->rt[i].dstblend;
blend->rt[i].blendeq_alpha = blend->rt[i].blendeq;
if (templ->logicop_func == PIPE_LOGICOP_XOR) {
pipe_debug_message(&svga->debug.callback, CONFORMANCE,
"XOR logicop mode has limited support");
}
else if (templ->logicop_func != PIPE_LOGICOP_COPY) {
pipe_debug_message(&svga->debug.callback, CONFORMANCE,
"general logicops are not supported");
else {
emulate_logicop(svga, templ->logicop_func, blend, i);
}
}
else {
@ -374,14 +439,7 @@ static void svga_delete_blend_state(struct pipe_context *pipe,
(struct svga_blend_state *) blend;
if (svga_have_vgpu10(svga) && bs->id != SVGA3D_INVALID_ID) {
enum pipe_error ret;
ret = SVGA3D_vgpu10_DestroyBlendState(svga->swc, bs->id);
if (ret != PIPE_OK) {
svga_context_flush(svga, NULL);
ret = SVGA3D_vgpu10_DestroyBlendState(svga->swc, bs->id);
assert(ret == PIPE_OK);
}
SVGA_RETRY(svga, SVGA3D_vgpu10_DestroyBlendState(svga->swc, bs->id));
if (bs->id == svga->state.hw_draw.blend_id)
svga->state.hw_draw.blend_id = SVGA3D_INVALID_ID;

View File

@ -80,7 +80,6 @@ intra_surface_copy(struct svga_context *svga, struct pipe_resource *tex,
unsigned dst_x, unsigned dst_y, unsigned dst_z,
unsigned width, unsigned height, unsigned depth)
{
enum pipe_error ret;
SVGA3dCopyBox box;
struct svga_texture *stex;
@ -102,15 +101,8 @@ intra_surface_copy(struct svga_context *svga, struct pipe_resource *tex,
box.srcy = src_y;
box.srcz = src_z;
ret = SVGA3D_vgpu10_IntraSurfaceCopy(svga->swc,
stex->handle, level, layer_face, &box);
if (ret != PIPE_OK) {
svga_context_flush(svga, NULL);
ret = SVGA3D_vgpu10_IntraSurfaceCopy(svga->swc,
stex->handle, level, layer_face, &box);
assert(ret == PIPE_OK);
}
SVGA_RETRY(svga, SVGA3D_vgpu10_IntraSurfaceCopy(svga->swc, stex->handle,
level, layer_face, &box));
/* Mark the texture subresource as rendered-to. */
svga_set_texture_rendered_to(stex, layer_face, level);
}
@ -630,11 +622,13 @@ try_blit(struct svga_context *svga, const struct pipe_blit_info *blit_info)
util_blitter_save_vertex_elements(svga->blitter, (void*)svga->curr.velems);
util_blitter_save_vertex_shader(svga->blitter, svga->curr.vs);
util_blitter_save_geometry_shader(svga->blitter, svga->curr.user_gs);
util_blitter_save_tessctrl_shader(svga->blitter, svga->curr.tcs);
util_blitter_save_tesseval_shader(svga->blitter, svga->curr.tes);
util_blitter_save_so_targets(svga->blitter, svga->num_so_targets,
(struct pipe_stream_output_target**)svga->so_targets);
util_blitter_save_rasterizer(svga->blitter, (void*)svga->curr.rast);
util_blitter_save_viewport(svga->blitter, &svga->curr.viewport);
util_blitter_save_scissor(svga->blitter, &svga->curr.scissor);
util_blitter_save_viewport(svga->blitter, &svga->curr.viewport[0]);
util_blitter_save_scissor(svga->blitter, &svga->curr.scissor[0]);
util_blitter_save_fragment_shader(svga->blitter, svga->curr.fs);
util_blitter_save_blend(svga->blitter, (void*)svga->curr.blend);
util_blitter_save_depth_stencil_alpha(svga->blitter,
@ -835,7 +829,6 @@ svga_resource_copy_region(struct pipe_context *pipe,
if (dst_tex->target == PIPE_BUFFER && src_tex->target == PIPE_BUFFER) {
/* can't copy within the same buffer, unfortunately */
if (svga_have_vgpu10(svga) && src_tex != dst_tex) {
enum pipe_error ret;
struct svga_winsys_surface *src_surf;
struct svga_winsys_surface *dst_surf;
struct svga_buffer *dbuffer = svga_buffer(dst_tex);
@ -844,15 +837,9 @@ svga_resource_copy_region(struct pipe_context *pipe,
src_surf = svga_buffer_handle(svga, src_tex, sbuffer->bind_flags);
dst_surf = svga_buffer_handle(svga, dst_tex, dbuffer->bind_flags);
ret = SVGA3D_vgpu10_BufferCopy(svga->swc, src_surf, dst_surf,
src_box->x, dstx, src_box->width);
if (ret != PIPE_OK) {
svga_context_flush(svga, NULL);
ret = SVGA3D_vgpu10_BufferCopy(svga->swc, src_surf, dst_surf,
src_box->x, dstx, src_box->width);
assert(ret == PIPE_OK);
}
SVGA_RETRY(svga, SVGA3D_vgpu10_BufferCopy(svga->swc, src_surf,
dst_surf, src_box->x, dstx,
src_box->width));
dbuffer->dirty = TRUE;
}
else {

View File

@ -45,11 +45,13 @@ begin_blit(struct svga_context *svga)
util_blitter_save_vertex_elements(svga->blitter, (void*)svga->curr.velems);
util_blitter_save_vertex_shader(svga->blitter, svga->curr.vs);
util_blitter_save_geometry_shader(svga->blitter, svga->curr.gs);
util_blitter_save_tessctrl_shader(svga->blitter, svga->curr.tcs);
util_blitter_save_tesseval_shader(svga->blitter, svga->curr.tes);
util_blitter_save_so_targets(svga->blitter, svga->num_so_targets,
(struct pipe_stream_output_target**)svga->so_targets);
util_blitter_save_rasterizer(svga->blitter, (void*)svga->curr.rast);
util_blitter_save_viewport(svga->blitter, &svga->curr.viewport);
util_blitter_save_scissor(svga->blitter, &svga->curr.scissor);
util_blitter_save_viewport(svga->blitter, &svga->curr.viewport[0]);
util_blitter_save_scissor(svga->blitter, &svga->curr.scissor[0]);
util_blitter_save_fragment_shader(svga->blitter, svga->curr.fs);
util_blitter_save_blend(svga->blitter, (void*)svga->curr.blend);
util_blitter_save_depth_stencil_alpha(svga->blitter,
@ -248,15 +250,7 @@ svga_clear(struct pipe_context *pipe, unsigned buffers, const struct pipe_scisso
/* flush any queued prims (don't want them to appear after the clear!) */
svga_hwtnl_flush_retry(svga);
ret = try_clear( svga, buffers, color, depth, stencil );
if (ret == PIPE_ERROR_OUT_OF_MEMORY) {
/* Flush command buffer and retry:
*/
svga_context_flush( svga, NULL );
ret = try_clear( svga, buffers, color, depth, stencil );
}
SVGA_RETRY_OOM(svga, ret, try_clear( svga, buffers, color, depth, stencil));
/*
* Mark target surfaces as dirty
@ -277,7 +271,6 @@ svga_clear_texture(struct pipe_context *pipe,
{
struct svga_context *svga = svga_context(pipe);
struct svga_surface *svga_surface_dst;
enum pipe_error ret;
struct pipe_surface tmpl;
struct pipe_surface *surface;
@ -309,8 +302,8 @@ svga_clear_texture(struct pipe_context *pipe,
stencil = 0;
}
else {
util_format_unpack_z_float(surface->format, &depth, data, 1);
util_format_unpack_s_8uint(surface->format, &stencil, data, 1);
desc->unpack_z_float(&depth, 0, data, 0, 1, 1);
desc->unpack_s_8uint(&stencil, 0, data, 0, 1, 1);
}
if (util_format_has_depth(desc)) {
@ -334,17 +327,9 @@ svga_clear_texture(struct pipe_context *pipe,
/* clearing whole surface, use direct VGPU10 command */
ret = SVGA3D_vgpu10_ClearDepthStencilView(svga->swc, dsv,
clear_flags,
stencil, depth);
if (ret != PIPE_OK) {
/* flush and try again */
svga_context_flush(svga, NULL);
ret = SVGA3D_vgpu10_ClearDepthStencilView(svga->swc, dsv,
clear_flags,
stencil, depth);
assert(ret == PIPE_OK);
}
SVGA_RETRY(svga, SVGA3D_vgpu10_ClearDepthStencilView(svga->swc, dsv,
clear_flags,
stencil, depth));
}
else {
/* To clear subtexture use software fallback */
@ -367,7 +352,18 @@ svga_clear_texture(struct pipe_context *pipe,
color.f[0] = color.f[1] = color.f[2] = color.f[3] = 0;
}
else {
util_format_unpack_rgba(surface->format, color.ui, data, 1);
if (util_format_is_pure_sint(surface->format)) {
/* signed integer */
desc->unpack_rgba_sint(color.i, 0, data, 0, 1, 1);
}
else if (util_format_is_pure_uint(surface->format)) {
/* unsigned integer */
desc->unpack_rgba_uint(color.ui, 0, data, 0, 1, 1);
}
else {
/* floating point */
desc->unpack_rgba_float(color.f, 0, data, 0, 1, 1);
}
}
/* Setup render target view */
@ -390,14 +386,8 @@ svga_clear_texture(struct pipe_context *pipe,
}
else {
/* clearing whole surface using VGPU10 command */
ret = SVGA3D_vgpu10_ClearRenderTargetView(svga->swc, rtv,
color.f);
if (ret != PIPE_OK) {
svga_context_flush(svga,NULL);
ret = SVGA3D_vgpu10_ClearRenderTargetView(svga->swc, rtv,
color.f);
assert(ret == PIPE_OK);
}
SVGA_RETRY(svga, SVGA3D_vgpu10_ClearRenderTargetView(svga->swc, rtv,
color.f));
}
}
else {
@ -526,13 +516,9 @@ svga_clear_render_target(struct pipe_context *pipe,
height);
} else {
enum pipe_error ret;
ret = svga_try_clear_render_target(svga, dst, color);
if (ret == PIPE_ERROR_OUT_OF_MEMORY) {
svga_context_flush( svga, NULL );
ret = svga_try_clear_render_target(svga, dst, color);
}
SVGA_RETRY_OOM(svga, ret, svga_try_clear_render_target(svga, dst,
color));
assert (ret == PIPE_OK);
}
svga_toggle_render_condition(svga, render_condition_enabled, TRUE);

View File

@ -24,12 +24,16 @@
**********************************************************/
#include "util/u_draw.h"
#include "util/format/u_format.h"
#include "util/u_helpers.h"
#include "util/u_inlines.h"
#include "util/u_prim.h"
#include "util/u_prim_restart.h"
#include "svga_context.h"
#include "svga_draw_private.h"
#include "svga_screen.h"
#include "svga_draw.h"
#include "svga_shader.h"
#include "svga_surface.h"
@ -37,59 +41,137 @@
#include "svga_debug.h"
#include "svga_resource_buffer.h"
/* Returns TRUE if we are currently using flat shading.
*/
static boolean
is_using_flat_shading(const struct svga_context *svga)
{
return
svga->state.hw_draw.fs ? svga->state.hw_draw.fs->uses_flat_interp : FALSE;
}
static enum pipe_error
retry_draw_range_elements(struct svga_context *svga,
const struct pipe_draw_info *info,
unsigned count)
{
enum pipe_error ret;
SVGA_STATS_TIME_PUSH(svga_sws(svga), SVGA_STATS_TIME_DRAWELEMENTS);
ret = svga_hwtnl_draw_range_elements(svga->hwtnl, info, count);
if (ret != PIPE_OK) {
svga_context_flush(svga, NULL);
ret = svga_hwtnl_draw_range_elements(svga->hwtnl, info, count);
}
SVGA_RETRY(svga, svga_hwtnl_draw_range_elements(svga->hwtnl, info, count));
assert (ret == PIPE_OK);
SVGA_STATS_TIME_POP(svga_sws(svga));
return ret;
return PIPE_OK;
}
static enum pipe_error
retry_draw_arrays(struct svga_context *svga,
enum pipe_prim_type prim, unsigned start, unsigned count,
unsigned start_instance, unsigned instance_count)
retry_draw_arrays( struct svga_context *svga,
enum pipe_prim_type prim, unsigned start, unsigned count,
unsigned start_instance, unsigned instance_count,
ubyte vertices_per_patch)
{
enum pipe_error ret;
SVGA_STATS_TIME_PUSH(svga_sws(svga), SVGA_STATS_TIME_DRAWARRAYS);
for (unsigned try = 0; try < 2; try++) {
ret = svga_hwtnl_draw_arrays(svga->hwtnl, prim, start, count,
start_instance, instance_count);
if (ret == PIPE_OK)
break;
svga_context_flush(svga, NULL);
}
SVGA_RETRY_OOM(svga, ret, svga_hwtnl_draw_arrays(svga->hwtnl, prim, start,
count, start_instance,
instance_count,
vertices_per_patch));
SVGA_STATS_TIME_POP(svga_sws(svga));
return ret;
}
/**
* Auto draw (get vertex count from a transform feedback result).
*
* Issues a hardware draw whose vertex count is taken directly from the
* stream-output buffer named by info->count_from_stream_output, so the
* caller supplies no explicit vertex count.  Requires an SM5-capable
* device and exactly one instance (both asserted below).
*
* \return PIPE_OK (errors inside SVGA_RETRY are handled by flush+retry)
*/
static enum pipe_error
retry_draw_auto(struct svga_context *svga,
const struct pipe_draw_info *info)
{
assert(svga_have_sm5(svga));
assert(info->count_from_stream_output);
assert(info->instance_count == 1);
/* SO drawing implies core profile and none of these prim types */
assert(info->mode != PIPE_PRIM_QUADS &&
info->mode != PIPE_PRIM_QUAD_STRIP &&
info->mode != PIPE_PRIM_POLYGON);
if (info->mode == PIPE_PRIM_LINE_LOOP) {
/* XXX need to do a fallback */
assert(!"draw auto fallback not supported yet");
return PIPE_OK;
}
else {
SVGA3dPrimitiveRange range;
unsigned hw_count;
/* NOTE(review): the literal 12 looks like a dummy vertex count passed
* only so svga_translate_prim() can fill in hw_count, which is unused
* here -- confirm against svga_translate_prim()'s contract.
*/
range.primType = svga_translate_prim(info->mode, 12, &hw_count,
info->vertices_per_patch);
/* No index buffer and no explicit counts: the device pulls the vertex
* count from the bound stream-output buffer.
*/
range.primitiveCount = 0;
range.indexArray.surfaceId = SVGA3D_INVALID_ID;
range.indexArray.offset = 0;
range.indexArray.stride = 0;
range.indexWidth = 0;
range.indexBias = 0;
SVGA_RETRY(svga, svga_hwtnl_prim
(svga->hwtnl, &range,
0, /* vertex count comes from SO buffer */
0, /* don't know min index */
~0u, /* don't know max index */
NULL, /* no index buffer */
0, /* start instance */
1, /* only 1 instance supported */
NULL, /* indirect drawing info */
info->count_from_stream_output));
return PIPE_OK;
}
}
/**
* Indirect draw (get vertex count, start index, etc. from a buffer object).
*
* The draw parameters (vertex/index counts, start values, instance count)
* live in the info->indirect buffer object rather than in pipe_draw_info
* itself.  Requires an SM5-capable device (asserted).  PIPE_PRIM_LINE_LOOP
* is not supported by the device path and falls back to the gallium helper
* util_draw_indirect().
*
* \return PIPE_OK (errors inside SVGA_RETRY are handled by flush+retry)
*/
static enum pipe_error
retry_draw_indirect(struct svga_context *svga,
const struct pipe_draw_info *info)
{
assert(svga_have_sm5(svga));
assert(info->indirect);
/* indirect drawing implies core profile and none of these prim types */
assert(info->mode != PIPE_PRIM_QUADS &&
info->mode != PIPE_PRIM_QUAD_STRIP &&
info->mode != PIPE_PRIM_POLYGON);
if (info->mode == PIPE_PRIM_LINE_LOOP) {
/* need to do a fallback */
util_draw_indirect(&svga->pipe, info);
return PIPE_OK;
}
else {
SVGA3dPrimitiveRange range;
unsigned hw_count;
/* NOTE(review): as in retry_draw_auto(), 12 appears to be a dummy
* vertex count used only so hw_count can be computed -- confirm.
*/
range.primType = svga_translate_prim(info->mode, 12, &hw_count,
info->vertices_per_patch);
range.primitiveCount = 0; /* specified in indirect buffer */
range.indexArray.surfaceId = SVGA3D_INVALID_ID;
range.indexArray.offset = 0;
range.indexArray.stride = 0;
range.indexWidth = info->index_size;
range.indexBias = 0; /* specified in indirect buffer */
SVGA_RETRY(svga, svga_hwtnl_prim
(svga->hwtnl, &range,
0, /* vertex count is in indirect buffer */
0, /* don't know min index */
~0u, /* don't know max index */
info->index.resource,
info->start_instance,
0, /* don't know instance count */
info->indirect,
NULL)); /* SO vertex count */
return PIPE_OK;
}
}
/**
* Determine if we need to implement primitive restart with a fallback
* path which breaks the original primitive into sub-primitive at the
@ -116,6 +198,21 @@ need_fallback_prim_restart(const struct svga_context *svga,
}
/**
 * Compute the vertex count for a draw from a stream-output statistics query.
 *
 * Reads the primitive count recorded for \p stream and converts it to a
 * vertex count for the primitive type in \p info->mode.
 */
static unsigned
get_vcount_from_stream_output(struct svga_context *svga,
                              const struct pipe_draw_info *info,
                              unsigned stream)
{
   const unsigned nprims =
      svga_get_primcount_from_stream_output(svga, stream);

   return u_vertices_for_prims(info->mode, nprims);
}
static void
svga_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
{
@ -147,6 +244,18 @@ svga_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
svga->dirty |= SVGA_NEW_REDUCED_PRIMITIVE;
}
if (svga->curr.vertices_per_patch != info->vertices_per_patch) {
svga->curr.vertices_per_patch = info->vertices_per_patch;
/* If input patch size changes, we need to notifiy the TCS
* code to reevaluate the shader variant since the
* vertices per patch count is a constant in the control
* point count declaration.
*/
if (svga->curr.tcs || svga->curr.tes)
svga->dirty |= SVGA_NEW_TCS_PARAM;
}
if (need_fallback_prim_restart(svga, info)) {
enum pipe_error r;
r = util_draw_vbo_without_prim_restart(pipe, info);
@ -155,7 +264,8 @@ svga_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
goto done;
}
if (!u_trim_pipe_prim(info->mode, &count))
if (!info->indirect && !info->count_from_stream_output &&
!u_trim_pipe_prim(info->mode, &count))
goto done;
needed_swtnl = svga->state.sw.need_swtnl;
@ -189,20 +299,53 @@ svga_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
}
svga_hwtnl_set_fillmode(svga->hwtnl, svga->curr.rast->hw_fillmode);
svga_update_state_retry(svga, SVGA_STATE_HW_DRAW);
/** determine if flatshade is to be used after svga_update_state()
* in case the fragment shader is changed.
*/
svga_hwtnl_set_flatshade(svga->hwtnl,
svga->curr.rast->templ.flatshade ||
is_using_flat_shading(svga),
svga_is_using_flat_shading(svga),
svga->curr.rast->templ.flatshade_first);
if (info->index_size) {
if (info->count_from_stream_output) {
unsigned stream = 0;
assert(count == 0);
/* If the vertex count is from the stream output of a non-zero stream
* or the draw info specifies instancing, we will need a workaround
* since the draw_auto command does not support stream instancing.
* The workaround requires querying the vertex count from the
* stream output statistics query for the specified stream and then
* fallback to the regular draw function.
*/
/* Check the stream index of the specified stream output target */
for (unsigned i = 0; i < ARRAY_SIZE(svga->so_targets); i++) {
if (svga->vcount_so_targets[i] == info->count_from_stream_output) {
stream = (svga->vcount_buffer_stream >> (i * 4)) & 0xf;
break;
}
}
if (info->instance_count > 1 || stream > 0) {
count = get_vcount_from_stream_output(svga, info, stream);
}
}
if (info->count_from_stream_output && count == 0) {
ret = retry_draw_auto(svga, info);
}
else if (info->indirect) {
ret = retry_draw_indirect(svga, info);
}
else if (info->index_size) {
ret = retry_draw_range_elements(svga, info, count);
}
else {
ret = retry_draw_arrays(svga, info->mode, info->start, count,
info->start_instance, info->instance_count);
info->start_instance, info->instance_count,
info->vertices_per_patch);
}
}

View File

@ -37,7 +37,7 @@
#include "svga_shader.h"
static void *
void *
svga_create_fs_state(struct pipe_context *pipe,
const struct pipe_shader_state *templ)
{
@ -69,7 +69,7 @@ svga_create_fs_state(struct pipe_context *pipe,
}
static void
void
svga_bind_fs_state(struct pipe_context *pipe, void *shader)
{
struct svga_fragment_shader *fs = (struct svga_fragment_shader *) shader;
@ -85,6 +85,7 @@ svga_delete_fs_state(struct pipe_context *pipe, void *shader)
{
struct svga_context *svga = svga_context(pipe);
struct svga_fragment_shader *fs = (struct svga_fragment_shader *) shader;
struct svga_fragment_shader *next_fs;
struct svga_shader_variant *variant, *tmp;
enum pipe_error ret;
@ -92,27 +93,32 @@ svga_delete_fs_state(struct pipe_context *pipe, void *shader)
assert(fs->base.parent == NULL);
draw_delete_fragment_shader(svga->swtnl.draw, fs->draw_shader);
while (fs) {
next_fs = (struct svga_fragment_shader *) fs->base.next;
for (variant = fs->base.variants; variant; variant = tmp) {
tmp = variant->next;
draw_delete_fragment_shader(svga->swtnl.draw, fs->draw_shader);
/* Check if deleting currently bound shader */
if (variant == svga->state.hw_draw.fs) {
ret = svga_set_shader(svga, SVGA3D_SHADERTYPE_PS, NULL);
if (ret != PIPE_OK) {
svga_context_flush(svga, NULL);
for (variant = fs->base.variants; variant; variant = tmp) {
tmp = variant->next;
/* Check if deleting currently bound shader */
if (variant == svga->state.hw_draw.fs) {
ret = svga_set_shader(svga, SVGA3D_SHADERTYPE_PS, NULL);
assert(ret == PIPE_OK);
if (ret != PIPE_OK) {
svga_context_flush(svga, NULL);
ret = svga_set_shader(svga, SVGA3D_SHADERTYPE_PS, NULL);
assert(ret == PIPE_OK);
}
svga->state.hw_draw.fs = NULL;
}
svga->state.hw_draw.fs = NULL;
svga_destroy_shader_variant(svga, variant);
}
svga_destroy_shader_variant(svga, variant);
FREE((void *)fs->base.tokens);
FREE(fs);
fs = next_fs;
}
FREE((void *)fs->base.tokens);
FREE(fs);
}

View File

@ -40,9 +40,16 @@ svga_set_scissor_states(struct pipe_context *pipe,
unsigned num_scissors,
const struct pipe_scissor_state *scissors)
{
ASSERTED struct svga_screen *svgascreen = svga_screen(pipe->screen);
struct svga_context *svga = svga_context(pipe);
unsigned i, num_sc;
assert(start_slot + num_scissors <= svgascreen->max_viewports);
for (i = 0, num_sc = start_slot; i < num_scissors; i++) {
svga->curr.scissor[num_sc++] = scissors[i]; /* struct copy */
}
memcpy(&svga->curr.scissor, scissors, sizeof(*scissors));
svga->dirty |= SVGA_NEW_SCISSOR;
}
@ -199,8 +206,14 @@ svga_set_viewport_states(struct pipe_context *pipe,
const struct pipe_viewport_state *viewports)
{
struct svga_context *svga = svga_context(pipe);
ASSERTED struct svga_screen *svgascreen = svga_screen(pipe->screen);
unsigned i, num_vp;
svga->curr.viewport = *viewports; /* struct copy */
assert(start_slot + num_viewports <= svgascreen->max_viewports);
for (i = 0, num_vp = start_slot; i < num_viewports; i++) {
svga->curr.viewport[num_vp++] = viewports[i]; /* struct copy */
}
svga->dirty |= SVGA_NEW_VIEWPORT;
}

View File

@ -50,6 +50,7 @@ struct svga_query {
SVGA3dQueryType svga_type; /**< SVGA3D_QUERYTYPE_x or unused */
unsigned id; /** Per-context query identifier */
boolean active; /** TRUE if query is active */
struct pipe_fence_handle *fence;
@ -214,10 +215,10 @@ get_query_result_vgpu9(struct svga_context *svga, struct svga_query *sq,
* will hold queries of the same type. Multiple memory blocks can be allocated
* for a particular query type.
*
* Currently each memory block is of 184 bytes. We support up to 128
* Currently each memory block is of 184 bytes. We support up to 512
* memory blocks. The query memory size is arbitrary right now.
* Each occlusion query takes about 8 bytes. One memory block can accomodate
* 23 occlusion queries. 128 of those blocks can support up to 2944 occlusion
* 23 occlusion queries. 512 of those blocks can support up to 11K occlusion
* queries. That seems reasonable for now. If we think this limit is
* not enough, we can increase the limit or try to grow the mob in runtime.
* Note, SVGA device does not impose one mob per context for queries,
@ -228,7 +229,7 @@ get_query_result_vgpu9(struct svga_context *svga, struct svga_query *sq,
* following commands: DXMoveQuery, DXBindAllQuery & DXReadbackAllQuery.
*/
#define SVGA_QUERY_MEM_BLOCK_SIZE (sizeof(SVGADXQueryResultUnion) * 2)
#define SVGA_QUERY_MEM_SIZE (128 * SVGA_QUERY_MEM_BLOCK_SIZE)
#define SVGA_QUERY_MEM_SIZE (512 * SVGA_QUERY_MEM_BLOCK_SIZE)
struct svga_qmem_alloc_entry
{
@ -243,31 +244,34 @@ struct svga_qmem_alloc_entry
/**
* Allocate a memory block from the query object memory
* \return -1 if out of memory, else index of the query memory block
* \return NULL if out of memory, else pointer to the query memory block
*/
static int
static struct svga_qmem_alloc_entry *
allocate_query_block(struct svga_context *svga)
{
int index;
unsigned offset;
struct svga_qmem_alloc_entry *alloc_entry = NULL;
/* Find the next available query block */
index = util_bitmask_add(svga->gb_query_alloc_mask);
if (index == UTIL_BITMASK_INVALID_INDEX)
return -1;
return NULL;
offset = index * SVGA_QUERY_MEM_BLOCK_SIZE;
if (offset >= svga->gb_query_len) {
unsigned i;
/* Deallocate the out-of-range index */
util_bitmask_clear(svga->gb_query_alloc_mask, index);
index = -1;
/**
* All the memory blocks are allocated, lets see if there is
* any empty memory block around that can be freed up.
*/
index = -1;
for (i = 0; i < SVGA3D_QUERYTYPE_MAX && index == -1; i++) {
struct svga_qmem_alloc_entry *alloc_entry;
struct svga_qmem_alloc_entry *prev_alloc_entry = NULL;
alloc_entry = svga->gb_query_map[i];
@ -286,9 +290,20 @@ allocate_query_block(struct svga_context *svga)
}
}
}
if (index == -1) {
debug_printf("Query memory object is full\n");
return NULL;
}
}
return index;
if (!alloc_entry) {
assert(index != -1);
alloc_entry = CALLOC_STRUCT(svga_qmem_alloc_entry);
alloc_entry->block_index = index;
}
return alloc_entry;
}
/**
@ -346,17 +361,14 @@ allocate_query_block_entry(struct svga_context *svga,
unsigned len)
{
struct svga_qmem_alloc_entry *alloc_entry;
int block_index = -1;
block_index = allocate_query_block(svga);
if (block_index == -1)
return NULL;
alloc_entry = CALLOC_STRUCT(svga_qmem_alloc_entry);
alloc_entry = allocate_query_block(svga);
if (!alloc_entry)
return NULL;
alloc_entry->block_index = block_index;
alloc_entry->start_offset = block_index * SVGA_QUERY_MEM_BLOCK_SIZE;
assert(alloc_entry->block_index != -1);
alloc_entry->start_offset =
alloc_entry->block_index * SVGA_QUERY_MEM_BLOCK_SIZE;
alloc_entry->nquery = 0;
alloc_entry->alloc_mask = util_bitmask_create();
alloc_entry->next = NULL;
@ -508,17 +520,16 @@ define_query_vgpu10(struct svga_context *svga,
sq->gb_query = svga->gb_query;
/* Allocate an integer ID for this query */
sq->id = util_bitmask_add(svga->query_id_bm);
if (sq->id == UTIL_BITMASK_INVALID_INDEX)
return PIPE_ERROR_OUT_OF_MEMORY;
/* Make sure query length is in multiples of 8 bytes */
qlen = align(resultLen + sizeof(SVGA3dQueryState), 8);
/* Find a slot for this query in the gb object */
qlen = resultLen + sizeof(SVGA3dQueryState);
sq->offset = allocate_query(svga, sq->svga_type, qlen);
if (sq->offset == -1)
return PIPE_ERROR_OUT_OF_MEMORY;
assert((sq->offset & 7) == 0);
SVGA_DBG(DEBUG_QUERY, " query type=%d qid=0x%x offset=%d\n",
sq->svga_type, sq->id, sq->offset);
@ -731,7 +742,19 @@ svga_create_query(struct pipe_context *pipe,
case PIPE_QUERY_PRIMITIVES_EMITTED:
case PIPE_QUERY_SO_STATISTICS:
assert(svga_have_vgpu10(svga));
sq->svga_type = SVGA3D_QUERYTYPE_STREAMOUTPUTSTATS;
/* Until the device supports the new query type for multiple streams,
* we will use the single stream query type for stream 0.
*/
if (svga_have_sm5(svga) && index > 0) {
assert(index < 4);
sq->svga_type = SVGA3D_QUERYTYPE_SOSTATS_STREAM0 + index;
}
else {
assert(index == 0);
sq->svga_type = SVGA3D_QUERYTYPE_STREAMOUTPUTSTATS;
}
ret = define_query_vgpu10(svga, sq,
sizeof(SVGADXStreamOutStatisticsQueryResult));
if (ret != PIPE_OK)
@ -969,7 +992,10 @@ svga_begin_query(struct pipe_context *pipe, struct pipe_query *q)
assert(!"unexpected query type in svga_begin_query()");
}
svga->sq[sq->type] = sq;
SVGA_DBG(DEBUG_QUERY, "%s sq=0x%x id=%d type=%d svga_type=%d\n",
__FUNCTION__, sq, sq->id, sq->type, sq->svga_type);
sq->active = TRUE;
return true;
}
@ -988,12 +1014,12 @@ svga_end_query(struct pipe_context *pipe, struct pipe_query *q)
SVGA_DBG(DEBUG_QUERY, "%s sq=0x%x id=%d\n", __FUNCTION__,
sq, sq->id);
if (sq->type == PIPE_QUERY_TIMESTAMP && svga->sq[sq->type] != sq)
if (sq->type == PIPE_QUERY_TIMESTAMP && !sq->active)
svga_begin_query(pipe, q);
svga_hwtnl_flush_retry(svga);
assert(svga->sq[sq->type] == sq);
assert(sq->active);
switch (sq->type) {
case PIPE_QUERY_OCCLUSION_COUNTER:
@ -1083,7 +1109,7 @@ svga_end_query(struct pipe_context *pipe, struct pipe_query *q)
default:
assert(!"unexpected query type in svga_end_query()");
}
svga->sq[sq->type] = NULL;
sq->active = FALSE;
return true;
}

View File

@ -118,6 +118,9 @@ define_rasterizer_object(struct svga_context *svga,
rast->templ.line_stipple_factor : 0;
const uint16 line_pattern = rast->templ.line_stipple_enable ?
rast->templ.line_stipple_pattern : 0;
const uint8 pv_last = !rast->templ.flatshade_first &&
svgascreen->haveProvokingVertex;
unsigned try;
rast->id = util_bitmask_add(svga->rast_object_id_bm);
@ -194,7 +197,18 @@ svga_create_rasterizer_state(struct pipe_context *pipe,
rast->templ.point_smooth = TRUE;
}
if (templ->point_smooth) {
if (rast->templ.point_smooth &&
rast->templ.point_size_per_vertex == 0 &&
rast->templ.point_size <= screen->pointSmoothThreshold) {
/* If the point size is less than the threshold, disable smoothing.
* Note that this only effects point rendering when we use the
* pipe_rasterizer_state::point_size value, not when the point size
* is set in the VS.
*/
rast->templ.point_smooth = FALSE;
}
if (rast->templ.point_smooth) {
/* For smooth points we need to generate fragments for at least
* a 2x2 region. Otherwise the quad we draw may be too small and
* we may generate no fragments at all.
@ -237,7 +251,7 @@ svga_create_rasterizer_state(struct pipe_context *pipe,
}
}
if (!svga_have_vgpu10(svga) && templ->point_smooth) {
if (!svga_have_vgpu10(svga) && rast->templ.point_smooth) {
rast->need_pipeline |= SVGA_PIPELINE_FLAG_POINTS;
rast->need_pipeline_points_str = "smooth points";
}

View File

@ -44,6 +44,89 @@ svga_stream_output_target(struct pipe_stream_output_target *s)
return (struct svga_stream_output_target *)s;
}
/**
 * A helper function to send different versions of the DefineStreamOutput
 * command depending on whether the device is SM5 capable or not.
 *
 * On SM5 devices, when the declaration list exceeds the DX10 inline limit
 * or more than one stream stride is used, the declarations are written to
 * a pinned guest-backed buffer and sent with DefineAndBindStreamOutput.
 * Otherwise the legacy VGPU10 DefineStreamOutput command carries the
 * declarations inline.
 *
 * \param soid  the stream output object id
 * \param numOutputStreamEntries  number of valid entries in decls[]
 * \param numOutputStreamStrides  number of valid entries in streamStrides[]
 * \param rasterizedStream  index of the stream sent to the rasterizer
 * \param streamout  on the buffer path, its declBuf field takes ownership
 *                   of the declaration buffer
 * \return PIPE_OK on success, PIPE_ERROR if the declaration buffer cannot
 *         be created or mapped
 */
static enum pipe_error
svga_define_stream_output(struct svga_context *svga,
                          SVGA3dStreamOutputId soid,
                          uint32 numOutputStreamEntries,
                          uint32 numOutputStreamStrides,
                          uint32 streamStrides[SVGA3D_DX_MAX_SOTARGETS],
                          const SVGA3dStreamOutputDeclarationEntry decls[SVGA3D_MAX_STREAMOUT_DECLS],
                          uint32 rasterizedStream,
                          struct svga_stream_output *streamout)
{
   unsigned i;

   SVGA_DBG(DEBUG_STREAMOUT, "%s: id=%d\n", __FUNCTION__, soid);
   /* Fixed typo in the debug format string: "Entires" -> "Entries". */
   SVGA_DBG(DEBUG_STREAMOUT,
            "numOutputStreamEntries=%d\n", numOutputStreamEntries);
   for (i = 0; i < numOutputStreamEntries; i++) {
      SVGA_DBG(DEBUG_STREAMOUT,
               " %d: slot=%d regIdx=%d regMask=0x%x stream=%d\n",
               i, decls[i].outputSlot, decls[i].registerIndex,
               decls[i].registerMask, decls[i].stream);
   }

   SVGA_DBG(DEBUG_STREAMOUT,
            "numOutputStreamStrides=%d\n", numOutputStreamStrides);
   for (i = 0; i < numOutputStreamStrides; i++) {
      SVGA_DBG(DEBUG_STREAMOUT, " %d ", streamStrides[i]);
   }
   SVGA_DBG(DEBUG_STREAMOUT, "\n");

   if (svga_have_sm5(svga) &&
       (numOutputStreamEntries > SVGA3D_MAX_DX10_STREAMOUT_DECLS ||
        numOutputStreamStrides > 1)) {
      unsigned bufSize = sizeof(SVGA3dStreamOutputDeclarationEntry)
         * numOutputStreamEntries;
      struct svga_winsys_buffer *declBuf;
      struct svga_winsys_screen *sws = svga_screen(svga->pipe.screen)->sws;
      void *map;

      /* The declarations don't fit in the inline command; stage them in a
       * pinned guest-backed buffer instead.
       */
      declBuf = svga_winsys_buffer_create(svga, 1, SVGA_BUFFER_USAGE_PINNED,
                                          bufSize);
      if (!declBuf)
         return PIPE_ERROR;
      map = sws->buffer_map(sws, declBuf, PIPE_TRANSFER_WRITE);
      if (!map) {
         sws->buffer_destroy(sws, declBuf);
         return PIPE_ERROR;
      }

      /* copy decls to buffer */
      memcpy(map, decls, bufSize);

      /* unmap buffer */
      sws->buffer_unmap(sws, declBuf);
      streamout->declBuf = declBuf;

      SVGA_RETRY(svga, SVGA3D_sm5_DefineAndBindStreamOutput
                 (svga->swc, soid,
                  numOutputStreamEntries,
                  numOutputStreamStrides,
                  streamStrides,
                  streamout->declBuf,
                  rasterizedStream,
                  bufSize));
   } else {
      SVGA_RETRY(svga, SVGA3D_vgpu10_DefineStreamOutput(svga->swc, soid,
                                                        numOutputStreamEntries,
                                                        streamStrides,
                                                        decls));
   }

   return PIPE_OK;
}
/**
* Creates stream output from the stream output info.
*/
struct svga_stream_output *
svga_create_stream_output(struct svga_context *svga,
struct svga_shader *shader,
@ -52,9 +135,13 @@ svga_create_stream_output(struct svga_context *svga,
struct svga_stream_output *streamout;
SVGA3dStreamOutputDeclarationEntry decls[SVGA3D_MAX_STREAMOUT_DECLS];
unsigned strides[SVGA3D_DX_MAX_SOTARGETS];
unsigned dstOffset[SVGA3D_DX_MAX_SOTARGETS];
unsigned numStreamStrides = 0;
unsigned numDecls;
unsigned i;
enum pipe_error ret;
unsigned id;
ASSERTED unsigned maxDecls;
assert(info->num_outputs <= PIPE_MAX_SO_OUTPUTS);
@ -64,7 +151,12 @@ svga_create_stream_output(struct svga_context *svga,
if (!svga_have_vgpu10(svga))
return NULL;
assert(info->num_outputs <= SVGA3D_MAX_STREAMOUT_DECLS);
if (svga_have_sm5(svga))
maxDecls = SVGA3D_MAX_STREAMOUT_DECLS;
else if (svga_have_vgpu10(svga))
maxDecls = SVGA3D_MAX_DX10_STREAMOUT_DECLS;
assert(info->num_outputs <= maxDecls);
/* Allocate an integer ID for the stream output */
id = util_bitmask_add(svga->stream_output_id_bm);
@ -81,15 +173,17 @@ svga_create_stream_output(struct svga_context *svga,
streamout->info = *info;
streamout->id = id;
streamout->pos_out_index = -1;
streamout->streammask = 0;
SVGA_DBG(DEBUG_STREAMOUT, "%s, num_outputs=%d id=%d\n", __FUNCTION__,
info->num_outputs, id);
/* init whole decls and stride arrays to zero to avoid garbage values */
/* Init whole decls and stride arrays to zero to avoid garbage values */
memset(decls, 0, sizeof(decls));
memset(strides, 0, sizeof(strides));
memset(dstOffset, 0, sizeof(dstOffset));
for (i = 0; i < info->num_outputs; i++) {
SVGA_DBG(DEBUG_STREAMOUT, "%s: num_outputs\n",
__FUNCTION__, info->num_outputs);
for (i = 0, numDecls = 0; i < info->num_outputs; i++, numDecls++) {
unsigned reg_idx = info->output[i].register_index;
unsigned buf_idx = info->output[i].output_buffer;
const enum tgsi_semantic sem_name =
@ -97,17 +191,59 @@ svga_create_stream_output(struct svga_context *svga,
assert(buf_idx <= PIPE_MAX_SO_BUFFERS);
numStreamStrides = MAX2(numStreamStrides, buf_idx);
SVGA_DBG(DEBUG_STREAMOUT,
" %d: register_index=%d output_buffer=%d stream=%d\n",
i, reg_idx, buf_idx, info->output[i].stream);
SVGA_DBG(DEBUG_STREAMOUT,
" dst_offset=%d start_component=%d num_components=%d\n",
info->output[i].dst_offset,
info->output[i].start_component,
info->output[i].num_components);
streamout->buffer_stream |= info->output[i].stream << (buf_idx * 4);
/**
* Check if the destination offset of the current output
* is at the expected offset. If it is greater, then that means
* there is a gap in the stream output. We need to insert
* extra declaration entries with an invalid register index
* to specify a gap.
*/
while (info->output[i].dst_offset > dstOffset[buf_idx]) {
unsigned numComponents = info->output[i].dst_offset -
dstOffset[buf_idx];;
assert(svga_have_sm5(svga));
/* We can only specify at most 4 components to skip in each
* declaration entry.
*/
numComponents = numComponents > 4 ? 4 : numComponents;
decls[numDecls].outputSlot = buf_idx,
decls[numDecls].stream = info->output[i].stream;
decls[numDecls].registerIndex = SVGA3D_INVALID_ID;
decls[numDecls].registerMask = (1 << numComponents) - 1;
dstOffset[buf_idx] += numComponents;
numDecls++;
}
if (sem_name == TGSI_SEMANTIC_POSITION) {
/**
* Check if streaming out POSITION. If so, replace the
* register index with the index for NON_ADJUSTED POSITION.
*/
decls[i].registerIndex = shader->info.num_outputs;
decls[numDecls].registerIndex = shader->info.num_outputs;
/* Save this output index, so we can tell later if this stream output
* includes an output of a vertex position
*/
streamout->pos_out_index = i;
streamout->pos_out_index = numDecls;
}
else if (sem_name == TGSI_SEMANTIC_CLIPDIST) {
/**
@ -116,44 +252,49 @@ svga_create_stream_output(struct svga_context *svga,
* It's valid to write to ClipDistance variable for non-enabled
* clip planes.
*/
decls[i].registerIndex = shader->info.num_outputs + 1 +
shader->info.output_semantic_index[reg_idx];
decls[numDecls].registerIndex =
shader->info.num_outputs + 1 +
shader->info.output_semantic_index[reg_idx];
}
else {
decls[i].registerIndex = reg_idx;
decls[numDecls].registerIndex = reg_idx;
}
decls[i].outputSlot = buf_idx;
decls[i].registerMask =
decls[numDecls].outputSlot = buf_idx;
decls[numDecls].registerMask =
((1 << info->output[i].num_components) - 1)
<< info->output[i].start_component;
SVGA_DBG(DEBUG_STREAMOUT, "%d slot=%d regIdx=%d regMask=0x%x\n",
i, decls[i].outputSlot, decls[i].registerIndex,
decls[i].registerMask);
decls[numDecls].stream = info->output[i].stream;
assert(decls[numDecls].stream == 0 || svga_have_sm5(svga));
/* Set the bit in streammask for the enabled stream */
streamout->streammask |= 1 << info->output[i].stream;
/* Update the expected offset for the next output */
dstOffset[buf_idx] += info->output[i].num_components;
strides[buf_idx] = info->stride[buf_idx] * sizeof(float);
}
ret = SVGA3D_vgpu10_DefineStreamOutput(svga->swc, id,
info->num_outputs,
strides,
decls);
assert(numDecls <= maxDecls);
/* Send the DefineStreamOutput command.
* Note, rasterizedStream is always 0.
*/
ret = svga_define_stream_output(svga, id,
numDecls, numStreamStrides+1,
strides, decls, 0, streamout);
if (ret != PIPE_OK) {
svga_context_flush(svga, NULL);
ret = SVGA3D_vgpu10_DefineStreamOutput(svga->swc, id,
info->num_outputs,
strides,
decls);
if (ret != PIPE_OK) {
util_bitmask_clear(svga->stream_output_id_bm, id);
FREE(streamout);
streamout = NULL;
}
util_bitmask_clear(svga->stream_output_id_bm, id);
FREE(streamout);
streamout = NULL;
}
return streamout;
}
enum pipe_error
svga_set_stream_output(struct svga_context *svga,
struct svga_stream_output *streamout)
@ -168,12 +309,28 @@ svga_set_stream_output(struct svga_context *svga,
streamout, id);
if (svga->current_so != streamout) {
/* Before unbinding the current stream output, stop the stream output
* statistics queries for the active streams.
*/
if (svga_have_sm5(svga) && svga->current_so) {
svga->vcount_buffer_stream = svga->current_so->buffer_stream;
svga_end_stream_output_queries(svga, svga->current_so->streammask);
}
enum pipe_error ret = SVGA3D_vgpu10_SetStreamOutput(svga->swc, id);
if (ret != PIPE_OK) {
return ret;
}
svga->current_so = streamout;
/* After binding the new stream output, start the stream output
* statistics queries for the active streams.
*/
if (svga_have_sm5(svga) && svga->current_so) {
svga_begin_stream_output_queries(svga, svga->current_so->streammask);
}
}
return PIPE_OK;
@ -183,17 +340,18 @@ void
svga_delete_stream_output(struct svga_context *svga,
struct svga_stream_output *streamout)
{
enum pipe_error ret;
struct svga_winsys_screen *sws = svga_screen(svga->pipe.screen)->sws;
SVGA_DBG(DEBUG_STREAMOUT, "%s streamout=0x%x\n", __FUNCTION__, streamout);
assert(svga_have_vgpu10(svga));
assert(streamout != NULL);
ret = SVGA3D_vgpu10_DestroyStreamOutput(svga->swc, streamout->id);
if (ret != PIPE_OK) {
svga_context_flush(svga, NULL);
ret = SVGA3D_vgpu10_DestroyStreamOutput(svga->swc, streamout->id);
SVGA_RETRY(svga, SVGA3D_vgpu10_DestroyStreamOutput(svga->swc,
streamout->id));
if (svga_have_sm5(svga) && streamout->declBuf) {
sws->buffer_destroy(sws, streamout->declBuf);
}
/* Release the ID */
@ -203,6 +361,7 @@ svga_delete_stream_output(struct svga_context *svga,
FREE(streamout);
}
static struct pipe_stream_output_target *
svga_create_stream_output_target(struct pipe_context *pipe,
struct pipe_resource *buffer,
@ -252,9 +411,9 @@ svga_set_stream_output_targets(struct pipe_context *pipe,
{
struct svga_context *svga = svga_context(pipe);
struct SVGA3dSoTarget soBindings[SVGA3D_DX_MAX_SOTARGETS];
enum pipe_error ret;
unsigned i;
unsigned num_so_targets;
boolean begin_so_queries = num_targets > 0;
SVGA_DBG(DEBUG_STREAMOUT, "%s num_targets=%d\n", __FUNCTION__,
num_targets);
@ -269,6 +428,14 @@ svga_set_stream_output_targets(struct pipe_context *pipe,
sbuf->dirty = TRUE;
}
/* Before the currently bound streamout targets are unbound,
* save them in case they need to be referenced to retrieve the
* number of vertices being streamed out.
*/
for (i = 0; i < ARRAY_SIZE(svga->so_targets); i++) {
svga->vcount_so_targets[i] = svga->so_targets[i];
}
assert(num_targets <= SVGA3D_DX_MAX_SOTARGETS);
for (i = 0; i < num_targets; i++) {
@ -283,7 +450,16 @@ svga_set_stream_output_targets(struct pipe_context *pipe,
& SVGA3D_SURFACE_BIND_STREAM_OUTPUT);
svga->so_targets[i] = &sot->base;
soBindings[i].offset = sot->base.buffer_offset;
if (offsets[i] == -1) {
soBindings[i].offset = -1;
/* The streamout is being resumed. There is no need to restart streamout statistics
* queries for the draw-auto fallback since those queries are still active.
*/
begin_so_queries = FALSE;
}
else
soBindings[i].offset = sot->base.buffer_offset + offsets[i];
/* The size cannot extend beyond the end of the buffer. Clamp it. */
size = MIN2(sot->base.buffer_size,
@ -299,15 +475,22 @@ svga_set_stream_output_targets(struct pipe_context *pipe,
}
num_so_targets = MAX2(svga->num_so_targets, num_targets);
ret = SVGA3D_vgpu10_SetSOTargets(svga->swc, num_so_targets,
soBindings, svga->so_surfaces);
if (ret != PIPE_OK) {
svga_context_flush(svga, NULL);
ret = SVGA3D_vgpu10_SetSOTargets(svga->swc, num_so_targets,
soBindings, svga->so_surfaces);
}
SVGA_RETRY(svga, SVGA3D_vgpu10_SetSOTargets(svga->swc, num_so_targets,
soBindings, svga->so_surfaces));
svga->num_so_targets = num_targets;
if (svga_have_sm5(svga) && svga->current_so && begin_so_queries) {
/* If there are aleady active queries and we need to start a new streamout,
* we need to stop the current active queries first.
*/
if (svga->in_streamout) {
svga_end_stream_output_queries(svga, svga->current_so->streammask);
}
/* Start stream out statistics queries for the new streamout */
svga_begin_stream_output_queries(svga, svga->current_so->streammask);
}
}
/**
@ -329,6 +512,7 @@ svga_rebind_stream_output_targets(struct svga_context *svga)
return PIPE_OK;
}
void
svga_init_stream_output_functions(struct svga_context *svga)
{
@ -336,3 +520,117 @@ svga_init_stream_output_functions(struct svga_context *svga)
svga->pipe.stream_output_target_destroy = svga_destroy_stream_output_target;
svga->pipe.set_stream_output_targets = svga_set_stream_output_targets;
}
/**
 * Create one stream-output statistics query per stream.
 *
 * These queries are a workaround for DrawTransformFeedbackInstanced /
 * DrawTransformFeedbackStreamInstanced when draw-auto cannot handle
 * instancing or a non-zero stream: the vertex count is then read back
 * from the corresponding statistics query instead.
 */
void
svga_create_stream_output_queries(struct svga_context *svga)
{
   if (!svga_have_sm5(svga))
      return;

   for (unsigned i = 0; i < ARRAY_SIZE(svga->so_queries); i++) {
      svga->so_queries[i] =
         svga->pipe.create_query(&svga->pipe, PIPE_QUERY_SO_STATISTICS, i);
      assert(svga->so_queries[i] != NULL);
   }
}
/**
* Destroy the stream output statistics queries for the draw-auto workaround.
*/
void
svga_destroy_stream_output_queries(struct svga_context *svga)
{
unsigned i;
if (!svga_have_sm5(svga))
return;
for (i = 0; i < ARRAY_SIZE(svga->so_queries); i++) {
svga->pipe.destroy_query(&svga->pipe, svga->so_queries[i]);
}
}
/**
 * Start stream output statistics queries for the active streams.
 *
 * One query is begun per stream whose bit is set in \p streammask.  The
 * results are later consumed by svga_get_primcount_from_stream_output()
 * as the fallback vertex-count source for draw-auto.
 *
 * \param streammask  bitmask of active streams (bit i == stream i)
 */
void
svga_begin_stream_output_queries(struct svga_context *svga,
                                 unsigned streammask)
{
   assert(svga_have_sm5(svga));
   assert(!svga->in_streamout);

   for (unsigned i = 0; i < ARRAY_SIZE(svga->so_queries); i++) {
      if (streammask & (1 << i)) {
         /* 'ret' is scoped inside the branch so it is never evaluated
          * uninitialized (the original read an unset 'ret' for streams
          * not in the mask).
          */
         bool ret = svga->pipe.begin_query(&svga->pipe, svga->so_queries[i]);
         (void) ret;
      }
   }

   svga->in_streamout = TRUE;
}
/**
 * Stop stream output statistics queries for the active streams.
 *
 * No-op if no streamout is currently in progress.
 *
 * \param streammask  bitmask of active streams (bit i == stream i)
 */
void
svga_end_stream_output_queries(struct svga_context *svga,
                               unsigned streammask)
{
   assert(svga_have_sm5(svga));

   if (!svga->in_streamout)
      return;

   for (unsigned i = 0; i < ARRAY_SIZE(svga->so_queries); i++) {
      if (streammask & (1 << i)) {
         /* 'ret' is scoped inside the branch so it is never evaluated
          * uninitialized for streams not in the mask.
          */
         bool ret = svga->pipe.end_query(&svga->pipe, svga->so_queries[i]);
         (void) ret;
      }
   }

   svga->in_streamout = FALSE;
}
/**
* Return the primitive count returned from the stream output statistics query
* for the specified stream.
*
* Active stream output statistics queries are ended first so their results
* become available; callers that still need them running must restart them.
* The result is read with the wait argument TRUE, which presumably blocks
* until the query result is ready -- confirm against the winsys query
* implementation.  Returns 0 if the result could not be obtained.
*/
unsigned
svga_get_primcount_from_stream_output(struct svga_context *svga,
unsigned stream)
{
unsigned primcount = 0;
union pipe_query_result result;
bool ret;
/* End the in-progress queries so the device can finish them and the
* result can be read back below.
*/
if (svga->current_so) {
svga_end_stream_output_queries(svga, svga->current_so->streammask);
}
ret = svga->pipe.get_query_result(&svga->pipe,
svga->so_queries[stream],
TRUE, &result);
if (ret)
primcount = result.so_statistics.num_primitives_written;
return primcount;
}

View File

@ -0,0 +1,219 @@
/**********************************************************
* Copyright 2018-2020 VMware, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
**********************************************************/
#include "pipe/p_context.h"
#include "util/u_memory.h"
#include "tgsi/tgsi_parse.h"
#include "svga_context.h"
#include "svga_shader.h"
/**
 * Pipe callback: record the default tessellation levels used when no
 * tessellation control shader is bound.
 */
static void
svga_set_tess_state(struct pipe_context *pipe,
                    const float default_outer_level[4],
                    const float default_inner_level[2])
{
   struct svga_context *svga = svga_context(pipe);
   float *levels = svga->curr.default_tesslevels;

   /* Packed layout: [outer0..outer3, inner0, inner1]. */
   for (unsigned j = 0; j < 4; j++)
      levels[j] = default_outer_level[j];
   for (unsigned j = 0; j < 2; j++)
      levels[4 + j] = default_inner_level[j];
}
/**
 * Pipe callback: create a tessellation control shader state object.
 * Duplicates the TGSI tokens and scans them for basic shader info.
 * Returns NULL on allocation failure.
 */
static void *
svga_create_tcs_state(struct pipe_context *pipe,
                      const struct pipe_shader_state *templ)
{
   struct svga_context *svga = svga_context(pipe);
   struct svga_tcs_shader *tcs;

   tcs = CALLOC_STRUCT(svga_tcs_shader);
   if (!tcs)
      return NULL;

   SVGA_STATS_TIME_PUSH(svga_sws(svga), SVGA_STATS_TIME_CREATETCS);

   tcs->base.tokens = tgsi_dup_tokens(templ->tokens);
   if (!tcs->base.tokens) {
      /* tgsi_dup_tokens allocates and can fail; bail out cleanly instead
       * of passing NULL to tgsi_scan_shader and leaking tcs.
       */
      SVGA_STATS_TIME_POP(svga_sws(svga));
      FREE(tcs);
      return NULL;
   }

   /* Collect basic info that we'll need later: */
   tgsi_scan_shader(tcs->base.tokens, &tcs->base.info);

   /* Per-context unique shader id, used for debugging/tracing. */
   tcs->base.id = svga->debug.shader_id++;

   tcs->generic_outputs = svga_get_generic_outputs_mask(&tcs->base.info);

   SVGA_STATS_TIME_POP(svga_sws(svga));
   return tcs;
}
/**
 * Pipe callback: bind a tessellation control shader and flag the state
 * change for the next draw.
 */
static void
svga_bind_tcs_state(struct pipe_context *pipe, void *shader)
{
   struct svga_context *svga = svga_context(pipe);
   struct svga_tcs_shader *tcs = (struct svga_tcs_shader *) shader;

   /* Skip redundant binds. */
   if (svga->curr.tcs == tcs)
      return;

   svga->curr.tcs = tcs;
   svga->dirty |= SVGA_NEW_TCS;
}
/**
 * Pipe callback: delete a tessellation control shader state object.
 *
 * Walks the shader's 'next' chain (variant shaders created from this one)
 * and, for each, destroys all compiled variants before freeing the token
 * store and the shader struct itself.  Pending draws are flushed first so
 * the device is not left referencing a destroyed shader; if a variant being
 * destroyed is currently bound (hw_draw.tcs), the HS stage is unbound
 * (SVGA3D_SHADERTYPE_HS, retried on command-buffer overflow) before the
 * variant is destroyed.
 */
static void
svga_delete_tcs_state(struct pipe_context *pipe, void *shader)
{
   struct svga_context *svga = svga_context(pipe);
   struct svga_tcs_shader *tcs = (struct svga_tcs_shader *) shader;
   struct svga_tcs_shader *next_tcs;
   struct svga_shader_variant *variant, *tmp;

   /* Flush any queued drawing before destroying device shaders. */
   svga_hwtnl_flush_retry(svga);

   /* Only head-of-chain shaders should be deleted through this entry. */
   assert(tcs->base.parent == NULL);

   while (tcs) {
      /* Save the chain link before freeing the current node. */
      next_tcs = (struct svga_tcs_shader *)tcs->base.next;
      for (variant = tcs->base.variants; variant; variant = tmp) {
         tmp = variant->next;

         /* Check if deleting currently bound shader */
         if (variant == svga->state.hw_draw.tcs) {
            /* Unbind the HS stage first; retry on flush. */
            SVGA_RETRY(svga, svga_set_shader(svga, SVGA3D_SHADERTYPE_HS, NULL));
            svga->state.hw_draw.tcs = NULL;
         }
         svga_destroy_shader_variant(svga, variant);
      }

      FREE((void *)tcs->base.tokens);
      FREE(tcs);
      tcs = next_tcs;
   }
}
void
svga_cleanup_tcs_state(struct svga_context *svga)
{
if (svga->tcs.passthrough_tcs) {
svga_delete_tcs_state(&svga->pipe, svga->tcs.passthrough_tcs);
}
}
/**
 * Pipe callback: create a tessellation evaluation shader state object.
 * Duplicates the TGSI tokens and scans them for basic shader info.
 * Returns NULL on allocation failure.
 */
static void *
svga_create_tes_state(struct pipe_context *pipe,
                      const struct pipe_shader_state *templ)
{
   struct svga_context *svga = svga_context(pipe);
   struct svga_tes_shader *tes;

   tes = CALLOC_STRUCT(svga_tes_shader);
   if (!tes)
      return NULL;

   SVGA_STATS_TIME_PUSH(svga_sws(svga), SVGA_STATS_TIME_CREATETES);

   tes->base.tokens = tgsi_dup_tokens(templ->tokens);
   if (!tes->base.tokens) {
      /* tgsi_dup_tokens allocates and can fail; bail out cleanly instead
       * of passing NULL to tgsi_scan_shader and leaking tes.
       */
      SVGA_STATS_TIME_POP(svga_sws(svga));
      FREE(tes);
      return NULL;
   }

   /* Collect basic info that we'll need later: */
   tgsi_scan_shader(tes->base.tokens, &tes->base.info);

   /* Per-context unique shader id, used for debugging/tracing. */
   tes->base.id = svga->debug.shader_id++;

   tes->generic_inputs = svga_get_generic_inputs_mask(&tes->base.info);

   SVGA_STATS_TIME_POP(svga_sws(svga));
   return tes;
}
/**
 * Pipe callback: bind a tessellation evaluation shader and flag the state
 * change for the next draw.
 */
static void
svga_bind_tes_state(struct pipe_context *pipe, void *shader)
{
   struct svga_context *svga = svga_context(pipe);
   struct svga_tes_shader *tes = (struct svga_tes_shader *) shader;

   /* Skip redundant binds. */
   if (svga->curr.tes == tes)
      return;

   svga->curr.tes = tes;
   svga->dirty |= SVGA_NEW_TES;
}
/**
 * Pipe callback: delete a tessellation evaluation shader state object.
 *
 * Walks the shader's 'next' chain and, for each node, destroys all compiled
 * variants before freeing the token store and the shader struct.  Pending
 * draws are flushed first; if a variant being destroyed is currently bound
 * (hw_draw.tes), the DS stage is unbound (SVGA3D_SHADERTYPE_DS, retried on
 * command-buffer overflow) before the variant is destroyed.
 */
static void
svga_delete_tes_state(struct pipe_context *pipe, void *shader)
{
   struct svga_context *svga = svga_context(pipe);
   struct svga_tes_shader *tes = (struct svga_tes_shader *) shader;
   struct svga_tes_shader *next_tes;
   struct svga_shader_variant *variant, *tmp;

   /* Flush any queued drawing before destroying device shaders. */
   svga_hwtnl_flush_retry(svga);

   /* Only head-of-chain shaders should be deleted through this entry. */
   assert(tes->base.parent == NULL);

   while (tes) {
      /* Save the chain link before freeing the current node. */
      next_tes = (struct svga_tes_shader *)tes->base.next;
      for (variant = tes->base.variants; variant; variant = tmp) {
         tmp = variant->next;

         /* Check if deleting currently bound shader */
         if (variant == svga->state.hw_draw.tes) {
            /* Unbind the DS stage first; retry on flush. */
            SVGA_RETRY(svga, svga_set_shader(svga, SVGA3D_SHADERTYPE_DS, NULL));
            svga->state.hw_draw.tes = NULL;
         }
         svga_destroy_shader_variant(svga, variant);
      }

      FREE((void *)tes->base.tokens);
      FREE(tes);
      tes = next_tes;
   }
}
/**
 * Register the tessellation-related entry points (default tess levels,
 * TCS create/bind/delete, TES create/bind/delete) on the pipe context.
 */
void
svga_init_ts_functions(struct svga_context *svga)
{
   svga->pipe.set_tess_state = svga_set_tess_state;
   svga->pipe.create_tcs_state = svga_create_tcs_state;
   svga->pipe.bind_tcs_state = svga_bind_tcs_state;
   svga->pipe.delete_tcs_state = svga_delete_tcs_state;
   svga->pipe.create_tes_state = svga_create_tes_state;
   svga->pipe.bind_tes_state = svga_bind_tes_state;
   svga->pipe.delete_tes_state = svga_delete_tes_state;
}

View File

@ -166,6 +166,7 @@ svga_delete_vs_state(struct pipe_context *pipe, void *shader)
{
struct svga_context *svga = svga_context(pipe);
struct svga_vertex_shader *vs = (struct svga_vertex_shader *)shader;
struct svga_vertex_shader *next_vs;
struct svga_shader_variant *variant, *tmp;
enum pipe_error ret;
@ -173,37 +174,42 @@ svga_delete_vs_state(struct pipe_context *pipe, void *shader)
assert(vs->base.parent == NULL);
/* Check if there is a generated geometry shader to go with this
* vertex shader. If there is, then delete the geometry shader as well.
*/
if (vs->gs != NULL) {
svga->pipe.delete_gs_state(&svga->pipe, vs->gs);
}
while (vs) {
next_vs = (struct svga_vertex_shader *)vs->base.next;
if (vs->base.stream_output != NULL)
svga_delete_stream_output(svga, vs->base.stream_output);
draw_delete_vertex_shader(svga->swtnl.draw, vs->draw_shader);
for (variant = vs->base.variants; variant; variant = tmp) {
tmp = variant->next;
/* Check if deleting currently bound shader */
if (variant == svga->state.hw_draw.vs) {
ret = svga_set_shader(svga, SVGA3D_SHADERTYPE_VS, NULL);
if (ret != PIPE_OK) {
svga_context_flush(svga, NULL);
ret = svga_set_shader(svga, SVGA3D_SHADERTYPE_VS, NULL);
assert(ret == PIPE_OK);
}
svga->state.hw_draw.vs = NULL;
/* Check if there is a generated geometry shader to go with this
* vertex shader. If there is, then delete the geometry shader as well.
*/
if (vs->gs != NULL) {
svga->pipe.delete_gs_state(&svga->pipe, vs->gs);
}
svga_destroy_shader_variant(svga, variant);
}
if (vs->base.stream_output != NULL)
svga_delete_stream_output(svga, vs->base.stream_output);
FREE((void *)vs->base.tokens);
FREE(vs);
draw_delete_vertex_shader(svga->swtnl.draw, vs->draw_shader);
for (variant = vs->base.variants; variant; variant = tmp) {
tmp = variant->next;
/* Check if deleting currently bound shader */
if (variant == svga->state.hw_draw.vs) {
ret = svga_set_shader(svga, SVGA3D_SHADERTYPE_VS, NULL);
if (ret != PIPE_OK) {
svga_context_flush(svga, NULL);
ret = svga_set_shader(svga, SVGA3D_SHADERTYPE_VS, NULL);
assert(ret == PIPE_OK);
}
svga->state.hw_draw.vs = NULL;
}
svga_destroy_shader_variant(svga, variant);
}
FREE((void *)vs->base.tokens);
FREE(vs);
vs = next_vs;
}
}

View File

@ -53,7 +53,8 @@ svga_buffer_needs_hw_storage(const struct svga_screen *ss,
const struct pipe_resource *template)
{
unsigned bind_mask = (PIPE_BIND_VERTEX_BUFFER | PIPE_BIND_INDEX_BUFFER |
PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_STREAM_OUTPUT);
PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_STREAM_OUTPUT |
PIPE_BIND_SHADER_BUFFER | PIPE_BIND_COMMAND_ARGS_BUFFER);
if (ss->sws->have_vgpu10) {
/*
@ -478,6 +479,9 @@ svga_buffer_create(struct pipe_screen *screen,
*/
bind_flags |= (PIPE_BIND_VERTEX_BUFFER |
PIPE_BIND_INDEX_BUFFER);
/* It may be used for shader resource as well. */
bind_flags |= PIPE_BIND_SAMPLER_VIEW;
}
if (svga_buffer_create_host_surface(ss, sbuf, bind_flags) != PIPE_OK)

View File

@ -175,6 +175,11 @@ svga_buffer_create_host_surface(struct svga_screen *ss,
if (bind_flags & PIPE_BIND_SAMPLER_VIEW)
sbuf->key.flags |= SVGA3D_SURFACE_BIND_SHADER_RESOURCE;
if (bind_flags & PIPE_BIND_COMMAND_ARGS_BUFFER) {
assert(ss->sws->have_sm5);
sbuf->key.flags |= SVGA3D_SURFACE_DRAWINDIRECT_ARGS;
}
if (!bind_flags && sbuf->b.b.usage == PIPE_USAGE_STAGING) {
/* This surface is to be used with the
* SVGA3D_CMD_DX_TRANSFER_FROM_BUFFER command, and no other

View File

@ -133,26 +133,25 @@ svga_transfer_dma(struct svga_context *svga,
}
}
else {
int y, h, y_max;
int y, h, srcy;
unsigned blockheight =
util_format_get_blockheight(st->base.resource->format);
h = st->hw_nblocksy * blockheight;
y_max = st->box.y + st->box.h;
srcy = 0;
for (y = st->box.y; y < y_max; y += h) {
for (y = 0; y < st->box.h; y += h) {
unsigned offset, length;
void *hw, *sw;
if (y + h > y_max)
h = y_max - y;
if (y + h > st->box.h)
h = st->box.h - y;
/* Transfer band must be aligned to pixel block boundaries */
assert(y % blockheight == 0);
assert(h % blockheight == 0);
/* First band starts at the top of the SW buffer. */
offset = (y - st->box.y) * st->base.stride / blockheight;
offset = y * st->base.stride / blockheight;
length = h * st->base.stride / blockheight;
sw = (uint8_t *) st->swbuf + offset;
@ -160,9 +159,9 @@ svga_transfer_dma(struct svga_context *svga,
if (transfer == SVGA3D_WRITE_HOST_VRAM) {
unsigned usage = PIPE_TRANSFER_WRITE;
/* Don't write to an in-flight DMA buffer. Synchronize or
* discard in-flight storage. */
if (y != st->box.y) {
/* Wait for the previous DMAs to complete */
/* TODO: keep one DMA (at half the size) in the background */
if (y) {
svga_context_flush(svga, NULL);
usage |= PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE;
}
@ -178,7 +177,7 @@ svga_transfer_dma(struct svga_context *svga,
svga_transfer_dma_band(svga, st, transfer,
st->box.x, y, st->box.z,
st->box.w, h, st->box.d,
0, 0, 0, flags);
0, srcy, 0, flags);
/*
* Prevent the texture contents to be discarded on the next band
@ -488,6 +487,18 @@ svga_texture_transfer_map_direct(struct svga_context *svga,
svga_context_flush(svga, NULL);
}
if (map && rebind) {
enum pipe_error ret;
ret = SVGA3D_BindGBSurface(swc, surf);
if (ret != PIPE_OK) {
svga_context_flush(svga, NULL);
ret = SVGA3D_BindGBSurface(swc, surf);
assert(ret == PIPE_OK);
}
svga_context_flush(svga, NULL);
}
/*
* Make sure we return NULL if the map fails
*/

View File

@ -49,6 +49,10 @@
/* NOTE: this constant may get moved into a svga3d*.h header file */
#define SVGA3D_DX_MAX_RESOURCE_SIZE (128 * 1024 * 1024)
#ifndef MESA_GIT_SHA1
#define MESA_GIT_SHA1 "(unknown git revision)"
#endif
#ifdef DEBUG
int SVGA_DEBUG = 0;
@ -249,7 +253,8 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
12 /* 2048x2048 */);
case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS:
return sws->have_vgpu10 ? SVGA3D_MAX_SURFACE_ARRAYSIZE : 0;
return sws->have_sm5 ? SVGA3D_SM5_MAX_SURFACE_ARRAYSIZE :
(sws->have_vgpu10 ? SVGA3D_SM4_MAX_SURFACE_ARRAYSIZE : 0);
case PIPE_CAP_BLEND_EQUATION_SEPARATE: /* req. for GL 1.5 */
return 1;
@ -266,7 +271,7 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
case PIPE_CAP_VERTEX_COLOR_UNCLAMPED:
return 1; /* The color outputs of vertex shaders are not clamped */
case PIPE_CAP_VERTEX_COLOR_CLAMPED:
return 0; /* The driver can't clamp vertex colors */
return sws->have_vgpu10;
case PIPE_CAP_FRAGMENT_COLOR_CLAMPED:
return 0; /* The driver can't clamp fragment colors */
@ -274,10 +279,16 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
return 1; /* expected for GL_ARB_framebuffer_object */
case PIPE_CAP_GLSL_FEATURE_LEVEL:
return sws->have_vgpu10 ? 330 : 120;
if (sws->have_sm5) {
return 410;
} else if (sws->have_vgpu10) {
return 330;
} else {
return 120;
}
case PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY:
return sws->have_vgpu10 ? 330 : 120;
return sws->have_sm5 ? 410 : (sws->have_vgpu10 ? 330 : 120);
case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
case PIPE_CAP_DEPTH_CLIP_DISABLE_SEPARATE:
@ -303,10 +314,12 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS:
return sws->have_vgpu10 ? 4 : 0;
case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS:
return sws->have_vgpu10 ? SVGA3D_MAX_STREAMOUT_DECLS : 0;
return sws->have_sm5 ? SVGA3D_MAX_STREAMOUT_DECLS :
(sws->have_vgpu10 ? SVGA3D_MAX_DX10_STREAMOUT_DECLS : 0);
case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME:
return sws->have_sm5;
case PIPE_CAP_STREAM_OUTPUT_INTERLEAVE_BUFFERS:
return 0;
return sws->have_sm5;
case PIPE_CAP_TEXTURE_MULTISAMPLE:
return svgascreen->ms_samples ? 1 : 0;
@ -350,7 +363,16 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
return sws->have_sm4_1;
case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS:
return sws->have_sm4_1 ? 1 : 0; /* only single-channel textures */
/* SM4_1 supports only single-channel textures where as SM5 supports
* all four channel textures */
return sws->have_sm5 ? 4 :
(sws->have_sm4_1 ? 1 : 0);
case PIPE_CAP_DRAW_INDIRECT:
return sws->have_sm5;
case PIPE_CAP_MAX_VERTEX_STREAMS:
return sws->have_sm5 ? 4 : 0;
case PIPE_CAP_COMPUTE:
return 0;
case PIPE_CAP_MAX_VARYINGS:
return sws->have_vgpu10 ? VGPU10_MAX_FS_INPUTS : 10;
case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT:
@ -362,9 +384,7 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
case PIPE_CAP_SHADER_STENCIL_EXPORT:
case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
case PIPE_CAP_TEXTURE_BARRIER:
case PIPE_CAP_MAX_VERTEX_STREAMS:
case PIPE_CAP_TGSI_CAN_COMPACT_CONSTANTS:
case PIPE_CAP_COMPUTE:
case PIPE_CAP_START_INSTANCE:
case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT:
case PIPE_CAP_QUERY_PIPELINE_STATISTICS:
@ -372,7 +392,6 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
case PIPE_CAP_TEXTURE_GATHER_SM5:
case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION:
case PIPE_CAP_DRAW_INDIRECT:
case PIPE_CAP_MULTI_DRAW_INDIRECT:
case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
@ -410,7 +429,10 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE:
return 2048;
case PIPE_CAP_MAX_VIEWPORTS:
return 1;
assert((!sws->have_vgpu10 && svgascreen->max_viewports == 1) ||
(sws->have_vgpu10 &&
svgascreen->max_viewports == SVGA3D_DX_MAX_VIEWPORTS));
return svgascreen->max_viewports;
case PIPE_CAP_ENDIANNESS:
return PIPE_ENDIAN_LITTLE;
@ -427,10 +449,11 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
return sws->have_vgpu10;
case PIPE_CAP_CLEAR_TEXTURE:
return sws->have_vgpu10;
case PIPE_CAP_DOUBLES:
return sws->have_sm5;
case PIPE_CAP_UMA:
case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
case PIPE_CAP_DEPTH_BOUNDS_TEST:
@ -453,7 +476,6 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
case PIPE_CAP_GLSL_OPTIMIZE_CONSERVATIVELY:
case PIPE_CAP_FBFETCH:
case PIPE_CAP_TGSI_MUL_ZERO_WINS:
case PIPE_CAP_DOUBLES:
case PIPE_CAP_INT64:
case PIPE_CAP_INT64_DIVMOD:
case PIPE_CAP_TGSI_TEX_TXF_LZ:
@ -487,6 +509,9 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
return 32;
case PIPE_CAP_MAX_SHADER_BUFFER_SIZE:
return 1 << 27;
/* Verify this once protocol is finalized. Setting it to minimum value. */
case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
return sws->have_sm5 ? 30 : 0;
default:
return u_pipe_screen_get_param_defaults(screen, param);
}
@ -674,12 +699,12 @@ vgpu10_get_shader_param(struct pipe_screen *screen,
assert(sws->have_vgpu10);
(void) sws; /* silence unused var warnings in non-debug builds */
/* Only VS, GS, FS supported */
if (shader != PIPE_SHADER_VERTEX &&
shader != PIPE_SHADER_GEOMETRY &&
shader != PIPE_SHADER_FRAGMENT) {
if ((!sws->have_sm5) &&
(shader == PIPE_SHADER_TESS_CTRL || shader == PIPE_SHADER_TESS_EVAL))
return 0;
if (shader == PIPE_SHADER_COMPUTE)
return 0;
}
/* NOTE: we do not query the device for any caps/limits at this time */
@ -697,6 +722,10 @@ vgpu10_get_shader_param(struct pipe_screen *screen,
return VGPU10_MAX_FS_INPUTS;
else if (shader == PIPE_SHADER_GEOMETRY)
return VGPU10_MAX_GS_INPUTS;
else if (shader == PIPE_SHADER_TESS_CTRL)
return VGPU11_MAX_HS_INPUTS;
else if (shader == PIPE_SHADER_TESS_EVAL)
return VGPU11_MAX_DS_INPUT_CONTROL_POINTS;
else
return VGPU10_MAX_VS_INPUTS;
case PIPE_SHADER_CAP_MAX_OUTPUTS:
@ -704,6 +733,10 @@ vgpu10_get_shader_param(struct pipe_screen *screen,
return VGPU10_MAX_FS_OUTPUTS;
else if (shader == PIPE_SHADER_GEOMETRY)
return VGPU10_MAX_GS_OUTPUTS;
else if (shader == PIPE_SHADER_TESS_CTRL)
return VGPU11_MAX_HS_OUTPUTS;
else if (shader == PIPE_SHADER_TESS_EVAL)
return VGPU11_MAX_DS_OUTPUTS;
else
return VGPU10_MAX_VS_OUTPUTS;
case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE:
@ -844,6 +877,8 @@ svga_get_driver_query_info(struct pipe_screen *screen,
PIPE_DRIVER_QUERY_TYPE_UINT64),
QUERY("num-bytes-uploaded", SVGA_QUERY_NUM_BYTES_UPLOADED,
PIPE_DRIVER_QUERY_TYPE_BYTES),
QUERY("num-command-buffers", SVGA_QUERY_NUM_COMMAND_BUFFERS,
PIPE_DRIVER_QUERY_TYPE_UINT64),
QUERY("command-buffer-size", SVGA_QUERY_COMMAND_BUFFER_SIZE,
PIPE_DRIVER_QUERY_TYPE_BYTES),
QUERY("flush-time", SVGA_QUERY_FLUSH_TIME,
@ -860,6 +895,10 @@ svga_get_driver_query_info(struct pipe_screen *screen,
PIPE_DRIVER_QUERY_TYPE_UINT64),
QUERY("num-const-updates", SVGA_QUERY_NUM_CONST_UPDATES,
PIPE_DRIVER_QUERY_TYPE_UINT64),
QUERY("num-shader-relocations", SVGA_QUERY_NUM_SHADER_RELOCATIONS,
PIPE_DRIVER_QUERY_TYPE_UINT64),
QUERY("num-surface-relocations", SVGA_QUERY_NUM_SURFACE_RELOCATIONS,
PIPE_DRIVER_QUERY_TYPE_UINT64),
/* running total counters */
QUERY("memory-used", SVGA_QUERY_MEMORY_USED,
@ -878,6 +917,8 @@ svga_get_driver_query_info(struct pipe_screen *screen,
PIPE_DRIVER_QUERY_TYPE_UINT64),
QUERY("num-commands-per-draw", SVGA_QUERY_NUM_COMMANDS_PER_DRAW,
PIPE_DRIVER_QUERY_TYPE_FLOAT),
QUERY("shader-mem-used", SVGA_QUERY_SHADER_MEM_USED,
PIPE_DRIVER_QUERY_TYPE_UINT64),
};
#undef QUERY
@ -1012,9 +1053,10 @@ svga_screen_create(struct svga_winsys_screen *sws)
goto error2;
}
debug_printf("%s enabled = %u\n",
sws->have_sm4_1 ? "SM4_1" : "VGPU10",
sws->have_sm4_1 ? 1 : sws->have_vgpu10);
debug_printf("%s enabled\n",
sws->have_sm5 ? "SM5" :
sws->have_sm4_1 ? "SM4_1" :
sws->have_vgpu10 ? "VGPU10" : "VGPU9");
debug_printf("Mesa: %s %s (%s)\n", svga_get_name(screen),
PACKAGE_VERSION, MESA_GIT_SHA1);
@ -1081,13 +1123,23 @@ svga_screen_create(struct svga_winsys_screen *sws)
svgascreen->ms_samples |= 1 << 3;
}
if (sws->have_sm5 && debug_get_bool_option("SVGA_MSAA", TRUE)) {
if (get_bool_cap(sws, SVGA3D_DEVCAP_MULTISAMPLE_8X, FALSE))
svgascreen->ms_samples |= 1 << 7;
}
/* Maximum number of constant buffers */
svgascreen->max_const_buffers =
get_uint_cap(sws, SVGA3D_DEVCAP_DX_MAX_CONSTANT_BUFFERS, 1);
svgascreen->max_const_buffers = MIN2(svgascreen->max_const_buffers,
SVGA_MAX_CONST_BUFS);
svgascreen->haveBlendLogicops =
get_bool_cap(sws, SVGA3D_DEVCAP_LOGIC_BLENDOPS, FALSE);
screen->is_format_supported = svga_is_dx_format_supported;
svgascreen->max_viewports = SVGA3D_DX_MAX_VIEWPORTS;
}
else {
/* VGPU9 */
@ -1122,6 +1174,9 @@ svga_screen_create(struct svga_winsys_screen *sws)
/* No multisampling */
svgascreen->ms_samples = 0;
/* Only one viewport */
svgascreen->max_viewports = 1;
}
/* common VGPU9 / VGPU10 caps */

View File

@ -50,10 +50,13 @@ struct svga_screen
/** Device caps */
boolean haveProvokingVertex;
boolean haveLineStipple, haveLineSmooth;
boolean haveBlendLogicops;
float maxLineWidth, maxLineWidthAA;
float maxPointSize;
float pointSmoothThreshold; /** Disable point AA for sizes less than this */
unsigned max_color_buffers;
unsigned max_const_buffers;
unsigned max_viewports;
unsigned ms_samples;
struct {

View File

@ -311,6 +311,9 @@ svga_screen_cache_add(struct svga_screen *svgascreen,
}
/* Maximum number of invalidate surface commands in a command buffer */
# define SVGA_MAX_SURFACE_TO_INVALIDATE 1000
/**
* Called during the screen flush to move all buffers not in a validate list
* into the unused list.
@ -354,6 +357,7 @@ svga_screen_cache_flush(struct svga_screen *svgascreen,
next = curr->next;
}
unsigned nsurf = 0;
curr = cache->validated.next;
next = curr->next;
while (curr != &cache->validated) {
@ -381,12 +385,14 @@ svga_screen_cache_flush(struct svga_screen *svgascreen,
* this function itself is called inside svga_context_flush().
*/
svga->swc->flush(svga->swc, NULL);
nsurf = 0;
ret = SVGA3D_InvalidateGBSurface(svga->swc, entry->handle);
assert(ret == PIPE_OK);
}
/* add the entry to the invalidated list */
list_add(&entry->head, &cache->invalidated);
nsurf++;
}
curr = next;
@ -394,6 +400,16 @@ svga_screen_cache_flush(struct svga_screen *svgascreen,
}
mtx_unlock(&cache->mutex);
/**
* In some rare cases (when running ARK survival), we hit the max number
* of surface relocations with invalidated surfaces during context flush.
* So if the number of invalidated surface exceeds a certain limit (1000),
* we'll do another winsys flush.
*/
if (nsurf > SVGA_MAX_SURFACE_TO_INVALIDATE) {
svga->swc->flush(svga->swc, NULL);
}
}

View File

@ -229,22 +229,25 @@ static const enum pipe_swizzle set_XXXY[PIPE_SWIZZLE_MAX] = {
*/
void
svga_init_shader_key_common(const struct svga_context *svga,
enum pipe_shader_type shader,
enum pipe_shader_type shader_type,
const struct svga_shader *shader,
struct svga_compile_key *key)
{
unsigned i, idx = 0;
assert(shader < ARRAY_SIZE(svga->curr.num_sampler_views));
assert(shader_type < ARRAY_SIZE(svga->curr.num_sampler_views));
/* In case the number of samplers and sampler_views doesn't match,
* loop over the lower of the two counts.
*/
key->num_textures = MAX2(svga->curr.num_sampler_views[shader],
svga->curr.num_samplers[shader]);
key->num_textures = MAX2(svga->curr.num_sampler_views[shader_type],
svga->curr.num_samplers[shader_type]);
for (i = 0; i < key->num_textures; i++) {
struct pipe_sampler_view *view = svga->curr.sampler_views[shader][i];
const struct svga_sampler_state *sampler = svga->curr.sampler[shader][i];
struct pipe_sampler_view *view = svga->curr.sampler_views[shader_type][i];
const struct svga_sampler_state
*sampler = svga->curr.sampler[shader_type][i];
if (view) {
assert(view->texture);
assert(view->texture->target < (1 << 4)); /* texture_target:4 */
@ -304,6 +307,12 @@ svga_init_shader_key_common(const struct svga_context *svga,
if (view->texture->format == PIPE_FORMAT_DXT1_RGB ||
view->texture->format == PIPE_FORMAT_DXT1_SRGB)
swizzle_tab = set_alpha;
/* Save the compare function as we need to handle
* depth compare in the shader.
*/
key->tex[i].compare_mode = sampler->compare_mode;
key->tex[i].compare_func = sampler->compare_func;
}
key->tex[i].swizzle_r = swizzle_tab[view->swizzle_r];
@ -314,8 +323,10 @@ svga_init_shader_key_common(const struct svga_context *svga,
if (sampler) {
if (!sampler->normalized_coords) {
assert(idx < (1 << 5)); /* width_height_idx:5 bitfield */
key->tex[i].width_height_idx = idx++;
if (view) {
assert(idx < (1 << 5)); /* width_height_idx:5 bitfield */
key->tex[i].width_height_idx = idx++;
}
key->tex[i].unnormalized = TRUE;
++key->num_unnormalized_coords;
@ -326,6 +337,9 @@ svga_init_shader_key_common(const struct svga_context *svga,
}
}
}
key->clamp_vertex_color = svga->curr.rast ?
svga->curr.rast->templ.clamp_vertex_color : 0;
}
@ -380,6 +394,8 @@ define_gb_shader_vgpu9(struct svga_context *svga,
variant->gb_shader = sws->shader_create(sws, variant->type,
variant->tokens, codeLen);
svga->hud.shader_mem_used += codeLen;
if (!variant->gb_shader)
return PIPE_ERROR_OUT_OF_MEMORY;
@ -398,6 +414,7 @@ define_gb_shader_vgpu10(struct svga_context *svga,
{
struct svga_winsys_context *swc = svga->swc;
enum pipe_error ret;
unsigned len = codeLen + variant->signatureLen;
/**
* Shaders in VGPU10 enabled device reside in the device COTable.
@ -412,7 +429,11 @@ define_gb_shader_vgpu10(struct svga_context *svga,
/* Create gb memory for the shader and upload the shader code */
variant->gb_shader = swc->shader_create(swc,
variant->id, variant->type,
variant->tokens, codeLen);
variant->tokens, codeLen,
variant->signature,
variant->signatureLen);
svga->hud.shader_mem_used += len;
if (!variant->gb_shader) {
/* Free the shader ID */
@ -429,7 +450,8 @@ define_gb_shader_vgpu10(struct svga_context *svga,
* the shader creation and return an error.
*/
ret = SVGA3D_vgpu10_DefineAndBindShader(swc, variant->gb_shader,
variant->id, variant->type, codeLen);
variant->id, variant->type,
len);
if (ret != PIPE_OK)
goto fail;
@ -511,7 +533,10 @@ svga_set_shader(struct svga_context *svga,
assert(type == SVGA3D_SHADERTYPE_VS ||
type == SVGA3D_SHADERTYPE_GS ||
type == SVGA3D_SHADERTYPE_PS);
type == SVGA3D_SHADERTYPE_PS ||
type == SVGA3D_SHADERTYPE_HS ||
type == SVGA3D_SHADERTYPE_DS ||
type == SVGA3D_SHADERTYPE_CS);
if (svga_have_gb_objects(svga)) {
struct svga_winsys_gb_shader *gbshader =
@ -533,7 +558,27 @@ svga_set_shader(struct svga_context *svga,
struct svga_shader_variant *
svga_new_shader_variant(struct svga_context *svga, enum pipe_shader_type type)
{
struct svga_shader_variant *variant = CALLOC_STRUCT(svga_shader_variant);
struct svga_shader_variant *variant;
switch (type) {
case PIPE_SHADER_FRAGMENT:
variant = CALLOC(1, sizeof(struct svga_fs_variant));
break;
case PIPE_SHADER_GEOMETRY:
variant = CALLOC(1, sizeof(struct svga_gs_variant));
break;
case PIPE_SHADER_VERTEX:
variant = CALLOC(1, sizeof(struct svga_vs_variant));
break;
case PIPE_SHADER_TESS_EVAL:
variant = CALLOC(1, sizeof(struct svga_tes_variant));
break;
case PIPE_SHADER_TESS_CTRL:
variant = CALLOC(1, sizeof(struct svga_tcs_variant));
break;
default:
return NULL;
}
if (variant) {
variant->type = svga_shader_type(type);
@ -547,19 +592,11 @@ void
svga_destroy_shader_variant(struct svga_context *svga,
struct svga_shader_variant *variant)
{
enum pipe_error ret = PIPE_OK;
if (svga_have_gb_objects(svga) && variant->gb_shader) {
if (svga_have_vgpu10(svga)) {
struct svga_winsys_context *swc = svga->swc;
swc->shader_destroy(swc, variant->gb_shader);
ret = SVGA3D_vgpu10_DestroyShader(svga->swc, variant->id);
if (ret != PIPE_OK) {
/* flush and try again */
svga_context_flush(svga, NULL);
ret = SVGA3D_vgpu10_DestroyShader(svga->swc, variant->id);
assert(ret == PIPE_OK);
}
SVGA_RETRY(svga, SVGA3D_vgpu10_DestroyShader(svga->swc, variant->id));
util_bitmask_clear(svga->shader_id_bm, variant->id);
}
else {
@ -570,17 +607,13 @@ svga_destroy_shader_variant(struct svga_context *svga,
}
else {
if (variant->id != UTIL_BITMASK_INVALID_INDEX) {
ret = SVGA3D_DestroyShader(svga->swc, variant->id, variant->type);
if (ret != PIPE_OK) {
/* flush and try again */
svga_context_flush(svga, NULL);
ret = SVGA3D_DestroyShader(svga->swc, variant->id, variant->type);
assert(ret == PIPE_OK);
}
SVGA_RETRY(svga, SVGA3D_DestroyShader(svga->swc, variant->id,
variant->type));
util_bitmask_clear(svga->shader_id_bm, variant->id);
}
}
FREE(variant->signature);
FREE((unsigned *)variant->tokens);
FREE(variant);
@ -612,6 +645,8 @@ svga_rebind_shaders(struct svga_context *svga)
svga->rebind.flags.vs = 0;
svga->rebind.flags.gs = 0;
svga->rebind.flags.fs = 0;
svga->rebind.flags.tcs = 0;
svga->rebind.flags.tes = 0;
return PIPE_OK;
}
@ -637,5 +672,19 @@ svga_rebind_shaders(struct svga_context *svga)
}
svga->rebind.flags.fs = 0;
if (svga->rebind.flags.tcs && hw->tcs && hw->tcs->gb_shader) {
ret = swc->resource_rebind(swc, NULL, hw->tcs->gb_shader, SVGA_RELOC_READ);
if (ret != PIPE_OK)
return ret;
}
svga->rebind.flags.tcs = 0;
if (svga->rebind.flags.tes && hw->tes && hw->tes->gb_shader) {
ret = swc->resource_rebind(swc, NULL, hw->tes->gb_shader, SVGA_RELOC_READ);
if (ret != PIPE_OK)
return ret;
}
svga->rebind.flags.tes = 0;
return PIPE_OK;
}

View File

@ -68,6 +68,8 @@ struct svga_compile_key
unsigned need_prescale:1;
unsigned writes_psize:1;
unsigned wide_point:1;
unsigned writes_viewport_index:1;
unsigned num_prescale:5;
} gs;
/* fragment shader only */
@ -83,15 +85,42 @@ struct svga_compile_key
unsigned alpha_func:4; /**< SVGA3D_CMP_x */
unsigned write_color0_to_n_cbufs:4;
unsigned aa_point:1;
unsigned layer_to_zero:1;
int aa_point_coord_index;
float alpha_ref;
} fs;
/* tessellation control shader */
struct {
unsigned vertices_per_patch:8;
enum pipe_prim_type prim_mode:8;
enum pipe_tess_spacing spacing:3;
unsigned vertices_order_cw:1;
unsigned point_mode:1;
unsigned passthrough:1;
} tcs;
/* tessellation evaluation shader */
struct {
unsigned vertices_per_patch:8;
unsigned tessfactor_index:8;
unsigned need_prescale:1;
unsigned need_tessouter:1;
unsigned need_tessinner:1;
} tes;
/* compute shader */
struct {
unsigned grid_size[3];
} cs;
/* any shader type */
int8_t generic_remap_table[MAX_GENERIC_VARYING];
unsigned num_textures:8;
unsigned num_unnormalized_coords:8;
unsigned clip_plane_enable:PIPE_MAX_CLIP_PLANES;
unsigned last_vertex_stage:1;
unsigned clamp_vertex_color:1;
unsigned sprite_origin_lower_left:1;
uint16_t sprite_coord_enable;
struct {
@ -121,6 +150,10 @@ struct svga_token_key {
unsigned writes_psize:1;
unsigned aa_point:1;
} gs;
struct {
unsigned write_position:1;
} vs;
unsigned dynamic_indexing:1;
};
/**
@ -143,6 +176,10 @@ struct svga_shader_variant
const unsigned *tokens;
unsigned nr_tokens;
/* shader signature */
unsigned signatureLen;
SVGA3dDXShaderSignatureHeader *signature;
/** Per-context shader identifier used with SVGA_3D_CMD_SHADER_DEFINE,
* SVGA_3D_CMD_SET_SHADER and SVGA_3D_CMD_SHADER_DESTROY.
*/
@ -154,6 +191,18 @@ struct svga_shader_variant
/* GB object buffer containing the bytecode */
struct svga_winsys_gb_shader *gb_shader;
/** Next variant */
struct svga_shader_variant *next;
};
/**
* Shader variant for fragment shader
*/
struct svga_fs_variant
{
struct svga_shader_variant base;
boolean uses_flat_interp; /** TRUE if flat interpolation qualifier is
* applied to any of the varyings.
*/
@ -168,9 +217,56 @@ struct svga_shader_variant
/** For FS-based polygon stipple */
unsigned pstipple_sampler_unit;
};
/** Next variant */
struct svga_shader_variant *next;
/**
* Shader variant for geometry shader
*/
struct svga_gs_variant
{
struct svga_shader_variant base;
};
/**
* Shader variant for vertex shader
*/
struct svga_vs_variant
{
struct svga_shader_variant base;
};
/**
* Shader variant for tessellation evaluation shader
*/
struct svga_tes_variant
{
struct svga_shader_variant base;
enum pipe_prim_type prim_mode:8;
enum pipe_tess_spacing spacing:3;
unsigned vertices_order_cw:1;
unsigned point_mode:1;
};
/**
* Shader variant for tessellation control shader
*/
struct svga_tcs_variant
{
struct svga_shader_variant base;
};
/**
* Shader variant for compute shader
*/
struct svga_cs_variant
{
struct svga_shader_variant base;
};
@ -237,6 +333,30 @@ struct svga_geometry_shader
};
struct svga_tcs_shader
{
struct svga_shader base;
/** Mask of which generic varying variables are written by this shader */
uint64_t generic_outputs;
};
/** Tessellation evaluation shader state (wraps the common shader base). */
struct svga_tes_shader
{
   struct svga_shader base;

   /** Mask of which generic varying variables are read by this shader
    * (field name says inputs; original comment said "written" — likely a
    * copy/paste from the TCS struct above).
    */
   uint64_t generic_inputs;
};
/** Compute shader state (wraps the common shader base). */
struct svga_compute_shader
{
   struct svga_shader base;
};
static inline boolean
svga_compile_keys_equal(const struct svga_compile_key *a,
const struct svga_compile_key *b)
@ -264,7 +384,8 @@ svga_remap_generic_index(int8_t remap_table[MAX_GENERIC_VARYING],
void
svga_init_shader_key_common(const struct svga_context *svga,
enum pipe_shader_type shader,
enum pipe_shader_type shader_type,
const struct svga_shader *shader,
struct svga_compile_key *key);
struct svga_shader_variant *
@ -328,6 +449,12 @@ svga_shader_type(enum pipe_shader_type shader)
return SVGA3D_SHADERTYPE_GS;
case PIPE_SHADER_FRAGMENT:
return SVGA3D_SHADERTYPE_PS;
case PIPE_SHADER_TESS_CTRL:
return SVGA3D_SHADERTYPE_HS;
case PIPE_SHADER_TESS_EVAL:
return SVGA3D_SHADERTYPE_DS;
case PIPE_SHADER_COMPUTE:
return SVGA3D_SHADERTYPE_CS;
default:
assert(!"Invalid shader type");
return SVGA3D_SHADERTYPE_VS;
@ -351,4 +478,39 @@ svga_have_gs_streamout(const struct svga_context *svga)
}
static inline struct svga_fs_variant *
svga_fs_variant(struct svga_shader_variant *variant)
{
assert(!variant || variant->type == SVGA3D_SHADERTYPE_PS);
return (struct svga_fs_variant *)variant;
}
static inline struct svga_tes_variant *
svga_tes_variant(struct svga_shader_variant *variant)
{
assert(!variant || variant->type == SVGA3D_SHADERTYPE_DS);
return (struct svga_tes_variant *)variant;
}
static inline struct svga_cs_variant *
svga_cs_variant(struct svga_shader_variant *variant)
{
assert(!variant || variant->type == SVGA3D_SHADERTYPE_CS);
return (struct svga_cs_variant *)variant;
}
/* Returns TRUE if we are currently using flat shading.
*/
static inline boolean
svga_is_using_flat_shading(const struct svga_context *svga)
{
return
svga->state.hw_draw.fs ?
svga_fs_variant(svga->state.hw_draw.fs)->uses_flat_interp : FALSE;
}
#endif /* SVGA_SHADER_H */

View File

@ -60,19 +60,40 @@ static const struct svga_tracked_state *hw_clear_state[] =
};
/* Atoms to update hardware state prior to emitting a draw packet.
/**
* Atoms to update hardware state prior to emitting a draw packet
* for VGPU9 device.
*/
static const struct svga_tracked_state *hw_draw_state[] =
static const struct svga_tracked_state *hw_draw_state_vgpu9[] =
{
&svga_hw_fs,
&svga_hw_vs,
&svga_hw_rss,
&svga_hw_tss,
&svga_hw_tss_binding,
&svga_hw_clip_planes,
&svga_hw_vdecl,
&svga_hw_fs_constants,
&svga_hw_vs_constants,
NULL
};
/**
* Atoms to update hardware state prior to emitting a draw packet
* for VGPU10 device.
* Geometry Shader is new to VGPU10.
* TSS and TSS bindings are replaced by sampler and sampler bindings.
*/
static const struct svga_tracked_state *hw_draw_state_vgpu10[] =
{
&svga_need_tgsi_transform,
&svga_hw_fs,
&svga_hw_gs,
&svga_hw_vs,
&svga_hw_rss,
&svga_hw_sampler, /* VGPU10 */
&svga_hw_sampler_bindings, /* VGPU10 */
&svga_hw_tss, /* pre-VGPU10 */
&svga_hw_tss_binding, /* pre-VGPU10 */
&svga_hw_sampler,
&svga_hw_sampler_bindings,
&svga_hw_clip_planes,
&svga_hw_vdecl,
&svga_hw_fs_constants,
@ -82,6 +103,33 @@ static const struct svga_tracked_state *hw_draw_state[] =
};
/**
 * Atoms to update hardware state prior to emitting a draw packet
 * for SM5 device.
 * TCS and TES Shaders are new to SM5 device.
 * Atom order matters: shader atoms run before the constant-buffer atoms
 * that read the resulting variants.
 */
static const struct svga_tracked_state *hw_draw_state_sm5[] =
{
   &svga_need_tgsi_transform,
   &svga_hw_fs,
   &svga_hw_gs,
   &svga_hw_tes,             /* new for SM5 */
   &svga_hw_tcs,             /* new for SM5 */
   &svga_hw_vs,
   &svga_hw_rss,
   &svga_hw_sampler,
   &svga_hw_sampler_bindings,
   &svga_hw_clip_planes,
   &svga_hw_vdecl,
   &svga_hw_fs_constants,
   &svga_hw_gs_constants,
   &svga_hw_tes_constants,   /* new for SM5 */
   &svga_hw_tcs_constants,   /* new for SM5 */
   &svga_hw_vs_constants,
   NULL                      /* list terminator */
};
static const struct svga_tracked_state *swtnl_draw_state[] =
{
&svga_update_swtnl_draw,
@ -89,6 +137,7 @@ static const struct svga_tracked_state *swtnl_draw_state[] =
NULL
};
/* Flattens the graph of state dependencies. Could swap the positions
* of hw_clear_state and need_swtnl_state without breaking anything.
*/
@ -96,27 +145,26 @@ static const struct svga_tracked_state **state_levels[] =
{
need_swtnl_state,
hw_clear_state,
hw_draw_state,
NULL, /* hw_draw_state, to be set to the right version */
swtnl_draw_state
};
static unsigned
check_state(unsigned a, unsigned b)
static uint64_t
check_state(uint64_t a, uint64_t b)
{
return (a & b);
}
static void
accumulate_state(unsigned *a, unsigned b)
accumulate_state(uint64_t *a, uint64_t b)
{
*a |= b;
}
static void
xor_states(unsigned *result, unsigned a, unsigned b)
xor_states(uint64_t *result, uint64_t a, uint64_t b)
{
*result = a ^ b;
}
@ -125,7 +173,7 @@ xor_states(unsigned *result, unsigned a, unsigned b)
static enum pipe_error
update_state(struct svga_context *svga,
const struct svga_tracked_state *atoms[],
unsigned *state)
uint64_t *state)
{
#ifdef DEBUG
boolean debug = TRUE;
@ -144,13 +192,13 @@ update_state(struct svga_context *svga,
* state flags which are generated and checked to help ensure
* state atoms are ordered correctly in the list.
*/
unsigned examined, prev;
uint64_t examined, prev;
examined = 0;
prev = *state;
for (i = 0; atoms[i] != NULL; i++) {
unsigned generated;
uint64_t generated;
assert(atoms[i]->dirty);
assert(atoms[i]->update);
@ -247,12 +295,7 @@ svga_update_state_retry(struct svga_context *svga, unsigned max_level)
{
enum pipe_error ret;
ret = svga_update_state( svga, max_level );
if (ret != PIPE_OK) {
svga_context_flush(svga, NULL);
ret = svga_update_state( svga, max_level );
}
SVGA_RETRY_OOM(svga, ret, svga_update_state( svga, max_level ));
return ret == PIPE_OK;
}
@ -325,3 +368,14 @@ svga_emit_initial_state(struct svga_context *svga)
return PIPE_OK;
}
}
void
svga_init_tracked_state(struct svga_context *svga)
{
   /* Select the hw_draw_state atom list that matches the device's
    * feature level (SM5 > VGPU10 > VGPU9) and install it as the
    * third state level.
    */
   if (svga_have_sm5(svga))
      state_levels[2] = hw_draw_state_sm5;
   else if (svga_have_vgpu10(svga))
      state_levels[2] = hw_draw_state_vgpu10;
   else
      state_levels[2] = hw_draw_state_vgpu9;
}

View File

@ -39,8 +39,8 @@ void svga_destroy_state( struct svga_context *svga );
struct svga_tracked_state {
const char *name;
unsigned dirty;
enum pipe_error (*update)( struct svga_context *svga, unsigned dirty );
uint64_t dirty;
enum pipe_error (*update)( struct svga_context *svga, uint64_t dirty );
};
/* NEED_SWTNL
@ -61,6 +61,8 @@ extern struct svga_tracked_state svga_need_tgsi_transform;
extern struct svga_tracked_state svga_hw_vs;
extern struct svga_tracked_state svga_hw_fs;
extern struct svga_tracked_state svga_hw_gs;
extern struct svga_tracked_state svga_hw_tcs;
extern struct svga_tracked_state svga_hw_tes;
extern struct svga_tracked_state svga_hw_rss;
extern struct svga_tracked_state svga_hw_pstipple;
extern struct svga_tracked_state svga_hw_sampler;
@ -72,6 +74,8 @@ extern struct svga_tracked_state svga_hw_vdecl;
extern struct svga_tracked_state svga_hw_fs_constants;
extern struct svga_tracked_state svga_hw_gs_constants;
extern struct svga_tracked_state svga_hw_vs_constants;
extern struct svga_tracked_state svga_hw_tes_constants;
extern struct svga_tracked_state svga_hw_tcs_constants;
/* SWTNL_DRAW
*/
@ -105,4 +109,15 @@ enum pipe_error svga_reemit_vs_bindings(struct svga_context *svga);
enum pipe_error svga_reemit_fs_bindings(struct svga_context *svga);
void svga_init_tracked_state(struct svga_context *svga);
void *
svga_create_fs_state(struct pipe_context *pipe,
const struct pipe_shader_state *templ);
void
svga_bind_fs_state(struct pipe_context *pipe, void *shader);
bool svga_update_compute_state(struct svga_context *svga);
#endif

View File

@ -133,12 +133,13 @@ svga_get_extra_fs_constants(const struct svga_context *svga, float *dest)
* will be returned in 'dest'.
*/
static unsigned
svga_get_prescale_constants(const struct svga_context *svga, float **dest)
svga_get_prescale_constants(const struct svga_context *svga, float **dest,
const struct svga_prescale *prescale)
{
memcpy(*dest, svga->state.hw_clear.prescale.scale, 4 * sizeof(float));
memcpy(*dest, prescale->scale, 4 * sizeof(float));
*dest += 4;
memcpy(*dest, svga->state.hw_clear.prescale.translate, 4 * sizeof(float));
memcpy(*dest, prescale->translate, 4 * sizeof(float));
*dest += 4;
return 2;
@ -153,8 +154,8 @@ svga_get_pt_sprite_constants(const struct svga_context *svga, float **dest)
const struct svga_screen *screen = svga_screen(svga->pipe.screen);
float *dst = *dest;
dst[0] = 1.0 / (svga->curr.viewport.scale[0] * 2);
dst[1] = 1.0 / (svga->curr.viewport.scale[1] * 2);
dst[0] = 1.0 / (svga->curr.viewport[0].scale[0] * 2);
dst[1] = 1.0 / (svga->curr.viewport[0].scale[1] * 2);
dst[2] = svga->curr.rast->pointsize;
dst[3] = screen->maxPointSize;
*dest = *dest + 4;
@ -186,6 +187,7 @@ svga_get_clip_plane_constants(const struct svga_context *svga,
return count;
}
/**
* Emit any extra vertex shader constants into the buffer pointed
* to by 'dest'.
@ -203,15 +205,16 @@ svga_get_extra_vs_constants(const struct svga_context *svga, float *dest)
/* SVGA_NEW_VS_VARIANT
*/
if (variant->key.vs.need_prescale) {
count += svga_get_prescale_constants(svga, &dest);
count += svga_get_prescale_constants(svga, &dest,
&svga->state.hw_clear.prescale[0]);
}
if (variant->key.vs.undo_viewport) {
/* Used to convert window coords back to NDC coords */
dest[0] = 1.0f / svga->curr.viewport.scale[0];
dest[1] = 1.0f / svga->curr.viewport.scale[1];
dest[2] = -svga->curr.viewport.translate[0];
dest[3] = -svga->curr.viewport.translate[1];
dest[0] = 1.0f / svga->curr.viewport[0].scale[0];
dest[1] = 1.0f / svga->curr.viewport[0].scale[1];
dest[2] = -svga->curr.viewport[0].translate[0];
dest[3] = -svga->curr.viewport[0].translate[1];
dest += 4;
count += 1;
}
@ -250,7 +253,20 @@ svga_get_extra_gs_constants(const struct svga_context *svga, float *dest)
}
if (variant->key.gs.need_prescale) {
count += svga_get_prescale_constants(svga, &dest);
unsigned i, num_prescale = 1;
/* If prescale is needed and the geometry shader writes to viewport
* index, then prescale for all viewports will be added to the
* constant buffer.
*/
if (variant->key.gs.writes_viewport_index)
num_prescale = svga->state.hw_clear.num_prescale;
for (i = 0; i < num_prescale; i++) {
count +=
svga_get_prescale_constants(svga, &dest,
&svga->state.hw_clear.prescale[i]);
}
}
/* SVGA_NEW_CLIP */
@ -265,6 +281,77 @@ svga_get_extra_gs_constants(const struct svga_context *svga, float *dest)
}
/**
 * Emit any extra tessellation control shader constants into the
 * buffer pointed to by 'dest'.
 * Returns the number of constant vec4s written.
 */
static unsigned
svga_get_extra_tcs_constants(struct svga_context *svga, float *dest)
{
   const struct svga_shader_variant *tcs = svga->state.hw_draw.tcs;
   unsigned n = 0;

   /* SVGA_NEW_CLIP: the helper advances 'dest' past what it writes */
   n += svga_get_clip_plane_constants(svga, tcs, &dest);

   /* constants common to all shader stages */
   n += svga_get_extra_constants_common(svga, tcs,
                                        PIPE_SHADER_TESS_CTRL, dest);

   assert(n <= MAX_EXTRA_CONSTS);
   return n;
}
/**
 * Emit any extra tessellation evaluation shader constants into
 * the buffer pointed to by 'dest'.
 * Returns the number of constant vec4s written.
 */
static unsigned
svga_get_extra_tes_constants(struct svga_context *svga, float *dest)
{
   const struct svga_shader_variant *tes = svga->state.hw_draw.tes;
   unsigned n = 0;

   /* When the TES is the last vertex-processing stage it needs the
    * viewport prescale transform (only viewport 0 here).
    */
   if (tes->key.tes.need_prescale) {
      n += svga_get_prescale_constants(svga, &dest,
                                       &svga->state.hw_clear.prescale[0]);
   }

   /* SVGA_NEW_CLIP: the helper advances 'dest' past what it writes */
   n += svga_get_clip_plane_constants(svga, tes, &dest);

   /* constants common to all shader stages */
   n += svga_get_extra_constants_common(svga, tes,
                                        PIPE_SHADER_TESS_EVAL, dest);

   assert(n <= MAX_EXTRA_CONSTS);
   return n;
}
/**
 * Emit any extra compute shader constants into
 * the buffer pointed to by 'dest'.
 * Returns the number of constant vec4s written.
 */
static unsigned
svga_get_extra_cs_constants(struct svga_context *svga, float *dest)
{
   const struct svga_shader_variant *cs = svga->state.hw_draw.cs;
   unsigned n = 0;

   /* Compute needs neither prescale nor clip planes; only the
    * cross-stage common constants apply.
    */
   n += svga_get_extra_constants_common(svga, cs,
                                        PIPE_SHADER_COMPUTE, dest);

   assert(n <= MAX_EXTRA_CONSTS);
   return n;
}
/*
* Check and emit a range of shader constant registers, trying to coalesce
* successive shader constant updates in a single command in order to save
@ -490,6 +577,15 @@ emit_constbuf_vgpu10(struct svga_context *svga, enum pipe_shader_type shader)
const struct svga_shader_variant *variant;
unsigned alloc_buf_size;
assert(shader == PIPE_SHADER_VERTEX ||
shader == PIPE_SHADER_GEOMETRY ||
shader == PIPE_SHADER_FRAGMENT ||
shader == PIPE_SHADER_TESS_CTRL ||
shader == PIPE_SHADER_TESS_EVAL ||
shader == PIPE_SHADER_COMPUTE);
cbuf = &svga->curr.constbufs[shader][0];
switch (shader) {
case PIPE_SHADER_VERTEX:
variant = svga->state.hw_draw.vs;
@ -503,6 +599,18 @@ emit_constbuf_vgpu10(struct svga_context *svga, enum pipe_shader_type shader)
variant = svga->state.hw_draw.gs;
extra_count = svga_get_extra_gs_constants(svga, (float *) extras);
break;
case PIPE_SHADER_TESS_CTRL:
variant = svga->state.hw_draw.tcs;
extra_count = svga_get_extra_tcs_constants(svga, (float *) extras);
break;
case PIPE_SHADER_TESS_EVAL:
variant = svga->state.hw_draw.tes;
extra_count = svga_get_extra_tes_constants(svga, (float *) extras);
break;
case PIPE_SHADER_COMPUTE:
variant = svga->state.hw_draw.cs;
extra_count = svga_get_extra_cs_constants(svga, (float *) extras);
break;
default:
assert(!"Unexpected shader type");
/* Don't return an error code since we don't want to keep re-trying
@ -706,7 +814,7 @@ emit_consts_vgpu10(struct svga_context *svga, enum pipe_shader_type shader)
}
static enum pipe_error
emit_fs_consts(struct svga_context *svga, unsigned dirty)
emit_fs_consts(struct svga_context *svga, uint64_t dirty)
{
const struct svga_shader_variant *variant = svga->state.hw_draw.fs;
enum pipe_error ret = PIPE_OK;
@ -741,7 +849,7 @@ struct svga_tracked_state svga_hw_fs_constants =
static enum pipe_error
emit_vs_consts(struct svga_context *svga, unsigned dirty)
emit_vs_consts(struct svga_context *svga, uint64_t dirty)
{
const struct svga_shader_variant *variant = svga->state.hw_draw.vs;
enum pipe_error ret = PIPE_OK;
@ -776,7 +884,7 @@ struct svga_tracked_state svga_hw_vs_constants =
static enum pipe_error
emit_gs_consts(struct svga_context *svga, unsigned dirty)
emit_gs_consts(struct svga_context *svga, uint64_t dirty)
{
const struct svga_shader_variant *variant = svga->state.hw_draw.gs;
enum pipe_error ret = PIPE_OK;
@ -788,17 +896,17 @@ emit_gs_consts(struct svga_context *svga, unsigned dirty)
/* SVGA_NEW_GS_CONST_BUFFER
*/
if (svga_have_vgpu10(svga)) {
/**
* If only the rasterizer state has changed and the current geometry
* shader does not emit wide points, then there is no reason to
* re-emit the GS constants, so skip it.
*/
if (dirty == SVGA_NEW_RAST && !variant->key.gs.wide_point)
return PIPE_OK;
assert(svga_have_vgpu10(svga));
ret = emit_consts_vgpu10(svga, PIPE_SHADER_GEOMETRY);
}
/**
* If only the rasterizer state has changed and the current geometry
* shader does not emit wide points, then there is no reason to
* re-emit the GS constants, so skip it.
*/
if (dirty == SVGA_NEW_RAST && !variant->key.gs.wide_point)
return PIPE_OK;
ret = emit_consts_vgpu10(svga, PIPE_SHADER_GEOMETRY);
return ret;
}
@ -814,3 +922,66 @@ struct svga_tracked_state svga_hw_gs_constants =
SVGA_NEW_TEXTURE_CONSTS),
emit_gs_consts
};
/**
* Emit constant buffer for tessellation control shader
*/
static enum pipe_error
emit_tcs_consts(struct svga_context *svga, uint64_t dirty)
{
const struct svga_shader_variant *variant = svga->state.hw_draw.tcs;
enum pipe_error ret = PIPE_OK;
assert(svga_have_sm5(svga));
/* SVGA_NEW_TCS_VARIANT */
if (!variant)
return PIPE_OK;
/* SVGA_NEW_TCS_CONST_BUFFER */
ret = emit_consts_vgpu10(svga, PIPE_SHADER_TESS_CTRL);
return ret;
}
/** Tracked-state atom: re-emit the TCS constant buffer when the TCS
 * constant buffer contents or the TCS variant change.
 */
struct svga_tracked_state svga_hw_tcs_constants =
{
   "hw tcs params",
   (SVGA_NEW_TCS_CONST_BUFFER |
    SVGA_NEW_TCS_VARIANT),
   emit_tcs_consts
};
/**
* Emit constant buffer for tessellation evaluation shader
*/
static enum pipe_error
emit_tes_consts(struct svga_context *svga, uint64_t dirty)
{
const struct svga_shader_variant *variant = svga->state.hw_draw.tes;
enum pipe_error ret = PIPE_OK;
assert(svga_have_sm5(svga));
/* SVGA_NEW_TES_VARIANT */
if (!variant)
return PIPE_OK;
ret = emit_consts_vgpu10(svga, PIPE_SHADER_TESS_EVAL);
return ret;
}
/** Tracked-state atom: re-emit the TES constant buffer when the prescale
 * transform, the TES constant buffer contents, or the TES variant change.
 */
struct svga_tracked_state svga_hw_tes_constants =
{
   "hw tes params",
   (SVGA_NEW_PRESCALE |
    SVGA_NEW_TES_CONST_BUFFER |
    SVGA_NEW_TES_VARIANT),
   emit_tes_consts
};

View File

@ -212,9 +212,13 @@ emit_fb_vgpu10(struct svga_context *svga)
if (curr->cbufs[i]) {
struct pipe_surface *s = curr->cbufs[i];
rtv[i] = svga_validate_surface_view(svga, svga_surface(s));
if (rtv[i] == NULL) {
return PIPE_ERROR_OUT_OF_MEMORY;
if (curr->cbufs[i] != hw->cbufs[i]) {
rtv[i] = svga_validate_surface_view(svga, svga_surface(s));
if (rtv[i] == NULL) {
return PIPE_ERROR_OUT_OF_MEMORY;
}
} else {
rtv[i] = svga->state.hw_clear.rtv[i];
}
assert(svga_surface(rtv[i])->view_id != SVGA3D_INVALID_ID);
@ -233,9 +237,13 @@ emit_fb_vgpu10(struct svga_context *svga)
if (curr->zsbuf) {
struct pipe_surface *s = curr->zsbuf;
dsv = svga_validate_surface_view(svga, svga_surface(curr->zsbuf));
if (!dsv) {
return PIPE_ERROR_OUT_OF_MEMORY;
if (curr->zsbuf != hw->zsbuf) {
dsv = svga_validate_surface_view(svga, svga_surface(curr->zsbuf));
if (!dsv) {
return PIPE_ERROR_OUT_OF_MEMORY;
}
} else {
dsv = svga->state.hw_clear.dsv;
}
/* Set the rendered-to flag */
@ -258,10 +266,6 @@ emit_fb_vgpu10(struct svga_context *svga)
/* number of render targets sent to the device, not including trailing
* unbound render targets.
*/
svga->state.hw_clear.num_rendertargets = last_rtv + 1;
svga->state.hw_clear.dsv = dsv;
memcpy(svga->state.hw_clear.rtv, rtv, num_color * sizeof(rtv[0]));
for (i = 0; i < ss->max_color_buffers; i++) {
if (hw->cbufs[i] != curr->cbufs[i]) {
/* propagate the backed view surface before unbinding it */
@ -270,19 +274,32 @@ emit_fb_vgpu10(struct svga_context *svga)
&svga_surface(hw->cbufs[i])->backed->base,
TRUE);
}
else if (svga->state.hw_clear.rtv[i] != hw->cbufs[i] &&
svga->state.hw_clear.rtv[i]) {
/* Free the alternate surface view when it is unbound. */
svga->pipe.surface_destroy(&svga->pipe, svga->state.hw_clear.rtv[i]);
}
pipe_surface_reference(&hw->cbufs[i], curr->cbufs[i]);
}
}
svga->state.hw_clear.num_rendertargets = last_rtv + 1;
memcpy(svga->state.hw_clear.rtv, rtv, num_color * sizeof(rtv[0]));
hw->nr_cbufs = curr->nr_cbufs;
if (hw->zsbuf != curr->zsbuf) {
/* propagate the backed view surface before unbinding it */
if (hw->zsbuf && svga_surface(hw->zsbuf)->backed) {
svga_propagate_surface(svga, &svga_surface(hw->zsbuf)->backed->base,
svga_propagate_surface(svga,
&svga_surface(hw->zsbuf)->backed->base,
TRUE);
}
else if (svga->state.hw_clear.dsv != hw->zsbuf && svga->state.hw_clear.dsv) {
/* Free the alternate surface view when it is unbound. */
svga->pipe.surface_destroy(&svga->pipe, svga->state.hw_clear.dsv);
}
pipe_surface_reference(&hw->zsbuf, curr->zsbuf);
}
svga->state.hw_clear.dsv = dsv;
}
return ret;
@ -290,7 +307,7 @@ emit_fb_vgpu10(struct svga_context *svga)
static enum pipe_error
emit_framebuffer(struct svga_context *svga, unsigned dirty)
emit_framebuffer(struct svga_context *svga, uint64_t dirty)
{
if (svga_have_vgpu10(svga)) {
return emit_fb_vgpu10(svga);
@ -383,13 +400,14 @@ struct svga_tracked_state svga_hw_framebuffer =
/***********************************************************************
*/
static enum pipe_error
emit_viewport( struct svga_context *svga,
unsigned dirty )
static void
get_viewport_prescale(struct svga_context *svga,
struct pipe_viewport_state *viewport,
SVGA3dViewport *vp,
struct svga_prescale *prescale)
{
const struct pipe_viewport_state *viewport = &svga->curr.viewport;
struct svga_prescale prescale;
SVGA3dRect rect;
/* Not sure if this state is relevant with POSITIONT. Probably
* not, but setting to 0,1 avoids some state pingponging.
*/
@ -398,7 +416,6 @@ emit_viewport( struct svga_context *svga,
float flip = -1.0;
boolean degenerate = FALSE;
boolean invertY = FALSE;
enum pipe_error ret;
float fb_width = (float) svga->curr.framebuffer.width;
float fb_height = (float) svga->curr.framebuffer.height;
@ -407,9 +424,8 @@ emit_viewport( struct svga_context *svga,
float fy = flip * viewport->scale[1] * -1.0f + viewport->translate[1];
float fw = viewport->scale[0] * 2.0f;
float fh = flip * viewport->scale[1] * 2.0f;
boolean emit_vgpu10_viewport = FALSE;
memset( &prescale, 0, sizeof(prescale) );
memset(prescale, 0, sizeof(*prescale));
/* Examine gallium viewport transformation and produce a screen
* rectangle and possibly vertex shader pre-transformation to
@ -423,14 +439,14 @@ emit_viewport( struct svga_context *svga,
fw,
fh);
prescale.scale[0] = 1.0;
prescale.scale[1] = 1.0;
prescale.scale[2] = 1.0;
prescale.scale[3] = 1.0;
prescale.translate[0] = 0;
prescale.translate[1] = 0;
prescale.translate[2] = 0;
prescale.translate[3] = 0;
prescale->scale[0] = 1.0;
prescale->scale[1] = 1.0;
prescale->scale[2] = 1.0;
prescale->scale[3] = 1.0;
prescale->translate[0] = 0;
prescale->translate[1] = 0;
prescale->translate[2] = 0;
prescale->translate[3] = 0;
/* Enable prescale to adjust vertex positions to match
VGPU10 convention only if rasterization is enabled.
@ -439,12 +455,12 @@ emit_viewport( struct svga_context *svga,
degenerate = TRUE;
goto out;
} else {
prescale.enabled = TRUE;
prescale->enabled = TRUE;
}
if (fw < 0) {
prescale.scale[0] *= -1.0f;
prescale.translate[0] += -fw;
prescale->scale[0] *= -1.0f;
prescale->translate[0] += -fw;
fw = -fw;
fx = viewport->scale[0] * 1.0f + viewport->translate[0];
}
@ -452,54 +468,54 @@ emit_viewport( struct svga_context *svga,
if (fh < 0.0) {
if (svga_have_vgpu10(svga)) {
/* floating point viewport params below */
prescale.translate[1] = fh + fy * 2.0f;
prescale->translate[1] = fh + fy * 2.0f;
}
else {
/* integer viewport params below */
prescale.translate[1] = fh - 1.0f + fy * 2.0f;
prescale->translate[1] = fh - 1.0f + fy * 2.0f;
}
fh = -fh;
fy -= fh;
prescale.scale[1] = -1.0f;
prescale->scale[1] = -1.0f;
invertY = TRUE;
}
if (fx < 0) {
prescale.translate[0] += fx;
prescale.scale[0] *= fw / (fw + fx);
prescale->translate[0] += fx;
prescale->scale[0] *= fw / (fw + fx);
fw += fx;
fx = 0.0f;
}
if (fy < 0) {
if (invertY) {
prescale.translate[1] -= fy;
prescale->translate[1] -= fy;
}
else {
prescale.translate[1] += fy;
prescale->translate[1] += fy;
}
prescale.scale[1] *= fh / (fh + fy);
prescale->scale[1] *= fh / (fh + fy);
fh += fy;
fy = 0.0f;
}
if (fx + fw > fb_width) {
prescale.scale[0] *= fw / (fb_width - fx);
prescale.translate[0] -= fx * (fw / (fb_width - fx));
prescale.translate[0] += fx;
prescale->scale[0] *= fw / (fb_width - fx);
prescale->translate[0] -= fx * (fw / (fb_width - fx));
prescale->translate[0] += fx;
fw = fb_width - fx;
}
if (fy + fh > fb_height) {
prescale.scale[1] *= fh / (fb_height - fy);
prescale->scale[1] *= fh / (fb_height - fy);
if (invertY) {
float in = fb_height - fy; /* number of vp pixels inside view */
float out = fy + fh - fb_height; /* number of vp pixels out of view */
prescale.translate[1] += fy * out / in;
prescale->translate[1] += fy * out / in;
}
else {
prescale.translate[1] -= fy * (fh / (fb_height - fy));
prescale.translate[1] += fy;
prescale->translate[1] -= fy * (fh / (fb_height - fy));
prescale->translate[1] += fy;
}
fh = fb_height - fy;
}
@ -566,10 +582,10 @@ emit_viewport( struct svga_context *svga,
if (invertY)
adjust_y = -adjust_y;
prescale.translate[0] += adjust_x;
prescale.translate[1] += adjust_y;
prescale.translate[2] = 0.5; /* D3D clip space */
prescale.scale[2] = 0.5; /* D3D clip space */
prescale->translate[0] += adjust_x;
prescale->translate[1] += adjust_y;
prescale->translate[2] = 0.5; /* D3D clip space */
prescale->scale[2] = 0.5; /* D3D clip space */
}
range_min = viewport->scale[2] * -1.0f + viewport->translate[2];
@ -584,7 +600,7 @@ emit_viewport( struct svga_context *svga,
range_tmp = range_min;
range_min = range_max;
range_max = range_tmp;
prescale.scale[2] = -prescale.scale[2];
prescale->scale[2] = -prescale->scale[2];
}
/* If zmin is less than 0, clamp zmin to 0 and adjust the prescale.
@ -594,21 +610,21 @@ emit_viewport( struct svga_context *svga,
if (range_min < 0.0f) {
range_min = -0.5f * viewport->scale[2] + 0.5f + viewport->translate[2];
range_max = 0.5f * viewport->scale[2] + 0.5f + viewport->translate[2];
prescale.scale[2] *= 2.0f;
prescale.translate[2] -= 0.5f;
prescale->scale[2] *= 2.0f;
prescale->translate[2] -= 0.5f;
}
if (prescale.enabled) {
if (prescale->enabled) {
float H[2];
float J[2];
int i;
SVGA_DBG(DEBUG_VIEWPORT,
"prescale %f,%f %fx%f\n",
prescale.translate[0],
prescale.translate[1],
prescale.scale[0],
prescale.scale[1]);
prescale->translate[0],
prescale->translate[1],
prescale->scale[0],
prescale->scale[1]);
H[0] = (float)rect.w / 2.0f;
H[1] = -(float)rect.h / 2.0f;
@ -645,16 +661,16 @@ emit_viewport( struct svga_context *svga,
* Overwrite prescale.translate with values for K:
*/
for (i = 0; i < 2; i++) {
prescale.translate[i] = ((prescale.translate[i] +
(prescale.scale[i] - 1.0f) * J[i]) / H[i]);
prescale->translate[i] = ((prescale->translate[i] +
(prescale->scale[i] - 1.0f) * J[i]) / H[i]);
}
SVGA_DBG(DEBUG_VIEWPORT,
"clipspace %f,%f %fx%f\n",
prescale.translate[0],
prescale.translate[1],
prescale.scale[0],
prescale.scale[1]);
prescale->translate[0],
prescale->translate[1],
prescale->scale[0],
prescale->scale[1]);
}
out:
@ -663,59 +679,90 @@ out:
rect.y = 0;
rect.w = 1;
rect.h = 1;
prescale.enabled = FALSE;
prescale->enabled = FALSE;
}
if (!svga_rects_equal(&rect, &svga->state.hw_clear.viewport)) {
if (svga_have_vgpu10(svga)) {
emit_vgpu10_viewport = TRUE;
}
else {
vp->x = (float) rect.x;
vp->y = (float) rect.y;
vp->width = (float) rect.w;
vp->height = (float) rect.h;
vp->minDepth = range_min;
vp->maxDepth = range_max;
}
static enum pipe_error
emit_viewport( struct svga_context *svga,
uint64_t dirty )
{
struct svga_screen *svgascreen = svga_screen(svga->pipe.screen);
SVGA3dViewport viewports[SVGA3D_DX_MAX_VIEWPORTS];
struct svga_prescale prescale[SVGA3D_DX_MAX_VIEWPORTS];
unsigned i;
enum pipe_error ret;
unsigned max_viewports = svgascreen->max_viewports;
for (i = 0; i < max_viewports; i++) {
get_viewport_prescale(svga, &svga->curr.viewport[i],
&viewports[i], &prescale[i]);
}
if (memcmp(viewports, svga->state.hw_clear.viewports,
max_viewports * sizeof viewports[0]) != 0) {
if (!svga_have_vgpu10(svga)) {
SVGA3dRect rect;
SVGA3dViewport *vp = &viewports[0];
rect.x = (uint32)vp->x;
rect.y = (uint32)vp->y;
rect.w = (uint32)vp->width;
rect.h = (uint32)vp->height;
ret = SVGA3D_SetViewport(svga->swc, &rect);
if (ret != PIPE_OK)
return ret;
svga->state.hw_clear.viewport = rect;
}
}
if (svga->state.hw_clear.depthrange.zmin != range_min ||
svga->state.hw_clear.depthrange.zmax != range_max)
{
if (svga_have_vgpu10(svga)) {
emit_vgpu10_viewport = TRUE;
}
else {
ret = SVGA3D_SetZRange(svga->swc, range_min, range_max );
ret = SVGA3D_SetZRange(svga->swc, vp->minDepth, vp->maxDepth);
if (ret != PIPE_OK)
return ret;
svga->state.hw_clear.depthrange.zmin = range_min;
svga->state.hw_clear.depthrange.zmax = range_max;
svga->state.hw_clear.viewport = rect;
svga->state.hw_clear.depthrange.zmin = vp->minDepth;
svga->state.hw_clear.depthrange.zmax = vp->maxDepth;
}
else {
ret = SVGA3D_vgpu10_SetViewports(svga->swc, max_viewports,
viewports);
if (ret != PIPE_OK)
return ret;
}
memcpy(svga->state.hw_clear.viewports, viewports,
max_viewports * sizeof viewports[0]);
}
if (emit_vgpu10_viewport) {
SVGA3dViewport vp;
vp.x = (float) rect.x;
vp.y = (float) rect.y;
vp.width = (float) rect.w;
vp.height = (float) rect.h;
vp.minDepth = range_min;
vp.maxDepth = range_max;
ret = SVGA3D_vgpu10_SetViewports(svga->swc, 1, &vp);
if (ret != PIPE_OK)
return ret;
svga->state.hw_clear.viewport = rect;
svga->state.hw_clear.depthrange.zmin = range_min;
svga->state.hw_clear.depthrange.zmax = range_max;
}
if (memcmp(&prescale, &svga->state.hw_clear.prescale, sizeof prescale) != 0) {
if (memcmp(prescale, svga->state.hw_clear.prescale,
max_viewports * sizeof prescale[0]) != 0) {
svga->dirty |= SVGA_NEW_PRESCALE;
svga->state.hw_clear.prescale = prescale;
memcpy(svga->state.hw_clear.prescale, prescale,
max_viewports * sizeof prescale[0]);
/*
* Determine number of unique prescales. This is to minimize the
* if check needed in the geometry shader to identify the prescale
* for the specified viewport.
*/
unsigned last_prescale = SVGA3D_DX_MAX_VIEWPORTS - 1;
unsigned i;
for (i = SVGA3D_DX_MAX_VIEWPORTS-1; i > 0; i--) {
if (memcmp(&svga->state.hw_clear.prescale[i],
&svga->state.hw_clear.prescale[i-1],
sizeof svga->state.hw_clear.prescale[0])) {
break;
}
last_prescale--;
}
svga->state.hw_clear.num_prescale = last_prescale + 1;
}
return PIPE_OK;
@ -738,33 +785,50 @@ struct svga_tracked_state svga_hw_viewport =
*/
static enum pipe_error
emit_scissor_rect( struct svga_context *svga,
unsigned dirty )
uint64_t dirty )
{
const struct pipe_scissor_state *scissor = &svga->curr.scissor;
struct svga_screen *svgascreen = svga_screen(svga->pipe.screen);
const struct pipe_scissor_state *scissor = svga->curr.scissor;
unsigned max_viewports = svgascreen->max_viewports;
enum pipe_error ret;
if (svga_have_vgpu10(svga)) {
SVGASignedRect rect;
if (memcmp(&svga->state.hw_clear.scissors[0], scissor,
max_viewports * sizeof *scissor) != 0) {
rect.left = scissor->minx;
rect.top = scissor->miny;
rect.right = scissor->maxx;
rect.bottom = scissor->maxy;
if (svga_have_vgpu10(svga)) {
SVGASignedRect rect[SVGA3D_DX_MAX_VIEWPORTS];
unsigned i;
return SVGA3D_vgpu10_SetScissorRects(svga->swc, 1, &rect);
for (i = 0; i < max_viewports; i++) {
rect[i].left = scissor[i].minx;
rect[i].top = scissor[i].miny;
rect[i].right = scissor[i].maxx;
rect[i].bottom = scissor[i].maxy;
}
ret = SVGA3D_vgpu10_SetScissorRects(svga->swc, max_viewports, rect);
}
else {
SVGA3dRect rect;
rect.x = scissor[0].minx;
rect.y = scissor[0].miny;
rect.w = scissor[0].maxx - scissor[0].minx; /* + 1 ?? */
rect.h = scissor[0].maxy - scissor[0].miny; /* + 1 ?? */
ret = SVGA3D_SetScissorRect(svga->swc, &rect);
}
if (ret != PIPE_OK)
return ret;
memcpy(svga->state.hw_clear.scissors, scissor,
max_viewports * sizeof *scissor);
}
else {
SVGA3dRect rect;
rect.x = scissor->minx;
rect.y = scissor->miny;
rect.w = scissor->maxx - scissor->minx; /* + 1 ?? */
rect.h = scissor->maxy - scissor->miny; /* + 1 ?? */
return SVGA3D_SetScissorRect(svga->swc, &rect);
}
return PIPE_OK;
}
struct svga_tracked_state svga_hw_scissor =
{
"hw scissor state",
@ -779,7 +843,7 @@ struct svga_tracked_state svga_hw_scissor =
static enum pipe_error
emit_clip_planes( struct svga_context *svga,
unsigned dirty )
uint64_t dirty )
{
unsigned i;
enum pipe_error ret;

View File

@ -196,8 +196,10 @@ make_fs_key(const struct svga_context *svga,
*/
if (svga->curr.gs) {
key->fs.gs_generic_outputs = svga->curr.gs->generic_outputs;
key->fs.layer_to_zero = !svga->curr.gs->base.info.writes_layer;
} else {
key->fs.vs_generic_outputs = svga->curr.vs->generic_outputs;
key->fs.layer_to_zero = 1;
}
/* Only need fragment shader fixup for twoside lighting if doing
@ -276,7 +278,7 @@ make_fs_key(const struct svga_context *svga,
*
* SVGA_NEW_TEXTURE_BINDING | SVGA_NEW_SAMPLER
*/
svga_init_shader_key_common(svga, shader, key);
svga_init_shader_key_common(svga, shader, &fs->base, key);
for (i = 0; i < svga->curr.num_samplers[shader]; ++i) {
struct pipe_sampler_view *view = svga->curr.sampler_views[shader][i];
@ -317,15 +319,6 @@ make_fs_key(const struct svga_context *svga,
debug_warn_once("Unsupported shadow compare function");
}
}
else {
/* For other texture formats, just use the compare func/mode
* as-is. Should be no-ops for color textures. For depth
* textures, we do not get automatic depth compare. We have
* to do it ourselves in the shader. And we don't get PCF.
*/
key->tex[i].compare_mode = sampler->compare_mode;
key->tex[i].compare_func = sampler->compare_func;
}
}
}
}
@ -401,22 +394,26 @@ svga_reemit_fs_bindings(struct svga_context *svga)
static enum pipe_error
emit_hw_fs(struct svga_context *svga, unsigned dirty)
emit_hw_fs(struct svga_context *svga, uint64_t dirty)
{
struct svga_shader_variant *variant = NULL;
enum pipe_error ret = PIPE_OK;
struct svga_fragment_shader *fs = svga->curr.fs;
struct svga_compile_key key;
struct svga_shader *prevShader = NULL; /* shader in the previous stage */
SVGA_STATS_TIME_PUSH(svga_sws(svga), SVGA_STATS_TIME_EMITFS);
prevShader = svga->curr.gs ?
&svga->curr.gs->base : (svga->curr.tes ?
&svga->curr.tes->base : &svga->curr.vs->base);
/* Disable rasterization if rasterizer_discard flag is set or
* vs/gs does not output position.
*/
svga->disable_rasterizer =
svga->curr.rast->templ.rasterizer_discard ||
(svga->curr.gs && !svga->curr.gs->base.info.writes_position) ||
(!svga->curr.gs && !svga->curr.vs->base.info.writes_position);
!prevShader->info.writes_position;
/* Set FS to NULL when rasterization is to be disabled */
if (svga->disable_rasterizer) {

View File

@ -109,34 +109,45 @@ make_gs_key(struct svga_context *svga, struct svga_compile_key *key)
/*
* SVGA_NEW_TEXTURE_BINDING | SVGA_NEW_SAMPLER
*/
svga_init_shader_key_common(svga, PIPE_SHADER_GEOMETRY, key);
svga_init_shader_key_common(svga, PIPE_SHADER_GEOMETRY, &gs->base, key);
memcpy(key->generic_remap_table, gs->generic_remap_table,
sizeof(gs->generic_remap_table));
key->gs.vs_generic_outputs = svga->curr.vs->generic_outputs;
key->gs.need_prescale = svga->state.hw_clear.prescale.enabled;
key->gs.need_prescale = svga->state.hw_clear.prescale[0].enabled;
key->gs.writes_psize = gs->base.info.writes_psize;
key->gs.wide_point = gs->wide_point;
key->gs.writes_viewport_index = gs->base.info.writes_viewport_index;
if (key->gs.writes_viewport_index) {
key->gs.num_prescale = svga->state.hw_clear.num_prescale;
} else {
key->gs.num_prescale = 1;
}
key->sprite_coord_enable = svga->curr.rast->templ.sprite_coord_enable;
key->sprite_origin_lower_left = (svga->curr.rast->templ.sprite_coord_mode
== PIPE_SPRITE_COORD_LOWER_LEFT);
/* SVGA_NEW_RAST */
key->clip_plane_enable = svga->curr.rast->templ.clip_plane_enable;
/* Mark this as the last shader in the vertex processing stage */
key->last_vertex_stage = 1;
}
static enum pipe_error
emit_hw_gs(struct svga_context *svga, unsigned dirty)
emit_hw_gs(struct svga_context *svga, uint64_t dirty)
{
struct svga_shader_variant *variant;
struct svga_geometry_shader *gs = svga->curr.gs;
enum pipe_error ret = PIPE_OK;
struct svga_compile_key key;
assert(svga_have_vgpu10(svga));
SVGA_STATS_TIME_PUSH(svga_sws(svga), SVGA_STATS_TIME_EMITGS);
/* If there's a user-defined GS, we should have a pointer to a derived

View File

@ -33,7 +33,7 @@
static enum pipe_error
update_need_swvfetch(struct svga_context *svga, unsigned dirty)
update_need_swvfetch(struct svga_context *svga, uint64_t dirty)
{
if (!svga->curr.velems) {
/* No vertex elements bound. */
@ -58,7 +58,7 @@ struct svga_tracked_state svga_update_need_swvfetch =
static enum pipe_error
update_need_pipeline(struct svga_context *svga, unsigned dirty)
update_need_pipeline(struct svga_context *svga, uint64_t dirty)
{
boolean need_pipeline = FALSE;
struct svga_vertex_shader *vs = svga->curr.vs;
@ -156,7 +156,7 @@ struct svga_tracked_state svga_update_need_pipeline =
static enum pipe_error
update_need_swtnl(struct svga_context *svga, unsigned dirty)
update_need_swtnl(struct svga_context *svga, uint64_t dirty)
{
boolean need_swtnl;

View File

@ -97,7 +97,7 @@ translate_fill_mode(unsigned fill)
* the "to" state.
*/
static enum pipe_error
emit_rss_vgpu9(struct svga_context *svga, unsigned dirty)
emit_rss_vgpu9(struct svga_context *svga, uint64_t dirty)
{
struct svga_screen *screen = svga_screen(svga->pipe.screen);
struct rs_queue queue;
@ -363,7 +363,7 @@ get_no_depth_stencil_test_state(struct svga_context *svga)
static enum pipe_error
emit_rss_vgpu10(struct svga_context *svga, unsigned dirty)
emit_rss_vgpu10(struct svga_context *svga, uint64_t dirty)
{
enum pipe_error ret = PIPE_OK;
@ -487,7 +487,7 @@ emit_rss_vgpu10(struct svga_context *svga, unsigned dirty)
static enum pipe_error
emit_rss(struct svga_context *svga, unsigned dirty)
emit_rss(struct svga_context *svga, uint64_t dirty)
{
if (svga_have_vgpu10(svga)) {
return emit_rss_vgpu10(svga, dirty);

View File

@ -131,7 +131,7 @@ svga_validate_pipe_sampler_view(struct svga_context *svga,
if (sv->id == SVGA3D_INVALID_ID) {
struct svga_screen *ss = svga_screen(svga->pipe.screen);
struct pipe_resource *texture = sv->base.texture;
struct svga_winsys_surface *surface = svga_resource_handle(texture);
struct svga_winsys_surface *surface;
SVGA3dSurfaceFormat format;
SVGA3dResourceType resourceDim;
SVGA3dShaderResourceViewDesc viewDesc;
@ -154,6 +154,7 @@ svga_validate_pipe_sampler_view(struct svga_context *svga,
svga_translate_texture_buffer_view_format(viewFormat,
&format,
&pf_flags);
surface = svga_buffer_handle(svga, texture, PIPE_BIND_SAMPLER_VIEW);
}
else {
format = svga_translate_format(ss, viewFormat,
@ -161,6 +162,8 @@ svga_validate_pipe_sampler_view(struct svga_context *svga,
/* Convert the format to a sampler-friendly format, if needed */
format = svga_sampler_format(format);
surface = svga_texture(texture)->handle;
}
assert(format != SVGA3D_FORMAT_INVALID);
@ -234,15 +237,14 @@ svga_validate_pipe_sampler_view(struct svga_context *svga,
static enum pipe_error
update_sampler_resources(struct svga_context *svga, unsigned dirty)
update_sampler_resources(struct svga_context *svga, uint64_t dirty)
{
enum pipe_error ret = PIPE_OK;
enum pipe_shader_type shader;
if (!svga_have_vgpu10(svga))
return PIPE_OK;
assert(svga_have_vgpu10(svga));
for (shader = PIPE_SHADER_VERTEX; shader <= PIPE_SHADER_GEOMETRY; shader++) {
for (shader = PIPE_SHADER_VERTEX; shader <= PIPE_SHADER_TESS_EVAL; shader++) {
SVGA3dShaderResourceViewId ids[PIPE_MAX_SAMPLERS];
struct svga_winsys_surface *surfaces[PIPE_MAX_SAMPLERS];
struct pipe_sampler_view *sampler_views[PIPE_MAX_SAMPLERS];
@ -349,7 +351,8 @@ update_sampler_resources(struct svga_context *svga, unsigned dirty)
/* Handle polygon stipple sampler view */
if (svga->curr.rast->templ.poly_stipple_enable) {
const unsigned unit = svga->state.hw_draw.fs->pstipple_sampler_unit;
const unsigned unit =
svga_fs_variant(svga->state.hw_draw.fs)->pstipple_sampler_unit;
struct svga_pipe_sampler_view *sv = svga->polygon_stipple.sampler_view;
struct svga_winsys_surface *surface;
@ -385,15 +388,14 @@ struct svga_tracked_state svga_hw_sampler_bindings = {
static enum pipe_error
update_samplers(struct svga_context *svga, unsigned dirty )
update_samplers(struct svga_context *svga, uint64_t dirty )
{
enum pipe_error ret = PIPE_OK;
enum pipe_shader_type shader;
if (!svga_have_vgpu10(svga))
return PIPE_OK;
assert(svga_have_vgpu10(svga));
for (shader = PIPE_SHADER_VERTEX; shader <= PIPE_SHADER_GEOMETRY; shader++) {
for (shader = PIPE_SHADER_VERTEX; shader <= PIPE_SHADER_TESS_EVAL; shader++) {
const unsigned count = svga->curr.num_samplers[shader];
SVGA3dSamplerId ids[PIPE_MAX_SAMPLERS];
unsigned i;
@ -404,7 +406,8 @@ update_samplers(struct svga_context *svga, unsigned dirty )
/* _NEW_FS */
if (shader == PIPE_SHADER_FRAGMENT) {
struct svga_shader_variant *fs = svga->state.hw_draw.fs;
struct svga_fs_variant *fs =
svga_fs_variant(svga->state.hw_draw.fs);
/* If the fragment shader is doing the shadow comparison
* for this texture unit, don't enable shadow compare in
* the texture sampler state.
@ -449,7 +452,8 @@ update_samplers(struct svga_context *svga, unsigned dirty )
/* Handle polygon stipple sampler texture */
if (svga->curr.rast->templ.poly_stipple_enable) {
const unsigned unit = svga->state.hw_draw.fs->pstipple_sampler_unit;
const unsigned unit =
svga_fs_variant(svga->state.hw_draw.fs)->pstipple_sampler_unit;
struct svga_sampler_state *sampler = svga->polygon_stipple.sampler;
assert(sampler);

View File

@ -29,7 +29,10 @@
#include "util/u_simple_shaders.h"
#include "tgsi/tgsi_ureg.h"
#include "tgsi/tgsi_point_sprite.h"
#include "tgsi/tgsi_dynamic_indexing.h"
#include "tgsi/tgsi_vpos.h"
#include "tgsi/tgsi_dump.h"
#include "tgsi/tgsi_info.h"
#include "svga_context.h"
#include "svga_shader.h"
@ -49,6 +52,171 @@ bind_gs_state(struct svga_context *svga,
}
/**
 * Link a newly transformed shader into the list hanging off the original
 * (parent) shader, immediately after the list head.
 */
static void
insert_at_head(struct svga_shader *head, struct svga_shader *shader)
{
   /* Splice the new node in right behind the head. */
   shader->next = head->next;
   shader->parent = head;
   head->next = shader;
}
/**
 * Bind the given shader to the pipe context for the given shader stage.
 * Unknown stages are silently ignored.
 */
static void
bind_shader(struct svga_context *svga,
            const enum pipe_shader_type shader_type,
            struct svga_shader *shader)
{
   if (shader_type == PIPE_SHADER_VERTEX) {
      svga->pipe.bind_vs_state(&svga->pipe, shader);
   }
   else if (shader_type == PIPE_SHADER_FRAGMENT) {
      /* Call svga_bind_fs_state() directly rather than going through
       * svga->pipe.bind_fs_state(), because the latter goes through the
       * aapoint layer and we would lose the linked list of all
       * transformed shaders if aapoint is used.
       */
      svga_bind_fs_state(&svga->pipe, shader);
   }
   else if (shader_type == PIPE_SHADER_GEOMETRY) {
      svga->pipe.bind_gs_state(&svga->pipe, shader);
   }
   else if (shader_type == PIPE_SHADER_TESS_CTRL) {
      svga->pipe.bind_tcs_state(&svga->pipe, shader);
   }
   else if (shader_type == PIPE_SHADER_TESS_EVAL) {
      svga->pipe.bind_tes_state(&svga->pipe, shader);
   }
}
/**
 * Create a CSO shader of the given type from the given shader state.
 * Returns NULL for unsupported shader types.
 */
static void *
create_shader(struct svga_context *svga,
              const enum pipe_shader_type shader_type,
              struct pipe_shader_state *state)
{
   if (shader_type == PIPE_SHADER_VERTEX)
      return svga->pipe.create_vs_state(&svga->pipe, state);

   if (shader_type == PIPE_SHADER_FRAGMENT) {
      /* Call svga_create_fs_state() directly rather than going through
       * svga->pipe.create_fs_state(), because the latter goes through
       * the aapoint layer and we would lose the linked list of all
       * transformed shaders if aapoint is used.
       */
      return svga_create_fs_state(&svga->pipe, state);
   }

   if (shader_type == PIPE_SHADER_GEOMETRY)
      return svga->pipe.create_gs_state(&svga->pipe, state);

   if (shader_type == PIPE_SHADER_TESS_CTRL)
      return svga->pipe.create_tcs_state(&svga->pipe, state);

   if (shader_type == PIPE_SHADER_TESS_EVAL)
      return svga->pipe.create_tes_state(&svga->pipe, state);

   return NULL;
}
/**
 * Bind a variant of the given shader that explicitly writes the vertex
 * position output.  If a transformed shader with a matching token key
 * already exists on the shader's variant list it is reused; otherwise a
 * new one is generated with tgsi_write_vpos(), inserted into the list,
 * and bound.
 */
static void
write_vpos(struct svga_context *svga,
           struct svga_shader *shader)
{
   struct svga_token_key key;
   boolean use_existing = FALSE;
   struct svga_shader *transform_shader = NULL;
   const struct tgsi_shader_info *info = &shader->info;

   /* Create a token key */
   memset(&key, 0, sizeof key);
   key.vs.write_position = 1;

   /* Reuse a previously transformed shader if one exists for this key. */
   if (shader->next) {
      transform_shader = svga_search_shader_token_key(shader->next, &key);
      if (transform_shader) {
         use_existing = TRUE;
      }
   }

   if (!use_existing) {
      struct pipe_shader_state state;
      struct tgsi_token *new_tokens = NULL;

      new_tokens = tgsi_write_vpos(shader->tokens,
                                   info->immediate_count);
      if (!new_tokens)
         return;

      pipe_shader_state_from_tgsi(&state, new_tokens);
      transform_shader = create_shader(svga, info->processor, &state);
      if (!transform_shader) {
         /* Fix: shader creation failed (e.g. unsupported processor or
          * out of memory); bail out instead of dereferencing NULL below.
          */
         FREE(new_tokens);
         return;
      }
      insert_at_head(shader, transform_shader);
      FREE(new_tokens);
   }
   transform_shader->token_key = key;
   bind_shader(svga, info->processor, transform_shader);
}
/**
 * transform_dynamic_indexing searches the shader variant list to see if
 * we have a transformed shader for dynamic indexing and reuses/binds it.
 * If we don't have a transformed shader, it creates a new shader from
 * which dynamic indexing is removed.  The new shader is also added to
 * the shader variant list and bound to the current svga state.
 */
static void
transform_dynamic_indexing(struct svga_context *svga,
                           struct svga_shader *shader)
{
   struct svga_token_key key;
   boolean use_existing = FALSE;
   struct svga_shader *transform_shader = NULL;
   const struct tgsi_shader_info *info = &shader->info;

   /* Create a token key */
   memset(&key, 0, sizeof key);
   key.dynamic_indexing = 1;

   /* Reuse a previously transformed shader if one exists for this key. */
   if (shader->next) {
      transform_shader = svga_search_shader_token_key(shader->next, &key);
      if (transform_shader) {
         use_existing = TRUE;
      }
   }

   struct tgsi_token *new_tokens = NULL;

   if (!use_existing) {
      struct pipe_shader_state state;
      new_tokens = tgsi_remove_dynamic_indexing(shader->tokens,
                                                info->const_buffers_declared,
                                                info->samplers_declared,
                                                info->immediate_count);
      if (!new_tokens)
         return;

      pipe_shader_state_from_tgsi(&state, new_tokens);
      transform_shader = create_shader(svga, info->processor, &state);
      if (!transform_shader) {
         /* Fix: shader creation failed; bail out instead of
          * dereferencing a NULL transform_shader below.
          */
         FREE(new_tokens);
         return;
      }
      insert_at_head(shader, transform_shader);
   }
   transform_shader->token_key = key;
   bind_shader(svga, info->processor, transform_shader);
   if (new_tokens)
      FREE(new_tokens);
}
/**
* emulate_point_sprite searches the shader variants list to see it there is
* a shader variant with a token string that matches the emulation
@ -233,18 +401,49 @@ add_point_sprite_shader(struct svga_context *svga)
return &new_gs->base;
}
static boolean
has_dynamic_indexing(const struct tgsi_shader_info *info)
{
return (info->dim_indirect_files & (1u << TGSI_FILE_CONSTANT)) ||
(info->indirect_files & (1u << TGSI_FILE_SAMPLER));
}
/* update_tgsi_transform provides a hook to transform a shader if needed.
*/
static enum pipe_error
update_tgsi_transform(struct svga_context *svga, unsigned dirty)
update_tgsi_transform(struct svga_context *svga, uint64_t dirty)
{
struct svga_geometry_shader *gs = svga->curr.user_gs; /* current gs */
struct svga_vertex_shader *vs = svga->curr.vs; /* currently bound vs */
struct svga_fragment_shader *fs = svga->curr.fs; /* currently bound fs */
struct svga_tcs_shader *tcs = svga->curr.tcs; /* currently bound tcs */
struct svga_tes_shader *tes = svga->curr.tes; /* currently bound tes */
struct svga_shader *orig_gs; /* original gs */
struct svga_shader *new_gs; /* new gs */
if (!svga_have_vgpu10(svga))
return PIPE_OK;
assert(svga_have_vgpu10(svga));
if (vs->base.info.num_outputs == 0) {
write_vpos(svga, &vs->base);
}
if (vs && has_dynamic_indexing(&vs->base.info)) {
transform_dynamic_indexing(svga, &vs->base);
}
if (fs && has_dynamic_indexing(&fs->base.info)) {
transform_dynamic_indexing(svga, &fs->base);
}
if (gs && has_dynamic_indexing(&gs->base.info)) {
transform_dynamic_indexing(svga, &gs->base);
}
if (tcs && has_dynamic_indexing(&tcs->base.info)) {
transform_dynamic_indexing(svga, &tcs->base);
}
if (tes && has_dynamic_indexing(&tes->base.info)) {
transform_dynamic_indexing(svga, &tes->base);
}
if (svga->curr.reduced_prim == PIPE_PRIM_POINTS) {
/* If the current prim type is POINTS and the current geometry shader

View File

@ -0,0 +1,392 @@
/**********************************************************
* Copyright 2018-2020 VMware, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
**********************************************************/
#include "util/u_inlines.h"
#include "util/u_memory.h"
#include "util/u_simple_shaders.h"
#include "svga_context.h"
#include "svga_cmd.h"
#include "svga_tgsi.h"
#include "svga_shader.h"
/**
* Translate TGSI shader into an svga shader variant.
*/
static enum pipe_error
compile_tcs(struct svga_context *svga,
struct svga_tcs_shader *tcs,
const struct svga_compile_key *key,
struct svga_shader_variant **out_variant)
{
struct svga_shader_variant *variant;
enum pipe_error ret = PIPE_ERROR;
variant = svga_tgsi_vgpu10_translate(svga, &tcs->base, key,
PIPE_SHADER_TESS_CTRL);
if (!variant)
return PIPE_ERROR;
ret = svga_define_shader(svga, variant);
if (ret != PIPE_OK) {
svga_destroy_shader_variant(svga, variant);
return ret;
}
*out_variant = variant;
return PIPE_OK;
}
/**
 * Build the compile key for the current tessellation control shader.
 * The key captures all the svga state that affects TCS code generation,
 * so it can be used to look up or create a matching shader variant.
 *
 * NOTE: reads tessellator parameters from the currently bound TES
 * variant (svga->state.hw_draw.tes), so the TES must have been emitted
 * before this is called — see emit_hw_tes()/emit_hw_tcs() ordering.
 */
static void
make_tcs_key(struct svga_context *svga, struct svga_compile_key *key)
{
   struct svga_tcs_shader *tcs = svga->curr.tcs;

   memset(key, 0, sizeof *key);

   /*
    * SVGA_NEW_TEXTURE_BINDING | SVGA_NEW_SAMPLER
    */
   svga_init_shader_key_common(svga, PIPE_SHADER_TESS_CTRL, &tcs->base, key);

   /* SVGA_NEW_TCS_PARAM */
   key->tcs.vertices_per_patch = svga->curr.vertices_per_patch;

   /* The tessellator parameters come from the layout section in the
    * tessellation evaluation shader. Get these parameters from the
    * current tessellation evaluation shader variant.
    * Note: this requires the tessellation evaluation shader to be
    * compiled first.
    */
   struct svga_tes_variant *tes = svga_tes_variant(svga->state.hw_draw.tes);
   key->tcs.prim_mode = tes->prim_mode;
   key->tcs.spacing = tes->spacing;
   key->tcs.vertices_order_cw = tes->vertices_order_cw;
   key->tcs.point_mode = tes->point_mode;

   /* Set when the bound TCS is the auto-generated passthrough shader
    * (created in emit_hw_tes() when the app bound a TES without a TCS).
    */
   if (svga->tcs.passthrough)
      key->tcs.passthrough = 1;

   /* SVGA_NEW_RAST */
   key->clip_plane_enable = svga->curr.rast->templ.clip_plane_enable;

   /* tcs is always followed by tes */
   key->last_vertex_stage = 0;
}
/**
 * Emit the hardware tessellation control shader (HS stage) for the
 * current state: unbind the HS when no TCS is active, otherwise find or
 * compile a variant matching the current compile key and bind it.
 *
 * \param dirty  dirty-state bitmask (unused here; signature required by
 *               the svga_tracked_state update hook)
 */
static enum pipe_error
emit_hw_tcs(struct svga_context *svga, uint64_t dirty)
{
   struct svga_shader_variant *variant;
   struct svga_tcs_shader *tcs = svga->curr.tcs;
   enum pipe_error ret = PIPE_OK;
   struct svga_compile_key key;

   /* Tessellation shaders require SM5 device support. */
   assert(svga_have_sm5(svga));

   SVGA_STATS_TIME_PUSH(svga_sws(svga), SVGA_STATS_TIME_EMITTCS);

   if (!tcs) {
      /* If there is no active tcs, then there should not be
       * active tes either
       */
      assert(!svga->curr.tes);
      if (svga->state.hw_draw.tcs != NULL) {

         /** The previous tessellation control shader is made inactive.
          * Needs to unbind the tessellation control shader.
          */
         ret = svga_set_shader(svga, SVGA3D_SHADERTYPE_HS, NULL);
         if (ret != PIPE_OK)
            goto done;
         svga->state.hw_draw.tcs = NULL;
      }
      goto done;
   }

   make_tcs_key(svga, &key);

   /* See if we already have a TCS variant that matches the key */
   variant = svga_search_shader_key(&tcs->base, &key);

   if (!variant) {
      ret = compile_tcs(svga, tcs, &key, &variant);
      if (ret != PIPE_OK)
         goto done;

      /* insert the new variant at head of linked list */
      assert(variant);
      variant->next = tcs->base.variants;
      tcs->base.variants = variant;
   }

   if (variant != svga->state.hw_draw.tcs) {
      /* Bind the new variant */
      ret = svga_set_shader(svga, SVGA3D_SHADERTYPE_HS, variant);
      if (ret != PIPE_OK)
         goto done;

      /* Record the newly bound variant and flag dependent state. */
      svga->rebind.flags.tcs = FALSE;
      svga->dirty |= SVGA_NEW_TCS_VARIANT;
      svga->state.hw_draw.tcs = variant;
   }

done:
   SVGA_STATS_TIME_POP(svga_sws(svga));
   return ret;
}
/* State-tracker atom: re-run emit_hw_tcs() whenever any of the listed
 * dirty bits are set.
 */
struct svga_tracked_state svga_hw_tcs =
{
   "tessellation control shader (hwtnl)",
   (SVGA_NEW_VS |
    SVGA_NEW_TCS |
    SVGA_NEW_TES |
    SVGA_NEW_TEXTURE_BINDING |
    SVGA_NEW_SAMPLER |
    SVGA_NEW_RAST),
   emit_hw_tcs
};
/**
* Translate TGSI shader into an svga shader variant.
*/
static enum pipe_error
compile_tes(struct svga_context *svga,
struct svga_tes_shader *tes,
const struct svga_compile_key *key,
struct svga_shader_variant **out_variant)
{
struct svga_shader_variant *variant;
enum pipe_error ret = PIPE_ERROR;
variant = svga_tgsi_vgpu10_translate(svga, &tes->base, key,
PIPE_SHADER_TESS_EVAL);
if (!variant)
return PIPE_ERROR;
ret = svga_define_shader(svga, variant);
if (ret != PIPE_OK) {
svga_destroy_shader_variant(svga, variant);
return ret;
}
*out_variant = variant;
return PIPE_OK;
}
/**
 * Build the compile key for the current tessellation evaluation shader.
 * The key captures all the svga state that affects TES code generation.
 * Requires an active TCS (svga->curr.tcs), since several fields are
 * derived from the TCS outputs.
 */
static void
make_tes_key(struct svga_context *svga, struct svga_compile_key *key)
{
   struct svga_tes_shader *tes = svga->curr.tes;

   memset(key, 0, sizeof *key);

   /*
    * SVGA_NEW_TEXTURE_BINDING | SVGA_NEW_SAMPLER
    */
   svga_init_shader_key_common(svga, PIPE_SHADER_TESS_EVAL, &tes->base, key);

   assert(svga->curr.tcs);
   key->tes.vertices_per_patch =
      svga->curr.tcs->base.info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT];

   /* SVGA_NEW_PRESCALE: only prescale here if no GS follows. */
   key->tes.need_prescale = svga->state.hw_clear.prescale[0].enabled &&
                            (svga->curr.gs == NULL);

   /* tcs emits tessellation factors as extra outputs.
    * Since tes depends on them, save the tessFactor output index
    * from tcs in the tes compile key, so that if a different
    * tcs is bound and if the tessFactor index is different,
    * a different tes variant will be generated.
    */
   key->tes.tessfactor_index = svga->curr.tcs->base.info.num_outputs;

   /* SVGA_NEW_RAST */
   key->clip_plane_enable = svga->curr.rast->templ.clip_plane_enable;

   /* This is the last vertex stage if there is no geometry shader. */
   key->last_vertex_stage = !svga->curr.gs;

   /* Scan the TCS outputs to see whether it writes the inner/outer
    * tessellation level semantics that the TES needs to read.
    */
   key->tes.need_tessinner = 0;
   key->tes.need_tessouter = 0;

   for (int i = 0; i < svga->curr.tcs->base.info.num_outputs; i++) {
      switch (svga->curr.tcs->base.info.output_semantic_name[i]) {
      case TGSI_SEMANTIC_TESSOUTER:
         key->tes.need_tessouter = 1;
         break;
      case TGSI_SEMANTIC_TESSINNER:
         key->tes.need_tessinner = 1;
         break;
      default:
         break;
      }
   }
}
/**
 * Bind a passthrough TCS for use when the app bound a TES without a TCS.
 * Reuses the cached passthrough TCS if it was built for the same VS/TES
 * pair and patch size; otherwise deletes the stale one and generates a
 * new one with util_make_tess_ctrl_passthrough_shader().
 * Also sets up constant buffer 0 of the TCS stage with the default
 * tessellation levels.
 */
static void
get_passthrough_tcs(struct svga_context *svga)
{
   if (svga->tcs.passthrough_tcs &&
       svga->tcs.vs == svga->curr.vs &&
       svga->tcs.tes == svga->curr.tes &&
       svga->tcs.vertices_per_patch == svga->curr.vertices_per_patch) {
      /* Cached passthrough TCS still matches current state; rebind it. */
      svga->pipe.bind_tcs_state(&svga->pipe,
                                svga->tcs.passthrough_tcs);
   }
   else {
      struct svga_tcs_shader *new_tcs;

      /* delete older passthrough shader*/
      if (svga->tcs.passthrough_tcs) {
         svga->pipe.delete_tcs_state(&svga->pipe,
                                     svga->tcs.passthrough_tcs);
      }

      /* Generate a passthrough TCS matching the VS outputs and TES
       * inputs, then cache it with the state it was built for.
       */
      new_tcs = (struct svga_tcs_shader *)
         util_make_tess_ctrl_passthrough_shader(&svga->pipe,
            svga->curr.vs->base.info.num_outputs,
            svga->curr.tes->base.info.num_inputs,
            svga->curr.vs->base.info.output_semantic_name,
            svga->curr.vs->base.info.output_semantic_index,
            svga->curr.tes->base.info.input_semantic_name,
            svga->curr.tes->base.info.input_semantic_index,
            svga->curr.vertices_per_patch);
      svga->pipe.bind_tcs_state(&svga->pipe, new_tcs);
      svga->tcs.passthrough_tcs = new_tcs;
      svga->tcs.vs = svga->curr.vs;
      svga->tcs.tes = svga->curr.tes;
      svga->tcs.vertices_per_patch = svga->curr.vertices_per_patch;
   }

   /* Provide the default outer (4 floats) + inner (4 floats) tess levels
    * to the passthrough TCS via a user constant buffer.
    */
   struct pipe_constant_buffer cb;

   cb.buffer = NULL;
   cb.user_buffer = (void *) svga->curr.default_tesslevels;
   cb.buffer_offset = 0;
   cb.buffer_size = 2 * 4 * sizeof(float);
   svga->pipe.set_constant_buffer(&svga->pipe, PIPE_SHADER_TESS_CTRL, 0, &cb);
}
/**
 * Emit the hardware tessellation evaluation shader (DS stage) for the
 * current state: unbind the DS when no TES is active, otherwise bind a
 * passthrough TCS if needed, then find or compile a TES variant matching
 * the current compile key and bind it.
 *
 * \param dirty  dirty-state bitmask (unused here; signature required by
 *               the svga_tracked_state update hook)
 */
static enum pipe_error
emit_hw_tes(struct svga_context *svga, uint64_t dirty)
{
   struct svga_shader_variant *variant;
   struct svga_tes_shader *tes = svga->curr.tes;
   enum pipe_error ret = PIPE_OK;
   struct svga_compile_key key;

   /* Tessellation shaders require SM5 device support. */
   assert(svga_have_sm5(svga));

   SVGA_STATS_TIME_PUSH(svga_sws(svga), SVGA_STATS_TIME_EMITTES);

   if (!tes) {
      /* The GL spec implies that TES is optional when there's a TCS,
       * but that's apparently a spec error. Assert if we have a TCS
       * but no TES.
       */
      assert(!svga->curr.tcs);
      if (svga->state.hw_draw.tes != NULL) {

         /** The previous tessellation evaluation shader is made inactive.
          * Needs to unbind the tessellation evaluation shader.
          */
         ret = svga_set_shader(svga, SVGA3D_SHADERTYPE_DS, NULL);
         if (ret != PIPE_OK)
            goto done;
         svga->state.hw_draw.tes = NULL;
      }
      goto done;
   }

   if (!svga->curr.tcs) {
      /* TES state is processed before the TCS
       * shader and that's why we're checking for and creating the
       * passthough TCS in the emit_hw_tes() function.
       */
      get_passthrough_tcs(svga);
      svga->tcs.passthrough = TRUE;
   }
   else {
      svga->tcs.passthrough = FALSE;
   }

   make_tes_key(svga, &key);

   /* See if we already have a TES variant that matches the key */
   variant = svga_search_shader_key(&tes->base, &key);

   if (!variant) {
      ret = compile_tes(svga, tes, &key, &variant);
      if (ret != PIPE_OK)
         goto done;

      /* insert the new variant at head of linked list */
      assert(variant);
      variant->next = tes->base.variants;
      tes->base.variants = variant;
   }

   if (variant != svga->state.hw_draw.tes) {
      /* Bind the new variant */
      ret = svga_set_shader(svga, SVGA3D_SHADERTYPE_DS, variant);
      if (ret != PIPE_OK)
         goto done;

      /* Record the newly bound variant and flag dependent state. */
      svga->rebind.flags.tes = FALSE;
      svga->dirty |= SVGA_NEW_TES_VARIANT;
      svga->state.hw_draw.tes = variant;
   }

done:
   SVGA_STATS_TIME_POP(svga_sws(svga));
   return ret;
}
/* State-tracker atom: re-run emit_hw_tes() whenever any of the listed
 * dirty bits are set.
 */
struct svga_tracked_state svga_hw_tes =
{
   "tessellation evaluation shader (hwtnl)",
   /* TBD SVGA_NEW_VS/SVGA_NEW_FS/SVGA_NEW_GS are required or not*/
   (SVGA_NEW_VS |
    SVGA_NEW_FS |
    SVGA_NEW_GS |
    SVGA_NEW_TCS |
    SVGA_NEW_TES |
    SVGA_NEW_TEXTURE_BINDING |
    SVGA_NEW_SAMPLER |
    SVGA_NEW_RAST),
   emit_hw_tes
};

View File

@ -139,7 +139,7 @@ emit_tex_binding_unit(struct svga_context *svga,
static enum pipe_error
update_tss_binding(struct svga_context *svga, unsigned dirty)
update_tss_binding(struct svga_context *svga, uint64_t dirty )
{
const enum pipe_shader_type shader = PIPE_SHADER_FRAGMENT;
boolean reemit = svga->rebind.flags.texture_samplers;
@ -149,8 +149,7 @@ update_tss_binding(struct svga_context *svga, unsigned dirty)
struct bind_queue queue;
if (svga_have_vgpu10(svga))
return PIPE_OK;
assert(!svga_have_vgpu10(svga));
queue.bind_count = 0;
@ -167,7 +166,8 @@ update_tss_binding(struct svga_context *svga, unsigned dirty)
/* Polygon stipple */
if (svga->curr.rast->templ.poly_stipple_enable) {
const unsigned unit = svga->state.hw_draw.fs->pstipple_sampler_unit;
const unsigned unit =
svga_fs_variant(svga->state.hw_draw.fs)->pstipple_sampler_unit;
emit_tex_binding_unit(svga, unit,
svga->polygon_stipple.sampler,
&svga->polygon_stipple.sampler_view->base,
@ -257,7 +257,8 @@ svga_reemit_tss_bindings(struct svga_context *svga)
/* Polygon stipple */
if (svga->curr.rast && svga->curr.rast->templ.poly_stipple_enable) {
const unsigned unit = svga->state.hw_draw.fs->pstipple_sampler_unit;
const unsigned unit =
svga_fs_variant(svga->state.hw_draw.fs)->pstipple_sampler_unit;
struct svga_hw_view_state *view = &svga->state.hw_draw.views[unit];
if (view->v) {
@ -380,14 +381,13 @@ emit_tss_unit(struct svga_context *svga, unsigned unit,
}
static enum pipe_error
update_tss(struct svga_context *svga, unsigned dirty)
update_tss(struct svga_context *svga, uint64_t dirty )
{
const enum pipe_shader_type shader = PIPE_SHADER_FRAGMENT;
unsigned i;
struct ts_queue queue;
if (svga_have_vgpu10(svga))
return PIPE_OK;
assert(!svga_have_vgpu10(svga));
queue.ts_count = 0;
for (i = 0; i < svga->curr.num_samplers[shader]; i++) {
@ -400,7 +400,7 @@ update_tss(struct svga_context *svga, unsigned dirty)
/* polygon stipple sampler */
if (svga->curr.rast->templ.poly_stipple_enable) {
emit_tss_unit(svga,
svga->state.hw_draw.fs->pstipple_sampler_unit,
svga_fs_variant(svga->state.hw_draw.fs)->pstipple_sampler_unit,
svga->polygon_stipple.sampler,
&queue);
}

View File

@ -40,7 +40,7 @@
static enum pipe_error
emit_hw_vs_vdecl(struct svga_context *svga, unsigned dirty)
emit_hw_vs_vdecl(struct svga_context *svga, uint64_t dirty)
{
const struct pipe_vertex_element *ve = svga->curr.velems->velem;
SVGA3dVertexDecl decls[SVGA3D_INPUTREG_MAX];
@ -136,7 +136,7 @@ emit_hw_vs_vdecl(struct svga_context *svga, unsigned dirty)
static enum pipe_error
emit_hw_vdecl(struct svga_context *svga, unsigned dirty)
emit_hw_vdecl(struct svga_context *svga, uint64_t dirty)
{
/* SVGA_NEW_NEED_SWTNL
*/

View File

@ -164,7 +164,7 @@ compile_vs(struct svga_context *svga,
static void
make_vs_key(struct svga_context *svga, struct svga_compile_key *key)
{
const enum pipe_shader_type shader = PIPE_SHADER_VERTEX;
struct svga_vertex_shader *vs = svga->curr.vs;
memset(key, 0, sizeof *key);
@ -176,7 +176,8 @@ make_vs_key(struct svga_context *svga, struct svga_compile_key *key)
}
/* SVGA_NEW_PRESCALE */
key->vs.need_prescale = svga->state.hw_clear.prescale.enabled &&
key->vs.need_prescale = svga->state.hw_clear.prescale[0].enabled &&
(svga->curr.tes == NULL) &&
(svga->curr.gs == NULL);
/* SVGA_NEW_RAST */
@ -199,10 +200,16 @@ make_vs_key(struct svga_context *svga, struct svga_compile_key *key)
key->vs.attrib_puint_to_sscaled = svga->curr.velems->attrib_puint_to_sscaled;
/* SVGA_NEW_TEXTURE_BINDING | SVGA_NEW_SAMPLER */
svga_init_shader_key_common(svga, shader, key);
svga_init_shader_key_common(svga, PIPE_SHADER_VERTEX, &vs->base, key);
/* SVGA_NEW_RAST */
key->clip_plane_enable = svga->curr.rast->templ.clip_plane_enable;
/* Determine if this shader is the last shader in the vertex
* processing stage.
*/
key->last_vertex_stage = !(svga->curr.gs ||
svga->curr.tcs || svga->curr.tes);
}
@ -338,7 +345,7 @@ compile_passthrough_vs(struct svga_context *svga,
static enum pipe_error
emit_hw_vs(struct svga_context *svga, unsigned dirty)
emit_hw_vs(struct svga_context *svga, uint64_t dirty)
{
struct svga_shader_variant *variant;
struct svga_vertex_shader *vs = svga->curr.vs;

View File

@ -32,6 +32,9 @@ struct svga_stream_output {
struct pipe_stream_output_info info;
unsigned pos_out_index; // position output index
unsigned id;
unsigned streammask; // bitmask to specify which streams are enabled
unsigned buffer_stream;
struct svga_winsys_buffer *declBuf;
};
struct svga_stream_output *
@ -50,4 +53,20 @@ svga_delete_stream_output(struct svga_context *svga,
enum pipe_error
svga_rebind_stream_output_targets(struct svga_context *svga);
void
svga_create_stream_output_queries(struct svga_context *svga);
void
svga_destroy_stream_output_queries(struct svga_context *svga);
void
svga_begin_stream_output_queries(struct svga_context *svga, unsigned mask);
void
svga_end_stream_output_queries(struct svga_context *svga, unsigned mask);
unsigned
svga_get_primcount_from_stream_output(struct svga_context *svga,
unsigned stream);
#endif /* SVGA_STREAMOUT_H */

View File

@ -578,6 +578,16 @@ svga_validate_surface_view(struct svga_context *svga, struct svga_surface *s)
}
}
/**
* Create an alternate surface view for the specified context if the
* view was created for another context.
*/
if (s && s->base.context != &svga->pipe) {
struct pipe_surface *surf;
surf = svga_create_surface_view(&svga->pipe, s->base.texture, &s->base, FALSE);
s = svga_surface(surf);
}
if (s && s->view_id == SVGA3D_INVALID_ID) {
SVGA3dResourceType resType;
SVGA3dRenderTargetViewDesc desc;

View File

@ -146,6 +146,8 @@ static inline SVGA3dResourceType
svga_resource_type(enum pipe_texture_target target)
{
switch (target) {
case PIPE_BUFFER:
return SVGA3D_RESOURCE_BUFFER;
case PIPE_TEXTURE_1D:
case PIPE_TEXTURE_1D_ARRAY:
return SVGA3D_RESOURCE_TEXTURE1D;

View File

@ -90,11 +90,12 @@ svga_vbuf_render_allocate_vertices(struct vbuf_render *render,
if (!svga_render->vbuf) {
svga_render->vbuf_size = MAX2(size, svga_render->vbuf_alloc_size);
svga_render->vbuf = pipe_buffer_create(screen,
PIPE_BIND_VERTEX_BUFFER,
PIPE_USAGE_STREAM,
svga_render->vbuf_size);
svga_render->vbuf = SVGA_TRY_PTR(pipe_buffer_create
(screen, PIPE_BIND_VERTEX_BUFFER,
PIPE_USAGE_STREAM,
svga_render->vbuf_size));
if (!svga_render->vbuf) {
svga_retry_enter(svga);
svga_context_flush(svga, NULL);
assert(!svga_render->vbuf);
svga_render->vbuf = pipe_buffer_create(screen,
@ -104,6 +105,7 @@ svga_vbuf_render_allocate_vertices(struct vbuf_render *render,
/* The buffer allocation may fail if we run out of memory.
* The draw module's vbuf code should handle that without crashing.
*/
svga_retry_exit(svga);
}
svga->swtnl.new_vdecl = TRUE;
@ -267,7 +269,7 @@ svga_vbuf_submit_state(struct svga_vbuf_render *svga_render)
else {
svga_hwtnl_set_flatshade(svga->hwtnl,
svga->curr.rast->templ.flatshade ||
svga->state.hw_draw.fs->uses_flat_interp,
svga_is_using_flat_shading(svga),
svga->curr.rast->templ.flatshade_first);
svga_hwtnl_set_fillmode(svga->hwtnl, svga->curr.rast->hw_fillmode);
@ -286,10 +288,10 @@ svga_vbuf_render_draw_arrays(struct vbuf_render *render,
struct svga_context *svga = svga_render->svga;
unsigned bias = (svga_render->vbuf_offset - svga_render->vdecl_offset)
/ svga_render->vertex_size;
enum pipe_error ret = PIPE_OK;
/* instancing will already have been resolved at this point by 'draw' */
const unsigned start_instance = 0;
const unsigned instance_count = 1;
boolean retried;
SVGA_STATS_TIME_PUSH(svga_sws(svga), SVGA_STATS_TIME_VBUFDRAWARRAYS);
@ -301,17 +303,13 @@ svga_vbuf_render_draw_arrays(struct vbuf_render *render,
* redbook/polys.c
*/
svga_update_state_retry(svga, SVGA_STATE_HW_DRAW);
ret = svga_hwtnl_draw_arrays(svga->hwtnl, svga_render->prim, start + bias,
nr, start_instance, instance_count);
if (ret != PIPE_OK) {
svga_context_flush(svga, NULL);
ret = svga_hwtnl_draw_arrays(svga->hwtnl, svga_render->prim,
start + bias, nr,
start_instance, instance_count);
SVGA_RETRY_CHECK(svga, svga_hwtnl_draw_arrays
(svga->hwtnl, svga_render->prim, start + bias,
nr, start_instance, instance_count, 0), retried);
if (retried) {
svga->swtnl.new_vbuf = TRUE;
assert(ret == PIPE_OK);
}
SVGA_STATS_TIME_POP(svga_sws(svga));
}
@ -325,7 +323,7 @@ svga_vbuf_render_draw_elements(struct vbuf_render *render,
struct svga_context *svga = svga_render->svga;
int bias = (svga_render->vbuf_offset - svga_render->vdecl_offset)
/ svga_render->vertex_size;
boolean ret;
boolean retried;
/* instancing will already have been resolved at this point by 'draw' */
const struct pipe_draw_info info = {
.index_size = 2,
@ -354,13 +352,12 @@ svga_vbuf_render_draw_elements(struct vbuf_render *render,
* redbook/polys.c
*/
svga_update_state_retry(svga, SVGA_STATE_HW_DRAW);
ret = svga_hwtnl_draw_range_elements(svga->hwtnl, &info, nr_indices);
if (ret != PIPE_OK) {
svga_context_flush(svga, NULL);
ret = svga_hwtnl_draw_range_elements(svga->hwtnl, &info, nr_indices);
SVGA_RETRY_CHECK(svga, svga_hwtnl_draw_range_elements(svga->hwtnl, &info,
nr_indices), retried);
if (retried) {
svga->swtnl.new_vbuf = TRUE;
assert(ret == PIPE_OK);
}
SVGA_STATS_TIME_POP(svga_sws(svga));
}

View File

@ -51,7 +51,7 @@
static void
set_draw_viewport(struct svga_context *svga)
{
struct pipe_viewport_state vp = svga->curr.viewport;
struct pipe_viewport_state vp = svga->curr.viewport[0];
float adjx = 0.0f;
float adjy = 0.0f;
@ -98,7 +98,7 @@ set_draw_viewport(struct svga_context *svga)
}
static enum pipe_error
update_swtnl_draw(struct svga_context *svga, unsigned dirty)
update_swtnl_draw(struct svga_context *svga, uint64_t dirty)
{
SVGA_STATS_TIME_PUSH(svga_sws(svga), SVGA_STATS_TIME_SWTNLUPDATEDRAW);
@ -191,7 +191,6 @@ svga_vdecl_to_input_element(struct svga_context *svga,
{
SVGA3dElementLayoutId id;
SVGA3dInputElementDesc elements[PIPE_MAX_ATTRIBS];
enum pipe_error ret;
unsigned i;
assert(num_decls <= PIPE_MAX_ATTRIBS);
@ -208,13 +207,8 @@ svga_vdecl_to_input_element(struct svga_context *svga,
id = util_bitmask_add(svga->input_element_object_id_bm);
ret = SVGA3D_vgpu10_DefineElementLayout(svga->swc, num_decls, id, elements);
if (ret != PIPE_OK) {
svga_context_flush(svga, NULL);
ret = SVGA3D_vgpu10_DefineElementLayout(svga->swc, num_decls,
id, elements);
assert(ret == PIPE_OK);
}
SVGA_RETRY(svga, SVGA3D_vgpu10_DefineElementLayout(svga->swc, num_decls, id,
elements));
return id;
}
@ -306,22 +300,14 @@ svga_swtnl_update_vdecl(struct svga_context *svga)
any_change = memcmp(svga_render->vdecl, vdecl, sizeof(vdecl));
if (svga_have_vgpu10(svga)) {
enum pipe_error ret;
if (!any_change && svga_render->layout_id != SVGA3D_INVALID_ID) {
goto done;
}
if (svga_render->layout_id != SVGA3D_INVALID_ID) {
/* destroy old */
ret = SVGA3D_vgpu10_DestroyElementLayout(svga->swc,
svga_render->layout_id);
if (ret != PIPE_OK) {
svga_context_flush(svga, NULL);
ret = SVGA3D_vgpu10_DestroyElementLayout(svga->swc,
svga_render->layout_id);
assert(ret == PIPE_OK);
}
SVGA_RETRY(svga, SVGA3D_vgpu10_DestroyElementLayout
(svga->swc, svga_render->layout_id));
/**
* reset current layout id state after the element layout is
@ -340,14 +326,8 @@ svga_swtnl_update_vdecl(struct svga_context *svga)
/* bind new */
if (svga->state.hw_draw.layout_id != svga_render->layout_id) {
ret = SVGA3D_vgpu10_SetInputLayout(svga->swc, svga_render->layout_id);
if (ret != PIPE_OK) {
svga_context_flush(svga, NULL);
ret = SVGA3D_vgpu10_SetInputLayout(svga->swc,
svga_render->layout_id);
assert(ret == PIPE_OK);
}
SVGA_RETRY(svga, SVGA3D_vgpu10_SetInputLayout(svga->swc,
svga_render->layout_id));
svga->state.hw_draw.layout_id = svga_render->layout_id;
}
}
@ -366,7 +346,7 @@ done:
static enum pipe_error
update_swtnl_vdecl(struct svga_context *svga, unsigned dirty)
update_swtnl_vdecl(struct svga_context *svga, uint64_t dirty)
{
return svga_swtnl_update_vdecl(svga);
}

View File

@ -238,14 +238,18 @@ svga_tgsi_vgpu9_translate(struct svga_context *svga,
memcpy(&variant->key, key, sizeof(*key));
variant->id = UTIL_BITMASK_INVALID_INDEX;
variant->pstipple_sampler_unit = emit.pstipple_sampler_unit;
if (unit == PIPE_SHADER_FRAGMENT) {
struct svga_fs_variant *fs_variant = svga_fs_variant(variant);
/* If there was exactly one write to a fragment shader output register
* and it came from a constant buffer, we know all fragments will have
* the same color (except for blending).
*/
variant->constant_color_output =
emit.constant_color_output && emit.num_output_writes == 1;
fs_variant->pstipple_sampler_unit = emit.pstipple_sampler_unit;
/* If there was exactly one write to a fragment shader output register
* and it came from a constant buffer, we know all fragments will have
* the same color (except for blending).
*/
fs_variant->constant_color_output =
emit.constant_color_output && emit.num_output_writes == 1;
}
#if 0
if (!svga_shader_verify(variant->tokens, variant->nr_tokens) ||

View File

@ -30,7 +30,7 @@
#include "svga3d_reg.h"
#define MAX_VGPU10_ADDR_REGS 2
#define MAX_VGPU10_ADDR_REGS 4
struct svga_compile_key;
struct svga_context;

File diff suppressed because it is too large Load Diff

View File

@ -427,7 +427,9 @@ struct svga_winsys_context
uint32 shaderId,
SVGA3dShaderType shaderType,
const uint32 *bytecode,
uint32 bytecodeLen);
uint32 bytecodeLen,
const SVGA3dDXShaderSignatureHeader *sgnInfo,
uint32 sgnLen);
/**
* Destroy a DX GB shader.
@ -457,7 +459,13 @@ struct svga_winsys_context
/** For HUD queries */
uint64_t num_commands;
uint64_t num_command_buffers;
uint64_t num_draw_commands;
uint64_t num_shader_reloc;
uint64_t num_surf_reloc;
/* Whether we are in retry processing */
unsigned int in_retry;
};

View File

@ -65,6 +65,7 @@
#define VMW_MAX_SURF_MEM_FACTOR 2
struct vmw_buffer_relocation
{
struct pb_buffer *buffer;
@ -701,20 +702,19 @@ vmw_svga_winsys_vgpu10_shader_create(struct svga_winsys_context *swc,
uint32 shaderId,
SVGA3dShaderType shaderType,
const uint32 *bytecode,
uint32 bytecodeLen)
uint32 bytecodeLen,
const SVGA3dDXShaderSignatureHeader *sgnInfo,
uint32 sgnLen)
{
struct vmw_svga_winsys_context *vswc = vmw_svga_winsys_context(swc);
struct vmw_svga_winsys_shader *shader;
struct svga_winsys_gb_shader *gb_shader =
vmw_svga_winsys_shader_create(&vswc->vws->base, shaderType, bytecode,
bytecodeLen);
if (!gb_shader)
shader = vmw_svga_shader_create(&vswc->vws->base, shaderType, bytecode,
bytecodeLen, sgnInfo, sgnLen);
if (!shader)
return NULL;
shader = vmw_svga_winsys_shader(gb_shader);
shader->shid = shaderId;
return gb_shader;
return svga_winsys_shader(shader);
}
/**

View File

@ -28,7 +28,9 @@
#include "util/u_debug.h"
#include "util/u_memory.h"
#include "vmw_context.h"
#include "vmw_shader.h"
#include "vmw_buffer.h"
#include "vmw_screen.h"
void
@ -63,3 +65,54 @@ vmw_svga_winsys_shader_reference(struct vmw_svga_winsys_shader **pdst,
*pdst = src;
}
/**
 * A helper function to create a shader object and upload the
 * shader bytecode and signature if specified to the shader memory.
 *
 * The bytecode and the optional signature share a single winsys
 * buffer: the signature (sgnLen > 0) is appended immediately after
 * the bytecode.
 *
 * \param sws         winsys screen used to allocate the backing buffer
 * \param type        shader type (currently unused here; kept for API
 *                    symmetry with the winsys shader-create entry points)
 * \param bytecode    shader bytecode to upload (bytecodeLen bytes)
 * \param sgnInfo     optional shader signature (may be NULL when sgnLen == 0)
 *
 * \return the new shader object, or NULL on allocation/map failure.
 */
struct vmw_svga_winsys_shader *
vmw_svga_shader_create(struct svga_winsys_screen *sws,
                       SVGA3dShaderType type,
                       const uint32 *bytecode,
                       uint32 bytecodeLen,
                       const SVGA3dDXShaderSignatureHeader *sgnInfo,
                       uint32 sgnLen)
{
   struct vmw_svga_winsys_shader *shader;
   void *map;

   shader = CALLOC_STRUCT(vmw_svga_winsys_shader);
   if (!shader)
      return NULL;

   pipe_reference_init(&shader->refcnt, 1);
   p_atomic_set(&shader->validated, 0);
   shader->screen = vmw_winsys_screen(sws);

   /* One buffer holds the bytecode followed by the optional signature. */
   shader->buf = sws->buffer_create(sws, 64,
                                    SVGA_BUFFER_USAGE_SHADER,
                                    bytecodeLen + sgnLen);
   if (!shader->buf) {
      FREE(shader);
      return NULL;
   }

   map = sws->buffer_map(sws, shader->buf, PIPE_TRANSFER_WRITE);
   if (!map) {
      /* Bug fix: also release the backing buffer here; the original
       * error path freed only the shader struct and leaked shader->buf.
       */
      sws->buffer_destroy(sws, shader->buf);
      FREE(shader);
      return NULL;
   }

   /* copy the shader bytecode */
   memcpy(map, bytecode, bytecodeLen);

   /* if shader signature is specified, append it to the bytecode. */
   if (sgnLen) {
      assert(sws->have_sm5);
      map = (char *)map + bytecodeLen;
      memcpy(map, sgnInfo, sgnLen);
   }
   sws->buffer_unmap(sws, shader->buf);

   return shader;
}

View File

@ -65,4 +65,12 @@ void
vmw_svga_winsys_shader_reference(struct vmw_svga_winsys_shader **pdst,
struct vmw_svga_winsys_shader *src);
/**
 * Create a winsys shader object and upload the given bytecode — plus an
 * optional shader signature appended after the bytecode — into the
 * shader's backing buffer.  Returns NULL on failure.
 * sgnInfo may be NULL when sgnLen is 0.
 */
struct vmw_svga_winsys_shader *
vmw_svga_shader_create(struct svga_winsys_screen *sws,
SVGA3dShaderType type,
const uint32 *bytecode,
uint32 bytecodeLen,
const SVGA3dDXShaderSignatureHeader *sgnInfo,
uint32 sgnLen);
#endif /* VMW_SHADER_H_ */