Merge remote-tracking branch 'mesa-public/master' into vulkan

This fixes the bitfieldextract and bitfieldinsert CTS tests
Jason Ekstrand 2016-01-14 11:36:17 -08:00
commit 45349acad0
90 changed files with 1116 additions and 465 deletions


@ -16,6 +16,12 @@
<h1>News</h1>
<h2>January 13, 2016</h2>
<p>
<a href="relnotes/11.1.1.html">Mesa 11.1.1</a> is released.
This is a bug-fix release.
</p>
<h2>December 21, 2015</h2>
<p>
<a href="relnotes/11.0.8.html">Mesa 11.0.8</a> is released.


@ -21,6 +21,7 @@ The release notes summarize what's new or changed in each Mesa release.
</p>
<ul>
<li><a href="relnotes/11.1.1.html">11.1.1 release notes</a>
<li><a href="relnotes/11.0.8.html">11.0.8 release notes</a>
<li><a href="relnotes/11.1.0.html">11.1.0 release notes</a>
<li><a href="relnotes/11.0.7.html">11.0.7 release notes</a>

docs/relnotes/11.1.1.html (new file, 197 lines)

@ -0,0 +1,197 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html lang="en">
<head>
<meta http-equiv="content-type" content="text/html; charset=utf-8">
<title>Mesa Release Notes</title>
<link rel="stylesheet" type="text/css" href="../mesa.css">
</head>
<body>
<div class="header">
<h1>The Mesa 3D Graphics Library</h1>
</div>
<iframe src="../contents.html"></iframe>
<div class="content">
<h1>Mesa 11.1.1 Release Notes / January 13, 2016</h1>
<p>
Mesa 11.1.1 is a bug fix release which fixes bugs found since the 11.1.0 release.
</p>
<p>
Mesa 11.1.1 implements the OpenGL 4.1 API, but the version reported by
glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
Some drivers don't support all the features required in OpenGL 4.1. OpenGL
4.1 is <strong>only</strong> available if requested at context creation
because compatibility contexts are not supported.
</p>
<h2>SHA256 checksums</h2>
<pre>
b15089817540ba0bffd0aad323ecf3a8ff6779568451827c7274890b4a269d58 mesa-11.1.1.tar.gz
64db074fc514136b5fb3890111f0d50604db52f0b1e94ba3fcb0fe8668a7fd20 mesa-11.1.1.tar.xz
</pre>
<h2>New features</h2>
<p>None</p>
<h2>Bug fixes</h2>
<p>This list is likely incomplete.</p>
<ul>
<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91806">Bug 91806</a> - configure does not test whether assembler supports sse4.1</li>
<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92229">Bug 92229</a> - [APITRACE] SOMA have serious graphical errors</li>
<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92233">Bug 92233</a> - Unigine Heaven 4.0 silhuette run</li>
<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93004">Bug 93004</a> - Guild Wars 2 crash on nouveau DX11 cards</li>
<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93215">Bug 93215</a> - [Regression bisected] Ogles1conform Automatic mipmap generation test is fail</li>
<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93257">Bug 93257</a> - [SKL, bisected] ASTC dEQP tests segfault</li>
</ul>
<h2>Changes</h2>
<p>Brian Paul (1):</p>
<ul>
<li>st/mesa: check state-&gt;mesa in early return check in st_validate_state()</li>
</ul>
<p>Dave Airlie (6):</p>
<ul>
<li>mesa/varray: set double arrays to non-normalised.</li>
<li>mesa/shader: return correct attribute location for double matrix arrays</li>
<li>glsl: pass stage into mark function</li>
<li>glsl/fp64: add helper for dual slot double detection.</li>
<li>glsl: fix count_attribute_slots to allow for different 64-bit handling</li>
<li>glsl: only update doubles inputs for vertex inputs.</li>
</ul>
<p>Emil Velikov (4):</p>
<ul>
<li>docs: add sha256 checksums for 11.0.1</li>
<li>cherry-ignore: drop the "re-enable" DCC on Stoney</li>
<li>cherry-ignore: don't pick a specific i965 formats patch</li>
<li>Update version to 11.1.1</li>
</ul>
<p>Eric Anholt (2):</p>
<ul>
<li>vc4: Warn instead of abort()ing on exec ioctl failures.</li>
<li>vc4: Keep sample mask writes from being reordered after TLB writes</li>
</ul>
<p>Grazvydas Ignotas (1):</p>
<ul>
<li>r600: fix constant buffer size programming</li>
</ul>
<p>Ian Romanick (1):</p>
<ul>
<li>meta/generate_mipmap: Work-around GLES 1.x problem with GL_DRAW_FRAMEBUFFER</li>
</ul>
<p>Ilia Mirkin (9):</p>
<ul>
<li>nv50/ir: can't have predication and immediates</li>
<li>gk104/ir: simplify and fool-proof texbar algorithm</li>
<li>glsl: assign varying locations to tess shaders when doing SSO</li>
<li>glx/dri3: a drawable might not be bound at wait time</li>
<li>nvc0: don't forget to reset VTX_TMP bufctx slot after blit completion</li>
<li>nv50/ir: float(s32 &amp; 0xff) = float(u8), not s8</li>
<li>nv50,nvc0: make sure there's pushbuf space and that we ref the bo early</li>
<li>nv50,nvc0: fix crash when increasing bsp bo size for h264</li>
<li>nvc0: scale up inter_bo size so that it's 16M for a 4K video</li>
</ul>
<p>Jonathan Gray (2):</p>
<ul>
<li>configure.ac: use pkg-config for libelf</li>
<li>configure: check for python2.7 for PYTHON2</li>
</ul>
<p>Kenneth Graunke (5):</p>
<ul>
<li>ralloc: Fix ralloc_adopt() to the old context's last child's parent.</li>
<li>drirc: Disable ARB_blend_func_extended for Heaven 4.0/Valley 1.0.</li>
<li>glsl: Fix varying struct locations when varying packing is disabled.</li>
<li>nvc0: Set winding order regardless of domain.</li>
<li>nir: Add a lower_fdiv option, turn fdiv into fmul/frcp.</li>
</ul>
<p>Marek Olšák (7):</p>
<ul>
<li>tgsi/scan: add flag colors_written</li>
<li>r600g: write all MRTs only if there is exactly one output (fixes a hang)</li>
<li>radeonsi: don't call of u_prims_for_vertices for patches and rectangles</li>
<li>radeonsi: apply the streamout workaround to Fiji as well</li>
<li>gallium/radeon: fix Hyper-Z hangs by programming PA_SC_MODE_CNTL_1 correctly</li>
<li>program: add _mesa_reserve_parameter_storage</li>
<li>st/mesa: fix GLSL uniform updates for glBitmap &amp; glDrawPixels (v2)</li>
</ul>
<p>Mark Janes (1):</p>
<ul>
<li>Add missing platform information for KBL</li>
</ul>
<p>Miklós Máté (1):</p>
<ul>
<li>mesa: Don't leak ATIfs instructions in DeleteFragmentShader</li>
</ul>
<p>Neil Roberts (3):</p>
<ul>
<li>i965: Add MESA_FORMAT_B8G8R8X8_SRGB to brw_format_for_mesa_format</li>
<li>i965: Add B8G8R8X8_SRGB to the alpha format override</li>
<li>i965: Fix crash when calling glViewport with no surface bound</li>
</ul>
<p>Nicolai Hähnle (2):</p>
<ul>
<li>gallium/radeon: only dispose locally created target machine in radeon_llvm_compile</li>
<li>gallium/radeon: fix regression in a number of driver queries</li>
</ul>
<p>Oded Gabbay (1):</p>
<ul>
<li>configura.ac: fix test for SSE4.1 assembler support</li>
</ul>
<p>Patrick Rudolph (2):</p>
<ul>
<li>nv50,nvc0: fix use-after-free when vertex buffers are unbound</li>
<li>gallium/util: return correct number of bound vertex buffers</li>
</ul>
<p>Rob Herring (1):</p>
<ul>
<li>freedreno/ir3: fix 32-bit builds with pointer-to-int-cast error enabled</li>
</ul>
<p>Samuel Pitoiset (3):</p>
<ul>
<li>nvc0: free memory allocated by the prog which reads MP perf counters</li>
<li>nv50,nvc0: free memory allocated by performance metrics</li>
<li>nv50: free memory allocated by the prog which reads MP perf counters</li>
</ul>
<p>Sarah Sharp (1):</p>
<ul>
<li>mesa: Add KBL PCI IDs and platform information.</li>
</ul>
</div>
</body>
</html>


@ -1861,6 +1861,8 @@ draw_llvm_make_variant_key(struct draw_llvm *llvm, char *store)
key = (struct draw_llvm_variant_key *)store;
memset(key, 0, offsetof(struct draw_llvm_variant_key, vertex_element[0]));
key->clamp_vertex_color = llvm->draw->rasterizer->clamp_vertex_color;
/* Presumably all variants of the shader should have the same
@ -1883,7 +1885,6 @@ draw_llvm_make_variant_key(struct draw_llvm *llvm, char *store)
key->ucp_enable = llvm->draw->rasterizer->clip_plane_enable;
key->has_gs = llvm->draw->gs.geometry_shader != NULL;
key->num_outputs = draw_total_vs_outputs(llvm->draw);
key->pad1 = 0;
/* All variants of this shader will have the same value for
* nr_samplers. Not yet trying to compact away holes in the
@ -2315,6 +2316,8 @@ draw_gs_llvm_make_variant_key(struct draw_llvm *llvm, char *store)
key = (struct draw_gs_llvm_variant_key *)store;
memset(key, 0, offsetof(struct draw_gs_llvm_variant_key, samplers[0]));
key->num_outputs = draw_total_gs_outputs(llvm->draw);
/* All variants of this shader will have the same value for


@ -311,12 +311,8 @@ struct draw_llvm_variant_key
unsigned need_edgeflags:1;
unsigned has_gs:1;
unsigned num_outputs:8;
/*
* it is important there are no holes in this struct
* (and all padding gets zeroed).
*/
unsigned ucp_enable:PIPE_MAX_CLIP_PLANES;
unsigned pad1:24-PIPE_MAX_CLIP_PLANES;
/* note padding here - must use memset */
/* Variable number of vertex elements:
*/
@ -332,6 +328,7 @@ struct draw_gs_llvm_variant_key
unsigned nr_samplers:8;
unsigned nr_sampler_views:8;
unsigned num_outputs:8;
/* note padding here - must use memset */
struct draw_sampler_static_state samplers[1];
};


@ -648,3 +648,14 @@ In addition, normal texture sampling is allowed from the compute
program: ``bind_sampler_states`` may be used to set up texture
samplers for the compute stage and ``set_sampler_views`` may
be used to bind a number of sampler views to it.
Mipmap generation
^^^^^^^^^^^^^^^^^
If PIPE_CAP_GENERATE_MIPMAP is true, ``generate_mipmap`` can be used
to generate mipmaps for the specified texture resource.
It replaces texel image levels base_level+1 through
last_level, for layers in the range first_layer through last_layer.
It returns TRUE if mipmap generation succeeds, otherwise it
returns FALSE. Mipmap generation may fail when it is not supported
for particular texture types or formats.


@ -301,6 +301,10 @@ The integer capabilities:
alignment for pipe_shader_buffer::buffer_offset, in bytes. Maximum
value allowed is 256 (for GL conformance). 0 is only allowed if
shader buffers are not supported.
* ``PIPE_CAP_INVALIDATE_BUFFER``: Whether the use of ``invalidate_resource``
for buffers is supported.
* ``PIPE_CAP_GENERATE_MIPMAP``: Indicates whether pipe_context::generate_mipmap
is supported.
.. _pipe_capf:


@ -245,6 +245,8 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
case PIPE_CAP_INVALIDATE_BUFFER:
case PIPE_CAP_GENERATE_MIPMAP:
return 0;
case PIPE_CAP_MAX_VIEWPORTS:


@ -259,6 +259,8 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap cap)
case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
case PIPE_CAP_INVALIDATE_BUFFER:
case PIPE_CAP_GENERATE_MIPMAP:
return 0;
case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:


@ -483,6 +483,8 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param)
case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
case PIPE_CAP_INVALIDATE_BUFFER:
case PIPE_CAP_GENERATE_MIPMAP:
return 0;
case PIPE_CAP_VENDOR_ID:


@ -64,43 +64,43 @@ block_full_16(struct lp_rasterizer_task *task,
}
static inline unsigned
build_mask_linear(int64_t c, int64_t dcdx, int64_t dcdy)
build_mask_linear(int32_t c, int32_t dcdx, int32_t dcdy)
{
unsigned mask = 0;
int64_t c0 = c;
int64_t c1 = c0 + dcdy;
int64_t c2 = c1 + dcdy;
int64_t c3 = c2 + dcdy;
int32_t c0 = c;
int32_t c1 = c0 + dcdy;
int32_t c2 = c1 + dcdy;
int32_t c3 = c2 + dcdy;
mask |= ((c0 + 0 * dcdx) >> FIXED_SHIFT) & (1 << 0);
mask |= ((c0 + 1 * dcdx) >> FIXED_SHIFT) & (1 << 1);
mask |= ((c0 + 2 * dcdx) >> FIXED_SHIFT) & (1 << 2);
mask |= ((c0 + 3 * dcdx) >> FIXED_SHIFT) & (1 << 3);
mask |= ((c1 + 0 * dcdx) >> FIXED_SHIFT) & (1 << 4);
mask |= ((c1 + 1 * dcdx) >> FIXED_SHIFT) & (1 << 5);
mask |= ((c1 + 2 * dcdx) >> FIXED_SHIFT) & (1 << 6);
mask |= ((c1 + 3 * dcdx) >> FIXED_SHIFT) & (1 << 7);
mask |= ((c2 + 0 * dcdx) >> FIXED_SHIFT) & (1 << 8);
mask |= ((c2 + 1 * dcdx) >> FIXED_SHIFT) & (1 << 9);
mask |= ((c2 + 2 * dcdx) >> FIXED_SHIFT) & (1 << 10);
mask |= ((c2 + 3 * dcdx) >> FIXED_SHIFT) & (1 << 11);
mask |= ((c3 + 0 * dcdx) >> FIXED_SHIFT) & (1 << 12);
mask |= ((c3 + 1 * dcdx) >> FIXED_SHIFT) & (1 << 13);
mask |= ((c3 + 2 * dcdx) >> FIXED_SHIFT) & (1 << 14);
mask |= ((c3 + 3 * dcdx) >> FIXED_SHIFT) & (1 << 15);
mask |= ((c0 + 0 * dcdx) >> 31) & (1 << 0);
mask |= ((c0 + 1 * dcdx) >> 31) & (1 << 1);
mask |= ((c0 + 2 * dcdx) >> 31) & (1 << 2);
mask |= ((c0 + 3 * dcdx) >> 31) & (1 << 3);
mask |= ((c1 + 0 * dcdx) >> 31) & (1 << 4);
mask |= ((c1 + 1 * dcdx) >> 31) & (1 << 5);
mask |= ((c1 + 2 * dcdx) >> 31) & (1 << 6);
mask |= ((c1 + 3 * dcdx) >> 31) & (1 << 7);
mask |= ((c2 + 0 * dcdx) >> 31) & (1 << 8);
mask |= ((c2 + 1 * dcdx) >> 31) & (1 << 9);
mask |= ((c2 + 2 * dcdx) >> 31) & (1 << 10);
mask |= ((c2 + 3 * dcdx) >> 31) & (1 << 11);
mask |= ((c3 + 0 * dcdx) >> 31) & (1 << 12);
mask |= ((c3 + 1 * dcdx) >> 31) & (1 << 13);
mask |= ((c3 + 2 * dcdx) >> 31) & (1 << 14);
mask |= ((c3 + 3 * dcdx) >> 31) & (1 << 15);
return mask;
}
static inline void
build_masks(int64_t c,
int64_t cdiff,
int64_t dcdx,
int64_t dcdy,
unsigned *outmask,
unsigned *partmask)
build_masks(int32_t c,
int32_t cdiff,
int32_t dcdx,
int32_t dcdy,
unsigned *outmask,
unsigned *partmask)
{
*outmask |= build_mask_linear(c, dcdx, dcdy);
*partmask |= build_mask_linear(c + cdiff, dcdx, dcdy);
@ -140,12 +140,12 @@ lp_rast_triangle_4_16(struct lp_rasterizer_task *task,
static inline void
build_masks_32(int c,
int cdiff,
int dcdx,
int dcdy,
unsigned *outmask,
unsigned *partmask)
build_masks_sse(int c,
int cdiff,
int dcdx,
int dcdy,
unsigned *outmask,
unsigned *partmask)
{
__m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
__m128i xdcdy = _mm_set1_epi32(dcdy);
@ -186,7 +186,7 @@ build_masks_32(int c,
static inline unsigned
build_mask_linear_32(int c, int dcdx, int dcdy)
build_mask_linear_sse(int c, int dcdx, int dcdy)
{
__m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
__m128i xdcdy = _mm_set1_epi32(dcdy);
@ -442,12 +442,12 @@ lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
#include "util/u_pwr8.h"
static inline void
build_masks_32(int c,
int cdiff,
int dcdx,
int dcdy,
unsigned *outmask,
unsigned *partmask)
build_masks_ppc(int c,
int cdiff,
int dcdx,
int dcdy,
unsigned *outmask,
unsigned *partmask)
{
__m128i cstep0 = vec_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
__m128i xdcdy = (__m128i) vec_splats(dcdy);
@ -487,7 +487,7 @@ build_masks_32(int c,
}
static inline unsigned
build_mask_linear_32(int c, int dcdx, int dcdy)
build_mask_linear_ppc(int c, int dcdx, int dcdy)
{
__m128i cstep0 = vec_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
__m128i xdcdy = (__m128i) vec_splats(dcdy);
@ -684,8 +684,18 @@ lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
#endif
#if defined PIPE_ARCH_SSE
#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks_sse((int)c, (int)cdiff, dcdx, dcdy, omask, pmask)
#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear_sse((int)c, dcdx, dcdy)
#elif (defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN))
#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks_ppc((int)c, (int)cdiff, dcdx, dcdy, omask, pmask)
#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear_ppc((int)c, dcdx, dcdy)
#else
#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks(c, cdiff, dcdx, dcdy, omask, pmask)
#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear(c, dcdx, dcdy)
#endif
#define RASTER_64 1
#define TAG(x) x##_1
#define NR_PLANES 1
@ -722,12 +732,7 @@ lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
#define NR_PLANES 8
#include "lp_rast_tri_tmp.h"
#if defined(PIPE_ARCH_SSE) || (defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN))
#undef BUILD_MASKS
#undef BUILD_MASK_LINEAR
#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks_32((int)c, (int)cdiff, dcdx, dcdy, omask, pmask)
#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear_32((int)c, dcdx, dcdy)
#endif
#undef RASTER_64
#define TAG(x) x##_32_1
#define NR_PLANES 1


@ -50,9 +50,15 @@ TAG(do_block_4)(struct lp_rasterizer_task *task,
int j;
for (j = 0; j < NR_PLANES; j++) {
mask &= ~BUILD_MASK_LINEAR(c[j] - 1,
-plane[j].dcdx,
plane[j].dcdy);
#ifdef RASTER_64
mask &= ~BUILD_MASK_LINEAR(((c[j] - 1) >> (int64_t)FIXED_ORDER),
-plane[j].dcdx >> FIXED_ORDER,
plane[j].dcdy >> FIXED_ORDER);
#else
mask &= ~BUILD_MASK_LINEAR((c[j] - 1),
-plane[j].dcdx,
plane[j].dcdy);
#endif
}
/* Now pass to the shader:
@ -79,17 +85,33 @@ TAG(do_block_16)(struct lp_rasterizer_task *task,
partmask = 0; /* outside one or more trivial accept planes */
for (j = 0; j < NR_PLANES; j++) {
#ifdef RASTER_64
int32_t dcdx = -plane[j].dcdx >> FIXED_ORDER;
int32_t dcdy = plane[j].dcdy >> FIXED_ORDER;
const int32_t cox = plane[j].eo >> FIXED_ORDER;
const int32_t ei = (dcdy + dcdx - cox) << 2;
const int32_t cox_s = cox << 2;
const int32_t co = (int32_t)(c[j] >> (int64_t)FIXED_ORDER) + cox_s;
int32_t cdiff;
cdiff = ei - cox_s + ((int32_t)((c[j] - 1) >> (int64_t)FIXED_ORDER) -
(int32_t)(c[j] >> (int64_t)FIXED_ORDER));
dcdx <<= 2;
dcdy <<= 2;
#else
const int64_t dcdx = -IMUL64(plane[j].dcdx, 4);
const int64_t dcdy = IMUL64(plane[j].dcdy, 4);
const int64_t cox = IMUL64(plane[j].eo, 4);
const int64_t ei = plane[j].dcdy - plane[j].dcdx - (int64_t)plane[j].eo;
const int32_t ei = plane[j].dcdy - plane[j].dcdx - (int64_t)plane[j].eo;
const int64_t cio = IMUL64(ei, 4) - 1;
int32_t co, cdiff;
co = c[j] + cox;
cdiff = cio - cox;
#endif
BUILD_MASKS(c[j] + cox,
cio - cox,
dcdx, dcdy,
&outmask, /* sign bits from c[i][0..15] + cox */
&partmask); /* sign bits from c[i][0..15] + cio */
BUILD_MASKS(co, cdiff,
dcdx, dcdy,
&outmask, /* sign bits from c[i][0..15] + cox */
&partmask); /* sign bits from c[i][0..15] + cio */
}
if (outmask == 0xffff)
@ -179,14 +201,65 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task,
c[j] = plane[j].c + IMUL64(plane[j].dcdy, y) - IMUL64(plane[j].dcdx, x);
{
const int64_t dcdx = -IMUL64(plane[j].dcdx, 16);
const int64_t dcdy = IMUL64(plane[j].dcdy, 16);
const int64_t cox = IMUL64(plane[j].eo, 16);
const int64_t ei = plane[j].dcdy - plane[j].dcdx - (int64_t)plane[j].eo;
const int64_t cio = IMUL64(ei, 16) - 1;
BUILD_MASKS(c[j] + cox,
cio - cox,
#ifdef RASTER_64
/*
* Strip off lower FIXED_ORDER bits. Note that those bits from
* dcdx, dcdy, eo are always 0 (by definition).
* c values, however, are not. This means that for every
* addition of the form c + n*dcdx the lower FIXED_ORDER bits will
* NOT change. And those bits are not relevant to the sign bit (which
* is only what we need!) that is,
* sign(c + n*dcdx) == sign((c >> FIXED_ORDER) + n*(dcdx >> FIXED_ORDER))
* This means we can get away with using 32bit math for the most part.
* Only tricky part is the -1 adjustment for cdiff.
*/
int32_t dcdx = -plane[j].dcdx >> FIXED_ORDER;
int32_t dcdy = plane[j].dcdy >> FIXED_ORDER;
const int32_t cox = plane[j].eo >> FIXED_ORDER;
const int32_t ei = (dcdy + dcdx - cox) << 4;
const int32_t cox_s = cox << 4;
const int32_t co = (int32_t)(c[j] >> (int64_t)FIXED_ORDER) + cox_s;
int32_t cdiff;
/*
* Plausibility check to ensure the 32bit math works.
* Note that within a tile, the max we can move the edge function
* is essentially dcdx * TILE_SIZE + dcdy * TILE_SIZE.
* TILE_SIZE is 64, dcdx/dcdy are nominally 21 bit (for 8192 max size
* and 8 subpixel bits), I'd be happy with 2 bits more too (1 for
* increasing fb size to 16384, the required d3d11 value, another one
* because I'm not quite sure we can't be _just_ above the max value
* here). This gives us 30 bits max - hence if c would exceed that here
* that means the plane is either trivial reject for the whole tile
* (in which case the tri will not get binned), or trivial accept for
* the whole tile (in which case plane_mask will not include it).
*/
assert((c[j] >> (int64_t)FIXED_ORDER) > (int32_t)0xb0000000 &&
(c[j] >> (int64_t)FIXED_ORDER) < (int32_t)0x3fffffff);
/*
* Note the fixup part is constant throughout the tile - thus could
* just calculate this and avoid _all_ 64bit math in rasterization
* (except exactly this fixup calc).
* In fact theoretically could move that even to setup, albeit that
* seems tricky (pre-bin certainly can have values larger than 32bit,
* and would need to communicate that fixup value through).
* And if we want to support msaa, we probably don't want to do the
* downscaling in setup in any case...
*/
cdiff = ei - cox_s + ((int32_t)((c[j] - 1) >> (int64_t)FIXED_ORDER) -
(int32_t)(c[j] >> (int64_t)FIXED_ORDER));
dcdx <<= 4;
dcdy <<= 4;
#else
const int32_t dcdx = -plane[j].dcdx << 4;
const int32_t dcdy = plane[j].dcdy << 4;
const int32_t cox = plane[j].eo << 4;
const int32_t ei = plane[j].dcdy - plane[j].dcdx - (int32_t)plane[j].eo;
const int32_t cio = (ei << 4) - 1;
int32_t co, cdiff;
co = c[j] + cox;
cdiff = cio - cox;
#endif
BUILD_MASKS(co, cdiff,
dcdx, dcdy,
&outmask, /* sign bits from c[i][0..15] + cox */
&partmask); /* sign bits from c[i][0..15] + cio */


@ -308,6 +308,8 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
case PIPE_CAP_INVALIDATE_BUFFER:
case PIPE_CAP_GENERATE_MIPMAP:
return 0;
}
/* should only get here on unhandled cases */


@ -713,24 +713,24 @@ try_setup_line( struct lp_setup_context *setup,
const struct u_rect *scissor =
&setup->scissors[viewport_index];
plane[4].dcdx = -1;
plane[4].dcdx = -1 << 8;
plane[4].dcdy = 0;
plane[4].c = 1-scissor->x0;
plane[4].eo = 1;
plane[4].c = (1-scissor->x0) << 8;
plane[4].eo = 1 << 8;
plane[5].dcdx = 1;
plane[5].dcdx = 1 << 8;
plane[5].dcdy = 0;
plane[5].c = scissor->x1+1;
plane[5].c = (scissor->x1+1) << 8;
plane[5].eo = 0;
plane[6].dcdx = 0;
plane[6].dcdy = 1;
plane[6].c = 1-scissor->y0;
plane[6].eo = 1;
plane[6].dcdy = 1 << 8;
plane[6].c = (1-scissor->y0) << 8;
plane[6].eo = 1 << 8;
plane[7].dcdx = 0;
plane[7].dcdy = -1;
plane[7].c = scissor->y1+1;
plane[7].dcdy = -1 << 8;
plane[7].c = (scissor->y1+1) << 8;
plane[7].eo = 0;
}


@ -492,24 +492,24 @@ try_setup_point( struct lp_setup_context *setup,
{
struct lp_rast_plane *plane = GET_PLANES(point);
plane[0].dcdx = -1;
plane[0].dcdx = -1 << 8;
plane[0].dcdy = 0;
plane[0].c = 1-bbox.x0;
plane[0].eo = 1;
plane[0].c = (1-bbox.x0) << 8;
plane[0].eo = 1 << 8;
plane[1].dcdx = 1;
plane[1].dcdx = 1 << 8;
plane[1].dcdy = 0;
plane[1].c = bbox.x1+1;
plane[1].c = (bbox.x1+1) << 8;
plane[1].eo = 0;
plane[2].dcdx = 0;
plane[2].dcdy = 1;
plane[2].c = 1-bbox.y0;
plane[2].eo = 1;
plane[2].dcdy = 1 << 8;
plane[2].c = (1-bbox.y0) << 8;
plane[2].eo = 1 << 8;
plane[3].dcdx = 0;
plane[3].dcdy = -1;
plane[3].c = bbox.y1+1;
plane[3].dcdy = -1 << 8;
plane[3].c = (bbox.y1+1) << 8;
plane[3].eo = 0;
}


@ -68,11 +68,11 @@ fixed_to_float(int a)
struct fixed_position {
int32_t x[4];
int32_t y[4];
int64_t area;
int32_t dx01;
int32_t dy01;
int32_t dx20;
int32_t dy20;
int64_t area;
};
@ -676,24 +676,24 @@ do_triangle_ccw(struct lp_setup_context *setup,
if (nr_planes == 7) {
const struct u_rect *scissor = &setup->scissors[viewport_index];
plane[3].dcdx = -1;
plane[3].dcdx = -1 << 8;
plane[3].dcdy = 0;
plane[3].c = 1-scissor->x0;
plane[3].eo = 1;
plane[3].c = (1-scissor->x0) << 8;
plane[3].eo = 1 << 8;
plane[4].dcdx = 1;
plane[4].dcdx = 1 << 8;
plane[4].dcdy = 0;
plane[4].c = scissor->x1+1;
plane[4].c = (scissor->x1+1) << 8;
plane[4].eo = 0;
plane[5].dcdx = 0;
plane[5].dcdy = 1;
plane[5].c = 1-scissor->y0;
plane[5].eo = 1;
plane[5].dcdy = 1 << 8;
plane[5].c = (1-scissor->y0) << 8;
plane[5].eo = 1 << 8;
plane[6].dcdx = 0;
plane[6].dcdy = -1;
plane[6].c = scissor->y1+1;
plane[6].dcdy = -1 << 8;
plane[6].c = (scissor->y1+1) << 8;
plane[6].eo = 0;
}
@ -966,29 +966,71 @@ static void retry_triangle_ccw( struct lp_setup_context *setup,
/**
* Calculate fixed position data for a triangle
* It is unfortunate we need to do that here (as we need area
* calculated in fixed point), as there's quite some code duplication
* to what is done in the jit setup prog.
*/
static inline void
calc_fixed_position( struct lp_setup_context *setup,
struct fixed_position* position,
const float (*v0)[4],
const float (*v1)[4],
const float (*v2)[4])
calc_fixed_position(struct lp_setup_context *setup,
struct fixed_position* position,
const float (*v0)[4],
const float (*v1)[4],
const float (*v2)[4])
{
/*
* The rounding may not be quite the same with PIPE_ARCH_SSE
* (util_iround right now only does nearest/even on x87,
* otherwise nearest/away-from-zero).
* Both should be acceptable, I think.
*/
#if defined(PIPE_ARCH_SSE)
__m128d v0r, v1r, v2r;
__m128 vxy0xy2, vxy1xy0;
__m128i vxy0xy2i, vxy1xy0i;
__m128i dxdy0120, x0x2y0y2, x1x0y1y0, x0120, y0120;
__m128 pix_offset = _mm_set1_ps(setup->pixel_offset);
__m128 fixed_one = _mm_set1_ps((float)FIXED_ONE);
v0r = _mm_load_sd((const double *)v0[0]);
v1r = _mm_load_sd((const double *)v1[0]);
v2r = _mm_load_sd((const double *)v2[0]);
vxy0xy2 = _mm_castpd_ps(_mm_unpacklo_pd(v0r, v2r));
vxy1xy0 = _mm_castpd_ps(_mm_unpacklo_pd(v1r, v0r));
vxy0xy2 = _mm_sub_ps(vxy0xy2, pix_offset);
vxy1xy0 = _mm_sub_ps(vxy1xy0, pix_offset);
vxy0xy2 = _mm_mul_ps(vxy0xy2, fixed_one);
vxy1xy0 = _mm_mul_ps(vxy1xy0, fixed_one);
vxy0xy2i = _mm_cvtps_epi32(vxy0xy2);
vxy1xy0i = _mm_cvtps_epi32(vxy1xy0);
dxdy0120 = _mm_sub_epi32(vxy0xy2i, vxy1xy0i);
_mm_store_si128((__m128i *)&position->dx01, dxdy0120);
/*
* For the mul, would need some more shuffles, plus emulation
* for the signed mul (without sse41), so don't bother.
*/
x0x2y0y2 = _mm_shuffle_epi32(vxy0xy2i, _MM_SHUFFLE(3,1,2,0));
x1x0y1y0 = _mm_shuffle_epi32(vxy1xy0i, _MM_SHUFFLE(3,1,2,0));
x0120 = _mm_unpacklo_epi32(x0x2y0y2, x1x0y1y0);
y0120 = _mm_unpackhi_epi32(x0x2y0y2, x1x0y1y0);
_mm_store_si128((__m128i *)&position->x[0], x0120);
_mm_store_si128((__m128i *)&position->y[0], y0120);
#else
position->x[0] = subpixel_snap(v0[0][0] - setup->pixel_offset);
position->x[1] = subpixel_snap(v1[0][0] - setup->pixel_offset);
position->x[2] = subpixel_snap(v2[0][0] - setup->pixel_offset);
position->x[3] = 0;
position->x[3] = 0; // should be unused
position->y[0] = subpixel_snap(v0[0][1] - setup->pixel_offset);
position->y[1] = subpixel_snap(v1[0][1] - setup->pixel_offset);
position->y[2] = subpixel_snap(v2[0][1] - setup->pixel_offset);
position->y[3] = 0;
position->y[3] = 0; // should be unused
position->dx01 = position->x[0] - position->x[1];
position->dy01 = position->y[0] - position->y[1];
position->dx20 = position->x[2] - position->x[0];
position->dy20 = position->y[2] - position->y[0];
#endif
position->area = IMUL64(position->dx01, position->dy20) -
IMUL64(position->dx20, position->dy01);


@ -181,6 +181,8 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
case PIPE_CAP_INVALIDATE_BUFFER:
case PIPE_CAP_GENERATE_MIPMAP:
return 0;
case PIPE_CAP_VENDOR_ID:


@ -224,6 +224,8 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
case PIPE_CAP_INVALIDATE_BUFFER:
case PIPE_CAP_GENERATE_MIPMAP:
return 0;
case PIPE_CAP_VENDOR_ID:


@ -213,6 +213,8 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
case PIPE_CAP_INVALIDATE_BUFFER:
case PIPE_CAP_GENERATE_MIPMAP:
return 0;
case PIPE_CAP_VENDOR_ID:


@ -207,6 +207,8 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
case PIPE_CAP_INVALIDATE_BUFFER:
case PIPE_CAP_GENERATE_MIPMAP:
return 0;
/* SWTCL-only features. */


@ -278,6 +278,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
case PIPE_CAP_TGSI_TXQS:
case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
case PIPE_CAP_INVALIDATE_BUFFER:
return 1;
case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
@ -355,6 +356,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
case PIPE_CAP_GENERATE_MIPMAP:
return 0;
case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:


@ -209,6 +209,36 @@ static void r600_buffer_destroy(struct pipe_screen *screen,
FREE(rbuffer);
}
static bool
r600_do_invalidate_resource(struct r600_common_context *rctx,
struct r600_resource *rbuffer)
{
/* In AMD_pinned_memory, the user pointer association only gets
* broken when the buffer is explicitly re-allocated.
*/
if (rctx->ws->buffer_is_user_ptr(rbuffer->buf))
return false;
/* Check if mapping this buffer would cause waiting for the GPU. */
if (r600_rings_is_buffer_referenced(rctx, rbuffer->buf, RADEON_USAGE_READWRITE) ||
!rctx->ws->buffer_wait(rbuffer->buf, 0, RADEON_USAGE_READWRITE)) {
rctx->invalidate_buffer(&rctx->b, &rbuffer->b.b);
} else {
util_range_set_empty(&rbuffer->valid_buffer_range);
}
return true;
}
void r600_invalidate_resource(struct pipe_context *ctx,
struct pipe_resource *resource)
{
struct r600_common_context *rctx = (struct r600_common_context*)ctx;
struct r600_resource *rbuffer = r600_resource(resource);
(void)r600_do_invalidate_resource(rctx, rbuffer);
}
static void *r600_buffer_get_transfer(struct pipe_context *ctx,
struct pipe_resource *resource,
unsigned level,
@ -276,13 +306,10 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx,
!(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) {
assert(usage & PIPE_TRANSFER_WRITE);
/* Check if mapping this buffer would cause waiting for the GPU. */
if (r600_rings_is_buffer_referenced(rctx, rbuffer->buf, RADEON_USAGE_READWRITE) ||
!rctx->ws->buffer_wait(rbuffer->buf, 0, RADEON_USAGE_READWRITE)) {
rctx->invalidate_buffer(&rctx->b, &rbuffer->b.b);
if (r600_do_invalidate_resource(rctx, rbuffer)) {
/* At this point, the buffer is always idle. */
usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
}
/* At this point, the buffer is always idle. */
usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
}
else if ((usage & PIPE_TRANSFER_DISCARD_RANGE) &&
!(usage & PIPE_TRANSFER_UNSYNCHRONIZED) &&


@ -257,6 +257,7 @@ bool r600_common_context_init(struct r600_common_context *rctx,
else
rctx->max_db = 4;
rctx->b.invalidate_resource = r600_invalidate_resource;
rctx->b.transfer_map = u_transfer_map_vtbl;
rctx->b.transfer_flush_region = u_transfer_flush_region_vtbl;
rctx->b.transfer_unmap = u_transfer_unmap_vtbl;


@ -500,6 +500,9 @@ struct pipe_resource *
r600_buffer_from_user_memory(struct pipe_screen *screen,
const struct pipe_resource *templ,
void *user_memory);
void
r600_invalidate_resource(struct pipe_context *ctx,
struct pipe_resource *resource);
/* r600_common_pipe.c */
void r600_draw_rectangle(struct blitter_context *blitter,


@ -529,6 +529,14 @@ struct radeon_winsys {
struct pb_buffer *(*buffer_from_ptr)(struct radeon_winsys *ws,
void *pointer, unsigned size);
/**
* Whether the buffer was created from a user pointer.
*
* \param buf A winsys buffer object
* \return whether \p buf was created via buffer_from_ptr
*/
bool (*buffer_is_user_ptr)(struct pb_buffer *buf);
/**
* Get a winsys handle from a winsys buffer. The internal structure
* of the handle is platform-specific and only a winsys should access it.


@ -301,6 +301,9 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
case PIPE_CAP_TGSI_TXQS:
case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
case PIPE_CAP_INVALIDATE_BUFFER:
return 1;
case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
@ -344,9 +347,8 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
case PIPE_CAP_MULTI_DRAW_INDIRECT:
case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
case PIPE_CAP_GENERATE_MIPMAP:
return 0;
case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:


@ -911,36 +911,6 @@ static void declare_input_fs(
unsigned chan;
if (decl->Semantic.Name == TGSI_SEMANTIC_POSITION) {
for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
unsigned soa_index =
radeon_llvm_reg_index_soa(input_index, chan);
radeon_bld->inputs[soa_index] =
LLVMGetParam(main_fn, SI_PARAM_POS_X_FLOAT + chan);
if (chan == 3)
/* RCP for fragcoord.w */
radeon_bld->inputs[soa_index] =
LLVMBuildFDiv(gallivm->builder,
lp_build_const_float(gallivm, 1.0f),
radeon_bld->inputs[soa_index],
"");
}
return;
}
if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) {
radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)] =
LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE);
radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 1)] =
radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 2)] =
lp_build_const_float(gallivm, 0.0f);
radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 3)] =
lp_build_const_float(gallivm, 1.0f);
return;
}
shader->ps_input_param_offset[input_index] = shader->nparam++;
attr_number = lp_build_const_int32(gallivm,
shader->ps_input_param_offset[input_index]);
@ -975,10 +945,8 @@ static void declare_input_fs(
face = LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE);
is_face_positive = LLVMBuildFCmp(gallivm->builder,
LLVMRealOGT, face,
lp_build_const_float(gallivm, 0.0f),
"");
is_face_positive = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
face, uint->zero, "");
args[2] = params;
args[3] = interp_param;
@ -1129,6 +1097,24 @@ static void declare_system_value(
assert(!"INVOCATIONID not implemented");
break;
case TGSI_SEMANTIC_POSITION:
{
LLVMValueRef pos[4] = {
LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_X_FLOAT),
LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Y_FLOAT),
LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Z_FLOAT),
lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base, TGSI_OPCODE_RCP,
LLVMGetParam(radeon_bld->main_fn,
SI_PARAM_POS_W_FLOAT)),
};
value = lp_build_gather_values(gallivm, pos, 4);
break;
}
case TGSI_SEMANTIC_FACE:
value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_FRONT_FACE);
break;
case TGSI_SEMANTIC_SAMPLEID:
value = get_sample_id(radeon_bld);
break;
@ -3506,7 +3492,7 @@ static void create_function(struct si_shader_context *si_shader_ctx)
params[SI_PARAM_POS_Y_FLOAT] = f32;
params[SI_PARAM_POS_Z_FLOAT] = f32;
params[SI_PARAM_POS_W_FLOAT] = f32;
params[SI_PARAM_FRONT_FACE] = f32;
params[SI_PARAM_FRONT_FACE] = i32;
params[SI_PARAM_ANCILLARY] = i32;
params[SI_PARAM_SAMPLE_COVERAGE] = f32;
params[SI_PARAM_POS_FIXED_PT] = f32;
@ -4067,7 +4053,7 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
if (poly_stipple) {
tokens = util_pstipple_create_fragment_shader(tokens, NULL,
SI_POLY_STIPPLE_SAMPLER,
TGSI_FILE_INPUT);
TGSI_FILE_SYSTEM_VALUE);
tgsi_scan_shader(tokens, &stipple_shader_info);
}


@ -390,7 +390,7 @@ static void si_shader_ps(struct si_shader *shader)
unsigned spi_shader_col_format = 0, cb_shader_mask = 0;
unsigned colors_written, export_16bpc;
unsigned num_sgprs, num_user_sgprs;
unsigned spi_baryc_cntl = 0;
unsigned spi_baryc_cntl = S_0286E0_FRONT_FACE_ALL_BITS(1);
uint64_t va;
bool has_centroid;
@ -399,30 +399,29 @@ static void si_shader_ps(struct si_shader *shader)
if (!pm4)
return;
for (i = 0; i < info->num_inputs; i++) {
switch (info->input_semantic_name[i]) {
case TGSI_SEMANTIC_POSITION:
/* SPI_BARYC_CNTL.POS_FLOAT_LOCATION
* Possible values:
* 0 -> Position = pixel center (default)
* 1 -> Position = pixel centroid
* 2 -> Position = at sample position
*/
switch (info->input_interpolate_loc[i]) {
case TGSI_INTERPOLATE_LOC_CENTROID:
spi_baryc_cntl |= S_0286E0_POS_FLOAT_LOCATION(1);
break;
case TGSI_INTERPOLATE_LOC_SAMPLE:
spi_baryc_cntl |= S_0286E0_POS_FLOAT_LOCATION(2);
break;
}
/* SPI_BARYC_CNTL.POS_FLOAT_LOCATION
* Possible values:
* 0 -> Position = pixel center
* 1 -> Position = pixel centroid
* 2 -> Position = at sample position
*
* From GLSL 4.5 specification, section 7.1:
* "The variable gl_FragCoord is available as an input variable from
* within fragment shaders and it holds the window relative coordinates
* (x, y, z, 1/w) values for the fragment. If multi-sampling, this
* value can be for any location within the pixel, or one of the
* fragment samples. The use of centroid does not further restrict
* this value to be inside the current primitive."
*
* Meaning that centroid has no effect and we can return anything within
* the pixel. Thus, return the value at sample position, because that's
* the most accurate one shaders can get.
*/
spi_baryc_cntl |= S_0286E0_POS_FLOAT_LOCATION(2);
if (info->properties[TGSI_PROPERTY_FS_COORD_PIXEL_CENTER] ==
TGSI_FS_COORD_PIXEL_CENTER_INTEGER)
spi_baryc_cntl |= S_0286E0_POS_FLOAT_ULC(1);
break;
}
}
if (info->properties[TGSI_PROPERTY_FS_COORD_PIXEL_CENTER] ==
TGSI_FS_COORD_PIXEL_CENTER_INTEGER)
spi_baryc_cntl |= S_0286E0_POS_FLOAT_ULC(1);
/* Find out what SPI_SHADER_COL_FORMAT and CB_SHADER_MASK should be. */
colors_written = info->colors_written;
@ -980,12 +979,6 @@ static void si_emit_spi_map(struct si_context *sctx, struct r600_atom *atom)
unsigned index = psinfo->input_semantic_index[i];
unsigned interpolate = psinfo->input_interpolate[i];
unsigned param_offset = ps->ps_input_param_offset[i];
if (name == TGSI_SEMANTIC_POSITION ||
name == TGSI_SEMANTIC_FACE)
/* Read from preloaded VGPRs, not parameters */
continue;
bcolor:
tmp = 0;
@ -1324,6 +1317,7 @@ static bool si_update_spi_tmpring_size(struct si_context *sctx)
si_get_max_scratch_bytes_per_wave(sctx);
unsigned scratch_needed_size = scratch_bytes_per_wave *
sctx->scratch_waves;
unsigned spi_tmpring_size;
int r;
if (scratch_needed_size > 0) {
@ -1393,8 +1387,12 @@ static bool si_update_spi_tmpring_size(struct si_context *sctx)
assert((scratch_needed_size & ~0x3FF) == scratch_needed_size &&
"scratch size should already be aligned correctly.");
sctx->spi_tmpring_size = S_0286E8_WAVES(sctx->scratch_waves) |
S_0286E8_WAVESIZE(scratch_bytes_per_wave >> 10);
spi_tmpring_size = S_0286E8_WAVES(sctx->scratch_waves) |
S_0286E8_WAVESIZE(scratch_bytes_per_wave >> 10);
if (spi_tmpring_size != sctx->spi_tmpring_size) {
sctx->spi_tmpring_size = spi_tmpring_size;
sctx->emit_scratch_reloc = true;
}
return true;
}


@ -258,6 +258,8 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
case PIPE_CAP_INVALIDATE_BUFFER:
case PIPE_CAP_GENERATE_MIPMAP:
return 0;
}
/* should only get here on unhandled cases */


@ -638,4 +638,8 @@ SVGA3D_vgpu10_UpdateSubResource(struct svga_winsys_context *swc,
const SVGA3dBox *box,
unsigned subResource);
enum pipe_error
SVGA3D_vgpu10_GenMips(struct svga_winsys_context *swc,
const SVGA3dShaderResourceViewId shaderResourceViewId,
struct svga_winsys_surface *view);
#endif /* __SVGA3D_H__ */


@ -1293,3 +1293,24 @@ SVGA3D_vgpu10_UpdateSubResource(struct svga_winsys_context *swc,
swc->commit(swc);
return PIPE_OK;
}
enum pipe_error
SVGA3D_vgpu10_GenMips(struct svga_winsys_context *swc,
SVGA3dShaderResourceViewId shaderResourceViewId,
struct svga_winsys_surface *view)
{
SVGA3dCmdDXGenMips *cmd;
cmd = SVGA3D_FIFOReserve(swc, SVGA_3D_CMD_DX_GENMIPS,
sizeof(SVGA3dCmdDXGenMips), 1);
if (!cmd)
return PIPE_ERROR_OUT_OF_MEMORY;
swc->surface_relocation(swc, &cmd->shaderResourceViewId, NULL, view,
SVGA_RELOC_WRITE);
cmd->shaderResourceViewId = shaderResourceViewId;
swc->commit(swc);
return PIPE_OK;
}


@ -59,8 +59,9 @@
#define SVGA_QUERY_NUM_RESOURCES (PIPE_QUERY_DRIVER_SPECIFIC + 9)
#define SVGA_QUERY_NUM_STATE_OBJECTS (PIPE_QUERY_DRIVER_SPECIFIC + 10)
#define SVGA_QUERY_NUM_SURFACE_VIEWS (PIPE_QUERY_DRIVER_SPECIFIC + 11)
#define SVGA_QUERY_NUM_GENERATE_MIPMAP (PIPE_QUERY_DRIVER_SPECIFIC + 12)
/* SVGA_QUERY_MAX has to be last because it is the size of an array */
#define SVGA_QUERY_MAX (PIPE_QUERY_DRIVER_SPECIFIC + 12)
#define SVGA_QUERY_MAX (PIPE_QUERY_DRIVER_SPECIFIC + 13)
/**
* Maximum supported number of constant buffers per shader
@ -505,6 +506,7 @@ struct svga_context
uint64_t num_state_objects; /**< SVGA_QUERY_NUM_STATE_OBJECTS */
uint64_t num_surface_views; /**< SVGA_QUERY_NUM_SURFACE_VIEWS */
uint64_t num_bytes_uploaded; /**< SVGA_QUERY_NUM_BYTES_UPLOADED */
uint64_t num_generate_mipmap; /**< SVGA_QUERY_NUM_GENERATE_MIPMAP */
} hud;
/** The currently bound stream output targets */


@ -48,16 +48,16 @@ static const struct vgpu10_format_entry format_conversion_table[] =
{
/* Gallium format SVGA3D vertex format SVGA3D pixel format Flags */
{ PIPE_FORMAT_NONE, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
{ PIPE_FORMAT_B8G8R8A8_UNORM, SVGA3D_B8G8R8A8_UNORM, SVGA3D_B8G8R8A8_UNORM, 0 },
{ PIPE_FORMAT_B8G8R8X8_UNORM, SVGA3D_FORMAT_INVALID, SVGA3D_B8G8R8X8_UNORM, 0 },
{ PIPE_FORMAT_B8G8R8A8_UNORM, SVGA3D_B8G8R8A8_UNORM, SVGA3D_B8G8R8A8_UNORM, TF_GEN_MIPS },
{ PIPE_FORMAT_B8G8R8X8_UNORM, SVGA3D_FORMAT_INVALID, SVGA3D_B8G8R8X8_UNORM, TF_GEN_MIPS },
{ PIPE_FORMAT_A8R8G8B8_UNORM, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
{ PIPE_FORMAT_X8R8G8B8_UNORM, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
{ PIPE_FORMAT_B5G5R5A1_UNORM, SVGA3D_FORMAT_INVALID, SVGA3D_B5G5R5A1_UNORM, 0 },
{ PIPE_FORMAT_B5G5R5A1_UNORM, SVGA3D_FORMAT_INVALID, SVGA3D_B5G5R5A1_UNORM, TF_GEN_MIPS },
{ PIPE_FORMAT_B4G4R4A4_UNORM, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
{ PIPE_FORMAT_B5G6R5_UNORM, SVGA3D_FORMAT_INVALID, SVGA3D_B5G6R5_UNORM, 0 },
{ PIPE_FORMAT_R10G10B10A2_UNORM, SVGA3D_R10G10B10A2_UNORM, SVGA3D_R10G10B10A2_UNORM, 0 },
{ PIPE_FORMAT_B5G6R5_UNORM, SVGA3D_FORMAT_INVALID, SVGA3D_B5G6R5_UNORM, TF_GEN_MIPS },
{ PIPE_FORMAT_R10G10B10A2_UNORM, SVGA3D_R10G10B10A2_UNORM, SVGA3D_R10G10B10A2_UNORM, TF_GEN_MIPS },
{ PIPE_FORMAT_L8_UNORM, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
{ PIPE_FORMAT_A8_UNORM, SVGA3D_FORMAT_INVALID, SVGA3D_A8_UNORM, 0 },
{ PIPE_FORMAT_A8_UNORM, SVGA3D_FORMAT_INVALID, SVGA3D_A8_UNORM, TF_GEN_MIPS },
{ PIPE_FORMAT_I8_UNORM, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
{ PIPE_FORMAT_L8A8_UNORM, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
{ PIPE_FORMAT_L16_UNORM, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
@ -75,10 +75,10 @@ static const struct vgpu10_format_entry format_conversion_table[] =
{ PIPE_FORMAT_R64G64_FLOAT, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
{ PIPE_FORMAT_R64G64B64_FLOAT, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
{ PIPE_FORMAT_R64G64B64A64_FLOAT, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
{ PIPE_FORMAT_R32_FLOAT, SVGA3D_R32_FLOAT, SVGA3D_R32_FLOAT, 0 },
{ PIPE_FORMAT_R32G32_FLOAT, SVGA3D_R32G32_FLOAT, SVGA3D_R32G32_FLOAT, 0 },
{ PIPE_FORMAT_R32G32B32_FLOAT, SVGA3D_R32G32B32_FLOAT, SVGA3D_R32G32B32_FLOAT, 0 },
{ PIPE_FORMAT_R32G32B32A32_FLOAT, SVGA3D_R32G32B32A32_FLOAT, SVGA3D_R32G32B32A32_FLOAT, 0 },
{ PIPE_FORMAT_R32_FLOAT, SVGA3D_R32_FLOAT, SVGA3D_R32_FLOAT, TF_GEN_MIPS },
{ PIPE_FORMAT_R32G32_FLOAT, SVGA3D_R32G32_FLOAT, SVGA3D_R32G32_FLOAT, TF_GEN_MIPS },
{ PIPE_FORMAT_R32G32B32_FLOAT, SVGA3D_R32G32B32_FLOAT, SVGA3D_R32G32B32_FLOAT, TF_GEN_MIPS },
{ PIPE_FORMAT_R32G32B32A32_FLOAT, SVGA3D_R32G32B32A32_FLOAT, SVGA3D_R32G32B32A32_FLOAT, TF_GEN_MIPS },
{ PIPE_FORMAT_R32_UNORM, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
{ PIPE_FORMAT_R32G32_UNORM, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
{ PIPE_FORMAT_R32G32B32_UNORM, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
@ -95,10 +95,10 @@ static const struct vgpu10_format_entry format_conversion_table[] =
{ PIPE_FORMAT_R32G32_SSCALED, SVGA3D_R32G32_SINT, SVGA3D_FORMAT_INVALID, VF_I_TO_F_CAST },
{ PIPE_FORMAT_R32G32B32_SSCALED, SVGA3D_R32G32B32_SINT, SVGA3D_FORMAT_INVALID, VF_I_TO_F_CAST },
{ PIPE_FORMAT_R32G32B32A32_SSCALED, SVGA3D_R32G32B32A32_SINT, SVGA3D_FORMAT_INVALID, VF_I_TO_F_CAST },
{ PIPE_FORMAT_R16_UNORM, SVGA3D_R16_UNORM, SVGA3D_R16_UNORM, 0 },
{ PIPE_FORMAT_R16G16_UNORM, SVGA3D_R16G16_UNORM, SVGA3D_R16G16_UNORM, 0 },
{ PIPE_FORMAT_R16_UNORM, SVGA3D_R16_UNORM, SVGA3D_R16_UNORM, TF_GEN_MIPS },
{ PIPE_FORMAT_R16G16_UNORM, SVGA3D_R16G16_UNORM, SVGA3D_R16G16_UNORM, TF_GEN_MIPS },
{ PIPE_FORMAT_R16G16B16_UNORM, SVGA3D_R16G16B16A16_UNORM, SVGA3D_FORMAT_INVALID, VF_W_TO_1 },
{ PIPE_FORMAT_R16G16B16A16_UNORM, SVGA3D_R16G16B16A16_UNORM, SVGA3D_R16G16B16A16_UNORM, 0 },
{ PIPE_FORMAT_R16G16B16A16_UNORM, SVGA3D_R16G16B16A16_UNORM, SVGA3D_R16G16B16A16_UNORM, TF_GEN_MIPS },
{ PIPE_FORMAT_R16_USCALED, SVGA3D_R16_UINT, SVGA3D_FORMAT_INVALID, VF_U_TO_F_CAST },
{ PIPE_FORMAT_R16G16_USCALED, SVGA3D_R16G16_UINT, SVGA3D_FORMAT_INVALID, VF_U_TO_F_CAST },
{ PIPE_FORMAT_R16G16B16_USCALED, SVGA3D_R16G16B16A16_UINT, SVGA3D_FORMAT_INVALID, VF_W_TO_1 | VF_U_TO_F_CAST },
@ -111,10 +111,10 @@ static const struct vgpu10_format_entry format_conversion_table[] =
{ PIPE_FORMAT_R16G16_SSCALED, SVGA3D_R16G16_SINT, SVGA3D_FORMAT_INVALID, VF_I_TO_F_CAST },
{ PIPE_FORMAT_R16G16B16_SSCALED, SVGA3D_R16G16B16A16_SINT, SVGA3D_FORMAT_INVALID, VF_W_TO_1 | VF_I_TO_F_CAST },
{ PIPE_FORMAT_R16G16B16A16_SSCALED, SVGA3D_R16G16B16A16_SINT, SVGA3D_FORMAT_INVALID, VF_I_TO_F_CAST },
{ PIPE_FORMAT_R8_UNORM, SVGA3D_R8_UNORM, SVGA3D_R8_UNORM, 0 },
{ PIPE_FORMAT_R8G8_UNORM, SVGA3D_R8G8_UNORM, SVGA3D_R8G8_UNORM, 0 },
{ PIPE_FORMAT_R8_UNORM, SVGA3D_R8_UNORM, SVGA3D_R8_UNORM, TF_GEN_MIPS },
{ PIPE_FORMAT_R8G8_UNORM, SVGA3D_R8G8_UNORM, SVGA3D_R8G8_UNORM, TF_GEN_MIPS },
{ PIPE_FORMAT_R8G8B8_UNORM, SVGA3D_R8G8B8A8_UNORM, SVGA3D_FORMAT_INVALID, VF_W_TO_1 },
{ PIPE_FORMAT_R8G8B8A8_UNORM, SVGA3D_R8G8B8A8_UNORM, SVGA3D_R8G8B8A8_UNORM, 0 },
{ PIPE_FORMAT_R8G8B8A8_UNORM, SVGA3D_R8G8B8A8_UNORM, SVGA3D_R8G8B8A8_UNORM, TF_GEN_MIPS },
{ PIPE_FORMAT_X8B8G8R8_UNORM, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
{ PIPE_FORMAT_R8_USCALED, SVGA3D_R8_UINT, SVGA3D_FORMAT_INVALID, VF_U_TO_F_CAST },
{ PIPE_FORMAT_R8G8_USCALED, SVGA3D_R8G8_UINT, SVGA3D_FORMAT_INVALID, VF_U_TO_F_CAST },
@ -138,20 +138,20 @@ static const struct vgpu10_format_entry format_conversion_table[] =
{ PIPE_FORMAT_R32G32_FIXED, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
{ PIPE_FORMAT_R32G32B32_FIXED, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
{ PIPE_FORMAT_R32G32B32A32_FIXED, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
{ PIPE_FORMAT_R16_FLOAT, SVGA3D_R16_FLOAT, SVGA3D_R16_FLOAT, 0 },
{ PIPE_FORMAT_R16G16_FLOAT, SVGA3D_R16G16_FLOAT, SVGA3D_R16G16_FLOAT, 0 },
{ PIPE_FORMAT_R16_FLOAT, SVGA3D_R16_FLOAT, SVGA3D_R16_FLOAT, TF_GEN_MIPS },
{ PIPE_FORMAT_R16G16_FLOAT, SVGA3D_R16G16_FLOAT, SVGA3D_R16G16_FLOAT, TF_GEN_MIPS },
{ PIPE_FORMAT_R16G16B16_FLOAT, SVGA3D_R16G16B16A16_FLOAT, SVGA3D_FORMAT_INVALID, VF_W_TO_1 },
{ PIPE_FORMAT_R16G16B16A16_FLOAT, SVGA3D_R16G16B16A16_FLOAT, SVGA3D_R16G16B16A16_FLOAT, 0 },
{ PIPE_FORMAT_R16G16B16A16_FLOAT, SVGA3D_R16G16B16A16_FLOAT, SVGA3D_R16G16B16A16_FLOAT, TF_GEN_MIPS },
{ PIPE_FORMAT_L8_SRGB, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
{ PIPE_FORMAT_L8A8_SRGB, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
{ PIPE_FORMAT_R8G8B8_SRGB, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
{ PIPE_FORMAT_A8B8G8R8_SRGB, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
{ PIPE_FORMAT_X8B8G8R8_SRGB, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
{ PIPE_FORMAT_B8G8R8A8_SRGB, SVGA3D_FORMAT_INVALID, SVGA3D_B8G8R8A8_UNORM_SRGB, 0 },
{ PIPE_FORMAT_B8G8R8X8_SRGB, SVGA3D_FORMAT_INVALID, SVGA3D_B8G8R8X8_UNORM_SRGB, 0 },
{ PIPE_FORMAT_B8G8R8A8_SRGB, SVGA3D_FORMAT_INVALID, SVGA3D_B8G8R8A8_UNORM_SRGB, TF_GEN_MIPS },
{ PIPE_FORMAT_B8G8R8X8_SRGB, SVGA3D_FORMAT_INVALID, SVGA3D_B8G8R8X8_UNORM_SRGB, TF_GEN_MIPS },
{ PIPE_FORMAT_A8R8G8B8_SRGB, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
{ PIPE_FORMAT_X8R8G8B8_SRGB, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
{ PIPE_FORMAT_R8G8B8A8_SRGB, SVGA3D_FORMAT_INVALID, SVGA3D_R8G8B8A8_UNORM_SRGB, 0 },
{ PIPE_FORMAT_R8G8B8A8_SRGB, SVGA3D_FORMAT_INVALID, SVGA3D_R8G8B8A8_UNORM_SRGB, TF_GEN_MIPS },
{ PIPE_FORMAT_DXT1_RGB, SVGA3D_FORMAT_INVALID, SVGA3D_BC1_UNORM, 0 },
{ PIPE_FORMAT_DXT1_RGBA, SVGA3D_FORMAT_INVALID, SVGA3D_BC1_UNORM, 0 },
{ PIPE_FORMAT_DXT3_RGBA, SVGA3D_FORMAT_INVALID, SVGA3D_BC2_UNORM, 0 },
@ -171,7 +171,7 @@ static const struct vgpu10_format_entry format_conversion_table[] =
{ PIPE_FORMAT_A8B8G8R8_UNORM, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
{ PIPE_FORMAT_B5G5R5X1_UNORM, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
{ PIPE_FORMAT_R10G10B10A2_USCALED, SVGA3D_R10G10B10A2_UNORM, SVGA3D_FORMAT_INVALID, VF_PUINT_TO_USCALED },
{ PIPE_FORMAT_R11G11B10_FLOAT, SVGA3D_FORMAT_INVALID, SVGA3D_R11G11B10_FLOAT, 0 },
{ PIPE_FORMAT_R11G11B10_FLOAT, SVGA3D_FORMAT_INVALID, SVGA3D_R11G11B10_FLOAT, TF_GEN_MIPS },
{ PIPE_FORMAT_R9G9B9E5_FLOAT, SVGA3D_FORMAT_INVALID, SVGA3D_R9G9B9E5_SHAREDEXP, 0 },
{ PIPE_FORMAT_Z32_FLOAT_S8X24_UINT, SVGA3D_FORMAT_INVALID, SVGA3D_D32_FLOAT_S8X24_UINT, 0 },
{ PIPE_FORMAT_R1_UNORM, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
@ -1967,6 +1967,13 @@ svga_format_is_integer(SVGA3dSurfaceFormat format)
}
}
boolean
svga_format_support_gen_mips(enum pipe_format format)
{
assert(format < Elements(format_conversion_table));
return ((format_conversion_table[format].flags & TF_GEN_MIPS) > 0);
}
/**
* Given a texture format, return the expected data type returned from


@ -52,6 +52,10 @@ struct svga_screen;
#define VF_PUINT_TO_USCALED (1 << 6) /* 10_10_10_2 to uscaled */
#define VF_PUINT_TO_SSCALED (1 << 7) /* 10_10_10_2 to sscaled */
/**
* Texture format flags.
*/
#define TF_GEN_MIPS (1 << 8) /* supports hw generate mipmap */
void
svga_translate_vertex_format_vgpu10(enum pipe_format format,
@ -80,6 +84,9 @@ svga_format_name(SVGA3dSurfaceFormat format);
boolean
svga_format_is_integer(SVGA3dSurfaceFormat format);
boolean
svga_format_support_gen_mips(enum pipe_format format);
enum tgsi_return_type
svga_get_texture_datatype(enum pipe_format format);


@ -732,6 +732,7 @@ svga_create_query(struct pipe_context *pipe,
case SVGA_QUERY_NUM_SURFACE_VIEWS:
case SVGA_QUERY_NUM_RESOURCES_MAPPED:
case SVGA_QUERY_NUM_BYTES_UPLOADED:
case SVGA_QUERY_NUM_GENERATE_MIPMAP:
break;
default:
assert(!"unexpected query type in svga_create_query()");
@ -800,6 +801,7 @@ svga_destroy_query(struct pipe_context *pipe, struct pipe_query *q)
case SVGA_QUERY_NUM_SURFACE_VIEWS:
case SVGA_QUERY_NUM_RESOURCES_MAPPED:
case SVGA_QUERY_NUM_BYTES_UPLOADED:
case SVGA_QUERY_NUM_GENERATE_MIPMAP:
/* nothing */
break;
default:
@ -887,6 +889,7 @@ svga_begin_query(struct pipe_context *pipe, struct pipe_query *q)
case SVGA_QUERY_NUM_RESOURCES:
case SVGA_QUERY_NUM_STATE_OBJECTS:
case SVGA_QUERY_NUM_SURFACE_VIEWS:
case SVGA_QUERY_NUM_GENERATE_MIPMAP:
/* nothing */
break;
default:
@ -980,6 +983,7 @@ svga_end_query(struct pipe_context *pipe, struct pipe_query *q)
case SVGA_QUERY_NUM_RESOURCES:
case SVGA_QUERY_NUM_STATE_OBJECTS:
case SVGA_QUERY_NUM_SURFACE_VIEWS:
case SVGA_QUERY_NUM_GENERATE_MIPMAP:
/* nothing */
break;
default:
@ -1090,6 +1094,9 @@ svga_get_query_result(struct pipe_context *pipe,
case SVGA_QUERY_NUM_SURFACE_VIEWS:
vresult->u64 = svga->hud.num_surface_views;
break;
case SVGA_QUERY_NUM_GENERATE_MIPMAP:
vresult->u64 = svga->hud.num_generate_mipmap;
break;
default:
assert(!"unexpected query type in svga_get_query_result");
}


@ -107,6 +107,12 @@ svga_init_resource_functions(struct svga_context *svga)
svga->pipe.transfer_flush_region = u_transfer_flush_region_vtbl;
svga->pipe.transfer_unmap = u_transfer_unmap_vtbl;
svga->pipe.transfer_inline_write = u_transfer_inline_write_vtbl;
if (svga_have_vgpu10(svga)) {
svga->pipe.generate_mipmap = svga_texture_generate_mipmap;
} else {
svga->pipe.generate_mipmap = NULL;
}
}
void


@ -993,3 +993,61 @@ svga_texture_from_handle(struct pipe_screen *screen,
return &tex->b.b;
}
boolean
svga_texture_generate_mipmap(struct pipe_context *pipe,
struct pipe_resource *pt,
enum pipe_format format,
unsigned base_level,
unsigned last_level,
unsigned first_layer,
unsigned last_layer)
{
struct pipe_sampler_view templ, *psv;
struct svga_pipe_sampler_view *sv;
struct svga_context *svga = svga_context(pipe);
struct svga_texture *tex = svga_texture(pt);
enum pipe_error ret;
assert(svga_have_vgpu10(svga));
/* Only support 2D texture for now */
if (pt->target != PIPE_TEXTURE_2D)
return FALSE;
/* Fall back to the mipmap generation utility for those formats that
* do not support hw generate mipmap
*/
if (!svga_format_support_gen_mips(format))
return FALSE;
/* Make sure the texture surface was created with
* SVGA3D_SURFACE_BIND_RENDER_TARGET
*/
if (!tex->handle || !(tex->key.flags & SVGA3D_SURFACE_BIND_RENDER_TARGET))
return FALSE;
templ.format = format;
templ.u.tex.first_layer = first_layer;
templ.u.tex.last_layer = last_layer;
templ.u.tex.first_level = base_level;
templ.u.tex.last_level = last_level;
psv = pipe->create_sampler_view(pipe, pt, &templ);
if (psv == NULL)
return FALSE;
sv = svga_pipe_sampler_view(psv);
svga_validate_pipe_sampler_view(svga, sv);
ret = SVGA3D_vgpu10_GenMips(svga->swc, sv->id, tex->handle);
if (ret != PIPE_OK) {
svga_context_flush(svga, NULL);
ret = SVGA3D_vgpu10_GenMips(svga->swc, sv->id, tex->handle);
}
pipe_sampler_view_reference(&psv, NULL);
svga->hud.num_generate_mipmap++;
return TRUE;
}


@ -217,7 +217,14 @@ svga_texture_from_handle(struct pipe_screen * screen,
const struct pipe_resource *template,
struct winsys_handle *whandle);
boolean
svga_texture_generate_mipmap(struct pipe_context *pipe,
struct pipe_resource *pt,
enum pipe_format format,
unsigned base_level,
unsigned last_level,
unsigned first_layer,
unsigned last_layer);
#endif /* SVGA_TEXTURE_H */


@ -35,6 +35,7 @@
struct pipe_context;
struct pipe_screen;
struct svga_context;
struct svga_pipe_sampler_view;
struct svga_winsys_surface;
struct svga_surface;
enum SVGA3dSurfaceFormat;
@ -102,4 +103,9 @@ boolean
svga_check_sampler_view_resource_collision(struct svga_context *svga,
struct svga_winsys_surface *res,
unsigned shader);
enum pipe_error
svga_validate_pipe_sampler_view(struct svga_context *svga,
struct svga_pipe_sampler_view *sv);
#endif


@ -319,6 +319,9 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
case PIPE_CAP_PRIMITIVE_RESTART:
return 1; /* may be a sw fallback, depending on restart index */
case PIPE_CAP_GENERATE_MIPMAP:
return sws->have_vgpu10;
/* Unsupported features */
case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION:
case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
@ -353,6 +356,7 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
case PIPE_CAP_INVALIDATE_BUFFER:
return 0;
case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT:
return 64;
@ -821,6 +825,8 @@ svga_get_driver_query_info(struct pipe_screen *screen,
PIPE_DRIVER_QUERY_TYPE_UINT64),
QUERY("num-surface-views", SVGA_QUERY_NUM_SURFACE_VIEWS,
PIPE_DRIVER_QUERY_TYPE_UINT64),
QUERY("num-generate-mipmap", SVGA_QUERY_NUM_GENERATE_MIPMAP,
PIPE_DRIVER_QUERY_TYPE_UINT64),
};
#undef QUERY


@ -90,7 +90,7 @@ svga_check_sampler_view_resource_collision(struct svga_context *svga,
* Create a DX ShaderResourceSamplerView for the given pipe_sampler_view,
* if needed.
*/
static enum pipe_error
enum pipe_error
svga_validate_pipe_sampler_view(struct svga_context *svga,
struct svga_pipe_sampler_view *sv)
{


@ -1291,6 +1291,42 @@ trace_context_flush(struct pipe_context *_pipe,
}
static inline boolean
trace_context_generate_mipmap(struct pipe_context *_pipe,
struct pipe_resource *res,
enum pipe_format format,
unsigned base_level,
unsigned last_level,
unsigned first_layer,
unsigned last_layer)
{
struct trace_context *tr_ctx = trace_context(_pipe);
struct pipe_context *pipe = tr_ctx->pipe;
boolean ret;
res = trace_resource_unwrap(tr_ctx, res);
trace_dump_call_begin("pipe_context", "generate_mipmap");
trace_dump_arg(ptr, pipe);
trace_dump_arg(ptr, res);
trace_dump_arg(format, format);
trace_dump_arg(uint, base_level);
trace_dump_arg(uint, last_level);
trace_dump_arg(uint, first_layer);
trace_dump_arg(uint, last_layer);
ret = pipe->generate_mipmap(pipe, res, format, base_level, last_level,
first_layer, last_layer);
trace_dump_ret(bool, ret);
trace_dump_call_end();
return ret;
}
static inline void
trace_context_destroy(struct pipe_context *_pipe)
{
@ -1620,6 +1656,7 @@ trace_context_create(struct trace_screen *tr_scr,
TR_CTX_INIT(clear_render_target);
TR_CTX_INIT(clear_depth_stencil);
TR_CTX_INIT(flush);
TR_CTX_INIT(generate_mipmap);
TR_CTX_INIT(texture_barrier);
TR_CTX_INIT(memory_barrier);
TR_CTX_INIT(set_tess_state);


@ -196,6 +196,8 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
case PIPE_CAP_INVALIDATE_BUFFER:
case PIPE_CAP_GENERATE_MIPMAP:
return 0;
/* Stream output. */


@ -226,6 +226,8 @@ virgl_get_param(struct pipe_screen *screen, enum pipe_cap param)
case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
case PIPE_CAP_INVALIDATE_BUFFER:
case PIPE_CAP_GENERATE_MIPMAP:
return 0;
case PIPE_CAP_VENDOR_ID:
return 0x1af4;


@ -649,11 +649,15 @@ struct pipe_context {
struct pipe_resource *resource);
/**
* Invalidate the contents of the resource.
* Invalidate the contents of the resource. This is used to
*
* This is used to implement EGL's semantic of undefined depth/stencil
* (1) implement EGL's semantic of undefined depth/stencil
contents after a swapbuffers. This allows a tiled renderer (for
* example) to not store the depth buffer.
*
* (2) implement GL's InvalidateBufferData. For backwards compatibility,
* you must only rely on the usability for this purpose when
* PIPE_CAP_INVALIDATE_BUFFER is enabled.
*/
void (*invalidate_resource)(struct pipe_context *ctx,
struct pipe_resource *resource);
@ -673,6 +677,18 @@ struct pipe_context {
*/
void (*dump_debug_state)(struct pipe_context *ctx, FILE *stream,
unsigned flags);
/**
* Generate mipmap.
* \return TRUE if mipmap generation succeeds, FALSE otherwise
*/
boolean (*generate_mipmap)(struct pipe_context *ctx,
struct pipe_resource *resource,
enum pipe_format format,
unsigned base_level,
unsigned last_level,
unsigned first_layer,
unsigned last_layer);
};


@ -642,6 +642,8 @@ enum pipe_cap
PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL,
PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL,
PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT,
PIPE_CAP_INVALIDATE_BUFFER,
PIPE_CAP_GENERATE_MIPMAP,
};
#define PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_NV50 (1 << 0)


@ -686,6 +686,11 @@ error:
return NULL;
}
static bool amdgpu_bo_is_user_ptr(struct pb_buffer *buf)
{
return ((struct amdgpu_winsys_bo*)buf)->user_ptr != NULL;
}
static uint64_t amdgpu_bo_get_va(struct pb_buffer *buf)
{
return ((struct amdgpu_winsys_bo*)buf)->va;
@ -701,6 +706,7 @@ void amdgpu_bo_init_functions(struct amdgpu_winsys *ws)
ws->base.buffer_create = amdgpu_bo_create;
ws->base.buffer_from_handle = amdgpu_bo_from_handle;
ws->base.buffer_from_ptr = amdgpu_bo_from_ptr;
ws->base.buffer_is_user_ptr = amdgpu_bo_is_user_ptr;
ws->base.buffer_get_handle = amdgpu_bo_get_handle;
ws->base.buffer_get_virtual_address = amdgpu_bo_get_va;
ws->base.buffer_get_initial_domain = amdgpu_bo_get_initial_domain;


@ -582,7 +582,7 @@ static struct radeon_bo *radeon_create_bo(struct radeon_drm_winsys *rws,
pipe_mutex_unlock(rws->bo_handles_mutex);
pb_reference(&b, &old_bo->base);
return b;
return radeon_bo(b);
}
util_hash_table_set(rws->bo_vas, (void*)(uintptr_t)bo->va, bo);
@ -594,7 +594,7 @@ static struct radeon_bo *radeon_create_bo(struct radeon_drm_winsys *rws,
else if (initial_domains & RADEON_DOMAIN_GTT)
rws->allocated_gtt += align(size, rws->size_align);
return &bo->base;
return bo;
}
bool radeon_bo_can_reclaim(struct pb_buffer *_buf)
@ -768,9 +768,9 @@ radeon_winsys_bo_create(struct radeon_winsys *rws,
usage |= 1 << (flags + 3);
if (use_reusable_pool) {
bo = pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment, usage);
bo = radeon_bo(pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment, usage));
if (bo)
return bo;
return &bo->base;
}
bo = radeon_create_bo(ws, size, alignment, usage, domain, flags);
@ -837,7 +837,7 @@ static struct pb_buffer *radeon_winsys_bo_from_ptr(struct radeon_winsys *rws,
if (ws->info.r600_virtual_address) {
struct drm_radeon_gem_va va;
bo->va = radeon_bomgr_find_va(rws, bo->base.size, 1 << 20);
bo->va = radeon_bomgr_find_va(ws, bo->base.size, 1 << 20);
va.handle = bo->handle;
va.operation = RADEON_VA_MAP;
@ -969,7 +969,7 @@ done:
if (ws->info.r600_virtual_address && !bo->va) {
struct drm_radeon_gem_va va;
bo->va = radeon_bomgr_find_va(rws, bo->base.size, 1 << 20);
bo->va = radeon_bomgr_find_va(ws, bo->base.size, 1 << 20);
va.handle = bo->handle;
va.operation = RADEON_VA_MAP;
@ -1052,6 +1052,11 @@ static boolean radeon_winsys_bo_get_handle(struct pb_buffer *buffer,
return TRUE;
}
static bool radeon_winsys_bo_is_user_ptr(struct pb_buffer *buf)
{
return ((struct radeon_bo*)buf)->user_ptr != NULL;
}
static uint64_t radeon_winsys_bo_va(struct pb_buffer *buf)
{
return ((struct radeon_bo*)buf)->va;
@ -1067,6 +1072,7 @@ void radeon_drm_bo_init_functions(struct radeon_drm_winsys *ws)
ws->base.buffer_create = radeon_winsys_bo_create;
ws->base.buffer_from_handle = radeon_winsys_bo_from_handle;
ws->base.buffer_from_ptr = radeon_winsys_bo_from_ptr;
ws->base.buffer_is_user_ptr = radeon_winsys_bo_is_user_ptr;
ws->base.buffer_get_handle = radeon_winsys_bo_get_handle;
ws->base.buffer_get_virtual_address = radeon_winsys_bo_va;
ws->base.buffer_get_initial_domain = radeon_bo_get_initial_domain;


@ -259,23 +259,23 @@ PYTHON_GEN = $(AM_V_GEN)$(PYTHON2) $(PYTHON_FLAGS)
nir/nir_builder_opcodes.h: nir/nir_opcodes.py nir/nir_builder_opcodes_h.py
$(MKDIR_GEN)
$(PYTHON_GEN) $(srcdir)/nir/nir_builder_opcodes_h.py > $@
$(PYTHON_GEN) $(srcdir)/nir/nir_builder_opcodes_h.py > $@ || ($(RM) $@; false)
nir/nir_constant_expressions.c: nir/nir_opcodes.py nir/nir_constant_expressions.py
$(MKDIR_GEN)
$(PYTHON_GEN) $(srcdir)/nir/nir_constant_expressions.py > $@
$(PYTHON_GEN) $(srcdir)/nir/nir_constant_expressions.py > $@ || ($(RM) $@; false)
nir/nir_opcodes.h: nir/nir_opcodes.py nir/nir_opcodes_h.py
$(MKDIR_GEN)
$(PYTHON_GEN) $(srcdir)/nir/nir_opcodes_h.py > $@
$(PYTHON_GEN) $(srcdir)/nir/nir_opcodes_h.py > $@ || ($(RM) $@; false)
nir/nir_opcodes.c: nir/nir_opcodes.py nir/nir_opcodes_c.py
$(MKDIR_GEN)
$(PYTHON_GEN) $(srcdir)/nir/nir_opcodes_c.py > $@
$(PYTHON_GEN) $(srcdir)/nir/nir_opcodes_c.py > $@ || ($(RM) $@; false)
nir/nir_opt_algebraic.c: nir/nir_opt_algebraic.py nir/nir_algebraic.py
$(MKDIR_GEN)
$(PYTHON_GEN) $(srcdir)/nir/nir_opt_algebraic.py > $@
$(PYTHON_GEN) $(srcdir)/nir/nir_opt_algebraic.py > $@ || ($(RM) $@; false)
nir_tests_control_flow_tests_SOURCES = \
nir/tests/control_flow_tests.cpp


@ -106,6 +106,15 @@ public:
return found;
}
virtual ir_visitor_status visit_enter(ir_expression *ir)
{
/* .length() doesn't actually read anything */
if (ir->operation == ir_unop_ssbo_unsized_array_length)
return visit_continue_with_parent;
return visit_continue;
}
private:
ir_variable *found;
};


@ -4889,12 +4889,18 @@ builtin_builder::_noise4(const glsl_type *type)
ir_function_signature *
builtin_builder::_bitfieldExtract(const glsl_type *type)
{
bool is_uint = type->base_type == GLSL_TYPE_UINT;
ir_variable *value = in_var(type, "value");
ir_variable *offset = in_var(glsl_type::int_type, "offset");
ir_variable *bits = in_var(glsl_type::int_type, "bits");
MAKE_SIG(type, gpu_shader5_or_es31, 3, value, offset, bits);
body.emit(ret(expr(ir_triop_bitfield_extract, value, offset, bits)));
operand cast_offset = is_uint ? i2u(offset) : operand(offset);
operand cast_bits = is_uint ? i2u(bits) : operand(bits);
body.emit(ret(expr(ir_triop_bitfield_extract, value,
swizzle(cast_offset, SWIZZLE_XXXX, type->vector_elements),
swizzle(cast_bits, SWIZZLE_XXXX, type->vector_elements))));
return sig;
}
@ -4902,13 +4908,19 @@ builtin_builder::_bitfieldExtract(const glsl_type *type)
ir_function_signature *
builtin_builder::_bitfieldInsert(const glsl_type *type)
{
bool is_uint = type->base_type == GLSL_TYPE_UINT;
ir_variable *base = in_var(type, "base");
ir_variable *insert = in_var(type, "insert");
ir_variable *offset = in_var(glsl_type::int_type, "offset");
ir_variable *bits = in_var(glsl_type::int_type, "bits");
MAKE_SIG(type, gpu_shader5_or_es31, 4, base, insert, offset, bits);
body.emit(ret(bitfield_insert(base, insert, offset, bits)));
operand cast_offset = is_uint ? i2u(offset) : operand(offset);
operand cast_bits = is_uint ? i2u(bits) : operand(bits);
body.emit(ret(bitfield_insert(base, insert,
swizzle(cast_offset, SWIZZLE_XXXX, type->vector_elements),
swizzle(cast_bits, SWIZZLE_XXXX, type->vector_elements))));
return sig;
}

View File

@ -431,7 +431,6 @@ ir_expression::ir_expression(int op, ir_rvalue *op0, ir_rvalue *op1)
case ir_binop_borrow:
case ir_binop_lshift:
case ir_binop_rshift:
case ir_binop_bfm:
case ir_binop_ldexp:
case ir_binop_interpolate_at_offset:
case ir_binop_interpolate_at_sample:
@ -468,7 +467,6 @@ ir_expression::ir_expression(int op, ir_rvalue *op0, ir_rvalue *op1,
this->type = op0->type;
break;
case ir_triop_bfi:
case ir_triop_csel:
this->type = op1->type;
break;
@ -602,7 +600,6 @@ static const char *const operator_strs[] = {
"max",
"pow",
"packHalf2x16_split",
"bfm",
"ubo_load",
"ldexp",
"vector_extract",
@ -611,7 +608,6 @@ static const char *const operator_strs[] = {
"fma",
"lrp",
"csel",
"bfi",
"bitfield_extract",
"vector_insert",
"bitfield_insert",

View File

@ -1550,15 +1550,6 @@ enum ir_expression_operation {
ir_binop_pack_half_2x16_split,
/*@}*/
/**
* \name First half of a lowered bitfieldInsert() operation.
*
* \see lower_instructions::bitfield_insert_to_bfm_bfi
*/
/*@{*/
ir_binop_bfm,
/*@}*/
/**
* Load a value the size of a given GLSL type from a uniform block.
*
@ -1624,15 +1615,6 @@ enum ir_expression_operation {
ir_triop_csel,
/*@}*/
/**
* \name Second half of a lowered bitfieldInsert() operation.
*
* \see lower_instructions::bitfield_insert_to_bfm_bfi
*/
/*@{*/
ir_triop_bfi,
/*@}*/
ir_triop_bitfield_extract,
/**
@ -1726,12 +1708,7 @@ public:
operation == ir_binop_dot ||
operation == ir_binop_vector_extract ||
operation == ir_triop_vector_insert ||
operation == ir_quadop_vector ||
/* TODO: these can't currently be vectorized */
operation == ir_quadop_bitfield_insert ||
operation == ir_triop_bitfield_extract ||
operation == ir_triop_bfi ||
operation == ir_binop_bfm;
operation == ir_quadop_vector;
}
/**

View File

@ -1539,10 +1539,10 @@ ir_expression::constant_expression_value(struct hash_table *variable_context)
data.i[c] = -1;
else {
int count = 0;
int top_bit = op[0]->type->base_type == GLSL_TYPE_UINT
? 0 : v & (1 << 31);
unsigned top_bit = op[0]->type->base_type == GLSL_TYPE_UINT
? 0 : v & (1u << 31);
while (((v & (1 << 31)) == top_bit) && count != 32) {
while (((v & (1u << 31)) == top_bit) && count != 32) {
count++;
v <<= 1;
}
@ -1588,10 +1588,10 @@ ir_expression::constant_expression_value(struct hash_table *variable_context)
break;
case ir_triop_bitfield_extract: {
int offset = op[1]->value.i[0];
int bits = op[2]->value.i[0];
for (unsigned c = 0; c < components; c++) {
int offset = op[1]->value.i[c];
int bits = op[2]->value.i[c];
if (bits == 0)
data.u[c] = 0;
else if (offset < 0 || bits < 0)
@ -1616,23 +1616,6 @@ ir_expression::constant_expression_value(struct hash_table *variable_context)
break;
}
case ir_binop_bfm: {
int bits = op[0]->value.i[0];
int offset = op[1]->value.i[0];
for (unsigned c = 0; c < components; c++) {
if (bits == 0)
data.u[c] = op[0]->value.u[c];
else if (offset < 0 || bits < 0)
data.u[c] = 0; /* Undefined for bitfieldInsert, per spec. */
else if (offset + bits > 32)
data.u[c] = 0; /* Undefined for bitfieldInsert, per spec. */
else
data.u[c] = ((1 << bits) - 1) << offset;
}
break;
}
case ir_binop_ldexp:
for (unsigned c = 0; c < components; c++) {
if (op[0]->type->base_type == GLSL_TYPE_DOUBLE) {
@ -1727,10 +1710,10 @@ ir_expression::constant_expression_value(struct hash_table *variable_context)
}
case ir_quadop_bitfield_insert: {
int offset = op[2]->value.i[0];
int bits = op[3]->value.i[0];
for (unsigned c = 0; c < components; c++) {
int offset = op[2]->value.i[c];
int bits = op[3]->value.i[c];
if (bits == 0)
data.u[c] = op[0]->value.u[c];
else if (offset < 0 || bits < 0)
@ -1738,7 +1721,7 @@ ir_expression::constant_expression_value(struct hash_table *variable_context)
else if (offset + bits > 32)
data.u[c] = 0; /* Undefined, per spec. */
else {
unsigned insert_mask = ((1 << bits) - 1) << offset;
unsigned insert_mask = ((1ull << bits) - 1) << offset;
unsigned insert = op[1]->value.u[c];
insert <<= offset;

View File

@ -36,13 +36,12 @@
#define LOG_TO_LOG2 0x10
#define MOD_TO_FLOOR 0x20
#define INT_DIV_TO_MUL_RCP 0x40
#define BITFIELD_INSERT_TO_BFM_BFI 0x80
#define LDEXP_TO_ARITH 0x100
#define CARRY_TO_ARITH 0x200
#define BORROW_TO_ARITH 0x400
#define SAT_TO_CLAMP 0x800
#define DOPS_TO_DFRAC 0x1000
#define DFREXP_DLDEXP_TO_ARITH 0x2000
#define LDEXP_TO_ARITH 0x80
#define CARRY_TO_ARITH 0x100
#define BORROW_TO_ARITH 0x200
#define SAT_TO_CLAMP 0x400
#define DOPS_TO_DFRAC 0x800
#define DFREXP_DLDEXP_TO_ARITH 0x1000
/**
* \see class lower_packing_builtins_visitor

View File

@ -573,12 +573,6 @@ ir_validate::visit_leave(ir_expression *ir)
assert(ir->operands[1]->type == glsl_type::float_type);
break;
case ir_binop_bfm:
assert(ir->type->is_integer());
assert(ir->operands[0]->type->is_integer());
assert(ir->operands[1]->type->is_integer());
break;
case ir_binop_ubo_load:
assert(ir->operands[0]->type == glsl_type::uint_type);
@ -637,16 +631,11 @@ ir_validate::visit_leave(ir_expression *ir)
assert(ir->type == ir->operands[2]->type);
break;
case ir_triop_bfi:
assert(ir->operands[0]->type->is_integer());
assert(ir->operands[1]->type == ir->operands[2]->type);
assert(ir->operands[1]->type == ir->type);
break;
case ir_triop_bitfield_extract:
assert(ir->type->is_integer());
assert(ir->operands[0]->type == ir->type);
assert(ir->operands[1]->type == glsl_type::int_type);
assert(ir->operands[2]->type == glsl_type::int_type);
assert(ir->operands[1]->type == ir->type);
assert(ir->operands[2]->type == ir->type);
break;
case ir_triop_vector_insert:
@ -659,10 +648,11 @@ ir_validate::visit_leave(ir_expression *ir)
break;
case ir_quadop_bitfield_insert:
assert(ir->type->is_integer());
assert(ir->operands[0]->type == ir->type);
assert(ir->operands[1]->type == ir->type);
assert(ir->operands[2]->type == glsl_type::int_type);
assert(ir->operands[3]->type == glsl_type::int_type);
assert(ir->operands[2]->type == ir->type);
assert(ir->operands[3]->type == ir->type);
break;
case ir_quadop_vector:

View File

@ -1133,6 +1133,12 @@ cross_validate_globals(struct gl_shader_program *prog,
mode_string(var), var->name);
return;
}
if (existing->data.image_format != var->data.image_format) {
linker_error(prog, "declarations for %s `%s` have "
"mismatching image format qualifiers\n",
mode_string(var), var->name);
return;
}
} else
variables.add_variable(var);
}
@ -3753,13 +3759,8 @@ build_program_resource_list(struct gl_shader_program *shProg)
if (!add_packed_varyings(shProg, input_stage, GL_PROGRAM_INPUT))
return;
/* Only when dealing with multiple stages, otherwise we would have
* duplicate gl_shader_variable entries.
*/
if (input_stage != output_stage) {
if (!add_packed_varyings(shProg, output_stage, GL_PROGRAM_OUTPUT))
return;
}
if (!add_packed_varyings(shProg, output_stage, GL_PROGRAM_OUTPUT))
return;
}
if (!add_fragdata_arrays(shProg))

View File

@ -39,7 +39,6 @@
* - MOD_TO_FLOOR
* - LDEXP_TO_ARITH
* - DFREXP_TO_ARITH
* - BITFIELD_INSERT_TO_BFM_BFI
* - CARRY_TO_ARITH
* - BORROW_TO_ARITH
* - SAT_TO_CLAMP
@ -99,14 +98,6 @@
* Converts ir_binop_ldexp, ir_unop_frexp_sig, and ir_unop_frexp_exp to
* arithmetic and bit ops for double arguments.
*
* BITFIELD_INSERT_TO_BFM_BFI:
* ---------------------------
* Breaks ir_quadop_bitfield_insert into ir_binop_bfm (bitfield mask) and
* ir_triop_bfi (bitfield insert).
*
* Many GPUs implement the bitfieldInsert() built-in from ARB_gpu_shader_5
* with a pair of instructions.
*
* CARRY_TO_ARITH:
* ---------------
* Converts ir_carry into (x + y) < x.
@ -154,7 +145,6 @@ private:
void exp_to_exp2(ir_expression *);
void pow_to_exp2(ir_expression *);
void log_to_log2(ir_expression *);
void bitfield_insert_to_bfm_bfi(ir_expression *);
void ldexp_to_arith(ir_expression *);
void dldexp_to_arith(ir_expression *);
void dfrexp_sig_to_arith(ir_expression *);
@ -347,29 +337,6 @@ lower_instructions_visitor::mod_to_floor(ir_expression *ir)
this->progress = true;
}
void
lower_instructions_visitor::bitfield_insert_to_bfm_bfi(ir_expression *ir)
{
/* Translates
* ir_quadop_bitfield_insert base insert offset bits
* into
* ir_triop_bfi (ir_binop_bfm bits offset) insert base
*/
ir_rvalue *base_expr = ir->operands[0];
ir->operation = ir_triop_bfi;
ir->operands[0] = new(ir) ir_expression(ir_binop_bfm,
ir->type->get_base_type(),
ir->operands[3],
ir->operands[2]);
/* ir->operands[1] is still the value to insert. */
ir->operands[2] = base_expr;
ir->operands[3] = NULL;
this->progress = true;
}
void
lower_instructions_visitor::ldexp_to_arith(ir_expression *ir)
{
@ -414,8 +381,8 @@ lower_instructions_visitor::ldexp_to_arith(ir_expression *ir)
ir_constant *sign_mask = new(ir) ir_constant(0x80000000u, vec_elem);
ir_constant *exp_shift = new(ir) ir_constant(23);
ir_constant *exp_width = new(ir) ir_constant(8);
ir_constant *exp_shift = new(ir) ir_constant(23, vec_elem);
ir_constant *exp_width = new(ir) ir_constant(8, vec_elem);
/* Temporary variables */
ir_variable *x = new(ir) ir_variable(ir->type, "x", ir_var_temporary);
@ -482,12 +449,6 @@ lower_instructions_visitor::ldexp_to_arith(ir_expression *ir)
exp_shift_clone, exp_width);
ir->operands[1] = NULL;
/* Don't generate new IR that would need to be lowered in an additional
* pass.
*/
if (lowering(BITFIELD_INSERT_TO_BFM_BFI))
bitfield_insert_to_bfm_bfi(ir->operands[0]->as_expression());
this->progress = true;
}
@ -509,8 +470,8 @@ lower_instructions_visitor::dldexp_to_arith(ir_expression *ir)
ir_constant *sign_mask = new(ir) ir_constant(0x80000000u);
ir_constant *exp_shift = new(ir) ir_constant(20);
ir_constant *exp_width = new(ir) ir_constant(11);
ir_constant *exp_shift = new(ir) ir_constant(20, vec_elem);
ir_constant *exp_width = new(ir) ir_constant(11, vec_elem);
ir_constant *exp_bias = new(ir) ir_constant(1022, vec_elem);
/* Temporary variables */
@ -602,9 +563,6 @@ lower_instructions_visitor::dldexp_to_arith(ir_expression *ir)
exp_shift->clone(ir, NULL),
exp_width->clone(ir, NULL));
if (lowering(BITFIELD_INSERT_TO_BFM_BFI))
bitfield_insert_to_bfm_bfi(bfi);
i.insert_before(assign(unpacked, bfi, WRITEMASK_Y));
results[elem] = expr(ir_unop_pack_double_2x32, unpacked);
@ -1039,11 +997,6 @@ lower_instructions_visitor::visit_leave(ir_expression *ir)
pow_to_exp2(ir);
break;
case ir_quadop_bitfield_insert:
if (lowering(BITFIELD_INSERT_TO_BFM_BFI))
bitfield_insert_to_bfm_bfi(ir);
break;
case ir_binop_ldexp:
if (lowering(LDEXP_TO_ARITH) && ir->type->is_float())
ldexp_to_arith(ir);

View File

@ -230,8 +230,8 @@ private:
if (op_mask & LOWER_PACK_USE_BFI) {
return bitfield_insert(bit_and(swizzle_x(u), constant(0xffffu)),
swizzle_y(u),
constant(16),
constant(16));
constant(16u),
constant(16u));
}
/* return (u.y << 16) | (u.x & 0xffff); */
@ -261,9 +261,9 @@ private:
return bitfield_insert(bitfield_insert(
bitfield_insert(
bit_and(swizzle_x(u), constant(0xffu)),
swizzle_y(u), constant(8), constant(8)),
swizzle_z(u), constant(16), constant(8)),
swizzle_w(u), constant(24), constant(8));
swizzle_y(u), constant(8u), constant(8u)),
swizzle_z(u), constant(16u), constant(8u)),
swizzle_w(u), constant(24u), constant(8u));
}
/* uvec4 u = UVEC4_RVAL & 0xff */
@ -365,11 +365,11 @@ private:
if (op_mask & LOWER_PACK_USE_BFE) {
/* u4.y = bitfield_extract(u, 8, 8); */
factory.emit(assign(u4, bitfield_extract(u, constant(8), constant(8)),
factory.emit(assign(u4, bitfield_extract(u, constant(8u), constant(8u)),
WRITEMASK_Y));
/* u4.z = bitfield_extract(u, 16, 8); */
factory.emit(assign(u4, bitfield_extract(u, constant(16), constant(8)),
factory.emit(assign(u4, bitfield_extract(u, constant(16u), constant(8u)),
WRITEMASK_Z));
} else {
/* u4.y = (u >> 8u) & 0xffu; */

View File

@ -1734,7 +1734,6 @@ nir_visitor::visit(ir_expression *ir)
case ir_binop_pack_half_2x16_split:
result = nir_pack_half_2x16_split(&b, srcs[0], srcs[1]);
break;
case ir_binop_bfm: result = nir_bfm(&b, srcs[0], srcs[1]); break;
case ir_binop_ldexp: result = nir_ldexp(&b, srcs[0], srcs[1]); break;
case ir_triop_fma:
result = nir_ffma(&b, srcs[0], srcs[1], srcs[2]);
@ -1748,9 +1747,6 @@ nir_visitor::visit(ir_expression *ir)
else
result = nir_fcsel(&b, srcs[0], srcs[1], srcs[2]);
break;
case ir_triop_bfi:
result = nir_bfi(&b, srcs[0], srcs[1], srcs[2]);
break;
case ir_triop_bitfield_extract:
result = (out_type == GLSL_TYPE_INT) ?
nir_ibitfield_extract(&b, srcs[0], srcs[1], srcs[2]) :

View File

@ -1508,6 +1508,7 @@ typedef struct nir_shader_compiler_options {
bool lower_fsat;
bool lower_fsqrt;
bool lower_fmod;
bool lower_bitfield_extract;
bool lower_bitfield_insert;
bool lower_uadd_carry;
bool lower_usub_borrow;

View File

@ -526,12 +526,15 @@ binop("fpow", tfloat, "", "powf(src0, src1)")
binop_horiz("pack_half_2x16_split", 1, tuint, 1, tfloat, 1, tfloat,
"pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")
# bfm implements the behavior of the first operation of the SM5 "bfi" assembly
# and that of the "bfi1" i965 instruction. That is, it has undefined behavior
# if either of its arguments is 32.
binop_convert("bfm", tuint, tint, "", """
int offset = src0, bits = src1;
if (offset < 0 || bits < 0 || offset + bits > 32)
dst = 0; /* undefined per the spec */
int bits = src0, offset = src1;
if (offset < 0 || bits < 0 || offset > 31 || bits > 31 || offset + bits > 32)
dst = 0; /* undefined */
else
dst = ((1 << bits)- 1) << offset;
dst = ((1u << bits) - 1) << offset;
""")
opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint], "", """
@ -570,6 +573,7 @@ triop("fcsel", tfloat, "(src0 != 0.0f) ? src1 : src2")
opcode("bcsel", 0, tuint, [0, 0, 0],
[tbool, tuint, tuint], "", "src0 ? src1 : src2")
# SM5 bfi assembly
triop("bfi", tuint, """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
@ -584,22 +588,53 @@ if (mask == 0) {
}
""")
opcode("ubitfield_extract", 0, tuint,
[0, 1, 1], [tuint, tint, tint], "", """
# SM5 ubfe/ibfe assembly
opcode("ubfe", 0, tuint,
[0, 0, 0], [tuint, tint, tint], "", """
unsigned base = src0;
int offset = src1.x, bits = src2.x;
int offset = src1, bits = src2;
if (bits == 0) {
dst = 0;
} else if (bits < 0 || offset < 0) {
dst = 0; /* undefined */
} else if (offset + bits < 32) {
dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
dst = base >> offset;
}
""")
opcode("ibfe", 0, tint,
[0, 0, 0], [tint, tint, tint], "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
dst = 0;
} else if (bits < 0 || offset < 0) {
dst = 0; /* undefined */
} else if (offset + bits < 32) {
dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
dst = base >> offset;
}
""")
# GLSL bitfieldExtract()
opcode("ubitfield_extract", 0, tuint,
[0, 0, 0], [tuint, tint, tint], "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
dst = 0; /* undefined per the spec */
} else {
dst = (base >> offset) & ((1 << bits) - 1);
dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
opcode("ibitfield_extract", 0, tint,
[0, 1, 1], [tint, tint, tint], "", """
[0, 0, 0], [tint, tint, tint], "", """
int base = src0;
int offset = src1.x, bits = src2.x;
int offset = src1, bits = src2;
if (bits == 0) {
dst = 0;
} else if (offset < 0 || bits < 0 || offset + bits > 32) {
@ -624,16 +659,16 @@ def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
[tuint, tuint, tuint, tuint],
"", const_expr)
opcode("bitfield_insert", 0, tuint, [0, 0, 1, 1],
opcode("bitfield_insert", 0, tuint, [0, 0, 0, 0],
[tuint, tuint, tint, tint], "", """
unsigned base = src0, insert = src1;
int offset = src2.x, bits = src3.x;
int offset = src2, bits = src3;
if (bits == 0) {
dst = 0;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
dst = 0;
} else {
unsigned mask = ((1 << bits) - 1) << offset;
unsigned mask = ((1ull << bits) - 1) << offset;
dst = (base & ~mask) | ((insert << bits) & mask);
}
""")

View File

@ -227,9 +227,23 @@ optimizations = [
# Misc. lowering
(('fmod', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod'),
(('frem', a, b), ('fsub', a, ('fmul', b, ('ftrunc', ('fdiv', a, b)))), 'options->lower_fmod'),
(('bitfield_insert', a, b, c, d), ('bfi', ('bfm', d, c), b, a), 'options->lower_bitfield_insert'),
(('uadd_carry', a, b), ('b2i', ('ult', ('iadd', a, b), a)), 'options->lower_uadd_carry'),
(('usub_borrow', a, b), ('b2i', ('ult', a, b)), 'options->lower_usub_borrow'),
(('bitfield_insert', 'base', 'insert', 'offset', 'bits'),
('bcsel', ('ilt', 31, 'bits'), 'insert',
('bfi', ('bfm', 'bits', 'offset'), 'insert', 'base')),
'options->lower_bitfield_insert'),
(('ibitfield_extract', 'value', 'offset', 'bits'),
('bcsel', ('ilt', 31, 'bits'), 'value',
('ibfe', 'value', 'offset', 'bits')),
'options->lower_bitfield_extract'),
(('ubitfield_extract', 'value', 'offset', 'bits'),
('bcsel', ('ult', 31, 'bits'), 'value',
('ubfe', 'value', 'offset', 'bits')),
'options->lower_bitfield_extract'),
]
# Add optimizations to handle the case where the result of a ternary is

View File

@ -453,7 +453,7 @@ _mesa_meta_in_progress(struct gl_context *ctx)
}
extern void
_mesa_meta_fb_tex_blit_begin(const struct gl_context *ctx,
_mesa_meta_fb_tex_blit_begin(struct gl_context *ctx,
struct fb_tex_blit_state *blit);
extern void

View File

@ -807,7 +807,7 @@ blitframebuffer_texture(struct gl_context *ctx,
}
void
_mesa_meta_fb_tex_blit_begin(const struct gl_context *ctx,
_mesa_meta_fb_tex_blit_begin(struct gl_context *ctx,
struct fb_tex_blit_state *blit)
{
/* None of the existing callers preinitialize fb_tex_blit_state to zeros,

View File

@ -812,7 +812,7 @@ brwCreateContext(gl_api api,
brw->needs_unlit_centroid_workaround =
devinfo->needs_unlit_centroid_workaround;
brw->must_use_separate_stencil = screen->hw_must_use_separate_stencil;
brw->must_use_separate_stencil = devinfo->must_use_separate_stencil;
brw->has_swizzling = screen->hw_has_swizzling;
brw->vs.base.stage = MESA_SHADER_VERTEX;

View File

@ -143,7 +143,7 @@ ir_channel_expressions_visitor::visit_leave(ir_assignment *ir)
ir_expression *expr = ir->rhs->as_expression();
bool found_vector = false;
unsigned int i, vector_elements = 1;
ir_variable *op_var[3];
ir_variable *op_var[4];
if (!expr)
return visit_continue;
@ -345,20 +345,6 @@ ir_channel_expressions_visitor::visit_leave(ir_assignment *ir)
case ir_unop_noise:
unreachable("noise should have been broken down to function call");
case ir_binop_bfm: {
/* Does not need to be scalarized, since its result will be identical
* for all channels.
*/
ir_rvalue *op0 = get_element(op_var[0], 0);
ir_rvalue *op1 = get_element(op_var[1], 0);
assign(ir, 0, new(mem_ctx) ir_expression(expr->operation,
element_type,
op0,
op1));
break;
}
case ir_binop_ubo_load:
case ir_unop_get_buffer_size:
unreachable("not yet supported");
@ -380,22 +366,21 @@ ir_channel_expressions_visitor::visit_leave(ir_assignment *ir)
}
break;
case ir_triop_bfi: {
/* Only a single BFM is needed for multiple BFIs. */
ir_rvalue *op0 = get_element(op_var[0], 0);
case ir_quadop_bitfield_insert:
for (i = 0; i < vector_elements; i++) {
ir_rvalue *op0 = get_element(op_var[0], i);
ir_rvalue *op1 = get_element(op_var[1], i);
ir_rvalue *op2 = get_element(op_var[2], i);
ir_rvalue *op3 = get_element(op_var[3], i);
assign(ir, i, new(mem_ctx) ir_expression(expr->operation,
element_type,
op0->clone(mem_ctx, NULL),
op0,
op1,
op2));
op2,
op3));
}
break;
}
case ir_unop_pack_snorm_2x16:
case ir_unop_pack_snorm_4x8:
@ -410,7 +395,6 @@ ir_channel_expressions_visitor::visit_leave(ir_assignment *ir)
case ir_binop_ldexp:
case ir_binop_vector_extract:
case ir_triop_vector_insert:
case ir_quadop_bitfield_insert:
case ir_quadop_vector:
case ir_unop_ssbo_unsized_array_length:
unreachable("should have been lowered");

View File

@ -39,6 +39,8 @@
using namespace brw;
static const bool debug = false;
/* Returns whether an instruction could co-issue if its immediate source were
* replaced with a GRF source.
*/
@ -265,7 +267,6 @@ fs_visitor::opt_combine_constants()
if (cfg->num_blocks != 1)
qsort(table.imm, table.len, sizeof(struct imm), compare);
/* Insert MOVs to load the constant values into GRFs. */
fs_reg reg(VGRF, alloc.allocate(dispatch_width / 8));
reg.stride = 0;
@ -299,7 +300,26 @@ fs_visitor::opt_combine_constants()
reg->subreg_offset = table.imm[i].subreg_offset;
reg->stride = 0;
reg->negate = signbit(reg->f) != signbit(table.imm[i].val);
assert(fabsf(reg->f) == table.imm[i].val);
assert((isnan(reg->f) && isnan(table.imm[i].val)) ||
fabsf(reg->f) == table.imm[i].val);
}
}
if (debug) {
for (int i = 0; i < table.len; i++) {
struct imm *imm = &table.imm[i];
printf("%.3fF - block %3d, reg %3d sub %2d, Uses: (%2d, %2d), "
"IP: %4d to %4d, length %4d\n",
imm->val,
imm->block->num,
imm->nr,
imm->subreg_offset,
imm->must_promote,
imm->uses_by_coissue,
imm->first_use_ip,
imm->last_use_ip,
imm->last_use_ip - imm->first_use_ip);
}
}

View File

@ -2313,7 +2313,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
compiler->shader_debug_log(log_data,
"%s SIMD%d shader: %d inst, %d loops, %u cycles, "
"%d:%d spills:fills, Promoted %u constants, "
"compacted %d to %d bytes.\n",
"compacted %d to %d bytes.",
stage_abbrev, dispatch_width, before_size / 16,
loop_count, cfg->cycle_count, spill_count,
fill_count, promoted_constants, before_size,

View File

@ -1072,6 +1072,9 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
case nir_op_ubitfield_extract:
case nir_op_ibitfield_extract:
unreachable("should have been lowered");
case nir_op_ubfe:
case nir_op_ibfe:
bld.BFE(result, op[2], op[1], op[0]);
break;
case nir_op_bfm:
@ -1082,8 +1085,7 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
break;
case nir_op_bitfield_insert:
unreachable("not reached: should be handled by "
"lower_instructions::bitfield_insert_to_bfm_bfi");
unreachable("not reached: should have been lowered");
case nir_op_ishl:
bld.SHL(result, op[0], op[1]);

View File

@ -126,14 +126,12 @@ process_glsl_ir(gl_shader_stage stage,
*/
brw_lower_packing_builtins(brw, shader->Stage, shader->ir);
do_mat_op_to_vec(shader->ir);
const int bitfield_insert = brw->gen >= 7 ? BITFIELD_INSERT_TO_BFM_BFI : 0;
lower_instructions(shader->ir,
MOD_TO_FLOOR |
DIV_TO_MUL_RCP |
SUB_TO_ADD_NEG |
EXP_TO_EXP2 |
LOG_TO_LOG2 |
bitfield_insert |
LDEXP_TO_ARITH |
CARRY_TO_ARITH |
BORROW_TO_ARITH);

View File

@ -106,6 +106,7 @@ brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo)
nir_options->lower_fdiv = true;
nir_options->lower_scmp = true;
nir_options->lower_fmod = true;
nir_options->lower_bitfield_extract = true;
nir_options->lower_bitfield_insert = true;
nir_options->lower_uadd_carry = true;
nir_options->lower_usub_borrow = true;
@ -1023,6 +1024,7 @@ backend_instruction::has_side_effects() const
case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
case FS_OPCODE_FB_WRITE:
case SHADER_OPCODE_BARRIER:
case TCS_OPCODE_URB_WRITE:
case TCS_OPCODE_RELEASE_INPUT:
return true;
default:

View File

@ -723,6 +723,34 @@ brw_init_surface_formats(struct brw_context *brw)
if (brw->gen >= 8)
ctx->TextureFormatSupported[MESA_FORMAT_Z_UNORM16] = true;
/* The RGBX formats are not renderable. Normally these get mapped
* internally to RGBA formats when rendering. However, on Gen9+, when this
* internal override is used, fast clears don't work, so they are disabled in
* brw_meta_fast_clear. To avoid this problem we can just pretend not to
* support RGBX formats at all. This will cause the upper layers of Mesa to
* pick the RGBA formats instead. This works fine because when it is used
* as a texture source the swizzle state is programmed to force the alpha
* channel to 1.0 anyway. We could also do this for all gens except that
* it's a bit more difficult when the hardware doesn't support texture
* swizzling. Gens using the blorp have further problems because that
* doesn't implement this swizzle override. We don't need to do this for
* BGRX because that actually is supported natively on Gen8+.
*/
if (brw->gen >= 9) {
static const mesa_format rgbx_formats[] = {
MESA_FORMAT_R8G8B8X8_UNORM,
MESA_FORMAT_R8G8B8X8_SRGB,
MESA_FORMAT_RGBX_UNORM16,
MESA_FORMAT_RGBX_FLOAT16,
MESA_FORMAT_RGBX_FLOAT32
};
for (int i = 0; i < ARRAY_SIZE(rgbx_formats); i++) {
ctx->TextureFormatSupported[rgbx_formats[i]] = false;
brw->format_supported_as_render_target[rgbx_formats[i]] = false;
}
}
/* On hardware that lacks support for ETC1, we map ETC1 to RGBX
* during glCompressedTexImage2D(). See intel_mipmap_tree::wraps_etc1.
*/

View File

@ -1991,7 +1991,7 @@ generate_code(struct brw_codegen *p,
compiler->shader_debug_log(log_data,
"%s vec4 shader: %d inst, %d loops, %u cycles, "
"compacted %d to %d bytes.\n",
"compacted %d to %d bytes.",
stage_abbrev, before_size / 16,
loop_count, cfg->cycle_count,
before_size, after_size);

View File

@ -1427,6 +1427,9 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
case nir_op_ubitfield_extract:
case nir_op_ibitfield_extract:
unreachable("should have been lowered");
case nir_op_ubfe:
case nir_op_ibfe:
op[0] = fix_3src_operand(op[0]);
op[1] = fix_3src_operand(op[1]);
op[2] = fix_3src_operand(op[2]);
@ -1447,8 +1450,7 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
break;
case nir_op_bitfield_insert:
unreachable("not reached: should be handled by "
"lower_instructions::bitfield_insert_to_bfm_bfi");
unreachable("not reached: should have been lowered");
case nir_op_fsign:
/* AND(val, 0x80000000) gives the sign bit.

View File

@ -1449,8 +1449,6 @@ __DRIconfig **intelInitScreen2(__DRIscreen *psp)
if (INTEL_DEBUG & DEBUG_AUB)
drm_intel_bufmgr_gem_set_aub_dump(intelScreen->bufmgr, true);
intelScreen->hw_must_use_separate_stencil = intelScreen->devinfo->gen >= 7;
intelScreen->hw_has_swizzling = intel_detect_swizzling(intelScreen);
intelScreen->hw_has_timestamp = intel_detect_timestamp(intelScreen);

View File

@ -46,8 +46,6 @@ struct intel_screen
bool no_hw;
bool hw_must_use_separate_stencil;
bool hw_has_swizzling;
int hw_has_timestamp;

View File

@ -3898,8 +3898,14 @@ _mesa_InvalidateBufferSubData(GLuint buffer, GLintptr offset,
struct gl_buffer_object *bufObj;
const GLintptr end = offset + length;
/* Section 6.5 (Invalidating Buffer Data) of the OpenGL 4.5 (Compatibility
* Profile) spec says:
*
* "An INVALID_VALUE error is generated if buffer is zero or is not the
* name of an existing buffer object."
*/
bufObj = _mesa_lookup_bufferobj(ctx, buffer);
if (!bufObj) {
if (!bufObj || bufObj == &DummyBufferObject) {
_mesa_error(ctx, GL_INVALID_VALUE,
"glInvalidateBufferSubData(name = 0x%x) invalid object",
buffer);
@ -3912,7 +3918,7 @@ _mesa_InvalidateBufferSubData(GLuint buffer, GLintptr offset,
* negative, or if <offset> + <length> is greater than the value of
* BUFFER_SIZE."
*/
if (end < 0 || end > bufObj->Size) {
if (offset < 0 || length < 0 || end > bufObj->Size) {
_mesa_error(ctx, GL_INVALID_VALUE,
"glInvalidateBufferSubData(invalid offset or length)");
return;
@ -3933,10 +3939,8 @@ _mesa_InvalidateBufferSubData(GLuint buffer, GLintptr offset,
return;
}
/* We don't actually do anything for this yet. Just return after
* validating the parameters and generating the required errors.
*/
return;
if (ctx->Driver.InvalidateBufferSubData)
ctx->Driver.InvalidateBufferSubData(ctx, bufObj, offset, length);
}
void GLAPIENTRY
@ -3945,8 +3949,14 @@ _mesa_InvalidateBufferData(GLuint buffer)
GET_CURRENT_CONTEXT(ctx);
struct gl_buffer_object *bufObj;
/* Section 6.5 (Invalidating Buffer Data) of the OpenGL 4.5 (Compatibility
* Profile) spec says:
*
* "An INVALID_VALUE error is generated if buffer is zero or is not the
* name of an existing buffer object."
*/
bufObj = _mesa_lookup_bufferobj(ctx, buffer);
if (!bufObj) {
if (!bufObj || bufObj == &DummyBufferObject) {
_mesa_error(ctx, GL_INVALID_VALUE,
"glInvalidateBufferData(name = 0x%x) invalid object",
buffer);
@ -3967,8 +3977,6 @@ _mesa_InvalidateBufferData(GLuint buffer)
return;
}
/* We don't actually do anything for this yet. Just return after
* validating the parameters and generating the required errors.
*/
return;
if (ctx->Driver.InvalidateBufferSubData)
ctx->Driver.InvalidateBufferSubData(ctx, bufObj, 0, bufObj->Size);
}

View File

@ -634,6 +634,11 @@ struct dd_function_table {
GLintptr readOffset, GLintptr writeOffset,
GLsizeiptr size );
void (*InvalidateBufferSubData)( struct gl_context *ctx,
struct gl_buffer_object *obj,
GLintptr offset,
GLsizeiptr length );
/* Returns pointer to the start of the mapped range.
* May return NULL if MESA_MAP_NOWAIT_BIT is set in access:
*/

View File

@ -1500,6 +1500,13 @@ _mesa_validate_pipeline_io(struct gl_pipeline_object *pipeline)
for (idx = prev + 1; idx < ARRAY_SIZE(pipeline->CurrentProgram); idx++) {
if (shProg[idx]) {
/* The pipeline might include both non-compute and compute programs; do
* not attempt to validate varyings between a non-compute stage and the
* compute stage.
*/
if (shProg[idx]->_LinkedShaders[idx]->Stage == MESA_SHADER_COMPUTE)
break;
if (!validate_io(shProg[prev]->_LinkedShaders[prev],
shProg[idx]->_LinkedShaders[idx],
shProg[prev]->IsES || shProg[idx]->IsES))

View File

@ -300,7 +300,8 @@ create_shader(struct gl_context *ctx, GLenum type)
GLuint name;
if (!_mesa_validate_shader_target(ctx, type)) {
_mesa_error(ctx, GL_INVALID_ENUM, "CreateShader(type)");
_mesa_error(ctx, GL_INVALID_ENUM, "CreateShader(%s)",
_mesa_enum_to_string(type));
return 0;
}

View File

@ -835,7 +835,7 @@ _mesa_test_texobj_completeness( const struct gl_context *ctx,
incomplete(t, MIPMAP, "TexImage[%d] is missing", i);
return;
}
if (img->TexFormat != baseImage->TexFormat) {
if (img->InternalFormat != baseImage->InternalFormat) {
incomplete(t, MIPMAP, "Format[i] != Format[baseLevel]");
return;
}

View File

@ -1304,9 +1304,7 @@ ir_to_mesa_visitor::visit(ir_expression *ir)
break;
case ir_binop_vector_extract:
case ir_binop_bfm:
case ir_triop_fma:
case ir_triop_bfi:
case ir_triop_bitfield_extract:
case ir_triop_vector_insert:
case ir_quadop_bitfield_insert:

View File

@@ -182,25 +182,31 @@ st_bufferobj_data(struct gl_context *ctx,
 {
    struct st_context *st = st_context(ctx);
    struct pipe_context *pipe = st->pipe;
+   struct pipe_screen *screen = pipe->screen;
    struct st_buffer_object *st_obj = st_buffer_object(obj);
    unsigned bind, pipe_usage, pipe_flags = 0;
    if (target != GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD &&
-       size && data && st_obj->buffer &&
+       size && st_obj->buffer &&
        st_obj->Base.Size == size &&
        st_obj->Base.Usage == usage &&
        st_obj->Base.StorageFlags == storageFlags) {
-      /* Just discard the old contents and write new data.
-       * This should be the same as creating a new buffer, but we avoid
-       * a lot of validation in Mesa.
-       */
-      struct pipe_box box;
+      if (data) {
+         /* Just discard the old contents and write new data.
+          * This should be the same as creating a new buffer, but we avoid
+          * a lot of validation in Mesa.
+          */
+         struct pipe_box box;
-      u_box_1d(0, size, &box);
-      pipe->transfer_inline_write(pipe, st_obj->buffer, 0,
-                                  PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE,
-                                  &box, data, 0, 0);
-      return GL_TRUE;
+         u_box_1d(0, size, &box);
+         pipe->transfer_inline_write(pipe, st_obj->buffer, 0,
+                                     PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE,
+                                     &box, data, 0, 0);
+         return GL_TRUE;
+      } else if (screen->get_param(screen, PIPE_CAP_INVALIDATE_BUFFER)) {
+         pipe->invalidate_resource(pipe, st_obj->buffer);
+         return GL_TRUE;
+      }
    }
    st_obj->Base.Size = size;
@@ -288,7 +294,6 @@ st_bufferobj_data(struct gl_context *ctx,
    }
    if (size != 0) {
-      struct pipe_screen *screen = pipe->screen;
       struct pipe_resource buffer;
       memset(&buffer, 0, sizeof buffer);
@@ -327,6 +332,31 @@ st_bufferobj_data(struct gl_context *ctx,
 }
+
+/**
+ * Called via glInvalidateBuffer(Sub)Data.
+ */
+static void
+st_bufferobj_invalidate(struct gl_context *ctx,
+                        struct gl_buffer_object *obj,
+                        GLintptr offset,
+                        GLsizeiptr size)
+{
+   struct st_context *st = st_context(ctx);
+   struct pipe_context *pipe = st->pipe;
+   struct st_buffer_object *st_obj = st_buffer_object(obj);
+
+   /* We ignore partial invalidates. */
+   if (offset != 0 || size != obj->Size)
+      return;
+
+   /* Nothing to invalidate. */
+   if (!st_obj->buffer)
+      return;
+
+   pipe->invalidate_resource(pipe, st_obj->buffer);
+}
+
 /**
  * Called via glMapBufferRange().
  */
@@ -512,7 +542,8 @@ st_bufferobj_validate_usage(struct st_context *st,
 void
-st_init_bufferobject_functions(struct dd_function_table *functions)
+st_init_bufferobject_functions(struct pipe_screen *screen,
+                               struct dd_function_table *functions)
 {
    /* plug in default driver fallbacks (such as for ClearBufferSubData) */
    _mesa_init_buffer_object_functions(functions);
@@ -527,4 +558,7 @@ st_init_bufferobject_functions(struct dd_function_table *functions)
    functions->UnmapBuffer = st_bufferobj_unmap;
    functions->CopyBufferSubData = st_copy_buffer_subdata;
    functions->ClearBufferSubData = st_clear_buffer_subdata;
+
+   if (screen->get_param(screen, PIPE_CAP_INVALIDATE_BUFFER))
+      functions->InvalidateBufferSubData = st_bufferobj_invalidate;
 }


@@ -33,6 +33,7 @@
 struct dd_function_table;
 struct pipe_resource;
+struct pipe_screen;
 struct st_context;
/**
@@ -62,7 +63,8 @@ st_bufferobj_validate_usage(struct st_context *st,
 extern void
-st_init_bufferobject_functions(struct dd_function_table *functions);
+st_init_bufferobject_functions(struct pipe_screen *screen,
+                               struct dd_function_table *functions);
 
 #endif


@@ -1302,6 +1302,7 @@ blit_copy_pixels(struct gl_context *ctx, GLint srcx, GLint srcy,
        ctx->_ImageTransferState == 0x0 &&
        !ctx->Color.BlendEnabled &&
        !ctx->Color.AlphaEnabled &&
+       (!ctx->Color.ColorLogicOpEnabled || ctx->Color.LogicOp == GL_COPY) &&
        !ctx->Depth.Test &&
        !ctx->Fog.Enabled &&
        !ctx->Stencil.Enabled &&


@@ -446,7 +446,7 @@ void st_init_driver_functions(struct pipe_screen *screen,
    _mesa_init_sampler_object_functions(functions);
    st_init_blit_functions(functions);
-   st_init_bufferobject_functions(functions);
+   st_init_bufferobject_functions(screen, functions);
    st_init_clear_functions(functions);
    st_init_bitmap_functions(functions);
    st_init_copy_image_functions(functions);


@@ -59,7 +59,7 @@ st_copy_framebuffer_to_texture(GLenum srcBuffer,
    _mesa_GetIntegerv(GL_READ_BUFFER, &readBufSave);
    /* Read from the winsys buffer */
-   _mesa_BindFramebuffer(GL_READ_BUFFER, 0);
+   _mesa_BindFramebuffer(GL_READ_FRAMEBUFFER, 0);
    _mesa_ReadBuffer(srcBuffer);
    /* copy image from pbuffer to texture */
@@ -136,5 +136,5 @@ st_copy_framebuffer_to_texture(GLenum srcBuffer,
    /* restore readbuffer */
    _mesa_ReadBuffer(readBufSave);
-   _mesa_BindFramebuffer(GL_READ_BUFFER, readFBOSave);
+   _mesa_BindFramebuffer(GL_READ_FRAMEBUFFER, readFBOSave);
 }


@@ -80,6 +80,7 @@ st_generate_mipmap(struct gl_context *ctx, GLenum target,
    struct st_texture_object *stObj = st_texture_object(texObj);
    struct pipe_resource *pt = st_get_texobj_resource(texObj);
    const uint baseLevel = texObj->BaseLevel;
+   enum pipe_format format;
    uint lastLevel, first_layer, last_layer;
    uint dstLevel;
@@ -149,12 +150,24 @@ st_generate_mipmap(struct gl_context *ctx, GLenum target,
       last_layer = util_max_layer(pt, baseLevel);
    }
-   /* Try to generate the mipmap by rendering/texturing. If that fails,
-    * use the software fallback.
+   if (stObj->surface_based)
+      format = stObj->surface_format;
+   else
+      format = pt->format;
+
+   /* First see if the driver supports hardware mipmap generation,
+    * if not then generate the mipmap by rendering/texturing.
+    * If that fails, use the software fallback.
     */
-   if (!util_gen_mipmap(st->pipe, pt, pt->format, baseLevel, lastLevel,
-                        first_layer, last_layer, PIPE_TEX_FILTER_LINEAR)) {
-      _mesa_generate_mipmap(ctx, target, texObj);
+   if (!st->pipe->screen->get_param(st->pipe->screen,
+                                    PIPE_CAP_GENERATE_MIPMAP) ||
+       !st->pipe->generate_mipmap(st->pipe, pt, format, baseLevel,
+                                  lastLevel, first_layer, last_layer)) {
+      if (!util_gen_mipmap(st->pipe, pt, format, baseLevel, lastLevel,
+                           first_layer, last_layer, PIPE_TEX_FILTER_LINEAR)) {
+         _mesa_generate_mipmap(ctx, target, texObj);
+      }
    }
 
    /* Fill in the Mesa gl_texture_image fields */


@@ -2183,8 +2183,6 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
    case ir_unop_unpack_unorm_4x8:
    case ir_binop_pack_half_2x16_split:
-   case ir_binop_bfm:
-   case ir_triop_bfi:
    case ir_quadop_vector:
    case ir_binop_vector_extract:
    case ir_triop_vector_insert: