Merge remote-tracking branch 'mesa-public/master' into vulkan

This commit is contained in:
Jason Ekstrand 2015-11-03 15:45:04 -08:00
commit b00e3f221b
340 changed files with 16562 additions and 3672 deletions

View File

@ -81,7 +81,7 @@ PRESENTPROTO_REQUIRED=1.0
LIBUDEV_REQUIRED=151
GLPROTO_REQUIRED=1.4.14
LIBOMXIL_BELLAGIO_REQUIRED=0.0
LIBVA_REQUIRED=0.35.0
LIBVA_REQUIRED=0.38.0
VDPAU_REQUIRED=1.1
WAYLAND_REQUIRED=1.2.0
XCB_REQUIRED=1.9.3
@ -867,7 +867,7 @@ GALLIUM_DRIVERS_DEFAULT="r300,r600,svga,swrast"
AC_ARG_WITH([gallium-drivers],
[AS_HELP_STRING([--with-gallium-drivers@<:@=DIRS...@:>@],
[comma delimited Gallium drivers list, e.g.
"i915,ilo,nouveau,r300,r600,radeonsi,freedreno,svga,swrast,vc4"
"i915,ilo,nouveau,r300,r600,radeonsi,freedreno,svga,swrast,vc4,virgl"
@<:@default=r300,r600,svga,swrast@:>@])],
[with_gallium_drivers="$withval"],
[with_gallium_drivers="$GALLIUM_DRIVERS_DEFAULT"])
@ -2188,6 +2188,12 @@ if test -n "$with_gallium_drivers"; then
PKG_CHECK_MODULES([SIMPENROSE], [simpenrose],
[USE_VC4_SIMULATOR=yes], [USE_VC4_SIMULATOR=no])
;;
xvirgl)
HAVE_GALLIUM_VIRGL=yes
gallium_require_drm "virgl"
gallium_require_drm_loader
require_egl_drm "virgl"
;;
*)
AC_MSG_ERROR([Unknown Gallium driver: $driver])
;;
@ -2259,6 +2265,7 @@ AM_CONDITIONAL(HAVE_GALLIUM_FREEDRENO, test "x$HAVE_GALLIUM_FREEDRENO" = xyes)
AM_CONDITIONAL(HAVE_GALLIUM_SOFTPIPE, test "x$HAVE_GALLIUM_SOFTPIPE" = xyes)
AM_CONDITIONAL(HAVE_GALLIUM_LLVMPIPE, test "x$HAVE_GALLIUM_LLVMPIPE" = xyes)
AM_CONDITIONAL(HAVE_GALLIUM_VC4, test "x$HAVE_GALLIUM_VC4" = xyes)
AM_CONDITIONAL(HAVE_GALLIUM_VIRGL, test "x$HAVE_GALLIUM_VIRGL" = xyes)
AM_CONDITIONAL(HAVE_GALLIUM_STATIC_TARGETS, test "x$enable_shared_pipe_drivers" = xno)
@ -2386,6 +2393,7 @@ AC_CONFIG_FILES([Makefile
src/gallium/drivers/svga/Makefile
src/gallium/drivers/trace/Makefile
src/gallium/drivers/vc4/Makefile
src/gallium/drivers/virgl/Makefile
src/gallium/state_trackers/clover/Makefile
src/gallium/state_trackers/dri/Makefile
src/gallium/state_trackers/glx/xlib/Makefile
@ -2426,6 +2434,8 @@ AC_CONFIG_FILES([Makefile
src/gallium/winsys/sw/wrapper/Makefile
src/gallium/winsys/sw/xlib/Makefile
src/gallium/winsys/vc4/drm/Makefile
src/gallium/winsys/virgl/drm/Makefile
src/gallium/winsys/virgl/vtest/Makefile
src/gbm/Makefile
src/gbm/main/gbm.pc
src/glsl/Makefile

View File

@ -153,10 +153,10 @@ GL 4.3, GLSL 4.30:
GL_ARB_ES3_compatibility DONE (all drivers that support GLSL 3.30)
GL_ARB_clear_buffer_object DONE (all drivers)
GL_ARB_compute_shader in progress (jljusten)
GL_ARB_copy_image DONE (i965) (gallium - in progress, VMware)
GL_ARB_copy_image DONE (i965, nv50, nvc0, radeonsi)
GL_KHR_debug DONE (all drivers)
GL_ARB_explicit_uniform_location DONE (all drivers that support GLSL)
GL_ARB_fragment_layer_viewport DONE (nv50, nvc0, r600, radeonsi, llvmpipe)
GL_ARB_fragment_layer_viewport DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe)
GL_ARB_framebuffer_no_attachments DONE (i965)
GL_ARB_internalformat_query2 not started
GL_ARB_invalidate_subdata DONE (all drivers)
@ -243,7 +243,7 @@ GLES3.2, GLSL ES 3.2
GL_KHR_texture_compression_astc_ldr DONE (i965/gen9+)
GL_OES_copy_image not started (based on GL_ARB_copy_image, which is done for some drivers)
GL_OES_draw_buffers_indexed not started
GL_OES_draw_elements_base_vertex not started (based on GL_ARB_draw_elements_base_vertex, which is done for all drivers)
GL_OES_draw_elements_base_vertex DONE (all drivers)
GL_OES_geometry_shader not started (based on GL_ARB_geometry_shader4, which is done for all drivers)
GL_OES_gpu_shader5 not started (based on parts of GL_ARB_gpu_shader5, which is done for some drivers)
GL_OES_primitive_bounding_box not started

View File

@ -16,6 +16,12 @@
<h1>News</h1>
<h2>October 24, 2015</h2>
<p>
<a href="relnotes/11.0.4.html">Mesa 11.0.4</a> is released.
This is a bug-fix release.
</p>
<h2>October 10, 2015</h2>
<p>
<a href="relnotes/11.0.3.html">Mesa 11.0.3</a> is released.
@ -28,7 +34,7 @@ This is a bug-fix release.
This is a bug-fix release.
<br>
NOTE: It is anticipated that 10.6.9 will be the final release in the 10.6
series. Users of 10.5 are encouraged to migrate to the 11.0 series in order
series. Users of 10.6 are encouraged to migrate to the 11.0 series in order
to obtain future fixes.
</p>

View File

@ -21,6 +21,7 @@ The release notes summarize what's new or changed in each Mesa release.
</p>
<ul>
<li><a href="relnotes/11.0.4.html">11.0.4 release notes</a>
<li><a href="relnotes/11.0.3.html">11.0.3 release notes</a>
<li><a href="relnotes/10.6.9.html">10.6.9 release notes</a>
<li><a href="relnotes/11.0.2.html">11.0.2 release notes</a>

168
docs/relnotes/11.0.4.html Normal file
View File

@ -0,0 +1,168 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html lang="en">
<head>
<meta http-equiv="content-type" content="text/html; charset=utf-8">
<title>Mesa Release Notes</title>
<link rel="stylesheet" type="text/css" href="../mesa.css">
</head>
<body>
<div class="header">
<h1>The Mesa 3D Graphics Library</h1>
</div>
<iframe src="../contents.html"></iframe>
<div class="content">
<h1>Mesa 11.0.4 Release Notes / October 24, 2015</h1>
<p>
Mesa 11.0.4 is a bug fix release which fixes bugs found since the 11.0.3 release.
</p>
<p>
Mesa 11.0.4 implements the OpenGL 4.1 API, but the version reported by
glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
Some drivers don't support all the features required in OpenGL 4.1. OpenGL
4.1 is <strong>only</strong> available if requested at context creation
because compatibility contexts are not supported.
</p>
<h2>SHA256 checksums</h2>
<pre>
ed412ca6a46d1bd055120e5c12806c15419ae8c4dd6d3f6ea20a83091d5c78bf mesa-11.0.4.tar.gz
40201bf7fc6fa12a6d9edfe870b41eb4dd6669154e3c42c48a96f70805f5483d mesa-11.0.4.tar.xz
</pre>
<h2>New features</h2>
<p>None</p>
<h2>Bug fixes</h2>
<p>This list is likely incomplete.</p>
<ul>
<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=86281">Bug 86281</a> - brw_meta_fast_clear (brw=brw&#64;entry=0x7fffd4097a08, fb=fb&#64;entry=0x7fffd40fa900, buffers=buffers&#64;entry=2, partial_clear=partial_clear&#64;entry=false)</li>
<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=86720">Bug 86720</a> - [radeon] Europa Universalis 4 freezing during game start (10.3.3+, still broken on 11.0.2)</li>
<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91788">Bug 91788</a> - [HSW Regression] Synmark2_v6 Multithread performance case FPS reduced by 36%</li>
<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92304">Bug 92304</a> - [cts] cts.shaders.negative conformance tests fail</li>
</ul>
<h2>Changes</h2>
<p>Alejandro Piñeiro (2):</p>
<ul>
<li>i965/vec4: check writemask when bailing out at register coalesce</li>
<li>i965/vec4: fill src_reg type using the constructor type parameter</li>
</ul>
<p>Brian Paul (2):</p>
<ul>
<li>vbo: fix incorrect switch statement in init_mat_currval()</li>
<li>mesa: fix incorrect opcode in save_BlendFunci()</li>
</ul>
<p>Chih-Wei Huang (3):</p>
<ul>
<li>mesa: android: Fix the incorrect path of sse_minmax.c</li>
<li>nv50/ir: use C++11 standard std::unordered_map if possible</li>
<li>nv30: include the header of ffs prototype</li>
</ul>
<p>Chris Wilson (1):</p>
<ul>
<li>i965: Remove early release of DRI2 miptree</li>
</ul>
<p>Dave Airlie (1):</p>
<ul>
<li>mesa/uniforms: fix get_uniform for doubles (v2)</li>
</ul>
<p>Emil Velikov (1):</p>
<ul>
<li>docs: add sha256 checksums for 11.0.3</li>
</ul>
<p>Francisco Jerez (5):</p>
<ul>
<li>i965: Don't tell the hardware about our UAV access.</li>
<li>mesa: Expose function to calculate whether a shader image unit is valid.</li>
<li>mesa: Skip redundant texture completeness checking during image validation.</li>
<li>i965: Use _mesa_is_image_unit_valid() instead of gl_image_unit::_Valid.</li>
<li>mesa: Get rid of texture-dependent image unit derived state.</li>
</ul>
<p>Ian Romanick (8):</p>
<ul>
<li>glsl: Allow built-in functions as constant expressions in OpenGL ES 1.00</li>
<li>ff_fragment_shader: Use binding to set the sampler unit</li>
<li>glsl/linker: Use constant_initializer instead of constant_value to initialize uniforms</li>
<li>glsl: Use constant_initializer instead of constant_value to determine whether to keep an unused uniform</li>
<li>glsl: Only set ir_variable::constant_value for const-decorated variables</li>
<li>glsl: Restrict initializers for global variables to constant expression in ES</li>
<li>glsl: Add method to determine whether an expression contains the sequence operator</li>
<li>glsl: In later GLSL versions, sequence operator cannot be a constant expression</li>
</ul>
<p>Ilia Mirkin (1):</p>
<ul>
<li>nouveau: make sure there's always room to emit a fence</li>
</ul>
<p>Indrajit Das (1):</p>
<ul>
<li>st/va: Used correct parameter to derive the value of the "h" variable in vlVaCreateImage</li>
</ul>
<p>Jonathan Gray (1):</p>
<ul>
<li>configure.ac: ensure RM is set</li>
</ul>
<p>Krzysztof Sobiecki (1):</p>
<ul>
<li>st/fbo: use pipe_surface_release instead of pipe_surface_reference</li>
</ul>
<p>Leo Liu (1):</p>
<ul>
<li>st/omx/dec/h264: fix field picture type 0 poc disorder</li>
</ul>
<p>Marek Olšák (3):</p>
<ul>
<li>st/mesa: fix clip state dependencies</li>
<li>radeonsi: fix a GS copy shader leak</li>
<li>gallium: add PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT</li>
</ul>
<p>Nicolai Hähnle (1):</p>
<ul>
<li>u_vbuf: fix vb slot assignment for translated buffers</li>
</ul>
<p>Rob Clark (1):</p>
<ul>
<li>freedreno/a3xx: cache-flush is needed after MEM_WRITE</li>
</ul>
<p>Tapani Pälli (3):</p>
<ul>
<li>mesa: add GL_UNSIGNED_INT_24_8 to _mesa_pack_depth_span</li>
<li>mesa: Set api prefix to version string when overriding version</li>
<li>mesa: fix ARRAY_SIZE query for GetProgramResourceiv</li>
</ul>
</div>
</body>
</html>

View File

@ -45,15 +45,21 @@ Note: some of the new features are only available with certain drivers.
<ul>
<li>GL_ARB_blend_func_extended on freedreno (a3xx)</li>
<li>GL_ARB_copy_image on radeonsi</li>
<li>GL_ARB_gpu_shader_fp64 on r600 for Cypress/Cayman/Aruba chips</li>
<li>GL_ARB_gpu_shader5 on r600 for Evergreen and later chips</li>
<li>GL_ARB_shader_clock on i965 (gen7+)</li>
<li>GL_ARB_shader_stencil_export on i965 (gen9+)</li>
<li>GL_ARB_shader_storage_buffer_object on i965</li>
<li>GL_ARB_shader_texture_image_samples on i965, nv50, nvc0, r600, radeonsi</li>
<li>GL_ARB_texture_barrier / GL_NV_texture_barrier on i965</li>
<li>GL_ARB_texture_query_lod on softpipe</li>
<li>GL_ARB_texture_view on radeonsi</li>
<li>GL_EXT_draw_elements_base_vertex on all drivers</li>
<li>GL_OES_draw_elements_base_vertex on all drivers</li>
<li>EGL_KHR_create_context on softpipe, llvmpipe</li>
<li>EGL_KHR_gl_colorspace on softpipe, llvmpipe</li>
<li>new virgl gallium driver for qemu virtio-gpu</li>
</ul>
<h2>Bug fixes</h2>

View File

@ -495,7 +495,7 @@ struct __DRIdamageExtensionRec {
* SWRast Loader extension.
*/
#define __DRI_SWRAST_LOADER "DRI_SWRastLoader"
#define __DRI_SWRAST_LOADER_VERSION 2
#define __DRI_SWRAST_LOADER_VERSION 3
struct __DRIswrastLoaderExtensionRec {
__DRIextension base;
@ -528,6 +528,15 @@ struct __DRIswrastLoaderExtensionRec {
void (*putImage2)(__DRIdrawable *drawable, int op,
int x, int y, int width, int height, int stride,
char *data, void *loaderPrivate);
/**
* Put image to drawable
*
* \since 3
*/
void (*getImage2)(__DRIdrawable *readable,
int x, int y, int width, int height, int stride,
char *data, void *loaderPrivate);
};
/**

View File

@ -109,21 +109,29 @@ CHIPSET(0x162A, bdw_gt3, "Intel(R) Iris Pro P6300 (Broadwell GT3e)")
CHIPSET(0x162B, bdw_gt3, "Intel(R) Iris 6100 (Broadwell GT3)")
CHIPSET(0x162D, bdw_gt3, "Intel(R) Broadwell GT3")
CHIPSET(0x162E, bdw_gt3, "Intel(R) Broadwell GT3")
CHIPSET(0x1902, skl_gt1, "Intel(R) Skylake DT GT1")
CHIPSET(0x1906, skl_gt1, "Intel(R) Skylake ULT GT1")
CHIPSET(0x190A, skl_gt1, "Intel(R) Skylake SRV GT1")
CHIPSET(0x190B, skl_gt1, "Intel(R) Skylake Halo GT1")
CHIPSET(0x190E, skl_gt1, "Intel(R) Skylake ULX GT1")
CHIPSET(0x1912, skl_gt2, "Intel(R) Skylake DT GT2")
CHIPSET(0x1916, skl_gt2, "Intel(R) Skylake ULT GT2")
CHIPSET(0x191A, skl_gt2, "Intel(R) Skylake SRV GT2")
CHIPSET(0x191B, skl_gt2, "Intel(R) Skylake Halo GT2")
CHIPSET(0x191D, skl_gt2, "Intel(R) Skylake WKS GT2")
CHIPSET(0x191E, skl_gt2, "Intel(R) Skylake ULX GT2")
CHIPSET(0x1921, skl_gt2, "Intel(R) Skylake ULT GT2F")
CHIPSET(0x1926, skl_gt3, "Intel(R) Skylake ULT GT3")
CHIPSET(0x192A, skl_gt3, "Intel(R) Skylake SRV GT3")
CHIPSET(0x192B, skl_gt3, "Intel(R) Skylake Halo GT3")
CHIPSET(0x1902, skl_gt1, "Intel(R) HD Graphics 510 (Skylake GT1)")
CHIPSET(0x1906, skl_gt1, "Intel(R) HD Graphics 510 (Skylake GT1)")
CHIPSET(0x190A, skl_gt1, "Intel(R) Skylake GT1")
CHIPSET(0x190E, skl_gt1, "Intel(R) Skylake GT1")
CHIPSET(0x1912, skl_gt2, "Intel(R) HD Graphics 530 (Skylake GT2)")
CHIPSET(0x1913, skl_gt2, "Intel(R) Skylake GT2f")
CHIPSET(0x1915, skl_gt2, "Intel(R) Skylake GT2f")
CHIPSET(0x1916, skl_gt2, "Intel(R) HD Graphics 520 (Skylake GT2)")
CHIPSET(0x1917, skl_gt2, "Intel(R) Skylake GT2f")
CHIPSET(0x191A, skl_gt2, "Intel(R) Skylake GT2")
CHIPSET(0x191B, skl_gt2, "Intel(R) HD Graphics 530 (Skylake GT2)")
CHIPSET(0x191D, skl_gt2, "Intel(R) HD Graphics P530 (Skylake GT2)")
CHIPSET(0x191E, skl_gt2, "Intel(R) HD Graphics 515 (Skylake GT2)")
CHIPSET(0x1921, skl_gt2, "Intel(R) Skylake GT2")
CHIPSET(0x1923, skl_gt3, "Intel(R) Iris Graphics 540 (Skylake GT3e)")
CHIPSET(0x1926, skl_gt3, "Intel(R) HD Graphics 535 (Skylake GT3)")
CHIPSET(0x1927, skl_gt3, "Intel(R) Iris Graphics 550 (Skylake GT3e)")
CHIPSET(0x192A, skl_gt4, "Intel(R) Skylake GT4")
CHIPSET(0x192B, skl_gt3, "Intel(R) Iris Graphics (Skylake GT3fe)")
CHIPSET(0x1932, skl_gt4, "Intel(R) Skylake GT4")
CHIPSET(0x193A, skl_gt4, "Intel(R) Skylake GT4")
CHIPSET(0x193B, skl_gt4, "Intel(R) Skylake GT4")
CHIPSET(0x193D, skl_gt4, "Intel(R) Skylake GT4")
CHIPSET(0x22B0, chv, "Intel(R) HD Graphics (Cherryview)")
CHIPSET(0x22B1, chv, "Intel(R) HD Graphics (Cherryview)")
CHIPSET(0x22B2, chv, "Intel(R) HD Graphics (Cherryview)")

View File

@ -181,3 +181,5 @@ CHIPSET(0x9876, CARRIZO_, CARRIZO)
CHIPSET(0x9877, CARRIZO_, CARRIZO)
CHIPSET(0x7300, FIJI_, FIJI)
CHIPSET(0x98E4, STONEY_, STONEY)

View File

@ -82,6 +82,11 @@ if HAVE_GALLIUM_VC4
SUBDIRS += drivers/vc4 winsys/vc4/drm
endif
## virgl
if HAVE_GALLIUM_VIRGL
SUBDIRS += drivers/virgl winsys/virgl/drm winsys/virgl/vtest
endif
## the sw winsys'
SUBDIRS += winsys/sw/null

View File

@ -427,6 +427,7 @@ lp_build_init(void)
*/
util_cpu_caps.has_avx = 0;
util_cpu_caps.has_avx2 = 0;
util_cpu_caps.has_f16c = 0;
}
#ifdef PIPE_ARCH_PPC_64
@ -458,7 +459,9 @@ lp_build_init(void)
util_cpu_caps.has_sse3 = 0;
util_cpu_caps.has_ssse3 = 0;
util_cpu_caps.has_sse4_1 = 0;
util_cpu_caps.has_sse4_2 = 0;
util_cpu_caps.has_avx = 0;
util_cpu_caps.has_avx2 = 0;
util_cpu_caps.has_f16c = 0;
#endif

View File

@ -497,20 +497,48 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT,
#endif
}
llvm::SmallVector<std::string, 1> MAttrs;
if (util_cpu_caps.has_avx) {
/*
* AVX feature is not automatically detected from CPUID by the X86 target
* yet, because the old (yet default) JIT engine is not capable of
* emitting the opcodes. On newer llvm versions it is and at least some
* versions (tested with 3.3) will emit avx opcodes without this anyway.
*/
MAttrs.push_back("+avx");
if (util_cpu_caps.has_f16c) {
MAttrs.push_back("+f16c");
}
builder.setMAttrs(MAttrs);
}
llvm::SmallVector<std::string, 16> MAttrs;
#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
/*
* We need to unset attributes because sometimes LLVM mistakenly assumes
* certain features are present given the processor name.
*
* https://bugs.freedesktop.org/show_bug.cgi?id=92214
* http://llvm.org/PR25021
* http://llvm.org/PR19429
* http://llvm.org/PR16721
*/
MAttrs.push_back(util_cpu_caps.has_sse ? "+sse" : "-sse" );
MAttrs.push_back(util_cpu_caps.has_sse2 ? "+sse2" : "-sse2" );
MAttrs.push_back(util_cpu_caps.has_sse3 ? "+sse3" : "-sse3" );
MAttrs.push_back(util_cpu_caps.has_ssse3 ? "+ssse3" : "-ssse3" );
#if HAVE_LLVM >= 0x0304
MAttrs.push_back(util_cpu_caps.has_sse4_1 ? "+sse4.1" : "-sse4.1");
#else
MAttrs.push_back(util_cpu_caps.has_sse4_1 ? "+sse41" : "-sse41" );
#endif
#if HAVE_LLVM >= 0x0304
MAttrs.push_back(util_cpu_caps.has_sse4_2 ? "+sse4.2" : "-sse4.2");
#else
MAttrs.push_back(util_cpu_caps.has_sse4_2 ? "+sse42" : "-sse42" );
#endif
/*
* AVX feature is not automatically detected from CPUID by the X86 target
* yet, because the old (yet default) JIT engine is not capable of
* emitting the opcodes. On newer llvm versions it is and at least some
* versions (tested with 3.3) will emit avx opcodes without this anyway.
*/
MAttrs.push_back(util_cpu_caps.has_avx ? "+avx" : "-avx");
MAttrs.push_back(util_cpu_caps.has_f16c ? "+f16c" : "-f16c");
MAttrs.push_back(util_cpu_caps.has_avx2 ? "+avx2" : "-avx2");
#endif
#if defined(PIPE_ARCH_PPC)
MAttrs.push_back(util_cpu_caps.has_altivec ? "+altivec" : "-altivec");
#endif
builder.setMAttrs(MAttrs);
#if HAVE_LLVM >= 0x0305
StringRef MCPU = llvm::sys::getHostCPUName();

View File

@ -405,16 +405,17 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
break;
case PIPE_TEX_WRAP_MIRROR_REPEAT:
if (offset) {
offset = lp_build_int_to_float(coord_bld, offset);
offset = lp_build_div(coord_bld, offset, length_f);
coord = lp_build_add(coord_bld, coord, offset);
}
/* compute mirror function */
coord = lp_build_coord_mirror(bld, coord);
/* scale coord to length */
coord = lp_build_mul(coord_bld, coord, length_f);
coord = lp_build_sub(coord_bld, coord, half);
if (offset) {
offset = lp_build_int_to_float(coord_bld, offset);
coord = lp_build_add(coord_bld, coord, offset);
}
/* convert to int, compute lerp weight */
lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
@ -567,12 +568,13 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
coord = lp_build_mul(coord_bld, coord, length_f);
}
if (offset) {
offset = lp_build_int_to_float(coord_bld, offset);
coord = lp_build_add(coord_bld, coord, offset);
}
/* floor */
/* use itrunc instead since we clamp to 0 anyway */
icoord = lp_build_itrunc(coord_bld, coord);
if (offset) {
icoord = lp_build_add(int_coord_bld, icoord, offset);
}
/* clamp to [0, length - 1]. */
icoord = lp_build_clamp(int_coord_bld, icoord, int_coord_bld->zero,
@ -2586,6 +2588,10 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm,
derived_sampler_state.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
derived_sampler_state.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
}
/*
* We could force CLAMP to CLAMP_TO_EDGE here if min/mag filter is nearest,
* so AoS path could be used. Not sure it's worth the trouble...
*/
min_img_filter = derived_sampler_state.min_img_filter;
mag_img_filter = derived_sampler_state.mag_img_filter;

View File

@ -59,6 +59,11 @@
#include "vc4/drm/vc4_drm_public.h"
#endif
#if GALLIUM_VIRGL
#include "virgl/drm/virgl_drm_public.h"
#include "virgl/virgl_public.h"
#endif
static char* driver_name = NULL;
/* XXX: We need to teardown the winsys if *screen_create() fails. */
@ -296,6 +301,33 @@ pipe_freedreno_create_screen(int fd)
}
#endif
#if defined(GALLIUM_VIRGL)
#if defined(DRI_TARGET)
/* Forward declaration so the PUBLIC visibility attribute can be applied
 * to the definition below. */
const __DRIextension **__driDriverGetExtensions_virtio_gpu(void);
/**
 * DRI loader entry point for the virtio-gpu (virgl) driver.
 *
 * Installs the shared gallium DRM driver API as the global driver API and
 * returns the common galliumdrm extension list.
 */
PUBLIC const __DRIextension **__driDriverGetExtensions_virtio_gpu(void)
{
globalDriverAPI = &galliumdrm_driver_api;
return galliumdrm_driver_extensions;
}
#endif
/**
 * Create a virgl pipe_screen on top of a DRM file descriptor.
 *
 * Creates the virgl DRM winsys for \p fd, then a virgl screen on top of it.
 * Returns NULL if the winsys cannot be created; on success the screen is
 * passed through debug_screen_wrap() before being returned.
 */
static struct pipe_screen *
pipe_virgl_create_screen(int fd)
{
struct virgl_winsys *vws;
struct pipe_screen *screen;
vws = virgl_drm_winsys_create(fd);
if (!vws)
return NULL;
screen = virgl_create_screen(vws);
/* NOTE(review): ownership of vws appears to transfer to the screen on
 * success — confirm virgl_create_screen's contract; vws is not freed here
 * if screen creation fails. */
return screen ? debug_screen_wrap(screen) : NULL;
}
#endif
#if defined(GALLIUM_VC4)
#if defined(DRI_TARGET)
@ -385,6 +417,11 @@ dd_create_screen(int fd)
return pipe_freedreno_create_screen(fd);
else
#endif
#if defined(GALLIUM_VIRGL)
if ((strcmp(driver_name, "virtio_gpu") == 0))
return pipe_virgl_create_screen(fd);
else
#endif
#if defined(GALLIUM_VC4)
if (strcmp(driver_name, "vc4") == 0)
return pipe_vc4_create_screen(fd);
@ -474,6 +511,11 @@ dd_configuration(enum drm_conf conf)
return configuration_query(conf);
else
#endif
#if defined(GALLIUM_VIRGL)
if ((strcmp(driver_name, "virtio_gpu") == 0))
return configuration_query(conf);
else
#endif
#if defined(GALLIUM_VC4)
if (strcmp(driver_name, "vc4") == 0)
return configuration_query(conf);

View File

@ -19,6 +19,10 @@
#include "llvmpipe/lp_public.h"
#endif
#ifdef GALLIUM_VIRGL
#include "virgl/virgl_public.h"
#include "virgl/vtest/virgl_vtest_public.h"
#endif
static inline struct pipe_screen *
sw_screen_create_named(struct sw_winsys *winsys, const char *driver)
@ -30,6 +34,14 @@ sw_screen_create_named(struct sw_winsys *winsys, const char *driver)
screen = llvmpipe_create_screen(winsys);
#endif
#if defined(GALLIUM_VIRGL)
if (screen == NULL && strcmp(driver, "virpipe") == 0) {
struct virgl_winsys *vws;
vws = virgl_vtest_winsys_wrap(winsys);
screen = virgl_create_screen(vws);
}
#endif
#if defined(GALLIUM_SOFTPIPE)
if (screen == NULL)
screen = softpipe_create_screen(winsys);

View File

@ -29,6 +29,7 @@
#include "util/u_string.h"
#include "util/u_math.h"
#include "util/u_memory.h"
#include "util/u_math.h"
#include "tgsi_dump.h"
#include "tgsi_info.h"
#include "tgsi_iterate.h"
@ -43,6 +44,8 @@ struct dump_ctx
{
struct tgsi_iterate_context iter;
boolean dump_float_as_hex;
uint instno;
uint immno;
int indent;
@ -88,6 +91,7 @@ dump_enum(
#define SID(I) ctx->dump_printf( ctx, "%d", I )
#define FLT(F) ctx->dump_printf( ctx, "%10.4f", F )
#define DBL(D) ctx->dump_printf( ctx, "%10.8f", D )
#define HFLT(F) ctx->dump_printf( ctx, "0x%08x", fui((F)) )
#define ENM(E,ENUMS) dump_enum( ctx, E, ENUMS, sizeof( ENUMS ) / sizeof( *ENUMS ) )
const char *
@ -251,7 +255,10 @@ dump_imm_data(struct tgsi_iterate_context *iter,
break;
}
case TGSI_IMM_FLOAT32:
FLT( data[i].Float );
if (ctx->dump_float_as_hex)
HFLT( data[i].Float );
else
FLT( data[i].Float );
break;
case TGSI_IMM_UINT32:
UID(data[i].Uint);
@ -682,6 +689,11 @@ tgsi_dump_to_file(const struct tgsi_token *tokens, uint flags, FILE *file)
ctx.indentation = 0;
ctx.file = file;
if (flags & TGSI_DUMP_FLOAT_AS_HEX)
ctx.dump_float_as_hex = TRUE;
else
ctx.dump_float_as_hex = FALSE;
tgsi_iterate_shader( tokens, &ctx.iter );
}
@ -697,6 +709,7 @@ struct str_dump_ctx
char *str;
char *ptr;
int left;
bool nospace;
};
static void
@ -719,10 +732,11 @@ str_dump_ctx_printf(struct dump_ctx *ctx, const char *format, ...)
sctx->ptr += written;
sctx->left -= written;
}
}
} else
sctx->nospace = true;
}
void
bool
tgsi_dump_str(
const struct tgsi_token *tokens,
uint flags,
@ -749,8 +763,16 @@ tgsi_dump_str(
ctx.str[0] = 0;
ctx.ptr = str;
ctx.left = (int)size;
ctx.nospace = false;
if (flags & TGSI_DUMP_FLOAT_AS_HEX)
ctx.base.dump_float_as_hex = TRUE;
else
ctx.base.dump_float_as_hex = FALSE;
tgsi_iterate_shader( tokens, &ctx.base.iter );
return !ctx.nospace;
}
void
@ -773,6 +795,7 @@ tgsi_dump_instruction_str(
ctx.str[0] = 0;
ctx.ptr = str;
ctx.left = (int)size;
ctx.nospace = false;
iter_instruction( &ctx.base.iter, (struct tgsi_full_instruction *)inst );
}

View File

@ -38,7 +38,9 @@
extern "C" {
#endif
void
#define TGSI_DUMP_FLOAT_AS_HEX (1 << 0)
bool
tgsi_dump_str(
const struct tgsi_token *tokens,
uint flags,

View File

@ -195,8 +195,15 @@ static boolean parse_float( const char **pcur, float *val )
boolean integral_part = FALSE;
boolean fractional_part = FALSE;
*val = (float) atof( cur );
if (*cur == '0' && *(cur + 1) == 'x') {
union fi fi;
fi.ui = strtoul(cur, NULL, 16);
*val = fi.f;
cur += 10;
goto out;
}
*val = (float) atof( cur );
if (*cur == '-' || *cur == '+')
cur++;
if (is_digit( cur )) {
@ -228,6 +235,8 @@ static boolean parse_float( const char **pcur, float *val )
else
return FALSE;
}
out:
*pcur = cur;
return TRUE;
}

View File

@ -169,6 +169,25 @@ util_format_is_snorm(enum pipe_format format)
desc->channel[i].normalized;
}
/**
 * Return TRUE if the format is an 8-bit signed-normalized (snorm8) format.
 *
 * Rejects mixed-type formats and formats with no non-void channel; otherwise
 * checks the first non-void channel for: signed type, not pure-integer,
 * normalized, and a channel size of exactly 8 bits.
 */
boolean
util_format_is_snorm8(enum pipe_format format)
{
const struct util_format_description *desc = util_format_description(format);
int i;
if (desc->is_mixed)
return FALSE;
i = util_format_get_first_non_void_channel(format);
if (i == -1)
return FALSE;
return desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED &&
!desc->channel[i].pure_integer &&
desc->channel[i].normalized &&
desc->channel[i].size == 8;
}
boolean
util_format_is_luminance_alpha(enum pipe_format format)
{

View File

@ -686,6 +686,9 @@ util_format_is_pure_uint(enum pipe_format format);
boolean
util_format_is_snorm(enum pipe_format format);
boolean
util_format_is_snorm8(enum pipe_format format);
/**
* Check if the src format can be blitted to the destination format with
* a simple memcpy. For example, blitting from RGBA to RGBx is OK, but not

View File

@ -450,6 +450,43 @@ null_constant_buffer(struct pipe_context *ctx)
util_report_result(pass);
}
static void
null_fragment_shader(struct pipe_context *ctx)
{
/* Driver self-test: draw with rasterization discarded and no fragment
 * shader bound (none is ever set here), and verify via a
 * PIPE_QUERY_PRIMITIVES_GENERATED query that the two triangles of the
 * fullscreen quad still reach primitive generation. */
struct cso_context *cso;
struct pipe_resource *cb;
void *vs;
struct pipe_rasterizer_state rs = {0};
struct pipe_query *query;
union pipe_query_result qresult;
cso = cso_create_context(ctx);
/* 256x256 RGBA8 color buffer used as the render target. */
cb = util_create_texture2d(ctx->screen, 256, 256,
PIPE_FORMAT_R8G8B8A8_UNORM);
util_set_common_states_and_clear(cso, ctx, cb);
/* No rasterization. */
rs.rasterizer_discard = 1;
cso_set_rasterizer(cso, &rs);
vs = util_set_passthrough_vertex_shader(cso, ctx, false);
query = ctx->create_query(ctx, PIPE_QUERY_PRIMITIVES_GENERATED, 0);
ctx->begin_query(ctx, query);
util_draw_fullscreen_quad(cso);
ctx->end_query(ctx, query);
/* Blocking read (wait=true) of the query result. */
ctx->get_query_result(ctx, query, true, &qresult);
/* Cleanup. */
cso_destroy_context(cso);
ctx->delete_vs_state(ctx, vs);
ctx->destroy_query(ctx, query);
pipe_resource_reference(&cb, NULL);
/* Check PRIMITIVES_GENERATED. */
/* A fullscreen quad is two triangles, hence the expected count of 2. */
util_report_result(qresult.u64 == 2);
}
/**
* Run all tests. This should be run with a clean context after
* context_create.
@ -459,6 +496,7 @@ util_run_tests(struct pipe_screen *screen)
{
struct pipe_context *ctx = screen->context_create(screen, NULL, 0);
null_fragment_shader(ctx);
tgsi_vs_window_space_position(ctx);
null_sampler_view(ctx, TGSI_TEXTURE_2D);
null_sampler_view(ctx, TGSI_TEXTURE_BUFFER);

View File

@ -278,7 +278,9 @@ The integer capabilities:
in the shader.
* ``PIPE_CAP_SHAREABLE_SHADERS``: Whether shader CSOs can be used by any
pipe_context.
* ``PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS``:
Whether copying between compressed and plain formats is supported where
a compressed block is copied to/from a plain pixel of the same size.
.. _pipe_capf:

View File

@ -81,7 +81,7 @@ draw_impl(struct fd_context *ctx, struct fd_ringbuffer *ring,
OUT_RING(ring, info->primitive_restart ? /* PC_RESTART_INDEX */
info->restart_index : 0xffffffff);
if (ctx->rasterizer && ctx->rasterizer->point_size_per_vertex &&
if (ctx->rasterizer->point_size_per_vertex &&
(info->mode == PIPE_PRIM_POINTS))
primtype = DI_PT_POINTLIST_PSIZE;
@ -137,7 +137,7 @@ fd3_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info)
.key = {
/* do binning pass first: */
.binning_pass = true,
.color_two_side = ctx->rasterizer ? ctx->rasterizer->light_twoside : false,
.color_two_side = ctx->rasterizer->light_twoside,
// TODO set .half_precision based on render target format,
// ie. float16 and smaller use half, float32 use full..
.half_precision = !!(fd_mesa_debug & FD_DBG_FRAGHALF),
@ -149,9 +149,9 @@ fd3_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info)
.fsaturate_t = fd3_ctx->fsaturate_t,
.fsaturate_r = fd3_ctx->fsaturate_r,
},
.rasterflat = ctx->rasterizer && ctx->rasterizer->flatshade,
.sprite_coord_enable = ctx->rasterizer ? ctx->rasterizer->sprite_coord_enable : 0,
.sprite_coord_mode = ctx->rasterizer ? ctx->rasterizer->sprite_coord_mode : false,
.rasterflat = ctx->rasterizer->flatshade,
.sprite_coord_enable = ctx->rasterizer->sprite_coord_enable,
.sprite_coord_mode = ctx->rasterizer->sprite_coord_mode,
};
unsigned dirty;

View File

@ -627,7 +627,7 @@ fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
ctx->prog.dirty = 0;
}
if ((dirty & (FD_DIRTY_BLEND | FD_DIRTY_FRAMEBUFFER)) && ctx->blend) {
if (dirty & (FD_DIRTY_BLEND | FD_DIRTY_FRAMEBUFFER)) {
struct fd3_blend_stateobj *blend = fd3_blend_stateobj(ctx->blend);
uint32_t i;

View File

@ -118,12 +118,12 @@ fd4_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info)
.key = {
/* do binning pass first: */
.binning_pass = true,
.color_two_side = ctx->rasterizer ? ctx->rasterizer->light_twoside : false,
.rasterflat = ctx->rasterizer && ctx->rasterizer->flatshade,
.color_two_side = ctx->rasterizer->light_twoside,
.rasterflat = ctx->rasterizer->flatshade,
// TODO set .half_precision based on render target format,
// ie. float16 and smaller use half, float32 use full..
.half_precision = !!(fd_mesa_debug & FD_DBG_FRAGHALF),
.ucp_enables = ctx->rasterizer ? ctx->rasterizer->clip_plane_enable : 0,
.ucp_enables = ctx->rasterizer->clip_plane_enable,
.has_per_samp = (fd4_ctx->fsaturate || fd4_ctx->vsaturate),
.vsaturate_s = fd4_ctx->vsaturate_s,
.vsaturate_t = fd4_ctx->vsaturate_t,
@ -132,9 +132,9 @@ fd4_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info)
.fsaturate_t = fd4_ctx->fsaturate_t,
.fsaturate_r = fd4_ctx->fsaturate_r,
},
.rasterflat = ctx->rasterizer && ctx->rasterizer->flatshade,
.sprite_coord_enable = ctx->rasterizer ? ctx->rasterizer->sprite_coord_enable : false,
.sprite_coord_mode = ctx->rasterizer ? ctx->rasterizer->sprite_coord_mode : false,
.rasterflat = ctx->rasterizer->flatshade,
.sprite_coord_enable = ctx->rasterizer->sprite_coord_enable,
.sprite_coord_mode = ctx->rasterizer->sprite_coord_mode,
};
unsigned dirty;

View File

@ -594,7 +594,7 @@ fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
ctx->prog.dirty = 0;
}
if ((dirty & FD_DIRTY_BLEND) && ctx->blend) {
if ((dirty & FD_DIRTY_BLEND)) {
struct fd4_blend_stateobj *blend = fd4_blend_stateobj(ctx->blend);
uint32_t i;

View File

@ -238,6 +238,7 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_TGSI_TXQS:
case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
case PIPE_CAP_SHAREABLE_SHADERS:
case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
return 0;
case PIPE_CAP_MAX_VIEWPORTS:

View File

@ -252,6 +252,7 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap cap)
case PIPE_CAP_TGSI_TXQS:
case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
case PIPE_CAP_SHAREABLE_SHADERS:
case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
return 0;
case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:

View File

@ -202,14 +202,16 @@ static inline void
gen6_3DSTATE_WM(struct ilo_builder *builder,
const struct ilo_state_raster *rs,
const struct ilo_state_ps *ps,
uint32_t kernel_offset)
uint32_t kernel_offset,
struct intel_bo *scratch_bo)
{
const uint8_t cmd_len = 9;
uint32_t *dw;
unsigned pos;
ILO_DEV_ASSERT(builder->dev, 6, 6);
ilo_builder_batch_pointer(builder, cmd_len, &dw);
pos = ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_WM) | (cmd_len - 2);
dw[1] = kernel_offset;
@ -221,6 +223,11 @@ gen6_3DSTATE_WM(struct ilo_builder *builder,
dw[6] = rs->wm[2] | ps->ps[4];
dw[7] = 0; /* kernel 1 */
dw[8] = 0; /* kernel 2 */
if (ilo_state_ps_get_scratch_size(ps)) {
ilo_builder_batch_reloc(builder, pos + 2, scratch_bo,
ps->ps[0], 0);
}
}
static inline void
@ -329,14 +336,16 @@ gen8_3DSTATE_WM_CHROMAKEY(struct ilo_builder *builder)
static inline void
gen7_3DSTATE_PS(struct ilo_builder *builder,
const struct ilo_state_ps *ps,
uint32_t kernel_offset)
uint32_t kernel_offset,
struct intel_bo *scratch_bo)
{
const uint8_t cmd_len = 8;
uint32_t *dw;
unsigned pos;
ILO_DEV_ASSERT(builder->dev, 7, 7.5);
ilo_builder_batch_pointer(builder, cmd_len, &dw);
pos = ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_PS) | (cmd_len - 2);
dw[1] = kernel_offset;
@ -347,19 +356,26 @@ gen7_3DSTATE_PS(struct ilo_builder *builder,
dw[5] = ps->ps[5];
dw[6] = 0; /* kernel 1 */
dw[7] = 0; /* kernel 2 */
if (ilo_state_ps_get_scratch_size(ps)) {
ilo_builder_batch_reloc(builder, pos + 3, scratch_bo,
ps->ps[3], 0);
}
}
static inline void
gen8_3DSTATE_PS(struct ilo_builder *builder,
const struct ilo_state_ps *ps,
uint32_t kernel_offset)
uint32_t kernel_offset,
struct intel_bo *scratch_bo)
{
const uint8_t cmd_len = 12;
uint32_t *dw;
unsigned pos;
ILO_DEV_ASSERT(builder->dev, 8, 8);
ilo_builder_batch_pointer(builder, cmd_len, &dw);
pos = ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_PS) | (cmd_len - 2);
dw[1] = kernel_offset;
@ -374,6 +390,11 @@ gen8_3DSTATE_PS(struct ilo_builder *builder,
dw[9] = 0;
dw[10] = 0; /* kernel 2 */
dw[11] = 0;
if (ilo_state_ps_get_scratch_size(ps)) {
ilo_builder_batch_reloc64(builder, pos + 4, scratch_bo,
ps->ps[1], 0);
}
}
static inline void

View File

@ -477,14 +477,16 @@ gen8_3DSTATE_INDEX_BUFFER(struct ilo_builder *builder,
static inline void
gen6_3DSTATE_VS(struct ilo_builder *builder,
const struct ilo_state_vs *vs,
uint32_t kernel_offset)
uint32_t kernel_offset,
struct intel_bo *scratch_bo)
{
const uint8_t cmd_len = 6;
uint32_t *dw;
unsigned pos;
ILO_DEV_ASSERT(builder->dev, 6, 7.5);
ilo_builder_batch_pointer(builder, cmd_len, &dw);
pos = ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_VS) | (cmd_len - 2);
dw[1] = kernel_offset;
@ -493,19 +495,26 @@ gen6_3DSTATE_VS(struct ilo_builder *builder,
dw[3] = vs->vs[1];
dw[4] = vs->vs[2];
dw[5] = vs->vs[3];
if (ilo_state_vs_get_scratch_size(vs)) {
ilo_builder_batch_reloc(builder, pos + 3, scratch_bo,
vs->vs[1], 0);
}
}
static inline void
gen8_3DSTATE_VS(struct ilo_builder *builder,
const struct ilo_state_vs *vs,
uint32_t kernel_offset)
uint32_t kernel_offset,
struct intel_bo *scratch_bo)
{
const uint8_t cmd_len = 9;
uint32_t *dw;
unsigned pos;
ILO_DEV_ASSERT(builder->dev, 8, 8);
ilo_builder_batch_pointer(builder, cmd_len, &dw);
pos = ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_VS) | (cmd_len - 2);
dw[1] = kernel_offset;
@ -517,19 +526,26 @@ gen8_3DSTATE_VS(struct ilo_builder *builder,
dw[6] = vs->vs[2];
dw[7] = vs->vs[3];
dw[8] = vs->vs[4];
if (ilo_state_vs_get_scratch_size(vs)) {
ilo_builder_batch_reloc64(builder, pos + 4, scratch_bo,
vs->vs[1], 0);
}
}
static inline void
gen7_3DSTATE_HS(struct ilo_builder *builder,
const struct ilo_state_hs *hs,
uint32_t kernel_offset)
uint32_t kernel_offset,
struct intel_bo *scratch_bo)
{
const uint8_t cmd_len = 7;
uint32_t *dw;
unsigned pos;
ILO_DEV_ASSERT(builder->dev, 7, 7.5);
ilo_builder_batch_pointer(builder, cmd_len, &dw);
pos = ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_HS) | (cmd_len - 2);
/* see hs_set_gen7_3DSTATE_HS() */
@ -539,19 +555,26 @@ gen7_3DSTATE_HS(struct ilo_builder *builder,
dw[4] = hs->hs[2];
dw[5] = hs->hs[3];
dw[6] = 0;
if (ilo_state_hs_get_scratch_size(hs)) {
ilo_builder_batch_reloc(builder, pos + 4, scratch_bo,
hs->hs[2], 0);
}
}
static inline void
gen8_3DSTATE_HS(struct ilo_builder *builder,
const struct ilo_state_hs *hs,
uint32_t kernel_offset)
uint32_t kernel_offset,
struct intel_bo *scratch_bo)
{
const uint8_t cmd_len = 9;
uint32_t *dw;
unsigned pos;
ILO_DEV_ASSERT(builder->dev, 8, 8);
ilo_builder_batch_pointer(builder, cmd_len, &dw);
pos = ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_HS) | (cmd_len - 2);
/* see hs_set_gen7_3DSTATE_HS() */
@ -563,6 +586,11 @@ gen8_3DSTATE_HS(struct ilo_builder *builder,
dw[6] = 0;
dw[7] = hs->hs[3];
dw[8] = 0;
if (ilo_state_hs_get_scratch_size(hs)) {
ilo_builder_batch_reloc64(builder, pos + 5, scratch_bo,
hs->hs[2], 0);
}
}
static inline void
@ -586,14 +614,16 @@ gen7_3DSTATE_TE(struct ilo_builder *builder,
static inline void
gen7_3DSTATE_DS(struct ilo_builder *builder,
const struct ilo_state_ds *ds,
uint32_t kernel_offset)
uint32_t kernel_offset,
struct intel_bo *scratch_bo)
{
const uint8_t cmd_len = 6;
uint32_t *dw;
unsigned pos;
ILO_DEV_ASSERT(builder->dev, 7, 7.5);
ilo_builder_batch_pointer(builder, cmd_len, &dw);
pos = ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_DS) | (cmd_len - 2);
/* see ds_set_gen7_3DSTATE_DS() */
@ -602,19 +632,26 @@ gen7_3DSTATE_DS(struct ilo_builder *builder,
dw[3] = ds->ds[1];
dw[4] = ds->ds[2];
dw[5] = ds->ds[3];
if (ilo_state_ds_get_scratch_size(ds)) {
ilo_builder_batch_reloc(builder, pos + 3, scratch_bo,
ds->ds[1], 0);
}
}
static inline void
gen8_3DSTATE_DS(struct ilo_builder *builder,
const struct ilo_state_ds *ds,
uint32_t kernel_offset)
uint32_t kernel_offset,
struct intel_bo *scratch_bo)
{
const uint8_t cmd_len = 9;
uint32_t *dw;
unsigned pos;
ILO_DEV_ASSERT(builder->dev, 8, 8);
ilo_builder_batch_pointer(builder, cmd_len, &dw);
pos = ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_DS) | (cmd_len - 2);
/* see ds_set_gen7_3DSTATE_DS() */
@ -626,19 +663,26 @@ gen8_3DSTATE_DS(struct ilo_builder *builder,
dw[6] = ds->ds[2];
dw[7] = ds->ds[3];
dw[8] = ds->ds[4];
if (ilo_state_ds_get_scratch_size(ds)) {
ilo_builder_batch_reloc64(builder, pos + 4, scratch_bo,
ds->ds[1], 0);
}
}
static inline void
gen6_3DSTATE_GS(struct ilo_builder *builder,
const struct ilo_state_gs *gs,
uint32_t kernel_offset)
uint32_t kernel_offset,
struct intel_bo *scratch_bo)
{
const uint8_t cmd_len = 7;
uint32_t *dw;
unsigned pos;
ILO_DEV_ASSERT(builder->dev, 6, 6);
ilo_builder_batch_pointer(builder, cmd_len, &dw);
pos = ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_GS) | (cmd_len - 2);
dw[1] = kernel_offset;
@ -648,6 +692,11 @@ gen6_3DSTATE_GS(struct ilo_builder *builder,
dw[4] = gs->gs[2];
dw[5] = gs->gs[3];
dw[6] = gs->gs[4];
if (ilo_state_gs_get_scratch_size(gs)) {
ilo_builder_batch_reloc(builder, pos + 3, scratch_bo,
gs->gs[1], 0);
}
}
static inline void
@ -677,14 +726,16 @@ gen6_3DSTATE_GS_SVB_INDEX(struct ilo_builder *builder,
static inline void
gen7_3DSTATE_GS(struct ilo_builder *builder,
const struct ilo_state_gs *gs,
uint32_t kernel_offset)
uint32_t kernel_offset,
struct intel_bo *scratch_bo)
{
const uint8_t cmd_len = 7;
uint32_t *dw;
unsigned pos;
ILO_DEV_ASSERT(builder->dev, 7, 7.5);
ilo_builder_batch_pointer(builder, cmd_len, &dw);
pos = ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_GS) | (cmd_len - 2);
dw[1] = kernel_offset;
@ -694,19 +745,26 @@ gen7_3DSTATE_GS(struct ilo_builder *builder,
dw[4] = gs->gs[2];
dw[5] = gs->gs[3];
dw[6] = 0;
if (ilo_state_gs_get_scratch_size(gs)) {
ilo_builder_batch_reloc(builder, pos + 3, scratch_bo,
gs->gs[1], 0);
}
}
static inline void
gen8_3DSTATE_GS(struct ilo_builder *builder,
const struct ilo_state_gs *gs,
uint32_t kernel_offset)
uint32_t kernel_offset,
struct intel_bo *scratch_bo)
{
const uint8_t cmd_len = 10;
uint32_t *dw;
unsigned pos;
ILO_DEV_ASSERT(builder->dev, 8, 8);
ilo_builder_batch_pointer(builder, cmd_len, &dw);
pos = ilo_builder_batch_pointer(builder, cmd_len, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_GS) | (cmd_len - 2);
dw[1] = kernel_offset;
@ -719,6 +777,11 @@ gen8_3DSTATE_GS(struct ilo_builder *builder,
dw[7] = gs->gs[3];
dw[8] = 0;
dw[9] = gs->gs[4];
if (ilo_state_gs_get_scratch_size(gs)) {
ilo_builder_batch_reloc64(builder, pos + 4, scratch_bo,
gs->gs[1], 0);
}
}
static inline void

View File

@ -158,7 +158,8 @@ compute_interface_get_gen6_read_end(const struct ilo_dev *dev,
*/
assert(per_thread_read <= 63);
/* From the Haswell PRM, volume 2d, page 199:
/*
* From the Haswell PRM, volume 2d, page 199:
*
* "(Cross-Thread Constant Data Read Length) [0,127]"
*/
@ -210,38 +211,68 @@ compute_validate_gen6(const struct ilo_dev *dev,
return true;
}
static uint8_t
compute_get_gen6_scratch_space(const struct ilo_dev *dev,
const struct ilo_state_compute_info *info)
static uint32_t
compute_get_gen6_per_thread_scratch_size(const struct ilo_dev *dev,
const struct ilo_state_compute_info *info,
uint8_t *per_thread_space)
{
uint32_t scratch_size = 0;
uint8_t i;
ILO_DEV_ASSERT(dev, 6, 7);
ILO_DEV_ASSERT(dev, 6, 8);
/*
* From the Sandy Bridge PRM, volume 2 part 2, page 30:
*
* "(Per Thread Scratch Space)
* Range = [0,11] indicating [1k bytes, 12k bytes] [DevSNB]"
*/
assert(info->per_thread_scratch_size <= 12 * 1024);
for (i = 0; i < info->interface_count; i++) {
if (scratch_size < info->interfaces[i].scratch_size)
scratch_size = info->interfaces[i].scratch_size;
if (!info->per_thread_scratch_size) {
*per_thread_space = 0;
return 0;
}
if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
assert(scratch_size <= 2 * 1024 * 1024);
*per_thread_space = (info->per_thread_scratch_size > 1024) ?
(info->per_thread_scratch_size - 1) / 1024 : 0;
/* next power of two, starting from 1KB */
return (scratch_size > 1024) ?
(util_last_bit(scratch_size - 1) - 10): 0;
} else if (ilo_dev_gen(dev) >= ILO_GEN(7.5)) {
assert(scratch_size <= 2 * 1024 * 1024);
return 1024 * (1 + *per_thread_space);
}
/* next power of two, starting from 2KB */
return (scratch_size > 2048) ?
(util_last_bit(scratch_size - 1) - 11): 0;
} else {
assert(scratch_size <= 12 * 1024);
static uint32_t
compute_get_gen75_per_thread_scratch_size(const struct ilo_dev *dev,
const struct ilo_state_compute_info *info,
uint8_t *per_thread_space)
{
ILO_DEV_ASSERT(dev, 7.5, 8);
return (scratch_size > 1024) ?
(scratch_size - 1) / 1024 : 0;
/*
* From the Haswell PRM, volume 2b, page 407:
*
* "(Per Thread Scratch Space)
* [0,10] Indicating [2k bytes, 2 Mbytes]"
*
* "Note: The scratch space should be declared as 2x the desired
* scratch space. The stack will start at the half-way point instead
* of the end. The upper half of scratch space will not be accessed
* and so does not have to be allocated in memory."
*
* From the Broadwell PRM, volume 2a, page 450:
*
* "(Per Thread Scratch Space)
* [0,11] indicating [1k bytes, 2 Mbytes]"
*/
assert(info->per_thread_scratch_size <=
((ilo_dev_gen(dev) >= ILO_GEN(8)) ? 2 : 1) * 1024 * 1024);
if (!info->per_thread_scratch_size) {
*per_thread_space = 0;
return 0;
}
/* next power of two, starting from 1KB */
*per_thread_space = (info->per_thread_scratch_size > 1024) ?
(util_last_bit(info->per_thread_scratch_size - 1) - 10) : 0;
return 1 << (10 + *per_thread_space);
}
static bool
@ -250,7 +281,8 @@ compute_set_gen6_MEDIA_VFE_STATE(struct ilo_state_compute *compute,
const struct ilo_state_compute_info *info)
{
struct compute_urb_configuration urb;
uint8_t scratch_space;
uint32_t per_thread_size;
uint8_t per_thread_space;
uint32_t dw1, dw2, dw4;
@ -260,9 +292,16 @@ compute_set_gen6_MEDIA_VFE_STATE(struct ilo_state_compute *compute,
!compute_validate_gen6(dev, info, &urb))
return false;
scratch_space = compute_get_gen6_scratch_space(dev, info);
if (ilo_dev_gen(dev) >= ILO_GEN(7.5)) {
per_thread_size = compute_get_gen75_per_thread_scratch_size(dev,
info, &per_thread_space);
} else {
per_thread_size = compute_get_gen6_per_thread_scratch_size(dev,
info, &per_thread_space);
}
dw1 = per_thread_space << GEN6_VFE_DW1_SCRATCH_SPACE_PER_THREAD__SHIFT;
dw1 = scratch_space << GEN6_VFE_DW1_SCRATCH_SPACE_PER_THREAD__SHIFT;
dw2 = (dev->thread_count - 1) << GEN6_VFE_DW2_MAX_THREADS__SHIFT |
urb.urb_entry_count << GEN6_VFE_DW2_URB_ENTRY_COUNT__SHIFT |
GEN6_VFE_DW2_RESET_GATEWAY_TIMER |
@ -281,6 +320,8 @@ compute_set_gen6_MEDIA_VFE_STATE(struct ilo_state_compute *compute,
compute->vfe[1] = dw2;
compute->vfe[2] = dw4;
compute->scratch_size = per_thread_size * dev->thread_count;
return true;
}

View File

@ -45,8 +45,6 @@ struct ilo_state_compute_interface_info {
/* usually 0 unless there are multiple interfaces */
uint32_t kernel_offset;
uint32_t scratch_size;
uint8_t sampler_count;
uint8_t surface_count;
@ -65,6 +63,8 @@ struct ilo_state_compute_info {
const struct ilo_state_compute_interface_info *interfaces;
uint8_t interface_count;
uint32_t per_thread_scratch_size;
uint32_t cv_urb_alloc_size;
uint32_t curbe_alloc_size;
};
@ -74,6 +74,8 @@ struct ilo_state_compute {
uint32_t (*idrt)[6];
uint8_t idrt_count;
uint32_t scratch_size;
};
static inline size_t
@ -89,4 +91,10 @@ ilo_state_compute_init(struct ilo_state_compute *compute,
const struct ilo_dev *dev,
const struct ilo_state_compute_info *info);
static inline uint32_t
ilo_state_compute_get_scratch_size(const struct ilo_state_compute *compute)
{
return compute->scratch_size;
}
#endif /* ILO_STATE_COMPUTE_H */

View File

@ -37,7 +37,9 @@ enum vertex_stage {
struct vertex_ff {
uint8_t grf_start;
uint8_t scratch_space;
uint8_t per_thread_scratch_space;
uint32_t per_thread_scratch_size;
uint8_t sampler_count;
uint8_t surface_count;
@ -59,13 +61,6 @@ vertex_validate_gen6_kernel(const struct ilo_dev *dev,
* others.
*/
const uint8_t max_grf_start = (stage == STAGE_GS) ? 16 : 32;
/*
* From the Sandy Bridge PRM, volume 2 part 1, page 134:
*
* "(Per-Thread Scratch Space)
* Range [0,11] indicating [1K Bytes, 2M Bytes]"
*/
const uint32_t max_scratch_size = 2 * 1024 * 1024;
ILO_DEV_ASSERT(dev, 6, 8);
@ -73,7 +68,6 @@ vertex_validate_gen6_kernel(const struct ilo_dev *dev,
assert(!kernel->offset);
assert(kernel->grf_start < max_grf_start);
assert(kernel->scratch_size <= max_scratch_size);
return true;
}
@ -112,18 +106,33 @@ vertex_get_gen6_ff(const struct ilo_dev *dev,
const struct ilo_state_shader_kernel_info *kernel,
const struct ilo_state_shader_resource_info *resource,
const struct ilo_state_shader_urb_info *urb,
uint32_t per_thread_scratch_size,
struct vertex_ff *ff)
{
ILO_DEV_ASSERT(dev, 6, 8);
memset(ff, 0, sizeof(*ff));
if (!vertex_validate_gen6_kernel(dev, stage, kernel) ||
!vertex_validate_gen6_urb(dev, stage, urb))
return false;
ff->grf_start = kernel->grf_start;
/* next power of two, starting from 1KB */
ff->scratch_space = (kernel->scratch_size > 1024) ?
(util_last_bit(kernel->scratch_size - 1) - 10): 0;
if (per_thread_scratch_size) {
/*
* From the Sandy Bridge PRM, volume 2 part 1, page 134:
*
* "(Per-Thread Scratch Space)
* Range [0,11] indicating [1K Bytes, 2M Bytes]"
*/
assert(per_thread_scratch_size <= 2 * 1024 * 1024);
/* next power of two, starting from 1KB */
ff->per_thread_scratch_space = (per_thread_scratch_size > 1024) ?
(util_last_bit(per_thread_scratch_size - 1) - 10) : 0;
ff->per_thread_scratch_size = 1 << (10 + ff->per_thread_scratch_space);
}
ff->sampler_count = (resource->sampler_count <= 12) ?
(resource->sampler_count + 3) / 4 : 4;
@ -192,8 +201,8 @@ vs_set_gen6_3DSTATE_VS(struct ilo_state_vs *vs,
ILO_DEV_ASSERT(dev, 6, 8);
if (!vertex_get_gen6_ff(dev, STAGE_VS, &info->kernel,
&info->resource, &info->urb, &ff))
if (!vertex_get_gen6_ff(dev, STAGE_VS, &info->kernel, &info->resource,
&info->urb, info->per_thread_scratch_size, &ff))
return false;
thread_count = vs_get_gen6_thread_count(dev, info);
@ -207,7 +216,8 @@ vs_set_gen6_3DSTATE_VS(struct ilo_state_vs *vs,
if (ilo_dev_gen(dev) >= ILO_GEN(7.5) && ff.has_uav)
dw2 |= GEN75_THREADDISP_ACCESS_UAV;
dw3 = ff.scratch_space << GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
dw3 = ff.per_thread_scratch_space <<
GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
dw4 = ff.grf_start << GEN6_VS_DW4_URB_GRF_START__SHIFT |
ff.vue_read_len << GEN6_VS_DW4_URB_READ_LEN__SHIFT |
@ -234,6 +244,8 @@ vs_set_gen6_3DSTATE_VS(struct ilo_state_vs *vs,
if (ilo_dev_gen(dev) >= ILO_GEN(8))
vs->vs[4] = ff.user_clip_enables << GEN8_VS_DW8_UCP_CLIP_ENABLES__SHIFT;
vs->scratch_size = ff.per_thread_scratch_size * thread_count;
return true;
}
@ -273,8 +285,8 @@ hs_set_gen7_3DSTATE_HS(struct ilo_state_hs *hs,
ILO_DEV_ASSERT(dev, 7, 8);
if (!vertex_get_gen6_ff(dev, STAGE_HS, &info->kernel,
&info->resource, &info->urb, &ff))
if (!vertex_get_gen6_ff(dev, STAGE_HS, &info->kernel, &info->resource,
&info->urb, info->per_thread_scratch_size, &ff))
return false;
thread_count = hs_get_gen7_thread_count(dev, info);
@ -282,19 +294,22 @@ hs_set_gen7_3DSTATE_HS(struct ilo_state_hs *hs,
dw1 = ff.sampler_count << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT |
ff.surface_count << GEN6_THREADDISP_BINDING_TABLE_SIZE__SHIFT;
if (ilo_dev_gen(dev) >= ILO_GEN(7.5))
dw2 = 0 << GEN7_HS_DW2_INSTANCE_COUNT__SHIFT;
if (ilo_dev_gen(dev) >= ILO_GEN(8))
dw2 |= thread_count << GEN8_HS_DW2_MAX_THREADS__SHIFT;
else if (ilo_dev_gen(dev) >= ILO_GEN(7.5))
dw1 |= thread_count << GEN75_HS_DW1_DISPATCH_MAX_THREADS__SHIFT;
else
dw1 |= thread_count << GEN7_HS_DW1_DISPATCH_MAX_THREADS__SHIFT;
dw2 = 0 << GEN7_HS_DW2_INSTANCE_COUNT__SHIFT;
if (info->dispatch_enable)
dw2 |= GEN7_HS_DW2_HS_ENABLE;
if (info->stats_enable)
dw2 |= GEN7_HS_DW2_STATISTICS;
dw4 = ff.scratch_space << GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
dw4 = ff.per_thread_scratch_space <<
GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
dw5 = GEN7_HS_DW5_INCLUDE_VERTEX_HANDLES |
ff.grf_start << GEN7_HS_DW5_URB_GRF_START__SHIFT |
@ -310,6 +325,8 @@ hs_set_gen7_3DSTATE_HS(struct ilo_state_hs *hs,
hs->hs[2] = dw4;
hs->hs[3] = dw5;
hs->scratch_size = ff.per_thread_scratch_size * thread_count;
return true;
}
@ -373,8 +390,8 @@ ds_set_gen7_3DSTATE_DS(struct ilo_state_ds *ds,
ILO_DEV_ASSERT(dev, 7, 8);
if (!vertex_get_gen6_ff(dev, STAGE_DS, &info->kernel,
&info->resource, &info->urb, &ff))
if (!vertex_get_gen6_ff(dev, STAGE_DS, &info->kernel, &info->resource,
&info->urb, info->per_thread_scratch_size, &ff))
return false;
thread_count = ds_get_gen7_thread_count(dev, info);
@ -385,7 +402,8 @@ ds_set_gen7_3DSTATE_DS(struct ilo_state_ds *ds,
if (ilo_dev_gen(dev) >= ILO_GEN(7.5) && ff.has_uav)
dw2 |= GEN75_THREADDISP_ACCESS_UAV;
dw3 = ff.scratch_space << GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
dw3 = ff.per_thread_scratch_space <<
GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
dw4 = ff.grf_start << GEN7_DS_DW4_URB_GRF_START__SHIFT |
ff.vue_read_len << GEN7_DS_DW4_URB_READ_LEN__SHIFT |
@ -412,6 +430,8 @@ ds_set_gen7_3DSTATE_DS(struct ilo_state_ds *ds,
if (ilo_dev_gen(dev) >= ILO_GEN(8))
ds->ds[4] = ff.user_clip_enables << GEN8_DS_DW8_UCP_CLIP_ENABLES__SHIFT;
ds->scratch_size = ff.per_thread_scratch_size * thread_count;
return true;
}
@ -425,8 +445,8 @@ gs_get_gen6_ff(const struct ilo_dev *dev,
ILO_DEV_ASSERT(dev, 6, 8);
if (!vertex_get_gen6_ff(dev, STAGE_GS, &info->kernel,
&info->resource, &info->urb, ff))
if (!vertex_get_gen6_ff(dev, STAGE_GS, &info->kernel, &info->resource,
&info->urb, info->per_thread_scratch_size, ff))
return false;
/*
@ -510,7 +530,8 @@ gs_set_gen6_3DSTATE_GS(struct ilo_state_gs *gs,
ff.sampler_count << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT |
ff.surface_count << GEN6_THREADDISP_BINDING_TABLE_SIZE__SHIFT;
dw3 = ff.scratch_space << GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
dw3 = ff.per_thread_scratch_space <<
GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
dw4 = ff.vue_read_len << GEN6_GS_DW4_URB_READ_LEN__SHIFT |
ff.vue_read_offset << GEN6_GS_DW4_URB_READ_OFFSET__SHIFT |
@ -550,6 +571,8 @@ gs_set_gen6_3DSTATE_GS(struct ilo_state_gs *gs,
gs->gs[3] = dw5;
gs->gs[4] = dw6;
gs->scratch_size = ff.per_thread_scratch_size * thread_count;
return true;
}
@ -588,7 +611,8 @@ gs_set_gen7_3DSTATE_GS(struct ilo_state_gs *gs,
if (ilo_dev_gen(dev) >= ILO_GEN(7.5) && ff.has_uav)
dw2 |= GEN75_THREADDISP_ACCESS_UAV;
dw3 = ff.scratch_space << GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
dw3 = ff.per_thread_scratch_space <<
GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
dw4 = vertex_size << GEN7_GS_DW4_OUTPUT_SIZE__SHIFT |
0 << GEN7_GS_DW4_OUTPUT_TOPO__SHIFT |
@ -618,6 +642,8 @@ gs_set_gen7_3DSTATE_GS(struct ilo_state_gs *gs,
if (ilo_dev_gen(dev) >= ILO_GEN(8))
gs->gs[4] = ff.user_clip_enables << GEN8_GS_DW9_UCP_CLIP_ENABLES__SHIFT;
gs->scratch_size = ff.per_thread_scratch_size * thread_count;
return true;
}

View File

@ -42,8 +42,6 @@ struct ilo_state_shader_kernel_info {
uint8_t grf_start;
uint8_t pcb_attr_count;
uint32_t scratch_size;
};
/**
@ -77,6 +75,7 @@ struct ilo_state_vs_info {
struct ilo_state_shader_resource_info resource;
struct ilo_state_shader_urb_info urb;
uint32_t per_thread_scratch_size;
bool dispatch_enable;
bool stats_enable;
};
@ -86,6 +85,7 @@ struct ilo_state_hs_info {
struct ilo_state_shader_resource_info resource;
struct ilo_state_shader_urb_info urb;
uint32_t per_thread_scratch_size;
bool dispatch_enable;
bool stats_enable;
};
@ -95,6 +95,7 @@ struct ilo_state_ds_info {
struct ilo_state_shader_resource_info resource;
struct ilo_state_shader_urb_info urb;
uint32_t per_thread_scratch_size;
bool dispatch_enable;
bool stats_enable;
};
@ -119,6 +120,7 @@ struct ilo_state_gs_info {
struct ilo_state_gs_sol_info sol;
uint32_t per_thread_scratch_size;
bool dispatch_enable;
bool stats_enable;
};
@ -158,6 +160,8 @@ struct ilo_state_ps_info {
struct ilo_state_ps_io_info io;
struct ilo_state_ps_params_info params;
uint32_t per_thread_scratch_size;
/* bitmask of GEN6_PS_DISPATCH_x */
uint8_t valid_kernels;
bool per_sample_dispatch;
@ -173,23 +177,28 @@ struct ilo_state_ps_info {
struct ilo_state_vs {
uint32_t vs[5];
uint32_t scratch_size;
};
struct ilo_state_hs {
uint32_t hs[4];
uint32_t scratch_size;
};
struct ilo_state_ds {
uint32_t te[3];
uint32_t ds[5];
uint32_t scratch_size;
};
struct ilo_state_gs {
uint32_t gs[5];
uint32_t scratch_size;
};
struct ilo_state_ps {
uint32_t ps[8];
uint32_t scratch_size;
struct ilo_state_ps_dispatch_conds {
bool ps_valid;
@ -211,6 +220,12 @@ bool
ilo_state_vs_init_disabled(struct ilo_state_vs *vs,
const struct ilo_dev *dev);
static inline uint32_t
ilo_state_vs_get_scratch_size(const struct ilo_state_vs *vs)
{
return vs->scratch_size;
}
bool
ilo_state_hs_init(struct ilo_state_hs *hs,
const struct ilo_dev *dev,
@ -221,6 +236,12 @@ ilo_state_hs_init_disabled(struct ilo_state_hs *hs,
const struct ilo_dev *dev);
static inline uint32_t
ilo_state_hs_get_scratch_size(const struct ilo_state_hs *hs)
{
return hs->scratch_size;
}
bool
ilo_state_ds_init(struct ilo_state_ds *ds,
const struct ilo_dev *dev,
@ -230,6 +251,12 @@ bool
ilo_state_ds_init_disabled(struct ilo_state_ds *ds,
const struct ilo_dev *dev);
static inline uint32_t
ilo_state_ds_get_scratch_size(const struct ilo_state_ds *ds)
{
return ds->scratch_size;
}
bool
ilo_state_gs_init(struct ilo_state_gs *gs,
const struct ilo_dev *dev,
@ -239,6 +266,12 @@ bool
ilo_state_gs_init_disabled(struct ilo_state_gs *gs,
const struct ilo_dev *dev);
static inline uint32_t
ilo_state_gs_get_scratch_size(const struct ilo_state_gs *gs)
{
return gs->scratch_size;
}
bool
ilo_state_ps_init(struct ilo_state_ps *ps,
const struct ilo_dev *dev,
@ -253,4 +286,10 @@ ilo_state_ps_set_params(struct ilo_state_ps *ps,
const struct ilo_dev *dev,
const struct ilo_state_ps_params_info *params);
static inline uint32_t
ilo_state_ps_get_scratch_size(const struct ilo_state_ps *ps)
{
return ps->scratch_size;
}
#endif /* ILO_STATE_SHADER_H */

View File

@ -34,7 +34,8 @@ struct pixel_ff {
uint32_t kernel_offsets[3];
uint8_t grf_starts[3];
bool pcb_enable;
uint8_t scratch_space;
uint8_t per_thread_scratch_space;
uint32_t per_thread_scratch_size;
uint8_t sampler_count;
uint8_t surface_count;
@ -56,13 +57,6 @@ ps_kernel_validate_gen6(const struct ilo_dev *dev,
{
/* "Dispatch GRF Start Register for Constant/Setup Data" is U7 */
const uint8_t max_grf_start = 128;
/*
* From the Sandy Bridge PRM, volume 2 part 1, page 271:
*
* "(Per-Thread Scratch Space)
* Range [0,11] indicating [1k bytes, 2M bytes] in powers of two"
*/
const uint32_t max_scratch_size = 2 * 1024 * 1024;
ILO_DEV_ASSERT(dev, 6, 8);
@ -70,7 +64,6 @@ ps_kernel_validate_gen6(const struct ilo_dev *dev,
assert(kernel->offset % 64 == 0);
assert(kernel->grf_start < max_grf_start);
assert(kernel->scratch_size <= max_scratch_size);
return true;
}
@ -325,7 +318,6 @@ ps_get_gen6_ff_kernels(const struct ilo_dev *dev,
const struct ilo_state_shader_kernel_info *kernel_8 = &info->kernel_8;
const struct ilo_state_shader_kernel_info *kernel_16 = &info->kernel_16;
const struct ilo_state_shader_kernel_info *kernel_32 = &info->kernel_32;
uint32_t scratch_size;
ILO_DEV_ASSERT(dev, 6, 8);
@ -363,21 +355,6 @@ ps_get_gen6_ff_kernels(const struct ilo_dev *dev,
((ff->dispatch_modes & GEN6_PS_DISPATCH_32) &&
kernel_32->pcb_attr_count));
scratch_size = 0;
if ((ff->dispatch_modes & GEN6_PS_DISPATCH_8) &&
scratch_size < kernel_8->scratch_size)
scratch_size = kernel_8->scratch_size;
if ((ff->dispatch_modes & GEN6_PS_DISPATCH_16) &&
scratch_size < kernel_16->scratch_size)
scratch_size = kernel_16->scratch_size;
if ((ff->dispatch_modes & GEN6_PS_DISPATCH_32) &&
scratch_size < kernel_32->scratch_size)
scratch_size = kernel_32->scratch_size;
/* next power of two, starting from 1KB */
ff->scratch_space = (scratch_size > 1024) ?
(util_last_bit(scratch_size - 1) - 10): 0;
/* GPU hangs on Haswell if none of the dispatch mode bits is set */
if (ilo_dev_gen(dev) == ILO_GEN(7.5) && !ff->dispatch_modes)
ff->dispatch_modes |= GEN6_PS_DISPATCH_8;
@ -401,6 +378,21 @@ ps_get_gen6_ff(const struct ilo_dev *dev,
if (!ps_validate_gen6(dev, info) || !ps_get_gen6_ff_kernels(dev, info, ff))
return false;
if (info->per_thread_scratch_size) {
/*
* From the Sandy Bridge PRM, volume 2 part 1, page 271:
*
* "(Per-Thread Scratch Space)
* Range [0,11] indicating [1k bytes, 2M bytes] in powers of two"
*/
assert(info->per_thread_scratch_size <= 2 * 1024 * 1024);
/* next power of two, starting from 1KB */
ff->per_thread_scratch_space = (info->per_thread_scratch_size > 1024) ?
(util_last_bit(info->per_thread_scratch_size - 1) - 10) : 0;
ff->per_thread_scratch_size = 1 << (10 + ff->per_thread_scratch_space);
}
ff->sampler_count = (resource->sampler_count <= 12) ?
(resource->sampler_count + 3) / 4 : 4;
ff->surface_count = resource->surface_count;
@ -441,7 +433,8 @@ ps_set_gen6_3dstate_wm(struct ilo_state_ps *ps,
if (false)
dw2 |= GEN6_THREADDISP_FP_MODE_ALT;
dw3 = ff->scratch_space << GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
dw3 = ff->per_thread_scratch_space <<
GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
dw4 = ff->grf_starts[0] << GEN6_WM_DW4_URB_GRF_START0__SHIFT |
ff->grf_starts[1] << GEN6_WM_DW4_URB_GRF_START1__SHIFT |
@ -539,7 +532,8 @@ ps_set_gen7_3DSTATE_PS(struct ilo_state_ps *ps,
if (false)
dw2 |= GEN6_THREADDISP_FP_MODE_ALT;
dw3 = ff->scratch_space << GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
dw3 = ff->per_thread_scratch_space <<
GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
dw4 = io->posoffset << GEN7_PS_DW4_POSOFFSET__SHIFT |
ff->dispatch_modes << GEN7_PS_DW4_DISPATCH_MODE__SHIFT;
@ -603,7 +597,8 @@ ps_set_gen8_3DSTATE_PS(struct ilo_state_ps *ps,
if (false)
dw3 |= GEN6_THREADDISP_FP_MODE_ALT;
dw4 = ff->scratch_space << GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
dw4 = ff->per_thread_scratch_space <<
GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
dw6 = ff->thread_count << GEN8_PS_DW6_MAX_THREADS__SHIFT |
io->posoffset << GEN8_PS_DW6_POSOFFSET__SHIFT |
@ -705,6 +700,7 @@ ilo_state_ps_init(struct ilo_state_ps *ps,
ret &= ps_set_gen6_3dstate_wm(ps, dev, info, &ff);
}
ps->scratch_size = ff.per_thread_scratch_size * ff.thread_count;
/* save conditions */
ps->conds = ff.conds;

View File

@ -58,10 +58,12 @@ ilo_blit_resolve_slices(struct ilo_context *ilo,
* As it is only used to resolve HiZ right now, return early when there is
* no HiZ.
*/
if (!ilo_image_can_enable_aux(&tex->image, level))
if (tex->image.aux.type != ILO_IMAGE_AUX_HIZ ||
!ilo_image_can_enable_aux(&tex->image, level))
return;
if (ilo_image_can_enable_aux(&tex->image, level)) {
if (tex->image.aux.type == ILO_IMAGE_AUX_HIZ &&
ilo_image_can_enable_aux(&tex->image, level)) {
ilo_blit_resolve_slices_for_hiz(ilo, res, level,
first_slice, num_slices, resolve_flags);
}

View File

@ -547,6 +547,7 @@ static void
ilo_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
{
struct ilo_context *ilo = ilo_context(pipe);
int vs_scratch_size, gs_scratch_size, fs_scratch_size;
if (ilo_debug & ILO_DEBUG_DRAW) {
if (info->indexed) {
@ -574,8 +575,15 @@ ilo_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
ilo_finalize_3d_states(ilo, info);
/* upload kernels */
ilo_shader_cache_upload(ilo->shader_cache, &ilo->cp->builder);
/* prepare scratch spaces */
ilo_shader_cache_get_max_scratch_sizes(ilo->shader_cache,
&vs_scratch_size, &gs_scratch_size, &fs_scratch_size);
ilo_render_prepare_scratch_spaces(ilo->render,
vs_scratch_size, gs_scratch_size, fs_scratch_size);
ilo_blit_resolve_framebuffer(ilo);
/* If draw_vbo ever fails, return immediately. */

View File

@ -67,10 +67,49 @@ ilo_render_create(struct ilo_builder *builder)
void
ilo_render_destroy(struct ilo_render *render)
{
intel_bo_unref(render->vs_scratch.bo);
intel_bo_unref(render->gs_scratch.bo);
intel_bo_unref(render->fs_scratch.bo);
intel_bo_unref(render->workaround_bo);
FREE(render);
}
static bool
resize_scratch_space(struct ilo_render *render,
struct ilo_render_scratch_space *scratch,
const char *name, int new_size)
{
struct intel_bo *bo;
if (scratch->size >= new_size)
return true;
bo = intel_winsys_alloc_bo(render->builder->winsys, name, new_size, false);
if (!bo)
return false;
intel_bo_unref(scratch->bo);
scratch->bo = bo;
scratch->size = new_size;
return true;
}
bool
ilo_render_prepare_scratch_spaces(struct ilo_render *render,
int vs_scratch_size,
int gs_scratch_size,
int fs_scratch_size)
{
return (resize_scratch_space(render, &render->vs_scratch,
"vs scratch", vs_scratch_size) &&
resize_scratch_space(render, &render->gs_scratch,
"gs scratch", gs_scratch_size) &&
resize_scratch_space(render, &render->fs_scratch,
"fs scratch", fs_scratch_size));
}
void
ilo_render_get_sample_position(const struct ilo_render *render,
unsigned sample_count,

View File

@ -43,6 +43,12 @@ ilo_render_create(struct ilo_builder *builder);
void
ilo_render_destroy(struct ilo_render *render);
bool
ilo_render_prepare_scratch_spaces(struct ilo_render *render,
int vs_scratch_size,
int gs_scratch_size,
int fs_scratch_size);
void
ilo_render_get_sample_position(const struct ilo_render *render,
unsigned sample_count,

View File

@ -51,6 +51,11 @@ struct ilo_render {
struct intel_bo *workaround_bo;
struct ilo_render_scratch_space {
struct intel_bo *bo;
int size;
} vs_scratch, gs_scratch, fs_scratch;
struct ilo_state_sample_pattern sample_pattern;
bool hw_ctx_changed;

View File

@ -475,10 +475,13 @@ gen6_draw_vs(struct ilo_render *r,
gen6_wa_pre_3dstate_vs_toggle(r);
if (ilo_dev_gen(r->dev) == ILO_GEN(6) &&
ilo_shader_get_kernel_param(vec->vs, ILO_KERNEL_VS_GEN6_SO))
gen6_3DSTATE_VS(r->builder, &cso->vs_sol.vs, kernel_offset);
else
gen6_3DSTATE_VS(r->builder, &cso->vs, kernel_offset);
ilo_shader_get_kernel_param(vec->vs, ILO_KERNEL_VS_GEN6_SO)) {
gen6_3DSTATE_VS(r->builder, &cso->vs_sol.vs,
kernel_offset, r->vs_scratch.bo);
} else {
gen6_3DSTATE_VS(r->builder, &cso->vs,
kernel_offset, r->vs_scratch.bo);
}
}
}
@ -501,7 +504,8 @@ gen6_draw_gs(struct ilo_render *r,
cso = ilo_shader_get_kernel_cso(vec->gs);
kernel_offset = ilo_shader_get_kernel_offset(vec->gs);
gen6_3DSTATE_GS(r->builder, &cso->gs, kernel_offset);
gen6_3DSTATE_GS(r->builder, &cso->gs,
kernel_offset, r->gs_scratch.bo);
} else if (ilo_dev_gen(r->dev) == ILO_GEN(6) &&
ilo_shader_get_kernel_param(vec->vs, ILO_KERNEL_VS_GEN6_SO)) {
const int verts_per_prim =
@ -524,9 +528,10 @@ gen6_draw_gs(struct ilo_render *r,
kernel_offset = ilo_shader_get_kernel_offset(vec->vs) +
ilo_shader_get_kernel_param(vec->vs, param);
gen6_3DSTATE_GS(r->builder, &cso->vs_sol.sol, kernel_offset);
gen6_3DSTATE_GS(r->builder, &cso->vs_sol.sol,
kernel_offset, r->gs_scratch.bo);
} else {
gen6_3DSTATE_GS(r->builder, &vec->disabled_gs, 0);
gen6_3DSTATE_GS(r->builder, &vec->disabled_gs, 0, NULL);
}
}
}
@ -672,7 +677,7 @@ gen6_draw_wm(struct ilo_render *r,
gen6_wa_pre_3dstate_wm_max_threads(r);
gen6_3DSTATE_WM(r->builder, &vec->rasterizer->rs,
&cso->ps, kernel_offset);
&cso->ps, kernel_offset, r->fs_scratch.bo);
}
}
@ -817,10 +822,10 @@ gen6_rectlist_vs_to_sf(struct ilo_render *r,
gen6_wa_post_3dstate_constant_vs(r);
gen6_wa_pre_3dstate_vs_toggle(r);
gen6_3DSTATE_VS(r->builder, &blitter->vs, 0);
gen6_3DSTATE_VS(r->builder, &blitter->vs, 0, NULL);
gen6_3DSTATE_CONSTANT_GS(r->builder, NULL, NULL, 0);
gen6_3DSTATE_GS(r->builder, &blitter->gs, 0);
gen6_3DSTATE_GS(r->builder, &blitter->gs, 0, NULL);
gen6_3DSTATE_CLIP(r->builder, &blitter->fb.rs);
gen6_3DSTATE_SF(r->builder, &blitter->fb.rs, &blitter->sbe);
@ -833,7 +838,7 @@ gen6_rectlist_wm(struct ilo_render *r,
gen6_3DSTATE_CONSTANT_PS(r->builder, NULL, NULL, 0);
gen6_wa_pre_3dstate_wm_max_threads(r);
gen6_3DSTATE_WM(r->builder, &blitter->fb.rs, &blitter->ps, 0);
gen6_3DSTATE_WM(r->builder, &blitter->fb.rs, &blitter->ps, 0, NULL);
}
static void

View File

@ -318,10 +318,13 @@ gen7_draw_vs(struct ilo_render *r,
const union ilo_shader_cso *cso = ilo_shader_get_kernel_cso(vec->vs);
const uint32_t kernel_offset = ilo_shader_get_kernel_offset(vec->vs);
if (ilo_dev_gen(r->dev) >= ILO_GEN(8))
gen8_3DSTATE_VS(r->builder, &cso->vs, kernel_offset);
else
gen6_3DSTATE_VS(r->builder, &cso->vs, kernel_offset);
if (ilo_dev_gen(r->dev) >= ILO_GEN(8)) {
gen8_3DSTATE_VS(r->builder, &cso->vs,
kernel_offset, r->vs_scratch.bo);
} else {
gen6_3DSTATE_VS(r->builder, &cso->vs,
kernel_offset, r->vs_scratch.bo);
}
}
}
@ -338,9 +341,9 @@ gen7_draw_hs(struct ilo_render *r,
gen7_3DSTATE_CONSTANT_HS(r->builder, 0, 0, 0);
if (ilo_dev_gen(r->dev) >= ILO_GEN(8))
gen8_3DSTATE_HS(r->builder, hs, kernel_offset);
gen8_3DSTATE_HS(r->builder, hs, kernel_offset, NULL);
else
gen7_3DSTATE_HS(r->builder, hs, kernel_offset);
gen7_3DSTATE_HS(r->builder, hs, kernel_offset, NULL);
}
/* 3DSTATE_BINDING_TABLE_POINTERS_HS */
@ -373,9 +376,9 @@ gen7_draw_ds(struct ilo_render *r,
gen7_3DSTATE_CONSTANT_DS(r->builder, 0, 0, 0);
if (ilo_dev_gen(r->dev) >= ILO_GEN(8))
gen8_3DSTATE_DS(r->builder, ds, kernel_offset);
gen8_3DSTATE_DS(r->builder, ds, kernel_offset, NULL);
else
gen7_3DSTATE_DS(r->builder, ds, kernel_offset);
gen7_3DSTATE_DS(r->builder, ds, kernel_offset, NULL);
}
/* 3DSTATE_BINDING_TABLE_POINTERS_DS */
@ -397,9 +400,9 @@ gen7_draw_gs(struct ilo_render *r,
gen7_3DSTATE_CONSTANT_GS(r->builder, 0, 0, 0);
if (ilo_dev_gen(r->dev) >= ILO_GEN(8))
gen8_3DSTATE_GS(r->builder, gs, kernel_offset);
gen8_3DSTATE_GS(r->builder, gs, kernel_offset, NULL);
else
gen7_3DSTATE_GS(r->builder, gs, kernel_offset);
gen7_3DSTATE_GS(r->builder, gs, kernel_offset, NULL);
}
/* 3DSTATE_BINDING_TABLE_POINTERS_GS */
@ -534,7 +537,7 @@ gen7_draw_wm(struct ilo_render *r,
if (r->hw_ctx_changed)
gen7_wa_pre_3dstate_ps_max_threads(r);
gen7_3DSTATE_PS(r->builder, &cso->ps, kernel_offset);
gen7_3DSTATE_PS(r->builder, &cso->ps, kernel_offset, r->fs_scratch.bo);
}
/* 3DSTATE_SCISSOR_STATE_POINTERS */
@ -678,18 +681,18 @@ gen7_rectlist_vs_to_sf(struct ilo_render *r,
const struct ilo_blitter *blitter)
{
gen7_3DSTATE_CONSTANT_VS(r->builder, NULL, NULL, 0);
gen6_3DSTATE_VS(r->builder, &blitter->vs, 0);
gen6_3DSTATE_VS(r->builder, &blitter->vs, 0, NULL);
gen7_3DSTATE_CONSTANT_HS(r->builder, NULL, NULL, 0);
gen7_3DSTATE_HS(r->builder, &blitter->hs, 0);
gen7_3DSTATE_HS(r->builder, &blitter->hs, 0, NULL);
gen7_3DSTATE_TE(r->builder, &blitter->ds);
gen7_3DSTATE_CONSTANT_DS(r->builder, NULL, NULL, 0);
gen7_3DSTATE_DS(r->builder, &blitter->ds, 0);
gen7_3DSTATE_DS(r->builder, &blitter->ds, 0, NULL);
gen7_3DSTATE_CONSTANT_GS(r->builder, NULL, NULL, 0);
gen7_3DSTATE_GS(r->builder, &blitter->gs, 0);
gen7_3DSTATE_GS(r->builder, &blitter->gs, 0, NULL);
gen7_3DSTATE_STREAMOUT(r->builder, &blitter->sol);
@ -711,7 +714,7 @@ gen7_rectlist_wm(struct ilo_render *r,
gen7_3DSTATE_CONSTANT_PS(r->builder, NULL, NULL, 0);
gen7_wa_pre_3dstate_ps_max_threads(r);
gen7_3DSTATE_PS(r->builder, &blitter->ps, 0);
gen7_3DSTATE_PS(r->builder, &blitter->ps, 0, NULL);
}
static void

View File

@ -125,7 +125,7 @@ gen8_draw_wm(struct ilo_render *r,
/* 3DSTATE_PS */
if (DIRTY(FS) || r->instruction_bo_changed)
gen8_3DSTATE_PS(r->builder, &cso->ps, kernel_offset);
gen8_3DSTATE_PS(r->builder, &cso->ps, kernel_offset, r->fs_scratch.bo);
/* 3DSTATE_PS_EXTRA */
if (DIRTY(FS))

View File

@ -474,6 +474,7 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param)
case PIPE_CAP_TGSI_TXQS:
case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
case PIPE_CAP_SHAREABLE_SHADERS:
case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
return 0;
case PIPE_CAP_VENDOR_ID:

View File

@ -37,6 +37,10 @@
struct ilo_shader_cache {
struct list_head shaders;
struct list_head changed;
int max_vs_scratch_size;
int max_gs_scratch_size;
int max_fs_scratch_size;
};
/**
@ -121,6 +125,8 @@ ilo_shader_cache_upload(struct ilo_shader_cache *shc,
struct ilo_shader *sh;
LIST_FOR_EACH_ENTRY(sh, &shader->variants, list) {
int scratch_size, *cur_max;
if (sh->uploaded)
continue;
@ -128,6 +134,29 @@ ilo_shader_cache_upload(struct ilo_shader_cache *shc,
sh->kernel_size, sh->kernel);
sh->uploaded = true;
switch (shader->info.type) {
case PIPE_SHADER_VERTEX:
scratch_size = ilo_state_vs_get_scratch_size(&sh->cso.vs);
cur_max = &shc->max_vs_scratch_size;
break;
case PIPE_SHADER_GEOMETRY:
scratch_size = ilo_state_gs_get_scratch_size(&sh->cso.gs);
cur_max = &shc->max_gs_scratch_size;
break;
case PIPE_SHADER_FRAGMENT:
scratch_size = ilo_state_ps_get_scratch_size(&sh->cso.ps);
cur_max = &shc->max_fs_scratch_size;
break;
default:
assert(!"unknown shader type");
scratch_size = 0;
cur_max = &shc->max_vs_scratch_size;
break;
}
if (*cur_max < scratch_size)
*cur_max = scratch_size;
}
list_del(&shader->list);
@ -155,6 +184,21 @@ ilo_shader_cache_invalidate(struct ilo_shader_cache *shc)
LIST_FOR_EACH_ENTRY(sh, &shader->variants, list)
sh->uploaded = false;
}
shc->max_vs_scratch_size = 0;
shc->max_gs_scratch_size = 0;
shc->max_fs_scratch_size = 0;
}
void
ilo_shader_cache_get_max_scratch_sizes(const struct ilo_shader_cache *shc,
int *vs_scratch_size,
int *gs_scratch_size,
int *fs_scratch_size)
{
*vs_scratch_size = shc->max_vs_scratch_size;
*gs_scratch_size = shc->max_gs_scratch_size;
*fs_scratch_size = shc->max_fs_scratch_size;
}
/**
@ -578,7 +622,6 @@ init_shader_kernel(const struct ilo_shader *kernel,
kern->grf_start = kernel->in.start_grf;
kern->pcb_attr_count =
(kernel->pcb.cbuf0_size + kernel->pcb.clip_state_size + 15) / 16;
kern->scratch_size = 0;
}
static void
@ -602,6 +645,7 @@ init_vs(struct ilo_shader *kernel,
init_shader_urb(kernel, state, &info.urb);
init_shader_kernel(kernel, state, &info.kernel);
init_shader_resource(kernel, state, &info.resource);
info.per_thread_scratch_size = kernel->per_thread_scratch_size;
info.dispatch_enable = true;
info.stats_enable = true;
@ -640,6 +684,7 @@ init_gs(struct ilo_shader *kernel,
init_shader_urb(kernel, state, &info.urb);
init_shader_kernel(kernel, state, &info.kernel);
init_shader_resource(kernel, state, &info.resource);
info.per_thread_scratch_size = kernel->per_thread_scratch_size;
info.dispatch_enable = true;
info.stats_enable = true;
@ -664,6 +709,7 @@ init_ps(struct ilo_shader *kernel,
init_shader_kernel(kernel, state, &info.kernel_8);
init_shader_resource(kernel, state, &info.resource);
info.per_thread_scratch_size = kernel->per_thread_scratch_size;
info.io.has_rt_write = true;
info.io.posoffset = GEN6_POSOFFSET_NONE;
info.io.attr_count = kernel->in.count;

View File

@ -120,6 +120,12 @@ ilo_shader_cache_upload(struct ilo_shader_cache *shc,
void
ilo_shader_cache_invalidate(struct ilo_shader_cache *shc);
void
ilo_shader_cache_get_max_scratch_sizes(const struct ilo_shader_cache *shc,
int *vs_scratch_size,
int *gs_scratch_size,
int *fs_scratch_size);
struct ilo_shader_state *
ilo_shader_create_vs(const struct ilo_dev *dev,
const struct pipe_shader_state *state,

View File

@ -139,6 +139,7 @@ struct ilo_shader {
void *kernel;
int kernel_size;
int per_thread_scratch_size;
struct ilo_kernel_routing routing;
struct ilo_state_ps_params_info ps_params;

View File

@ -299,6 +299,7 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
case PIPE_CAP_TGSI_TXQS:
case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
case PIPE_CAP_SHAREABLE_SHADERS:
case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
return 0;
}
/* should only get here on unhandled cases */

View File

@ -854,10 +854,10 @@ lp_setup_set_fragment_sampler_views(struct lp_setup_context *setup,
jit_tex->img_stride[j] = lp_tex->img_stride[j];
}
if (view->target == PIPE_TEXTURE_1D_ARRAY ||
view->target == PIPE_TEXTURE_2D_ARRAY ||
view->target == PIPE_TEXTURE_CUBE ||
view->target == PIPE_TEXTURE_CUBE_ARRAY) {
if (res->target == PIPE_TEXTURE_1D_ARRAY ||
res->target == PIPE_TEXTURE_2D_ARRAY ||
res->target == PIPE_TEXTURE_CUBE ||
res->target == PIPE_TEXTURE_CUBE_ARRAY) {
/*
* For array textures, we don't have first_layer, instead
* adjust last_layer (stored as depth) plus the mip level offsets

View File

@ -275,10 +275,10 @@ prepare_shader_sampling(
row_stride[j] = lp_tex->row_stride[j];
img_stride[j] = lp_tex->img_stride[j];
}
if (view->target == PIPE_TEXTURE_1D_ARRAY ||
view->target == PIPE_TEXTURE_2D_ARRAY ||
view->target == PIPE_TEXTURE_CUBE ||
view->target == PIPE_TEXTURE_CUBE_ARRAY) {
if (tex->target == PIPE_TEXTURE_1D_ARRAY ||
tex->target == PIPE_TEXTURE_2D_ARRAY ||
tex->target == PIPE_TEXTURE_CUBE ||
tex->target == PIPE_TEXTURE_CUBE_ARRAY) {
num_layers = view->u.tex.last_layer - view->u.tex.first_layer + 1;
for (j = first_level; j <= last_level; j++) {
mip_offsets[j] += view->u.tex.first_layer *

View File

@ -200,7 +200,8 @@ llvmpipe_can_create_resource(struct pipe_screen *screen,
static boolean
llvmpipe_displaytarget_layout(struct llvmpipe_screen *screen,
struct llvmpipe_resource *lpr)
struct llvmpipe_resource *lpr,
const void *map_front_private)
{
struct sw_winsys *winsys = screen->winsys;
@ -215,12 +216,13 @@ llvmpipe_displaytarget_layout(struct llvmpipe_screen *screen,
lpr->base.format,
width, height,
64,
map_front_private,
&lpr->row_stride[0] );
if (lpr->dt == NULL)
return FALSE;
{
if (!map_front_private) {
void *map = winsys->displaytarget_map(winsys, lpr->dt,
PIPE_TRANSFER_WRITE);
@ -235,8 +237,9 @@ llvmpipe_displaytarget_layout(struct llvmpipe_screen *screen,
static struct pipe_resource *
llvmpipe_resource_create(struct pipe_screen *_screen,
const struct pipe_resource *templat)
llvmpipe_resource_create_front(struct pipe_screen *_screen,
const struct pipe_resource *templat,
const void *map_front_private)
{
struct llvmpipe_screen *screen = llvmpipe_screen(_screen);
struct llvmpipe_resource *lpr = CALLOC_STRUCT(llvmpipe_resource);
@ -254,7 +257,7 @@ llvmpipe_resource_create(struct pipe_screen *_screen,
PIPE_BIND_SCANOUT |
PIPE_BIND_SHARED)) {
/* displayable surface */
if (!llvmpipe_displaytarget_layout(screen, lpr))
if (!llvmpipe_displaytarget_layout(screen, lpr, map_front_private))
goto fail;
}
else {
@ -300,7 +303,12 @@ llvmpipe_resource_create(struct pipe_screen *_screen,
FREE(lpr);
return NULL;
}
static struct pipe_resource *
llvmpipe_resource_create(struct pipe_screen *_screen,
const struct pipe_resource *templat)
{
return llvmpipe_resource_create_front(_screen, templat, NULL);
}
static void
llvmpipe_resource_destroy(struct pipe_screen *pscreen,
@ -797,6 +805,7 @@ llvmpipe_init_screen_resource_funcs(struct pipe_screen *screen)
#endif
screen->resource_create = llvmpipe_resource_create;
screen->resource_create_front = llvmpipe_resource_create_front;
screen->resource_destroy = llvmpipe_resource_destroy;
screen->resource_from_handle = llvmpipe_resource_from_handle;
screen->resource_get_handle = llvmpipe_resource_get_handle;

View File

@ -73,6 +73,9 @@ NV50_C_SOURCES := \
nv50/nv50_program.h \
nv50/nv50_push.c \
nv50/nv50_query.c \
nv50/nv50_query.h \
nv50/nv50_query_hw.c \
nv50/nv50_query_hw.h \
nv50/nv50_resource.c \
nv50/nv50_resource.h \
nv50/nv50_screen.c \

View File

@ -1128,7 +1128,6 @@ nv50_ir_init_prog_info(struct nv50_ir_prog_info *info)
info->prop.gp.instanceCount = 1;
info->prop.gp.maxVertices = 1;
}
info->io.clipDistance = 0xff;
info->io.pointSize = 0xff;
info->io.instanceId = 0xff;
info->io.vertexId = 0xff;

View File

@ -73,8 +73,8 @@ public:
Instruction *mkCvt(operation, DataType, Value *, DataType, Value *);
CmpInstruction *mkCmp(operation, CondCode, DataType,
Value *,
DataType, Value *, Value *, Value * = NULL);
Value *,
DataType, Value *, Value *, Value * = NULL);
TexInstruction *mkTex(operation, TexTarget,
uint16_t tic, uint16_t tsc,
const std::vector<Value *> &def,

View File

@ -99,6 +99,7 @@ struct nv50_ir_prog_info
uint8_t sourceRep; /* NV50_PROGRAM_IR */
const void *source;
void *relocData;
void *interpData;
struct nv50_ir_prog_symbol *syms;
uint16_t numSyms;
} bin;
@ -143,6 +144,7 @@ struct nv50_ir_prog_info
bool earlyFragTests;
bool separateFragData;
bool usesDiscard;
bool sampleInterp; /* perform sample interp on all fp inputs */
} fp;
struct {
uint32_t inputOffset; /* base address for user args */
@ -154,9 +156,8 @@ struct nv50_ir_prog_info
uint8_t numBarriers;
struct {
uint8_t clipDistance; /* index of first clip distance output */
uint8_t clipDistanceMask; /* mask of clip distances defined */
uint8_t cullDistanceMask; /* clip distance mode (1 bit per output) */
uint8_t clipDistances; /* number of clip distance outputs */
uint8_t cullDistances; /* number of cull distance outputs */
int8_t genUserClip; /* request user clip planes for ClipVertex */
uint16_t ucpBase; /* base address for UCPs */
uint8_t ucpCBSlot; /* constant buffer index of UCP data */
@ -168,7 +169,6 @@ struct nv50_ir_prog_info
int8_t viewportId; /* output index of ViewportIndex */
uint8_t fragDepth; /* output index of FragDepth */
uint8_t sampleMask; /* output index of SampleMask */
bool sampleInterp; /* perform sample interp on all fp inputs */
uint8_t backFaceColor[2]; /* input/output indices of back face colour */
uint8_t globalAccess; /* 1 for read, 2 for wr, 3 for rw */
bool fp64; /* program uses fp64 math */
@ -198,6 +198,10 @@ extern void nv50_ir_relocate_code(void *relocData, uint32_t *code,
uint32_t libPos,
uint32_t dataPos);
extern void
nv50_ir_change_interp(void *interpData, uint32_t *code,
bool force_per_sample, bool flatshade);
/* obtain code that will be shared among programs */
extern void nv50_ir_get_target_library(uint32_t chipset,
const uint32_t **code, uint32_t *size);

View File

@ -1437,6 +1437,30 @@ CodeEmitterGK110::emitInterpMode(const Instruction *i)
code[1] |= (i->ipa & 0xc) << (19 - 2);
}
/* Patch an already-emitted GK110 interpolation instruction in place.
 *
 * Invoked through nv50_ir_change_interp(): "entry" records the
 * interpolation mode bits (ipa), the perspective-divide source register
 * (reg), and the instruction's 32-bit-word offset (loc) inside "code".
 *
 * flatshade: an input interpolated with the SC (color) mode is rewritten
 *            to flat interpolation with no source register (0xff).
 * force_persample_interp: inputs using the default sample mode and a
 *            non-flat mode get the centroid bit set instead.
 */
static void
interpApply(const InterpEntry *entry, uint32_t *code,
            bool force_persample_interp, bool flatshade)
{
   int ipa = entry->ipa;
   int reg = entry->reg;
   int loc = entry->loc;

   if (flatshade &&
       (ipa & NV50_IR_INTERP_MODE_MASK) == NV50_IR_INTERP_SC) {
      /* flat-shaded color: flat mode, no perspective source register */
      ipa = NV50_IR_INTERP_FLAT;
      reg = 0xff;
   } else if (force_persample_interp &&
              (ipa & NV50_IR_INTERP_SAMPLE_MASK) == NV50_IR_INTERP_DEFAULT &&
              (ipa & NV50_IR_INTERP_MODE_MASK) != NV50_IR_INTERP_FLAT) {
      /* NOTE(review): the centroid bit appears to double as the
       * per-sample request in this encoding -- confirm against the
       * GK110 ISA documentation. */
      ipa |= NV50_IR_INTERP_CENTROID;
   }

   /* re-encode the mode bits in word 1 (same fields as emitInterpMode) */
   code[loc + 1] &= ~(0xf << 19);
   code[loc + 1] |= (ipa & 0x3) << 21;
   code[loc + 1] |= (ipa & 0xc) << (19 - 2);
   /* re-encode the source register field in word 0 */
   code[loc + 0] &= ~(0xff << 23);
   code[loc + 0] |= reg << 23;
}
void
CodeEmitterGK110::emitINTERP(const Instruction *i)
{
@ -1448,10 +1472,13 @@ CodeEmitterGK110::emitINTERP(const Instruction *i)
if (i->saturate)
code[1] |= 1 << 18;
if (i->op == OP_PINTERP)
if (i->op == OP_PINTERP) {
srcId(i->src(1), 23);
else
addInterp(i->ipa, SDATA(i->src(1)).id, interpApply);
} else {
code[0] |= 0xff << 23;
addInterp(i->ipa, 0xff, interpApply);
}
srcId(i->src(0).getIndirect(0), 10);
emitInterpMode(i);

View File

@ -2217,6 +2217,30 @@ CodeEmitterGM107::emitAL2P()
emitGPR (0x00, insn->def(0));
}
/* Patch an already-emitted GM107 IPA instruction in place.
 *
 * Invoked through nv50_ir_change_interp(): "entry" records the
 * interpolation mode bits (ipa), the perspective-divide source register
 * (reg), and the instruction's 32-bit-word offset (loc) inside "code".
 *
 * flatshade: an input interpolated with the SC (color) mode is rewritten
 *            to flat interpolation with no source register (0xff).
 * force_persample_interp: inputs using the default sample mode and a
 *            non-flat mode get the centroid bit set instead.
 */
static void
interpApply(const InterpEntry *entry, uint32_t *code,
            bool force_persample_interp, bool flatshade)
{
   int ipa = entry->ipa;
   int reg = entry->reg;
   int loc = entry->loc;

   if (flatshade &&
       (ipa & NV50_IR_INTERP_MODE_MASK) == NV50_IR_INTERP_SC) {
      /* flat-shaded color: flat mode, no perspective source register */
      ipa = NV50_IR_INTERP_FLAT;
      reg = 0xff;
   } else if (force_persample_interp &&
              (ipa & NV50_IR_INTERP_SAMPLE_MASK) == NV50_IR_INTERP_DEFAULT &&
              (ipa & NV50_IR_INTERP_MODE_MASK) != NV50_IR_INTERP_FLAT) {
      /* NOTE(review): centroid bit appears to request per-sample interp
       * in this encoding -- confirm against the GM107 ISA. */
      ipa |= NV50_IR_INTERP_CENTROID;
   }

   /* re-encode the mode bits in word 1 (fields as used by emitIPA) */
   code[loc + 1] &= ~(0xf << 0x14);
   code[loc + 1] |= (ipa & 0x3) << 0x16;
   code[loc + 1] |= (ipa & 0xc) << (0x14 - 2);
   /* re-encode the source register field in word 0 */
   code[loc + 0] &= ~(0xff << 0x14);
   code[loc + 0] |= reg << 0x14;
}
void
CodeEmitterGM107::emitIPA()
{
@ -2255,10 +2279,12 @@ CodeEmitterGM107::emitIPA()
emitGPR(0x14, insn->src(1));
if (insn->getSampleMode() == NV50_IR_INTERP_OFFSET)
emitGPR(0x27, insn->src(2));
addInterp(insn->ipa, insn->getSrc(1)->reg.data.id, interpApply);
} else {
if (insn->getSampleMode() == NV50_IR_INTERP_OFFSET)
emitGPR(0x27, insn->src(1));
emitGPR(0x14);
addInterp(insn->ipa, 0xff, interpApply);
}
if (insn->getSampleMode() != NV50_IR_INTERP_OFFSET)

View File

@ -372,7 +372,7 @@ CodeEmitterNV50::setSrcFileBits(const Instruction *i, int enc)
mode |= 3 << (s * 2);
break;
default:
ERROR("invalid file on source %i: %u\n", s, i->src(s).getFile());
ERROR("invalid file on source %i: %u\n", s, i->src(s).getFile());
assert(0);
break;
}
@ -876,6 +876,30 @@ CodeEmitterNV50::emitPFETCH(const Instruction *i)
emitFlagsRd(i);
}
/* Patch an already-emitted NV50 INTERP instruction in place.
 *
 * Invoked through nv50_ir_change_interp().  On nv50 the second addInterp()
 * argument is the instruction's encoding size (see emitINTERP, which passes
 * i->encSize), so entry->reg holds 4 or 8 here, not a register index.
 *
 * Only instructions in the default sample mode with a non-flat
 * interpolation mode are touched: the per-sample bit is set or cleared
 * according to force_persample_interp.  The bit position depends on
 * whether the instruction uses the long (8-byte) or short (4-byte)
 * encoding.
 *
 * NOTE(review): flatshade is unused on this generation -- presumably
 * flat shading is handled elsewhere for nv50; verify against callers.
 */
static void
interpApply(const InterpEntry *entry, uint32_t *code,
            bool force_persample_interp, bool flatshade)
{
   int ipa = entry->ipa;
   int encSize = entry->reg;
   int loc = entry->loc;

   if ((ipa & NV50_IR_INTERP_SAMPLE_MASK) == NV50_IR_INTERP_DEFAULT &&
       (ipa & NV50_IR_INTERP_MODE_MASK) != NV50_IR_INTERP_FLAT) {
      if (force_persample_interp) {
         /* set the per-sample bit */
         if (encSize == 8)
            code[loc + 1] |= 1 << 16;
         else
            code[loc + 0] |= 1 << 24;
      } else {
         /* clear the per-sample bit */
         if (encSize == 8)
            code[loc + 1] &= ~(1 << 16);
         else
            code[loc + 0] &= ~(1 << 24);
      }
   }
}
void
CodeEmitterNV50::emitINTERP(const Instruction *i)
{
@ -904,6 +928,8 @@ CodeEmitterNV50::emitINTERP(const Instruction *i)
code[0] |= 1;
emitFlagsRd(i);
}
addInterp(i->ipa, i->encSize, interpApply);
}
void

View File

@ -1618,6 +1618,29 @@ CodeEmitterNVC0::emitInterpMode(const Instruction *i)
}
}
/* Patch an already-emitted NVC0 interpolation instruction in place.
 *
 * Invoked through nv50_ir_change_interp(): "entry" records the
 * interpolation mode bits (ipa), the perspective-divide source register
 * (reg), and the instruction's 32-bit-word offset (loc) inside "code".
 *
 * flatshade: an input interpolated with the SC (color) mode is rewritten
 *            to flat interpolation with no source register (0x3f on this
 *            generation, vs. 0xff on GK110/GM107).
 * force_persample_interp: inputs using the default sample mode and a
 *            non-flat mode get the centroid bit set instead.
 */
static void
interpApply(const InterpEntry *entry, uint32_t *code,
            bool force_persample_interp, bool flatshade)
{
   int ipa = entry->ipa;
   int reg = entry->reg;
   int loc = entry->loc;

   if (flatshade &&
       (ipa & NV50_IR_INTERP_MODE_MASK) == NV50_IR_INTERP_SC) {
      /* flat-shaded color: flat mode, no perspective source register */
      ipa = NV50_IR_INTERP_FLAT;
      reg = 0x3f;
   } else if (force_persample_interp &&
              (ipa & NV50_IR_INTERP_SAMPLE_MASK) == NV50_IR_INTERP_DEFAULT &&
              (ipa & NV50_IR_INTERP_MODE_MASK) != NV50_IR_INTERP_FLAT) {
      /* NOTE(review): centroid bit appears to request per-sample interp
       * in this encoding -- confirm against the NVC0 ISA. */
      ipa |= NV50_IR_INTERP_CENTROID;
   }

   /* single-word encoding: mode bits at 6, register field at 26 */
   code[loc + 0] &= ~(0xf << 6);
   code[loc + 0] |= ipa << 6;
   code[loc + 0] &= ~(0x3f << 26);
   code[loc + 0] |= reg << 26;
}
void
CodeEmitterNVC0::emitINTERP(const Instruction *i)
{
@ -1630,10 +1653,13 @@ CodeEmitterNVC0::emitINTERP(const Instruction *i)
if (i->saturate)
code[0] |= 1 << 5;
if (i->op == OP_PINTERP)
if (i->op == OP_PINTERP) {
srcId(i->src(1), 26);
else
addInterp(i->ipa, SDATA(i->src(1)).id, interpApply);
} else {
code[0] |= 0x3f << 26;
addInterp(i->ipa, 0x3f, interpApply);
}
srcId(i->src(0).getIndirect(0), 20);
} else {

View File

@ -910,7 +910,7 @@ bool Source::scanSource()
info->bin.tlsSpace += (scan.file_max[TGSI_FILE_TEMPORARY] + 1) * 16;
if (info->io.genUserClip > 0) {
info->io.clipDistanceMask = (1 << info->io.genUserClip) - 1;
info->io.clipDistances = info->io.genUserClip;
const unsigned int nOut = (info->io.genUserClip + 3) / 4;
@ -919,7 +919,7 @@ bool Source::scanSource()
info->out[i].id = i;
info->out[i].sn = TGSI_SEMANTIC_CLIPDIST;
info->out[i].si = n;
info->out[i].mask = info->io.clipDistanceMask >> (n * 4);
info->out[i].mask = ((1 << info->io.clipDistances) - 1) >> (n * 4);
}
}
@ -969,6 +969,12 @@ void Source::scanProperty(const struct tgsi_full_property *prop)
else
info->prop.tp.outputPrim = PIPE_PRIM_TRIANGLES; /* anything but points */
break;
case TGSI_PROPERTY_NUM_CLIPDIST_ENABLED:
info->io.clipDistances = prop->u[0].Data;
break;
case TGSI_PROPERTY_NUM_CULLDIST_ENABLED:
info->io.cullDistances = prop->u[0].Data;
break;
default:
INFO("unhandled TGSI property %d\n", prop->Property.PropertyName);
break;
@ -1054,7 +1060,7 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
default:
break;
}
if (decl->Interp.Location || info->io.sampleInterp)
if (decl->Interp.Location)
info->in[i].centroid = 1;
}
@ -1086,8 +1092,6 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
clipVertexOutput = i;
break;
case TGSI_SEMANTIC_CLIPDIST:
info->io.clipDistanceMask |=
decl->Declaration.UsageMask << (si * 4);
info->io.genUserClip = -1;
break;
case TGSI_SEMANTIC_SAMPLEMASK:
@ -1119,6 +1123,10 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
case TGSI_SEMANTIC_VERTEXID:
info->io.vertexId = first;
break;
case TGSI_SEMANTIC_SAMPLEID:
case TGSI_SEMANTIC_SAMPLEPOS:
info->prop.fp.sampleInterp = 1;
break;
default:
break;
}
@ -1338,6 +1346,8 @@ private:
void handleINTERP(Value *dst0[4]);
uint8_t translateInterpMode(const struct nv50_ir_varying *var,
operation& op);
Value *interpolate(tgsi::Instruction::SrcRegister, int c, Value *ptr);
void insertConvergenceOps(BasicBlock *conv, BasicBlock *fork);
@ -1451,8 +1461,8 @@ Converter::makeSym(uint tgsiFile, int fileIdx, int idx, int c, uint32_t address)
return sym;
}
static inline uint8_t
translateInterpMode(const struct nv50_ir_varying *var, operation& op)
uint8_t
Converter::translateInterpMode(const struct nv50_ir_varying *var, operation& op)
{
uint8_t mode = NV50_IR_INTERP_PERSPECTIVE;
@ -1468,7 +1478,7 @@ translateInterpMode(const struct nv50_ir_varying *var, operation& op)
op = (mode == NV50_IR_INTERP_PERSPECTIVE || mode == NV50_IR_INTERP_SC)
? OP_PINTERP : OP_LINTERP;
if (var->centroid)
if (var->centroid || info->prop.fp.sampleInterp)
mode |= NV50_IR_INTERP_CENTROID;
return mode;
@ -1628,7 +1638,7 @@ Converter::fetchSrc(tgsi::Instruction::SrcRegister src, int c, Value *ptr)
// don't load masked inputs, won't be assigned a slot
if (!ptr && !(info->in[idx].mask & (1 << swz)))
return loadImm(NULL, swz == TGSI_SWIZZLE_W ? 1.0f : 0.0f);
if (!ptr && info->in[idx].sn == TGSI_SEMANTIC_FACE)
if (!ptr && info->in[idx].sn == TGSI_SEMANTIC_FACE)
return mkOp1v(OP_RDSV, TYPE_F32, getSSA(), mkSysVal(SV_FACE, 0));
return interpolate(src, c, shiftAddress(ptr));
} else

View File

@ -166,7 +166,7 @@ void Target::destroy(Target *targ)
delete targ;
}
CodeEmitter::CodeEmitter(const Target *target) : targ(target)
CodeEmitter::CodeEmitter(const Target *target) : targ(target), interpInfo(NULL)
{
}
@ -388,6 +388,7 @@ Program::emitBinary(struct nv50_ir_prog_info *info)
}
}
info->bin.relocData = emit->getRelocInfo();
info->bin.interpData = emit->getInterpInfo();
emitSymbolTable(info);
@ -428,6 +429,29 @@ CodeEmitter::addReloc(RelocEntry::Type ty, int w, uint32_t data, uint32_t m,
return true;
}
/* Record an interpolation instruction so its mode can be patched after
 * upload by nv50_ir_change_interp().
 *
 * ipa:   interpolation mode bits of the instruction just emitted
 * reg:   emitter-specific datum (source register on nvc0/gk110/gm107,
 *        encoding size on nv50)
 * apply: the emitter's patch callback; stored once for the whole table
 *
 * The entry array grows lazily in RELOC_ALLOC_INCREMENT-entry steps.
 * The recorded location is the current code size in 32-bit words
 * (codeSize >> 2).  Returns false on allocation failure.
 */
bool
CodeEmitter::addInterp(int ipa, int reg, InterpApply apply)
{
   unsigned int n = interpInfo ? interpInfo->count : 0;

   /* grow the table whenever the current chunk is full (or on first use) */
   if (!(n % RELOC_ALLOC_INCREMENT)) {
      size_t size = sizeof(InterpInfo) + n * sizeof(InterpEntry);
      interpInfo = reinterpret_cast<InterpInfo *>(
         REALLOC(interpInfo, n ? size : 0,
                 size + RELOC_ALLOC_INCREMENT * sizeof(InterpEntry)));
      if (!interpInfo)
         return false;
      if (n == 0)
         /* first allocation: zero the header (count, apply) */
         memset(interpInfo, 0, sizeof(InterpInfo));
   }
   ++interpInfo->count;
   /* loc is the offset of the *next* instruction to be emitted, in words */
   interpInfo->entry[n] = InterpEntry(ipa, reg, codeSize >> 2);
   interpInfo->apply = apply;

   return true;
}
void
RelocEntry::apply(uint32_t *binary, const RelocInfo *info) const
{
@ -471,6 +495,19 @@ nv50_ir_relocate_code(void *relocData, uint32_t *code,
info->entry[i].apply(code, info);
}
/* Rewrite the interpolation modes of an uploaded shader binary.
 *
 * interpData is the InterpInfo table built by CodeEmitter::addInterp()
 * and exposed through nv50_ir_prog_info::bin.interpData; each recorded
 * instruction in "code" is patched via the emitter-specific callback.
 */
void
nv50_ir_change_interp(void *interpData, uint32_t *code,
                      bool force_persample_interp, bool flatshade)
{
   nv50_ir::InterpInfo *info = reinterpret_cast<nv50_ir::InterpInfo *>(
      interpData);

   // force_persample_interp: all non-flat -> per-sample
   // flatshade: all color -> flat
   for (unsigned i = 0; i < info->count; ++i)
      info->apply(&info->entry[i], code, force_persample_interp, flatshade);
}
void
nv50_ir_get_target_library(uint32_t chipset,
const uint32_t **code, uint32_t *size)

View File

@ -58,6 +58,23 @@ struct RelocInfo
RelocEntry entry[0];
};
/* One interpolation instruction recorded at emission time so that its
 * mode can be patched in the uploaded binary (see addInterp() and
 * nv50_ir_change_interp()). */
struct InterpEntry
{
   InterpEntry(int ipa, int reg, int loc) : ipa(ipa), reg(reg), loc(loc) {}
   uint32_t ipa:4; // SC mode used to identify colors
   uint32_t reg:8; // The reg used for perspective division
   uint32_t loc:20; // Let's hope we don't have more than 1M-sized shaders
};

/* Emitter-specific patch callback:
 * (entry, code, force_persample_interp, flatshade). */
typedef void (*InterpApply)(const InterpEntry*, uint32_t*, bool, bool);

/* Growable table of recorded interpolation instructions, handed to
 * nv50_ir_change_interp() via nv50_ir_prog_info::bin.interpData. */
struct InterpInfo
{
   uint32_t count;      // number of valid entries
   InterpApply apply;   // patch callback shared by all entries
   InterpEntry entry[0]; // trailing entries (GNU zero-length array idiom)
};
class CodeEmitter
{
public:
@ -78,6 +95,9 @@ public:
inline void *getRelocInfo() const { return relocInfo; }
bool addInterp(int ipa, int reg, InterpApply apply);
inline void *getInterpInfo() const { return interpInfo; }
virtual void prepareEmission(Program *);
virtual void prepareEmission(Function *);
virtual void prepareEmission(BasicBlock *);
@ -92,6 +112,7 @@ protected:
uint32_t codeSizeLimit;
RelocInfo *relocInfo;
InterpInfo *interpInfo;
};

View File

@ -29,95 +29,95 @@ int
nouveau_heap_init(struct nouveau_heap **heap,
unsigned start, unsigned size)
{
struct nouveau_heap *r;
struct nouveau_heap *r;
r = calloc(1, sizeof(struct nouveau_heap));
if (!r)
return 1;
r = calloc(1, sizeof(struct nouveau_heap));
if (!r)
return 1;
r->start = start;
r->size = size;
*heap = r;
return 0;
r->start = start;
r->size = size;
*heap = r;
return 0;
}
void
nouveau_heap_destroy(struct nouveau_heap **heap)
{
if (!*heap)
return;
free(*heap);
*heap = NULL;
if (!*heap)
return;
free(*heap);
*heap = NULL;
}
int
nouveau_heap_alloc(struct nouveau_heap *heap, unsigned size, void *priv,
struct nouveau_heap **res)
{
struct nouveau_heap *r;
struct nouveau_heap *r;
if (!heap || !size || !res || *res)
return 1;
if (!heap || !size || !res || *res)
return 1;
while (heap) {
if (!heap->in_use && heap->size >= size) {
r = calloc(1, sizeof(struct nouveau_heap));
if (!r)
return 1;
while (heap) {
if (!heap->in_use && heap->size >= size) {
r = calloc(1, sizeof(struct nouveau_heap));
if (!r)
return 1;
r->start = (heap->start + heap->size) - size;
r->size = size;
r->in_use = 1;
r->priv = priv;
r->start = (heap->start + heap->size) - size;
r->size = size;
r->in_use = 1;
r->priv = priv;
heap->size -= size;
heap->size -= size;
r->next = heap->next;
if (heap->next)
heap->next->prev = r;
r->prev = heap;
heap->next = r;
r->next = heap->next;
if (heap->next)
heap->next->prev = r;
r->prev = heap;
heap->next = r;
*res = r;
return 0;
}
*res = r;
return 0;
}
heap = heap->next;
}
heap = heap->next;
}
return 1;
return 1;
}
void
nouveau_heap_free(struct nouveau_heap **res)
{
struct nouveau_heap *r;
struct nouveau_heap *r;
if (!res || !*res)
return;
r = *res;
*res = NULL;
if (!res || !*res)
return;
r = *res;
*res = NULL;
r->in_use = 0;
r->in_use = 0;
if (r->next && !r->next->in_use) {
struct nouveau_heap *new = r->next;
if (r->next && !r->next->in_use) {
struct nouveau_heap *new = r->next;
new->prev = r->prev;
if (r->prev)
r->prev->next = new;
new->size += r->size;
new->start = r->start;
new->prev = r->prev;
if (r->prev)
r->prev->next = new;
new->size += r->size;
new->start = r->start;
free(r);
r = new;
}
free(r);
r = new;
}
if (r->prev && !r->prev->in_use) {
r->prev->next = r->next;
if (r->next)
r->next->prev = r->prev;
r->prev->size += r->size;
free(r);
}
if (r->prev && !r->prev->in_use) {
r->prev->next = r->next;
if (r->next)
r->next->prev = r->prev;
r->prev->size += r->size;
free(r);
}
}

View File

@ -44,15 +44,15 @@
* full size of the heap.
*/
struct nouveau_heap {
struct nouveau_heap *prev;
struct nouveau_heap *next;
struct nouveau_heap *prev;
struct nouveau_heap *next;
void *priv;
void *priv;
unsigned start;
unsigned size;
unsigned start;
unsigned size;
int in_use;
int in_use;
};
int

View File

@ -30,211 +30,211 @@ int nouveau_mesa_debug = 0;
static const char *
nouveau_screen_get_name(struct pipe_screen *pscreen)
{
struct nouveau_device *dev = nouveau_screen(pscreen)->device;
static char buffer[128];
struct nouveau_device *dev = nouveau_screen(pscreen)->device;
static char buffer[128];
util_snprintf(buffer, sizeof(buffer), "NV%02X", dev->chipset);
return buffer;
util_snprintf(buffer, sizeof(buffer), "NV%02X", dev->chipset);
return buffer;
}
static const char *
nouveau_screen_get_vendor(struct pipe_screen *pscreen)
{
return "nouveau";
return "nouveau";
}
static const char *
nouveau_screen_get_device_vendor(struct pipe_screen *pscreen)
{
return "NVIDIA";
return "NVIDIA";
}
static uint64_t
nouveau_screen_get_timestamp(struct pipe_screen *pscreen)
{
int64_t cpu_time = os_time_get() * 1000;
int64_t cpu_time = os_time_get() * 1000;
/* getparam of PTIMER_TIME takes about x10 as long (several usecs) */
/* getparam of PTIMER_TIME takes about x10 as long (several usecs) */
return cpu_time + nouveau_screen(pscreen)->cpu_gpu_time_delta;
return cpu_time + nouveau_screen(pscreen)->cpu_gpu_time_delta;
}
static void
nouveau_screen_fence_ref(struct pipe_screen *pscreen,
struct pipe_fence_handle **ptr,
struct pipe_fence_handle *pfence)
struct pipe_fence_handle **ptr,
struct pipe_fence_handle *pfence)
{
nouveau_fence_ref(nouveau_fence(pfence), (struct nouveau_fence **)ptr);
nouveau_fence_ref(nouveau_fence(pfence), (struct nouveau_fence **)ptr);
}
static boolean
nouveau_screen_fence_finish(struct pipe_screen *screen,
struct pipe_fence_handle *pfence,
struct pipe_fence_handle *pfence,
uint64_t timeout)
{
if (!timeout)
return nouveau_fence_signalled(nouveau_fence(pfence));
if (!timeout)
return nouveau_fence_signalled(nouveau_fence(pfence));
return nouveau_fence_wait(nouveau_fence(pfence));
return nouveau_fence_wait(nouveau_fence(pfence));
}
struct nouveau_bo *
nouveau_screen_bo_from_handle(struct pipe_screen *pscreen,
struct winsys_handle *whandle,
unsigned *out_stride)
struct winsys_handle *whandle,
unsigned *out_stride)
{
struct nouveau_device *dev = nouveau_screen(pscreen)->device;
struct nouveau_bo *bo = 0;
int ret;
struct nouveau_device *dev = nouveau_screen(pscreen)->device;
struct nouveau_bo *bo = 0;
int ret;
if (whandle->type != DRM_API_HANDLE_TYPE_SHARED &&
whandle->type != DRM_API_HANDLE_TYPE_FD) {
debug_printf("%s: attempt to import unsupported handle type %d\n",
__FUNCTION__, whandle->type);
return NULL;
}
if (whandle->type != DRM_API_HANDLE_TYPE_SHARED &&
whandle->type != DRM_API_HANDLE_TYPE_FD) {
debug_printf("%s: attempt to import unsupported handle type %d\n",
__FUNCTION__, whandle->type);
return NULL;
}
if (whandle->type == DRM_API_HANDLE_TYPE_SHARED)
ret = nouveau_bo_name_ref(dev, whandle->handle, &bo);
else
ret = nouveau_bo_prime_handle_ref(dev, whandle->handle, &bo);
if (whandle->type == DRM_API_HANDLE_TYPE_SHARED)
ret = nouveau_bo_name_ref(dev, whandle->handle, &bo);
else
ret = nouveau_bo_prime_handle_ref(dev, whandle->handle, &bo);
if (ret) {
debug_printf("%s: ref name 0x%08x failed with %d\n",
__FUNCTION__, whandle->handle, ret);
return NULL;
}
if (ret) {
debug_printf("%s: ref name 0x%08x failed with %d\n",
__FUNCTION__, whandle->handle, ret);
return NULL;
}
*out_stride = whandle->stride;
return bo;
*out_stride = whandle->stride;
return bo;
}
bool
nouveau_screen_bo_get_handle(struct pipe_screen *pscreen,
struct nouveau_bo *bo,
unsigned stride,
struct winsys_handle *whandle)
struct nouveau_bo *bo,
unsigned stride,
struct winsys_handle *whandle)
{
whandle->stride = stride;
whandle->stride = stride;
if (whandle->type == DRM_API_HANDLE_TYPE_SHARED) {
return nouveau_bo_name_get(bo, &whandle->handle) == 0;
} else if (whandle->type == DRM_API_HANDLE_TYPE_KMS) {
whandle->handle = bo->handle;
return true;
} else if (whandle->type == DRM_API_HANDLE_TYPE_FD) {
return nouveau_bo_set_prime(bo, (int *)&whandle->handle) == 0;
} else {
return false;
}
if (whandle->type == DRM_API_HANDLE_TYPE_SHARED) {
return nouveau_bo_name_get(bo, &whandle->handle) == 0;
} else if (whandle->type == DRM_API_HANDLE_TYPE_KMS) {
whandle->handle = bo->handle;
return true;
} else if (whandle->type == DRM_API_HANDLE_TYPE_FD) {
return nouveau_bo_set_prime(bo, (int *)&whandle->handle) == 0;
} else {
return false;
}
}
int
nouveau_screen_init(struct nouveau_screen *screen, struct nouveau_device *dev)
{
struct pipe_screen *pscreen = &screen->base;
struct nv04_fifo nv04_data = { .vram = 0xbeef0201, .gart = 0xbeef0202 };
struct nvc0_fifo nvc0_data = { };
uint64_t time;
int size, ret;
void *data;
union nouveau_bo_config mm_config;
struct pipe_screen *pscreen = &screen->base;
struct nv04_fifo nv04_data = { .vram = 0xbeef0201, .gart = 0xbeef0202 };
struct nvc0_fifo nvc0_data = { };
uint64_t time;
int size, ret;
void *data;
union nouveau_bo_config mm_config;
char *nv_dbg = getenv("NOUVEAU_MESA_DEBUG");
if (nv_dbg)
nouveau_mesa_debug = atoi(nv_dbg);
char *nv_dbg = getenv("NOUVEAU_MESA_DEBUG");
if (nv_dbg)
nouveau_mesa_debug = atoi(nv_dbg);
/*
* this is initialized to 1 in nouveau_drm_screen_create after screen
* is fully constructed and added to the global screen list.
*/
screen->refcount = -1;
/*
* this is initialized to 1 in nouveau_drm_screen_create after screen
* is fully constructed and added to the global screen list.
*/
screen->refcount = -1;
if (dev->chipset < 0xc0) {
data = &nv04_data;
size = sizeof(nv04_data);
} else {
data = &nvc0_data;
size = sizeof(nvc0_data);
}
if (dev->chipset < 0xc0) {
data = &nv04_data;
size = sizeof(nv04_data);
} else {
data = &nvc0_data;
size = sizeof(nvc0_data);
}
/*
* Set default VRAM domain if not overridden
*/
if (!screen->vram_domain) {
if (dev->vram_size > 0)
screen->vram_domain = NOUVEAU_BO_VRAM;
else
screen->vram_domain = NOUVEAU_BO_GART;
}
/*
* Set default VRAM domain if not overridden
*/
if (!screen->vram_domain) {
if (dev->vram_size > 0)
screen->vram_domain = NOUVEAU_BO_VRAM;
else
screen->vram_domain = NOUVEAU_BO_GART;
}
ret = nouveau_object_new(&dev->object, 0, NOUVEAU_FIFO_CHANNEL_CLASS,
data, size, &screen->channel);
if (ret)
return ret;
screen->device = dev;
ret = nouveau_object_new(&dev->object, 0, NOUVEAU_FIFO_CHANNEL_CLASS,
data, size, &screen->channel);
if (ret)
return ret;
screen->device = dev;
ret = nouveau_client_new(screen->device, &screen->client);
if (ret)
return ret;
ret = nouveau_pushbuf_new(screen->client, screen->channel,
4, 512 * 1024, 1,
&screen->pushbuf);
if (ret)
return ret;
ret = nouveau_client_new(screen->device, &screen->client);
if (ret)
return ret;
ret = nouveau_pushbuf_new(screen->client, screen->channel,
4, 512 * 1024, 1,
&screen->pushbuf);
if (ret)
return ret;
/* getting CPU time first appears to be more accurate */
screen->cpu_gpu_time_delta = os_time_get();
/* getting CPU time first appears to be more accurate */
screen->cpu_gpu_time_delta = os_time_get();
ret = nouveau_getparam(dev, NOUVEAU_GETPARAM_PTIMER_TIME, &time);
if (!ret)
screen->cpu_gpu_time_delta = time - screen->cpu_gpu_time_delta * 1000;
ret = nouveau_getparam(dev, NOUVEAU_GETPARAM_PTIMER_TIME, &time);
if (!ret)
screen->cpu_gpu_time_delta = time - screen->cpu_gpu_time_delta * 1000;
pscreen->get_name = nouveau_screen_get_name;
pscreen->get_vendor = nouveau_screen_get_vendor;
pscreen->get_device_vendor = nouveau_screen_get_device_vendor;
pscreen->get_name = nouveau_screen_get_name;
pscreen->get_vendor = nouveau_screen_get_vendor;
pscreen->get_device_vendor = nouveau_screen_get_device_vendor;
pscreen->get_timestamp = nouveau_screen_get_timestamp;
pscreen->get_timestamp = nouveau_screen_get_timestamp;
pscreen->fence_reference = nouveau_screen_fence_ref;
pscreen->fence_finish = nouveau_screen_fence_finish;
pscreen->fence_reference = nouveau_screen_fence_ref;
pscreen->fence_finish = nouveau_screen_fence_finish;
util_format_s3tc_init();
util_format_s3tc_init();
screen->lowmem_bindings = PIPE_BIND_GLOBAL; /* gallium limit */
screen->vidmem_bindings =
PIPE_BIND_RENDER_TARGET | PIPE_BIND_DEPTH_STENCIL |
PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT |
PIPE_BIND_CURSOR |
PIPE_BIND_SAMPLER_VIEW |
PIPE_BIND_SHADER_BUFFER | PIPE_BIND_SHADER_IMAGE |
PIPE_BIND_COMPUTE_RESOURCE |
PIPE_BIND_GLOBAL;
screen->sysmem_bindings =
PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_STREAM_OUTPUT |
PIPE_BIND_COMMAND_ARGS_BUFFER;
screen->lowmem_bindings = PIPE_BIND_GLOBAL; /* gallium limit */
screen->vidmem_bindings =
PIPE_BIND_RENDER_TARGET | PIPE_BIND_DEPTH_STENCIL |
PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT |
PIPE_BIND_CURSOR |
PIPE_BIND_SAMPLER_VIEW |
PIPE_BIND_SHADER_BUFFER | PIPE_BIND_SHADER_IMAGE |
PIPE_BIND_COMPUTE_RESOURCE |
PIPE_BIND_GLOBAL;
screen->sysmem_bindings =
PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_STREAM_OUTPUT |
PIPE_BIND_COMMAND_ARGS_BUFFER;
memset(&mm_config, 0, sizeof(mm_config));
memset(&mm_config, 0, sizeof(mm_config));
screen->mm_GART = nouveau_mm_create(dev,
NOUVEAU_BO_GART | NOUVEAU_BO_MAP,
&mm_config);
screen->mm_VRAM = nouveau_mm_create(dev, NOUVEAU_BO_VRAM, &mm_config);
return 0;
screen->mm_GART = nouveau_mm_create(dev,
NOUVEAU_BO_GART | NOUVEAU_BO_MAP,
&mm_config);
screen->mm_VRAM = nouveau_mm_create(dev, NOUVEAU_BO_VRAM, &mm_config);
return 0;
}
void
nouveau_screen_fini(struct nouveau_screen *screen)
{
nouveau_mm_destroy(screen->mm_GART);
nouveau_mm_destroy(screen->mm_VRAM);
nouveau_mm_destroy(screen->mm_GART);
nouveau_mm_destroy(screen->mm_VRAM);
nouveau_pushbuf_del(&screen->pushbuf);
nouveau_pushbuf_del(&screen->pushbuf);
nouveau_client_del(&screen->client);
nouveau_object_del(&screen->channel);
nouveau_client_del(&screen->client);
nouveau_object_del(&screen->channel);
nouveau_device_del(&screen->device);
nouveau_device_del(&screen->device);
}

View File

@ -16,47 +16,47 @@ extern int nouveau_mesa_debug;
struct nouveau_bo;
struct nouveau_screen {
struct pipe_screen base;
struct nouveau_device *device;
struct nouveau_object *channel;
struct nouveau_client *client;
struct nouveau_pushbuf *pushbuf;
struct pipe_screen base;
struct nouveau_device *device;
struct nouveau_object *channel;
struct nouveau_client *client;
struct nouveau_pushbuf *pushbuf;
int refcount;
int refcount;
unsigned vidmem_bindings; /* PIPE_BIND_* where VRAM placement is desired */
unsigned sysmem_bindings; /* PIPE_BIND_* where GART placement is desired */
unsigned lowmem_bindings; /* PIPE_BIND_* that require an address < 4 GiB */
/*
* For bindings with (vidmem & sysmem) bits set, PIPE_USAGE_* decides
* placement.
*/
unsigned vidmem_bindings; /* PIPE_BIND_* where VRAM placement is desired */
unsigned sysmem_bindings; /* PIPE_BIND_* where GART placement is desired */
unsigned lowmem_bindings; /* PIPE_BIND_* that require an address < 4 GiB */
/*
* For bindings with (vidmem & sysmem) bits set, PIPE_USAGE_* decides
* placement.
*/
uint16_t class_3d;
uint16_t class_3d;
struct {
struct nouveau_fence *head;
struct nouveau_fence *tail;
struct nouveau_fence *current;
u32 sequence;
u32 sequence_ack;
void (*emit)(struct pipe_screen *, u32 *sequence);
u32 (*update)(struct pipe_screen *);
} fence;
struct {
struct nouveau_fence *head;
struct nouveau_fence *tail;
struct nouveau_fence *current;
u32 sequence;
u32 sequence_ack;
void (*emit)(struct pipe_screen *, u32 *sequence);
u32 (*update)(struct pipe_screen *);
} fence;
struct nouveau_mman *mm_VRAM;
struct nouveau_mman *mm_GART;
struct nouveau_mman *mm_VRAM;
struct nouveau_mman *mm_GART;
int64_t cpu_gpu_time_delta;
int64_t cpu_gpu_time_delta;
bool hint_buf_keep_sysmem_copy;
bool hint_buf_keep_sysmem_copy;
unsigned vram_domain;
unsigned vram_domain;
struct {
unsigned profiles_checked;
unsigned profiles_present;
} firmware_info;
struct {
unsigned profiles_checked;
unsigned profiles_present;
} firmware_info;
#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
union {
@ -100,10 +100,10 @@ struct nouveau_screen {
#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
# define NOUVEAU_DRV_STAT(s, n, v) do { \
(s)->stats.named.n += (v); \
(s)->stats.named.n += (v); \
} while(0)
# define NOUVEAU_DRV_STAT_RES(r, n, v) do { \
nouveau_screen((r)->base.screen)->stats.named.n += (v); \
# define NOUVEAU_DRV_STAT_RES(r, n, v) do { \
nouveau_screen((r)->base.screen)->stats.named.n += (v); \
} while(0)
# define NOUVEAU_DRV_STAT_IFD(x) x
#else
@ -115,20 +115,20 @@ struct nouveau_screen {
static inline struct nouveau_screen *
nouveau_screen(struct pipe_screen *pscreen)
{
return (struct nouveau_screen *)pscreen;
return (struct nouveau_screen *)pscreen;
}
bool nouveau_drm_screen_unref(struct nouveau_screen *screen);
bool
nouveau_screen_bo_get_handle(struct pipe_screen *pscreen,
struct nouveau_bo *bo,
unsigned stride,
struct winsys_handle *whandle);
struct nouveau_bo *bo,
unsigned stride,
struct winsys_handle *whandle);
struct nouveau_bo *
nouveau_screen_bo_from_handle(struct pipe_screen *pscreen,
struct winsys_handle *whandle,
unsigned *out_stride);
struct winsys_handle *whandle,
unsigned *out_stride);
int nouveau_screen_init(struct nouveau_screen *, struct nouveau_device *);

View File

@ -6,9 +6,9 @@
struct nouveau_statebuf_builder
{
uint32_t* p;
uint32_t* p;
#ifdef DEBUG
uint32_t* pend;
uint32_t* pend;
#endif
};
@ -22,7 +22,7 @@ struct nouveau_statebuf_builder
static inline uint32_t sb_header(unsigned subc, unsigned mthd, unsigned size)
{
return (size << 18) | (subc << 13) | mthd;
return (size << 18) | (subc << 13) | mthd;
}
#define sb_method(sb, v, n) sb_data(sb, sb_header(SUBC_3D(v), n));

View File

@ -831,7 +831,7 @@ error:
static int
nouveau_screen_get_video_param(struct pipe_screen *pscreen,
enum pipe_video_profile profile,
enum pipe_video_entrypoint entrypoint,
enum pipe_video_entrypoint entrypoint,
enum pipe_video_cap param)
{
switch (param) {

View File

@ -83,7 +83,7 @@ BEGIN_NI04(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size)
static inline void
PUSH_MTHDl(struct nouveau_pushbuf *push, int subc, int mthd,
struct nouveau_bo *bo, uint32_t offset,
struct nouveau_bufctx *ctx, int bin, uint32_t rw)
struct nouveau_bufctx *ctx, int bin, uint32_t rw)
{
nouveau_bufctx_mthd(ctx, bin, NV04_FIFO_PKHDR(subc, mthd, 1),
bo, offset,

View File

@ -117,22 +117,22 @@ struct nouveau_vp3_decoder {
};
struct comm {
uint32_t bsp_cur_index; // 000
uint32_t byte_ofs; // 004
uint32_t status[0x10]; // 008
uint32_t pos[0x10]; // 048
uint8_t pad[0x100 - 0x88]; // 0a0 bool comm_encrypted
uint32_t bsp_cur_index; // 000
uint32_t byte_ofs; // 004
uint32_t status[0x10]; // 008
uint32_t pos[0x10]; // 048
uint8_t pad[0x100 - 0x88]; // 0a0 bool comm_encrypted
uint32_t pvp_cur_index; // 100
uint32_t acked_byte_ofs; // 104
uint32_t status_vp[0x10]; // 108
uint16_t mb_y[0x10]; //148
uint32_t pvp_stage; // 168 0xeeXX
uint16_t parse_endpos_index; // 16c
uint16_t irq_index; // 16e
uint8_t irq_470[0x10]; // 170
uint32_t irq_pos[0x10]; // 180
uint32_t parse_endpos[0x10]; // 1c0
uint32_t pvp_cur_index; // 100
uint32_t acked_byte_ofs; // 104
uint32_t status_vp[0x10]; // 108
uint16_t mb_y[0x10]; //148
uint32_t pvp_stage; // 168 0xeeXX
uint16_t parse_endpos_index; // 16c
uint16_t irq_index; // 16e
uint8_t irq_470[0x10]; // 170
uint32_t irq_pos[0x10]; // 180
uint32_t parse_endpos[0x10]; // 1c0
};
static inline uint32_t nouveau_vp3_video_align(uint32_t h)

View File

@ -23,90 +23,90 @@
#include "nouveau_vp3_video.h"
struct strparm_bsp {
uint32_t w0[4]; // bits 0-23 length, bits 24-31 addr_hi
uint32_t w1[4]; // bit 8-24 addr_lo
uint32_t unk20; // should be idx * 0x8000000, bitstream offset
uint32_t do_crypto_crap; // set to 0
uint32_t w0[4]; // bits 0-23 length, bits 24-31 addr_hi
uint32_t w1[4]; // bit 8-24 addr_lo
uint32_t unk20; // should be idx * 0x8000000, bitstream offset
uint32_t do_crypto_crap; // set to 0
};
struct mpeg12_picparm_bsp {
uint16_t width;
uint16_t height;
uint8_t picture_structure;
uint8_t picture_coding_type;
uint8_t intra_dc_precision;
uint8_t frame_pred_frame_dct;
uint8_t concealment_motion_vectors;
uint8_t intra_vlc_format;
uint16_t pad;
uint8_t f_code[2][2];
uint16_t width;
uint16_t height;
uint8_t picture_structure;
uint8_t picture_coding_type;
uint8_t intra_dc_precision;
uint8_t frame_pred_frame_dct;
uint8_t concealment_motion_vectors;
uint8_t intra_vlc_format;
uint16_t pad;
uint8_t f_code[2][2];
};
struct mpeg4_picparm_bsp {
uint16_t width;
uint16_t height;
uint8_t vop_time_increment_size;
uint8_t interlaced;
uint8_t resync_marker_disable;
uint16_t width;
uint16_t height;
uint8_t vop_time_increment_size;
uint8_t interlaced;
uint8_t resync_marker_disable;
};
struct vc1_picparm_bsp {
uint16_t width;
uint16_t height;
uint8_t profile; // 04 0 simple, 1 main, 2 advanced
uint8_t postprocflag; // 05
uint8_t pulldown; // 06
uint8_t interlaced; // 07
uint8_t tfcntrflag; // 08
uint8_t finterpflag; // 09
uint8_t psf; // 0a
uint8_t pad; // 0b
uint8_t multires; // 0c
uint8_t syncmarker; // 0d
uint8_t rangered; // 0e
uint8_t maxbframes; // 0f
uint8_t dquant; // 10
uint8_t panscan_flag; // 11
uint8_t refdist_flag; // 12
uint8_t quantizer; // 13
uint8_t extended_mv; // 14
uint8_t extended_dmv; // 15
uint8_t overlap; // 16
uint8_t vstransform; // 17
uint16_t width;
uint16_t height;
uint8_t profile; // 04 0 simple, 1 main, 2 advanced
uint8_t postprocflag; // 05
uint8_t pulldown; // 06
uint8_t interlaced; // 07
uint8_t tfcntrflag; // 08
uint8_t finterpflag; // 09
uint8_t psf; // 0a
uint8_t pad; // 0b
uint8_t multires; // 0c
uint8_t syncmarker; // 0d
uint8_t rangered; // 0e
uint8_t maxbframes; // 0f
uint8_t dquant; // 10
uint8_t panscan_flag; // 11
uint8_t refdist_flag; // 12
uint8_t quantizer; // 13
uint8_t extended_mv; // 14
uint8_t extended_dmv; // 15
uint8_t overlap; // 16
uint8_t vstransform; // 17
};
struct h264_picparm_bsp {
// 00
uint32_t unk00;
// 04
uint32_t log2_max_frame_num_minus4; // 04 checked
uint32_t pic_order_cnt_type; // 08 checked
uint32_t log2_max_pic_order_cnt_lsb_minus4; // 0c checked
uint32_t delta_pic_order_always_zero_flag; // 10, or unknown
// 00
uint32_t unk00;
// 04
uint32_t log2_max_frame_num_minus4; // 04 checked
uint32_t pic_order_cnt_type; // 08 checked
uint32_t log2_max_pic_order_cnt_lsb_minus4; // 0c checked
uint32_t delta_pic_order_always_zero_flag; // 10, or unknown
uint32_t frame_mbs_only_flag; // 14, always 1?
uint32_t direct_8x8_inference_flag; // 18, always 1?
uint32_t width_mb; // 1c checked
uint32_t height_mb; // 20 checked
// 24
//struct picparm2
uint32_t entropy_coding_mode_flag; // 00, checked
uint32_t pic_order_present_flag; // 04 checked
uint32_t unk; // 08 seems to be 0?
uint32_t pad1; // 0c seems to be 0?
uint32_t pad2; // 10 always 0 ?
uint32_t num_ref_idx_l0_active_minus1; // 14 always 0?
uint32_t num_ref_idx_l1_active_minus1; // 18 always 0?
uint32_t weighted_pred_flag; // 1c checked
uint32_t weighted_bipred_idc; // 20 checked
uint32_t pic_init_qp_minus26; // 24 checked
uint32_t deblocking_filter_control_present_flag; // 28 always 1?
uint32_t redundant_pic_cnt_present_flag; // 2c always 0?
uint32_t transform_8x8_mode_flag; // 30 checked
uint32_t mb_adaptive_frame_field_flag; // 34 checked-ish
uint8_t field_pic_flag; // 38 checked
uint8_t bottom_field_flag; // 39 checked
uint8_t real_pad[0x1b]; // XX why?
uint32_t frame_mbs_only_flag; // 14, always 1?
uint32_t direct_8x8_inference_flag; // 18, always 1?
uint32_t width_mb; // 1c checked
uint32_t height_mb; // 20 checked
// 24
//struct picparm2
uint32_t entropy_coding_mode_flag; // 00, checked
uint32_t pic_order_present_flag; // 04 checked
uint32_t unk; // 08 seems to be 0?
uint32_t pad1; // 0c seems to be 0?
uint32_t pad2; // 10 always 0 ?
uint32_t num_ref_idx_l0_active_minus1; // 14 always 0?
uint32_t num_ref_idx_l1_active_minus1; // 18 always 0?
uint32_t weighted_pred_flag; // 1c checked
uint32_t weighted_bipred_idc; // 20 checked
uint32_t pic_init_qp_minus26; // 24 checked
uint32_t deblocking_filter_control_present_flag; // 28 always 1?
uint32_t redundant_pic_cnt_present_flag; // 2c always 0?
uint32_t transform_8x8_mode_flag; // 30 checked
uint32_t mb_adaptive_frame_field_flag; // 34 checked-ish
uint8_t field_pic_flag; // 38 checked
uint8_t bottom_field_flag; // 39 checked
uint8_t real_pad[0x1b]; // XX why?
};
static uint32_t

View File

@ -23,147 +23,147 @@
#include "nouveau_vp3_video.h"
struct mpeg12_picparm_vp {
uint16_t width; // 00 in mb units
uint16_t height; // 02 in mb units
uint16_t width; // 00 in mb units
uint16_t height; // 02 in mb units
uint32_t unk04; // 04 stride for Y?
uint32_t unk08; // 08 stride for CbCr?
uint32_t unk04; // 04 stride for Y?
uint32_t unk08; // 08 stride for CbCr?
uint32_t ofs[6]; // 1c..20 ofs
uint32_t bucket_size; // 24
uint32_t inter_ring_data_size; // 28
uint16_t unk2c; // 2c
uint16_t alternate_scan; // 2e
uint16_t unk30; // 30 not seen set yet
uint16_t picture_structure; // 32
uint16_t pad2[3];
uint16_t unk3a; // 3a set on I frame?
uint32_t ofs[6]; // 1c..20 ofs
uint32_t bucket_size; // 24
uint32_t inter_ring_data_size; // 28
uint16_t unk2c; // 2c
uint16_t alternate_scan; // 2e
uint16_t unk30; // 30 not seen set yet
uint16_t picture_structure; // 32
uint16_t pad2[3];
uint16_t unk3a; // 3a set on I frame?
uint32_t f_code[4]; // 3c
uint32_t picture_coding_type; // 4c
uint32_t intra_dc_precision; // 50
uint32_t q_scale_type; // 54
uint32_t top_field_first; // 58
uint32_t full_pel_forward_vector; // 5c
uint32_t full_pel_backward_vector; // 60
uint8_t intra_quantizer_matrix[0x40]; // 64
uint8_t non_intra_quantizer_matrix[0x40]; // a4
uint32_t f_code[4]; // 3c
uint32_t picture_coding_type; // 4c
uint32_t intra_dc_precision; // 50
uint32_t q_scale_type; // 54
uint32_t top_field_first; // 58
uint32_t full_pel_forward_vector; // 5c
uint32_t full_pel_backward_vector; // 60
uint8_t intra_quantizer_matrix[0x40]; // 64
uint8_t non_intra_quantizer_matrix[0x40]; // a4
};
struct mpeg4_picparm_vp {
uint32_t width; // 00 in normal units
uint32_t height; // 04 in normal units
uint32_t unk08; // stride 1
uint32_t unk0c; // stride 2
uint32_t ofs[6]; // 10..24 ofs
uint32_t bucket_size; // 28
uint32_t pad1; // 2c, pad
uint32_t pad2; // 30
uint32_t inter_ring_data_size; // 34
uint32_t width; // 00 in normal units
uint32_t height; // 04 in normal units
uint32_t unk08; // stride 1
uint32_t unk0c; // stride 2
uint32_t ofs[6]; // 10..24 ofs
uint32_t bucket_size; // 28
uint32_t pad1; // 2c, pad
uint32_t pad2; // 30
uint32_t inter_ring_data_size; // 34
uint32_t trd[2]; // 38, 3c
uint32_t trb[2]; // 40, 44
uint32_t u48; // XXX codec selection? Should test with different values of VdpDecoderProfile
uint16_t f_code_fw; // 4c
uint16_t f_code_bw; // 4e
uint8_t interlaced; // 50
uint32_t trd[2]; // 38, 3c
uint32_t trb[2]; // 40, 44
uint32_t u48; // XXX codec selection? Should test with different values of VdpDecoderProfile
uint16_t f_code_fw; // 4c
uint16_t f_code_bw; // 4e
uint8_t interlaced; // 50
uint8_t quant_type; // bool, written to 528
uint8_t quarter_sample; // bool, written to 548
uint8_t short_video_header; // bool, negated written to 528 shifted by 1
uint8_t u54; // bool, written to 0x740
uint8_t vop_coding_type; // 55
uint8_t rounding_control; // 56
uint8_t alternate_vertical_scan_flag; // 57 bool
uint8_t top_field_first; // bool, written to vuc
uint8_t quant_type; // bool, written to 528
uint8_t quarter_sample; // bool, written to 548
uint8_t short_video_header; // bool, negated written to 528 shifted by 1
uint8_t u54; // bool, written to 0x740
uint8_t vop_coding_type; // 55
uint8_t rounding_control; // 56
uint8_t alternate_vertical_scan_flag; // 57 bool
uint8_t top_field_first; // bool, written to vuc
uint8_t pad4[3]; // 59, 5a, 5b, contains garbage on blob
uint8_t pad4[3]; // 59, 5a, 5b, contains garbage on blob
uint32_t intra[0x10]; // 5c
uint32_t non_intra[0x10]; // 9c
uint32_t pad5[0x10]; // bc what does this do?
// udc..uff pad?
uint32_t intra[0x10]; // 5c
uint32_t non_intra[0x10]; // 9c
uint32_t pad5[0x10]; // bc what does this do?
// udc..uff pad?
};
// Full version, with data pumped from BSP
struct vc1_picparm_vp {
uint32_t bucket_size; // 00
uint32_t pad; // 04
uint32_t bucket_size; // 00
uint32_t pad; // 04
uint32_t inter_ring_data_size; // 08
uint32_t unk0c; // stride 1
uint32_t unk10; // stride 2
uint32_t ofs[6]; // 14..28 ofs
uint32_t inter_ring_data_size; // 08
uint32_t unk0c; // stride 1
uint32_t unk10; // stride 2
uint32_t ofs[6]; // 14..28 ofs
uint16_t width; // 2c
uint16_t height; // 2e
uint16_t width; // 2c
uint16_t height; // 2e
uint8_t profile; // 30 0 = simple, 1 = main, 2 = advanced
uint8_t loopfilter; // 31 written into vuc
uint8_t fastuvmc; // 32, written into vuc
uint8_t dquant; // 33
uint8_t profile; // 30 0 = simple, 1 = main, 2 = advanced
uint8_t loopfilter; // 31 written into vuc
uint8_t fastuvmc; // 32, written into vuc
uint8_t dquant; // 33
uint8_t overlap; // 34
uint8_t quantizer; // 35
uint8_t u36; // 36, bool
uint8_t pad2; // 37, to align to 0x38
uint8_t overlap; // 34
uint8_t quantizer; // 35
uint8_t u36; // 36, bool
uint8_t pad2; // 37, to align to 0x38
};
struct h264_picparm_vp { // 700..a00
uint16_t width, height;
uint32_t stride1, stride2; // 04 08
uint32_t ofs[6]; // 0c..24 in-image offset
uint16_t width, height;
uint32_t stride1, stride2; // 04 08
uint32_t ofs[6]; // 0c..24 in-image offset
uint32_t tmp_stride;
uint32_t bucket_size; // 28 bucket size
uint32_t inter_ring_data_size; // 2c
uint32_t tmp_stride;
uint32_t bucket_size; // 28 bucket size
uint32_t inter_ring_data_size; // 2c
unsigned mb_adaptive_frame_field_flag : 1; // 0
unsigned direct_8x8_inference_flag : 1; // 1 0x02: into vuc ofs 56
unsigned weighted_pred_flag : 1; // 2 0x04
unsigned constrained_intra_pred_flag : 1; // 3 0x08: into vuc ofs 68
unsigned is_reference : 1; // 4
unsigned interlace : 1; // 5 field_pic_flag
unsigned bottom_field_flag : 1; // 6
unsigned second_field : 1; // 7 0x80: nfi yet
unsigned mb_adaptive_frame_field_flag : 1; // 0
unsigned direct_8x8_inference_flag : 1; // 1 0x02: into vuc ofs 56
unsigned weighted_pred_flag : 1; // 2 0x04
unsigned constrained_intra_pred_flag : 1; // 3 0x08: into vuc ofs 68
unsigned is_reference : 1; // 4
unsigned interlace : 1; // 5 field_pic_flag
unsigned bottom_field_flag : 1; // 6
unsigned second_field : 1; // 7 0x80: nfi yet
signed log2_max_frame_num_minus4 : 4; // 31 0..3
unsigned chroma_format_idc : 2; // 31 4..5
unsigned pic_order_cnt_type : 2; // 31 6..7
signed pic_init_qp_minus26 : 6; // 32 0..5
signed chroma_qp_index_offset : 5; // 32 6..10
signed second_chroma_qp_index_offset : 5; // 32 11..15
signed log2_max_frame_num_minus4 : 4; // 31 0..3
unsigned chroma_format_idc : 2; // 31 4..5
unsigned pic_order_cnt_type : 2; // 31 6..7
signed pic_init_qp_minus26 : 6; // 32 0..5
signed chroma_qp_index_offset : 5; // 32 6..10
signed second_chroma_qp_index_offset : 5; // 32 11..15
unsigned weighted_bipred_idc : 2; // 34 0..1
unsigned fifo_dec_index : 7; // 34 2..8
unsigned tmp_idx : 5; // 34 9..13
unsigned frame_number : 16; // 34 14..29
unsigned u34_3030 : 1; // 34 30..30 pp.u34[30:30]
unsigned u34_3131 : 1; // 34 31..31 pad?
unsigned weighted_bipred_idc : 2; // 34 0..1
unsigned fifo_dec_index : 7; // 34 2..8
unsigned tmp_idx : 5; // 34 9..13
unsigned frame_number : 16; // 34 14..29
unsigned u34_3030 : 1; // 34 30..30 pp.u34[30:30]
unsigned u34_3131 : 1; // 34 31..31 pad?
uint32_t field_order_cnt[2]; // 38, 3c
uint32_t field_order_cnt[2]; // 38, 3c
struct { // 40
unsigned fifo_idx : 7; // 00 0..6
unsigned tmp_idx : 5; // 00 7..11
unsigned top_is_reference : 1; // 00 12
unsigned bottom_is_reference : 1; // 00 13
unsigned is_long_term : 1; // 00 14
unsigned notseenyet : 1; // 00 15 pad?
unsigned field_pic_flag : 1; // 00 16
unsigned top_field_marking : 4; // 00 17..20
unsigned bottom_field_marking : 4; // 00 21..24
unsigned pad : 7; // 00 d25..31
struct { // 40
unsigned fifo_idx : 7; // 00 0..6
unsigned tmp_idx : 5; // 00 7..11
unsigned top_is_reference : 1; // 00 12
unsigned bottom_is_reference : 1; // 00 13
unsigned is_long_term : 1; // 00 14
unsigned notseenyet : 1; // 00 15 pad?
unsigned field_pic_flag : 1; // 00 16
unsigned top_field_marking : 4; // 00 17..20
unsigned bottom_field_marking : 4; // 00 21..24
unsigned pad : 7; // 00 d25..31
uint32_t field_order_cnt[2]; // 04,08
uint32_t frame_idx; // 0c
} refs[0x10];
uint32_t field_order_cnt[2]; // 04,08
uint32_t frame_idx; // 0c
} refs[0x10];
uint8_t m4x4[6][16]; // 140
uint8_t m8x8[2][64]; // 1a0
uint32_t u220; // 220 number of extra reorder_list to append?
uint8_t u224[0x20]; // 224..244 reorder_list append ?
uint8_t nfi244[0xb0]; // add some pad to make sure nulls are read
uint8_t m4x4[6][16]; // 140
uint8_t m8x8[2][64]; // 1a0
uint32_t u220; // 220 number of extra reorder_list to append?
uint8_t u224[0x20]; // 224..244 reorder_list append ?
uint8_t nfi244[0xb0]; // add some pad to make sure nulls are read
};
static void

View File

@ -65,18 +65,18 @@ PUSH_KICK(struct nouveau_pushbuf *push)
static inline uint32_t
nouveau_screen_transfer_flags(unsigned pipe)
{
uint32_t flags = 0;
uint32_t flags = 0;
if (!(pipe & PIPE_TRANSFER_UNSYNCHRONIZED)) {
if (pipe & PIPE_TRANSFER_READ)
flags |= NOUVEAU_BO_RD;
if (pipe & PIPE_TRANSFER_WRITE)
flags |= NOUVEAU_BO_WR;
if (pipe & PIPE_TRANSFER_DONTBLOCK)
flags |= NOUVEAU_BO_NOBLOCK;
}
if (!(pipe & PIPE_TRANSFER_UNSYNCHRONIZED)) {
if (pipe & PIPE_TRANSFER_READ)
flags |= NOUVEAU_BO_RD;
if (pipe & PIPE_TRANSFER_WRITE)
flags |= NOUVEAU_BO_WR;
if (pipe & PIPE_TRANSFER_DONTBLOCK)
flags |= NOUVEAU_BO_NOBLOCK;
}
return flags;
return flags;
}
extern struct pipe_screen *

View File

@ -172,6 +172,7 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_TGSI_TXQS:
case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
case PIPE_CAP_SHAREABLE_SHADERS:
case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
return 0;
case PIPE_CAP_VENDOR_ID:

View File

@ -16,6 +16,7 @@
#include "nv50/nv50_program.h"
#include "nv50/nv50_resource.h"
#include "nv50/nv50_transfer.h"
#include "nv50/nv50_query.h"
#include "nouveau_context.h"
#include "nouveau_debug.h"
@ -195,17 +196,6 @@ void nv50_default_kick_notify(struct nouveau_pushbuf *);
/* nv50_draw.c */
extern struct draw_stage *nv50_draw_render_stage(struct nv50_context *);
/* nv50_query.c */
void nv50_init_query_functions(struct nv50_context *);
void nv50_query_pushbuf_submit(struct nouveau_pushbuf *, uint16_t method,
struct pipe_query *, unsigned result_offset);
void nv84_query_fifo_wait(struct nouveau_pushbuf *, struct pipe_query *);
void nva0_so_target_save_offset(struct pipe_context *,
struct pipe_stream_output_target *,
unsigned index, bool seralize);
#define NVA0_QUERY_STREAM_OUTPUT_BUFFER_OFFSET (PIPE_QUERY_TYPES + 0)
/* nv50_shader_state.c */
void nv50_vertprog_validate(struct nv50_context *);
void nv50_gmtyprog_validate(struct nv50_context *);

View File

@ -336,7 +336,6 @@ nv50_program_translate(struct nv50_program *prog, uint16_t chipset)
info->io.ucpCBSlot = 15;
info->io.ucpBase = NV50_CB_AUX_UCP_OFFSET;
info->io.genUserClip = prog->vp.clpd_nr;
info->io.sampleInterp = prog->fp.sample_interp;
info->io.resInfoCBSlot = 15;
info->io.suInfoBase = NV50_CB_AUX_TEX_MS_OFFSET;
@ -374,6 +373,7 @@ nv50_program_translate(struct nv50_program *prog, uint16_t chipset)
prog->code = info->bin.code;
prog->code_size = info->bin.codeSize;
prog->fixups = info->bin.relocData;
prog->interps = info->bin.interpData;
prog->max_gpr = MAX2(4, (info->bin.maxGPR >> 1) + 1);
prog->tls_space = info->bin.tlsSpace;
@ -420,8 +420,8 @@ nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog)
switch (prog->type) {
case PIPE_SHADER_VERTEX: heap = nv50->screen->vp_code_heap; break;
case PIPE_SHADER_GEOMETRY: heap = nv50->screen->fp_code_heap; break;
case PIPE_SHADER_FRAGMENT: heap = nv50->screen->gp_code_heap; break;
case PIPE_SHADER_GEOMETRY: heap = nv50->screen->gp_code_heap; break;
case PIPE_SHADER_FRAGMENT: heap = nv50->screen->fp_code_heap; break;
default:
assert(!"invalid program type");
return false;
@ -456,6 +456,10 @@ nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog)
if (prog->fixups)
nv50_ir_relocate_code(prog->fixups, prog->code, prog->code_base, 0, 0);
if (prog->interps)
nv50_ir_change_interp(prog->interps, prog->code,
prog->fp.force_persample_interp,
false /* flatshade */);
nv50_sifc_linear_u8(&nv50->base, nv50->screen->code,
(prog->type << NV50_CODE_BO_SIZE_LOG2) + prog->code_base,

View File

@ -86,7 +86,7 @@ struct nv50_program {
uint32_t interp; /* 0x1988 */
uint32_t colors; /* 0x1904 */
uint8_t has_samplemask;
uint8_t sample_interp;
uint8_t force_persample_interp;
} fp;
struct {
@ -99,6 +99,7 @@ struct nv50_program {
} gp;
void *fixups; /* relocation records */
void *interps; /* interpolation records */
struct nouveau_heap *mem;

View File

@ -25,356 +25,46 @@
#define NV50_PUSH_EXPLICIT_SPACE_CHECKING
#include "nv50/nv50_context.h"
#include "nv_object.xml.h"
#define NV50_QUERY_STATE_READY 0
#define NV50_QUERY_STATE_ACTIVE 1
#define NV50_QUERY_STATE_ENDED 2
#define NV50_QUERY_STATE_FLUSHED 3
/* XXX: Nested queries, and simultaneous queries on multiple gallium contexts
* (since we use only a single GPU channel per screen) will not work properly.
*
* The first is not that big of an issue because OpenGL does not allow nested
* queries anyway.
*/
struct nv50_query {
uint32_t *data;
uint16_t type;
uint16_t index;
uint32_t sequence;
struct nouveau_bo *bo;
uint32_t base;
uint32_t offset; /* base + i * 32 */
uint8_t state;
bool is64bit;
int nesting; /* only used for occlusion queries */
struct nouveau_mm_allocation *mm;
struct nouveau_fence *fence;
};
#define NV50_QUERY_ALLOC_SPACE 256
static inline struct nv50_query *
nv50_query(struct pipe_query *pipe)
{
return (struct nv50_query *)pipe;
}
static bool
nv50_query_allocate(struct nv50_context *nv50, struct nv50_query *q, int size)
{
struct nv50_screen *screen = nv50->screen;
int ret;
if (q->bo) {
nouveau_bo_ref(NULL, &q->bo);
if (q->mm) {
if (q->state == NV50_QUERY_STATE_READY)
nouveau_mm_free(q->mm);
else
nouveau_fence_work(screen->base.fence.current, nouveau_mm_free_work,
q->mm);
}
}
if (size) {
q->mm = nouveau_mm_allocate(screen->base.mm_GART, size, &q->bo, &q->base);
if (!q->bo)
return false;
q->offset = q->base;
ret = nouveau_bo_map(q->bo, 0, screen->base.client);
if (ret) {
nv50_query_allocate(nv50, q, 0);
return false;
}
q->data = (uint32_t *)((uint8_t *)q->bo->map + q->base);
}
return true;
}
static void
nv50_query_destroy(struct pipe_context *pipe, struct pipe_query *pq)
{
nv50_query_allocate(nv50_context(pipe), nv50_query(pq), 0);
nouveau_fence_ref(NULL, &nv50_query(pq)->fence);
FREE(nv50_query(pq));
}
#include "nv50/nv50_query.h"
#include "nv50/nv50_query_hw.h"
static struct pipe_query *
nv50_query_create(struct pipe_context *pipe, unsigned type, unsigned index)
nv50_create_query(struct pipe_context *pipe, unsigned type, unsigned index)
{
struct nv50_context *nv50 = nv50_context(pipe);
struct nv50_query *q;
q = CALLOC_STRUCT(nv50_query);
if (!q)
return NULL;
if (!nv50_query_allocate(nv50, q, NV50_QUERY_ALLOC_SPACE)) {
FREE(q);
return NULL;
}
q->is64bit = (type == PIPE_QUERY_PRIMITIVES_GENERATED ||
type == PIPE_QUERY_PRIMITIVES_EMITTED ||
type == PIPE_QUERY_SO_STATISTICS ||
type == PIPE_QUERY_PIPELINE_STATISTICS);
q->type = type;
if (q->type == PIPE_QUERY_OCCLUSION_COUNTER) {
q->offset -= 32;
q->data -= 32 / sizeof(*q->data); /* we advance before query_begin ! */
}
q = nv50_hw_create_query(nv50, type, index);
return (struct pipe_query *)q;
}
static void
nv50_query_get(struct nouveau_pushbuf *push, struct nv50_query *q,
unsigned offset, uint32_t get)
nv50_destroy_query(struct pipe_context *pipe, struct pipe_query *pq)
{
offset += q->offset;
PUSH_SPACE(push, 5);
PUSH_REFN (push, q->bo, NOUVEAU_BO_GART | NOUVEAU_BO_WR);
BEGIN_NV04(push, NV50_3D(QUERY_ADDRESS_HIGH), 4);
PUSH_DATAh(push, q->bo->offset + offset);
PUSH_DATA (push, q->bo->offset + offset);
PUSH_DATA (push, q->sequence);
PUSH_DATA (push, get);
struct nv50_query *q = nv50_query(pq);
q->funcs->destroy_query(nv50_context(pipe), q);
}
static boolean
nv50_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
nv50_begin_query(struct pipe_context *pipe, struct pipe_query *pq)
{
struct nv50_context *nv50 = nv50_context(pipe);
struct nouveau_pushbuf *push = nv50->base.pushbuf;
struct nv50_query *q = nv50_query(pq);
/* For occlusion queries we have to change the storage, because a previous
* query might set the initial render conition to false even *after* we re-
* initialized it to true.
*/
if (q->type == PIPE_QUERY_OCCLUSION_COUNTER) {
q->offset += 32;
q->data += 32 / sizeof(*q->data);
if (q->offset - q->base == NV50_QUERY_ALLOC_SPACE)
nv50_query_allocate(nv50, q, NV50_QUERY_ALLOC_SPACE);
/* XXX: can we do this with the GPU, and sync with respect to a previous
* query ?
*/
q->data[0] = q->sequence; /* initialize sequence */
q->data[1] = 1; /* initial render condition = true */
q->data[4] = q->sequence + 1; /* for comparison COND_MODE */
q->data[5] = 0;
}
if (!q->is64bit)
q->data[0] = q->sequence++; /* the previously used one */
switch (q->type) {
case PIPE_QUERY_OCCLUSION_COUNTER:
q->nesting = nv50->screen->num_occlusion_queries_active++;
if (q->nesting) {
nv50_query_get(push, q, 0x10, 0x0100f002);
} else {
PUSH_SPACE(push, 4);
BEGIN_NV04(push, NV50_3D(COUNTER_RESET), 1);
PUSH_DATA (push, NV50_3D_COUNTER_RESET_SAMPLECNT);
BEGIN_NV04(push, NV50_3D(SAMPLECNT_ENABLE), 1);
PUSH_DATA (push, 1);
}
break;
case PIPE_QUERY_PRIMITIVES_GENERATED:
nv50_query_get(push, q, 0x10, 0x06805002);
break;
case PIPE_QUERY_PRIMITIVES_EMITTED:
nv50_query_get(push, q, 0x10, 0x05805002);
break;
case PIPE_QUERY_SO_STATISTICS:
nv50_query_get(push, q, 0x20, 0x05805002);
nv50_query_get(push, q, 0x30, 0x06805002);
break;
case PIPE_QUERY_PIPELINE_STATISTICS:
nv50_query_get(push, q, 0x80, 0x00801002); /* VFETCH, VERTICES */
nv50_query_get(push, q, 0x90, 0x01801002); /* VFETCH, PRIMS */
nv50_query_get(push, q, 0xa0, 0x02802002); /* VP, LAUNCHES */
nv50_query_get(push, q, 0xb0, 0x03806002); /* GP, LAUNCHES */
nv50_query_get(push, q, 0xc0, 0x04806002); /* GP, PRIMS_OUT */
nv50_query_get(push, q, 0xd0, 0x07804002); /* RAST, PRIMS_IN */
nv50_query_get(push, q, 0xe0, 0x08804002); /* RAST, PRIMS_OUT */
nv50_query_get(push, q, 0xf0, 0x0980a002); /* ROP, PIXELS */
break;
case PIPE_QUERY_TIME_ELAPSED:
nv50_query_get(push, q, 0x10, 0x00005002);
break;
default:
break;
}
q->state = NV50_QUERY_STATE_ACTIVE;
return true;
return q->funcs->begin_query(nv50_context(pipe), q);
}
static void
nv50_query_end(struct pipe_context *pipe, struct pipe_query *pq)
nv50_end_query(struct pipe_context *pipe, struct pipe_query *pq)
{
struct nv50_context *nv50 = nv50_context(pipe);
struct nouveau_pushbuf *push = nv50->base.pushbuf;
struct nv50_query *q = nv50_query(pq);
q->state = NV50_QUERY_STATE_ENDED;
switch (q->type) {
case PIPE_QUERY_OCCLUSION_COUNTER:
nv50_query_get(push, q, 0, 0x0100f002);
if (--nv50->screen->num_occlusion_queries_active == 0) {
PUSH_SPACE(push, 2);
BEGIN_NV04(push, NV50_3D(SAMPLECNT_ENABLE), 1);
PUSH_DATA (push, 0);
}
break;
case PIPE_QUERY_PRIMITIVES_GENERATED:
nv50_query_get(push, q, 0, 0x06805002);
break;
case PIPE_QUERY_PRIMITIVES_EMITTED:
nv50_query_get(push, q, 0, 0x05805002);
break;
case PIPE_QUERY_SO_STATISTICS:
nv50_query_get(push, q, 0x00, 0x05805002);
nv50_query_get(push, q, 0x10, 0x06805002);
break;
case PIPE_QUERY_PIPELINE_STATISTICS:
nv50_query_get(push, q, 0x00, 0x00801002); /* VFETCH, VERTICES */
nv50_query_get(push, q, 0x10, 0x01801002); /* VFETCH, PRIMS */
nv50_query_get(push, q, 0x20, 0x02802002); /* VP, LAUNCHES */
nv50_query_get(push, q, 0x30, 0x03806002); /* GP, LAUNCHES */
nv50_query_get(push, q, 0x40, 0x04806002); /* GP, PRIMS_OUT */
nv50_query_get(push, q, 0x50, 0x07804002); /* RAST, PRIMS_IN */
nv50_query_get(push, q, 0x60, 0x08804002); /* RAST, PRIMS_OUT */
nv50_query_get(push, q, 0x70, 0x0980a002); /* ROP, PIXELS */
break;
case PIPE_QUERY_TIMESTAMP:
q->sequence++;
/* fall through */
case PIPE_QUERY_TIME_ELAPSED:
nv50_query_get(push, q, 0, 0x00005002);
break;
case PIPE_QUERY_GPU_FINISHED:
q->sequence++;
nv50_query_get(push, q, 0, 0x1000f010);
break;
case NVA0_QUERY_STREAM_OUTPUT_BUFFER_OFFSET:
q->sequence++;
nv50_query_get(push, q, 0, 0x0d005002 | (q->index << 5));
break;
case PIPE_QUERY_TIMESTAMP_DISJOINT:
/* This query is not issued on GPU because disjoint is forced to false */
q->state = NV50_QUERY_STATE_READY;
break;
default:
assert(0);
break;
}
if (q->is64bit)
nouveau_fence_ref(nv50->screen->base.fence.current, &q->fence);
}
static inline void
nv50_query_update(struct nv50_query *q)
{
if (q->is64bit) {
if (nouveau_fence_signalled(q->fence))
q->state = NV50_QUERY_STATE_READY;
} else {
if (q->data[0] == q->sequence)
q->state = NV50_QUERY_STATE_READY;
}
q->funcs->end_query(nv50_context(pipe), q);
}
static boolean
nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq,
boolean wait, union pipe_query_result *result)
{
struct nv50_context *nv50 = nv50_context(pipe);
struct nv50_query *q = nv50_query(pq);
uint64_t *res64 = (uint64_t *)result;
uint32_t *res32 = (uint32_t *)result;
uint8_t *res8 = (uint8_t *)result;
uint64_t *data64 = (uint64_t *)q->data;
int i;
if (q->state != NV50_QUERY_STATE_READY)
nv50_query_update(q);
if (q->state != NV50_QUERY_STATE_READY) {
if (!wait) {
/* for broken apps that spin on GL_QUERY_RESULT_AVAILABLE */
if (q->state != NV50_QUERY_STATE_FLUSHED) {
q->state = NV50_QUERY_STATE_FLUSHED;
PUSH_KICK(nv50->base.pushbuf);
}
return false;
}
if (nouveau_bo_wait(q->bo, NOUVEAU_BO_RD, nv50->screen->base.client))
return false;
}
q->state = NV50_QUERY_STATE_READY;
switch (q->type) {
case PIPE_QUERY_GPU_FINISHED:
res8[0] = true;
break;
case PIPE_QUERY_OCCLUSION_COUNTER: /* u32 sequence, u32 count, u64 time */
res64[0] = q->data[1] - q->data[5];
break;
case PIPE_QUERY_PRIMITIVES_GENERATED: /* u64 count, u64 time */
case PIPE_QUERY_PRIMITIVES_EMITTED: /* u64 count, u64 time */
res64[0] = data64[0] - data64[2];
break;
case PIPE_QUERY_SO_STATISTICS:
res64[0] = data64[0] - data64[4];
res64[1] = data64[2] - data64[6];
break;
case PIPE_QUERY_PIPELINE_STATISTICS:
for (i = 0; i < 8; ++i)
res64[i] = data64[i * 2] - data64[16 + i * 2];
break;
case PIPE_QUERY_TIMESTAMP:
res64[0] = data64[1];
break;
case PIPE_QUERY_TIMESTAMP_DISJOINT:
res64[0] = 1000000000;
res8[8] = false;
break;
case PIPE_QUERY_TIME_ELAPSED:
res64[0] = data64[1] - data64[3];
break;
case NVA0_QUERY_STREAM_OUTPUT_BUFFER_OFFSET:
res32[0] = q->data[1];
break;
default:
return false;
}
return true;
}
void
nv84_query_fifo_wait(struct nouveau_pushbuf *push, struct pipe_query *pq)
nv50_get_query_result(struct pipe_context *pipe, struct pipe_query *pq,
boolean wait, union pipe_query_result *result)
{
struct nv50_query *q = nv50_query(pq);
unsigned offset = q->offset;
PUSH_SPACE(push, 5);
PUSH_REFN (push, q->bo, NOUVEAU_BO_GART | NOUVEAU_BO_RD);
BEGIN_NV04(push, SUBC_3D(NV84_SUBCHAN_SEMAPHORE_ADDRESS_HIGH), 4);
PUSH_DATAh(push, q->bo->offset + offset);
PUSH_DATA (push, q->bo->offset + offset);
PUSH_DATA (push, q->sequence);
PUSH_DATA (push, NV84_SUBCHAN_SEMAPHORE_TRIGGER_ACQUIRE_EQUAL);
return q->funcs->get_query_result(nv50_context(pipe), q, wait, result);
}
static void
@ -384,7 +74,8 @@ nv50_render_condition(struct pipe_context *pipe,
{
struct nv50_context *nv50 = nv50_context(pipe);
struct nouveau_pushbuf *push = nv50->base.pushbuf;
struct nv50_query *q;
struct nv50_query *q = nv50_query(pq);
struct nv50_hw_query *hq = nv50_hw_query(q);
uint32_t cond;
bool wait =
mode != PIPE_RENDER_COND_NO_WAIT &&
@ -394,7 +85,6 @@ nv50_render_condition(struct pipe_context *pipe,
cond = NV50_3D_COND_MODE_ALWAYS;
}
else {
q = nv50_query(pq);
/* NOTE: comparison of 2 queries only works if both have completed */
switch (q->type) {
case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
@ -405,7 +95,7 @@ nv50_render_condition(struct pipe_context *pipe,
case PIPE_QUERY_OCCLUSION_COUNTER:
case PIPE_QUERY_OCCLUSION_PREDICATE:
if (likely(!condition)) {
if (unlikely(q->nesting))
if (unlikely(hq->nesting))
cond = wait ? NV50_3D_COND_MODE_NOT_EQUAL :
NV50_3D_COND_MODE_ALWAYS;
else
@ -440,48 +130,15 @@ nv50_render_condition(struct pipe_context *pipe,
PUSH_DATA (push, 0);
}
PUSH_REFN (push, q->bo, NOUVEAU_BO_GART | NOUVEAU_BO_RD);
PUSH_REFN (push, hq->bo, NOUVEAU_BO_GART | NOUVEAU_BO_RD);
BEGIN_NV04(push, NV50_3D(COND_ADDRESS_HIGH), 3);
PUSH_DATAh(push, q->bo->offset + q->offset);
PUSH_DATA (push, q->bo->offset + q->offset);
PUSH_DATAh(push, hq->bo->offset + hq->offset);
PUSH_DATA (push, hq->bo->offset + hq->offset);
PUSH_DATA (push, cond);
BEGIN_NV04(push, NV50_2D(COND_ADDRESS_HIGH), 2);
PUSH_DATAh(push, q->bo->offset + q->offset);
PUSH_DATA (push, q->bo->offset + q->offset);
}
void
nv50_query_pushbuf_submit(struct nouveau_pushbuf *push, uint16_t method,
struct pipe_query *pq, unsigned result_offset)
{
struct nv50_query *q = nv50_query(pq);
nv50_query_update(q);
if (q->state != NV50_QUERY_STATE_READY)
nouveau_bo_wait(q->bo, NOUVEAU_BO_RD, push->client);
q->state = NV50_QUERY_STATE_READY;
BEGIN_NV04(push, SUBC_3D(method), 1);
PUSH_DATA (push, q->data[result_offset / 4]);
}
void
nva0_so_target_save_offset(struct pipe_context *pipe,
struct pipe_stream_output_target *ptarg,
unsigned index, bool serialize)
{
struct nv50_so_target *targ = nv50_so_target(ptarg);
if (serialize) {
struct nouveau_pushbuf *push = nv50_context(pipe)->base.pushbuf;
PUSH_SPACE(push, 2);
BEGIN_NV04(push, SUBC_3D(NV50_GRAPH_SERIALIZE), 1);
PUSH_DATA (push, 0);
}
nv50_query(targ->pq)->index = index;
nv50_query_end(pipe, targ->pq);
PUSH_DATAh(push, hq->bo->offset + hq->offset);
PUSH_DATA (push, hq->bo->offset + hq->offset);
}
void
@ -489,10 +146,10 @@ nv50_init_query_functions(struct nv50_context *nv50)
{
struct pipe_context *pipe = &nv50->base.pipe;
pipe->create_query = nv50_query_create;
pipe->destroy_query = nv50_query_destroy;
pipe->begin_query = nv50_query_begin;
pipe->end_query = nv50_query_end;
pipe->get_query_result = nv50_query_result;
pipe->create_query = nv50_create_query;
pipe->destroy_query = nv50_destroy_query;
pipe->begin_query = nv50_begin_query;
pipe->end_query = nv50_end_query;
pipe->get_query_result = nv50_get_query_result;
pipe->render_condition = nv50_render_condition;
}

View File

@ -0,0 +1,33 @@
#ifndef __NV50_QUERY_H__
#define __NV50_QUERY_H__

#include "pipe/p_context.h"

#include "nouveau_context.h"

struct nv50_context;
struct nv50_query;

/* Per-implementation vtable for query objects.  Each backend (see the
 * HW-backed implementation in nv50_query_hw.c) fills this in so the
 * generic pipe_context query hooks can dispatch through nv50_query::funcs. */
struct nv50_query_funcs {
   void (*destroy_query)(struct nv50_context *, struct nv50_query *);
   boolean (*begin_query)(struct nv50_context *, struct nv50_query *);
   void (*end_query)(struct nv50_context *, struct nv50_query *);
   boolean (*get_query_result)(struct nv50_context *, struct nv50_query *,
                               boolean, union pipe_query_result *);
};

/* Base query object.  Implementations embed this as their first member
 * (e.g. struct nv50_hw_query), so the downcast in nv50_hw_query() is valid. */
struct nv50_query {
   const struct nv50_query_funcs *funcs; /* implementation dispatch table */
   uint16_t type;  /* PIPE_QUERY_* or a driver-specific type */
   uint16_t index; /* e.g. stream-output buffer index */
};

/* Downcast from the opaque gallium handle; pipe_query is never defined,
 * it only ever wraps a struct nv50_query. */
static inline struct nv50_query *
nv50_query(struct pipe_query *pipe)
{
   return (struct nv50_query *)pipe;
}

void nv50_init_query_functions(struct nv50_context *);

#endif

View File

@ -0,0 +1,406 @@
/*
* Copyright 2011 Christoph Bumiller
* Copyright 2015 Samuel Pitoiset
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#define NV50_PUSH_EXPLICIT_SPACE_CHECKING
#include "nv50/nv50_context.h"
#include "nv50/nv50_query_hw.h"
#include "nv_object.xml.h"
#define NV50_HW_QUERY_STATE_READY 0
#define NV50_HW_QUERY_STATE_ACTIVE 1
#define NV50_HW_QUERY_STATE_ENDED 2
#define NV50_HW_QUERY_STATE_FLUSHED 3
/* XXX: Nested queries, and simultaneous queries on multiple gallium contexts
* (since we use only a single GPU channel per screen) will not work properly.
*
* The first is not that big of an issue because OpenGL does not allow nested
* queries anyway.
*/
#define NV50_HW_QUERY_ALLOC_SPACE 256
/* (Re)allocate the GART-backed result buffer of a query.
 *
 * Any previous buffer is released first: if the query is READY the
 * suballocation is freed immediately, otherwise the free is deferred
 * through the current fence so the GPU cannot still be writing to it.
 * Calling with size == 0 therefore acts as a pure free (used by
 * nv50_hw_destroy_query and by the error path below).
 *
 * Returns false if suballocation or CPU mapping fails.
 */
static bool
nv50_hw_query_allocate(struct nv50_context *nv50, struct nv50_query *q,
                       int size)
{
   struct nv50_screen *screen = nv50->screen;
   struct nv50_hw_query *hq = nv50_hw_query(q);
   int ret;

   if (hq->bo) {
      nouveau_bo_ref(NULL, &hq->bo);
      if (hq->mm) {
         if (hq->state == NV50_HW_QUERY_STATE_READY)
            nouveau_mm_free(hq->mm);
         else
            /* still in flight: free only once the fence signals */
            nouveau_fence_work(screen->base.fence.current,
                               nouveau_mm_free_work, hq->mm);
      }
   }
   if (size) {
      hq->mm = nouveau_mm_allocate(screen->base.mm_GART, size,
                                   &hq->bo, &hq->base_offset);
      if (!hq->bo)
         return false;
      hq->offset = hq->base_offset;

      ret = nouveau_bo_map(hq->bo, 0, screen->base.client);
      if (ret) {
         /* map failed: release what we just allocated */
         nv50_hw_query_allocate(nv50, q, 0);
         return false;
      }
      hq->data = (uint32_t *)((uint8_t *)hq->bo->map + hq->base_offset);
   }
   return true;
}
/* Emit a QUERY get: ask the 3D engine to write the report selected by
 * 'get' (together with hq->sequence) at 'offset' bytes into the query's
 * buffer.  'offset' is relative to the query's current storage slot. */
static void
nv50_hw_query_get(struct nouveau_pushbuf *push, struct nv50_query *q,
                  unsigned offset, uint32_t get)
{
   struct nv50_hw_query *hq = nv50_hw_query(q);

   offset += hq->offset;

   PUSH_SPACE(push, 5);
   PUSH_REFN (push, hq->bo, NOUVEAU_BO_GART | NOUVEAU_BO_WR);
   BEGIN_NV04(push, NV50_3D(QUERY_ADDRESS_HIGH), 4);
   PUSH_DATAh(push, hq->bo->offset + offset);
   PUSH_DATA (push, hq->bo->offset + offset);
   PUSH_DATA (push, hq->sequence);
   PUSH_DATA (push, get);
}
static inline void
nv50_hw_query_update(struct nv50_query *q)
{
struct nv50_hw_query *hq = nv50_hw_query(q);
if (hq->is64bit) {
if (nouveau_fence_signalled(hq->fence))
hq->state = NV50_HW_QUERY_STATE_READY;
} else {
if (hq->data[0] == hq->sequence)
hq->state = NV50_HW_QUERY_STATE_READY;
}
}
/* Destroy a HW query: release its result buffer (size 0 == free only),
 * drop the fence reference, then free the object itself. */
static void
nv50_hw_destroy_query(struct nv50_context *nv50, struct nv50_query *q)
{
   struct nv50_hw_query *hq = nv50_hw_query(q);

   nv50_hw_query_allocate(nv50, q, 0);
   nouveau_fence_ref(NULL, &hq->fence);
   FREE(hq);
}
/* Start a query: emit the "begin" reports at the upper offsets
 * (0x10 and above) of the query's storage so that nv50_hw_end_query can
 * later write the "end" reports at the lower offsets and
 * nv50_hw_get_query_result can compute the delta. */
static boolean
nv50_hw_begin_query(struct nv50_context *nv50, struct nv50_query *q)
{
   struct nouveau_pushbuf *push = nv50->base.pushbuf;
   struct nv50_hw_query *hq = nv50_hw_query(q);

   /* For occlusion queries we have to change the storage, because a previous
    * query might set the initial render condition to false even *after* we re-
    * initialized it to true.
    */
   if (hq->rotate) {
      /* advance to the next slot; wrap by reallocating once we hit the end */
      hq->offset += hq->rotate;
      hq->data += hq->rotate / sizeof(*hq->data);
      if (hq->offset - hq->base_offset == NV50_HW_QUERY_ALLOC_SPACE)
         nv50_hw_query_allocate(nv50, q, NV50_HW_QUERY_ALLOC_SPACE);

      /* XXX: can we do this with the GPU, and sync with respect to a previous
       * query ?
       */
      hq->data[0] = hq->sequence; /* initialize sequence */
      hq->data[1] = 1; /* initial render condition = true */
      hq->data[4] = hq->sequence + 1; /* for comparison COND_MODE */
      hq->data[5] = 0;
   }
   if (!hq->is64bit)
      hq->data[0] = hq->sequence++; /* the previously used one */

   switch (q->type) {
   case PIPE_QUERY_OCCLUSION_COUNTER:
      /* only reset/enable the sample counter for the outermost query */
      hq->nesting = nv50->screen->num_occlusion_queries_active++;
      if (hq->nesting) {
         nv50_hw_query_get(push, q, 0x10, 0x0100f002);
      } else {
         PUSH_SPACE(push, 4);
         BEGIN_NV04(push, NV50_3D(COUNTER_RESET), 1);
         PUSH_DATA (push, NV50_3D_COUNTER_RESET_SAMPLECNT);
         BEGIN_NV04(push, NV50_3D(SAMPLECNT_ENABLE), 1);
         PUSH_DATA (push, 1);
      }
      break;
   case PIPE_QUERY_PRIMITIVES_GENERATED:
      nv50_hw_query_get(push, q, 0x10, 0x06805002);
      break;
   case PIPE_QUERY_PRIMITIVES_EMITTED:
      nv50_hw_query_get(push, q, 0x10, 0x05805002);
      break;
   case PIPE_QUERY_SO_STATISTICS:
      nv50_hw_query_get(push, q, 0x20, 0x05805002);
      nv50_hw_query_get(push, q, 0x30, 0x06805002);
      break;
   case PIPE_QUERY_PIPELINE_STATISTICS:
      nv50_hw_query_get(push, q, 0x80, 0x00801002); /* VFETCH, VERTICES */
      nv50_hw_query_get(push, q, 0x90, 0x01801002); /* VFETCH, PRIMS */
      nv50_hw_query_get(push, q, 0xa0, 0x02802002); /* VP, LAUNCHES */
      nv50_hw_query_get(push, q, 0xb0, 0x03806002); /* GP, LAUNCHES */
      nv50_hw_query_get(push, q, 0xc0, 0x04806002); /* GP, PRIMS_OUT */
      nv50_hw_query_get(push, q, 0xd0, 0x07804002); /* RAST, PRIMS_IN */
      nv50_hw_query_get(push, q, 0xe0, 0x08804002); /* RAST, PRIMS_OUT */
      nv50_hw_query_get(push, q, 0xf0, 0x0980a002); /* ROP, PIXELS */
      break;
   case PIPE_QUERY_TIME_ELAPSED:
      nv50_hw_query_get(push, q, 0x10, 0x00005002);
      break;
   default:
      assert(0);
      return false;
   }
   hq->state = NV50_HW_QUERY_STATE_ACTIVE;
   return true;
}
/* End a query: emit the "end" reports at the base of the query's storage
 * (the "begin" counterparts went to the higher offsets), mark it ENDED,
 * and for 64-bit queries remember the current fence so completion can be
 * detected in nv50_hw_query_update. */
static void
nv50_hw_end_query(struct nv50_context *nv50, struct nv50_query *q)
{
   struct nouveau_pushbuf *push = nv50->base.pushbuf;
   struct nv50_hw_query *hq = nv50_hw_query(q);

   hq->state = NV50_HW_QUERY_STATE_ENDED;

   switch (q->type) {
   case PIPE_QUERY_OCCLUSION_COUNTER:
      nv50_hw_query_get(push, q, 0, 0x0100f002);
      /* disable the sample counter once the last active query ends */
      if (--nv50->screen->num_occlusion_queries_active == 0) {
         PUSH_SPACE(push, 2);
         BEGIN_NV04(push, NV50_3D(SAMPLECNT_ENABLE), 1);
         PUSH_DATA (push, 0);
      }
      break;
   case PIPE_QUERY_PRIMITIVES_GENERATED:
      nv50_hw_query_get(push, q, 0, 0x06805002);
      break;
   case PIPE_QUERY_PRIMITIVES_EMITTED:
      nv50_hw_query_get(push, q, 0, 0x05805002);
      break;
   case PIPE_QUERY_SO_STATISTICS:
      nv50_hw_query_get(push, q, 0x00, 0x05805002);
      nv50_hw_query_get(push, q, 0x10, 0x06805002);
      break;
   case PIPE_QUERY_PIPELINE_STATISTICS:
      nv50_hw_query_get(push, q, 0x00, 0x00801002); /* VFETCH, VERTICES */
      nv50_hw_query_get(push, q, 0x10, 0x01801002); /* VFETCH, PRIMS */
      nv50_hw_query_get(push, q, 0x20, 0x02802002); /* VP, LAUNCHES */
      nv50_hw_query_get(push, q, 0x30, 0x03806002); /* GP, LAUNCHES */
      nv50_hw_query_get(push, q, 0x40, 0x04806002); /* GP, PRIMS_OUT */
      nv50_hw_query_get(push, q, 0x50, 0x07804002); /* RAST, PRIMS_IN */
      nv50_hw_query_get(push, q, 0x60, 0x08804002); /* RAST, PRIMS_OUT */
      nv50_hw_query_get(push, q, 0x70, 0x0980a002); /* ROP, PIXELS */
      break;
   case PIPE_QUERY_TIMESTAMP:
      hq->sequence++;
      /* fall through */
   case PIPE_QUERY_TIME_ELAPSED:
      nv50_hw_query_get(push, q, 0, 0x00005002);
      break;
   case PIPE_QUERY_GPU_FINISHED:
      hq->sequence++;
      nv50_hw_query_get(push, q, 0, 0x1000f010);
      break;
   case NVA0_HW_QUERY_STREAM_OUTPUT_BUFFER_OFFSET:
      hq->sequence++;
      /* q->index selects which stream-output buffer's offset to report */
      nv50_hw_query_get(push, q, 0, 0x0d005002 | (q->index << 5));
      break;
   case PIPE_QUERY_TIMESTAMP_DISJOINT:
      /* This query is not issued on GPU because disjoint is forced to false */
      hq->state = NV50_HW_QUERY_STATE_READY;
      break;
   default:
      assert(0);
      break;
   }
   if (hq->is64bit)
      nouveau_fence_ref(nv50->screen->base.fence.current, &hq->fence);
}
/* Fetch a query's result, decoding the raw begin/end reports into the
 * union pipe_query_result layout for its type.
 *
 * If the result is not ready and wait is false, the pushbuf is flushed
 * once (state FLUSHED guards against repeated kicks for apps spinning
 * on availability) and false is returned.  With wait == true we block
 * on the buffer until the GPU is done. */
static boolean
nv50_hw_get_query_result(struct nv50_context *nv50, struct nv50_query *q,
                         boolean wait, union pipe_query_result *result)
{
   struct nv50_hw_query *hq = nv50_hw_query(q);
   uint64_t *res64 = (uint64_t *)result;
   uint32_t *res32 = (uint32_t *)result;
   uint8_t *res8 = (uint8_t *)result;
   uint64_t *data64 = (uint64_t *)hq->data;
   int i;

   if (hq->state != NV50_HW_QUERY_STATE_READY)
      nv50_hw_query_update(q);

   if (hq->state != NV50_HW_QUERY_STATE_READY) {
      if (!wait) {
         /* for broken apps that spin on GL_QUERY_RESULT_AVAILABLE */
         if (hq->state != NV50_HW_QUERY_STATE_FLUSHED) {
            hq->state = NV50_HW_QUERY_STATE_FLUSHED;
            PUSH_KICK(nv50->base.pushbuf);
         }
         return false;
      }
      if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nv50->screen->base.client))
         return false;
   }
   hq->state = NV50_HW_QUERY_STATE_READY;

   /* results are end-report minus begin-report (begin was written at the
    * higher offsets by nv50_hw_begin_query) */
   switch (q->type) {
   case PIPE_QUERY_GPU_FINISHED:
      res8[0] = true;
      break;
   case PIPE_QUERY_OCCLUSION_COUNTER: /* u32 sequence, u32 count, u64 time */
      res64[0] = hq->data[1] - hq->data[5];
      break;
   case PIPE_QUERY_PRIMITIVES_GENERATED: /* u64 count, u64 time */
   case PIPE_QUERY_PRIMITIVES_EMITTED: /* u64 count, u64 time */
      res64[0] = data64[0] - data64[2];
      break;
   case PIPE_QUERY_SO_STATISTICS:
      res64[0] = data64[0] - data64[4];
      res64[1] = data64[2] - data64[6];
      break;
   case PIPE_QUERY_PIPELINE_STATISTICS:
      for (i = 0; i < 8; ++i)
         res64[i] = data64[i * 2] - data64[16 + i * 2];
      break;
   case PIPE_QUERY_TIMESTAMP:
      res64[0] = data64[1];
      break;
   case PIPE_QUERY_TIMESTAMP_DISJOINT:
      /* frequency is fixed; disjoint is always reported as false */
      res64[0] = 1000000000;
      res8[8] = false;
      break;
   case PIPE_QUERY_TIME_ELAPSED:
      res64[0] = data64[1] - data64[3];
      break;
   case NVA0_HW_QUERY_STREAM_OUTPUT_BUFFER_OFFSET:
      res32[0] = hq->data[1];
      break;
   default:
      assert(0);
      return false;
   }

   return true;
}
/* Dispatch table installed into nv50_query::funcs for HW-backed queries. */
static const struct nv50_query_funcs hw_query_funcs = {
   .destroy_query = nv50_hw_destroy_query,
   .begin_query = nv50_hw_begin_query,
   .end_query = nv50_hw_end_query,
   .get_query_result = nv50_hw_get_query_result,
};
/* Create a hardware-backed query of the given type (PIPE_QUERY_* or the
 * driver-specific NVA0_HW_QUERY_STREAM_OUTPUT_BUFFER_OFFSET).
 *
 * Occlusion counters rotate through their buffer (hq->rotate) because
 * the render-condition storage must change between begins; the offset
 * is pre-decremented here since begin_query advances before use.
 *
 * Returns NULL on unknown type or allocation failure.
 */
struct nv50_query *
nv50_hw_create_query(struct nv50_context *nv50, unsigned type, unsigned index)
{
   struct nv50_hw_query *hq;
   struct nv50_query *q;

   hq = CALLOC_STRUCT(nv50_hw_query);
   if (!hq)
      return NULL;

   q = &hq->base;
   q->funcs = &hw_query_funcs;
   q->type = type;

   switch (q->type) {
   case PIPE_QUERY_OCCLUSION_COUNTER:
      hq->rotate = 32;
      break;
   case PIPE_QUERY_PRIMITIVES_GENERATED:
   case PIPE_QUERY_PRIMITIVES_EMITTED:
   case PIPE_QUERY_SO_STATISTICS:
   case PIPE_QUERY_PIPELINE_STATISTICS:
      hq->is64bit = true;
      break;
   case PIPE_QUERY_TIME_ELAPSED:
   case PIPE_QUERY_TIMESTAMP:
   case PIPE_QUERY_TIMESTAMP_DISJOINT:
   case PIPE_QUERY_GPU_FINISHED:
   case NVA0_HW_QUERY_STREAM_OUTPUT_BUFFER_OFFSET:
      break;
   default:
      debug_printf("invalid query type: %u\n", type);
      /* free via the pointer we allocated (was FREE(q), which only worked
       * because base is the first member of nv50_hw_query) */
      FREE(hq);
      return NULL;
   }
   if (!nv50_hw_query_allocate(nv50, q, NV50_HW_QUERY_ALLOC_SPACE)) {
      FREE(hq);
      return NULL;
   }
   if (hq->rotate) {
      /* we advance before query_begin ! */
      hq->offset -= hq->rotate;
      hq->data -= hq->rotate / sizeof(*hq->data);
   }

   return q;
}
/* Push one 32-bit word of a query's result as the data of the given 3D
 * method.  Blocks on the result buffer if the result is not ready yet.
 * result_offset is in bytes into the query's data. */
void
nv50_hw_query_pushbuf_submit(struct nouveau_pushbuf *push, uint16_t method,
                             struct nv50_query *q, unsigned result_offset)
{
   struct nv50_hw_query *hq = nv50_hw_query(q);

   nv50_hw_query_update(q);
   if (hq->state != NV50_HW_QUERY_STATE_READY)
      nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, push->client);
   hq->state = NV50_HW_QUERY_STATE_READY;

   BEGIN_NV04(push, SUBC_3D(method), 1);
   PUSH_DATA (push, hq->data[result_offset / 4]);
}
/* Make the FIFO stall until the query's sequence number has been written,
 * using the nv84+ semaphore ACQUIRE_EQUAL mechanism (GPU-side wait, no
 * CPU involvement). */
void
nv84_hw_query_fifo_wait(struct nouveau_pushbuf *push, struct nv50_query *q)
{
   struct nv50_hw_query *hq = nv50_hw_query(q);
   unsigned offset = hq->offset;

   PUSH_SPACE(push, 5);
   PUSH_REFN (push, hq->bo, NOUVEAU_BO_GART | NOUVEAU_BO_RD);
   BEGIN_NV04(push, SUBC_3D(NV84_SUBCHAN_SEMAPHORE_ADDRESS_HIGH), 4);
   PUSH_DATAh(push, hq->bo->offset + offset);
   PUSH_DATA (push, hq->bo->offset + offset);
   PUSH_DATA (push, hq->sequence);
   PUSH_DATA (push, NV84_SUBCHAN_SEMAPHORE_TRIGGER_ACQUIRE_EQUAL);
}

View File

@ -0,0 +1,40 @@
#ifndef __NV50_QUERY_HW_H__
#define __NV50_QUERY_HW_H__

#include "nouveau_fence.h"
#include "nouveau_mm.h"

#include "nv50_query.h"

/* Driver-specific query type (numbered past the gallium PIPE_QUERY_TYPES):
 * reports the current write offset of a stream-output buffer. */
#define NVA0_HW_QUERY_STREAM_OUTPUT_BUFFER_OFFSET (PIPE_QUERY_TYPES + 0)

/* Hardware-backed query; embeds nv50_query as its first member so the
 * nv50_hw_query() downcast below is valid. */
struct nv50_hw_query {
   struct nv50_query base;
   uint32_t *data;          /* CPU mapping of the result buffer */
   uint32_t sequence;       /* sequence number written with each report */
   struct nouveau_bo *bo;   /* GART buffer holding the GPU-written reports */
   uint32_t base_offset;    /* start of this query's suballocation in bo */
   uint32_t offset; /* base + i * rotate */
   uint8_t state;           /* NV50_HW_QUERY_STATE_* (see nv50_query_hw.c) */
   bool is64bit;            /* result completion tracked via fence */
   uint8_t rotate;          /* storage advance per begin (occlusion only) */
   int nesting; /* only used for occlusion queries */
   struct nouveau_mm_allocation *mm;
   struct nouveau_fence *fence;
};

static inline struct nv50_hw_query *
nv50_hw_query(struct nv50_query *q)
{
   return (struct nv50_hw_query *)q;
}

struct nv50_query *
nv50_hw_create_query(struct nv50_context *, unsigned, unsigned);
void
nv50_hw_query_pushbuf_submit(struct nouveau_pushbuf *, uint16_t,
                             struct nv50_query *, unsigned);
void
nv84_hw_query_fifo_wait(struct nouveau_pushbuf *, struct nv50_query *);
#endif

View File

@ -32,8 +32,8 @@ nv50_resource_from_handle(struct pipe_screen * screen,
struct pipe_surface *
nv50_surface_from_buffer(struct pipe_context *pipe,
struct pipe_resource *pbuf,
const struct pipe_surface *templ)
struct pipe_resource *pbuf,
const struct pipe_surface *templ)
{
struct nv50_surface *sf = CALLOC_STRUCT(nv50_surface);
if (!sf)
@ -65,8 +65,8 @@ nv50_surface_from_buffer(struct pipe_context *pipe,
static struct pipe_surface *
nv50_surface_create(struct pipe_context *pipe,
struct pipe_resource *pres,
const struct pipe_surface *templ)
struct pipe_resource *pres,
const struct pipe_surface *templ)
{
/* surfaces are assumed to be miptrees all over the place. */
assert(pres->target != PIPE_BUFFER);

View File

@ -180,6 +180,8 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
case PIPE_CAP_DEPTH_BOUNDS_TEST:
case PIPE_CAP_TGSI_TXQS:
case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
case PIPE_CAP_SHAREABLE_SHADERS:
return 1;
case PIPE_CAP_SEAMLESS_CUBE_MAP:
return 1; /* class_3d >= NVA0_3D_CLASS; */
@ -191,6 +193,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_INDEP_BLEND_FUNC:
case PIPE_CAP_TEXTURE_QUERY_LOD:
case PIPE_CAP_SAMPLE_SHADING:
case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
return class_3d >= NVA3_3D_CLASS;
/* unsupported caps */
@ -215,8 +218,6 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
case PIPE_CAP_SHAREABLE_SHADERS:
return 0;
case PIPE_CAP_VENDOR_ID:

View File

@ -27,6 +27,7 @@
#include "util/u_inlines.h"
#include "nv50/nv50_context.h"
#include "nv50/nv50_query_hw.h"
void
nv50_constbufs_validate(struct nv50_context *nv50)
@ -168,11 +169,23 @@ nv50_fragprog_validate(struct nv50_context *nv50)
{
struct nouveau_pushbuf *push = nv50->base.pushbuf;
struct nv50_program *fp = nv50->fragprog;
struct pipe_rasterizer_state *rast = &nv50->rast->pipe;
fp->fp.sample_interp = nv50->min_samples > 1;
if (fp->fp.force_persample_interp != rast->force_persample_interp) {
/* Force the program to be reuploaded, which will trigger interp fixups
* to get applied
*/
if (fp->mem)
nouveau_heap_free(&fp->mem);
fp->fp.force_persample_interp = rast->force_persample_interp;
}
if (fp->mem && !(nv50->dirty & (NV50_NEW_FRAGPROG | NV50_NEW_MIN_SAMPLES)))
return;
if (!nv50_program_validate(nv50, fp))
return;
return;
nv50_program_update_context_state(nv50, fp, 1);
BEGIN_NV04(push, NV50_3D(FP_REG_ALLOC_TEMP), 1);
@ -629,7 +642,7 @@ nv50_stream_output_validate(struct nv50_context *nv50)
const unsigned n = nv50->screen->base.class_3d >= NVA0_3D_CLASS ? 4 : 3;
if (n == 4 && !targ->clean)
nv84_query_fifo_wait(push, targ->pq);
nv84_hw_query_fifo_wait(push, nv50_query(targ->pq));
BEGIN_NV04(push, NV50_3D(STRMOUT_ADDRESS_HIGH(i)), n);
PUSH_DATAh(push, buf->address + targ->pipe.buffer_offset);
PUSH_DATA (push, buf->address + targ->pipe.buffer_offset);
@ -638,8 +651,8 @@ nv50_stream_output_validate(struct nv50_context *nv50)
PUSH_DATA(push, targ->pipe.buffer_size);
if (!targ->clean) {
assert(targ->pq);
nv50_query_pushbuf_submit(push, NVA0_3D_STRMOUT_OFFSET(i),
targ->pq, 0x4);
nv50_hw_query_pushbuf_submit(push, NVA0_3D_STRMOUT_OFFSET(i),
nv50_query(targ->pq), 0x4);
} else {
BEGIN_NV04(push, NVA0_3D(STRMOUT_OFFSET(i)), 1);
PUSH_DATA(push, 0);

View File

@ -30,6 +30,7 @@
#include "nv50/nv50_stateobj.h"
#include "nv50/nv50_context.h"
#include "nv50/nv50_query_hw.h"
#include "nv50/nv50_3d.xml.h"
#include "nv50/nv50_texture.xml.h"
@ -725,6 +726,9 @@ nv50_sp_state_create(struct pipe_context *pipe,
if (cso->stream_output.num_outputs)
prog->pipe.stream_output = cso->stream_output;
prog->translated = nv50_program_translate(
prog, nv50_context(pipe)->screen->base.device->chipset);
return (void *)prog;
}
@ -1033,7 +1037,7 @@ nv50_so_target_create(struct pipe_context *pipe,
if (nouveau_context(pipe)->screen->class_3d >= NVA0_3D_CLASS) {
targ->pq = pipe->create_query(pipe,
NVA0_QUERY_STREAM_OUTPUT_BUFFER_OFFSET, 0);
NVA0_HW_QUERY_STREAM_OUTPUT_BUFFER_OFFSET, 0);
if (!targ->pq) {
FREE(targ);
return NULL;
@ -1056,6 +1060,24 @@ nv50_so_target_create(struct pipe_context *pipe,
return &targ->pipe;
}
static void
nva0_so_target_save_offset(struct pipe_context *pipe,
struct pipe_stream_output_target *ptarg,
unsigned index, bool serialize)
{
struct nv50_so_target *targ = nv50_so_target(ptarg);
if (serialize) {
struct nouveau_pushbuf *push = nv50_context(pipe)->base.pushbuf;
PUSH_SPACE(push, 2);
BEGIN_NV04(push, SUBC_3D(NV50_GRAPH_SERIALIZE), 1);
PUSH_DATA (push, 0);
}
nv50_query(targ->pq)->index = index;
pipe->end_query(pipe, targ->pq);
}
static void
nv50_so_target_destroy(struct pipe_context *pipe,
struct pipe_stream_output_target *ptarg)

View File

@ -487,7 +487,7 @@ static struct state_validate {
{ nv50_validate_viewport, NV50_NEW_VIEWPORT },
{ nv50_vertprog_validate, NV50_NEW_VERTPROG },
{ nv50_gmtyprog_validate, NV50_NEW_GMTYPROG },
{ nv50_fragprog_validate, NV50_NEW_FRAGPROG |
{ nv50_fragprog_validate, NV50_NEW_FRAGPROG | NV50_NEW_RASTERIZER |
NV50_NEW_MIN_SAMPLES },
{ nv50_fp_linkage_validate, NV50_NEW_FRAGPROG | NV50_NEW_VERTPROG |
NV50_NEW_GMTYPROG | NV50_NEW_RASTERIZER },

View File

@ -220,10 +220,14 @@ nv50_resource_copy_region(struct pipe_context *pipe,
nv04_resource(dst)->status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING;
if (m2mf) {
struct nv50_miptree *src_mt = nv50_miptree(src);
struct nv50_miptree *dst_mt = nv50_miptree(dst);
struct nv50_m2mf_rect drect, srect;
unsigned i;
unsigned nx = util_format_get_nblocksx(src->format, src_box->width);
unsigned ny = util_format_get_nblocksy(src->format, src_box->height);
unsigned nx = util_format_get_nblocksx(src->format, src_box->width)
<< src_mt->ms_x;
unsigned ny = util_format_get_nblocksy(src->format, src_box->height)
<< src_mt->ms_y;
nv50_m2mf_rect_setup(&drect, dst, dst_level, dstx, dsty, dstz);
nv50_m2mf_rect_setup(&srect, src, src_level,
@ -232,15 +236,15 @@ nv50_resource_copy_region(struct pipe_context *pipe,
for (i = 0; i < src_box->depth; ++i) {
nv50_m2mf_transfer_rect(nv50, &drect, &srect, nx, ny);
if (nv50_miptree(dst)->layout_3d)
if (dst_mt->layout_3d)
drect.z++;
else
drect.base += nv50_miptree(dst)->layer_stride;
drect.base += dst_mt->layer_stride;
if (nv50_miptree(src)->layout_3d)
if (src_mt->layout_3d)
srect.z++;
else
srect.base += nv50_miptree(src)->layer_stride;
srect.base += src_mt->layer_stride;
}
return;
}
@ -270,7 +274,7 @@ nv50_resource_copy_region(struct pipe_context *pipe,
static void
nv50_clear_render_target(struct pipe_context *pipe,
struct pipe_surface *dst,
const union pipe_color_union *color,
const union pipe_color_union *color,
unsigned dstx, unsigned dsty,
unsigned width, unsigned height)
{

View File

@ -27,6 +27,7 @@
#include "translate/translate.h"
#include "nv50/nv50_context.h"
#include "nv50/nv50_query_hw.h"
#include "nv50/nv50_resource.h"
#include "nv50/nv50_3d.xml.h"
@ -745,7 +746,8 @@ nva0_draw_stream_output(struct nv50_context *nv50,
PUSH_DATA (push, 0);
BEGIN_NV04(push, NVA0_3D(DRAW_TFB_STRIDE), 1);
PUSH_DATA (push, so->stride);
nv50_query_pushbuf_submit(push, NVA0_3D_DRAW_TFB_BYTES, so->pq, 0x4);
nv50_hw_query_pushbuf_submit(push, NVA0_3D_DRAW_TFB_BYTES,
nv50_query(so->pq), 0x4);
BEGIN_NV04(push, NV50_3D(VERTEX_END_GL), 1);
PUSH_DATA (push, 0);

View File

@ -27,33 +27,33 @@
static void dump_comm_vp(struct nouveau_vp3_decoder *dec, struct comm *comm, u32 comm_seq,
struct nouveau_bo *inter_bo, unsigned slice_size)
{
unsigned i, idx = comm->pvp_cur_index & 0xf;
debug_printf("Status: %08x, stage: %08x\n", comm->status_vp[idx], comm->pvp_stage);
unsigned i, idx = comm->pvp_cur_index & 0xf;
debug_printf("Status: %08x, stage: %08x\n", comm->status_vp[idx], comm->pvp_stage);
#if 0
debug_printf("Acked byte ofs: %x, bsp byte ofs: %x\n", comm->acked_byte_ofs, comm->byte_ofs);
debug_printf("Irq/parse indexes: %i %i\n", comm->irq_index, comm->parse_endpos_index);
debug_printf("Acked byte ofs: %x, bsp byte ofs: %x\n", comm->acked_byte_ofs, comm->byte_ofs);
debug_printf("Irq/parse indexes: %i %i\n", comm->irq_index, comm->parse_endpos_index);
for (i = 0; i != comm->irq_index; ++i)
debug_printf("irq[%i] = { @ %08x -> %04x }\n", i, comm->irq_pos[i], comm->irq_470[i]);
for (i = 0; i != comm->parse_endpos_index; ++i)
debug_printf("parse_endpos[%i] = { @ %08x}\n", i, comm->parse_endpos[i]);
for (i = 0; i != comm->irq_index; ++i)
debug_printf("irq[%i] = { @ %08x -> %04x }\n", i, comm->irq_pos[i], comm->irq_470[i]);
for (i = 0; i != comm->parse_endpos_index; ++i)
debug_printf("parse_endpos[%i] = { @ %08x}\n", i, comm->parse_endpos[i]);
#endif
debug_printf("mb_y = %u\n", comm->mb_y[idx]);
if (comm->status_vp[idx] <= 1)
return;
debug_printf("mb_y = %u\n", comm->mb_y[idx]);
if (comm->status_vp[idx] <= 1)
return;
if ((comm->pvp_stage & 0xff) != 0xff) {
unsigned *map;
int ret = nouveau_bo_map(inter_bo, NOUVEAU_BO_RD|NOUVEAU_BO_NOBLOCK, dec->client);
assert(ret >= 0);
map = inter_bo->map;
for (i = 0; i < comm->byte_ofs + slice_size; i += 0x10) {
debug_printf("%05x: %08x %08x %08x %08x\n", i, map[i/4], map[i/4+1], map[i/4+2], map[i/4+3]);
}
munmap(inter_bo->map, inter_bo->size);
inter_bo->map = NULL;
}
assert((comm->pvp_stage & 0xff) == 0xff);
if ((comm->pvp_stage & 0xff) != 0xff) {
unsigned *map;
int ret = nouveau_bo_map(inter_bo, NOUVEAU_BO_RD|NOUVEAU_BO_NOBLOCK, dec->client);
assert(ret >= 0);
map = inter_bo->map;
for (i = 0; i < comm->byte_ofs + slice_size; i += 0x10) {
debug_printf("%05x: %08x %08x %08x %08x\n", i, map[i/4], map[i/4+1], map[i/4+2], map[i/4+3]);
}
munmap(inter_bo->map, inter_bo->size);
inter_bo->map = NULL;
}
assert((comm->pvp_stage & 0xff) == 0xff);
}
#endif

View File

@ -252,10 +252,10 @@ nvc0_vtgp_gen_header(struct nvc0_program *vp, struct nv50_ir_prog_info *info)
}
}
vp->vp.clip_enable = info->io.clipDistanceMask;
for (i = 0; i < 8; ++i)
if (info->io.cullDistanceMask & (1 << i))
vp->vp.clip_mode |= 1 << (i * 4);
vp->vp.clip_enable =
(1 << (info->io.clipDistances + info->io.cullDistances)) - 1;
for (i = 0; i < info->io.cullDistances; ++i)
vp->vp.clip_mode |= 1 << ((info->io.clipDistances + i) * 4);
if (info->io.genUserClip < 0)
vp->vp.num_ucps = PIPE_MAX_CLIP_PLANES + 1; /* prevent rebuilding */
@ -269,8 +269,6 @@ nvc0_vp_gen_header(struct nvc0_program *vp, struct nv50_ir_prog_info *info)
vp->hdr[0] = 0x20061 | (1 << 10);
vp->hdr[4] = 0xff000;
vp->hdr[18] = info->io.clipDistanceMask;
return nvc0_vtgp_gen_header(vp, info);
}
@ -424,6 +422,11 @@ nvc0_fp_gen_header(struct nvc0_program *fp, struct nv50_ir_prog_info *info)
for (i = 0; i < info->numInputs; ++i) {
m = nvc0_hdr_interp_mode(&info->in[i]);
if (info->in[i].sn == TGSI_SEMANTIC_COLOR) {
fp->fp.colors |= 1 << info->in[i].si;
if (info->in[i].sc)
fp->fp.color_interp[info->in[i].si] = m | (info->in[i].mask << 4);
}
for (c = 0; c < 4; ++c) {
if (!(info->in[i].mask & (1 << c)))
continue;
@ -531,7 +534,6 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset)
info->io.genUserClip = prog->vp.num_ucps;
info->io.ucpBase = 256;
info->io.ucpCBSlot = 15;
info->io.sampleInterp = prog->fp.sample_interp;
if (prog->type == PIPE_SHADER_COMPUTE) {
if (chipset >= NVISA_GK104_CHIPSET) {
@ -575,6 +577,7 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset)
prog->immd_data = info->immd.buf;
prog->immd_size = info->immd.bufSize;
prog->relocs = info->bin.relocData;
prog->interps = info->bin.interpData;
prog->num_gprs = MAX2(4, (info->bin.maxGPR + 1));
prog->num_barriers = info->numBarriers;
@ -713,6 +716,23 @@ nvc0_program_upload_code(struct nvc0_context *nvc0, struct nvc0_program *prog)
if (prog->relocs)
nv50_ir_relocate_code(prog->relocs, prog->code, code_pos, lib_pos, 0);
if (prog->interps) {
nv50_ir_change_interp(prog->interps, prog->code,
prog->fp.force_persample_interp,
prog->fp.flatshade);
for (int i = 0; i < 2; i++) {
unsigned mask = prog->fp.color_interp[i] >> 4;
unsigned interp = prog->fp.color_interp[i] & 3;
if (!mask)
continue;
prog->hdr[14] &= ~(0xff << (8 * i));
if (prog->fp.flatshade)
interp = NVC0_INTERP_FLAT;
for (int c = 0; c < 4; c++)
if (mask & (1 << c))
prog->hdr[14] |= interp << (2 * (4 * i + c));
}
}
#ifdef DEBUG
if (debug_get_bool_option("NV50_PROG_DEBUG", false))
@ -773,6 +793,7 @@ nvc0_program_destroy(struct nvc0_context *nvc0, struct nvc0_program *prog)
FREE(prog->code); /* may be 0 for hardcoded shaders */
FREE(prog->immd_data);
FREE(prog->relocs);
FREE(prog->interps);
if (prog->type == PIPE_SHADER_COMPUTE && prog->cp.syms)
FREE(prog->cp.syms);
if (prog->tfb) {

View File

@ -45,8 +45,10 @@ struct nvc0_program {
} vp;
struct {
uint8_t early_z;
uint8_t in_pos[PIPE_MAX_SHADER_INPUTS];
uint8_t sample_interp;
uint8_t colors;
uint8_t color_interp[2];
bool force_persample_interp;
bool flatshade;
} fp;
struct {
uint32_t tess_mode; /* ~0 if defined by the other stage */
@ -61,6 +63,7 @@ struct nvc0_program {
uint8_t num_barriers;
void *relocs;
void *interps;
struct nvc0_transform_feedback_state *tfb;

View File

@ -28,6 +28,7 @@
#include "nvc0/nvc0_query.h"
#include "nvc0/nvc0_query_sw.h"
#include "nvc0/nvc0_query_hw.h"
#include "nvc0/nvc0_query_hw_metric.h"
#include "nvc0/nvc0_query_hw_sm.h"
static struct pipe_query *
@ -188,7 +189,7 @@ nvc0_screen_get_driver_query_group_info(struct pipe_screen *pscreen,
count++;
} else
if (screen->base.class_3d < NVE4_3D_CLASS) {
count++;
count += 2;
}
}
}
@ -218,6 +219,17 @@ nvc0_screen_get_driver_query_group_info(struct pipe_screen *pscreen,
return 1;
}
}
} else
if (id == NVC0_HW_METRIC_QUERY_GROUP) {
if (screen->compute) {
if (screen->base.class_3d < NVE4_3D_CLASS) {
info->name = "Performance metrics";
info->type = PIPE_DRIVER_QUERY_GROUP_TYPE_GPU;
info->max_active_queries = 1;
info->num_queries = NVC0_HW_METRIC_QUERY_COUNT;
return 1;
}
}
}
#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
else if (id == NVC0_SW_QUERY_DRV_STAT_GROUP) {

View File

@ -32,7 +32,8 @@ nvc0_query(struct pipe_query *pipe)
* Driver queries groups:
*/
#define NVC0_HW_SM_QUERY_GROUP 0
#define NVC0_SW_QUERY_DRV_STAT_GROUP 1
#define NVC0_HW_METRIC_QUERY_GROUP 1
#define NVC0_SW_QUERY_DRV_STAT_GROUP 2
void nvc0_init_query_functions(struct nvc0_context *);

View File

@ -431,7 +431,7 @@ nvc0_hw_metric_get_driver_query_info(struct nvc0_screen *screen, unsigned id,
id = nvc0_hw_metric_get_next_query_id(queries, id);
info->name = nvc0_hw_metric_names[id];
info->query_type = NVC0_HW_METRIC_QUERY(id);
info->group_id = -1;
info->group_id = NVC0_HW_METRIC_QUERY_GROUP;
return 1;
}
}

View File

@ -26,7 +26,8 @@ nvc0_resource_from_handle(struct pipe_screen * screen,
} else {
struct pipe_resource *res = nv50_miptree_from_handle(screen,
templ, whandle);
nv04_resource(res)->vtbl = &nvc0_miptree_vtbl;
if (res)
nv04_resource(res)->vtbl = &nvc0_miptree_vtbl;
return res;
}
}

View File

@ -179,6 +179,9 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
case PIPE_CAP_DEPTH_BOUNDS_TEST:
case PIPE_CAP_TGSI_TXQS:
case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
case PIPE_CAP_SHAREABLE_SHADERS:
return 1;
case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
return (class_3d >= NVE4_3D_CLASS) ? 1 : 0;
@ -201,8 +204,6 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_VERTEXID_NOBASE:
case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
case PIPE_CAP_SHAREABLE_SHADERS:
return 0;
case PIPE_CAP_VENDOR_ID:
@ -352,45 +353,51 @@ static int
nvc0_screen_get_compute_param(struct pipe_screen *pscreen,
enum pipe_compute_cap param, void *data)
{
uint64_t *data64 = (uint64_t *)data;
uint32_t *data32 = (uint32_t *)data;
const uint16_t obj_class = nvc0_screen(pscreen)->compute->oclass;
struct nvc0_screen *screen = nvc0_screen(pscreen);
const uint16_t obj_class = screen->compute->oclass;
#define RET(x) do { \
if (data) \
memcpy(data, x, sizeof(x)); \
return sizeof(x); \
} while (0)
switch (param) {
case PIPE_COMPUTE_CAP_GRID_DIMENSION:
data64[0] = 3;
return 8;
RET((uint64_t []) { 3 });
case PIPE_COMPUTE_CAP_MAX_GRID_SIZE:
data64[0] = (obj_class >= NVE4_COMPUTE_CLASS) ? 0x7fffffff : 65535;
data64[1] = 65535;
data64[2] = 65535;
return 24;
if (obj_class >= NVE4_COMPUTE_CLASS) {
RET(((uint64_t []) { 0x7fffffff, 65535, 65535 }));
} else {
RET(((uint64_t []) { 65535, 65535, 65535 }));
}
case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE:
data64[0] = 1024;
data64[1] = 1024;
data64[2] = 64;
return 24;
RET(((uint64_t []) { 1024, 1024, 64 }));
case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK:
data64[0] = 1024;
return 8;
RET((uint64_t []) { 1024 });
case PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE: /* g[] */
data64[0] = (uint64_t)1 << 40;
return 8;
RET((uint64_t []) { 1ULL << 40 });
case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE: /* s[] */
data64[0] = 48 << 10;
return 8;
RET((uint64_t []) { 48 << 10 });
case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE: /* l[] */
data64[0] = 512 << 10;
return 8;
RET((uint64_t []) { 512 << 10 });
case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE: /* c[], arbitrary limit */
data64[0] = 4096;
return 8;
RET((uint64_t []) { 4096 });
case PIPE_COMPUTE_CAP_SUBGROUP_SIZE:
data32[0] = 32;
return 4;
RET((uint32_t []) { 32 });
case PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE:
RET((uint64_t []) { 1ULL << 40 });
case PIPE_COMPUTE_CAP_IMAGES_SUPPORTED:
RET((uint32_t []) { 0 });
case PIPE_COMPUTE_CAP_MAX_COMPUTE_UNITS:
RET((uint32_t []) { screen->mp_count_compute });
case PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY:
RET((uint32_t []) { 512 }); /* FIXME: arbitrary limit */
default:
return 0;
}
#undef RET
}
static void
@ -827,6 +834,8 @@ nvc0_screen_create(struct nouveau_device *dev)
PUSH_DATA (push, 1);
BEGIN_NVC0(push, NVC0_3D(BLEND_ENABLE_COMMON), 1);
PUSH_DATA (push, 0);
BEGIN_NVC0(push, NVC0_3D(SHADE_MODEL), 1);
PUSH_DATA (push, NVC0_3D_SHADE_MODEL_SMOOTH);
if (screen->eng3d->oclass < NVE4_3D_CLASS) {
BEGIN_NVC0(push, NVC0_3D(TEX_MISC), 1);
PUSH_DATA (push, NVC0_3D_TEX_MISC_SEAMLESS_CUBE_MAP);

View File

@ -38,6 +38,7 @@ struct nvc0_graph_state {
uint32_t constant_elts;
int32_t index_bias;
uint16_t scissor;
bool flatshade;
uint8_t patch_vertices;
uint8_t vbo_mode; /* 0 = normal, 1 = translate, 3 = translate, forced */
uint8_t num_vtxbufs;

View File

@ -107,8 +107,54 @@ nvc0_fragprog_validate(struct nvc0_context *nvc0)
{
struct nouveau_pushbuf *push = nvc0->base.pushbuf;
struct nvc0_program *fp = nvc0->fragprog;
struct pipe_rasterizer_state *rast = &nvc0->rast->pipe;
fp->fp.sample_interp = nvc0->min_samples > 1;
if (fp->fp.force_persample_interp != rast->force_persample_interp) {
/* Force the program to be reuploaded, which will trigger interp fixups
* to get applied
*/
if (fp->mem)
nouveau_heap_free(&fp->mem);
fp->fp.force_persample_interp = rast->force_persample_interp;
}
/* Shade model works well enough when both colors follow it. However if one
* (or both) is explicitly set, then we have to go the patching route.
*/
bool has_explicit_color = fp->fp.colors &&
(((fp->fp.colors & 1) && !fp->fp.color_interp[0]) ||
((fp->fp.colors & 2) && !fp->fp.color_interp[1]));
bool hwflatshade = false;
if (has_explicit_color && fp->fp.flatshade != rast->flatshade) {
/* Force re-upload */
if (fp->mem)
nouveau_heap_free(&fp->mem);
fp->fp.flatshade = rast->flatshade;
/* Always smooth-shade in this mode, the shader will decide on its own
* when to flat-shade.
*/
} else if (!has_explicit_color) {
hwflatshade = rast->flatshade;
/* No need to binary-patch the shader each time, make sure that it's set
* up for the default behaviour.
*/
fp->fp.flatshade = 0;
}
if (hwflatshade != nvc0->state.flatshade) {
nvc0->state.flatshade = hwflatshade;
BEGIN_NVC0(push, NVC0_3D(SHADE_MODEL), 1);
PUSH_DATA (push, hwflatshade ? NVC0_3D_SHADE_MODEL_FLAT :
NVC0_3D_SHADE_MODEL_SMOOTH);
}
if (fp->mem && !(nvc0->dirty & NVC0_NEW_FRAGPROG)) {
return;
}
if (!nvc0_program_validate(nvc0, fp))
return;

View File

@ -212,9 +212,6 @@ nvc0_rasterizer_state_create(struct pipe_context *pipe,
* always emit 16 commands, one for each scissor rectangle, here.
*/
SB_BEGIN_3D(so, SHADE_MODEL, 1);
SB_DATA (so, cso->flatshade ? NVC0_3D_SHADE_MODEL_FLAT :
NVC0_3D_SHADE_MODEL_SMOOTH);
SB_IMMED_3D(so, PROVOKING_VERTEX_LAST, !cso->flatshade_first);
SB_IMMED_3D(so, VERTEX_TWO_SIDE_ENABLE, cso->light_twoside);
@ -683,6 +680,9 @@ nvc0_sp_state_create(struct pipe_context *pipe,
if (cso->stream_output.num_outputs)
prog->pipe.stream_output = cso->stream_output;
prog->translated = nvc0_program_translate(
prog, nvc0_context(pipe)->screen->base.device->chipset);
return (void *)prog;
}

View File

@ -606,6 +606,9 @@ nvc0_switch_pipe_context(struct nvc0_context *ctx_to)
ctx_to->constbuf_dirty[s] = (1 << NVC0_MAX_PIPE_CONSTBUFS) - 1;
}
/* Reset tfb as the shader that owns it may have been deleted. */
ctx_to->state.tfb = NULL;
if (!ctx_to->vertex)
ctx_to->dirty &= ~(NVC0_NEW_VERTEX | NVC0_NEW_ARRAYS);
if (!ctx_to->idxbuf.buffer)
@ -645,7 +648,7 @@ static struct state_validate {
{ nvc0_tevlprog_validate, NVC0_NEW_TEVLPROG },
{ nvc0_validate_tess_state, NVC0_NEW_TESSFACTOR },
{ nvc0_gmtyprog_validate, NVC0_NEW_GMTYPROG },
{ nvc0_fragprog_validate, NVC0_NEW_FRAGPROG },
{ nvc0_fragprog_validate, NVC0_NEW_FRAGPROG | NVC0_NEW_RASTERIZER },
{ nvc0_validate_derived_1, NVC0_NEW_FRAGPROG | NVC0_NEW_ZSA |
NVC0_NEW_RASTERIZER },
{ nvc0_validate_derived_2, NVC0_NEW_ZSA | NVC0_NEW_FRAMEBUFFER },

Some files were not shown because too many files have changed in this diff Show More