Merge remote-tracking branch 'mesa-public/master' into vulkan

This pulls in Matt's big compiler refactor.
This commit is contained in:
Jason Ekstrand 2015-11-14 07:56:10 -08:00
commit 1469ccb746
336 changed files with 9200 additions and 4505 deletions

View File

@ -149,7 +149,7 @@ GL 4.2, GLSL 4.20:
GL 4.3, GLSL 4.30:
GL_ARB_arrays_of_arrays started (Timothy)
GL_ARB_arrays_of_arrays DONE (i965)
GL_ARB_ES3_compatibility DONE (all drivers that support GLSL 3.30)
GL_ARB_clear_buffer_object DONE (all drivers)
GL_ARB_compute_shader in progress (jljusten)
@ -169,7 +169,7 @@ GL 4.3, GLSL 4.30:
GL_ARB_texture_buffer_range DONE (nv50, nvc0, i965, r600, radeonsi, llvmpipe)
GL_ARB_texture_query_levels DONE (all drivers that support GLSL 1.30)
GL_ARB_texture_storage_multisample DONE (all drivers that support GL_ARB_texture_multisample)
GL_ARB_texture_view DONE (i965, nv50, nvc0, radeonsi, llvmpipe, softpipe)
GL_ARB_texture_view DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
GL_ARB_vertex_attrib_binding DONE (all drivers)
@ -177,7 +177,7 @@ GL 4.4, GLSL 4.40:
GL_MAX_VERTEX_ATTRIB_STRIDE DONE (all drivers)
GL_ARB_buffer_storage DONE (i965, nv50, nvc0, r600, radeonsi)
GL_ARB_clear_texture DONE (i965) (gallium - in progress, VMware)
GL_ARB_clear_texture DONE (i965, nv50, nvc0)
GL_ARB_enhanced_layouts in progress (Timothy)
- compile-time constant expressions in progress
- explicit byte offsets for blocks in progress
@ -209,7 +209,7 @@ GL 4.5, GLSL 4.50:
These are the extensions cherry-picked to make GLES 3.1
GLES3.1, GLSL ES 3.1
GL_ARB_arrays_of_arrays started (Timothy)
GL_ARB_arrays_of_arrays DONE (i965)
GL_ARB_compute_shader in progress (jljusten)
GL_ARB_draw_indirect DONE (i965, nvc0, r600, radeonsi, llvmpipe, softpipe)
GL_ARB_explicit_uniform_location DONE (all drivers that support GLSL)

View File

@ -2,8 +2,8 @@ The software may implement third party technologies (e.g. third party
libraries) that are not licensed to you by AMD and for which you may need
to obtain licenses from other parties. Unless explicitly stated otherwise,
these third party technologies are not licensed hereunder. Such third
party technologies include, but are not limited, to H.264, MPEG-2, MPEG-4,
AVC, and VC-1.
party technologies include, but are not limited, to H.264, H.265, HEVC, MPEG-2,
MPEG-4, AVC, and VC-1.
For MPEG-2 Encoding Products ANY USE OF THIS PRODUCT IN ANY MANNER OTHER
THAN PERSONAL USE THAT COMPLIES WITH THE MPEG-2 STANDARD FOR ENCODING VIDEO

View File

@ -16,6 +16,12 @@
<h1>News</h1>
<h2>November 11, 2015</h2>
<p>
<a href="relnotes/11.0.5.html">Mesa 11.0.5</a> is released.
This is a bug-fix release.
</p>
<h2>October 24, 2015</h2>
<p>
<a href="relnotes/11.0.4.html">Mesa 11.0.4</a> is released.

View File

@ -21,6 +21,7 @@ The release notes summarize what's new or changed in each Mesa release.
</p>
<ul>
<li><a href="relnotes/11.0.5.html">11.0.5 release notes</a>
<li><a href="relnotes/11.0.4.html">11.0.4 release notes</a>
<li><a href="relnotes/11.0.3.html">11.0.3 release notes</a>
<li><a href="relnotes/10.6.9.html">10.6.9 release notes</a>

174
docs/relnotes/11.0.5.html Normal file
View File

@ -0,0 +1,174 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html lang="en">
<head>
<meta http-equiv="content-type" content="text/html; charset=utf-8">
<title>Mesa Release Notes</title>
<link rel="stylesheet" type="text/css" href="../mesa.css">
</head>
<body>
<div class="header">
<h1>The Mesa 3D Graphics Library</h1>
</div>
<iframe src="../contents.html"></iframe>
<div class="content">
<h1>Mesa 11.0.5 Release Notes / November 11, 2015</h1>
<p>
Mesa 11.0.5 is a bug fix release which fixes bugs found since the 11.0.4 release.
</p>
<p>
Mesa 11.0.5 implements the OpenGL 4.1 API, but the version reported by
glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
Some drivers don't support all the features required in OpenGL 4.1. OpenGL
4.1 is <strong>only</strong> available if requested at context creation
because compatibility contexts are not supported.
</p>
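<p>
For illustration, an application can check at run time which version a given driver actually exposes (a minimal C sketch using standard GL queries; it assumes a current context and is not part of the release):
</p>
<pre>
const char *version = (const char *) glGetString(GL_VERSION);
GLint major = 0, minor = 0;
glGetIntegerv(GL_MAJOR_VERSION, &major);
glGetIntegerv(GL_MINOR_VERSION, &minor);
printf("GL_VERSION: %s (%d.%d)\n", version, major, minor);
</pre>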
<h2>SHA256 checksums</h2>
<pre>
8495ef5c06f7f726452462b7d408a5b40048373ff908f2283a3b4d1f49b45ee6 mesa-11.0.5.tar.gz
9c255a2a6695fcc6ef4a279e1df0aeaf417dc142f39ee59dfb533d80494bb67a mesa-11.0.5.tar.xz
</pre>
<h2>New features</h2>
<p>None</p>
<h2>Bug fixes</h2>
<p>This list is likely incomplete.</p>
<ul>
<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91993">Bug 91993</a> - Graphical glitch in Astromenace (open-source game).</li>
<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92214">Bug 92214</a> - Flightgear crashes during splashboot with R600 driver, LLVM 3.7.0 and mesa 11.0.2</li>
<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92437">Bug 92437</a> - osmesa: Expose GL entry points for Windows build, via .def file</li>
<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92476">Bug 92476</a> - [cts] ES2-CTS.gtf.GL2ExtensionTests.egl_image.egl_image fails</li>
<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92623">Bug 92623</a> - Differences in prog_data ignored when caching fragment programs (causes hangs)</li>
</ul>
<h2>Changes</h2>
<p>Alex Deucher (1):</p>
<ul>
<li>radeon/uvd: don't expose HEVC on old UVD hw (v3)</li>
</ul>
<p>Ben Widawsky (1):</p>
<ul>
<li>i965/skl: Add GT4 PCI IDs</li>
</ul>
<p>Emil Velikov (4):</p>
<ul>
<li>docs: add sha256 checksums for 11.0.4</li>
<li>cherry-ignore: ignore a possible wrong nomination</li>
<li>Revert "mesa/glformats: Undo code changes from _mesa_base_tex_format() move"</li>
<li>Update version to 11.0.5</li>
</ul>
<p>Emmanuel Gil Peyrot (1):</p>
<ul>
<li>gbm.h: Add a missing stddef.h include for size_t.</li>
</ul>
<p>Eric Anholt (1):</p>
<ul>
<li>vc4: When the create ioctl fails, free our cache and try again.</li>
</ul>
<p>Ian Romanick (1):</p>
<ul>
<li>i965: Fix is-renderable check in intel_image_target_renderbuffer_storage</li>
</ul>
<p>Ilia Mirkin (3):</p>
<ul>
<li>nvc0: respect edgeflag attribute width</li>
<li>nouveau: set MaxDrawBuffers to the same value as MaxColorAttachments</li>
<li>nouveau: relax fence emit space assert</li>
</ul>
<p>Ivan Kalvachev (1):</p>
<ul>
<li>r600g: Fix special negative immediate constants when using ABS modifier.</li>
</ul>
<p>Jason Ekstrand (2):</p>
<ul>
<li>nir/lower_vec_to_movs: Pass the shader around directly</li>
<li>nir: Report progress from lower_vec_to_movs().</li>
</ul>
<p>Jose Fonseca (2):</p>
<ul>
<li>gallivm: Translate all util_cpu_caps bits to LLVM attributes.</li>
<li>gallivm: Explicitly disable unsupported CPU features.</li>
</ul>
<p>Julien Isorce (4):</p>
<ul>
<li>st/va: pass picture desc to begin and decode</li>
<li>nvc0: fix crash when nv50_miptree_from_handle fails</li>
<li>st/va: do not destroy old buffer when new one failed</li>
<li>st/va: add more errors checks in vlVaBufferSetNumElements and vlVaMapBuffer</li>
</ul>
<p>Kenneth Graunke (6):</p>
<ul>
<li>i965: Fix missing BRW_NEW_*_PROG_DATA flagging caused by cache reuse.</li>
<li>nir: Report progress from nir_split_var_copies().</li>
<li>nir: Properly invalidate metadata in nir_split_var_copies().</li>
<li>nir: Properly invalidate metadata in nir_opt_copy_prop().</li>
<li>nir: Properly invalidate metadata in nir_lower_vec_to_movs().</li>
<li>nir: Properly invalidate metadata in nir_opt_remove_phis().</li>
</ul>
<p>Marek Olšák (1):</p>
<ul>
<li>radeonsi: add register definitions for Stoney</li>
</ul>
<p>Nanley Chery (1):</p>
<ul>
<li>mesa/glformats: Undo code changes from _mesa_base_tex_format() move</li>
</ul>
<p>Nicolai Hähnle (1):</p>
<ul>
<li>st/mesa: fix mipmap generation for immutable textures with incomplete pyramids</li>
</ul>
<p>Nigel Stewart (1):</p>
<ul>
<li>osmesa: Expose GL entry points for Windows build via DEF file.</li>
</ul>
<p>Roland Scheidegger (1):</p>
<ul>
<li>gallivm: disable f16c when not using AVX</li>
</ul>
<p>Samuel Li (2):</p>
<ul>
<li>radeonsi: add support for Stoney asics (v3)</li>
<li>radeonsi: add Stoney pci ids</li>
</ul>
</div>
</body>
</html>

View File

@ -44,8 +44,10 @@ Note: some of the new features are only available with certain drivers.
</p>
<ul>
<li>GL_ARB_arrays_of_arrays on i965</li>
<li>GL_ARB_blend_func_extended on freedreno (a3xx)</li>
<li>GL_ARB_copy_image on radeonsi</li>
<li>GL_ARB_clear_texture on nv50, nvc0</li>
<li>GL_ARB_copy_image on nv50, nvc0, radeonsi</li>
<li>GL_ARB_gpu_shader_fp64 on r600 for Cypress/Cayman/Aruba chips</li>
<li>GL_ARB_gpu_shader5 on r600 for Evergreen and later chips</li>
<li>GL_ARB_shader_clock on i965 (gen7+)</li>
@ -54,7 +56,8 @@ Note: some of the new features are only available with certain drivers.
<li>GL_ARB_shader_texture_image_samples on i965, nv50, nvc0, r600, radeonsi</li>
<li>GL_ARB_texture_barrier / GL_NV_texture_barrier on i965</li>
<li>GL_ARB_texture_query_lod on softpipe</li>
<li>GL_ARB_texture_view on radeonsi</li>
<li>GL_ARB_texture_view on radeonsi and r600 (for Evergreen and newer)</li>
<li>GL_EXT_buffer_storage implemented for when ES 3.1 support is gained</li>
<li>GL_EXT_draw_elements_base_vertex on all drivers</li>
<li>GL_OES_draw_elements_base_vertex on all drivers</li>
<li>EGL_KHR_create_context on softpipe, llvmpipe</li>

View File

@ -30,6 +30,10 @@
<dt><a href="http://www.valgrind.org">Valgrind</a></dt>
<dd>is a very useful tool for tracking down
memory-related problems in your code.</dd>
<dt><a href="http:scan.coverity.com/projects/mesa">Coverity</a><dt>
<dd>provides static code analysis of Mesa. If you create an account
you can see the results and try to fix outstanding issues.</dd>
</dl>
</div>

View File

@ -148,10 +148,33 @@ To get the latest code from git:
<h2>Building the Code</h2>
<ul>
<li>Build libdrm: If you're on a 32-bit system, you should skip the --libdir configure option. Note also the comment about toolchain libdrm above.
<li>
Determine where the GL-related libraries reside on your system and set
the LIBDIR environment variable accordingly.
<br><br>
For 32-bit Ubuntu systems:
<pre>
export LIBDIR=/usr/lib/i386-linux-gnu
</pre>
For 64-bit Ubuntu systems:
<pre>
export LIBDIR=/usr/lib/x86_64-linux-gnu
</pre>
For 32-bit Fedora systems:
<pre>
export LIBDIR=/usr/lib
</pre>
For 64-bit Fedora systems:
<pre>
export LIBDIR=/usr/lib64
</pre>
</li>
<li>Build libdrm:
<pre>
cd $TOP/drm
./autogen.sh --prefix=/usr --libdir=/usr/lib64
./autogen.sh --prefix=/usr --libdir=${LIBDIR}
make
sudo make install
</pre>
@ -162,12 +185,9 @@ The libxatracker library is used exclusively by the X server to do render,
copy and video acceleration:
<br>
The following configure options don't build the EGL system.
<br>
As before, if you're on a 32-bit system, you should skip the --libdir
configure option.
<pre>
cd $TOP/mesa
./autogen.sh --prefix=/usr --libdir=/usr/lib64 --with-gallium-drivers=svga --with-dri-drivers= --enable-xa --disable-dri3
./autogen.sh --prefix=/usr --libdir=${LIBDIR} --with-gallium-drivers=svga --with-dri-drivers=swrast --enable-xa --disable-dri3 --enable-glx-tls
make
sudo make install
</pre>
@ -177,25 +197,39 @@ if they're not installed in your system. You should be told what's missing.
<br>
<br>
<li>xf86-video-vmware: Now, once libxatracker is installed, we proceed with building and replacing the current Xorg driver. First check if your system is 32- or 64-bit. If you're building for a 32-bit system, you will not be needing the --libdir=/usr/lib64 option to autogen.
<li>xf86-video-vmware: Now, once libxatracker is installed, we proceed with
building and replacing the current Xorg driver.
First check if your system is 32- or 64-bit.
<pre>
cd $TOP/xf86-video-vmware
./autogen.sh --prefix=/usr --libdir=/usr/lib64
./autogen.sh --prefix=/usr --libdir=${LIBDIR}
make
sudo make install
</pre>
<li>vmwgfx kernel module. First make sure that any old version of this kernel module is removed from the system by issuing
<pre>
<pre>
sudo rm /lib/modules/`uname -r`/kernel/drivers/gpu/drm/vmwgfx.ko*
</pre>
Then
<pre>
</pre>
Build and install:
<pre>
cd $TOP/vmwgfx
make
sudo make install
sudo cp 00-vmwgfx.rules /etc/udev/rules.d
sudo depmod -ae
</pre>
sudo depmod -a
</pre>
If you're using a Ubuntu OS:
<pre>
sudo update-initramfs -u
</pre>
If you're using a Fedora OS:
<pre>
sudo dracut --force
</pre>
Add 'vmwgfx' to the /etc/modules file:
<pre>
echo vmwgfx | sudo tee -a /etc/modules
</pre>
Note: some distros put DRM kernel drivers in different directories.
For example, sometimes vmwgfx.ko might be found in

View File

@ -703,18 +703,10 @@ dri2_wl_swap_buffers_with_damage(_EGLDriver *drv,
dri2_surf->dx = 0;
dri2_surf->dy = 0;
if (n_rects == 0) {
wl_surface_damage(dri2_surf->wl_win->surface,
0, 0, INT32_MAX, INT32_MAX);
} else {
for (i = 0; i < n_rects; i++) {
const int *rect = &rects[i * 4];
wl_surface_damage(dri2_surf->wl_win->surface,
rect[0],
dri2_surf->base.Height - rect[1] - rect[3],
rect[2], rect[3]);
}
}
/* We deliberately ignore the damage region and post maximum damage, due to
* https://bugs.freedesktop.org/78190 */
wl_surface_damage(dri2_surf->wl_win->surface,
0, 0, INT32_MAX, INT32_MAX);
if (dri2_dpy->is_different_gpu) {
_EGLContext *ctx = _eglGetCurrentContext();

View File

@ -349,7 +349,8 @@ VL_SOURCES := \
# XXX: Nuke this as our dri targets no longer depend on VL.
VL_WINSYS_SOURCES := \
vl/vl_winsys_dri.c
vl/vl_winsys_dri.c \
vl/vl_winsys_drm.c
VL_STUB_SOURCES := \
vl/vl_stubs.c
@ -378,7 +379,9 @@ GALLIVM_SOURCES := \
gallivm/lp_bld_flow.h \
gallivm/lp_bld_format_aos_array.c \
gallivm/lp_bld_format_aos.c \
gallivm/lp_bld_format_cached.c \
gallivm/lp_bld_format_float.c \
gallivm/lp_bld_format.c \
gallivm/lp_bld_format.h \
gallivm/lp_bld_format_soa.c \
gallivm/lp_bld_format_srgb.c \

View File

@ -625,6 +625,7 @@ generate_vs(struct draw_llvm_variant *variant,
inputs,
outputs,
context_ptr,
NULL,
draw_sampler,
&llvm->draw->vs.vertex_shader->info,
NULL);
@ -749,7 +750,8 @@ generate_fetch(struct gallivm_state *gallivm,
lp_float32_vec4_type(),
FALSE,
map_ptr,
zero, zero, zero);
zero, zero, zero,
NULL);
LLVMBuildStore(builder, val, temp_ptr);
}
lp_build_endif(&if_ctx);
@ -2193,6 +2195,7 @@ draw_gs_llvm_generate(struct draw_llvm *llvm,
NULL,
outputs,
context_ptr,
NULL,
sampler,
&llvm->draw->gs.geometry_shader->info,
(const struct lp_build_tgsi_gs_iface *)&gs_iface);

View File

@ -0,0 +1,56 @@
/**************************************************************************
*
* Copyright 2010 VMware, Inc.
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sub license, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* The above copyright notice and this permission notice (including the
* next paragraph) shall be included in all copies or substantial portions
* of the Software.
*
**************************************************************************/
#include "lp_bld_format.h"
LLVMTypeRef
lp_build_format_cache_type(struct gallivm_state *gallivm)
{
LLVMTypeRef elem_types[LP_BUILD_FORMAT_CACHE_MEMBER_COUNT];
LLVMTypeRef s;
elem_types[LP_BUILD_FORMAT_CACHE_MEMBER_DATA] =
LLVMArrayType(LLVMInt32TypeInContext(gallivm->context),
LP_BUILD_FORMAT_CACHE_SIZE * 16);
elem_types[LP_BUILD_FORMAT_CACHE_MEMBER_TAGS] =
LLVMArrayType(LLVMInt64TypeInContext(gallivm->context),
LP_BUILD_FORMAT_CACHE_SIZE);
#if LP_BUILD_FORMAT_CACHE_DEBUG
elem_types[LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL] =
LLVMInt64TypeInContext(gallivm->context);
elem_types[LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS] =
LLVMInt64TypeInContext(gallivm->context);
#endif
s = LLVMStructTypeInContext(gallivm->context, elem_types,
LP_BUILD_FORMAT_CACHE_MEMBER_COUNT, 0);
return s;
}

View File

@ -44,6 +44,45 @@ struct lp_type;
struct lp_build_context;
#define LP_BUILD_FORMAT_CACHE_DEBUG 0
/*
* Block cache
*
* Optional block cache to be used when unpacking big pixel blocks.
* Must be a power of 2
*/
#define LP_BUILD_FORMAT_CACHE_SIZE 128
/*
* Note: cache_data needs 16 byte alignment.
*/
struct lp_build_format_cache
{
PIPE_ALIGN_VAR(16) uint32_t cache_data[LP_BUILD_FORMAT_CACHE_SIZE][4][4];
uint64_t cache_tags[LP_BUILD_FORMAT_CACHE_SIZE];
#if LP_BUILD_FORMAT_CACHE_DEBUG
uint64_t cache_access_total;
uint64_t cache_access_miss;
#endif
};
enum {
LP_BUILD_FORMAT_CACHE_MEMBER_DATA = 0,
LP_BUILD_FORMAT_CACHE_MEMBER_TAGS,
#if LP_BUILD_FORMAT_CACHE_DEBUG
LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL,
LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS,
#endif
LP_BUILD_FORMAT_CACHE_MEMBER_COUNT
};
LLVMTypeRef
lp_build_format_cache_type(struct gallivm_state *gallivm);
/*
* AoS
*/
@ -66,7 +105,8 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
LLVMValueRef base_ptr,
LLVMValueRef offset,
LLVMValueRef i,
LLVMValueRef j);
LLVMValueRef j,
LLVMValueRef cache);
LLVMValueRef
lp_build_fetch_rgba_aos_array(struct gallivm_state *gallivm,
@ -107,13 +147,13 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
LLVMValueRef offsets,
LLVMValueRef i,
LLVMValueRef j,
LLVMValueRef cache,
LLVMValueRef rgba_out[4]);
/*
* YUV
*/
LLVMValueRef
lp_build_fetch_subsampled_rgba_aos(struct gallivm_state *gallivm,
const struct util_format_description *format_desc,
@ -123,6 +163,18 @@ lp_build_fetch_subsampled_rgba_aos(struct gallivm_state *gallivm,
LLVMValueRef i,
LLVMValueRef j);
LLVMValueRef
lp_build_fetch_cached_texels(struct gallivm_state *gallivm,
const struct util_format_description *format_desc,
unsigned n,
LLVMValueRef base_ptr,
LLVMValueRef offset,
LLVMValueRef i,
LLVMValueRef j,
LLVMValueRef cache);
/*
* special float formats
*/

View File

@ -370,7 +370,8 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
LLVMValueRef base_ptr,
LLVMValueRef offset,
LLVMValueRef i,
LLVMValueRef j)
LLVMValueRef j,
LLVMValueRef cache)
{
LLVMBuilderRef builder = gallivm->builder;
unsigned num_pixels = type.length / 4;
@ -502,6 +503,34 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
return tmp;
}
/*
* s3tc rgb formats
*/
if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC && cache) {
struct lp_type tmp_type;
LLVMValueRef tmp;
memset(&tmp_type, 0, sizeof tmp_type);
tmp_type.width = 8;
tmp_type.length = num_pixels * 4;
tmp_type.norm = TRUE;
tmp = lp_build_fetch_cached_texels(gallivm,
format_desc,
num_pixels,
base_ptr,
offset,
i, j,
cache);
lp_build_conv(gallivm,
tmp_type, type,
&tmp, 1, &tmp, 1);
return tmp;
}
/*
* Fallback to util_format_description::fetch_rgba_8unorm().
*/

View File

@ -0,0 +1,374 @@
/**************************************************************************
*
* Copyright 2015 VMware, Inc.
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sub license, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice (including the
* next paragraph) shall be included in all copies or substantial portions
* of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
* IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
**************************************************************************/
#include "lp_bld_format.h"
#include "lp_bld_type.h"
#include "lp_bld_struct.h"
#include "lp_bld_const.h"
#include "lp_bld_flow.h"
#include "lp_bld_swizzle.h"
#include "util/u_math.h"
/**
* @file
* Complex block-compression based formats are handled here by using a cache,
* so re-decoding of every pixel is not required.
* Especially for bilinear filtering, texel reuse is very high hence even
* a small cache helps.
* The elements in the cache are the decoded blocks - currently things
* are restricted to formats which are 4x4 block based, and the decoded
* texels must fit into 4x8 bits.
* The cache is direct-mapped so hit rates aren't all that great and cache
* thrashing could happen.
*
* @author Roland Scheidegger <sroland@vmware.com>
*/
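/*
 * Purely illustrative (not used by the code below): a plain-C sketch of the
 * per-texel lookup which the generated LLVM IR implements, assuming the
 * lp_build_format_cache layout from lp_bld_format.h.  hash_block_addr() and
 * decode_block() are hypothetical stand-ins for the hash computed below and
 * for format_desc->fetch_rgba_8unorm().
 */
#if 0
static uint32_t
cached_fetch_sketch(struct lp_build_format_cache *cache,
                    const uint8_t *base_ptr, uint32_t offset,
                    unsigned i, unsigned j)
{
   uint64_t block_addr = (uint64_t)(uintptr_t)base_ptr + offset;
   unsigned hash = hash_block_addr(block_addr) & (LP_BUILD_FORMAT_CACHE_SIZE - 1);

   if (cache->cache_tags[hash] != block_addr) {
      /* miss: decode the whole 4x4 block and remember its address */
      decode_block((const uint8_t *)(uintptr_t)block_addr, cache->cache_data[hash]);
      cache->cache_tags[hash] = block_addr;
   }
   /* texels are stored x-major, so (i,j) indexes the decoded block directly */
   return cache->cache_data[hash][i][j];
}
#endif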
#if LP_BUILD_FORMAT_CACHE_DEBUG
static void
update_cache_access(struct gallivm_state *gallivm,
LLVMValueRef ptr,
unsigned count,
unsigned index)
{
LLVMBuilderRef builder = gallivm->builder;
LLVMValueRef member_ptr, cache_access;
assert(index == LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL ||
index == LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);
member_ptr = lp_build_struct_get_ptr(gallivm, ptr, index, "");
cache_access = LLVMBuildLoad(builder, member_ptr, "cache_access");
cache_access = LLVMBuildAdd(builder, cache_access,
LLVMConstInt(LLVMInt64TypeInContext(gallivm->context),
count, 0), "");
LLVMBuildStore(builder, cache_access, member_ptr);
}
#endif
static void
store_cached_block(struct gallivm_state *gallivm,
LLVMValueRef *col,
LLVMValueRef tag_value,
LLVMValueRef hash_index,
LLVMValueRef cache)
{
LLVMBuilderRef builder = gallivm->builder;
LLVMValueRef ptr, indices[3];
LLVMTypeRef type_ptr4x32;
unsigned count;
type_ptr4x32 = LLVMPointerType(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4), 0);
indices[0] = lp_build_const_int32(gallivm, 0);
indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_TAGS);
indices[2] = hash_index;
ptr = LLVMBuildGEP(builder, cache, indices, Elements(indices), "");
LLVMBuildStore(builder, tag_value, ptr);
indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_DATA);
hash_index = LLVMBuildMul(builder, hash_index,
lp_build_const_int32(gallivm, 16), "");
for (count = 0; count < 4; count++) {
indices[2] = hash_index;
ptr = LLVMBuildGEP(builder, cache, indices, Elements(indices), "");
ptr = LLVMBuildBitCast(builder, ptr, type_ptr4x32, "");
LLVMBuildStore(builder, col[count], ptr);
hash_index = LLVMBuildAdd(builder, hash_index,
lp_build_const_int32(gallivm, 4), "");
}
}
static LLVMValueRef
lookup_cached_pixel(struct gallivm_state *gallivm,
LLVMValueRef ptr,
LLVMValueRef index)
{
LLVMBuilderRef builder = gallivm->builder;
LLVMValueRef member_ptr, indices[3];
indices[0] = lp_build_const_int32(gallivm, 0);
indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_DATA);
indices[2] = index;
member_ptr = LLVMBuildGEP(builder, ptr, indices, Elements(indices), "");
return LLVMBuildLoad(builder, member_ptr, "cache_data");
}
static LLVMValueRef
lookup_tag_data(struct gallivm_state *gallivm,
LLVMValueRef ptr,
LLVMValueRef index)
{
LLVMBuilderRef builder = gallivm->builder;
LLVMValueRef member_ptr, indices[3];
indices[0] = lp_build_const_int32(gallivm, 0);
indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_TAGS);
indices[2] = index;
member_ptr = LLVMBuildGEP(builder, ptr, indices, Elements(indices), "");
return LLVMBuildLoad(builder, member_ptr, "tag_data");
}
static void
update_cached_block(struct gallivm_state *gallivm,
const struct util_format_description *format_desc,
LLVMValueRef ptr_addr,
LLVMValueRef hash_index,
LLVMValueRef cache)
{
LLVMBuilderRef builder = gallivm->builder;
LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
LLVMTypeRef pi8t = LLVMPointerType(i8t, 0);
LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
LLVMTypeRef i32x4 = LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4);
LLVMValueRef function;
LLVMValueRef tag_value, tmp_ptr;
LLVMValueRef col[4];
unsigned i, j;
/*
* Use format_desc->fetch_rgba_8unorm() for each pixel in the block.
* This doesn't actually make any sense whatsoever, someone would need
* to write a function doing this for all pixels in a block (either as
* an external c function or with generated code). Don't ask.
*/
{
/*
* Function to call looks like:
* fetch(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j)
*/
LLVMTypeRef ret_type;
LLVMTypeRef arg_types[4];
LLVMTypeRef function_type;
assert(format_desc->fetch_rgba_8unorm);
ret_type = LLVMVoidTypeInContext(gallivm->context);
arg_types[0] = pi8t;
arg_types[1] = pi8t;
arg_types[2] = i32t;
arg_types[3] = i32t;
function_type = LLVMFunctionType(ret_type, arg_types,
Elements(arg_types), 0);
/* make const pointer for the C fetch_rgba_8unorm function */
function = lp_build_const_int_pointer(gallivm,
func_to_pointer((func_pointer) format_desc->fetch_rgba_8unorm));
/* cast the callee pointer to the function's type */
function = LLVMBuildBitCast(builder, function,
LLVMPointerType(function_type, 0),
"cast callee");
}
tmp_ptr = lp_build_array_alloca(gallivm, i32x4,
lp_build_const_int32(gallivm, 16),
"tmp_decode_store");
tmp_ptr = LLVMBuildBitCast(builder, tmp_ptr, pi8t, "");
/*
* Invoke format_desc->fetch_rgba_8unorm() for each pixel.
* This is going to be really really slow.
* Note: the block store format is actually
* x0y0x0y1x0y2x0y3 x1y0x1y1x1y2x1y3 ...
*/
for (i = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j) {
LLVMValueRef args[4];
LLVMValueRef dst_offset = lp_build_const_int32(gallivm, (i * 4 + j) * 4);
/*
* Note we actually supply a pointer to the start of the block,
* not the start of the texture.
*/
args[0] = LLVMBuildGEP(gallivm->builder, tmp_ptr, &dst_offset, 1, "");
args[1] = ptr_addr;
args[2] = LLVMConstInt(i32t, i, 0);
args[3] = LLVMConstInt(i32t, j, 0);
LLVMBuildCall(builder, function, args, Elements(args), "");
}
}
/* Finally store the block - pointless mem copy + update tag. */
tmp_ptr = LLVMBuildBitCast(builder, tmp_ptr, LLVMPointerType(i32x4, 0), "");
for (i = 0; i < 4; ++i) {
LLVMValueRef tmp_offset = lp_build_const_int32(gallivm, i);
LLVMValueRef ptr = LLVMBuildGEP(gallivm->builder, tmp_ptr, &tmp_offset, 1, "");
col[i] = LLVMBuildLoad(builder, ptr, "");
}
tag_value = LLVMBuildPtrToInt(gallivm->builder, ptr_addr,
LLVMInt64TypeInContext(gallivm->context), "");
store_cached_block(gallivm, col, tag_value, hash_index, cache);
}
/*
* Do a cached lookup.
*
* Returns (vectors of) 4x8 rgba aos value
*/
LLVMValueRef
lp_build_fetch_cached_texels(struct gallivm_state *gallivm,
const struct util_format_description *format_desc,
unsigned n,
LLVMValueRef base_ptr,
LLVMValueRef offset,
LLVMValueRef i,
LLVMValueRef j,
LLVMValueRef cache)
{
LLVMBuilderRef builder = gallivm->builder;
unsigned count, low_bit, log2size;
LLVMValueRef color, offset_stored, addr, ptr_addrtrunc, tmp;
LLVMValueRef ij_index, hash_index, hash_mask, block_index;
LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
LLVMTypeRef i64t = LLVMInt64TypeInContext(gallivm->context);
struct lp_type type;
struct lp_build_context bld32;
memset(&type, 0, sizeof type);
type.width = 32;
type.length = n;
assert(format_desc->block.width == 4);
assert(format_desc->block.height == 4);
lp_build_context_init(&bld32, gallivm, type);
/*
* compute hash - we use direct mapped cache, the hash function could
* be better but it needs to be simple
* per-element:
* compare offset with offset stored at tag (hash)
* if not equal decode/store block, update tag
* extract color from cache
* assemble result vector
*/
/* TODO: not ideal with 32bit pointers... */
low_bit = util_logbase2(format_desc->block.bits / 8);
log2size = util_logbase2(LP_BUILD_FORMAT_CACHE_SIZE);
addr = LLVMBuildPtrToInt(builder, base_ptr, i64t, "");
ptr_addrtrunc = LLVMBuildPtrToInt(builder, base_ptr, i32t, "");
ptr_addrtrunc = lp_build_broadcast_scalar(&bld32, ptr_addrtrunc);
/* For the hash function, first mask off the unused lowest bits. Then just
do some xor with address bits - only use lower 32bits */
ptr_addrtrunc = LLVMBuildAdd(builder, offset, ptr_addrtrunc, "");
ptr_addrtrunc = LLVMBuildLShr(builder, ptr_addrtrunc,
lp_build_const_int_vec(gallivm, type, low_bit), "");
/* This only really makes sense for size 64,128,256 */
hash_index = ptr_addrtrunc;
ptr_addrtrunc = LLVMBuildLShr(builder, ptr_addrtrunc,
lp_build_const_int_vec(gallivm, type, 2*log2size), "");
hash_index = LLVMBuildXor(builder, ptr_addrtrunc, hash_index, "");
tmp = LLVMBuildLShr(builder, hash_index,
lp_build_const_int_vec(gallivm, type, log2size), "");
hash_index = LLVMBuildXor(builder, hash_index, tmp, "");
hash_mask = lp_build_const_int_vec(gallivm, type, LP_BUILD_FORMAT_CACHE_SIZE - 1);
hash_index = LLVMBuildAnd(builder, hash_index, hash_mask, "");
ij_index = LLVMBuildShl(builder, i, lp_build_const_int_vec(gallivm, type, 2), "");
ij_index = LLVMBuildAdd(builder, ij_index, j, "");
block_index = LLVMBuildShl(builder, hash_index,
lp_build_const_int_vec(gallivm, type, 4), "");
block_index = LLVMBuildAdd(builder, ij_index, block_index, "");
if (n > 1) {
color = LLVMGetUndef(LLVMVectorType(i32t, n));
for (count = 0; count < n; count++) {
LLVMValueRef index, cond, colorx;
LLVMValueRef block_indexx, hash_indexx, addrx, offsetx, ptr_addrx;
struct lp_build_if_state if_ctx;
index = lp_build_const_int32(gallivm, count);
offsetx = LLVMBuildExtractElement(builder, offset, index, "");
addrx = LLVMBuildZExt(builder, offsetx, i64t, "");
addrx = LLVMBuildAdd(builder, addrx, addr, "");
block_indexx = LLVMBuildExtractElement(builder, block_index, index, "");
hash_indexx = LLVMBuildLShr(builder, block_indexx,
lp_build_const_int32(gallivm, 4), "");
offset_stored = lookup_tag_data(gallivm, cache, hash_indexx);
cond = LLVMBuildICmp(builder, LLVMIntNE, offset_stored, addrx, "");
lp_build_if(&if_ctx, gallivm, cond);
{
ptr_addrx = LLVMBuildIntToPtr(builder, addrx,
LLVMPointerType(i8t, 0), "");
update_cached_block(gallivm, format_desc, ptr_addrx, hash_indexx, cache);
#if LP_BUILD_FORMAT_CACHE_DEBUG
update_cache_access(gallivm, cache, 1,
LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);
#endif
}
lp_build_endif(&if_ctx);
colorx = lookup_cached_pixel(gallivm, cache, block_indexx);
color = LLVMBuildInsertElement(builder, color, colorx,
lp_build_const_int32(gallivm, count), "");
}
}
else {
LLVMValueRef cond;
struct lp_build_if_state if_ctx;
tmp = LLVMBuildZExt(builder, offset, i64t, "");
addr = LLVMBuildAdd(builder, tmp, addr, "");
offset_stored = lookup_tag_data(gallivm, cache, hash_index);
cond = LLVMBuildICmp(builder, LLVMIntNE, offset_stored, addr, "");
lp_build_if(&if_ctx, gallivm, cond);
{
tmp = LLVMBuildIntToPtr(builder, addr, LLVMPointerType(i8t, 0), "");
update_cached_block(gallivm, format_desc, tmp, hash_index, cache);
#if LP_BUILD_FORMAT_CACHE_DEBUG
update_cache_access(gallivm, cache, 1,
LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);
#endif
}
lp_build_endif(&if_ctx);
color = lookup_cached_pixel(gallivm, cache, block_index);
}
#if LP_BUILD_FORMAT_CACHE_DEBUG
update_cache_access(gallivm, cache, n,
LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL);
#endif
return LLVMBuildBitCast(builder, color, LLVMVectorType(i8t, n * 4), "");
}

View File

@ -346,6 +346,7 @@ lp_build_rgba8_to_fi32_soa(struct gallivm_state *gallivm,
* \param i, j the sub-block pixel coordinates. For non-compressed formats
* these will always be (0,0). For compressed formats, i will
* be in [0, block_width-1] and j will be in [0, block_height-1].
* \param cache optional value pointing to a lp_build_format_cache structure
*/
void
lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
@ -355,6 +356,7 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
LLVMValueRef offset,
LLVMValueRef i,
LLVMValueRef j,
LLVMValueRef cache,
LLVMValueRef rgba_out[4])
{
LLVMBuilderRef builder = gallivm->builder;
@ -473,7 +475,7 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
tmp_type.norm = TRUE;
tmp = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
TRUE, base_ptr, offset, i, j);
TRUE, base_ptr, offset, i, j, cache);
lp_build_rgba8_to_fi32_soa(gallivm,
type,
@ -483,6 +485,39 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
return;
}
if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC &&
/* non-srgb case is already handled above */
format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB &&
type.floating && type.width == 32 &&
(type.length == 1 || (type.length % 4 == 0)) &&
cache) {
const struct util_format_description *format_decompressed;
const struct util_format_description *flinear_desc;
LLVMValueRef packed;
flinear_desc = util_format_description(util_format_linear(format_desc->format));
packed = lp_build_fetch_cached_texels(gallivm,
flinear_desc,
type.length,
base_ptr,
offset,
i, j,
cache);
packed = LLVMBuildBitCast(builder, packed,
lp_build_int_vec_type(gallivm, type), "");
/*
* The values are now packed so they match ordinary srgb RGBA8 format,
* hence need to use matching format for unpack.
*/
format_decompressed = util_format_description(PIPE_FORMAT_R8G8B8A8_SRGB);
lp_build_unpack_rgba_soa(gallivm,
format_decompressed,
type,
packed, rgba_out);
return;
}
/*
* Fallback to calling lp_build_fetch_rgba_aos for each pixel.
*
@ -524,7 +559,7 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
/* Get a single float[4]={R,G,B,A} pixel */
tmp = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
TRUE, base_ptr, offset_elem,
i_elem, j_elem);
i_elem, j_elem, cache);
/*
* Insert the AoS tmp value channels into the SoA result vectors at

View File

@ -99,6 +99,7 @@ struct lp_sampler_params
unsigned sampler_index;
unsigned sample_key;
LLVMValueRef context_ptr;
LLVMValueRef thread_data_ptr;
const LLVMValueRef *coords;
const LLVMValueRef *offsets;
LLVMValueRef lod;
@ -267,6 +268,17 @@ struct lp_sampler_dynamic_state
struct gallivm_state *gallivm,
LLVMValueRef context_ptr,
unsigned sampler_unit);
/**
* Obtain texture cache (returns ptr to lp_build_format_cache).
*
* It's optional: no caching will be done if it's NULL.
*/
LLVMValueRef
(*cache_ptr)(const struct lp_sampler_dynamic_state *state,
struct gallivm_state *gallivm,
LLVMValueRef thread_data_ptr,
unsigned unit);
};
@ -356,6 +368,7 @@ struct lp_build_sample_context
LLVMValueRef img_stride_array;
LLVMValueRef base_ptr;
LLVMValueRef mip_offsets;
LLVMValueRef cache;
/** Integer vector with texture width, height, depth */
LLVMValueRef int_size;

View File

@ -593,7 +593,8 @@ lp_build_sample_fetch_image_nearest(struct lp_build_sample_context *bld,
TRUE,
data_ptr, offset,
x_subcoord,
y_subcoord);
y_subcoord,
bld->cache);
}
*colors = rgba8;
@ -933,7 +934,8 @@ lp_build_sample_fetch_image_linear(struct lp_build_sample_context *bld,
TRUE,
data_ptr, offset[k][j][i],
x_subcoord[i],
y_subcoord[j]);
y_subcoord[j],
bld->cache);
}
neighbors[k][j][i] = rgba8;

View File

@ -161,6 +161,7 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
bld->texel_type,
data_ptr, offset,
i, j,
bld->cache,
texel_out);
/*
@ -2389,6 +2390,7 @@ lp_build_fetch_texel(struct lp_build_sample_context *bld,
bld->texel_type,
bld->base_ptr, offset,
i, j,
bld->cache,
colors_out);
if (out_of_bound_ret_zero) {
@ -2442,6 +2444,7 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm,
unsigned texture_index,
unsigned sampler_index,
LLVMValueRef context_ptr,
LLVMValueRef thread_data_ptr,
const LLVMValueRef *coords,
const LLVMValueRef *offsets,
const struct lp_derivatives *derivs, /* optional */
@ -2707,6 +2710,11 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm,
context_ptr, texture_index);
/* Note that mip_offsets is an array[level] of offsets to texture images */
if (dynamic_state->cache_ptr && thread_data_ptr) {
bld.cache = dynamic_state->cache_ptr(dynamic_state, gallivm,
thread_data_ptr, texture_index);
}
/* width, height, depth as single int vector */
if (dims <= 1) {
bld.int_size = tex_width;
@ -2883,6 +2891,7 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm,
bld4.base_ptr = bld.base_ptr;
bld4.mip_offsets = bld.mip_offsets;
bld4.int_size = bld.int_size;
bld4.cache = bld.cache;
bld4.vector_width = lp_type_width(type4);
@ -3081,12 +3090,14 @@ lp_build_sample_gen_func(struct gallivm_state *gallivm,
LLVMValueRef offsets[3] = { NULL };
LLVMValueRef lod = NULL;
LLVMValueRef context_ptr;
LLVMValueRef thread_data_ptr = NULL;
LLVMValueRef texel_out[4];
struct lp_derivatives derivs;
struct lp_derivatives *deriv_ptr = NULL;
unsigned num_param = 0;
unsigned i, num_coords, num_derivs, num_offsets, layer;
enum lp_sampler_lod_control lod_control;
boolean need_cache = FALSE;
lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >>
LP_SAMPLER_LOD_CONTROL_SHIFT;
@ -3094,8 +3105,19 @@ lp_build_sample_gen_func(struct gallivm_state *gallivm,
get_target_info(static_texture_state->target,
&num_coords, &num_derivs, &num_offsets, &layer);
if (dynamic_state->cache_ptr) {
const struct util_format_description *format_desc;
format_desc = util_format_description(static_texture_state->format);
if (format_desc && format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
need_cache = TRUE;
}
}
/* "unpack" arguments */
context_ptr = LLVMGetParam(function, num_param++);
if (need_cache) {
thread_data_ptr = LLVMGetParam(function, num_param++);
}
for (i = 0; i < num_coords; i++) {
coords[i] = LLVMGetParam(function, num_param++);
}
@ -3146,6 +3168,7 @@ lp_build_sample_gen_func(struct gallivm_state *gallivm,
texture_index,
sampler_index,
context_ptr,
thread_data_ptr,
coords,
offsets,
deriv_ptr,
@ -3189,6 +3212,7 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm,
const LLVMValueRef *offsets = params->offsets;
const struct lp_derivatives *derivs = params->derivs;
enum lp_sampler_lod_control lod_control;
boolean need_cache = FALSE;
lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >>
LP_SAMPLER_LOD_CONTROL_SHIFT;
@ -3196,6 +3220,17 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm,
get_target_info(static_texture_state->target,
&num_coords, &num_derivs, &num_offsets, &layer);
if (dynamic_state->cache_ptr) {
const struct util_format_description *format_desc;
format_desc = util_format_description(static_texture_state->format);
if (format_desc && format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
/*
* This is not 100% correct, if we have cache but the
* util_format_s3tc_prefer is true the cache won't get used
* regardless (could hook up the block decode there...) */
need_cache = TRUE;
}
}
/*
* texture function matches are found by name.
* Thus the name has to include both the texture and sampler unit
@ -3221,6 +3256,9 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm,
*/
arg_types[num_param++] = LLVMTypeOf(params->context_ptr);
if (need_cache) {
arg_types[num_param++] = LLVMTypeOf(params->thread_data_ptr);
}
for (i = 0; i < num_coords; i++) {
arg_types[num_param++] = LLVMTypeOf(coords[0]);
assert(LLVMTypeOf(coords[0]) == LLVMTypeOf(coords[i]));
@ -3280,6 +3318,9 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm,
num_args = 0;
args[num_args++] = params->context_ptr;
if (need_cache) {
args[num_args++] = params->thread_data_ptr;
}
for (i = 0; i < num_coords; i++) {
args[num_args++] = coords[i];
}
@ -3384,6 +3425,7 @@ lp_build_sample_soa(const struct lp_static_texture_state *static_texture_state,
params->texture_index,
params->sampler_index,
params->context_ptr,
params->thread_data_ptr,
params->coords,
params->offsets,
params->derivs,

View File

@ -230,6 +230,7 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
const LLVMValueRef (*inputs)[4],
LLVMValueRef (*outputs)[4],
LLVMValueRef context_ptr,
LLVMValueRef thread_data_ptr,
struct lp_build_sampler_soa *sampler,
const struct tgsi_shader_info *info,
const struct lp_build_tgsi_gs_iface *gs_iface);
@ -447,6 +448,7 @@ struct lp_build_tgsi_soa_context
const LLVMValueRef (*inputs)[TGSI_NUM_CHANNELS];
LLVMValueRef (*outputs)[TGSI_NUM_CHANNELS];
LLVMValueRef context_ptr;
LLVMValueRef thread_data_ptr;
const struct lp_build_sampler_soa *sampler;

View File

@ -2321,6 +2321,7 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
params.texture_index = unit;
params.sampler_index = unit;
params.context_ptr = bld->context_ptr;
params.thread_data_ptr = bld->thread_data_ptr;
params.coords = coords;
params.offsets = offsets;
params.lod = lod;
@ -2488,6 +2489,7 @@ emit_sample(struct lp_build_tgsi_soa_context *bld,
params.texture_index = texture_unit;
params.sampler_index = sampler_unit;
params.context_ptr = bld->context_ptr;
params.thread_data_ptr = bld->thread_data_ptr;
params.coords = coords;
params.offsets = offsets;
params.lod = lod;
@ -2608,6 +2610,7 @@ emit_fetch_texels( struct lp_build_tgsi_soa_context *bld,
params.texture_index = unit;
params.sampler_index = unit;
params.context_ptr = bld->context_ptr;
params.thread_data_ptr = bld->thread_data_ptr;
params.coords = coords;
params.offsets = offsets;
params.derivs = NULL;
@ -3858,6 +3861,7 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
const LLVMValueRef (*inputs)[TGSI_NUM_CHANNELS],
LLVMValueRef (*outputs)[TGSI_NUM_CHANNELS],
LLVMValueRef context_ptr,
LLVMValueRef thread_data_ptr,
struct lp_build_sampler_soa *sampler,
const struct tgsi_shader_info *info,
const struct lp_build_tgsi_gs_iface *gs_iface)
@ -3893,6 +3897,7 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
bld.bld_base.info = info;
bld.indirect_files = info->indirect_files;
bld.context_ptr = context_ptr;
bld.thread_data_ptr = thread_data_ptr;
/*
* If the number of temporaries is rather large then we just

View File

@ -33,6 +33,58 @@
#include "util/u_memory.h"
#include <stdio.h>
#include <inttypes.h>
#ifdef PIPE_OS_WINDOWS
#include <windows.h>
#endif
#ifdef PIPE_OS_WINDOWS
static inline uint64_t
filetime_to_scalar(FILETIME ft)
{
ULARGE_INTEGER uli;
uli.LowPart = ft.dwLowDateTime;
uli.HighPart = ft.dwHighDateTime;
return uli.QuadPart;
}
static boolean
get_cpu_stats(unsigned cpu_index, uint64_t *busy_time, uint64_t *total_time)
{
SYSTEM_INFO sysInfo;
FILETIME ftNow, ftCreation, ftExit, ftKernel, ftUser;
GetSystemInfo(&sysInfo);
assert(sysInfo.dwNumberOfProcessors >= 1);
if (cpu_index != ALL_CPUS && cpu_index >= sysInfo.dwNumberOfProcessors) {
/* Tell hud_get_num_cpus there are only this many CPUs. */
return FALSE;
}
/* Get accumulated user and sys time for all threads */
if (!GetProcessTimes(GetCurrentProcess(), &ftCreation, &ftExit,
&ftKernel, &ftUser))
return FALSE;
GetSystemTimeAsFileTime(&ftNow);
*busy_time = filetime_to_scalar(ftUser) + filetime_to_scalar(ftKernel);
*total_time = filetime_to_scalar(ftNow) - filetime_to_scalar(ftCreation);
/* busy_time already has the time across all cpus.
* XXX: if we want 100% to mean one CPU, 200% two cpus, eliminate the
* following line.
*/
*total_time *= sysInfo.dwNumberOfProcessors;
/* XXX: we ignore cpu_index, i.e, we assume that the individual CPU usage
* and the system usage are one and the same.
*/
return TRUE;
}
#else
static boolean
get_cpu_stats(unsigned cpu_index, uint64_t *busy_time, uint64_t *total_time)
@ -81,6 +133,8 @@ get_cpu_stats(unsigned cpu_index, uint64_t *busy_time, uint64_t *total_time)
fclose(f);
return FALSE;
}
#endif
struct cpu_info {
unsigned cpu_index;

View File

@ -68,17 +68,18 @@ static void translate_memcpy_uint( const void *in,
* \param out_nr returns number of new vertices
* \param out_translate returns the translation function to use by the caller
*/
int u_index_translator( unsigned hw_mask,
unsigned prim,
unsigned in_index_size,
unsigned nr,
unsigned in_pv,
unsigned out_pv,
unsigned prim_restart,
unsigned *out_prim,
unsigned *out_index_size,
unsigned *out_nr,
u_translate_func *out_translate )
enum indices_mode
u_index_translator(unsigned hw_mask,
unsigned prim,
unsigned in_index_size,
unsigned nr,
unsigned in_pv,
unsigned out_pv,
unsigned prim_restart,
unsigned *out_prim,
unsigned *out_index_size,
unsigned *out_nr,
u_translate_func *out_translate)
{
unsigned in_idx;
unsigned out_idx;
@ -204,17 +205,17 @@ int u_index_translator( unsigned hw_mask,
* \param out_nr returns new number of vertices to draw
* \param out_generate returns pointer to the generator function
*/
int u_index_generator( unsigned hw_mask,
unsigned prim,
unsigned start,
unsigned nr,
unsigned in_pv,
unsigned out_pv,
unsigned *out_prim,
unsigned *out_index_size,
unsigned *out_nr,
u_generate_func *out_generate )
enum indices_mode
u_index_generator(unsigned hw_mask,
unsigned prim,
unsigned start,
unsigned nr,
unsigned in_pv,
unsigned out_pv,
unsigned *out_prim,
unsigned *out_index_size,
unsigned *out_nr,
u_generate_func *out_generate)
{
unsigned out_idx;

View File

@ -67,66 +67,68 @@ typedef void (*u_generate_func)( unsigned start,
/* Return codes describe the translate/generate operation. Caller may
* be able to reuse translated indices under some circumstances.
*/
#define U_TRANSLATE_ERROR -1
#define U_TRANSLATE_NORMAL 1
#define U_TRANSLATE_MEMCPY 2
#define U_GENERATE_LINEAR 3
#define U_GENERATE_REUSABLE 4
#define U_GENERATE_ONE_OFF 5
enum indices_mode {
U_TRANSLATE_ERROR = -1,
U_TRANSLATE_NORMAL = 1,
U_TRANSLATE_MEMCPY = 2,
U_GENERATE_LINEAR = 3,
U_GENERATE_REUSABLE= 4,
U_GENERATE_ONE_OFF = 5,
};
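/*
 * Illustrative only (hypothetical caller, not part of this header): translate
 * a ushort quad list for hardware that can only draw triangles, then act on
 * the returned indices_mode.  The hw_mask value and nr_indices are made up
 * for the example; PV_FIRST is the provoking-vertex constant defined earlier
 * in this header.
 */
#if 0
   unsigned out_prim, out_index_size, out_nr;
   u_translate_func translate;
   enum indices_mode mode =
      u_index_translator(1 << PIPE_PRIM_TRIANGLES,    /* hw_mask */
                         PIPE_PRIM_QUADS, 2, nr_indices,
                         PV_FIRST, PV_FIRST, 0 /* prim_restart */,
                         &out_prim, &out_index_size, &out_nr, &translate);
   if (mode == U_TRANSLATE_ERROR)
      return;   /* combination not supported */
   if (mode != U_TRANSLATE_MEMCPY) {
      /* allocate out_nr indices of out_index_size bytes and run translate() */
   }
#endif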
void u_index_init( void );
int u_index_translator( unsigned hw_mask,
unsigned prim,
unsigned in_index_size,
unsigned nr,
unsigned in_pv, /* API */
unsigned out_pv, /* hardware */
unsigned prim_restart,
unsigned *out_prim,
unsigned *out_index_size,
unsigned *out_nr,
u_translate_func *out_translate );
enum indices_mode
u_index_translator(unsigned hw_mask,
unsigned prim,
unsigned in_index_size,
unsigned nr,
unsigned in_pv, /* API */
unsigned out_pv, /* hardware */
unsigned prim_restart,
unsigned *out_prim,
unsigned *out_index_size,
unsigned *out_nr,
u_translate_func *out_translate);
/* Note that even when generating it is necessary to know what the
* API's PV is, as the indices generated will depend on whether it is
* the same as hardware or not, and in the case of triangle strips,
* whether it is first or last.
*/
int u_index_generator( unsigned hw_mask,
unsigned prim,
unsigned start,
unsigned nr,
unsigned in_pv, /* API */
unsigned out_pv, /* hardware */
unsigned *out_prim,
unsigned *out_index_size,
unsigned *out_nr,
u_generate_func *out_generate );
enum indices_mode
u_index_generator(unsigned hw_mask,
unsigned prim,
unsigned start,
unsigned nr,
unsigned in_pv, /* API */
unsigned out_pv, /* hardware */
unsigned *out_prim,
unsigned *out_index_size,
unsigned *out_nr,
u_generate_func *out_generate);
void u_unfilled_init( void );
int u_unfilled_translator( unsigned prim,
unsigned in_index_size,
unsigned nr,
unsigned unfilled_mode,
unsigned *out_prim,
unsigned *out_index_size,
unsigned *out_nr,
u_translate_func *out_translate );
int u_unfilled_generator( unsigned prim,
unsigned start,
unsigned nr,
unsigned unfilled_mode,
unsigned *out_prim,
unsigned *out_index_size,
unsigned *out_nr,
u_generate_func *out_generate );
enum indices_mode
u_unfilled_translator(unsigned prim,
unsigned in_index_size,
unsigned nr,
unsigned unfilled_mode,
unsigned *out_prim,
unsigned *out_index_size,
unsigned *out_nr,
u_translate_func *out_translate);
enum indices_mode
u_unfilled_generator(unsigned prim,
unsigned start,
unsigned nr,
unsigned unfilled_mode,
unsigned *out_prim,
unsigned *out_index_size,
unsigned *out_nr,
u_generate_func *out_generate);
#endif

View File

@ -111,14 +111,15 @@ static unsigned nr_lines( unsigned prim,
int u_unfilled_translator( unsigned prim,
unsigned in_index_size,
unsigned nr,
unsigned unfilled_mode,
unsigned *out_prim,
unsigned *out_index_size,
unsigned *out_nr,
u_translate_func *out_translate )
enum indices_mode
u_unfilled_translator(unsigned prim,
unsigned in_index_size,
unsigned nr,
unsigned unfilled_mode,
unsigned *out_prim,
unsigned *out_index_size,
unsigned *out_nr,
u_translate_func *out_translate)
{
unsigned in_idx;
unsigned out_idx;
@ -170,14 +171,15 @@ int u_unfilled_translator( unsigned prim,
* different front/back fill modes, that can be handled with the
* 'draw' module.
*/
int u_unfilled_generator( unsigned prim,
unsigned start,
unsigned nr,
unsigned unfilled_mode,
unsigned *out_prim,
unsigned *out_index_size,
unsigned *out_nr,
u_generate_func *out_generate )
enum indices_mode
u_unfilled_generator(unsigned prim,
unsigned start,
unsigned nr,
unsigned unfilled_mode,
unsigned *out_prim,
unsigned *out_index_size,
unsigned *out_nr,
u_generate_func *out_generate)
{
unsigned out_idx;

View File

@ -95,6 +95,7 @@ const char *tgsi_semantic_names[TGSI_SEMANTIC_COUNT] =
"TESSOUTER",
"TESSINNER",
"VERTICESIN",
"HELPER_INVOCATION",
};
const char *tgsi_texture_names[TGSI_TEXTURE_COUNT] =

View File

@ -70,7 +70,7 @@ struct blitter_context_priv
/* Constant state objects. */
/* Vertex shaders. */
void *vs; /**< Vertex shader which passes {pos, generic} to the output.*/
void *vs_pos_only; /**< Vertex shader which passes pos to the output.*/
void *vs_pos_only[4]; /**< Vertex shader which passes pos to the output.*/
void *vs_layered; /**< Vertex shader which sets LAYER = INSTANCEID. */
/* Fragment shaders. */
@ -325,27 +325,29 @@ struct blitter_context *util_blitter_create(struct pipe_context *pipe)
return &ctx->base;
}
static void bind_vs_pos_only(struct blitter_context_priv *ctx)
static void bind_vs_pos_only(struct blitter_context_priv *ctx,
unsigned num_so_channels)
{
struct pipe_context *pipe = ctx->base.pipe;
int index = num_so_channels ? num_so_channels - 1 : 0;
if (!ctx->vs_pos_only) {
if (!ctx->vs_pos_only[index]) {
struct pipe_stream_output_info so;
const uint semantic_names[] = { TGSI_SEMANTIC_POSITION };
const uint semantic_indices[] = { 0 };
memset(&so, 0, sizeof(so));
so.num_outputs = 1;
so.output[0].num_components = 1;
so.stride[0] = 1;
so.output[0].num_components = num_so_channels;
so.stride[0] = num_so_channels;
ctx->vs_pos_only =
ctx->vs_pos_only[index] =
util_make_vertex_passthrough_shader_with_so(pipe, 1, semantic_names,
semantic_indices, FALSE,
&so);
}
pipe->bind_vs_state(pipe, ctx->vs_pos_only);
pipe->bind_vs_state(pipe, ctx->vs_pos_only[index]);
}
static void bind_vs_passthrough(struct blitter_context_priv *ctx)
@ -441,8 +443,9 @@ void util_blitter_destroy(struct blitter_context *blitter)
pipe->delete_rasterizer_state(pipe, ctx->rs_discard_state);
if (ctx->vs)
pipe->delete_vs_state(pipe, ctx->vs);
if (ctx->vs_pos_only)
pipe->delete_vs_state(pipe, ctx->vs_pos_only);
for (i = 0; i < 4; i++)
if (ctx->vs_pos_only[i])
pipe->delete_vs_state(pipe, ctx->vs_pos_only[i]);
if (ctx->vs_layered)
pipe->delete_vs_state(pipe, ctx->vs_layered);
pipe->delete_vertex_elements_state(pipe, ctx->velem_state);
@ -2036,7 +2039,7 @@ void util_blitter_copy_buffer(struct blitter_context *blitter,
pipe->set_vertex_buffers(pipe, ctx->base.vb_slot, 1, &vb);
pipe->bind_vertex_elements_state(pipe, ctx->velem_state_readbuf[0]);
bind_vs_pos_only(ctx);
bind_vs_pos_only(ctx, 1);
if (ctx->has_geometry_shader)
pipe->bind_gs_state(pipe, NULL);
if (ctx->has_tessellation) {
@ -2103,7 +2106,7 @@ void util_blitter_clear_buffer(struct blitter_context *blitter,
pipe->set_vertex_buffers(pipe, ctx->base.vb_slot, 1, &vb);
pipe->bind_vertex_elements_state(pipe,
ctx->velem_state_readbuf[num_channels-1]);
bind_vs_pos_only(ctx);
bind_vs_pos_only(ctx, num_channels);
if (ctx->has_geometry_shader)
pipe->bind_gs_state(pipe, NULL);
if (ctx->has_tessellation) {

View File

@ -70,6 +70,20 @@ void _debug_vprintf(const char *format, va_list ap)
#endif
}
void
_pipe_debug_message(
struct pipe_debug_callback *cb,
unsigned *id,
enum pipe_debug_type type,
const char *fmt, ...)
{
va_list args;
va_start(args, fmt);
if (cb && cb->debug_message)
cb->debug_message(cb->data, id, type, fmt, args);
va_end(args);
}
void
debug_disable_error_message_boxes(void)

View File

@ -42,6 +42,7 @@
#include "os/os_misc.h"
#include "pipe/p_format.h"
#include "pipe/p_defines.h"
#ifdef __cplusplus
@ -262,6 +263,25 @@ void _debug_assert_fail(const char *expr,
_debug_printf("error: %s\n", __msg)
#endif
/**
* Output a debug log message to the debug info callback.
*/
#define pipe_debug_message(cb, type, fmt, ...) do { \
static unsigned id = 0; \
_pipe_debug_message(cb, &id, \
PIPE_DEBUG_TYPE_ ## type, \
fmt, __VA_ARGS__); \
} while (0)
struct pipe_debug_callback;
void
_pipe_debug_message(
struct pipe_debug_callback *cb,
unsigned *id,
enum pipe_debug_type type,
const char *fmt, ...) _util_printf_format(4, 5);
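/*
 * Example use (hypothetical driver code; assumes "ctx->debug" is the
 * struct pipe_debug_callback installed via the context's set_debug_callback
 * and that PIPE_DEBUG_TYPE_PERF_INFO is one of the pipe_debug_type values):
 *
 *    pipe_debug_message(&ctx->debug, PERF_INFO,
 *                       "%s: falling back to a software path", __func__);
 */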
/**
* Used by debug_dump_enum and debug_dump_flags to describe symbols.

View File

@ -998,26 +998,30 @@ u_vbuf_upload_buffers(struct u_vbuf *mgr,
return PIPE_OK;
}
static boolean u_vbuf_need_minmax_index(struct u_vbuf *mgr)
static boolean u_vbuf_need_minmax_index(const struct u_vbuf *mgr)
{
/* See if there are any per-vertex attribs which will be uploaded or
* translated. Use bitmasks to get the info instead of looping over vertex
* elements. */
return (mgr->ve->used_vb_mask &
((mgr->user_vb_mask | mgr->incompatible_vb_mask |
((mgr->user_vb_mask |
mgr->incompatible_vb_mask |
mgr->ve->incompatible_vb_mask_any) &
mgr->ve->noninstance_vb_mask_any & mgr->nonzero_stride_vb_mask)) != 0;
mgr->ve->noninstance_vb_mask_any &
mgr->nonzero_stride_vb_mask)) != 0;
}
static boolean u_vbuf_mapping_vertex_buffer_blocks(struct u_vbuf *mgr)
static boolean u_vbuf_mapping_vertex_buffer_blocks(const struct u_vbuf *mgr)
{
/* Return true if there are hw buffers which don't need to be translated.
*
* We could query whether each buffer is busy, but that would
* be way more costly than this. */
return (mgr->ve->used_vb_mask &
(~mgr->user_vb_mask & ~mgr->incompatible_vb_mask &
mgr->ve->compatible_vb_mask_all & mgr->ve->noninstance_vb_mask_any &
(~mgr->user_vb_mask &
~mgr->incompatible_vb_mask &
mgr->ve->compatible_vb_mask_all &
mgr->ve->noninstance_vb_mask_any &
mgr->nonzero_stride_vb_mask)) != 0;
}

View File

@ -62,6 +62,18 @@ const enum pipe_format const_resource_formats_VUYA[3] = {
PIPE_FORMAT_NONE
};
const enum pipe_format const_resource_formats_YUVX[3] = {
PIPE_FORMAT_R8G8B8X8_UNORM,
PIPE_FORMAT_NONE,
PIPE_FORMAT_NONE
};
const enum pipe_format const_resource_formats_VUYX[3] = {
PIPE_FORMAT_B8G8R8X8_UNORM,
PIPE_FORMAT_NONE,
PIPE_FORMAT_NONE
};
const enum pipe_format const_resource_formats_YUYV[3] = {
PIPE_FORMAT_R8G8_R8B8_UNORM,
PIPE_FORMAT_NONE,
@ -102,6 +114,12 @@ vl_video_buffer_formats(struct pipe_screen *screen, enum pipe_format format)
case PIPE_FORMAT_B8G8R8A8_UNORM:
return const_resource_formats_VUYA;
case PIPE_FORMAT_R8G8B8X8_UNORM:
return const_resource_formats_YUVX;
case PIPE_FORMAT_B8G8R8X8_UNORM:
return const_resource_formats_VUYX;
case PIPE_FORMAT_YUYV:
return const_resource_formats_YUYV;

View File

@ -66,4 +66,10 @@ vl_screen_set_next_timestamp(struct vl_screen *vscreen, uint64_t stamp);
void*
vl_screen_get_private(struct vl_screen *vscreen);
struct vl_screen*
vl_drm_screen_create(int fd);
void
vl_drm_screen_destroy(struct vl_screen *vscreen);
#endif

View File

@ -0,0 +1,77 @@
/**************************************************************************
*
* Copyright 2015 Advanced Micro Devices, Inc.
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sub license, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice (including the
* next paragraph) shall be included in all copies or substantial portions
* of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
* IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
**************************************************************************/
#include <assert.h>
#include "pipe/p_screen.h"
#include "pipe-loader/pipe_loader.h"
#include "state_tracker/drm_driver.h"
#include "util/u_memory.h"
#include "vl/vl_winsys.h"
struct vl_screen*
vl_drm_screen_create(int fd)
{
struct vl_screen *vscreen;
vscreen = CALLOC_STRUCT(vl_screen);
if (!vscreen)
return NULL;
#if GALLIUM_STATIC_TARGETS
vscreen->pscreen = dd_create_screen(fd);
#else
if (pipe_loader_drm_probe_fd(&vscreen->dev, dup(fd))) {
vscreen->pscreen =
pipe_loader_create_screen(vscreen->dev, PIPE_SEARCH_DIR);
if (!vscreen->pscreen)
pipe_loader_release(&vscreen->dev, 1);
}
#endif
if (!vscreen->pscreen) {
FREE(vscreen);
return NULL;
}
return vscreen;
}
void
vl_drm_screen_destroy(struct vl_screen *vscreen)
{
assert(vscreen);
vscreen->pscreen->destroy(vscreen->pscreen);
#if !GALLIUM_STATIC_TARGETS
pipe_loader_release(&vscreen->dev, 1);
#endif
FREE(vscreen);
}
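
/* For reference, a caller-side sketch of how these two helpers are meant to
 * be paired (hypothetical usage; the device path, error handling and fd
 * ownership shown here are assumptions, not part of this change -- the real
 * VDPAU/VA-API state trackers wire this up slightly differently). */
#include <fcntl.h>
#include <unistd.h>
#include "vl/vl_winsys.h"
static void
example_use_vl_drm_screen(void)
{
   /* Open a DRM render node and wrap it in a vl_screen. */
   int fd = open("/dev/dri/renderD128", O_RDWR | O_CLOEXEC);
   if (fd < 0)
      return;
   struct vl_screen *vscreen = vl_drm_screen_create(fd);
   if (!vscreen) {
      close(fd);
      return;
   }
   /* vscreen->pscreen is a regular pipe_screen; contexts and video
    * buffers are created from it as usual. */
   vl_drm_screen_destroy(vscreen);   /* also destroys the pipe_screen */
   close(fd);                        /* the fd opened above stays with the caller */
}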

View File

@ -84,6 +84,9 @@ objects. They all follow simple, one-method binding calls, e.g.
levels. This corresponds to GL's ``PATCH_DEFAULT_OUTER_LEVEL``.
* ``default_inner_level`` is the default value for the inner tessellation
levels. This corresponds to GL's ``PATCH_DEFAULT_INNER_LEVEL``.
* ``set_debug_callback`` sets the callback to be used for reporting
various debug messages, eventually reported via KHR_debug and
similar mechanisms.
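A minimal driver-side sketch of the expected wiring (it mirrors the nouveau
change later in this commit; the ``example_*`` names are placeholders, not
real Mesa symbols)::

   static void
   example_set_debug_callback(struct pipe_context *pipe,
                              const struct pipe_debug_callback *cb)
   {
      struct example_context *ctx = example_context(pipe);

      /* Copy the callback; passing NULL disables reporting again. */
      if (cb)
         ctx->debug = *cb;
      else
         memset(&ctx->debug, 0, sizeof(ctx->debug));
   }

   /* Later, e.g. after compiling a shader: */
   pipe_debug_message(&ctx->debug, SHADER_INFO,
                      "compiled shader: %u instructions", num_instructions);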
Sampler Views
@ -224,6 +227,10 @@ is is also possible to only clear one or the other part). While it is only
possible to clear one surface at a time (which can include several layers),
this surface need not be bound to the framebuffer.
``clear_texture`` clears a non-PIPE_BUFFER resource's specified level
and bounding box with a clear value provided in that resource's native
format.
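As a usage sketch (hypothetical caller; ``tex`` and the clear value are made
up, ``u_box_2d`` is the existing util helper)::

   /* Clear all of level 0 of an RGBA8 texture to opaque red, passing the
    * clear value in the resource's native format. */
   const uint8_t red[4] = { 0xff, 0x00, 0x00, 0xff };
   struct pipe_box box;

   u_box_2d(0, 0, tex->width0, tex->height0, &box);
   pipe->clear_texture(pipe, tex, 0 /* level */, &box, red);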
``clear_buffer`` clears a PIPE_BUFFER resource with the specified clear value
(which may be multiple bytes in length). Logically this is a memset with a
multi-byte element value starting at offset bytes from resource start, going

View File

@ -281,6 +281,8 @@ The integer capabilities:
* ``PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS``:
Whether copying between compressed and plain formats is supported where
a compressed block is copied to/from a plain pixel of the same size.
* ``PIPE_CAP_CLEAR_TEXTURE``: Whether `clear_texture` will be
available in contexts.
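Callers are expected to gate the new ``clear_texture`` entry point on this
capability; roughly (sketch, the fallback path is an assumption)::

   if (screen->get_param(screen, PIPE_CAP_CLEAR_TEXTURE)) {
      pipe->clear_texture(pipe, res, level, &box, data);
   } else {
      /* fall back to mapping the texture and writing the value on the CPU */
   }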
.. _pipe_capf:

View File

@ -2941,6 +2941,14 @@ TGSI_SEMANTIC_VERTICESIN
For tessellation evaluation/control shaders, this semantic label indicates the
number of vertices provided in the input patch. Only the X value is defined.
TGSI_SEMANTIC_HELPER_INVOCATION
"""""""""""""""""""""""""""""""
For fragment shaders, this semantic indicates whether the current
invocation is covered or not. Helper invocations are created in order
to properly compute derivatives; however, it may be desirable to skip
some of the logic in those cases. See ``gl_HelperInvocation`` documentation.
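For a Gallium driver this is typically consumed as just another system
value; as an illustration, the nouveau backend later in this commit maps it
as follows (sketch of that hunk, with nouveau's internal name shown)::

   /* tgsi -> driver system-value translation */
   case TGSI_SEMANTIC_HELPER_INVOCATION:
      return SV_THREAD_KILL;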
Declaration Interpolate
^^^^^^^^^^^^^^^^^^^^^^^

View File

@ -8,13 +8,14 @@ http://github.com/freedreno/envytools/
git clone https://github.com/freedreno/envytools.git
The rules-ng-ng source files this header was generated from are:
- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 364 bytes, from 2015-05-20 20:03:07)
- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 398 bytes, from 2015-09-24 17:25:31)
- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1453 bytes, from 2015-05-20 20:03:07)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10755 bytes, from 2015-09-14 20:46:55)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 67771 bytes, from 2015-09-14 20:46:55)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63970 bytes, from 2015-09-14 20:50:12)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63914 bytes, from 2015-10-27 17:13:16)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml ( 1773 bytes, from 2015-09-24 17:30:00)
Copyright (C) 2013-2015 by the following authors:
- Rob Clark <robdclark@gmail.com> (robclark)

View File

@ -8,13 +8,14 @@ http://github.com/freedreno/envytools/
git clone https://github.com/freedreno/envytools.git
The rules-ng-ng source files this header was generated from are:
- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 364 bytes, from 2015-05-20 20:03:07)
- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 398 bytes, from 2015-09-24 17:25:31)
- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1453 bytes, from 2015-05-20 20:03:07)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10755 bytes, from 2015-09-14 20:46:55)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 67771 bytes, from 2015-09-14 20:46:55)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63970 bytes, from 2015-09-14 20:50:12)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63914 bytes, from 2015-10-27 17:13:16)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml ( 1773 bytes, from 2015-09-24 17:30:00)
Copyright (C) 2013-2015 by the following authors:
- Rob Clark <robdclark@gmail.com> (robclark)

View File

@ -8,13 +8,14 @@ http://github.com/freedreno/envytools/
git clone https://github.com/freedreno/envytools.git
The rules-ng-ng source files this header was generated from are:
- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 364 bytes, from 2015-05-20 20:03:07)
- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 398 bytes, from 2015-09-24 17:25:31)
- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1453 bytes, from 2015-05-20 20:03:07)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10755 bytes, from 2015-09-14 20:46:55)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 67771 bytes, from 2015-09-14 20:46:55)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63970 bytes, from 2015-09-14 20:50:12)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63914 bytes, from 2015-10-27 17:13:16)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml ( 1773 bytes, from 2015-09-24 17:30:00)
Copyright (C) 2013-2015 by the following authors:
- Rob Clark <robdclark@gmail.com> (robclark)
@ -489,8 +490,8 @@ static inline uint32_t A4XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR(enum adreno_r
return ((val) << A4XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR__SHIFT) & A4XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR__MASK;
}
#define REG_A4XX_RB_BLEND_RED 0x000020f3
#define A4XX_RB_BLEND_RED_UINT__MASK 0x00007fff
#define REG_A4XX_RB_BLEND_RED 0x000020f0
#define A4XX_RB_BLEND_RED_UINT__MASK 0x0000ffff
#define A4XX_RB_BLEND_RED_UINT__SHIFT 0
static inline uint32_t A4XX_RB_BLEND_RED_UINT(uint32_t val)
{
@ -503,8 +504,16 @@ static inline uint32_t A4XX_RB_BLEND_RED_FLOAT(float val)
return ((util_float_to_half(val)) << A4XX_RB_BLEND_RED_FLOAT__SHIFT) & A4XX_RB_BLEND_RED_FLOAT__MASK;
}
#define REG_A4XX_RB_BLEND_GREEN 0x000020f4
#define A4XX_RB_BLEND_GREEN_UINT__MASK 0x00007fff
#define REG_A4XX_RB_BLEND_RED_F32 0x000020f1
#define A4XX_RB_BLEND_RED_F32__MASK 0xffffffff
#define A4XX_RB_BLEND_RED_F32__SHIFT 0
static inline uint32_t A4XX_RB_BLEND_RED_F32(float val)
{
return ((fui(val)) << A4XX_RB_BLEND_RED_F32__SHIFT) & A4XX_RB_BLEND_RED_F32__MASK;
}
#define REG_A4XX_RB_BLEND_GREEN 0x000020f2
#define A4XX_RB_BLEND_GREEN_UINT__MASK 0x0000ffff
#define A4XX_RB_BLEND_GREEN_UINT__SHIFT 0
static inline uint32_t A4XX_RB_BLEND_GREEN_UINT(uint32_t val)
{
@ -517,8 +526,16 @@ static inline uint32_t A4XX_RB_BLEND_GREEN_FLOAT(float val)
return ((util_float_to_half(val)) << A4XX_RB_BLEND_GREEN_FLOAT__SHIFT) & A4XX_RB_BLEND_GREEN_FLOAT__MASK;
}
#define REG_A4XX_RB_BLEND_BLUE 0x000020f5
#define A4XX_RB_BLEND_BLUE_UINT__MASK 0x00007fff
#define REG_A4XX_RB_BLEND_GREEN_F32 0x000020f3
#define A4XX_RB_BLEND_GREEN_F32__MASK 0xffffffff
#define A4XX_RB_BLEND_GREEN_F32__SHIFT 0
static inline uint32_t A4XX_RB_BLEND_GREEN_F32(float val)
{
return ((fui(val)) << A4XX_RB_BLEND_GREEN_F32__SHIFT) & A4XX_RB_BLEND_GREEN_F32__MASK;
}
#define REG_A4XX_RB_BLEND_BLUE 0x000020f4
#define A4XX_RB_BLEND_BLUE_UINT__MASK 0x0000ffff
#define A4XX_RB_BLEND_BLUE_UINT__SHIFT 0
static inline uint32_t A4XX_RB_BLEND_BLUE_UINT(uint32_t val)
{
@ -531,8 +548,16 @@ static inline uint32_t A4XX_RB_BLEND_BLUE_FLOAT(float val)
return ((util_float_to_half(val)) << A4XX_RB_BLEND_BLUE_FLOAT__SHIFT) & A4XX_RB_BLEND_BLUE_FLOAT__MASK;
}
#define REG_A4XX_RB_BLEND_BLUE_F32 0x000020f5
#define A4XX_RB_BLEND_BLUE_F32__MASK 0xffffffff
#define A4XX_RB_BLEND_BLUE_F32__SHIFT 0
static inline uint32_t A4XX_RB_BLEND_BLUE_F32(float val)
{
return ((fui(val)) << A4XX_RB_BLEND_BLUE_F32__SHIFT) & A4XX_RB_BLEND_BLUE_F32__MASK;
}
#define REG_A4XX_RB_BLEND_ALPHA 0x000020f6
#define A4XX_RB_BLEND_ALPHA_UINT__MASK 0x00007fff
#define A4XX_RB_BLEND_ALPHA_UINT__MASK 0x0000ffff
#define A4XX_RB_BLEND_ALPHA_UINT__SHIFT 0
static inline uint32_t A4XX_RB_BLEND_ALPHA_UINT(uint32_t val)
{
@ -545,6 +570,14 @@ static inline uint32_t A4XX_RB_BLEND_ALPHA_FLOAT(float val)
return ((util_float_to_half(val)) << A4XX_RB_BLEND_ALPHA_FLOAT__SHIFT) & A4XX_RB_BLEND_ALPHA_FLOAT__MASK;
}
#define REG_A4XX_RB_BLEND_ALPHA_F32 0x000020f7
#define A4XX_RB_BLEND_ALPHA_F32__MASK 0xffffffff
#define A4XX_RB_BLEND_ALPHA_F32__SHIFT 0
static inline uint32_t A4XX_RB_BLEND_ALPHA_F32(float val)
{
return ((fui(val)) << A4XX_RB_BLEND_ALPHA_F32__SHIFT) & A4XX_RB_BLEND_ALPHA_F32__MASK;
}
#define REG_A4XX_RB_ALPHA_CONTROL 0x000020f8
#define A4XX_RB_ALPHA_CONTROL_ALPHA_REF__MASK 0x000000ff
#define A4XX_RB_ALPHA_CONTROL_ALPHA_REF__SHIFT 0
@ -2645,20 +2678,6 @@ static inline uint32_t A4XX_PC_HS_PARAM_PRIMTYPE(enum adreno_pa_su_sc_draw val)
#define REG_A4XX_UNKNOWN_20EF 0x000020ef
#define REG_A4XX_UNKNOWN_20F0 0x000020f0
#define REG_A4XX_UNKNOWN_20F1 0x000020f1
#define REG_A4XX_UNKNOWN_20F2 0x000020f2
#define REG_A4XX_UNKNOWN_20F7 0x000020f7
#define A4XX_UNKNOWN_20F7__MASK 0xffffffff
#define A4XX_UNKNOWN_20F7__SHIFT 0
static inline uint32_t A4XX_UNKNOWN_20F7(float val)
{
return ((fui(val)) << A4XX_UNKNOWN_20F7__SHIFT) & A4XX_UNKNOWN_20F7__MASK;
}
#define REG_A4XX_UNKNOWN_2152 0x00002152
#define REG_A4XX_UNKNOWN_2153 0x00002153

View File

@ -613,15 +613,19 @@ fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
if (dirty & FD_DIRTY_BLEND_COLOR) {
struct pipe_blend_color *bcolor = &ctx->blend_color;
OUT_PKT0(ring, REG_A4XX_RB_BLEND_RED, 4);
OUT_RING(ring, A4XX_RB_BLEND_RED_UINT(bcolor->color[0] * 255.0) |
OUT_PKT0(ring, REG_A4XX_RB_BLEND_RED, 8);
OUT_RING(ring, A4XX_RB_BLEND_RED_UINT(bcolor->color[0] * 65535.0) |
A4XX_RB_BLEND_RED_FLOAT(bcolor->color[0]));
OUT_RING(ring, A4XX_RB_BLEND_GREEN_UINT(bcolor->color[1] * 255.0) |
OUT_RING(ring, A4XX_RB_BLEND_RED_F32(bcolor->color[0]));
OUT_RING(ring, A4XX_RB_BLEND_GREEN_UINT(bcolor->color[1] * 65535.0) |
A4XX_RB_BLEND_GREEN_FLOAT(bcolor->color[1]));
OUT_RING(ring, A4XX_RB_BLEND_BLUE_UINT(bcolor->color[2] * 255.0) |
OUT_RING(ring, A4XX_RB_BLEND_GREEN_F32(bcolor->color[1]));
OUT_RING(ring, A4XX_RB_BLEND_BLUE_UINT(bcolor->color[2] * 65535.0) |
A4XX_RB_BLEND_BLUE_FLOAT(bcolor->color[2]));
OUT_RING(ring, A4XX_RB_BLEND_ALPHA_UINT(bcolor->color[3] * 255.0) |
OUT_RING(ring, A4XX_RB_BLEND_BLUE_F32(bcolor->color[2]));
OUT_RING(ring, A4XX_RB_BLEND_ALPHA_UINT(bcolor->color[3] * 65535.0) |
A4XX_RB_BLEND_ALPHA_FLOAT(bcolor->color[3]));
OUT_RING(ring, A4XX_RB_BLEND_ALPHA_F32(bcolor->color[3]));
}
if (dirty & FD_DIRTY_VERTTEX) {
@ -699,15 +703,6 @@ fd4_emit_restore(struct fd_context *ctx)
OUT_PKT0(ring, REG_A4XX_UNKNOWN_20EF, 1);
OUT_RING(ring, 0x00000000);
OUT_PKT0(ring, REG_A4XX_UNKNOWN_20F0, 1);
OUT_RING(ring, 0x00000000);
OUT_PKT0(ring, REG_A4XX_UNKNOWN_20F1, 1);
OUT_RING(ring, 0x00000000);
OUT_PKT0(ring, REG_A4XX_UNKNOWN_20F2, 1);
OUT_RING(ring, 0x00000000);
OUT_PKT0(ring, REG_A4XX_RB_BLEND_RED, 4);
OUT_RING(ring, A4XX_RB_BLEND_RED_UINT(0) |
A4XX_RB_BLEND_RED_FLOAT(0.0));
@ -718,9 +713,6 @@ fd4_emit_restore(struct fd_context *ctx)
OUT_RING(ring, A4XX_RB_BLEND_ALPHA_UINT(0x7fff) |
A4XX_RB_BLEND_ALPHA_FLOAT(1.0));
OUT_PKT0(ring, REG_A4XX_UNKNOWN_20F7, 1);
OUT_RING(ring, 0x3f800000);
OUT_PKT0(ring, REG_A4XX_UNKNOWN_2152, 1);
OUT_RING(ring, 0x00000000);

View File

@ -8,13 +8,14 @@ http://github.com/freedreno/envytools/
git clone https://github.com/freedreno/envytools.git
The rules-ng-ng source files this header was generated from are:
- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 364 bytes, from 2015-05-20 20:03:07)
- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 398 bytes, from 2015-09-24 17:25:31)
- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1453 bytes, from 2015-05-20 20:03:07)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10755 bytes, from 2015-09-14 20:46:55)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 67771 bytes, from 2015-09-14 20:46:55)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63970 bytes, from 2015-09-14 20:50:12)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63914 bytes, from 2015-10-27 17:13:16)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml ( 1773 bytes, from 2015-09-24 17:30:00)
Copyright (C) 2013-2015 by the following authors:
- Rob Clark <robdclark@gmail.com> (robclark)

View File

@ -8,13 +8,14 @@ http://github.com/freedreno/envytools/
git clone https://github.com/freedreno/envytools.git
The rules-ng-ng source files this header was generated from are:
- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 364 bytes, from 2015-05-20 20:03:07)
- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 398 bytes, from 2015-09-24 17:25:31)
- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1453 bytes, from 2015-05-20 20:03:07)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10755 bytes, from 2015-09-14 20:46:55)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 67771 bytes, from 2015-09-14 20:46:55)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63970 bytes, from 2015-09-14 20:50:12)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63914 bytes, from 2015-10-27 17:13:16)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml ( 1773 bytes, from 2015-09-24 17:30:00)
Copyright (C) 2013-2015 by the following authors:
- Rob Clark <robdclark@gmail.com> (robclark)

View File

@ -239,6 +239,7 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
case PIPE_CAP_SHAREABLE_SHADERS:
case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
case PIPE_CAP_CLEAR_TEXTURE:
return 0;
case PIPE_CAP_MAX_VIEWPORTS:
@ -549,6 +550,7 @@ fd_screen_create(struct fd_device *dev)
case 220:
fd2_screen_init(pscreen);
break;
case 305:
case 307:
case 320:
case 330:

View File

@ -2325,17 +2325,17 @@ emit_instructions(struct ir3_compile *ctx)
}
/* Setup inputs: */
foreach_list_typed(nir_variable, var, node, &ctx->s->inputs) {
nir_foreach_variable(var, &ctx->s->inputs) {
setup_input(ctx, var);
}
/* Setup outputs: */
foreach_list_typed(nir_variable, var, node, &ctx->s->outputs) {
nir_foreach_variable(var, &ctx->s->outputs) {
setup_output(ctx, var);
}
/* Setup variables (which should only be arrays): */
foreach_list_typed(nir_variable, var, node, &ctx->s->globals) {
nir_foreach_variable(var, &ctx->s->globals) {
declare_var(ctx, var);
}

View File

@ -253,6 +253,7 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap cap)
case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
case PIPE_CAP_SHAREABLE_SHADERS:
case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
case PIPE_CAP_CLEAR_TEXTURE:
return 0;
case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:

View File

@ -475,6 +475,7 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param)
case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
case PIPE_CAP_SHAREABLE_SHADERS:
case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
case PIPE_CAP_CLEAR_TEXTURE:
return 0;
case PIPE_CAP_VENDOR_ID:

View File

@ -746,7 +746,12 @@ lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
pos_init(bld, x0, y0);
if (coeff_type.length > 4) {
/*
* Simple method (single step interpolation) may be slower if vector length
* is just 4, but the results are different (generally less accurate) with
* the other method, so always use more accurate version.
*/
if (1) {
bld->simple_interp = TRUE;
{
/* XXX this should use a global static table */

View File

@ -36,6 +36,7 @@
#include "util/u_memory.h"
#include "gallivm/lp_bld_init.h"
#include "gallivm/lp_bld_debug.h"
#include "gallivm/lp_bld_format.h"
#include "lp_context.h"
#include "lp_jit.h"
@ -208,6 +209,8 @@ lp_jit_create_types(struct lp_fragment_shader_variant *lp)
LLVMTypeRef elem_types[LP_JIT_THREAD_DATA_COUNT];
LLVMTypeRef thread_data_type;
elem_types[LP_JIT_THREAD_DATA_CACHE] =
LLVMPointerType(lp_build_format_cache_type(gallivm), 0);
elem_types[LP_JIT_THREAD_DATA_COUNTER] = LLVMInt64TypeInContext(lc);
elem_types[LP_JIT_THREAD_DATA_RASTER_STATE_VIEWPORT_INDEX] =
LLVMInt32TypeInContext(lc);

View File

@ -43,6 +43,7 @@
#include "lp_texture.h"
struct lp_build_format_cache;
struct lp_fragment_shader_variant;
struct llvmpipe_screen;
@ -189,6 +190,7 @@ enum {
struct lp_jit_thread_data
{
struct lp_build_format_cache *cache;
uint64_t vis_counter;
/*
@ -201,12 +203,16 @@ struct lp_jit_thread_data
enum {
LP_JIT_THREAD_DATA_COUNTER = 0,
LP_JIT_THREAD_DATA_CACHE = 0,
LP_JIT_THREAD_DATA_COUNTER,
LP_JIT_THREAD_DATA_RASTER_STATE_VIEWPORT_INDEX,
LP_JIT_THREAD_DATA_COUNT
};
#define lp_jit_thread_data_cache(_gallivm, _ptr) \
lp_build_struct_get(_gallivm, _ptr, LP_JIT_THREAD_DATA_CACHE, "cache")
#define lp_jit_thread_data_counter(_gallivm, _ptr) \
lp_build_struct_get_ptr(_gallivm, _ptr, LP_JIT_THREAD_DATA_COUNTER, "counter")

View File

@ -43,6 +43,7 @@
#include "lp_query.h"
#include "lp_rast.h"
#include "lp_rast_priv.h"
#include "gallivm/lp_bld_format.h"
#include "gallivm/lp_bld_debug.h"
#include "lp_scene.h"
#include "lp_tex_sample.h"
@ -664,6 +665,17 @@ rasterize_scene(struct lp_rasterizer_task *task,
{
task->scene = scene;
/* Clear the cache tags. This should not always be necessary but
simpler for now. */
#if LP_USE_TEXTURE_CACHE
memset(task->thread_data.cache->cache_tags, 0,
sizeof(task->thread_data.cache->cache_tags));
#if LP_BUILD_FORMAT_CACHE_DEBUG
task->thread_data.cache->cache_access_total = 0;
task->thread_data.cache->cache_access_miss = 0;
#endif
#endif
if (!task->rast->no_rast && !scene->discard) {
/* loop over scene bins, rasterize each */
{
@ -679,6 +691,20 @@ rasterize_scene(struct lp_rasterizer_task *task,
}
#if LP_BUILD_FORMAT_CACHE_DEBUG
{
uint64_t total, miss;
total = task->thread_data.cache->cache_access_total;
miss = task->thread_data.cache->cache_access_miss;
if (total) {
debug_printf("thread %d cache access %llu miss %llu hit rate %f\n",
task->thread_index, (long long unsigned)total,
(long long unsigned)miss,
(float)(total - miss)/(float)total);
}
}
#endif
if (scene->fence) {
lp_fence_signal(scene->fence);
}
@ -866,10 +892,15 @@ lp_rast_create( unsigned num_threads )
goto no_full_scenes;
}
for (i = 0; i < Elements(rast->tasks); i++) {
for (i = 0; i < MAX2(1, num_threads); i++) {
struct lp_rasterizer_task *task = &rast->tasks[i];
task->rast = rast;
task->thread_index = i;
task->thread_data.cache = align_malloc(sizeof(struct lp_build_format_cache),
16);
if (!task->thread_data.cache) {
goto no_thread_data_cache;
}
}
rast->num_threads = num_threads;
@ -885,6 +916,14 @@ lp_rast_create( unsigned num_threads )
return rast;
no_thread_data_cache:
for (i = 0; i < MAX2(1, rast->num_threads); i++) {
if (rast->tasks[i].thread_data.cache) {
align_free(rast->tasks[i].thread_data.cache);
}
}
lp_scene_queue_destroy(rast->full_scenes);
no_full_scenes:
FREE(rast);
no_rast:
@ -923,6 +962,9 @@ void lp_rast_destroy( struct lp_rasterizer *rast )
pipe_semaphore_destroy(&rast->tasks[i].work_ready);
pipe_semaphore_destroy(&rast->tasks[i].work_done);
}
for (i = 0; i < MAX2(1, rast->num_threads); i++) {
align_free(rast->tasks[i].thread_data.cache);
}
/* for synchronizing rasterization threads */
pipe_barrier_destroy( &rast->barrier );

View File

@ -300,6 +300,7 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
case PIPE_CAP_SHAREABLE_SHADERS:
case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
case PIPE_CAP_CLEAR_TEXTURE:
return 0;
}
/* should only get here on unhandled cases */

View File

@ -421,7 +421,7 @@ generate_fs_loop(struct gallivm_state *gallivm,
lp_build_tgsi_soa(gallivm, tokens, type, &mask,
consts_ptr, num_consts_ptr, &system_values,
interp->inputs,
outputs, context_ptr,
outputs, context_ptr, thread_data_ptr,
sampler, &shader->info.base, NULL);
/* Alpha test */
@ -2303,8 +2303,8 @@ generate_fragment(struct llvmpipe_context *lp,
lp_build_name(dady_ptr, "dady");
lp_build_name(color_ptr_ptr, "color_ptr_ptr");
lp_build_name(depth_ptr, "depth");
lp_build_name(thread_data_ptr, "thread_data");
lp_build_name(mask_input, "mask_input");
lp_build_name(thread_data_ptr, "thread_data");
lp_build_name(stride_ptr, "stride_ptr");
lp_build_name(depth_stride, "depth_stride");

View File

@ -44,6 +44,9 @@
#include "lp_test.h"
#define USE_TEXTURE_CACHE 1
static struct lp_build_format_cache *cache_ptr;
void
write_tsv_header(FILE *fp)
@ -71,7 +74,7 @@ write_tsv_row(FILE *fp,
typedef void
(*fetch_ptr_t)(void *unpacked, const void *packed,
unsigned i, unsigned j);
unsigned i, unsigned j, struct lp_build_format_cache *cache);
static LLVMValueRef
@ -83,7 +86,7 @@ add_fetch_rgba_test(struct gallivm_state *gallivm, unsigned verbose,
LLVMContextRef context = gallivm->context;
LLVMModuleRef module = gallivm->module;
LLVMBuilderRef builder = gallivm->builder;
LLVMTypeRef args[4];
LLVMTypeRef args[5];
LLVMValueRef func;
LLVMValueRef packed_ptr;
LLVMValueRef offset = LLVMConstNull(LLVMInt32TypeInContext(context));
@ -92,6 +95,7 @@ add_fetch_rgba_test(struct gallivm_state *gallivm, unsigned verbose,
LLVMValueRef j;
LLVMBasicBlockRef block;
LLVMValueRef rgba;
LLVMValueRef cache = NULL;
util_snprintf(name, sizeof name, "fetch_%s_%s", desc->short_name,
type.floating ? "float" : "unorm8");
@ -99,6 +103,7 @@ add_fetch_rgba_test(struct gallivm_state *gallivm, unsigned verbose,
args[0] = LLVMPointerType(lp_build_vec_type(gallivm, type), 0);
args[1] = LLVMPointerType(LLVMInt8TypeInContext(context), 0);
args[3] = args[2] = LLVMInt32TypeInContext(context);
args[4] = LLVMPointerType(lp_build_format_cache_type(gallivm), 0);
func = LLVMAddFunction(module, name,
LLVMFunctionType(LLVMVoidTypeInContext(context),
@ -109,11 +114,15 @@ add_fetch_rgba_test(struct gallivm_state *gallivm, unsigned verbose,
i = LLVMGetParam(func, 2);
j = LLVMGetParam(func, 3);
if (cache_ptr) {
cache = LLVMGetParam(func, 4);
}
block = LLVMAppendBasicBlockInContext(context, func, "entry");
LLVMPositionBuilderAtEnd(builder, block);
rgba = lp_build_fetch_rgba_aos(gallivm, desc, type, TRUE,
packed_ptr, offset, i, j);
packed_ptr, offset, i, j, cache);
LLVMBuildStore(builder, rgba, rgba_ptr);
@ -170,7 +179,7 @@ test_format_float(unsigned verbose, FILE *fp,
memset(unpacked, 0, sizeof unpacked);
fetch_ptr(unpacked, packed, j, i);
fetch_ptr(unpacked, packed, j, i, cache_ptr);
for(k = 0; k < 4; ++k) {
if (util_double_inf_sign(test->unpacked[i][j][k]) != util_inf_sign(unpacked[k])) {
@ -187,6 +196,11 @@ test_format_float(unsigned verbose, FILE *fp,
}
}
/* Ignore errors in S3TC for now */
if (desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
match = TRUE;
}
if (!match) {
printf("FAILED\n");
printf(" Packed: %02x %02x %02x %02x\n",
@ -261,7 +275,7 @@ test_format_unorm8(unsigned verbose, FILE *fp,
memset(unpacked, 0, sizeof unpacked);
fetch_ptr(unpacked, packed, j, i);
fetch_ptr(unpacked, packed, j, i, cache_ptr);
match = TRUE;
for(k = 0; k < 4; ++k) {
@ -277,6 +291,11 @@ test_format_unorm8(unsigned verbose, FILE *fp,
match = FALSE;
}
/* Ignore errors in S3TC as we only implement a poor man's approach */
if (desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
match = TRUE;
}
if (!match) {
printf("FAILED\n");
printf(" Packed: %02x %02x %02x %02x\n",
@ -334,6 +353,10 @@ test_all(unsigned verbose, FILE *fp)
util_format_s3tc_init();
#if USE_TEXTURE_CACHE
cache_ptr = align_malloc(sizeof(struct lp_build_format_cache), 16);
#endif
for (format = 1; format < PIPE_FORMAT_COUNT; ++format) {
const struct util_format_description *format_desc;
@ -363,6 +386,9 @@ test_all(unsigned verbose, FILE *fp)
success = FALSE;
}
}
#if USE_TEXTURE_CACHE
align_free(cache_ptr);
#endif
return success;
}

View File

@ -221,6 +221,21 @@ LP_LLVM_SAMPLER_MEMBER(lod_bias, LP_JIT_SAMPLER_LOD_BIAS, TRUE)
LP_LLVM_SAMPLER_MEMBER(border_color, LP_JIT_SAMPLER_BORDER_COLOR, FALSE)
#if LP_USE_TEXTURE_CACHE
static LLVMValueRef
lp_llvm_texture_cache_ptr(const struct lp_sampler_dynamic_state *base,
struct gallivm_state *gallivm,
LLVMValueRef thread_data_ptr,
unsigned unit)
{
/* We use the same cache for all units */
(void)unit;
return lp_jit_thread_data_cache(gallivm, thread_data_ptr);
}
#endif
static void
lp_llvm_sampler_soa_destroy(struct lp_build_sampler_soa *sampler)
{
@ -314,6 +329,10 @@ lp_llvm_sampler_soa_create(const struct lp_sampler_static_state *static_state)
sampler->dynamic_state.base.lod_bias = lp_llvm_sampler_lod_bias;
sampler->dynamic_state.base.border_color = lp_llvm_sampler_border_color;
#if LP_USE_TEXTURE_CACHE
sampler->dynamic_state.base.cache_ptr = lp_llvm_texture_cache_ptr;
#endif
sampler->dynamic_state.static_state = static_state;
return &sampler->base;

View File

@ -34,6 +34,10 @@
struct lp_sampler_static_state;
/**
* Whether texture cache is used for s3tc textures.
*/
#define LP_USE_TEXTURE_CACHE 0
/**
* Pure-LLVM texture sampling code generator.
@ -42,5 +46,4 @@ struct lp_sampler_static_state;
struct lp_build_sampler_soa *
lp_llvm_sampler_soa_create(const struct lp_sampler_static_state *key);
#endif /* LP_TEX_SAMPLE_H */

View File

@ -805,7 +805,7 @@ llvmpipe_init_screen_resource_funcs(struct pipe_screen *screen)
#endif
screen->resource_create = llvmpipe_resource_create;
screen->resource_create_front = llvmpipe_resource_create_front;
/* screen->resource_create_front = llvmpipe_resource_create_front; */
screen->resource_destroy = llvmpipe_resource_destroy;
screen->resource_from_handle = llvmpipe_resource_from_handle;
screen->resource_get_handle = llvmpipe_resource_get_handle;

View File

@ -389,6 +389,7 @@ enum SVSemantic
SV_SBASE,
SV_VERTEX_STRIDE,
SV_INVOCATION_INFO,
SV_THREAD_KILL,
SV_UNDEFINED,
SV_LAST
};

View File

@ -392,12 +392,24 @@ BuildUtil::mkImm(float f)
return mkImm(u.u32);
}
ImmediateValue *
BuildUtil::mkImm(double d)
{
return new_ImmediateValue(prog, d);
}
Value *
BuildUtil::loadImm(Value *dst, float f)
{
return mkOp1v(OP_MOV, TYPE_F32, dst ? dst : getScratch(), mkImm(f));
}
Value *
BuildUtil::loadImm(Value *dst, double d)
{
return mkOp1v(OP_MOV, TYPE_F64, dst ? dst : getScratch(), mkImm(d));
}
Value *
BuildUtil::loadImm(Value *dst, uint32_t u)
{
@ -555,6 +567,12 @@ BuildUtil::split64BitOpPostRA(Function *fn, Instruction *i,
switch (i->dType) {
case TYPE_U64: hTy = TYPE_U32; break;
case TYPE_S64: hTy = TYPE_S32; break;
case TYPE_F64:
if (i->op == OP_MOV) {
hTy = TYPE_U32;
break;
}
/* fallthrough */
default:
return NULL;
}

View File

@ -90,12 +90,14 @@ public:
void mkClobber(DataFile file, uint32_t regMask, int regUnitLog2);
ImmediateValue *mkImm(float);
ImmediateValue *mkImm(double);
ImmediateValue *mkImm(uint32_t);
ImmediateValue *mkImm(uint64_t);
ImmediateValue *mkImm(int i) { return mkImm((uint32_t)i); }
Value *loadImm(Value *dst, float);
Value *loadImm(Value *dst, double);
Value *loadImm(Value *dst, uint32_t);
Value *loadImm(Value *dst, uint64_t);

View File

@ -96,6 +96,7 @@ struct nv50_ir_prog_info
uint32_t tlsSpace; /* required local memory per thread */
uint32_t *code;
uint32_t codeSize;
uint32_t instructions;
uint8_t sourceRep; /* NV50_PROGRAM_IR */
const void *source;
void *relocData;

View File

@ -1644,6 +1644,7 @@ CodeEmitterGK110::getSRegEncoding(const ValueRef& ref)
case SV_VERTEX_COUNT: return 0x10;
case SV_INVOCATION_ID: return 0x11;
case SV_YDIR: return 0x12;
case SV_THREAD_KILL: return 0x13;
case SV_TID: return 0x21 + SDATA(ref).sv.index;
case SV_CTAID: return 0x25 + SDATA(ref).sv.index;
case SV_NTID: return 0x29 + SDATA(ref).sv.index;

View File

@ -244,6 +244,7 @@ CodeEmitterGM107::emitSYS(int pos, const Value *val)
case SV_LANEID : id = 0x00; break;
case SV_VERTEX_COUNT : id = 0x10; break;
case SV_INVOCATION_ID : id = 0x11; break;
case SV_THREAD_KILL : id = 0x13; break;
case SV_INVOCATION_INFO: id = 0x1d; break;
default:
assert(!"invalid system value");
@ -310,9 +311,12 @@ CodeEmitterGM107::emitIMMD(int pos, int len, const ValueRef &ref)
uint32_t val = imm->reg.data.u32;
if (len == 19) {
if (isFloatType(insn->sType)) {
if (insn->sType == TYPE_F32 || insn->sType == TYPE_F16) {
assert(!(val & 0x00000fff));
val >>= 12;
} else if (insn->sType == TYPE_F64) {
assert(!(imm->reg.data.u64 & 0x00000fffffffffffULL));
val = imm->reg.data.u64 >> 44;
}
assert(!(val & 0xfff00000) || (val & 0xfff00000) == 0xfff00000);
emitField( 56, 1, (val & 0x80000) >> 19);

View File

@ -96,9 +96,12 @@ private:
void emitUADD(const Instruction *);
void emitAADD(const Instruction *);
void emitFADD(const Instruction *);
void emitDADD(const Instruction *);
void emitIMUL(const Instruction *);
void emitFMUL(const Instruction *);
void emitDMUL(const Instruction *);
void emitFMAD(const Instruction *);
void emitDMAD(const Instruction *);
void emitIMAD(const Instruction *);
void emitISAD(const Instruction *);
@ -438,9 +441,9 @@ CodeEmitterNV50::setSrcFileBits(const Instruction *i, int enc)
return;
if ((mode & 3) == 1) {
const int pos = i->src(1).getFile() == FILE_IMMEDIATE ? 13 : 14;
const int pos = ((mode >> 2) & 3) == 3 ? 13 : 14;
switch (i->getSrc(0)->reg.type) {
switch (i->sType) {
case TYPE_U8:
break;
case TYPE_U16:
@ -954,11 +957,13 @@ CodeEmitterNV50::emitMINMAX(const Instruction *i)
assert(0);
break;
}
code[1] |= i->src(0).mod.abs() << 20;
code[1] |= i->src(0).mod.neg() << 26;
code[1] |= i->src(1).mod.abs() << 19;
code[1] |= i->src(1).mod.neg() << 27;
}
code[1] |= i->src(0).mod.abs() << 20;
code[1] |= i->src(0).mod.neg() << 26;
code[1] |= i->src(1).mod.abs() << 19;
code[1] |= i->src(1).mod.neg() << 27;
emitForm_MAD(i);
}
@ -993,6 +998,26 @@ CodeEmitterNV50::emitFMAD(const Instruction *i)
}
}
void
CodeEmitterNV50::emitDMAD(const Instruction *i)
{
const int neg_mul = i->src(0).mod.neg() ^ i->src(1).mod.neg();
const int neg_add = i->src(2).mod.neg();
assert(i->encSize == 8);
assert(!i->saturate);
code[1] = 0x40000000;
code[0] = 0xe0000000;
code[1] |= neg_mul << 26;
code[1] |= neg_add << 27;
roundMode_MAD(i);
emitForm_MAD(i);
}
void
CodeEmitterNV50::emitFADD(const Instruction *i)
{
@ -1027,6 +1052,25 @@ CodeEmitterNV50::emitFADD(const Instruction *i)
}
}
void
CodeEmitterNV50::emitDADD(const Instruction *i)
{
const int neg0 = i->src(0).mod.neg();
const int neg1 = i->src(1).mod.neg() ^ ((i->op == OP_SUB) ? 1 : 0);
assert(!(i->src(0).mod | i->src(1).mod).abs());
assert(!i->saturate);
assert(i->encSize == 8);
code[1] = 0x60000000;
code[0] = 0xe0000000;
emitForm_ADD(i);
code[1] |= neg0 << 26;
code[1] |= neg1 << 27;
}
void
CodeEmitterNV50::emitUADD(const Instruction *i)
{
@ -1081,7 +1125,10 @@ CodeEmitterNV50::emitIMUL(const Instruction *i)
if (i->encSize == 8) {
code[1] = (i->sType == TYPE_S16) ? (0x8000 | 0x4000) : 0x0000;
emitForm_MAD(i);
if (i->src(1).getFile() == FILE_IMMEDIATE)
emitForm_IMM(i);
else
emitForm_MAD(i);
} else {
if (i->sType == TYPE_S16)
code[0] |= 0x8100;
@ -1120,6 +1167,25 @@ CodeEmitterNV50::emitFMUL(const Instruction *i)
}
}
void
CodeEmitterNV50::emitDMUL(const Instruction *i)
{
const int neg = (i->src(0).mod ^ i->src(1).mod).neg();
assert(!i->saturate);
assert(i->encSize == 8);
code[1] = 0x80000000;
code[0] = 0xe0000000;
if (neg)
code[1] |= 0x08000000;
roundMode_CVT(i->rnd);
emitForm_MAD(i);
}
void
CodeEmitterNV50::emitIMAD(const Instruction *i)
{
@ -1136,7 +1202,10 @@ CodeEmitterNV50::emitIMAD(const Instruction *i)
code[1] |= neg1 << 27;
code[1] |= neg2 << 26;
emitForm_MAD(i);
if (i->src(1).getFile() == FILE_IMMEDIATE)
emitForm_IMM(i);
else
emitForm_MAD(i);
if (i->flagsSrc >= 0) {
// add with carry from $cX
@ -1181,9 +1250,11 @@ CodeEmitterNV50::emitSET(const Instruction *i)
code[0] = 0x30000000;
code[1] = 0x60000000;
emitCondCode(i->asCmp()->setCond, i->sType, 32 + 14);
switch (i->sType) {
case TYPE_F64:
code[0] = 0xe0000000;
code[1] = 0xe0000000;
break;
case TYPE_F32: code[0] |= 0x80000000; break;
case TYPE_S32: code[1] |= 0x0c000000; break;
case TYPE_U32: code[1] |= 0x04000000; break;
@ -1193,6 +1264,9 @@ CodeEmitterNV50::emitSET(const Instruction *i)
assert(0);
break;
}
emitCondCode(i->asCmp()->setCond, i->sType, 32 + 14);
if (i->src(0).mod.neg()) code[1] |= 0x04000000;
if (i->src(1).mod.neg()) code[1] |= 0x08000000;
if (i->src(0).mod.abs()) code[1] |= 0x00100000;
@ -1756,7 +1830,9 @@ CodeEmitterNV50::emitInstruction(Instruction *insn)
break;
case OP_ADD:
case OP_SUB:
if (isFloatType(insn->dType))
if (insn->dType == TYPE_F64)
emitDADD(insn);
else if (isFloatType(insn->dType))
emitFADD(insn);
else if (insn->getDef(0)->reg.file == FILE_ADDRESS)
emitAADD(insn);
@ -1764,14 +1840,18 @@ CodeEmitterNV50::emitInstruction(Instruction *insn)
emitUADD(insn);
break;
case OP_MUL:
if (isFloatType(insn->dType))
if (insn->dType == TYPE_F64)
emitDMUL(insn);
else if (isFloatType(insn->dType))
emitFMUL(insn);
else
emitIMUL(insn);
break;
case OP_MAD:
case OP_FMA:
if (isFloatType(insn->dType))
if (insn->dType == TYPE_F64)
emitDMAD(insn);
else if (isFloatType(insn->dType))
emitFMAD(insn);
else
emitIMAD(insn);
@ -1943,7 +2023,7 @@ CodeEmitterNV50::getMinEncodingSize(const Instruction *i) const
{
const Target::OpInfo &info = targ->getOpInfo(i);
if (info.minEncSize > 4)
if (info.minEncSize > 4 || i->dType == TYPE_F64)
return 8;
// check constraints on dst and src operands

View File

@ -323,6 +323,14 @@ CodeEmitterNVC0::setImmediate(const Instruction *i, const int s)
assert(imm);
u32 = imm->reg.data.u32;
if ((code[0] & 0xf) == 0x1) {
// double immediate
uint64_t u64 = imm->reg.data.u64;
assert(!(u64 & 0x00000fffffffffffULL));
assert(!(code[1] & 0xc000));
code[0] |= ((u64 >> 44) & 0x3f) << 26;
code[1] |= 0xc000 | (u64 >> 50);
} else
if ((code[0] & 0xf) == 0x2) {
// LIMM
code[0] |= (u32 & 0x3f) << 26;
@ -1831,6 +1839,7 @@ CodeEmitterNVC0::getSRegEncoding(const ValueRef& ref)
case SV_VERTEX_COUNT: return 0x10;
case SV_INVOCATION_ID: return 0x11;
case SV_YDIR: return 0x12;
case SV_THREAD_KILL: return 0x13;
case SV_TID: return 0x21 + SDATA(ref).sv.index;
case SV_CTAID: return 0x25 + SDATA(ref).sv.index;
case SV_NTID: return 0x29 + SDATA(ref).sv.index;

View File

@ -376,6 +376,7 @@ static nv50_ir::SVSemantic translateSysVal(uint sysval)
case TGSI_SEMANTIC_TESSOUTER: return nv50_ir::SV_TESS_OUTER;
case TGSI_SEMANTIC_TESSINNER: return nv50_ir::SV_TESS_INNER;
case TGSI_SEMANTIC_VERTICESIN: return nv50_ir::SV_VERTEX_COUNT;
case TGSI_SEMANTIC_HELPER_INVOCATION: return nv50_ir::SV_THREAD_KILL;
default:
assert(0);
return nv50_ir::SV_CLOCK;

View File

@ -75,7 +75,7 @@ expandIntegerMUL(BuildUtil *bld, Instruction *mul)
s[0] = mul->getSrc(0);
s[1] = mul->getSrc(1);
if (isSignedType(mul->sType)) {
if (isSignedType(mul->sType) && highResult) {
s[0] = bld->getSSA(fullSize);
s[1] = bld->getSSA(fullSize);
bld->mkOp1(OP_ABS, mul->sType, s[0], mul->getSrc(0));

View File

@ -155,7 +155,7 @@ private:
void checkSwapSrc01(Instruction *);
bool isCSpaceLoad(Instruction *);
bool isImmd32Load(Instruction *);
bool isImmdLoad(Instruction *);
bool isAttribOrSharedLoad(Instruction *);
};
@ -166,9 +166,10 @@ LoadPropagation::isCSpaceLoad(Instruction *ld)
}
bool
LoadPropagation::isImmd32Load(Instruction *ld)
LoadPropagation::isImmdLoad(Instruction *ld)
{
if (!ld || (ld->op != OP_MOV) || (typeSizeof(ld->dType) != 4))
if (!ld || (ld->op != OP_MOV) ||
((typeSizeof(ld->dType) != 4) && (typeSizeof(ld->dType) != 8)))
return false;
return ld->src(0).getFile() == FILE_IMMEDIATE;
}
@ -201,8 +202,8 @@ LoadPropagation::checkSwapSrc01(Instruction *insn)
else
return;
} else
if (isImmd32Load(i0)) {
if (!isCSpaceLoad(i1) && !isImmd32Load(i1))
if (isImmdLoad(i0)) {
if (!isCSpaceLoad(i1) && !isImmdLoad(i1))
insn->swapSources(0, 1);
else
return;
@ -447,6 +448,7 @@ ConstantFolding::expr(Instruction *i,
{
struct Storage *const a = &imm0.reg, *const b = &imm1.reg;
struct Storage res;
DataType type = i->dType;
memset(&res.data, 0, sizeof(res.data));
@ -588,6 +590,18 @@ ConstantFolding::expr(Instruction *i,
// The two arguments to pfetch are logically added together. Normally
// the second argument will not be constant, but that can happen.
res.data.u32 = a->data.u32 + b->data.u32;
type = TYPE_U32;
break;
case OP_MERGE:
switch (i->dType) {
case TYPE_U64:
case TYPE_S64:
case TYPE_F64:
res.data.u64 = (((uint64_t)b->data.u32) << 32) | a->data.u32;
break;
default:
return;
}
break;
default:
return;
@ -602,6 +616,8 @@ ConstantFolding::expr(Instruction *i,
i->setSrc(1, NULL);
i->getSrc(0)->reg.data = res.data;
i->getSrc(0)->reg.type = type;
i->getSrc(0)->reg.size = typeSizeof(type);
switch (i->op) {
case OP_MAD:
@ -1148,6 +1164,11 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
#define CASE(type, dst, fmin, fmax, imin, imax, umin, umax) \
case type: \
switch (i->sType) { \
case TYPE_F64: \
res.data.dst = util_iround(i->saturate ? \
CLAMP(imm0.reg.data.f64, fmin, fmax) : \
imm0.reg.data.f64); \
break; \
case TYPE_F32: \
res.data.dst = util_iround(i->saturate ? \
CLAMP(imm0.reg.data.f32, fmin, fmax) : \
@ -1185,6 +1206,11 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
CASE(TYPE_S32, s32, INT32_MIN, INT32_MAX, INT32_MIN, INT32_MAX, 0, INT32_MAX);
case TYPE_F32:
switch (i->sType) {
case TYPE_F64:
res.data.f32 = i->saturate ?
CLAMP(imm0.reg.data.f64, 0.0f, 1.0f) :
imm0.reg.data.f64;
break;
case TYPE_F32:
res.data.f32 = i->saturate ?
CLAMP(imm0.reg.data.f32, 0.0f, 1.0f) :
@ -1199,6 +1225,27 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
}
i->setSrc(0, bld.mkImm(res.data.f32));
break;
case TYPE_F64:
switch (i->sType) {
case TYPE_F64:
res.data.f64 = i->saturate ?
CLAMP(imm0.reg.data.f64, 0.0f, 1.0f) :
imm0.reg.data.f64;
break;
case TYPE_F32:
res.data.f64 = i->saturate ?
CLAMP(imm0.reg.data.f32, 0.0f, 1.0f) :
imm0.reg.data.f32;
break;
case TYPE_U16: res.data.f64 = (double) imm0.reg.data.u16; break;
case TYPE_U32: res.data.f64 = (double) imm0.reg.data.u32; break;
case TYPE_S16: res.data.f64 = (double) imm0.reg.data.s16; break;
case TYPE_S32: res.data.f64 = (double) imm0.reg.data.s32; break;
default:
return;
}
i->setSrc(0, bld.mkImm(res.data.f64));
break;
default:
return;
}

View File

@ -275,6 +275,7 @@ static const char *SemanticStr[SV_LAST + 1] =
"SBASE",
"VERTEX_STRIDE",
"INVOCATION_INFO",
"THREAD_KILL",
"?",
"(INVALID)"
};

View File

@ -373,6 +373,7 @@ Program::emitBinary(struct nv50_ir_prog_info *info)
if (!code)
return false;
emit->setCodeLocation(code, binSize);
info->bin.instructions = 0;
for (ArrayList::Iterator fi = allFuncs.iterator(); !fi.end(); fi.next()) {
Function *fn = reinterpret_cast<Function *>(fi.get());
@ -382,6 +383,7 @@ Program::emitBinary(struct nv50_ir_prog_info *info)
for (int b = 0; b < fn->bbCount; ++b) {
for (Instruction *i = fn->bbArray[b]->getEntry(); i; i = i->next) {
emit->emitInstruction(i);
info->bin.instructions++;
if (i->sType == TYPE_F64 || i->dType == TYPE_F64)
info->io.fp64 = true;
}

View File

@ -343,7 +343,7 @@ TargetNV50::insnCanLoad(const Instruction *i, int s,
}
if (sf == FILE_IMMEDIATE)
return true;
return ldSize <= 4;
// Check if memory access is encodable:

View File

@ -338,17 +338,30 @@ TargetNVC0::insnCanLoad(const Instruction *i, int s,
if (sf == FILE_IMMEDIATE) {
Storage &reg = ld->getSrc(0)->asImm()->reg;
if (typeSizeof(i->sType) > 4)
return false;
if (opInfo[i->op].immdBits != 0xffffffff) {
if (i->sType == TYPE_F32) {
if (opInfo[i->op].immdBits != 0xffffffff || typeSizeof(i->sType) > 4) {
switch (i->sType) {
case TYPE_F64:
if (reg.data.u64 & 0x00000fffffffffffULL)
return false;
break;
case TYPE_F32:
if (reg.data.u32 & 0xfff)
return false;
} else
if (i->sType == TYPE_S32 || i->sType == TYPE_U32) {
break;
case TYPE_S32:
case TYPE_U32:
// with u32, 0xfffff counts as 0xffffffff as well
if (reg.data.s32 > 0x7ffff || reg.data.s32 < -0x80000)
return false;
break;
case TYPE_U8:
case TYPE_S8:
case TYPE_U16:
case TYPE_S16:
case TYPE_F16:
break;
default:
return false;
}
} else
if (i->op == OP_MAD || i->op == OP_FMA) {

View File

@ -225,21 +225,22 @@ nouveau_transfer_write(struct nouveau_context *nv, struct nouveau_transfer *tx,
* for write/read by waiting on the buffer's relevant fences.
*/
static inline bool
nouveau_buffer_sync(struct nv04_resource *buf, unsigned rw)
nouveau_buffer_sync(struct nouveau_context *nv,
struct nv04_resource *buf, unsigned rw)
{
if (rw == PIPE_TRANSFER_READ) {
if (!buf->fence_wr)
return true;
NOUVEAU_DRV_STAT_RES(buf, buf_non_kernel_fence_sync_count,
!nouveau_fence_signalled(buf->fence_wr));
if (!nouveau_fence_wait(buf->fence_wr))
if (!nouveau_fence_wait(buf->fence_wr, &nv->debug))
return false;
} else {
if (!buf->fence)
return true;
NOUVEAU_DRV_STAT_RES(buf, buf_non_kernel_fence_sync_count,
!nouveau_fence_signalled(buf->fence));
if (!nouveau_fence_wait(buf->fence))
if (!nouveau_fence_wait(buf->fence, &nv->debug))
return false;
nouveau_fence_ref(NULL, &buf->fence);
@ -478,7 +479,7 @@ nouveau_buffer_transfer_map(struct pipe_context *pipe,
if (unlikely(usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE)) {
/* Discarding was not possible, must sync because
* subsequent transfers might use UNSYNCHRONIZED. */
nouveau_buffer_sync(buf, usage & PIPE_TRANSFER_READ_WRITE);
nouveau_buffer_sync(nv, buf, usage & PIPE_TRANSFER_READ_WRITE);
} else
if (usage & PIPE_TRANSFER_DISCARD_RANGE) {
/* The whole range is being discarded, so it doesn't matter what was
@ -490,7 +491,7 @@ nouveau_buffer_transfer_map(struct pipe_context *pipe,
if (usage & PIPE_TRANSFER_DONTBLOCK)
map = NULL;
else
nouveau_buffer_sync(buf, usage & PIPE_TRANSFER_READ_WRITE);
nouveau_buffer_sync(nv, buf, usage & PIPE_TRANSFER_READ_WRITE);
} else {
/* It is expected that the returned buffer be a representation of the
* data in question, so we must copy it over from the buffer. */
@ -615,7 +616,7 @@ nouveau_resource_map_offset(struct nouveau_context *nv,
if (res->mm) {
unsigned rw;
rw = (flags & NOUVEAU_BO_WR) ? PIPE_TRANSFER_WRITE : PIPE_TRANSFER_READ;
nouveau_buffer_sync(res, rw);
nouveau_buffer_sync(nv, res, rw);
if (nouveau_bo_map(res->bo, 0, NULL))
return NULL;
} else {

View File

@ -2,6 +2,7 @@
#define __NOUVEAU_CONTEXT_H__
#include "pipe/p_context.h"
#include "pipe/p_state.h"
#include <nouveau.h>
#define NOUVEAU_MAX_SCRATCH_BUFS 4
@ -14,6 +15,7 @@ struct nouveau_context {
struct nouveau_client *client;
struct nouveau_pushbuf *pushbuf;
struct pipe_debug_callback debug;
bool vbo_dirty;
@ -63,6 +65,9 @@ nouveau_context(struct pipe_context *pipe)
void
nouveau_context_init_vdec(struct nouveau_context *);
void
nouveau_context_init(struct nouveau_context *);
void
nouveau_scratch_runout_release(struct nouveau_context *);

View File

@ -23,6 +23,7 @@
#include "nouveau_screen.h"
#include "nouveau_winsys.h"
#include "nouveau_fence.h"
#include "os/os_time.h"
#ifdef PIPE_OS_UNIX
#include <sched.h>
@ -58,26 +59,6 @@ nouveau_fence_trigger_work(struct nouveau_fence *fence)
}
}
bool
nouveau_fence_work(struct nouveau_fence *fence,
void (*func)(void *), void *data)
{
struct nouveau_fence_work *work;
if (!fence || fence->state == NOUVEAU_FENCE_STATE_SIGNALLED) {
func(data);
return true;
}
work = CALLOC_STRUCT(nouveau_fence_work);
if (!work)
return false;
work->func = func;
work->data = data;
LIST_ADD(&work->list, &fence->work);
return true;
}
void
nouveau_fence_emit(struct nouveau_fence *fence)
{
@ -181,11 +162,10 @@ nouveau_fence_signalled(struct nouveau_fence *fence)
return fence->state == NOUVEAU_FENCE_STATE_SIGNALLED;
}
bool
nouveau_fence_wait(struct nouveau_fence *fence)
static bool
nouveau_fence_kick(struct nouveau_fence *fence)
{
struct nouveau_screen *screen = fence->screen;
uint32_t spins = 0;
/* wtf, someone is waiting on a fence in flush_notify handler? */
assert(fence->state != NOUVEAU_FENCE_STATE_EMITTING);
@ -206,11 +186,32 @@ nouveau_fence_wait(struct nouveau_fence *fence)
if (fence == screen->fence.current)
nouveau_fence_next(screen);
do {
nouveau_fence_update(screen, false);
nouveau_fence_update(screen, false);
if (fence->state == NOUVEAU_FENCE_STATE_SIGNALLED)
return true;
}
bool
nouveau_fence_wait(struct nouveau_fence *fence, struct pipe_debug_callback *debug)
{
struct nouveau_screen *screen = fence->screen;
uint32_t spins = 0;
int64_t start = 0;
if (debug && debug->debug_message)
start = os_time_get_nano();
if (!nouveau_fence_kick(fence))
return false;
do {
if (fence->state == NOUVEAU_FENCE_STATE_SIGNALLED) {
if (debug && debug->debug_message)
pipe_debug_message(debug, PERF_INFO,
"stalled %.3f ms waiting for fence",
(os_time_get_nano() - start) / 1000000.f);
return true;
}
if (!spins)
NOUVEAU_DRV_STAT(screen, any_non_kernel_fence_sync_count, 1);
spins++;
@ -218,6 +219,8 @@ nouveau_fence_wait(struct nouveau_fence *fence)
if (!(spins % 8)) /* donate a few cycles */
sched_yield();
#endif
nouveau_fence_update(screen, false);
} while (spins < NOUVEAU_FENCE_MAX_SPINS);
debug_printf("Wait on fence %u (ack = %u, next = %u) timed out !\n",
@ -249,3 +252,26 @@ nouveau_fence_unref_bo(void *data)
nouveau_bo_ref(NULL, &bo);
}
bool
nouveau_fence_work(struct nouveau_fence *fence,
void (*func)(void *), void *data)
{
struct nouveau_fence_work *work;
if (!fence || fence->state == NOUVEAU_FENCE_STATE_SIGNALLED) {
func(data);
return true;
}
work = CALLOC_STRUCT(nouveau_fence_work);
if (!work)
return false;
work->func = func;
work->data = data;
LIST_ADD(&work->list, &fence->work);
p_atomic_inc(&fence->work_count);
if (fence->work_count > 64)
nouveau_fence_kick(fence);
return true;
}

View File

@ -11,6 +11,8 @@
#define NOUVEAU_FENCE_STATE_FLUSHED 3
#define NOUVEAU_FENCE_STATE_SIGNALLED 4
struct pipe_debug_callback;
struct nouveau_fence_work {
struct list_head list;
void (*func)(void *);
@ -23,6 +25,7 @@ struct nouveau_fence {
int state;
int ref;
uint32_t sequence;
uint32_t work_count;
struct list_head work;
};
@ -34,7 +37,7 @@ bool nouveau_fence_new(struct nouveau_screen *, struct nouveau_fence **,
bool nouveau_fence_work(struct nouveau_fence *, void (*)(void *), void *);
void nouveau_fence_update(struct nouveau_screen *, bool flushed);
void nouveau_fence_next(struct nouveau_screen *);
bool nouveau_fence_wait(struct nouveau_fence *);
bool nouveau_fence_wait(struct nouveau_fence *, struct pipe_debug_callback *);
bool nouveau_fence_signalled(struct nouveau_fence *);
void nouveau_fence_unref_bo(void *data); /* generic unref bo callback */

View File

@ -18,6 +18,7 @@
#include "nouveau_winsys.h"
#include "nouveau_screen.h"
#include "nouveau_context.h"
#include "nouveau_fence.h"
#include "nouveau_mm.h"
#include "nouveau_buffer.h"
@ -75,7 +76,7 @@ nouveau_screen_fence_finish(struct pipe_screen *screen,
if (!timeout)
return nouveau_fence_signalled(nouveau_fence(pfence));
return nouveau_fence_wait(nouveau_fence(pfence));
return nouveau_fence_wait(nouveau_fence(pfence), NULL);
}
@ -238,3 +239,21 @@ nouveau_screen_fini(struct nouveau_screen *screen)
nouveau_device_del(&screen->device);
}
static void
nouveau_set_debug_callback(struct pipe_context *pipe,
const struct pipe_debug_callback *cb)
{
struct nouveau_context *context = nouveau_context(pipe);
if (cb)
context->debug = *cb;
else
memset(&context->debug, 0, sizeof(context->debug));
}
void
nouveau_context_init(struct nouveau_context *context)
{
context->pipe.set_debug_callback = nouveau_set_debug_callback;
}

View File

@ -437,6 +437,7 @@ nouveau_vp3_screen_get_video_param(struct pipe_screen *pscreen,
/* VP3 does not support MPEG4, VP4+ do. */
return entrypoint == PIPE_VIDEO_ENTRYPOINT_BITSTREAM &&
profile >= PIPE_VIDEO_PROFILE_MPEG1 &&
profile < PIPE_VIDEO_PROFILE_HEVC_MAIN &&
(!vp3 || codec != PIPE_VIDEO_FORMAT_MPEG4) &&
firmware_present(pscreen, profile);
case PIPE_VIDEO_CAP_NPOT_TEXTURES:

View File

@ -242,6 +242,7 @@ nv30_context_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags)
if (debug_get_bool_option("NV30_SWTNL", false))
nv30->draw_flags |= NV30_NEW_SWTNL;
nouveau_context_init(&nv30->base);
nv30->sample_mask = 0xffff;
nv30_vbo_init(pipe);
nv30_query_init(pipe);

View File

@ -173,6 +173,7 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
case PIPE_CAP_SHAREABLE_SHADERS:
case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
case PIPE_CAP_CLEAR_TEXTURE:
return 0;
case PIPE_CAP_VENDOR_ID:
@ -353,7 +354,7 @@ nv30_screen_fence_emit(struct pipe_screen *pscreen, uint32_t *sequence)
*sequence = ++screen->base.fence.sequence;
assert(PUSH_AVAIL(push) >= 3);
assert(PUSH_AVAIL(push) + push->rsvd_kick >= 3);
PUSH_DATA (push, NV30_3D_FENCE_OFFSET |
(2 /* size */ << 18) | (7 /* subchan */ << 13));
PUSH_DATA (push, 0);
@ -383,7 +384,7 @@ nv30_screen_destroy(struct pipe_screen *pscreen)
* _current_ one, and remove both.
*/
nouveau_fence_ref(screen->base.fence.current, &current);
nouveau_fence_wait(current);
nouveau_fence_wait(current, NULL);
nouveau_fence_ref(NULL, &current);
nouveau_fence_ref(NULL, &screen->base.fence.current);
}

View File

@ -306,6 +306,7 @@ nv50_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags)
}
nv50->base.pushbuf->kick_notify = nv50_default_kick_notify;
nouveau_context_init(&nv50->base);
nv50_init_query_functions(nv50);
nv50_init_surface_functions(nv50);
nv50_init_state_functions(nv50);

View File

@ -203,10 +203,8 @@ const struct nv50_format nv50_format_table[PIPE_FORMAT_COUNT] =
F3B(B5G6R5_UNORM, B5G6R5_UNORM, C2, C1, C0, xx, UNORM, 5_6_5, TD),
C4B(B5G5R5A1_UNORM, BGR5_A1_UNORM, C2, C1, C0, C3, UNORM, 5_5_5_1, TD),
F3B(B5G5R5X1_UNORM, BGR5_X1_UNORM, C2, C1, C0, xx, UNORM, 5_5_5_1, TD),
#if NOUVEAU_DRIVER != 0xc0
C4B(B4G4R4A4_UNORM, NONE, C2, C1, C0, C3, UNORM, 4_4_4_4, T),
F3B(B4G4R4X4_UNORM, NONE, C2, C1, C0, xx, UNORM, 4_4_4_4, T),
#endif
F3B(R9G9B9E5_FLOAT, NONE, C0, C1, C2, xx, FLOAT, 9_9_9_E5, T),
C4A(R10G10B10A2_UNORM, RGB10_A2_UNORM, C0, C1, C2, C3, UNORM, 10_10_10_2,

View File

@ -318,7 +318,8 @@ nv50_program_create_strmout_state(const struct nv50_ir_prog_info *info,
}
bool
nv50_program_translate(struct nv50_program *prog, uint16_t chipset)
nv50_program_translate(struct nv50_program *prog, uint16_t chipset,
struct pipe_debug_callback *debug)
{
struct nv50_ir_prog_info *info;
int ret;
@ -406,6 +407,11 @@ nv50_program_translate(struct nv50_program *prog, uint16_t chipset)
prog->so = nv50_program_create_strmout_state(info,
&prog->pipe.stream_output);
pipe_debug_message(debug, SHADER_INFO,
"type: %d, local: %d, gpr: %d, inst: %d, bytes: %d",
prog->type, info->bin.tlsSpace, prog->max_gpr,
info->bin.instructions, info->bin.codeSize);
out:
FREE(info);
return !ret;

View File

@ -106,7 +106,8 @@ struct nv50_program {
struct nv50_stream_output_state *so;
};
bool nv50_program_translate(struct nv50_program *, uint16_t chipset);
bool nv50_program_translate(struct nv50_program *, uint16_t chipset,
struct pipe_debug_callback *);
bool nv50_program_upload_code(struct nv50_context *, struct nv50_program *);
void nv50_program_destroy(struct nv50_context *, struct nv50_program *);

View File

@ -151,4 +151,11 @@ nv50_surface_from_buffer(struct pipe_context *pipe,
void
nv50_surface_destroy(struct pipe_context *, struct pipe_surface *);
void
nv50_clear_texture(struct pipe_context *pipe,
struct pipe_resource *res,
unsigned level,
const struct pipe_box *box,
const void *data);
#endif /* __NV50_RESOURCE_H__ */

View File

@ -182,6 +182,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_TGSI_TXQS:
case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
case PIPE_CAP_SHAREABLE_SHADERS:
case PIPE_CAP_CLEAR_TEXTURE:
return 1;
case PIPE_CAP_SEAMLESS_CUBE_MAP:
return 1; /* class_3d >= NVA0_3D_CLASS; */
@ -350,7 +351,7 @@ nv50_screen_destroy(struct pipe_screen *pscreen)
* _current_ one, and remove both.
*/
nouveau_fence_ref(screen->base.fence.current, &current);
nouveau_fence_wait(current);
nouveau_fence_wait(current, NULL);
nouveau_fence_ref(NULL, &current);
nouveau_fence_ref(NULL, &screen->base.fence.current);
}
@ -392,7 +393,7 @@ nv50_screen_fence_emit(struct pipe_screen *pscreen, u32 *sequence)
/* we need to do it after possible flush in MARK_RING */
*sequence = ++screen->base.fence.sequence;
assert(PUSH_AVAIL(push) >= 5);
assert(PUSH_AVAIL(push) + push->rsvd_kick >= 5);
PUSH_DATA (push, NV50_FIFO_PKHDR(NV50_3D(QUERY_ADDRESS_HIGH), 4));
PUSH_DATAh(push, screen->fence.bo->offset);
PUSH_DATA (push, screen->fence.bo->offset);

View File

@ -113,7 +113,7 @@ nv50_program_validate(struct nv50_context *nv50, struct nv50_program *prog)
{
if (!prog->translated) {
prog->translated = nv50_program_translate(
prog, nv50->screen->base.device->chipset);
prog, nv50->screen->base.device->chipset, &nv50->base.debug);
if (!prog->translated)
return false;
} else

View File

@ -727,7 +727,8 @@ nv50_sp_state_create(struct pipe_context *pipe,
prog->pipe.stream_output = cso->stream_output;
prog->translated = nv50_program_translate(
prog, nv50_context(pipe)->screen->base.device->chipset);
prog, nv50_context(pipe)->screen->base.device->chipset,
&nouveau_context(pipe)->debug);
return (void *)prog;
}

View File

@ -27,6 +27,7 @@
#include "util/u_inlines.h"
#include "util/u_pack_color.h"
#include "util/u_format.h"
#include "util/u_math.h"
#include "util/u_surface.h"
#include "tgsi/tgsi_ureg.h"
@ -324,6 +325,9 @@ nv50_clear_render_target(struct pipe_context *pipe,
else
PUSH_DATA(push, 512);
BEGIN_NV04(push, NV50_3D(MULTISAMPLE_MODE), 1);
PUSH_DATA (push, mt->ms_mode);
if (!nouveau_bo_memtype(bo)) {
BEGIN_NV04(push, NV50_3D(ZETA_ENABLE), 1);
PUSH_DATA (push, 0);
@ -404,6 +408,9 @@ nv50_clear_depth_stencil(struct pipe_context *pipe,
BEGIN_NV04(push, NV50_3D(RT_ARRAY_MODE), 1);
PUSH_DATA (push, 512);
BEGIN_NV04(push, NV50_3D(MULTISAMPLE_MODE), 1);
PUSH_DATA (push, mt->ms_mode);
BEGIN_NV04(push, NV50_3D(VIEWPORT_HORIZ(0)), 2);
PUSH_DATA (push, (width << 16) | dstx);
PUSH_DATA (push, (height << 16) | dsty);
@@ -417,6 +424,80 @@ nv50_clear_depth_stencil(struct pipe_context *pipe,
    nv50->dirty |= NV50_NEW_FRAMEBUFFER | NV50_NEW_SCISSOR;
 }

+void
+nv50_clear_texture(struct pipe_context *pipe,
+                   struct pipe_resource *res,
+                   unsigned level,
+                   const struct pipe_box *box,
+                   const void *data)
+{
+   struct pipe_surface tmpl = {{0}}, *sf;
+
+   tmpl.format = res->format;
+   tmpl.u.tex.first_layer = box->z;
+   tmpl.u.tex.last_layer = box->z + box->depth - 1;
+   tmpl.u.tex.level = level;
+   sf = pipe->create_surface(pipe, res, &tmpl);
+   if (!sf)
+      return;
+
+   if (util_format_is_depth_or_stencil(res->format)) {
+      float depth = 0;
+      uint8_t stencil = 0;
+      unsigned clear = 0;
+      const struct util_format_description *desc =
+         util_format_description(res->format);
+
+      if (util_format_has_depth(desc)) {
+         clear |= PIPE_CLEAR_DEPTH;
+         desc->unpack_z_float(&depth, 0, data, 0, 1, 1);
+      }
+      if (util_format_has_stencil(desc)) {
+         clear |= PIPE_CLEAR_STENCIL;
+         desc->unpack_s_8uint(&stencil, 0, data, 0, 1, 1);
+      }
+      pipe->clear_depth_stencil(pipe, sf, clear, depth, stencil,
+                                box->x, box->y, box->width, box->height);
+   } else {
+      union pipe_color_union color;
+
+      switch (util_format_get_blocksizebits(res->format)) {
+      case 128:
+         sf->format = PIPE_FORMAT_R32G32B32A32_UINT;
+         memcpy(&color.ui, data, 128 / 8);
+         break;
+      case 64:
+         sf->format = PIPE_FORMAT_R32G32_UINT;
+         memcpy(&color.ui, data, 64 / 8);
+         memset(&color.ui[2], 0, 64 / 8);
+         break;
+      case 32:
+         sf->format = PIPE_FORMAT_R32_UINT;
+         memcpy(&color.ui, data, 32 / 8);
+         memset(&color.ui[1], 0, 96 / 8);
+         break;
+      case 16:
+         sf->format = PIPE_FORMAT_R16_UINT;
+         color.ui[0] = util_cpu_to_le32(
+            util_le16_to_cpu(*(unsigned short *)data));
+         memset(&color.ui[1], 0, 96 / 8);
+         break;
+      case 8:
+         sf->format = PIPE_FORMAT_R8_UINT;
+         color.ui[0] = util_cpu_to_le32(*(unsigned char *)data);
+         memset(&color.ui[1], 0, 96 / 8);
+         break;
+      default:
+         assert(!"Unknown texel element size");
+         return;
+      }
+      pipe->clear_render_target(pipe, sf, &color,
+                                box->x, box->y, box->width, box->height);
+   }
+
+   pipe->surface_destroy(pipe, sf);
+}
+
 void
 nv50_clear(struct pipe_context *pipe, unsigned buffers,
            const union pipe_color_union *color,
@@ -464,11 +545,9 @@ nv50_clear(struct pipe_context *pipe, unsigned buffers,
    if (mode) {
       int zs_layers = 0, color0_layers = 0;
       if (fb->cbufs[0] && (mode & 0x3c))
-         color0_layers = fb->cbufs[0]->u.tex.last_layer -
-            fb->cbufs[0]->u.tex.first_layer + 1;
+         color0_layers = nv50_surface(fb->cbufs[0])->depth;
       if (fb->zsbuf && (mode & ~0x3c))
-         zs_layers = fb->zsbuf->u.tex.last_layer -
-            fb->zsbuf->u.tex.first_layer + 1;
+         zs_layers = nv50_surface(fb->zsbuf)->depth;

       for (j = 0; j < MIN2(zs_layers, color0_layers); j++) {
          BEGIN_NV04(push, NV50_3D(CLEAR_BUFFERS), 1);
@@ -488,7 +567,7 @@ nv50_clear(struct pipe_context *pipe, unsigned buffers,
       struct pipe_surface *sf = fb->cbufs[i];
       if (!sf || !(buffers & (PIPE_CLEAR_COLOR0 << i)))
          continue;
-      for (j = 0; j <= sf->u.tex.last_layer - sf->u.tex.first_layer; j++) {
+      for (j = 0; j < nv50_surface(sf)->depth; j++) {
          BEGIN_NV04(push, NV50_3D(CLEAR_BUFFERS), 1);
          PUSH_DATA (push, (i << 6) | 0x3c |
                     (j << NV50_3D_CLEAR_BUFFERS_LAYER__SHIFT));
@@ -585,6 +664,8 @@ nv50_clear_buffer(struct pipe_context *pipe,
    PUSH_DATA (push, height);
    BEGIN_NV04(push, NV50_3D(ZETA_ENABLE), 1);
    PUSH_DATA (push, 0);
+   BEGIN_NV04(push, NV50_3D(MULTISAMPLE_MODE), 1);
+   PUSH_DATA (push, 0);

    /* NOTE: only works with D3D clear flag (5097/0x143c bit 4) */
@@ -1593,6 +1674,7 @@ nv50_init_surface_functions(struct nv50_context *nv50)
    pipe->resource_copy_region = nv50_resource_copy_region;
    pipe->blit = nv50_blit;
    pipe->flush_resource = nv50_flush_resource;
+   pipe->clear_texture = nv50_clear_texture;
    pipe->clear_render_target = nv50_clear_render_target;
    pipe->clear_depth_stencil = nv50_clear_depth_stencil;
    pipe->clear_buffer = nv50_clear_buffer;
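For orientation, a minimal sketch of how a state tracker could drive the clear_texture hook wired up above, assuming the driver reports PIPE_CAP_CLEAR_TEXTURE (as nv50 now does). The helper name clear_level_to_red and the choice of an RGBA8 texture are illustrative only, not part of this change.

#include <stdint.h>
#include "pipe/p_context.h"
#include "pipe/p_screen.h"
#include "pipe/p_defines.h"
#include "util/u_box.h"

/* Clear one mip level of an RGBA8 texture to opaque red via the generic
 * clear_texture entry point. */
static void
clear_level_to_red(struct pipe_context *ctx, struct pipe_resource *tex,
                   unsigned level, unsigned width, unsigned height)
{
   const uint8_t red[4] = { 0xff, 0x00, 0x00, 0xff }; /* one packed texel */
   struct pipe_box box;

   if (!ctx->screen->get_param(ctx->screen, PIPE_CAP_CLEAR_TEXTURE))
      return; /* driver has no fast path; caller falls back elsewhere */

   u_box_3d(0, 0, 0, width, height, 1, &box);
   ctx->clear_texture(ctx, tex, level, &box, red);
}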

View File

@@ -636,7 +636,7 @@ nv50_draw_elements(struct nv50_context *nv50, bool shorten,
     * pushbuf submit, but it's probably not a big performance difference.
     */
    if (buf->fence_wr && !nouveau_fence_signalled(buf->fence_wr))
-      nouveau_fence_wait(buf->fence_wr);
+      nouveau_fence_wait(buf->fence_wr, &nv50->base.debug);

    while (instance_count--) {
       BEGIN_NV04(push, NV50_3D(VERTEX_BEGIN_GL), 1);

View File

@@ -120,7 +120,7 @@ nvc0_compute_validate_program(struct nvc0_context *nvc0)
    if (!prog->translated) {
       prog->translated = nvc0_program_translate(
-         prog, nvc0->screen->base.device->chipset);
+         prog, nvc0->screen->base.device->chipset, &nvc0->base.debug);
       if (!prog->translated)
          return false;
    }

View File

@@ -309,6 +309,7 @@ nvc0_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags)
    pipe->memory_barrier = nvc0_memory_barrier;
    pipe->get_sample_position = nvc0_context_get_sample_position;

+   nouveau_context_init(&nvc0->base);
    nvc0_init_query_functions(nvc0);
    nvc0_init_surface_functions(nvc0);
    nvc0_init_state_functions(nvc0);

View File

@@ -224,7 +224,8 @@ void nvc0_default_kick_notify(struct nouveau_pushbuf *);
 extern struct draw_stage *nvc0_draw_render_stage(struct nvc0_context *);

 /* nvc0_program.c */
-bool nvc0_program_translate(struct nvc0_program *, uint16_t chipset);
+bool nvc0_program_translate(struct nvc0_program *, uint16_t chipset,
+                            struct pipe_debug_callback *);
 bool nvc0_program_upload_code(struct nvc0_context *, struct nvc0_program *);
 void nvc0_program_destroy(struct nvc0_context *, struct nvc0_program *);
 void nvc0_program_library_upload(struct nvc0_context *);

View File

@@ -517,7 +517,8 @@ nvc0_program_dump(struct nvc0_program *prog)
 #endif

 bool
-nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset)
+nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset,
+                       struct pipe_debug_callback *debug)
 {
    struct nv50_ir_prog_info *info;
    int ret;
@@ -639,6 +640,11 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset)
       prog->tfb = nvc0_program_create_tfb_state(info,
                                                 &prog->pipe.stream_output);

+   pipe_debug_message(debug, SHADER_INFO,
+                      "type: %d, local: %d, gpr: %d, inst: %d, bytes: %d",
+                      prog->type, info->bin.tlsSpace, prog->num_gprs,
+                      info->bin.instructions, info->bin.codeSize);
+
 out:
    FREE(info);
    return !ret;
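For context, a sketch of the receiving end: how a state tracker might install a pipe_debug_callback so the SHADER_INFO statistics emitted above reach a log. This assumes the pipe_debug_callback layout in gallium's p_state.h at this point (debug_message taking a message-id pointer, a pipe_debug_type, and a va_list); the names print_shader_stats and hook_up_shader_stats are illustrative only.

#include <stdarg.h>
#include <stdio.h>
#include "pipe/p_context.h"
#include "pipe/p_defines.h"
#include "pipe/p_state.h"

/* Receives messages the driver reports via pipe_debug_message(); here we
 * simply forward shader statistics to stderr. */
static void
print_shader_stats(void *data, unsigned *id, enum pipe_debug_type type,
                   const char *fmt, va_list args)
{
   (void)data;
   (void)id;
   if (type == PIPE_DEBUG_TYPE_SHADER_INFO) {
      vfprintf(stderr, fmt, args);
      fputc('\n', stderr);
   }
}

static void
hook_up_shader_stats(struct pipe_context *pipe)
{
   struct pipe_debug_callback cb = {
      .debug_message = print_shader_stats,
      .data = NULL,
   };

   /* Drivers that support it (nouveau after this series) copy the callback. */
   if (pipe->set_debug_callback)
      pipe->set_debug_callback(pipe, &cb);
}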

View File

@@ -182,11 +182,12 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
    case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
    case PIPE_CAP_SHAREABLE_SHADERS:
+   case PIPE_CAP_CLEAR_TEXTURE:
       return 1;
    case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
       return (class_3d >= NVE4_3D_CLASS) ? 1 : 0;
    case PIPE_CAP_COMPUTE:
-      return (class_3d == NVE4_3D_CLASS) ? 1 : 0;
+      return (class_3d <= NVE4_3D_CLASS) ? 1 : 0;

    case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
       return nouveau_screen(pscreen)->vram_domain & NOUVEAU_BO_VRAM ? 1 : 0;
@@ -245,7 +246,7 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
          return 0;
       break;
    case PIPE_SHADER_COMPUTE:
-      if (class_3d != NVE4_3D_CLASS)
+      if (class_3d > NVE4_3D_CLASS)
          return 0;
       break;
    default:
@@ -415,7 +416,7 @@ nvc0_screen_destroy(struct pipe_screen *pscreen)
        * _current_ one, and remove both.
        */
       nouveau_fence_ref(screen->base.fence.current, &current);
-      nouveau_fence_wait(current);
+      nouveau_fence_wait(current, NULL);
       nouveau_fence_ref(NULL, &current);
       nouveau_fence_ref(NULL, &screen->base.fence.current);
    }
@@ -547,7 +548,7 @@ nvc0_screen_fence_emit(struct pipe_screen *pscreen, u32 *sequence)
    /* we need to do it after possible flush in MARK_RING */
    *sequence = ++screen->base.fence.sequence;

-   assert(PUSH_AVAIL(push) >= 5);
+   assert(PUSH_AVAIL(push) + push->rsvd_kick >= 5);
    PUSH_DATA (push, NVC0_FIFO_PKHDR_SQ(NVC0_3D(QUERY_ADDRESS_HIGH), 4));
    PUSH_DATAh(push, screen->fence.bo->offset);
    PUSH_DATA (push, screen->fence.bo->offset);

View File

@@ -72,7 +72,7 @@ nvc0_program_validate(struct nvc0_context *nvc0, struct nvc0_program *prog)
    if (!prog->translated) {
       prog->translated = nvc0_program_translate(
-         prog, nvc0->screen->base.device->chipset);
+         prog, nvc0->screen->base.device->chipset, &nvc0->base.debug);
       if (!prog->translated)
          return false;
    }

View File

@@ -681,7 +681,8 @@ nvc0_sp_state_create(struct pipe_context *pipe,
    prog->pipe.stream_output = cso->stream_output;

    prog->translated = nvc0_program_translate(
-         prog, nvc0_context(pipe)->screen->base.device->chipset);
+         prog, nvc0_context(pipe)->screen->base.device->chipset,
+         &nouveau_context(pipe)->debug);

    return (void *)prog;
 }

View File

@@ -67,7 +67,7 @@ nvc0_2d_format(enum pipe_format format, bool dst, bool dst_src_equal)
    case 1:
       return NV50_SURFACE_FORMAT_R8_UNORM;
    case 2:
-      return NV50_SURFACE_FORMAT_R16_UNORM;
+      return NV50_SURFACE_FORMAT_RG8_UNORM;
    case 4:
       return NV50_SURFACE_FORMAT_BGRA8_UNORM;
    case 8:
@@ -319,6 +319,7 @@ nvc0_clear_render_target(struct pipe_context *pipe,
          PUSH_DATA(push, dst->u.tex.first_layer + sf->depth);
          PUSH_DATA(push, mt->layer_stride >> 2);
          PUSH_DATA(push, dst->u.tex.first_layer);
+         IMMED_NVC0(push, NVC0_3D(MULTISAMPLE_MODE), mt->ms_mode);
       } else {
          if (res->base.target == PIPE_BUFFER) {
             PUSH_DATA(push, 262144);
@@ -334,6 +335,7 @@ nvc0_clear_render_target(struct pipe_context *pipe,
          PUSH_DATA(push, 0);
    IMMED_NVC0(push, NVC0_3D(ZETA_ENABLE), 0);
+   IMMED_NVC0(push, NVC0_3D(MULTISAMPLE_MODE), 0);

    /* tiled textures don't have to be fenced, they're not mapped directly */
    nvc0_resource_fence(res, NOUVEAU_BO_WR);
@@ -466,6 +468,7 @@ nvc0_clear_buffer(struct pipe_context *pipe,
    PUSH_DATA (push, 0);
    IMMED_NVC0(push, NVC0_3D(ZETA_ENABLE), 0);
+   IMMED_NVC0(push, NVC0_3D(MULTISAMPLE_MODE), 0);

    IMMED_NVC0(push, NVC0_3D(CLEAR_BUFFERS), 0x3c);
@@ -540,6 +543,7 @@ nvc0_clear_depth_stencil(struct pipe_context *pipe,
    PUSH_DATA (push, (unk << 16) | (dst->u.tex.first_layer + sf->depth));
    BEGIN_NVC0(push, NVC0_3D(ZETA_BASE_LAYER), 1);
    PUSH_DATA (push, dst->u.tex.first_layer);
+   IMMED_NVC0(push, NVC0_3D(MULTISAMPLE_MODE), mt->ms_mode);

    BEGIN_NIC0(push, NVC0_3D(CLEAR_BUFFERS), sf->depth);
    for (z = 0; z < sf->depth; ++z) {
@@ -1541,5 +1545,6 @@ nvc0_init_surface_functions(struct nvc0_context *nvc0)
    pipe->flush_resource = nvc0_flush_resource;
    pipe->clear_render_target = nvc0_clear_render_target;
    pipe->clear_depth_stencil = nvc0_clear_depth_stencil;
+   pipe->clear_texture = nv50_clear_texture;
    pipe->clear_buffer = nvc0_clear_buffer;
 }

View File

@@ -340,8 +340,8 @@ nvc0_mt_sync(struct nvc0_context *nvc0, struct nv50_miptree *mt, unsigned usage)
       return !nouveau_bo_wait(mt->base.bo, access, nvc0->base.client);
    }
    if (usage & PIPE_TRANSFER_WRITE)
-      return !mt->base.fence || nouveau_fence_wait(mt->base.fence);
-   return !mt->base.fence_wr || nouveau_fence_wait(mt->base.fence_wr);
+      return !mt->base.fence || nouveau_fence_wait(mt->base.fence, &nvc0->base.debug);
+   return !mt->base.fence_wr || nouveau_fence_wait(mt->base.fence_wr, &nvc0->base.debug);
 }

 void *

View File

@@ -199,6 +199,7 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
    case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
+   case PIPE_CAP_CLEAR_TEXTURE:
       return 0;

    /* SWTCL-only features. */

View File

@@ -346,7 +346,7 @@ static void evergreen_emit_direct_dispatch(
       const uint *block_layout, const uint *grid_layout)
 {
    int i;
-   struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+   struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
    struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
    unsigned num_waves;
    unsigned num_pipes = rctx->screen->b.info.r600_max_pipes;
@@ -417,12 +417,12 @@ static void evergreen_emit_direct_dispatch(
 static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout,
       const uint *grid_layout)
 {
-   struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs;
+   struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
    unsigned i;

    /* make sure that the gfx ring is only one active */
-   if (ctx->b.rings.dma.cs && ctx->b.rings.dma.cs->cdw) {
-      ctx->b.rings.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
+   if (ctx->b.dma.cs && ctx->b.dma.cs->cdw) {
+      ctx->b.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
    }

    /* Initialize all the compute-related registers.
@@ -439,7 +439,7 @@ static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout,
    /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
    for (i = 0; i < 8 && i < ctx->framebuffer.state.nr_cbufs; i++) {
       struct r600_surface *cb = (struct r600_surface*)ctx->framebuffer.state.cbufs[i];
-      unsigned reloc = radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.gfx,
+      unsigned reloc = radeon_add_to_buffer_list(&ctx->b, &ctx->b.gfx,
                                                  (struct r600_resource*)cb->base.texture,
                                                  RADEON_USAGE_READWRITE,
                                                  RADEON_PRIO_SHADER_RW_BUFFER);
@@ -538,7 +538,7 @@ void evergreen_emit_cs_shader(
    struct r600_cs_shader_state *state =
       (struct r600_cs_shader_state*)atom;
    struct r600_pipe_compute *shader = state->shader;
-   struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+   struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
    uint64_t va;
    struct r600_resource *code_bo;
    unsigned ngpr, nstack;
@@ -564,7 +564,7 @@ void evergreen_emit_cs_shader(
    radeon_emit(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */

    radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
-   radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx,
+   radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
                                              code_bo, RADEON_USAGE_READ,
                                              RADEON_PRIO_USER_SHADER));
 }

Some files were not shown because too many files have changed in this diff.