From 6f4083143bb8c478ccfcaef034d183d89b471993 Mon Sep 17 00:00:00 2001
From: Roland Scheidegger <sroland@vmware.com>
Date: Wed, 19 Dec 2018 04:37:36 +0100
Subject: [PATCH] gallivm: use llvm jit code for decoding s3tc

This is (much) faster than using the util fallback.
(Note that there are two methods here: one uses a cache, similar to the
existing code (although the cache was disabled), except that the block
decode is done with jit code; the other directly decodes the required
pixels. For now don't use the cache: being direct-mapped is suboptimal,
but it's difficult to come up with something better which doesn't have
too much overhead.)

Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
---
 src/gallium/auxiliary/Makefile.sources        |    2 +-
 src/gallium/auxiliary/gallivm/lp_bld_format.h |    6 +-
 .../auxiliary/gallivm/lp_bld_format_aos.c     |    5 +-
 .../auxiliary/gallivm/lp_bld_format_cached.c  |  374 ---
 .../auxiliary/gallivm/lp_bld_format_s3tc.c    | 2229 +++++++++++++++++
 .../auxiliary/gallivm/lp_bld_sample_soa.c     |    4 -
 src/gallium/auxiliary/meson.build             |    2 +-
 7 files changed, 2239 insertions(+), 383 deletions(-)
 delete mode 100644 src/gallium/auxiliary/gallivm/lp_bld_format_cached.c
 create mode 100644 src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c

diff --git a/src/gallium/auxiliary/Makefile.sources b/src/gallium/auxiliary/Makefile.sources
index 87a490e555d..50e88088ff8 100644
--- a/src/gallium/auxiliary/Makefile.sources
+++ b/src/gallium/auxiliary/Makefile.sources
@@ -418,11 +418,11 @@ GALLIVM_SOURCES := \
 	gallivm/lp_bld_flow.h \
 	gallivm/lp_bld_format_aos_array.c \
 	gallivm/lp_bld_format_aos.c \
-	gallivm/lp_bld_format_cached.c \
 	gallivm/lp_bld_format_float.c \
 	gallivm/lp_bld_format.c \
 	gallivm/lp_bld_format.h \
 	gallivm/lp_bld_format_soa.c \
+	gallivm/lp_bld_format_s3tc.c \
 	gallivm/lp_bld_format_srgb.c \
 	gallivm/lp_bld_format_yuv.c \
 	gallivm/lp_bld_gather.c \
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format.h b/src/gallium/auxiliary/gallivm/lp_bld_format.h
index 6540caaa293..b1e95c4e6db 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format.h
@@ -165,8 +165,12 @@ lp_build_fetch_subsampled_rgba_aos(struct gallivm_state *gallivm,
                                    LLVMValueRef j);
 
 
+/*
+ * S3TC
+ */
+
 LLVMValueRef
-lp_build_fetch_cached_texels(struct gallivm_state *gallivm,
+lp_build_fetch_s3tc_rgba_aos(struct gallivm_state *gallivm,
                              const struct util_format_description *format_desc,
                              unsigned n,
                              LLVMValueRef base_ptr,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
index b52acca1b3e..21680dba74a 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
@@ -464,6 +464,7 @@ lp_build_pack_rgba_aos(struct gallivm_state *gallivm,
  * \param ptr  address of the pixel block (or the texel if uncompressed)
  * \param i, j  the sub-block pixel coordinates.  For non-compressed formats
  *              these will always be (0, 0).
+ * \param cache  optional value pointing to a lp_build_format_cache structure
  * \return  a 4 element vector with the pixel's RGBA values.
  */
 LLVMValueRef
@@ -728,7 +729,7 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
     * s3tc rgb formats
     */
 
-   if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC && cache) {
+   if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
       struct lp_type tmp_type;
       LLVMValueRef tmp;
 
@@ -737,7 +738,7 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
       tmp_type.length = num_pixels * 4;
       tmp_type.norm = TRUE;
 
-      tmp = lp_build_fetch_cached_texels(gallivm,
+      tmp = lp_build_fetch_s3tc_rgba_aos(gallivm,
                                          format_desc,
                                          num_pixels,
                                          base_ptr,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_cached.c b/src/gallium/auxiliary/gallivm/lp_bld_format_cached.c
deleted file mode 100644
index e08062dcacd..00000000000
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_cached.c
+++ /dev/null
@@ -1,374 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2015 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-#include "lp_bld_format.h"
-#include "lp_bld_type.h"
-#include "lp_bld_struct.h"
-#include "lp_bld_const.h"
-#include "lp_bld_flow.h"
-#include "lp_bld_swizzle.h"
-
-#include "util/u_math.h"
-
-
-/**
- * @file
- * Complex block-compression based formats are handled here by using a cache,
- * so re-decoding of every pixel is not required.
- * Especially for bilinear filtering, texel reuse is very high hence even
- * a small cache helps.
- * The elements in the cache are the decoded blocks - currently things
- * are restricted to formats which are 4x4 block based, and the decoded
- * texels must fit into 4x8 bits.
- * The cache is direct mapped so hitrates aren't all that great and cache
- * thrashing could happen.
- *
- * @author Roland Scheidegger <sroland@vmware.com>
- */
-
-
-#if LP_BUILD_FORMAT_CACHE_DEBUG
-static void
-update_cache_access(struct gallivm_state *gallivm,
-                    LLVMValueRef ptr,
-                    unsigned count,
-                    unsigned index)
-{
-   LLVMBuilderRef builder = gallivm->builder;
-   LLVMValueRef member_ptr, cache_access;
-
-   assert(index == LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL ||
-          index == LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);
-
-   member_ptr = lp_build_struct_get_ptr(gallivm, ptr, index, "");
-   cache_access = LLVMBuildLoad(builder, member_ptr, "cache_access");
-   cache_access = LLVMBuildAdd(builder, cache_access,
-                               LLVMConstInt(LLVMInt64TypeInContext(gallivm->context),
-                                                                   count, 0), "");
-   LLVMBuildStore(builder, cache_access, member_ptr);
-}
-#endif
-
-
-static void
-store_cached_block(struct gallivm_state *gallivm,
-                   LLVMValueRef *col,
-                   LLVMValueRef tag_value,
-                   LLVMValueRef hash_index,
-                   LLVMValueRef cache)
-{
-   LLVMBuilderRef builder = gallivm->builder;
-   LLVMValueRef ptr, indices[3];
-   LLVMTypeRef type_ptr4x32;
-   unsigned count;
-
-   type_ptr4x32 = LLVMPointerType(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4), 0);
-   indices[0] = lp_build_const_int32(gallivm, 0);
-   indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_TAGS);
-   indices[2] = hash_index;
-   ptr = LLVMBuildGEP(builder, cache, indices, ARRAY_SIZE(indices), "");
-   LLVMBuildStore(builder, tag_value, ptr);
-
-   indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_DATA);
-   hash_index = LLVMBuildMul(builder, hash_index,
-                             lp_build_const_int32(gallivm, 16), "");
-   for (count = 0; count < 4; count++) {
-      indices[2] = hash_index;
-      ptr = LLVMBuildGEP(builder, cache, indices, ARRAY_SIZE(indices), "");
-      ptr = LLVMBuildBitCast(builder, ptr, type_ptr4x32, "");
-      LLVMBuildStore(builder, col[count], ptr);
-      hash_index = LLVMBuildAdd(builder, hash_index,
-                                lp_build_const_int32(gallivm, 4), "");
-   }
-}
-
-
-static LLVMValueRef
-lookup_cached_pixel(struct gallivm_state *gallivm,
-                    LLVMValueRef ptr,
-                    LLVMValueRef index)
-{
-   LLVMBuilderRef builder = gallivm->builder;
-   LLVMValueRef member_ptr, indices[3];
-
-   indices[0] = lp_build_const_int32(gallivm, 0);
-   indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_DATA);
-   indices[2] = index;
-   member_ptr = LLVMBuildGEP(builder, ptr, indices, ARRAY_SIZE(indices), "");
-   return LLVMBuildLoad(builder, member_ptr, "cache_data");
-}
-
-
-static LLVMValueRef
-lookup_tag_data(struct gallivm_state *gallivm,
-                LLVMValueRef ptr,
-                LLVMValueRef index)
-{
-   LLVMBuilderRef builder = gallivm->builder;
-   LLVMValueRef member_ptr, indices[3];
-
-   indices[0] = lp_build_const_int32(gallivm, 0);
-   indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_TAGS);
-   indices[2] = index;
-   member_ptr = LLVMBuildGEP(builder, ptr, indices, ARRAY_SIZE(indices), "");
-   return LLVMBuildLoad(builder, member_ptr, "tag_data");
-}
-
-
-static void
-update_cached_block(struct gallivm_state *gallivm,
-                    const struct util_format_description *format_desc,
-                    LLVMValueRef ptr_addr,
-                    LLVMValueRef hash_index,
-                    LLVMValueRef cache)
-
-{
-   LLVMBuilderRef builder = gallivm->builder;
-   LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
-   LLVMTypeRef pi8t = LLVMPointerType(i8t, 0);
-   LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
-   LLVMTypeRef i32x4 = LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4);
-   LLVMValueRef function;
-   LLVMValueRef tag_value, tmp_ptr;
-   LLVMValueRef col[4];
-   unsigned i, j;
-
-   /*
-    * Use format_desc->fetch_rgba_8unorm() for each pixel in the block.
-    * This doesn't actually make any sense whatsoever, someone would need
-    * to write a function doing this for all pixels in a block (either as
-    * an external c function or with generated code). Don't ask.
-    */
-
-   {
-      /*
-       * Function to call looks like:
-       *   fetch(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j)
-       */
-      LLVMTypeRef ret_type;
-      LLVMTypeRef arg_types[4];
-      LLVMTypeRef function_type;
-
-      assert(format_desc->fetch_rgba_8unorm);
-
-      ret_type = LLVMVoidTypeInContext(gallivm->context);
-      arg_types[0] = pi8t;
-      arg_types[1] = pi8t;
-      arg_types[2] = i32t;
-      arg_types[3] = i32t;
-      function_type = LLVMFunctionType(ret_type, arg_types,
-                                       ARRAY_SIZE(arg_types), 0);
-
-      /* make const pointer for the C fetch_rgba_8unorm function */
-      function = lp_build_const_int_pointer(gallivm,
-         func_to_pointer((func_pointer) format_desc->fetch_rgba_8unorm));
-
-      /* cast the callee pointer to the function's type */
-      function = LLVMBuildBitCast(builder, function,
-                                  LLVMPointerType(function_type, 0),
-                                  "cast callee");
-   }
-
-   tmp_ptr = lp_build_array_alloca(gallivm, i32x4,
-                                   lp_build_const_int32(gallivm, 16),
-                                   "tmp_decode_store");
-   tmp_ptr = LLVMBuildBitCast(builder, tmp_ptr, pi8t, "");
-
-   /*
-    * Invoke format_desc->fetch_rgba_8unorm() for each pixel.
-    * This is going to be really really slow.
-    * Note: the block store format is actually
-    * x0y0x0y1x0y2x0y3 x1y0x1y1x1y2x1y3 ...
-    */
-   for (i = 0; i < 4; ++i) {
-      for (j = 0; j < 4; ++j) {
-         LLVMValueRef args[4];
-         LLVMValueRef dst_offset = lp_build_const_int32(gallivm, (i * 4 + j) * 4);
-
-         /*
-          * Note we actually supply a pointer to the start of the block,
-          * not the start of the texture.
-          */
-         args[0] = LLVMBuildGEP(gallivm->builder, tmp_ptr, &dst_offset, 1, "");
-         args[1] = ptr_addr;
-         args[2] = LLVMConstInt(i32t, i, 0);
-         args[3] = LLVMConstInt(i32t, j, 0);
-         LLVMBuildCall(builder, function, args, ARRAY_SIZE(args), "");
-      }
-   }
-
-   /* Finally store the block - pointless mem copy + update tag. */
-   tmp_ptr = LLVMBuildBitCast(builder, tmp_ptr, LLVMPointerType(i32x4, 0), "");
-   for (i = 0; i < 4; ++i) {
-      LLVMValueRef tmp_offset = lp_build_const_int32(gallivm, i);
-      LLVMValueRef ptr = LLVMBuildGEP(gallivm->builder, tmp_ptr, &tmp_offset, 1, "");
-      col[i] = LLVMBuildLoad(builder, ptr, "");
-   }
-
-   tag_value = LLVMBuildPtrToInt(gallivm->builder, ptr_addr,
-                                 LLVMInt64TypeInContext(gallivm->context), "");
-   store_cached_block(gallivm, col, tag_value, hash_index, cache);
-}
-
-
-/*
- * Do a cached lookup.
- *
- * Returns (vectors of) 4x8 rgba aos value
- */
-LLVMValueRef
-lp_build_fetch_cached_texels(struct gallivm_state *gallivm,
-                             const struct util_format_description *format_desc,
-                             unsigned n,
-                             LLVMValueRef base_ptr,
-                             LLVMValueRef offset,
-                             LLVMValueRef i,
-                             LLVMValueRef j,
-                             LLVMValueRef cache)
-
-{
-   LLVMBuilderRef builder = gallivm->builder;
-   unsigned count, low_bit, log2size;
-   LLVMValueRef color, offset_stored, addr, ptr_addrtrunc, tmp;
-   LLVMValueRef ij_index, hash_index, hash_mask, block_index;
-   LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
-   LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
-   LLVMTypeRef i64t = LLVMInt64TypeInContext(gallivm->context);
-   struct lp_type type;
-   struct lp_build_context bld32;
-   memset(&type, 0, sizeof type);
-   type.width = 32;
-   type.length = n;
-
-   assert(format_desc->block.width == 4);
-   assert(format_desc->block.height == 4);
-
-   lp_build_context_init(&bld32, gallivm, type);
-
-   /*
-    * compute hash - we use direct mapped cache, the hash function could
-    *                be better but it needs to be simple
-    * per-element:
-    *    compare offset with offset stored at tag (hash)
-    *    if not equal decode/store block, update tag
-    *    extract color from cache
-    *    assemble result vector
-    */
-
-   /* TODO: not ideal with 32bit pointers... */
-
-   low_bit = util_logbase2(format_desc->block.bits / 8);
-   log2size = util_logbase2(LP_BUILD_FORMAT_CACHE_SIZE);
-   addr = LLVMBuildPtrToInt(builder, base_ptr, i64t, "");
-   ptr_addrtrunc = LLVMBuildPtrToInt(builder, base_ptr, i32t, "");
-   ptr_addrtrunc = lp_build_broadcast_scalar(&bld32, ptr_addrtrunc);
-   /* For the hash function, first mask off the unused lowest bits. Then just
-      do some xor with address bits - only use lower 32bits */
-   ptr_addrtrunc = LLVMBuildAdd(builder, offset, ptr_addrtrunc, "");
-   ptr_addrtrunc = LLVMBuildLShr(builder, ptr_addrtrunc,
-                                 lp_build_const_int_vec(gallivm, type, low_bit), "");
-   /* This only really makes sense for size 64,128,256 */
-   hash_index = ptr_addrtrunc;
-   ptr_addrtrunc = LLVMBuildLShr(builder, ptr_addrtrunc,
-                                 lp_build_const_int_vec(gallivm, type, 2*log2size), "");
-   hash_index = LLVMBuildXor(builder, ptr_addrtrunc, hash_index, "");
-   tmp = LLVMBuildLShr(builder, hash_index,
-                       lp_build_const_int_vec(gallivm, type, log2size), "");
-   hash_index = LLVMBuildXor(builder, hash_index, tmp, "");
-
-   hash_mask = lp_build_const_int_vec(gallivm, type, LP_BUILD_FORMAT_CACHE_SIZE - 1);
-   hash_index = LLVMBuildAnd(builder, hash_index, hash_mask, "");
-   ij_index = LLVMBuildShl(builder, i, lp_build_const_int_vec(gallivm, type, 2), "");
-   ij_index = LLVMBuildAdd(builder, ij_index, j, "");
-   block_index = LLVMBuildShl(builder, hash_index,
-                              lp_build_const_int_vec(gallivm, type, 4), "");
-   block_index = LLVMBuildAdd(builder, ij_index, block_index, "");
-
-   if (n > 1) {
-      color = LLVMGetUndef(LLVMVectorType(i32t, n));
-      for (count = 0; count < n; count++) {
-         LLVMValueRef index, cond, colorx;
-         LLVMValueRef block_indexx, hash_indexx, addrx, offsetx, ptr_addrx;
-         struct lp_build_if_state if_ctx;
-
-         index = lp_build_const_int32(gallivm, count);
-         offsetx = LLVMBuildExtractElement(builder, offset, index, "");
-         addrx = LLVMBuildZExt(builder, offsetx, i64t, "");
-         addrx = LLVMBuildAdd(builder, addrx, addr, "");
-         block_indexx = LLVMBuildExtractElement(builder, block_index, index, "");
-         hash_indexx = LLVMBuildLShr(builder, block_indexx,
-                                     lp_build_const_int32(gallivm, 4), "");
-         offset_stored = lookup_tag_data(gallivm, cache, hash_indexx);
-         cond = LLVMBuildICmp(builder, LLVMIntNE, offset_stored, addrx, "");
-
-         lp_build_if(&if_ctx, gallivm, cond);
-         {
-            ptr_addrx = LLVMBuildIntToPtr(builder, addrx,
-                                          LLVMPointerType(i8t, 0), "");
-            update_cached_block(gallivm, format_desc, ptr_addrx, hash_indexx, cache);
-#if LP_BUILD_FORMAT_CACHE_DEBUG
-            update_cache_access(gallivm, cache, 1,
-                                LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);
-#endif
-         }
-         lp_build_endif(&if_ctx);
-
-         colorx = lookup_cached_pixel(gallivm, cache, block_indexx);
-
-         color = LLVMBuildInsertElement(builder, color, colorx,
-                                        lp_build_const_int32(gallivm, count), "");
-      }
-   }
-   else {
-      LLVMValueRef cond;
-      struct lp_build_if_state if_ctx;
-
-      tmp = LLVMBuildZExt(builder, offset, i64t, "");
-      addr = LLVMBuildAdd(builder, tmp, addr, "");
-      offset_stored = lookup_tag_data(gallivm, cache, hash_index);
-      cond = LLVMBuildICmp(builder, LLVMIntNE, offset_stored, addr, "");
-
-      lp_build_if(&if_ctx, gallivm, cond);
-      {
-         tmp = LLVMBuildIntToPtr(builder, addr, LLVMPointerType(i8t, 0), "");
-         update_cached_block(gallivm, format_desc, tmp, hash_index, cache);
-#if LP_BUILD_FORMAT_CACHE_DEBUG
-         update_cache_access(gallivm, cache, 1,
-                             LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);
-#endif
-      }
-      lp_build_endif(&if_ctx);
-
-      color = lookup_cached_pixel(gallivm, cache, block_index);
-   }
-#if LP_BUILD_FORMAT_CACHE_DEBUG
-   update_cache_access(gallivm, cache, n,
-                       LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL);
-#endif
-   return LLVMBuildBitCast(builder, color, LLVMVectorType(i8t, n * 4), "");
-}
-
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c b/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c
new file mode 100644
index 00000000000..2b143566f24
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c
@@ -0,0 +1,2229 @@
+/**************************************************************************
+ *
+ * Copyright 2010-2018 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+
+/**
+ * @file
+ * s3tc pixel format manipulation.
+ *
+ * @author Roland Scheidegger <sroland@vmware.com>
+ */
+
+
+#include "util/u_format.h"
+#include "util/u_math.h"
+#include "util/u_string.h"
+#include "util/u_cpu_detect.h"
+#include "util/u_debug.h"
+
+#include "lp_bld_arit.h"
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+#include "lp_bld_conv.h"
+#include "lp_bld_gather.h"
+#include "lp_bld_format.h"
+#include "lp_bld_logic.h"
+#include "lp_bld_pack.h"
+#include "lp_bld_flow.h"
+#include "lp_bld_printf.h"
+#include "lp_bld_struct.h"
+#include "lp_bld_swizzle.h"
+#include "lp_bld_init.h"
+#include "lp_bld_debug.h"
+#include "lp_bld_intr.h"
+
+
+/**
+ * Reverse an interleave2_half
+ * (ie. pick every second element, independent lower/upper halfs)
+ * sse2 can only do that with 32bit (shufps) or larger elements
+ * natively. (Otherwise, and/pack (even) or shift/pack (odd)
+ * could be used, ideally llvm would do that for us.)
+ * XXX: Unfortunately, this does NOT translate to a shufps if those
+ * are int vectors (and casting will not help, llvm needs to recognize it
+ * as "real" float). Instead, llvm will use a pshufd/pshufd/punpcklqdq
+ * sequence which I'm pretty sure is a lot worse despite domain transition
+ * penalties with shufps (except maybe on Nehalem).
+ *
+ * @param type   type of a and b; 256bit vectors are shuffled as two
+ *               independent 128bit halves
+ * @param lo_hi  0 picks the even elements, 1 the odd ones
+ */
+static LLVMValueRef
+lp_build_uninterleave2_half(struct gallivm_state *gallivm,
+                            struct lp_type type,
+                            LLVMValueRef a,
+                            LLVMValueRef b,
+                            unsigned lo_hi)
+{
+   LLVMValueRef shuffle, elems[LP_MAX_VECTOR_LENGTH];
+   unsigned i, j;
+
+   assert(type.length <= LP_MAX_VECTOR_LENGTH);
+   assert(lo_hi < 2);
+
+   if (type.length * type.width == 256) {
+      /* Result quarter q reads from a (even q) or b (odd q, numbered from
+       * type.length on), from the lower (q < 2) or upper half of it. For
+       * 8 x i32 and lo_hi == 0 this gives { 0, 2, 8, 10, 4, 6, 12, 14 }. */
+      const unsigned quarter = type.length / 4;
+      assert(type.length >= 4);
+      for (i = 0; i < type.length; ++i) {
+         const unsigned q = i / quarter;
+         j = (q & 1) * type.length + (q >> 1) * (type.length / 2);
+         j += 2 * (i % quarter);
+         elems[i] = lp_build_const_int32(gallivm, j + lo_hi);
+      }
+   } else {
+      for (i = 0; i < type.length; ++i) {
+         elems[i] = lp_build_const_int32(gallivm, 2*i + lo_hi);
+      }
+   }
+
+   shuffle = LLVMConstVector(elems, type.length);
+   return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
+}
+
+
+/**
+ * Build the constant shuffle mask for extending a vector: lanes 0 .. n-1
+ * are the identity, lanes n .. length-1 are undef.
+ */
+static LLVMValueRef
+lp_build_const_extend_shuffle(struct gallivm_state *gallivm,
+                              unsigned n, unsigned length)
+{
+   LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
+   LLVMValueRef mask[LP_MAX_VECTOR_LENGTH];
+   unsigned lane;
+
+   assert(n <= length);
+   assert(length <= LP_MAX_VECTOR_LENGTH);
+
+   /* TODO: cache results in a static table */
+
+   for (lane = 0; lane < length; lane++) {
+      mask[lane] = lane < n ? lp_build_const_int32(gallivm, lane)
+                            : LLVMGetUndef(i32t);
+   }
+
+   return LLVMConstVector(mask, length);
+}
+
+/* Build the 2 * n entry shuffle mask
+ * { 0, n, 1, n+1, ..., n/2, n+n/2, n/2+1, n+n/2+1, ... }. */
+static LLVMValueRef
+lp_build_const_unpackx2_shuffle(struct gallivm_state *gallivm, unsigned n)
+{
+   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
+   unsigned i, j;
+
+   assert(n * 2 <= LP_MAX_VECTOR_LENGTH); /* 2 * n entries are written */
+
+   /* TODO: cache results in a static table */
+   for (i = 0, j = 0; i < n; i += 2, ++j) {
+      elems[i + 0] = lp_build_const_int32(gallivm, 0 + j);
+      elems[i + 1] = lp_build_const_int32(gallivm, n + j);
+      elems[n + i + 0] = lp_build_const_int32(gallivm, 0 + n/2 + j);
+      elems[n + i + 1] = lp_build_const_int32(gallivm, n + n/2 + j);
+   }
+   return LLVMConstVector(elems, n * 2);
+}
+
+/*
+ * Build an n entry shuffle mask whose entries all select element index,
+ * ie. broadcast 1 element to all elements.
+ */
+static LLVMValueRef
+lp_build_const_shuffle1(struct gallivm_state *gallivm,
+                        unsigned index, unsigned n)
+{
+   LLVMValueRef mask[LP_MAX_VECTOR_LENGTH];
+   unsigned lane = 0;
+
+   assert(n <= LP_MAX_VECTOR_LENGTH);
+
+   /* TODO: cache results in a static table */
+
+   while (lane < n) {
+      mask[lane++] = lp_build_const_int32(gallivm, index);
+   }
+   return LLVMConstVector(mask, n);
+}
+
+/*
+ * Shuffle a so that element index ends up at pos 0 of an n element
+ * result, all other result elements are undef.
+ */
+static LLVMValueRef
+lp_build_shuffle1undef(struct gallivm_state *gallivm,
+                       LLVMValueRef a, unsigned index, unsigned n)
+{
+   LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
+   LLVMValueRef mask[LP_MAX_VECTOR_LENGTH];
+   unsigned lane;
+
+   assert(n <= LP_MAX_VECTOR_LENGTH);
+
+   mask[0] = lp_build_const_int32(gallivm, index);
+   for (lane = 1; lane < n; lane++) {
+      mask[lane] = LLVMGetUndef(i32t);
+   }
+   return LLVMBuildShuffleVector(gallivm->builder, a, a,
+                                 LLVMConstVector(mask, n), "");
+}
+
+/* Tell whether format is one of the 4 DXT1 flavors (rgb/rgba, srgb or not) */
+static boolean
+format_dxt1_variant(enum pipe_format format)
+{
+   if (format == PIPE_FORMAT_DXT1_RGB || format == PIPE_FORMAT_DXT1_RGBA) {
+      return TRUE;
+   }
+   return format == PIPE_FORMAT_DXT1_SRGB || format == PIPE_FORMAT_DXT1_SRGBA;
+}
+
+/**
+ * Gather elements from scatter positions in memory into vectors.
+ * This is customised for fetching texels from s3tc textures.
+ *
+ * @param length number of offsets: 1, 4 or 8 (for SSE, typical value is 4)
+ * @param format_desc format of the blocks, must have 64 or 128 bits per block
+ * @param colors the stored colors of the blocks will be extracted into this.
+ * @param codewords the codewords of the blocks will be extracted into this.
+ * @param alpha_lo used for storing lower 32bit of alpha components for dxt3/5
+ * @param alpha_hi used for storing higher 32bit of alpha components for dxt3/5
+ * @param base_ptr base pointer, should be a i8 pointer type.
+ * @param offsets vector with offsets
+ */
+static void
+lp_build_gather_s3tc(struct gallivm_state *gallivm,
+                     unsigned length,
+                     const struct util_format_description *format_desc,
+                     LLVMValueRef *colors,
+                     LLVMValueRef *codewords,
+                     LLVMValueRef *alpha_lo,
+                     LLVMValueRef *alpha_hi,
+                     LLVMValueRef base_ptr,
+                     LLVMValueRef offsets)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   unsigned block_bits = format_desc->block.bits;
+   unsigned i;
+   LLVMValueRef elems[8];
+   LLVMTypeRef type32 = LLVMInt32TypeInContext(gallivm->context);
+   LLVMTypeRef type64 = LLVMInt64TypeInContext(gallivm->context);
+   LLVMTypeRef type32dxt;
+   struct lp_type lp_type32dxt;
+
+   memset(&lp_type32dxt, 0, sizeof lp_type32dxt);
+   lp_type32dxt.width = 32;
+   lp_type32dxt.length = block_bits / 32;
+   type32dxt = lp_build_vec_type(gallivm, lp_type32dxt);
+
+   assert(block_bits == 64 || block_bits == 128);
+   assert(length == 1 || length == 4 || length == 8);
+
+   for (i = 0; i < length; ++i) {
+      elems[i] = lp_build_gather_elem(gallivm, length,
+                                      block_bits, block_bits, TRUE,
+                                      base_ptr, offsets, i, FALSE);
+      elems[i] = LLVMBuildBitCast(builder, elems[i], type32dxt, "");
+   }
+   if (length == 1) {
+      LLVMValueRef elem = elems[0];
+      if (block_bits == 128) {
+         *alpha_lo = LLVMBuildExtractElement(builder, elem,
+                                             lp_build_const_int32(gallivm, 0), "");
+         *alpha_hi = LLVMBuildExtractElement(builder, elem,
+                                             lp_build_const_int32(gallivm, 1), "");
+         *colors = LLVMBuildExtractElement(builder, elem,
+                                           lp_build_const_int32(gallivm, 2), "");
+         *codewords = LLVMBuildExtractElement(builder, elem,
+                                              lp_build_const_int32(gallivm, 3), "");
+      }
+      else {
+         *alpha_lo = LLVMGetUndef(type32);
+         *alpha_hi = LLVMGetUndef(type32);
+         *colors = LLVMBuildExtractElement(builder, elem,
+                                           lp_build_const_int32(gallivm, 0), "");
+         *codewords = LLVMBuildExtractElement(builder, elem,
+                                              lp_build_const_int32(gallivm, 1), "");
+      }
+   }
+   else {
+      LLVMValueRef tmp[4], cc01, cc23;
+      struct lp_type lp_type32, lp_type64, lp_type32x4;
+      memset(&lp_type32, 0, sizeof lp_type32);
+      lp_type32.width = 32;
+      lp_type32.length = length;
+      memset(&lp_type64, 0, sizeof lp_type64);
+      lp_type64.width = 64;
+      lp_type64.length = length/2;
+      /* every gathered block is handled as <4 x i32> before merging */
+      lp_type32x4 = lp_type32;
+      lp_type32x4.length = 4;
+
+      if (block_bits == 64) {
+         for (i = 0; i < length; ++i) {
+            /* extend <2 x i32> to <4 x i32>, upper 2 elements undef */
+            elems[i] = LLVMBuildShuffleVector(builder, elems[i],
+                                              LLVMGetUndef(type32dxt),
+                                              lp_build_const_extend_shuffle(gallivm, 2, 4), "");
+         }
+      }
+      if (length == 8) {
+         /* Merge blocks i and i+4 into one 8-wide vector. The type passed to
+          * lp_build_concat must describe the <4 x i32> sources. */
+         for (i = 0; i < 4; ++i) {
+            tmp[0] = elems[i];
+            tmp[1] = elems[i+4];
+            elems[i] = lp_build_concat(gallivm, tmp, lp_type32x4, 2);
+         }
+      }
+      if (block_bits == 128) {
+         lp_build_transpose_aos(gallivm, lp_type32, elems, tmp);
+         *colors = tmp[2];
+         *codewords = tmp[3];
+         *alpha_lo = tmp[0];
+         *alpha_hi = tmp[1];
+      } else {
+         LLVMTypeRef type64_vec = LLVMVectorType(type64, length/2);
+         LLVMTypeRef type32_vec = LLVMVectorType(type32, length);
+
+         cc01 = lp_build_interleave2_half(gallivm, lp_type32, elems[0], elems[1], 0);
+         cc23 = lp_build_interleave2_half(gallivm, lp_type32, elems[2], elems[3], 0);
+         cc01 = LLVMBuildBitCast(builder, cc01, type64_vec, "");
+         cc23 = LLVMBuildBitCast(builder, cc23, type64_vec, "");
+         *colors = lp_build_interleave2_half(gallivm, lp_type64, cc01, cc23, 0);
+         *codewords = lp_build_interleave2_half(gallivm, lp_type64, cc01, cc23, 1);
+         *colors = LLVMBuildBitCast(builder, *colors, type32_vec, "");
+         *codewords = LLVMBuildBitCast(builder, *codewords, type32_vec, "");
+      }
+   }
+}
+
+/**
+ * Expand 2 x n packed rgb565 colors to two <n x i32> rgba8888 vectors.
+ *
+ * Both 16bit colors of every 32bit element are expanded in one go by
+ * operating on 16bit lanes, so this should be nearly as fast as
+ * expanding just a single color per element.
+ * NOTE: alpha channel will be set to 0
+ *
+ * @param n       number of 32bit elements in colors, must be > 1
+ * @param colors  is a <n x i32> vector containing the rgb565 colors
+ * @param color0  receives the expanded first color of each element
+ * @param color1  receives the expanded second color of each element
+ */
+static void
+color_expand2_565_to_8888(struct gallivm_state *gallivm,
+                          unsigned n,
+                          LLVMValueRef colors,
+                          LLVMValueRef *color0,
+                          LLVMValueRef *color1)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   struct lp_type type32, type16, type8;
+   LLVMTypeRef vec32_t, vec16_t, vec8_t;
+   LLVMValueRef low3_mask, red, blue, green, red_blue, low_bits;
+   LLVMValueRef even, odd;
+
+   assert(n > 1);
+
+   memset(&type32, 0, sizeof(type32));
+   type32.width = 32;
+   type32.length = n;
+
+   memset(&type16, 0, sizeof(type16));
+   type16.width = 16;
+   type16.length = 2 * n;
+
+   memset(&type8, 0, sizeof(type8));
+   type8.width = 8;
+   type8.length = 4 * n;
+
+   vec32_t = lp_build_vec_type(gallivm, type32);
+   vec16_t = lp_build_vec_type(gallivm, type16);
+   vec8_t = lp_build_vec_type(gallivm, type8);
+
+   low3_mask = lp_build_const_int_vec(gallivm, type16, 0x0707);
+   colors = LLVMBuildBitCast(builder, colors, vec16_t, "");
+
+   /*
+    * Per 16bit lane: red ends up in the low byte, blue in the high byte.
+    * Red is shifted down and up again to clear its low bits, this avoids
+    * an AND (which would need another constant).
+    */
+   red = LLVMBuildLShr(builder, colors,
+                       lp_build_const_int_vec(gallivm, type16, 11), "");
+   red = LLVMBuildShl(builder, red,
+                      lp_build_const_int_vec(gallivm, type16, 3), "");
+   blue = LLVMBuildShl(builder, colors,
+                       lp_build_const_int_vec(gallivm, type16, 11), "");
+   red_blue = LLVMBuildOr(builder, red, blue, "");
+   /* replicate the top bits - there's no byte shift so a mask is needed */
+   low_bits = LLVMBuildLShr(builder, red_blue,
+                            lp_build_const_int_vec(gallivm, type16, 5), "");
+   low_bits = LLVMBuildAnd(builder, low_bits, low3_mask, "");
+   red_blue = LLVMBuildOr(builder, red_blue, low_bits, "");
+
+   /* green goes to a separate register (low byte), low bits zeroed first */
+   green = LLVMBuildAnd(builder, colors,
+                        lp_build_const_int_vec(gallivm, type16, 0x07e0), "");
+   green = LLVMBuildLShr(builder, green,
+                         lp_build_const_int_vec(gallivm, type16, 3), "");
+   low_bits = LLVMBuildLShr(builder, green,
+                            lp_build_const_int_vec(gallivm, type16, 6), "");
+   green = LLVMBuildOr(builder, green, low_bits, "");
+
+   /* byte-wise interleave of r/b with g/0 yields r, g, b, 0 per color */
+   red_blue = LLVMBuildBitCast(builder, red_blue, vec8_t, "");
+   green = LLVMBuildBitCast(builder, green, vec8_t, "");
+   even = lp_build_interleave2_half(gallivm, type8, red_blue, green, 0);
+   odd = lp_build_interleave2_half(gallivm, type8, red_blue, green, 1);
+
+   even = LLVMBuildBitCast(builder, even, vec32_t, "");
+   odd = LLVMBuildBitCast(builder, odd, vec32_t, "");
+
+   /*
+    * The interleaved vectors hold rgb00, rgb01, rgb10, rgb11, ...
+    * but rgb00, rgb10, rgb20, rgb30, ... is wanted, so the colors must be
+    * separated again (on x86 this _should_ just generate one shufps...).
+    */
+   *color0 = lp_build_uninterleave2_half(gallivm, type32, even, odd, 0);
+   *color1 = lp_build_uninterleave2_half(gallivm, type32, even, odd, 1);
+}
+
+
+/**
+ * Expand one rgb565 color per 32bit element to rgba8888.
+ *
+ * The color must be in the low 16 bits of each element, bits 16-31 MBZ.
+ * NOTE: alpha channel will be set to 0
+ *
+ * @param n       number of 32bit elements in colors
+ * @param colors  is a <n x i32> vector containing the rgb565 colors
+ * @return <n x i32> vector with the expanded colors
+ */
+static LLVMValueRef
+color_expand_565_to_8888(struct gallivm_state *gallivm,
+                         unsigned n,
+                         LLVMValueRef colors)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   struct lp_type type32;
+   LLVMValueRef green_mask, rb_high_mask, low_bits_mask;
+   LLVMValueRef red, green, blue, result, rb_low, g_low;
+
+   memset(&type32, 0, sizeof(type32));
+   type32.width = 32;
+   type32.length = n;
+
+   /*
+    * The expansion first moves the channels to their final positions
+    * (only the high bits of each byte are valid then), afterwards the
+    * highest bits get replicated into the empty low bits.
+    * The replication of r and b is shared, g needs a different shift:
+    *
+    * g6mask = 0x07e0, rbhimask = 0x00f800f8, rgblowmask = 0x00070307
+    * r = colors >> 8
+    * b = colors << 19
+    * g = (colors & g6mask) << 5
+    * rb = (r | b) & rbhimask
+    * rbtmp = rb >> 5
+    * gtmp = g >> 6
+    * rbtmp = (rbtmp | gtmp) & rgblowmask
+    * rgb = rb | g | rbtmp
+    */
+   green_mask = lp_build_const_int_vec(gallivm, type32, 0x07e0);
+   rb_high_mask = lp_build_const_int_vec(gallivm, type32, 0x00f800f8);
+   low_bits_mask = lp_build_const_int_vec(gallivm, type32, 0x00070307);
+
+   red = LLVMBuildLShr(builder, colors,
+                       lp_build_const_int_vec(gallivm, type32, 8), "");
+   blue = LLVMBuildShl(builder, colors,
+                       lp_build_const_int_vec(gallivm, type32, 19), "");
+   green = LLVMBuildAnd(builder, colors, green_mask, "");
+   green = LLVMBuildShl(builder, green,
+                        lp_build_const_int_vec(gallivm, type32, 5), "");
+
+   result = LLVMBuildOr(builder, red, blue, "");
+   result = LLVMBuildAnd(builder, result, rb_high_mask, "");
+
+   rb_low = LLVMBuildLShr(builder, result,
+                          lp_build_const_int_vec(gallivm, type32, 5), "");
+   g_low = LLVMBuildLShr(builder, green,
+                         lp_build_const_int_vec(gallivm, type32, 6), "");
+   rb_low = LLVMBuildOr(builder, rb_low, g_low, "");
+   rb_low = LLVMBuildAnd(builder, rb_low, low_bits_mask, "");
+
+   result = LLVMBuildOr(builder, result, green, "");
+   result = LLVMBuildOr(builder, result, rb_low, "");
+
+   return result;
+}
+
+
+/**
+ * Interpolate at one third and two thirds between v0 and v1, that is
+ * calculate 1/3(v1-v0) + v0
+ * and 2*1/3(v1-v0) + v0
+ *
+ * @param bld   build context, must be of plain 8bit integer type
+ * @param v0    first source vector
+ * @param v1    second source vector
+ * @param res0  receives the 1/3 result
+ * @param res1  receives the 2/3 result
+ */
+static void
+lp_build_lerp23(struct lp_build_context *bld,
+                LLVMValueRef v0,
+                LLVMValueRef v1,
+                LLVMValueRef *res0,
+                LLVMValueRef *res1)
+{
+   struct gallivm_state *gallivm = bld->gallivm;
+   LLVMBuilderRef builder = gallivm->builder;
+   const struct lp_type type8 = bld->type;
+   struct lp_type type16 = lp_wider_type(type8);
+   struct lp_build_context bld16;
+   LLVMValueRef weight, weight_lo, weight_hi;
+   LLVMValueRef a_lo, a_hi, b_lo, b_hi;
+   LLVMValueRef diff_lo, diff_hi, prod_lo, prod_hi;
+   LLVMValueRef part_lo, part_hi, packed;
+
+   assert(lp_check_value(type8, v0));
+   assert(lp_check_value(type8, v1));
+   assert(!type8.floating && !type8.fixed && !type8.norm && type8.width == 8);
+
+   lp_build_context_init(&bld16, gallivm, type16);
+   bld16.type.sign = TRUE;
+   weight = lp_build_const_int_vec(gallivm, bld->type, 255*1/3);
+
+   /* FIXME: use native avx256 unpack/pack */
+   lp_build_unpack2(gallivm, type8, type16, weight, &weight_lo, &weight_hi);
+   lp_build_unpack2(gallivm, type8, type16, v0, &a_lo, &a_hi);
+   lp_build_unpack2(gallivm, type8, type16, v1, &b_lo, &b_hi);
+   diff_lo = lp_build_sub(&bld16, b_lo, a_lo);
+   diff_hi = lp_build_sub(&bld16, b_hi, a_hi);
+
+   /* the products are shared by both results */
+   prod_lo = LLVMBuildMul(builder, weight_lo, diff_lo, "");
+   prod_hi = LLVMBuildMul(builder, weight_hi, diff_hi, "");
+
+   /* 1/3: shift by 8, then (lerp optimization) pack first and add later */
+   part_lo = LLVMBuildLShr(builder, prod_lo,
+                           lp_build_const_int_vec(gallivm, type16, 8), "");
+   part_hi = LLVMBuildLShr(builder, prod_hi,
+                           lp_build_const_int_vec(gallivm, type16, 8), "");
+   packed = lp_build_pack2(gallivm, type16, type8, part_lo, part_hi);
+   *res0 = lp_build_add(bld, packed, v0);
+
+   /*
+    * 2/3: shift by 7 only. This leaves more than 8 valid bits, hence
+    * unlike above a mask is needed before packing (add is still last).
+    */
+   part_lo = LLVMBuildLShr(builder, prod_lo,
+                           lp_build_const_int_vec(gallivm, type16, 7), "");
+   part_hi = LLVMBuildLShr(builder, prod_hi,
+                           lp_build_const_int_vec(gallivm, type16, 7), "");
+   part_lo = LLVMBuildAnd(builder, part_lo,
+                          lp_build_const_int_vec(gallivm, type16, 0xff), "");
+   part_hi = LLVMBuildAnd(builder, part_hi,
+                          lp_build_const_int_vec(gallivm, type16, 0xff), "");
+   packed = lp_build_pack2(gallivm, type16, type8, part_lo, part_hi);
+   *res1 = lp_build_add(bld, packed, v0);
+}
+
+/**
+ * Convert from <n x i64> s3tc dxt1 to <4n x i8> RGBA AoS
+ *
+ * One texel is decoded per 32bit element. The 3-color mode (and the alpha
+ * handling) is only applied if format_dxt1_variant() reports a dxt1 format,
+ * otherwise the 4-color encoding is always used and alpha is left at 0
+ * (the dxt3/dxt5 decoders OR in their own alpha afterwards).
+ *
+ * @param n  is the number of 32bit elements (texels to decode)
+ * @param format  is the s3tc format to decode
+ * @param colors  is a <n x i32> vector with n x 2x16bit colors
+ * @param codewords  is a <n x i32> vector containing the codewords
+ * @param i  is a <n x i32> vector with the x pixel coordinate (0 to 3)
+ * @param j  is a <n x i32> vector with the y pixel coordinate (0 to 3)
+ * @return the decoded texels as <4n x i8> vector
+ */
+static LLVMValueRef
+s3tc_dxt1_full_to_rgba_aos(struct gallivm_state *gallivm,
+                           unsigned n,
+                           enum pipe_format format,
+                           LLVMValueRef colors,
+                           LLVMValueRef codewords,
+                           LLVMValueRef i,
+                           LLVMValueRef j)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMValueRef color0, color1, color2, color3, color2_2, color3_2;
+   LLVMValueRef rgba, a, colors0, colors1, col0, col1, const2;
+   LLVMValueRef bit_pos, sel_mask, sel_lo, sel_hi, indices;
+   struct lp_type type, type8;
+   struct lp_build_context bld8, bld32;
+   boolean is_dxt1_variant = format_dxt1_variant(format);
+
+   memset(&type, 0, sizeof type);
+   type.width = 32;
+   type.length = n;
+
+   memset(&type8, 0, sizeof type8);
+   type8.width = 8;
+   type8.length = 4*n;
+
+   assert(lp_check_value(type, i));
+   assert(lp_check_value(type, j));
+
+   a = lp_build_const_int_vec(gallivm, type, 0xff000000);
+
+   lp_build_context_init(&bld32, gallivm, type);
+   lp_build_context_init(&bld8, gallivm, type8);
+
+   /*
+    * works as follows:
+    * - expand color0/color1 to rgba8888
+    * - calculate color2/3 (interpolation) according to color0 > color1 rules
+    *   (4-color encoding)
+    * - calculate color2/3 according to color0 <= color1 rules
+    *   (3-color encoding, dxt1 only)
+    * - do selection of color2/3 according to comparison of color0/1
+    * - extract indices (vector shift).
+    * - use compare/select to select the correct color. Since we have 2bit
+    *   indices (and 4 colors), needs at least three compare/selects.
+    */
+   /*
+    * expand the two colors
+    */
+   col0 = LLVMBuildAnd(builder, colors, lp_build_const_int_vec(gallivm, type, 0x0000ffff), "");
+   col1 = LLVMBuildLShr(builder, colors, lp_build_const_int_vec(gallivm, type, 16), "");
+   if (n > 1) {
+      color_expand2_565_to_8888(gallivm, n, colors, &color0, &color1);
+   }
+   else {
+      color0 = color_expand_565_to_8888(gallivm, n, col0);
+      color1 = color_expand_565_to_8888(gallivm, n, col1);
+   }
+
+   /*
+    * interpolate colors
+    * color2_1 is 2/3 color0 + 1/3 color1
+    * color3_1 is 1/3 color0 + 2/3 color1
+    * color2_2 is 1/2 color0 + 1/2 color1
+    * color3_2 is 0
+    */
+
+   colors0 = LLVMBuildBitCast(builder, color0, bld8.vec_type, "");
+   colors1 = LLVMBuildBitCast(builder, color1, bld8.vec_type, "");
+   /* can combine 2 lerps into one mostly - still looks expensive enough. */
+   lp_build_lerp23(&bld8, colors0, colors1, &color2, &color3);
+   color2 = LLVMBuildBitCast(builder, color2, bld32.vec_type, "");
+   color3 = LLVMBuildBitCast(builder, color3, bld32.vec_type, "");
+
+   /* dxt3/5 always use 4-color encoding */
+   if (is_dxt1_variant) {
+      /* fix up alpha */
+      if (format == PIPE_FORMAT_DXT1_RGBA ||
+          format == PIPE_FORMAT_DXT1_SRGBA) {
+         color0 = LLVMBuildOr(builder, color0, a, "");
+         color1 = LLVMBuildOr(builder, color1, a, "");
+         color3 = LLVMBuildOr(builder, color3, a, "");
+      }
+      /*
+       * XXX with sse2 and 16x8 vectors, should use pavgb even when n == 1.
+       * Much cheaper (but we don't care that much if n == 1).
+       */
+      if ((util_cpu_caps.has_sse2 && n == 4) ||
+          (util_cpu_caps.has_avx2 && n == 8)) {
+         LLVMValueRef intrargs[2];
+         /* points at string literals, hence must be const */
+         const char *intr_name = n == 8 ? "llvm.x86.avx2.pavg.b" :
+                                          "llvm.x86.sse2.pavg.b";
+         intrargs[0] = colors0;
+         intrargs[1] = colors1;
+         color2_2 = lp_build_intrinsic(builder, intr_name,
+                                       bld8.vec_type, intrargs, 2, 0);
+         color2_2 = LLVMBuildBitCast(builder, color2_2, bld32.vec_type, "");
+      }
+      else {
+         struct lp_type i16_type = lp_wider_type(type8);
+         struct lp_build_context bld2;
+         LLVMValueRef v0_lo, v0_hi, v1_lo, v1_hi, addlo, addhi;
+
+         lp_build_context_init(&bld2, gallivm, i16_type);
+         bld2.type.sign = TRUE;
+
+         /*
+          * This isn't as expensive as it looks (the unpack is the same as
+          * for lerp23), with correct rounding.
+          * (Note that while rounding is correct, this will always round down,
+          * whereas pavgb will always round up.)
+          */
+         /* FIXME: use native avx256 unpack/pack */
+         lp_build_unpack2(gallivm, type8, i16_type, colors0, &v0_lo, &v0_hi);
+         lp_build_unpack2(gallivm, type8, i16_type, colors1, &v1_lo, &v1_hi);
+
+         addlo = lp_build_add(&bld2, v0_lo, v1_lo);
+         addhi = lp_build_add(&bld2, v0_hi, v1_hi);
+         addlo = LLVMBuildLShr(builder, addlo,
+                               lp_build_const_int_vec(gallivm, i16_type, 1), "");
+         addhi = LLVMBuildLShr(builder, addhi,
+                               lp_build_const_int_vec(gallivm, i16_type, 1), "");
+         color2_2 = lp_build_pack2(gallivm, i16_type, type8, addlo, addhi);
+         color2_2 = LLVMBuildBitCast(builder, color2_2, bld32.vec_type, "");
+      }
+      color3_2 = lp_build_const_int_vec(gallivm, type, 0);
+
+      /* select between colors2/3 */
+      /* signed compare is faster saves some xors */
+      type.sign = TRUE;
+      sel_mask = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER, col0, col1);
+      color2 = lp_build_select(&bld32, sel_mask, color2, color2_2);
+      color3 = lp_build_select(&bld32, sel_mask, color3, color3_2);
+      type.sign = FALSE;
+
+      if (format == PIPE_FORMAT_DXT1_RGBA ||
+          format == PIPE_FORMAT_DXT1_SRGBA) {
+         color2 = LLVMBuildOr(builder, color2, a, "");
+      }
+   }
+
+   const2 = lp_build_const_int_vec(gallivm, type, 2);
+   /* extract 2-bit index values, bit position is 2*(4j+i) */
+   bit_pos = LLVMBuildShl(builder, j, const2, "");
+   bit_pos = LLVMBuildAdd(builder, bit_pos, i, "");
+   bit_pos = LLVMBuildAdd(builder, bit_pos, bit_pos, "");
+   /*
+    * NOTE: This innocent looking shift is very expensive with x86/ssex.
+    * Shifts with per-element shift count get roughly translated to
+    * extract (count), extract (value), shift, move (back to xmm), unpack
+    * per element!
+    * So about 20 instructions here for 4xi32.
+    * Newer llvm versions (3.7+) will not do extract/insert but use
+    * a couple constant count vector shifts plus shuffles. About same
+    * amount of instructions unfortunately...
+    * Would get much worse with 8xi16 even...
+    * We could actually do better here:
+    * - subtract bit_pos from 128+30, shl 23, convert float to int...
+    * - now do mul with codewords followed by shr 30...
+    * But requires 32bit->32bit mul, sse41 only (well that's emulatable
+    * with 2 32bit->64bit muls...) and not exactly cheap
+    * AVX2, of course, fixes this nonsense.
+    */
+   indices = LLVMBuildLShr(builder, codewords, bit_pos, "");
+
+   /*
+    * finally select the colors: the low index bit picks within the
+    * color0/1 and color2/3 pairs, the high index bit picks the pair.
+    */
+   sel_lo = LLVMBuildAnd(builder, indices, bld32.one, "");
+   sel_lo = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL, sel_lo, bld32.one);
+   color0 = lp_build_select(&bld32, sel_lo, color1, color0);
+   color2 = lp_build_select(&bld32, sel_lo, color3, color2);
+   sel_hi = LLVMBuildAnd(builder, indices, const2, "");
+   sel_hi = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL, sel_hi, const2);
+   rgba = lp_build_select(&bld32, sel_hi, color2, color0);
+
+   /* fix up alpha */
+   if (format == PIPE_FORMAT_DXT1_RGB ||
+       format == PIPE_FORMAT_DXT1_SRGB) {
+      rgba = LLVMBuildOr(builder, rgba, a, "");
+   }
+   return LLVMBuildBitCast(builder, rgba, bld8.vec_type, "");
+}
+
+
+/**
+ * Decode dxt1 style color data, one texel per 32bit element.
+ * This just forwards all arguments to s3tc_dxt1_full_to_rgba_aos().
+ */
+static LLVMValueRef
+s3tc_dxt1_to_rgba_aos(struct gallivm_state *gallivm,
+                      unsigned n,
+                      enum pipe_format format,
+                      LLVMValueRef colors,
+                      LLVMValueRef codewords,
+                      LLVMValueRef i,
+                      LLVMValueRef j)
+{
+   LLVMValueRef rgba;
+
+   rgba = s3tc_dxt1_full_to_rgba_aos(gallivm, n, format,
+                                     colors, codewords, i, j);
+   return rgba;
+}
+
+
+/**
+ * Convert from <n x i128> s3tc dxt3 to <4n x i8> RGBA AoS
+ *
+ * The color part is decoded like dxt1, the explicit 4bit alpha value of
+ * the texel is then expanded to 8 bits and ORed into the top byte.
+ *
+ * @param n  is the number of 32bit elements (texels to decode)
+ * @param format  is the s3tc format, passed on to the color decode
+ * @param colors  is a <n x i32> vector with n x 2x16bit colors
+ * @param codewords  is a <n x i32> vector containing the codewords
+ * @param alpha_low  is a <n x i32> vector, the lower 32 alpha bits
+ * @param alpha_hi  is a <n x i32> vector, the upper 32 alpha bits
+ * @param i  is a <n x i32> vector with the x pixel coordinate (0 to 3)
+ * @param j  is a <n x i32> vector with the y pixel coordinate (0 to 3)
+ */
+static LLVMValueRef
+s3tc_dxt3_to_rgba_aos(struct gallivm_state *gallivm,
+                      unsigned n,
+                      enum pipe_format format,
+                      LLVMValueRef colors,
+                      LLVMValueRef codewords,
+                      LLVMValueRef alpha_low,
+                      LLVMValueRef alpha_hi,
+                      LLVMValueRef i,
+                      LLVMValueRef j)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   struct lp_type type32, type8;
+   struct lp_build_context bld32;
+   LLVMValueRef texel, alpha, alpha_dup, shift, use_low, const2;
+
+   memset(&type32, 0, sizeof(type32));
+   type32.width = 32;
+   type32.length = n;
+
+   memset(&type8, 0, sizeof(type8));
+   type8.width = 8;
+   type8.length = n*4;
+
+   assert(lp_check_value(type32, i));
+   assert(lp_check_value(type32, j));
+
+   lp_build_context_init(&bld32, gallivm, type32);
+
+   texel = s3tc_dxt1_to_rgba_aos(gallivm, n, format,
+                                 colors, codewords, i, j);
+   texel = LLVMBuildBitCast(builder, texel, bld32.vec_type, "");
+
+   /*
+    * The alpha bits are split over two 32bit vectors. Bit 5 of the bit
+    * position tells which one holds the value, so build a select mask
+    * out of it and shift by the position without that bit afterwards.
+    * Note this is pointless for n == 1 case. Could just
+    * directly use 64bit arithmetic if we'd extract 64bit
+    * alpha value instead of 2x32...
+    */
+   const2 = lp_build_const_int_vec(gallivm, type32, 2);
+   /* pos = 4*(4j+i) */
+   shift = LLVMBuildShl(builder, j, const2, "");
+   shift = LLVMBuildAdd(builder, shift, i, "");
+   shift = LLVMBuildShl(builder, shift, const2, "");
+   /* (pos >> 5) - 1 is all ones if the value is in the low part, else 0 */
+   use_low = LLVMBuildLShr(builder, shift,
+                           lp_build_const_int_vec(gallivm, type32, 5), "");
+   use_low = LLVMBuildSub(builder, use_low, bld32.one, "");
+   alpha = lp_build_select(&bld32, use_low, alpha_low, alpha_hi);
+   shift = LLVMBuildAnd(builder, shift,
+                        lp_build_const_int_vec(gallivm, type32, 0xffffffdf), "");
+   /* Warning: slow shift with per element count */
+   /*
+    * Could do pshufb here as well - just use appropriate 2 bits in bit_pos
+    * to select the right byte with pshufb. Then for the remaining one bit
+    * just do shift/select.
+    */
+   alpha = LLVMBuildLShr(builder, alpha, shift, "");
+
+   /* combined expand from a4 to a8 and shift into position */
+   alpha = LLVMBuildShl(builder, alpha,
+                        lp_build_const_int_vec(gallivm, type32, 28), "");
+   alpha_dup = LLVMBuildLShr(builder, alpha,
+                             lp_build_const_int_vec(gallivm, type32, 4), "");
+   alpha = LLVMBuildOr(builder, alpha, alpha_dup, "");
+
+   texel = LLVMBuildOr(builder, alpha, texel, "");
+
+   return LLVMBuildBitCast(builder, texel,
+                           lp_build_vec_type(gallivm, type8), "");
+}
+
+/**
+ * Calculate the interpolated dxt5 alpha value
+ * alpha0 + (((alpha1 - alpha0) * weight) >> 8), where weight is
+ * ((code - 1) * w) >> 6, with w being 255*64/7 where sel_mask is set
+ * and 255*64/5 otherwise.
+ *
+ * All vector arguments are <n x i32>, the result is <n x i32> as well.
+ * The math is done on 16bit lanes, and the final add on 8bit lanes.
+ */
+static LLVMValueRef
+lp_build_lerpdxta(struct gallivm_state *gallivm,
+                  LLVMValueRef alpha0,
+                  LLVMValueRef alpha1,
+                  LLVMValueRef code,
+                  LLVMValueRef sel_mask,
+                  unsigned n)
+{
+   /*
+    * The lerp is done with 16bit math since 32bit pmulld is only
+    * available in sse41 (plus pmullw is actually faster...), so we just
+    * pretend our 32bit values (which are really only 8bit) are 16bits.
+    * Note that this is obviously a disaster for the scalar case.
+    */
+   LLVMBuilderRef builder = gallivm->builder;
+   struct lp_type type32, type16, type8;
+   struct lp_build_context bld16;
+   LLVMTypeRef vec8_t;
+   LLVMValueRef w5, w7, scale, diff, result;
+
+   memset(&type32, 0, sizeof(type32));
+   type32.width = 32;
+   type32.length = n;
+
+   memset(&type16, 0, sizeof(type16));
+   type16.width = 16;
+   type16.length = 2*n;
+   type16.sign = TRUE;
+
+   memset(&type8, 0, sizeof(type8));
+   type8.width = 8;
+   type8.length = 4*n;
+
+   vec8_t = lp_build_vec_type(gallivm, type8);
+
+   lp_build_context_init(&bld16, gallivm, type16);
+   /* 255/7 is a bit off - increase accuracy at the expense of shift later */
+   sel_mask = LLVMBuildBitCast(builder, sel_mask, bld16.vec_type, "");
+   w5 = lp_build_const_int_vec(gallivm, type16, 255*64/5);
+   w7 = lp_build_const_int_vec(gallivm, type16, 255*64/7);
+   scale = lp_build_select(&bld16, sel_mask, w7, w5);
+
+   alpha0 = LLVMBuildBitCast(builder, alpha0, bld16.vec_type, "");
+   alpha1 = LLVMBuildBitCast(builder, alpha1, bld16.vec_type, "");
+   code = LLVMBuildBitCast(builder, code, bld16.vec_type, "");
+   /*
+    * elements which had code 0 (or larger than 5 or 7) will end up with
+    * garbage, but we don't care
+    */
+   code = LLVMBuildSub(builder, code, bld16.one, "");
+
+   scale = LLVMBuildMul(builder, scale, code, "");
+   scale = LLVMBuildLShr(builder, scale,
+                         lp_build_const_int_vec(gallivm, type16, 6), "");
+
+   diff = LLVMBuildSub(builder, alpha1, alpha0, "");
+
+   result = LLVMBuildMul(builder, diff, scale, "");
+   result = LLVMBuildLShr(builder, result,
+                          lp_build_const_int_vec(gallivm, type16, 8), "");
+
+   /* the add is done with 8bit lanes */
+   result = LLVMBuildBitCast(builder, result, vec8_t, "");
+   alpha0 = LLVMBuildBitCast(builder, alpha0, vec8_t, "");
+   result = LLVMBuildAdd(builder, alpha0, result, "");
+
+   return LLVMBuildBitCast(builder, result,
+                           lp_build_vec_type(gallivm, type32), "");
+}
+
+/**
+ * Convert from <n x i128> s3tc dxt5 to <4n x i8> RGBA AoS
+ *
+ * The color part is decoded like dxt1. The two 8bit alpha endpoints are
+ * taken from the lowest 16 bits of alpha_lo, the 3bit alpha code of the
+ * texel is extracted from the remaining bits of alpha_lo/alpha_hi.
+ *
+ * @param n  is the number of 32bit elements (texels to decode)
+ * @param format  is the s3tc format, passed on to the color decode
+ * @param colors  is a <n x i32> vector with n x 2x16bit colors
+ * @param codewords  is a <n x i32> vector containing the codewords
+ * @param alpha_lo  is a <n x i32> vector, the lower 32 bits of alpha data
+ * @param alpha_hi  is a <n x i32> vector, the upper 32 bits of alpha data
+ * @param i  is a <n x i32> vector with the x pixel coordinate (0 to 3)
+ * @param j  is a <n x i32> vector with the y pixel coordinate (0 to 3)
+ */
+static LLVMValueRef
+s3tc_dxt5_full_to_rgba_aos(struct gallivm_state *gallivm,
+                           unsigned n,
+                           enum pipe_format format,
+                           LLVMValueRef colors,
+                           LLVMValueRef codewords,
+                           LLVMValueRef alpha_lo,
+                           LLVMValueRef alpha_hi,
+                           LLVMValueRef i,
+                           LLVMValueRef j)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMValueRef rgba, tmp, alpha0, alpha1, alphac, alphac0, bit_pos, shift;
+   LLVMValueRef sel_mask, tmp_mask, alpha, alpha64, code_s;
+   LLVMValueRef mask6, mask7, ainterp;
+   LLVMTypeRef i64t = LLVMInt64TypeInContext(gallivm->context);
+   LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
+   struct lp_type type, type8;
+   struct lp_build_context bld32;
+
+   memset(&type, 0, sizeof type);
+   type.width = 32;
+   type.length = n;
+
+   memset(&type8, 0, sizeof type8);
+   type8.width = 8;
+   type8.length = n*4;
+
+   assert(lp_check_value(type, i));
+   assert(lp_check_value(type, j));
+
+   lp_build_context_init(&bld32, gallivm, type);
+
+   rgba = s3tc_dxt1_to_rgba_aos(gallivm, n, format,
+                                colors, codewords, i, j);
+
+   rgba = LLVMBuildBitCast(builder, rgba, bld32.vec_type, "");
+
+   /* this looks pretty complex for vectorization:
+    * extract a0/a1 values
+    * extract code
+    * select weights for interpolation depending on a0 > a1
+    * mul weights by code - 1
+    * lerp a0/a1/weights
+    * use selects for getting either a0, a1, interp a, interp a/0.0, interp a/1.0
+    */
+
+   alpha0 = LLVMBuildAnd(builder, alpha_lo,
+                         lp_build_const_int_vec(gallivm, type, 0xff), "");
+   alpha1 = LLVMBuildLShr(builder, alpha_lo,
+                          lp_build_const_int_vec(gallivm, type, 8), "");
+   alpha1 = LLVMBuildAnd(builder, alpha1,
+                         lp_build_const_int_vec(gallivm, type, 0xff), "");
+
+   /* pos = 3*(4j+i) */
+   bit_pos = LLVMBuildShl(builder, j, lp_build_const_int_vec(gallivm, type, 2), "");
+   bit_pos = LLVMBuildAdd(builder, bit_pos, i, "");
+   tmp = LLVMBuildAdd(builder, bit_pos, bit_pos, "");
+   bit_pos = LLVMBuildAdd(builder, bit_pos, tmp, "");
+   /* get rid of first 2 bytes - saves shifts of alpha_lo/hi */
+   bit_pos = LLVMBuildAdd(builder, bit_pos,
+                          lp_build_const_int_vec(gallivm, type, 16), "");
+
+   if (n == 1) {
+      struct lp_type type64;
+      memset(&type64, 0, sizeof type64);
+      type64.width = 64;
+      type64.length = 1;
+      /* This is pretty pointless could avoid by just directly extracting
+         64bit in the first place but makes it more complicated elsewhere */
+      alpha_lo = LLVMBuildZExt(builder, alpha_lo, i64t, "");
+      alpha_hi = LLVMBuildZExt(builder, alpha_hi, i64t, "");
+      alphac0 = LLVMBuildShl(builder, alpha_hi,
+                             lp_build_const_int_vec(gallivm, type64, 32), "");
+      alphac0 = LLVMBuildOr(builder, alpha_lo, alphac0, "");
+
+      shift = LLVMBuildZExt(builder, bit_pos, i64t, "");
+      alphac0 = LLVMBuildLShr(builder, alphac0, shift, "");
+      alphac0 = LLVMBuildTrunc(builder, alphac0, i32t, "");
+      alphac = LLVMBuildAnd(builder, alphac0,
+                            lp_build_const_int_vec(gallivm, type, 0x7), "");
+   }
+   else {
+      /*
+       * Using non-native vector length here (actually, with avx2 and
+       * n == 4 llvm will indeed expand to ymm regs...)
+       * At least newer llvm versions handle that ok.
+       * llvm 3.7+ will even handle the emulated 64bit shift with variable
+       * shift count without extraction (and it's actually easier to
+       * emulate than the 32bit one).
+       */
+      alpha64 = LLVMBuildShuffleVector(builder, alpha_lo, alpha_hi,
+                                       lp_build_const_unpackx2_shuffle(gallivm, n), "");
+
+      alpha64 = LLVMBuildBitCast(builder, alpha64, LLVMVectorType(i64t, n), "");
+      shift = LLVMBuildZExt(builder, bit_pos, LLVMVectorType(i64t, n), "");
+      alphac = LLVMBuildLShr(builder, alpha64, shift, "");
+      alphac = LLVMBuildTrunc(builder, alphac, bld32.vec_type, "");
+
+      alphac = LLVMBuildAnd(builder, alphac,
+                            lp_build_const_int_vec(gallivm, type, 0x7), "");
+   }
+
+   /* signed compare is faster saves some xors */
+   type.sign = TRUE;
+   /* alpha0 > alpha1 selection */
+   sel_mask = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER,
+                               alpha0, alpha1);
+   ainterp = lp_build_lerpdxta(gallivm, alpha0, alpha1, alphac, sel_mask, n);
+
+   /*
+    * if a0 > a1 then we select a0 for case 0, a1 for case 1, interp otherwise.
+    * else we select a0 for case 0, a1 for case 1,
+    * interp for case 2-5, 00 for 6 and 0xff(ffffff) for 7
+    * a = (c == 0) ? a0 : a1
+    * a = (c > 1) ? ainterp : a
+    * Finally handle case 6/7 for !(a0 > a1)
+    * a = (!(a0 > a1) && c == 6) ? 0 : a (andnot with mask)
+    * a = (!(a0 > a1) && c == 7) ? 0xffffffff : a (or with mask)
+    */
+   tmp_mask = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL,
+                               alphac, bld32.zero);
+   alpha = lp_build_select(&bld32, tmp_mask, alpha0, alpha1);
+   tmp_mask = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER,
+                               alphac, bld32.one);
+   alpha = lp_build_select(&bld32, tmp_mask, ainterp, alpha);
+
+   /* code_s is the code where !(a0 > a1), and zero elsewhere */
+   code_s = LLVMBuildAnd(builder, alphac,
+                         LLVMBuildNot(builder, sel_mask, ""), "");
+   mask6 = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL,
+                            code_s, lp_build_const_int_vec(gallivm, type, 6));
+   mask7 = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL,
+                            code_s, lp_build_const_int_vec(gallivm, type, 7));
+   alpha = LLVMBuildAnd(builder, alpha, LLVMBuildNot(builder, mask6, ""), "");
+   alpha = LLVMBuildOr(builder, alpha, mask7, "");
+
+   /* move alpha into the top byte and combine with the color */
+   alpha = LLVMBuildShl(builder, alpha, lp_build_const_int_vec(gallivm, type, 24), "");
+   rgba = LLVMBuildOr(builder, alpha, rgba, "");
+
+   return LLVMBuildBitCast(builder, rgba, lp_build_vec_type(gallivm, type8), "");
+}
+
+
+/**
+ * Load one complete s3tc block from memory.
+ *
+ * The block (64 or 128 bits, depending on the format) is returned as a
+ * 4 x i32 vector. For 64bit blocks only the first two elements are
+ * taken from the loaded block.
+ *
+ * @param format_desc  description of the s3tc format
+ * @param dxt_block    receives the loaded block
+ * @param ptr          pointer to the block in memory
+ */
+static void
+lp_build_gather_s3tc_simple_scalar(struct gallivm_state *gallivm,
+                                   const struct util_format_description *format_desc,
+                                   LLVMValueRef *dxt_block,
+                                   LLVMValueRef ptr)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   const unsigned block_bits = format_desc->block.bits;
+   LLVMTypeRef i32_t = LLVMIntTypeInContext(gallivm->context, 32);
+   LLVMTypeRef block_t = LLVMIntTypeInContext(gallivm->context, block_bits);
+   LLVMTypeRef vec4_t = LLVMVectorType(i32_t, 4);
+   LLVMTypeRef vec2_t;
+   LLVMValueRef block, extend;
+
+   assert(block_bits == 64 || block_bits == 128);
+
+   /* fetch the whole block with one integer load */
+   ptr = LLVMBuildBitCast(builder, ptr, LLVMPointerType(block_t, 0), "");
+   block = LLVMBuildLoad(builder, ptr, "");
+
+   if (block_bits == 128) {
+      /* already the right size, only needs reinterpreting */
+      *dxt_block = LLVMBuildBitCast(builder, block, vec4_t, "");
+      return;
+   }
+
+   /* 64bit block: widen the 2 x i32 vector to 4 elements */
+   vec2_t = LLVMVectorType(i32_t, 2);
+   extend = lp_build_const_extend_shuffle(gallivm, 2, 4);
+   block = LLVMBuildBitCast(builder, block, vec2_t, "");
+   *dxt_block = LLVMBuildShuffleVector(builder, block,
+                                       LLVMGetUndef(vec2_t), extend, "");
+}
+
+
+/**
+ * Store a decoded block into the format cache.
+ *
+ * The tag is written to the tags member at hash_index. The 4 vectors
+ * from col (stored as 4 x i32 each) are written to the data member,
+ * starting at index hash_index * 16 and advancing by 4 per vector.
+ *
+ * @param col         array of the 4 values to store
+ * @param tag_value   tag to store for this cache entry
+ * @param hash_index  index of the cache entry
+ * @param cache       pointer to the cache
+ */
+static void
+s3tc_store_cached_block(struct gallivm_state *gallivm,
+                        LLVMValueRef *col,
+                        LLVMValueRef tag_value,
+                        LLVMValueRef hash_index,
+                        LLVMValueRef cache)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMTypeRef i32_t = LLVMInt32TypeInContext(gallivm->context);
+   LLVMTypeRef vec4_ptr_t = LLVMPointerType(LLVMVectorType(i32_t, 4), 0);
+   LLVMValueRef path[3], slot, data_index;
+   unsigned row;
+
+   /* the tag first */
+   path[0] = lp_build_const_int32(gallivm, 0);
+   path[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_TAGS);
+   path[2] = hash_index;
+   slot = LLVMBuildGEP(builder, cache, path, ARRAY_SIZE(path), "");
+   LLVMBuildStore(builder, tag_value, slot);
+
+   /* then the data, one 4 x i32 vector at a time */
+   path[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_DATA);
+   data_index = LLVMBuildMul(builder, hash_index,
+                             lp_build_const_int32(gallivm, 16), "");
+   for (row = 0; row < 4; row++) {
+      path[2] = data_index;
+      slot = LLVMBuildGEP(builder, cache, path, ARRAY_SIZE(path), "");
+      slot = LLVMBuildBitCast(builder, slot, vec4_ptr_t, "");
+      LLVMBuildStore(builder, col[row], slot);
+      data_index = LLVMBuildAdd(builder, data_index,
+                                lp_build_const_int32(gallivm, 4), "");
+   }
+}
+
+/**
+ * Load the element at the given index from the data member of the cache.
+ *
+ * @param ptr    pointer to the cache
+ * @param index  index into the data member
+ */
+static LLVMValueRef
+s3tc_lookup_cached_pixel(struct gallivm_state *gallivm,
+                         LLVMValueRef ptr,
+                         LLVMValueRef index)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMValueRef path[3], data_ptr;
+
+   path[0] = lp_build_const_int32(gallivm, 0);
+   path[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_DATA);
+   path[2] = index;
+
+   data_ptr = LLVMBuildGEP(builder, ptr, path, ARRAY_SIZE(path), "");
+
+   return LLVMBuildLoad(builder, data_ptr, "cache_data");
+}
+
+/**
+ * Load the element at the given index from the tags member of the cache.
+ *
+ * @param ptr    pointer to the cache
+ * @param index  index into the tags member
+ */
+static LLVMValueRef
+s3tc_lookup_tag_data(struct gallivm_state *gallivm,
+                     LLVMValueRef ptr,
+                     LLVMValueRef index)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMValueRef path[3], tag_ptr;
+
+   path[0] = lp_build_const_int32(gallivm, 0);
+   path[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_TAGS);
+   path[2] = index;
+
+   tag_ptr = LLVMBuildGEP(builder, ptr, path, ARRAY_SIZE(path), "");
+
+   return LLVMBuildLoad(builder, tag_ptr, "tag_data");
+}
+
+#if LP_BUILD_FORMAT_CACHE_DEBUG
+/**
+ * Debug helper, add count to one of the access counters of the cache.
+ *
+ * @param ptr    pointer to the cache
+ * @param count  value to add to the counter
+ * @param index  the counter member, must be either
+ *               LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL or
+ *               LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS
+ */
+static void
+s3tc_update_cache_access(struct gallivm_state *gallivm,
+                         LLVMValueRef ptr,
+                         unsigned count,
+                         unsigned index)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMTypeRef i64_t = LLVMInt64TypeInContext(gallivm->context);
+   LLVMValueRef counter_ptr, counter, increment;
+
+   assert(index == LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL ||
+          index == LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);
+
+   increment = LLVMConstInt(i64_t, count, 0);
+
+   /* plain load / add / store of the 64bit counter */
+   counter_ptr = lp_build_struct_get_ptr(gallivm, ptr, index, "");
+   counter = LLVMBuildLoad(builder, counter_ptr, "cache_access");
+   counter = LLVMBuildAdd(builder, counter, increment, "");
+   LLVMBuildStore(builder, counter, counter_ptr);
+}
+#endif
+
+/**
+ * Calculate 1/3(v1-v0) + v0 and 2*1/3(v1-v0) + v0 with a single multiply.
+ * v01 holds the two 32bit colors v0 and v1 in its first two elements;
+ * both results are returned packed in the first two elements of the result.
+ */
+static LLVMValueRef
+lp_build_lerp23_single(struct lp_build_context *bld,
+                       LLVMValueRef v01)
+{
+   struct gallivm_state *gallivm = bld->gallivm;
+   LLVMBuilderRef builder = gallivm->builder;
+   const struct lp_type type8 = bld->type;
+   struct lp_type type16 = lp_wider_type(type8);
+   struct lp_type type32 = lp_wider_type(type16);
+   struct lp_build_context bld16s;
+   LLVMValueRef weights[8], wvec, lo, hi, diff, scaled, packed;
+   unsigned k;
+
+   assert(!type8.floating && !type8.fixed && !type8.norm && type8.width == 8);
+
+   /* the difference is computed in a signed 16bit context */
+   lp_build_context_init(&bld16s, gallivm, type16);
+   bld16s.type.sign = TRUE;
+
+   /* weights round(256/3) = 85 (low half) and round(256*2/3) = 171 (high) */
+   for (k = 0; k < 8; k++) {
+      weights[k] = lp_build_const_elem(gallivm, type16, k < 4 ? 85 : 171);
+   }
+   wvec = LLVMConstVector(weights, 8);
+
+   /*
+    * v01 has col0 in 32bit elem 0, col1 in elem 1. Duplicating each elem
+    * and widening the bytes gives col0|col0 in lo and col1|col1 in hi.
+    */
+   v01 = lp_build_interleave2(gallivm, type32, v01, v01, 0);
+   v01 = LLVMBuildBitCast(builder, v01, bld->vec_type, "");
+   lp_build_unpack2(gallivm, type8, type16, v01, &lo, &hi);
+   diff = lp_build_sub(&bld16s, hi, lo);
+   scaled = LLVMBuildMul(builder, wvec, diff, "");
+   scaled = LLVMBuildLShr(builder, scaled,
+                          lp_build_const_int_vec(gallivm, type16, 8), "");
+
+   /* lerp optimization: pack now, do add afterwards */
+   packed = lp_build_pack2(gallivm, type16, type8, scaled, bld16s.undef);
+   /* only lower 2 elems are valid - there v01 holds v0 in both */
+   return lp_build_add(bld, packed, v01);
+}
+
+/*
+ * decode one dxt1 block (or the color part of a dxt3/5 block) into col[0..3].
+ */
+static void
+s3tc_decode_block_dxt1(struct gallivm_state *gallivm,
+                       enum pipe_format format,
+                       LLVMValueRef dxt_block,
+                       LLVMValueRef *col)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMValueRef color01, color23, color01_16, color0123;
+   LLVMValueRef rgba, tmp, a, sel_mask, indices, code, const2;
+   struct lp_type type8, type32, type16, type64;
+   struct lp_build_context bld8, bld32, bld16, bld64;
+   unsigned i;
+   boolean is_dxt1_variant = format_dxt1_variant(format);
+
+   memset(&type32, 0, sizeof type32);
+   type32.width = 32;
+   type32.length = 4;
+   type32.sign = TRUE;
+
+   memset(&type8, 0, sizeof type8);
+   type8.width = 8;
+   type8.length = 16;
+
+   memset(&type16, 0, sizeof type16);
+   type16.width = 16;
+   type16.length = 8;
+
+   memset(&type64, 0, sizeof type64);
+   type64.width = 64;
+   type64.length = 2;
+
+   a = lp_build_const_int_vec(gallivm, type32, 0xff000000);
+   const2 = lp_build_const_int_vec(gallivm, type32, 2);
+
+   lp_build_context_init(&bld32, gallivm, type32);
+   lp_build_context_init(&bld16, gallivm, type16);
+   lp_build_context_init(&bld8, gallivm, type8);
+   lp_build_context_init(&bld64, gallivm, type64);
+
+   if (is_dxt1_variant) { /* colors in dword 0, codes in dword 1 */
+      color01 = lp_build_shuffle1undef(gallivm, dxt_block, 0, 4);
+      code = lp_build_shuffle1undef(gallivm, dxt_block, 1, 4);
+   } else { /* dxt3/5: color part sits in dwords 2 and 3 */
+      color01 = lp_build_shuffle1undef(gallivm, dxt_block, 2, 4);
+      code = lp_build_shuffle1undef(gallivm, dxt_block, 3, 4);
+   }
+   code = LLVMBuildBitCast(builder, code, bld8.vec_type, "");
+   /* expand bytes to dwords (each code byte ends up replicated 4 times) */
+   code = lp_build_interleave2(gallivm, type8, code, code, 0);
+   code = lp_build_interleave2(gallivm, type8, code, code, 0);
+
+
+   /*
+    * works as follows:
+    * - expand color0/color1 to rgba8888
+    * - calculate color2/3 (interpolation) according to color0 < color1 rules
+    * - calculate color2/3 according to color0 >= color1 rules
+    * - do selection of color2/3 according to comparison of color0/1
+    * - extract indices.
+    * - use compare/select to select the correct color. Since we have 2bit
+    *   indices (and 4 colors), needs at least three compare/selects.
+    */
+
+   /*
+    * expand the two colors
+    */
+   color01 = LLVMBuildBitCast(builder, color01, bld16.vec_type, "");
+   color01 = lp_build_interleave2(gallivm, type16, color01,
+                                  bld16.zero, 0);
+   color01_16 = LLVMBuildBitCast(builder, color01, bld32.vec_type, "");
+   color01 = color_expand_565_to_8888(gallivm, 4, color01_16);
+
+   /*
+    * interpolate colors
+    * color2_1 is 2/3 color0 + 1/3 color1
+    * color3_1 is 1/3 color0 + 2/3 color1
+    * color2_2 is 1/2 color0 + 1/2 color1
+    * color3_2 is 0
+    */
+
+   /* TODO: since this is now always scalar, should
+    * probably just use control flow here instead of calculating
+    * both cases and then selection
+    */
+   if (format == PIPE_FORMAT_DXT1_RGBA ||
+       format == PIPE_FORMAT_DXT1_SRGBA) {
+      color01 = LLVMBuildOr(builder, color01, a, "");
+   }
+   /* can combine 2 lerps into one mostly */
+   color23 = lp_build_lerp23_single(&bld8, color01);
+   color23 = LLVMBuildBitCast(builder, color23, bld32.vec_type, "");
+
+   /* dxt3/5 always use 4-color encoding */
+   if (is_dxt1_variant) {
+      LLVMValueRef color23_2, color2_2;
+
+      if (util_cpu_caps.has_sse2) {
+         LLVMValueRef intrargs[2];
+         intrargs[0] = LLVMBuildBitCast(builder, color01, bld8.vec_type, "");
+         /* same interleave as for lerp23 - correct result in 2nd element */
+         intrargs[1] = lp_build_interleave2(gallivm, type32, color01, color01, 0);
+         intrargs[1] = LLVMBuildBitCast(builder, intrargs[1], bld8.vec_type, "");
+         color2_2 = lp_build_intrinsic(builder, "llvm.x86.sse2.pavg.b",
+                                       bld8.vec_type, intrargs, 2, 0);
+      }
+      else {
+         LLVMValueRef v01, v0, v1, vhalf;
+         /*
+          * This isn't as expensive as it looks (the unpack is the same as
+          * for lerp23, which is the reason why we do the pointless
+          * interleave2 too), with correct rounding (the two lower elements
+          * will be the same).
+          */
+         v01 = lp_build_interleave2(gallivm, type32, color01, color01, 0);
+         v01 = LLVMBuildBitCast(builder, v01, bld8.vec_type, "");
+         lp_build_unpack2(gallivm, type8, type16, v01, &v0, &v1);
+         vhalf = lp_build_add(&bld16, v0, v1);
+         vhalf = LLVMBuildLShr(builder, vhalf, bld16.one, "");
+         color2_2 = lp_build_pack2(gallivm, type16, type8, vhalf, bld16.undef);
+      }
+      /* shift by 32: color 2 moves from elem 1 to elem 0, elem 1 (color 3) = 0 */
+      color23_2 = LLVMBuildBitCast(builder, color2_2, bld64.vec_type, "");
+      color23_2 = LLVMBuildLShr(builder, color23_2,
+                                lp_build_const_int_vec(gallivm, type64, 32), "");
+      color23_2 = LLVMBuildBitCast(builder, color23_2, bld32.vec_type, "");
+      /* use the 4-color results if color0 > color1 (compared as raw 565) */
+      tmp = LLVMBuildBitCast(builder, color01_16, bld64.vec_type, "");
+      tmp = LLVMBuildLShr(builder, tmp,
+                          lp_build_const_int_vec(gallivm, type64, 32), "");
+      tmp = LLVMBuildBitCast(builder, tmp, bld32.vec_type, "");
+      sel_mask = lp_build_compare(gallivm, type32, PIPE_FUNC_GREATER,
+                                  color01_16, tmp);
+      sel_mask = lp_build_interleave2(gallivm, type32, sel_mask, sel_mask, 0);
+      color23 = lp_build_select(&bld32, sel_mask, color23, color23_2);
+   }
+
+   if (util_cpu_caps.has_ssse3) {
+      /*
+       * Use pshufb as mini-lut. (Only doable with intrinsics as the
+       * final shuffles are non-constant. pshufb is awesome!)
+       */
+      LLVMValueRef shuf[16], low2mask;
+      LLVMValueRef intrargs[2], lut_ind, lut_adj;
+
+      color01 = LLVMBuildBitCast(builder, color01, bld64.vec_type, "");
+      color23 = LLVMBuildBitCast(builder, color23, bld64.vec_type, "");
+      color0123 = lp_build_interleave2(gallivm, type64, color01, color23, 0);
+      color0123 = LLVMBuildBitCast(builder, color0123, bld32.vec_type, "");
+
+      if (format == PIPE_FORMAT_DXT1_RGB ||
+          format == PIPE_FORMAT_DXT1_SRGB) {
+         color0123 = LLVMBuildOr(builder, color0123, a, "");
+      }
+
+      /* shuffle as r0r1r2r3g0g1... */
+      for (i = 0; i < 4; i++) {
+         shuf[4*i] = lp_build_const_int32(gallivm, 0 + i);
+         shuf[4*i+1] = lp_build_const_int32(gallivm, 4 + i);
+         shuf[4*i+2] = lp_build_const_int32(gallivm, 8 + i);
+         shuf[4*i+3] = lp_build_const_int32(gallivm, 12 + i);
+      }
+      color0123 = LLVMBuildBitCast(builder, color0123, bld8.vec_type, "");
+      color0123 = LLVMBuildShuffleVector(builder, color0123, bld8.undef,
+                                         LLVMConstVector(shuf, 16), "");
+
+      /* lowest 2 bits of each 8 bit value contain index into "LUT" */
+      low2mask = lp_build_const_int_vec(gallivm, type8, 3);
+      /* add 0/4/8/12 for r/g/b/a */
+      lut_adj = lp_build_const_int_vec(gallivm, type32, 0x0c080400);
+      lut_adj = LLVMBuildBitCast(builder, lut_adj, bld8.vec_type, "");
+      intrargs[0] = color0123;
+      for (i = 0; i < 4; i++) {
+         lut_ind = LLVMBuildAnd(builder, code, low2mask, "");
+         lut_ind = LLVMBuildOr(builder, lut_ind, lut_adj, "");
+         intrargs[1] = lut_ind;
+         col[i] = lp_build_intrinsic(builder, "llvm.x86.ssse3.pshuf.b.128",
+                                     bld8.vec_type, intrargs, 2, 0);
+         col[i] = LLVMBuildBitCast(builder, col[i], bld32.vec_type, "");
+         code = LLVMBuildBitCast(builder, code, bld32.vec_type, "");
+         code = LLVMBuildLShr(builder, code, const2, "");
+         code = LLVMBuildBitCast(builder, code, bld8.vec_type, "");
+      }
+   }
+   else {
+      /* Thanks to vectorization can do 4 texels in parallel */
+      LLVMValueRef color0, color1, color2, color3;
+      if (format == PIPE_FORMAT_DXT1_RGB ||
+          format == PIPE_FORMAT_DXT1_SRGB) {
+         color01 = LLVMBuildOr(builder, color01, a, "");
+         color23 = LLVMBuildOr(builder, color23, a, "");
+      }
+      color0 = LLVMBuildShuffleVector(builder, color01, bld32.undef,
+                                      lp_build_const_shuffle1(gallivm, 0, 4), "");
+      color1 = LLVMBuildShuffleVector(builder, color01, bld32.undef,
+                                      lp_build_const_shuffle1(gallivm, 1, 4), "");
+      color2 = LLVMBuildShuffleVector(builder, color23, bld32.undef,
+                                      lp_build_const_shuffle1(gallivm, 0, 4), "");
+      color3 = LLVMBuildShuffleVector(builder, color23, bld32.undef,
+                                      lp_build_const_shuffle1(gallivm, 1, 4), "");
+      code = LLVMBuildBitCast(builder, code, bld32.vec_type, "");
+
+      for (i = 0; i < 4; i++) {
+         /* select the colors */
+         LLVMValueRef selmasklo, rgba01, rgba23, bitlo;
+         bitlo = bld32.one;
+         indices = LLVMBuildAnd(builder, code, bitlo, "");
+         selmasklo = lp_build_compare(gallivm, type32, PIPE_FUNC_EQUAL,
+                                      indices, bitlo);
+         rgba01 = lp_build_select(&bld32, selmasklo, color1, color0);
+
+         LLVMValueRef selmaskhi;
+         indices = LLVMBuildAnd(builder, code, const2, "");
+         selmaskhi = lp_build_compare(gallivm, type32, PIPE_FUNC_EQUAL,
+                                      indices, const2);
+         rgba23 = lp_build_select(&bld32, selmasklo, color3, color2);
+         rgba = lp_build_select(&bld32, selmaskhi, rgba23, rgba01);
+
+         /*
+          * Note that this will give "wrong" order.
+          * col0 will be rgba0, rgba4, rgba8, rgba12, col1 rgba1, rgba5, ...
+          * This would be easily fixable by using different shuffle, bitlo/hi
+          * vectors above (and different shift), but seems slightly easier to
+          * deal with for dxt3/dxt5 alpha too. So instead change lookup.
+          */
+         col[i] = rgba;
+         code = LLVMBuildLShr(builder, code, const2, "");
+      }
+   }
+}
+
+/*
+ * decode one dxt3 block: dxt1 style colors plus explicit 4bit alpha values.
+ */
+static void
+s3tc_decode_block_dxt3(struct gallivm_state *gallivm,
+                       enum pipe_format format,
+                       LLVMValueRef dxt_block,
+                       LLVMValueRef *col)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMValueRef alpha8, nib_down, nib_up, shift4, top_byte_mask, a[4];
+   LLVMTypeRef vec8t, vec16t, vec32t;
+   struct lp_type type8, type16, type32;
+   unsigned i;
+
+   memset(&type8, 0, sizeof type8);
+   type8.width = 8;
+   type8.length = 16;
+   memset(&type16, 0, sizeof type16);
+   type16.width = 16;
+   type16.length = 8;
+   memset(&type32, 0, sizeof type32);
+   type32.width = 32;
+   type32.length = 4;
+
+   /* colors first, the alpha gets or'ed on top at the very end */
+   s3tc_decode_block_dxt1(gallivm, format, dxt_block, col);
+
+   vec8t = lp_build_vec_type(gallivm, type8);
+   vec16t = lp_build_vec_type(gallivm, type16);
+   vec32t = lp_build_vec_type(gallivm, type32);
+   shift4 = lp_build_const_int_vec(gallivm, type16, 4);
+   top_byte_mask = lp_build_const_int_vec(gallivm, type32, 0xff000000);
+
+   /*
+    * Duplicate each alpha byte into a 16bit lane, keep one nibble per byte
+    * (0xf00f) and copy it into the empty half - 4bit alphas become 8 bits.
+    */
+   alpha8 = LLVMBuildBitCast(builder, dxt_block, vec8t, "");
+   alpha8 = lp_build_interleave2(gallivm, type8, alpha8, alpha8, 0);
+   alpha8 = LLVMBuildBitCast(builder, alpha8, vec16t, "");
+   alpha8 = LLVMBuildAnd(builder, alpha8,
+                         lp_build_const_int_vec(gallivm, type16, 0xf00f), "");
+   nib_down = LLVMBuildLShr(builder, alpha8, shift4, "");
+   nib_up = LLVMBuildShl(builder, alpha8, shift4, "");
+   alpha8 = LLVMBuildOr(builder, nib_down, alpha8, "");
+   alpha8 = LLVMBuildOr(builder, nib_up, alpha8, "");
+   alpha8 = LLVMBuildBitCast(builder, alpha8, vec32t, "");
+
+   /*
+    * alpha8 now has elems 0,1,2,3,... (ubytes). Byte i of each dword goes
+    * to the top byte of a[i]: dword order 0,4,8,12, 1,5,9,13 etc. like color.
+    */
+   for (i = 0; i < 4; i++) {
+      LLVMValueRef sh = lp_build_const_int_vec(gallivm, type32, 24 - 8 * i);
+      a[i] = i < 3 ? LLVMBuildShl(builder, alpha8, sh, "") : alpha8;
+      a[i] = i > 0 ? LLVMBuildAnd(builder, a[i], top_byte_mask, "") : a[i];
+   }
+
+   for (i = 0; i < 4; i++) {
+      col[i] = LLVMBuildOr(builder, col[i], a[i], "");
+   }
+}
+
+
+/*
+ * Compute the scaled dxt5 alpha delta (alpha1 - alpha0) * (code - 1) / N,
+ * with N = 7 where sel_mask is set and N = 5 elsewhere.
+ */
+static LLVMValueRef
+lp_build_lerpdxta_block(struct gallivm_state *gallivm,
+                        LLVMValueRef alpha0,
+                        LLVMValueRef alpha1,
+                        LLVMValueRef code,
+                        LLVMValueRef sel_mask)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   struct lp_build_context bld16;
+   struct lp_type type16;
+   LLVMValueRef w_fifths, w_sevenths, step, scale, diff, prod;
+
+   memset(&type16, 0, sizeof type16);
+   type16.width = 16;
+   type16.length = 8;
+   type16.sign = TRUE;
+   lp_build_context_init(&bld16, gallivm, type16);
+
+   /*
+    * 256/7 is only 36.57, so 8bit weights would lose quite some precision.
+    * Hence keep 6 extra fraction bits in the per-step weight and only
+    * drop them after multiplying by the code. (Even more accuracy would be
+    * desirable, more or less required for rgtc, which isn't handled here.)
+    */
+   w_fifths = lp_build_const_int_vec(gallivm, type16, 256*64/5);
+   w_sevenths = lp_build_const_int_vec(gallivm, type16, 256*64/7);
+   scale = lp_build_select(&bld16, sel_mask, w_sevenths, w_fifths);
+
+   /*
+    * Elements with code 0 (or a code larger than 5 or 7) end up as garbage
+    * here, which is fine since those need to be fixed up anyway.
+    */
+   step = LLVMBuildSub(builder, code, bld16.one, "");
+   scale = LLVMBuildMul(builder, scale, step, "");
+   scale = LLVMBuildLShr(builder, scale,
+                         lp_build_const_int_vec(gallivm, type16, 6), "");
+
+   diff = LLVMBuildSub(builder, alpha1, alpha0, "");
+   prod = LLVMBuildMul(builder, diff, scale, "");
+   prod = LLVMBuildLShr(builder, prod,
+                        lp_build_const_int_vec(gallivm, type16, 8), "");
+
+   /* the actual lerp (adding the base) is done later, with packed values */
+   return prod;
+}
+
+
+/*
+ * decode one dxt5 block: dxt1 style colors plus interpolated 3bit-coded alpha.
+ */
+static void
+s3tc_decode_block_dxt5(struct gallivm_state *gallivm,
+                       enum pipe_format format,
+                       LLVMValueRef dxt_block,
+                       LLVMValueRef *col)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMValueRef alpha, alpha0, alpha1, ares;
+   LLVMValueRef ainterp, ainterp0, ainterp1, shuffle1, sel_mask, sel_mask2;
+   LLVMValueRef a[4], acode, tmp0, tmp1;
+   LLVMTypeRef i64t, i32t;
+   struct lp_type type32, type64, type8, type16;
+   struct lp_build_context bld16, bld8;
+   unsigned i;
+
+   memset(&type32, 0, sizeof type32);
+   type32.width = 32;
+   type32.length = 4;
+
+   memset(&type64, 0, sizeof type64);
+   type64.width = 64;
+   type64.length = 2;
+
+   memset(&type8, 0, sizeof type8);
+   type8.width = 8;
+   type8.length = 16;
+
+   memset(&type16, 0, sizeof type16);
+   type16.width = 16;
+   type16.length = 8;
+
+   lp_build_context_init(&bld16, gallivm, type16);
+   lp_build_context_init(&bld8, gallivm, type8);
+
+   i64t = lp_build_vec_type(gallivm, type64);
+   i32t = lp_build_vec_type(gallivm, type32);
+
+   s3tc_decode_block_dxt1(gallivm, format, dxt_block, col);
+
+   /*
+    * three possible strategies for vectorizing alpha:
+    * 1) compute all 8 values then use scalar extraction
+    *    (i.e. have all 8 alpha values packed in one 64bit scalar
+    *    and do something like ax = vals >> (codex * 8) followed
+    *    by inserting these values back into color)
+    * 2) same as 1) but just use pshufb as a mini-LUT for selection.
+    *    (without pshufb would need boatloads of cmp/selects trying to
+    *    keep things vectorized for essentially scalar selection).
+    * 3) do something similar to the uncached case
+    *    needs more calculations (need to calc 16 values instead of 8 though
+    *    that's only an issue for the lerp which we need to do twice otherwise
+    *    everything still fits into 128bit) but keeps things vectorized mostly.
+    * Trying 3) here though not sure it's really faster...
+    * With pshufb, we try 2) (cheaper and more accurate)
+    */
+
+   /*
+    * Common setup for both paths below: alpha0 (byte 0 of the block) and
+    * alpha1 (byte 1) are each broadcast to all 16bit lanes, and sel_mask
+    * is set where alpha0 > alpha1. The 3bit codes follow, starting at
+    * bit 16 of the first qword.
+    * Extracting the codes is where the paths differ. The pshufb based one
+    * is not directly tied to ssse3 - it just needs sane byte shuffles (see
+    * the comment there for why variable shifts are emulated by muls).
+    * But ordering is going to be different below so use same condition.
+    */
+
+
+   /* vectorize alpha */
+   alpha = LLVMBuildBitCast(builder, dxt_block, i64t, "");
+   alpha0 = LLVMBuildAnd(builder, alpha,
+                         lp_build_const_int_vec(gallivm, type64, 0xff), "");
+   alpha0 = LLVMBuildBitCast(builder, alpha0, bld16.vec_type, "");
+   alpha = LLVMBuildBitCast(builder, alpha, bld16.vec_type, "");
+   alpha1 = LLVMBuildLShr(builder, alpha,
+                          lp_build_const_int_vec(gallivm, type16, 8), "");
+   alpha = LLVMBuildBitCast(builder, alpha,  i64t, "");
+   shuffle1 = lp_build_const_shuffle1(gallivm, 0, 8);
+   /* XXX this shuffle broken with LLVM 2.8 */
+   alpha0 = LLVMBuildShuffleVector(builder, alpha0, alpha0, shuffle1, "");
+   alpha1 = LLVMBuildShuffleVector(builder, alpha1, alpha1, shuffle1, "");
+
+   type16.sign = TRUE;
+   sel_mask = lp_build_compare(gallivm, type16, PIPE_FUNC_GREATER,
+                               alpha0, alpha1);
+   type16.sign = FALSE;
+   sel_mask = LLVMBuildBitCast(builder, sel_mask, bld8.vec_type, "");
+
+   if (!util_cpu_caps.has_ssse3) {
+      LLVMValueRef acodeg, mask1, acode0, acode1;
+
+      /* extraction of the 3 bit values into something more useful is HARD */
+      /* first steps are actually scalar */
+      acode = LLVMBuildLShr(builder, alpha,
+                            lp_build_const_int_vec(gallivm, type64, 16), "");
+      tmp0 = LLVMBuildAnd(builder, acode,
+                          lp_build_const_int_vec(gallivm, type64, 0xffffff), "");
+      tmp1 =  LLVMBuildLShr(builder, acode,
+                            lp_build_const_int_vec(gallivm, type64, 24), "");
+      tmp0 = LLVMBuildBitCast(builder, tmp0, i32t, "");
+      tmp1 = LLVMBuildBitCast(builder, tmp1, i32t, "");
+      acode = lp_build_interleave2(gallivm, type32, tmp0, tmp1, 0);
+      /* now have 2x24bit in 4x32bit, order 01234567, 89..., undef, undef */
+      tmp0 = LLVMBuildAnd(builder, acode,
+                          lp_build_const_int_vec(gallivm, type32, 0xfff), "");
+      tmp1 =  LLVMBuildLShr(builder, acode,
+                            lp_build_const_int_vec(gallivm, type32, 12), "");
+      acode = lp_build_interleave2(gallivm, type32, tmp0, tmp1, 0);
+      /* now have 4x12bit in 4x32bit, order 0123, 4567, ,,, */
+      tmp0 = LLVMBuildAnd(builder, acode,
+                          lp_build_const_int_vec(gallivm, type32, 0x3f), "");
+      tmp1 =  LLVMBuildLShr(builder, acode,
+                            lp_build_const_int_vec(gallivm, type32, 6), "");
+      /* use signed pack doesn't matter and otherwise need sse41 */
+      type32.sign = type16.sign = TRUE;
+      acode = lp_build_pack2(gallivm, type32, type16, tmp0, tmp1);
+      type32.sign = type16.sign = FALSE;
+      /* now have 8x6bit in 8x16bit, 01, 45, 89, ..., 23, 67, ... */
+      acode0 = LLVMBuildAnd(builder, acode,
+                            lp_build_const_int_vec(gallivm, type16, 0x7), "");
+      acode1 =  LLVMBuildLShr(builder, acode,
+                              lp_build_const_int_vec(gallivm, type16, 3), "");
+      acode = lp_build_pack2(gallivm, type16, type8, acode0, acode1);
+      /* acode0 contains elems 0,4,8,12,2,6,10,14, acode1 1,5,9,... */
+
+      acodeg = LLVMBuildAnd(builder, acode,
+                            LLVMBuildNot(builder, sel_mask, ""), "");
+      mask1 = lp_build_compare(gallivm, type8, PIPE_FUNC_EQUAL,
+                               acode, bld8.one);
+
+      sel_mask = LLVMBuildBitCast(builder, sel_mask, bld16.vec_type, "");
+      ainterp0 = lp_build_lerpdxta_block(gallivm, alpha0, alpha1, acode0, sel_mask);
+      ainterp1 = lp_build_lerpdxta_block(gallivm, alpha0, alpha1, acode1, sel_mask);
+      sel_mask = LLVMBuildBitCast(builder, sel_mask, bld8.vec_type, "");
+      ainterp = lp_build_pack2(gallivm, type16, type8, ainterp0, ainterp1);
+      alpha0 = lp_build_pack2(gallivm, type16, type8, alpha0, alpha0);
+      alpha1 = lp_build_pack2(gallivm, type16, type8, alpha1, alpha1);
+      ainterp = LLVMBuildAdd(builder, ainterp, alpha0, "");
+      /* Fix up val01: code 0 is alpha0 itself, code 1 is alpha1 */
+      sel_mask2 = lp_build_compare(gallivm, type8, PIPE_FUNC_EQUAL,
+                                   acode, bld8.zero);
+      ainterp = lp_build_select(&bld8, sel_mask2, alpha0, ainterp);
+      ainterp = lp_build_select(&bld8, mask1, alpha1, ainterp);
+
+      /* fix up val67 if a0 <= a1: code 6 gives 0, code 7 gives 0xff */
+      sel_mask2 = lp_build_compare(gallivm, type8, PIPE_FUNC_EQUAL,
+                                   acodeg, lp_build_const_int_vec(gallivm, type8, 6));
+      ares = LLVMBuildAnd(builder, ainterp, LLVMBuildNot(builder, sel_mask2, ""), "");
+      sel_mask2 = lp_build_compare(gallivm, type8, PIPE_FUNC_EQUAL,
+                                   acodeg, lp_build_const_int_vec(gallivm, type8, 7));
+      ares = LLVMBuildOr(builder, ares, sel_mask2, "");
+
+      /* unpack in right order (0,4,8,12,1,5,..) */
+      /* this gives us zero, a0, zero, a4, zero, a8, ... for tmp0 */
+      tmp0 = lp_build_interleave2(gallivm, type8, bld8.zero, ares, 0);
+      tmp1 = lp_build_interleave2(gallivm, type8, bld8.zero, ares, 1);
+      tmp0 = LLVMBuildBitCast(builder, tmp0, bld16.vec_type, "");
+      tmp1 = LLVMBuildBitCast(builder, tmp1, bld16.vec_type, "");
+
+      a[0] = lp_build_interleave2(gallivm, type16, bld16.zero, tmp0, 0);
+      a[1] = lp_build_interleave2(gallivm, type16, bld16.zero, tmp1, 0);
+      a[2] = lp_build_interleave2(gallivm, type16, bld16.zero, tmp0, 1);
+      a[3] = lp_build_interleave2(gallivm, type16, bld16.zero, tmp1, 1);
+   }
+   else {
+      LLVMValueRef elems[16], intrargs[2], shufa, mulclo, mulchi, mask8hi;
+      LLVMTypeRef type16s = LLVMInt16TypeInContext(gallivm->context);
+      LLVMTypeRef type8s = LLVMInt8TypeInContext(gallivm->context);
+      unsigned i, j;
+      /*
+       * Ideally, we'd use 2 variable 16bit shifts here (byte shifts wouldn't
+       * help since code crosses 8bit boundaries). But variable shifts are
+       * AVX2 only, and even then only dword/quadword (intel _really_ hates
+       * shifts!). Instead, emulate by 16bit muls.
+       * Also, the required byte shuffles are essentially non-emulatable, so
+       * require ssse3 (albeit other archs might do them fine, but the
+       * complete path is ssse3 only for now).
+       */
+      for (i = 0, j = 0; i < 16; i += 8, j += 3) {
+         elems[i+0] = elems[i+1] = elems[i+2] = lp_build_const_int32(gallivm, j+2);
+         elems[i+3] = elems[i+4] = lp_build_const_int32(gallivm, j+3);
+         elems[i+5] = elems[i+6] = elems[i+7] = lp_build_const_int32(gallivm, j+4);
+      }
+      shufa = LLVMConstVector(elems, 16);
+      alpha = LLVMBuildBitCast(builder, alpha, bld8.vec_type, "");
+      acode = LLVMBuildShuffleVector(builder, alpha, bld8.undef, shufa, "");
+      acode = LLVMBuildBitCast(builder, acode, bld16.vec_type, "");
+      /*
+       * Put 0/2/4/6 into high 3 bits of 16 bits (save AND mask)
+       * Do the same for 1/3/5/7 (albeit still need mask there - ideally
+       * we'd place them into bits 4-7 so could save shift but impossible.)
+       */
+      for (i = 0; i < 8; i += 4) {
+         elems[i+0] = LLVMConstInt(type16s, 1 << (13-0), 0);
+         elems[i+1] = LLVMConstInt(type16s, 1 << (13-6), 0);
+         elems[i+2] = LLVMConstInt(type16s, 1 << (13-4), 0);
+         elems[i+3] = LLVMConstInt(type16s, 1 << (13-2), 0);
+      }
+      mulclo = LLVMConstVector(elems, 8);
+      for (i = 0; i < 8; i += 4) {
+         elems[i+0] = LLVMConstInt(type16s, 1 << (13-3), 0);
+         elems[i+1] = LLVMConstInt(type16s, 1 << (13-9), 0);
+         elems[i+2] = LLVMConstInt(type16s, 1 << (13-7), 0);
+         elems[i+3] = LLVMConstInt(type16s, 1 << (13-5), 0);
+      }
+      mulchi = LLVMConstVector(elems, 8);
+
+      tmp0 = LLVMBuildMul(builder, acode, mulclo, "");
+      tmp1 = LLVMBuildMul(builder, acode, mulchi, "");
+      tmp0 = LLVMBuildLShr(builder, tmp0,
+                           lp_build_const_int_vec(gallivm, type16, 13), "");
+      tmp1 = LLVMBuildLShr(builder, tmp1,
+                           lp_build_const_int_vec(gallivm, type16, 5), "");
+      tmp1 = LLVMBuildAnd(builder, tmp1,
+                          lp_build_const_int_vec(gallivm, type16, 0x700), "");
+      acode = LLVMBuildOr(builder, tmp0, tmp1, "");
+      acode = LLVMBuildBitCast(builder, acode, bld8.vec_type, "");
+
+      /*
+       * Note that ordering is different here to non-ssse3 path:
+       * 0/1/2/3/4/5...
+       */
+
+      LLVMValueRef weight0, weight1, weight, delta;
+      LLVMValueRef constff_elem7, const0_elem6;
+      /* weights, correctly rounded (round(256*x/7), resp. round(256*x/5)) */
+      elems[0] = LLVMConstInt(type16s, 256, 0);
+      elems[1] = LLVMConstInt(type16s, 0, 0);
+      elems[2] = LLVMConstInt(type16s, 219, 0);
+      elems[3] =  LLVMConstInt(type16s, 183, 0);
+      elems[4] =  LLVMConstInt(type16s, 146, 0);
+      elems[5] =  LLVMConstInt(type16s, 110, 0);
+      elems[6] =  LLVMConstInt(type16s, 73, 0);
+      elems[7] =  LLVMConstInt(type16s, 37, 0);
+      weight0 = LLVMConstVector(elems, 8);
+
+      elems[0] = LLVMConstInt(type16s, 256, 0);
+      elems[1] = LLVMConstInt(type16s, 0, 0);
+      elems[2] = LLVMConstInt(type16s, 205, 0);
+      elems[3] =  LLVMConstInt(type16s, 154, 0);
+      elems[4] =  LLVMConstInt(type16s, 102, 0);
+      elems[5] =  LLVMConstInt(type16s, 51, 0);
+      elems[6] =  LLVMConstInt(type16s, 0, 0);
+      elems[7] =  LLVMConstInt(type16s, 0, 0);
+      weight1 = LLVMConstVector(elems, 8);
+
+      weight0 = LLVMBuildBitCast(builder, weight0, bld8.vec_type, "");
+      weight1 = LLVMBuildBitCast(builder, weight1, bld8.vec_type, "");
+      weight = lp_build_select(&bld8, sel_mask, weight0, weight1);
+      weight = LLVMBuildBitCast(builder, weight, bld16.vec_type, "");
+
+      for (i = 0; i < 16; i++) {
+         elems[i] = LLVMConstNull(type8s);
+      }
+      elems[7] = LLVMConstInt(type8s, 255, 0);
+      constff_elem7 = LLVMConstVector(elems, 16);
+
+      for (i = 0; i < 16; i++) {
+         elems[i] = LLVMConstInt(type8s, 255, 0);
+      }
+      elems[6] = LLVMConstInt(type8s, 0, 0);
+      const0_elem6 = LLVMConstVector(elems, 16);
+
+      /* standard simple lerp - but the version we need isn't available */
+      delta = LLVMBuildSub(builder, alpha0, alpha1, "");
+      ainterp = LLVMBuildMul(builder, delta, weight, "");
+      ainterp = LLVMBuildLShr(builder, ainterp,
+                              lp_build_const_int_vec(gallivm, type16, 8), "");
+      ainterp = LLVMBuildBitCast(builder, ainterp, bld8.vec_type, "");
+      alpha1 = LLVMBuildBitCast(builder, alpha1, bld8.vec_type, "");
+      ainterp = LLVMBuildAdd(builder, ainterp, alpha1, "");
+      ainterp = LLVMBuildBitCast(builder, ainterp, bld16.vec_type, "");
+      ainterp = lp_build_pack2(gallivm, type16, type8, ainterp, bld16.undef);
+
+      /* fixing 0/0xff case (elems 6/7 if a0 <= a1) is slightly more complex */
+      constff_elem7 = LLVMBuildAnd(builder, constff_elem7,
+                                   LLVMBuildNot(builder, sel_mask, ""), "");
+      const0_elem6 = LLVMBuildOr(builder, const0_elem6, sel_mask, "");
+      ainterp = LLVMBuildOr(builder, ainterp, constff_elem7, "");
+      ainterp = LLVMBuildAnd(builder, ainterp, const0_elem6, "");
+
+      /* now pick all 16 elements at once! */
+      intrargs[0] = ainterp;
+      intrargs[1] = acode;
+      ares = lp_build_intrinsic(builder, "llvm.x86.ssse3.pshuf.b.128",
+                                bld8.vec_type, intrargs, 2, 0);
+
+      ares = LLVMBuildBitCast(builder, ares, i32t, "");
+      mask8hi = lp_build_const_int_vec(gallivm, type32, 0xff000000);
+      a[0] = LLVMBuildShl(builder, ares,
+                          lp_build_const_int_vec(gallivm, type32, 24), "");
+      a[1] = LLVMBuildShl(builder, ares,
+                          lp_build_const_int_vec(gallivm, type32, 16), "");
+      a[1] = LLVMBuildAnd(builder, a[1], mask8hi, "");
+      a[2] = LLVMBuildShl(builder, ares,
+                          lp_build_const_int_vec(gallivm, type32, 8), "");
+      a[2] = LLVMBuildAnd(builder, a[2], mask8hi, "");
+      a[3] = LLVMBuildAnd(builder, ares, mask8hi, "");
+   }
+
+   for (i = 0; i < 4; i++) {
+      a[i] = LLVMBuildBitCast(builder, a[i], i32t, "");
+      col[i] = LLVMBuildOr(builder, col[i], a[i], "");
+   }
+}
+
+
+/*
+ * Emit the body of the out-of-line cache refill helper `function`.
+ *
+ * Its parameters are, in order: block pointer (named ptr_addr), hash index,
+ * cache pointer.  The body gathers the block, decodes it with the decoder
+ * matching format_desc->format and passes the texels, with the block
+ * address converted to a 64-bit tag, to s3tc_store_cached_block().
+ * gallivm->builder is temporarily replaced by a private builder.
+ */
+static void
+generate_update_cache_one_block(struct gallivm_state *gallivm,
+                                LLVMValueRef function,
+                                const struct util_format_description *format_desc)
+{
+   const enum pipe_format format = format_desc->format;
+   LLVMTypeRef i64t = LLVMInt64TypeInContext(gallivm->context);
+   LLVMBuilderRef saved_builder = gallivm->builder;
+   LLVMValueRef src_ptr = LLVMGetParam(function, 0);
+   LLVMValueRef slot = LLVMGetParam(function, 1);
+   LLVMValueRef cache_ptr = LLVMGetParam(function, 2);
+   LLVMValueRef packed_block, tag, texels[LP_MAX_VECTOR_LENGTH];
+   LLVMBasicBlockRef entry;
+
+   lp_build_name(src_ptr, "ptr_addr");
+   lp_build_name(slot, "hash_index");
+   lp_build_name(cache_ptr, "cache_addr");
+
+   /* emit into a fresh builder positioned at the end of the entry block */
+   entry = LLVMAppendBasicBlockInContext(gallivm->context, function, "entry");
+   gallivm->builder = LLVMCreateBuilderInContext(gallivm->context);
+   LLVMPositionBuilderAtEnd(gallivm->builder, entry);
+
+   lp_build_gather_s3tc_simple_scalar(gallivm, format_desc, &packed_block,
+                                      src_ptr);
+
+   switch (format) {
+   case PIPE_FORMAT_DXT1_RGB:
+   case PIPE_FORMAT_DXT1_RGBA:
+   case PIPE_FORMAT_DXT1_SRGB:
+   case PIPE_FORMAT_DXT1_SRGBA:
+      s3tc_decode_block_dxt1(gallivm, format, packed_block, texels);
+      break;
+   case PIPE_FORMAT_DXT3_RGBA:
+   case PIPE_FORMAT_DXT3_SRGBA:
+      s3tc_decode_block_dxt3(gallivm, format, packed_block, texels);
+      break;
+   case PIPE_FORMAT_DXT5_RGBA:
+   case PIPE_FORMAT_DXT5_SRGBA:
+      s3tc_decode_block_dxt5(gallivm, format, packed_block, texels);
+      break;
+   default:
+      assert(0);
+      s3tc_decode_block_dxt1(gallivm, format, packed_block, texels);
+      break;
+   }
+
+   tag = LLVMBuildPtrToInt(gallivm->builder, src_ptr, i64t, "");
+   s3tc_store_cached_block(gallivm, texels, tag, slot, cache_ptr);
+   LLVMBuildRetVoid(gallivm->builder);
+
+   LLVMDisposeBuilder(gallivm->builder);
+   gallivm->builder = saved_builder;
+
+   gallivm_verify_function(gallivm, function);
+}
+
+
+/*
+ * Emit a call to "<format short name>_update_cache_one_block", generating
+ * that function first if the module lacks it.  ptr_addr, hash_index and
+ * cache become the call arguments; function and call both use fastcc.
+ */
+static void
+update_cached_block(struct gallivm_state *gallivm,
+                    const struct util_format_description *format_desc,
+                    LLVMValueRef ptr_addr,
+                    LLVMValueRef hash_index,
+                    LLVMValueRef cache)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMModuleRef module = gallivm->module;
+   char name[256];
+   LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
+   LLVMTypeRef pi8t = LLVMPointerType(i8t, 0);
+   LLVMValueRef function, inst;
+   LLVMValueRef args[3];
+
+   util_snprintf(name, sizeof name, "%s_update_cache_one_block",
+                 format_desc->short_name);
+   function = LLVMGetNamedFunction(module, name);
+
+   if (!function) {
+      LLVMTypeRef ret_type;
+      LLVMTypeRef arg_types[3];
+      LLVMTypeRef function_type;
+      unsigned arg;
+
+      /* Generate the function prototype. */
+      ret_type = LLVMVoidTypeInContext(gallivm->context);
+      arg_types[0] = pi8t;
+      arg_types[1] = LLVMInt32TypeInContext(gallivm->context);
+      arg_types[2] = LLVMTypeOf(cache); /* XXX: put right type here */
+      function_type = LLVMFunctionType(ret_type, arg_types,
+                                       ARRAY_SIZE(arg_types), 0);
+      function = LLVMAddFunction(module, name, function_type);
+
+      for (arg = 0; arg < ARRAY_SIZE(arg_types); ++arg)
+         if (LLVMGetTypeKind(arg_types[arg]) == LLVMPointerTypeKind)
+            lp_add_function_attr(function, arg + 1, LP_FUNC_ATTR_NOALIAS);
+
+      LLVMSetFunctionCallConv(function, LLVMFastCallConv);
+      LLVMSetVisibility(function, LLVMHiddenVisibility);
+      generate_update_cache_one_block(gallivm, function, format_desc);
+   }
+
+   args[0] = ptr_addr;
+   args[1] = hash_index;
+   args[2] = cache;
+
+   /* tag the call instruction the builder hands back; no block lookup needed */
+   inst = LLVMBuildCall(builder, function, args, ARRAY_SIZE(args), "");
+   LLVMSetInstructionCallConv(inst, LLVMFastCallConv);
+}
+
+/* Cached lookup: per texel, compare the block address with the tag stored in
+ * its direct-mapped cache slot, refill the slot on mismatch, then read the
+ * texel from the cache.  The result is bitcast to a vector of n * 4 i8. */
+static LLVMValueRef
+compressed_fetch_cached(struct gallivm_state *gallivm,
+                        const struct util_format_description *format_desc,
+                        unsigned n,
+                        LLVMValueRef base_ptr,
+                        LLVMValueRef offset,
+                        LLVMValueRef i,
+                        LLVMValueRef j,
+                        LLVMValueRef cache)
+
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   unsigned count, low_bit, log2size;
+   LLVMValueRef color, offset_stored, addr, ptr_addrtrunc, tmp;
+   LLVMValueRef ij_index, hash_index, hash_mask, block_index;
+   LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
+   LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
+   LLVMTypeRef i64t = LLVMInt64TypeInContext(gallivm->context);
+   struct lp_type type;
+   struct lp_build_context bld32;
+   memset(&type, 0, sizeof type);
+   type.width = 32;
+   type.length = n;
+
+   lp_build_context_init(&bld32, gallivm, type);
+
+   /*
+    * Compute the hash. The cache is direct mapped; the hash function could
+    * be better, but it needs to be simple.
+    * Then, per element:
+    *    compare the block address with the tag stored at the hashed slot
+    *    if they differ, decode the block and store it along with its tag
+    *    extract the color from the cache
+    *    assemble the colors
+    */
+
+   low_bit = util_logbase2(format_desc->block.bits / 8);
+   log2size = util_logbase2(LP_BUILD_FORMAT_CACHE_SIZE);
+   addr = LLVMBuildPtrToInt(builder, base_ptr, i64t, "");
+   ptr_addrtrunc = LLVMBuildPtrToInt(builder, base_ptr, i32t, "");
+   ptr_addrtrunc = lp_build_broadcast_scalar(&bld32, ptr_addrtrunc);
+   /* For the hash function, first shift out the unused lowest bits. Then just
+      xor in higher address bits - only the lower 32 bits are used */
+   ptr_addrtrunc = LLVMBuildAdd(builder, offset, ptr_addrtrunc, "");
+   ptr_addrtrunc = LLVMBuildLShr(builder, ptr_addrtrunc,
+                                 lp_build_const_int_vec(gallivm, type, low_bit), "");
+   /* This only really makes sense for size 64,128,256 */
+   hash_index = ptr_addrtrunc;
+   ptr_addrtrunc = LLVMBuildLShr(builder, ptr_addrtrunc,
+                                 lp_build_const_int_vec(gallivm, type, 2*log2size), "");
+   hash_index = LLVMBuildXor(builder, ptr_addrtrunc, hash_index, "");
+   tmp = LLVMBuildLShr(builder, hash_index,
+                       lp_build_const_int_vec(gallivm, type, log2size), "");
+   hash_index = LLVMBuildXor(builder, hash_index, tmp, "");
+   /* mask with cache size - 1, then block_index = slot * 16 + i * 4 + j */
+   hash_mask = lp_build_const_int_vec(gallivm, type, LP_BUILD_FORMAT_CACHE_SIZE - 1);
+   hash_index = LLVMBuildAnd(builder, hash_index, hash_mask, "");
+   ij_index = LLVMBuildShl(builder, i, lp_build_const_int_vec(gallivm, type, 2), "");
+   ij_index = LLVMBuildAdd(builder, ij_index, j, "");
+   block_index = LLVMBuildShl(builder, hash_index,
+                              lp_build_const_int_vec(gallivm, type, 4), "");
+   block_index = LLVMBuildAdd(builder, ij_index, block_index, "");
+   /* n > 1: walk the lanes one at a time; n == 1: use the values directly */
+   if (n > 1) {
+      color = bld32.undef;
+      for (count = 0; count < n; count++) {
+         LLVMValueRef index, cond, colorx;
+         LLVMValueRef block_indexx, hash_indexx, addrx, offsetx, ptr_addrx;
+         struct lp_build_if_state if_ctx;
+
+         index = lp_build_const_int32(gallivm, count);
+         offsetx = LLVMBuildExtractElement(builder, offset, index, "");
+         addrx = LLVMBuildZExt(builder, offsetx, i64t, "");
+         addrx = LLVMBuildAdd(builder, addrx, addr, "");
+         block_indexx = LLVMBuildExtractElement(builder, block_index, index, "");
+         hash_indexx = LLVMBuildLShr(builder, block_indexx,
+                                     lp_build_const_int32(gallivm, 4), "");
+         offset_stored = s3tc_lookup_tag_data(gallivm, cache, hash_indexx);
+         cond = LLVMBuildICmp(builder, LLVMIntNE, offset_stored, addrx, "");
+
+         lp_build_if(&if_ctx, gallivm, cond);
+         {
+            ptr_addrx = LLVMBuildIntToPtr(builder, addrx,
+                                          LLVMPointerType(i8t, 0), "");
+            update_cached_block(gallivm, format_desc, ptr_addrx, hash_indexx, cache);
+#if LP_BUILD_FORMAT_CACHE_DEBUG
+            s3tc_update_cache_access(gallivm, cache, 1,
+                                     LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);
+#endif
+         }
+         lp_build_endif(&if_ctx);
+
+         colorx = s3tc_lookup_cached_pixel(gallivm, cache, block_indexx);
+
+         color = LLVMBuildInsertElement(builder, color, colorx,
+                                        lp_build_const_int32(gallivm, count), "");
+      }
+   }
+   else {
+      LLVMValueRef cond;
+      struct lp_build_if_state if_ctx;
+
+      tmp = LLVMBuildZExt(builder, offset, i64t, "");
+      addr = LLVMBuildAdd(builder, tmp, addr, "");
+      offset_stored = s3tc_lookup_tag_data(gallivm, cache, hash_index);
+      cond = LLVMBuildICmp(builder, LLVMIntNE, offset_stored, addr, "");
+
+      lp_build_if(&if_ctx, gallivm, cond);
+      {
+         tmp = LLVMBuildIntToPtr(builder, addr, LLVMPointerType(i8t, 0), "");
+         update_cached_block(gallivm, format_desc, tmp, hash_index, cache);
+#if LP_BUILD_FORMAT_CACHE_DEBUG
+         s3tc_update_cache_access(gallivm, cache, 1,
+                                  LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);
+#endif
+      }
+      lp_build_endif(&if_ctx);
+
+      color = s3tc_lookup_cached_pixel(gallivm, cache, block_index);
+   }
+#if LP_BUILD_FORMAT_CACHE_DEBUG
+   s3tc_update_cache_access(gallivm, cache, n,
+                            LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL);
+#endif
+   return LLVMBuildBitCast(builder, color, LLVMVectorType(i8t, n * 4), "");
+}
+
+
+/* Thin wrapper: passes all arguments on to s3tc_dxt5_full_to_rgba_aos()
+ * unchanged and returns whatever that returns. */
+static LLVMValueRef
+s3tc_dxt5_to_rgba_aos(struct gallivm_state *gallivm,
+                      unsigned n, enum pipe_format format,
+                      LLVMValueRef colors, LLVMValueRef codewords,
+                      LLVMValueRef alpha_lo, LLVMValueRef alpha_hi,
+                      LLVMValueRef i, LLVMValueRef j)
+{
+   LLVMValueRef texels =
+      s3tc_dxt5_full_to_rgba_aos(gallivm, n, format, colors, codewords,
+                                 alpha_lo, alpha_hi, i, j);
+   return texels;
+}
+
+
+/*
+ * Gather the S3TC block data for n texels and decode them to AoS RGBA.
+ *
+ * Shared by both paths of lp_build_fetch_s3tc_rgba_aos() so the per-format
+ * dispatch exists only once.  Picks the dxt1/dxt3/dxt5 decoder matching
+ * format_desc->format and returns its result.  An unknown format asserts;
+ * in builds without asserts it yields an undefined <4*n x i8> vector, i.e.
+ * a value of the same size as a real result, so that callers which bitcast
+ * the result keep producing valid IR.
+ */
+static LLVMValueRef
+s3tc_gather_and_decode_aos(struct gallivm_state *gallivm,
+                           const struct util_format_description *format_desc,
+                           unsigned n,
+                           LLVMValueRef base_ptr,
+                           LLVMValueRef offset,
+                           LLVMValueRef i,
+                           LLVMValueRef j)
+{
+   LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
+   LLVMValueRef colors, codewords, alpha_lo, alpha_hi;
+
+   lp_build_gather_s3tc(gallivm, n, format_desc, &colors, &codewords,
+                        &alpha_lo, &alpha_hi, base_ptr, offset);
+
+   switch (format_desc->format) {
+   case PIPE_FORMAT_DXT1_RGB:
+   case PIPE_FORMAT_DXT1_RGBA:
+   case PIPE_FORMAT_DXT1_SRGB:
+   case PIPE_FORMAT_DXT1_SRGBA:
+      return s3tc_dxt1_to_rgba_aos(gallivm, n, format_desc->format,
+                                   colors, codewords, i, j);
+   case PIPE_FORMAT_DXT3_RGBA:
+   case PIPE_FORMAT_DXT3_SRGBA:
+      return s3tc_dxt3_to_rgba_aos(gallivm, n, format_desc->format, colors,
+                                   codewords, alpha_lo, alpha_hi, i, j);
+   case PIPE_FORMAT_DXT5_RGBA:
+   case PIPE_FORMAT_DXT5_SRGBA:
+      return s3tc_dxt5_to_rgba_aos(gallivm, n, format_desc->format, colors,
+                                   codewords, alpha_lo, alpha_hi, i, j);
+   default:
+      assert(0);
+      return LLVMGetUndef(LLVMVectorType(i8t, 4 * n));
+   }
+}
+
+
+/**
+ * Build code fetching n texels from S3TC (DXT1/3/5) compressed data.
+ *
+ * @param format_desc  format description, must have S3TC layout and 4x4 blocks
+ * @param n  number of pixels processed (usually n=4, but it should also work with n=1
+ *           and multiples of 4)
+ * @param base_ptr  base pointer (32bit or 64bit pointer depending on the architecture)
+ * @param offset <n x i32> vector with the relative offsets of the S3TC blocks
+ * @param i  is a <n x i32> vector with the x subpixel coordinate (0..3)
+ * @param j  is a <n x i32> vector with the y subpixel coordinate (0..3)
+ * @param cache  if non-NULL, texels are fetched through the block cache
+ *               (compressed_fetch_cached()); otherwise they are decoded directly
+ * @return  a <4*n x i8> vector with the pixel RGBA values in AoS
+ */
+LLVMValueRef
+lp_build_fetch_s3tc_rgba_aos(struct gallivm_state *gallivm,
+                             const struct util_format_description *format_desc,
+                             unsigned n,
+                             LLVMValueRef base_ptr,
+                             LLVMValueRef offset,
+                             LLVMValueRef i,
+                             LLVMValueRef j,
+                             LLVMValueRef cache)
+{
+   LLVMValueRef rgba;
+   LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
+   LLVMBuilderRef builder = gallivm->builder;
+
+   assert(format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC);
+   assert(format_desc->block.width == 4);
+   assert(format_desc->block.height == 4);
+
+   assert((n == 1) || (n % 4 == 0));
+
+   /* with a cache, fetch through the decoded-block cache instead */
+   if (cache) {
+      return compressed_fetch_cached(gallivm, format_desc, n,
+                                     base_ptr, offset, i, j, cache);
+   }
+
+   /*
+    * More than 4 texels: decode in groups of 4 and concatenate the results.
+    */
+   if (n > 4) {
+      unsigned count;
+      LLVMTypeRef i8_vectype = LLVMVectorType(i8t, 4 * n);
+      LLVMTypeRef i324_vectype = LLVMVectorType(LLVMInt32TypeInContext(
+                                                gallivm->context), 4);
+      LLVMValueRef offset4, i4, j4, rgba4[LP_MAX_VECTOR_LENGTH/16];
+      struct lp_type lp_324_vectype = lp_type_uint_vec(32, 128);
+
+      assert(n / 4 <= ARRAY_SIZE(rgba4));
+
+      for (count = 0; count < n / 4; count++) {
+         i4 = lp_build_extract_range(gallivm, i, count * 4, 4);
+         j4 = lp_build_extract_range(gallivm, j, count * 4, 4);
+         offset4 = lp_build_extract_range(gallivm, offset, count * 4, 4);
+
+         rgba4[count] = s3tc_gather_and_decode_aos(gallivm, format_desc, 4,
+                                                   base_ptr, offset4, i4, j4);
+
+         /* shuffles typically give best results with dword elements...*/
+         rgba4[count] = LLVMBuildBitCast(builder, rgba4[count], i324_vectype, "");
+      }
+      rgba = lp_build_concat(gallivm, rgba4, lp_324_vectype, n / 4);
+      rgba = LLVMBuildBitCast(builder, rgba, i8_vectype, "");
+   }
+   else {
+      /* up to 4 texels: gather and decode them in a single step */
+      rgba = s3tc_gather_and_decode_aos(gallivm, format_desc, n,
+                                        base_ptr, offset, i, j);
+   }
+
+   /* always return just decompressed values - srgb conversion is done later */
+
+   return rgba;
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index 018cca8f9df..a6662c5e01b 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -3549,10 +3549,6 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm,
       const struct util_format_description *format_desc;
       format_desc = util_format_description(static_texture_state->format);
       if (format_desc && format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
-         /*
-          * This is not 100% correct, if we have cache but the
-          * util_format_s3tc_prefer is true the cache won't get used
-          * regardless (could hook up the block decode there...) */
          need_cache = TRUE;
       }
    }
diff --git a/src/gallium/auxiliary/meson.build b/src/gallium/auxiliary/meson.build
index a4dbcf7b4ca..57f7e69050f 100644
--- a/src/gallium/auxiliary/meson.build
+++ b/src/gallium/auxiliary/meson.build
@@ -389,8 +389,8 @@ if with_llvm
     'gallivm/lp_bld_flow.h',
     'gallivm/lp_bld_format_aos_array.c',
     'gallivm/lp_bld_format_aos.c',
-    'gallivm/lp_bld_format_cached.c',
     'gallivm/lp_bld_format_float.c',
+    'gallivm/lp_bld_format_s3tc.c',
     'gallivm/lp_bld_format.c',
     'gallivm/lp_bld_format.h',
     'gallivm/lp_bld_format_soa.c',