From b50e362dbbced42c93fb28a420977dfecd38c823 Mon Sep 17 00:00:00 2001
From: Roland Scheidegger <sroland@vmware.com>
Date: Sat, 23 Mar 2013 02:05:54 +0100
Subject: [PATCH] gallivm: Add code for rgb9e5 shared exponent format to float
 conversion

And use this (and the code for r11g11b10 packed float to float conversion)
in the soa texturing code (the generated code looks quite good).
Should be an order of magnitude faster probably than using the fallback
(not measured).
Tested with piglit texwrap GL_EXT_packed_float and
GL_EXT_texture_shared_exponent respectively (didn't find much else using
it).

Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
---
 src/gallium/auxiliary/gallivm/lp_bld_conv.c   | 86 +++++++++++++++++++
 src/gallium/auxiliary/gallivm/lp_bld_conv.h   |  5 ++
 .../auxiliary/gallivm/lp_bld_format_soa.c     | 30 ++++++-
 3 files changed, 118 insertions(+), 3 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
index 053f4132080..2f39abc63d8 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@@ -397,6 +397,92 @@ lp_build_r11g11b10_to_float(struct gallivm_state *gallivm,
 }
 
 
+static LLVMValueRef
+lp_build_rgb9_to_float_helper(struct gallivm_state *gallivm,
+                              struct lp_type f32_type,
+                              LLVMValueRef src,
+                              LLVMValueRef scale,
+                              unsigned mantissa_start)
+{
+   LLVMValueRef shift, mask;
+
+   struct lp_type i32_type = lp_type_int_vec(32, 32 * f32_type.length);
+   struct lp_build_context i32_bld, f32_bld;
+
+   lp_build_context_init(&i32_bld, gallivm, i32_type);
+   lp_build_context_init(&f32_bld, gallivm, f32_type);
+
+   /*
+    * This is much easier as other weirdo float formats, since
+    * there's no sign, no Inf/NaN, and there's nothing special
+    * required for normals/denormals neither (as without the implied one
+    * for the mantissa for other formats, everything looks like a denormal).
+    * So just do (float)comp_bits * scale
+    */
+   shift = lp_build_const_int_vec(gallivm, i32_type, mantissa_start);
+   mask = lp_build_const_int_vec(gallivm, i32_type, 0x1ff);
+   src = lp_build_shr(&i32_bld, src, shift);
+   src = lp_build_and(&i32_bld, src, mask);
+   src = lp_build_int_to_float(&f32_bld, src);
+   return lp_build_mul(&f32_bld, src, scale);
+}
+
+
+/**
+ * Convert shared exponent format (rgb9e5) value(s) to rgba float SoA values.
+ *
+ * @param src   packed AoS rgb9e5 values (as (vector) int32)
+ * @param dst   pointer to the SoA result values
+ */
+void
+lp_build_rgb9e5_to_float(struct gallivm_state *gallivm,
+                         LLVMValueRef src,
+                         LLVMValueRef *dst)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMTypeRef src_type = LLVMTypeOf(src);
+   LLVMValueRef shift, scale, bias, exp;
+   unsigned src_length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ?
+                            LLVMGetVectorSize(src_type) : 1;
+   struct lp_type i32_type = lp_type_int_vec(32, 32 * src_length);
+   struct lp_type u32_type = lp_type_uint_vec(32, 32 * src_length);
+   struct lp_type f32_type = lp_type_float_vec(32, 32 * src_length);
+   struct lp_build_context i32_bld, u32_bld, f32_bld;
+
+   lp_build_context_init(&i32_bld, gallivm, i32_type);
+   lp_build_context_init(&u32_bld, gallivm, u32_type);
+   lp_build_context_init(&f32_bld, gallivm, f32_type);
+
+   /* extract exponent */
+   shift = lp_build_const_int_vec(gallivm, i32_type, 27);
+   /* this shift needs to be unsigned otherwise need mask */
+   exp = lp_build_shr(&u32_bld, src, shift);
+
+   /*
+    * scale factor is 2 ^ (exp - bias)
+    * (and additionally corrected here for the mantissa bits)
+    * not using shift because
+    * a) don't have vector shift in a lot of cases
+    * b) shift direction changes hence need 2 shifts + conditional
+    *    (or rotate instruction which is even more rare (for instance XOP))
+    * so use whacky float 2 ^ function instead manipulating exponent
+    * (saves us the float conversion at the end too)
+    */
+   bias = lp_build_const_int_vec(gallivm, i32_type, 127 - (15 + 9));
+   scale = lp_build_add(&i32_bld, exp, bias);
+   shift = lp_build_const_int_vec(gallivm, i32_type, 23);
+   scale = lp_build_shl(&i32_bld, scale, shift);
+   scale = LLVMBuildBitCast(builder, scale, f32_bld.vec_type, "");
+
+   dst[0] = lp_build_rgb9_to_float_helper(gallivm, f32_type, src, scale, 0);
+   dst[1] = lp_build_rgb9_to_float_helper(gallivm, f32_type, src, scale, 9);
+   dst[2] = lp_build_rgb9_to_float_helper(gallivm, f32_type, src, scale, 18);
+
+   /* Just set alpha to one */
+   dst[3] = f32_bld.one;
+}
+
+
 /**
  * Converts int16 half-float to float32
  * Note this can be performed in 1 instruction if vcvtph2ps exists (sse5 i think?)
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.h b/src/gallium/auxiliary/gallivm/lp_bld_conv.h
index 5bd6f4f1d75..d8bc294bce0 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.h
@@ -70,6 +70,11 @@ lp_build_r11g11b10_to_float(struct gallivm_state *gallivm,
                             LLVMValueRef src,
                             LLVMValueRef *dst);
 
+void
+lp_build_rgb9e5_to_float(struct gallivm_state *gallivm,
+                         LLVMValueRef src,
+                         LLVMValueRef *dst);
+
 LLVMValueRef
 lp_build_clamped_float_to_unsigned_norm(struct gallivm_state *gallivm,
                                         struct lp_type src_type,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
index 9eb6ef5438d..54ca61a0c38 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
@@ -310,9 +310,10 @@ lp_build_rgba8_to_fi32_soa(struct gallivm_state *gallivm,
  * \param type  the desired return type for 'rgba'.  The vector length
  *              is the number of texels to fetch
  *
- * \param base_ptr  points to start of the texture image block.  For non-
- *                  compressed formats, this simply points to the texel.
- *                  For compressed formats, it points to the start of the
+ * \param base_ptr  points to the base of the texture mip tree.
+ * \param offset    offset to start of the texture image block.  For non-
+ *                  compressed formats, this simply is an offset to the texel.
+ *                  For compressed formats, it is an offset to the start of the
  *                  compressed data block.
  *
  * \param i, j  the sub-block pixel coordinates.  For non-compressed formats
@@ -368,6 +369,29 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
       return;
    }
 
+   if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT ||
+       format_desc->format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
+      /*
+       * similar conceptually to above but requiring special
+       * AoS packed -> SoA float conversion code.
+       */
+      LLVMValueRef packed;
+
+      assert(type.floating);
+      assert(type.width == 32);
+
+      packed = lp_build_gather(gallivm, type.length,
+                               format_desc->block.bits,
+                               type.width, base_ptr, offset);
+      if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT) {
+         lp_build_r11g11b10_to_float(gallivm, packed, rgba_out);
+      }
+      else {
+         lp_build_rgb9e5_to_float(gallivm, packed, rgba_out);
+      }
+      return;
+   }
+
    /*
     * Try calling lp_build_fetch_rgba_aos for all pixels.
     */