i965/tiled_memcpy: Rework the RGBA -> BGRA mem_copy functions

This splits the two copy functions into three: one for unaligned copies,
one for aligned sources, and one for aligned destinations. Thanks to the
previous commit, we are now guaranteed that the aligned ones will *only*
operate on aligned memory, so they should be safe.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=93962
Cc: "11.1 11.2" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Matt Turner <mattst88@gmail.com>
Reviewed-by: Roland Scheidegger <sroland@vmware.com>
Reviewed-by: Chad Versace <chad.versace@intel.com>
commit d2b32656e1
parent f6f54a29ca
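For reference, the new rgba8_copy() is just the scalar per-pixel swizzle shown below; the aligned variants added by this patch do the bulk of the work 16 bytes at a time and fall back to it for any remaining tail. This is a minimal standalone sketch: the loop body comes straight from the patch, while rgba8_copy_scalar, main, and the sample pixel values are illustrative and not part of the commit.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Scalar RGBA -> BGRA swizzle, mirroring the rgba8_copy() introduced by
 * this patch: swap bytes 0 (R) and 2 (B) of each 4-byte pixel, leaving
 * G and A in place. */
static void *
rgba8_copy_scalar(void *dst, const void *src, size_t bytes)
{
   uint8_t *d = dst;
   const uint8_t *s = src;

   assert(bytes % 4 == 0);

   while (bytes >= 4) {
      d[0] = s[2];
      d[1] = s[1];
      d[2] = s[0];
      d[3] = s[3];
      d += 4;
      s += 4;
      bytes -= 4;
   }

   return dst;
}

int main(void)
{
   /* Two RGBA pixels (memory byte order R,G,B,A). */
   const uint8_t rgba[8] = { 0x11, 0x22, 0x33, 0x44, 0xaa, 0xbb, 0xcc, 0xdd };
   uint8_t bgra[8];

   rgba8_copy_scalar(bgra, rgba, sizeof(rgba));

   /* R and B swapped per pixel: 33 22 11 44 cc bb aa dd */
   for (size_t i = 0; i < sizeof(bgra); i++)
      printf("%02x ", bgra[i]);
   printf("\n");
   return 0;
}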
@@ -56,48 +56,16 @@ static const uint32_t ytile_width = 128;
 static const uint32_t ytile_height = 32;
 static const uint32_t ytile_span = 16;
 
-#ifdef __SSSE3__
-static const uint8_t rgba8_permutation[16] =
-   { 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 };
-
-/* NOTE: dst must be 16-byte aligned. src may be unaligned. */
-#define rgba8_copy_16_aligned_dst(dst, src)                            \
-   _mm_store_si128((__m128i *)(dst),                                   \
-                   _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(src)), \
-                                    *(__m128i *) rgba8_permutation))
-
-/* NOTE: src must be 16-byte aligned. dst may be unaligned. */
-#define rgba8_copy_16_aligned_src(dst, src)                            \
-   _mm_storeu_si128((__m128i *)(dst),                                  \
-                    _mm_shuffle_epi8(_mm_load_si128((__m128i *)(src)), \
-                                     *(__m128i *) rgba8_permutation))
-#endif
-
 /**
- * Copy RGBA to BGRA - swap R and B, with the destination 16-byte aligned.
+ * Copy RGBA to BGRA - swap R and B.
  */
 static inline void *
-rgba8_copy_aligned_dst(void *dst, const void *src, size_t bytes)
+rgba8_copy(void *dst, const void *src, size_t bytes)
 {
    uint8_t *d = dst;
    uint8_t const *s = src;
 
-#ifdef __SSSE3__
-   if (bytes == 16) {
-      assert(!(((uintptr_t)dst) & 0xf));
-      rgba8_copy_16_aligned_dst(d+ 0, s+ 0);
-      return dst;
-   }
-
-   if (bytes == 64) {
-      assert(!(((uintptr_t)dst) & 0xf));
-      rgba8_copy_16_aligned_dst(d+ 0, s+ 0);
-      rgba8_copy_16_aligned_dst(d+16, s+16);
-      rgba8_copy_16_aligned_dst(d+32, s+32);
-      rgba8_copy_16_aligned_dst(d+48, s+48);
-      return dst;
-   }
-#endif
+   assert(bytes % 4 == 0);
 
    while (bytes >= 4) {
       d[0] = s[2];
@@ -111,6 +79,38 @@ rgba8_copy_aligned_dst(void *dst, const void *src, size_t bytes)
    return dst;
 }
 
+#ifdef __SSSE3__
+static const uint8_t rgba8_permutation[16] =
+   { 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 };
+#endif
+
+/**
+ * Copy RGBA to BGRA - swap R and B, with the destination 16-byte aligned.
+ */
+static inline void *
+rgba8_copy_aligned_dst(void *dst, const void *src, size_t bytes)
+{
+   uint8_t *d = dst;
+   uint8_t const *s = src;
+
+   assert(bytes == 0 || !(((uintptr_t)dst) & 0xf));
+
+#ifdef __SSSE3__
+   while (bytes >= 16) {
+      _mm_store_si128((__m128i *)d,
+                      _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)s),
+                                       *(__m128i *) rgba8_permutation));
+      s += 16;
+      d += 16;
+      bytes -= 16;
+   }
+#endif
+
+   rgba8_copy(d, s, bytes);
+
+   return dst;
+}
+
 /**
  * Copy RGBA to BGRA - swap R and B, with the source 16-byte aligned.
  */
@@ -120,32 +120,21 @@ rgba8_copy_aligned_src(void *dst, const void *src, size_t bytes)
    uint8_t *d = dst;
    uint8_t const *s = src;
 
-#ifdef __SSSE3__
-   if (bytes == 16) {
-      assert(!(((uintptr_t)src) & 0xf));
-      rgba8_copy_16_aligned_src(d+ 0, s+ 0);
-      return dst;
-   }
+   assert(bytes == 0 || !(((uintptr_t)src) & 0xf));
 
-   if (bytes == 64) {
-      assert(!(((uintptr_t)src) & 0xf));
-      rgba8_copy_16_aligned_src(d+ 0, s+ 0);
-      rgba8_copy_16_aligned_src(d+16, s+16);
-      rgba8_copy_16_aligned_src(d+32, s+32);
-      rgba8_copy_16_aligned_src(d+48, s+48);
-      return dst;
+#ifdef __SSSE3__
+   while (bytes >= 16) {
+      _mm_storeu_si128((__m128i *)d,
+                       _mm_shuffle_epi8(_mm_load_si128((__m128i *)s),
+                                        *(__m128i *) rgba8_permutation));
+      s += 16;
+      d += 16;
+      bytes -= 16;
    }
 #endif
 
-   while (bytes >= 4) {
-      d[0] = s[2];
-      d[1] = s[1];
-      d[2] = s[0];
-      d[3] = s[3];
-      d += 4;
-      s += 4;
-      bytes -= 4;
-   }
+   rgba8_copy(d, s, bytes);
 
    return dst;
 }
@@ -404,10 +393,10 @@ linear_to_xtiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
       if (mem_copy == memcpy)
         return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                 dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
-      else if (mem_copy == rgba8_copy_aligned_dst)
+      else if (mem_copy == rgba8_copy)
         return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                 dst, src, src_pitch, swizzle_bit,
-                                rgba8_copy_aligned_dst, rgba8_copy_aligned_dst);
+                                rgba8_copy, rgba8_copy_aligned_dst);
       else
         unreachable("not reached");
    } else {
@@ -415,10 +404,10 @@ linear_to_xtiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
        return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
                                dst, src, src_pitch, swizzle_bit,
                                memcpy, memcpy);
-      else if (mem_copy == rgba8_copy_aligned_dst)
+      else if (mem_copy == rgba8_copy)
        return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
                                dst, src, src_pitch, swizzle_bit,
-                               rgba8_copy_aligned_dst, rgba8_copy_aligned_dst);
+                               rgba8_copy, rgba8_copy_aligned_dst);
       else
        unreachable("not reached");
    }
@@ -447,20 +436,20 @@ linear_to_ytiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
       if (mem_copy == memcpy)
        return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
-      else if (mem_copy == rgba8_copy_aligned_dst)
+      else if (mem_copy == rgba8_copy)
        return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                dst, src, src_pitch, swizzle_bit,
-                               rgba8_copy_aligned_dst, rgba8_copy_aligned_dst);
+                               rgba8_copy, rgba8_copy_aligned_dst);
       else
        unreachable("not reached");
    } else {
       if (mem_copy == memcpy)
        return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
                                dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
-      else if (mem_copy == rgba8_copy_aligned_dst)
+      else if (mem_copy == rgba8_copy)
        return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
                                dst, src, src_pitch, swizzle_bit,
-                               rgba8_copy_aligned_dst, rgba8_copy_aligned_dst);
+                               rgba8_copy, rgba8_copy_aligned_dst);
       else
        unreachable("not reached");
    }
@@ -489,20 +478,20 @@ xtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
       if (mem_copy == memcpy)
        return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
-      else if (mem_copy == rgba8_copy_aligned_src)
+      else if (mem_copy == rgba8_copy)
        return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                dst, src, dst_pitch, swizzle_bit,
-                               rgba8_copy_aligned_src, rgba8_copy_aligned_src);
+                               rgba8_copy, rgba8_copy_aligned_src);
       else
        unreachable("not reached");
    } else {
       if (mem_copy == memcpy)
        return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
                                dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
-      else if (mem_copy == rgba8_copy_aligned_src)
+      else if (mem_copy == rgba8_copy)
        return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
                                dst, src, dst_pitch, swizzle_bit,
-                               rgba8_copy_aligned_src, rgba8_copy_aligned_src);
+                               rgba8_copy, rgba8_copy_aligned_src);
       else
        unreachable("not reached");
    }
@@ -531,20 +520,20 @@ ytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
       if (mem_copy == memcpy)
        return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
-      else if (mem_copy == rgba8_copy_aligned_src)
+      else if (mem_copy == rgba8_copy)
        return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                dst, src, dst_pitch, swizzle_bit,
-                               rgba8_copy_aligned_src, rgba8_copy_aligned_src);
+                               rgba8_copy, rgba8_copy_aligned_src);
       else
        unreachable("not reached");
    } else {
       if (mem_copy == memcpy)
        return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
                                dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
-      else if (mem_copy == rgba8_copy_aligned_src)
+      else if (mem_copy == rgba8_copy)
        return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
                                dst, src, dst_pitch, swizzle_bit,
-                               rgba8_copy_aligned_src, rgba8_copy_aligned_src);
+                               rgba8_copy, rgba8_copy_aligned_src);
       else
        unreachable("not reached");
    }
@@ -775,8 +764,7 @@ bool intel_get_memcpy(mesa_format tiledFormat, GLenum format,
       if (format == GL_BGRA) {
          *mem_copy = memcpy;
       } else if (format == GL_RGBA) {
-         *mem_copy = direction == INTEL_UPLOAD ? rgba8_copy_aligned_dst
-                                               : rgba8_copy_aligned_src;
+         *mem_copy = rgba8_copy;
       }
    } else if ((tiledFormat == MESA_FORMAT_R8G8B8A8_UNORM) ||
              (tiledFormat == MESA_FORMAT_R8G8B8X8_UNORM) ||
@@ -787,8 +775,7 @@ bool intel_get_memcpy(mesa_format tiledFormat, GLenum format,
          /* Copying from RGBA to BGRA is the same as BGRA to RGBA so we can
           * use the same function.
           */
-         *mem_copy = direction == INTEL_UPLOAD ? rgba8_copy_aligned_dst
-                                               : rgba8_copy_aligned_src;
+         *mem_copy = rgba8_copy;
       } else if (format == GL_RGBA) {
          *mem_copy = memcpy;
       }
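As a sanity check on the SSSE3 path, the sketch below verifies that _mm_shuffle_epi8 with the rgba8_permutation mask performs exactly the same per-pixel R/B swap as the scalar loop. The permutation table and intrinsics match the patch; the test harness is illustrative, and it uses an unaligned mask load (_mm_loadu_si128) instead of the patch's *(__m128i *) cast for simplicity. Build with -mssse3 on GCC/Clang.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <tmmintrin.h>   /* SSSE3: _mm_shuffle_epi8 */

/* The permutation table the patch uses: for each 4-byte pixel, emit
 * byte 2 (B), byte 1 (G), byte 0 (R), byte 3 (A). */
static const uint8_t rgba8_permutation[16] =
   { 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 };

int main(void)
{
   uint8_t src[16], simd[16], scalar[16];

   /* Fill four pixels with distinct byte values. */
   for (int i = 0; i < 16; i++)
      src[i] = (uint8_t)(i * 17);

   /* SIMD path: one pshufb swizzles all four pixels at once. */
   __m128i v = _mm_loadu_si128((const __m128i *)src);
   v = _mm_shuffle_epi8(v, _mm_loadu_si128((const __m128i *)rgba8_permutation));
   _mm_storeu_si128((__m128i *)simd, v);

   /* Scalar reference: the byte-by-byte swap from rgba8_copy(). */
   for (int i = 0; i < 16; i += 4) {
      scalar[i + 0] = src[i + 2];
      scalar[i + 1] = src[i + 1];
      scalar[i + 2] = src[i + 0];
      scalar[i + 3] = src[i + 3];
   }

   assert(memcmp(simd, scalar, 16) == 0);
   printf("SSSE3 shuffle matches scalar swizzle\n");
   return 0;
}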