From 3d2b157e23c9d66df97d59be6efd1098878cc110 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Fri, 5 Jan 2018 18:26:58 -0800
Subject: [PATCH] i965/fs: Use UW types when using V immediates

Gen 10 has a strange hardware bug involving V immediates with W types.
It appears that a mov(8) g2<1>W 0x76543210V will actually result in g2
getting the value {3, 2, 1, 0, 3, 2, 1, 0}.  In particular, the bottom
four nibbles are repeated instead of the top four being taken.  (A mov
of 0x00003210V yields the same result.)  This bug does not appear in any
hardware documentation as far as we can tell and the simulator does not
implement the bug either.

Commit 6132992cdb858268af0e985727d80e4140be389c was mostly a no-op
except that it changed the type of the subgroup invocation from UW to W
and caused us to tickle this bug with basically every compute shader
that uses any sort of invocation ID (which is most of them).  This is
also potentially an issue for geometry shader input pulls and SampleID
setup.  The easy solution is just to change the few places where we use
a vector integer immediate with a W type to use a UW type.

Reviewed-by: Matt Turner <mattst88@gmail.com>
Cc: mesa-stable@lists.freedesktop.org
Fixes: 6132992cdb858268af0e985727d80e4140be389c
---
 src/intel/compiler/brw_fs.cpp     | 6 +++---
 src/intel/compiler/brw_fs_nir.cpp | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 9d0546e5797..09bfc9c552d 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -1256,16 +1256,16 @@ fs_visitor::emit_sampleid_setup()
        * TODO: These payload bits exist on Gen7 too, but they appear to always
        *       be zero, so this code fails to work.  We should find out why.
        */
-      fs_reg tmp(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_W);
+      fs_reg tmp(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UW);
 
       abld.SHR(tmp, fs_reg(stride(retype(brw_vec1_grf(1, 0),
-                                         BRW_REGISTER_TYPE_B), 1, 8, 0)),
+                                         BRW_REGISTER_TYPE_UB), 1, 8, 0)),
                     brw_imm_v(0x44440000));
       abld.AND(*reg, tmp, brw_imm_w(0xf));
    } else {
       const fs_reg t1 = component(fs_reg(VGRF, alloc.allocate(1),
                                          BRW_REGISTER_TYPE_D), 0);
-      const fs_reg t2(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_W);
+      const fs_reg t2(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UW);
 
       /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
        * 8x multisampling, subspan 0 will represent sample N (where N
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index ab132f700a3..0d775649303 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -237,7 +237,7 @@ fs_visitor::nir_emit_system_values()
    {
       const fs_builder abld = bld.annotate("gl_SubgroupInvocation", NULL);
       fs_reg &reg = nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
-      reg = abld.vgrf(BRW_REGISTER_TYPE_W);
+      reg = abld.vgrf(BRW_REGISTER_TYPE_UW);
 
       const fs_builder allbld8 = abld.group(8, 0).exec_all();
       allbld8.MOV(reg, brw_imm_v(0x76543210));
@@ -2134,7 +2134,7 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst,
           * by 32 (shifting by 5), and add the two together.  This is
           * the final indirect byte offset.
           */
-         fs_reg sequence = bld.vgrf(BRW_REGISTER_TYPE_W, 1);
+         fs_reg sequence = bld.vgrf(BRW_REGISTER_TYPE_UW, 1);
          fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
          fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
          fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);