From 61a8a55f557784c8ec17fb1758775c6f18252201 Mon Sep 17 00:00:00 2001
From: Antia Puentes <apuentes@igalia.com>
Date: Fri, 21 Oct 2016 11:40:11 +0200
Subject: [PATCH] i965/gen8: Fix vertex attrib upload for dvec3/4 shader inputs

The emission of vertex attributes corresponding to dvec3 and dvec4
vertex shader input variables was not correct when the <size> passed
to the VertexAttribL* commands was <= 2.

This was because we were using the vertex array size when emitting vertices
to decide if we uploaded a 64-bit floating point attribute as 1 slot (128 bits)
for sizes 1 and 2, or 2 slots (256 bits) for sizes 3 and 4. This caused problems
when mapping the input variables to registers: to decide which registers
contain the values uploaded for a certain variable, we use the size and type
given to the variable in the shader. We would therefore assign 256 bits to
dvec3/4 variables even if we had only uploaded 128 bits for them, which
happened when the vertex array size was <= 2.

The patch uses the shader information to emit as 128 bits only those 64-bit
floating point variables that were declared as double or dvec2 in the vertex
shader. Dvec3 and dvec4 variables will always be uploaded as 256 bits,
independently of the <size> given to the VertexAttribL* command.

From the ARB_vertex_attrib_64bit specification:

   "For the 64-bit double precision types listed in Table X.1, no default
    attribute values are provided if the values of the vertex attribute variable
    are specified with fewer components than required for the attribute
    variable. For example, the fourth component of a variable of type dvec4
    will be undefined if specified using VertexAttribL3dv or using a vertex
    array specified with VertexAttribLPointer and a size of three."

We are filling these unspecified components with zeros, which coincidentally is
also what the GL44-CTS.vertex_attrib_binding.basic-inputL-case1 test expects.

v2: Do not use bitcount (Kenneth Graunke)

Fixes: GL44-CTS.vertex_attrib_binding.basic-inputL-case1 test

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=97287
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_compiler.h     |  1 +
 src/mesa/drivers/dri/i965/brw_context.h      |  2 +-
 src/mesa/drivers/dri/i965/brw_draw_upload.c  |  3 +-
 src/mesa/drivers/dri/i965/brw_vs.c           |  1 +
 src/mesa/drivers/dri/i965/gen8_draw_upload.c | 35 +++++++++-----------
 5 files changed, 20 insertions(+), 22 deletions(-)
diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h b/src/mesa/drivers/dri/i965/brw_compiler.h
index 819c7d604e1..c2400f99352 100644
--- a/src/mesa/drivers/dri/i965/brw_compiler.h
+++ b/src/mesa/drivers/dri/i965/brw_compiler.h
@@ -641,6 +641,7 @@ struct brw_vs_prog_data {
    struct brw_vue_prog_data base;
 
    GLbitfield64 inputs_read;
+   GLbitfield64 double_inputs_read;
 
    unsigned nr_attributes;
    unsigned nr_attribute_slots;
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 308ba99a318..310372ac82f 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -535,7 +535,7 @@ struct brw_vertex_element {
    const struct gl_vertex_array *glarray;
 
    int buffer;
-
+   bool is_dual_slot;
    /** Offset of the first element within the buffer object */
    unsigned int offset;
 };
diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c b/src/mesa/drivers/dri/i965/brw_draw_upload.c
index da13e7acc74..ab2fc505c79 100644
--- a/src/mesa/drivers/dri/i965/brw_draw_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c
@@ -472,7 +472,8 @@ brw_prepare_vertices(struct brw_context *brw)
    while (vs_inputs) {
       GLuint index = ffsll(vs_inputs) - 1;
       struct brw_vertex_element *input = &brw->vb.inputs[index];
-
+      input->is_dual_slot = brw->gen >= 8 &&
+         (vs_prog_data->double_inputs_read & BITFIELD64_BIT(index)) != 0;
       vs_inputs &= ~BITFIELD64_BIT(index);
       brw->vb.enabled[brw->vb.nr_enabled++] = input;
    }
diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c
index 842c5165c8e..02a88ca0988 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.c
+++ b/src/mesa/drivers/dri/i965/brw_vs.c
@@ -151,6 +151,7 @@ brw_codegen_vs_prog(struct brw_context *brw,
    uint64_t outputs_written =
       brw_vs_outputs_written(brw, key, vp->program.info.outputs_written);
    prog_data.inputs_read = vp->program.info.inputs_read;
+   prog_data.double_inputs_read = vp->program.info.double_inputs_read;
 
    if (key->copy_edgeflag) {
       prog_data.inputs_read |= VERT_BIT_EDGEFLAG;
diff --git a/src/mesa/drivers/dri/i965/gen8_draw_upload.c b/src/mesa/drivers/dri/i965/gen8_draw_upload.c
index 23c75879458..69ba8e923e7 100644
--- a/src/mesa/drivers/dri/i965/gen8_draw_upload.c
+++ b/src/mesa/drivers/dri/i965/gen8_draw_upload.c
@@ -230,8 +230,15 @@ gen8_emit_vertices(struct brw_context *brw)
       case 0: comp0 = BRW_VE1_COMPONENT_STORE_0;
       case 1: comp1 = BRW_VE1_COMPONENT_STORE_0;
       case 2: comp2 = BRW_VE1_COMPONENT_STORE_0;
-      case 3: comp3 = input->glarray->Integer ? BRW_VE1_COMPONENT_STORE_1_INT
-                                              : BRW_VE1_COMPONENT_STORE_1_FLT;
+      case 3:
+         if (input->glarray->Doubles) {
+            comp3 = BRW_VE1_COMPONENT_STORE_0;
+         } else if (input->glarray->Integer) {
+            comp3 = BRW_VE1_COMPONENT_STORE_1_INT;
+         } else {
+            comp3 = BRW_VE1_COMPONENT_STORE_1_FLT;
+         }
+
          break;
       }
 
@@ -250,24 +257,12 @@ gen8_emit_vertices(struct brw_context *brw)
        *     to be specified as VFCOMP_STORE_0 in order to output a 256-bit vertex
        *     element."
        */
-      if (input->glarray->Doubles) {
-         switch (input->glarray->Size) {
-         case 0:
-         case 1:
-         case 2:
-            /*  Use 128-bits instead of 256-bits to write double and dvec2
-             *  vertex elements.
-             */
-            comp2 = BRW_VE1_COMPONENT_NOSTORE;
-            comp3 = BRW_VE1_COMPONENT_NOSTORE;
-            break;
-         case 3:
-            /* Pad the output using VFCOMP_STORE_0 as suggested
-             * by the BDW PRM.
-             */
-            comp3 = BRW_VE1_COMPONENT_STORE_0;
-            break;
-         }
+      if (input->glarray->Doubles && !input->is_dual_slot) {
+         /* Store vertex elements which correspond to double and dvec2 vertex
+          * shader inputs as 128-bit vertex elements, instead of 256-bits.
+          */
+         comp2 = BRW_VE1_COMPONENT_NOSTORE;
+         comp3 = BRW_VE1_COMPONENT_NOSTORE;
       }
 
       OUT_BATCH((input->buffer << GEN6_VE0_INDEX_SHIFT) |