From 08ab13d3400891cd6a0e7d97ff1e292cc927c7c9 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Tue, 17 Sep 2019 11:01:01 +0200
Subject: [PATCH] radv/gfx10: fix storing/loading NGG stream outputs for GS

The GS outputs are stored differently in the LDS storage: they
are indexed by out_idx, which is incremented for each stored DWORD.
Thus, we need a different path for exporting the stream outputs.

This fixes a bunch of CTS failures when NGG GS is force enabled.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
---
 src/amd/vulkan/radv_nir_to_llvm.c | 91 ++++++++++++++++++++++++++-----
 1 file changed, 78 insertions(+), 13 deletions(-)

diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c
index 69a5e0bb9f9..46d9ae97049 100644
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -3175,6 +3175,17 @@ static void build_export_prim(struct radv_shader_context *ctx,
 	ac_build_export(&ctx->ac, &args);
 }
 
+/* Return the first entry of so->outputs whose location matches, else NULL. */
+static struct radv_stream_output *
+radv_get_stream_output_by_loc(struct radv_streamout_info *so, unsigned location)
+{
+	unsigned idx = 0;
+
+	while (idx < so->num_outputs && so->outputs[idx].location != location)
+		idx++;
+	return idx < so->num_outputs ? &so->outputs[idx] : NULL;
+}
+
 static void build_streamout_vertex(struct radv_shader_context *ctx,
 				   LLVMValueRef *so_buffer, LLVMValueRef *wg_offset_dw,
 				   unsigned stream, LLVMValueRef offset_vtx,
@@ -3195,25 +3206,79 @@ static void build_streamout_vertex(struct radv_shader_context *ctx,
 		offset[buffer] = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->ac.i32, 2, false), "");
 	}
 
-	for (unsigned i = 0; i < so->num_outputs; ++i) {
-		struct radv_stream_output *output =
-			&ctx->shader_info->so.outputs[i];
+	if (ctx->stage == MESA_SHADER_GEOMETRY) {
+		struct radv_shader_output_values outputs[AC_LLVM_MAX_OUTPUTS];
+		unsigned noutput = 0;
+		unsigned out_idx = 0;
 
-		if (stream != output->stream)
-			continue;
+		for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
+			unsigned output_usage_mask =
+				ctx->shader_info->gs.output_usage_mask[i];
+			uint8_t output_stream =
+				output_stream = ctx->shader_info->gs.output_streams[i];
 
-		struct radv_shader_output_values out = {};
-
-		for (unsigned comp = 0; comp < 4; comp++) {
-			if (!(output->component_mask & (1 << comp)))
+			if (!(ctx->output_mask & (1ull << i)) ||
+			    output_stream != stream)
 				continue;
 
-			tmp = ac_build_gep0(&ctx->ac, vertexptr,
-					    LLVMConstInt(ctx->ac.i32, 4 * i + comp, false));
-			out.values[comp] = LLVMBuildLoad(builder, tmp, "");
+			outputs[noutput].slot_name = i;
+			outputs[noutput].slot_index = i == VARYING_SLOT_CLIP_DIST1;
+			outputs[noutput].usage_mask = output_usage_mask;
+
+			int length = util_last_bit(output_usage_mask);
+
+			for (unsigned j = 0; j < length; j++, out_idx++) {
+				if (!(output_usage_mask & (1 << j)))
+					continue;
+
+				tmp = ac_build_gep0(&ctx->ac, vertexptr,
+						    LLVMConstInt(ctx->ac.i32, out_idx, false));
+				outputs[noutput].values[j] = LLVMBuildLoad(builder, tmp, "");
+			}
+
+			for (unsigned j = length; j < 4; j++)
+				outputs[noutput].values[j] = LLVMGetUndef(ctx->ac.f32);
+
+			noutput++;
 		}
 
-		radv_emit_stream_output(ctx, so_buffer, offset, output, &out);
+		for (unsigned i = 0; i < noutput; i++) {
+			struct radv_stream_output *output =
+				radv_get_stream_output_by_loc(so, outputs[i].slot_name);
+
+			if (!output ||
+			    output->stream != stream)
+				continue;
+
+			struct radv_shader_output_values out = {};
+
+			for (unsigned j = 0; j < 4; j++) {
+				out.values[j] = outputs[i].values[j];
+			}
+
+			radv_emit_stream_output(ctx, so_buffer, offset, output, &out);
+		}
+	} else {
+		for (unsigned i = 0; i < so->num_outputs; ++i) {
+			struct radv_stream_output *output =
+				&ctx->shader_info->so.outputs[i];
+
+			if (stream != output->stream)
+				continue;
+
+			struct radv_shader_output_values out = {};
+
+			for (unsigned comp = 0; comp < 4; comp++) {
+				if (!(output->component_mask & (1 << comp)))
+					continue;
+
+				tmp = ac_build_gep0(&ctx->ac, vertexptr,
+						    LLVMConstInt(ctx->ac.i32, 4 * i + comp, false));
+				out.values[comp] = LLVMBuildLoad(builder, tmp, "");
+			}
+
+			radv_emit_stream_output(ctx, so_buffer, offset, output, &out);
+		}
 	}
 }