Merge remote-tracking branch 'mattst88/nir-lower-pack-unpack' into vulkan

Jason Ekstrand 2016-01-25 15:50:31 -08:00
commit e462d4d815
145 changed files with 2306 additions and 1271 deletions

View File

@ -271,6 +271,7 @@ C_SOURCES := \
util/u_prim_restart.h \
util/u_pstipple.c \
util/u_pstipple.h \
util/u_pwr8.h \
util/u_range.h \
util/u_rect.h \
util/u_resource.c \

View File

@ -1618,6 +1618,12 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
context_ptr = LLVMGetParam(variant_func, 0);
io_ptr = LLVMGetParam(variant_func, 1);
vbuffers_ptr = LLVMGetParam(variant_func, 2);
/*
* XXX: stride is actually unused. The stride we use is strictly calculated
* from the number of outputs (including the draw_extra outputs).
* Should probably fix some day (we need a new vs just because of extra
* outputs which the generated vs won't touch).
*/
stride = LLVMGetParam(variant_func, 5 + (elts ? 1 : 0));
vb_ptr = LLVMGetParam(variant_func, 6 + (elts ? 1 : 0));
system_values.instance_id = LLVMGetParam(variant_func, 7 + (elts ? 1 : 0));

View File

@ -461,50 +461,49 @@ lp_build_pack2(struct gallivm_state *gallivm,
assert(src_type.length * 2 == dst_type.length);
/* Check for special cases first */
if((util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec) &&
src_type.width * src_type.length >= 128) {
if ((util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec) &&
src_type.width * src_type.length >= 128) {
const char *intrinsic = NULL;
boolean swap_intrinsic_operands = FALSE;
switch(src_type.width) {
case 32:
if (util_cpu_caps.has_sse2) {
if(dst_type.sign) {
if (dst_type.sign) {
intrinsic = "llvm.x86.sse2.packssdw.128";
}
else {
} else {
if (util_cpu_caps.has_sse4_1) {
intrinsic = "llvm.x86.sse41.packusdw";
}
}
} else if (util_cpu_caps.has_altivec) {
if (dst_type.sign) {
intrinsic = "llvm.ppc.altivec.vpkswus";
} else {
intrinsic = "llvm.ppc.altivec.vpkuwus";
}
intrinsic = "llvm.ppc.altivec.vpkswss";
} else {
intrinsic = "llvm.ppc.altivec.vpkuwus";
}
#ifdef PIPE_ARCH_LITTLE_ENDIAN
swap_intrinsic_operands = TRUE;
swap_intrinsic_operands = TRUE;
#endif
}
break;
case 16:
if (dst_type.sign) {
if (util_cpu_caps.has_sse2) {
intrinsic = "llvm.x86.sse2.packsswb.128";
intrinsic = "llvm.x86.sse2.packsswb.128";
} else if (util_cpu_caps.has_altivec) {
intrinsic = "llvm.ppc.altivec.vpkshss";
intrinsic = "llvm.ppc.altivec.vpkshss";
#ifdef PIPE_ARCH_LITTLE_ENDIAN
swap_intrinsic_operands = TRUE;
swap_intrinsic_operands = TRUE;
#endif
}
} else {
if (util_cpu_caps.has_sse2) {
intrinsic = "llvm.x86.sse2.packuswb.128";
intrinsic = "llvm.x86.sse2.packuswb.128";
} else if (util_cpu_caps.has_altivec) {
intrinsic = "llvm.ppc.altivec.vpkshus";
intrinsic = "llvm.ppc.altivec.vpkshus";
#ifdef PIPE_ARCH_LITTLE_ENDIAN
swap_intrinsic_operands = TRUE;
swap_intrinsic_operands = TRUE;
#endif
}
}
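
The fix above replaces vpkswus with vpkswss in the signed-destination case. As a standalone illustration (plain C, not the driver code), the difference is the saturation range each intrinsic applies per lane, which matters as soon as a lane is negative:

#include <assert.h>
#include <stdint.h>

/* vpkswss: signed 32-bit -> signed 16-bit, saturating */
static int16_t pack_ss(int32_t v)
{
   return v < INT16_MIN ? INT16_MIN : v > INT16_MAX ? INT16_MAX : (int16_t)v;
}

/* vpkswus: signed 32-bit -> unsigned 16-bit, saturating */
static uint16_t pack_us(int32_t v)
{
   return v < 0 ? 0 : v > (int32_t)UINT16_MAX ? UINT16_MAX : (uint16_t)v;
}

int main(void)
{
   assert(pack_ss(-1000) == -1000);   /* signed pack keeps negative lanes */
   assert(pack_us(-1000) == 0);       /* unsigned pack clamps them to 0 */
   return 0;
}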

View File

@ -1536,8 +1536,22 @@ mod_emit_cpu(
struct lp_build_tgsi_context * bld_base,
struct lp_build_emit_data * emit_data)
{
emit_data->output[emit_data->chan] = lp_build_mod(&bld_base->int_bld,
emit_data->args[0], emit_data->args[1]);
LLVMBuilderRef builder = bld_base->base.gallivm->builder;
LLVMValueRef div_mask = lp_build_cmp(&bld_base->uint_bld,
PIPE_FUNC_EQUAL, emit_data->args[1],
bld_base->uint_bld.zero);
/* We want to make sure that we never divide/mod by zero, so we
* don't generate SIGFPE. We don't want to crash just because the
* shader is doing something weird. */
LLVMValueRef divisor = LLVMBuildOr(builder,
div_mask,
emit_data->args[1], "");
LLVMValueRef result = lp_build_mod(&bld_base->int_bld,
emit_data->args[0], divisor);
/* umod by zero doesn't have a guaranteed return value; we chose -1 for now. */
emit_data->output[emit_data->chan] = LLVMBuildOr(builder,
div_mask,
result, "");
}
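
In isolation, the masking trick above works like this (scalar sketch; assumes lp_build_cmp yields an all-ones mask where the comparison holds):

#include <assert.h>
#include <stdint.h>

static uint32_t safe_umod(uint32_t a, uint32_t b)
{
   uint32_t div_mask = (b == 0) ? ~0u : 0;   /* lp_build_cmp analogue */
   uint32_t divisor = b | div_mask;          /* never zero, so no SIGFPE */
   uint32_t result = a % divisor;
   return result | div_mask;                 /* mod-by-zero lanes become ~0 (-1) */
}

int main(void)
{
   assert(safe_umod(7, 3) == 1);             /* normal case is unchanged */
   assert(safe_umod(7, 0) == 0xffffffffu);   /* defined result, no crash */
   return 0;
}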
/* TGSI_OPCODE_NOT */

View File

@ -673,10 +673,6 @@ ttn_get_dest(struct ttn_compile *c, struct tgsi_full_dst_register *tgsi_fdst)
if (tgsi_dst->File == TGSI_FILE_TEMPORARY) {
if (c->temp_regs[index].var) {
nir_builder *b = &c->build;
nir_intrinsic_instr *load;
struct tgsi_ind_register *indirect =
tgsi_dst->Indirect ? &tgsi_fdst->Indirect : NULL;
nir_register *reg;
/* this works, because TGSI will give us a base offset
@ -690,26 +686,6 @@ ttn_get_dest(struct ttn_compile *c, struct tgsi_full_dst_register *tgsi_fdst)
reg->num_components = 4;
dest.dest.reg.reg = reg;
dest.dest.reg.base_offset = 0;
/* since the alu op might not write to all components
* of the temporary, we must first do a load_var to
* get the previous array elements into the register.
* This is one area that NIR could use a bit of
* improvement (or opt pass to clean up the mess
* once things are scalarized)
*/
load = nir_intrinsic_instr_create(c->build.shader,
nir_intrinsic_load_var);
load->num_components = 4;
load->variables[0] =
ttn_array_deref(c, load, c->temp_regs[index].var,
c->temp_regs[index].offset,
indirect);
load->dest = nir_dest_for_reg(reg);
nir_builder_instr_insert(b, &load->instr);
} else {
assert(!tgsi_dst->Indirect);
dest.dest.reg.reg = c->temp_regs[index].reg;
@ -1886,7 +1862,7 @@ ttn_emit_instruction(struct ttn_compile *c)
ttn_move_dest(b, dest, nir_fsat(b, ttn_src_for_dest(b, &dest)));
}
/* if the dst has a matching var, append store_global to move
/* if the dst has a matching var, append store_var to move
* output from reg to var
*/
nir_variable *var = ttn_get_var(c, tgsi_dst);
@ -1899,7 +1875,7 @@ ttn_emit_instruction(struct ttn_compile *c)
&tgsi_dst->Indirect : NULL;
store->num_components = 4;
store->const_index[0] = 0xf;
store->const_index[0] = dest.write_mask;
store->variables[0] = ttn_array_deref(c, store, var, offset, indirect);
store->src[0] = nir_src_for_reg(dest.dest.reg.reg);
@ -1932,6 +1908,7 @@ ttn_add_output_stores(struct ttn_compile *c)
store->src[0].reg.reg = c->output_regs[loc].reg;
store->src[0].reg.base_offset = c->output_regs[loc].offset;
store->const_index[0] = loc;
store->const_index[1] = 0xf; /* writemask */
store->src[1] = nir_src_for_ssa(nir_imm_int(b, 0));
nir_builder_instr_insert(b, &store->instr);
}

View File

@ -110,6 +110,7 @@ tgsi_default_declaration( void )
declaration.Invariant = 0;
declaration.Local = 0;
declaration.Array = 0;
declaration.Atomic = 0;
declaration.Padding = 0;
return declaration;

View File

@ -230,6 +230,7 @@ pstip_transform_immed(struct tgsi_transform_context *ctx,
struct pstip_transform_context *pctx =
(struct pstip_transform_context *) ctx;
pctx->numImmed++;
ctx->emit_immediate(ctx, immed);
}

View File

@ -153,6 +153,12 @@ vec_mullo_epi32 (__m128i a, __m128i b)
return v;
}
static inline __m128i
vec_andnot_si128 (__m128i a, __m128i b)
{
return vec_andc (b, a);
}
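
The operand swap is the whole point of the wrapper: SSE's _mm_andnot_si128(a, b) computes ~a & b per lane, while AltiVec's vec_andc(x, y) computes x & ~y, so vec_andc(b, a) produces the same value. A plain-C spot check of the identity:

#include <assert.h>
#include <stdint.h>

int main(void)
{
   uint32_t a = 0xf0f0f0f0u, b = 0xff00ff00u;
   assert((~a & b) == (b & ~a));   /* andnot(a, b) == andc(b, a) */
   return 0;
}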
static inline void
transpose4_epi32(const __m128i * restrict a,
const __m128i * restrict b,

View File

@ -305,6 +305,7 @@ The integer capabilities:
for buffers is supported.
* ``PIPE_CAP_GENERATE_MIPMAP``: Indicates whether pipe_context::generate_mipmap
is supported.
* ``PIPE_CAP_STRING_MARKER``: Whether pipe->emit_string_marker() is supported.
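
A hypothetical caller (not part of this commit; the helper name is made up) would check the cap before using the hook:

#include <string.h>
#include "pipe/p_context.h"
#include "pipe/p_defines.h"
#include "pipe/p_screen.h"

static void
emit_marker_if_supported(struct pipe_context *pipe, const char *msg)
{
   if (pipe->screen->get_param(pipe->screen, PIPE_CAP_STRING_MARKER))
      pipe->emit_string_marker(pipe, msg, (int)strlen(msg));
}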
.. _pipe_capf:

View File

@ -109,6 +109,7 @@ fd2_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags)
fd2_gmem_init(pctx);
fd2_texture_init(pctx);
fd2_prog_init(pctx);
fd2_emit_init(pctx);
pctx = fd_context_init(&fd2_ctx->base, pscreen,
(screen->gpu_id >= 220) ? a22x_primtypes : a20x_primtypes,

View File

@ -446,3 +446,17 @@ fd2_emit_setup(struct fd_context *ctx)
fd_ringbuffer_flush(ring);
fd_ringmarker_mark(ctx->draw_start);
}
static void
fd2_emit_ib(struct fd_ringbuffer *ring, struct fd_ringmarker *start,
struct fd_ringmarker *end)
{
__OUT_IB(ring, false, start, end);
}
void
fd2_emit_init(struct pipe_context *pctx)
{
struct fd_context *ctx = fd_context(pctx);
ctx->emit_ib = fd2_emit_ib;
}
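
The new emit_ib hook follows the same per-generation function-pointer pattern as emit_const: each generation installs its callback at context init, so the generation-neutral tiling code (the OUT_IB call sites converted below) never has to branch on the GPU id. A minimal standalone sketch of the pattern, with illustrative names and an assumed numeric split:

#include <stdio.h>

struct ring;

struct context {
   void (*emit_ib)(struct ring *r);   /* installed once at init */
};

static void a2xx_emit_ib(struct ring *r) { (void)r; puts("IB, no prefetch (PFD)"); }
static void a3xx_emit_ib(struct ring *r) { (void)r; puts("IB, prefetch (PFE)"); }

static void
context_init(struct context *ctx, int gpu_id)
{
   ctx->emit_ib = (gpu_id >= 300) ? a3xx_emit_ib : a2xx_emit_ib;
}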

View File

@ -45,4 +45,6 @@ void fd2_emit_vertex_bufs(struct fd_ringbuffer *ring, uint32_t val,
void fd2_emit_state(struct fd_context *ctx, uint32_t dirty);
void fd2_emit_setup(struct fd_context *ctx);
void fd2_emit_init(struct pipe_context *pctx);
#endif /* FD2_EMIT_H */

View File

@ -891,10 +891,18 @@ fd3_emit_restore(struct fd_context *ctx)
ctx->needs_rb_fbd = true;
}
static void
fd3_emit_ib(struct fd_ringbuffer *ring, struct fd_ringmarker *start,
struct fd_ringmarker *end)
{
__OUT_IB(ring, true, start, end);
}
void
fd3_emit_init(struct pipe_context *pctx)
{
struct fd_context *ctx = fd_context(pctx);
ctx->emit_const = fd3_emit_const;
ctx->emit_const_bo = fd3_emit_const_bo;
ctx->emit_ib = fd3_emit_ib;
}

View File

@ -853,7 +853,7 @@ emit_binning_pass(struct fd_context *ctx)
A3XX_PC_VSTREAM_CONTROL_N(0));
/* emit IB to binning drawcmds: */
OUT_IB(ring, ctx->binning_start, ctx->binning_end);
ctx->emit_ib(ring, ctx->binning_start, ctx->binning_end);
fd_reset_wfi(ctx);
fd_wfi(ctx, ring);

View File

@ -885,10 +885,18 @@ fd4_emit_restore(struct fd_context *ctx)
ctx->needs_rb_fbd = true;
}
static void
fd4_emit_ib(struct fd_ringbuffer *ring, struct fd_ringmarker *start,
struct fd_ringmarker *end)
{
__OUT_IB(ring, true, start, end);
}
void
fd4_emit_init(struct pipe_context *pctx)
{
struct fd_context *ctx = fd_context(pctx);
ctx->emit_const = fd4_emit_const;
ctx->emit_const_bo = fd4_emit_const_bo;
ctx->emit_ib = fd4_emit_ib;
}

View File

@ -217,6 +217,7 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit,
struct stage s[MAX_STAGES];
uint32_t pos_regid, posz_regid, psize_regid, color_regid[8];
uint32_t face_regid, coord_regid, zwcoord_regid;
enum a3xx_threadsize fssz;
int constmode;
int i, j, k;
@ -224,6 +225,8 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit,
setup_stages(emit, s);
fssz = (s[FS].i->max_reg >= 24) ? TWO_QUADS : FOUR_QUADS;
/* blob seems to always use constmode currently: */
constmode = 1;
@ -258,7 +261,7 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit,
OUT_RING(ring, 0x00000003);
OUT_PKT0(ring, REG_A4XX_HLSQ_CONTROL_0_REG, 5);
OUT_RING(ring, A4XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(FOUR_QUADS) |
OUT_RING(ring, A4XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(fssz) |
A4XX_HLSQ_CONTROL_0_REG_CONSTMODE(constmode) |
A4XX_HLSQ_CONTROL_0_REG_FSSUPERTHREADENABLE |
/* NOTE: I guess SHADERRESTART and CONSTFULLUPDATE maybe
@ -385,7 +388,7 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit,
A4XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(s[FS].i->max_half_reg + 1) |
A4XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(s[FS].i->max_reg + 1) |
A4XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(1) |
A4XX_SP_FS_CTRL_REG0_THREADSIZE(FOUR_QUADS) |
A4XX_SP_FS_CTRL_REG0_THREADSIZE(fssz) |
A4XX_SP_FS_CTRL_REG0_SUPERTHREADMODE |
COND(s[FS].v->has_samp, A4XX_SP_FS_CTRL_REG0_PIXLODENABLE));
OUT_RING(ring, A4XX_SP_FS_CTRL_REG1_CONSTLENGTH(s[FS].constlen) |

View File

@ -141,6 +141,32 @@ fd_context_flush(struct pipe_context *pctx, struct pipe_fence_handle **fence,
}
}
/**
* emit marker string as payload of a no-op packet, which can be
* decoded by cffdump.
*/
static void
fd_emit_string_marker(struct pipe_context *pctx, const char *string, int len)
{
struct fd_context *ctx = fd_context(pctx);
struct fd_ringbuffer *ring = ctx->ring;
const uint32_t *buf = (const void *)string;
OUT_PKT3(ring, CP_NOP, align(len, 4) / 4);
while (len >= 4) {
OUT_RING(ring, *buf);
buf++;
len -= 4;
}
/* copy remainder bytes without reading past end of input string: */
if (len > 0) {
uint32_t w = 0;
memcpy(&w, buf, len);
OUT_RING(ring, w);
}
}
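
Reduced to a standalone sketch, the padding scheme above copies whole 32-bit words directly and routes the tail through a zero-initialized temporary, so the ring never sees bytes past the end of the caller's string:

#include <stdint.h>
#include <string.h>

/* returns the number of words written; out must hold align(len, 4) / 4 words */
static unsigned
pack_string_words(const char *s, int len, uint32_t *out)
{
   unsigned n = 0;
   while (len >= 4) {
      memcpy(&out[n++], s, 4);
      s += 4;
      len -= 4;
   }
   if (len > 0) {
      uint32_t w = 0;
      memcpy(&w, s, len);   /* zero-padded final word */
      out[n++] = w;
   }
   return n;
}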
void
fd_context_destroy(struct pipe_context *pctx)
{
@ -207,6 +233,7 @@ fd_context_init(struct fd_context *ctx, struct pipe_screen *pscreen,
pctx->screen = pscreen;
pctx->priv = priv;
pctx->flush = fd_context_flush;
pctx->emit_string_marker = fd_emit_string_marker;
for (i = 0; i < ARRAY_SIZE(ctx->rings); i++) {
ctx->rings[i] = fd_ringbuffer_new(screen->pipe, 0x100000);

View File

@ -386,6 +386,10 @@ struct fd_context {
const uint32_t *dwords, struct pipe_resource *prsc);
void (*emit_const_bo)(struct fd_ringbuffer *ring, enum shader_t type, boolean write,
uint32_t regid, uint32_t num, struct fd_bo **bos, uint32_t *offsets);
/* indirect-branch emit: */
void (*emit_ib)(struct fd_ringbuffer *ring, struct fd_ringmarker *start,
struct fd_ringmarker *end);
};
static inline struct fd_context *

View File

@ -331,7 +331,7 @@ render_tiles(struct fd_context *ctx)
fd_hw_query_prepare_tile(ctx, i, ctx->ring);
/* emit IB to drawcmds: */
OUT_IB(ctx->ring, ctx->draw_start, ctx->draw_end);
ctx->emit_ib(ctx->ring, ctx->draw_start, ctx->draw_end);
fd_reset_wfi(ctx);
/* emit gmem2mem to transfer tile back to system memory: */
@ -349,7 +349,7 @@ render_sysmem(struct fd_context *ctx)
fd_hw_query_prepare_tile(ctx, 0, ctx->ring);
/* emit IB to drawcmds: */
OUT_IB(ctx->ring, ctx->draw_start, ctx->draw_end);
ctx->emit_ib(ctx->ring, ctx->draw_start, ctx->draw_end);
fd_reset_wfi(ctx);
}

View File

@ -155,6 +155,7 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_USER_CONSTANT_BUFFERS:
case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT:
case PIPE_CAP_VERTEXID_NOBASE:
case PIPE_CAP_STRING_MARKER:
return 1;
case PIPE_CAP_SHADER_STENCIL_EXPORT:
@ -400,9 +401,16 @@ fd_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
return 1;
case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR:
case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR:
/* Technically this should be the same as for TEMP/CONST, since
* everything is just normal registers. This is just a temporary
* hack until load_input/store_output handle arrays in a similar
* way to load_var/store_var..
*/
return 0;
case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR:
case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR:
return 1;
/* a2xx compiler doesn't handle indirect: */
return is_ir3(screen) ? 1 : 0;
case PIPE_SHADER_CAP_SUBROUTINES:
case PIPE_SHADER_CAP_DOUBLES:
case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
@ -566,6 +574,7 @@ fd_screen_create(struct fd_device *dev)
fd3_screen_init(pscreen);
break;
case 420:
case 430:
fd4_screen_init(pscreen);
break;
default:

View File

@ -265,8 +265,8 @@ OUT_WFI(struct fd_ringbuffer *ring)
}
static inline void
OUT_IB(struct fd_ringbuffer *ring, struct fd_ringmarker *start,
struct fd_ringmarker *end)
__OUT_IB(struct fd_ringbuffer *ring, bool prefetch,
struct fd_ringmarker *start, struct fd_ringmarker *end)
{
uint32_t dwords = fd_ringmarker_dwords(start, end);
@ -280,7 +280,7 @@ OUT_IB(struct fd_ringbuffer *ring, struct fd_ringmarker *start,
*/
emit_marker(ring, 6);
OUT_PKT3(ring, CP_INDIRECT_BUFFER_PFD, 2);
OUT_PKT3(ring, prefetch ? CP_INDIRECT_BUFFER_PFE : CP_INDIRECT_BUFFER_PFD, 2);
fd_ringbuffer_emit_reloc_ring(ring, start, end);
OUT_RING(ring, dwords);

View File

@ -220,7 +220,7 @@ static void print_instr_cat1(instr_t *instr)
else if (cat1->off > 0)
printf("%c<a0.x + %d>", type, cat1->off);
else
printf("c<a0.x>");
printf("%c<a0.x>", type);
} else {
print_reg_src((reg_t)(cat1->src), type_size(cat1->src_type) == 32,
cat1->src_r, cat1->src_c, cat1->src_im, false, false, false);
@ -650,7 +650,7 @@ static void print_instr_cat6(instr_t *instr)
/* size of largest OPC field of all the instruction categories: */
#define NOPC_BITS 6
struct opc_info {
static const struct opc_info {
uint16_t cat;
uint16_t opc;
const char *name;

View File

@ -261,6 +261,7 @@ typedef union PACKED {
/* to make compiler happy: */
uint32_t dummy32;
uint32_t dummy10 : 10;
int32_t idummy10 : 10;
uint32_t dummy11 : 11;
uint32_t dummy12 : 12;
uint32_t dummy13 : 13;

View File

@ -81,6 +81,7 @@ struct ir3 * ir3_create(struct ir3_compiler *compiler,
shader->outputs = ir3_alloc(shader, sizeof(shader->outputs[0]) * nout);
list_inithead(&shader->block_list);
list_inithead(&shader->array_list);
return shader;
}
@ -121,18 +122,19 @@ static uint32_t reg(struct ir3_register *reg, struct ir3_info *info,
val.iim_val = reg->iim_val;
} else {
unsigned components;
int16_t max;
if (reg->flags & IR3_REG_RELATIV) {
components = reg->size;
val.dummy10 = reg->offset;
val.idummy10 = reg->array.offset;
max = (reg->array.offset + repeat + components - 1) >> 2;
} else {
components = util_last_bit(reg->wrmask);
val.comp = reg->num & 0x3;
val.num = reg->num >> 2;
max = (reg->num + repeat + components - 1) >> 2;
}
int16_t max = (reg->num + repeat + components - 1) >> 2;
if (reg->flags & IR3_REG_CONST) {
info->max_const = MAX2(info->max_const, max);
} else if (val.num == 63) {
@ -233,7 +235,7 @@ static int emit_cat2(struct ir3_instruction *instr, void *ptr,
iassert((instr->regs_count == 2) || (instr->regs_count == 3));
if (src1->flags & IR3_REG_RELATIV) {
iassert(src1->num < (1 << 10));
iassert(src1->array.offset < (1 << 10));
cat2->rel1.src1 = reg(src1, info, instr->repeat,
IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R |
IR3_REG_HALF | absneg);
@ -260,7 +262,7 @@ static int emit_cat2(struct ir3_instruction *instr, void *ptr,
!((src1->flags ^ src2->flags) & IR3_REG_HALF));
if (src2->flags & IR3_REG_RELATIV) {
iassert(src2->num < (1 << 10));
iassert(src2->array.offset < (1 << 10));
cat2->rel2.src2 = reg(src2, info, instr->repeat,
IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R |
IR3_REG_HALF | absneg);
@ -333,7 +335,7 @@ static int emit_cat3(struct ir3_instruction *instr, void *ptr,
iassert(!((src3->flags ^ src_flags) & IR3_REG_HALF));
if (src1->flags & IR3_REG_RELATIV) {
iassert(src1->num < (1 << 10));
iassert(src1->array.offset < (1 << 10));
cat3->rel1.src1 = reg(src1, info, instr->repeat,
IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R |
IR3_REG_HALF | absneg);
@ -361,7 +363,7 @@ static int emit_cat3(struct ir3_instruction *instr, void *ptr,
if (src3->flags & IR3_REG_RELATIV) {
iassert(src3->num < (1 << 10));
iassert(src3->array.offset < (1 << 10));
cat3->rel2.src3 = reg(src3, info, instr->repeat,
IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R |
IR3_REG_HALF | absneg);
@ -404,7 +406,7 @@ static int emit_cat4(struct ir3_instruction *instr, void *ptr,
iassert(instr->regs_count == 2);
if (src->flags & IR3_REG_RELATIV) {
iassert(src->num < (1 << 10));
iassert(src->array.offset < (1 << 10));
cat4->rel.src = reg(src, info, instr->repeat,
IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_FNEG |
IR3_REG_FABS | IR3_REG_R | IR3_REG_HALF);
@ -737,6 +739,14 @@ struct ir3_register * ir3_reg_create(struct ir3_instruction *instr,
return reg;
}
struct ir3_register * ir3_reg_clone(struct ir3 *shader,
struct ir3_register *reg)
{
struct ir3_register *new_reg = reg_create(shader, 0, 0);
*new_reg = *reg;
return new_reg;
}
void
ir3_instr_set_address(struct ir3_instruction *instr,
struct ir3_instruction *addr)
@ -777,3 +787,12 @@ ir3_count_instructions(struct ir3 *ir)
}
return cnt;
}
struct ir3_array *
ir3_lookup_array(struct ir3 *ir, unsigned id)
{
list_for_each_entry (struct ir3_array, arr, &ir->array_list, node)
if (arr->id == id)
return arr;
return NULL;
}

View File

@ -83,7 +83,8 @@ struct ir3_register {
* before register assignment is done:
*/
IR3_REG_SSA = 0x2000, /* 'instr' is ptr to assigning instr */
IR3_REG_PHI_SRC= 0x4000, /* phi src, regs[0]->instr points to phi */
IR3_REG_ARRAY = 0x4000,
IR3_REG_PHI_SRC= 0x8000, /* phi src, regs[0]->instr points to phi */
} flags;
union {
@ -97,11 +98,18 @@ struct ir3_register {
uint32_t uim_val;
float fim_val;
/* relative: */
int offset;
struct {
uint16_t id;
int16_t offset;
} array;
};
/* for IR3_REG_SSA, src registers contain ptr back to
* assigning instruction.
/* For IR3_REG_SSA, src registers contain ptr back to assigning
* instruction.
*
* For IR3_REG_ARRAY, the pointer is back to the last dependent
* array access (although the net effect is the same, it points
* back to a previous instruction that we depend on).
*/
struct ir3_instruction *instr;
@ -221,9 +229,6 @@ struct ir3_instruction {
struct {
int off; /* component/offset */
} fo;
struct {
int aid;
} fi;
struct {
/* used to temporarily hold reference to nir_phi_instr
* until we resolve the phi srcs
@ -293,19 +298,6 @@ struct ir3_instruction {
*/
struct ir3_instruction *address;
/* in case of an instruction with a relative dst, we need to
* capture the dependency on the fanin for the previous values of
* the array elements. Since we don't know at compile time actually
* which array elements are written, this serves to preserve the
* unconditional write to array elements prior to the conditional
* write.
*
* TODO only cat1 can do indirect write.. we could maybe move this
* into instr->cat1.fanin (but would require the frontend to insert
* the extra mov)
*/
struct ir3_instruction *fanin;
/* Entry in ir3_block's instruction list: */
struct list_head node;
@ -379,10 +371,41 @@ struct ir3 {
/* List of blocks: */
struct list_head block_list;
/* List of ir3_array's: */
struct list_head array_list;
unsigned heap_idx;
struct ir3_heap_chunk *chunk;
};
typedef struct nir_variable nir_variable;
struct ir3_array {
struct list_head node;
unsigned length;
unsigned id;
nir_variable *var;
/* We track the last write and last access (read or write) to
* set up dependencies on instructions that read or write the
* array. Reads can be re-ordered wrt. other reads, but should
* not be re-ordered wrt. writes. Writes cannot be reordered
* wrt. any other access to the array.
*
* So array reads depend on last write, and array writes depend
* on the last access.
*/
struct ir3_instruction *last_write, *last_access;
/* extra stuff used in RA pass: */
unsigned base; /* base vreg name */
unsigned reg; /* base physical reg */
uint16_t start_ip, end_ip;
};
struct ir3_array * ir3_lookup_array(struct ir3 *ir, unsigned id);
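
The ordering rule from the comment above, as a standalone sketch. The types and the explicit add_dep() are stand-ins: the real compiler expresses the dependency through the register's instr back-pointer rather than a call like this.

struct instr;

struct array_state {
   struct instr *last_write, *last_access;
};

static void add_dep(struct instr *i, struct instr *on) { (void)i; (void)on; }

static void
array_read(struct array_state *a, struct instr *i)
{
   add_dep(i, a->last_write);    /* reads only order against the last write */
   a->last_access = i;
}

static void
array_write(struct array_state *a, struct instr *i)
{
   add_dep(i, a->last_access);   /* writes order against any prior access */
   a->last_write = a->last_access = i;
}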
typedef struct nir_block nir_block;
struct ir3_block {
@ -430,6 +453,8 @@ const char *ir3_instr_name(struct ir3_instruction *instr);
struct ir3_register * ir3_reg_create(struct ir3_instruction *instr,
int num, int flags);
struct ir3_register * ir3_reg_clone(struct ir3 *shader,
struct ir3_register *reg);
void ir3_instr_set_address(struct ir3_instruction *instr,
struct ir3_instruction *addr);
@ -510,6 +535,9 @@ static inline bool is_same_type_mov(struct ir3_instruction *instr)
if (dst->num == regid(REG_A0, 0))
return false;
if (dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY))
return false;
if ((instr->category == 1) &&
(instr->cat1.src_type == instr->cat1.dst_type))
return true;
@ -623,8 +651,10 @@ static inline bool writes_pred(struct ir3_instruction *instr)
/* TODO better name */
static inline struct ir3_instruction *ssa(struct ir3_register *reg)
{
if (reg->flags & IR3_REG_SSA)
if (reg->flags & (IR3_REG_SSA | IR3_REG_ARRAY)) {
debug_assert(!(reg->instr && (reg->instr->flags & IR3_INSTR_UNUSED)));
return reg->instr;
}
return NULL;
}
@ -813,8 +843,6 @@ static inline unsigned ir3_cat3_absneg(opc_t opc)
static inline unsigned __ssa_src_cnt(struct ir3_instruction *instr)
{
if (instr->fanin)
return instr->regs_count + 2;
if (instr->address)
return instr->regs_count + 1;
return instr->regs_count;
@ -822,8 +850,6 @@ static inline unsigned __ssa_src_cnt(struct ir3_instruction *instr)
static inline struct ir3_instruction * __ssa_src_n(struct ir3_instruction *instr, unsigned n)
{
if (n == (instr->regs_count + 1))
return instr->fanin;
if (n == (instr->regs_count + 0))
return instr->address;
return ssa(instr->regs[n]);
@ -834,8 +860,8 @@ static inline struct ir3_instruction * __ssa_src_n(struct ir3_instruction *instr
/* iterator for an instruction's SSA sources (instr), also returns src #: */
#define foreach_ssa_src_n(__srcinst, __n, __instr) \
if ((__instr)->regs_count) \
for (unsigned __cnt = __ssa_src_cnt(__instr) - 1, __n = 0; __n < __cnt; __n++) \
if ((__srcinst = __ssa_src_n(__instr, __n + 1)))
for (unsigned __cnt = __ssa_src_cnt(__instr), __n = 0; __n < __cnt; __n++) \
if ((__srcinst = __ssa_src_n(__instr, __n)))
/* iterator for an instruction's SSA sources (instr): */
#define foreach_ssa_src(__srcinst, __instr) \
@ -878,7 +904,15 @@ ir3_MOV(struct ir3_block *block, struct ir3_instruction *src, type_t type)
struct ir3_instruction *instr =
ir3_instr_create(block, 1, 0);
ir3_reg_create(instr, 0, 0); /* dst */
ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
if (src->regs[0]->flags & IR3_REG_ARRAY) {
struct ir3_register *src_reg =
ir3_reg_create(instr, 0, IR3_REG_ARRAY);
src_reg->array = src->regs[0]->array;
src_reg->instr = src;
} else {
ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
}
debug_assert(!(src->regs[0]->flags & IR3_REG_RELATIV));
instr->cat1.src_type = type;
instr->cat1.dst_type = type;
return instr;
@ -894,6 +928,7 @@ ir3_COV(struct ir3_block *block, struct ir3_instruction *src,
ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
instr->cat1.src_type = src_type;
instr->cat1.dst_type = dst_type;
debug_assert(!(src->regs[0]->flags & IR3_REG_ARRAY));
return instr;
}
@ -1083,7 +1118,7 @@ typedef uint8_t regmask_t[2 * MAX_REG / 8];
static inline unsigned regmask_idx(struct ir3_register *reg)
{
unsigned num = reg->num;
unsigned num = (reg->flags & IR3_REG_RELATIV) ? reg->array.offset : reg->num;
debug_assert(num < MAX_REG);
if (reg->flags & IR3_REG_HALF)
num += MAX_REG;

View File

@ -46,7 +46,6 @@
struct ir3_compile {
struct ir3_compiler *compiler;
const struct tgsi_token *tokens;
struct nir_shader *s;
struct ir3 *ir;
@ -75,8 +74,6 @@ struct ir3_compile {
/* mapping from nir_register to defining instruction: */
struct hash_table *def_ht;
/* mapping from nir_variable to ir3_array: */
struct hash_table *var_ht;
unsigned num_arrays;
/* a common pattern for indirect addressing is to request the
@ -143,8 +140,6 @@ compile_init(struct ir3_compiler *compiler,
ctx->so = so;
ctx->def_ht = _mesa_hash_table_create(ctx,
_mesa_hash_pointer, _mesa_key_pointer_equal);
ctx->var_ht = _mesa_hash_table_create(ctx,
_mesa_hash_pointer, _mesa_key_pointer_equal);
ctx->block_ht = _mesa_hash_table_create(ctx,
_mesa_hash_pointer, _mesa_key_pointer_equal);
@ -221,206 +216,26 @@ compile_free(struct ir3_compile *ctx)
ralloc_free(ctx);
}
/* global per-array information: */
struct ir3_array {
unsigned length, aid;
};
/* per-block array state: */
struct ir3_array_value {
/* TODO drop length/aid, and just have ptr back to ir3_array */
unsigned length, aid;
/* initial array element values are phi's, other than for the
* entry block. The phi src's get added later in a resolve step
* after we have visited all the blocks, to account for back
* edges in the cfg.
*/
struct ir3_instruction **phis;
/* current array element values (as block is processed). When
* the array phi's are resolved, it will contain the array state
* at exit of block, so successor blocks can use it to add their
* phi srcs.
*/
struct ir3_instruction *arr[];
};
/* track array assignments per basic block. When an array is read
* outside of the same basic block, we can use NIR's dominance-frontier
* information to figure out where phi nodes are needed.
*/
struct ir3_nir_block_data {
unsigned foo;
/* indexed by array-id (aid): */
struct ir3_array_value *arrs[];
};
static struct ir3_nir_block_data *
get_block_data(struct ir3_compile *ctx, struct ir3_block *block)
{
if (!block->data) {
struct ir3_nir_block_data *bd = ralloc_size(ctx, sizeof(*bd) +
((ctx->num_arrays + 1) * sizeof(bd->arrs[0])));
block->data = bd;
}
return block->data;
}
static void
declare_var(struct ir3_compile *ctx, nir_variable *var)
{
unsigned length = glsl_get_length(var->type) * 4; /* always vec4, at least with ttn */
struct ir3_array *arr = ralloc(ctx, struct ir3_array);
arr->id = ++ctx->num_arrays;
arr->length = length;
arr->aid = ++ctx->num_arrays;
_mesa_hash_table_insert(ctx->var_ht, var, arr);
arr->var = var;
list_addtail(&arr->node, &ctx->ir->array_list);
}
static nir_block *
nir_block_pred(nir_block *block)
{
assert(block->predecessors->entries < 2);
if (block->predecessors->entries == 0)
return NULL;
return (nir_block *)_mesa_set_next_entry(block->predecessors, NULL)->key;
}
static struct ir3_array_value *
static struct ir3_array *
get_var(struct ir3_compile *ctx, nir_variable *var)
{
struct hash_entry *entry = _mesa_hash_table_search(ctx->var_ht, var);
struct ir3_block *block = ctx->block;
struct ir3_nir_block_data *bd = get_block_data(ctx, block);
struct ir3_array *arr = entry->data;
if (!bd->arrs[arr->aid]) {
struct ir3_array_value *av = ralloc_size(bd, sizeof(*av) +
(arr->length * sizeof(av->arr[0])));
struct ir3_array_value *defn = NULL;
nir_block *pred_block;
av->length = arr->length;
av->aid = arr->aid;
/* For loops, we have to consider that we have not visited some
* of the blocks who should feed into the phi (ie. back-edges in
* the cfg).. for example:
*
* loop {
* block { load_var; ... }
* if then block {} else block {}
* block { store_var; ... }
* if then block {} else block {}
* block {...}
* }
*
* We can skip the phi if we can chase the block predecessors
* until finding the block previously defining the array without
* crossing a block that has more than one predecessor.
*
* Otherwise create phi's and resolve them as a post-pass after
* all the blocks have been visited (to handle back-edges).
*/
for (pred_block = block->nblock;
pred_block && (pred_block->predecessors->entries < 2) && !defn;
pred_block = nir_block_pred(pred_block)) {
struct ir3_block *pblock = get_block(ctx, pred_block);
struct ir3_nir_block_data *pbd = pblock->data;
if (!pbd)
continue;
defn = pbd->arrs[arr->aid];
}
if (defn) {
/* only one possible definer: */
for (unsigned i = 0; i < arr->length; i++)
av->arr[i] = defn->arr[i];
} else if (pred_block) {
/* not the first block, and multiple potential definers: */
av->phis = ralloc_size(av, arr->length * sizeof(av->phis[0]));
for (unsigned i = 0; i < arr->length; i++) {
struct ir3_instruction *phi;
phi = ir3_instr_create2(block, -1, OPC_META_PHI,
1 + ctx->impl->num_blocks);
ir3_reg_create(phi, 0, 0); /* dst */
/* phi's should go at head of block: */
list_delinit(&phi->node);
list_add(&phi->node, &block->instr_list);
av->phis[i] = av->arr[i] = phi;
}
} else {
/* Some shaders end up reading array elements without
* first writing.. so initialize things to prevent null
* instr ptrs later:
*/
for (unsigned i = 0; i < arr->length; i++)
av->arr[i] = create_immed(block, 0);
}
bd->arrs[arr->aid] = av;
}
return bd->arrs[arr->aid];
}
static void
add_array_phi_srcs(struct ir3_compile *ctx, nir_block *nblock,
struct ir3_array_value *av, BITSET_WORD *visited)
{
struct ir3_block *block;
struct ir3_nir_block_data *bd;
if (BITSET_TEST(visited, nblock->index))
return;
BITSET_SET(visited, nblock->index);
block = get_block(ctx, nblock);
bd = block->data;
if (bd && bd->arrs[av->aid]) {
struct ir3_array_value *dav = bd->arrs[av->aid];
for (unsigned i = 0; i < av->length; i++) {
ir3_reg_create(av->phis[i], 0, IR3_REG_SSA)->instr =
dav->arr[i];
}
} else {
/* didn't find defn, recurse predecessors: */
struct set_entry *entry;
set_foreach(nblock->predecessors, entry) {
add_array_phi_srcs(ctx, (nir_block *)entry->key, av, visited);
}
}
}
static void
resolve_array_phis(struct ir3_compile *ctx, struct ir3_block *block)
{
struct ir3_nir_block_data *bd = block->data;
unsigned bitset_words = BITSET_WORDS(ctx->impl->num_blocks);
if (!bd)
return;
/* TODO use nir dom_frontier to help us with this? */
for (unsigned i = 1; i <= ctx->num_arrays; i++) {
struct ir3_array_value *av = bd->arrs[i];
BITSET_WORD visited[bitset_words];
struct set_entry *entry;
if (!(av && av->phis))
continue;
memset(visited, 0, sizeof(visited));
set_foreach(block->nblock->predecessors, entry) {
add_array_phi_srcs(ctx, (nir_block *)entry->key, av, visited);
}
list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
if (arr->var == var)
return arr;
}
compile_error(ctx, "bogus var: %s\n", var->name);
return NULL;
}
/* allocate an n-element value array (to be populated by caller) and
@ -438,6 +253,7 @@ __get_dst(struct ir3_compile *ctx, void *key, unsigned n)
static struct ir3_instruction **
get_dst(struct ir3_compile *ctx, nir_dest *dst, unsigned n)
{
compile_assert(ctx, dst->is_ssa);
if (dst->is_ssa) {
return __get_dst(ctx, &dst->ssa, n);
} else {
@ -455,6 +271,7 @@ static struct ir3_instruction **
get_src(struct ir3_compile *ctx, nir_src *src)
{
struct hash_entry *entry;
compile_assert(ctx, src->is_ssa);
if (src->is_ssa) {
entry = _mesa_hash_table_search(ctx->def_ht, src->ssa);
} else {
@ -560,7 +377,7 @@ create_uniform(struct ir3_compile *ctx, unsigned n)
}
static struct ir3_instruction *
create_uniform_indirect(struct ir3_compile *ctx, unsigned n,
create_uniform_indirect(struct ir3_compile *ctx, int n,
struct ir3_instruction *address)
{
struct ir3_instruction *mov;
@ -569,7 +386,7 @@ create_uniform_indirect(struct ir3_compile *ctx, unsigned n,
mov->cat1.src_type = TYPE_U32;
mov->cat1.dst_type = TYPE_U32;
ir3_reg_create(mov, 0, 0);
ir3_reg_create(mov, n, IR3_REG_CONST | IR3_REG_RELATIV);
ir3_reg_create(mov, 0, IR3_REG_CONST | IR3_REG_RELATIV)->array.offset = n;
ir3_instr_set_address(mov, address);
@ -594,7 +411,7 @@ create_collect(struct ir3_block *block, struct ir3_instruction **arr,
}
static struct ir3_instruction *
create_indirect_load(struct ir3_compile *ctx, unsigned arrsz, unsigned n,
create_indirect_load(struct ir3_compile *ctx, unsigned arrsz, int n,
struct ir3_instruction *address, struct ir3_instruction *collect)
{
struct ir3_block *block = ctx->block;
@ -608,17 +425,45 @@ create_indirect_load(struct ir3_compile *ctx, unsigned arrsz, unsigned n,
src = ir3_reg_create(mov, 0, IR3_REG_SSA | IR3_REG_RELATIV);
src->instr = collect;
src->size = arrsz;
src->offset = n;
src->array.offset = n;
ir3_instr_set_address(mov, address);
return mov;
}
/* relative (indirect) if address!=NULL */
static struct ir3_instruction *
create_indirect_store(struct ir3_compile *ctx, unsigned arrsz, unsigned n,
struct ir3_instruction *src, struct ir3_instruction *address,
struct ir3_instruction *collect)
create_var_load(struct ir3_compile *ctx, struct ir3_array *arr, int n,
struct ir3_instruction *address)
{
struct ir3_block *block = ctx->block;
struct ir3_instruction *mov;
struct ir3_register *src;
mov = ir3_instr_create(block, 1, 0);
mov->cat1.src_type = TYPE_U32;
mov->cat1.dst_type = TYPE_U32;
ir3_reg_create(mov, 0, 0);
src = ir3_reg_create(mov, 0, IR3_REG_ARRAY |
COND(address, IR3_REG_RELATIV));
src->instr = arr->last_write;
src->size = arr->length;
src->array.id = arr->id;
src->array.offset = n;
if (address)
ir3_instr_set_address(mov, address);
arr->last_access = mov;
return mov;
}
/* relative (indirect) if address!=NULL */
static struct ir3_instruction *
create_var_store(struct ir3_compile *ctx, struct ir3_array *arr, int n,
struct ir3_instruction *src, struct ir3_instruction *address)
{
struct ir3_block *block = ctx->block;
struct ir3_instruction *mov;
@ -627,14 +472,18 @@ create_indirect_store(struct ir3_compile *ctx, unsigned arrsz, unsigned n,
mov = ir3_instr_create(block, 1, 0);
mov->cat1.src_type = TYPE_U32;
mov->cat1.dst_type = TYPE_U32;
dst = ir3_reg_create(mov, 0, IR3_REG_RELATIV);
dst->size = arrsz;
dst->offset = n;
dst = ir3_reg_create(mov, 0, IR3_REG_ARRAY |
COND(address, IR3_REG_RELATIV));
dst->instr = arr->last_access;
dst->size = arr->length;
dst->array.id = arr->id;
dst->array.offset = n;
ir3_reg_create(mov, 0, IR3_REG_SSA)->instr = src;
mov->fanin = collect;
ir3_instr_set_address(mov, address);
arr->last_write = arr->last_access = mov;
return mov;
}
@ -1151,7 +1000,7 @@ emit_intrinsic_load_ubo(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
nir_const_value *const_offset;
/* UBO addresses are the first driver params: */
unsigned ubo = regid(ctx->so->first_driver_param + IR3_UBOS_OFF, 0);
unsigned off = intr->const_index[0];
int off = intr->const_index[0];
/* First src is ubo index, which could either be an immed or not: */
src0 = get_src(ctx, &intr->src[0])[0];
@ -1199,7 +1048,7 @@ emit_intrinsic_load_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
{
nir_deref_var *dvar = intr->variables[0];
nir_deref_array *darr = nir_deref_as_array(dvar->deref.child);
struct ir3_array_value *arr = get_var(ctx, dvar->var);
struct ir3_array *arr = get_var(ctx, dvar->var);
compile_assert(ctx, dvar->deref.child &&
(dvar->deref.child->deref_type == nir_deref_type_array));
@ -1210,19 +1059,17 @@ emit_intrinsic_load_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
for (int i = 0; i < intr->num_components; i++) {
unsigned n = darr->base_offset * 4 + i;
compile_assert(ctx, n < arr->length);
dst[i] = arr->arr[n];
dst[i] = create_var_load(ctx, arr, n, NULL);
}
break;
case nir_deref_array_type_indirect: {
/* for indirect, we need to collect all the array elements: */
struct ir3_instruction *collect =
create_collect(ctx->block, arr->arr, arr->length);
struct ir3_instruction *addr =
get_addr(ctx, get_src(ctx, &darr->indirect)[0]);
for (int i = 0; i < intr->num_components; i++) {
unsigned n = darr->base_offset * 4 + i;
compile_assert(ctx, n < arr->length);
dst[i] = create_indirect_load(ctx, arr->length, n, addr, collect);
dst[i] = create_var_load(ctx, arr, n, addr);
}
break;
}
@ -1239,8 +1086,9 @@ emit_intrinsic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
{
nir_deref_var *dvar = intr->variables[0];
nir_deref_array *darr = nir_deref_as_array(dvar->deref.child);
struct ir3_array_value *arr = get_var(ctx, dvar->var);
struct ir3_instruction **src;
struct ir3_array *arr = get_var(ctx, dvar->var);
struct ir3_instruction *addr, **src;
unsigned wrmask = intr->const_index[0];
compile_assert(ctx, dvar->deref.child &&
(dvar->deref.child->deref_type == nir_deref_type_array));
@ -1249,66 +1097,24 @@ emit_intrinsic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
switch (darr->deref_array_type) {
case nir_deref_array_type_direct:
/* direct access does not require anything special: */
for (int i = 0; i < intr->num_components; i++) {
/* ttn doesn't generate partial writemasks */
assert(intr->const_index[0] ==
(1 << intr->num_components) - 1);
unsigned n = darr->base_offset * 4 + i;
compile_assert(ctx, n < arr->length);
arr->arr[n] = src[i];
}
addr = NULL;
break;
case nir_deref_array_type_indirect: {
/* for indirect, create indirect-store and fan that out: */
struct ir3_instruction *collect =
create_collect(ctx->block, arr->arr, arr->length);
struct ir3_instruction *addr =
get_addr(ctx, get_src(ctx, &darr->indirect)[0]);
for (int i = 0; i < intr->num_components; i++) {
/* ttn doesn't generate partial writemasks */
assert(intr->const_index[0] ==
(1 << intr->num_components) - 1);
struct ir3_instruction *store;
unsigned n = darr->base_offset * 4 + i;
compile_assert(ctx, n < arr->length);
store = create_indirect_store(ctx, arr->length,
n, src[i], addr, collect);
store->fanin->fi.aid = arr->aid;
/* TODO: probably split this out to be used for
* store_output_indirect? or move this into
* create_indirect_store()?
*/
for (int j = i; j < arr->length; j += intr->num_components) {
struct ir3_instruction *split;
split = ir3_instr_create(ctx->block, -1, OPC_META_FO);
split->fo.off = j;
ir3_reg_create(split, 0, 0);
ir3_reg_create(split, 0, IR3_REG_SSA)->instr = store;
arr->arr[j] = split;
}
}
/* fixup fanout/split neighbors: */
for (int i = 0; i < arr->length; i++) {
arr->arr[i]->cp.right = (i < (arr->length - 1)) ?
arr->arr[i+1] : NULL;
arr->arr[i]->cp.left = (i > 0) ?
arr->arr[i-1] : NULL;
}
case nir_deref_array_type_indirect:
addr = get_addr(ctx, get_src(ctx, &darr->indirect)[0]);
break;
}
default:
compile_error(ctx, "Unhandled store deref type: %u\n",
darr->deref_array_type);
break;
}
for (int i = 0; i < intr->num_components; i++) {
if (!(wrmask & (1 << i)))
continue;
unsigned n = darr->base_offset * 4 + i;
compile_assert(ctx, n < arr->length);
create_var_store(ctx, arr, n, src[i], addr);
}
}
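
The writemask loop only emits a store for components whose bit is set in wrmask, e.g. wrmask 0x5 on a vec4 stores components 0 and 2. In miniature:

#include <stdio.h>

int main(void)
{
   unsigned wrmask = 0x5;   /* components x and z */
   for (int i = 0; i < 4; i++)
      if (wrmask & (1u << i))
         printf("store component %d\n", i);   /* prints 0, then 2 */
   return 0;
}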
static void add_sysval_input(struct ir3_compile *ctx, gl_system_value slot,
@ -1335,7 +1141,7 @@ emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
struct ir3_instruction **dst, **src;
struct ir3_block *b = ctx->block;
unsigned idx = intr->const_index[0];
int idx = intr->const_index[0];
nir_const_value *const_offset;
if (info->has_dest) {
@ -1356,7 +1162,7 @@ emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
} else {
src = get_src(ctx, &intr->src[0]);
for (int i = 0; i < intr->num_components; i++) {
unsigned n = idx * 4 + i;
int n = idx * 4 + i;
dst[i] = create_uniform_indirect(ctx, n,
get_addr(ctx, src[0]));
}
@ -1836,8 +1642,6 @@ resolve_phis(struct ir3_compile *ctx, struct ir3_block *block)
ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
}
}
resolve_array_phis(ctx, block);
}
static void

View File

@ -41,16 +41,22 @@ static bool is_eligible_mov(struct ir3_instruction *instr, bool allow_flags)
struct ir3_register *dst = instr->regs[0];
struct ir3_register *src = instr->regs[1];
struct ir3_instruction *src_instr = ssa(src);
/* only if mov src is SSA (not const/immed): */
if (!src_instr)
return false;
/* no indirect: */
if (dst->flags & IR3_REG_RELATIV)
return false;
if (src->flags & IR3_REG_RELATIV)
return false;
if (!allow_flags)
if (src->flags & (IR3_REG_FABS | IR3_REG_FNEG |
IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT))
return false;
if (!src_instr)
return false;
/* TODO: remove this hack: */
if (is_meta(src_instr) && (src_instr->opc == OPC_META_FO))
return false;
@ -82,10 +88,17 @@ static bool valid_flags(struct ir3_instruction *instr, unsigned n,
unsigned valid_flags;
flags = cp_flags(flags);
/* If destination is indirect, then source cannot be.. at least
* I don't think so..
*/
if ((instr->regs[0]->flags & IR3_REG_RELATIV) &&
(flags & IR3_REG_RELATIV))
return false;
/* clear flags that are 'ok' */
switch (instr->category) {
case 1:
valid_flags = IR3_REG_IMMED | IR3_REG_RELATIV;
valid_flags = IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_RELATIV;
if (flags & ~valid_flags)
return false;
break;
@ -183,9 +196,14 @@ static void combine_flags(unsigned *dstflags, unsigned srcflags)
*dstflags ^= IR3_REG_SNEG;
if (srcflags & IR3_REG_BNOT)
*dstflags ^= IR3_REG_BNOT;
}
static struct ir3_instruction * instr_cp(struct ir3_instruction *instr, unsigned *flags);
*dstflags &= ~IR3_REG_SSA;
*dstflags |= srcflags & IR3_REG_SSA;
*dstflags |= srcflags & IR3_REG_CONST;
*dstflags |= srcflags & IR3_REG_IMMED;
*dstflags |= srcflags & IR3_REG_RELATIV;
*dstflags |= srcflags & IR3_REG_ARRAY;
}
/* the "plain" MAD's (ie. the ones that don't shift first src prior to
* multiply) can swap their first two srcs if src[0] is !CONST and
@ -206,52 +224,35 @@ static bool is_valid_mad(struct ir3_instruction *instr)
static void
reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n)
{
unsigned src_flags = 0, new_flags;
struct ir3_instruction *src_instr;
struct ir3_instruction *src = ssa(reg);
if (is_meta(instr)) {
/* meta instructions cannot fold up register
* flags.. they are usually src for texture
* fetch, etc, where we cannot specify abs/neg
*/
reg->instr = instr_cp(reg->instr, NULL);
return;
}
if (is_eligible_mov(src, true)) {
/* simple case, no immed/const/relativ, only mov's w/ ssa src: */
struct ir3_register *src_reg = src->regs[1];
unsigned new_flags = reg->flags;
src_instr = instr_cp(reg->instr, &src_flags);
combine_flags(&new_flags, src_reg->flags);
new_flags = reg->flags;
combine_flags(&new_flags, src_flags);
reg->flags = new_flags;
reg->instr = src_instr;
if (!valid_flags(instr, n, reg->flags)) {
/* insert an absneg.f */
if (reg->flags & (IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT)) {
debug_assert(!(reg->flags & (IR3_REG_FNEG | IR3_REG_FABS)));
reg->instr = ir3_ABSNEG_S(instr->block,
reg->instr, cp_flags(src_flags));
} else {
debug_assert(!(reg->flags & (IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT)));
reg->instr = ir3_ABSNEG_F(instr->block,
reg->instr, cp_flags(src_flags));
if (valid_flags(instr, n, new_flags)) {
if (new_flags & IR3_REG_ARRAY) {
debug_assert(!(reg->flags & IR3_REG_ARRAY));
reg->array = src_reg->array;
}
reg->flags = new_flags;
reg->instr = ssa(src_reg);
}
reg->flags &= ~cp_flags(src_flags);
debug_assert(valid_flags(instr, n, reg->flags));
/* send it through instr_cp() again since
* the absneg src might be a mov from const
* that could be cleaned up:
*/
reg->instr = instr_cp(reg->instr, NULL);
return;
}
if (is_same_type_mov(reg->instr)) {
struct ir3_register *src_reg = reg->instr->regs[1];
unsigned new_flags = src_reg->flags;
src = ssa(reg); /* could be null for IR3_REG_ARRAY case */
if (!src)
return;
} else if (is_same_type_mov(src) &&
/* cannot collapse const/immed/etc into meta instrs: */
!is_meta(instr)) {
/* immed/const/etc cases, which require some special handling: */
struct ir3_register *src_reg = src->regs[1];
unsigned new_flags = reg->flags;
combine_flags(&new_flags, reg->flags);
combine_flags(&new_flags, src_reg->flags);
if (!valid_flags(instr, n, new_flags)) {
/* special case for "normal" mad instructions, we can
@ -287,6 +288,16 @@ reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n)
conflicts(instr->address, reg->instr->address))
return;
/* This seems to be a hw bug, or something where the timings
* just somehow don't work out. This restriction may only
* apply if the first src is also CONST.
*/
if ((instr->category == 3) && (n == 2) &&
(src_reg->flags & IR3_REG_RELATIV) &&
(src_reg->array.offset == 0))
return;
src_reg = ir3_reg_clone(instr->block->shader, src_reg);
src_reg->flags = new_flags;
instr->regs[n+1] = src_reg;
@ -298,6 +309,7 @@ reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n)
if ((src_reg->flags & IR3_REG_RELATIV) &&
!conflicts(instr->address, reg->instr->address)) {
src_reg = ir3_reg_clone(instr->block->shader, src_reg);
src_reg->flags = new_flags;
instr->regs[n+1] = src_reg;
ir3_instr_set_address(instr, reg->instr->address);
@ -330,8 +342,10 @@ reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n)
if (new_flags & IR3_REG_BNOT)
iim_val = ~iim_val;
if (!(iim_val & ~0x3ff)) {
/* other than category 1 (mov) we can only encode up to 10 bits: */
if ((instr->category == 1) || !(iim_val & ~0x3ff)) {
new_flags &= ~(IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT);
src_reg = ir3_reg_clone(instr->block->shader, src_reg);
src_reg->flags = new_flags;
src_reg->iim_val = iim_val;
instr->regs[n+1] = src_reg;
@ -342,56 +356,68 @@ reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n)
}
}
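
The 10-bit immediate test in isolation: iim_val & ~0x3ff is nonzero exactly when some bit above bit 9 is set, i.e. when the value doesn't fit the encoding:

#include <assert.h>
#include <stdint.h>

static int fits_10_bits(uint32_t v)
{
   return (v & ~0x3ffu) == 0;
}

int main(void)
{
   assert(fits_10_bits(1023));    /* 0x3ff, largest encodable value */
   assert(!fits_10_bits(1024));   /* 0x400 needs an 11th bit */
   return 0;
}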
/**
* Given an SSA src (instruction), return the one with extraneous
* mov's removed, ie, for (to copy NIR syntax):
*
* vec1 ssa1 = fadd <something>, <somethingelse>
* vec1 ssa2 = fabs ssa1
* vec1 ssa3 = fneg ssa1
*
* then calling instr_cp(ssa3, &flags) would return ssa1 with
* (IR3_REG_ABS | IR3_REG_NEGATE) in flags. If flags is NULL,
* then disallow eliminating copies which would require flag
* propagation (for example, we cannot propagate abs/neg into
* an output).
/* Handle special case of eliminating output mov, and similar cases where
* there isn't a normal "consuming" instruction. In this case we cannot
* collapse flags (ie. output mov from const, or w/ abs/neg flags, cannot
* be eliminated)
*/
static struct ir3_instruction *
instr_cp(struct ir3_instruction *instr, unsigned *flags)
eliminate_output_mov(struct ir3_instruction *instr)
{
if (is_eligible_mov(instr, false)) {
struct ir3_register *reg = instr->regs[1];
if (!(reg->flags & IR3_REG_ARRAY)) {
struct ir3_instruction *src_instr = ssa(reg);
debug_assert(src_instr);
return src_instr;
}
}
return instr;
}
/**
* Find instruction src's which are mov's that can be collapsed, replacing
* the mov dst with the mov src
*/
static void
instr_cp(struct ir3_instruction *instr)
{
struct ir3_register *reg;
if (is_eligible_mov(instr, !!flags)) {
struct ir3_register *reg = instr->regs[1];
struct ir3_instruction *src_instr = ssa(reg);
if (flags)
combine_flags(flags, reg->flags);
return instr_cp(src_instr, flags);
}
if (instr->regs_count == 0)
return;
/* Check termination condition before walking children (rather
* than before checking eligible-mov). A mov instruction may
* appear as ssa-src for multiple other instructions, and we
* want to consider it for removal for each, rather than just
* the first one. (But regardless of how many places it shows
* up as a src, we only need to recursively walk the children
* once.)
*/
if (ir3_instr_check_mark(instr))
return instr;
return;
/* walk down the graph from each src: */
foreach_src_n(reg, n, instr) {
if (!(reg->flags & IR3_REG_SSA))
struct ir3_instruction *src = ssa(reg);
if (!src)
continue;
instr_cp(src);
/* TODO non-indirect access we could figure out which register
* we actually want and allow cp..
*/
if (reg->flags & IR3_REG_ARRAY)
continue;
reg_cp(instr, reg, n);
}
if (instr->address)
ir3_instr_set_address(instr, instr_cp(instr->address, NULL));
if (instr->regs[0]->flags & IR3_REG_ARRAY) {
struct ir3_instruction *src = ssa(instr->regs[0]);
if (src)
instr_cp(src);
}
return instr;
if (instr->address) {
instr_cp(instr->address);
ir3_instr_set_address(instr, eliminate_output_mov(instr->address));
}
}
void
@ -401,19 +427,20 @@ ir3_cp(struct ir3 *ir)
for (unsigned i = 0; i < ir->noutputs; i++) {
if (ir->outputs[i]) {
struct ir3_instruction *out =
instr_cp(ir->outputs[i], NULL);
ir->outputs[i] = out;
instr_cp(ir->outputs[i]);
ir->outputs[i] = eliminate_output_mov(ir->outputs[i]);
}
}
for (unsigned i = 0; i < ir->keeps_count; i++) {
ir->keeps[i] = instr_cp(ir->keeps[i], NULL);
instr_cp(ir->keeps[i]);
ir->keeps[i] = eliminate_output_mov(ir->keeps[i]);
}
list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
if (block->condition)
block->condition = instr_cp(block->condition, NULL);
if (block->condition) {
instr_cp(block->condition);
block->condition = eliminate_output_mov(block->condition);
}
}
}

View File

@ -76,7 +76,7 @@ int ir3_delayslots(struct ir3_instruction *assigner,
return 6;
} else if ((consumer->category == 3) &&
(is_mad(consumer->opc) || is_madsh(consumer->opc)) &&
(n == 2)) {
(n == 3)) {
/* special case, 3rd src to cat3 not required on first cycle */
return 1;
} else {
@ -118,6 +118,10 @@ ir3_instr_depth(struct ir3_instruction *instr)
/* visit child to compute it's depth: */
ir3_instr_depth(src);
/* for array writes, no need to delay on previous write: */
if (i == 0)
continue;
sd = ir3_delayslots(src, instr, i) + src->depth;
instr->depth = MAX2(instr->depth, sd);

View File

@ -94,7 +94,7 @@ static void print_instr_name(struct ir3_instruction *instr)
}
}
static void print_reg_name(struct ir3_register *reg, bool followssa)
static void print_reg_name(struct ir3_register *reg)
{
if ((reg->flags & (IR3_REG_FABS | IR3_REG_SABS)) &&
(reg->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT)))
@ -106,20 +106,29 @@ static void print_reg_name(struct ir3_register *reg, bool followssa)
if (reg->flags & IR3_REG_IMMED) {
printf("imm[%f,%d,0x%x]", reg->fim_val, reg->iim_val, reg->iim_val);
} else if (reg->flags & IR3_REG_SSA) {
printf("_");
if (followssa) {
printf("[");
} else if (reg->flags & IR3_REG_ARRAY) {
printf("arr[id=%u, offset=%d, size=%u", reg->array.id,
reg->array.offset, reg->size);
/* for ARRAY we could have null src, for example first write
* instruction..
*/
if (reg->instr) {
printf(", _[");
print_instr_name(reg->instr);
printf("]");
}
printf("]");
} else if (reg->flags & IR3_REG_SSA) {
printf("_[");
print_instr_name(reg->instr);
printf("]");
} else if (reg->flags & IR3_REG_RELATIV) {
if (reg->flags & IR3_REG_HALF)
printf("h");
if (reg->flags & IR3_REG_CONST)
printf("c<a0.x + %u>", reg->num);
printf("c<a0.x + %d>", reg->array.offset);
else
printf("\x1b[0;31mr<a0.x + %u>\x1b[0m (%u)", reg->num, reg->size);
printf("\x1b[0;31mr<a0.x + %d>\x1b[0m (%u)", reg->array.offset, reg->size);
} else {
if (reg->flags & IR3_REG_HALF)
printf("h");
@ -158,7 +167,7 @@ print_instr(struct ir3_instruction *instr, int lvl)
for (i = 0; i < instr->regs_count; i++) {
struct ir3_register *reg = instr->regs[i];
printf(i ? ", " : " ");
print_reg_name(reg, !!i);
print_reg_name(reg);
}
if (instr->address) {
@ -168,13 +177,6 @@ print_instr(struct ir3_instruction *instr, int lvl)
printf("]");
}
if (instr->fanin) {
printf(", fanin=_");
printf("[");
print_instr_name(instr->fanin);
printf("]");
}
if (instr->cp.left) {
printf(", left=_");
printf("[");
@ -192,8 +194,6 @@ print_instr(struct ir3_instruction *instr, int lvl)
if (is_meta(instr)) {
if (instr->opc == OPC_META_FO) {
printf(", off=%d", instr->fo.off);
} else if ((instr->opc == OPC_META_FI) && instr->fi.aid) {
printf(", aid=%d", instr->fi.aid);
}
}

View File

@ -68,25 +68,24 @@
* LOAD_PAYLOAD instruction which gets turned into multiple MOV's after
* register assignment. But for us that is horrible from a scheduling
* standpoint. Instead what we do is use the idea of a 'definer' instruction.
* Ie. the first instruction (lowest ip) to write to the array is the
* Ie. the first instruction (lowest ip) to write to the variable is the
* one we consider from use/def perspective when building interference
* graph. (Other instructions which write other array elements just
* define the variable some more.)
* graph. (Other instructions which write other variable components
* just define the variable some more.)
*
* Arrays of arbitrary size are handled via pre-coloring a consecutive
* sequence of registers. Additional scalar (single component) reg
* names are allocated starting at ctx->class_base[total_class_count]
* (see arr->base), which are pre-colored. In the use/def graph direct
* access is treated as a single element use/def, and indirect access
* is treated as use or def of all array elements. (Only the first
* def is tracked, in case of multiple indirect writes, etc.)
*/
static const unsigned class_sizes[] = {
1, 2, 3, 4,
4 + 4, /* txd + 1d/2d */
4 + 6, /* txd + 3d */
/* temporary: until we can assign arrays, create classes so we
* can round up array to fit. NOTE with tgsi arrays should
* really all be multiples of four:
*/
4 * 4,
4 * 8,
4 * 16,
4 * 32,
};
#define class_count ARRAY_SIZE(class_sizes)
@ -265,13 +264,21 @@ struct ir3_ra_ctx {
struct ir3_ra_reg_set *set;
struct ra_graph *g;
unsigned alloc_count;
unsigned class_alloc_count[total_class_count];
unsigned class_base[total_class_count];
/* one per class, plus one slot for arrays: */
unsigned class_alloc_count[total_class_count + 1];
unsigned class_base[total_class_count + 1];
unsigned instr_cnt;
unsigned *def, *use; /* def/use table */
struct ir3_ra_instr_data *instrd;
};
/* does it conflict? */
static inline bool
intersects(unsigned a_start, unsigned a_end, unsigned b_start, unsigned b_end)
{
return !((a_start >= b_end) || (b_start >= a_end));
}
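
intersects() effectively treats the live ranges as half-open, so ranges that merely touch at an endpoint do not conflict. Spot checks, as a standalone sketch:

#include <assert.h>
#include <stdbool.h>

static bool
overlaps(unsigned a0, unsigned a1, unsigned b0, unsigned b1)
{
   return !((a0 >= b1) || (b0 >= a1));
}

int main(void)
{
   assert(overlaps(0, 4, 3, 6));    /* [0,4) and [3,6) share 3 */
   assert(!overlaps(0, 4, 4, 6));   /* touching at 4 is not a conflict */
   return 0;
}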
static bool
is_half(struct ir3_instruction *instr)
{
@ -329,9 +336,6 @@ get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr,
struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
struct ir3_instruction *d = NULL;
if (instr->fanin)
return get_definer(ctx, instr->fanin, sz, off);
if (id->defn) {
*sz = id->sz;
*off = id->off;
@ -485,10 +489,13 @@ ra_block_find_definers(struct ir3_ra_ctx *ctx, struct ir3_block *block)
/* couple special cases: */
if (writes_addr(instr) || writes_pred(instr)) {
id->cls = -1;
continue;
} else if (instr->regs[0]->flags & IR3_REG_ARRAY) {
id->cls = total_class_count;
id->defn = instr;
} else {
id->defn = get_definer(ctx, instr, &id->sz, &id->off);
id->cls = size_to_class(id->sz, is_half(id->defn));
}
id->defn = get_definer(ctx, instr, &id->sz, &id->off);
id->cls = size_to_class(id->sz, is_half(id->defn));
}
}
@ -518,8 +525,6 @@ ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block)
/* arrays which don't fit in one of the pre-defined class
* sizes are pre-colored:
*
* TODO but we still need to allocate names for them, don't we??
*/
if (id->cls >= 0) {
instr->name = ctx->class_alloc_count[id->cls]++;
@ -531,7 +536,7 @@ ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block)
static void
ra_init(struct ir3_ra_ctx *ctx)
{
unsigned n;
unsigned n, base;
ir3_clear_mark(ctx->ir);
n = ir3_count_instructions(ctx->ir);
@ -550,11 +555,20 @@ ra_init(struct ir3_ra_ctx *ctx)
* actual ra name is class_base[cls] + instr->name;
*/
ctx->class_base[0] = 0;
for (unsigned i = 1; i < total_class_count; i++) {
for (unsigned i = 1; i <= total_class_count; i++) {
ctx->class_base[i] = ctx->class_base[i-1] +
ctx->class_alloc_count[i-1];
}
/* and vreg names for array elements: */
base = ctx->class_base[total_class_count];
list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
arr->base = base;
ctx->class_alloc_count[total_class_count] += arr->length;
base += arr->length;
}
ctx->alloc_count += ctx->class_alloc_count[total_class_count];
ctx->g = ra_alloc_interference_graph(ctx->set->regs, ctx->alloc_count);
ralloc_steal(ctx->g, ctx->instrd);
ctx->def = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
@ -562,15 +576,23 @@ ra_init(struct ir3_ra_ctx *ctx)
}
static unsigned
ra_name(struct ir3_ra_ctx *ctx, int cls, struct ir3_instruction *defn)
__ra_name(struct ir3_ra_ctx *ctx, int cls, struct ir3_instruction *defn)
{
unsigned name;
debug_assert(cls >= 0);
debug_assert(cls < total_class_count); /* we shouldn't get arrays here.. */
name = ctx->class_base[cls] + defn->name;
debug_assert(name < ctx->alloc_count);
return name;
}
static int
ra_name(struct ir3_ra_ctx *ctx, struct ir3_ra_instr_data *id)
{
/* TODO handle name mapping for arrays */
return __ra_name(ctx, id->cls, id->defn);
}
static void
ra_destroy(struct ir3_ra_ctx *ctx)
{
@ -583,6 +605,22 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
struct ir3_ra_block_data *bd;
unsigned bitset_words = BITSET_WORDS(ctx->alloc_count);
void def(unsigned name, struct ir3_instruction *instr)
{
/* defined on first write: */
if (!ctx->def[name])
ctx->def[name] = instr->ip;
ctx->use[name] = instr->ip;
BITSET_SET(bd->def, name);
}
void use(unsigned name, struct ir3_instruction *instr)
{
ctx->use[name] = MAX2(ctx->use[name], instr->ip);
if (!BITSET_TEST(bd->def, name))
BITSET_SET(bd->use, name);
}
bd = rzalloc(ctx->g, struct ir3_ra_block_data);
bd->def = rzalloc_array(bd, BITSET_WORD, bitset_words);
@ -594,6 +632,7 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
struct ir3_instruction *src;
struct ir3_register *reg;
if (instr->regs_count == 0)
continue;
@ -625,61 +664,101 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
if (writes_gpr(instr)) {
struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
struct ir3_register *dst = instr->regs[0];
if (id->defn == instr) {
/* arrays which don't fit in one of the pre-defined class
* sizes are pre-colored:
if (dst->flags & IR3_REG_ARRAY) {
struct ir3_array *arr =
ir3_lookup_array(ctx->ir, dst->array.id);
unsigned i;
debug_assert(!(dst->flags & IR3_REG_PHI_SRC));
arr->start_ip = MIN2(arr->start_ip, instr->ip);
arr->end_ip = MAX2(arr->end_ip, instr->ip);
/* set the node class now.. in case we don't encounter
* this array dst again. From register_alloc algo's
* perspective, these are all single/scalar regs:
*/
if (id->cls >= 0) {
unsigned name = ra_name(ctx, id->cls, id->defn);
for (i = 0; i < arr->length; i++) {
unsigned name = arr->base + i;
ra_set_node_class(ctx->g, name, ctx->set->classes[0]);
}
ctx->def[name] = id->defn->ip;
ctx->use[name] = id->defn->ip;
/* since we are in SSA at this point: */
debug_assert(!BITSET_TEST(bd->use, name));
BITSET_SET(bd->def, name);
if (is_half(id->defn)) {
ra_set_node_class(ctx->g, name,
ctx->set->half_classes[id->cls - class_count]);
} else {
ra_set_node_class(ctx->g, name,
ctx->set->classes[id->cls]);
/* indirect write is treated like a write to all array
* elements, since we don't know which one is actually
* written:
*/
if (dst->flags & IR3_REG_RELATIV) {
for (i = 0; i < arr->length; i++) {
unsigned name = arr->base + i;
def(name, instr);
}
} else {
unsigned name = arr->base + dst->array.offset;
def(name, instr);
}
/* extend the live range for phi srcs, which may come
* from the bottom of the loop
*/
if (id->defn->regs[0]->flags & IR3_REG_PHI_SRC) {
struct ir3_instruction *phi = id->defn->regs[0]->instr;
foreach_ssa_src(src, phi) {
/* if src is after phi, then we need to extend
* the liverange to the end of src's block:
*/
if (src->ip > phi->ip) {
struct ir3_instruction *last =
} else if (id->defn == instr) {
unsigned name = ra_name(ctx, id);
/* since we are in SSA at this point: */
debug_assert(!BITSET_TEST(bd->use, name));
def(name, id->defn);
if (is_half(id->defn)) {
ra_set_node_class(ctx->g, name,
ctx->set->half_classes[id->cls - class_count]);
} else {
ra_set_node_class(ctx->g, name,
ctx->set->classes[id->cls]);
}
/* extend the live range for phi srcs, which may come
* from the bottom of the loop
*/
if (id->defn->regs[0]->flags & IR3_REG_PHI_SRC) {
struct ir3_instruction *phi = id->defn->regs[0]->instr;
foreach_ssa_src(src, phi) {
/* if src is after phi, then we need to extend
* the liverange to the end of src's block:
*/
if (src->ip > phi->ip) {
struct ir3_instruction *last =
list_last_entry(&src->block->instr_list,
struct ir3_instruction, node);
ctx->use[name] = MAX2(ctx->use[name], last->ip);
}
struct ir3_instruction, node);
ctx->use[name] = MAX2(ctx->use[name], last->ip);
}
}
}
}
}
foreach_ssa_src(src, instr) {
if (writes_gpr(src)) {
struct ir3_ra_instr_data *id = &ctx->instrd[src->ip];
if (id->cls >= 0) {
unsigned name = ra_name(ctx, id->cls, id->defn);
ctx->use[name] = MAX2(ctx->use[name], instr->ip);
if (!BITSET_TEST(bd->def, name))
BITSET_SET(bd->use, name);
foreach_src(reg, instr) {
if (reg->flags & IR3_REG_ARRAY) {
struct ir3_array *arr =
ir3_lookup_array(ctx->ir, reg->array.id);
arr->start_ip = MIN2(arr->start_ip, instr->ip);
arr->end_ip = MAX2(arr->end_ip, instr->ip);
/* indirect read is treated like a read from all array
* elements, since we don't know which one is actually
* read:
*/
if (reg->flags & IR3_REG_RELATIV) {
unsigned i;
for (i = 0; i < arr->length; i++) {
unsigned name = arr->base + i;
use(name, instr);
}
} else {
unsigned name = arr->base + reg->array.offset;
use(name, instr);
debug_assert(reg->array.offset < arr->length);
}
} else if ((src = ssa(reg)) && writes_gpr(src)) {
unsigned name = ra_name(ctx, &ctx->instrd[src->ip]);
use(name, instr);
}
}
}
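/* Sketch of the conservative rule used above (assumed semantics, not part
 * of the patch): a relative access defs/uses every element of the array,
 * a direct access only the addressed element.
 */
static void
mark_array_access(struct ir3_array *arr, bool relative, unsigned offset,
      void (*mark)(unsigned name))
{
   if (relative) {
      /* index unknown at compile time: touch all elements */
      for (unsigned i = 0; i < arr->length; i++)
         mark(arr->base + i);
   } else {
      /* index known: touch exactly one element */
      mark(arr->base + offset);
   }
}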
@ -735,6 +814,12 @@ ra_add_interference(struct ir3_ra_ctx *ctx)
{
struct ir3 *ir = ctx->ir;
/* initialize array live ranges: */
list_for_each_entry (struct ir3_array, arr, &ir->array_list, node) {
arr->start_ip = ~0;
arr->end_ip = 0;
}
/* compute live ranges (use/def) on a block level, also updating
* block's def/use bitmasks (used below to calculate per-block
* livein/liveout):
@ -767,18 +852,14 @@ ra_add_interference(struct ir3_ra_ctx *ctx)
/* need to fix things up to keep outputs live: */
for (unsigned i = 0; i < ir->noutputs; i++) {
struct ir3_instruction *instr = ir->outputs[i];
struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
if (id->cls >= 0) {
unsigned name = ra_name(ctx, id->cls, id->defn);
ctx->use[name] = ctx->instr_cnt;
}
unsigned name = ra_name(ctx, &ctx->instrd[instr->ip]);
ctx->use[name] = ctx->instr_cnt;
}
for (unsigned i = 0; i < ctx->alloc_count; i++) {
for (unsigned j = 0; j < ctx->alloc_count; j++) {
if (!((ctx->def[i] >= ctx->use[j]) ||
(ctx->def[j] >= ctx->use[i]))) {
if (intersects(ctx->def[i], ctx->use[i],
ctx->def[j], ctx->use[j])) {
ra_add_node_interference(ctx->g, i, j);
}
}
@ -836,19 +917,36 @@ static void fixup_half_instr_src(struct ir3_instruction *instr)
}
}
/* NOTE: instr could be NULL for the IR3_REG_ARRAY case, for the first
* array access(es) which do not have any previous access to depend
* on, from a scheduling point of view
*/
static void
reg_assign(struct ir3_ra_ctx *ctx, struct ir3_register *reg,
struct ir3_instruction *instr)
{
struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
struct ir3_ra_instr_data *id;
if (id->cls >= 0) {
unsigned name = ra_name(ctx, id->cls, id->defn);
if (reg->flags & IR3_REG_ARRAY) {
struct ir3_array *arr =
ir3_lookup_array(ctx->ir, reg->array.id);
unsigned name = arr->base + reg->array.offset;
unsigned r = ra_get_node_reg(ctx->g, name);
unsigned num = ctx->set->ra_reg_to_gpr[r];
if (reg->flags & IR3_REG_RELATIV) {
reg->array.offset = num;
} else {
reg->num = num;
}
reg->flags &= ~IR3_REG_ARRAY;
} else if ((id = &ctx->instrd[instr->ip]) && id->defn) {
unsigned name = ra_name(ctx, id);
unsigned r = ra_get_node_reg(ctx->g, name);
unsigned num = ctx->set->ra_reg_to_gpr[r] + id->off;
if (reg->flags & IR3_REG_RELATIV)
num += reg->offset;
debug_assert(!(reg->flags & IR3_REG_RELATIV));
reg->num = num;
reg->flags &= ~(IR3_REG_SSA | IR3_REG_PHI_SRC);
@ -875,9 +973,9 @@ ra_block_alloc(struct ir3_ra_ctx *ctx, struct ir3_block *block)
foreach_src_n(reg, n, instr) {
struct ir3_instruction *src = reg->instr;
if (!src)
/* Note: reg->instr could be null for IR3_REG_ARRAY */
if (!(src || (reg->flags & IR3_REG_ARRAY)))
continue;
reg_assign(ctx, instr->regs[n+1], src);
if (instr->regs[n+1]->flags & IR3_REG_HALF)
fixup_half_instr_src(instr);
@ -888,6 +986,8 @@ ra_block_alloc(struct ir3_ra_ctx *ctx, struct ir3_block *block)
static int
ra_alloc(struct ir3_ra_ctx *ctx)
{
unsigned n = 0;
/* frag shader inputs get pre-assigned, since we have some
* constraints/unknowns about setup for some of these regs:
*/
@ -897,7 +997,7 @@ ra_alloc(struct ir3_ra_ctx *ctx)
if (ctx->frag_face && (i < ir->ninputs) && ir->inputs[i]) {
struct ir3_instruction *instr = ir->inputs[i];
int cls = size_to_class(1, true);
unsigned name = ra_name(ctx, cls, instr);
unsigned name = __ra_name(ctx, cls, instr);
unsigned reg = ctx->set->gpr_to_ra_reg[cls][0];
/* if we have frag_face, it gets hr0.x */
@ -905,7 +1005,8 @@ ra_alloc(struct ir3_ra_ctx *ctx)
i += 4;
}
for (j = 0; i < ir->ninputs; i++) {
j = 0;
for (; i < ir->ninputs; i++) {
struct ir3_instruction *instr = ir->inputs[i];
if (instr) {
struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
@ -913,7 +1014,7 @@ ra_alloc(struct ir3_ra_ctx *ctx)
if (id->defn == instr) {
unsigned name, reg;
name = ra_name(ctx, id->cls, id->defn);
name = ra_name(ctx, id);
reg = ctx->set->gpr_to_ra_reg[id->cls][j];
ra_set_node_reg(ctx->g, name, reg);
@ -921,6 +1022,46 @@ ra_alloc(struct ir3_ra_ctx *ctx)
}
}
}
n = j;
}
/* pre-assign array elements:
*/
list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
unsigned base = n;
if (arr->end_ip == 0)
continue;
/* figure out what else we conflict with which has already
* been assigned:
*/
retry:
list_for_each_entry (struct ir3_array, arr2, &ctx->ir->array_list, node) {
if (arr2 == arr)
break;
if (arr2->end_ip == 0)
continue;
/* if it intersects with liverange AND register range.. */
if (intersects(arr->start_ip, arr->end_ip,
arr2->start_ip, arr2->end_ip) &&
intersects(base, base + arr->length,
arr2->reg, arr2->reg + arr2->length)) {
base = MAX2(base, arr2->reg + arr2->length);
goto retry;
}
}
arr->reg = base;
for (unsigned i = 0; i < arr->length; i++) {
unsigned name, reg;
name = arr->base + i;
reg = ctx->set->gpr_to_ra_reg[0][base++];
ra_set_node_reg(ctx->g, name, reg);
}
}
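/* Hypothetical walk-through of the pre-coloring loop above: say arr A
 * (length 4, live ip 10..20) was already placed at regs 0..3. An arr B
 * with live range 15..30 conflicts with A in both live range and reg
 * range, so its base bumps to 4 and the scan retries; an arr C live only
 * at ip 40..50 never intersects A's live range and may reuse base 0.
 */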
if (!ra_allocate(ctx->g))

View File

@ -187,6 +187,9 @@ delay_calc(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
foreach_ssa_src_n(src, i, instr) {
unsigned d;
/* for array writes, no need to delay on previous write: */
if (i == 0)
continue;
if (src->block != instr->block)
continue;
d = delay_calc_srcn(ctx, src, instr, i);

View File

@ -261,6 +261,7 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap cap)
case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
case PIPE_CAP_INVALIDATE_BUFFER:
case PIPE_CAP_GENERATE_MIPMAP:
case PIPE_CAP_STRING_MARKER:
return 0;
case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:

View File

@ -184,7 +184,7 @@ static void calculate_vertex_layout(struct i915_context *i915)
struct i915_tracked_state i915_update_vertex_layout = {
"vertex_layout",
calculate_vertex_layout,
I915_NEW_FS | I915_NEW_VS
I915_NEW_RASTERIZER | I915_NEW_FS | I915_NEW_VS
};

View File

@ -485,6 +485,7 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param)
case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
case PIPE_CAP_INVALIDATE_BUFFER:
case PIPE_CAP_GENERATE_MIPMAP:
case PIPE_CAP_STRING_MARKER:
return 0;
case PIPE_CAP_VENDOR_ID:

View File

@ -82,8 +82,6 @@ struct llvmpipe_context {
struct pipe_viewport_state viewports[PIPE_MAX_VIEWPORTS];
struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
struct pipe_index_buffer index_buffer;
struct pipe_resource *mapped_vs_tex[PIPE_MAX_SHADER_SAMPLER_VIEWS];
struct pipe_resource *mapped_gs_tex[PIPE_MAX_SHADER_SAMPLER_VIEWS];
unsigned num_samplers[PIPE_SHADER_TYPES];
unsigned num_sampler_views[PIPE_SHADER_TYPES];

View File

@ -149,9 +149,6 @@ llvmpipe_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
draw_vs_reset_so(lp->vs);
}
}
llvmpipe_cleanup_vertex_sampling(lp);
llvmpipe_cleanup_geometry_sampling(lp);
/*
* TODO: Flush only when a user vertex/index buffer is present

View File

@ -310,6 +310,7 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
case PIPE_CAP_INVALIDATE_BUFFER:
case PIPE_CAP_GENERATE_MIPMAP:
case PIPE_CAP_STRING_MARKER:
return 0;
}
/* should only get here on unhandled cases */

View File

@ -476,27 +476,30 @@ lp_setup_try_clear_zs(struct lp_setup_context *setup,
uint64_t zsvalue = 0;
uint32_t zmask32;
uint8_t smask8;
enum pipe_format format = setup->fb.zsbuf->format;
LP_DBG(DEBUG_SETUP, "%s state %d\n", __FUNCTION__, setup->state);
zmask32 = (flags & PIPE_CLEAR_DEPTH) ? ~0 : 0;
smask8 = (flags & PIPE_CLEAR_STENCIL) ? ~0 : 0;
zsvalue = util_pack64_z_stencil(setup->fb.zsbuf->format,
depth,
stencil);
zsvalue = util_pack64_z_stencil(format, depth, stencil);
/*
* XXX: should make a full mask here for things like D24X8,
* otherwise we'll do a read-modify-write clear later which
* should be unnecessary.
*/
zsmask = util_pack64_mask_z_stencil(setup->fb.zsbuf->format,
zmask32,
smask8);
zsmask = util_pack64_mask_z_stencil(format, zmask32, smask8);
zsvalue &= zsmask;
if (format == PIPE_FORMAT_Z24X8_UNORM ||
format == PIPE_FORMAT_X8Z24_UNORM) {
/*
* Make full mask if there's "X" bits so we can do full
* clear (without rmw).
*/
uint32_t zsmask_full = 0;
zsmask_full = util_pack_mask_z_stencil(format, ~0, ~0);
zsmask |= ~zsmask_full;
}
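/* Worked example (assuming Z occupies the low 24 bits of
 * PIPE_FORMAT_Z24X8_UNORM): util_pack_mask_z_stencil(format, ~0, ~0)
 * yields zsmask_full == 0x00ffffff, so OR-ing in ~zsmask_full marks the
 * don't-care X bits writable and the clear becomes a plain store rather
 * than a read-modify-write.
 */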
if (setup->state == SETUP_ACTIVE) {
struct lp_scene *scene = setup->scene;
@ -796,13 +799,15 @@ lp_setup_set_fragment_sampler_views(struct lp_setup_context *setup,
unsigned num,
struct pipe_sampler_view **views)
{
unsigned i;
unsigned i, max_tex_num;
LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__);
assert(num <= PIPE_MAX_SHADER_SAMPLER_VIEWS);
for (i = 0; i < PIPE_MAX_SHADER_SAMPLER_VIEWS; i++) {
max_tex_num = MAX2(num, setup->fs.current_tex_num);
for (i = 0; i < max_tex_num; i++) {
struct pipe_sampler_view *view = i < num ? views[i] : NULL;
if (view) {
@ -922,7 +927,11 @@ lp_setup_set_fragment_sampler_views(struct lp_setup_context *setup,
assert(jit_tex->base);
}
}
else {
pipe_resource_reference(&setup->fs.current_tex[i], NULL);
}
}
setup->fs.current_tex_num = num;
setup->dirty |= LP_SETUP_NEW_FS;
}

View File

@ -133,6 +133,7 @@ struct lp_setup_context
const struct lp_rast_state *stored; /**< what's in the scene */
struct lp_rast_state current; /**< currently set state */
struct pipe_resource *current_tex[PIPE_MAX_SHADER_SAMPLER_VIEWS];
unsigned current_tex_num;
} fs;
/** fragment shader constants */

View File

@ -556,7 +556,7 @@ do_triangle_ccw(struct lp_setup_context *setup,
/* Calculate trivial reject values:
*/
eo = vec_sub_epi32(vec_andc(dcdy_neg_mask, dcdy),
eo = vec_sub_epi32(vec_andnot_si128(dcdy_neg_mask, dcdy),
vec_and(dcdx_neg_mask, dcdx));
/* ei = _mm_sub_epi32(_mm_sub_epi32(dcdy, dcdx), eo); */

View File

@ -130,16 +130,10 @@ void
llvmpipe_prepare_vertex_sampling(struct llvmpipe_context *ctx,
unsigned num,
struct pipe_sampler_view **views);
void
llvmpipe_cleanup_vertex_sampling(struct llvmpipe_context *ctx);
void
llvmpipe_prepare_geometry_sampling(struct llvmpipe_context *ctx,
unsigned num,
struct pipe_sampler_view **views);
void
llvmpipe_cleanup_geometry_sampling(struct llvmpipe_context *ctx);
#endif

View File

@ -190,8 +190,10 @@ void llvmpipe_update_derived( struct llvmpipe_context *llvmpipe )
llvmpipe->tex_timestamp = lp_screen->timestamp;
llvmpipe->dirty |= LP_NEW_SAMPLER_VIEW;
}
if (llvmpipe->dirty & (LP_NEW_FS |
/* This needs LP_NEW_RASTERIZER because of draw_prepare_shader_outputs(). */
if (llvmpipe->dirty & (LP_NEW_RASTERIZER |
LP_NEW_FS |
LP_NEW_VS))
compute_vertex_info(llvmpipe);

View File

@ -98,8 +98,9 @@ llvmpipe_bind_sampler_states(struct pipe_context *pipe,
llvmpipe->samplers[shader],
llvmpipe->num_samplers[shader]);
}
llvmpipe->dirty |= LP_NEW_SAMPLER;
else {
llvmpipe->dirty |= LP_NEW_SAMPLER;
}
}
@ -128,6 +129,15 @@ llvmpipe_set_sampler_views(struct pipe_context *pipe,
*/
pipe_sampler_view_release(pipe,
&llvmpipe->sampler_views[shader][start + i]);
/*
* Warn if someone tries to set a view created in a different context
* (which is why we need the hack above in the first place).
* An assert would be better but st/mesa relies on it...
*/
if (views[i] && views[i]->context != pipe) {
debug_printf("Illegal setting of sampler_view %d created in another "
"context\n", i);
}
pipe_sampler_view_reference(&llvmpipe->sampler_views[shader][start + i],
views[i]);
}
@ -146,8 +156,9 @@ llvmpipe_set_sampler_views(struct pipe_context *pipe,
llvmpipe->sampler_views[shader],
llvmpipe->num_sampler_views[shader]);
}
llvmpipe->dirty |= LP_NEW_SAMPLER_VIEW;
else {
llvmpipe->dirty |= LP_NEW_SAMPLER_VIEW;
}
}
@ -228,8 +239,7 @@ prepare_shader_sampling(
struct llvmpipe_context *lp,
unsigned num,
struct pipe_sampler_view **views,
unsigned shader_type,
struct pipe_resource *mapped_tex[PIPE_MAX_SHADER_SAMPLER_VIEWS])
unsigned shader_type)
{
unsigned i;
@ -242,7 +252,7 @@ prepare_shader_sampling(
if (!num)
return;
for (i = 0; i < PIPE_MAX_SHADER_SAMPLER_VIEWS; i++) {
for (i = 0; i < num; i++) {
struct pipe_sampler_view *view = i < num ? views[i] : NULL;
if (view) {
@ -253,11 +263,6 @@ prepare_shader_sampling(
unsigned first_level = 0;
unsigned last_level = 0;
/* We're referencing the texture's internal data, so save a
* reference to it.
*/
pipe_resource_reference(&mapped_tex[i], tex);
if (!lp_tex->dt) {
/* regular texture - setup array of mipmap level offsets */
struct pipe_resource *res = view->texture;
@ -335,47 +340,28 @@ prepare_shader_sampling(
/**
* Called during state validation when LP_NEW_SAMPLER_VIEW is set.
* Called whenever we're about to draw (no dirty flag, FIXME?).
*/
void
llvmpipe_prepare_vertex_sampling(struct llvmpipe_context *lp,
unsigned num,
struct pipe_sampler_view **views)
{
prepare_shader_sampling(lp, num, views, PIPE_SHADER_VERTEX,
lp->mapped_vs_tex);
}
void
llvmpipe_cleanup_vertex_sampling(struct llvmpipe_context *ctx)
{
unsigned i;
for (i = 0; i < Elements(ctx->mapped_vs_tex); i++) {
pipe_resource_reference(&ctx->mapped_vs_tex[i], NULL);
}
prepare_shader_sampling(lp, num, views, PIPE_SHADER_VERTEX);
}
/**
* Called during state validation when LP_NEW_SAMPLER_VIEW is set.
* Called whenever we're about to draw (no dirty flag, FIXME?).
*/
void
llvmpipe_prepare_geometry_sampling(struct llvmpipe_context *lp,
unsigned num,
struct pipe_sampler_view **views)
{
prepare_shader_sampling(lp, num, views, PIPE_SHADER_GEOMETRY,
lp->mapped_gs_tex);
prepare_shader_sampling(lp, num, views, PIPE_SHADER_GEOMETRY);
}
void
llvmpipe_cleanup_geometry_sampling(struct llvmpipe_context *ctx)
{
unsigned i;
for (i = 0; i < Elements(ctx->mapped_gs_tex); i++) {
pipe_resource_reference(&ctx->mapped_gs_tex[i], NULL);
}
}
void
llvmpipe_init_sampler_funcs(struct llvmpipe_context *llvmpipe)

View File

@ -70,6 +70,15 @@ llvmpipe_set_so_targets(struct pipe_context *pipe,
int i;
for (i = 0; i < num_targets; i++) {
const boolean append = (offsets[i] == (unsigned)-1);
/*
* Warn if the so target was created in another context.
* XXX Not entirely sure if mesa/st may rely on this?
* Otherwise should just assert.
*/
if (targets[i] && targets[i]->context != pipe) {
debug_printf("Illegal setting of so target with target %d created in "
"another context\n", i);
}
pipe_so_target_reference((struct pipe_stream_output_target **)&llvmpipe->so_targets[i], targets[i]);
/* If we're not appending then lets set the internal
offset to what was requested */

View File

@ -52,6 +52,7 @@ llvmpipe_set_framebuffer_state(struct pipe_context *pipe,
struct llvmpipe_context *lp = llvmpipe_context(pipe);
boolean changed = !util_framebuffer_state_equal(&lp->framebuffer, fb);
unsigned i;
assert(fb->width <= LP_MAX_WIDTH);
assert(fb->height <= LP_MAX_HEIGHT);
@ -66,10 +67,22 @@ llvmpipe_set_framebuffer_state(struct pipe_context *pipe,
const struct util_format_description *depth_desc =
util_format_description(depth_format);
if (lp->framebuffer.zsbuf && lp->framebuffer.zsbuf->context != pipe) {
debug_printf("Illegal setting of fb state with zsbuf created in "
"another context\n");
}
for (i = 0; i < fb->nr_cbufs; i++) {
if (lp->framebuffer.cbufs[i] &&
lp->framebuffer.cbufs[i]->context != pipe) {
debug_printf("Illegal setting of fb state with cbuf %d created in "
"another context\n", i);
}
}
util_copy_framebuffer_state(&lp->framebuffer, fb);
if (LP_PERF & PERF_NO_DEPTH) {
pipe_surface_reference(&lp->framebuffer.zsbuf, NULL);
}
/*

View File

@ -615,6 +615,7 @@ BuildUtil::split64BitOpPostRA(Function *fn, Instruction *i,
case FILE_MEMORY_CONST:
case FILE_MEMORY_SHARED:
case FILE_SHADER_INPUT:
case FILE_SHADER_OUTPUT:
hi->getSrc(s)->reg.data.offset += 4;
break;
default:
@ -625,7 +626,7 @@ BuildUtil::split64BitOpPostRA(Function *fn, Instruction *i,
}
}
if (srcNr == 2) {
lo->setDef(1, carry);
lo->setFlagsDef(1, carry);
hi->setFlagsSrc(hi->srcCount(), carry);
}
return hi;

View File

@ -75,7 +75,8 @@ private:
void emitLOAD(const Instruction *);
void emitSTORE(const Instruction *);
void emitMOV(const Instruction *);
void emitMEMBAR(const Instruction *);
void emitATOM(const Instruction *);
void emitCCTL(const Instruction *);
void emitINTERP(const Instruction *);
void emitAFETCH(const Instruction *);
@ -123,6 +124,7 @@ private:
void emitPIXLD(const Instruction *);
void emitBAR(const Instruction *);
void emitMEMBAR(const Instruction *);
void emitFlow(const Instruction *);
@ -698,6 +700,10 @@ CodeEmitterGK110::emitIMAD(const Instruction *i)
if (i->subOp == NV50_IR_SUBOP_MUL_HIGH)
code[1] |= 1 << 25;
if (i->flagsDef >= 0) code[1] |= 1 << 18;
if (i->flagsSrc >= 0) code[1] |= 1 << 20;
SAT_(35);
}
@ -1252,8 +1258,32 @@ CodeEmitterGK110::emitPIXLD(const Instruction *i)
void
CodeEmitterGK110::emitBAR(const Instruction *i)
{
/* TODO */
emitNOP(i);
code[0] = 0x00000002;
code[1] = 0x85400000;
switch (i->subOp) {
case NV50_IR_SUBOP_BAR_ARRIVE: code[1] |= 0x08; break;
case NV50_IR_SUBOP_BAR_RED_AND: code[1] |= 0x50; break;
case NV50_IR_SUBOP_BAR_RED_OR: code[1] |= 0x90; break;
case NV50_IR_SUBOP_BAR_RED_POPC: code[1] |= 0x10; break;
default:
code[1] |= 0x20;
assert(i->subOp == NV50_IR_SUBOP_BAR_SYNC);
break;
}
emitPredicate(i);
srcId(i->src(0), 10);
srcId(i->src(1), 23);
}
void CodeEmitterGK110::emitMEMBAR(const Instruction *i)
{
code[0] = 0x00000002 | NV50_IR_SUBOP_MEMBAR_SCOPE(i->subOp) << 8;
code[1] = 0x7cc00000;
emitPredicate(i);
}
void
@ -1587,6 +1617,10 @@ CodeEmitterGK110::emitSTORE(const Instruction *i)
srcId(i->src(1), 2);
srcId(i->src(0).getIndirect(0), 10);
if (i->src(0).getFile() == FILE_MEMORY_GLOBAL &&
i->src(0).isIndirect(0) &&
i->getIndirect(0, 0)->reg.size == 8)
code[1] |= 1 << 23;
}
void
@ -1597,7 +1631,7 @@ CodeEmitterGK110::emitLOAD(const Instruction *i)
switch (i->src(0).getFile()) {
case FILE_MEMORY_GLOBAL: code[1] = 0xc0000000; code[0] = 0x00000000; break;
case FILE_MEMORY_LOCAL: code[1] = 0x7a000000; code[0] = 0x00000002; break;
case FILE_MEMORY_SHARED: code[1] = 0x7ac00000; code[0] = 0x00000002; break;
case FILE_MEMORY_SHARED: code[1] = 0x7a400000; code[0] = 0x00000002; break;
case FILE_MEMORY_CONST:
if (!i->src(0).isIndirect(0) && typeSizeof(i->dType) == 4) {
emitMOV(i);
@ -1628,7 +1662,13 @@ CodeEmitterGK110::emitLOAD(const Instruction *i)
emitPredicate(i);
defId(i->def(0), 2);
srcId(i->src(0).getIndirect(0), 10);
if (i->getIndirect(0, 0)) {
srcId(i->src(0).getIndirect(0), 10);
if (i->getIndirect(0, 0)->reg.size == 8)
code[1] |= 1 << 23;
} else {
code[0] |= 255 << 10;
}
}
uint8_t
@ -1683,10 +1723,83 @@ CodeEmitterGK110::emitMOV(const Instruction *i)
}
}
void CodeEmitterGK110::emitMEMBAR(const Instruction *i)
static inline bool
uses64bitAddress(const Instruction *ldst)
{
code[0] = 0x00000002 | NV50_IR_SUBOP_MEMBAR_SCOPE(i->subOp) << 8;
code[1] = 0x7cc00000;
return ldst->src(0).getFile() == FILE_MEMORY_GLOBAL &&
ldst->src(0).isIndirect(0) &&
ldst->getIndirect(0, 0)->reg.size == 8;
}
void
CodeEmitterGK110::emitATOM(const Instruction *i)
{
code[0] = 0x00000002;
if (i->subOp == NV50_IR_SUBOP_ATOM_CAS)
code[1] = 0x77800000;
else
code[1] = 0x68000000;
switch (i->subOp) {
case NV50_IR_SUBOP_ATOM_CAS: break;
case NV50_IR_SUBOP_ATOM_EXCH: code[1] |= 0x04000000; break;
default: code[1] |= i->subOp << 23; break;
}
switch (i->dType) {
case TYPE_U32: break;
case TYPE_S32: code[1] |= 0x00100000; break;
case TYPE_U64: code[1] |= 0x00200000; break;
case TYPE_F32: code[1] |= 0x00300000; break;
case TYPE_B128: code[1] |= 0x00400000; break; /* TODO: U128 */
case TYPE_S64: code[1] |= 0x00500000; break;
default: assert(!"unsupported type"); break;
}
emitPredicate(i);
/* TODO: cas: check that src regs line up */
/* TODO: cas: flip bits if $r255 is used */
srcId(i->src(1), 23);
if (i->defExists(0))
defId(i->def(0), 2);
else
code[0] |= 255 << 2;
const int32_t offset = SDATA(i->src(0)).offset;
assert(offset < 0x80000 && offset >= -0x80000);
code[0] |= (offset & 1) << 31;
code[1] |= (offset & 0xffffe) >> 1;
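/* Worked example (hypothetical value): for offset == 0x1234, bit 0 is 0,
 * so code[0] bit 31 stays clear, and bits 19:1 == 0x91a land in the low
 * bits of code[1].
 */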
if (i->getIndirect(0, 0)) {
srcId(i->getIndirect(0, 0), 10);
if (i->getIndirect(0, 0)->reg.size == 8)
code[1] |= 1 << 19;
} else {
code[0] |= 255 << 10;
}
}
void
CodeEmitterGK110::emitCCTL(const Instruction *i)
{
int32_t offset = SDATA(i->src(0)).offset;
code[0] = 0x00000002 | (i->subOp << 2);
if (i->src(0).getFile() == FILE_MEMORY_GLOBAL) {
code[1] = 0x7b000000;
} else {
code[1] = 0x7c000000;
offset &= 0xffffff;
}
code[0] |= offset << 23;
code[1] |= offset >> 9;
if (uses64bitAddress(i))
code[1] |= 1 << 23;
srcId(i->src(0).getIndirect(0), 10);
emitPredicate(i);
}
@ -1925,6 +2038,12 @@ CodeEmitterGK110::emitInstruction(Instruction *insn)
case OP_MEMBAR:
emitMEMBAR(insn);
break;
case OP_ATOM:
emitATOM(insn);
break;
case OP_CCTL:
emitCCTL(insn);
break;
case OP_PHI:
case OP_UNION:
case OP_CONSTRAINT:

View File

@ -176,6 +176,8 @@ private:
void emitISBERD();
void emitAL2P();
void emitIPA();
void emitATOM();
void emitCCTL();
void emitPIXLD();
@ -1552,11 +1554,13 @@ CodeEmitterGM107::emitLOP()
break;
}
emitPRED (0x30);
emitX (0x2b);
emitField(0x29, 2, lop);
emitINV (0x28, insn->src(1));
emitINV (0x27, insn->src(0));
} else {
emitInsn (0x04000000);
emitX (0x39);
emitINV (0x38, insn->src(1));
emitINV (0x37, insn->src(0));
emitField(0x35, 2, lop);
@ -1624,9 +1628,11 @@ CodeEmitterGM107::emitIADD()
emitNEG(0x31, insn->src(0));
emitNEG(0x30, insn->src(1));
emitCC (0x2f);
emitX (0x2b);
} else {
emitInsn(0x1c000000);
emitSAT (0x36);
emitX (0x35);
emitCC (0x34);
emitIMMD(0x14, 32, insn->src(1));
}
@ -2146,6 +2152,7 @@ CodeEmitterGM107::emitLD()
emitPRED (0x3a);
emitLDSTc(0x38);
emitLDSTs(0x35, insn->dType);
emitField(0x34, 1, insn->src(0).getIndirect(0)->getSize() == 8);
emitADDR (0x08, 0x14, 32, 0, insn->src(0));
emitGPR (0x00, insn->def(0));
}
@ -2176,6 +2183,7 @@ CodeEmitterGM107::emitST()
emitPRED (0x3a);
emitLDSTc(0x38);
emitLDSTs(0x35, insn->dType);
emitField(0x34, 1, insn->src(0).getIndirect(0)->getSize() == 8);
emitADDR (0x08, 0x14, 32, 0, insn->src(0));
emitGPR (0x00, insn->src(1));
}
@ -2296,6 +2304,50 @@ CodeEmitterGM107::emitIPA()
emitGPR(0x27);
}
void
CodeEmitterGM107::emitATOM()
{
unsigned dType, subOp;
switch (insn->dType) {
case TYPE_U32: dType = 0; break;
case TYPE_S32: dType = 1; break;
case TYPE_U64: dType = 2; break;
case TYPE_F32: dType = 3; break;
case TYPE_B128: dType = 4; break;
case TYPE_S64: dType = 5; break;
default: assert(!"unexpected dType"); dType = 0; break;
}
if (insn->subOp == NV50_IR_SUBOP_ATOM_EXCH)
subOp = 8;
else
subOp = insn->subOp;
assert(insn->subOp != NV50_IR_SUBOP_ATOM_CAS); /* XXX */
emitInsn (0xed000000);
emitField(0x34, 4, subOp);
emitField(0x31, 3, dType);
emitField(0x30, 1, insn->src(0).getIndirect(0)->getSize() == 8);
emitGPR (0x14, insn->src(1));
emitADDR (0x08, 0x1c, 20, 0, insn->src(0));
emitGPR (0x00, insn->def(0));
}
void
CodeEmitterGM107::emitCCTL()
{
unsigned width;
if (insn->src(0).getFile() == FILE_MEMORY_GLOBAL) {
emitInsn(0xef600000);
width = 30;
} else {
emitInsn(0xef800000);
width = 22;
}
emitField(0x34, 1, insn->src(0).getIndirect(0)->getSize() == 8);
emitADDR (0x08, 0x16, width, 2, insn->src(0));
emitField(0x00, 4, insn->subOp);
}
/*******************************************************************************
* surface
******************************************************************************/
@ -2795,6 +2847,12 @@ CodeEmitterGM107::emitInstruction(Instruction *i)
break;
}
break;
case OP_ATOM:
emitATOM();
break;
case OP_CCTL:
emitCCTL();
break;
case OP_VFETCH:
emitALD();
break;

View File

@ -1463,6 +1463,7 @@ CodeEmitterNV50::emitSFnOp(const Instruction *i, uint8_t subOp)
if (i->encSize == 4) {
assert(i->op == OP_RCP);
assert(!i->saturate);
code[0] |= i->src(0).mod.abs() << 15;
code[0] |= i->src(0).mod.neg() << 22;
emitForm_MUL(i);
@ -1470,6 +1471,10 @@ CodeEmitterNV50::emitSFnOp(const Instruction *i, uint8_t subOp)
code[1] = subOp << 29;
code[1] |= i->src(0).mod.abs() << 20;
code[1] |= i->src(0).mod.neg() << 26;
if (i->saturate) {
assert(subOp == 6 && i->op == OP_EX2);
code[1] |= 1 << 27;
}
emitForm_MAD(i);
}
}

View File

@ -95,6 +95,13 @@ public:
return tgsi_util_get_src_register_swizzle(&reg, chan);
}
int getArrayId() const
{
if (isIndirect(0))
return fsr->Indirect.ArrayID;
return 0;
}
nv50_ir::Modifier getMod(int chan) const;
SrcRegister getIndirect(int dim) const
@ -154,6 +161,13 @@ public:
return SrcRegister(fdr->Indirect);
}
int getArrayId() const
{
if (isIndirect(0))
return fdr->Indirect.ArrayID;
return 0;
}
private:
const struct tgsi_dst_register reg;
const struct tgsi_full_dst_register *fdr;
@ -809,7 +823,10 @@ public:
// these registers are per-subroutine, cannot be used for parameter passing
std::set<Location> locals;
bool mainTempsInLMem;
std::set<int> indirectTempArrays;
std::map<int, int> indirectTempOffsets;
std::map<int, std::pair<int, int> > tempArrayInfo;
std::vector<int> tempArrayId;
int clipVertexOutput;
@ -841,8 +858,6 @@ Source::Source(struct nv50_ir_prog_info *prog) : info(prog)
if (prog->dbgFlags & NV50_IR_DEBUG_BASIC)
tgsi_dump(tokens, 0);
mainTempsInLMem = false;
}
Source::~Source()
@ -872,6 +887,7 @@ bool Source::scanSource()
textureViews.resize(scan.file_max[TGSI_FILE_SAMPLER_VIEW] + 1);
//resources.resize(scan.file_max[TGSI_FILE_RESOURCE] + 1);
tempArrayId.resize(scan.file_max[TGSI_FILE_TEMPORARY] + 1);
info->immd.bufSize = 0;
@ -917,8 +933,16 @@ bool Source::scanSource()
}
tgsi_parse_free(&parse);
if (mainTempsInLMem)
info->bin.tlsSpace += (scan.file_max[TGSI_FILE_TEMPORARY] + 1) * 16;
if (indirectTempArrays.size()) {
int tempBase = 0;
for (std::set<int>::const_iterator it = indirectTempArrays.begin();
it != indirectTempArrays.end(); ++it) {
std::pair<int, int>& info = tempArrayInfo[*it];
indirectTempOffsets.insert(std::make_pair(*it, tempBase - info.first));
tempBase += info.second;
}
info->bin.tlsSpace += tempBase * 16;
}
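// Hypothetical walk-through of the packing above: with indirect arrays
// TEMP[2..5] (ArrayID 1) and TEMP[8..9] (ArrayID 2), the loop yields
// indirectTempOffsets == {1: -2, 2: -4} and tempBase == 6, i.e. the two
// arrays sit back to back in local memory and tlsSpace grows by 6 * 16.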
if (info->io.genUserClip > 0) {
info->io.clipDistances = info->io.genUserClip;
@ -1028,6 +1052,7 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
unsigned sn = TGSI_SEMANTIC_GENERIC;
unsigned si = 0;
const unsigned first = decl->Range.First, last = decl->Range.Last;
const int arrayId = decl->Array.ArrayID;
if (decl->Declaration.Semantic) {
sn = decl->Semantic.Name;
@ -1172,8 +1197,14 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
for (i = first; i <= last; ++i)
textureViews[i].target = decl->SamplerView.Resource;
break;
case TGSI_FILE_NULL:
case TGSI_FILE_TEMPORARY:
for (i = first; i <= last; ++i)
tempArrayId[i] = arrayId;
if (arrayId)
tempArrayInfo.insert(std::make_pair(arrayId, std::make_pair(
first, last - first + 1)));
break;
case TGSI_FILE_NULL:
case TGSI_FILE_ADDRESS:
case TGSI_FILE_CONSTANT:
case TGSI_FILE_IMMEDIATE:
@ -1223,7 +1254,7 @@ bool Source::scanInstruction(const struct tgsi_full_instruction *inst)
} else
if (insn.getDst(0).getFile() == TGSI_FILE_TEMPORARY) {
if (insn.getDst(0).isIndirect(0))
mainTempsInLMem = true;
indirectTempArrays.insert(insn.getDst(0).getArrayId());
}
}
@ -1231,7 +1262,7 @@ bool Source::scanInstruction(const struct tgsi_full_instruction *inst)
Instruction::SrcRegister src = insn.getSrc(s);
if (src.getFile() == TGSI_FILE_TEMPORARY) {
if (src.isIndirect(0))
mainTempsInLMem = true;
indirectTempArrays.insert(src.getArrayId());
} else
/*
if (src.getFile() == TGSI_FILE_RESOURCE) {
@ -1337,6 +1368,7 @@ private:
void storeDst(const tgsi::Instruction::DstRegister dst, int c,
Value *val, Value *ptr);
void adjustTempIndex(int arrayId, int &idx, int &idx2d) const;
Value *applySrcMod(Value *, int s, int c);
Symbol *makeSym(uint file, int fileIndex, int idx, int c, uint32_t addr);
@ -1416,6 +1448,7 @@ private:
DataType srcTy;
DataArray tData; // TGSI_FILE_TEMPORARY
DataArray lData; // TGSI_FILE_TEMPORARY, for indirect arrays
DataArray aData; // TGSI_FILE_ADDRESS
DataArray pData; // TGSI_FILE_PREDICATE
DataArray oData; // TGSI_FILE_OUTPUT (if outputs in registers)
@ -1619,7 +1652,7 @@ Converter::getArrayForFile(unsigned file, int idx)
{
switch (file) {
case TGSI_FILE_TEMPORARY:
return &tData;
return idx == 0 ? &tData : &lData;
case TGSI_FILE_PREDICATE:
return &pData;
case TGSI_FILE_ADDRESS:
@ -1641,11 +1674,23 @@ Converter::shiftAddress(Value *index)
return mkOp2v(OP_SHL, TYPE_U32, getSSA(4, FILE_ADDRESS), index, mkImm(4));
}
void
Converter::adjustTempIndex(int arrayId, int &idx, int &idx2d) const
{
std::map<int, int>::const_iterator it =
code->indirectTempOffsets.find(arrayId);
if (it == code->indirectTempOffsets.end())
return;
idx2d = 1;
idx += it->second;
}
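// Continuing the hypothetical numbers above: adjustTempIndex(1, idx, idx2d)
// turns TEMP[2] (idx == 2) into idx == 0 with idx2d == 1, so the access is
// routed to lData (local memory) rather than tData (GPRs).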
Value *
Converter::fetchSrc(tgsi::Instruction::SrcRegister src, int c, Value *ptr)
{
const int idx2d = src.is2D() ? src.getIndex(1) : 0;
const int idx = src.getIndex(0);
int idx2d = src.is2D() ? src.getIndex(1) : 0;
int idx = src.getIndex(0);
const int swz = src.getSwizzle(c);
Instruction *ld;
@ -1686,6 +1731,13 @@ Converter::fetchSrc(tgsi::Instruction::SrcRegister src, int c, Value *ptr)
ld = mkOp1(OP_RDSV, TYPE_U32, getSSA(), srcToSym(src, c));
ld->perPatch = info->sv[idx].patch;
return ld->getDef(0);
case TGSI_FILE_TEMPORARY: {
int arrayid = src.getArrayId();
if (!arrayid)
arrayid = code->tempArrayId[idx];
adjustTempIndex(arrayid, idx, idx2d);
}
/* fallthrough */
default:
return getArrayForFile(src.getFile(), idx2d)->load(
sub.cur->values, idx, swz, shiftAddress(ptr));
@ -1697,8 +1749,8 @@ Converter::acquireDst(int d, int c)
{
const tgsi::Instruction::DstRegister dst = tgsi.getDst(d);
const unsigned f = dst.getFile();
const int idx = dst.getIndex(0);
const int idx2d = dst.is2D() ? dst.getIndex(1) : 0;
int idx = dst.getIndex(0);
int idx2d = dst.is2D() ? dst.getIndex(1) : 0;
if (dst.isMasked(c)/* || f == TGSI_FILE_RESOURCE*/)
return NULL;
@ -1708,6 +1760,13 @@ Converter::acquireDst(int d, int c)
(f == TGSI_FILE_OUTPUT && prog->getType() != Program::TYPE_FRAGMENT))
return getScratch();
if (f == TGSI_FILE_TEMPORARY) {
int arrayid = dst.getArrayId();
if (!arrayid)
arrayid = code->tempArrayId[idx];
adjustTempIndex(arrayid, idx, idx2d);
}
return getArrayForFile(f, idx2d)->acquire(sub.cur->values, idx, c);
}
@ -1739,8 +1798,8 @@ Converter::storeDst(const tgsi::Instruction::DstRegister dst, int c,
Value *val, Value *ptr)
{
const unsigned f = dst.getFile();
const int idx = dst.getIndex(0);
const int idx2d = dst.is2D() ? dst.getIndex(1) : 0;
int idx = dst.getIndex(0);
int idx2d = dst.is2D() ? dst.getIndex(1) : 0;
if (f == TGSI_FILE_SYSTEM_VALUE) {
assert(!ptr);
@ -1763,6 +1822,13 @@ Converter::storeDst(const tgsi::Instruction::DstRegister dst, int c,
f == TGSI_FILE_PREDICATE ||
f == TGSI_FILE_ADDRESS ||
f == TGSI_FILE_OUTPUT) {
if (f == TGSI_FILE_TEMPORARY) {
int arrayid = dst.getArrayId();
if (!arrayid)
arrayid = code->tempArrayId[idx];
adjustTempIndex(arrayid, idx, idx2d);
}
getArrayForFile(f, idx2d)->store(sub.cur->values, idx, c, ptr, val);
} else {
assert(!"invalid dst file");
@ -3326,18 +3392,17 @@ Converter::exportOutputs()
Converter::Converter(Program *ir, const tgsi::Source *code) : BuildUtil(ir),
code(code),
tgsi(NULL),
tData(this), aData(this), pData(this), oData(this)
tData(this), lData(this), aData(this), pData(this), oData(this)
{
info = code->info;
const DataFile tFile = code->mainTempsInLMem ? FILE_MEMORY_LOCAL : FILE_GPR;
const unsigned tSize = code->fileSize(TGSI_FILE_TEMPORARY);
const unsigned pSize = code->fileSize(TGSI_FILE_PREDICATE);
const unsigned aSize = code->fileSize(TGSI_FILE_ADDRESS);
const unsigned oSize = code->fileSize(TGSI_FILE_OUTPUT);
tData.setup(TGSI_FILE_TEMPORARY, 0, 0, tSize, 4, 4, tFile, 0);
tData.setup(TGSI_FILE_TEMPORARY, 0, 0, tSize, 4, 4, FILE_GPR, 0);
lData.setup(TGSI_FILE_TEMPORARY, 1, 0, tSize, 4, 4, FILE_MEMORY_LOCAL, 0);
pData.setup(TGSI_FILE_PREDICATE, 0, 0, pSize, 4, 4, FILE_PREDICATE, 0);
aData.setup(TGSI_FILE_ADDRESS, 0, 0, aSize, 4, 4, FILE_GPR, 0);
oData.setup(TGSI_FILE_OUTPUT, 0, 0, oSize, 4, 4, FILE_GPR, 0);

View File

@ -540,6 +540,12 @@ NVC0LegalizePostRA::visit(BasicBlock *bb)
// It seems like barriers are never required for tessellation since
// the warp size is 32, and there are always at most 32 tcs threads.
bb->remove(i);
} else
if (i->op == OP_LOAD && i->subOp == NV50_IR_SUBOP_LDC_IS) {
int offset = i->src(0).get()->reg.data.offset;
if (abs(offset) > 0x10000)
i->src(0).get()->reg.fileIndex += offset >> 16;
i->src(0).get()->reg.data.offset = (int)(short)offset;
} else {
// TODO: Move this to before register allocation for operations that
// need the $c register !

View File

@ -171,7 +171,10 @@ LoadPropagation::isImmdLoad(Instruction *ld)
if (!ld || (ld->op != OP_MOV) ||
((typeSizeof(ld->dType) != 4) && (typeSizeof(ld->dType) != 8)))
return false;
return ld->src(0).getFile() == FILE_IMMEDIATE;
// A 0 can be replaced with a register, so it doesn't count as an immediate.
ImmediateValue val;
return ld->src(0).getImmediate(val) && !val.isInteger(0);
}
bool
@ -187,7 +190,8 @@ LoadPropagation::isAttribOrSharedLoad(Instruction *ld)
void
LoadPropagation::checkSwapSrc01(Instruction *insn)
{
if (!prog->getTarget()->getOpInfo(insn).commutative)
const Target *targ = prog->getTarget();
if (!targ->getOpInfo(insn).commutative)
if (insn->op != OP_SET && insn->op != OP_SLCT)
return;
if (insn->src(1).getFile() != FILE_GPR)
@ -196,14 +200,15 @@ LoadPropagation::checkSwapSrc01(Instruction *insn)
Instruction *i0 = insn->getSrc(0)->getInsn();
Instruction *i1 = insn->getSrc(1)->getInsn();
if (isCSpaceLoad(i0)) {
if (!isCSpaceLoad(i1))
insn->swapSources(0, 1);
else
return;
} else
if (isImmdLoad(i0)) {
if (!isCSpaceLoad(i1) && !isImmdLoad(i1))
// Swap sources to inline the less frequently used source. That way,
// optimistically, the remaining load eventually becomes dead and can be
// removed.
int i0refs = insn->getSrc(0)->refCount();
int i1refs = insn->getSrc(1)->refCount();
if ((isCSpaceLoad(i0) || isImmdLoad(i0)) && targ->insnCanLoad(insn, 1, i0)) {
if ((!isImmdLoad(i1) && !isCSpaceLoad(i1)) ||
!targ->insnCanLoad(insn, 1, i1) ||
i0refs < i1refs)
insn->swapSources(0, 1);
else
return;
@ -1224,6 +1229,8 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
adds = 1;
else
return;
if (si->src(!adds).mod != Modifier(0))
return;
// SHL(ADD(x, y), z) = ADD(SHL(x, z), SHL(y, z))
// This is more operations, but if one of x, y is an immediate, then

View File

@ -192,7 +192,7 @@ public:
virtual bool insnCanLoad(const Instruction *insn, int s,
const Instruction *ld) const = 0;
virtual bool insnCanLoadOffset(const Instruction *insn, int s,
int offset) const { return true; }
int offset) const = 0;
virtual bool isOpSupported(operation, DataType) const = 0;
virtual bool isAccessSupported(DataFile, DataType) const = 0;
virtual bool isModSupported(const Instruction *,

View File

@ -99,6 +99,7 @@ static const struct opProperties _initProps[] =
{ OP_SET, 0x3, 0x3, 0x0, 0x0, 0x2, 0x1, 0x1, 0x0 },
{ OP_PREEX2, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 },
{ OP_PRESIN, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 },
{ OP_EX2, 0x0, 0x0, 0x0, 0x8, 0x0, 0x0, 0x0, 0x0 },
{ OP_LG2, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 },
{ OP_RCP, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 },
{ OP_RSQ, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 },

View File

@ -383,6 +383,16 @@ TargetNVC0::insnCanLoad(const Instruction *i, int s,
return true;
}
bool
TargetNVC0::insnCanLoadOffset(const Instruction *insn, int s, int offset) const
{
const ValueRef& ref = insn->src(s);
if (ref.getFile() == FILE_MEMORY_CONST &&
(insn->op != OP_LOAD || insn->subOp != NV50_IR_SUBOP_LDC_IS))
return offset >= -0x8000 && offset < 0x8000;
return true;
}
bool
TargetNVC0::isAccessSupported(DataFile file, DataType ty) const
{

View File

@ -48,6 +48,8 @@ public:
virtual bool insnCanLoad(const Instruction *insn, int s,
const Instruction *ld) const;
virtual bool insnCanLoadOffset(const Instruction *insn, int s,
int offset) const;
virtual bool isOpSupported(operation, DataType) const;
virtual bool isAccessSupported(DataFile, DataType) const;
virtual bool isModSupported(const Instruction *, int s, Modifier) const;

View File

@ -183,6 +183,7 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
case PIPE_CAP_INVALIDATE_BUFFER:
case PIPE_CAP_GENERATE_MIPMAP:
case PIPE_CAP_STRING_MARKER:
return 0;
case PIPE_CAP_VENDOR_ID:

View File

@ -226,6 +226,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
case PIPE_CAP_INVALIDATE_BUFFER:
case PIPE_CAP_GENERATE_MIPMAP:
case PIPE_CAP_STRING_MARKER:
return 0;
case PIPE_CAP_VENDOR_ID:

View File

@ -215,6 +215,7 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
case PIPE_CAP_INVALIDATE_BUFFER:
case PIPE_CAP_GENERATE_MIPMAP:
case PIPE_CAP_STRING_MARKER:
return 0;
case PIPE_CAP_VENDOR_ID:
@ -295,9 +296,10 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
if (shader == PIPE_SHADER_COMPUTE && class_3d >= NVE4_3D_CLASS)
return NVE4_MAX_PIPE_CONSTBUFS_COMPUTE;
return NVC0_MAX_PIPE_CONSTBUFS;
case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR:
case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR:
return shader != PIPE_SHADER_FRAGMENT;
case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR:
return shader != PIPE_SHADER_FRAGMENT || class_3d < GM107_3D_CLASS;
case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR:
case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR:
return 1;

View File

@ -64,7 +64,7 @@ nvc0_decoder_bsp_next(struct nouveau_vp3_decoder *dec,
bsp_size += num_bytes[i];
bsp_size += 256; /* the 4 end markers */
if (!bsp_bo || bsp_size > bsp_bo->size) {
if (bsp_size > bsp_bo->size) {
union nouveau_bo_config cfg;
struct nouveau_bo *tmp_bo = NULL;

View File

@ -209,6 +209,7 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
case PIPE_CAP_INVALIDATE_BUFFER:
case PIPE_CAP_GENERATE_MIPMAP:
case PIPE_CAP_STRING_MARKER:
return 0;
/* SWTCL-only features. */

View File

@ -68,6 +68,7 @@ static const struct debug_named_value r600_debug_options[] = {
static void r600_destroy_context(struct pipe_context *context)
{
struct r600_context *rctx = (struct r600_context *)context;
unsigned sh;
r600_isa_destroy(rctx->isa);
@ -76,6 +77,11 @@ static void r600_destroy_context(struct pipe_context *context)
pipe_resource_reference((struct pipe_resource**)&rctx->dummy_cmask, NULL);
pipe_resource_reference((struct pipe_resource**)&rctx->dummy_fmask, NULL);
for (sh = 0; sh < PIPE_SHADER_TYPES; sh++) {
rctx->b.b.set_constant_buffer(&rctx->b.b, sh, R600_BUFFER_INFO_CONST_BUFFER, NULL);
free(rctx->driver_consts[sh].constants);
}
if (rctx->fixed_func_tcs_shader)
rctx->b.b.delete_tcs_state(&rctx->b.b, rctx->fixed_func_tcs_shader);
@ -357,6 +363,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
case PIPE_CAP_GENERATE_MIPMAP:
case PIPE_CAP_STRING_MARKER:
return 0;
case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:

View File

@ -210,8 +210,8 @@ static void r600_buffer_destroy(struct pipe_screen *screen,
}
static bool
r600_do_invalidate_resource(struct r600_common_context *rctx,
struct r600_resource *rbuffer)
r600_invalidate_buffer(struct r600_common_context *rctx,
struct r600_resource *rbuffer)
{
/* In AMD_pinned_memory, the user pointer association only gets
* broken when the buffer is explicitly re-allocated.
@ -236,7 +236,9 @@ void r600_invalidate_resource(struct pipe_context *ctx,
struct r600_common_context *rctx = (struct r600_common_context*)ctx;
struct r600_resource *rbuffer = r600_resource(resource);
(void)r600_do_invalidate_resource(rctx, rbuffer);
/* We currently only do anything here for buffers */
if (resource->target == PIPE_BUFFER)
(void)r600_invalidate_buffer(rctx, rbuffer);
}
static void *r600_buffer_get_transfer(struct pipe_context *ctx,
@ -306,7 +308,7 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx,
!(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) {
assert(usage & PIPE_TRANSFER_WRITE);
if (r600_do_invalidate_resource(rctx, rbuffer)) {
if (r600_invalidate_buffer(rctx, rbuffer)) {
/* At this point, the buffer is always idle. */
usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
}

View File

@ -349,6 +349,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
case PIPE_CAP_GENERATE_MIPMAP:
case PIPE_CAP_STRING_MARKER:
return 0;
case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:

View File

@ -3728,6 +3728,9 @@ void si_shader_binary_read_config(struct radeon_shader_binary *binary,
case R_0286CC_SPI_PS_INPUT_ENA:
conf->spi_ps_input_ena = value;
break;
case R_0286D0_SPI_PS_INPUT_ADDR:
/* Not used yet, but will be in the future */
break;
case R_0286E8_SPI_TMPRING_SIZE:
case R_00B860_COMPUTE_TMPRING_SIZE:
/* WAVESIZE is in units of 256 dwords. */
@ -3735,8 +3738,15 @@ void si_shader_binary_read_config(struct radeon_shader_binary *binary,
G_00B860_WAVESIZE(value) * 256 * 4 * 1;
break;
default:
fprintf(stderr, "Warning: Compiler emitted unknown "
"config register: 0x%x\n", reg);
{
static bool printed;
if (!printed) {
fprintf(stderr, "Warning: LLVM emitted unknown "
"config register: 0x%x\n", reg);
printed = true;
}
}
break;
}
}

View File

@ -260,6 +260,7 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
case PIPE_CAP_INVALIDATE_BUFFER:
case PIPE_CAP_GENERATE_MIPMAP:
case PIPE_CAP_STRING_MARKER:
return 0;
}
/* should only get here on unhandled cases */

View File

@ -357,6 +357,7 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
case PIPE_CAP_INVALIDATE_BUFFER:
case PIPE_CAP_STRING_MARKER:
return 0;
case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT:
return 64;

View File

@ -198,6 +198,7 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
case PIPE_CAP_INVALIDATE_BUFFER:
case PIPE_CAP_GENERATE_MIPMAP:
case PIPE_CAP_STRING_MARKER:
return 0;
/* Stream output. */

View File

@ -678,6 +678,13 @@ struct pipe_context {
void (*dump_debug_state)(struct pipe_context *ctx, FILE *stream,
unsigned flags);
/**
* Emit string marker in cmdstream
*/
void (*emit_string_marker)(struct pipe_context *ctx,
const char *string,
int len);
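/* Hypothetical caller sketch (not part of the patch): guard on the hook,
 * since drivers that report 0 for PIPE_CAP_STRING_MARKER are expected to
 * leave it unimplemented:
 *
 *    if (ctx->emit_string_marker)
 *       ctx->emit_string_marker(ctx, label, strlen(label));
 */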
/**
* Generate mipmap.
* \return TRUE if mipmap generation succeeds, FALSE otherwise

View File

@ -644,6 +644,7 @@ enum pipe_cap
PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT,
PIPE_CAP_INVALIDATE_BUFFER,
PIPE_CAP_GENERATE_MIPMAP,
PIPE_CAP_STRING_MARKER,
};
#define PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_NV50 (1 << 0)

View File

@ -492,8 +492,10 @@ dri_flush(__DRIcontext *cPriv,
if (pipe->invalidate_resource &&
(flags & __DRI2_FLUSH_INVALIDATE_ANCILLARY)) {
pipe->invalidate_resource(pipe, drawable->textures[ST_ATTACHMENT_DEPTH_STENCIL]);
pipe->invalidate_resource(pipe, drawable->msaa_textures[ST_ATTACHMENT_DEPTH_STENCIL]);
if (drawable->textures[ST_ATTACHMENT_DEPTH_STENCIL])
pipe->invalidate_resource(pipe, drawable->textures[ST_ATTACHMENT_DEPTH_STENCIL]);
if (drawable->msaa_textures[ST_ATTACHMENT_DEPTH_STENCIL])
pipe->invalidate_resource(pipe, drawable->msaa_textures[ST_ATTACHMENT_DEPTH_STENCIL]);
}
}

View File

@ -31,6 +31,7 @@
#include "util/u_memory.h"
#include "util/u_handle_table.h"
#include "util/u_video.h"
#include "vl/vl_deint_filter.h"
#include "vl/vl_winsys.h"
#include "va_private.h"
@ -296,6 +297,10 @@ vlVaDestroyContext(VADriverContextP ctx, VAContextID context_id)
}
context->decoder->destroy(context->decoder);
}
if (context->deint) {
vl_deint_filter_cleanup(context->deint);
FREE(context->deint);
}
FREE(context);
handle_table_remove(drv->htab, context_id);
pipe_mutex_unlock(drv->mutex);

View File

@ -29,6 +29,7 @@
#include "vl/vl_defines.h"
#include "vl/vl_video_buffer.h"
#include "vl/vl_deint_filter.h"
#include "va_private.h"
@ -174,6 +175,51 @@ static VAStatus vlVaPostProcBlit(vlVaDriver *drv, vlVaContext *context,
return VA_STATUS_SUCCESS;
}
static struct pipe_video_buffer *
vlVaApplyDeint(vlVaDriver *drv, vlVaContext *context,
VAProcPipelineParameterBuffer *param,
struct pipe_video_buffer *current,
unsigned field)
{
vlVaSurface *prevprev, *prev, *next;
if (param->num_forward_references < 1 ||
param->num_backward_references < 2)
return current;
prevprev = handle_table_get(drv->htab, param->backward_references[1]);
prev = handle_table_get(drv->htab, param->backward_references[0]);
next = handle_table_get(drv->htab, param->forward_references[0]);
if (!prevprev || !prev || !next)
return current;
if (context->deint && (context->deint->video_width != current->width ||
context->deint->video_height != current->height)) {
vl_deint_filter_cleanup(context->deint);
FREE(context->deint);
context->deint = NULL;
}
if (!context->deint) {
context->deint = MALLOC(sizeof(struct vl_deint_filter));
if (!vl_deint_filter_init(context->deint, drv->pipe, current->width,
current->height, false, false)) {
FREE(context->deint);
context->deint = NULL;
return current;
}
}
if (!vl_deint_filter_check_buffers(context->deint, prevprev->buffer,
prev->buffer, current, next->buffer))
return current;
vl_deint_filter_render(context->deint, prevprev->buffer, prev->buffer,
current, next->buffer, field);
return context->deint->video_buffer;
}
VAStatus
vlVaHandleVAProcPipelineParameterBufferType(vlVaDriver *drv, vlVaContext *context, vlVaBuffer *buf)
{
@ -181,6 +227,7 @@ vlVaHandleVAProcPipelineParameterBufferType(vlVaDriver *drv, vlVaContext *contex
VARectangle def_src_region, def_dst_region;
const VARectangle *src_region, *dst_region;
VAProcPipelineParameterBuffer *param;
struct pipe_video_buffer *src;
vlVaSurface *src_surface;
unsigned i;
@ -199,6 +246,8 @@ vlVaHandleVAProcPipelineParameterBufferType(vlVaDriver *drv, vlVaContext *contex
if (!src_surface || !src_surface->buffer)
return VA_STATUS_ERROR_INVALID_SURFACE;
src = src_surface->buffer;
for (i = 0; i < param->num_filters; i++) {
vlVaBuffer *buf = handle_table_get(drv->htab, param->filters[i]);
VAProcFilterParameterBufferBase *filter;
@ -222,6 +271,11 @@ vlVaHandleVAProcPipelineParameterBufferType(vlVaDriver *drv, vlVaContext *contex
deinterlace = VL_COMPOSITOR_WEAVE;
break;
case VAProcDeinterlacingMotionAdaptive:
src = vlVaApplyDeint(drv, context, param, src,
!!(deint->flags & VA_DEINTERLACING_BOTTOM_FIELD));
break;
default:
return VA_STATUS_ERROR_UNIMPLEMENTED;
}
@ -239,10 +293,8 @@ vlVaHandleVAProcPipelineParameterBufferType(vlVaDriver *drv, vlVaContext *contex
if (context->target->buffer_format != PIPE_FORMAT_NV12)
return vlVaPostProcCompositor(drv, context, src_region, dst_region,
src_surface->buffer, context->target,
deinterlace);
src, context->target, deinterlace);
else
return vlVaPostProcBlit(drv, context, src_region, dst_region,
src_surface->buffer, context->target,
deinterlace);
src, context->target, deinterlace);
}

View File

@ -691,13 +691,14 @@ vlVaQueryVideoProcFilterCaps(VADriverContextP ctx, VAContextID context,
case VAProcFilterDeinterlacing: {
VAProcFilterCapDeinterlacing *deint = filter_caps;
if (*num_filter_caps < 2) {
*num_filter_caps = 2;
if (*num_filter_caps < 3) {
*num_filter_caps = 3;
return VA_STATUS_ERROR_MAX_NUM_EXCEEDED;
}
deint[i++].type = VAProcDeinterlacingBob;
deint[i++].type = VAProcDeinterlacingWeave;
deint[i++].type = VAProcDeinterlacingMotionAdaptive;
break;
}
@ -750,9 +751,24 @@ vlVaQueryVideoProcPipelineCaps(VADriverContextP ctx, VAContextID context,
for (i = 0; i < num_filters; i++) {
vlVaBuffer *buf = handle_table_get(VL_VA_DRIVER(ctx)->htab, filters[i]);
VAProcFilterParameterBufferBase *filter;
if (!buf || buf->type >= VABufferTypeMax)
if (!buf || buf->type != VAProcFilterParameterBufferType)
return VA_STATUS_ERROR_INVALID_BUFFER;
filter = buf->data;
switch (filter->type) {
case VAProcFilterDeinterlacing: {
VAProcFilterParameterBufferDeinterlacing *deint = buf->data;
if (deint->algorithm == VAProcDeinterlacingMotionAdaptive) {
pipeline_cap->num_forward_references = 1;
pipeline_cap->num_backward_references = 2;
}
break;
}
default:
return VA_STATUS_ERROR_UNIMPLEMENTED;
}
}
return VA_STATUS_SUCCESS;

View File

@ -236,6 +236,8 @@ typedef struct {
VAPictureParameterBufferMPEG4 pps;
uint8_t start_code[32];
} mpeg4;
struct vl_deint_filter *deint;
} vlVaContext;
typedef struct {

View File

@ -699,17 +699,18 @@ struct ast_type_qualifier {
bool merge_qualifier(YYLTYPE *loc,
_mesa_glsl_parse_state *state,
const ast_type_qualifier &q);
const ast_type_qualifier &q,
bool is_single_layout_merge);
bool merge_out_qualifier(YYLTYPE *loc,
_mesa_glsl_parse_state *state,
const ast_type_qualifier &q,
ast_node* &node);
ast_node* &node, bool create_node);
bool merge_in_qualifier(YYLTYPE *loc,
_mesa_glsl_parse_state *state,
const ast_type_qualifier &q,
ast_node* &node);
ast_node* &node, bool create_node);
ast_subroutine_list *subroutine_list;
};

View File

@ -487,15 +487,17 @@ unary_arithmetic_result_type(const struct glsl_type *type,
* If the given types to the bit-logic operator are invalid, return
* glsl_type::error_type.
*
* \param type_a Type of LHS of bit-logic op
* \param type_b Type of RHS of bit-logic op
* \param value_a LHS of bit-logic op
* \param value_b RHS of bit-logic op
*/
static const struct glsl_type *
bit_logic_result_type(const struct glsl_type *type_a,
const struct glsl_type *type_b,
bit_logic_result_type(ir_rvalue * &value_a, ir_rvalue * &value_b,
ast_operators op,
struct _mesa_glsl_parse_state *state, YYLTYPE *loc)
{
const glsl_type *type_a = value_a->type;
const glsl_type *type_b = value_b->type;
if (!state->check_bitwise_operations_allowed(loc)) {
return glsl_type::error_type;
}
@ -517,6 +519,36 @@ bit_logic_result_type(const struct glsl_type *type_a,
return glsl_type::error_type;
}
/* Prior to GLSL 4.0 / GL_ARB_gpu_shader5, implicit conversions didn't
* make sense for bitwise operations, as they don't operate on floats.
*
* GLSL 4.0 added implicit int -> uint conversions, which are relevant
* here. It wasn't clear whether or not we should apply them to bitwise
* operations. However, Khronos has decided that they should in future
* language revisions. Applications also rely on this behavior. We opt
* to apply them in general, but issue a portability warning.
*
* See https://www.khronos.org/bugzilla/show_bug.cgi?id=1405
*/
if (type_a->base_type != type_b->base_type) {
if (!apply_implicit_conversion(type_a, value_b, state)
&& !apply_implicit_conversion(type_b, value_a, state)) {
_mesa_glsl_error(loc, state,
"could not implicitly convert operands to "
"`%s` operator",
ast_expression::operator_string(op));
return glsl_type::error_type;
} else {
_mesa_glsl_warning(loc, state,
"some implementations may not support implicit "
"int -> uint conversions for `%s' operators; "
"consider casting explicitly for portability",
ast_expression::operator_string(op));
}
type_a = value_a->type;
type_b = value_b->type;
}
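/* Hypothetical GLSL illustration of the conversion applied above: with
 * `int i; uint u;`, the operands of (u & i) now get i implicitly
 * converted to uint (mirroring C's usual arithmetic conversions, where
 * 0x0fu & -1 evaluates to 0x0fu), and only a portability warning is
 * issued instead of a hard error.
 */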
/* "The fundamental types of the operands (signed or unsigned) must
* match,"
*/
@ -1435,8 +1467,7 @@ ast_expression::do_hir(exec_list *instructions,
case ast_bit_or:
op[0] = this->subexpressions[0]->hir(instructions, state);
op[1] = this->subexpressions[1]->hir(instructions, state);
type = bit_logic_result_type(op[0]->type, op[1]->type, this->oper,
state, &loc);
type = bit_logic_result_type(op[0], op[1], this->oper, state, &loc);
result = new(ctx) ir_expression(operations[this->oper], type,
op[0], op[1]);
error_emitted = op[0]->type->is_error() || op[1]->type->is_error();
@ -1626,8 +1657,7 @@ ast_expression::do_hir(exec_list *instructions,
case ast_or_assign: {
op[0] = this->subexpressions[0]->hir(instructions, state);
op[1] = this->subexpressions[1]->hir(instructions, state);
type = bit_logic_result_type(op[0]->type, op[1]->type, this->oper,
state, &loc);
type = bit_logic_result_type(op[0], op[1], this->oper, state, &loc);
ir_rvalue *temp_rhs = new(ctx) ir_expression(operations[this->oper],
type, op[0], op[1]);
error_emitted =
@ -6329,7 +6359,7 @@ ast_process_struct_or_iface_block_members(exec_list *instructions,
qual_stream != block_stream) {
_mesa_glsl_error(&loc, state, "stream layout qualifier on "
"interface block member does not match "
"the interface block (%d vs %d)", qual->stream,
"the interface block (%u vs %u)", qual_stream,
block_stream);
}
}

View File

@ -74,9 +74,11 @@ ast_type_qualifier::has_layout() const
|| this->flags.q.row_major
|| this->flags.q.packed
|| this->flags.q.explicit_location
|| this->flags.q.explicit_image_format
|| this->flags.q.explicit_index
|| this->flags.q.explicit_binding
|| this->flags.q.explicit_offset;
|| this->flags.q.explicit_offset
|| this->flags.q.explicit_stream;
}
bool
@ -113,10 +115,16 @@ ast_type_qualifier::interpolation_string() const
return NULL;
}
/**
* This function merges both duplicate identifiers within a single layout and
* multiple layout qualifiers on a single variable declaration. The
* is_single_layout_merge param is used to differentiate between the two.
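*
* For example (illustrative): layout(points, invocations = 2) in; is a
* single-layout merge of two qualifier ids, whereas
* layout(points) layout(invocations = 2) in; merges two separate
* layout(...) lists.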
*/
bool
ast_type_qualifier::merge_qualifier(YYLTYPE *loc,
_mesa_glsl_parse_state *state,
const ast_type_qualifier &q)
const ast_type_qualifier &q,
bool is_single_layout_merge)
{
ast_type_qualifier ubo_mat_mask;
ubo_mat_mask.flags.i = 0;
@ -156,7 +164,8 @@ ast_type_qualifier::merge_qualifier(YYLTYPE *loc,
allowed_duplicates_mask.flags.i |=
stream_layout_mask.flags.i;
if ((this->flags.i & q.flags.i & ~allowed_duplicates_mask.flags.i) != 0) {
if (is_single_layout_merge && !state->has_enhanced_layouts() &&
(this->flags.i & q.flags.i & ~allowed_duplicates_mask.flags.i) != 0) {
_mesa_glsl_error(loc, state,
"duplicate layout qualifiers used");
return false;
@ -207,11 +216,6 @@ ast_type_qualifier::merge_qualifier(YYLTYPE *loc,
this->flags.q.stream = 1;
this->stream = state->out_qualifier->stream;
}
} else {
if (q.flags.q.explicit_stream) {
_mesa_glsl_error(loc, state,
"duplicate layout `stream' qualifier");
}
}
}
@ -294,13 +298,35 @@ bool
ast_type_qualifier::merge_out_qualifier(YYLTYPE *loc,
_mesa_glsl_parse_state *state,
const ast_type_qualifier &q,
ast_node* &node)
ast_node* &node, bool create_node)
{
void *mem_ctx = state;
const bool r = this->merge_qualifier(loc, state, q);
const bool r = this->merge_qualifier(loc, state, q, false);
if (state->stage == MESA_SHADER_TESS_CTRL) {
node = new(mem_ctx) ast_tcs_output_layout(*loc);
if (state->stage == MESA_SHADER_GEOMETRY) {
if (q.flags.q.prim_type) {
/* Make sure this is a valid output primitive type. */
switch (q.prim_type) {
case GL_POINTS:
case GL_LINE_STRIP:
case GL_TRIANGLE_STRIP:
break;
default:
_mesa_glsl_error(loc, state, "invalid geometry shader output "
"primitive type");
break;
}
}
/* Allow future assignments of global out's stream id value */
this->flags.q.explicit_stream = 0;
} else if (state->stage == MESA_SHADER_TESS_CTRL) {
if (create_node) {
node = new(mem_ctx) ast_tcs_output_layout(*loc);
}
} else {
_mesa_glsl_error(loc, state, "out layout qualifiers only valid in "
"tessellation control or geometry shaders");
}
return r;
@ -310,7 +336,7 @@ bool
ast_type_qualifier::merge_in_qualifier(YYLTYPE *loc,
_mesa_glsl_parse_state *state,
const ast_type_qualifier &q,
ast_node* &node)
ast_node* &node, bool create_node)
{
void *mem_ctx = state;
bool create_gs_ast = false;
@ -450,10 +476,12 @@ ast_type_qualifier::merge_in_qualifier(YYLTYPE *loc,
this->point_mode = q.point_mode;
}
if (create_gs_ast) {
node = new(mem_ctx) ast_gs_input_layout(*loc, q.prim_type);
} else if (create_cs_ast) {
node = new(mem_ctx) ast_cs_input_layout(*loc, q.local_size);
if (create_node) {
if (create_gs_ast) {
node = new(mem_ctx) ast_gs_input_layout(*loc, q.prim_type);
} else if (create_cs_ast) {
node = new(mem_ctx) ast_cs_input_layout(*loc, q.local_size);
}
}
return true;

View File

@ -299,6 +299,10 @@ static bool match_layout_qualifier(const char *s1, const char *s2,
%type <node> for_init_statement
%type <for_rest_statement> for_rest_statement
%type <node> layout_defaults
%type <node> layout_uniform_defaults
%type <node> layout_buffer_defaults
%type <node> layout_in_defaults
%type <node> layout_out_defaults
%right THEN ELSE
%%
@ -953,7 +957,7 @@ parameter_qualifier:
"or precise");
$$ = $1;
$$.merge_qualifier(&@1, state, $2);
$$.merge_qualifier(&@1, state, $2, false);
}
| precision_qualifier parameter_qualifier
{
@ -970,7 +974,7 @@ parameter_qualifier:
| memory_qualifier parameter_qualifier
{
$$ = $1;
$$.merge_qualifier(&@1, state, $2);
$$.merge_qualifier(&@1, state, $2, false);
}
parameter_direction_qualifier:
@ -1149,7 +1153,7 @@ layout_qualifier_id_list:
| layout_qualifier_id_list ',' layout_qualifier_id
{
$$ = $1;
if (!$$.merge_qualifier(& @3, state, $3)) {
if (!$$.merge_qualifier(& @3, state, $3, true)) {
YYERROR;
}
}
@ -1758,7 +1762,7 @@ type_qualifier:
}
$$ = $1;
$$.merge_qualifier(&@1, state, $2);
$$.merge_qualifier(&@1, state, $2, false);
}
| layout_qualifier type_qualifier
{
@ -1775,12 +1779,12 @@ type_qualifier:
_mesa_glsl_error(&@1, state, "duplicate layout(...) qualifiers");
$$ = $1;
$$.merge_qualifier(&@1, state, $2);
$$.merge_qualifier(&@1, state, $2, false);
}
| subroutine_qualifier type_qualifier
{
$$ = $1;
$$.merge_qualifier(&@1, state, $2);
$$.merge_qualifier(&@1, state, $2, false);
}
| auxiliary_storage_qualifier type_qualifier
{
@ -1796,7 +1800,7 @@ type_qualifier:
"just before storage qualifiers");
}
$$ = $1;
$$.merge_qualifier(&@1, state, $2);
$$.merge_qualifier(&@1, state, $2, false);
}
| storage_qualifier type_qualifier
{
@ -1816,7 +1820,7 @@ type_qualifier:
}
$$ = $1;
$$.merge_qualifier(&@1, state, $2);
$$.merge_qualifier(&@1, state, $2, false);
}
| precision_qualifier type_qualifier
{
@ -1833,7 +1837,7 @@ type_qualifier:
| memory_qualifier type_qualifier
{
$$ = $1;
$$.merge_qualifier(&@1, state, $2);
$$.merge_qualifier(&@1, state, $2, false);
}
;
@ -2585,7 +2589,7 @@ interface_block:
YYERROR;
}
if (!block->layout.merge_qualifier(& @1, state, $1)) {
if (!block->layout.merge_qualifier(& @1, state, $1, false)) {
YYERROR;
}
@ -2602,7 +2606,7 @@ interface_block:
"memory qualifiers can only be used in the "
"declaration of shader storage blocks");
}
if (!block->layout.merge_qualifier(& @1, state, $1)) {
if (!block->layout.merge_qualifier(& @1, state, $1, false)) {
YYERROR;
}
$$ = block;
@ -2737,18 +2741,48 @@ member_declaration:
}
;
layout_defaults:
layout_qualifier UNIFORM ';'
layout_uniform_defaults:
layout_qualifier layout_uniform_defaults
{
if (!state->default_uniform_qualifier->merge_qualifier(& @1, state, $1)) {
$$ = NULL;
if (!state->has_420pack_or_es31()) {
_mesa_glsl_error(&@1, state, "duplicate layout(...) qualifiers");
YYERROR;
} else {
if (!state->default_uniform_qualifier->
merge_qualifier(& @1, state, $1, false)) {
YYERROR;
}
}
}
| layout_qualifier UNIFORM ';'
{
if (!state->default_uniform_qualifier->
merge_qualifier(& @1, state, $1, false)) {
YYERROR;
}
$$ = NULL;
}
;
layout_buffer_defaults:
layout_qualifier layout_buffer_defaults
{
$$ = NULL;
if (!state->has_420pack_or_es31()) {
_mesa_glsl_error(&@1, state, "duplicate layout(...) qualifiers");
YYERROR;
} else {
if (!state->default_shader_storage_qualifier->
merge_qualifier(& @1, state, $1, false)) {
YYERROR;
}
}
}
| layout_qualifier BUFFER ';'
{
if (!state->default_shader_storage_qualifier->merge_qualifier(& @1, state, $1)) {
if (!state->default_shader_storage_qualifier->
merge_qualifier(& @1, state, $1, false)) {
YYERROR;
}
@ -2764,43 +2798,58 @@ layout_defaults:
$$ = NULL;
}
;
layout_in_defaults:
layout_qualifier layout_in_defaults
{
$$ = NULL;
if (!state->has_420pack_or_es31()) {
_mesa_glsl_error(&@1, state, "duplicate layout(...) qualifiers");
YYERROR;
} else {
if (!state->in_qualifier->
merge_in_qualifier(& @1, state, $1, $$, false)) {
YYERROR;
}
}
}
| layout_qualifier IN_TOK ';'
{
$$ = NULL;
if (!state->in_qualifier->merge_in_qualifier(& @1, state, $1, $$)) {
if (!state->in_qualifier->
merge_in_qualifier(& @1, state, $1, $$, true)) {
YYERROR;
}
}
;
layout_out_defaults:
layout_qualifier layout_out_defaults
{
$$ = NULL;
if (!state->has_420pack_or_es31()) {
_mesa_glsl_error(&@1, state, "duplicate layout(...) qualifiers");
YYERROR;
} else {
if (!state->out_qualifier->
merge_out_qualifier(& @1, state, $1, $$, false)) {
YYERROR;
}
}
}
| layout_qualifier OUT_TOK ';'
{
$$ = NULL;
if (state->stage == MESA_SHADER_GEOMETRY) {
if ($1.flags.q.prim_type) {
/* Make sure this is a valid output primitive type. */
switch ($1.prim_type) {
case GL_POINTS:
case GL_LINE_STRIP:
case GL_TRIANGLE_STRIP:
break;
default:
_mesa_glsl_error(&@1, state, "invalid geometry shader output "
"primitive type");
break;
}
}
if (!state->out_qualifier->merge_qualifier(& @1, state, $1))
YYERROR;
/* Allow future assignments of global out's stream id value */
state->out_qualifier->flags.q.explicit_stream = 0;
} else if (state->stage == MESA_SHADER_TESS_CTRL) {
if (!state->out_qualifier->merge_out_qualifier(& @1, state, $1, $$))
YYERROR;
} else {
_mesa_glsl_error(& @1, state,
"out layout qualifiers only valid in "
"tessellation control or geometry shaders");
}
if (!state->out_qualifier->
merge_out_qualifier(& @1, state, $1, $$, true))
YYERROR;
}
;
layout_defaults:
layout_uniform_defaults
| layout_buffer_defaults
| layout_in_defaults
| layout_out_defaults
;
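/* With ARB_shading_language_420pack / ESSL 3.10, these recursive rules
 * accept chained layout qualifiers on a default declaration, merging one
 * layout(...) list at a time, e.g. (illustrative):
 *
 *    layout(max_vertices = 3) layout(triangle_strip) out;
 */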

View File

@ -298,8 +298,6 @@ ir_expression::ir_expression(int op, ir_rvalue *op0)
break;
case ir_unop_noise:
case ir_unop_unpack_half_2x16_split_x:
case ir_unop_unpack_half_2x16_split_y:
this->type = glsl_type::float_type;
break;
@ -422,10 +420,6 @@ ir_expression::ir_expression(int op, ir_rvalue *op0, ir_rvalue *op1)
this->type = op0->type->get_base_type();
break;
case ir_binop_pack_half_2x16_split:
this->type = glsl_type::uint_type;
break;
case ir_binop_imul_high:
case ir_binop_carry:
case ir_binop_borrow:
@ -555,8 +549,6 @@ static const char *const operator_strs[] = {
"unpackUnorm2x16",
"unpackUnorm4x8",
"unpackHalf2x16",
"unpackHalf2x16_split_x",
"unpackHalf2x16_split_y",
"bitfield_reverse",
"bit_count",
"find_msb",
@ -599,7 +591,6 @@ static const char *const operator_strs[] = {
"min",
"max",
"pow",
"packHalf2x16_split",
"ubo_load",
"ldexp",
"vector_extract",

View File

@ -1401,16 +1401,6 @@ enum ir_expression_operation {
ir_unop_unpack_half_2x16,
/*@}*/
/**
* \name Lowered floating point unpacking operations.
*
* \see lower_packing_builtins_visitor::split_unpack_half_2x16
*/
/*@{*/
ir_unop_unpack_half_2x16_split_x,
ir_unop_unpack_half_2x16_split_y,
/*@}*/
/**
* \name Bit operations, part of ARB_gpu_shader5.
*/
@ -1541,15 +1531,6 @@ enum ir_expression_operation {
ir_binop_pow,
/**
* \name Lowered floating point packing operations.
*
* \see lower_packing_builtins_visitor::split_pack_half_2x16
*/
/*@{*/
ir_binop_pack_half_2x16_split,
/*@}*/
/**
* Load a value the size of a given GLSL type from a uniform block.
*

View File

@ -58,17 +58,14 @@ enum lower_packing_builtins_op {
LOWER_PACK_HALF_2x16 = 0x0010,
LOWER_UNPACK_HALF_2x16 = 0x0020,
LOWER_PACK_HALF_2x16_TO_SPLIT = 0x0040,
LOWER_UNPACK_HALF_2x16_TO_SPLIT = 0x0080,
LOWER_PACK_SNORM_4x8 = 0x0040,
LOWER_UNPACK_SNORM_4x8 = 0x0080,
LOWER_PACK_SNORM_4x8 = 0x0100,
LOWER_UNPACK_SNORM_4x8 = 0x0200,
LOWER_PACK_UNORM_4x8 = 0x0100,
LOWER_UNPACK_UNORM_4x8 = 0x0200,
LOWER_PACK_UNORM_4x8 = 0x0400,
LOWER_UNPACK_UNORM_4x8 = 0x0800,
LOWER_PACK_USE_BFI = 0x1000,
LOWER_PACK_USE_BFE = 0x2000,
LOWER_PACK_USE_BFI = 0x0400,
LOWER_PACK_USE_BFE = 0x0800,
};
bool do_common_optimization(exec_list *ir, bool linked,

View File

@ -372,12 +372,6 @@ ir_validate::visit_leave(ir_expression *ir)
assert(ir->operands[0]->type == glsl_type::uint_type);
break;
case ir_unop_unpack_half_2x16_split_x:
case ir_unop_unpack_half_2x16_split_y:
assert(ir->type == glsl_type::float_type);
assert(ir->operands[0]->type == glsl_type::uint_type);
break;
case ir_unop_unpack_double_2x32:
assert(ir->type == glsl_type::uvec2_type);
assert(ir->operands[0]->type == glsl_type::double_type);
@ -567,12 +561,6 @@ ir_validate::visit_leave(ir_expression *ir)
assert(ir->operands[0]->type == ir->operands[1]->type);
break;
case ir_binop_pack_half_2x16_split:
assert(ir->type == glsl_type::uint_type);
assert(ir->operands[0]->type == glsl_type::float_type);
assert(ir->operands[1]->type == glsl_type::float_type);
break;
case ir_binop_ubo_load:
assert(ir->operands[0]->type == glsl_type::uint_type);

View File

@ -968,10 +968,12 @@ varying_matches::record(ir_variable *producer_var, ir_variable *consumer_var)
}
if ((consumer_var == NULL && producer_var->type->contains_integer()) ||
consumer_stage != MESA_SHADER_FRAGMENT) {
(consumer_stage != -1 && consumer_stage != MESA_SHADER_FRAGMENT)) {
/* Since this varying is not being consumed by the fragment shader, its
* interpolation type varying cannot possibly affect rendering. Also,
* this variable is non-flat and is (or contains) an integer.
* interpolation type cannot possibly affect rendering.
* Also, this variable is non-flat and is (or contains) an integer.
* If the consumer stage is unknown, don't modify the interpolation
* type as it could affect rendering later with separate shaders.
*
* lower_packed_varyings requires all integer varyings to be flat,
* regardless of where they appear. We can trivially satisfy that

View File

@ -992,7 +992,17 @@ cross_validate_globals(struct gl_shader_program *prog,
existing->data.location = var->data.location;
existing->data.explicit_location = true;
}
} else {
/* Check if a uniform with an implicit location was marked explicit
* by an earlier shader stage. If so, mark it explicit in this stage
* too, to make sure later processing does not treat it as an
* implicit one.
*/
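/* Illustrative example (hypothetical uniform name): if the vertex
* shader declares
*
*    layout(location = 2) uniform float scale;
*
* and the fragment shader declares a plain
*
*    uniform float scale;
*
* the fragment shader's copy inherits location 2 here.
*/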
if (existing->data.explicit_location) {
var->data.location = existing->data.location;
var->data.explicit_location = true;
}
}
/* From the GLSL 4.20 specification:
* "A link error will result if two compilation units in a program
@ -3152,7 +3162,7 @@ check_explicit_uniform_locations(struct gl_context *ctx,
if (var->data.explicit_location) {
bool ret;
if (var->type->is_subroutine())
if (var->type->without_array()->is_subroutine())
ret = reserve_subroutine_explicit_locations(prog, sh, var);
else
ret = reserve_explicit_locations(prog, uniform_map, var);

View File

@ -43,13 +43,6 @@ public:
: op_mask(op_mask),
progress(false)
{
/* Mutually exclusive options. */
assert(!((op_mask & LOWER_PACK_HALF_2x16) &&
(op_mask & LOWER_PACK_HALF_2x16_TO_SPLIT)));
assert(!((op_mask & LOWER_UNPACK_HALF_2x16) &&
(op_mask & LOWER_UNPACK_HALF_2x16_TO_SPLIT)));
factory.instructions = &factory_instructions;
}
@ -96,9 +89,6 @@ public:
case LOWER_PACK_HALF_2x16:
*rvalue = lower_pack_half_2x16(op0);
break;
case LOWER_PACK_HALF_2x16_TO_SPLIT:
*rvalue = split_pack_half_2x16(op0);
break;
case LOWER_UNPACK_SNORM_2x16:
*rvalue = lower_unpack_snorm_2x16(op0);
break;
@ -114,9 +104,6 @@ public:
case LOWER_UNPACK_HALF_2x16:
*rvalue = lower_unpack_half_2x16(op0);
break;
case LOWER_UNPACK_HALF_2x16_TO_SPLIT:
*rvalue = split_unpack_half_2x16(op0);
break;
case LOWER_PACK_UNPACK_NONE:
case LOWER_PACK_USE_BFI:
case LOWER_PACK_USE_BFE:
@ -161,7 +148,7 @@ private:
result = op_mask & LOWER_PACK_UNORM_4x8;
break;
case ir_unop_pack_half_2x16:
result = op_mask & (LOWER_PACK_HALF_2x16 | LOWER_PACK_HALF_2x16_TO_SPLIT);
result = op_mask & LOWER_PACK_HALF_2x16;
break;
case ir_unop_unpack_snorm_2x16:
result = op_mask & LOWER_UNPACK_SNORM_2x16;
@ -176,7 +163,7 @@ private:
result = op_mask & LOWER_UNPACK_UNORM_4x8;
break;
case ir_unop_unpack_half_2x16:
result = op_mask & (LOWER_UNPACK_HALF_2x16 | LOWER_UNPACK_HALF_2x16_TO_SPLIT);
result = op_mask & LOWER_UNPACK_HALF_2x16;
break;
default:
result = LOWER_PACK_UNPACK_NONE;
@ -1092,41 +1079,6 @@ private:
return result;
}
/**
* \brief Split packHalf2x16's vec2 operand into two floats.
*
* \param vec2_rval is packHalf2x16's input
* \return a uint rvalue
*
* Some code generators, such as the i965 fragment shader, require that all
* vector expressions be lowered to a sequence of scalar expressions.
* However, packHalf2x16 cannot be scalarized by the same mechanism as
* a true vector operation because its input and output have a differing
* number of vector components.
*
* This method scalarizes packHalf2x16 by transforming it from a unary
* operation having vector input to a binary operation having scalar input.
* That is, it transforms
*
* packHalf2x16(VEC2_RVAL);
*
* into
*
* vec2 v = VEC2_RVAL;
* return packHalf2x16_split(v.x, v.y);
*/
ir_rvalue*
split_pack_half_2x16(ir_rvalue *vec2_rval)
{
assert(vec2_rval->type == glsl_type::vec2_type);
ir_variable *v = factory.make_temp(glsl_type::vec2_type,
"tmp_split_pack_half_2x16_v");
factory.emit(assign(v, vec2_rval));
return expr(ir_binop_pack_half_2x16_split, swizzle_x(v), swizzle_y(v));
}
/**
* \brief Lower the component-wise calculation of unpackHalf2x16.
*
@ -1341,59 +1293,6 @@ private:
assert(result->type == glsl_type::vec2_type);
return result;
}
/**
* \brief Split unpackHalf2x16 into two operations.
*
* \param uint_rval is unpackHalf2x16's input
* \return a vec2 rvalue
*
* Some code generators, such as the i965 fragment shader, require that all
* vector expressions be lowered to a sequence of scalar expressions.
* However, unpackHalf2x16 cannot be scalarized by the same method as
* a true vector operation because the number of components of its input
* and output differ.
*
* This method scalarizes unpackHalf2x16 by transforming it from a single
* operation having vec2 output to a pair of operations each having float
* output. That is, it transforms
*
* unpackHalf2x16(UINT_RVAL)
*
* into
*
* uint u = UINT_RVAL;
* vec2 v;
*
* v.x = unpackHalf2x16_split_x(u);
* v.y = unpackHalf2x16_split_y(u);
*
* return v;
*/
ir_rvalue*
split_unpack_half_2x16(ir_rvalue *uint_rval)
{
assert(uint_rval->type == glsl_type::uint_type);
/* uint u = uint_rval; */
ir_variable *u = factory.make_temp(glsl_type::uint_type,
"tmp_split_unpack_half_2x16_u");
factory.emit(assign(u, uint_rval));
/* vec2 v; */
ir_variable *v = factory.make_temp(glsl_type::vec2_type,
"tmp_split_unpack_half_2x16_v");
/* v.x = unpack_half_2x16_split_x(u); */
factory.emit(assign(v, expr(ir_unop_unpack_half_2x16_split_x, u),
WRITEMASK_X));
/* v.y = unpack_half_2x16_split_y(u); */
factory.emit(assign(v, expr(ir_unop_unpack_half_2x16_split_y, u),
WRITEMASK_Y));
return deref(v).val;
}
};
} // namespace anonymous

View File

@ -44,6 +44,7 @@ public:
}
ir_visitor_status visit_leave(ir_call *);
ir_call *call_clone(ir_call *call, ir_function_signature *callee);
bool progress;
struct _mesa_glsl_parse_state *state;
};
@ -58,6 +59,23 @@ lower_subroutine(exec_list *instructions, struct _mesa_glsl_parse_state *state)
return v.progress;
}
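/* Clone an ir_call, duplicating its return deref and actual parameters,
* so each branch of the lowered if-ladder below owns distinct IR nodes. */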
ir_call *
lower_subroutine_visitor::call_clone(ir_call *call, ir_function_signature *callee)
{
void *mem_ctx = ralloc_parent(call);
ir_dereference_variable *new_return_ref = NULL;
if (call->return_deref != NULL)
new_return_ref = call->return_deref->clone(mem_ctx, NULL);
exec_list new_parameters;
foreach_in_list(ir_instruction, ir, &call->actual_parameters) {
new_parameters.push_tail(ir->clone(mem_ctx, NULL));
}
return new(mem_ctx) ir_call(callee, new_return_ref, &new_parameters);
}
ir_visitor_status
lower_subroutine_visitor::visit_leave(ir_call *ir)
{
@ -66,7 +84,6 @@ lower_subroutine_visitor::visit_leave(ir_call *ir)
void *mem_ctx = ralloc_parent(ir);
ir_if *last_branch = NULL;
ir_dereference_variable *return_deref = ir->return_deref;
for (int s = this->state->num_subroutines - 1; s >= 0; s--) {
ir_rvalue *var;
@ -92,14 +109,11 @@ lower_subroutine_visitor::visit_leave(ir_call *ir)
fn->exact_matching_signature(this->state,
&ir->actual_parameters);
ir_call *new_call = new(mem_ctx) ir_call(sub_sig, return_deref, &ir->actual_parameters);
ir_call *new_call = call_clone(ir, sub_sig);
if (!last_branch)
last_branch = if_tree(equal(subr_to_int(var), lc), new_call);
else
last_branch = if_tree(equal(subr_to_int(var), lc), new_call, last_branch);
if (return_deref && s > 0)
return_deref = return_deref->clone(mem_ctx, NULL);
}
if (last_branch)
ir->insert_before(last_branch);

View File

@ -1442,12 +1442,6 @@ nir_visitor::visit(ir_expression *ir)
case ir_unop_unpack_half_2x16:
result = nir_unpack_half_2x16(&b, srcs[0]);
break;
case ir_unop_unpack_half_2x16_split_x:
result = nir_unpack_half_2x16_split_x(&b, srcs[0]);
break;
case ir_unop_unpack_half_2x16_split_y:
result = nir_unpack_half_2x16_split_y(&b, srcs[0]);
break;
case ir_unop_bitfield_reverse:
result = nir_bitfield_reverse(&b, srcs[0]);
break;
@ -1731,9 +1725,6 @@ nir_visitor::visit(ir_expression *ir)
}
break;
case ir_binop_pack_half_2x16_split:
result = nir_pack_half_2x16_split(&b, srcs[0], srcs[1]);
break;
case ir_binop_ldexp: result = nir_ldexp(&b, srcs[0], srcs[1]); break;
case ir_triop_fma:
result = nir_ffma(&b, srcs[0], srcs[1], srcs[2]);

View File

@ -140,7 +140,7 @@ typedef enum {
* ir_variable - it should be easy to translate between the two.
*/
typedef struct {
typedef struct nir_variable {
struct exec_node node;
/**
@ -383,7 +383,7 @@ nir_variable_get_io_mask(nir_variable *var, gl_shader_stage stage)
return ((1ull << slots) - 1) << var->data.location;
}
typedef struct {
typedef struct nir_register {
struct exec_node node;
unsigned num_components; /** < number of vector components */
@ -477,7 +477,7 @@ nir_instr_is_last(nir_instr *instr)
return exec_node_is_tail_sentinel(exec_node_get_next(&instr->node));
}
typedef struct {
typedef struct nir_ssa_def {
/** for debugging only, can be NULL */
const char* name;
@ -1530,6 +1530,20 @@ typedef struct nir_shader_compiler_options {
/** lowers ffract to fsub+ffloor: */
bool lower_ffract;
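/* Lower the following pack/unpack opcodes to sequences of
* pack_*_split, pack_uvec*_to_uint and extract_* operations
* (see nir_lower_alu_to_scalar and nir_opt_algebraic). */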
bool lower_pack_half_2x16;
bool lower_pack_unorm_2x16;
bool lower_pack_snorm_2x16;
bool lower_pack_unorm_4x8;
bool lower_pack_snorm_4x8;
bool lower_unpack_half_2x16;
bool lower_unpack_unorm_2x16;
bool lower_unpack_snorm_2x16;
bool lower_unpack_unorm_4x8;
bool lower_unpack_snorm_4x8;
bool lower_extract_byte;
bool lower_extract_word;
/**
* Does the driver support real 32-bit integers? (Otherwise, integers
* are simulated by floats.)

View File

@ -134,6 +134,20 @@ nir_imm_int(nir_builder *build, int x)
return nir_build_imm(build, 1, v);
}
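/* Builds a constant ivec4 from four integer components, analogous to
* nir_imm_int() above. */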
static inline nir_ssa_def *
nir_imm_ivec4(nir_builder *build, int x, int y, int z, int w)
{
nir_const_value v;
memset(&v, 0, sizeof(v));
v.i[0] = x;
v.i[1] = y;
v.i[2] = z;
v.i[3] = w;
return nir_build_imm(build, 4, v);
}
static inline nir_ssa_def *
nir_build_alu(nir_builder *build, nir_op op, nir_ssa_def *src0,
nir_ssa_def *src1, nir_ssa_def *src2, nir_ssa_def *src3)

View File

@ -97,6 +97,20 @@ lower_alu_instr_scalar(nir_alu_instr *instr, nir_builder *b)
*/
return;
case nir_op_pack_half_2x16:
if (!b->shader->options->lower_pack_half_2x16)
return;
nir_ssa_def *val =
nir_pack_half_2x16_split(b, nir_channel(b, instr->src[0].src.ssa,
instr->src[0].swizzle[0]),
nir_channel(b, instr->src[0].src.ssa,
instr->src[0].swizzle[1]));
nir_ssa_def_rewrite_uses(&instr->dest.dest.ssa, nir_src_for_ssa(val));
nir_instr_remove(&instr->instr);
return;
case nir_op_unpack_unorm_4x8:
case nir_op_unpack_snorm_4x8:
case nir_op_unpack_unorm_2x16:
@ -106,11 +120,51 @@ lower_alu_instr_scalar(nir_alu_instr *instr, nir_builder *b)
*/
return;
case nir_op_unpack_half_2x16:
/* We could split this into unpack_half_2x16_split_[xy], but should
* we?
*/
case nir_op_unpack_half_2x16: {
if (!b->shader->options->lower_unpack_half_2x16)
return;
nir_ssa_def *comps[2];
comps[0] = nir_unpack_half_2x16_split_x(b, instr->src[0].src.ssa);
comps[1] = nir_unpack_half_2x16_split_y(b, instr->src[0].src.ssa);
nir_ssa_def *vec = nir_vec(b, comps, 2);
nir_ssa_def_rewrite_uses(&instr->dest.dest.ssa, nir_src_for_ssa(vec));
nir_instr_remove(&instr->instr);
return;
}
case nir_op_pack_uvec2_to_uint: {
assert(b->shader->options->lower_pack_snorm_2x16 ||
b->shader->options->lower_pack_unorm_2x16);
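/* word.x/word.y hold the low 16 bits of each input channel; combine
* them as (word.y << 16) | word.x. */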
nir_ssa_def *word =
nir_extract_uword(b, instr->src[0].src.ssa, nir_imm_int(b, 0));
nir_ssa_def *val =
nir_ior(b, nir_ishl(b, nir_channel(b, word, 1), nir_imm_int(b, 16)),
nir_channel(b, word, 0));
nir_ssa_def_rewrite_uses(&instr->dest.dest.ssa, nir_src_for_ssa(val));
nir_instr_remove(&instr->instr);
break;
}
case nir_op_pack_uvec4_to_uint: {
assert(b->shader->options->lower_pack_snorm_4x8 ||
b->shader->options->lower_pack_unorm_4x8);
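/* byte.{x,y,z,w} hold the low 8 bits of each input channel; pack them
* as (w << 24) | (z << 16) | (y << 8) | x. */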
nir_ssa_def *byte =
nir_extract_ubyte(b, instr->src[0].src.ssa, nir_imm_int(b, 0));
nir_ssa_def *val =
nir_ior(b, nir_ior(b, nir_ishl(b, nir_channel(b, byte, 3), nir_imm_int(b, 24)),
nir_ishl(b, nir_channel(b, byte, 2), nir_imm_int(b, 16))),
nir_ior(b, nir_ishl(b, nir_channel(b, byte, 1), nir_imm_int(b, 8)),
nir_channel(b, byte, 0)));
nir_ssa_def_rewrite_uses(&instr->dest.dest.ssa, nir_src_for_ssa(val));
nir_instr_remove(&instr->instr);
break;
}
case nir_op_fdph: {
nir_ssa_def *sum[4];

View File

@ -105,7 +105,7 @@ def opcode(name, output_size, output_type, input_sizes, input_types,
opcodes[name] = Opcode(name, output_size, output_type, input_sizes,
input_types, algebraic_properties, const_expr)
def unop_convert(name, in_type, out_type, const_expr):
def unop_convert(name, out_type, in_type, const_expr):
opcode(name, 0, out_type, [0], [in_type], "", const_expr)
def unop(name, ty, const_expr):
@ -155,17 +155,17 @@ unop("frsq", tfloat, "1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "sqrtf(src0)")
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")
unop_convert("f2i", tfloat, tint, "src0") # Float-to-integer conversion.
unop_convert("f2u", tfloat, tuint, "src0") # Float-to-unsigned conversion
unop_convert("i2f", tint, tfloat, "src0") # Integer-to-float conversion.
unop_convert("f2i", tint, tfloat, "src0") # Float-to-integer conversion.
unop_convert("f2u", tuint, tfloat, "src0") # Float-to-unsigned conversion
unop_convert("i2f", tfloat, tint, "src0") # Integer-to-float conversion.
# Float-to-boolean conversion
unop_convert("f2b", tfloat, tbool, "src0 != 0.0f")
unop_convert("f2b", tbool, tfloat, "src0 != 0.0f")
# Boolean-to-float conversion
unop_convert("b2f", tbool, tfloat, "src0 ? 1.0f : 0.0f")
unop_convert("b2f", tfloat, tbool, "src0 ? 1.0f : 0.0f")
# Int-to-boolean conversion
unop_convert("i2b", tint, tbool, "src0 != 0")
unop_convert("b2i", tbool, tint, "src0 ? 1 : 0") # Boolean-to-int conversion
unop_convert("u2f", tuint, tfloat, "src0") # Unsigned-to-float conversion.
unop_convert("i2b", tbool, tint, "src0 != 0")
unop_convert("b2i", tint, tbool, "src0 ? 1 : 0") # Boolean-to-int conversion
unop_convert("u2f", tfloat, tuint, "src0") # Unsigned-to-float conversion.
# Unary floating-point rounding operations.
@ -238,6 +238,16 @@ unpack_2x16("unorm")
unpack_4x8("unorm")
unpack_2x16("half")
unop_horiz("pack_uvec2_to_uint", 0, tuint, 2, tuint, """
dst = (src0.x & 0xffff) | (src0.y >> 16);
""")
unop_horiz("pack_uvec4_to_uint", 0, tuint, 4, tuint, """
dst = (src0.x << 0) |
(src0.y << 8) |
(src0.z << 16) |
(src0.w << 24);
""")
# Lowered floating point unpacking operations.
@ -265,7 +275,7 @@ for (unsigned bit = 0; bit < 32; bit++) {
}
""")
unop_convert("ufind_msb", tuint, tint, """
unop_convert("ufind_msb", tint, tuint, """
dst = -1;
for (int bit = 31; bit > 0; bit--) {
if ((src0 >> bit) & 1) {
@ -551,6 +561,15 @@ dst.x = src0.x;
dst.y = src1.x;
""")
# Byte extraction
binop("extract_ubyte", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
binop("extract_ibyte", tint, "", "(int8_t)(src0 >> (src1 * 8))")
# Word extraction
binop("extract_uword", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
binop("extract_iword", tint, "", "(int16_t)(src0 >> (src1 * 16))")
def triop(name, ty, const_expr):
opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], "", const_expr)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):

View File

@ -245,6 +245,70 @@ optimizations = [
('bcsel', ('ult', 31, 'bits'), 'value',
('ubfe', 'value', 'offset', 'bits')),
'options->lower_bitfield_extract'),
(('extract_ibyte', a, b),
('ishr', ('ishl', a, ('imul', ('isub', 3, b), 8)), 24),
'options->lower_extract_byte'),
(('extract_ubyte', a, b),
('iand', ('ushr', a, ('imul', b, 8)), 0xff),
'options->lower_extract_byte'),
(('extract_iword', a, b),
('ishr', ('ishl', a, ('imul', ('isub', 1, b), 16)), 16),
'options->lower_extract_word'),
(('extract_uword', a, b),
('iand', ('ushr', a, ('imul', b, 16)), 0xffff),
'options->lower_extract_word'),
(('pack_unorm_2x16', 'v'),
('pack_uvec2_to_uint',
('f2u', ('fround_even', ('fmul', ('fsat', 'v'), 65535.0)))),
'options->lower_pack_unorm_2x16'),
(('pack_unorm_4x8', 'v'),
('pack_uvec4_to_uint',
('f2u', ('fround_even', ('fmul', ('fsat', 'v'), 255.0)))),
'options->lower_pack_unorm_4x8'),
(('pack_snorm_2x16', 'v'),
('pack_uvec2_to_uint',
('f2i', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 32767.0)))),
'options->lower_pack_snorm_2x16'),
(('pack_snorm_4x8', 'v'),
('pack_uvec4_to_uint',
('f2i', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 127.0)))),
'options->lower_pack_snorm_4x8'),
(('unpack_unorm_2x16', 'v'),
('fdiv', ('u2f', ('vec4', ('extract_uword', 'v', 0),
('extract_uword', 'v', 1), 0, 0)),
65535.0),
'options->lower_unpack_unorm_2x16'),
(('unpack_unorm_4x8', 'v'),
('fdiv', ('u2f', ('vec4', ('extract_ubyte', 'v', 0),
('extract_ubyte', 'v', 1),
('extract_ubyte', 'v', 2),
('extract_ubyte', 'v', 3))),
255.0),
'options->lower_unpack_unorm_4x8'),
(('unpack_snorm_2x16', 'v'),
('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec4', ('extract_iword', 'v', 0),
('extract_iword', 'v', 1), 0, 0)),
32767.0))),
'options->lower_unpack_snorm_2x16'),
(('unpack_snorm_4x8', 'v'),
('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec4', ('extract_ibyte', 'v', 0),
('extract_ibyte', 'v', 1),
('extract_ibyte', 'v', 2),
('extract_ibyte', 'v', 3))),
127.0))),
'options->lower_unpack_snorm_4x8'),
]
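# The pack/unpack lowerings above follow the GLSL packSnorm2x16 /
# packUnorm2x16 definitions, e.g. each snorm16 word is
# round(clamp(c, -1.0, +1.0) * 32767.0), assembled by pack_uvec2_to_uint.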
# Add optimizations to handle the case where the result of a ternary is

View File

@ -487,7 +487,7 @@ print_intrinsic_instr(nir_intrinsic_instr *instr, print_state *state)
if (i != 0)
fprintf(fp, ", ");
fprintf(fp, "%u", instr->const_index[i]);
fprintf(fp, "%d", instr->const_index[i]);
}
fprintf(fp, ")");

View File

@ -33,7 +33,8 @@
#define ENUM(x) [x] = #x
#define NAME(val) ((((val) < ARRAY_SIZE(names)) && names[(val)]) ? names[(val)] : "UNKNOWN")
const char * gl_shader_stage_name(gl_shader_stage stage)
const char *
gl_shader_stage_name(gl_shader_stage stage)
{
static const char *names[] = {
ENUM(MESA_SHADER_VERTEX),
@ -51,15 +52,16 @@ const char * gl_shader_stage_name(gl_shader_stage stage)
* Translate a gl_shader_stage to a short shader stage name for debug
* printouts and error messages.
*/
const char * _mesa_shader_stage_to_string(unsigned stage)
const char *
_mesa_shader_stage_to_string(unsigned stage)
{
switch (stage) {
case MESA_SHADER_VERTEX: return "vertex";
case MESA_SHADER_FRAGMENT: return "fragment";
case MESA_SHADER_GEOMETRY: return "geometry";
case MESA_SHADER_COMPUTE: return "compute";
case MESA_SHADER_TESS_CTRL: return "tess ctrl";
case MESA_SHADER_TESS_EVAL: return "tess eval";
case MESA_SHADER_TESS_CTRL: return "tessellation control";
case MESA_SHADER_TESS_EVAL: return "tessellation evaluation";
}
unreachable("Unknown shader stage.");
@ -69,7 +71,8 @@ const char * _mesa_shader_stage_to_string(unsigned stage)
* Translate a gl_shader_stage to a shader stage abbreviation (VS, GS, FS)
* for debug printouts and error messages.
*/
const char * _mesa_shader_stage_to_abbrev(unsigned stage)
const char *
_mesa_shader_stage_to_abbrev(unsigned stage)
{
switch (stage) {
case MESA_SHADER_VERTEX: return "VS";
@ -83,7 +86,8 @@ const char * _mesa_shader_stage_to_abbrev(unsigned stage)
unreachable("Unknown shader stage.");
}
const char * gl_vert_attrib_name(gl_vert_attrib attrib)
const char *
gl_vert_attrib_name(gl_vert_attrib attrib)
{
static const char *names[] = {
ENUM(VERT_ATTRIB_POS),
@ -124,7 +128,8 @@ const char * gl_vert_attrib_name(gl_vert_attrib attrib)
return NAME(attrib);
}
const char * gl_varying_slot_name(gl_varying_slot slot)
const char *
gl_varying_slot_name(gl_varying_slot slot)
{
static const char *names[] = {
ENUM(VARYING_SLOT_POS),
@ -190,7 +195,8 @@ const char * gl_varying_slot_name(gl_varying_slot slot)
return NAME(slot);
}
const char * gl_system_value_name(gl_system_value sysval)
const char *
gl_system_value_name(gl_system_value sysval)
{
static const char *names[] = {
ENUM(SYSTEM_VALUE_VERTEX_ID),
@ -218,7 +224,8 @@ const char * gl_system_value_name(gl_system_value sysval)
return NAME(sysval);
}
const char * glsl_interp_qualifier_name(enum glsl_interp_qualifier qual)
const char *
glsl_interp_qualifier_name(enum glsl_interp_qualifier qual)
{
static const char *names[] = {
ENUM(INTERP_QUALIFIER_NONE),
@ -230,7 +237,8 @@ const char * glsl_interp_qualifier_name(enum glsl_interp_qualifier qual)
return NAME(qual);
}
const char * gl_frag_result_name(gl_frag_result result)
const char *
gl_frag_result_name(gl_frag_result result)
{
static const char *names[] = {
ENUM(FRAG_RESULT_DEPTH),

View File

@ -47,19 +47,19 @@ typedef enum
MESA_SHADER_COMPUTE = 5,
} gl_shader_stage;
const char * gl_shader_stage_name(gl_shader_stage stage);
const char *gl_shader_stage_name(gl_shader_stage stage);
/**
* Translate a gl_shader_stage to a short shader stage name for debug
* printouts and error messages.
*/
const char * _mesa_shader_stage_to_string(unsigned stage);
const char *_mesa_shader_stage_to_string(unsigned stage);
/**
* Translate a gl_shader_stage to a shader stage abbreviation (VS, GS, FS)
* for debug printouts and error messages.
*/
const char * _mesa_shader_stage_to_abbrev(unsigned stage);
const char *_mesa_shader_stage_to_abbrev(unsigned stage);
#define MESA_SHADER_STAGES (MESA_SHADER_COMPUTE + 1)
@ -109,7 +109,7 @@ typedef enum
VERT_ATTRIB_MAX = 33
} gl_vert_attrib;
const char * gl_vert_attrib_name(gl_vert_attrib attrib);
const char *gl_vert_attrib_name(gl_vert_attrib attrib);
/**
* Symbolic constants to help iterate over
@ -254,7 +254,7 @@ typedef enum
#define VARYING_SLOT_PATCH0 (VARYING_SLOT_MAX)
#define VARYING_SLOT_TESS_MAX (VARYING_SLOT_PATCH0 + MAX_VARYING)
const char * gl_varying_slot_name(gl_varying_slot slot);
const char *gl_varying_slot_name(gl_varying_slot slot);
/**
* Bitflags for varying slots.
@ -467,7 +467,7 @@ typedef enum
SYSTEM_VALUE_MAX /**< Number of values */
} gl_system_value;
const char * gl_system_value_name(gl_system_value sysval);
const char *gl_system_value_name(gl_system_value sysval);
/**
* The possible interpolation qualifiers that can be applied to a fragment
@ -485,7 +485,7 @@ enum glsl_interp_qualifier
INTERP_QUALIFIER_COUNT /**< Number of interpolation qualifiers */
};
const char * glsl_interp_qualifier_name(enum glsl_interp_qualifier qual);
const char *glsl_interp_qualifier_name(enum glsl_interp_qualifier qual);
/**
* Fragment program results
@ -516,7 +516,7 @@ typedef enum
FRAG_RESULT_DATA7,
} gl_frag_result;
const char * gl_frag_result_name(gl_frag_result result);
const char *gl_frag_result_name(gl_frag_result result);
#define FRAG_RESULT_MAX (FRAG_RESULT_DATA0 + MAX_DRAW_BUFFERS)

View File

@ -35,6 +35,7 @@ EXTRA_DIST = \
es2api/ABI-check \
mapi_abi.py \
glapi/SConscript \
glapi/registry/gl.xml \
shared-glapi/SConscript
AM_CFLAGS = \
@ -106,12 +107,16 @@ if HAVE_SPARC_ASM
GLAPI_ASM_SOURCES = glapi/glapi_sparc.S
endif
glapi_libglapi_la_SOURCES = glapi/glapi_gentable.c
glapi_libglapi_la_SOURCES =
glapi_libglapi_la_CPPFLAGS = \
$(AM_CPPFLAGS) \
-I$(top_srcdir)/src/mapi/glapi \
-I$(top_srcdir)/src/mesa
if HAVE_APPLEDRI
glapi_libglapi_la_SOURCES += glapi/glapi_gentable.c
endif
if HAVE_SHARED_GLAPI
glapi_libglapi_la_SOURCES += $(MAPI_BRIDGE_FILES) glapi/glapi_mapi_tmp.h
glapi_libglapi_la_CPPFLAGS += \

View File

@ -0,0 +1,18 @@
<?xml version="1.0"?>
<!DOCTYPE OpenGLAPI SYSTEM "gl_API.dtd">
<!-- Note: no GLX protocol info yet. -->
<OpenGLAPI>
<category name="GL_GREMEDY_string_marker" number="311">
<function name="StringMarkerGREMEDY">
<param name="len" type="GLsizei"/>
<param name="string" type="const GLvoid *"/>
</function>
</category>
</OpenGLAPI>

View File

@ -27,8 +27,11 @@ MESA_GLAPI_OUTPUTS = \
$(MESA_GLAPI_DIR)/glapi_mapi_tmp.h \
$(MESA_GLAPI_DIR)/glprocs.h \
$(MESA_GLAPI_DIR)/glapitemp.h \
$(MESA_GLAPI_DIR)/glapitable.h \
$(MESA_GLAPI_DIR)/glapi_gentable.c
$(MESA_GLAPI_DIR)/glapitable.h
if HAVE_APPLEDRI
MESA_GLAPI_OUTPUTS += $(MESA_GLAPI_DIR)/glapi_gentable.c
endif
MESA_GLAPI_ASM_OUTPUTS =
if HAVE_X86_ASM
@ -57,6 +60,7 @@ BUILT_SOURCES = \
$(MESA_GLX_DIR)/indirect_size.c
EXTRA_DIST= \
$(BUILT_SOURCES) \
$(MESA_GLAPI_DIR)/glapi_gentable.c \
$(MESA_GLAPI_DIR)/glapi_x86.S \
$(MESA_GLAPI_DIR)/glapi_x86-64.S \
$(MESA_GLAPI_DIR)/glapi_sparc.S \
@ -88,8 +92,12 @@ XORG_GLAPI_DIR = $(XORG_BASE)/glx
XORG_GLAPI_OUTPUTS = \
$(XORG_GLAPI_DIR)/glprocs.h \
$(XORG_GLAPI_DIR)/glapitable.h \
$(XORG_GLAPI_DIR)/dispatch.h \
$(XORG_GLAPI_DIR)/dispatch.h
if HAVE_APPLEDRI
XORG_GLAPI_OUTPUTS += \
$(XORG_GLAPI_DIR)/glapi_gentable.c
endif
XORG_OUTPUTS = \
$(XORG_GLAPI_OUTPUTS) \
@ -188,6 +196,7 @@ API_XML = \
EXT_texture_array.xml \
EXT_texture_integer.xml \
EXT_transform_feedback.xml \
GREMEDY_string_marker.xml \
INTEL_performance_query.xml \
KHR_debug.xml \
KHR_context_flush_control.xml \

View File

@ -12620,6 +12620,8 @@
<xi:include href="EXT_framebuffer_object.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
<xi:include href="GREMEDY_string_marker.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
<xi:include href="EXT_packed_depth_stencil.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
<xi:include href="EXT_provoking_vertex.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>

Some files were not shown because too many files have changed in this diff