broadcom: Add VC5 NIR compiler.

This is a pretty straightforward fork of VC4's NIR compiler to VC5.  The
condition codes, registers, and I/O have all changed, making the backend
hard to share, though their heritage is still recognizable.

v2: Move to src/broadcom/compiler to match intel's layout, rename more
    "vc5" to "v3d", rename QIR to VIR ("V3D IR") to avoid symbol conflicts
    with vc4, use new v3d_debug header, add compiler init/free functions,
    do texture swizzling in NIR to allow optimization.
Eric Anholt 2017-02-03 10:24:14 -08:00
parent f71364f297
commit ade416d023
17 changed files with 7498 additions and 0 deletions


@@ -26,6 +26,8 @@ AM_CPPFLAGS = \
-I$(top_srcdir)/src \
-I$(top_srcdir)/src/broadcom/ \
-I$(top_srcdir)/src/broadcom/include \
-I$(top_srcdir)/src/gallium/auxiliary \
-I$(top_srcdir)/src/gallium/include \
$(VALGRIND_CFLAGS) \
$(DEFINES)


@@ -16,6 +16,19 @@ BROADCOM_FILES = \
clif/clif_dump.c \
clif/clif_dump.h \
common/v3d_device_info.h \
compiler/nir_to_vir.c \
compiler/vir.c \
compiler/vir_dump.c \
compiler/vir_live_variables.c \
compiler/vir_lower_uniforms.c \
compiler/vir_opt_copy_propagate.c \
compiler/vir_opt_dead_code.c \
compiler/vir_register_allocate.c \
compiler/vir_to_qpu.c \
compiler/qpu_schedule.c \
compiler/qpu_validate.c \
compiler/v3d_compiler.h \
compiler/v3d_nir_lower_io.c \
qpu/qpu_disasm.c \
qpu/qpu_disasm.h \
qpu/qpu_instr.c \


@@ -13,6 +13,7 @@ check_PROGRAMS += \
LDADD = \
libbroadcom.la \
$(top_builddir)/src/compiler/nir/libnir.la \
$(top_builddir)/src/util/libmesautil.la \
$(NULL)

File diff suppressed because it is too large.

File diff suppressed because it is too large.


@@ -0,0 +1,208 @@
/*
* Copyright © 2014 Broadcom
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
/**
* @file
*
* Validates the QPU instruction sequence after register allocation and
* scheduling.
*/
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include "v3d_compiler.h"
#include "qpu/qpu_disasm.h"
struct v3d_qpu_validate_state {
struct v3d_compile *c;
const struct v3d_qpu_instr *last;
int ip;
int last_sfu_write;
};
static void
fail_instr(struct v3d_qpu_validate_state *state, const char *msg)
{
struct v3d_compile *c = state->c;
fprintf(stderr, "v3d_qpu_validate at ip %d: %s:\n", state->ip, msg);
int dump_ip = 0;
vir_for_each_inst_inorder(inst, c) {
v3d_qpu_dump(c->devinfo, &inst->qpu);
if (dump_ip++ == state->ip)
fprintf(stderr, " *** ERROR ***");
fprintf(stderr, "\n");
}
fprintf(stderr, "\n");
abort();
}
static bool
qpu_magic_waddr_matches(const struct v3d_qpu_instr *inst,
bool (*predicate)(enum v3d_qpu_waddr waddr))
{
if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
return false;
if (inst->alu.add.op != V3D_QPU_A_NOP &&
inst->alu.add.magic_write &&
predicate(inst->alu.add.waddr))
return true;
if (inst->alu.mul.op != V3D_QPU_M_NOP &&
inst->alu.mul.magic_write &&
predicate(inst->alu.mul.waddr))
return true;
return false;
}
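/* Intended use of the helper above (it is still unused in this pass,
 * hence the (void) cast further down), e.g.:
 *
 *     if (qpu_magic_waddr_matches(inst, v3d_qpu_magic_waddr_is_sfu))
 *             sfu_writes++;
 */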
static void
qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
{
const struct v3d_qpu_instr *inst = &qinst->qpu;
if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
return;
/* LDVARY writes r5 two instructions later and LDUNIF writes r5 one
* instruction later, so the pair would produce conflicting r5 writes
* in the same slot.
*/
if (state->last && state->last->sig.ldvary && inst->sig.ldunif) {
fail_instr(state, "LDUNIF after a LDVARY");
}
int tmu_writes = 0;
int sfu_writes = 0;
int vpm_writes = 0;
int tlb_writes = 0;
int tsy_writes = 0;
if (inst->alu.add.op != V3D_QPU_A_NOP) {
if (inst->alu.add.magic_write) {
if (v3d_qpu_magic_waddr_is_tmu(inst->alu.add.waddr))
tmu_writes++;
if (v3d_qpu_magic_waddr_is_sfu(inst->alu.add.waddr))
sfu_writes++;
if (v3d_qpu_magic_waddr_is_vpm(inst->alu.add.waddr))
vpm_writes++;
if (v3d_qpu_magic_waddr_is_tlb(inst->alu.add.waddr))
tlb_writes++;
if (v3d_qpu_magic_waddr_is_tsy(inst->alu.add.waddr))
tsy_writes++;
}
}
if (inst->alu.mul.op != V3D_QPU_M_NOP) {
if (inst->alu.mul.magic_write) {
if (v3d_qpu_magic_waddr_is_tmu(inst->alu.mul.waddr))
tmu_writes++;
if (v3d_qpu_magic_waddr_is_sfu(inst->alu.mul.waddr))
sfu_writes++;
if (v3d_qpu_magic_waddr_is_vpm(inst->alu.mul.waddr))
vpm_writes++;
if (v3d_qpu_magic_waddr_is_tlb(inst->alu.mul.waddr))
tlb_writes++;
if (v3d_qpu_magic_waddr_is_tsy(inst->alu.mul.waddr))
tsy_writes++;
}
}
(void)qpu_magic_waddr_matches; /* XXX */
/* SFU r4 results come back two instructions later.  Don't do r4
* reads/writes or other SFU lookups until it's done.
*/
if (state->ip - state->last_sfu_write < 2) {
if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_R4))
fail_instr(state, "R4 read too soon after SFU");
if (v3d_qpu_writes_r4(inst))
fail_instr(state, "R4 write too soon after SFU");
if (sfu_writes)
fail_instr(state, "SFU write too soon after SFU");
}
/* XXX: The docs say VPM can happen with the others, but the simulator
* disagrees.
*/
if (tmu_writes +
sfu_writes +
vpm_writes +
tlb_writes +
tsy_writes +
inst->sig.ldtmu +
inst->sig.ldtlb +
inst->sig.ldvpm +
inst->sig.ldtlbu > 1) {
fail_instr(state,
"Only one of [TMU, SFU, TSY, TLB read, VPM] allowed");
}
if (sfu_writes)
state->last_sfu_write = state->ip;
}
static void
qpu_validate_block(struct v3d_qpu_validate_state *state, struct qblock *block)
{
vir_for_each_inst(qinst, block) {
qpu_validate_inst(state, qinst);
state->last = &qinst->qpu;
state->ip++;
}
}
/**
* Checks for the instruction restrictions from page 37 ("Summary of
* Instruction Restrictions").
*/
void
qpu_validate(struct v3d_compile *c)
{
/* We don't want to do validation in release builds, but we want to
* keep compiling the validation code to make sure it doesn't get
* broken.
*/
#ifndef DEBUG
return;
#endif
struct v3d_qpu_validate_state state = {
.c = c,
.last_sfu_write = -10,
.ip = 0,
};
vir_for_each_block(block, c) {
qpu_validate_block(&state, block);
}
}
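/* Illustrative sketch (not from this diff) of how the validator is
 * meant to be driven; the call presumably lives in vir_to_qpu.c, whose
 * diff is suppressed above.  The early return above makes it free in
 * release builds.
 */
static void
generate_and_check_qpu(struct v3d_compile *c)
{
        v3d_vir_to_qpu(c);      /* encode scheduled VIR into QPU words */
        qpu_validate(c);        /* debug-build-only checking */
}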


@@ -0,0 +1,43 @@
/*
* Copyright © 2016 Broadcom
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
struct v3d_compiler *
v3d_compiler_init(void)
{
struct v3d_compiler *compiler = rzalloc(NULL, struct v3d_compiler);
return compiler;
}
void
v3d_add_qpu_inst(struct v3d_compile *c, uint64_t inst)
{
if (c->qpu_inst_count >= c->qpu_inst_size) {
c->qpu_inst_size = MAX2(c->qpu_inst_size * 2, 16);
c->qpu_insts = reralloc(c, c->qpu_insts, uint64_t,
c->qpu_inst_size);
}
c->qpu_insts[c->qpu_inst_count++] = inst;
}
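/* Illustrative caller (not from this diff): the doubling above keeps
 * appends amortized O(1), with qpu_inst_size stepping 16, 32, 64, ...
 */
static void
append_program(struct v3d_compile *c, const uint64_t *words, int count)
{
        for (int i = 0; i < count; i++)
                v3d_add_qpu_inst(c, words[i]);
}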


@@ -0,0 +1,927 @@
/*
* Copyright © 2016 Broadcom
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef V3D_COMPILER_H
#define V3D_COMPILER_H
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include "util/macros.h"
#include "common/v3d_debug.h"
#include "compiler/nir/nir.h"
#include "util/list.h"
#include "util/u_math.h"
#include "qpu/qpu_instr.h"
#include "pipe/p_state.h"
#define V3D_MAX_TEXTURE_SAMPLERS 32
#define V3D_MAX_SAMPLES 4
#define V3D_MAX_FS_INPUTS 64
#define V3D_MAX_VS_INPUTS 64
struct nir_builder;
struct v3d_fs_inputs {
/**
* Array of the meanings of the VPM inputs this shader needs.
*
* It doesn't include those that aren't part of the VPM, like
* point/line coordinates.
*/
struct v3d_varying_slot *input_slots;
uint32_t num_inputs;
};
enum qfile {
/** An unused source or destination register. */
QFILE_NULL,
/** A physical register, such as the W coordinate payload. */
QFILE_REG,
/** One of the registers for fixed function interactions. */
QFILE_MAGIC,
/**
* A virtual register, that will be allocated to actual accumulator
* or physical registers later.
*/
QFILE_TEMP,
QFILE_VARY,
QFILE_UNIF,
QFILE_TLB,
QFILE_TLBU,
/**
* VPM reads use this with an index value to say what part of the VPM
* is being read.
*/
QFILE_VPM,
/**
* Stores an immediate value in the index field that will be used
* directly by qpu_load_imm().
*/
QFILE_LOAD_IMM,
/**
* Stores an immediate value in the index field that can be turned
* into a small immediate field by qpu_encode_small_immediate().
*/
QFILE_SMALL_IMM,
};
/**
* A reference to a QPU register or a virtual temp register.
*/
struct qreg {
enum qfile file;
uint32_t index;
};
static inline struct qreg vir_reg(enum qfile file, uint32_t index)
{
return (struct qreg){file, index};
}
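/* Construction sketches (values are illustrative):
 *
 *     vir_reg(QFILE_TEMP, 12)                    -- virtual temp t12
 *     vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_RECIP)  -- SFU reciprocal write
 *     vir_reg(QFILE_NULL, 0)                     -- unused operand slot
 */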
/**
* A reference to an actual register at the QPU level, for register
* allocation.
*/
struct qpu_reg {
bool magic;
int index;
};
struct qinst {
/** Entry in qblock->instructions */
struct list_head link;
/**
* The instruction being wrapped. Its condition codes, pack flags,
* signals, etc. will all be used, with just the register references
* being replaced by the contents of qinst->dst and qinst->src[].
*/
struct v3d_qpu_instr qpu;
/* Pre-register-allocation references to src/dst registers */
struct qreg dst;
struct qreg src[3];
bool cond_is_exec_mask;
bool has_implicit_uniform;
/* After vir_to_qpu.c: If instr reads a uniform, which uniform from
* the uncompiled stream it is.
*/
int uniform;
};
enum quniform_contents {
/**
* Indicates that a constant 32-bit value is copied from the program's
* uniform contents.
*/
QUNIFORM_CONSTANT,
/**
* Indicates that the program's uniform contents are used as an index
* into the GL uniform storage.
*/
QUNIFORM_UNIFORM,
/** @{
* Scaling factors from clip coordinates to coordinates relative to the
* viewport center.
*
* This is used by the coordinate and vertex shaders to produce the
* 32-bit entry consisting of 2 16-bit fields with 12.4 signed fixed
* point offsets from the viewport center.
*/
QUNIFORM_VIEWPORT_X_SCALE,
QUNIFORM_VIEWPORT_Y_SCALE,
/** @} */
QUNIFORM_VIEWPORT_Z_OFFSET,
QUNIFORM_VIEWPORT_Z_SCALE,
QUNIFORM_USER_CLIP_PLANE,
/**
* A reference to a texture config parameter 0 uniform.
*
* This is a uniform implicitly loaded with a QPU_W_TMU* write, which
* defines texture type, miplevels, and such. It will be found as a
* parameter to the first QOP_TEX_[STRB] instruction in a sequence.
*/
QUNIFORM_TEXTURE_CONFIG_P0_0,
QUNIFORM_TEXTURE_CONFIG_P0_1,
QUNIFORM_TEXTURE_CONFIG_P0_2,
QUNIFORM_TEXTURE_CONFIG_P0_3,
QUNIFORM_TEXTURE_CONFIG_P0_4,
QUNIFORM_TEXTURE_CONFIG_P0_5,
QUNIFORM_TEXTURE_CONFIG_P0_6,
QUNIFORM_TEXTURE_CONFIG_P0_7,
QUNIFORM_TEXTURE_CONFIG_P0_8,
QUNIFORM_TEXTURE_CONFIG_P0_9,
QUNIFORM_TEXTURE_CONFIG_P0_10,
QUNIFORM_TEXTURE_CONFIG_P0_11,
QUNIFORM_TEXTURE_CONFIG_P0_12,
QUNIFORM_TEXTURE_CONFIG_P0_13,
QUNIFORM_TEXTURE_CONFIG_P0_14,
QUNIFORM_TEXTURE_CONFIG_P0_15,
QUNIFORM_TEXTURE_CONFIG_P0_16,
QUNIFORM_TEXTURE_CONFIG_P0_17,
QUNIFORM_TEXTURE_CONFIG_P0_18,
QUNIFORM_TEXTURE_CONFIG_P0_19,
QUNIFORM_TEXTURE_CONFIG_P0_20,
QUNIFORM_TEXTURE_CONFIG_P0_21,
QUNIFORM_TEXTURE_CONFIG_P0_22,
QUNIFORM_TEXTURE_CONFIG_P0_23,
QUNIFORM_TEXTURE_CONFIG_P0_24,
QUNIFORM_TEXTURE_CONFIG_P0_25,
QUNIFORM_TEXTURE_CONFIG_P0_26,
QUNIFORM_TEXTURE_CONFIG_P0_27,
QUNIFORM_TEXTURE_CONFIG_P0_28,
QUNIFORM_TEXTURE_CONFIG_P0_29,
QUNIFORM_TEXTURE_CONFIG_P0_30,
QUNIFORM_TEXTURE_CONFIG_P0_31,
QUNIFORM_TEXTURE_CONFIG_P0_32,
/**
* A reference to a texture config parameter 1 uniform.
*
* This is a uniform implicitly loaded with a QPU_W_TMU* write, which
* defines texture width, height, filters, and wrap modes. It will be
* found as a parameter to the second QOP_TEX_[STRB] instruction in a
* sequence.
*/
QUNIFORM_TEXTURE_CONFIG_P1,
QUNIFORM_TEXTURE_FIRST_LEVEL,
QUNIFORM_TEXTURE_WIDTH,
QUNIFORM_TEXTURE_HEIGHT,
QUNIFORM_TEXTURE_DEPTH,
QUNIFORM_TEXTURE_ARRAY_SIZE,
QUNIFORM_TEXTURE_LEVELS,
QUNIFORM_TEXTURE_MSAA_ADDR,
QUNIFORM_UBO_ADDR,
QUNIFORM_TEXRECT_SCALE_X,
QUNIFORM_TEXRECT_SCALE_Y,
QUNIFORM_TEXTURE_BORDER_COLOR,
QUNIFORM_STENCIL,
QUNIFORM_ALPHA_REF,
QUNIFORM_SAMPLE_MASK,
};
struct v3d_varying_slot {
uint8_t slot_and_component;
};
static inline struct v3d_varying_slot
v3d_slot_from_slot_and_component(uint8_t slot, uint8_t component)
{
assert(slot < 255 / 4);
return (struct v3d_varying_slot){ (slot << 2) + component };
}
static inline uint8_t v3d_slot_get_slot(struct v3d_varying_slot slot)
{
return slot.slot_and_component >> 2;
}
static inline uint8_t v3d_slot_get_component(struct v3d_varying_slot slot)
{
return slot.slot_and_component & 3;
}
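/* Round-trip sketch of the packed encoding above (slot in bits 7:2,
 * component in bits 1:0; VARYING_SLOT_COL0 is just an example input):
 *
 *     struct v3d_varying_slot s =
 *             v3d_slot_from_slot_and_component(VARYING_SLOT_COL0, 2);
 *     assert(v3d_slot_get_slot(s) == VARYING_SLOT_COL0);
 *     assert(v3d_slot_get_component(s) == 2);
 */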
struct v3d_ubo_range {
/**
* Offset in bytes from the start of the UBO where this range is
* uploaded.
*
* Only assigned once the range is marked used (see ubo_range_used).
*/
uint32_t dst_offset;
/**
* offset in bytes from the start of the gallium uniforms where the
* data comes from.
*/
uint32_t src_offset;
/** size in bytes of this ubo range */
uint32_t size;
};
struct v3d_key {
void *shader_state;
struct {
uint8_t swizzle[4];
uint8_t return_size;
uint8_t return_channels;
union {
struct {
unsigned compare_mode:1;
unsigned compare_func:3;
unsigned wrap_s:3;
unsigned wrap_t:3;
};
struct {
uint16_t msaa_width, msaa_height;
};
};
} tex[V3D_MAX_TEXTURE_SAMPLERS];
uint8_t ucp_enables;
};
struct v3d_fs_key {
struct v3d_key base;
bool depth_enabled;
bool is_points;
bool is_lines;
bool alpha_test;
bool point_coord_upper_left;
bool light_twoside;
bool msaa;
bool sample_coverage;
bool sample_alpha_to_coverage;
bool sample_alpha_to_one;
bool clamp_color;
bool swap_color_rb;
uint8_t alpha_test_func;
uint8_t logicop_func;
uint32_t point_sprite_mask;
struct pipe_rt_blend_state blend;
};
struct v3d_vs_key {
struct v3d_key base;
struct v3d_varying_slot fs_inputs[V3D_MAX_FS_INPUTS];
uint8_t num_fs_inputs;
bool is_coord;
bool per_vertex_point_size;
bool clamp_color;
};
/** A basic block of VIR instructions. */
struct qblock {
struct list_head link;
struct list_head instructions;
struct set *predecessors;
struct qblock *successors[2];
int index;
/* Instruction IPs for the first and last instruction of the block.
* Set by qpu_schedule.c.
*/
uint32_t start_qpu_ip;
uint32_t end_qpu_ip;
/* Instruction IP for the branch instruction of the block. Set by
* qpu_schedule.c.
*/
uint32_t branch_qpu_ip;
/** Offset within the uniform stream at the start of the block. */
uint32_t start_uniform;
/** Offset within the uniform stream of the branch instruction */
uint32_t branch_uniform;
/** @{ used by v3d_vir_live_variables.c */
BITSET_WORD *def;
BITSET_WORD *use;
BITSET_WORD *live_in;
BITSET_WORD *live_out;
int start_ip, end_ip;
/** @} */
};
/**
* Compiler state saved across compiler invocations, for any expensive global
* setup.
*/
struct v3d_compiler {
const struct v3d_device_info *devinfo;
struct ra_regs *regs;
unsigned int reg_class[3];
};
struct v3d_compile {
const struct v3d_device_info *devinfo;
nir_shader *s;
nir_function_impl *impl;
struct exec_list *cf_node_list;
const struct v3d_compiler *compiler;
/**
* Mapping from nir_register * or nir_ssa_def * to array of struct
* qreg for the values.
*/
struct hash_table *def_ht;
/* For each temp, the instruction generating its value. */
struct qinst **defs;
uint32_t defs_array_size;
/**
* Inputs to the shader, arranged by TGSI declaration order.
*
* Not all fragment shader QFILE_VARY reads are present in this array.
*/
struct qreg *inputs;
struct qreg *outputs;
bool msaa_per_sample_output;
struct qreg color_reads[V3D_MAX_SAMPLES];
struct qreg sample_colors[V3D_MAX_SAMPLES];
uint32_t inputs_array_size;
uint32_t outputs_array_size;
uint32_t uniforms_array_size;
/* Booleans for whether the corresponding QFILE_VARY[i] is
* flat-shaded. This doesn't count gl_FragColor flat-shading, which is
* controlled by shader->color_inputs and rasterizer->flatshade in the
* gallium driver.
*/
BITSET_WORD flat_shade_flags[BITSET_WORDS(V3D_MAX_FS_INPUTS)];
struct v3d_ubo_range *ubo_ranges;
bool *ubo_range_used;
uint32_t ubo_ranges_array_size;
/** Number of uniform areas tracked in ubo_ranges. */
uint32_t num_ubo_ranges;
uint32_t next_ubo_dst_offset;
/* State for whether we're executing on each channel currently. 0 if
* yes, otherwise a block number + 1 that the channel jumped to.
*/
struct qreg execute;
struct qreg line_x, point_x, point_y;
/**
* Instance ID, which comes in before the vertex attribute payload if
* the shader record requests it.
*/
struct qreg iid;
/**
* Vertex ID, which comes in before the vertex attribute payload
* (after Instance ID) if the shader record requests it.
*/
struct qreg vid;
/* Fragment shader payload regs. */
struct qreg payload_w, payload_w_centroid, payload_z;
/** boolean (~0 -> true) if the fragment has been discarded. */
struct qreg discard;
uint8_t vattr_sizes[V3D_MAX_VS_INPUTS];
uint32_t num_vpm_writes;
/**
* Array of the VARYING_SLOT_* of all FS QFILE_VARY reads.
*
* This includes those that aren't part of the VPM varyings, like
* point/line coordinates.
*/
struct v3d_varying_slot input_slots[V3D_MAX_FS_INPUTS];
/**
* An entry per outputs[] in the VS indicating what the VARYING_SLOT_*
* of the output is. Used to emit from the VS in the order that the
* FS needs.
*/
struct v3d_varying_slot *output_slots;
struct pipe_shader_state *shader_state;
struct v3d_key *key;
struct v3d_fs_key *fs_key;
struct v3d_vs_key *vs_key;
/* Live ranges of temps. */
int *temp_start, *temp_end;
uint32_t *uniform_data;
enum quniform_contents *uniform_contents;
uint32_t uniform_array_size;
uint32_t num_uniforms;
uint32_t num_outputs;
uint32_t output_position_index;
nir_variable *output_color_var;
uint32_t output_point_size_index;
uint32_t output_sample_mask_index;
struct qreg undef;
uint32_t num_temps;
struct list_head blocks;
int next_block_index;
struct qblock *cur_block;
struct qblock *loop_cont_block;
struct qblock *loop_break_block;
uint64_t *qpu_insts;
uint32_t qpu_inst_count;
uint32_t qpu_inst_size;
/* For the FS, the number of varying inputs not counting the
* point/line varyings payload
*/
uint32_t num_inputs;
/**
* Number of inputs from num_inputs remaining to be queued to the read
* FIFO in the VS/CS.
*/
uint32_t num_inputs_remaining;
/* Number of inputs currently in the read FIFO for the VS/CS */
uint32_t num_inputs_in_fifo;
/** Next offset in the VPM to read from in the VS/CS */
uint32_t vpm_read_offset;
uint32_t program_id;
uint32_t variant_id;
/* Set to compile program in threaded FS mode, where SIG_THREAD_SWITCH
* is used to hide texturing latency at the cost of limiting ourselves
* to the bottom half of physical reg space.
*/
bool fs_threaded;
bool last_thrsw_at_top_level;
bool failed;
};
struct v3d_uniform_list {
enum quniform_contents *contents;
uint32_t *data;
uint32_t count;
};
struct v3d_prog_data {
struct v3d_uniform_list uniforms;
struct v3d_ubo_range *ubo_ranges;
uint32_t num_ubo_ranges;
uint32_t ubo_size;
uint8_t num_inputs;
};
struct v3d_vs_prog_data {
struct v3d_prog_data base;
bool uses_iid, uses_vid;
/* Number of components read from each vertex attribute. */
uint8_t vattr_sizes[32];
/* Total number of components read, for the shader state record. */
uint32_t vpm_input_size;
/* Total number of components written, for the shader state record. */
uint32_t vpm_output_size;
};
struct v3d_fs_prog_data {
struct v3d_prog_data base;
struct v3d_varying_slot input_slots[V3D_MAX_FS_INPUTS];
/** bitmask of which inputs are color inputs, for flat shade handling. */
uint32_t color_inputs[BITSET_WORDS(V3D_MAX_FS_INPUTS)];
/* Bitmask for whether the corresponding input is flat-shaded,
* independent of rasterizer (gl_FragColor) flat-shading.
*/
BITSET_WORD flat_shade_flags[BITSET_WORDS(V3D_MAX_FS_INPUTS)];
bool writes_z;
};
/* Special nir_load_input intrinsic index for loading the current TLB
* destination color.
*/
#define V3D_NIR_TLB_COLOR_READ_INPUT 2000000000
#define V3D_NIR_MS_MASK_OUTPUT 2000000000
extern const nir_shader_compiler_options v3d_nir_options;
const struct v3d_compiler *v3d_compiler_init(const struct v3d_device_info *devinfo);
void v3d_compiler_free(const struct v3d_compiler *compiler);
void v3d_optimize_nir(struct nir_shader *s);
uint64_t *v3d_compile_vs(const struct v3d_compiler *compiler,
struct v3d_vs_key *key,
struct v3d_vs_prog_data *prog_data,
nir_shader *s,
int program_id, int variant_id,
uint32_t *final_assembly_size);
uint64_t *v3d_compile_fs(const struct v3d_compiler *compiler,
struct v3d_fs_key *key,
struct v3d_fs_prog_data *prog_data,
nir_shader *s,
int program_id, int variant_id,
uint32_t *final_assembly_size);
void v3d_nir_to_vir(struct v3d_compile *c);
void vir_compile_destroy(struct v3d_compile *c);
const char *vir_get_stage_name(struct v3d_compile *c);
struct qblock *vir_new_block(struct v3d_compile *c);
void vir_set_emit_block(struct v3d_compile *c, struct qblock *block);
void vir_link_blocks(struct qblock *predecessor, struct qblock *successor);
struct qblock *vir_entry_block(struct v3d_compile *c);
struct qblock *vir_exit_block(struct v3d_compile *c);
struct qinst *vir_add_inst(enum v3d_qpu_add_op op, struct qreg dst,
struct qreg src0, struct qreg src1);
struct qinst *vir_mul_inst(enum v3d_qpu_mul_op op, struct qreg dst,
struct qreg src0, struct qreg src1);
struct qinst *vir_branch_inst(enum v3d_qpu_branch_cond cond, struct qreg src0);
void vir_remove_instruction(struct v3d_compile *c, struct qinst *qinst);
struct qreg vir_uniform(struct v3d_compile *c,
enum quniform_contents contents,
uint32_t data);
void vir_schedule_instructions(struct v3d_compile *c);
struct v3d_qpu_instr v3d_qpu_nop(void);
struct qreg vir_emit_def(struct v3d_compile *c, struct qinst *inst);
struct qinst *vir_emit_nondef(struct v3d_compile *c, struct qinst *inst);
void vir_set_cond(struct qinst *inst, enum v3d_qpu_cond cond);
void vir_set_pf(struct qinst *inst, enum v3d_qpu_pf pf);
void vir_set_unpack(struct qinst *inst, int src,
enum v3d_qpu_input_unpack unpack);
struct qreg vir_get_temp(struct v3d_compile *c);
void vir_calculate_live_intervals(struct v3d_compile *c);
bool vir_has_implicit_uniform(struct qinst *inst);
int vir_get_implicit_uniform_src(struct qinst *inst);
int vir_get_non_sideband_nsrc(struct qinst *inst);
int vir_get_nsrc(struct qinst *inst);
bool vir_has_side_effects(struct v3d_compile *c, struct qinst *inst);
bool vir_get_add_op(struct qinst *inst, enum v3d_qpu_add_op *op);
bool vir_get_mul_op(struct qinst *inst, enum v3d_qpu_mul_op *op);
bool vir_is_raw_mov(struct qinst *inst);
bool vir_is_tex(struct qinst *inst);
bool vir_is_add(struct qinst *inst);
bool vir_is_mul(struct qinst *inst);
bool vir_is_float_input(struct qinst *inst);
bool vir_depends_on_flags(struct qinst *inst);
bool vir_writes_r3(struct qinst *inst);
bool vir_writes_r4(struct qinst *inst);
struct qreg vir_follow_movs(struct v3d_compile *c, struct qreg reg);
uint8_t vir_channels_written(struct qinst *inst);
void vir_dump(struct v3d_compile *c);
void vir_dump_inst(struct v3d_compile *c, struct qinst *inst);
void vir_validate(struct v3d_compile *c);
void vir_optimize(struct v3d_compile *c);
bool vir_opt_algebraic(struct v3d_compile *c);
bool vir_opt_constant_folding(struct v3d_compile *c);
bool vir_opt_copy_propagate(struct v3d_compile *c);
bool vir_opt_dead_code(struct v3d_compile *c);
bool vir_opt_peephole_sf(struct v3d_compile *c);
bool vir_opt_small_immediates(struct v3d_compile *c);
bool vir_opt_vpm(struct v3d_compile *c);
void v3d_nir_lower_blend(nir_shader *s, struct v3d_compile *c);
void v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c);
void v3d_nir_lower_txf_ms(nir_shader *s, struct v3d_compile *c);
void vir_lower_uniforms(struct v3d_compile *c);
void v3d_vir_to_qpu(struct v3d_compile *c);
uint32_t v3d_qpu_schedule_instructions(struct v3d_compile *c);
void qpu_validate(struct v3d_compile *c);
struct qpu_reg *v3d_register_allocate(struct v3d_compile *c);
bool vir_init_reg_sets(struct v3d_compiler *compiler);
void vir_PF(struct v3d_compile *c, struct qreg src, enum v3d_qpu_pf pf);
static inline bool
quniform_contents_is_texture_p0(enum quniform_contents contents)
{
return (contents >= QUNIFORM_TEXTURE_CONFIG_P0_0 &&
contents < (QUNIFORM_TEXTURE_CONFIG_P0_0 +
V3D_MAX_TEXTURE_SAMPLERS));
}
static inline struct qreg
vir_uniform_ui(struct v3d_compile *c, uint32_t ui)
{
return vir_uniform(c, QUNIFORM_CONSTANT, ui);
}
static inline struct qreg
vir_uniform_f(struct v3d_compile *c, float f)
{
return vir_uniform(c, QUNIFORM_CONSTANT, fui(f));
}
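/* Usage sketch: vir_uniform() deduplicates by (contents, data), so
 * repeated requests share one uniform stream slot:
 *
 *     struct qreg half = vir_uniform_f(c, 0.5f);
 *     struct qreg zero = vir_uniform_ui(c, 0);
 */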
#define VIR_ALU0(name, vir_inst, op) \
static inline struct qreg \
vir_##name(struct v3d_compile *c) \
{ \
return vir_emit_def(c, vir_inst(op, c->undef, \
c->undef, c->undef)); \
} \
static inline struct qinst * \
vir_##name##_dest(struct v3d_compile *c, struct qreg dest) \
{ \
return vir_emit_nondef(c, vir_inst(op, dest, \
c->undef, c->undef)); \
}
#define VIR_ALU1(name, vir_inst, op) \
static inline struct qreg \
vir_##name(struct v3d_compile *c, struct qreg a) \
{ \
return vir_emit_def(c, vir_inst(op, c->undef, \
a, c->undef)); \
} \
static inline struct qinst * \
vir_##name##_dest(struct v3d_compile *c, struct qreg dest, \
struct qreg a) \
{ \
return vir_emit_nondef(c, vir_inst(op, dest, a, \
c->undef)); \
}
#define VIR_ALU2(name, vir_inst, op) \
static inline struct qreg \
vir_##name(struct v3d_compile *c, struct qreg a, struct qreg b) \
{ \
return vir_emit_def(c, vir_inst(op, c->undef, a, b)); \
} \
static inline struct qinst * \
vir_##name##_dest(struct v3d_compile *c, struct qreg dest, \
struct qreg a, struct qreg b) \
{ \
return vir_emit_nondef(c, vir_inst(op, dest, a, b)); \
}
#define VIR_NODST_1(name, vir_inst, op) \
static inline struct qinst * \
vir_##name(struct v3d_compile *c, struct qreg a) \
{ \
return vir_emit_nondef(c, vir_inst(op, c->undef, \
a, c->undef)); \
}
#define VIR_NODST_2(name, vir_inst, op) \
static inline struct qinst * \
vir_##name(struct v3d_compile *c, struct qreg a, struct qreg b) \
{ \
return vir_emit_nondef(c, vir_inst(op, c->undef, \
a, b)); \
}
#define VIR_A_ALU2(name) VIR_ALU2(name, vir_add_inst, V3D_QPU_A_##name)
#define VIR_M_ALU2(name) VIR_ALU2(name, vir_mul_inst, V3D_QPU_M_##name)
#define VIR_A_ALU1(name) VIR_ALU1(name, vir_add_inst, V3D_QPU_A_##name)
#define VIR_M_ALU1(name) VIR_ALU1(name, vir_mul_inst, V3D_QPU_M_##name)
#define VIR_A_ALU0(name) VIR_ALU0(name, vir_add_inst, V3D_QPU_A_##name)
#define VIR_M_ALU0(name) VIR_ALU0(name, vir_mul_inst, V3D_QPU_M_##name)
#define VIR_A_NODST_2(name) VIR_NODST_2(name, vir_add_inst, V3D_QPU_A_##name)
#define VIR_M_NODST_2(name) VIR_NODST_2(name, vir_mul_inst, V3D_QPU_M_##name)
#define VIR_A_NODST_1(name) VIR_NODST_1(name, vir_add_inst, V3D_QPU_A_##name)
#define VIR_M_NODST_1(name) VIR_NODST_1(name, vir_mul_inst, V3D_QPU_M_##name)
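/* For reference, VIR_A_ALU2(FADD) below expands to roughly:
 *
 *     static inline struct qreg
 *     vir_FADD(struct v3d_compile *c, struct qreg a, struct qreg b)
 *     {
 *             return vir_emit_def(c, vir_add_inst(V3D_QPU_A_FADD,
 *                                                 c->undef, a, b));
 *     }
 *
 * plus a vir_FADD_dest() variant that takes an explicit destination
 * and returns the qinst instead.
 */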
VIR_A_ALU2(FADD)
VIR_A_ALU2(VFPACK)
VIR_A_ALU2(FSUB)
VIR_A_ALU2(FMIN)
VIR_A_ALU2(FMAX)
VIR_A_ALU2(ADD)
VIR_A_ALU2(SUB)
VIR_A_ALU2(SHL)
VIR_A_ALU2(SHR)
VIR_A_ALU2(ASR)
VIR_A_ALU2(ROR)
VIR_A_ALU2(MIN)
VIR_A_ALU2(MAX)
VIR_A_ALU2(UMIN)
VIR_A_ALU2(UMAX)
VIR_A_ALU2(AND)
VIR_A_ALU2(OR)
VIR_A_ALU2(XOR)
VIR_A_ALU2(VADD)
VIR_A_ALU2(VSUB)
VIR_A_ALU1(NOT)
VIR_A_ALU1(NEG)
VIR_A_ALU1(FLAPUSH)
VIR_A_ALU1(FLBPUSH)
VIR_A_ALU1(FLBPOP)
VIR_A_ALU1(SETMSF)
VIR_A_ALU1(SETREVF)
VIR_A_ALU1(TIDX)
VIR_A_ALU1(EIDX)
VIR_A_ALU0(FXCD)
VIR_A_ALU0(XCD)
VIR_A_ALU0(FYCD)
VIR_A_ALU0(YCD)
VIR_A_ALU0(MSF)
VIR_A_ALU0(REVF)
VIR_A_NODST_1(VPMSETUP)
VIR_A_ALU2(FCMP)
VIR_A_ALU2(VFMAX)
VIR_A_ALU1(FROUND)
VIR_A_ALU1(FTOIN)
VIR_A_ALU1(FTRUNC)
VIR_A_ALU1(FTOIZ)
VIR_A_ALU1(FFLOOR)
VIR_A_ALU1(FTOUZ)
VIR_A_ALU1(FCEIL)
VIR_A_ALU1(FTOC)
VIR_A_ALU1(FDX)
VIR_A_ALU1(FDY)
VIR_A_ALU1(ITOF)
VIR_A_ALU1(CLZ)
VIR_A_ALU1(UTOF)
VIR_M_ALU2(UMUL24)
VIR_M_ALU2(FMUL)
VIR_M_ALU2(SMUL24)
VIR_M_NODST_2(MULTOP)
VIR_M_ALU1(MOV)
VIR_M_ALU1(FMOV)
static inline struct qinst *
vir_MOV_cond(struct v3d_compile *c, enum v3d_qpu_cond cond,
struct qreg dest, struct qreg src)
{
struct qinst *mov = vir_MOV_dest(c, dest, src);
vir_set_cond(mov, cond);
return mov;
}
static inline struct qreg
vir_SEL(struct v3d_compile *c, enum v3d_qpu_cond cond,
struct qreg src0, struct qreg src1)
{
struct qreg t = vir_get_temp(c);
vir_MOV_dest(c, t, src1);
vir_MOV_cond(c, cond, t, src0);
return t;
}
static inline void
vir_VPM_WRITE(struct v3d_compile *c, struct qreg val)
{
vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), val);
}
static inline struct qinst *
vir_NOP(struct v3d_compile *c)
{
return vir_emit_nondef(c, vir_add_inst(V3D_QPU_A_NOP,
c->undef, c->undef, c->undef));
}
/*
static inline struct qreg
vir_LOAD_IMM(struct v3d_compile *c, uint32_t val)
{
return vir_emit_def(c, vir_inst(QOP_LOAD_IMM, c->undef,
vir_reg(QFILE_LOAD_IMM, val), c->undef));
}
static inline struct qreg
vir_LOAD_IMM_U2(struct v3d_compile *c, uint32_t val)
{
return vir_emit_def(c, vir_inst(QOP_LOAD_IMM_U2, c->undef,
vir_reg(QFILE_LOAD_IMM, val),
c->undef));
}
static inline struct qreg
vir_LOAD_IMM_I2(struct v3d_compile *c, uint32_t val)
{
return vir_emit_def(c, vir_inst(QOP_LOAD_IMM_I2, c->undef,
vir_reg(QFILE_LOAD_IMM, val),
c->undef));
}
*/
static inline struct qinst *
vir_BRANCH(struct v3d_compile *c, enum v3d_qpu_cond cond)
{
/* The actual uniform_data value will be set at scheduling time */
return vir_emit_nondef(c, vir_branch_inst(cond, vir_uniform_ui(c, 0)));
}
#define vir_for_each_block(block, c) \
list_for_each_entry(struct qblock, block, &c->blocks, link)
#define vir_for_each_block_rev(block, c) \
list_for_each_entry_rev(struct qblock, block, &c->blocks, link)
/* Loop over the non-NULL members of the successors array. */
#define vir_for_each_successor(succ, block) \
for (struct qblock *succ = block->successors[0]; \
succ != NULL; \
succ = (succ == block->successors[1] ? NULL : \
block->successors[1]))
#define vir_for_each_inst(inst, block) \
list_for_each_entry(struct qinst, inst, &block->instructions, link)
#define vir_for_each_inst_rev(inst, block) \
list_for_each_entry_rev(struct qinst, inst, &block->instructions, link)
#define vir_for_each_inst_safe(inst, block) \
list_for_each_entry_safe(struct qinst, inst, &block->instructions, link)
#define vir_for_each_inst_inorder(inst, c) \
vir_for_each_block(_block, c) \
vir_for_each_inst(inst, _block)
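/* Usage sketch for the iterators (hypothetical snippet):
 *
 *     uint32_t count = 0;
 *     vir_for_each_inst_inorder(inst, c)
 *             count++;
 */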
#endif /* V3D_COMPILER_H */


@@ -0,0 +1,176 @@
/*
* Copyright © 2015 Broadcom
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "compiler/v3d_compiler.h"
#include "compiler/nir/nir_builder.h"
/**
* Walks the NIR generated by TGSI-to-NIR or GLSL-to-NIR to lower its io
* intrinsics into something amenable to the V3D architecture.
*
* Currently, it splits VS inputs and uniforms into scalars, drops any
* non-position outputs in coordinate shaders, and fixes up the addressing on
* indirect uniform loads. FS input and VS output scalarization is handled by
* nir_lower_io_to_scalar().
*/
static void
replace_intrinsic_with_vec(nir_builder *b, nir_intrinsic_instr *intr,
nir_ssa_def **comps)
{
/* Batch things back together into a vector. This will get split by
* the later ALU scalarization pass.
*/
nir_ssa_def *vec = nir_vec(b, comps, intr->num_components);
/* Replace the old intrinsic with a reference to our reconstructed
* vector.
*/
nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(vec));
nir_instr_remove(&intr->instr);
}
static void
v3d_nir_lower_output(struct v3d_compile *c, nir_builder *b,
nir_intrinsic_instr *intr)
{
nir_variable *output_var = NULL;
nir_foreach_variable(var, &c->s->outputs) {
if (var->data.driver_location == nir_intrinsic_base(intr)) {
output_var = var;
break;
}
}
assert(output_var);
if (c->vs_key) {
int slot = output_var->data.location;
bool used = false;
switch (slot) {
case VARYING_SLOT_PSIZ:
case VARYING_SLOT_POS:
used = true;
break;
default:
for (int i = 0; i < c->vs_key->num_fs_inputs; i++) {
if (v3d_slot_get_slot(c->vs_key->fs_inputs[i]) == slot) {
used = true;
break;
}
}
break;
}
if (!used)
nir_instr_remove(&intr->instr);
}
}
static void
v3d_nir_lower_uniform(struct v3d_compile *c, nir_builder *b,
nir_intrinsic_instr *intr)
{
b->cursor = nir_before_instr(&intr->instr);
/* Generate scalar loads equivalent to the original vector. */
nir_ssa_def *dests[4];
for (unsigned i = 0; i < intr->num_components; i++) {
nir_intrinsic_instr *intr_comp =
nir_intrinsic_instr_create(c->s, intr->intrinsic);
intr_comp->num_components = 1;
nir_ssa_dest_init(&intr_comp->instr, &intr_comp->dest, 1, 32, NULL);
/* Convert the uniform offset to bytes. If it happens
* to be a constant, constant-folding will clean up
* the shift for us.
*/
nir_intrinsic_set_base(intr_comp,
nir_intrinsic_base(intr) * 16 +
i * 4);
intr_comp->src[0] =
nir_src_for_ssa(nir_ishl(b, intr->src[0].ssa,
nir_imm_int(b, 4)));
dests[i] = &intr_comp->dest.ssa;
nir_builder_instr_insert(b, &intr_comp->instr);
}
replace_intrinsic_with_vec(b, intr, dests);
}
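/* Example of the rewrite above on a vec4 load (illustrative SSA
 * numbering).  Before:
 *
 *     vec4 32 ssa_2 = intrinsic load_uniform (ssa_1) (base=3)
 *
 * After: one scalar load per component, with the base scaled to bytes
 * (3 * 16 + i * 4) and the indirect offset shifted into bytes, then a
 * vec4 that regathers the components for later scalarization:
 *
 *     ssa_3 = ishl ssa_1, 4
 *     vec1 32 ssa_4 = intrinsic load_uniform (ssa_3) (base=48)
 *     ...
 *     vec4 32 ssa_8 = vec4 ssa_4, ssa_5, ssa_6, ssa_7
 */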
static void
v3d_nir_lower_io_instr(struct v3d_compile *c, nir_builder *b,
struct nir_instr *instr)
{
if (instr->type != nir_instr_type_intrinsic)
return;
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
switch (intr->intrinsic) {
case nir_intrinsic_load_input:
break;
case nir_intrinsic_store_output:
v3d_nir_lower_output(c, b, intr);
break;
case nir_intrinsic_load_uniform:
v3d_nir_lower_uniform(c, b, intr);
break;
case nir_intrinsic_load_user_clip_plane:
default:
break;
}
}
static bool
v3d_nir_lower_io_impl(struct v3d_compile *c, nir_function_impl *impl)
{
nir_builder b;
nir_builder_init(&b, impl);
nir_foreach_block(block, impl) {
nir_foreach_instr_safe(instr, block)
v3d_nir_lower_io_instr(c, &b, instr);
}
nir_metadata_preserve(impl, nir_metadata_block_index |
nir_metadata_dominance);
return true;
}
void
v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c)
{
nir_foreach_function(function, s) {
if (function->impl)
v3d_nir_lower_io_impl(c, function->impl);
}
}

src/broadcom/compiler/vir.c (new file, 907 lines)

@@ -0,0 +1,907 @@
/*
* Copyright © 2016-2017 Broadcom
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "v3d_compiler.h"
int
vir_get_non_sideband_nsrc(struct qinst *inst)
{
switch (inst->qpu.type) {
case V3D_QPU_INSTR_TYPE_BRANCH:
return 0;
case V3D_QPU_INSTR_TYPE_ALU:
if (inst->qpu.alu.add.op != V3D_QPU_A_NOP)
return v3d_qpu_add_op_num_src(inst->qpu.alu.add.op);
else
return v3d_qpu_mul_op_num_src(inst->qpu.alu.mul.op);
}
return 0;
}
int
vir_get_nsrc(struct qinst *inst)
{
int nsrc = vir_get_non_sideband_nsrc(inst);
if (vir_has_implicit_uniform(inst))
nsrc++;
return nsrc;
}
bool
vir_has_implicit_uniform(struct qinst *inst)
{
switch (inst->qpu.type) {
case V3D_QPU_INSTR_TYPE_BRANCH:
return true;
case V3D_QPU_INSTR_TYPE_ALU:
switch (inst->dst.file) {
case QFILE_TLBU:
return true;
default:
return inst->has_implicit_uniform;
}
}
return false;
}
/* The sideband uniform for textures gets stored after the normal ALU
* arguments.
*/
int
vir_get_implicit_uniform_src(struct qinst *inst)
{
return vir_get_nsrc(inst) - 1;
}
/**
* Returns whether the instruction has any side effects that must be
* preserved.
*/
bool
vir_has_side_effects(struct v3d_compile *c, struct qinst *inst)
{
switch (inst->qpu.type) {
case V3D_QPU_INSTR_TYPE_BRANCH:
return true;
case V3D_QPU_INSTR_TYPE_ALU:
switch (inst->qpu.alu.add.op) {
case V3D_QPU_A_SETREVF:
case V3D_QPU_A_SETMSF:
case V3D_QPU_A_VPMSETUP:
return true;
default:
break;
}
switch (inst->qpu.alu.mul.op) {
case V3D_QPU_M_MULTOP:
return true;
default:
break;
}
}
if (inst->qpu.sig.ldtmu)
return true;
return false;
}
bool
vir_is_float_input(struct qinst *inst)
{
/* XXX: More instrs */
switch (inst->qpu.type) {
case V3D_QPU_INSTR_TYPE_BRANCH:
return false;
case V3D_QPU_INSTR_TYPE_ALU:
switch (inst->qpu.alu.add.op) {
case V3D_QPU_A_FADD:
case V3D_QPU_A_FSUB:
case V3D_QPU_A_FMIN:
case V3D_QPU_A_FMAX:
case V3D_QPU_A_FTOIN:
return true;
default:
break;
}
switch (inst->qpu.alu.mul.op) {
case V3D_QPU_M_FMOV:
case V3D_QPU_M_VFMUL:
case V3D_QPU_M_FMUL:
return true;
default:
break;
}
}
return false;
}
bool
vir_is_raw_mov(struct qinst *inst)
{
if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
(inst->qpu.alu.mul.op != V3D_QPU_M_FMOV &&
inst->qpu.alu.mul.op != V3D_QPU_M_MOV)) {
return false;
}
if (inst->qpu.alu.add.output_pack != V3D_QPU_PACK_NONE ||
inst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE) {
return false;
}
if (inst->qpu.flags.ac != V3D_QPU_COND_NONE ||
inst->qpu.flags.mc != V3D_QPU_COND_NONE)
return false;
return true;
}
bool
vir_is_add(struct qinst *inst)
{
return (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
inst->qpu.alu.add.op != V3D_QPU_A_NOP);
}
bool
vir_is_mul(struct qinst *inst)
{
return (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
inst->qpu.alu.mul.op != V3D_QPU_M_NOP);
}
bool
vir_is_tex(struct qinst *inst)
{
if (inst->dst.file == QFILE_MAGIC)
return v3d_qpu_magic_waddr_is_tmu(inst->dst.index);
return false;
}
bool
vir_depends_on_flags(struct qinst *inst)
{
if (inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH) {
return (inst->qpu.branch.cond != V3D_QPU_BRANCH_COND_ALWAYS);
} else {
return (inst->qpu.flags.ac != V3D_QPU_COND_NONE ||
inst->qpu.flags.mc != V3D_QPU_COND_NONE);
}
}
bool
vir_writes_r3(struct qinst *inst)
{
for (int i = 0; i < vir_get_nsrc(inst); i++) {
switch (inst->src[i].file) {
case QFILE_VARY:
case QFILE_VPM:
return true;
default:
break;
}
}
return false;
}
bool
vir_writes_r4(struct qinst *inst)
{
switch (inst->dst.file) {
case QFILE_MAGIC:
switch (inst->dst.index) {
case V3D_QPU_WADDR_RECIP:
case V3D_QPU_WADDR_RSQRT:
case V3D_QPU_WADDR_EXP:
case V3D_QPU_WADDR_LOG:
case V3D_QPU_WADDR_SIN:
return true;
}
break;
default:
break;
}
if (inst->qpu.sig.ldtmu)
return true;
return false;
}
void
vir_set_unpack(struct qinst *inst, int src,
enum v3d_qpu_input_unpack unpack)
{
assert(src == 0 || src == 1);
if (vir_is_add(inst)) {
if (src == 0)
inst->qpu.alu.add.a_unpack = unpack;
else
inst->qpu.alu.add.b_unpack = unpack;
} else {
assert(vir_is_mul(inst));
if (src == 0)
inst->qpu.alu.mul.a_unpack = unpack;
else
inst->qpu.alu.mul.b_unpack = unpack;
}
}
void
vir_set_cond(struct qinst *inst, enum v3d_qpu_cond cond)
{
if (vir_is_add(inst)) {
inst->qpu.flags.ac = cond;
} else {
assert(vir_is_mul(inst));
inst->qpu.flags.mc = cond;
}
}
void
vir_set_pf(struct qinst *inst, enum v3d_qpu_pf pf)
{
if (vir_is_add(inst)) {
inst->qpu.flags.apf = pf;
} else {
assert(vir_is_mul(inst));
inst->qpu.flags.mpf = pf;
}
}
#if 0
uint8_t
vir_channels_written(struct qinst *inst)
{
if (vir_is_mul(inst)) {
switch (inst->dst.pack) {
case QPU_PACK_MUL_NOP:
case QPU_PACK_MUL_8888:
return 0xf;
case QPU_PACK_MUL_8A:
return 0x1;
case QPU_PACK_MUL_8B:
return 0x2;
case QPU_PACK_MUL_8C:
return 0x4;
case QPU_PACK_MUL_8D:
return 0x8;
}
} else {
switch (inst->dst.pack) {
case QPU_PACK_A_NOP:
case QPU_PACK_A_8888:
case QPU_PACK_A_8888_SAT:
case QPU_PACK_A_32_SAT:
return 0xf;
case QPU_PACK_A_8A:
case QPU_PACK_A_8A_SAT:
return 0x1;
case QPU_PACK_A_8B:
case QPU_PACK_A_8B_SAT:
return 0x2;
case QPU_PACK_A_8C:
case QPU_PACK_A_8C_SAT:
return 0x4;
case QPU_PACK_A_8D:
case QPU_PACK_A_8D_SAT:
return 0x8;
case QPU_PACK_A_16A:
case QPU_PACK_A_16A_SAT:
return 0x3;
case QPU_PACK_A_16B:
case QPU_PACK_A_16B_SAT:
return 0xc;
}
}
unreachable("Bad pack field");
}
#endif
struct qreg
vir_get_temp(struct v3d_compile *c)
{
struct qreg reg;
reg.file = QFILE_TEMP;
reg.index = c->num_temps++;
if (c->num_temps > c->defs_array_size) {
uint32_t old_size = c->defs_array_size;
c->defs_array_size = MAX2(old_size * 2, 16);
c->defs = reralloc(c, c->defs, struct qinst *,
c->defs_array_size);
memset(&c->defs[old_size], 0,
sizeof(c->defs[0]) * (c->defs_array_size - old_size));
}
return reg;
}
struct qinst *
vir_add_inst(enum v3d_qpu_add_op op, struct qreg dst, struct qreg src0, struct qreg src1)
{
struct qinst *inst = calloc(1, sizeof(*inst));
inst->qpu = v3d_qpu_nop();
inst->qpu.alu.add.op = op;
inst->dst = dst;
inst->src[0] = src0;
inst->src[1] = src1;
inst->uniform = ~0;
return inst;
}
struct qinst *
vir_mul_inst(enum v3d_qpu_mul_op op, struct qreg dst, struct qreg src0, struct qreg src1)
{
struct qinst *inst = calloc(1, sizeof(*inst));
inst->qpu = v3d_qpu_nop();
inst->qpu.alu.mul.op = op;
inst->dst = dst;
inst->src[0] = src0;
inst->src[1] = src1;
inst->uniform = ~0;
return inst;
}
struct qinst *
vir_branch_inst(enum v3d_qpu_branch_cond cond, struct qreg src)
{
struct qinst *inst = calloc(1, sizeof(*inst));
inst->qpu = v3d_qpu_nop();
inst->qpu.type = V3D_QPU_INSTR_TYPE_BRANCH;
inst->qpu.branch.cond = cond;
inst->qpu.branch.msfign = V3D_QPU_MSFIGN_NONE;
inst->qpu.branch.bdi = V3D_QPU_BRANCH_DEST_REL;
inst->qpu.branch.ub = true;
inst->qpu.branch.bdu = V3D_QPU_BRANCH_DEST_REL;
inst->dst = vir_reg(QFILE_NULL, 0);
inst->src[0] = src;
inst->uniform = ~0;
return inst;
}
static void
vir_emit(struct v3d_compile *c, struct qinst *inst)
{
list_addtail(&inst->link, &c->cur_block->instructions);
if (inst->dst.file == QFILE_MAGIC &&
inst->dst.index == V3D_QPU_WADDR_VPM)
c->num_vpm_writes++;
}
/* Updates inst to write to a new temporary, emits it, and notes the def. */
struct qreg
vir_emit_def(struct v3d_compile *c, struct qinst *inst)
{
assert(inst->dst.file == QFILE_NULL);
inst->dst = vir_get_temp(c);
if (inst->dst.file == QFILE_TEMP)
c->defs[inst->dst.index] = inst;
vir_emit(c, inst);
return inst->dst;
}
struct qinst *
vir_emit_nondef(struct v3d_compile *c, struct qinst *inst)
{
if (inst->dst.file == QFILE_TEMP)
c->defs[inst->dst.index] = NULL;
vir_emit(c, inst);
return inst;
}
struct qblock *
vir_new_block(struct v3d_compile *c)
{
struct qblock *block = rzalloc(c, struct qblock);
list_inithead(&block->instructions);
block->predecessors = _mesa_set_create(block,
_mesa_hash_pointer,
_mesa_key_pointer_equal);
block->index = c->next_block_index++;
return block;
}
void
vir_set_emit_block(struct v3d_compile *c, struct qblock *block)
{
c->cur_block = block;
list_addtail(&block->link, &c->blocks);
}
struct qblock *
vir_entry_block(struct v3d_compile *c)
{
return list_first_entry(&c->blocks, struct qblock, link);
}
struct qblock *
vir_exit_block(struct v3d_compile *c)
{
return list_last_entry(&c->blocks, struct qblock, link);
}
void
vir_link_blocks(struct qblock *predecessor, struct qblock *successor)
{
_mesa_set_add(successor->predecessors, predecessor);
if (predecessor->successors[0]) {
assert(!predecessor->successors[1]);
predecessor->successors[1] = successor;
} else {
predecessor->successors[0] = successor;
}
}
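/* Sketch: wiring up a two-way branch with the helpers above
 * (hypothetical blocks; condition setup elided):
 *
 *     struct qblock *then_block = vir_new_block(c);
 *     struct qblock *after_block = vir_new_block(c);
 *     vir_link_blocks(c->cur_block, then_block);
 *     vir_link_blocks(c->cur_block, after_block);
 *     vir_set_emit_block(c, then_block);
 *     ...emit the then side...
 *     vir_link_blocks(c->cur_block, after_block);
 *     vir_set_emit_block(c, after_block);
 */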
const struct v3d_compiler *
v3d_compiler_init(const struct v3d_device_info *devinfo)
{
struct v3d_compiler *compiler = rzalloc(NULL, struct v3d_compiler);
if (!compiler)
return NULL;
compiler->devinfo = devinfo;
if (!vir_init_reg_sets(compiler)) {
ralloc_free(compiler);
return NULL;
}
return compiler;
}
void
v3d_compiler_free(const struct v3d_compiler *compiler)
{
ralloc_free((void *)compiler);
}
static struct v3d_compile *
vir_compile_init(const struct v3d_compiler *compiler,
struct v3d_key *key,
nir_shader *s,
int program_id, int variant_id)
{
struct v3d_compile *c = rzalloc(NULL, struct v3d_compile);
c->compiler = compiler;
c->devinfo = compiler->devinfo;
c->key = key;
c->program_id = program_id;
c->variant_id = variant_id;
s = nir_shader_clone(c, s);
c->s = s;
list_inithead(&c->blocks);
vir_set_emit_block(c, vir_new_block(c));
c->output_position_index = -1;
c->output_point_size_index = -1;
c->output_sample_mask_index = -1;
c->def_ht = _mesa_hash_table_create(c, _mesa_hash_pointer,
_mesa_key_pointer_equal);
return c;
}
static void
v3d_lower_nir(struct v3d_compile *c)
{
struct nir_lower_tex_options tex_options = {
.lower_rect = false, /* XXX */
.lower_txp = ~0,
/* Apply swizzles to all samplers. */
.swizzle_result = ~0,
};
/* Lower the format swizzle and (for 32-bit returns)
* ARB_texture_swizzle-style swizzle.
*/
for (int i = 0; i < ARRAY_SIZE(c->key->tex); i++) {
for (int j = 0; j < 4; j++)
tex_options.swizzles[i][j] = c->key->tex[i].swizzle[j];
}
NIR_PASS_V(c->s, nir_lower_tex, &tex_options);
}
static void
v3d_lower_nir_late(struct v3d_compile *c)
{
NIR_PASS_V(c->s, v3d_nir_lower_io, c);
NIR_PASS_V(c->s, nir_lower_idiv);
}
static void
v3d_set_prog_data_uniforms(struct v3d_compile *c,
struct v3d_prog_data *prog_data)
{
int count = c->num_uniforms;
struct v3d_uniform_list *ulist = &prog_data->uniforms;
ulist->count = count;
ulist->data = ralloc_array(prog_data, uint32_t, count);
memcpy(ulist->data, c->uniform_data,
count * sizeof(*ulist->data));
ulist->contents = ralloc_array(prog_data, enum quniform_contents, count);
memcpy(ulist->contents, c->uniform_contents,
count * sizeof(*ulist->contents));
}
/* Copy the compiler UBO range state to the compiled shader, dropping out
* arrays that were never referenced by an indirect load.
*
* (Note that VIR dead code elimination of an array access still leaves that
* array alive, though)
*/
static void
v3d_set_prog_data_ubo(struct v3d_compile *c,
struct v3d_prog_data *prog_data)
{
if (!c->num_ubo_ranges)
return;
prog_data->num_ubo_ranges = 0;
prog_data->ubo_ranges = ralloc_array(prog_data, struct v3d_ubo_range,
c->num_ubo_ranges);
for (int i = 0; i < c->num_ubo_ranges; i++) {
if (!c->ubo_range_used[i])
continue;
struct v3d_ubo_range *range = &c->ubo_ranges[i];
prog_data->ubo_ranges[prog_data->num_ubo_ranges++] = *range;
prog_data->ubo_size += range->size;
}
if (prog_data->ubo_size) {
if (V3D_DEBUG & V3D_DEBUG_SHADERDB) {
fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d UBO uniforms\n",
vir_get_stage_name(c),
c->program_id, c->variant_id,
prog_data->ubo_size / 4);
}
}
}
static void
v3d_set_prog_data(struct v3d_compile *c,
struct v3d_prog_data *prog_data)
{
v3d_set_prog_data_uniforms(c, prog_data);
v3d_set_prog_data_ubo(c, prog_data);
}
static uint64_t *
v3d_return_qpu_insts(struct v3d_compile *c, uint32_t *final_assembly_size)
{
*final_assembly_size = c->qpu_inst_count * sizeof(uint64_t);
uint64_t *qpu_insts = malloc(*final_assembly_size);
if (!qpu_insts)
return NULL;
memcpy(qpu_insts, c->qpu_insts, *final_assembly_size);
vir_compile_destroy(c);
return qpu_insts;
}
uint64_t *v3d_compile_vs(const struct v3d_compiler *compiler,
struct v3d_vs_key *key,
struct v3d_vs_prog_data *prog_data,
nir_shader *s,
int program_id, int variant_id,
uint32_t *final_assembly_size)
{
struct v3d_compile *c = vir_compile_init(compiler, &key->base, s,
program_id, variant_id);
c->vs_key = key;
v3d_lower_nir(c);
if (key->clamp_color)
NIR_PASS_V(c->s, nir_lower_clamp_color_outputs);
if (key->base.ucp_enables) {
NIR_PASS_V(c->s, nir_lower_clip_vs, key->base.ucp_enables);
NIR_PASS_V(c->s, nir_lower_io_to_scalar,
nir_var_shader_out);
}
/* Note: VS output scalarizing must happen after nir_lower_clip_vs. */
NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out);
v3d_lower_nir_late(c);
v3d_optimize_nir(c->s);
NIR_PASS_V(c->s, nir_convert_from_ssa, true);
v3d_nir_to_vir(c);
v3d_set_prog_data(c, &prog_data->base);
prog_data->base.num_inputs = c->num_inputs;
/* The vertex data gets format converted by the VPM so that
* each attribute channel takes up a VPM column. Precompute
* the sizes for the shader record.
*/
for (int i = 0; i < ARRAY_SIZE(prog_data->vattr_sizes); i++) {
prog_data->vattr_sizes[i] = c->vattr_sizes[i];
prog_data->vpm_input_size += c->vattr_sizes[i];
}
/* Input/output segment sizes are in 8x32-bit multiples. */
prog_data->vpm_input_size = align(prog_data->vpm_input_size, 8) / 8;
prog_data->vpm_output_size = align(c->num_vpm_writes, 8) / 8;
prog_data->uses_vid = (s->info.system_values_read &
(1ull << SYSTEM_VALUE_VERTEX_ID));
prog_data->uses_iid = (s->info.system_values_read &
(1ull << SYSTEM_VALUE_INSTANCE_ID));
return v3d_return_qpu_insts(c, final_assembly_size);
}
static void
v3d_set_fs_prog_data_inputs(struct v3d_compile *c,
struct v3d_fs_prog_data *prog_data)
{
prog_data->base.num_inputs = c->num_inputs;
memcpy(prog_data->input_slots, c->input_slots,
c->num_inputs * sizeof(*c->input_slots));
for (int i = 0; i < c->num_inputs; i++) {
struct v3d_varying_slot v3d_slot = c->input_slots[i];
uint8_t slot = v3d_slot_get_slot(v3d_slot);
if (slot == VARYING_SLOT_COL0 ||
slot == VARYING_SLOT_COL1 ||
slot == VARYING_SLOT_BFC0 ||
slot == VARYING_SLOT_BFC1) {
BITSET_SET(prog_data->color_inputs, i);
}
if (BITSET_TEST(c->flat_shade_flags, i))
BITSET_SET(prog_data->flat_shade_flags, i);
}
}
uint64_t *v3d_compile_fs(const struct v3d_compiler *compiler,
struct v3d_fs_key *key,
struct v3d_fs_prog_data *prog_data,
nir_shader *s,
int program_id, int variant_id,
uint32_t *final_assembly_size)
{
struct v3d_compile *c = vir_compile_init(compiler, &key->base, s,
program_id, variant_id);
c->fs_key = key;
v3d_lower_nir(c);
if (key->light_twoside)
NIR_PASS_V(c->s, nir_lower_two_sided_color);
if (key->clamp_color)
NIR_PASS_V(c->s, nir_lower_clamp_color_outputs);
if (key->alpha_test) {
NIR_PASS_V(c->s, nir_lower_alpha_test, key->alpha_test_func,
false);
}
if (key->base.ucp_enables)
NIR_PASS_V(c->s, nir_lower_clip_fs, key->base.ucp_enables);
/* Note: FS input scalarizing must happen after
* nir_lower_two_sided_color, which only handles a vec4 at a time.
*/
NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_in);
v3d_lower_nir_late(c);
v3d_optimize_nir(c->s);
NIR_PASS_V(c->s, nir_convert_from_ssa, true);
v3d_nir_to_vir(c);
v3d_set_prog_data(c, &prog_data->base);
v3d_set_fs_prog_data_inputs(c, prog_data);
if (c->s->info.outputs_written & (1 << FRAG_RESULT_DEPTH))
prog_data->writes_z = true;
return v3d_return_qpu_insts(c, final_assembly_size);
}
void
vir_remove_instruction(struct v3d_compile *c, struct qinst *qinst)
{
if (qinst->dst.file == QFILE_TEMP)
c->defs[qinst->dst.index] = NULL;
list_del(&qinst->link);
free(qinst);
}
struct qreg
vir_follow_movs(struct v3d_compile *c, struct qreg reg)
{
/* XXX
int pack = reg.pack;
while (reg.file == QFILE_TEMP &&
c->defs[reg.index] &&
(c->defs[reg.index]->op == QOP_MOV ||
c->defs[reg.index]->op == QOP_FMOV) &&
!c->defs[reg.index]->dst.pack &&
!c->defs[reg.index]->src[0].pack) {
reg = c->defs[reg.index]->src[0];
}
reg.pack = pack;
*/
return reg;
}
void
vir_compile_destroy(struct v3d_compile *c)
{
vir_for_each_block(block, c) {
while (!list_empty(&block->instructions)) {
struct qinst *qinst =
list_first_entry(&block->instructions,
struct qinst, link);
vir_remove_instruction(c, qinst);
}
}
ralloc_free(c);
}
struct qreg
vir_uniform(struct v3d_compile *c,
enum quniform_contents contents,
uint32_t data)
{
for (int i = 0; i < c->num_uniforms; i++) {
if (c->uniform_contents[i] == contents &&
c->uniform_data[i] == data) {
return vir_reg(QFILE_UNIF, i);
}
}
uint32_t uniform = c->num_uniforms++;
if (uniform >= c->uniform_array_size) {
c->uniform_array_size = MAX2(MAX2(16, uniform + 1),
c->uniform_array_size * 2);
c->uniform_data = reralloc(c, c->uniform_data,
uint32_t,
c->uniform_array_size);
c->uniform_contents = reralloc(c, c->uniform_contents,
enum quniform_contents,
c->uniform_array_size);
}
c->uniform_contents[uniform] = contents;
c->uniform_data[uniform] = data;
return vir_reg(QFILE_UNIF, uniform);
}
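/* Dedup example: both requests below return the same QFILE_UNIF index,
 * so 0.5f is stored in the uniform stream only once:
 *
 *     struct qreg a = vir_uniform(c, QUNIFORM_CONSTANT, fui(0.5f));
 *     struct qreg b = vir_uniform(c, QUNIFORM_CONSTANT, fui(0.5f));
 *     assert(a.index == b.index);
 */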
void
vir_PF(struct v3d_compile *c, struct qreg src, enum v3d_qpu_pf pf)
{
struct qinst *last_inst = NULL;
if (!list_empty(&c->cur_block->instructions))
last_inst = (struct qinst *)c->cur_block->instructions.prev;
if (src.file != QFILE_TEMP ||
!c->defs[src.index] ||
last_inst != c->defs[src.index]) {
/* XXX: Make the MOV be the appropriate type */
last_inst = vir_MOV_dest(c, vir_reg(QFILE_NULL, 0), src);
last_inst = (struct qinst *)c->cur_block->instructions.prev;
}
vir_set_pf(last_inst, pf);
}
#define OPTPASS(func) \
do { \
bool stage_progress = func(c); \
if (stage_progress) { \
progress = true; \
if (print_opt_debug) { \
fprintf(stderr, \
"VIR opt pass %2d: %s progress\n", \
pass, #func); \
} \
/*XXX vir_validate(c);*/ \
} \
} while (0)
void
vir_optimize(struct v3d_compile *c)
{
bool print_opt_debug = false;
int pass = 1;
while (true) {
bool progress = false;
OPTPASS(vir_opt_copy_propagate);
OPTPASS(vir_opt_dead_code);
if (!progress)
break;
pass++;
}
}
const char *
vir_get_stage_name(struct v3d_compile *c)
{
if (c->vs_key && c->vs_key->is_coord)
return "MESA_SHADER_COORD";
else
return gl_shader_stage_name(c->s->stage);
}


@@ -0,0 +1,339 @@
/*
* Copyright © 2016-2017 Broadcom
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "v3d_compiler.h"
static void
vir_print_reg(struct v3d_compile *c, struct qreg reg)
{
static const char *files[] = {
[QFILE_TEMP] = "t",
[QFILE_VARY] = "v",
[QFILE_UNIF] = "u",
[QFILE_TLB] = "tlb",
[QFILE_TLBU] = "tlbu",
};
static const char *quniform_names[] = {
[QUNIFORM_VIEWPORT_X_SCALE] = "vp_x_scale",
[QUNIFORM_VIEWPORT_Y_SCALE] = "vp_y_scale",
[QUNIFORM_VIEWPORT_Z_OFFSET] = "vp_z_offset",
[QUNIFORM_VIEWPORT_Z_SCALE] = "vp_z_scale",
};
switch (reg.file) {
case QFILE_NULL:
fprintf(stderr, "null");
break;
case QFILE_LOAD_IMM:
fprintf(stderr, "0x%08x (%f)", reg.index, uif(reg.index));
break;
case QFILE_REG:
fprintf(stderr, "rf%d", reg.index);
break;
case QFILE_MAGIC:
fprintf(stderr, "%s", v3d_qpu_magic_waddr_name(reg.index));
break;
case QFILE_SMALL_IMM:
if ((int)reg.index >= -16 && (int)reg.index <= 15)
fprintf(stderr, "%d", reg.index);
else
fprintf(stderr, "%f", uif(reg.index));
break;
case QFILE_VPM:
fprintf(stderr, "vpm%d.%d",
reg.index / 4, reg.index % 4);
break;
case QFILE_TLB:
fprintf(stderr, "%s", files[reg.file]);
break;
case QFILE_UNIF: {
enum quniform_contents contents = c->uniform_contents[reg.index];
fprintf(stderr, "%s%d", files[reg.file], reg.index);
switch (contents) {
case QUNIFORM_CONSTANT:
fprintf(stderr, " (0x%08x / %f)",
c->uniform_data[reg.index],
uif(c->uniform_data[reg.index]));
break;
case QUNIFORM_UNIFORM:
fprintf(stderr, " (push[%d])",
c->uniform_data[reg.index]);
break;
case QUNIFORM_TEXTURE_CONFIG_P1:
fprintf(stderr, " (tex[%d].p1)",
c->uniform_data[reg.index]);
break;
case QUNIFORM_TEXTURE_WIDTH:
fprintf(stderr, " (tex[%d].width)",
c->uniform_data[reg.index]);
break;
case QUNIFORM_TEXTURE_HEIGHT:
fprintf(stderr, " (tex[%d].height)",
c->uniform_data[reg.index]);
break;
case QUNIFORM_TEXTURE_DEPTH:
fprintf(stderr, " (tex[%d].depth)",
c->uniform_data[reg.index]);
break;
case QUNIFORM_TEXTURE_ARRAY_SIZE:
fprintf(stderr, " (tex[%d].array_size)",
c->uniform_data[reg.index]);
break;
case QUNIFORM_TEXTURE_LEVELS:
fprintf(stderr, " (tex[%d].levels)",
c->uniform_data[reg.index]);
break;
case QUNIFORM_UBO_ADDR:
fprintf(stderr, " (ubo[%d])",
c->uniform_data[reg.index]);
break;
default:
if (quniform_contents_is_texture_p0(contents)) {
fprintf(stderr, " (tex[%d].p0: 0x%08x)",
contents - QUNIFORM_TEXTURE_CONFIG_P0_0,
c->uniform_data[reg.index]);
} else if (contents < ARRAY_SIZE(quniform_names)) {
fprintf(stderr, " (%s)",
quniform_names[contents]);
} else {
fprintf(stderr, " (%d / 0x%08x)", contents,
c->uniform_data[reg.index]);
}
}
break;
}
default:
fprintf(stderr, "%s%d", files[reg.file], reg.index);
break;
}
}
static void
vir_dump_sig(struct v3d_compile *c, struct qinst *inst)
{
struct v3d_qpu_sig *sig = &inst->qpu.sig;
if (sig->thrsw)
fprintf(stderr, "; thrsw");
if (sig->ldvary)
fprintf(stderr, "; ldvary");
if (sig->ldvpm)
fprintf(stderr, "; ldvpm");
if (sig->ldtmu)
fprintf(stderr, "; ldtmu");
if (sig->ldunif)
fprintf(stderr, "; ldunif");
if (sig->wrtmuc)
fprintf(stderr, "; wrtmuc");
}
static void
vir_dump_alu(struct v3d_compile *c, struct qinst *inst)
{
struct v3d_qpu_instr *instr = &inst->qpu;
int nsrc = vir_get_non_sideband_nsrc(inst);
int sideband_nsrc = vir_get_nsrc(inst);
enum v3d_qpu_input_unpack unpack[2];
if (inst->qpu.alu.add.op != V3D_QPU_A_NOP) {
fprintf(stderr, "%s", v3d_qpu_add_op_name(instr->alu.add.op));
fprintf(stderr, "%s", v3d_qpu_cond_name(instr->flags.ac));
fprintf(stderr, "%s", v3d_qpu_pf_name(instr->flags.apf));
fprintf(stderr, "%s", v3d_qpu_uf_name(instr->flags.auf));
fprintf(stderr, " ");
vir_print_reg(c, inst->dst);
fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.add.output_pack));
unpack[0] = instr->alu.add.a_unpack;
unpack[1] = instr->alu.add.b_unpack;
} else {
fprintf(stderr, "%s", v3d_qpu_mul_op_name(instr->alu.mul.op));
fprintf(stderr, "%s", v3d_qpu_cond_name(instr->flags.mc));
fprintf(stderr, "%s", v3d_qpu_pf_name(instr->flags.mpf));
fprintf(stderr, "%s", v3d_qpu_uf_name(instr->flags.muf));
fprintf(stderr, " ");
vir_print_reg(c, inst->dst);
fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.mul.output_pack));
unpack[0] = instr->alu.mul.a_unpack;
unpack[1] = instr->alu.mul.b_unpack;
}
for (int i = 0; i < sideband_nsrc; i++) {
fprintf(stderr, ", ");
vir_print_reg(c, inst->src[i]);
if (i < nsrc)
fprintf(stderr, "%s", v3d_qpu_unpack_name(unpack[i]));
}
vir_dump_sig(c, inst);
}
void
vir_dump_inst(struct v3d_compile *c, struct qinst *inst)
{
struct v3d_qpu_instr *instr = &inst->qpu;
switch (inst->qpu.type) {
case V3D_QPU_INSTR_TYPE_ALU:
vir_dump_alu(c, inst);
break;
case V3D_QPU_INSTR_TYPE_BRANCH:
fprintf(stderr, "b");
if (instr->branch.ub)
fprintf(stderr, "u");
fprintf(stderr, "%s",
v3d_qpu_branch_cond_name(instr->branch.cond));
fprintf(stderr, "%s", v3d_qpu_msfign_name(instr->branch.msfign));
switch (instr->branch.bdi) {
case V3D_QPU_BRANCH_DEST_ABS:
fprintf(stderr, " zero_addr+0x%08x", instr->branch.offset);
break;
case V3D_QPU_BRANCH_DEST_REL:
fprintf(stderr, " %d", instr->branch.offset);
break;
case V3D_QPU_BRANCH_DEST_LINK_REG:
fprintf(stderr, " lri");
break;
case V3D_QPU_BRANCH_DEST_REGFILE:
fprintf(stderr, " rf%d", instr->branch.raddr_a);
break;
}
if (instr->branch.ub) {
switch (instr->branch.bdu) {
case V3D_QPU_BRANCH_DEST_ABS:
fprintf(stderr, ", a:unif");
break;
case V3D_QPU_BRANCH_DEST_REL:
fprintf(stderr, ", r:unif");
break;
case V3D_QPU_BRANCH_DEST_LINK_REG:
fprintf(stderr, ", lri");
break;
case V3D_QPU_BRANCH_DEST_REGFILE:
fprintf(stderr, ", rf%d", instr->branch.raddr_a);
break;
}
}
if (vir_has_implicit_uniform(inst)) {
fprintf(stderr, " ");
vir_print_reg(c, inst->src[vir_get_implicit_uniform_src(inst)]);
}
break;
}
}
void
vir_dump(struct v3d_compile *c)
{
int ip = 0;
vir_for_each_block(block, c) {
fprintf(stderr, "BLOCK %d:\n", block->index);
vir_for_each_inst(inst, block) {
if (c->temp_start) {
bool first = true;
for (int i = 0; i < c->num_temps; i++) {
if (c->temp_start[i] != ip)
continue;
if (first) {
first = false;
} else {
fprintf(stderr, ", ");
}
fprintf(stderr, "S%4d", i);
}
if (first)
fprintf(stderr, "      ");
else
fprintf(stderr, " ");
}
if (c->temp_end) {
bool first = true;
for (int i = 0; i < c->num_temps; i++) {
if (c->temp_end[i] != ip)
continue;
if (first) {
first = false;
} else {
fprintf(stderr, ", ");
}
fprintf(stderr, "E%4d", i);
}
if (first)
fprintf(stderr, "      ");
else
fprintf(stderr, " ");
}
vir_dump_inst(c, inst);
fprintf(stderr, "\n");
ip++;
}
if (block->successors[1]) {
fprintf(stderr, "-> BLOCK %d, %d\n",
block->successors[0]->index,
block->successors[1]->index);
} else if (block->successors[0]) {
fprintf(stderr, "-> BLOCK %d\n",
block->successors[0]->index);
}
}
}
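/* Reader's note: the "S%4d"/"E%4d" columns above mark the temps whose live
* intervals start or end at that instruction, and only appear once
* vir_calculate_live_intervals() has populated temp_start/temp_end.
*/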

View File

@ -0,0 +1,340 @@
/*
* Copyright © 2012 Intel Corporation
* Copyright © 2016 Broadcom
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#define MAX_INSTRUCTION (1 << 30)
#include "util/ralloc.h"
#include "util/register_allocate.h"
#include "v3d_compiler.h"
struct partial_update_state {
struct qinst *insts[4];
uint8_t channels;
};
static uint32_t
int_hash(const void *key)
{
return _mesa_hash_data(key, sizeof(int));
}
static bool
int_compare(const void *key1, const void *key2)
{
return *(const int *)key1 == *(const int *)key2;
}
static int
vir_reg_to_var(struct qreg reg)
{
if (reg.file == QFILE_TEMP)
return reg.index;
return -1;
}
static void
vir_setup_use(struct v3d_compile *c, struct qblock *block, int ip,
struct qreg src)
{
int var = vir_reg_to_var(src);
if (var == -1)
return;
c->temp_start[var] = MIN2(c->temp_start[var], ip);
c->temp_end[var] = MAX2(c->temp_end[var], ip);
/* The use[] bitset marks when the block makes
* use of a variable without having completely
* defined that variable within the block.
*/
if (!BITSET_TEST(block->def, var))
BITSET_SET(block->use, var);
}
static struct partial_update_state *
get_partial_update_state(struct hash_table *partial_update_ht,
struct qinst *inst)
{
struct hash_entry *entry =
_mesa_hash_table_search(partial_update_ht,
&inst->dst.index);
if (entry)
return entry->data;
struct partial_update_state *state =
rzalloc(partial_update_ht, struct partial_update_state);
_mesa_hash_table_insert(partial_update_ht, &inst->dst.index, state);
return state;
}
static void
vir_setup_def(struct v3d_compile *c, struct qblock *block, int ip,
struct hash_table *partial_update_ht, struct qinst *inst)
{
if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
return;
/* The def[] bitset marks when an initialization in a
* block completely screens off previous updates of
* that variable.
*/
int var = vir_reg_to_var(inst->dst);
if (var == -1)
return;
c->temp_start[var] = MIN2(c->temp_start[var], ip);
c->temp_end[var] = MAX2(c->temp_end[var], ip);
/* If we've already tracked this as a def, or already used it within
* the block, there's nothing to do.
*/
if (BITSET_TEST(block->use, var) || BITSET_TEST(block->def, var))
return;
/* Easy, common case: unconditional full register update.
*
* We treat conditioning on the exec mask as the same as not being
* conditional. This makes sure that if the register gets set on
* either side of an if, it is treated as being screened off before
* the if. Otherwise, if there was no intervening def, its live
* interval doesn't extend back to the start of the program, and if too
* many registers did that we'd fail to register allocate.
*/
if (((inst->qpu.flags.ac == V3D_QPU_COND_NONE &&
inst->qpu.flags.mc == V3D_QPU_COND_NONE) ||
inst->cond_is_exec_mask) &&
inst->qpu.alu.add.output_pack == V3D_QPU_PACK_NONE &&
inst->qpu.alu.mul.output_pack == V3D_QPU_PACK_NONE) {
BITSET_SET(block->def, var);
return;
}
/* Finally, look at the condition code and packing and mark it as a
* def. We need to make sure that we understand sequences of
* instructions like:
*
* mov.zs t0, t1
* mov.zc t0, t2
*
* or:
*
* mmov t0.8a, t1
* mmov t0.8b, t2
* mmov t0.8c, t3
* mmov t0.8d, t4
*
* as defining the temp within the block, because otherwise dst's live
* range will get extended up the control flow to the top of the
* program.
*/
struct partial_update_state *state =
get_partial_update_state(partial_update_ht, inst);
uint8_t mask = 0xf; /* XXX vir_channels_written(inst); */
if (inst->qpu.flags.ac == V3D_QPU_COND_NONE &&
inst->qpu.flags.mc == V3D_QPU_COND_NONE) {
state->channels |= mask;
} else {
for (int i = 0; i < 4; i++) {
if (!(mask & (1 << i)))
continue;
/* XXXif (state->insts[i] &&
state->insts[i]->cond ==
qpu_cond_complement(inst->cond))
state->channels |= 1 << i;
else
*/
state->insts[i] = inst;
}
}
if (state->channels == 0xf)
BITSET_SET(block->def, var);
}
static void
sf_state_clear(struct hash_table *partial_update_ht)
{
struct hash_entry *entry;
hash_table_foreach(partial_update_ht, entry) {
struct partial_update_state *state = entry->data;
for (int i = 0; i < 4; i++) {
if (state->insts[i] &&
(state->insts[i]->qpu.flags.ac != V3D_QPU_COND_NONE ||
state->insts[i]->qpu.flags.mc != V3D_QPU_COND_NONE))
state->insts[i] = NULL;
}
}
}
/* Sets up the def/use arrays for when variables are used-before-defined or
* defined-before-used in the block.
*
* Also initializes the temp_start/temp_end to cover just the instruction IPs
* where the variable is used, which will be extended later in
* vir_compute_start_end().
*/
static void
vir_setup_def_use(struct v3d_compile *c)
{
struct hash_table *partial_update_ht =
_mesa_hash_table_create(c, int_hash, int_compare);
int ip = 0;
vir_for_each_block(block, c) {
block->start_ip = ip;
_mesa_hash_table_clear(partial_update_ht, NULL);
vir_for_each_inst(inst, block) {
for (int i = 0; i < vir_get_nsrc(inst); i++)
vir_setup_use(c, block, ip, inst->src[i]);
vir_setup_def(c, block, ip, partial_update_ht, inst);
if (false /* XXX inst->uf */)
sf_state_clear(partial_update_ht);
/* Payload registers: r0/1/2 contain W, centroid W,
* and Z at program start. Register allocation will
* force their nodes to R0/1/2.
*/
if (inst->src[0].file == QFILE_REG) {
switch (inst->src[0].index) {
case 0:
case 1:
case 2:
c->temp_start[inst->dst.index] = 0;
break;
}
}
ip++;
}
block->end_ip = ip;
}
_mesa_hash_table_destroy(partial_update_ht, NULL);
}
static bool
vir_live_variables_dataflow(struct v3d_compile *c, int bitset_words)
{
bool cont = false;
vir_for_each_block_rev(block, c) {
/* Update live_out: Any successor using the variable
* on entrance needs us to have the variable live on
* exit.
*/
vir_for_each_successor(succ, block) {
for (int i = 0; i < bitset_words; i++) {
BITSET_WORD new_live_out = (succ->live_in[i] &
~block->live_out[i]);
if (new_live_out) {
block->live_out[i] |= new_live_out;
cont = true;
}
}
}
/* Update live_in */
for (int i = 0; i < bitset_words; i++) {
BITSET_WORD new_live_in = (block->use[i] |
(block->live_out[i] &
~block->def[i]));
if (new_live_in & ~block->live_in[i]) {
block->live_in[i] |= new_live_in;
cont = true;
}
}
}
return cont;
}
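/* In classic dataflow terms, the loop above computes
*
* live_out(b) = UNION over successors s of live_in(s)
* live_in(b) = use(b) | (live_out(b) & ~def(b))
*
* and reports whether anything changed; the caller iterates it to a
* fixed point.
*/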
/**
* Extend the start/end ranges for each variable to account for the
* new information calculated from control flow.
*/
static void
vir_compute_start_end(struct v3d_compile *c, int num_vars)
{
vir_for_each_block(block, c) {
for (int i = 0; i < num_vars; i++) {
if (BITSET_TEST(block->live_in, i)) {
c->temp_start[i] = MIN2(c->temp_start[i],
block->start_ip);
c->temp_end[i] = MAX2(c->temp_end[i],
block->start_ip);
}
if (BITSET_TEST(block->live_out, i)) {
c->temp_start[i] = MIN2(c->temp_start[i],
block->end_ip);
c->temp_end[i] = MAX2(c->temp_end[i],
block->end_ip);
}
}
}
}
void
vir_calculate_live_intervals(struct v3d_compile *c)
{
int bitset_words = BITSET_WORDS(c->num_temps);
/* This is only expected to be called once per compile; rerunning it
* would require freeing the previous arrays first.
*/
assert(!c->temp_start);
c->temp_start = rzalloc_array(c, int, c->num_temps);
c->temp_end = rzalloc_array(c, int, c->num_temps);
for (int i = 0; i < c->num_temps; i++) {
c->temp_start[i] = MAX_INSTRUCTION;
c->temp_end[i] = -1;
}
vir_for_each_block(block, c) {
block->def = rzalloc_array(c, BITSET_WORD, bitset_words);
block->use = rzalloc_array(c, BITSET_WORD, bitset_words);
block->live_in = rzalloc_array(c, BITSET_WORD, bitset_words);
block->live_out = rzalloc_array(c, BITSET_WORD, bitset_words);
}
vir_setup_def_use(c);
while (vir_live_variables_dataflow(c, bitset_words))
;
vir_compute_start_end(c, c->num_temps);
}

View File

@ -0,0 +1,209 @@
/*
* Copyright © 2014 Broadcom
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
/**
* @file vir_lower_uniforms.c
*
* This is the pre-code-generation pass for fixing up instructions that try to
* read from multiple uniform values.
*/
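/* For example (an illustrative VIR sequence): "add t2, u0, u1" reads two
* different uniform values, but only one uniform can be loaded per
* instruction (the ldunif signal lands in r5), so it has to become
*
* mov t1, u1
* add t2, u0, t1
*
* with the MOV emitted at the top of the block by the code below.
*/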
#include "v3d_compiler.h"
#include "util/hash_table.h"
#include "util/u_math.h"
static inline uint32_t
index_hash(const void *key)
{
return (uintptr_t)key;
}
static inline bool
index_compare(const void *a, const void *b)
{
return a == b;
}
static void
add_uniform(struct hash_table *ht, struct qreg reg)
{
struct hash_entry *entry;
void *key = (void *)(uintptr_t)(reg.index + 1);
entry = _mesa_hash_table_search(ht, key);
if (entry) {
entry->data++;
} else {
_mesa_hash_table_insert(ht, key, (void *)(uintptr_t)1);
}
}
static void
remove_uniform(struct hash_table *ht, struct qreg reg)
{
struct hash_entry *entry;
void *key = (void *)(uintptr_t)(reg.index + 1);
entry = _mesa_hash_table_search(ht, key);
assert(entry);
entry->data--;
if (entry->data == NULL)
_mesa_hash_table_remove(ht, entry);
}
static bool
is_lowerable_uniform(struct qinst *inst, int i)
{
if (inst->src[i].file != QFILE_UNIF)
return false;
if (vir_has_implicit_uniform(inst))
return i != vir_get_implicit_uniform_src(inst);
return true;
}
/* Returns the number of different uniform values referenced by the
* instruction.
*/
static uint32_t
vir_get_instruction_uniform_count(struct qinst *inst)
{
uint32_t count = 0;
for (int i = 0; i < vir_get_nsrc(inst); i++) {
if (inst->src[i].file != QFILE_UNIF)
continue;
bool is_duplicate = false;
for (int j = 0; j < i; j++) {
if (inst->src[j].file == QFILE_UNIF &&
inst->src[j].index == inst->src[i].index) {
is_duplicate = true;
break;
}
}
if (!is_duplicate)
count++;
}
return count;
}
void
vir_lower_uniforms(struct v3d_compile *c)
{
struct hash_table *ht =
_mesa_hash_table_create(c, index_hash, index_compare);
/* Walk the instruction list, finding which instructions have more
* than one uniform referenced, and add those uniform values to the
* ht.
*/
vir_for_each_inst_inorder(inst, c) {
uint32_t nsrc = vir_get_nsrc(inst);
if (vir_get_instruction_uniform_count(inst) <= 1)
continue;
for (int i = 0; i < nsrc; i++) {
if (is_lowerable_uniform(inst, i))
add_uniform(ht, inst->src[i]);
}
}
while (ht->entries) {
/* Find the most commonly used uniform in instructions that
* need a uniform lowered.
*/
uint32_t max_count = 0;
uint32_t max_index = 0;
struct hash_entry *entry;
hash_table_foreach(ht, entry) {
uint32_t count = (uintptr_t)entry->data;
uint32_t index = (uintptr_t)entry->key - 1;
if (count > max_count) {
max_count = count;
max_index = index;
}
}
struct qreg unif = vir_reg(QFILE_UNIF, max_index);
/* Now, find the instructions using this uniform and make them
* reference a temp instead.
*/
vir_for_each_block(block, c) {
struct qinst *mov = NULL;
vir_for_each_inst(inst, block) {
uint32_t nsrc = vir_get_nsrc(inst);
uint32_t count = vir_get_instruction_uniform_count(inst);
if (count <= 1)
continue;
/* If the block doesn't have a load of the
* uniform yet, add it. We could potentially
* do better and CSE MOVs from multiple blocks
* into dominating blocks, except that may
* cause troubles for register allocation.
*/
if (!mov) {
mov = vir_mul_inst(V3D_QPU_M_MOV,
vir_get_temp(c),
unif, c->undef);
list_add(&mov->link,
&block->instructions);
c->defs[mov->dst.index] = mov;
}
bool removed = false;
for (int i = 0; i < nsrc; i++) {
if (is_lowerable_uniform(inst, i) &&
inst->src[i].index == max_index) {
inst->src[i].file =
mov->dst.file;
inst->src[i].index =
mov->dst.index;
remove_uniform(ht, unif);
removed = true;
}
}
if (removed)
count--;
/* If the instruction doesn't need lowering any more,
* then drop it from the list.
*/
if (count <= 1) {
for (int i = 0; i < nsrc; i++) {
if (is_lowerable_uniform(inst, i))
remove_uniform(ht, inst->src[i]);
}
}
}
}
}
_mesa_hash_table_destroy(ht, NULL);
}

View File

@ -0,0 +1,233 @@
/*
* Copyright © 2014 Broadcom
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
/**
* @file vir_opt_copy_propagate.c
*
* This implements simple copy propagation for VIR without control flow.
*
* For each temp, it keeps a qreg of which source it was MOVed from, if it
* was. If we see that used later, we can just reuse the source value, since
* we know we don't have control flow, and we have SSA for our values so
* there's no killing to worry about.
*/
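/* For example (illustrative): given
*
* mov t1, t0
* fadd t2, t1, t1
*
* both reads of t1 are rewritten to t0, leaving a dead MOV behind for
* vir_opt_dead_code to delete.
*/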
#include "v3d_compiler.h"
static bool
is_copy_mov(struct qinst *inst)
{
if (!inst)
return false;
if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
(inst->qpu.alu.mul.op != V3D_QPU_M_FMOV &&
inst->qpu.alu.mul.op != V3D_QPU_M_MOV)) {
return false;
}
if (inst->dst.file != QFILE_TEMP)
return false;
if (inst->src[0].file != QFILE_TEMP &&
inst->src[0].file != QFILE_UNIF) {
return false;
}
if (inst->qpu.alu.add.output_pack != V3D_QPU_PACK_NONE ||
inst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE) {
return false;
}
if (inst->qpu.flags.ac != V3D_QPU_COND_NONE ||
inst->qpu.flags.mc != V3D_QPU_COND_NONE) {
return false;
}
switch (inst->src[0].file) {
case QFILE_MAGIC:
/* No copy propagating from R3/R4/R5 -- the MOVs from those
* are there to register-allocate the values produced into
* R3/4/5 out to ordinary regs (which will hopefully still get
* colored to r3/4/5).
*/
switch (inst->src[0].index) {
case V3D_QPU_WADDR_R3:
case V3D_QPU_WADDR_R4:
case V3D_QPU_WADDR_R5:
return false;
default:
break;
}
break;
case QFILE_REG:
switch (inst->src[0].index) {
case 0:
case 1:
case 2:
/* MOVs from rf0/1/2 are only to track the live
* intervals for W/centroid W/Z.
*/
return false;
}
break;
default:
break;
}
return true;
}
static bool
vir_has_unpack(struct qinst *inst, int chan)
{
assert(chan == 0 || chan == 1);
if (vir_is_add(inst)) {
if (chan == 0)
return inst->qpu.alu.add.a_unpack != V3D_QPU_UNPACK_NONE;
else
return inst->qpu.alu.add.b_unpack != V3D_QPU_UNPACK_NONE;
} else {
if (chan == 0)
return inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE;
else
return inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE;
}
}
static bool
try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs)
{
bool debug = false;
bool progress = false;
for (int i = 0; i < vir_get_nsrc(inst); i++) {
if (inst->src[i].file != QFILE_TEMP)
continue;
/* We have two ways of finding MOVs we can copy propagate
* from. One is if it's an SSA def: then we can reuse it from
* any block in the program, as long as its source is also an
* SSA def. Alternatively, if it's in the "movs" array
* tracked within the block, then we know the sources for it
* haven't been changed since we saw the instruction within
* our block.
*/
struct qinst *mov = movs[inst->src[i].index];
if (!mov) {
if (!is_copy_mov(c->defs[inst->src[i].index]))
continue;
mov = c->defs[inst->src[i].index];
if (mov->src[0].file == QFILE_TEMP &&
!c->defs[mov->src[0].index])
continue;
}
if (vir_has_unpack(mov, 0)) {
/* Make sure that the meaning of the unpack
* would be the same between the two
* instructions.
*/
if (vir_is_float_input(inst) !=
vir_is_float_input(mov)) {
continue;
}
/* No composing the unpacks. */
if (vir_has_unpack(inst, i))
continue;
}
if (debug) {
fprintf(stderr, "Copy propagate: ");
vir_dump_inst(c, inst);
fprintf(stderr, "\n");
}
inst->src[i] = mov->src[0];
if (vir_has_unpack(mov, 0)) {
enum v3d_qpu_input_unpack unpack = mov->qpu.alu.mul.a_unpack;
vir_set_unpack(inst, i, unpack);
}
if (debug) {
fprintf(stderr, "to: ");
vir_dump_inst(c, inst);
fprintf(stderr, "\n");
}
progress = true;
}
return progress;
}
static void
apply_kills(struct v3d_compile *c, struct qinst **movs, struct qinst *inst)
{
if (inst->dst.file != QFILE_TEMP)
return;
for (int i = 0; i < c->num_temps; i++) {
if (movs[i] &&
(movs[i]->dst.index == inst->dst.index ||
(movs[i]->src[0].file == QFILE_TEMP &&
movs[i]->src[0].index == inst->dst.index))) {
movs[i] = NULL;
}
}
}
bool
vir_opt_copy_propagate(struct v3d_compile *c)
{
bool progress = false;
struct qinst **movs;
movs = ralloc_array(c, struct qinst *, c->num_temps);
if (!movs)
return false;
vir_for_each_block(block, c) {
/* The MOVs array tracks only available movs within the
* block.
*/
memset(movs, 0, sizeof(struct qinst *) * c->num_temps);
vir_for_each_inst(inst, block) {
progress = try_copy_prop(c, inst, movs) || progress;
apply_kills(c, movs, inst);
if (is_copy_mov(inst))
movs[inst->dst.index] = inst;
}
}
ralloc_free(movs);
return progress;
}

View File

@ -0,0 +1,162 @@
/*
* Copyright © 2014 Broadcom
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
/**
* @file vir_opt_dead_code.c
*
* This is a simple dead code eliminator for SSA values in VIR.
*
* It walks all the instructions finding what temps are used, then walks again
* to remove instructions writing unused temps.
*
* This is an inefficient implementation if you have long chains of
* instructions where the entire chain is dead, but we expect those to have
* been eliminated at the NIR level, and here we're just cleaning up small
* problems produced by NIR->VIR.
*/
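/* For example (illustrative): once copy propagation has rewritten every
* read of t1, the leftover "mov t1, t0" writes a temp nobody reads and
* gets removed here.
*/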
#include "v3d_compiler.h"
static bool debug;
static void
dce(struct v3d_compile *c, struct qinst *inst)
{
if (debug) {
fprintf(stderr, "Removing: ");
vir_dump_inst(c, inst);
fprintf(stderr, "\n");
}
assert(inst->qpu.flags.apf == V3D_QPU_PF_NONE);
assert(inst->qpu.flags.mpf == V3D_QPU_PF_NONE);
vir_remove_instruction(c, inst);
}
static bool
has_nonremovable_reads(struct v3d_compile *c, struct qinst *inst)
{
for (int i = 0; i < vir_get_nsrc(inst); i++) {
if (inst->src[i].file == QFILE_VPM) {
/* Instance ID, Vertex ID: Should have been removed at
* the NIR level
*/
if (inst->src[i].index == ~0)
return true;
uint32_t attr = inst->src[i].index / 4;
uint32_t offset = inst->src[i].index % 4;
if (c->vattr_sizes[attr] != offset)
return true;
/* Can't get rid of the last VPM read, or the
* simulator (at least) throws an error.
*/
uint32_t total_size = 0;
for (uint32_t i = 0; i < ARRAY_SIZE(c->vattr_sizes); i++)
total_size += c->vattr_sizes[i];
if (total_size == 1)
return true;
}
/* Dead code removal of varyings is tricky, so just assert
* that it all happened at the NIR level.
*/
if (inst->src[i].file == QFILE_VARY)
return true;
}
return false;
}
bool
vir_opt_dead_code(struct v3d_compile *c)
{
bool progress = false;
bool *used = calloc(c->num_temps, sizeof(bool));
vir_for_each_inst_inorder(inst, c) {
for (int i = 0; i < vir_get_nsrc(inst); i++) {
if (inst->src[i].file == QFILE_TEMP)
used[inst->src[i].index] = true;
}
}
vir_for_each_block(block, c) {
vir_for_each_inst_safe(inst, block) {
if (inst->dst.file != QFILE_NULL &&
!(inst->dst.file == QFILE_TEMP &&
!used[inst->dst.index])) {
continue;
}
if (vir_has_side_effects(c, inst))
continue;
if (inst->qpu.flags.apf != V3D_QPU_PF_NONE ||
inst->qpu.flags.mpf != V3D_QPU_PF_NONE ||
has_nonremovable_reads(c, inst)) {
/* If we can't remove the instruction, but we
* don't need its destination value, just
* remove the destination. The register
* allocator would trivially color it and it
* wouldn't cause any register pressure, but
* it's nicer to read the VIR code without
* unused destination regs.
*/
if (inst->dst.file == QFILE_TEMP) {
if (debug) {
fprintf(stderr,
"Removing dst from: ");
vir_dump_inst(c, inst);
fprintf(stderr, "\n");
}
c->defs[inst->dst.index] = NULL;
inst->dst.file = QFILE_NULL;
progress = true;
}
continue;
}
for (int i = 0; i < vir_get_nsrc(inst); i++) {
if (inst->src[i].file != QFILE_VPM)
continue;
uint32_t attr = inst->src[i].index / 4;
uint32_t offset = (inst->src[i].index % 4);
if (c->vattr_sizes[attr] == offset) {
c->num_inputs--;
c->vattr_sizes[attr]--;
}
}
dce(c, inst);
progress = true;
continue;
}
}
free(used);
return progress;
}

View File

@ -0,0 +1,254 @@
/*
* Copyright © 2014 Broadcom
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "util/ralloc.h"
#include "util/register_allocate.h"
#include "v3d_compiler.h"
#define QPU_R(i) { .magic = false, .index = i }
#define ACC_INDEX 0
#define ACC_COUNT 5
#define PHYS_INDEX (ACC_INDEX + ACC_COUNT)
#define PHYS_COUNT 64
bool
vir_init_reg_sets(struct v3d_compiler *compiler)
{
compiler->regs = ra_alloc_reg_set(compiler, PHYS_INDEX + PHYS_COUNT,
true);
if (!compiler->regs)
return false;
/* Allocate 3 regfile classes, for the ways the physical register file
* can be divided up for fragment shader threading.
*/
for (int threads = 0; threads < 3; threads++) {
compiler->reg_class[threads] =
ra_alloc_reg_class(compiler->regs);
for (int i = PHYS_INDEX;
i < PHYS_INDEX + (PHYS_COUNT >> threads); i++) {
ra_class_add_reg(compiler->regs,
compiler->reg_class[threads], i);
}
for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT; i++) {
ra_class_add_reg(compiler->regs,
compiler->reg_class[threads], i);
}
}
ra_set_finalize(compiler->regs, NULL);
return true;
}
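/* A reading of the loop above: every class gets all five accumulators,
* while the phys-file share shrinks from 64 registers (reg_class[0]) to
* 32 (reg_class[1]) to 16 (reg_class[2]), presumably matching the 1-, 2-
* and 4-thread fragment shader configurations splitting the register
* file.
*/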
struct node_to_temp_map {
uint32_t temp;
uint32_t priority;
};
static int
node_to_temp_priority(const void *in_a, const void *in_b)
{
const struct node_to_temp_map *a = in_a;
const struct node_to_temp_map *b = in_b;
return a->priority - b->priority;
}
#define CLASS_BIT_PHYS (1 << 0)
#define CLASS_BIT_R0_R2 (1 << 1)
#define CLASS_BIT_R3 (1 << 2)
#define CLASS_BIT_R4 (1 << 3)
/**
* Returns a mapping from QFILE_TEMP indices to struct qpu_regs.
*
* The return value should be freed by the caller.
*/
struct qpu_reg *
v3d_register_allocate(struct v3d_compile *c)
{
struct node_to_temp_map map[c->num_temps];
uint32_t temp_to_node[c->num_temps];
uint8_t class_bits[c->num_temps];
struct qpu_reg *temp_registers = calloc(c->num_temps,
sizeof(*temp_registers));
int acc_nodes[ACC_COUNT];
struct ra_graph *g = ra_alloc_interference_graph(c->compiler->regs,
c->num_temps +
ARRAY_SIZE(acc_nodes));
/* Make some fixed nodes for the accumulators, which we will need to
* interfere with when ops have implied r3/r4 writes or for the thread
* switches. We could represent these as classes for the nodes to
* live in, but the classes take up a lot of memory to set up, so we
* don't want to make too many.
*/
for (int i = 0; i < ARRAY_SIZE(acc_nodes); i++) {
acc_nodes[i] = c->num_temps + i;
ra_set_node_reg(g, acc_nodes[i], ACC_INDEX + i);
}
/* Compute the live ranges so we can figure out interference. */
vir_calculate_live_intervals(c);
for (uint32_t i = 0; i < c->num_temps; i++) {
map[i].temp = i;
map[i].priority = c->temp_end[i] - c->temp_start[i];
}
qsort(map, c->num_temps, sizeof(map[0]), node_to_temp_priority);
for (uint32_t i = 0; i < c->num_temps; i++) {
temp_to_node[map[i].temp] = i;
}
/* Figure out our register classes and preallocated registers. We
* start with any temp being able to be in any file, then instructions
* incrementally remove bits that the temp definitely can't be in.
*/
memset(class_bits,
CLASS_BIT_PHYS | CLASS_BIT_R0_R2 | CLASS_BIT_R3 | CLASS_BIT_R4,
sizeof(class_bits));
int ip = 0;
vir_for_each_inst_inorder(inst, c) {
/* If the instruction writes r3/r4 (and optionally moves its
* result to a temp), nothing else can be stored in r3/r4 across
* it.
*/
if (vir_writes_r3(inst)) {
for (int i = 0; i < c->num_temps; i++) {
if (c->temp_start[i] < ip &&
c->temp_end[i] > ip) {
ra_add_node_interference(g,
temp_to_node[i],
acc_nodes[3]);
}
}
}
if (vir_writes_r4(inst)) {
for (int i = 0; i < c->num_temps; i++) {
if (c->temp_start[i] < ip &&
c->temp_end[i] > ip) {
ra_add_node_interference(g,
temp_to_node[i],
acc_nodes[4]);
}
}
}
if (inst->src[0].file == QFILE_REG) {
switch (inst->src[0].index) {
case 0:
case 1:
case 2:
/* Payload setup instructions: Force allocate
* the dst to the given register (so the MOV
* will disappear).
*/
assert(inst->qpu.alu.mul.op == V3D_QPU_M_MOV);
assert(inst->dst.file == QFILE_TEMP);
ra_set_node_reg(g,
temp_to_node[inst->dst.index],
PHYS_INDEX +
inst->src[0].index);
break;
}
}
#if 0
switch (inst->op) {
case QOP_THRSW:
/* All accumulators are invalidated across a thread
* switch.
*/
for (int i = 0; i < c->num_temps; i++) {
if (c->temp_start[i] < ip && c->temp_end[i] > ip)
class_bits[i] &= ~(CLASS_BIT_R0_R2 |
CLASS_BIT_R3 |
CLASS_BIT_R4);
}
break;
default:
break;
}
#endif
ip++;
}
for (uint32_t i = 0; i < c->num_temps; i++) {
ra_set_node_class(g, temp_to_node[i],
c->compiler->reg_class[c->fs_threaded]);
}
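/* Two temps may share a register only if their live ranges don't overlap;
* ranges are disjoint exactly when one starts at or after the other's
* end, which is the condition tested below.
*/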
for (uint32_t i = 0; i < c->num_temps; i++) {
for (uint32_t j = i + 1; j < c->num_temps; j++) {
if (!(c->temp_start[i] >= c->temp_end[j] ||
c->temp_start[j] >= c->temp_end[i])) {
ra_add_node_interference(g,
temp_to_node[i],
temp_to_node[j]);
}
}
}
bool ok = ra_allocate(g);
if (!ok) {
if (!c->fs_threaded) {
fprintf(stderr, "Failed to register allocate:\n");
vir_dump(c);
}
c->failed = true;
free(temp_registers);
return NULL;
}
for (uint32_t i = 0; i < c->num_temps; i++) {
int ra_reg = ra_get_node_reg(g, temp_to_node[i]);
if (ra_reg < PHYS_INDEX) {
temp_registers[i].magic = true;
temp_registers[i].index = (V3D_QPU_WADDR_R0 +
ra_reg - ACC_INDEX);
} else {
temp_registers[i].magic = false;
temp_registers[i].index = ra_reg - PHYS_INDEX;
}
/* If the value's never used, just write to the NOP register
* for clarity in debug output.
*/
if (c->temp_start[i] == c->temp_end[i]) {
temp_registers[i].magic = true;
temp_registers[i].index = V3D_QPU_WADDR_NOP;
}
}
ralloc_free(g);
return temp_registers;
}

View File

@ -0,0 +1,359 @@
/*
* Copyright © 2016 Broadcom
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "compiler/v3d_compiler.h"
#include "qpu/qpu_instr.h"
#include "qpu/qpu_disasm.h"
static inline struct qpu_reg
qpu_reg(int index)
{
struct qpu_reg reg = {
.magic = false,
.index = index,
};
return reg;
}
static inline struct qpu_reg
qpu_magic(enum v3d_qpu_waddr waddr)
{
struct qpu_reg reg = {
.magic = true,
.index = waddr,
};
return reg;
}
static inline struct qpu_reg
qpu_acc(int acc)
{
return qpu_magic(V3D_QPU_WADDR_R0 + acc);
}
struct v3d_qpu_instr
v3d_qpu_nop(void)
{
struct v3d_qpu_instr instr = {
.type = V3D_QPU_INSTR_TYPE_ALU,
.alu = {
.add = {
.op = V3D_QPU_A_NOP,
.waddr = V3D_QPU_WADDR_NOP,
.magic_write = true,
},
.mul = {
.op = V3D_QPU_M_NOP,
.waddr = V3D_QPU_WADDR_NOP,
.magic_write = true,
},
}
};
return instr;
}
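/* Both halves have to be spelled out because a V3D ALU instruction always
* encodes an add-pipe op and a mul-pipe op together; "no-op" means NOP on
* each pipe with a magic write to the NOP address so nothing is clobbered.
*/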
static struct qinst *
vir_nop(void)
{
struct qreg undef = { QFILE_NULL, 0 };
struct qinst *qinst = vir_add_inst(V3D_QPU_A_NOP, undef, undef, undef);
return qinst;
}
static struct qinst *
new_qpu_nop_before(struct qinst *inst)
{
struct qinst *q = vir_nop();
list_addtail(&q->link, &inst->link);
return q;
}
static void
new_ldunif_instr(struct qinst *inst, int i)
{
struct qinst *ldunif = new_qpu_nop_before(inst);
ldunif->qpu.sig.ldunif = true;
assert(inst->src[i].file == QFILE_UNIF);
ldunif->uniform = inst->src[i].index;
}
/**
* Allocates the src register (accumulator or register file) into the RADDR
* fields of the instruction.
*/
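/* For example (illustrative): in "add rf3, rf1, rf2", the first source
* claims raddr_a and mux A, the second then takes raddr_b and mux B;
* accumulator sources map straight to the R0-R5 muxes and consume no
* raddr slot at all.
*/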
static void
set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
{
if (src.magic) {
assert(src.index >= V3D_QPU_WADDR_R0 &&
src.index <= V3D_QPU_WADDR_R5);
*mux = src.index - V3D_QPU_WADDR_R0 + V3D_QPU_MUX_R0;
return;
}
if (instr->alu.add.a != V3D_QPU_MUX_A &&
instr->alu.add.b != V3D_QPU_MUX_A &&
instr->alu.mul.a != V3D_QPU_MUX_A &&
instr->alu.mul.b != V3D_QPU_MUX_A) {
instr->raddr_a = src.index;
*mux = V3D_QPU_MUX_A;
} else {
if (instr->raddr_a == src.index) {
*mux = V3D_QPU_MUX_A;
} else {
assert(!(instr->alu.add.a == V3D_QPU_MUX_B &&
instr->alu.add.b == V3D_QPU_MUX_B &&
instr->alu.mul.a == V3D_QPU_MUX_B &&
instr->alu.mul.b == V3D_QPU_MUX_B) ||
src.index == instr->raddr_b);
instr->raddr_b = src.index;
*mux = V3D_QPU_MUX_B;
}
}
}
static void
v3d_generate_code_block(struct v3d_compile *c,
struct qblock *block,
struct qpu_reg *temp_registers)
{
int last_vpm_read_index = -1;
vir_for_each_inst(qinst, block) {
#if 0
fprintf(stderr, "translating qinst to qpu: ");
vir_dump_inst(c, qinst);
fprintf(stderr, "\n");
#endif
struct qinst *temp;
if (vir_has_implicit_uniform(qinst)) {
int src = vir_get_implicit_uniform_src(qinst);
assert(qinst->src[src].file == QFILE_UNIF);
qinst->uniform = qinst->src[src].index;
c->num_uniforms++;
}
int nsrc = vir_get_non_sideband_nsrc(qinst);
struct qpu_reg src[ARRAY_SIZE(qinst->src)];
bool emitted_ldunif = false;
for (int i = 0; i < nsrc; i++) {
int index = qinst->src[i].index;
switch (qinst->src[i].file) {
case QFILE_REG:
src[i] = qpu_reg(qinst->src[i].index);
break;
case QFILE_MAGIC:
src[i] = qpu_magic(qinst->src[i].index);
break;
case QFILE_NULL:
case QFILE_LOAD_IMM:
src[i] = qpu_acc(0);
break;
case QFILE_TEMP:
src[i] = temp_registers[index];
break;
case QFILE_UNIF:
if (!emitted_ldunif) {
new_ldunif_instr(qinst, i);
c->num_uniforms++;
emitted_ldunif = true;
}
src[i] = qpu_acc(5);
break;
case QFILE_VARY:
temp = new_qpu_nop_before(qinst);
temp->qpu.sig.ldvary = true;
src[i] = qpu_acc(3);
break;
case QFILE_SMALL_IMM:
abort(); /* XXX */
#if 0
src[i].mux = QPU_MUX_SMALL_IMM;
src[i].addr = qpu_encode_small_immediate(qinst->src[i].index);
/* This should only have returned a valid
* small immediate field, not ~0 for failure.
*/
assert(src[i].addr <= 47);
#endif
break;
case QFILE_VPM:
assert((int)qinst->src[i].index >=
last_vpm_read_index);
(void)last_vpm_read_index;
last_vpm_read_index = qinst->src[i].index;
temp = new_qpu_nop_before(qinst);
temp->qpu.sig.ldvpm = true;
src[i] = qpu_acc(3);
break;
case QFILE_TLB:
case QFILE_TLBU:
unreachable("bad vir src file");
}
}
struct qpu_reg dst;
switch (qinst->dst.file) {
case QFILE_NULL:
dst = qpu_magic(V3D_QPU_WADDR_NOP);
break;
case QFILE_REG:
dst = qpu_reg(qinst->dst.index);
break;
case QFILE_MAGIC:
dst = qpu_magic(qinst->dst.index);
break;
case QFILE_TEMP:
dst = temp_registers[qinst->dst.index];
break;
case QFILE_VPM:
dst = qpu_magic(V3D_QPU_WADDR_VPM);
break;
case QFILE_TLB:
dst = qpu_magic(V3D_QPU_WADDR_TLB);
break;
case QFILE_TLBU:
dst = qpu_magic(V3D_QPU_WADDR_TLBU);
break;
case QFILE_VARY:
case QFILE_UNIF:
case QFILE_SMALL_IMM:
case QFILE_LOAD_IMM:
assert(!"not reached");
break;
}
if (qinst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
if (qinst->qpu.alu.add.op != V3D_QPU_A_NOP) {
assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP);
if (nsrc >= 1) {
set_src(&qinst->qpu,
&qinst->qpu.alu.add.a, src[0]);
}
if (nsrc >= 2) {
set_src(&qinst->qpu,
&qinst->qpu.alu.add.b, src[1]);
}
qinst->qpu.alu.add.waddr = dst.index;
qinst->qpu.alu.add.magic_write = dst.magic;
} else {
if (nsrc >= 1) {
set_src(&qinst->qpu,
&qinst->qpu.alu.mul.a, src[0]);
}
if (nsrc >= 2) {
set_src(&qinst->qpu,
&qinst->qpu.alu.mul.b, src[1]);
}
qinst->qpu.alu.mul.waddr = dst.index;
qinst->qpu.alu.mul.magic_write = dst.magic;
}
} else {
assert(qinst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);
}
}
}
static void
v3d_dump_qpu(struct v3d_compile *c)
{
fprintf(stderr, "%s prog %d/%d QPU:\n",
vir_get_stage_name(c),
c->program_id, c->variant_id);
for (int i = 0; i < c->qpu_inst_count; i++) {
const char *str = v3d_qpu_disasm(c->devinfo, c->qpu_insts[i]);
fprintf(stderr, "0x%016"PRIx64" %s\n", c->qpu_insts[i], str);
}
fprintf(stderr, "\n");
}
void
v3d_vir_to_qpu(struct v3d_compile *c)
{
struct qpu_reg *temp_registers = v3d_register_allocate(c);
struct qblock *end_block = list_last_entry(&c->blocks,
struct qblock, link);
/* Reset the uniform count to how many will be actually loaded by the
* generated QPU code.
*/
c->num_uniforms = 0;
vir_for_each_block(block, c)
v3d_generate_code_block(c, block, temp_registers);
struct qinst *thrsw = vir_nop();
list_addtail(&thrsw->link, &end_block->instructions);
thrsw->qpu.sig.thrsw = true;
uint32_t cycles = v3d_qpu_schedule_instructions(c);
c->qpu_insts = rzalloc_array(c, uint64_t, c->qpu_inst_count);
int i = 0;
vir_for_each_inst_inorder(inst, c) {
bool ok = v3d_qpu_instr_pack(c->devinfo, &inst->qpu,
&c->qpu_insts[i++]);
assert(ok); (void) ok;
}
assert(i == c->qpu_inst_count);
if (V3D_DEBUG & V3D_DEBUG_SHADERDB) {
fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d estimated cycles\n",
vir_get_stage_name(c),
c->program_id, c->variant_id,
cycles);
}
if (V3D_DEBUG & (V3D_DEBUG_QPU |
v3d_debug_flag_for_shader_stage(c->s->stage))) {
v3d_dump_qpu(c);
}
qpu_validate(c);
free(temp_registers);
}