From 613eaac7b53bfbfcd6ef536412be6c9c63cdea4f Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Fri, 23 Jul 2021 13:12:30 +0200 Subject: [PATCH] ir3: Initial support for spilling non-shared registers Support for spilling shared registers to normal registers is still TODO. There are also several improvements to be made, like rematerialization. Note, there is one behavior change to register pressure accounting: we now include half registers in the current full pressure directly in mergedregs mode, rather than adding the max half pressure to the max full pressure afterwards, which might result in lower calculated max pressure in some cases with half registers. This is needed for spilling, since we need to make sure the total pressure including half registers is below the maximum at each instruction. Because the entire pass is rewritten, including the register pressure calculating parts, it didn't seem worth it to separate out this change. Part-of: --- .../ci/deqp-freedreno-a630-fails.txt | 221 -- .../ci/deqp-freedreno-a630-skips.txt | 5 + src/freedreno/ir3/disasm-a3xx.c | 3 + src/freedreno/ir3/instr-a3xx.h | 3 + src/freedreno/ir3/ir3.h | 14 + src/freedreno/ir3/ir3_compiler.c | 1 + src/freedreno/ir3/ir3_compiler.h | 1 + src/freedreno/ir3/ir3_lower_spill.c | 163 ++ src/freedreno/ir3/ir3_merge_regs.c | 20 +- src/freedreno/ir3/ir3_ra.c | 197 +- src/freedreno/ir3/ir3_ra.h | 10 + src/freedreno/ir3/ir3_spill.c | 1789 ++++++++++++++++- src/freedreno/ir3/ir3_validate.c | 3 +- src/freedreno/ir3/meson.build | 1 + .../ci/piglit-freedreno-a630-fails.txt | 5 - 15 files changed, 2121 insertions(+), 315 deletions(-) create mode 100644 src/freedreno/ir3/ir3_lower_spill.c diff --git a/src/freedreno/ci/deqp-freedreno-a630-fails.txt b/src/freedreno/ci/deqp-freedreno-a630-fails.txt index 18325d68824..e5790183e98 100644 --- a/src/freedreno/ci/deqp-freedreno-a630-fails.txt +++ b/src/freedreno/ci/deqp-freedreno-a630-fails.txt @@ -14,11 +14,6 @@ KHR-GL33.transform_feedback.query_vertex_separate_test,Fail # "*** Color comparison failed" KHR-GLES3.packed_depth_stencil.verify_read_pixels.depth24_stencil8,Fail -# "MESA: error: ir3_ra() failed!" -KHR-GLES31.core.arrays_of_arrays.InteractionFunctionCalls2,Fail -KHR-GLES31.core.arrays_of_arrays.InteractionArgumentAliasing5,Fail -KHR-GLES31.core.arrays_of_arrays.InteractionArgumentAliasing6,Fail - # "The values of resultStd[i] & 0xFFFFFFFE and resultFma[i] & 0xFFFFFFFE and resultCPU[i] & 0xFFFFFFFE are not bitwise equal for i = 0..99 " KHR-GLES31.core.gpu_shader5.fma_precision_float,Fail KHR-GLES31.core.gpu_shader5.fma_precision_vec2,Fail @@ -86,11 +81,6 @@ dEQP-VK.api.info.get_physical_device_properties2.properties,Fail dEQP-VK.api.object_management.alloc_callback_fail.device,Fail dEQP-VK.api.object_management.alloc_callback_fail.device_group,Fail -# "MESA: error: ir3_ra() failed!" -# https://gitlab.freedesktop.org/mesa/mesa/-/issues/33 -dEQP-VK.graphicsfuzz.spv-stable-maze-flatten-copy-composite,Fail -dEQP-VK.graphicsfuzz.spv-stable-pillars-volatile-nontemporal-store,Fail - # https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/3019 # should be fixed by https://gerrit.khronos.org/c/vk-gl-cts/+/7745 dEQP-VK.renderpass.dedicated_allocation.attachment_allocation.input_output.7,Fail @@ -98,10 +88,6 @@ dEQP-VK.renderpass.suballocation.attachment_allocation.input_output.7,Fail dEQP-VK.renderpass2.dedicated_allocation.attachment_allocation.input_output.7,Fail dEQP-VK.renderpass2.suballocation.attachment_allocation.input_output.7,Fail -# "MESA: error: ir3_ra() failed! 
-# https://gitlab.freedesktop.org/mesa/mesa/-/issues/33 -dEQP-VK.spirv_assembly.instruction.compute.opcopymemory.array,Fail - # "deqp-vk: ../src/freedreno/vulkan/tu_cs.h:186: tu_cs_reserve: Assertion `tu_cs_get_space(cs) >= reserved_size' failed." # https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8841 dEQP-VK.spirv_assembly.instruction.compute.opphi.wide,Crash @@ -120,14 +106,6 @@ dEQP-VK.spirv_assembly.instruction.graphics.variable_pointers.graphics.writes_si dEQP-VK.spirv_assembly.instruction.graphics.variable_pointers.graphics.writes_two_buffers_geom,Fail dEQP-VK.spirv_assembly.instruction.graphics.variable_pointers.graphics.writes_two_buffers_vert,Fail -# "MESA: error: ir3_ra() failed!" -# https://gitlab.freedesktop.org/mesa/mesa/-/issues/33 -# Needs spilling, or maybe some scheduling (though throwing a bit of nir_move/sink -# at it didn't help). -dEQP-VK.spirv_assembly.instruction.spirv1p4.opcopylogical.nested_arrays_different_inner_stride,Fail -dEQP-VK.spirv_assembly.instruction.spirv1p4.opcopylogical.nested_arrays_different_outer_stride,Fail -dEQP-VK.spirv_assembly.instruction.spirv1p4.opcopylogical.nested_arrays_different_strides,Fail - dEQP-VK.texture.filtering.2d.formats.d24_unorm_s8_uint_stencil.nearest,Fail dEQP-VK.texture.filtering.2d_array.formats.d24_unorm_s8_uint_stencil.d24_unorm_s8_uint_stencil_nearest,Fail dEQP-VK.texture.filtering.cube.formats.d24_unorm_s8_uint_stencil.nearest,Fail @@ -136,205 +114,6 @@ dEQP-VK.texture.filtering.unnormal.formats.d24_unorm_s8_uint_stencil.nearest,Fai # Broken on all drivers: https://gitlab.freedesktop.org/mesa/mesa/-/issues/4582 dEQP-VK.wsi.display_control.register_device_event,Fail -# "MESA: error: ir3_ra() failed!" -# https://gitlab.freedesktop.org/mesa/mesa/-/issues/33 -dEQP-VK.ssbo.layout.2_level_array.scalar.row_major_mat4_comp_access,Fail -dEQP-VK.ssbo.layout.2_level_array.scalar.row_major_mat4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.2_level_array.scalar.row_major_mat4,Fail -dEQP-VK.ssbo.layout.2_level_array.scalar.row_major_mat4_store_cols,Fail -dEQP-VK.ssbo.layout.2_level_array.std140.row_major_mat4_comp_access,Fail -dEQP-VK.ssbo.layout.2_level_array.std140.row_major_mat4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.2_level_array.std140.row_major_mat4,Fail -dEQP-VK.ssbo.layout.2_level_array.std140.row_major_mat4_store_cols,Fail -dEQP-VK.ssbo.layout.2_level_array.std430.row_major_mat4_comp_access,Fail -dEQP-VK.ssbo.layout.2_level_array.std430.row_major_mat4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.2_level_array.std430.row_major_mat4,Fail -dEQP-VK.ssbo.layout.2_level_array.std430.row_major_mat4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat2x4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat2x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat2x4,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat2x4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat3_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat3,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat3_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat3x4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat3x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat3x4,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat3x4_store_cols,Fail 
-dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat4,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat4x2_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat4x2_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat4x2,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat4x2_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat4x3_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat4x3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat4x3,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat4x3_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat2x4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat2x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat2x4,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat2x4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat3_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat3,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat3_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat3x4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat3x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat3x4,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat3x4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4x2_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4x2_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4x2,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4x2_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4x3_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4x3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4x3,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4x3_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat2x4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat2x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat2x4,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat2x4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat3_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat3,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat3_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat3x4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat3x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat3x4,Fail 
-dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat3x4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4x2_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4x2_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4x2,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4x2_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4x3_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4x3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4x3,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4x3_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat2x4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat2x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat2x4,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat2x4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat3_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat3,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat3_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat3x4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat3x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat3x4,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat3x4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat4,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat4x2_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat4x2_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat4x2,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat4x2_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat4x3_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat4x3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat4x3,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat4x3_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat2x4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat2x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat2x4,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat2x4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat3_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat3,Fail 
-dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat3_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat3x4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat3x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat3x4,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat3x4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4x2_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4x2_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4x2,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4x2_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4x3_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4x3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4x3,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4x3_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat2x4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat2x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat2x4,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat2x4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat3_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat3,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat3_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat3x4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat3x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat3x4,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat3x4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4x2_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4x2_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4x2,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4x2_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4x3_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4x3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4x3,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4x3_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.scalar.row_major_mat3x4_comp_access,Fail 
-dEQP-VK.ssbo.layout.basic_unsized_array.scalar.row_major_mat3x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.scalar.row_major_mat3x4,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.scalar.row_major_mat3x4_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.scalar.row_major_mat4_comp_access,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.scalar.row_major_mat4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.scalar.row_major_mat4,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.scalar.row_major_mat4_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.scalar.row_major_mat4x3_comp_access,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.scalar.row_major_mat4x3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.scalar.row_major_mat4x3,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.scalar.row_major_mat4x3_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std140.row_major_mat3x4_comp_access,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std140.row_major_mat3x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std140.row_major_mat3x4,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std140.row_major_mat3x4_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std140.row_major_mat4_comp_access,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std140.row_major_mat4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std140.row_major_mat4,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std140.row_major_mat4_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std140.row_major_mat4x3_comp_access,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std140.row_major_mat4x3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std140.row_major_mat4x3,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std140.row_major_mat4x3_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std430.row_major_mat3x4_comp_access,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std430.row_major_mat3x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std430.row_major_mat3x4,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std430.row_major_mat3x4_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std430.row_major_mat4_comp_access,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std430.row_major_mat4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std430.row_major_mat4,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std430.row_major_mat4_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std430.row_major_mat4x3_comp_access,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std430.row_major_mat4x3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std430.row_major_mat4x3,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std430.row_major_mat4x3_store_cols,Fail -dEQP-VK.ssbo.layout.random.all_shared_buffer.5,Fail -dEQP-VK.ssbo.layout.random.nested_structs_arrays.0,Fail -dEQP-VK.ssbo.layout.random.nested_structs_arrays.17,Fail -dEQP-VK.ssbo.layout.random.scalar.19,Fail - bypass-dEQP-VK.renderpass.dedicated_allocation.attachment_allocation.input_output.7,Fail bypass-dEQP-VK.renderpass.suballocation.attachment_allocation.input_output.7,Fail bypass-dEQP-VK.renderpass2.dedicated_allocation.attachment_allocation.input_output.7,Fail diff --git a/src/freedreno/ci/deqp-freedreno-a630-skips.txt b/src/freedreno/ci/deqp-freedreno-a630-skips.txt index b3531a2c6ac..d95dac8a996 100644 --- a/src/freedreno/ci/deqp-freedreno-a630-skips.txt +++ b/src/freedreno/ci/deqp-freedreno-a630-skips.txt 
@@ -25,3 +25,8 @@ dEQP-VK.ubo.random.all_shared_buffer.48 # Still running after 3 hours, time is spent in batch_draw_tracking(). KHR-GLES31.core.shader_image_load_store.basic-allFormats-store-fs + +# causes a hangcheck timeout on a630: +# msm ae00000.mdss: [drm:hangcheck_handler] *ERROR* A630: hangcheck detected gpu lockup rb 0! +dEQP-VK.graphicsfuzz.spv-stable-maze-flatten-copy-composite +dEQP-VK.graphicsfuzz.spv-stable-pillars-volatile-nontemporal-store diff --git a/src/freedreno/ir3/disasm-a3xx.c b/src/freedreno/ir3/disasm-a3xx.c index 4a0fb40bbd1..ee4524797bd 100644 --- a/src/freedreno/ir3/disasm-a3xx.c +++ b/src/freedreno/ir3/disasm-a3xx.c @@ -348,6 +348,9 @@ static const struct opc_info { OPC(6, OPC_GETSPID, getspid), OPC(6, OPC_GETWID, getwid), + OPC(6, OPC_SPILL_MACRO, spill.macro), + OPC(6, OPC_RELOAD_MACRO, reload.macro), + OPC(7, OPC_BAR, bar), OPC(7, OPC_FENCE, fence), /* clang-format on */ diff --git a/src/freedreno/ir3/instr-a3xx.h b/src/freedreno/ir3/instr-a3xx.h index 93162c103b4..f367d6a197d 100644 --- a/src/freedreno/ir3/instr-a3xx.h +++ b/src/freedreno/ir3/instr-a3xx.h @@ -308,6 +308,9 @@ typedef enum { OPC_LDG_A = _OPC(6, 55), OPC_STG_A = _OPC(6, 56), + OPC_SPILL_MACRO = _OPC(6, 57), + OPC_RELOAD_MACRO = _OPC(6, 58), + /* category 7: */ OPC_BAR = _OPC(7, 0), OPC_FENCE = _OPC(7, 1), diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h index 187f8e13d3e..e0c678b0534 100644 --- a/src/freedreno/ir3/ir3.h +++ b/src/freedreno/ir3/ir3.h @@ -89,6 +89,7 @@ struct ir3_merge_set { uint16_t alignment; unsigned interval_start; + unsigned spill_slot; unsigned regs_count; struct ir3_register **regs; @@ -202,6 +203,8 @@ struct ir3_register { */ struct ir3_register *tied; + unsigned spill_slot, next_use; + unsigned merge_set_offset; struct ir3_merge_set *merge_set; unsigned interval_start, interval_end; @@ -711,6 +714,17 @@ ir3_instr_move_after(struct ir3_instruction *instr, list_add(&instr->node, &before->node); } +/** + * Move 'instr' to the beginning of the block: + */ +static inline void +ir3_instr_move_before_block(struct ir3_instruction *instr, + struct ir3_block *block) +{ + list_delinit(&instr->node); + list_add(&instr->node, &block->instr_list); +} + void ir3_find_ssa_uses(struct ir3 *ir, void *mem_ctx, bool falsedeps); void ir3_set_dst_type(struct ir3_instruction *instr, bool half); diff --git a/src/freedreno/ir3/ir3_compiler.c b/src/freedreno/ir3/ir3_compiler.c index 29a59052f99..cf407e6b2b3 100644 --- a/src/freedreno/ir3/ir3_compiler.c +++ b/src/freedreno/ir3/ir3_compiler.c @@ -44,6 +44,7 @@ static const struct debug_named_value shader_debug_options[] = { {"nouboopt", IR3_DBG_NOUBOOPT, "Disable lowering UBO to uniform"}, {"nofp16", IR3_DBG_NOFP16, "Don't lower mediump to fp16"}, {"nocache", IR3_DBG_NOCACHE, "Disable shader cache"}, + {"spillall", IR3_DBG_SPILLALL, "Spill as much as possible to test the spiller"}, #ifdef DEBUG /* DEBUG-only options: */ {"schedmsgs", IR3_DBG_SCHEDMSGS, "Enable scheduler debug messages"}, diff --git a/src/freedreno/ir3/ir3_compiler.h b/src/freedreno/ir3/ir3_compiler.h index cf2fe2ad221..afe6113b1e4 100644 --- a/src/freedreno/ir3/ir3_compiler.h +++ b/src/freedreno/ir3/ir3_compiler.h @@ -194,6 +194,7 @@ enum ir3_shader_debug { IR3_DBG_NOUBOOPT = BITFIELD_BIT(9), IR3_DBG_NOFP16 = BITFIELD_BIT(10), IR3_DBG_NOCACHE = BITFIELD_BIT(11), + IR3_DBG_SPILLALL = BITFIELD_BIT(12), /* DEBUG-only options: */ IR3_DBG_SCHEDMSGS = BITFIELD_BIT(20), diff --git a/src/freedreno/ir3/ir3_lower_spill.c b/src/freedreno/ir3/ir3_lower_spill.c new file mode 
100644 index 00000000000..265207105e9 --- /dev/null +++ b/src/freedreno/ir3/ir3_lower_spill.c @@ -0,0 +1,163 @@ +/* + * Copyright (C) 2021 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ir3_ra.h" + +/* The spilling pass leaves out a few details required to successfully operate + * ldp/stp: + * + * 1. ldp/stp can only load/store 4 components at a time, but spilling ignores + * that and just spills/restores entire values, including arrays and values + * created for texture setup which can be more than 4 components. + * 2. The spiller doesn't add barrier dependencies needed for post-RA + * scheduling. + * + * The first one, in particular, is much easier to handle after RA because + * arrays and normal values can be treated the same way. Therefore this pass + * runs after RA, and handles both issues. This keeps the complexity out of the + * spiller. + */ + +static void +split_spill(struct ir3_instruction *spill) +{ + unsigned orig_components = spill->srcs[2]->uim_val; + + /* We don't handle splitting dependencies. */ + assert(spill->deps_count == 0); + + if (orig_components <= 4) { + if (spill->srcs[1]->flags & IR3_REG_ARRAY) { + spill->srcs[1]->wrmask = MASK(orig_components); + spill->srcs[1]->num = spill->srcs[1]->array.base; + spill->srcs[1]->flags &= ~IR3_REG_ARRAY; + } + return; + } + + for (unsigned comp = 0; comp < orig_components; comp += 4) { + unsigned components = MIN2(orig_components - comp, 4); + struct ir3_instruction *clone = ir3_instr_clone(spill); + ir3_instr_move_before(clone, spill); + + clone->srcs[1]->wrmask = MASK(components); + if (clone->srcs[1]->flags & IR3_REG_ARRAY) { + clone->srcs[1]->num = clone->srcs[1]->array.base + comp; + clone->srcs[1]->flags &= ~IR3_REG_ARRAY; + } + + clone->srcs[2]->uim_val = components; + clone->cat6.dst_offset += + comp * ((spill->srcs[1]->flags & IR3_REG_HALF) ? 
2 : 4); + } + + list_delinit(&spill->node); +} + +static void +split_reload(struct ir3_instruction *reload) +{ + unsigned orig_components = reload->srcs[2]->uim_val; + + assert(reload->deps_count == 0); + + if (orig_components <= 4) { + if (reload->dsts[0]->flags & IR3_REG_ARRAY) { + reload->dsts[0]->wrmask = MASK(orig_components); + reload->dsts[0]->num = reload->dsts[0]->array.base; + reload->dsts[0]->flags &= ~IR3_REG_ARRAY; + } + return; + } + + for (unsigned comp = 0; comp < orig_components; comp += 4) { + unsigned components = MIN2(orig_components - comp, 4); + struct ir3_instruction *clone = ir3_instr_clone(reload); + ir3_instr_move_before(clone, reload); + + clone->dsts[0]->wrmask = MASK(components); + if (clone->dsts[0]->flags & IR3_REG_ARRAY) { + clone->dsts[0]->num = clone->dsts[0]->array.base + comp; + clone->dsts[0]->flags &= ~IR3_REG_ARRAY; + } + + clone->srcs[2]->uim_val = components; + clone->srcs[1]->uim_val += + comp * ((reload->dsts[0]->flags & IR3_REG_HALF) ? 2 : 4); + } + + list_delinit(&reload->node); +} + +static void +add_spill_reload_deps(struct ir3_block *block) +{ + struct ir3_instruction *last_spill = NULL; + + foreach_instr (instr, &block->instr_list) { + if ((instr->opc == OPC_SPILL_MACRO || instr->opc == OPC_RELOAD_MACRO) && + last_spill) { + ir3_instr_add_dep(instr, last_spill); + } + + if (instr->opc == OPC_SPILL_MACRO) + last_spill = instr; + } + + + last_spill = NULL; + + foreach_instr_rev (instr, &block->instr_list) { + if ((instr->opc == OPC_SPILL_MACRO || instr->opc == OPC_RELOAD_MACRO) && + last_spill) { + ir3_instr_add_dep(last_spill, instr); + } + + if (instr->opc == OPC_SPILL_MACRO) + last_spill = instr; + } +} + +bool +ir3_lower_spill(struct ir3 *ir) +{ + foreach_block (block, &ir->block_list) { + foreach_instr_safe (instr, &block->instr_list) { + if (instr->opc == OPC_SPILL_MACRO) + split_spill(instr); + else if (instr->opc == OPC_RELOAD_MACRO) + split_reload(instr); + } + + add_spill_reload_deps(block); + + foreach_instr (instr, &block->instr_list) { + if (instr->opc == OPC_SPILL_MACRO) + instr->opc = OPC_STP; + else if (instr->opc == OPC_RELOAD_MACRO) + instr->opc = OPC_LDP; + } + } + + return true; +} diff --git a/src/freedreno/ir3/ir3_merge_regs.c b/src/freedreno/ir3/ir3_merge_regs.c index bb88dbe8fc7..674bc648e03 100644 --- a/src/freedreno/ir3/ir3_merge_regs.c +++ b/src/freedreno/ir3/ir3_merge_regs.c @@ -198,6 +198,7 @@ get_merge_set(struct ir3_register *def) struct ir3_merge_set *set = ralloc(def, struct ir3_merge_set); set->preferred_reg = ~0; set->interval_start = ~0; + set->spill_slot = ~0; set->size = reg_size(def); set->alignment = (def->flags & IR3_REG_HALF) ? 
1 : 2; set->regs_count = 1; @@ -339,6 +340,19 @@ try_merge_defs(struct ir3_liveness *live, struct ir3_register *a, merge_merge_sets(a_set, b_set, b_set_offset); } +void +ir3_force_merge(struct ir3_register *a, struct ir3_register *b, int b_offset) +{ + struct ir3_merge_set *a_set = get_merge_set(a); + struct ir3_merge_set *b_set = get_merge_set(b); + + if (a_set == b_set) + return; + + int b_set_offset = a->merge_set_offset + b_offset - b->merge_set_offset; + merge_merge_sets(a_set, b_set, b_set_offset); +} + static void coalesce_phi(struct ir3_liveness *live, struct ir3_instruction *phi) { @@ -462,7 +476,7 @@ ir3_create_parallel_copies(struct ir3 *ir) } static void -index_merge_sets(struct ir3 *ir) +index_merge_sets(struct ir3_liveness *live, struct ir3 *ir) { unsigned offset = 0; foreach_block (block, &ir->block_list) { @@ -489,6 +503,8 @@ index_merge_sets(struct ir3 *ir) } } } + + live->interval_offset = offset; } #define RESET "\x1b[0m" @@ -559,7 +575,7 @@ ir3_merge_regs(struct ir3_liveness *live, struct ir3 *ir) } } - index_merge_sets(ir); + index_merge_sets(live, ir); if (ir3_shader_debug & IR3_DBG_RAMSGS) dump_merge_sets(ir); diff --git a/src/freedreno/ir3/ir3_ra.c b/src/freedreno/ir3/ir3_ra.c index 9c320e0ed90..6870769f74d 100644 --- a/src/freedreno/ir3/ir3_ra.c +++ b/src/freedreno/ir3/ir3_ra.c @@ -1990,6 +1990,152 @@ calc_target_full_pressure(struct ir3_shader_variant *v, unsigned pressure) return (target - 1) * 2 * 4; } +static void +add_pressure(struct ir3_pressure *pressure, struct ir3_register *reg, + bool merged_regs) +{ + unsigned size = reg_size(reg); + if (reg->flags & IR3_REG_HALF) + pressure->half += size; + if (!(reg->flags & IR3_REG_HALF) || merged_regs) + pressure->full += size; +} + +static void +dummy_interval_add(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval) +{ +} + +static void +dummy_interval_delete(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval) +{ +} + +static void +dummy_interval_readd(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *parent, + struct ir3_reg_interval *child) +{ +} + +/* Calculate the minimum possible limit on register pressure so that spilling + * still succeeds. Used to implement IR3_SHADER_DEBUG=spillall. + */ + +static void +calc_min_limit_pressure(struct ir3_shader_variant *v, + struct ir3_liveness *live, + struct ir3_pressure *limit) +{ + struct ir3_block *start = ir3_start_block(v->ir); + struct ir3_reg_ctx *ctx = ralloc(NULL, struct ir3_reg_ctx); + struct ir3_reg_interval *intervals = + rzalloc_array(ctx, struct ir3_reg_interval, live->definitions_count); + + ctx->interval_add = dummy_interval_add; + ctx->interval_delete = dummy_interval_delete; + ctx->interval_readd = dummy_interval_readd; + + limit->full = limit->half = 0; + + struct ir3_pressure cur_pressure = {0}; + foreach_instr (input, &start->instr_list) { + if (input->opc != OPC_META_INPUT && + input->opc != OPC_META_TEX_PREFETCH) + break; + + add_pressure(&cur_pressure, input->dsts[0], v->mergedregs); + } + + limit->full = MAX2(limit->full, cur_pressure.full); + limit->half = MAX2(limit->half, cur_pressure.half); + + foreach_instr (input, &start->instr_list) { + if (input->opc != OPC_META_INPUT && + input->opc != OPC_META_TEX_PREFETCH) + break; + + /* pre-colored inputs may have holes, which increases the pressure. 
*/ + struct ir3_register *dst = input->dsts[0]; + if (dst->num != INVALID_REG) { + unsigned physreg = ra_reg_get_physreg(dst) + reg_size(dst); + if (dst->flags & IR3_REG_HALF) + limit->half = MAX2(limit->half, physreg); + if (!(dst->flags & IR3_REG_HALF) || v->mergedregs) + limit->full = MAX2(limit->full, physreg); + } + } + + foreach_block (block, &v->ir->block_list) { + rb_tree_init(&ctx->intervals); + + unsigned name; + BITSET_FOREACH_SET (name, live->live_in[block->index], + live->definitions_count) { + struct ir3_register *reg = live->definitions[name]; + ir3_reg_interval_init(&intervals[reg->name], reg); + ir3_reg_interval_insert(ctx, &intervals[reg->name]); + } + + foreach_instr (instr, &block->instr_list) { + ra_foreach_dst (dst, instr) { + ir3_reg_interval_init(&intervals[dst->name], dst); + } + /* phis and parallel copies can be deleted via spilling */ + + if (instr->opc == OPC_META_PHI) { + ir3_reg_interval_insert(ctx, &intervals[instr->dsts[0]->name]); + continue; + } + + if (instr->opc == OPC_META_PARALLEL_COPY) + continue; + + cur_pressure = (struct ir3_pressure) {0}; + + ra_foreach_dst (dst, instr) { + if (dst->tied && !(dst->tied->flags & IR3_REG_KILL)) + add_pressure(&cur_pressure, dst, v->mergedregs); + } + + ra_foreach_src_rev (src, instr) { + /* We currently don't support spilling the parent of a source when + * making space for sources, so we have to keep track of the + * intervals and figure out the root of the tree to figure out how + * much space we need. + * + * TODO: We should probably support this in the spiller. + */ + struct ir3_reg_interval *interval = &intervals[src->def->name]; + while (interval->parent) + interval = interval->parent; + add_pressure(&cur_pressure, interval->reg, v->mergedregs); + + if (src->flags & IR3_REG_FIRST_KILL) + ir3_reg_interval_remove(ctx, &intervals[src->def->name]); + } + + limit->full = MAX2(limit->full, cur_pressure.full); + limit->half = MAX2(limit->half, cur_pressure.half); + + cur_pressure = (struct ir3_pressure) {0}; + + ra_foreach_dst (dst, instr) { + ir3_reg_interval_init(&intervals[dst->name], dst); + ir3_reg_interval_insert(ctx, &intervals[dst->name]); + add_pressure(&cur_pressure, dst, v->mergedregs); + } + + limit->full = MAX2(limit->full, cur_pressure.full); + limit->half = MAX2(limit->half, cur_pressure.half); + } + } + + /* Account for the base register, which needs to be available everywhere. */ + limit->full += 2; + + ralloc_free(ctx); +} + int ir3_ra(struct ir3_shader_variant *v) { @@ -2010,15 +2156,35 @@ ir3_ra(struct ir3_shader_variant *v) d("\thalf: %u", max_pressure.half); d("\tshared: %u", max_pressure.shared); - if (v->mergedregs) { - max_pressure.full += max_pressure.half; - max_pressure.half = 0; + /* TODO: calculate half/full limit correctly for CS with barrier */ + struct ir3_pressure limit_pressure; + limit_pressure.full = RA_FULL_SIZE; + limit_pressure.half = RA_HALF_SIZE; + limit_pressure.shared = RA_SHARED_SIZE; + + /* If requested, lower the limit so that spilling happens more often. 
*/ + if (ir3_shader_debug & IR3_DBG_SPILLALL) + calc_min_limit_pressure(v, live, &limit_pressure); + + if (max_pressure.shared > limit_pressure.shared) { + /* TODO shared reg -> normal reg spilling */ + d("shared max pressure exceeded!"); + return 1; } - if (max_pressure.full > RA_FULL_SIZE || max_pressure.half > RA_HALF_SIZE || - max_pressure.shared > RA_SHARED_SIZE) { - d("max pressure exceeded!"); - return 1; + bool spilled = false; + if (max_pressure.full > limit_pressure.full || + max_pressure.half > limit_pressure.half) { + if (!v->shader->compiler->has_pvtmem) { + d("max pressure exceeded!"); + return 1; + } + d("max pressure exceeded, spilling!"); + IR3_PASS(v->ir, ir3_spill, v, &live, &limit_pressure); + ir3_calc_pressure(v, live, &max_pressure); + assert(max_pressure.full <= limit_pressure.full && + max_pressure.half <= limit_pressure.half); + spilled = true; } struct ra_ctx *ctx = rzalloc(NULL, struct ra_ctx); @@ -2054,19 +2220,20 @@ ir3_ra(struct ir3_shader_variant *v) for (unsigned i = 0; i < instr->dsts_count; i++) { instr->dsts[i]->flags &= ~IR3_REG_SSA; - /* Parallel copies of array registers copy the whole register, - * and we need some way to let the parallel copy code know - * that this was an array whose size is determined by - * reg->size. So keep the array flag on those. + /* Parallel copies of array registers copy the whole register, and + * we need some way to let the parallel copy code know that this was + * an array whose size is determined by reg->size. So keep the array + * flag on those. spill/reload also need to work on the entire + * array. */ - if (!is_meta(instr)) + if (!is_meta(instr) && instr->opc != OPC_RELOAD_MACRO) instr->dsts[i]->flags &= ~IR3_REG_ARRAY; } for (unsigned i = 0; i < instr->srcs_count; i++) { instr->srcs[i]->flags &= ~IR3_REG_SSA; - if (!is_meta(instr)) + if (!is_meta(instr) && instr->opc != OPC_SPILL_MACRO) instr->srcs[i]->flags &= ~IR3_REG_ARRAY; } } @@ -2074,6 +2241,10 @@ ir3_ra(struct ir3_shader_variant *v) ir3_debug_print(v->ir, "AFTER: register allocation"); + if (spilled) { + IR3_PASS(v->ir, ir3_lower_spill); + } + ir3_lower_copies(v); ir3_debug_print(v->ir, "AFTER: ir3_lower_copies"); diff --git a/src/freedreno/ir3/ir3_ra.h b/src/freedreno/ir3/ir3_ra.h index fcef6a908e1..4a7c9d7752a 100644 --- a/src/freedreno/ir3/ir3_ra.h +++ b/src/freedreno/ir3/ir3_ra.h @@ -137,6 +137,7 @@ ra_reg_is_dst(const struct ir3_register *reg) struct ir3_liveness { unsigned block_count; + unsigned interval_offset; DECLARE_ARRAY(struct ir3_register *, definitions); DECLARE_ARRAY(BITSET_WORD *, live_out); DECLARE_ARRAY(BITSET_WORD *, live_in); @@ -151,6 +152,9 @@ void ir3_create_parallel_copies(struct ir3 *ir); void ir3_merge_regs(struct ir3_liveness *live, struct ir3 *ir); +void ir3_force_merge(struct ir3_register *a, struct ir3_register *b, + int b_offset); + struct ir3_pressure { unsigned full, half, shared; }; @@ -158,6 +162,12 @@ struct ir3_pressure { void ir3_calc_pressure(struct ir3_shader_variant *v, struct ir3_liveness *live, struct ir3_pressure *max_pressure); +bool ir3_spill(struct ir3 *ir, struct ir3_shader_variant *v, + struct ir3_liveness **live, + const struct ir3_pressure *limit_pressure); + +bool ir3_lower_spill(struct ir3 *ir); + void ir3_ra_validate(struct ir3_shader_variant *v, unsigned full_size, unsigned half_size, unsigned block_count); diff --git a/src/freedreno/ir3/ir3_spill.c b/src/freedreno/ir3/ir3_spill.c index 54ae56b1beb..0b9c56a7680 100644 --- a/src/freedreno/ir3/ir3_spill.c +++ b/src/freedreno/ir3/ir3_spill.c @@ -26,62 
+26,318 @@ #include "ir3_shader.h" /* - * This pass does one thing so far: + * This pass does two things: * * 1. Calculates the maximum register pressure. To do this, we need to use the - * exact same technique that RA uses for combining meta_split instructions - * with their sources, so that our calculation agrees with RA. - * - * It will also optionally spill registers once that's implemented. + * exact same technique that RA uses for combining meta_split instructions + * with their sources, so that our calculation agrees with RA. + * 2. Spills when the register pressure is exceeded a limit calculated by RA. + * The implementation is based on "Register Spilling and Live-Range Splitting + * for SSA-Form Programs" by Braun and Hack, although again care has to be + * taken to handle combining split/collect instructions. */ +struct reg_or_immed { + unsigned flags; + union { + struct ir3_register *def; + uint32_t uimm; + unsigned const_num; + }; +}; + struct ra_spill_interval { struct ir3_reg_interval interval; + + struct rb_node node; + struct rb_node half_node; + + /* The current SSA value/const/immed this source is mapped to. */ + struct reg_or_immed dst; + + /* When computing use distances we use the distance relative to the start + * of the block. So, for example, a value that's defined in cycle 5 of the + * block and used 6 cycles later will always have a next_use_distance of 11 + * until we reach that use. + */ + unsigned next_use_distance; + + /* Whether this value was reloaded and therefore doesn't need to be + * spilled again. Corresponds to the S set in the paper. + */ + bool already_spilled; + + /* We need to add sources early for accounting purposes, but we have to + * insert the reload code for them last. Keep track of whether this interval + * needs to be reloaded later. + */ + bool needs_reload; + + /* Keep track of whether this interval currently can't be spilled because: + * - It or one of its children is a source and we're making space for + * sources. + * - It is a destination and we're making space for destinations. + */ + bool cant_spill; +}; + +struct ra_spill_block_state { + unsigned *next_use_end; + unsigned *next_use_start; + + unsigned cycles; + + /* Map from SSA def to reg_or_immed it is mapped to at the end of the block. + * This map only contains values which we didn't spill, so it also serves as + * a record of the new live-out set for this block. + */ + struct hash_table *remap; + + /* For blocks whose successors are visited first (i.e. loop backedges), which + * values should be live at the end. + */ + BITSET_WORD *live_out; + + bool visited; }; struct ra_spill_ctx { struct ir3_reg_ctx reg_ctx; - struct ra_spill_interval *intervals; + struct ra_spill_interval **intervals; + unsigned intervals_count; + + /* rb tree of live intervals that we can spill, ordered by next-use distance. + * full_live_intervals contains the full+shared intervals in the merged_regs + * case. We use this list to determine what to spill. + */ + struct rb_tree full_live_intervals; + struct rb_tree half_live_intervals; struct ir3_pressure cur_pressure, max_pressure; + struct ir3_pressure limit_pressure; + + /* When spilling, we need to reserve a register to serve as the zero'd + * "base". For simplicity we reserve a register at the beginning so that it's + * always available. + */ + struct ir3_register *base_reg; + + /* Current pvtmem offset in bytes. 
*/ + unsigned spill_slot; + struct ir3_liveness *live; const struct ir3_compiler *compiler; + + struct ra_spill_block_state *blocks; + + bool spilling; + + bool merged_regs; }; +static void +add_base_reg(struct ra_spill_ctx *ctx, struct ir3 *ir) +{ + struct ir3_block *start = ir3_start_block(ir); + + /* We need to stick it after any meta instructions which need to be first. */ + struct ir3_instruction *after = NULL; + foreach_instr (instr, &start->instr_list) { + if (instr->opc != OPC_META_INPUT && + instr->opc != OPC_META_TEX_PREFETCH) { + after = instr; + break; + } + } + + struct ir3_instruction *mov = create_immed(start, 0); + + if (after) + ir3_instr_move_before(mov, after); + + ctx->base_reg = mov->dsts[0]; + + /* We don't create an interval, etc. for the base reg, so just lower the + * register pressure limit to account for it. We assume it's always + * available for simplicity. + */ + ctx->limit_pressure.full -= reg_size(ctx->base_reg); +} + + +/* Compute the number of cycles per instruction used for next-use-distance + * analysis. This is just approximate, obviously. + */ +static unsigned +instr_cycles(struct ir3_instruction *instr) +{ + if (instr->opc == OPC_META_PARALLEL_COPY) { + unsigned cycles = 0; + for (unsigned i = 0; i < instr->dsts_count; i++) { + if (!instr->srcs[i]->def || + instr->srcs[i]->def->merge_set != instr->dsts[i]->merge_set) { + cycles += reg_elems(instr->srcs[i]); + } + } + + return cycles; + } + + if (instr->opc == OPC_META_COLLECT) { + unsigned cycles = 0; + for (unsigned i = 0; i < instr->srcs_count; i++) { + if (!instr->srcs[i]->def || + instr->srcs[i]->def->merge_set != instr->dsts[0]->merge_set) { + cycles++; + } + } + + return cycles; + } + + if (is_meta(instr)) + return 0; + + return 1 + instr->repeat; +} + +static bool +compute_block_next_distance(struct ra_spill_ctx *ctx, struct ir3_block *block, + unsigned *tmp_next_use) +{ + struct ra_spill_block_state *state = &ctx->blocks[block->index]; + memcpy(tmp_next_use, state->next_use_end, + ctx->live->definitions_count * sizeof(*tmp_next_use)); + + unsigned cycle = state->cycles; + foreach_instr_rev (instr, &block->instr_list) { + ra_foreach_dst (dst, instr) { + dst->next_use = tmp_next_use[dst->name]; + } + + ra_foreach_src (src, instr) { + src->next_use = tmp_next_use[src->def->name]; + } + + cycle -= instr_cycles(instr); + + if (instr->opc == OPC_META_PARALLEL_COPY) { + ra_foreach_src_n (src, i, instr) { + if (src->def->merge_set == instr->dsts[i]->merge_set && + src->def->merge_set_offset == instr->dsts[i]->merge_set_offset) { + tmp_next_use[src->def->name] = + tmp_next_use[instr->dsts[i]->name]; + } else { + tmp_next_use[src->def->name] = cycle; + } + } + } else if (instr->opc != OPC_META_PHI) { + ra_foreach_src (src, instr) { + tmp_next_use[src->def->name] = cycle; + } + } + + ra_foreach_dst (dst, instr) { + tmp_next_use[dst->name] = UINT_MAX; + } + } + + memcpy(state->next_use_start, tmp_next_use, + ctx->live->definitions_count * sizeof(*tmp_next_use)); + + bool progress = false; + for (unsigned i = 0; i < block->predecessors_count; i++) { + const struct ir3_block *pred = block->predecessors[i]; + struct ra_spill_block_state *pred_state = &ctx->blocks[pred->index]; + + /* Add a large-enough distance in front of edges exiting the loop so that + * variables that are live-through the loop but not used inside it are + * prioritized for spilling, as per the paper. This just needs to be + * larger than the longest path through the loop. 
+ */ + bool loop_exit = pred->loop_depth < block->loop_depth; + unsigned block_distance = pred_state->cycles + (loop_exit ? 100000 : 0); + + for (unsigned j = 0; j < ctx->live->definitions_count; j++) { + if (state->next_use_start[j] < UINT_MAX && + state->next_use_start[j] + block_distance < + pred_state->next_use_end[j]) { + pred_state->next_use_end[j] = state->next_use_start[j] + + block_distance; + progress = true; + } + } + + foreach_instr (phi, &block->instr_list) { + if (phi->opc != OPC_META_PHI) + break; + if (!phi->srcs[i]->def) + continue; + unsigned src = phi->srcs[i]->def->name; + if (phi->dsts[0]->next_use < UINT_MAX && + phi->dsts[0]->next_use + block_distance < + pred_state->next_use_end[src]) { + pred_state->next_use_end[src] = phi->dsts[0]->next_use + + block_distance; + progress = true; + } + } + } + + return progress; +} + +static void +compute_next_distance(struct ra_spill_ctx *ctx, struct ir3 *ir) +{ + for (unsigned i = 0; i < ctx->live->block_count; i++) { + ctx->blocks[i].next_use_start = + ralloc_array(ctx, unsigned, ctx->live->definitions_count); + ctx->blocks[i].next_use_end = + ralloc_array(ctx, unsigned, ctx->live->definitions_count); + + for (unsigned j = 0; j < ctx->live->definitions_count; j++) { + ctx->blocks[i].next_use_start[j] = UINT_MAX; + ctx->blocks[i].next_use_end[j] = UINT_MAX; + } + } + + foreach_block (block, &ir->block_list) { + struct ra_spill_block_state *state = &ctx->blocks[block->index]; + state->cycles = 0; + foreach_instr (instr, &block->instr_list) { + state->cycles += instr_cycles(instr); + foreach_dst (dst, instr) { + dst->spill_slot = ~0; + } + } + } + + unsigned *tmp_next_use = + ralloc_array(ctx, unsigned, ctx->live->definitions_count); + + bool progress = true; + while (progress) { + progress = false; + foreach_block_rev (block, &ir->block_list) { + progress |= compute_block_next_distance(ctx, block, tmp_next_use); + } + } +} + static void ra_spill_interval_init(struct ra_spill_interval *interval, struct ir3_register *reg) { ir3_reg_interval_init(&interval->interval, reg); -} - -static void -ra_pressure_add(struct ir3_pressure *pressure, - struct ra_spill_interval *interval) -{ - unsigned size = reg_size(interval->interval.reg); - if (interval->interval.reg->flags & IR3_REG_SHARED) - pressure->shared += size; - else if (interval->interval.reg->flags & IR3_REG_HALF) - pressure->half += size; - else - pressure->full += size; -} - -static void -ra_pressure_sub(struct ir3_pressure *pressure, - struct ra_spill_interval *interval) -{ - unsigned size = reg_size(interval->interval.reg); - if (interval->interval.reg->flags & IR3_REG_SHARED) - pressure->shared -= size; - else if (interval->interval.reg->flags & IR3_REG_HALF) - pressure->half -= size; - else - pressure->full -= size; + interval->dst.flags = reg->flags; + interval->dst.def = reg; + interval->already_spilled = false; + interval->needs_reload = false; + interval->cant_spill = false; } static struct ra_spill_interval * @@ -90,19 +346,66 @@ ir3_reg_interval_to_interval(struct ir3_reg_interval *interval) return rb_node_data(struct ra_spill_interval, interval, interval); } +static struct ra_spill_interval * +ra_spill_interval_root(struct ra_spill_interval *interval) +{ + struct ir3_reg_interval *ir3_interval = &interval->interval; + while (ir3_interval->parent) + ir3_interval = ir3_interval->parent; + return ir3_reg_interval_to_interval(ir3_interval); +} + static struct ra_spill_ctx * ir3_reg_ctx_to_ctx(struct ir3_reg_ctx *ctx) { return rb_node_data(struct ra_spill_ctx, ctx, reg_ctx); 
} +static int +ra_spill_interval_cmp(const struct rb_node *_a, const struct rb_node *_b) +{ + const struct ra_spill_interval *a = + rb_node_data(const struct ra_spill_interval, _a, node); + const struct ra_spill_interval *b = + rb_node_data(const struct ra_spill_interval, _b, node); + return a->next_use_distance - b->next_use_distance; +} + +static int +ra_spill_interval_half_cmp(const struct rb_node *_a, const struct rb_node *_b) +{ + const struct ra_spill_interval *a = + rb_node_data(const struct ra_spill_interval, _a, half_node); + const struct ra_spill_interval *b = + rb_node_data(const struct ra_spill_interval, _b, half_node); + return a->next_use_distance - b->next_use_distance; +} + static void interval_add(struct ir3_reg_ctx *_ctx, struct ir3_reg_interval *_interval) { struct ra_spill_interval *interval = ir3_reg_interval_to_interval(_interval); struct ra_spill_ctx *ctx = ir3_reg_ctx_to_ctx(_ctx); - ra_pressure_add(&ctx->cur_pressure, interval); + unsigned size = reg_size(interval->interval.reg); + if (interval->interval.reg->flags & IR3_REG_SHARED) { + ctx->cur_pressure.shared += size; + } else { + if (interval->interval.reg->flags & IR3_REG_HALF) { + ctx->cur_pressure.half += size; + if (ctx->spilling) { + rb_tree_insert(&ctx->half_live_intervals, &interval->half_node, + ra_spill_interval_half_cmp); + } + } + if (ctx->merged_regs || !(interval->interval.reg->flags & IR3_REG_HALF)) { + ctx->cur_pressure.full += size; + if (ctx->spilling) { + rb_tree_insert(&ctx->full_live_intervals, &interval->node, + ra_spill_interval_cmp); + } + } + } } static void @@ -111,7 +414,23 @@ interval_delete(struct ir3_reg_ctx *_ctx, struct ir3_reg_interval *_interval) struct ra_spill_interval *interval = ir3_reg_interval_to_interval(_interval); struct ra_spill_ctx *ctx = ir3_reg_ctx_to_ctx(_ctx); - ra_pressure_sub(&ctx->cur_pressure, interval); + unsigned size = reg_size(interval->interval.reg); + if (interval->interval.reg->flags & IR3_REG_SHARED) { + ctx->cur_pressure.shared -= size; + } else { + if (interval->interval.reg->flags & IR3_REG_HALF) { + ctx->cur_pressure.half -= size; + if (ctx->spilling) { + rb_tree_remove(&ctx->half_live_intervals, &interval->half_node); + } + } + if (ctx->merged_regs || !(interval->interval.reg->flags & IR3_REG_HALF)) { + ctx->cur_pressure.full -= size; + if (ctx->spilling) { + rb_tree_remove(&ctx->full_live_intervals, &interval->node); + } + } + } } static void @@ -122,8 +441,22 @@ interval_readd(struct ir3_reg_ctx *_ctx, struct ir3_reg_interval *_parent, } static void -spill_ctx_init(struct ra_spill_ctx *ctx) +spill_ctx_init(struct ra_spill_ctx *ctx, struct ir3_shader_variant *v, + struct ir3_liveness *live) { + ctx->live = live; + ctx->intervals = ralloc_array(ctx, struct ra_spill_interval *, + ctx->live->definitions_count); + struct ra_spill_interval *intervals = + rzalloc_array(ctx, struct ra_spill_interval, + ctx->live->definitions_count); + for (unsigned i = 0; i < ctx->live->definitions_count; i++) + ctx->intervals[i] = &intervals[i]; + + ctx->intervals_count = ctx->live->definitions_count; + ctx->compiler = v->shader->compiler; + ctx->merged_regs = v->mergedregs; + rb_tree_init(&ctx->reg_ctx.intervals); ctx->reg_ctx.interval_add = interval_add; ctx->reg_ctx.interval_delete = interval_delete; @@ -147,18 +480,21 @@ ra_spill_ctx_remove(struct ra_spill_ctx *ctx, static void init_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst) { - struct ra_spill_interval *interval = &ctx->intervals[dst->name]; + struct ra_spill_interval *interval = 
ctx->intervals[dst->name]; ra_spill_interval_init(interval, dst); + if (ctx->spilling) + interval->next_use_distance = dst->next_use; } static void insert_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst) { - struct ra_spill_interval *interval = &ctx->intervals[dst->name]; + struct ra_spill_interval *interval = ctx->intervals[dst->name]; if (interval->interval.inserted) return; ra_spill_ctx_insert(ctx, interval); + interval->cant_spill = true; /* For precolored inputs, make sure we leave enough registers to allow for * holes in the inputs. It can happen that the binning shader has a lower @@ -179,14 +515,26 @@ insert_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst) } } +static void +insert_src(struct ra_spill_ctx *ctx, struct ir3_register *src) +{ + struct ra_spill_interval *interval = ctx->intervals[src->def->name]; + + ra_spill_interval_root(interval)->cant_spill = true; + + if (interval->interval.inserted) + return; + + ra_spill_ctx_insert(ctx, interval); + interval->needs_reload = true; + interval->already_spilled = true; +} + static void remove_src_early(struct ra_spill_ctx *ctx, struct ir3_instruction *instr, struct ir3_register *src) { - if (!(src->flags & IR3_REG_FIRST_KILL)) - return; - - struct ra_spill_interval *interval = &ctx->intervals[src->def->name]; + struct ra_spill_interval *interval = ctx->intervals[src->def->name]; if (!interval->interval.inserted || interval->interval.parent || !rb_tree_is_empty(&interval->interval.children)) @@ -199,10 +547,7 @@ static void remove_src(struct ra_spill_ctx *ctx, struct ir3_instruction *instr, struct ir3_register *src) { - if (!(src->flags & IR3_REG_FIRST_KILL)) - return; - - struct ra_spill_interval *interval = &ctx->intervals[src->def->name]; + struct ra_spill_interval *interval = ctx->intervals[src->def->name]; if (!interval->interval.inserted) return; @@ -210,10 +555,17 @@ remove_src(struct ra_spill_ctx *ctx, struct ir3_instruction *instr, ra_spill_ctx_remove(ctx, interval); } +static void +finish_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst) +{ + struct ra_spill_interval *interval = ctx->intervals[dst->name]; + interval->cant_spill = false; +} + static void remove_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst) { - struct ra_spill_interval *interval = &ctx->intervals[dst->name]; + struct ra_spill_interval *interval = ctx->intervals[dst->name]; if (!interval->interval.inserted) return; @@ -221,6 +573,361 @@ remove_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst) ra_spill_ctx_remove(ctx, interval); } +static void +update_src_next_use(struct ra_spill_ctx *ctx, struct ir3_register *src) +{ + struct ra_spill_interval *interval = ctx->intervals[src->def->name]; + + assert(interval->interval.inserted); + + interval->next_use_distance = src->next_use; + + /* If this node is inserted in one of the trees, then it needs to be resorted + * as its key has changed. 
+ */ + if (!interval->interval.parent && !(src->flags & IR3_REG_SHARED)) { + if (src->flags & IR3_REG_HALF) { + rb_tree_remove(&ctx->half_live_intervals, &interval->half_node); + rb_tree_insert(&ctx->half_live_intervals, &interval->half_node, + ra_spill_interval_half_cmp); + } + if (ctx->merged_regs || !(src->flags & IR3_REG_HALF)) { + rb_tree_remove(&ctx->full_live_intervals, &interval->node); + rb_tree_insert(&ctx->full_live_intervals, &interval->node, + ra_spill_interval_cmp); + } + } +} + +static unsigned +get_spill_slot(struct ra_spill_ctx *ctx, struct ir3_register *reg) +{ + if (reg->merge_set) { + if (reg->merge_set->spill_slot == ~0) { + reg->merge_set->spill_slot = ALIGN_POT(ctx->spill_slot, + reg->merge_set->alignment); + ctx->spill_slot = reg->merge_set->spill_slot + reg->merge_set->size * 2; + } + return reg->merge_set->spill_slot + reg->merge_set_offset * 2; + } else { + if (reg->spill_slot == ~0) { + reg->spill_slot = ALIGN_POT(ctx->spill_slot, reg_elem_size(reg)); + ctx->spill_slot = reg->spill_slot + reg_size(reg) * 2; + } + return reg->spill_slot; + } +} + +static void +set_src_val(struct ir3_register *src, const struct reg_or_immed *val) +{ + if (val->flags & IR3_REG_IMMED) { + src->flags = IR3_REG_IMMED | (val->flags & IR3_REG_HALF); + src->uim_val = val->uimm; + src->def = NULL; + } else if (val->flags & IR3_REG_CONST) { + src->flags = IR3_REG_CONST | (val->flags & IR3_REG_HALF); + src->num = val->const_num; + src->def = NULL; + } else { + src->def = val->def; + } +} + +static struct ir3_register * +materialize_pcopy_src(const struct reg_or_immed *src, + struct ir3_instruction *instr, + struct ir3_block *block) +{ + struct ir3_instruction *mov = ir3_instr_create(block, OPC_MOV, 1, 1); + struct ir3_register *dst = __ssa_dst(mov); + dst->flags |= src->flags & IR3_REG_HALF; + struct ir3_register *mov_src = ir3_src_create(mov, INVALID_REG, src->flags); + set_src_val(mov_src, src); + mov->cat1.src_type = mov->cat1.dst_type = + (src->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32; + + if (instr) + ir3_instr_move_before(mov, instr); + return dst; +} + +static void +spill(struct ra_spill_ctx *ctx, const struct reg_or_immed *val, + unsigned spill_slot, struct ir3_instruction *instr, struct ir3_block *block) +{ + struct ir3_register *reg; + + /* If spilling an immed/const pcopy src, we need to actually materialize it + * first with a mov. + */ + if (val->flags & (IR3_REG_CONST | IR3_REG_IMMED)) { + reg = materialize_pcopy_src(val, instr, block); + } else { + reg = val->def; + } + + d("spilling ssa_%u:%u to %u", reg->instr->serialno, reg->name, + spill_slot); + + unsigned elems = reg_elems(reg); + struct ir3_instruction *spill = + ir3_instr_create(block, OPC_SPILL_MACRO, 0, 3); + ir3_src_create(spill, INVALID_REG, ctx->base_reg->flags)->def = ctx->base_reg; + unsigned src_flags = reg->flags & (IR3_REG_HALF | IR3_REG_IMMED | + IR3_REG_CONST | IR3_REG_SSA | + IR3_REG_ARRAY); + struct ir3_register *src = ir3_src_create(spill, INVALID_REG, src_flags); + ir3_src_create(spill, INVALID_REG, IR3_REG_IMMED)->uim_val = elems; + spill->cat6.dst_offset = spill_slot; + spill->cat6.type = (reg->flags & IR3_REG_HALF) ? 
TYPE_U16 : TYPE_U32; + + src->def = reg; + if (reg->flags & IR3_REG_ARRAY) { + src->size = reg->size; + src->array.id = reg->array.id; + src->array.offset = 0; + } else { + src->wrmask = reg->wrmask; + } + + if (instr) + ir3_instr_move_before(spill, instr); +} + +static void +spill_interval(struct ra_spill_ctx *ctx, struct ra_spill_interval *interval, + struct ir3_instruction *instr, struct ir3_block *block) +{ + spill(ctx, &interval->dst, get_spill_slot(ctx, interval->interval.reg), + instr, block); +} + +/* This is similar to "limit" in the paper. */ +static void +limit(struct ra_spill_ctx *ctx, struct ir3_instruction *instr) +{ + if (ctx->cur_pressure.half > ctx->limit_pressure.half) { + d("cur half pressure %u exceeds %u", ctx->cur_pressure.half, + ctx->limit_pressure.half); + rb_tree_foreach_safe (struct ra_spill_interval, interval, + &ctx->half_live_intervals, half_node) { + d("trying ssa_%u:%u", interval->interval.reg->instr->serialno, + interval->interval.reg->name); + if (!interval->cant_spill) { + if (!interval->already_spilled) + spill_interval(ctx, interval, instr, instr->block); + ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval); + if (ctx->cur_pressure.half <= ctx->limit_pressure.half) + break; + } + } + + assert(ctx->cur_pressure.half <= ctx->limit_pressure.half); + } + + if (ctx->cur_pressure.full > ctx->limit_pressure.full) { + d("cur full pressure %u exceeds %u", ctx->cur_pressure.full, + ctx->limit_pressure.full); + rb_tree_foreach_safe (struct ra_spill_interval, interval, + &ctx->full_live_intervals, node) { + d("trying ssa_%u:%u", interval->interval.reg->instr->serialno, + interval->interval.reg->name); + if (!interval->cant_spill) { + if (!interval->already_spilled) + spill_interval(ctx, interval, instr, instr->block); + ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval); + if (ctx->cur_pressure.full <= ctx->limit_pressure.full) + break; + } else { + d("can't spill"); + } + } + + assert(ctx->cur_pressure.full <= ctx->limit_pressure.full); + } +} + +/* There's a corner case where we reload a value which has overlapping live + * values already reloaded, either because it's the child of some other interval + * that was already reloaded or some of its children have already been + * reloaded. Because RA only expects overlapping source/dest intervals for meta + * instructions (split/collect), and we don't want to add register pressure by + * creating an entirely separate value, we need to add splits and collects to + * deal with this case. These splits/collects have to also have correct merge + * set information, so that it doesn't result in any actual code or register + * pressure in practice. 
+ */ + +static void +add_to_merge_set(struct ir3_merge_set *set, struct ir3_register *def, + unsigned offset) +{ + def->merge_set = set; + def->merge_set_offset = offset; + def->interval_start = set->interval_start + offset; + def->interval_end = set->interval_start + offset + reg_size(def); +} + +static struct ir3_register * +split(struct ir3_register *def, unsigned offset, + struct ir3_instruction *after, struct ir3_block *block) +{ + if (reg_elems(def) == 1) { + assert(offset == 0); + return def; + } + + assert(!(def->flags & IR3_REG_ARRAY)); + assert(def->merge_set); + struct ir3_instruction *split = + ir3_instr_create(after->block, OPC_META_SPLIT, 1, 1); + struct ir3_register *dst = __ssa_dst(split); + dst->flags |= def->flags & IR3_REG_HALF; + struct ir3_register *src = ir3_src_create(split, INVALID_REG, def->flags); + src->wrmask = def->wrmask; + src->def = def; + add_to_merge_set(def->merge_set, dst, + def->merge_set_offset + offset * reg_elem_size(def)); + if (after) + ir3_instr_move_before(split, after); + return dst; +} + +static struct ir3_register * +extract(struct ir3_register *parent_def, unsigned offset, unsigned elems, + struct ir3_instruction *after, struct ir3_block *block) +{ + if (offset == 0 && elems == reg_elems(parent_def)) + return parent_def; + + struct ir3_instruction *collect = + ir3_instr_create(after->block, OPC_META_COLLECT, 1, elems); + struct ir3_register *dst = __ssa_dst(collect); + dst->flags |= parent_def->flags & IR3_REG_HALF; + dst->wrmask = MASK(elems); + add_to_merge_set(parent_def->merge_set, dst, parent_def->merge_set_offset); + + for (unsigned i = 0; i < elems; i++) { + ir3_src_create(collect, INVALID_REG, parent_def->flags)->def = + split(parent_def, offset + i, after, block); + } + + if (after) + ir3_instr_move_before(collect, after); + return dst; +} + +static struct ir3_register * +reload(struct ra_spill_ctx *ctx, struct ir3_register *reg, + struct ir3_instruction *after, struct ir3_block *block) +{ + unsigned spill_slot = get_spill_slot(ctx, reg); + + d("reloading ssa_%u:%u from %u", reg->instr->serialno, reg->name, + spill_slot); + + unsigned elems = reg_elems(reg); + struct ir3_instruction *reload = + ir3_instr_create(block, OPC_RELOAD_MACRO, 1, 3); + struct ir3_register *dst = __ssa_dst(reload); + dst->flags |= reg->flags & (IR3_REG_HALF | IR3_REG_ARRAY); + ir3_src_create(reload, INVALID_REG, ctx->base_reg->flags)->def = ctx->base_reg; + struct ir3_register *offset_reg = + ir3_src_create(reload, INVALID_REG, IR3_REG_IMMED); + offset_reg->uim_val = spill_slot; + ir3_src_create(reload, INVALID_REG, IR3_REG_IMMED)->uim_val = elems; + reload->cat6.type = (reg->flags & IR3_REG_HALF) ? 
TYPE_U16 : TYPE_U32; + + if (reg->flags & IR3_REG_ARRAY) { + dst->array.offset = 0; + dst->array.id = reg->array.id; + dst->size = reg->size; + } else { + dst->wrmask = MASK(elems); + } + + dst->merge_set = reg->merge_set; + dst->merge_set_offset = reg->merge_set_offset; + dst->interval_start = reg->interval_start; + dst->interval_end = reg->interval_end; + + if (after) + ir3_instr_move_before(reload, after); + + return dst; +} + +static void +rewrite_src_interval(struct ra_spill_ctx *ctx, + struct ra_spill_interval *interval, + struct ir3_register *def, + struct ir3_instruction *instr, + struct ir3_block *block) +{ + interval->dst.flags = def->flags; + interval->dst.def = def; + interval->needs_reload = false; + + rb_tree_foreach (struct ra_spill_interval, child, + &interval->interval.children, interval.node) { + struct ir3_register *child_reg = child->interval.reg; + struct ir3_register *child_def = + extract(def, (child_reg->interval_start - + interval->interval.reg->interval_start) / reg_elem_size(def), + reg_elems(child_reg), instr, block); + rewrite_src_interval(ctx, child, child_def, instr, block); + } +} + +static void +reload_def(struct ra_spill_ctx *ctx, struct ir3_register *def, + struct ir3_instruction *instr, struct ir3_block *block) +{ + unsigned elems = reg_elems(def); + struct ra_spill_interval *interval = ctx->intervals[def->name]; + + struct ir3_reg_interval *ir3_parent = interval->interval.parent; + + if (ir3_parent) { + struct ra_spill_interval *parent = + ir3_reg_interval_to_interval(ir3_parent); + if (!parent->needs_reload) { + interval->dst.flags = def->flags; + interval->dst.def = extract( + parent->dst.def, (def->interval_start - parent->dst.def->interval_start) / + reg_elem_size(def), elems, instr, block); + return; + } + } + + struct ir3_register *dst = reload(ctx, def, instr, block); + + rewrite_src_interval(ctx, interval, dst, instr, block); +} + +static void +reload_src(struct ra_spill_ctx *ctx, struct ir3_instruction *instr, + struct ir3_register *src) +{ + struct ra_spill_interval *interval = ctx->intervals[src->def->name]; + + if (interval->needs_reload) { + reload_def(ctx, src->def, instr, instr->block); + } + + ra_spill_interval_root(interval)->cant_spill = false; +} + +static void +rewrite_src(struct ra_spill_ctx *ctx, struct ir3_instruction *instr, + struct ir3_register *src) +{ + struct ra_spill_interval *interval = ctx->intervals[src->def->name]; + + set_src_val(src, &interval->dst); +} + static void update_max_pressure(struct ra_spill_ctx *ctx) { @@ -240,12 +947,15 @@ update_max_pressure(struct ra_spill_ctx *ctx) static void handle_instr(struct ra_spill_ctx *ctx, struct ir3_instruction *instr) { - di(instr, "processing"); - ra_foreach_dst (dst, instr) { init_dst(ctx, dst); } + if (ctx->spilling) { + ra_foreach_src (src, instr) + insert_src(ctx, src); + } + /* Handle tied destinations. 
If a destination is tied to a source and that * source is live-through, then we need to allocate a new register for the * destination which is live-through itself and cannot overlap the @@ -258,7 +968,17 @@ handle_instr(struct ra_spill_ctx *ctx, struct ir3_instruction *instr) insert_dst(ctx, dst); } - update_max_pressure(ctx); + if (ctx->spilling) + limit(ctx, instr); + else + update_max_pressure(ctx); + + if (ctx->spilling) { + ra_foreach_src (src, instr) { + reload_src(ctx, instr, src); + update_src_next_use(ctx, src); + } + } ra_foreach_src (src, instr) { if (src->flags & IR3_REG_FIRST_KILL) @@ -269,13 +989,29 @@ handle_instr(struct ra_spill_ctx *ctx, struct ir3_instruction *instr) insert_dst(ctx, dst); } - update_max_pressure(ctx); + if (ctx->spilling) + limit(ctx, instr); + else + update_max_pressure(ctx); - for (unsigned i = 0; i < instr->srcs_count; i++) { - if (ra_reg_is_src(instr->srcs[i]) && - (instr->srcs[i]->flags & IR3_REG_FIRST_KILL)) - remove_src(ctx, instr, instr->srcs[i]); + /* We have to remove sources before rewriting them so that we can lookup the + * interval to remove before the source itself is changed. + */ + ra_foreach_src (src, instr) { + if (src->flags & IR3_REG_FIRST_KILL) + remove_src(ctx, instr, src); } + + if (ctx->spilling) { + ra_foreach_src (src, instr) { + rewrite_src(ctx, instr, src); + } + } + + ra_foreach_dst (dst, instr) { + finish_dst(ctx, dst); + } + for (unsigned i = 0; i < instr->dsts_count; i++) { if (ra_reg_is_dst(instr->dsts[i]) && (instr->dsts[i]->flags & IR3_REG_UNUSED)) @@ -283,28 +1019,672 @@ handle_instr(struct ra_spill_ctx *ctx, struct ir3_instruction *instr) } } +static struct ra_spill_interval * +create_temp_interval(struct ra_spill_ctx *ctx, struct ir3_register *def) +{ + unsigned name = ctx->intervals_count++; + unsigned offset = ctx->live->interval_offset; + + /* This is kinda hacky, but we need to create a fake SSA def here that is + * only used as part of the pcopy accounting. See below. + */ + struct ir3_register *reg = rzalloc(ctx, struct ir3_register); + *reg = *def; + reg->name = name; + reg->interval_start = offset; + reg->interval_end = offset + reg_size(def); + reg->merge_set = NULL; + + ctx->intervals = reralloc(ctx, ctx->intervals, struct ra_spill_interval *, + ctx->intervals_count); + struct ra_spill_interval *interval = rzalloc(ctx, struct ra_spill_interval); + ra_spill_interval_init(interval, reg); + ctx->intervals[name] = interval; + ctx->live->interval_offset += reg_size(def); + return interval; +} + +/* In the sequence of copies generated (see below), would this source be killed? + */ +static bool +is_last_pcopy_src(struct ir3_instruction *pcopy, unsigned src_n) +{ + struct ir3_register *src = pcopy->srcs[src_n]; + if (!(src->flags & IR3_REG_KILL)) + return false; + for (unsigned j = src_n + 1; j < pcopy->srcs_count; j++) { + if (pcopy->srcs[j]->def == src->def) + return false; + } + return true; +} + +/* Parallel copies are different from normal instructions. The sources together + * may be larger than the entire register file, so we cannot just reload every + * source like normal, and indeed that probably wouldn't be a great idea. + * Instead we essentially need to lower the parallel copy to "copies," just like + * in the normal CSSA construction, although we implement the copies by + * reloading and then possibly spilling values. We essentially just shuffle + * around the sources until each source either (a) is live or (b) has the same + * spill slot as its corresponding destination. 
We do this by decomposing the + * copy into a series of copies, so: + * + * a, b, c = d, e, f + * + * becomes: + * + * d' = d + * e' = e + * f' = f + * a = d' + * b = e' + * c = f' + * + * the temporary SSA values d', e', and f' never actually show up in the result. + * They are only used for our internal accounting. They may, however, have their + * own spill slot created for them. Similarly, we don't actually emit any copy + * instructions, although we emit the spills/reloads that *would've* been + * required if those copies were there. + * + * TODO: in order to reduce the number of temporaries and therefore spill slots, + * we could instead do a more complicated analysis that considers the location + * transfer graph. + * + * In addition, we actually remove the parallel copy and rewrite all its uses + * (in the phi nodes) rather than rewrite its sources at the end. Recreating it + * later turns out to be easier than keeping it up-to-date throughout this pass, + * since we may have to remove entries for phi sources that are spilled and add + * entries for live-outs that are spilled and reloaded, which can happen here + * and then possibly be undone or done again when processing live-ins of the + * successor block. + */ + +static void +handle_pcopy(struct ra_spill_ctx *ctx, struct ir3_instruction *pcopy) +{ + foreach_dst (dst, pcopy) { + struct ra_spill_interval *dst_interval = ctx->intervals[dst->name]; + ra_spill_interval_init(dst_interval, dst); + } + + foreach_src_n (src, i, pcopy) { + d("processing src %u", i); + struct ir3_register *dst = pcopy->dsts[i]; + + /* Skip the intermediate copy for cases where the source is merged with + * the destination. Crucially this means that we also don't reload/spill + * it if it's been spilled, because it shares the same spill slot. 
+ */ + if (src->def && src->def->merge_set && + src->def->merge_set == dst->merge_set && + src->def->merge_set_offset == dst->merge_set_offset) { + struct ra_spill_interval *src_interval = ctx->intervals[src->def->name]; + struct ra_spill_interval *dst_interval = ctx->intervals[dst->name]; + if (src_interval->interval.inserted) { + update_src_next_use(ctx, src); + if (is_last_pcopy_src(pcopy, i)) + ra_spill_ctx_remove(ctx, src_interval); + dst_interval->cant_spill = true; + ra_spill_ctx_insert(ctx, dst_interval); + limit(ctx, pcopy); + dst_interval->cant_spill = false; + dst_interval->dst = src_interval->dst; + } + } else if (src->def) { + struct ra_spill_interval *temp_interval = + create_temp_interval(ctx, dst); + struct ir3_register *temp = temp_interval->interval.reg; + temp_interval->next_use_distance = src->next_use; + + insert_src(ctx, src); + limit(ctx, pcopy); + reload_src(ctx, pcopy, src); + update_src_next_use(ctx, src); + if (is_last_pcopy_src(pcopy, i)) + remove_src(ctx, pcopy, src); + struct ra_spill_interval *src_interval = + ctx->intervals[src->def->name]; + temp_interval->dst = src_interval->dst; + + temp_interval->cant_spill = true; + ra_spill_ctx_insert(ctx, temp_interval); + limit(ctx, pcopy); + temp_interval->cant_spill = false; + + src->flags = temp->flags; + src->def = temp; + } + } + + d("done with pcopy srcs"); + + foreach_src_n (src, i, pcopy) { + struct ir3_register *dst = pcopy->dsts[i]; + + if (src->def && src->def->merge_set && + src->def->merge_set == dst->merge_set && + src->def->merge_set_offset == dst->merge_set_offset) + continue; + + struct ra_spill_interval *dst_interval = ctx->intervals[dst->name]; + + if (!src->def) { + dst_interval->cant_spill = true; + ra_spill_ctx_insert(ctx, dst_interval); + limit(ctx, pcopy); + dst_interval->cant_spill = false; + + assert(src->flags & (IR3_REG_CONST | IR3_REG_IMMED)); + if (src->flags & IR3_REG_CONST) { + dst_interval->dst.flags = src->flags; + dst_interval->dst.const_num = src->num; + } else { + dst_interval->dst.flags = src->flags; + dst_interval->dst.uimm = src->uim_val; + } + } else { + struct ra_spill_interval *temp_interval = ctx->intervals[src->def->name]; + + insert_src(ctx, src); + limit(ctx, pcopy); + reload_src(ctx, pcopy, src); + remove_src(ctx, pcopy, src); + + dst_interval->dst = temp_interval->dst; + ra_spill_ctx_insert(ctx, dst_interval); + } + } + + pcopy->flags |= IR3_INSTR_UNUSED; +} + static void handle_input_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *instr) { init_dst(ctx, instr->dsts[0]); insert_dst(ctx, instr->dsts[0]); + finish_dst(ctx, instr->dsts[0]); } static void remove_input_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *instr) { - ra_foreach_src (src, instr) - remove_src(ctx, instr, src); + if (instr->opc == OPC_META_TEX_PREFETCH) { + ra_foreach_src (src, instr) + remove_src(ctx, instr, src); + } if (instr->dsts[0]->flags & IR3_REG_UNUSED) remove_dst(ctx, instr->dsts[0]); } static void -handle_live_in(struct ra_spill_ctx *ctx, struct ir3_register *def) +handle_live_in(struct ra_spill_ctx *ctx, struct ir3_block *block, + struct ir3_register *def) { - struct ra_spill_interval *interval = &ctx->intervals[def->name]; + struct ra_spill_interval *interval = ctx->intervals[def->name]; ra_spill_interval_init(interval, def); - insert_dst(ctx, def); + if (ctx->spilling) { + interval->next_use_distance = + ctx->blocks[block->index].next_use_start[def->name]; + } + + ra_spill_ctx_insert(ctx, interval); +} + +static bool +is_live_in_phi(struct ir3_register *def, struct ir3_block 
*block) +{ + return def->instr->opc == OPC_META_PHI && def->instr->block == block; +} + +static bool +is_live_in_pred(struct ra_spill_ctx *ctx, struct ir3_register *def, + struct ir3_block *block, unsigned pred_idx) +{ + struct ir3_block *pred = block->predecessors[pred_idx]; + struct ra_spill_block_state *state = &ctx->blocks[pred->index]; + if (is_live_in_phi(def, block)) { + def = def->instr->srcs[pred_idx]->def; + if (!def) + return false; + } + + return _mesa_hash_table_search(state->remap, def); +} + +static bool +is_live_in_undef(struct ir3_register *def, + struct ir3_block *block, unsigned pred_idx) +{ + if (!is_live_in_phi(def, block)) + return false; + + return !def->instr->srcs[pred_idx]->def; +} + +static struct reg_or_immed * +read_live_in(struct ra_spill_ctx *ctx, struct ir3_register *def, + struct ir3_block *block, unsigned pred_idx) +{ + struct ir3_block *pred = block->predecessors[pred_idx]; + struct ra_spill_block_state *state = &ctx->blocks[pred->index]; + + if (is_live_in_phi(def, block)) { + def = def->instr->srcs[pred_idx]->def; + if (!def) + return NULL; + } + + struct hash_entry *entry = _mesa_hash_table_search(state->remap, def); + if (entry) + return entry->data; + else + return NULL; +} + +static bool +is_live_in_all_preds(struct ra_spill_ctx *ctx, struct ir3_register *def, + struct ir3_block *block) +{ + for (unsigned i = 0; i < block->predecessors_count; i++) { + if (!is_live_in_pred(ctx, def, block, i)) + return false; + } + + return true; +} + +static void +spill_live_in(struct ra_spill_ctx *ctx, struct ir3_register *def, + struct ir3_block *block) +{ + for (unsigned i = 0; i < block->predecessors_count; i++) { + struct ir3_block *pred = block->predecessors[i]; + struct ra_spill_block_state *state = &ctx->blocks[pred->index]; + + if (!state->visited) + continue; + + struct reg_or_immed *pred_def = read_live_in(ctx, def, block, i); + if (pred_def) { + spill(ctx, pred_def, get_spill_slot(ctx, def), NULL, pred); + } + } +} + +static void +spill_live_ins(struct ra_spill_ctx *ctx, struct ir3_block *block) +{ + bool all_preds_visited = true; + for (unsigned i = 0; i < block->predecessors_count; i++) { + struct ir3_block *pred = block->predecessors[i]; + struct ra_spill_block_state *state = &ctx->blocks[pred->index]; + if (!state->visited) { + all_preds_visited = false; + break; + } + } + + /* Note: in the paper they explicitly spill live-through values first, but we + * should be doing that automatically by virtue of picking the largest + * distance due to the extra distance added to edges out of loops. + * + * TODO: Keep track of pressure in each block and preemptively spill + * live-through values as described in the paper to avoid spilling them + * inside the loop. 
+ */ + + if (ctx->cur_pressure.half > ctx->limit_pressure.half) { + rb_tree_foreach_safe (struct ra_spill_interval, interval, + &ctx->half_live_intervals, half_node) { + if (all_preds_visited && + is_live_in_all_preds(ctx, interval->interval.reg, block)) + continue; + spill_live_in(ctx, interval->interval.reg, block); + ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval); + if (ctx->cur_pressure.half <= ctx->limit_pressure.half) + break; + } + } + + if (ctx->cur_pressure.full > ctx->limit_pressure.full) { + rb_tree_foreach_safe (struct ra_spill_interval, interval, + &ctx->full_live_intervals, node) { + if (all_preds_visited && + is_live_in_all_preds(ctx, interval->interval.reg, block)) + continue; + spill_live_in(ctx, interval->interval.reg, block); + ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval); + if (ctx->cur_pressure.full <= ctx->limit_pressure.full) + break; + } + } +} + +static void +live_in_rewrite(struct ra_spill_ctx *ctx, + struct ra_spill_interval *interval, + struct reg_or_immed *new_val, + struct ir3_block *block, unsigned pred_idx) +{ + struct ir3_block *pred = block->predecessors[pred_idx]; + struct ra_spill_block_state *state = &ctx->blocks[pred->index]; + struct ir3_register *def = interval->interval.reg; + if (is_live_in_phi(def, block)) { + def = def->instr->srcs[pred_idx]->def; + } + + if (def) + _mesa_hash_table_insert(state->remap, def, new_val); + + rb_tree_foreach (struct ra_spill_interval, child, + &interval->interval.children, interval.node) { + assert(new_val->flags & IR3_REG_SSA); + struct ir3_register *child_def = + extract(new_val->def, + (child->interval.reg->interval_start - def->interval_start) / + reg_elem_size(def), reg_elems(child->interval.reg), + NULL, pred); + struct reg_or_immed *child_val = ralloc(ctx, struct reg_or_immed); + child_val->def = child_def; + child_val->flags = child_def->flags; + live_in_rewrite(ctx, child, child_val, block, pred_idx); + } +} + +static void +reload_live_in(struct ra_spill_ctx *ctx, struct ir3_register *def, + struct ir3_block *block) +{ + struct ra_spill_interval *interval = ctx->intervals[def->name]; + for (unsigned i = 0; i < block->predecessors_count; i++) { + struct ir3_block *pred = block->predecessors[i]; + struct ra_spill_block_state *state = &ctx->blocks[pred->index]; + if (!state->visited) + continue; + + if (is_live_in_undef(def, block, i)) + continue; + + struct reg_or_immed *new_val = read_live_in(ctx, def, block, i); + + if (!new_val) { + new_val = ralloc(ctx, struct reg_or_immed); + new_val->def = reload(ctx, def, NULL, pred); + new_val->flags = new_val->def->flags; + } + live_in_rewrite(ctx, interval, new_val, block, i); + } +} + +static void +reload_live_ins(struct ra_spill_ctx *ctx, struct ir3_block *block) +{ + rb_tree_foreach (struct ra_spill_interval, interval, &ctx->reg_ctx.intervals, + interval.node) { + reload_live_in(ctx, interval->interval.reg, block); + } +} + +static void +add_live_in_phi(struct ra_spill_ctx *ctx, struct ir3_register *def, + struct ir3_block *block) +{ + struct ra_spill_interval *interval = ctx->intervals[def->name]; + if (!interval->interval.inserted) + return; + + bool needs_phi = false; + struct ir3_register *cur_def = NULL; + for (unsigned i = 0; i < block->predecessors_count; i++) { + struct ir3_block *pred = block->predecessors[i]; + struct ra_spill_block_state *state = &ctx->blocks[pred->index]; + + if (!state->visited) { + needs_phi = true; + break; + } + + struct hash_entry *entry = + _mesa_hash_table_search(state->remap, def); + 
assert(entry); + struct reg_or_immed *pred_val = entry->data; + if ((pred_val->flags & (IR3_REG_IMMED | IR3_REG_CONST)) || + !pred_val->def || + (cur_def && cur_def != pred_val->def)) { + needs_phi = true; + break; + } + cur_def = pred_val->def; + } + + if (!needs_phi) { + interval->dst.def = cur_def; + interval->dst.flags = cur_def->flags; + return; + } + + struct ir3_instruction *phi = + ir3_instr_create(block, OPC_META_PHI, 1, block->predecessors_count); + struct ir3_register *dst = __ssa_dst(phi); + dst->flags |= def->flags & (IR3_REG_HALF | IR3_REG_ARRAY); + dst->size = def->size; + dst->wrmask = def->wrmask; + + dst->interval_start = def->interval_start; + dst->interval_end = def->interval_end; + dst->merge_set = def->merge_set; + dst->merge_set_offset = def->merge_set_offset; + + for (unsigned i = 0; i < block->predecessors_count; i++) { + struct ir3_block *pred = block->predecessors[i]; + struct ra_spill_block_state *state = &ctx->blocks[pred->index]; + struct ir3_register *src = ir3_src_create(phi, INVALID_REG, dst->flags); + src->size = def->size; + src->wrmask = def->wrmask; + + if (state->visited) { + struct hash_entry *entry = + _mesa_hash_table_search(state->remap, def); + assert(entry); + struct reg_or_immed *new_val = entry->data; + set_src_val(src, new_val); + } else { + src->def = def; + } + } + + interval->dst.def = dst; + interval->dst.flags = dst->flags; + + ir3_instr_move_before_block(phi, block); +} + +/* When spilling a block with a single predecessors, the pred may have other + * successors so we can't choose what's live in and we can't spill/restore + * anything. Just make the inserted intervals exactly match the predecessor. If + * it wasn't live in the predecessor then it must've already been spilled. Also, + * there are no phi nodes and no live-ins. 
+ */ +static void +spill_single_pred_live_in(struct ra_spill_ctx *ctx, + struct ir3_block *block) +{ + unsigned name; + BITSET_FOREACH_SET (name, ctx->live->live_in[block->index], + ctx->live->definitions_count) { + struct ir3_register *reg = ctx->live->definitions[name]; + struct ra_spill_interval *interval = ctx->intervals[reg->name]; + struct reg_or_immed *val = read_live_in(ctx, reg, block, 0); + if (val) + interval->dst = *val; + else + ra_spill_ctx_remove(ctx, interval); + } +} + +static void +rewrite_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *phi, + struct ir3_block *block) +{ + if (!ctx->intervals[phi->dsts[0]->name]->interval.inserted) { + phi->flags |= IR3_INSTR_UNUSED; + return; + } + + for (unsigned i = 0; i < block->predecessors_count; i++) { + struct ir3_block *pred = block->predecessors[i]; + struct ra_spill_block_state *state = &ctx->blocks[pred->index]; + + if (!state->visited) + continue; + + struct ir3_register *src = phi->srcs[i]; + if (!src->def) + continue; + + struct hash_entry *entry = + _mesa_hash_table_search(state->remap, src->def); + assert(entry); + struct reg_or_immed *new_val = entry->data; + set_src_val(src, new_val); + } +} + +static void +spill_live_out(struct ra_spill_ctx *ctx, struct ra_spill_interval *interval, + struct ir3_block *block) +{ + struct ir3_register *def = interval->interval.reg; + + spill(ctx, &interval->dst, get_spill_slot(ctx, def), NULL, block); + ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval); +} + +static void +spill_live_outs(struct ra_spill_ctx *ctx, struct ir3_block *block) +{ + struct ra_spill_block_state *state = &ctx->blocks[block->index]; + rb_tree_foreach_safe (struct ra_spill_interval, interval, + &ctx->reg_ctx.intervals, interval.node) { + if (!BITSET_TEST(state->live_out, interval->interval.reg->name)) { + spill_live_out(ctx, interval, block); + } + } +} + +static void +reload_live_out(struct ra_spill_ctx *ctx, struct ir3_register *def, + struct ir3_block *block) +{ + struct ra_spill_interval *interval = ctx->intervals[def->name]; + ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval); + + reload_def(ctx, def, NULL, block); +} + +static void +reload_live_outs(struct ra_spill_ctx *ctx, struct ir3_block *block) +{ + struct ra_spill_block_state *state = &ctx->blocks[block->index]; + unsigned name; + BITSET_FOREACH_SET (name, state->live_out, ctx->live->definitions_count) { + struct ir3_register *reg = ctx->live->definitions[name]; + struct ra_spill_interval *interval = ctx->intervals[name]; + if (!interval->interval.inserted) + reload_live_out(ctx, reg, block); + } +} + +static void +update_live_out_phis(struct ra_spill_ctx *ctx, struct ir3_block *block) +{ + assert(!block->successors[1]); + struct ir3_block *succ = block->successors[0]; + unsigned pred_idx = ir3_block_get_pred_index(succ, block); + + foreach_instr (instr, &succ->instr_list) { + if (instr->opc != OPC_META_PHI) + break; + + struct ir3_register *def = instr->srcs[pred_idx]->def; + if (!def) + continue; + + struct ra_spill_interval *interval = ctx->intervals[def->name]; + if (!interval->interval.inserted) + continue; + set_src_val(instr->srcs[pred_idx], &interval->dst); + } +} + +static void +record_pred_live_out(struct ra_spill_ctx *ctx, + struct ra_spill_interval *interval, + struct ir3_block *block, unsigned pred_idx) +{ + struct ir3_block *pred = block->predecessors[pred_idx]; + struct ra_spill_block_state *state = &ctx->blocks[pred->index]; + + struct ir3_register *def = interval->interval.reg; + if (is_live_in_phi(def, 
block)) { + def = def->instr->srcs[pred_idx]->def; + } + BITSET_SET(state->live_out, def->name); + + rb_tree_foreach (struct ra_spill_interval, child, + &interval->interval.children, interval.node) { + record_pred_live_out(ctx, child, block, pred_idx); + } +} + +static void +record_pred_live_outs(struct ra_spill_ctx *ctx, struct ir3_block *block) +{ + for (unsigned i = 0; i < block->predecessors_count; i++) { + struct ir3_block *pred = block->predecessors[i]; + struct ra_spill_block_state *state = &ctx->blocks[pred->index]; + if (state->visited) + continue; + + state->live_out = rzalloc_array(ctx, BITSET_WORD, + BITSET_WORDS(ctx->live->definitions_count)); + + + rb_tree_foreach (struct ra_spill_interval, interval, + &ctx->reg_ctx.intervals, interval.node) { + record_pred_live_out(ctx, interval, block, i); + } + } +} + +static void +record_live_out(struct ra_spill_ctx *ctx, + struct ra_spill_block_state *state, + struct ra_spill_interval *interval) +{ + if (!(interval->dst.flags & IR3_REG_SSA) || + interval->dst.def) { + struct reg_or_immed *val = ralloc(ctx, struct reg_or_immed); + *val = interval->dst; + _mesa_hash_table_insert(state->remap, interval->interval.reg, val); + } + rb_tree_foreach (struct ra_spill_interval, child, + &interval->interval.children, interval.node) { + record_live_out(ctx, state, child); + } +} + +static void +record_live_outs(struct ra_spill_ctx *ctx, struct ir3_block *block) +{ + struct ra_spill_block_state *state = &ctx->blocks[block->index]; + state->remap = _mesa_pointer_hash_table_create(ctx); + + rb_tree_foreach (struct ra_spill_interval, interval, &ctx->reg_ctx.intervals, + interval.node) { + record_live_out(ctx, state, interval); + } } static void @@ -312,12 +1692,14 @@ handle_block(struct ra_spill_ctx *ctx, struct ir3_block *block) { memset(&ctx->cur_pressure, 0, sizeof(ctx->cur_pressure)); rb_tree_init(&ctx->reg_ctx.intervals); + rb_tree_init(&ctx->full_live_intervals); + rb_tree_init(&ctx->half_live_intervals); unsigned name; BITSET_FOREACH_SET (name, ctx->live->live_in[block->index], ctx->live->definitions_count) { struct ir3_register *reg = ctx->live->definitions[name]; - handle_live_in(ctx, reg); + handle_live_in(ctx, block, reg); } foreach_instr (instr, &block->instr_list) { @@ -327,36 +1709,297 @@ handle_block(struct ra_spill_ctx *ctx, struct ir3_block *block) handle_input_phi(ctx, instr); } - update_max_pressure(ctx); + if (ctx->spilling) { + if (block->predecessors_count == 1) { + spill_single_pred_live_in(ctx, block); + } else { + spill_live_ins(ctx, block); + reload_live_ins(ctx, block); + record_pred_live_outs(ctx, block); + foreach_instr (instr, &block->instr_list) { + if (instr->opc != OPC_META_PHI) + break; + rewrite_phi(ctx, instr, block); + } + BITSET_FOREACH_SET (name, ctx->live->live_in[block->index], + ctx->live->definitions_count) { + struct ir3_register *reg = ctx->live->definitions[name]; + add_live_in_phi(ctx, reg, block); + } + } + } else { + update_max_pressure(ctx); + } foreach_instr (instr, &block->instr_list) { + di(instr, "processing"); + if (instr->opc == OPC_META_PHI || instr->opc == OPC_META_INPUT || instr->opc == OPC_META_TEX_PREFETCH) remove_input_phi(ctx, instr); + else if (ctx->spilling && instr->opc == OPC_META_PARALLEL_COPY) + handle_pcopy(ctx, instr); + else if (ctx->spilling && instr->opc == OPC_MOV && + instr->dsts[0] == ctx->base_reg) + /* skip */; else handle_instr(ctx, instr); } + + if (ctx->spilling && block->successors[0]) { + struct ra_spill_block_state *state = + &ctx->blocks[block->successors[0]->index]; + 
if (state->visited) { + assert(!block->successors[1]); + + spill_live_outs(ctx, block); + reload_live_outs(ctx, block); + update_live_out_phis(ctx, block); + } + } + + if (ctx->spilling) { + record_live_outs(ctx, block); + ctx->blocks[block->index].visited = true; + } +} + +static bool +simplify_phi_node(struct ir3_instruction *phi) +{ + struct ir3_register *def = NULL; + foreach_src (src, phi) { + /* Ignore phi sources which point to the phi itself. */ + if (src->def == phi->dsts[0]) + continue; + /* If it's undef or it doesn't match the previous sources, bail */ + if (!src->def || (def && def != src->def)) + return false; + def = src->def; + } + + phi->data = def; + phi->flags |= IR3_INSTR_UNUSED; + return true; +} + +static void +simplify_phi_srcs(struct ir3_instruction *instr) +{ + foreach_src (src, instr) { + if (src->def && src->def->instr->opc == OPC_META_PHI) { + struct ir3_instruction *phi = src->def->instr; + if (phi->data) + src->def = phi->data; + } + } +} + +/* We insert phi nodes for all live-ins of loops in case we need to split the + * live range. This pass cleans that up for the case where the live range didn't + * actually need to be split. + */ +static void +simplify_phi_nodes(struct ir3 *ir) +{ + foreach_block (block, &ir->block_list) { + foreach_instr (instr, &block->instr_list) { + if (instr->opc != OPC_META_PHI) + break; + instr->data = NULL; + } + } + + bool progress; + do { + progress = false; + foreach_block (block, &ir->block_list) { + foreach_instr (instr, &block->instr_list) { + if (instr->opc == OPC_META_PHI || (instr->flags & IR3_INSTR_UNUSED)) + continue; + + simplify_phi_srcs(instr); + } + + for (unsigned i = 0; i < 2; i++) { + struct ir3_block *succ = block->successors[i]; + if (!succ) + continue; + foreach_instr (instr, &succ->instr_list) { + if (instr->opc != OPC_META_PHI) + break; + if (instr->flags & IR3_INSTR_UNUSED) + continue; + + simplify_phi_srcs(instr); + progress |= simplify_phi_node(instr); + } + } + } + } while (progress); +} + +static void +unmark_dead(struct ir3 *ir) +{ + foreach_block (block, &ir->block_list) { + foreach_instr (instr, &block->instr_list) { + instr->flags &= ~IR3_INSTR_UNUSED; + } + } +} + +/* Simple pass to remove now-dead phi nodes and pcopy instructions. We mark + * which ones are dead along the way, so there's nothing to compute here. + */ +static void +cleanup_dead(struct ir3 *ir) +{ + foreach_block (block, &ir->block_list) { + foreach_instr_safe (instr, &block->instr_list) { + if (instr->flags & IR3_INSTR_UNUSED) + list_delinit(&instr->node); + } + } +} + +/* Deal with merge sets after spilling. Spilling generally leaves the merge sets + * in a mess, and even if we properly cleaned up after ourselves, we would want + * to recompute the merge sets afterward anway. That's because + * spilling/reloading can "break up" phi webs and split/collect webs so that + * allocating them to the same register no longer gives any benefit. For + * example, imagine we have this: + * + * if (...) { + * foo = ... + * } else { + * bar = ... + * } + * baz = phi(foo, bar) + * + * and we spill "baz": + * + * if (...) { + * foo = ... + * spill(foo) + * } else { + * bar = ... + * spill(bar) + * } + * baz = reload() + * + * now foo, bar, and baz don't have to be allocated to the same register. How + * exactly the merge sets change can be complicated, so it's easier just to + * recompute them. + * + * However, there's a wrinkle in this: those same merge sets determine the + * register pressure, due to multiple values inhabiting the same register! 
And + * we assume that this sharing happens when spilling. Therefore we need a + * three-step procedure: + * + * 1. Drop the original merge sets. + * 2. Calculate which values *must* be merged, being careful to only use the + * interval information which isn't trashed by spilling, and forcibly merge + * them. + * 3. Let ir3_merge_regs() finish the job, including recalculating the + * intervals. + */ + +static void +fixup_merge_sets(struct ir3_liveness *live, struct ir3 *ir) +{ + foreach_block (block, &ir->block_list) { + foreach_instr (instr, &block->instr_list) { + ra_foreach_dst (dst, instr) { + dst->merge_set = NULL; + dst->merge_set_offset = 0; + } + } + } + + foreach_block (block, &ir->block_list) { + foreach_instr (instr, &block->instr_list) { + if (instr->opc != OPC_META_SPLIT && + instr->opc != OPC_META_COLLECT) + continue; + + struct ir3_register *dst = instr->dsts[0]; + ra_foreach_src (src, instr) { + if (!(src->flags & IR3_REG_KILL) && + src->def->interval_start < dst->interval_end && + dst->interval_start < src->def->interval_end) { + ir3_force_merge(dst, src->def, + src->def->interval_start - dst->interval_start); + } + } + } + } + + ir3_merge_regs(live, ir); } void ir3_calc_pressure(struct ir3_shader_variant *v, struct ir3_liveness *live, struct ir3_pressure *max_pressure) { - struct ra_spill_ctx ctx = {}; - ctx.live = live; - ctx.intervals = calloc(live->definitions_count, sizeof(*ctx.intervals)); - ctx.compiler = v->shader->compiler; - spill_ctx_init(&ctx); + struct ra_spill_ctx *ctx = rzalloc(NULL, struct ra_spill_ctx); + spill_ctx_init(ctx, v, live); foreach_block (block, &v->ir->block_list) { - handle_block(&ctx, block); + handle_block(ctx, block); } - assert(ctx.cur_pressure.full == 0); - assert(ctx.cur_pressure.half == 0); - assert(ctx.cur_pressure.shared == 0); + assert(ctx->cur_pressure.full == 0); + assert(ctx->cur_pressure.half == 0); + assert(ctx->cur_pressure.shared == 0); - free(ctx.intervals); - - *max_pressure = ctx.max_pressure; + *max_pressure = ctx->max_pressure; + ralloc_free(ctx); +} + +bool +ir3_spill(struct ir3 *ir, struct ir3_shader_variant *v, + struct ir3_liveness **live, + const struct ir3_pressure *limit_pressure) +{ + struct ra_spill_ctx *ctx = rzalloc(NULL, struct ra_spill_ctx); + spill_ctx_init(ctx, v, *live); + + ctx->spilling = true; + + ctx->blocks = rzalloc_array(ctx, struct ra_spill_block_state, + ctx->live->block_count); + rb_tree_init(&ctx->full_live_intervals); + rb_tree_init(&ctx->half_live_intervals); + + ctx->limit_pressure = *limit_pressure; + ctx->spill_slot = v->pvtmem_size; + + add_base_reg(ctx, ir); + compute_next_distance(ctx, ir); + + unmark_dead(ir); + + foreach_block (block, &ir->block_list) { + handle_block(ctx, block); + } + + simplify_phi_nodes(ir); + + cleanup_dead(ir); + + ir3_create_parallel_copies(ir); + + /* After this point, we're done mutating the IR. Liveness has been trashed, + * so recalculate it. We'll need it for recalculating the merge sets. 
+ */ + ralloc_free(ctx->live); + *live = ir3_calc_liveness(v); + + fixup_merge_sets(*live, ir); + + v->pvtmem_size = ctx->spill_slot; + ralloc_free(ctx); + + return true; } diff --git a/src/freedreno/ir3/ir3_validate.c b/src/freedreno/ir3/ir3_validate.c index 4fe56b45c9c..08f2df4251a 100644 --- a/src/freedreno/ir3/ir3_validate.c +++ b/src/freedreno/ir3/ir3_validate.c @@ -187,7 +187,7 @@ validate_instr(struct ir3_validate_ctx *ctx, struct ir3_instruction *instr) /* end/chmask/etc are allowed to have different size sources */ } else if (instr->opc == OPC_META_PARALLEL_COPY) { /* pcopy sources have to match with their destination but can have - * different size. + * different sizes from each other. */ } else if (n > 0) { validate_assert(ctx, (last_reg->flags & IR3_REG_HALF) == @@ -303,6 +303,7 @@ validate_instr(struct ir3_validate_ctx *ctx, struct ir3_instruction *instr) case OPC_STL: case OPC_STP: case OPC_STLW: + case OPC_SPILL_MACRO: validate_assert(ctx, !(instr->srcs[0]->flags & IR3_REG_HALF)); validate_reg_size(ctx, instr->srcs[1], instr->cat6.type); validate_assert(ctx, !(instr->srcs[2]->flags & IR3_REG_HALF)); diff --git a/src/freedreno/ir3/meson.build b/src/freedreno/ir3/meson.build index 40bdb26194d..0456bc59253 100644 --- a/src/freedreno/ir3/meson.build +++ b/src/freedreno/ir3/meson.build @@ -88,6 +88,7 @@ libfreedreno_ir3_files = files( 'ir3_legalize.c', 'ir3_liveness.c', 'ir3_lower_parallelcopy.c', + 'ir3_lower_spill.c', 'ir3_lower_subgroups.c', 'ir3_merge_regs.c', 'ir3_nir.c', diff --git a/src/gallium/drivers/freedreno/ci/piglit-freedreno-a630-fails.txt b/src/gallium/drivers/freedreno/ci/piglit-freedreno-a630-fails.txt index 61c1a5b0cda..d311f70ae3b 100644 --- a/src/gallium/drivers/freedreno/ci/piglit-freedreno-a630-fails.txt +++ b/src/gallium/drivers/freedreno/ci/piglit-freedreno-a630-fails.txt @@ -345,7 +345,6 @@ spec@glsl-1.50@execution@compatibility@vs-gs-ff-frag,Crash spec@glsl-1.50@execution@compatibility@vs-gs-texcoord-array-2,Crash spec@glsl-1.50@execution@compatibility@vs-gs-texcoord-array,Crash spec@glsl-1.50@execution@geometry@end-primitive 0,Fail -spec@glsl-1.50@execution@geometry@max-input-components,Fail spec@glsl-1.50@execution@geometry@primitive-id-restart gl_line_loop ffs,Fail spec@glsl-1.50@execution@geometry@primitive-id-restart gl_line_loop other,Fail spec@glsl-1.50@execution@geometry@primitive-id-restart gl_lines_adjacency ffs,Fail @@ -385,11 +384,7 @@ spec@glsl-1.50@execution@geometry@tri-strip-ordering-with-prim-restart gl_triang spec@glsl-1.50@execution@geometry@tri-strip-ordering-with-prim-restart gl_triangle_strip other,Fail spec@glsl-1.50@execution@primitive-id-no-gs-quads,Fail spec@glsl-1.50@execution@primitive-id-no-gs-quad-strip,Fail -spec@glsl-1.50@execution@variable-indexing@gs-input-array-vec2-index-rd,Fail -spec@glsl-1.50@execution@variable-indexing@gs-input-array-vec3-index-rd,Fail -spec@glsl-1.50@execution@variable-indexing@gs-output-array-vec3-index-wr,Fail spec@glsl-1.50@execution@variable-indexing@gs-output-array-vec4-index-wr,Crash -spec@glsl-1.50@execution@variable-indexing@vs-output-array-vec4-index-wr-before-gs,Fail spec@glsl-1.50@gs-max-output-components,Fail spec@intel_performance_query@intel_performance_query-issue_2235,Fail spec@khr_texture_compression_astc@array-gl@12x12 Block Dim,Fail
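
Illustrative note: the limit() function added above follows the furthest-next-use ("Belady") policy that SSA-based spillers use, which matches the "similar to 'limit' in the paper" comment in the diff: every live value carries the distance to its next use, and when pressure exceeds the budget, values are evicted until the pressure fits, emitting a spill store only for values that have not already been spilled. The standalone C sketch below shows that step in isolation; the struct, the function names, and the pre-sorted candidate array are assumptions made for the illustration and are not the ir3 data structures.

#include <stdbool.h>

struct live_value {
   unsigned size;              /* register-file units occupied by the value */
   unsigned next_use_distance; /* instructions until the value is used next */
   bool cant_spill;            /* e.g. it is a source of the current instr */
   bool spilled;               /* a spill store was already emitted for it */
};

/* Evict values until `pressure` fits within `limit`.  `vals` is assumed to
 * be sorted so that the best eviction candidate (furthest next use) comes
 * first.  Returns the resulting pressure.
 */
static unsigned
limit_pressure(struct live_value *vals, unsigned count,
               unsigned pressure, unsigned limit,
               void (*emit_spill)(struct live_value *))
{
   for (unsigned i = 0; i < count && pressure > limit; i++) {
      struct live_value *v = &vals[i];
      if (v->cant_spill)
         continue;               /* needed by the current instruction */
      if (!v->spilled) {
         emit_spill(v);          /* store it to its spill slot */
         v->spilled = true;
      }
      pressure -= v->size;       /* no longer register-resident */
   }
   return pressure;
}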
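
Illustrative note: get_spill_slot() above assigns each spilled value (or its whole merge set) a stable offset in the shader's private memory using an aligned bump allocator: the first spill claims the next suitably aligned offset and advances the cursor, and later spills or reloads of the same value reuse that offset. A minimal sketch of the pattern follows; the names and the byte-sized units are assumptions for clarity, not the ir3 encoding.

#include <stdint.h>

/* align must be a power of two */
#define ALIGN_POT(x, align) (((x) + (align) - 1) & ~((align) - 1))

struct slot_alloc {
   uint32_t cursor; /* next free byte offset in the spill area */
};

/* Return the slot for a value, assigning one on first use.  `*slot` starts
 * out as UINT32_MAX ("not yet assigned"), so a value spilled on several
 * paths keeps a single, consistent offset.
 */
static uint32_t
get_slot(struct slot_alloc *a, uint32_t *slot,
         uint32_t size_bytes, uint32_t align_bytes)
{
   if (*slot == UINT32_MAX) {
      *slot = ALIGN_POT(a->cursor, align_bytes);
      a->cursor = *slot + size_bytes;
   }
   return *slot;
}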
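
Illustrative note: simplify_phi_nodes() above cleans up the phis that were inserted for every loop live-in but turned out to be unnecessary, using the standard trivial-phi test: a phi whose sources, ignoring self-references, all resolve to the same non-undef definition can be replaced by that definition, and the test is iterated to a fixed point because removing one phi can make another trivial. The sketch below shows only the test, with invented types; it is not the ir3 representation.

struct value;

struct phi {
   struct value  *dst;      /* the value the phi defines */
   struct value **srcs;     /* one incoming value per predecessor, NULL = undef */
   unsigned       num_srcs;
};

/* Return the unique incoming definition if the phi is trivial, or NULL if
 * the phi has to stay.
 */
static struct value *
trivial_phi_value(const struct phi *phi)
{
   struct value *unique = NULL;
   for (unsigned i = 0; i < phi->num_srcs; i++) {
      struct value *src = phi->srcs[i];
      if (src == phi->dst)
         continue;            /* self-reference through a back edge */
      if (!src)
         return NULL;         /* undef source: keep the phi */
      if (unique && unique != src)
         return NULL;         /* two distinct incoming values */
      unique = src;
   }
   return unique;
}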