From 4861835d1cc07e5068694905b5a3538303f6de32 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke
Date: Tue, 3 Nov 2015 12:51:32 -0800
Subject: [PATCH 001/287] i965: Fix the fs_visitor GS constructor to take
shader_time_index.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Jason reworked this so it isn't simply ST_GS anymore...it's either -1
(not enabled) or an actual offset.
Signed-off-by: Kenneth Graunke
Reviewed-by: Kristian Høgsberg
---
src/mesa/drivers/dri/i965/brw_fs.h | 3 ++-
src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 5 +++--
2 files changed, 5 insertions(+), 3 deletions(-)
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 8058b344b7a..caf56555981 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -105,7 +105,8 @@ public:
void *mem_ctx,
struct brw_gs_compile *gs_compile,
struct brw_gs_prog_data *prog_data,
- const nir_shader *shader);
+ const nir_shader *shader,
+ int shader_time_index);
void init();
~fs_visitor();
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 5c57944ca39..b6d1c3b6d4a 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -1112,13 +1112,14 @@ fs_visitor::fs_visitor(const struct brw_compiler *compiler, void *log_data,
void *mem_ctx,
struct brw_gs_compile *c,
struct brw_gs_prog_data *prog_data,
- const nir_shader *shader)
+ const nir_shader *shader,
+ int shader_time_index)
: backend_shader(compiler, log_data, mem_ctx, shader,
&prog_data->base.base),
key(&c->key), gs_compile(c),
prog_data(&prog_data->base.base), prog(NULL),
dispatch_width(8),
- shader_time_index(ST_GS),
+ shader_time_index(shader_time_index),
bld(fs_builder(this, dispatch_width).at_end())
{
init();
From c9541a74e4d179ad844bdf8af1e3de541c5b14c2 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke
Date: Wed, 23 Sep 2015 20:52:19 -0700
Subject: [PATCH 002/287] i965: Add scalar GS input lowering code.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
We really ought to compute the VUE map at link time and stash it, rather
than recomputing it here, but with the mess of program structures I
wasn't sure where to put it. We can improve that later.
Signed-off-by: Kenneth Graunke
Reviewed-by: Kristian Høgsberg
---
src/mesa/drivers/dri/i965/brw_nir.c | 44 +++++++++++++++++++++++++----
1 file changed, 39 insertions(+), 5 deletions(-)
diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c
index 11f111382f4..a7a5eb511cd 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.c
+++ b/src/mesa/drivers/dri/i965/brw_nir.c
@@ -56,7 +56,8 @@ remap_vs_attrs(nir_block *block, void *closure)
}
static void
-brw_nir_lower_inputs(nir_shader *nir, bool is_scalar)
+brw_nir_lower_inputs(const struct brw_device_info *devinfo,
+ nir_shader *nir, bool is_scalar)
{
switch (nir->stage) {
case MESA_SHADER_VERTEX:
@@ -90,11 +91,43 @@ brw_nir_lower_inputs(nir_shader *nir, bool is_scalar)
}
}
break;
- case MESA_SHADER_GEOMETRY:
- foreach_list_typed(nir_variable, var, node, &nir->inputs) {
- var->data.driver_location = var->data.location;
+ case MESA_SHADER_GEOMETRY: {
+ if (!is_scalar) {
+ foreach_list_typed(nir_variable, var, node, &nir->inputs) {
+ var->data.driver_location = var->data.location;
+ }
+ } else {
+ /* The GLSL linker will have already matched up GS inputs and
+ * the outputs of prior stages. The driver does extend VS outputs
+ * in some cases, but only for legacy OpenGL or Gen4-5 hardware,
+ * neither of which offer geometry shader support. So we can
+ * safely ignore that.
+ *
+ * For SSO pipelines, we use a fixed VUE map layout based on variable
+ * locations, so we can rely on rendezvous-by-location to make this
+ * work.
+ *
+ * However, we need to ignore VARYING_SLOT_PRIMITIVE_ID, as it's not
+ * written by previous stages and shows up via payload magic.
+ */
+ struct brw_vue_map input_vue_map;
+ GLbitfield64 inputs_read =
+ nir->info.inputs_read & ~VARYING_BIT_PRIMITIVE_ID;
+ brw_compute_vue_map(devinfo, &input_vue_map, inputs_read,
+ nir->info.separate_shader);
+
+ /* Start with the slot for the variable's base. */
+ foreach_list_typed(nir_variable, var, node, &nir->inputs) {
+ assert(input_vue_map.varying_to_slot[var->data.location] != -1);
+ var->data.driver_location =
+ input_vue_map.varying_to_slot[var->data.location];
+ }
+
+ /* Inputs are stored in vec4 slots, so use type_size_vec4(). */
+ nir_lower_io(nir, nir_var_shader_in, type_size_vec4);
}
break;
+ }
case MESA_SHADER_FRAGMENT:
assert(is_scalar);
nir_assign_var_locations(&nir->inputs, &nir->num_inputs,
@@ -187,6 +220,7 @@ brw_create_nir(struct brw_context *brw,
bool is_scalar)
{
struct gl_context *ctx = &brw->ctx;
+ const struct brw_device_info *devinfo = brw->intelScreen->devinfo;
const nir_shader_compiler_options *options =
ctx->Const.ShaderCompilerOptions[stage].NirOptions;
static const nir_lower_tex_options tex_options = {
@@ -230,7 +264,7 @@ brw_create_nir(struct brw_context *brw,
/* Get rid of split copies */
nir_optimize(nir, is_scalar);
- brw_nir_lower_inputs(nir, is_scalar);
+ brw_nir_lower_inputs(devinfo, nir, is_scalar);
brw_nir_lower_outputs(nir, is_scalar);
nir_assign_var_locations(&nir->uniforms,
&nir->num_uniforms,
From 36fd65381756ed1b8f774f7fcdd555941a3d39e1 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke
Date: Wed, 11 Mar 2015 23:14:31 -0700
Subject: [PATCH 003/287] i965: Add scalar geometry shader support.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
This is hidden behind INTEL_SCALAR_GS=1 for now, as we don't yet support
instanced geometry shaders, and Orbital Explorer's shader spills like
crazy. But the infrastructure is in place, and it's largely working.
v2: Lots of rebasing.
v3: (feedback from Kristian Høgsberg)
- Handle stride and subreg_offset correctly for ATTRs; use a helper.
- Fix missing emit_shader_time_end() call.
- Delete dead code after early EOT in static vertex case to avoid
tripping asserts in emit_shader_time_end().
- Use proper D/UD type in intexp2().
- Fix "EndPrimitve" and "to that" typos.
- Assert that invocations == 1 so we know this is missing.
Signed-off-by: Kenneth Graunke
Reviewed-by: Kristian Høgsberg
---
src/mesa/drivers/dri/i965/brw_fs.cpp | 210 +++++++++-
src/mesa/drivers/dri/i965/brw_fs.h | 17 +-
src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 391 ++++++++++++++++++
src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 49 ++-
.../drivers/dri/i965/brw_vec4_gs_visitor.cpp | 25 ++
5 files changed, 667 insertions(+), 25 deletions(-)
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 5ab8c15bc0c..4cc962613b3 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -43,6 +43,7 @@
#include "brw_wm.h"
#include "brw_fs.h"
#include "brw_cs.h"
+#include "brw_vec4_gs_visitor.h"
#include "brw_cfg.h"
#include "brw_dead_control_flow.h"
#include "main/uniforms.h"
@@ -1360,6 +1361,57 @@ fs_visitor::emit_discard_jump()
discard_jump->predicate_inverse = true;
}
+void
+fs_visitor::emit_gs_thread_end()
+{
+ assert(stage == MESA_SHADER_GEOMETRY);
+
+ struct brw_gs_prog_data *gs_prog_data =
+ (struct brw_gs_prog_data *) prog_data;
+
+ if (gs_compile->control_data_header_size_bits > 0) {
+ emit_gs_control_data_bits(this->final_gs_vertex_count);
+ }
+
+ const fs_builder abld = bld.annotate("thread end");
+ fs_inst *inst;
+
+ if (gs_prog_data->static_vertex_count != -1) {
+ foreach_in_list_reverse(fs_inst, prev, &this->instructions) {
+ if (prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8 ||
+ prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED ||
+ prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT ||
+ prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT) {
+ prev->eot = true;
+
+ /* Delete now dead instructions. */
+ foreach_in_list_reverse_safe(exec_node, dead, &this->instructions) {
+ if (dead == prev)
+ break;
+ dead->remove();
+ }
+ return;
+ } else if (prev->is_control_flow() || prev->has_side_effects()) {
+ break;
+ }
+ }
+ fs_reg hdr = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+ abld.MOV(hdr, fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)));
+ inst = abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, hdr);
+ inst->mlen = 1;
+ } else {
+ fs_reg payload = abld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+ fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 2);
+ sources[0] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
+ sources[1] = this->final_gs_vertex_count;
+ abld.LOAD_PAYLOAD(payload, sources, 2, 2);
+ inst = abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
+ inst->mlen = 2;
+ }
+ inst->eot = true;
+ inst->offset = 0;
+}
+
void
fs_visitor::assign_curb_setup()
{
@@ -1531,6 +1583,26 @@ fs_visitor::assign_urb_setup()
this->first_non_payload_grf += prog_data->num_varying_inputs * 2;
}
+void
+fs_visitor::convert_attr_sources_to_hw_regs(fs_inst *inst)
+{
+ for (int i = 0; i < inst->sources; i++) {
+ if (inst->src[i].file == ATTR) {
+ int grf = payload.num_regs +
+ prog_data->curb_read_length +
+ inst->src[i].reg +
+ inst->src[i].reg_offset;
+
+ inst->src[i].file = HW_REG;
+ inst->src[i].fixed_hw_reg =
+ stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
+ inst->src[i].subreg_offset),
+ inst->exec_size * inst->src[i].stride,
+ inst->exec_size, inst->src[i].stride);
+ }
+ }
+}
+
void
fs_visitor::assign_vs_urb_setup()
{
@@ -1548,24 +1620,44 @@ fs_visitor::assign_vs_urb_setup()
/* Rewrite all ATTR file references to the hw grf that they land in. */
foreach_block_and_inst(block, fs_inst, inst, cfg) {
- for (int i = 0; i < inst->sources; i++) {
- if (inst->src[i].file == ATTR) {
- int grf = payload.num_regs +
- prog_data->curb_read_length +
- inst->src[i].reg +
- inst->src[i].reg_offset;
-
- inst->src[i].file = HW_REG;
- inst->src[i].fixed_hw_reg =
- stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
- inst->src[i].subreg_offset),
- inst->exec_size * inst->src[i].stride,
- inst->exec_size, inst->src[i].stride);
- }
- }
+ convert_attr_sources_to_hw_regs(inst);
}
}
+void
+fs_visitor::assign_gs_urb_setup()
+{
+ assert(stage == MESA_SHADER_GEOMETRY);
+
+ brw_vue_prog_data *vue_prog_data = (brw_vue_prog_data *) prog_data;
+
+ first_non_payload_grf +=
+ 8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in;
+
+ const unsigned first_icp_handle = payload.num_regs -
+ (vue_prog_data->include_vue_handles ? nir->info.gs.vertices_in : 0);
+
+ foreach_block_and_inst(block, fs_inst, inst, cfg) {
+ /* Lower URB_READ_SIMD8 opcodes into real messages. */
+ if (inst->opcode == SHADER_OPCODE_URB_READ_SIMD8) {
+ assert(inst->src[0].file == IMM);
+ inst->src[0] = retype(brw_vec8_grf(first_icp_handle +
+ inst->src[0].fixed_hw_reg.dw1.ud,
+ 0), BRW_REGISTER_TYPE_UD);
+ /* for now, assume constant - we can do per-slot offsets later */
+ assert(inst->src[1].file == IMM);
+ inst->offset = inst->src[1].fixed_hw_reg.dw1.ud;
+ inst->src[1] = fs_reg();
+ inst->mlen = 1;
+ inst->base_mrf = -1;
+ }
+
+ /* Rewrite all ATTR file references to HW_REGs. */
+ convert_attr_sources_to_hw_regs(inst);
+ }
+}
+
+
/**
* Split large virtual GRFs into separate components if we can.
*
@@ -4762,6 +4854,45 @@ fs_visitor::setup_vs_payload()
* conveying the data, and thereby reduce push constant usage.
*
*/
+void
+fs_visitor::setup_gs_payload()
+{
+ assert(stage == MESA_SHADER_GEOMETRY);
+
+ struct brw_gs_prog_data *gs_prog_data =
+ (struct brw_gs_prog_data *) prog_data;
+ struct brw_vue_prog_data *vue_prog_data =
+ (struct brw_vue_prog_data *) prog_data;
+
+ /* R0: thread header, R1: output URB handles */
+ payload.num_regs = 2;
+
+ if (gs_prog_data->include_primitive_id) {
+ /* R2: Primitive ID 0..7 */
+ payload.num_regs++;
+ }
+
+ /* Use a maximum of 32 registers for push-model inputs. */
+ const unsigned max_push_components = 32;
+
+ /* If pushing our inputs would take too many registers, reduce the URB read
+ * length (which is in HWords, or 8 registers), and resort to pulling.
+ *
+ * Note that the GS reads HWords for every vertex - so we
+ * have to multiply by VerticesIn to obtain the total storage requirement.
+ */
+ if (8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in >
+ max_push_components) {
+ gs_prog_data->base.include_vue_handles = true;
+
+ /* R3..RN: ICP Handles for each incoming vertex (when using pull model) */
+ payload.num_regs += nir->info.gs.vertices_in;
+
+ vue_prog_data->urb_read_length =
+ ROUND_DOWN_TO(max_push_components / nir->info.gs.vertices_in, 8) / 8;
+ }
+}
+
void
fs_visitor::setup_cs_payload()
{
@@ -5018,6 +5149,55 @@ fs_visitor::run_vs(gl_clip_plane *clip_planes)
return !failed;
}
+bool
+fs_visitor::run_gs()
+{
+ assert(stage == MESA_SHADER_GEOMETRY);
+
+ setup_gs_payload();
+
+ this->final_gs_vertex_count = vgrf(glsl_type::uint_type);
+
+ if (gs_compile->control_data_header_size_bits > 0) {
+ /* Create a VGRF to store accumulated control data bits. */
+ this->control_data_bits = vgrf(glsl_type::uint_type);
+
+ /* If we're outputting more than 32 control data bits, then EmitVertex()
+ * will set control_data_bits to 0 after emitting the first vertex.
+ * Otherwise, we need to initialize it to 0 here.
+ */
+ if (gs_compile->control_data_header_size_bits <= 32) {
+ const fs_builder abld = bld.annotate("initialize control data bits");
+ abld.MOV(this->control_data_bits, fs_reg(0u));
+ }
+ }
+
+ if (shader_time_index >= 0)
+ emit_shader_time_begin();
+
+ emit_nir_code();
+
+ emit_gs_thread_end();
+
+ if (shader_time_index >= 0)
+ emit_shader_time_end();
+
+ if (failed)
+ return false;
+
+ calculate_cfg();
+
+ optimize();
+
+ assign_curb_setup();
+ assign_gs_urb_setup();
+
+ fixup_3src_null_dest();
+ allocate_registers();
+
+ return !failed;
+}
+
bool
fs_visitor::run_fs(bool do_rep_send)
{
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index caf56555981..2dfcab1c51a 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -132,18 +132,22 @@ public:
bool run_fs(bool do_rep_send);
bool run_vs(gl_clip_plane *clip_planes);
+ bool run_gs();
bool run_cs();
void optimize();
void allocate_registers();
void setup_payload_gen4();
void setup_payload_gen6();
void setup_vs_payload();
+ void setup_gs_payload();
void setup_cs_payload();
void fixup_3src_null_dest();
void assign_curb_setup();
void calculate_urb_setup();
void assign_urb_setup();
+ void convert_attr_sources_to_hw_regs(fs_inst *inst);
void assign_vs_urb_setup();
+ void assign_gs_urb_setup();
bool assign_regs(bool allow_spilling);
void assign_regs_trivial();
void calculate_payload_ranges(int payload_node_count,
@@ -281,7 +285,16 @@ public:
fs_reg color1, fs_reg color2,
fs_reg src0_alpha, unsigned components);
void emit_fb_writes();
- void emit_urb_writes();
+ void emit_urb_writes(const fs_reg &gs_vertex_count = fs_reg());
+ void set_gs_stream_control_data_bits(const fs_reg &vertex_count,
+ unsigned stream_id);
+ void emit_gs_control_data_bits(const fs_reg &vertex_count);
+ void emit_gs_end_primitive(const nir_src &vertex_count_nir_src);
+ void emit_gs_vertex(const nir_src &vertex_count_nir_src,
+ unsigned stream_id);
+ void emit_gs_thread_end();
+ void emit_gs_input_load(const fs_reg &dst, const nir_src &vertex_src,
+ unsigned offset, unsigned num_components);
void emit_cs_terminate();
fs_reg *emit_cs_local_invocation_id_setup();
fs_reg *emit_cs_work_group_id_setup();
@@ -389,6 +402,8 @@ public:
fs_reg delta_xy[BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT];
fs_reg shader_start_time;
fs_reg userplane[MAX_CLIP_PLANES];
+ fs_reg final_gs_vertex_count;
+ fs_reg control_data_bits;
unsigned grf_used;
bool spilled_any_registers;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 7eeff93e465..b6eab069a1f 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -28,6 +28,7 @@
#include "program/prog_to_nir.h"
#include "brw_fs.h"
#include "brw_fs_surface_builder.h"
+#include "brw_vec4_gs_visitor.h"
#include "brw_nir.h"
#include "brw_fs_surface_builder.h"
#include "brw_vec4_gs_visitor.h"
@@ -102,6 +103,7 @@ fs_visitor::nir_setup_outputs()
switch (stage) {
case MESA_SHADER_VERTEX:
+ case MESA_SHADER_GEOMETRY:
for (unsigned int i = 0; i < ALIGN(type_size_scalar(var->type), 4) / 4; i++) {
int output = var->data.location + i;
this->outputs[output] = offset(reg, bld, 4 * i);
@@ -1194,6 +1196,375 @@ emit_pixel_interpolater_send(const fs_builder &bld,
return inst;
}
+/**
+ * Computes 1 << x, given a D/UD register containing some value x.
+ */
+static fs_reg
+intexp2(const fs_builder &bld, const fs_reg &x)
+{
+ assert(x.type == BRW_REGISTER_TYPE_UD || x.type == BRW_REGISTER_TYPE_D);
+
+ fs_reg result = bld.vgrf(x.type, 1);
+ fs_reg one = bld.vgrf(x.type, 1);
+
+ bld.MOV(one, retype(fs_reg(1), one.type));
+ bld.SHL(result, one, x);
+ return result;
+}
+
+void
+fs_visitor::emit_gs_end_primitive(const nir_src &vertex_count_nir_src)
+{
+ assert(stage == MESA_SHADER_GEOMETRY);
+
+ struct brw_gs_prog_data *gs_prog_data =
+ (struct brw_gs_prog_data *) prog_data;
+
+ /* We can only do EndPrimitive() functionality when the control data
+ * consists of cut bits. Fortunately, the only time it isn't is when the
+ * output type is points, in which case EndPrimitive() is a no-op.
+ */
+ if (gs_prog_data->control_data_format !=
+ GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) {
+ return;
+ }
+
+ /* Cut bits use one bit per vertex. */
+ assert(gs_compile->control_data_bits_per_vertex == 1);
+
+ fs_reg vertex_count = get_nir_src(vertex_count_nir_src);
+ vertex_count.type = BRW_REGISTER_TYPE_UD;
+
+ /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting
+ * vertex n, 0 otherwise. So all we need to do here is mark bit
+ * (vertex_count - 1) % 32 in the cut_bits register to indicate that
+ * EndPrimitive() was called after emitting vertex (vertex_count - 1);
+ * emit_gs_control_data_bits() will take care of the rest.
+ *
+ * Note that if EndPrimitive() is called before emitting any vertices, this
+ * will cause us to set bit 31 of the control_data_bits register to 1.
+ * That's fine because:
+ *
+ * - If max_vertices < 32, then vertex number 31 (zero-based) will never be
+ * output, so the hardware will ignore cut bit 31.
+ *
+ * - If max_vertices == 32, then vertex number 31 is guaranteed to be the
+ * last vertex, so setting cut bit 31 has no effect (since the primitive
+ * is automatically ended when the GS terminates).
+ *
+ * - If max_vertices > 32, then emit_gs_vertex() will reset the
+ * control_data_bits register to 0 when the first vertex is emitted.
+ */
+
+ const fs_builder abld = bld.annotate("end primitive");
+
+ /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */
+ fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+ abld.ADD(prev_count, vertex_count, fs_reg(0xffffffffu));
+ fs_reg mask = intexp2(abld, prev_count);
+ /* Note: we're relying on the fact that the GEN SHL instruction only pays
+ * attention to the lower 5 bits of its second source argument, so on this
+ * architecture, 1 << (vertex_count - 1) is equivalent to 1 <<
+ * ((vertex_count - 1) % 32).
+ */
+ abld.OR(this->control_data_bits, this->control_data_bits, mask);
+}
+
+void
+fs_visitor::emit_gs_control_data_bits(const fs_reg &vertex_count)
+{
+ assert(stage == MESA_SHADER_GEOMETRY);
+ assert(gs_compile->control_data_bits_per_vertex != 0);
+
+ struct brw_gs_prog_data *gs_prog_data =
+ (struct brw_gs_prog_data *) prog_data;
+
+ const fs_builder abld = bld.annotate("emit control data bits");
+ const fs_builder fwa_bld = bld.exec_all();
+
+ /* We use a single UD register to accumulate control data bits (32 bits
+ * for each of the SIMD8 channels). So we need to write a DWord (32 bits)
+ * at a time.
+ *
+ * Unfortunately, the URB_WRITE_SIMD8 message uses 128-bit (OWord) offsets.
+ * We have to select a 128-bit group via the Global and Per-Slot Offsets, then
+ * use the Channel Mask phase to enable/disable which DWord within that
+ * group to write. (Remember, different SIMD8 channels may have emitted
+ * different numbers of vertices, so we may need per-slot offsets.)
+ *
+ * Channel masking presents an annoying problem: we may have to replicate
+ * the data up to 4 times:
+ *
+ * Msg = Handles, Per-Slot Offsets, Channel Masks, Data, Data, Data, Data.
+ *
+ * To avoid penalizing shaders that emit a small number of vertices, we
+ * can avoid these sometimes: if the size of the control data header is
+ * <= 128 bits, then there is only 1 OWord. All SIMD8 channels will land
+ * in the same 128-bit group, so we can skip per-slot offsets.
+ *
+ * Similarly, if the control data header is <= 32 bits, there is only one
+ * DWord, so we can skip channel masks.
+ */
+ enum opcode opcode = SHADER_OPCODE_URB_WRITE_SIMD8;
+
+ fs_reg channel_mask, per_slot_offset;
+
+ if (gs_compile->control_data_header_size_bits > 32) {
+ opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
+ channel_mask = vgrf(glsl_type::uint_type);
+ }
+
+ if (gs_compile->control_data_header_size_bits > 128) {
+ opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT;
+ per_slot_offset = vgrf(glsl_type::uint_type);
+ }
+
+ /* Figure out which DWord we're trying to write to using the formula:
+ *
+ * dword_index = (vertex_count - 1) * bits_per_vertex / 32
+ *
+ * Since bits_per_vertex is a power of two, and is known at compile
+ * time, this can be optimized to (fls(x) == log2(x) + 1 here):
+ *
+ * dword_index = (vertex_count - 1) >> (6 - fls(bits_per_vertex))
+ */
+ if (opcode != SHADER_OPCODE_URB_WRITE_SIMD8) {
+ fs_reg dword_index = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+ fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+ abld.ADD(prev_count, vertex_count, fs_reg(0xffffffffu));
+ unsigned log2_bits_per_vertex =
+ _mesa_fls(gs_compile->control_data_bits_per_vertex);
+ abld.SHR(dword_index, prev_count, fs_reg(6u - log2_bits_per_vertex));
+
+ if (per_slot_offset.file != BAD_FILE) {
+ /* Set the per-slot offset to dword_index / 4, so that we'll write to
+ * the appropriate OWord within the control data header.
+ */
+ abld.SHR(per_slot_offset, dword_index, fs_reg(2u));
+ }
+
+ /* Set the channel masks to 1 << (dword_index % 4), so that we'll
+ * write to the appropriate DWORD within the OWORD.
+ */
+ fs_reg channel = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+ fwa_bld.AND(channel, dword_index, fs_reg(3u));
+ channel_mask = intexp2(fwa_bld, channel);
+ /* Then the channel masks need to be in bits 23:16. */
+ fwa_bld.SHL(channel_mask, channel_mask, fs_reg(16u));
+ }
+
+ /* Store the control data bits in the message payload and send it. */
+ int mlen = 2;
+ if (channel_mask.file != BAD_FILE)
+ mlen += 4; /* channel masks, plus 3 extra copies of the data */
+ if (per_slot_offset.file != BAD_FILE)
+ mlen++;
+
+ fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, mlen);
+ fs_reg *sources = ralloc_array(mem_ctx, fs_reg, mlen);
+ int i = 0;
+ sources[i++] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
+ if (per_slot_offset.file != BAD_FILE)
+ sources[i++] = per_slot_offset;
+ if (channel_mask.file != BAD_FILE)
+ sources[i++] = channel_mask;
+ while (i < mlen) {
+ sources[i++] = this->control_data_bits;
+ }
+
+ abld.LOAD_PAYLOAD(payload, sources, mlen, mlen);
+ fs_inst *inst = abld.emit(opcode, reg_undef, payload);
+ inst->mlen = mlen;
+ /* We need to increment Global Offset by 256-bits to make room for
+ * Broadwell's extra "Vertex Count" payload at the beginning of the
+ * URB entry. Since this is an OWord message, Global Offset is counted
+ * in 128-bit units, so we must set it to 2.
+ */
+ if (gs_prog_data->static_vertex_count == -1)
+ inst->offset = 2;
+}
+
+void
+fs_visitor::set_gs_stream_control_data_bits(const fs_reg &vertex_count,
+ unsigned stream_id)
+{
+ /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */
+
+ /* Note: we are calling this *before* increasing vertex_count, so the
+ * vertex_count argument == vertex_count - 1 in the formula above.
+ */
+
+ /* Stream mode uses 2 bits per vertex */
+ assert(gs_compile->control_data_bits_per_vertex == 2);
+
+ /* Must be a valid stream */
+ assert(stream_id >= 0 && stream_id < MAX_VERTEX_STREAMS);
+
+ /* Control data bits are initialized to 0 so we don't have to set any
+ * bits when sending vertices to stream 0.
+ */
+ if (stream_id == 0)
+ return;
+
+ const fs_builder abld = bld.annotate("set stream control data bits", NULL);
+
+ /* reg::sid = stream_id */
+ fs_reg sid = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+ abld.MOV(sid, fs_reg(stream_id));
+
+ /* reg:shift_count = 2 * (vertex_count - 1) */
+ fs_reg shift_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+ abld.SHL(shift_count, vertex_count, fs_reg(1u));
+
+ /* Note: we're relying on the fact that the GEN SHL instruction only pays
+ * attention to the lower 5 bits of its second source argument, so on this
+ * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to
+ * stream_id << ((2 * (vertex_count - 1)) % 32).
+ */
+ fs_reg mask = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+ abld.SHL(mask, sid, shift_count);
+ abld.OR(this->control_data_bits, this->control_data_bits, mask);
+}
+
+void
+fs_visitor::emit_gs_vertex(const nir_src &vertex_count_nir_src,
+ unsigned stream_id)
+{
+ assert(stage == MESA_SHADER_GEOMETRY);
+
+ struct brw_gs_prog_data *gs_prog_data =
+ (struct brw_gs_prog_data *) prog_data;
+
+ fs_reg vertex_count = get_nir_src(vertex_count_nir_src);
+ vertex_count.type = BRW_REGISTER_TYPE_UD;
+
+ /* Haswell and later hardware ignores the "Render Stream Select" bits
+ * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled,
+ * and instead sends all primitives down the pipeline for rasterization.
+ * If the SOL stage is enabled, "Render Stream Select" is honored and
+ * primitives bound to non-zero streams are discarded after stream output.
+ *
+ * Since the only purpose of primitives sent to non-zero streams is to
+ * be recorded by transform feedback, we can simply discard all geometry
+ * bound to these streams when transform feedback is disabled.
+ */
+ if (stream_id > 0 && !nir->info.has_transform_feedback_varyings)
+ return;
+
+ /* If we're outputting 32 control data bits or less, then we can wait
+ * until the shader is over to output them all. Otherwise we need to
+ * output them as we go. Now is the time to do it, since we're about to
+ * output the vertex_count'th vertex, so it's guaranteed that the
+ * control data bits associated with the (vertex_count - 1)th vertex are
+ * correct.
+ */
+ if (gs_compile->control_data_header_size_bits > 32) {
+ const fs_builder abld =
+ bld.annotate("emit vertex: emit control data bits");
+
+ /* Only emit control data bits if we've finished accumulating a batch
+ * of 32 bits. This is the case when:
+ *
+ * (vertex_count * bits_per_vertex) % 32 == 0
+ *
+ * (in other words, when the last 5 bits of vertex_count *
+ * bits_per_vertex are 0). Assuming bits_per_vertex == 2^n for some
+ * integer n (which is always the case, since bits_per_vertex is
+ * always 1 or 2), this is equivalent to requiring that the last 5-n
+ * bits of vertex_count are 0:
+ *
+ * vertex_count & (2^(5-n) - 1) == 0
+ *
+ * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is
+ * equivalent to:
+ *
+ * vertex_count & (32 / bits_per_vertex - 1) == 0
+ *
+ * TODO: If vertex_count is an immediate, we could do some of this math
+ * at compile time...
+ */
+ fs_inst *inst =
+ abld.AND(bld.null_reg_d(), vertex_count,
+ fs_reg(32u / gs_compile->control_data_bits_per_vertex - 1u));
+ inst->conditional_mod = BRW_CONDITIONAL_Z;
+
+ abld.IF(BRW_PREDICATE_NORMAL);
+ /* If vertex_count is 0, then no control data bits have been
+ * accumulated yet, so we can skip emitting them.
+ */
+ abld.CMP(bld.null_reg_d(), vertex_count, fs_reg(0u),
+ BRW_CONDITIONAL_NEQ);
+ abld.IF(BRW_PREDICATE_NORMAL);
+ emit_gs_control_data_bits(vertex_count);
+ abld.emit(BRW_OPCODE_ENDIF);
+
+ /* Reset control_data_bits to 0 so we can start accumulating a new
+ * batch.
+ *
+ * Note: in the case where vertex_count == 0, this neutralizes the
+ * effect of any call to EndPrimitive() that the shader may have
+ * made before outputting its first vertex.
+ */
+ inst = abld.MOV(this->control_data_bits, fs_reg(0u));
+ inst->force_writemask_all = true;
+ abld.emit(BRW_OPCODE_ENDIF);
+ }
+
+ emit_urb_writes(vertex_count);
+
+ /* In stream mode we have to set control data bits for all vertices
+ * unless we have disabled control data bits completely (which we do
+ * for GL_POINTS outputs that don't use streams).
+ */
+ if (gs_compile->control_data_header_size_bits > 0 &&
+ gs_prog_data->control_data_format ==
+ GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
+ set_gs_stream_control_data_bits(vertex_count, stream_id);
+ }
+}
+
+void
+fs_visitor::emit_gs_input_load(const fs_reg &dst,
+ const nir_src &vertex_src,
+ unsigned input_offset,
+ unsigned num_components)
+{
+ const brw_vue_prog_data *vue_prog_data = (const brw_vue_prog_data *) prog_data;
+ const unsigned vertex = nir_src_as_const_value(vertex_src)->u[0];
+
+ const unsigned array_stride = vue_prog_data->urb_read_length * 8;
+
+ const bool pushed = 4 * input_offset < array_stride;
+
+ if (input_offset == 0) {
+ /* This is the VUE header, containing VARYING_SLOT_LAYER [.y],
+ * VARYING_SLOT_VIEWPORT [.z], and VARYING_SLOT_PSIZ [.w].
+ * Only gl_PointSize is available as a GS input, so they must
+ * be asking for that input.
+ */
+ if (pushed) {
+ bld.MOV(dst, fs_reg(ATTR, array_stride * vertex + 3, dst.type));
+ } else {
+ fs_reg tmp = bld.vgrf(dst.type, 4);
+ fs_inst *inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp,
+ fs_reg(vertex), fs_reg(0));
+ inst->regs_written = 4;
+ bld.MOV(dst, offset(tmp, bld, 3));
+ }
+ } else {
+ if (pushed) {
+ int index = vertex * array_stride + 4 * input_offset;
+ for (unsigned i = 0; i < num_components; i++) {
+ bld.MOV(offset(dst, bld, i), fs_reg(ATTR, index + i, dst.type));
+ }
+ } else {
+ fs_inst *inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst,
+ fs_reg(vertex), fs_reg(input_offset));
+ inst->regs_written = num_components;
+ }
+ }
+}
+
void
fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr)
{
@@ -1579,6 +1950,14 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
break;
}
+ case nir_intrinsic_load_per_vertex_input_indirect:
+ assert(!"Not allowed");
+ /* fallthrough */
+ case nir_intrinsic_load_per_vertex_input:
+ emit_gs_input_load(dest, instr->src[0], instr->const_index[0],
+ instr->num_components);
+ break;
+
/* Handle ARB_gpu_shader5 interpolation intrinsics
*
* It's worth a quick word of explanation as to why we handle the full
@@ -1929,6 +2308,18 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
break;
}
+ case nir_intrinsic_emit_vertex_with_counter:
+ emit_gs_vertex(instr->src[0], instr->const_index[0]);
+ break;
+
+ case nir_intrinsic_end_primitive_with_counter:
+ emit_gs_end_primitive(instr->src[0]);
+ break;
+
+ case nir_intrinsic_set_vertex_count:
+ bld.MOV(this->final_gs_vertex_count, get_nir_src(instr->src[0]));
+ break;
+
default:
unreachable("unknown intrinsic");
}
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index b6d1c3b6d4a..ef92098286c 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -880,7 +880,7 @@ void fs_visitor::compute_clip_distance(gl_clip_plane *clip_planes)
}
void
-fs_visitor::emit_urb_writes()
+fs_visitor::emit_urb_writes(const fs_reg &gs_vertex_count)
{
int slot, urb_offset, length;
int starting_urb_offset = 0;
@@ -916,9 +916,13 @@ fs_visitor::emit_urb_writes()
return;
}
+ opcode opcode = SHADER_OPCODE_URB_WRITE_SIMD8;
+ int header_size = 1;
+ fs_reg per_slot_offsets;
+
if (stage == MESA_SHADER_GEOMETRY) {
const struct brw_gs_prog_data *gs_prog_data =
- (const struct brw_gs_prog_data *) prog_data;
+ (const struct brw_gs_prog_data *) this->prog_data;
/* We need to increment the Global Offset to skip over the control data
* header and the extra "Vertex Count" field (1 HWord) at the beginning
@@ -927,6 +931,27 @@ fs_visitor::emit_urb_writes()
starting_urb_offset = 2 * gs_prog_data->control_data_header_size_hwords;
if (gs_prog_data->static_vertex_count == -1)
starting_urb_offset += 2;
+
+ /* We also need to use per-slot offsets. The per-slot offset is the
+ * Vertex Count. SIMD8 mode processes 8 different primitives at a
+ * time; each may output a different number of vertices.
+ */
+ opcode = SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT;
+ header_size++;
+
+ /* The URB offset is in 128-bit units, so we need to multiply by 2 */
+ const int output_vertex_size_owords =
+ gs_prog_data->output_vertex_size_hwords * 2;
+
+ fs_reg offset;
+ if (gs_vertex_count.file == IMM) {
+ per_slot_offsets = fs_reg(output_vertex_size_owords *
+ gs_vertex_count.fixed_hw_reg.dw1.ud);
+ } else {
+ per_slot_offsets = vgrf(glsl_type::int_type);
+ bld.MUL(per_slot_offsets, gs_vertex_count,
+ fs_reg(output_vertex_size_owords));
+ }
}
length = 0;
@@ -1023,19 +1048,25 @@ fs_visitor::emit_urb_writes()
if (length == 8 || last)
flush = true;
if (flush) {
- fs_reg *payload_sources = ralloc_array(mem_ctx, fs_reg, length + 1);
- fs_reg payload = fs_reg(GRF, alloc.allocate(length + 1),
+ fs_reg *payload_sources =
+ ralloc_array(mem_ctx, fs_reg, length + header_size);
+ fs_reg payload = fs_reg(GRF, alloc.allocate(length + header_size),
BRW_REGISTER_TYPE_F);
payload_sources[0] =
fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
- memcpy(&payload_sources[1], sources, length * sizeof sources[0]);
- abld.LOAD_PAYLOAD(payload, payload_sources, length + 1, 1);
+ if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT)
+ payload_sources[1] = per_slot_offsets;
- fs_inst *inst =
- abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
+ memcpy(&payload_sources[header_size], sources,
+ length * sizeof sources[0]);
+
+ abld.LOAD_PAYLOAD(payload, payload_sources, length + header_size,
+ header_size);
+
+ fs_inst *inst = abld.emit(opcode, reg_undef, payload);
inst->eot = last && stage == MESA_SHADER_VERTEX;
- inst->mlen = length + 1;
+ inst->mlen = length + header_size;
inst->offset = urb_offset;
urb_offset = starting_urb_offset + slot + 1;
length = 0;
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
index cfb5cd95cb1..49c10837334 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
@@ -29,6 +29,7 @@
#include "brw_vec4_gs_visitor.h"
#include "gen6_gs_visitor.h"
+#include "brw_fs.h"
namespace brw {
@@ -812,6 +813,30 @@ brw_compile_gs(const struct brw_compiler *compiler, void *log_data,
* program.
*/
+ if (compiler->scalar_gs) {
+ /* TODO: Support instanced GS. We have basically no tests... */
+ assert(prog_data->invocations == 1);
+
+ fs_visitor v(compiler, log_data, mem_ctx, &c, prog_data, shader,
+ shader_time_index);
+ if (v.run_gs()) {
+ prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;
+
+ fs_generator g(compiler, log_data, mem_ctx, &c.key,
+ &prog_data->base.base, v.promoted_constants,
+ false, "GS");
+ if (unlikely(INTEL_DEBUG & DEBUG_GS)) {
+ const char *label =
+ shader->info.label ? shader->info.label : "unnamed";
+ char *name = ralloc_asprintf(mem_ctx, "%s geometry shader %s",
+ label, shader->info.name);
+ g.enable_debug(name);
+ }
+ g.generate_code(v.cfg, 8);
+ return g.get_assembly(final_assembly_size);
+ }
+ }
+
if (compiler->devinfo->gen >= 7) {
/* Compile the geometry shader in DUAL_OBJECT dispatch mode, if we can do
* so without spilling. If the GS invocations count > 1, then we can't use
From 7f9122c9680a882fee5a9d5a8e09c3e3b7466937 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?=
Date: Wed, 28 Oct 2015 12:59:38 +0100
Subject: [PATCH 004/287] gallium/radeon: always return the last SDMA fence on
SDMA flush if needed
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Reviewed-by: Michel Dänzer
---
src/gallium/drivers/radeon/r600_pipe_common.c | 11 +++++++----
src/gallium/drivers/radeon/r600_pipe_common.h | 1 +
2 files changed, 8 insertions(+), 4 deletions(-)
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index 0ad36849645..56977c06869 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -192,13 +192,15 @@ static void r600_flush_dma_ring(void *ctx, unsigned flags,
struct r600_common_context *rctx = (struct r600_common_context *)ctx;
struct radeon_winsys_cs *cs = rctx->rings.dma.cs;
- if (!cs->cdw) {
- return;
- }
+ if (!cs->cdw)
+ goto done;
rctx->rings.dma.flushing = true;
- rctx->ws->cs_flush(cs, flags, fence, 0);
+ rctx->ws->cs_flush(cs, flags, &rctx->last_sdma_fence, 0);
rctx->rings.dma.flushing = false;
+done:
+ if (fence)
+ rctx->ws->fence_reference(fence, rctx->last_sdma_fence);
}
static enum pipe_reset_status r600_get_reset_status(struct pipe_context *ctx)
@@ -297,6 +299,7 @@ void r600_common_context_cleanup(struct r600_common_context *rctx)
if (rctx->allocator_so_filled_size) {
u_suballocator_destroy(rctx->allocator_so_filled_size);
}
+ rctx->ws->fence_reference(&rctx->last_sdma_fence, NULL);
}
void r600_context_add_resource_size(struct pipe_context *ctx, struct pipe_resource *r)
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index c300c0b3332..b7f1a234baf 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -384,6 +384,7 @@ struct r600_common_context {
enum radeon_family family;
enum chip_class chip_class;
struct r600_rings rings;
+ struct pipe_fence_handle *last_sdma_fence;
unsigned initial_gfx_cs_size;
unsigned gpu_reset_counter;
From 3b37155a68acc351cba86a1fa142bd0de2192d4c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?=
Date: Wed, 28 Oct 2015 13:50:08 +0100
Subject: [PATCH 005/287] gallium/radeon: allow returning SDMA fences from
pipe->flush
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
pipe->flush never returned SDMA fences. This fixes it.
This is only an issue on amdgpu where fences can signal out of order.
Reviewed-by: Michel Dänzer
---
src/gallium/drivers/radeon/r600_pipe_common.c | 62 ++++++++++++++++---
1 file changed, 55 insertions(+), 7 deletions(-)
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index 56977c06869..79e624ea12b 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -31,6 +31,7 @@
#include "util/u_memory.h"
#include "util/u_format_s3tc.h"
#include "util/u_upload_mgr.h"
+#include "os/os_time.h"
#include "vl/vl_decoder.h"
#include "vl/vl_video_buffer.h"
#include "radeon/radeon_video.h"
@@ -40,6 +41,12 @@
#define HAVE_LLVM 0
#endif
+struct r600_multi_fence {
+ struct pipe_reference reference;
+ struct pipe_fence_handle *gfx;
+ struct pipe_fence_handle *sdma;
+};
+
/*
* pipe_context
*/
@@ -174,16 +181,34 @@ static void r600_flush_from_st(struct pipe_context *ctx,
struct pipe_fence_handle **fence,
unsigned flags)
{
+ struct pipe_screen *screen = ctx->screen;
struct r600_common_context *rctx = (struct r600_common_context *)ctx;
unsigned rflags = 0;
+ struct pipe_fence_handle *gfx_fence = NULL;
+ struct pipe_fence_handle *sdma_fence = NULL;
if (flags & PIPE_FLUSH_END_OF_FRAME)
rflags |= RADEON_FLUSH_END_OF_FRAME;
if (rctx->rings.dma.cs) {
- rctx->rings.dma.flush(rctx, rflags, NULL);
+ rctx->rings.dma.flush(rctx, rflags, fence ? &sdma_fence : NULL);
+ }
+ rctx->rings.gfx.flush(rctx, rflags, fence ? &gfx_fence : NULL);
+
+ /* Both engines can signal out of order, so we need to keep both fences. */
+ if (gfx_fence || sdma_fence) {
+ struct r600_multi_fence *multi_fence =
+ CALLOC_STRUCT(r600_multi_fence);
+ if (!multi_fence)
+ return;
+
+ multi_fence->reference.count = 1;
+ multi_fence->gfx = gfx_fence;
+ multi_fence->sdma = sdma_fence;
+
+ screen->fence_reference(screen, fence, NULL);
+ *fence = (struct pipe_fence_handle*)multi_fence;
}
- rctx->rings.gfx.flush(rctx, rflags, fence);
}
static void r600_flush_dma_ring(void *ctx, unsigned flags,
@@ -757,12 +782,19 @@ static int r600_get_driver_query_info(struct pipe_screen *screen,
}
static void r600_fence_reference(struct pipe_screen *screen,
- struct pipe_fence_handle **ptr,
- struct pipe_fence_handle *fence)
+ struct pipe_fence_handle **dst,
+ struct pipe_fence_handle *src)
{
- struct radeon_winsys *rws = ((struct r600_common_screen*)screen)->ws;
+ struct radeon_winsys *ws = ((struct r600_common_screen*)screen)->ws;
+ struct r600_multi_fence **rdst = (struct r600_multi_fence **)dst;
+ struct r600_multi_fence *rsrc = (struct r600_multi_fence *)src;
- rws->fence_reference(ptr, fence);
+ if (pipe_reference(&(*rdst)->reference, &rsrc->reference)) {
+ ws->fence_reference(&(*rdst)->gfx, NULL);
+ ws->fence_reference(&(*rdst)->sdma, NULL);
+ FREE(*rdst);
+ }
+ *rdst = rsrc;
}
static boolean r600_fence_finish(struct pipe_screen *screen,
@@ -770,8 +802,24 @@ static boolean r600_fence_finish(struct pipe_screen *screen,
uint64_t timeout)
{
struct radeon_winsys *rws = ((struct r600_common_screen*)screen)->ws;
+ struct r600_multi_fence *rfence = (struct r600_multi_fence *)fence;
+ int64_t abs_timeout = os_time_get_absolute_timeout(timeout);
- return rws->fence_wait(rws, fence, timeout);
+ if (rfence->sdma) {
+ if (!rws->fence_wait(rws, rfence->sdma, timeout))
+ return false;
+
+ /* Recompute the timeout after waiting. */
+ if (timeout && timeout != PIPE_TIMEOUT_INFINITE) {
+ int64_t time = os_time_get_nano();
+ timeout = abs_timeout > time ? abs_timeout - time : 0;
+ }
+ }
+
+ if (!rfence->gfx)
+ return true;
+
+ return rws->fence_wait(rws, rfence->gfx, timeout);
}
static bool r600_interpret_tiling(struct r600_common_screen *rscreen,
From cf3121ed1885b257217dbac24a131dbfd5f8e438 Mon Sep 17 00:00:00 2001
From: Matt Turner
Date: Fri, 30 Oct 2015 10:07:23 -0700
Subject: [PATCH 006/287] i965/vec4: Send from GRF in atomic operations.
Reviewed-by: Kenneth Graunke
---
.../drivers/dri/i965/brw_vec4_visitor.cpp | 30 +++++++++++--------
1 file changed, 18 insertions(+), 12 deletions(-)
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index b8f90f2aa20..606fbd06278 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -1183,24 +1183,27 @@ vec4_visitor::gs_end_primitive()
void
vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
- dst_reg dst, src_reg offset,
+ dst_reg dst, src_reg surf_offset,
src_reg src0, src_reg src1)
{
- unsigned mlen = 0;
+ unsigned mlen = 1 + (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
+ src_reg src_payload(this, glsl_type::uint_type, mlen);
+ dst_reg payload(src_payload);
+ payload.writemask = WRITEMASK_X;
/* Set the atomic operation offset. */
- emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
- mlen++;
+ emit(MOV(offset(payload, 0), surf_offset));
+ unsigned i = 1;
/* Set the atomic operation arguments. */
if (src0.file != BAD_FILE) {
- emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
- mlen++;
+ emit(MOV(offset(payload, i), src0));
+ i++;
}
if (src1.file != BAD_FILE) {
- emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
- mlen++;
+ emit(MOV(offset(payload, i), src1));
+ i++;
}
/* Emit the instruction. Note that this maps to the normal SIMD8
@@ -1208,24 +1211,27 @@ vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
* unused channels will be masked out.
*/
vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
- brw_message_reg(0),
+ src_payload,
src_reg(surf_index), src_reg(atomic_op));
inst->mlen = mlen;
}
void
vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
- src_reg offset)
+ src_reg surf_offset)
{
+ dst_reg offset(this, glsl_type::uint_type);
+ offset.writemask = WRITEMASK_X;
+
/* Set the surface read offset. */
- emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
+ emit(MOV(offset, surf_offset));
/* Emit the instruction. Note that this maps to the normal SIMD8
* untyped surface read message, but that's OK because unused
* channels will be masked out.
*/
vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
- brw_message_reg(0),
+ src_reg(offset),
src_reg(surf_index), src_reg(1));
inst->mlen = 1;
}
From 4bc16ad2176efda5f8c59e222b4735ee35c434b5 Mon Sep 17 00:00:00 2001
From: Jordan Justen
Date: Fri, 23 Oct 2015 16:10:02 -0700
Subject: [PATCH 007/287] mesa: rename UniformBlockStageIndex to
InterfaceBlockStageIndex
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: Jordan Justen
Cc: Samuel Iglesias Gonsálvez
Cc: Iago Toral
Reviewed-by: Iago Toral Quiroga
Reviewed-by: Juha-Pekka Heikkila
---
src/glsl/link_uniform_initializers.cpp | 2 +-
src/glsl/linker.cpp | 16 ++++++++--------
src/glsl/standalone_scaffolding.cpp | 4 ++--
src/mesa/main/mtypes.h | 11 ++++++-----
src/mesa/main/shader_query.cpp | 2 +-
src/mesa/main/shaderobj.c | 4 ++--
src/mesa/main/uniforms.c | 4 ++--
7 files changed, 22 insertions(+), 21 deletions(-)
diff --git a/src/glsl/link_uniform_initializers.cpp b/src/glsl/link_uniform_initializers.cpp
index 682a4eef13c..58d21e5125e 100644
--- a/src/glsl/link_uniform_initializers.cpp
+++ b/src/glsl/link_uniform_initializers.cpp
@@ -178,7 +178,7 @@ set_block_binding(gl_shader_program *prog, const char *block_name, int binding)
/* This is a field of a UBO. val is the binding index. */
for (int i = 0; i < MESA_SHADER_STAGES; i++) {
- int stage_index = prog->UniformBlockStageIndex[i][block_index];
+ int stage_index = prog->InterfaceBlockStageIndex[i][block_index];
if (stage_index != -1) {
struct gl_shader *sh = prog->_LinkedShaders[i];
diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index c35d87acea6..9dcc2a76c9a 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -1174,10 +1174,10 @@ interstage_cross_validate_uniform_blocks(struct gl_shader_program *prog)
for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
struct gl_shader *sh = prog->_LinkedShaders[i];
- prog->UniformBlockStageIndex[i] = ralloc_array(prog, int,
- max_num_uniform_blocks);
+ prog->InterfaceBlockStageIndex[i] = ralloc_array(prog, int,
+ max_num_uniform_blocks);
for (unsigned int j = 0; j < max_num_uniform_blocks; j++)
- prog->UniformBlockStageIndex[i][j] = -1;
+ prog->InterfaceBlockStageIndex[i][j] = -1;
if (sh == NULL)
continue;
@@ -1194,7 +1194,7 @@ interstage_cross_validate_uniform_blocks(struct gl_shader_program *prog)
return false;
}
- prog->UniformBlockStageIndex[i][index] = j;
+ prog->InterfaceBlockStageIndex[i][index] = j;
}
}
@@ -2836,9 +2836,9 @@ check_resources(struct gl_context *ctx, struct gl_shader_program *prog)
}
for (unsigned j = 0; j < MESA_SHADER_STAGES; j++) {
- if (prog->UniformBlockStageIndex[j][i] != -1) {
+ if (prog->InterfaceBlockStageIndex[j][i] != -1) {
struct gl_shader *sh = prog->_LinkedShaders[j];
- int stage_index = prog->UniformBlockStageIndex[j][i];
+ int stage_index = prog->InterfaceBlockStageIndex[j][i];
if (sh && sh->BufferInterfaceBlocks[stage_index].IsShaderStorage) {
shader_blocks[j]++;
total_shader_storage_blocks++;
@@ -2955,7 +2955,7 @@ check_image_resources(struct gl_context *ctx, struct gl_shader_program *prog)
total_image_units += sh->NumImages;
for (unsigned j = 0; j < prog->NumBufferInterfaceBlocks; j++) {
- int stage_index = prog->UniformBlockStageIndex[i][j];
+ int stage_index = prog->InterfaceBlockStageIndex[i][j];
if (stage_index != -1 && sh->BufferInterfaceBlocks[stage_index].IsShaderStorage)
total_shader_storage_blocks++;
}
@@ -3734,7 +3734,7 @@ build_program_resource_list(struct gl_shader_program *shProg)
int block_index = shProg->UniformStorage[i].block_index;
if (block_index != -1) {
for (unsigned j = 0; j < MESA_SHADER_STAGES; j++) {
- if (shProg->UniformBlockStageIndex[j][block_index] != -1)
+ if (shProg->InterfaceBlockStageIndex[j][block_index] != -1)
stageref |= (1 << j);
}
}
diff --git a/src/glsl/standalone_scaffolding.cpp b/src/glsl/standalone_scaffolding.cpp
index eccf094b5cd..fe1d820f2ea 100644
--- a/src/glsl/standalone_scaffolding.cpp
+++ b/src/glsl/standalone_scaffolding.cpp
@@ -120,8 +120,8 @@ _mesa_clear_shader_program_data(struct gl_shader_program *shProg)
shProg->NumShaderStorageBlocks = 0;
for (i = 0; i < MESA_SHADER_STAGES; i++) {
- ralloc_free(shProg->UniformBlockStageIndex[i]);
- shProg->UniformBlockStageIndex[i] = NULL;
+ ralloc_free(shProg->InterfaceBlockStageIndex[i]);
+ shProg->InterfaceBlockStageIndex[i] = NULL;
}
ralloc_free(shProg->AtomicBuffers);
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index d6c1eb8511e..fdb3b3df318 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -2716,13 +2716,14 @@ struct gl_shader_program
struct gl_uniform_block **ShaderStorageBlocks;
/**
- * Indices into the _LinkedShaders's UniformBlocks[] array for each stage
- * they're used in, or -1.
+ * Indices into the BufferInterfaceBlocks[] array for each stage they're
+ * used in, or -1.
*
- * This is used to maintain the Binding values of the stage's UniformBlocks[]
- * and to answer the GL_UNIFORM_BLOCK_REFERENCED_BY_*_SHADER queries.
+ * This is used to maintain the Binding values of the stage's
+ * BufferInterfaceBlocks[] and to answer the
+ * GL_UNIFORM_BLOCK_REFERENCED_BY_*_SHADER queries.
*/
- int *UniformBlockStageIndex[MESA_SHADER_STAGES];
+ int *InterfaceBlockStageIndex[MESA_SHADER_STAGES];
/**
* Map of active uniform names to locations
diff --git a/src/mesa/main/shader_query.cpp b/src/mesa/main/shader_query.cpp
index dd51bba3386..5cb877b0104 100644
--- a/src/mesa/main/shader_query.cpp
+++ b/src/mesa/main/shader_query.cpp
@@ -980,7 +980,7 @@ is_resource_referenced(struct gl_shader_program *shProg,
return RESOURCE_ATC(res)->StageReferences[stage];
if (res->Type == GL_UNIFORM_BLOCK || res->Type == GL_SHADER_STORAGE_BLOCK)
- return shProg->UniformBlockStageIndex[stage][index] != -1;
+ return shProg->InterfaceBlockStageIndex[stage][index] != -1;
return res->StageReferences & (1 << stage);
}
diff --git a/src/mesa/main/shaderobj.c b/src/mesa/main/shaderobj.c
index ffc71931fec..203ccef7fc4 100644
--- a/src/mesa/main/shaderobj.c
+++ b/src/mesa/main/shaderobj.c
@@ -294,8 +294,8 @@ _mesa_clear_shader_program_data(struct gl_shader_program *shProg)
shProg->BufferInterfaceBlocks = NULL;
shProg->NumBufferInterfaceBlocks = 0;
for (i = 0; i < MESA_SHADER_STAGES; i++) {
- ralloc_free(shProg->UniformBlockStageIndex[i]);
- shProg->UniformBlockStageIndex[i] = NULL;
+ ralloc_free(shProg->InterfaceBlockStageIndex[i]);
+ shProg->InterfaceBlockStageIndex[i] = NULL;
}
ralloc_free(shProg->AtomicBuffers);
diff --git a/src/mesa/main/uniforms.c b/src/mesa/main/uniforms.c
index bc235380d97..758ca2456df 100644
--- a/src/mesa/main/uniforms.c
+++ b/src/mesa/main/uniforms.c
@@ -1026,7 +1026,7 @@ _mesa_UniformBlockBinding(GLuint program,
shProg->BufferInterfaceBlocks[uniformBlockIndex].Binding = uniformBlockBinding;
for (i = 0; i < MESA_SHADER_STAGES; i++) {
- int stage_index = shProg->UniformBlockStageIndex[i][uniformBlockIndex];
+ int stage_index = shProg->InterfaceBlockStageIndex[i][uniformBlockIndex];
if (stage_index != -1) {
struct gl_shader *sh = shProg->_LinkedShaders[i];
@@ -1079,7 +1079,7 @@ _mesa_ShaderStorageBlockBinding(GLuint program,
shProg->BufferInterfaceBlocks[shaderStorageBlockIndex].Binding = shaderStorageBlockBinding;
for (i = 0; i < MESA_SHADER_STAGES; i++) {
- int stage_index = shProg->UniformBlockStageIndex[i][shaderStorageBlockIndex];
+ int stage_index = shProg->InterfaceBlockStageIndex[i][shaderStorageBlockIndex];
if (stage_index != -1) {
struct gl_shader *sh = shProg->_LinkedShaders[i];
From 531be601d5f9ac4f8a9cc77240ba865fda077709 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand
Date: Wed, 28 Oct 2015 10:11:11 -0700
Subject: [PATCH 008/287] nir: Unexpose _impl versions of copy_prop and dce
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Reviewed-by: Kristian Høgsberg
Reviewed-by: Kenneth Graunke
---
src/glsl/nir/nir.h | 2 --
src/glsl/nir/nir_opt_copy_propagate.c | 2 +-
src/glsl/nir/nir_opt_dce.c | 2 +-
3 files changed, 2 insertions(+), 4 deletions(-)
diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h
index ac422514d52..874a03966be 100644
--- a/src/glsl/nir/nir.h
+++ b/src/glsl/nir/nir.h
@@ -2004,12 +2004,10 @@ bool nir_opt_constant_folding(nir_shader *shader);
bool nir_opt_global_to_local(nir_shader *shader);
-bool nir_copy_prop_impl(nir_function_impl *impl);
bool nir_copy_prop(nir_shader *shader);
bool nir_opt_cse(nir_shader *shader);
-bool nir_opt_dce_impl(nir_function_impl *impl);
bool nir_opt_dce(nir_shader *shader);
bool nir_opt_dead_cf(nir_shader *shader);
diff --git a/src/glsl/nir/nir_opt_copy_propagate.c b/src/glsl/nir/nir_opt_copy_propagate.c
index 71367d001bb..96520f8a361 100644
--- a/src/glsl/nir/nir_opt_copy_propagate.c
+++ b/src/glsl/nir/nir_opt_copy_propagate.c
@@ -256,7 +256,7 @@ copy_prop_block(nir_block *block, void *_state)
return true;
}
-bool
+static bool
nir_copy_prop_impl(nir_function_impl *impl)
{
bool progress = false;
diff --git a/src/glsl/nir/nir_opt_dce.c b/src/glsl/nir/nir_opt_dce.c
index e0ebdc61c2f..603252825c3 100644
--- a/src/glsl/nir/nir_opt_dce.c
+++ b/src/glsl/nir/nir_opt_dce.c
@@ -145,7 +145,7 @@ delete_block_cb(nir_block *block, void *_state)
return true;
}
-bool
+static bool
nir_opt_dce_impl(nir_function_impl *impl)
{
struct exec_list *worklist = ralloc(NULL, struct exec_list);
From aea40091f003f8772afce3562b0f8c6a17dad07f Mon Sep 17 00:00:00 2001
From: Kenneth Graunke
Date: Mon, 2 Nov 2015 21:02:37 -0800
Subject: [PATCH 009/287] nir: Properly invalidate metadata in
nir_lower_global_vars_to_local().
v2: Preserve nir_metadata_live_variables as well (caught by Jason).
Signed-off-by: Kenneth Graunke
Reviewed-by: Jason Ekstrand
Reviewed-by: Eduardo Lima Mitev
---
src/glsl/nir/nir_lower_global_vars_to_local.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/src/glsl/nir/nir_lower_global_vars_to_local.c b/src/glsl/nir/nir_lower_global_vars_to_local.c
index fab236611a5..dcd091ae2fa 100644
--- a/src/glsl/nir/nir_lower_global_vars_to_local.c
+++ b/src/glsl/nir/nir_lower_global_vars_to_local.c
@@ -100,6 +100,9 @@ nir_lower_global_vars_to_local(nir_shader *shader)
exec_node_remove(&var->node);
var->data.mode = nir_var_local;
exec_list_push_tail(&impl->locals, &var->node);
+ nir_metadata_preserve(impl, nir_metadata_block_index |
+ nir_metadata_dominance |
+ nir_metadata_live_variables);
progress = true;
}
}
From 8bb44510fca5315bbdd61502c72c22c7198c0daf Mon Sep 17 00:00:00 2001
From: Kenneth Graunke
Date: Mon, 2 Nov 2015 21:05:08 -0800
Subject: [PATCH 010/287] nir: Properly invalidate metadata in
nir_split_var_copies().
Signed-off-by: Kenneth Graunke
Reviewed-by: Jason Ekstrand
Reviewed-by: Eduardo Lima Mitev
Cc: mesa-stable@lists.freedesktop.org
---
src/glsl/nir/nir_split_var_copies.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/src/glsl/nir/nir_split_var_copies.c b/src/glsl/nir/nir_split_var_copies.c
index d2ea58a8b7c..d463f7bdae9 100644
--- a/src/glsl/nir/nir_split_var_copies.c
+++ b/src/glsl/nir/nir_split_var_copies.c
@@ -271,6 +271,11 @@ split_var_copies_impl(nir_function_impl *impl)
ralloc_free(state.dead_ctx);
+ if (state.progress) {
+ nir_metadata_preserve(impl, nir_metadata_block_index |
+ nir_metadata_dominance);
+ }
+
return state.progress;
}
From 4cb7546066f3f06b8030b8fce78f82469b0c6980 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke
Date: Mon, 2 Nov 2015 21:28:26 -0800
Subject: [PATCH 011/287] nir: Properly invalidate metadata in
nir_remove_dead_variables().
v2: Preserve live_variables too (Jason).
Signed-off-by: Kenneth Graunke
Reviewed-by: Jason Ekstrand
Reviewed-by: Eduardo Lima Mitev
---
src/glsl/nir/nir_remove_dead_variables.c | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/src/glsl/nir/nir_remove_dead_variables.c b/src/glsl/nir/nir_remove_dead_variables.c
index d6783e78803..530a8475ed5 100644
--- a/src/glsl/nir/nir_remove_dead_variables.c
+++ b/src/glsl/nir/nir_remove_dead_variables.c
@@ -126,8 +126,14 @@ nir_remove_dead_variables(nir_shader *shader)
progress = remove_dead_vars(&shader->globals, live) || progress;
nir_foreach_overload(shader, overload) {
- if (overload->impl)
- progress = remove_dead_vars(&overload->impl->locals, live) || progress;
+ if (overload->impl) {
+ if (remove_dead_vars(&overload->impl->locals, live)) {
+ nir_metadata_preserve(overload->impl, nir_metadata_block_index |
+ nir_metadata_dominance |
+ nir_metadata_live_variables);
+ progress = true;
+ }
+ }
}
_mesa_set_destroy(live, NULL);
From 0f037bd71ffe083c05cd0867ef54bce91ff84243 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke
Date: Mon, 2 Nov 2015 21:21:25 -0800
Subject: [PATCH 012/287] nir: Properly invalidate metadata in
nir_opt_copy_prop().
Signed-off-by: Kenneth Graunke
Reviewed-by: Jason Ekstrand
Reviewed-by: Eduardo Lima Mitev
Cc: mesa-stable@lists.freedesktop.org
---
src/glsl/nir/nir_opt_copy_propagate.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/src/glsl/nir/nir_opt_copy_propagate.c b/src/glsl/nir/nir_opt_copy_propagate.c
index 96520f8a361..7d8bdd7f2ca 100644
--- a/src/glsl/nir/nir_opt_copy_propagate.c
+++ b/src/glsl/nir/nir_opt_copy_propagate.c
@@ -262,6 +262,12 @@ nir_copy_prop_impl(nir_function_impl *impl)
bool progress = false;
nir_foreach_block(impl, copy_prop_block, &progress);
+
+ if (progress) {
+ nir_metadata_preserve(impl, nir_metadata_block_index |
+ nir_metadata_dominance);
+ }
+
return progress;
}
From bc3942e2970c60a816cf954b1fa4d416d0852bd9 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke
Date: Mon, 2 Nov 2015 21:38:56 -0800
Subject: [PATCH 013/287] nir: Properly invalidate metadata in
nir_lower_vec_to_movs().
Signed-off-by: Kenneth Graunke
Reviewed-by: Jason Ekstrand
Reviewed-by: Eduardo Lima Mitev
Cc: mesa-stable@lists.freedesktop.org
---
src/glsl/nir/nir_lower_vec_to_movs.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/src/glsl/nir/nir_lower_vec_to_movs.c b/src/glsl/nir/nir_lower_vec_to_movs.c
index c08b721dae4..736a66c8639 100644
--- a/src/glsl/nir/nir_lower_vec_to_movs.c
+++ b/src/glsl/nir/nir_lower_vec_to_movs.c
@@ -288,6 +288,11 @@ nir_lower_vec_to_movs_impl(nir_function_impl *impl)
nir_foreach_block(impl, lower_vec_to_movs_block, &state);
+ if (state.progress) {
+ nir_metadata_preserve(impl, nir_metadata_block_index |
+ nir_metadata_dominance);
+ }
+
return state.progress;
}
From 59bbe2681b73c3795b7298e2486d5fde7c464ed5 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke
Date: Mon, 2 Nov 2015 21:43:40 -0800
Subject: [PATCH 014/287] nir: Properly invalidate metadata in
nir_opt_remove_phis().
Signed-off-by: Kenneth Graunke
Reviewed-by: Jason Ekstrand
Reviewed-by: Eduardo Lima Mitev
Cc: mesa-stable@lists.freedesktop.org
---
src/glsl/nir/nir_opt_remove_phis.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/src/glsl/nir/nir_opt_remove_phis.c b/src/glsl/nir/nir_opt_remove_phis.c
index 5bdf7ef4da7..66d37544115 100644
--- a/src/glsl/nir/nir_opt_remove_phis.c
+++ b/src/glsl/nir/nir_opt_remove_phis.c
@@ -108,6 +108,11 @@ remove_phis_impl(nir_function_impl *impl)
nir_foreach_block(impl, remove_phis_block, &progress);
+ if (progress) {
+ nir_metadata_preserve(impl, nir_metadata_block_index |
+ nir_metadata_dominance);
+ }
+
return progress;
}
From 39b4dfe6ab1003863778a25c091c080e098833ec Mon Sep 17 00:00:00 2001
From: Oded Gabbay
Date: Tue, 3 Nov 2015 10:36:01 +0200
Subject: [PATCH 015/287] llvmpipe: use simple coeffs calc for 128bit vectors
There are currently two methods in llvmpipe code to calculate coeffs to
be used as inputs for the fragment shader. The two methods use slightly
different ways to do the floating point calculations and thus produce
slightly different results.
The decision which method to use is determined by the size of the vector
that is used by the platform.
For vectors with size of more than 128bit, a single-step method is used,
in which coeffs_init_simple() + attribs_update_simple() are called.
For vectors with size of 128bit or less, a two-step method is used, in
which coeffs_init() + attribs_update() are called.
This causes some piglit tests (clip-distance-bulk-copy,
interface-vs-unnamed-to-fs-unnamed) to fail when using platforms with
128bit vectors (such as ppc64le or x86-64 without AVX).
This patch makes platforms with 128bit vectors use the single-step
method (aka "simple" method) instead of the two-step method.
This would make the resulting coeffs identical between more platforms,
make sure the piglit tests passes, and make debugging and maintainability
a bit easier as the generated LLVM IR will be the same for more platforms.
The performance impact is negligible for x86-64 without AVX, and
basically non-existent for ppc64le, as it can be seen from the following
benchmarking results:
- glxspheres, on ppc64le:
- original code: 4.892745317 frames/sec 5.460303857 Mpixels/sec
- with the patch: 4.932083873 frames/sec 5.504205571 Mpixels/sec
- Additional 0.8% performance boost
- glxspheres, on x86-64 without AVX:
- original code: 20.16418809 frames/sec 22.50323395 Mpixels/sec
- with the patch: 20.31328989 frames/sec 22.66963152 Mpixels/sec
- Additional 0.74% performance boost
- glmark2, on ppc64le:
- original code: score of 58
- with the patch: score of 57
- glmark2, on x86-64 without AVX:
- original code: score of 175
- with the patch: score of 167
- Impact of -4.5% on performance
- OpenArena, on ppc64le:
- original code: 3398 frames 1719.0 seconds 2.0 fps
255.0/505.9/2773.0/0.0 ms
- with the patch: 3398 frames 1690.4 seconds 2.0 fps
241.0/497.5/2563.0/0.2 ms
- 29 seconds faster with the patch, which is about 2%
- OpenArena, on x86-64 without AVX:
- original code: 3398 frames 239.6 seconds 14.2 fps
38.0/70.5/719.0/14.6 ms
- with the patch: 3398 frames 244.4 seconds 13.9 fps
38.0/71.9/697.0/14.3 ms
- 0.3 fps slower with the patch (about 2%)
Additional details can be found at:
http://lists.freedesktop.org/archives/mesa-dev/2015-October/098635.html
Signed-off-by: Oded Gabbay
Reviewed-by: Roland Scheidegger
---
src/gallium/drivers/llvmpipe/lp_bld_interp.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.c b/src/gallium/drivers/llvmpipe/lp_bld_interp.c
index df262fa4716..ceac86abe1d 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_interp.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.c
@@ -746,7 +746,12 @@ lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
pos_init(bld, x0, y0);
- if (coeff_type.length > 4) {
+ /*
+ * Simple method (single step interpolation) may be slower if vector length
+ * is just 4, but the results are different (generally less accurate) with
+ * the other method, so always use more accurate version.
+ */
+ if (1) {
bld->simple_interp = TRUE;
{
/* XXX this should use a global static table */
From 9285ed98f7557722fbb94f47c5bc138ef5dd9c70 Mon Sep 17 00:00:00 2001
From: Roland Scheidegger
Date: Tue, 27 Oct 2015 05:34:00 +0100
Subject: [PATCH 016/287] llvmpipe: add cache for compressed textures
compressed textures are very slow because decoding is rather complex
(and because there's no jit code to decode them either, for non-technical
reasons).
Thus, add some texture cache which holds a couple of decoded blocks.
Right now this handles only s3tc format albeit it could be extended to work
with other formats rather trivially as long as the result of decode fits into
32bit per texel (ideally, rgtc actually would decode to more than 8 bits
per channel, but even then making it work for it shouldn't be too difficult).
This can improve performance noticeably but don't expect wonders (uncompressed
is unsurprisingly still faster). It's also possible it might be slower in
some cases (using nearest filtering for example or if there's otherwise not
many cache hits, the cache is only direct mapped which isn't great).
Also, actual decode of a block relies on util code, thus even though full
blocks are always decoded, it is done texel by texel - this could obviously
benefit greatly from simd-optimized code decoding full blocks at once...
Note the cache is per (raster) thread, and currently only used for fragment
shaders.
Reviewed-by: Jose Fonseca
---
src/gallium/auxiliary/Makefile.sources | 2 +
src/gallium/auxiliary/draw/draw_llvm.c | 5 +-
src/gallium/auxiliary/gallivm/lp_bld_format.c | 56 +++
src/gallium/auxiliary/gallivm/lp_bld_format.h | 56 ++-
.../auxiliary/gallivm/lp_bld_format_aos.c | 31 +-
.../auxiliary/gallivm/lp_bld_format_cached.c | 374 ++++++++++++++++++
.../auxiliary/gallivm/lp_bld_format_soa.c | 37 +-
src/gallium/auxiliary/gallivm/lp_bld_sample.h | 13 +
.../auxiliary/gallivm/lp_bld_sample_aos.c | 6 +-
.../auxiliary/gallivm/lp_bld_sample_soa.c | 42 ++
src/gallium/auxiliary/gallivm/lp_bld_tgsi.h | 2 +
.../auxiliary/gallivm/lp_bld_tgsi_soa.c | 5 +
src/gallium/drivers/llvmpipe/lp_jit.c | 3 +
src/gallium/drivers/llvmpipe/lp_jit.h | 8 +-
src/gallium/drivers/llvmpipe/lp_rast.c | 44 ++-
src/gallium/drivers/llvmpipe/lp_state_fs.c | 4 +-
src/gallium/drivers/llvmpipe/lp_test_format.c | 36 +-
src/gallium/drivers/llvmpipe/lp_tex_sample.c | 19 +
src/gallium/drivers/llvmpipe/lp_tex_sample.h | 5 +-
19 files changed, 730 insertions(+), 18 deletions(-)
create mode 100644 src/gallium/auxiliary/gallivm/lp_bld_format.c
create mode 100644 src/gallium/auxiliary/gallivm/lp_bld_format_cached.c
diff --git a/src/gallium/auxiliary/Makefile.sources b/src/gallium/auxiliary/Makefile.sources
index 9df4e265b5b..6e22ced4e41 100644
--- a/src/gallium/auxiliary/Makefile.sources
+++ b/src/gallium/auxiliary/Makefile.sources
@@ -378,7 +378,9 @@ GALLIVM_SOURCES := \
gallivm/lp_bld_flow.h \
gallivm/lp_bld_format_aos_array.c \
gallivm/lp_bld_format_aos.c \
+ gallivm/lp_bld_format_cached.c \
gallivm/lp_bld_format_float.c \
+ gallivm/lp_bld_format.c \
gallivm/lp_bld_format.h \
gallivm/lp_bld_format_soa.c \
gallivm/lp_bld_format_srgb.c \
diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c
index b1e1bcbee04..8435991fb6b 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -625,6 +625,7 @@ generate_vs(struct draw_llvm_variant *variant,
inputs,
outputs,
context_ptr,
+ NULL,
draw_sampler,
&llvm->draw->vs.vertex_shader->info,
NULL);
@@ -749,7 +750,8 @@ generate_fetch(struct gallivm_state *gallivm,
lp_float32_vec4_type(),
FALSE,
map_ptr,
- zero, zero, zero);
+ zero, zero, zero,
+ NULL);
LLVMBuildStore(builder, val, temp_ptr);
}
lp_build_endif(&if_ctx);
@@ -2193,6 +2195,7 @@ draw_gs_llvm_generate(struct draw_llvm *llvm,
NULL,
outputs,
context_ptr,
+ NULL,
sampler,
&llvm->draw->gs.geometry_shader->info,
(const struct lp_build_tgsi_gs_iface *)&gs_iface);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format.c b/src/gallium/auxiliary/gallivm/lp_bld_format.c
new file mode 100644
index 00000000000..a82fd8feee8
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format.c
@@ -0,0 +1,56 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+
+#include "lp_bld_format.h"
+
+
+
+LLVMTypeRef
+lp_build_format_cache_type(struct gallivm_state *gallivm)
+{
+ LLVMTypeRef elem_types[LP_BUILD_FORMAT_CACHE_MEMBER_COUNT];
+ LLVMTypeRef s;
+
+ elem_types[LP_BUILD_FORMAT_CACHE_MEMBER_DATA] =
+ LLVMArrayType(LLVMInt32TypeInContext(gallivm->context),
+ LP_BUILD_FORMAT_CACHE_SIZE * 16);
+ elem_types[LP_BUILD_FORMAT_CACHE_MEMBER_TAGS] =
+ LLVMArrayType(LLVMInt64TypeInContext(gallivm->context),
+ LP_BUILD_FORMAT_CACHE_SIZE);
+#if LP_BUILD_FORMAT_CACHE_DEBUG
+ elem_types[LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL] =
+ LLVMInt64TypeInContext(gallivm->context);
+ elem_types[LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS] =
+ LLVMInt64TypeInContext(gallivm->context);
+#endif
+
+ s = LLVMStructTypeInContext(gallivm->context, elem_types,
+ LP_BUILD_FORMAT_CACHE_MEMBER_COUNT, 0);
+
+ return s;
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format.h b/src/gallium/auxiliary/gallivm/lp_bld_format.h
index 969f1f6cc94..5c866f420bd 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format.h
@@ -44,6 +44,45 @@ struct lp_type;
struct lp_build_context;
+#define LP_BUILD_FORMAT_CACHE_DEBUG 0
+/*
+ * Block cache
+ *
+ * Optional block cache to be used when unpacking big pixel blocks.
+ * The cache size (number of blocks) must be a power of 2.
+ */
+
+#define LP_BUILD_FORMAT_CACHE_SIZE 128
+
+/*
+ * Note: cache_data needs 16 byte alignment.
+ */
+struct lp_build_format_cache
+{
+ PIPE_ALIGN_VAR(16) uint32_t cache_data[LP_BUILD_FORMAT_CACHE_SIZE][4][4];
+ uint64_t cache_tags[LP_BUILD_FORMAT_CACHE_SIZE];
+#if LP_BUILD_FORMAT_CACHE_DEBUG
+ uint64_t cache_access_total;
+ uint64_t cache_access_miss;
+#endif
+};
+
+
+enum {
+ LP_BUILD_FORMAT_CACHE_MEMBER_DATA = 0,
+ LP_BUILD_FORMAT_CACHE_MEMBER_TAGS,
+#if LP_BUILD_FORMAT_CACHE_DEBUG
+ LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL,
+ LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS,
+#endif
+ LP_BUILD_FORMAT_CACHE_MEMBER_COUNT
+};
+
+
+LLVMTypeRef
+lp_build_format_cache_type(struct gallivm_state *gallivm);
+
+
/*
* AoS
*/
@@ -66,7 +105,8 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
LLVMValueRef base_ptr,
LLVMValueRef offset,
LLVMValueRef i,
- LLVMValueRef j);
+ LLVMValueRef j,
+ LLVMValueRef cache);
LLVMValueRef
lp_build_fetch_rgba_aos_array(struct gallivm_state *gallivm,
@@ -107,13 +147,13 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
LLVMValueRef offsets,
LLVMValueRef i,
LLVMValueRef j,
+ LLVMValueRef cache,
LLVMValueRef rgba_out[4]);
/*
* YUV
*/
-
LLVMValueRef
lp_build_fetch_subsampled_rgba_aos(struct gallivm_state *gallivm,
const struct util_format_description *format_desc,
@@ -123,6 +163,18 @@ lp_build_fetch_subsampled_rgba_aos(struct gallivm_state *gallivm,
LLVMValueRef i,
LLVMValueRef j);
+
+LLVMValueRef
+lp_build_fetch_cached_texels(struct gallivm_state *gallivm,
+ const struct util_format_description *format_desc,
+ unsigned n,
+ LLVMValueRef base_ptr,
+ LLVMValueRef offset,
+ LLVMValueRef i,
+ LLVMValueRef j,
+ LLVMValueRef cache);
+
+
/*
* special float formats
*/
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
index ddf3ad1dfc6..a41b30bbb96 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
@@ -370,7 +370,8 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
LLVMValueRef base_ptr,
LLVMValueRef offset,
LLVMValueRef i,
- LLVMValueRef j)
+ LLVMValueRef j,
+ LLVMValueRef cache)
{
LLVMBuilderRef builder = gallivm->builder;
unsigned num_pixels = type.length / 4;
@@ -502,6 +503,34 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
return tmp;
}
+ /*
+ * s3tc rgb formats
+ */
+
+ if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC && cache) {
+ struct lp_type tmp_type;
+ LLVMValueRef tmp;
+
+ memset(&tmp_type, 0, sizeof tmp_type);
+ tmp_type.width = 8;
+ tmp_type.length = num_pixels * 4;
+ tmp_type.norm = TRUE;
+
+ tmp = lp_build_fetch_cached_texels(gallivm,
+ format_desc,
+ num_pixels,
+ base_ptr,
+ offset,
+ i, j,
+ cache);
+
+ lp_build_conv(gallivm,
+ tmp_type, type,
+ &tmp, 1, &tmp, 1);
+
+ return tmp;
+ }
+
/*
* Fallback to util_format_description::fetch_rgba_8unorm().
*/
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_cached.c b/src/gallium/auxiliary/gallivm/lp_bld_format_cached.c
new file mode 100644
index 00000000000..b683e7f960c
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_cached.c
@@ -0,0 +1,374 @@
+/**************************************************************************
+ *
+ * Copyright 2015 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "lp_bld_format.h"
+#include "lp_bld_type.h"
+#include "lp_bld_struct.h"
+#include "lp_bld_const.h"
+#include "lp_bld_flow.h"
+#include "lp_bld_swizzle.h"
+
+#include "util/u_math.h"
+
+
+/**
+ * @file
+ * Complex block-compression based formats are handled here by using a cache,
+ * so re-decoding of every pixel is not required.
+ * Especially for bilinear filtering, texel reuse is very high hence even
+ * a small cache helps.
+ * The elements in the cache are the decoded blocks - currently things
+ * are restricted to formats which are 4x4 block based, and the decoded
+ * texels must fit into 4x8 bits.
+ * The cache is direct mapped so hitrates aren't all that great and cache
+ * thrashing could happen.
+ *
+ * @author Roland Scheidegger
+ */
+
+
+#if LP_BUILD_FORMAT_CACHE_DEBUG
+static void
+update_cache_access(struct gallivm_state *gallivm,
+ LLVMValueRef ptr,
+ unsigned count,
+ unsigned index)
+{
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef member_ptr, cache_access;
+
+ assert(index == LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL ||
+ index == LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);
+
+ member_ptr = lp_build_struct_get_ptr(gallivm, ptr, index, "");
+ cache_access = LLVMBuildLoad(builder, member_ptr, "cache_access");
+ cache_access = LLVMBuildAdd(builder, cache_access,
+ LLVMConstInt(LLVMInt64TypeInContext(gallivm->context),
+ count, 0), "");
+ LLVMBuildStore(builder, cache_access, member_ptr);
+}
+#endif
+
+
+static void
+store_cached_block(struct gallivm_state *gallivm,
+ LLVMValueRef *col,
+ LLVMValueRef tag_value,
+ LLVMValueRef hash_index,
+ LLVMValueRef cache)
+{
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef ptr, indices[3];
+ LLVMTypeRef type_ptr4x32;
+ unsigned count;
+
+ type_ptr4x32 = LLVMPointerType(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4), 0);
+ indices[0] = lp_build_const_int32(gallivm, 0);
+ indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_TAGS);
+ indices[2] = hash_index;
+ ptr = LLVMBuildGEP(builder, cache, indices, Elements(indices), "");
+ LLVMBuildStore(builder, tag_value, ptr);
+
+ indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_DATA);
+ hash_index = LLVMBuildMul(builder, hash_index,
+ lp_build_const_int32(gallivm, 16), "");
+ for (count = 0; count < 4; count++) {
+ indices[2] = hash_index;
+ ptr = LLVMBuildGEP(builder, cache, indices, Elements(indices), "");
+ ptr = LLVMBuildBitCast(builder, ptr, type_ptr4x32, "");
+ LLVMBuildStore(builder, col[count], ptr);
+ hash_index = LLVMBuildAdd(builder, hash_index,
+ lp_build_const_int32(gallivm, 4), "");
+ }
+}
+
+
+static LLVMValueRef
+lookup_cached_pixel(struct gallivm_state *gallivm,
+ LLVMValueRef ptr,
+ LLVMValueRef index)
+{
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef member_ptr, indices[3];
+
+ indices[0] = lp_build_const_int32(gallivm, 0);
+ indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_DATA);
+ indices[2] = index;
+ member_ptr = LLVMBuildGEP(builder, ptr, indices, Elements(indices), "");
+ return LLVMBuildLoad(builder, member_ptr, "cache_data");
+}
+
+
+static LLVMValueRef
+lookup_tag_data(struct gallivm_state *gallivm,
+ LLVMValueRef ptr,
+ LLVMValueRef index)
+{
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef member_ptr, indices[3];
+
+ indices[0] = lp_build_const_int32(gallivm, 0);
+ indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_TAGS);
+ indices[2] = index;
+ member_ptr = LLVMBuildGEP(builder, ptr, indices, Elements(indices), "");
+ return LLVMBuildLoad(builder, member_ptr, "tag_data");
+}
+
+
+static void
+update_cached_block(struct gallivm_state *gallivm,
+ const struct util_format_description *format_desc,
+ LLVMValueRef ptr_addr,
+ LLVMValueRef hash_index,
+ LLVMValueRef cache)
+
+{
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
+ LLVMTypeRef pi8t = LLVMPointerType(i8t, 0);
+ LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
+ LLVMTypeRef i32x4 = LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4);
+ LLVMValueRef function;
+ LLVMValueRef tag_value, tmp_ptr;
+ LLVMValueRef col[4];
+ unsigned i, j;
+
+ /*
+ * Use format_desc->fetch_rgba_8unorm() for each pixel in the block.
+ * This doesn't actually make any sense whatsoever, someone would need
+ * to write a function doing this for all pixels in a block (either as
+ * an external c function or with generated code). Don't ask.
+ */
+
+ {
+ /*
+ * Function to call looks like:
+ * fetch(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j)
+ */
+ LLVMTypeRef ret_type;
+ LLVMTypeRef arg_types[4];
+ LLVMTypeRef function_type;
+
+ assert(format_desc->fetch_rgba_8unorm);
+
+ ret_type = LLVMVoidTypeInContext(gallivm->context);
+ arg_types[0] = pi8t;
+ arg_types[1] = pi8t;
+ arg_types[2] = i32t;
+ arg_types[3] = i32t;
+ function_type = LLVMFunctionType(ret_type, arg_types,
+ Elements(arg_types), 0);
+
+ /* make const pointer for the C fetch_rgba_8unorm function */
+ function = lp_build_const_int_pointer(gallivm,
+ func_to_pointer((func_pointer) format_desc->fetch_rgba_8unorm));
+
+ /* cast the callee pointer to the function's type */
+ function = LLVMBuildBitCast(builder, function,
+ LLVMPointerType(function_type, 0),
+ "cast callee");
+ }
+
+ tmp_ptr = lp_build_array_alloca(gallivm, i32x4,
+ lp_build_const_int32(gallivm, 16),
+ "tmp_decode_store");
+ tmp_ptr = LLVMBuildBitCast(builder, tmp_ptr, pi8t, "");
+
+ /*
+ * Invoke format_desc->fetch_rgba_8unorm() for each pixel.
+ * This is going to be really really slow.
+ * Note: the block store format is actually
+ * x0y0x0y1x0y2x0y3 x1y0x1y1x1y2x1y3 ...
+ */
+ for (i = 0; i < 4; ++i) {
+ for (j = 0; j < 4; ++j) {
+ LLVMValueRef args[4];
+ LLVMValueRef dst_offset = lp_build_const_int32(gallivm, (i * 4 + j) * 4);
+
+ /*
+ * Note we actually supply a pointer to the start of the block,
+ * not the start of the texture.
+ */
+ args[0] = LLVMBuildGEP(gallivm->builder, tmp_ptr, &dst_offset, 1, "");
+ args[1] = ptr_addr;
+ args[2] = LLVMConstInt(i32t, i, 0);
+ args[3] = LLVMConstInt(i32t, j, 0);
+ LLVMBuildCall(builder, function, args, Elements(args), "");
+ }
+ }
+
+ /* Finally store the block - pointless mem copy + update tag. */
+ tmp_ptr = LLVMBuildBitCast(builder, tmp_ptr, LLVMPointerType(i32x4, 0), "");
+ for (i = 0; i < 4; ++i) {
+ LLVMValueRef tmp_offset = lp_build_const_int32(gallivm, i);
+ LLVMValueRef ptr = LLVMBuildGEP(gallivm->builder, tmp_ptr, &tmp_offset, 1, "");
+ col[i] = LLVMBuildLoad(builder, ptr, "");
+ }
+
+ tag_value = LLVMBuildPtrToInt(gallivm->builder, ptr_addr,
+ LLVMInt64TypeInContext(gallivm->context), "");
+ store_cached_block(gallivm, col, tag_value, hash_index, cache);
+}
+
+
+/*
+ * Do a cached lookup.
+ *
+ * Returns (vectors of) 4x8 rgba aos value
+ */
+LLVMValueRef
+lp_build_fetch_cached_texels(struct gallivm_state *gallivm,
+ const struct util_format_description *format_desc,
+ unsigned n,
+ LLVMValueRef base_ptr,
+ LLVMValueRef offset,
+ LLVMValueRef i,
+ LLVMValueRef j,
+ LLVMValueRef cache)
+
+{
+ LLVMBuilderRef builder = gallivm->builder;
+ unsigned count, low_bit, log2size;
+ LLVMValueRef color, offset_stored, addr, ptr_addrtrunc, tmp;
+ LLVMValueRef ij_index, hash_index, hash_mask, block_index;
+ LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
+ LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
+ LLVMTypeRef i64t = LLVMInt64TypeInContext(gallivm->context);
+ struct lp_type type;
+ struct lp_build_context bld32;
+ memset(&type, 0, sizeof type);
+ type.width = 32;
+ type.length = n;
+
+ assert(format_desc->block.width == 4);
+ assert(format_desc->block.height == 4);
+
+ lp_build_context_init(&bld32, gallivm, type);
+
+ /*
+ * compute hash - we use direct mapped cache, the hash function could
+ * be better but it needs to be simple
+ * per-element:
+ * compare offset with offset stored at tag (hash)
+ * if not equal decode/store block, update tag
+ * extract color from cache
+ * assemble result vector
+ */
+
+ /* TODO: not ideal with 32bit pointers... */
+
+ low_bit = util_logbase2(format_desc->block.bits / 8);
+ log2size = util_logbase2(LP_BUILD_FORMAT_CACHE_SIZE);
+ addr = LLVMBuildPtrToInt(builder, base_ptr, i64t, "");
+ ptr_addrtrunc = LLVMBuildPtrToInt(builder, base_ptr, i32t, "");
+ ptr_addrtrunc = lp_build_broadcast_scalar(&bld32, ptr_addrtrunc);
+ /* For the hash function, first mask off the unused lowest bits. Then just
+ do some xor with address bits - only use lower 32bits */
+ ptr_addrtrunc = LLVMBuildAdd(builder, offset, ptr_addrtrunc, "");
+ ptr_addrtrunc = LLVMBuildLShr(builder, ptr_addrtrunc,
+ lp_build_const_int_vec(gallivm, type, low_bit), "");
+ /* This only really makes sense for size 64,128,256 */
+ hash_index = ptr_addrtrunc;
+ ptr_addrtrunc = LLVMBuildLShr(builder, ptr_addrtrunc,
+ lp_build_const_int_vec(gallivm, type, 2*log2size), "");
+ hash_index = LLVMBuildXor(builder, ptr_addrtrunc, hash_index, "");
+ tmp = LLVMBuildLShr(builder, hash_index,
+ lp_build_const_int_vec(gallivm, type, log2size), "");
+ hash_index = LLVMBuildXor(builder, hash_index, tmp, "");
+
+ hash_mask = lp_build_const_int_vec(gallivm, type, LP_BUILD_FORMAT_CACHE_SIZE - 1);
+ hash_index = LLVMBuildAnd(builder, hash_index, hash_mask, "");
+ ij_index = LLVMBuildShl(builder, i, lp_build_const_int_vec(gallivm, type, 2), "");
+ ij_index = LLVMBuildAdd(builder, ij_index, j, "");
+ block_index = LLVMBuildShl(builder, hash_index,
+ lp_build_const_int_vec(gallivm, type, 4), "");
+ block_index = LLVMBuildAdd(builder, ij_index, block_index, "");
+
+ if (n > 1) {
+ color = LLVMGetUndef(LLVMVectorType(i32t, n));
+ for (count = 0; count < n; count++) {
+ LLVMValueRef index, cond, colorx;
+ LLVMValueRef block_indexx, hash_indexx, addrx, offsetx, ptr_addrx;
+ struct lp_build_if_state if_ctx;
+
+ index = lp_build_const_int32(gallivm, count);
+ offsetx = LLVMBuildExtractElement(builder, offset, index, "");
+ addrx = LLVMBuildZExt(builder, offsetx, i64t, "");
+ addrx = LLVMBuildAdd(builder, addrx, addr, "");
+ block_indexx = LLVMBuildExtractElement(builder, block_index, index, "");
+ hash_indexx = LLVMBuildLShr(builder, block_indexx,
+ lp_build_const_int32(gallivm, 4), "");
+ offset_stored = lookup_tag_data(gallivm, cache, hash_indexx);
+ cond = LLVMBuildICmp(builder, LLVMIntNE, offset_stored, addrx, "");
+
+ lp_build_if(&if_ctx, gallivm, cond);
+ {
+ ptr_addrx = LLVMBuildIntToPtr(builder, addrx,
+ LLVMPointerType(i8t, 0), "");
+ update_cached_block(gallivm, format_desc, ptr_addrx, hash_indexx, cache);
+#if LP_BUILD_FORMAT_CACHE_DEBUG
+ update_cache_access(gallivm, cache, 1,
+ LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);
+#endif
+ }
+ lp_build_endif(&if_ctx);
+
+ colorx = lookup_cached_pixel(gallivm, cache, block_indexx);
+
+ color = LLVMBuildInsertElement(builder, color, colorx,
+ lp_build_const_int32(gallivm, count), "");
+ }
+ }
+ else {
+ LLVMValueRef cond;
+ struct lp_build_if_state if_ctx;
+
+ tmp = LLVMBuildZExt(builder, offset, i64t, "");
+ addr = LLVMBuildAdd(builder, tmp, addr, "");
+ offset_stored = lookup_tag_data(gallivm, cache, hash_index);
+ cond = LLVMBuildICmp(builder, LLVMIntNE, offset_stored, addr, "");
+
+ lp_build_if(&if_ctx, gallivm, cond);
+ {
+ tmp = LLVMBuildIntToPtr(builder, addr, LLVMPointerType(i8t, 0), "");
+ update_cached_block(gallivm, format_desc, tmp, hash_index, cache);
+#if LP_BUILD_FORMAT_CACHE_DEBUG
+ update_cache_access(gallivm, cache, 1,
+ LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);
+#endif
+ }
+ lp_build_endif(&if_ctx);
+
+ color = lookup_cached_pixel(gallivm, cache, block_index);
+ }
+#if LP_BUILD_FORMAT_CACHE_DEBUG
+ update_cache_access(gallivm, cache, n,
+ LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL);
+#endif
+ return LLVMBuildBitCast(builder, color, LLVMVectorType(i8t, n * 4), "");
+}
+
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
index afaabc08790..42aef8376f8 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
@@ -346,6 +346,7 @@ lp_build_rgba8_to_fi32_soa(struct gallivm_state *gallivm,
* \param i, j the sub-block pixel coordinates. For non-compressed formats
* these will always be (0,0). For compressed formats, i will
* be in [0, block_width-1] and j will be in [0, block_height-1].
+ * \param cache optional value pointing to a lp_build_format_cache structure
*/
void
lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
@@ -355,6 +356,7 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
LLVMValueRef offset,
LLVMValueRef i,
LLVMValueRef j,
+ LLVMValueRef cache,
LLVMValueRef rgba_out[4])
{
LLVMBuilderRef builder = gallivm->builder;
@@ -473,7 +475,7 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
tmp_type.norm = TRUE;
tmp = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
- TRUE, base_ptr, offset, i, j);
+ TRUE, base_ptr, offset, i, j, cache);
lp_build_rgba8_to_fi32_soa(gallivm,
type,
@@ -483,6 +485,37 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
return;
}
+ if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC &&
+ /* non-srgb case is already handled above */
+ format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB &&
+ type.floating && type.width == 32 &&
+ (type.length == 1 || (type.length % 4 == 0)) &&
+ cache) {
+ const struct util_format_description *format_decompressed;
+ LLVMValueRef packed;
+ packed = lp_build_fetch_cached_texels(gallivm,
+ format_desc,
+ type.length,
+ base_ptr,
+ offset,
+ i, j,
+ cache);
+ packed = LLVMBuildBitCast(builder, packed,
+ lp_build_int_vec_type(gallivm, type), "");
+ /*
+ * The values are now packed so they match ordinary srgb RGBA8 format,
+ * hence need to use matching format for unpack.
+ */
+ format_decompressed = util_format_description(PIPE_FORMAT_R8G8B8A8_SRGB);
+
+ lp_build_unpack_rgba_soa(gallivm,
+ format_decompressed,
+ type,
+ packed, rgba_out);
+
+ return;
+ }
+
/*
* Fallback to calling lp_build_fetch_rgba_aos for each pixel.
*
@@ -524,7 +557,7 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
/* Get a single float[4]={R,G,B,A} pixel */
tmp = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
TRUE, base_ptr, offset_elem,
- i_elem, j_elem);
+ i_elem, j_elem, cache);
/*
* Insert the AoS tmp value channels into the SoA result vectors at
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
index eba758da6ae..a6f0eff42f6 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
@@ -99,6 +99,7 @@ struct lp_sampler_params
unsigned sampler_index;
unsigned sample_key;
LLVMValueRef context_ptr;
+ LLVMValueRef thread_data_ptr;
const LLVMValueRef *coords;
const LLVMValueRef *offsets;
LLVMValueRef lod;
@@ -267,6 +268,17 @@ struct lp_sampler_dynamic_state
struct gallivm_state *gallivm,
LLVMValueRef context_ptr,
unsigned sampler_unit);
+
+ /**
+ * Obtain texture cache (returns ptr to lp_build_format_cache).
+ *
+ * It's optional: no caching will be done if it's NULL.
+ */
+ LLVMValueRef
+ (*cache_ptr)(const struct lp_sampler_dynamic_state *state,
+ struct gallivm_state *gallivm,
+ LLVMValueRef thread_data_ptr,
+ unsigned unit);
};
@@ -356,6 +368,7 @@ struct lp_build_sample_context
LLVMValueRef img_stride_array;
LLVMValueRef base_ptr;
LLVMValueRef mip_offsets;
+ LLVMValueRef cache;
/** Integer vector with texture width, height, depth */
LLVMValueRef int_size;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
index d7fde810a76..729c5b8f6ef 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
@@ -593,7 +593,8 @@ lp_build_sample_fetch_image_nearest(struct lp_build_sample_context *bld,
TRUE,
data_ptr, offset,
x_subcoord,
- y_subcoord);
+ y_subcoord,
+ bld->cache);
}
*colors = rgba8;
@@ -933,7 +934,8 @@ lp_build_sample_fetch_image_linear(struct lp_build_sample_context *bld,
TRUE,
data_ptr, offset[k][j][i],
x_subcoord[i],
- y_subcoord[j]);
+ y_subcoord[j],
+ bld->cache);
}
neighbors[k][j][i] = rgba8;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index 26bfa0d2677..e21933ffc85 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -161,6 +161,7 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
bld->texel_type,
data_ptr, offset,
i, j,
+ bld->cache,
texel_out);
/*
@@ -2389,6 +2390,7 @@ lp_build_fetch_texel(struct lp_build_sample_context *bld,
bld->texel_type,
bld->base_ptr, offset,
i, j,
+ bld->cache,
colors_out);
if (out_of_bound_ret_zero) {
@@ -2442,6 +2444,7 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm,
unsigned texture_index,
unsigned sampler_index,
LLVMValueRef context_ptr,
+ LLVMValueRef thread_data_ptr,
const LLVMValueRef *coords,
const LLVMValueRef *offsets,
const struct lp_derivatives *derivs, /* optional */
@@ -2707,6 +2710,11 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm,
context_ptr, texture_index);
/* Note that mip_offsets is an array[level] of offsets to texture images */
+ if (dynamic_state->cache_ptr && thread_data_ptr) {
+ bld.cache = dynamic_state->cache_ptr(dynamic_state, gallivm,
+ thread_data_ptr, texture_index);
+ }
+
/* width, height, depth as single int vector */
if (dims <= 1) {
bld.int_size = tex_width;
@@ -2883,6 +2891,7 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm,
bld4.base_ptr = bld.base_ptr;
bld4.mip_offsets = bld.mip_offsets;
bld4.int_size = bld.int_size;
+ bld4.cache = bld.cache;
bld4.vector_width = lp_type_width(type4);
@@ -3081,12 +3090,14 @@ lp_build_sample_gen_func(struct gallivm_state *gallivm,
LLVMValueRef offsets[3] = { NULL };
LLVMValueRef lod = NULL;
LLVMValueRef context_ptr;
+ LLVMValueRef thread_data_ptr = NULL;
LLVMValueRef texel_out[4];
struct lp_derivatives derivs;
struct lp_derivatives *deriv_ptr = NULL;
unsigned num_param = 0;
unsigned i, num_coords, num_derivs, num_offsets, layer;
enum lp_sampler_lod_control lod_control;
+ boolean need_cache = FALSE;
lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >>
LP_SAMPLER_LOD_CONTROL_SHIFT;
@@ -3094,8 +3105,19 @@ lp_build_sample_gen_func(struct gallivm_state *gallivm,
get_target_info(static_texture_state->target,
&num_coords, &num_derivs, &num_offsets, &layer);
+ if (dynamic_state->cache_ptr) {
+ const struct util_format_description *format_desc;
+ format_desc = util_format_description(static_texture_state->format);
+ if (format_desc && format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
+ need_cache = TRUE;
+ }
+ }
+
/* "unpack" arguments */
context_ptr = LLVMGetParam(function, num_param++);
+ if (need_cache) {
+ thread_data_ptr = LLVMGetParam(function, num_param++);
+ }
for (i = 0; i < num_coords; i++) {
coords[i] = LLVMGetParam(function, num_param++);
}
@@ -3146,6 +3168,7 @@ lp_build_sample_gen_func(struct gallivm_state *gallivm,
texture_index,
sampler_index,
context_ptr,
+ thread_data_ptr,
coords,
offsets,
deriv_ptr,
@@ -3189,6 +3212,7 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm,
const LLVMValueRef *offsets = params->offsets;
const struct lp_derivatives *derivs = params->derivs;
enum lp_sampler_lod_control lod_control;
+ boolean need_cache = FALSE;
lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >>
LP_SAMPLER_LOD_CONTROL_SHIFT;
@@ -3196,6 +3220,17 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm,
get_target_info(static_texture_state->target,
&num_coords, &num_derivs, &num_offsets, &layer);
+ if (dynamic_state->cache_ptr) {
+ const struct util_format_description *format_desc;
+ format_desc = util_format_description(static_texture_state->format);
+ if (format_desc && format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
+ /*
+ * This is not 100% correct, if we have cache but the
+ * util_format_s3tc_prefer is true the cache won't get used
+ * regardless (could hook up the block decode there...) */
+ need_cache = TRUE;
+ }
+ }
/*
* texture function matches are found by name.
* Thus the name has to include both the texture and sampler unit
@@ -3221,6 +3256,9 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm,
*/
arg_types[num_param++] = LLVMTypeOf(params->context_ptr);
+ if (need_cache) {
+ arg_types[num_param++] = LLVMTypeOf(params->thread_data_ptr);
+ }
for (i = 0; i < num_coords; i++) {
arg_types[num_param++] = LLVMTypeOf(coords[0]);
assert(LLVMTypeOf(coords[0]) == LLVMTypeOf(coords[i]));
@@ -3280,6 +3318,9 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm,
num_args = 0;
args[num_args++] = params->context_ptr;
+ if (need_cache) {
+ args[num_args++] = params->thread_data_ptr;
+ }
for (i = 0; i < num_coords; i++) {
args[num_args++] = coords[i];
}
@@ -3384,6 +3425,7 @@ lp_build_sample_soa(const struct lp_static_texture_state *static_texture_state,
params->texture_index,
params->sampler_index,
params->context_ptr,
+ params->thread_data_ptr,
params->coords,
params->offsets,
params->derivs,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
index 2ca9c6194b3..cc4549778a3 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
@@ -230,6 +230,7 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
const LLVMValueRef (*inputs)[4],
LLVMValueRef (*outputs)[4],
LLVMValueRef context_ptr,
+ LLVMValueRef thread_data_ptr,
struct lp_build_sampler_soa *sampler,
const struct tgsi_shader_info *info,
const struct lp_build_tgsi_gs_iface *gs_iface);
@@ -447,6 +448,7 @@ struct lp_build_tgsi_soa_context
const LLVMValueRef (*inputs)[TGSI_NUM_CHANNELS];
LLVMValueRef (*outputs)[TGSI_NUM_CHANNELS];
LLVMValueRef context_ptr;
+ LLVMValueRef thread_data_ptr;
const struct lp_build_sampler_soa *sampler;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index fae604e2f9c..7d2cd9a9e73 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -2321,6 +2321,7 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
params.texture_index = unit;
params.sampler_index = unit;
params.context_ptr = bld->context_ptr;
+ params.thread_data_ptr = bld->thread_data_ptr;
params.coords = coords;
params.offsets = offsets;
params.lod = lod;
@@ -2488,6 +2489,7 @@ emit_sample(struct lp_build_tgsi_soa_context *bld,
params.texture_index = texture_unit;
params.sampler_index = sampler_unit;
params.context_ptr = bld->context_ptr;
+ params.thread_data_ptr = bld->thread_data_ptr;
params.coords = coords;
params.offsets = offsets;
params.lod = lod;
@@ -2608,6 +2610,7 @@ emit_fetch_texels( struct lp_build_tgsi_soa_context *bld,
params.texture_index = unit;
params.sampler_index = unit;
params.context_ptr = bld->context_ptr;
+ params.thread_data_ptr = bld->thread_data_ptr;
params.coords = coords;
params.offsets = offsets;
params.derivs = NULL;
@@ -3858,6 +3861,7 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
const LLVMValueRef (*inputs)[TGSI_NUM_CHANNELS],
LLVMValueRef (*outputs)[TGSI_NUM_CHANNELS],
LLVMValueRef context_ptr,
+ LLVMValueRef thread_data_ptr,
struct lp_build_sampler_soa *sampler,
const struct tgsi_shader_info *info,
const struct lp_build_tgsi_gs_iface *gs_iface)
@@ -3893,6 +3897,7 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
bld.bld_base.info = info;
bld.indirect_files = info->indirect_files;
bld.context_ptr = context_ptr;
+ bld.thread_data_ptr = thread_data_ptr;
/*
* If the number of temporaries is rather large then we just
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.c b/src/gallium/drivers/llvmpipe/lp_jit.c
index 9acde4f1b06..b915c1d64ff 100644
--- a/src/gallium/drivers/llvmpipe/lp_jit.c
+++ b/src/gallium/drivers/llvmpipe/lp_jit.c
@@ -36,6 +36,7 @@
#include "util/u_memory.h"
#include "gallivm/lp_bld_init.h"
#include "gallivm/lp_bld_debug.h"
+#include "gallivm/lp_bld_format.h"
#include "lp_context.h"
#include "lp_jit.h"
@@ -208,6 +209,8 @@ lp_jit_create_types(struct lp_fragment_shader_variant *lp)
LLVMTypeRef elem_types[LP_JIT_THREAD_DATA_COUNT];
LLVMTypeRef thread_data_type;
+ elem_types[LP_JIT_THREAD_DATA_CACHE] =
+ LLVMPointerType(lp_build_format_cache_type(gallivm), 0);
elem_types[LP_JIT_THREAD_DATA_COUNTER] = LLVMInt64TypeInContext(lc);
elem_types[LP_JIT_THREAD_DATA_RASTER_STATE_VIEWPORT_INDEX] =
LLVMInt32TypeInContext(lc);
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.h b/src/gallium/drivers/llvmpipe/lp_jit.h
index 097fa7dce7c..9db26f2cba9 100644
--- a/src/gallium/drivers/llvmpipe/lp_jit.h
+++ b/src/gallium/drivers/llvmpipe/lp_jit.h
@@ -43,6 +43,7 @@
#include "lp_texture.h"
+struct lp_build_format_cache;
struct lp_fragment_shader_variant;
struct llvmpipe_screen;
@@ -189,6 +190,7 @@ enum {
struct lp_jit_thread_data
{
+ struct lp_build_format_cache *cache;
uint64_t vis_counter;
/*
@@ -201,12 +203,16 @@ struct lp_jit_thread_data
enum {
- LP_JIT_THREAD_DATA_COUNTER = 0,
+ LP_JIT_THREAD_DATA_CACHE = 0,
+ LP_JIT_THREAD_DATA_COUNTER,
LP_JIT_THREAD_DATA_RASTER_STATE_VIEWPORT_INDEX,
LP_JIT_THREAD_DATA_COUNT
};
+#define lp_jit_thread_data_cache(_gallivm, _ptr) \
+ lp_build_struct_get(_gallivm, _ptr, LP_JIT_THREAD_DATA_CACHE, "cache")
+
#define lp_jit_thread_data_counter(_gallivm, _ptr) \
lp_build_struct_get_ptr(_gallivm, _ptr, LP_JIT_THREAD_DATA_COUNTER, "counter")
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c b/src/gallium/drivers/llvmpipe/lp_rast.c
index c726707c062..d22e50777fa 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast.c
@@ -43,6 +43,7 @@
#include "lp_query.h"
#include "lp_rast.h"
#include "lp_rast_priv.h"
+#include "gallivm/lp_bld_format.h"
#include "gallivm/lp_bld_debug.h"
#include "lp_scene.h"
#include "lp_tex_sample.h"
@@ -664,6 +665,17 @@ rasterize_scene(struct lp_rasterizer_task *task,
{
task->scene = scene;
+ /* Clear the cache tags. This should not always be necessary but
+ simpler for now. */
+#if LP_USE_TEXTURE_CACHE
+ memset(task->thread_data.cache->cache_tags, 0,
+ sizeof(task->thread_data.cache->cache_tags));
+#if LP_BUILD_FORMAT_CACHE_DEBUG
+ task->thread_data.cache->cache_access_total = 0;
+ task->thread_data.cache->cache_access_miss = 0;
+#endif
+#endif
+
if (!task->rast->no_rast && !scene->discard) {
/* loop over scene bins, rasterize each */
{
@@ -679,6 +691,20 @@ rasterize_scene(struct lp_rasterizer_task *task,
}
+#if LP_BUILD_FORMAT_CACHE_DEBUG
+ {
+ uint64_t total, miss;
+ total = task->thread_data.cache->cache_access_total;
+ miss = task->thread_data.cache->cache_access_miss;
+ if (total) {
+ debug_printf("thread %d cache access %llu miss %llu hit rate %f\n",
+ task->thread_index, (long long unsigned)total,
+ (long long unsigned)miss,
+ (float)(total - miss)/(float)total);
+ }
+ }
+#endif
+
if (scene->fence) {
lp_fence_signal(scene->fence);
}
@@ -866,10 +892,15 @@ lp_rast_create( unsigned num_threads )
goto no_full_scenes;
}
- for (i = 0; i < Elements(rast->tasks); i++) {
+ for (i = 0; i < MAX2(1, num_threads); i++) {
struct lp_rasterizer_task *task = &rast->tasks[i];
task->rast = rast;
task->thread_index = i;
+ task->thread_data.cache = align_malloc(sizeof(struct lp_build_format_cache),
+ 16);
+ if (!task->thread_data.cache) {
+ goto no_thread_data_cache;
+ }
}
rast->num_threads = num_threads;
@@ -885,6 +916,14 @@ lp_rast_create( unsigned num_threads )
return rast;
+no_thread_data_cache:
+ /* rast->num_threads is only assigned after the allocation loop; use that loop's bound. */
+ for (i = 0; i < MAX2(1, num_threads); i++) {
+ if (rast->tasks[i].thread_data.cache)
+ align_free(rast->tasks[i].thread_data.cache);
+ }
+
+ lp_scene_queue_destroy(rast->full_scenes);
no_full_scenes:
FREE(rast);
no_rast:
@@ -923,6 +962,9 @@ void lp_rast_destroy( struct lp_rasterizer *rast )
pipe_semaphore_destroy(&rast->tasks[i].work_ready);
pipe_semaphore_destroy(&rast->tasks[i].work_done);
}
+ for (i = 0; i < MAX2(1, rast->num_threads); i++) {
+ align_free(rast->tasks[i].thread_data.cache);
+ }
/* for synchronizing rasterization threads */
pipe_barrier_destroy( &rast->barrier );
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index fd6c49aacd8..f55f6b4fa4f 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -421,7 +421,7 @@ generate_fs_loop(struct gallivm_state *gallivm,
lp_build_tgsi_soa(gallivm, tokens, type, &mask,
consts_ptr, num_consts_ptr, &system_values,
interp->inputs,
- outputs, context_ptr,
+ outputs, context_ptr, thread_data_ptr,
sampler, &shader->info.base, NULL);
/* Alpha test */
@@ -2303,8 +2303,8 @@ generate_fragment(struct llvmpipe_context *lp,
lp_build_name(dady_ptr, "dady");
lp_build_name(color_ptr_ptr, "color_ptr_ptr");
lp_build_name(depth_ptr, "depth");
- lp_build_name(thread_data_ptr, "thread_data");
lp_build_name(mask_input, "mask_input");
+ lp_build_name(thread_data_ptr, "thread_data");
lp_build_name(stride_ptr, "stride_ptr");
lp_build_name(depth_stride, "depth_stride");
diff --git a/src/gallium/drivers/llvmpipe/lp_test_format.c b/src/gallium/drivers/llvmpipe/lp_test_format.c
index d9abd1ae37c..0640a217874 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_format.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_format.c
@@ -44,6 +44,9 @@
#include "lp_test.h"
+#define USE_TEXTURE_CACHE 1
+
+static struct lp_build_format_cache *cache_ptr;
void
write_tsv_header(FILE *fp)
@@ -71,7 +74,7 @@ write_tsv_row(FILE *fp,
typedef void
(*fetch_ptr_t)(void *unpacked, const void *packed,
- unsigned i, unsigned j);
+ unsigned i, unsigned j, struct lp_build_format_cache *cache);
static LLVMValueRef
@@ -83,7 +86,7 @@ add_fetch_rgba_test(struct gallivm_state *gallivm, unsigned verbose,
LLVMContextRef context = gallivm->context;
LLVMModuleRef module = gallivm->module;
LLVMBuilderRef builder = gallivm->builder;
- LLVMTypeRef args[4];
+ LLVMTypeRef args[5];
LLVMValueRef func;
LLVMValueRef packed_ptr;
LLVMValueRef offset = LLVMConstNull(LLVMInt32TypeInContext(context));
@@ -92,6 +95,7 @@ add_fetch_rgba_test(struct gallivm_state *gallivm, unsigned verbose,
LLVMValueRef j;
LLVMBasicBlockRef block;
LLVMValueRef rgba;
+ LLVMValueRef cache = NULL;
util_snprintf(name, sizeof name, "fetch_%s_%s", desc->short_name,
type.floating ? "float" : "unorm8");
@@ -99,6 +103,7 @@ add_fetch_rgba_test(struct gallivm_state *gallivm, unsigned verbose,
args[0] = LLVMPointerType(lp_build_vec_type(gallivm, type), 0);
args[1] = LLVMPointerType(LLVMInt8TypeInContext(context), 0);
args[3] = args[2] = LLVMInt32TypeInContext(context);
+ args[4] = LLVMPointerType(lp_build_format_cache_type(gallivm), 0);
func = LLVMAddFunction(module, name,
LLVMFunctionType(LLVMVoidTypeInContext(context),
@@ -109,11 +114,15 @@ add_fetch_rgba_test(struct gallivm_state *gallivm, unsigned verbose,
i = LLVMGetParam(func, 2);
j = LLVMGetParam(func, 3);
+ if (cache_ptr) {
+ cache = LLVMGetParam(func, 4);
+ }
+
block = LLVMAppendBasicBlockInContext(context, func, "entry");
LLVMPositionBuilderAtEnd(builder, block);
rgba = lp_build_fetch_rgba_aos(gallivm, desc, type, TRUE,
- packed_ptr, offset, i, j);
+ packed_ptr, offset, i, j, cache);
LLVMBuildStore(builder, rgba, rgba_ptr);
@@ -170,7 +179,7 @@ test_format_float(unsigned verbose, FILE *fp,
memset(unpacked, 0, sizeof unpacked);
- fetch_ptr(unpacked, packed, j, i);
+ fetch_ptr(unpacked, packed, j, i, cache_ptr);
for(k = 0; k < 4; ++k) {
if (util_double_inf_sign(test->unpacked[i][j][k]) != util_inf_sign(unpacked[k])) {
@@ -187,6 +196,11 @@ test_format_float(unsigned verbose, FILE *fp,
}
}
+ /* Ignore errors in S3TC for now */
+ if (desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
+ match = TRUE;
+ }
+
if (!match) {
printf("FAILED\n");
printf(" Packed: %02x %02x %02x %02x\n",
@@ -261,7 +275,7 @@ test_format_unorm8(unsigned verbose, FILE *fp,
memset(unpacked, 0, sizeof unpacked);
- fetch_ptr(unpacked, packed, j, i);
+ fetch_ptr(unpacked, packed, j, i, cache_ptr);
match = TRUE;
for(k = 0; k < 4; ++k) {
@@ -277,6 +291,11 @@ test_format_unorm8(unsigned verbose, FILE *fp,
match = FALSE;
}
+ /* Ignore errors in S3TC as we only implement a poor man approach */
+ if (desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
+ match = TRUE;
+ }
+
if (!match) {
printf("FAILED\n");
printf(" Packed: %02x %02x %02x %02x\n",
@@ -334,6 +353,10 @@ test_all(unsigned verbose, FILE *fp)
util_format_s3tc_init();
+#if USE_TEXTURE_CACHE
+ cache_ptr = align_malloc(sizeof(struct lp_build_format_cache), 16);
+#endif
+
for (format = 1; format < PIPE_FORMAT_COUNT; ++format) {
const struct util_format_description *format_desc;
@@ -363,6 +386,9 @@ test_all(unsigned verbose, FILE *fp)
success = FALSE;
}
}
+#if USE_TEXTURE_CACHE
+ align_free(cache_ptr);
+#endif
return success;
}
diff --git a/src/gallium/drivers/llvmpipe/lp_tex_sample.c b/src/gallium/drivers/llvmpipe/lp_tex_sample.c
index 316d1c55082..217abe963b7 100644
--- a/src/gallium/drivers/llvmpipe/lp_tex_sample.c
+++ b/src/gallium/drivers/llvmpipe/lp_tex_sample.c
@@ -221,6 +221,21 @@ LP_LLVM_SAMPLER_MEMBER(lod_bias, LP_JIT_SAMPLER_LOD_BIAS, TRUE)
LP_LLVM_SAMPLER_MEMBER(border_color, LP_JIT_SAMPLER_BORDER_COLOR, FALSE)
+#if LP_USE_TEXTURE_CACHE
+static LLVMValueRef
+lp_llvm_texture_cache_ptr(const struct lp_sampler_dynamic_state *base,
+ struct gallivm_state *gallivm,
+ LLVMValueRef thread_data_ptr,
+ unsigned unit)
+{
+ /* We use the same cache for all units */
+ (void)unit;
+
+ return lp_jit_thread_data_cache(gallivm, thread_data_ptr);
+}
+#endif
+
+
static void
lp_llvm_sampler_soa_destroy(struct lp_build_sampler_soa *sampler)
{
@@ -314,6 +329,10 @@ lp_llvm_sampler_soa_create(const struct lp_sampler_static_state *static_state)
sampler->dynamic_state.base.lod_bias = lp_llvm_sampler_lod_bias;
sampler->dynamic_state.base.border_color = lp_llvm_sampler_border_color;
+#if LP_USE_TEXTURE_CACHE
+ sampler->dynamic_state.base.cache_ptr = lp_llvm_texture_cache_ptr;
+#endif
+
sampler->dynamic_state.static_state = static_state;
return &sampler->base;
diff --git a/src/gallium/drivers/llvmpipe/lp_tex_sample.h b/src/gallium/drivers/llvmpipe/lp_tex_sample.h
index f4aff226ce1..939131e7975 100644
--- a/src/gallium/drivers/llvmpipe/lp_tex_sample.h
+++ b/src/gallium/drivers/llvmpipe/lp_tex_sample.h
@@ -34,6 +34,10 @@
struct lp_sampler_static_state;
+/**
+ * Whether texture cache is used for s3tc textures.
+ */
+#define LP_USE_TEXTURE_CACHE 1
/**
* Pure-LLVM texture sampling code generator.
@@ -42,5 +46,4 @@ struct lp_sampler_static_state;
struct lp_build_sampler_soa *
lp_llvm_sampler_soa_create(const struct lp_sampler_static_state *key);
-
#endif /* LP_TEX_SAMPLE_H */
From fb77da89f51fd82d5cee95400acb20ad74d9e7bc Mon Sep 17 00:00:00 2001
From: Timothy Arceri
Date: Sat, 31 Oct 2015 10:31:37 +1100
Subject: [PATCH 017/287] i965: add support for image AoA
V3: clamp array index to the correct size (the size of the current array
rather than the inner array) Francisco Jerez.
V2: avoid useless zero-initialization and addition for the first AoA level,
avoid redundant temporary, make use of type_size_scalar(), rename aoa_size
to element_size, assign the indirect indexing temporary directly to
image.reladdr, and replace while loop with a for loop. All suggested
by Francisco Jerez.
Reviewed-by: Francisco Jerez
---
src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 30 ++++++++++---------
.../drivers/dri/i965/brw_nir_uniforms.cpp | 2 ++
2 files changed, 18 insertions(+), 14 deletions(-)
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index b6eab069a1f..e7a39ff741c 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -1062,18 +1062,17 @@ fs_visitor::get_nir_image_deref(const nir_deref_var *deref)
fs_reg image(UNIFORM, deref->var->data.driver_location,
BRW_REGISTER_TYPE_UD);
- if (deref->deref.child) {
- const nir_deref_array *deref_array =
- nir_deref_as_array(deref->deref.child);
- assert(deref->deref.child->deref_type == nir_deref_type_array &&
- deref_array->deref.child == NULL);
- const unsigned size = glsl_get_length(deref->var->type);
+ for (const nir_deref *tail = &deref->deref; tail->child;
+ tail = tail->child) {
+ const nir_deref_array *deref_array = nir_deref_as_array(tail->child);
+ assert(tail->child->deref_type == nir_deref_type_array);
+ const unsigned size = glsl_get_length(tail->type);
+ const unsigned element_size = type_size_scalar(deref_array->deref.type);
const unsigned base = MIN2(deref_array->base_offset, size - 1);
-
- image = offset(image, bld, base * BRW_IMAGE_PARAM_SIZE);
+ image = offset(image, bld, base * element_size);
if (deref_array->deref_array_type == nir_deref_array_type_indirect) {
- fs_reg *tmp = new(mem_ctx) fs_reg(vgrf(glsl_type::int_type));
+ fs_reg tmp = vgrf(glsl_type::int_type);
if (devinfo->gen == 7 && !devinfo->is_haswell) {
/* IVB hangs when trying to access an invalid surface index with
@@ -1084,15 +1083,18 @@ fs_visitor::get_nir_image_deref(const nir_deref_var *deref)
* of the possible outcomes of the hang. Clamp the index to
* prevent access outside of the array bounds.
*/
- bld.emit_minmax(*tmp, retype(get_nir_src(deref_array->indirect),
- BRW_REGISTER_TYPE_UD),
+ bld.emit_minmax(tmp, retype(get_nir_src(deref_array->indirect),
+ BRW_REGISTER_TYPE_UD),
fs_reg(size - base - 1), BRW_CONDITIONAL_L);
} else {
- bld.MOV(*tmp, get_nir_src(deref_array->indirect));
+ bld.MOV(tmp, get_nir_src(deref_array->indirect));
}
- bld.MUL(*tmp, *tmp, fs_reg(BRW_IMAGE_PARAM_SIZE));
- image.reladdr = tmp;
+ bld.MUL(tmp, tmp, fs_reg(element_size));
+ if (image.reladdr)
+ bld.ADD(*image.reladdr, *image.reladdr, tmp);
+ else
+ image.reladdr = new(mem_ctx) fs_reg(tmp);
}
}
diff --git a/src/mesa/drivers/dri/i965/brw_nir_uniforms.cpp b/src/mesa/drivers/dri/i965/brw_nir_uniforms.cpp
index d3326e9fb86..87b383919df 100644
--- a/src/mesa/drivers/dri/i965/brw_nir_uniforms.cpp
+++ b/src/mesa/drivers/dri/i965/brw_nir_uniforms.cpp
@@ -98,6 +98,8 @@ brw_nir_setup_glsl_uniform(gl_shader_stage stage, nir_variable *var,
if (storage->type->is_image()) {
brw_setup_image_uniform_values(stage, stage_prog_data,
uniform_index, storage);
+ uniform_index +=
+ BRW_IMAGE_PARAM_SIZE * MAX2(storage->array_elements, 1);
} else {
gl_constant_value *components = storage->storage;
unsigned vector_count = (MAX2(storage->array_elements, 1) *
From 5b75dbd7be09fdc80eff8141ef47c63a6a913c98 Mon Sep 17 00:00:00 2001
From: Timothy Arceri
Date: Fri, 16 Oct 2015 10:28:47 +1100
Subject: [PATCH 018/287] i965: enable ARB_arrays_of_arrays
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Reviewed-by: Samuel Iglesias Gonsálvez
---
src/mesa/drivers/dri/i965/intel_extensions.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c
index 4643ea3e87b..386b63c123d 100644
--- a/src/mesa/drivers/dri/i965/intel_extensions.c
+++ b/src/mesa/drivers/dri/i965/intel_extensions.c
@@ -174,6 +174,7 @@ intelInitExtensions(struct gl_context *ctx)
assert(brw->gen >= 4);
+ ctx->Extensions.ARB_arrays_of_arrays = true;
ctx->Extensions.ARB_buffer_storage = true;
ctx->Extensions.ARB_clear_texture = true;
ctx->Extensions.ARB_clip_control = true;
From 6e3b380387378e9f8e92eed3dc4a95767857b0de Mon Sep 17 00:00:00 2001
From: Timothy Arceri
Date: Fri, 16 Oct 2015 10:28:48 +1100
Subject: [PATCH 019/287] docs: Mark AoA as done for i965
Reviewed-by: Ian Romanick
---
docs/GL3.txt | 4 ++--
docs/relnotes/11.1.0.html | 1 +
2 files changed, 3 insertions(+), 2 deletions(-)
diff --git a/docs/GL3.txt b/docs/GL3.txt
index 7f6b8c9ef27..7abdcd8dea1 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -149,7 +149,7 @@ GL 4.2, GLSL 4.20:
GL 4.3, GLSL 4.30:
- GL_ARB_arrays_of_arrays started (Timothy)
+ GL_ARB_arrays_of_arrays DONE (i965)
GL_ARB_ES3_compatibility DONE (all drivers that support GLSL 3.30)
GL_ARB_clear_buffer_object DONE (all drivers)
GL_ARB_compute_shader in progress (jljusten)
@@ -209,7 +209,7 @@ GL 4.5, GLSL 4.50:
These are the extensions cherry-picked to make GLES 3.1
GLES3.1, GLSL ES 3.1
- GL_ARB_arrays_of_arrays started (Timothy)
+ GL_ARB_arrays_of_arrays DONE (i965)
GL_ARB_compute_shader in progress (jljusten)
GL_ARB_draw_indirect DONE (i965, nvc0, r600, radeonsi, llvmpipe, softpipe)
GL_ARB_explicit_uniform_location DONE (all drivers that support GLSL)
diff --git a/docs/relnotes/11.1.0.html b/docs/relnotes/11.1.0.html
index 7160244fcb4..86549d7672b 100644
--- a/docs/relnotes/11.1.0.html
+++ b/docs/relnotes/11.1.0.html
@@ -44,6 +44,7 @@ Note: some of the new features are only available with certain drivers.
+- GL_ARB_arrays_of_arrays on i965
- GL_ARB_blend_func_extended on freedreno (a3xx)
- GL_ARB_copy_image on radeonsi
- GL_ARB_gpu_shader_fp64 on r600 for Cypress/Cayman/Aruba chips
From f6b3c163f954c4fb5a525af39ce906f63b445e89 Mon Sep 17 00:00:00 2001
From: Timothy Arceri
Date: Wed, 4 Nov 2015 14:50:49 +1100
Subject: [PATCH 020/287] glsl: remove old TODO
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
SSBO support now exists as of commits f24e5e and f408a13dd30.
Reviewed-by: Tapani Pälli
Acked-by: Matt Turner
---
src/glsl/linker.cpp | 5 -----
1 file changed, 5 deletions(-)
diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index 9dcc2a76c9a..3ad295587f8 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -3799,11 +3799,6 @@ build_program_resource_list(struct gl_shader_program *shProg)
return;
}
}
-
- /* TODO - following extensions will require more resource types:
- *
- * GL_ARB_shader_storage_buffer_object
- */
}
/**
From 8e4cf900f0af9eb8a72c81a0e5e393906b11764a Mon Sep 17 00:00:00 2001
From: Timothy Arceri
Date: Wed, 4 Nov 2015 08:41:29 +1100
Subject: [PATCH 021/287] glsl: make sure to only add subroutines to resource
list
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Overlooked in 763cd8c080353.
Reviewed-by: Tapani Pälli
---
src/glsl/linker.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index 3ad295587f8..26c02986be4 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -3776,7 +3776,8 @@ build_program_resource_list(struct gl_shader_program *shProg)
continue;
for (int j = MESA_SHADER_VERTEX; j < MESA_SHADER_STAGES; j++) {
- if (!shProg->UniformStorage[i].opaque[j].active)
+ if (!shProg->UniformStorage[i].opaque[j].active ||
+ !shProg->UniformStorage[i].type->is_subroutine())
continue;
type = _mesa_shader_stage_to_subroutine_uniform((gl_shader_stage)j);
From 13b19aa815661cd17b74c8694b6c466bfaf75740 Mon Sep 17 00:00:00 2001
From: Ryan Houdek
Date: Mon, 2 Nov 2015 19:30:18 -0600
Subject: [PATCH 022/287] mesa: expose support for GL_EXT_buffer_storage
This extension requires ES 3.1 since it relies on glMemoryBarrier.
For testing purposes I temporarily moved glMemoryBarrier to be an ES 3.0
function.
This has been tested with the piglit in the ML and the Dolphin emulator.
Reviewed-by: Ilia Mirkin
---
docs/relnotes/11.1.0.html | 1 +
src/mapi/glapi/gen/es_EXT.xml | 9 +++++++++
src/mesa/main/extensions.c | 1 +
3 files changed, 11 insertions(+)
diff --git a/docs/relnotes/11.1.0.html b/docs/relnotes/11.1.0.html
index 86549d7672b..c35d91f4329 100644
--- a/docs/relnotes/11.1.0.html
+++ b/docs/relnotes/11.1.0.html
@@ -56,6 +56,7 @@ Note: some of the new features are only available with certain drivers.
- GL_ARB_texture_barrier / GL_NV_texture_barrier on i965
- GL_ARB_texture_query_lod on softpipe
- GL_ARB_texture_view on radeonsi
+- GL_EXT_buffer_storage implemented for when ES 3.1 support is gained
- GL_EXT_draw_elements_base_vertex on all drivers
- GL_OES_draw_elements_base_vertex on all drivers
- EGL_KHR_create_context on softpipe, llvmpipe
diff --git a/src/mapi/glapi/gen/es_EXT.xml b/src/mapi/glapi/gen/es_EXT.xml
index bf20e4801cc..9a777a24c61 100644
--- a/src/mapi/glapi/gen/es_EXT.xml
+++ b/src/mapi/glapi/gen/es_EXT.xml
@@ -905,4 +905,13 @@
+
+
+
+
+
+
+
+
+
diff --git a/src/mesa/main/extensions.c b/src/mesa/main/extensions.c
index d964f030ecb..bdc68175bf2 100644
--- a/src/mesa/main/extensions.c
+++ b/src/mesa/main/extensions.c
@@ -222,6 +222,7 @@ static const struct extension extension_table[] = {
{ "GL_EXT_blend_color", o(EXT_blend_color), GLL, 1995 },
{ "GL_EXT_blend_equation_separate", o(EXT_blend_equation_separate), GL, 2003 },
{ "GL_EXT_blend_func_separate", o(EXT_blend_func_separate), GLL, 1999 },
+ { "GL_EXT_buffer_storage", o(ARB_buffer_storage), ES31, 2015 },
{ "GL_EXT_discard_framebuffer", o(dummy_true), ES1 | ES2, 2009 },
{ "GL_EXT_blend_minmax", o(EXT_blend_minmax), GLL | ES1 | ES2, 1995 },
{ "GL_EXT_blend_subtract", o(dummy_true), GLL, 1995 },
From d56a1478a8006af48aa65ab62e676e5f974f1ec3 Mon Sep 17 00:00:00 2001
From: Ben Widawsky
Date: Tue, 13 Oct 2015 20:50:25 -0700
Subject: [PATCH 023/287] i965/meta: Assert fast clears and rep clears never
overlap
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
There is nothing wrong with the code today, but it turns out to be easy to
break when modifying it, and this simple assertion should catch such driver
implementation failures quickly.
Cc: Kristian Høgsberg
Signed-off-by: Ben Widawsky
Reviewed-by: Chad Versace
Reviewed-by: Neil Roberts
---
src/mesa/drivers/dri/i965/brw_meta_fast_clear.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c b/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c
index fbde3f04204..69fe7b4aa5b 100644
--- a/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c
+++ b/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c
@@ -536,6 +536,8 @@ brw_meta_fast_clear(struct brw_context *brw, struct gl_framebuffer *fb,
}
}
+ assert((fast_clear_buffers & rep_clear_buffers) == 0);
+
if (!(fast_clear_buffers | rep_clear_buffers)) {
if (plain_clear_buffers)
/* If we only have plain clears, skip the meta save/restore. */
From c19443bc8b68ef4697ead1998286e42bd4d8a572 Mon Sep 17 00:00:00 2001
From: Roland Scheidegger
Date: Wed, 4 Nov 2015 14:21:43 +0100
Subject: [PATCH 024/287] gallivm: fix sampling for s3tc srgb formats when
using texture cache
This actually stored the values as 8bit linear values in the cache,
then did another srgb->linear conversion...
We don't want to do the former (decoding 8bit srgb values to 8bit linear
completely defeats the purpose of srgb in the first place), so just decode
to 8bit srgb.
Fixes piglit.spec.ext_texture_srgb.texwrap formats-s3tc tests.
---
src/gallium/auxiliary/gallivm/lp_bld_format_soa.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
index 42aef8376f8..8bae94af3d7 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
@@ -492,9 +492,11 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
(type.length == 1 || (type.length % 4 == 0)) &&
cache) {
const struct util_format_description *format_decompressed;
+ const struct util_format_description *flinear_desc;
LLVMValueRef packed;
+ flinear_desc = util_format_description(util_format_linear(format_desc->format));
packed = lp_build_fetch_cached_texels(gallivm,
- format_desc,
+ flinear_desc,
type.length,
base_ptr,
offset,
From c3d7caa1e006f00c3544a79a0be7d78904ce4177 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?=
Date: Thu, 22 Oct 2015 22:22:14 +0200
Subject: [PATCH 025/287] i965: check inst->predicate when clearing flag_live
at dead code eliminate
Detected by Matt Turner while reviewing commit
a59359ecd22154cc2b3f88bb8c599f21af8a3934
Reviewed-by: Matt Turner
---
src/mesa/drivers/dri/i965/brw_fs_dead_code_eliminate.cpp | 2 +-
src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/mesa/drivers/dri/i965/brw_fs_dead_code_eliminate.cpp b/src/mesa/drivers/dri/i965/brw_fs_dead_code_eliminate.cpp
index 4b5548a9dc5..1eaf1478877 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_dead_code_eliminate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_dead_code_eliminate.cpp
@@ -105,7 +105,7 @@ fs_visitor::dead_code_eliminate()
}
}
- if (inst->writes_flag()) {
+ if (inst->writes_flag() && !inst->predicate) {
BITSET_CLEAR(flag_live, inst->flag_subreg);
}
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp b/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp
index 284e0a8d0a5..e8a51d6e066 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp
@@ -145,7 +145,7 @@ vec4_visitor::dead_code_eliminate()
}
}
- if (inst->writes_flag()) {
+ if (inst->writes_flag() && !inst->predicate) {
for (unsigned c = 0; c < 4; c++)
BITSET_CLEAR(flag_live, c);
}
From fa6efbd27d1c725f38e960005d8806521bd58156 Mon Sep 17 00:00:00 2001
From: Brian Paul
Date: Sat, 31 Oct 2015 07:44:23 -0600
Subject: [PATCH 026/287] util/indices: replace #define tokens with enum type
To ease debugging in gdb.
Reviewed-by: Charmaine Lee
---
src/gallium/auxiliary/indices/u_indices.c | 45 ++++-----
src/gallium/auxiliary/indices/u_indices.h | 96 ++++++++++---------
.../auxiliary/indices/u_unfilled_indices.c | 34 +++----
3 files changed, 90 insertions(+), 85 deletions(-)
diff --git a/src/gallium/auxiliary/indices/u_indices.c b/src/gallium/auxiliary/indices/u_indices.c
index c25594b4b7a..436f8f008cb 100644
--- a/src/gallium/auxiliary/indices/u_indices.c
+++ b/src/gallium/auxiliary/indices/u_indices.c
@@ -68,17 +68,18 @@ static void translate_memcpy_uint( const void *in,
* \param out_nr returns number of new vertices
* \param out_translate returns the translation function to use by the caller
*/
-int u_index_translator( unsigned hw_mask,
- unsigned prim,
- unsigned in_index_size,
- unsigned nr,
- unsigned in_pv,
- unsigned out_pv,
- unsigned prim_restart,
- unsigned *out_prim,
- unsigned *out_index_size,
- unsigned *out_nr,
- u_translate_func *out_translate )
+enum indices_mode
+u_index_translator(unsigned hw_mask,
+ unsigned prim,
+ unsigned in_index_size,
+ unsigned nr,
+ unsigned in_pv,
+ unsigned out_pv,
+ unsigned prim_restart,
+ unsigned *out_prim,
+ unsigned *out_index_size,
+ unsigned *out_nr,
+ u_translate_func *out_translate)
{
unsigned in_idx;
unsigned out_idx;
@@ -204,17 +205,17 @@ int u_index_translator( unsigned hw_mask,
* \param out_nr returns new number of vertices to draw
* \param out_generate returns pointer to the generator function
*/
-int u_index_generator( unsigned hw_mask,
- unsigned prim,
- unsigned start,
- unsigned nr,
- unsigned in_pv,
- unsigned out_pv,
- unsigned *out_prim,
- unsigned *out_index_size,
- unsigned *out_nr,
- u_generate_func *out_generate )
-
+enum indices_mode
+u_index_generator(unsigned hw_mask,
+ unsigned prim,
+ unsigned start,
+ unsigned nr,
+ unsigned in_pv,
+ unsigned out_pv,
+ unsigned *out_prim,
+ unsigned *out_index_size,
+ unsigned *out_nr,
+ u_generate_func *out_generate)
{
unsigned out_idx;
diff --git a/src/gallium/auxiliary/indices/u_indices.h b/src/gallium/auxiliary/indices/u_indices.h
index e01201e4b04..4483eb81337 100644
--- a/src/gallium/auxiliary/indices/u_indices.h
+++ b/src/gallium/auxiliary/indices/u_indices.h
@@ -67,66 +67,68 @@ typedef void (*u_generate_func)( unsigned start,
/* Return codes describe the translate/generate operation. Caller may
* be able to reuse translated indices under some circumstances.
*/
-#define U_TRANSLATE_ERROR -1
-#define U_TRANSLATE_NORMAL 1
-#define U_TRANSLATE_MEMCPY 2
-#define U_GENERATE_LINEAR 3
-#define U_GENERATE_REUSABLE 4
-#define U_GENERATE_ONE_OFF 5
-
+enum indices_mode {
+ U_TRANSLATE_ERROR = -1,
+ U_TRANSLATE_NORMAL = 1,
+ U_TRANSLATE_MEMCPY = 2,
+ U_GENERATE_LINEAR = 3,
+ U_GENERATE_REUSABLE= 4,
+ U_GENERATE_ONE_OFF = 5,
+};
void u_index_init( void );
-int u_index_translator( unsigned hw_mask,
- unsigned prim,
- unsigned in_index_size,
- unsigned nr,
- unsigned in_pv, /* API */
- unsigned out_pv, /* hardware */
- unsigned prim_restart,
- unsigned *out_prim,
- unsigned *out_index_size,
- unsigned *out_nr,
- u_translate_func *out_translate );
+enum indices_mode
+u_index_translator(unsigned hw_mask,
+ unsigned prim,
+ unsigned in_index_size,
+ unsigned nr,
+ unsigned in_pv, /* API */
+ unsigned out_pv, /* hardware */
+ unsigned prim_restart,
+ unsigned *out_prim,
+ unsigned *out_index_size,
+ unsigned *out_nr,
+ u_translate_func *out_translate);
/* Note that even when generating it is necessary to know what the
* API's PV is, as the indices generated will depend on whether it is
* the same as hardware or not, and in the case of triangle strips,
* whether it is first or last.
*/
-int u_index_generator( unsigned hw_mask,
- unsigned prim,
- unsigned start,
- unsigned nr,
- unsigned in_pv, /* API */
- unsigned out_pv, /* hardware */
- unsigned *out_prim,
- unsigned *out_index_size,
- unsigned *out_nr,
- u_generate_func *out_generate );
+enum indices_mode
+u_index_generator(unsigned hw_mask,
+ unsigned prim,
+ unsigned start,
+ unsigned nr,
+ unsigned in_pv, /* API */
+ unsigned out_pv, /* hardware */
+ unsigned *out_prim,
+ unsigned *out_index_size,
+ unsigned *out_nr,
+ u_generate_func *out_generate);
void u_unfilled_init( void );
-int u_unfilled_translator( unsigned prim,
- unsigned in_index_size,
- unsigned nr,
- unsigned unfilled_mode,
- unsigned *out_prim,
- unsigned *out_index_size,
- unsigned *out_nr,
- u_translate_func *out_translate );
-
-int u_unfilled_generator( unsigned prim,
- unsigned start,
- unsigned nr,
- unsigned unfilled_mode,
- unsigned *out_prim,
- unsigned *out_index_size,
- unsigned *out_nr,
- u_generate_func *out_generate );
-
-
+enum indices_mode
+u_unfilled_translator(unsigned prim,
+ unsigned in_index_size,
+ unsigned nr,
+ unsigned unfilled_mode,
+ unsigned *out_prim,
+ unsigned *out_index_size,
+ unsigned *out_nr,
+ u_translate_func *out_translate);
+enum indices_mode
+u_unfilled_generator(unsigned prim,
+ unsigned start,
+ unsigned nr,
+ unsigned unfilled_mode,
+ unsigned *out_prim,
+ unsigned *out_index_size,
+ unsigned *out_nr,
+ u_generate_func *out_generate);
#endif
diff --git a/src/gallium/auxiliary/indices/u_unfilled_indices.c b/src/gallium/auxiliary/indices/u_unfilled_indices.c
index 121877a60fb..fc974f8b946 100644
--- a/src/gallium/auxiliary/indices/u_unfilled_indices.c
+++ b/src/gallium/auxiliary/indices/u_unfilled_indices.c
@@ -111,14 +111,15 @@ static unsigned nr_lines( unsigned prim,
-int u_unfilled_translator( unsigned prim,
- unsigned in_index_size,
- unsigned nr,
- unsigned unfilled_mode,
- unsigned *out_prim,
- unsigned *out_index_size,
- unsigned *out_nr,
- u_translate_func *out_translate )
+enum indices_mode
+u_unfilled_translator(unsigned prim,
+ unsigned in_index_size,
+ unsigned nr,
+ unsigned unfilled_mode,
+ unsigned *out_prim,
+ unsigned *out_index_size,
+ unsigned *out_nr,
+ u_translate_func *out_translate)
{
unsigned in_idx;
unsigned out_idx;
@@ -170,14 +171,15 @@ int u_unfilled_translator( unsigned prim,
* different front/back fill modes, that can be handled with the
* 'draw' module.
*/
-int u_unfilled_generator( unsigned prim,
- unsigned start,
- unsigned nr,
- unsigned unfilled_mode,
- unsigned *out_prim,
- unsigned *out_index_size,
- unsigned *out_nr,
- u_generate_func *out_generate )
+enum indices_mode
+u_unfilled_generator(unsigned prim,
+ unsigned start,
+ unsigned nr,
+ unsigned unfilled_mode,
+ unsigned *out_prim,
+ unsigned *out_index_size,
+ unsigned *out_nr,
+ u_generate_func *out_generate)
{
unsigned out_idx;
From 3f98c812b30d739b744d70267a28a25afcaa8b13 Mon Sep 17 00:00:00 2001
From: Brian Paul
Date: Sat, 31 Oct 2015 07:44:49 -0600
Subject: [PATCH 027/287] svga: use new enum indices_mode type
Reviewed-by: Charmaine Lee
---
src/gallium/drivers/svga/svga_draw_arrays.c | 3 ++-
src/gallium/drivers/svga/svga_draw_elements.c | 3 ++-
2 files changed, 4 insertions(+), 2 deletions(-)
diff --git a/src/gallium/drivers/svga/svga_draw_arrays.c b/src/gallium/drivers/svga/svga_draw_arrays.c
index caf4b17de16..acb2e95e747 100644
--- a/src/gallium/drivers/svga/svga_draw_arrays.c
+++ b/src/gallium/drivers/svga/svga_draw_arrays.c
@@ -204,7 +204,8 @@ svga_hwtnl_draw_arrays(struct svga_hwtnl *hwtnl,
unsigned prim, unsigned start, unsigned count,
unsigned start_instance, unsigned instance_count)
{
- unsigned gen_prim, gen_size, gen_nr, gen_type;
+ unsigned gen_prim, gen_size, gen_nr;
+ enum indices_mode gen_type;
u_generate_func gen_func;
enum pipe_error ret = PIPE_OK;
unsigned api_pv = hwtnl->api_pv;
diff --git a/src/gallium/drivers/svga/svga_draw_elements.c b/src/gallium/drivers/svga/svga_draw_elements.c
index 9df8f6e9beb..0213409ef29 100644
--- a/src/gallium/drivers/svga/svga_draw_elements.c
+++ b/src/gallium/drivers/svga/svga_draw_elements.c
@@ -133,7 +133,8 @@ svga_hwtnl_draw_range_elements(struct svga_hwtnl *hwtnl,
unsigned prim, unsigned start, unsigned count,
unsigned start_instance, unsigned instance_count)
{
- unsigned gen_prim, gen_size, gen_nr, gen_type;
+ unsigned gen_prim, gen_size, gen_nr;
+ enum indices_mode gen_type;
u_translate_func gen_func;
enum pipe_error ret = PIPE_OK;
From e450d4371a4166f57a7e412d2c1e68aa1162a703 Mon Sep 17 00:00:00 2001
From: Brian Paul
Date: Wed, 28 Oct 2015 19:02:38 -0600
Subject: [PATCH 028/287] u_vbuf: add some const qualifiers
Trivial.
---
src/gallium/auxiliary/util/u_vbuf.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/gallium/auxiliary/util/u_vbuf.c b/src/gallium/auxiliary/util/u_vbuf.c
index b31ada138b8..9ddd9222e7e 100644
--- a/src/gallium/auxiliary/util/u_vbuf.c
+++ b/src/gallium/auxiliary/util/u_vbuf.c
@@ -998,7 +998,7 @@ u_vbuf_upload_buffers(struct u_vbuf *mgr,
return PIPE_OK;
}
-static boolean u_vbuf_need_minmax_index(struct u_vbuf *mgr)
+static boolean u_vbuf_need_minmax_index(const struct u_vbuf *mgr)
{
/* See if there are any per-vertex attribs which will be uploaded or
* translated. Use bitmasks to get the info instead of looping over vertex
@@ -1009,7 +1009,7 @@ static boolean u_vbuf_need_minmax_index(struct u_vbuf *mgr)
mgr->ve->noninstance_vb_mask_any & mgr->nonzero_stride_vb_mask)) != 0;
}
-static boolean u_vbuf_mapping_vertex_buffer_blocks(struct u_vbuf *mgr)
+static boolean u_vbuf_mapping_vertex_buffer_blocks(const struct u_vbuf *mgr)
{
/* Return true if there are hw buffers which don't need to be translated.
*
From 149ac1fe43a87ee4219f9979dcce2de7964c31a9 Mon Sep 17 00:00:00 2001
From: Brian Paul
Date: Wed, 28 Oct 2015 19:05:27 -0600
Subject: [PATCH 029/287] u_vbuf: minor code reformatting / line wrapping
Trivial.
---
src/gallium/auxiliary/util/u_vbuf.c | 12 ++++++++----
1 file changed, 8 insertions(+), 4 deletions(-)
diff --git a/src/gallium/auxiliary/util/u_vbuf.c b/src/gallium/auxiliary/util/u_vbuf.c
index 9ddd9222e7e..54e9e717104 100644
--- a/src/gallium/auxiliary/util/u_vbuf.c
+++ b/src/gallium/auxiliary/util/u_vbuf.c
@@ -1004,9 +1004,11 @@ static boolean u_vbuf_need_minmax_index(const struct u_vbuf *mgr)
* translated. Use bitmasks to get the info instead of looping over vertex
* elements. */
return (mgr->ve->used_vb_mask &
- ((mgr->user_vb_mask | mgr->incompatible_vb_mask |
+ ((mgr->user_vb_mask |
+ mgr->incompatible_vb_mask |
mgr->ve->incompatible_vb_mask_any) &
- mgr->ve->noninstance_vb_mask_any & mgr->nonzero_stride_vb_mask)) != 0;
+ mgr->ve->noninstance_vb_mask_any &
+ mgr->nonzero_stride_vb_mask)) != 0;
}
static boolean u_vbuf_mapping_vertex_buffer_blocks(const struct u_vbuf *mgr)
@@ -1016,8 +1018,10 @@ static boolean u_vbuf_mapping_vertex_buffer_blocks(const struct u_vbuf *mgr)
* We could query whether each buffer is busy, but that would
* be way more costly than this. */
return (mgr->ve->used_vb_mask &
- (~mgr->user_vb_mask & ~mgr->incompatible_vb_mask &
- mgr->ve->compatible_vb_mask_all & mgr->ve->noninstance_vb_mask_any &
+ (~mgr->user_vb_mask &
+ ~mgr->incompatible_vb_mask &
+ mgr->ve->compatible_vb_mask_all &
+ mgr->ve->noninstance_vb_mask_any &
mgr->nonzero_stride_vb_mask)) != 0;
}
From d31481e70ab0da293d4c3010815f643f161b7168 Mon Sep 17 00:00:00 2001
From: Brian Paul
Date: Tue, 3 Nov 2015 14:34:15 -0700
Subject: [PATCH 030/287] svga: implement 'white_fragments' option for VGPU10
fragment shaders
When we emulate XOR logicop mode with blend-subtract, we need to ensure
that the fragment shader always emits white. We had this implemented
for VGPU9, but not VGPU10.
VMware bug 1545492.
Reviewed-by: Charmaine Lee
---
src/gallium/drivers/svga/svga_tgsi_vgpu10.c | 35 ++++++++++++++++++---
1 file changed, 30 insertions(+), 5 deletions(-)
diff --git a/src/gallium/drivers/svga/svga_tgsi_vgpu10.c b/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
index e70ee689c59..9b7ab16103f 100644
--- a/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
+++ b/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
@@ -2672,6 +2672,7 @@ emit_temporaries_declaration(struct svga_shader_emitter_v10 *emit)
}
else if (emit->unit == PIPE_SHADER_FRAGMENT) {
if (emit->key.fs.alpha_func != SVGA3D_CMP_ALWAYS ||
+ emit->key.fs.white_fragments ||
emit->key.fs.write_color0_to_n_cbufs > 1) {
/* Allocate a temp to hold the output color */
emit->fs.color_tmp_index = total_temps;
@@ -6369,8 +6370,11 @@ emit_alpha_test_instructions(struct svga_shader_emitter_v10 *emit,
emit_src_register(emit, &tmp_src_x);
end_emit_instruction(emit);
- /* If we don't need to broadcast the color below, emit final color here */
- if (emit->key.fs.write_color0_to_n_cbufs <= 1) {
+ /* If we don't need to broadcast the color below or set fragments to
+ * white, emit final color here.
+ */
+ if (emit->key.fs.write_color0_to_n_cbufs <= 1 &&
+ !emit->key.fs.white_fragments) {
/* MOV output.color, tempcolor */
emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &color_dst,
&color_src, FALSE); /* XXX saturate? */
@@ -6380,10 +6384,28 @@ emit_alpha_test_instructions(struct svga_shader_emitter_v10 *emit,
}
+/**
+ * When we need to emit white for all fragments (for emulating XOR logicop
+ * mode), this function copies white into the temporary color output register.
+ */
+static void
+emit_set_color_white(struct svga_shader_emitter_v10 *emit,
+ unsigned fs_color_tmp_index)
+{
+ struct tgsi_full_dst_register color_dst =
+ make_dst_temp_reg(fs_color_tmp_index);
+ struct tgsi_full_src_register white =
+ make_immediate_reg_float(emit, 1.0f);
+
+ emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &color_dst, &white, FALSE);
+}
+
+
/**
* Emit instructions for writing a single color output to multiple
* color buffers.
- * This is used when the TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS
+ * This is used when the TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS (or
+ * when key.fs.white_fragments is true).
* property is set and the number of render targets is greater than one.
* \param fs_color_tmp_index index of the temp register that holds the
* color to broadcast.
@@ -6398,7 +6420,6 @@ emit_broadcast_color_instructions(struct svga_shader_emitter_v10 *emit,
make_src_temp_reg(fs_color_tmp_index);
assert(emit->unit == PIPE_SHADER_FRAGMENT);
- assert(n > 1);
for (i = 0; i < n; i++) {
unsigned output_reg = emit->fs.color_out_index[i];
@@ -6440,7 +6461,11 @@ emit_post_helpers(struct svga_shader_emitter_v10 *emit)
if (emit->key.fs.alpha_func != SVGA3D_CMP_ALWAYS) {
emit_alpha_test_instructions(emit, fs_color_tmp_index);
}
- if (emit->key.fs.write_color0_to_n_cbufs > 1) {
+ if (emit->key.fs.white_fragments) {
+ emit_set_color_white(emit, fs_color_tmp_index);
+ }
+ if (emit->key.fs.write_color0_to_n_cbufs > 1 ||
+ emit->key.fs.white_fragments) {
emit_broadcast_color_instructions(emit, fs_color_tmp_index);
}
}
From bdf6cef0333bf7278e2e2347aaae399288e87dcd Mon Sep 17 00:00:00 2001
From: Brian Paul
Date: Sat, 31 Oct 2015 07:02:36 -0600
Subject: [PATCH 031/287] vbo: fix another GL_LINE_LOOP bug
Very long line loops which spanned 3 or more vertex buffers were not
handled correctly and could result in stray lines.
The piglit lineloop test draws 10000 vertices by default, and is not
long enough to trigger this. Even 'lineloop -count 100000' doesn't
trigger the bug.
For future reference, the issue can be reproduced by changing Mesa's
VBO_VERT_BUFFER_SIZE to 4096 and changing the piglit lineloop test to
use glVertex2f(), draw 3 loops instead of 1, and specifying -count
1023.
Acked-by: Sinclair Yeh
Reviewed-by: Roland Scheidegger
---
src/mesa/vbo/vbo_exec_api.c | 11 +++++++++--
src/mesa/vbo/vbo_exec_draw.c | 1 +
2 files changed, 10 insertions(+), 2 deletions(-)
diff --git a/src/mesa/vbo/vbo_exec_api.c b/src/mesa/vbo/vbo_exec_api.c
index a614b26cae4..7534599c313 100644
--- a/src/mesa/vbo/vbo_exec_api.c
+++ b/src/mesa/vbo/vbo_exec_api.c
@@ -114,6 +114,7 @@ static void vbo_exec_wrap_buffers( struct vbo_exec_context *exec )
if (_mesa_inside_begin_end(exec->ctx)) {
exec->vtx.prim[0].mode = exec->ctx->Driver.CurrentExecPrimitive;
exec->vtx.prim[0].begin = 0;
+ exec->vtx.prim[0].end = 0;
exec->vtx.prim[0].start = 0;
exec->vtx.prim[0].count = 0;
exec->vtx.prim_count++;
@@ -846,17 +847,23 @@ static void GLAPIENTRY vbo_exec_End( void )
/* We're finishing drawing a line loop. Append 0th vertex onto
* end of vertex buffer so we can draw it as a line strip.
*/
- const fi_type *src = exec->vtx.buffer_map;
+ const fi_type *src = exec->vtx.buffer_map +
+ last_prim->start * exec->vtx.vertex_size;
fi_type *dst = exec->vtx.buffer_map +
exec->vtx.vert_count * exec->vtx.vertex_size;
/* copy 0th vertex to end of buffer */
memcpy(dst, src, exec->vtx.vertex_size * sizeof(fi_type));
- assert(last_prim->start == 0);
last_prim->start++; /* skip vertex0 */
/* note that last_prim->count stays unchanged */
last_prim->mode = GL_LINE_STRIP;
+
+ /* Increment the vertex count so the next primitive doesn't
+ * overwrite the last vertex which we just added.
+ */
+ exec->vtx.vert_count++;
+ exec->vtx.buffer_ptr += exec->vtx.vertex_size;
}
try_vbo_merge(exec);
diff --git a/src/mesa/vbo/vbo_exec_draw.c b/src/mesa/vbo/vbo_exec_draw.c
index ed5d9e947b0..0d42618f246 100644
--- a/src/mesa/vbo/vbo_exec_draw.c
+++ b/src/mesa/vbo/vbo_exec_draw.c
@@ -117,6 +117,7 @@ vbo_copy_vertices( struct vbo_exec_context *exec )
* subtract one from last_prim->start) so that we copy the 0th vertex
* to the next vertex buffer.
*/
+ assert(last_prim->start > 0);
src -= sz;
}
/* fall-through */
From 5bbd522452cfe86fc600203fe1a9b056582e2000 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin
Date: Wed, 4 Nov 2015 14:26:37 -0500
Subject: [PATCH 032/287] mesa/tests: add glBufferStorageEXT to ES 3.1 dispatch
list
I thought that aliased functions didn't need to be added, but that might
only be if the function aliases something in the same {desktop,ES}
space. Resolves the dispatch sanity test failure.
Fixes: 13b19aa81 (mesa: expose support for GL_EXT_buffer_storage)
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=92824
Signed-off-by: Ilia Mirkin
---
src/mesa/main/tests/dispatch_sanity.cpp | 3 +++
1 file changed, 3 insertions(+)
diff --git a/src/mesa/main/tests/dispatch_sanity.cpp b/src/mesa/main/tests/dispatch_sanity.cpp
index ac2d2332df8..abe0f432572 100644
--- a/src/mesa/main/tests/dispatch_sanity.cpp
+++ b/src/mesa/main/tests/dispatch_sanity.cpp
@@ -2506,5 +2506,8 @@ const struct function gles31_functions_possible[] = {
/* GL_OES_texture_storage_multisample_2d_array */
{ "glTexStorage3DMultisampleOES", 31, -1 },
+ /* GL_EXT_buffer_storage */
+ { "glBufferStorageEXT", 31, -1 },
+
{ NULL, 0, -1 },
};
From 4a951f1c0847353101d28db583e1dd397fdce9ba Mon Sep 17 00:00:00 2001
From: Eric Anholt
Date: Wed, 4 Nov 2015 13:13:39 -0800
Subject: [PATCH 033/287] vc4: Fix dumping the size of BOs allocated/cached.
60MB of cached BOs are a lot less scary than 600MB.
---
src/gallium/drivers/vc4/vc4_bufmgr.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/gallium/drivers/vc4/vc4_bufmgr.c b/src/gallium/drivers/vc4/vc4_bufmgr.c
index f7b41f5816d..171a5544bea 100644
--- a/src/gallium/drivers/vc4/vc4_bufmgr.c
+++ b/src/gallium/drivers/vc4/vc4_bufmgr.c
@@ -42,9 +42,9 @@ vc4_bo_dump_stats(struct vc4_screen *screen)
struct vc4_bo_cache *cache = &screen->bo_cache;
fprintf(stderr, " BOs allocated: %d\n", screen->bo_count);
- fprintf(stderr, " BOs size: %dkb\n", screen->bo_size / 102);
+ fprintf(stderr, " BOs size: %dkb\n", screen->bo_size / 1024);
fprintf(stderr, " BOs cached: %d\n", cache->bo_count);
- fprintf(stderr, " BOs cached size: %dkb\n", cache->bo_size / 102);
+ fprintf(stderr, " BOs cached size: %dkb\n", cache->bo_size / 1024);
if (!list_empty(&cache->time_list)) {
struct vc4_bo *first = LIST_ENTRY(struct vc4_bo,
From 3f7c96c36cb18a9e4616d373369a130416884bf9 Mon Sep 17 00:00:00 2001
From: Eric Anholt
Date: Wed, 4 Nov 2015 13:10:28 -0800
Subject: [PATCH 034/287] vc4: Print the rounded shader size in debug output.
It's surprising to see "0kb" printed for debug on short shaders, while
4kb alignment won't be surprising.
---
src/gallium/drivers/vc4/vc4_bufmgr.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/gallium/drivers/vc4/vc4_bufmgr.c b/src/gallium/drivers/vc4/vc4_bufmgr.c
index 171a5544bea..52ba8ab19ef 100644
--- a/src/gallium/drivers/vc4/vc4_bufmgr.c
+++ b/src/gallium/drivers/vc4/vc4_bufmgr.c
@@ -428,7 +428,7 @@ vc4_bo_alloc_shader(struct vc4_screen *screen, const void *data, uint32_t size)
screen->bo_count++;
screen->bo_size += bo->size;
if (dump_stats) {
- fprintf(stderr, "Allocated shader %dkb:\n", size / 1024);
+ fprintf(stderr, "Allocated shader %dkb:\n", bo->size / 1024);
vc4_bo_dump_stats(screen);
}
From 6d3a24bce80a32063aedfe568efd5532aea4c875 Mon Sep 17 00:00:00 2001
From: Eric Anholt
Date: Wed, 4 Nov 2015 13:27:16 -0800
Subject: [PATCH 035/287] vc4: When the create ioctl fails, free our cache and
try again.
This greatly increases the pressure you can put on the driver before
create fails. Ultimately we need to let the kernel take control of
our cached BOs and just take them from us (and other clients)
directly, but this is a very easy patch for the moment.
Cc: "11.0"
---
src/gallium/drivers/vc4/vc4_bufmgr.c | 29 +++++++++++++++++++++++-----
1 file changed, 24 insertions(+), 5 deletions(-)
diff --git a/src/gallium/drivers/vc4/vc4_bufmgr.c b/src/gallium/drivers/vc4/vc4_bufmgr.c
index 52ba8ab19ef..2f822f04c21 100644
--- a/src/gallium/drivers/vc4/vc4_bufmgr.c
+++ b/src/gallium/drivers/vc4/vc4_bufmgr.c
@@ -36,6 +36,9 @@
static bool dump_stats = false;
+static void
+vc4_bo_cache_free_all(struct vc4_bo_cache *cache);
+
static void
vc4_bo_dump_stats(struct vc4_screen *screen)
{
@@ -136,6 +139,8 @@ vc4_bo_alloc(struct vc4_screen *screen, uint32_t size, const char *name)
bo->name = name;
bo->private = true;
+ bool cleared_and_retried = false;
+retry:
if (!using_vc4_simulator) {
struct drm_vc4_create_bo create;
memset(&create, 0, sizeof(create));
@@ -157,6 +162,12 @@ vc4_bo_alloc(struct vc4_screen *screen, uint32_t size, const char *name)
assert(create.size >= size);
}
if (ret != 0) {
+ if (!list_empty(&screen->bo_cache.time_list) &&
+ !cleared_and_retried) {
+ cleared_and_retried = true;
+ vc4_bo_cache_free_all(&screen->bo_cache);
+ goto retry;
+ }
fprintf(stderr, "create ioctl failure\n");
abort();
}
@@ -248,6 +259,18 @@ free_stale_bos(struct vc4_screen *screen, time_t time)
}
}
+static void
+vc4_bo_cache_free_all(struct vc4_bo_cache *cache)
+{
+ pipe_mutex_lock(cache->lock);
+ list_for_each_entry_safe(struct vc4_bo, bo, &cache->time_list,
+ time_list) {
+ vc4_bo_remove_from_cache(cache, bo);
+ vc4_bo_free(bo);
+ }
+ pipe_mutex_unlock(cache->lock);
+}
+
void
vc4_bo_last_unreference_locked_timed(struct vc4_bo *bo, time_t time)
{
@@ -600,11 +623,7 @@ vc4_bufmgr_destroy(struct pipe_screen *pscreen)
struct vc4_screen *screen = vc4_screen(pscreen);
struct vc4_bo_cache *cache = &screen->bo_cache;
- list_for_each_entry_safe(struct vc4_bo, bo, &cache->time_list,
- time_list) {
- vc4_bo_remove_from_cache(cache, bo);
- vc4_bo_free(bo);
- }
+ vc4_bo_cache_free_all(cache);
if (dump_stats) {
fprintf(stderr, "BO stats after screen destroy:\n");
From bb73fc4cb82c1abdf47aa373c78c2a85fe29b3ec Mon Sep 17 00:00:00 2001
From: Ilia Mirkin
Date: Wed, 4 Nov 2015 22:42:41 -0500
Subject: [PATCH 036/287] nouveau: relax fence emit space assert
We also have the "reserved for kick" space available. Some of my earlier
changes can probably be removed, but this is a quick fix for some of the
rarer fallout.
Signed-off-by: Ilia Mirkin
Cc:
---
src/gallium/drivers/nouveau/nv30/nv30_screen.c | 2 +-
src/gallium/drivers/nouveau/nv50/nv50_screen.c | 2 +-
src/gallium/drivers/nouveau/nvc0/nvc0_screen.c | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
index bdecb0a32b3..794a0898eaf 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
@@ -353,7 +353,7 @@ nv30_screen_fence_emit(struct pipe_screen *pscreen, uint32_t *sequence)
*sequence = ++screen->base.fence.sequence;
- assert(PUSH_AVAIL(push) >= 3);
+ assert(PUSH_AVAIL(push) + push->rsvd_kick >= 3);
PUSH_DATA (push, NV30_3D_FENCE_OFFSET |
(2 /* size */ << 18) | (7 /* subchan */ << 13));
PUSH_DATA (push, 0);
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
index a9e0c478322..de2150ca08c 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -392,7 +392,7 @@ nv50_screen_fence_emit(struct pipe_screen *pscreen, u32 *sequence)
/* we need to do it after possible flush in MARK_RING */
*sequence = ++screen->base.fence.sequence;
- assert(PUSH_AVAIL(push) >= 5);
+ assert(PUSH_AVAIL(push) + push->rsvd_kick >= 5);
PUSH_DATA (push, NV50_FIFO_PKHDR(NV50_3D(QUERY_ADDRESS_HIGH), 4));
PUSH_DATAh(push, screen->fence.bo->offset);
PUSH_DATA (push, screen->fence.bo->offset);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index 6ad3980911d..3b543929f3c 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -547,7 +547,7 @@ nvc0_screen_fence_emit(struct pipe_screen *pscreen, u32 *sequence)
/* we need to do it after possible flush in MARK_RING */
*sequence = ++screen->base.fence.sequence;
- assert(PUSH_AVAIL(push) >= 5);
+ assert(PUSH_AVAIL(push) + push->rsvd_kick >= 5);
PUSH_DATA (push, NVC0_FIFO_PKHDR_SQ(NVC0_3D(QUERY_ADDRESS_HIGH), 4));
PUSH_DATAh(push, screen->fence.bo->offset);
PUSH_DATA (push, screen->fence.bo->offset);
From 56774e63028b2997a7d8c0abb5009a4c79f9a453 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?=
Date: Tue, 20 Oct 2015 13:08:09 +0200
Subject: [PATCH 037/287] i965/vec4: select predicate based on writemask for
sel emissions
Equivalent to commit 8ac3b525c but with sel operations. In this case
we select the PredCtrl based on the writemask.
This patch helps on cases like this:
1: cmp.l.f0.0 vgrf40.0.x:F, vgrf0.zzzz:F, vgrf7.xxxx:F
2: cmp.nz.f0.0 null:D, vgrf40.xxxx:D, 0D
3: (+f0.0) sel vgrf41.0.x:UD, vgrf6.xxxx:UD, vgrf5.xxxx:UD
In this case, cmod propagation can't optimize instruction #2, because
instructions #1 and #2 have different writemasks, and we can't update
directly instruction #2 writemask because our code thinks that sel at
instruction #3 reads all four channels of the flag, when it actually
only reads .x.
So, with this patch, the previous case becomes this:
1: cmp.l.f0.0 vgrf40.0.x:F, vgrf0.zzzz:F, vgrf7.xxxx:F
2: cmp.nz.f0.0 null:D, vgrf40.xxxx:D, 0D
3: (+f0.0.x) sel vgrf41.0.x:UD, vgrf6.xxxx:UD, vgrf5.xxxx:UD
Now only the x channel of the flag is used, allowing dead code
elimination to update the writemask at the second instruction:
1: cmp.l.f0.0 vgrf40.0.x:F, vgrf0.zzzz:F, vgrf7.xxxx:F
2: cmp.nz.f0.0 null.x:D, vgrf40.xxxx:D, 0D
3: (+f0.0.x) sel vgrf41.0.x:UD, vgrf6.xxxx:UD, vgrf5.xxxx:UD
So now cmod propagation can simplify out #2:
1: cmp.l.f0.0 vgrf40.0.x:F, attr18.wwww:F, vgrf7.xxxx:F
2: (+f0.0.x) sel vgrf41.0.x:UD, vgrf6.xxxx:UD, vgrf5.xxxx:UD
Shader-db numbers:
total instructions in shared programs: 6235835 -> 6228008 (-0.13%)
instructions in affected programs: 219850 -> 212023 (-3.56%)
total loops in shared programs: 1979 -> 1979 (0.00%)
helped: 1192
HURT: 0
---
src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 18 +++++++++++++++++-
1 file changed, 17 insertions(+), 1 deletion(-)
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index 8ca8ddb98fb..b848810ebc7 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -1407,7 +1407,23 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
case nir_op_bcsel:
emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
inst = emit(BRW_OPCODE_SEL, dst, op[1], op[2]);
- inst->predicate = BRW_PREDICATE_NORMAL;
+ switch (dst.writemask) {
+ case WRITEMASK_X:
+ inst->predicate = BRW_PREDICATE_ALIGN16_REPLICATE_X;
+ break;
+ case WRITEMASK_Y:
+ inst->predicate = BRW_PREDICATE_ALIGN16_REPLICATE_Y;
+ break;
+ case WRITEMASK_Z:
+ inst->predicate = BRW_PREDICATE_ALIGN16_REPLICATE_Z;
+ break;
+ case WRITEMASK_W:
+ inst->predicate = BRW_PREDICATE_ALIGN16_REPLICATE_W;
+ break;
+ default:
+ inst->predicate = BRW_PREDICATE_NORMAL;
+ break;
+ }
break;
case nir_op_fdot_replicated2:
From 5c6f21579d7db802f4db96bae8b166e7409afabe Mon Sep 17 00:00:00 2001
From: Kenneth Graunke
Date: Tue, 3 Nov 2015 17:15:24 -0800
Subject: [PATCH 038/287] nir: Rename live_variables to live_ssa_defs.
This computes liveness of SSA values, not nir_variables.
Signed-off-by: Kenneth Graunke
Reviewed-by: Jason Ekstrand
---
src/glsl/nir/nir.h | 4 ++--
src/glsl/nir/nir_from_ssa.c | 2 +-
src/glsl/nir/nir_live_variables.c | 12 ++++++------
src/glsl/nir/nir_lower_global_vars_to_local.c | 2 +-
src/glsl/nir/nir_metadata.c | 4 ++--
src/glsl/nir/nir_opt_dead_cf.c | 2 +-
src/glsl/nir/nir_remove_dead_variables.c | 2 +-
7 files changed, 14 insertions(+), 14 deletions(-)
diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h
index 874a03966be..f8de40d0d13 100644
--- a/src/glsl/nir/nir.h
+++ b/src/glsl/nir/nir.h
@@ -1310,7 +1310,7 @@ typedef enum {
nir_metadata_none = 0x0,
nir_metadata_block_index = 0x1,
nir_metadata_dominance = 0x2,
- nir_metadata_live_variables = 0x4,
+ nir_metadata_live_ssa_defs = 0x4,
} nir_metadata;
typedef struct {
@@ -1986,7 +1986,7 @@ bool nir_lower_gs_intrinsics(nir_shader *shader);
bool nir_normalize_cubemap_coords(nir_shader *shader);
-void nir_live_variables_impl(nir_function_impl *impl);
+void nir_live_ssa_defs_impl(nir_function_impl *impl);
bool nir_ssa_defs_interfere(nir_ssa_def *a, nir_ssa_def *b);
void nir_convert_to_ssa_impl(nir_function_impl *impl);
diff --git a/src/glsl/nir/nir_from_ssa.c b/src/glsl/nir/nir_from_ssa.c
index eaf883dbaa0..f2797f72c8e 100644
--- a/src/glsl/nir/nir_from_ssa.c
+++ b/src/glsl/nir/nir_from_ssa.c
@@ -777,7 +777,7 @@ nir_convert_from_ssa_impl(nir_function_impl *impl, bool phi_webs_only)
nir_metadata_preserve(impl, nir_metadata_block_index |
nir_metadata_dominance);
- nir_metadata_require(impl, nir_metadata_live_variables |
+ nir_metadata_require(impl, nir_metadata_live_ssa_defs |
nir_metadata_dominance);
nir_foreach_block(impl, coalesce_phi_nodes_block, &state);
diff --git a/src/glsl/nir/nir_live_variables.c b/src/glsl/nir/nir_live_variables.c
index 1c96dcf36c5..05f79d7bc61 100644
--- a/src/glsl/nir/nir_live_variables.c
+++ b/src/glsl/nir/nir_live_variables.c
@@ -42,7 +42,7 @@
* block but not in the live-in of the block containing the phi node.
*/
-struct live_variables_state {
+struct live_ssa_defs_state {
unsigned num_ssa_defs;
unsigned bitset_words;
@@ -52,7 +52,7 @@ struct live_variables_state {
static bool
index_ssa_def(nir_ssa_def *def, void *void_state)
{
- struct live_variables_state *state = void_state;
+ struct live_ssa_defs_state *state = void_state;
if (def->parent_instr->type == nir_instr_type_ssa_undef)
def->live_index = 0;
@@ -77,7 +77,7 @@ index_ssa_definitions_block(nir_block *block, void *state)
static bool
init_liveness_block(nir_block *block, void *void_state)
{
- struct live_variables_state *state = void_state;
+ struct live_ssa_defs_state *state = void_state;
block->live_in = reralloc(block, block->live_in, BITSET_WORD,
state->bitset_words);
@@ -129,7 +129,7 @@ set_ssa_def_dead(nir_ssa_def *def, void *void_live)
*/
static bool
propagate_across_edge(nir_block *pred, nir_block *succ,
- struct live_variables_state *state)
+ struct live_ssa_defs_state *state)
{
NIR_VLA(BITSET_WORD, live, state->bitset_words);
memcpy(live, succ->live_in, state->bitset_words * sizeof *live);
@@ -165,9 +165,9 @@ propagate_across_edge(nir_block *pred, nir_block *succ,
}
void
-nir_live_variables_impl(nir_function_impl *impl)
+nir_live_ssa_defs_impl(nir_function_impl *impl)
{
- struct live_variables_state state;
+ struct live_ssa_defs_state state;
/* We start at 1 because we reserve the index value of 0 for ssa_undef
* instructions. Those are never live, so their liveness information
diff --git a/src/glsl/nir/nir_lower_global_vars_to_local.c b/src/glsl/nir/nir_lower_global_vars_to_local.c
index dcd091ae2fa..d549ee79bb4 100644
--- a/src/glsl/nir/nir_lower_global_vars_to_local.c
+++ b/src/glsl/nir/nir_lower_global_vars_to_local.c
@@ -102,7 +102,7 @@ nir_lower_global_vars_to_local(nir_shader *shader)
exec_list_push_tail(&impl->locals, &var->node);
nir_metadata_preserve(impl, nir_metadata_block_index |
nir_metadata_dominance |
- nir_metadata_live_variables);
+ nir_metadata_live_ssa_defs);
progress = true;
}
}
diff --git a/src/glsl/nir/nir_metadata.c b/src/glsl/nir/nir_metadata.c
index a03e12456a1..6de981f430f 100644
--- a/src/glsl/nir/nir_metadata.c
+++ b/src/glsl/nir/nir_metadata.c
@@ -39,8 +39,8 @@ nir_metadata_require(nir_function_impl *impl, nir_metadata required)
nir_index_blocks(impl);
if (NEEDS_UPDATE(nir_metadata_dominance))
nir_calc_dominance_impl(impl);
- if (NEEDS_UPDATE(nir_metadata_live_variables))
- nir_live_variables_impl(impl);
+ if (NEEDS_UPDATE(nir_metadata_live_ssa_defs))
+ nir_live_ssa_defs_impl(impl);
#undef NEEDS_UPDATE
diff --git a/src/glsl/nir/nir_opt_dead_cf.c b/src/glsl/nir/nir_opt_dead_cf.c
index 0d4819b5158..356e926ffe3 100644
--- a/src/glsl/nir/nir_opt_dead_cf.c
+++ b/src/glsl/nir/nir_opt_dead_cf.c
@@ -204,7 +204,7 @@ loop_is_dead(nir_loop *loop)
return false;
nir_function_impl *impl = nir_cf_node_get_function(&loop->cf_node);
- nir_metadata_require(impl, nir_metadata_live_variables |
+ nir_metadata_require(impl, nir_metadata_live_ssa_defs |
nir_metadata_dominance);
for (nir_block *cur = after->imm_dom; cur != before; cur = cur->imm_dom) {
diff --git a/src/glsl/nir/nir_remove_dead_variables.c b/src/glsl/nir/nir_remove_dead_variables.c
index 530a8475ed5..8f0833c7e24 100644
--- a/src/glsl/nir/nir_remove_dead_variables.c
+++ b/src/glsl/nir/nir_remove_dead_variables.c
@@ -130,7 +130,7 @@ nir_remove_dead_variables(nir_shader *shader)
if (remove_dead_vars(&overload->impl->locals, live)) {
nir_metadata_preserve(overload->impl, nir_metadata_block_index |
nir_metadata_dominance |
- nir_metadata_live_variables);
+ nir_metadata_live_ssa_defs);
progress = true;
}
}
From b9f8e729c88ad0d934422976a20a7c765016fcb8 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke
Date: Tue, 3 Nov 2015 17:16:49 -0800
Subject: [PATCH 039/287] nir: Rename nir_live_variables.c to nir_liveness.c.
It doesn't actually operate on variables.
Reviewed-by: Jason Ekstrand
---
src/glsl/Makefile.sources | 2 +-
src/glsl/nir/{nir_live_variables.c => nir_liveness.c} | 0
2 files changed, 1 insertion(+), 1 deletion(-)
rename src/glsl/nir/{nir_live_variables.c => nir_liveness.c} (100%)
diff --git a/src/glsl/Makefile.sources b/src/glsl/Makefile.sources
index ca870367640..0266f290ccb 100644
--- a/src/glsl/Makefile.sources
+++ b/src/glsl/Makefile.sources
@@ -37,7 +37,7 @@ NIR_FILES = \
nir/nir_intrinsics.h \
nir/nir_instr_set.c \
nir/nir_instr_set.h \
- nir/nir_live_variables.c \
+ nir/nir_liveness.c \
nir/nir_lower_alu_to_scalar.c \
nir/nir_lower_atomics.c \
nir/nir_lower_clip.c \
diff --git a/src/glsl/nir/nir_live_variables.c b/src/glsl/nir/nir_liveness.c
similarity index 100%
rename from src/glsl/nir/nir_live_variables.c
rename to src/glsl/nir/nir_liveness.c
From 5048da974e68a05b86a0cec494e1380e81978684 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke
Date: Wed, 28 Jan 2015 23:58:43 -0800
Subject: [PATCH 040/287] i965: Handle 16x MSAA in IMS dimension munging code.
Signed-off-by: Kenneth Graunke
Reviewed-by: Neil Roberts
Reviewed-by: Ben Widawsky
---
src/mesa/drivers/dri/i965/intel_mipmap_tree.c | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
index b6e35205727..0802b92502c 100644
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
@@ -416,9 +416,13 @@ intel_miptree_create_layout(struct brw_context *brw,
width0 = ALIGN(width0, 2) * 4;
height0 = ALIGN(height0, 2) * 2;
break;
+ case 16:
+ width0 = ALIGN(width0, 2) * 4;
+ height0 = ALIGN(height0, 2) * 4;
+ break;
default:
- /* num_samples should already have been quantized to 0, 1, 2, 4, or
- * 8.
+ /* num_samples should already have been quantized to 0, 1, 2, 4, 8
+ * or 16.
*/
unreachable("not reached");
}
From 20250e854eca3209133d592d98559ac474a5f60f Mon Sep 17 00:00:00 2001
From: Neil Roberts
Date: Wed, 16 Sep 2015 11:48:42 +0100
Subject: [PATCH 041/287] i965: Program 16x MSAA sample positions.
This is the standard pattern used by the other 3D graphics API.
BDW has slots for these values, but they aren't actually used until
SKL. Even though the documentation for BDW says they must be zero, it
doesn't seem to cause any harm to program them anyway.
The comment above for the 8x sample positions says that the hardware
implements centroid interpolation by picking the centre-most sample
that is inside the primitive. That implies that it might be worthwhile
to pick a pattern that includes 0.5,0.5. However by experimentation
this doesn't seem to actually be the case. With the sample positions
in this patch, if I modify the piglit test below so that it instead
reports the centroid position, it reports 0.492188,0.421875 which
doesn't match any of the positions. If I modify the sample positions
so that they include one at exactly 0.5,0.5 it doesn't help and it
reports another position which is even further from the center for
some reason.
arb_gpu_shader5-interpolateAtSample-different
Kenneth Graunke experimented with some other patterns that have a
higher standard deviation but I think after some discussion it was
decided that it would be better to pick the same pattern as the other
graphics API in case there are games that rely on this pattern.
(Based on a patch by Kenneth Graunke)
Cc: Kenneth Graunke
Reviewed-by: Ben Widawsky
---
.../drivers/dri/i965/brw_multisample_state.h | 26 +++++++++++++++++++
.../drivers/dri/i965/gen6_multisample_state.c | 3 +++
.../drivers/dri/i965/gen8_multisample_state.c | 12 ++++-----
3 files changed, 34 insertions(+), 7 deletions(-)
diff --git a/src/mesa/drivers/dri/i965/brw_multisample_state.h b/src/mesa/drivers/dri/i965/brw_multisample_state.h
index 26633e72983..42a7fd35121 100644
--- a/src/mesa/drivers/dri/i965/brw_multisample_state.h
+++ b/src/mesa/drivers/dri/i965/brw_multisample_state.h
@@ -81,3 +81,29 @@ brw_multisample_positions_4x = 0xae2ae662;
*/
static const uint32_t
brw_multisample_positions_8x[] = { 0xdbb39d79, 0x3ff55117 };
+
+/**
+ * Sample positions:
+ *
+ * 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ * 0 15
+ * 1 9
+ * 2 10
+ * 3 7
+ * 4 13
+ * 5 1
+ * 6 4
+ * 7 3
+ * 8 12
+ * 9 0
+ * a 2
+ * b 6
+ * c 11
+ * d 5
+ * e 8
+ * f 14
+ */
+static const uint32_t
+brw_multisample_positions_16x[] = {
+ 0xc75a7599, 0xb3dbad36, 0x2c42816e, 0x10eff408
+};
diff --git a/src/mesa/drivers/dri/i965/gen6_multisample_state.c b/src/mesa/drivers/dri/i965/gen6_multisample_state.c
index 8444c0c9bae..49c6ebabfba 100644
--- a/src/mesa/drivers/dri/i965/gen6_multisample_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_multisample_state.c
@@ -48,6 +48,9 @@ gen6_get_sample_position(struct gl_context *ctx,
case 8:
bits = brw_multisample_positions_8x[index >> 2] >> (8 * (index & 3));
break;
+ case 16:
+ bits = brw_multisample_positions_16x[index >> 2] >> (8 * (index & 3));
+ break;
default:
unreachable("Not implemented");
}
diff --git a/src/mesa/drivers/dri/i965/gen8_multisample_state.c b/src/mesa/drivers/dri/i965/gen8_multisample_state.c
index 75cbe06c522..4427f15996d 100644
--- a/src/mesa/drivers/dri/i965/gen8_multisample_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_multisample_state.c
@@ -52,13 +52,11 @@ gen8_emit_3dstate_sample_pattern(struct brw_context *brw)
BEGIN_BATCH(9);
OUT_BATCH(_3DSTATE_SAMPLE_PATTERN << 16 | (9 - 2));
- /* 16x MSAA
- * XXX: Need to program these.
- */
- OUT_BATCH(0);
- OUT_BATCH(0);
- OUT_BATCH(0);
- OUT_BATCH(0);
+ /* 16x MSAA */
+ OUT_BATCH(brw_multisample_positions_16x[0]); /* positions 3, 2, 1, 0 */
+ OUT_BATCH(brw_multisample_positions_16x[1]); /* positions 7, 6, 5, 4 */
+ OUT_BATCH(brw_multisample_positions_16x[2]); /* positions 11, 10, 9, 8 */
+ OUT_BATCH(brw_multisample_positions_16x[3]); /* positions 15, 14, 13, 12 */
/* 8x MSAA */
OUT_BATCH(brw_multisample_positions_8x[1]); /* sample positions 7654 */
From e386fb0dee40d0f2342b43b6750b64c8174463a9 Mon Sep 17 00:00:00 2001
From: Neil Roberts
Date: Tue, 8 Sep 2015 15:52:09 +0100
Subject: [PATCH 042/287] i965/fs/skl+: Use ld2dms_w instead of ld2dms
In order to support 16x MSAA, skl+ has a wider version of ld2dms that
takes two parameters for the MCS data. The MCS data retrieved from the
ld_mcs instruction already returns 4 or 8 registers and is documented
to return zeroes for the mcsh value when the sample count is less than
16.
v2: Use get_lowered_simd_width to fall back to SIMD8 instructions when
the message length would be too long in SIMD16.
Reviewed-by: Ben Widawsky
---
src/mesa/drivers/dri/i965/brw_defines.h | 3 ++
src/mesa/drivers/dri/i965/brw_disasm.c | 1 +
src/mesa/drivers/dri/i965/brw_fs.cpp | 42 ++++++++++++++++++-
.../drivers/dri/i965/brw_fs_generator.cpp | 5 +++
src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 9 ++--
src/mesa/drivers/dri/i965/brw_shader.cpp | 5 +++
6 files changed, 60 insertions(+), 5 deletions(-)
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 6433cffc919..0396e13d0c2 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -964,6 +964,8 @@ enum opcode {
FS_OPCODE_TXB_LOGICAL,
SHADER_OPCODE_TXF_CMS,
SHADER_OPCODE_TXF_CMS_LOGICAL,
+ SHADER_OPCODE_TXF_CMS_W,
+ SHADER_OPCODE_TXF_CMS_W_LOGICAL,
SHADER_OPCODE_TXF_UMS,
SHADER_OPCODE_TXF_UMS_LOGICAL,
SHADER_OPCODE_TXF_MCS,
@@ -1539,6 +1541,7 @@ enum brw_message_target {
#define GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO 17
#define GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C 18
#define HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE 20
+#define GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W 28
#define GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS 29
#define GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS 30
#define GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS 31
diff --git a/src/mesa/drivers/dri/i965/brw_disasm.c b/src/mesa/drivers/dri/i965/brw_disasm.c
index df747107188..fd93beaec19 100644
--- a/src/mesa/drivers/dri/i965/brw_disasm.c
+++ b/src/mesa/drivers/dri/i965/brw_disasm.c
@@ -622,6 +622,7 @@ static const char *const gen5_sampler_msg_type[] = {
[GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO] = "gather4_po",
[GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C] = "gather4_po_c",
[HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE] = "sample_d_c",
+ [GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W] = "ld2dms_w",
[GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS] = "ld_mcs",
[GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS] = "ld2dms",
[GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS] = "ld2dss",
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 4cc962613b3..f5294195656 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -717,6 +717,7 @@ fs_inst::components_read(unsigned i) const
case SHADER_OPCODE_TXS_LOGICAL:
case FS_OPCODE_TXB_LOGICAL:
case SHADER_OPCODE_TXF_CMS_LOGICAL:
+ case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
case SHADER_OPCODE_TXF_UMS_LOGICAL:
case SHADER_OPCODE_TXF_MCS_LOGICAL:
case SHADER_OPCODE_LOD_LOGICAL:
@@ -732,6 +733,9 @@ fs_inst::components_read(unsigned i) const
/* Texture offset. */
else if (i == 7)
return 2;
+ /* MCS */
+ else if (i == 5 && opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL)
+ return 2;
else
return 1;
@@ -896,6 +900,7 @@ fs_visitor::implied_mrf_writes(fs_inst *inst)
case SHADER_OPCODE_TXD:
case SHADER_OPCODE_TXF:
case SHADER_OPCODE_TXF_CMS:
+ case SHADER_OPCODE_TXF_CMS_W:
case SHADER_OPCODE_TXF_MCS:
case SHADER_OPCODE_TG4:
case SHADER_OPCODE_TG4_OFFSET:
@@ -3920,17 +3925,31 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
coordinate_done = true;
break;
case SHADER_OPCODE_TXF_CMS:
+ case SHADER_OPCODE_TXF_CMS_W:
case SHADER_OPCODE_TXF_UMS:
case SHADER_OPCODE_TXF_MCS:
- if (op == SHADER_OPCODE_TXF_UMS || op == SHADER_OPCODE_TXF_CMS) {
+ if (op == SHADER_OPCODE_TXF_UMS ||
+ op == SHADER_OPCODE_TXF_CMS ||
+ op == SHADER_OPCODE_TXF_CMS_W) {
bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index);
length++;
}
- if (op == SHADER_OPCODE_TXF_CMS) {
+ if (op == SHADER_OPCODE_TXF_CMS || op == SHADER_OPCODE_TXF_CMS_W) {
/* Data from the multisample control surface. */
bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs);
length++;
+
+ /* On Gen9+ we'll use ld2dms_w instead which has two registers for
+ * the MCS data.
+ */
+ if (op == SHADER_OPCODE_TXF_CMS_W) {
+ bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD),
+ mcs.file == IMM ?
+ mcs :
+ offset(mcs, bld, 1));
+ length++;
+ }
}
/* There is no offsetting for this message; just copy in the integer
@@ -4144,6 +4163,10 @@ fs_visitor::lower_logical_sends()
lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS);
break;
+ case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
+ lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS_W);
+ break;
+
case SHADER_OPCODE_TXF_UMS_LOGICAL:
lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_UMS);
break;
@@ -4336,6 +4359,21 @@ get_lowered_simd_width(const struct brw_device_info *devinfo,
else
return inst->exec_size;
+ case SHADER_OPCODE_TXF_CMS_W_LOGICAL: {
+ /* This opcode can take up to 6 arguments which means that in some
+ * circumstances it can end up with a message that is too long in SIMD16
+ * mode.
+ */
+ const unsigned coord_components = inst->src[8].fixed_hw_reg.dw1.ud;
+ /* First three arguments are the sample index and the two arguments for
+ * the MCS data.
+ */
+ if ((coord_components + 3) * 2 > MAX_SAMPLER_MESSAGE_SIZE)
+ return 8;
+ else
+ return inst->exec_size;
+ }
+
case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index e207a77fdc1..28fb620279b 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -741,6 +741,10 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
case SHADER_OPCODE_TXF:
msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
break;
+ case SHADER_OPCODE_TXF_CMS_W:
+ assert(devinfo->gen >= 9);
+ msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W;
+ break;
case SHADER_OPCODE_TXF_CMS:
if (devinfo->gen >= 7)
msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
@@ -2050,6 +2054,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
case SHADER_OPCODE_TXD:
case SHADER_OPCODE_TXF:
case SHADER_OPCODE_TXF_CMS:
+ case SHADER_OPCODE_TXF_CMS_W:
case SHADER_OPCODE_TXF_UMS:
case SHADER_OPCODE_TXF_MCS:
case SHADER_OPCODE_TXL:
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index ef92098286c..94a9c1b68f2 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -208,8 +208,8 @@ fs_visitor::emit_mcs_fetch(const fs_reg &coordinate, unsigned components,
fs_inst *inst = bld.emit(SHADER_OPCODE_TXF_MCS_LOGICAL, dest, srcs,
ARRAY_SIZE(srcs));
- /* We only care about one reg of response, but the sampler always writes
- * 4/8.
+ /* We only care about one or two regs of response, but the sampler always
+ * writes 4/8.
*/
inst->regs_written = 4 * dispatch_width / 8;
@@ -295,7 +295,10 @@ fs_visitor::emit_texture(ir_texture_opcode op,
opcode = SHADER_OPCODE_TXF_LOGICAL;
break;
case ir_txf_ms:
- opcode = SHADER_OPCODE_TXF_CMS_LOGICAL;
+ if (devinfo->gen >= 9)
+ opcode = SHADER_OPCODE_TXF_CMS_W_LOGICAL;
+ else
+ opcode = SHADER_OPCODE_TXF_CMS_LOGICAL;
break;
case ir_txs:
case ir_query_levels:
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index 4ea297ade4c..0312024ed1b 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -351,6 +351,10 @@ brw_instruction_name(enum opcode op)
return "txf_cms";
case SHADER_OPCODE_TXF_CMS_LOGICAL:
return "txf_cms_logical";
+ case SHADER_OPCODE_TXF_CMS_W:
+ return "txf_cms_w";
+ case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
+ return "txf_cms_w_logical";
case SHADER_OPCODE_TXF_UMS:
return "txf_ums";
case SHADER_OPCODE_TXF_UMS_LOGICAL:
@@ -787,6 +791,7 @@ backend_instruction::is_tex() const
opcode == SHADER_OPCODE_TXD ||
opcode == SHADER_OPCODE_TXF ||
opcode == SHADER_OPCODE_TXF_CMS ||
+ opcode == SHADER_OPCODE_TXF_CMS_W ||
opcode == SHADER_OPCODE_TXF_UMS ||
opcode == SHADER_OPCODE_TXF_MCS ||
opcode == SHADER_OPCODE_TXL ||
From 4ef27745c8ed5153464db22950a90d74d2ef4435 Mon Sep 17 00:00:00 2001
From: Neil Roberts
Date: Wed, 9 Sep 2015 15:59:36 +0100
Subject: [PATCH 043/287] i965/vec4/skl+: Use ld2dms_w instead of ld2dms
In order to support 16x MSAA, skl+ has a wider version of ld2dms that
takes two parameters for the MCS data. The MCS data in the response
still fits in a single register so we just need to ensure we copy both
values rather than just the lower one.
Acked-by: Ben Widawsky
---
src/mesa/drivers/dri/i965/brw_vec4.cpp | 1 +
src/mesa/drivers/dri/i965/brw_vec4_generator.cpp | 5 +++++
src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp | 14 ++++++++++++--
3 files changed, 18 insertions(+), 2 deletions(-)
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index 01eb1580953..8350a024e88 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -339,6 +339,7 @@ vec4_visitor::implied_mrf_writes(vec4_instruction *inst)
case SHADER_OPCODE_TXD:
case SHADER_OPCODE_TXF:
case SHADER_OPCODE_TXF_CMS:
+ case SHADER_OPCODE_TXF_CMS_W:
case SHADER_OPCODE_TXF_MCS:
case SHADER_OPCODE_TXS:
case SHADER_OPCODE_TG4:
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
index 8bc21df5ffc..f0ad903c572 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
@@ -135,6 +135,10 @@ generate_tex(struct brw_codegen *p,
case SHADER_OPCODE_TXF:
msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
break;
+ case SHADER_OPCODE_TXF_CMS_W:
+ assert(devinfo->gen >= 9);
+ msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W;
+ break;
case SHADER_OPCODE_TXF_CMS:
if (devinfo->gen >= 7)
msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
@@ -1313,6 +1317,7 @@ generate_code(struct brw_codegen *p,
case SHADER_OPCODE_TXD:
case SHADER_OPCODE_TXF:
case SHADER_OPCODE_TXF_CMS:
+ case SHADER_OPCODE_TXF_CMS_W:
case SHADER_OPCODE_TXF_MCS:
case SHADER_OPCODE_TXL:
case SHADER_OPCODE_TXS:
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index 606fbd06278..7d949896bcc 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -900,7 +900,8 @@ vec4_visitor::emit_texture(ir_texture_opcode op,
case ir_txl: opcode = SHADER_OPCODE_TXL; break;
case ir_txd: opcode = SHADER_OPCODE_TXD; break;
case ir_txf: opcode = SHADER_OPCODE_TXF; break;
- case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
+ case ir_txf_ms: opcode = (devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W :
+ SHADER_OPCODE_TXF_CMS); break;
case ir_txs: opcode = SHADER_OPCODE_TXS; break;
case ir_tg4: opcode = offset_value.file != BAD_FILE
? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
@@ -992,7 +993,16 @@ vec4_visitor::emit_texture(ir_texture_opcode op,
} else if (op == ir_txf_ms) {
emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
sample_index));
- if (devinfo->gen >= 7) {
+ if (opcode == SHADER_OPCODE_TXF_CMS_W) {
+ /* MCS data is stored in the first two channels of ‘mcs’, but we
+ * need to get it into the .y and .z channels of the second vec4
+ * of params.
+ */
+ mcs.swizzle = BRW_SWIZZLE4(0, 0, 1, 1);
+ emit(MOV(dst_reg(MRF, param_base + 1,
+ glsl_type::uint_type, WRITEMASK_YZ),
+ mcs));
+ } else if (devinfo->gen >= 7) {
/* MCS data is in the first channel of `mcs`, but we need to get it into
* the .y channel of the second vec4 of params, so replicate .x across
* the whole vec4 and then mask off everything except .y
From 1a97cac767425b22e56fe698127795bc287bb773 Mon Sep 17 00:00:00 2001
From: Neil Roberts
Date: Tue, 15 Sep 2015 16:34:35 +0100
Subject: [PATCH 044/287] i965/fs: Add a sampler program key for whether the
texture is 16x MSAA
When 16x MSAA is used for sampling with texelFetch the compiler needs
to use a different instruction which passes more arguments for the MCS
data. Previously on skl+ it was unconditionally using this new
instruction. However since 16x MSAA is probably going to be pretty
rare, it is probably worthwhile to avoid using this instruction for
the other sample counts. In order to do that this patch adds a new
member to brw_sampler_prog_key_data to track when a sampler refers to
a buffer with 16 samples.
Note that this isn't done for the vec4 backend because it wouldn't
change how many registers it uses.
Acked-by: Ben Widawsky
---
src/mesa/drivers/dri/i965/brw_compiler.h | 7 +++++++
src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 2 +-
src/mesa/drivers/dri/i965/brw_wm.c | 8 ++++++++
3 files changed, 16 insertions(+), 1 deletion(-)
diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h b/src/mesa/drivers/dri/i965/brw_compiler.h
index 91eabaf7787..f022f3829be 100644
--- a/src/mesa/drivers/dri/i965/brw_compiler.h
+++ b/src/mesa/drivers/dri/i965/brw_compiler.h
@@ -142,6 +142,13 @@ struct brw_sampler_prog_key_data {
*/
uint32_t compressed_multisample_layout_mask;
+ /**
+ * Whether this sampler is using 16x multisampling. If so fetching from
+ * this sampler will be handled with a different instruction, ld2dms_w
+ * instead of ld2dms.
+ */
+ uint32_t msaa_16;
+
/**
* For Sandybridge, which shader w/a we need for gather quirks.
*/
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 94a9c1b68f2..213c9120b50 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -295,7 +295,7 @@ fs_visitor::emit_texture(ir_texture_opcode op,
opcode = SHADER_OPCODE_TXF_LOGICAL;
break;
case ir_txf_ms:
- if (devinfo->gen >= 9)
+ if ((key_tex->msaa_16 & (1 << sampler)))
opcode = SHADER_OPCODE_TXF_CMS_W_LOGICAL;
else
opcode = SHADER_OPCODE_TXF_CMS_LOGICAL;
diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c
index 5c49db9e63e..8d9ed3a6c33 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.c
+++ b/src/mesa/drivers/dri/i965/brw_wm.c
@@ -212,6 +212,9 @@ brw_debug_recompile_sampler_key(struct brw_context *brw,
found |= key_debug(brw, "compressed multisample layout",
old_key->compressed_multisample_layout_mask,
key->compressed_multisample_layout_mask);
+ found |= key_debug(brw, "16x msaa",
+ old_key->msaa_16,
+ key->msaa_16);
for (unsigned int i = 0; i < MAX_SAMPLERS; i++) {
found |= key_debug(brw, "textureGather workarounds",
@@ -371,6 +374,11 @@ brw_populate_sampler_prog_key_data(struct gl_context *ctx,
if (brw->gen >= 7 &&
intel_tex->mt->msaa_layout == INTEL_MSAA_LAYOUT_CMS) {
key->compressed_multisample_layout_mask |= 1 << s;
+
+ if (intel_tex->mt->num_samples >= 16) {
+ assert(brw->gen >= 9);
+ key->msaa_16 |= 1 << s;
+ }
}
}
}
From b4c2e6054fe830c299113b143622bcd2158cd257 Mon Sep 17 00:00:00 2001
From: Neil Roberts
Date: Wed, 9 Sep 2015 14:36:42 +0100
Subject: [PATCH 045/287] i965: Support calculating the bits needed to set up
16x MSAA
The gen7_surface_msaa_bits function already returns the right values
for 16 samples but it just needs its assert to be relaxed.
Reviewed-by: Ben Widawsky
---
src/mesa/drivers/dri/i965/gen7_wm_surface_state.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/mesa/drivers/dri/i965/gen7_wm_surface_state.c b/src/mesa/drivers/dri/i965/gen7_wm_surface_state.c
index 5080f1c3fe4..438caefdd4a 100644
--- a/src/mesa/drivers/dri/i965/gen7_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_wm_surface_state.c
@@ -78,7 +78,7 @@ gen7_surface_msaa_bits(unsigned num_samples, enum intel_msaa_layout layout)
{
uint32_t ss4 = 0;
- assert(num_samples <= 8);
+ assert(num_samples <= 16);
/* The SURFACE_MULTISAMPLECOUNT_X enums are simply log2(num_samples) << 3. */
ss4 |= (ffs(MAX2(num_samples, 1)) - 1) << 3;
From bf6bd7eaf09fadc516a1e46635ed8590f4d88535 Mon Sep 17 00:00:00 2001
From: Neil Roberts
Date: Wed, 9 Sep 2015 14:38:08 +0100
Subject: [PATCH 046/287] i965: Support allocating the MCS buffer for 16x MSAA
When 16 samples are used the MCS buffer needs 64 bits per pixel.
Reviewed-by: Ben Widawsky
---
src/mesa/drivers/dri/i965/intel_mipmap_tree.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
index 0802b92502c..b1a7632d82f 100644
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
@@ -1427,6 +1427,12 @@ intel_miptree_alloc_mcs(struct brw_context *brw,
*/
format = MESA_FORMAT_R_UINT32;
break;
+ case 16:
+ /* 64 bits/pixel are required for MCS data when using 16x MSAA (4 bits
+ * for each sample).
+ */
+ format = MESA_FORMAT_RG_UINT32;
+ break;
default:
unreachable("Unrecognized sample count in intel_miptree_alloc_mcs");
};
From a6804654283a9d03bee92d61eee5b1d036c8db68 Mon Sep 17 00:00:00 2001
From: Neil Roberts
Date: Wed, 9 Sep 2015 17:44:17 +0100
Subject: [PATCH 047/287] i965/fs/skl+: Fix calculating gl_SampleID for 16x
MSAA
In order to accommodate 16x MSAA, the starting sample pair index is now
3 bits rather than 2 on SKL+.
Reviewed-by: Ben Widawsky
Reviewed-by: Anuj Phogat
---
src/mesa/drivers/dri/i965/brw_fs.cpp | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index f5294195656..cb2536263dd 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -1313,9 +1313,15 @@ fs_visitor::emit_sampleid_setup()
* are sample 1 of subspan 0; the third group is sample 0 of
* subspan 1, and finally sample 1 of subspan 1.
*/
+
+ /* SKL+ has an extra bit for the Starting Sample Pair Index to
+ * accommodate 16x MSAA.
+ */
+ unsigned sspi_mask = devinfo->gen >= 9 ? 0x1c0 : 0xc0;
+
abld.exec_all().group(1, 0)
.AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
- fs_reg(0xc0));
+ fs_reg(sspi_mask));
abld.exec_all().group(1, 0).SHR(t1, t1, fs_reg(5));
/* This works for both SIMD8 and SIMD16 */
From 1a22b12fc51e80c20c700f93904ffd12caa73473 Mon Sep 17 00:00:00 2001
From: Neil Roberts
Date: Fri, 11 Sep 2015 18:09:46 +0100
Subject: [PATCH 048/287] i965/meta: Support 16x MSAA in the meta stencil blit
The destination rectangle is now drawn at 4x4 the size and the shader
code to calculate the sample number is adjusted accordingly.
Acked-by: Ben Widawsky
---
.../drivers/dri/i965/brw_meta_stencil_blit.c | 22 ++++++++++++++-----
1 file changed, 17 insertions(+), 5 deletions(-)
diff --git a/src/mesa/drivers/dri/i965/brw_meta_stencil_blit.c b/src/mesa/drivers/dri/i965/brw_meta_stencil_blit.c
index cbbb919c6ee..4e9aa949506 100644
--- a/src/mesa/drivers/dri/i965/brw_meta_stencil_blit.c
+++ b/src/mesa/drivers/dri/i965/brw_meta_stencil_blit.c
@@ -163,6 +163,13 @@ static const char *fs_tmpl =
" txl_coords.x = ((X & int(0xfff8)) >> 2) | (X & int(0x1));\n"
" txl_coords.y = ((Y & int(0xfffc)) >> 1) | (Y & int(0x1));\n"
" sample_index = (X & 0x4) | (Y & 0x2) | ((X & 0x2) >> 1);\n"
+ " break;\n"
+ " case 16:\n"
+ " txl_coords.x = ((X & int(0xfff8)) >> 2) | (X & int(0x1));\n"
+ " txl_coords.y = ((Y & int(0xfff8)) >> 2) | (Y & int(0x1));\n"
+ " sample_index = (((Y & 0x4) << 1) | (X & 0x4) | (Y & 0x2) |\n"
+ " ((X & 0x2) >> 1));\n"
+ " break;\n"
" }\n"
"}\n"
"\n"
@@ -313,11 +320,16 @@ adjust_msaa(struct blit_dims *dims, int num_samples)
dims->dst_x0 *= 2;
dims->dst_x1 *= 2;
} else if (num_samples) {
- const int x_num_samples = num_samples / 2;
- dims->dst_x0 = ROUND_DOWN_TO(dims->dst_x0 * x_num_samples, num_samples);
- dims->dst_y0 = ROUND_DOWN_TO(dims->dst_y0 * 2, 4);
- dims->dst_x1 = ALIGN(dims->dst_x1 * x_num_samples, num_samples);
- dims->dst_y1 = ALIGN(dims->dst_y1 * 2, 4);
+ const int y_num_samples = num_samples >= 16 ? 4 : 2;
+ const int x_num_samples = num_samples / y_num_samples;
+ dims->dst_x0 = ROUND_DOWN_TO(dims->dst_x0 * x_num_samples,
+ x_num_samples * 2);
+ dims->dst_y0 = ROUND_DOWN_TO(dims->dst_y0 * y_num_samples,
+ y_num_samples * 2);
+ dims->dst_x1 = ALIGN(dims->dst_x1 * x_num_samples,
+ x_num_samples * 2);
+ dims->dst_y1 = ALIGN(dims->dst_y1 * y_num_samples,
+ y_num_samples * 2);
}
}
From 2dd76ec16e599bd919962f439b59fdd73e85ff94 Mon Sep 17 00:00:00 2001
From: Neil Roberts
Date: Wed, 16 Sep 2015 17:43:33 +0100
Subject: [PATCH 049/287] meta: Support 16x MSAA in the multisample scaled blit
shader
v2: Fix the x_scale in the shader. Remove the doubts in the commit
message.
Reviewed-by: Anuj Phogat
---
src/mesa/drivers/common/meta.h | 2 ++
src/mesa/drivers/common/meta_blit.c | 29 ++++++++++++-------
.../drivers/dri/i965/gen6_multisample_state.c | 14 +++++++++
src/mesa/main/mtypes.h | 15 +++++++++-
4 files changed, 49 insertions(+), 11 deletions(-)
diff --git a/src/mesa/drivers/common/meta.h b/src/mesa/drivers/common/meta.h
index 23fa209905d..d742eaa9f67 100644
--- a/src/mesa/drivers/common/meta.h
+++ b/src/mesa/drivers/common/meta.h
@@ -285,9 +285,11 @@ enum blit_msaa_shader {
BLIT_2X_MSAA_SHADER_2D_MULTISAMPLE_SCALED_RESOLVE,
BLIT_4X_MSAA_SHADER_2D_MULTISAMPLE_SCALED_RESOLVE,
BLIT_8X_MSAA_SHADER_2D_MULTISAMPLE_SCALED_RESOLVE,
+ BLIT_16X_MSAA_SHADER_2D_MULTISAMPLE_SCALED_RESOLVE,
BLIT_2X_MSAA_SHADER_2D_MULTISAMPLE_ARRAY_SCALED_RESOLVE,
BLIT_4X_MSAA_SHADER_2D_MULTISAMPLE_ARRAY_SCALED_RESOLVE,
BLIT_8X_MSAA_SHADER_2D_MULTISAMPLE_ARRAY_SCALED_RESOLVE,
+ BLIT_16X_MSAA_SHADER_2D_MULTISAMPLE_ARRAY_SCALED_RESOLVE,
BLIT_MSAA_SHADER_COUNT,
};
diff --git a/src/mesa/drivers/common/meta_blit.c b/src/mesa/drivers/common/meta_blit.c
index 5972a5af0c9..b92c2e2f22b 100644
--- a/src/mesa/drivers/common/meta_blit.c
+++ b/src/mesa/drivers/common/meta_blit.c
@@ -72,20 +72,25 @@ setup_glsl_msaa_blit_scaled_shader(struct gl_context *ctx,
char *sample_map_expr = rzalloc_size(mem_ctx, 1);
char *texel_fetch_macro = rzalloc_size(mem_ctx, 1);
const char *sampler_array_suffix = "";
- float y_scale;
+ float x_scale, y_scale;
enum blit_msaa_shader shader_index;
assert(src_rb);
samples = MAX2(src_rb->NumSamples, 1);
- y_scale = samples * 0.5;
+
+ if (samples == 16)
+ x_scale = 4.0;
+ else
+ x_scale = 2.0;
+ y_scale = samples / x_scale;
/* We expect only power of 2 samples in source multisample buffer. */
assert(samples > 0 && _mesa_is_pow_two(samples));
while (samples >> (shader_offset + 1)) {
shader_offset++;
}
- /* Update the assert if we plan to support more than 8X MSAA. */
- assert(shader_offset > 0 && shader_offset < 4);
+ /* Update the assert if we plan to support more than 16X MSAA. */
+ assert(shader_offset > 0 && shader_offset <= 4);
assert(target == GL_TEXTURE_2D_MULTISAMPLE ||
target == GL_TEXTURE_2D_MULTISAMPLE_ARRAY);
@@ -129,6 +134,10 @@ setup_glsl_msaa_blit_scaled_shader(struct gl_context *ctx,
sample_number = "sample_map[int(2 * fract(coord.x) + 8 * fract(coord.y))]";
sample_map = ctx->Const.SampleMap8x;
break;
+ case 16:
+ sample_number = "sample_map[int(4 * fract(coord.x) + 16 * fract(coord.y))]";
+ sample_map = ctx->Const.SampleMap16x;
+ break;
default:
sample_number = NULL;
sample_map = NULL;
@@ -184,9 +193,9 @@ setup_glsl_msaa_blit_scaled_shader(struct gl_context *ctx,
"{\n"
"%s"
" vec2 interp;\n"
- " const vec2 scale = vec2(2.0f, %ff);\n"
- " const vec2 scale_inv = vec2(0.5f, %ff);\n"
- " const vec2 s_0_offset = vec2(0.25f, %ff);\n"
+ " const vec2 scale = vec2(%ff, %ff);\n"
+ " const vec2 scale_inv = vec2(%ff, %ff);\n"
+ " const vec2 s_0_offset = vec2(%ff, %ff);\n"
" vec2 s_0_coord, s_1_coord, s_2_coord, s_3_coord;\n"
" vec4 s_0_color, s_1_color, s_2_color, s_3_color;\n"
" vec4 x_0_color, x_1_color;\n"
@@ -219,9 +228,9 @@ setup_glsl_msaa_blit_scaled_shader(struct gl_context *ctx,
"}\n",
sampler_array_suffix,
sample_map_expr,
- y_scale,
- 1.0f / y_scale,
- 1.0f / samples,
+ x_scale, y_scale,
+ 1.0f / x_scale, 1.0f / y_scale,
+ 0.5f / x_scale, 0.5f / y_scale,
texel_fetch_macro);
_mesa_meta_compile_and_link_program(ctx, vs_source, fs_source, name,
diff --git a/src/mesa/drivers/dri/i965/gen6_multisample_state.c b/src/mesa/drivers/dri/i965/gen6_multisample_state.c
index 49c6ebabfba..8eb620de56b 100644
--- a/src/mesa/drivers/dri/i965/gen6_multisample_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_multisample_state.c
@@ -91,6 +91,17 @@ gen6_get_sample_position(struct gl_context *ctx,
* | 6 | 7 | | 7 | 1 |
* --------- ---------
*
+ * 16X MSAA sample index layout 16x MSAA sample number layout
+ * ----------------- -----------------
+ * | 0 | 1 | 2 | 3 | |15 |10 | 9 | 7 |
+ * ----------------- -----------------
+ * | 4 | 5 | 6 | 7 | | 4 | 1 | 3 |13 |
+ * ----------------- -----------------
+ * | 8 | 9 |10 |11 | |12 | 2 | 0 | 6 |
+ * ----------------- -----------------
+ * |12 |13 |14 |15 | |11 | 8 | 5 |14 |
+ * ----------------- -----------------
+ *
* A sample map is used to map sample indices to sample numbers.
*/
void
@@ -99,10 +110,13 @@ gen6_set_sample_maps(struct gl_context *ctx)
uint8_t map_2x[2] = {0, 1};
uint8_t map_4x[4] = {0, 1, 2, 3};
uint8_t map_8x[8] = {5, 2, 4, 6, 0, 3, 7, 1};
+ uint8_t map_16x[16] = { 15, 10, 9, 7, 4, 1, 3, 13,
+ 12, 2, 0, 6, 11, 8, 5, 14 };
memcpy(ctx->Const.SampleMap2x, map_2x, sizeof(map_2x));
memcpy(ctx->Const.SampleMap4x, map_4x, sizeof(map_4x));
memcpy(ctx->Const.SampleMap8x, map_8x, sizeof(map_8x));
+ memcpy(ctx->Const.SampleMap16x, map_16x, sizeof(map_16x));
}
/**
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index fdb3b3df318..05c546e00a0 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -3578,11 +3578,24 @@ struct gl_constants
* below:
* SampleMap8x = {a, b, c, d, e, f, g, h};
*
- * Follow the logic for other sample counts.
+ * Follow the logic for sample counts 2-8.
+ *
+ * For 16x the sample indices are laid out as a 4x4 grid as follows:
+ *
+ * -----------------
+ * | 0 | 1 | 2 | 3 |
+ * -----------------
+ * | 4 | 5 | 6 | 7 |
+ * -----------------
+ * | 8 | 9 |10 |11 |
+ * -----------------
+ * |12 |13 |14 |15 |
+ * -----------------
*/
uint8_t SampleMap2x[2];
uint8_t SampleMap4x[4];
uint8_t SampleMap8x[8];
+ uint8_t SampleMap16x[16];
/** GL_ARB_shader_atomic_counters */
GLuint MaxAtomicBufferBindings;
From b080b3d54d99dfb46b5e8a6eb94fdbdeb937f255 Mon Sep 17 00:00:00 2001
From: Neil Roberts
Date: Thu, 22 Oct 2015 10:55:35 +0200
Subject: [PATCH 050/287] meta/blit: Always try to enable GL_ARB_sample_shading
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Previously this extension was only enabled when blitting between two
multisampled buffers. However I don't think it does any harm to just
enable it all the time. The ‘enable’ option is used instead of
‘require’ so that the shader will still compile if the extension isn't
available in the cases where it isn't used. This will make the next
patch simpler because it wants to add another optional extension.
Reviewed-by: Anuj Phogat
---
src/mesa/drivers/common/meta_blit.c | 16 ++--------------
1 file changed, 2 insertions(+), 14 deletions(-)
diff --git a/src/mesa/drivers/common/meta_blit.c b/src/mesa/drivers/common/meta_blit.c
index b92c2e2f22b..496ce458824 100644
--- a/src/mesa/drivers/common/meta_blit.c
+++ b/src/mesa/drivers/common/meta_blit.c
@@ -357,17 +357,11 @@ setup_glsl_msaa_blit_shader(struct gl_context *ctx,
shader_index == BLIT_MSAA_SHADER_2D_MULTISAMPLE_ARRAY_DEPTH_COPY ||
shader_index == BLIT_MSAA_SHADER_2D_MULTISAMPLE_DEPTH_COPY) {
char *sample_index;
- const char *arb_sample_shading_extension_string;
if (dst_is_msaa) {
- arb_sample_shading_extension_string = "#extension GL_ARB_sample_shading : enable";
sample_index = "gl_SampleID";
name = "depth MSAA copy";
} else {
- /* Don't need that extension, since we're drawing to a single-sampled
- * destination.
- */
- arb_sample_shading_extension_string = "";
/* From the GL 4.3 spec:
*
* "If there is a multisample buffer (the value of SAMPLE_BUFFERS
@@ -397,7 +391,7 @@ setup_glsl_msaa_blit_shader(struct gl_context *ctx,
fs_source = ralloc_asprintf(mem_ctx,
"#version 130\n"
"#extension GL_ARB_texture_multisample : enable\n"
- "%s\n"
+ "#extension GL_ARB_sample_shading : enable\n"
"uniform sampler2DMS%s texSampler;\n"
"in %s texCoords;\n"
"out vec4 out_color;\n"
@@ -406,7 +400,6 @@ setup_glsl_msaa_blit_shader(struct gl_context *ctx,
"{\n"
" gl_FragDepth = texelFetch(texSampler, i%s(texCoords), %s).r;\n"
"}\n",
- arb_sample_shading_extension_string,
sampler_array_suffix,
texcoord_type,
texcoord_type,
@@ -416,14 +409,12 @@ setup_glsl_msaa_blit_shader(struct gl_context *ctx,
* sample). Yes, this is ridiculous.
*/
char *sample_resolve;
- const char *arb_sample_shading_extension_string;
const char *merge_function;
name = ralloc_asprintf(mem_ctx, "%svec4 MSAA %s",
vec4_prefix,
dst_is_msaa ? "copy" : "resolve");
if (dst_is_msaa) {
- arb_sample_shading_extension_string = "#extension GL_ARB_sample_shading : enable";
sample_resolve = ralloc_asprintf(mem_ctx, " out_color = texelFetch(texSampler, i%s(texCoords), gl_SampleID);", texcoord_type);
merge_function = "";
} else {
@@ -439,8 +430,6 @@ setup_glsl_msaa_blit_shader(struct gl_context *ctx,
"vec4 merge(vec4 a, vec4 b) { return (a + b); }\n";
}
- arb_sample_shading_extension_string = "";
-
/* We're assuming power of two samples for this resolution procedure.
*
* To avoid losing any floating point precision if the samples all
@@ -496,7 +485,7 @@ setup_glsl_msaa_blit_shader(struct gl_context *ctx,
fs_source = ralloc_asprintf(mem_ctx,
"#version 130\n"
"#extension GL_ARB_texture_multisample : enable\n"
- "%s\n"
+ "#extension GL_ARB_sample_shading : enable\n"
"#define gvec4 %svec4\n"
"uniform %ssampler2DMS%s texSampler;\n"
"in %s texCoords;\n"
@@ -507,7 +496,6 @@ setup_glsl_msaa_blit_shader(struct gl_context *ctx,
"{\n"
"%s\n" /* sample_resolve */
"}\n",
- arb_sample_shading_extension_string,
vec4_prefix,
vec4_prefix,
sampler_array_suffix,
From aa3f9aaf31e9056a255f9e0472ebdfdaa60abe54 Mon Sep 17 00:00:00 2001
From: Neil Roberts
Date: Mon, 28 Sep 2015 18:22:32 +0100
Subject: [PATCH 051/287] mesa/meta: Use interpolateAtOffset for 16x MSAA copy
blit
Previously there was a problem in i965 where if 16x MSAA is used then
some of the sample positions are exactly on the 0 x or y axis. When
the MSAA copy blit shader interpolates the texture coordinates at
these sample positions it was possible that it would jump to a
neighboring texel due to rounding errors. It is likely that these
positions would be used on 16x MSAA because that is where they are
defined to be in D3D.
To fix that, this patch makes it use interpolateAtOffset in the blit
shader whenever 16x MSAA is used and the GL_ARB_gpu_shader5 extension
is available. This forces it to interpolate the texture coordinates at
the pixel center to avoid these problematic positions.
This fixes ext_framebuffer_multisample-unaligned-blit and
ext_framebuffer_multisample-clip-and-scissor-blit with 16x MSAA on
SKL+.
v2: Use interpolateAtOffset instead of interpolateAtSample
v3: Always try to enable GL_ARB_gpu_shader5 in the shader
[Ian Romanick]
Reviewed-by: Anuj Phogat
---
src/mesa/drivers/common/meta_blit.c | 39 +++++++++++++++++++++++++++--
1 file changed, 37 insertions(+), 2 deletions(-)
diff --git a/src/mesa/drivers/common/meta_blit.c b/src/mesa/drivers/common/meta_blit.c
index 496ce458824..4a2444af0f9 100644
--- a/src/mesa/drivers/common/meta_blit.c
+++ b/src/mesa/drivers/common/meta_blit.c
@@ -357,10 +357,16 @@ setup_glsl_msaa_blit_shader(struct gl_context *ctx,
shader_index == BLIT_MSAA_SHADER_2D_MULTISAMPLE_ARRAY_DEPTH_COPY ||
shader_index == BLIT_MSAA_SHADER_2D_MULTISAMPLE_DEPTH_COPY) {
char *sample_index;
+ const char *tex_coords = "texCoords";
if (dst_is_msaa) {
sample_index = "gl_SampleID";
name = "depth MSAA copy";
+
+ if (ctx->Extensions.ARB_gpu_shader5 && samples >= 16) {
+ /* See comment below for the color copy */
+ tex_coords = "interpolateAtOffset(texCoords, vec2(0.0))";
+ }
} else {
/* From the GL 4.3 spec:
*
@@ -392,17 +398,19 @@ setup_glsl_msaa_blit_shader(struct gl_context *ctx,
"#version 130\n"
"#extension GL_ARB_texture_multisample : enable\n"
"#extension GL_ARB_sample_shading : enable\n"
+ "#extension GL_ARB_gpu_shader5 : enable\n"
"uniform sampler2DMS%s texSampler;\n"
"in %s texCoords;\n"
"out vec4 out_color;\n"
"\n"
"void main()\n"
"{\n"
- " gl_FragDepth = texelFetch(texSampler, i%s(texCoords), %s).r;\n"
+ " gl_FragDepth = texelFetch(texSampler, i%s(%s), %s).r;\n"
"}\n",
sampler_array_suffix,
texcoord_type,
texcoord_type,
+ tex_coords,
sample_index);
} else {
/* You can create 2D_MULTISAMPLE textures with 0 sample count (meaning 1
@@ -415,7 +423,33 @@ setup_glsl_msaa_blit_shader(struct gl_context *ctx,
dst_is_msaa ? "copy" : "resolve");
if (dst_is_msaa) {
- sample_resolve = ralloc_asprintf(mem_ctx, " out_color = texelFetch(texSampler, i%s(texCoords), gl_SampleID);", texcoord_type);
+ const char *tex_coords;
+
+ if (ctx->Extensions.ARB_gpu_shader5 && samples >= 16) {
+ /* If interpolateAtOffset is available then it will be used to
+ * force the interpolation to the center. This is required at
+ * least on Intel hardware because it is possible to have a sample
+ * position on the 0 x or y axis which means it will lie exactly
+ * on the pixel boundary. If we let the hardware interpolate the
+ * coordinates at one of these positions then it is possible for
+ * it to jump to a neighboring texel when converting to ints due
+ * to rounding errors. This is only done for >= 16x MSAA because
+ * it probably has some overhead. It is more likely that some
+ * hardware will use one of these problematic positions at 16x
+ * MSAA because in that case in D3D they are defined to be at
+ * these positions.
+ */
+ tex_coords = "interpolateAtOffset(texCoords, vec2(0.0))";
+ } else {
+ tex_coords = "texCoords";
+ }
+
+ sample_resolve =
+ ralloc_asprintf(mem_ctx,
+ " out_color = texelFetch(texSampler, "
+ "i%s(%s), gl_SampleID);",
+ texcoord_type, tex_coords);
+
merge_function = "";
} else {
int i;
@@ -486,6 +520,7 @@ setup_glsl_msaa_blit_shader(struct gl_context *ctx,
"#version 130\n"
"#extension GL_ARB_texture_multisample : enable\n"
"#extension GL_ARB_sample_shading : enable\n"
+ "#extension GL_ARB_gpu_shader5 : enable\n"
"#define gvec4 %svec4\n"
"uniform %ssampler2DMS%s texSampler;\n"
"in %s texCoords;\n"
From 6c5f371a27f901d5bc60cf5a2a11cf6629f96f78 Mon Sep 17 00:00:00 2001
From: Neil Roberts
Date: Mon, 7 Sep 2015 18:23:14 +0100
Subject: [PATCH 052/287] i965/skl+: Enable support for 16x multisampling
Reviewed-by: Ben Widawsky
---
src/mesa/drivers/dri/i965/brw_context.c | 6 ++++++
src/mesa/drivers/dri/i965/intel_screen.c | 5 ++++-
2 files changed, 10 insertions(+), 1 deletion(-)
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index 3b125448e14..ac6045dbba9 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -84,6 +84,12 @@ brw_query_samples_for_format(struct gl_context *ctx, GLenum target,
switch (brw->gen) {
case 9:
+ samples[0] = 16;
+ samples[1] = 8;
+ samples[2] = 4;
+ samples[3] = 2;
+ return 4;
+
case 8:
samples[0] = 8;
samples[1] = 4;
diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c
index fb95fb629ad..d64ebade769 100644
--- a/src/mesa/drivers/dri/i965/intel_screen.c
+++ b/src/mesa/drivers/dri/i965/intel_screen.c
@@ -1178,12 +1178,15 @@ intel_detect_timestamp(struct intel_screen *screen)
const int*
intel_supported_msaa_modes(const struct intel_screen *screen)
{
+ static const int gen9_modes[] = {16, 8, 4, 2, 0, -1};
static const int gen8_modes[] = {8, 4, 2, 0, -1};
static const int gen7_modes[] = {8, 4, 0, -1};
static const int gen6_modes[] = {4, 0, -1};
static const int gen4_modes[] = {0, -1};
- if (screen->devinfo->gen >= 8) {
+ if (screen->devinfo->gen >= 9) {
+ return gen9_modes;
+ } else if (screen->devinfo->gen >= 8) {
return gen8_modes;
} else if (screen->devinfo->gen >= 7) {
return gen7_modes;
From 027b64a55afc0fe8efcf9f6217192807e285c830 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga
Date: Fri, 30 Oct 2015 08:39:11 +0100
Subject: [PATCH 053/287] i965/fs: Do not mark direct used surfaces in
VARYING_PULL_CONSTANT_LOAD
Right now the generator marks direct surfaces as used but leaves marking of
indirect surfaces to the caller. Just make the callers handle marking in both
cases for consistency.
v2: Use const and remove useless surf_index temporary (Curro)
Reviewed-by: Francisco Jerez
---
src/mesa/drivers/dri/i965/brw_fs.cpp | 7 ++++---
src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 8 --------
src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 6 ++++--
3 files changed, 8 insertions(+), 13 deletions(-)
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index cb2536263dd..a813746cffc 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -2012,7 +2012,7 @@ fs_visitor::demote_pull_constants()
/* Set up the annotation tracking for new generated instructions. */
const fs_builder ibld(this, block, inst);
- fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
+ const unsigned index = stage_prog_data->binding_table.pull_constants_start;
fs_reg dst = vgrf(glsl_type::float_type);
assert(inst->src[i].stride == 0);
@@ -2020,16 +2020,17 @@ fs_visitor::demote_pull_constants()
/* Generate a pull load into dst. */
if (inst->src[i].reladdr) {
VARYING_PULL_CONSTANT_LOAD(ibld, dst,
- surf_index,
+ fs_reg(index),
*inst->src[i].reladdr,
pull_index);
inst->src[i].reladdr = NULL;
inst->src[i].stride = 1;
+ brw_mark_surface_used(prog_data, index);
} else {
const fs_builder ubld = ibld.exec_all().group(8, 0);
fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
- dst, surf_index, offset);
+ dst, fs_reg(index), offset);
inst->src[i].set_smear(pull_index & 3);
}
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index 28fb620279b..87152634c73 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -1349,8 +1349,6 @@ fs_generator::generate_varying_pull_constant_load(fs_inst *inst,
inst->header_size != 0,
simd_mode,
return_format);
-
- brw_mark_surface_used(prog_data, surf_index);
}
void
@@ -1395,8 +1393,6 @@ fs_generator::generate_varying_pull_constant_load_gen7(fs_inst *inst,
simd_mode,
0);
- brw_mark_surface_used(prog_data, surf_index);
-
} else {
struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
@@ -1427,10 +1423,6 @@ fs_generator::generate_varying_pull_constant_load_gen7(fs_inst *inst,
false /* header */,
simd_mode,
0);
-
- /* visitor knows more than we do about the surface limit required,
- * so has already done marking.
- */
}
}
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index e7a39ff741c..50b8218e934 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -1827,8 +1827,10 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
fs_reg surf_index;
if (const_index) {
- surf_index = fs_reg(stage_prog_data->binding_table.ubo_start +
- const_index->u[0]);
+ const unsigned index = stage_prog_data->binding_table.ubo_start +
+ const_index->u[0];
+ surf_index = fs_reg(index);
+ brw_mark_surface_used(prog_data, index);
} else {
/* The block index is not a constant. Evaluate the index expression
* per-channel and add the base UBO index; we have to select a value
From d7013988fb1d1c277e1fbce8623abddc43f78e05 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga
Date: Fri, 30 Oct 2015 08:48:57 +0100
Subject: [PATCH 054/287] i965/fs: Do not mark used direct surfaces in
UNIFORM_PULL_CONSTANT_LOAD
Right now the generator marks direct surfaces as used but leaves marking of
indirect surfaces to the caller. Just make the callers handle marking in both
cases for consistency.
Reviewed-by: Francisco Jerez
---
src/mesa/drivers/dri/i965/brw_fs.cpp | 2 +-
src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 10 ----------
2 files changed, 1 insertion(+), 11 deletions(-)
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index a813746cffc..629fbbdf01b 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -2025,7 +2025,6 @@ fs_visitor::demote_pull_constants()
pull_index);
inst->src[i].reladdr = NULL;
inst->src[i].stride = 1;
- brw_mark_surface_used(prog_data, index);
} else {
const fs_builder ubld = ibld.exec_all().group(8, 0);
fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
@@ -2033,6 +2032,7 @@ fs_visitor::demote_pull_constants()
dst, fs_reg(index), offset);
inst->src[i].set_smear(pull_index & 3);
}
+ brw_mark_surface_used(prog_data, index);
/* Rewrite the instruction to use the temporary VGRF. */
inst->src[i].file = GRF;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index 87152634c73..c73257a8d21 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -1184,8 +1184,6 @@ fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
read_offset, surf_index);
-
- brw_mark_surface_used(prog_data, surf_index);
}
void
@@ -1246,9 +1244,6 @@ fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
header_present,
BRW_SAMPLER_SIMD_MODE_SIMD4X2,
0);
-
- brw_mark_surface_used(prog_data, surf_index);
-
} else {
struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
@@ -1278,11 +1273,6 @@ fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
0);
brw_pop_insn_state(p);
-
- /* visitor knows more than we do about the surface limit required,
- * so has already done marking.
- */
-
}
}
From 6105d1d0a02c7eea83b327965713be3bada306f7 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga
Date: Fri, 30 Oct 2015 10:24:12 +0100
Subject: [PATCH 055/287] i965/vec4: Do not mark used direct surfaces in
VS_OPCODE_PULL_CONSTANT_LOAD
Right now the generator marks direct surfaces as used but leaves marking of
indirect surfaces to the caller. Just make the callers handle marking in both
cases for consistency.
v2: Use const, do not add unnecessary temporary (Curro)
Reviewed-by: Francisco Jerez
---
src/mesa/drivers/dri/i965/brw_vec4_generator.cpp | 9 ---------
src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 6 ++++--
src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp | 6 ++++--
3 files changed, 8 insertions(+), 13 deletions(-)
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
index f0ad903c572..d9252ef4c89 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
@@ -929,8 +929,6 @@ generate_pull_constant_load(struct brw_codegen *p,
2, /* mlen */
true, /* header_present */
1 /* rlen */);
-
- brw_mark_surface_used(&prog_data->base, surf_index);
}
static void
@@ -985,9 +983,6 @@ generate_pull_constant_load_gen7(struct brw_codegen *p,
inst->header_size != 0,
BRW_SAMPLER_SIMD_MODE_SIMD4X2,
0);
-
- brw_mark_surface_used(&prog_data->base, surf_index.dw1.ud);
-
} else {
struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
@@ -1017,10 +1012,6 @@ generate_pull_constant_load_gen7(struct brw_codegen *p,
inst->header_size != 0,
BRW_SAMPLER_SIMD_MODE_SIMD4X2,
0);
-
- /* visitor knows more than we do about the surface limit required,
- * so has already done marking.
- */
}
}
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index b848810ebc7..e6c018e52ae 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -749,8 +749,10 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
/* The block index is a constant, so just emit the binding table entry
* as an immediate.
*/
- surf_index = src_reg(prog_data->base.binding_table.ubo_start +
- const_block_index->u[0]);
+ const unsigned index = prog_data->base.binding_table.ubo_start +
+ const_block_index->u[0];
+ surf_index = src_reg(index);
+ brw_mark_surface_used(&prog_data->base, index);
} else {
/* The block index is not a constant. Evaluate the index expression
* per-channel and add the base UBO index; we have to select a value
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index 7d949896bcc..94759afd166 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -1745,14 +1745,16 @@ vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
int base_offset)
{
int reg_offset = base_offset + orig_src.reg_offset;
- src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
+ const unsigned index = prog_data->base.binding_table.pull_constants_start;
src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
reg_offset);
emit_pull_constant_load_reg(temp,
- index,
+ src_reg(index),
offset,
block, inst);
+
+ brw_mark_surface_used(&prog_data->base, index);
}
/**
From eca4c43a33c5c1bb63c8aa9d0506ed2ba3f9d8cb Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga
Date: Fri, 30 Oct 2015 10:57:47 +0100
Subject: [PATCH 056/287] i965/vec4: Do not mark used surfaces in
VS_OPCODE_GET_BUFFER_SIZE
Do it in the visitor, like we do for other opcodes.
v2: use const, get rid of useless surf_index temporary (Curro)
Reviewed-by: Francisco Jerez
---
src/mesa/drivers/dri/i965/brw_vec4_generator.cpp | 2 --
src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 8 +++++---
2 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
index d9252ef4c89..693f5835412 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
@@ -955,8 +955,6 @@ generate_get_buffer_size(struct brw_codegen *p,
inst->header_size > 0,
BRW_SAMPLER_SIMD_MODE_SIMD4X2,
BRW_SAMPLER_RETURN_FORMAT_SINT32);
-
- brw_mark_surface_used(&prog_data->base, surf_index.dw1.ud);
}
static void
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index e6c018e52ae..e0d5a14981a 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -427,15 +427,15 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
nir_const_value *const_uniform_block = nir_src_as_const_value(instr->src[0]);
unsigned ssbo_index = const_uniform_block ? const_uniform_block->u[0] : 0;
- src_reg surf_index = src_reg(prog_data->base.binding_table.ssbo_start +
- ssbo_index);
+ const unsigned index =
+ prog_data->base.binding_table.ssbo_start + ssbo_index;
dst_reg result_dst = get_nir_dest(instr->dest);
vec4_instruction *inst = new(mem_ctx)
vec4_instruction(VS_OPCODE_GET_BUFFER_SIZE, result_dst);
inst->base_mrf = 2;
inst->mlen = 1; /* always at least one */
- inst->src[1] = src_reg(surf_index);
+ inst->src[1] = src_reg(index);
/* MRF for the first parameter */
src_reg lod = src_reg(0);
@@ -444,6 +444,8 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
emit(MOV(dst_reg(MRF, param_base, glsl_type::int_type, writemask), lod));
emit(inst);
+
+ brw_mark_surface_used(&prog_data->base, index);
break;
}
From eea3c907cc480a105224b21be51d62bc64ea1057 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga
Date: Fri, 30 Oct 2015 11:10:02 +0100
Subject: [PATCH 057/287] i965/fs: Do not mark used surfaces in
FS_OPCODE_GET_BUFFER_SIZE
Do it in the visitor, like we do for other opcodes.
v2: use const, get rid of useless surf_index temporary (Curro)
Reviewed-by: Francisco Jerez
---
src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 2 --
src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 6 ++++--
2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index c73257a8d21..974219f3ece 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -666,8 +666,6 @@ fs_generator::generate_get_buffer_size(fs_inst *inst,
inst->header_size > 0,
simd_mode,
BRW_SAMPLER_RETURN_FORMAT_SINT32);
-
- brw_mark_surface_used(prog_data, surf_index.dw1.ud);
}
void
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 50b8218e934..b6f4c52c50f 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -2275,12 +2275,14 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
BRW_REGISTER_TYPE_UD);
bld.LOAD_PAYLOAD(src_payload, &source, 1, 0);
- fs_reg surf_index = fs_reg(prog_data->binding_table.ssbo_start + ssbo_index);
+ const unsigned index = prog_data->binding_table.ssbo_start + ssbo_index;
fs_inst *inst = bld.emit(FS_OPCODE_GET_BUFFER_SIZE, dest,
- src_payload, surf_index);
+ src_payload, fs_reg(index));
inst->header_size = 0;
inst->mlen = mlen;
bld.emit(inst);
+
+ brw_mark_surface_used(prog_data, index);
break;
}
From e587590a83588133d7a9044e3935585f675bbb30 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin
Date: Thu, 5 Nov 2015 00:33:22 -0500
Subject: [PATCH 058/287] st/mesa: account for texture views when doing
CopyImageSubData
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: Ilia Mirkin
Reviewed-by: Marek Olšák
---
src/mesa/state_tracker/st_cb_copyimage.c | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/src/mesa/state_tracker/st_cb_copyimage.c b/src/mesa/state_tracker/st_cb_copyimage.c
index 75114cdb712..03a7294e7c9 100644
--- a/src/mesa/state_tracker/st_cb_copyimage.c
+++ b/src/mesa/state_tracker/st_cb_copyimage.c
@@ -552,6 +552,10 @@ st_CopyImageSubData(struct gl_context *ctx,
src_res = src->pt;
src_level = src_image->Level;
src_z += src_image->Face;
+ if (src_image->TexObject->Immutable) {
+ src_level += src_image->TexObject->MinLevel;
+ src_z += src_image->TexObject->MinLayer;
+ }
} else {
struct st_renderbuffer *src = st_renderbuffer(src_renderbuffer);
src_res = src->texture;
@@ -563,6 +567,10 @@ st_CopyImageSubData(struct gl_context *ctx,
dst_res = dst->pt;
dst_level = dst_image->Level;
dst_z += dst_image->Face;
+ if (dst_image->TexObject->Immutable) {
+ dst_level += dst_image->TexObject->MinLevel;
+ dst_z += dst_image->TexObject->MinLayer;
+ }
} else {
struct st_renderbuffer *dst = st_renderbuffer(dst_renderbuffer);
dst_res = dst->texture;
From fc76cc05e39839c0933320f28b4cc9041d4e7770 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin
Date: Fri, 30 Oct 2015 03:17:35 -0400
Subject: [PATCH 059/287] gallium: expose a debug message callback settable by
context owner
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
This will allow gallium drivers to send messages to KHR_debug endpoints
Signed-off-by: Ilia Mirkin
Reviewed-by: Marek Olšák
---
src/gallium/auxiliary/util/u_debug.c | 14 ++++++++++++++
src/gallium/auxiliary/util/u_debug.h | 20 ++++++++++++++++++++
src/gallium/docs/source/context.rst | 3 +++
src/gallium/include/pipe/p_context.h | 8 ++++++++
src/gallium/include/pipe/p_defines.h | 12 ++++++++++++
src/gallium/include/pipe/p_state.h | 25 +++++++++++++++++++++++++
6 files changed, 82 insertions(+)
diff --git a/src/gallium/auxiliary/util/u_debug.c b/src/gallium/auxiliary/util/u_debug.c
index 7388a499c74..702953673ba 100644
--- a/src/gallium/auxiliary/util/u_debug.c
+++ b/src/gallium/auxiliary/util/u_debug.c
@@ -70,6 +70,20 @@ void _debug_vprintf(const char *format, va_list ap)
#endif
}
+void
+_pipe_debug_message(
+ struct pipe_debug_callback *cb,
+ unsigned *id,
+ enum pipe_debug_type type,
+ const char *fmt, ...)
+{
+ va_list args;
+ va_start(args, fmt);
+ if (cb && cb->debug_message)
+ cb->debug_message(cb->data, id, type, fmt, args);
+ va_end(args);
+}
+
void
debug_disable_error_message_boxes(void)
diff --git a/src/gallium/auxiliary/util/u_debug.h b/src/gallium/auxiliary/util/u_debug.h
index 926063a1918..aaf223c6f68 100644
--- a/src/gallium/auxiliary/util/u_debug.h
+++ b/src/gallium/auxiliary/util/u_debug.h
@@ -42,6 +42,7 @@
#include "os/os_misc.h"
#include "pipe/p_format.h"
+#include "pipe/p_defines.h"
#ifdef __cplusplus
@@ -262,6 +263,25 @@ void _debug_assert_fail(const char *expr,
_debug_printf("error: %s\n", __msg)
#endif
+/**
+ * Output a debug log message to the debug info callback.
+ */
+#define pipe_debug_message(cb, type, fmt, ...) do { \
+ static unsigned id = 0; \
+ _pipe_debug_message(cb, &id, \
+ PIPE_DEBUG_TYPE_ ## type, \
+ fmt, __VA_ARGS__); \
+} while (0)
+
+struct pipe_debug_callback;
+
+void
+_pipe_debug_message(
+ struct pipe_debug_callback *cb,
+ unsigned *id,
+ enum pipe_debug_type type,
+ const char *fmt, ...) _util_printf_format(4, 5);
+
/**
* Used by debug_dump_enum and debug_dump_flags to describe symbols.
diff --git a/src/gallium/docs/source/context.rst b/src/gallium/docs/source/context.rst
index a7d08d2c7f9..dbc087700b5 100644
--- a/src/gallium/docs/source/context.rst
+++ b/src/gallium/docs/source/context.rst
@@ -84,6 +84,9 @@ objects. They all follow simple, one-method binding calls, e.g.
levels. This corresponds to GL's ``PATCH_DEFAULT_OUTER_LEVEL``.
* ``default_inner_level`` is the default value for the inner tessellation
levels. This corresponds to GL's ``PATCH_DEFAULT_INNER_LEVEL``.
+* ``set_debug_callback`` sets the callback to be used for reporting
+ various debug messages, eventually reported via KHR_debug and
+ similar mechanisms.
Sampler Views
diff --git a/src/gallium/include/pipe/p_context.h b/src/gallium/include/pipe/p_context.h
index 6f9fe767404..5adbd18e690 100644
--- a/src/gallium/include/pipe/p_context.h
+++ b/src/gallium/include/pipe/p_context.h
@@ -45,6 +45,7 @@ struct pipe_blit_info;
struct pipe_box;
struct pipe_clip_state;
struct pipe_constant_buffer;
+struct pipe_debug_callback;
struct pipe_depth_stencil_alpha_state;
struct pipe_draw_info;
struct pipe_fence_handle;
@@ -238,6 +239,13 @@ struct pipe_context {
const float default_outer_level[4],
const float default_inner_level[2]);
+ /**
+ * Sets the debug callback. If the pointer is null, then no callback is
+ * set; otherwise, a copy of the data should be made.
+ */
+ void (*set_debug_callback)(struct pipe_context *,
+ const struct pipe_debug_callback *);
+
/**
* Bind an array of shader buffers that will be used by a shader.
* Any buffers that were previously bound to the specified range
diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h
index b15c8809c1d..d6f87ccae12 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -868,6 +868,18 @@ struct pipe_driver_query_group_info
unsigned num_queries;
};
+enum pipe_debug_type
+{
+ PIPE_DEBUG_TYPE_OUT_OF_MEMORY = 1,
+ PIPE_DEBUG_TYPE_ERROR,
+ PIPE_DEBUG_TYPE_SHADER_INFO,
+ PIPE_DEBUG_TYPE_PERF_INFO,
+ PIPE_DEBUG_TYPE_INFO,
+ PIPE_DEBUG_TYPE_FALLBACK,
+ PIPE_DEBUG_TYPE_CONFORMANCE,
+};
+
+
#ifdef __cplusplus
}
#endif
diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h
index 4bf8d46c686..6bdf03a8b2b 100644
--- a/src/gallium/include/pipe/p_state.h
+++ b/src/gallium/include/pipe/p_state.h
@@ -684,6 +684,31 @@ struct pipe_compute_state
unsigned req_input_mem; /**< Required size of the INPUT resource. */
};
+/**
+ * Structure that contains a callback for debug messages from the driver back
+ * to the state tracker.
+ */
+struct pipe_debug_callback
+{
+ /**
+ * Callback for the driver to report debug/performance/etc information back
+ * to the state tracker.
+ *
+ * \param data user-supplied data pointer
+ * \param id message type identifier, if pointed value is 0, then a
+ * new id is assigned
+ * \param type PIPE_DEBUG_TYPE_*
+ * \param format printf-style format string
+ * \param args args for format string
+ */
+ void (*debug_message)(void *data,
+ unsigned *id,
+ enum pipe_debug_type type,
+ const char *fmt,
+ va_list args);
+ void *data;
+};
+
#ifdef __cplusplus
}
#endif
From c93c9d220baa60fdd0e685a072a61857d3a2846b Mon Sep 17 00:00:00 2001
From: Ilia Mirkin
Date: Fri, 30 Oct 2015 23:28:01 -0400
Subject: [PATCH 060/287] st/mesa: set debug callback for debug contexts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: Ilia Mirkin
Reviewed-by: Marek Olšák
Reviewed-by: Brian Paul
---
src/mesa/state_tracker/st_manager.c | 57 +++++++++++++++++++++++++++++
1 file changed, 57 insertions(+)
diff --git a/src/mesa/state_tracker/st_manager.c b/src/mesa/state_tracker/st_manager.c
index 7abd128e719..d0d261f4fde 100644
--- a/src/mesa/state_tracker/st_manager.c
+++ b/src/mesa/state_tracker/st_manager.c
@@ -623,6 +623,58 @@ st_context_destroy(struct st_context_iface *stctxi)
st_destroy_context(st);
}
+static void
+st_debug_message(void *data,
+ unsigned *id,
+ enum pipe_debug_type ptype,
+ const char *fmt,
+ va_list args)
+{
+ struct st_context *st = data;
+ enum mesa_debug_source source;
+ enum mesa_debug_type type;
+ enum mesa_debug_severity severity;
+
+ switch (ptype) {
+ case PIPE_DEBUG_TYPE_OUT_OF_MEMORY:
+ source = MESA_DEBUG_SOURCE_API;
+ type = MESA_DEBUG_TYPE_ERROR;
+ severity = MESA_DEBUG_SEVERITY_MEDIUM;
+ break;
+ case PIPE_DEBUG_TYPE_ERROR:
+ source = MESA_DEBUG_SOURCE_API;
+ type = MESA_DEBUG_TYPE_ERROR;
+ severity = MESA_DEBUG_SEVERITY_MEDIUM;
+ break;
+ case PIPE_DEBUG_TYPE_SHADER_INFO:
+ source = MESA_DEBUG_SOURCE_SHADER_COMPILER;
+ type = MESA_DEBUG_TYPE_OTHER;
+ severity = MESA_DEBUG_SEVERITY_NOTIFICATION;
+ break;
+ case PIPE_DEBUG_TYPE_PERF_INFO:
+ source = MESA_DEBUG_SOURCE_API;
+ type = MESA_DEBUG_TYPE_PERFORMANCE;
+ severity = MESA_DEBUG_SEVERITY_NOTIFICATION;
+ break;
+ case PIPE_DEBUG_TYPE_INFO:
+ source = MESA_DEBUG_SOURCE_API;
+ type = MESA_DEBUG_TYPE_OTHER;
+ severity = MESA_DEBUG_SEVERITY_NOTIFICATION;
+ break;
+ case PIPE_DEBUG_TYPE_FALLBACK:
+ source = MESA_DEBUG_SOURCE_API;
+ type = MESA_DEBUG_TYPE_PERFORMANCE;
+ severity = MESA_DEBUG_SEVERITY_NOTIFICATION;
+ break;
+ case PIPE_DEBUG_TYPE_CONFORMANCE:
+ source = MESA_DEBUG_SOURCE_API;
+ type = MESA_DEBUG_TYPE_OTHER;
+ severity = MESA_DEBUG_SEVERITY_NOTIFICATION;
+ break;
+ }
+ _mesa_gl_vdebug(st->ctx, id, source, type, severity, fmt, args);
+}
+
static struct st_context_iface *
st_api_create_context(struct st_api *stapi, struct st_manager *smapi,
const struct st_context_attribs *attribs,
@@ -677,6 +729,11 @@ st_api_create_context(struct st_api *stapi, struct st_manager *smapi,
return NULL;
}
st->ctx->Const.ContextFlags |= GL_CONTEXT_FLAG_DEBUG_BIT;
+
+ if (pipe->set_debug_callback) {
+ struct pipe_debug_callback cb = { st_debug_message, st };
+ pipe->set_debug_callback(pipe, &cb);
+ }
}
if (attribs->flags & ST_CONTEXT_FLAG_FORWARD_COMPATIBLE)
From 6706cc1671bfd8e6c021db8b68815959fa7fceba Mon Sep 17 00:00:00 2001
From: Ilia Mirkin
Date: Fri, 30 Oct 2015 23:25:59 -0400
Subject: [PATCH 061/287] st/clover: provide a path for drivers to call through
to pfn_notify
Signed-off-by: Ilia Mirkin
[ Francisco Jerez: Clean up clover::context interface by passing
around a function object. ]
---
.../state_trackers/clover/api/context.cpp | 7 ++++++-
.../state_trackers/clover/core/context.cpp | 5 +++--
.../state_trackers/clover/core/context.hpp | 7 ++++++-
.../state_trackers/clover/core/queue.cpp | 21 +++++++++++++++++++
4 files changed, 36 insertions(+), 4 deletions(-)
diff --git a/src/gallium/state_trackers/clover/api/context.cpp b/src/gallium/state_trackers/clover/api/context.cpp
index 021eea36f6e..c0cd2d32b95 100644
--- a/src/gallium/state_trackers/clover/api/context.cpp
+++ b/src/gallium/state_trackers/clover/api/context.cpp
@@ -45,8 +45,13 @@ clCreateContext(const cl_context_properties *d_props, cl_uint num_devs,
throw error(CL_INVALID_PROPERTY);
}
+ const auto notify = (!pfn_notify ? context::notify_action() :
+ [=](const char *s) {
+ pfn_notify(s, NULL, 0, user_data);
+ });
+
ret_error(r_errcode, CL_SUCCESS);
- return desc(new context(props, devs));
+ return desc(new context(props, devs, notify));
} catch (error &e) {
ret_error(r_errcode, e);
diff --git a/src/gallium/state_trackers/clover/core/context.cpp b/src/gallium/state_trackers/clover/core/context.cpp
index bf4df39dc2a..c3e20829384 100644
--- a/src/gallium/state_trackers/clover/core/context.cpp
+++ b/src/gallium/state_trackers/clover/core/context.cpp
@@ -25,8 +25,9 @@
using namespace clover;
context::context(const property_list &props,
- const ref_vector<device> &devs) :
- props(props), devs(devs) {
+ const ref_vector<device> &devs,
+ const notify_action &notify) :
+ notify(notify), props(props), devs(devs) {
}
bool
diff --git a/src/gallium/state_trackers/clover/core/context.hpp b/src/gallium/state_trackers/clover/core/context.hpp
index 0ec4ff4a231..7b22ccae78f 100644
--- a/src/gallium/state_trackers/clover/core/context.hpp
+++ b/src/gallium/state_trackers/clover/core/context.hpp
@@ -36,7 +36,10 @@ namespace clover {
typedef clover::property_list<cl_context_properties> property_list;
public:
- context(const property_list &props, const ref_vector<device> &devs);
+ typedef std::function<void (const char *)> notify_action;
+
+ context(const property_list &props, const ref_vector<device> &devs,
+ const notify_action &notify);
context(const context &ctx) = delete;
context &
@@ -53,6 +56,8 @@ namespace clover {
device_range
devices() const;
+ const notify_action notify;
+
private:
property_list props;
const std::vector<intrusive_ref<device>> devs;
diff --git a/src/gallium/state_trackers/clover/core/queue.cpp b/src/gallium/state_trackers/clover/core/queue.cpp
index 4aaf67de241..24d71f186e0 100644
--- a/src/gallium/state_trackers/clover/core/queue.cpp
+++ b/src/gallium/state_trackers/clover/core/queue.cpp
@@ -24,15 +24,36 @@
#include "core/event.hpp"
#include "pipe/p_screen.h"
#include "pipe/p_context.h"
+#include "pipe/p_state.h"
using namespace clover;
+namespace {
+ void
+ debug_notify_callback(void *data,
+ unsigned *id,
+ enum pipe_debug_type type,
+ const char *fmt,
+ va_list args) {
+ const command_queue *queue = (const command_queue *)data;
+ char buffer[1024];
+ vsnprintf(buffer, sizeof(buffer), fmt, args);
+ queue->context().notify(buffer);
+ }
+}
+
command_queue::command_queue(clover::context &ctx, clover::device &dev,
cl_command_queue_properties props) :
context(ctx), device(dev), props(props) {
pipe = dev.pipe->context_create(dev.pipe, NULL, PIPE_CONTEXT_COMPUTE_ONLY);
if (!pipe)
throw error(CL_INVALID_DEVICE);
+
+ if (ctx.notify) {
+ struct pipe_debug_callback cb = { &debug_notify_callback, this };
+ if (pipe->set_debug_callback)
+ pipe->set_debug_callback(pipe, &cb);
+ }
}
command_queue::~command_queue() {
From 4335b28840be53ad3c230a4f2dfc2262bf56a0a7 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin
Date: Fri, 30 Oct 2015 17:23:22 -0400
Subject: [PATCH 062/287] nouveau: add support for sending debug messages via
KHR_debug
Signed-off-by: Ilia Mirkin
---
src/gallium/drivers/nouveau/nouveau_context.h | 4 ++++
src/gallium/drivers/nouveau/nouveau_screen.c | 19 +++++++++++++++++++
.../drivers/nouveau/nv30/nv30_context.c | 1 +
.../drivers/nouveau/nv50/nv50_context.c | 1 +
.../drivers/nouveau/nvc0/nvc0_context.c | 1 +
5 files changed, 26 insertions(+)
diff --git a/src/gallium/drivers/nouveau/nouveau_context.h b/src/gallium/drivers/nouveau/nouveau_context.h
index decb2714ede..a8189b82f82 100644
--- a/src/gallium/drivers/nouveau/nouveau_context.h
+++ b/src/gallium/drivers/nouveau/nouveau_context.h
@@ -14,6 +14,7 @@ struct nouveau_context {
struct nouveau_client *client;
struct nouveau_pushbuf *pushbuf;
+ struct pipe_debug_callback debug;
bool vbo_dirty;
@@ -63,6 +64,9 @@ nouveau_context(struct pipe_context *pipe)
void
nouveau_context_init_vdec(struct nouveau_context *);
+void
+nouveau_context_init(struct nouveau_context *);
+
void
nouveau_scratch_runout_release(struct nouveau_context *);
diff --git a/src/gallium/drivers/nouveau/nouveau_screen.c b/src/gallium/drivers/nouveau/nouveau_screen.c
index 47603b0b7fd..21d431788ec 100644
--- a/src/gallium/drivers/nouveau/nouveau_screen.c
+++ b/src/gallium/drivers/nouveau/nouveau_screen.c
@@ -18,6 +18,7 @@
#include "nouveau_winsys.h"
#include "nouveau_screen.h"
+#include "nouveau_context.h"
#include "nouveau_fence.h"
#include "nouveau_mm.h"
#include "nouveau_buffer.h"
@@ -238,3 +239,21 @@ nouveau_screen_fini(struct nouveau_screen *screen)
nouveau_device_del(&screen->device);
}
+
+static void
+nouveau_set_debug_callback(struct pipe_context *pipe,
+ const struct pipe_debug_callback *cb)
+{
+ struct nouveau_context *context = nouveau_context(pipe);
+
+ if (cb)
+ context->debug = *cb;
+ else
+ memset(&context->debug, 0, sizeof(context->debug));
+}
+
+void
+nouveau_context_init(struct nouveau_context *context)
+{
+ context->pipe.set_debug_callback = nouveau_set_debug_callback;
+}
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_context.c b/src/gallium/drivers/nouveau/nv30/nv30_context.c
index a36fd57fae7..3ed088912e2 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_context.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_context.c
@@ -242,6 +242,7 @@ nv30_context_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags)
if (debug_get_bool_option("NV30_SWTNL", false))
nv30->draw_flags |= NV30_NEW_SWTNL;
+ nouveau_context_init(&nv30->base);
nv30->sample_mask = 0xffff;
nv30_vbo_init(pipe);
nv30_query_init(pipe);
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.c b/src/gallium/drivers/nouveau/nv50/nv50_context.c
index 4108f48005e..7867c2df7f3 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_context.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_context.c
@@ -306,6 +306,7 @@ nv50_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags)
}
nv50->base.pushbuf->kick_notify = nv50_default_kick_notify;
+ nouveau_context_init(&nv50->base);
nv50_init_query_functions(nv50);
nv50_init_surface_functions(nv50);
nv50_init_state_functions(nv50);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
index f7604f11788..82ed5a1864e 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
@@ -309,6 +309,7 @@ nvc0_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags)
pipe->memory_barrier = nvc0_memory_barrier;
pipe->get_sample_position = nvc0_context_get_sample_position;
+ nouveau_context_init(&nvc0->base);
nvc0_init_query_functions(nvc0);
nvc0_init_surface_functions(nvc0);
nvc0_init_state_functions(nvc0);
From 4f6cd5fad03757e371b66049dcd42855e4853c14 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin
Date: Fri, 30 Oct 2015 18:41:09 -0400
Subject: [PATCH 063/287] nv50,nvc0: provide debug messages with shader
compilation stats
Signed-off-by: Ilia Mirkin
---
src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h | 1 +
src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp | 2 ++
src/gallium/drivers/nouveau/nv50/nv50_program.c | 8 +++++++-
src/gallium/drivers/nouveau/nv50/nv50_program.h | 3 ++-
src/gallium/drivers/nouveau/nv50/nv50_shader_state.c | 2 +-
src/gallium/drivers/nouveau/nv50/nv50_state.c | 3 ++-
src/gallium/drivers/nouveau/nvc0/nvc0_compute.c | 2 +-
src/gallium/drivers/nouveau/nvc0/nvc0_context.h | 3 ++-
src/gallium/drivers/nouveau/nvc0/nvc0_program.c | 8 +++++++-
src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c | 2 +-
src/gallium/drivers/nouveau/nvc0/nvc0_state.c | 3 ++-
11 files changed, 28 insertions(+), 9 deletions(-)
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
index c0cab3299b5..b49bf9d53bc 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
@@ -96,6 +96,7 @@ struct nv50_ir_prog_info
uint32_t tlsSpace; /* required local memory per thread */
uint32_t *code;
uint32_t codeSize;
+ uint32_t instructions;
uint8_t sourceRep; /* NV50_PROGRAM_IR */
const void *source;
void *relocData;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp
index afc8ff1374f..4390a726d1c 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp
@@ -373,6 +373,7 @@ Program::emitBinary(struct nv50_ir_prog_info *info)
if (!code)
return false;
emit->setCodeLocation(code, binSize);
+ info->bin.instructions = 0;
for (ArrayList::Iterator fi = allFuncs.iterator(); !fi.end(); fi.next()) {
Function *fn = reinterpret_cast<Function *>(fi.get());
@@ -382,6 +383,7 @@ Program::emitBinary(struct nv50_ir_prog_info *info)
for (int b = 0; b < fn->bbCount; ++b) {
for (Instruction *i = fn->bbArray[b]->getEntry(); i; i = i->next) {
emit->emitInstruction(i);
+ info->bin.instructions++;
if (i->sType == TYPE_F64 || i->dType == TYPE_F64)
info->io.fp64 = true;
}
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_program.c b/src/gallium/drivers/nouveau/nv50/nv50_program.c
index 299629b6438..89e7a338283 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_program.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_program.c
@@ -318,7 +318,8 @@ nv50_program_create_strmout_state(const struct nv50_ir_prog_info *info,
}
bool
-nv50_program_translate(struct nv50_program *prog, uint16_t chipset)
+nv50_program_translate(struct nv50_program *prog, uint16_t chipset,
+ struct pipe_debug_callback *debug)
{
struct nv50_ir_prog_info *info;
int ret;
@@ -406,6 +407,11 @@ nv50_program_translate(struct nv50_program *prog, uint16_t chipset)
prog->so = nv50_program_create_strmout_state(info,
&prog->pipe.stream_output);
+ pipe_debug_message(debug, SHADER_INFO,
+ "type: %d, local: %d, gpr: %d, inst: %d, bytes: %d",
+ prog->type, info->bin.tlsSpace, prog->max_gpr,
+ info->bin.instructions, info->bin.codeSize);
+
out:
FREE(info);
return !ret;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_program.h b/src/gallium/drivers/nouveau/nv50/nv50_program.h
index 24cc96567d7..7a33eb11d6d 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_program.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_program.h
@@ -106,7 +106,8 @@ struct nv50_program {
struct nv50_stream_output_state *so;
};
-bool nv50_program_translate(struct nv50_program *, uint16_t chipset);
+bool nv50_program_translate(struct nv50_program *, uint16_t chipset,
+ struct pipe_debug_callback *);
bool nv50_program_upload_code(struct nv50_context *, struct nv50_program *);
void nv50_program_destroy(struct nv50_context *, struct nv50_program *);
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
index 9b911043132..8e4b2b42bda 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
@@ -113,7 +113,7 @@ nv50_program_validate(struct nv50_context *nv50, struct nv50_program *prog)
{
if (!prog->translated) {
prog->translated = nv50_program_translate(
- prog, nv50->screen->base.device->chipset);
+ prog, nv50->screen->base.device->chipset, &nv50->base.debug);
if (!prog->translated)
return false;
} else
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state.c b/src/gallium/drivers/nouveau/nv50/nv50_state.c
index 6c8c9f0b4e6..d27f12ca94b 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_state.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state.c
@@ -727,7 +727,8 @@ nv50_sp_state_create(struct pipe_context *pipe,
prog->pipe.stream_output = cso->stream_output;
prog->translated = nv50_program_translate(
- prog, nv50_context(pipe)->screen->base.device->chipset);
+ prog, nv50_context(pipe)->screen->base.device->chipset,
+ &nouveau_context(pipe)->debug);
return (void *)prog;
}
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c
index e33af042620..2e7c790e9ee 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c
@@ -120,7 +120,7 @@ nvc0_compute_validate_program(struct nvc0_context *nvc0)
if (!prog->translated) {
prog->translated = nvc0_program_translate(
- prog, nvc0->screen->base.device->chipset);
+ prog, nvc0->screen->base.device->chipset, &nvc0->base.debug);
if (!prog->translated)
return false;
}
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
index 4af83c53224..39b73ecb0c2 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
@@ -224,7 +224,8 @@ void nvc0_default_kick_notify(struct nouveau_pushbuf *);
extern struct draw_stage *nvc0_draw_render_stage(struct nvc0_context *);
/* nvc0_program.c */
-bool nvc0_program_translate(struct nvc0_program *, uint16_t chipset);
+bool nvc0_program_translate(struct nvc0_program *, uint16_t chipset,
+ struct pipe_debug_callback *);
bool nvc0_program_upload_code(struct nvc0_context *, struct nvc0_program *);
void nvc0_program_destroy(struct nvc0_context *, struct nvc0_program *);
void nvc0_program_library_upload(struct nvc0_context *);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
index 68048f9d6c0..43d7c7b1123 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
@@ -517,7 +517,8 @@ nvc0_program_dump(struct nvc0_program *prog)
#endif
bool
-nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset)
+nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset,
+ struct pipe_debug_callback *debug)
{
struct nv50_ir_prog_info *info;
int ret;
@@ -639,6 +640,11 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset)
prog->tfb = nvc0_program_create_tfb_state(info,
&prog->pipe.stream_output);
+ pipe_debug_message(debug, SHADER_INFO,
+ "type: %d, local: %d, gpr: %d, inst: %d, bytes: %d",
+ prog->type, info->bin.tlsSpace, prog->num_gprs,
+ info->bin.instructions, info->bin.codeSize);
+
out:
FREE(info);
return !ret;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
index 8595800592c..7e2e9992fe8 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
@@ -72,7 +72,7 @@ nvc0_program_validate(struct nvc0_context *nvc0, struct nvc0_program *prog)
if (!prog->translated) {
prog->translated = nvc0_program_translate(
- prog, nvc0->screen->base.device->chipset);
+ prog, nvc0->screen->base.device->chipset, &nvc0->base.debug);
if (!prog->translated)
return false;
}
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
index ba1714da010..5dce5f0e65d 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
@@ -681,7 +681,8 @@ nvc0_sp_state_create(struct pipe_context *pipe,
prog->pipe.stream_output = cso->stream_output;
prog->translated = nvc0_program_translate(
- prog, nvc0_context(pipe)->screen->base.device->chipset);
+ prog, nvc0_context(pipe)->screen->base.device->chipset,
+ &nouveau_context(pipe)->debug);
return (void *)prog;
}
From ba093a099af13a630c255b34dc5d315760248e5f Mon Sep 17 00:00:00 2001
From: Ilia Mirkin
Date: Fri, 30 Oct 2015 20:44:57 -0400
Subject: [PATCH 064/287] nouveau: send back a debug message when waiting for a
fence to complete
Signed-off-by: Ilia Mirkin
---
src/gallium/drivers/nouveau/nouveau_buffer.c | 13 +++++++------
src/gallium/drivers/nouveau/nouveau_context.h | 1 +
src/gallium/drivers/nouveau/nouveau_fence.c | 14 ++++++++++++--
src/gallium/drivers/nouveau/nouveau_fence.h | 4 +++-
src/gallium/drivers/nouveau/nouveau_screen.c | 2 +-
src/gallium/drivers/nouveau/nv30/nv30_screen.c | 2 +-
src/gallium/drivers/nouveau/nv50/nv50_screen.c | 2 +-
src/gallium/drivers/nouveau/nv50/nv50_vbo.c | 2 +-
src/gallium/drivers/nouveau/nvc0/nvc0_screen.c | 2 +-
src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c | 4 ++--
10 files changed, 30 insertions(+), 16 deletions(-)
diff --git a/src/gallium/drivers/nouveau/nouveau_buffer.c b/src/gallium/drivers/nouveau/nouveau_buffer.c
index 72e070b5f06..68e69beb08f 100644
--- a/src/gallium/drivers/nouveau/nouveau_buffer.c
+++ b/src/gallium/drivers/nouveau/nouveau_buffer.c
@@ -225,21 +225,22 @@ nouveau_transfer_write(struct nouveau_context *nv, struct nouveau_transfer *tx,
* for write/read by waiting on the buffer's relevant fences.
*/
static inline bool
-nouveau_buffer_sync(struct nv04_resource *buf, unsigned rw)
+nouveau_buffer_sync(struct nouveau_context *nv,
+ struct nv04_resource *buf, unsigned rw)
{
if (rw == PIPE_TRANSFER_READ) {
if (!buf->fence_wr)
return true;
NOUVEAU_DRV_STAT_RES(buf, buf_non_kernel_fence_sync_count,
!nouveau_fence_signalled(buf->fence_wr));
- if (!nouveau_fence_wait(buf->fence_wr))
+ if (!nouveau_fence_wait(buf->fence_wr, &nv->debug))
return false;
} else {
if (!buf->fence)
return true;
NOUVEAU_DRV_STAT_RES(buf, buf_non_kernel_fence_sync_count,
!nouveau_fence_signalled(buf->fence));
- if (!nouveau_fence_wait(buf->fence))
+ if (!nouveau_fence_wait(buf->fence, &nv->debug))
return false;
nouveau_fence_ref(NULL, &buf->fence);
@@ -478,7 +479,7 @@ nouveau_buffer_transfer_map(struct pipe_context *pipe,
if (unlikely(usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE)) {
/* Discarding was not possible, must sync because
* subsequent transfers might use UNSYNCHRONIZED. */
- nouveau_buffer_sync(buf, usage & PIPE_TRANSFER_READ_WRITE);
+ nouveau_buffer_sync(nv, buf, usage & PIPE_TRANSFER_READ_WRITE);
} else
if (usage & PIPE_TRANSFER_DISCARD_RANGE) {
/* The whole range is being discarded, so it doesn't matter what was
@@ -490,7 +491,7 @@ nouveau_buffer_transfer_map(struct pipe_context *pipe,
if (usage & PIPE_TRANSFER_DONTBLOCK)
map = NULL;
else
- nouveau_buffer_sync(buf, usage & PIPE_TRANSFER_READ_WRITE);
+ nouveau_buffer_sync(nv, buf, usage & PIPE_TRANSFER_READ_WRITE);
} else {
/* It is expected that the returned buffer be a representation of the
* data in question, so we must copy it over from the buffer. */
@@ -615,7 +616,7 @@ nouveau_resource_map_offset(struct nouveau_context *nv,
if (res->mm) {
unsigned rw;
rw = (flags & NOUVEAU_BO_WR) ? PIPE_TRANSFER_WRITE : PIPE_TRANSFER_READ;
- nouveau_buffer_sync(res, rw);
+ nouveau_buffer_sync(nv, res, rw);
if (nouveau_bo_map(res->bo, 0, NULL))
return NULL;
} else {
diff --git a/src/gallium/drivers/nouveau/nouveau_context.h b/src/gallium/drivers/nouveau/nouveau_context.h
index a8189b82f82..c3bbb11bd60 100644
--- a/src/gallium/drivers/nouveau/nouveau_context.h
+++ b/src/gallium/drivers/nouveau/nouveau_context.h
@@ -2,6 +2,7 @@
#define __NOUVEAU_CONTEXT_H__
#include "pipe/p_context.h"
+#include "pipe/p_state.h"
#include <nouveau.h>
#define NOUVEAU_MAX_SCRATCH_BUFS 4
diff --git a/src/gallium/drivers/nouveau/nouveau_fence.c b/src/gallium/drivers/nouveau/nouveau_fence.c
index 21cf2b9ae5e..d3a34060952 100644
--- a/src/gallium/drivers/nouveau/nouveau_fence.c
+++ b/src/gallium/drivers/nouveau/nouveau_fence.c
@@ -23,6 +23,7 @@
#include "nouveau_screen.h"
#include "nouveau_winsys.h"
#include "nouveau_fence.h"
+#include "os/os_time.h"
#ifdef PIPE_OS_UNIX
#include <sched.h>
@@ -182,10 +183,11 @@ nouveau_fence_signalled(struct nouveau_fence *fence)
}
bool
-nouveau_fence_wait(struct nouveau_fence *fence)
+nouveau_fence_wait(struct nouveau_fence *fence, struct pipe_debug_callback *debug)
{
struct nouveau_screen *screen = fence->screen;
uint32_t spins = 0;
+ int64_t start = 0;
/* wtf, someone is waiting on a fence in flush_notify handler? */
assert(fence->state != NOUVEAU_FENCE_STATE_EMITTING);
@@ -206,11 +208,19 @@ nouveau_fence_wait(struct nouveau_fence *fence)
if (fence == screen->fence.current)
nouveau_fence_next(screen);
+ if (debug && debug->debug_message)
+ start = os_time_get_nano();
+
do {
nouveau_fence_update(screen, false);
- if (fence->state == NOUVEAU_FENCE_STATE_SIGNALLED)
+ if (fence->state == NOUVEAU_FENCE_STATE_SIGNALLED) {
+ if (debug && debug->debug_message)
+ pipe_debug_message(debug, PERF_INFO,
+ "stalled %.3f ms waiting for fence",
+ (os_time_get_nano() - start) / 1000000.f);
return true;
+ }
if (!spins)
NOUVEAU_DRV_STAT(screen, any_non_kernel_fence_sync_count, 1);
spins++;
diff --git a/src/gallium/drivers/nouveau/nouveau_fence.h b/src/gallium/drivers/nouveau/nouveau_fence.h
index 2efcab2172d..0fa9d020f50 100644
--- a/src/gallium/drivers/nouveau/nouveau_fence.h
+++ b/src/gallium/drivers/nouveau/nouveau_fence.h
@@ -11,6 +11,8 @@
#define NOUVEAU_FENCE_STATE_FLUSHED 3
#define NOUVEAU_FENCE_STATE_SIGNALLED 4
+struct pipe_debug_callback;
+
struct nouveau_fence_work {
struct list_head list;
void (*func)(void *);
@@ -34,7 +36,7 @@ bool nouveau_fence_new(struct nouveau_screen *, struct nouveau_fence **,
bool nouveau_fence_work(struct nouveau_fence *, void (*)(void *), void *);
void nouveau_fence_update(struct nouveau_screen *, bool flushed);
void nouveau_fence_next(struct nouveau_screen *);
-bool nouveau_fence_wait(struct nouveau_fence *);
+bool nouveau_fence_wait(struct nouveau_fence *, struct pipe_debug_callback *);
bool nouveau_fence_signalled(struct nouveau_fence *);
void nouveau_fence_unref_bo(void *data); /* generic unref bo callback */
diff --git a/src/gallium/drivers/nouveau/nouveau_screen.c b/src/gallium/drivers/nouveau/nouveau_screen.c
index 21d431788ec..a6065e45aaa 100644
--- a/src/gallium/drivers/nouveau/nouveau_screen.c
+++ b/src/gallium/drivers/nouveau/nouveau_screen.c
@@ -76,7 +76,7 @@ nouveau_screen_fence_finish(struct pipe_screen *screen,
if (!timeout)
return nouveau_fence_signalled(nouveau_fence(pfence));
- return nouveau_fence_wait(nouveau_fence(pfence));
+ return nouveau_fence_wait(nouveau_fence(pfence), NULL);
}
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
index 794a0898eaf..44aac22010f 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
@@ -383,7 +383,7 @@ nv30_screen_destroy(struct pipe_screen *pscreen)
* _current_ one, and remove both.
*/
nouveau_fence_ref(screen->base.fence.current, &current);
- nouveau_fence_wait(current);
+ nouveau_fence_wait(current, NULL);
nouveau_fence_ref(NULL, &current);
nouveau_fence_ref(NULL, &screen->base.fence.current);
}
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
index de2150ca08c..5dda98141de 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -350,7 +350,7 @@ nv50_screen_destroy(struct pipe_screen *pscreen)
* _current_ one, and remove both.
*/
nouveau_fence_ref(screen->base.fence.current, &current);
- nouveau_fence_wait(current);
+ nouveau_fence_wait(current, NULL);
nouveau_fence_ref(NULL, &current);
nouveau_fence_ref(NULL, &screen->base.fence.current);
}
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
index 9fa6fceeefa..9aa593f919e 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
@@ -636,7 +636,7 @@ nv50_draw_elements(struct nv50_context *nv50, bool shorten,
* pushbuf submit, but it's probably not a big performance difference.
*/
if (buf->fence_wr && !nouveau_fence_signalled(buf->fence_wr))
- nouveau_fence_wait(buf->fence_wr);
+ nouveau_fence_wait(buf->fence_wr, &nv50->base.debug);
while (instance_count--) {
BEGIN_NV04(push, NV50_3D(VERTEX_BEGIN_GL), 1);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index 3b543929f3c..7d96977c24b 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -415,7 +415,7 @@ nvc0_screen_destroy(struct pipe_screen *pscreen)
* _current_ one, and remove both.
*/
nouveau_fence_ref(screen->base.fence.current, &current);
- nouveau_fence_wait(current);
+ nouveau_fence_wait(current, NULL);
nouveau_fence_ref(NULL, &current);
nouveau_fence_ref(NULL, &screen->base.fence.current);
}
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c b/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c
index d459dd61c19..279c7e93cc8 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c
@@ -340,8 +340,8 @@ nvc0_mt_sync(struct nvc0_context *nvc0, struct nv50_miptree *mt, unsigned usage)
return !nouveau_bo_wait(mt->base.bo, access, nvc0->base.client);
}
if (usage & PIPE_TRANSFER_WRITE)
- return !mt->base.fence || nouveau_fence_wait(mt->base.fence);
- return !mt->base.fence_wr || nouveau_fence_wait(mt->base.fence_wr);
+ return !mt->base.fence || nouveau_fence_wait(mt->base.fence, &nvc0->base.debug);
+ return !mt->base.fence_wr || nouveau_fence_wait(mt->base.fence_wr, &nvc0->base.debug);
}
void *
From 5ae37ae6151623303300047d7465d199df8199a4 Mon Sep 17 00:00:00 2001
From: Roland Scheidegger
Date: Thu, 5 Nov 2015 18:00:40 +0100
Subject: [PATCH 065/287] llvmpipe: disable texture cache
There are some weird problems with 8-wide vectors.
---
src/gallium/drivers/llvmpipe/lp_tex_sample.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/gallium/drivers/llvmpipe/lp_tex_sample.h b/src/gallium/drivers/llvmpipe/lp_tex_sample.h
index 939131e7975..e26d608c9eb 100644
--- a/src/gallium/drivers/llvmpipe/lp_tex_sample.h
+++ b/src/gallium/drivers/llvmpipe/lp_tex_sample.h
@@ -37,7 +37,7 @@ struct lp_sampler_static_state;
/**
* Whether texture cache is used for s3tc textures.
*/
-#define LP_USE_TEXTURE_CACHE 1
+#define LP_USE_TEXTURE_CACHE 0
/**
* Pure-LLVM texture sampling code generator.
From 8dcf807cb43383590ba193c7ff20b8a98e4a9f65 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke
Date: Tue, 13 Oct 2015 15:30:03 -0700
Subject: [PATCH 066/287] i965: Fix scalar VS float[] and vec2[] output arrays.
The scalar VS backend has never handled float[] and vec2[] outputs
correctly (my original code was broken). Outputs need to be padded
out to vec4 slots.
In fs_visitor::nir_setup_outputs(), we tried to process each vec4 slot
by looping from 0 to ALIGN(type_size_scalar(type), 4) / 4. However,
this is wrong: type_size_scalar() for a float[2] would return 2, or
for vec2[2] it would return 4. This looked like a single slot, even
though in reality each array element would be stored in separate vec4
slots.
Because of this bug, outputs[] and output_components[] would not get
initialized for the second element's VARYING_SLOT, which meant
emit_urb_writes() would skip writing them. Nothing used those values,
and dead code elimination threw a party.
To fix this, we introduce a new type_size_vec4_times_4() function which
pads array elements correctly, but still counts in scalar components,
generating correct indices in store_output intrinsics.
Normally, varying packing avoids this problem by turning varyings into
vec4s. So this doesn't actually fix any Piglit or dEQP tests today.
However, if varying packing is disabled, things would be broken.
Tessellation shaders can't use varying packing, so this fixes various
tcs-input Piglit tests on a branch of mine.
v2: Shorten the implementation of type_size_4x to a single line (caught
by Connor Abbott), and rename it to type_size_vec4_times_4()
(renaming suggested by Jason Ekstrand). Use type_size_vec4
rather than using type_size_vec4_times_4 and then dividing by 4.
Signed-off-by: Kenneth Graunke
Reviewed-by: Jason Ekstrand
---
src/mesa/drivers/dri/i965/brw_fs.cpp | 13 +++++++++++++
src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 2 +-
src/mesa/drivers/dri/i965/brw_nir.c | 3 ++-
src/mesa/drivers/dri/i965/brw_shader.h | 1 +
4 files changed, 17 insertions(+), 2 deletions(-)
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 629fbbdf01b..ad94fa479e2 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -513,6 +513,19 @@ type_size_scalar(const struct glsl_type *type)
return 0;
}
+/**
+ * Returns the number of scalar components needed to store type, assuming
+ * that vectors are padded out to vec4.
+ *
+ * This has the packing rules of type_size_vec4(), but counts components
+ * similar to type_size_scalar().
+ */
+extern "C" int
+type_size_vec4_times_4(const struct glsl_type *type)
+{
+ return 4 * type_size_vec4(type);
+}
+
/**
* Create a MOV to read the timestamp register.
*
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index b6f4c52c50f..261518605b7 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -104,7 +104,7 @@ fs_visitor::nir_setup_outputs()
switch (stage) {
case MESA_SHADER_VERTEX:
case MESA_SHADER_GEOMETRY:
- for (unsigned int i = 0; i < ALIGN(type_size_scalar(var->type), 4) / 4; i++) {
+ for (int i = 0; i < type_size_vec4(var->type); i++) {
int output = var->data.location + i;
this->outputs[output] = offset(reg, bld, 4 * i);
this->output_components[output] = vector_elements;
diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c
index a7a5eb511cd..dece208233f 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.c
+++ b/src/mesa/drivers/dri/i965/brw_nir.c
@@ -150,7 +150,8 @@ brw_nir_lower_outputs(nir_shader *nir, bool is_scalar)
case MESA_SHADER_GEOMETRY:
if (is_scalar) {
nir_assign_var_locations(&nir->outputs, &nir->num_outputs,
- type_size_scalar);
+ type_size_vec4_times_4);
+ nir_lower_io(nir, nir_var_shader_out, type_size_vec4_times_4);
} else {
nir_foreach_variable(var, &nir->outputs)
var->data.driver_location = var->data.location;
diff --git a/src/mesa/drivers/dri/i965/brw_shader.h b/src/mesa/drivers/dri/i965/brw_shader.h
index 6a2dfc9bbb6..29baebf0cc1 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.h
+++ b/src/mesa/drivers/dri/i965/brw_shader.h
@@ -277,6 +277,7 @@ bool brw_cs_precompile(struct gl_context *ctx,
int type_size_scalar(const struct glsl_type *type);
int type_size_vec4(const struct glsl_type *type);
+int type_size_vec4_times_4(const struct glsl_type *type);
bool is_scalar_shader_stage(const struct brw_compiler *compiler, int stage);
From e0b896c86c92c4dd02aea7fb5eb8eabe089b9e58 Mon Sep 17 00:00:00 2001
From: Julien Isorce
Date: Thu, 5 Nov 2015 08:24:44 +0000
Subject: [PATCH 067/287] st/va: indent vlVaQuerySurfaceAttributes and
vlVaCreateSurfaces2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Some lines were using 4 indentation spaces instead of 3.
Signed-off-by: Julien Isorce
Reviewed-by: Christian König
Reviewed-by: Emil Velikov
---
src/gallium/state_trackers/va/surface.c | 498 ++++++++++++------------
1 file changed, 249 insertions(+), 249 deletions(-)
diff --git a/src/gallium/state_trackers/va/surface.c b/src/gallium/state_trackers/va/surface.c
index 8f406e09990..59815aa4c5a 100644
--- a/src/gallium/state_trackers/va/surface.c
+++ b/src/gallium/state_trackers/va/surface.c
@@ -311,101 +311,101 @@ VAStatus
vlVaQuerySurfaceAttributes(VADriverContextP ctx, VAConfigID config,
VASurfaceAttrib *attrib_list, unsigned int *num_attribs)
{
- vlVaDriver *drv;
- VASurfaceAttrib *attribs;
- struct pipe_screen *pscreen;
- int i;
+ vlVaDriver *drv;
+ VASurfaceAttrib *attribs;
+ struct pipe_screen *pscreen;
+ int i;
- if (config == VA_INVALID_ID)
- return VA_STATUS_ERROR_INVALID_CONFIG;
+ if (config == VA_INVALID_ID)
+ return VA_STATUS_ERROR_INVALID_CONFIG;
- if (!attrib_list && !num_attribs)
- return VA_STATUS_ERROR_INVALID_PARAMETER;
+ if (!attrib_list && !num_attribs)
+ return VA_STATUS_ERROR_INVALID_PARAMETER;
- if (!attrib_list) {
- *num_attribs = VASurfaceAttribCount;
- return VA_STATUS_SUCCESS;
- }
+ if (!attrib_list) {
+ *num_attribs = VASurfaceAttribCount;
+ return VA_STATUS_SUCCESS;
+ }
- if (!ctx)
- return VA_STATUS_ERROR_INVALID_CONTEXT;
+ if (!ctx)
+ return VA_STATUS_ERROR_INVALID_CONTEXT;
- drv = VL_VA_DRIVER(ctx);
+ drv = VL_VA_DRIVER(ctx);
- if (!drv)
- return VA_STATUS_ERROR_INVALID_CONTEXT;
+ if (!drv)
+ return VA_STATUS_ERROR_INVALID_CONTEXT;
- pscreen = VL_VA_PSCREEN(ctx);
+ pscreen = VL_VA_PSCREEN(ctx);
- if (!pscreen)
- return VA_STATUS_ERROR_INVALID_CONTEXT;
+ if (!pscreen)
+ return VA_STATUS_ERROR_INVALID_CONTEXT;
- attribs = CALLOC(VASurfaceAttribCount, sizeof(VASurfaceAttrib));
+ attribs = CALLOC(VASurfaceAttribCount, sizeof(VASurfaceAttrib));
- if (!attribs)
- return VA_STATUS_ERROR_ALLOCATION_FAILED;
+ if (!attribs)
+ return VA_STATUS_ERROR_ALLOCATION_FAILED;
- i = 0;
+ i = 0;
- if (config == PIPE_VIDEO_PROFILE_UNKNOWN) {
- /* vlVaCreateConfig returns PIPE_VIDEO_PROFILE_UNKNOWN
+ if (config == PIPE_VIDEO_PROFILE_UNKNOWN) {
+ /* vlVaCreateConfig returns PIPE_VIDEO_PROFILE_UNKNOWN
only for VAEntrypointVideoProc. */
- attribs[i].type = VASurfaceAttribPixelFormat;
- attribs[i].value.type = VAGenericValueTypeInteger;
- attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE | VA_SURFACE_ATTRIB_SETTABLE;
- attribs[i].value.value.i = VA_FOURCC_BGRA;
- i++;
+ attribs[i].type = VASurfaceAttribPixelFormat;
+ attribs[i].value.type = VAGenericValueTypeInteger;
+ attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE | VA_SURFACE_ATTRIB_SETTABLE;
+ attribs[i].value.value.i = VA_FOURCC_BGRA;
+ i++;
- attribs[i].type = VASurfaceAttribPixelFormat;
- attribs[i].value.type = VAGenericValueTypeInteger;
- attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE | VA_SURFACE_ATTRIB_SETTABLE;
- attribs[i].value.value.i = VA_FOURCC_RGBA;
- i++;
- } else {
- /* Assume VAEntrypointVLD for now. */
- attribs[i].type = VASurfaceAttribPixelFormat;
- attribs[i].value.type = VAGenericValueTypeInteger;
- attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE | VA_SURFACE_ATTRIB_SETTABLE;
- attribs[i].value.value.i = VA_FOURCC_NV12;
- i++;
- }
+ attribs[i].type = VASurfaceAttribPixelFormat;
+ attribs[i].value.type = VAGenericValueTypeInteger;
+ attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE | VA_SURFACE_ATTRIB_SETTABLE;
+ attribs[i].value.value.i = VA_FOURCC_RGBA;
+ i++;
+ } else {
+ /* Assume VAEntrypointVLD for now. */
+ attribs[i].type = VASurfaceAttribPixelFormat;
+ attribs[i].value.type = VAGenericValueTypeInteger;
+ attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE | VA_SURFACE_ATTRIB_SETTABLE;
+ attribs[i].value.value.i = VA_FOURCC_NV12;
+ i++;
+ }
- attribs[i].type = VASurfaceAttribMemoryType;
- attribs[i].value.type = VAGenericValueTypeInteger;
- attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE | VA_SURFACE_ATTRIB_SETTABLE;
- attribs[i].value.value.i = VA_SURFACE_ATTRIB_MEM_TYPE_VA |
- VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME;
- i++;
+ attribs[i].type = VASurfaceAttribMemoryType;
+ attribs[i].value.type = VAGenericValueTypeInteger;
+ attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE | VA_SURFACE_ATTRIB_SETTABLE;
+ attribs[i].value.value.i = VA_SURFACE_ATTRIB_MEM_TYPE_VA |
+ VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME;
+ i++;
- attribs[i].type = VASurfaceAttribExternalBufferDescriptor;
- attribs[i].value.type = VAGenericValueTypePointer;
- attribs[i].flags = VA_SURFACE_ATTRIB_SETTABLE;
- attribs[i].value.value.p = NULL; /* ignore */
- i++;
+ attribs[i].type = VASurfaceAttribExternalBufferDescriptor;
+ attribs[i].value.type = VAGenericValueTypePointer;
+ attribs[i].flags = VA_SURFACE_ATTRIB_SETTABLE;
+ attribs[i].value.value.p = NULL; /* ignore */
+ i++;
- attribs[i].type = VASurfaceAttribMaxWidth;
- attribs[i].value.type = VAGenericValueTypeInteger;
- attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE;
- attribs[i].value.value.i = vl_video_buffer_max_size(pscreen);
- i++;
+ attribs[i].type = VASurfaceAttribMaxWidth;
+ attribs[i].value.type = VAGenericValueTypeInteger;
+ attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE;
+ attribs[i].value.value.i = vl_video_buffer_max_size(pscreen);
+ i++;
- attribs[i].type = VASurfaceAttribMaxHeight;
- attribs[i].value.type = VAGenericValueTypeInteger;
- attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE;
- attribs[i].value.value.i = vl_video_buffer_max_size(pscreen);
- i++;
+ attribs[i].type = VASurfaceAttribMaxHeight;
+ attribs[i].value.type = VAGenericValueTypeInteger;
+ attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE;
+ attribs[i].value.value.i = vl_video_buffer_max_size(pscreen);
+ i++;
- if (i > *num_attribs) {
- *num_attribs = i;
- FREE(attribs);
- return VA_STATUS_ERROR_MAX_NUM_EXCEEDED;
- }
+ if (i > *num_attribs) {
+ *num_attribs = i;
+ FREE(attribs);
+ return VA_STATUS_ERROR_MAX_NUM_EXCEEDED;
+ }
- *num_attribs = i;
- memcpy(attrib_list, attribs, i * sizeof(VASurfaceAttrib));
- FREE(attribs);
+ *num_attribs = i;
+ memcpy(attrib_list, attribs, i * sizeof(VASurfaceAttrib));
+ FREE(attribs);
- return VA_STATUS_SUCCESS;
+ return VA_STATUS_SUCCESS;
}
static VAStatus
@@ -414,75 +414,75 @@ suface_from_external_memory(VADriverContextP ctx, vlVaSurface *surface,
int index, VASurfaceID *surfaces,
struct pipe_video_buffer *templat)
{
- vlVaDriver *drv;
- struct pipe_screen *pscreen;
- struct pipe_resource *resource;
- struct pipe_resource res_templ;
- struct winsys_handle whandle;
- struct pipe_resource *resources[VL_NUM_COMPONENTS];
+ vlVaDriver *drv;
+ struct pipe_screen *pscreen;
+ struct pipe_resource *resource;
+ struct pipe_resource res_templ;
+ struct winsys_handle whandle;
+ struct pipe_resource *resources[VL_NUM_COMPONENTS];
- if (!ctx)
- return VA_STATUS_ERROR_INVALID_PARAMETER;
+ if (!ctx)
+ return VA_STATUS_ERROR_INVALID_PARAMETER;
- pscreen = VL_VA_PSCREEN(ctx);
- drv = VL_VA_DRIVER(ctx);
+ pscreen = VL_VA_PSCREEN(ctx);
+ drv = VL_VA_DRIVER(ctx);
- if (!memory_attibute || !memory_attibute->buffers ||
- index > memory_attibute->num_buffers)
- return VA_STATUS_ERROR_INVALID_PARAMETER;
+ if (!memory_attibute || !memory_attibute->buffers ||
+ index > memory_attibute->num_buffers)
+ return VA_STATUS_ERROR_INVALID_PARAMETER;
- if (surface->templat.width != memory_attibute->width ||
- surface->templat.height != memory_attibute->height ||
- memory_attibute->num_planes < 1)
- return VA_STATUS_ERROR_INVALID_PARAMETER;
+ if (surface->templat.width != memory_attibute->width ||
+ surface->templat.height != memory_attibute->height ||
+ memory_attibute->num_planes < 1)
+ return VA_STATUS_ERROR_INVALID_PARAMETER;
- switch (memory_attibute->pixel_format) {
- case VA_FOURCC_RGBA:
- case VA_FOURCC_RGBX:
- case VA_FOURCC_BGRA:
- case VA_FOURCC_BGRX:
- if (memory_attibute->num_planes != 1)
- return VA_STATUS_ERROR_INVALID_PARAMETER;
- break;
- default:
- return VA_STATUS_ERROR_INVALID_PARAMETER;
- }
+ switch (memory_attibute->pixel_format) {
+ case VA_FOURCC_RGBA:
+ case VA_FOURCC_RGBX:
+ case VA_FOURCC_BGRA:
+ case VA_FOURCC_BGRX:
+ if (memory_attibute->num_planes != 1)
+ return VA_STATUS_ERROR_INVALID_PARAMETER;
+ break;
+ default:
+ return VA_STATUS_ERROR_INVALID_PARAMETER;
+ }
- memset(&res_templ, 0, sizeof(res_templ));
- res_templ.target = PIPE_TEXTURE_2D;
- res_templ.last_level = 0;
- res_templ.depth0 = 1;
- res_templ.array_size = 1;
- res_templ.width0 = memory_attibute->width;
- res_templ.height0 = memory_attibute->height;
- res_templ.format = surface->templat.buffer_format;
- res_templ.bind = PIPE_BIND_SAMPLER_VIEW;
- res_templ.usage = PIPE_USAGE_DEFAULT;
+ memset(&res_templ, 0, sizeof(res_templ));
+ res_templ.target = PIPE_TEXTURE_2D;
+ res_templ.last_level = 0;
+ res_templ.depth0 = 1;
+ res_templ.array_size = 1;
+ res_templ.width0 = memory_attibute->width;
+ res_templ.height0 = memory_attibute->height;
+ res_templ.format = surface->templat.buffer_format;
+ res_templ.bind = PIPE_BIND_SAMPLER_VIEW;
+ res_templ.usage = PIPE_USAGE_DEFAULT;
- memset(&whandle, 0, sizeof(struct winsys_handle));
- whandle.type = DRM_API_HANDLE_TYPE_FD;
- whandle.handle = memory_attibute->buffers[index];
- whandle.stride = memory_attibute->pitches[index];
+ memset(&whandle, 0, sizeof(struct winsys_handle));
+ whandle.type = DRM_API_HANDLE_TYPE_FD;
+ whandle.handle = memory_attibute->buffers[index];
+ whandle.stride = memory_attibute->pitches[index];
- resource = pscreen->resource_from_handle(pscreen, &res_templ, &whandle);
+ resource = pscreen->resource_from_handle(pscreen, &res_templ, &whandle);
- if (!resource)
- return VA_STATUS_ERROR_ALLOCATION_FAILED;
-
- memset(resources, 0, sizeof resources);
- resources[0] = resource;
-
- surface->buffer = vl_video_buffer_create_ex2(drv->pipe, templat, resources);
- if (!surface->buffer)
- return VA_STATUS_ERROR_ALLOCATION_FAILED;
-
- util_dynarray_init(&surface->subpics);
- surfaces[index] = handle_table_add(drv->htab, surface);
-
- if (!surfaces[index])
+ if (!resource)
return VA_STATUS_ERROR_ALLOCATION_FAILED;
- return VA_STATUS_SUCCESS;
+ memset(resources, 0, sizeof resources);
+ resources[0] = resource;
+
+ surface->buffer = vl_video_buffer_create_ex2(drv->pipe, templat, resources);
+ if (!surface->buffer)
+ return VA_STATUS_ERROR_ALLOCATION_FAILED;
+
+ util_dynarray_init(&surface->subpics);
+ surfaces[index] = handle_table_add(drv->htab, surface);
+
+ if (!surfaces[index])
+ return VA_STATUS_ERROR_ALLOCATION_FAILED;
+
+ return VA_STATUS_SUCCESS;
}
VAStatus
@@ -491,143 +491,143 @@ vlVaCreateSurfaces2(VADriverContextP ctx, unsigned int format,
VASurfaceID *surfaces, unsigned int num_surfaces,
VASurfaceAttrib *attrib_list, unsigned int num_attribs)
{
- vlVaDriver *drv;
- VASurfaceAttribExternalBuffers *memory_attibute;
- struct pipe_video_buffer templat;
- struct pipe_screen *pscreen;
- int i;
- int memory_type;
- int expected_fourcc;
- VAStatus vaStatus;
+ vlVaDriver *drv;
+ VASurfaceAttribExternalBuffers *memory_attibute;
+ struct pipe_video_buffer templat;
+ struct pipe_screen *pscreen;
+ int i;
+ int memory_type;
+ int expected_fourcc;
+ VAStatus vaStatus;
- if (!ctx)
- return VA_STATUS_ERROR_INVALID_CONTEXT;
+ if (!ctx)
+ return VA_STATUS_ERROR_INVALID_CONTEXT;
- if (!(width && height))
- return VA_STATUS_ERROR_INVALID_IMAGE_FORMAT;
+ if (!(width && height))
+ return VA_STATUS_ERROR_INVALID_IMAGE_FORMAT;
- drv = VL_VA_DRIVER(ctx);
+ drv = VL_VA_DRIVER(ctx);
- if (!drv)
- return VA_STATUS_ERROR_INVALID_CONTEXT;
+ if (!drv)
+ return VA_STATUS_ERROR_INVALID_CONTEXT;
- pscreen = VL_VA_PSCREEN(ctx);
+ pscreen = VL_VA_PSCREEN(ctx);
- if (!pscreen)
- return VA_STATUS_ERROR_INVALID_CONTEXT;
+ if (!pscreen)
+ return VA_STATUS_ERROR_INVALID_CONTEXT;
- /* Default. */
- memory_attibute = NULL;
- memory_type = VA_SURFACE_ATTRIB_MEM_TYPE_VA;
- expected_fourcc = 0;
+ /* Default. */
+ memory_attibute = NULL;
+ memory_type = VA_SURFACE_ATTRIB_MEM_TYPE_VA;
+ expected_fourcc = 0;
- for (i = 0; i < num_attribs && attrib_list; i++) {
- if ((attrib_list[i].type == VASurfaceAttribPixelFormat) &&
- (attrib_list[i].flags & VA_SURFACE_ATTRIB_SETTABLE)) {
- if (attrib_list[i].value.type != VAGenericValueTypeInteger)
- return VA_STATUS_ERROR_INVALID_PARAMETER;
- expected_fourcc = attrib_list[i].value.value.i;
- }
+ for (i = 0; i < num_attribs && attrib_list; i++) {
+ if ((attrib_list[i].type == VASurfaceAttribPixelFormat) &&
+ (attrib_list[i].flags & VA_SURFACE_ATTRIB_SETTABLE)) {
+ if (attrib_list[i].value.type != VAGenericValueTypeInteger)
+ return VA_STATUS_ERROR_INVALID_PARAMETER;
+ expected_fourcc = attrib_list[i].value.value.i;
+ }
- if ((attrib_list[i].type == VASurfaceAttribMemoryType) &&
- (attrib_list[i].flags & VA_SURFACE_ATTRIB_SETTABLE)) {
+ if ((attrib_list[i].type == VASurfaceAttribMemoryType) &&
+ (attrib_list[i].flags & VA_SURFACE_ATTRIB_SETTABLE)) {
- if (attrib_list[i].value.type != VAGenericValueTypeInteger)
- return VA_STATUS_ERROR_INVALID_PARAMETER;
+ if (attrib_list[i].value.type != VAGenericValueTypeInteger)
+ return VA_STATUS_ERROR_INVALID_PARAMETER;
- switch (attrib_list[i].value.value.i) {
- case VA_SURFACE_ATTRIB_MEM_TYPE_VA:
- case VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME:
- memory_type = attrib_list[i].value.value.i;
- break;
- default:
- return VA_STATUS_ERROR_UNSUPPORTED_MEMORY_TYPE;
- }
- }
-
- if ((attrib_list[i].type == VASurfaceAttribExternalBufferDescriptor) &&
- (attrib_list[i].flags == VA_SURFACE_ATTRIB_SETTABLE)) {
- if (attrib_list[i].value.type != VAGenericValueTypePointer)
- return VA_STATUS_ERROR_INVALID_PARAMETER;
- memory_attibute = (VASurfaceAttribExternalBuffers *)attrib_list[i].value.value.p;
- }
- }
-
- if (VA_RT_FORMAT_YUV420 != format &&
- VA_RT_FORMAT_YUV422 != format &&
- VA_RT_FORMAT_YUV444 != format &&
- VA_RT_FORMAT_RGB32 != format) {
- return VA_STATUS_ERROR_UNSUPPORTED_RT_FORMAT;
- }
-
- switch (memory_type) {
- case VA_SURFACE_ATTRIB_MEM_TYPE_VA:
+ switch (attrib_list[i].value.value.i) {
+ case VA_SURFACE_ATTRIB_MEM_TYPE_VA:
+ case VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME:
+ memory_type = attrib_list[i].value.value.i;
break;
- case VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME:
- if (!memory_attibute)
- return VA_STATUS_ERROR_INVALID_PARAMETER;
+ default:
+ return VA_STATUS_ERROR_UNSUPPORTED_MEMORY_TYPE;
+ }
+ }
- expected_fourcc = memory_attibute->pixel_format;
- break;
- default:
- assert(0);
- }
+ if ((attrib_list[i].type == VASurfaceAttribExternalBufferDescriptor) &&
+ (attrib_list[i].flags == VA_SURFACE_ATTRIB_SETTABLE)) {
+ if (attrib_list[i].value.type != VAGenericValueTypePointer)
+ return VA_STATUS_ERROR_INVALID_PARAMETER;
+ memory_attibute = (VASurfaceAttribExternalBuffers *)attrib_list[i].value.value.p;
+ }
+ }
- memset(&templat, 0, sizeof(templat));
+ if (VA_RT_FORMAT_YUV420 != format &&
+ VA_RT_FORMAT_YUV422 != format &&
+ VA_RT_FORMAT_YUV444 != format &&
+ VA_RT_FORMAT_RGB32 != format) {
+ return VA_STATUS_ERROR_UNSUPPORTED_RT_FORMAT;
+ }
- if (expected_fourcc) {
- templat.buffer_format = VaFourccToPipeFormat(expected_fourcc);
- templat.interlaced = 0;
- } else {
- templat.buffer_format = pscreen->get_video_param
- (
- pscreen,
- PIPE_VIDEO_PROFILE_UNKNOWN,
- PIPE_VIDEO_ENTRYPOINT_BITSTREAM,
- PIPE_VIDEO_CAP_PREFERED_FORMAT
- );
- templat.interlaced = pscreen->get_video_param
- (
- pscreen,
- PIPE_VIDEO_PROFILE_UNKNOWN,
- PIPE_VIDEO_ENTRYPOINT_BITSTREAM,
- PIPE_VIDEO_CAP_PREFERS_INTERLACED
- );
- }
+ switch (memory_type) {
+ case VA_SURFACE_ATTRIB_MEM_TYPE_VA:
+ break;
+ case VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME:
+ if (!memory_attibute)
+ return VA_STATUS_ERROR_INVALID_PARAMETER;
- templat.chroma_format = ChromaToPipe(format);
+ expected_fourcc = memory_attibute->pixel_format;
+ break;
+ default:
+ assert(0);
+ }
- templat.width = width;
- templat.height = height;
+ memset(&templat, 0, sizeof(templat));
- memset(surfaces, VA_INVALID_ID, num_surfaces * sizeof(VASurfaceID));
+ if (expected_fourcc) {
+ templat.buffer_format = VaFourccToPipeFormat(expected_fourcc);
+ templat.interlaced = 0;
+ } else {
+ templat.buffer_format = pscreen->get_video_param
+ (
+ pscreen,
+ PIPE_VIDEO_PROFILE_UNKNOWN,
+ PIPE_VIDEO_ENTRYPOINT_BITSTREAM,
+ PIPE_VIDEO_CAP_PREFERED_FORMAT
+ );
+ templat.interlaced = pscreen->get_video_param
+ (
+ pscreen,
+ PIPE_VIDEO_PROFILE_UNKNOWN,
+ PIPE_VIDEO_ENTRYPOINT_BITSTREAM,
+ PIPE_VIDEO_CAP_PREFERS_INTERLACED
+ );
+ }
- for (i = 0; i < num_surfaces; i++) {
- vlVaSurface *surf = CALLOC(1, sizeof(vlVaSurface));
- if (!surf)
+ templat.chroma_format = ChromaToPipe(format);
+
+ templat.width = width;
+ templat.height = height;
+
+ memset(surfaces, VA_INVALID_ID, num_surfaces * sizeof(VASurfaceID));
+
+ for (i = 0; i < num_surfaces; i++) {
+ vlVaSurface *surf = CALLOC(1, sizeof(vlVaSurface));
+ if (!surf)
+ goto no_res;
+
+ surf->templat = templat;
+
+ switch (memory_type) {
+ case VA_SURFACE_ATTRIB_MEM_TYPE_VA:
+ surf->buffer = drv->pipe->create_video_buffer(drv->pipe, &templat);
+ if (!surf->buffer)
goto no_res;
+ util_dynarray_init(&surf->subpics);
+ surfaces[i] = handle_table_add(drv->htab, surf);
+ break;
+ case VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME:
+ vaStatus = suface_from_external_memory(ctx, surf, memory_attibute, i, surfaces, &templat);
+ if (vaStatus != VA_STATUS_SUCCESS)
+ goto no_res;
+ break;
+ default:
+ assert(0);
+ }
+ }
- surf->templat = templat;
-
- switch (memory_type) {
- case VA_SURFACE_ATTRIB_MEM_TYPE_VA:
- surf->buffer = drv->pipe->create_video_buffer(drv->pipe, &templat);
- if (!surf->buffer)
- goto no_res;
- util_dynarray_init(&surf->subpics);
- surfaces[i] = handle_table_add(drv->htab, surf);
- break;
- case VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME:
- vaStatus = suface_from_external_memory(ctx, surf, memory_attibute, i, surfaces, &templat);
- if (vaStatus != VA_STATUS_SUCCESS)
- goto no_res;
- break;
- default:
- assert(0);
- }
- }
-
- return VA_STATUS_SUCCESS;
+ return VA_STATUS_SUCCESS;
no_res:
if (i)
@@ -707,7 +707,7 @@ vlVaQueryVideoProcPipelineCaps(VADriverContextP ctx, VAContextID context,
return VA_STATUS_ERROR_INVALID_CONTEXT;
if (!pipeline_cap)
- return VA_STATUS_ERROR_INVALID_PARAMETER;
+ return VA_STATUS_ERROR_INVALID_PARAMETER;
if (num_filters && !filters)
return VA_STATUS_ERROR_INVALID_PARAMETER;
From 497bde6727260e7719c680dc483b10c0751a3fcd Mon Sep 17 00:00:00 2001
From: Julien Isorce
Date: Thu, 5 Nov 2015 08:24:45 +0000
Subject: [PATCH 068/287] st/va: fix memory leak on error in
vlVaCreateSurfaces2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Found by Coverity: CID #1337953
Signed-off-by: Julien Isorce
Reviewed-by: Christian König
Reviewed-by: Emil Velikov
---
src/gallium/state_trackers/va/surface.c | 12 +++++++++---
1 file changed, 9 insertions(+), 3 deletions(-)
diff --git a/src/gallium/state_trackers/va/surface.c b/src/gallium/state_trackers/va/surface.c
index 59815aa4c5a..3db21c3de39 100644
--- a/src/gallium/state_trackers/va/surface.c
+++ b/src/gallium/state_trackers/va/surface.c
@@ -479,8 +479,10 @@ suface_from_external_memory(VADriverContextP ctx, vlVaSurface *surface,
util_dynarray_init(&surface->subpics);
surfaces[index] = handle_table_add(drv->htab, surface);
- if (!surfaces[index])
+ if (!surfaces[index]) {
+ surface->buffer->destroy(surface->buffer);
return VA_STATUS_ERROR_ALLOCATION_FAILED;
+ }
return VA_STATUS_SUCCESS;
}
@@ -612,15 +614,19 @@ vlVaCreateSurfaces2(VADriverContextP ctx, unsigned int format,
switch (memory_type) {
case VA_SURFACE_ATTRIB_MEM_TYPE_VA:
surf->buffer = drv->pipe->create_video_buffer(drv->pipe, &templat);
- if (!surf->buffer)
+ if (!surf->buffer) {
+ FREE(surf);
goto no_res;
+ }
util_dynarray_init(&surf->subpics);
surfaces[i] = handle_table_add(drv->htab, surf);
break;
case VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME:
vaStatus = suface_from_external_memory(ctx, surf, memory_attibute, i, surfaces, &templat);
- if (vaStatus != VA_STATUS_SUCCESS)
+ if (vaStatus != VA_STATUS_SUCCESS) {
+ FREE(surf);
goto no_res;
+ }
break;
default:
assert(0);
From 581111c4d67c65305dcae83789ac504deeec9da2 Mon Sep 17 00:00:00 2001
From: Brian Paul
Date: Thu, 5 Nov 2015 19:03:39 -0700
Subject: [PATCH 069/287] mesa: report enum name in glClientActiveTexture()
error string
As we do for glActiveTexture(). Trivial.
---
src/mesa/main/texstate.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/src/mesa/main/texstate.c b/src/mesa/main/texstate.c
index cb147fac476..9d88554d945 100644
--- a/src/mesa/main/texstate.c
+++ b/src/mesa/main/texstate.c
@@ -330,7 +330,8 @@ _mesa_ClientActiveTexture(GLenum texture)
return;
if (texUnit >= ctx->Const.MaxTextureCoordUnits) {
- _mesa_error(ctx, GL_INVALID_ENUM, "glClientActiveTexture(texture)");
+ _mesa_error(ctx, GL_INVALID_ENUM, "glClientActiveTexture(texture=%s)",
+ _mesa_enum_to_string(texture));
return;
}
From d68226087cf5f2f686d6c8f3377c5a1dec3d8bc4 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin
Date: Thu, 5 Nov 2015 23:12:52 -0500
Subject: [PATCH 070/287] nvc0: reintroduce BGRA4 format support
Commit 342e68dc60 (nvc0: remove BGRA4 format support) removed the
support to fix a WoW trace. However, after further experimentation, I was
able to get the blit to work by using a different "fake" format in the
2d engine.
The reason why this worked on nv50 is that nv50 falls back to the 3d
blit path in case either the src or the dst aren't "faithfully"
supported, while nvc0 only does it for the dst format. RG8 is better
supported by the nvc0 2d engine than R16.
Signed-off-by: Ilia Mirkin
---
src/gallium/drivers/nouveau/nv50/nv50_formats.c | 2 --
src/gallium/drivers/nouveau/nvc0/nvc0_surface.c | 2 +-
2 files changed, 1 insertion(+), 3 deletions(-)
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_formats.c b/src/gallium/drivers/nouveau/nv50/nv50_formats.c
index 80f92be682d..49a93bf1d91 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_formats.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_formats.c
@@ -203,10 +203,8 @@ const struct nv50_format nv50_format_table[PIPE_FORMAT_COUNT] =
F3B(B5G6R5_UNORM, B5G6R5_UNORM, C2, C1, C0, xx, UNORM, 5_6_5, TD),
C4B(B5G5R5A1_UNORM, BGR5_A1_UNORM, C2, C1, C0, C3, UNORM, 5_5_5_1, TD),
F3B(B5G5R5X1_UNORM, BGR5_X1_UNORM, C2, C1, C0, xx, UNORM, 5_5_5_1, TD),
-#if NOUVEAU_DRIVER != 0xc0
C4B(B4G4R4A4_UNORM, NONE, C2, C1, C0, C3, UNORM, 4_4_4_4, T),
F3B(B4G4R4X4_UNORM, NONE, C2, C1, C0, xx, UNORM, 4_4_4_4, T),
-#endif
F3B(R9G9B9E5_FLOAT, NONE, C0, C1, C2, xx, FLOAT, 9_9_9_E5, T),
C4A(R10G10B10A2_UNORM, RGB10_A2_UNORM, C0, C1, C2, C3, UNORM, 10_10_10_2,
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
index be123349148..5f47bad22f3 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
@@ -67,7 +67,7 @@ nvc0_2d_format(enum pipe_format format, bool dst, bool dst_src_equal)
case 1:
return NV50_SURFACE_FORMAT_R8_UNORM;
case 2:
- return NV50_SURFACE_FORMAT_R16_UNORM;
+ return NV50_SURFACE_FORMAT_RG8_UNORM;
case 4:
return NV50_SURFACE_FORMAT_BGRA8_UNORM;
case 8:
From 99597d033a62bdfa31148714f4d2c40f84655a5a Mon Sep 17 00:00:00 2001
From: Rob Clark
Date: Wed, 21 Oct 2015 10:57:15 -0400
Subject: [PATCH 071/287] nir: some small cleanups
The various cf nodes all get allocated w/ shader as their ralloc_parent,
so let's make this more explicit. Plus a couple of other corrections/
clarifications.
Signed-off-by: Rob Clark
Reviewed-by: Jason Ekstrand
---
src/glsl/nir/nir.c | 18 +++++++++---------
src/glsl/nir/nir.h | 10 +++++-----
2 files changed, 14 insertions(+), 14 deletions(-)
diff --git a/src/glsl/nir/nir.c b/src/glsl/nir/nir.c
index 5f03095d673..bb7a5fa5835 100644
--- a/src/glsl/nir/nir.c
+++ b/src/glsl/nir/nir.c
@@ -302,9 +302,9 @@ nir_function_impl_create(nir_function_overload *overload)
}
nir_block *
-nir_block_create(void *mem_ctx)
+nir_block_create(nir_shader *shader)
{
- nir_block *block = ralloc(mem_ctx, nir_block);
+ nir_block *block = ralloc(shader, nir_block);
cf_init(&block->cf_node, nir_cf_node_block);
@@ -330,19 +330,19 @@ src_init(nir_src *src)
}
nir_if *
-nir_if_create(void *mem_ctx)
+nir_if_create(nir_shader *shader)
{
- nir_if *if_stmt = ralloc(mem_ctx, nir_if);
+ nir_if *if_stmt = ralloc(shader, nir_if);
cf_init(&if_stmt->cf_node, nir_cf_node_if);
src_init(&if_stmt->condition);
- nir_block *then = nir_block_create(mem_ctx);
+ nir_block *then = nir_block_create(shader);
exec_list_make_empty(&if_stmt->then_list);
exec_list_push_tail(&if_stmt->then_list, &then->cf_node.node);
then->cf_node.parent = &if_stmt->cf_node;
- nir_block *else_stmt = nir_block_create(mem_ctx);
+ nir_block *else_stmt = nir_block_create(shader);
exec_list_make_empty(&if_stmt->else_list);
exec_list_push_tail(&if_stmt->else_list, &else_stmt->cf_node.node);
else_stmt->cf_node.parent = &if_stmt->cf_node;
@@ -351,13 +351,13 @@ nir_if_create(void *mem_ctx)
}
nir_loop *
-nir_loop_create(void *mem_ctx)
+nir_loop_create(nir_shader *shader)
{
- nir_loop *loop = ralloc(mem_ctx, nir_loop);
+ nir_loop *loop = ralloc(shader, nir_loop);
cf_init(&loop->cf_node, nir_cf_node_loop);
- nir_block *body = nir_block_create(mem_ctx);
+ nir_block *body = nir_block_create(shader);
exec_list_make_empty(&loop->body);
exec_list_push_tail(&loop->body, &body->cf_node.node);
body->cf_node.parent = &loop->cf_node;
diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h
index f8de40d0d13..ef39df5dc51 100644
--- a/src/glsl/nir/nir.h
+++ b/src/glsl/nir/nir.h
@@ -394,10 +394,10 @@ typedef struct {
*/
bool is_packed;
- /** set of nir_instr's where this register is used (read from) */
+ /** set of nir_src's where this register is used (read from) */
struct list_head uses;
- /** set of nir_instr's where this register is defined (written to) */
+ /** set of nir_dest's where this register is defined (written to) */
struct list_head defs;
/** set of nir_if's where this register is used as a condition */
@@ -1621,9 +1621,9 @@ nir_function_overload *nir_function_overload_create(nir_function *func);
nir_function_impl *nir_function_impl_create(nir_function_overload *func);
-nir_block *nir_block_create(void *mem_ctx);
-nir_if *nir_if_create(void *mem_ctx);
-nir_loop *nir_loop_create(void *mem_ctx);
+nir_block *nir_block_create(nir_shader *shader);
+nir_if *nir_if_create(nir_shader *shader);
+nir_loop *nir_loop_create(nir_shader *shader);
nir_function_impl *nir_cf_node_get_function(nir_cf_node *node);
From 8f55ebe802ea930d14eef9cd622aeb9a8d989e01 Mon Sep 17 00:00:00 2001
From: Boyan Ding
Date: Fri, 16 Oct 2015 15:15:38 +0800
Subject: [PATCH 072/287] freedreno/ir3: Use nir_foreach_variable
Signed-off-by: Boyan Ding
Signed-off-by: Rob Clark
---
src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 8c9234b3847..157dc73a3c6 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -2325,17 +2325,17 @@ emit_instructions(struct ir3_compile *ctx)
}
/* Setup inputs: */
- foreach_list_typed(nir_variable, var, node, &ctx->s->inputs) {
+ nir_foreach_variable(var, &ctx->s->inputs) {
setup_input(ctx, var);
}
/* Setup outputs: */
- foreach_list_typed(nir_variable, var, node, &ctx->s->outputs) {
+ nir_foreach_variable(var, &ctx->s->outputs) {
setup_output(ctx, var);
}
/* Setup variables (which should only be arrays): */
- foreach_list_typed(nir_variable, var, node, &ctx->s->globals) {
+ nir_foreach_variable(var, &ctx->s->globals) {
declare_var(ctx, var);
}
From 6f5e0c08a477c6872e8be6d1b09aea97db7fe125 Mon Sep 17 00:00:00 2001
From: Guillaume Charifi
Date: Fri, 6 Nov 2015 11:17:25 -0500
Subject: [PATCH 073/287] freedreno: add a305 support
Signed-off-by: Rob Clark
---
src/gallium/drivers/freedreno/freedreno_screen.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index 9f8c33263fb..7ee1a3fa9cf 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -549,6 +549,7 @@ fd_screen_create(struct fd_device *dev)
case 220:
fd2_screen_init(pscreen);
break;
+ case 305:
case 307:
case 320:
case 330:
From 7465e161248b94d0bd1cdae6fc4c501ecfcf9b0b Mon Sep 17 00:00:00 2001
From: Rob Clark
Date: Tue, 27 Oct 2015 11:33:32 -0400
Subject: [PATCH 074/287] freedreno: update generated headers
Signed-off-by: Rob Clark
---
src/gallium/drivers/freedreno/a2xx/a2xx.xml.h | 5 +-
src/gallium/drivers/freedreno/a3xx/a3xx.xml.h | 5 +-
src/gallium/drivers/freedreno/a4xx/a4xx.xml.h | 65 ++++++++++++-------
src/gallium/drivers/freedreno/a4xx/fd4_emit.c | 12 ----
.../drivers/freedreno/adreno_common.xml.h | 5 +-
.../drivers/freedreno/adreno_pm4.xml.h | 5 +-
6 files changed, 54 insertions(+), 43 deletions(-)
diff --git a/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h b/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
index 2853787a340..ef235734755 100644
--- a/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
+++ b/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
@@ -8,13 +8,14 @@ http://github.com/freedreno/envytools/
git clone https://github.com/freedreno/envytools.git
The rules-ng-ng source files this header was generated from are:
-- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 364 bytes, from 2015-05-20 20:03:07)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 398 bytes, from 2015-09-24 17:25:31)
- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1453 bytes, from 2015-05-20 20:03:07)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10755 bytes, from 2015-09-14 20:46:55)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 67771 bytes, from 2015-09-14 20:46:55)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63970 bytes, from 2015-09-14 20:50:12)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63914 bytes, from 2015-10-27 17:13:16)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml ( 1773 bytes, from 2015-09-24 17:30:00)
Copyright (C) 2013-2015 by the following authors:
- Rob Clark (robclark)
diff --git a/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h b/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
index 4bbcb33614c..b5e1ddadde0 100644
--- a/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
+++ b/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
@@ -8,13 +8,14 @@ http://github.com/freedreno/envytools/
git clone https://github.com/freedreno/envytools.git
The rules-ng-ng source files this header was generated from are:
-- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 364 bytes, from 2015-05-20 20:03:07)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 398 bytes, from 2015-09-24 17:25:31)
- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1453 bytes, from 2015-05-20 20:03:07)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10755 bytes, from 2015-09-14 20:46:55)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 67771 bytes, from 2015-09-14 20:46:55)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63970 bytes, from 2015-09-14 20:50:12)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63914 bytes, from 2015-10-27 17:13:16)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml ( 1773 bytes, from 2015-09-24 17:30:00)
Copyright (C) 2013-2015 by the following authors:
- Rob Clark (robclark)
diff --git a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
index 819f5b14a17..9f970365464 100644
--- a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
+++ b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
@@ -8,13 +8,14 @@ http://github.com/freedreno/envytools/
git clone https://github.com/freedreno/envytools.git
The rules-ng-ng source files this header was generated from are:
-- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 364 bytes, from 2015-05-20 20:03:07)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 398 bytes, from 2015-09-24 17:25:31)
- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1453 bytes, from 2015-05-20 20:03:07)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10755 bytes, from 2015-09-14 20:46:55)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 67771 bytes, from 2015-09-14 20:46:55)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63970 bytes, from 2015-09-14 20:50:12)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63914 bytes, from 2015-10-27 17:13:16)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml ( 1773 bytes, from 2015-09-24 17:30:00)
Copyright (C) 2013-2015 by the following authors:
- Rob Clark (robclark)
@@ -489,8 +490,8 @@ static inline uint32_t A4XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR(enum adreno_r
return ((val) << A4XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR__SHIFT) & A4XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR__MASK;
}
-#define REG_A4XX_RB_BLEND_RED 0x000020f3
-#define A4XX_RB_BLEND_RED_UINT__MASK 0x00007fff
+#define REG_A4XX_RB_BLEND_RED 0x000020f0
+#define A4XX_RB_BLEND_RED_UINT__MASK 0x0000ffff
#define A4XX_RB_BLEND_RED_UINT__SHIFT 0
static inline uint32_t A4XX_RB_BLEND_RED_UINT(uint32_t val)
{
@@ -503,8 +504,16 @@ static inline uint32_t A4XX_RB_BLEND_RED_FLOAT(float val)
return ((util_float_to_half(val)) << A4XX_RB_BLEND_RED_FLOAT__SHIFT) & A4XX_RB_BLEND_RED_FLOAT__MASK;
}
-#define REG_A4XX_RB_BLEND_GREEN 0x000020f4
-#define A4XX_RB_BLEND_GREEN_UINT__MASK 0x00007fff
+#define REG_A4XX_RB_BLEND_RED_F32 0x000020f1
+#define A4XX_RB_BLEND_RED_F32__MASK 0xffffffff
+#define A4XX_RB_BLEND_RED_F32__SHIFT 0
+static inline uint32_t A4XX_RB_BLEND_RED_F32(float val)
+{
+ return ((fui(val)) << A4XX_RB_BLEND_RED_F32__SHIFT) & A4XX_RB_BLEND_RED_F32__MASK;
+}
+
+#define REG_A4XX_RB_BLEND_GREEN 0x000020f2
+#define A4XX_RB_BLEND_GREEN_UINT__MASK 0x0000ffff
#define A4XX_RB_BLEND_GREEN_UINT__SHIFT 0
static inline uint32_t A4XX_RB_BLEND_GREEN_UINT(uint32_t val)
{
@@ -517,8 +526,16 @@ static inline uint32_t A4XX_RB_BLEND_GREEN_FLOAT(float val)
return ((util_float_to_half(val)) << A4XX_RB_BLEND_GREEN_FLOAT__SHIFT) & A4XX_RB_BLEND_GREEN_FLOAT__MASK;
}
-#define REG_A4XX_RB_BLEND_BLUE 0x000020f5
-#define A4XX_RB_BLEND_BLUE_UINT__MASK 0x00007fff
+#define REG_A4XX_RB_BLEND_GREEN_F32 0x000020f3
+#define A4XX_RB_BLEND_GREEN_F32__MASK 0xffffffff
+#define A4XX_RB_BLEND_GREEN_F32__SHIFT 0
+static inline uint32_t A4XX_RB_BLEND_GREEN_F32(float val)
+{
+ return ((fui(val)) << A4XX_RB_BLEND_GREEN_F32__SHIFT) & A4XX_RB_BLEND_GREEN_F32__MASK;
+}
+
+#define REG_A4XX_RB_BLEND_BLUE 0x000020f4
+#define A4XX_RB_BLEND_BLUE_UINT__MASK 0x0000ffff
#define A4XX_RB_BLEND_BLUE_UINT__SHIFT 0
static inline uint32_t A4XX_RB_BLEND_BLUE_UINT(uint32_t val)
{
@@ -531,8 +548,16 @@ static inline uint32_t A4XX_RB_BLEND_BLUE_FLOAT(float val)
return ((util_float_to_half(val)) << A4XX_RB_BLEND_BLUE_FLOAT__SHIFT) & A4XX_RB_BLEND_BLUE_FLOAT__MASK;
}
+#define REG_A4XX_RB_BLEND_BLUE_F32 0x000020f5
+#define A4XX_RB_BLEND_BLUE_F32__MASK 0xffffffff
+#define A4XX_RB_BLEND_BLUE_F32__SHIFT 0
+static inline uint32_t A4XX_RB_BLEND_BLUE_F32(float val)
+{
+ return ((fui(val)) << A4XX_RB_BLEND_BLUE_F32__SHIFT) & A4XX_RB_BLEND_BLUE_F32__MASK;
+}
+
#define REG_A4XX_RB_BLEND_ALPHA 0x000020f6
-#define A4XX_RB_BLEND_ALPHA_UINT__MASK 0x00007fff
+#define A4XX_RB_BLEND_ALPHA_UINT__MASK 0x0000ffff
#define A4XX_RB_BLEND_ALPHA_UINT__SHIFT 0
static inline uint32_t A4XX_RB_BLEND_ALPHA_UINT(uint32_t val)
{
@@ -545,6 +570,14 @@ static inline uint32_t A4XX_RB_BLEND_ALPHA_FLOAT(float val)
return ((util_float_to_half(val)) << A4XX_RB_BLEND_ALPHA_FLOAT__SHIFT) & A4XX_RB_BLEND_ALPHA_FLOAT__MASK;
}
+#define REG_A4XX_RB_BLEND_ALPHA_F32 0x000020f7
+#define A4XX_RB_BLEND_ALPHA_F32__MASK 0xffffffff
+#define A4XX_RB_BLEND_ALPHA_F32__SHIFT 0
+static inline uint32_t A4XX_RB_BLEND_ALPHA_F32(float val)
+{
+ return ((fui(val)) << A4XX_RB_BLEND_ALPHA_F32__SHIFT) & A4XX_RB_BLEND_ALPHA_F32__MASK;
+}
+
#define REG_A4XX_RB_ALPHA_CONTROL 0x000020f8
#define A4XX_RB_ALPHA_CONTROL_ALPHA_REF__MASK 0x000000ff
#define A4XX_RB_ALPHA_CONTROL_ALPHA_REF__SHIFT 0
@@ -2645,20 +2678,6 @@ static inline uint32_t A4XX_PC_HS_PARAM_PRIMTYPE(enum adreno_pa_su_sc_draw val)
#define REG_A4XX_UNKNOWN_20EF 0x000020ef
-#define REG_A4XX_UNKNOWN_20F0 0x000020f0
-
-#define REG_A4XX_UNKNOWN_20F1 0x000020f1
-
-#define REG_A4XX_UNKNOWN_20F2 0x000020f2
-
-#define REG_A4XX_UNKNOWN_20F7 0x000020f7
-#define A4XX_UNKNOWN_20F7__MASK 0xffffffff
-#define A4XX_UNKNOWN_20F7__SHIFT 0
-static inline uint32_t A4XX_UNKNOWN_20F7(float val)
-{
- return ((fui(val)) << A4XX_UNKNOWN_20F7__SHIFT) & A4XX_UNKNOWN_20F7__MASK;
-}
-
#define REG_A4XX_UNKNOWN_2152 0x00002152
#define REG_A4XX_UNKNOWN_2153 0x00002153
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
index cf5dd7b0f17..848a82fee7e 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
@@ -699,15 +699,6 @@ fd4_emit_restore(struct fd_context *ctx)
OUT_PKT0(ring, REG_A4XX_UNKNOWN_20EF, 1);
OUT_RING(ring, 0x00000000);
- OUT_PKT0(ring, REG_A4XX_UNKNOWN_20F0, 1);
- OUT_RING(ring, 0x00000000);
-
- OUT_PKT0(ring, REG_A4XX_UNKNOWN_20F1, 1);
- OUT_RING(ring, 0x00000000);
-
- OUT_PKT0(ring, REG_A4XX_UNKNOWN_20F2, 1);
- OUT_RING(ring, 0x00000000);
-
OUT_PKT0(ring, REG_A4XX_RB_BLEND_RED, 4);
OUT_RING(ring, A4XX_RB_BLEND_RED_UINT(0) |
A4XX_RB_BLEND_RED_FLOAT(0.0));
@@ -718,9 +709,6 @@ fd4_emit_restore(struct fd_context *ctx)
OUT_RING(ring, A4XX_RB_BLEND_ALPHA_UINT(0x7fff) |
A4XX_RB_BLEND_ALPHA_FLOAT(1.0));
- OUT_PKT0(ring, REG_A4XX_UNKNOWN_20F7, 1);
- OUT_RING(ring, 0x3f800000);
-
OUT_PKT0(ring, REG_A4XX_UNKNOWN_2152, 1);
OUT_RING(ring, 0x00000000);
diff --git a/src/gallium/drivers/freedreno/adreno_common.xml.h b/src/gallium/drivers/freedreno/adreno_common.xml.h
index 906368c0efa..ca3d2ac3fca 100644
--- a/src/gallium/drivers/freedreno/adreno_common.xml.h
+++ b/src/gallium/drivers/freedreno/adreno_common.xml.h
@@ -8,13 +8,14 @@ http://github.com/freedreno/envytools/
git clone https://github.com/freedreno/envytools.git
The rules-ng-ng source files this header was generated from are:
-- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 364 bytes, from 2015-05-20 20:03:07)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 398 bytes, from 2015-09-24 17:25:31)
- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1453 bytes, from 2015-05-20 20:03:07)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10755 bytes, from 2015-09-14 20:46:55)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 67771 bytes, from 2015-09-14 20:46:55)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63970 bytes, from 2015-09-14 20:50:12)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63914 bytes, from 2015-10-27 17:13:16)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml ( 1773 bytes, from 2015-09-24 17:30:00)
Copyright (C) 2013-2015 by the following authors:
- Rob Clark (robclark)
diff --git a/src/gallium/drivers/freedreno/adreno_pm4.xml.h b/src/gallium/drivers/freedreno/adreno_pm4.xml.h
index 490cf5beaf0..f095e3061b2 100644
--- a/src/gallium/drivers/freedreno/adreno_pm4.xml.h
+++ b/src/gallium/drivers/freedreno/adreno_pm4.xml.h
@@ -8,13 +8,14 @@ http://github.com/freedreno/envytools/
git clone https://github.com/freedreno/envytools.git
The rules-ng-ng source files this header was generated from are:
-- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 364 bytes, from 2015-05-20 20:03:07)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 398 bytes, from 2015-09-24 17:25:31)
- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1453 bytes, from 2015-05-20 20:03:07)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10755 bytes, from 2015-09-14 20:46:55)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 67771 bytes, from 2015-09-14 20:46:55)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63970 bytes, from 2015-09-14 20:50:12)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63914 bytes, from 2015-10-27 17:13:16)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml ( 1773 bytes, from 2015-09-24 17:30:00)
Copyright (C) 2013-2015 by the following authors:
- Rob Clark (robclark)
From 6459e780ae44d8826322e0dc2466d0ee6d9e9800 Mon Sep 17 00:00:00 2001
From: Rob Clark
Date: Tue, 27 Oct 2015 11:38:34 -0400
Subject: [PATCH 075/287] freedreno/a4xx: fix blend color
Signed-off-by: Rob Clark
---
src/gallium/drivers/freedreno/a4xx/fd4_emit.c | 14 +++++++++-----
1 file changed, 9 insertions(+), 5 deletions(-)
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
index 848a82fee7e..26b58718cd8 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
@@ -613,15 +613,19 @@ fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
if (dirty & FD_DIRTY_BLEND_COLOR) {
struct pipe_blend_color *bcolor = &ctx->blend_color;
- OUT_PKT0(ring, REG_A4XX_RB_BLEND_RED, 4);
- OUT_RING(ring, A4XX_RB_BLEND_RED_UINT(bcolor->color[0] * 255.0) |
+ OUT_PKT0(ring, REG_A4XX_RB_BLEND_RED, 8);
+ OUT_RING(ring, A4XX_RB_BLEND_RED_UINT(bcolor->color[0] * 65535.0) |
A4XX_RB_BLEND_RED_FLOAT(bcolor->color[0]));
- OUT_RING(ring, A4XX_RB_BLEND_GREEN_UINT(bcolor->color[1] * 255.0) |
+ OUT_RING(ring, A4XX_RB_BLEND_RED_F32(bcolor->color[0]));
+ OUT_RING(ring, A4XX_RB_BLEND_GREEN_UINT(bcolor->color[1] * 65535.0) |
A4XX_RB_BLEND_GREEN_FLOAT(bcolor->color[1]));
- OUT_RING(ring, A4XX_RB_BLEND_BLUE_UINT(bcolor->color[2] * 255.0) |
+ OUT_RING(ring, A4XX_RB_BLEND_GREEN_F32(bcolor->color[1]));
+ OUT_RING(ring, A4XX_RB_BLEND_BLUE_UINT(bcolor->color[2] * 65535.0) |
A4XX_RB_BLEND_BLUE_FLOAT(bcolor->color[2]));
- OUT_RING(ring, A4XX_RB_BLEND_ALPHA_UINT(bcolor->color[3] * 255.0) |
+ OUT_RING(ring, A4XX_RB_BLEND_BLUE_F32(bcolor->color[2]));
+ OUT_RING(ring, A4XX_RB_BLEND_ALPHA_UINT(bcolor->color[3] * 65535.0) |
A4XX_RB_BLEND_ALPHA_FLOAT(bcolor->color[3]));
+ OUT_RING(ring, A4XX_RB_BLEND_ALPHA_F32(bcolor->color[3]));
}
if (dirty & FD_DIRTY_VERTTEX) {
From bfc245e9ac430aab0b3c4c2a1b7767793f2854b0 Mon Sep 17 00:00:00 2001
From: Julien Isorce
Date: Fri, 6 Nov 2015 09:45:11 +0000
Subject: [PATCH 076/287] st/va: properly indent buffer.c, config.c, image.c
and picture.c
Some lines were using 4 indentation spaces instead of 3.
Signed-off-by: Julien Isorce
Reviewed-by: Christian König
Reviewed-by: Emil Velikov
---
src/gallium/state_trackers/va/buffer.c | 14 ++---
src/gallium/state_trackers/va/config.c | 12 ++--
src/gallium/state_trackers/va/image.c | 4 +-
src/gallium/state_trackers/va/picture.c | 80 ++++++++++++-------------
4 files changed, 55 insertions(+), 55 deletions(-)
diff --git a/src/gallium/state_trackers/va/buffer.c b/src/gallium/state_trackers/va/buffer.c
index 71a65037757..47bf35ac725 100644
--- a/src/gallium/state_trackers/va/buffer.c
+++ b/src/gallium/state_trackers/va/buffer.c
@@ -152,11 +152,11 @@ vlVaUnmapBuffer(VADriverContextP ctx, VABufferID buf_id)
return VA_STATUS_ERROR_INVALID_BUFFER;
if (buf->derived_surface.resource) {
- if (!buf->derived_surface.transfer)
- return VA_STATUS_ERROR_INVALID_BUFFER;
+ if (!buf->derived_surface.transfer)
+ return VA_STATUS_ERROR_INVALID_BUFFER;
- pipe_buffer_unmap(drv->pipe, buf->derived_surface.transfer);
- buf->derived_surface.transfer = NULL;
+ pipe_buffer_unmap(drv->pipe, buf->derived_surface.transfer);
+ buf->derived_surface.transfer = NULL;
}
return VA_STATUS_SUCCESS;
@@ -175,10 +175,10 @@ vlVaDestroyBuffer(VADriverContextP ctx, VABufferID buf_id)
return VA_STATUS_ERROR_INVALID_BUFFER;
if (buf->derived_surface.resource) {
- if (buf->export_refcount > 0)
- return VA_STATUS_ERROR_INVALID_BUFFER;
+ if (buf->export_refcount > 0)
+ return VA_STATUS_ERROR_INVALID_BUFFER;
- pipe_resource_reference(&buf->derived_surface.resource, NULL);
+ pipe_resource_reference(&buf->derived_surface.resource, NULL);
}
FREE(buf->data);
diff --git a/src/gallium/state_trackers/va/config.c b/src/gallium/state_trackers/va/config.c
index 0f47aacdbd6..a545a18c1e0 100644
--- a/src/gallium/state_trackers/va/config.c
+++ b/src/gallium/state_trackers/va/config.c
@@ -71,8 +71,8 @@ vlVaQueryConfigEntrypoints(VADriverContextP ctx, VAProfile profile,
*num_entrypoints = 0;
if (profile == VAProfileNone) {
- entrypoint_list[(*num_entrypoints)++] = VAEntrypointVideoProc;
- return VA_STATUS_SUCCESS;
+ entrypoint_list[(*num_entrypoints)++] = VAEntrypointVideoProc;
+ return VA_STATUS_SUCCESS;
}
p = ProfileToPipe(profile);
@@ -104,7 +104,7 @@ vlVaGetConfigAttributes(VADriverContextP ctx, VAProfile profile, VAEntrypoint en
value = VA_RT_FORMAT_YUV420;
break;
case VAConfigAttribRateControl:
- value = VA_RC_NONE;
+ value = VA_RC_NONE;
break;
default:
value = VA_ATTRIB_NOT_SUPPORTED;
@@ -127,8 +127,8 @@ vlVaCreateConfig(VADriverContextP ctx, VAProfile profile, VAEntrypoint entrypoin
return VA_STATUS_ERROR_INVALID_CONTEXT;
if (profile == VAProfileNone && entrypoint == VAEntrypointVideoProc) {
- *config_id = PIPE_VIDEO_PROFILE_UNKNOWN;
- return VA_STATUS_SUCCESS;
+ *config_id = PIPE_VIDEO_PROFILE_UNKNOWN;
+ return VA_STATUS_SUCCESS;
}
p = ProfileToPipe(profile);
@@ -167,7 +167,7 @@ vlVaQueryConfigAttributes(VADriverContextP ctx, VAConfigID config_id, VAProfile
if (config_id == PIPE_VIDEO_PROFILE_UNKNOWN) {
*entrypoint = VAEntrypointVideoProc;
- *num_attribs = 0;
+ *num_attribs = 0;
return VA_STATUS_SUCCESS;
}
diff --git a/src/gallium/state_trackers/va/image.c b/src/gallium/state_trackers/va/image.c
index c6d0c5abf65..ae07da857e1 100644
--- a/src/gallium/state_trackers/va/image.c
+++ b/src/gallium/state_trackers/va/image.c
@@ -447,8 +447,8 @@ vlVaPutImage(VADriverContextP ctx, VASurfaceID surface, VAImageID image,
tmp_buf = drv->pipe->create_video_buffer(drv->pipe, &surf->templat);
if (!tmp_buf) {
- surf->templat.buffer_format = old_surf_format;
- return VA_STATUS_ERROR_ALLOCATION_FAILED;
+ surf->templat.buffer_format = old_surf_format;
+ return VA_STATUS_ERROR_ALLOCATION_FAILED;
}
surf->buffer->destroy(surf->buffer);
diff --git a/src/gallium/state_trackers/va/picture.c b/src/gallium/state_trackers/va/picture.c
index e850689005d..644b8488ec2 100644
--- a/src/gallium/state_trackers/va/picture.c
+++ b/src/gallium/state_trackers/va/picture.c
@@ -65,7 +65,7 @@ vlVaBeginPicture(VADriverContextP ctx, VAContextID context_id, VASurfaceID rende
if ((context->target->buffer_format != PIPE_FORMAT_B8G8R8A8_UNORM &&
context->target->buffer_format != PIPE_FORMAT_R8G8B8A8_UNORM) ||
context->target->interlaced)
- return VA_STATUS_ERROR_UNIMPLEMENTED;
+ return VA_STATUS_ERROR_UNIMPLEMENTED;
return VA_STATUS_SUCCESS;
}
@@ -717,60 +717,60 @@ handleVASliceDataBufferType(vlVaContext *context, vlVaBuffer *buf)
static VAStatus
handleVAProcPipelineParameterBufferType(vlVaDriver *drv, vlVaContext *context, vlVaBuffer *buf)
{
- struct u_rect src_rect;
- struct u_rect dst_rect;
- struct u_rect *dirty_area;
- vlVaSurface *src_surface;
- VAProcPipelineParameterBuffer *pipeline_param;
- struct pipe_surface **surfaces;
- struct pipe_screen *screen;
- struct pipe_surface *psurf;
+ struct u_rect src_rect;
+ struct u_rect dst_rect;
+ struct u_rect *dirty_area;
+ vlVaSurface *src_surface;
+ VAProcPipelineParameterBuffer *pipeline_param;
+ struct pipe_surface **surfaces;
+ struct pipe_screen *screen;
+ struct pipe_surface *psurf;
- if (!drv || !context)
- return VA_STATUS_ERROR_INVALID_CONTEXT;
+ if (!drv || !context)
+ return VA_STATUS_ERROR_INVALID_CONTEXT;
- if (!buf || !buf->data)
- return VA_STATUS_ERROR_INVALID_BUFFER;
+ if (!buf || !buf->data)
+ return VA_STATUS_ERROR_INVALID_BUFFER;
- if (!context->target)
- return VA_STATUS_ERROR_INVALID_SURFACE;
+ if (!context->target)
+ return VA_STATUS_ERROR_INVALID_SURFACE;
- pipeline_param = (VAProcPipelineParameterBuffer *)buf->data;
+ pipeline_param = (VAProcPipelineParameterBuffer *)buf->data;
- src_surface = handle_table_get(drv->htab, pipeline_param->surface);
- if (!src_surface || !src_surface->buffer)
- return VA_STATUS_ERROR_INVALID_SURFACE;
+ src_surface = handle_table_get(drv->htab, pipeline_param->surface);
+ if (!src_surface || !src_surface->buffer)
+ return VA_STATUS_ERROR_INVALID_SURFACE;
- surfaces = context->target->get_surfaces(context->target);
+ surfaces = context->target->get_surfaces(context->target);
- if (!surfaces || !surfaces[0])
- return VA_STATUS_ERROR_INVALID_SURFACE;
+ if (!surfaces || !surfaces[0])
+ return VA_STATUS_ERROR_INVALID_SURFACE;
- screen = drv->pipe->screen;
+ screen = drv->pipe->screen;
- psurf = surfaces[0];
+ psurf = surfaces[0];
- src_rect.x0 = pipeline_param->surface_region->x;
- src_rect.y0 = pipeline_param->surface_region->y;
- src_rect.x1 = pipeline_param->surface_region->x + pipeline_param->surface_region->width;
- src_rect.y1 = pipeline_param->surface_region->y + pipeline_param->surface_region->height;
+ src_rect.x0 = pipeline_param->surface_region->x;
+ src_rect.y0 = pipeline_param->surface_region->y;
+ src_rect.x1 = pipeline_param->surface_region->x + pipeline_param->surface_region->width;
+ src_rect.y1 = pipeline_param->surface_region->y + pipeline_param->surface_region->height;
- dst_rect.x0 = pipeline_param->output_region->x;
- dst_rect.y0 = pipeline_param->output_region->y;
- dst_rect.x1 = pipeline_param->output_region->x + pipeline_param->output_region->width;
- dst_rect.y1 = pipeline_param->output_region->y + pipeline_param->output_region->height;
+ dst_rect.x0 = pipeline_param->output_region->x;
+ dst_rect.y0 = pipeline_param->output_region->y;
+ dst_rect.x1 = pipeline_param->output_region->x + pipeline_param->output_region->width;
+ dst_rect.y1 = pipeline_param->output_region->y + pipeline_param->output_region->height;
- dirty_area = vl_screen_get_dirty_area(drv->vscreen);
+ dirty_area = vl_screen_get_dirty_area(drv->vscreen);
- vl_compositor_clear_layers(&drv->cstate);
- vl_compositor_set_buffer_layer(&drv->cstate, &drv->compositor, 0, src_surface->buffer, &src_rect, NULL, VL_COMPOSITOR_WEAVE);
- vl_compositor_set_layer_dst_area(&drv->cstate, 0, &dst_rect);
- vl_compositor_render(&drv->cstate, &drv->compositor, psurf, dirty_area, true);
+ vl_compositor_clear_layers(&drv->cstate);
+ vl_compositor_set_buffer_layer(&drv->cstate, &drv->compositor, 0, src_surface->buffer, &src_rect, NULL, VL_COMPOSITOR_WEAVE);
+ vl_compositor_set_layer_dst_area(&drv->cstate, 0, &dst_rect);
+ vl_compositor_render(&drv->cstate, &drv->compositor, psurf, dirty_area, true);
- screen->fence_reference(screen, &src_surface->fence, NULL);
- drv->pipe->flush(drv->pipe, &src_surface->fence, 0);
+ screen->fence_reference(screen, &src_surface->fence, NULL);
+ drv->pipe->flush(drv->pipe, &src_surface->fence, 0);
- return VA_STATUS_SUCCESS;
+ return VA_STATUS_SUCCESS;
}
VAStatus
From bf6acbb2db4baaf18ae5a139142acf06e84d1b9c Mon Sep 17 00:00:00 2001
From: Julien Isorce
Date: Fri, 6 Nov 2015 09:45:17 +0000
Subject: [PATCH 077/287] st/va: properly use brackets in
vlVaAcquireBufferHandle's switch
In "switch (mem_type)" the brackets were surrounding "case+default"
instead of "case" only.
Signed-off-by: Julien Isorce
Reviewed-by: Christian König
Reviewed-by: Emil Velikov
---
src/gallium/state_trackers/va/buffer.c | 9 ++++-----
1 file changed, 4 insertions(+), 5 deletions(-)
diff --git a/src/gallium/state_trackers/va/buffer.c b/src/gallium/state_trackers/va/buffer.c
index 47bf35ac725..769305e2999 100644
--- a/src/gallium/state_trackers/va/buffer.c
+++ b/src/gallium/state_trackers/va/buffer.c
@@ -280,15 +280,14 @@ vlVaAcquireBufferHandle(VADriverContextP ctx, VABufferID buf_id,
buf_info->handle = (intptr_t)whandle.handle;
break;
+ }
default:
return VA_STATUS_ERROR_UNSUPPORTED_MEMORY_TYPE;
}
- }
-
- buf_info->type = buf->type;
- buf_info->mem_type = mem_type;
- buf_info->mem_size = buf->num_elements * buf->size;
+ buf_info->type = buf->type;
+ buf_info->mem_type = mem_type;
+ buf_info->mem_size = buf->num_elements * buf->size;
}
buf->export_refcount++;
From 42a5e143a8d58a0ad15dd5747449eb4b57c87177 Mon Sep 17 00:00:00 2001
From: Julien Isorce
Date: Fri, 6 Nov 2015 09:45:19 +0000
Subject: [PATCH 078/287] vl/buffers: add RGBX and BGRX to the supported
formats
Useful if one wants to create RGBX or BGRX surfaces.
The infrastructure is such that it required just a
few definitions to support these formats.
Signed-off-by: Julien Isorce
Reviewed-by: Christian König
Reviewed-by: Emil Velikov
---
src/gallium/auxiliary/vl/vl_video_buffer.c | 18 ++++++++++++++++++
1 file changed, 18 insertions(+)
diff --git a/src/gallium/auxiliary/vl/vl_video_buffer.c b/src/gallium/auxiliary/vl/vl_video_buffer.c
index 5e0ae0ecb8b..6cd2557a892 100644
--- a/src/gallium/auxiliary/vl/vl_video_buffer.c
+++ b/src/gallium/auxiliary/vl/vl_video_buffer.c
@@ -62,6 +62,18 @@ const enum pipe_format const_resource_formats_VUYA[3] = {
PIPE_FORMAT_NONE
};
+const enum pipe_format const_resource_formats_YUVX[3] = {
+ PIPE_FORMAT_R8G8B8X8_UNORM,
+ PIPE_FORMAT_NONE,
+ PIPE_FORMAT_NONE
+};
+
+const enum pipe_format const_resource_formats_VUYX[3] = {
+ PIPE_FORMAT_B8G8R8X8_UNORM,
+ PIPE_FORMAT_NONE,
+ PIPE_FORMAT_NONE
+};
+
const enum pipe_format const_resource_formats_YUYV[3] = {
PIPE_FORMAT_R8G8_R8B8_UNORM,
PIPE_FORMAT_NONE,
@@ -102,6 +114,12 @@ vl_video_buffer_formats(struct pipe_screen *screen, enum pipe_format format)
case PIPE_FORMAT_B8G8R8A8_UNORM:
return const_resource_formats_VUYA;
+ case PIPE_FORMAT_R8G8B8X8_UNORM:
+ return const_resource_formats_YUVX;
+
+ case PIPE_FORMAT_B8G8R8X8_UNORM:
+ return const_resource_formats_VUYX;
+
case PIPE_FORMAT_YUYV:
return const_resource_formats_YUYV;
From cc1e5c972eff8c774c93c8dc51d89b550d00633e Mon Sep 17 00:00:00 2001
From: Julien Isorce
Date: Fri, 6 Nov 2015 09:45:22 +0000
Subject: [PATCH 079/287] st/va: add support for RGBX and BGRX in VPP
Before it was only possible to convert a NV12 surface to
RGBA or BGRA. This patch uses the same post processing
function, "handleVAProcPipelineParameterBufferType", but
adds definitions for RGBX and BGRX.
This patch also makes vlVaQuerySurfaceAttributes more generic
to avoid copy and pasting the same lines.
Signed-off-by: Julien Isorce
Reviewed-by: Christian König
Reviewed-by: Emil Velikov
---
src/gallium/state_trackers/va/picture.c | 5 ++--
src/gallium/state_trackers/va/surface.c | 36 ++++++++++++++-----------
2 files changed, 23 insertions(+), 18 deletions(-)
diff --git a/src/gallium/state_trackers/va/picture.c b/src/gallium/state_trackers/va/picture.c
index 644b8488ec2..d6cdbea197d 100644
--- a/src/gallium/state_trackers/va/picture.c
+++ b/src/gallium/state_trackers/va/picture.c
@@ -59,11 +59,12 @@ vlVaBeginPicture(VADriverContextP ctx, VAContextID context_id, VASurfaceID rende
return VA_STATUS_ERROR_INVALID_SURFACE;
context->target = surf->buffer;
-
if (!context->decoder) {
/* VPP */
if ((context->target->buffer_format != PIPE_FORMAT_B8G8R8A8_UNORM &&
- context->target->buffer_format != PIPE_FORMAT_R8G8B8A8_UNORM) ||
+ context->target->buffer_format != PIPE_FORMAT_R8G8B8A8_UNORM &&
+ context->target->buffer_format != PIPE_FORMAT_B8G8R8X8_UNORM &&
+ context->target->buffer_format != PIPE_FORMAT_R8G8B8X8_UNORM) ||
context->target->interlaced)
return VA_STATUS_ERROR_UNIMPLEMENTED;
return VA_STATUS_SUCCESS;
diff --git a/src/gallium/state_trackers/va/surface.c b/src/gallium/state_trackers/va/surface.c
index 3db21c3de39..589d6860b6a 100644
--- a/src/gallium/state_trackers/va/surface.c
+++ b/src/gallium/state_trackers/va/surface.c
@@ -45,6 +45,11 @@
#include
+static const enum pipe_format vpp_surface_formats[] = {
+ PIPE_FORMAT_B8G8R8A8_UNORM, PIPE_FORMAT_R8G8B8A8_UNORM,
+ PIPE_FORMAT_B8G8R8X8_UNORM, PIPE_FORMAT_R8G8B8X8_UNORM
+};
+
VAStatus
vlVaCreateSurfaces(VADriverContextP ctx, int width, int height, int format,
int num_surfaces, VASurfaceID *surfaces)
@@ -314,7 +319,9 @@ vlVaQuerySurfaceAttributes(VADriverContextP ctx, VAConfigID config,
vlVaDriver *drv;
VASurfaceAttrib *attribs;
struct pipe_screen *pscreen;
- int i;
+ int i, j;
+
+ STATIC_ASSERT(ARRAY_SIZE(vpp_surface_formats) <= VL_VA_MAX_IMAGE_FORMATS);
if (config == VA_INVALID_ID)
return VA_STATUS_ERROR_INVALID_CONFIG;
@@ -323,7 +330,7 @@ vlVaQuerySurfaceAttributes(VADriverContextP ctx, VAConfigID config,
return VA_STATUS_ERROR_INVALID_PARAMETER;
if (!attrib_list) {
- *num_attribs = VASurfaceAttribCount;
+ *num_attribs = VL_VA_MAX_IMAGE_FORMATS + VASurfaceAttribCount;
return VA_STATUS_SUCCESS;
}
@@ -340,27 +347,24 @@ vlVaQuerySurfaceAttributes(VADriverContextP ctx, VAConfigID config,
if (!pscreen)
return VA_STATUS_ERROR_INVALID_CONTEXT;
- attribs = CALLOC(VASurfaceAttribCount, sizeof(VASurfaceAttrib));
+ attribs = CALLOC(VL_VA_MAX_IMAGE_FORMATS + VASurfaceAttribCount,
+ sizeof(VASurfaceAttrib));
if (!attribs)
return VA_STATUS_ERROR_ALLOCATION_FAILED;
i = 0;
+ /* vlVaCreateConfig returns PIPE_VIDEO_PROFILE_UNKNOWN
+ * only for VAEntrypointVideoProc. */
if (config == PIPE_VIDEO_PROFILE_UNKNOWN) {
- /* vlVaCreateConfig returns PIPE_VIDEO_PROFILE_UNKNOWN
- only for VAEntrypointVideoProc. */
- attribs[i].type = VASurfaceAttribPixelFormat;
- attribs[i].value.type = VAGenericValueTypeInteger;
- attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE | VA_SURFACE_ATTRIB_SETTABLE;
- attribs[i].value.value.i = VA_FOURCC_BGRA;
- i++;
-
- attribs[i].type = VASurfaceAttribPixelFormat;
- attribs[i].value.type = VAGenericValueTypeInteger;
- attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE | VA_SURFACE_ATTRIB_SETTABLE;
- attribs[i].value.value.i = VA_FOURCC_RGBA;
- i++;
+ for (j = 0; j < ARRAY_SIZE(vpp_surface_formats); ++j) {
+ attribs[i].type = VASurfaceAttribPixelFormat;
+ attribs[i].value.type = VAGenericValueTypeInteger;
+ attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE | VA_SURFACE_ATTRIB_SETTABLE;
+ attribs[i].value.value.i = PipeFormatToVaFourcc(vpp_surface_formats[j]);
+ i++;
+ }
} else {
/* Assume VAEntrypointVLD for now. */
attribs[i].type = VASurfaceAttribPixelFormat;
From ed55def44febbe1662ddcc0c33a23308899ce488 Mon Sep 17 00:00:00 2001
From: Boyuan Zhang
Date: Wed, 23 Sep 2015 10:11:07 +0200
Subject: [PATCH 080/287] st/vaapi: fix vaapi VC-1 simple/main corruption v2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Apply the start code fix only to advanced profile.
v2 (chk): add commit message
Signed-off-by: Boyuan Zhang
Reviewed-by: Christian König
Reviewed-by: Alex Deucher
Cc: "10.6 11.0"
---
src/gallium/state_trackers/va/picture.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/src/gallium/state_trackers/va/picture.c b/src/gallium/state_trackers/va/picture.c
index d6cdbea197d..5e7841a0521 100644
--- a/src/gallium/state_trackers/va/picture.c
+++ b/src/gallium/state_trackers/va/picture.c
@@ -694,8 +694,10 @@ handleVASliceDataBufferType(vlVaContext *context, vlVaBuffer *buf)
bufHasStartcode(buf, 0x0000010b, 32))
break;
+ if (context->decoder->profile == PIPE_VIDEO_PROFILE_VC1_ADVANCED) {
buffers[num_buffers] = (void *const)&start_code_vc1;
sizes[num_buffers++] = sizeof(start_code_vc1);
+ }
break;
case PIPE_VIDEO_FORMAT_MPEG4:
if (bufHasStartcode(buf, 0x000001, 24))
From 6bad554d98004e6c8ab46e8cbe73f3b3024e55c5 Mon Sep 17 00:00:00 2001
From: Boyuan Zhang
Date: Wed, 23 Sep 2015 10:11:08 +0200
Subject: [PATCH 081/287] radeon/uvd: fix VC-1 simple/main profile decode v2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
We just needed to set the extra width/height fields to get this working.
v2 (chk): rebased, CC stable added, commit message added, fixed coding style
Signed-off-by: Boyuan Zhang
Signed-off-by: Christian König
Reviewed-by: Alex Deucher
Cc: "10.6 11.0"
---
src/gallium/drivers/radeon/radeon_uvd.c | 6 ++++++
src/gallium/drivers/radeon/radeon_video.c | 3 +--
2 files changed, 7 insertions(+), 2 deletions(-)
diff --git a/src/gallium/drivers/radeon/radeon_uvd.c b/src/gallium/drivers/radeon/radeon_uvd.c
index 33b01361aa5..0c643e5cd59 100644
--- a/src/gallium/drivers/radeon/radeon_uvd.c
+++ b/src/gallium/drivers/radeon/radeon_uvd.c
@@ -947,6 +947,12 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder,
dec->msg->body.decode.width_in_samples = dec->base.width;
dec->msg->body.decode.height_in_samples = dec->base.height;
+ if ((picture->profile == PIPE_VIDEO_PROFILE_VC1_SIMPLE) ||
+ (picture->profile == PIPE_VIDEO_PROFILE_VC1_MAIN)) {
+ dec->msg->body.decode.width_in_samples = align(dec->msg->body.decode.width_in_samples, 16) / 16;
+ dec->msg->body.decode.height_in_samples = align(dec->msg->body.decode.height_in_samples, 16) / 16;
+ }
+
dec->msg->body.decode.dpb_size = dec->dpb.res->buf->size;
dec->msg->body.decode.bsd_size = bs_size;
dec->msg->body.decode.db_pitch = dec->base.width;
diff --git a/src/gallium/drivers/radeon/radeon_video.c b/src/gallium/drivers/radeon/radeon_video.c
index 32bfc32073b..f56c6cf6cb4 100644
--- a/src/gallium/drivers/radeon/radeon_video.c
+++ b/src/gallium/drivers/radeon/radeon_video.c
@@ -244,8 +244,7 @@ int rvid_get_video_param(struct pipe_screen *screen,
return codec != PIPE_VIDEO_FORMAT_MPEG4;
return true;
case PIPE_VIDEO_FORMAT_VC1:
- /* FIXME: VC-1 simple/main profile is broken */
- return profile == PIPE_VIDEO_PROFILE_VC1_ADVANCED;
+ return true;
case PIPE_VIDEO_FORMAT_HEVC:
/* Carrizo only supports HEVC Main */
return rscreen->family >= CHIP_CARRIZO &&
From 91f188710ad8dce79936c5d28fd7b9a91b6d870a Mon Sep 17 00:00:00 2001
From: Jordan Justen
Date: Fri, 9 Oct 2015 14:16:05 -0700
Subject: [PATCH 082/287] glsl: Add new barrier functions for compute shaders
When these functions are called in GLSL code, we create an intrinsic
function call:
* groupMemoryBarrier => __intrinsic_group_memory_barrier
* memoryBarrierAtomicCounter => __intrinsic_memory_barrier_atomic_counter
* memoryBarrierBuffer => __intrinsic_memory_barrier_buffer
* memoryBarrierImage => __intrinsic_memory_barrier_image
* memoryBarrierShared => __intrinsic_memory_barrier_shared
v2:
* Consolidate with memoryBarrier function/intrinsic creation (curro)
v3:
* Instead of add_memory_barrier_function, add an intrinsic_name
parameter to _memory_barrier (curro)
Signed-off-by: Jordan Justen
Reviewed-by: Francisco Jerez
---
src/glsl/builtin_functions.cpp | 55 ++++++++++++++++++++++++++++++----
1 file changed, 49 insertions(+), 6 deletions(-)
diff --git a/src/glsl/builtin_functions.cpp b/src/glsl/builtin_functions.cpp
index 509a57b8813..13494446b59 100644
--- a/src/glsl/builtin_functions.cpp
+++ b/src/glsl/builtin_functions.cpp
@@ -458,10 +458,16 @@ fp64(const _mesa_glsl_parse_state *state)
return state->has_double();
}
+static bool
+compute_shader(const _mesa_glsl_parse_state *state)
+{
+ return state->stage == MESA_SHADER_COMPUTE;
+}
+
static bool
barrier_supported(const _mesa_glsl_parse_state *state)
{
- return state->stage == MESA_SHADER_COMPUTE ||
+ return compute_shader(state) ||
state->stage == MESA_SHADER_TESS_CTRL;
}
@@ -785,8 +791,8 @@ private:
ir_function_signature *_memory_barrier_intrinsic(
builtin_available_predicate avail);
- ir_function_signature *_memory_barrier(
- builtin_available_predicate avail);
+ ir_function_signature *_memory_barrier(const char *intrinsic_name,
+ builtin_available_predicate avail);
ir_function_signature *_shader_clock_intrinsic(builtin_available_predicate avail,
const glsl_type *type);
@@ -963,6 +969,21 @@ builtin_builder::create_intrinsics()
add_function("__intrinsic_memory_barrier",
_memory_barrier_intrinsic(shader_image_load_store),
NULL);
+ add_function("__intrinsic_group_memory_barrier",
+ _memory_barrier_intrinsic(compute_shader),
+ NULL);
+ add_function("__intrinsic_memory_barrier_atomic_counter",
+ _memory_barrier_intrinsic(compute_shader),
+ NULL);
+ add_function("__intrinsic_memory_barrier_buffer",
+ _memory_barrier_intrinsic(compute_shader),
+ NULL);
+ add_function("__intrinsic_memory_barrier_image",
+ _memory_barrier_intrinsic(compute_shader),
+ NULL);
+ add_function("__intrinsic_memory_barrier_shared",
+ _memory_barrier_intrinsic(compute_shader),
+ NULL);
add_function("__intrinsic_shader_clock",
_shader_clock_intrinsic(shader_clock,
@@ -2754,7 +2775,28 @@ builtin_builder::create_builtins()
add_image_functions(true);
add_function("memoryBarrier",
- _memory_barrier(shader_image_load_store),
+ _memory_barrier("__intrinsic_memory_barrier",
+ shader_image_load_store),
+ NULL);
+ add_function("groupMemoryBarrier",
+ _memory_barrier("__intrinsic_group_memory_barrier",
+ compute_shader),
+ NULL);
+ add_function("memoryBarrierAtomicCounter",
+ _memory_barrier("__intrinsic_memory_barrier_atomic_counter",
+ compute_shader),
+ NULL);
+ add_function("memoryBarrierBuffer",
+ _memory_barrier("__intrinsic_memory_barrier_buffer",
+ compute_shader),
+ NULL);
+ add_function("memoryBarrierImage",
+ _memory_barrier("__intrinsic_memory_barrier_image",
+ compute_shader),
+ NULL);
+ add_function("memoryBarrierShared",
+ _memory_barrier("__intrinsic_memory_barrier_shared",
+ compute_shader),
NULL);
add_function("clock2x32ARB",
@@ -5264,10 +5306,11 @@ builtin_builder::_memory_barrier_intrinsic(builtin_available_predicate avail)
}
ir_function_signature *
-builtin_builder::_memory_barrier(builtin_available_predicate avail)
+builtin_builder::_memory_barrier(const char *intrinsic_name,
+ builtin_available_predicate avail)
{
MAKE_SIG(glsl_type::void_type, avail, 0);
- body.emit(call(shader->symbols->get_function("__intrinsic_memory_barrier"),
+ body.emit(call(shader->symbols->get_function(intrinsic_name),
NULL, sig->parameters));
return sig;
}
From 9d65f3208bbded17119c7ad38f4b692d3ed00635 Mon Sep 17 00:00:00 2001
From: Jordan Justen
Date: Sat, 10 Oct 2015 08:59:42 -0700
Subject: [PATCH 083/287] nir: Add new barrier functions for compute shaders
When these functions are called in glsl-ir, we create a corresponding
nir intrinsic function call.
Signed-off-by: Jordan Justen
Reviewed-by: Francisco Jerez
---
src/glsl/nir/glsl_to_nir.cpp | 15 +++++++++++++++
src/glsl/nir/nir_intrinsics.h | 11 +++++++++++
2 files changed, 26 insertions(+)
diff --git a/src/glsl/nir/glsl_to_nir.cpp b/src/glsl/nir/glsl_to_nir.cpp
index 57aba5be0f5..facb9fa4a7a 100644
--- a/src/glsl/nir/glsl_to_nir.cpp
+++ b/src/glsl/nir/glsl_to_nir.cpp
@@ -719,6 +719,16 @@ nir_visitor::visit(ir_call *ir)
op = nir_intrinsic_ssbo_atomic_comp_swap;
} else if (strcmp(ir->callee_name(), "__intrinsic_shader_clock") == 0) {
op = nir_intrinsic_shader_clock;
+ } else if (strcmp(ir->callee_name(), "__intrinsic_group_memory_barrier") == 0) {
+ op = nir_intrinsic_group_memory_barrier;
+ } else if (strcmp(ir->callee_name(), "__intrinsic_memory_barrier_atomic_counter") == 0) {
+ op = nir_intrinsic_memory_barrier_atomic_counter;
+ } else if (strcmp(ir->callee_name(), "__intrinsic_memory_barrier_buffer") == 0) {
+ op = nir_intrinsic_memory_barrier_buffer;
+ } else if (strcmp(ir->callee_name(), "__intrinsic_memory_barrier_image") == 0) {
+ op = nir_intrinsic_memory_barrier_image;
+ } else if (strcmp(ir->callee_name(), "__intrinsic_memory_barrier_shared") == 0) {
+ op = nir_intrinsic_memory_barrier_shared;
} else {
unreachable("not reached");
}
@@ -821,6 +831,11 @@ nir_visitor::visit(ir_call *ir)
break;
}
case nir_intrinsic_memory_barrier:
+ case nir_intrinsic_group_memory_barrier:
+ case nir_intrinsic_memory_barrier_atomic_counter:
+ case nir_intrinsic_memory_barrier_buffer:
+ case nir_intrinsic_memory_barrier_image:
+ case nir_intrinsic_memory_barrier_shared:
nir_instr_insert_after_cf_list(this->cf_node_list, &instr->instr);
break;
case nir_intrinsic_shader_clock:
diff --git a/src/glsl/nir/nir_intrinsics.h b/src/glsl/nir/nir_intrinsics.h
index c2b6fe7166d..36fb2861c16 100644
--- a/src/glsl/nir/nir_intrinsics.h
+++ b/src/glsl/nir/nir_intrinsics.h
@@ -91,6 +91,17 @@ BARRIER(memory_barrier)
*/
INTRINSIC(shader_clock, 0, ARR(), true, 1, 0, 0, NIR_INTRINSIC_CAN_ELIMINATE)
+/*
+ * Memory barrier with semantics analogous to the compute shader
+ * groupMemoryBarrier(), memoryBarrierAtomicCounter(), memoryBarrierBuffer(),
+ * memoryBarrierImage() and memoryBarrierShared() GLSL intrinsics.
+ */
+BARRIER(group_memory_barrier)
+BARRIER(memory_barrier_atomic_counter)
+BARRIER(memory_barrier_buffer)
+BARRIER(memory_barrier_image)
+BARRIER(memory_barrier_shared)
+
/** A conditional discard, with a single boolean source. */
INTRINSIC(discard_if, 1, ARR(1), false, 0, 0, 0, 0)
From faa119307035787f5e421dd6a9eb4d0101de963b Mon Sep 17 00:00:00 2001
From: Jordan Justen
Date: Sat, 10 Oct 2015 13:00:04 -0700
Subject: [PATCH 084/287] i965/nir/fs: Implement new barrier functions for
compute shaders
For these nir intrinsics, we emit the same code as
nir_intrinsic_memory_barrier:
* nir_intrinsic_memory_barrier_atomic_counter
* nir_intrinsic_memory_barrier_buffer
* nir_intrinsic_memory_barrier_image
We treat these nir intrinsics as no-ops:
* nir_intrinsic_group_memory_barrier
* nir_intrinsic_memory_barrier_shared
v3:
* Add comment for no-op cases (curro)
v4:
* Moving comment to a separate patch authored by curro
Signed-off-by: Jordan Justen
Reviewed-by: Francisco Jerez
---
src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 261518605b7..5d2dd18552a 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -1697,6 +1697,9 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
break;
}
+ case nir_intrinsic_memory_barrier_atomic_counter:
+ case nir_intrinsic_memory_barrier_buffer:
+ case nir_intrinsic_memory_barrier_image:
case nir_intrinsic_memory_barrier: {
const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 16 / dispatch_width);
bld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp)
@@ -1704,6 +1707,10 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
break;
}
+ case nir_intrinsic_group_memory_barrier:
+ case nir_intrinsic_memory_barrier_shared:
+ break;
+
case nir_intrinsic_shader_clock: {
/* We cannot do anything if there is an event, so ignore it for now */
fs_reg shader_clock = get_timestamp(bld);
From 51694072218b5ae84b5d8f98ee2172d7c5d61b31 Mon Sep 17 00:00:00 2001
From: Francisco Jerez
Date: Fri, 6 Nov 2015 13:19:56 -0800
Subject: [PATCH 085/287] i965/nir/fs: Add comment for no-op memory barrier
functions
Reviewed-by: Jordan Justen
---
src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 19 +++++++++++++++++++
1 file changed, 19 insertions(+)
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 5d2dd18552a..02b9f5bbc8a 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -1709,6 +1709,25 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
case nir_intrinsic_group_memory_barrier:
case nir_intrinsic_memory_barrier_shared:
+ /* We treat these workgroup-level barriers as no-ops. This should be
+ * safe at present and as long as:
+ *
+ * - Memory access instructions are not subsequently reordered by the
+ * compiler back-end.
+ *
+ * - All threads from a given compute shader workgroup fit within a
+ * single subslice and therefore talk to the same HDC shared unit
+ * which supposedly guarantees ordering and coherency between threads
+ * from the same workgroup. This may change in the future when we
+ * start splitting workgroups across multiple subslices.
+ *
+ * - The context is not in fault-and-stream mode, which could cause
+ * memory transactions (including to SLM) prior to the barrier to be
+ * replayed after the barrier if a pagefault occurs. This shouldn't
+ * be a problem up to and including SKL because fault-and-stream is
+ * not usable due to hardware issues, but that's likely to change in
+ * the future.
+ */
break;
case nir_intrinsic_shader_clock: {
From 12c850d01ce2bf364f2b1719154df789d43a7a59 Mon Sep 17 00:00:00 2001
From: Hans de Goede
Date: Thu, 5 Nov 2015 14:32:34 +0100
Subject: [PATCH 086/287] nvc0/ir: Add support for double immediates
Add support for encoding double immediates (up to 20 bits of precision)
into the generated nvc0 machine-code.
Signed-off-by: Hans de Goede
Reviewed-by: Ilia Mirkin
---
src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
index fd103146c72..8784f3b0a21 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
@@ -323,6 +323,14 @@ CodeEmitterNVC0::setImmediate(const Instruction *i, const int s)
assert(imm);
u32 = imm->reg.data.u32;
+ if ((code[0] & 0xf) == 0x1) {
+ // double immediate
+ uint64_t u64 = imm->reg.data.u64;
+ assert(!(u64 & 0x00000fffffffffffULL));
+ assert(!(code[1] & 0xc000));
+ code[0] |= ((u64 >> 44) & 0x3f) << 26;
+ code[1] |= 0xc000 | (u64 >> 50);
+ } else
if ((code[0] & 0xf) == 0x2) {
// LIMM
code[0] |= (u32 & 0x3f) << 26;
From b487b55f7d08c00f2efabc097c7138403528893f Mon Sep 17 00:00:00 2001
From: Hans de Goede
Date: Thu, 5 Nov 2015 14:32:35 +0100
Subject: [PATCH 087/287] gm107/ir: Add support for double immediates
Add support for encoding double immediates (up to 20 bits of precision)
into the generated gm107 machine-code.
Signed-off-by: Hans de Goede
Reviewed-by: Ilia Mirkin
---
src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
index a327d572470..7e6ed842d54 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
@@ -310,9 +310,12 @@ CodeEmitterGM107::emitIMMD(int pos, int len, const ValueRef &ref)
uint32_t val = imm->reg.data.u32;
if (len == 19) {
- if (isFloatType(insn->sType)) {
+ if (insn->sType == TYPE_F32 || insn->sType == TYPE_F16) {
assert(!(val & 0x00000fff));
val >>= 12;
+ } else if (insn->sType == TYPE_F64) {
+ assert(!(imm->reg.data.u64 & 0x00000fffffffffffULL));
+ val = imm->reg.data.u64 >> 44;
}
assert(!(val & 0xfff00000) || (val & 0xfff00000) == 0xfff00000);
emitField( 56, 1, (val & 0x80000) >> 19);
From 11e3dac36e7b992e30efbce4473451c4e1ac617f Mon Sep 17 00:00:00 2001
From: Ilia Mirkin
Date: Fri, 6 Nov 2015 17:18:01 -0500
Subject: [PATCH 088/287] nv50/ir: allow movs with TYPE_F64 destinations to be
split
Signed-off-by: Ilia Mirkin
---
src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp
index 19418c0e0f1..ece6ce40643 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp
@@ -555,6 +555,12 @@ BuildUtil::split64BitOpPostRA(Function *fn, Instruction *i,
switch (i->dType) {
case TYPE_U64: hTy = TYPE_U32; break;
case TYPE_S64: hTy = TYPE_S32; break;
+ case TYPE_F64:
+ if (i->op == OP_MOV) {
+ hTy = TYPE_U32;
+ break;
+ }
+ /* fallthrough */
default:
return NULL;
}
From 2437f0085372355980864454964749ac8231ca44 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin
Date: Fri, 6 Nov 2015 17:58:42 -0500
Subject: [PATCH 089/287] nv50/ir: disallow 64-bit immediates on nv50 targets
No instructions are able to load short immediates like nvc0 can.
Signed-off-by: Ilia Mirkin
---
src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
index f3ddcaa5199..94cf0f0e05e 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
@@ -343,7 +343,7 @@ TargetNV50::insnCanLoad(const Instruction *i, int s,
}
if (sf == FILE_IMMEDIATE)
- return true;
+ return ldSize <= 4;
// Check if memory access is encodable:
From 428506ece2c7627392d0f02c7f83021caa46bb4f Mon Sep 17 00:00:00 2001
From: Hans de Goede
Date: Thu, 5 Nov 2015 14:32:36 +0100
Subject: [PATCH 090/287] nv50/ir: Add support for merge-s to the
ConstantFolding pass
This allows later passes like LoadPropagation to properly deal with 64
bit immediates.
If the new 64 bit load this introduces does not get optimized away then
split64BitOpPostRA() will split this into 2 instructions again.
Signed-off-by: Hans de Goede
Reviewed-by: Ilia Mirkin
---
.../drivers/nouveau/codegen/nv50_ir_peephole.cpp | 15 +++++++++++++++
1 file changed, 15 insertions(+)
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index 44f74c61304..8e241f1ebc4 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -447,6 +447,7 @@ ConstantFolding::expr(Instruction *i,
{
struct Storage *const a = &imm0.reg, *const b = &imm1.reg;
struct Storage res;
+ uint8_t fixSrc0Size = 0;
memset(&res.data, 0, sizeof(res.data));
@@ -589,6 +590,18 @@ ConstantFolding::expr(Instruction *i,
// the second argument will not be constant, but that can happen.
res.data.u32 = a->data.u32 + b->data.u32;
break;
+ case OP_MERGE:
+ switch (i->dType) {
+ case TYPE_U64:
+ case TYPE_S64:
+ case TYPE_F64:
+ res.data.u64 = (((uint64_t)b->data.u32) << 32) | a->data.u32;
+ fixSrc0Size = 8;
+ break;
+ default:
+ return;
+ }
+ break;
default:
return;
}
@@ -602,6 +615,8 @@ ConstantFolding::expr(Instruction *i,
i->setSrc(1, NULL);
i->getSrc(0)->reg.data = res.data;
+ if (fixSrc0Size)
+ i->getSrc(0)->reg.size = fixSrc0Size;
switch (i->op) {
case OP_MAD:
From 9f2f8bda6e060cb85f6e099a4ad65c58cde36ba0 Mon Sep 17 00:00:00 2001
From: Hans de Goede
Date: Thu, 5 Nov 2015 14:32:37 +0100
Subject: [PATCH 091/287] nvc0/ir: Teach insnCanLoad about double immediates
Teach insnCanLoad about double immediates. Together with the
"Add support for merge-s to the ConstantFolding pass" patch,
this turns the following (nvc0) code:
1: mov u32 $r2 0x00000000 (8)
2: mov u32 $r3 0x3fe00000 (8)
3: add f64 $r0d $r0d $r2d (8)
Into:
1: add f64 $r0d $r0d 0.500000 (8)
Signed-off-by: Hans de Goede
Reviewed-by: Ilia Mirkin
---
.../nouveau/codegen/nv50_ir_target_nvc0.cpp | 25 ++++++++++++++-----
1 file changed, 19 insertions(+), 6 deletions(-)
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
index 27df0eba66b..8f59d86a72f 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
@@ -338,17 +338,30 @@ TargetNVC0::insnCanLoad(const Instruction *i, int s,
if (sf == FILE_IMMEDIATE) {
Storage &reg = ld->getSrc(0)->asImm()->reg;
- if (typeSizeof(i->sType) > 4)
- return false;
- if (opInfo[i->op].immdBits != 0xffffffff) {
- if (i->sType == TYPE_F32) {
+ if (opInfo[i->op].immdBits != 0xffffffff || typeSizeof(i->sType) > 4) {
+ switch (i->sType) {
+ case TYPE_F64:
+ if (reg.data.u64 & 0x00000fffffffffffULL)
+ return false;
+ break;
+ case TYPE_F32:
if (reg.data.u32 & 0xfff)
return false;
- } else
- if (i->sType == TYPE_S32 || i->sType == TYPE_U32) {
+ break;
+ case TYPE_S32:
+ case TYPE_U32:
// with u32, 0xfffff counts as 0xffffffff as well
if (reg.data.s32 > 0x7ffff || reg.data.s32 < -0x80000)
return false;
+ break;
+ case TYPE_U8:
+ case TYPE_S8:
+ case TYPE_U16:
+ case TYPE_S16:
+ case TYPE_F16:
+ break;
+ default:
+ return false;
}
} else
if (i->op == OP_MAD || i->op == OP_FMA) {
From f979d3cfec2b336801fe59ccd264111f403428f5 Mon Sep 17 00:00:00 2001
From: Hans de Goede
Date: Thu, 5 Nov 2015 14:32:38 +0100
Subject: [PATCH 092/287] nv50/ir: Add support for 64bit immediates to
checkSwapSrc01
Now that we support 64 bit immediates in insnCanLoad, we need to swap
64 bit immediate sources too for optimal effect.
Signed-off-by: Hans de Goede
Reviewed-by: Ilia Mirkin
---
.../drivers/nouveau/codegen/nv50_ir_peephole.cpp | 11 ++++++-----
1 file changed, 6 insertions(+), 5 deletions(-)
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index 8e241f1ebc4..b952c760a21 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -155,7 +155,7 @@ private:
void checkSwapSrc01(Instruction *);
bool isCSpaceLoad(Instruction *);
- bool isImmd32Load(Instruction *);
+ bool isImmdLoad(Instruction *);
bool isAttribOrSharedLoad(Instruction *);
};
@@ -166,9 +166,10 @@ LoadPropagation::isCSpaceLoad(Instruction *ld)
}
bool
-LoadPropagation::isImmd32Load(Instruction *ld)
+LoadPropagation::isImmdLoad(Instruction *ld)
{
- if (!ld || (ld->op != OP_MOV) || (typeSizeof(ld->dType) != 4))
+ if (!ld || (ld->op != OP_MOV) ||
+ ((typeSizeof(ld->dType) != 4) && (typeSizeof(ld->dType) != 8)))
return false;
return ld->src(0).getFile() == FILE_IMMEDIATE;
}
@@ -201,8 +202,8 @@ LoadPropagation::checkSwapSrc01(Instruction *insn)
else
return;
} else
- if (isImmd32Load(i0)) {
- if (!isCSpaceLoad(i1) && !isImmd32Load(i1))
+ if (isImmdLoad(i0)) {
+ if (!isCSpaceLoad(i1) && !isImmdLoad(i1))
insn->swapSources(0, 1);
else
return;
From 76957389fc6952e59c1f0f1cbdf74f6949a7a956 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin
Date: Sun, 22 Feb 2015 19:49:49 -0500
Subject: [PATCH 093/287] nv50/ir: add fp64 opcode emission support for G200
(NVA0)
Need to emulate rcp/rsq before providing full fp64 support
Signed-off-by: Ilia Mirkin
---
.../nouveau/codegen/nv50_ir_emit_nv50.cpp | 94 +++++++++++++++++--
1 file changed, 84 insertions(+), 10 deletions(-)
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
index 9f1e4b803d5..ee115b581b8 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
@@ -96,9 +96,12 @@ private:
void emitUADD(const Instruction *);
void emitAADD(const Instruction *);
void emitFADD(const Instruction *);
+ void emitDADD(const Instruction *);
void emitIMUL(const Instruction *);
void emitFMUL(const Instruction *);
+ void emitDMUL(const Instruction *);
void emitFMAD(const Instruction *);
+ void emitDMAD(const Instruction *);
void emitIMAD(const Instruction *);
void emitISAD(const Instruction *);
@@ -954,11 +957,13 @@ CodeEmitterNV50::emitMINMAX(const Instruction *i)
assert(0);
break;
}
- code[1] |= i->src(0).mod.abs() << 20;
- code[1] |= i->src(0).mod.neg() << 26;
- code[1] |= i->src(1).mod.abs() << 19;
- code[1] |= i->src(1).mod.neg() << 27;
}
+
+ code[1] |= i->src(0).mod.abs() << 20;
+ code[1] |= i->src(0).mod.neg() << 26;
+ code[1] |= i->src(1).mod.abs() << 19;
+ code[1] |= i->src(1).mod.neg() << 27;
+
emitForm_MAD(i);
}
@@ -993,6 +998,26 @@ CodeEmitterNV50::emitFMAD(const Instruction *i)
}
}
+void
+CodeEmitterNV50::emitDMAD(const Instruction *i)
+{
+ const int neg_mul = i->src(0).mod.neg() ^ i->src(1).mod.neg();
+ const int neg_add = i->src(2).mod.neg();
+
+ assert(i->encSize == 8);
+ assert(!i->saturate);
+
+ code[1] = 0x40000000;
+ code[0] = 0xe0000000;
+
+ code[1] |= neg_mul << 26;
+ code[1] |= neg_add << 27;
+
+ roundMode_MAD(i);
+
+ emitForm_MAD(i);
+}
+
void
CodeEmitterNV50::emitFADD(const Instruction *i)
{
@@ -1027,6 +1052,25 @@ CodeEmitterNV50::emitFADD(const Instruction *i)
}
}
+void
+CodeEmitterNV50::emitDADD(const Instruction *i)
+{
+ const int neg0 = i->src(0).mod.neg();
+ const int neg1 = i->src(1).mod.neg() ^ ((i->op == OP_SUB) ? 1 : 0);
+
+ assert(!(i->src(0).mod | i->src(1).mod).abs());
+ assert(!i->saturate);
+ assert(i->encSize == 8);
+
+ code[1] = 0x60000000;
+ code[0] = 0xe0000000;
+
+ emitForm_ADD(i);
+
+ code[1] |= neg0 << 26;
+ code[1] |= neg1 << 27;
+}
+
void
CodeEmitterNV50::emitUADD(const Instruction *i)
{
@@ -1120,6 +1164,25 @@ CodeEmitterNV50::emitFMUL(const Instruction *i)
}
}
+void
+CodeEmitterNV50::emitDMUL(const Instruction *i)
+{
+ const int neg = (i->src(0).mod ^ i->src(1).mod).neg();
+
+ assert(!i->saturate);
+ assert(i->encSize == 8);
+
+ code[1] = 0x80000000;
+ code[0] = 0xe0000000;
+
+ if (neg)
+ code[1] |= 0x08000000;
+
+ roundMode_CVT(i->rnd);
+
+ emitForm_MAD(i);
+}
+
void
CodeEmitterNV50::emitIMAD(const Instruction *i)
{
@@ -1181,9 +1244,11 @@ CodeEmitterNV50::emitSET(const Instruction *i)
code[0] = 0x30000000;
code[1] = 0x60000000;
- emitCondCode(i->asCmp()->setCond, i->sType, 32 + 14);
-
switch (i->sType) {
+ case TYPE_F64:
+ code[0] = 0xe0000000;
+ code[1] = 0xe0000000;
+ break;
case TYPE_F32: code[0] |= 0x80000000; break;
case TYPE_S32: code[1] |= 0x0c000000; break;
case TYPE_U32: code[1] |= 0x04000000; break;
@@ -1193,6 +1258,9 @@ CodeEmitterNV50::emitSET(const Instruction *i)
assert(0);
break;
}
+
+ emitCondCode(i->asCmp()->setCond, i->sType, 32 + 14);
+
if (i->src(0).mod.neg()) code[1] |= 0x04000000;
if (i->src(1).mod.neg()) code[1] |= 0x08000000;
if (i->src(0).mod.abs()) code[1] |= 0x00100000;
@@ -1756,7 +1824,9 @@ CodeEmitterNV50::emitInstruction(Instruction *insn)
break;
case OP_ADD:
case OP_SUB:
- if (isFloatType(insn->dType))
+ if (insn->dType == TYPE_F64)
+ emitDADD(insn);
+ else if (isFloatType(insn->dType))
emitFADD(insn);
else if (insn->getDef(0)->reg.file == FILE_ADDRESS)
emitAADD(insn);
@@ -1764,14 +1834,18 @@ CodeEmitterNV50::emitInstruction(Instruction *insn)
emitUADD(insn);
break;
case OP_MUL:
- if (isFloatType(insn->dType))
+ if (insn->dType == TYPE_F64)
+ emitDMUL(insn);
+ else if (isFloatType(insn->dType))
emitFMUL(insn);
else
emitIMUL(insn);
break;
case OP_MAD:
case OP_FMA:
- if (isFloatType(insn->dType))
+ if (insn->dType == TYPE_F64)
+ emitDMAD(insn);
+ else if (isFloatType(insn->dType))
emitFMAD(insn);
else
emitIMAD(insn);
@@ -1943,7 +2017,7 @@ CodeEmitterNV50::getMinEncodingSize(const Instruction *i) const
{
const Target::OpInfo &info = targ->getOpInfo(i);
- if (info.minEncSize > 4)
+ if (info.minEncSize > 4 || i->dType == TYPE_F64)
return 8;
// check constraints on dst and src operands
From 2f9aaed7499499679d44e47b7a070df237f77683 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin
Date: Fri, 6 Nov 2015 19:13:35 -0500
Subject: [PATCH 094/287] nv50/ir: add support for const-folding OP_CVT with
F64 source/dest
Signed-off-by: Ilia Mirkin
---
.../nouveau/codegen/nv50_ir_build_util.cpp | 12 +++++++
.../nouveau/codegen/nv50_ir_build_util.h | 2 ++
.../nouveau/codegen/nv50_ir_peephole.cpp | 31 +++++++++++++++++++
3 files changed, 45 insertions(+)
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp
index ece6ce40643..dca799dd9b5 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp
@@ -392,12 +392,24 @@ BuildUtil::mkImm(float f)
return mkImm(u.u32);
}
+ImmediateValue *
+BuildUtil::mkImm(double d)
+{
+ return new_ImmediateValue(prog, d);
+}
+
Value *
BuildUtil::loadImm(Value *dst, float f)
{
return mkOp1v(OP_MOV, TYPE_F32, dst ? dst : getScratch(), mkImm(f));
}
+Value *
+BuildUtil::loadImm(Value *dst, double d)
+{
+ return mkOp1v(OP_MOV, TYPE_F64, dst ? dst : getScratch(), mkImm(d));
+}
+
Value *
BuildUtil::loadImm(Value *dst, uint32_t u)
{
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.h
index 0d544581697..8f3bf77949c 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.h
@@ -90,12 +90,14 @@ public:
void mkClobber(DataFile file, uint32_t regMask, int regUnitLog2);
ImmediateValue *mkImm(float);
+ ImmediateValue *mkImm(double);
ImmediateValue *mkImm(uint32_t);
ImmediateValue *mkImm(uint64_t);
ImmediateValue *mkImm(int i) { return mkImm((uint32_t)i); }
Value *loadImm(Value *dst, float);
+ Value *loadImm(Value *dst, double);
Value *loadImm(Value *dst, uint32_t);
Value *loadImm(Value *dst, uint64_t);
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index b952c760a21..f0955978dc8 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -1164,6 +1164,11 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
#define CASE(type, dst, fmin, fmax, imin, imax, umin, umax) \
case type: \
switch (i->sType) { \
+ case TYPE_F64: \
+ res.data.dst = util_iround(i->saturate ? \
+ CLAMP(imm0.reg.data.f64, fmin, fmax) : \
+ imm0.reg.data.f64); \
+ break; \
case TYPE_F32: \
res.data.dst = util_iround(i->saturate ? \
CLAMP(imm0.reg.data.f32, fmin, fmax) : \
@@ -1201,6 +1206,11 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
CASE(TYPE_S32, s32, INT32_MIN, INT32_MAX, INT32_MIN, INT32_MAX, 0, INT32_MAX);
case TYPE_F32:
switch (i->sType) {
+ case TYPE_F64:
+ res.data.f32 = i->saturate ?
+ CLAMP(imm0.reg.data.f64, 0.0f, 1.0f) :
+ imm0.reg.data.f64;
+ break;
case TYPE_F32:
res.data.f32 = i->saturate ?
CLAMP(imm0.reg.data.f32, 0.0f, 1.0f) :
@@ -1215,6 +1225,27 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
}
i->setSrc(0, bld.mkImm(res.data.f32));
break;
+ case TYPE_F64:
+ switch (i->sType) {
+ case TYPE_F64:
+ res.data.f64 = i->saturate ?
+ CLAMP(imm0.reg.data.f64, 0.0f, 1.0f) :
+ imm0.reg.data.f64;
+ break;
+ case TYPE_F32:
+ res.data.f64 = i->saturate ?
+ CLAMP(imm0.reg.data.f32, 0.0f, 1.0f) :
+ imm0.reg.data.f32;
+ break;
+ case TYPE_U16: res.data.f64 = (double) imm0.reg.data.u16; break;
+ case TYPE_U32: res.data.f64 = (double) imm0.reg.data.u32; break;
+ case TYPE_S16: res.data.f64 = (double) imm0.reg.data.s16; break;
+ case TYPE_S32: res.data.f64 = (double) imm0.reg.data.s32; break;
+ default:
+ return;
+ }
+ i->setSrc(0, bld.mkImm(res.data.f64));
+ break;
default:
return;
}
From 393d0c336bc766a123e139ae85383663f81e00d1 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin
Date: Fri, 6 Nov 2015 19:28:29 -0500
Subject: [PATCH 095/287] nv50/ir: properly set the type of the constant
folding result
This removes the hack used for merge, which only covers a fraction of
the cases.
Signed-off-by: Ilia Mirkin
---
src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index f0955978dc8..0f1dcf0dacd 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -448,7 +448,7 @@ ConstantFolding::expr(Instruction *i,
{
struct Storage *const a = &imm0.reg, *const b = &imm1.reg;
struct Storage res;
- uint8_t fixSrc0Size = 0;
+ DataType type = i->dType;
memset(&res.data, 0, sizeof(res.data));
@@ -590,6 +590,7 @@ ConstantFolding::expr(Instruction *i,
// The two arguments to pfetch are logically added together. Normally
// the second argument will not be constant, but that can happen.
res.data.u32 = a->data.u32 + b->data.u32;
+ type = TYPE_U32;
break;
case OP_MERGE:
switch (i->dType) {
@@ -597,7 +598,6 @@ ConstantFolding::expr(Instruction *i,
case TYPE_S64:
case TYPE_F64:
res.data.u64 = (((uint64_t)b->data.u32) << 32) | a->data.u32;
- fixSrc0Size = 8;
break;
default:
return;
@@ -616,8 +616,8 @@ ConstantFolding::expr(Instruction *i,
i->setSrc(1, NULL);
i->getSrc(0)->reg.data = res.data;
- if (fixSrc0Size)
- i->getSrc(0)->reg.size = fixSrc0Size;
+ i->getSrc(0)->reg.type = type;
+ i->getSrc(0)->reg.size = typeSizeof(type);
switch (i->op) {
case OP_MAD:
From 8e9ade7eb3582fc541700ade1d232a329da890b0 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin
Date: Sat, 7 Nov 2015 00:41:05 -0500
Subject: [PATCH 096/287] nv50/ir: allow emission of immediates in imul/imad
ops
Nothing actually uses this yet (due to complications), but the emission
logic is right.
Signed-off-by: Ilia Mirkin
---
.../drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
index ee115b581b8..7e0fb532565 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
@@ -1125,7 +1125,10 @@ CodeEmitterNV50::emitIMUL(const Instruction *i)
if (i->encSize == 8) {
code[1] = (i->sType == TYPE_S16) ? (0x8000 | 0x4000) : 0x0000;
- emitForm_MAD(i);
+ if (i->src(1).getFile() == FILE_IMMEDIATE)
+ emitForm_IMM(i);
+ else
+ emitForm_MAD(i);
} else {
if (i->sType == TYPE_S16)
code[0] |= 0x8100;
@@ -1199,7 +1202,10 @@ CodeEmitterNV50::emitIMAD(const Instruction *i)
code[1] |= neg1 << 27;
code[1] |= neg2 << 26;
- emitForm_MAD(i);
+ if (i->src(1).getFile() == FILE_IMMEDIATE)
+ emitForm_IMM(i);
+ else
+ emitForm_MAD(i);
if (i->flagsSrc >= 0) {
// add with carry from $cX
From c3e527f93d4281ad6e2ca165eaf6ff588e4faefa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?=
Date: Sat, 31 Oct 2015 01:03:42 +0100
Subject: [PATCH 097/287] radeonsi: only enable write confirmation on the last
CP DMA packet
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
This should improve performance for big copies that need to be split.
Reviewed-by: Michel Dänzer
---
src/gallium/drivers/radeonsi/si_cp_dma.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
index d4bd7b28cf3..c5636444e62 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -48,6 +48,7 @@ static void si_emit_cp_dma_copy_buffer(struct si_context *sctx,
{
struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? S_411_CP_SYNC(1) : 0;
+ uint32_t wr_confirm = !(flags & R600_CP_DMA_SYNC) ? S_414_DISABLE_WR_CONFIRM(1) : 0;
uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? S_414_RAW_WAIT(1) : 0;
uint32_t sel = flags & CIK_CP_DMA_USE_L2 ?
S_411_SRC_SEL(V_411_SRC_ADDR_TC_L2) |
@@ -70,7 +71,7 @@ static void si_emit_cp_dma_copy_buffer(struct si_context *sctx,
radeon_emit(cs, sync_flag | ((src_va >> 32) & 0xffff)); /* CP_SYNC [31] | SRC_ADDR_HI [15:0] */
radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */
radeon_emit(cs, (dst_va >> 32) & 0xffff); /* DST_ADDR_HI [15:0] */
- radeon_emit(cs, size | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */
+ radeon_emit(cs, size | wr_confirm | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */
}
}
@@ -81,6 +82,7 @@ static void si_emit_cp_dma_clear_buffer(struct si_context *sctx,
{
struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? S_411_CP_SYNC(1) : 0;
+ uint32_t wr_confirm = !(flags & R600_CP_DMA_SYNC) ? S_414_DISABLE_WR_CONFIRM(1) : 0;
uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? S_414_RAW_WAIT(1) : 0;
uint32_t dst_sel = flags & CIK_CP_DMA_USE_L2 ? S_411_DSL_SEL(V_411_DST_ADDR_TC_L2) : 0;
@@ -101,7 +103,7 @@ static void si_emit_cp_dma_clear_buffer(struct si_context *sctx,
radeon_emit(cs, sync_flag | S_411_SRC_SEL(V_411_DATA)); /* CP_SYNC [31] | SRC_SEL[30:29] */
radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */
radeon_emit(cs, (dst_va >> 32) & 0xffff); /* DST_ADDR_HI [15:0] */
- radeon_emit(cs, size | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */
+ radeon_emit(cs, size | wr_confirm | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */
}
}
From 89da3b4458762a76de2774118bbb53953f01c562 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?=
Date: Sat, 31 Oct 2015 01:21:01 +0100
Subject: [PATCH 098/287] radeonsi: unify CP DMA code determining various flags
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
v2: don't call get_flush_flags twice per function
Reviewed-by: Michel Dänzer
---
src/gallium/drivers/radeonsi/si_cp_dma.c | 51 +++++++++++-------------
1 file changed, 23 insertions(+), 28 deletions(-)
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
index c5636444e62..993fb44328c 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -107,6 +107,21 @@ static void si_emit_cp_dma_clear_buffer(struct si_context *sctx,
}
}
+static unsigned get_flush_flags(struct si_context *sctx, bool is_framebuffer)
+{
+ if (is_framebuffer)
+ return SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
+
+ return SI_CONTEXT_INV_TC_L1 |
+ (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) |
+ SI_CONTEXT_INV_KCACHE;
+}
+
+static unsigned get_tc_l2_flag(struct si_context *sctx, bool is_framebuffer)
+{
+ return is_framebuffer || sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2;
+}
+
/* The max number of bytes to copy per packet. */
#define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - 8)
@@ -115,7 +130,8 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
bool is_framebuffer)
{
struct si_context *sctx = (struct si_context*)ctx;
- unsigned flush_flags, tc_l2_flag;
+ unsigned tc_l2_flag = get_tc_l2_flag(sctx, is_framebuffer);
+ unsigned flush_flags = get_flush_flags(sctx, is_framebuffer);
if (!size)
return;
@@ -139,19 +155,8 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
uint64_t va = r600_resource(dst)->gpu_address + offset;
- /* Flush the caches where the resource is bound. */
- if (is_framebuffer) {
- flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
- tc_l2_flag = 0;
- } else {
- flush_flags = SI_CONTEXT_INV_TC_L1 |
- (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) |
- SI_CONTEXT_INV_KCACHE;
- tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2;
- }
-
- sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
- flush_flags;
+ /* Flush the caches. */
+ sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags;
while (size) {
unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
@@ -195,7 +200,8 @@ void si_copy_buffer(struct si_context *sctx,
uint64_t dst_offset, uint64_t src_offset, unsigned size,
bool is_framebuffer)
{
- unsigned flush_flags, tc_l2_flag;
+ unsigned tc_l2_flag = get_tc_l2_flag(sctx, is_framebuffer);
+ unsigned flush_flags = get_flush_flags(sctx, is_framebuffer);
if (!size)
return;
@@ -209,19 +215,8 @@ void si_copy_buffer(struct si_context *sctx,
dst_offset += r600_resource(dst)->gpu_address;
src_offset += r600_resource(src)->gpu_address;
- /* Flush the caches where the resource is bound. */
- if (is_framebuffer) {
- flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
- tc_l2_flag = 0;
- } else {
- flush_flags = SI_CONTEXT_INV_TC_L1 |
- (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) |
- SI_CONTEXT_INV_KCACHE;
- tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2;
- }
-
- sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
- flush_flags;
+ /* Flush the caches. */
+ sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags;
while (size) {
unsigned sync_flags = tc_l2_flag;
From fc0416ef5d7775b00f13a5fa83620abb7b1669a0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?=
Date: Sat, 31 Oct 2015 01:33:42 +0100
Subject: [PATCH 099/287] radeonsi: unify CP DMA preparation logic
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Reviewed-by: Michel Dänzer
---
src/gallium/drivers/radeonsi/si_cp_dma.c | 71 ++++++++++++------------
1 file changed, 34 insertions(+), 37 deletions(-)
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
index 993fb44328c..2e39a24071b 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -122,6 +122,36 @@ static unsigned get_tc_l2_flag(struct si_context *sctx, bool is_framebuffer)
return is_framebuffer || sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2;
}
+static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst,
+ struct pipe_resource *src, unsigned byte_count,
+ unsigned remaining_size, unsigned *flags)
+{
+ si_need_cs_space(sctx);
+
+ /* This must be done after need_cs_space. */
+ radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+ (struct r600_resource*)dst,
+ RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA);
+ if (src)
+ radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+ (struct r600_resource*)src,
+ RADEON_USAGE_READ, RADEON_PRIO_CP_DMA);
+
+ /* Flush the caches for the first copy only.
+ * Also wait for the previous CP DMA operations.
+ */
+ if (sctx->b.flags) {
+ si_emit_cache_flush(sctx, NULL);
+ *flags |= SI_CP_DMA_RAW_WAIT;
+ }
+
+ /* Do the synchronization after the last DMA packet, so that all data
+ * is written to memory.
+ */
+ if (byte_count == remaining_size)
+ *flags |= R600_CP_DMA_SYNC;
+}
+
/* The max number of bytes to copy per packet. */
#define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - 8)
@@ -162,23 +192,7 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
unsigned dma_flags = tc_l2_flag;
- si_need_cs_space(sctx);
-
- /* This must be done after need_cs_space. */
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
- (struct r600_resource*)dst, RADEON_USAGE_WRITE,
- RADEON_PRIO_CP_DMA);
-
- /* Flush the caches for the first copy only.
- * Also wait for the previous CP DMA operations. */
- if (sctx->b.flags) {
- si_emit_cache_flush(sctx, NULL);
- dma_flags |= SI_CP_DMA_RAW_WAIT; /* same as WAIT_UNTIL=CP_DMA_IDLE */
- }
-
- /* Do the synchronization after the last copy, so that all data is written to memory. */
- if (size == byte_count)
- dma_flags |= R600_CP_DMA_SYNC;
+ si_cp_dma_prepare(sctx, dst, NULL, byte_count, size, &dma_flags);
/* Emit the clear packet. */
si_emit_cp_dma_clear_buffer(sctx, va, byte_count, value, dma_flags);
@@ -219,29 +233,12 @@ void si_copy_buffer(struct si_context *sctx,
sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags;
while (size) {
- unsigned sync_flags = tc_l2_flag;
+ unsigned dma_flags = tc_l2_flag;
unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
- si_need_cs_space(sctx);
+ si_cp_dma_prepare(sctx, dst, src, byte_count, size, &dma_flags);
- /* Flush the caches for the first copy only. Also wait for old CP DMA packets to complete. */
- if (sctx->b.flags) {
- si_emit_cache_flush(sctx, NULL);
- sync_flags |= SI_CP_DMA_RAW_WAIT;
- }
-
- /* Do the synchronization after the last copy, so that all data is written to memory. */
- if (size == byte_count) {
- sync_flags |= R600_CP_DMA_SYNC;
- }
-
- /* This must be done after r600_need_cs_space. */
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, (struct r600_resource*)src,
- RADEON_USAGE_READ, RADEON_PRIO_CP_DMA);
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, (struct r600_resource*)dst,
- RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA);
-
- si_emit_cp_dma_copy_buffer(sctx, dst_offset, src_offset, byte_count, sync_flags);
+ si_emit_cp_dma_copy_buffer(sctx, dst_offset, src_offset, byte_count, dma_flags);
size -= byte_count;
src_offset += byte_count;
From 2658777f468e8c0d71669a043ff7401672717622 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?=
Date: Sun, 1 Nov 2015 13:43:26 +0100
Subject: [PATCH 100/287] radeonsi: add workarounds for CP DMA to stay on the
fast path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
v2: set emit_scratch_reloc, add a NULL check
Reviewed-by: Michel Dänzer
---
src/gallium/drivers/radeonsi/si_cp_dma.c | 93 ++++++++++++++++++++++--
1 file changed, 88 insertions(+), 5 deletions(-)
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
index 2e39a24071b..418b2cf65c5 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -152,8 +152,10 @@ static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst
*flags |= R600_CP_DMA_SYNC;
}
+/* Alignment for optimal performance. */
+#define CP_DMA_ALIGNMENT 32
/* The max number of bytes to copy per packet. */
-#define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - 8)
+#define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - CP_DMA_ALIGNMENT)
static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
unsigned offset, unsigned size, unsigned value,
@@ -209,11 +211,51 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
r600_resource(dst)->TC_L2_dirty = true;
}
+/**
+ * Realign the CP DMA engine. This must be done after a copy with an unaligned
+ * size.
+ *
+ * \param size Bytes to dummy-copy to reach the next CP_DMA_ALIGNMENT boundary.
+ */
+static void si_cp_dma_realign_engine(struct si_context *sctx, unsigned size)
+{
+ uint64_t va;
+ unsigned dma_flags = 0;
+ unsigned scratch_size = CP_DMA_ALIGNMENT * 2;
+
+ assert(size < CP_DMA_ALIGNMENT);
+
+ /* Use the scratch buffer as the dummy buffer. The 3D engine should be
+ * idle at this point.
+ */
+ if (!sctx->scratch_buffer ||
+ sctx->scratch_buffer->b.b.width0 < scratch_size) {
+ r600_resource_reference(&sctx->scratch_buffer, NULL);
+ sctx->scratch_buffer =
+ si_resource_create_custom(&sctx->screen->b.b,
+ PIPE_USAGE_DEFAULT,
+ scratch_size);
+ if (!sctx->scratch_buffer)
+ return;
+ sctx->emit_scratch_reloc = true;
+ }
+
+ si_cp_dma_prepare(sctx, &sctx->scratch_buffer->b.b,
+ &sctx->scratch_buffer->b.b, size, size, &dma_flags);
+
+ va = sctx->scratch_buffer->gpu_address;
+ si_emit_cp_dma_copy_buffer(sctx, va, va + CP_DMA_ALIGNMENT, size,
+ dma_flags);
+}
+
void si_copy_buffer(struct si_context *sctx,
struct pipe_resource *dst, struct pipe_resource *src,
uint64_t dst_offset, uint64_t src_offset, unsigned size,
bool is_framebuffer)
{
+ uint64_t main_dst_offset, main_src_offset;
+ unsigned skipped_size = 0;
+ unsigned realign_size = 0;
unsigned tc_l2_flag = get_tc_l2_flag(sctx, is_framebuffer);
unsigned flush_flags = get_flush_flags(sctx, is_framebuffer);
@@ -229,22 +271,63 @@ void si_copy_buffer(struct si_context *sctx,
dst_offset += r600_resource(dst)->gpu_address;
src_offset += r600_resource(src)->gpu_address;
+ /* If the size is not aligned, we must add a dummy copy at the end
+ * just to align the internal counter. Otherwise, the DMA engine
+ * would slow down by an order of magnitude for following copies.
+ */
+ if (size % CP_DMA_ALIGNMENT)
+ realign_size = CP_DMA_ALIGNMENT - (size % CP_DMA_ALIGNMENT);
+
+ /* If the copy begins unaligned, we must start copying from the next
+ * aligned block and the skipped part should be copied after everything
+ * else has been copied. Only the src alignment matters, not dst.
+ */
+ if (src_offset % CP_DMA_ALIGNMENT) {
+ skipped_size = CP_DMA_ALIGNMENT - (src_offset % CP_DMA_ALIGNMENT);
+ /* The main part will be skipped if the size is too small. */
+ skipped_size = MIN2(skipped_size, size);
+ size -= skipped_size;
+ }
+
/* Flush the caches. */
sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags;
+ /* This is the main part doing the copying. Src is always aligned. */
+ main_dst_offset = dst_offset + skipped_size;
+ main_src_offset = src_offset + skipped_size;
+
while (size) {
unsigned dma_flags = tc_l2_flag;
unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
- si_cp_dma_prepare(sctx, dst, src, byte_count, size, &dma_flags);
+ si_cp_dma_prepare(sctx, dst, src, byte_count,
+ size + skipped_size + realign_size,
+ &dma_flags);
- si_emit_cp_dma_copy_buffer(sctx, dst_offset, src_offset, byte_count, dma_flags);
+ si_emit_cp_dma_copy_buffer(sctx, main_dst_offset, main_src_offset,
+ byte_count, dma_flags);
size -= byte_count;
- src_offset += byte_count;
- dst_offset += byte_count;
+ main_src_offset += byte_count;
+ main_dst_offset += byte_count;
}
+ /* Copy the part we skipped because src wasn't aligned. */
+ if (skipped_size) {
+ unsigned dma_flags = tc_l2_flag;
+
+ si_cp_dma_prepare(sctx, dst, src, skipped_size,
+ skipped_size + realign_size,
+ &dma_flags);
+
+ si_emit_cp_dma_copy_buffer(sctx, dst_offset, src_offset,
+ skipped_size, dma_flags);
+ }
+
+ /* Finally, realign the engine if the size wasn't aligned. */
+ if (realign_size)
+ si_cp_dma_realign_engine(sctx, realign_size);
+
/* Flush the caches again in case the 3D engine has been prefetching
* the resource. */
sctx->b.flags |= flush_flags;
From d57ede92b7832f01df2aa5755c8c34b4de4866d4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?=
Date: Tue, 3 Nov 2015 12:20:18 +0100
Subject: [PATCH 101/287] radeonsi: add register definitions for Stoney
There are a few non-Stoney changes too.
Reviewed-by: Alex Deucher
---
src/gallium/drivers/radeonsi/sid.h | 322 +++++++++++++++++++++++++++++
1 file changed, 322 insertions(+)
diff --git a/src/gallium/drivers/radeonsi/sid.h b/src/gallium/drivers/radeonsi/sid.h
index 4bb24572b90..0c48340beef 100644
--- a/src/gallium/drivers/radeonsi/sid.h
+++ b/src/gallium/drivers/radeonsi/sid.h
@@ -3608,6 +3608,9 @@
#define S_00B854_WAVES_PER_SH(x) (((x) & 0x3F) << 0) /* mask is 0x3FF on CIK */
#define G_00B854_WAVES_PER_SH(x) (((x) >> 0) & 0x3F) /* mask is 0x3FF on CIK */
#define C_00B854_WAVES_PER_SH 0xFFFFFFC0 /* mask is 0x3FF on CIK */
+#define S_00B854_WAVES_PER_SH_CIK(x) (((x) & 0x3FF) << 0)
+#define G_00B854_WAVES_PER_SH_CIK(x) (((x) >> 0) & 0x3FF)
+#define C_00B854_WAVES_PER_SH_CIK 0xFFFFFC00
#define S_00B854_TG_PER_CU(x) (((x) & 0x0F) << 12)
#define G_00B854_TG_PER_CU(x) (((x) >> 12) & 0x0F)
#define C_00B854_TG_PER_CU 0xFFFF0FFF
@@ -5211,6 +5214,296 @@
#define V_028714_SPI_SHADER_UINT16_ABGR 0x07
#define V_028714_SPI_SHADER_SINT16_ABGR 0x08
#define V_028714_SPI_SHADER_32_ABGR 0x09
+/* Stoney */
+#define R_028754_SX_PS_DOWNCONVERT 0x028754
+#define S_028754_MRT0(x) (((x) & 0x0F) << 0)
+#define G_028754_MRT0(x) (((x) >> 0) & 0x0F)
+#define C_028754_MRT0 0xFFFFFFF0
+#define V_028754_SX_RT_EXPORT_NO_CONVERSION 0
+#define V_028754_SX_RT_EXPORT_32_R 1
+#define V_028754_SX_RT_EXPORT_32_A 2
+#define V_028754_SX_RT_EXPORT_10_11_11 3
+#define V_028754_SX_RT_EXPORT_2_10_10_10 4
+#define V_028754_SX_RT_EXPORT_8_8_8_8 5
+#define V_028754_SX_RT_EXPORT_5_6_5 6
+#define V_028754_SX_RT_EXPORT_1_5_5_5 7
+#define V_028754_SX_RT_EXPORT_4_4_4_4 8
+#define V_028754_SX_RT_EXPORT_16_16_GR 9
+#define V_028754_SX_RT_EXPORT_16_16_AR 10
+#define S_028754_MRT1(x) (((x) & 0x0F) << 4)
+#define G_028754_MRT1(x) (((x) >> 4) & 0x0F)
+#define C_028754_MRT1 0xFFFFFF0F
+#define S_028754_MRT2(x) (((x) & 0x0F) << 8)
+#define G_028754_MRT2(x) (((x) >> 8) & 0x0F)
+#define C_028754_MRT2 0xFFFFF0FF
+#define S_028754_MRT3(x) (((x) & 0x0F) << 12)
+#define G_028754_MRT3(x) (((x) >> 12) & 0x0F)
+#define C_028754_MRT3 0xFFFF0FFF
+#define S_028754_MRT4(x) (((x) & 0x0F) << 16)
+#define G_028754_MRT4(x) (((x) >> 16) & 0x0F)
+#define C_028754_MRT4 0xFFF0FFFF
+#define S_028754_MRT5(x) (((x) & 0x0F) << 20)
+#define G_028754_MRT5(x) (((x) >> 20) & 0x0F)
+#define C_028754_MRT5 0xFF0FFFFF
+#define S_028754_MRT6(x) (((x) & 0x0F) << 24)
+#define G_028754_MRT6(x) (((x) >> 24) & 0x0F)
+#define C_028754_MRT6 0xF0FFFFFF
+#define S_028754_MRT7(x) (((x) & 0x0F) << 28)
+#define G_028754_MRT7(x) (((x) >> 28) & 0x0F)
+#define C_028754_MRT7 0x0FFFFFFF
+#define R_028758_SX_BLEND_OPT_EPSILON 0x028758
+#define S_028758_MRT0_EPSILON(x) (((x) & 0x0F) << 0)
+#define G_028758_MRT0_EPSILON(x) (((x) >> 0) & 0x0F)
+#define C_028758_MRT0_EPSILON 0xFFFFFFF0
+#define V_028758_EXACT 0
+#define V_028758_11BIT_FORMAT 1
+#define V_028758_10BIT_FORMAT 3
+#define V_028758_8BIT_FORMAT 7
+#define V_028758_6BIT_FORMAT 11
+#define V_028758_5BIT_FORMAT 13
+#define V_028758_4BIT_FORMAT 15
+#define S_028758_MRT1_EPSILON(x) (((x) & 0x0F) << 4)
+#define G_028758_MRT1_EPSILON(x) (((x) >> 4) & 0x0F)
+#define C_028758_MRT1_EPSILON 0xFFFFFF0F
+#define S_028758_MRT2_EPSILON(x) (((x) & 0x0F) << 8)
+#define G_028758_MRT2_EPSILON(x) (((x) >> 8) & 0x0F)
+#define C_028758_MRT2_EPSILON 0xFFFFF0FF
+#define S_028758_MRT3_EPSILON(x) (((x) & 0x0F) << 12)
+#define G_028758_MRT3_EPSILON(x) (((x) >> 12) & 0x0F)
+#define C_028758_MRT3_EPSILON 0xFFFF0FFF
+#define S_028758_MRT4_EPSILON(x) (((x) & 0x0F) << 16)
+#define G_028758_MRT4_EPSILON(x) (((x) >> 16) & 0x0F)
+#define C_028758_MRT4_EPSILON 0xFFF0FFFF
+#define S_028758_MRT5_EPSILON(x) (((x) & 0x0F) << 20)
+#define G_028758_MRT5_EPSILON(x) (((x) >> 20) & 0x0F)
+#define C_028758_MRT5_EPSILON 0xFF0FFFFF
+#define S_028758_MRT6_EPSILON(x) (((x) & 0x0F) << 24)
+#define G_028758_MRT6_EPSILON(x) (((x) >> 24) & 0x0F)
+#define C_028758_MRT6_EPSILON 0xF0FFFFFF
+#define S_028758_MRT7_EPSILON(x) (((x) & 0x0F) << 28)
+#define G_028758_MRT7_EPSILON(x) (((x) >> 28) & 0x0F)
+#define C_028758_MRT7_EPSILON 0x0FFFFFFF
+#define R_02875C_SX_BLEND_OPT_CONTROL 0x02875C
+#define S_02875C_MRT0_COLOR_OPT_DISABLE(x) (((x) & 0x1) << 0)
+#define G_02875C_MRT0_COLOR_OPT_DISABLE(x) (((x) >> 0) & 0x1)
+#define C_02875C_MRT0_COLOR_OPT_DISABLE 0xFFFFFFFE
+#define S_02875C_MRT0_ALPHA_OPT_DISABLE(x) (((x) & 0x1) << 1)
+#define G_02875C_MRT0_ALPHA_OPT_DISABLE(x) (((x) >> 1) & 0x1)
+#define C_02875C_MRT0_ALPHA_OPT_DISABLE 0xFFFFFFFD
+#define S_02875C_MRT1_COLOR_OPT_DISABLE(x) (((x) & 0x1) << 4)
+#define G_02875C_MRT1_COLOR_OPT_DISABLE(x) (((x) >> 4) & 0x1)
+#define C_02875C_MRT1_COLOR_OPT_DISABLE 0xFFFFFFEF
+#define S_02875C_MRT1_ALPHA_OPT_DISABLE(x) (((x) & 0x1) << 5)
+#define G_02875C_MRT1_ALPHA_OPT_DISABLE(x) (((x) >> 5) & 0x1)
+#define C_02875C_MRT1_ALPHA_OPT_DISABLE 0xFFFFFFDF
+#define S_02875C_MRT2_COLOR_OPT_DISABLE(x) (((x) & 0x1) << 8)
+#define G_02875C_MRT2_COLOR_OPT_DISABLE(x) (((x) >> 8) & 0x1)
+#define C_02875C_MRT2_COLOR_OPT_DISABLE 0xFFFFFEFF
+#define S_02875C_MRT2_ALPHA_OPT_DISABLE(x) (((x) & 0x1) << 9)
+#define G_02875C_MRT2_ALPHA_OPT_DISABLE(x) (((x) >> 9) & 0x1)
+#define C_02875C_MRT2_ALPHA_OPT_DISABLE 0xFFFFFDFF
+#define S_02875C_MRT3_COLOR_OPT_DISABLE(x) (((x) & 0x1) << 12)
+#define G_02875C_MRT3_COLOR_OPT_DISABLE(x) (((x) >> 12) & 0x1)
+#define C_02875C_MRT3_COLOR_OPT_DISABLE 0xFFFFEFFF
+#define S_02875C_MRT3_ALPHA_OPT_DISABLE(x) (((x) & 0x1) << 13)
+#define G_02875C_MRT3_ALPHA_OPT_DISABLE(x) (((x) >> 13) & 0x1)
+#define C_02875C_MRT3_ALPHA_OPT_DISABLE 0xFFFFDFFF
+#define S_02875C_MRT4_COLOR_OPT_DISABLE(x) (((x) & 0x1) << 16)
+#define G_02875C_MRT4_COLOR_OPT_DISABLE(x) (((x) >> 16) & 0x1)
+#define C_02875C_MRT4_COLOR_OPT_DISABLE 0xFFFEFFFF
+#define S_02875C_MRT4_ALPHA_OPT_DISABLE(x) (((x) & 0x1) << 17)
+#define G_02875C_MRT4_ALPHA_OPT_DISABLE(x) (((x) >> 17) & 0x1)
+#define C_02875C_MRT4_ALPHA_OPT_DISABLE 0xFFFDFFFF
+#define S_02875C_MRT5_COLOR_OPT_DISABLE(x) (((x) & 0x1) << 20)
+#define G_02875C_MRT5_COLOR_OPT_DISABLE(x) (((x) >> 20) & 0x1)
+#define C_02875C_MRT5_COLOR_OPT_DISABLE 0xFFEFFFFF
+#define S_02875C_MRT5_ALPHA_OPT_DISABLE(x) (((x) & 0x1) << 21)
+#define G_02875C_MRT5_ALPHA_OPT_DISABLE(x) (((x) >> 21) & 0x1)
+#define C_02875C_MRT5_ALPHA_OPT_DISABLE 0xFFDFFFFF
+#define S_02875C_MRT6_COLOR_OPT_DISABLE(x) (((x) & 0x1) << 24)
+#define G_02875C_MRT6_COLOR_OPT_DISABLE(x) (((x) >> 24) & 0x1)
+#define C_02875C_MRT6_COLOR_OPT_DISABLE 0xFEFFFFFF
+#define S_02875C_MRT6_ALPHA_OPT_DISABLE(x) (((x) & 0x1) << 25)
+#define G_02875C_MRT6_ALPHA_OPT_DISABLE(x) (((x) >> 25) & 0x1)
+#define C_02875C_MRT6_ALPHA_OPT_DISABLE 0xFDFFFFFF
+#define S_02875C_MRT7_COLOR_OPT_DISABLE(x) (((x) & 0x1) << 28)
+#define G_02875C_MRT7_COLOR_OPT_DISABLE(x) (((x) >> 28) & 0x1)
+#define C_02875C_MRT7_COLOR_OPT_DISABLE 0xEFFFFFFF
+#define S_02875C_MRT7_ALPHA_OPT_DISABLE(x) (((x) & 0x1) << 29)
+#define G_02875C_MRT7_ALPHA_OPT_DISABLE(x) (((x) >> 29) & 0x1)
+#define C_02875C_MRT7_ALPHA_OPT_DISABLE 0xDFFFFFFF
+#define S_02875C_PIXEN_ZERO_OPT_DISABLE(x) (((x) & 0x1) << 31)
+#define G_02875C_PIXEN_ZERO_OPT_DISABLE(x) (((x) >> 31) & 0x1)
+#define C_02875C_PIXEN_ZERO_OPT_DISABLE 0x7FFFFFFF
+#define R_028760_SX_MRT0_BLEND_OPT 0x028760
+#define S_028760_COLOR_SRC_OPT(x) (((x) & 0x07) << 0)
+#define G_028760_COLOR_SRC_OPT(x) (((x) >> 0) & 0x07)
+#define C_028760_COLOR_SRC_OPT 0xFFFFFFF8
+#define V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_ALL 0
+#define V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE 1
+#define V_028760_BLEND_OPT_PRESERVE_C1_IGNORE_C0 2
+#define V_028760_BLEND_OPT_PRESERVE_C0_IGNORE_C1 3
+#define V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0 4
+#define V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1 5
+#define V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0 6
+#define V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE 7
+#define S_028760_COLOR_DST_OPT(x) (((x) & 0x07) << 4)
+#define G_028760_COLOR_DST_OPT(x) (((x) >> 4) & 0x07)
+#define C_028760_COLOR_DST_OPT 0xFFFFFF8F
+#define S_028760_COLOR_COMB_FCN(x) (((x) & 0x07) << 8)
+#define G_028760_COLOR_COMB_FCN(x) (((x) >> 8) & 0x07)
+#define C_028760_COLOR_COMB_FCN 0xFFFFF8FF
+#define V_028760_OPT_COMB_NONE 0
+#define V_028760_OPT_COMB_ADD 1
+#define V_028760_OPT_COMB_SUBTRACT 2
+#define V_028760_OPT_COMB_MIN 3
+#define V_028760_OPT_COMB_MAX 4
+#define V_028760_OPT_COMB_REVSUBTRACT 5
+#define V_028760_OPT_COMB_BLEND_DISABLED 6
+#define V_028760_OPT_COMB_SAFE_ADD 7
+#define S_028760_ALPHA_SRC_OPT(x) (((x) & 0x07) << 16)
+#define G_028760_ALPHA_SRC_OPT(x) (((x) >> 16) & 0x07)
+#define C_028760_ALPHA_SRC_OPT 0xFFF8FFFF
+#define S_028760_ALPHA_DST_OPT(x) (((x) & 0x07) << 20)
+#define G_028760_ALPHA_DST_OPT(x) (((x) >> 20) & 0x07)
+#define C_028760_ALPHA_DST_OPT 0xFF8FFFFF
+#define S_028760_ALPHA_COMB_FCN(x) (((x) & 0x07) << 24)
+#define G_028760_ALPHA_COMB_FCN(x) (((x) >> 24) & 0x07)
+#define C_028760_ALPHA_COMB_FCN 0xF8FFFFFF
+#define R_028764_SX_MRT1_BLEND_OPT 0x028764
+#define S_028764_COLOR_SRC_OPT(x) (((x) & 0x07) << 0)
+#define G_028764_COLOR_SRC_OPT(x) (((x) >> 0) & 0x07)
+#define C_028764_COLOR_SRC_OPT 0xFFFFFFF8
+#define S_028764_COLOR_DST_OPT(x) (((x) & 0x07) << 4)
+#define G_028764_COLOR_DST_OPT(x) (((x) >> 4) & 0x07)
+#define C_028764_COLOR_DST_OPT 0xFFFFFF8F
+#define S_028764_COLOR_COMB_FCN(x) (((x) & 0x07) << 8)
+#define G_028764_COLOR_COMB_FCN(x) (((x) >> 8) & 0x07)
+#define C_028764_COLOR_COMB_FCN 0xFFFFF8FF
+#define S_028764_ALPHA_SRC_OPT(x) (((x) & 0x07) << 16)
+#define G_028764_ALPHA_SRC_OPT(x) (((x) >> 16) & 0x07)
+#define C_028764_ALPHA_SRC_OPT 0xFFF8FFFF
+#define S_028764_ALPHA_DST_OPT(x) (((x) & 0x07) << 20)
+#define G_028764_ALPHA_DST_OPT(x) (((x) >> 20) & 0x07)
+#define C_028764_ALPHA_DST_OPT 0xFF8FFFFF
+#define S_028764_ALPHA_COMB_FCN(x) (((x) & 0x07) << 24)
+#define G_028764_ALPHA_COMB_FCN(x) (((x) >> 24) & 0x07)
+#define C_028764_ALPHA_COMB_FCN 0xF8FFFFFF
+#define R_028768_SX_MRT2_BLEND_OPT 0x028768
+#define S_028768_COLOR_SRC_OPT(x) (((x) & 0x07) << 0)
+#define G_028768_COLOR_SRC_OPT(x) (((x) >> 0) & 0x07)
+#define C_028768_COLOR_SRC_OPT 0xFFFFFFF8
+#define S_028768_COLOR_DST_OPT(x) (((x) & 0x07) << 4)
+#define G_028768_COLOR_DST_OPT(x) (((x) >> 4) & 0x07)
+#define C_028768_COLOR_DST_OPT 0xFFFFFF8F
+#define S_028768_COLOR_COMB_FCN(x) (((x) & 0x07) << 8)
+#define G_028768_COLOR_COMB_FCN(x) (((x) >> 8) & 0x07)
+#define C_028768_COLOR_COMB_FCN 0xFFFFF8FF
+#define S_028768_ALPHA_SRC_OPT(x) (((x) & 0x07) << 16)
+#define G_028768_ALPHA_SRC_OPT(x) (((x) >> 16) & 0x07)
+#define C_028768_ALPHA_SRC_OPT 0xFFF8FFFF
+#define S_028768_ALPHA_DST_OPT(x) (((x) & 0x07) << 20)
+#define G_028768_ALPHA_DST_OPT(x) (((x) >> 20) & 0x07)
+#define C_028768_ALPHA_DST_OPT 0xFF8FFFFF
+#define S_028768_ALPHA_COMB_FCN(x) (((x) & 0x07) << 24)
+#define G_028768_ALPHA_COMB_FCN(x) (((x) >> 24) & 0x07)
+#define C_028768_ALPHA_COMB_FCN 0xF8FFFFFF
+#define R_02876C_SX_MRT3_BLEND_OPT 0x02876C
+#define S_02876C_COLOR_SRC_OPT(x) (((x) & 0x07) << 0)
+#define G_02876C_COLOR_SRC_OPT(x) (((x) >> 0) & 0x07)
+#define C_02876C_COLOR_SRC_OPT 0xFFFFFFF8
+#define S_02876C_COLOR_DST_OPT(x) (((x) & 0x07) << 4)
+#define G_02876C_COLOR_DST_OPT(x) (((x) >> 4) & 0x07)
+#define C_02876C_COLOR_DST_OPT 0xFFFFFF8F
+#define S_02876C_COLOR_COMB_FCN(x) (((x) & 0x07) << 8)
+#define G_02876C_COLOR_COMB_FCN(x) (((x) >> 8) & 0x07)
+#define C_02876C_COLOR_COMB_FCN 0xFFFFF8FF
+#define S_02876C_ALPHA_SRC_OPT(x) (((x) & 0x07) << 16)
+#define G_02876C_ALPHA_SRC_OPT(x) (((x) >> 16) & 0x07)
+#define C_02876C_ALPHA_SRC_OPT 0xFFF8FFFF
+#define S_02876C_ALPHA_DST_OPT(x) (((x) & 0x07) << 20)
+#define G_02876C_ALPHA_DST_OPT(x) (((x) >> 20) & 0x07)
+#define C_02876C_ALPHA_DST_OPT 0xFF8FFFFF
+#define S_02876C_ALPHA_COMB_FCN(x) (((x) & 0x07) << 24)
+#define G_02876C_ALPHA_COMB_FCN(x) (((x) >> 24) & 0x07)
+#define C_02876C_ALPHA_COMB_FCN 0xF8FFFFFF
+#define R_028770_SX_MRT4_BLEND_OPT 0x028770
+#define S_028770_COLOR_SRC_OPT(x) (((x) & 0x07) << 0)
+#define G_028770_COLOR_SRC_OPT(x) (((x) >> 0) & 0x07)
+#define C_028770_COLOR_SRC_OPT 0xFFFFFFF8
+#define S_028770_COLOR_DST_OPT(x) (((x) & 0x07) << 4)
+#define G_028770_COLOR_DST_OPT(x) (((x) >> 4) & 0x07)
+#define C_028770_COLOR_DST_OPT 0xFFFFFF8F
+#define S_028770_COLOR_COMB_FCN(x) (((x) & 0x07) << 8)
+#define G_028770_COLOR_COMB_FCN(x) (((x) >> 8) & 0x07)
+#define C_028770_COLOR_COMB_FCN 0xFFFFF8FF
+#define S_028770_ALPHA_SRC_OPT(x) (((x) & 0x07) << 16)
+#define G_028770_ALPHA_SRC_OPT(x) (((x) >> 16) & 0x07)
+#define C_028770_ALPHA_SRC_OPT 0xFFF8FFFF
+#define S_028770_ALPHA_DST_OPT(x) (((x) & 0x07) << 20)
+#define G_028770_ALPHA_DST_OPT(x) (((x) >> 20) & 0x07)
+#define C_028770_ALPHA_DST_OPT 0xFF8FFFFF
+#define S_028770_ALPHA_COMB_FCN(x) (((x) & 0x07) << 24)
+#define G_028770_ALPHA_COMB_FCN(x) (((x) >> 24) & 0x07)
+#define C_028770_ALPHA_COMB_FCN 0xF8FFFFFF
+#define R_028774_SX_MRT5_BLEND_OPT 0x028774
+#define S_028774_COLOR_SRC_OPT(x) (((x) & 0x07) << 0)
+#define G_028774_COLOR_SRC_OPT(x) (((x) >> 0) & 0x07)
+#define C_028774_COLOR_SRC_OPT 0xFFFFFFF8
+#define S_028774_COLOR_DST_OPT(x) (((x) & 0x07) << 4)
+#define G_028774_COLOR_DST_OPT(x) (((x) >> 4) & 0x07)
+#define C_028774_COLOR_DST_OPT 0xFFFFFF8F
+#define S_028774_COLOR_COMB_FCN(x) (((x) & 0x07) << 8)
+#define G_028774_COLOR_COMB_FCN(x) (((x) >> 8) & 0x07)
+#define C_028774_COLOR_COMB_FCN 0xFFFFF8FF
+#define S_028774_ALPHA_SRC_OPT(x) (((x) & 0x07) << 16)
+#define G_028774_ALPHA_SRC_OPT(x) (((x) >> 16) & 0x07)
+#define C_028774_ALPHA_SRC_OPT 0xFFF8FFFF
+#define S_028774_ALPHA_DST_OPT(x) (((x) & 0x07) << 20)
+#define G_028774_ALPHA_DST_OPT(x) (((x) >> 20) & 0x07)
+#define C_028774_ALPHA_DST_OPT 0xFF8FFFFF
+#define S_028774_ALPHA_COMB_FCN(x) (((x) & 0x07) << 24)
+#define G_028774_ALPHA_COMB_FCN(x) (((x) >> 24) & 0x07)
+#define C_028774_ALPHA_COMB_FCN 0xF8FFFFFF
+#define R_028778_SX_MRT6_BLEND_OPT 0x028778
+#define S_028778_COLOR_SRC_OPT(x) (((x) & 0x07) << 0)
+#define G_028778_COLOR_SRC_OPT(x) (((x) >> 0) & 0x07)
+#define C_028778_COLOR_SRC_OPT 0xFFFFFFF8
+#define S_028778_COLOR_DST_OPT(x) (((x) & 0x07) << 4)
+#define G_028778_COLOR_DST_OPT(x) (((x) >> 4) & 0x07)
+#define C_028778_COLOR_DST_OPT 0xFFFFFF8F
+#define S_028778_COLOR_COMB_FCN(x) (((x) & 0x07) << 8)
+#define G_028778_COLOR_COMB_FCN(x) (((x) >> 8) & 0x07)
+#define C_028778_COLOR_COMB_FCN 0xFFFFF8FF
+#define S_028778_ALPHA_SRC_OPT(x) (((x) & 0x07) << 16)
+#define G_028778_ALPHA_SRC_OPT(x) (((x) >> 16) & 0x07)
+#define C_028778_ALPHA_SRC_OPT 0xFFF8FFFF
+#define S_028778_ALPHA_DST_OPT(x) (((x) & 0x07) << 20)
+#define G_028778_ALPHA_DST_OPT(x) (((x) >> 20) & 0x07)
+#define C_028778_ALPHA_DST_OPT 0xFF8FFFFF
+#define S_028778_ALPHA_COMB_FCN(x) (((x) & 0x07) << 24)
+#define G_028778_ALPHA_COMB_FCN(x) (((x) >> 24) & 0x07)
+#define C_028778_ALPHA_COMB_FCN 0xF8FFFFFF
+#define R_02877C_SX_MRT7_BLEND_OPT 0x02877C
+#define S_02877C_COLOR_SRC_OPT(x) (((x) & 0x07) << 0)
+#define G_02877C_COLOR_SRC_OPT(x) (((x) >> 0) & 0x07)
+#define C_02877C_COLOR_SRC_OPT 0xFFFFFFF8
+#define S_02877C_COLOR_DST_OPT(x) (((x) & 0x07) << 4)
+#define G_02877C_COLOR_DST_OPT(x) (((x) >> 4) & 0x07)
+#define C_02877C_COLOR_DST_OPT 0xFFFFFF8F
+#define S_02877C_COLOR_COMB_FCN(x) (((x) & 0x07) << 8)
+#define G_02877C_COLOR_COMB_FCN(x) (((x) >> 8) & 0x07)
+#define C_02877C_COLOR_COMB_FCN 0xFFFFF8FF
+#define S_02877C_ALPHA_SRC_OPT(x) (((x) & 0x07) << 16)
+#define G_02877C_ALPHA_SRC_OPT(x) (((x) >> 16) & 0x07)
+#define C_02877C_ALPHA_SRC_OPT 0xFFF8FFFF
+#define S_02877C_ALPHA_DST_OPT(x) (((x) & 0x07) << 20)
+#define G_02877C_ALPHA_DST_OPT(x) (((x) >> 20) & 0x07)
+#define C_02877C_ALPHA_DST_OPT 0xFF8FFFFF
+#define S_02877C_ALPHA_COMB_FCN(x) (((x) & 0x07) << 24)
+#define G_02877C_ALPHA_COMB_FCN(x) (((x) >> 24) & 0x07)
+#define C_02877C_ALPHA_COMB_FCN 0xF8FFFFFF
+/* */
#define R_028780_CB_BLEND0_CONTROL 0x028780
#define S_028780_COLOR_SRCBLEND(x) (((x) & 0x1F) << 0)
#define G_028780_COLOR_SRCBLEND(x) (((x) >> 0) & 0x1F)
@@ -5473,6 +5766,7 @@
#define V_028808_CB_ELIMINATE_FAST_CLEAR 0x02
#define V_028808_CB_RESOLVE 0x03
#define V_028808_CB_FMASK_DECOMPRESS 0x05
+#define V_028808_CB_DCC_DECOMPRESS 0x06
#define S_028808_ROP3(x) (((x) & 0xFF) << 16)
#define G_028808_ROP3(x) (((x) >> 16) & 0xFF)
#define C_028808_ROP3 0xFF00FFFF
@@ -5551,6 +5845,11 @@
#define V_02880C_EXPORT_GREATER_THAN_Z 2
#define V_02880C_EXPORT_RESERVED 3
/* */
+/* Stoney */
+#define S_02880C_DUAL_QUAD_DISABLE(x) (((x) & 0x1) << 15)
+#define G_02880C_DUAL_QUAD_DISABLE(x) (((x) >> 15) & 0x1)
+#define C_02880C_DUAL_QUAD_DISABLE 0xFFFF7FFF
+/* */
#define R_028810_PA_CL_CLIP_CNTL 0x028810
#define S_028810_UCP_ENA_0(x) (((x) & 0x1) << 0)
#define G_028810_UCP_ENA_0(x) (((x) >> 0) & 0x1)
@@ -6132,6 +6431,9 @@
#define V_028A40_GS_SCENARIO_G 0x03
#define V_028A40_GS_SCENARIO_C 0x04
#define V_028A40_SPRITE_EN 0x05
+#define S_028A40_RESERVED_0(x) (((x) & 0x1) << 3)
+#define G_028A40_RESERVED_0(x) (((x) >> 3) & 0x1)
+#define C_028A40_RESERVED_0 0xFFFFFFF7
#define S_028A40_CUT_MODE(x) (((x) & 0x03) << 4)
#define G_028A40_CUT_MODE(x) (((x) >> 4) & 0x03)
#define C_028A40_CUT_MODE 0xFFFFFFCF
@@ -6139,12 +6441,19 @@
#define V_028A40_GS_CUT_512 0x01
#define V_028A40_GS_CUT_256 0x02
#define V_028A40_GS_CUT_128 0x03
+#define S_028A40_RESERVED_1(x) (((x) & 0x1F) << 6)
+#define G_028A40_RESERVED_1(x) (((x) >> 6) & 0x1F)
+#define C_028A40_RESERVED_1 0xFFFFF83F
#define S_028A40_GS_C_PACK_EN(x) (((x) & 0x1) << 11)
#define G_028A40_GS_C_PACK_EN(x) (((x) >> 11) & 0x1)
#define C_028A40_GS_C_PACK_EN 0xFFFFF7FF
+#define S_028A40_RESERVED_2(x) (((x) & 0x1) << 12)
+#define G_028A40_RESERVED_2(x) (((x) >> 12) & 0x1)
+#define C_028A40_RESERVED_2 0xFFFFEFFF
#define S_028A40_ES_PASSTHRU(x) (((x) & 0x1) << 13)
#define G_028A40_ES_PASSTHRU(x) (((x) >> 13) & 0x1)
#define C_028A40_ES_PASSTHRU 0xFFFFDFFF
+/* SI-CIK */
#define S_028A40_COMPUTE_MODE(x) (((x) & 0x1) << 14)
#define G_028A40_COMPUTE_MODE(x) (((x) >> 14) & 0x1)
#define C_028A40_COMPUTE_MODE 0xFFFFBFFF
@@ -6154,6 +6463,7 @@
#define S_028A40_ELEMENT_INFO_EN(x) (((x) & 0x1) << 16)
#define G_028A40_ELEMENT_INFO_EN(x) (((x) >> 16) & 0x1)
#define C_028A40_ELEMENT_INFO_EN 0xFFFEFFFF
+/* */
#define S_028A40_PARTIAL_THD_AT_EOI(x) (((x) & 0x1) << 17)
#define G_028A40_PARTIAL_THD_AT_EOI(x) (((x) >> 17) & 0x1)
#define C_028A40_PARTIAL_THD_AT_EOI 0xFFFDFFFF
@@ -6339,6 +6649,9 @@
#define C_028A7C_RDREQ_POLICY 0xFFFFFF3F
#define V_028A7C_VGT_POLICY_LRU 0x00
#define V_028A7C_VGT_POLICY_STREAM 0x01
+#define S_028A7C_RDREQ_POLICY_VI(x) (((x) & 0x1) << 6)
+#define G_028A7C_RDREQ_POLICY_VI(x) (((x) >> 6) & 0x1)
+#define C_028A7C_RDREQ_POLICY_VI 0xFFFFFFBF
#define S_028A7C_ATC(x) (((x) & 0x1) << 8)
#define G_028A7C_ATC(x) (((x) >> 8) & 0x1)
#define C_028A7C_ATC 0xFFFFFEFF
@@ -6715,6 +7028,9 @@
#define V_028B6C_VGT_POLICY_BYPASS 0x02
/* */
/* VI */
+#define S_028B6C_RDREQ_POLICY_VI(x) (((x) & 0x1) << 15)
+#define G_028B6C_RDREQ_POLICY_VI(x) (((x) >> 15) & 0x1)
+#define C_028B6C_RDREQ_POLICY_VI 0xFFFF7FFF
#define S_028B6C_DISTRIBUTION_MODE(x) (((x) & 0x03) << 17)
#define G_028B6C_DISTRIBUTION_MODE(x) (((x) >> 17) & 0x03)
#define C_028B6C_DISTRIBUTION_MODE 0xFFF9FFFF
@@ -7317,6 +7633,12 @@
#define S_028C3C_AA_MASK_X1Y1(x) (((x) & 0xFFFF) << 16)
#define G_028C3C_AA_MASK_X1Y1(x) (((x) >> 16) & 0xFFFF)
#define C_028C3C_AA_MASK_X1Y1 0x0000FFFF
+/* Stoney */
+#define R_028C40_PA_SC_SHADER_CONTROL 0x028C40
+#define S_028C40_REALIGN_DQUADS_AFTER_N_WAVES(x) (((x) & 0x03) << 0)
+#define G_028C40_REALIGN_DQUADS_AFTER_N_WAVES(x) (((x) >> 0) & 0x03)
+#define C_028C40_REALIGN_DQUADS_AFTER_N_WAVES 0xFFFFFFFC
+/* */
#define R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL 0x028C58
#define S_028C58_VTX_REUSE_DEPTH(x) (((x) & 0xFF) << 0)
#define G_028C58_VTX_REUSE_DEPTH(x) (((x) >> 0) & 0xFF)
From c839174d55216cf1da5cdc4bf0f735ab8359d221 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand
Date: Thu, 22 Oct 2015 16:53:27 -0700
Subject: [PATCH 102/287] nir/validate: Add better validation of load/store
types
Reviewed-by: Connor Abbott
---
src/glsl/nir/nir_validate.c | 16 ++++++++++++++--
1 file changed, 14 insertions(+), 2 deletions(-)
diff --git a/src/glsl/nir/nir_validate.c b/src/glsl/nir/nir_validate.c
index c6fedf9b1ad..a42e830fd72 100644
--- a/src/glsl/nir/nir_validate.c
+++ b/src/glsl/nir/nir_validate.c
@@ -398,15 +398,27 @@ validate_intrinsic_instr(nir_intrinsic_instr *instr, validate_state *state)
}
switch (instr->intrinsic) {
- case nir_intrinsic_load_var:
+ case nir_intrinsic_load_var: {
+ const struct glsl_type *type =
+ nir_deref_tail(&instr->variables[0]->deref)->type;
+ assert(glsl_type_is_vector_or_scalar(type));
+ assert(instr->num_components == glsl_get_vector_elements(type));
assert(instr->variables[0]->var->data.mode != nir_var_shader_out);
break;
- case nir_intrinsic_store_var:
+ }
+ case nir_intrinsic_store_var: {
+ const struct glsl_type *type =
+ nir_deref_tail(&instr->variables[0]->deref)->type;
+ assert(glsl_type_is_vector_or_scalar(type));
+ assert(instr->num_components == glsl_get_vector_elements(type));
assert(instr->variables[0]->var->data.mode != nir_var_shader_in &&
instr->variables[0]->var->data.mode != nir_var_uniform &&
instr->variables[0]->var->data.mode != nir_var_shader_storage);
break;
+ }
case nir_intrinsic_copy_var:
+ assert(nir_deref_tail(&instr->variables[0]->deref)->type ==
+ nir_deref_tail(&instr->variables[1]->deref)->type);
assert(instr->variables[0]->var->data.mode != nir_var_shader_in &&
instr->variables[0]->var->data.mode != nir_var_uniform &&
instr->variables[0]->var->data.mode != nir_var_shader_storage);
From d43e16b1638cdadc7fcff2007b106e2a559dae7d Mon Sep 17 00:00:00 2001
From: Jason Ekstrand
Date: Thu, 5 Nov 2015 16:37:47 -0800
Subject: [PATCH 103/287] i965/fs: Use regs_read/written for post-RA scheduling
in calculate_deps
Previously, we were assuming that everything read/wrote exactly 1 logical
GRF (1 in SIMD8 and 2 in SIMD16). This isn't actually true. In
particular, the PLN instruction reads 2 logical registers in one of the
components. This commit changes post-RA scheduling to use regs_read and
regs_written instead so that we add enough dependencies.
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=92770
Reviewed-by: Matt Turner
Reviewed-by: Connor Abbott
---
.../dri/i965/brw_schedule_instructions.cpp | 15 ++++-----------
1 file changed, 4 insertions(+), 11 deletions(-)
diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
index 88c45f74333..d21bc677c82 100644
--- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
@@ -927,7 +927,6 @@ fs_instruction_scheduler::calculate_deps()
* granular level.
*/
schedule_node *last_fixed_grf_write = NULL;
- int reg_width = v->dispatch_width / 8;
/* The last instruction always needs to still be the last
* instruction. Either it's flow control (IF, ELSE, ENDIF, DO,
@@ -964,10 +963,7 @@ fs_instruction_scheduler::calculate_deps()
(inst->src[i].fixed_hw_reg.file ==
BRW_GENERAL_REGISTER_FILE)) {
if (post_reg_alloc) {
- int size = reg_width;
- if (inst->src[i].fixed_hw_reg.vstride == BRW_VERTICAL_STRIDE_0)
- size = 1;
- for (int r = 0; r < size; r++)
+ for (int r = 0; r < inst->regs_read(i); r++)
add_dep(last_grf_write[inst->src[i].fixed_hw_reg.nr + r], n);
} else {
add_dep(last_fixed_grf_write, n);
@@ -1031,7 +1027,7 @@ fs_instruction_scheduler::calculate_deps()
} else if (inst->dst.file == HW_REG &&
inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
if (post_reg_alloc) {
- for (int r = 0; r < reg_width; r++)
+ for (int r = 0; r < inst->regs_written; r++)
last_grf_write[inst->dst.fixed_hw_reg.nr + r] = n;
} else {
last_fixed_grf_write = n;
@@ -1093,10 +1089,7 @@ fs_instruction_scheduler::calculate_deps()
(inst->src[i].fixed_hw_reg.file ==
BRW_GENERAL_REGISTER_FILE)) {
if (post_reg_alloc) {
- int size = reg_width;
- if (inst->src[i].fixed_hw_reg.vstride == BRW_VERTICAL_STRIDE_0)
- size = 1;
- for (int r = 0; r < size; r++)
+ for (int r = 0; r < inst->regs_read(i); r++)
add_dep(n, last_grf_write[inst->src[i].fixed_hw_reg.nr + r], 0);
} else {
add_dep(n, last_fixed_grf_write, 0);
@@ -1159,7 +1152,7 @@ fs_instruction_scheduler::calculate_deps()
} else if (inst->dst.file == HW_REG &&
inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
if (post_reg_alloc) {
- for (int r = 0; r < reg_width; r++)
+ for (int r = 0; r < inst->regs_written; r++)
last_grf_write[inst->dst.fixed_hw_reg.nr + r] = n;
} else {
last_fixed_grf_write = n;
From 7d90e570f311066d1fd1eaafe681a8c939c86bae Mon Sep 17 00:00:00 2001
From: Jason Ekstrand
Date: Fri, 1 May 2015 11:26:40 -0700
Subject: [PATCH 104/287] nir/types: Add an is_vector_or_scalar helper
Reviewed-by: Connor Abbott
---
src/glsl/nir/nir_types.cpp | 6 ++++++
src/glsl/nir/nir_types.h | 1 +
2 files changed, 7 insertions(+)
diff --git a/src/glsl/nir/nir_types.cpp b/src/glsl/nir/nir_types.cpp
index 965f42320be..135591ab97d 100644
--- a/src/glsl/nir/nir_types.cpp
+++ b/src/glsl/nir/nir_types.cpp
@@ -143,6 +143,12 @@ glsl_type_is_scalar(const struct glsl_type *type)
return type->is_scalar();
}
+bool
+glsl_type_is_vector_or_scalar(const struct glsl_type *type)
+{
+ return type->is_vector() || type->is_scalar();
+}
+
bool
glsl_type_is_matrix(const struct glsl_type *type)
{
diff --git a/src/glsl/nir/nir_types.h b/src/glsl/nir/nir_types.h
index 60d561b25ee..b0b51842a43 100644
--- a/src/glsl/nir/nir_types.h
+++ b/src/glsl/nir/nir_types.h
@@ -70,6 +70,7 @@ unsigned glsl_get_record_location_offset(const struct glsl_type *type,
bool glsl_type_is_void(const struct glsl_type *type);
bool glsl_type_is_vector(const struct glsl_type *type);
bool glsl_type_is_scalar(const struct glsl_type *type);
+bool glsl_type_is_vector_or_scalar(const struct glsl_type *type);
bool glsl_type_is_matrix(const struct glsl_type *type);
const struct glsl_type *glsl_void_type(void);
From 6c731d85666abb61c49e5b4affa196545f5ac086 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand
Date: Sat, 7 Nov 2015 12:01:50 -0800
Subject: [PATCH 105/287] nir: Add a nir_deref_tail helper
Reviewed-by: Connor Abbott
---
src/glsl/nir/nir.h | 9 +++++++++
src/glsl/nir/nir_lower_var_copies.c | 15 ++-------------
src/glsl/nir/nir_split_var_copies.c | 12 ++----------
3 files changed, 13 insertions(+), 23 deletions(-)
diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h
index ef39df5dc51..2559ef2a456 100644
--- a/src/glsl/nir/nir.h
+++ b/src/glsl/nir/nir.h
@@ -785,6 +785,15 @@ NIR_DEFINE_CAST(nir_deref_as_var, nir_deref, nir_deref_var, deref)
NIR_DEFINE_CAST(nir_deref_as_array, nir_deref, nir_deref_array, deref)
NIR_DEFINE_CAST(nir_deref_as_struct, nir_deref, nir_deref_struct, deref)
+/* Returns the last deref in the chain. */
+static inline nir_deref *
+nir_deref_tail(nir_deref *deref)
+{
+ while (deref->child)
+ deref = deref->child;
+ return deref;
+}
+
typedef struct {
nir_instr instr;
diff --git a/src/glsl/nir/nir_lower_var_copies.c b/src/glsl/nir/nir_lower_var_copies.c
index 21672901f04..98c107aa50e 100644
--- a/src/glsl/nir/nir_lower_var_copies.c
+++ b/src/glsl/nir/nir_lower_var_copies.c
@@ -53,17 +53,6 @@ deref_next_wildcard_parent(nir_deref *deref)
return NULL;
}
-/* Returns the last deref in the chain.
- */
-static nir_deref *
-get_deref_tail(nir_deref *deref)
-{
- while (deref->child)
- deref = deref->child;
-
- return deref;
-}
-
/* This function recursively walks the given deref chain and replaces the
* given copy instruction with an equivalent sequence load/store
* operations.
@@ -121,8 +110,8 @@ emit_copy_load_store(nir_intrinsic_instr *copy_instr,
} else {
/* In this case, we have no wildcards anymore, so all we have to do
* is just emit the load and store operations. */
- src_tail = get_deref_tail(src_tail);
- dest_tail = get_deref_tail(dest_tail);
+ src_tail = nir_deref_tail(src_tail);
+ dest_tail = nir_deref_tail(dest_tail);
assert(src_tail->type == dest_tail->type);
diff --git a/src/glsl/nir/nir_split_var_copies.c b/src/glsl/nir/nir_split_var_copies.c
index d463f7bdae9..bfbef72c1ab 100644
--- a/src/glsl/nir/nir_split_var_copies.c
+++ b/src/glsl/nir/nir_split_var_copies.c
@@ -67,14 +67,6 @@ struct split_var_copies_state {
bool progress;
};
-static nir_deref *
-get_deref_tail(nir_deref *deref)
-{
- while (deref->child != NULL)
- deref = deref->child;
- return deref;
-}
-
/* Recursively constructs deref chains to split a copy instruction into
* multiple (if needed) copy instructions with full-length deref chains.
* External callers of this function should pass the tail and head of the
@@ -227,8 +219,8 @@ split_var_copies_block(nir_block *block, void *void_state)
nir_deref *dest_head = &intrinsic->variables[0]->deref;
nir_deref *src_head = &intrinsic->variables[1]->deref;
- nir_deref *dest_tail = get_deref_tail(dest_head);
- nir_deref *src_tail = get_deref_tail(src_head);
+ nir_deref *dest_tail = nir_deref_tail(dest_head);
+ nir_deref *src_tail = nir_deref_tail(src_head);
switch (glsl_get_base_type(src_tail->type)) {
case GLSL_TYPE_ARRAY:
From 87711183ac35d85ca7d2c2ee67536fe689d6bef3 Mon Sep 17 00:00:00 2001
From: Dave Airlie
Date: Sat, 31 Oct 2015 16:19:43 +1000
Subject: [PATCH 106/287] virgl: wrap ret assignment with parentheses to do the
correct thing
Coverity reported that ret could only be 0 or 1, since it
was setting ret = fn() > 0, instead of doing (ret = fn()) > 0.
Signed-off-by: Dave Airlie
---
src/gallium/winsys/virgl/drm/virgl_drm_winsys.c | 2 +-
src/gallium/winsys/virgl/vtest/virgl_vtest_winsys.c | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c b/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c
index d77ebd6ca15..b5d4435e5e6 100644
--- a/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c
+++ b/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c
@@ -309,7 +309,7 @@ virgl_drm_winsys_resource_cache_create(struct virgl_winsys *qws,
while (curr != &qdws->delayed) {
curr_res = LIST_ENTRY(struct virgl_hw_res, curr, head);
- if (!res && (ret = virgl_is_res_compat(qdws, curr_res, size, bind, format) > 0))
+ if (!res && ((ret = virgl_is_res_compat(qdws, curr_res, size, bind, format)) > 0))
res = curr_res;
else if (os_time_timeout(curr_res->start, curr_res->end, now)) {
LIST_DEL(&curr_res->head);
diff --git a/src/gallium/winsys/virgl/vtest/virgl_vtest_winsys.c b/src/gallium/winsys/virgl/vtest/virgl_vtest_winsys.c
index b19c4561493..9c9ec044591 100644
--- a/src/gallium/winsys/virgl/vtest/virgl_vtest_winsys.c
+++ b/src/gallium/winsys/virgl/vtest/virgl_vtest_winsys.c
@@ -343,7 +343,7 @@ virgl_vtest_winsys_resource_cache_create(struct virgl_winsys *vws,
while (curr != &vtws->delayed) {
curr_res = LIST_ENTRY(struct virgl_hw_res, curr, head);
- if (!res && (ret = virgl_is_res_compat(vtws, curr_res, size, bind, format) > 0))
+ if (!res && ((ret = virgl_is_res_compat(vtws, curr_res, size, bind, format)) > 0))
res = curr_res;
else if (os_time_timeout(curr_res->start, curr_res->end, now)) {
LIST_DEL(&curr_res->head);
From 0f5b1409fd2f9b26c45e750a37947d27c892ee60 Mon Sep 17 00:00:00 2001
From: Dave Airlie
Date: Sun, 8 Nov 2015 07:55:17 +1000
Subject: [PATCH 107/287] llvmpipe: disable front updates for now
As pointed out by Emil, this sometimes hangs. It appears to be due to
threading; we need to rethink how this stuff works for llvmpipe.
Signed-off-by: Dave Airlie
---
src/gallium/drivers/llvmpipe/lp_texture.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/gallium/drivers/llvmpipe/lp_texture.c b/src/gallium/drivers/llvmpipe/lp_texture.c
index 7862ac8f217..82868814581 100644
--- a/src/gallium/drivers/llvmpipe/lp_texture.c
+++ b/src/gallium/drivers/llvmpipe/lp_texture.c
@@ -805,7 +805,7 @@ llvmpipe_init_screen_resource_funcs(struct pipe_screen *screen)
#endif
screen->resource_create = llvmpipe_resource_create;
- screen->resource_create_front = llvmpipe_resource_create_front;
+/* screen->resource_create_front = llvmpipe_resource_create_front; */
screen->resource_destroy = llvmpipe_resource_destroy;
screen->resource_from_handle = llvmpipe_resource_from_handle;
screen->resource_get_handle = llvmpipe_resource_get_handle;
From 53cbb11707a502a31bb9f0380d730840245ee9b2 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin
Date: Fri, 6 Nov 2015 00:44:10 -0500
Subject: [PATCH 108/287] nouveau: avoid queueing too much work onto a single
fence
Force the fence to get kicked off, which won't actually wait for its
completion, but any additional work will be put onto a fresh list.
This fixes crashes in teximage-colors --benchmark with too many active
maps.
Signed-off-by: Ilia Mirkin
---
src/gallium/drivers/nouveau/nouveau_fence.c | 68 +++++++++++++--------
src/gallium/drivers/nouveau/nouveau_fence.h | 1 +
2 files changed, 43 insertions(+), 26 deletions(-)
diff --git a/src/gallium/drivers/nouveau/nouveau_fence.c b/src/gallium/drivers/nouveau/nouveau_fence.c
index d3a34060952..691553ae7e4 100644
--- a/src/gallium/drivers/nouveau/nouveau_fence.c
+++ b/src/gallium/drivers/nouveau/nouveau_fence.c
@@ -59,26 +59,6 @@ nouveau_fence_trigger_work(struct nouveau_fence *fence)
}
}
-bool
-nouveau_fence_work(struct nouveau_fence *fence,
- void (*func)(void *), void *data)
-{
- struct nouveau_fence_work *work;
-
- if (!fence || fence->state == NOUVEAU_FENCE_STATE_SIGNALLED) {
- func(data);
- return true;
- }
-
- work = CALLOC_STRUCT(nouveau_fence_work);
- if (!work)
- return false;
- work->func = func;
- work->data = data;
- LIST_ADD(&work->list, &fence->work);
- return true;
-}
-
void
nouveau_fence_emit(struct nouveau_fence *fence)
{
@@ -182,12 +162,10 @@ nouveau_fence_signalled(struct nouveau_fence *fence)
return fence->state == NOUVEAU_FENCE_STATE_SIGNALLED;
}
-bool
-nouveau_fence_wait(struct nouveau_fence *fence, struct pipe_debug_callback *debug)
+static bool
+nouveau_fence_kick(struct nouveau_fence *fence)
{
struct nouveau_screen *screen = fence->screen;
- uint32_t spins = 0;
- int64_t start = 0;
/* wtf, someone is waiting on a fence in flush_notify handler? */
assert(fence->state != NOUVEAU_FENCE_STATE_EMITTING);
@@ -208,12 +186,25 @@ nouveau_fence_wait(struct nouveau_fence *fence, struct pipe_debug_callback *debu
if (fence == screen->fence.current)
nouveau_fence_next(screen);
+ nouveau_fence_update(screen, false);
+
+ return true;
+}
+
+bool
+nouveau_fence_wait(struct nouveau_fence *fence, struct pipe_debug_callback *debug)
+{
+ struct nouveau_screen *screen = fence->screen;
+ uint32_t spins = 0;
+ int64_t start = 0;
+
if (debug && debug->debug_message)
start = os_time_get_nano();
- do {
- nouveau_fence_update(screen, false);
+ if (!nouveau_fence_kick(fence))
+ return false;
+ do {
if (fence->state == NOUVEAU_FENCE_STATE_SIGNALLED) {
if (debug && debug->debug_message)
pipe_debug_message(debug, PERF_INFO,
@@ -228,6 +219,8 @@ nouveau_fence_wait(struct nouveau_fence *fence, struct pipe_debug_callback *debu
if (!(spins % 8)) /* donate a few cycles */
sched_yield();
#endif
+
+ nouveau_fence_update(screen, false);
} while (spins < NOUVEAU_FENCE_MAX_SPINS);
debug_printf("Wait on fence %u (ack = %u, next = %u) timed out !\n",
@@ -259,3 +252,26 @@ nouveau_fence_unref_bo(void *data)
nouveau_bo_ref(NULL, &bo);
}
+
+bool
+nouveau_fence_work(struct nouveau_fence *fence,
+ void (*func)(void *), void *data)
+{
+ struct nouveau_fence_work *work;
+
+ if (!fence || fence->state == NOUVEAU_FENCE_STATE_SIGNALLED) {
+ func(data);
+ return true;
+ }
+
+ work = CALLOC_STRUCT(nouveau_fence_work);
+ if (!work)
+ return false;
+ work->func = func;
+ work->data = data;
+ LIST_ADD(&work->list, &fence->work);
+ p_atomic_inc(&fence->work_count);
+ if (fence->work_count > 64)
+ nouveau_fence_kick(fence);
+ return true;
+}
diff --git a/src/gallium/drivers/nouveau/nouveau_fence.h b/src/gallium/drivers/nouveau/nouveau_fence.h
index 0fa9d020f50..f10016da826 100644
--- a/src/gallium/drivers/nouveau/nouveau_fence.h
+++ b/src/gallium/drivers/nouveau/nouveau_fence.h
@@ -25,6 +25,7 @@ struct nouveau_fence {
int state;
int ref;
uint32_t sequence;
+ uint32_t work_count;
struct list_head work;
};
From af218217d71152df8562b7f087086197f28080fe Mon Sep 17 00:00:00 2001
From: Ilia Mirkin
Date: Sat, 7 Nov 2015 18:47:40 -0500
Subject: [PATCH 109/287] nv50/ir: only take abs value when computing high
result
Not reachable from TGSI since it only has UMUL, no IMUL. However it's
surprising that setting argument types to s32 will cause sign to get
lost.
Signed-off-by: Ilia Mirkin
---
src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
index eec502be798..75164ef0641 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
@@ -75,7 +75,7 @@ expandIntegerMUL(BuildUtil *bld, Instruction *mul)
s[0] = mul->getSrc(0);
s[1] = mul->getSrc(1);
- if (isSignedType(mul->sType)) {
+ if (isSignedType(mul->sType) && highResult) {
s[0] = bld->getSSA(fullSize);
s[1] = bld->getSSA(fullSize);
bld->mkOp1(OP_ABS, mul->sType, s[0], mul->getSrc(0));
From e06238cb9e50e3b994d5abac921ad800692a90af Mon Sep 17 00:00:00 2001
From: Ilia Mirkin
Date: Sat, 7 Nov 2015 18:48:55 -0500
Subject: [PATCH 110/287] nv50/ir: fix emission of s[] args in certain
situations
There might only be a single arg (e.g. cvt), so use mode rather than
looking at the source directly. Also we don't want to rely on the type
of the value, which can be unreliable, but instead use the
instruction's. This works out well since mkSplit doesn't adjust the
type.
Signed-off-by: Ilia Mirkin
---
src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
index 7e0fb532565..0b5288218d1 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
@@ -441,9 +441,9 @@ CodeEmitterNV50::setSrcFileBits(const Instruction *i, int enc)
return;
if ((mode & 3) == 1) {
- const int pos = i->src(1).getFile() == FILE_IMMEDIATE ? 13 : 14;
+ const int pos = ((mode >> 2) & 3) == 3 ? 13 : 14;
- switch (i->getSrc(0)->reg.type) {
+ switch (i->sType) {
case TYPE_U8:
break;
case TYPE_U16:
From ffb60e77882d2da9f42a76d602114cdb26dd25bc Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset
Date: Fri, 6 Nov 2015 00:33:48 +0100
Subject: [PATCH 111/287] nvc0: enable compute support on Fermi
Although the compute support is still not complete because textures and
surfaces need to be implemented, it allows launching very simple compute
kernels, like one which reads MP performance counters.
This turns on PIPE_CAP_COMPUTE and PIPE_SHADER_COMPUTE.
Signed-off-by: Samuel Pitoiset
Reviewed-by: Ilia Mirkin
---
src/gallium/drivers/nouveau/nvc0/nvc0_screen.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index 7d96977c24b..7f8ce21a348 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -186,7 +186,7 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
return (class_3d >= NVE4_3D_CLASS) ? 1 : 0;
case PIPE_CAP_COMPUTE:
- return (class_3d == NVE4_3D_CLASS) ? 1 : 0;
+ return (class_3d <= NVE4_3D_CLASS) ? 1 : 0;
case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
return nouveau_screen(pscreen)->vram_domain & NOUVEAU_BO_VRAM ? 1 : 0;
@@ -245,7 +245,7 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
return 0;
break;
case PIPE_SHADER_COMPUTE:
- if (class_3d != NVE4_3D_CLASS)
+ if (class_3d > NVE4_3D_CLASS)
return 0;
break;
default:
From d115e47099b6c3ceb27d0c462eb559df6d1f9fd7 Mon Sep 17 00:00:00 2001
From: Leo Liu
Date: Thu, 5 Nov 2015 11:22:22 -0500
Subject: [PATCH 112/287] st/va: fix build fails with pipe loader
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
There is no dev member in drv; dev should come from vl_screen here.
Signed-off-by: Leo Liu
Reviewed-by: Christian König
---
src/gallium/state_trackers/va/context.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/src/gallium/state_trackers/va/context.c b/src/gallium/state_trackers/va/context.c
index ec9e0488d85..25fa9058edb 100644
--- a/src/gallium/state_trackers/va/context.c
+++ b/src/gallium/state_trackers/va/context.c
@@ -151,8 +151,9 @@ VA_DRIVER_INIT_FUNC(VADriverContextP ctx)
#if GALLIUM_STATIC_TARGETS
drv->vscreen->pscreen = dd_create_screen(drm_fd);
#else
- if (pipe_loader_drm_probe_fd(&drv->dev, drm_fd))
- drv->vscreen->pscreen = pipe_loader_create_screen(drv->dev, PIPE_SEARCH_DIR);
+ if (pipe_loader_drm_probe_fd(&drv->vscreen->dev, drm_fd))
+ drv->vscreen->pscreen =
+ pipe_loader_create_screen(drv->vscreen->dev, PIPE_SEARCH_DIR);
#endif
if (!drv->vscreen->pscreen)
From 7da86e0ec0cd38dcf58db97bb5c8a0eff9a3dd15 Mon Sep 17 00:00:00 2001
From: Leo Liu
Date: Wed, 4 Nov 2015 16:24:26 -0500
Subject: [PATCH 113/287] vl: add drm support for vl_screen
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
This will allow the state trackers to use render nodes
with screen creation
v2: dup fd for pipe loader
Signed-off-by: Leo Liu
Reviewed-by: Christian König
---
src/gallium/auxiliary/Makefile.sources | 3 +-
src/gallium/auxiliary/vl/vl_winsys.h | 6 ++
src/gallium/auxiliary/vl/vl_winsys_drm.c | 77 ++++++++++++++++++++++++
3 files changed, 85 insertions(+), 1 deletion(-)
create mode 100644 src/gallium/auxiliary/vl/vl_winsys_drm.c
diff --git a/src/gallium/auxiliary/Makefile.sources b/src/gallium/auxiliary/Makefile.sources
index 6e22ced4e41..82ef5ecfce4 100644
--- a/src/gallium/auxiliary/Makefile.sources
+++ b/src/gallium/auxiliary/Makefile.sources
@@ -349,7 +349,8 @@ VL_SOURCES := \
# XXX: Nuke this as our dri targets no longer depend on VL.
VL_WINSYS_SOURCES := \
- vl/vl_winsys_dri.c
+ vl/vl_winsys_dri.c \
+ vl/vl_winsys_drm.c
VL_STUB_SOURCES := \
vl/vl_stubs.c
diff --git a/src/gallium/auxiliary/vl/vl_winsys.h b/src/gallium/auxiliary/vl/vl_winsys.h
index f6b47c964f9..df01917466f 100644
--- a/src/gallium/auxiliary/vl/vl_winsys.h
+++ b/src/gallium/auxiliary/vl/vl_winsys.h
@@ -66,4 +66,10 @@ vl_screen_set_next_timestamp(struct vl_screen *vscreen, uint64_t stamp);
void*
vl_screen_get_private(struct vl_screen *vscreen);
+struct vl_screen*
+vl_drm_screen_create(int fd);
+
+void
+vl_drm_screen_destroy(struct vl_screen *vscreen);
+
#endif
diff --git a/src/gallium/auxiliary/vl/vl_winsys_drm.c b/src/gallium/auxiliary/vl/vl_winsys_drm.c
new file mode 100644
index 00000000000..1167fcf6a90
--- /dev/null
+++ b/src/gallium/auxiliary/vl/vl_winsys_drm.c
@@ -0,0 +1,77 @@
+/**************************************************************************
+ *
+ * Copyright 2015 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include