i965: Add scalar geometry shader support.
This is hidden behind INTEL_SCALAR_GS=1 for now, as we don't yet support instanced geometry shaders, and Orbital Explorer's shader spills like crazy. But the infrastructure is in place, and it's largely working. v2: Lots of rebasing. v3: (feedback from Kristian Høgsberg) - Handle stride and subreg_offset correctly for ATTRs; use a helper. - Fix missing emit_shader_time_end() call. - Delete dead code after early EOT in static vertex case to avoid tripping asserts in emit_shader_time_end(). - Use proper D/UD type in intexp2(). - Fix "EndPrimitve" and "to that" typos. - Assert that invocations == 1 so we know this is missing. Signed-off-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Kristian Høgsberg <krh@bitplanet.net>
This commit is contained in:
parent
c9541a74e4
commit
36fd653817
|
@ -43,6 +43,7 @@
|
|||
#include "brw_wm.h"
|
||||
#include "brw_fs.h"
|
||||
#include "brw_cs.h"
|
||||
#include "brw_vec4_gs_visitor.h"
|
||||
#include "brw_cfg.h"
|
||||
#include "brw_dead_control_flow.h"
|
||||
#include "main/uniforms.h"
|
||||
|
@ -1360,6 +1361,57 @@ fs_visitor::emit_discard_jump()
|
|||
discard_jump->predicate_inverse = true;
|
||||
}
|
||||
|
||||
void
|
||||
fs_visitor::emit_gs_thread_end()
|
||||
{
|
||||
assert(stage == MESA_SHADER_GEOMETRY);
|
||||
|
||||
struct brw_gs_prog_data *gs_prog_data =
|
||||
(struct brw_gs_prog_data *) prog_data;
|
||||
|
||||
if (gs_compile->control_data_header_size_bits > 0) {
|
||||
emit_gs_control_data_bits(this->final_gs_vertex_count);
|
||||
}
|
||||
|
||||
const fs_builder abld = bld.annotate("thread end");
|
||||
fs_inst *inst;
|
||||
|
||||
if (gs_prog_data->static_vertex_count != -1) {
|
||||
foreach_in_list_reverse(fs_inst, prev, &this->instructions) {
|
||||
if (prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8 ||
|
||||
prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED ||
|
||||
prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT ||
|
||||
prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT) {
|
||||
prev->eot = true;
|
||||
|
||||
/* Delete now dead instructions. */
|
||||
foreach_in_list_reverse_safe(exec_node, dead, &this->instructions) {
|
||||
if (dead == prev)
|
||||
break;
|
||||
dead->remove();
|
||||
}
|
||||
return;
|
||||
} else if (prev->is_control_flow() || prev->has_side_effects()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
fs_reg hdr = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
|
||||
abld.MOV(hdr, fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)));
|
||||
inst = abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, hdr);
|
||||
inst->mlen = 1;
|
||||
} else {
|
||||
fs_reg payload = abld.vgrf(BRW_REGISTER_TYPE_UD, 2);
|
||||
fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 2);
|
||||
sources[0] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
|
||||
sources[1] = this->final_gs_vertex_count;
|
||||
abld.LOAD_PAYLOAD(payload, sources, 2, 2);
|
||||
inst = abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
|
||||
inst->mlen = 2;
|
||||
}
|
||||
inst->eot = true;
|
||||
inst->offset = 0;
|
||||
}
|
||||
|
||||
void
|
||||
fs_visitor::assign_curb_setup()
|
||||
{
|
||||
|
@ -1531,6 +1583,26 @@ fs_visitor::assign_urb_setup()
|
|||
this->first_non_payload_grf += prog_data->num_varying_inputs * 2;
|
||||
}
|
||||
|
||||
void
|
||||
fs_visitor::convert_attr_sources_to_hw_regs(fs_inst *inst)
|
||||
{
|
||||
for (int i = 0; i < inst->sources; i++) {
|
||||
if (inst->src[i].file == ATTR) {
|
||||
int grf = payload.num_regs +
|
||||
prog_data->curb_read_length +
|
||||
inst->src[i].reg +
|
||||
inst->src[i].reg_offset;
|
||||
|
||||
inst->src[i].file = HW_REG;
|
||||
inst->src[i].fixed_hw_reg =
|
||||
stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
|
||||
inst->src[i].subreg_offset),
|
||||
inst->exec_size * inst->src[i].stride,
|
||||
inst->exec_size, inst->src[i].stride);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
fs_visitor::assign_vs_urb_setup()
|
||||
{
|
||||
|
@ -1548,24 +1620,44 @@ fs_visitor::assign_vs_urb_setup()
|
|||
|
||||
/* Rewrite all ATTR file references to the hw grf that they land in. */
|
||||
foreach_block_and_inst(block, fs_inst, inst, cfg) {
|
||||
for (int i = 0; i < inst->sources; i++) {
|
||||
if (inst->src[i].file == ATTR) {
|
||||
int grf = payload.num_regs +
|
||||
prog_data->curb_read_length +
|
||||
inst->src[i].reg +
|
||||
inst->src[i].reg_offset;
|
||||
|
||||
inst->src[i].file = HW_REG;
|
||||
inst->src[i].fixed_hw_reg =
|
||||
stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
|
||||
inst->src[i].subreg_offset),
|
||||
inst->exec_size * inst->src[i].stride,
|
||||
inst->exec_size, inst->src[i].stride);
|
||||
}
|
||||
}
|
||||
convert_attr_sources_to_hw_regs(inst);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
fs_visitor::assign_gs_urb_setup()
|
||||
{
|
||||
assert(stage == MESA_SHADER_GEOMETRY);
|
||||
|
||||
brw_vue_prog_data *vue_prog_data = (brw_vue_prog_data *) prog_data;
|
||||
|
||||
first_non_payload_grf +=
|
||||
8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in;
|
||||
|
||||
const unsigned first_icp_handle = payload.num_regs -
|
||||
(vue_prog_data->include_vue_handles ? nir->info.gs.vertices_in : 0);
|
||||
|
||||
foreach_block_and_inst(block, fs_inst, inst, cfg) {
|
||||
/* Lower URB_READ_SIMD8 opcodes into real messages. */
|
||||
if (inst->opcode == SHADER_OPCODE_URB_READ_SIMD8) {
|
||||
assert(inst->src[0].file == IMM);
|
||||
inst->src[0] = retype(brw_vec8_grf(first_icp_handle +
|
||||
inst->src[0].fixed_hw_reg.dw1.ud,
|
||||
0), BRW_REGISTER_TYPE_UD);
|
||||
/* for now, assume constant - we can do per-slot offsets later */
|
||||
assert(inst->src[1].file == IMM);
|
||||
inst->offset = inst->src[1].fixed_hw_reg.dw1.ud;
|
||||
inst->src[1] = fs_reg();
|
||||
inst->mlen = 1;
|
||||
inst->base_mrf = -1;
|
||||
}
|
||||
|
||||
/* Rewrite all ATTR file references to HW_REGs. */
|
||||
convert_attr_sources_to_hw_regs(inst);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Split large virtual GRFs into separate components if we can.
|
||||
*
|
||||
|
@ -4762,6 +4854,45 @@ fs_visitor::setup_vs_payload()
|
|||
* conveying the data, and thereby reduce push constant usage.
|
||||
*
|
||||
*/
|
||||
void
|
||||
fs_visitor::setup_gs_payload()
|
||||
{
|
||||
assert(stage == MESA_SHADER_GEOMETRY);
|
||||
|
||||
struct brw_gs_prog_data *gs_prog_data =
|
||||
(struct brw_gs_prog_data *) prog_data;
|
||||
struct brw_vue_prog_data *vue_prog_data =
|
||||
(struct brw_vue_prog_data *) prog_data;
|
||||
|
||||
/* R0: thread header, R1: output URB handles */
|
||||
payload.num_regs = 2;
|
||||
|
||||
if (gs_prog_data->include_primitive_id) {
|
||||
/* R2: Primitive ID 0..7 */
|
||||
payload.num_regs++;
|
||||
}
|
||||
|
||||
/* Use a maximum of 32 registers for push-model inputs. */
|
||||
const unsigned max_push_components = 32;
|
||||
|
||||
/* If pushing our inputs would take too many registers, reduce the URB read
|
||||
* length (which is in HWords, or 8 registers), and resort to pulling.
|
||||
*
|
||||
* Note that the GS reads <URB Read Length> HWords for every vertex - so we
|
||||
* have to multiply by VerticesIn to obtain the total storage requirement.
|
||||
*/
|
||||
if (8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in >
|
||||
max_push_components) {
|
||||
gs_prog_data->base.include_vue_handles = true;
|
||||
|
||||
/* R3..RN: ICP Handles for each incoming vertex (when using pull model) */
|
||||
payload.num_regs += nir->info.gs.vertices_in;
|
||||
|
||||
vue_prog_data->urb_read_length =
|
||||
ROUND_DOWN_TO(max_push_components / nir->info.gs.vertices_in, 8) / 8;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
fs_visitor::setup_cs_payload()
|
||||
{
|
||||
|
@ -5018,6 +5149,55 @@ fs_visitor::run_vs(gl_clip_plane *clip_planes)
|
|||
return !failed;
|
||||
}
|
||||
|
||||
bool
|
||||
fs_visitor::run_gs()
|
||||
{
|
||||
assert(stage == MESA_SHADER_GEOMETRY);
|
||||
|
||||
setup_gs_payload();
|
||||
|
||||
this->final_gs_vertex_count = vgrf(glsl_type::uint_type);
|
||||
|
||||
if (gs_compile->control_data_header_size_bits > 0) {
|
||||
/* Create a VGRF to store accumulated control data bits. */
|
||||
this->control_data_bits = vgrf(glsl_type::uint_type);
|
||||
|
||||
/* If we're outputting more than 32 control data bits, then EmitVertex()
|
||||
* will set control_data_bits to 0 after emitting the first vertex.
|
||||
* Otherwise, we need to initialize it to 0 here.
|
||||
*/
|
||||
if (gs_compile->control_data_header_size_bits <= 32) {
|
||||
const fs_builder abld = bld.annotate("initialize control data bits");
|
||||
abld.MOV(this->control_data_bits, fs_reg(0u));
|
||||
}
|
||||
}
|
||||
|
||||
if (shader_time_index >= 0)
|
||||
emit_shader_time_begin();
|
||||
|
||||
emit_nir_code();
|
||||
|
||||
emit_gs_thread_end();
|
||||
|
||||
if (shader_time_index >= 0)
|
||||
emit_shader_time_end();
|
||||
|
||||
if (failed)
|
||||
return false;
|
||||
|
||||
calculate_cfg();
|
||||
|
||||
optimize();
|
||||
|
||||
assign_curb_setup();
|
||||
assign_gs_urb_setup();
|
||||
|
||||
fixup_3src_null_dest();
|
||||
allocate_registers();
|
||||
|
||||
return !failed;
|
||||
}
|
||||
|
||||
bool
|
||||
fs_visitor::run_fs(bool do_rep_send)
|
||||
{
|
||||
|
|
|
@ -132,18 +132,22 @@ public:
|
|||
|
||||
bool run_fs(bool do_rep_send);
|
||||
bool run_vs(gl_clip_plane *clip_planes);
|
||||
bool run_gs();
|
||||
bool run_cs();
|
||||
void optimize();
|
||||
void allocate_registers();
|
||||
void setup_payload_gen4();
|
||||
void setup_payload_gen6();
|
||||
void setup_vs_payload();
|
||||
void setup_gs_payload();
|
||||
void setup_cs_payload();
|
||||
void fixup_3src_null_dest();
|
||||
void assign_curb_setup();
|
||||
void calculate_urb_setup();
|
||||
void assign_urb_setup();
|
||||
void convert_attr_sources_to_hw_regs(fs_inst *inst);
|
||||
void assign_vs_urb_setup();
|
||||
void assign_gs_urb_setup();
|
||||
bool assign_regs(bool allow_spilling);
|
||||
void assign_regs_trivial();
|
||||
void calculate_payload_ranges(int payload_node_count,
|
||||
|
@ -281,7 +285,16 @@ public:
|
|||
fs_reg color1, fs_reg color2,
|
||||
fs_reg src0_alpha, unsigned components);
|
||||
void emit_fb_writes();
|
||||
void emit_urb_writes();
|
||||
void emit_urb_writes(const fs_reg &gs_vertex_count = fs_reg());
|
||||
void set_gs_stream_control_data_bits(const fs_reg &vertex_count,
|
||||
unsigned stream_id);
|
||||
void emit_gs_control_data_bits(const fs_reg &vertex_count);
|
||||
void emit_gs_end_primitive(const nir_src &vertex_count_nir_src);
|
||||
void emit_gs_vertex(const nir_src &vertex_count_nir_src,
|
||||
unsigned stream_id);
|
||||
void emit_gs_thread_end();
|
||||
void emit_gs_input_load(const fs_reg &dst, const nir_src &vertex_src,
|
||||
unsigned offset, unsigned num_components);
|
||||
void emit_cs_terminate();
|
||||
fs_reg *emit_cs_local_invocation_id_setup();
|
||||
fs_reg *emit_cs_work_group_id_setup();
|
||||
|
@ -389,6 +402,8 @@ public:
|
|||
fs_reg delta_xy[BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT];
|
||||
fs_reg shader_start_time;
|
||||
fs_reg userplane[MAX_CLIP_PLANES];
|
||||
fs_reg final_gs_vertex_count;
|
||||
fs_reg control_data_bits;
|
||||
|
||||
unsigned grf_used;
|
||||
bool spilled_any_registers;
|
||||
|
|
|
@ -28,6 +28,7 @@
|
|||
#include "program/prog_to_nir.h"
|
||||
#include "brw_fs.h"
|
||||
#include "brw_fs_surface_builder.h"
|
||||
#include "brw_vec4_gs_visitor.h"
|
||||
#include "brw_nir.h"
|
||||
#include "brw_fs_surface_builder.h"
|
||||
#include "brw_vec4_gs_visitor.h"
|
||||
|
@ -102,6 +103,7 @@ fs_visitor::nir_setup_outputs()
|
|||
|
||||
switch (stage) {
|
||||
case MESA_SHADER_VERTEX:
|
||||
case MESA_SHADER_GEOMETRY:
|
||||
for (unsigned int i = 0; i < ALIGN(type_size_scalar(var->type), 4) / 4; i++) {
|
||||
int output = var->data.location + i;
|
||||
this->outputs[output] = offset(reg, bld, 4 * i);
|
||||
|
@ -1194,6 +1196,375 @@ emit_pixel_interpolater_send(const fs_builder &bld,
|
|||
return inst;
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes 1 << x, given a D/UD register containing some value x.
|
||||
*/
|
||||
static fs_reg
|
||||
intexp2(const fs_builder &bld, const fs_reg &x)
|
||||
{
|
||||
assert(x.type == BRW_REGISTER_TYPE_UD || x.type == BRW_REGISTER_TYPE_D);
|
||||
|
||||
fs_reg result = bld.vgrf(x.type, 1);
|
||||
fs_reg one = bld.vgrf(x.type, 1);
|
||||
|
||||
bld.MOV(one, retype(fs_reg(1), one.type));
|
||||
bld.SHL(result, one, x);
|
||||
return result;
|
||||
}
|
||||
|
||||
void
|
||||
fs_visitor::emit_gs_end_primitive(const nir_src &vertex_count_nir_src)
|
||||
{
|
||||
assert(stage == MESA_SHADER_GEOMETRY);
|
||||
|
||||
struct brw_gs_prog_data *gs_prog_data =
|
||||
(struct brw_gs_prog_data *) prog_data;
|
||||
|
||||
/* We can only do EndPrimitive() functionality when the control data
|
||||
* consists of cut bits. Fortunately, the only time it isn't is when the
|
||||
* output type is points, in which case EndPrimitive() is a no-op.
|
||||
*/
|
||||
if (gs_prog_data->control_data_format !=
|
||||
GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) {
|
||||
return;
|
||||
}
|
||||
|
||||
/* Cut bits use one bit per vertex. */
|
||||
assert(gs_compile->control_data_bits_per_vertex == 1);
|
||||
|
||||
fs_reg vertex_count = get_nir_src(vertex_count_nir_src);
|
||||
vertex_count.type = BRW_REGISTER_TYPE_UD;
|
||||
|
||||
/* Cut bit n should be set to 1 if EndPrimitive() was called after emitting
|
||||
* vertex n, 0 otherwise. So all we need to do here is mark bit
|
||||
* (vertex_count - 1) % 32 in the cut_bits register to indicate that
|
||||
* EndPrimitive() was called after emitting vertex (vertex_count - 1);
|
||||
* vec4_gs_visitor::emit_control_data_bits() will take care of the rest.
|
||||
*
|
||||
* Note that if EndPrimitive() is called before emitting any vertices, this
|
||||
* will cause us to set bit 31 of the control_data_bits register to 1.
|
||||
* That's fine because:
|
||||
*
|
||||
* - If max_vertices < 32, then vertex number 31 (zero-based) will never be
|
||||
* output, so the hardware will ignore cut bit 31.
|
||||
*
|
||||
* - If max_vertices == 32, then vertex number 31 is guaranteed to be the
|
||||
* last vertex, so setting cut bit 31 has no effect (since the primitive
|
||||
* is automatically ended when the GS terminates).
|
||||
*
|
||||
* - If max_vertices > 32, then the ir_emit_vertex visitor will reset the
|
||||
* control_data_bits register to 0 when the first vertex is emitted.
|
||||
*/
|
||||
|
||||
const fs_builder abld = bld.annotate("end primitive");
|
||||
|
||||
/* control_data_bits |= 1 << ((vertex_count - 1) % 32) */
|
||||
fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
|
||||
abld.ADD(prev_count, vertex_count, fs_reg(0xffffffffu));
|
||||
fs_reg mask = intexp2(abld, prev_count);
|
||||
/* Note: we're relying on the fact that the GEN SHL instruction only pays
|
||||
* attention to the lower 5 bits of its second source argument, so on this
|
||||
* architecture, 1 << (vertex_count - 1) is equivalent to 1 <<
|
||||
* ((vertex_count - 1) % 32).
|
||||
*/
|
||||
abld.OR(this->control_data_bits, this->control_data_bits, mask);
|
||||
}
|
||||
|
||||
void
|
||||
fs_visitor::emit_gs_control_data_bits(const fs_reg &vertex_count)
|
||||
{
|
||||
assert(stage == MESA_SHADER_GEOMETRY);
|
||||
assert(gs_compile->control_data_bits_per_vertex != 0);
|
||||
|
||||
struct brw_gs_prog_data *gs_prog_data =
|
||||
(struct brw_gs_prog_data *) prog_data;
|
||||
|
||||
const fs_builder abld = bld.annotate("emit control data bits");
|
||||
const fs_builder fwa_bld = bld.exec_all();
|
||||
|
||||
/* We use a single UD register to accumulate control data bits (32 bits
|
||||
* for each of the SIMD8 channels). So we need to write a DWord (32 bits)
|
||||
* at a time.
|
||||
*
|
||||
* Unfortunately, the URB_WRITE_SIMD8 message uses 128-bit (OWord) offsets.
|
||||
* We have select a 128-bit group via the Global and Per-Slot Offsets, then
|
||||
* use the Channel Mask phase to enable/disable which DWord within that
|
||||
* group to write. (Remember, different SIMD8 channels may have emitted
|
||||
* different numbers of vertices, so we may need per-slot offsets.)
|
||||
*
|
||||
* Channel masking presents an annoying problem: we may have to replicate
|
||||
* the data up to 4 times:
|
||||
*
|
||||
* Msg = Handles, Per-Slot Offsets, Channel Masks, Data, Data, Data, Data.
|
||||
*
|
||||
* To avoid penalizing shaders that emit a small number of vertices, we
|
||||
* can avoid these sometimes: if the size of the control data header is
|
||||
* <= 128 bits, then there is only 1 OWord. All SIMD8 channels will land
|
||||
* land in the same 128-bit group, so we can skip per-slot offsets.
|
||||
*
|
||||
* Similarly, if the control data header is <= 32 bits, there is only one
|
||||
* DWord, so we can skip channel masks.
|
||||
*/
|
||||
enum opcode opcode = SHADER_OPCODE_URB_WRITE_SIMD8;
|
||||
|
||||
fs_reg channel_mask, per_slot_offset;
|
||||
|
||||
if (gs_compile->control_data_header_size_bits > 32) {
|
||||
opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
|
||||
channel_mask = vgrf(glsl_type::uint_type);
|
||||
}
|
||||
|
||||
if (gs_compile->control_data_header_size_bits > 128) {
|
||||
opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT;
|
||||
per_slot_offset = vgrf(glsl_type::uint_type);
|
||||
}
|
||||
|
||||
/* Figure out which DWord we're trying to write to using the formula:
|
||||
*
|
||||
* dword_index = (vertex_count - 1) * bits_per_vertex / 32
|
||||
*
|
||||
* Since bits_per_vertex is a power of two, and is known at compile
|
||||
* time, this can be optimized to:
|
||||
*
|
||||
* dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex))
|
||||
*/
|
||||
if (opcode != SHADER_OPCODE_URB_WRITE_SIMD8) {
|
||||
fs_reg dword_index = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
|
||||
fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
|
||||
abld.ADD(prev_count, vertex_count, fs_reg(0xffffffffu));
|
||||
unsigned log2_bits_per_vertex =
|
||||
_mesa_fls(gs_compile->control_data_bits_per_vertex);
|
||||
abld.SHR(dword_index, prev_count, fs_reg(6u - log2_bits_per_vertex));
|
||||
|
||||
if (per_slot_offset.file != BAD_FILE) {
|
||||
/* Set the per-slot offset to dword_index / 4, so that we'll write to
|
||||
* the appropriate OWord within the control data header.
|
||||
*/
|
||||
abld.SHR(per_slot_offset, dword_index, fs_reg(2u));
|
||||
}
|
||||
|
||||
/* Set the channel masks to 1 << (dword_index % 4), so that we'll
|
||||
* write to the appropriate DWORD within the OWORD.
|
||||
*/
|
||||
fs_reg channel = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
|
||||
fwa_bld.AND(channel, dword_index, fs_reg(3u));
|
||||
channel_mask = intexp2(fwa_bld, channel);
|
||||
/* Then the channel masks need to be in bits 23:16. */
|
||||
fwa_bld.SHL(channel_mask, channel_mask, fs_reg(16u));
|
||||
}
|
||||
|
||||
/* Store the control data bits in the message payload and send it. */
|
||||
int mlen = 2;
|
||||
if (channel_mask.file != BAD_FILE)
|
||||
mlen += 4; /* channel masks, plus 3 extra copies of the data */
|
||||
if (per_slot_offset.file != BAD_FILE)
|
||||
mlen++;
|
||||
|
||||
fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, mlen);
|
||||
fs_reg *sources = ralloc_array(mem_ctx, fs_reg, mlen);
|
||||
int i = 0;
|
||||
sources[i++] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
|
||||
if (per_slot_offset.file != BAD_FILE)
|
||||
sources[i++] = per_slot_offset;
|
||||
if (channel_mask.file != BAD_FILE)
|
||||
sources[i++] = channel_mask;
|
||||
while (i < mlen) {
|
||||
sources[i++] = this->control_data_bits;
|
||||
}
|
||||
|
||||
abld.LOAD_PAYLOAD(payload, sources, mlen, mlen);
|
||||
fs_inst *inst = abld.emit(opcode, reg_undef, payload);
|
||||
inst->mlen = mlen;
|
||||
/* We need to increment Global Offset by 256-bits to make room for
|
||||
* Broadwell's extra "Vertex Count" payload at the beginning of the
|
||||
* URB entry. Since this is an OWord message, Global Offset is counted
|
||||
* in 128-bit units, so we must set it to 2.
|
||||
*/
|
||||
if (gs_prog_data->static_vertex_count == -1)
|
||||
inst->offset = 2;
|
||||
}
|
||||
|
||||
void
|
||||
fs_visitor::set_gs_stream_control_data_bits(const fs_reg &vertex_count,
|
||||
unsigned stream_id)
|
||||
{
|
||||
/* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */
|
||||
|
||||
/* Note: we are calling this *before* increasing vertex_count, so
|
||||
* this->vertex_count == vertex_count - 1 in the formula above.
|
||||
*/
|
||||
|
||||
/* Stream mode uses 2 bits per vertex */
|
||||
assert(gs_compile->control_data_bits_per_vertex == 2);
|
||||
|
||||
/* Must be a valid stream */
|
||||
assert(stream_id >= 0 && stream_id < MAX_VERTEX_STREAMS);
|
||||
|
||||
/* Control data bits are initialized to 0 so we don't have to set any
|
||||
* bits when sending vertices to stream 0.
|
||||
*/
|
||||
if (stream_id == 0)
|
||||
return;
|
||||
|
||||
const fs_builder abld = bld.annotate("set stream control data bits", NULL);
|
||||
|
||||
/* reg::sid = stream_id */
|
||||
fs_reg sid = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
|
||||
abld.MOV(sid, fs_reg(stream_id));
|
||||
|
||||
/* reg:shift_count = 2 * (vertex_count - 1) */
|
||||
fs_reg shift_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
|
||||
abld.SHL(shift_count, vertex_count, fs_reg(1u));
|
||||
|
||||
/* Note: we're relying on the fact that the GEN SHL instruction only pays
|
||||
* attention to the lower 5 bits of its second source argument, so on this
|
||||
* architecture, stream_id << 2 * (vertex_count - 1) is equivalent to
|
||||
* stream_id << ((2 * (vertex_count - 1)) % 32).
|
||||
*/
|
||||
fs_reg mask = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
|
||||
abld.SHL(mask, sid, shift_count);
|
||||
abld.OR(this->control_data_bits, this->control_data_bits, mask);
|
||||
}
|
||||
|
||||
void
|
||||
fs_visitor::emit_gs_vertex(const nir_src &vertex_count_nir_src,
|
||||
unsigned stream_id)
|
||||
{
|
||||
assert(stage == MESA_SHADER_GEOMETRY);
|
||||
|
||||
struct brw_gs_prog_data *gs_prog_data =
|
||||
(struct brw_gs_prog_data *) prog_data;
|
||||
|
||||
fs_reg vertex_count = get_nir_src(vertex_count_nir_src);
|
||||
vertex_count.type = BRW_REGISTER_TYPE_UD;
|
||||
|
||||
/* Haswell and later hardware ignores the "Render Stream Select" bits
|
||||
* from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled,
|
||||
* and instead sends all primitives down the pipeline for rasterization.
|
||||
* If the SOL stage is enabled, "Render Stream Select" is honored and
|
||||
* primitives bound to non-zero streams are discarded after stream output.
|
||||
*
|
||||
* Since the only purpose of primives sent to non-zero streams is to
|
||||
* be recorded by transform feedback, we can simply discard all geometry
|
||||
* bound to these streams when transform feedback is disabled.
|
||||
*/
|
||||
if (stream_id > 0 && !nir->info.has_transform_feedback_varyings)
|
||||
return;
|
||||
|
||||
/* If we're outputting 32 control data bits or less, then we can wait
|
||||
* until the shader is over to output them all. Otherwise we need to
|
||||
* output them as we go. Now is the time to do it, since we're about to
|
||||
* output the vertex_count'th vertex, so it's guaranteed that the
|
||||
* control data bits associated with the (vertex_count - 1)th vertex are
|
||||
* correct.
|
||||
*/
|
||||
if (gs_compile->control_data_header_size_bits > 32) {
|
||||
const fs_builder abld =
|
||||
bld.annotate("emit vertex: emit control data bits");
|
||||
|
||||
/* Only emit control data bits if we've finished accumulating a batch
|
||||
* of 32 bits. This is the case when:
|
||||
*
|
||||
* (vertex_count * bits_per_vertex) % 32 == 0
|
||||
*
|
||||
* (in other words, when the last 5 bits of vertex_count *
|
||||
* bits_per_vertex are 0). Assuming bits_per_vertex == 2^n for some
|
||||
* integer n (which is always the case, since bits_per_vertex is
|
||||
* always 1 or 2), this is equivalent to requiring that the last 5-n
|
||||
* bits of vertex_count are 0:
|
||||
*
|
||||
* vertex_count & (2^(5-n) - 1) == 0
|
||||
*
|
||||
* 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is
|
||||
* equivalent to:
|
||||
*
|
||||
* vertex_count & (32 / bits_per_vertex - 1) == 0
|
||||
*
|
||||
* TODO: If vertex_count is an immediate, we could do some of this math
|
||||
* at compile time...
|
||||
*/
|
||||
fs_inst *inst =
|
||||
abld.AND(bld.null_reg_d(), vertex_count,
|
||||
fs_reg(32u / gs_compile->control_data_bits_per_vertex - 1u));
|
||||
inst->conditional_mod = BRW_CONDITIONAL_Z;
|
||||
|
||||
abld.IF(BRW_PREDICATE_NORMAL);
|
||||
/* If vertex_count is 0, then no control data bits have been
|
||||
* accumulated yet, so we can skip emitting them.
|
||||
*/
|
||||
abld.CMP(bld.null_reg_d(), vertex_count, fs_reg(0u),
|
||||
BRW_CONDITIONAL_NEQ);
|
||||
abld.IF(BRW_PREDICATE_NORMAL);
|
||||
emit_gs_control_data_bits(vertex_count);
|
||||
abld.emit(BRW_OPCODE_ENDIF);
|
||||
|
||||
/* Reset control_data_bits to 0 so we can start accumulating a new
|
||||
* batch.
|
||||
*
|
||||
* Note: in the case where vertex_count == 0, this neutralizes the
|
||||
* effect of any call to EndPrimitive() that the shader may have
|
||||
* made before outputting its first vertex.
|
||||
*/
|
||||
inst = abld.MOV(this->control_data_bits, fs_reg(0u));
|
||||
inst->force_writemask_all = true;
|
||||
abld.emit(BRW_OPCODE_ENDIF);
|
||||
}
|
||||
|
||||
emit_urb_writes(vertex_count);
|
||||
|
||||
/* In stream mode we have to set control data bits for all vertices
|
||||
* unless we have disabled control data bits completely (which we do
|
||||
* do for GL_POINTS outputs that don't use streams).
|
||||
*/
|
||||
if (gs_compile->control_data_header_size_bits > 0 &&
|
||||
gs_prog_data->control_data_format ==
|
||||
GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
|
||||
set_gs_stream_control_data_bits(vertex_count, stream_id);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
fs_visitor::emit_gs_input_load(const fs_reg &dst,
|
||||
const nir_src &vertex_src,
|
||||
unsigned input_offset,
|
||||
unsigned num_components)
|
||||
{
|
||||
const brw_vue_prog_data *vue_prog_data = (const brw_vue_prog_data *) prog_data;
|
||||
const unsigned vertex = nir_src_as_const_value(vertex_src)->u[0];
|
||||
|
||||
const unsigned array_stride = vue_prog_data->urb_read_length * 8;
|
||||
|
||||
const bool pushed = 4 * input_offset < array_stride;
|
||||
|
||||
if (input_offset == 0) {
|
||||
/* This is the VUE header, containing VARYING_SLOT_LAYER [.y],
|
||||
* VARYING_SLOT_VIEWPORT [.z], and VARYING_SLOT_PSIZ [.w].
|
||||
* Only gl_PointSize is available as a GS input, so they must
|
||||
* be asking for that input.
|
||||
*/
|
||||
if (pushed) {
|
||||
bld.MOV(dst, fs_reg(ATTR, array_stride * vertex + 3, dst.type));
|
||||
} else {
|
||||
fs_reg tmp = bld.vgrf(dst.type, 4);
|
||||
fs_inst *inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp,
|
||||
fs_reg(vertex), fs_reg(0));
|
||||
inst->regs_written = 4;
|
||||
bld.MOV(dst, offset(tmp, bld, 3));
|
||||
}
|
||||
} else {
|
||||
if (pushed) {
|
||||
int index = vertex * array_stride + 4 * input_offset;
|
||||
for (unsigned i = 0; i < num_components; i++) {
|
||||
bld.MOV(offset(dst, bld, i), fs_reg(ATTR, index + i, dst.type));
|
||||
}
|
||||
} else {
|
||||
fs_inst *inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst,
|
||||
fs_reg(vertex), fs_reg(input_offset));
|
||||
inst->regs_written = num_components;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr)
|
||||
{
|
||||
|
@ -1579,6 +1950,14 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
|
|||
break;
|
||||
}
|
||||
|
||||
case nir_intrinsic_load_per_vertex_input_indirect:
|
||||
assert(!"Not allowed");
|
||||
/* fallthrough */
|
||||
case nir_intrinsic_load_per_vertex_input:
|
||||
emit_gs_input_load(dest, instr->src[0], instr->const_index[0],
|
||||
instr->num_components);
|
||||
break;
|
||||
|
||||
/* Handle ARB_gpu_shader5 interpolation intrinsics
|
||||
*
|
||||
* It's worth a quick word of explanation as to why we handle the full
|
||||
|
@ -1929,6 +2308,18 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
|
|||
break;
|
||||
}
|
||||
|
||||
case nir_intrinsic_emit_vertex_with_counter:
|
||||
emit_gs_vertex(instr->src[0], instr->const_index[0]);
|
||||
break;
|
||||
|
||||
case nir_intrinsic_end_primitive_with_counter:
|
||||
emit_gs_end_primitive(instr->src[0]);
|
||||
break;
|
||||
|
||||
case nir_intrinsic_set_vertex_count:
|
||||
bld.MOV(this->final_gs_vertex_count, get_nir_src(instr->src[0]));
|
||||
break;
|
||||
|
||||
default:
|
||||
unreachable("unknown intrinsic");
|
||||
}
|
||||
|
|
|
@ -880,7 +880,7 @@ void fs_visitor::compute_clip_distance(gl_clip_plane *clip_planes)
|
|||
}
|
||||
|
||||
void
|
||||
fs_visitor::emit_urb_writes()
|
||||
fs_visitor::emit_urb_writes(const fs_reg &gs_vertex_count)
|
||||
{
|
||||
int slot, urb_offset, length;
|
||||
int starting_urb_offset = 0;
|
||||
|
@ -916,9 +916,13 @@ fs_visitor::emit_urb_writes()
|
|||
return;
|
||||
}
|
||||
|
||||
opcode opcode = SHADER_OPCODE_URB_WRITE_SIMD8;
|
||||
int header_size = 1;
|
||||
fs_reg per_slot_offsets;
|
||||
|
||||
if (stage == MESA_SHADER_GEOMETRY) {
|
||||
const struct brw_gs_prog_data *gs_prog_data =
|
||||
(const struct brw_gs_prog_data *) prog_data;
|
||||
(const struct brw_gs_prog_data *) this->prog_data;
|
||||
|
||||
/* We need to increment the Global Offset to skip over the control data
|
||||
* header and the extra "Vertex Count" field (1 HWord) at the beginning
|
||||
|
@ -927,6 +931,27 @@ fs_visitor::emit_urb_writes()
|
|||
starting_urb_offset = 2 * gs_prog_data->control_data_header_size_hwords;
|
||||
if (gs_prog_data->static_vertex_count == -1)
|
||||
starting_urb_offset += 2;
|
||||
|
||||
/* We also need to use per-slot offsets. The per-slot offset is the
|
||||
* Vertex Count. SIMD8 mode processes 8 different primitives at a
|
||||
* time; each may output a different number of vertices.
|
||||
*/
|
||||
opcode = SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT;
|
||||
header_size++;
|
||||
|
||||
/* The URB offset is in 128-bit units, so we need to multiply by 2 */
|
||||
const int output_vertex_size_owords =
|
||||
gs_prog_data->output_vertex_size_hwords * 2;
|
||||
|
||||
fs_reg offset;
|
||||
if (gs_vertex_count.file == IMM) {
|
||||
per_slot_offsets = fs_reg(output_vertex_size_owords *
|
||||
gs_vertex_count.fixed_hw_reg.dw1.ud);
|
||||
} else {
|
||||
per_slot_offsets = vgrf(glsl_type::int_type);
|
||||
bld.MUL(per_slot_offsets, gs_vertex_count,
|
||||
fs_reg(output_vertex_size_owords));
|
||||
}
|
||||
}
|
||||
|
||||
length = 0;
|
||||
|
@ -1023,19 +1048,25 @@ fs_visitor::emit_urb_writes()
|
|||
if (length == 8 || last)
|
||||
flush = true;
|
||||
if (flush) {
|
||||
fs_reg *payload_sources = ralloc_array(mem_ctx, fs_reg, length + 1);
|
||||
fs_reg payload = fs_reg(GRF, alloc.allocate(length + 1),
|
||||
fs_reg *payload_sources =
|
||||
ralloc_array(mem_ctx, fs_reg, length + header_size);
|
||||
fs_reg payload = fs_reg(GRF, alloc.allocate(length + header_size),
|
||||
BRW_REGISTER_TYPE_F);
|
||||
payload_sources[0] =
|
||||
fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
|
||||
|
||||
memcpy(&payload_sources[1], sources, length * sizeof sources[0]);
|
||||
abld.LOAD_PAYLOAD(payload, payload_sources, length + 1, 1);
|
||||
if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT)
|
||||
payload_sources[1] = per_slot_offsets;
|
||||
|
||||
fs_inst *inst =
|
||||
abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
|
||||
memcpy(&payload_sources[header_size], sources,
|
||||
length * sizeof sources[0]);
|
||||
|
||||
abld.LOAD_PAYLOAD(payload, payload_sources, length + header_size,
|
||||
header_size);
|
||||
|
||||
fs_inst *inst = abld.emit(opcode, reg_undef, payload);
|
||||
inst->eot = last && stage == MESA_SHADER_VERTEX;
|
||||
inst->mlen = length + 1;
|
||||
inst->mlen = length + header_size;
|
||||
inst->offset = urb_offset;
|
||||
urb_offset = starting_urb_offset + slot + 1;
|
||||
length = 0;
|
||||
|
|
|
@ -29,6 +29,7 @@
|
|||
|
||||
#include "brw_vec4_gs_visitor.h"
|
||||
#include "gen6_gs_visitor.h"
|
||||
#include "brw_fs.h"
|
||||
|
||||
namespace brw {
|
||||
|
||||
|
@ -812,6 +813,30 @@ brw_compile_gs(const struct brw_compiler *compiler, void *log_data,
|
|||
* program.
|
||||
*/
|
||||
|
||||
if (compiler->scalar_gs) {
|
||||
/* TODO: Support instanced GS. We have basically no tests... */
|
||||
assert(prog_data->invocations == 1);
|
||||
|
||||
fs_visitor v(compiler, log_data, mem_ctx, &c, prog_data, shader,
|
||||
shader_time_index);
|
||||
if (v.run_gs()) {
|
||||
prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;
|
||||
|
||||
fs_generator g(compiler, log_data, mem_ctx, &c.key,
|
||||
&prog_data->base.base, v.promoted_constants,
|
||||
false, "GS");
|
||||
if (unlikely(INTEL_DEBUG & DEBUG_GS)) {
|
||||
const char *label =
|
||||
shader->info.label ? shader->info.label : "unnamed";
|
||||
char *name = ralloc_asprintf(mem_ctx, "%s geometry shader %s",
|
||||
label, shader->info.name);
|
||||
g.enable_debug(name);
|
||||
}
|
||||
g.generate_code(v.cfg, 8);
|
||||
return g.get_assembly(final_assembly_size);
|
||||
}
|
||||
}
|
||||
|
||||
if (compiler->devinfo->gen >= 7) {
|
||||
/* Compile the geometry shader in DUAL_OBJECT dispatch mode, if we can do
|
||||
* so without spilling. If the GS invocations count > 1, then we can't use
|
||||
|
|
Loading…
Reference in New Issue