intel/compiler: Use nir_var_mem_task_payload
Instead of reusing the in/out slot mechanism, use a separate NIR variable mode. This will make it easier to later implement staging the output in shared memory (and storing it all to the URB at the end). Note that to get 64-bit type support we currently rely on the brw_nir_lower_mem_access_bit_sizes() pass. Reviewed-by: Marcin Ślusarz <marcin.slusarz@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15022>
This commit is contained in:
parent
2d1e6b756e
commit
c36ae42e4c
|
@ -1412,8 +1412,6 @@ struct brw_clip_prog_data {
|
|||
};
|
||||
|
||||
struct brw_tue_map {
|
||||
int32_t start_dw[VARYING_SLOT_MAX];
|
||||
|
||||
uint32_t size_dw;
|
||||
|
||||
uint32_t per_task_data_start_dw;
|
||||
|
|
|
@ -77,18 +77,33 @@ type_size_scalar_dwords(const struct glsl_type *type, bool bindless)
|
|||
return glsl_count_dword_slots(type, bindless);
|
||||
}
|
||||
|
||||
/* TODO(mesh): Make this a common function. */
|
||||
static void
|
||||
shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align)
|
||||
{
|
||||
assert(glsl_type_is_vector_or_scalar(type));
|
||||
|
||||
uint32_t comp_size = glsl_type_is_boolean(type)
|
||||
? 4 : glsl_get_bit_size(type) / 8;
|
||||
unsigned length = glsl_get_vector_elements(type);
|
||||
*size = comp_size * length,
|
||||
*align = comp_size * (length == 3 ? 4 : length);
|
||||
}
|
||||
|
||||
static void
|
||||
brw_nir_lower_tue_outputs(nir_shader *nir, const brw_tue_map *map)
|
||||
{
|
||||
nir_foreach_shader_out_variable(var, nir) {
|
||||
int location = var->data.location;
|
||||
assert(location >= 0);
|
||||
assert(map->start_dw[location] != -1);
|
||||
var->data.driver_location = map->start_dw[location];
|
||||
assert(var->data.location == VARYING_SLOT_TASK_COUNT);
|
||||
/* First word in TUE header. */
|
||||
var->data.driver_location = 0;
|
||||
}
|
||||
|
||||
nir_lower_io(nir, nir_var_shader_out, type_size_scalar_dwords,
|
||||
nir_lower_io_lower_64bit_to_32);
|
||||
|
||||
nir_lower_explicit_io(nir, nir_var_mem_task_payload,
|
||||
nir_address_format_32bit_offset);
|
||||
}
|
||||
|
||||
static void
|
||||
|
@ -96,63 +111,79 @@ brw_compute_tue_map(struct nir_shader *nir, struct brw_tue_map *map)
|
|||
{
|
||||
memset(map, 0, sizeof(*map));
|
||||
|
||||
map->start_dw[VARYING_SLOT_TASK_COUNT] = 0;
|
||||
|
||||
/* Words 1-3 are used for "Dispatch Dimensions" feature, to allow mapping a
|
||||
* 3D dispatch into the 1D dispatch supported by HW. So ignore those.
|
||||
*/
|
||||
|
||||
/* From bspec: "It is suggested that SW reserve the 16 bytes following the
|
||||
/* TUE header contains 4 words:
|
||||
*
|
||||
* - Word 0 for Task Count.
|
||||
*
|
||||
* - Words 1-3 used for "Dispatch Dimensions" feature, to allow mapping a
|
||||
* 3D dispatch into the 1D dispatch supported by HW. Currently not used.
|
||||
*
|
||||
* From bspec: "It is suggested that SW reserve the 16 bytes following the
|
||||
* TUE Header, and therefore start the SW-defined data structure at 32B
|
||||
* alignment. This allows the TUE Header to always be written as 32 bytes
|
||||
* with 32B alignment, the most optimal write performance case."
|
||||
*/
|
||||
map->per_task_data_start_dw = 8;
|
||||
|
||||
/* Lowering to explicit types will start offsets from task_payload_size, so
|
||||
* set it to start after the header.
|
||||
*/
|
||||
nir->info.task_payload_size = map->per_task_data_start_dw * 4;
|
||||
nir_lower_vars_to_explicit_types(nir, nir_var_mem_task_payload,
|
||||
shared_type_info);
|
||||
|
||||
/* Compact the data: find the size associated with each location... */
|
||||
nir_foreach_shader_out_variable(var, nir) {
|
||||
const int location = var->data.location;
|
||||
if (location == VARYING_SLOT_TASK_COUNT)
|
||||
continue;
|
||||
assert(location >= VARYING_SLOT_VAR0);
|
||||
assert(location < VARYING_SLOT_MAX);
|
||||
|
||||
map->start_dw[location] += type_size_scalar_dwords(var->type, false);
|
||||
}
|
||||
|
||||
/* ...then assign positions using those sizes. */
|
||||
unsigned next = map->per_task_data_start_dw;
|
||||
for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
|
||||
if (i == VARYING_SLOT_TASK_COUNT)
|
||||
continue;
|
||||
if (map->start_dw[i] == 0) {
|
||||
map->start_dw[i] = -1;
|
||||
} else {
|
||||
const unsigned size = map->start_dw[i];
|
||||
map->start_dw[i] = next;
|
||||
next += size;
|
||||
}
|
||||
}
|
||||
|
||||
map->size_dw = ALIGN(next, 8);
|
||||
map->size_dw = ALIGN(DIV_ROUND_UP(nir->info.task_payload_size, 4), 8);
|
||||
}
|
||||
|
||||
static void
|
||||
brw_print_tue_map(FILE *fp, const struct brw_tue_map *map)
|
||||
{
|
||||
fprintf(fp, "TUE map (%d dwords)\n", map->size_dw);
|
||||
fprintf(fp, " %4d: VARYING_SLOT_TASK_COUNT\n",
|
||||
map->start_dw[VARYING_SLOT_TASK_COUNT]);
|
||||
fprintf(fp, "TUE (%d dwords)\n\n", map->size_dw);
|
||||
}
|
||||
|
||||
for (int i = VARYING_SLOT_VAR0; i < VARYING_SLOT_MAX; i++) {
|
||||
if (map->start_dw[i] != -1) {
|
||||
fprintf(fp, " %4d: VARYING_SLOT_VAR%d\n", map->start_dw[i],
|
||||
i - VARYING_SLOT_VAR0);
|
||||
}
|
||||
static bool
|
||||
brw_nir_adjust_task_payload_offsets_instr(struct nir_builder *b,
|
||||
nir_instr *instr, void *data)
|
||||
{
|
||||
if (instr->type != nir_instr_type_intrinsic)
|
||||
return false;
|
||||
|
||||
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
|
||||
switch (intrin->intrinsic) {
|
||||
case nir_intrinsic_store_task_payload:
|
||||
case nir_intrinsic_load_task_payload: {
|
||||
nir_src *offset_src = nir_get_io_offset_src(intrin);
|
||||
|
||||
if (nir_src_is_const(*offset_src))
|
||||
assert(nir_src_as_uint(*offset_src) % 4 == 0);
|
||||
|
||||
b->cursor = nir_before_instr(&intrin->instr);
|
||||
|
||||
/* Regular I/O uses dwords while explicit I/O used for task payload uses
|
||||
* bytes. Normalize it to dwords.
|
||||
*
|
||||
* TODO(mesh): Figure out how to handle 8-bit, 16-bit.
|
||||
*/
|
||||
|
||||
assert(offset_src->is_ssa);
|
||||
nir_ssa_def *offset = nir_ishr_imm(b, offset_src->ssa, 2);
|
||||
nir_instr_rewrite_src(&intrin->instr, offset_src, nir_src_for_ssa(offset));
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
fprintf(fp, "\n");
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
brw_nir_adjust_task_payload_offsets(nir_shader *nir)
|
||||
{
|
||||
nir_shader_instructions_pass(nir, brw_nir_adjust_task_payload_offsets_instr,
|
||||
nir_metadata_block_index |
|
||||
nir_metadata_dominance,
|
||||
NULL);
|
||||
}
|
||||
|
||||
const unsigned *
|
||||
|
@ -178,6 +209,7 @@ brw_compile_task(const struct brw_compiler *compiler,
|
|||
|
||||
brw_compute_tue_map(nir, &prog_data->map);
|
||||
NIR_PASS_V(nir, brw_nir_lower_tue_outputs, &prog_data->map);
|
||||
NIR_PASS_V(nir, brw_nir_adjust_task_payload_offsets);
|
||||
|
||||
const unsigned required_dispatch_width =
|
||||
brw_required_dispatch_width(&nir->info, key->base.subgroup_size_type);
|
||||
|
@ -259,15 +291,24 @@ brw_nir_lower_tue_inputs(nir_shader *nir, const brw_tue_map *map)
|
|||
if (!map)
|
||||
return;
|
||||
|
||||
nir_foreach_shader_in_variable(var, nir) {
|
||||
int location = var->data.location;
|
||||
assert(location >= 0);
|
||||
assert(map->start_dw[location] != -1);
|
||||
var->data.driver_location = map->start_dw[location];
|
||||
nir->info.task_payload_size = map->per_task_data_start_dw * 4;
|
||||
|
||||
if (nir_lower_vars_to_explicit_types(nir, nir_var_mem_task_payload,
|
||||
shared_type_info)) {
|
||||
/* The types for Task Output and Mesh Input should match, so their sizes
|
||||
* should also match.
|
||||
*/
|
||||
assert(map->size_dw == ALIGN(DIV_ROUND_UP(nir->info.task_payload_size, 4), 8));
|
||||
} else {
|
||||
/* Mesh doesn't read any input, to make it clearer set the
|
||||
* task_payload_size to zero instead of keeping an incomplete size that
|
||||
* just includes the header.
|
||||
*/
|
||||
nir->info.task_payload_size = 0;
|
||||
}
|
||||
|
||||
nir_lower_io(nir, nir_var_shader_in, type_size_scalar_dwords,
|
||||
nir_lower_io_lower_64bit_to_32);
|
||||
nir_lower_explicit_io(nir, nir_var_mem_task_payload,
|
||||
nir_address_format_32bit_offset);
|
||||
}
|
||||
|
||||
/* Mesh URB Entry consists of an initial section
|
||||
|
@ -645,6 +686,7 @@ brw_compile_mesh(const struct brw_compiler *compiler,
|
|||
BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID);
|
||||
|
||||
NIR_PASS_V(nir, brw_nir_lower_tue_inputs, params->tue_map);
|
||||
NIR_PASS_V(nir, brw_nir_adjust_task_payload_offsets);
|
||||
|
||||
brw_compute_mue_map(nir, &prog_data->map);
|
||||
NIR_PASS_V(nir, brw_nir_lower_mue_outputs, &prog_data->map);
|
||||
|
@ -742,7 +784,13 @@ brw_compile_mesh(const struct brw_compiler *compiler,
|
|||
static fs_reg
|
||||
get_mesh_urb_handle(const fs_builder &bld, nir_intrinsic_op op)
|
||||
{
|
||||
const unsigned subreg = op == nir_intrinsic_load_input ? 7 : 6;
|
||||
unsigned subreg;
|
||||
if (bld.shader->stage == MESA_SHADER_TASK) {
|
||||
subreg = 6;
|
||||
} else {
|
||||
assert(bld.shader->stage == MESA_SHADER_MESH);
|
||||
subreg = op == nir_intrinsic_load_task_payload ? 7 : 6;
|
||||
}
|
||||
|
||||
fs_builder ubld8 = bld.group(8, 0).exec_all();
|
||||
|
||||
|
@ -753,6 +801,15 @@ get_mesh_urb_handle(const fs_builder &bld, nir_intrinsic_op op)
|
|||
return h;
|
||||
}
|
||||
|
||||
static unsigned
|
||||
component_from_intrinsic(nir_intrinsic_instr *instr)
|
||||
{
|
||||
if (nir_intrinsic_has_component(instr))
|
||||
return nir_intrinsic_component(instr);
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void
|
||||
adjust_handle_and_offset(const fs_builder &bld,
|
||||
fs_reg &urb_handle,
|
||||
|
@ -787,7 +844,7 @@ emit_urb_direct_writes(const fs_builder &bld, nir_intrinsic_instr *instr,
|
|||
const unsigned mask = nir_intrinsic_write_mask(instr);
|
||||
const unsigned offset_in_dwords = nir_intrinsic_base(instr) +
|
||||
nir_src_as_uint(*offset_nir_src) +
|
||||
nir_intrinsic_component(instr);
|
||||
component_from_intrinsic(instr);
|
||||
|
||||
/* URB writes are vec4 aligned but the intrinsic offsets are in dwords.
|
||||
* With a max of 4 components, an intrinsic can require up to two writes.
|
||||
|
@ -873,7 +930,7 @@ emit_urb_indirect_writes(const fs_builder &bld, nir_intrinsic_instr *instr,
|
|||
fs_reg urb_handle = get_mesh_urb_handle(bld, instr->intrinsic);
|
||||
|
||||
const unsigned base_in_dwords = nir_intrinsic_base(instr) +
|
||||
nir_intrinsic_component(instr);
|
||||
component_from_intrinsic(instr);
|
||||
|
||||
/* Use URB write message that allow different offsets per-slot. The offset
|
||||
* is in units of vec4s (128 bits), so we use a write for each component,
|
||||
|
@ -940,7 +997,7 @@ emit_urb_direct_reads(const fs_builder &bld, nir_intrinsic_instr *instr,
|
|||
|
||||
const unsigned offset_in_dwords = nir_intrinsic_base(instr) +
|
||||
nir_src_as_uint(*offset_nir_src) +
|
||||
nir_intrinsic_component(instr);
|
||||
component_from_intrinsic(instr);
|
||||
|
||||
unsigned urb_global_offset = offset_in_dwords / 4;
|
||||
adjust_handle_and_offset(bld, urb_handle, urb_global_offset);
|
||||
|
@ -987,7 +1044,7 @@ emit_urb_indirect_reads(const fs_builder &bld, nir_intrinsic_instr *instr,
|
|||
fs_reg urb_handle = get_mesh_urb_handle(bld, instr->intrinsic);
|
||||
|
||||
const unsigned base_in_dwords = nir_intrinsic_base(instr) +
|
||||
nir_intrinsic_component(instr);
|
||||
component_from_intrinsic(instr);
|
||||
|
||||
for (unsigned c = 0; c < comps; c++) {
|
||||
for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) {
|
||||
|
@ -1072,10 +1129,12 @@ fs_visitor::nir_emit_task_intrinsic(const fs_builder &bld,
|
|||
|
||||
switch (instr->intrinsic) {
|
||||
case nir_intrinsic_store_output:
|
||||
case nir_intrinsic_store_task_payload:
|
||||
emit_task_mesh_store(bld, instr);
|
||||
break;
|
||||
|
||||
case nir_intrinsic_load_output:
|
||||
case nir_intrinsic_load_task_payload:
|
||||
emit_task_mesh_load(bld, instr);
|
||||
break;
|
||||
|
||||
|
@ -1098,10 +1157,10 @@ fs_visitor::nir_emit_mesh_intrinsic(const fs_builder &bld,
|
|||
emit_task_mesh_store(bld, instr);
|
||||
break;
|
||||
|
||||
case nir_intrinsic_load_input:
|
||||
case nir_intrinsic_load_per_vertex_output:
|
||||
case nir_intrinsic_load_per_primitive_output:
|
||||
case nir_intrinsic_load_output:
|
||||
case nir_intrinsic_load_task_payload:
|
||||
emit_task_mesh_load(bld, instr);
|
||||
break;
|
||||
|
||||
|
|
|
@ -258,12 +258,14 @@ lower_mem_access_bit_sizes_instr(nir_builder *b,
|
|||
case nir_intrinsic_load_ssbo:
|
||||
case nir_intrinsic_load_shared:
|
||||
case nir_intrinsic_load_scratch:
|
||||
case nir_intrinsic_load_task_payload:
|
||||
return lower_mem_load_bit_size(b, intrin, devinfo);
|
||||
|
||||
case nir_intrinsic_store_global:
|
||||
case nir_intrinsic_store_ssbo:
|
||||
case nir_intrinsic_store_shared:
|
||||
case nir_intrinsic_store_scratch:
|
||||
case nir_intrinsic_store_task_payload:
|
||||
return lower_mem_store_bit_size(b, intrin, devinfo);
|
||||
|
||||
default:
|
||||
|
|
Loading…
Reference in New Issue