intel/compiler: Lower SSBO and shared loads/stores in NIR
We have a bunch of code to do this in the back-end compiler but it's fairly specific to typed surface messages and the way we emit them. This breaks it out into NIR where it's easier to do things a bit more generally. It also means we can easily share the code between the vec4 and FS back-ends if we wish. Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
This commit is contained in:
parent
d34fd81e76
commit
6339aba775
|
@ -85,6 +85,7 @@ COMPILER_FILES = \
|
||||||
compiler/brw_nir_attribute_workarounds.c \
|
compiler/brw_nir_attribute_workarounds.c \
|
||||||
compiler/brw_nir_lower_cs_intrinsics.c \
|
compiler/brw_nir_lower_cs_intrinsics.c \
|
||||||
compiler/brw_nir_lower_image_load_store.c \
|
compiler/brw_nir_lower_image_load_store.c \
|
||||||
|
compiler/brw_nir_lower_mem_access_bit_sizes.c \
|
||||||
compiler/brw_nir_opt_peephole_ffma.c \
|
compiler/brw_nir_opt_peephole_ffma.c \
|
||||||
compiler/brw_nir_tcs_workarounds.c \
|
compiler/brw_nir_tcs_workarounds.c \
|
||||||
compiler/brw_packed_float.c \
|
compiler/brw_packed_float.c \
|
||||||
|
|
|
@ -26,6 +26,7 @@
|
||||||
#include "brw_fs_surface_builder.h"
|
#include "brw_fs_surface_builder.h"
|
||||||
#include "brw_nir.h"
|
#include "brw_nir.h"
|
||||||
#include "util/u_math.h"
|
#include "util/u_math.h"
|
||||||
|
#include "util/bitscan.h"
|
||||||
|
|
||||||
using namespace brw;
|
using namespace brw;
|
||||||
using namespace brw::surface_access;
|
using namespace brw::surface_access;
|
||||||
|
@ -2250,107 +2251,6 @@ fs_visitor::get_indirect_offset(nir_intrinsic_instr *instr)
|
||||||
return get_nir_src(*offset_src);
|
return get_nir_src(*offset_src);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
|
||||||
do_untyped_vector_read(const fs_builder &bld,
|
|
||||||
const fs_reg dest,
|
|
||||||
const fs_reg surf_index,
|
|
||||||
const fs_reg offset_reg,
|
|
||||||
unsigned num_components)
|
|
||||||
{
|
|
||||||
if (type_sz(dest.type) <= 2) {
|
|
||||||
assert(dest.stride == 1);
|
|
||||||
boolean is_const_offset = offset_reg.file == BRW_IMMEDIATE_VALUE;
|
|
||||||
|
|
||||||
if (is_const_offset) {
|
|
||||||
uint32_t start = offset_reg.ud & ~3;
|
|
||||||
uint32_t end = offset_reg.ud + num_components * type_sz(dest.type);
|
|
||||||
end = ALIGN(end, 4);
|
|
||||||
assert (end - start <= 16);
|
|
||||||
|
|
||||||
/* At this point we have 16-bit component/s that have constant
|
|
||||||
* offset aligned to 4-bytes that can be read with untyped_reads.
|
|
||||||
* untyped_read message requires 32-bit aligned offsets.
|
|
||||||
*/
|
|
||||||
unsigned first_component = (offset_reg.ud & 3) / type_sz(dest.type);
|
|
||||||
unsigned num_components_32bit = (end - start) / 4;
|
|
||||||
|
|
||||||
fs_reg read_result =
|
|
||||||
emit_untyped_read(bld, surf_index, brw_imm_ud(start),
|
|
||||||
1 /* dims */,
|
|
||||||
num_components_32bit,
|
|
||||||
BRW_PREDICATE_NONE);
|
|
||||||
shuffle_from_32bit_read(bld, dest, read_result, first_component,
|
|
||||||
num_components);
|
|
||||||
} else {
|
|
||||||
fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD);
|
|
||||||
for (unsigned i = 0; i < num_components; i++) {
|
|
||||||
if (i == 0) {
|
|
||||||
bld.MOV(read_offset, offset_reg);
|
|
||||||
} else {
|
|
||||||
bld.ADD(read_offset, offset_reg,
|
|
||||||
brw_imm_ud(i * type_sz(dest.type)));
|
|
||||||
}
|
|
||||||
/* Non constant offsets are not guaranteed to be aligned 32-bits
|
|
||||||
* so they are read using one byte_scattered_read message
|
|
||||||
* for each component.
|
|
||||||
*/
|
|
||||||
fs_reg read_result =
|
|
||||||
emit_byte_scattered_read(bld, surf_index, read_offset,
|
|
||||||
1 /* dims */, 1,
|
|
||||||
type_sz(dest.type) * 8 /* bit_size */,
|
|
||||||
BRW_PREDICATE_NONE);
|
|
||||||
bld.MOV(offset(dest, bld, i),
|
|
||||||
subscript (read_result, dest.type, 0));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else if (type_sz(dest.type) == 4) {
|
|
||||||
fs_reg read_result = emit_untyped_read(bld, surf_index, offset_reg,
|
|
||||||
1 /* dims */,
|
|
||||||
num_components,
|
|
||||||
BRW_PREDICATE_NONE);
|
|
||||||
read_result.type = dest.type;
|
|
||||||
for (unsigned i = 0; i < num_components; i++)
|
|
||||||
bld.MOV(offset(dest, bld, i), offset(read_result, bld, i));
|
|
||||||
} else if (type_sz(dest.type) == 8) {
|
|
||||||
/* Reading a dvec, so we need to:
|
|
||||||
*
|
|
||||||
* 1. Multiply num_components by 2, to account for the fact that we
|
|
||||||
* need to read 64-bit components.
|
|
||||||
* 2. Shuffle the result of the load to form valid 64-bit elements
|
|
||||||
* 3. Emit a second load (for components z/w) if needed.
|
|
||||||
*/
|
|
||||||
fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD);
|
|
||||||
bld.MOV(read_offset, offset_reg);
|
|
||||||
|
|
||||||
int iters = num_components <= 2 ? 1 : 2;
|
|
||||||
|
|
||||||
/* Load the dvec, the first iteration loads components x/y, the second
|
|
||||||
* iteration, if needed, loads components z/w
|
|
||||||
*/
|
|
||||||
for (int it = 0; it < iters; it++) {
|
|
||||||
/* Compute number of components to read in this iteration */
|
|
||||||
int iter_components = MIN2(2, num_components);
|
|
||||||
num_components -= iter_components;
|
|
||||||
|
|
||||||
/* Read. Since this message reads 32-bit components, we need to
|
|
||||||
* read twice as many components.
|
|
||||||
*/
|
|
||||||
fs_reg read_result = emit_untyped_read(bld, surf_index, read_offset,
|
|
||||||
1 /* dims */,
|
|
||||||
iter_components * 2,
|
|
||||||
BRW_PREDICATE_NONE);
|
|
||||||
|
|
||||||
/* Shuffle the 32-bit load result into valid 64-bit data */
|
|
||||||
shuffle_from_32bit_read(bld, offset(dest, bld, it * 2),
|
|
||||||
read_result, 0, iter_components);
|
|
||||||
|
|
||||||
bld.ADD(read_offset, read_offset, brw_imm_ud(16));
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
unreachable("Unsupported type");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void
|
void
|
||||||
fs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld,
|
fs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld,
|
||||||
nir_intrinsic_instr *instr)
|
nir_intrinsic_instr *instr)
|
||||||
|
@ -3572,93 +3472,64 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
|
||||||
|
|
||||||
case nir_intrinsic_load_shared: {
|
case nir_intrinsic_load_shared: {
|
||||||
assert(devinfo->gen >= 7);
|
assert(devinfo->gen >= 7);
|
||||||
|
assert(stage == MESA_SHADER_COMPUTE);
|
||||||
|
|
||||||
fs_reg surf_index = brw_imm_ud(GEN7_BTI_SLM);
|
const unsigned bit_size = nir_dest_bit_size(instr->dest);
|
||||||
|
fs_reg offset_reg = retype(get_nir_src(instr->src[0]),
|
||||||
|
BRW_REGISTER_TYPE_UD);
|
||||||
|
|
||||||
/* Get the offset to read from */
|
/* Make dest unsigned because that's what the temporary will be */
|
||||||
fs_reg offset_reg;
|
dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
|
||||||
if (nir_src_is_const(instr->src[0])) {
|
|
||||||
offset_reg = brw_imm_ud(instr->const_index[0] +
|
|
||||||
nir_src_as_uint(instr->src[0]));
|
|
||||||
} else {
|
|
||||||
offset_reg = vgrf(glsl_type::uint_type);
|
|
||||||
bld.ADD(offset_reg,
|
|
||||||
retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),
|
|
||||||
brw_imm_ud(instr->const_index[0]));
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Read the vector */
|
/* Read the vector */
|
||||||
do_untyped_vector_read(bld, dest, surf_index, offset_reg,
|
if (nir_intrinsic_align(instr) >= 4) {
|
||||||
instr->num_components);
|
assert(nir_dest_bit_size(instr->dest) == 32);
|
||||||
|
fs_reg read_result = emit_untyped_read(bld, brw_imm_ud(GEN7_BTI_SLM),
|
||||||
|
offset_reg, 1 /* dims */,
|
||||||
|
instr->num_components,
|
||||||
|
BRW_PREDICATE_NONE);
|
||||||
|
for (unsigned i = 0; i < instr->num_components; i++)
|
||||||
|
bld.MOV(offset(dest, bld, i), offset(read_result, bld, i));
|
||||||
|
} else {
|
||||||
|
assert(nir_dest_bit_size(instr->dest) <= 32);
|
||||||
|
assert(nir_dest_num_components(instr->dest) == 1);
|
||||||
|
fs_reg read_result =
|
||||||
|
emit_byte_scattered_read(bld, brw_imm_ud(GEN7_BTI_SLM), offset_reg,
|
||||||
|
1 /* dims */, 1, bit_size,
|
||||||
|
BRW_PREDICATE_NONE);
|
||||||
|
bld.MOV(dest, read_result);
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
case nir_intrinsic_store_shared: {
|
case nir_intrinsic_store_shared: {
|
||||||
assert(devinfo->gen >= 7);
|
assert(devinfo->gen >= 7);
|
||||||
|
assert(stage == MESA_SHADER_COMPUTE);
|
||||||
|
|
||||||
/* Block index */
|
const unsigned bit_size = nir_src_bit_size(instr->src[0]);
|
||||||
fs_reg surf_index = brw_imm_ud(GEN7_BTI_SLM);
|
|
||||||
|
|
||||||
/* Value */
|
|
||||||
fs_reg val_reg = get_nir_src(instr->src[0]);
|
fs_reg val_reg = get_nir_src(instr->src[0]);
|
||||||
|
fs_reg offset_reg = retype(get_nir_src(instr->src[1]),
|
||||||
|
BRW_REGISTER_TYPE_UD);
|
||||||
|
|
||||||
/* Writemask */
|
val_reg.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
|
||||||
unsigned writemask = instr->const_index[1];
|
|
||||||
|
|
||||||
/* get_nir_src() retypes to integer. Be wary of 64-bit types though
|
assert(nir_intrinsic_write_mask(instr) ==
|
||||||
* since the untyped writes below operate in units of 32-bits, which
|
(1 << instr->num_components) - 1);
|
||||||
* means that we need to write twice as many components each time.
|
if (nir_intrinsic_align(instr) >= 4) {
|
||||||
* Also, we have to shuffle 64-bit data to be in the appropriate layout
|
assert(nir_src_bit_size(instr->src[0]) == 32);
|
||||||
* expected by our 32-bit write messages.
|
assert(nir_src_num_components(instr->src[0]) <= 4);
|
||||||
*/
|
emit_untyped_write(bld, brw_imm_ud(GEN7_BTI_SLM), offset_reg, val_reg,
|
||||||
unsigned type_size = 4;
|
1 /* dims */, instr->num_components,
|
||||||
if (nir_src_bit_size(instr->src[0]) == 64) {
|
|
||||||
type_size = 8;
|
|
||||||
val_reg = shuffle_for_32bit_write(bld, val_reg, 0,
|
|
||||||
instr->num_components);
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned type_slots = type_size / 4;
|
|
||||||
|
|
||||||
/* Combine groups of consecutive enabled channels in one write
|
|
||||||
* message. We use ffs to find the first enabled channel and then ffs on
|
|
||||||
* the bit-inverse, down-shifted writemask to determine the length of
|
|
||||||
* the block of enabled bits.
|
|
||||||
*/
|
|
||||||
while (writemask) {
|
|
||||||
unsigned first_component = ffs(writemask) - 1;
|
|
||||||
unsigned length = ffs(~(writemask >> first_component)) - 1;
|
|
||||||
|
|
||||||
/* We can't write more than 2 64-bit components at once. Limit the
|
|
||||||
* length of the write to what we can do and let the next iteration
|
|
||||||
* handle the rest
|
|
||||||
*/
|
|
||||||
if (type_size > 4)
|
|
||||||
length = MIN2(2, length);
|
|
||||||
|
|
||||||
fs_reg offset_reg;
|
|
||||||
if (nir_src_is_const(instr->src[1])) {
|
|
||||||
offset_reg = brw_imm_ud(instr->const_index[0] +
|
|
||||||
nir_src_as_uint(instr->src[1]) +
|
|
||||||
type_size * first_component);
|
|
||||||
} else {
|
|
||||||
offset_reg = vgrf(glsl_type::uint_type);
|
|
||||||
bld.ADD(offset_reg,
|
|
||||||
retype(get_nir_src(instr->src[1]), BRW_REGISTER_TYPE_UD),
|
|
||||||
brw_imm_ud(instr->const_index[0] + type_size * first_component));
|
|
||||||
}
|
|
||||||
|
|
||||||
emit_untyped_write(bld, surf_index, offset_reg,
|
|
||||||
offset(val_reg, bld, first_component * type_slots),
|
|
||||||
1 /* dims */, length * type_slots,
|
|
||||||
BRW_PREDICATE_NONE);
|
BRW_PREDICATE_NONE);
|
||||||
|
} else {
|
||||||
/* Clear the bits in the writemask that we just wrote, then try
|
assert(nir_src_bit_size(instr->src[0]) <= 32);
|
||||||
* again to see if more channels are left.
|
assert(nir_src_num_components(instr->src[0]) == 1);
|
||||||
*/
|
fs_reg write_src = bld.vgrf(BRW_REGISTER_TYPE_UD);
|
||||||
writemask &= (15 << (first_component + length));
|
bld.MOV(write_src, val_reg);
|
||||||
|
emit_byte_scattered_write(bld, brw_imm_ud(GEN7_BTI_SLM), offset_reg,
|
||||||
|
write_src, 1 /* dims */, bit_size,
|
||||||
|
BRW_PREDICATE_NONE);
|
||||||
}
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -4155,13 +4026,32 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
|
||||||
case nir_intrinsic_load_ssbo: {
|
case nir_intrinsic_load_ssbo: {
|
||||||
assert(devinfo->gen >= 7);
|
assert(devinfo->gen >= 7);
|
||||||
|
|
||||||
|
const unsigned bit_size = nir_dest_bit_size(instr->dest);
|
||||||
fs_reg surf_index = get_nir_ssbo_intrinsic_index(bld, instr);
|
fs_reg surf_index = get_nir_ssbo_intrinsic_index(bld, instr);
|
||||||
fs_reg offset_reg = get_nir_src_imm(instr->src[1]);
|
fs_reg offset_reg = retype(get_nir_src(instr->src[1]),
|
||||||
|
BRW_REGISTER_TYPE_UD);
|
||||||
|
|
||||||
|
/* Make dest unsigned because that's what the temporary will be */
|
||||||
|
dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
|
||||||
|
|
||||||
/* Read the vector */
|
/* Read the vector */
|
||||||
do_untyped_vector_read(bld, dest, surf_index, offset_reg,
|
if (nir_intrinsic_align(instr) >= 4) {
|
||||||
instr->num_components);
|
assert(nir_dest_bit_size(instr->dest) == 32);
|
||||||
|
fs_reg read_result = emit_untyped_read(bld, surf_index, offset_reg,
|
||||||
|
1 /* dims */,
|
||||||
|
instr->num_components,
|
||||||
|
BRW_PREDICATE_NONE);
|
||||||
|
for (unsigned i = 0; i < instr->num_components; i++)
|
||||||
|
bld.MOV(offset(dest, bld, i), offset(read_result, bld, i));
|
||||||
|
} else {
|
||||||
|
assert(nir_dest_bit_size(instr->dest) <= 32);
|
||||||
|
assert(nir_dest_num_components(instr->dest) == 1);
|
||||||
|
fs_reg read_result =
|
||||||
|
emit_byte_scattered_read(bld, surf_index, offset_reg,
|
||||||
|
1 /* dims */, 1, bit_size,
|
||||||
|
BRW_PREDICATE_NONE);
|
||||||
|
bld.MOV(dest, read_result);
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -4171,125 +4061,30 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
|
||||||
if (stage == MESA_SHADER_FRAGMENT)
|
if (stage == MESA_SHADER_FRAGMENT)
|
||||||
brw_wm_prog_data(prog_data)->has_side_effects = true;
|
brw_wm_prog_data(prog_data)->has_side_effects = true;
|
||||||
|
|
||||||
fs_reg surf_index = get_nir_ssbo_intrinsic_index(bld, instr);
|
const unsigned bit_size = nir_src_bit_size(instr->src[0]);
|
||||||
|
|
||||||
/* Value */
|
|
||||||
fs_reg val_reg = get_nir_src(instr->src[0]);
|
fs_reg val_reg = get_nir_src(instr->src[0]);
|
||||||
|
fs_reg surf_index = get_nir_ssbo_intrinsic_index(bld, instr);
|
||||||
|
fs_reg offset_reg = retype(get_nir_src(instr->src[2]),
|
||||||
|
BRW_REGISTER_TYPE_UD);
|
||||||
|
|
||||||
/* Writemask */
|
val_reg.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
|
||||||
unsigned writemask = instr->const_index[0];
|
|
||||||
|
|
||||||
/* get_nir_src() retypes to integer. Be wary of 64-bit types though
|
assert(nir_intrinsic_write_mask(instr) ==
|
||||||
* since the untyped writes below operate in units of 32-bits, which
|
(1 << instr->num_components) - 1);
|
||||||
* means that we need to write twice as many components each time.
|
if (nir_intrinsic_align(instr) >= 4) {
|
||||||
* Also, we have to shuffle 64-bit data to be in the appropriate layout
|
assert(nir_src_bit_size(instr->src[0]) == 32);
|
||||||
* expected by our 32-bit write messages.
|
assert(nir_src_num_components(instr->src[0]) <= 4);
|
||||||
*/
|
emit_untyped_write(bld, surf_index, offset_reg, val_reg,
|
||||||
unsigned bit_size = nir_src_bit_size(instr->src[0]);
|
1 /* dims */, instr->num_components,
|
||||||
unsigned type_size = bit_size / 8;
|
BRW_PREDICATE_NONE);
|
||||||
|
} else {
|
||||||
/* Combine groups of consecutive enabled channels in one write
|
assert(nir_src_bit_size(instr->src[0]) <= 32);
|
||||||
* message. We use ffs to find the first enabled channel and then ffs on
|
assert(nir_src_num_components(instr->src[0]) == 1);
|
||||||
* the bit-inverse, down-shifted writemask to determine the num_components
|
fs_reg write_src = bld.vgrf(BRW_REGISTER_TYPE_UD);
|
||||||
* of the block of enabled bits.
|
bld.MOV(write_src, val_reg);
|
||||||
*/
|
emit_byte_scattered_write(bld, surf_index, offset_reg,
|
||||||
while (writemask) {
|
write_src, 1 /* dims */, bit_size,
|
||||||
unsigned first_component = ffs(writemask) - 1;
|
BRW_PREDICATE_NONE);
|
||||||
unsigned num_components = ffs(~(writemask >> first_component)) - 1;
|
|
||||||
fs_reg write_src = offset(val_reg, bld, first_component);
|
|
||||||
|
|
||||||
if (type_size > 4) {
|
|
||||||
/* We can't write more than 2 64-bit components at once. Limit
|
|
||||||
* the num_components of the write to what we can do and let the next
|
|
||||||
* iteration handle the rest.
|
|
||||||
*/
|
|
||||||
num_components = MIN2(2, num_components);
|
|
||||||
write_src = shuffle_for_32bit_write(bld, write_src, 0,
|
|
||||||
num_components);
|
|
||||||
} else if (type_size < 4) {
|
|
||||||
/* For 16-bit types we pack two consecutive values into a 32-bit
|
|
||||||
* word and use an untyped write message. For single values or not
|
|
||||||
* 32-bit-aligned we need to use byte-scattered writes because
|
|
||||||
* untyped writes works with 32-bit components with 32-bit
|
|
||||||
* alignment. byte_scattered_write messages only support one
|
|
||||||
* 16-bit component at a time. As VK_KHR_relaxed_block_layout
|
|
||||||
* could be enabled we can not guarantee that not constant offsets
|
|
||||||
* to be 32-bit aligned for 16-bit types. For example an array, of
|
|
||||||
* 16-bit vec3 with array element stride of 6.
|
|
||||||
*
|
|
||||||
* In the case of 32-bit aligned constant offsets if there is
|
|
||||||
* a 3-components vector we submit one untyped-write message
|
|
||||||
* of 32-bit (first two components), and one byte-scattered
|
|
||||||
* write message (the last component).
|
|
||||||
*/
|
|
||||||
|
|
||||||
if (!nir_src_is_const(instr->src[2]) ||
|
|
||||||
((nir_src_as_uint(instr->src[2]) +
|
|
||||||
type_size * first_component) % 4)) {
|
|
||||||
/* If we use a .yz writemask we also need to emit 2
|
|
||||||
* byte-scattered write messages because of y-component not
|
|
||||||
* being aligned to 32-bit.
|
|
||||||
*/
|
|
||||||
num_components = 1;
|
|
||||||
} else if (num_components * type_size > 4 &&
|
|
||||||
(num_components * type_size % 4)) {
|
|
||||||
/* If the pending components size is not a multiple of 4 bytes
|
|
||||||
* we left the not aligned components for following emits of
|
|
||||||
* length == 1 with byte_scattered_write.
|
|
||||||
*/
|
|
||||||
num_components -= (num_components * type_size % 4) / type_size;
|
|
||||||
} else if (num_components * type_size < 4) {
|
|
||||||
num_components = 1;
|
|
||||||
}
|
|
||||||
/* For num_components == 1 we are also shuffling the component
|
|
||||||
* because byte scattered writes of 16-bit need values to be dword
|
|
||||||
* aligned. Shuffling only one component would be the same as
|
|
||||||
* striding it.
|
|
||||||
*/
|
|
||||||
write_src = shuffle_for_32bit_write(bld, write_src, 0,
|
|
||||||
num_components);
|
|
||||||
}
|
|
||||||
|
|
||||||
fs_reg offset_reg;
|
|
||||||
|
|
||||||
if (nir_src_is_const(instr->src[2])) {
|
|
||||||
offset_reg = brw_imm_ud(nir_src_as_uint(instr->src[2]) +
|
|
||||||
type_size * first_component);
|
|
||||||
} else {
|
|
||||||
offset_reg = vgrf(glsl_type::uint_type);
|
|
||||||
bld.ADD(offset_reg,
|
|
||||||
retype(get_nir_src(instr->src[2]), BRW_REGISTER_TYPE_UD),
|
|
||||||
brw_imm_ud(type_size * first_component));
|
|
||||||
}
|
|
||||||
|
|
||||||
if (type_size < 4 && num_components == 1) {
|
|
||||||
/* Untyped Surface messages have a fixed 32-bit size, so we need
|
|
||||||
* to rely on byte scattered in order to write 16-bit elements.
|
|
||||||
* The byte_scattered_write message needs that every written 16-bit
|
|
||||||
* type to be aligned 32-bits (stride=2).
|
|
||||||
*/
|
|
||||||
emit_byte_scattered_write(bld, surf_index, offset_reg,
|
|
||||||
write_src,
|
|
||||||
1 /* dims */,
|
|
||||||
bit_size,
|
|
||||||
BRW_PREDICATE_NONE);
|
|
||||||
} else {
|
|
||||||
assert(num_components * type_size <= 16);
|
|
||||||
assert((num_components * type_size) % 4 == 0);
|
|
||||||
assert(offset_reg.file != BRW_IMMEDIATE_VALUE ||
|
|
||||||
offset_reg.ud % 4 == 0);
|
|
||||||
unsigned num_slots = (num_components * type_size) / 4;
|
|
||||||
|
|
||||||
emit_untyped_write(bld, surf_index, offset_reg,
|
|
||||||
write_src,
|
|
||||||
1 /* dims */, num_slots,
|
|
||||||
BRW_PREDICATE_NONE);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Clear the bits in the writemask that we just wrote, then try
|
|
||||||
* again to see if more channels are left.
|
|
||||||
*/
|
|
||||||
writemask &= (15 << (first_component + num_components));
|
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
|
@ -714,6 +714,8 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir)
|
||||||
brw_nir_no_indirect_mask(compiler, nir->info.stage);
|
brw_nir_no_indirect_mask(compiler, nir->info.stage);
|
||||||
OPT(nir_lower_indirect_derefs, indirect_mask);
|
OPT(nir_lower_indirect_derefs, indirect_mask);
|
||||||
|
|
||||||
|
OPT(brw_nir_lower_mem_access_bit_sizes);
|
||||||
|
|
||||||
/* Get rid of split copies */
|
/* Get rid of split copies */
|
||||||
nir = brw_nir_optimize(nir, compiler, is_scalar, false);
|
nir = brw_nir_optimize(nir, compiler, is_scalar, false);
|
||||||
|
|
||||||
|
|
|
@ -119,6 +119,8 @@ bool brw_nir_lower_image_load_store(nir_shader *nir,
|
||||||
void brw_nir_rewrite_image_intrinsic(nir_intrinsic_instr *intrin,
|
void brw_nir_rewrite_image_intrinsic(nir_intrinsic_instr *intrin,
|
||||||
nir_ssa_def *index);
|
nir_ssa_def *index);
|
||||||
|
|
||||||
|
bool brw_nir_lower_mem_access_bit_sizes(nir_shader *shader);
|
||||||
|
|
||||||
nir_shader *brw_postprocess_nir(nir_shader *nir,
|
nir_shader *brw_postprocess_nir(nir_shader *nir,
|
||||||
const struct brw_compiler *compiler,
|
const struct brw_compiler *compiler,
|
||||||
bool is_scalar);
|
bool is_scalar);
|
||||||
|
|
|
@ -0,0 +1,313 @@
|
||||||
|
/*
|
||||||
|
* Copyright © 2018 Intel Corporation
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
|
* copy of this software and associated documentation files (the "Software"),
|
||||||
|
* to deal in the Software without restriction, including without limitation
|
||||||
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||||
|
* and/or sell copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice (including the next
|
||||||
|
* paragraph) shall be included in all copies or substantial portions of the
|
||||||
|
* Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||||
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||||
|
* IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "brw_nir.h"
|
||||||
|
#include "compiler/nir/nir_builder.h"
|
||||||
|
#include "util/u_math.h"
|
||||||
|
#include "util/bitscan.h"
|
||||||
|
|
||||||
|
static nir_ssa_def *
|
||||||
|
dup_mem_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin,
|
||||||
|
nir_ssa_def *store_src, int offset,
|
||||||
|
unsigned num_components, unsigned bit_size,
|
||||||
|
unsigned align)
|
||||||
|
{
|
||||||
|
const nir_intrinsic_info *info = &nir_intrinsic_infos[intrin->intrinsic];
|
||||||
|
|
||||||
|
nir_intrinsic_instr *dup =
|
||||||
|
nir_intrinsic_instr_create(b->shader, intrin->intrinsic);
|
||||||
|
|
||||||
|
nir_src *intrin_offset_src = nir_get_io_offset_src(intrin);
|
||||||
|
for (unsigned i = 0; i < info->num_srcs; i++) {
|
||||||
|
assert(intrin->src[i].is_ssa);
|
||||||
|
if (i == 0 && store_src) {
|
||||||
|
assert(!info->has_dest);
|
||||||
|
assert(&intrin->src[i] != intrin_offset_src);
|
||||||
|
dup->src[i] = nir_src_for_ssa(store_src);
|
||||||
|
} else if (&intrin->src[i] == intrin_offset_src) {
|
||||||
|
dup->src[i] = nir_src_for_ssa(nir_iadd_imm(b, intrin->src[i].ssa,
|
||||||
|
offset));
|
||||||
|
} else {
|
||||||
|
dup->src[i] = nir_src_for_ssa(intrin->src[i].ssa);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
dup->num_components = num_components;
|
||||||
|
|
||||||
|
for (unsigned i = 0; i < info->num_indices; i++)
|
||||||
|
dup->const_index[i] = intrin->const_index[i];
|
||||||
|
|
||||||
|
nir_intrinsic_set_align(dup, align, 0);
|
||||||
|
|
||||||
|
if (info->has_dest) {
|
||||||
|
assert(intrin->dest.is_ssa);
|
||||||
|
nir_ssa_dest_init(&dup->instr, &dup->dest,
|
||||||
|
num_components, bit_size,
|
||||||
|
intrin->dest.ssa.name);
|
||||||
|
} else {
|
||||||
|
nir_intrinsic_set_write_mask(dup, (1 << num_components) - 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
nir_builder_instr_insert(b, &dup->instr);
|
||||||
|
|
||||||
|
return info->has_dest ? &dup->dest.ssa : NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool
|
||||||
|
lower_mem_load_bit_size(nir_builder *b, nir_intrinsic_instr *intrin)
|
||||||
|
{
|
||||||
|
assert(intrin->dest.is_ssa);
|
||||||
|
if (intrin->dest.ssa.bit_size == 32)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
const unsigned bit_size = intrin->dest.ssa.bit_size;
|
||||||
|
const unsigned num_components = intrin->dest.ssa.num_components;
|
||||||
|
const unsigned bytes_read = num_components * (bit_size / 8);
|
||||||
|
const unsigned align = nir_intrinsic_align(intrin);
|
||||||
|
|
||||||
|
nir_ssa_def *result[4] = { NULL, };
|
||||||
|
|
||||||
|
nir_src *offset_src = nir_get_io_offset_src(intrin);
|
||||||
|
if (bit_size < 32 && nir_src_is_const(*offset_src)) {
|
||||||
|
/* The offset is constant so we can use a 32-bit load and just shift it
|
||||||
|
* around as needed.
|
||||||
|
*/
|
||||||
|
const int load_offset = nir_src_as_uint(*offset_src) % 4;
|
||||||
|
assert(load_offset % (bit_size / 8) == 0);
|
||||||
|
const unsigned load_comps32 = DIV_ROUND_UP(bytes_read + load_offset, 4);
|
||||||
|
/* A 16-bit vec4 is a 32-bit vec2. We add an extra component in case
|
||||||
|
* we offset into a component with load_offset.
|
||||||
|
*/
|
||||||
|
assert(load_comps32 <= 3);
|
||||||
|
|
||||||
|
nir_ssa_def *load = dup_mem_intrinsic(b, intrin, NULL, -load_offset,
|
||||||
|
load_comps32, 32, 4);
|
||||||
|
nir_ssa_def *unpacked[3];
|
||||||
|
for (unsigned i = 0; i < load_comps32; i++)
|
||||||
|
unpacked[i] = nir_unpack_bits(b, nir_channel(b, load, i), bit_size);
|
||||||
|
|
||||||
|
assert(load_offset % (bit_size / 8) == 0);
|
||||||
|
const unsigned divisor = 32 / bit_size;
|
||||||
|
|
||||||
|
for (unsigned i = 0; i < num_components; i++) {
|
||||||
|
unsigned load_i = i + load_offset / (bit_size / 8);
|
||||||
|
result[i] = nir_channel(b, unpacked[load_i / divisor],
|
||||||
|
load_i % divisor);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
/* Otherwise, we have to break it into smaller loads */
|
||||||
|
unsigned res_idx = 0;
|
||||||
|
int load_offset = 0;
|
||||||
|
while (load_offset < bytes_read) {
|
||||||
|
const unsigned bytes_left = bytes_read - load_offset;
|
||||||
|
unsigned load_bit_size, load_comps;
|
||||||
|
if (align < 4) {
|
||||||
|
load_comps = 1;
|
||||||
|
/* Choose a byte, word, or dword */
|
||||||
|
load_bit_size = util_next_power_of_two(MIN2(bytes_left, 4)) * 8;
|
||||||
|
} else {
|
||||||
|
assert(load_offset % 4 == 0);
|
||||||
|
load_bit_size = 32;
|
||||||
|
load_comps = DIV_ROUND_UP(MIN2(bytes_left, 16), 4);
|
||||||
|
}
|
||||||
|
|
||||||
|
nir_ssa_def *load = dup_mem_intrinsic(b, intrin, NULL, load_offset,
|
||||||
|
load_comps, load_bit_size,
|
||||||
|
align);
|
||||||
|
|
||||||
|
nir_ssa_def *unpacked = nir_bitcast_vector(b, load, bit_size);
|
||||||
|
for (unsigned i = 0; i < unpacked->num_components; i++) {
|
||||||
|
if (res_idx < num_components)
|
||||||
|
result[res_idx++] = nir_channel(b, unpacked, i);
|
||||||
|
}
|
||||||
|
|
||||||
|
load_offset += load_comps * (load_bit_size / 8);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
nir_ssa_def *vec_result = nir_vec(b, result, num_components);
|
||||||
|
nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
|
||||||
|
nir_src_for_ssa(vec_result));
|
||||||
|
nir_instr_remove(&intrin->instr);
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool
|
||||||
|
lower_mem_store_bit_size(nir_builder *b, nir_intrinsic_instr *intrin)
|
||||||
|
{
|
||||||
|
assert(intrin->src[0].is_ssa);
|
||||||
|
nir_ssa_def *value = intrin->src[0].ssa;
|
||||||
|
|
||||||
|
assert(intrin->num_components == value->num_components);
|
||||||
|
const unsigned bit_size = value->bit_size;
|
||||||
|
const unsigned num_components = intrin->num_components;
|
||||||
|
const unsigned bytes_written = num_components * (bit_size / 8);
|
||||||
|
const unsigned align_mul = nir_intrinsic_align_mul(intrin);
|
||||||
|
const unsigned align_offset = nir_intrinsic_align_offset(intrin);
|
||||||
|
const unsigned align = nir_intrinsic_align(intrin);
|
||||||
|
|
||||||
|
nir_component_mask_t writemask = nir_intrinsic_write_mask(intrin);
|
||||||
|
assert(writemask < (1 << num_components));
|
||||||
|
|
||||||
|
if ((value->bit_size <= 32 && num_components == 1) ||
|
||||||
|
(value->bit_size == 32 && writemask == (1 << num_components) - 1))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
nir_src *offset_src = nir_get_io_offset_src(intrin);
|
||||||
|
const bool offset_is_const = nir_src_is_const(*offset_src);
|
||||||
|
const unsigned const_offset =
|
||||||
|
offset_is_const ? nir_src_as_uint(*offset_src) : 0;
|
||||||
|
|
||||||
|
assert(num_components * (bit_size / 8) <= 32);
|
||||||
|
uint32_t byte_mask = 0;
|
||||||
|
for (unsigned i = 0; i < num_components; i++) {
|
||||||
|
if (writemask & (1 << i))
|
||||||
|
byte_mask |= ((1 << (bit_size / 8)) - 1) << i * (bit_size / 8);
|
||||||
|
}
|
||||||
|
|
||||||
|
while (byte_mask) {
|
||||||
|
const int start = ffs(byte_mask) - 1;
|
||||||
|
assert(start % (bit_size / 8) == 0);
|
||||||
|
|
||||||
|
int end;
|
||||||
|
for (end = start + 1; end < bytes_written; end++) {
|
||||||
|
if (!(byte_mask & (1 << end)))
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
/* The size of the current contiguous chunk in bytes */
|
||||||
|
const unsigned chunk_bytes = end - start;
|
||||||
|
|
||||||
|
const bool is_dword_aligned =
|
||||||
|
(align_mul >= 4 && (align_offset + start) % 4 == 0) ||
|
||||||
|
(offset_is_const && (start + const_offset) % 4 == 0);
|
||||||
|
|
||||||
|
unsigned store_comps, store_bit_size, store_align;
|
||||||
|
if (chunk_bytes >= 4 && is_dword_aligned) {
|
||||||
|
store_align = MAX2(align, 4);
|
||||||
|
store_bit_size = 32;
|
||||||
|
store_comps = MIN2(chunk_bytes, 16) / 4;
|
||||||
|
} else {
|
||||||
|
store_align = align;
|
||||||
|
store_comps = 1;
|
||||||
|
store_bit_size = MIN2(chunk_bytes, 4) * 8;
|
||||||
|
/* The bit size must be a power of two */
|
||||||
|
if (store_bit_size == 24)
|
||||||
|
store_bit_size = 16;
|
||||||
|
}
|
||||||
|
|
||||||
|
const unsigned store_bytes = store_comps * (store_bit_size / 8);
|
||||||
|
assert(store_bytes % (bit_size / 8) == 0);
|
||||||
|
const unsigned store_first_src_comp = start / (bit_size / 8);
|
||||||
|
const unsigned store_src_comps = store_bytes / (bit_size / 8);
|
||||||
|
assert(store_first_src_comp + store_src_comps <= num_components);
|
||||||
|
|
||||||
|
unsigned src_swiz[4];
|
||||||
|
for (unsigned i = 0; i < store_src_comps; i++)
|
||||||
|
src_swiz[i] = store_first_src_comp + i;
|
||||||
|
nir_ssa_def *store_value =
|
||||||
|
nir_swizzle(b, value, src_swiz, store_src_comps, false);
|
||||||
|
nir_ssa_def *packed = nir_bitcast_vector(b, store_value, store_bit_size);
|
||||||
|
|
||||||
|
dup_mem_intrinsic(b, intrin, packed, start,
|
||||||
|
store_comps, store_bit_size, store_align);
|
||||||
|
|
||||||
|
byte_mask &= ~(((1u << store_bytes) - 1) << start);
|
||||||
|
}
|
||||||
|
|
||||||
|
nir_instr_remove(&intrin->instr);
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool
|
||||||
|
lower_mem_access_bit_sizes_impl(nir_function_impl *impl)
|
||||||
|
{
|
||||||
|
bool progress = false;
|
||||||
|
|
||||||
|
nir_builder b;
|
||||||
|
nir_builder_init(&b, impl);
|
||||||
|
|
||||||
|
nir_foreach_block(block, impl) {
|
||||||
|
nir_foreach_instr_safe(instr, block) {
|
||||||
|
if (instr->type != nir_instr_type_intrinsic)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
b.cursor = nir_after_instr(instr);
|
||||||
|
|
||||||
|
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
|
||||||
|
switch (intrin->intrinsic) {
|
||||||
|
case nir_intrinsic_load_ssbo:
|
||||||
|
case nir_intrinsic_load_shared:
|
||||||
|
if (lower_mem_load_bit_size(&b, intrin))
|
||||||
|
progress = true;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case nir_intrinsic_store_ssbo:
|
||||||
|
case nir_intrinsic_store_shared:
|
||||||
|
if (lower_mem_store_bit_size(&b, intrin))
|
||||||
|
progress = true;
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (progress) {
|
||||||
|
nir_metadata_preserve(impl, nir_metadata_block_index |
|
||||||
|
nir_metadata_dominance);
|
||||||
|
}
|
||||||
|
|
||||||
|
return progress;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * This pass lowers arbitrary SSBO and shared memory load/store operations to
 * intrinsics which are natively handleable by GEN hardware.  In particular,
 * we have two general types of memory load/store messages:
 *
 *  - Untyped surface read/write:  These can load/store between one and four
 *    dword components to/from a dword-aligned offset.
 *
 *  - Byte scattered read/write:  These can load/store a single byte, word, or
 *    dword scalar to/from an unaligned byte offset.
 *
 * Neither type of message can do a write-masked store.  This pass converts
 * all nir load/store intrinsics into a series of 8, 16, or 32-bit
 * load/store intrinsics with a number of components that we can directly
 * handle in hardware and with a trivial write-mask.
 */
|
||||||
|
bool
|
||||||
|
brw_nir_lower_mem_access_bit_sizes(nir_shader *shader)
|
||||||
|
{
|
||||||
|
bool progress = false;
|
||||||
|
|
||||||
|
nir_foreach_function(func, shader) {
|
||||||
|
if (func->impl && lower_mem_access_bit_sizes_impl(func->impl))
|
||||||
|
progress = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return progress;
|
||||||
|
}
|
|
@ -500,6 +500,11 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
|
||||||
case nir_intrinsic_store_ssbo: {
|
case nir_intrinsic_store_ssbo: {
|
||||||
assert(devinfo->gen >= 7);
|
assert(devinfo->gen >= 7);
|
||||||
|
|
||||||
|
/* brw_nir_lower_mem_access_bit_sizes takes care of this */
|
||||||
|
assert(nir_src_bit_size(instr->src[0]) == 32);
|
||||||
|
assert(nir_intrinsic_write_mask(instr) ==
|
||||||
|
(1 << instr->num_components) - 1);
|
||||||
|
|
||||||
src_reg surf_index = get_nir_ssbo_intrinsic_index(instr);
|
src_reg surf_index = get_nir_ssbo_intrinsic_index(instr);
|
||||||
src_reg offset_reg = retype(get_nir_src_imm(instr->src[2]),
|
src_reg offset_reg = retype(get_nir_src_imm(instr->src[2]),
|
||||||
BRW_REGISTER_TYPE_UD);
|
BRW_REGISTER_TYPE_UD);
|
||||||
|
@ -507,9 +512,6 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
|
||||||
/* Value */
|
/* Value */
|
||||||
src_reg val_reg = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_F, 4);
|
src_reg val_reg = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_F, 4);
|
||||||
|
|
||||||
/* Writemask */
|
|
||||||
unsigned write_mask = instr->const_index[0];
|
|
||||||
|
|
||||||
/* IvyBridge does not have a native SIMD4x2 untyped write message so untyped
|
/* IvyBridge does not have a native SIMD4x2 untyped write message so untyped
|
||||||
* writes will use SIMD8 mode. In order to hide this and keep symmetry across
|
* writes will use SIMD8 mode. In order to hide this and keep symmetry across
|
||||||
* typed and untyped messages and across hardware platforms, the
|
* typed and untyped messages and across hardware platforms, the
|
||||||
|
@ -551,92 +553,18 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
|
||||||
const vec4_builder bld = vec4_builder(this).at_end()
|
const vec4_builder bld = vec4_builder(this).at_end()
|
||||||
.annotate(current_annotation, base_ir);
|
.annotate(current_annotation, base_ir);
|
||||||
|
|
||||||
unsigned type_slots = nir_src_bit_size(instr->src[0]) / 32;
|
emit_untyped_write(bld, surf_index, offset_reg, val_reg,
|
||||||
if (type_slots == 2) {
|
1 /* dims */, instr->num_components /* size */,
|
||||||
dst_reg tmp = dst_reg(this, glsl_type::dvec4_type);
|
BRW_PREDICATE_NONE);
|
||||||
shuffle_64bit_data(tmp, retype(val_reg, tmp.type), true);
|
|
||||||
val_reg = src_reg(retype(tmp, BRW_REGISTER_TYPE_F));
|
|
||||||
}
|
|
||||||
|
|
||||||
uint8_t swizzle[4] = { 0, 0, 0, 0};
|
|
||||||
int num_channels = 0;
|
|
||||||
unsigned skipped_channels = 0;
|
|
||||||
int num_components = instr->num_components;
|
|
||||||
for (int i = 0; i < num_components; i++) {
|
|
||||||
/* Read components Z/W of a dvec from the appropriate place. We will
|
|
||||||
* also have to adjust the swizzle (we do that with the '% 4' below)
|
|
||||||
*/
|
|
||||||
if (i == 2 && type_slots == 2)
|
|
||||||
val_reg = byte_offset(val_reg, REG_SIZE);
|
|
||||||
|
|
||||||
/* Check if this channel needs to be written. If so, record the
|
|
||||||
* channel we need to take the data from in the swizzle array
|
|
||||||
*/
|
|
||||||
int component_mask = 1 << i;
|
|
||||||
int write_test = write_mask & component_mask;
|
|
||||||
if (write_test) {
|
|
||||||
/* If we are writing doubles we have to write 2 channels worth of
|
|
||||||
* of data (64 bits) for each double component.
|
|
||||||
*/
|
|
||||||
swizzle[num_channels++] = (i * type_slots) % 4;
|
|
||||||
if (type_slots == 2)
|
|
||||||
swizzle[num_channels++] = (i * type_slots + 1) % 4;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* If we don't have to write this channel it means we have a gap in the
|
|
||||||
* vector, so write the channels we accumulated until now, if any. Do
|
|
||||||
* the same if this was the last component in the vector, if we have
|
|
||||||
* enough channels for a full vec4 write or if we have processed
|
|
||||||
* components XY of a dvec (since components ZW are not in the same
|
|
||||||
* SIMD register)
|
|
||||||
*/
|
|
||||||
if (!write_test || i == num_components - 1 || num_channels == 4 ||
|
|
||||||
(i == 1 && type_slots == 2)) {
|
|
||||||
if (num_channels > 0) {
|
|
||||||
/* We have channels to write, so update the offset we need to
|
|
||||||
* write at to skip the channels we skipped, if any.
|
|
||||||
*/
|
|
||||||
if (skipped_channels > 0) {
|
|
||||||
if (offset_reg.file == IMM) {
|
|
||||||
offset_reg.ud += 4 * skipped_channels;
|
|
||||||
} else {
|
|
||||||
emit(ADD(dst_reg(offset_reg), offset_reg,
|
|
||||||
brw_imm_ud(4 * skipped_channels)));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Swizzle the data register so we take the data from the channels
|
|
||||||
* we need to write and send the write message. This will write
|
|
||||||
* num_channels consecutive dwords starting at offset.
|
|
||||||
*/
|
|
||||||
val_reg.swizzle =
|
|
||||||
BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
|
|
||||||
emit_untyped_write(bld, surf_index, offset_reg, val_reg,
|
|
||||||
1 /* dims */, num_channels /* size */,
|
|
||||||
BRW_PREDICATE_NONE);
|
|
||||||
|
|
||||||
/* If we have to do a second write we will have to update the
|
|
||||||
* offset so that we jump over the channels we have just written
|
|
||||||
* now.
|
|
||||||
*/
|
|
||||||
skipped_channels = num_channels;
|
|
||||||
|
|
||||||
/* Restart the count for the next write message */
|
|
||||||
num_channels = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* If we didn't write the channel, increase skipped count */
|
|
||||||
if (!write_test)
|
|
||||||
skipped_channels += type_slots;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
case nir_intrinsic_load_ssbo: {
|
case nir_intrinsic_load_ssbo: {
|
||||||
assert(devinfo->gen >= 7);
|
assert(devinfo->gen >= 7);
|
||||||
|
|
||||||
|
/* brw_nir_lower_mem_access_bit_sizes takes care of this */
|
||||||
|
assert(nir_dest_bit_size(instr->dest) == 32);
|
||||||
|
|
||||||
src_reg surf_index = get_nir_ssbo_intrinsic_index(instr);
|
src_reg surf_index = get_nir_ssbo_intrinsic_index(instr);
|
||||||
src_reg offset_reg = retype(get_nir_src_imm(instr->src[1]),
|
src_reg offset_reg = retype(get_nir_src_imm(instr->src[1]),
|
||||||
BRW_REGISTER_TYPE_UD);
|
BRW_REGISTER_TYPE_UD);
|
||||||
|
@ -645,36 +573,10 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
|
||||||
const vec4_builder bld = vec4_builder(this).at_end()
|
const vec4_builder bld = vec4_builder(this).at_end()
|
||||||
.annotate(current_annotation, base_ir);
|
.annotate(current_annotation, base_ir);
|
||||||
|
|
||||||
src_reg read_result;
|
src_reg read_result = emit_untyped_read(bld, surf_index, offset_reg,
|
||||||
|
1 /* dims */, 4 /* size*/,
|
||||||
|
BRW_PREDICATE_NONE);
|
||||||
dst_reg dest = get_nir_dest(instr->dest);
|
dst_reg dest = get_nir_dest(instr->dest);
|
||||||
if (type_sz(dest.type) < 8) {
|
|
||||||
read_result = emit_untyped_read(bld, surf_index, offset_reg,
|
|
||||||
1 /* dims */, 4 /* size*/,
|
|
||||||
BRW_PREDICATE_NONE);
|
|
||||||
} else {
|
|
||||||
src_reg shuffled = src_reg(this, glsl_type::dvec4_type);
|
|
||||||
|
|
||||||
src_reg temp;
|
|
||||||
temp = emit_untyped_read(bld, surf_index, offset_reg,
|
|
||||||
1 /* dims */, 4 /* size*/,
|
|
||||||
BRW_PREDICATE_NONE);
|
|
||||||
emit(MOV(dst_reg(retype(shuffled, temp.type)), temp));
|
|
||||||
|
|
||||||
if (offset_reg.file == IMM)
|
|
||||||
offset_reg.ud += 16;
|
|
||||||
else
|
|
||||||
emit(ADD(dst_reg(offset_reg), offset_reg, brw_imm_ud(16)));
|
|
||||||
|
|
||||||
temp = emit_untyped_read(bld, surf_index, offset_reg,
|
|
||||||
1 /* dims */, 4 /* size*/,
|
|
||||||
BRW_PREDICATE_NONE);
|
|
||||||
emit(MOV(dst_reg(retype(byte_offset(shuffled, REG_SIZE), temp.type)),
|
|
||||||
temp));
|
|
||||||
|
|
||||||
read_result = src_reg(this, glsl_type::dvec4_type);
|
|
||||||
shuffle_64bit_data(dst_reg(read_result), shuffled, false);
|
|
||||||
}
|
|
||||||
|
|
||||||
read_result.type = dest.type;
|
read_result.type = dest.type;
|
||||||
read_result.swizzle = brw_swizzle_for_size(instr->num_components);
|
read_result.swizzle = brw_swizzle_for_size(instr->num_components);
|
||||||
emit(MOV(dest, read_result));
|
emit(MOV(dest, read_result));
|
||||||
|
|
|
@ -78,6 +78,7 @@ libintel_compiler_files = files(
|
||||||
'brw_nir_attribute_workarounds.c',
|
'brw_nir_attribute_workarounds.c',
|
||||||
'brw_nir_lower_cs_intrinsics.c',
|
'brw_nir_lower_cs_intrinsics.c',
|
||||||
'brw_nir_lower_image_load_store.c',
|
'brw_nir_lower_image_load_store.c',
|
||||||
|
'brw_nir_lower_mem_access_bit_sizes.c',
|
||||||
'brw_nir_opt_peephole_ffma.c',
|
'brw_nir_opt_peephole_ffma.c',
|
||||||
'brw_nir_tcs_workarounds.c',
|
'brw_nir_tcs_workarounds.c',
|
||||||
'brw_packed_float.c',
|
'brw_packed_float.c',
|
||||||
|
|
Loading…
Reference in New Issue