intel/fs: Implement load/store_global with A64 untyped messages

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
This commit is contained in:
Jason Ekstrand 2018-11-14 17:13:57 -06:00
parent b4f0d062cd
commit 1c25bf4373
7 changed files with 273 additions and 1 deletions

View File

@ -437,6 +437,10 @@ static const char *const dp_dc1_msg_type_hsw[32] = {
[HSW_DATAPORT_DC_PORT1_ATOMIC_COUNTER_OP_SIMD4X2] =
"DC 4x2 atomic counter op",
[HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE] = "DC typed surface write",
[GEN9_DATAPORT_DC_PORT1_A64_SCATTERED_READ] = "DC A64 scattered read",
[GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_READ] = "DC A64 untyped surface read",
[GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_WRITE] = "DC A64 untyped surface write",
[GEN8_DATAPORT_DC_PORT1_A64_SCATTERED_WRITE] = "DC A64 scattered write",
[GEN9_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_FLOAT_OP] =
"DC untyped atomic float op",
};
@ -1941,7 +1945,9 @@ brw_disassemble_inst(FILE *file, const struct gen_device_info *devinfo,
case HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ:
case HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE:
case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ:
case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE: {
case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE:
case GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_WRITE:
case GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_READ: {
static const char *simd_modes[] = { "4x2", "16", "8" };
format(file, "SIMD%s, Mask = 0x%x",
simd_modes[msg_ctrl >> 4], msg_ctrl & 0xf);

View File

@ -687,6 +687,68 @@ brw_dp_byte_scattered_rw_desc(const struct gen_device_info *devinfo,
return brw_dp_surface_desc(devinfo, msg_type, msg_control);
}
/**
 * Build the message descriptor for an A64 untyped surface read or write.
 *
 * The message control field carries the value of brw_mdc_cmask(num_channels)
 * in bits 3:0 and the SIMD mode in bits 5:4.  The descriptor always uses the
 * BRW_BTI_STATELESS binding table index.  \p write selects the write message
 * type; otherwise the read message type is used.
 */
static inline uint32_t
brw_dp_a64_untyped_surface_rw_desc(const struct gen_device_info *devinfo,
                                   unsigned exec_size, /**< 0 for SIMD4x2 */
                                   unsigned num_channels,
                                   bool write)
{
   assert(exec_size <= 8 || exec_size == 16);
   assert(devinfo->gen >= 8);

   /* See also MDC_SM3 in the SKL PRM Vol 2d. */
   unsigned simd_mode;
   if (exec_size == 0) {
      simd_mode = 0; /* SIMD4x2 */
   } else if (exec_size <= 8) {
      simd_mode = 2;
   } else {
      simd_mode = 1;
   }

   const unsigned cmask_bits = SET_BITS(brw_mdc_cmask(num_channels), 3, 0);
   const unsigned simd_bits = SET_BITS(simd_mode, 5, 4);
   const unsigned msg_control = cmask_bits | simd_bits;

   if (write) {
      return brw_dp_desc(devinfo, BRW_BTI_STATELESS,
                         GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_WRITE,
                         msg_control);
   }

   return brw_dp_desc(devinfo, BRW_BTI_STATELESS,
                      GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_READ,
                      msg_control);
}
/**
 * Calculate the data size (see MDC_A64_DS in the "Structures" volume of the
 * Skylake PRM).
 *
 * Maps an element count of 1, 2, 4 or 8 to the encoded field value 0, 1, 2
 * or 3 respectively.  Any other count is a caller error and reaches
 * unreachable().
 */
static inline uint32_t
brw_mdc_a64_ds(unsigned elems)
{
   switch (elems) {
   case 1: return 0;
   case 2: return 1;
   case 4: return 2;
   case 8: return 3;
   default:
      unreachable("Unsupported element count for A64 scattered message");
   }
}
/**
 * Build the message descriptor for an A64 byte-scattered read or write.
 *
 * Message control layout as assembled here:
 *   bits 1:0  scattered subtype (always GEN8_A64_SCATTERED_SUBTYPE_BYTE)
 *   bits 3:2  data size, brw_mdc_a64_ds() of the access size in bytes
 *   bit  4    set when exec_size is 16
 *
 * The descriptor always uses the BRW_BTI_STATELESS binding table index.
 */
static inline uint32_t
brw_dp_a64_byte_scattered_rw_desc(const struct gen_device_info *devinfo,
                                  unsigned exec_size, /**< 0 for SIMD4x2 */
                                  unsigned bit_size,
                                  bool write)
{
   assert(exec_size <= 8 || exec_size == 16);
   assert(devinfo->gen >= 8);

   const unsigned subtype_bits =
      SET_BITS(GEN8_A64_SCATTERED_SUBTYPE_BYTE, 1, 0);
   const unsigned data_size_bits =
      SET_BITS(brw_mdc_a64_ds(bit_size / 8), 3, 2);
   const unsigned simd16_bit = SET_BITS(exec_size == 16, 4, 4);
   const unsigned msg_control = subtype_bits | data_size_bits | simd16_bit;

   unsigned msg_type;
   if (write)
      msg_type = GEN8_DATAPORT_DC_PORT1_A64_SCATTERED_WRITE;
   else
      msg_type = GEN9_DATAPORT_DC_PORT1_A64_SCATTERED_READ;

   return brw_dp_desc(devinfo, BRW_BTI_STATELESS, msg_type, msg_control);
}
static inline uint32_t
brw_dp_typed_atomic_desc(const struct gen_device_info *devinfo,
unsigned exec_size,

View File

@ -412,6 +412,19 @@ enum opcode {
SHADER_OPCODE_UNTYPED_SURFACE_WRITE,
SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
/**
* Untyped A64 surface access opcodes.
*
* Source 0: 64-bit address
* Source 1: Operational source
* Source 2: [required] Opcode-specific control immediate, same as source 2
* of the matching non-LOGICAL opcode.
*/
SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL,
SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL,
SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL,
SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL,
SHADER_OPCODE_TYPED_ATOMIC,
SHADER_OPCODE_TYPED_ATOMIC_LOGICAL,
SHADER_OPCODE_TYPED_SURFACE_READ,
@ -1170,12 +1183,22 @@ enum brw_message_target {
#define HSW_DATAPORT_DC_PORT1_ATOMIC_COUNTER_OP 11
#define HSW_DATAPORT_DC_PORT1_ATOMIC_COUNTER_OP_SIMD4X2 12
#define HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE 13
#define GEN9_DATAPORT_DC_PORT1_A64_SCATTERED_READ 0x10
#define GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_READ 0x11
#define GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_WRITE 0x19
#define GEN8_DATAPORT_DC_PORT1_A64_SCATTERED_WRITE 0x1a
#define GEN9_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_FLOAT_OP 0x1b
/* GEN9 */
#define GEN9_DATAPORT_RC_RENDER_TARGET_WRITE 12
#define GEN9_DATAPORT_RC_RENDER_TARGET_READ 13
/* A64 scattered message subtype */
#define GEN8_A64_SCATTERED_SUBTYPE_BYTE 0
#define GEN8_A64_SCATTERED_SUBTYPE_DWORD 1
#define GEN8_A64_SCATTERED_SUBTYPE_QWORD 2
#define GEN8_A64_SCATTERED_SUBTYPE_HWORD 3
/* Dataport special binding table indices: */
#define BRW_BTI_STATELESS 255
#define GEN7_BTI_SLM 254

View File

@ -789,6 +789,14 @@ fs_inst::components_read(unsigned i) const
else
return 1;
case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
assert(src[2].file == IMM);
return 1;
case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
assert(src[2].file == IMM);
return i == 1 ? src[2].ud : 1;
case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
/* Scattered logical opcodes use the following params:
* src[0] Surface coordinates
@ -5207,6 +5215,92 @@ lower_surface_logical_send(const fs_builder &bld, fs_inst *inst)
inst->resize_sources(4);
}
/**
 * Lower one of the SHADER_OPCODE_A64_*_LOGICAL instructions into a
 * SHADER_OPCODE_SEND targeting HSW_SFID_DATAPORT_DATA_CACHE_1.
 *
 * The instruction is rewritten in place: a payload made of the address
 * followed by the data components is assembled with LOAD_PAYLOAD, the
 * message descriptor is computed from the opcode and the immediate in
 * src[2], and the sources are replaced by (desc, ex_desc, payload).
 */
static void
lower_a64_logical_send(const fs_builder &bld, fs_inst *inst)
{
const gen_device_info *devinfo = bld.shader->devinfo;
/* Everything below is read from the logical instruction before its opcode
 * and sources are overwritten at the end of this function.
 */
const fs_reg &addr = inst->src[0];
const fs_reg &src = inst->src[1];
const unsigned src_comps = inst->components_read(1);
assert(inst->src[2].file == IMM);
/* Opcode-specific immediate: passed as num_channels for the untyped
 * opcodes and as bit_size for the byte-scattered opcodes (see the switch
 * below).
 */
const unsigned arg = inst->src[2].ud;
const bool has_side_effects = inst->has_side_effects();
/* If the surface message has side effects and we're a fragment shader, we
 * have to predicate with the sample mask to avoid helper invocations.
 */
if (has_side_effects && bld.shader->stage == MESA_SHADER_FRAGMENT) {
inst->flag_subreg = 2;
inst->predicate = BRW_PREDICATE_NORMAL;
inst->predicate_inverse = false;
fs_reg sample_mask = bld.sample_mask_reg();
/* Copy the sample mask into the flag subregister used for predication. */
const fs_builder ubld = bld.group(1, 0).exec_all();
ubld.MOV(retype(brw_flag_subreg(inst->flag_subreg), sample_mask.type),
sample_mask);
}
/* Add two because the address is 64-bit */
const unsigned dwords = 2 + src_comps;
/* Message length: the dword count scaled by exec_size / 8. */
const unsigned mlen = dwords * (inst->exec_size / 8);
/* Slot 0 holds the address, the remaining slots hold the data components.
 * NOTE(review): the fixed size assumes src_comps <= 4 -- TODO confirm.
 */
fs_reg sources[5];
sources[0] = addr;
for (unsigned i = 0; i < src_comps; i++)
sources[1 + i] = offset(src, bld, i);
const fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, dwords);
bld.LOAD_PAYLOAD(payload, sources, 1 + src_comps, 0);
/* Pick the descriptor builder matching the logical opcode. */
uint32_t desc;
switch (inst->opcode) {
case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
desc = brw_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size,
arg, /* num_channels */
false /* write */);
break;
case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
desc = brw_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size,
arg, /* num_channels */
true /* write */);
break;
case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
desc = brw_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size,
arg, /* bit_size */
false /* write */);
break;
case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
desc = brw_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size,
arg, /* bit_size */
true /* write */);
break;
default:
unreachable("Unknown A64 logical instruction");
}
/* Update the original instruction. */
inst->opcode = SHADER_OPCODE_SEND;
inst->mlen = mlen;
inst->header_size = 0;
/* Uses the value sampled above, before the opcode was changed to SEND. */
inst->send_has_side_effects = has_side_effects;
inst->send_is_volatile = !has_side_effects;
/* Set up SFID and descriptors */
inst->sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
inst->desc = desc;
/* The descriptor value is stored in inst->desc; the desc and ex_desc
 * source operands are both set to an immediate zero.
 */
inst->resize_sources(3);
inst->src[0] = brw_imm_ud(0); /* desc */
inst->src[1] = brw_imm_ud(0); /* ex_desc */
inst->src[2] = payload;
}
static void
lower_varying_pull_constant_logical_send(const fs_builder &bld, fs_inst *inst)
{
@ -5381,6 +5475,13 @@ fs_visitor::lower_logical_sends()
lower_surface_logical_send(ibld, inst);
break;
case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
lower_a64_logical_send(ibld, inst);
break;
case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
lower_varying_pull_constant_logical_send(ibld, inst);
break;
@ -5878,6 +5979,12 @@ get_lowered_simd_width(const struct gen_device_info *devinfo,
case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
return MIN2(16, inst->exec_size);
case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
return devinfo->gen <= 8 ? 8 : MIN2(16, inst->exec_size);
case SHADER_OPCODE_URB_READ_SIMD8:
case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
case SHADER_OPCODE_URB_WRITE_SIMD8:

View File

@ -3971,6 +3971,64 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
break;
}
case nir_intrinsic_load_global: {
assert(devinfo->gen >= 8);
if (nir_intrinsic_align(instr) >= 4) {
assert(nir_dest_bit_size(instr->dest) == 32);
fs_inst *inst = bld.emit(SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL,
dest,
get_nir_src(instr->src[0]), /* Address */
fs_reg(), /* No source data */
brw_imm_ud(instr->num_components));
inst->size_written = instr->num_components *
inst->dst.component_size(inst->exec_size);
} else {
const unsigned bit_size = nir_dest_bit_size(instr->dest);
assert(bit_size <= 32);
assert(nir_dest_num_components(instr->dest) == 1);
brw_reg_type data_type =
brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
bld.emit(SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL,
tmp,
get_nir_src(instr->src[0]), /* Address */
fs_reg(), /* No source data */
brw_imm_ud(bit_size));
bld.MOV(retype(dest, data_type), tmp);
}
break;
}
case nir_intrinsic_store_global:
assert(devinfo->gen >= 8);
if (stage == MESA_SHADER_FRAGMENT)
brw_wm_prog_data(prog_data)->has_side_effects = true;
if (nir_intrinsic_align(instr) >= 4) {
assert(nir_src_bit_size(instr->src[0]) == 32);
bld.emit(SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL,
fs_reg(),
get_nir_src(instr->src[1]), /* Address */
get_nir_src(instr->src[0]), /* Data */
brw_imm_ud(instr->num_components));
} else {
const unsigned bit_size = nir_src_bit_size(instr->src[0]);
assert(bit_size <= 32);
assert(nir_src_num_components(instr->src[0]) == 1);
brw_reg_type data_type =
brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
bld.MOV(tmp, retype(get_nir_src(instr->src[0]), data_type));
bld.emit(SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL,
fs_reg(),
get_nir_src(instr->src[1]), /* Address */
tmp, /* Data */
brw_imm_ud(nir_src_bit_size(instr->src[0])));
}
break;
case nir_intrinsic_load_ssbo: {
assert(devinfo->gen >= 7);

View File

@ -486,6 +486,10 @@ schedule_node::set_latency_gen7(bool is_haswell)
case HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE:
case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ:
case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE:
case GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_WRITE:
case GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_READ:
case GEN8_DATAPORT_DC_PORT1_A64_SCATTERED_WRITE:
case GEN9_DATAPORT_DC_PORT1_A64_SCATTERED_READ:
/* See also SHADER_OPCODE_UNTYPED_SURFACE_READ */
latency = 300;
break;

View File

@ -294,6 +294,14 @@ brw_instruction_name(const struct gen_device_info *devinfo, enum opcode op)
return "untyped_surface_write";
case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
return "untyped_surface_write_logical";
case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
return "a64_untyped_read_logical";
case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
return "a64_untyped_write_logical";
case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
return "a64_byte_scattered_read_logical";
case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
return "a64_byte_scattered_write_logical";
case SHADER_OPCODE_TYPED_ATOMIC:
return "typed_atomic";
case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
@ -1010,6 +1018,8 @@ backend_instruction::has_side_effects() const
case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
case SHADER_OPCODE_BYTE_SCATTERED_WRITE:
case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
case SHADER_OPCODE_TYPED_ATOMIC:
@ -1048,6 +1058,8 @@ backend_instruction::is_volatile() const
case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
case SHADER_OPCODE_BYTE_SCATTERED_READ:
case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
case SHADER_OPCODE_URB_READ_SIMD8:
case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
case VEC4_OPCODE_URB_READ: