intel/fs: Implement load/store_global with A64 untyped messages
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
This commit is contained in:
parent
b4f0d062cd
commit
1c25bf4373
|
@ -437,6 +437,10 @@ static const char *const dp_dc1_msg_type_hsw[32] = {
|
|||
[HSW_DATAPORT_DC_PORT1_ATOMIC_COUNTER_OP_SIMD4X2] =
|
||||
"DC 4x2 atomic counter op",
|
||||
[HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE] = "DC typed surface write",
|
||||
[GEN9_DATAPORT_DC_PORT1_A64_SCATTERED_READ] = "DC A64 scattered read",
|
||||
[GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_READ] = "DC A64 untyped surface read",
|
||||
[GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_WRITE] = "DC A64 untyped surface write",
|
||||
[GEN8_DATAPORT_DC_PORT1_A64_SCATTERED_WRITE] = "DC A64 scattered write",
|
||||
[GEN9_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_FLOAT_OP] =
|
||||
"DC untyped atomic float op",
|
||||
};
|
||||
|
@ -1941,7 +1945,9 @@ brw_disassemble_inst(FILE *file, const struct gen_device_info *devinfo,
|
|||
case HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ:
|
||||
case HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE:
|
||||
case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ:
|
||||
case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE: {
|
||||
case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE:
|
||||
case GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_WRITE:
|
||||
case GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_READ: {
|
||||
static const char *simd_modes[] = { "4x2", "16", "8" };
|
||||
format(file, "SIMD%s, Mask = 0x%x",
|
||||
simd_modes[msg_ctrl >> 4], msg_ctrl & 0xf);
|
||||
|
|
|
@ -687,6 +687,68 @@ brw_dp_byte_scattered_rw_desc(const struct gen_device_info *devinfo,
|
|||
return brw_dp_surface_desc(devinfo, msg_type, msg_control);
|
||||
}
|
||||
|
||||
static inline uint32_t
|
||||
brw_dp_a64_untyped_surface_rw_desc(const struct gen_device_info *devinfo,
|
||||
unsigned exec_size, /**< 0 for SIMD4x2 */
|
||||
unsigned num_channels,
|
||||
bool write)
|
||||
{
|
||||
assert(exec_size <= 8 || exec_size == 16);
|
||||
assert(devinfo->gen >= 8);
|
||||
|
||||
unsigned msg_type =
|
||||
write ? GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_WRITE :
|
||||
GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_READ;
|
||||
|
||||
/* See also MDC_SM3 in the SKL PRM Vol 2d. */
|
||||
const unsigned simd_mode = exec_size == 0 ? 0 : /* SIMD4x2 */
|
||||
exec_size <= 8 ? 2 : 1;
|
||||
|
||||
const unsigned msg_control =
|
||||
SET_BITS(brw_mdc_cmask(num_channels), 3, 0) |
|
||||
SET_BITS(simd_mode, 5, 4);
|
||||
|
||||
return brw_dp_desc(devinfo, BRW_BTI_STATELESS, msg_type, msg_control);
|
||||
}
|
||||
|
||||
/**
 * Calculate the data size (see MDC_A64_DS in the "Structures" volume of the
 * Skylake PRM).
 *
 * The encoding is the base-2 logarithm of the element count:
 *
 *    1 element  -> 0
 *    2 elements -> 1
 *    4 elements -> 2
 *    8 elements -> 3
 *
 * \param elems  number of elements per message slot; must be 1, 2, 4 or 8
 * \return the encoded data-size field value
 *
 * Any other element count is treated as a caller bug and reaches
 * unreachable().
 */
static inline uint32_t
brw_mdc_a64_ds(unsigned elems)
{
   switch (elems) {
   case 1: return 0;
   case 2: return 1;
   case 4: return 2;
   case 8: return 3;
   default:
      unreachable("Unsupported element count for A64 scattered message");
   }
}
|
||||
|
||||
static inline uint32_t
|
||||
brw_dp_a64_byte_scattered_rw_desc(const struct gen_device_info *devinfo,
|
||||
unsigned exec_size, /**< 0 for SIMD4x2 */
|
||||
unsigned bit_size,
|
||||
bool write)
|
||||
{
|
||||
assert(exec_size <= 8 || exec_size == 16);
|
||||
assert(devinfo->gen >= 8);
|
||||
|
||||
unsigned msg_type =
|
||||
write ? GEN8_DATAPORT_DC_PORT1_A64_SCATTERED_WRITE :
|
||||
GEN9_DATAPORT_DC_PORT1_A64_SCATTERED_READ;
|
||||
|
||||
const unsigned msg_control =
|
||||
SET_BITS(GEN8_A64_SCATTERED_SUBTYPE_BYTE, 1, 0) |
|
||||
SET_BITS(brw_mdc_a64_ds(bit_size / 8), 3, 2) |
|
||||
SET_BITS(exec_size == 16, 4, 4);
|
||||
|
||||
return brw_dp_desc(devinfo, BRW_BTI_STATELESS, msg_type, msg_control);
|
||||
}
|
||||
|
||||
static inline uint32_t
|
||||
brw_dp_typed_atomic_desc(const struct gen_device_info *devinfo,
|
||||
unsigned exec_size,
|
||||
|
|
|
@ -412,6 +412,19 @@ enum opcode {
|
|||
SHADER_OPCODE_UNTYPED_SURFACE_WRITE,
|
||||
SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
|
||||
|
||||
/**
|
||||
* Untyped A64 surface access opcodes.
|
||||
*
|
||||
* Source 0: 64-bit address
|
||||
* Source 1: Operational source
|
||||
* Source 2: [required] Opcode-specific control immediate, same as source 2
|
||||
* of the matching non-LOGICAL opcode.
|
||||
*/
|
||||
SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL,
|
||||
SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL,
|
||||
SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL,
|
||||
SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL,
|
||||
|
||||
SHADER_OPCODE_TYPED_ATOMIC,
|
||||
SHADER_OPCODE_TYPED_ATOMIC_LOGICAL,
|
||||
SHADER_OPCODE_TYPED_SURFACE_READ,
|
||||
|
@ -1170,12 +1183,22 @@ enum brw_message_target {
|
|||
#define HSW_DATAPORT_DC_PORT1_ATOMIC_COUNTER_OP 11
|
||||
#define HSW_DATAPORT_DC_PORT1_ATOMIC_COUNTER_OP_SIMD4X2 12
|
||||
#define HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE 13
|
||||
#define GEN9_DATAPORT_DC_PORT1_A64_SCATTERED_READ 0x10
|
||||
#define GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_READ 0x11
|
||||
#define GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_WRITE 0x19
|
||||
#define GEN8_DATAPORT_DC_PORT1_A64_SCATTERED_WRITE 0x1a
|
||||
#define GEN9_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_FLOAT_OP 0x1b
|
||||
|
||||
/* GEN9 */
|
||||
#define GEN9_DATAPORT_RC_RENDER_TARGET_WRITE 12
|
||||
#define GEN9_DATAPORT_RC_RENDER_TARGET_READ 13
|
||||
|
||||
/* A64 scattered message subtype */
|
||||
#define GEN8_A64_SCATTERED_SUBTYPE_BYTE 0
|
||||
#define GEN8_A64_SCATTERED_SUBTYPE_DWORD 1
|
||||
#define GEN8_A64_SCATTERED_SUBTYPE_QWORD 2
|
||||
#define GEN8_A64_SCATTERED_SUBTYPE_HWORD 3
|
||||
|
||||
/* Dataport special binding table indices: */
|
||||
#define BRW_BTI_STATELESS 255
|
||||
#define GEN7_BTI_SLM 254
|
||||
|
|
|
@ -789,6 +789,14 @@ fs_inst::components_read(unsigned i) const
|
|||
else
|
||||
return 1;
|
||||
|
||||
case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
|
||||
assert(src[2].file == IMM);
|
||||
return 1;
|
||||
|
||||
case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
|
||||
assert(src[2].file == IMM);
|
||||
return i == 1 ? src[2].ud : 1;
|
||||
|
||||
case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
|
||||
/* Scattered logical opcodes use the following params:
|
||||
* src[0] Surface coordinates
|
||||
|
@ -5207,6 +5215,92 @@ lower_surface_logical_send(const fs_builder &bld, fs_inst *inst)
|
|||
inst->resize_sources(4);
|
||||
}
|
||||
|
||||
static void
|
||||
lower_a64_logical_send(const fs_builder &bld, fs_inst *inst)
|
||||
{
|
||||
const gen_device_info *devinfo = bld.shader->devinfo;
|
||||
|
||||
const fs_reg &addr = inst->src[0];
|
||||
const fs_reg &src = inst->src[1];
|
||||
const unsigned src_comps = inst->components_read(1);
|
||||
assert(inst->src[2].file == IMM);
|
||||
const unsigned arg = inst->src[2].ud;
|
||||
const bool has_side_effects = inst->has_side_effects();
|
||||
|
||||
/* If the surface message has side effects and we're a fragment shader, we
|
||||
* have to predicate with the sample mask to avoid helper invocations.
|
||||
*/
|
||||
if (has_side_effects && bld.shader->stage == MESA_SHADER_FRAGMENT) {
|
||||
inst->flag_subreg = 2;
|
||||
inst->predicate = BRW_PREDICATE_NORMAL;
|
||||
inst->predicate_inverse = false;
|
||||
|
||||
fs_reg sample_mask = bld.sample_mask_reg();
|
||||
const fs_builder ubld = bld.group(1, 0).exec_all();
|
||||
ubld.MOV(retype(brw_flag_subreg(inst->flag_subreg), sample_mask.type),
|
||||
sample_mask);
|
||||
}
|
||||
|
||||
/* Add two because the address is 64-bit */
|
||||
const unsigned dwords = 2 + src_comps;
|
||||
const unsigned mlen = dwords * (inst->exec_size / 8);
|
||||
|
||||
fs_reg sources[5];
|
||||
|
||||
sources[0] = addr;
|
||||
|
||||
for (unsigned i = 0; i < src_comps; i++)
|
||||
sources[1 + i] = offset(src, bld, i);
|
||||
|
||||
const fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, dwords);
|
||||
bld.LOAD_PAYLOAD(payload, sources, 1 + src_comps, 0);
|
||||
|
||||
uint32_t desc;
|
||||
switch (inst->opcode) {
|
||||
case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
|
||||
desc = brw_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size,
|
||||
arg, /* num_channels */
|
||||
false /* write */);
|
||||
break;
|
||||
|
||||
case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
|
||||
desc = brw_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size,
|
||||
arg, /* num_channels */
|
||||
true /* write */);
|
||||
break;
|
||||
|
||||
case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
|
||||
desc = brw_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size,
|
||||
arg, /* bit_size */
|
||||
false /* write */);
|
||||
break;
|
||||
|
||||
case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
|
||||
desc = brw_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size,
|
||||
arg, /* bit_size */
|
||||
true /* write */);
|
||||
break;
|
||||
|
||||
default:
|
||||
unreachable("Unknown A64 logical instruction");
|
||||
}
|
||||
|
||||
/* Update the original instruction. */
|
||||
inst->opcode = SHADER_OPCODE_SEND;
|
||||
inst->mlen = mlen;
|
||||
inst->header_size = 0;
|
||||
inst->send_has_side_effects = has_side_effects;
|
||||
inst->send_is_volatile = !has_side_effects;
|
||||
|
||||
/* Set up SFID and descriptors */
|
||||
inst->sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
|
||||
inst->desc = desc;
|
||||
inst->resize_sources(3);
|
||||
inst->src[0] = brw_imm_ud(0); /* desc */
|
||||
inst->src[1] = brw_imm_ud(0); /* ex_desc */
|
||||
inst->src[2] = payload;
|
||||
}
|
||||
|
||||
static void
|
||||
lower_varying_pull_constant_logical_send(const fs_builder &bld, fs_inst *inst)
|
||||
{
|
||||
|
@ -5381,6 +5475,13 @@ fs_visitor::lower_logical_sends()
|
|||
lower_surface_logical_send(ibld, inst);
|
||||
break;
|
||||
|
||||
case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
|
||||
case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
|
||||
case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
|
||||
case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
|
||||
lower_a64_logical_send(ibld, inst);
|
||||
break;
|
||||
|
||||
case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
|
||||
lower_varying_pull_constant_logical_send(ibld, inst);
|
||||
break;
|
||||
|
@ -5878,6 +5979,12 @@ get_lowered_simd_width(const struct gen_device_info *devinfo,
|
|||
case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
|
||||
return MIN2(16, inst->exec_size);
|
||||
|
||||
case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
|
||||
case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
|
||||
case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
|
||||
case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
|
||||
return devinfo->gen <= 8 ? 8 : MIN2(16, inst->exec_size);
|
||||
|
||||
case SHADER_OPCODE_URB_READ_SIMD8:
|
||||
case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
|
||||
case SHADER_OPCODE_URB_WRITE_SIMD8:
|
||||
|
|
|
@ -3971,6 +3971,64 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
|
|||
break;
|
||||
}
|
||||
|
||||
case nir_intrinsic_load_global: {
|
||||
assert(devinfo->gen >= 8);
|
||||
|
||||
if (nir_intrinsic_align(instr) >= 4) {
|
||||
assert(nir_dest_bit_size(instr->dest) == 32);
|
||||
fs_inst *inst = bld.emit(SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL,
|
||||
dest,
|
||||
get_nir_src(instr->src[0]), /* Address */
|
||||
fs_reg(), /* No source data */
|
||||
brw_imm_ud(instr->num_components));
|
||||
inst->size_written = instr->num_components *
|
||||
inst->dst.component_size(inst->exec_size);
|
||||
} else {
|
||||
const unsigned bit_size = nir_dest_bit_size(instr->dest);
|
||||
assert(bit_size <= 32);
|
||||
assert(nir_dest_num_components(instr->dest) == 1);
|
||||
brw_reg_type data_type =
|
||||
brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
|
||||
fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
|
||||
bld.emit(SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL,
|
||||
tmp,
|
||||
get_nir_src(instr->src[0]), /* Address */
|
||||
fs_reg(), /* No source data */
|
||||
brw_imm_ud(bit_size));
|
||||
bld.MOV(retype(dest, data_type), tmp);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case nir_intrinsic_store_global:
|
||||
assert(devinfo->gen >= 8);
|
||||
|
||||
if (stage == MESA_SHADER_FRAGMENT)
|
||||
brw_wm_prog_data(prog_data)->has_side_effects = true;
|
||||
|
||||
if (nir_intrinsic_align(instr) >= 4) {
|
||||
assert(nir_src_bit_size(instr->src[0]) == 32);
|
||||
bld.emit(SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL,
|
||||
fs_reg(),
|
||||
get_nir_src(instr->src[1]), /* Address */
|
||||
get_nir_src(instr->src[0]), /* Data */
|
||||
brw_imm_ud(instr->num_components));
|
||||
} else {
|
||||
const unsigned bit_size = nir_src_bit_size(instr->src[0]);
|
||||
assert(bit_size <= 32);
|
||||
assert(nir_src_num_components(instr->src[0]) == 1);
|
||||
brw_reg_type data_type =
|
||||
brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
|
||||
fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
|
||||
bld.MOV(tmp, retype(get_nir_src(instr->src[0]), data_type));
|
||||
bld.emit(SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL,
|
||||
fs_reg(),
|
||||
get_nir_src(instr->src[1]), /* Address */
|
||||
tmp, /* Data */
|
||||
brw_imm_ud(nir_src_bit_size(instr->src[0])));
|
||||
}
|
||||
break;
|
||||
|
||||
case nir_intrinsic_load_ssbo: {
|
||||
assert(devinfo->gen >= 7);
|
||||
|
||||
|
|
|
@ -486,6 +486,10 @@ schedule_node::set_latency_gen7(bool is_haswell)
|
|||
case HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE:
|
||||
case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ:
|
||||
case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE:
|
||||
case GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_WRITE:
|
||||
case GEN8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_READ:
|
||||
case GEN8_DATAPORT_DC_PORT1_A64_SCATTERED_WRITE:
|
||||
case GEN9_DATAPORT_DC_PORT1_A64_SCATTERED_READ:
|
||||
/* See also SHADER_OPCODE_UNTYPED_SURFACE_READ */
|
||||
latency = 300;
|
||||
break;
|
||||
|
|
|
@ -294,6 +294,14 @@ brw_instruction_name(const struct gen_device_info *devinfo, enum opcode op)
|
|||
return "untyped_surface_write";
|
||||
case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
|
||||
return "untyped_surface_write_logical";
|
||||
case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
|
||||
return "a64_untyped_read_logical";
|
||||
case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
|
||||
return "a64_untyped_write_logical";
|
||||
case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
|
||||
return "a64_byte_scattered_read_logical";
|
||||
case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
|
||||
return "a64_byte_scattered_write_logical";
|
||||
case SHADER_OPCODE_TYPED_ATOMIC:
|
||||
return "typed_atomic";
|
||||
case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
|
||||
|
@ -1010,6 +1018,8 @@ backend_instruction::has_side_effects() const
|
|||
case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
|
||||
case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
|
||||
case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
|
||||
case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
|
||||
case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
|
||||
case SHADER_OPCODE_BYTE_SCATTERED_WRITE:
|
||||
case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
|
||||
case SHADER_OPCODE_TYPED_ATOMIC:
|
||||
|
@ -1048,6 +1058,8 @@ backend_instruction::is_volatile() const
|
|||
case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
|
||||
case SHADER_OPCODE_BYTE_SCATTERED_READ:
|
||||
case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
|
||||
case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
|
||||
case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
|
||||
case SHADER_OPCODE_URB_READ_SIMD8:
|
||||
case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
|
||||
case VEC4_OPCODE_URB_READ:
|
||||
|
|
Loading…
Reference in New Issue