From d372abe397316fd8f8e21111e87d925ceda42d56 Mon Sep 17 00:00:00 2001 From: Caio Marcelo de Oliveira Filho Date: Thu, 29 Oct 2020 14:20:39 -0700 Subject: [PATCH] intel/fs: Add surface OWORD BLOCK opcodes Reviewed-by: Jason Ekstrand Part-of: --- src/intel/compiler/brw_eu.h | 20 +++ src/intel/compiler/brw_eu_defines.h | 4 + src/intel/compiler/brw_fs.cpp | 149 +++++++++++++++--- .../compiler/brw_schedule_instructions.cpp | 1 + src/intel/compiler/brw_shader.cpp | 7 + 5 files changed, 156 insertions(+), 25 deletions(-) diff --git a/src/intel/compiler/brw_eu.h b/src/intel/compiler/brw_eu.h index cbb6e51694b..0cf6722437b 100644 --- a/src/intel/compiler/brw_eu.h +++ b/src/intel/compiler/brw_eu.h @@ -781,6 +781,26 @@ brw_dp_dword_scattered_rw_desc(const struct gen_device_info *devinfo, return brw_dp_surface_desc(devinfo, msg_type, msg_control); } +static inline uint32_t +brw_dp_oword_block_rw_desc(const struct gen_device_info *devinfo, + bool align_16B, + unsigned num_dwords, + bool write) +{ /* Returns the surface message descriptor for an OWORD block access: the message type is picked from write/align_16B, and the BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_dwords) encoding is placed in the message control with SET_BITS(..., 2, 0). */ + /* Writes can only have addresses aligned by OWORDs (16 Bytes). */ + assert(!write || align_16B); + + const unsigned msg_type = + write ? GEN7_DATAPORT_DC_OWORD_BLOCK_WRITE : + align_16B ? 
GEN7_DATAPORT_DC_OWORD_BLOCK_READ : + GEN7_DATAPORT_DC_UNALIGNED_OWORD_BLOCK_READ; /* selected only for reads with align_16B == false */ + + const unsigned msg_control = + SET_BITS(BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_dwords), 2, 0); + + return brw_dp_surface_desc(devinfo, msg_type, msg_control); +} + static inline uint32_t brw_dp_a64_untyped_surface_rw_desc(const struct gen_device_info *devinfo, unsigned exec_size, /**< 0 for SIMD4x2 */ diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h index 986b876c251..c1114a78968 100644 --- a/src/intel/compiler/brw_eu_defines.h +++ b/src/intel/compiler/brw_eu_defines.h @@ -415,6 +415,10 @@ enum opcode { VEC4_OPCODE_UNTYPED_SURFACE_WRITE, SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL, + SHADER_OPCODE_OWORD_BLOCK_READ_LOGICAL, + SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL, + SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL, + /** * Untyped A64 surface access opcodes. * diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 42f20695433..1124a245f90 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -839,6 +839,21 @@ fs_inst::components_read(unsigned i) const return 1; } + case SHADER_OPCODE_OWORD_BLOCK_READ_LOGICAL: + case SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL: + assert(src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM); + return 1; + + case SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL: + assert(src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM); + if (i == SURFACE_LOGICAL_SRC_DATA) { + const unsigned comps = src[SURFACE_LOGICAL_SRC_IMM_ARG].ud / exec_size; + assert(comps > 0); + return comps; + } else { + return 1; + } + case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL: assert(src[2].file == IMM); return i == 1 ? 
src[2].ud : 1; @@ -5367,6 +5382,39 @@ emit_predicate_on_sample_mask(const fs_builder &bld, fs_inst *inst) } } +static void +setup_surface_descriptors(const fs_builder &bld, fs_inst *inst, uint32_t desc, + const fs_reg &surface, const fs_reg &surface_handle) +{ + const gen_device_info *devinfo = bld.shader->devinfo; + + /* We must have exactly one of surface and surface_handle */ + assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE)); /* The rest of this helper fills inst->desc, inst->src[0] and inst->src[1] (the ex_desc) for one of the three cases below. */ + + if (surface.file == IMM) { /* immediate surface index: its low 8 bits are OR-ed into the descriptor */ + inst->desc = desc | (surface.ud & 0xff); + inst->src[0] = brw_imm_ud(0); + inst->src[1] = brw_imm_ud(0); /* ex_desc */ + } else if (surface_handle.file != BAD_FILE) { + /* Bindless surface */ + assert(devinfo->gen >= 9); + inst->desc = desc | GEN9_BTI_BINDLESS; + inst->src[0] = brw_imm_ud(0); + + /* We assume that the driver provided the handle in the top 20 bits so + * we can use the surface handle directly as the extended descriptor. + */ + inst->src[1] = retype(surface_handle, BRW_REGISTER_TYPE_UD); + } else { /* non-immediate surface index: an AND with 0xff is emitted and component 0 of the result is passed through src[0] */ + inst->desc = desc; + const fs_builder ubld = bld.exec_all().group(1, 0); + fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD); + ubld.AND(tmp, surface, brw_imm_ud(0xff)); + inst->src[0] = component(tmp, 0); + inst->src[1] = brw_imm_ud(0); /* ex_desc */ + } +} + static void lower_surface_logical_send(const fs_builder &bld, fs_inst *inst) { @@ -5384,9 +5432,6 @@ lower_surface_logical_send(const fs_builder &bld, fs_inst *inst) assert(arg.file == IMM); assert(allow_sample_mask.file == IMM); - /* We must have exactly one of surface and surface_handle */ - assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE)); - /* Calculate the total number of components of the payload. 
*/ const unsigned addr_sz = inst->components_read(SURFACE_LOGICAL_SRC_ADDRESS); const unsigned src_sz = inst->components_read(SURFACE_LOGICAL_SRC_DATA); @@ -5608,28 +5653,7 @@ lower_surface_logical_send(const fs_builder &bld, fs_inst *inst) /* Set up SFID and descriptors */ inst->sfid = sfid; - inst->desc = desc; - if (surface.file == IMM) { - inst->desc |= surface.ud & 0xff; - inst->src[0] = brw_imm_ud(0); - inst->src[1] = brw_imm_ud(0); /* ex_desc */ - } else if (surface_handle.file != BAD_FILE) { - /* Bindless surface */ - assert(devinfo->gen >= 9); - inst->desc |= GEN9_BTI_BINDLESS; - inst->src[0] = brw_imm_ud(0); - - /* We assume that the driver provided the handle in the top 20 bits so - * we can use the surface handle directly as the extended descriptor. - */ - inst->src[1] = retype(surface_handle, BRW_REGISTER_TYPE_UD); - } else { - const fs_builder ubld = bld.exec_all().group(1, 0); - fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD); - ubld.AND(tmp, surface, brw_imm_ud(0xff)); - inst->src[0] = component(tmp, 0); - inst->src[1] = brw_imm_ud(0); /* ex_desc */ - } + setup_surface_descriptors(bld, inst, desc, surface, surface_handle); /* Finally, the payload */ inst->src[2] = payload; @@ -5638,6 +5662,75 @@ lower_surface_logical_send(const fs_builder &bld, fs_inst *inst) inst->resize_sources(4); } +static void +lower_surface_block_logical_send(const fs_builder &bld, fs_inst *inst) +{ /* Rewrites a logical OWORD block read/write instruction in place into SHADER_OPCODE_SEND: a header (mlen 1, header_size 1) carries the address in component 2, and for writes the data goes in src[3] with its length in ex_mlen. */ + const gen_device_info *devinfo = bld.shader->devinfo; + assert(devinfo->gen >= 9); + + /* Get the logical send arguments. 
*/ + const fs_reg &addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS]; + const fs_reg &src = inst->src[SURFACE_LOGICAL_SRC_DATA]; + const fs_reg &surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE]; + const fs_reg &surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE]; + const fs_reg &arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG]; + assert(arg.file == IMM); + assert(inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == BAD_FILE); + assert(inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK].file == BAD_FILE); + + const bool is_stateless = + surface.file == IMM && (surface.ud == BRW_BTI_STATELESS || + surface.ud == GEN8_BTI_STATELESS_NON_COHERENT); + + const bool has_side_effects = inst->has_side_effects(); /* captured while inst->opcode is still the logical opcode; it is overwritten with SHADER_OPCODE_SEND below */ + + const bool align_16B = + inst->opcode != SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL; + + const bool write = inst->opcode == SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL; + + /* The address is stored in the header. See MH_A32_GO and MH_BTS_GO. */ + fs_builder ubld = bld.exec_all().group(8, 0); + fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD); + + if (is_stateless) + ubld.emit(SHADER_OPCODE_SCRATCH_HEADER, header); + else + ubld.MOV(header, brw_imm_d(0)); + + /* Address in OWord units when aligned to OWords. 
*/ + if (align_16B) + ubld.group(1, 0).SHR(component(header, 2), addr, brw_imm_ud(4)); + else + ubld.group(1, 0).MOV(component(header, 2), addr); + + fs_reg data; + unsigned ex_mlen = 0; + if (write) { + const unsigned src_sz = inst->components_read(SURFACE_LOGICAL_SRC_DATA); + data = retype(bld.move_to_vgrf(src, src_sz), BRW_REGISTER_TYPE_UD); + ex_mlen = src_sz * type_sz(src.type) * inst->exec_size / REG_SIZE; /* data size in bytes divided by REG_SIZE */ + } + + inst->opcode = SHADER_OPCODE_SEND; + inst->mlen = 1; + inst->ex_mlen = ex_mlen; + inst->header_size = 1; + inst->send_has_side_effects = has_side_effects; + inst->send_is_volatile = !has_side_effects; + + inst->sfid = GEN7_SFID_DATAPORT_DATA_CACHE; + + const uint32_t desc = brw_dp_oword_block_rw_desc(devinfo, align_16B, + arg.ud, write); /* arg.ud is passed as the num_dwords parameter */ + setup_surface_descriptors(bld, inst, desc, surface, surface_handle); + + inst->src[2] = header; + inst->src[3] = data; + + inst->resize_sources(4); +} + static fs_reg emit_a64_oword_block_header(const fs_builder &bld, const fs_reg &addr) { @@ -6019,6 +6112,12 @@ fs_visitor::lower_logical_sends() lower_surface_logical_send(ibld, inst); break; + case SHADER_OPCODE_OWORD_BLOCK_READ_LOGICAL: + case SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL: + case SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL: + lower_surface_block_logical_send(ibld, inst); + break; + case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL: case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL: case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL: diff --git a/src/intel/compiler/brw_schedule_instructions.cpp b/src/intel/compiler/brw_schedule_instructions.cpp index bed9e793d59..d9f6f9d852f 100644 --- a/src/intel/compiler/brw_schedule_instructions.cpp +++ b/src/intel/compiler/brw_schedule_instructions.cpp @@ -425,6 +425,7 @@ schedule_node::set_latency_gen7(bool is_haswell) case GEN7_SFID_DATAPORT_DATA_CACHE: switch ((inst->desc >> 14) & 0x1f) { case BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ: + case GEN7_DATAPORT_DC_UNALIGNED_OWORD_BLOCK_READ: case 
GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE: /* We have no data for this but assume it's a little faster than * untyped surface read/write. diff --git a/src/intel/compiler/brw_shader.cpp b/src/intel/compiler/brw_shader.cpp index c6d997287c0..dbe28748508 100644 --- a/src/intel/compiler/brw_shader.cpp +++ b/src/intel/compiler/brw_shader.cpp @@ -301,6 +301,12 @@ brw_instruction_name(const struct gen_device_info *devinfo, enum opcode op) return "untyped_surface_write"; case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL: return "untyped_surface_write_logical"; + case SHADER_OPCODE_OWORD_BLOCK_READ_LOGICAL: + return "oword_block_read_logical"; + case SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL: + return "unaligned_oword_block_read_logical"; + case SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL: + return "oword_block_write_logical"; case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL: return "a64_untyped_read_logical"; case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL: @@ -1094,6 +1100,7 @@ backend_instruction::has_side_effects() const case SHADER_OPCODE_RND_MODE: case SHADER_OPCODE_FLOAT_CONTROL_MODE: case FS_OPCODE_SCHEDULING_FENCE: + case SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL: case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL: return true; default: