diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index fafc5fc22c9..92a9c9902e4 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -5368,6 +5368,15 @@ lower_surface_logical_send(const fs_builder &bld, fs_inst *inst) inst->opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL || inst->opcode == SHADER_OPCODE_TYPED_ATOMIC_LOGICAL; + const bool is_surface_access = is_typed_access || + inst->opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL || + inst->opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL || + inst->opcode == SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL; + + const bool is_stateless = + surface.file == IMM && (surface.ud == BRW_BTI_STATELESS || + surface.ud == GEN8_BTI_STATELESS_NON_COHERENT); + const bool has_side_effects = inst->has_side_effects(); fs_reg sample_mask = has_side_effects ? bld.sample_mask_reg() : fs_reg(brw_imm_d(0xffff)); @@ -5381,25 +5390,63 @@ lower_surface_logical_send(const fs_builder &bld, fs_inst *inst) * we don't attempt to implement sample masks via predication for such * messages prior to Gen9, since we have to provide a header anyway. On * Gen11+ the header has been removed so we can only use predication. + * + * For all stateless A32 messages, we also need a header */ fs_reg header; - if (devinfo->gen < 9 && is_typed_access) { + if ((devinfo->gen < 9 && is_typed_access) || is_stateless) { fs_builder ubld = bld.exec_all().group(8, 0); header = ubld.vgrf(BRW_REGISTER_TYPE_UD); ubld.MOV(header, brw_imm_d(0)); - ubld.group(1, 0).MOV(component(header, 7), sample_mask); + if (is_stateless) { + /* Both the typed and scattered byte/dword A32 messages take a buffer + * base address in R0.5:[31:0] (See MH1_A32_PSM for typed messages or + * MH_A32_GO for byte/dword scattered messages in the SKL PRM Vol. 2d + * for more details.) This is conveniently where the HW places the + * scratch surface base address. + * + * From the SKL PRM Vol. 
7 "Per-Thread Scratch Space": + * + * "When a thread becomes 'active' it is allocated a portion of + * scratch space, sized according to PerThreadScratchSpace. The + * starting location of each thread’s scratch space allocation, + * ScratchSpaceOffset, is passed in the thread payload in + * R0.5[31:10] and is specified as a 1KB-granular offset from the + * GeneralStateBaseAddress. The computation of ScratchSpaceOffset + * includes the starting address of the stage’s scratch space + * allocation, as programmed by ScratchSpaceBasePointer." + * + * The base address is passed in bits R0.5[31:10] and the bottom 10 + * bits of R0.5 are used for other things. Therefore, we have to + * mask off the bottom 10 bits so that we don't get a garbage base + * address. + */ + ubld.group(1, 0).AND(component(header, 5), + retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD), + brw_imm_ud(0xfffffc00)); + } + if (is_surface_access) + ubld.group(1, 0).MOV(component(header, 7), sample_mask); } const unsigned header_sz = header.file != BAD_FILE ? 1 : 0; fs_reg payload, payload2; unsigned mlen, ex_mlen = 0; - if (devinfo->gen >= 9) { + if (devinfo->gen >= 9 && + (src.file == BAD_FILE || header.file == BAD_FILE)) { /* We have split sends on gen9 and above */ - assert(header.file == BAD_FILE); - payload = bld.move_to_vgrf(addr, addr_sz); - payload2 = bld.move_to_vgrf(src, src_sz); - mlen = addr_sz * (inst->exec_size / 8); - ex_mlen = src_sz * (inst->exec_size / 8); + if (header.file == BAD_FILE) { + payload = bld.move_to_vgrf(addr, addr_sz); + payload2 = bld.move_to_vgrf(src, src_sz); + mlen = addr_sz * (inst->exec_size / 8); + ex_mlen = src_sz * (inst->exec_size / 8); + } else { + assert(src.file == BAD_FILE); + payload = header; + payload2 = bld.move_to_vgrf(addr, addr_sz); + mlen = header_sz; + ex_mlen = addr_sz * (inst->exec_size / 8); + } } else { /* Allocate space for the payload. 
*/ const unsigned sz = header_sz + addr_sz + src_sz; @@ -5426,8 +5473,8 @@ lower_surface_logical_send(const fs_builder &bld, fs_inst *inst) /* Predicate the instruction on the sample mask if no header is * provided. */ - if (header.file == BAD_FILE && sample_mask.file != BAD_FILE && - sample_mask.file != IMM) { + if ((header.file == BAD_FILE || !is_surface_access) && + sample_mask.file != BAD_FILE && sample_mask.file != IMM) { const fs_builder ubld = bld.group(1, 0).exec_all(); if (inst->predicate) { assert(inst->predicate == BRW_PREDICATE_NORMAL); diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h index ede5b0a701c..fd5a1a9071a 100644 --- a/src/intel/compiler/brw_fs.h +++ b/src/intel/compiler/brw_fs.h @@ -228,6 +228,9 @@ public: nir_intrinsic_instr *instr); fs_reg get_nir_ssbo_intrinsic_index(const brw::fs_builder &bld, nir_intrinsic_instr *instr); + fs_reg swizzle_nir_scratch_addr(const brw::fs_builder &bld, + const fs_reg &addr, + bool in_dwords); void nir_emit_intrinsic(const brw::fs_builder &bld, nir_intrinsic_instr *instr); void nir_emit_tes_intrinsic(const brw::fs_builder &bld, @@ -341,6 +344,7 @@ public: int *push_constant_loc; fs_reg subgroup_id; + fs_reg scratch_base; fs_reg frag_depth; fs_reg frag_stencil; fs_reg sample_mask; diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp index 036aa58cc7c..65beebf6d09 100644 --- a/src/intel/compiler/brw_fs_generator.cpp +++ b/src/intel/compiler/brw_fs_generator.cpp @@ -2062,7 +2062,15 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width, case SHADER_OPCODE_SEND: generate_send(inst, dst, src[0], src[1], src[2], inst->ex_mlen > 0 ? 
src[3] : brw_null_reg()); - send_count++; + if ((inst->desc & 0xff) == BRW_BTI_STATELESS || + (inst->desc & 0xff) == GEN8_BTI_STATELESS_NON_COHERENT) { + if (inst->size_written) + fill_count++; + else + spill_count++; + } else { + send_count++; + } break; case SHADER_OPCODE_GET_BUFFER_SIZE: diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index d63ef8c3247..d767b123036 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -42,6 +42,7 @@ fs_visitor::emit_nir_code() nir_setup_outputs(); nir_setup_uniforms(); nir_emit_system_values(); + last_scratch = ALIGN(nir->scratch_size, 4) * dispatch_width; nir_emit_impl(nir_shader_get_entrypoint((nir_shader *)nir)); } @@ -4023,6 +4024,61 @@ image_intrinsic_coord_components(nir_intrinsic_instr *instr) } } +/** + * The offsets we get from NIR act as if each SIMD channel has its own blob + * of contiguous space. However, if we actually place each SIMD channel in + * its own space, we end up with terrible cache performance because each SIMD + * channel accesses a different cache line even when they're all accessing the + * same byte offset. To deal with this problem, we swizzle the address using + * a simple algorithm which ensures that any time a SIMD message reads or + * writes the same address, it's all in the same cache line. We have to keep + * the bottom two bits fixed so that we can read/write up to a dword at a time + * and the individual element is contiguous. We do this by splitting the + * address as follows: + * + * 31 4-6 2 0 + * +-------------------------------+------------+----------+ + * | Hi address bits | chan index | addr low | + * +-------------------------------+------------+----------+ + * + * In other words, the bottom two address bits stay, and the top 30 get + * shifted up so that we can stick the SIMD channel index in the middle.
This + * way, we can access 8, 16, or 32-bit elements and, when accessing a 32-bit + * element at the same logical offset, the scratch read/write instruction + * acts on contiguous elements and we get good cache locality. + */ +fs_reg +fs_visitor::swizzle_nir_scratch_addr(const brw::fs_builder &bld, + const fs_reg &nir_addr, + bool in_dwords) +{ + const fs_reg &chan_index = + nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]; + const unsigned chan_index_bits = ffs(dispatch_width) - 1; + + fs_reg addr = bld.vgrf(BRW_REGISTER_TYPE_UD); + if (in_dwords) { + /* In this case, we know the address is aligned to a DWORD and we want + * the final address in DWORDs, so the net shift is chan_index_bits - 2. + */ + bld.SHL(addr, nir_addr, brw_imm_ud(chan_index_bits - 2)); + bld.OR(addr, addr, chan_index); + } else { + /* This case is substantially more annoying because we have to pay + * attention to those pesky two bottom bits. + */ + fs_reg addr_hi = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.AND(addr_hi, nir_addr, brw_imm_ud(~0x3u)); + bld.SHL(addr_hi, addr_hi, brw_imm_ud(chan_index_bits)); + fs_reg chan_addr = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.SHL(chan_addr, chan_index, brw_imm_ud(2)); + bld.AND(addr, nir_addr, brw_imm_ud(0x3u)); + bld.OR(addr, addr, addr_hi); + bld.OR(addr, addr, chan_addr); + } + return addr; +} + void fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr) { @@ -4682,6 +4738,99 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr break; } + case nir_intrinsic_load_scratch: { + assert(devinfo->gen >= 7); + + assert(nir_dest_num_components(instr->dest) == 1); + const unsigned bit_size = nir_dest_bit_size(instr->dest); + fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; + + if (devinfo->gen >= 8) { + srcs[SURFACE_LOGICAL_SRC_SURFACE] = + brw_imm_ud(GEN8_BTI_STATELESS_NON_COHERENT); + } else { + srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(BRW_BTI_STATELESS); + } + + srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = 
brw_imm_ud(bit_size); + const fs_reg nir_addr = get_nir_src(instr->src[0]); + + /* Make dest unsigned because that's what the temporary will be */ + dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); + + /* Read the vector */ + if (nir_intrinsic_align(instr) >= 4) { + assert(nir_dest_bit_size(instr->dest) == 32); + + /* The offset for a DWORD scattered message is in dwords. */ + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = + swizzle_nir_scratch_addr(bld, nir_addr, true); + + bld.emit(SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL, + dest, srcs, SURFACE_LOGICAL_NUM_SRCS); + } else { + assert(nir_dest_bit_size(instr->dest) <= 32); + + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = + swizzle_nir_scratch_addr(bld, nir_addr, false); + + fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL, + read_result, srcs, SURFACE_LOGICAL_NUM_SRCS); + bld.MOV(dest, read_result); + } + break; + } + + case nir_intrinsic_store_scratch: { + assert(devinfo->gen >= 7); + + assert(nir_src_num_components(instr->src[0]) == 1); + const unsigned bit_size = nir_src_bit_size(instr->src[0]); + fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; + + if (devinfo->gen >= 8) { + srcs[SURFACE_LOGICAL_SRC_SURFACE] = + brw_imm_ud(GEN8_BTI_STATELESS_NON_COHERENT); + } else { + srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(BRW_BTI_STATELESS); + } + + srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); + const fs_reg nir_addr = get_nir_src(instr->src[1]); + + fs_reg data = get_nir_src(instr->src[0]); + data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); + + assert(nir_intrinsic_write_mask(instr) == + (1u << instr->num_components) - 1); + if (nir_intrinsic_align(instr) >= 4) { + assert(nir_src_bit_size(instr->src[0]) == 32); + srcs[SURFACE_LOGICAL_SRC_DATA] = data; + + /* The offset for a DWORD scattered message is in dwords. 
*/ + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = + swizzle_nir_scratch_addr(bld, nir_addr, true); + + bld.emit(SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL, + fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); + } else { + assert(nir_src_bit_size(instr->src[0]) <= 32); + + srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD); + bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data); + + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = + swizzle_nir_scratch_addr(bld, nir_addr, false); + + bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL, + fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); + } + break; + } + case nir_intrinsic_load_subgroup_size: /* This should only happen for fragment shaders because every other case * is lowered in NIR so we can optimize on it. diff --git a/src/intel/compiler/brw_nir_lower_mem_access_bit_sizes.c b/src/intel/compiler/brw_nir_lower_mem_access_bit_sizes.c index 564cb6dabe3..ef9aa206b44 100644 --- a/src/intel/compiler/brw_nir_lower_mem_access_bit_sizes.c +++ b/src/intel/compiler/brw_nir_lower_mem_access_bit_sizes.c @@ -77,8 +77,12 @@ static bool lower_mem_load_bit_size(nir_builder *b, nir_intrinsic_instr *intrin, const struct gen_device_info *devinfo) { + const bool needs_scalar = + intrin->intrinsic == nir_intrinsic_load_scratch; + assert(intrin->dest.is_ssa); - if (intrin->dest.ssa.bit_size == 32) + if (intrin->dest.ssa.bit_size == 32 && + (!needs_scalar || intrin->num_components == 1)) return false; const unsigned bit_size = intrin->dest.ssa.bit_size; @@ -119,7 +123,8 @@ lower_mem_load_bit_size(nir_builder *b, nir_intrinsic_instr *intrin, } else { assert(load_offset % 4 == 0); load_bit_size = 32; - load_comps = DIV_ROUND_UP(MIN2(bytes_left, 16), 4); + load_comps = needs_scalar ? 
1 : + DIV_ROUND_UP(MIN2(bytes_left, 16), 4); } loads[num_loads++] = dup_mem_intrinsic(b, intrin, NULL, load_offset, @@ -144,6 +149,9 @@ static bool lower_mem_store_bit_size(nir_builder *b, nir_intrinsic_instr *intrin, const struct gen_device_info *devinfo) { + const bool needs_scalar = + intrin->intrinsic == nir_intrinsic_store_scratch; + assert(intrin->src[0].is_ssa); nir_ssa_def *value = intrin->src[0].ssa; @@ -159,7 +167,9 @@ lower_mem_store_bit_size(nir_builder *b, nir_intrinsic_instr *intrin, assert(writemask < (1 << num_components)); if ((value->bit_size <= 32 && num_components == 1) || - (value->bit_size == 32 && writemask == (1 << num_components) - 1)) + (value->bit_size == 32 && + writemask == (1 << num_components) - 1 && + !needs_scalar)) return false; nir_src *offset_src = nir_get_io_offset_src(intrin); @@ -180,7 +190,6 @@ lower_mem_store_bit_size(nir_builder *b, nir_intrinsic_instr *intrin, while (BITSET_FFS(mask) != 0) { const int start = BITSET_FFS(mask) - 1; - assert(start % byte_size == 0); int end; for (end = start + 1; end < bytes_written; end++) { @@ -198,7 +207,7 @@ lower_mem_store_bit_size(nir_builder *b, nir_intrinsic_instr *intrin, if (chunk_bytes >= 4 && is_dword_aligned) { store_align = MAX2(align, 4); store_bit_size = 32; - store_comps = MIN2(chunk_bytes, 16) / 4; + store_comps = needs_scalar ? 
1 : MIN2(chunk_bytes, 16) / 4; } else { store_align = align; store_comps = 1; @@ -208,7 +217,6 @@ lower_mem_store_bit_size(nir_builder *b, nir_intrinsic_instr *intrin, store_bit_size = 16; } const unsigned store_bytes = store_comps * (store_bit_size / 8); - assert(store_bytes % byte_size == 0); nir_ssa_def *packed = nir_extract_bits(b, &value, 1, start * 8, store_comps, store_bit_size); @@ -245,6 +253,7 @@ lower_mem_access_bit_sizes_impl(nir_function_impl *impl, case nir_intrinsic_load_global: case nir_intrinsic_load_ssbo: case nir_intrinsic_load_shared: + case nir_intrinsic_load_scratch: if (lower_mem_load_bit_size(&b, intrin, devinfo)) progress = true; break; @@ -252,6 +261,7 @@ lower_mem_access_bit_sizes_impl(nir_function_impl *impl, case nir_intrinsic_store_global: case nir_intrinsic_store_ssbo: case nir_intrinsic_store_shared: + case nir_intrinsic_store_scratch: if (lower_mem_store_bit_size(&b, intrin, devinfo)) progress = true; break; @@ -285,6 +295,12 @@ lower_mem_access_bit_sizes_impl(nir_function_impl *impl, * all nir load/store intrinsics into a series of either 8 or 32-bit * load/store intrinsics with a number of components that we can directly * handle in hardware and with a trivial write-mask. + * + * For scratch access, additional consideration has to be made due to the way + * that we swizzle the memory addresses to achieve decent cache locality. In + * particular, even though untyped surface read/write messages exist and work, + * we can't use them to load multiple components in a single SEND. For more + * detail on the scratch swizzle, see fs_visitor::swizzle_nir_scratch_addr. */ bool brw_nir_lower_mem_access_bit_sizes(nir_shader *shader,