intel/fs: Add support for bindless texture ops

We add two new texture sources for bindless surface and sampler handles.
Bindless surface handles are expected to be pre-shifted so that the
20-bit surface state table index is in the top 20 bits of the 32-bit
handle.  This lets us avoid any extra shifts in the shader.  Bindless
sampler handles are 32-byte aligned byte offsets from the general state
base address.  We require 32-byte rather than 16-byte alignment to avoid
having to use more indirect messages than needed.  This means we can't
tightly pack samplers, but that's probably not a big deal.

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
This commit is contained in:
Jason Ekstrand 2019-02-06 15:42:17 -06:00 committed by Jason Ekstrand
parent 2edf29b933
commit 843286d324
5 changed files with 86 additions and 10 deletions

View File

@ -835,6 +835,10 @@ enum tex_logical_srcs {
TEX_LOGICAL_SRC_SURFACE,
/** Texture sampler index */
TEX_LOGICAL_SRC_SAMPLER,
/** Texture surface bindless handle */
TEX_LOGICAL_SRC_SURFACE_HANDLE,
/** Texture sampler bindless handle */
TEX_LOGICAL_SRC_SAMPLER_HANDLE,
/** Texel offset for gathers */
TEX_LOGICAL_SRC_TG4_OFFSET,
/** REQUIRED: Number of coordinate components (as UD immediate) */
@ -1224,6 +1228,7 @@ enum brw_message_target {
*/
#define GEN8_BTI_STATELESS_IA_COHERENT 255
#define GEN8_BTI_STATELESS_NON_COHERENT 253
#define GEN9_BTI_BINDLESS 252
/* Dataport atomic operations for Untyped Atomic Integer Operation message
* (and others).

View File

@ -4685,6 +4685,8 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
const fs_reg &mcs,
const fs_reg &surface,
const fs_reg &sampler,
const fs_reg &surface_handle,
const fs_reg &sampler_handle,
const fs_reg &tg4_offset,
unsigned coord_components,
unsigned grad_components)
@ -4697,9 +4699,14 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
sources[i] = bld.vgrf(BRW_REGISTER_TYPE_F);
/* We must have exactly one of surface/surface_handle and exactly one of
 * sampler/sampler_handle.
 */
assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));
assert((sampler.file == BAD_FILE) != (sampler_handle.file == BAD_FILE));
if (op == SHADER_OPCODE_TG4 || op == SHADER_OPCODE_TG4_OFFSET ||
inst->offset != 0 || inst->eot ||
op == SHADER_OPCODE_SAMPLEINFO ||
sampler_handle.file != BAD_FILE ||
is_high_sampler(devinfo, sampler)) {
/* For general texture offsets (no txf workaround), we need a header to
* put them in.
@ -4739,7 +4746,21 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
ubld1.MOV(component(header, 2), brw_imm_ud(0));
}
if (is_high_sampler(devinfo, sampler)) {
if (sampler_handle.file != BAD_FILE) {
/* Bindless sampler handles aren't relative to the sampler state
* pointer passed into the shader through SAMPLER_STATE_POINTERS_*.
* Instead, it's an absolute pointer relative to dynamic state base
* address.
*
* Sampler states are 16 bytes each and the pointer we give here has
* to be 32-byte aligned. In order to avoid more indirect messages
* than required, we assume that all bindless sampler states are
* 32-byte aligned. This sacrifices a bit of general state base
* address space but means we can do something more efficient in the
* shader.
*/
ubld1.MOV(component(header, 3), sampler_handle);
} else if (is_high_sampler(devinfo, sampler)) {
if (sampler.file == BRW_IMMEDIATE_VALUE) {
assert(sampler.ud >= 16);
const int sampler_state_size = 16; /* 16 bytes */
@ -4942,14 +4963,42 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
}
inst->sfid = BRW_SFID_SAMPLER;
if (surface.file == IMM && sampler.file == IMM) {
if (surface.file == IMM &&
(sampler.file == IMM || sampler_handle.file != BAD_FILE)) {
inst->desc = brw_sampler_desc(devinfo,
surface.ud + base_binding_table_index,
sampler.ud % 16,
sampler.file == IMM ? sampler.ud % 16 : 0,
msg_type,
simd_mode,
0 /* return_format unused on gen7+ */);
inst->src[0] = brw_imm_ud(0);
inst->src[1] = brw_imm_ud(0); /* ex_desc */
} else if (surface_handle.file != BAD_FILE) {
/* Bindless surface */
assert(devinfo->gen >= 9);
inst->desc = brw_sampler_desc(devinfo,
GEN9_BTI_BINDLESS,
sampler.file == IMM ? sampler.ud % 16 : 0,
msg_type,
simd_mode,
0 /* return_format unused on gen7+ */);
/* For bindless samplers, the entire address is included in the message
* header so we can leave the portion in the message descriptor 0.
*/
if (sampler_handle.file != BAD_FILE || sampler.file == IMM) {
inst->src[0] = brw_imm_ud(0);
} else {
const fs_builder ubld = bld.group(1, 0).exec_all();
fs_reg desc = ubld.vgrf(BRW_REGISTER_TYPE_UD);
ubld.SHL(desc, sampler, brw_imm_ud(8));
inst->src[0] = desc;
}
/* We assume that the driver provided the handle in the top 20 bits so
* we can use the surface handle directly as the extended descriptor.
*/
inst->src[1] = retype(surface_handle, BRW_REGISTER_TYPE_UD);
} else {
/* Immediate portion of the descriptor */
inst->desc = brw_sampler_desc(devinfo,
@ -4964,7 +5013,9 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
/* This case is common in GL */
ubld.MUL(desc, surface, brw_imm_ud(0x101));
} else {
if (sampler.file == IMM) {
if (sampler_handle.file != BAD_FILE) {
ubld.MOV(desc, surface);
} else if (sampler.file == IMM) {
ubld.OR(desc, surface, brw_imm_ud(sampler.ud << 8));
} else {
ubld.SHL(desc, sampler, brw_imm_ud(8));
@ -4976,8 +5027,8 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
ubld.AND(desc, desc, brw_imm_ud(0xfff));
inst->src[0] = component(desc, 0);
inst->src[1] = brw_imm_ud(0); /* ex_desc */
}
inst->src[1] = brw_imm_ud(0); /* ex_desc */
inst->src[2] = src_payload;
inst->resize_sources(3);
@ -5009,6 +5060,8 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
const fs_reg &mcs = inst->src[TEX_LOGICAL_SRC_MCS];
const fs_reg &surface = inst->src[TEX_LOGICAL_SRC_SURFACE];
const fs_reg &sampler = inst->src[TEX_LOGICAL_SRC_SAMPLER];
const fs_reg &surface_handle = inst->src[TEX_LOGICAL_SRC_SURFACE_HANDLE];
const fs_reg &sampler_handle = inst->src[TEX_LOGICAL_SRC_SAMPLER_HANDLE];
const fs_reg &tg4_offset = inst->src[TEX_LOGICAL_SRC_TG4_OFFSET];
assert(inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM);
const unsigned coord_components = inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
@ -5019,7 +5072,9 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
lower_sampler_logical_send_gen7(bld, inst, op, coordinate,
shadow_c, lod, lod2, min_lod,
sample_index,
mcs, surface, sampler, tg4_offset,
mcs, surface, sampler,
surface_handle, sampler_handle,
tg4_offset,
coord_components, grad_components);
} else if (devinfo->gen >= 5) {
lower_sampler_logical_send_gen5(bld, inst, op, coordinate,

View File

@ -183,7 +183,8 @@ public:
void emit_interpolation_setup_gen6();
void compute_sample_position(fs_reg dst, fs_reg int_sample_pos);
fs_reg emit_mcs_fetch(const fs_reg &coordinate, unsigned components,
const fs_reg &sampler);
const fs_reg &texture,
const fs_reg &texture_handle);
void emit_gen6_gather_wa(uint8_t wa, fs_reg dst);
fs_reg resolve_source_modifiers(const fs_reg &src);
void emit_discard_jump();

View File

@ -3201,7 +3201,7 @@ fs_visitor::emit_non_coherent_fb_read(const fs_builder &bld, const fs_reg &dst,
const fs_reg sample = nir_system_values[SYSTEM_VALUE_SAMPLE_ID];
const fs_reg mcs = wm_key->multisample_fbo ?
emit_mcs_fetch(coords, 3, brw_imm_ud(surface)) : fs_reg();
emit_mcs_fetch(coords, 3, brw_imm_ud(surface), fs_reg()) : fs_reg();
/* Use either a normal or a CMS texel fetch message depending on whether
* the framebuffer is single or multisample. On SKL+ use the wide CMS
@ -5237,6 +5237,18 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
break;
}
case nir_tex_src_texture_handle:
assert(nir_tex_instr_src_index(instr, nir_tex_src_texture_offset) == -1);
srcs[TEX_LOGICAL_SRC_SURFACE] = fs_reg();
srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = bld.emit_uniformize(src);
break;
case nir_tex_src_sampler_handle:
assert(nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset) == -1);
srcs[TEX_LOGICAL_SRC_SAMPLER] = fs_reg();
srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE] = bld.emit_uniformize(src);
break;
case nir_tex_src_ms_mcs:
assert(instr->op == nir_texop_txf_ms);
srcs[TEX_LOGICAL_SRC_MCS] = retype(src, BRW_REGISTER_TYPE_D);
@ -5266,7 +5278,8 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
srcs[TEX_LOGICAL_SRC_MCS] =
emit_mcs_fetch(srcs[TEX_LOGICAL_SRC_COORDINATE],
instr->coord_components,
srcs[TEX_LOGICAL_SRC_SURFACE]);
srcs[TEX_LOGICAL_SRC_SURFACE],
srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE]);
} else {
srcs[TEX_LOGICAL_SRC_MCS] = brw_imm_ud(0u);
}

View File

@ -35,7 +35,8 @@ using namespace brw;
/* Sample from the MCS surface attached to this multisample texture. */
fs_reg
fs_visitor::emit_mcs_fetch(const fs_reg &coordinate, unsigned components,
const fs_reg &texture)
const fs_reg &texture,
const fs_reg &texture_handle)
{
const fs_reg dest = vgrf(glsl_type::uvec4_type);
@ -43,6 +44,7 @@ fs_visitor::emit_mcs_fetch(const fs_reg &coordinate, unsigned components,
srcs[TEX_LOGICAL_SRC_COORDINATE] = coordinate;
srcs[TEX_LOGICAL_SRC_SURFACE] = texture;
srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(0);
srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = texture_handle;
srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(components);
srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(0);