gallivm/nir: Add a short circuit uniform-offset mode for load_ssbo/load_shared.
dEQP-VK.binding_model.buffer_device_address.set3.depth3.basessbo.convertuvec2.nostore.multi.scalar.vert runtime -24.4002% +/- 1.94375% (n=7).

The win (I think) is in LLVM not having to chew through handling the extra loops on every constant-offset SSBO load, not in actual rendering time.

Reviewed-by: Dave Airlie <airlied@redhat.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14999>
This commit is contained in:
parent
181f25aff4
commit
591899eedd
|
@ -1397,8 +1397,9 @@ static void visit_load_ssbo(struct lp_build_nir_context *bld_base,
|
||||||
{
|
{
|
||||||
LLVMValueRef idx = cast_type(bld_base, get_src(bld_base, instr->src[0]), nir_type_uint, 32);
|
LLVMValueRef idx = cast_type(bld_base, get_src(bld_base, instr->src[0]), nir_type_uint, 32);
|
||||||
LLVMValueRef offset = get_src(bld_base, instr->src[1]);
|
LLVMValueRef offset = get_src(bld_base, instr->src[1]);
|
||||||
|
bool index_and_offset_are_uniform = nir_src_is_always_uniform(instr->src[0]) && nir_src_is_always_uniform(instr->src[1]);
|
||||||
bld_base->load_mem(bld_base, nir_dest_num_components(instr->dest), nir_dest_bit_size(instr->dest),
|
bld_base->load_mem(bld_base, nir_dest_num_components(instr->dest), nir_dest_bit_size(instr->dest),
|
||||||
idx, offset, result);
|
index_and_offset_are_uniform, idx, offset, result);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void visit_store_ssbo(struct lp_build_nir_context *bld_base,
|
static void visit_store_ssbo(struct lp_build_nir_context *bld_base,
|
||||||
|
@ -1634,8 +1635,9 @@ static void visit_shared_load(struct lp_build_nir_context *bld_base,
|
||||||
LLVMValueRef result[NIR_MAX_VEC_COMPONENTS])
|
LLVMValueRef result[NIR_MAX_VEC_COMPONENTS])
|
||||||
{
|
{
|
||||||
LLVMValueRef offset = get_src(bld_base, instr->src[0]);
|
LLVMValueRef offset = get_src(bld_base, instr->src[0]);
|
||||||
|
bool offset_is_uniform = nir_src_is_always_uniform(instr->src[0]);
|
||||||
bld_base->load_mem(bld_base, nir_dest_num_components(instr->dest), nir_dest_bit_size(instr->dest),
|
bld_base->load_mem(bld_base, nir_dest_num_components(instr->dest), nir_dest_bit_size(instr->dest),
|
||||||
NULL, offset, result);
|
offset_is_uniform, NULL, offset, result);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void visit_shared_store(struct lp_build_nir_context *bld_base,
|
static void visit_shared_store(struct lp_build_nir_context *bld_base,
|
||||||
|
|
|
@ -111,6 +111,7 @@ struct lp_build_nir_context
|
||||||
/* for SSBO and shared memory */
|
/* for SSBO and shared memory */
|
||||||
void (*load_mem)(struct lp_build_nir_context *bld_base,
|
void (*load_mem)(struct lp_build_nir_context *bld_base,
|
||||||
unsigned nc, unsigned bit_size,
|
unsigned nc, unsigned bit_size,
|
||||||
|
bool index_and_offset_are_uniform,
|
||||||
LLVMValueRef index, LLVMValueRef offset, LLVMValueRef result[NIR_MAX_VEC_COMPONENTS]);
|
LLVMValueRef index, LLVMValueRef offset, LLVMValueRef result[NIR_MAX_VEC_COMPONENTS]);
|
||||||
void (*store_mem)(struct lp_build_nir_context *bld_base,
|
void (*store_mem)(struct lp_build_nir_context *bld_base,
|
||||||
unsigned writemask, unsigned nc, unsigned bit_size,
|
unsigned writemask, unsigned nc, unsigned bit_size,
|
||||||
|
|
|
@ -1143,6 +1143,7 @@ mem_access_base_pointer(struct lp_build_nir_context *bld_base,
|
||||||
static void emit_load_mem(struct lp_build_nir_context *bld_base,
|
static void emit_load_mem(struct lp_build_nir_context *bld_base,
|
||||||
unsigned nc,
|
unsigned nc,
|
||||||
unsigned bit_size,
|
unsigned bit_size,
|
||||||
|
bool index_and_offset_are_uniform,
|
||||||
LLVMValueRef index,
|
LLVMValueRef index,
|
||||||
LLVMValueRef offset,
|
LLVMValueRef offset,
|
||||||
LLVMValueRef outval[NIR_MAX_VEC_COMPONENTS])
|
LLVMValueRef outval[NIR_MAX_VEC_COMPONENTS])
|
||||||
|
@ -1158,6 +1159,42 @@ static void emit_load_mem(struct lp_build_nir_context *bld_base,
|
||||||
|
|
||||||
offset = LLVMBuildAShr(gallivm->builder, offset, lp_build_const_int_vec(gallivm, uint_bld->type, shift_val), "");
|
offset = LLVMBuildAShr(gallivm->builder, offset, lp_build_const_int_vec(gallivm, uint_bld->type, shift_val), "");
|
||||||
|
|
||||||
|
/* If the address is uniform, then use the address from invocation 0 to load,
|
||||||
|
* and broadcast to all invocations.
|
||||||
|
*/
|
||||||
|
if (index_and_offset_are_uniform && invocation_0_must_be_active(bld_base)) {
|
||||||
|
LLVMValueRef ssbo_limit;
|
||||||
|
LLVMValueRef mem_ptr = mem_access_base_pointer(bld_base, load_bld, bit_size, index,
|
||||||
|
lp_build_const_int32(gallivm, 0), &ssbo_limit);
|
||||||
|
|
||||||
|
offset = LLVMBuildExtractElement(gallivm->builder, offset, lp_build_const_int32(gallivm, 0), "");
|
||||||
|
|
||||||
|
for (unsigned c = 0; c < nc; c++) {
|
||||||
|
LLVMValueRef chan_offset = LLVMBuildAdd(builder, offset, lp_build_const_int32(gallivm, c), "");
|
||||||
|
|
||||||
|
LLVMValueRef scalar;
|
||||||
|
/* If loading outside the SSBO, we need to skip the load and read 0 instead. */
|
||||||
|
if (ssbo_limit) {
|
||||||
|
LLVMValueRef zero = lp_build_zero_bits(gallivm, bit_size);
|
||||||
|
LLVMValueRef res_store = lp_build_alloca(gallivm, LLVMTypeOf(zero), "");
|
||||||
|
LLVMBuildStore(builder, zero, res_store);
|
||||||
|
|
||||||
|
LLVMValueRef fetch_cond = LLVMBuildICmp(gallivm->builder, LLVMIntUGE, ssbo_limit, chan_offset, "");
|
||||||
|
struct lp_build_if_state ifthen;
|
||||||
|
lp_build_if(&ifthen, gallivm, fetch_cond);
|
||||||
|
LLVMBuildStore(builder, lp_build_pointer_get(builder, mem_ptr, chan_offset), res_store);
|
||||||
|
lp_build_endif(&ifthen);
|
||||||
|
|
||||||
|
scalar = LLVMBuildLoad(builder, res_store, "");
|
||||||
|
} else {
|
||||||
|
scalar = lp_build_pointer_get(builder, mem_ptr, chan_offset);
|
||||||
|
}
|
||||||
|
|
||||||
|
outval[c] = lp_build_broadcast_scalar(load_bld, scalar);
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
/* Although the index is dynamically uniform, that doesn't count if the exec mask isn't set, so read the values one-by-one. */
|
/* Although the index is dynamically uniform, that doesn't count if the exec mask isn't set, so read the values one-by-one. */
|
||||||
|
|
||||||
LLVMValueRef result[NIR_MAX_VEC_COMPONENTS];
|
LLVMValueRef result[NIR_MAX_VEC_COMPONENTS];
|
||||||
|
|
Loading…
Reference in New Issue