gallivm: Optimize single-invocation SSBO stores.
The CTS does a lot of 1x1x1 compute shaders (all that stuff like dEQP-GLES31.functional.shaders.builtin_functions.precision.mul.highp_compute.scalar) which finish with store_ssbos. Instead of doing the invocation loop in that case (which LLVM has to later unroll), just emit the single invocation's store. Fixes timeouts running dEQP-GLES31.functional.ssbo.layout.random.all_shared_buffer.36, which does a spectacular number of SSBO stores in a long 1x1x1 compute shader. Reduces its runtime on llvmpipe from 66s to 29s locally, and on virgl from 1:38 to 43s. virgl dEQP-GLES31.functional.ssbo.layout.random.nested_structs_arrays_instance_arrays.22 goes down to 7 seconds. Fixes: #6797 Reviewed-by: Dave Airlie <airlied@redhat.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17730>
This commit is contained in:
parent
8d41f8f384
commit
bd8740da77
|
@ -1550,10 +1550,11 @@ visit_store_ssbo(struct lp_build_nir_context *bld_base,
|
||||||
LLVMValueRef val = get_src(bld_base, instr->src[0]);
|
LLVMValueRef val = get_src(bld_base, instr->src[0]);
|
||||||
LLVMValueRef idx = cast_type(bld_base, get_src(bld_base, instr->src[1]), nir_type_uint, 32);
|
LLVMValueRef idx = cast_type(bld_base, get_src(bld_base, instr->src[1]), nir_type_uint, 32);
|
||||||
LLVMValueRef offset = get_src(bld_base, instr->src[2]);
|
LLVMValueRef offset = get_src(bld_base, instr->src[2]);
|
||||||
|
bool index_and_offset_are_uniform = nir_src_is_always_uniform(instr->src[1]) && nir_src_is_always_uniform(instr->src[2]);
|
||||||
int writemask = instr->const_index[0];
|
int writemask = instr->const_index[0];
|
||||||
int nc = nir_src_num_components(instr->src[0]);
|
int nc = nir_src_num_components(instr->src[0]);
|
||||||
int bitsize = nir_src_bit_size(instr->src[0]);
|
int bitsize = nir_src_bit_size(instr->src[0]);
|
||||||
bld_base->store_mem(bld_base, writemask, nc, bitsize, idx, offset, val);
|
bld_base->store_mem(bld_base, writemask, nc, bitsize, index_and_offset_are_uniform, idx, offset, val);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -1821,10 +1822,11 @@ visit_shared_store(struct lp_build_nir_context *bld_base,
|
||||||
{
|
{
|
||||||
LLVMValueRef val = get_src(bld_base, instr->src[0]);
|
LLVMValueRef val = get_src(bld_base, instr->src[0]);
|
||||||
LLVMValueRef offset = get_src(bld_base, instr->src[1]);
|
LLVMValueRef offset = get_src(bld_base, instr->src[1]);
|
||||||
|
bool offset_is_uniform = nir_src_is_always_uniform(instr->src[1]);
|
||||||
int writemask = instr->const_index[1];
|
int writemask = instr->const_index[1];
|
||||||
int nc = nir_src_num_components(instr->src[0]);
|
int nc = nir_src_num_components(instr->src[0]);
|
||||||
int bitsize = nir_src_bit_size(instr->src[0]);
|
int bitsize = nir_src_bit_size(instr->src[0]);
|
||||||
bld_base->store_mem(bld_base, writemask, nc, bitsize, NULL, offset, val);
|
bld_base->store_mem(bld_base, writemask, nc, bitsize, offset_is_uniform, NULL, offset, val);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -115,6 +115,7 @@ struct lp_build_nir_context
|
||||||
LLVMValueRef index, LLVMValueRef offset, LLVMValueRef result[NIR_MAX_VEC_COMPONENTS]);
|
LLVMValueRef index, LLVMValueRef offset, LLVMValueRef result[NIR_MAX_VEC_COMPONENTS]);
|
||||||
void (*store_mem)(struct lp_build_nir_context *bld_base,
|
void (*store_mem)(struct lp_build_nir_context *bld_base,
|
||||||
unsigned writemask, unsigned nc, unsigned bit_size,
|
unsigned writemask, unsigned nc, unsigned bit_size,
|
||||||
|
bool index_and_offset_are_uniform,
|
||||||
LLVMValueRef index, LLVMValueRef offset, LLVMValueRef dst);
|
LLVMValueRef index, LLVMValueRef offset, LLVMValueRef dst);
|
||||||
|
|
||||||
void (*atomic_mem)(struct lp_build_nir_context *bld_base,
|
void (*atomic_mem)(struct lp_build_nir_context *bld_base,
|
||||||
|
|
|
@ -1296,6 +1296,7 @@ static void emit_store_mem(struct lp_build_nir_context *bld_base,
|
||||||
unsigned writemask,
|
unsigned writemask,
|
||||||
unsigned nc,
|
unsigned nc,
|
||||||
unsigned bit_size,
|
unsigned bit_size,
|
||||||
|
bool index_and_offset_are_uniform,
|
||||||
LLVMValueRef index,
|
LLVMValueRef index,
|
||||||
LLVMValueRef offset,
|
LLVMValueRef offset,
|
||||||
LLVMValueRef dst)
|
LLVMValueRef dst)
|
||||||
|
@ -1310,6 +1311,41 @@ static void emit_store_mem(struct lp_build_nir_context *bld_base,
|
||||||
|
|
||||||
offset = lp_build_shr_imm(uint_bld, offset, shift_val);
|
offset = lp_build_shr_imm(uint_bld, offset, shift_val);
|
||||||
|
|
||||||
|
/* If the address is uniform, then just store the value from the first
|
||||||
|
* channel instead of making LLVM unroll the invocation loop.
|
||||||
|
*/
|
||||||
|
if (index_and_offset_are_uniform && invocation_0_must_be_active(bld_base)) {
|
||||||
|
LLVMValueRef ssbo_limit;
|
||||||
|
LLVMValueRef mem_ptr = mem_access_base_pointer(bld_base, store_bld, bit_size, index,
|
||||||
|
lp_build_const_int32(gallivm, 0), &ssbo_limit);
|
||||||
|
|
||||||
|
offset = LLVMBuildExtractElement(gallivm->builder, offset, lp_build_const_int32(gallivm, 0), "");
|
||||||
|
|
||||||
|
for (unsigned c = 0; c < nc; c++) {
|
||||||
|
if (!(writemask & (1u << c)))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
/* Pick out invocation 0's value. */
|
||||||
|
LLVMValueRef val = (nc == 1) ? dst : LLVMBuildExtractValue(builder, dst, c, "");
|
||||||
|
LLVMValueRef value_ptr = LLVMBuildExtractElement(gallivm->builder, val,
|
||||||
|
lp_build_const_int32(gallivm, 0), "");
|
||||||
|
value_ptr = LLVMBuildBitCast(gallivm->builder, value_ptr, store_bld->elem_type, "");
|
||||||
|
|
||||||
|
LLVMValueRef chan_offset = LLVMBuildAdd(builder, offset, lp_build_const_int32(gallivm, c), "");
|
||||||
|
|
||||||
|
/* If storing outside the SSBO, we need to skip the store instead. */
|
||||||
|
if (ssbo_limit) {
|
||||||
|
struct lp_build_if_state ifthen;
|
||||||
|
lp_build_if(&ifthen, gallivm, lp_offset_in_range(bld_base, chan_offset, ssbo_limit));
|
||||||
|
lp_build_pointer_set(builder, mem_ptr, chan_offset, value_ptr);
|
||||||
|
lp_build_endif(&ifthen);
|
||||||
|
} else {
|
||||||
|
lp_build_pointer_set(builder, mem_ptr, chan_offset, value_ptr);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
LLVMValueRef exec_mask = mask_vec(bld_base);
|
LLVMValueRef exec_mask = mask_vec(bld_base);
|
||||||
LLVMValueRef cond = LLVMBuildICmp(gallivm->builder, LLVMIntNE, exec_mask, uint_bld->zero, "");
|
LLVMValueRef cond = LLVMBuildICmp(gallivm->builder, LLVMIntNE, exec_mask, uint_bld->zero, "");
|
||||||
struct lp_build_loop_state loop_state;
|
struct lp_build_loop_state loop_state;
|
||||||
|
|
|
@ -13,7 +13,6 @@ KHR-GL45.texture_size_promotion.functional
|
||||||
KHR-GL45.texture_swizzle.functional
|
KHR-GL45.texture_swizzle.functional
|
||||||
KHR-GL45.texture_swizzle.smoke
|
KHR-GL45.texture_swizzle.smoke
|
||||||
KHR-GLES31.core.arrays_of_arrays.InteractionFunctionCalls2
|
KHR-GLES31.core.arrays_of_arrays.InteractionFunctionCalls2
|
||||||
dEQP-GLES31.functional.ssbo.layout.random.all_shared_buffer.36
|
|
||||||
arb_pipeline_statistics_query-comp
|
arb_pipeline_statistics_query-comp
|
||||||
gl-1.0-blend-func
|
gl-1.0-blend-func
|
||||||
|
|
||||||
|
|
|
@ -36,7 +36,10 @@ dEQP-GLES31.functional.shaders.sample_variables.sample_mask_in.bit_count_per_two
|
||||||
dEQP-GLES31.functional.shaders.sample_variables.sample_mask_in.bit_count_per_two_samples.multisample_rbo_2,Fail
|
dEQP-GLES31.functional.shaders.sample_variables.sample_mask_in.bit_count_per_two_samples.multisample_rbo_2,Fail
|
||||||
dEQP-GLES31.functional.shaders.sample_variables.sample_mask_in.bit_count_per_two_samples.multisample_texture_1,Fail
|
dEQP-GLES31.functional.shaders.sample_variables.sample_mask_in.bit_count_per_two_samples.multisample_texture_1,Fail
|
||||||
dEQP-GLES31.functional.shaders.sample_variables.sample_mask_in.bit_count_per_two_samples.multisample_texture_2,Fail
|
dEQP-GLES31.functional.shaders.sample_variables.sample_mask_in.bit_count_per_two_samples.multisample_texture_2,Fail
|
||||||
|
|
||||||
|
# Times out waiting for >15s compile on the host side.
|
||||||
dEQP-GLES31.functional.ssbo.layout.random.all_shared_buffer.36,Fail
|
dEQP-GLES31.functional.ssbo.layout.random.all_shared_buffer.36,Fail
|
||||||
|
dEQP-GLES31.functional.ssbo.layout.random.nested_structs_arrays_instance_arrays.22,Fail
|
||||||
|
|
||||||
KHR-GL30.transform_feedback.api_errors_test,Fail
|
KHR-GL30.transform_feedback.api_errors_test,Fail
|
||||||
KHR-GL32.transform_feedback_overflow_query_ARB.advanced-single-stream-interleaved-attribs,Fail
|
KHR-GL32.transform_feedback_overflow_query_ARB.advanced-single-stream-interleaved-attribs,Fail
|
||||||
|
|
|
@ -5,7 +5,7 @@
|
||||||
# Sometimes crashes, e.g. https://gitlab.freedesktop.org/kusma/mesa/-/jobs/4109419
|
# Sometimes crashes, e.g. https://gitlab.freedesktop.org/kusma/mesa/-/jobs/4109419
|
||||||
dEQP-GLES31.functional.compute.basic.empty
|
dEQP-GLES31.functional.compute.basic.empty
|
||||||
|
|
||||||
# too slow.
|
# too slow (>15s compile on host causes timeouts that make for flakes)
|
||||||
dEQP-GLES31.functional.ssbo.layout.random.all_shared_buffer.36
|
dEQP-GLES31.functional.ssbo.layout.random.all_shared_buffer.36
|
||||||
dEQP-GLES31.functional.ssbo.layout.random.nested_structs_arrays_instance_arrays.22
|
dEQP-GLES31.functional.ssbo.layout.random.nested_structs_arrays_instance_arrays.22
|
||||||
|
|
||||||
|
|
|
@ -26,8 +26,11 @@ dEQP-GLES31.functional.shaders.sample_variables.sample_mask_in.bit_count_per_two
|
||||||
dEQP-GLES31.functional.shaders.sample_variables.sample_mask_in.bit_count_per_two_samples.multisample_rbo_2,Fail
|
dEQP-GLES31.functional.shaders.sample_variables.sample_mask_in.bit_count_per_two_samples.multisample_rbo_2,Fail
|
||||||
dEQP-GLES31.functional.shaders.sample_variables.sample_mask_in.bit_count_per_two_samples.multisample_texture_1,Fail
|
dEQP-GLES31.functional.shaders.sample_variables.sample_mask_in.bit_count_per_two_samples.multisample_texture_1,Fail
|
||||||
dEQP-GLES31.functional.shaders.sample_variables.sample_mask_in.bit_count_per_two_samples.multisample_texture_2,Fail
|
dEQP-GLES31.functional.shaders.sample_variables.sample_mask_in.bit_count_per_two_samples.multisample_texture_2,Fail
|
||||||
|
|
||||||
|
# Times out waiting for >15s compile on the host side.
|
||||||
dEQP-GLES31.functional.ssbo.layout.random.all_shared_buffer.36,Fail
|
dEQP-GLES31.functional.ssbo.layout.random.all_shared_buffer.36,Fail
|
||||||
dEQP-GLES31.functional.ssbo.layout.random.nested_structs_arrays_instance_arrays.22,Fail
|
dEQP-GLES31.functional.ssbo.layout.random.nested_structs_arrays_instance_arrays.22,Fail
|
||||||
|
|
||||||
KHR-GL30.glsl_noperspective.functionaltest,Fail
|
KHR-GL30.glsl_noperspective.functionaltest,Fail
|
||||||
|
|
||||||
KHR-GL30.transform_feedback.api_errors_test,Fail
|
KHR-GL30.transform_feedback.api_errors_test,Fail
|
||||||
|
|
|
@ -12,9 +12,3 @@ dEQP-GLES2.functional.clipping.point.wide_point_clip_viewport_corner
|
||||||
dEQP-GLES3.functional.clipping.point.wide_point_clip
|
dEQP-GLES3.functional.clipping.point.wide_point_clip
|
||||||
dEQP-GLES3.functional.clipping.point.wide_point_clip_viewport_center
|
dEQP-GLES3.functional.clipping.point.wide_point_clip_viewport_center
|
||||||
dEQP-GLES3.functional.clipping.point.wide_point_clip_viewport_corner
|
dEQP-GLES3.functional.clipping.point.wide_point_clip_viewport_corner
|
||||||
|
|
||||||
dEQP-GLES31.functional.ssbo.layout.random.arrays_of_arrays.1
|
|
||||||
|
|
||||||
dEQP-GLES31.functional.ssbo.layout.unsized_struct_array.single_buffer.std430_instance_array
|
|
||||||
dEQP-GLES31.functional.ssbo.layout.unsized_struct_array.per_block_buffer.std430_instance_array
|
|
||||||
dEQP-GLES31.functional.ssbo.layout.unsized_struct_array.single_buffer.std140_instance_array
|
|
||||||
|
|
|
@ -4,5 +4,9 @@
|
||||||
|
|
||||||
KHR-GL32.packed_pixels.varied_rectangle.depth*
|
KHR-GL32.packed_pixels.varied_rectangle.depth*
|
||||||
|
|
||||||
|
# too slow (>15s compile on host causes timeouts that make for flakes)
|
||||||
|
dEQP-GLES31.functional.ssbo.layout.random.all_shared_buffer.36
|
||||||
|
dEQP-GLES31.functional.ssbo.layout.random.nested_structs_arrays_instance_arrays.22
|
||||||
|
|
||||||
# Sometimes crashes, e.g. https://gitlab.freedesktop.org/kusma/mesa/-/jobs/4109419
|
# Sometimes crashes, e.g. https://gitlab.freedesktop.org/kusma/mesa/-/jobs/4109419
|
||||||
dEQP-GLES31.functional.compute.basic.empty
|
dEQP-GLES31.functional.compute.basic.empty
|
||||||
|
|
Loading…
Reference in New Issue