From bd8740da77c191e1da7c93ff0e42df333840212f Mon Sep 17 00:00:00 2001 From: Emma Anholt Date: Sun, 24 Jul 2022 08:11:49 -0700 Subject: [PATCH] gallivm: Optimize single-invocation SSBO stores. The CTS does a lot of 1x1x1 compute shaders (all that stuff like dEQP-GLES31.functional.shaders.builtin_functions.precision.mul.highp_compute.scalar) which finish with store_ssbos. Instead of doing the invocation loop in that case (which LLVM has to later unroll), just emit the single invocation's store. Fixes timeouts running dEQP-GLES31.functional.ssbo.layout.random.all_shared_buffer.36, which does a spectacular number of SSBO stores in a long 1x1x1 compute shader. Reduces its runtime on llvmpipe from 66s to 29s locally, and virgl from 1:38 to 43s. virgl dEQP-GLES31.functional.ssbo.layout.random.nested_structs_arrays_instance_arrays.22 goes down to 7 seconds. Fixes: #6797 Reviewed-by: Dave Airlie Part-of: --- src/gallium/auxiliary/gallivm/lp_bld_nir.c | 6 ++-- src/gallium/auxiliary/gallivm/lp_bld_nir.h | 1 + .../auxiliary/gallivm/lp_bld_nir_soa.c | 36 +++++++++++++++++++ .../drivers/llvmpipe/ci/llvmpipe-skips.txt | 1 - .../drivers/virgl/ci/virgl-gl-fails.txt | 3 ++ .../drivers/virgl/ci/virgl-gl-skips.txt | 2 +- .../drivers/virgl/ci/virgl-gles-fails.txt | 3 ++ .../drivers/virgl/ci/virgl-gles-flakes.txt | 6 ---- .../drivers/virgl/ci/virgl-gles-skips.txt | 4 +++ 9 files changed, 52 insertions(+), 10 deletions(-) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_nir.c b/src/gallium/auxiliary/gallivm/lp_bld_nir.c index 858002f5d15..3cf110f533a 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_nir.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_nir.c @@ -1550,10 +1550,11 @@ visit_store_ssbo(struct lp_build_nir_context *bld_base, LLVMValueRef val = get_src(bld_base, instr->src[0]); LLVMValueRef idx = cast_type(bld_base, get_src(bld_base, instr->src[1]), nir_type_uint, 32); LLVMValueRef offset = get_src(bld_base, instr->src[2]); + bool index_and_offset_are_uniform = 
nir_src_is_always_uniform(instr->src[1]) && nir_src_is_always_uniform(instr->src[2]); int writemask = instr->const_index[0]; int nc = nir_src_num_components(instr->src[0]); int bitsize = nir_src_bit_size(instr->src[0]); - bld_base->store_mem(bld_base, writemask, nc, bitsize, idx, offset, val); + bld_base->store_mem(bld_base, writemask, nc, bitsize, index_and_offset_are_uniform, idx, offset, val); } @@ -1821,10 +1822,11 @@ visit_shared_store(struct lp_build_nir_context *bld_base, { LLVMValueRef val = get_src(bld_base, instr->src[0]); LLVMValueRef offset = get_src(bld_base, instr->src[1]); + bool offset_is_uniform = nir_src_is_always_uniform(instr->src[1]); int writemask = instr->const_index[1]; int nc = nir_src_num_components(instr->src[0]); int bitsize = nir_src_bit_size(instr->src[0]); - bld_base->store_mem(bld_base, writemask, nc, bitsize, NULL, offset, val); + bld_base->store_mem(bld_base, writemask, nc, bitsize, offset_is_uniform, NULL, offset, val); } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_nir.h b/src/gallium/auxiliary/gallivm/lp_bld_nir.h index 6c40d982ad1..13236719a1c 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_nir.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_nir.h @@ -115,6 +115,7 @@ struct lp_build_nir_context LLVMValueRef index, LLVMValueRef offset, LLVMValueRef result[NIR_MAX_VEC_COMPONENTS]); void (*store_mem)(struct lp_build_nir_context *bld_base, unsigned writemask, unsigned nc, unsigned bit_size, + bool index_and_offset_are_uniform, LLVMValueRef index, LLVMValueRef offset, LLVMValueRef dst); void (*atomic_mem)(struct lp_build_nir_context *bld_base, diff --git a/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c index a7a1cf5d800..a7b2af8a8c6 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c @@ -1296,6 +1296,7 @@ static void emit_store_mem(struct lp_build_nir_context *bld_base, unsigned writemask, unsigned nc, unsigned bit_size, 
+ bool index_and_offset_are_uniform, LLVMValueRef index, LLVMValueRef offset, LLVMValueRef dst) @@ -1310,6 +1311,41 @@ static void emit_store_mem(struct lp_build_nir_context *bld_base, offset = lp_build_shr_imm(uint_bld, offset, shift_val); + /* If the address is uniform, then just store the value from the first + * channel instead of making LLVM unroll the invocation loop. + */ + if (index_and_offset_are_uniform && invocation_0_must_be_active(bld_base)) { + LLVMValueRef ssbo_limit; + LLVMValueRef mem_ptr = mem_access_base_pointer(bld_base, store_bld, bit_size, index, + lp_build_const_int32(gallivm, 0), &ssbo_limit); + + offset = LLVMBuildExtractElement(gallivm->builder, offset, lp_build_const_int32(gallivm, 0), ""); + + for (unsigned c = 0; c < nc; c++) { + if (!(writemask & (1u << c))) + continue; + + /* Pick out invocation 0's value. */ + LLVMValueRef val = (nc == 1) ? dst : LLVMBuildExtractValue(builder, dst, c, ""); + LLVMValueRef value_ptr = LLVMBuildExtractElement(gallivm->builder, val, + lp_build_const_int32(gallivm, 0), ""); + value_ptr = LLVMBuildBitCast(gallivm->builder, value_ptr, store_bld->elem_type, ""); + + LLVMValueRef chan_offset = LLVMBuildAdd(builder, offset, lp_build_const_int32(gallivm, c), ""); + + /* If storing outside the SSBO, we need to skip the store instead. 
*/ + if (ssbo_limit) { + struct lp_build_if_state ifthen; + lp_build_if(&ifthen, gallivm, lp_offset_in_range(bld_base, chan_offset, ssbo_limit)); + lp_build_pointer_set(builder, mem_ptr, chan_offset, value_ptr); + lp_build_endif(&ifthen); + } else { + lp_build_pointer_set(builder, mem_ptr, chan_offset, value_ptr); + } + } + return; + } + LLVMValueRef exec_mask = mask_vec(bld_base); LLVMValueRef cond = LLVMBuildICmp(gallivm->builder, LLVMIntNE, exec_mask, uint_bld->zero, ""); struct lp_build_loop_state loop_state; diff --git a/src/gallium/drivers/llvmpipe/ci/llvmpipe-skips.txt b/src/gallium/drivers/llvmpipe/ci/llvmpipe-skips.txt index 50c6669ad97..dabe47e4340 100644 --- a/src/gallium/drivers/llvmpipe/ci/llvmpipe-skips.txt +++ b/src/gallium/drivers/llvmpipe/ci/llvmpipe-skips.txt @@ -13,7 +13,6 @@ KHR-GL45.texture_size_promotion.functional KHR-GL45.texture_swizzle.functional KHR-GL45.texture_swizzle.smoke KHR-GLES31.core.arrays_of_arrays.InteractionFunctionCalls2 -dEQP-GLES31.functional.ssbo.layout.random.all_shared_buffer.36 arb_pipeline_statistics_query-comp gl-1.0-blend-func diff --git a/src/gallium/drivers/virgl/ci/virgl-gl-fails.txt b/src/gallium/drivers/virgl/ci/virgl-gl-fails.txt index 2b7f51ea990..2133432f82f 100644 --- a/src/gallium/drivers/virgl/ci/virgl-gl-fails.txt +++ b/src/gallium/drivers/virgl/ci/virgl-gl-fails.txt @@ -36,7 +36,10 @@ dEQP-GLES31.functional.shaders.sample_variables.sample_mask_in.bit_count_per_two dEQP-GLES31.functional.shaders.sample_variables.sample_mask_in.bit_count_per_two_samples.multisample_rbo_2,Fail dEQP-GLES31.functional.shaders.sample_variables.sample_mask_in.bit_count_per_two_samples.multisample_texture_1,Fail dEQP-GLES31.functional.shaders.sample_variables.sample_mask_in.bit_count_per_two_samples.multisample_texture_2,Fail + +# Times out waiting for >15s compile on the host side. 
dEQP-GLES31.functional.ssbo.layout.random.all_shared_buffer.36,Fail +dEQP-GLES31.functional.ssbo.layout.random.nested_structs_arrays_instance_arrays.22,Fail KHR-GL30.transform_feedback.api_errors_test,Fail KHR-GL32.transform_feedback_overflow_query_ARB.advanced-single-stream-interleaved-attribs,Fail diff --git a/src/gallium/drivers/virgl/ci/virgl-gl-skips.txt b/src/gallium/drivers/virgl/ci/virgl-gl-skips.txt index 4c3e6f27fa9..5fc9f2d6e2c 100644 --- a/src/gallium/drivers/virgl/ci/virgl-gl-skips.txt +++ b/src/gallium/drivers/virgl/ci/virgl-gl-skips.txt @@ -5,7 +5,7 @@ # Sometimes crashes, e.g. https://gitlab.freedesktop.org/kusma/mesa/-/jobs/4109419 dEQP-GLES31.functional.compute.basic.empty -# too slow. +# too slow (>15s compile on host causes timeouts that make for flakes) dEQP-GLES31.functional.ssbo.layout.random.all_shared_buffer.36 dEQP-GLES31.functional.ssbo.layout.random.nested_structs_arrays_instance_arrays.22 diff --git a/src/gallium/drivers/virgl/ci/virgl-gles-fails.txt b/src/gallium/drivers/virgl/ci/virgl-gles-fails.txt index bc664b233bc..68db7b58f53 100644 --- a/src/gallium/drivers/virgl/ci/virgl-gles-fails.txt +++ b/src/gallium/drivers/virgl/ci/virgl-gles-fails.txt @@ -26,8 +26,11 @@ dEQP-GLES31.functional.shaders.sample_variables.sample_mask_in.bit_count_per_two dEQP-GLES31.functional.shaders.sample_variables.sample_mask_in.bit_count_per_two_samples.multisample_rbo_2,Fail dEQP-GLES31.functional.shaders.sample_variables.sample_mask_in.bit_count_per_two_samples.multisample_texture_1,Fail dEQP-GLES31.functional.shaders.sample_variables.sample_mask_in.bit_count_per_two_samples.multisample_texture_2,Fail + +# Times out waiting for >15s compile on the host side. 
dEQP-GLES31.functional.ssbo.layout.random.all_shared_buffer.36,Fail dEQP-GLES31.functional.ssbo.layout.random.nested_structs_arrays_instance_arrays.22,Fail + KHR-GL30.glsl_noperspective.functionaltest,Fail KHR-GL30.transform_feedback.api_errors_test,Fail diff --git a/src/gallium/drivers/virgl/ci/virgl-gles-flakes.txt b/src/gallium/drivers/virgl/ci/virgl-gles-flakes.txt index 8fec80970e3..a40530cf5d9 100644 --- a/src/gallium/drivers/virgl/ci/virgl-gles-flakes.txt +++ b/src/gallium/drivers/virgl/ci/virgl-gles-flakes.txt @@ -12,9 +12,3 @@ dEQP-GLES2.functional.clipping.point.wide_point_clip_viewport_corner dEQP-GLES3.functional.clipping.point.wide_point_clip dEQP-GLES3.functional.clipping.point.wide_point_clip_viewport_center dEQP-GLES3.functional.clipping.point.wide_point_clip_viewport_corner - -dEQP-GLES31.functional.ssbo.layout.random.arrays_of_arrays.1 - -dEQP-GLES31.functional.ssbo.layout.unsized_struct_array.single_buffer.std430_instance_array -dEQP-GLES31.functional.ssbo.layout.unsized_struct_array.per_block_buffer.std430_instance_array -dEQP-GLES31.functional.ssbo.layout.unsized_struct_array.single_buffer.std140_instance_array diff --git a/src/gallium/drivers/virgl/ci/virgl-gles-skips.txt b/src/gallium/drivers/virgl/ci/virgl-gles-skips.txt index d47f896e062..fd7e4d1848d 100644 --- a/src/gallium/drivers/virgl/ci/virgl-gles-skips.txt +++ b/src/gallium/drivers/virgl/ci/virgl-gles-skips.txt @@ -4,5 +4,9 @@ KHR-GL32.packed_pixels.varied_rectangle.depth* +# too slow (>15s compile on host causes timeouts that make for flakes) +dEQP-GLES31.functional.ssbo.layout.random.all_shared_buffer.36 +dEQP-GLES31.functional.ssbo.layout.random.nested_structs_arrays_instance_arrays.22 + # Sometimes crashes, e.g. https://gitlab.freedesktop.org/kusma/mesa/-/jobs/4109419 dEQP-GLES31.functional.compute.basic.empty