// Patch summary. This is leading text placed before the first `diff --git`
// header; it is not part of any hunk.
//
// Purpose: add a per-Definition "NoCSE" flag to ACO and set it on the
// zero-filled vector that emit_tfe_init() creates.
//
//  - aco_ir.h: adds the 1-bit field isNoCSE_ to Definition, together with the
//    setNoCSE()/isNoCSE() accessors. The default constexpr constructor
//    initializes the new bit to 0.
//  - aco_instruction_selection.cpp: emit_tfe_init() calls setNoCSE(true) on
//    definitions[0] of the zero-filled vector instruction before inserting it.
//    The in-hunk comment gives the motivation: CSE of this value only produces
//    copies, and those copies can break up clauses.
//  - aco_opt_value_numbering.cpp: process_block() now moves an instruction
//    straight into new_instructions and continues when
//    definitions[0].isNoCSE() is set, in addition to the existing
//    "no definitions" and phi cases. Only definitions[0] is inspected.
//    The explicit p_phi / p_linear_phi opcode comparison is replaced by
//    is_phi(instr).
//    NOTE(review): is_phi() is defined outside this patch; presumably it
//    matches exactly p_phi and p_linear_phi -- confirm.
//  - aco_print_ir.cpp: print_definition() prints "(noCSE)" ahead of the temp id
//    when the flag is set.
//  - tests/test_isel.cpp: adds test isel.sparse.clause. Its loop covers GFX10
//    only. The compute shader issues four sparseTextureOffsetARB() calls. The
//    check patterns expect four "(noCSE)" p_create_vector definitions, each
//    used by an image_sample_lz_o with tfe, and in the assembly an
//    "s_clause 0x3" followed by four image_sample_lz_o instructions.
//    NOTE(review): the shader declares both `tex` and `Buf` with binding=0 --
//    confirm this is intended.
//
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 016d3a043d9..94b35a3a063 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -5887,6 +5887,11 @@ Operand emit_tfe_init(Builder& bld, Temp dst) for (unsigned i = 0; i < dst.size(); i++) vec->operands[i] = Operand(0u); vec->definitions[0] = Definition(tmp); + /* Since this is fixed to an instruction's definition register, any CSE will + * just create copies. Copying costs about the same as zero-initialization, + * but these copies can break up clauses. + */ + vec->definitions[0].setNoCSE(true); bld.insert(std::move(vec)); return Operand(tmp); diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 9345321c1dd..a97927b2036 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -851,7 +851,7 @@ class Definition final { public: constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), - isKill_(0), isPrecise_(0), isNUW_(0) {} + isKill_(0), isPrecise_(0), isNUW_(0), isNoCSE_(0) {} Definition(uint32_t index, RegClass type) noexcept : temp(index, type) {} explicit Definition(Temp tmp) noexcept @@ -959,6 +959,16 @@ public: return isNUW_; } + constexpr void setNoCSE(bool noCSE) noexcept + { + isNoCSE_ = noCSE; + } + + constexpr bool isNoCSE() const noexcept + { + return isNoCSE_; + } + private: Temp temp = Temp(0, s1); PhysReg reg_; @@ -969,6 +979,7 @@ private: uint8_t isKill_:1; uint8_t isPrecise_:1; uint8_t isNUW_:1; + uint8_t isNoCSE_:1; }; /* can't initialize bit-fields in c++11, so work around using a union */ uint8_t control_ = 0; diff --git a/src/amd/compiler/aco_opt_value_numbering.cpp b/src/amd/compiler/aco_opt_value_numbering.cpp index af05df9c682..8dc2812bc7e 100644 --- a/src/amd/compiler/aco_opt_value_numbering.cpp +++ b/src/amd/compiler/aco_opt_value_numbering.cpp @@ -383,7 +383,7 @@ void process_block(vn_ctx& ctx, Block& block) 
instr->opcode == aco_opcode::p_demote_to_helper) ctx.exec_id++; - if (instr->definitions.empty() || instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi) { + if (instr->definitions.empty() || is_phi(instr) || instr->definitions[0].isNoCSE()) { new_instructions.emplace_back(std::move(instr)); continue; } diff --git a/src/amd/compiler/aco_print_ir.cpp b/src/amd/compiler/aco_print_ir.cpp index 6a5da5dac19..f99046da007 100644 --- a/src/amd/compiler/aco_print_ir.cpp +++ b/src/amd/compiler/aco_print_ir.cpp @@ -192,6 +192,8 @@ static void print_definition(const Definition *definition, FILE *output) fprintf(output, "(precise)"); if (definition->isNUW()) fprintf(output, "(nuw)"); + if (definition->isNoCSE()) + fprintf(output, "(noCSE)"); fprintf(output, "%%%d", definition->tempId()); if (definition->isFixed()) diff --git a/src/amd/compiler/tests/test_isel.cpp b/src/amd/compiler/tests/test_isel.cpp index 83daf37b8d9..676ff2c83c2 100644 --- a/src/amd/compiler/tests/test_isel.cpp +++ b/src/amd/compiler/tests/test_isel.cpp @@ -133,3 +133,44 @@ BEGIN_TEST(isel.gs.no_verts) fprintf(output, "success\n"); } END_TEST + +BEGIN_TEST(isel.sparse.clause) + for (unsigned i = GFX10; i <= GFX10; i++) { + if (!set_variant((chip_class)i)) + continue; + + QoShaderModuleCreateInfo cs = qoShaderModuleCreateInfoGLSL(COMPUTE, + QO_EXTENSION GL_ARB_sparse_texture2 : require + layout(local_size_x=1) in; + layout(binding=0) uniform sampler2D tex; + layout(binding=0) buffer Buf { + vec4 res[4]; + uint code[4]; + }; + void main() { + //>> v5: (noCSE)%zero0 = p_create_vector 0, 0, 0, 0, 0 + //>> v5: %_ = image_sample_lz_o %_, %_, %_, %zero0 dmask:xyzw 2d tfe storage: semantics: scope:invocation + //>> v5: (noCSE)%zero1 = p_create_vector 0, 0, 0, 0, 0 + //>> v5: %_ = image_sample_lz_o %_, %_, %_, %zero1 dmask:xyzw 2d tfe storage: semantics: scope:invocation + //>> v5: (noCSE)%zero2 = p_create_vector 0, 0, 0, 0, 0 + //>> v5: %_ = image_sample_lz_o %_, %_, %_, %zero2 dmask:xyzw 
2d tfe storage: semantics: scope:invocation + //>> v5: (noCSE)%zero3 = p_create_vector 0, 0, 0, 0, 0 + //>> v5: %_ = image_sample_lz_o %_, %_, %_, %zero3 dmask:xyzw 2d tfe storage: semantics: scope:invocation + //>> s_clause 0x3 + //! image_sample_lz_o v#_, v[#_:#_], @s256(img), @s128(samp) dmask:0xf dim:SQ_RSRC_IMG_2D tfe + //! image_sample_lz_o v#_, v[#_:#_], @s256(img), @s128(samp) dmask:0xf dim:SQ_RSRC_IMG_2D tfe + //! image_sample_lz_o v#_, v[#_:#_], @s256(img), @s128(samp) dmask:0xf dim:SQ_RSRC_IMG_2D tfe + //! image_sample_lz_o v#_, v[#_:#_], @s256(img), @s128(samp) dmask:0xf dim:SQ_RSRC_IMG_2D tfe + code[0] = sparseTextureOffsetARB(tex, vec2(0.5), ivec2(1, 0), res[0]); + code[1] = sparseTextureOffsetARB(tex, vec2(0.5), ivec2(2, 0), res[1]); + code[2] = sparseTextureOffsetARB(tex, vec2(0.5), ivec2(3, 0), res[2]); + code[3] = sparseTextureOffsetARB(tex, vec2(0.5), ivec2(4, 0), res[3]); + } + ); + + PipelineBuilder pbld(get_vk_device((chip_class)i)); + pbld.add_cs(cs); + pbld.print_ir(VK_SHADER_STAGE_COMPUTE_BIT, "ACO IR", true); + pbld.print_ir(VK_SHADER_STAGE_COMPUTE_BIT, "Assembly", true); + } +END_TEST