From f06c6463157c72d294720812c5d5445abf3ceb71 Mon Sep 17 00:00:00 2001 From: Philip Rebohle Date: Wed, 3 Apr 2024 14:55:43 +0200 Subject: [PATCH] [dxbc] Remove broken atomic counter subgroup optimization This is not a legal optimization inside non-uniform control flow due to Vulkan's extremely permissive convergence rules, and apparently breaks on Nvidia as a result. Mesa drivers already do the same thing internally anyway. --- src/dxbc/dxbc_compiler.cpp | 79 ++------------------------------------ src/dxbc/dxbc_options.cpp | 3 -- src/dxbc/dxbc_options.h | 4 -- 3 files changed, 4 insertions(+), 82 deletions(-) diff --git a/src/dxbc/dxbc_compiler.cpp b/src/dxbc/dxbc_compiler.cpp index c3a6718b..8391d9f4 100644 --- a/src/dxbc/dxbc_compiler.cpp +++ b/src/dxbc/dxbc_compiler.cpp @@ -2464,58 +2464,6 @@ namespace dxvk { if (m_uavs.at(registerId).ctrId == 0) m_uavs.at(registerId).ctrId = emitDclUavCounter(registerId); - // Only use subgroup ops on compute to avoid having to - // deal with helper invocations or hardware limitations - bool useSubgroupOps = m_moduleInfo.options.useSubgroupOpsForAtomicCounters - && m_programInfo.type() == DxbcProgramType::ComputeShader; - - // Current block ID used in a phi later on - uint32_t baseBlockId = m_module.getBlockId(); - - // In case we have subgroup ops enabled, we need to - // count the number of active lanes, the lane index, - // and we need to perform the atomic op conditionally - uint32_t laneCount = 0; - uint32_t laneIndex = 0; - - DxbcConditional elect; - - if (useSubgroupOps) { - m_module.enableCapability(spv::CapabilityGroupNonUniform); - m_module.enableCapability(spv::CapabilityGroupNonUniformBallot); - - uint32_t ballot = m_module.opGroupNonUniformBallot( - getVectorTypeId({ DxbcScalarType::Uint32, 4 }), - m_module.constu32(spv::ScopeSubgroup), - m_module.constBool(true)); - - laneCount = m_module.opGroupNonUniformBallotBitCount( - getScalarTypeId(DxbcScalarType::Uint32), - m_module.constu32(spv::ScopeSubgroup), - spv::GroupOperationReduce, ballot); - - laneIndex = m_module.opGroupNonUniformBallotBitCount( - getScalarTypeId(DxbcScalarType::Uint32), - m_module.constu32(spv::ScopeSubgroup), - spv::GroupOperationExclusiveScan, ballot); - - // Elect one lane to perform the atomic op - uint32_t election = m_module.opGroupNonUniformElect( - m_module.defBoolType(), - m_module.constu32(spv::ScopeSubgroup)); - - elect.labelIf = m_module.allocateId(); - elect.labelEnd = m_module.allocateId(); - - m_module.opSelectionMerge(elect.labelEnd, spv::SelectionControlMaskNone); - m_module.opBranchConditional(election, elect.labelIf, elect.labelEnd); - - m_module.opLabel(elect.labelIf); - } else { - // We're going to use this for the increment - laneCount = m_module.constu32(1); - } - // Get a pointer to the atomic counter in question DxbcRegisterInfo ptrType; ptrType.type.ctype = DxbcScalarType::Uint32; @@ -2547,13 +2495,14 @@ namespace dxvk { switch (ins.op) { case DxbcOpcode::ImmAtomicAlloc: value.id = m_module.opAtomicIAdd(typeId, ptrId, - scopeId, semanticsId, laneCount); + scopeId, semanticsId, m_module.constu32(1)); break; case DxbcOpcode::ImmAtomicConsume: value.id = m_module.opAtomicISub(typeId, ptrId, - scopeId, semanticsId, laneCount); - value.id = m_module.opISub(typeId, value.id, laneCount); + scopeId, semanticsId, m_module.constu32(1)); + value.id = m_module.opISub(typeId, value.id, + m_module.constu32(1)); break; default: @@ -2563,26 +2512,6 @@ namespace dxvk { return; } - // If we're using subgroup ops, we have to broadcast - // the result of the atomic op and compute the index - if (useSubgroupOps) { - m_module.opBranch(elect.labelEnd); - m_module.opLabel (elect.labelEnd); - - uint32_t undef = m_module.constUndef(typeId); - - std::array phiLabels = {{ - { value.id, elect.labelIf }, - { undef, baseBlockId }, - }}; - - value.id = m_module.opPhi(typeId, - phiLabels.size(), phiLabels.data()); - value.id = m_module.opGroupNonUniformBroadcastFirst(typeId, - m_module.constu32(spv::ScopeSubgroup), value.id); - value.id = m_module.opIAdd(typeId, value.id, laneIndex); - } - // Store the result emitRegisterStore(ins.dst[0], value); } diff --git a/src/dxbc/dxbc_options.cpp b/src/dxbc/dxbc_options.cpp index 3cdf2833..e55a0f94 100644 --- a/src/dxbc/dxbc_options.cpp +++ b/src/dxbc/dxbc_options.cpp @@ -17,9 +17,6 @@ namespace dxvk { useDepthClipWorkaround = !devFeatures.extDepthClipEnable.depthClipEnable; - useSubgroupOpsForAtomicCounters - = (devInfo.vk11.subgroupSupportedStages & VK_SHADER_STAGE_COMPUTE_BIT) - && (devInfo.vk11.subgroupSupportedOperations & VK_SUBGROUP_FEATURE_BALLOT_BIT); VkFormatFeatureFlags2 r32Features = device->getFormatFeatures(VK_FORMAT_R32_SFLOAT).optimal diff --git a/src/dxbc/dxbc_options.h b/src/dxbc/dxbc_options.h index 6ea5eccb..27ecca1f 100644 --- a/src/dxbc/dxbc_options.h +++ b/src/dxbc/dxbc_options.h @@ -30,10 +30,6 @@ namespace dxvk { /// Determines whether raw access chains are supported bool supportsRawAccessChains = false; - /// Use subgroup operations to reduce the number of - /// atomic operations for append/consume buffers. - bool useSubgroupOpsForAtomicCounters = false; - /// Clear thread-group shared memory to zero bool zeroInitWorkgroupMemory = false;