freedreno/ir3: handle global atomics

Only for a6xx, since we don't know the instructions for global
atomics on previous gens. Per Qualcomm's docs, OpenCL atomics are
only supported since a5xx, together with the generic memory space.

Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8717>
This commit is contained in:
Danylo Piliaiev 2021-07-26 13:50:03 +03:00 committed by Marge Bot
parent 5d5b1fc472
commit 99388f0c27
7 changed files with 152 additions and 3 deletions

View File

@ -344,6 +344,16 @@ nir_intrinsic_writes_external_memory(const nir_intrinsic_instr *instr)
case nir_intrinsic_global_atomic_umax:
case nir_intrinsic_global_atomic_umin:
case nir_intrinsic_global_atomic_xor:
case nir_intrinsic_global_atomic_add_ir3:
case nir_intrinsic_global_atomic_and_ir3:
case nir_intrinsic_global_atomic_comp_swap_ir3:
case nir_intrinsic_global_atomic_exchange_ir3:
case nir_intrinsic_global_atomic_imax_ir3:
case nir_intrinsic_global_atomic_imin_ir3:
case nir_intrinsic_global_atomic_or_ir3:
case nir_intrinsic_global_atomic_umax_ir3:
case nir_intrinsic_global_atomic_umin_ir3:
case nir_intrinsic_global_atomic_xor_ir3:
case nir_intrinsic_image_atomic_add:
case nir_intrinsic_image_atomic_and:
case nir_intrinsic_image_atomic_comp_swap:

View File

@ -659,18 +659,25 @@ intrinsic("load_vulkan_descriptor", src_comp=[-1], dest_comp=0,
# 1: The data parameter to the atomic function (i.e. the value to add
# in shared_atomic_add, etc).
# 2: For CompSwap only: the second data parameter.
#
# IR3 global operations take a 32b vec2 as the memory address. IR3 doesn't
# support float atomics.
def memory_atomic_data1(name):
    # Declare the single-data-source atomic ("value" operand only) for every
    # memory space: deref, ssbo, shared, and global.
    for prefix, comps, idx in (("deref_atomic_",  [-1, 1],    [ACCESS]),
                               ("ssbo_atomic_",   [-1, 1, 1], [ACCESS]),
                               ("shared_atomic_", [1, 1],     [BASE]),
                               ("global_atomic_", [1, 1],     [BASE])):
        intrinsic(prefix + name, src_comp=comps, dest_comp=1, indices=idx)
    # IR3 variant: address is a 32b vec2; IR3 has no float atomics, so skip
    # the f* ops.
    if not name.startswith('f'):
        intrinsic("global_atomic_" + name + "_ir3", src_comp=[2, 1], dest_comp=1, indices=[BASE])
def memory_atomic_data2(name):
    # Declare the two-data-source atomic (e.g. comp_swap's compare + value)
    # for every memory space: deref, ssbo, shared, and global.
    for prefix, comps, idx in (("deref_atomic_",  [-1, 1, 1],    [ACCESS]),
                               ("ssbo_atomic_",   [-1, 1, 1, 1], [ACCESS]),
                               ("shared_atomic_", [1, 1, 1],     [BASE]),
                               ("global_atomic_", [1, 1, 1],     [BASE])):
        intrinsic(prefix + name, src_comp=comps, dest_comp=1, indices=idx)
    # IR3 variant: address is a 32b vec2; IR3 has no float atomics, so skip
    # the f* ops.
    if not name.startswith('f'):
        intrinsic("global_atomic_" + name + "_ir3", src_comp=[2, 1, 1], dest_comp=1, indices=[BASE])
memory_atomic_data1("add")
memory_atomic_data1("imin")

View File

@ -355,6 +355,12 @@ emit_intrinsic_atomic_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)
return atomic;
}
/* Global atomics are not implemented for this backend; only the a6xx
 * backend provides a real emit_intrinsic_atomic_global.  Reaching this
 * stub means a global_atomic_*_ir3 intrinsic was emitted for a gen that
 * cannot handle it.
 */
static struct ir3_instruction *
emit_intrinsic_atomic_global(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
   /* Fixed: "Global atomic are" -> "Global atomics are" in the abort text. */
   unreachable("Global atomics are unimplemented on A5xx");
}
const struct ir3_context_funcs ir3_a4xx_funcs = {
.emit_intrinsic_load_ssbo = emit_intrinsic_load_ssbo,
.emit_intrinsic_store_ssbo = emit_intrinsic_store_ssbo,
@ -365,4 +371,5 @@ const struct ir3_context_funcs ir3_a4xx_funcs = {
.emit_intrinsic_image_size = emit_intrinsic_image_size_tex,
.emit_intrinsic_load_global_ir3 = NULL,
.emit_intrinsic_store_global_ir3 = NULL,
.emit_intrinsic_atomic_global = emit_intrinsic_atomic_global,
};

View File

@ -441,6 +441,73 @@ emit_intrinsic_store_global_ir3(struct ir3_context *ctx,
stg->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
}
/* Emit an a6xx global-memory atomic.  src[0] is the address as a 32b vec2,
 * src[1] the data operand, and (for comp_swap only) src[2] the swap value's
 * comparison partner.  Returns the instruction whose result is the previous
 * value in memory.
 */
static struct ir3_instruction *
emit_intrinsic_atomic_global(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
   struct ir3_block *block = ctx->block;
   struct ir3_instruction *address, *data, *result;
   struct ir3_instruction *operand = ir3_get_src(ctx, &intr->src[1])[0];
   type_t type = TYPE_U32;

   /* Collect the two 32b address halves into a single vec2 source. */
   address = ir3_collect(block, ir3_get_src(ctx, &intr->src[0])[0],
                         ir3_get_src(ctx, &intr->src[0])[1]);

   /* comp_swap packs {compare, value} into one vec2 data source. */
   if (intr->intrinsic == nir_intrinsic_global_atomic_comp_swap_ir3) {
      data = ir3_collect(block, ir3_get_src(ctx, &intr->src[2])[0], operand);
   } else {
      data = operand;
   }

   switch (intr->intrinsic) {
   case nir_intrinsic_global_atomic_add_ir3:
      result = ir3_ATOMIC_G_ADD(block, address, 0, data, 0);
      break;
   case nir_intrinsic_global_atomic_imin_ir3:
      type = TYPE_S32;
      /* fallthrough */
   case nir_intrinsic_global_atomic_umin_ir3:
      result = ir3_ATOMIC_G_MIN(block, address, 0, data, 0);
      break;
   case nir_intrinsic_global_atomic_imax_ir3:
      type = TYPE_S32;
      /* fallthrough */
   case nir_intrinsic_global_atomic_umax_ir3:
      result = ir3_ATOMIC_G_MAX(block, address, 0, data, 0);
      break;
   case nir_intrinsic_global_atomic_and_ir3:
      result = ir3_ATOMIC_G_AND(block, address, 0, data, 0);
      break;
   case nir_intrinsic_global_atomic_or_ir3:
      result = ir3_ATOMIC_G_OR(block, address, 0, data, 0);
      break;
   case nir_intrinsic_global_atomic_xor_ir3:
      result = ir3_ATOMIC_G_XOR(block, address, 0, data, 0);
      break;
   case nir_intrinsic_global_atomic_exchange_ir3:
      result = ir3_ATOMIC_G_XCHG(block, address, 0, data, 0);
      break;
   case nir_intrinsic_global_atomic_comp_swap_ir3:
      result = ir3_ATOMIC_G_CMPXCHG(block, address, 0, data, 0);
      break;
   default:
      unreachable("Unknown global atomic op");
   }

   result->cat6.iim_val = 1;
   result->cat6.d = 1;
   result->cat6.type = type;
   result->barrier_class = IR3_BARRIER_BUFFER_W;
   result->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;

   /* The atomic has a memory side effect, so it must survive DCE even when
    * nothing consumes its result:
    */
   array_insert(block, block->keeps, result);

   return result;
}
const struct ir3_context_funcs ir3_a6xx_funcs = {
.emit_intrinsic_load_ssbo = emit_intrinsic_load_ssbo,
.emit_intrinsic_store_ssbo = emit_intrinsic_store_ssbo,
@ -451,4 +518,5 @@ const struct ir3_context_funcs ir3_a6xx_funcs = {
.emit_intrinsic_image_size = emit_intrinsic_image_size,
.emit_intrinsic_load_global_ir3 = emit_intrinsic_load_global_ir3,
.emit_intrinsic_store_global_ir3 = emit_intrinsic_store_global_ir3,
.emit_intrinsic_atomic_global = emit_intrinsic_atomic_global,
};

View File

@ -2250,6 +2250,20 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
case nir_intrinsic_bindless_resource_ir3:
dst[0] = ir3_get_src(ctx, &intr->src[0])[0];
break;
case nir_intrinsic_global_atomic_add_ir3:
case nir_intrinsic_global_atomic_imin_ir3:
case nir_intrinsic_global_atomic_umin_ir3:
case nir_intrinsic_global_atomic_imax_ir3:
case nir_intrinsic_global_atomic_umax_ir3:
case nir_intrinsic_global_atomic_and_ir3:
case nir_intrinsic_global_atomic_or_ir3:
case nir_intrinsic_global_atomic_xor_ir3:
case nir_intrinsic_global_atomic_exchange_ir3:
case nir_intrinsic_global_atomic_comp_swap_ir3: {
dst[0] = ctx->funcs->emit_intrinsic_atomic_global(ctx, intr);
break;
}
default:
ir3_context_error(ctx, "Unhandled intrinsic type: %s\n",
nir_intrinsic_infos[intr->intrinsic].name);

View File

@ -188,6 +188,8 @@ struct ir3_context_funcs {
struct ir3_instruction **dst);
void (*emit_intrinsic_store_global_ir3)(struct ir3_context *ctx,
nir_intrinsic_instr *intr);
struct ir3_instruction *(*emit_intrinsic_atomic_global)(
struct ir3_context *ctx, nir_intrinsic_instr *intr);
};
extern const struct ir3_context_funcs ir3_a4xx_funcs;

View File

@ -229,9 +229,24 @@ lower_64b_global_filter(const nir_instr *instr, const void *unused)
return false;
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
return (intr->intrinsic == nir_intrinsic_load_global) ||
(intr->intrinsic == nir_intrinsic_load_global_constant) ||
(intr->intrinsic == nir_intrinsic_store_global);
switch (intr->intrinsic) {
case nir_intrinsic_load_global:
case nir_intrinsic_load_global_constant:
case nir_intrinsic_store_global:
case nir_intrinsic_global_atomic_add:
case nir_intrinsic_global_atomic_imin:
case nir_intrinsic_global_atomic_umin:
case nir_intrinsic_global_atomic_imax:
case nir_intrinsic_global_atomic_umax:
case nir_intrinsic_global_atomic_and:
case nir_intrinsic_global_atomic_or:
case nir_intrinsic_global_atomic_xor:
case nir_intrinsic_global_atomic_exchange:
case nir_intrinsic_global_atomic_comp_swap:
return true;
default:
return false;
}
}
static nir_ssa_def *
@ -250,6 +265,32 @@ lower_64b_global(nir_builder *b, nir_instr *instr, void *unused)
* those up into max 4 components per load/store.
*/
#define GLOBAL_IR3_2SRC(name) \
case nir_intrinsic_##name: { \
return nir_build_##name##_ir3(b, nir_dest_bit_size(intr->dest), addr, \
nir_ssa_for_src(b, intr->src[1], 1)); \
}
switch (intr->intrinsic) {
GLOBAL_IR3_2SRC(global_atomic_add)
GLOBAL_IR3_2SRC(global_atomic_imin)
GLOBAL_IR3_2SRC(global_atomic_umin)
GLOBAL_IR3_2SRC(global_atomic_imax)
GLOBAL_IR3_2SRC(global_atomic_umax)
GLOBAL_IR3_2SRC(global_atomic_and)
GLOBAL_IR3_2SRC(global_atomic_or)
GLOBAL_IR3_2SRC(global_atomic_xor)
GLOBAL_IR3_2SRC(global_atomic_exchange)
case nir_intrinsic_global_atomic_comp_swap:
return nir_build_global_atomic_comp_swap_ir3(
b, nir_dest_bit_size(intr->dest), addr,
nir_ssa_for_src(b, intr->src[1], 1),
nir_ssa_for_src(b, intr->src[2], 1));
default:
break;
}
#undef GLOBAL_IR3_2SRC
if (load) {
unsigned num_comp = nir_intrinsic_dest_components(intr);
nir_ssa_def *components[num_comp];