diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index cdedb516542..7f4618b21e9 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -5555,13 +5555,15 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) emit_wqm(ctx, tmp.getTemp(), get_ssa_temp(ctx, &instr->dest.ssa)); break; } - case nir_intrinsic_shuffle: { + case nir_intrinsic_shuffle: + case nir_intrinsic_read_invocation: { Temp src = get_ssa_temp(ctx, instr->src[0].ssa); - if (!ctx->divergent_vals[instr->dest.ssa.index] && - !ctx->divergent_vals[instr->src[0].ssa->index]) { + if (!ctx->divergent_vals[instr->src[0].ssa->index]) { emit_uniform_subgroup(ctx, instr, src); } else { Temp tid = get_ssa_temp(ctx, instr->src[1].ssa); + if (instr->intrinsic == nir_intrinsic_read_invocation || !ctx->divergent_vals[instr->src[1].ssa->index]) + tid = bld.as_uniform(tid); Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); if (src.regClass() == v1) { emit_wqm(ctx, emit_bpermute(ctx, bld, tid, src), dst); @@ -5572,6 +5574,8 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) hi = emit_wqm(ctx, emit_bpermute(ctx, bld, tid, hi)); bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi); emit_split_vector(ctx, dst, 2); + } else if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2 && tid.regClass() == s1) { + emit_wqm(ctx, bld.sopc(aco_opcode::s_bitcmp1_b64, bld.def(s1, scc), src, tid), dst); } else if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) { Temp tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), tid, src); tmp = emit_extract_vector(ctx, tmp, 0, v1); @@ -5624,32 +5628,6 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) } break; } - case nir_intrinsic_read_invocation: { - Temp src = get_ssa_temp(ctx, instr->src[0].ssa); - Temp lane = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa)); - Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - if (src.regClass() == v1) { - emit_wqm(ctx, bld.vop3(aco_opcode::v_readlane_b32, bld.def(s1), src, lane), dst); - } else if (src.regClass() == v2) { - Temp lo = bld.tmp(v1), hi = bld.tmp(v1); - bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src); - lo = emit_wqm(ctx, bld.vop3(aco_opcode::v_readlane_b32, bld.def(s1), lo, lane)); - hi = emit_wqm(ctx, bld.vop3(aco_opcode::v_readlane_b32, bld.def(s1), hi, lane)); - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi); - emit_split_vector(ctx, dst, 2); - } else if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) { - emit_wqm(ctx, bld.sopc(aco_opcode::s_bitcmp1_b64, bld.def(s1, scc), src, lane), dst); - } else if (src.regClass() == s1) { - bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src); - } else if (src.regClass() == s2) { - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src); - } else { - fprintf(stderr, "Unimplemented NIR instr bit size: "); - nir_print_instr(&instr->instr, stderr); - fprintf(stderr, "\n"); - } - break; - } case nir_intrinsic_vote_all: { Temp src = as_divergent_bool(ctx, get_ssa_temp(ctx, instr->src[0].ssa), false); Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);