diff --git a/src/amd/llvm/ac_nir_to_llvm.c b/src/amd/llvm/ac_nir_to_llvm.c index 85a3858f439..25c71edbc5d 100644 --- a/src/amd/llvm/ac_nir_to_llvm.c +++ b/src/amd/llvm/ac_nir_to_llvm.c @@ -2424,10 +2424,8 @@ static void visit_store_output(struct ac_nir_context *ctx, nir_intrinsic_instr * case 32: break; case 64: - writemask = widen_mask(writemask, 2); - src = LLVMBuildBitCast(ctx->ac.builder, src, - LLVMVectorType(ctx->ac.f32, ac_get_llvm_num_components(src) * 2), ""); - break; + unreachable("64-bit IO should have been lowered to 32 bits"); + return; default: unreachable("unhandled store_output bit size"); return; @@ -3404,12 +3402,24 @@ static LLVMValueRef visit_load(struct ac_nir_context *ctx, nir_intrinsic_instr * LLVMTypeRef component_type; unsigned base = nir_intrinsic_base(instr); unsigned component = nir_intrinsic_component(instr); - unsigned count = instr->dest.ssa.num_components * (instr->dest.ssa.bit_size == 64 ? 2 : 1); + unsigned count = instr->dest.ssa.num_components; nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr); LLVMValueRef vertex_index = vertex_index_src ? get_src(ctx, *vertex_index_src) : NULL; nir_src offset = *nir_get_io_offset_src(instr); LLVMValueRef indir_index = NULL; + switch (instr->dest.ssa.bit_size) { + case 16: + case 32: + break; + case 64: + unreachable("64-bit IO should have been lowered"); + return NULL; + default: + unreachable("unhandled load type"); + return NULL; + } + if (LLVMGetTypeKind(dest_type) == LLVMVectorTypeKind) component_type = LLVMGetElementType(dest_type); else @@ -3420,10 +3430,13 @@ static LLVMValueRef visit_load(struct ac_nir_context *ctx, nir_intrinsic_instr * else indir_index = get_src(ctx, offset); - if (ctx->stage == MESA_SHADER_TESS_CTRL || (ctx->stage == MESA_SHADER_TESS_EVAL && !is_output)) { - LLVMValueRef result = ctx->abi->load_tess_varyings( - ctx->abi, component_type, vertex_index, indir_index, 0, 0, base * 4, component, - instr->num_components, false, false, !is_output); + if (ctx->stage == MESA_SHADER_TESS_CTRL || + (ctx->stage == MESA_SHADER_TESS_EVAL && !is_output)) { + LLVMValueRef result = ctx->abi->load_tess_varyings(ctx->abi, component_type, + vertex_index, indir_index, + 0, 0, base * 4, + component, count, + false, false, !is_output); if (instr->dest.ssa.bit_size == 16) { result = ac_to_integer(&ctx->ac, result); result = LLVMBuildTrunc(ctx->ac.builder, result, dest_type, ""); @@ -3435,11 +3448,10 @@ static LLVMValueRef visit_load(struct ac_nir_context *ctx, nir_intrinsic_instr * assert(!indir_index); if (ctx->stage == MESA_SHADER_GEOMETRY) { - LLVMTypeRef type = LLVMIntTypeInContext(ctx->ac.context, instr->dest.ssa.bit_size); assert(nir_src_is_const(*vertex_index_src)); - return ctx->abi->load_inputs(ctx->abi, 0, base * 4, component, instr->num_components, - nir_src_as_uint(*vertex_index_src), 0, type); + return ctx->abi->load_inputs(ctx->abi, 0, base * 4, component, count, + nir_src_as_uint(*vertex_index_src), 0, component_type); } if (ctx->stage == MESA_SHADER_FRAGMENT && is_output && @@ -3485,8 +3497,6 @@ static LLVMValueRef visit_load(struct ac_nir_context *ctx, nir_intrinsic_instr * LLVMValueRef attr_number = LLVMConstInt(ctx->ac.i32, base, false); for (unsigned chan = 0; chan < count; chan++) { - if (component + chan > 4) - attr_number = LLVMConstInt(ctx->ac.i32, base + 1, false); LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, (component + chan) % 4, false); values[chan] = ac_build_fs_interp_mov(&ctx->ac, LLVMConstInt(ctx->ac.i32, vertex_id, false), llvm_chan, diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h index 5e3a46cf836..2e57eb5e30f 100644 --- a/src/gallium/drivers/radeonsi/si_shader_internal.h +++ b/src/gallium/drivers/radeonsi/si_shader_internal.h @@ -242,8 +242,6 @@ LLVMValueRef si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueR LLVMValueRef si_insert_input_ptr(struct si_shader_context *ctx, LLVMValueRef ret, struct ac_arg param, unsigned return_index); LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx); -LLVMValueRef si_build_gather_64bit(struct si_shader_context *ctx, LLVMTypeRef type, - LLVMValueRef val1, LLVMValueRef val2); void si_llvm_emit_barrier(struct si_shader_context *ctx); void si_llvm_declare_esgs_ring(struct si_shader_context *ctx); void si_init_exec_from_input(struct si_shader_context *ctx, struct ac_arg param, diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm.c b/src/gallium/drivers/radeonsi/si_shader_llvm.c index 0602593ba6e..ab3aed107e3 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm.c @@ -262,17 +262,6 @@ LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx) return list; } -LLVMValueRef si_build_gather_64bit(struct si_shader_context *ctx, LLVMTypeRef type, - LLVMValueRef val1, LLVMValueRef val2) -{ - LLVMValueRef values[2] = { - ac_to_integer(&ctx->ac, val1), - ac_to_integer(&ctx->ac, val2), - }; - LLVMValueRef result = ac_build_gather_values(&ctx->ac, values, 2); - return LLVMBuildBitCast(ctx->ac.builder, result, type, ""); -} - void si_llvm_emit_barrier(struct si_shader_context *ctx) { /* GFX6 only (thanks to a hw bug workaround): diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c index 9337b28a9aa..6b38160c936 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c @@ -79,11 +79,6 @@ static LLVMValueRef si_llvm_load_input_gs(struct ac_shader_abi *abi, unsigned in LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->esgs_ring, vtx_offset); LLVMValueRef value = LLVMBuildLoad(ctx->ac.builder, ptr, ""); - if (ac_get_type_size(type) == 8) { - ptr = LLVMBuildGEP(ctx->ac.builder, ptr, &ctx->ac.i32_1, 1, ""); - LLVMValueRef values[2] = {value, LLVMBuildLoad(ctx->ac.builder, ptr, "")}; - value = ac_build_gather_values(&ctx->ac, values, 2); - } return LLVMBuildBitCast(ctx->ac.builder, value, type, ""); } @@ -97,14 +92,6 @@ static LLVMValueRef si_llvm_load_input_gs(struct ac_shader_abi *abi, unsigned in value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->ac.i32_0, vtx_offset, soffset, 0, ac_glc, true, false); - if (ac_get_type_size(type) == 8) { - LLVMValueRef value2; - soffset = LLVMConstInt(ctx->ac.i32, (param * 4 + swizzle + 1) * 256, 0); - - value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->ac.i32_0, vtx_offset, soffset, - 0, ac_glc, true, false); - return si_build_gather_64bit(ctx, type, value, value2); - } return LLVMBuildBitCast(ctx->ac.builder, value, type, ""); } @@ -116,14 +103,9 @@ static LLVMValueRef si_nir_load_input_gs(struct ac_shader_abi *abi, unsigned loc struct si_shader_context *ctx = si_shader_context_from_abi(abi); LLVMValueRef value[4]; - for (unsigned i = 0; i < num_components; i++) { - unsigned offset = i; - if (ac_get_type_size(type) == 8) - offset *= 2; - - offset += component; - value[i + component] = si_llvm_load_input_gs(&ctx->abi, driver_location / 4 + const_index, - vertex_index, type, offset); + for (unsigned i = component; i < component + num_components; i++) { + value[i] = si_llvm_load_input_gs(&ctx->abi, driver_location / 4 + const_index, + vertex_index, type, i); } return ac_build_varying_gather_values(&ctx->ac, value, num_components, component); diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c index 75d80b0ac5b..5bb3eeb7e05 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c @@ -269,7 +269,7 @@ static LLVMValueRef buffer_load(struct si_shader_context *ctx, LLVMTypeRef type, LLVMValueRef buffer, LLVMValueRef offset, LLVMValueRef base, bool can_speculate) { - LLVMValueRef value, value2; + LLVMValueRef value; LLVMTypeRef vec_type = LLVMVectorType(type, 4); if (swizzle == ~0) { @@ -279,22 +279,12 @@ static LLVMValueRef buffer_load(struct si_shader_context *ctx, LLVMTypeRef type, return LLVMBuildBitCast(ctx->ac.builder, value, vec_type, ""); } - if (ac_get_type_size(type) != 8) { - value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset, 0, ac_glc, - can_speculate, false); - - value = LLVMBuildBitCast(ctx->ac.builder, value, vec_type, ""); - return LLVMBuildExtractElement(ctx->ac.builder, value, LLVMConstInt(ctx->ac.i32, swizzle, 0), - ""); - } - - value = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset, swizzle * 4, ac_glc, + value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset, 0, ac_glc, can_speculate, false); - value2 = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset, swizzle * 4 + 4, ac_glc, - can_speculate, false); - - return si_build_gather_64bit(ctx, type, value, value2); + value = LLVMBuildBitCast(ctx->ac.builder, value, vec_type, ""); + return LLVMBuildExtractElement(ctx->ac.builder, value, LLVMConstInt(ctx->ac.i32, swizzle, 0), + ""); } /** @@ -318,19 +308,8 @@ static LLVMValueRef lshs_lds_load(struct si_shader_context *ctx, LLVMTypeRef typ return ac_build_gather_values(&ctx->ac, values, 4); } - /* Split 64-bit loads. */ - if (ac_get_type_size(type) == 8) { - LLVMValueRef lo, hi; - - lo = lshs_lds_load(ctx, ctx->ac.i32, swizzle, dw_addr); - hi = lshs_lds_load(ctx, ctx->ac.i32, swizzle + 1, dw_addr); - return si_build_gather_64bit(ctx, type, lo, hi); - } - dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr, LLVMConstInt(ctx->ac.i32, swizzle, 0), ""); - value = ac_lds_load(&ctx->ac, dw_addr); - return LLVMBuildBitCast(ctx->ac.builder, value, type, ""); } @@ -443,14 +422,8 @@ static LLVMValueRef si_nir_load_tcs_varyings(struct ac_shader_abi *abi, LLVMType semantic); LLVMValueRef value[4]; - for (unsigned i = 0; i < num_components; i++) { - unsigned offset = i; - if (ac_get_type_size(type) == 8) - offset *= 2; - - offset += component; - value[i + component] = lshs_lds_load(ctx, type, offset, dw_addr); - } + for (unsigned i = component; i < component + num_components; i++) + value[i] = lshs_lds_load(ctx, type, i, dw_addr); return ac_build_varying_gather_values(&ctx->ac, value, num_components, component); } @@ -487,23 +460,8 @@ static LLVMValueRef si_nir_load_input_tes(struct ac_shader_abi *abi, LLVMTypeRef * to refactor buffer_load(). */ LLVMValueRef value[4]; - for (unsigned i = 0; i < num_components; i++) { - unsigned offset = i; - if (ac_get_type_size(type) == 8) { - offset *= 2; - if (offset == 4) { - ubyte semantic = info->input_semantic[driver_location + 1]; - addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index, param_index, - semantic); - } - - offset = offset % 4; - } - - offset += component; - value[i + component] = - buffer_load(ctx, type, offset, ctx->tess_offchip_ring, base, addr, true); - } + for (unsigned i = component; i < component + num_components; i++) + value[i] = buffer_load(ctx, type, i, ctx->tess_offchip_ring, base, addr, true); return ac_build_varying_gather_values(&ctx->ac, value, num_components, component); } @@ -563,20 +521,13 @@ static void si_nir_store_output_tcs(struct ac_shader_abi *abi, const struct nir_ addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index, param_index, semantic); - for (unsigned chan = component; chan < 8; chan++) { + for (unsigned chan = component; chan < 4; chan++) { if (!(writemask & (1 << chan))) continue; LLVMValueRef value = ac_llvm_extract_elem(&ctx->ac, src, chan - component); - unsigned buffer_store_offset = chan % 4; - if (chan == 4) { - ubyte semantic = info->output_semantic[driver_location + 1]; - addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index, param_index, - semantic); - } - /* Skip LDS stores if there is no LDS read of this output. */ - if (info->output_readmask[driver_location + chan / 4] & (1 << (chan % 4)) || + if (info->output_readmask[driver_location] & (1 << chan) || /* The epilog reads LDS if invocation 0 doesn't define tess factors. */ (is_tess_factor && !ctx->shader->selector->info.tessfactors_are_def_in_all_invocs)) @@ -587,7 +538,7 @@ static void si_nir_store_output_tcs(struct ac_shader_abi *abi, const struct nir_ if (writemask != 0xF && !is_tess_factor) { ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1, addr, base, - 4 * buffer_store_offset, ac_glc); + 4 * chan, ac_glc); } /* Write tess factors into VGPRs for the epilog. */ diff --git a/src/gallium/drivers/radeonsi/si_shader_nir.c b/src/gallium/drivers/radeonsi/si_shader_nir.c index d88e07c9642..534973bcf49 100644 --- a/src/gallium/drivers/radeonsi/si_shader_nir.c +++ b/src/gallium/drivers/radeonsi/si_shader_nir.c @@ -64,29 +64,18 @@ static void scan_io_usage(struct si_shader_info *info, nir_intrinsic_instr *intr } unsigned mask, bit_size; - bool dual_slot, is_output_load; + bool is_output_load; if (nir_intrinsic_infos[intr->intrinsic].index_map[NIR_INTRINSIC_WRMASK] > 0) { mask = nir_intrinsic_write_mask(intr); /* store */ bit_size = nir_src_bit_size(intr->src[0]); - dual_slot = bit_size == 64 && nir_src_num_components(intr->src[0]) >= 3; is_output_load = false; } else { mask = nir_ssa_def_components_read(&intr->dest.ssa); /* load */ bit_size = intr->dest.ssa.bit_size; - dual_slot = bit_size == 64 && intr->dest.ssa.num_components >= 3; is_output_load = !is_input; } - - /* Convert the 64-bit component mask to a 32-bit component mask. */ - if (bit_size == 64) { - unsigned new_mask = 0; - for (unsigned i = 0; i < 4; i++) { - if (mask & (1 << i)) - new_mask |= 0x3 << (2 * i); - } - mask = new_mask; - } + assert(bit_size != 64 && !(mask & ~0xf) && "64-bit IO should have been lowered"); /* Convert the 16-bit component mask to a 32-bit component mask. */ if (bit_size == 16) { @@ -120,20 +109,19 @@ static void scan_io_usage(struct si_shader_info *info, nir_intrinsic_instr *intr } unsigned driver_location = nir_intrinsic_base(intr); - unsigned num_slots = indirect ? nir_intrinsic_io_semantics(intr).num_slots : (1 + dual_slot); + unsigned num_slots = indirect ? nir_intrinsic_io_semantics(intr).num_slots : 1; if (is_input) { assert(driver_location + num_slots <= ARRAY_SIZE(info->input_usage_mask)); for (unsigned i = 0; i < num_slots; i++) { unsigned loc = driver_location + i; - unsigned slot_mask = (dual_slot && i % 2 ? mask >> 4 : mask) & 0xf; info->input_semantic[loc] = semantic + i; info->input_interpolate[loc] = interp; - if (slot_mask) { - info->input_usage_mask[loc] |= slot_mask; + if (mask) { + info->input_usage_mask[loc] |= mask; info->num_inputs = MAX2(info->num_inputs, loc + 1); if (semantic == VARYING_SLOT_PRIMITIVE_ID) @@ -147,24 +135,23 @@ static void scan_io_usage(struct si_shader_info *info, nir_intrinsic_instr *intr for (unsigned i = 0; i < num_slots; i++) { unsigned loc = driver_location + i; - unsigned slot_mask = (dual_slot && i % 2 ? mask >> 4 : mask) & 0xf; info->output_semantic[loc] = semantic + i; info->output_semantic_to_slot[semantic + i] = loc; if (is_output_load) { /* Output loads have only a few things that we need to track. */ - info->output_readmask[loc] |= slot_mask; + info->output_readmask[loc] |= mask; if (info->stage == MESA_SHADER_FRAGMENT && nir_intrinsic_io_semantics(intr).fb_fetch_output) info->uses_fbfetch = true; - } else if (slot_mask) { + } else if (mask) { /* Output stores. */ if (info->stage == MESA_SHADER_GEOMETRY) { unsigned gs_streams = (uint32_t)nir_intrinsic_io_semantics(intr).gs_streams << (nir_intrinsic_component(intr) * 2); - unsigned new_mask = slot_mask & ~info->output_usagemask[loc]; + unsigned new_mask = mask & ~info->output_usagemask[loc]; for (unsigned i = 0; i < 4; i++) { unsigned stream = (gs_streams >> (i * 2)) & 0x3; @@ -176,7 +163,7 @@ static void scan_io_usage(struct si_shader_info *info, nir_intrinsic_instr *intr } } - info->output_usagemask[loc] |= slot_mask; + info->output_usagemask[loc] |= mask; info->num_outputs = MAX2(info->num_outputs, loc + 1); if (info->stage == MESA_SHADER_FRAGMENT) { @@ -632,7 +619,7 @@ static void si_lower_io(struct nir_shader *nir) si_nir_lower_color(nir); NIR_PASS_V(nir, nir_lower_io, nir_var_shader_out | nir_var_shader_in, - type_size_vec4, 0); + type_size_vec4, nir_lower_io_lower_64bit_to_32); nir->info.io_lowered = true; /* This pass needs actual constants */