aco: Use common argument handling

Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
This commit is contained in:
Connor Abbott 2019-11-13 13:30:52 +01:00
parent e7f4cadd02
commit bb78f9b4e4
6 changed files with 214 additions and 640 deletions

View File

@ -58,6 +58,7 @@ struct ac_shader_args {
enum ac_arg_regfile file;
uint8_t offset;
uint8_t size;
bool skip;
} args[AC_MAX_ARGS];
uint8_t arg_count;

View File

@ -2911,12 +2911,11 @@ void emit_load_frag_coord(isel_context *ctx, Temp dst, unsigned num_components)
{
aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1));
for (unsigned i = 0; i < num_components; i++)
vec->operands[i] = Operand(ctx->fs_inputs[fs_input::frag_pos_0 + i]);
if (ctx->fs_vgpr_args[fs_input::frag_pos_3]) {
vec->operands[i] = Operand(get_arg(ctx, ctx->args->ac.frag_pos[i]));
if (G_0286CC_POS_W_FLOAT_ENA(ctx->program->config->spi_ps_input_ena)) {
assert(num_components == 4);
Builder bld(ctx->program, ctx->block);
vec->operands[3] = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), ctx->fs_inputs[fs_input::frag_pos_3]);
vec->operands[3] = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), get_arg(ctx, ctx->args->ac.frag_pos[3]));
}
for (Operand& op : vec->operands)
@ -2934,7 +2933,7 @@ void visit_load_interpolated_input(isel_context *ctx, nir_intrinsic_instr *instr
Temp coords = get_ssa_temp(ctx, instr->src[0].ssa);
unsigned idx = nir_intrinsic_base(instr);
unsigned component = nir_intrinsic_component(instr);
Temp prim_mask = ctx->prim_mask;
Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask);
nir_const_value* offset = nir_src_as_const_value(instr->src[1]);
if (offset) {
@ -3039,7 +3038,7 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr)
}
uint32_t offset = nir_instr_as_load_const(off_instr)->value[0].u32;
Temp vertex_buffers = convert_pointer_to_64_bit(ctx, ctx->vertex_buffers);
Temp vertex_buffers = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->vertex_buffers));
unsigned location = nir_intrinsic_base(instr) / 4 - VERT_ATTRIB_GENERIC0 + offset;
unsigned component = nir_intrinsic_component(instr);
@ -3064,21 +3063,24 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr)
Temp index;
if (ctx->options->key.vs.instance_rate_inputs & (1u << location)) {
uint32_t divisor = ctx->options->key.vs.instance_rate_divisors[location];
Temp start_instance = get_arg(ctx, ctx->args->ac.start_instance);
if (divisor) {
ctx->needs_instance_id = true;
Temp instance_id = get_arg(ctx, ctx->args->ac.instance_id);
if (divisor != 1) {
Temp divided = bld.tmp(v1);
emit_v_div_u32(ctx, divided, as_vgpr(ctx, ctx->instance_id), divisor);
index = bld.vadd32(bld.def(v1), ctx->start_instance, divided);
emit_v_div_u32(ctx, divided, as_vgpr(ctx, instance_id), divisor);
index = bld.vadd32(bld.def(v1), start_instance, divided);
} else {
index = bld.vadd32(bld.def(v1), ctx->start_instance, ctx->instance_id);
index = bld.vadd32(bld.def(v1), start_instance, instance_id);
}
} else {
index = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), ctx->start_instance);
index = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), start_instance);
}
} else {
index = bld.vadd32(bld.def(v1), ctx->base_vertex, ctx->vertex_id);
index = bld.vadd32(bld.def(v1),
get_arg(ctx, ctx->args->ac.base_vertex),
get_arg(ctx, ctx->args->ac.vertex_id));
}
if (attrib_stride != 0 && attrib_offset > attrib_stride) {
@ -3165,7 +3167,7 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr)
fprintf(stderr, "\n");
}
Temp prim_mask = ctx->prim_mask;
Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask);
nir_const_value* offset = nir_src_as_const_value(instr->src[0]);
if (offset) {
assert(offset->u32 == 0);
@ -3204,11 +3206,11 @@ Temp load_desc_ptr(isel_context *ctx, unsigned desc_set)
{
if (ctx->program->info->need_indirect_descriptor_sets) {
Builder bld(ctx->program, ctx->block);
Temp ptr64 = convert_pointer_to_64_bit(ctx, ctx->descriptor_sets[0]);
Temp ptr64 = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->descriptor_sets[0]));
return bld.smem(aco_opcode::s_load_dword, bld.def(s1), ptr64, Operand(desc_set << 2));//, false, false, false);
}
return ctx->descriptor_sets[desc_set];
return get_arg(ctx, ctx->args->descriptor_sets[desc_set]);
}
@ -3229,7 +3231,7 @@ void visit_load_resource(isel_context *ctx, nir_intrinsic_instr *instr)
if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) {
unsigned idx = pipeline_layout->set[desc_set].dynamic_offset_start + layout->binding[binding].dynamic_offset_offset;
desc_ptr = ctx->push_constants;
desc_ptr = get_arg(ctx, ctx->args->ac.push_constants);
offset = pipeline_layout->push_constant_size + 16 * idx;
stride = 16;
} else {
@ -3473,12 +3475,12 @@ void visit_load_push_constant(isel_context *ctx, nir_intrinsic_instr *instr)
unsigned count = instr->dest.ssa.num_components;
unsigned start = (offset + index_cv->u32) / 4u;
start -= ctx->base_inline_push_consts;
if (start + count <= ctx->num_inline_push_consts) {
start -= ctx->args->ac.base_inline_push_consts;
if (start + count <= ctx->args->ac.num_inline_push_consts) {
std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
for (unsigned i = 0; i < count; ++i) {
elems[i] = ctx->inline_push_consts[start + i];
elems[i] = get_arg(ctx, ctx->args->ac.inline_push_consts[start + i]);
vec->operands[i] = Operand{elems[i]};
}
vec->definitions[0] = Definition(dst);
@ -3491,7 +3493,7 @@ void visit_load_push_constant(isel_context *ctx, nir_intrinsic_instr *instr)
Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
if (offset != 0) // TODO check if index != 0 as well
index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), index);
Temp ptr = convert_pointer_to_64_bit(ctx, ctx->push_constants);
Temp ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.push_constants));
Temp vec = dst;
bool trim = false;
aco_opcode op;
@ -5091,11 +5093,12 @@ void visit_load_sample_mask_in(isel_context *ctx, nir_intrinsic_instr *instr) {
Builder bld(ctx->program, ctx->block);
Temp sample_id = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), ctx->fs_inputs[fs_input::ancillary], Operand(8u), Operand(4u));
Temp sample_id = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1),
get_arg(ctx, ctx->args->ac.ancillary), Operand(8u), Operand(4u));
Temp ps_iter_mask = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(ps_iter_masks[log2_ps_iter_samples]));
Temp mask = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), sample_id, ps_iter_mask);
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
bld.vop2(aco_opcode::v_and_b32, Definition(dst), mask, ctx->fs_inputs[fs_input::sample_coverage]);
bld.vop2(aco_opcode::v_and_b32, Definition(dst), mask, get_arg(ctx, ctx->args->ac.sample_coverage));
}
Temp emit_boolean_reduce(isel_context *ctx, nir_op op, unsigned cluster_size, Temp src)
@ -5239,8 +5242,9 @@ void emit_uniform_subgroup(isel_context *ctx, nir_intrinsic_instr *instr, Temp s
void emit_interp_center(isel_context *ctx, Temp dst, Temp pos1, Temp pos2)
{
Builder bld(ctx->program, ctx->block);
Temp p1 = ctx->fs_inputs[fs_input::persp_center_p1];
Temp p2 = ctx->fs_inputs[fs_input::persp_center_p2];
Temp persp_center = get_arg(ctx, ctx->args->ac.persp_center);
Temp p1 = emit_extract_vector(ctx, persp_center, 0, v1);
Temp p2 = emit_extract_vector(ctx, persp_center, 1, v1);
/* Build DD X/Y */
Temp tl_1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p1, dpp_quad_perm(0, 0, 0, 0));
@ -5271,17 +5275,33 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
case nir_intrinsic_load_barycentric_pixel:
case nir_intrinsic_load_barycentric_centroid: {
glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(instr);
fs_input input = get_interp_input(instr->intrinsic, mode);
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
if (input == fs_input::max_inputs) {
bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
Operand(0u), Operand(0u));
} else {
bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
ctx->fs_inputs[input],
ctx->fs_inputs[input + 1]);
Temp bary = Temp(0, s2);
switch (mode) {
case INTERP_MODE_SMOOTH:
case INTERP_MODE_NONE:
if (instr->intrinsic == nir_intrinsic_load_barycentric_pixel)
bary = get_arg(ctx, ctx->args->ac.persp_center);
else if (instr->intrinsic == nir_intrinsic_load_barycentric_centroid)
bary = ctx->persp_centroid;
else if (instr->intrinsic == nir_intrinsic_load_barycentric_sample)
bary = get_arg(ctx, ctx->args->ac.persp_sample);
break;
case INTERP_MODE_NOPERSPECTIVE:
if (instr->intrinsic == nir_intrinsic_load_barycentric_pixel)
bary = get_arg(ctx, ctx->args->ac.linear_center);
else if (instr->intrinsic == nir_intrinsic_load_barycentric_centroid)
bary = ctx->linear_centroid;
else if (instr->intrinsic == nir_intrinsic_load_barycentric_sample)
bary = get_arg(ctx, ctx->args->ac.linear_sample);
break;
default:
break;
}
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
Temp p1 = emit_extract_vector(ctx, bary, 0, v1);
Temp p2 = emit_extract_vector(ctx, bary, 1, v1);
bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
Operand(p1), Operand(p2));
emit_split_vector(ctx, dst, 2);
break;
}
@ -5352,20 +5372,20 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
}
case nir_intrinsic_load_front_face: {
bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
Operand(0u), ctx->fs_inputs[fs_input::front_face]).def(0).setHint(vcc);
Operand(0u), get_arg(ctx, ctx->args->ac.front_face)).def(0).setHint(vcc);
break;
}
case nir_intrinsic_load_view_index:
case nir_intrinsic_load_layer_id: {
if (instr->intrinsic == nir_intrinsic_load_view_index && (ctx->stage & sw_vs)) {
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
bld.copy(Definition(dst), Operand(ctx->view_index));
bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.view_index)));
break;
}
unsigned idx = nir_intrinsic_base(instr);
bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
Operand(2u), bld.m0(ctx->prim_mask), idx, 0);
Operand(2u), bld.m0(get_arg(ctx, ctx->args->ac.prim_mask)), idx, 0);
break;
}
case nir_intrinsic_load_frag_coord: {
@ -5373,8 +5393,8 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
break;
}
case nir_intrinsic_load_sample_pos: {
Temp posx = ctx->fs_inputs[fs_input::frag_pos_0];
Temp posy = ctx->fs_inputs[fs_input::frag_pos_1];
Temp posx = get_arg(ctx, ctx->args->ac.frag_pos[0]);
Temp posy = get_arg(ctx, ctx->args->ac.frag_pos[1]);
bld.pseudo(aco_opcode::p_create_vector, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
posx.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posx) : Operand(0u),
posy.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posy) : Operand(0u));
@ -5496,36 +5516,38 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
break;
case nir_intrinsic_load_num_work_groups: {
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
bld.copy(Definition(dst), Operand(ctx->num_workgroups));
bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.num_work_groups)));
emit_split_vector(ctx, dst, 3);
break;
}
case nir_intrinsic_load_local_invocation_id: {
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
bld.copy(Definition(dst), Operand(ctx->local_invocation_ids));
bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.local_invocation_ids)));
emit_split_vector(ctx, dst, 3);
break;
}
case nir_intrinsic_load_work_group_id: {
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
Temp* ids = ctx->workgroup_ids;
struct ac_arg *args = ctx->args->ac.workgroup_ids;
bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
ids[0].id() ? Operand(ids[0]) : Operand(1u),
ids[1].id() ? Operand(ids[1]) : Operand(1u),
ids[2].id() ? Operand(ids[2]) : Operand(1u));
args[0].used ? Operand(get_arg(ctx, args[0])) : Operand(1u),
args[1].used ? Operand(get_arg(ctx, args[1])) : Operand(1u),
args[2].used ? Operand(get_arg(ctx, args[2])) : Operand(1u));
emit_split_vector(ctx, dst, 3);
break;
}
case nir_intrinsic_load_local_invocation_index: {
Temp id = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), Operand((uint32_t) -1),
bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u)));
Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0xfc0u), ctx->tg_size);
Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0xfc0u),
get_arg(ctx, ctx->args->ac.tg_size));
bld.vop2(aco_opcode::v_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), tg_num, id);
break;
}
case nir_intrinsic_load_subgroup_id: {
if (ctx->stage == compute_cs) {
Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0xfc0u), ctx->tg_size);
Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0xfc0u),
get_arg(ctx, ctx->args->ac.tg_size));
bld.sop2(aco_opcode::s_lshr_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc), tg_num, Operand(0x6u));
} else {
bld.sop1(aco_opcode::s_mov_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x0u));
@ -5539,7 +5561,8 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
}
case nir_intrinsic_load_num_subgroups: {
if (ctx->stage == compute_cs)
bld.sop2(aco_opcode::s_and_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc), Operand(0x3fu), ctx->tg_size);
bld.sop2(aco_opcode::s_and_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc), Operand(0x3fu),
get_arg(ctx, ctx->args->ac.tg_size));
else
bld.sop1(aco_opcode::s_mov_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x1u));
break;
@ -5601,7 +5624,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
}
case nir_intrinsic_load_sample_id: {
bld.vop3(aco_opcode::v_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
ctx->fs_inputs[ancillary], Operand(8u), Operand(4u));
get_arg(ctx, ctx->args->ac.ancillary), Operand(8u), Operand(4u));
break;
}
case nir_intrinsic_load_sample_mask_in: {
@ -5939,27 +5962,27 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
break;
case nir_intrinsic_load_vertex_id_zero_base: {
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
bld.copy(Definition(dst), ctx->vertex_id);
bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.vertex_id));
break;
}
case nir_intrinsic_load_first_vertex: {
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
bld.copy(Definition(dst), ctx->base_vertex);
bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.base_vertex));
break;
}
case nir_intrinsic_load_base_instance: {
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
bld.copy(Definition(dst), ctx->start_instance);
bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.start_instance));
break;
}
case nir_intrinsic_load_instance_id: {
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
bld.copy(Definition(dst), ctx->instance_id);
bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.instance_id));
break;
}
case nir_intrinsic_load_draw_id: {
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
bld.copy(Definition(dst), ctx->draw_id);
bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.draw_id));
break;
}
default:
@ -7470,12 +7493,12 @@ static void create_vs_exports(isel_context *ctx)
if (outinfo->export_prim_id) {
ctx->vs_output.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1;
ctx->vs_output.outputs[VARYING_SLOT_PRIMITIVE_ID][0] = ctx->vs_prim_id;
ctx->vs_output.outputs[VARYING_SLOT_PRIMITIVE_ID][0] = get_arg(ctx, ctx->args->vs_prim_id);
}
if (ctx->options->key.has_multiview_view_index) {
ctx->vs_output.mask[VARYING_SLOT_LAYER] |= 0x1;
ctx->vs_output.outputs[VARYING_SLOT_LAYER][0] = as_vgpr(ctx, ctx->view_index);
ctx->vs_output.outputs[VARYING_SLOT_LAYER][0] = as_vgpr(ctx, get_arg(ctx, ctx->args->ac.view_index));
}
/* the order these position exports are created is important */
@ -7579,7 +7602,7 @@ static void emit_streamout(isel_context *ctx, unsigned stream)
Builder bld(ctx->program, ctx->block);
Temp so_buffers[4];
Temp buf_ptr = convert_pointer_to_64_bit(ctx, ctx->streamout_buffers);
Temp buf_ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->streamout_buffers));
for (unsigned i = 0; i < 4; i++) {
unsigned stride = ctx->program->info->so.strides[i];
if (!stride)
@ -7589,7 +7612,7 @@ static void emit_streamout(isel_context *ctx, unsigned stream)
}
Temp so_vtx_count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
ctx->streamout_config, Operand(0x70010u));
get_arg(ctx, ctx->args->streamout_config), Operand(0x70010u));
Temp tid = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), Operand((uint32_t) -1),
bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u)));
@ -7601,7 +7624,7 @@ static void emit_streamout(isel_context *ctx, unsigned stream)
bld.reset(ctx->block);
Temp so_write_index = bld.vadd32(bld.def(v1), ctx->streamout_write_idx, tid);
Temp so_write_index = bld.vadd32(bld.def(v1), get_arg(ctx, ctx->args->streamout_write_idx), tid);
Temp so_write_offset[4];
@ -7612,13 +7635,15 @@ static void emit_streamout(isel_context *ctx, unsigned stream)
if (stride == 1) {
Temp offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
ctx->streamout_write_idx, ctx->streamout_offset[i]);
get_arg(ctx, ctx->args->streamout_write_idx),
get_arg(ctx, ctx->args->streamout_offset[i]));
Temp new_offset = bld.vadd32(bld.def(v1), offset, tid);
so_write_offset[i] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), new_offset);
} else {
Temp offset = bld.v_mul_imm(bld.def(v1), so_write_index, stride * 4u);
Temp offset2 = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(4u), ctx->streamout_offset[i]);
Temp offset2 = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(4u),
get_arg(ctx, ctx->args->streamout_offset[i]));
so_write_offset[i] = bld.vadd32(bld.def(v1), offset, offset2);
}
}
@ -7658,27 +7683,38 @@ void handle_bc_optimize(isel_context *ctx)
uint32_t spi_ps_input_ena = ctx->program->config->spi_ps_input_ena;
bool uses_center = G_0286CC_PERSP_CENTER_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTER_ENA(spi_ps_input_ena);
bool uses_centroid = G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena);
ctx->persp_centroid = get_arg(ctx, ctx->args->ac.persp_centroid);
ctx->linear_centroid = get_arg(ctx, ctx->args->ac.linear_centroid);
if (uses_center && uses_centroid) {
Temp sel = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(s2)), ctx->prim_mask, Operand(0u));
Temp sel = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(s2)),
get_arg(ctx, ctx->args->ac.prim_mask), Operand(0u));
if (G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena)) {
Temp new_coord[2];
for (unsigned i = 0; i < 2; i++) {
Temp new_coord = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
ctx->fs_inputs[fs_input::persp_centroid_p1 + i],
ctx->fs_inputs[fs_input::persp_center_p1 + i],
sel);
ctx->fs_inputs[fs_input::persp_centroid_p1 + i] = new_coord;
Temp persp_centroid = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_centroid), i, v1);
Temp persp_center = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_center), i, v1);
new_coord[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
persp_centroid, persp_center, sel);
}
ctx->persp_centroid = bld.tmp(v2);
bld.pseudo(aco_opcode::p_create_vector, Definition(ctx->persp_centroid),
Operand(new_coord[0]), Operand(new_coord[1]));
emit_split_vector(ctx, ctx->persp_centroid, 2);
}
if (G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena)) {
Temp new_coord[2];
for (unsigned i = 0; i < 2; i++) {
Temp new_coord = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
ctx->fs_inputs[fs_input::linear_centroid_p1 + i],
ctx->fs_inputs[fs_input::linear_center_p1 + i],
sel);
ctx->fs_inputs[fs_input::linear_centroid_p1 + i] = new_coord;
Temp linear_centroid = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_centroid), i, v1);
Temp linear_center = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_center), i, v1);
new_coord[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
linear_centroid, linear_center, sel);
}
ctx->linear_centroid = bld.tmp(v2);
bld.pseudo(aco_opcode::p_create_vector, Definition(ctx->linear_centroid),
Operand(new_coord[0]), Operand(new_coord[1]));
emit_split_vector(ctx, ctx->linear_centroid, 2);
}
}
}
@ -7737,10 +7773,9 @@ void select_program(Program *program,
unsigned shader_count,
struct nir_shader *const *shaders,
ac_shader_config* config,
struct radv_shader_info *info,
const struct radv_nir_compiler_options *options)
struct radv_shader_args *args)
{
isel_context ctx = setup_isel_context(program, shader_count, shaders, config, info, options);
isel_context ctx = setup_isel_context(program, shader_count, shaders, config, args);
for (unsigned i = 0; i < shader_count; i++) {
nir_shader *nir = shaders[i];

View File

@ -28,6 +28,7 @@
#include "nir.h"
#include "vulkan/radv_shader.h"
#include "vulkan/radv_descriptor_set.h"
#include "vulkan/radv_shader_args.h"
#include "sid.h"
#include "ac_exp_param.h"
#include "ac_shader_util.h"
@ -38,32 +39,6 @@
namespace aco {
enum fs_input {
persp_sample_p1,
persp_sample_p2,
persp_center_p1,
persp_center_p2,
persp_centroid_p1,
persp_centroid_p2,
persp_pull_model,
linear_sample_p1,
linear_sample_p2,
linear_center_p1,
linear_center_p2,
linear_centroid_p1,
linear_centroid_p2,
line_stipple,
frag_pos_0,
frag_pos_1,
frag_pos_2,
frag_pos_3,
front_face,
ancillary,
sample_coverage,
fixed_pt,
max_inputs,
};
struct vs_output_state {
uint8_t mask[VARYING_SLOT_VAR31 + 1];
Temp outputs[VARYING_SLOT_VAR31 + 1][4];
@ -71,6 +46,7 @@ struct vs_output_state {
struct isel_context {
const struct radv_nir_compiler_options *options;
struct radv_shader_args *args;
Program *program;
nir_shader *shader;
uint32_t constant_data_offset;
@ -95,51 +71,30 @@ struct isel_context {
bool exec_potentially_empty = false;
} cf_info;
Temp arg_temps[AC_MAX_ARGS];
/* inputs common for merged stages */
Temp merged_wave_info = Temp(0, s1);
/* FS inputs */
bool fs_vgpr_args[fs_input::max_inputs];
Temp fs_inputs[fs_input::max_inputs];
Temp prim_mask = Temp(0, s1);
Temp descriptor_sets[MAX_SETS];
Temp push_constants = Temp(0, s1);
Temp inline_push_consts[MAX_INLINE_PUSH_CONSTS];
unsigned num_inline_push_consts = 0;
unsigned base_inline_push_consts = 0;
Temp persp_centroid, linear_centroid;
/* VS inputs */
Temp vertex_buffers = Temp(0, s1);
Temp base_vertex = Temp(0, s1);
Temp start_instance = Temp(0, s1);
Temp draw_id = Temp(0, s1);
Temp view_index = Temp(0, s1);
Temp es2gs_offset = Temp(0, s1);
Temp vertex_id = Temp(0, v1);
Temp rel_auto_id = Temp(0, v1);
Temp instance_id = Temp(0, v1);
Temp vs_prim_id = Temp(0, v1);
bool needs_instance_id;
/* CS inputs */
Temp num_workgroups = Temp(0, s3);
Temp workgroup_ids[3] = {Temp(0, s1), Temp(0, s1), Temp(0, s1)};
Temp tg_size = Temp(0, s1);
Temp local_invocation_ids = Temp(0, v3);
/* VS output information */
unsigned num_clip_distances;
unsigned num_cull_distances;
vs_output_state vs_output;
/* Streamout */
Temp streamout_buffers = Temp(0, s1);
Temp streamout_write_idx = Temp(0, s1);
Temp streamout_config = Temp(0, s1);
Temp streamout_offset[4] = {Temp(0, s1), Temp(0, s1), Temp(0, s1), Temp(0, s1)};
};
fs_input get_interp_input(nir_intrinsic_op intrin, enum glsl_interp_mode interp)
/* Return the Temp pre-assigned to a declared shader argument.
 * arg must have been declared (arg.used); arg.arg_index selects the slot in
 * ctx->arg_temps — presumably filled in when the startpgm arguments are
 * created, which happens outside this chunk.
 */
Temp get_arg(isel_context *ctx, struct ac_arg arg)
{
   assert(arg.used);
   return ctx->arg_temps[arg.arg_index];
}
unsigned get_interp_input(nir_intrinsic_op intrin, enum glsl_interp_mode interp)
{
switch (interp) {
case INTERP_MODE_SMOOTH:
@ -147,24 +102,24 @@ fs_input get_interp_input(nir_intrinsic_op intrin, enum glsl_interp_mode interp)
if (intrin == nir_intrinsic_load_barycentric_pixel ||
intrin == nir_intrinsic_load_barycentric_at_sample ||
intrin == nir_intrinsic_load_barycentric_at_offset)
return fs_input::persp_center_p1;
return S_0286CC_PERSP_CENTER_ENA(1);
else if (intrin == nir_intrinsic_load_barycentric_centroid)
return fs_input::persp_centroid_p1;
return S_0286CC_PERSP_CENTROID_ENA(1);
else if (intrin == nir_intrinsic_load_barycentric_sample)
return fs_input::persp_sample_p1;
return S_0286CC_PERSP_SAMPLE_ENA(1);
break;
case INTERP_MODE_NOPERSPECTIVE:
if (intrin == nir_intrinsic_load_barycentric_pixel)
return fs_input::linear_center_p1;
return S_0286CC_LINEAR_CENTER_ENA(1);
else if (intrin == nir_intrinsic_load_barycentric_centroid)
return fs_input::linear_centroid_p1;
return S_0286CC_LINEAR_CENTROID_ENA(1);
else if (intrin == nir_intrinsic_load_barycentric_sample)
return fs_input::linear_sample_p1;
return S_0286CC_LINEAR_SAMPLE_ENA(1);
break;
default:
break;
}
return fs_input::max_inputs;
return 0;
}
void init_context(isel_context *ctx, nir_shader *shader)
@ -175,7 +130,8 @@ void init_context(isel_context *ctx, nir_shader *shader)
ctx->divergent_vals = nir_divergence_analysis(shader, nir_divergence_view_index_uniform);
std::unique_ptr<Temp[]> allocated{new Temp[impl->ssa_alloc]()};
memset(&ctx->fs_vgpr_args, false, sizeof(ctx->fs_vgpr_args));
unsigned spi_ps_inputs = 0;
bool done = false;
while (!done) {
@ -457,28 +413,28 @@ void init_context(isel_context *ctx, nir_shader *shader)
case nir_intrinsic_load_barycentric_at_sample:
case nir_intrinsic_load_barycentric_at_offset: {
glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(intrinsic);
ctx->fs_vgpr_args[get_interp_input(intrinsic->intrinsic, mode)] = true;
spi_ps_inputs |= get_interp_input(intrinsic->intrinsic, mode);
break;
}
case nir_intrinsic_load_front_face:
ctx->fs_vgpr_args[fs_input::front_face] = true;
spi_ps_inputs |= S_0286CC_FRONT_FACE_ENA(1);
break;
case nir_intrinsic_load_frag_coord:
case nir_intrinsic_load_sample_pos: {
uint8_t mask = nir_ssa_def_components_read(&intrinsic->dest.ssa);
for (unsigned i = 0; i < 4; i++) {
if (mask & (1 << i))
ctx->fs_vgpr_args[fs_input::frag_pos_0 + i] = true;
spi_ps_inputs |= S_0286CC_POS_X_FLOAT_ENA(1) << i;
}
break;
}
case nir_intrinsic_load_sample_id:
ctx->fs_vgpr_args[fs_input::ancillary] = true;
spi_ps_inputs |= S_0286CC_ANCILLARY_ENA(1);
break;
case nir_intrinsic_load_sample_mask_in:
ctx->fs_vgpr_args[fs_input::ancillary] = true;
ctx->fs_vgpr_args[fs_input::sample_coverage] = true;
spi_ps_inputs |= S_0286CC_ANCILLARY_ENA(1);
spi_ps_inputs |= S_0286CC_SAMPLE_COVERAGE_ENA(1);
break;
default:
break;
@ -555,479 +511,81 @@ void init_context(isel_context *ctx, nir_shader *shader)
}
}
if (G_0286CC_POS_W_FLOAT_ENA(spi_ps_inputs)) {
/* If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be enabled too */
spi_ps_inputs |= S_0286CC_PERSP_CENTER_ENA(1);
}
if (!(spi_ps_inputs & 0x7F)) {
/* At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled */
spi_ps_inputs |= S_0286CC_PERSP_CENTER_ENA(1);
}
ctx->program->config->spi_ps_input_ena = spi_ps_inputs;
ctx->program->config->spi_ps_input_addr = spi_ps_inputs;
for (unsigned i = 0; i < impl->ssa_alloc; i++)
allocated[i] = Temp(ctx->program->allocateId(), allocated[i].regClass());
ctx->allocated.reset(allocated.release());
}
/* Bookkeeping for user-SGPR allocation, filled by allocate_user_sgprs() and
 * consumed while declaring input SGPRs (see declare_global_input_sgprs). */
struct user_sgpr_info {
   uint8_t num_sgpr;                  /* user SGPRs allocated so far */
   uint8_t remaining_sgprs;           /* user SGPRs still available */
   uint8_t user_sgpr_idx;             /* next SGPR index to hand out (advanced by set_loc*) */
   bool need_ring_offsets;            /* scratch/ring offset pair must be passed */
   bool indirect_all_descriptor_sets; /* sets don't fit: pass one indirect pointer instead */
};
/* Try to pass a contiguous range of 32-bit push constants directly in user
 * SGPRs instead of loading them from the push-constant pointer.
 *
 * Updates ctx->program->info (num_inline_push_consts,
 * base_inline_push_consts, possibly clearing loads_push_constants) and the
 * SGPR budget in user_sgpr_info.
 */
static void allocate_inline_push_consts(isel_context *ctx,
                                        user_sgpr_info& user_sgpr_info)
{
   uint8_t remaining_sgprs = user_sgpr_info.remaining_sgprs;

   /* Only supported if shaders use push constants. */
   if (ctx->program->info->min_push_constant_used == UINT8_MAX)
      return;

   /* Only supported if shaders don't have indirect push constants. */
   if (ctx->program->info->has_indirect_push_constants)
      return;

   /* Only supported for 32-bit push constants. */
   //TODO: it's possible that some day, the load/store vectorization could make this inaccurate
   if (!ctx->program->info->has_only_32bit_push_constants)
      return;

   /* Number of dwords in the [min, max) range actually referenced. */
   uint8_t num_push_consts =
      (ctx->program->info->max_push_constant_used -
       ctx->program->info->min_push_constant_used) / 4;

   /* Check if the number of user SGPRs is large enough. */
   if (num_push_consts < remaining_sgprs) {
      ctx->program->info->num_inline_push_consts = num_push_consts;
   } else {
      ctx->program->info->num_inline_push_consts = remaining_sgprs;
   }

   /* Clamp to the maximum number of allowed inlined push constants. */
   if (ctx->program->info->num_inline_push_consts > MAX_INLINE_PUSH_CONSTS)
      ctx->program->info->num_inline_push_consts = MAX_INLINE_PUSH_CONSTS;

   if (ctx->program->info->num_inline_push_consts == num_push_consts &&
       !ctx->program->info->loads_dynamic_offsets) {
      /* Disable the default push constants path if all constants are
       * inlined and if shaders don't use dynamic descriptors.
       */
      ctx->program->info->loads_push_constants = false;
      /* The push-constant pointer SGPR is freed up. */
      user_sgpr_info.num_sgpr--;
      user_sgpr_info.remaining_sgprs++;
   }

   /* First inlined dword, in units of dwords from the push-constant base. */
   ctx->program->info->base_inline_push_consts =
      ctx->program->info->min_push_constant_used / 4;

   user_sgpr_info.num_sgpr += ctx->program->info->num_inline_push_consts;
   user_sgpr_info.remaining_sgprs -= ctx->program->info->num_inline_push_consts;
}
/* Count and budget the user SGPRs for the current shader stage.
 *
 * Fills user_sgpr_info: total user SGPRs required (num_sgpr), how many are
 * left over (remaining_sgprs), whether scratch/ring offsets must be passed,
 * and whether the descriptor sets do not all fit and must be reached through
 * a single indirect pointer. Finally tries to inline push constants into the
 * leftover SGPRs.
 */
static void allocate_user_sgprs(isel_context *ctx,
                                bool needs_view_index, user_sgpr_info& user_sgpr_info)
{
   memset(&user_sgpr_info, 0, sizeof(struct user_sgpr_info));
   uint32_t user_sgpr_count = 0;

   /* until we sort out scratch/global buffers always assign ring offsets for gs/vs/es */
   if (ctx->stage != fragment_fs &&
       ctx->stage != compute_cs
       /*|| ctx->is_gs_copy_shader */)
      user_sgpr_info.need_ring_offsets = true;

   if (ctx->stage == fragment_fs &&
       ctx->program->info->ps.needs_sample_positions)
      user_sgpr_info.need_ring_offsets = true;

   /* 2 user sgprs will nearly always be allocated for scratch/rings */
   user_sgpr_count += 2;

   /* Per-stage fixed arguments (vertex buffers, draw id, grid size, ...). */
   switch (ctx->stage) {
   case vertex_vs:
   /* if (!ctx->is_gs_copy_shader) */ {
      if (ctx->program->info->vs.has_vertex_buffers)
         user_sgpr_count++;
      user_sgpr_count += ctx->program->info->vs.needs_draw_id ? 3 : 2;
   }
      break;
   case fragment_fs:
      //user_sgpr_count += ctx->program->info->ps.needs_sample_positions;
      break;
   case compute_cs:
      if (ctx->program->info->cs.uses_grid_size)
         user_sgpr_count += 3;
      break;
   default:
      unreachable("Shader stage not implemented");
   }

   if (needs_view_index)
      user_sgpr_count++;

   if (ctx->program->info->loads_push_constants)
      user_sgpr_count += 1; /* we use 32bit pointers */

   if (ctx->program->info->so.num_outputs)
      user_sgpr_count += 1; /* we use 32bit pointers */

   /* GFX9+ non-compute stages get 32 user SGPR slots here, otherwise 16. */
   uint32_t available_sgprs = ctx->options->chip_class >= GFX9 && !(ctx->stage & hw_cs) ? 32 : 16;
   uint32_t remaining_sgprs = available_sgprs - user_sgpr_count;
   uint32_t num_desc_set = util_bitcount(ctx->program->info->desc_set_used_mask);

   if (available_sgprs < user_sgpr_count + num_desc_set) {
      /* Not enough room for one SGPR per set: pass one indirect pointer. */
      user_sgpr_info.indirect_all_descriptor_sets = true;
      user_sgpr_info.num_sgpr = user_sgpr_count + 1;
      user_sgpr_info.remaining_sgprs = remaining_sgprs - 1;
   } else {
      user_sgpr_info.num_sgpr = user_sgpr_count + num_desc_set;
      user_sgpr_info.remaining_sgprs = remaining_sgprs - num_desc_set;
   }

   allocate_inline_push_consts(ctx, user_sgpr_info);
}
#define MAX_ARGS 64

/* Accumulates the declared shader arguments before the startpgm instruction
 * is emitted: per-argument register class, fixed physical register, and the
 * destination Temp* to fill in. Populated by add_arg(). */
struct arg_info {
   RegClass types[MAX_ARGS]; /* register class of each argument */
   Temp *assign[MAX_ARGS];   /* where the created Temp should be stored */
   PhysReg reg[MAX_ARGS];    /* fixed physical register of each argument */
   unsigned array_params_mask;
   uint8_t count;            /* number of arguments added so far */
   uint8_t sgpr_count;       /* how many of them are SGPR arguments */
   uint8_t num_sgprs_used;   /* total SGPRs consumed (multi-dword args count fully) */
   uint8_t num_vgprs_used;   /* total VGPRs consumed */
};
/* Append one shader argument to `info`.
 *
 * Records the register class, the destination Temp pointer and the fixed
 * physical register for the new argument, and updates the SGPR/VGPR usage
 * counters. `reg` is the hardware register index; VGPRs are encoded with a
 * 256 offset in PhysReg.
 */
static void
add_arg(arg_info *info, RegClass rc, Temp *param_ptr, unsigned reg)
{
   assert(info->count < MAX_ARGS);

   const unsigned slot = info->count;
   info->types[slot] = rc;
   info->assign[slot] = param_ptr;

   if (rc.type() == RegType::sgpr) {
      info->sgpr_count++;
      info->num_sgprs_used += rc.size();
      info->reg[slot] = PhysReg{reg};
   } else {
      assert(rc.type() == RegType::vgpr);
      info->num_vgprs_used += rc.size();
      info->reg[slot] = PhysReg{reg + 256};
   }

   info->count = slot + 1;
}
/* Record that a piece of user data starts at *sgpr_idx and spans num_sgprs
 * user SGPRs, then advance *sgpr_idx past it. */
static void
set_loc(struct radv_userdata_info *ud_info, uint8_t *sgpr_idx, uint8_t num_sgprs)
{
   const uint8_t base = *sgpr_idx;
   ud_info->sgpr_idx = base;
   ud_info->num_sgprs = num_sgprs;
   *sgpr_idx = base + num_sgprs;
}
static void
set_loc_shader(isel_context *ctx, int idx, uint8_t *sgpr_idx,
               uint8_t num_sgprs)
{
   /* Record the user-SGPR location of shader-data item 'idx'. */
   struct radv_userdata_locations *locs = &ctx->program->info->user_sgprs_locs;
   struct radv_userdata_info *ud_info = &locs->shader_data[idx];
   assert(ud_info);

   set_loc(ud_info, sgpr_idx, num_sgprs);
}
static void
set_loc_shader_ptr(isel_context *ctx, int idx, uint8_t *sgpr_idx)
{
   /* All user-data pointers are 32-bit (one SGPR) except the scratch ring
    * offsets, which are a full 64-bit pointer. */
   const uint8_t num_sgprs = (idx == AC_UD_SCRATCH_RING_OFFSETS) ? 2 : 1;
   set_loc_shader(ctx, idx, sgpr_idx, num_sgprs);
}
static void
set_loc_desc(isel_context *ctx, int idx, uint8_t *sgpr_idx)
{
   /* Descriptor-set pointers are 32-bit, i.e. one SGPR each. Record the
    * location and mark the set as enabled. */
   struct radv_userdata_locations *locs = &ctx->program->info->user_sgprs_locs;

   struct radv_userdata_info *ud_info = &locs->descriptor_sets[idx];
   assert(ud_info);
   set_loc(ud_info, sgpr_idx, 1);

   locs->descriptor_sets_enabled |= 1 << idx;
}
/* Declares the user SGPRs common to all shader stages: descriptor-set
 * pointers, push constants (pointer and/or inlined values) and the
 * streamout buffer pointer. Advances user_sgpr_info->user_sgpr_idx and
 * records each location via set_loc_*. */
static void
declare_global_input_sgprs(isel_context *ctx,
                           /* bool has_previous_stage, gl_shader_stage previous_stage, */
                           user_sgpr_info *user_sgpr_info,
                           struct arg_info *args,
                           Temp *desc_sets)
{
   /* 1 for each descriptor set */
   if (!user_sgpr_info->indirect_all_descriptor_sets) {
      /* Each used descriptor set gets its own 32-bit pointer argument. */
      uint32_t mask = ctx->program->info->desc_set_used_mask;
      while (mask) {
         int i = u_bit_scan(&mask);
         add_arg(args, s1, &desc_sets[i], user_sgpr_info->user_sgpr_idx);
         set_loc_desc(ctx, i, &user_sgpr_info->user_sgpr_idx);
      }
      /* NIR->LLVM might have set this to true if RADV_DEBUG=compiletime */
      ctx->program->info->need_indirect_descriptor_sets = false;
   } else {
      /* Not enough SGPRs for all sets: pass a single pointer through which
       * the individual descriptor sets are loaded indirectly. */
      add_arg(args, s1, desc_sets, user_sgpr_info->user_sgpr_idx);
      set_loc_shader_ptr(ctx, AC_UD_INDIRECT_DESCRIPTOR_SETS, &user_sgpr_info->user_sgpr_idx);
      ctx->program->info->need_indirect_descriptor_sets = true;
   }

   if (ctx->program->info->loads_push_constants) {
      /* 1 for push constants and dynamic descriptors */
      add_arg(args, s1, &ctx->push_constants, user_sgpr_info->user_sgpr_idx);
      set_loc_shader_ptr(ctx, AC_UD_PUSH_CONSTANTS, &user_sgpr_info->user_sgpr_idx);
   }

   if (ctx->program->info->num_inline_push_consts) {
      /* Small push-constant ranges are passed directly in SGPRs instead of
       * being loaded through the push-constant pointer. */
      unsigned count = ctx->program->info->num_inline_push_consts;
      for (unsigned i = 0; i < count; i++)
         add_arg(args, s1, &ctx->inline_push_consts[i], user_sgpr_info->user_sgpr_idx + i);
      set_loc_shader(ctx, AC_UD_INLINE_PUSH_CONSTANTS, &user_sgpr_info->user_sgpr_idx, count);

      ctx->num_inline_push_consts = ctx->program->info->num_inline_push_consts;
      ctx->base_inline_push_consts = ctx->program->info->base_inline_push_consts;
   }

   if (ctx->program->info->so.num_outputs) {
      /* Pointer to the streamout buffer descriptors. */
      add_arg(args, s1, &ctx->streamout_buffers, user_sgpr_info->user_sgpr_idx);
      set_loc_shader_ptr(ctx, AC_UD_STREAMOUT_BUFFERS, &user_sgpr_info->user_sgpr_idx);
   }
}
/* Declares the fixed-function input VGPRs of a hardware vertex shader.
 * The order of the add_arg() calls IS the hardware VGPR layout — it differs
 * between GFX10+ and older generations, and (pre-GFX10) between VS-as-LS
 * and plain VS. Do not reorder. */
static void
declare_vs_input_vgprs(isel_context *ctx, struct arg_info *args)
{
   unsigned vgpr_idx = 0;
   add_arg(args, v1, &ctx->vertex_id, vgpr_idx++);
   if (ctx->options->chip_class >= GFX10) {
      add_arg(args, v1, NULL, vgpr_idx++); /* unused */
      add_arg(args, v1, &ctx->vs_prim_id, vgpr_idx++);
      add_arg(args, v1, &ctx->instance_id, vgpr_idx++);
   } else {
      if (ctx->options->key.vs.out.as_ls) {
         /* VS-as-LS (merged with tessellation) swaps in rel_auto_id. */
         add_arg(args, v1, &ctx->rel_auto_id, vgpr_idx++);
         add_arg(args, v1, &ctx->instance_id, vgpr_idx++);
      } else {
         add_arg(args, v1, &ctx->instance_id, vgpr_idx++);
         add_arg(args, v1, &ctx->vs_prim_id, vgpr_idx++);
      }
      add_arg(args, v1, NULL, vgpr_idx); /* unused */
   }
}
/* Declares the SGPRs used for streamout (transform feedback): the config
 * word, the write index, and one buffer offset per buffer with a non-zero
 * stride. Advances *idx for each SGPR added. */
static void
declare_streamout_sgprs(isel_context *ctx, struct arg_info *args, unsigned *idx)
{
   /* Streamout SGPRs. */
   if (ctx->program->info->so.num_outputs) {
      assert(ctx->stage & hw_vs);

      if (ctx->stage != tess_eval_vs) {
         add_arg(args, s1, &ctx->streamout_config, (*idx)++);
      } else {
         /* For TES the config reuses the argument slot added last by the
          * caller instead of declaring a new SGPR: only its destination and
          * type are retargeted. NOTE(review): relies on the caller having
          * just added that argument — confirm at the call site. */
         args->assign[args->count - 1] = &ctx->streamout_config;
         args->types[args->count - 1] = s1;
      }

      add_arg(args, s1, &ctx->streamout_write_idx, (*idx)++);
   }

   /* A streamout buffer offset is loaded if the stride is non-zero. */
   for (unsigned i = 0; i < 4; i++) {
      if (!ctx->program->info->so.strides[i])
         continue;

      add_arg(args, s1, &ctx->streamout_offset[i], (*idx)++);
   }
}
/* Returns whether this stage needs the multiview view index passed in an
 * SGPR. vertex_vs and tess_eval_vs combine the shader-info flag with the
 * pipeline-key flag differently; the remaining HW stages use the
 * shader-info flag alone. */
static bool needs_view_index_sgpr(isel_context *ctx)
{
   const bool info_needs = ctx->program->info->needs_multiview_view_index;
   const bool key_has = ctx->options->key.has_multiview_view_index;

   switch (ctx->stage) {
   case vertex_vs:
      return info_needs || key_has;
   case tess_eval_vs:
      return info_needs && key_has;
   case vertex_ls:
   case vertex_es:
   case vertex_tess_control_hs:
   case vertex_geometry_gs:
   case tess_control_hs:
   case tess_eval_es:
   case tess_eval_geometry_gs:
   case geometry_gs:
      return info_needs;
   default:
      return false;
   }
}
/* Conditionally declares a fragment-shader input VGPR argument. Returns
 * false if the input is unused; otherwise adds one VGPR argument of class
 * 'rc' (two consecutive ones when enable_next is set, e.g. for the i/j
 * barycentric pair) and sets 'value' in SPI_PS_INPUT_ADDR/ENA. */
static inline bool
add_fs_arg(isel_context *ctx, arg_info *args, unsigned &vgpr_idx, fs_input input, unsigned value, bool enable_next = false, RegClass rc = v1)
{
   if (!ctx->fs_vgpr_args[input])
      return false;

   const unsigned slots = enable_next ? 2 : 1;
   for (unsigned i = 0; i < slots; i++) {
      add_arg(args, rc, &ctx->fs_inputs[input + i], vgpr_idx);
      vgpr_idx += rc.size();
   }

   ctx->program->config->spi_ps_input_addr |= value;
   ctx->program->config->spi_ps_input_ena |= value;

   return true;
}
/* Emits the p_startpgm pseudo instruction whose definitions materialize the
 * shader arguments declared in ctx->args, each pinned to its fixed physical
 * register, plus the live-in exec mask. The created Temps are stashed in
 * ctx->arg_temps so get_arg() can retrieve them later. Returns the emitted
 * instruction.
 *
 * Fix: the previous body was an unresolved merge of the old arg_info-based
 * argument setup and the common ac_shader_args handling — 'startpgm' was
 * declared twice and the loops were interleaved, so it could not compile.
 * Resolved to the common-argument-handling version; the per-stage argument
 * declaration now lives in radv_declare_shader_args(). */
Pseudo_instruction *add_startpgm(struct isel_context *ctx)
{
   unsigned arg_count = ctx->args->ac.arg_count;

   if (ctx->stage == fragment_fs) {
      /* LLVM optimizes away unused FS inputs and computes spi_ps_input_addr
       * itself and then communicates the results back via the ELF binary.
       * Mirror what LLVM does by re-mapping the VGPR arguments here.
       *
       * TODO: If we made the FS input scanning code into a separate pass that
       * could run before argument setup, then this wouldn't be necessary
       * anymore.
       */
      struct ac_shader_args *args = &ctx->args->ac;
      arg_count = 0;
      for (unsigned i = 0, vgpr_arg = 0, vgpr_reg = 0; i < args->arg_count; i++) {
         if (args->args[i].file != AC_ARG_VGPR) {
            arg_count++;
            continue;
         }

         if (!(ctx->program->config->spi_ps_input_addr & (1 << vgpr_arg))) {
            /* The input VGPR is unused: skip it so later VGPR arguments
             * shift down to fill the hole. */
            args->args[i].skip = true;
         } else {
            args->args[i].offset = vgpr_reg;
            vgpr_reg += args->args[i].size;
            arg_count++;
         }
         vgpr_arg++;
      }
   }

   aco_ptr<Pseudo_instruction> startpgm{create_instruction<Pseudo_instruction>(aco_opcode::p_startpgm, Format::PSEUDO, 0, arg_count + 1)};
   for (unsigned i = 0, arg = 0; i < ctx->args->ac.arg_count; i++) {
      if (ctx->args->ac.args[i].skip)
         continue;

      enum ac_arg_regfile file = ctx->args->ac.args[i].file;
      unsigned size = ctx->args->ac.args[i].size;
      unsigned reg = ctx->args->ac.args[i].offset;
      RegClass type = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size);
      Temp dst = Temp{ctx->program->allocateId(), type};
      ctx->arg_temps[i] = dst;
      startpgm->definitions[arg] = Definition(dst);
      /* VGPRs are encoded with a +256 offset in the physical register space. */
      startpgm->definitions[arg].setFixed(PhysReg{file == AC_ARG_SGPR ? reg : reg + 256});
      arg++;
   }
   /* The final definition is the exec mask, which is live-in on entry. */
   startpgm->definitions[arg_count] = Definition{ctx->program->allocateId(), exec, s2};

   Pseudo_instruction *instr = startpgm.get();
   ctx->block->instructions.push_back(std::move(startpgm));

   /* Stash these in the program so that they can be accessed later when
    * handling spilling.
    */
   ctx->program->private_segment_buffer = get_arg(ctx, ctx->args->ring_offsets);
   ctx->program->scratch_offset = get_arg(ctx, ctx->args->scratch_offset);

   return instr;
}
@ -1168,8 +726,7 @@ setup_isel_context(Program* program,
unsigned shader_count,
struct nir_shader *const *shaders,
ac_shader_config* config,
radv_shader_info *info,
const radv_nir_compiler_options *options)
struct radv_shader_args *args)
{
program->stage = 0;
for (unsigned i = 0; i < shader_count; i++) {
@ -1206,23 +763,23 @@ setup_isel_context(Program* program,
unreachable("Shader stage not implemented");
program->config = config;
program->info = info;
program->chip_class = options->chip_class;
program->family = options->family;
program->wave_size = info->wave_size;
program->info = args->shader_info;
program->chip_class = args->options->chip_class;
program->family = args->options->family;
program->wave_size = args->shader_info->wave_size;
program->lds_alloc_granule = options->chip_class >= GFX7 ? 512 : 256;
program->lds_limit = options->chip_class >= GFX7 ? 65536 : 32768;
program->lds_alloc_granule = args->options->chip_class >= GFX7 ? 512 : 256;
program->lds_limit = args->options->chip_class >= GFX7 ? 65536 : 32768;
program->vgpr_limit = 256;
if (options->chip_class >= GFX10) {
if (args->options->chip_class >= GFX10) {
program->physical_sgprs = 2560; /* doesn't matter as long as it's at least 128 * 20 */
program->sgpr_alloc_granule = 127;
program->sgpr_limit = 106;
} else if (program->chip_class >= GFX8) {
program->physical_sgprs = 800;
program->sgpr_alloc_granule = 15;
if (options->family == CHIP_TONGA || options->family == CHIP_ICELAND)
if (args->options->family == CHIP_TONGA || args->options->family == CHIP_ICELAND)
program->sgpr_limit = 94; /* workaround hardware bug */
else
program->sgpr_limit = 102;
@ -1234,28 +791,12 @@ setup_isel_context(Program* program,
/* TODO: we don't have to allocate VCC if we don't need it */
program->needs_vcc = true;
for (unsigned i = 0; i < MAX_SETS; ++i)
program->info->user_sgprs_locs.descriptor_sets[i].sgpr_idx = -1;
for (unsigned i = 0; i < AC_UD_MAX_UD; ++i)
program->info->user_sgprs_locs.shader_data[i].sgpr_idx = -1;
isel_context ctx = {};
ctx.program = program;
ctx.options = options;
ctx.args = args;
ctx.options = args->options;
ctx.stage = program->stage;
for (unsigned i = 0; i < fs_input::max_inputs; ++i)
ctx.fs_inputs[i] = Temp(0, v1);
ctx.fs_inputs[fs_input::persp_pull_model] = Temp(0, v3);
for (unsigned i = 0; i < MAX_SETS; ++i)
ctx.descriptor_sets[i] = Temp(0, s1);
for (unsigned i = 0; i < MAX_INLINE_PUSH_CONSTS; ++i)
ctx.inline_push_consts[i] = Temp(0, s1);
for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) {
for (unsigned j = 0; j < 4; ++j)
ctx.vs_output.outputs[i][j] = Temp(0, v1);
}
for (unsigned i = 0; i < shader_count; i++) {
nir_shader *nir = shaders[i];
@ -1339,7 +880,7 @@ setup_isel_context(Program* program,
nir_function_impl *func = nir_shader_get_entrypoint(nir);
nir_index_ssa_defs(func);
if (options->dump_preoptir) {
if (args->options->dump_preoptir) {
fprintf(stderr, "NIR shader before instruction selection:\n");
nir_print_shader(nir, stderr);
}

View File

@ -65,8 +65,7 @@ void aco_compile_shader(unsigned shader_count,
std::unique_ptr<aco::Program> program{new aco::Program};
/* Instruction Selection */
aco::select_program(program.get(), shader_count, shaders, &config,
args->shader_info, args->options);
aco::select_program(program.get(), shader_count, shaders, &config, args);
if (args->options->dump_preoptir) {
std::cerr << "After Instruction Selection:\n";
aco_print_program(program.get(), stderr);

View File

@ -37,6 +37,7 @@
#include "aco_util.h"
struct radv_nir_compiler_options;
struct radv_shader_args;
struct radv_shader_info;
namespace aco {
@ -1208,8 +1209,7 @@ void select_program(Program *program,
unsigned shader_count,
struct nir_shader *const *shaders,
ac_shader_config* config,
struct radv_shader_info *info,
const struct radv_nir_compiler_options *options);
struct radv_shader_args *args);
void lower_wqm(Program* program, live& live_vars,
const struct radv_nir_compiler_options *options);

View File

@ -695,9 +695,7 @@ radv_declare_shader_args(struct radv_shader_args *args,
args->shader_info->num_input_vgprs = 0;
args->shader_info->num_input_sgprs = 2;
args->shader_info->num_input_sgprs += args->ac.num_sgprs_used;
if (stage != MESA_SHADER_FRAGMENT)
args->shader_info->num_input_vgprs = args->ac.num_vgprs_used;
args->shader_info->num_input_vgprs = args->ac.num_vgprs_used;
uint8_t user_sgpr_idx = 0;