diff --git a/src/gallium/drivers/r600/meson.build b/src/gallium/drivers/r600/meson.build
index 880dad590cb..c5c8a99e1e0 100644
--- a/src/gallium/drivers/r600/meson.build
+++ b/src/gallium/drivers/r600/meson.build
@@ -149,6 +149,7 @@ files_r600 = files(
   'sfn/sfn_liverange.h',
   'sfn/sfn_nir.cpp',
   'sfn/sfn_nir.h',
+  'sfn/sfn_nir_lower_64bit.cpp',
   'sfn/sfn_nir_lower_fs_out_to_vector.cpp',
   'sfn/sfn_nir_lower_fs_out_to_vector.h',
   'sfn/sfn_nir_lower_tess_io.cpp',
diff --git a/src/gallium/drivers/r600/sfn/sfn_nir.h b/src/gallium/drivers/r600/sfn/sfn_nir.h
index f3209109319..2740f425bd5 100644
--- a/src/gallium/drivers/r600/sfn/sfn_nir.h
+++ b/src/gallium/drivers/r600/sfn/sfn_nir.h
@@ -60,6 +60,12 @@ bool r600_lower_scratch_addresses(nir_shader *shader);
 
 bool r600_lower_ubo_to_align16(nir_shader *shader);
 
+bool r600_nir_split_64bit_io(nir_shader *sh);
+
+bool r600_nir_64_to_vec2(nir_shader *sh);
+
+bool r600_merge_vec2_stores(nir_shader *shader);
+
 class Shader {
 public:
    std::vector& m_ir;
diff --git a/src/gallium/drivers/r600/sfn/sfn_nir_lower_64bit.cpp b/src/gallium/drivers/r600/sfn/sfn_nir_lower_64bit.cpp
new file mode 100644
index 00000000000..14caedd7195
--- /dev/null
+++ b/src/gallium/drivers/r600/sfn/sfn_nir_lower_64bit.cpp
@@ -0,0 +1,1064 @@
+/* -*- mesa-c++ -*-
+ *
+ * Copyright (c) 2020 Collabora LTD
+ *
+ * Author: Gert Wollny
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "sfn_nir.h"
+
+#include "nir.h"
+#include "nir_builder.h"
+
+#include <map>
+#include <vector>
+#include <iostream>
+
+namespace r600 {
+
+using std::map;
+using std::pair;
+using std::make_pair;
+using std::vector;
+
+class LowerSplit64BitVar : public NirLowerInstruction {
+public:
+
+   ~LowerSplit64BitVar();
+   using VarSplit = pair<nir_variable*, nir_variable*>;
+   using VarMap = map<unsigned, VarSplit>;
+
+   nir_ssa_def *
+   split_double_load_deref(nir_intrinsic_instr *intr);
+
+   nir_ssa_def *
+   split_double_store_deref(nir_intrinsic_instr *intr);
+
+private:
+   nir_ssa_def *
+   split_load_deref_array(nir_intrinsic_instr *intr, nir_src& index);
+
+   nir_ssa_def *
+   split_load_deref_var(nir_intrinsic_instr *intr);
+
+   nir_ssa_def *
+   split_store_deref_array(nir_intrinsic_instr *intr, nir_deref_instr *deref);
+
+   nir_ssa_def *
+   split_store_deref_var(nir_intrinsic_instr *intr, nir_deref_instr *deref1);
+
+   VarSplit get_var_pair(nir_variable *old_var);
+
+   nir_ssa_def *
+   merge_64bit_loads(nir_ssa_def *load1, nir_ssa_def *load2, bool out_is_vec3);
+
+   nir_ssa_def *split_double_load(nir_intrinsic_instr *load1);
+
+   nir_ssa_def *
+   split_store_output(nir_intrinsic_instr *store1);
+
+   nir_ssa_def *split_double_load_uniform(nir_intrinsic_instr *intr);
+
+   nir_ssa_def *
+   split_double_load_ssbo(nir_intrinsic_instr *intr);
+
+   nir_ssa_def *
+   split_double_load_ubo(nir_intrinsic_instr *intr);
+
+   nir_ssa_def *
+   split_reduction(nir_ssa_def *src[2][2], nir_op op1, nir_op op2, nir_op reduction);
+
+   nir_ssa_def *
+   split_reduction3(nir_alu_instr *alu,
+                    nir_op op1, nir_op op2, nir_op reduction);
+
+   nir_ssa_def *
+   split_reduction4(nir_alu_instr *alu,
+                    nir_op op1, nir_op op2, nir_op reduction);
+
+   nir_ssa_def *split_bcsel(nir_alu_instr *alu);
+
+   nir_ssa_def *split_load_const(nir_load_const_instr *lc);
+
+   bool filter(const nir_instr *instr) const override;
+   nir_ssa_def *lower(nir_instr *instr) override;
+
+   VarMap m_varmap;
+   vector<nir_variable*> m_old_vars;
+   vector<nir_instr*> m_old_stores;
+};
+
+
+bool
+LowerSplit64BitVar::filter(const nir_instr *instr) const
+{
+   switch (instr->type) {
+   case nir_instr_type_intrinsic: {
+      auto intr = nir_instr_as_intrinsic(instr);
+
+      switch (intr->intrinsic) {
+      case nir_intrinsic_load_deref:
+      case nir_intrinsic_load_uniform:
+      case nir_intrinsic_load_input:
+      case nir_intrinsic_load_ubo:
+      case nir_intrinsic_load_ssbo:
+         if (nir_dest_bit_size(intr->dest) != 64)
+            return false;
+         return nir_dest_num_components(intr->dest) >= 3;
+      case nir_intrinsic_store_output:
+         if (nir_src_bit_size(intr->src[0]) != 64)
+            return false;
+         return nir_src_num_components(intr->src[0]) >= 3;
+      case nir_intrinsic_store_deref:
+         if (nir_src_bit_size(intr->src[1]) != 64)
+            return false;
+         return nir_src_num_components(intr->src[1]) >= 3;
+      default:
+         return false;
+      }
+   }
+   case nir_instr_type_alu: {
+      auto alu = nir_instr_as_alu(instr);
+      switch (alu->op) {
+      case nir_op_bcsel:
+         if (nir_dest_num_components(alu->dest.dest) < 3)
+            return false;
+         return nir_dest_bit_size(alu->dest.dest) == 64;
+      case nir_op_bany_fnequal3:
+      case nir_op_bany_fnequal4:
+      case nir_op_ball_fequal3:
+      case nir_op_ball_fequal4:
+      case nir_op_bany_inequal3:
+      case nir_op_bany_inequal4:
+      case nir_op_ball_iequal3:
+      case nir_op_ball_iequal4:
+      case nir_op_fdot3:
+      case nir_op_fdot4:
+         return nir_src_bit_size(alu->src[1].src) == 64;
+      default:
+         return false;
+      }
+   }
+   case nir_instr_type_load_const: {
+      auto lc = nir_instr_as_load_const(instr);
+      if (lc->def.bit_size != 64)
+         return false;
+      return lc->def.num_components >= 3;
+   }
+   default:
+      return false;
+   }
+}
+
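+/* Recombine the lower (xy) and upper (z or zw) halves that were loaded
+ * separately into a single vec3 or vec4 value. */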
+nir_ssa_def *
+LowerSplit64BitVar::merge_64bit_loads(nir_ssa_def *load1,
+                                      nir_ssa_def *load2, bool out_is_vec3)
+{
+   if (out_is_vec3)
+      return nir_vec3(b, nir_channel(b, load1, 0),
+                      nir_channel(b, load1, 1),
+                      nir_channel(b, load2, 0));
+   else
+      return nir_vec4(b, nir_channel(b, load1, 0),
+                      nir_channel(b, load1, 1),
+                      nir_channel(b, load2, 0),
+                      nir_channel(b, load2, 1));
+}
+
+LowerSplit64BitVar::~LowerSplit64BitVar()
+{
+   for(auto&& v: m_old_vars)
+      exec_node_remove(&v->node);
+
+   for(auto&& v: m_old_stores)
+      nir_instr_remove(v);
+}
+
+nir_ssa_def *
+LowerSplit64BitVar::split_double_store_deref(nir_intrinsic_instr *intr)
+{
+   auto deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr);
+   if (deref->deref_type == nir_deref_type_var)
+      return split_store_deref_var(intr, deref);
+   else if (deref->deref_type == nir_deref_type_array)
+      return split_store_deref_array(intr, deref);
+   else {
+      assert(0 && "only splitting of stores to vars and arrays is supported");
+   }
+}
+
+nir_ssa_def *
+LowerSplit64BitVar::split_double_load_deref(nir_intrinsic_instr *intr)
+{
+   auto deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr);
+   if (deref->deref_type == nir_deref_type_var)
+      return split_load_deref_var(intr);
+   else if (deref->deref_type == nir_deref_type_array)
+      return split_load_deref_array(intr, deref->arr.index);
+   else {
+      assert(0 && "only splitting of loads from vars and arrays is supported");
+   }
+   m_old_stores.push_back(&intr->instr);
+}
+
+nir_ssa_def *
+LowerSplit64BitVar::split_load_deref_array(nir_intrinsic_instr *intr, nir_src& index)
+{
+   auto old_var = nir_intrinsic_get_var(intr, 0);
+   unsigned old_components = old_var->type->without_array()->components();
+
+   assert(old_components > 2 && old_components <= 4);
+
+   auto vars = get_var_pair(old_var);
+
+   auto deref1 = nir_build_deref_var(b, vars.first);
+   auto deref_array1 = nir_build_deref_array(b, deref1, nir_ssa_for_src(b, index, 1));
+   auto load1 = nir_build_load_deref(b, 2, 64, &deref_array1->dest.ssa, (enum gl_access_qualifier)0);
+
+   auto deref2 = nir_build_deref_var(b, vars.second);
+   auto deref_array2 = nir_build_deref_array(b, deref2, nir_ssa_for_src(b, index, 1));
+
+   auto load2 = nir_build_load_deref(b, old_components - 2, 64, &deref_array2->dest.ssa, (enum gl_access_qualifier)0);
+
+   return merge_64bit_loads(load1, load2, old_components == 3);
+}
+
+nir_ssa_def *
+LowerSplit64BitVar::split_store_deref_array(nir_intrinsic_instr *intr, nir_deref_instr *deref)
+{
+   auto old_var = nir_intrinsic_get_var(intr, 0);
+   unsigned old_components = old_var->type->without_array()->components();
+
+   assert(old_components > 2 && old_components <= 4);
+
+   auto src_xy = nir_channels(b, intr->src[1].ssa, 3);
+
+   auto vars = get_var_pair(old_var);
+
+   auto deref1 = nir_build_deref_var(b, vars.first);
+   auto deref_array1 = nir_build_deref_array(b, deref1, nir_ssa_for_src(b, deref->arr.index, 1));
+
+   nir_build_store_deref(b, &deref_array1->dest.ssa, src_xy, 3);
+
+   auto deref2 = nir_build_deref_var(b, vars.second);
+   auto deref_array2 = nir_build_deref_array(b, deref2, nir_ssa_for_src(b, deref->arr.index, 1));
+
+   if (old_components == 3)
+      nir_build_store_deref(b, &deref_array2->dest.ssa, nir_channel(b, intr->src[1].ssa, 2), 1);
+   else
+      nir_build_store_deref(b, &deref_array2->dest.ssa, nir_channels(b, intr->src[1].ssa, 0xc), 3);
+
+   return progress_replace;
+}
+
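+/* Write the xy components to the first replacement variable and the
+ * remaining component(s) to the second one. */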
+nir_ssa_def *
+LowerSplit64BitVar::split_store_deref_var(nir_intrinsic_instr *intr, nir_deref_instr *deref)
+{
+   auto old_var = nir_intrinsic_get_var(intr, 0);
+   unsigned old_components = old_var->type->without_array()->components();
+
+   assert(old_components > 2 && old_components <= 4);
+
+   auto src_xy = nir_channels(b, intr->src[1].ssa, 3);
+
+   auto vars = get_var_pair(old_var);
+
+   auto deref1 = nir_build_deref_var(b, vars.first);
+   nir_build_store_deref(b, &deref1->dest.ssa, src_xy, 3);
+
+   auto deref2 = nir_build_deref_var(b, vars.second);
+   if (old_components == 3)
+      nir_build_store_deref(b, &deref2->dest.ssa, nir_channel(b, intr->src[1].ssa, 2), 1);
+   else
+      nir_build_store_deref(b, &deref2->dest.ssa, nir_channels(b, intr->src[1].ssa, 0xc), 3);
+
+   return progress_replace;
+}
+
+nir_ssa_def *
+LowerSplit64BitVar::split_load_deref_var(nir_intrinsic_instr *intr)
+{
+   auto old_var = nir_intrinsic_get_var(intr, 0);
+   auto vars = get_var_pair(old_var);
+   unsigned old_components = old_var->type->components();
+
+   nir_deref_instr *deref1 = nir_build_deref_var(b, vars.first);
+   auto *load1 = nir_load_deref(b, deref1);
+
+   nir_deref_instr *deref2 = nir_build_deref_var(b, vars.second);
+   deref2->type = vars.second->type;
+
+   auto *load2 = nir_load_deref(b, deref2);
+
+   return merge_64bit_loads(load1, load2, old_components == 3);
+}
+
+LowerSplit64BitVar::VarSplit
+LowerSplit64BitVar::get_var_pair(nir_variable *old_var)
+{
+   auto split_vars = m_varmap.find(old_var->data.driver_location);
+
+   assert(old_var->type->without_array()->components() > 2);
+
+   if (split_vars == m_varmap.end()) {
+      auto var1 = nir_variable_clone(old_var, b->shader);
+      auto var2 = nir_variable_clone(old_var, b->shader);
+
+      var1->type = glsl_dvec_type(2);
+      var2->type = glsl_dvec_type(old_var->type->without_array()->components() - 2);
+
+      if (old_var->type->is_array()) {
+         var1->type = glsl_array_type(var1->type, old_var->type->array_size(), 0);
+         var2->type = glsl_array_type(var2->type, old_var->type->array_size(), 0);
+      }
+
+      if (old_var->data.mode == nir_var_shader_in ||
+          old_var->data.mode == nir_var_shader_out) {
+         ++var2->data.driver_location;
+         ++var2->data.location;
+         nir_shader_add_variable(b->shader, var1);
+         nir_shader_add_variable(b->shader, var2);
+      } else if (old_var->data.mode == nir_var_function_temp) {
+         exec_list_push_tail(&b->impl->locals, &var1->node);
+         exec_list_push_tail(&b->impl->locals, &var2->node);
+      }
+
+      m_varmap[old_var->data.driver_location] = make_pair(var1, var2);
+   }
+   return m_varmap[old_var->data.driver_location];
+}
+
+
+nir_ssa_def *
+LowerSplit64BitVar::split_double_load(nir_intrinsic_instr *load1)
+{
+   unsigned old_components = nir_dest_num_components(load1->dest);
+   auto load2 = nir_instr_as_intrinsic(nir_instr_clone(b->shader, &load1->instr));
+   nir_io_semantics sem = nir_intrinsic_io_semantics(load1);
+
+   load1->dest.ssa.num_components = 2;
+   sem.num_slots = 1;
+   nir_intrinsic_set_io_semantics(load1, sem);
+
+   load2->dest.ssa.num_components = old_components - 2;
+   sem.location += 1;
+   nir_intrinsic_set_io_semantics(load2, sem);
+   nir_intrinsic_set_base(load2, nir_intrinsic_base(load1) + 1);
+   nir_builder_instr_insert(b, &load2->instr);
+
+   return merge_64bit_loads(&load1->dest.ssa, &load2->dest.ssa, old_components == 3);
+}
+
+
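+/* Split a wide 64-bit output store into two stores of at most two
+ * components, the second one targeting the next IO location. */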
+nir_ssa_def *
+LowerSplit64BitVar::split_store_output(nir_intrinsic_instr *store1)
+{
+   auto src = store1->src[0];
+   unsigned old_components = nir_src_num_components(src);
+   nir_io_semantics sem = nir_intrinsic_io_semantics(store1);
+
+   auto store2 = nir_instr_as_intrinsic(nir_instr_clone(b->shader, &store1->instr));
+   auto src1 = nir_channels(b, src.ssa, 3);
+   auto src2 = nir_channels(b, src.ssa, old_components == 3 ? 4 : 0xc);
+
+   nir_instr_rewrite_src(&store1->instr, &src, nir_src_for_ssa(src1));
+   nir_intrinsic_set_write_mask(store1, 3);
+
+   nir_instr_rewrite_src(&store2->instr, &src, nir_src_for_ssa(src2));
+   nir_intrinsic_set_write_mask(store2, old_components == 3 ? 1 : 3);
+
+   sem.num_slots = 1;
+   nir_intrinsic_set_io_semantics(store1, sem);
+
+   sem.location += 1;
+   nir_intrinsic_set_io_semantics(store2, sem);
+   nir_intrinsic_set_base(store2, nir_intrinsic_base(store1));
+
+   nir_builder_instr_insert(b, &store2->instr);
+   return progress_keep;
+}
+
+
+nir_ssa_def *
+LowerSplit64BitVar::split_double_load_uniform(nir_intrinsic_instr *intr)
+{
+   unsigned second_components = nir_dest_num_components(intr->dest) - 2;
+   nir_intrinsic_instr *load2 = nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_uniform);
+   load2->src[0] = nir_src_for_ssa(nir_iadd_imm(b, intr->src[0].ssa, 1));
+   nir_intrinsic_set_dest_type(load2, nir_intrinsic_dest_type(intr));
+   nir_intrinsic_set_base(load2, nir_intrinsic_base(intr));
+   nir_intrinsic_set_range(load2, nir_intrinsic_range(intr));
+   load2->num_components = second_components;
+
+   nir_ssa_dest_init(&load2->instr, &load2->dest, second_components, 64, nullptr);
+   nir_builder_instr_insert(b, &load2->instr);
+
+   intr->dest.ssa.num_components = intr->num_components = 2;
+
+   if (second_components == 1)
+      return nir_vec3(b, nir_channel(b, &intr->dest.ssa, 0),
+                      nir_channel(b, &intr->dest.ssa, 1),
+                      nir_channel(b, &load2->dest.ssa, 0));
+   else
+      return nir_vec4(b, nir_channel(b, &intr->dest.ssa, 0),
+                      nir_channel(b, &intr->dest.ssa, 1),
+                      nir_channel(b, &load2->dest.ssa, 0),
+                      nir_channel(b, &load2->dest.ssa, 1));
+}
+
+nir_ssa_def *
+LowerSplit64BitVar::split_double_load_ssbo(nir_intrinsic_instr *intr)
+{
+   unsigned second_components = nir_dest_num_components(intr->dest) - 2;
+   nir_intrinsic_instr *load2 = nir_instr_as_intrinsic(nir_instr_clone(b->shader, &intr->instr));
+
+   auto new_src0 = nir_src_for_ssa(nir_iadd_imm(b, intr->src[0].ssa, 1));
+   nir_instr_rewrite_src(&load2->instr, &load2->src[0], new_src0);
+   load2->num_components = second_components;
+   nir_ssa_dest_init(&load2->instr, &load2->dest, second_components, 64, nullptr);
+
+   nir_intrinsic_set_dest_type(load2, nir_intrinsic_dest_type(intr));
+   nir_builder_instr_insert(b, &load2->instr);
+
+   intr->dest.ssa.num_components = intr->num_components = 2;
+
+   return merge_64bit_loads(&intr->dest.ssa, &load2->dest.ssa, second_components == 1);
+}
+
+
+nir_ssa_def *
+LowerSplit64BitVar::split_double_load_ubo(nir_intrinsic_instr *intr)
+{
+   unsigned second_components = nir_dest_num_components(intr->dest) - 2;
+   nir_intrinsic_instr *load2 = nir_instr_as_intrinsic(nir_instr_clone(b->shader, &intr->instr));
+   load2->src[0] = intr->src[0];
+   load2->src[1] = nir_src_for_ssa(nir_iadd_imm(b, intr->src[1].ssa, 16));
+   nir_intrinsic_set_range_base(load2, nir_intrinsic_range_base(intr) + 16);
+   nir_intrinsic_set_range(load2, nir_intrinsic_range(intr));
+   nir_intrinsic_set_access(load2, nir_intrinsic_access(intr));
+   nir_intrinsic_set_align_mul(load2, nir_intrinsic_align_mul(intr));
+   nir_intrinsic_set_align_offset(load2, nir_intrinsic_align_offset(intr) + 16);
+
+   load2->num_components = second_components;
+
+   nir_ssa_dest_init(&load2->instr, &load2->dest, second_components, 64, nullptr);
+   nir_builder_instr_insert(b, &load2->instr);
+
+   intr->dest.ssa.num_components = intr->num_components = 2;
+
+   return merge_64bit_loads(&intr->dest.ssa, &load2->dest.ssa, second_components == 1);
+}
+
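+/* Evaluate a three- or four-component reduction (dot products and
+ * vector compares) as two two-component operations that are then
+ * combined with the reduction opcode. */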
+nir_ssa_def *
+LowerSplit64BitVar::split_reduction(nir_ssa_def *src[2][2], nir_op op1, nir_op op2, nir_op reduction)
+{
+   auto cmp0 = nir_build_alu(b, op1, src[0][0], src[0][1], nullptr, nullptr);
+   auto cmp1 = nir_build_alu(b, op2, src[1][0], src[1][1], nullptr, nullptr);
+   return nir_build_alu(b, reduction, cmp0, cmp1, nullptr, nullptr);
+}
+
+nir_ssa_def *
+LowerSplit64BitVar::split_reduction3(nir_alu_instr *alu,
+                                     nir_op op1, nir_op op2, nir_op reduction)
+{
+   nir_ssa_def *src[2][2];
+
+   src[0][0] = nir_channels(b, nir_ssa_for_src(b, alu->src[0].src, 2), 3);
+   src[0][1] = nir_channels(b, nir_ssa_for_src(b, alu->src[1].src, 2), 3);
+
+   src[1][0] = nir_channel(b, nir_ssa_for_src(b, alu->src[0].src, 3), 2);
+   src[1][1] = nir_channel(b, nir_ssa_for_src(b, alu->src[1].src, 3), 2);
+
+   return split_reduction(src, op1, op2, reduction);
+}
+
+nir_ssa_def *
+LowerSplit64BitVar::split_reduction4(nir_alu_instr *alu,
+                                     nir_op op1, nir_op op2, nir_op reduction)
+{
+   nir_ssa_def *src[2][2];
+
+   src[0][0] = nir_channels(b, nir_ssa_for_src(b, alu->src[0].src, 2), 3);
+   src[0][1] = nir_channels(b, nir_ssa_for_src(b, alu->src[1].src, 2), 3);
+
+   src[1][0] = nir_channels(b, nir_ssa_for_src(b, alu->src[0].src, 4), 0xc);
+   src[1][1] = nir_channels(b, nir_ssa_for_src(b, alu->src[1].src, 4), 0xc);
+
+   return split_reduction(src, op1, op2, reduction);
+}
+
+nir_ssa_def *
+LowerSplit64BitVar::split_bcsel(nir_alu_instr *alu)
+{
+   static nir_ssa_def *dest[4];
+   for (unsigned i = 0; i < nir_dest_num_components(alu->dest.dest); ++i) {
+      dest[i] = nir_bcsel(b,
+                          nir_channel(b, alu->src[0].src.ssa, i),
+                          nir_channel(b, alu->src[1].src.ssa, i),
+                          nir_channel(b, alu->src[2].src.ssa, i));
+   }
+   return nir_vec(b, dest, nir_dest_num_components(alu->dest.dest));
+}
+
+nir_ssa_def *
+LowerSplit64BitVar::split_load_const(nir_load_const_instr *lc)
+{
+   nir_ssa_def *ir[4];
+   for (unsigned i = 0; i < lc->def.num_components; ++i)
+      ir[i] = nir_imm_double(b, lc->value[i].f64);
+
+   return nir_vec(b, ir, lc->def.num_components);
+}
+
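+/* Dispatch the filtered instructions to the matching split helper. */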
+nir_ssa_def *
+LowerSplit64BitVar::lower(nir_instr *instr)
+{
+   switch (instr->type) {
+   case nir_instr_type_intrinsic: {
+      auto intr = nir_instr_as_intrinsic(instr);
+      switch (intr->intrinsic) {
+      case nir_intrinsic_load_deref:
+         return this->split_double_load_deref(intr);
+      case nir_intrinsic_load_uniform:
+         return split_double_load_uniform(intr);
+      case nir_intrinsic_load_ubo:
+         return split_double_load_ubo(intr);
+      case nir_intrinsic_load_ssbo:
+         return split_double_load_ssbo(intr);
+      case nir_intrinsic_load_input:
+         return split_double_load(intr);
+      case nir_intrinsic_store_output:
+         return split_store_output(intr);
+      case nir_intrinsic_store_deref:
+         return split_double_store_deref(intr);
+      default:
+         assert(0);
+      }
+   }
+   case nir_instr_type_alu: {
+      auto alu = nir_instr_as_alu(instr);
+      switch (alu->op) {
+      case nir_op_bany_fnequal3:
+         return split_reduction3(alu, nir_op_bany_fnequal2, nir_op_fneu, nir_op_ior);
+      case nir_op_ball_fequal3:
+         return split_reduction3(alu, nir_op_ball_fequal2, nir_op_feq, nir_op_iand);
+      case nir_op_bany_inequal3:
+         return split_reduction3(alu, nir_op_bany_inequal2, nir_op_ine, nir_op_ior);
+      case nir_op_ball_iequal3:
+         return split_reduction3(alu, nir_op_ball_iequal2, nir_op_ieq, nir_op_iand);
+      case nir_op_fdot3:
+         return split_reduction3(alu, nir_op_fdot2, nir_op_fmul, nir_op_fadd);
+      case nir_op_bany_fnequal4:
+         return split_reduction4(alu, nir_op_bany_fnequal2, nir_op_bany_fnequal2, nir_op_ior);
+      case nir_op_ball_fequal4:
+         return split_reduction4(alu, nir_op_ball_fequal2, nir_op_ball_fequal2, nir_op_iand);
+      case nir_op_bany_inequal4:
+         return split_reduction4(alu, nir_op_bany_inequal2, nir_op_bany_inequal2, nir_op_ior);
+      case nir_op_ball_iequal4:
+         return split_reduction4(alu, nir_op_ball_iequal2, nir_op_ball_iequal2, nir_op_iand);
+      case nir_op_fdot4:
+         return split_reduction4(alu, nir_op_fdot2, nir_op_fdot2, nir_op_fadd);
+      case nir_op_bcsel:
+         return split_bcsel(alu);
+      default:
+         assert(0);
+      }
+   }
+   case nir_instr_type_load_const: {
+      auto lc = nir_instr_as_load_const(instr);
+      return split_load_const(lc);
+   }
+   default:
+      assert(0);
+   }
+   return nullptr;
+}
+
+/* Split 64 bit instructions so that at most two 64 bit components are
+ * used in one instruction */
+
+bool
+r600_nir_split_64bit_io(nir_shader *sh)
+{
+   return LowerSplit64BitVar().run(sh);
+}
+
+/* Lower 64-bit values to pairs of 32-bit components. */
+class Lower64BitToVec2 : public NirLowerInstruction {
+
+private:
+   bool filter(const nir_instr *instr) const override;
+   nir_ssa_def *lower(nir_instr *instr) override;
+
+   nir_ssa_def *load_deref_64_to_vec2(nir_intrinsic_instr *intr);
+   nir_ssa_def *load_uniform_64_to_vec2(nir_intrinsic_instr *intr);
+   nir_ssa_def *load_ssbo_64_to_vec2(nir_intrinsic_instr *intr);
+   nir_ssa_def *load_64_to_vec2(nir_intrinsic_instr *intr);
+   nir_ssa_def *store_64_to_vec2(nir_intrinsic_instr *intr);
+};
+
+bool
+Lower64BitToVec2::filter(const nir_instr *instr) const
+{
+   switch (instr->type) {
+   case nir_instr_type_intrinsic: {
+      auto intr = nir_instr_as_intrinsic(instr);
+
+      switch (intr->intrinsic) {
+      case nir_intrinsic_load_deref:
+      case nir_intrinsic_load_input:
+      case nir_intrinsic_load_uniform:
+      case nir_intrinsic_load_ubo:
+      case nir_intrinsic_load_ubo_vec4:
+      case nir_intrinsic_load_ssbo:
+         return nir_dest_bit_size(intr->dest) == 64;
+      case nir_intrinsic_store_deref: {
+         if (nir_src_bit_size(intr->src[1]) == 64)
+            return true;
+         auto var = nir_intrinsic_get_var(intr, 0);
+         if (var->type->without_array()->bit_size() == 64)
+            return true;
+         return (var->type->without_array()->components() != intr->num_components);
+      }
+      default:
+         return false;
+      }
+   }
+   case nir_instr_type_alu: {
+      auto alu = nir_instr_as_alu(instr);
+      return nir_dest_bit_size(alu->dest.dest) == 64;
+   }
+   case nir_instr_type_phi: {
+      auto phi = nir_instr_as_phi(instr);
+      return nir_dest_bit_size(phi->dest) == 64;
+   }
+   case nir_instr_type_load_const: {
+      auto lc = nir_instr_as_load_const(instr);
+      return lc->def.bit_size == 64;
+   }
+   case nir_instr_type_ssa_undef: {
+      auto undef = nir_instr_as_ssa_undef(instr);
+      return undef->def.bit_size == 64;
+   }
+   default:
+      return false;
+   }
+}
+
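+/* Rewrite 64-bit destinations as 32-bit values with twice the number
+ * of components; the sources are fixed up in r600_nir_64_to_vec2(). */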
+nir_ssa_def *
+Lower64BitToVec2::lower(nir_instr *instr)
+{
+   switch (instr->type) {
+   case nir_instr_type_intrinsic: {
+      auto intr = nir_instr_as_intrinsic(instr);
+      switch (intr->intrinsic) {
+      case nir_intrinsic_load_deref:
+         return load_deref_64_to_vec2(intr);
+      case nir_intrinsic_load_uniform:
+         return load_uniform_64_to_vec2(intr);
+      case nir_intrinsic_load_ssbo:
+         return load_ssbo_64_to_vec2(intr);
+      case nir_intrinsic_load_input:
+      case nir_intrinsic_load_ubo:
+      case nir_intrinsic_load_ubo_vec4:
+         return load_64_to_vec2(intr);
+      case nir_intrinsic_store_deref:
+         return store_64_to_vec2(intr);
+      default:
+         return nullptr;
+      }
+   }
+   case nir_instr_type_alu: {
+      auto alu = nir_instr_as_alu(instr);
+      alu->dest.dest.ssa.bit_size = 32;
+      alu->dest.dest.ssa.num_components *= 2;
+      alu->dest.write_mask = (1 << alu->dest.dest.ssa.num_components) - 1;
+      switch (alu->op) {
+      case nir_op_pack_64_2x32_split:
+         alu->op = nir_op_vec2;
+         break;
+      case nir_op_pack_64_2x32:
+         alu->op = nir_op_mov;
+         break;
+      case nir_op_vec2:
+         return nir_vec4(b,
+                         nir_channel(b, alu->src[0].src.ssa, 0),
+                         nir_channel(b, alu->src[0].src.ssa, 1),
+                         nir_channel(b, alu->src[1].src.ssa, 0),
+                         nir_channel(b, alu->src[1].src.ssa, 1));
+      default:
+         return NULL;
+      }
+      return progress_keep;
+   }
+   case nir_instr_type_phi: {
+      auto phi = nir_instr_as_phi(instr);
+      phi->dest.ssa.bit_size = 32;
+      phi->dest.ssa.num_components = 2;
+      return progress_keep;
+   }
+   case nir_instr_type_load_const: {
+      auto lc = nir_instr_as_load_const(instr);
+      assert(lc->def.num_components < 3);
+      nir_const_value val[4] = {0};
+      for (uint i = 0; i < lc->def.num_components; ++i) {
+         uint64_t v = lc->value[i].u64;
+         val[2 * i].u32 = v & 0xffffffff;
+         val[2 * i + 1].u32 = (v >> 32) & 0xffffffff;
+      }
+
+      return nir_build_imm(b, 2 * lc->def.num_components, 32, val);
+   }
+   case nir_instr_type_ssa_undef: {
+      auto undef = nir_instr_as_ssa_undef(instr);
+      undef->def.num_components *= 2;
+      undef->def.bit_size = 32;
+      return progress_keep;
+   }
+   default:
+      return nullptr;
+   }
+}
+
+
+nir_ssa_def *
+Lower64BitToVec2::load_deref_64_to_vec2(nir_intrinsic_instr *intr)
+{
+   auto deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr);
+   auto var = nir_intrinsic_get_var(intr, 0);
+   unsigned components = var->type->without_array()->components();
+   if (var->type->without_array()->bit_size() == 64) {
+      components *= 2;
+      if (deref->deref_type == nir_deref_type_var) {
+         var->type = glsl_vec_type(components);
+      } else if (deref->deref_type == nir_deref_type_array) {
+         var->type = glsl_array_type(glsl_vec_type(components),
+                                     var->type->array_size(), 0);
+      } else {
+         nir_print_shader(b->shader, stderr);
+         assert(0 && "Only lowering of var and array derefs supported\n");
+      }
+   }
+   deref->type = var->type;
+   if (deref->deref_type == nir_deref_type_array) {
+      auto deref_array = nir_instr_as_deref(deref->parent.ssa->parent_instr);
+      deref_array->type = var->type;
+      deref->type = deref_array->type->without_array();
+   }
+
+   intr->num_components = components;
+   intr->dest.ssa.bit_size = 32;
+   intr->dest.ssa.num_components = components;
+   return progress_keep;
+}
+
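+/* Retype the destination variable and its deref chain from 64-bit to
+ * twice as many 32-bit components and widen the write mask. */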
+nir_ssa_def *
+Lower64BitToVec2::store_64_to_vec2(nir_intrinsic_instr *intr)
+{
+   auto deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr);
+   auto var = nir_intrinsic_get_var(intr, 0);
+
+   unsigned components = var->type->without_array()->components();
+   unsigned wrmask = nir_intrinsic_write_mask(intr);
+   if (var->type->without_array()->bit_size() == 64) {
+      components *= 2;
+      if (deref->deref_type == nir_deref_type_var) {
+         var->type = glsl_vec_type(components);
+      } else if (deref->deref_type == nir_deref_type_array) {
+         var->type = glsl_array_type(glsl_vec_type(components),
+                                     var->type->array_size(), 0);
+      } else {
+         nir_print_shader(b->shader, stderr);
+         assert(0 && "Only lowering of var and array derefs supported\n");
+      }
+   }
+   deref->type = var->type;
+   if (deref->deref_type == nir_deref_type_array) {
+      auto deref_array = nir_instr_as_deref(deref->parent.ssa->parent_instr);
+      deref_array->type = var->type;
+      deref->type = deref_array->type->without_array();
+   }
+   intr->num_components = components;
+   nir_intrinsic_set_write_mask(intr, wrmask == 1 ? 3 : 0xf);
+   return progress_keep;
+}
+
+
+nir_ssa_def *
+Lower64BitToVec2::load_uniform_64_to_vec2(nir_intrinsic_instr *intr)
+{
+   intr->num_components *= 2;
+   intr->dest.ssa.bit_size = 32;
+   intr->dest.ssa.num_components *= 2;
+   nir_intrinsic_set_dest_type(intr, nir_type_float32);
+   return progress_keep;
+}
+
+nir_ssa_def *
+Lower64BitToVec2::load_64_to_vec2(nir_intrinsic_instr *intr)
+{
+   intr->num_components *= 2;
+   intr->dest.ssa.bit_size = 32;
+   intr->dest.ssa.num_components *= 2;
+   nir_intrinsic_set_component(intr, nir_intrinsic_component(intr) * 2);
+   return progress_keep;
+}
+
+nir_ssa_def *
+Lower64BitToVec2::load_ssbo_64_to_vec2(nir_intrinsic_instr *intr)
+{
+   intr->num_components *= 2;
+   intr->dest.ssa.bit_size = 32;
+   intr->dest.ssa.num_components *= 2;
+   return progress_keep;
+}
+
+static bool store_64bit_intr(nir_src *src, void *state)
+{
+   bool *s = (bool *)state;
+   *s = nir_src_bit_size(*src) == 64;
+   return !*s;
+}
+
+static bool double2vec2(nir_src *src, void *state)
+{
+   if (nir_src_bit_size(*src) != 64)
+      return true;
+
+   assert(src->is_ssa);
+   src->ssa->bit_size = 32;
+   src->ssa->num_components *= 2;
+   return true;
+}
+
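+/* Lower the remaining 64-bit values to 32-bit vec2 pairs: first collect
+ * ALU instructions and output/SSBO stores with 64-bit sources, then run
+ * Lower64BitToVec2, and finally fix up the swizzles of the collected
+ * instructions. */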
+bool
+r600_nir_64_to_vec2(nir_shader *sh)
+{
+   vector<nir_instr*> intr64bit;
+   nir_foreach_function(function, sh) {
+      if (function->impl) {
+         nir_builder b;
+         nir_builder_init(&b, function->impl);
+
+         nir_foreach_block(block, function->impl) {
+            nir_foreach_instr_safe(instr, block) {
+               switch (instr->type) {
+               case nir_instr_type_alu: {
+                  bool success = false;
+                  nir_foreach_src(instr, store_64bit_intr, &success);
+                  if (success)
+                     intr64bit.push_back(instr);
+                  break;
+               }
+               case nir_instr_type_intrinsic: {
+                  auto ir = nir_instr_as_intrinsic(instr);
+                  switch (ir->intrinsic) {
+                  case nir_intrinsic_store_output:
+                  case nir_intrinsic_store_ssbo: {
+                     bool success = false;
+                     nir_foreach_src(instr, store_64bit_intr, &success);
+                     if (success) {
+                        auto wm = nir_intrinsic_write_mask(ir);
+                        nir_intrinsic_set_write_mask(ir, (wm == 1) ? 3 : 0xf);
+                        ir->num_components *= 2;
+                     }
+                     break;
+                  }
+                  default:
+                     ;
+                  }
+               }
+               default:
+                  ;
+               }
+            }
+         }
+      }
+   }
+
+   bool result = Lower64BitToVec2().run(sh);
+
+   if (result || !intr64bit.empty()) {
+
+      for(auto&& instr: intr64bit) {
+         if (instr->type == nir_instr_type_alu) {
+            auto alu = nir_instr_as_alu(instr);
+            auto alu_info = nir_op_infos[alu->op];
+            for (unsigned i = 0; i < alu_info.num_inputs; ++i) {
+               int swizzle[NIR_MAX_VEC_COMPONENTS] = {0};
+               for (unsigned k = 0; k < NIR_MAX_VEC_COMPONENTS / 2; k++) {
+                  if (!nir_alu_instr_channel_used(alu, i, k)) {
+                     continue;
+                  }
+
+                  switch (alu->op) {
+                  case nir_op_unpack_64_2x32_split_x:
+                     swizzle[2 * k] = alu->src[i].swizzle[k] * 2;
+                     alu->op = nir_op_mov;
+                     break;
+                  case nir_op_unpack_64_2x32_split_y:
+                     swizzle[2 * k] = alu->src[i].swizzle[k] * 2 + 1;
+                     alu->op = nir_op_mov;
+                     break;
+                  case nir_op_unpack_64_2x32:
+                     alu->op = nir_op_mov;
+                     break;
+                  case nir_op_bcsel:
+                     if (i == 0) {
+                        swizzle[2 * k] = swizzle[2 * k + 1] = alu->src[i].swizzle[k] * 2;
+                        break;
+                     }
+                     /* fallthrough */
+                  default:
+                     swizzle[2 * k] = alu->src[i].swizzle[k] * 2;
+                     swizzle[2 * k + 1] = alu->src[i].swizzle[k] * 2 + 1;
+                  }
+               }
+               for (unsigned k = 0; k < NIR_MAX_VEC_COMPONENTS; ++k) {
+                  alu->src[i].swizzle[k] = swizzle[k];
+               }
+            }
+         } else
+            nir_foreach_src(instr, double2vec2, nullptr);
+      }
+      result = true;
+   }
+
+   return result;
+}
+
+using std::map;
+using std::vector;
+using std::pair;
+
+class StoreMerger {
+public:
+   StoreMerger(nir_shader *shader);
+   void collect_stores();
+   bool combine();
+   void combine_one_slot(vector<nir_intrinsic_instr*>& stores);
+
+   using StoreCombos = map<unsigned, vector<nir_intrinsic_instr*>>;
+
+   StoreCombos m_stores;
+   nir_shader *sh;
+   nir_builder b;
+};
+
+StoreMerger::StoreMerger(nir_shader *shader):
+   sh(shader)
+{
+}
+
+
+void StoreMerger::collect_stores()
+{
+   unsigned vertex = 0;
+   nir_foreach_function(function, sh) {
+      if (function->impl) {
+         nir_foreach_block(block, function->impl) {
+            nir_foreach_instr_safe(instr, block) {
+               if (instr->type != nir_instr_type_intrinsic)
+                  continue;
+
+               auto ir = nir_instr_as_intrinsic(instr);
+               if (ir->intrinsic == nir_intrinsic_emit_vertex ||
+                   ir->intrinsic == nir_intrinsic_emit_vertex_with_counter) {
+                  ++vertex;
+                  continue;
+               }
+               if (ir->intrinsic != nir_intrinsic_store_output)
+                  continue;
+
+               unsigned index = nir_intrinsic_base(ir) + 64 * vertex +
+                                8 * 64 * nir_intrinsic_io_semantics(ir).gs_streams;
+               m_stores[index].push_back(ir);
+            }
+         }
+      }
+   }
+}
+
+bool StoreMerger::combine()
+{
+   bool progress = false;
+   for(auto&& i : m_stores) {
+      if (i.second.size() < 2)
+         continue;
+
+      combine_one_slot(i.second);
+      progress = true;
+   }
+   return progress;
+}
+
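+/* Emit one store that covers all components written to the slot and
+ * drop the now redundant partial stores. */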
+void StoreMerger::combine_one_slot(vector<nir_intrinsic_instr*>& stores)
+{
+   nir_ssa_def *srcs[4] = {nullptr};
+
+   nir_builder b;
+   nir_builder_init(&b, nir_shader_get_entrypoint(sh));
+   auto last_store = *stores.rbegin();
+
+   b.cursor = nir_before_instr(&last_store->instr);
+
+   unsigned comps = 0;
+   unsigned writemask = 0;
+   unsigned first_comp = 4;
+   for (auto&& store : stores) {
+      int cmp = nir_intrinsic_component(store);
+      for (unsigned i = 0; i < nir_src_num_components(store->src[0]); ++i, ++comps) {
+         unsigned out_comp = i + cmp;
+         srcs[out_comp] = nir_channel(&b, store->src[0].ssa, i);
+         writemask |= 1 << out_comp;
+         if (first_comp > out_comp)
+            first_comp = out_comp;
+      }
+   }
+
+   auto new_src = nir_vec(&b, srcs, comps);
+
+   nir_instr_rewrite_src(&last_store->instr, &last_store->src[0], nir_src_for_ssa(new_src));
+   last_store->num_components = comps;
+   nir_intrinsic_set_component(last_store, first_comp);
+   nir_intrinsic_set_write_mask(last_store, writemask);
+
+   for (auto i = stores.begin(); i != stores.end() - 1; ++i)
+      nir_instr_remove(&(*i)->instr);
+}
+
+bool r600_merge_vec2_stores(nir_shader *shader)
+{
+   r600::StoreMerger merger(shader);
+   merger.collect_stores();
+   return merger.combine();
+}
+
+} // end namespace r600