/* * Copyright © Microsoft Corporation * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. */ #include "dxil_nir.h" #include "nir_builder.h" #include "nir_deref.h" #include "nir_to_dxil.h" #include "util/u_math.h" #include "vulkan/vulkan_core.h" static void cl_type_size_align(const struct glsl_type *type, unsigned *size, unsigned *align) { *size = glsl_get_cl_size(type); *align = glsl_get_cl_alignment(type); } static void extract_comps_from_vec32(nir_builder *b, nir_ssa_def *vec32, unsigned dst_bit_size, nir_ssa_def **dst_comps, unsigned num_dst_comps) { unsigned step = DIV_ROUND_UP(dst_bit_size, 32); unsigned comps_per32b = 32 / dst_bit_size; nir_ssa_def *tmp; for (unsigned i = 0; i < vec32->num_components; i += step) { switch (dst_bit_size) { case 64: tmp = nir_pack_64_2x32_split(b, nir_channel(b, vec32, i), nir_channel(b, vec32, i + 1)); dst_comps[i / 2] = tmp; break; case 32: dst_comps[i] = nir_channel(b, vec32, i); break; case 16: case 8: { unsigned dst_offs = i * comps_per32b; tmp = nir_unpack_bits(b, nir_channel(b, vec32, i), dst_bit_size); for (unsigned j = 0; j < comps_per32b && dst_offs + j < num_dst_comps; j++) dst_comps[dst_offs + j] = nir_channel(b, tmp, j); } break; } } } static nir_ssa_def * load_comps_to_vec32(nir_builder *b, unsigned src_bit_size, nir_ssa_def **src_comps, unsigned num_src_comps) { unsigned num_vec32comps = DIV_ROUND_UP(num_src_comps * src_bit_size, 32); unsigned step = DIV_ROUND_UP(src_bit_size, 32); unsigned comps_per32b = 32 / src_bit_size; nir_ssa_def *vec32comps[4]; for (unsigned i = 0; i < num_vec32comps; i += step) { switch (src_bit_size) { case 64: vec32comps[i] = nir_unpack_64_2x32_split_x(b, src_comps[i / 2]); vec32comps[i + 1] = nir_unpack_64_2x32_split_y(b, src_comps[i / 2]); break; case 32: vec32comps[i] = src_comps[i]; break; case 16: case 8: { unsigned src_offs = i * comps_per32b; vec32comps[i] = nir_u2u32(b, src_comps[src_offs]); for (unsigned j = 1; j < comps_per32b && src_offs + j < num_src_comps; j++) { nir_ssa_def *tmp = nir_ishl(b, nir_u2u32(b, src_comps[src_offs + j]), nir_imm_int(b, j * src_bit_size)); vec32comps[i] = nir_ior(b, vec32comps[i], tmp); } break; } } } return nir_vec(b, vec32comps, num_vec32comps); } static nir_ssa_def * build_load_ptr_dxil(nir_builder *b, nir_deref_instr *deref, nir_ssa_def *idx) { return nir_load_ptr_dxil(b, 1, 32, &deref->dest.ssa, idx); } static bool lower_load_deref(nir_builder *b, nir_intrinsic_instr *intr) { assert(intr->dest.is_ssa); b->cursor = nir_before_instr(&intr->instr); 
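/* Loads on shader-temp derefs are rewritten as 32-bit load_ptr_dxil loads * indexed by offset / 4 and then repacked to the original bit size: e.g. a * 16-bit vec4 load becomes two 32-bit loads whose halves are split back out * by extract_comps_from_vec32(). */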
nir_deref_instr *deref = nir_src_as_deref(intr->src[0]); if (!nir_deref_mode_is(deref, nir_var_shader_temp)) return false; nir_ssa_def *ptr = nir_u2u32(b, nir_build_deref_offset(b, deref, cl_type_size_align)); nir_ssa_def *offset = nir_iand(b, ptr, nir_inot(b, nir_imm_int(b, 3))); assert(intr->dest.is_ssa); unsigned num_components = nir_dest_num_components(intr->dest); unsigned bit_size = nir_dest_bit_size(intr->dest); unsigned load_size = MAX2(32, bit_size); unsigned num_bits = num_components * bit_size; nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS]; unsigned comp_idx = 0; nir_deref_path path; nir_deref_path_init(&path, deref, NULL); nir_ssa_def *base_idx = nir_ishr(b, offset, nir_imm_int(b, 2 /* log2(32 / 8) */)); /* Split loads into 32-bit chunks */ for (unsigned i = 0; i < num_bits; i += load_size) { unsigned subload_num_bits = MIN2(num_bits - i, load_size); nir_ssa_def *idx = nir_iadd(b, base_idx, nir_imm_int(b, i / 32)); nir_ssa_def *vec32 = build_load_ptr_dxil(b, path.path[0], idx); if (load_size == 64) { idx = nir_iadd(b, idx, nir_imm_int(b, 1)); vec32 = nir_vec2(b, vec32, build_load_ptr_dxil(b, path.path[0], idx)); } /* If we have 2 bytes or less to load we need to adjust the u32 value so * we can always extract the LSB. */ if (subload_num_bits <= 16) { nir_ssa_def *shift = nir_imul(b, nir_iand(b, ptr, nir_imm_int(b, 3)), nir_imm_int(b, 8)); vec32 = nir_ushr(b, vec32, shift); } /* And now comes the pack/unpack step to match the original type. */ extract_comps_from_vec32(b, vec32, bit_size, &comps[comp_idx], subload_num_bits / bit_size); comp_idx += subload_num_bits / bit_size; } nir_deref_path_finish(&path); assert(comp_idx == num_components); nir_ssa_def *result = nir_vec(b, comps, num_components); nir_ssa_def_rewrite_uses(&intr->dest.ssa, result); nir_instr_remove(&intr->instr); return true; } static nir_ssa_def * ubo_load_select_32b_comps(nir_builder *b, nir_ssa_def *vec32, nir_ssa_def *offset, unsigned num_bytes) { assert(num_bytes == 16 || num_bytes == 12 || num_bytes == 8 || num_bytes == 4 || num_bytes == 3 || num_bytes == 2 || num_bytes == 1); assert(vec32->num_components == 4); /* 16 and 12 byte types are always aligned on 16 bytes. */ if (num_bytes > 8) return vec32; nir_ssa_def *comps[4]; nir_ssa_def *cond; for (unsigned i = 0; i < 4; i++) comps[i] = nir_channel(b, vec32, i); /* If we have 8bytes or less to load, select which half the vec4 should * be used. */ cond = nir_ine(b, nir_iand(b, offset, nir_imm_int(b, 0x8)), nir_imm_int(b, 0)); comps[0] = nir_bcsel(b, cond, comps[2], comps[0]); comps[1] = nir_bcsel(b, cond, comps[3], comps[1]); /* Thanks to the CL alignment constraints, if we want 8 bytes we're done. */ if (num_bytes == 8) return nir_vec(b, comps, 2); /* 4 bytes or less needed, select which of the 32bit component should be * used and return it. The sub-32bit split is handled in * extract_comps_from_vec32(). */ cond = nir_ine(b, nir_iand(b, offset, nir_imm_int(b, 0x4)), nir_imm_int(b, 0)); return nir_bcsel(b, cond, comps[1], comps[0]); } nir_ssa_def * build_load_ubo_dxil(nir_builder *b, nir_ssa_def *buffer, nir_ssa_def *offset, unsigned num_components, unsigned bit_size) { nir_ssa_def *idx = nir_ushr(b, offset, nir_imm_int(b, 4)); nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS]; unsigned num_bits = num_components * bit_size; unsigned comp_idx = 0; /* We need to split loads in 16byte chunks because that's the * granularity of cBufferLoadLegacy(). 
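* For example, a 16-bit load at byte offset 6 reads the whole 16-byte row, * ubo_load_select_32b_comps() narrows it down to the 32-bit lane holding * bytes 4-7, and the shift/unpack below moves the value into the low bits.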
*/ for (unsigned i = 0; i < num_bits; i += (16 * 8)) { /* For each 16byte chunk (or smaller) we generate a 32bit ubo vec * load. */ unsigned subload_num_bits = MIN2(num_bits - i, 16 * 8); nir_ssa_def *vec32 = nir_load_ubo_dxil(b, 4, 32, buffer, nir_iadd(b, idx, nir_imm_int(b, i / (16 * 8)))); /* First re-arrange the vec32 to account for intra 16-byte offset. */ vec32 = ubo_load_select_32b_comps(b, vec32, offset, subload_num_bits / 8); /* If we have 2 bytes or less to load we need to adjust the u32 value so * we can always extract the LSB. */ if (subload_num_bits <= 16) { nir_ssa_def *shift = nir_imul(b, nir_iand(b, offset, nir_imm_int(b, 3)), nir_imm_int(b, 8)); vec32 = nir_ushr(b, vec32, shift); } /* And now comes the pack/unpack step to match the original type. */ extract_comps_from_vec32(b, vec32, bit_size, &comps[comp_idx], subload_num_bits / bit_size); comp_idx += subload_num_bits / bit_size; } assert(comp_idx == num_components); return nir_vec(b, comps, num_components); } static bool lower_load_ssbo(nir_builder *b, nir_intrinsic_instr *intr) { assert(intr->dest.is_ssa); assert(intr->src[0].is_ssa); assert(intr->src[1].is_ssa); b->cursor = nir_before_instr(&intr->instr); nir_ssa_def *buffer = intr->src[0].ssa; nir_ssa_def *offset = nir_iand(b, intr->src[1].ssa, nir_imm_int(b, ~3)); enum gl_access_qualifier access = nir_intrinsic_access(intr); unsigned bit_size = nir_dest_bit_size(intr->dest); unsigned num_components = nir_dest_num_components(intr->dest); unsigned num_bits = num_components * bit_size; nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS]; unsigned comp_idx = 0; /* We need to split loads in 16byte chunks because that's the optimal * granularity of bufferLoad(). Minimum alignment is 4byte, which saves * us from extra complexity to extract >= 32 bit components. */ for (unsigned i = 0; i < num_bits; i += 4 * 32) { /* For each 16byte chunk (or smaller) we generate a 32bit ssbo vec * load. */ unsigned subload_num_bits = MIN2(num_bits - i, 4 * 32); /* The number of components to load depends on the number of bytes. */ nir_ssa_def *vec32 = nir_load_ssbo(b, DIV_ROUND_UP(subload_num_bits, 32), 32, buffer, nir_iadd(b, offset, nir_imm_int(b, i / 8)), .align_mul = 4, .align_offset = 0, .access = access); /* If we have 2 bytes or less to load we need to adjust the u32 value so * we can always extract the LSB. */ if (subload_num_bits <= 16) { nir_ssa_def *shift = nir_imul(b, nir_iand(b, intr->src[1].ssa, nir_imm_int(b, 3)), nir_imm_int(b, 8)); vec32 = nir_ushr(b, vec32, shift); } /* And now comes the pack/unpack step to match the original type.
*/ extract_comps_from_vec32(b, vec32, bit_size, &comps[comp_idx], subload_num_bits / bit_size); comp_idx += subload_num_bits / bit_size; } assert(comp_idx == num_components); nir_ssa_def *result = nir_vec(b, comps, num_components); nir_ssa_def_rewrite_uses(&intr->dest.ssa, result); nir_instr_remove(&intr->instr); return true; } static bool lower_store_ssbo(nir_builder *b, nir_intrinsic_instr *intr) { b->cursor = nir_before_instr(&intr->instr); assert(intr->src[0].is_ssa); assert(intr->src[1].is_ssa); assert(intr->src[2].is_ssa); nir_ssa_def *val = intr->src[0].ssa; nir_ssa_def *buffer = intr->src[1].ssa; nir_ssa_def *offset = nir_iand(b, intr->src[2].ssa, nir_imm_int(b, ~3)); unsigned bit_size = val->bit_size; unsigned num_components = val->num_components; unsigned num_bits = num_components * bit_size; nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS] = { 0 }; unsigned comp_idx = 0; unsigned write_mask = nir_intrinsic_write_mask(intr); for (unsigned i = 0; i < num_components; i++) if (write_mask & (1 << i)) comps[i] = nir_channel(b, val, i); /* We split stores in 16byte chunks because that's the optimal granularity * of bufferStore(). Minimum alignment is 4byte, which saves us from * extra complexity to store >= 32 bit components. */ unsigned bit_offset = 0; while (true) { /* Skip over holes in the write mask */ while (comp_idx < num_components && comps[comp_idx] == NULL) { comp_idx++; bit_offset += bit_size; } if (comp_idx >= num_components) break; /* For each 16byte chunk (or smaller) we generate a 32bit ssbo vec * store. If a component is skipped by the write mask, do a smaller * sub-store */ unsigned num_src_comps_stored = 0, substore_num_bits = 0; while (num_src_comps_stored + comp_idx < num_components && substore_num_bits + bit_offset < num_bits && substore_num_bits < 4 * 32 && comps[comp_idx + num_src_comps_stored]) { ++num_src_comps_stored; substore_num_bits += bit_size; } nir_ssa_def *local_offset = nir_iadd(b, offset, nir_imm_int(b, bit_offset / 8)); nir_ssa_def *vec32 = load_comps_to_vec32(b, bit_size, &comps[comp_idx], num_src_comps_stored); nir_intrinsic_instr *store; if (substore_num_bits < 32) { nir_ssa_def *mask = nir_imm_int(b, (1 << substore_num_bits) - 1); /* If we have 16 bits or less to store we need to place them * correctly in the u32 component. Anything greater than 16 bits * (including uchar3) is naturally aligned on 32bits. */ if (substore_num_bits <= 16) { nir_ssa_def *pos = nir_iand(b, intr->src[2].ssa, nir_imm_int(b, 3)); nir_ssa_def *shift = nir_imul_imm(b, pos, 8); vec32 = nir_ishl(b, vec32, shift); mask = nir_ishl(b, mask, shift); } store = nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_ssbo_masked_dxil); store->src[0] = nir_src_for_ssa(vec32); store->src[1] = nir_src_for_ssa(nir_inot(b, mask)); store->src[2] = nir_src_for_ssa(buffer); store->src[3] = nir_src_for_ssa(local_offset); } else { store = nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_ssbo); store->src[0] = nir_src_for_ssa(vec32); store->src[1] = nir_src_for_ssa(buffer); store->src[2] = nir_src_for_ssa(local_offset); nir_intrinsic_set_align(store, 4, 0); } /* The number of components to store depends on the number of bits.
*/ store->num_components = DIV_ROUND_UP(substore_num_bits, 32); nir_builder_instr_insert(b, &store->instr); comp_idx += num_src_comps_stored; bit_offset += substore_num_bits; if (nir_intrinsic_has_write_mask(store)) nir_intrinsic_set_write_mask(store, (1 << store->num_components) - 1); } nir_instr_remove(&intr->instr); return true; } static void lower_load_vec32(nir_builder *b, nir_ssa_def *index, unsigned num_comps, nir_ssa_def **comps, nir_intrinsic_op op) { for (unsigned i = 0; i < num_comps; i++) { nir_intrinsic_instr *load = nir_intrinsic_instr_create(b->shader, op); load->num_components = 1; load->src[0] = nir_src_for_ssa(nir_iadd(b, index, nir_imm_int(b, i))); nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL); nir_builder_instr_insert(b, &load->instr); comps[i] = &load->dest.ssa; } } static bool lower_32b_offset_load(nir_builder *b, nir_intrinsic_instr *intr) { assert(intr->dest.is_ssa); unsigned bit_size = nir_dest_bit_size(intr->dest); unsigned num_components = nir_dest_num_components(intr->dest); unsigned num_bits = num_components * bit_size; b->cursor = nir_before_instr(&intr->instr); nir_intrinsic_op op = intr->intrinsic; assert(intr->src[0].is_ssa); nir_ssa_def *offset = intr->src[0].ssa; if (op == nir_intrinsic_load_shared) { offset = nir_iadd(b, offset, nir_imm_int(b, nir_intrinsic_base(intr))); op = nir_intrinsic_load_shared_dxil; } else { offset = nir_u2u32(b, offset); op = nir_intrinsic_load_scratch_dxil; } nir_ssa_def *index = nir_ushr(b, offset, nir_imm_int(b, 2)); nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS]; nir_ssa_def *comps_32bit[NIR_MAX_VEC_COMPONENTS * 2]; /* We need to split loads in 32-bit accesses because the buffer * is an i32 array and DXIL does not support type casts. */ unsigned num_32bit_comps = DIV_ROUND_UP(num_bits, 32); lower_load_vec32(b, index, num_32bit_comps, comps_32bit, op); unsigned num_comps_per_pass = MIN2(num_32bit_comps, 4); for (unsigned i = 0; i < num_32bit_comps; i += num_comps_per_pass) { unsigned num_vec32_comps = MIN2(num_32bit_comps - i, 4); unsigned num_dest_comps = num_vec32_comps * 32 / bit_size; nir_ssa_def *vec32 = nir_vec(b, &comps_32bit[i], num_vec32_comps); /* If we have 16 bits or less to load we need to adjust the u32 value so * we can always extract the LSB. */ if (num_bits <= 16) { nir_ssa_def *shift = nir_imul(b, nir_iand(b, offset, nir_imm_int(b, 3)), nir_imm_int(b, 8)); vec32 = nir_ushr(b, vec32, shift); } /* And now comes the pack/unpack step to match the original type. */ unsigned dest_index = i * 32 / bit_size; extract_comps_from_vec32(b, vec32, bit_size, &comps[dest_index], num_dest_comps); } nir_ssa_def *result = nir_vec(b, comps, num_components); nir_ssa_def_rewrite_uses(&intr->dest.ssa, result); nir_instr_remove(&intr->instr); return true; } static void lower_store_vec32(nir_builder *b, nir_ssa_def *index, nir_ssa_def *vec32, nir_intrinsic_op op) { for (unsigned i = 0; i < vec32->num_components; i++) { nir_intrinsic_instr *store = nir_intrinsic_instr_create(b->shader, op); store->src[0] = nir_src_for_ssa(nir_channel(b, vec32, i)); store->src[1] = nir_src_for_ssa(nir_iadd(b, index, nir_imm_int(b, i))); store->num_components = 1; nir_builder_instr_insert(b, &store->instr); } } static void lower_masked_store_vec32(nir_builder *b, nir_ssa_def *offset, nir_ssa_def *index, nir_ssa_def *vec32, unsigned num_bits, nir_intrinsic_op op) { nir_ssa_def *mask = nir_imm_int(b, (1 << num_bits) - 1); /* If we have 16 bits or less to store we need to place them correctly in * the u32 component. 
Anything greater than 16 bits (including uchar3) is * naturally aligned on 32bits. */ if (num_bits <= 16) { nir_ssa_def *shift = nir_imul_imm(b, nir_iand(b, offset, nir_imm_int(b, 3)), 8); vec32 = nir_ishl(b, vec32, shift); mask = nir_ishl(b, mask, shift); } if (op == nir_intrinsic_store_shared_dxil) { /* Use the dedicated masked intrinsic */ nir_store_shared_masked_dxil(b, vec32, nir_inot(b, mask), index); } else { /* For scratch, since we don't need atomics, just generate the read-modify-write in NIR */ nir_ssa_def *load = nir_load_scratch_dxil(b, 1, 32, index); nir_ssa_def *new_val = nir_ior(b, vec32, nir_iand(b, nir_inot(b, mask), load)); lower_store_vec32(b, index, new_val, op); } } static bool lower_32b_offset_store(nir_builder *b, nir_intrinsic_instr *intr) { assert(intr->src[0].is_ssa); unsigned num_components = nir_src_num_components(intr->src[0]); unsigned bit_size = nir_src_bit_size(intr->src[0]); unsigned num_bits = num_components * bit_size; b->cursor = nir_before_instr(&intr->instr); nir_intrinsic_op op = intr->intrinsic; nir_ssa_def *offset = intr->src[1].ssa; if (op == nir_intrinsic_store_shared) { offset = nir_iadd(b, offset, nir_imm_int(b, nir_intrinsic_base(intr))); op = nir_intrinsic_store_shared_dxil; } else { offset = nir_u2u32(b, offset); op = nir_intrinsic_store_scratch_dxil; } nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS]; unsigned comp_idx = 0; for (unsigned i = 0; i < num_components; i++) comps[i] = nir_channel(b, intr->src[0].ssa, i); for (unsigned i = 0; i < num_bits; i += 4 * 32) { /* For each 4byte chunk (or smaller) we generate a 32bit scalar store. */ unsigned substore_num_bits = MIN2(num_bits - i, 4 * 32); nir_ssa_def *local_offset = nir_iadd(b, offset, nir_imm_int(b, i / 8)); nir_ssa_def *vec32 = load_comps_to_vec32(b, bit_size, &comps[comp_idx], substore_num_bits / bit_size); nir_ssa_def *index = nir_ushr(b, local_offset, nir_imm_int(b, 2)); /* For anything less than 32bits we need to use the masked version of the * intrinsic to preserve data living in the same 32bit slot. */ if (num_bits < 32) { lower_masked_store_vec32(b, local_offset, index, vec32, num_bits, op); } else { lower_store_vec32(b, index, vec32, op); } comp_idx += substore_num_bits / bit_size; } nir_instr_remove(&intr->instr); return true; } static void ubo_to_temp_patch_deref_mode(nir_deref_instr *deref) { deref->modes = nir_var_shader_temp; nir_foreach_use(use_src, &deref->dest.ssa) { if (use_src->parent_instr->type != nir_instr_type_deref) continue; nir_deref_instr *parent = nir_instr_as_deref(use_src->parent_instr); ubo_to_temp_patch_deref_mode(parent); } } static void ubo_to_temp_update_entry(nir_deref_instr *deref, struct hash_entry *he) { assert(nir_deref_mode_is(deref, nir_var_mem_constant)); assert(deref->dest.is_ssa); assert(he->data); nir_foreach_use(use_src, &deref->dest.ssa) { if (use_src->parent_instr->type == nir_instr_type_deref) { ubo_to_temp_update_entry(nir_instr_as_deref(use_src->parent_instr), he); } else if (use_src->parent_instr->type == nir_instr_type_intrinsic) { nir_intrinsic_instr *intr = nir_instr_as_intrinsic(use_src->parent_instr); if (intr->intrinsic != nir_intrinsic_load_deref) he->data = NULL; } else { he->data = NULL; } if (!he->data) break; } } bool dxil_nir_lower_ubo_to_temp(nir_shader *nir) { struct hash_table *ubo_to_temp = _mesa_pointer_hash_table_create(NULL); bool progress = false; /* First pass: collect all UBO accesses that could be turned into * shader temp accesses. 
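* Only variables whose deref chains are consumed exclusively by load_deref * qualify; ubo_to_temp_update_entry() clears the hash-table entry as soon as * any other kind of use is seen.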
*/ foreach_list_typed(nir_function, func, node, &nir->functions) { if (!func->is_entrypoint) continue; assert(func->impl); nir_foreach_block(block, func->impl) { nir_foreach_instr_safe(instr, block) { if (instr->type != nir_instr_type_deref) continue; nir_deref_instr *deref = nir_instr_as_deref(instr); if (!nir_deref_mode_is(deref, nir_var_mem_constant) || deref->deref_type != nir_deref_type_var) continue; struct hash_entry *he = _mesa_hash_table_search(ubo_to_temp, deref->var); if (!he) he = _mesa_hash_table_insert(ubo_to_temp, deref->var, deref->var); if (!he->data) continue; ubo_to_temp_update_entry(deref, he); } } } hash_table_foreach(ubo_to_temp, he) { nir_variable *var = he->data; if (!var) continue; /* Change the variable mode. */ var->data.mode = nir_var_shader_temp; /* Make sure the variable has a name. * DXIL variables must have names. */ if (!var->name) var->name = ralloc_asprintf(nir, "global_%d", exec_list_length(&nir->variables)); progress = true; } _mesa_hash_table_destroy(ubo_to_temp, NULL); /* Second pass: patch all derefs that were accessing the converted UBOs * variables. */ foreach_list_typed(nir_function, func, node, &nir->functions) { if (!func->is_entrypoint) continue; assert(func->impl); nir_foreach_block(block, func->impl) { nir_foreach_instr_safe(instr, block) { if (instr->type != nir_instr_type_deref) continue; nir_deref_instr *deref = nir_instr_as_deref(instr); if (nir_deref_mode_is(deref, nir_var_mem_constant) && deref->deref_type == nir_deref_type_var && deref->var->data.mode == nir_var_shader_temp) ubo_to_temp_patch_deref_mode(deref); } } } return progress; } static bool lower_load_ubo(nir_builder *b, nir_intrinsic_instr *intr) { assert(intr->dest.is_ssa); assert(intr->src[0].is_ssa); assert(intr->src[1].is_ssa); b->cursor = nir_before_instr(&intr->instr); nir_ssa_def *result = build_load_ubo_dxil(b, intr->src[0].ssa, intr->src[1].ssa, nir_dest_num_components(intr->dest), nir_dest_bit_size(intr->dest)); nir_ssa_def_rewrite_uses(&intr->dest.ssa, result); nir_instr_remove(&intr->instr); return true; } bool dxil_nir_lower_loads_stores_to_dxil(nir_shader *nir) { bool progress = false; foreach_list_typed(nir_function, func, node, &nir->functions) { if (!func->is_entrypoint) continue; assert(func->impl); nir_builder b; nir_builder_init(&b, func->impl); nir_foreach_block(block, func->impl) { nir_foreach_instr_safe(instr, block) { if (instr->type != nir_instr_type_intrinsic) continue; nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); switch (intr->intrinsic) { case nir_intrinsic_load_deref: progress |= lower_load_deref(&b, intr); break; case nir_intrinsic_load_shared: case nir_intrinsic_load_scratch: progress |= lower_32b_offset_load(&b, intr); break; case nir_intrinsic_load_ssbo: progress |= lower_load_ssbo(&b, intr); break; case nir_intrinsic_load_ubo: progress |= lower_load_ubo(&b, intr); break; case nir_intrinsic_store_shared: case nir_intrinsic_store_scratch: progress |= lower_32b_offset_store(&b, intr); break; case nir_intrinsic_store_ssbo: progress |= lower_store_ssbo(&b, intr); break; default: break; } } } } return progress; } static bool lower_shared_atomic(nir_builder *b, nir_intrinsic_instr *intr, nir_intrinsic_op dxil_op) { b->cursor = nir_before_instr(&intr->instr); assert(intr->src[0].is_ssa); nir_ssa_def *offset = nir_iadd(b, intr->src[0].ssa, nir_imm_int(b, nir_intrinsic_base(intr))); nir_ssa_def *index = nir_ushr(b, offset, nir_imm_int(b, 2)); nir_intrinsic_instr *atomic = nir_intrinsic_instr_create(b->shader, dxil_op); atomic->src[0] = 
nir_src_for_ssa(index); assert(intr->src[1].is_ssa); atomic->src[1] = nir_src_for_ssa(intr->src[1].ssa); if (dxil_op == nir_intrinsic_shared_atomic_comp_swap_dxil) { assert(intr->src[2].is_ssa); atomic->src[2] = nir_src_for_ssa(intr->src[2].ssa); } atomic->num_components = 0; nir_ssa_dest_init(&atomic->instr, &atomic->dest, 1, 32, NULL); nir_builder_instr_insert(b, &atomic->instr); nir_ssa_def_rewrite_uses(&intr->dest.ssa, &atomic->dest.ssa); nir_instr_remove(&intr->instr); return true; } bool dxil_nir_lower_atomics_to_dxil(nir_shader *nir) { bool progress = false; foreach_list_typed(nir_function, func, node, &nir->functions) { if (!func->is_entrypoint) continue; assert(func->impl); nir_builder b; nir_builder_init(&b, func->impl); nir_foreach_block(block, func->impl) { nir_foreach_instr_safe(instr, block) { if (instr->type != nir_instr_type_intrinsic) continue; nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); switch (intr->intrinsic) { #define ATOMIC(op) \ case nir_intrinsic_shared_atomic_##op: \ progress |= lower_shared_atomic(&b, intr, \ nir_intrinsic_shared_atomic_##op##_dxil); \ break ATOMIC(add); ATOMIC(imin); ATOMIC(umin); ATOMIC(imax); ATOMIC(umax); ATOMIC(and); ATOMIC(or); ATOMIC(xor); ATOMIC(exchange); ATOMIC(comp_swap); #undef ATOMIC default: break; } } } } return progress; } static bool lower_deref_ssbo(nir_builder *b, nir_deref_instr *deref) { assert(nir_deref_mode_is(deref, nir_var_mem_ssbo)); assert(deref->deref_type == nir_deref_type_var || deref->deref_type == nir_deref_type_cast); nir_variable *var = deref->var; b->cursor = nir_before_instr(&deref->instr); if (deref->deref_type == nir_deref_type_var) { /* We turn all deref_var into deref_cast and build a pointer value based on * the var binding which encodes the UAV id. */ nir_ssa_def *ptr = nir_imm_int64(b, (uint64_t)var->data.binding << 32); nir_deref_instr *deref_cast = nir_build_deref_cast(b, ptr, nir_var_mem_ssbo, deref->type, glsl_get_explicit_stride(var->type)); nir_ssa_def_rewrite_uses(&deref->dest.ssa, &deref_cast->dest.ssa); nir_instr_remove(&deref->instr); deref = deref_cast; return true; } return false; } bool dxil_nir_lower_deref_ssbo(nir_shader *nir) { bool progress = false; foreach_list_typed(nir_function, func, node, &nir->functions) { if (!func->is_entrypoint) continue; assert(func->impl); nir_builder b; nir_builder_init(&b, func->impl); nir_foreach_block(block, func->impl) { nir_foreach_instr_safe(instr, block) { if (instr->type != nir_instr_type_deref) continue; nir_deref_instr *deref = nir_instr_as_deref(instr); if (!nir_deref_mode_is(deref, nir_var_mem_ssbo) || (deref->deref_type != nir_deref_type_var && deref->deref_type != nir_deref_type_cast)) continue; progress |= lower_deref_ssbo(&b, deref); } } } return progress; } static bool lower_alu_deref_srcs(nir_builder *b, nir_alu_instr *alu) { const nir_op_info *info = &nir_op_infos[alu->op]; bool progress = false; b->cursor = nir_before_instr(&alu->instr); for (unsigned i = 0; i < info->num_inputs; i++) { nir_deref_instr *deref = nir_src_as_deref(alu->src[i].src); if (!deref) continue; nir_deref_path path; nir_deref_path_init(&path, deref, NULL); nir_deref_instr *root_deref = path.path[0]; nir_deref_path_finish(&path); if (root_deref->deref_type != nir_deref_type_cast) continue; nir_ssa_def *ptr = nir_iadd(b, root_deref->parent.ssa, nir_build_deref_offset(b, deref, cl_type_size_align)); nir_instr_rewrite_src(&alu->instr, &alu->src[i].src, nir_src_for_ssa(ptr)); progress = true; } return progress; } bool dxil_nir_opt_alu_deref_srcs(nir_shader 
*nir) { bool progress = false; foreach_list_typed(nir_function, func, node, &nir->functions) { if (!func->is_entrypoint) continue; assert(func->impl); nir_builder b; nir_builder_init(&b, func->impl); nir_foreach_block(block, func->impl) { nir_foreach_instr_safe(instr, block) { if (instr->type != nir_instr_type_alu) continue; nir_alu_instr *alu = nir_instr_as_alu(instr); progress |= lower_alu_deref_srcs(&b, alu); } } } return progress; } static nir_ssa_def * memcpy_load_deref_elem(nir_builder *b, nir_deref_instr *parent, nir_ssa_def *index) { nir_deref_instr *deref; index = nir_i2i(b, index, nir_dest_bit_size(parent->dest)); assert(parent->deref_type == nir_deref_type_cast); deref = nir_build_deref_ptr_as_array(b, parent, index); return nir_load_deref(b, deref); } static void memcpy_store_deref_elem(nir_builder *b, nir_deref_instr *parent, nir_ssa_def *index, nir_ssa_def *value) { nir_deref_instr *deref; index = nir_i2i(b, index, nir_dest_bit_size(parent->dest)); assert(parent->deref_type == nir_deref_type_cast); deref = nir_build_deref_ptr_as_array(b, parent, index); nir_store_deref(b, deref, value, 1); } static bool lower_memcpy_deref(nir_builder *b, nir_intrinsic_instr *intr) { nir_deref_instr *dst_deref = nir_src_as_deref(intr->src[0]); nir_deref_instr *src_deref = nir_src_as_deref(intr->src[1]); assert(intr->src[2].is_ssa); nir_ssa_def *num_bytes = intr->src[2].ssa; assert(dst_deref && src_deref); b->cursor = nir_after_instr(&intr->instr); dst_deref = nir_build_deref_cast(b, &dst_deref->dest.ssa, dst_deref->modes, glsl_uint8_t_type(), 1); src_deref = nir_build_deref_cast(b, &src_deref->dest.ssa, src_deref->modes, glsl_uint8_t_type(), 1); /* * We want to avoid 64b instructions, so let's assume we'll always be * passed a value that fits in a 32b type and truncate the 64b value. 
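* The loop below then copies one byte per iteration through the u8 deref * casts created above.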
*/ num_bytes = nir_u2u32(b, num_bytes); nir_variable *loop_index_var = nir_local_variable_create(b->impl, glsl_uint_type(), "loop_index"); nir_deref_instr *loop_index_deref = nir_build_deref_var(b, loop_index_var); nir_store_deref(b, loop_index_deref, nir_imm_int(b, 0), 1); nir_loop *loop = nir_push_loop(b); nir_ssa_def *loop_index = nir_load_deref(b, loop_index_deref); nir_ssa_def *cmp = nir_ige(b, loop_index, num_bytes); nir_if *loop_check = nir_push_if(b, cmp); nir_jump(b, nir_jump_break); nir_pop_if(b, loop_check); nir_ssa_def *val = memcpy_load_deref_elem(b, src_deref, loop_index); memcpy_store_deref_elem(b, dst_deref, loop_index, val); nir_store_deref(b, loop_index_deref, nir_iadd_imm(b, loop_index, 1), 1); nir_pop_loop(b, loop); nir_instr_remove(&intr->instr); return true; } bool dxil_nir_lower_memcpy_deref(nir_shader *nir) { bool progress = false; foreach_list_typed(nir_function, func, node, &nir->functions) { if (!func->is_entrypoint) continue; assert(func->impl); nir_builder b; nir_builder_init(&b, func->impl); nir_foreach_block(block, func->impl) { nir_foreach_instr_safe(instr, block) { if (instr->type != nir_instr_type_intrinsic) continue; nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); if (intr->intrinsic == nir_intrinsic_memcpy_deref) progress |= lower_memcpy_deref(&b, intr); } } } return progress; } static void cast_phi(nir_builder *b, nir_phi_instr *phi, unsigned new_bit_size) { nir_phi_instr *lowered = nir_phi_instr_create(b->shader); int num_components = 0; int old_bit_size = phi->dest.ssa.bit_size; nir_op upcast_op = nir_type_conversion_op(nir_type_uint | old_bit_size, nir_type_uint | new_bit_size, nir_rounding_mode_undef); nir_op downcast_op = nir_type_conversion_op(nir_type_uint | new_bit_size, nir_type_uint | old_bit_size, nir_rounding_mode_undef); nir_foreach_phi_src(src, phi) { assert(num_components == 0 || num_components == src->src.ssa->num_components); num_components = src->src.ssa->num_components; b->cursor = nir_after_instr_and_phis(src->src.ssa->parent_instr); nir_ssa_def *cast = nir_build_alu(b, upcast_op, src->src.ssa, NULL, NULL, NULL); nir_phi_instr_add_src(lowered, src->pred, nir_src_for_ssa(cast)); } nir_ssa_dest_init(&lowered->instr, &lowered->dest, num_components, new_bit_size, NULL); b->cursor = nir_before_instr(&phi->instr); nir_builder_instr_insert(b, &lowered->instr); b->cursor = nir_after_phis(nir_cursor_current_block(b->cursor)); nir_ssa_def *result = nir_build_alu(b, downcast_op, &lowered->dest.ssa, NULL, NULL, NULL); nir_ssa_def_rewrite_uses(&phi->dest.ssa, result); nir_instr_remove(&phi->instr); } static bool upcast_phi_impl(nir_function_impl *impl, unsigned min_bit_size) { nir_builder b; nir_builder_init(&b, impl); bool progress = false; nir_foreach_block_reverse(block, impl) { nir_foreach_instr_safe(instr, block) { if (instr->type != nir_instr_type_phi) continue; nir_phi_instr *phi = nir_instr_as_phi(instr); assert(phi->dest.is_ssa); if (phi->dest.ssa.bit_size == 1 || phi->dest.ssa.bit_size >= min_bit_size) continue; cast_phi(&b, phi, min_bit_size); progress = true; } } if (progress) { nir_metadata_preserve(impl, nir_metadata_block_index | nir_metadata_dominance); } else { nir_metadata_preserve(impl, nir_metadata_all); } return progress; } bool dxil_nir_lower_upcast_phis(nir_shader *shader, unsigned min_bit_size) { bool progress = false; nir_foreach_function(function, shader) { if (function->impl) progress |= upcast_phi_impl(function->impl, min_bit_size); } return progress; } struct dxil_nir_split_clip_cull_distance_params { 
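/* new_var is created lazily to hold the components that spill past the * first float4 and is added to shader the first time it is needed. */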
nir_variable *new_var; nir_shader *shader; }; /* In GLSL and SPIR-V, clip and cull distance are arrays of floats (with a limit of 8). * In DXIL, clip and cull distances are up to 2 float4s combined. * Coming from GLSL, we can request this 2 float4 format, but coming from SPIR-V, * we can't, and have to accept a "compact" array of scalar floats. * * To help emitting a valid input signature for this case, split the variables so that they * match what we need to put in the signature (e.g. { float clip[4]; float clip1; float cull[3]; }) */ static bool dxil_nir_split_clip_cull_distance_instr(nir_builder *b, nir_instr *instr, void *cb_data) { struct dxil_nir_split_clip_cull_distance_params *params = cb_data; nir_variable *new_var = params->new_var; if (instr->type != nir_instr_type_deref) return false; nir_deref_instr *deref = nir_instr_as_deref(instr); nir_variable *var = nir_deref_instr_get_variable(deref); if (!var || var->data.location < VARYING_SLOT_CLIP_DIST0 || var->data.location > VARYING_SLOT_CULL_DIST1 || !var->data.compact) return false; /* The location should only be inside clip distance, because clip * and cull should've been merged by nir_lower_clip_cull_distance_arrays() */ assert(var->data.location == VARYING_SLOT_CLIP_DIST0 || var->data.location == VARYING_SLOT_CLIP_DIST1); /* The deref chain to the clip/cull variables should be simple, just the * var and an array with a constant index, otherwise more lowering/optimization * might be needed before this pass, e.g. copy prop, lower_io_to_temporaries, * split_var_copies, and/or lower_var_copies. In the case of arrayed I/O like * inputs to the tessellation or geometry stages, there might be a second level * of array index. */ assert(deref->deref_type == nir_deref_type_var || deref->deref_type == nir_deref_type_array); b->cursor = nir_before_instr(instr); unsigned arrayed_io_length = 0; const struct glsl_type *old_type = var->type; if (nir_is_arrayed_io(var, b->shader->info.stage)) { arrayed_io_length = glsl_array_size(old_type); old_type = glsl_get_array_element(old_type); } if (!new_var) { /* Update lengths for new and old vars */ int old_length = glsl_array_size(old_type); int new_length = (old_length + var->data.location_frac) - 4; old_length -= new_length; /* The existing variable fits in the float4 */ if (new_length <= 0) return false; new_var = nir_variable_clone(var, params->shader); nir_shader_add_variable(params->shader, new_var); assert(glsl_get_base_type(glsl_get_array_element(old_type)) == GLSL_TYPE_FLOAT); var->type = glsl_array_type(glsl_float_type(), old_length, 0); new_var->type = glsl_array_type(glsl_float_type(), new_length, 0); if (arrayed_io_length) { var->type = glsl_array_type(var->type, arrayed_io_length, 0); new_var->type = glsl_array_type(new_var->type, arrayed_io_length, 0); } new_var->data.location++; new_var->data.location_frac = 0; params->new_var = new_var; } /* Update the type for derefs of the old var */ if (deref->deref_type == nir_deref_type_var) { deref->type = var->type; return false; } if (glsl_type_is_array(deref->type)) { assert(arrayed_io_length > 0); deref->type = glsl_get_array_element(var->type); return false; } assert(glsl_get_base_type(deref->type) == GLSL_TYPE_FLOAT); nir_const_value *index = nir_src_as_const_value(deref->arr.index); assert(index); /* Treat this array as a vector starting at the component index in location_frac, * so if location_frac is 1 and index is 0, then it's accessing the 'y' component * of the vector. 
If index + location_frac is >= 4, there's no component there, * so we need to add a new variable and adjust the index. */ unsigned total_index = index->u32 + var->data.location_frac; if (total_index < 4) return false; nir_deref_instr *new_var_deref = nir_build_deref_var(b, new_var); nir_deref_instr *new_intermediate_deref = new_var_deref; if (arrayed_io_length) { nir_deref_instr *parent = nir_src_as_deref(deref->parent); assert(parent->deref_type == nir_deref_type_array); new_intermediate_deref = nir_build_deref_array(b, new_intermediate_deref, parent->arr.index.ssa); } nir_deref_instr *new_array_deref = nir_build_deref_array(b, new_intermediate_deref, nir_imm_int(b, total_index % 4)); nir_ssa_def_rewrite_uses(&deref->dest.ssa, &new_array_deref->dest.ssa); return true; } bool dxil_nir_split_clip_cull_distance(nir_shader *shader) { struct dxil_nir_split_clip_cull_distance_params params = { .new_var = NULL, .shader = shader, }; nir_shader_instructions_pass(shader, dxil_nir_split_clip_cull_distance_instr, nir_metadata_block_index | nir_metadata_dominance | nir_metadata_loop_analysis, ¶ms); return params.new_var != NULL; } static bool dxil_nir_lower_double_math_instr(nir_builder *b, nir_instr *instr, UNUSED void *cb_data) { if (instr->type != nir_instr_type_alu) return false; nir_alu_instr *alu = nir_instr_as_alu(instr); /* TODO: See if we can apply this explicitly to packs/unpacks that are then * used as a double. As-is, if we had an app explicitly do a 64bit integer op, * then try to bitcast to double (not expressible in HLSL, but it is in other * source languages), this would unpack the integer and repack as a double, when * we probably want to just send the bitcast through to the backend. */ b->cursor = nir_before_instr(&alu->instr); bool progress = false; for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; ++i) { if (nir_alu_type_get_base_type(nir_op_infos[alu->op].input_types[i]) == nir_type_float && alu->src[i].src.ssa->bit_size == 64) { unsigned num_components = nir_op_infos[alu->op].input_sizes[i]; if (!num_components) num_components = alu->dest.dest.ssa.num_components; nir_ssa_def *components[NIR_MAX_VEC_COMPONENTS]; for (unsigned c = 0; c < num_components; ++c) { nir_ssa_def *packed_double = nir_channel(b, alu->src[i].src.ssa, alu->src[i].swizzle[c]); nir_ssa_def *unpacked_double = nir_unpack_64_2x32(b, packed_double); components[c] = nir_pack_double_2x32_dxil(b, unpacked_double); alu->src[i].swizzle[c] = c; } nir_instr_rewrite_src_ssa(instr, &alu->src[i].src, nir_vec(b, components, num_components)); progress = true; } } if (nir_alu_type_get_base_type(nir_op_infos[alu->op].output_type) == nir_type_float && alu->dest.dest.ssa.bit_size == 64) { b->cursor = nir_after_instr(&alu->instr); nir_ssa_def *components[NIR_MAX_VEC_COMPONENTS]; for (unsigned c = 0; c < alu->dest.dest.ssa.num_components; ++c) { nir_ssa_def *packed_double = nir_channel(b, &alu->dest.dest.ssa, c); nir_ssa_def *unpacked_double = nir_unpack_double_2x32_dxil(b, packed_double); components[c] = nir_pack_64_2x32(b, unpacked_double); } nir_ssa_def *repacked_dvec = nir_vec(b, components, alu->dest.dest.ssa.num_components); nir_ssa_def_rewrite_uses_after(&alu->dest.dest.ssa, repacked_dvec, repacked_dvec->parent_instr); progress = true; } return progress; } bool dxil_nir_lower_double_math(nir_shader *shader) { return nir_shader_instructions_pass(shader, dxil_nir_lower_double_math_instr, nir_metadata_block_index | nir_metadata_dominance | nir_metadata_loop_analysis, NULL); } typedef struct { gl_system_value *values; 
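/* number of entries in values */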
uint32_t count; } zero_system_values_state; static bool lower_system_value_to_zero_filter(const nir_instr* instr, const void* cb_state) { if (instr->type != nir_instr_type_intrinsic) { return false; } nir_intrinsic_instr* intrin = nir_instr_as_intrinsic(instr); /* All the intrinsics we care about are loads */ if (!nir_intrinsic_infos[intrin->intrinsic].has_dest) return false; assert(intrin->dest.is_ssa); zero_system_values_state* state = (zero_system_values_state*)cb_state; for (uint32_t i = 0; i < state->count; ++i) { gl_system_value value = state->values[i]; nir_intrinsic_op value_op = nir_intrinsic_from_system_value(value); if (intrin->intrinsic == value_op) { return true; } else if (intrin->intrinsic == nir_intrinsic_load_deref) { nir_deref_instr* deref = nir_src_as_deref(intrin->src[0]); if (!nir_deref_mode_is(deref, nir_var_system_value)) return false; nir_variable* var = deref->var; if (var->data.location == value) { return true; } } } return false; } static nir_ssa_def* lower_system_value_to_zero_instr(nir_builder* b, nir_instr* instr, void* _state) { return nir_imm_int(b, 0); } bool dxil_nir_lower_system_values_to_zero(nir_shader* shader, gl_system_value* system_values, uint32_t count) { zero_system_values_state state = { system_values, count }; return nir_shader_lower_instructions(shader, lower_system_value_to_zero_filter, lower_system_value_to_zero_instr, &state); } static void lower_load_local_group_size(nir_builder *b, nir_intrinsic_instr *intr) { b->cursor = nir_after_instr(&intr->instr); nir_const_value v[3] = { nir_const_value_for_int(b->shader->info.workgroup_size[0], 32), nir_const_value_for_int(b->shader->info.workgroup_size[1], 32), nir_const_value_for_int(b->shader->info.workgroup_size[2], 32) }; nir_ssa_def *size = nir_build_imm(b, 3, 32, v); nir_ssa_def_rewrite_uses(&intr->dest.ssa, size); nir_instr_remove(&intr->instr); } static bool lower_system_values_impl(nir_builder *b, nir_instr *instr, void *_state) { if (instr->type != nir_instr_type_intrinsic) return false; nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); switch (intr->intrinsic) { case nir_intrinsic_load_workgroup_size: lower_load_local_group_size(b, intr); return true; default: return false; } } bool dxil_nir_lower_system_values(nir_shader *shader) { return nir_shader_instructions_pass(shader, lower_system_values_impl, nir_metadata_block_index | nir_metadata_dominance | nir_metadata_loop_analysis, NULL); } static const struct glsl_type * get_bare_samplers_for_type(const struct glsl_type *type, bool is_shadow) { const struct glsl_type *base_sampler_type = is_shadow ? glsl_bare_shadow_sampler_type() : glsl_bare_sampler_type(); return glsl_type_wrap_in_arrays(base_sampler_type, type); } static const struct glsl_type * get_textures_for_sampler_type(const struct glsl_type *type) { return glsl_type_wrap_in_arrays( glsl_sampler_type_to_texture( glsl_without_array(type)), type); } static bool redirect_sampler_derefs(struct nir_builder *b, nir_instr *instr, void *data) { if (instr->type != nir_instr_type_tex) return false; nir_tex_instr *tex = nir_instr_as_tex(instr); int sampler_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_deref); if (sampler_idx == -1) { /* No sampler deref - does this instruction even need a sampler? If not, * sampler_index doesn't necessarily point to a sampler, so early-out. 
*/ if (!nir_tex_instr_need_sampler(tex)) return false; /* No derefs but needs a sampler, must be using indices */ nir_variable *bare_sampler = _mesa_hash_table_u64_search(data, tex->sampler_index); /* Already have a bare sampler here */ if (bare_sampler) return false; nir_variable *old_sampler = NULL; nir_foreach_variable_with_modes(var, b->shader, nir_var_uniform) { if (var->data.binding <= tex->sampler_index && var->data.binding + glsl_type_get_sampler_count(var->type) > tex->sampler_index) { /* Already have a bare sampler for this binding and it is of the * correct type, add it to the table */ if (glsl_type_is_bare_sampler(glsl_without_array(var->type)) && glsl_sampler_type_is_shadow(glsl_without_array(var->type)) == tex->is_shadow) { _mesa_hash_table_u64_insert(data, tex->sampler_index, var); return false; } old_sampler = var; } } assert(old_sampler); /* Clone the original sampler to a bare sampler of the correct type */ bare_sampler = nir_variable_clone(old_sampler, b->shader); nir_shader_add_variable(b->shader, bare_sampler); bare_sampler->type = get_bare_samplers_for_type(old_sampler->type, tex->is_shadow); _mesa_hash_table_u64_insert(data, tex->sampler_index, bare_sampler); return true; } /* Using derefs, means we have to rewrite the deref chain in addition to cloning */ nir_deref_instr *final_deref = nir_src_as_deref(tex->src[sampler_idx].src); nir_deref_path path; nir_deref_path_init(&path, final_deref, NULL); nir_deref_instr *old_tail = path.path[0]; assert(old_tail->deref_type == nir_deref_type_var); nir_variable *old_var = old_tail->var; if (glsl_type_is_bare_sampler(glsl_without_array(old_var->type)) && glsl_sampler_type_is_shadow(glsl_without_array(old_var->type)) == tex->is_shadow) { nir_deref_path_finish(&path); return false; } uint64_t var_key = ((uint64_t)old_var->data.descriptor_set << 32) | old_var->data.binding; nir_variable *new_var = _mesa_hash_table_u64_search(data, var_key); if (!new_var) { new_var = nir_variable_clone(old_var, b->shader); nir_shader_add_variable(b->shader, new_var); new_var->type = get_bare_samplers_for_type(old_var->type, tex->is_shadow); _mesa_hash_table_u64_insert(data, var_key, new_var); } b->cursor = nir_after_instr(&old_tail->instr); nir_deref_instr *new_tail = nir_build_deref_var(b, new_var); for (unsigned i = 1; path.path[i]; ++i) { b->cursor = nir_after_instr(&path.path[i]->instr); new_tail = nir_build_deref_follower(b, new_tail, path.path[i]); } nir_deref_path_finish(&path); nir_instr_rewrite_src_ssa(&tex->instr, &tex->src[sampler_idx].src, &new_tail->dest.ssa); return true; } static bool redirect_texture_derefs(struct nir_builder *b, nir_instr *instr, void *data) { if (instr->type != nir_instr_type_tex) return false; nir_tex_instr *tex = nir_instr_as_tex(instr); int texture_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_deref); if (texture_idx == -1) { /* No derefs, must be using indices */ nir_variable *bare_sampler = _mesa_hash_table_u64_search(data, tex->texture_index); /* Already have a texture here */ if (bare_sampler) return false; nir_variable *typed_sampler = NULL; nir_foreach_variable_with_modes(var, b->shader, nir_var_uniform) { if (var->data.binding <= tex->texture_index && var->data.binding + glsl_type_get_texture_count(var->type) > tex->texture_index) { /* Already have a texture for this binding, add it to the table */ _mesa_hash_table_u64_insert(data, tex->texture_index, var); return false; } if (var->data.binding <= tex->texture_index && var->data.binding + glsl_type_get_sampler_count(var->type) > 
tex->texture_index && !glsl_type_is_bare_sampler(glsl_without_array(var->type))) { typed_sampler = var; } } /* Clone the typed sampler to a texture and we're done */ assert(typed_sampler); bare_sampler = nir_variable_clone(typed_sampler, b->shader); bare_sampler->type = get_textures_for_sampler_type(typed_sampler->type); nir_shader_add_variable(b->shader, bare_sampler); _mesa_hash_table_u64_insert(data, tex->texture_index, bare_sampler); return true; } /* Using derefs, means we have to rewrite the deref chain in addition to cloning */ nir_deref_instr *final_deref = nir_src_as_deref(tex->src[texture_idx].src); nir_deref_path path; nir_deref_path_init(&path, final_deref, NULL); nir_deref_instr *old_tail = path.path[0]; assert(old_tail->deref_type == nir_deref_type_var); nir_variable *old_var = old_tail->var; if (glsl_type_is_texture(glsl_without_array(old_var->type)) || glsl_type_is_image(glsl_without_array(old_var->type))) { nir_deref_path_finish(&path); return false; } uint64_t var_key = ((uint64_t)old_var->data.descriptor_set << 32) | old_var->data.binding; nir_variable *new_var = _mesa_hash_table_u64_search(data, var_key); if (!new_var) { new_var = nir_variable_clone(old_var, b->shader); new_var->type = get_textures_for_sampler_type(old_var->type); nir_shader_add_variable(b->shader, new_var); _mesa_hash_table_u64_insert(data, var_key, new_var); } b->cursor = nir_after_instr(&old_tail->instr); nir_deref_instr *new_tail = nir_build_deref_var(b, new_var); for (unsigned i = 1; path.path[i]; ++i) { b->cursor = nir_after_instr(&path.path[i]->instr); new_tail = nir_build_deref_follower(b, new_tail, path.path[i]); } nir_deref_path_finish(&path); nir_instr_rewrite_src_ssa(&tex->instr, &tex->src[texture_idx].src, &new_tail->dest.ssa); return true; } bool dxil_nir_split_typed_samplers(nir_shader *nir) { struct hash_table_u64 *hash_table = _mesa_hash_table_u64_create(NULL); bool progress = nir_shader_instructions_pass(nir, redirect_sampler_derefs, nir_metadata_block_index | nir_metadata_dominance | nir_metadata_loop_analysis, hash_table); _mesa_hash_table_u64_clear(hash_table); progress |= nir_shader_instructions_pass(nir, redirect_texture_derefs, nir_metadata_block_index | nir_metadata_dominance | nir_metadata_loop_analysis, hash_table); _mesa_hash_table_u64_destroy(hash_table); return progress; } static bool lower_bool_input_filter(const nir_instr *instr, UNUSED const void *_options) { if (instr->type != nir_instr_type_intrinsic) return false; nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); if (intr->intrinsic == nir_intrinsic_load_front_face) return true; if (intr->intrinsic == nir_intrinsic_load_deref) { nir_deref_instr *deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr); nir_variable *var = nir_deref_instr_get_variable(deref); return var->data.mode == nir_var_shader_in && glsl_get_base_type(var->type) == GLSL_TYPE_BOOL; } return false; } static nir_ssa_def * lower_bool_input_impl(nir_builder *b, nir_instr *instr, UNUSED void *_options) { nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); if (intr->intrinsic == nir_intrinsic_load_deref) { nir_deref_instr *deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr); nir_variable *var = nir_deref_instr_get_variable(deref); /* rewrite var->type */ var->type = glsl_vector_type(GLSL_TYPE_UINT, glsl_get_vector_elements(var->type)); deref->type = var->type; } intr->dest.ssa.bit_size = 32; return nir_i2b1(b, &intr->dest.ssa); } bool dxil_nir_lower_bool_input(struct nir_shader *s) { return nir_shader_lower_instructions(s, 
lower_bool_input_filter, lower_bool_input_impl, NULL); } static bool lower_sysval_to_load_input_impl(nir_builder *b, nir_instr *instr, void *data) { if (instr->type != nir_instr_type_intrinsic) return false; nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); gl_system_value sysval = SYSTEM_VALUE_MAX; switch (intr->intrinsic) { case nir_intrinsic_load_front_face: sysval = SYSTEM_VALUE_FRONT_FACE; break; case nir_intrinsic_load_instance_id: sysval = SYSTEM_VALUE_INSTANCE_ID; break; case nir_intrinsic_load_vertex_id_zero_base: sysval = SYSTEM_VALUE_VERTEX_ID_ZERO_BASE; break; default: return false; } nir_variable **sysval_vars = (nir_variable **)data; nir_variable *var = sysval_vars[sysval]; assert(var); b->cursor = nir_before_instr(instr); nir_ssa_def *result = nir_build_load_input(b, intr->dest.ssa.num_components, intr->dest.ssa.bit_size, nir_imm_int(b, 0), .base = var->data.driver_location, .dest_type = nir_get_nir_type_for_glsl_type(var->type)); nir_ssa_def_rewrite_uses(&intr->dest.ssa, result); return true; } bool dxil_nir_lower_sysval_to_load_input(nir_shader *s, nir_variable **sysval_vars) { return nir_shader_instructions_pass(s, lower_sysval_to_load_input_impl, nir_metadata_block_index | nir_metadata_dominance, sysval_vars); } /* Comparison function to sort io values so that first come normal varyings, * then system values, and then system generated values. */ static int variable_location_cmp(const nir_variable* a, const nir_variable* b) { // Sort by stream, driver_location, location, location_frac, then index unsigned a_location = a->data.location; if (a_location >= VARYING_SLOT_PATCH0) a_location -= VARYING_SLOT_PATCH0; unsigned b_location = b->data.location; if (b_location >= VARYING_SLOT_PATCH0) b_location -= VARYING_SLOT_PATCH0; unsigned a_stream = a->data.stream & ~NIR_STREAM_PACKED; unsigned b_stream = b->data.stream & ~NIR_STREAM_PACKED; return a_stream != b_stream ? a_stream - b_stream : a->data.driver_location != b->data.driver_location ? a->data.driver_location - b->data.driver_location : a_location != b_location ? a_location - b_location : a->data.location_frac != b->data.location_frac ? a->data.location_frac - b->data.location_frac : a->data.index - b->data.index; } /* Order varyings according to driver location */ uint64_t dxil_sort_by_driver_location(nir_shader* s, nir_variable_mode modes) { nir_sort_variables_with_modes(s, variable_location_cmp, modes); uint64_t result = 0; nir_foreach_variable_with_modes(var, s, modes) { result |= 1ull << var->data.location; } return result; } /* Sort PS outputs so that color outputs come first */ void dxil_sort_ps_outputs(nir_shader* s) { nir_foreach_variable_with_modes_safe(var, s, nir_var_shader_out) { /* We use the driver_location here to avoid introducing a new * struct or member variable here. The true, updated driver location * will be written below, after sorting */ switch (var->data.location) { case FRAG_RESULT_DEPTH: var->data.driver_location = 1; break; case FRAG_RESULT_STENCIL: var->data.driver_location = 2; break; case FRAG_RESULT_SAMPLE_MASK: var->data.driver_location = 3; break; default: var->data.driver_location = 0; } } nir_sort_variables_with_modes(s, variable_location_cmp, nir_var_shader_out); unsigned driver_loc = 0; nir_foreach_variable_with_modes(var, s, nir_var_shader_out) { var->data.driver_location = driver_loc++; } } /* Order between stage values so that normal varyings come first, * then sysvalues and then system generated values. 
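* The ordering key comes from nir_var_to_dxil_sysvalue_type() and is stashed * in driver_location before sorting; the real driver locations are assigned * afterwards, with patch and non-patch variables numbered separately.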
*/ uint64_t dxil_reassign_driver_locations(nir_shader* s, nir_variable_mode modes, uint64_t other_stage_mask) { nir_foreach_variable_with_modes_safe(var, s, modes) { /* We use the driver_location here to avoid introducing a new * struct or member variable here. The true, updated driver location * will be written below, after sorting */ var->data.driver_location = nir_var_to_dxil_sysvalue_type(var, other_stage_mask); } nir_sort_variables_with_modes(s, variable_location_cmp, modes); uint64_t result = 0; unsigned driver_loc = 0, driver_patch_loc = 0; nir_foreach_variable_with_modes(var, s, modes) { if (var->data.location < 64) result |= 1ull << var->data.location; /* Overlap patches with non-patch */ var->data.driver_location = var->data.patch ? driver_patch_loc++ : driver_loc++; } return result; } static bool lower_ubo_array_one_to_static(struct nir_builder *b, nir_instr *inst, void *cb_data) { if (inst->type != nir_instr_type_intrinsic) return false; nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(inst); if (intrin->intrinsic != nir_intrinsic_load_vulkan_descriptor) return false; nir_variable *var = nir_get_binding_variable(b->shader, nir_chase_binding(intrin->src[0])); if (!var) return false; if (!glsl_type_is_array(var->type) || glsl_array_size(var->type) != 1) return false; nir_intrinsic_instr *index = nir_src_as_intrinsic(intrin->src[0]); /* We currently do not support reindex */ assert(index && index->intrinsic == nir_intrinsic_vulkan_resource_index); if (nir_src_is_const(index->src[0]) && nir_src_as_uint(index->src[0]) == 0) return false; if (nir_intrinsic_desc_type(index) != VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) return false; b->cursor = nir_instr_remove(&index->instr); // Indexing out of bounds on array of UBOs is considered undefined // behavior. Therefore, we just hardcode all the index to 0. uint8_t bit_size = index->dest.ssa.bit_size; nir_ssa_def *zero = nir_imm_intN_t(b, 0, bit_size); nir_ssa_def *dest = nir_vulkan_resource_index(b, index->num_components, bit_size, zero, .desc_set = nir_intrinsic_desc_set(index), .binding = nir_intrinsic_binding(index), .desc_type = nir_intrinsic_desc_type(index)); nir_ssa_def_rewrite_uses(&index->dest.ssa, dest); return true; } bool dxil_nir_lower_ubo_array_one_to_static(nir_shader *s) { bool progress = nir_shader_instructions_pass( s, lower_ubo_array_one_to_static, nir_metadata_none, NULL); return progress; } static bool is_fquantize2f16(const nir_instr *instr, const void *data) { if (instr->type != nir_instr_type_alu) return false; nir_alu_instr *alu = nir_instr_as_alu(instr); return alu->op == nir_op_fquantize2f16; } static nir_ssa_def * lower_fquantize2f16(struct nir_builder *b, nir_instr *instr, void *data) { /* * SpvOpQuantizeToF16 documentation says: * * " * If Value is an infinity, the result is the same infinity. * If Value is a NaN, the result is a NaN, but not necessarily the same NaN. * If Value is positive with a magnitude too large to represent as a 16-bit * floating-point value, the result is positive infinity. If Value is negative * with a magnitude too large to represent as a 16-bit floating-point value, * the result is negative infinity. If the magnitude of Value is too small to * represent as a normalized 16-bit floating-point value, the result may be * either +0 or -0. 
* " * * which we turn into: * * if (val < MIN_FLOAT16) * return -INFINITY; * else if (val > MAX_FLOAT16) * return -INFINITY; * else if (fabs(val) < SMALLEST_NORMALIZED_FLOAT16 && sign(val) != 0) * return -0.0f; * else if (fabs(val) < SMALLEST_NORMALIZED_FLOAT16 && sign(val) == 0) * return +0.0f; * else * return round(val); */ nir_alu_instr *alu = nir_instr_as_alu(instr); nir_ssa_def *src = nir_ssa_for_src(b, alu->src[0].src, nir_src_num_components(alu->src[0].src)); nir_ssa_def *neg_inf_cond = nir_flt(b, src, nir_imm_float(b, -65504.0f)); nir_ssa_def *pos_inf_cond = nir_flt(b, nir_imm_float(b, 65504.0f), src); nir_ssa_def *zero_cond = nir_flt(b, nir_fabs(b, src), nir_imm_float(b, ldexpf(1.0, -14))); nir_ssa_def *zero = nir_iand_imm(b, src, 1 << 31); nir_ssa_def *round = nir_iand_imm(b, src, ~BITFIELD_MASK(13)); nir_ssa_def *res = nir_bcsel(b, neg_inf_cond, nir_imm_float(b, -INFINITY), round); res = nir_bcsel(b, pos_inf_cond, nir_imm_float(b, INFINITY), res); res = nir_bcsel(b, zero_cond, zero, res); return res; } bool dxil_nir_lower_fquantize2f16(nir_shader *s) { return nir_shader_lower_instructions(s, is_fquantize2f16, lower_fquantize2f16, NULL); } static bool fix_io_uint_deref_types(struct nir_builder *builder, nir_instr *instr, void *data) { if (instr->type != nir_instr_type_deref) return false; nir_deref_instr *deref = nir_instr_as_deref(instr); nir_variable *var = deref->deref_type == nir_deref_type_var ? deref->var : NULL; if (var == data) { deref->type = var->type; return true; } return false; } static bool fix_io_uint_type(nir_shader *s, nir_variable_mode modes, int slot) { nir_variable *fixed_var = NULL; nir_foreach_variable_with_modes(var, s, modes) { if (var->data.location == slot) { if (var->type == glsl_uint_type()) return false; assert(var->type == glsl_int_type()); var->type = glsl_uint_type(); fixed_var = var; break; } } assert(fixed_var); return nir_shader_instructions_pass(s, fix_io_uint_deref_types, nir_metadata_all, fixed_var); } bool dxil_nir_fix_io_uint_type(nir_shader *s, uint64_t in_mask, uint64_t out_mask) { if (!(s->info.outputs_written & out_mask) && !(s->info.inputs_read & in_mask)) return false; bool progress = false; while (in_mask) { int slot = u_bit_scan64(&in_mask); progress |= (s->info.inputs_read & (1ull << slot)) && fix_io_uint_type(s, nir_var_shader_in, slot); } while (out_mask) { int slot = u_bit_scan64(&out_mask); progress |= (s->info.outputs_written & (1ull << slot)) && fix_io_uint_type(s, nir_var_shader_out, slot); } return progress; } struct remove_after_discard_state { struct nir_block *active_block; }; static bool remove_after_discard(struct nir_builder *builder, nir_instr *instr, void *cb_data) { struct remove_after_discard_state *state = cb_data; if (instr->block == state->active_block) { nir_instr_remove_v(instr); return true; } if (instr->type != nir_instr_type_intrinsic) return false; nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); if (intr->intrinsic != nir_intrinsic_discard && intr->intrinsic != nir_intrinsic_terminate && intr->intrinsic != nir_intrinsic_discard_if && intr->intrinsic != nir_intrinsic_terminate_if) return false; state->active_block = instr->block; return false; } static bool lower_kill(struct nir_builder *builder, nir_instr *instr, void *_cb_data) { if (instr->type != nir_instr_type_intrinsic) return false; nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); if (intr->intrinsic != nir_intrinsic_discard && intr->intrinsic != nir_intrinsic_terminate && intr->intrinsic != nir_intrinsic_discard_if && 
intr->intrinsic != nir_intrinsic_terminate_if) return false; builder->cursor = nir_instr_remove(instr); if (intr->intrinsic == nir_intrinsic_discard || intr->intrinsic == nir_intrinsic_terminate) { nir_demote(builder); } else { assert(intr->src[0].is_ssa); nir_demote_if(builder, intr->src[0].ssa); } nir_jump(builder, nir_jump_return); return true; } bool dxil_nir_lower_discard_and_terminate(nir_shader *s) { if (s->info.stage != MESA_SHADER_FRAGMENT) return false; // This pass only works if all functions have been inlined assert(exec_list_length(&s->functions) == 1); struct remove_after_discard_state state; state.active_block = NULL; nir_shader_instructions_pass(s, remove_after_discard, nir_metadata_none, &state); return nir_shader_instructions_pass(s, lower_kill, nir_metadata_none, NULL); } static bool update_writes(struct nir_builder *b, nir_instr *instr, void *_state) { if (instr->type != nir_instr_type_intrinsic) return false; nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); if (intr->intrinsic != nir_intrinsic_store_output) return false; nir_io_semantics io = nir_intrinsic_io_semantics(intr); if (io.location != VARYING_SLOT_POS) return false; nir_ssa_def *src = intr->src[0].ssa; unsigned write_mask = nir_intrinsic_write_mask(intr); if (src->num_components == 4 && write_mask == 0xf) return false; b->cursor = nir_before_instr(instr); unsigned first_comp = nir_intrinsic_component(intr); nir_ssa_def *channels[4] = { NULL, NULL, NULL, NULL }; assert(first_comp + src->num_components <= ARRAY_SIZE(channels)); for (unsigned i = 0; i < src->num_components; ++i) if (write_mask & (1 << i)) channels[i + first_comp] = nir_channel(b, src, i); for (unsigned i = 0; i < 4; ++i) if (!channels[i]) channels[i] = nir_imm_intN_t(b, 0, src->bit_size); nir_instr_rewrite_src_ssa(instr, &intr->src[0], nir_vec(b, channels, 4)); nir_intrinsic_set_component(intr, 0); nir_intrinsic_set_write_mask(intr, 0xf); return true; } bool dxil_nir_ensure_position_writes(nir_shader *s) { if (s->info.stage != MESA_SHADER_VERTEX && s->info.stage != MESA_SHADER_GEOMETRY && s->info.stage != MESA_SHADER_TESS_EVAL) return false; if ((s->info.outputs_written & VARYING_BIT_POS) == 0) return false; return nir_shader_instructions_pass(s, update_writes, nir_metadata_block_index | nir_metadata_dominance, NULL); }
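/* Example (illustrative sketch only, not part of this file): one possible
 * caller-side ordering of the entry points above, assuming the caller uses
 * Mesa's NIR_PASS() helper. The real set and order of passes is decided by
 * the driver; this is only meant to show how the passes compose.
 */
#if 0
static void
example_run_dxil_lowerings(nir_shader *s)
{
   bool progress = false;
   NIR_PASS(progress, s, dxil_nir_lower_ubo_to_temp);
   NIR_PASS(progress, s, dxil_nir_lower_deref_ssbo);
   NIR_PASS(progress, s, dxil_nir_lower_loads_stores_to_dxil);
   NIR_PASS(progress, s, dxil_nir_lower_atomics_to_dxil);
   NIR_PASS(progress, s, dxil_nir_lower_double_math);
   NIR_PASS(progress, s, dxil_nir_lower_system_values);
   NIR_PASS(progress, s, dxil_nir_lower_upcast_phis, 16 /* min_bit_size */);
}
#endif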