diff --git a/meson.build b/meson.build index 614c9877072..e25f9fcde39 100644 --- a/meson.build +++ b/meson.build @@ -299,6 +299,26 @@ if with_aco_tests and not with_amd_vk error('ACO tests require Radv') endif +_microsoft_clc = get_option('microsoft-clc') +if _microsoft_clc == 'auto' + with_microsoft_clc = false +else + with_microsoft_clc = _microsoft_clc == 'true' +endif + +if with_microsoft_clc + with_clc = true + dep_clang = dependency( + 'clang', + method: 'cmake', + static: true, + modules: [ + 'clangBasic', 'clangCodeGen', 'clangDriver', 'clangFrontend', 'clangFrontendTool', + 'clangHandleCXX', 'clangHandleLLVM', + ], + ) +endif + if host_machine.system() == 'darwin' with_dri_platform = 'apple' pre_args += '-DBUILDING_MESA' @@ -1470,8 +1490,13 @@ if with_gallium_opencl 'lto', 'option', 'objcarcopts', 'profiledata', ] endif +if with_microsoft_clc + llvm_modules += ['target', 'linker', 'irreader', 'option', 'libdriver'] +endif -if with_amd_vk or with_gallium_radeonsi or with_gallium_opencl +if with_microsoft_clc + _llvm_version = '>= 10.0.0' +elif with_amd_vk or with_gallium_radeonsi or with_gallium_opencl _llvm_version = '>= 8.0.0' elif with_gallium_swr _llvm_version = '>= 6.0.0' @@ -1521,7 +1546,7 @@ if _llvm != 'disabled' optional_modules : llvm_optional_modules, required : ( with_amd_vk or with_gallium_radeonsi or with_gallium_swr or - with_gallium_opencl or _llvm == 'enabled' + with_gallium_opencl or with_microsoft_clc or _llvm == 'enabled' ), static : not _shared_llvm, method : _llvm_method, @@ -1564,9 +1589,11 @@ elif with_amd_vk or with_gallium_radeonsi or with_gallium_swr error('The following drivers require LLVM: Radv, RadeonSI, SWR. One of these is enabled, but LLVM is disabled.') elif with_gallium_opencl error('The OpenCL "Clover" state tracker requires LLVM, but LLVM is disabled.') +elif with_microsoft_clc + error('The Microsoft CLC compiler requires LLVM, but LLVM is disabled.') endif -with_opencl_spirv = _opencl != 'disabled' and get_option('opencl-spirv') +with_opencl_spirv = (_opencl != 'disabled' and get_option('opencl-spirv')) or with_microsoft_clc if with_opencl_spirv chosen_llvm_version_array = dep_llvm.version().split('.') chosen_llvm_version_major = chosen_llvm_version_array[0].to_int() diff --git a/meson_options.txt b/meson_options.txt index 7db6907857a..7637c4cbb73 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -261,6 +261,13 @@ option( value : false, description : 'Enable GLVND support.' ) +option( + 'microsoft-clc', + type : 'combo', + value : 'auto', + choices : ['auto', 'true', 'false'], + description : 'Build support for the Microsoft CLC to DXIL compiler' +) option( 'glx-read-only-text', type : 'boolean', diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py index d21c13ed978..aee01cd6a8f 100644 --- a/src/compiler/nir/nir_intrinsics.py +++ b/src/compiler/nir/nir_intrinsics.py @@ -946,9 +946,45 @@ load("global_ir3", [2, 1], indices=[ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN intrinsic("bindless_resource_ir3", [1], dest_comp=1, indices=[DESC_SET], flags=[CAN_ELIMINATE, CAN_REORDER]) # DXIL specific intrinsics +# src[] = { value, mask, index, offset }. +intrinsic("store_ssbo_masked_dxil", [1, 1, 1, 1]) +# src[] = { value, index }. +intrinsic("store_shared_dxil", [1, 1]) +# src[] = { value, mask, index }. +intrinsic("store_shared_masked_dxil", [1, 1, 1]) +# src[] = { value, index }. +intrinsic("store_scratch_dxil", [1, 1]) +# src[] = { index }. +load("shared_dxil", [1], [], [CAN_ELIMINATE]) +# src[] = { index }. 
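+# (scratch_dxil below mirrors shared_dxil above: both address a flat i32 array by index.)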
+load("scratch_dxil", [1], [], [CAN_ELIMINATE]) +# src[] = { deref_var, offset } +load("ptr_dxil", [1, 1], [], []) # src[] = { index, 16-byte-based-offset } load("ubo_dxil", [1, 1], [], [CAN_ELIMINATE]) +# DXIL Shared atomic intrinsics +# +# All of the shared variable atomic memory operations read a value from +# memory, compute a new value using one of the operations below, write the +# new value to memory, and return the original value read. +# +# All operations take 2 sources: +# +# 0: The index in the i32 array for by the shared memory region +# 1: The data parameter to the atomic function (i.e. the value to add +# in shared_atomic_add, etc). +intrinsic("shared_atomic_add_dxil", src_comp=[1, 1], dest_comp=1) +intrinsic("shared_atomic_imin_dxil", src_comp=[1, 1], dest_comp=1) +intrinsic("shared_atomic_umin_dxil", src_comp=[1, 1], dest_comp=1) +intrinsic("shared_atomic_imax_dxil", src_comp=[1, 1], dest_comp=1) +intrinsic("shared_atomic_umax_dxil", src_comp=[1, 1], dest_comp=1) +intrinsic("shared_atomic_and_dxil", src_comp=[1, 1], dest_comp=1) +intrinsic("shared_atomic_or_dxil", src_comp=[1, 1], dest_comp=1) +intrinsic("shared_atomic_xor_dxil", src_comp=[1, 1], dest_comp=1) +intrinsic("shared_atomic_exchange_dxil", src_comp=[1, 1], dest_comp=1) +intrinsic("shared_atomic_comp_swap_dxil", src_comp=[1, 1, 1], dest_comp=1) + # Intrinsics used by the Midgard/Bifrost blend pipeline. These are defined # within a blend shader to read/write the raw value from the tile buffer, # without applying any format conversion in the process. If the shader needs diff --git a/src/meson.build b/src/meson.build index b5f4933abca..7deb202207a 100644 --- a/src/meson.build +++ b/src/meson.build @@ -91,7 +91,7 @@ endif if with_any_intel subdir('intel') endif -if with_gallium_d3d12 +if with_microsoft_clc or with_gallium_d3d12 subdir('microsoft') endif subdir('mesa') diff --git a/src/microsoft/clc/clc_compiler.c b/src/microsoft/clc/clc_compiler.c new file mode 100644 index 00000000000..dc841865132 --- /dev/null +++ b/src/microsoft/clc/clc_compiler.c @@ -0,0 +1,1447 @@ +/* + * Copyright © Microsoft Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "nir.h" +#include "nir_serialize.h" +#include "glsl_types.h" +#include "nir_types.h" +#include "clc_compiler.h" +#include "clc_helpers.h" +#include "clc_nir.h" +#include "../compiler/dxil_nir.h" +#include "../compiler/dxil_nir_lower_int_samplers.h" +#include "../compiler/nir_to_dxil.h" + +#include "util/u_debug.h" +#include +#include "spirv/nir_spirv.h" +#include "nir_builder.h" +#include "nir_builtin_builder.h" + +#include "git_sha1.h" + +enum clc_debug_flags { + CLC_DEBUG_DUMP_SPIRV = 1 << 0, + CLC_DEBUG_VERBOSE = 1 << 1, +}; + +static const struct debug_named_value debug_options[] = { + { "dump_spirv", CLC_DEBUG_DUMP_SPIRV, "Dump spirv blobs" }, + { "verbose", CLC_DEBUG_VERBOSE, NULL }, + DEBUG_NAMED_VALUE_END +}; + +DEBUG_GET_ONCE_FLAGS_OPTION(debug_clc, "CLC_DEBUG", debug_options, 0) + +static void +clc_print_kernels_info(const struct clc_object *obj) +{ + fprintf(stdout, "Kernels:\n"); + for (unsigned i = 0; i < obj->num_kernels; i++) { + const struct clc_kernel_arg *args = obj->kernels[i].args; + bool first = true; + + fprintf(stdout, "\tvoid %s(", obj->kernels[i].name); + for (unsigned j = 0; j < obj->kernels[i].num_args; j++) { + if (!first) + fprintf(stdout, ", "); + else + first = false; + + switch (args[j].address_qualifier) { + case CLC_KERNEL_ARG_ADDRESS_GLOBAL: + fprintf(stdout, "__global "); + break; + case CLC_KERNEL_ARG_ADDRESS_LOCAL: + fprintf(stdout, "__local "); + break; + case CLC_KERNEL_ARG_ADDRESS_CONSTANT: + fprintf(stdout, "__constant "); + break; + default: + break; + } + + if (args[j].type_qualifier & CLC_KERNEL_ARG_TYPE_VOLATILE) + fprintf(stdout, "volatile "); + if (args[j].type_qualifier & CLC_KERNEL_ARG_TYPE_CONST) + fprintf(stdout, "const "); + if (args[j].type_qualifier & CLC_KERNEL_ARG_TYPE_RESTRICT) + fprintf(stdout, "restrict "); + + fprintf(stdout, "%s %s", args[j].type_name, args[j].name); + } + fprintf(stdout, ");\n"); + } +} + +struct clc_image_lower_context +{ + struct clc_dxil_metadata *metadata; + unsigned *num_srvs; + unsigned *num_uavs; + nir_deref_instr *deref; + unsigned num_buf_ids; + int metadata_index; +}; + +static int +lower_image_deref_impl(nir_builder *b, struct clc_image_lower_context *context, + const struct glsl_type *new_var_type, + unsigned *num_bindings) +{ + nir_variable *in_var = nir_deref_instr_get_variable(context->deref); + nir_variable *uniform = nir_variable_create(b->shader, nir_var_uniform, new_var_type, NULL); + uniform->data.access = in_var->data.access; + uniform->data.binding = in_var->data.binding; + if (context->num_buf_ids > 0) { + // Need to assign a new binding + context->metadata->args[context->metadata_index]. 
+ image.buf_ids[context->num_buf_ids] = uniform->data.binding = (*num_bindings)++; + } + context->num_buf_ids++; + return uniform->data.binding; +} + +static int +lower_read_only_image_deref(nir_builder *b, struct clc_image_lower_context *context, + nir_alu_type image_type) +{ + nir_variable *in_var = nir_deref_instr_get_variable(context->deref); + + // Non-writeable images should be converted to samplers, + // since they may have texture operations done on them + const struct glsl_type *new_var_type = + glsl_sampler_type(glsl_get_sampler_dim(in_var->type), + false, glsl_sampler_type_is_array(in_var->type), + nir_get_glsl_base_type_for_nir_type(image_type | 32)); + return lower_image_deref_impl(b, context, new_var_type, context->num_srvs); +} + +static int +lower_read_write_image_deref(nir_builder *b, struct clc_image_lower_context *context, + nir_alu_type image_type) +{ + nir_variable *in_var = nir_deref_instr_get_variable(context->deref); + const struct glsl_type *new_var_type = + glsl_image_type(glsl_get_sampler_dim(in_var->type), + glsl_sampler_type_is_array(in_var->type), + nir_get_glsl_base_type_for_nir_type(image_type | 32)); + return lower_image_deref_impl(b, context, new_var_type, context->num_uavs); +} + +static void +clc_lower_input_image_deref(nir_builder *b, struct clc_image_lower_context *context) +{ + // The input variable here isn't actually an image, it's just the + // image format data. + // + // For every use of an image in a different way, we'll add an + // appropriate uniform to match it. That can result in up to + // 3 uniforms (float4, int4, uint4) for each image. Only one of these + // formats will actually produce correct data, but a single kernel + // could use runtime conditionals to potentially access any of them. + // + // If the image is used in a query that doesn't have a corresponding + // DXIL intrinsic (CL image channel order or channel format), then + // we'll add a kernel input for that data that'll be lowered by the + // explicit IO pass later on. + // + // After all that, we can remove the image input variable and deref. + + enum image_uniform_type { + FLOAT4, + INT4, + UINT4, + IMAGE_UNIFORM_TYPE_COUNT + }; + + int image_bindings[IMAGE_UNIFORM_TYPE_COUNT] = {-1, -1, -1}; + nir_ssa_def *format_deref_dest = NULL, *order_deref_dest = NULL; + + nir_variable *in_var = nir_deref_instr_get_variable(context->deref); + enum gl_access_qualifier access = in_var->data.access; + + context->metadata_index = 0; + while (context->metadata->args[context->metadata_index].image.buf_ids[0] != in_var->data.binding) + context->metadata_index++; + + context->num_buf_ids = 0; + + /* Do this in 2 passes: + * 1. When encountering a strongly-typed access (load/store), replace the deref + * with one that references an appropriately typed variable. When encountering + * an untyped access (size query), if we have a strongly-typed variable already, + * replace the deref to point to it. + * 2. If there's any references left, they should all be untyped. If we found + * a strongly-typed access later in the 1st pass, then just replace the reference. + * If we didn't, e.g. the resource is only used for a size query, then pick an + * arbitrary type for it. 
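+    * For example (hypothetical kernel): an image read via read_imagef() and also passed to get_image_width() ends up with a single float4 variable -- pass 1 retypes the load deref and redirects the size query to it -- while an image used *only* for the size query keeps its deref through pass 1 and is given the arbitrary FLOAT4 type in pass 2.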
+ */ + for (int pass = 0; pass < 2; ++pass) { + nir_foreach_use_safe(src, &context->deref->dest.ssa) { + enum image_uniform_type type; + + if (src->parent_instr->type == nir_instr_type_intrinsic) { + nir_intrinsic_instr *intrinsic = nir_instr_as_intrinsic(src->parent_instr); + enum nir_alu_type dest_type; + + b->cursor = nir_before_instr(&intrinsic->instr); + + switch (intrinsic->intrinsic) { + case nir_intrinsic_image_deref_load: + case nir_intrinsic_image_deref_store: { + dest_type = intrinsic->intrinsic == nir_intrinsic_image_deref_load ? + nir_intrinsic_dest_type(intrinsic) : nir_intrinsic_src_type(intrinsic); + + switch (nir_alu_type_get_base_type(dest_type)) { + case nir_type_float: type = FLOAT4; break; + case nir_type_int: type = INT4; break; + case nir_type_uint: type = UINT4; break; + default: unreachable("Unsupported image type for load."); + } + + int image_binding = image_bindings[type]; + if (image_binding < 0) { + image_binding = image_bindings[type] = + lower_read_write_image_deref(b, context, dest_type); + } + + assert((in_var->data.access & ACCESS_NON_WRITEABLE) == 0); + nir_rewrite_image_intrinsic(intrinsic, nir_imm_int(b, image_binding), false); + break; + } + + case nir_intrinsic_image_deref_size: { + int image_binding = -1; + for (unsigned i = 0; i < IMAGE_UNIFORM_TYPE_COUNT; ++i) { + if (image_bindings[i] >= 0) { + image_binding = image_bindings[i]; + break; + } + } + if (image_binding < 0) { + // Skip for now and come back to it + if (pass == 0) + break; + + type = FLOAT4; + image_binding = image_bindings[type] = + lower_read_write_image_deref(b, context, nir_type_float32); + } + + assert((in_var->data.access & ACCESS_NON_WRITEABLE) == 0); + nir_rewrite_image_intrinsic(intrinsic, nir_imm_int(b, image_binding), false); + break; + } + + case nir_intrinsic_image_deref_format: + case nir_intrinsic_image_deref_order: { + nir_ssa_def **cached_deref = intrinsic->intrinsic == nir_intrinsic_image_deref_format ? 
+ &format_deref_dest : &order_deref_dest; + if (!*cached_deref) { + nir_variable *new_input = nir_variable_create(b->shader, nir_var_uniform, glsl_uint_type(), NULL); + new_input->data.driver_location = in_var->data.driver_location; + if (intrinsic->intrinsic == nir_intrinsic_image_deref_format) { + /* Match cl_image_format { image_channel_order, image_channel_data_type }; */ + new_input->data.driver_location += glsl_get_cl_size(new_input->type); + } + + b->cursor = nir_after_instr(&context->deref->instr); + *cached_deref = nir_load_var(b, new_input); + } + + /* No actual intrinsic needed here, just reference the loaded variable */ + nir_ssa_def_rewrite_uses(&intrinsic->dest.ssa, nir_src_for_ssa(*cached_deref)); + nir_instr_remove(&intrinsic->instr); + break; + } + + default: + unreachable("Unsupported image intrinsic"); + } + } else if (src->parent_instr->type == nir_instr_type_tex) { + assert(in_var->data.access & ACCESS_NON_WRITEABLE); + nir_tex_instr *tex = nir_instr_as_tex(src->parent_instr); + + switch (nir_alu_type_get_base_type(tex->dest_type)) { + case nir_type_float: type = FLOAT4; break; + case nir_type_int: type = INT4; break; + case nir_type_uint: type = UINT4; break; + default: unreachable("Unsupported image format for sample."); + } + + int image_binding = image_bindings[type]; + if (image_binding < 0) { + image_binding = image_bindings[type] = + lower_read_only_image_deref(b, context, tex->dest_type); + } + + nir_tex_instr_remove_src(tex, nir_tex_instr_src_index(tex, nir_tex_src_texture_deref)); + tex->texture_index = image_binding; + } + } + } + + context->metadata->args[context->metadata_index].image.num_buf_ids = context->num_buf_ids; + + nir_instr_remove(&context->deref->instr); + exec_node_remove(&in_var->node); +} + +static void +clc_lower_images(nir_shader *nir, struct clc_image_lower_context *context) +{ + nir_foreach_function(func, nir) { + if (!func->is_entrypoint) + continue; + assert(func->impl); + + nir_builder b; + nir_builder_init(&b, func->impl); + + nir_foreach_block(block, func->impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type == nir_instr_type_deref) { + context->deref = nir_instr_as_deref(instr); + + if (glsl_type_is_image(context->deref->type)) { + assert(context->deref->deref_type == nir_deref_type_var); + clc_lower_input_image_deref(&b, context); + } + } + } + } + } +} + +static void +clc_lower_64bit_semantics(nir_shader *nir) +{ + nir_foreach_function(func, nir) { + nir_builder b; + nir_builder_init(&b, func->impl); + + nir_foreach_block(block, func->impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type == nir_instr_type_intrinsic) { + nir_intrinsic_instr *intrinsic = nir_instr_as_intrinsic(instr); + switch (intrinsic->intrinsic) { + case nir_intrinsic_load_global_invocation_id: + case nir_intrinsic_load_global_invocation_id_zero_base: + case nir_intrinsic_load_base_global_invocation_id: + case nir_intrinsic_load_local_invocation_id: + case nir_intrinsic_load_work_group_id: + case nir_intrinsic_load_work_group_id_zero_base: + case nir_intrinsic_load_base_work_group_id: + case nir_intrinsic_load_num_work_groups: + break; + default: + continue; + } + + if (nir_instr_ssa_def(instr)->bit_size != 64) + continue; + + intrinsic->dest.ssa.bit_size = 32; + b.cursor = nir_after_instr(instr); + + nir_ssa_def *i64 = nir_u2u64(&b, &intrinsic->dest.ssa); + nir_ssa_def_rewrite_uses_after( + &intrinsic->dest.ssa, + nir_src_for_ssa(i64), + i64->parent_instr); + } + } + } + } +} + +static void +clc_lower_nonnormalized_samplers(nir_shader 
*nir, + const dxil_wrap_sampler_state *states) +{ + nir_foreach_function(func, nir) { + if (!func->is_entrypoint) + continue; + assert(func->impl); + + nir_builder b; + nir_builder_init(&b, func->impl); + + nir_foreach_block(block, func->impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_tex) + continue; + nir_tex_instr *tex = nir_instr_as_tex(instr); + + int sampler_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_deref); + if (sampler_src_idx == -1) + continue; + + nir_src *sampler_src = &tex->src[sampler_src_idx].src; + assert(sampler_src->is_ssa && sampler_src->ssa->parent_instr->type == nir_instr_type_deref); + nir_variable *sampler = nir_deref_instr_get_variable( + nir_instr_as_deref(sampler_src->ssa->parent_instr)); + + // If the sampler returns ints, we'll handle this in the int lowering pass + if (nir_alu_type_get_base_type(tex->dest_type) != nir_type_float) + continue; + + // If sampler uses normalized coords, nothing to do + if (!states[sampler->data.binding].is_nonnormalized_coords) + continue; + + b.cursor = nir_before_instr(&tex->instr); + + int coords_idx = nir_tex_instr_src_index(tex, nir_tex_src_coord); + assert(coords_idx != -1); + nir_ssa_def *coords = + nir_ssa_for_src(&b, tex->src[coords_idx].src, tex->coord_components); + + nir_ssa_def *txs = nir_i2f32(&b, nir_get_texture_size(&b, tex)); + + // Normalize coords for tex + nir_ssa_def *scale = nir_frcp(&b, txs); + nir_ssa_def *comps[4]; + for (unsigned i = 0; i < coords->num_components; ++i) { + comps[i] = nir_channel(&b, coords, i); + if (tex->is_array && i == coords->num_components - 1) { + // Don't scale the array index, but do clamp it + comps[i] = nir_fround_even(&b, comps[i]); + comps[i] = nir_fmax(&b, comps[i], nir_imm_float(&b, 0.0f)); + comps[i] = nir_fmin(&b, comps[i], nir_fsub(&b, nir_channel(&b, txs, i), nir_imm_float(&b, 1.0f))); + break; + } + + // The CTS is pretty clear that this value has to be floored for nearest sampling + // but must not be for linear sampling. 
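+               // A sketch with assumed values: for a 256-texel dimension with nearest filtering, an unnormalized coordinate of 10.3 becomes (floor(10.3) + 0.5) * (1.0 / 256) = 10.5 / 256 ~= 0.041, i.e. the center of texel 10 in normalized space; with linear filtering the floor()+0.5 snap is skipped and 10.3 is scaled directly.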
+ if (!states[sampler->data.binding].is_linear_filtering) + comps[i] = nir_fadd_imm(&b, nir_ffloor(&b, comps[i]), 0.5f); + comps[i] = nir_fmul(&b, comps[i], nir_channel(&b, scale, i)); + } + nir_ssa_def *normalized_coords = nir_vec(&b, comps, coords->num_components); + nir_instr_rewrite_src(&tex->instr, + &tex->src[coords_idx].src, + nir_src_for_ssa(normalized_coords)); + } + } + } +} + + +static void +clc_context_optimize(nir_shader *s) +{ + bool progress; + do { + progress = false; + NIR_PASS(progress, s, nir_split_var_copies); + NIR_PASS(progress, s, nir_opt_copy_prop_vars); + NIR_PASS(progress, s, nir_lower_var_copies); + NIR_PASS(progress, s, nir_lower_vars_to_ssa); + NIR_PASS(progress, s, nir_copy_prop); + NIR_PASS(progress, s, nir_opt_remove_phis); + NIR_PASS(progress, s, nir_opt_dce); + NIR_PASS(progress, s, nir_opt_if, true); + NIR_PASS(progress, s, nir_opt_dead_cf); + NIR_PASS(progress, s, nir_opt_cse); + NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true); + NIR_PASS(progress, s, nir_opt_algebraic); + NIR_PASS(progress, s, nir_opt_constant_folding); + NIR_PASS(progress, s, nir_opt_undef); + NIR_PASS(progress, s, nir_lower_undef_to_zero); + NIR_PASS(progress, s, nir_opt_deref); + } while (progress); +} + +struct clc_context * +clc_context_new(const struct clc_logger *logger, const struct clc_context_options *options) +{ + struct clc_context *ctx = rzalloc(NULL, struct clc_context); + if (!ctx) { + clc_error(logger, "D3D12: failed to allocate a clc_context"); + return NULL; + } + + const struct spirv_to_nir_options libclc_spirv_options = { + .environment = NIR_SPIRV_OPENCL, + .create_library = true, + .constant_addr_format = nir_address_format_32bit_index_offset_pack64, + .global_addr_format = nir_address_format_32bit_index_offset_pack64, + .shared_addr_format = nir_address_format_32bit_offset_as_64bit, + .temp_addr_format = nir_address_format_32bit_offset_as_64bit, + .float_controls_execution_mode = FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32, + .caps = { + .address = true, + .float64 = true, + .int8 = true, + .int16 = true, + .int64 = true, + .kernel = true, + }, + }; + const struct nir_shader_compiler_options *libclc_nir_options = + dxil_get_nir_compiler_options(); + + glsl_type_singleton_init_or_ref(); + nir_shader *s = nir_load_libclc_shader(64, NULL, &libclc_spirv_options, libclc_nir_options); + if (!s) { + clc_error(logger, "D3D12: spirv_to_nir failed on libclc blob"); + ralloc_free(ctx); + return NULL; + } + + if (options && options->optimize) + clc_context_optimize(s); + + ctx->libclc_nir = s; + ralloc_steal(ctx, ctx->libclc_nir); + + return ctx; +} + +void +clc_free_context(struct clc_context *ctx) +{ + ralloc_free(ctx); + glsl_type_singleton_decref(); +}; + +void clc_context_serialize(struct clc_context *context, + void **serialized, + size_t *serialized_size) +{ + struct blob tmp; + blob_init(&tmp); + nir_serialize(&tmp, context->libclc_nir, true); + + blob_finish_get_buffer(&tmp, serialized, serialized_size); +} + +void clc_context_free_serialized(void *serialized) +{ + free(serialized); +} + +struct clc_context * + clc_context_deserialize(const void *serialized, size_t serialized_size) +{ + struct clc_context *ctx = rzalloc(NULL, struct clc_context); + if (!ctx) { + return NULL; + } + const struct nir_shader_compiler_options *libclc_nir_options = + dxil_get_nir_compiler_options(); + + glsl_type_singleton_init_or_ref(); + + struct blob_reader tmp; + blob_reader_init(&tmp, serialized, serialized_size); + + ctx->libclc_nir = nir_deserialize(NULL, 
libclc_nir_options, &tmp); + if (!ctx->libclc_nir) { + ralloc_free(ctx); + return NULL; + } + + ralloc_steal(ctx, ctx->libclc_nir); + + return ctx; +} + +struct clc_object * +clc_compile(struct clc_context *ctx, + const struct clc_compile_args *args, + const struct clc_logger *logger) +{ + struct clc_object *obj; + int ret; + + obj = calloc(1, sizeof(*obj)); + if (!obj) { + clc_error(logger, "D3D12: failed to allocate a clc_object"); + return NULL; + } + + ret = clc_to_spirv(args, &obj->spvbin, logger); + if (ret < 0) { + free(obj); + return NULL; + } + + if (debug_get_option_debug_clc() & CLC_DEBUG_DUMP_SPIRV) + clc_dump_spirv(&obj->spvbin, stdout); + + return obj; +} + +struct clc_object * +clc_link(struct clc_context *ctx, + const struct clc_linker_args *args, + const struct clc_logger *logger) +{ + struct clc_object *out_obj; + int ret; + + out_obj = malloc(sizeof(*out_obj)); + if (!out_obj) { + clc_error(logger, "failed to allocate a clc_object"); + return NULL; + } + + ret = clc_link_spirv_binaries(args, &out_obj->spvbin, logger); + if (ret < 0) { + free(out_obj); + return NULL; + } + + if (debug_get_option_debug_clc() & CLC_DEBUG_DUMP_SPIRV) + clc_dump_spirv(&out_obj->spvbin, stdout); + + out_obj->kernels = clc_spirv_get_kernels_info(&out_obj->spvbin, + &out_obj->num_kernels); + + if (debug_get_option_debug_clc() & CLC_DEBUG_VERBOSE) + clc_print_kernels_info(out_obj); + + return out_obj; +} + +void clc_free_object(struct clc_object *obj) +{ + clc_free_kernels_info(obj->kernels, obj->num_kernels); + clc_free_spirv_binary(&obj->spvbin); + free(obj); +} + +static nir_variable * +add_kernel_inputs_var(struct clc_dxil_object *dxil, nir_shader *nir, + unsigned *cbv_id) +{ + if (!dxil->kernel->num_args) + return NULL; + + struct clc_dxil_metadata *metadata = &dxil->metadata; + unsigned size = 0; + + nir_foreach_variable_with_modes(var, nir, nir_var_uniform) + size = MAX2(size, + var->data.driver_location + + glsl_get_cl_size(var->type)); + + size = align(size, 4); + + nir_variable *var = + nir_variable_create(nir, nir_var_mem_ubo, + glsl_array_type(glsl_uint_type(), + size / 4, 0), + "kernel_inputs"); + var->data.binding = (*cbv_id)++; + var->data.how_declared = nir_var_hidden; + return var; +} + +static nir_variable * +add_work_properties_var(struct clc_dxil_object *dxil, + struct nir_shader *nir, unsigned *cbv_id) +{ + struct clc_dxil_metadata *metadata = &dxil->metadata; + nir_variable *var = + nir_variable_create(nir, nir_var_mem_ubo, + glsl_array_type(glsl_uint_type(), + sizeof(struct clc_work_properties_data) / sizeof(unsigned), + 0), + "kernel_work_properties"); + var->data.binding = (*cbv_id)++; + var->data.how_declared = nir_var_hidden; + return var; +} + +static void +clc_lower_constant_to_ssbo(nir_shader *nir, + const struct clc_kernel_info *kerninfo, unsigned *uav_id) +{ + /* Move the constant vars to SSBO and assign them a binding. */ + nir_foreach_variable_with_modes(var, nir, nir_var_mem_constant) { + var->data.mode = nir_var_mem_ssbo; + var->data.binding = (*uav_id)++; + } + + /* And finally patch all the derefs referencing the constant + * variables/pointers.
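+    * For example, a deref chain rooted in a __constant uint pointer has modes nir_var_mem_constant; flipping them to nir_var_mem_ssbo here lets the later dxil_nir_lower_deref_ssbo and nir_lower_explicit_io passes treat the constant buffer like any other UAV-backed SSBO.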
+ */ + nir_foreach_function(func, nir) { + if (!func->is_entrypoint) + continue; + + assert(func->impl); + + nir_builder b; + nir_builder_init(&b, func->impl); + + nir_foreach_block(block, func->impl) { + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_deref) + continue; + + nir_deref_instr *deref = nir_instr_as_deref(instr); + + if (deref->modes != nir_var_mem_constant) + continue; + + deref->modes = nir_var_mem_ssbo; + } + } + } +} + +static void +clc_lower_global_to_ssbo(nir_shader *nir) +{ + nir_foreach_function(func, nir) { + if (!func->is_entrypoint) + continue; + + assert(func->impl); + + nir_foreach_block(block, func->impl) { + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_deref) + continue; + + nir_deref_instr *deref = nir_instr_as_deref(instr); + + if (deref->modes != nir_var_mem_global) + continue; + + deref->modes = nir_var_mem_ssbo; + } + } + } +} + +static void +copy_const_initializer(const nir_constant *constant, const struct glsl_type *type, + uint8_t *data) +{ + unsigned size = glsl_get_cl_size(type); + + if (glsl_type_is_array(type)) { + const struct glsl_type *elm_type = glsl_get_array_element(type); + unsigned step_size = glsl_get_explicit_stride(type); + + for (unsigned i = 0; i < constant->num_elements; i++) { + copy_const_initializer(constant->elements[i], elm_type, + data + (i * step_size)); + } + } else if (glsl_type_is_struct(type)) { + for (unsigned i = 0; i < constant->num_elements; i++) { + const struct glsl_type *elm_type = glsl_get_struct_field(type, i); + int offset = glsl_get_struct_field_offset(type, i); + copy_const_initializer(constant->elements[i], elm_type, data + offset); + } + } else { + assert(glsl_type_is_vector_or_scalar(type)); + + for (unsigned i = 0; i < glsl_get_components(type); i++) { + switch (glsl_get_bit_size(type)) { + case 64: + *((uint64_t *)data) = constant->values[i].u64; + break; + case 32: + *((uint32_t *)data) = constant->values[i].u32; + break; + case 16: + *((uint16_t *)data) = constant->values[i].u16; + break; + case 8: + *((uint8_t *)data) = constant->values[i].u8; + break; + default: + unreachable("Invalid base type"); + } + + data += glsl_get_bit_size(type) / 8; + } + } +} + +static const struct glsl_type * +get_cast_type(unsigned bit_size) +{ + switch (bit_size) { + case 64: + return glsl_int64_t_type(); + case 32: + return glsl_int_type(); + case 16: + return glsl_int16_t_type(); + case 8: + return glsl_int8_t_type(); + } + unreachable("Invalid bit_size"); +} + +static void +split_unaligned_load(nir_builder *b, nir_intrinsic_instr *intrin, unsigned alignment) +{ + enum gl_access_qualifier access = nir_intrinsic_access(intrin); + nir_ssa_def *srcs[NIR_MAX_VEC_COMPONENTS * NIR_MAX_VEC_COMPONENTS * sizeof(int64_t) / 8]; + unsigned comp_size = intrin->dest.ssa.bit_size / 8; + unsigned num_comps = intrin->dest.ssa.num_components; + + b->cursor = nir_before_instr(&intrin->instr); + + nir_deref_instr *ptr = nir_src_as_deref(intrin->src[0]); + + const struct glsl_type *cast_type = get_cast_type(alignment * 8); + nir_deref_instr *cast = nir_build_deref_cast(b, &ptr->dest.ssa, ptr->modes, cast_type, alignment); + + unsigned num_loads = DIV_ROUND_UP(comp_size * num_comps, alignment); + for (unsigned i = 0; i < num_loads; ++i) { + nir_deref_instr *elem = nir_build_deref_ptr_as_array(b, cast, nir_imm_intN_t(b, i, cast->dest.ssa.bit_size)); + srcs[i] = nir_load_deref_with_access(b, elem, access); + } + + nir_ssa_def *new_dest = nir_extract_bits(b, srcs, num_loads, 0, num_comps, 
intrin->dest.ssa.bit_size); + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(new_dest)); + nir_instr_remove(&intrin->instr); +} + +static void +split_unaligned_store(nir_builder *b, nir_intrinsic_instr *intrin, unsigned alignment) +{ + enum gl_access_qualifier access = nir_intrinsic_access(intrin); + + assert(intrin->src[1].is_ssa); + nir_ssa_def *value = intrin->src[1].ssa; + unsigned comp_size = value->bit_size / 8; + unsigned num_comps = value->num_components; + + b->cursor = nir_before_instr(&intrin->instr); + + nir_deref_instr *ptr = nir_src_as_deref(intrin->src[0]); + + const struct glsl_type *cast_type = get_cast_type(alignment * 8); + nir_deref_instr *cast = nir_build_deref_cast(b, &ptr->dest.ssa, ptr->modes, cast_type, alignment); + + unsigned num_stores = DIV_ROUND_UP(comp_size * num_comps, alignment); + for (unsigned i = 0; i < num_stores; ++i) { + nir_ssa_def *substore_val = nir_extract_bits(b, &value, 1, i * alignment * 8, 1, alignment * 8); + nir_deref_instr *elem = nir_build_deref_ptr_as_array(b, cast, nir_imm_intN_t(b, i, cast->dest.ssa.bit_size)); + nir_store_deref_with_access(b, elem, substore_val, ~0, access); + } + + nir_instr_remove(&intrin->instr); +} + +static bool +split_unaligned_loads_stores(nir_shader *shader) +{ + bool progress = false; + + nir_foreach_function(function, shader) { + if (!function->impl) + continue; + + nir_builder b; + nir_builder_init(&b, function->impl); + + nir_foreach_block(block, function->impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + if (intrin->intrinsic != nir_intrinsic_load_deref && + intrin->intrinsic != nir_intrinsic_store_deref) + continue; + nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]); + + unsigned align_mul = 0, align_offset = 0; + nir_get_explicit_deref_align(deref, true, &align_mul, &align_offset); + + unsigned alignment = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul; + + /* We can load anything at 4-byte alignment, except for + * UBOs (AKA CBs where the granularity is 16 bytes). + */ + if (alignment >= (deref->modes == nir_var_mem_ubo ? 16 : 4)) + continue; + + nir_ssa_def *val; + if (intrin->intrinsic == nir_intrinsic_load_deref) { + assert(intrin->dest.is_ssa); + val = &intrin->dest.ssa; + } else { + assert(intrin->src[1].is_ssa); + val = intrin->src[1].ssa; + } + + unsigned natural_alignment = + val->bit_size / 8 * + (val->num_components == 3 ? 
4 : val->num_components); + + if (alignment >= natural_alignment) + continue; + + if (intrin->intrinsic == nir_intrinsic_load_deref) + split_unaligned_load(&b, intrin, alignment); + else + split_unaligned_store(&b, intrin, alignment); + progress = true; + } + } + } + + return progress; +} + +static enum pipe_tex_wrap +wrap_from_cl_addressing(unsigned addressing_mode) +{ + switch (addressing_mode) + { + default: + case SAMPLER_ADDRESSING_MODE_NONE: + case SAMPLER_ADDRESSING_MODE_CLAMP: + // Since OpenCL's only border color is 0's and D3D specs out-of-bounds loads to return 0, don't apply any wrap mode + return (enum pipe_tex_wrap)-1; + case SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE: return PIPE_TEX_WRAP_CLAMP_TO_EDGE; + case SAMPLER_ADDRESSING_MODE_REPEAT: return PIPE_TEX_WRAP_REPEAT; + case SAMPLER_ADDRESSING_MODE_REPEAT_MIRRORED: return PIPE_TEX_WRAP_MIRROR_REPEAT; + } +} + +static bool shader_has_double(nir_shader *nir) +{ + bool progress = false; + + foreach_list_typed(nir_function, func, node, &nir->functions) { + if (!func->is_entrypoint) + continue; + + assert(func->impl); + + nir_foreach_block(block, func->impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_alu) + continue; + + nir_alu_instr *alu = nir_instr_as_alu(instr); + const nir_op_info *info = &nir_op_infos[alu->op]; + + if (info->output_type & nir_type_float && + nir_dest_bit_size(alu->dest.dest) == 64) + return true; + } + } + } + + return false; +} + +static bool +scale_fdiv(nir_shader *nir) +{ + bool progress = false; + nir_foreach_function(func, nir) { + if (!func->impl) + continue; + nir_builder b; + nir_builder_init(&b, func->impl); + nir_foreach_block(block, func->impl) { + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_alu) + continue; + nir_alu_instr *alu = nir_instr_as_alu(instr); + if (alu->op != nir_op_fdiv) + continue; + + b.cursor = nir_before_instr(instr); + nir_ssa_def *fabs = nir_fabs(&b, alu->src[1].src.ssa); + nir_ssa_def *big = nir_flt(&b, nir_imm_int(&b, 0x7e800000), fabs); + nir_ssa_def *small = nir_flt(&b, fabs, nir_imm_int(&b, 0x00800000)); + + nir_ssa_def *scaled_down_a = nir_fmul_imm(&b, alu->src[0].src.ssa, 0.25); + nir_ssa_def *scaled_down_b = nir_fmul_imm(&b, alu->src[1].src.ssa, 0.25); + nir_ssa_def *scaled_up_a = nir_fmul_imm(&b, alu->src[0].src.ssa, 16777216.0); + nir_ssa_def *scaled_up_b = nir_fmul_imm(&b, alu->src[1].src.ssa, 16777216.0); + + nir_ssa_def *final_a = + nir_bcsel(&b, big, scaled_down_a, + (nir_bcsel(&b, small, scaled_up_a, alu->src[0].src.ssa))); + nir_ssa_def *final_b = + nir_bcsel(&b, big, scaled_down_b, + (nir_bcsel(&b, small, scaled_up_b, alu->src[1].src.ssa))); + + nir_instr_rewrite_src(instr, &alu->src[0].src, nir_src_for_ssa(final_a)); + nir_instr_rewrite_src(instr, &alu->src[1].src, nir_src_for_ssa(final_b)); + progress = true; + } + } + } + return progress; +} + +struct clc_dxil_object * +clc_to_dxil(struct clc_context *ctx, + const struct clc_object *obj, + const char *entrypoint, + const struct clc_runtime_kernel_conf *conf, + const struct clc_logger *logger) +{ + struct clc_dxil_object *dxil; + struct nir_shader *nir; + char *err_log; + int ret; + + dxil = calloc(1, sizeof(*dxil)); + if (!dxil) { + clc_error(logger, "failed to allocate the dxil object"); + return NULL; + } + + for (unsigned i = 0; i < obj->num_kernels; i++) { + if (!strcmp(obj->kernels[i].name, entrypoint)) { + dxil->kernel = &obj->kernels[i]; + break; + } + } + + if (!dxil->kernel) { + clc_error(logger, "no '%s' kernel found", entrypoint); + goto 
err_free_dxil; + } + + const struct spirv_to_nir_options spirv_options = { + .environment = NIR_SPIRV_OPENCL, + .clc_shader = ctx->libclc_nir, + .constant_addr_format = nir_address_format_32bit_index_offset_pack64, + .global_addr_format = nir_address_format_32bit_index_offset_pack64, + .shared_addr_format = nir_address_format_32bit_offset_as_64bit, + .temp_addr_format = nir_address_format_32bit_offset_as_64bit, + .float_controls_execution_mode = FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32, + .caps = { + .address = true, + .float64 = true, + .int8 = true, + .int16 = true, + .int64 = true, + .kernel = true, + .kernel_image = true, + .literal_sampler = true, + }, + }; + nir_shader_compiler_options nir_options = + *dxil_get_nir_compiler_options(); + + if (conf && conf->lower_bit_size & 64) { + nir_options.lower_pack_64_2x32_split = false; + nir_options.lower_unpack_64_2x32_split = false; + nir_options.lower_int64_options = ~0; + } + + if (conf && conf->lower_bit_size & 16) + nir_options.support_16bit_alu = true; + + glsl_type_singleton_init_or_ref(); + + nir = spirv_to_nir(obj->spvbin.data, obj->spvbin.size / 4, + NULL, 0, + MESA_SHADER_KERNEL, entrypoint, + &spirv_options, + &nir_options); + if (!nir) { + clc_error(logger, "spirv_to_nir() failed"); + goto err_free_dxil; + } + nir->info.cs.local_size_variable = true; + + NIR_PASS_V(nir, nir_lower_goto_ifs); + NIR_PASS_V(nir, nir_opt_dead_cf); + + struct clc_dxil_metadata *metadata = &dxil->metadata; + + metadata->args = calloc(dxil->kernel->num_args, + sizeof(*metadata->args)); + if (!metadata->args) { + clc_error(logger, "failed to allocate arg positions"); + goto err_free_dxil; + } + + // Calculate input offsets/metadata. + unsigned uav_id = 0, sampler_id = 0, offset = 0; + dxil_wrap_sampler_state int_sampler_states[PIPE_MAX_SHADER_SAMPLER_VIEWS] = {{{0}}}; + nir_foreach_variable_with_modes(var, nir, nir_var_uniform) { + int i = var->data.location; + if (i < 0) + continue; + + unsigned size = glsl_get_cl_size(var->type); + offset = align(offset, glsl_get_cl_alignment(var->type)); + var->data.driver_location = offset; + + metadata->args[i].offset = offset; + metadata->args[i].size = size; + metadata->kernel_inputs_buf_size = MAX2(metadata->kernel_inputs_buf_size, + offset + size); + if ((dxil->kernel->args[i].address_qualifier == CLC_KERNEL_ARG_ADDRESS_GLOBAL || + dxil->kernel->args[i].address_qualifier == CLC_KERNEL_ARG_ADDRESS_CONSTANT) && + // Ignore images during this pass - global memory buffers need to have contiguous bindings + !glsl_type_is_image(var->type)) { + metadata->args[i].globconstptr.buf_id = uav_id++; + } else if (glsl_type_is_sampler(var->type)) { + unsigned address_mode = conf ? conf->args[i].sampler.addressing_mode : 0u; + int_sampler_states[sampler_id].wrap[0] = + int_sampler_states[sampler_id].wrap[1] = + int_sampler_states[sampler_id].wrap[2] = wrap_from_cl_addressing(address_mode); + int_sampler_states[sampler_id].is_nonnormalized_coords = + conf ? !conf->args[i].sampler.normalized_coords : 0; + int_sampler_states[sampler_id].is_linear_filtering = + conf ? 
conf->args[i].sampler.linear_filtering : 0; + metadata->args[i].sampler.sampler_id = var->data.binding = sampler_id++; + } + offset += size; + } + + unsigned num_global_inputs = uav_id; + + // Second pass over inputs to calculate image bindings + unsigned srv_id = 0; + nir_foreach_variable_with_modes(var, nir, nir_var_uniform) { + int i = var->data.location; + if (i < 0) + continue; + + if (glsl_type_is_image(var->type)) { + if (var->data.access == ACCESS_NON_WRITEABLE) { + metadata->args[i].image.buf_ids[0] = srv_id++; + } else { + // Write or read-write are UAVs + metadata->args[i].image.buf_ids[0] = uav_id++; + } + + metadata->args[i].image.num_buf_ids = 1; + var->data.binding = metadata->args[i].image.buf_ids[0]; + } + } + + { + bool progress; + do + { + progress = false; + NIR_PASS(progress, nir, nir_copy_prop); + NIR_PASS(progress, nir, nir_opt_copy_prop_vars); + NIR_PASS(progress, nir, nir_opt_deref); + NIR_PASS(progress, nir, nir_opt_dce); + NIR_PASS(progress, nir, nir_opt_undef); + NIR_PASS(progress, nir, nir_opt_constant_folding); + NIR_PASS(progress, nir, nir_opt_cse); + NIR_PASS(progress, nir, nir_lower_vars_to_ssa); + NIR_PASS(progress, nir, nir_opt_algebraic); + } while (progress); + } + + // Inline all functions first, according to the comment on nir_inline_functions. + NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_function_temp); + NIR_PASS_V(nir, nir_lower_returns); + NIR_PASS_V(nir, nir_lower_libclc, ctx->libclc_nir); + NIR_PASS_V(nir, nir_inline_functions); + + // Pick off the single entrypoint that we want. + foreach_list_typed_safe(nir_function, func, node, &nir->functions) { + if (!func->is_entrypoint) + exec_node_remove(&func->node); + } + assert(exec_list_length(&nir->functions) == 1); + + { + bool progress; + do + { + progress = false; + NIR_PASS(progress, nir, nir_copy_prop); + NIR_PASS(progress, nir, nir_opt_copy_prop_vars); + NIR_PASS(progress, nir, nir_opt_deref); + NIR_PASS(progress, nir, nir_opt_dce); + NIR_PASS(progress, nir, nir_opt_undef); + NIR_PASS(progress, nir, nir_opt_constant_folding); + NIR_PASS(progress, nir, nir_opt_cse); + NIR_PASS(progress, nir, nir_split_var_copies); + NIR_PASS(progress, nir, nir_lower_var_copies); + NIR_PASS(progress, nir, nir_lower_vars_to_ssa); + NIR_PASS(progress, nir, nir_opt_algebraic); + NIR_PASS(progress, nir, nir_opt_if, true); + NIR_PASS(progress, nir, nir_opt_dead_cf); + NIR_PASS(progress, nir, nir_opt_remove_phis); + NIR_PASS(progress, nir, nir_opt_peephole_select, 8, true, true); + NIR_PASS(progress, nir, nir_lower_vec3_to_vec4, nir_var_mem_generic | nir_var_uniform); + } while (progress); + } + + // Before removing dead uniforms, dedupe constant samplers to make more dead uniforms + NIR_PASS_V(nir, clc_nir_dedupe_const_samplers); + NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_uniform | nir_var_mem_ubo | nir_var_mem_constant | nir_var_function_temp, NULL); + + NIR_PASS_V(nir, scale_fdiv); + + // Assign bindings for constant samplers + nir_foreach_variable_with_modes(var, nir, nir_var_uniform) { + if (glsl_type_is_sampler(var->type) && var->data.sampler.is_inline_sampler) { + int_sampler_states[sampler_id].wrap[0] = + int_sampler_states[sampler_id].wrap[1] = + int_sampler_states[sampler_id].wrap[2] = + wrap_from_cl_addressing(var->data.sampler.addressing_mode); + int_sampler_states[sampler_id].is_nonnormalized_coords = + !var->data.sampler.normalized_coordinates; + int_sampler_states[sampler_id].is_linear_filtering = + var->data.sampler.filter_mode == SAMPLER_FILTER_MODE_LINEAR; +
var->data.binding = sampler_id++; + + assert(metadata->num_const_samplers < CLC_MAX_SAMPLERS); + metadata->const_samplers[metadata->num_const_samplers].sampler_id = var->data.binding; + metadata->const_samplers[metadata->num_const_samplers].addressing_mode = var->data.sampler.addressing_mode; + metadata->const_samplers[metadata->num_const_samplers].normalized_coords = var->data.sampler.normalized_coordinates; + metadata->const_samplers[metadata->num_const_samplers].filter_mode = var->data.sampler.filter_mode; + metadata->num_const_samplers++; + } + } + + NIR_PASS_V(nir, nir_lower_variable_initializers, ~(nir_var_function_temp | nir_var_shader_temp)); + + // Lower memcpy + NIR_PASS_V(nir, dxil_nir_lower_memcpy_deref); + + bool has_printf = false; + //NIR_PASS(has_printf, nir, clc_nir_lower_printf, uav_id); + metadata->printf_uav_id = has_printf ? uav_id++ : -1; + + // copy propagate to prepare for lower_explicit_io + NIR_PASS_V(nir, nir_split_var_copies); + NIR_PASS_V(nir, nir_opt_copy_prop_vars); + NIR_PASS_V(nir, nir_lower_var_copies); + NIR_PASS_V(nir, nir_lower_vars_to_ssa); + NIR_PASS_V(nir, nir_lower_alu); + NIR_PASS_V(nir, nir_opt_dce); + NIR_PASS_V(nir, nir_opt_deref); + + // Needs to come before lower_explicit_io + NIR_PASS_V(nir, nir_lower_cl_images_to_tex); + struct clc_image_lower_context image_lower_context = { metadata, &srv_id, &uav_id }; + NIR_PASS_V(nir, clc_lower_images, &image_lower_context); + NIR_PASS_V(nir, clc_lower_nonnormalized_samplers, int_sampler_states); + NIR_PASS_V(nir, nir_lower_samplers); + NIR_PASS_V(nir, dxil_lower_sample_to_txf_for_integer_tex, + int_sampler_states, NULL, 14.0f); + + NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_mem_shared | nir_var_function_temp, NULL); + assert(nir->scratch_size == 0); + + NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, + nir_var_mem_shared | nir_var_function_temp | nir_var_uniform | nir_var_mem_global | nir_var_mem_constant, + glsl_get_cl_type_size_align); + + NIR_PASS_V(nir, dxil_nir_lower_ubo_to_temp); + NIR_PASS_V(nir, clc_lower_constant_to_ssbo, dxil->kernel, &uav_id); + NIR_PASS_V(nir, clc_lower_global_to_ssbo); + NIR_PASS_V(nir, dxil_nir_lower_deref_ssbo); + + NIR_PASS_V(nir, split_unaligned_loads_stores); + + assert(nir->info.cs.ptr_size == 64); + NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_ssbo, + nir_address_format_32bit_index_offset_pack64); + NIR_PASS_V(nir, nir_lower_explicit_io, + nir_var_mem_shared | nir_var_function_temp | nir_var_uniform, + nir_address_format_32bit_offset_as_64bit); + + NIR_PASS_V(nir, nir_lower_system_values); + + nir_lower_compute_system_values_options compute_options = { + .has_base_global_invocation_id = (conf && conf->support_global_work_id_offsets), + .has_base_work_group_id = (conf && conf->support_work_group_id_offsets), + }; + NIR_PASS_V(nir, nir_lower_compute_system_values, &compute_options); + + NIR_PASS_V(nir, clc_lower_64bit_semantics); + + NIR_PASS_V(nir, nir_opt_deref); + NIR_PASS_V(nir, nir_lower_vars_to_ssa); + + unsigned cbv_id = 0; + + nir_variable *inputs_var = + add_kernel_inputs_var(dxil, nir, &cbv_id); + nir_variable *work_properties_var = + add_work_properties_var(dxil, nir, &cbv_id); + + // Patch the localsize before calling clc_nir_lower_system_values(). 
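+   // E.g. (hypothetical values): a kernel compiled with reqd_work_group_size(8, 8, 1) accepts a runtime conf->local_size of {8, 8, 1} (or {0, 0, 0}, meaning keep the compiled size), while a mismatched request such as {4, 4, 1} trips the error path below.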
+ if (conf) { + for (unsigned i = 0; i < ARRAY_SIZE(nir->info.cs.local_size); i++) { + if (!conf->local_size[i] || + conf->local_size[i] == nir->info.cs.local_size[i]) + continue; + + if (nir->info.cs.local_size[i] && + nir->info.cs.local_size[i] != conf->local_size[i]) { + debug_printf("D3D12: runtime local size does not match reqd_work_group_size() values\n"); + goto err_free_dxil; + } + + nir->info.cs.local_size[i] = conf->local_size[i]; + } + } + + NIR_PASS_V(nir, clc_nir_lower_kernel_input_loads, inputs_var); + NIR_PASS_V(nir, split_unaligned_loads_stores); + NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_ubo, + nir_address_format_32bit_index_offset); + NIR_PASS_V(nir, clc_nir_lower_system_values, work_properties_var); + NIR_PASS_V(nir, dxil_nir_lower_loads_stores_to_dxil); + NIR_PASS_V(nir, dxil_nir_opt_alu_deref_srcs); + NIR_PASS_V(nir, dxil_nir_lower_atomics_to_dxil); + NIR_PASS_V(nir, dxil_nir_lower_fp16_casts); + NIR_PASS_V(nir, nir_lower_convert_alu_types, NULL); + + // Convert pack to pack_split + NIR_PASS_V(nir, nir_lower_pack); + // Lower pack_split to bit math + NIR_PASS_V(nir, nir_opt_algebraic); + + NIR_PASS_V(nir, nir_opt_dce); + + nir_validate_shader(nir, "Validate before feeding NIR to the DXIL compiler"); + struct nir_to_dxil_options opts = { + .interpolate_at_vertex = false, + .lower_int16 = (conf && (conf->lower_bit_size & 16) != 0), + .ubo_binding_offset = 0, + .disable_math_refactoring = true, + .num_kernel_globals = num_global_inputs, + }; + + for (unsigned i = 0; i < dxil->kernel->num_args; i++) { + if (dxil->kernel->args[i].address_qualifier != CLC_KERNEL_ARG_ADDRESS_LOCAL) + continue; + + /* If we don't have the runtime conf yet, we just create a dummy variable. + * This will be adjusted when clc_to_dxil() is called with a conf + * argument. + */ + unsigned size = 4; + if (conf && conf->args) + size = conf->args[i].localptr.size; + + /* The alignment required for the pointee type is not easy to get from + * here, so let's base our logic on the size itself. Anything bigger than + * the maximum alignment constraint (which is 128 bytes, since ulong16 or + * double16 are the biggest base types) should be aligned on this + * maximum alignment constraint. For smaller types, we use the size + * itself to calculate the alignment. + */ + unsigned alignment = size < 128 ? (1 << (ffs(size) - 1)) : 128; + + nir->info.cs.shared_size = align(nir->info.cs.shared_size, alignment); + metadata->args[i].localptr.sharedmem_offset = nir->info.cs.shared_size; + nir->info.cs.shared_size += size; + } + + metadata->local_mem_size = nir->info.cs.shared_size; + metadata->priv_mem_size = nir->scratch_size; + + /* DXIL double math is too limited compared to what NIR expects. Let's refuse + * to compile a shader when it contains double operations until we have + * double lowering hooked up.
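+    * (Any ALU op producing a 64-bit float result -- e.g. a double multiply in the kernel source -- trips the shader_has_double() check below.)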
+ */ + if (shader_has_double(nir)) { + clc_error(logger, "NIR shader contains doubles, which we don't support yet"); + goto err_free_dxil; + } + + struct blob tmp; + if (!nir_to_dxil(nir, &opts, &tmp)) { + debug_printf("D3D12: nir_to_dxil failed\n"); + goto err_free_dxil; + } + + memcpy(metadata->local_size, nir->info.cs.local_size, + sizeof(metadata->local_size)); + memcpy(metadata->local_size_hint, nir->info.cs.local_size_hint, + sizeof(metadata->local_size)); + + nir_foreach_variable_with_modes(var, nir, nir_var_mem_ssbo) { + if (var->constant_initializer) { + if (glsl_type_is_array(var->type)) { + int size = align(glsl_get_cl_size(var->type), 4); + uint8_t *data = malloc(size); + if (!data) + goto err_free_dxil; + + copy_const_initializer(var->constant_initializer, var->type, data); + metadata->consts[metadata->num_consts].data = data; + metadata->consts[metadata->num_consts].size = size; + metadata->consts[metadata->num_consts].uav_id = var->data.binding; + metadata->num_consts++; + } else + unreachable("unexpected constant initializer"); + } + } + + metadata->kernel_inputs_cbv_id = inputs_var ? inputs_var->data.binding : 0; + metadata->work_properties_cbv_id = work_properties_var->data.binding; + metadata->num_uavs = uav_id; + metadata->num_srvs = srv_id; + metadata->num_samplers = sampler_id; + + ralloc_free(nir); + glsl_type_singleton_decref(); + + blob_finish_get_buffer(&tmp, &dxil->binary.data, + &dxil->binary.size); + return dxil; + +err_free_dxil: + clc_free_dxil_object(dxil); + return NULL; +} + +void clc_free_dxil_object(struct clc_dxil_object *dxil) +{ + for (unsigned i = 0; i < dxil->metadata.num_consts; i++) + free(dxil->metadata.consts[i].data); + + free(dxil->binary.data); + free(dxil); +} + +uint64_t clc_compiler_get_version() +{ + const char sha1[] = MESA_GIT_SHA1; + const char* dash = strchr(sha1, '-'); + if (dash) { + return strtoull(dash + 1, NULL, 16); + } + return 0; +} diff --git a/src/microsoft/clc/clc_compiler.h b/src/microsoft/clc/clc_compiler.h new file mode 100644 index 00000000000..8b73d9edc48 --- /dev/null +++ b/src/microsoft/clc/clc_compiler.h @@ -0,0 +1,266 @@ +/* + * Copyright © Microsoft Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef CLC_COMPILER_H +#define CLC_COMPILER_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +struct clc_named_value { + const char *name; + const char *value; +}; + +struct clc_compile_args { + const struct clc_named_value *headers; + unsigned num_headers; + struct clc_named_value source; + const char * const *args; + unsigned num_args; +}; + +struct clc_linker_args { + const struct clc_object * const *in_objs; + unsigned num_in_objs; + unsigned create_library; +}; + +typedef void (*clc_msg_callback)(void *priv, const char *msg); + +struct clc_logger { + void *priv; + clc_msg_callback error; + clc_msg_callback warning; +}; + +struct spirv_binary { + uint32_t *data; + size_t size; +}; + +enum clc_kernel_arg_type_qualifier { + CLC_KERNEL_ARG_TYPE_CONST = 1 << 0, + CLC_KERNEL_ARG_TYPE_RESTRICT = 1 << 1, + CLC_KERNEL_ARG_TYPE_VOLATILE = 1 << 2, +}; + +enum clc_kernel_arg_access_qualifier { + CLC_KERNEL_ARG_ACCESS_READ = 1 << 0, + CLC_KERNEL_ARG_ACCESS_WRITE = 1 << 1, +}; + +enum clc_kernel_arg_address_qualifier { + CLC_KERNEL_ARG_ADDRESS_PRIVATE, + CLC_KERNEL_ARG_ADDRESS_CONSTANT, + CLC_KERNEL_ARG_ADDRESS_LOCAL, + CLC_KERNEL_ARG_ADDRESS_GLOBAL, +}; + +struct clc_kernel_arg { + const char *name; + const char *type_name; + unsigned type_qualifier; + unsigned access_qualifier; + enum clc_kernel_arg_address_qualifier address_qualifier; +}; + +enum clc_vec_hint_type { + CLC_VEC_HINT_TYPE_CHAR = 0, + CLC_VEC_HINT_TYPE_SHORT = 1, + CLC_VEC_HINT_TYPE_INT = 2, + CLC_VEC_HINT_TYPE_LONG = 3, + CLC_VEC_HINT_TYPE_HALF = 4, + CLC_VEC_HINT_TYPE_FLOAT = 5, + CLC_VEC_HINT_TYPE_DOUBLE = 6 +}; + +struct clc_kernel_info { + const char *name; + size_t num_args; + const struct clc_kernel_arg *args; + + unsigned vec_hint_size; + enum clc_vec_hint_type vec_hint_type; +}; + +struct clc_object { + struct spirv_binary spvbin; + const struct clc_kernel_info *kernels; + unsigned num_kernels; +}; + +#define CLC_MAX_CONSTS 32 +#define CLC_MAX_BINDINGS_PER_ARG 3 +#define CLC_MAX_SAMPLERS 16 + +struct clc_dxil_metadata { + struct { + unsigned offset; + unsigned size; + union { + struct { + unsigned buf_ids[CLC_MAX_BINDINGS_PER_ARG]; + unsigned num_buf_ids; + } image; + struct { + unsigned sampler_id; + } sampler; + struct { + unsigned buf_id; + } globconstptr; + struct { + unsigned sharedmem_offset; + } localptr; + }; + } *args; + unsigned kernel_inputs_cbv_id; + unsigned kernel_inputs_buf_size; + unsigned work_properties_cbv_id; + size_t num_uavs; + size_t num_srvs; + size_t num_samplers; + + struct { + void *data; + size_t size; + unsigned uav_id; + } consts[CLC_MAX_CONSTS]; + size_t num_consts; + + struct { + unsigned sampler_id; + unsigned addressing_mode; + unsigned normalized_coords; + unsigned filter_mode; + } const_samplers[CLC_MAX_SAMPLERS]; + size_t num_const_samplers; + size_t local_mem_size; + size_t priv_mem_size; + + uint16_t local_size[3]; + uint16_t local_size_hint[3]; + + int printf_uav_id; +}; + +struct clc_dxil_object { + const struct clc_kernel_info *kernel; + struct clc_dxil_metadata metadata; + struct { + void *data; + size_t size; + } binary; +}; + +struct clc_context { + const void *libclc_nir; +}; + +struct clc_context_options { + unsigned optimize; +}; + +struct clc_context *clc_context_new(const struct clc_logger *logger, const struct clc_context_options *options); + +void clc_free_context(struct clc_context *ctx); + +void clc_context_serialize(struct clc_context *ctx, void **serialized, size_t *size); +void clc_context_free_serialized(void *serialized); +struct 
clc_context *clc_context_deserialize(const void *serialized, size_t serialized_size); + +struct clc_object * +clc_compile(struct clc_context *ctx, + const struct clc_compile_args *args, + const struct clc_logger *logger); + +struct clc_object * +clc_link(struct clc_context *ctx, + const struct clc_linker_args *args, + const struct clc_logger *logger); + +void clc_free_object(struct clc_object *obj); + +struct clc_runtime_arg_info { + union { + struct { + unsigned size; + } localptr; + struct { + unsigned normalized_coords; + unsigned addressing_mode; /* See SPIR-V spec for value meanings */ + unsigned linear_filtering; + } sampler; + }; +}; + +struct clc_runtime_kernel_conf { + uint16_t local_size[3]; + struct clc_runtime_arg_info *args; + unsigned lower_bit_size; + unsigned support_global_work_id_offsets; + unsigned support_work_group_id_offsets; +}; + +struct clc_dxil_object * +clc_to_dxil(struct clc_context *ctx, + const struct clc_object *obj, + const char *entrypoint, + const struct clc_runtime_kernel_conf *conf, + const struct clc_logger *logger); + +void clc_free_dxil_object(struct clc_dxil_object *dxil); + +/* This struct describes the layout of data expected in the CB bound at work_properties_cbv_id */ +struct clc_work_properties_data { + /* Returned from get_global_offset(), and added into get_global_id() */ + unsigned global_offset_x; + unsigned global_offset_y; + unsigned global_offset_z; + /* Returned from get_work_dim() */ + unsigned work_dim; + /* The number of work groups being launched (i.e. the parameters to Dispatch). + * If the requested global size doesn't fit in a single Dispatch, these values should + * indicate the total number of groups that *should* have been launched. */ + unsigned group_count_total_x; + unsigned group_count_total_y; + unsigned group_count_total_z; + unsigned padding; + /* If the requested global size doesn't fit in a single Dispatch, subsequent dispatches + * should fill out these offsets to indicate how many groups have already been launched */ + unsigned group_id_offset_x; + unsigned group_id_offset_y; + unsigned group_id_offset_z; +}; + +uint64_t clc_compiler_get_version(void); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/microsoft/clc/clc_compiler_test.cpp b/src/microsoft/clc/clc_compiler_test.cpp new file mode 100644 index 00000000000..eb7509d4fe0 --- /dev/null +++ b/src/microsoft/clc/clc_compiler_test.cpp @@ -0,0 +1,2187 @@ +/* + * Copyright © Microsoft Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+
+#include <stdint.h>
+#include <math.h>
+#include <float.h>
+
+#include <limits>
+#include <vector>
+#include <gtest/gtest.h>
+
+#include "compute_test.h"
+
+using std::vector;
+
+TEST_F(ComputeTest, runtime_memcpy)
+{
+   struct shift { uint8_t val; uint8_t shift; uint16_t ret; };
+   const char *kernel_source =
+   "struct shift { uchar val; uchar shift; ushort ret; };\n\
+   __kernel void main_test(__global struct shift *inout)\n\
+   {\n\
+      uint id = get_global_id(0);\n\
+      uint id2 = id + get_global_id(1);\n\
+      struct shift lc[4] = { { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }};\n\
+      lc[id] = inout[id];\n\
+      inout[id2].ret = (ushort) lc[id2].val << (ushort) lc[id2].shift;\n\
+   }\n";
+
+   auto inout = ShaderArg<struct shift>({
+         { 0x10, 1, 0xffff },
+         { 0x20, 2, 0xffff },
+         { 0x30, 3, 0xffff },
+         { 0x40, 4, 0xffff },
+      },
+      SHADER_ARG_INOUT);
+   const uint16_t expected[] = { 0x20, 0x80, 0x180, 0x400 };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_EQ(inout[i].ret, expected[i]);
+}
+
+TEST_F(ComputeTest, two_global_arrays)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global uint *g1, __global uint *g2)\n\
+   {\n\
+      uint idx = get_global_id(0);\n\
+      g1[idx] -= g2[idx];\n\
+   }\n";
+   auto g1 = ShaderArg<uint32_t>({ 10, 20, 30, 40 }, SHADER_ARG_INOUT);
+   auto g2 = ShaderArg<uint32_t>({ 1, 2, 3, 4 }, SHADER_ARG_INPUT);
+   const uint32_t expected[] = {
+      9, 18, 27, 36
+   };
+
+   run_shader(kernel_source, g1.size(), 1, 1, g1, g2);
+   for (int i = 0; i < g1.size(); ++i)
+      EXPECT_EQ(g1[i], expected[i]);
+}
+
+TEST_F(ComputeTest, i64tof32)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global long *out, __constant long *in)\n\
+   {\n\
+      __local float tmp[12];\n\
+      uint idx = get_global_id(0);\n\
+      tmp[idx] = in[idx];\n\
+      barrier(CLK_LOCAL_MEM_FENCE);\n\
+      out[idx] = tmp[idx + get_global_id(1)];\n\
+   }\n";
+   auto in = ShaderArg<int64_t>({ 0x100000000LL,
+                                  -0x100000000LL,
+                                  0x7fffffffffffffffLL,
+                                  0x4000004000000000LL,
+                                  0x4000003fffffffffLL,
+                                  0x4000004000000001LL,
+                                  -1,
+                                  -0x4000004000000000LL,
+                                  -0x4000003fffffffffLL,
+                                  -0x4000004000000001LL,
+                                  0,
+                                  INT64_MIN },
+                                SHADER_ARG_INPUT);
+   auto out = ShaderArg<int64_t>(std::vector<int64_t>(12, 0xdeadbeed), SHADER_ARG_OUTPUT);
+   const int64_t expected[] = {
+      0x100000000LL,
+      -0x100000000LL,
+      0x7fffffffffffffffLL,
+      0x4000000000000000LL,
+      0x4000000000000000LL,
+      0x4000008000000000LL,
+      -1,
+      -0x4000000000000000LL,
+      -0x4000000000000000LL,
+      -0x4000008000000000LL,
+      0,
+      INT64_MIN,
+   };
+
+   run_shader(kernel_source, out.size(), 1, 1, out, in);
+   for (int i = 0; i < out.size(); ++i) {
+      EXPECT_EQ((int64_t)out[i], expected[i]);
+   }
+}
+
+TEST_F(ComputeTest, two_constant_arrays)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__constant uint *c1, __global uint *g1, __constant uint *c2)\n\
+   {\n\
+      uint idx = get_global_id(0);\n\
+      g1[idx] -= c1[idx] + c2[idx];\n\
+   }\n";
+   auto g1 = ShaderArg<uint32_t>({ 10, 20, 30, 40 }, SHADER_ARG_INOUT);
+   auto c1 = ShaderArg<uint32_t>({ 1, 2, 3, 4 }, SHADER_ARG_INPUT);
+   auto c2 = ShaderArg<uint32_t>(std::vector<uint32_t>(16384, 5), SHADER_ARG_INPUT);
+   const uint32_t expected[] = {
+      4, 13, 22, 31
+   };
+
+   run_shader(kernel_source, g1.size(), 1, 1, c1, g1, c2);
+   for (int i = 0; i < g1.size(); ++i)
+      EXPECT_EQ(g1[i], expected[i]);
+}
+
+TEST_F(ComputeTest, null_constant_ptr)
+{
+   const char
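+   /* Runs the same kernel twice: first with a real buffer bound to c1, then
+    * with NullShaderArg() so the kernel's null check must take the fallback
+    * constant array. */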
*kernel_source = + "__kernel void main_test(__global uint *g1, __constant uint *c1)\n\ + {\n\ + __constant uint fallback[] = {2, 3, 4, 5};\n\ + __constant uint *c = c1 ? c1 : fallback;\n\ + uint idx = get_global_id(0);\n\ + g1[idx] -= c[idx];\n\ + }\n"; + auto g1 = ShaderArg({ 10, 20, 30, 40 }, SHADER_ARG_INOUT); + auto c1 = ShaderArg({ 1, 2, 3, 4 }, SHADER_ARG_INPUT); + const uint32_t expected1[] = { + 9, 18, 27, 36 + }; + + run_shader(kernel_source, g1.size(), 1, 1, g1, c1); + for (int i = 0; i < g1.size(); ++i) + EXPECT_EQ(g1[i], expected1[i]); + + const uint32_t expected2[] = { + 8, 17, 26, 35 + }; + + g1 = ShaderArg({ 10, 20, 30, 40 }, SHADER_ARG_INOUT); + auto c2 = NullShaderArg(); + run_shader(kernel_source, g1.size(), 1, 1, g1, c2); + for (int i = 0; i < g1.size(); ++i) + EXPECT_EQ(g1[i], expected2[i]); +} + +/* This test seems to fail on older versions of WARP. */ +TEST_F(ComputeTest, DISABLED_null_global_ptr) +{ + const char *kernel_source = + "__kernel void main_test(__global uint *g1, __global uint *g2)\n\ + {\n\ + __constant uint fallback[] = {2, 3, 4, 5};\n\ + uint idx = get_global_id(0);\n\ + g1[idx] -= g2 ? g2[idx] : fallback[idx];\n\ + }\n"; + auto g1 = ShaderArg({ 10, 20, 30, 40 }, SHADER_ARG_INOUT); + auto g2 = ShaderArg({ 1, 2, 3, 4 }, SHADER_ARG_INPUT); + const uint32_t expected1[] = { + 9, 18, 27, 36 + }; + + run_shader(kernel_source, g1.size(), 1, 1, g1, g2); + for (int i = 0; i < g1.size(); ++i) + EXPECT_EQ(g1[i], expected1[i]); + + const uint32_t expected2[] = { + 8, 17, 26, 35 + }; + + g1 = ShaderArg({ 10, 20, 30, 40 }, SHADER_ARG_INOUT); + auto g2null = NullShaderArg(); + run_shader(kernel_source, g1.size(), 1, 1, g1, g2null); + for (int i = 0; i < g1.size(); ++i) + EXPECT_EQ(g1[i], expected2[i]); +} + +TEST_F(ComputeTest, ret_constant_ptr) +{ + struct s { uint64_t ptr; uint32_t val; }; + const char *kernel_source = + "struct s { __constant uint *ptr; uint val; };\n\ + __kernel void main_test(__global struct s *out, __constant uint *in)\n\ + {\n\ + __constant uint foo[] = { 1, 2 };\n\ + uint idx = get_global_id(0);\n\ + if (idx == 0)\n\ + out[idx].ptr = foo;\n\ + else\n\ + out[idx].ptr = in;\n\ + out[idx].val = out[idx].ptr[idx];\n\ + }\n"; + auto out = ShaderArg(std::vector(2, {0xdeadbeefdeadbeef, 0}), SHADER_ARG_OUTPUT); + auto in = ShaderArg({ 3, 4 }, SHADER_ARG_INPUT); + const uint32_t expected_val[] = { + 1, 4 + }; + const uint64_t expected_ptr[] = { + 2ull << 32, 1ull << 32 + }; + + run_shader(kernel_source, out.size(), 1, 1, out, in); + for (int i = 0; i < out.size(); ++i) { + EXPECT_EQ(out[i].val, expected_val[i]); + EXPECT_EQ(out[i].ptr, expected_ptr[i]); + } +} + +TEST_F(ComputeTest, ret_global_ptr) +{ + struct s { uint64_t ptr; uint32_t val; }; + const char *kernel_source = + "struct s { __global uint *ptr; uint val; };\n\ + __kernel void main_test(__global struct s *out, __global uint *in1, __global uint *in2)\n\ + {\n\ + uint idx = get_global_id(0);\n\ + out[idx].ptr = idx ? 
in2 : in1;\n\ + out[idx].val = out[idx].ptr[idx];\n\ + }\n"; + auto out = ShaderArg(std::vector(2, {0xdeadbeefdeadbeef, 0}), SHADER_ARG_OUTPUT); + auto in1 = ShaderArg({ 1, 2 }, SHADER_ARG_INPUT); + auto in2 = ShaderArg({ 3, 4 }, SHADER_ARG_INPUT); + const uint32_t expected_val[] = { + 1, 4 + }; + const uint64_t expected_ptr[] = { + 1ull << 32, 2ull << 32 + }; + + run_shader(kernel_source, out.size(), 1, 1, out, in1, in2); + for (int i = 0; i < out.size(); ++i) { + EXPECT_EQ(out[i].val, expected_val[i]); + EXPECT_EQ(out[i].ptr, expected_ptr[i]); + } +} + +TEST_F(ComputeTest, ret_local_ptr) +{ + struct s { uint64_t ptr; }; + const char *kernel_source = + "struct s { __local uint *ptr; };\n\ + __kernel void main_test(__global struct s *out)\n\ + {\n\ + __local uint tmp[2];\n\ + uint idx = get_global_id(0);\n\ + tmp[idx] = idx;\n\ + out[idx].ptr = &tmp[idx];\n\ + }\n"; + auto out = ShaderArg(std::vector(2, { 0xdeadbeefdeadbeef }), SHADER_ARG_OUTPUT); + const uint64_t expected_ptr[] = { + 0, 4, + }; + + run_shader(kernel_source, out.size(), 1, 1, out); + for (int i = 0; i < out.size(); ++i) { + EXPECT_EQ(out[i].ptr, expected_ptr[i]); + } +} + +TEST_F(ComputeTest, ret_private_ptr) +{ + struct s { uint64_t ptr; uint32_t value; }; + const char *kernel_source = + "struct s { __private uint *ptr; uint value; };\n\ + __kernel void main_test(__global struct s *out)\n\ + {\n\ + uint tmp[2] = {1, 2};\n\ + uint idx = get_global_id(0);\n\ + out[idx].ptr = &tmp[idx];\n\ + out[idx].value = *out[idx].ptr;\n\ + }\n"; + auto out = ShaderArg(std::vector(2, { 0xdeadbeefdeadbeef }), SHADER_ARG_OUTPUT); + const uint64_t expected_ptr[] = { + 0, 4, + }; + const uint32_t expected_value[] = { + 1, 2 + }; + + run_shader(kernel_source, out.size(), 1, 1, out); + for (int i = 0; i < out.size(); ++i) { + EXPECT_EQ(out[i].ptr, expected_ptr[i]); + } +} + +TEST_F(ComputeTest, globals_8bit) +{ + const char *kernel_source = + "__kernel void main_test(__global unsigned char *inout)\n\ + {\n\ + uint idx = get_global_id(0);\n\ + inout[idx] = inout[idx] + 1;\n\ + }\n"; + auto inout = ShaderArg ({ 100, 110, 120, 130 }, SHADER_ARG_INOUT); + const uint8_t expected[] = { + 101, 111, 121, 131 + }; + run_shader(kernel_source, inout.size(), 1, 1, inout); + for (int i = 0; i < inout.size(); ++i) + EXPECT_EQ(inout[i], expected[i]); +} + +TEST_F(ComputeTest, globals_16bit) +{ + const char *kernel_source = + "__kernel void main_test(__global unsigned short *inout)\n\ + {\n\ + uint idx = get_global_id(0);\n\ + inout[idx] = inout[idx] + 1;\n\ + }\n"; + auto inout = ShaderArg ({ 10000, 10010, 10020, 10030 }, SHADER_ARG_INOUT); + const uint16_t expected[] = { + 10001, 10011, 10021, 10031 + }; + run_shader(kernel_source, inout.size(), 1, 1, inout); + for (int i = 0; i < inout.size(); ++i) + EXPECT_EQ(inout[i], expected[i]); +} + +TEST_F(ComputeTest, DISABLED_globals_64bit) +{ + /* Test disabled, because we need a fixed version of WARP that hasn't + been officially shipped yet */ + + const char *kernel_source = + "__kernel void main_test(__global unsigned long *inout)\n\ + {\n\ + uint idx = get_global_id(0);\n\ + inout[idx] = inout[idx] + 1;\n\ + }\n"; + uint64_t base = 1ull << 50; + auto inout = ShaderArg({ base, base + 10, base + 20, base + 30 }, + SHADER_ARG_INOUT); + const uint64_t expected[] = { + base + 1, base + 11, base + 21, base + 31 + }; + run_shader(kernel_source, inout.size(), 1, 1, inout); + for (int i = 0; i < inout.size(); ++i) + EXPECT_EQ(inout[i], expected[i]); +} + +TEST_F(ComputeTest, built_ins_global_id) +{ + const char 
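+   /* A note on the ret_*_ptr expectations above: the expected_ptr values
+    * (0 and 4 for local/private, 1ull << 32 and 2ull << 32 for global and
+    * constant) suggest pointers are materialized as a 64-bit value packing a
+    * buffer index in the high 32 bits and a byte offset in the low 32 bits.
+    * That reading is inferred from the test data, not a guaranteed ABI. */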
*kernel_source = + "__kernel void main_test(__global uint *output)\n\ + {\n\ + output[get_global_id(0)] = get_global_id(0);\n\ + }\n"; + auto output = ShaderArg(std::vector(4, 0xdeadbeef), + SHADER_ARG_OUTPUT); + const uint32_t expected[] = { + 0, 1, 2, 3 + }; + + run_shader(kernel_source, output.size(), 1, 1, output); + for (int i = 0; i < output.size(); ++i) + EXPECT_EQ(output[i], expected[i]); +} + +TEST_F(ComputeTest, built_ins_global_id_rmw) +{ + const char *kernel_source = + "__kernel void main_test(__global uint *output)\n\ + {\n\ + uint id = get_global_id(0);\n\ + output[id] = output[id] * (id + 1);\n\ + }\n"; + auto inout = ShaderArg({0x00000001, 0x10000001, 0x00020002, 0x04010203}, + SHADER_ARG_INOUT); + const uint32_t expected[] = { + 0x00000001, 0x20000002, 0x00060006, 0x1004080c + }; + run_shader(kernel_source, inout.size(), 1, 1, inout); + for (int i = 0; i < inout.size(); ++i) + EXPECT_EQ(inout[i], expected[i]); +} + +TEST_F(ComputeTest, types_float_basics) +{ + const char *kernel_source = + "__kernel void main_test(__global uint *output)\n\ + {\n\ + output[get_global_id(0)] = (uint)((float)get_global_id(0) + 1.5f);\n\ + }\n"; + auto output = ShaderArg(std::vector(4, 0xdeadbeef), + SHADER_ARG_OUTPUT); + const uint32_t expected[] = { + 1, 2, 3, 4 + }; + run_shader(kernel_source, output.size(), 1, 1, output); + for (int i = 0; i < output.size(); ++i) + EXPECT_EQ(output[i], expected[i]); +} + +TEST_F(ComputeTest, DISABLED_types_double_basics) +{ + const char *kernel_source = + "__kernel void main_test(__global uint *output)\n\ + {\n\ + output[get_global_id(0)] = (uint)((double)get_global_id(0) + 1.5);\n\ + }\n"; + auto output = ShaderArg(std::vector(4, 0xdeadbeef), + SHADER_ARG_OUTPUT); + const uint32_t expected[] = { + 1, 2, 3, 4 + }; + run_shader(kernel_source, output.size(), 1, 1, output); + for (int i = 0; i < output.size(); ++i) + EXPECT_EQ(output[i], expected[i]); +} + +TEST_F(ComputeTest, types_short_basics) +{ + const char *kernel_source = + "__kernel void main_test(__global uint *output)\n\ + {\n\ + output[get_global_id(0)] = (uint)((short)get_global_id(0) + (short)1);\n\ + }\n"; + auto output = ShaderArg(std::vector(4, 0xdeadbeef), + SHADER_ARG_OUTPUT); + const uint32_t expected[] = { + 1, 2, 3, 4 + }; + run_shader(kernel_source, output.size(), 1, 1, output); + for (int i = 0; i < output.size(); ++i) + EXPECT_EQ(output[i], expected[i]); +} + +TEST_F(ComputeTest, types_char_basics) +{ + const char *kernel_source = + "__kernel void main_test(__global uint *output)\n\ + {\n\ + output[get_global_id(0)] = (uint)((char)get_global_id(0) + (char)1);\n\ + }\n"; + auto output = ShaderArg(std::vector(4, 0xdeadbeef), + SHADER_ARG_OUTPUT); + const uint32_t expected[] = { + 1, 2, 3, 4 + }; + run_shader(kernel_source, output.size(), 1, 1, output); + for (int i = 0; i < output.size(); ++i) + EXPECT_EQ(output[i], expected[i]); +} + +TEST_F(ComputeTest, types_if_statement) +{ + const char *kernel_source = + "__kernel void main_test(__global uint *output)\n\ + {\n\ + int idx = get_global_id(0);\n\ + if (idx > 0)\n\ + output[idx] = ~idx;\n\ + else\n\ + output[0] = 0xff;\n\ + }\n"; + auto output = ShaderArg(std::vector(4, 0xdeadbeef), + SHADER_ARG_OUTPUT); + const uint32_t expected[] = { + 0xff, ~1u, ~2u, ~3u + }; + run_shader(kernel_source, output.size(), 1, 1, output); + for (int i = 0; i < output.size(); ++i) + EXPECT_EQ(output[i], expected[i]); +} + +TEST_F(ComputeTest, types_do_while_loop) +{ + const char *kernel_source = + "__kernel void main_test(__global uint *output)\n\ + {\n\ + 
int value = 1;\n\ + int i = 1, n = get_global_id(0);\n\ + do {\n\ + value *= i++;\n\ + } while (i <= n);\n\ + output[n] = value;\n\ + }\n"; + auto output = ShaderArg(std::vector(5, 0xdeadbeef), + SHADER_ARG_OUTPUT); + const uint32_t expected[] = { + 1, 1, 1*2, 1*2*3, 1*2*3*4 + }; + run_shader(kernel_source, output.size(), 1, 1, output); + for (int i = 0; i < output.size(); ++i) + EXPECT_EQ(output[i], expected[i]); +} + +TEST_F(ComputeTest, types_for_loop) +{ + const char *kernel_source = + "__kernel void main_test(__global uint *output)\n\ + {\n\ + int value = 1;\n\ + int n = get_global_id(0);\n\ + for (int i = 1; i <= n; ++i)\n\ + value *= i;\n\ + output[n] = value;\n\ + }\n"; + auto output = ShaderArg(std::vector(5, 0xdeadbeef), + SHADER_ARG_OUTPUT); + const uint32_t expected[] = { + 1, 1, 1*2, 1*2*3, 1*2*3*4 + }; + run_shader(kernel_source, output.size(), 1, 1, output); + for (int i = 0; i < output.size(); ++i) + EXPECT_EQ(output[i], expected[i]); +} + +TEST_F(ComputeTest, DISABLED_complex_types_local_array_long) +{ + const char *kernel_source = + "__kernel void main_test(__global ulong *inout)\n\ + {\n\ + ushort tmp[] = {\n\ + get_global_id(1) + 0x00000000,\n\ + get_global_id(1) + 0x10000001,\n\ + get_global_id(1) + 0x20000020,\n\ + get_global_id(1) + 0x30000300,\n\ + };\n\ + uint idx = get_global_id(0);\n\ + inout[idx] = tmp[idx];\n\ + }\n"; + auto inout = ShaderArg({ 0, 0, 0, 0 }, SHADER_ARG_INOUT); + const uint16_t expected[] = { + 0x00000000, 0x10000001, 0x20000020, 0x30000300, + }; + run_shader(kernel_source, inout.size(), 1, 1, inout); + for (int i = 0; i < inout.size(); ++i) + EXPECT_EQ(inout[i], expected[i]); +} + +TEST_F(ComputeTest, complex_types_local_array_short) +{ + const char *kernel_source = + "__kernel void main_test(__global ushort *inout)\n\ + {\n\ + ushort tmp[] = {\n\ + get_global_id(1) + 0x00,\n\ + get_global_id(1) + 0x10,\n\ + get_global_id(1) + 0x20,\n\ + get_global_id(1) + 0x30,\n\ + };\n\ + uint idx = get_global_id(0);\n\ + inout[idx] = tmp[idx];\n\ + }\n"; + auto inout = ShaderArg({ 0, 0, 0, 0 }, SHADER_ARG_INOUT); + const uint16_t expected[] = { + 0x00, 0x10, 0x20, 0x30, + }; + run_shader(kernel_source, inout.size(), 1, 1, inout); + for (int i = 0; i < inout.size(); ++i) + EXPECT_EQ(inout[i], expected[i]); +} + +TEST_F(ComputeTest, complex_types_local_array_struct_vec_float_misaligned) +{ + const char *kernel_source = + "struct has_vecs { uchar c; ushort s; float2 f; };\n\ + __kernel void main_test(__global uint *inout)\n\ + {\n\ + struct has_vecs tmp[] = {\n\ + { 10 + get_global_id(0), get_global_id(1), { 10.0f, 1.0f } },\n\ + { 19 + get_global_id(0), get_global_id(1), { 20.0f, 4.0f } },\n\ + { 28 + get_global_id(0), get_global_id(1), { 30.0f, 9.0f } },\n\ + { 37 + get_global_id(0), get_global_id(1), { 40.0f, 16.0f } },\n\ + };\n\ + uint idx = get_global_id(0);\n\ + uint mul = (tmp[idx].c + tmp[idx].s) * trunc(tmp[idx].f[0]);\n\ + inout[idx] = mul + trunc(tmp[idx].f[1]);\n\ + }\n"; + auto inout = ShaderArg({ 0, 0, 0, 0 }, SHADER_ARG_INOUT); + const uint16_t expected[] = { 101, 404, 909, 1616 }; + run_shader(kernel_source, inout.size(), 1, 1, inout); + for (int i = 0; i < inout.size(); ++i) + EXPECT_EQ(inout[i], expected[i]); +} + +TEST_F(ComputeTest, complex_types_local_array) +{ + const char *kernel_source = + "__kernel void main_test(__global uint *inout)\n\ + {\n\ + uint tmp[] = {\n\ + get_global_id(1) + 0x00,\n\ + get_global_id(1) + 0x10,\n\ + get_global_id(1) + 0x20,\n\ + get_global_id(1) + 0x30,\n\ + };\n\ + uint idx = get_global_id(0);\n\ + 
inout[idx] = tmp[idx];\n\ + }\n"; + auto inout = ShaderArg({ 0, 0, 0, 0 }, SHADER_ARG_INOUT); + const uint32_t expected[] = { + 0x00, 0x10, 0x20, 0x30, + }; + run_shader(kernel_source, inout.size(), 1, 1, inout); + for (int i = 0; i < inout.size(); ++i) + EXPECT_EQ(inout[i], expected[i]); +} + +TEST_F(ComputeTest, complex_types_global_struct_array) +{ + struct two_vals { uint32_t add; uint32_t mul; }; + const char *kernel_source = + "struct two_vals { uint add; uint mul; };\n\ + __kernel void main_test(__global struct two_vals *in_out)\n\ + {\n\ + uint id = get_global_id(0);\n\ + in_out[id].add = in_out[id].add + id;\n\ + in_out[id].mul = in_out[id].mul * id;\n\ + }\n"; + auto inout = ShaderArg({ { 8, 8 }, { 16, 16 }, { 64, 64 }, { 65536, 65536 } }, + SHADER_ARG_INOUT); + const struct two_vals expected[] = { + { 8 + 0, 8 * 0 }, + { 16 + 1, 16 * 1 }, + { 64 + 2, 64 * 2 }, + { 65536 + 3, 65536 * 3 } + }; + run_shader(kernel_source, inout.size(), 1, 1, inout); + for (int i = 0; i < inout.size(); ++i) { + EXPECT_EQ(inout[i].add, expected[i].add); + EXPECT_EQ(inout[i].mul, expected[i].mul); + } +} + +TEST_F(ComputeTest, complex_types_global_uint2) +{ + struct uint2 { uint32_t x; uint32_t y; }; + const char *kernel_source = + "__kernel void main_test(__global uint2 *inout)\n\ + {\n\ + uint id = get_global_id(0);\n\ + inout[id].x = inout[id].x + id;\n\ + inout[id].y = inout[id].y * id;\n\ + }\n"; + auto inout = ShaderArg({ { 8, 8 }, { 16, 16 }, { 64, 64 }, { 65536, 65536 } }, + SHADER_ARG_INOUT); + const struct uint2 expected[] = { + { 8 + 0, 8 * 0 }, + { 16 + 1, 16 * 1 }, + { 64 + 2, 64 * 2 }, + { 65536 + 3, 65536 * 3 } + }; + run_shader(kernel_source, inout.size(), 1, 1, inout); + for (int i = 0; i < inout.size(); ++i) { + EXPECT_EQ(inout[i].x, expected[i].x); + EXPECT_EQ(inout[i].y, expected[i].y); + } +} + +TEST_F(ComputeTest, complex_types_global_ushort2) +{ + struct ushort2 { uint16_t x; uint16_t y; }; + const char *kernel_source = + "__kernel void main_test(__global ushort2 *inout)\n\ + {\n\ + uint id = get_global_id(0);\n\ + inout[id].x = inout[id].x + id;\n\ + inout[id].y = inout[id].y * id;\n\ + }\n"; + auto inout = ShaderArg({ { 8, 8 }, { 16, 16 }, { 64, 64 }, + { (uint16_t)65536, (uint16_t)65536 } }, + SHADER_ARG_INOUT); + const struct ushort2 expected[] = { + { 8 + 0, 8 * 0 }, + { 16 + 1, 16 * 1 }, + { 64 + 2, 64 * 2 }, + { (uint16_t)(65536 + 3), (uint16_t)(65536 * 3) } + }; + run_shader(kernel_source, inout.size(), 1, 1, inout); + for (int i = 0; i < inout.size(); ++i) { + EXPECT_EQ(inout[i].x, expected[i].x); + EXPECT_EQ(inout[i].y, expected[i].y); + } +} + +TEST_F(ComputeTest, complex_types_global_uchar3) +{ + struct uchar3 { uint8_t x; uint8_t y; uint8_t z; uint8_t pad; }; + const char *kernel_source = + "__kernel void main_test(__global uchar3 *inout)\n\ + {\n\ + uint id = get_global_id(0);\n\ + inout[id].x = inout[id].x + id;\n\ + inout[id].y = inout[id].y * id;\n\ + inout[id].z = inout[id].y + inout[id].x;\n\ + }\n"; + auto inout = ShaderArg({ { 8, 8, 8 }, { 16, 16, 16 }, { 64, 64, 64 }, { 255, 255, 255 } }, + SHADER_ARG_INOUT); + const struct uchar3 expected[] = { + { 8 + 0, 8 * 0, (8 + 0) + (8 * 0) }, + { 16 + 1, 16 * 1, (16 + 1) + (16 * 1) }, + { 64 + 2, 64 * 2, (64 + 2) + (64 * 2) }, + { (uint8_t)(255 + 3), (uint8_t)(255 * 3), (uint8_t)((255 + 3) + (255 * 3)) } + }; + run_shader(kernel_source, inout.size(), 1, 1, inout); + for (int i = 0; i < inout.size(); ++i) { + EXPECT_EQ(inout[i].x, expected[i].x); + EXPECT_EQ(inout[i].y, expected[i].y); + EXPECT_EQ(inout[i].z, 
expected[i].z); + } +} + +TEST_F(ComputeTest, complex_types_constant_uchar3) +{ + struct uchar3 { uint8_t x; uint8_t y; uint8_t z; uint8_t pad; }; + const char *kernel_source = + "__kernel void main_test(__global uchar3 *out, __constant uchar3 *in)\n\ + {\n\ + uint id = get_global_id(0);\n\ + out[id].x = in[id].x + id;\n\ + out[id].y = in[id].y * id;\n\ + out[id].z = out[id].y + out[id].x;\n\ + }\n"; + auto in = ShaderArg({ { 8, 8, 8 }, { 16, 16, 16 }, { 64, 64, 64 }, { 255, 255, 255 } }, + SHADER_ARG_INPUT); + auto out = ShaderArg(std::vector(4, { 0xff, 0xff, 0xff }), + SHADER_ARG_OUTPUT); + const struct uchar3 expected[] = { + { 8 + 0, 8 * 0, (8 + 0) + (8 * 0) }, + { 16 + 1, 16 * 1, (16 + 1) + (16 * 1) }, + { 64 + 2, 64 * 2, (64 + 2) + (64 * 2) }, + { (uint8_t)(255 + 3), (uint8_t)(255 * 3), (uint8_t)((255 + 3) + (255 * 3)) } + }; + run_shader(kernel_source, out.size(), 1, 1, out, in); + for (int i = 0; i < out.size(); ++i) { + EXPECT_EQ(out[i].x, expected[i].x); + EXPECT_EQ(out[i].y, expected[i].y); + EXPECT_EQ(out[i].z, expected[i].z); + } +} + +TEST_F(ComputeTest, complex_types_global_uint8) +{ + struct uint8 { + uint32_t s0; uint32_t s1; uint32_t s2; uint32_t s3; + uint32_t s4; uint32_t s5; uint32_t s6; uint32_t s7; + }; + const char *kernel_source = + "__kernel void main_test(__global uint8 *inout)\n\ + {\n\ + uint id = get_global_id(0);\n\ + inout[id].s01234567 = inout[id].s01234567 * 2;\n\ + }\n"; + auto inout = ShaderArg({ { 1, 2, 3, 4, 5, 6, 7, 8 } }, + SHADER_ARG_INOUT); + const struct uint8 expected[] = { + { 2, 4, 6, 8, 10, 12, 14, 16 } + }; + run_shader(kernel_source, inout.size(), 1, 1, inout); + for (int i = 0; i < inout.size(); ++i) { + EXPECT_EQ(inout[i].s0, expected[i].s0); + EXPECT_EQ(inout[i].s1, expected[i].s1); + EXPECT_EQ(inout[i].s2, expected[i].s2); + EXPECT_EQ(inout[i].s3, expected[i].s3); + EXPECT_EQ(inout[i].s4, expected[i].s4); + EXPECT_EQ(inout[i].s5, expected[i].s5); + EXPECT_EQ(inout[i].s6, expected[i].s6); + EXPECT_EQ(inout[i].s7, expected[i].s7); + } +} + +TEST_F(ComputeTest, complex_types_local_ulong16) +{ + struct ulong16 { + uint64_t values[16]; + }; + const char *kernel_source = + R"(__kernel void main_test(__global ulong16 *inout) + { + __local ulong16 local_array[2]; + uint id = get_global_id(0); + local_array[id] = inout[id]; + barrier(CLK_LOCAL_MEM_FENCE); + inout[id] = local_array[0] * 2; + })"; + auto inout = ShaderArg({ { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } }, + SHADER_ARG_INOUT); + const struct ulong16 expected[] = { + { 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 } + }; + run_shader(kernel_source, inout.size(), 1, 1, inout); + for (int i = 0; i < inout.size(); ++i) { + for (int j = 0; j < 16; ++j) { + EXPECT_EQ(inout[i].values[j], expected[i].values[j]); + } + } +} + +TEST_F(ComputeTest, complex_types_constant_uint8) +{ + struct uint8 { + uint32_t s0; uint32_t s1; uint32_t s2; uint32_t s3; + uint32_t s4; uint32_t s5; uint32_t s6; uint32_t s7; + }; + const char *kernel_source = + "__kernel void main_test(__global uint8 *out, __constant uint8 *in)\n\ + {\n\ + uint id = get_global_id(0);\n\ + out[id].s01234567 = in[id].s01234567 * 2;\n\ + }\n"; + auto in = ShaderArg({ { 1, 2, 3, 4, 5, 6, 7, 8 } }, + SHADER_ARG_INPUT); + auto out = ShaderArg({ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff } }, + SHADER_ARG_INOUT); + const struct uint8 expected[] = { + { 2, 4, 6, 8, 10, 12, 14, 16 } + }; + run_shader(kernel_source, out.size(), 1, 1, out, in); + for (int i = 0; i < out.size(); ++i) { + EXPECT_EQ(out[i].s0, 
expected[i].s0); + EXPECT_EQ(out[i].s1, expected[i].s1); + EXPECT_EQ(out[i].s2, expected[i].s2); + EXPECT_EQ(out[i].s3, expected[i].s3); + EXPECT_EQ(out[i].s4, expected[i].s4); + EXPECT_EQ(out[i].s5, expected[i].s5); + EXPECT_EQ(out[i].s6, expected[i].s6); + EXPECT_EQ(out[i].s7, expected[i].s7); + } +} + +TEST_F(ComputeTest, DISABLED_complex_types_const_array) +{ + /* DISABLED because current release versions of WARP either return + * rubbish from reads or crash: they are not prepared to handle + * non-float global constants */ + const char *kernel_source = + "__kernel void main_test(__global uint *output)\n\ + {\n\ + const uint foo[] = { 100, 101, 102, 103 };\n\ + output[get_global_id(0)] = foo[get_global_id(0) % 4];\n\ + }\n"; + auto output = ShaderArg(std::vector(4, 0xdeadbeef), + SHADER_ARG_OUTPUT); + const uint32_t expected[] = { + 100, 101, 102, 103 + }; + run_shader(kernel_source, output.size(), 1, 1, output); + for (int i = 0; i < output.size(); ++i) + EXPECT_EQ(output[i], expected[i]); +} + +TEST_F(ComputeTest, mem_access_load_store_ordering) +{ + const char *kernel_source = + "__kernel void main_test(__global uint *output)\n\ + {\n\ + uint foo[4];\n\ + foo[0] = 0x11111111;\n\ + foo[1] = 0x22222222;\n\ + foo[2] = 0x44444444;\n\ + foo[3] = 0x88888888;\n\ + foo[get_global_id(1)] -= 0x11111111; // foo[0] = 0 \n\ + foo[0] += get_global_id(0); // foo[0] = tid\n\ + foo[foo[get_global_id(1)]] = get_global_id(0); // foo[tid] = tid\n\ + output[get_global_id(0)] = foo[get_global_id(0)]; // output[tid] = tid\n\ + }\n"; + auto output = ShaderArg(std::vector(4, 0xdeadbeef), + SHADER_ARG_OUTPUT); + const uint16_t expected[] = { + 0, 1, 2, 3 + }; + run_shader(kernel_source, output.size(), 1, 1, output); + for (int i = 0; i < output.size(); ++i) + EXPECT_EQ(output[i], expected[i]); +} + +TEST_F(ComputeTest, DISABLED_two_const_arrays) +{ + /* DISABLED because current release versions of WARP either return + * rubbish from reads or crash: they are not prepared to handle + * non-float global constants */ + const char *kernel_source = + "__kernel void main_test(__global uint *output)\n\ + {\n\ + uint id = get_global_id(0);\n\ + uint foo[4] = {100, 101, 102, 103};\n\ + uint bar[4] = {1, 2, 3, 4};\n\ + output[id] = foo[id] * bar[id];\n\ + }\n"; + auto output = ShaderArg(std::vector(4, 0xdeadbeef), + SHADER_ARG_OUTPUT); + const uint32_t expected[] = { + 100, 202, 306, 412 + }; + run_shader(kernel_source, output.size(), 1, 1, output); + for (int i = 0; i < output.size(); ++i) + EXPECT_EQ(output[i], expected[i]); +} + +TEST_F(ComputeTest, imod_pos) +{ + const char *kernel_source = + "__kernel void main_test(__global int *inout)\n\ + {\n\ + inout[get_global_id(0)] = inout[get_global_id(0)] % 3;\n\ + }\n"; + auto inout = ShaderArg({ -4, -3, -2, -1, 0, 1, 2, 3, 4 }, + SHADER_ARG_INOUT); + const int32_t expected[] = { + -1, 0, -2, -1, 0, 1, 2, 0, 1 + }; + run_shader(kernel_source, inout.size(), 1, 1, inout); + for (int i = 0; i < inout.size(); ++i) + EXPECT_EQ(inout[i], expected[i]); +} + +TEST_F(ComputeTest, imod_neg) +{ + const char *kernel_source = + "__kernel void main_test(__global int *inout)\n\ + {\n\ + inout[get_global_id(0)] = inout[get_global_id(0)] % -3;\n\ + }\n"; + auto inout = ShaderArg({ -4, -3, -2, -1, 0, 1, 2, 3, 4 }, + SHADER_ARG_INOUT); + const int32_t expected[] = { + -1, 0, -2, -1, 0, 1, 2, 0, 1 + }; + run_shader(kernel_source, inout.size(), 1, 1, inout); + for (int i = 0; i < inout.size(); ++i) + EXPECT_EQ(inout[i], expected[i]); +} + +TEST_F(ComputeTest, umod) +{ + const char 
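+   /* As the imod tests above encode, OpenCL's % follows C semantics: the
+    * remainder takes the sign of the dividend and ignores the divisor's
+    * sign, e.g. -4 % 3 == -4 % -3 == -1 and 4 % 3 == 4 % -3 == 1. */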
*kernel_source = + "__kernel void main_test(__global uint *inout)\n\ + {\n\ + inout[get_global_id(0)] = inout[get_global_id(0)] % 0xfffffffc;\n\ + }\n"; + auto inout = ShaderArg({ 0xfffffffa, 0xfffffffb, 0xfffffffc, 0xfffffffd, 0xfffffffe }, + SHADER_ARG_INOUT); + const uint32_t expected[] = { + 0xfffffffa, 0xfffffffb, 0, 1, 2 + }; + run_shader(kernel_source, inout.size(), 1, 1, inout); + for (int i = 0; i < inout.size(); ++i) + EXPECT_EQ(inout[i], expected[i]); +} + +TEST_F(ComputeTest, rotate) +{ + const char *kernel_source = + "__kernel void main_test(__global uint *inout)\n\ + {\n\ + inout[get_global_id(0)] = rotate(inout[get_global_id(0)], (uint)get_global_id(0) * 4);\n\ + }\n"; + auto inout = ShaderArg(std::vector(4, 0xdeadbeef), + SHADER_ARG_INOUT); + const uint32_t expected[] = { + 0xdeadbeef, 0xeadbeefd, 0xadbeefde, 0xdbeefdea + }; + run_shader(kernel_source, inout.size(), 1, 1, inout); + for (int i = 0; i < inout.size(); ++i) + EXPECT_EQ(inout[i], expected[i]); +} + +TEST_F(ComputeTest, popcount) +{ + const char *kernel_source = + "__kernel void main_test(__global uint *inout)\n\ + {\n\ + inout[get_global_id(0)] = popcount(inout[get_global_id(0)]);\n\ + }\n"; + auto inout = ShaderArg({ 0, 0x1, 0x3, 0x101, 0x110011, ~0u }, + SHADER_ARG_INOUT); + const uint32_t expected[] = { + 0, 1, 2, 2, 4, 32 + }; + run_shader(kernel_source, inout.size(), 1, 1, inout); + for (int i = 0; i < inout.size(); ++i) + EXPECT_EQ(inout[i], expected[i]); +} + +TEST_F(ComputeTest, hadd) +{ + const char *kernel_source = + "__kernel void main_test(__global uint *inout)\n\ + {\n\ + inout[get_global_id(0)] = hadd(inout[get_global_id(0)], 1u << 31);\n\ + }\n"; + auto inout = ShaderArg({ 0, 1, 2, 3, 0xfffffffc, 0xfffffffd, 0xfffffffe, 0xffffffff }, + SHADER_ARG_INOUT); + const uint32_t expected[] = { + (1u << 31) >> 1, + ((1u << 31) + 1) >> 1, + ((1u << 31) + 2) >> 1, + ((1u << 31) + 3) >> 1, + ((1ull << 31) + 0xfffffffc) >> 1, + ((1ull << 31) + 0xfffffffd) >> 1, + ((1ull << 31) + 0xfffffffe) >> 1, + ((1ull << 31) + 0xffffffff) >> 1, + }; + run_shader(kernel_source, inout.size(), 1, 1, inout); + for (int i = 0; i < inout.size(); ++i) + EXPECT_EQ(inout[i], expected[i]); +} + +TEST_F(ComputeTest, rhadd) +{ + const char *kernel_source = + "__kernel void main_test(__global uint *inout)\n\ + {\n\ + inout[get_global_id(0)] = rhadd(inout[get_global_id(0)], 1u << 31);\n\ + }\n"; + auto inout = ShaderArg({ 0, 1, 2, 3, 0xfffffffc, 0xfffffffd, 0xfffffffe, 0xffffffff }, + SHADER_ARG_INOUT); + const uint32_t expected[] = { + ((1u << 31) + 1) >> 1, + ((1u << 31) + 2) >> 1, + ((1u << 31) + 3) >> 1, + ((1u << 31) + 4) >> 1, + ((1ull << 31) + 0xfffffffd) >> 1, + ((1ull << 31) + 0xfffffffe) >> 1, + ((1ull << 31) + 0xffffffff) >> 1, + ((1ull << 31) + (1ull << 32)) >> 1, + }; + run_shader(kernel_source, inout.size(), 1, 1, inout); + for (int i = 0; i < inout.size(); ++i) + EXPECT_EQ(inout[i], expected[i]); +} + +TEST_F(ComputeTest, add_sat) +{ + const char *kernel_source = + "__kernel void main_test(__global uint *inout)\n\ + {\n\ + inout[get_global_id(0)] = add_sat(inout[get_global_id(0)], 2u);\n\ + }\n"; + auto inout = ShaderArg({ 0xffffffff - 3, 0xffffffff - 2, 0xffffffff - 1, 0xffffffff }, + SHADER_ARG_INOUT); + const uint32_t expected[] = { + 0xffffffff - 1, 0xffffffff, 0xffffffff, 0xffffffff + }; + run_shader(kernel_source, inout.size(), 1, 1, inout); + for (int i = 0; i < inout.size(); ++i) + EXPECT_EQ(inout[i], expected[i]); +} + +TEST_F(ComputeTest, sub_sat) +{ + const char *kernel_source = + "__kernel void 
main_test(__global uint *inout)\n\ + {\n\ + inout[get_global_id(0)] = sub_sat(inout[get_global_id(0)], 2u);\n\ + }\n"; + auto inout = ShaderArg({ 0, 1, 2, 3 }, SHADER_ARG_INOUT); + const uint32_t expected[] = { + 0, 0, 0, 1 + }; + run_shader(kernel_source, inout.size(), 1, 1, inout); + for (int i = 0; i < inout.size(); ++i) + EXPECT_EQ(inout[i], expected[i]); +} + +TEST_F(ComputeTest, mul_hi) +{ + const char *kernel_source = + "__kernel void main_test(__global uint *inout)\n\ + {\n\ + inout[get_global_id(0)] = mul_hi(inout[get_global_id(0)], 1u << 31);\n\ + }\n"; + auto inout = ShaderArg({ 0, 1, 2, 3, (1u << 31) }, SHADER_ARG_INOUT); + const uint32_t expected[] = { + 0, 0, 1, 1, (1u << 30) + }; + run_shader(kernel_source, inout.size(), 1, 1, inout); + for (int i = 0; i < inout.size(); ++i) + EXPECT_EQ(inout[i], expected[i]); +} + +TEST_F(ComputeTest, ldexp_x) +{ + const char *kernel_source = + "__kernel void main_test(__global float *inout)\n\ + {\n\ + inout[get_global_id(0)] = ldexp(inout[get_global_id(0)], 5);\n\ + }\n"; + auto inout = ShaderArg({ 0.0f, 0.5f, 1.0f, 2.0f }, SHADER_ARG_INOUT); + const float expected[] = { + ldexp(0.0f, 5), ldexp(0.5f, 5), ldexp(1.0f, 5), ldexp(2.0f, 5) + }; + run_shader(kernel_source, inout.size(), 1, 1, inout); + for (int i = 0; i < inout.size(); ++i) + EXPECT_FLOAT_EQ(inout[i], expected[i]); +} + +TEST_F(ComputeTest, ldexp_y) +{ + const char *kernel_source = + "__kernel void main_test(__global float *inout)\n\ + {\n\ + inout[get_global_id(0)] = ldexp(inout[get_global_id(0)], get_global_id(0));\n\ + }\n"; + auto inout = ShaderArg({ 0.25f, 0.5f, 0.75f, 1.0f }, SHADER_ARG_INOUT); + const float expected[] = { + ldexp(0.25f, 0), ldexp(0.5f, 1), ldexp(0.75f, 2), ldexp(1.0f, 3) + }; + run_shader(kernel_source, inout.size(), 1, 1, inout); + for (int i = 0; i < inout.size(); ++i) + EXPECT_FLOAT_EQ(inout[i], expected[i]); +} + +TEST_F(ComputeTest, frexp_ret) +{ + const char *kernel_source = + "__kernel void main_test(__global float *inout)\n\ + {\n\ + int exp;\n\ + inout[get_global_id(0)] = frexp(inout[get_global_id(0)], &exp);\n\ + }\n"; + auto inout = ShaderArg({ 0.0f, 0.5f, 1.0f, 3.0f }, SHADER_ARG_INOUT); + const float expected[] = { + 0.0f, 0.5f, 0.5f, 0.75f + }; + run_shader(kernel_source, inout.size(), 1, 1, inout); + for (int i = 0; i < inout.size(); ++i) + EXPECT_FLOAT_EQ(inout[i], expected[i]); +} + +TEST_F(ComputeTest, frexp_exp) +{ + const char *kernel_source = + "__kernel void main_test(__global float *inout)\n\ + {\n\ + int exp;\n\ + frexp(inout[get_global_id(0)], &exp);\n\ + inout[get_global_id(0)] = (float)exp;\n\ + }\n"; + auto inout = ShaderArg({ 0.0f, 0.5f, 1.0f, 3.0f }, SHADER_ARG_INOUT); + const float expected[] = { + 0.0f, 0.0f, 1.0f, 2.0f + }; + run_shader(kernel_source, inout.size(), 1, 1, inout); + for (int i = 0; i < inout.size(); ++i) + EXPECT_FLOAT_EQ(inout[i], expected[i]); +} + +TEST_F(ComputeTest, clz) +{ + const char *kernel_source = + "__kernel void main_test(__global uint *inout)\n\ + {\n\ + inout[get_global_id(0)] = clz(inout[get_global_id(0)]);\n\ + }\n"; + auto inout = ShaderArg({ 0, 1, 0xffff, (1u << 30), (1u << 31) }, SHADER_ARG_INOUT); + const uint32_t expected[] = { + 32, 31, 16, 1, 0 + }; + run_shader(kernel_source, inout.size(), 1, 1, inout); + for (int i = 0; i < inout.size(); ++i) + EXPECT_FLOAT_EQ(inout[i], expected[i]); +} + +TEST_F(ComputeTest, sin) +{ + struct sin_vals { float in; float clc; float native; }; + const char *kernel_source = + "struct sin_vals { float in; float clc; float native; };\n\ + __kernel void 
main_test(__global struct sin_vals *inout)\n\
+   {\n\
+      inout[get_global_id(0)].clc = sin(inout[get_global_id(0)].in);\n\
+      inout[get_global_id(0)].native = native_sin(inout[get_global_id(0)].in);\n\
+   }\n";
+   const vector<struct sin_vals> input = {
+      { 0.0f, 0.0f, 0.0f },
+      { 1.0f, 0.0f, 0.0f },
+      { 2.0f, 0.0f, 0.0f },
+      { 3.0f, 0.0f, 0.0f },
+   };
+   auto inout = ShaderArg<struct sin_vals>(input, SHADER_ARG_INOUT);
+   const struct sin_vals expected[] = {
+      { 0.0f, 0.0f, 0.0f },
+      { 1.0f, sin(1.0f), sin(1.0f) },
+      { 2.0f, sin(2.0f), sin(2.0f) },
+      { 3.0f, sin(3.0f), sin(3.0f) },
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i) {
+      EXPECT_FLOAT_EQ(inout[i].in, expected[i].in);
+      EXPECT_FLOAT_EQ(inout[i].clc, expected[i].clc);
+      EXPECT_NEAR(inout[i].clc, inout[i].native, 0.008f); // range from DXIL spec
+   }
+}
+
+TEST_F(ComputeTest, DISABLED_cosh)
+{
+   /* Disabled because of WARP failures, where we fetch incorrect results when
+    * sourcing from non-float ICBs */
+   const char *kernel_source =
+   "__kernel void main_test(__global float *inout)\n\
+   {\n\
+      inout[get_global_id(0)] = cosh(inout[get_global_id(0)]);\n\
+   }\n";
+   auto inout = ShaderArg<float>({ 0.0f, 1.0f, 2.0f, 3.0f }, SHADER_ARG_INOUT);
+   const float expected[] = {
+      cosh(0.0f), cosh(1.0f), cosh(2.0f), cosh(3.0f)
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_FLOAT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, exp)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global float *inout)\n\
+   {\n\
+      inout[get_global_id(0)] = native_exp(inout[get_global_id(0)]);\n\
+   }\n";
+   auto inout = ShaderArg<float>({ 0.0f, 1.0f, 2.0f, 3.0f }, SHADER_ARG_INOUT);
+   const float expected[] = {
+      exp(0.0f), exp(1.0f), exp(2.0f), exp(3.0f)
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_FLOAT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, exp10)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global float *inout)\n\
+   {\n\
+      inout[get_global_id(0)] = native_exp10(inout[get_global_id(0)]);\n\
+   }\n";
+   auto inout = ShaderArg<float>({ 0.0f, 1.0f, 2.0f, 3.0f }, SHADER_ARG_INOUT);
+   const float expected[] = {
+      pow(10.0f, 0.0f), pow(10.0f, 1.0f), pow(10.0f, 2.0f), pow(10.0f, 3.0f)
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_FLOAT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, exp2)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global float *inout)\n\
+   {\n\
+      inout[get_global_id(0)] = native_exp2(inout[get_global_id(0)]);\n\
+   }\n";
+   auto inout = ShaderArg<float>({ 0.0f, 1.0f, 2.0f, 3.0f }, SHADER_ARG_INOUT);
+   const float expected[] = {
+      pow(2.0f, 0.0f), pow(2.0f, 1.0f), pow(2.0f, 2.0f), pow(2.0f, 3.0f)
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_FLOAT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, log)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global float *inout)\n\
+   {\n\
+      inout[get_global_id(0)] = native_log(inout[get_global_id(0)]);\n\
+   }\n";
+   auto inout = ShaderArg<float>({ 0.0f, 1.0f, 2.0f, 3.0f }, SHADER_ARG_INOUT);
+   const float expected[] = {
+      log(0.0f), log(1.0f), log(2.0f), log(3.0f)
+   };
+   run_shader(kernel_source, inout.size(), 1, 1, inout);
+   for (int i = 0; i < inout.size(); ++i)
+      EXPECT_FLOAT_EQ(inout[i], expected[i]);
+}
+
+TEST_F(ComputeTest, log10)
+{
+   const char *kernel_source =
+   "__kernel void \
main_test(__global float *inout)\n\ + {\n\ + inout[get_global_id(0)] = native_log10(inout[get_global_id(0)]);\n\ + }\n"; + auto inout = ShaderArg({ 0.0f, 1.0f, 2.0f, 3.0f }, SHADER_ARG_INOUT); + const float expected[] = { + log10(0.0f), log10(1.0f), log10(2.0f), log10(3.0f) + }; + run_shader(kernel_source, inout.size(), 1, 1, inout); + for (int i = 0; i < inout.size(); ++i) + EXPECT_FLOAT_EQ(inout[i], expected[i]); +} + +TEST_F(ComputeTest, log2) +{ + const char *kernel_source = + "__kernel void main_test(__global float *inout)\n\ + {\n\ + inout[get_global_id(0)] = native_log2(inout[get_global_id(0)]);\n\ + }\n"; + auto inout = ShaderArg({ 0.0f, 1.0f, 2.0f, 3.0f }, SHADER_ARG_INOUT); + const float expected[] = { + log(0.0f) / log(2), log(1.0f) / log(2), log(2.0f) / log(2), log(3.0f) / log(2) + }; + run_shader(kernel_source, inout.size(), 1, 1, inout); + for (int i = 0; i < inout.size(); ++i) + EXPECT_FLOAT_EQ(inout[i], expected[i]); +} + +TEST_F(ComputeTest, rint) +{ + const char *kernel_source = + "__kernel void main_test(__global float *inout)\n\ + {\n\ + inout[get_global_id(0)] = rint(inout[get_global_id(0)]);\n\ + }\n"; + + auto inout = ShaderArg({ 0.5f, 1.5f, -0.5f, -1.5f, 1.4f }, SHADER_ARG_INOUT); + const float expected[] = { + 0.0f, 2.0f, 0.0f, -2.0f, 1.0f, + }; + run_shader(kernel_source, inout.size(), 1, 1, inout); + for (int i = 0; i < inout.size(); ++i) + EXPECT_FLOAT_EQ(inout[i], expected[i]); +} + +TEST_F(ComputeTest, round) +{ + const char *kernel_source = + "__kernel void main_test(__global float *inout)\n\ + {\n\ + inout[get_global_id(0)] = round(inout[get_global_id(0)]);\n\ + }\n"; + auto inout = ShaderArg({ 0, 0.3f, -0.3f, 0.5f, -0.5f, 1.1f, -1.1f }, + SHADER_ARG_INOUT); + const float expected[] = { + 0.0f, 0.0f, -0.0f, 1.0f, -1.0f, 1.0f, -1.0f + }; + run_shader(kernel_source, inout.size(), 1, 1, inout); + for (int i = 0; i < inout.size(); ++i) + EXPECT_FLOAT_EQ(inout[i], expected[i]); +} + +TEST_F(ComputeTest, arg_by_val) +{ + const char *kernel_source = + "__kernel void main_test(__global float *inout, float mul)\n\ + {\n\ + inout[get_global_id(0)] = inout[get_global_id(0)] * mul;\n\ + }\n"; + auto inout = ShaderArg({ 0, 0.3f, -0.3f, 0.5f, -0.5f, 1.1f, -1.1f }, + SHADER_ARG_INOUT); + auto mul = ShaderArg(10.0f, SHADER_ARG_INPUT); + const float expected[] = { + 0.0f, 3.0f, -3.0f, 5.0f, -5.0f, 11.0f, -11.0f + }; + run_shader(kernel_source, inout.size(), 1, 1, inout, mul); + for (int i = 0; i < inout.size(); ++i) + EXPECT_FLOAT_EQ(inout[i], expected[i]); +} + +TEST_F(ComputeTest, uint8_by_val) +{ + struct uint8 { + uint32_t s0; uint32_t s1; uint32_t s2; uint32_t s3; + uint32_t s4; uint32_t s5; uint32_t s6; uint32_t s7; + }; + const char *kernel_source = + "__kernel void main_test(__global uint *out, uint8 val)\n\ + {\n\ + out[get_global_id(0)] = val.s0 + val.s1 + val.s2 + val.s3 +\n\ + val.s4 + val.s5 + val.s6 + val.s7;\n\ + }\n"; + auto out = ShaderArg({ 0 }, SHADER_ARG_OUTPUT); + auto val = ShaderArg({ {0, 1, 2, 3, 4, 5, 6, 7 }}, SHADER_ARG_INPUT); + const uint32_t expected[] = { 0 + 1 + 2 + 3 + 4 + 5 + 6 + 7 }; + run_shader(kernel_source, out.size(), 1, 1, out, val); + for (int i = 0; i < out.size(); ++i) + EXPECT_EQ(out[i], expected[i]); +} + +TEST_F(ComputeTest, link) +{ + const char *foo_src = + "float foo(float in)\n\ + {\n\ + return in * in;\n\ + }\n"; + const char *kernel_source = + "float foo(float in);\n\ + __kernel void main_test(__global float *inout)\n\ + {\n\ + inout[get_global_id(0)] = foo(inout[get_global_id(0)]);\n\ + }\n"; + std::vector srcs = { 
foo_src, kernel_source }; + auto inout = ShaderArg({ 2.0f }, SHADER_ARG_INOUT); + const float expected[] = { + 4.0f, + }; + run_shader(srcs, inout.size(), 1, 1, inout); + for (int i = 0; i < inout.size(); ++i) + EXPECT_EQ(inout[i], expected[i]); +} + +TEST_F(ComputeTest, link_library) +{ + const char *bar_src = + "float bar(float in)\n\ + {\n\ + return in * 5;\n\ + }\n"; + const char *foo_src = + "float bar(float in);\n\ + float foo(float in)\n\ + {\n\ + return in * bar(in);\n\ + }\n"; + const char *kernel_source = + "float foo(float in);\n\ + __kernel void main_test(__global float *inout)\n\ + {\n\ + inout[get_global_id(0)] = foo(inout[get_global_id(0)]);\n\ + }\n"; + std::vector libraries = { + compile({ bar_src, kernel_source }, {}, true), + compile({ foo_src }, {}, true) + }; + Shader exe = link(libraries); + auto inout = ShaderArg({ 2.0f }, SHADER_ARG_INOUT); + const float expected[] = { + 20.0f, + }; + run_shader(exe, { (unsigned)inout.size(), 1, 1 }, inout); + for (int i = 0; i < inout.size(); ++i) + EXPECT_EQ(inout[i], expected[i]); +} + +TEST_F(ComputeTest, localvar) +{ + const char *kernel_source = + "__kernel __attribute__((reqd_work_group_size(2, 1, 1)))\n\ + void main_test(__global float *inout)\n\ + {\n\ + __local float2 tmp[2];\n\ + tmp[get_local_id(0)].x = inout[get_global_id(0)] + 1;\n\ + tmp[get_local_id(0)].y = inout[get_global_id(0)] - 1;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ + inout[get_global_id(0)] = tmp[get_local_id(0) % 2].x * tmp[(get_local_id(0) + 1) % 2].y;\n\ + }\n"; + + auto inout = ShaderArg({ 2.0f, 4.0f }, SHADER_ARG_INOUT); + const float expected[] = { + 9.0f, 5.0f + }; + run_shader(kernel_source, inout.size(), 1, 1, inout); + for (int i = 0; i < inout.size(); ++i) + EXPECT_EQ(inout[i], expected[i]); +} + +TEST_F(ComputeTest, localvar_uchar2) +{ + const char *kernel_source = + "__attribute__((reqd_work_group_size(2, 1, 1)))\n\ + __kernel void main_test(__global uchar *inout)\n\ + {\n\ + __local uchar2 tmp[2];\n\ + tmp[get_local_id(0)].x = inout[get_global_id(0)] + 1;\n\ + tmp[get_local_id(0)].y = inout[get_global_id(0)] - 1;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ + inout[get_global_id(0)] = tmp[get_local_id(0) % 2].x * tmp[(get_local_id(0) + 1) % 2].y;\n\ + }\n"; + + auto inout = ShaderArg({ 2, 4 }, SHADER_ARG_INOUT); + const uint8_t expected[] = { 9, 5 }; + run_shader(kernel_source, inout.size(), 1, 1, inout); + for (int i = 0; i < inout.size(); ++i) + EXPECT_EQ(inout[i], expected[i]); +} + +TEST_F(ComputeTest, work_group_size_hint) +{ + const char *kernel_source = + "__attribute__((work_group_size_hint(2, 1, 1)))\n\ + __kernel void main_test(__global uint *output)\n\ + {\n\ + output[get_global_id(0)] = get_local_id(0);\n\ + }\n"; + auto output = ShaderArg(std::vector(4, 0xdeadbeef), + SHADER_ARG_OUTPUT); + const uint32_t expected[] = { + 0, 1, 2, 3 + }; + run_shader(kernel_source, output.size(), 1, 1, output); + for (int i = 0; i < output.size(); ++i) + EXPECT_EQ(output[i], expected[i]); +} + +TEST_F(ComputeTest, reqd_work_group_size) +{ + const char *kernel_source = + "__attribute__((reqd_work_group_size(2, 1, 1)))\n\ + __kernel void main_test(__global uint *output)\n\ + {\n\ + output[get_global_id(0)] = get_local_id(0);\n\ + }\n"; + auto output = ShaderArg(std::vector(4, 0xdeadbeef), + SHADER_ARG_OUTPUT); + const uint32_t expected[] = { + 0, 1, 0, 1 + }; + run_shader(kernel_source, output.size(), 1, 1, output); + for (int i = 0; i < output.size(); ++i) + EXPECT_EQ(output[i], expected[i]); +} + +TEST_F(ComputeTest, image) +{ + const char* kernel_source = 
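+   /* The image and sampler tests below only compile and validate the
+    * generated DXIL; unlike the buffer tests above they are never dispatched
+    * through run_shader. */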
+ "__kernel void main_test(read_only image2d_t input, write_only image2d_t output)\n\ + {\n\ + int2 coords = (int2)(get_global_id(0), get_global_id(1));\n\ + write_imagef(output, coords, read_imagef(input, coords));\n\ + }\n"; + Shader shader = compile(std::vector({ kernel_source })); + validate(shader); +} + +TEST_F(ComputeTest, image_two_reads) +{ + const char* kernel_source = + "__kernel void main_test(image2d_t image, int is_float, __global float* output)\n\ + {\n\ + if (is_float)\n\ + output[get_global_id(0)] = read_imagef(image, (int2)(0, 0)).x;\n\ + else \n\ + output[get_global_id(0)] = (float)read_imagei(image, (int2)(0, 0)).x;\n\ + }\n"; + Shader shader = compile(std::vector({ kernel_source })); + validate(shader); +} + +TEST_F(ComputeTest, sampler) +{ + const char* kernel_source = + "__kernel void main_test(image2d_t image, sampler_t sampler, __global float* output)\n\ + {\n\ + output[get_global_id(0)] = read_imagef(image, sampler, (int2)(0, 0)).x;\n\ + }\n"; + Shader shader = compile(std::vector({ kernel_source })); + validate(shader); +} + +TEST_F(ComputeTest, image_dims) +{ + const char* kernel_source = + "__kernel void main_test(image2d_t roimage, write_only image2d_t woimage, __global uint* output)\n\ + {\n\ + output[get_global_id(0)] = get_image_width(roimage);\n\ + output[get_global_id(0) + 1] = get_image_width(woimage);\n\ + }\n"; + Shader shader = compile(std::vector({ kernel_source })); + validate(shader); +} + +TEST_F(ComputeTest, image_format) +{ + const char* kernel_source = + "__kernel void main_test(image2d_t roimage, write_only image2d_t woimage, __global uint* output)\n\ + {\n\ + output[get_global_id(0)] = get_image_channel_data_type(roimage);\n\ + output[get_global_id(0) + 1] = get_image_channel_order(woimage);\n\ + }\n"; + Shader shader = compile(std::vector({ kernel_source })); + validate(shader); +} + +TEST_F(ComputeTest, image1d_buffer_t) +{ + const char* kernel_source = + "__kernel void main_test(read_only image1d_buffer_t input, write_only image1d_buffer_t output)\n\ + {\n\ + write_imageui(output, get_global_id(0), read_imageui(input, get_global_id(0)));\n\ + }\n"; + Shader shader = compile(std::vector({ kernel_source })); + validate(shader); +} + +TEST_F(ComputeTest, local_ptr) +{ + struct uint2 { uint32_t x, y; }; + const char *kernel_source = + "__kernel void main_test(__global uint *inout, __local uint2 *tmp)\n\ + {\n\ + tmp[get_local_id(0)].x = inout[get_global_id(0)] + 1;\n\ + tmp[get_local_id(0)].y = inout[get_global_id(0)] - 1;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ + inout[get_global_id(0)] = tmp[get_local_id(0) % 2].x * tmp[(get_local_id(0) + 1) % 2].y;\n\ + }\n"; + auto inout = ShaderArg({ 2, 4 }, SHADER_ARG_INOUT); + auto tmp = ShaderArg(std::vector(4096), SHADER_ARG_INPUT); + const uint8_t expected[] = { 9, 5 }; + run_shader(kernel_source, inout.size(), 1, 1, inout, tmp); + for (int i = 0; i < inout.size(); ++i) + EXPECT_EQ(inout[i], expected[i]); +} + +TEST_F(ComputeTest, two_local_ptrs) +{ + struct uint2 { uint32_t x, y; }; + const char *kernel_source = + "__kernel void main_test(__global uint *inout, __local uint2 *tmp, __local uint *tmp2)\n\ + {\n\ + tmp[get_local_id(0)].x = inout[get_global_id(0)] + 1;\n\ + tmp[get_local_id(0)].y = inout[get_global_id(0)] - 1;\n\ + tmp2[get_local_id(0)] = get_global_id(0);\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ + inout[get_global_id(0)] = tmp[get_local_id(0) % 2].x * tmp[(get_local_id(0) + 1) % 2].y + tmp2[get_local_id(0) % 2];\n\ + }\n"; + auto inout = ShaderArg({ 2, 4 }, SHADER_ARG_INOUT); + auto tmp = 
ShaderArg(std::vector(1024), SHADER_ARG_INPUT); + auto tmp2 = ShaderArg(std::vector(1024), SHADER_ARG_INPUT); + const uint8_t expected[] = { 9, 6 }; + run_shader(kernel_source, inout.size(), 1, 1, inout, tmp, tmp2); + for (int i = 0; i < inout.size(); ++i) + EXPECT_EQ(inout[i], expected[i]); +} + +TEST_F(ComputeTest, int8_to_float) +{ + const char *kernel_source = + "__kernel void main_test(__global char* in, __global float* out)\n\ + {\n\ + uint pos = get_global_id(0);\n\ + out[pos] = in[pos] / 100.0f;\n\ + }"; + auto in = ShaderArg({ 10, 20, 30, 40 }, SHADER_ARG_INPUT); + auto out = ShaderArg(std::vector(4, std::numeric_limits::infinity()), SHADER_ARG_OUTPUT); + const float expected[] = { 0.1f, 0.2f, 0.3f, 0.4f }; + run_shader(kernel_source, in.size(), 1, 1, in, out); + for (int i = 0; i < in.size(); ++i) + EXPECT_FLOAT_EQ(out[i], expected[i]); +} + +TEST_F(ComputeTest, vec_hint_float4) +{ + const char *kernel_source = + "__kernel __attribute__((vec_type_hint(float4))) void main_test(__global float *inout)\n\ + {\n\ + inout[get_global_id(0)] *= inout[get_global_id(1)];\n\ + }"; + Shader shader = compile({ kernel_source }); + EXPECT_EQ(shader.obj->kernels[0].vec_hint_size, 4); + EXPECT_EQ(shader.obj->kernels[0].vec_hint_type, CLC_VEC_HINT_TYPE_FLOAT); +} + +TEST_F(ComputeTest, vec_hint_uchar2) +{ + const char *kernel_source = + "__kernel __attribute__((vec_type_hint(uchar2))) void main_test(__global float *inout)\n\ + {\n\ + inout[get_global_id(0)] *= inout[get_global_id(1)];\n\ + }"; + Shader shader = compile({ kernel_source }); + EXPECT_EQ(shader.obj->kernels[0].vec_hint_size, 2); + EXPECT_EQ(shader.obj->kernels[0].vec_hint_type, CLC_VEC_HINT_TYPE_CHAR); +} + +TEST_F(ComputeTest, vec_hint_none) +{ + const char *kernel_source = + "__kernel void main_test(__global float *inout)\n\ + {\n\ + inout[get_global_id(0)] *= inout[get_global_id(1)];\n\ + }"; + Shader shader = compile({ kernel_source }); + EXPECT_EQ(shader.obj->kernels[0].vec_hint_size, 0); +} + +TEST_F(ComputeTest, DISABLED_debug_layer_failure) +{ + const char *kernel_source = + "__kernel void main_test(__global float *inout, float mul)\n\ + {\n\ + inout[get_global_id(0)] = inout[get_global_id(0)] * mul;\n\ + }\n"; + auto inout = ShaderArg({ 0, 0.3f, -0.3f, 0.5f, -0.5f, 1.1f, -1.1f }, + SHADER_ARG_INOUT); + auto mul = ShaderArg(10.0f, SHADER_ARG_INPUT); + const float expected[] = { + 0.0f, 3.0f, -3.0f, 5.0f, -5.0f, 11.0f, -11.0f + }; + ComPtr info_queue; + dev->QueryInterface(info_queue.ReleaseAndGetAddressOf()); + if (!info_queue) { + GTEST_SKIP() << "No info queue"; + return; + } + + info_queue->AddApplicationMessage(D3D12_MESSAGE_SEVERITY_ERROR, "This should cause the test to fail"); + run_shader(kernel_source, inout.size(), 1, 1, inout, mul); + for (int i = 0; i < inout.size(); ++i) + EXPECT_FLOAT_EQ(inout[i], expected[i]); +} + +TEST_F(ComputeTest, compiler_defines) +{ + const char *kernel_source = + "__kernel void main_test(__global int* out)\n\ + {\n\ + out[0] = OUT_VAL0;\n\ + out[1] = __OPENCL_C_VERSION__;\n\ + }"; + auto out = ShaderArg(std::vector(2, 0), SHADER_ARG_OUTPUT); + CompileArgs compile_args = { 1, 1, 1 }; + compile_args.compiler_command_line = { "-DOUT_VAL0=5", "-cl-std=cl" }; + std::vector raw_args = { &out }; + run_shader({ kernel_source }, compile_args, out); + EXPECT_EQ(out[0], 5); + EXPECT_EQ(out[1], 100); +} + +/* There's a bug in WARP turning atomic_add(ptr, x) into + * atomic_add(ptr, x * 4). Works fine on intel HW. 
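+ * Concretely: atomic_add(inout + gid, 3) on { 2, 4 } should leave { 5, 7 }
+ * behind and return { 2, 4 }; under the WARP bug the add behaves like +12,
+ * so the buffer would read back { 14, 16 } instead.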
+ */ +TEST_F(ComputeTest, DISABLED_global_atomic_add) +{ + const char *kernel_source = + "__kernel void main_test(__global int *inout, __global int *old)\n\ + {\n\ + old[get_global_id(0)] = atomic_add(inout + get_global_id(0), 3);\n\ + }\n"; + auto inout = ShaderArg({ 2, 4 }, SHADER_ARG_INOUT); + auto old = ShaderArg(std::vector(2, 0xdeadbeef), SHADER_ARG_OUTPUT); + const int32_t expected_inout[] = { 5, 7 }; + const int32_t expected_old[] = { 2, 4 }; + run_shader(kernel_source, inout.size(), 1, 1, inout, old); + for (int i = 0; i < inout.size(); ++i) { + EXPECT_EQ(inout[i], expected_inout[i]); + EXPECT_EQ(old[i], expected_old[i]); + } +} + +TEST_F(ComputeTest, global_atomic_imin) +{ + const char *kernel_source = + "__kernel void main_test(__global int *inout, __global int *old)\n\ + {\n\ + old[get_global_id(0)] = atomic_min(inout + get_global_id(0), 1);\n\ + }\n"; + auto inout = ShaderArg({ 0, 2, -1 }, SHADER_ARG_INOUT); + auto old = ShaderArg(std::vector(3, 0xdeadbeef), SHADER_ARG_OUTPUT); + const int32_t expected_inout[] = { 0, 1, -1 }; + const int32_t expected_old[] = { 0, 2, -1 }; + run_shader(kernel_source, inout.size(), 1, 1, inout, old); + for (int i = 0; i < inout.size(); ++i) { + EXPECT_EQ(inout[i], expected_inout[i]); + EXPECT_EQ(old[i], expected_old[i]); + } +} + +TEST_F(ComputeTest, global_atomic_and_or) +{ + const char *kernel_source = + "__attribute__((reqd_work_group_size(3, 1, 1)))\n\ + __kernel void main_test(__global int *inout)\n\ + {\n\ + atomic_and(inout, ~(1 << get_global_id(0)));\n\ + atomic_or(inout, (1 << (get_global_id(0) + 4)));\n\ + }\n"; + auto inout = ShaderArg(0xf, SHADER_ARG_INOUT); + const int32_t expected[] = { 0x78 }; + run_shader(kernel_source, 3, 1, 1, inout); + for (int i = 0; i < inout.size(); ++i) + EXPECT_EQ(inout[i], expected[i]); +} + +TEST_F(ComputeTest, global_atomic_cmpxchg) +{ + const char *kernel_source = + "__attribute__((reqd_work_group_size(2, 1, 1)))\n\ + __kernel void main_test(__global int *inout)\n\ + {\n\ + while (atomic_cmpxchg(inout, get_global_id(0), get_global_id(0) + 1) != get_global_id(0))\n\ + ;\n\ + }\n"; + auto inout = ShaderArg(0, SHADER_ARG_INOUT); + const int32_t expected_inout[] = { 2 }; + run_shader(kernel_source, 2, 1, 1, inout); + for (int i = 0; i < inout.size(); ++i) + EXPECT_EQ(inout[i], expected_inout[i]); +} + +TEST_F(ComputeTest, local_atomic_and_or) +{ + const char *kernel_source = + "__attribute__((reqd_work_group_size(2, 1, 1)))\n\ + __kernel void main_test(__global ushort *inout)\n\ + {\n\ + __local ushort tmp;\n\ + atomic_and(&tmp, ~(0xff << (get_global_id(0) * 8)));\n\ + atomic_or(&tmp, inout[get_global_id(0)] << (get_global_id(0) * 8));\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ + inout[get_global_id(0)] = tmp;\n\ + }\n"; + auto inout = ShaderArg({ 2, 4 }, SHADER_ARG_INOUT); + const uint16_t expected[] = { 0x402, 0x402 }; + run_shader(kernel_source, inout.size(), 1, 1, inout); + for (int i = 0; i < inout.size(); ++i) + EXPECT_EQ(inout[i], expected[i]); +} + +TEST_F(ComputeTest, local_atomic_cmpxchg) +{ + const char *kernel_source = + "__attribute__((reqd_work_group_size(2, 1, 1)))\n\ + __kernel void main_test(__global int *out)\n\ + {\n\ + __local uint tmp;\n\ + tmp = 0;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ + while (atomic_cmpxchg(&tmp, get_global_id(0), get_global_id(0) + 1) != get_global_id(0))\n\ + ;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ + out[0] = tmp;\n\ + }\n"; + + auto out = ShaderArg(0xdeadbeef, SHADER_ARG_OUTPUT); + const uint16_t expected[] = { 2 }; + run_shader(kernel_source, 2, 1, 1, out); + for 
(int i = 0; i < out.size(); ++i)
+      EXPECT_EQ(out[i], expected[i]);
+}
+
+TEST_F(ComputeTest, constant_sampler)
+{
+   const char *kernel_source =
+   "__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_LINEAR;\n\
+   __kernel void main_test(read_only image2d_t input, write_only image2d_t output)\n\
+   {\n\
+      int2 coordsi = (int2)(get_global_id(0), get_global_id(1));\n\
+      float2 coordsf = (float2)((float)coordsi.x / get_image_width(input), (float)coordsi.y / get_image_height(input));\n\
+      write_imagef(output, coordsi, \n\
+                   read_imagef(input, sampler, coordsf) + \n\
+                   read_imagef(input, sampler, coordsf + (float2)(0.1, 0.1)));\n\
+   }\n";
+   Shader shader = compile(std::vector<const char *>({ kernel_source }));
+   validate(shader);
+   EXPECT_EQ(shader.dxil->metadata.num_const_samplers, 1);
+}
+
+TEST_F(ComputeTest, hi)
+{
+   const char *kernel_source = R"(
+   __kernel void main_test(__global char3 *srcA, __global char2 *dst)
+   {
+      int tid = get_global_id(0);
+
+      char2 tmp = srcA[tid].hi;
+      dst[tid] = tmp;
+   })";
+   Shader shader = compile(std::vector<const char *>({ kernel_source }));
+   validate(shader);
+}
+
+TEST_F(ComputeTest, system_values)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global uint* outputs)\n\
+   {\n\
+      outputs[0] = get_work_dim();\n\
+      outputs[1] = get_global_size(0);\n\
+      outputs[2] = get_local_size(0);\n\
+      outputs[3] = get_num_groups(0);\n\
+      outputs[4] = get_group_id(0);\n\
+      outputs[5] = get_global_offset(0);\n\
+      outputs[6] = get_global_id(0);\n\
+   }\n";
+   auto out = ShaderArg<uint32_t>(std::vector<uint32_t>(7, 0xdeadbeef), SHADER_ARG_OUTPUT);
+   const uint32_t expected[] = { 3, 1, 1, 1, 0, 0, 0, };
+   CompileArgs args = { 1, 1, 1 };
+   Shader shader = compile({ kernel_source });
+   run_shader(shader, args, out);
+   for (int i = 0; i < out.size(); ++i)
+      EXPECT_EQ(out[i], expected[i]);
+
+   args.work_props.work_dim = 2;
+   args.work_props.global_offset_x = 100;
+   args.work_props.group_id_offset_x = 2;
+   args.work_props.group_count_total_x = 5;
+   const uint32_t expected_withoffsets[] = { 2, 5, 1, 5, 2, 100, 102 };
+   run_shader(shader, args, out);
+   for (int i = 0; i < out.size(); ++i)
+      EXPECT_EQ(out[i], expected_withoffsets[i]);
+}
+
+TEST_F(ComputeTest, convert_round_sat)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global float *f, __global uchar *u)\n\
+   {\n\
+      uint idx = get_global_id(0);\n\
+      u[idx] = convert_uchar_sat_rtp(f[idx]);\n\
+   }\n";
+   auto f = ShaderArg<float>({ -1.0f, 1.1f, 20.0f, 255.5f }, SHADER_ARG_INPUT);
+   auto u = ShaderArg<uint8_t>({ 255, 0, 0, 0 }, SHADER_ARG_OUTPUT);
+   const uint8_t expected[] = {
+      0, 2, 20, 255
+   };
+
+   run_shader(kernel_source, f.size(), 1, 1, f, u);
+   for (int i = 0; i < u.size(); ++i)
+      EXPECT_EQ(u[i], expected[i]);
+}
+
+TEST_F(ComputeTest, convert_round_sat_vec)
+{
+   const char *kernel_source =
+   "__kernel void main_test(__global float16 *f, __global uchar16 *u)\n\
+   {\n\
+      uint idx = get_global_id(0);\n\
+      u[idx] = convert_uchar16_sat_rtp(f[idx]);\n\
+   }\n";
+   auto f = ShaderArg<float>({
+      -1.0f, 1.1f, 20.0f, 255.5f, -1.0f, 1.1f, 20.0f, 255.5f, -1.0f, 1.1f, 20.0f, 255.5f, -1.0f, 1.1f, 20.0f, 255.5f,
+      -0.5f, 1.9f, 20.0f, 254.5f, -1.0f, 1.1f, 20.0f, 255.5f, -1.0f, 1.1f, 20.0f, 255.5f, -1.0f, 1.1f, 20.0f, 255.5f,
+      0.0f, 1.3f, 20.0f, 255.1f, -1.0f, 1.1f, 20.0f, 255.5f, -1.0f, 1.1f, 20.0f, 255.5f, -1.0f, 1.1f, 20.0f, 255.5f,
+      -0.0f, 1.5555f, 20.0f, 254.9f, -1.0f, 1.1f, 20.0f, 255.5f, -1.0f, 1.1f, 20.0f, 255.5f, -1.0f, 1.1f, 20.0f, 255.5f,
+   }, SHADER_ARG_INPUT);
+   auto u = ShaderArg<uint8_t>({
+      255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255,
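+      /* convert_uchar16_sat_rtp rounds toward +infinity, then saturates:
+       * -1.0f -> 0 (saturated low), 1.1f -> 2 (rounded up), 255.5f -> 255
+       * (saturated high). */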
0, 0, 0, + 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, + 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, + 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, + }, SHADER_ARG_OUTPUT); + const uint8_t expected[] = { + 0, 2, 20, 255, 0, 2, 20, 255, 0, 2, 20, 255, 0, 2, 20, 255, + 0, 2, 20, 255, 0, 2, 20, 255, 0, 2, 20, 255, 0, 2, 20, 255, + 0, 2, 20, 255, 0, 2, 20, 255, 0, 2, 20, 255, 0, 2, 20, 255, + 0, 2, 20, 255, 0, 2, 20, 255, 0, 2, 20, 255, 0, 2, 20, 255, + }; + + run_shader(kernel_source, 4, 1, 1, f, u); + for (int i = 0; i < u.size(); ++i) + EXPECT_EQ(u[i], expected[i]); +} + +TEST_F(ComputeTest, convert_char2_uchar2) +{ + const char *kernel_source = + "__kernel void main_test( __global char2 *src, __global uchar2 *dest )\n\ + {\n\ + size_t i = get_global_id(0);\n\ + dest[i] = convert_uchar2_sat( src[i] );\n\ + }\n"; + + auto c = ShaderArg({ -127, -4, 0, 4, 126, 127, 16, 32 }, SHADER_ARG_INPUT); + auto u = ShaderArg({ 99, 99, 99, 99, 99, 99, 99, 99 }, SHADER_ARG_OUTPUT); + const uint8_t expected[] = { 0, 0, 0, 4, 126, 127, 16, 32 }; + run_shader(kernel_source, 4, 1, 1, c, u); + for (int i = 0; i < u.size(); i++) + EXPECT_EQ(u[i], expected[i]); +} + +TEST_F(ComputeTest, async_copy) +{ + const char *kernel_source = R"( + __kernel void main_test( const __global char *src, __global char *dst, __local char *localBuffer, int copiesPerWorkgroup, int copiesPerWorkItem ) + { + int i; + for(i=0; i({0, 0, 0}, SHADER_ARG_OUTPUT); + auto size = ShaderArg(0, SHADER_ARG_OUTPUT); + const struct s expected[] = { + { 1, 0xfbfcfdff12345678, 0xa112 } + }; + + run_shader(kernel_source, inout.size(), 1, 1, inout, size); + for (int i = 0; i < inout.size(); ++i) { + EXPECT_EQ(inout[i].uc, expected[i].uc); + EXPECT_EQ(inout[i].ul, expected[i].ul); + EXPECT_EQ(inout[i].us, expected[i].us); + } + EXPECT_EQ(size, sizeof(struct s)); +} + +TEST_F(ComputeTest, packed_struct_arg) +{ +#pragma pack(push, 1) + struct s { uint8_t uc; uint64_t ul; uint16_t us; }; +#pragma pack(pop) + + const char *kernel_source = + "struct __attribute__((packed)) s {uchar uc; ulong ul; ushort us; };\n\ + __kernel void main_test(__global struct s *out, struct s in)\n\ + {\n\ + uint idx = get_global_id(0);\n\ + out[idx].uc = in.uc + 0x12;\n\ + out[idx].ul = in.ul + 0x123456789abcdef;\n\ + out[idx].us = in.us + 0x1234;\n\ + }\n"; + auto out = ShaderArg({0, 0, 0}, SHADER_ARG_OUTPUT); + auto in = ShaderArg({1, 2, 3}, SHADER_ARG_INPUT); + const struct s expected[] = { + { 0x12 + 1, 0x123456789abcdef + 2, 0x1234 + 3 } + }; + + run_shader(kernel_source, out.size(), 1, 1, out, in); + for (int i = 0; i < out.size(); ++i) { + EXPECT_EQ(out[i].uc, expected[i].uc); + EXPECT_EQ(out[i].ul, expected[i].ul); + EXPECT_EQ(out[i].us, expected[i].us); + } +} + +TEST_F(ComputeTest, packed_struct_local) +{ +#pragma pack(push, 1) + struct s { uint8_t uc; uint64_t ul; uint16_t us; }; +#pragma pack(pop) + + const char *kernel_source = + "struct __attribute__((packed)) s {uchar uc; ulong ul; ushort us; };\n\ + __kernel void main_test(__global struct s *out, __constant struct s *in)\n\ + {\n\ + uint idx = get_global_id(0);\n\ + __local struct s tmp[2];\n\ + tmp[get_local_id(0)] = in[idx];\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ + out[idx] = tmp[(get_local_id(0) + 1) % 2];\n\ + }\n"; + auto out = ShaderArg({{0, 0, 0}, {0, 0, 0}}, SHADER_ARG_OUTPUT); + auto in = ShaderArg({{1, 2, 3}, {0x12, 0x123456789abcdef, 0x1234} }, SHADER_ARG_INPUT); + const struct s expected[] = { + { 0x12, 0x123456789abcdef, 0x1234 }, + { 1, 2, 3 }, + }; + + 
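+   /* Layout note (why this test exists): with pack(1) there is no padding,
+    * so the host and OpenCL views of struct s agree byte for byte:
+    *
+    *    uc @ offset 0, ul @ offset 1 (deliberately misaligned), us @ offset 9,
+    *    sizeof(struct s) == 11
+    *
+    * which is exactly what forces the compiler to emit unaligned,
+    * byte-granular local-memory accesses here. */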
run_shader(kernel_source, out.size(), 1, 1, out, in); + for (int i = 0; i < out.size(); ++i) { + EXPECT_EQ(out[i].uc, expected[i].uc); + EXPECT_EQ(out[i].ul, expected[i].ul); + EXPECT_EQ(out[i].us, expected[i].us); + } +} + +/* DISABLED because current release versions of WARP either return + * rubbish from reads or crash: they are not prepared to handle + * non-float global constants */ +TEST_F(ComputeTest, DISABLED_packed_struct_const) +{ +#pragma pack(push, 1) + struct s { uint8_t uc; uint64_t ul; uint16_t us; }; +#pragma pack(pop) + + const char *kernel_source = + "struct __attribute__((packed)) s {uchar uc; ulong ul; ushort us; };\n\ + __kernel void main_test(__global struct s *out, struct s in)\n\ + {\n\ + __constant struct s base[] = {\n\ + {0x12, 0x123456789abcdef, 0x1234},\n\ + {0x11, 0x123456789abcdee, 0x1233},\n\ + };\n\ + uint idx = get_global_id(0);\n\ + out[idx].uc = base[idx % 2].uc + in.uc;\n\ + out[idx].ul = base[idx % 2].ul + in.ul;\n\ + out[idx].us = base[idx % 2].us + in.us;\n\ + }\n"; + auto out = ShaderArg(std::vector(2, {0, 0, 0}), SHADER_ARG_OUTPUT); + auto in = ShaderArg({1, 2, 3}, SHADER_ARG_INPUT); + const struct s expected[] = { + { 0x12 + 1, 0x123456789abcdef + 2, 0x1234 + 3 }, + { 0x11 + 1, 0x123456789abcdee + 2, 0x1233 + 3 }, + }; + + run_shader(kernel_source, out.size(), 1, 1, out, in); + for (int i = 0; i < out.size(); ++i) { + EXPECT_EQ(out[i].uc, expected[i].uc); + EXPECT_EQ(out[i].ul, expected[i].ul); + EXPECT_EQ(out[i].us, expected[i].us); + } +} + +TEST_F(ComputeTest, DISABLED_printf) +{ + const char *kernel_source = R"( + __kernel void main_test(__global float *src, __global uint *dest) + { + __constant char *format_str = "%s: %f"; + __constant char *str_val = "Test"; + *dest = printf(format_str, str_val, src[0]); + })"; + + auto src = ShaderArg({ 1.0f }, SHADER_ARG_INPUT); + auto dest = ShaderArg({ 0xdeadbeef }, SHADER_ARG_OUTPUT); + run_shader(kernel_source, 1, 1, 1, src, dest); + EXPECT_EQ(dest[0], 0); +} + +TEST_F(ComputeTest, vload_half) +{ + const char *kernel_source = R"( + __kernel void main_test(__global half *src, __global float4 *dest) + { + int offset = get_global_id(0); + dest[offset] = vload_half4(offset, src); + })"; + auto src = ShaderArg({ 0x3c00, 0x4000, 0x4200, 0x4400, + 0x4500, 0x4600, 0x4700, 0x4800 }, SHADER_ARG_INPUT); + auto dest = ShaderArg({ FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, + FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX }, SHADER_ARG_OUTPUT); + run_shader(kernel_source, 2, 1, 1, src, dest); + for (unsigned i = 0; i < 8; ++i) + EXPECT_FLOAT_EQ(dest[i], (float)(i + 1)); +} + +TEST_F(ComputeTest, vstore_half) +{ + const char *kernel_source = R"( + __kernel void main_test(__global half *dst, __global float4 *src) + { + int offset = get_global_id(0); + vstore_half4(src[offset], offset, dst); + })"; + auto dest = ShaderArg({0xdead, 0xdead, 0xdead, 0xdead, + 0xdead, 0xdead, 0xdead, 0xdead}, SHADER_ARG_OUTPUT); + auto src = ShaderArg({ 1.0, 2.0, 3.0, 4.0, + 5.0, 6.0, 7.0, 8.0 }, SHADER_ARG_INPUT); + run_shader(kernel_source, 2, 1, 1, dest, src); + const uint16_t expected[] = { 0x3c00, 0x4000, 0x4200, 0x4400, + 0x4500, 0x4600, 0x4700, 0x4800 }; + for (unsigned i = 0; i < 8; ++i) + EXPECT_EQ(dest[i], expected[i]); +} diff --git a/src/microsoft/clc/clc_helpers.cpp b/src/microsoft/clc/clc_helpers.cpp new file mode 100644 index 00000000000..38642b89819 --- /dev/null +++ b/src/microsoft/clc/clc_helpers.cpp @@ -0,0 +1,811 @@ +// +// Copyright 2012-2016 Francisco Jerez +// Copyright 2012-2016 Advanced Micro Devices, Inc. 
+// Copyright 2014-2016 Jan Vesely
+// Copyright 2014-2015 Serge Martin
+// Copyright 2015 Zoltan Gilian
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#include <sstream>
+
+#include <llvm/ADT/ArrayRef.h>
+#include <llvm/IR/DiagnosticInfo.h>
+#include <llvm/IR/DiagnosticPrinter.h>
+#include <llvm/IR/LLVMContext.h>
+#include <llvm/IR/Module.h>
+#include <llvm/Support/MemoryBuffer.h>
+#include <llvm/Support/Path.h>
+#include <llvm/Support/raw_ostream.h>
+#include <llvm-c/Target.h>
+
+#include <LLVMSPIRVLib/LLVMSPIRVLib.h>
+#include <clang/Basic/TargetInfo.h>
+#include <clang/CodeGen/CodeGenAction.h>
+#include <clang/Frontend/CompilerInstance.h>
+#include <clang/Frontend/TextDiagnosticPrinter.h>
+#include <clang/Lex/PreprocessorOptions.h>
+
+#include <spirv-tools/libspirv.hpp>
+#include <spirv-tools/linker.hpp>
+
+#include "util/macros.h"
+#include "glsl_types.h"
+#include "nir.h"
+#include "nir_types.h"
+
+#include "clc_helpers.h"
+#include "spirv.h"
+
+#include "opencl-c.h.h"
+#include "opencl-c-base.h.h"
+
+using ::llvm::Function;
+using ::llvm::LLVMContext;
+using ::llvm::Module;
+using ::llvm::raw_string_ostream;
+
+static void
+llvm_log_handler(const ::llvm::DiagnosticInfo &di, void *data) {
+   raw_string_ostream os { *reinterpret_cast<std::string *>(data) };
+   ::llvm::DiagnosticPrinterRawOStream printer { os };
+   di.print(printer);
+}
+
+class SPIRVKernelArg {
+public:
+   SPIRVKernelArg(uint32_t id, uint32_t typeId) : id(id), typeId(typeId),
+                                                  addrQualifier(CLC_KERNEL_ARG_ADDRESS_PRIVATE),
+                                                  accessQualifier(0),
+                                                  typeQualifier(0) { }
+   ~SPIRVKernelArg() { }
+
+   uint32_t id;
+   uint32_t typeId;
+   std::string name;
+   std::string typeName;
+   enum clc_kernel_arg_address_qualifier addrQualifier;
+   unsigned accessQualifier;
+   unsigned typeQualifier;
+};
+
+class SPIRVKernelInfo {
+public:
+   SPIRVKernelInfo(uint32_t fid, const char *nm) : funcId(fid), name(nm), vecHint(0) { }
+   ~SPIRVKernelInfo() { }
+
+   uint32_t funcId;
+   std::string name;
+   std::vector<SPIRVKernelArg> args;
+   unsigned vecHint;
+};
+
+class SPIRVKernelParser {
+public:
+   SPIRVKernelParser() : curKernel(NULL)
+   {
+      ctx = spvContextCreate(SPV_ENV_UNIVERSAL_1_0);
+   }
+
+   ~SPIRVKernelParser()
+   {
+      spvContextDestroy(ctx);
+   }
+
+   void parseEntryPoint(const spv_parsed_instruction_t *ins)
+   {
+      assert(ins->num_operands >= 3);
+
+      const spv_parsed_operand_t *op = &ins->operands[1];
+
+      assert(op->type == SPV_OPERAND_TYPE_ID);
+
+      uint32_t funcId = ins->words[op->offset];
+
+      for (auto &iter : kernels) {
+         if (funcId == iter.funcId)
+            return;
+      }
+
+      op = &ins->operands[2];
+      assert(op->type == SPV_OPERAND_TYPE_LITERAL_STRING);
+      const char *name = reinterpret_cast<const char *>(ins->words + op->offset);
+
+      kernels.push_back(SPIRVKernelInfo(funcId, name));
+   }
+
+   void parseFunction(const spv_parsed_instruction_t *ins)
+   {
+      assert(ins->num_operands == 4);
+
+      const spv_parsed_operand_t *op = &ins->operands[1];
+
+      assert(op->type == SPV_OPERAND_TYPE_RESULT_ID);
+
+      uint32_t funcId = ins->words[op->offset];
+
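+      /* Match this OpFunction against the ids recorded by parseEntryPoint;
+       * in disassembled SPIR-V the correlation looks like (ids illustrative):
+       *
+       *          OpEntryPoint Kernel %12 "main_test"
+       *          ...
+       *    %12 = OpFunction %void None %34
+       *
+       * Only entry points get their parameters tracked below. */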
SPIRVKernelInfo *kernel = NULL; + + for (auto &kernel : kernels) { + if (funcId == kernel.funcId && !kernel.args.size()) { + curKernel = &kernel; + return; + } + } + } + + void parseFunctionParam(const spv_parsed_instruction_t *ins) + { + const spv_parsed_operand_t *op; + uint32_t id, typeId; + + if (!curKernel) + return; + + assert(ins->num_operands == 2); + op = &ins->operands[0]; + assert(op->type == SPV_OPERAND_TYPE_TYPE_ID); + typeId = ins->words[op->offset]; + op = &ins->operands[1]; + assert(op->type == SPV_OPERAND_TYPE_RESULT_ID); + id = ins->words[op->offset]; + curKernel->args.push_back(SPIRVKernelArg(id, typeId)); + } + + void parseName(const spv_parsed_instruction_t *ins) + { + const spv_parsed_operand_t *op; + const char *name; + uint32_t id; + + assert(ins->num_operands == 2); + + op = &ins->operands[0]; + assert(op->type == SPV_OPERAND_TYPE_ID); + id = ins->words[op->offset]; + op = &ins->operands[1]; + assert(op->type == SPV_OPERAND_TYPE_LITERAL_STRING); + name = reinterpret_cast(ins->words + op->offset); + + for (auto &kernel : kernels) { + for (auto &arg : kernel.args) { + if (arg.id == id && arg.name.empty()) { + arg.name = name; + break; + } + } + } + } + + void parseTypePointer(const spv_parsed_instruction_t *ins) + { + enum clc_kernel_arg_address_qualifier addrQualifier; + uint32_t typeId, targetTypeId, storageClass; + const spv_parsed_operand_t *op; + const char *typeName; + + assert(ins->num_operands == 3); + + op = &ins->operands[0]; + assert(op->type == SPV_OPERAND_TYPE_RESULT_ID); + typeId = ins->words[op->offset]; + + op = &ins->operands[1]; + assert(op->type == SPV_OPERAND_TYPE_STORAGE_CLASS); + storageClass = ins->words[op->offset]; + switch (storageClass) { + case SpvStorageClassCrossWorkgroup: + addrQualifier = CLC_KERNEL_ARG_ADDRESS_GLOBAL; + break; + case SpvStorageClassWorkgroup: + addrQualifier = CLC_KERNEL_ARG_ADDRESS_LOCAL; + break; + case SpvStorageClassUniformConstant: + addrQualifier = CLC_KERNEL_ARG_ADDRESS_CONSTANT; + break; + default: + addrQualifier = CLC_KERNEL_ARG_ADDRESS_PRIVATE; + break; + } + + for (auto &kernel : kernels) { + for (auto &arg : kernel.args) { + if (arg.typeId == typeId) + arg.addrQualifier = addrQualifier; + } + } + } + + void parseOpString(const spv_parsed_instruction_t *ins) + { + const spv_parsed_operand_t *op; + std::string str; + + assert(ins->num_operands == 2); + + op = &ins->operands[1]; + assert(op->type == SPV_OPERAND_TYPE_LITERAL_STRING); + str = reinterpret_cast(ins->words + op->offset); + + if (str.find("kernel_arg_type.") != 0) + return; + + size_t start = sizeof("kernel_arg_type.") - 1; + + for (auto &kernel : kernels) { + size_t pos; + + pos = str.find(kernel.name, start); + if (pos == std::string::npos || + pos != start || str[start + kernel.name.size()] != '.') + continue; + + pos = start + kernel.name.size(); + if (str[pos++] != '.') + continue; + + for (auto &arg : kernel.args) { + if (arg.name.empty()) + break; + + size_t typeEnd = str.find(',', pos); + if (typeEnd == std::string::npos) + break; + + arg.typeName = str.substr(pos, typeEnd - pos); + pos = typeEnd + 1; + } + } + } + + void applyDecoration(uint32_t id, const spv_parsed_instruction_t *ins) + { + auto iter = decorationGroups.find(id); + if (iter != decorationGroups.end()) { + for (uint32_t entry : iter->second) + applyDecoration(entry, ins); + return; + } + + const spv_parsed_operand_t *op; + uint32_t decoration; + + assert(ins->num_operands >= 2); + + op = &ins->operands[1]; + assert(op->type == SPV_OPERAND_TYPE_DECORATION); + decoration = 
ins->words[op->offset]; + + for (auto &kernel : kernels) { + for (auto &arg : kernel.args) { + if (arg.id == id) { + switch (decoration) { + case SpvDecorationVolatile: + arg.typeQualifier |= CLC_KERNEL_ARG_TYPE_VOLATILE; + break; + case SpvDecorationConstant: + arg.typeQualifier |= CLC_KERNEL_ARG_TYPE_CONST; + break; + case SpvDecorationRestrict: + arg.typeQualifier |= CLC_KERNEL_ARG_TYPE_RESTRICT; + break; + case SpvDecorationFuncParamAttr: + op = &ins->operands[2]; + assert(op->type == SPV_OPERAND_TYPE_FUNCTION_PARAMETER_ATTRIBUTE); + switch (ins->words[op->offset]) { + case SpvFunctionParameterAttributeNoAlias: + arg.typeQualifier |= CLC_KERNEL_ARG_TYPE_RESTRICT; + break; + case SpvFunctionParameterAttributeNoWrite: + arg.typeQualifier |= CLC_KERNEL_ARG_TYPE_CONST; + break; + } + break; + } + } + + } + } + } + + void parseOpDecorate(const spv_parsed_instruction_t *ins) + { + const spv_parsed_operand_t *op; + uint32_t id, decoration; + + assert(ins->num_operands >= 2); + + op = &ins->operands[0]; + assert(op->type == SPV_OPERAND_TYPE_ID); + id = ins->words[op->offset]; + + applyDecoration(id, ins); + } + + void parseOpGroupDecorate(const spv_parsed_instruction_t *ins) + { + assert(ins->num_operands >= 2); + + const spv_parsed_operand_t *op = &ins->operands[0]; + assert(op->type == SPV_OPERAND_TYPE_ID); + uint32_t groupId = ins->words[op->offset]; + + auto lowerBound = decorationGroups.lower_bound(groupId); + if (lowerBound != decorationGroups.end() && + lowerBound->first == groupId) + // Group already filled out + return; + + auto iter = decorationGroups.emplace_hint(lowerBound, groupId, std::vector{}); + auto& vec = iter->second; + vec.reserve(ins->num_operands - 1); + for (uint32_t i = 1; i < ins->num_operands; ++i) { + op = &ins->operands[i]; + assert(op->type == SPV_OPERAND_TYPE_ID); + vec.push_back(ins->words[op->offset]); + } + } + + void parseOpTypeImage(const spv_parsed_instruction_t *ins) + { + const spv_parsed_operand_t *op; + uint32_t typeId; + unsigned accessQualifier = CLC_KERNEL_ARG_ACCESS_READ; + + op = &ins->operands[0]; + assert(op->type == SPV_OPERAND_TYPE_RESULT_ID); + typeId = ins->words[op->offset]; + + if (ins->num_operands >= 9) { + op = &ins->operands[8]; + assert(op->type == SPV_OPERAND_TYPE_ACCESS_QUALIFIER); + switch (ins->words[op->offset]) { + case SpvAccessQualifierReadOnly: + accessQualifier = CLC_KERNEL_ARG_ACCESS_READ; + break; + case SpvAccessQualifierWriteOnly: + accessQualifier = CLC_KERNEL_ARG_ACCESS_WRITE; + break; + case SpvAccessQualifierReadWrite: + accessQualifier = CLC_KERNEL_ARG_ACCESS_WRITE | + CLC_KERNEL_ARG_ACCESS_READ; + break; + } + } + + for (auto &kernel : kernels) { + for (auto &arg : kernel.args) { + if (arg.typeId == typeId) { + arg.accessQualifier = accessQualifier; + arg.addrQualifier = CLC_KERNEL_ARG_ADDRESS_GLOBAL; + } + } + } + } + + void parseExecutionMode(const spv_parsed_instruction_t *ins) + { + uint32_t executionMode = ins->words[ins->operands[1].offset]; + if (executionMode != SpvExecutionModeVecTypeHint) + return; + + uint32_t funcId = ins->words[ins->operands[0].offset]; + uint32_t vecHint = ins->words[ins->operands[2].offset]; + for (auto& kernel : kernels) { + if (kernel.funcId == funcId) + kernel.vecHint = vecHint; + } + } + + static spv_result_t + parseInstruction(void *data, const spv_parsed_instruction_t *ins) + { + SPIRVKernelParser *parser = reinterpret_cast(data); + + switch (ins->opcode) { + case SpvOpName: + parser->parseName(ins); + break; + case SpvOpEntryPoint: + parser->parseEntryPoint(ins); + break; + 
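+      /* Note: a valid SPIR-V module lists every OpFunctionParameter
+       * immediately after its OpFunction, so the single curKernel pointer
+       * (cleared again on OpFunctionEnd/OpLabel below) is enough state to
+       * attribute parameters to the right kernel. */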
case SpvOpFunction: + parser->parseFunction(ins); + break; + case SpvOpFunctionParameter: + parser->parseFunctionParam(ins); + break; + case SpvOpFunctionEnd: + case SpvOpLabel: + parser->curKernel = NULL; + break; + case SpvOpTypePointer: + parser->parseTypePointer(ins); + break; + case SpvOpTypeImage: + parser->parseOpTypeImage(ins); + break; + case SpvOpString: + parser->parseOpString(ins); + break; + case SpvOpDecorate: + parser->parseOpDecorate(ins); + break; + case SpvOpGroupDecorate: + parser->parseOpGroupDecorate(ins); + break; + case SpvOpExecutionMode: + parser->parseExecutionMode(ins); + break; + default: + break; + } + + return SPV_SUCCESS; + } + + bool parsingComplete() + { + for (auto &kernel : kernels) { + if (kernel.name.empty()) + return false; + + for (auto &arg : kernel.args) { + if (arg.name.empty() || arg.typeName.empty()) + return false; + } + } + + return true; + } + + void parseBinary(const struct spirv_binary &spvbin) + { + /* 3 passes should be enough to retrieve all kernel information: + * 1st pass: all entry point name and number of args + * 2nd pass: argument names and type names + * 3rd pass: pointer type names + */ + for (unsigned pass = 0; pass < 3; pass++) { + spvBinaryParse(ctx, reinterpret_cast(this), + spvbin.data, spvbin.size / 4, + NULL, parseInstruction, NULL); + + if (parsingComplete()) + return; + } + + assert(0); + } + + std::vector kernels; + std::map> decorationGroups; + SPIRVKernelInfo *curKernel; + spv_context ctx; +}; + +const struct clc_kernel_info * +clc_spirv_get_kernels_info(const struct spirv_binary *spvbin, + unsigned *num_kernels) +{ + struct clc_kernel_info *kernels; + + SPIRVKernelParser parser; + + parser.parseBinary(*spvbin); + *num_kernels = parser.kernels.size(); + if (!*num_kernels) + return NULL; + + kernels = reinterpret_cast(calloc(*num_kernels, + sizeof(*kernels))); + assert(kernels); + for (unsigned i = 0; i < parser.kernels.size(); i++) { + kernels[i].name = strdup(parser.kernels[i].name.c_str()); + kernels[i].num_args = parser.kernels[i].args.size(); + kernels[i].vec_hint_size = parser.kernels[i].vecHint >> 16; + kernels[i].vec_hint_type = (enum clc_vec_hint_type)(parser.kernels[i].vecHint & 0xFFFF); + if (!kernels[i].num_args) + continue; + + struct clc_kernel_arg *args; + + args = reinterpret_cast(calloc(kernels[i].num_args, + sizeof(*kernels->args))); + kernels[i].args = args; + assert(args); + for (unsigned j = 0; j < kernels[i].num_args; j++) { + if (!parser.kernels[i].args[j].name.empty()) + args[j].name = strdup(parser.kernels[i].args[j].name.c_str()); + args[j].type_name = strdup(parser.kernels[i].args[j].typeName.c_str()); + args[j].address_qualifier = parser.kernels[i].args[j].addrQualifier; + args[j].type_qualifier = parser.kernels[i].args[j].typeQualifier; + args[j].access_qualifier = parser.kernels[i].args[j].accessQualifier; + } + } + + return kernels; +} + +void +clc_free_kernels_info(const struct clc_kernel_info *kernels, + unsigned num_kernels) +{ + if (!kernels) + return; + + for (unsigned i = 0; i < num_kernels; i++) { + if (kernels[i].args) { + for (unsigned j = 0; j < kernels[i].num_args; j++) { + free((void *)kernels[i].args[j].name); + free((void *)kernels[i].args[j].type_name); + } + } + free((void *)kernels[i].name); + } + + free((void *)kernels); +} + +int +clc_to_spirv(const struct clc_compile_args *args, + struct spirv_binary *spvbin, + const struct clc_logger *logger) +{ + LLVMInitializeAllTargets(); + LLVMInitializeAllTargetInfos(); + LLVMInitializeAllTargetMCs(); + 
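+   /* The AllTargets initializers pull in every backend this LLVM build was
+    * configured with; presumably only the SPIR/spir64 pieces are strictly
+    * needed for the "-triple spir64-unknown-unknown" invocation below, but
+    * initializing everything keeps this independent of the LLVM config. */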
LLVMInitializeAllAsmPrinters();
+
+   std::string log;
+   std::unique_ptr<LLVMContext> llvm_ctx { new LLVMContext };
+   llvm_ctx->setDiagnosticHandlerCallBack(llvm_log_handler, &log);
+
+   std::unique_ptr<clang::CompilerInstance> c { new clang::CompilerInstance };
+   clang::DiagnosticsEngine diag { new clang::DiagnosticIDs,
+                                   new clang::DiagnosticOptions,
+                                   new clang::TextDiagnosticPrinter(*new raw_string_ostream(log),
+                                                                    &c->getDiagnosticOpts(), true)};
+
+   std::vector<const char *> clang_opts = {
+      args->source.name,
+      "-triple", "spir64-unknown-unknown",
+      // By default, clang prefers to use modules to pull in the default headers,
+      // which doesn't work with our technique of embedding the headers in our binary
+      "-finclude-default-header",
+      // Add a default CL compiler version. Clang will pick the last one specified
+      // on the command line, so the app can override this one.
+      "-cl-std=cl1.2",
+      // The LLVM-SPIRV-Translator doesn't support memset with variable size
+      "-fno-builtin-memset",
+      // LLVM's optimizations can produce code that the translator can't translate
+      "-O0",
+   };
+   // We assume there's appropriate defines for __OPENCL_VERSION__ and __IMAGE_SUPPORT__
+   // being provided by the caller here.
+   clang_opts.insert(clang_opts.end(), args->args, args->args + args->num_args);
+
+   if (!clang::CompilerInvocation::CreateFromArgs(c->getInvocation(),
+#if LLVM_VERSION_MAJOR >= 10
+                                                  clang_opts,
+#else
+                                                  clang_opts.data(),
+                                                  clang_opts.data() + clang_opts.size(),
+#endif
+                                                  diag)) {
+      log += "Couldn't create Clang invocation.\n";
+      clc_error(logger, log.c_str());
+      return -1;
+   }
+
+   if (diag.hasErrorOccurred()) {
+      log += "Errors occurred during Clang invocation.\n";
+      clc_error(logger, log.c_str());
+      return -1;
+   }
+
+   // This is a workaround for a Clang bug which causes the number
+   // of warnings and errors to be printed to stderr.
+   // http://www.llvm.org/bugs/show_bug.cgi?id=19735
+   c->getDiagnosticOpts().ShowCarets = false;
+
+   c->createDiagnostics(new clang::TextDiagnosticPrinter(
+                           *new raw_string_ostream(log),
+                           &c->getDiagnosticOpts(), true));
+
+   c->setTarget(clang::TargetInfo::CreateTargetInfo(
+                   c->getDiagnostics(), c->getInvocation().TargetOpts));
+
+   c->getFrontendOpts().ProgramAction = clang::frontend::EmitLLVMOnly;
+   c->getHeaderSearchOpts().UseBuiltinIncludes = false;
+   c->getHeaderSearchOpts().UseStandardSystemIncludes = false;
+
+   // Add opencl-c generic search path
+   {
+      ::llvm::SmallString<128> system_header_path;
+      ::llvm::sys::path::system_temp_directory(true, system_header_path);
+      ::llvm::sys::path::append(system_header_path, "openclon12");
+      c->getHeaderSearchOpts().AddPath(system_header_path.str(),
+                                       clang::frontend::Angled,
+                                       false, false);
+
+      ::llvm::sys::path::append(system_header_path, "opencl-c.h");
+      c->getPreprocessorOpts().addRemappedFile(system_header_path.str(),
+         ::llvm::MemoryBuffer::getMemBuffer(llvm::StringRef(opencl_c_source, _countof(opencl_c_source) - 1)).release());
+
+      ::llvm::sys::path::remove_filename(system_header_path);
+      ::llvm::sys::path::append(system_header_path, "opencl-c-base.h");
+      c->getPreprocessorOpts().addRemappedFile(system_header_path.str(),
+         ::llvm::MemoryBuffer::getMemBuffer(llvm::StringRef(opencl_c_base_source, _countof(opencl_c_base_source) - 1)).release());
+   }
+
+   if (args->num_headers) {
+      ::llvm::SmallString<128> tmp_header_path;
+      ::llvm::sys::path::system_temp_directory(true, tmp_header_path);
+      ::llvm::sys::path::append(tmp_header_path, "openclon12");
+
+      c->getHeaderSearchOpts().AddPath(tmp_header_path.str(),
+                                       clang::frontend::Quoted,
+                                       false, false);
+
+      for (size_t i = 0; i < args->num_headers; i++) {
+         auto path_copy = tmp_header_path;
+         ::llvm::sys::path::append(path_copy, ::llvm::sys::path::convert_to_slash(args->headers[i].name));
+         c->getPreprocessorOpts().addRemappedFile(path_copy.str(),
+            ::llvm::MemoryBuffer::getMemBufferCopy(args->headers[i].value).release());
+      }
+   }
+
+   c->getPreprocessorOpts().addRemappedFile(
+         args->source.name,
+         ::llvm::MemoryBuffer::getMemBufferCopy(std::string(args->source.value)).release());
+
+   // Compile the code
+   clang::EmitLLVMOnlyAction act(llvm_ctx.get());
+   if (!c->ExecuteAction(act)) {
+      log += "Error executing LLVM compilation action.\n";
+      clc_error(logger, log.c_str());
+      return -1;
+   }
+
+   auto mod = act.takeModule();
+   std::ostringstream spv_stream;
+   if (!::llvm::writeSpirv(mod.get(), spv_stream, log)) {
+      log += "Translation from LLVM IR to SPIR-V failed.\n";
+      clc_error(logger, log.c_str());
+      return -1;
+   }
+
+   const std::string spv_out = spv_stream.str();
+   spvbin->size = spv_out.size();
+   spvbin->data = static_cast<uint32_t *>(malloc(spvbin->size));
+   memcpy(spvbin->data, spv_out.data(), spvbin->size);
+
+   return 0;
+}
+
+static const char *
+spv_result_to_str(spv_result_t res)
+{
+   switch (res) {
+   case SPV_SUCCESS: return "success";
+   case SPV_UNSUPPORTED: return "unsupported";
+   case SPV_END_OF_STREAM: return "end of stream";
+   case SPV_WARNING: return "warning";
+   case SPV_FAILED_MATCH: return "failed match";
+   case SPV_REQUESTED_TERMINATION: return "requested termination";
+   case SPV_ERROR_INTERNAL: return "internal error";
+   case SPV_ERROR_OUT_OF_MEMORY: return "out of memory";
+   case SPV_ERROR_INVALID_POINTER: return "invalid pointer";
+   case SPV_ERROR_INVALID_BINARY: return "invalid binary";
+   case SPV_ERROR_INVALID_TEXT: return "invalid text";
+   case SPV_ERROR_INVALID_TABLE: return "invalid table";
+   case SPV_ERROR_INVALID_VALUE: return "invalid value";
+   case SPV_ERROR_INVALID_DIAGNOSTIC: return "invalid diagnostic";
+   case SPV_ERROR_INVALID_LOOKUP: return "invalid lookup";
+   case SPV_ERROR_INVALID_ID: return "invalid id";
+   case SPV_ERROR_INVALID_CFG: return "invalid config";
+   case SPV_ERROR_INVALID_LAYOUT: return "invalid layout";
+   case SPV_ERROR_INVALID_CAPABILITY: return "invalid capability";
+   case SPV_ERROR_INVALID_DATA: return "invalid data";
+   case SPV_ERROR_MISSING_EXTENSION: return "missing extension";
+   case SPV_ERROR_WRONG_VERSION: return "wrong version";
+   default: return "unknown error";
+   }
+}
+
+class SPIRVMessageConsumer {
+public:
+   SPIRVMessageConsumer(const struct clc_logger *logger): logger(logger) {}
+
+   void operator()(spv_message_level_t level, const char *src,
+                   const spv_position_t &pos, const char *msg)
+   {
+      switch(level) {
+      case SPV_MSG_FATAL:
+      case SPV_MSG_INTERNAL_ERROR:
+      case SPV_MSG_ERROR:
+         clc_error(logger, "(file=%s,line=%zu,column=%zu,index=%zu): %s",
+                   src, pos.line, pos.column, pos.index, msg);
+         break;
+
+      case SPV_MSG_WARNING:
+         clc_warning(logger, "(file=%s,line=%zu,column=%zu,index=%zu): %s",
+                     src, pos.line, pos.column, pos.index, msg);
+         break;
+
+      default:
+         break;
+      }
+   }
+
+private:
+   const struct clc_logger *logger;
+};
+
+int
+clc_link_spirv_binaries(const struct clc_linker_args *args,
+                        struct spirv_binary *dst_bin,
+                        const struct clc_logger *logger)
+{
+   std::vector<std::vector<uint32_t>> binaries;
+
+   for (unsigned i = 0; i < args->num_in_objs; i++) {
+      std::vector<uint32_t> bin(args->in_objs[i]->spvbin.data,
+                                args->in_objs[i]->spvbin.data +
+                                (args->in_objs[i]->spvbin.size / 4));
+      binaries.push_back(bin);
+   }
+
+   SPIRVMessageConsumer msgconsumer(logger);
+   spvtools::Context context(SPV_ENV_UNIVERSAL_1_0);
+   context.SetMessageConsumer(msgconsumer);
+   spvtools::LinkerOptions options;
+   options.SetAllowPartialLinkage(args->create_library);
+   options.SetCreateLibrary(args->create_library);
+   std::vector<uint32_t> linkingResult;
+   spv_result_t status = spvtools::Link(context, binaries, &linkingResult, options);
+   if (status != SPV_SUCCESS) {
+      return -1;
+   }
+
+   dst_bin->size = linkingResult.size() * 4;
+   dst_bin->data = static_cast<uint32_t *>(malloc(dst_bin->size));
+   memcpy(dst_bin->data, linkingResult.data(), dst_bin->size);
+
+   return 0;
+}
+
+void
+clc_dump_spirv(const struct spirv_binary *spvbin, FILE *f)
+{
+   spvtools::SpirvTools tools(SPV_ENV_UNIVERSAL_1_0);
+   std::vector<uint32_t> bin(spvbin->data, spvbin->data + (spvbin->size / 4));
+   std::string out;
+   tools.Disassemble(bin, &out,
+                     SPV_BINARY_TO_TEXT_OPTION_INDENT |
+                     SPV_BINARY_TO_TEXT_OPTION_FRIENDLY_NAMES);
+   fwrite(out.c_str(), out.size(), 1, f);
+}
+
+void
+clc_free_spirv_binary(struct spirv_binary *spvbin)
+{
+   free(spvbin->data);
+}
diff --git a/src/microsoft/clc/clc_helpers.h b/src/microsoft/clc/clc_helpers.h
new file mode 100644
index 00000000000..48f8c2df373
--- /dev/null
+++ b/src/microsoft/clc/clc_helpers.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright © Microsoft Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef CLC_TO_NIR_H
+#define CLC_TO_NIR_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "nir_types.h"
+
+#include "clc_compiler.h"
+#include "util/u_string.h"
+
+#include <assert.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+const struct clc_kernel_info *
+clc_spirv_get_kernels_info(const struct spirv_binary *spvbin,
+                           unsigned *num_kernels);
+
+void
+clc_free_kernels_info(const struct clc_kernel_info *kernels,
+                      unsigned num_kernels);
+
+int
+clc_to_spirv(const struct clc_compile_args *args,
+             struct spirv_binary *spvbin,
+             const struct clc_logger *logger);
+
+int
+clc_link_spirv_binaries(const struct clc_linker_args *args,
+                        struct spirv_binary *dst_bin,
+                        const struct clc_logger *logger);
+
+void
+clc_dump_spirv(const struct spirv_binary *spvbin, FILE *f);
+
+void
+clc_free_spirv_binary(struct spirv_binary *spvbin);
+
+#define clc_log(logger, level, fmt, ...) do { \
+      if (!logger || !logger->level) break; \
+      char *msg = NULL; \
+      asprintf(&msg, fmt, __VA_ARGS__); \
+      assert(msg); \
+      logger->level(logger->priv, msg); \
+      free(msg); \
+   } while (0)
+
+#define clc_error(logger, fmt, ...) clc_log(logger, error, fmt, __VA_ARGS__)
+#define clc_warning(logger, fmt, ...) clc_log(logger, warning, fmt, __VA_ARGS__)
+
+#ifdef __cplusplus
}
+#endif
+
+#endif
diff --git a/src/microsoft/clc/clc_nir.c b/src/microsoft/clc/clc_nir.c
new file mode 100644
index 00000000000..2dfeb925bff
--- /dev/null
+++ b/src/microsoft/clc/clc_nir.c
@@ -0,0 +1,388 @@
+/*
+ * Copyright © Microsoft Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */ + +#include "u_math.h" +#include "nir.h" +#include "glsl_types.h" +#include "nir_types.h" +#include "nir_builder.h" + +#include "clc_nir.h" +#include "clc_compiler.h" +#include "../compiler/dxil_nir.h" + +static bool +lower_load_base_global_invocation_id(nir_builder *b, nir_intrinsic_instr *intr, + nir_variable *var) +{ + b->cursor = nir_after_instr(&intr->instr); + + nir_ssa_def *offset = + build_load_ubo_dxil(b, nir_imm_int(b, var->data.binding), + nir_imm_int(b, + offsetof(struct clc_work_properties_data, + global_offset_x)), + nir_dest_num_components(intr->dest), + nir_dest_bit_size(intr->dest)); + nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(offset)); + nir_instr_remove(&intr->instr); + return true; +} + +static bool +lower_load_work_dim(nir_builder *b, nir_intrinsic_instr *intr, + nir_variable *var) +{ + b->cursor = nir_after_instr(&intr->instr); + + nir_ssa_def *dim = + build_load_ubo_dxil(b, nir_imm_int(b, var->data.binding), + nir_imm_int(b, + offsetof(struct clc_work_properties_data, + work_dim)), + nir_dest_num_components(intr->dest), + nir_dest_bit_size(intr->dest)); + nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(dim)); + nir_instr_remove(&intr->instr); + return true; +} + +static bool +lower_load_local_group_size(nir_builder *b, nir_intrinsic_instr *intr) +{ + b->cursor = nir_after_instr(&intr->instr); + + nir_const_value v[3] = { + nir_const_value_for_int(b->shader->info.cs.local_size[0], 32), + nir_const_value_for_int(b->shader->info.cs.local_size[1], 32), + nir_const_value_for_int(b->shader->info.cs.local_size[2], 32) + }; + nir_ssa_def *size = nir_build_imm(b, 3, 32, v); + nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(size)); + nir_instr_remove(&intr->instr); + return true; +} + +static bool +lower_load_num_work_groups(nir_builder *b, nir_intrinsic_instr *intr, + nir_variable *var) +{ + b->cursor = nir_after_instr(&intr->instr); + + nir_ssa_def *count = + build_load_ubo_dxil(b, nir_imm_int(b, var->data.binding), + nir_imm_int(b, + offsetof(struct clc_work_properties_data, + group_count_total_x)), + nir_dest_num_components(intr->dest), + nir_dest_bit_size(intr->dest)); + nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(count)); + nir_instr_remove(&intr->instr); + return true; +} + +static bool +lower_load_base_work_group_id(nir_builder *b, nir_intrinsic_instr *intr, + nir_variable *var) +{ + b->cursor = nir_after_instr(&intr->instr); + + nir_ssa_def *offset = + build_load_ubo_dxil(b, nir_imm_int(b, var->data.binding), + nir_imm_int(b, + offsetof(struct clc_work_properties_data, + group_id_offset_x)), + nir_dest_num_components(intr->dest), + nir_dest_bit_size(intr->dest)); + nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(offset)); + nir_instr_remove(&intr->instr); + return true; +} + +bool +clc_nir_lower_system_values(nir_shader *nir, nir_variable *var) +{ + bool progress = false; + + foreach_list_typed(nir_function, func, node, &nir->functions) { + if (!func->is_entrypoint) + continue; + assert(func->impl); + + nir_builder b; + nir_builder_init(&b, func->impl); + + nir_foreach_block(block, func->impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + + switch (intr->intrinsic) { + case nir_intrinsic_load_base_global_invocation_id: + progress |= lower_load_base_global_invocation_id(&b, intr, var); + break; + case nir_intrinsic_load_work_dim: + progress |= lower_load_work_dim(&b, intr, var); + 
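+               /* get_work_dim() has no DXIL system value, so like the other
+                * cases here it is serviced from the clc_work_properties_data
+                * CBV that the runtime fills out (offsetof(..., work_dim)). */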
break; + case nir_intrinsic_load_local_group_size: + lower_load_local_group_size(&b, intr); + break; + case nir_intrinsic_load_num_work_groups: + lower_load_num_work_groups(&b, intr, var); + break; + case nir_intrinsic_load_base_work_group_id: + lower_load_base_work_group_id(&b, intr, var); + break; + default: break; + } + } + } + } + + return progress; +} + +static bool +lower_load_kernel_input(nir_builder *b, nir_intrinsic_instr *intr, + nir_variable *var) +{ + nir_intrinsic_instr *load; + + b->cursor = nir_before_instr(&intr->instr); + + unsigned bit_size = nir_dest_bit_size(intr->dest); + enum glsl_base_type base_type; + + switch (bit_size) { + case 64: + base_type = GLSL_TYPE_UINT64; + break; + case 32: + base_type = GLSL_TYPE_UINT; + break; + case 16: + base_type = GLSL_TYPE_UINT16; + break; + case 8: + base_type = GLSL_TYPE_UINT8; + break; + } + + const struct glsl_type *type = + glsl_vector_type(base_type, nir_dest_num_components(intr->dest)); + nir_ssa_def *ptr = nir_vec2(b, nir_imm_int(b, var->data.binding), + nir_u2u(b, intr->src[0].ssa, 32)); + nir_deref_instr *deref = nir_build_deref_cast(b, ptr, nir_var_mem_ubo, type, + bit_size / 8); + deref->cast.align_mul = nir_intrinsic_align_mul(intr); + deref->cast.align_offset = nir_intrinsic_align_offset(intr); + + nir_ssa_def *result = + nir_load_deref(b, deref); + nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(result)); + nir_instr_remove(&intr->instr); + return true; +} + +bool +clc_nir_lower_kernel_input_loads(nir_shader *nir, nir_variable *var) +{ + bool progress = false; + + foreach_list_typed(nir_function, func, node, &nir->functions) { + if (!func->is_entrypoint) + continue; + assert(func->impl); + + nir_builder b; + nir_builder_init(&b, func->impl); + + nir_foreach_block(block, func->impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + + if (intr->intrinsic == nir_intrinsic_load_kernel_input) + progress |= lower_load_kernel_input(&b, intr, var); + } + } + } + + return progress; +} + + +static nir_variable * +add_printf_var(struct nir_shader *nir, unsigned uav_id) +{ + /* This size is arbitrary. Minimum required per spec is 1MB */ + const unsigned max_printf_size = 1 * 1024 * 1024; + const unsigned printf_array_size = max_printf_size / sizeof(unsigned); + nir_variable *var = + nir_variable_create(nir, nir_var_mem_ssbo, + glsl_array_type(glsl_uint_type(), printf_array_size, sizeof(unsigned)), + "kernel_work_properies"); + var->data.binding = uav_id; + return var; +} + +static void +lower_printf_impl(nir_builder *b, nir_intrinsic_instr *instr, nir_variable *var) +{ + /* Atomic add a buffer size counter to determine where to write. + * If overflowed, return -1, otherwise, store the arguments and return 0. 
+ */ + b->cursor = nir_before_instr(&instr->instr); + nir_deref_instr *ssbo_deref = nir_build_deref_var(b, var); + nir_deref_instr *counter_deref = nir_build_deref_array_imm(b, ssbo_deref, 0); + nir_deref_instr *struct_deref = nir_instr_as_deref(instr->src[1].ssa->parent_instr); + nir_variable *struct_var = nir_deref_instr_get_variable(struct_deref); + const struct glsl_type *struct_type = struct_var->type; + /* Align the struct size to 4 for natural SSBO alignment */ + int struct_size = align(glsl_get_cl_size(struct_type), 4); + + /* Hardcoding 64bit pointers to simplify some code below */ + assert(instr->src[0].ssa->num_components == 1 && instr->src[0].ssa->bit_size == 64); + + nir_intrinsic_instr *atomic = nir_intrinsic_instr_create(b->shader, nir_intrinsic_deref_atomic_add); + nir_ssa_dest_init(&atomic->instr, &atomic->dest, 1, 32, NULL); + atomic->src[0] = nir_src_for_ssa(&counter_deref->dest.ssa); + atomic->src[1] = nir_src_for_ssa(nir_imm_int(b, struct_size + sizeof(uint64_t))); + nir_builder_instr_insert(b, &atomic->instr); + + int max_valid_offset = + glsl_get_cl_size(var->type) - /* buffer size */ + struct_size - /* printf args size */ + sizeof(uint64_t) - /* format string */ + sizeof(int); /* the first int in the buffer is for the counter */ + nir_push_if(b, nir_ilt(b, &atomic->dest.ssa, nir_imm_int(b, max_valid_offset))); + nir_ssa_def *printf_succ_val = nir_imm_int(b, 0); + + nir_ssa_def *start_offset = nir_u2u64(b, nir_iadd(b, &atomic->dest.ssa, nir_imm_int(b, sizeof(int)))); + nir_deref_instr *as_byte_array = nir_build_deref_cast(b, &ssbo_deref->dest.ssa, nir_var_mem_ssbo, glsl_uint8_t_type(), 1); + nir_deref_instr *as_offset_byte_array = nir_build_deref_ptr_as_array(b, as_byte_array, start_offset); + nir_deref_instr *format_string_write_deref = + nir_build_deref_cast(b, &as_offset_byte_array->dest.ssa, nir_var_mem_ssbo, glsl_uint64_t_type(), 8); + nir_store_deref(b, format_string_write_deref, instr->src[0].ssa, ~0); + + for (unsigned i = 0; i < glsl_get_length(struct_type); ++i) { + nir_ssa_def *field_offset_from_start = nir_imm_int64(b, glsl_get_struct_field_offset(struct_type, i) + sizeof(uint64_t)); + nir_ssa_def *field_offset = nir_iadd(b, start_offset, field_offset_from_start); + + const struct glsl_type *field_type = glsl_get_struct_field(struct_type, i); + nir_deref_instr *field_read_deref = nir_build_deref_struct(b, struct_deref, i); + nir_ssa_def *field_value = nir_load_deref(b, field_read_deref); + + /* Clang does promotion of arguments to their "native" size. That means that any floats + * have been converted to doubles for the call to printf. Since we don't support doubles, + * convert them back here; copy-prop and other optimizations should remove all hint of doubles. 
+ */ + if (glsl_get_base_type(field_type) == GLSL_TYPE_DOUBLE) { + field_value = nir_f2f32(b, field_value); + field_type = glsl_float_type(); + } + + as_offset_byte_array = nir_build_deref_ptr_as_array(b, as_byte_array, field_offset); + nir_deref_instr *field_write_deref = + nir_build_deref_cast(b, &as_offset_byte_array->dest.ssa, nir_var_mem_ssbo, field_type, glsl_get_cl_size(field_type)); + + nir_store_deref(b, field_write_deref, field_value, ~0); + } + + nir_push_else(b, NULL); + nir_ssa_def *printf_fail_val = nir_imm_int(b, -1); + nir_pop_if(b, NULL); + + nir_ssa_def *return_value = nir_if_phi(b, printf_succ_val, printf_fail_val); + nir_ssa_def_rewrite_uses(&instr->dest.ssa, nir_src_for_ssa(return_value)); + nir_instr_remove(&instr->instr); +} + +static nir_variable * +find_identical_const_sampler(nir_shader *nir, nir_variable *sampler) +{ + nir_foreach_variable_with_modes(uniform, nir, nir_var_uniform) { + if (!glsl_type_is_sampler(uniform->type) || !uniform->data.sampler.is_inline_sampler) + continue; + if (uniform->data.sampler.addressing_mode == sampler->data.sampler.addressing_mode && + uniform->data.sampler.normalized_coordinates == sampler->data.sampler.normalized_coordinates && + uniform->data.sampler.filter_mode == sampler->data.sampler.filter_mode) + return uniform; + } + unreachable("Should have at least found the input sampler"); +} + +bool +clc_nir_dedupe_const_samplers(nir_shader *nir) +{ + bool progress = false; + nir_foreach_function(func, nir) { + if (!func->impl) + continue; + + nir_builder b; + nir_builder_init(&b, func->impl); + + nir_foreach_block(block, func->impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_tex) + continue; + + nir_tex_instr *tex = nir_instr_as_tex(instr); + int sampler_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_deref); + if (sampler_idx == -1) + continue; + + nir_deref_instr *deref = nir_src_as_deref(tex->src[sampler_idx].src); + nir_variable *sampler = nir_deref_instr_get_variable(deref); + if (!sampler) + continue; + + assert(sampler->data.mode == nir_var_uniform); + + if (!sampler->data.sampler.is_inline_sampler) + continue; + + nir_variable *replacement = find_identical_const_sampler(nir, sampler); + if (replacement == sampler) + continue; + + b.cursor = nir_before_instr(&tex->instr); + nir_deref_instr *replacement_deref = nir_build_deref_var(&b, replacement); + nir_instr_rewrite_src(&tex->instr, &tex->src[sampler_idx].src, + nir_src_for_ssa(&replacement_deref->dest.ssa)); + nir_deref_instr_remove_if_unused(deref); + progress = true; + } + } + + if (progress) { + nir_metadata_preserve(func->impl, nir_metadata_block_index | nir_metadata_dominance); + } + } + return progress; +} diff --git a/src/microsoft/clc/clc_nir.h b/src/microsoft/clc/clc_nir.h new file mode 100644 index 00000000000..a452b7a7ff5 --- /dev/null +++ b/src/microsoft/clc/clc_nir.h @@ -0,0 +1,40 @@ +/* + * Copyright © Microsoft Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial 
portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef CLC_NIR_H +#define CLC_NIR_H + +#include +#include "nir.h" + +bool +clc_nir_lower_system_values(nir_shader *nir, nir_variable *var); +bool dxil_nir_lower_kernel_input_loads(nir_shader *nir, nir_variable *var); + +bool +clc_nir_lower_printf(nir_shader *nir, unsigned uav_id); + +bool +clc_nir_dedupe_const_samplers(nir_shader *nir); + +#endif diff --git a/src/microsoft/clc/clglon12compiler.def b/src/microsoft/clc/clglon12compiler.def new file mode 100644 index 00000000000..924f7aa6723 --- /dev/null +++ b/src/microsoft/clc/clglon12compiler.def @@ -0,0 +1,12 @@ +EXPORTS + clc_context_new + clc_free_context + clc_context_serialize + clc_context_free_serialized + clc_context_deserialize + clc_compile + clc_link + clc_free_object + clc_to_dxil + clc_free_dxil_object + clc_compiler_get_version diff --git a/src/microsoft/clc/compute_test.cpp b/src/microsoft/clc/compute_test.cpp new file mode 100644 index 00000000000..46f5d87014c --- /dev/null +++ b/src/microsoft/clc/compute_test.cpp @@ -0,0 +1,880 @@ +/* + * Copyright © Microsoft Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include +#include +#include + +#include +#include +#include +#include + +#include "util/u_debug.h" +#include "clc_compiler.h" +#include "compute_test.h" +#include "dxcapi.h" + +using std::runtime_error; +using Microsoft::WRL::ComPtr; + +enum compute_test_debug_flags { + COMPUTE_DEBUG_EXPERIMENTAL_SHADERS = 1 << 0, + COMPUTE_DEBUG_USE_HW_D3D = 1 << 1, + COMPUTE_DEBUG_OPTIMIZE_LIBCLC = 1 << 2, + COMPUTE_DEBUG_SERIALIZE_LIBCLC = 1 << 3, +}; + +static const struct debug_named_value debug_options[] = { + { "experimental_shaders", COMPUTE_DEBUG_EXPERIMENTAL_SHADERS, "Enable experimental shaders" }, + { "use_hw_d3d", COMPUTE_DEBUG_USE_HW_D3D, "Use a hardware D3D device" }, + { "optimize_libclc", COMPUTE_DEBUG_OPTIMIZE_LIBCLC, "Optimize the clc_context before using it" }, + { "serialize_libclc", COMPUTE_DEBUG_SERIALIZE_LIBCLC, "Serialize and deserialize the clc_context" }, + DEBUG_NAMED_VALUE_END +}; + +DEBUG_GET_ONCE_FLAGS_OPTION(debug_compute, "COMPUTE_TEST_DEBUG", debug_options, 0) + +static void warning_callback(void *priv, const char *msg) +{ + fprintf(stderr, "WARNING: %s\n", msg); +} + +static void error_callback(void *priv, const char *msg) +{ + fprintf(stderr, "ERROR: %s\n", msg); +} + +static const struct clc_logger logger = { + NULL, + error_callback, + warning_callback, +}; + +void +ComputeTest::enable_d3d12_debug_layer() +{ + HMODULE hD3D12Mod = LoadLibrary("D3D12.DLL"); + if (!hD3D12Mod) { + fprintf(stderr, "D3D12: failed to load D3D12.DLL\n"); + return; + } + + typedef HRESULT(WINAPI * PFN_D3D12_GET_DEBUG_INTERFACE)(REFIID riid, + void **ppFactory); + PFN_D3D12_GET_DEBUG_INTERFACE D3D12GetDebugInterface = (PFN_D3D12_GET_DEBUG_INTERFACE)GetProcAddress(hD3D12Mod, "D3D12GetDebugInterface"); + if (!D3D12GetDebugInterface) { + fprintf(stderr, "D3D12: failed to load D3D12GetDebugInterface from D3D12.DLL\n"); + return; + } + + ID3D12Debug *debug; + if (FAILED(D3D12GetDebugInterface(__uuidof(ID3D12Debug), (void **)& debug))) { + fprintf(stderr, "D3D12: D3D12GetDebugInterface failed\n"); + return; + } + + debug->EnableDebugLayer(); +} + +IDXGIFactory4 * +ComputeTest::get_dxgi_factory() +{ + static const GUID IID_IDXGIFactory4 = { + 0x1bc6ea02, 0xef36, 0x464f, + { 0xbf, 0x0c, 0x21, 0xca, 0x39, 0xe5, 0x16, 0x8a } + }; + + typedef HRESULT(WINAPI * PFN_CREATE_DXGI_FACTORY)(REFIID riid, + void **ppFactory); + PFN_CREATE_DXGI_FACTORY CreateDXGIFactory; + + HMODULE hDXGIMod = LoadLibrary("DXGI.DLL"); + if (!hDXGIMod) + throw runtime_error("Failed to load DXGI.DLL"); + + CreateDXGIFactory = (PFN_CREATE_DXGI_FACTORY)GetProcAddress(hDXGIMod, "CreateDXGIFactory"); + if (!CreateDXGIFactory) + throw runtime_error("Failed to load CreateDXGIFactory from DXGI.DLL"); + + IDXGIFactory4 *factory = NULL; + HRESULT hr = CreateDXGIFactory(IID_IDXGIFactory4, (void **)&factory); + if (FAILED(hr)) + throw runtime_error("CreateDXGIFactory failed"); + + return factory; +} + +IDXGIAdapter1 * +ComputeTest::choose_adapter(IDXGIFactory4 *factory) +{ + IDXGIAdapter1 *ret; + + if (debug_get_option_debug_compute() & COMPUTE_DEBUG_USE_HW_D3D) { + for (unsigned i = 0; SUCCEEDED(factory->EnumAdapters1(i, &ret)); i++) { + DXGI_ADAPTER_DESC1 desc; + ret->GetDesc1(&desc); + if (!(desc.Flags & D3D_DRIVER_TYPE_SOFTWARE)) + return ret; + } + throw runtime_error("Failed to enum hardware adapter"); + } else { + if (FAILED(factory->EnumWarpAdapter(__uuidof(IDXGIAdapter1), + (void **)& ret))) + throw runtime_error("Failed to enum warp adapter"); + return ret; + } +} + +ID3D12Device * +ComputeTest::create_device(IDXGIAdapter1 
*adapter) +{ + typedef HRESULT(WINAPI *PFN_D3D12CREATEDEVICE)(IUnknown *, D3D_FEATURE_LEVEL, REFIID, void **); + PFN_D3D12CREATEDEVICE D3D12CreateDevice; + + HMODULE hD3D12Mod = LoadLibrary("D3D12.DLL"); + if (!hD3D12Mod) + throw runtime_error("failed to load D3D12.DLL"); + + if (debug_get_option_debug_compute() & COMPUTE_DEBUG_EXPERIMENTAL_SHADERS) { + typedef HRESULT(WINAPI *PFN_D3D12ENABLEEXPERIMENTALFEATURES)(UINT, const IID *, void *, UINT *); + PFN_D3D12ENABLEEXPERIMENTALFEATURES D3D12EnableExperimentalFeatures; + D3D12EnableExperimentalFeatures = (PFN_D3D12ENABLEEXPERIMENTALFEATURES) + GetProcAddress(hD3D12Mod, "D3D12EnableExperimentalFeatures"); + if (FAILED(D3D12EnableExperimentalFeatures(1, &D3D12ExperimentalShaderModels, NULL, NULL))) + throw runtime_error("failed to enable experimental shader models"); + } + + D3D12CreateDevice = (PFN_D3D12CREATEDEVICE)GetProcAddress(hD3D12Mod, "D3D12CreateDevice"); + if (!D3D12CreateDevice) + throw runtime_error("failed to load D3D12CreateDevice from D3D12.DLL"); + + ID3D12Device *dev; + if (FAILED(D3D12CreateDevice(adapter, D3D_FEATURE_LEVEL_12_0, + __uuidof(ID3D12Device), (void **)& dev))) + throw runtime_error("D3D12CreateDevice failed"); + + return dev; +} + +ComPtr +ComputeTest::create_root_signature(const ComputeTest::Resources &resources) +{ + D3D12_ROOT_PARAMETER1 root_param; + root_param.ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; + root_param.DescriptorTable.NumDescriptorRanges = resources.ranges.size(); + root_param.DescriptorTable.pDescriptorRanges = resources.ranges.data(); + root_param.ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; + + D3D12_ROOT_SIGNATURE_DESC1 root_sig_desc; + root_sig_desc.NumParameters = 1; + root_sig_desc.pParameters = &root_param; + root_sig_desc.NumStaticSamplers = 0; + root_sig_desc.pStaticSamplers = NULL; + root_sig_desc.Flags = D3D12_ROOT_SIGNATURE_FLAG_NONE; + + D3D12_VERSIONED_ROOT_SIGNATURE_DESC versioned_desc; + versioned_desc.Version = D3D_ROOT_SIGNATURE_VERSION_1_1; + versioned_desc.Desc_1_1 = root_sig_desc; + + ID3DBlob *sig, *error; + if (FAILED(D3D12SerializeVersionedRootSignature(&versioned_desc, + &sig, &error))) + throw runtime_error("D3D12SerializeVersionedRootSignature failed"); + + ComPtr ret; + if (FAILED(dev->CreateRootSignature(0, + sig->GetBufferPointer(), + sig->GetBufferSize(), + __uuidof(ret), + (void **)& ret))) + throw runtime_error("CreateRootSignature failed"); + + return ret; +} + +ComPtr +ComputeTest::create_pipeline_state(ComPtr &root_sig, + const struct clc_dxil_object &dxil) +{ + D3D12_COMPUTE_PIPELINE_STATE_DESC pipeline_desc = { root_sig.Get() }; + pipeline_desc.CS.pShaderBytecode = dxil.binary.data; + pipeline_desc.CS.BytecodeLength = dxil.binary.size; + + ComPtr pipeline_state; + if (FAILED(dev->CreateComputePipelineState(&pipeline_desc, + __uuidof(pipeline_state), + (void **)& pipeline_state))) + throw runtime_error("Failed to create pipeline state"); + return pipeline_state; +} + +ComPtr +ComputeTest::create_buffer(int size, D3D12_HEAP_TYPE heap_type) +{ + D3D12_RESOURCE_DESC desc; + desc.Format = DXGI_FORMAT_UNKNOWN; + desc.Alignment = D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT; + desc.Width = size; + desc.Height = 1; + desc.DepthOrArraySize = 1; + desc.MipLevels = 1; + desc.SampleDesc.Count = 1; + desc.SampleDesc.Quality = 0; + desc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER; + desc.Flags = heap_type == D3D12_HEAP_TYPE_DEFAULT ? 
+   desc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR;
+
+   D3D12_HEAP_PROPERTIES heap_props = dev->GetCustomHeapProperties(0, heap_type);
+
+   D3D12_RESOURCE_STATES initial_state = D3D12_RESOURCE_STATE_COMMON;
+   switch (heap_type) {
+   case D3D12_HEAP_TYPE_UPLOAD:
+      initial_state = D3D12_RESOURCE_STATE_GENERIC_READ;
+      break;
+
+   case D3D12_HEAP_TYPE_READBACK:
+      initial_state = D3D12_RESOURCE_STATE_COPY_DEST;
+      break;
+   }
+
+   ComPtr<ID3D12Resource> res;
+   if (FAILED(dev->CreateCommittedResource(&heap_props,
+                                           D3D12_HEAP_FLAG_NONE, &desc, initial_state,
+                                           NULL, __uuidof(ID3D12Resource), (void **)&res)))
+      throw runtime_error("CreateCommittedResource failed");
+
+   return res;
+}
+
+ComPtr<ID3D12Resource>
+ComputeTest::create_upload_buffer_with_data(const void *data, size_t size)
+{
+   auto upload_res = create_buffer(size, D3D12_HEAP_TYPE_UPLOAD);
+
+   void *ptr = NULL;
+   D3D12_RANGE res_range = { 0, (SIZE_T)size };
+   if (FAILED(upload_res->Map(0, &res_range, (void **)&ptr)))
+      throw runtime_error("Failed to map upload-buffer");
+   assert(ptr);
+   memcpy(ptr, data, size);
+   upload_res->Unmap(0, &res_range);
+   return upload_res;
+}
+
+ComPtr<ID3D12Resource>
+ComputeTest::create_sized_buffer_with_data(size_t buffer_size,
+                                           const void *data,
+                                           size_t data_size)
+{
+   auto upload_res = create_upload_buffer_with_data(data, data_size);
+
+   auto res = create_buffer(buffer_size, D3D12_HEAP_TYPE_DEFAULT);
+   resource_barrier(res, D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_COPY_DEST);
+   cmdlist->CopyBufferRegion(res.Get(), 0, upload_res.Get(), 0, data_size);
+   resource_barrier(res, D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_COMMON);
+   execute_cmdlist();
+
+   return res;
+}
+
+void
+ComputeTest::get_buffer_data(ComPtr<ID3D12Resource> res,
+                             void *buf, size_t size)
+{
+   auto readback_res = create_buffer(align(size, 4), D3D12_HEAP_TYPE_READBACK);
+   resource_barrier(res, D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_COPY_SOURCE);
+   cmdlist->CopyResource(readback_res.Get(), res.Get());
+   resource_barrier(res, D3D12_RESOURCE_STATE_COPY_SOURCE, D3D12_RESOURCE_STATE_COMMON);
+   execute_cmdlist();
+
+   void *ptr = NULL;
+   D3D12_RANGE res_range = { 0, size };
+   if (FAILED(readback_res->Map(0, &res_range, &ptr)))
+      throw runtime_error("Failed to map readback-buffer");
+
+   memcpy(buf, ptr, size);
+
+   D3D12_RANGE empty_range = { 0, 0 };
+   readback_res->Unmap(0, &empty_range);
+}
+
+void
+ComputeTest::resource_barrier(ComPtr<ID3D12Resource> &res,
+                              D3D12_RESOURCE_STATES state_before,
+                              D3D12_RESOURCE_STATES state_after)
+{
+   D3D12_RESOURCE_BARRIER barrier;
+   barrier.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
+   barrier.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;
+   barrier.Transition.pResource = res.Get();
+   barrier.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
+   barrier.Transition.StateBefore = state_before;
+   barrier.Transition.StateAfter = state_after;
+   cmdlist->ResourceBarrier(1, &barrier);
+}
+
+void
+ComputeTest::execute_cmdlist()
+{
+   if (FAILED(cmdlist->Close()))
+      throw runtime_error("Closing ID3D12GraphicsCommandList failed");
+
+   ID3D12CommandList *cmdlists[] = { cmdlist };
+   cmdqueue->ExecuteCommandLists(1, cmdlists);
+   cmdqueue_fence->SetEventOnCompletion(fence_value, event);
+   cmdqueue->Signal(cmdqueue_fence, fence_value);
+   fence_value++;
+   WaitForSingleObject(event, INFINITE);
+
+   if (FAILED(cmdalloc->Reset()))
+      throw runtime_error("resetting ID3D12CommandAllocator failed");
+
+   if (FAILED(cmdlist->Reset(cmdalloc, NULL)))
+      throw runtime_error("resetting ID3D12GraphicsCommandList failed");
+}
+
+void
+ComputeTest::create_uav_buffer(ComPtr<ID3D12Resource> res,
+                               size_t width, size_t byte_stride,
+                               D3D12_CPU_DESCRIPTOR_HANDLE cpu_handle)
+{
+   D3D12_UNORDERED_ACCESS_VIEW_DESC uav_desc;
+   uav_desc.Format = DXGI_FORMAT_R32_TYPELESS;
+   uav_desc.ViewDimension = D3D12_UAV_DIMENSION_BUFFER;
+   uav_desc.Buffer.FirstElement = 0;
+   uav_desc.Buffer.NumElements = DIV_ROUND_UP(width * byte_stride, 4);
+   uav_desc.Buffer.StructureByteStride = 0;
+   uav_desc.Buffer.CounterOffsetInBytes = 0;
+   uav_desc.Buffer.Flags = D3D12_BUFFER_UAV_FLAG_RAW;
+
+   dev->CreateUnorderedAccessView(res.Get(), NULL, &uav_desc, cpu_handle);
+}
+
+void
+ComputeTest::create_cbv(ComPtr<ID3D12Resource> res, size_t size,
+                        D3D12_CPU_DESCRIPTOR_HANDLE cpu_handle)
+{
+   D3D12_CONSTANT_BUFFER_VIEW_DESC cbv_desc;
+   cbv_desc.BufferLocation = res ? res->GetGPUVirtualAddress() : 0;
+   cbv_desc.SizeInBytes = size;
+
+   dev->CreateConstantBufferView(&cbv_desc, cpu_handle);
+}
+
+ComPtr<ID3D12Resource>
+ComputeTest::add_uav_resource(ComputeTest::Resources &resources,
+                              unsigned spaceid, unsigned resid,
+                              const void *data, size_t num_elems,
+                              size_t elem_size)
+{
+   size_t size = align(elem_size * num_elems, 4);
+   D3D12_CPU_DESCRIPTOR_HANDLE handle;
+   ComPtr<ID3D12Resource> res;
+   handle = uav_heap->GetCPUDescriptorHandleForHeapStart();
+   handle = offset_cpu_handle(handle, resources.descs.size() * uav_heap_incr);
+
+   if (size) {
+      if (data)
+         res = create_buffer_with_data(data, size);
+      else
+         res = create_buffer(size, D3D12_HEAP_TYPE_DEFAULT);
+
+      resource_barrier(res, D3D12_RESOURCE_STATE_COMMON,
+                       D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
+   }
+   create_uav_buffer(res, num_elems, elem_size, handle);
+   resources.add(res, D3D12_DESCRIPTOR_RANGE_TYPE_UAV, spaceid, resid);
+   return res;
+}
+
+ComPtr<ID3D12Resource>
+ComputeTest::add_cbv_resource(ComputeTest::Resources &resources,
+                              unsigned spaceid, unsigned resid,
+                              const void *data, size_t size)
+{
+   unsigned aligned_size = align(size, 256);
+   D3D12_CPU_DESCRIPTOR_HANDLE handle;
+   ComPtr<ID3D12Resource> res;
+   handle = uav_heap->GetCPUDescriptorHandleForHeapStart();
+   handle = offset_cpu_handle(handle, resources.descs.size() * uav_heap_incr);
+
+   if (size) {
+      assert(data);
+      res = create_sized_buffer_with_data(aligned_size, data, size);
+   }
+   create_cbv(res, aligned_size, handle);
+   resources.add(res, D3D12_DESCRIPTOR_RANGE_TYPE_CBV, spaceid, resid);
+   return res;
+}
+
+void
+ComputeTest::run_shader_with_raw_args(Shader shader,
+                                      const CompileArgs &compile_args,
+                                      const std::vector<RawShaderArg *> &args)
+{
+   if (args.size() < 1)
+      throw runtime_error("no inputs");
+
+   static HMODULE hD3D12Mod = LoadLibrary("D3D12.DLL");
+   if (!hD3D12Mod)
+      throw runtime_error("Failed to load D3D12.DLL");
+
+   D3D12SerializeVersionedRootSignature =
+      (PFN_D3D12_SERIALIZE_VERSIONED_ROOT_SIGNATURE)
+      GetProcAddress(hD3D12Mod, "D3D12SerializeVersionedRootSignature");
+
+   if (args.size() != shader.dxil->kernel->num_args)
+      throw runtime_error("incorrect number of inputs");
+
+   struct clc_runtime_kernel_conf conf = { 0 };
+
+   // Older WARP and some hardware don't support int64, so for these tests,
+   // unconditionally lower away int64.  A more complex runtime can be
+   // smarter about detecting when this needs to be done.
+   conf.lower_bit_size = 64;
+
+   if (!shader.dxil->metadata.local_size[0])
+      conf.local_size[0] = compile_args.x;
+   else
+      conf.local_size[0] = shader.dxil->metadata.local_size[0];
+
+   if (!shader.dxil->metadata.local_size[1])
+      conf.local_size[1] = compile_args.y;
+   else
+      conf.local_size[1] = shader.dxil->metadata.local_size[1];
+
+   if (!shader.dxil->metadata.local_size[2])
+      conf.local_size[2] = compile_args.z;
+   else
+      conf.local_size[2] = shader.dxil->metadata.local_size[2];
+
+   if (compile_args.x % conf.local_size[0] ||
+       compile_args.y % conf.local_size[1] ||
+       compile_args.z % conf.local_size[2])
+      throw runtime_error("invalid global size: must be a multiple of the local size");
+
+   std::vector<struct clc_runtime_arg_info> argsinfo(args.size());
+
+   conf.args = argsinfo.data();
+   conf.support_global_work_id_offsets =
+      compile_args.work_props.global_offset_x != 0 ||
+      compile_args.work_props.global_offset_y != 0 ||
+      compile_args.work_props.global_offset_z != 0;
+   conf.support_work_group_id_offsets =
+      compile_args.work_props.group_id_offset_x != 0 ||
+      compile_args.work_props.group_id_offset_y != 0 ||
+      compile_args.work_props.group_id_offset_z != 0;
+
+   for (unsigned i = 0; i < shader.dxil->kernel->num_args; ++i) {
+      RawShaderArg *arg = args[i];
+      size_t size = arg->get_elem_size() * arg->get_num_elems();
+
+      switch (shader.dxil->kernel->args[i].address_qualifier) {
+      case CLC_KERNEL_ARG_ADDRESS_LOCAL:
+         argsinfo[i].localptr.size = size;
+         break;
+      default:
+         break;
+      }
+   }
+
+   configure(shader, &conf);
+   validate(shader);
+
+   std::shared_ptr<struct clc_dxil_object> &dxil = shader.dxil;
+
+   std::vector<uint8_t> argsbuf(dxil->metadata.kernel_inputs_buf_size);
+   std::vector<ComPtr<ID3D12Resource>> argres(shader.dxil->kernel->num_args);
+   clc_work_properties_data work_props = compile_args.work_props;
+   if (!conf.support_work_group_id_offsets) {
+      work_props.group_count_total_x = compile_args.x / conf.local_size[0];
+      work_props.group_count_total_y = compile_args.y / conf.local_size[1];
+      work_props.group_count_total_z = compile_args.z / conf.local_size[2];
+   }
+   if (work_props.work_dim == 0)
+      work_props.work_dim = 3;
+   Resources resources;
+
+   for (unsigned i = 0; i < dxil->kernel->num_args; ++i) {
+      RawShaderArg *arg = args[i];
+      size_t size = arg->get_elem_size() * arg->get_num_elems();
+      void *slot = argsbuf.data() + dxil->metadata.args[i].offset;
+
+      switch (dxil->kernel->args[i].address_qualifier) {
+      case CLC_KERNEL_ARG_ADDRESS_CONSTANT:
+      case CLC_KERNEL_ARG_ADDRESS_GLOBAL: {
+         assert(dxil->metadata.args[i].size == sizeof(uint64_t));
+         uint64_t *ptr_slot = (uint64_t *)slot;
+         if (arg->get_data())
+            *ptr_slot = (uint64_t)dxil->metadata.args[i].globconstptr.buf_id << 32;
+         else
+            *ptr_slot = ~0ull;
+         break;
+      }
+      case CLC_KERNEL_ARG_ADDRESS_LOCAL: {
+         assert(dxil->metadata.args[i].size == sizeof(uint64_t));
+         uint64_t *ptr_slot = (uint64_t *)slot;
+         *ptr_slot = dxil->metadata.args[i].localptr.sharedmem_offset;
+         break;
+      }
+      case CLC_KERNEL_ARG_ADDRESS_PRIVATE: {
+         assert(size == dxil->metadata.args[i].size);
+         memcpy(slot, arg->get_data(), size);
+         break;
+      }
+      default:
+         assert(0);
+      }
+   }
+
+   for (unsigned i = 0; i < dxil->kernel->num_args; ++i) {
+      RawShaderArg *arg = args[i];
+
+      if (dxil->kernel->args[i].address_qualifier == CLC_KERNEL_ARG_ADDRESS_GLOBAL ||
+          dxil->kernel->args[i].address_qualifier == CLC_KERNEL_ARG_ADDRESS_CONSTANT) {
+         argres[i] = add_uav_resource(resources, 0,
+                                      dxil->metadata.args[i].globconstptr.buf_id,
+                                      arg->get_data(), arg->get_num_elems(),
+                                      arg->get_elem_size());
+      }
+   }
+
+   if (dxil->metadata.printf_uav_id > 0)
+      add_uav_resource(resources, 0, dxil->metadata.printf_uav_id, NULL, 1024 * 1024 / 4, 4);
+
+   for (unsigned i = 0; i < dxil->metadata.num_consts; ++i)
+      add_uav_resource(resources, 0, dxil->metadata.consts[i].uav_id,
+                       dxil->metadata.consts[i].data,
+                       dxil->metadata.consts[i].size / 4, 4);
+
+   if (argsbuf.size())
+      add_cbv_resource(resources, 0, dxil->metadata.kernel_inputs_cbv_id,
+                       argsbuf.data(), argsbuf.size());
+
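+   /* Each dispatch also gets a constant buffer holding the
+    * clc_work_properties_data (work-group counts, global/group-id offsets,
+    * work_dim).  Note that D3D12 requires constant-buffer views to be sized
+    * in multiples of 256 bytes, which is why add_cbv_resource() pads its
+    * allocation with align(size, 256) above. */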
+   add_cbv_resource(resources, 0, dxil->metadata.work_properties_cbv_id,
+                    &work_props, sizeof(work_props));
+
+   auto root_sig = create_root_signature(resources);
+   auto pipeline_state = create_pipeline_state(root_sig, *dxil);
+
+   cmdlist->SetDescriptorHeaps(1, &uav_heap);
+   cmdlist->SetComputeRootSignature(root_sig.Get());
+   cmdlist->SetComputeRootDescriptorTable(0, uav_heap->GetGPUDescriptorHandleForHeapStart());
+   cmdlist->SetPipelineState(pipeline_state.Get());
+
+   cmdlist->Dispatch(compile_args.x / conf.local_size[0],
+                     compile_args.y / conf.local_size[1],
+                     compile_args.z / conf.local_size[2]);
+
+   for (auto &range : resources.ranges) {
+      if (range.RangeType == D3D12_DESCRIPTOR_RANGE_TYPE_UAV) {
+         for (unsigned i = range.OffsetInDescriptorsFromTableStart;
+              i < range.OffsetInDescriptorsFromTableStart + range.NumDescriptors;
+              i++) {
+            if (!resources.descs[i].Get())
+               continue;
+
+            resource_barrier(resources.descs[i],
+                             D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
+                             D3D12_RESOURCE_STATE_COMMON);
+         }
+      }
+   }
+
+   execute_cmdlist();
+
+   for (unsigned i = 0; i < args.size(); i++) {
+      if (!(args[i]->get_direction() & SHADER_ARG_OUTPUT))
+         continue;
+
+      assert(dxil->kernel->args[i].address_qualifier == CLC_KERNEL_ARG_ADDRESS_GLOBAL);
+      get_buffer_data(argres[i], args[i]->get_data(),
+                      args[i]->get_elem_size() * args[i]->get_num_elems());
+   }
+
+   ComPtr<ID3D12InfoQueue> info_queue;
+   dev->QueryInterface(info_queue.ReleaseAndGetAddressOf());
+   if (info_queue) {
+      EXPECT_EQ(0, info_queue->GetNumStoredMessages());
+      for (unsigned i = 0; i < info_queue->GetNumStoredMessages(); ++i) {
+         SIZE_T message_size = 0;
+         info_queue->GetMessageA(i, nullptr, &message_size);
+         D3D12_MESSAGE *message = (D3D12_MESSAGE *)malloc(message_size);
+         info_queue->GetMessageA(i, message, &message_size);
+         std::string desc = message->pDescription;
+         free(message);
+         ADD_FAILURE() << desc;
+      }
+   }
+}
+
+void
+ComputeTest::SetUp()
+{
+   static struct clc_context *compiler_ctx_g = nullptr;
+
+   if (!compiler_ctx_g) {
+      clc_context_options options = { };
+      options.optimize = (debug_get_option_debug_compute() & COMPUTE_DEBUG_OPTIMIZE_LIBCLC) != 0;
+
+      compiler_ctx_g = clc_context_new(&logger, &options);
+      if (!compiler_ctx_g)
+         throw runtime_error("failed to create CLC compiler context");
+
+      if (debug_get_option_debug_compute() & COMPUTE_DEBUG_SERIALIZE_LIBCLC) {
+         void *serialized = nullptr;
+         size_t serialized_size = 0;
+         clc_context_serialize(compiler_ctx_g, &serialized, &serialized_size);
+         if (!serialized)
+            throw runtime_error("failed to serialize CLC compiler context");
+
+         clc_free_context(compiler_ctx_g);
+         compiler_ctx_g = nullptr;
+
+         compiler_ctx_g = clc_context_deserialize(serialized, serialized_size);
+         if (!compiler_ctx_g)
+            throw runtime_error("failed to deserialize CLC compiler context");
+
+         clc_context_free_serialized(serialized);
+      }
+   }
+   compiler_ctx = compiler_ctx_g;
+
+   enable_d3d12_debug_layer();
+
+   factory = get_dxgi_factory();
+   if (!factory)
+      throw runtime_error("failed to create DXGI factory");
+
+   adapter = choose_adapter(factory);
+   if (!adapter)
+      throw runtime_error("failed to choose adapter");
+
+   dev = create_device(adapter);
+   if (!dev)
+      throw runtime_error("failed to create device");
+
+   if (FAILED(dev->CreateFence(0, D3D12_FENCE_FLAG_NONE,
+                               __uuidof(cmdqueue_fence),
+                               (void **)&cmdqueue_fence)))
+      throw runtime_error("failed to create fence");
+
+   D3D12_COMMAND_QUEUE_DESC queue_desc;
+   queue_desc.Type = D3D12_COMMAND_LIST_TYPE_COMPUTE;
+   queue_desc.Priority = D3D12_COMMAND_QUEUE_PRIORITY_NORMAL;
+   queue_desc.Flags = D3D12_COMMAND_QUEUE_FLAG_NONE;
+   queue_desc.NodeMask = 0;
+   if (FAILED(dev->CreateCommandQueue(&queue_desc,
+                                      __uuidof(cmdqueue),
+                                      (void **)&cmdqueue)))
+      throw runtime_error("failed to create command queue");
+
+   if (FAILED(dev->CreateCommandAllocator(D3D12_COMMAND_LIST_TYPE_COMPUTE,
+                                          __uuidof(cmdalloc), (void **)&cmdalloc)))
+      throw runtime_error("failed to create command allocator");
+
+   if (FAILED(dev->CreateCommandList(0, D3D12_COMMAND_LIST_TYPE_COMPUTE,
+                                     cmdalloc, NULL, __uuidof(cmdlist), (void **)&cmdlist)))
+      throw runtime_error("failed to create command list");
+
+   D3D12_DESCRIPTOR_HEAP_DESC heap_desc;
+   heap_desc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV;
+   heap_desc.NumDescriptors = 1000;
+   heap_desc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE;
+   heap_desc.NodeMask = 0;
+   if (FAILED(dev->CreateDescriptorHeap(&heap_desc,
+                                        __uuidof(uav_heap), (void **)&uav_heap)))
+      throw runtime_error("failed to create descriptor heap");
+
+   uav_heap_incr = dev->GetDescriptorHandleIncrementSize(D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV);
+
+   event = CreateEvent(NULL, FALSE, FALSE, NULL);
+   if (!event)
+      throw runtime_error("Failed to create event");
+   fence_value = 1;
+}
+
+void
+ComputeTest::TearDown()
+{
+   CloseHandle(event);
+
+   uav_heap->Release();
+   cmdlist->Release();
+   cmdalloc->Release();
+   cmdqueue->Release();
+   cmdqueue_fence->Release();
+   dev->Release();
+   adapter->Release();
+   factory->Release();
+}
+
+PFN_D3D12_SERIALIZE_VERSIONED_ROOT_SIGNATURE ComputeTest::D3D12SerializeVersionedRootSignature;
+
+bool
+validate_module(const struct clc_dxil_object &dxil)
+{
+   static HMODULE hmod = LoadLibrary("DXIL.DLL");
+   if (!hmod) {
+      /* Enabling experimental shaders allows us to run unsigned shader code,
+       * such as when under the debugger where we can't run the validator. */
+      if (debug_get_option_debug_compute() & COMPUTE_DEBUG_EXPERIMENTAL_SHADERS)
+         return true;
+      else
+         throw runtime_error("failed to load DXIL.DLL");
+   }
+
+   DxcCreateInstanceProc pfnDxcCreateInstance =
+      (DxcCreateInstanceProc)GetProcAddress(hmod, "DxcCreateInstance");
+   if (!pfnDxcCreateInstance)
+      throw runtime_error("failed to load DxcCreateInstance");
+
+   struct shader_blob : public IDxcBlob {
+      shader_blob(void *data, size_t size) : data(data), size(size) {}
+      LPVOID STDMETHODCALLTYPE GetBufferPointer() override { return data; }
+      SIZE_T STDMETHODCALLTYPE GetBufferSize() override { return size; }
+      HRESULT STDMETHODCALLTYPE QueryInterface(REFIID, void **) override { return E_NOINTERFACE; }
+      ULONG STDMETHODCALLTYPE AddRef() override { return 1; }
+      ULONG STDMETHODCALLTYPE Release() override { return 0; }
+      void *data;
+      size_t size;
+   } blob(dxil.binary.data, dxil.binary.size);
+
+   IDxcValidator *validator;
+   if (FAILED(pfnDxcCreateInstance(CLSID_DxcValidator, __uuidof(IDxcValidator),
+                                   (void **)&validator)))
+      throw runtime_error("failed to create IDxcValidator");
+
+   IDxcOperationResult *result;
+   if (FAILED(validator->Validate(&blob, DxcValidatorFlags_InPlaceEdit,
+                                  &result)))
+      throw runtime_error("Validate failed");
+
+   HRESULT hr;
+   if (FAILED(result->GetStatus(&hr)) ||
+       FAILED(hr)) {
+      IDxcBlobEncoding *message;
+      result->GetErrorBuffer(&message);
+      fprintf(stderr, "D3D12: validation failed: %.*s\n",
+              (int)message->GetBufferSize(),
+              (char *)message->GetBufferPointer());
+      message->Release();
+      validator->Release();
+      result->Release();
+      return false;
+   }
+
+   validator->Release();
+   result->Release();
+   return true;
+}
+
+static void
+dump_blob(const char *path, const struct clc_dxil_object &dxil)
+{
+   FILE *fp = fopen(path, "wb");
+   if (fp) {
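+      /* Dump the raw DXIL container to disk; validate() writes it out both
+       * before and after validation ("unsigned.cso" / "signed.cso") so a
+       * failing module can be inspected offline. */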
+      fwrite(dxil.binary.data, 1, dxil.binary.size, fp);
+      fclose(fp);
+      printf("D3D12: wrote '%s'...\n", path);
+   }
+}
+
+ComputeTest::Shader
+ComputeTest::compile(const std::vector<const char *> &sources,
+                     const std::vector<const char *> &compile_args,
+                     bool create_library)
+{
+   struct clc_compile_args args = { 0 };
+   args.args = compile_args.data();
+   args.num_args = (unsigned)compile_args.size();
+
+   std::vector<Shader> shaders;
+
+   args.source.name = "obj.cl";
+
+   for (unsigned i = 0; i < sources.size(); i++) {
+      args.source.value = sources[i];
+
+      auto obj = clc_compile(compiler_ctx, &args, &logger);
+      if (!obj)
+         throw runtime_error("failed to compile object!");
+
+      Shader shader;
+      shader.obj = std::shared_ptr<struct clc_object>(obj, clc_free_object);
+      shaders.push_back(shader);
+   }
+
+   if (shaders.size() == 1 && create_library)
+      return shaders[0];
+
+   return link(shaders, create_library);
+}
+
+ComputeTest::Shader
+ComputeTest::link(const std::vector<Shader> &sources,
+                  bool create_library)
+{
+   std::vector<struct clc_object *> objs;
+   for (auto &source : sources)
+      objs.push_back(source.obj.get());
+
+   struct clc_linker_args link_args = {};
+   link_args.in_objs = objs.data();
+   link_args.num_in_objs = (unsigned)objs.size();
+   link_args.create_library = create_library;
+   struct clc_object *obj = clc_link(compiler_ctx,
+                                     &link_args,
+                                     &logger);
+   if (!obj)
+      throw runtime_error("failed to link objects!");
+
+   ComputeTest::Shader shader;
+   shader.obj = std::shared_ptr<struct clc_object>(obj, clc_free_object);
+   if (!link_args.create_library)
+      configure(shader, NULL);
+
+   return shader;
+}
+
+void
+ComputeTest::configure(Shader &shader,
+                       const struct clc_runtime_kernel_conf *conf)
+{
+   struct clc_dxil_object *dxil;
+
+   dxil = clc_to_dxil(compiler_ctx, shader.obj.get(), "main_test", conf, &logger);
+   if (!dxil)
+      throw runtime_error("failed to compile kernel!");
+
+   shader.dxil = std::shared_ptr<struct clc_dxil_object>(dxil, clc_free_dxil_object);
+}
+
+void
+ComputeTest::validate(ComputeTest::Shader &shader)
+{
+   dump_blob("unsigned.cso", *shader.dxil);
+   if (!validate_module(*shader.dxil))
+      throw runtime_error("failed to validate module!");
+
+   dump_blob("signed.cso", *shader.dxil);
+}
diff --git a/src/microsoft/clc/compute_test.h b/src/microsoft/clc/compute_test.h
new file mode 100644
index 00000000000..6fb06e6ab67
--- /dev/null
+++ b/src/microsoft/clc/compute_test.h
@@ -0,0 +1,324 @@
+/*
+ * Copyright © Microsoft Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <d3d12.h>
+#include <dxgi1_4.h>
+#include <gtest/gtest.h>
+#include <wrl.h>
+
+#include "clc_compiler.h"
+
+using std::runtime_error;
+using Microsoft::WRL::ComPtr;
+
+inline D3D12_CPU_DESCRIPTOR_HANDLE
+offset_cpu_handle(D3D12_CPU_DESCRIPTOR_HANDLE handle, UINT offset)
+{
+   handle.ptr += offset;
+   return handle;
+}
+
+inline size_t
+align(size_t value, unsigned alignment)
+{
+   assert(alignment > 0);
+   return ((value + (alignment - 1)) / alignment) * alignment;
+}
+
+class ComputeTest : public ::testing::Test {
+protected:
+   struct Shader {
+      std::shared_ptr<struct clc_object> obj;
+      std::shared_ptr<struct clc_dxil_object> dxil;
+   };
+
+   static void
+   enable_d3d12_debug_layer();
+
+   static IDXGIFactory4 *
+   get_dxgi_factory();
+
+   static IDXGIAdapter1 *
+   choose_adapter(IDXGIFactory4 *factory);
+
+   static ID3D12Device *
+   create_device(IDXGIAdapter1 *adapter);
+
+   struct Resources {
+      void add(ComPtr<ID3D12Resource> res,
+               D3D12_DESCRIPTOR_RANGE_TYPE type,
+               unsigned spaceid,
+               unsigned resid)
+      {
+         descs.push_back(res);
+
+         if (!ranges.empty() &&
+             ranges.back().RangeType == type &&
+             ranges.back().RegisterSpace == spaceid &&
+             ranges.back().BaseShaderRegister + ranges.back().NumDescriptors == resid) {
+            ranges.back().NumDescriptors++;
+            return;
+         }
+
+         D3D12_DESCRIPTOR_RANGE1 range;
+
+         range.RangeType = type;
+         range.NumDescriptors = 1;
+         range.BaseShaderRegister = resid;
+         range.RegisterSpace = spaceid;
+         range.OffsetInDescriptorsFromTableStart = descs.size() - 1;
+         range.Flags = D3D12_DESCRIPTOR_RANGE_FLAG_DESCRIPTORS_STATIC_KEEPING_BUFFER_BOUNDS_CHECKS;
+         ranges.push_back(range);
+      }
+
+      std::vector<D3D12_DESCRIPTOR_RANGE1> ranges;
+      std::vector<ComPtr<ID3D12Resource>> descs;
+   };
+
+   ComPtr<ID3D12RootSignature>
+   create_root_signature(const Resources &resources);
+
+   ComPtr<ID3D12PipelineState>
+   create_pipeline_state(ComPtr<ID3D12RootSignature> &root_sig,
+                         const struct clc_dxil_object &dxil);
+
+   ComPtr<ID3D12Resource>
+   create_buffer(int size, D3D12_HEAP_TYPE heap_type);
+
+   ComPtr<ID3D12Resource>
+   create_upload_buffer_with_data(const void *data, size_t size);
+
+   ComPtr<ID3D12Resource>
+   create_sized_buffer_with_data(size_t buffer_size, const void *data,
+                                 size_t data_size);
+
+   ComPtr<ID3D12Resource>
+   create_buffer_with_data(const void *data, size_t size)
+   {
+      return create_sized_buffer_with_data(size, data, size);
+   }
+
+   void
+   get_buffer_data(ComPtr<ID3D12Resource> res,
+                   void *buf, size_t size);
+
+   void
+   resource_barrier(ComPtr<ID3D12Resource> &res,
+                    D3D12_RESOURCE_STATES state_before,
+                    D3D12_RESOURCE_STATES state_after);
+
+   void
+   execute_cmdlist();
+
+   void
+   create_uav_buffer(ComPtr<ID3D12Resource> res,
+                     size_t width, size_t byte_stride,
+                     D3D12_CPU_DESCRIPTOR_HANDLE cpu_handle);
+
+   void create_cbv(ComPtr<ID3D12Resource> res, size_t size,
+                   D3D12_CPU_DESCRIPTOR_HANDLE cpu_handle);
+
+   ComPtr<ID3D12Resource>
+   add_uav_resource(Resources &resources, unsigned spaceid, unsigned resid,
+                    const void *data = NULL, size_t num_elems = 0,
+                    size_t elem_size = 0);
+
+   ComPtr<ID3D12Resource>
+   add_cbv_resource(Resources &resources, unsigned spaceid, unsigned resid,
+                    const void *data, size_t size);
+
+   void
+   SetUp() override;
+
+   void
+   TearDown() override;
+
+   Shader
+   compile(const std::vector<const char *> &sources,
+           const std::vector<const char *> &compile_args = {},
+           bool create_library = false);
+
+   Shader
+   link(const std::vector<Shader> &sources,
+        bool create_library = false);
+
+   void
+   configure(Shader &shader,
+             const struct clc_runtime_kernel_conf *conf);
+
+   void
+   validate(Shader &shader);
+
+   enum ShaderArgDirection {
+      SHADER_ARG_INPUT = 1,
+      SHADER_ARG_OUTPUT = 2,
+      SHADER_ARG_INOUT = SHADER_ARG_INPUT | SHADER_ARG_OUTPUT,
+   };
+
+   class RawShaderArg {
+   public:
+      RawShaderArg(enum ShaderArgDirection dir) : dir(dir) { }
+      virtual size_t get_elem_size() const = 0;
+      virtual size_t get_num_elems() const = 0;
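+      /* Raw bytes backing the argument; for arguments whose direction
+       * includes SHADER_ARG_OUTPUT, run_shader_with_raw_args() reads the
+       * UAV contents back through this pointer after the dispatch. */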
+      virtual const void *get_data() const = 0;
+      virtual void *get_data() = 0;
+      enum ShaderArgDirection get_direction() { return dir; }
+   private:
+      enum ShaderArgDirection dir;
+   };
+
+   class NullShaderArg : public RawShaderArg {
+   public:
+      NullShaderArg() : RawShaderArg(SHADER_ARG_INPUT) { }
+      size_t get_elem_size() const override { return 0; }
+      size_t get_num_elems() const override { return 0; }
+      const void *get_data() const override { return NULL; }
+      void *get_data() override { return NULL; }
+   };
+
+   template <typename T>
+   class ShaderArg : public std::vector<T>, public RawShaderArg
+   {
+   public:
+      ShaderArg(const T &v, enum ShaderArgDirection dir = SHADER_ARG_INOUT) :
+         std::vector<T>({ v }), RawShaderArg(dir) { }
+      ShaderArg(const std::vector<T> &v, enum ShaderArgDirection dir = SHADER_ARG_INOUT) :
+         std::vector<T>(v), RawShaderArg(dir) { }
+      ShaderArg(const std::initializer_list<T> v, enum ShaderArgDirection dir = SHADER_ARG_INOUT) :
+         std::vector<T>(v), RawShaderArg(dir) { }
+
+      ShaderArg &operator =(const T &v)
+      {
+         this->clear();
+         this->push_back(v);
+         return *this;
+      }
+
+      operator T&() { return this->at(0); }
+      operator const T&() const { return this->at(0); }
+
+      ShaderArg &operator =(const std::vector<T> &v)
+      {
+         std::vector<T>::operator =(v);
+         return *this;
+      }
+
+      ShaderArg &operator =(std::initializer_list<T> v)
+      {
+         std::vector<T>::operator =(v);
+         return *this;
+      }
+
+      size_t get_elem_size() const override { return sizeof(T); }
+      size_t get_num_elems() const override { return this->size(); }
+      const void *get_data() const override { return this->data(); }
+      void *get_data() override { return this->data(); }
+   };
+
+   struct CompileArgs
+   {
+      unsigned x, y, z;
+      std::vector<const char *> compiler_command_line;
+      clc_work_properties_data work_props;
+   };
+
+private:
+   void gather_args(std::vector<RawShaderArg *> &args) { }
+
+   template <typename T, typename... Rest>
+   void gather_args(std::vector<RawShaderArg *> &args, T &arg, Rest&... rest)
+   {
+      args.push_back(&arg);
+      gather_args(args, rest...);
+   }
+
+   void run_shader_with_raw_args(Shader shader,
+                                 const CompileArgs &compile_args,
+                                 const std::vector<RawShaderArg *> &args);
+
+protected:
+   template <typename... Args>
+   void run_shader(Shader shader,
+                   const CompileArgs &compile_args,
+                   Args&... args)
+   {
+      std::vector<RawShaderArg *> raw_args;
+      gather_args(raw_args, args...);
+      run_shader_with_raw_args(shader, compile_args, raw_args);
+   }
+
+   template <typename... Args>
+   void run_shader(const std::vector<const char *> &sources,
+                   unsigned x, unsigned y, unsigned z,
+                   Args&... args)
+   {
+      std::vector<RawShaderArg *> raw_args;
+      gather_args(raw_args, args...);
+      CompileArgs compile_args = { x, y, z };
+      run_shader_with_raw_args(compile(sources), compile_args, raw_args);
+   }
+
+   template <typename... Args>
+   void run_shader(const std::vector<const char *> &sources,
+                   const CompileArgs &compile_args,
+                   Args&... args)
+   {
+      std::vector<RawShaderArg *> raw_args;
+      gather_args(raw_args, args...);
+      run_shader_with_raw_args(
+         compile(sources, compile_args.compiler_command_line),
+         compile_args, raw_args);
+   }
+
+   template <typename... Args>
+   void run_shader(const char *source,
+                   unsigned x, unsigned y, unsigned z,
+                   Args&... args)
+   {
+      std::vector<RawShaderArg *> raw_args;
+      gather_args(raw_args, args...);
+      CompileArgs compile_args = { x, y, z };
+      run_shader_with_raw_args(compile({ source }), compile_args, raw_args);
+   }
+
+   IDXGIFactory4 *factory;
+   IDXGIAdapter1 *adapter;
+   ID3D12Device *dev;
+   ID3D12Fence *cmdqueue_fence;
+   ID3D12CommandQueue *cmdqueue;
+   ID3D12CommandAllocator *cmdalloc;
+   ID3D12GraphicsCommandList *cmdlist;
+   ID3D12DescriptorHeap *uav_heap;
+
+   struct clc_context *compiler_ctx;
+
+   UINT uav_heap_incr;
+   int fence_value;
+
+   HANDLE event;
+   static PFN_D3D12_SERIALIZE_VERSIONED_ROOT_SIGNATURE D3D12SerializeVersionedRootSignature;
+};
diff --git a/src/microsoft/clc/meson.build b/src/microsoft/clc/meson.build
new file mode 100644
index 00000000000..9dc371d21e4
--- /dev/null
+++ b/src/microsoft/clc/meson.build
@@ -0,0 +1,59 @@
+# Copyright © Microsoft Corporation
+
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
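+
+# Builds the standalone CLC -> DXIL compiler as a shared library
+# ("clglon12compiler") plus a gtest-based test runner.  clang's OpenCL C
+# headers (opencl-c.h and opencl-c-base.h) are looked up in the clang
+# resource directory and embedded into the library as C string literals via
+# the xxd script (files_xxd), so the library carries its own copy of them.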
+ +clang_resource_dir = join_paths( + dep_clang.get_variable(cmake: 'CLANG_INCLUDE_DIRS'), '..', + 'lib', 'clang', dep_clang.version(), 'include' +) + +opencl_c_h = custom_target( + 'opencl-c.h', + input : [files_xxd, join_paths(clang_resource_dir, 'opencl-c.h')], + output : 'opencl-c.h.h', + command : [prog_python, '@INPUT@', '@OUTPUT@', '-n', 'opencl_c_source'], +) +opencl_c_base_h = custom_target( + 'opencl-c-base.h', + input : [files_xxd, join_paths(clang_resource_dir, 'opencl-c-base.h')], + output : 'opencl-c-base.h.h', + command : [prog_python, '@INPUT@', '@OUTPUT@', '-n', 'opencl_c_base_source'], +) + +libclc_compiler = shared_library( + 'clglon12compiler', + 'clc_compiler.c', + 'clc_nir.c', + 'clc_helpers.cpp', + opencl_c_h, + opencl_c_base_h, + vs_module_defs : 'clglon12compiler.def', + include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_compiler, inc_gallium, inc_spirv], + dependencies: [idep_nir_headers, dep_clang, dep_llvm, cc.find_library('version'), + dep_llvmspirvlib, idep_mesautil, idep_libdxil_compiler, idep_nir, dep_spirv_tools] +) + +clc_compiler_test = executable('clc_compiler_test', + ['clc_compiler_test.cpp', 'compute_test.cpp'], + link_with : [libclc_compiler], + dependencies : [idep_gtest, idep_mesautil], + include_directories : [inc_include, inc_src]) + +test('clc_compiler_test', clc_compiler_test, timeout: 120) diff --git a/src/microsoft/compiler/dxcapi.h b/src/microsoft/compiler/dxcapi.h new file mode 100644 index 00000000000..cde8f442b74 --- /dev/null +++ b/src/microsoft/compiler/dxcapi.h @@ -0,0 +1,676 @@ + +/////////////////////////////////////////////////////////////////////////////// +// // +// dxcapi.h // +// Copyright (C) Microsoft Corporation. All rights reserved. // +// This file is distributed under the University of Illinois Open Source // +// License. See LICENSE.TXT for details. // +// // +// Provides declarations for the DirectX Compiler API entry point. // +// // +/////////////////////////////////////////////////////////////////////////////// + +#ifndef __DXC_API__ +#define __DXC_API__ + +#ifdef _WIN32 +#ifndef DXC_API_IMPORT +#define DXC_API_IMPORT __declspec(dllimport) +#endif +#else +#ifndef DXC_API_IMPORT +#define DXC_API_IMPORT __attribute__ ((visibility ("default"))) +#endif +#endif + +#ifdef _WIN32 +#define DECLARE_CROSS_PLATFORM_UUIDOF(T) +#define DEFINE_CROSS_PLATFORM_UUIDOF(T) +#else +#include +#include "dxc/Support/WinAdapter.h" +#endif + +struct IMalloc; + +struct IDxcIncludeHandler; + +typedef HRESULT (__stdcall *DxcCreateInstanceProc)( + _In_ REFCLSID rclsid, + _In_ REFIID riid, + _Out_ LPVOID* ppv +); + +typedef HRESULT(__stdcall *DxcCreateInstance2Proc)( + _In_ IMalloc *pMalloc, + _In_ REFCLSID rclsid, + _In_ REFIID riid, + _Out_ LPVOID* ppv + ); + +/// +/// Creates a single uninitialized object of the class associated with a specified CLSID. +/// +/// +/// The CLSID associated with the data and code that will be used to create the object. +/// +/// +/// A reference to the identifier of the interface to be used to communicate +/// with the object. +/// +/// +/// Address of pointer variable that receives the interface pointer requested +/// in riid. Upon successful return, *ppv contains the requested interface +/// pointer. Upon failure, *ppv contains NULL. +/// +/// While this function is similar to CoCreateInstance, there is no COM involvement. 
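+/// A minimal usage sketch (names are illustrative; the proc address is
+/// typically fetched from dxcompiler.dll or, for the validator, dxil.dll):
+///   IDxcValidator *validator;
+///   if (SUCCEEDED(pfnDxcCreateInstance(CLSID_DxcValidator,
+///                                      __uuidof(IDxcValidator),
+///                                      (void **)&validator))) { /* ... */ }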
+/// + +extern "C" +DXC_API_IMPORT HRESULT __stdcall DxcCreateInstance( + _In_ REFCLSID rclsid, + _In_ REFIID riid, + _Out_ LPVOID* ppv + ); + +extern "C" +DXC_API_IMPORT HRESULT __stdcall DxcCreateInstance2( + _In_ IMalloc *pMalloc, + _In_ REFCLSID rclsid, + _In_ REFIID riid, + _Out_ LPVOID* ppv +); + +// For convenience, equivalent definitions to CP_UTF8 and CP_UTF16. +#define DXC_CP_UTF8 65001 +#define DXC_CP_UTF16 1200 +// Use DXC_CP_ACP for: Binary; ANSI Text; Autodetect UTF with BOM +#define DXC_CP_ACP 0 + +// This flag indicates that the shader hash was computed taking into account source information (-Zss) +#define DXC_HASHFLAG_INCLUDES_SOURCE 1 + +// Hash digest type for ShaderHash +typedef struct DxcShaderHash { + UINT32 Flags; // DXC_HASHFLAG_* + BYTE HashDigest[16]; +} DxcShaderHash; + +#define DXC_FOURCC(ch0, ch1, ch2, ch3) ( \ + (UINT32)(UINT8)(ch0) | (UINT32)(UINT8)(ch1) << 8 | \ + (UINT32)(UINT8)(ch2) << 16 | (UINT32)(UINT8)(ch3) << 24 \ + ) +#define DXC_PART_PDB DXC_FOURCC('I', 'L', 'D', 'B') +#define DXC_PART_PDB_NAME DXC_FOURCC('I', 'L', 'D', 'N') +#define DXC_PART_PRIVATE_DATA DXC_FOURCC('P', 'R', 'I', 'V') +#define DXC_PART_ROOT_SIGNATURE DXC_FOURCC('R', 'T', 'S', '0') +#define DXC_PART_DXIL DXC_FOURCC('D', 'X', 'I', 'L') +#define DXC_PART_REFLECTION_DATA DXC_FOURCC('R', 'D', 'A', 'T') +#define DXC_PART_SHADER_HASH DXC_FOURCC('H', 'A', 'S', 'H') +#define DXC_PART_INPUT_SIGNATURE DXC_FOURCC('I', 'S', 'G', '1') +#define DXC_PART_OUTPUT_SIGNATURE DXC_FOURCC('O', 'S', 'G', '1') +#define DXC_PART_PATCH_CONSTANT_SIGNATURE DXC_FOURCC('P', 'S', 'G', '1') + +// Some option arguments are defined here for continuity with D3DCompile interface +#define DXC_ARG_DEBUG L"-Zi" +#define DXC_ARG_SKIP_VALIDATION L"-Vd" +#define DXC_ARG_SKIP_OPTIMIZATIONS L"-Od" +#define DXC_ARG_PACK_MATRIX_ROW_MAJOR L"-Zpr" +#define DXC_ARG_PACK_MATRIX_COLUMN_MAJOR L"-Zpc" +#define DXC_ARG_AVOID_FLOW_CONTROL L"-Gfa" +#define DXC_ARG_PREFER_FLOW_CONTROL L"-Gfp" +#define DXC_ARG_ENABLE_STRICTNESS L"-Ges" +#define DXC_ARG_ENABLE_BACKWARDS_COMPATIBILITY L"-Gec" +#define DXC_ARG_IEEE_STRICTNESS L"-Gis" +#define DXC_ARG_OPTIMIZATION_LEVEL0 L"-O0" +#define DXC_ARG_OPTIMIZATION_LEVEL1 L"-O1" +#define DXC_ARG_OPTIMIZATION_LEVEL2 L"-O2" +#define DXC_ARG_OPTIMIZATION_LEVEL3 L"-O3" +#define DXC_ARG_WARNINGS_ARE_ERRORS L"-WX" +#define DXC_ARG_RESOURCES_MAY_ALIAS L"-res_may_alias" +#define DXC_ARG_ALL_RESOURCES_BOUND L"-all_resources_bound" +#define DXC_ARG_DEBUG_NAME_FOR_SOURCE L"-Zss" +#define DXC_ARG_DEBUG_NAME_FOR_BINARY L"-Zsb" + +// IDxcBlob is an alias of ID3D10Blob and ID3DBlob +struct __declspec(uuid("8BA5FB08-5195-40e2-AC58-0D989C3A0102")) +IDxcBlob : public IUnknown { +public: + virtual LPVOID STDMETHODCALLTYPE GetBufferPointer(void) = 0; + virtual SIZE_T STDMETHODCALLTYPE GetBufferSize(void) = 0; + + DECLARE_CROSS_PLATFORM_UUIDOF(IDxcBlob) +}; + +struct __declspec(uuid("7241d424-2646-4191-97c0-98e96e42fc68")) +IDxcBlobEncoding : public IDxcBlob { +public: + virtual HRESULT STDMETHODCALLTYPE GetEncoding(_Out_ BOOL *pKnown, + _Out_ UINT32 *pCodePage) = 0; + + DECLARE_CROSS_PLATFORM_UUIDOF(IDxcBlobEncoding) +}; + +// Notes on IDxcBlobUtf16 and IDxcBlobUtf8 +// These guarantee null-terminated text and the stated encoding. 
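+// For example, an IDxcBlobUtf16 holding L"abc" reports GetBufferSize() == 8
+// (three UTF-16 code units plus the null terminator) and GetStringLength() == 3.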
+// GetBufferSize() will return the size in bytes, including null-terminator +// GetStringLength() will return the length in characters, excluding the null-terminator +// Name strings will use IDxcBlobUtf16, while other string output blobs, +// such as errors/warnings, preprocessed HLSL, or other text will be based +// on the -encoding option. + +// The API will use this interface for output name strings +struct __declspec(uuid("A3F84EAB-0FAA-497E-A39C-EE6ED60B2D84")) +IDxcBlobUtf16 : public IDxcBlobEncoding { +public: + virtual LPCWSTR STDMETHODCALLTYPE GetStringPointer(void) = 0; + virtual SIZE_T STDMETHODCALLTYPE GetStringLength(void) = 0; + + DECLARE_CROSS_PLATFORM_UUIDOF(IDxcBlobUtf16) +}; +struct __declspec(uuid("3DA636C9-BA71-4024-A301-30CBF125305B")) +IDxcBlobUtf8 : public IDxcBlobEncoding { +public: + virtual LPCSTR STDMETHODCALLTYPE GetStringPointer(void) = 0; + virtual SIZE_T STDMETHODCALLTYPE GetStringLength(void) = 0; + + DECLARE_CROSS_PLATFORM_UUIDOF(IDxcBlobUtf8) +}; + +struct __declspec(uuid("7f61fc7d-950d-467f-b3e3-3c02fb49187c")) +IDxcIncludeHandler : public IUnknown { + virtual HRESULT STDMETHODCALLTYPE LoadSource( + _In_z_ LPCWSTR pFilename, // Candidate filename. + _COM_Outptr_result_maybenull_ IDxcBlob **ppIncludeSource // Resultant source object for included file, nullptr if not found. + ) = 0; + + DECLARE_CROSS_PLATFORM_UUIDOF(IDxcIncludeHandler) +}; + +// Structure for supplying bytes or text input to Dxc APIs. +// Use Encoding = 0 for non-text bytes, ANSI text, or unknown with BOM. +typedef struct DxcBuffer { + LPCVOID Ptr; + SIZE_T Size; + UINT Encoding; +} DxcText; + +struct DxcDefine { + LPCWSTR Name; + _Maybenull_ LPCWSTR Value; +}; + +struct __declspec(uuid("73EFFE2A-70DC-45F8-9690-EFF64C02429D")) +IDxcCompilerArgs : public IUnknown { + // Pass GetArguments() and GetCount() to Compile + virtual LPCWSTR* STDMETHODCALLTYPE GetArguments() = 0; + virtual UINT32 STDMETHODCALLTYPE GetCount() = 0; + + // Add additional arguments or defines here, if desired. 
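+  // For instance (sketch):
+  //   LPCWSTR extra[] = { DXC_ARG_DEBUG, DXC_ARG_SKIP_OPTIMIZATIONS };
+  //   pArgs->AddArguments(extra, 2);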
+ virtual HRESULT STDMETHODCALLTYPE AddArguments( + _In_opt_count_(argCount) LPCWSTR *pArguments, // Array of pointers to arguments to add + _In_ UINT32 argCount // Number of arguments to add + ) = 0; + virtual HRESULT STDMETHODCALLTYPE AddArgumentsUTF8( + _In_opt_count_(argCount)LPCSTR *pArguments, // Array of pointers to UTF-8 arguments to add + _In_ UINT32 argCount // Number of arguments to add + ) = 0; + virtual HRESULT STDMETHODCALLTYPE AddDefines( + _In_count_(defineCount) const DxcDefine *pDefines, // Array of defines + _In_ UINT32 defineCount // Number of defines + ) = 0; + + DECLARE_CROSS_PLATFORM_UUIDOF(IDxcCompilerArgs) +}; + +////////////////////////// +// Legacy Interfaces +///////////////////////// + +// NOTE: IDxcUtils replaces IDxcLibrary +struct __declspec(uuid("e5204dc7-d18c-4c3c-bdfb-851673980fe7")) +IDxcLibrary : public IUnknown { + virtual HRESULT STDMETHODCALLTYPE SetMalloc(_In_opt_ IMalloc *pMalloc) = 0; + virtual HRESULT STDMETHODCALLTYPE CreateBlobFromBlob( + _In_ IDxcBlob *pBlob, UINT32 offset, UINT32 length, _COM_Outptr_ IDxcBlob **ppResult) = 0; + virtual HRESULT STDMETHODCALLTYPE CreateBlobFromFile( + _In_z_ LPCWSTR pFileName, _In_opt_ UINT32* codePage, + _COM_Outptr_ IDxcBlobEncoding **pBlobEncoding) = 0; + virtual HRESULT STDMETHODCALLTYPE CreateBlobWithEncodingFromPinned( + _In_bytecount_(size) LPCVOID pText, UINT32 size, UINT32 codePage, + _COM_Outptr_ IDxcBlobEncoding **pBlobEncoding) = 0; + virtual HRESULT STDMETHODCALLTYPE CreateBlobWithEncodingOnHeapCopy( + _In_bytecount_(size) LPCVOID pText, UINT32 size, UINT32 codePage, + _COM_Outptr_ IDxcBlobEncoding **pBlobEncoding) = 0; + virtual HRESULT STDMETHODCALLTYPE CreateBlobWithEncodingOnMalloc( + _In_bytecount_(size) LPCVOID pText, IMalloc *pIMalloc, UINT32 size, UINT32 codePage, + _COM_Outptr_ IDxcBlobEncoding **pBlobEncoding) = 0; + virtual HRESULT STDMETHODCALLTYPE CreateIncludeHandler( + _COM_Outptr_ IDxcIncludeHandler **ppResult) = 0; + virtual HRESULT STDMETHODCALLTYPE CreateStreamFromBlobReadOnly( + _In_ IDxcBlob *pBlob, _COM_Outptr_ IStream **ppStream) = 0; + virtual HRESULT STDMETHODCALLTYPE GetBlobAsUtf8( + _In_ IDxcBlob *pBlob, _COM_Outptr_ IDxcBlobEncoding **pBlobEncoding) = 0; + virtual HRESULT STDMETHODCALLTYPE GetBlobAsUtf16( + _In_ IDxcBlob *pBlob, _COM_Outptr_ IDxcBlobEncoding **pBlobEncoding) = 0; + + DECLARE_CROSS_PLATFORM_UUIDOF(IDxcLibrary) +}; + +// NOTE: IDxcResult replaces IDxcOperationResult +struct __declspec(uuid("CEDB484A-D4E9-445A-B991-CA21CA157DC2")) +IDxcOperationResult : public IUnknown { + virtual HRESULT STDMETHODCALLTYPE GetStatus(_Out_ HRESULT *pStatus) = 0; + + // GetResult returns the main result of the operation. + // This corresponds to: + // DXC_OUT_OBJECT - Compile() with shader or library target + // DXC_OUT_DISASSEMBLY - Disassemble() + // DXC_OUT_HLSL - Compile() with -P + // DXC_OUT_ROOT_SIGNATURE - Compile() with rootsig_* target + virtual HRESULT STDMETHODCALLTYPE GetResult(_COM_Outptr_result_maybenull_ IDxcBlob **ppResult) = 0; + + // GetErrorBuffer Corresponds to DXC_OUT_ERRORS. 
+ virtual HRESULT STDMETHODCALLTYPE GetErrorBuffer(_COM_Outptr_result_maybenull_ IDxcBlobEncoding **ppErrors) = 0; + + DECLARE_CROSS_PLATFORM_UUIDOF(IDxcOperationResult) +}; + +// NOTE: IDxcCompiler3 replaces IDxcCompiler and IDxcCompiler2 +struct __declspec(uuid("8c210bf3-011f-4422-8d70-6f9acb8db617")) +IDxcCompiler : public IUnknown { + // Compile a single entry point to the target shader model + virtual HRESULT STDMETHODCALLTYPE Compile( + _In_ IDxcBlob *pSource, // Source text to compile + _In_opt_z_ LPCWSTR pSourceName, // Optional file name for pSource. Used in errors and include handlers. + _In_opt_z_ LPCWSTR pEntryPoint, // entry point name + _In_z_ LPCWSTR pTargetProfile, // shader profile to compile + _In_opt_count_(argCount) LPCWSTR *pArguments, // Array of pointers to arguments + _In_ UINT32 argCount, // Number of arguments + _In_count_(defineCount) + const DxcDefine *pDefines, // Array of defines + _In_ UINT32 defineCount, // Number of defines + _In_opt_ IDxcIncludeHandler *pIncludeHandler, // user-provided interface to handle #include directives (optional) + _COM_Outptr_ IDxcOperationResult **ppResult // Compiler output status, buffer, and errors + ) = 0; + + // Preprocess source text + virtual HRESULT STDMETHODCALLTYPE Preprocess( + _In_ IDxcBlob *pSource, // Source text to preprocess + _In_opt_z_ LPCWSTR pSourceName, // Optional file name for pSource. Used in errors and include handlers. + _In_opt_count_(argCount) LPCWSTR *pArguments, // Array of pointers to arguments + _In_ UINT32 argCount, // Number of arguments + _In_count_(defineCount) + const DxcDefine *pDefines, // Array of defines + _In_ UINT32 defineCount, // Number of defines + _In_opt_ IDxcIncludeHandler *pIncludeHandler, // user-provided interface to handle #include directives (optional) + _COM_Outptr_ IDxcOperationResult **ppResult // Preprocessor output status, buffer, and errors + ) = 0; + + // Disassemble a program. + virtual HRESULT STDMETHODCALLTYPE Disassemble( + _In_ IDxcBlob *pSource, // Program to disassemble. + _COM_Outptr_ IDxcBlobEncoding **ppDisassembly // Disassembly text. + ) = 0; + + DECLARE_CROSS_PLATFORM_UUIDOF(IDxcCompiler) +}; + +// NOTE: IDxcCompiler3 replaces IDxcCompiler and IDxcCompiler2 +struct __declspec(uuid("A005A9D9-B8BB-4594-B5C9-0E633BEC4D37")) +IDxcCompiler2 : public IDxcCompiler { + // Compile a single entry point to the target shader model with debug information. + virtual HRESULT STDMETHODCALLTYPE CompileWithDebug( + _In_ IDxcBlob *pSource, // Source text to compile + _In_opt_z_ LPCWSTR pSourceName, // Optional file name for pSource. Used in errors and include handlers. + _In_opt_z_ LPCWSTR pEntryPoint, // Entry point name + _In_z_ LPCWSTR pTargetProfile, // Shader profile to compile + _In_opt_count_(argCount) LPCWSTR *pArguments, // Array of pointers to arguments + _In_ UINT32 argCount, // Number of arguments + _In_count_(defineCount) + const DxcDefine *pDefines, // Array of defines + _In_ UINT32 defineCount, // Number of defines + _In_opt_ IDxcIncludeHandler *pIncludeHandler, // user-provided interface to handle #include directives (optional) + _COM_Outptr_ IDxcOperationResult **ppResult, // Compiler output status, buffer, and errors + _Outptr_opt_result_z_ LPWSTR *ppDebugBlobName,// Suggested file name for debug blob. (Must be HeapFree()'d!) 
+ _COM_Outptr_opt_ IDxcBlob **ppDebugBlob // Debug blob + ) = 0; + + DECLARE_CROSS_PLATFORM_UUIDOF(IDxcCompiler2) +}; + +struct __declspec(uuid("F1B5BE2A-62DD-4327-A1C2-42AC1E1E78E6")) +IDxcLinker : public IUnknown { +public: + // Register a library with name to ref it later. + virtual HRESULT RegisterLibrary( + _In_opt_ LPCWSTR pLibName, // Name of the library. + _In_ IDxcBlob *pLib // Library blob. + ) = 0; + + // Links the shader and produces a shader blob that the Direct3D runtime can + // use. + virtual HRESULT STDMETHODCALLTYPE Link( + _In_opt_ LPCWSTR pEntryName, // Entry point name + _In_ LPCWSTR pTargetProfile, // shader profile to link + _In_count_(libCount) + const LPCWSTR *pLibNames, // Array of library names to link + _In_ UINT32 libCount, // Number of libraries to link + _In_opt_count_(argCount) const LPCWSTR *pArguments, // Array of pointers to arguments + _In_ UINT32 argCount, // Number of arguments + _COM_Outptr_ + IDxcOperationResult **ppResult // Linker output status, buffer, and errors + ) = 0; + + DECLARE_CROSS_PLATFORM_UUIDOF(IDxcLinker) +}; + +///////////////////////// +// Latest interfaces. Please use these +//////////////////////// + +// NOTE: IDxcUtils replaces IDxcLibrary +struct __declspec(uuid("4605C4CB-2019-492A-ADA4-65F20BB7D67F")) +IDxcUtils : public IUnknown { + // Create a sub-blob that holds a reference to the outer blob and points to its memory. + virtual HRESULT STDMETHODCALLTYPE CreateBlobFromBlob( + _In_ IDxcBlob *pBlob, UINT32 offset, UINT32 length, _COM_Outptr_ IDxcBlob **ppResult) = 0; + + // For codePage, use 0 (or DXC_CP_ACP) for raw binary or ANSI code page + + // Creates a blob referencing existing memory, with no copy. + // User must manage the memory lifetime separately. + // (was: CreateBlobWithEncodingFromPinned) + virtual HRESULT STDMETHODCALLTYPE CreateBlobFromPinned( + _In_bytecount_(size) LPCVOID pData, UINT32 size, UINT32 codePage, + _COM_Outptr_ IDxcBlobEncoding **pBlobEncoding) = 0; + + // Create blob, taking ownership of memory allocated with supplied allocator. + // (was: CreateBlobWithEncodingOnMalloc) + virtual HRESULT STDMETHODCALLTYPE MoveToBlob( + _In_bytecount_(size) LPCVOID pData, IMalloc *pIMalloc, UINT32 size, UINT32 codePage, + _COM_Outptr_ IDxcBlobEncoding **pBlobEncoding) = 0; + + //// + // New blobs and copied contents are allocated with the current allocator + + // Copy blob contents to memory owned by the new blob. 
+ // (was: CreateBlobWithEncodingOnHeapCopy) + virtual HRESULT STDMETHODCALLTYPE CreateBlob( + _In_bytecount_(size) LPCVOID pData, UINT32 size, UINT32 codePage, + _COM_Outptr_ IDxcBlobEncoding **pBlobEncoding) = 0; + + // (was: CreateBlobFromFile) + virtual HRESULT STDMETHODCALLTYPE LoadFile( + _In_z_ LPCWSTR pFileName, _In_opt_ UINT32* pCodePage, + _COM_Outptr_ IDxcBlobEncoding **pBlobEncoding) = 0; + + virtual HRESULT STDMETHODCALLTYPE CreateReadOnlyStreamFromBlob( + _In_ IDxcBlob *pBlob, _COM_Outptr_ IStream **ppStream) = 0; + + // Create default file-based include handler + virtual HRESULT STDMETHODCALLTYPE CreateDefaultIncludeHandler( + _COM_Outptr_ IDxcIncludeHandler **ppResult) = 0; + + // Convert or return matching encoded text blobs + virtual HRESULT STDMETHODCALLTYPE GetBlobAsUtf8( + _In_ IDxcBlob *pBlob, _COM_Outptr_ IDxcBlobUtf8 **pBlobEncoding) = 0; + virtual HRESULT STDMETHODCALLTYPE GetBlobAsUtf16( + _In_ IDxcBlob *pBlob, _COM_Outptr_ IDxcBlobUtf16 **pBlobEncoding) = 0; + + virtual HRESULT STDMETHODCALLTYPE GetDxilContainerPart( + _In_ const DxcBuffer *pShader, + _In_ UINT32 DxcPart, + _Outptr_result_nullonfailure_ void **ppPartData, + _Out_ UINT32 *pPartSizeInBytes) = 0; + + // Create reflection interface from serialized Dxil container, or DXC_PART_REFLECTION_DATA. + // TBD: Require part header for RDAT? (leaning towards yes) + virtual HRESULT STDMETHODCALLTYPE CreateReflection( + _In_ const DxcBuffer *pData, REFIID iid, void **ppvReflection) = 0; + + virtual HRESULT STDMETHODCALLTYPE BuildArguments( + _In_opt_z_ LPCWSTR pSourceName, // Optional file name for pSource. Used in errors and include handlers. + _In_opt_z_ LPCWSTR pEntryPoint, // Entry point name. (-E) + _In_z_ LPCWSTR pTargetProfile, // Shader profile to compile. (-T) + _In_opt_count_(argCount) LPCWSTR *pArguments, // Array of pointers to arguments + _In_ UINT32 argCount, // Number of arguments + _In_count_(defineCount) + const DxcDefine *pDefines, // Array of defines + _In_ UINT32 defineCount, // Number of defines + _COM_Outptr_ IDxcCompilerArgs **ppArgs // Arguments you can use with Compile() method + ) = 0; + + // Takes the shader PDB and returns the hash and the container inside it + virtual HRESULT STDMETHODCALLTYPE GetPDBContents( + _In_ IDxcBlob *pPDBBlob, _COM_Outptr_ IDxcBlob **ppHash, _COM_Outptr_ IDxcBlob **ppContainer) = 0; + + DECLARE_CROSS_PLATFORM_UUIDOF(IDxcUtils) +}; + +// For use with IDxcResult::[Has|Get]Output dxcOutKind argument +// Note: text outputs returned from version 2 APIs are UTF-8 or UTF-16 based on -encoding option +typedef enum DXC_OUT_KIND { + DXC_OUT_NONE = 0, + DXC_OUT_OBJECT = 1, // IDxcBlob - Shader or library object + DXC_OUT_ERRORS = 2, // IDxcBlobUtf8 or IDxcBlobUtf16 + DXC_OUT_PDB = 3, // IDxcBlob + DXC_OUT_SHADER_HASH = 4, // IDxcBlob - DxcShaderHash of shader or shader with source info (-Zsb/-Zss) + DXC_OUT_DISASSEMBLY = 5, // IDxcBlobUtf8 or IDxcBlobUtf16 - from Disassemble + DXC_OUT_HLSL = 6, // IDxcBlobUtf8 or IDxcBlobUtf16 - from Preprocessor or Rewriter + DXC_OUT_TEXT = 7, // IDxcBlobUtf8 or IDxcBlobUtf16 - other text, such as -ast-dump or -Odump + DXC_OUT_REFLECTION = 8, // IDxcBlob - RDAT part with reflection data + DXC_OUT_ROOT_SIGNATURE = 9, // IDxcBlob - Serialized root signature output + + DXC_OUT_FORCE_DWORD = 0xFFFFFFFF +} DXC_OUT_KIND; + +struct __declspec(uuid("58346CDA-DDE7-4497-9461-6F87AF5E0659")) +IDxcResult : public IDxcOperationResult { + virtual BOOL STDMETHODCALLTYPE HasOutput(_In_ DXC_OUT_KIND dxcOutKind) = 0; + virtual HRESULT STDMETHODCALLTYPE 
GetOutput(_In_ DXC_OUT_KIND dxcOutKind, + _In_ REFIID iid, _COM_Outptr_opt_result_maybenull_ void **ppvObject, + _COM_Outptr_ IDxcBlobUtf16 **ppOutputName) = 0; + + virtual UINT32 GetNumOutputs() = 0; + virtual DXC_OUT_KIND GetOutputByIndex(UINT32 Index) = 0; + virtual DXC_OUT_KIND PrimaryOutput() = 0; + + DECLARE_CROSS_PLATFORM_UUIDOF(IDxcResult) +}; + +struct __declspec(uuid("228B4687-5A6A-4730-900C-9702B2203F54")) +IDxcCompiler3 : public IUnknown { + // Compile a single entry point to the target shader model, + // Compile a library to a library target (-T lib_*), + // Compile a root signature (-T rootsig_*), or + // Preprocess HLSL source (-P) + virtual HRESULT STDMETHODCALLTYPE Compile( + _In_ const DxcBuffer *pSource, // Source text to compile + _In_opt_count_(argCount) LPCWSTR *pArguments, // Array of pointers to arguments + _In_ UINT32 argCount, // Number of arguments + _In_opt_ IDxcIncludeHandler *pIncludeHandler, // user-provided interface to handle #include directives (optional) + _In_ REFIID riid, _Out_ LPVOID *ppResult // IDxcResult: status, buffer, and errors + ) = 0; + + // Disassemble a program. + virtual HRESULT STDMETHODCALLTYPE Disassemble( + _In_ const DxcBuffer *pObject, // Program to disassemble: dxil container or bitcode. + _In_ REFIID riid, _Out_ LPVOID *ppResult // IDxcResult: status, disassembly text, and errors + ) = 0; + + DECLARE_CROSS_PLATFORM_UUIDOF(IDxcCompiler3) +}; + +static const UINT32 DxcValidatorFlags_Default = 0; +static const UINT32 DxcValidatorFlags_InPlaceEdit = 1; // Validator is allowed to update shader blob in-place. +static const UINT32 DxcValidatorFlags_RootSignatureOnly = 2; +static const UINT32 DxcValidatorFlags_ModuleOnly = 4; +static const UINT32 DxcValidatorFlags_ValidMask = 0x7; + +struct __declspec(uuid("A6E82BD2-1FD7-4826-9811-2857E797F49A")) +IDxcValidator : public IUnknown { + // Validate a shader. + virtual HRESULT STDMETHODCALLTYPE Validate( + _In_ IDxcBlob *pShader, // Shader to validate. + _In_ UINT32 Flags, // Validation flags. + _COM_Outptr_ IDxcOperationResult **ppResult // Validation output status, buffer, and errors + ) = 0; + + DECLARE_CROSS_PLATFORM_UUIDOF(IDxcValidator) +}; + +struct __declspec(uuid("334b1f50-2292-4b35-99a1-25588d8c17fe")) +IDxcContainerBuilder : public IUnknown { + virtual HRESULT STDMETHODCALLTYPE Load(_In_ IDxcBlob *pDxilContainerHeader) = 0; // Loads DxilContainer to the builder + virtual HRESULT STDMETHODCALLTYPE AddPart(_In_ UINT32 fourCC, _In_ IDxcBlob *pSource) = 0; // Part to add to the container + virtual HRESULT STDMETHODCALLTYPE RemovePart(_In_ UINT32 fourCC) = 0; // Remove the part with fourCC + virtual HRESULT STDMETHODCALLTYPE SerializeContainer(_Out_ IDxcOperationResult **ppResult) = 0; // Builds a container of the given container builder state + + DECLARE_CROSS_PLATFORM_UUIDOF(IDxcContainerBuilder) +}; + +struct __declspec(uuid("091f7a26-1c1f-4948-904b-e6e3a8a771d5")) +IDxcAssembler : public IUnknown { + // Assemble dxil in ll or llvm bitcode to DXIL container. + virtual HRESULT STDMETHODCALLTYPE AssembleToContainer( + _In_ IDxcBlob *pShader, // Shader to assemble. + _COM_Outptr_ IDxcOperationResult **ppResult // Assembly output status, buffer, and errors + ) = 0; + + DECLARE_CROSS_PLATFORM_UUIDOF(IDxcAssembler) +}; + +struct __declspec(uuid("d2c21b26-8350-4bdc-976a-331ce6f4c54c")) +IDxcContainerReflection : public IUnknown { + virtual HRESULT STDMETHODCALLTYPE Load(_In_ IDxcBlob *pContainer) = 0; // Container to load. 
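+  // Typical flow (sketch): Load() the container, then either iterate parts via
+  // GetPartCount()/GetPartKind(), or call FindFirstPartKind(DXC_PART_DXIL, &idx)
+  // followed by GetPartContent(idx, &blob) to pull out a specific part.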
+ virtual HRESULT STDMETHODCALLTYPE GetPartCount(_Out_ UINT32 *pResult) = 0; + virtual HRESULT STDMETHODCALLTYPE GetPartKind(UINT32 idx, _Out_ UINT32 *pResult) = 0; + virtual HRESULT STDMETHODCALLTYPE GetPartContent(UINT32 idx, _COM_Outptr_ IDxcBlob **ppResult) = 0; + virtual HRESULT STDMETHODCALLTYPE FindFirstPartKind(UINT32 kind, _Out_ UINT32 *pResult) = 0; + virtual HRESULT STDMETHODCALLTYPE GetPartReflection(UINT32 idx, REFIID iid, void **ppvObject) = 0; + + DECLARE_CROSS_PLATFORM_UUIDOF(IDxcContainerReflection) +}; + +struct __declspec(uuid("AE2CD79F-CC22-453F-9B6B-B124E7A5204C")) +IDxcOptimizerPass : public IUnknown { + virtual HRESULT STDMETHODCALLTYPE GetOptionName(_COM_Outptr_ LPWSTR *ppResult) = 0; + virtual HRESULT STDMETHODCALLTYPE GetDescription(_COM_Outptr_ LPWSTR *ppResult) = 0; + virtual HRESULT STDMETHODCALLTYPE GetOptionArgCount(_Out_ UINT32 *pCount) = 0; + virtual HRESULT STDMETHODCALLTYPE GetOptionArgName(UINT32 argIndex, _COM_Outptr_ LPWSTR *ppResult) = 0; + virtual HRESULT STDMETHODCALLTYPE GetOptionArgDescription(UINT32 argIndex, _COM_Outptr_ LPWSTR *ppResult) = 0; + + DECLARE_CROSS_PLATFORM_UUIDOF(IDxcOptimizerPass) +}; + +struct __declspec(uuid("25740E2E-9CBA-401B-9119-4FB42F39F270")) +IDxcOptimizer : public IUnknown { + virtual HRESULT STDMETHODCALLTYPE GetAvailablePassCount(_Out_ UINT32 *pCount) = 0; + virtual HRESULT STDMETHODCALLTYPE GetAvailablePass(UINT32 index, _COM_Outptr_ IDxcOptimizerPass** ppResult) = 0; + virtual HRESULT STDMETHODCALLTYPE RunOptimizer(IDxcBlob *pBlob, + _In_count_(optionCount) LPCWSTR *ppOptions, UINT32 optionCount, + _COM_Outptr_ IDxcBlob **pOutputModule, + _COM_Outptr_opt_ IDxcBlobEncoding **ppOutputText) = 0; + + DECLARE_CROSS_PLATFORM_UUIDOF(IDxcOptimizer) +}; + +static const UINT32 DxcVersionInfoFlags_None = 0; +static const UINT32 DxcVersionInfoFlags_Debug = 1; // Matches VS_FF_DEBUG +static const UINT32 DxcVersionInfoFlags_Internal = 2; // Internal Validator (non-signing) + +struct __declspec(uuid("b04f5b50-2059-4f12-a8ff-a1e0cde1cc7e")) +IDxcVersionInfo : public IUnknown { + virtual HRESULT STDMETHODCALLTYPE GetVersion(_Out_ UINT32 *pMajor, _Out_ UINT32 *pMinor) = 0; + virtual HRESULT STDMETHODCALLTYPE GetFlags(_Out_ UINT32 *pFlags) = 0; + + DECLARE_CROSS_PLATFORM_UUIDOF(IDxcVersionInfo) +}; + +struct __declspec(uuid("fb6904c4-42f0-4b62-9c46-983af7da7c83")) +IDxcVersionInfo2 : public IDxcVersionInfo { + virtual HRESULT STDMETHODCALLTYPE GetCommitInfo(_Out_ UINT32 *pCommitCount, _Out_ char **pCommitHash) = 0; + + DECLARE_CROSS_PLATFORM_UUIDOF(IDxcVersionInfo2) +}; + +// Note: __declspec(selectany) requires 'extern' +// On Linux __declspec(selectany) is removed and using 'extern' results in link error. 
+#ifdef _MSC_VER +#define CLSID_SCOPE __declspec(selectany) extern +#else +#define CLSID_SCOPE +#endif + +CLSID_SCOPE const CLSID CLSID_DxcCompiler = { + 0x73e22d93, + 0xe6ce, + 0x47f3, + {0xb5, 0xbf, 0xf0, 0x66, 0x4f, 0x39, 0xc1, 0xb0}}; + +// {EF6A8087-B0EA-4D56-9E45-D07E1A8B7806} +CLSID_SCOPE const GUID CLSID_DxcLinker = { + 0xef6a8087, + 0xb0ea, + 0x4d56, + {0x9e, 0x45, 0xd0, 0x7e, 0x1a, 0x8b, 0x78, 0x6}}; + +// {CD1F6B73-2AB0-484D-8EDC-EBE7A43CA09F} +CLSID_SCOPE const CLSID CLSID_DxcDiaDataSource = { + 0xcd1f6b73, + 0x2ab0, + 0x484d, + {0x8e, 0xdc, 0xeb, 0xe7, 0xa4, 0x3c, 0xa0, 0x9f}}; + +// {3E56AE82-224D-470F-A1A1-FE3016EE9F9D} +CLSID_SCOPE const CLSID CLSID_DxcCompilerArgs = { + 0x3e56ae82, + 0x224d, + 0x470f, + {0xa1, 0xa1, 0xfe, 0x30, 0x16, 0xee, 0x9f, 0x9d}}; + +// {6245D6AF-66E0-48FD-80B4-4D271796748C} +CLSID_SCOPE const GUID CLSID_DxcLibrary = { + 0x6245d6af, + 0x66e0, + 0x48fd, + {0x80, 0xb4, 0x4d, 0x27, 0x17, 0x96, 0x74, 0x8c}}; + +CLSID_SCOPE const GUID CLSID_DxcUtils = CLSID_DxcLibrary; + +// {8CA3E215-F728-4CF3-8CDD-88AF917587A1} +CLSID_SCOPE const GUID CLSID_DxcValidator = { + 0x8ca3e215, + 0xf728, + 0x4cf3, + {0x8c, 0xdd, 0x88, 0xaf, 0x91, 0x75, 0x87, 0xa1}}; + +// {D728DB68-F903-4F80-94CD-DCCF76EC7151} +CLSID_SCOPE const GUID CLSID_DxcAssembler = { + 0xd728db68, + 0xf903, + 0x4f80, + {0x94, 0xcd, 0xdc, 0xcf, 0x76, 0xec, 0x71, 0x51}}; + +// {b9f54489-55b8-400c-ba3a-1675e4728b91} +CLSID_SCOPE const GUID CLSID_DxcContainerReflection = { + 0xb9f54489, + 0x55b8, + 0x400c, + {0xba, 0x3a, 0x16, 0x75, 0xe4, 0x72, 0x8b, 0x91}}; + +// {AE2CD79F-CC22-453F-9B6B-B124E7A5204C} +CLSID_SCOPE const GUID CLSID_DxcOptimizer = { + 0xae2cd79f, + 0xcc22, + 0x453f, + {0x9b, 0x6b, 0xb1, 0x24, 0xe7, 0xa5, 0x20, 0x4c}}; + +// {94134294-411f-4574-b4d0-8741e25240d2} +CLSID_SCOPE const GUID CLSID_DxcContainerBuilder = { + 0x94134294, + 0x411f, + 0x4574, + {0xb4, 0xd0, 0x87, 0x41, 0xe2, 0x52, 0x40, 0xd2}}; +#endif diff --git a/src/microsoft/compiler/dxil_nir.c b/src/microsoft/compiler/dxil_nir.c index e6c7a091653..d08fd52d9d6 100644 --- a/src/microsoft/compiler/dxil_nir.c +++ b/src/microsoft/compiler/dxil_nir.c @@ -27,6 +27,14 @@ #include "nir_deref.h" #include "util/u_math.h" +static void +cl_type_size_align(const struct glsl_type *type, unsigned *size, + unsigned *align) +{ + *size = glsl_get_cl_size(type); + *align = glsl_get_cl_alignment(type); +} + static void extract_comps_from_vec32(nir_builder *b, nir_ssa_def *vec32, unsigned dst_bit_size, @@ -61,6 +69,116 @@ extract_comps_from_vec32(nir_builder *b, nir_ssa_def *vec32, } } +static nir_ssa_def * +load_comps_to_vec32(nir_builder *b, unsigned src_bit_size, + nir_ssa_def **src_comps, unsigned num_src_comps) +{ + unsigned num_vec32comps = DIV_ROUND_UP(num_src_comps * src_bit_size, 32); + unsigned step = DIV_ROUND_UP(src_bit_size, 32); + unsigned comps_per32b = 32 / src_bit_size; + nir_ssa_def *vec32comps[4]; + + for (unsigned i = 0; i < num_vec32comps; i += step) { + nir_ssa_def *tmp; + switch (src_bit_size) { + case 64: + vec32comps[i] = nir_unpack_64_2x32_split_x(b, src_comps[i / 2]); + vec32comps[i + 1] = nir_unpack_64_2x32_split_y(b, src_comps[i / 2]); + break; + case 32: + vec32comps[i] = src_comps[i]; + break; + case 16: + case 8: + unsigned src_offs = i * comps_per32b; + + vec32comps[i] = nir_u2u32(b, src_comps[src_offs]); + for (unsigned j = 1; j < comps_per32b && src_offs + j < num_src_comps; j++) { + nir_ssa_def *tmp = nir_ishl(b, nir_u2u32(b, src_comps[src_offs + j]), + nir_imm_int(b, j * src_bit_size)); + vec32comps[i] = nir_ior(b, 
+         }
+         break;
+      }
+      }
+   }
+
+   return nir_vec(b, vec32comps, num_vec32comps);
+}
+
+static nir_ssa_def *
+build_load_ptr_dxil(nir_builder *b, nir_deref_instr *deref, nir_ssa_def *idx)
+{
+   nir_intrinsic_instr *load =
+      nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_ptr_dxil);
+
+   load->num_components = 1;
+   load->src[0] = nir_src_for_ssa(&deref->dest.ssa);
+   load->src[1] = nir_src_for_ssa(idx);
+   nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL);
+   nir_builder_instr_insert(b, &load->instr);
+   return &load->dest.ssa;
+}
+
+static bool
+lower_load_deref(nir_builder *b, nir_intrinsic_instr *intr)
+{
+   assert(intr->dest.is_ssa);
+
+   b->cursor = nir_before_instr(&intr->instr);
+
+   nir_deref_instr *deref = nir_src_as_deref(intr->src[0]);
+   if (!nir_deref_mode_is(deref, nir_var_shader_temp))
+      return false;
+   nir_ssa_def *ptr = nir_u2u32(b, nir_build_deref_offset(b, deref, cl_type_size_align));
+   nir_ssa_def *offset = nir_iand(b, ptr, nir_inot(b, nir_imm_int(b, 3)));
+
+   assert(intr->dest.is_ssa);
+   unsigned num_components = nir_dest_num_components(intr->dest);
+   unsigned bit_size = nir_dest_bit_size(intr->dest);
+   unsigned load_size = MAX2(32, bit_size);
+   unsigned num_bits = num_components * bit_size;
+   nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS];
+   unsigned comp_idx = 0;
+
+   nir_deref_path path;
+   nir_deref_path_init(&path, deref, NULL);
+   nir_ssa_def *base_idx = nir_ishr(b, offset, nir_imm_int(b, 2 /* log2(32 / 8) */));
+
+   /* Split loads into 32-bit chunks */
+   for (unsigned i = 0; i < num_bits; i += load_size) {
+      unsigned subload_num_bits = MIN2(num_bits - i, load_size);
+      nir_ssa_def *idx = nir_iadd(b, base_idx, nir_imm_int(b, i / 32));
+      nir_ssa_def *vec32 = build_load_ptr_dxil(b, path.path[0], idx);
+
+      if (load_size == 64) {
+         idx = nir_iadd(b, idx, nir_imm_int(b, 1));
+         vec32 = nir_vec2(b, vec32,
+                          build_load_ptr_dxil(b, path.path[0], idx));
+      }
+
+      /* If we have 2 bytes or less to load we need to adjust the u32 value so
+       * we can always extract the LSB.
+       */
+      if (subload_num_bits <= 16) {
+         nir_ssa_def *shift = nir_imul(b, nir_iand(b, ptr, nir_imm_int(b, 3)),
+                                       nir_imm_int(b, 8));
+         vec32 = nir_ushr(b, vec32, shift);
+      }
+
+      /* And now comes the pack/unpack step to match the original type.
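+       * E.g. a 4-component 8-bit load is served by one 32-bit word: the
+       * value 0xDDCCBBAA is split back into the bytes 0xAA, 0xBB, 0xCC,
+       * 0xDD by extract_comps_from_vec32().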
+       */
+      extract_comps_from_vec32(b, vec32, bit_size, &comps[comp_idx],
+                               subload_num_bits / bit_size);
+      comp_idx += subload_num_bits / bit_size;
+   }
+
+   nir_deref_path_finish(&path);
+   assert(comp_idx == num_components);
+   nir_ssa_def *result = nir_vec(b, comps, num_components);
+   nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(result));
+   nir_instr_remove(&intr->instr);
+   return true;
+}
+
 static nir_ssa_def *
 ubo_load_select_32b_comps(nir_builder *b, nir_ssa_def *vec32,
                           nir_ssa_def *offset, unsigned num_bytes)
@@ -155,3 +273,1107 @@ build_load_ubo_dxil(nir_builder *b, nir_ssa_def *buffer,
    assert(comp_idx == num_components);
    return nir_vec(b, comps, num_components);
 }
+
+static bool
+lower_load_ssbo(nir_builder *b, nir_intrinsic_instr *intr)
+{
+   assert(intr->dest.is_ssa);
+   assert(intr->src[0].is_ssa);
+   assert(intr->src[1].is_ssa);
+
+   b->cursor = nir_before_instr(&intr->instr);
+
+   nir_ssa_def *buffer = intr->src[0].ssa;
+   nir_ssa_def *offset = nir_iand(b, intr->src[1].ssa, nir_imm_int(b, ~3UL));
+   unsigned bit_size = nir_dest_bit_size(intr->dest);
+   unsigned num_components = nir_dest_num_components(intr->dest);
+   unsigned num_bits = num_components * bit_size;
+
+   nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS];
+   unsigned comp_idx = 0;
+
+   /* We need to split loads in 16byte chunks because that's the optimal
+    * granularity of bufferLoad(). Minimum alignment is 4byte, which saves
+    * us from extra complexity to extract >= 32 bit components.
+    */
+   for (unsigned i = 0; i < num_bits; i += 4 * 32) {
+      /* For each 16byte chunk (or smaller) we generate a 32bit ssbo vec
+       * load.
+       */
+      unsigned subload_num_bits = MIN2(num_bits - i, 4 * 32);
+      nir_intrinsic_instr *load =
+         nir_intrinsic_instr_create(b->shader,
+                                    nir_intrinsic_load_ssbo);
+
+      /* The number of components to load depends on the number of bytes. */
+      load->num_components = DIV_ROUND_UP(subload_num_bits, 32);
+      load->src[0] = nir_src_for_ssa(buffer);
+      load->src[1] = nir_src_for_ssa(nir_iadd(b, offset, nir_imm_int(b, i / 8)));
+      nir_ssa_dest_init(&load->instr, &load->dest, load->num_components,
+                        32, NULL);
+      nir_builder_instr_insert(b, &load->instr);
+
+      nir_ssa_def *vec32 = &load->dest.ssa;
+
+      /* If we have 2 bytes or less to load we need to adjust the u32 value so
+       * we can always extract the LSB.
+       */
+      if (subload_num_bits <= 16) {
+         nir_ssa_def *shift = nir_imul(b, nir_iand(b, intr->src[1].ssa, nir_imm_int(b, 3)),
+                                       nir_imm_int(b, 8));
+         vec32 = nir_ushr(b, vec32, shift);
+      }
+
+      nir_intrinsic_set_align(load, 4, 0);
+
+      /* And now comes the pack/unpack step to match the original type.
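+       * E.g. a 3-component 64-bit load (24 bytes) issues two SSBO loads,
+       * a vec4 for the first 16 bytes and a vec2 for the remaining 8,
+       * and the 32-bit results are then repacked into 64-bit components.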
+       */
+      extract_comps_from_vec32(b, vec32, bit_size, &comps[comp_idx],
+                               subload_num_bits / bit_size);
+      comp_idx += subload_num_bits / bit_size;
+   }
+
+   assert(comp_idx == num_components);
+   nir_ssa_def *result = nir_vec(b, comps, num_components);
+   nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(result));
+   nir_instr_remove(&intr->instr);
+   return true;
+}
+
+static bool
+lower_store_ssbo(nir_builder *b, nir_intrinsic_instr *intr)
+{
+   b->cursor = nir_before_instr(&intr->instr);
+
+   assert(intr->src[0].is_ssa);
+   assert(intr->src[1].is_ssa);
+   assert(intr->src[2].is_ssa);
+
+   nir_ssa_def *val = intr->src[0].ssa;
+   nir_ssa_def *buffer = intr->src[1].ssa;
+   nir_ssa_def *offset = nir_iand(b, intr->src[2].ssa, nir_imm_int(b, ~3UL));
+
+   unsigned bit_size = val->bit_size;
+   unsigned num_components = val->num_components;
+   unsigned num_bits = num_components * bit_size;
+
+   nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS];
+   unsigned comp_idx = 0;
+
+   for (unsigned i = 0; i < num_components; i++)
+      comps[i] = nir_channel(b, val, i);
+
+   /* We split stores in 16byte chunks because that's the optimal granularity
+    * of bufferStore(). Minimum alignment is 4byte, which saves us from extra
+    * complexity to store >= 32 bit components.
+    */
+   for (unsigned i = 0; i < num_bits; i += 4 * 32) {
+      /* For each 16byte chunk (or smaller) we generate a 32bit ssbo vec
+       * store.
+       */
+      unsigned substore_num_bits = MIN2(num_bits - i, 4 * 32);
+      nir_ssa_def *local_offset = nir_iadd(b, offset, nir_imm_int(b, i / 8));
+      nir_ssa_def *vec32 = load_comps_to_vec32(b, bit_size, &comps[comp_idx],
+                                               substore_num_bits / bit_size);
+      nir_intrinsic_instr *store;
+
+      if (substore_num_bits < 32) {
+         nir_ssa_def *mask = nir_imm_int(b, (1 << substore_num_bits) - 1);
+
+         /* If we have 16 bits or less to store we need to place them
+          * correctly in the u32 component. Anything greater than 16 bits
+          * (including uchar3) is naturally aligned on 32bits.
+          */
+         if (substore_num_bits <= 16) {
+            nir_ssa_def *pos = nir_iand(b, intr->src[2].ssa, nir_imm_int(b, 3));
+            nir_ssa_def *shift = nir_imul_imm(b, pos, 8);
+
+            vec32 = nir_ishl(b, vec32, shift);
+            mask = nir_ishl(b, mask, shift);
+         }
+
+         store = nir_intrinsic_instr_create(b->shader,
+                                            nir_intrinsic_store_ssbo_masked_dxil);
+         store->src[0] = nir_src_for_ssa(vec32);
+         store->src[1] = nir_src_for_ssa(nir_inot(b, mask));
+         store->src[2] = nir_src_for_ssa(buffer);
+         store->src[3] = nir_src_for_ssa(local_offset);
+      } else {
+         store = nir_intrinsic_instr_create(b->shader,
+                                            nir_intrinsic_store_ssbo);
+         store->src[0] = nir_src_for_ssa(vec32);
+         store->src[1] = nir_src_for_ssa(buffer);
+         store->src[2] = nir_src_for_ssa(local_offset);
+
+         nir_intrinsic_set_align(store, 4, 0);
+      }
+
+      /* The number of components to store depends on the number of bits.
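+       * E.g. a 96-bit chunk becomes a 3-component 32-bit store, while a
+       * masked 16-bit store still occupies a single 32-bit component.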
+       */
+      store->num_components = DIV_ROUND_UP(substore_num_bits, 32);
+      nir_builder_instr_insert(b, &store->instr);
+      comp_idx += substore_num_bits / bit_size;
+   }
+
+   nir_instr_remove(&intr->instr);
+   return true;
+}
+
+static void
+lower_load_vec32(nir_builder *b, nir_ssa_def *index, unsigned num_comps,
+                 nir_ssa_def **comps, nir_intrinsic_op op)
+{
+   for (unsigned i = 0; i < num_comps; i++) {
+      nir_intrinsic_instr *load =
+         nir_intrinsic_instr_create(b->shader, op);
+
+      load->num_components = 1;
+      load->src[0] = nir_src_for_ssa(nir_iadd(b, index, nir_imm_int(b, i)));
+      nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL);
+      nir_builder_instr_insert(b, &load->instr);
+      comps[i] = &load->dest.ssa;
+   }
+}
+
+static bool
+lower_32b_offset_load(nir_builder *b, nir_intrinsic_instr *intr)
+{
+   assert(intr->dest.is_ssa);
+   unsigned bit_size = nir_dest_bit_size(intr->dest);
+   unsigned num_components = nir_dest_num_components(intr->dest);
+   unsigned num_bits = num_components * bit_size;
+
+   b->cursor = nir_before_instr(&intr->instr);
+   nir_intrinsic_op op = intr->intrinsic;
+
+   assert(intr->src[0].is_ssa);
+   nir_ssa_def *offset = intr->src[0].ssa;
+   if (op == nir_intrinsic_load_shared) {
+      offset = nir_iadd(b, offset, nir_imm_int(b, nir_intrinsic_base(intr)));
+      op = nir_intrinsic_load_shared_dxil;
+   } else {
+      offset = nir_u2u32(b, offset);
+      op = nir_intrinsic_load_scratch_dxil;
+   }
+   nir_ssa_def *index = nir_ushr(b, offset, nir_imm_int(b, 2));
+   nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS];
+   nir_ssa_def *comps_32bit[NIR_MAX_VEC_COMPONENTS * 2];
+
+   /* We need to split loads in 32-bit accesses because the buffer
+    * is an i32 array and DXIL does not support type casts.
+    */
+   unsigned num_32bit_comps = DIV_ROUND_UP(num_bits, 32);
+   lower_load_vec32(b, index, num_32bit_comps, comps_32bit, op);
+   unsigned num_comps_per_pass = MIN2(num_32bit_comps, 4);
+
+   for (unsigned i = 0; i < num_32bit_comps; i += num_comps_per_pass) {
+      unsigned num_vec32_comps = MIN2(num_32bit_comps - i, 4);
+      unsigned num_dest_comps = num_vec32_comps * 32 / bit_size;
+      nir_ssa_def *vec32 = nir_vec(b, &comps_32bit[i], num_vec32_comps);
+
+      /* If we have 16 bits or less to load we need to adjust the u32 value so
+       * we can always extract the LSB.
+       */
+      if (num_bits <= 16) {
+         nir_ssa_def *shift =
+            nir_imul(b, nir_iand(b, offset, nir_imm_int(b, 3)),
+                     nir_imm_int(b, 8));
+         vec32 = nir_ushr(b, vec32, shift);
+      }
+
+      /* And now comes the pack/unpack step to match the original type.
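+       * E.g. a 16-bit load at byte offset 6 reads i32 element 1 and
+       * shifts it right by (6 & 3) * 8 = 16 bits before truncation.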
*/ + unsigned dest_index = i * 32 / bit_size; + extract_comps_from_vec32(b, vec32, bit_size, &comps[dest_index], num_dest_comps); + } + + nir_ssa_def *result = nir_vec(b, comps, num_components); + nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(result)); + nir_instr_remove(&intr->instr); + + return true; +} + +static void +lower_store_vec32(nir_builder *b, nir_ssa_def *index, nir_ssa_def *vec32, nir_intrinsic_op op) +{ + + for (unsigned i = 0; i < vec32->num_components; i++) { + nir_intrinsic_instr *store = + nir_intrinsic_instr_create(b->shader, op); + + store->src[0] = nir_src_for_ssa(nir_channel(b, vec32, i)); + store->src[1] = nir_src_for_ssa(nir_iadd(b, index, nir_imm_int(b, i))); + store->num_components = 1; + nir_builder_instr_insert(b, &store->instr); + } +} + +static void +lower_masked_store_vec32(nir_builder *b, nir_ssa_def *offset, nir_ssa_def *index, + nir_ssa_def *vec32, unsigned num_bits, nir_intrinsic_op op) +{ + nir_ssa_def *mask = nir_imm_int(b, (1 << num_bits) - 1); + + /* If we have 16 bits or less to store we need to place them correctly in + * the u32 component. Anything greater than 16 bits (including uchar3) is + * naturally aligned on 32bits. + */ + if (num_bits <= 16) { + nir_ssa_def *shift = + nir_imul_imm(b, nir_iand(b, offset, nir_imm_int(b, 3)), 8); + + vec32 = nir_ishl(b, vec32, shift); + mask = nir_ishl(b, mask, shift); + } + + if (op == nir_intrinsic_store_shared_dxil) { + /* Use the dedicated masked intrinsic */ + nir_intrinsic_instr *store = + nir_intrinsic_instr_create(b->shader, + nir_intrinsic_store_shared_masked_dxil); + store->src[0] = nir_src_for_ssa(vec32); + store->src[1] = nir_src_for_ssa(nir_inot(b, mask)); + store->src[2] = nir_src_for_ssa(index); + store->num_components = 1; + nir_builder_instr_insert(b, &store->instr); + } else { + /* For scratch, since we don't need atomics, just generate the read-modify-write in NIR */ + nir_intrinsic_instr *load = nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_scratch_dxil); + load->src[0] = nir_src_for_ssa(index); + load->num_components = 1; + nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL); + nir_builder_instr_insert(b, &load->instr); + + nir_ssa_def *new_val = nir_ior(b, vec32, + nir_iand(b, + nir_inot(b, mask), + &load->dest.ssa)); + + lower_store_vec32(b, index, new_val, op); + } +} + +static bool +lower_32b_offset_store(nir_builder *b, nir_intrinsic_instr *intr) +{ + assert(intr->src[0].is_ssa); + unsigned num_components = nir_src_num_components(intr->src[0]); + unsigned bit_size = nir_src_bit_size(intr->src[0]); + unsigned num_bits = num_components * bit_size; + + b->cursor = nir_before_instr(&intr->instr); + nir_intrinsic_op op = intr->intrinsic; + + nir_ssa_def *offset = intr->src[1].ssa; + if (op == nir_intrinsic_store_shared) { + offset = nir_iadd(b, offset, nir_imm_int(b, nir_intrinsic_base(intr))); + op = nir_intrinsic_store_shared_dxil; + } else { + offset = nir_u2u32(b, offset); + op = nir_intrinsic_store_scratch_dxil; + } + nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS]; + + unsigned comp_idx = 0; + for (unsigned i = 0; i < num_components; i++) + comps[i] = nir_channel(b, intr->src[0].ssa, i); + + for (unsigned i = 0; i < num_bits; i += 4 * 32) { + /* For each 4byte chunk (or smaller) we generate a 32bit scalar store. 
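+       * E.g. an 8-component 16-bit store packs into four 32-bit values
+       * and emits four scalar stores at consecutive i32 indices.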
+ */ + unsigned substore_num_bits = MIN2(num_bits - i, 4 * 32); + nir_ssa_def *local_offset = nir_iadd(b, offset, nir_imm_int(b, i / 8)); + nir_ssa_def *vec32 = load_comps_to_vec32(b, bit_size, &comps[comp_idx], + substore_num_bits / bit_size); + nir_ssa_def *index = nir_ushr(b, local_offset, nir_imm_int(b, 2)); + + /* For anything less than 32bits we need to use the masked version of the + * intrinsic to preserve data living in the same 32bit slot. + */ + if (num_bits < 32) { + lower_masked_store_vec32(b, local_offset, index, vec32, num_bits, op); + } else { + lower_store_vec32(b, index, vec32, op); + } + + comp_idx += substore_num_bits / bit_size; + } + + nir_instr_remove(&intr->instr); + + return true; +} + +static void +ubo_to_temp_patch_deref_mode(nir_deref_instr *deref) +{ + deref->modes = nir_var_shader_temp; + nir_foreach_use(use_src, &deref->dest.ssa) { + if (use_src->parent_instr->type != nir_instr_type_deref) + continue; + + nir_deref_instr *parent = nir_instr_as_deref(use_src->parent_instr); + ubo_to_temp_patch_deref_mode(parent); + } +} + +static void +ubo_to_temp_update_entry(nir_deref_instr *deref, struct hash_entry *he) +{ + assert(nir_deref_mode_is(deref, nir_var_mem_constant)); + assert(deref->dest.is_ssa); + assert(he->data); + + nir_foreach_use(use_src, &deref->dest.ssa) { + if (use_src->parent_instr->type == nir_instr_type_deref) { + ubo_to_temp_update_entry(nir_instr_as_deref(use_src->parent_instr), he); + } else if (use_src->parent_instr->type == nir_instr_type_intrinsic) { + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(use_src->parent_instr); + if (intr->intrinsic != nir_intrinsic_load_deref) + he->data = NULL; + } else { + he->data = NULL; + } + + if (!he->data) + break; + } +} + +bool +dxil_nir_lower_ubo_to_temp(nir_shader *nir) +{ + struct hash_table *ubo_to_temp = _mesa_pointer_hash_table_create(NULL); + bool progress = false; + + /* First pass: collect all UBO accesses that could be turned into + * shader temp accesses. + */ + foreach_list_typed(nir_function, func, node, &nir->functions) { + if (!func->is_entrypoint) + continue; + assert(func->impl); + + nir_foreach_block(block, func->impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_deref) + continue; + + nir_deref_instr *deref = nir_instr_as_deref(instr); + if (!nir_deref_mode_is(deref, nir_var_mem_constant) || + deref->deref_type != nir_deref_type_var) + continue; + + struct hash_entry *he = + _mesa_hash_table_search(ubo_to_temp, deref->var); + + if (!he) + he = _mesa_hash_table_insert(ubo_to_temp, deref->var, deref->var); + + if (!he->data) + continue; + + ubo_to_temp_update_entry(deref, he); + } + } + } + + hash_table_foreach(ubo_to_temp, he) { + nir_variable *var = he->data; + + if (!var) + continue; + + /* Change the variable mode. */ + var->data.mode = nir_var_shader_temp; + + /* Make sure the variable has a name. + * DXIL variables must have names. + */ + if (!var->name) + var->name = ralloc_asprintf(nir, "global_%d", exec_list_length(&nir->variables)); + + progress = true; + } + _mesa_hash_table_destroy(ubo_to_temp, NULL); + + /* Second pass: patch all derefs that were accessing the converted UBOs + * variables. 
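+    * (The deref chains still carry nir_var_mem_constant at this point;
+    * ubo_to_temp_patch_deref_mode() walks each chain and rewrites the
+    * modes to nir_var_shader_temp.)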
+    */
+   foreach_list_typed(nir_function, func, node, &nir->functions) {
+      if (!func->is_entrypoint)
+         continue;
+      assert(func->impl);
+
+      nir_foreach_block(block, func->impl) {
+         nir_foreach_instr_safe(instr, block) {
+            if (instr->type != nir_instr_type_deref)
+               continue;
+
+            nir_deref_instr *deref = nir_instr_as_deref(instr);
+            if (nir_deref_mode_is(deref, nir_var_mem_constant) &&
+                deref->deref_type == nir_deref_type_var &&
+                deref->var->data.mode == nir_var_shader_temp)
+               ubo_to_temp_patch_deref_mode(deref);
+         }
+      }
+   }
+
+   return progress;
+}
+
+static bool
+lower_load_ubo(nir_builder *b, nir_intrinsic_instr *intr)
+{
+   assert(intr->dest.is_ssa);
+   assert(intr->src[0].is_ssa);
+   assert(intr->src[1].is_ssa);
+
+   b->cursor = nir_before_instr(&intr->instr);
+
+   nir_ssa_def *result =
+      build_load_ubo_dxil(b, intr->src[0].ssa, intr->src[1].ssa,
+                          nir_dest_num_components(intr->dest),
+                          nir_dest_bit_size(intr->dest));
+
+   nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(result));
+   nir_instr_remove(&intr->instr);
+   return true;
+}
+
+bool
+dxil_nir_lower_loads_stores_to_dxil(nir_shader *nir)
+{
+   bool progress = false;
+
+   foreach_list_typed(nir_function, func, node, &nir->functions) {
+      if (!func->is_entrypoint)
+         continue;
+      assert(func->impl);
+
+      nir_builder b;
+      nir_builder_init(&b, func->impl);
+
+      nir_foreach_block(block, func->impl) {
+         nir_foreach_instr_safe(instr, block) {
+            if (instr->type != nir_instr_type_intrinsic)
+               continue;
+            nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+            switch (intr->intrinsic) {
+            case nir_intrinsic_load_deref:
+               progress |= lower_load_deref(&b, intr);
+               break;
+            case nir_intrinsic_load_shared:
+            case nir_intrinsic_load_scratch:
+               progress |= lower_32b_offset_load(&b, intr);
+               break;
+            case nir_intrinsic_load_ssbo:
+               progress |= lower_load_ssbo(&b, intr);
+               break;
+            case nir_intrinsic_load_ubo:
+               progress |= lower_load_ubo(&b, intr);
+               break;
+            case nir_intrinsic_store_shared:
+            case nir_intrinsic_store_scratch:
+               progress |= lower_32b_offset_store(&b, intr);
+               break;
+            case nir_intrinsic_store_ssbo:
+               progress |= lower_store_ssbo(&b, intr);
+               break;
+            }
+         }
+      }
+   }
+
+   return progress;
+}
+
+static bool
+lower_shared_atomic(nir_builder *b, nir_intrinsic_instr *intr,
+                    nir_intrinsic_op dxil_op)
+{
+   b->cursor = nir_before_instr(&intr->instr);
+
+   assert(intr->src[0].is_ssa);
+   nir_ssa_def *offset =
+      nir_iadd(b, intr->src[0].ssa, nir_imm_int(b, nir_intrinsic_base(intr)));
+   nir_ssa_def *index = nir_ushr(b, offset, nir_imm_int(b, 2));
+
+   nir_intrinsic_instr *atomic = nir_intrinsic_instr_create(b->shader, dxil_op);
+   atomic->src[0] = nir_src_for_ssa(index);
+   assert(intr->src[1].is_ssa);
+   atomic->src[1] = nir_src_for_ssa(intr->src[1].ssa);
+   if (dxil_op == nir_intrinsic_shared_atomic_comp_swap_dxil) {
+      assert(intr->src[2].is_ssa);
+      atomic->src[2] = nir_src_for_ssa(intr->src[2].ssa);
+   }
+   atomic->num_components = 0;
+   nir_ssa_dest_init(&atomic->instr, &atomic->dest, 1, 32, intr->dest.ssa.name);
+
+   nir_builder_instr_insert(b, &atomic->instr);
+   nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(&atomic->dest.ssa));
+   nir_instr_remove(&intr->instr);
+   return true;
+}
+
+bool
+dxil_nir_lower_atomics_to_dxil(nir_shader *nir)
+{
+   bool progress = false;
+
+   foreach_list_typed(nir_function, func, node, &nir->functions) {
+      if (!func->is_entrypoint)
+         continue;
+      assert(func->impl);
+
+      nir_builder b;
+      nir_builder_init(&b, func->impl);
+
+      nir_foreach_block(block, func->impl) {
+         nir_foreach_instr_safe(instr, block) {
+            if (instr->type != nir_instr_type_intrinsic)
+               continue;
+            nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+            switch (intr->intrinsic) {
+
+#define ATOMIC(op)                                                   \
+            case nir_intrinsic_shared_atomic_##op:                   \
+               progress |= lower_shared_atomic(&b, intr,             \
+                  nir_intrinsic_shared_atomic_##op##_dxil);          \
+               break
+
+            ATOMIC(add);
+            ATOMIC(imin);
+            ATOMIC(umin);
+            ATOMIC(imax);
+            ATOMIC(umax);
+            ATOMIC(and);
+            ATOMIC(or);
+            ATOMIC(xor);
+            ATOMIC(exchange);
+            ATOMIC(comp_swap);
+
+#undef ATOMIC
+            }
+         }
+      }
+   }
+
+   return progress;
+}
+
+static bool
+lower_deref_ssbo(nir_builder *b, nir_deref_instr *deref)
+{
+   assert(nir_deref_mode_is(deref, nir_var_mem_ssbo));
+   assert(deref->deref_type == nir_deref_type_var ||
+          deref->deref_type == nir_deref_type_cast);
+   nir_variable *var = deref->var;
+
+   b->cursor = nir_before_instr(&deref->instr);
+
+   if (deref->deref_type == nir_deref_type_var) {
+      /* We turn all deref_var into deref_cast and build a pointer value based on
+       * the var binding which encodes the UAV id.
+       */
+      nir_ssa_def *ptr = nir_imm_int64(b, (uint64_t)var->data.binding << 32);
+      nir_deref_instr *deref_cast =
+         nir_build_deref_cast(b, ptr, nir_var_mem_ssbo, deref->type,
+                              glsl_get_explicit_stride(var->type));
+      nir_ssa_def_rewrite_uses(&deref->dest.ssa,
+                               nir_src_for_ssa(&deref_cast->dest.ssa));
+      nir_instr_remove(&deref->instr);
+
+      deref = deref_cast;
+      return true;
+   }
+   return false;
+}
+
+bool
+dxil_nir_lower_deref_ssbo(nir_shader *nir)
+{
+   bool progress = false;
+
+   foreach_list_typed(nir_function, func, node, &nir->functions) {
+      if (!func->is_entrypoint)
+         continue;
+      assert(func->impl);
+
+      nir_builder b;
+      nir_builder_init(&b, func->impl);
+
+      nir_foreach_block(block, func->impl) {
+         nir_foreach_instr_safe(instr, block) {
+            if (instr->type != nir_instr_type_deref)
+               continue;
+
+            nir_deref_instr *deref = nir_instr_as_deref(instr);
+
+            if (!nir_deref_mode_is(deref, nir_var_mem_ssbo) ||
+                (deref->deref_type != nir_deref_type_var &&
+                 deref->deref_type != nir_deref_type_cast))
+               continue;
+
+            progress |= lower_deref_ssbo(&b, deref);
+         }
+      }
+   }
+
+   return progress;
+}
+
+static bool
+lower_alu_deref_srcs(nir_builder *b, nir_alu_instr *alu)
+{
+   const nir_op_info *info = &nir_op_infos[alu->op];
+   bool progress = false;
+
+   b->cursor = nir_before_instr(&alu->instr);
+
+   for (unsigned i = 0; i < info->num_inputs; i++) {
+      nir_deref_instr *deref = nir_src_as_deref(alu->src[i].src);
+
+      if (!deref)
+         continue;
+
+      nir_deref_path path;
+      nir_deref_path_init(&path, deref, NULL);
+      nir_deref_instr *root_deref = path.path[0];
+      nir_deref_path_finish(&path);
+
+      if (root_deref->deref_type != nir_deref_type_cast)
+         continue;
+
+      nir_ssa_def *ptr =
+         nir_iadd(b, root_deref->parent.ssa,
+                  nir_build_deref_offset(b, deref, cl_type_size_align));
+      nir_instr_rewrite_src(&alu->instr, &alu->src[i].src, nir_src_for_ssa(ptr));
+      progress = true;
+   }
+
+   return progress;
+}
+
+bool
+dxil_nir_opt_alu_deref_srcs(nir_shader *nir)
+{
+   bool progress = false;
+
+   foreach_list_typed(nir_function, func, node, &nir->functions) {
+      if (!func->is_entrypoint)
+         continue;
+      assert(func->impl);
+
+      nir_builder b;
+      nir_builder_init(&b, func->impl);
+
+      nir_foreach_block(block, func->impl) {
+         nir_foreach_instr_safe(instr, block) {
+            if (instr->type != nir_instr_type_alu)
+               continue;
+
+            nir_alu_instr *alu = nir_instr_as_alu(instr);
+            progress |= lower_alu_deref_srcs(&b, alu);
+         }
+      }
+   }
+
+   return progress;
+}
+
+static nir_ssa_def *
+memcpy_load_deref_elem(nir_builder *b, nir_deref_instr *parent,
+
nir_ssa_def *index) +{ + nir_deref_instr *deref; + + index = nir_i2i(b, index, nir_dest_bit_size(parent->dest)); + assert(parent->deref_type == nir_deref_type_cast); + deref = nir_build_deref_ptr_as_array(b, parent, index); + + return nir_load_deref(b, deref); +} + +static void +memcpy_store_deref_elem(nir_builder *b, nir_deref_instr *parent, + nir_ssa_def *index, nir_ssa_def *value) +{ + nir_deref_instr *deref; + + index = nir_i2i(b, index, nir_dest_bit_size(parent->dest)); + assert(parent->deref_type == nir_deref_type_cast); + deref = nir_build_deref_ptr_as_array(b, parent, index); + nir_store_deref(b, deref, value, 1); +} + +static bool +lower_memcpy_deref(nir_builder *b, nir_intrinsic_instr *intr) +{ + nir_deref_instr *dst_deref = nir_src_as_deref(intr->src[0]); + nir_deref_instr *src_deref = nir_src_as_deref(intr->src[1]); + assert(intr->src[2].is_ssa); + nir_ssa_def *num_bytes = intr->src[2].ssa; + + assert(dst_deref && src_deref); + + b->cursor = nir_after_instr(&intr->instr); + + dst_deref = nir_build_deref_cast(b, &dst_deref->dest.ssa, dst_deref->modes, + glsl_uint8_t_type(), 1); + src_deref = nir_build_deref_cast(b, &src_deref->dest.ssa, src_deref->modes, + glsl_uint8_t_type(), 1); + + /* + * We want to avoid 64b instructions, so let's assume we'll always be + * passed a value that fits in a 32b type and truncate the 64b value. + */ + num_bytes = nir_u2u32(b, num_bytes); + + nir_variable *loop_index_var = + nir_local_variable_create(b->impl, glsl_uint_type(), "loop_index"); + nir_deref_instr *loop_index_deref = nir_build_deref_var(b, loop_index_var); + nir_store_deref(b, loop_index_deref, nir_imm_int(b, 0), 1); + + nir_loop *loop = nir_push_loop(b); + nir_ssa_def *loop_index = nir_load_deref(b, loop_index_deref); + nir_ssa_def *cmp = nir_ige(b, loop_index, num_bytes); + nir_if *loop_check = nir_push_if(b, cmp); + nir_jump(b, nir_jump_break); + nir_pop_if(b, loop_check); + nir_ssa_def *val = memcpy_load_deref_elem(b, src_deref, loop_index); + memcpy_store_deref_elem(b, dst_deref, loop_index, val); + nir_store_deref(b, loop_index_deref, nir_iadd_imm(b, loop_index, 1), 1); + nir_pop_loop(b, loop); + nir_instr_remove(&intr->instr); + return true; +} + +bool +dxil_nir_lower_memcpy_deref(nir_shader *nir) +{ + bool progress = false; + + foreach_list_typed(nir_function, func, node, &nir->functions) { + if (!func->is_entrypoint) + continue; + assert(func->impl); + + nir_builder b; + nir_builder_init(&b, func->impl); + + nir_foreach_block(block, func->impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + + if (intr->intrinsic == nir_intrinsic_memcpy_deref) + progress |= lower_memcpy_deref(&b, intr); + } + } + } + + return progress; +} + +static void +cast_phi(nir_builder *b, nir_phi_instr *phi, unsigned new_bit_size) +{ + nir_phi_instr *lowered = nir_phi_instr_create(b->shader); + int num_components = 0; + int old_bit_size = phi->dest.ssa.bit_size; + + nir_op upcast_op = nir_type_conversion_op(nir_type_uint | old_bit_size, + nir_type_uint | new_bit_size, + nir_rounding_mode_undef); + nir_op downcast_op = nir_type_conversion_op(nir_type_uint | new_bit_size, + nir_type_uint | old_bit_size, + nir_rounding_mode_undef); + + nir_foreach_phi_src(src, phi) { + assert(num_components == 0 || num_components == src->src.ssa->num_components); + num_components = src->src.ssa->num_components; + + b->cursor = nir_after_instr(src->src.ssa->parent_instr); + + nir_ssa_def *cast = 
nir_build_alu(b, upcast_op, src->src.ssa, NULL, NULL, NULL); + + nir_phi_src *new_src = rzalloc(lowered, nir_phi_src); + new_src->pred = src->pred; + new_src->src = nir_src_for_ssa(cast); + exec_list_push_tail(&lowered->srcs, &new_src->node); + } + + nir_ssa_dest_init(&lowered->instr, &lowered->dest, + num_components, new_bit_size, NULL); + + b->cursor = nir_before_instr(&phi->instr); + nir_builder_instr_insert(b, &lowered->instr); + + b->cursor = nir_after_phis(nir_cursor_current_block(b->cursor)); + nir_ssa_def *result = nir_build_alu(b, downcast_op, &lowered->dest.ssa, NULL, NULL, NULL); + + nir_ssa_def_rewrite_uses(&phi->dest.ssa, nir_src_for_ssa(result)); + nir_instr_remove(&phi->instr); +} + +static bool +upcast_phi_impl(nir_function_impl *impl, unsigned min_bit_size) +{ + nir_builder b; + nir_builder_init(&b, impl); + bool progress = false; + + nir_foreach_block_reverse(block, impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_phi) + continue; + + nir_phi_instr *phi = nir_instr_as_phi(instr); + assert(phi->dest.is_ssa); + + if (phi->dest.ssa.bit_size == 1 || + phi->dest.ssa.bit_size >= min_bit_size) + continue; + + cast_phi(&b, phi, min_bit_size); + progress = true; + } + } + + if (progress) { + nir_metadata_preserve(impl, nir_metadata_block_index | + nir_metadata_dominance); + } + + return progress; +} + +bool +dxil_nir_lower_upcast_phis(nir_shader *shader, unsigned min_bit_size) +{ + bool progress = false; + + nir_foreach_function(function, shader) { + if (function->impl) + progress |= upcast_phi_impl(function->impl, min_bit_size); + } + + return progress; +} + +/* The following float-to-half conversion routines are based on the "half" library: + * https://sourceforge.net/projects/half/ + * + * half - IEEE 754-based half-precision floating-point library. + * + * Copyright (c) 2012-2019 Christian Rau + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE + * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR + * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * Version 2.1.0 + */ + + +static bool +lower_fp16_casts_filter(const nir_instr *instr, const void *data) +{ + if (instr->type == nir_instr_type_alu) { + nir_alu_instr *alu = nir_instr_as_alu(instr); + /* TODO: DXIL has instructions for f2f16_rtz. For CL, it's not precise enough + * due to denorm handling. If the f2f16 instruction has undef rounding mode, + * we could map that too, but for CL, f2f16 is implied to mean rtne. 
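+       * (OpenCL's default rounding mode for float-to-half conversions,
+       * e.g. in vstore_half(), is round-to-nearest-even, which is why
+       * plain f2f16 is treated as rtne in the lowering below.)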
+ */ + switch (alu->op) { + case nir_op_f2f16: + case nir_op_f2f16_rtne: + case nir_op_f2f16_rtz: + return true; + } + } else if (instr->type == nir_instr_type_intrinsic) { + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + return intrin->intrinsic == nir_intrinsic_convert_alu_types && + nir_intrinsic_dest_type(intrin) == nir_type_float16; + } + return false; +} + +static nir_ssa_def * +half_rounded(nir_builder *b, nir_ssa_def *value, nir_ssa_def *guard, nir_ssa_def *sticky, + nir_ssa_def *sign, nir_rounding_mode mode) +{ + switch (mode) { + case nir_rounding_mode_rtne: + return nir_iadd(b, value, nir_iand(b, guard, nir_ior(b, sticky, value))); + case nir_rounding_mode_ru: + sign = nir_ushr(b, sign, nir_imm_int(b, 31)); + return nir_iadd(b, value, nir_iand(b, nir_inot(b, sign), + nir_ior(b, guard, sticky))); + case nir_rounding_mode_rd: + sign = nir_ushr(b, sign, nir_imm_int(b, 31)); + return nir_iadd(b, value, nir_iand(b, sign, + nir_ior(b, guard, sticky))); + default: + return value; + } +} + +static nir_ssa_def * +float_to_half_impl(nir_builder *b, nir_ssa_def *src, nir_rounding_mode mode) +{ + nir_ssa_def *f32infinity = nir_imm_int(b, 255 << 23); + nir_ssa_def *f16max = nir_imm_int(b, (127 + 16) << 23); + nir_ssa_def *denorm_magic = nir_imm_int(b, ((127 - 15) + (23 - 10) + 1) << 23); + nir_ssa_def *sign = nir_iand(b, src, nir_imm_int(b, 0x80000000)); + nir_ssa_def *one = nir_imm_int(b, 1); + + nir_ssa_def *abs = nir_iand(b, src, nir_imm_int(b, 0x7FFFFFFF)); + /* NaN or INF. For rtne, overflow also becomes INF, so combine the comparisons */ + nir_push_if(b, nir_ige(b, abs, mode == nir_rounding_mode_rtne ? f16max : f32infinity)); + nir_ssa_def *inf_nanfp16 = nir_bcsel(b, + nir_ilt(b, f32infinity, abs), + nir_imm_int(b, 0x7E00), + nir_imm_int(b, 0x7C00)); + nir_push_else(b, NULL); + + nir_ssa_def *overflowed_fp16 = NULL; + if (mode != nir_rounding_mode_rtne) { + /* Handle overflow */ + nir_push_if(b, nir_ige(b, abs, f16max)); + switch (mode) { + case nir_rounding_mode_rtz: + overflowed_fp16 = nir_imm_int(b, 0x7BFF); + break; + case nir_rounding_mode_ru: + /* Negative becomes max float, positive becomes inf */ + overflowed_fp16 = nir_bcsel(b, nir_i2b1(b, sign), nir_imm_int(b, 0x7BFF), nir_imm_int(b, 0x7C00)); + break; + case nir_rounding_mode_rd: + /* Negative becomes inf, positive becomes max float */ + overflowed_fp16 = nir_bcsel(b, nir_i2b1(b, sign), nir_imm_int(b, 0x7C00), nir_imm_int(b, 0x7BFF)); + break; + default: unreachable("Should've been handled already"); + } + nir_push_else(b, NULL); + } + + nir_push_if(b, nir_ige(b, abs, nir_imm_int(b, 113 << 23))); + + /* FP16 will be normal */ + nir_ssa_def *zero = nir_imm_int(b, 0); + nir_ssa_def *value = nir_ior(b, + nir_ishl(b, + nir_isub(b, + nir_ushr(b, abs, nir_imm_int(b, 23)), + nir_imm_int(b, 112)), + nir_imm_int(b, 10)), + nir_iand(b, nir_ushr(b, abs, nir_imm_int(b, 13)), nir_imm_int(b, 0x3FFF))); + nir_ssa_def *guard = nir_iand(b, nir_ushr(b, abs, nir_imm_int(b, 12)), one); + nir_ssa_def *sticky = nir_bcsel(b, nir_ine(b, nir_iand(b, abs, nir_imm_int(b, 0xFFF)), zero), one, zero); + nir_ssa_def *normal_fp16 = half_rounded(b, value, guard, sticky, sign, mode); + + nir_push_else(b, NULL); + nir_push_if(b, nir_ige(b, abs, nir_imm_int(b, 102 << 23))); + + /* FP16 will be denormal */ + nir_ssa_def *i = nir_isub(b, nir_imm_int(b, 125), nir_ushr(b, abs, nir_imm_int(b, 23))); + nir_ssa_def *masked = nir_ior(b, nir_iand(b, abs, nir_imm_int(b, 0x7FFFFF)), nir_imm_int(b, 0x800000)); + value = nir_ushr(b, masked, nir_iadd(b, i, 
one)); + guard = nir_iand(b, nir_ushr(b, masked, i), one); + sticky = nir_bcsel(b, nir_ine(b, nir_iand(b, masked, nir_isub(b, nir_ishl(b, one, i), one)), zero), one, zero); + nir_ssa_def *denormal_fp16 = half_rounded(b, value, guard, sticky, sign, mode); + + nir_push_else(b, NULL); + + /* Handle underflow. Nonzero values need to shift up or down for round-up or round-down */ + nir_ssa_def *underflowed_fp16 = zero; + if (mode == nir_rounding_mode_ru || + mode == nir_rounding_mode_rd) { + nir_push_if(b, nir_i2b1(b, abs)); + + if (mode == nir_rounding_mode_ru) + underflowed_fp16 = nir_bcsel(b, nir_i2b1(b, sign), zero, one); + else + underflowed_fp16 = nir_bcsel(b, nir_i2b1(b, sign), one, zero); + + nir_push_else(b, NULL); + nir_pop_if(b, NULL); + underflowed_fp16 = nir_if_phi(b, underflowed_fp16, zero); + } + + nir_pop_if(b, NULL); + nir_ssa_def *underflowed_or_denorm_fp16 = nir_if_phi(b, denormal_fp16, underflowed_fp16); + + nir_pop_if(b, NULL); + nir_ssa_def *finite_fp16 = nir_if_phi(b, normal_fp16, underflowed_or_denorm_fp16); + + nir_ssa_def *finite_or_overflowed_fp16 = finite_fp16; + if (mode != nir_rounding_mode_rtne) { + nir_pop_if(b, NULL); + finite_or_overflowed_fp16 = nir_if_phi(b, overflowed_fp16, finite_fp16); + } + + nir_pop_if(b, NULL); + nir_ssa_def *fp16 = nir_if_phi(b, inf_nanfp16, finite_or_overflowed_fp16); + + return nir_u2u16(b, nir_ior(b, fp16, nir_ushr(b, sign, nir_imm_int(b, 16)))); +} + +static nir_ssa_def * +lower_fp16_cast_impl(nir_builder *b, nir_instr *instr, void *data) +{ + nir_ssa_def *src, *dst; + uint8_t *swizzle = NULL; + nir_rounding_mode mode = nir_rounding_mode_rtne; + + if (instr->type == nir_instr_type_alu) { + nir_alu_instr *alu = nir_instr_as_alu(instr); + src = alu->src[0].src.ssa; + swizzle = alu->src[0].swizzle; + dst = &alu->dest.dest.ssa; + assert(src->bit_size == 32); + switch (alu->op) { + case nir_op_f2f16: + case nir_op_f2f16_rtne: + break; + case nir_op_f2f16_rtz: + mode = nir_rounding_mode_rtz; + break; + } + } else { + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + assert(nir_intrinsic_src_type(intrin) == nir_type_float32); + src = intrin->src[0].ssa; + dst = &intrin->dest.ssa; + mode = nir_intrinsic_rounding_mode(intrin); + } + + nir_ssa_def *rets[NIR_MAX_VEC_COMPONENTS] = { NULL }; + + for (unsigned i = 0; i < dst->num_components; i++) { + nir_ssa_def *comp = nir_channel(b, src, swizzle ? 
swizzle[i] : i); + rets[i] = float_to_half_impl(b, comp, mode); + } + + return nir_vec(b, rets, dst->num_components); +} + +bool +dxil_nir_lower_fp16_casts(nir_shader *shader) +{ + return nir_shader_lower_instructions(shader, + lower_fp16_casts_filter, + lower_fp16_cast_impl, + NULL); +} diff --git a/src/microsoft/compiler/dxil_nir.h b/src/microsoft/compiler/dxil_nir.h index b20b63257b9..3dc8b4f3e4e 100644 --- a/src/microsoft/compiler/dxil_nir.h +++ b/src/microsoft/compiler/dxil_nir.h @@ -32,6 +32,14 @@ bool dxil_nir_lower_8bit_conv(nir_shader *shader); bool dxil_nir_lower_16bit_conv(nir_shader *shader); bool dxil_nir_lower_x2b(nir_shader *shader); bool dxil_nir_lower_inot(nir_shader *shader); +bool dxil_nir_lower_ubo_to_temp(nir_shader *shader); +bool dxil_nir_lower_loads_stores_to_dxil(nir_shader *shader); +bool dxil_nir_lower_atomics_to_dxil(nir_shader *shader); +bool dxil_nir_lower_deref_ssbo(nir_shader *shader); +bool dxil_nir_opt_alu_deref_srcs(nir_shader *shader); +bool dxil_nir_lower_memcpy_deref(nir_shader *shader); +bool dxil_nir_lower_upcast_phis(nir_shader *shader, unsigned min_bit_size); +bool dxil_nir_lower_fp16_casts(nir_shader *shader); nir_ssa_def * build_load_ubo_dxil(nir_builder *b, nir_ssa_def *buffer, diff --git a/src/microsoft/compiler/nir_to_dxil.c b/src/microsoft/compiler/nir_to_dxil.c index 443917944f3..f5fef4f1ce2 100644 --- a/src/microsoft/compiler/nir_to_dxil.c +++ b/src/microsoft/compiler/nir_to_dxil.c @@ -97,8 +97,10 @@ nir_options = { .lower_pack_32_2x16_split = true, .lower_unpack_64_2x32_split = true, .lower_unpack_32_2x16_split = true, + .use_scoped_barrier = true, .vertex_id_zero_based = true, .lower_base_vertex = true, + .has_cs_global_id = true, }; const nir_shader_compiler_options* @@ -808,6 +810,48 @@ emit_srv(struct ntd_context *ctx, nir_variable *var, unsigned binding, unsigned return true; } +static bool +emit_globals(struct ntd_context *ctx, nir_shader *s, unsigned size) +{ + nir_foreach_variable_with_modes(var, s, nir_var_mem_ssbo) + size++; + + if (!size) + return true; + + const struct dxil_type *type = dxil_module_get_int_type(&ctx->mod, 32); + if (!type) + return false; + + const struct dxil_type *struct_type = + dxil_module_get_struct_type(&ctx->mod, NULL, &type, 1); + if (!struct_type) + return false; + + const struct dxil_type *array_type = + dxil_module_get_array_type(&ctx->mod, struct_type, size); + if (!array_type) + return false; + + resource_array_layout layout = {0, 0, size}; + const struct dxil_mdnode *uav_meta = + emit_uav_metadata(&ctx->mod, array_type, + "globals", &layout, + DXIL_COMP_TYPE_INVALID, + DXIL_RESOURCE_KIND_RAW_BUFFER); + if (!uav_meta) + return false; + + ctx->uav_metadata_nodes[ctx->num_uav_arrays++] = uav_meta; + if (ctx->num_uav_arrays > 8) + ctx->mod.feats.use_64uavs = 1; + /* Handles to UAVs used for kernel globals are created on-demand */ + ctx->num_uavs += size; + add_resource(ctx, DXIL_RES_UAV_RAW, &layout); + ctx->mod.raw_and_structured_buffers = true; + return true; +} + static bool emit_uav(struct ntd_context *ctx, nir_variable *var, unsigned count) { @@ -936,6 +980,53 @@ var_fill_const_array(struct ntd_context *ctx, const struct nir_constant *c, unreachable("unknown GLSL type in var_fill_const_array"); } +static bool +emit_global_consts(struct ntd_context *ctx, nir_shader *s) +{ + nir_foreach_variable_with_modes(var, s, nir_var_shader_temp) { + struct dxil_value *ret; + bool err; + + assert(var->constant_initializer); + + unsigned int num_members = DIV_ROUND_UP(glsl_get_cl_size(var->type), 4); + uint32_t 
*const_ints = ralloc_array(ctx->ralloc_ctx, uint32_t, num_members); + err = var_fill_const_array(ctx, var->constant_initializer, var->type, + const_ints, 0); + if (!err) + return false; + const struct dxil_value **const_vals = + ralloc_array(ctx->ralloc_ctx, const struct dxil_value *, num_members); + if (!const_vals) + return false; + for (int i = 0; i < num_members; i++) + const_vals[i] = dxil_module_get_int32_const(&ctx->mod, const_ints[i]); + + const struct dxil_type *elt_type = dxil_module_get_int_type(&ctx->mod, 32); + if (!elt_type) + return false; + const struct dxil_type *type = + dxil_module_get_array_type(&ctx->mod, elt_type, num_members); + if (!type) + return false; + const struct dxil_value *agg_vals = + dxil_module_get_array_const(&ctx->mod, type, const_vals); + if (!agg_vals) + return false; + + const struct dxil_value *gvar = dxil_add_global_ptr_var(&ctx->mod, var->name, type, + DXIL_AS_DEFAULT, 4, + agg_vals); + if (!gvar) + return false; + + if (!_mesa_hash_table_insert(ctx->consts, var, (void *)gvar)) + return false; + } + + return true; +} + static bool emit_cbv(struct ntd_context *ctx, unsigned binding, unsigned size, char *name) @@ -1882,6 +1973,8 @@ emit_alu(struct ntd_context *ctx, nir_alu_instr *alu) case nir_op_flog2: return emit_unary_intin(ctx, alu, DXIL_INTR_FLOG2, src[0]); case nir_op_ffloor: return emit_unary_intin(ctx, alu, DXIL_INTR_ROUND_NI, src[0]); case nir_op_ffract: return emit_unary_intin(ctx, alu, DXIL_INTR_FRC, src[0]); + case nir_op_fisnormal: return emit_unary_intin(ctx, alu, DXIL_INTR_ISNORMAL, src[0]); + case nir_op_fisfinite: return emit_unary_intin(ctx, alu, DXIL_INTR_ISFINITE, src[0]); case nir_op_fddx: case nir_op_fddx_coarse: return emit_unary_intin(ctx, alu, DXIL_INTR_DDX_COARSE, src[0]); @@ -1966,6 +2059,120 @@ load_ubo(struct ntd_context *ctx, const struct dxil_value *handle, return dxil_emit_call(&ctx->mod, func, args, ARRAY_SIZE(args)); } +static bool +emit_barrier(struct ntd_context *ctx, nir_intrinsic_instr *intr) +{ + const struct dxil_value *opcode, *mode; + const struct dxil_func *func; + uint32_t flags = 0; + + if (nir_intrinsic_execution_scope(intr) == NIR_SCOPE_WORKGROUP) + flags |= DXIL_BARRIER_MODE_SYNC_THREAD_GROUP; + + nir_variable_mode modes = nir_intrinsic_memory_modes(intr); + nir_scope mem_scope = nir_intrinsic_memory_scope(intr); + + if (modes & ~(nir_var_mem_ssbo | nir_var_mem_global | nir_var_mem_shared)) + return false; + + if (mem_scope != NIR_SCOPE_DEVICE && mem_scope != NIR_SCOPE_WORKGROUP) + return false; + + if (modes & (nir_var_mem_ssbo | nir_var_mem_global)) { + if (mem_scope == NIR_SCOPE_DEVICE) + flags |= DXIL_BARRIER_MODE_UAV_FENCE_GLOBAL; + else + flags |= DXIL_BARRIER_MODE_UAV_FENCE_THREAD_GROUP; + } + + if (modes & nir_var_mem_shared) + flags |= DXIL_BARRIER_MODE_UAV_FENCE_THREAD_GROUP; + + func = dxil_get_function(&ctx->mod, "dx.op.barrier", DXIL_NONE); + if (!func) + return false; + + opcode = dxil_module_get_int32_const(&ctx->mod, DXIL_INTR_BARRIER); + if (!opcode) + return false; + + mode = dxil_module_get_int32_const(&ctx->mod, flags); + if (!mode) + return false; + + const struct dxil_value *args[] = { opcode, mode }; + + return dxil_emit_call_void(&ctx->mod, func, + args, ARRAY_SIZE(args)); +} + +static bool +emit_load_global_invocation_id(struct ntd_context *ctx, + nir_intrinsic_instr *intr) +{ + assert(intr->dest.is_ssa); + nir_component_mask_t comps = nir_ssa_def_components_read(&intr->dest.ssa); + + for (int i = 0; i < nir_intrinsic_dest_components(intr); i++) { + if (comps & (1 << i)) { + 
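+         /* DXIL's threadId op returns one component of the dispatch
+          * thread ID at a time, so each component that is actually read
+          * is queried with its own call, selected by an i32 index.
+          */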
const struct dxil_value *idx = dxil_module_get_int32_const(&ctx->mod, i); + if (!idx) + return false; + const struct dxil_value *globalid = emit_threadid_call(ctx, idx); + + if (!globalid) + return false; + + store_dest_value(ctx, &intr->dest, i, globalid); + } + } + return true; +} + +static bool +emit_load_local_invocation_id(struct ntd_context *ctx, + nir_intrinsic_instr *intr) +{ + assert(intr->dest.is_ssa); + nir_component_mask_t comps = nir_ssa_def_components_read(&intr->dest.ssa); + + for (int i = 0; i < nir_intrinsic_dest_components(intr); i++) { + if (comps & (1 << i)) { + const struct dxil_value + *idx = dxil_module_get_int32_const(&ctx->mod, i); + if (!idx) + return false; + const struct dxil_value + *threadidingroup = emit_threadidingroup_call(ctx, idx); + if (!threadidingroup) + return false; + store_dest_value(ctx, &intr->dest, i, threadidingroup); + } + } + return true; +} + +static bool +emit_load_local_work_group_id(struct ntd_context *ctx, + nir_intrinsic_instr *intr) +{ + assert(intr->dest.is_ssa); + nir_component_mask_t comps = nir_ssa_def_components_read(&intr->dest.ssa); + + for (int i = 0; i < nir_intrinsic_dest_components(intr); i++) { + if (comps & (1 << i)) { + const struct dxil_value *idx = dxil_module_get_int32_const(&ctx->mod, i); + if (!idx) + return false; + const struct dxil_value *groupid = emit_groupid_call(ctx, idx); + if (!groupid) + return false; + store_dest_value(ctx, &intr->dest, i, groupid); + } + } + return true; +} + static bool emit_load_primitiveid(struct ntd_context *ctx, nir_intrinsic_instr *intr) @@ -2000,6 +2207,249 @@ get_int32_undef(struct dxil_module *m) return dxil_module_get_undef(m, int32_type); } +static const struct dxil_value * +offset_to_index(struct dxil_module *m, const struct dxil_value *offset, + unsigned bit_size) +{ + unsigned shift_amt = util_logbase2(bit_size / 8); + const struct dxil_value *shift = + dxil_module_get_int32_const(m, shift_amt); + if (!shift) + return NULL; + + return dxil_emit_binop(m, DXIL_BINOP_LSHR, offset, shift, 0); +} + +static const struct dxil_value * +index_to_offset(struct dxil_module *m, const struct dxil_value *index, + unsigned bit_size) +{ + unsigned shift_amt = util_logbase2(bit_size / 8); + const struct dxil_value *shift = + dxil_module_get_int32_const(m, shift_amt); + if (!shift) + return NULL; + + return dxil_emit_binop(m, DXIL_BINOP_SHL, index, shift, 0); +} + +static const struct dxil_value * +emit_gep_for_index(struct ntd_context *ctx, const nir_variable *var, + const struct dxil_value *index) +{ + assert(var->data.mode == nir_var_shader_temp); + + struct hash_entry *he = _mesa_hash_table_search(ctx->consts, var); + assert(he != NULL); + const struct dxil_value *ptr = he->data; + + const struct dxil_value *zero = dxil_module_get_int32_const(&ctx->mod, 0); + if (!zero) + return NULL; + + const struct dxil_value *ops[] = { ptr, zero, index }; + return dxil_emit_gep_inbounds(&ctx->mod, ops, ARRAY_SIZE(ops)); +} + +static bool +emit_load_ssbo(struct ntd_context *ctx, nir_intrinsic_instr *intr) +{ + const struct dxil_value *int32_undef = get_int32_undef(&ctx->mod); + const struct dxil_value *buffer = + get_src(ctx, &intr->src[0], 0, nir_type_uint); + const struct dxil_value *offset = + get_src(ctx, &intr->src[1], 0, nir_type_uint); + if (!int32_undef || !buffer || !offset) + return false; + + assert(nir_src_bit_size(intr->src[0]) == 32); + assert(nir_intrinsic_dest_components(intr) <= 4); + + const struct dxil_value *handle = + emit_createhandle_call(ctx, DXIL_RESOURCE_CLASS_UAV, 0, buffer, + 
nir_src_is_const(intr->src[0])); + if (!handle) + return false; + + const struct dxil_value *coord[2] = { + offset, + int32_undef + }; + + const struct dxil_value *load = emit_bufferload_call(ctx, handle, coord); + if (!load) + return false; + + for (int i = 0; i < nir_intrinsic_dest_components(intr); i++) { + const struct dxil_value *val = + dxil_emit_extractval(&ctx->mod, load, i); + if (!val) + return false; + store_dest_value(ctx, &intr->dest, i, val); + } + return true; +} + +static bool +emit_store_ssbo(struct ntd_context *ctx, nir_intrinsic_instr *intr) +{ + const struct dxil_value *buffer = + get_src(ctx, &intr->src[1], 0, nir_type_uint); + const struct dxil_value *offset = + get_src(ctx, &intr->src[2], 0, nir_type_uint); + if (!buffer || !offset) + return false; + + const struct dxil_value *handle = + emit_createhandle_call(ctx, DXIL_RESOURCE_CLASS_UAV, 0, buffer, + nir_src_is_const(intr->src[1])); + if (!handle) + return false; + + assert(nir_src_bit_size(intr->src[0]) == 32); + unsigned num_components = nir_src_num_components(intr->src[0]); + assert(num_components <= 4); + const struct dxil_value *value[4]; + for (unsigned i = 0; i < num_components; ++i) { + value[i] = get_src(ctx, &intr->src[0], i, nir_type_uint); + if (!value[i]) + return false; + } + + const struct dxil_value *int32_undef = get_int32_undef(&ctx->mod); + if (!int32_undef) + return false; + + const struct dxil_value *coord[2] = { + offset, + int32_undef + }; + + for (int i = num_components; i < 4; ++i) + value[i] = int32_undef; + + const struct dxil_value *write_mask = + dxil_module_get_int8_const(&ctx->mod, (1u << num_components) - 1); + if (!write_mask) + return false; + + return emit_bufferstore_call(ctx, handle, coord, value, write_mask, DXIL_I32); +} + +static bool +emit_store_ssbo_masked(struct ntd_context *ctx, nir_intrinsic_instr *intr) +{ + const struct dxil_value *value = + get_src(ctx, &intr->src[0], 0, nir_type_uint); + const struct dxil_value *mask = + get_src(ctx, &intr->src[1], 0, nir_type_uint); + const struct dxil_value *buffer = + get_src(ctx, &intr->src[2], 0, nir_type_uint); + const struct dxil_value *offset = + get_src(ctx, &intr->src[3], 0, nir_type_uint); + if (!value || !mask || !buffer || !offset) + return false; + + const struct dxil_value *handle = + emit_createhandle_call(ctx, DXIL_RESOURCE_CLASS_UAV, 0, buffer, + nir_src_is_const(intr->src[2])); + if (!handle) + return false; + + const struct dxil_value *int32_undef = get_int32_undef(&ctx->mod); + if (!int32_undef) + return false; + + const struct dxil_value *coord[3] = { + offset, int32_undef, int32_undef + }; + + return + emit_atomic_binop(ctx, handle, DXIL_ATOMIC_AND, coord, mask) != NULL && + emit_atomic_binop(ctx, handle, DXIL_ATOMIC_OR, coord, value) != NULL; +} + +static bool +emit_store_shared(struct ntd_context *ctx, nir_intrinsic_instr *intr) +{ + const struct dxil_value *zero, *index; + unsigned bit_size = nir_src_bit_size(intr->src[0]); + + /* All shared mem accesses should have been lowered to scalar 32bit + * accesses. 
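+    * (dxil_nir_lower_loads_stores_to_dxil() has already split wider
+    * stores into 32-bit store_shared_dxil/store_shared_masked_dxil
+    * operations indexing the i32 shared array.)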
+ */ + assert(bit_size == 32); + assert(nir_src_num_components(intr->src[0]) == 1); + + zero = dxil_module_get_int32_const(&ctx->mod, 0); + if (!zero) + return false; + + if (intr->intrinsic == nir_intrinsic_store_shared_dxil) + index = get_src(ctx, &intr->src[1], 0, nir_type_uint); + else + index = get_src(ctx, &intr->src[2], 0, nir_type_uint); + if (!index) + return false; + + const struct dxil_value *ops[] = { ctx->sharedvars, zero, index }; + const struct dxil_value *ptr, *value; + + ptr = dxil_emit_gep_inbounds(&ctx->mod, ops, ARRAY_SIZE(ops)); + if (!ptr) + return false; + + value = get_src(ctx, &intr->src[0], 0, nir_type_uint); + + if (intr->intrinsic == nir_intrinsic_store_shared_dxil) + return dxil_emit_store(&ctx->mod, value, ptr, 4, false); + + const struct dxil_value *mask = get_src(ctx, &intr->src[1], 0, nir_type_uint); + + if (!dxil_emit_atomicrmw(&ctx->mod, mask, ptr, DXIL_RMWOP_AND, false, + DXIL_ATOMIC_ORDERING_ACQREL, + DXIL_SYNC_SCOPE_CROSSTHREAD)) + return false; + + if (!dxil_emit_atomicrmw(&ctx->mod, value, ptr, DXIL_RMWOP_OR, false, + DXIL_ATOMIC_ORDERING_ACQREL, + DXIL_SYNC_SCOPE_CROSSTHREAD)) + return false; + + return true; +} + +static bool +emit_store_scratch(struct ntd_context *ctx, nir_intrinsic_instr *intr) +{ + const struct dxil_value *zero, *index; + unsigned bit_size = nir_src_bit_size(intr->src[0]); + + /* All scratch mem accesses should have been lowered to scalar 32bit + * accesses. + */ + assert(bit_size == 32); + assert(nir_src_num_components(intr->src[0]) == 1); + + zero = dxil_module_get_int32_const(&ctx->mod, 0); + if (!zero) + return false; + + index = get_src(ctx, &intr->src[1], 0, nir_type_uint); + if (!index) + return false; + + const struct dxil_value *ops[] = { ctx->scratchvars, zero, index }; + const struct dxil_value *ptr, *value; + + ptr = dxil_emit_gep_inbounds(&ctx->mod, ops, ARRAY_SIZE(ops)); + if (!ptr) + return false; + + value = get_src(ctx, &intr->src[0], 0, nir_type_uint); + return dxil_emit_store(&ctx->mod, value, ptr, 4, false); +} + static bool emit_load_ubo(struct ntd_context *ctx, nir_intrinsic_instr *intr) { @@ -2224,6 +2674,97 @@ emit_load_input(struct ntd_context *ctx, nir_intrinsic_instr *intr, return emit_load_input_flat(ctx, intr, input); } +static bool +emit_load_ptr(struct ntd_context *ctx, nir_intrinsic_instr *intr) +{ + struct nir_variable *var = + nir_deref_instr_get_variable(nir_src_as_deref(intr->src[0])); + const struct dxil_value *index = + get_src(ctx, &intr->src[1], 0, nir_type_uint); + + const struct dxil_value *ptr = emit_gep_for_index(ctx, var, index); + if (!ptr) + return false; + + const struct dxil_value *retval = + dxil_emit_load(&ctx->mod, ptr, 4, false); + + store_dest(ctx, &intr->dest, 0, retval, nir_type_uint); + return true; +} + +static bool +emit_load_shared(struct ntd_context *ctx, nir_intrinsic_instr *intr) +{ + const struct dxil_value *zero, *index; + unsigned bit_size = nir_dest_bit_size(intr->dest); + unsigned align = bit_size / 8; + + /* All shared mem accesses should have been lowered to scalar 32bit + * accesses. 
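+    * (Each load is thus a single GEP into the i32 shared-memory array
+    * followed by one 32-bit load.)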
+ */ + assert(bit_size == 32); + assert(nir_dest_num_components(intr->dest) == 1); + + zero = dxil_module_get_int32_const(&ctx->mod, 0); + if (!zero) + return false; + + index = get_src(ctx, &intr->src[0], 0, nir_type_uint); + if (!index) + return false; + + const struct dxil_value *ops[] = { ctx->sharedvars, zero, index }; + const struct dxil_value *ptr, *retval; + + ptr = dxil_emit_gep_inbounds(&ctx->mod, ops, ARRAY_SIZE(ops)); + if (!ptr) + return false; + + retval = dxil_emit_load(&ctx->mod, ptr, align, false); + if (!retval) + return false; + + store_dest(ctx, &intr->dest, 0, retval, nir_type_uint); + return true; +} + +static bool +emit_load_scratch(struct ntd_context *ctx, nir_intrinsic_instr *intr) +{ + const struct dxil_value *zero, *one, *index; + unsigned bit_size = nir_dest_bit_size(intr->dest); + unsigned align = bit_size / 8; + + /* All scratch mem accesses should have been lowered to scalar 32bit + * accesses. + */ + assert(bit_size == 32); + assert(nir_dest_num_components(intr->dest) == 1); + + zero = dxil_module_get_int32_const(&ctx->mod, 0); + if (!zero) + return false; + + index = get_src(ctx, &intr->src[0], 0, nir_type_uint); + if (!index) + return false; + + const struct dxil_value *ops[] = { ctx->scratchvars, zero, index }; + const struct dxil_value *ptr, *retval; + + ptr = dxil_emit_gep_inbounds(&ctx->mod, ops, ARRAY_SIZE(ops)); + if (!ptr) + return false; + + retval = dxil_emit_load(&ctx->mod, ptr, align, false); + if (!retval) + return false; + + store_dest(ctx, &intr->dest, 0, retval, nir_type_uint); + return true; +} + static bool emit_load_deref(struct ntd_context *ctx, nir_intrinsic_instr *intr) { @@ -2573,10 +3114,31 @@ static bool emit_intrinsic(struct ntd_context *ctx, nir_intrinsic_instr *intr) { switch (intr->intrinsic) { + case nir_intrinsic_load_global_invocation_id: + case nir_intrinsic_load_global_invocation_id_zero_base: + return emit_load_global_invocation_id(ctx, intr); + case nir_intrinsic_load_local_invocation_id: + return emit_load_local_invocation_id(ctx, intr); + case nir_intrinsic_load_work_group_id: + case nir_intrinsic_load_work_group_id_zero_base: + return emit_load_local_work_group_id(ctx, intr); + case nir_intrinsic_load_ssbo: + return emit_load_ssbo(ctx, intr); + case nir_intrinsic_store_ssbo: + return emit_store_ssbo(ctx, intr); + case nir_intrinsic_store_ssbo_masked_dxil: + return emit_store_ssbo_masked(ctx, intr); case nir_intrinsic_store_deref: return emit_store_deref(ctx, intr); + case nir_intrinsic_store_shared_dxil: + case nir_intrinsic_store_shared_masked_dxil: + return emit_store_shared(ctx, intr); + case nir_intrinsic_store_scratch_dxil: + return emit_store_scratch(ctx, intr); case nir_intrinsic_load_deref: return emit_load_deref(ctx, intr); + case nir_intrinsic_load_ptr_dxil: + return emit_load_ptr(ctx, intr); case nir_intrinsic_load_ubo: return emit_load_ubo(ctx, intr); case nir_intrinsic_load_ubo_dxil: @@ -2592,6 +3154,10 @@ emit_intrinsic(struct ntd_context *ctx, nir_intrinsic_instr *intr) ctx->system_value[SYSTEM_VALUE_INSTANCE_ID]); case nir_intrinsic_load_primitive_id: return emit_load_primitiveid(ctx, intr); + case nir_intrinsic_load_shared_dxil: + return emit_load_shared(ctx, intr); + case nir_intrinsic_load_scratch_dxil: + return emit_load_scratch(ctx, intr); case nir_intrinsic_discard_if: return emit_discard_if(ctx, intr); case nir_intrinsic_discard: @@ -2600,7 +3166,55 @@ emit_intrinsic(struct ntd_context *ctx, nir_intrinsic_instr *intr) return emit_emit_vertex(ctx, intr); case nir_intrinsic_end_primitive: return 
@@ -2573,10 +3114,31 @@ static bool
 emit_intrinsic(struct ntd_context *ctx, nir_intrinsic_instr *intr)
 {
    switch (intr->intrinsic) {
+   case nir_intrinsic_load_global_invocation_id:
+   case nir_intrinsic_load_global_invocation_id_zero_base:
+      return emit_load_global_invocation_id(ctx, intr);
+   case nir_intrinsic_load_local_invocation_id:
+      return emit_load_local_invocation_id(ctx, intr);
+   case nir_intrinsic_load_work_group_id:
+   case nir_intrinsic_load_work_group_id_zero_base:
+      return emit_load_local_work_group_id(ctx, intr);
+   case nir_intrinsic_load_ssbo:
+      return emit_load_ssbo(ctx, intr);
+   case nir_intrinsic_store_ssbo:
+      return emit_store_ssbo(ctx, intr);
+   case nir_intrinsic_store_ssbo_masked_dxil:
+      return emit_store_ssbo_masked(ctx, intr);
    case nir_intrinsic_store_deref:
       return emit_store_deref(ctx, intr);
+   case nir_intrinsic_store_shared_dxil:
+   case nir_intrinsic_store_shared_masked_dxil:
+      return emit_store_shared(ctx, intr);
+   case nir_intrinsic_store_scratch_dxil:
+      return emit_store_scratch(ctx, intr);
    case nir_intrinsic_load_deref:
       return emit_load_deref(ctx, intr);
+   case nir_intrinsic_load_ptr_dxil:
+      return emit_load_ptr(ctx, intr);
    case nir_intrinsic_load_ubo:
       return emit_load_ubo(ctx, intr);
    case nir_intrinsic_load_ubo_dxil:
@@ -2592,6 +3154,10 @@ emit_intrinsic(struct ntd_context *ctx, nir_intrinsic_instr *intr)
                                 ctx->system_value[SYSTEM_VALUE_INSTANCE_ID]);
    case nir_intrinsic_load_primitive_id:
       return emit_load_primitiveid(ctx, intr);
+   case nir_intrinsic_load_shared_dxil:
+      return emit_load_shared(ctx, intr);
+   case nir_intrinsic_load_scratch_dxil:
+      return emit_load_scratch(ctx, intr);
    case nir_intrinsic_discard_if:
       return emit_discard_if(ctx, intr);
    case nir_intrinsic_discard:
@@ -2600,7 +3166,55 @@ emit_intrinsic(struct ntd_context *ctx, nir_intrinsic_instr *intr)
       return emit_emit_vertex(ctx, intr);
    case nir_intrinsic_end_primitive:
       return emit_end_primitive(ctx, intr);
+   case nir_intrinsic_scoped_barrier:
+      return emit_barrier(ctx, intr);
+   case nir_intrinsic_ssbo_atomic_add:
+      return emit_ssbo_atomic(ctx, intr, DXIL_ATOMIC_ADD, nir_type_int);
+   case nir_intrinsic_ssbo_atomic_imin:
+      return emit_ssbo_atomic(ctx, intr, DXIL_ATOMIC_IMIN, nir_type_int);
+   case nir_intrinsic_ssbo_atomic_umin:
+      return emit_ssbo_atomic(ctx, intr, DXIL_ATOMIC_UMIN, nir_type_uint);
+   case nir_intrinsic_ssbo_atomic_imax:
+      return emit_ssbo_atomic(ctx, intr, DXIL_ATOMIC_IMAX, nir_type_int);
+   case nir_intrinsic_ssbo_atomic_umax:
+      return emit_ssbo_atomic(ctx, intr, DXIL_ATOMIC_UMAX, nir_type_uint);
+   case nir_intrinsic_ssbo_atomic_and:
+      return emit_ssbo_atomic(ctx, intr, DXIL_ATOMIC_AND, nir_type_uint);
+   case nir_intrinsic_ssbo_atomic_or:
+      return emit_ssbo_atomic(ctx, intr, DXIL_ATOMIC_OR, nir_type_uint);
+   case nir_intrinsic_ssbo_atomic_xor:
+      return emit_ssbo_atomic(ctx, intr, DXIL_ATOMIC_XOR, nir_type_uint);
+   case nir_intrinsic_ssbo_atomic_exchange:
+      return emit_ssbo_atomic(ctx, intr, DXIL_ATOMIC_EXCHANGE, nir_type_int);
+   case nir_intrinsic_ssbo_atomic_comp_swap:
+      return emit_ssbo_atomic_comp_swap(ctx, intr);
+   case nir_intrinsic_shared_atomic_add_dxil:
+      return emit_shared_atomic(ctx, intr, DXIL_RMWOP_ADD, nir_type_int);
+   case nir_intrinsic_shared_atomic_imin_dxil:
+      return emit_shared_atomic(ctx, intr, DXIL_RMWOP_MIN, nir_type_int);
+   case nir_intrinsic_shared_atomic_umin_dxil:
+      return emit_shared_atomic(ctx, intr, DXIL_RMWOP_UMIN, nir_type_uint);
+   case nir_intrinsic_shared_atomic_imax_dxil:
+      return emit_shared_atomic(ctx, intr, DXIL_RMWOP_MAX, nir_type_int);
+   case nir_intrinsic_shared_atomic_umax_dxil:
+      return emit_shared_atomic(ctx, intr, DXIL_RMWOP_UMAX, nir_type_uint);
+   case nir_intrinsic_shared_atomic_and_dxil:
+      return emit_shared_atomic(ctx, intr, DXIL_RMWOP_AND, nir_type_uint);
+   case nir_intrinsic_shared_atomic_or_dxil:
+      return emit_shared_atomic(ctx, intr, DXIL_RMWOP_OR, nir_type_uint);
+   case nir_intrinsic_shared_atomic_xor_dxil:
+      return emit_shared_atomic(ctx, intr, DXIL_RMWOP_XOR, nir_type_uint);
+   case nir_intrinsic_shared_atomic_exchange_dxil:
+      return emit_shared_atomic(ctx, intr, DXIL_RMWOP_XCHG, nir_type_int);
+   case nir_intrinsic_shared_atomic_comp_swap_dxil:
+      return emit_shared_atomic_comp_swap(ctx, intr);
+   case nir_intrinsic_image_store:
+      return emit_image_store(ctx, intr);
+   case nir_intrinsic_image_size:
+      return emit_image_size(ctx, intr);
+   /* Known but not yet implemented: fall through to the unsupported path. */
+   case nir_intrinsic_load_num_work_groups:
+   case nir_intrinsic_load_local_group_size:
    default:
       NIR_INSTR_UNSUPPORTED(&intr->instr);
       assert(!"Unimplemented intrinsic instruction");
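
For reference, shared_atomic_comp_swap_dxil, dispatched above, follows the usual compare-and-swap contract: write the new data only if the current dword equals the comparator, and always hand back the value originally read. A minimal C model (hypothetical, for illustration only):

   #include <stdatomic.h>
   #include <stdint.h>

   static uint32_t
   comp_swap_u32(_Atomic uint32_t *word, uint32_t comparator, uint32_t data)
   {
      uint32_t expected = comparator;
      /* On failure, C11 stores the observed value into 'expected'; on
       * success, 'expected' still equals the comparator, which was the
       * original value. Either way, return what was read. */
      atomic_compare_exchange_strong(word, &expected, data);
      return expected;
   }
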
@@ -3266,18 +3880,88 @@ prepare_phi_values(struct ntd_context *ctx, nir_shader *shader)
 static bool
 emit_cbvs(struct ntd_context *ctx, nir_shader *s)
 {
-   for (int i = ctx->opts->ubo_binding_offset; i < s->info.num_ubos; ++i) {
-      char name[64];
-      snprintf(name, sizeof(name), "__ubo%d", i);
-      if (!emit_cbv(ctx, i, 16384 /*4096 vec4's*/, name))
-         return false;
+   if (s->info.stage == MESA_SHADER_KERNEL) {
+      nir_foreach_variable_with_modes(var, s, nir_var_mem_ubo) {
+         if (!emit_ubo_var(ctx, var))
+            return false;
+      }
+   } else {
+      for (int i = ctx->opts->ubo_binding_offset; i < s->info.num_ubos; ++i) {
+         char name[64];
+         snprintf(name, sizeof(name), "__ubo%d", i);
+         if (!emit_cbv(ctx, i, 16384 /*4096 vec4's*/, name))
+            return false;
+      }
    }
 
    return true;
 }
 
 static bool
-emit_module(struct ntd_context *ctx, nir_shader *s)
+emit_scratch(struct ntd_context *ctx, nir_shader *s)
+{
+   if (s->scratch_size) {
+      /*
+       * We always allocate a u32 array, no matter the actual variable types.
+       * According to the DXIL spec, the minimum load/store granularity is
+       * 32-bit; anything smaller requires a read-extract or
+       * read-modify-write approach.
+       */
+      unsigned size = ALIGN_POT(s->scratch_size, sizeof(uint32_t));
+      const struct dxil_type *int32 = dxil_module_get_int_type(&ctx->mod, 32);
+      const struct dxil_value *array_length = dxil_module_get_int32_const(&ctx->mod, size / sizeof(uint32_t));
+      if (!int32 || !array_length)
+         return false;
+
+      const struct dxil_type *type = dxil_module_get_array_type(
+         &ctx->mod, int32, size / sizeof(uint32_t));
+      if (!type)
+         return false;
+
+      ctx->scratchvars = dxil_emit_alloca(&ctx->mod, type, int32, array_length, 4);
+      if (!ctx->scratchvars)
+         return false;
+   }
+
+   return true;
+}
+
+/* The DXIL validator complains if we declare a groupshared global variable
+ * that no instruction actually references, so check for shared ops first.
+ */
+static bool
+shader_has_shared_ops(struct nir_shader *s)
+{
+   nir_foreach_function(func, s) {
+      if (!func->impl)
+         continue;
+      nir_foreach_block(block, func->impl) {
+         nir_foreach_instr(instr, block) {
+            if (instr->type != nir_instr_type_intrinsic)
+               continue;
+            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+            switch (intrin->intrinsic) {
+            case nir_intrinsic_load_shared_dxil:
+            case nir_intrinsic_store_shared_dxil:
+            case nir_intrinsic_store_shared_masked_dxil:
+            case nir_intrinsic_shared_atomic_add_dxil:
+            case nir_intrinsic_shared_atomic_and_dxil:
+            case nir_intrinsic_shared_atomic_comp_swap_dxil:
+            case nir_intrinsic_shared_atomic_exchange_dxil:
+            case nir_intrinsic_shared_atomic_imax_dxil:
+            case nir_intrinsic_shared_atomic_imin_dxil:
+            case nir_intrinsic_shared_atomic_or_dxil:
+            case nir_intrinsic_shared_atomic_umax_dxil:
+            case nir_intrinsic_shared_atomic_umin_dxil:
+            case nir_intrinsic_shared_atomic_xor_dxil:
+               return true;
+            default:
+               break;
+            }
+         }
+      }
+   }
+   return false;
+}
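
A quick worked example of the dword sizing used by emit_scratch above (the macro mirrors Mesa's util/macros.h ALIGN_POT; the concrete numbers are illustrative):

   #include <stdint.h>

   #define ALIGN_POT(x, pot_align) (((x) + (pot_align) - 1) & ~((pot_align) - 1))

   /* 6 bytes of scratch round up to 8 bytes, i.e. an alloca of [2 x i32],
    * which is then addressed by dword index rather than byte offset. */
   _Static_assert(ALIGN_POT(6, sizeof(uint32_t)) / sizeof(uint32_t) == 2,
                  "6 bytes of scratch occupy two i32 slots");
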
+
+static bool
+emit_module(struct ntd_context *ctx, nir_shader *s, const struct nir_to_dxil_options *opts)
 {
    unsigned binding;
@@ -3314,6 +3998,45 @@ emit_module(struct ntd_context *ctx, nir_shader *s)
       }
    }
 
+   if (s->info.cs.shared_size && shader_has_shared_ops(s)) {
+      const struct dxil_type *type;
+      unsigned size;
+
+      /*
+       * We always allocate a u32 array, no matter the actual variable types.
+       * According to the DXIL spec, the minimum load/store granularity is
+       * 32-bit; anything smaller requires a read-extract or
+       * read-modify-write approach. Non-atomic 64-bit accesses are allowed,
+       * but the GEP(cast(gvar, u64[] *), offset) and
+       * cast(GEP(gvar, offset), u64 *) sequences don't seem to be accepted
+       * by the DXIL validator when the pointer is in the groupshared
+       * address space, making the 32-bit -> 64-bit pointer cast impossible.
+       */
+      size = ALIGN_POT(s->info.cs.shared_size, sizeof(uint32_t));
+      type = dxil_module_get_array_type(&ctx->mod,
+                                        dxil_module_get_int_type(&ctx->mod, 32),
+                                        size / sizeof(uint32_t));
+      ctx->sharedvars = dxil_add_global_ptr_var(&ctx->mod, "shared", type,
+                                                DXIL_AS_GROUPSHARED,
+                                                ffs(sizeof(uint64_t)),
+                                                NULL);
+   }
+
+   if (!emit_scratch(ctx, s))
+      return false;
+
+   /* UAVs */
+   if (s->info.stage == MESA_SHADER_KERNEL) {
+      if (!emit_globals(ctx, s, opts->num_kernel_globals))
+         return false;
+
+      ctx->consts = _mesa_pointer_hash_table_create(ctx->ralloc_ctx);
+      if (!ctx->consts)
+         return false;
+      if (!emit_global_consts(ctx, s))
+         return false;
+   }
+
    nir_foreach_variable_with_modes(var, s, nir_var_uniform) {
       unsigned count = glsl_type_get_image_count(var->type);
       if (var->data.mode == nir_var_uniform && count) {
@@ -3383,6 +4106,7 @@ get_dxil_shader_kind(struct nir_shader *s)
       return DXIL_GEOMETRY_SHADER;
    case MESA_SHADER_FRAGMENT:
       return DXIL_PIXEL_SHADER;
+   case MESA_SHADER_KERNEL:
    case MESA_SHADER_COMPUTE:
       return DXIL_COMPUTE_SHADER;
    default:
@@ -3437,11 +4161,16 @@ optimize_nir(struct nir_shader *s, const struct nir_to_dxil_options *opts)
       NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true);
       NIR_PASS(progress, s, nir_opt_algebraic);
       NIR_PASS(progress, s, dxil_nir_lower_x2b);
+      if (s->options->lower_int64_options)
+         NIR_PASS(progress, s, nir_lower_int64);
      NIR_PASS(progress, s, nir_lower_alu);
      NIR_PASS(progress, s, dxil_nir_lower_inot);
      NIR_PASS(progress, s, nir_opt_constant_folding);
      NIR_PASS(progress, s, nir_opt_undef);
+      NIR_PASS(progress, s, nir_lower_undef_to_zero);
      NIR_PASS(progress, s, nir_opt_deref);
+      NIR_PASS(progress, s, dxil_nir_lower_upcast_phis, opts->lower_int16 ? 32 : 16);
+      NIR_PASS(progress, s, nir_lower_64bit_phis);
      NIR_PASS_V(s, nir_lower_system_values);
    } while (progress);
@@ -3602,7 +4331,7 @@ nir_to_dxil(struct nir_shader *s, const struct nir_to_dxil_options *opts,
    if (debug_dxil & DXIL_DEBUG_VERBOSE)
       nir_print_shader(s, stderr);
 
-   if (!emit_module(ctx, s)) {
+   if (!emit_module(ctx, s, opts)) {
      debug_printf("D3D12: emit_module failed\n");
      retval = false;
      goto out;
diff --git a/src/microsoft/compiler/nir_to_dxil.h b/src/microsoft/compiler/nir_to_dxil.h
index d0d7d163f9b..654fc9d350d 100644
--- a/src/microsoft/compiler/nir_to_dxil.h
+++ b/src/microsoft/compiler/nir_to_dxil.h
@@ -52,6 +52,7 @@ struct nir_to_dxil_options {
    bool disable_math_refactoring;
    unsigned ubo_binding_offset;
    unsigned provoking_vertex;
+   unsigned num_kernel_globals;
 };
 
 bool
diff --git a/src/microsoft/meson.build b/src/microsoft/meson.build
index ed218ad4870..05d86832a13 100644
--- a/src/microsoft/meson.build
+++ b/src/microsoft/meson.build
@@ -20,4 +20,7 @@
 # IN THE SOFTWARE.
 
 subdir('compiler')
+if with_microsoft_clc
+  subdir('clc')
+endif
 subdir('resource_state_manager')
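
Finally, a hypothetical caller-side sketch of the new num_kernel_globals option (struct fields from nir_to_dxil.h above; the blob out-parameter and the values chosen here are assumptions for illustration, not part of the patch):

   #include "nir_to_dxil.h"

   /* Compile a CLC kernel's NIR to a DXIL blob, reserving one UAV slot
    * for kernel globals. */
   static bool
   compile_kernel(struct nir_shader *s, struct blob *dxil_blob)
   {
      struct nir_to_dxil_options opts = {
         .ubo_binding_offset = 0,
         .num_kernel_globals = 1,
      };
      return nir_to_dxil(s, &opts, dxil_blob);
   }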