diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h
index fec4d6f43a7..172d372c129 100644
--- a/src/intel/compiler/brw_compiler.h
+++ b/src/intel/compiler/brw_compiler.h
@@ -25,6 +25,7 @@
 #define BRW_COMPILER_H
 
 #include <stdio.h>
+#include "c11/threads.h"
 #include "dev/intel_device_info.h"
 #include "main/config.h"
 #include "util/ralloc.h"
@@ -45,6 +46,11 @@ typedef struct nir_shader nir_shader;
 struct brw_compiler {
    const struct intel_device_info *devinfo;
 
+   /* This lock must be taken if the compiler is to be modified in any way,
+    * including adding something to the ralloc child list.
+    */
+   mtx_t mutex;
+
    struct {
       struct ra_regs *regs;
@@ -109,6 +115,8 @@
     * constant or data cache, UBOs must use VK_FORMAT_RAW.
     */
    bool indirect_ubos_use_sampler;
+
+   struct nir_shader *clc_shader;
 };
 
 #define brw_shader_debug_log(compiler, data, fmt, ... ) do { \
diff --git a/src/intel/compiler/brw_kernel.c b/src/intel/compiler/brw_kernel.c
new file mode 100644
index 00000000000..246343cdbe8
--- /dev/null
+++ b/src/intel/compiler/brw_kernel.c
@@ -0,0 +1,362 @@
+/*
+ * Copyright © 2020 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_kernel.h"
+#include "brw_nir.h"
+
+#include "compiler/nir/nir_builder.h"
+#include "compiler/spirv/nir_spirv.h"
+#include "dev/intel_debug.h"
+#include "util/u_atomic.h"
+
+static const nir_shader *
+load_clc_shader(struct brw_compiler *compiler, struct disk_cache *disk_cache,
+                const nir_shader_compiler_options *nir_options,
+                const struct spirv_to_nir_options *spirv_options)
+{
+   if (compiler->clc_shader)
+      return compiler->clc_shader;
+
+   nir_shader *nir = nir_load_libclc_shader(64, disk_cache,
+                                            spirv_options, nir_options);
+   if (nir == NULL)
+      return NULL;
+
+   const nir_shader *old_nir =
+      p_atomic_cmpxchg(&compiler->clc_shader, NULL, nir);
+   if (old_nir == NULL) {
+      /* We won the race */
+      return nir;
+   } else {
+      /* Someone else built the shader first */
+      ralloc_free(nir);
+      return old_nir;
+   }
+}
+
+static void
+builder_init_new_impl(nir_builder *b, nir_function *func)
+{
+   nir_function_impl *impl = nir_function_impl_create(func);
+   nir_builder_init(b, impl);
+   b->cursor = nir_before_cf_list(&impl->body);
+}
+
+static bool
+lower_kernel_intrinsics(nir_shader *nir)
+{
+   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
+
+   bool progress = false;
+
+   unsigned kernel_sysvals_start = 0;
+   unsigned kernel_arg_start = sizeof(struct brw_kernel_sysvals);
+   nir->num_uniforms += kernel_arg_start;
+
+   nir_builder b;
+   nir_builder_init(&b, impl);
+
+   nir_foreach_block(block, impl) {
+      nir_foreach_instr_safe(instr, block) {
+         if (instr->type != nir_instr_type_intrinsic)
+            continue;
+
+         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+         switch (intrin->intrinsic) {
+         case nir_intrinsic_load_kernel_input: {
+            b.cursor = nir_instr_remove(&intrin->instr);
+
+            nir_intrinsic_instr *load =
+               nir_intrinsic_instr_create(nir, nir_intrinsic_load_uniform);
+            load->num_components = intrin->num_components;
+            load->src[0] = nir_src_for_ssa(nir_u2u32(&b, intrin->src[0].ssa));
+            nir_intrinsic_set_base(load, kernel_arg_start);
+            nir_intrinsic_set_range(load, nir->num_uniforms);
+            nir_ssa_dest_init(&load->instr, &load->dest,
+                              intrin->dest.ssa.num_components,
+                              intrin->dest.ssa.bit_size, NULL);
+            nir_builder_instr_insert(&b, &load->instr);
+
+            nir_ssa_def_rewrite_uses(&intrin->dest.ssa, &load->dest.ssa);
+            progress = true;
+            break;
+         }
+
+         case nir_intrinsic_load_constant_base_ptr: {
+            b.cursor = nir_instr_remove(&intrin->instr);
+            nir_ssa_def *const_data_base_addr = nir_pack_64_2x32_split(&b,
+               nir_load_reloc_const_intel(&b, BRW_SHADER_RELOC_CONST_DATA_ADDR_LOW),
+               nir_load_reloc_const_intel(&b, BRW_SHADER_RELOC_CONST_DATA_ADDR_HIGH));
+            nir_ssa_def_rewrite_uses(&intrin->dest.ssa, const_data_base_addr);
+            progress = true;
+            break;
+         }
+
+         case nir_intrinsic_load_num_workgroups: {
+            b.cursor = nir_instr_remove(&intrin->instr);
+
+            nir_intrinsic_instr *load =
+               nir_intrinsic_instr_create(nir, nir_intrinsic_load_uniform);
+            load->num_components = 3;
+            load->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
+            nir_intrinsic_set_base(load, kernel_sysvals_start +
+               offsetof(struct brw_kernel_sysvals, num_work_groups));
+            nir_intrinsic_set_range(load, 3 * 4);
+            nir_ssa_dest_init(&load->instr, &load->dest, 3, 32, NULL);
+            nir_builder_instr_insert(&b, &load->instr);
+
+            /* We may need to do a bit-size cast here */
+            nir_ssa_def *num_work_groups =
+               nir_u2u(&b, &load->dest.ssa, intrin->dest.ssa.bit_size);
+
+            nir_ssa_def_rewrite_uses(&intrin->dest.ssa, num_work_groups);
+            progress = true;
+            break;
+         }
+
+         default:
+            break;
+         }
+      }
+   }
+
+   if (progress) {
+      nir_metadata_preserve(impl,
+                            nir_metadata_block_index |
+                            nir_metadata_dominance);
+   } else {
+      nir_metadata_preserve(impl, nir_metadata_all);
+   }
+
+   return progress;
+}
+
+bool
+brw_kernel_from_spirv(struct brw_compiler *compiler,
+                      struct disk_cache *disk_cache,
+                      struct brw_kernel *kernel,
+                      void *log_data, void *mem_ctx,
+                      const uint32_t *spirv, size_t spirv_size,
+                      const char *entrypoint_name,
+                      char **error_str)
+{
+   const struct intel_device_info *devinfo = compiler->devinfo;
+   const nir_shader_compiler_options *nir_options =
+      compiler->nir_options[MESA_SHADER_KERNEL];
+
+   struct spirv_to_nir_options spirv_options = {
+      .environment = NIR_SPIRV_OPENCL,
+      .caps = {
+         .address = true,
+         .float16 = devinfo->ver >= 8,
+         .float64 = devinfo->ver >= 8,
+         .image_write_without_format = true,
+         .int8 = devinfo->ver >= 8,
+         .int16 = devinfo->ver >= 8,
+         .int64 = devinfo->ver >= 8,
+         .int64_atomics = devinfo->ver >= 9,
+         .kernel = true,
+         .float_controls = devinfo->ver >= 8,
+         .generic_pointers = true,
+         .storage_8bit = devinfo->ver >= 8,
+         .storage_16bit = devinfo->ver >= 8,
+         .subgroup_arithmetic = true,
+         .subgroup_basic = true,
+         .subgroup_ballot = true,
+         .subgroup_dispatch = true,
+         .subgroup_quad = true,
+         .subgroup_shuffle = true,
+         .subgroup_vote = true,
+
+         .intel_subgroup_shuffle = true,
+         .intel_subgroup_buffer_block_io = true,
+      },
+      .shared_addr_format = nir_address_format_62bit_generic,
+      .global_addr_format = nir_address_format_62bit_generic,
+      .temp_addr_format = nir_address_format_62bit_generic,
+      .constant_addr_format = nir_address_format_64bit_global,
+   };
+
+   spirv_options.clc_shader = load_clc_shader(compiler, disk_cache,
+                                              nir_options, &spirv_options);
+
+   assert(spirv_size % 4 == 0);
+   nir_shader *nir =
+      spirv_to_nir(spirv, spirv_size / 4, NULL, 0, MESA_SHADER_KERNEL,
+                   entrypoint_name, &spirv_options, nir_options);
+   nir_validate_shader(nir, "after spirv_to_nir");
+   nir_validate_ssa_dominance(nir, "after spirv_to_nir");
+   ralloc_steal(mem_ctx, nir);
+   nir->info.name = ralloc_strdup(nir, entrypoint_name);
+
+   if (INTEL_DEBUG(DEBUG_CS)) {
+      /* Re-index SSA defs so we print more sensible numbers. */
+      nir_foreach_function(function, nir) {
+         if (function->impl)
+            nir_index_ssa_defs(function->impl);
+      }
+
+      fprintf(stderr, "NIR (from SPIR-V) for kernel\n");
+      nir_print_shader(nir, stderr);
+   }
+
+   NIR_PASS_V(nir, nir_lower_libclc, spirv_options.clc_shader);
+
+   /* We have to lower away local constant initializers right before we
+    * inline functions. That way they get properly initialized at the top
+    * of the function and not at the top of its caller.
+    */
+   NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_function_temp);
+   NIR_PASS_V(nir, nir_lower_returns);
+   NIR_PASS_V(nir, nir_inline_functions);
+   NIR_PASS_V(nir, nir_copy_prop);
+   NIR_PASS_V(nir, nir_opt_deref);
+
+   /* Pick off the single entrypoint that we want */
+   foreach_list_typed_safe(nir_function, func, node, &nir->functions) {
+      if (!func->is_entrypoint)
+         exec_node_remove(&func->node);
+   }
+   assert(exec_list_length(&nir->functions) == 1);
+
+   /* Now that we've deleted all but the main function, we can go ahead and
+    * lower the rest of the constant initializers. We do this here so that
+    * nir_remove_dead_variables and split_per_member_structs below see the
+    * corresponding stores.
+    */
+   NIR_PASS_V(nir, nir_lower_variable_initializers, ~0);
+
+   /* LLVM loves to take advantage of the fact that vec3s in OpenCL are 16B
+    * aligned and so it can just read/write them as vec4s.
+    * This results in a LOT of vec4->vec3 casts on loads and stores. One
+    * solution to this problem is to get rid of all vec3 variables.
+    */
+   NIR_PASS_V(nir, nir_lower_vec3_to_vec4,
+              nir_var_shader_temp | nir_var_function_temp |
+              nir_var_mem_shared | nir_var_mem_global |
+              nir_var_mem_constant);
+
+   /* We assign explicit types early so that the optimizer can take advantage
+    * of that information and hopefully get rid of some of our memcpys.
+    */
+   NIR_PASS_V(nir, nir_lower_vars_to_explicit_types,
+              nir_var_uniform |
+              nir_var_shader_temp | nir_var_function_temp |
+              nir_var_mem_shared | nir_var_mem_global,
+              glsl_get_cl_type_size_align);
+
+   brw_preprocess_nir(compiler, nir, NULL);
+
+   int max_arg_idx = -1;
+   nir_foreach_uniform_variable(var, nir) {
+      assert(var->data.location < 256);
+      max_arg_idx = MAX2(max_arg_idx, var->data.location);
+   }
+
+   kernel->args_size = nir->num_uniforms;
+   kernel->arg_count = max_arg_idx + 1;
+
+   /* No bindings */
+   struct brw_kernel_arg_desc *args =
+      rzalloc_array(mem_ctx, struct brw_kernel_arg_desc, kernel->arg_count);
+   kernel->args = args;
+
+   nir_foreach_uniform_variable(var, nir) {
+      struct brw_kernel_arg_desc arg_desc = {
+         .offset = var->data.driver_location,
+         .size = glsl_get_explicit_size(var->type, false),
+      };
+      assert(arg_desc.offset + arg_desc.size <= nir->num_uniforms);
+
+      assert(var->data.location >= 0);
+      args[var->data.location] = arg_desc;
+   }
+
+   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_all, NULL);
+
+   /* Lower again, this time after dead-variables to get more compact variable
+    * layouts.
+    */
+   nir->scratch_size = 0;
+   NIR_PASS_V(nir, nir_lower_vars_to_explicit_types,
+              nir_var_shader_temp | nir_var_function_temp |
+              nir_var_mem_shared | nir_var_mem_global | nir_var_mem_constant,
+              glsl_get_cl_type_size_align);
+   if (nir->constant_data_size > 0) {
+      assert(nir->constant_data == NULL);
+      nir->constant_data = rzalloc_size(nir, nir->constant_data_size);
+      nir_gather_explicit_io_initializers(nir, nir->constant_data,
+                                          nir->constant_data_size,
+                                          nir_var_mem_constant);
+   }
+
+   if (INTEL_DEBUG(DEBUG_CS)) {
+      /* Re-index SSA defs so we print more sensible numbers. */
+      nir_foreach_function(function, nir) {
+         if (function->impl)
+            nir_index_ssa_defs(function->impl);
+      }
+
+      fprintf(stderr, "NIR (before I/O lowering) for kernel\n");
+      nir_print_shader(nir, stderr);
+   }
+
+   NIR_PASS_V(nir, nir_lower_memcpy);
+
+   NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_constant,
+              nir_address_format_64bit_global);
+
+   NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_uniform,
+              nir_address_format_32bit_offset_as_64bit);
+
+   NIR_PASS_V(nir, nir_lower_explicit_io,
+              nir_var_shader_temp | nir_var_function_temp |
+              nir_var_mem_shared | nir_var_mem_global,
+              nir_address_format_62bit_generic);
+
+   NIR_PASS_V(nir, nir_lower_frexp);
+   NIR_PASS_V(nir, nir_lower_convert_alu_types, NULL);
+
+   NIR_PASS_V(nir, brw_nir_lower_cs_intrinsics);
+   NIR_PASS_V(nir, lower_kernel_intrinsics);
+
+   struct brw_cs_prog_key key = {
+      .base.subgroup_size_type = BRW_SUBGROUP_SIZE_VARYING,
+   };
+
+   memset(&kernel->prog_data, 0, sizeof(kernel->prog_data));
+   kernel->prog_data.base.nr_params = DIV_ROUND_UP(nir->num_uniforms, 4);
+
+   struct brw_compile_cs_params params = {
+      .nir = nir,
+      .key = &key,
+      .prog_data = &kernel->prog_data,
+      .stats = &kernel->stats,
+      .log_data = log_data,
+   };
+
+   kernel->code = brw_compile_cs(compiler, mem_ctx, &params);
+
+   if (error_str)
+      *error_str = params.error_str;
+
+   return kernel->code != NULL;
+}
diff --git a/src/intel/compiler/brw_kernel.h b/src/intel/compiler/brw_kernel.h
new file mode 100644
index 00000000000..837dc57b4cd
--- /dev/null
+++ b/src/intel/compiler/brw_kernel.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright © 2020 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef BRW_KERNEL_H
+#define BRW_KERNEL_H
+
+#include "brw_compiler.h"
+
+struct disk_cache;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** Software interface for system values in kernels
+ *
+ * These are intended to go at the start of the kernel argument buffer.
+ */
+struct brw_kernel_sysvals {
+   uint32_t num_work_groups[3];
+   uint32_t pad[5];
+};
+
+struct brw_kernel_arg_desc {
+   uint16_t offset;
+   uint16_t size;
+};
+
+struct brw_kernel {
+   struct brw_cs_prog_data prog_data;
+
+   struct brw_compile_stats stats;
+
+   uint16_t args_size;
+   uint16_t arg_count;
+   const struct brw_kernel_arg_desc *args;
+
+   const void *code;
+};
+
+bool
+brw_kernel_from_spirv(struct brw_compiler *compiler,
+                      struct disk_cache *disk_cache,
+                      struct brw_kernel *kernel,
+                      void *log_data, void *mem_ctx,
+                      const uint32_t *spirv, size_t spirv_size,
+                      const char *entrypoint_name,
+                      char **error_str);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* BRW_KERNEL_H */
diff --git a/src/intel/compiler/meson.build b/src/intel/compiler/meson.build
index 3a04973035c..c389dfd54cc 100644
--- a/src/intel/compiler/meson.build
+++ b/src/intel/compiler/meson.build
@@ -76,6 +76,7 @@ libintel_compiler_files = files(
   'brw_ir_performance.h',
   'brw_ir_performance.cpp',
   'brw_ir_vec4.h',
+  'brw_kernel.c',
   'brw_mesh.cpp',
   'brw_nir.h',
   'brw_nir.c',
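
For context only, not part of the patch: a minimal sketch of how a driver might drive the new entry point. The helper name compile_cl_kernel, the NULL log_data, and the way the SPIR-V words arrive are assumptions for illustration; only brw_kernel_from_spirv, struct brw_kernel, and the ralloc-based lifetime come from the patch itself.

#include <stdio.h>
#include "brw_kernel.h"
#include "util/ralloc.h"

/* Hypothetical caller, not part of this patch.  `compiler` and `disk_cache`
 * are assumed to have been created elsewhere by the driver.  Everything the
 * compiler returns (kernel->code, kernel->args, the error string) is
 * ralloc'd out of `mem_ctx`, so the caller controls its lifetime through
 * that context.
 */
static bool
compile_cl_kernel(struct brw_compiler *compiler,
                  struct disk_cache *disk_cache,
                  void *mem_ctx, struct brw_kernel *kernel,
                  const uint32_t *spirv, size_t spirv_size,
                  const char *entrypoint)
{
   char *error_str = NULL;
   bool ok = brw_kernel_from_spirv(compiler, disk_cache, kernel,
                                   NULL /* log_data */, mem_ctx,
                                   spirv, spirv_size, entrypoint, &error_str);
   if (!ok) {
      fprintf(stderr, "kernel compilation failed: %s\n",
              error_str ? error_str : "unknown error");
   }
   return ok;
}

A caller would typically create mem_ctx with ralloc_context(NULL), upload kernel->code to the GPU, and only ralloc_free(mem_ctx) once nothing references the compiled binary or the argument descriptors any more.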
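
Also for illustration only: the argument-buffer layout implied by lower_kernel_intrinsics() and struct brw_kernel_sysvals is [sysvals at offset 0][each OpenCL argument at sizeof(struct brw_kernel_sysvals) + args[i].offset]. The sketch below packs such a buffer under that assumption; fill_kernel_args and its arg_data/arg_sizes parameters are invented names, not part of the patch.

#include <assert.h>
#include <string.h>
#include "brw_kernel.h"

/* Hypothetical helper, not part of this patch: packs the uniform buffer the
 * compiled kernel reads.  `buffer` must be at least
 * sizeof(struct brw_kernel_sysvals) + kernel->args_size bytes.
 */
static void
fill_kernel_args(const struct brw_kernel *kernel,
                 const uint32_t num_work_groups[3],
                 const void * const *arg_data, const size_t *arg_sizes,
                 void *buffer)
{
   /* System values live at offset 0, matching kernel_sysvals_start. */
   struct brw_kernel_sysvals sysvals = { 0 };
   memcpy(sysvals.num_work_groups, num_work_groups,
          sizeof(sysvals.num_work_groups));
   memcpy(buffer, &sysvals, sizeof(sysvals));

   /* Arguments follow the sysvals, at the offsets recorded per argument. */
   char *args_start = (char *)buffer + sizeof(struct brw_kernel_sysvals);
   for (unsigned i = 0; i < kernel->arg_count; i++) {
      assert(arg_sizes[i] == kernel->args[i].size);
      memcpy(args_start + kernel->args[i].offset, arg_data[i],
             kernel->args[i].size);
   }
}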