/*
 * Copyright © 2021 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_private.h"
#include "compiler/shader_info.h"
#include "intel/dev/intel_debug.h"
#include "intel/dev/intel_device_info.h"
#include "util/ralloc.h"

unsigned
brw_required_dispatch_width(const struct shader_info *info)
{
   if ((int)info->subgroup_size >= (int)SUBGROUP_SIZE_REQUIRE_8) {
      assert(gl_shader_stage_uses_workgroup(info->stage));
      /* These enum values are expressly chosen to be equal to the subgroup
       * size that they require.
       */
      return (unsigned)info->subgroup_size;
   } else {
      return 0;
   }
}

static inline bool
test_bit(unsigned mask, unsigned bit)
{
   return mask & (1u << bit);
}

bool
brw_simd_should_compile(void *mem_ctx,
                        unsigned simd,
                        const struct intel_device_info *devinfo,
                        struct brw_cs_prog_data *prog_data,
                        unsigned required,
                        const char **error)
{
   assert(!test_bit(prog_data->prog_mask, simd));
   assert(error);

   const unsigned width = 8u << simd;

   /* For shaders with a variable workgroup size, we always compile all the
    * variants, since the choice can only happen at dispatch time.
    */
   const bool workgroup_size_variable = prog_data->local_size[0] == 0;

   if (!workgroup_size_variable) {
      if (test_bit(prog_data->prog_spilled, simd)) {
         *error = ralloc_asprintf(
            mem_ctx, "SIMD%u skipped because it would spill", width);
         return false;
      }

      const unsigned workgroup_size = prog_data->local_size[0] *
                                      prog_data->local_size[1] *
                                      prog_data->local_size[2];

      unsigned max_threads = devinfo->max_cs_workgroup_threads;

      if (required && required != width) {
         *error = ralloc_asprintf(
            mem_ctx, "SIMD%u skipped because required dispatch width is %u",
            width, required);
         return false;
      }

      if (simd > 0 && test_bit(prog_data->prog_mask, simd - 1) &&
          workgroup_size <= (width / 2)) {
         *error = ralloc_asprintf(
            mem_ctx, "SIMD%u skipped because workgroup size %u already fits in SIMD%u",
            width, workgroup_size, width / 2);
         return false;
      }

      if (DIV_ROUND_UP(workgroup_size, width) > max_threads) {
         *error = ralloc_asprintf(
            mem_ctx, "SIMD%u can't fit all %u invocations in %u threads",
            width, workgroup_size, max_threads);
         return false;
      }
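      /* Worked example of the check above (illustrative numbers only, not
       * taken from this file): a 16x16x4 workgroup has 1024 invocations, so
       * SIMD8 needs DIV_ROUND_UP(1024, 8) = 128 threads. On a device where
       * max_cs_workgroup_threads is 64, SIMD8 is rejected here, while SIMD16
       * (64 threads) and SIMD32 (32 threads) still fit.
       */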
      /* SIMD32 is only enabled when it is actually needed, unless forced.
       *
       * TODO: Use performance_analysis and drop this rule.
       */
      if (width == 32) {
         if (!INTEL_DEBUG(DEBUG_DO32) && prog_data->prog_mask) {
            *error = ralloc_strdup(
               mem_ctx, "SIMD32 skipped because not required");
            return false;
         }
      }
   }

   const bool env_skip[3] = {
      INTEL_DEBUG(DEBUG_NO8),
      INTEL_DEBUG(DEBUG_NO16),
      INTEL_DEBUG(DEBUG_NO32),
   };

   if (unlikely(env_skip[simd])) {
      *error = ralloc_asprintf(
         mem_ctx, "SIMD%u skipped because INTEL_DEBUG=no%u",
         width, width);
      return false;
   }

   return true;
}

void
brw_simd_mark_compiled(unsigned simd, struct brw_cs_prog_data *prog_data, bool spilled)
{
   assert(!test_bit(prog_data->prog_mask, simd));

   prog_data->prog_mask |= 1u << simd;

   /* If a SIMD variant spilled, all the wider ones would spill too. */
   if (spilled) {
      for (unsigned i = simd; i < 3; i++)
         prog_data->prog_spilled |= 1u << i;
   }
}

int
brw_simd_select(const struct brw_cs_prog_data *prog_data)
{
   assert((prog_data->prog_mask & ~0x7u) == 0);

   const unsigned not_spilled_mask =
      prog_data->prog_mask & ~prog_data->prog_spilled;

   /* Util functions index bits from 1 instead of 0, so adjust before
    * returning.
    */
   if (not_spilled_mask)
      return util_last_bit(not_spilled_mask) - 1;
   else if (prog_data->prog_mask)
      return ffs(prog_data->prog_mask) - 1;
   else
      return -1;
}

int
brw_simd_select_for_workgroup_size(const struct intel_device_info *devinfo,
                                   const struct brw_cs_prog_data *prog_data,
                                   const unsigned *sizes)
{
   assert(sizes);

   if (prog_data->local_size[0] == sizes[0] &&
       prog_data->local_size[1] == sizes[1] &&
       prog_data->local_size[2] == sizes[2])
      return brw_simd_select(prog_data);

   void *mem_ctx = ralloc_context(NULL);

   struct brw_cs_prog_data cloned = *prog_data;
   for (unsigned i = 0; i < 3; i++)
      cloned.local_size[i] = sizes[i];
   cloned.prog_mask = 0;
   cloned.prog_spilled = 0;

   const char *error[3] = {0};

   for (unsigned simd = 0; simd < 3; simd++) {
      /* We are not recompiling, so reuse the original prog_mask and
       * prog_spilled results, since they already cover all possible
       * compilations.
       */
      if (brw_simd_should_compile(mem_ctx, simd, devinfo, &cloned,
                                  0 /* required_dispatch_width */,
                                  &error[simd]) &&
          test_bit(prog_data->prog_mask, simd)) {
         brw_simd_mark_compiled(simd, &cloned,
                                test_bit(prog_data->prog_spilled, simd));
      }
   }

   ralloc_free(mem_ctx);

   return brw_simd_select(&cloned);
}
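
/* A minimal usage sketch of the API above, kept as a comment so the file
 * stays unchanged functionally. The compile_cs_variant() helper is
 * hypothetical and stands in for the actual back-end compile; everything
 * else is this file's API. The intended flow is: gate each width with
 * brw_simd_should_compile(), record outcomes with brw_simd_mark_compiled(),
 * then pick the variant to dispatch with brw_simd_select():
 *
 *    const char *error[3] = {0};
 *    for (unsigned simd = 0; simd < 3; simd++) {
 *       if (!brw_simd_should_compile(mem_ctx, simd, devinfo, prog_data,
 *                                    required_width, &error[simd]))
 *          continue;
 *       // Hypothetical: compile the SIMD(8 << simd) variant and report
 *       // whether it spilled; only mark it if compilation succeeded.
 *       bool spilled = compile_cs_variant(simd);
 *       brw_simd_mark_compiled(simd, prog_data, spilled);
 *    }
 *    int simd = brw_simd_select(prog_data);   // -1 if nothing compiled
 */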