/* mesa/src/nouveau/vulkan/nvk_shader.c */
/*
* Copyright © 2022 Collabora Ltd. and Red Hat Inc.
* SPDX-License-Identifier: MIT
*/
#include "nvk_shader.h"
#include "nvk_cmd_buffer.h"
#include "nvk_descriptor_set_layout.h"
#include "nvk_device.h"
#include "nvk_physical_device.h"
#include "nvk_sampler.h"
#include "vk_nir_convert_ycbcr.h"
#include "vk_pipeline.h"
#include "vk_pipeline_layout.h"
#include "vk_shader_module.h"
#include "vk_ycbcr_conversion.h"
#include "nak.h"
#include "nir.h"
#include "nir_builder.h"
#include "compiler/spirv/nir_spirv.h"
#include "nv50_ir_driver.h"
#include "util/mesa-sha1.h"
#include "util/u_debug.h"
#include "cla097.h"
#include "clb097.h"
#include "clc397.h"
#include "clc597.h"
static void
shared_var_info(const struct glsl_type *type, unsigned *size, unsigned *align)
{
assert(glsl_type_is_vector_or_scalar(type));
uint32_t comp_size = glsl_type_is_boolean(type) ? 4 : glsl_get_bit_size(type) / 8;
unsigned length = glsl_get_vector_elements(type);
*size = comp_size * length, *align = comp_size;
}
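
/* Returns the set of shader stages that should be compiled with NAK.  The
 * NVK_USE_NAK environment variable (a comma-separated list of "vs", "tcs",
 * "tes", "gs", "fs", "cs", or "all") overrides the default, which is all
 * stages on Volta and later and none (i.e. codegen) on earlier hardware.
 */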
VkShaderStageFlags
nvk_nak_stages(const struct nv_device_info *info)
{
const VkShaderStageFlags all =
VK_SHADER_STAGE_VERTEX_BIT |
VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT |
VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT |
VK_SHADER_STAGE_GEOMETRY_BIT |
VK_SHADER_STAGE_FRAGMENT_BIT |
VK_SHADER_STAGE_COMPUTE_BIT;
const struct debug_control flags[] = {
{ "vs", BITFIELD64_BIT(MESA_SHADER_VERTEX) },
{ "tcs", BITFIELD64_BIT(MESA_SHADER_TESS_CTRL) },
{ "tes", BITFIELD64_BIT(MESA_SHADER_TESS_EVAL) },
{ "gs", BITFIELD64_BIT(MESA_SHADER_GEOMETRY) },
{ "fs", BITFIELD64_BIT(MESA_SHADER_FRAGMENT) },
{ "cs", BITFIELD64_BIT(MESA_SHADER_COMPUTE) },
{ "all", all },
{ NULL, 0 },
};
const char *env_str = getenv("NVK_USE_NAK");
if (env_str == NULL)
return info->cls_eng3d >= VOLTA_A ? all : 0;
else
return parse_debug_string(env_str, flags);
}
static bool
use_nak(const struct nvk_physical_device *pdev, gl_shader_stage stage)
{
return nvk_nak_stages(&pdev->info) & mesa_to_vk_shader_stage(stage);
}
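
/* Packs every compiler setting that can affect generated code into a single
 * 64-bit value:
 *
 *   bits  [7:0]  codegen (nv50_ir) debug flags
 *   bits [11:8]  codegen optimization level
 *   bit     12   NVK_DEBUG_NO_CBUF
 *   bits [47:16] NAK stage mask (see nvk_nak_stages)
 *   bits [63:48] NAK debug flags
 */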
uint64_t
nvk_physical_device_compiler_flags(const struct nvk_physical_device *pdev)
{
bool no_cbufs = pdev->debug_flags & NVK_DEBUG_NO_CBUF;
uint64_t prog_debug = nvk_cg_get_prog_debug();
uint64_t prog_optimize = nvk_cg_get_prog_optimize();
uint64_t nak_stages = nvk_nak_stages(&pdev->info);
uint64_t nak_flags = nak_debug_flags(pdev->nak);
assert(prog_debug <= UINT8_MAX);
assert(prog_optimize < 16);
assert(nak_stages <= UINT32_MAX);
assert(nak_flags <= UINT16_MAX);
return prog_debug
| (prog_optimize << 8)
| ((uint64_t)no_cbufs << 12)
| (nak_stages << 16)
| (nak_flags << 48);
}
static const nir_shader_compiler_options *
nvk_get_nir_options(struct vk_physical_device *vk_pdev,
gl_shader_stage stage,
UNUSED const struct vk_pipeline_robustness_state *rs)
{
const struct nvk_physical_device *pdev =
container_of(vk_pdev, struct nvk_physical_device, vk);
if (use_nak(pdev, stage))
return nak_nir_options(pdev->nak);
else
return nvk_cg_nir_options(pdev, stage);
}
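
/* vk_device_shader_ops::get_spirv_options hook: the SPIR-V capability set
 * NVK exposes plus the buffer address formats derived from the pipeline
 * robustness state.
 */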
static struct spirv_to_nir_options
nvk_get_spirv_options(struct vk_physical_device *vk_pdev,
UNUSED gl_shader_stage stage,
const struct vk_pipeline_robustness_state *rs)
{
const struct nvk_physical_device *pdev =
container_of(vk_pdev, struct nvk_physical_device, vk);
return (struct spirv_to_nir_options) {
.caps = {
.demote_to_helper_invocation = true,
.descriptor_array_dynamic_indexing = true,
.descriptor_array_non_uniform_indexing = true,
.descriptor_indexing = true,
.device_group = true,
.draw_parameters = true,
.float_controls = true,
.float16 = true,
.float64 = true,
.fragment_barycentric = true,
.geometry_streams = true,
.image_atomic_int64 = true,
.image_ms_array = true,
.image_read_without_format = true,
.image_write_without_format = true,
.int8 = true,
.int16 = true,
.int64 = true,
.int64_atomics = true,
.min_lod = true,
.multiview = true,
.physical_storage_buffer_address = true,
.runtime_descriptor_array = true,
.shader_clock = true,
.shader_sm_builtins_nv = true,
.shader_viewport_index_layer = true,
.sparse_residency = true,
.storage_8bit = true,
.storage_16bit = true,
.storage_image_ms = true,
.subgroup_arithmetic = true,
.subgroup_ballot = true,
.subgroup_basic = true,
.subgroup_quad = true,
.subgroup_rotate = true,
.subgroup_shuffle = true,
.subgroup_uniform_control_flow = true,
.subgroup_vote = true,
.tessellation = true,
.transform_feedback = true,
.variable_pointers = true,
.vk_memory_model_device_scope = true,
.vk_memory_model = true,
.workgroup_memory_explicit_layout = true,
},
.ssbo_addr_format = nvk_buffer_addr_format(rs->storage_buffers),
.phys_ssbo_addr_format = nir_address_format_64bit_global,
.ubo_addr_format = nvk_buffer_addr_format(rs->uniform_buffers),
.shared_addr_format = nir_address_format_32bit_offset,
.min_ssbo_alignment = NVK_MIN_SSBO_ALIGNMENT,
.min_ubo_alignment = nvk_min_cbuf_alignment(&pdev->info),
};
}
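
/* vk_device_shader_ops::preprocess_nir hook: pipeline-state-independent
 * lowering run on each shader before the state-dependent passes, dispatching
 * to NAK or codegen preprocessing.
 */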
static void
nvk_preprocess_nir(struct vk_physical_device *vk_pdev, nir_shader *nir)
{
const struct nvk_physical_device *pdev =
container_of(vk_pdev, struct nvk_physical_device, vk);
NIR_PASS_V(nir, nir_lower_io_to_temporaries,
nir_shader_get_entrypoint(nir), true, false);
if (use_nak(pdev, nir->info.stage))
nak_preprocess_nir(nir, pdev->nak);
else
nvk_cg_preprocess_nir(nir);
}
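
/* Fills out the NAK fragment shader key from the graphics pipeline state:
 * the sample-locations cbuf binding, depth/stencil feedback-loop self
 * dependency, and forced per-sample interpolation (see the comment below).
 */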
static void
nvk_populate_fs_key(struct nak_fs_key *key,
const struct vk_graphics_pipeline_state *state)
{
memset(key, 0, sizeof(*key));
key->sample_locations_cb = 0;
key->sample_locations_offset = nvk_root_descriptor_offset(draw.sample_locations);
if (state == NULL)
return;
if (state->pipeline_flags &
VK_PIPELINE_CREATE_2_DEPTH_STENCIL_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT)
key->zs_self_dep = true;
/* We force per-sample interpolation whenever sampleShadingEnable is set
* regardless of minSampleShading or rasterizationSamples.
*
* When sampleShadingEnable is set, few guarantees are made about the
* location of interpolation of the inputs. The only real guarantees are
* that the inputs are interpolated within the pixel and that you get at
* least `rasterizationSamples * minSampleShading` unique positions.
* Importantly, when `rasterizationSamples * minSampleShading <= 1.0`, it
* does not require that those positions be at the fragment center.
* Therefore, it's valid to just always do per-sample (which maps
* to CENTROID on NVIDIA hardware) all the time and let the hardware sort
* it out based on what we set in HYBRID_ANTI_ALIAS_CONTROL::passes.
*
* Also, we set HYBRID_ANTI_ALIAS_CONTROL::centroid at draw time based on
* `rasterizationSamples * minSampleShading` so it should be per-pixel
* whenever we're running only a single pass. However, this would still be
* correct even if it got interpolated at some other sample.
*
* The one caveat here is that we have to be careful about gl_SampleMaskIn.
* When `nak_fs_key::force_sample_shading = true` we also turn any reads of
* gl_SampleMaskIn into `1 << gl_SampleID` because the hardware sample mask
* is actually per-fragment, not per-pass. We handle this by smashing
* minSampleShading to 1.0 whenever gl_SampleMaskIn is read.
*/
const struct vk_multisample_state *ms = state->ms;
if (ms != NULL && ms->sample_shading_enable)
key->force_sample_shading = true;
}
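
/* vk_device_shader_ops::hash_graphics_state hook: hashes the only bits of
 * graphics state that feed compilation (the FS key and whether multiview is
 * enabled) into a BLAKE3 digest.
 */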
static void
nvk_hash_graphics_state(struct vk_physical_device *device,
const struct vk_graphics_pipeline_state *state,
VkShaderStageFlags stages,
blake3_hash blake3_out)
{
struct mesa_blake3 blake3_ctx;
_mesa_blake3_init(&blake3_ctx);
if (stages & VK_SHADER_STAGE_FRAGMENT_BIT) {
struct nak_fs_key key;
nvk_populate_fs_key(&key, state);
_mesa_blake3_update(&blake3_ctx, &key, sizeof(key));
const bool is_multiview = state->rp->view_mask != 0;
_mesa_blake3_update(&blake3_ctx, &is_multiview, sizeof(is_multiview));
}
_mesa_blake3_final(&blake3_ctx, blake3_out);
}
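
/* Lowers load_global_constant_offset/bounded intrinsics to a plain
 * nir_load_global_constant at base + offset.  For the bounded variant, the
 * load is wrapped in an if-block that checks offset + load size against the
 * bound and yields zero when the access is out of bounds.
 */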
static bool
lower_load_global_constant_offset_instr(nir_builder *b,
nir_intrinsic_instr *intrin,
UNUSED void *_data)
{
if (intrin->intrinsic != nir_intrinsic_load_global_constant_offset &&
intrin->intrinsic != nir_intrinsic_load_global_constant_bounded)
return false;
b->cursor = nir_before_instr(&intrin->instr);
nir_def *base_addr = intrin->src[0].ssa;
nir_def *offset = intrin->src[1].ssa;
nir_def *zero = NULL;
if (intrin->intrinsic == nir_intrinsic_load_global_constant_bounded) {
nir_def *bound = intrin->src[2].ssa;
unsigned bit_size = intrin->def.bit_size;
assert(bit_size >= 8 && bit_size % 8 == 0);
unsigned byte_size = bit_size / 8;
zero = nir_imm_zero(b, intrin->num_components, bit_size);
unsigned load_size = byte_size * intrin->num_components;
nir_def *sat_offset =
nir_umin(b, offset, nir_imm_int(b, UINT32_MAX - (load_size - 1)));
nir_def *in_bounds =
nir_ilt(b, nir_iadd_imm(b, sat_offset, load_size - 1), bound);
nir_push_if(b, in_bounds);
}
nir_def *val =
nir_build_load_global_constant(b, intrin->def.num_components,
intrin->def.bit_size,
nir_iadd(b, base_addr, nir_u2u64(b, offset)),
.align_mul = nir_intrinsic_align_mul(intrin),
.align_offset = nir_intrinsic_align_offset(intrin));
if (intrin->intrinsic == nir_intrinsic_load_global_constant_bounded) {
nir_pop_if(b, NULL);
val = nir_if_phi(b, val, zero);
}
nir_def_rewrite_uses(&intrin->def, val);
return true;
}
struct lower_ycbcr_state {
uint32_t set_layout_count;
struct vk_descriptor_set_layout * const *set_layouts;
};
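
/* Callback for nir_vk_lower_ycbcr_tex: returns the YCbCr conversion state of
 * the immutable sampler at the given set/binding/array index, or NULL if
 * there is none.
 */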
static const struct vk_ycbcr_conversion_state *
lookup_ycbcr_conversion(const void *_state, uint32_t set,
uint32_t binding, uint32_t array_index)
{
const struct lower_ycbcr_state *state = _state;
assert(set < state->set_layout_count);
assert(state->set_layouts[set] != NULL);
const struct nvk_descriptor_set_layout *set_layout =
vk_to_nvk_descriptor_set_layout(state->set_layouts[set]);
assert(binding < set_layout->binding_count);
const struct nvk_descriptor_set_binding_layout *bind_layout =
&set_layout->binding[binding];
if (bind_layout->immutable_samplers == NULL)
return NULL;
array_index = MIN2(array_index, bind_layout->array_size - 1);
const struct nvk_sampler *sampler =
bind_layout->immutable_samplers[array_index];
return sampler && sampler->vk.ycbcr_conversion ?
&sampler->vk.ycbcr_conversion->state : NULL;
}
static inline bool
nir_has_image_var(nir_shader *nir)
{
nir_foreach_image_variable(_, nir)
return true;
return false;
}
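
/* Runs NVK's state-dependent NIR lowering: input attachments, YCbCr
 * conversions, push constants, non-uniform access, descriptors, and explicit
 * I/O for global, SSBO, UBO, and shared memory.  Also fills out
 * *cbuf_map_out, which describes the cbufs the shader expects to be bound at
 * draw time.
 */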
void
nvk_lower_nir(struct nvk_device *dev, nir_shader *nir,
const struct vk_pipeline_robustness_state *rs,
bool is_multiview,
uint32_t set_layout_count,
struct vk_descriptor_set_layout * const *set_layouts,
struct nvk_cbuf_map *cbuf_map_out)
{
struct nvk_physical_device *pdev = nvk_device_physical(dev);
if (nir->info.stage == MESA_SHADER_FRAGMENT) {
NIR_PASS(_, nir, nir_lower_input_attachments,
&(nir_input_attachment_options) {
.use_fragcoord_sysval = use_nak(pdev, nir->info.stage),
.use_layer_id_sysval = use_nak(pdev, nir->info.stage) ||
is_multiview,
.use_view_id_for_layer = is_multiview,
});
}
if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
NIR_PASS(_, nir, nir_lower_patch_vertices,
nir->info.tess.tcs_vertices_out, NULL);
}
const struct lower_ycbcr_state ycbcr_state = {
.set_layout_count = set_layout_count,
.set_layouts = set_layouts,
};
NIR_PASS(_, nir, nir_vk_lower_ycbcr_tex,
lookup_ycbcr_conversion, &ycbcr_state);
nir_lower_compute_system_values_options csv_options = {
.has_base_workgroup_id = true,
};
NIR_PASS(_, nir, nir_lower_compute_system_values, &csv_options);
/* Lower push constants before lower_descriptors */
NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_push_const,
nir_address_format_32bit_offset);
/* Lower non-uniform access before lower_descriptors */
enum nir_lower_non_uniform_access_type lower_non_uniform_access_types =
nir_lower_non_uniform_ubo_access;
if (pdev->info.cls_eng3d < TURING_A) {
lower_non_uniform_access_types |= nir_lower_non_uniform_texture_access |
nir_lower_non_uniform_image_access;
}
/* In practice, most shaders have no non-uniform-qualified accesses at all,
* so we first run a cheaper check that is likely to come back false.
*/
if (nir_has_non_uniform_access(nir, lower_non_uniform_access_types)) {
struct nir_lower_non_uniform_access_options opts = {
.types = lower_non_uniform_access_types,
.callback = NULL,
};
NIR_PASS(_, nir, nir_opt_non_uniform_access);
NIR_PASS(_, nir, nir_lower_non_uniform_access, &opts);
}
/* TODO: Kepler image lowering requires image params to be loaded from the
* descriptor set which we don't currently support.
*/
assert(pdev->info.cls_eng3d >= MAXWELL_A || !nir_has_image_var(nir));
struct nvk_cbuf_map *cbuf_map = NULL;
if (use_nak(pdev, nir->info.stage) &&
!(pdev->debug_flags & NVK_DEBUG_NO_CBUF)) {
cbuf_map = cbuf_map_out;
/* Large constant support assumes cbufs */
NIR_PASS(_, nir, nir_opt_large_constants, NULL, 32);
} else {
/* Codegen sometimes puts stuff in cbuf 1 and adds 1 to our cbuf indices,
* so we can't really rely on it for lowering to cbufs. Instead, place the
* root descriptors in both cbuf 0 and cbuf 1.
*/
*cbuf_map_out = (struct nvk_cbuf_map) {
.cbuf_count = 2,
.cbufs = {
{ .type = NVK_CBUF_TYPE_ROOT_DESC },
{ .type = NVK_CBUF_TYPE_ROOT_DESC },
}
};
}
NIR_PASS(_, nir, nvk_nir_lower_descriptors, rs,
set_layout_count, set_layouts, cbuf_map);
NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_global,
nir_address_format_64bit_global);
NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ssbo,
nvk_buffer_addr_format(rs->storage_buffers));
NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ubo,
nvk_buffer_addr_format(rs->uniform_buffers));
NIR_PASS(_, nir, nir_shader_intrinsics_pass,
lower_load_global_constant_offset_instr, nir_metadata_none, NULL);
if (!nir->info.shared_memory_explicit_layout) {
NIR_PASS(_, nir, nir_lower_vars_to_explicit_types,
nir_var_mem_shared, shared_var_info);
}
NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_shared,
nir_address_format_32bit_offset);
if (nir->info.zero_initialize_shared_memory && nir->info.shared_size > 0) {
/* QMD::SHARED_MEMORY_SIZE requires an alignment of 256B, so it's safe to
* round the size up to 16B and write whole vec4s.
*/
nir->info.shared_size = align(nir->info.shared_size, 16);
NIR_PASS(_, nir, nir_zero_initialize_shared_memory,
nir->info.shared_size, 16);
/* We need to call lower_compute_system_values again because
* nir_zero_initialize_shared_memory generates load_invocation_id which
* has to be lowered to load_invocation_index.
*/
NIR_PASS(_, nir, nir_lower_compute_system_values, NULL);
}
}
#ifndef NDEBUG
static void
nvk_shader_dump(struct nvk_shader *shader)
{
unsigned pos;
if (shader->info.stage != MESA_SHADER_COMPUTE) {
_debug_printf("dumping HDR for %s shader\n",
_mesa_shader_stage_to_string(shader->info.stage));
for (pos = 0; pos < ARRAY_SIZE(shader->info.hdr); ++pos)
_debug_printf("HDR[%02"PRIxPTR"] = 0x%08x\n",
pos * sizeof(shader->info.hdr[0]), shader->info.hdr[pos]);
}
_debug_printf("shader binary code (0x%x bytes):", shader->code_size);
for (pos = 0; pos < shader->code_size / 4; ++pos) {
if ((pos % 8) == 0)
_debug_printf("\n");
_debug_printf("%08x ", ((const uint32_t *)shader->code_ptr)[pos]);
}
_debug_printf("\n");
}
#endif
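
/* Compiles the NIR with NAK and points the nvk_shader at the resulting
 * binary.  The returned nak_shader_bin owns the code; it is freed in
 * nvk_shader_destroy().
 */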
static VkResult
nvk_compile_nir_with_nak(struct nvk_physical_device *pdev,
nir_shader *nir,
VkShaderCreateFlagsEXT shader_flags,
const struct vk_pipeline_robustness_state *rs,
const struct nak_fs_key *fs_key,
struct nvk_shader *shader)
{
const bool dump_asm =
shader_flags & VK_SHADER_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_MESA;
nir_variable_mode robust2_modes = 0;
if (rs->uniform_buffers == VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT)
robust2_modes |= nir_var_mem_ubo;
if (rs->storage_buffers == VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT)
robust2_modes |= nir_var_mem_ssbo;
shader->nak = nak_compile_shader(nir, dump_asm, pdev->nak, robust2_modes, fs_key);
shader->info = shader->nak->info;
shader->code_ptr = shader->nak->code;
shader->code_size = shader->nak->code_size;
return VK_SUCCESS;
}
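
/* Compiles the NIR with either NAK or codegen, then copies the shader's
 * constant data into a cbuf-aligned, zero-padded allocation for upload.
 */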
static VkResult
nvk_compile_nir(struct nvk_device *dev, nir_shader *nir,
VkShaderCreateFlagsEXT shader_flags,
const struct vk_pipeline_robustness_state *rs,
const struct nak_fs_key *fs_key,
struct nvk_shader *shader)
{
struct nvk_physical_device *pdev = nvk_device_physical(dev);
VkResult result;
if (use_nak(pdev, nir->info.stage)) {
result = nvk_compile_nir_with_nak(pdev, nir, shader_flags, rs,
fs_key, shader);
} else {
result = nvk_cg_compile_nir(pdev, nir, fs_key, shader);
}
if (result != VK_SUCCESS)
return result;
if (nir->constant_data_size > 0) {
uint32_t data_align = nvk_min_cbuf_alignment(&pdev->info);
uint32_t data_size = align(nir->constant_data_size, data_align);
void *data = malloc(data_size);
if (data == NULL)
return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
memcpy(data, nir->constant_data, nir->constant_data_size);
assert(nir->constant_data_size <= data_size);
memset(data + nir->constant_data_size, 0,
data_size - nir->constant_data_size);
shader->data_ptr = data;
shader->data_size = data_size;
}
return VK_SUCCESS;
}
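
/* Uploads the shader to the device's shader heap as [header][code][data],
 * with the code aligned as required by the hardware generation, and records
 * the resulting GPU addresses.  On pre-Volta hardware, hdr_addr is stored as
 * a 32-bit offset from the contiguous base of the shader heap.
 */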
VkResult
nvk_shader_upload(struct nvk_device *dev, struct nvk_shader *shader)
{
struct nvk_physical_device *pdev = nvk_device_physical(dev);
uint32_t hdr_size = 0;
if (shader->info.stage != MESA_SHADER_COMPUTE) {
if (pdev->info.cls_eng3d >= TURING_A)
hdr_size = TU102_SHADER_HEADER_SIZE;
else
hdr_size = GF100_SHADER_HEADER_SIZE;
}
/* Fermi needs 0x40 alignment
* Kepler+ needs the first instruction to be 0x80 aligned, so we waste 0x30 bytes
*/
int alignment = pdev->info.cls_eng3d >= KEPLER_A ? 0x80 : 0x40;
uint32_t total_size = 0;
if (pdev->info.cls_eng3d >= KEPLER_A &&
pdev->info.cls_eng3d < TURING_A &&
hdr_size > 0) {
/* The instructions are what has to be aligned so we need to start at a
* small offset (0x30 B) into the upload area.
*/
total_size = alignment - hdr_size;
}
const uint32_t hdr_offset = total_size;
total_size += hdr_size;
const uint32_t code_offset = total_size;
assert(code_offset % alignment == 0);
total_size += shader->code_size;
uint32_t data_offset = 0;
if (shader->data_size > 0) {
total_size = align(total_size, nvk_min_cbuf_alignment(&pdev->info));
data_offset = total_size;
total_size += shader->data_size;
}
char *data = malloc(total_size);
if (data == NULL)
return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
assert(hdr_size <= sizeof(shader->info.hdr));
memcpy(data + hdr_offset, shader->info.hdr, hdr_size);
memcpy(data + code_offset, shader->code_ptr, shader->code_size);
if (shader->data_size > 0)
memcpy(data + data_offset, shader->data_ptr, shader->data_size);
#ifndef NDEBUG
if (debug_get_bool_option("NV50_PROG_DEBUG", false))
nvk_shader_dump(shader);
#endif
VkResult result = nvk_heap_upload(dev, &dev->shader_heap, data,
total_size, alignment,
&shader->upload_addr);
if (result == VK_SUCCESS) {
shader->upload_size = total_size;
shader->hdr_addr = shader->upload_addr + hdr_offset;
if (pdev->info.cls_eng3d < VOLTA_A) {
const uint64_t heap_base_addr =
nvk_heap_contiguous_base_address(&dev->shader_heap);
assert(shader->upload_addr - heap_base_addr < UINT32_MAX);
shader->hdr_addr -= heap_base_addr;
}
shader->data_addr = shader->upload_addr + data_offset;
}
free(data);
return result;
}
static const struct vk_shader_ops nvk_shader_ops;
static void
nvk_shader_destroy(struct vk_device *vk_dev,
struct vk_shader *vk_shader,
const VkAllocationCallbacks* pAllocator)
{
struct nvk_device *dev = container_of(vk_dev, struct nvk_device, vk);
struct nvk_shader *shader = container_of(vk_shader, struct nvk_shader, vk);
if (shader->upload_size > 0) {
nvk_heap_free(dev, &dev->shader_heap,
shader->upload_addr,
shader->upload_size);
}
if (shader->nak) {
nak_shader_bin_destroy(shader->nak);
} else {
/* This came from codegen or deserialize, just free it */
free((void *)shader->code_ptr);
}
free((void *)shader->data_ptr);
vk_shader_free(&dev->vk, pAllocator, &shader->vk);
}
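
/* Lowers and compiles a single shader, uploads it to the shader heap, and
 * derives min_sample_shading for fragment shaders.  The NIR is consumed
 * (freed) regardless of success or failure.
 */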
static VkResult
nvk_compile_shader(struct nvk_device *dev,
struct vk_shader_compile_info *info,
const struct vk_graphics_pipeline_state *state,
const VkAllocationCallbacks* pAllocator,
struct vk_shader **shader_out)
{
struct nvk_shader *shader;
VkResult result;
/* We consume the NIR, regardless of success or failure */
nir_shader *nir = info->nir;
shader = vk_shader_zalloc(&dev->vk, &nvk_shader_ops, info->stage,
pAllocator, sizeof(*shader));
if (shader == NULL) {
ralloc_free(nir);
return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
}
/* TODO: Multiview with ESO */
const bool is_multiview = state && state->rp->view_mask != 0;
nvk_lower_nir(dev, nir, info->robustness, is_multiview,
info->set_layout_count, info->set_layouts,
&shader->cbuf_map);
struct nak_fs_key fs_key_tmp, *fs_key = NULL;
if (nir->info.stage == MESA_SHADER_FRAGMENT) {
nvk_populate_fs_key(&fs_key_tmp, state);
fs_key = &fs_key_tmp;
}
result = nvk_compile_nir(dev, nir, info->flags, info->robustness,
fs_key, shader);
ralloc_free(nir);
if (result != VK_SUCCESS) {
nvk_shader_destroy(&dev->vk, &shader->vk, pAllocator);
return result;
}
result = nvk_shader_upload(dev, shader);
if (result != VK_SUCCESS) {
nvk_shader_destroy(&dev->vk, &shader->vk, pAllocator);
return result;
}
if (info->stage == MESA_SHADER_FRAGMENT) {
if (shader->info.fs.reads_sample_mask ||
shader->info.fs.uses_sample_shading) {
shader->min_sample_shading = 1;
} else if (state != NULL && state->ms != NULL &&
state->ms->sample_shading_enable) {
shader->min_sample_shading =
CLAMP(state->ms->min_sample_shading, 0, 1);
} else {
shader->min_sample_shading = 0;
}
}
*shader_out = &shader->vk;
return VK_SUCCESS;
}
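
/* vk_device_shader_ops::compile hook: compiles each shader in order.  On
 * failure, already-built shaders are destroyed, the remaining NIR shaders
 * are freed, and the output array is zeroed.
 */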
static VkResult
nvk_compile_shaders(struct vk_device *vk_dev,
uint32_t shader_count,
struct vk_shader_compile_info *infos,
const struct vk_graphics_pipeline_state *state,
const VkAllocationCallbacks* pAllocator,
struct vk_shader **shaders_out)
{
struct nvk_device *dev = container_of(vk_dev, struct nvk_device, vk);
for (uint32_t i = 0; i < shader_count; i++) {
VkResult result = nvk_compile_shader(dev, &infos[i], state,
pAllocator, &shaders_out[i]);
if (result != VK_SUCCESS) {
/* Clean up all the shaders before this point */
for (uint32_t j = 0; j < i; j++)
nvk_shader_destroy(&dev->vk, shaders_out[j], pAllocator);
/* Clean up all the NIR after this point */
for (uint32_t j = i + 1; j < shader_count; j++)
ralloc_free(infos[j].nir);
/* Memset the output array */
memset(shaders_out, 0, shader_count * sizeof(*shaders_out));
return result;
}
}
return VK_SUCCESS;
}
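
/* vk_device_shader_ops::deserialize hook: rebuilds an nvk_shader from a
 * serialized blob (info, cbuf map, min_sample_shading, code, and data; see
 * nvk_shader_serialize) and uploads it to the shader heap.
 */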
static VkResult
nvk_deserialize_shader(struct vk_device *vk_dev,
struct blob_reader *blob,
uint32_t binary_version,
const VkAllocationCallbacks* pAllocator,
struct vk_shader **shader_out)
{
struct nvk_device *dev = container_of(vk_dev, struct nvk_device, vk);
struct nvk_shader *shader;
VkResult result;
struct nak_shader_info info;
blob_copy_bytes(blob, &info, sizeof(info));
struct nvk_cbuf_map cbuf_map;
blob_copy_bytes(blob, &cbuf_map, sizeof(cbuf_map));
float min_sample_shading;
blob_copy_bytes(blob, &min_sample_shading, sizeof(min_sample_shading));
const uint32_t code_size = blob_read_uint32(blob);
const uint32_t data_size = blob_read_uint32(blob);
if (blob->overrun)
return vk_error(dev, VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT);
shader = vk_shader_zalloc(&dev->vk, &nvk_shader_ops, info.stage,
pAllocator, sizeof(*shader));
if (shader == NULL)
return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
shader->info = info;
shader->cbuf_map = cbuf_map;
shader->min_sample_shading = min_sample_shading;
shader->code_size = code_size;
shader->data_size = data_size;
shader->code_ptr = malloc(code_size);
if (shader->code_ptr == NULL) {
nvk_shader_destroy(&dev->vk, &shader->vk, pAllocator);
return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
}
shader->data_ptr = malloc(data_size);
if (shader->data_ptr == NULL) {
nvk_shader_destroy(&dev->vk, &shader->vk, pAllocator);
return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
}
blob_copy_bytes(blob, (void *)shader->code_ptr, shader->code_size);
blob_copy_bytes(blob, (void *)shader->data_ptr, shader->data_size);
if (blob->overrun) {
nvk_shader_destroy(&dev->vk, &shader->vk, pAllocator);
return vk_error(dev, VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT);
}
result = nvk_shader_upload(dev, shader);
if (result != VK_SUCCESS) {
nvk_shader_destroy(&dev->vk, &shader->vk, pAllocator);
return result;
}
*shader_out = &shader->vk;
return VK_SUCCESS;
}
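
/* Serializes the shader in the layout consumed by nvk_deserialize_shader.
 * Shaders that still carry NAK assembly text are not serialized (the
 * assembly can't currently be cached).
 */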
static bool
nvk_shader_serialize(struct vk_device *vk_dev,
const struct vk_shader *vk_shader,
struct blob *blob)
{
struct nvk_shader *shader = container_of(vk_shader, struct nvk_shader, vk);
/* We can't currently cache assembly */
if (shader->nak != NULL && shader->nak->asm_str != NULL)
return false;
blob_write_bytes(blob, &shader->info, sizeof(shader->info));
blob_write_bytes(blob, &shader->cbuf_map, sizeof(shader->cbuf_map));
blob_write_bytes(blob, &shader->min_sample_shading,
sizeof(shader->min_sample_shading));
blob_write_uint32(blob, shader->code_size);
blob_write_uint32(blob, shader->data_size);
blob_write_bytes(blob, shader->code_ptr, shader->code_size);
blob_write_bytes(blob, shader->data_ptr, shader->data_size);
return !blob->out_of_memory;
}
#define WRITE_STR(field, ...) ({ \
memset(field, 0, sizeof(field)); \
UNUSED int i = snprintf(field, sizeof(field), __VA_ARGS__); \
assert(i > 0 && i < sizeof(field)); \
})
static VkResult
nvk_shader_get_executable_properties(
UNUSED struct vk_device *device,
const struct vk_shader *vk_shader,
uint32_t *executable_count,
VkPipelineExecutablePropertiesKHR *properties)
{
struct nvk_shader *shader = container_of(vk_shader, struct nvk_shader, vk);
VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutablePropertiesKHR, out,
properties, executable_count);
vk_outarray_append_typed(VkPipelineExecutablePropertiesKHR, &out, props) {
props->stages = mesa_to_vk_shader_stage(shader->info.stage);
props->subgroupSize = 32;
WRITE_STR(props->name, "%s",
_mesa_shader_stage_to_string(shader->info.stage));
WRITE_STR(props->description, "%s shader",
_mesa_shader_stage_to_string(shader->info.stage));
}
return vk_outarray_status(&out);
}
static VkResult
nvk_shader_get_executable_statistics(
UNUSED struct vk_device *device,
const struct vk_shader *vk_shader,
uint32_t executable_index,
uint32_t *statistic_count,
VkPipelineExecutableStatisticKHR *statistics)
{
struct nvk_shader *shader = container_of(vk_shader, struct nvk_shader, vk);
VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableStatisticKHR, out,
statistics, statistic_count);
assert(executable_index == 0);
vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
WRITE_STR(stat->name, "Code Size");
WRITE_STR(stat->description,
"Size of the compiled shader binary, in bytes");
stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
stat->value.u64 = shader->code_size;
}
vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
WRITE_STR(stat->name, "Number of GPRs");
WRITE_STR(stat->description, "Number of GPRs used by this pipeline");
stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
stat->value.u64 = shader->info.num_gprs;
}
vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
WRITE_STR(stat->name, "SLM Size");
WRITE_STR(stat->description,
"Size of shader local (scratch) memory, in bytes");
stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
stat->value.u64 = shader->info.slm_size;
}
return vk_outarray_status(&out);
}
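
/* Helper for the VkPipelineExecutableInternalRepresentationKHR two-call
 * idiom: when pData is NULL, only the required size is returned; otherwise
 * the text is copied and false is returned if it did not fit.
 */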
static bool
write_ir_text(VkPipelineExecutableInternalRepresentationKHR* ir,
const char *data)
{
ir->isText = VK_TRUE;
size_t data_len = strlen(data) + 1;
if (ir->pData == NULL) {
ir->dataSize = data_len;
return true;
}
strncpy(ir->pData, data, ir->dataSize);
if (ir->dataSize < data_len)
return false;
ir->dataSize = data_len;
return true;
}
static VkResult
nvk_shader_get_executable_internal_representations(
UNUSED struct vk_device *device,
const struct vk_shader *vk_shader,
uint32_t executable_index,
uint32_t *internal_representation_count,
VkPipelineExecutableInternalRepresentationKHR *internal_representations)
{
struct nvk_shader *shader = container_of(vk_shader, struct nvk_shader, vk);
VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableInternalRepresentationKHR, out,
internal_representations,
internal_representation_count);
bool incomplete_text = false;
assert(executable_index == 0);
if (shader->nak != NULL && shader->nak->asm_str != NULL) {
vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) {
WRITE_STR(ir->name, "NAK assembly");
WRITE_STR(ir->description, "NAK assembly");
if (!write_ir_text(ir, shader->nak->asm_str))
incomplete_text = true;
}
}
return incomplete_text ? VK_INCOMPLETE : vk_outarray_status(&out);
}
static const struct vk_shader_ops nvk_shader_ops = {
.destroy = nvk_shader_destroy,
.serialize = nvk_shader_serialize,
.get_executable_properties = nvk_shader_get_executable_properties,
.get_executable_statistics = nvk_shader_get_executable_statistics,
.get_executable_internal_representations =
nvk_shader_get_executable_internal_representations,
};
const struct vk_device_shader_ops nvk_device_shader_ops = {
.get_nir_options = nvk_get_nir_options,
.get_spirv_options = nvk_get_spirv_options,
.preprocess_nir = nvk_preprocess_nir,
.hash_graphics_state = nvk_hash_graphics_state,
.compile = nvk_compile_shaders,
.deserialize = nvk_deserialize_shader,
.cmd_set_dynamic_graphics_state = vk_cmd_set_dynamic_graphics_state,
.cmd_bind_shaders = nvk_cmd_bind_shaders,
};