/*
 * Copyright © 2022 Imagination Technologies Ltd.
 *
 * based in part on v3dv driver which is:
 * Copyright © 2019 Raspberry Pi
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <vulkan/vulkan.h>

#include "compiler/shader_enums.h"
#include "hwdef/rogue_hw_utils.h"
#include "nir/nir.h"
#include "pvr_bo.h"
#include "pvr_csb.h"
#include "pvr_csb_enum_helpers.h"
#include "pvr_hardcode.h"
#include "pvr_pds.h"
#include "pvr_private.h"
#include "pvr_shader.h"
#include "pvr_types.h"
#include "rogue/rogue.h"
#include "rogue/rogue_build_data.h"
#include "util/log.h"
#include "util/macros.h"
#include "util/ralloc.h"
#include "util/u_math.h"
#include "vk_alloc.h"
#include "vk_log.h"
#include "vk_object.h"
#include "vk_util.h"

/*****************************************************************************
   PDS functions
*****************************************************************************/

/* If allocator == NULL, the internal one will be used. */
static VkResult pvr_pds_coeff_program_create_and_upload(
   struct pvr_device *device,
   const VkAllocationCallbacks *allocator,
   const uint32_t *fpu_iterators,
   uint32_t fpu_iterators_count,
   const uint32_t *destinations,
   struct pvr_pds_upload *const pds_upload_out)
{
   struct pvr_pds_coeff_loading_program program = {
      .num_fpu_iterators = fpu_iterators_count,
   };
   uint32_t staging_buffer_size;
   uint32_t *staging_buffer;
   VkResult result;

   assert(fpu_iterators_count < PVR_MAXIMUM_ITERATIONS);

   /* Get the size of the program and then allocate that much memory. */
   pvr_pds_coefficient_loading(&program, NULL, PDS_GENERATE_SIZES);

   staging_buffer_size =
      (program.code_size + program.data_size) * sizeof(*staging_buffer);

   staging_buffer = vk_alloc2(&device->vk.alloc,
                              allocator,
                              staging_buffer_size,
                              8,
                              VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!staging_buffer)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   /* FIXME: Should we save pointers when we redesign the pds gen api ? */
   typed_memcpy(program.FPU_iterators,
                fpu_iterators,
                program.num_fpu_iterators);
   typed_memcpy(program.destination, destinations, program.num_fpu_iterators);

   /* Generate the program into the staging_buffer. */
   pvr_pds_coefficient_loading(&program,
                               staging_buffer,
                               PDS_GENERATE_CODEDATA_SEGMENTS);

   /* FIXME: Figure out the define for alignment of 16.
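    * The three literal 16s passed to pvr_gpu_upload_pds() below look like the
    * data segment alignment, the code segment alignment and (presumably) the
    * minimum alignment of the combined allocation; a named hardware define
    * would make the requirement explicit.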
*/ result = pvr_gpu_upload_pds(device, &staging_buffer[0], program.data_size, 16, &staging_buffer[program.data_size], program.code_size, 16, 16, pds_upload_out); if (result != VK_SUCCESS) { vk_free2(&device->vk.alloc, allocator, staging_buffer); return result; } vk_free2(&device->vk.alloc, allocator, staging_buffer); return VK_SUCCESS; } /* FIXME: move this elsewhere since it's also called in pvr_pass.c? */ /* If allocator == NULL, the internal one will be used. */ VkResult pvr_pds_fragment_program_create_and_upload( struct pvr_device *device, const VkAllocationCallbacks *allocator, const struct pvr_bo *fragment_shader_bo, uint32_t fragment_temp_count, enum rogue_msaa_mode msaa_mode, bool has_phase_rate_change, struct pvr_pds_upload *const pds_upload_out) { const enum PVRX(PDSINST_DOUTU_SAMPLE_RATE) sample_rate = pvr_pdsinst_doutu_sample_rate_from_rogue(msaa_mode); struct pvr_pds_kickusc_program program = { 0 }; uint32_t staging_buffer_size; uint32_t *staging_buffer; VkResult result; /* FIXME: Should it be passing in the USC offset rather than address here? */ /* Note this is not strictly required to be done before calculating the * staging_buffer_size in this particular case. It can also be done after * allocating the buffer. The size from pvr_pds_kick_usc() is constant. */ pvr_pds_setup_doutu(&program.usc_task_control, fragment_shader_bo->vma->dev_addr.addr, fragment_temp_count, sample_rate, has_phase_rate_change); pvr_pds_kick_usc(&program, NULL, 0, false, PDS_GENERATE_SIZES); staging_buffer_size = (program.code_size + program.data_size) * sizeof(*staging_buffer); staging_buffer = vk_alloc2(&device->vk.alloc, allocator, staging_buffer_size, 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); if (!staging_buffer) return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); pvr_pds_kick_usc(&program, staging_buffer, 0, false, PDS_GENERATE_CODEDATA_SEGMENTS); /* FIXME: Figure out the define for alignment of 16. */ result = pvr_gpu_upload_pds(device, &staging_buffer[0], program.data_size, 16, &staging_buffer[program.data_size], program.code_size, 16, 16, pds_upload_out); if (result != VK_SUCCESS) { vk_free2(&device->vk.alloc, allocator, staging_buffer); return result; } vk_free2(&device->vk.alloc, allocator, staging_buffer); return VK_SUCCESS; } static inline size_t pvr_pds_get_max_vertex_program_const_map_size_in_bytes( const struct pvr_device_info *dev_info, bool robust_buffer_access) { /* FIXME: Use more local variable to improve formatting. */ /* Maximum memory allocation needed for const map entries in * pvr_pds_generate_vertex_primary_program(). * When robustBufferAccess is disabled, it must be >= 410. * When robustBufferAccess is enabled, it must be >= 570. * * 1. Size of entry for base instance * (pvr_const_map_entry_base_instance) * * 2. Max. number of vertex inputs (PVR_MAX_VERTEX_INPUT_BINDINGS) * ( * if (!robustBufferAccess) * size of vertex attribute entry * (pvr_const_map_entry_vertex_attribute_address) + * else * size of robust vertex attribute entry * (pvr_const_map_entry_robust_vertex_attribute_address) + * size of entry for max attribute index * (pvr_const_map_entry_vertex_attribute_max_index) + * fi * size of Unified Store burst entry * (pvr_const_map_entry_literal32) + * size of entry for vertex stride * (pvr_const_map_entry_literal32) + * size of entries for DDMAD control word * (num_ddmad_literals * pvr_const_map_entry_literal32)) * * 3. Size of entry for DOUTW vertex/instance control word * (pvr_const_map_entry_literal32) * * 4. 
Size of DOUTU entry (pvr_const_map_entry_doutu_address) */ const size_t attribute_size = (!robust_buffer_access) ? sizeof(struct pvr_const_map_entry_vertex_attribute_address) : sizeof(struct pvr_const_map_entry_robust_vertex_attribute_address) + sizeof(struct pvr_const_map_entry_vertex_attribute_max_index); /* If has_pds_ddmadt the DDMAD control word is now a DDMADT control word * and is increased by one DWORD to contain the data for the DDMADT's * out-of-bounds check. */ const size_t pvr_pds_const_map_vertex_entry_num_ddmad_literals = 1U + (size_t)PVR_HAS_FEATURE(dev_info, pds_ddmadt); return (sizeof(struct pvr_const_map_entry_base_instance) + PVR_MAX_VERTEX_INPUT_BINDINGS * (attribute_size + (2 + pvr_pds_const_map_vertex_entry_num_ddmad_literals) * sizeof(struct pvr_const_map_entry_literal32)) + sizeof(struct pvr_const_map_entry_literal32) + sizeof(struct pvr_const_map_entry_doutu_address)); } /* This is a const pointer to an array of pvr_pds_vertex_dma structs. * The array being pointed to is of PVR_MAX_VERTEX_ATTRIB_DMAS size. */ typedef struct pvr_pds_vertex_dma ( *const pvr_pds_attrib_dma_descriptions_array_ptr)[PVR_MAX_VERTEX_ATTRIB_DMAS]; /* dma_descriptions_out_ptr is a pointer to the array used as output. * The whole array might not be filled so dma_count_out indicates how many * elements were used. */ static void pvr_pds_vertex_attrib_init_dma_descriptions( const VkPipelineVertexInputStateCreateInfo *const vertex_input_state, const struct rogue_vs_build_data *vs_data, pvr_pds_attrib_dma_descriptions_array_ptr dma_descriptions_out_ptr, uint32_t *const dma_count_out) { struct pvr_pds_vertex_dma *const dma_descriptions = *dma_descriptions_out_ptr; uint32_t dma_count = 0; if (!vertex_input_state) { *dma_count_out = 0; return; } for (uint32_t i = 0; i < vertex_input_state->vertexAttributeDescriptionCount; i++) { const VkVertexInputAttributeDescription *const attrib_desc = &vertex_input_state->pVertexAttributeDescriptions[i]; const VkVertexInputBindingDescription *binding_desc = NULL; /* Finding the matching binding description. */ for (uint32_t j = 0; j < vertex_input_state->vertexBindingDescriptionCount; j++) { const VkVertexInputBindingDescription *const current_binding_desc = &vertex_input_state->pVertexBindingDescriptions[j]; if (current_binding_desc->binding == attrib_desc->binding) { binding_desc = current_binding_desc; break; } } /* From the Vulkan 1.2.195 spec for * VkPipelineVertexInputStateCreateInfo: * * "For every binding specified by each element of * pVertexAttributeDescriptions, a * VkVertexInputBindingDescription must exist in * pVertexBindingDescriptions with the same value of binding" * * So we don't check if we found the matching binding description * or not. */ struct pvr_pds_vertex_dma *const dma_desc = &dma_descriptions[dma_count]; size_t location = attrib_desc->location; assert(location < vs_data->inputs.num_input_vars); dma_desc->offset = attrib_desc->offset; dma_desc->stride = binding_desc->stride; dma_desc->flags = 0; if (binding_desc->inputRate == VK_VERTEX_INPUT_RATE_INSTANCE) dma_desc->flags |= PVR_PDS_VERTEX_DMA_FLAGS_INSTANCE_RATE; dma_desc->size_in_dwords = vs_data->inputs.components[location]; /* TODO: This will be different when other types are supported. * Store in vs_data with base and components? */ /* TODO: Use attrib_desc->format. 
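    * For now every vertex input component is treated as a full 32-bit
    * register (ROGUE_REG_SIZE_BYTES) and the component count comes from the
    * compiler's input info rather than from the Vulkan format, so packed or
    * narrower formats are not handled yet.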
*/ dma_desc->component_size_in_bytes = ROGUE_REG_SIZE_BYTES; dma_desc->destination = vs_data->inputs.base[location]; dma_desc->binding_index = attrib_desc->binding; dma_desc->divisor = 1; dma_desc->robustness_buffer_offset = 0; ++dma_count; } *dma_count_out = dma_count; } static VkResult pvr_pds_vertex_attrib_program_create_and_upload( struct pvr_device *const device, const VkAllocationCallbacks *const allocator, struct pvr_pds_vertex_primary_program_input *const input, struct pvr_pds_attrib_program *const program_out) { const size_t const_entries_size_in_bytes = pvr_pds_get_max_vertex_program_const_map_size_in_bytes( &device->pdevice->dev_info, device->features.robustBufferAccess); struct pvr_pds_upload *const program = &program_out->program; struct pvr_pds_info *const info = &program_out->info; struct pvr_const_map_entry *entries_buffer; ASSERTED uint32_t code_size_in_dwords; size_t staging_buffer_size; uint32_t *staging_buffer; VkResult result; memset(info, 0, sizeof(*info)); entries_buffer = vk_alloc2(&device->vk.alloc, allocator, const_entries_size_in_bytes, 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (!entries_buffer) return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); info->entries = entries_buffer; info->entries_size_in_bytes = const_entries_size_in_bytes; pvr_pds_generate_vertex_primary_program(input, NULL, info, device->features.robustBufferAccess, &device->pdevice->dev_info); code_size_in_dwords = info->code_size_in_dwords; staging_buffer_size = info->code_size_in_dwords * sizeof(*staging_buffer); staging_buffer = vk_alloc2(&device->vk.alloc, allocator, staging_buffer_size, 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); if (!staging_buffer) { vk_free2(&device->vk.alloc, allocator, entries_buffer); return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); } /* This also fills in info->entries. */ pvr_pds_generate_vertex_primary_program(input, staging_buffer, info, device->features.robustBufferAccess, &device->pdevice->dev_info); assert(info->code_size_in_dwords <= code_size_in_dwords); /* FIXME: Add a vk_realloc2() ? */ entries_buffer = vk_realloc((!allocator) ? &device->vk.alloc : allocator, entries_buffer, info->entries_written_size_in_bytes, 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (!entries_buffer) { vk_free2(&device->vk.alloc, allocator, staging_buffer); return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); } info->entries = entries_buffer; info->entries_size_in_bytes = info->entries_written_size_in_bytes; /* FIXME: Figure out the define for alignment of 16. */ result = pvr_gpu_upload_pds(device, NULL, 0, 0, staging_buffer, info->code_size_in_dwords, 16, 16, program); if (result != VK_SUCCESS) { vk_free2(&device->vk.alloc, allocator, entries_buffer); vk_free2(&device->vk.alloc, allocator, staging_buffer); return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); } vk_free2(&device->vk.alloc, allocator, staging_buffer); return VK_SUCCESS; } static inline void pvr_pds_vertex_attrib_program_destroy( struct pvr_device *const device, const struct VkAllocationCallbacks *const allocator, struct pvr_pds_attrib_program *const program) { pvr_bo_free(device, program->program.pvr_bo); vk_free2(&device->vk.alloc, allocator, program->info.entries); } /* This is a const pointer to an array of pvr_pds_attrib_program structs. * The array being pointed to is of PVR_PDS_VERTEX_ATTRIB_PROGRAM_COUNT size. 
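 * Passing a pointer to the whole array (rather than letting it decay to a
 * plain element pointer) keeps the element count in the type, so callees can
 * still use ARRAY_SIZE() on the dereferenced parameter.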
*/ typedef struct pvr_pds_attrib_program (*const pvr_pds_attrib_programs_array_ptr) [PVR_PDS_VERTEX_ATTRIB_PROGRAM_COUNT]; /* Generate and uploads a PDS program for DMAing vertex attribs into USC vertex * inputs. This will bake the code segment and create a template of the data * segment for the command buffer to fill in. */ /* If allocator == NULL, the internal one will be used. * * programs_out_ptr is a pointer to the array where the outputs will be placed. * */ static VkResult pvr_pds_vertex_attrib_programs_create_and_upload( struct pvr_device *device, const VkAllocationCallbacks *const allocator, const VkPipelineVertexInputStateCreateInfo *const vertex_input_state, uint32_t usc_temp_count, const struct rogue_vs_build_data *vs_data, pvr_pds_attrib_programs_array_ptr programs_out_ptr) { struct pvr_pds_vertex_dma dma_descriptions[PVR_MAX_VERTEX_ATTRIB_DMAS]; struct pvr_pds_attrib_program *const programs_out = *programs_out_ptr; struct pvr_pds_vertex_primary_program_input input = { .dma_list = dma_descriptions, }; VkResult result; pvr_pds_vertex_attrib_init_dma_descriptions(vertex_input_state, vs_data, &dma_descriptions, &input.dma_count); pvr_pds_setup_doutu(&input.usc_task_control, 0, usc_temp_count, PVRX(PDSINST_DOUTU_SAMPLE_RATE_INSTANCE), false); /* TODO: If statements for all the "bRequired"s + ui32ExtraFlags. */ /* Note: programs_out_ptr is a pointer to an array so this is fine. See the * typedef. */ for (uint32_t i = 0; i < ARRAY_SIZE(*programs_out_ptr); i++) { switch (i) { case PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASIC: input.flags = 0; break; case PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASE_INSTANCE: input.flags = PVR_PDS_VERTEX_FLAGS_BASE_INSTANCE_VARIANT; break; case PVR_PDS_VERTEX_ATTRIB_PROGRAM_DRAW_INDIRECT: /* We unset INSTANCE and set INDIRECT. */ input.flags = PVR_PDS_VERTEX_FLAGS_DRAW_INDIRECT_VARIANT; break; default: unreachable("Invalid vertex attrib program type."); } result = pvr_pds_vertex_attrib_program_create_and_upload(device, allocator, &input, &programs_out[i]); if (result != VK_SUCCESS) { for (uint32_t j = 0; j < i; j++) { pvr_pds_vertex_attrib_program_destroy(device, allocator, &programs_out[j]); } return result; } } return VK_SUCCESS; } static size_t pvr_pds_get_max_descriptor_upload_const_map_size_in_bytes() { /* Maximum memory allocation needed for const map entries in * pvr_pds_generate_descriptor_upload_program(). * It must be >= 688 bytes. This size is calculated as the sum of: * * 1. Max. number of descriptor sets (8) * ( * size of descriptor entry * (pvr_const_map_entry_descriptor_set) + * size of Common Store burst entry * (pvr_const_map_entry_literal32)) * * 2. Max. number of PDS program buffers (24) * ( * size of the largest buffer structure * (pvr_const_map_entry_constant_buffer) + * size of Common Store burst entry * (pvr_const_map_entry_literal32) * * 3. Size of DOUTU entry (pvr_const_map_entry_doutu_address) */ /* FIXME: PVR_MAX_DESCRIPTOR_SETS is 4 and not 8. The comment above seems to * say that it should be 8. * Figure our a define for this or is the comment wrong? */ return (8 * (sizeof(struct pvr_const_map_entry_descriptor_set) + sizeof(struct pvr_const_map_entry_literal32)) + PVR_PDS_MAX_BUFFERS * (sizeof(struct pvr_const_map_entry_constant_buffer) + sizeof(struct pvr_const_map_entry_literal32)) + sizeof(struct pvr_const_map_entry_doutu_address)); } /* This is a const pointer to an array of PVR_PDS_MAX_BUFFERS pvr_pds_buffer * structs. 
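 * Same pointer-to-array idiom as the attrib-program typedef above, so
 * pvr_pds_descriptor_program_setup_buffers() can bounds-check with
 * ARRAY_SIZE(*buffers_out_ptr).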
*/ typedef struct pvr_pds_buffer ( *const pvr_pds_descriptor_program_buffer_array_ptr)[PVR_PDS_MAX_BUFFERS]; /** * \brief Setup buffers for the PDS descriptor program. * * Sets up buffers required by the PDS gen api based on compiler info. * * For compile time static constants that need DMAing it uploads them and * returns the upload in \r static_consts_pvr_bo_out . */ static VkResult pvr_pds_descriptor_program_setup_buffers( struct pvr_device *device, bool robust_buffer_access, const struct rogue_compile_time_consts_data *compile_time_consts_data, const struct rogue_ubo_data *ubo_data, pvr_pds_descriptor_program_buffer_array_ptr buffers_out_ptr, uint32_t *const buffer_count_out, struct pvr_bo **const static_consts_pvr_bo_out) { struct pvr_pds_buffer *const buffers = *buffers_out_ptr; uint32_t buffer_count = 0; for (size_t i = 0; i < ubo_data->num_ubo_entries; i++) { struct pvr_pds_buffer *current_buffer = &buffers[buffer_count]; /* This is fine since buffers_out_ptr is a pointer to an array. */ assert(buffer_count < ARRAY_SIZE(*buffers_out_ptr)); current_buffer->type = PVR_BUFFER_TYPE_UBO; current_buffer->size_in_dwords = ubo_data->size[i]; current_buffer->destination = ubo_data->dest[i]; current_buffer->buffer_id = buffer_count; current_buffer->desc_set = ubo_data->desc_set[i]; current_buffer->binding = ubo_data->binding[i]; /* TODO: Is this always the case? * E.g. can multiple UBOs have the same base buffer? */ current_buffer->source_offset = 0; buffer_count++; } if (compile_time_consts_data->static_consts.num > 0) { VkResult result; assert(compile_time_consts_data->static_consts.num <= ARRAY_SIZE(compile_time_consts_data->static_consts.value)); /* This is fine since buffers_out_ptr is a pointer to an array. */ assert(buffer_count < ARRAY_SIZE(*buffers_out_ptr)); /* TODO: Is it possible to have multiple static consts buffer where the * destination is not adjoining? If so we need to handle that. * Currently we're only setting up a single buffer. 
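    * The single entry added below spans static_consts.num dwords starting at
    * static_consts.dest, and the constant values themselves are uploaded to
    * the general heap immediately afterwards.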
*/ buffers[buffer_count++] = (struct pvr_pds_buffer){ .type = PVR_BUFFER_TYPES_COMPILE_TIME, .size_in_dwords = compile_time_consts_data->static_consts.num, .destination = compile_time_consts_data->static_consts.dest, }; result = pvr_gpu_upload(device, device->heaps.general_heap, compile_time_consts_data->static_consts.value, compile_time_consts_data->static_consts.num * ROGUE_REG_SIZE_BYTES, ROGUE_REG_SIZE_BYTES, static_consts_pvr_bo_out); if (result != VK_SUCCESS) return result; } else { *static_consts_pvr_bo_out = NULL; } *buffer_count_out = buffer_count; return VK_SUCCESS; } static VkResult pvr_pds_descriptor_program_create_and_upload( struct pvr_device *const device, const VkAllocationCallbacks *const allocator, const struct rogue_compile_time_consts_data *const compile_time_consts_data, const struct rogue_ubo_data *const ubo_data, const struct pvr_explicit_constant_usage *const explicit_const_usage, const struct pvr_pipeline_layout *const layout, enum pvr_stage_allocation stage, struct pvr_stage_allocation_descriptor_state *const descriptor_state) { const size_t const_entries_size_in_bytes = pvr_pds_get_max_descriptor_upload_const_map_size_in_bytes(); struct pvr_pds_info *const pds_info = &descriptor_state->pds_info; struct pvr_descriptor_program_input program = { 0 }; struct pvr_const_map_entry *entries_buffer; ASSERTED uint32_t code_size_in_dwords; uint32_t staging_buffer_size; uint32_t *staging_buffer; VkResult result; assert(stage != PVR_STAGE_ALLOCATION_COUNT); *pds_info = (struct pvr_pds_info){ 0 }; result = pvr_pds_descriptor_program_setup_buffers( device, device->features.robustBufferAccess, compile_time_consts_data, ubo_data, &program.buffers, &program.buffer_count, &descriptor_state->static_consts); if (result != VK_SUCCESS) return result; if (layout->per_stage_reg_info[stage].primary_dynamic_size_in_dwords) assert(!"Unimplemented"); for (uint32_t set_num = 0; set_num < layout->set_count; set_num++) { const struct pvr_descriptor_set_layout_mem_layout *const reg_layout = &layout->register_layout_in_dwords_per_stage[stage][set_num]; const uint32_t start_offset = explicit_const_usage->start_offset; /* TODO: Use compiler usage info to optimize this? */ /* Only dma primaries if they are actually required. */ if (reg_layout->primary_size) { program.descriptor_sets[program.descriptor_set_count++] = (struct pvr_pds_descriptor_set){ .descriptor_set = set_num, .size_in_dwords = reg_layout->primary_size, .destination = reg_layout->primary_offset + start_offset, .primary = true, }; } /* Only dma secondaries if they are actually required. 
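    * Note that .primary is left at its zero-initialised (false) value in the
    * entry below, which is what marks it as a secondary entry.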
*/ if (!reg_layout->secondary_size) continue; program.descriptor_sets[program.descriptor_set_count++] = (struct pvr_pds_descriptor_set){ .descriptor_set = set_num, .size_in_dwords = reg_layout->secondary_size, .destination = reg_layout->secondary_offset + start_offset, }; } entries_buffer = vk_alloc2(&device->vk.alloc, allocator, const_entries_size_in_bytes, 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (!entries_buffer) { pvr_bo_free(device, descriptor_state->static_consts); return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); } pds_info->entries = entries_buffer; pds_info->entries_size_in_bytes = const_entries_size_in_bytes; pvr_pds_generate_descriptor_upload_program(&program, NULL, pds_info); code_size_in_dwords = pds_info->code_size_in_dwords; staging_buffer_size = pds_info->code_size_in_dwords * sizeof(*staging_buffer); if (!staging_buffer_size) { vk_free2(&device->vk.alloc, allocator, entries_buffer); *descriptor_state = (struct pvr_stage_allocation_descriptor_state){ 0 }; return VK_SUCCESS; } staging_buffer = vk_alloc2(&device->vk.alloc, allocator, staging_buffer_size, 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); if (!staging_buffer) { pvr_bo_free(device, descriptor_state->static_consts); vk_free2(&device->vk.alloc, allocator, entries_buffer); return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); } pvr_pds_generate_descriptor_upload_program(&program, staging_buffer, pds_info); assert(pds_info->code_size_in_dwords <= code_size_in_dwords); /* FIXME: use vk_realloc2() ? */ entries_buffer = vk_realloc((!allocator) ? &device->vk.alloc : allocator, entries_buffer, pds_info->entries_written_size_in_bytes, 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (!entries_buffer) { pvr_bo_free(device, descriptor_state->static_consts); vk_free2(&device->vk.alloc, allocator, staging_buffer); return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); } pds_info->entries = entries_buffer; pds_info->entries_size_in_bytes = pds_info->entries_written_size_in_bytes; /* FIXME: Figure out the define for alignment of 16. 
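    * Only the code segment is uploaded here (the data arguments are NULL/0);
    * the data segment is produced later from pds_info->entries, presumably
    * when the command buffer fills in the descriptor set addresses.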
*/ result = pvr_gpu_upload_pds(device, NULL, 0, 0, staging_buffer, pds_info->code_size_in_dwords, 16, 16, &descriptor_state->pds_code); if (result != VK_SUCCESS) { pvr_bo_free(device, descriptor_state->static_consts); vk_free2(&device->vk.alloc, allocator, entries_buffer); vk_free2(&device->vk.alloc, allocator, staging_buffer); return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); } vk_free2(&device->vk.alloc, allocator, staging_buffer); return VK_SUCCESS; } static void pvr_pds_descriptor_program_destroy( struct pvr_device *const device, const struct VkAllocationCallbacks *const allocator, struct pvr_stage_allocation_descriptor_state *const descriptor_state) { pvr_bo_free(device, descriptor_state->pds_code.pvr_bo); vk_free2(&device->vk.alloc, allocator, descriptor_state->pds_info.entries); pvr_bo_free(device, descriptor_state->static_consts); } static void pvr_pds_compute_program_setup( const struct pvr_device_info *dev_info, const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS], const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS], uint32_t barrier_coefficient, bool add_base_workgroup, uint32_t usc_temps, pvr_dev_addr_t usc_shader_dev_addr, struct pvr_pds_compute_shader_program *const program) { *program = (struct pvr_pds_compute_shader_program){ /* clang-format off */ .local_input_regs = { local_input_regs[0], local_input_regs[1], local_input_regs[2] }, .work_group_input_regs = { work_group_input_regs[0], work_group_input_regs[1], work_group_input_regs[2] }, .global_input_regs = { [0 ... (PVR_WORKGROUP_DIMENSIONS - 1)] = PVR_PDS_COMPUTE_INPUT_REG_UNUSED }, /* clang-format on */ .barrier_coefficient = barrier_coefficient, .flattened_work_groups = true, .clear_pds_barrier = false, .add_base_workgroup = add_base_workgroup, .kick_usc = true, }; STATIC_ASSERT(ARRAY_SIZE(program->local_input_regs) == PVR_WORKGROUP_DIMENSIONS); STATIC_ASSERT(ARRAY_SIZE(program->work_group_input_regs) == PVR_WORKGROUP_DIMENSIONS); STATIC_ASSERT(ARRAY_SIZE(program->global_input_regs) == PVR_WORKGROUP_DIMENSIONS); pvr_pds_setup_doutu(&program->usc_task_control, usc_shader_dev_addr.addr, usc_temps, PVRX(PDSINST_DOUTU_SAMPLE_RATE_INSTANCE), false); pvr_pds_compute_shader(program, NULL, PDS_GENERATE_SIZES, dev_info); } /* FIXME: See if pvr_device_init_compute_pds_program() and this could be merged. */ static VkResult pvr_pds_compute_program_create_and_upload( struct pvr_device *const device, const VkAllocationCallbacks *const allocator, const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS], const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS], uint32_t barrier_coefficient, uint32_t usc_temps, pvr_dev_addr_t usc_shader_dev_addr, struct pvr_pds_upload *const pds_upload_out, struct pvr_pds_info *const pds_info_out) { struct pvr_device_info *dev_info = &device->pdevice->dev_info; struct pvr_pds_compute_shader_program program; uint32_t staging_buffer_size; uint32_t *staging_buffer; VkResult result; pvr_pds_compute_program_setup(dev_info, local_input_regs, work_group_input_regs, barrier_coefficient, false, usc_temps, usc_shader_dev_addr, &program); /* FIXME: According to pvr_device_init_compute_pds_program() the code size * is in bytes. Investigate this. 
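    * Here code_size and data_size are used as dword counts, hence the
    * multiplication by sizeof(*staging_buffer) below.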
*/ staging_buffer_size = (program.code_size + program.data_size) * sizeof(*staging_buffer); staging_buffer = vk_alloc2(&device->vk.alloc, allocator, staging_buffer_size, 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); if (!staging_buffer) return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); /* FIXME: pvr_pds_compute_shader doesn't implement * PDS_GENERATE_CODEDATA_SEGMENTS. */ pvr_pds_compute_shader(&program, &staging_buffer[0], PDS_GENERATE_CODE_SEGMENT, dev_info); pvr_pds_compute_shader(&program, &staging_buffer[program.code_size], PDS_GENERATE_DATA_SEGMENT, dev_info); /* FIXME: Figure out the define for alignment of 16. */ result = pvr_gpu_upload_pds(device, &staging_buffer[program.code_size], program.data_size, 16, &staging_buffer[0], program.code_size, 16, 16, pds_upload_out); if (result != VK_SUCCESS) { vk_free2(&device->vk.alloc, allocator, staging_buffer); return result; } *pds_info_out = (struct pvr_pds_info){ .temps_required = program.highest_temp, .code_size_in_dwords = program.code_size, .data_size_in_dwords = program.data_size, }; vk_free2(&device->vk.alloc, allocator, staging_buffer); return VK_SUCCESS; }; static void pvr_pds_compute_program_destroy( struct pvr_device *const device, const struct VkAllocationCallbacks *const allocator, struct pvr_pds_upload *const pds_program, struct pvr_pds_info *const pds_info) { /* We don't allocate an entries buffer so we don't need to free it */ pvr_bo_free(device, pds_program->pvr_bo); } /* This only uploads the code segment. The data segment will need to be patched * with the base workgroup before uploading. */ static VkResult pvr_pds_compute_base_workgroup_variant_program_init( struct pvr_device *const device, const VkAllocationCallbacks *const allocator, const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS], const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS], uint32_t barrier_coefficient, uint32_t usc_temps, pvr_dev_addr_t usc_shader_dev_addr, struct pvr_pds_base_workgroup_program *program_out) { struct pvr_device_info *dev_info = &device->pdevice->dev_info; struct pvr_pds_compute_shader_program program; uint32_t buffer_size; uint32_t *buffer; VkResult result; pvr_pds_compute_program_setup(dev_info, local_input_regs, work_group_input_regs, barrier_coefficient, true, usc_temps, usc_shader_dev_addr, &program); /* FIXME: According to pvr_device_init_compute_pds_program() the code size * is in bytes. Investigate this. */ buffer_size = MAX2(program.code_size, program.data_size) * sizeof(*buffer); buffer = vk_alloc2(&device->vk.alloc, allocator, buffer_size, 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (!buffer) return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); pvr_pds_compute_shader(&program, &buffer[0], PDS_GENERATE_CODE_SEGMENT, dev_info); /* FIXME: Figure out the define for alignment of 16. */ result = pvr_gpu_upload_pds(device, NULL, 0, 0, buffer, program.code_size, 16, 16, &program_out->code_upload); if (result != VK_SUCCESS) { vk_free2(&device->vk.alloc, allocator, buffer); return result; } pvr_pds_compute_shader(&program, buffer, PDS_GENERATE_DATA_SEGMENT, dev_info); program_out->data_section = buffer; /* We'll need to patch the base workgroup in the PDS data section before * dispatch so we save the offsets at which to patch. We only need to save * the offset for the first workgroup id since the workgroup ids are stored * contiguously in the data segment. 
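    * i.e. the Y and Z workgroup id constants should sit at the next
    * consecutive dword offsets after the one saved below.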
*/ program_out->base_workgroup_data_patching_offset = program.base_workgroup_constant_offset_in_dwords[0]; program_out->info = (struct pvr_pds_info){ .temps_required = program.highest_temp, .code_size_in_dwords = program.code_size, .data_size_in_dwords = program.data_size, }; return VK_SUCCESS; } static void pvr_pds_compute_base_workgroup_variant_program_finish( struct pvr_device *device, const VkAllocationCallbacks *const allocator, struct pvr_pds_base_workgroup_program *const state) { pvr_bo_free(device, state->code_upload.pvr_bo); vk_free2(&device->vk.alloc, allocator, state->data_section); } /****************************************************************************** Generic pipeline functions ******************************************************************************/ static void pvr_pipeline_init(struct pvr_device *device, enum pvr_pipeline_type type, struct pvr_pipeline *const pipeline) { assert(!pipeline->layout); vk_object_base_init(&device->vk, &pipeline->base, VK_OBJECT_TYPE_PIPELINE); pipeline->type = type; } static void pvr_pipeline_finish(struct pvr_pipeline *pipeline) { vk_object_base_finish(&pipeline->base); } /****************************************************************************** Compute pipeline functions ******************************************************************************/ /* Compiles and uploads shaders and PDS programs. */ static VkResult pvr_compute_pipeline_compile( struct pvr_device *const device, struct pvr_pipeline_cache *pipeline_cache, const VkComputePipelineCreateInfo *pCreateInfo, const VkAllocationCallbacks *const allocator, struct pvr_compute_pipeline *const compute_pipeline) { struct rogue_compile_time_consts_data compile_time_consts_data; uint32_t work_group_input_regs[PVR_WORKGROUP_DIMENSIONS]; struct pvr_explicit_constant_usage explicit_const_usage; uint32_t local_input_regs[PVR_WORKGROUP_DIMENSIONS]; struct rogue_ubo_data ubo_data; uint32_t barrier_coefficient; uint32_t usc_temps; VkResult result; if (pvr_hard_code_shader_required(&device->pdevice->dev_info)) { struct pvr_hard_code_compute_build_info build_info; result = pvr_hard_code_compute_pipeline(device, &compute_pipeline->state.shader, &build_info); if (result != VK_SUCCESS) return result; ubo_data = build_info.ubo_data; /* We make sure that the compiler's unused reg value is compatible with * the pds api. */ STATIC_ASSERT(ROGUE_REG_UNUSED == PVR_PDS_COMPUTE_INPUT_REG_UNUSED); barrier_coefficient = build_info.barrier_reg; /* TODO: Maybe change the pds api to use pointers so we avoid the copy. */ local_input_regs[0] = build_info.local_invocation_regs[0]; local_input_regs[1] = build_info.local_invocation_regs[1]; /* This is not a mistake. We want to assign element 1 to 2. */ local_input_regs[2] = build_info.local_invocation_regs[1]; STATIC_ASSERT( __same_type(work_group_input_regs, build_info.work_group_regs)); typed_memcpy(work_group_input_regs, build_info.work_group_regs, PVR_WORKGROUP_DIMENSIONS); usc_temps = build_info.usc_temps; explicit_const_usage = build_info.explicit_conts_usage; } else { /* FIXME: Compile and upload the shader. */ /* FIXME: Initialize the shader state and setup build info. 
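    * Until the compiler path is hooked up, devices that don't use hard-coded
    * shaders cannot create compute pipelines, hence the abort() below.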
*/ abort(); }; result = pvr_pds_descriptor_program_create_and_upload( device, allocator, &compile_time_consts_data, &ubo_data, &explicit_const_usage, compute_pipeline->base.layout, PVR_STAGE_ALLOCATION_COMPUTE, &compute_pipeline->state.descriptor); if (result != VK_SUCCESS) goto err_free_shader; result = pvr_pds_compute_program_create_and_upload( device, allocator, local_input_regs, work_group_input_regs, barrier_coefficient, usc_temps, compute_pipeline->state.shader.bo->vma->dev_addr, &compute_pipeline->state.primary_program, &compute_pipeline->state.primary_program_info); if (result != VK_SUCCESS) goto err_free_descriptor_program; /* If the workgroup ID is required, then we require the base workgroup * variant of the PDS compute program as well. */ compute_pipeline->state.flags.base_workgroup = work_group_input_regs[0] != PVR_PDS_COMPUTE_INPUT_REG_UNUSED || work_group_input_regs[1] != PVR_PDS_COMPUTE_INPUT_REG_UNUSED || work_group_input_regs[2] != PVR_PDS_COMPUTE_INPUT_REG_UNUSED; if (compute_pipeline->state.flags.base_workgroup) { result = pvr_pds_compute_base_workgroup_variant_program_init( device, allocator, local_input_regs, work_group_input_regs, barrier_coefficient, usc_temps, compute_pipeline->state.shader.bo->vma->dev_addr, &compute_pipeline->state.primary_base_workgroup_variant_program); if (result != VK_SUCCESS) goto err_destroy_compute_program; } return VK_SUCCESS; err_destroy_compute_program: pvr_pds_compute_program_destroy( device, allocator, &compute_pipeline->state.primary_program, &compute_pipeline->state.primary_program_info); err_free_descriptor_program: pvr_bo_free(device, compute_pipeline->state.descriptor.pds_code.pvr_bo); err_free_shader: pvr_bo_free(device, compute_pipeline->state.shader.bo); return result; } static VkResult pvr_compute_pipeline_init(struct pvr_device *device, struct pvr_pipeline_cache *pipeline_cache, const VkComputePipelineCreateInfo *pCreateInfo, const VkAllocationCallbacks *allocator, struct pvr_compute_pipeline *compute_pipeline) { VkResult result; pvr_pipeline_init(device, PVR_PIPELINE_TYPE_COMPUTE, &compute_pipeline->base); compute_pipeline->base.layout = pvr_pipeline_layout_from_handle(pCreateInfo->layout); result = pvr_compute_pipeline_compile(device, pipeline_cache, pCreateInfo, allocator, compute_pipeline); if (result != VK_SUCCESS) { pvr_pipeline_finish(&compute_pipeline->base); return result; } return VK_SUCCESS; } static VkResult pvr_compute_pipeline_create(struct pvr_device *device, struct pvr_pipeline_cache *pipeline_cache, const VkComputePipelineCreateInfo *pCreateInfo, const VkAllocationCallbacks *allocator, VkPipeline *const pipeline_out) { struct pvr_compute_pipeline *compute_pipeline; VkResult result; compute_pipeline = vk_zalloc2(&device->vk.alloc, allocator, sizeof(*compute_pipeline), 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); if (!compute_pipeline) return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); /* Compiles and uploads shaders and PDS programs. 
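    * On failure pvr_compute_pipeline_init() has already cleaned up everything
    * it created, so only the pipeline allocation itself is freed below.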
*/ result = pvr_compute_pipeline_init(device, pipeline_cache, pCreateInfo, allocator, compute_pipeline); if (result != VK_SUCCESS) { vk_free2(&device->vk.alloc, allocator, compute_pipeline); return result; } *pipeline_out = pvr_pipeline_to_handle(&compute_pipeline->base); return VK_SUCCESS; } static void pvr_compute_pipeline_destroy( struct pvr_device *const device, const VkAllocationCallbacks *const allocator, struct pvr_compute_pipeline *const compute_pipeline) { if (compute_pipeline->state.flags.base_workgroup) { pvr_pds_compute_base_workgroup_variant_program_finish( device, allocator, &compute_pipeline->state.primary_base_workgroup_variant_program); } pvr_pds_compute_program_destroy( device, allocator, &compute_pipeline->state.primary_program, &compute_pipeline->state.primary_program_info); pvr_pds_descriptor_program_destroy(device, allocator, &compute_pipeline->state.descriptor); pvr_bo_free(device, compute_pipeline->state.shader.bo); pvr_pipeline_finish(&compute_pipeline->base); vk_free2(&device->vk.alloc, allocator, compute_pipeline); } VkResult pvr_CreateComputePipelines(VkDevice _device, VkPipelineCache pipelineCache, uint32_t createInfoCount, const VkComputePipelineCreateInfo *pCreateInfos, const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines) { PVR_FROM_HANDLE(pvr_pipeline_cache, pipeline_cache, pipelineCache); PVR_FROM_HANDLE(pvr_device, device, _device); VkResult result = VK_SUCCESS; for (uint32_t i = 0; i < createInfoCount; i++) { const VkResult local_result = pvr_compute_pipeline_create(device, pipeline_cache, &pCreateInfos[i], pAllocator, &pPipelines[i]); if (local_result != VK_SUCCESS) { result = local_result; pPipelines[i] = VK_NULL_HANDLE; } } return result; } /****************************************************************************** Graphics pipeline functions ******************************************************************************/ static inline uint32_t pvr_dynamic_state_bit_from_vk(VkDynamicState state) { switch (state) { case VK_DYNAMIC_STATE_VIEWPORT: return PVR_DYNAMIC_STATE_BIT_VIEWPORT; case VK_DYNAMIC_STATE_SCISSOR: return PVR_DYNAMIC_STATE_BIT_SCISSOR; case VK_DYNAMIC_STATE_LINE_WIDTH: return PVR_DYNAMIC_STATE_BIT_LINE_WIDTH; case VK_DYNAMIC_STATE_DEPTH_BIAS: return PVR_DYNAMIC_STATE_BIT_DEPTH_BIAS; case VK_DYNAMIC_STATE_BLEND_CONSTANTS: return PVR_DYNAMIC_STATE_BIT_BLEND_CONSTANTS; case VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK: return PVR_DYNAMIC_STATE_BIT_STENCIL_COMPARE_MASK; case VK_DYNAMIC_STATE_STENCIL_WRITE_MASK: return PVR_DYNAMIC_STATE_BIT_STENCIL_WRITE_MASK; case VK_DYNAMIC_STATE_STENCIL_REFERENCE: return PVR_DYNAMIC_STATE_BIT_STENCIL_REFERENCE; default: unreachable("Unsupported state."); } } static void pvr_graphics_pipeline_destroy(struct pvr_device *const device, const VkAllocationCallbacks *const allocator, struct pvr_graphics_pipeline *const gfx_pipeline) { const uint32_t num_vertex_attrib_programs = ARRAY_SIZE(gfx_pipeline->vertex_shader_state.pds_attrib_programs); pvr_pds_descriptor_program_destroy( device, allocator, &gfx_pipeline->fragment_shader_state.descriptor_state); pvr_pds_descriptor_program_destroy( device, allocator, &gfx_pipeline->vertex_shader_state.descriptor_state); for (uint32_t i = 0; i < num_vertex_attrib_programs; i++) { struct pvr_pds_attrib_program *const attrib_program = &gfx_pipeline->vertex_shader_state.pds_attrib_programs[i]; pvr_pds_vertex_attrib_program_destroy(device, allocator, attrib_program); } pvr_bo_free(device, gfx_pipeline->fragment_shader_state.pds_fragment_program.pvr_bo); 
pvr_bo_free(device, gfx_pipeline->fragment_shader_state.pds_coeff_program.pvr_bo); pvr_bo_free(device, gfx_pipeline->fragment_shader_state.bo); pvr_bo_free(device, gfx_pipeline->vertex_shader_state.bo); pvr_pipeline_finish(&gfx_pipeline->base); vk_free2(&device->vk.alloc, allocator, gfx_pipeline); } static void pvr_vertex_state_init(struct pvr_graphics_pipeline *gfx_pipeline, const struct rogue_common_build_data *common_data, const struct rogue_vs_build_data *vs_data) { struct pvr_vertex_shader_state *vertex_state = &gfx_pipeline->vertex_shader_state; /* TODO: Hard coding these for now. These should be populated based on the * information returned by the compiler. */ vertex_state->stage_state.const_shared_reg_count = common_data->shareds; vertex_state->stage_state.const_shared_reg_offset = 0; vertex_state->stage_state.temps_count = common_data->temps; vertex_state->stage_state.coefficient_size = common_data->coeffs; vertex_state->stage_state.uses_atomic_ops = false; vertex_state->stage_state.uses_texture_rw = false; vertex_state->stage_state.uses_barrier = false; vertex_state->stage_state.has_side_effects = false; vertex_state->stage_state.empty_program = false; vertex_state->vertex_input_size = vs_data->num_vertex_input_regs; vertex_state->vertex_output_size = vs_data->num_vertex_outputs * ROGUE_REG_SIZE_BYTES; vertex_state->user_clip_planes_mask = 0; vertex_state->entry_offset = 0; /* TODO: The number of varyings should be checked against the fragment * shader inputs and assigned in the place where that happens. * There will also be an opportunity to cull unused fs inputs/vs outputs. */ pvr_csb_pack (&gfx_pipeline->vertex_shader_state.varying[0], TA_STATE_VARYING0, varying0) { varying0.f32_linear = vs_data->num_varyings; varying0.f32_flat = 0; varying0.f32_npc = 0; } pvr_csb_pack (&gfx_pipeline->vertex_shader_state.varying[1], TA_STATE_VARYING1, varying1) { varying1.f16_linear = 0; varying1.f16_flat = 0; varying1.f16_npc = 0; } } static void pvr_fragment_state_init(struct pvr_graphics_pipeline *gfx_pipeline, const struct rogue_common_build_data *common_data) { struct pvr_fragment_shader_state *fragment_state = &gfx_pipeline->fragment_shader_state; /* TODO: Hard coding these for now. These should be populated based on the * information returned by the compiler. */ fragment_state->stage_state.const_shared_reg_count = 0; fragment_state->stage_state.const_shared_reg_offset = 0; fragment_state->stage_state.temps_count = common_data->temps; fragment_state->stage_state.coefficient_size = common_data->coeffs; fragment_state->stage_state.uses_atomic_ops = false; fragment_state->stage_state.uses_texture_rw = false; fragment_state->stage_state.uses_barrier = false; fragment_state->stage_state.has_side_effects = false; fragment_state->stage_state.empty_program = false; fragment_state->pass_type = 0; fragment_state->entry_offset = 0; } /* Compiles and uploads shaders and PDS programs. */ static VkResult pvr_graphics_pipeline_compile(struct pvr_device *const device, struct pvr_pipeline_cache *pipeline_cache, const VkGraphicsPipelineCreateInfo *pCreateInfo, const VkAllocationCallbacks *const allocator, struct pvr_graphics_pipeline *const gfx_pipeline) { /* FIXME: Remove this hard coding. 
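    * The vertex stage currently reserves its explicit constants from offset
    * 16 and the fragment stage from offset 0; both values should eventually
    * come from the compiler.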
*/ struct pvr_explicit_constant_usage vert_explicit_const_usage = { .start_offset = 16, }; struct pvr_explicit_constant_usage frag_explicit_const_usage = { .start_offset = 0, }; static uint32_t hard_code_pipeline_n = 0; const VkPipelineVertexInputStateCreateInfo *const vertex_input_state = pCreateInfo->pVertexInputState; const uint32_t cache_line_size = rogue_get_slc_cache_line_size(&device->pdevice->dev_info); struct rogue_compiler *compiler = device->pdevice->compiler; struct rogue_build_ctx *ctx; VkResult result; /* Setup shared build context. */ ctx = rogue_create_build_context(compiler); if (!ctx) return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); /* NIR middle-end translation. */ for (gl_shader_stage stage = MESA_SHADER_FRAGMENT; stage > MESA_SHADER_NONE; stage--) { const VkPipelineShaderStageCreateInfo *create_info; size_t stage_index = gfx_pipeline->stage_indices[stage]; if (pvr_hard_code_shader_required(&device->pdevice->dev_info)) { if (pvr_hard_code_graphics_get_flags(&device->pdevice->dev_info) & BITFIELD_BIT(stage)) { continue; } } /* Skip unused/inactive stages. */ if (stage_index == ~0) continue; create_info = &pCreateInfo->pStages[stage_index]; /* SPIR-V to NIR. */ ctx->nir[stage] = pvr_spirv_to_nir(ctx, stage, create_info); if (!ctx->nir[stage]) { ralloc_free(ctx); return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); } } /* Pre-back-end analysis and optimization, driver data extraction. */ /* TODO: Analyze and cull unused I/O between stages. */ /* TODO: Allocate UBOs between stages; * pipeline->layout->set_{count,layout}. */ /* Back-end translation. */ for (gl_shader_stage stage = MESA_SHADER_FRAGMENT; stage > MESA_SHADER_NONE; stage--) { if (pvr_hard_code_shader_required(&device->pdevice->dev_info) && pvr_hard_code_graphics_get_flags(&device->pdevice->dev_info) & BITFIELD_BIT(stage)) { const struct pvr_device_info *const dev_info = &device->pdevice->dev_info; struct pvr_explicit_constant_usage *explicit_const_usage; switch (stage) { case MESA_SHADER_VERTEX: explicit_const_usage = &vert_explicit_const_usage; break; case MESA_SHADER_FRAGMENT: explicit_const_usage = &frag_explicit_const_usage; break; default: unreachable("Unsupported stage."); } pvr_hard_code_graphics_shader(dev_info, hard_code_pipeline_n, stage, &ctx->binary[stage]); pvr_hard_code_graphics_get_build_info(dev_info, hard_code_pipeline_n, stage, &ctx->common_data[stage], &ctx->stage_data, explicit_const_usage); continue; } if (!ctx->nir[stage]) continue; ctx->rogue[stage] = pvr_nir_to_rogue(ctx, ctx->nir[stage]); if (!ctx->rogue[stage]) { ralloc_free(ctx); return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); } ctx->binary[stage] = pvr_rogue_to_binary(ctx, ctx->rogue[stage]); if (!ctx->binary[stage]) { ralloc_free(ctx); return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); } } if (pvr_hard_code_shader_required(&device->pdevice->dev_info) && pvr_hard_code_graphics_get_flags(&device->pdevice->dev_info) & BITFIELD_BIT(MESA_SHADER_VERTEX)) { pvr_hard_code_graphics_vertex_state(&device->pdevice->dev_info, hard_code_pipeline_n, &gfx_pipeline->vertex_shader_state); } else { pvr_vertex_state_init(gfx_pipeline, &ctx->common_data[MESA_SHADER_VERTEX], &ctx->stage_data.vs); } result = pvr_gpu_upload_usc(device, ctx->binary[MESA_SHADER_VERTEX]->data, ctx->binary[MESA_SHADER_VERTEX]->size, cache_line_size, &gfx_pipeline->vertex_shader_state.bo); if (result != VK_SUCCESS) goto err_free_build_context; if (pvr_hard_code_shader_required(&device->pdevice->dev_info) && 
pvr_hard_code_graphics_get_flags(&device->pdevice->dev_info) & BITFIELD_BIT(MESA_SHADER_FRAGMENT)) { pvr_hard_code_graphics_fragment_state( &device->pdevice->dev_info, hard_code_pipeline_n, &gfx_pipeline->fragment_shader_state); } else { pvr_fragment_state_init(gfx_pipeline, &ctx->common_data[MESA_SHADER_FRAGMENT]); } result = pvr_gpu_upload_usc(device, ctx->binary[MESA_SHADER_FRAGMENT]->data, ctx->binary[MESA_SHADER_FRAGMENT]->size, cache_line_size, &gfx_pipeline->fragment_shader_state.bo); if (result != VK_SUCCESS) goto err_free_vertex_bo; /* TODO: powervr has an optimization where it attempts to recompile shaders. * See PipelineCompileNoISPFeedbackFragmentStage. Unimplemented since in our * case the optimization doesn't happen. */ /* TODO: The programs we use are hard coded for now, but these should be * selected dynamically. */ result = pvr_pds_coeff_program_create_and_upload( device, allocator, ctx->stage_data.fs.iterator_args.fpu_iterators, ctx->stage_data.fs.iterator_args.num_fpu_iterators, ctx->stage_data.fs.iterator_args.destination, &gfx_pipeline->fragment_shader_state.pds_coeff_program); if (result != VK_SUCCESS) goto err_free_fragment_bo; result = pvr_pds_fragment_program_create_and_upload( device, allocator, gfx_pipeline->fragment_shader_state.bo, ctx->common_data[MESA_SHADER_FRAGMENT].temps, ctx->stage_data.fs.msaa_mode, ctx->stage_data.fs.phas, &gfx_pipeline->fragment_shader_state.pds_fragment_program); if (result != VK_SUCCESS) goto err_free_coeff_program; result = pvr_pds_vertex_attrib_programs_create_and_upload( device, allocator, vertex_input_state, ctx->common_data[MESA_SHADER_VERTEX].temps, &ctx->stage_data.vs, &gfx_pipeline->vertex_shader_state.pds_attrib_programs); if (result != VK_SUCCESS) goto err_free_frag_program; result = pvr_pds_descriptor_program_create_and_upload( device, allocator, &ctx->common_data[MESA_SHADER_VERTEX].compile_time_consts_data, &ctx->common_data[MESA_SHADER_VERTEX].ubo_data, &vert_explicit_const_usage, gfx_pipeline->base.layout, PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY, &gfx_pipeline->vertex_shader_state.descriptor_state); if (result != VK_SUCCESS) goto err_free_vertex_attrib_program; /* FIXME: When the temp_buffer_total_size is non-zero we need to allocate a * scratch buffer for both vertex and fragment stage. * Figure out the best place to do this. */ /* assert(pvr_pds_descriptor_program_variables.temp_buff_total_size == 0); */ /* TODO: Implement spilling with the above. */ /* TODO: Call pvr_pds_program_program_create_and_upload in a loop. */ /* FIXME: For now we pass in the same explicit_const_usage since it contains * all invalid entries. Fix this by hooking it up to the compiler. 
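    * The call below is the fragment-stage counterpart of the vertex
    * descriptor program created above, using the fragment stage's
    * compile-time consts and UBO data with PVR_STAGE_ALLOCATION_FRAGMENT.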
*/ result = pvr_pds_descriptor_program_create_and_upload( device, allocator, &ctx->common_data[MESA_SHADER_FRAGMENT].compile_time_consts_data, &ctx->common_data[MESA_SHADER_FRAGMENT].ubo_data, &frag_explicit_const_usage, gfx_pipeline->base.layout, PVR_STAGE_ALLOCATION_FRAGMENT, &gfx_pipeline->fragment_shader_state.descriptor_state); if (result != VK_SUCCESS) goto err_free_vertex_descriptor_program; ralloc_free(ctx); hard_code_pipeline_n++; return VK_SUCCESS; err_free_vertex_descriptor_program: pvr_pds_descriptor_program_destroy( device, allocator, &gfx_pipeline->vertex_shader_state.descriptor_state); err_free_vertex_attrib_program: for (uint32_t i = 0; i < ARRAY_SIZE(gfx_pipeline->vertex_shader_state.pds_attrib_programs); i++) { struct pvr_pds_attrib_program *const attrib_program = &gfx_pipeline->vertex_shader_state.pds_attrib_programs[i]; pvr_pds_vertex_attrib_program_destroy(device, allocator, attrib_program); } err_free_frag_program: pvr_bo_free(device, gfx_pipeline->fragment_shader_state.pds_fragment_program.pvr_bo); err_free_coeff_program: pvr_bo_free(device, gfx_pipeline->fragment_shader_state.pds_coeff_program.pvr_bo); err_free_fragment_bo: pvr_bo_free(device, gfx_pipeline->fragment_shader_state.bo); err_free_vertex_bo: pvr_bo_free(device, gfx_pipeline->vertex_shader_state.bo); err_free_build_context: ralloc_free(ctx); return result; } static void pvr_graphics_pipeline_init_depth_and_stencil_state( struct pvr_graphics_pipeline *gfx_pipeline, const VkPipelineDepthStencilStateCreateInfo *depth_stencil_state) { const VkStencilOpState *front; const VkStencilOpState *back; if (!depth_stencil_state) return; front = &depth_stencil_state->front; back = &depth_stencil_state->back; if (depth_stencil_state->depthTestEnable) { gfx_pipeline->depth_compare_op = depth_stencil_state->depthCompareOp; gfx_pipeline->depth_write_disable = !depth_stencil_state->depthWriteEnable; } else { gfx_pipeline->depth_compare_op = VK_COMPARE_OP_ALWAYS; gfx_pipeline->depth_write_disable = true; } if (depth_stencil_state->stencilTestEnable) { gfx_pipeline->stencil_front.compare_op = front->compareOp; gfx_pipeline->stencil_front.fail_op = front->failOp; gfx_pipeline->stencil_front.depth_fail_op = front->depthFailOp; gfx_pipeline->stencil_front.pass_op = front->passOp; gfx_pipeline->stencil_back.compare_op = back->compareOp; gfx_pipeline->stencil_back.fail_op = back->failOp; gfx_pipeline->stencil_back.depth_fail_op = back->depthFailOp; gfx_pipeline->stencil_back.pass_op = back->passOp; } else { gfx_pipeline->stencil_front.compare_op = VK_COMPARE_OP_ALWAYS; gfx_pipeline->stencil_front.fail_op = VK_STENCIL_OP_KEEP; gfx_pipeline->stencil_front.depth_fail_op = VK_STENCIL_OP_KEEP; gfx_pipeline->stencil_front.pass_op = VK_STENCIL_OP_KEEP; gfx_pipeline->stencil_back = gfx_pipeline->stencil_front; } } static void pvr_graphics_pipeline_init_dynamic_state( struct pvr_graphics_pipeline *gfx_pipeline, const VkPipelineDynamicStateCreateInfo *dynamic_state, const VkPipelineViewportStateCreateInfo *viewport_state, const VkPipelineDepthStencilStateCreateInfo *depth_stencil_state, const VkPipelineColorBlendStateCreateInfo *color_blend_state, const VkPipelineRasterizationStateCreateInfo *rasterization_state) { struct pvr_dynamic_state *const internal_dynamic_state = &gfx_pipeline->dynamic_state; uint32_t dynamic_states = 0; if (dynamic_state) { for (uint32_t i = 0; i < dynamic_state->dynamicStateCount; i++) { dynamic_states |= pvr_dynamic_state_bit_from_vk(dynamic_state->pDynamicStates[i]); } } /* TODO: Verify this. 
* We don't zero out the pipeline's state if they are dynamic since they * should be set later on in the command buffer. */ /* TODO: Handle rasterizerDiscardEnable. */ if (rasterization_state) { if (!(dynamic_states & PVR_DYNAMIC_STATE_BIT_LINE_WIDTH)) internal_dynamic_state->line_width = rasterization_state->lineWidth; /* TODO: Do we need the depthBiasEnable check? */ if (!(dynamic_states & PVR_DYNAMIC_STATE_BIT_DEPTH_BIAS)) { internal_dynamic_state->depth_bias.constant_factor = rasterization_state->depthBiasConstantFactor; internal_dynamic_state->depth_bias.clamp = rasterization_state->depthBiasClamp; internal_dynamic_state->depth_bias.slope_factor = rasterization_state->depthBiasSlopeFactor; } } /* TODO: handle viewport state flags. */ /* TODO: handle static viewport state. */ /* We assume the viewport state to by dynamic for now. */ /* TODO: handle static scissor state. */ /* We assume the scissor state to by dynamic for now. */ if (depth_stencil_state) { const VkStencilOpState *const front = &depth_stencil_state->front; const VkStencilOpState *const back = &depth_stencil_state->back; /* VkPhysicalDeviceFeatures->depthBounds is false. */ assert(depth_stencil_state->depthBoundsTestEnable == VK_FALSE); if (!(dynamic_states & PVR_DYNAMIC_STATE_BIT_STENCIL_COMPARE_MASK)) { internal_dynamic_state->compare_mask.front = front->compareMask; internal_dynamic_state->compare_mask.back = back->compareMask; } if (!(dynamic_states & PVR_DYNAMIC_STATE_BIT_STENCIL_WRITE_MASK)) { internal_dynamic_state->write_mask.front = front->writeMask; internal_dynamic_state->write_mask.back = back->writeMask; } if (!(dynamic_states & PVR_DYNAMIC_STATE_BIT_STENCIL_REFERENCE)) { internal_dynamic_state->reference.front = front->reference; internal_dynamic_state->reference.back = back->reference; } } if (color_blend_state && !(dynamic_states & PVR_DYNAMIC_STATE_BIT_BLEND_CONSTANTS)) { STATIC_ASSERT(__same_type(internal_dynamic_state->blend_constants, color_blend_state->blendConstants)); typed_memcpy(internal_dynamic_state->blend_constants, color_blend_state->blendConstants, ARRAY_SIZE(internal_dynamic_state->blend_constants)); } /* TODO: handle STATIC_STATE_DEPTH_BOUNDS ? */ internal_dynamic_state->mask = dynamic_states; } static VkResult pvr_graphics_pipeline_init(struct pvr_device *device, struct pvr_pipeline_cache *pipeline_cache, const VkGraphicsPipelineCreateInfo *pCreateInfo, const VkAllocationCallbacks *allocator, struct pvr_graphics_pipeline *gfx_pipeline) { /* If rasterization is not enabled, various CreateInfo structs must be * ignored. */ const bool raster_discard_enabled = pCreateInfo->pRasterizationState->rasterizerDiscardEnable; const VkPipelineViewportStateCreateInfo *vs_info = !raster_discard_enabled ? pCreateInfo->pViewportState : NULL; const VkPipelineDepthStencilStateCreateInfo *dss_info = !raster_discard_enabled ? pCreateInfo->pDepthStencilState : NULL; const VkPipelineRasterizationStateCreateInfo *rs_info = !raster_discard_enabled ? pCreateInfo->pRasterizationState : NULL; const VkPipelineColorBlendStateCreateInfo *cbs_info = !raster_discard_enabled ? pCreateInfo->pColorBlendState : NULL; const VkPipelineMultisampleStateCreateInfo *ms_info = !raster_discard_enabled ? 
pCreateInfo->pMultisampleState : NULL; VkResult result; pvr_pipeline_init(device, PVR_PIPELINE_TYPE_GRAPHICS, &gfx_pipeline->base); pvr_finishme("ignoring pCreateInfo flags."); pvr_finishme("ignoring pipeline cache."); gfx_pipeline->raster_state.discard_enable = raster_discard_enabled; gfx_pipeline->raster_state.cull_mode = pCreateInfo->pRasterizationState->cullMode; gfx_pipeline->raster_state.front_face = pCreateInfo->pRasterizationState->frontFace; gfx_pipeline->raster_state.depth_bias_enable = pCreateInfo->pRasterizationState->depthBiasEnable; gfx_pipeline->raster_state.depth_clamp_enable = pCreateInfo->pRasterizationState->depthClampEnable; /* FIXME: Handle depthClampEnable. */ pvr_graphics_pipeline_init_depth_and_stencil_state(gfx_pipeline, dss_info); pvr_graphics_pipeline_init_dynamic_state(gfx_pipeline, pCreateInfo->pDynamicState, vs_info, dss_info, cbs_info, rs_info); if (pCreateInfo->pInputAssemblyState) { gfx_pipeline->input_asm_state.topology = pCreateInfo->pInputAssemblyState->topology; gfx_pipeline->input_asm_state.primitive_restart = pCreateInfo->pInputAssemblyState->primitiveRestartEnable; } memset(gfx_pipeline->stage_indices, ~0, sizeof(gfx_pipeline->stage_indices)); for (uint32_t i = 0; i < pCreateInfo->stageCount; i++) { VkShaderStageFlagBits vk_stage = pCreateInfo->pStages[i].stage; gl_shader_stage gl_stage = vk_to_mesa_shader_stage(vk_stage); /* From the Vulkan 1.2.192 spec for VkPipelineShaderStageCreateInfo: * * "stage must not be VK_SHADER_STAGE_ALL_GRAPHICS, * or VK_SHADER_STAGE_ALL." * * So we don't handle that. * * We also don't handle VK_SHADER_STAGE_TESSELLATION_* and * VK_SHADER_STAGE_GEOMETRY_BIT stages as 'tessellationShader' and * 'geometryShader' are set to false in the VkPhysicalDeviceFeatures * structure returned by the driver. */ switch (pCreateInfo->pStages[i].stage) { case VK_SHADER_STAGE_VERTEX_BIT: case VK_SHADER_STAGE_FRAGMENT_BIT: gfx_pipeline->stage_indices[gl_stage] = i; break; default: unreachable("Unsupported stage."); } } gfx_pipeline->base.layout = pvr_pipeline_layout_from_handle(pCreateInfo->layout); if (ms_info) { gfx_pipeline->rasterization_samples = ms_info->rasterizationSamples; gfx_pipeline->sample_mask = (ms_info->pSampleMask) ? ms_info->pSampleMask[0] : 0xFFFFFFFF; } else { gfx_pipeline->rasterization_samples = VK_SAMPLE_COUNT_1_BIT; gfx_pipeline->sample_mask = 0xFFFFFFFF; } /* Compiles and uploads shaders and PDS programs. */ result = pvr_graphics_pipeline_compile(device, pipeline_cache, pCreateInfo, allocator, gfx_pipeline); if (result != VK_SUCCESS) { pvr_pipeline_finish(&gfx_pipeline->base); return result; } return VK_SUCCESS; } /* If allocator == NULL, the internal one will be used. */ static VkResult pvr_graphics_pipeline_create(struct pvr_device *device, struct pvr_pipeline_cache *pipeline_cache, const VkGraphicsPipelineCreateInfo *pCreateInfo, const VkAllocationCallbacks *allocator, VkPipeline *const pipeline_out) { struct pvr_graphics_pipeline *gfx_pipeline; VkResult result; gfx_pipeline = vk_zalloc2(&device->vk.alloc, allocator, sizeof(*gfx_pipeline), 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); if (!gfx_pipeline) return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); /* Compiles and uploads shaders and PDS programs too. 
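    * As with the compute path, a failed pvr_graphics_pipeline_init() has
    * already released whatever it allocated, so only the zalloc'd pipeline
    * object needs freeing below.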
*/ result = pvr_graphics_pipeline_init(device, pipeline_cache, pCreateInfo, allocator, gfx_pipeline); if (result != VK_SUCCESS) { vk_free2(&device->vk.alloc, allocator, gfx_pipeline); return result; } *pipeline_out = pvr_pipeline_to_handle(&gfx_pipeline->base); return VK_SUCCESS; } VkResult pvr_CreateGraphicsPipelines(VkDevice _device, VkPipelineCache pipelineCache, uint32_t createInfoCount, const VkGraphicsPipelineCreateInfo *pCreateInfos, const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines) { PVR_FROM_HANDLE(pvr_pipeline_cache, pipeline_cache, pipelineCache); PVR_FROM_HANDLE(pvr_device, device, _device); VkResult result = VK_SUCCESS; for (uint32_t i = 0; i < createInfoCount; i++) { const VkResult local_result = pvr_graphics_pipeline_create(device, pipeline_cache, &pCreateInfos[i], pAllocator, &pPipelines[i]); if (local_result != VK_SUCCESS) { result = local_result; pPipelines[i] = VK_NULL_HANDLE; } } return result; } /***************************************************************************** Other functions *****************************************************************************/ void pvr_DestroyPipeline(VkDevice _device, VkPipeline _pipeline, const VkAllocationCallbacks *pAllocator) { PVR_FROM_HANDLE(pvr_pipeline, pipeline, _pipeline); PVR_FROM_HANDLE(pvr_device, device, _device); if (!pipeline) return; switch (pipeline->type) { case PVR_PIPELINE_TYPE_GRAPHICS: { struct pvr_graphics_pipeline *const gfx_pipeline = to_pvr_graphics_pipeline(pipeline); pvr_graphics_pipeline_destroy(device, pAllocator, gfx_pipeline); break; } case PVR_PIPELINE_TYPE_COMPUTE: { struct pvr_compute_pipeline *const compute_pipeline = to_pvr_compute_pipeline(pipeline); pvr_compute_pipeline_destroy(device, pAllocator, compute_pipeline); break; } default: unreachable("Unknown pipeline type."); } }