From b92c40d40ad195039893edea36af3b85a5a3c4cd Mon Sep 17 00:00:00 2001 From: Karmjit Mahil Date: Fri, 24 Jun 2022 16:34:31 +0100 Subject: [PATCH] pvr: Add IDF/WDF program for compute pipeline barrier. Signed-off-by: Karmjit Mahil Reviewed-by: Rajnesh Kanwal Part-of: --- src/imagination/csbgen/rogue_texstate.xml | 6 + src/imagination/include/hwdef/rogue_hw_defs.h | 3 + src/imagination/vulkan/pvr_cmd_buffer.c | 2 +- src/imagination/vulkan/pvr_device.c | 296 +++++++++++++++++- src/imagination/vulkan/pvr_formats.c | 2 + src/imagination/vulkan/pvr_hardcode.c | 17 + src/imagination/vulkan/pvr_hardcode.h | 6 + src/imagination/vulkan/pvr_private.h | 14 + src/imagination/vulkan/pvr_tex_state.c | 1 + 9 files changed, 345 insertions(+), 2 deletions(-) diff --git a/src/imagination/csbgen/rogue_texstate.xml b/src/imagination/csbgen/rogue_texstate.xml index 8f2fc0f0fa3..537146de332 100644 --- a/src/imagination/csbgen/rogue_texstate.xml +++ b/src/imagination/csbgen/rogue_texstate.xml @@ -336,4 +336,10 @@ SOFTWARE. + + + + + + diff --git a/src/imagination/include/hwdef/rogue_hw_defs.h b/src/imagination/include/hwdef/rogue_hw_defs.h index 1ee1eb5ee34..e66f50537a0 100644 --- a/src/imagination/include/hwdef/rogue_hw_defs.h +++ b/src/imagination/include/hwdef/rogue_hw_defs.h @@ -90,6 +90,9 @@ /* Number of TEXSTATE_IMAGE_WORD values that need setting up. */ #define ROGUE_NUM_TEXSTATE_IMAGE_WORDS 2U +/* Number of TEXSTATE_SAMPLER state words that need setting up. */ +#define ROGUE_NUM_TEXSTATE_SAMPLER_WORDS 2U + #define ROGUE_MAX_RENDER_TARGETS 2048U /* 12 dwords reserved for shared register management. The first dword is the diff --git a/src/imagination/vulkan/pvr_cmd_buffer.c b/src/imagination/vulkan/pvr_cmd_buffer.c index 9e7d02d7882..8cc6e1db876 100644 --- a/src/imagination/vulkan/pvr_cmd_buffer.c +++ b/src/imagination/vulkan/pvr_cmd_buffer.c @@ -1094,7 +1094,7 @@ pvr_sub_cmd_compute_job_init(const struct pvr_physical_device *pdevice, value.border_colour_table_address = PVR_DEV_ADDR_INVALID; } - sub_cmd->num_shared_regs = MAX2(PVR_IDF_WDF_IN_REGISTER_CONST_COUNT, + sub_cmd->num_shared_regs = MAX2(cmd_buffer->device->idfwdf_state.usc_shareds, cmd_buffer->state.max_shared_regs); cmd_buffer->state.max_shared_regs = 0U; diff --git a/src/imagination/vulkan/pvr_device.c b/src/imagination/vulkan/pvr_device.c index 16407fff347..f74fd092000 100644 --- a/src/imagination/vulkan/pvr_device.c +++ b/src/imagination/vulkan/pvr_device.c @@ -39,16 +39,19 @@ #include #include "hwdef/rogue_hw_utils.h" +#include "pipe/p_defines.h" #include "pvr_bo.h" #include "pvr_csb.h" #include "pvr_csb_enum_helpers.h" #include "pvr_debug.h" #include "pvr_device_info.h" +#include "pvr_hardcode.h" #include "pvr_job_render.h" #include "pvr_limits.h" #include "pvr_nop_usc.h" #include "pvr_pds.h" #include "pvr_private.h" +#include "pvr_tex_state.h" #include "pvr_types.h" #include "pvr_winsys.h" #include "rogue/rogue_compiler.h" @@ -1177,6 +1180,289 @@ static VkResult pvr_device_init_compute_fence_program(struct pvr_device *device) return result; } +static VkResult pvr_pds_idfwdf_programs_create_and_upload( + struct pvr_device *device, + pvr_dev_addr_t usc_addr, + uint32_t shareds, + uint32_t temps, + pvr_dev_addr_t shareds_buffer_addr, + struct pvr_pds_upload *const upload_out, + struct pvr_pds_upload *const sw_compute_barrier_upload_out) +{ + const struct pvr_device_info *dev_info = &device->pdevice->dev_info; + struct pvr_pds_vertex_shader_sa_program program = { + .kick_usc = true, + .clear_pds_barrier = PVR_NEED_SW_COMPUTE_PDS_BARRIER(dev_info), + }; + size_t staging_buffer_size; + uint32_t *staging_buffer; + VkResult result; + + /* We'll need to DMA the shareds into the USC's Common Store. */ + program.num_dma_kicks = pvr_pds_encode_dma_burst(program.dma_control, + program.dma_address, + 0, + shareds, + shareds_buffer_addr.addr, + dev_info); + + /* DMA temp regs. */ + pvr_pds_setup_doutu(&program.usc_task_control, + usc_addr.addr, + temps, + PVRX(PDSINST_DOUTU_SAMPLE_RATE_INSTANCE), + false); + + pvr_pds_vertex_shader_sa(&program, NULL, PDS_GENERATE_SIZES, dev_info); + + staging_buffer_size = + (program.code_size + program.data_size) * sizeof(*staging_buffer); + + staging_buffer = vk_alloc(&device->vk.alloc, + staging_buffer_size, + 8, + VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); + if (!staging_buffer) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + /* FIXME: Add support for PDS_GENERATE_CODEDATA_SEGMENTS? */ + pvr_pds_vertex_shader_sa(&program, + staging_buffer, + PDS_GENERATE_DATA_SEGMENT, + dev_info); + pvr_pds_vertex_shader_sa(&program, + &staging_buffer[program.data_size], + PDS_GENERATE_CODE_SEGMENT, + dev_info); + + /* At the time of writing, the SW_COMPUTE_PDS_BARRIER variant of the program + * is bigger so we handle it first (if needed) and realloc() for a smaller + * size. + */ + if (PVR_NEED_SW_COMPUTE_PDS_BARRIER(dev_info)) { + /* FIXME: Figure out the define for alignment of 16. */ + result = pvr_gpu_upload_pds(device, + &staging_buffer[0], + program.data_size, + 16, + &staging_buffer[program.data_size], + program.code_size, + 16, + 16, + sw_compute_barrier_upload_out); + if (result != VK_SUCCESS) { + vk_free(&device->vk.alloc, staging_buffer); + return result; + } + + program.clear_pds_barrier = false; + + pvr_pds_vertex_shader_sa(&program, NULL, PDS_GENERATE_SIZES, dev_info); + + staging_buffer_size = + (program.code_size + program.data_size) * sizeof(*staging_buffer); + + staging_buffer = vk_realloc(&device->vk.alloc, + staging_buffer, + staging_buffer_size, + 8, + VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); + if (!staging_buffer) { + pvr_bo_free(device, sw_compute_barrier_upload_out->pvr_bo); + + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + } + + /* FIXME: Add support for PDS_GENERATE_CODEDATA_SEGMENTS? */ + pvr_pds_vertex_shader_sa(&program, + staging_buffer, + PDS_GENERATE_DATA_SEGMENT, + dev_info); + pvr_pds_vertex_shader_sa(&program, + &staging_buffer[program.data_size], + PDS_GENERATE_CODE_SEGMENT, + dev_info); + } else { + *sw_compute_barrier_upload_out = (struct pvr_pds_upload){ + .pvr_bo = NULL, + }; + } + + /* FIXME: Figure out the define for alignment of 16. */ + result = pvr_gpu_upload_pds(device, + &staging_buffer[0], + program.data_size, + 16, + &staging_buffer[program.data_size], + program.code_size, + 16, + 16, + upload_out); + if (result != VK_SUCCESS) { + vk_free(&device->vk.alloc, staging_buffer); + pvr_bo_free(device, sw_compute_barrier_upload_out->pvr_bo); + + return result; + } + + vk_free(&device->vk.alloc, staging_buffer); + + return VK_SUCCESS; +} + +static VkResult pvr_device_init_compute_idfwdf_state(struct pvr_device *device) +{ + uint64_t sampler_state[ROGUE_NUM_TEXSTATE_SAMPLER_WORDS]; + uint64_t image_state[ROGUE_NUM_TEXSTATE_IMAGE_WORDS]; + const struct rogue_shader_binary *usc_program; + struct pvr_texture_state_info tex_info; + uint32_t *dword_ptr; + uint32_t usc_shareds; + uint32_t usc_temps; + VkResult result; + + pvr_hard_code_get_idfwdf_program(&device->pdevice->dev_info, + &usc_program, + &usc_shareds, + &usc_temps); + + device->idfwdf_state.usc_shareds = usc_shareds; + + /* FIXME: Figure out the define for alignment of 16. */ + result = pvr_gpu_upload_usc(device, + usc_program->data, + usc_program->size, + 16, + &device->idfwdf_state.usc); + if (result != VK_SUCCESS) + return result; + + /* TODO: Get the store buffer size from the compiler? */ + /* TODO: How was the size derived here? */ + result = pvr_bo_alloc(device, + device->heaps.general_heap, + 4 * sizeof(float) * 4 * 2, + 4, + 0, + &device->idfwdf_state.store_bo); + if (result != VK_SUCCESS) + goto err_free_usc_program; + + result = pvr_bo_alloc(device, + device->heaps.general_heap, + usc_shareds * ROGUE_REG_SIZE_BYTES, + ROGUE_REG_SIZE_BYTES, + PVR_BO_ALLOC_FLAG_CPU_MAPPED, + &device->idfwdf_state.shareds_bo); + if (result != VK_SUCCESS) + goto err_free_store_buffer; + + /* Pack state words. */ + + pvr_csb_pack (&sampler_state[0], TEXSTATE_SAMPLER, sampler) { + sampler.dadjust = PVRX(TEXSTATE_DADJUST_ZERO_UINT); + sampler.magfilter = PVRX(TEXSTATE_FILTER_POINT); + sampler.addrmode_u = PVRX(TEXSTATE_ADDRMODE_CLAMP_TO_EDGE); + sampler.addrmode_v = PVRX(TEXSTATE_ADDRMODE_CLAMP_TO_EDGE); + } + + /* clang-format off */ + pvr_csb_pack (&sampler_state[1], TEXSTATE_SAMPLER_WORD1, sampler_word1) {} + /* clang-format on */ + + STATIC_ASSERT(1 + 1 == ROGUE_NUM_TEXSTATE_SAMPLER_WORDS); + + tex_info = (struct pvr_texture_state_info){ + .format = VK_FORMAT_R32G32B32A32_SFLOAT, + .mem_layout = PVR_MEMLAYOUT_LINEAR, + .flags = PVR_TEXFLAGS_INDEX_LOOKUP, + /* TODO: Is this correct? Is it 2D, 3D, or 2D_ARRAY? */ + .type = VK_IMAGE_VIEW_TYPE_2D, + .extent = { .width = 4, .height = 2, .depth = 0 }, + .mip_levels = 1, + .sample_count = 1, + .stride = 4, + .swizzle = { PIPE_SWIZZLE_X, + PIPE_SWIZZLE_Y, + PIPE_SWIZZLE_Z, + PIPE_SWIZZLE_W }, + .addr = device->idfwdf_state.store_bo->vma->dev_addr, + }; + + result = pvr_pack_tex_state(device, &tex_info, image_state); + if (result != VK_SUCCESS) + goto err_free_shareds_buffer; + + /* Fill the shareds buffer. */ + + dword_ptr = (uint32_t *)device->idfwdf_state.shareds_bo->bo->map; + +#define HIGH_32(val) ((uint32_t)((val) >> 32U)) +#define LOW_32(val) ((uint32_t)(val)) + + /* TODO: Should we use compiler info to setup the shareds data instead of + * assuming there's always 12 and this is how they should be setup? + */ + + dword_ptr[0] = HIGH_32(device->idfwdf_state.store_bo->vma->dev_addr.addr); + dword_ptr[1] = LOW_32(device->idfwdf_state.store_bo->vma->dev_addr.addr); + + /* Pad the shareds as the texture/sample state words are 128 bit aligned. */ + dword_ptr[2] = 0U; + dword_ptr[3] = 0U; + + dword_ptr[4] = LOW_32(image_state[0]); + dword_ptr[5] = HIGH_32(image_state[0]); + dword_ptr[6] = LOW_32(image_state[1]); + dword_ptr[7] = HIGH_32(image_state[1]); + + dword_ptr[8] = LOW_32(sampler_state[0]); + dword_ptr[9] = HIGH_32(sampler_state[0]); + dword_ptr[10] = LOW_32(sampler_state[1]); + dword_ptr[11] = HIGH_32(sampler_state[1]); + assert(11 + 1 == usc_shareds); + +#undef HIGH_32 +#undef LOW_32 + + pvr_bo_cpu_unmap(device, device->idfwdf_state.shareds_bo); + dword_ptr = NULL; + + /* Generate and upload PDS programs. */ + result = pvr_pds_idfwdf_programs_create_and_upload( + device, + device->idfwdf_state.usc->vma->dev_addr, + usc_shareds, + usc_temps, + device->idfwdf_state.shareds_bo->vma->dev_addr, + &device->idfwdf_state.pds, + &device->idfwdf_state.sw_compute_barrier_pds); + if (result != VK_SUCCESS) + goto err_free_shareds_buffer; + + return VK_SUCCESS; + +err_free_shareds_buffer: + pvr_bo_free(device, device->idfwdf_state.shareds_bo); + +err_free_store_buffer: + pvr_bo_free(device, device->idfwdf_state.store_bo); + +err_free_usc_program: + pvr_bo_free(device, device->idfwdf_state.usc); + + return result; +} + +static void pvr_device_finish_compute_idfwdf_state(struct pvr_device *device) +{ + pvr_bo_free(device, device->idfwdf_state.pds.pvr_bo); + pvr_bo_free(device, device->idfwdf_state.sw_compute_barrier_pds.pvr_bo); + pvr_bo_free(device, device->idfwdf_state.shareds_bo); + pvr_bo_free(device, device->idfwdf_state.store_bo); + pvr_bo_free(device, device->idfwdf_state.usc); +} + /* FIXME: We should be calculating the size when we upload the code in * pvr_srv_setup_static_pixel_event_program(). */ @@ -1358,10 +1644,14 @@ VkResult pvr_CreateDevice(VkPhysicalDevice physicalDevice, if (result != VK_SUCCESS) goto err_pvr_free_nop_program; - result = pvr_queues_create(device, pCreateInfo); + result = pvr_device_init_compute_idfwdf_state(device); if (result != VK_SUCCESS) goto err_pvr_free_compute_fence; + result = pvr_queues_create(device, pCreateInfo); + if (result != VK_SUCCESS) + goto err_pvr_finish_compute_idfwdf; + pvr_device_init_default_sampler_state(device); if (pCreateInfo->pEnabledFeatures) @@ -1384,6 +1674,9 @@ VkResult pvr_CreateDevice(VkPhysicalDevice physicalDevice, return VK_SUCCESS; +err_pvr_finish_compute_idfwdf: + pvr_device_finish_compute_idfwdf_state(device); + err_pvr_free_compute_fence: pvr_bo_free(device, device->pds_compute_fence_program.pvr_bo); @@ -1418,6 +1711,7 @@ void pvr_DestroyDevice(VkDevice _device, PVR_FROM_HANDLE(pvr_device, device, _device); pvr_queues_destroy(device); + pvr_device_finish_compute_idfwdf_state(device); pvr_bo_free(device, device->pds_compute_fence_program.pvr_bo); pvr_bo_free(device, device->nop_program.pds.pvr_bo); pvr_bo_free(device, device->nop_program.usc); diff --git a/src/imagination/vulkan/pvr_formats.c b/src/imagination/vulkan/pvr_formats.c index c7ae23116cb..c84985e508d 100644 --- a/src/imagination/vulkan/pvr_formats.c +++ b/src/imagination/vulkan/pvr_formats.c @@ -55,6 +55,8 @@ static const struct pvr_format pvr_format_table[] = { FORMAT(R32_UINT, U32, U32), /* VK_FORMAT_R32G32B32A32_UINT = 107. */ FORMAT(R32G32B32A32_UINT, U32U32U32U32, U32U32U32U32), + /* VK_FORMAT_R32G32B32A32_SFLOAT = 109. */ + FORMAT(R32G32B32A32_SFLOAT, F32F32F32F32, F32F32F32F32), /* VK_FORMAT_D32_SFLOAT = 126. */ FORMAT(D32_SFLOAT, F32, F32), }; diff --git a/src/imagination/vulkan/pvr_hardcode.c b/src/imagination/vulkan/pvr_hardcode.c index d9f651d0bd5..5296b7db259 100644 --- a/src/imagination/vulkan/pvr_hardcode.c +++ b/src/imagination/vulkan/pvr_hardcode.c @@ -332,3 +332,20 @@ void pvr_hard_code_graphics_get_build_info( unreachable("Unsupported stage."); } } + +void pvr_hard_code_get_idfwdf_program( + const struct pvr_device_info *const dev_info, + const struct rogue_shader_binary **const program_out, + uint32_t *usc_shareds_out, + uint32_t *usc_temps_out) +{ + static const struct rogue_shader_binary shader = { + .size = 8U, + .data = { 0, 0, 0, 0, 0, 0, 0, 0 } + }; + + mesa_loge("No hard coded idfwdf program. Returning empty program."); + *program_out = &shader; + *usc_shareds_out = 12U; + *usc_temps_out = 4U; +} diff --git a/src/imagination/vulkan/pvr_hardcode.h b/src/imagination/vulkan/pvr_hardcode.h index 0661426dc26..ea0fecb130b 100644 --- a/src/imagination/vulkan/pvr_hardcode.h +++ b/src/imagination/vulkan/pvr_hardcode.h @@ -119,4 +119,10 @@ void pvr_hard_code_graphics_get_build_info( struct rogue_build_data *const build_data, struct pvr_explicit_constant_usage *const explicit_const_usage); +void pvr_hard_code_get_idfwdf_program( + const struct pvr_device_info *const dev_info, + const struct rogue_shader_binary **const program_out, + uint32_t *usc_shareds_out, + uint32_t *usc_temps_out); + #endif /* PVR_HARDCODE_SHADERS_H */ diff --git a/src/imagination/vulkan/pvr_private.h b/src/imagination/vulkan/pvr_private.h index c514e697d26..466fa3d7b88 100644 --- a/src/imagination/vulkan/pvr_private.h +++ b/src/imagination/vulkan/pvr_private.h @@ -279,6 +279,20 @@ struct pvr_device { struct pvr_bo *usc; } nop_program; + /* Issue Data Fence, Wait for Data Fence state. */ + struct { + uint32_t usc_shareds; + struct pvr_bo *usc; + + /* Buffer in which the IDF/WDF program performs store ops. */ + struct pvr_bo *store_bo; + /* Contains the initialization values for the shared registers. */ + struct pvr_bo *shareds_bo; + + struct pvr_pds_upload pds; + struct pvr_pds_upload sw_compute_barrier_pds; + } idfwdf_state; + VkPhysicalDeviceFeatures features; }; diff --git a/src/imagination/vulkan/pvr_tex_state.c b/src/imagination/vulkan/pvr_tex_state.c index 2df77f564d7..f37a06349db 100644 --- a/src/imagination/vulkan/pvr_tex_state.c +++ b/src/imagination/vulkan/pvr_tex_state.c @@ -25,6 +25,7 @@ #include #include "hwdef/rogue_hw_defs.h" +#include "pipe/p_defines.h" #include "pvr_csb.h" #include "pvr_device_info.h" #include "pvr_formats.h"