pvr: Add IDF/WDF program for compute pipeline barrier.

Signed-off-by: Karmjit Mahil <Karmjit.Mahil@imgtec.com>
Reviewed-by: Rajnesh Kanwal <rajnesh.kanwal@imgtec.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17683>
This commit is contained in:
Karmjit Mahil 2022-06-24 16:34:31 +01:00 committed by Marge Bot
parent 6d672e0336
commit b92c40d40a
9 changed files with 345 additions and 2 deletions

View File

@ -336,4 +336,10 @@ SOFTWARE.
<field name="dadjust" start="0" end="12" type="DADJUST"/>
</struct>
<struct name="SAMPLER_WORD1" length="2">
<field name="cemedge_dontfilter" start="63" end="63" type="bool"/>
<field name="texaddr_plane3" start="24" end="61" shift="2" type="address"/>
<field name="texaddr_plane2_hi" start="0" end="23" shift="16" type="address"/>
</struct>
</csbgen>

View File

@ -90,6 +90,9 @@
/* Number of TEXSTATE_IMAGE_WORD values that need setting up. */
#define ROGUE_NUM_TEXSTATE_IMAGE_WORDS 2U
/* Number of TEXSTATE_SAMPLER state words that need setting up. */
#define ROGUE_NUM_TEXSTATE_SAMPLER_WORDS 2U
#define ROGUE_MAX_RENDER_TARGETS 2048U
/* 12 dwords reserved for shared register management. The first dword is the

View File

@ -1094,7 +1094,7 @@ pvr_sub_cmd_compute_job_init(const struct pvr_physical_device *pdevice,
value.border_colour_table_address = PVR_DEV_ADDR_INVALID;
}
sub_cmd->num_shared_regs = MAX2(PVR_IDF_WDF_IN_REGISTER_CONST_COUNT,
sub_cmd->num_shared_regs = MAX2(cmd_buffer->device->idfwdf_state.usc_shareds,
cmd_buffer->state.max_shared_regs);
cmd_buffer->state.max_shared_regs = 0U;

View File

@ -39,16 +39,19 @@
#include <xf86drm.h>
#include "hwdef/rogue_hw_utils.h"
#include "pipe/p_defines.h"
#include "pvr_bo.h"
#include "pvr_csb.h"
#include "pvr_csb_enum_helpers.h"
#include "pvr_debug.h"
#include "pvr_device_info.h"
#include "pvr_hardcode.h"
#include "pvr_job_render.h"
#include "pvr_limits.h"
#include "pvr_nop_usc.h"
#include "pvr_pds.h"
#include "pvr_private.h"
#include "pvr_tex_state.h"
#include "pvr_types.h"
#include "pvr_winsys.h"
#include "rogue/rogue_compiler.h"
@ -1177,6 +1180,289 @@ static VkResult pvr_device_init_compute_fence_program(struct pvr_device *device)
return result;
}
static VkResult pvr_pds_idfwdf_programs_create_and_upload(
struct pvr_device *device,
pvr_dev_addr_t usc_addr,
uint32_t shareds,
uint32_t temps,
pvr_dev_addr_t shareds_buffer_addr,
struct pvr_pds_upload *const upload_out,
struct pvr_pds_upload *const sw_compute_barrier_upload_out)
{
const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
struct pvr_pds_vertex_shader_sa_program program = {
.kick_usc = true,
.clear_pds_barrier = PVR_NEED_SW_COMPUTE_PDS_BARRIER(dev_info),
};
size_t staging_buffer_size;
uint32_t *staging_buffer;
VkResult result;
/* We'll need to DMA the shareds into the USC's Common Store. */
program.num_dma_kicks = pvr_pds_encode_dma_burst(program.dma_control,
program.dma_address,
0,
shareds,
shareds_buffer_addr.addr,
dev_info);
/* DMA temp regs. */
pvr_pds_setup_doutu(&program.usc_task_control,
usc_addr.addr,
temps,
PVRX(PDSINST_DOUTU_SAMPLE_RATE_INSTANCE),
false);
pvr_pds_vertex_shader_sa(&program, NULL, PDS_GENERATE_SIZES, dev_info);
staging_buffer_size =
(program.code_size + program.data_size) * sizeof(*staging_buffer);
staging_buffer = vk_alloc(&device->vk.alloc,
staging_buffer_size,
8,
VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
if (!staging_buffer)
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
/* FIXME: Add support for PDS_GENERATE_CODEDATA_SEGMENTS? */
pvr_pds_vertex_shader_sa(&program,
staging_buffer,
PDS_GENERATE_DATA_SEGMENT,
dev_info);
pvr_pds_vertex_shader_sa(&program,
&staging_buffer[program.data_size],
PDS_GENERATE_CODE_SEGMENT,
dev_info);
/* At the time of writing, the SW_COMPUTE_PDS_BARRIER variant of the program
* is bigger so we handle it first (if needed) and realloc() for a smaller
* size.
*/
if (PVR_NEED_SW_COMPUTE_PDS_BARRIER(dev_info)) {
/* FIXME: Figure out the define for alignment of 16. */
result = pvr_gpu_upload_pds(device,
&staging_buffer[0],
program.data_size,
16,
&staging_buffer[program.data_size],
program.code_size,
16,
16,
sw_compute_barrier_upload_out);
if (result != VK_SUCCESS) {
vk_free(&device->vk.alloc, staging_buffer);
return result;
}
program.clear_pds_barrier = false;
pvr_pds_vertex_shader_sa(&program, NULL, PDS_GENERATE_SIZES, dev_info);
staging_buffer_size =
(program.code_size + program.data_size) * sizeof(*staging_buffer);
staging_buffer = vk_realloc(&device->vk.alloc,
staging_buffer,
staging_buffer_size,
8,
VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
if (!staging_buffer) {
pvr_bo_free(device, sw_compute_barrier_upload_out->pvr_bo);
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
}
/* FIXME: Add support for PDS_GENERATE_CODEDATA_SEGMENTS? */
pvr_pds_vertex_shader_sa(&program,
staging_buffer,
PDS_GENERATE_DATA_SEGMENT,
dev_info);
pvr_pds_vertex_shader_sa(&program,
&staging_buffer[program.data_size],
PDS_GENERATE_CODE_SEGMENT,
dev_info);
} else {
*sw_compute_barrier_upload_out = (struct pvr_pds_upload){
.pvr_bo = NULL,
};
}
/* FIXME: Figure out the define for alignment of 16. */
result = pvr_gpu_upload_pds(device,
&staging_buffer[0],
program.data_size,
16,
&staging_buffer[program.data_size],
program.code_size,
16,
16,
upload_out);
if (result != VK_SUCCESS) {
vk_free(&device->vk.alloc, staging_buffer);
pvr_bo_free(device, sw_compute_barrier_upload_out->pvr_bo);
return result;
}
vk_free(&device->vk.alloc, staging_buffer);
return VK_SUCCESS;
}
static VkResult pvr_device_init_compute_idfwdf_state(struct pvr_device *device)
{
uint64_t sampler_state[ROGUE_NUM_TEXSTATE_SAMPLER_WORDS];
uint64_t image_state[ROGUE_NUM_TEXSTATE_IMAGE_WORDS];
const struct rogue_shader_binary *usc_program;
struct pvr_texture_state_info tex_info;
uint32_t *dword_ptr;
uint32_t usc_shareds;
uint32_t usc_temps;
VkResult result;
pvr_hard_code_get_idfwdf_program(&device->pdevice->dev_info,
&usc_program,
&usc_shareds,
&usc_temps);
device->idfwdf_state.usc_shareds = usc_shareds;
/* FIXME: Figure out the define for alignment of 16. */
result = pvr_gpu_upload_usc(device,
usc_program->data,
usc_program->size,
16,
&device->idfwdf_state.usc);
if (result != VK_SUCCESS)
return result;
/* TODO: Get the store buffer size from the compiler? */
/* TODO: How was the size derived here? */
result = pvr_bo_alloc(device,
device->heaps.general_heap,
4 * sizeof(float) * 4 * 2,
4,
0,
&device->idfwdf_state.store_bo);
if (result != VK_SUCCESS)
goto err_free_usc_program;
result = pvr_bo_alloc(device,
device->heaps.general_heap,
usc_shareds * ROGUE_REG_SIZE_BYTES,
ROGUE_REG_SIZE_BYTES,
PVR_BO_ALLOC_FLAG_CPU_MAPPED,
&device->idfwdf_state.shareds_bo);
if (result != VK_SUCCESS)
goto err_free_store_buffer;
/* Pack state words. */
pvr_csb_pack (&sampler_state[0], TEXSTATE_SAMPLER, sampler) {
sampler.dadjust = PVRX(TEXSTATE_DADJUST_ZERO_UINT);
sampler.magfilter = PVRX(TEXSTATE_FILTER_POINT);
sampler.addrmode_u = PVRX(TEXSTATE_ADDRMODE_CLAMP_TO_EDGE);
sampler.addrmode_v = PVRX(TEXSTATE_ADDRMODE_CLAMP_TO_EDGE);
}
/* clang-format off */
pvr_csb_pack (&sampler_state[1], TEXSTATE_SAMPLER_WORD1, sampler_word1) {}
/* clang-format on */
STATIC_ASSERT(1 + 1 == ROGUE_NUM_TEXSTATE_SAMPLER_WORDS);
tex_info = (struct pvr_texture_state_info){
.format = VK_FORMAT_R32G32B32A32_SFLOAT,
.mem_layout = PVR_MEMLAYOUT_LINEAR,
.flags = PVR_TEXFLAGS_INDEX_LOOKUP,
/* TODO: Is this correct? Is it 2D, 3D, or 2D_ARRAY? */
.type = VK_IMAGE_VIEW_TYPE_2D,
.extent = { .width = 4, .height = 2, .depth = 0 },
.mip_levels = 1,
.sample_count = 1,
.stride = 4,
.swizzle = { PIPE_SWIZZLE_X,
PIPE_SWIZZLE_Y,
PIPE_SWIZZLE_Z,
PIPE_SWIZZLE_W },
.addr = device->idfwdf_state.store_bo->vma->dev_addr,
};
result = pvr_pack_tex_state(device, &tex_info, image_state);
if (result != VK_SUCCESS)
goto err_free_shareds_buffer;
/* Fill the shareds buffer. */
dword_ptr = (uint32_t *)device->idfwdf_state.shareds_bo->bo->map;
#define HIGH_32(val) ((uint32_t)((val) >> 32U))
#define LOW_32(val) ((uint32_t)(val))
/* TODO: Should we use compiler info to setup the shareds data instead of
* assuming there's always 12 and this is how they should be setup?
*/
dword_ptr[0] = HIGH_32(device->idfwdf_state.store_bo->vma->dev_addr.addr);
dword_ptr[1] = LOW_32(device->idfwdf_state.store_bo->vma->dev_addr.addr);
/* Pad the shareds as the texture/sample state words are 128 bit aligned. */
dword_ptr[2] = 0U;
dword_ptr[3] = 0U;
dword_ptr[4] = LOW_32(image_state[0]);
dword_ptr[5] = HIGH_32(image_state[0]);
dword_ptr[6] = LOW_32(image_state[1]);
dword_ptr[7] = HIGH_32(image_state[1]);
dword_ptr[8] = LOW_32(sampler_state[0]);
dword_ptr[9] = HIGH_32(sampler_state[0]);
dword_ptr[10] = LOW_32(sampler_state[1]);
dword_ptr[11] = HIGH_32(sampler_state[1]);
assert(11 + 1 == usc_shareds);
#undef HIGH_32
#undef LOW_32
pvr_bo_cpu_unmap(device, device->idfwdf_state.shareds_bo);
dword_ptr = NULL;
/* Generate and upload PDS programs. */
result = pvr_pds_idfwdf_programs_create_and_upload(
device,
device->idfwdf_state.usc->vma->dev_addr,
usc_shareds,
usc_temps,
device->idfwdf_state.shareds_bo->vma->dev_addr,
&device->idfwdf_state.pds,
&device->idfwdf_state.sw_compute_barrier_pds);
if (result != VK_SUCCESS)
goto err_free_shareds_buffer;
return VK_SUCCESS;
err_free_shareds_buffer:
pvr_bo_free(device, device->idfwdf_state.shareds_bo);
err_free_store_buffer:
pvr_bo_free(device, device->idfwdf_state.store_bo);
err_free_usc_program:
pvr_bo_free(device, device->idfwdf_state.usc);
return result;
}
static void pvr_device_finish_compute_idfwdf_state(struct pvr_device *device)
{
pvr_bo_free(device, device->idfwdf_state.pds.pvr_bo);
pvr_bo_free(device, device->idfwdf_state.sw_compute_barrier_pds.pvr_bo);
pvr_bo_free(device, device->idfwdf_state.shareds_bo);
pvr_bo_free(device, device->idfwdf_state.store_bo);
pvr_bo_free(device, device->idfwdf_state.usc);
}
/* FIXME: We should be calculating the size when we upload the code in
* pvr_srv_setup_static_pixel_event_program().
*/
@ -1358,10 +1644,14 @@ VkResult pvr_CreateDevice(VkPhysicalDevice physicalDevice,
if (result != VK_SUCCESS)
goto err_pvr_free_nop_program;
result = pvr_queues_create(device, pCreateInfo);
result = pvr_device_init_compute_idfwdf_state(device);
if (result != VK_SUCCESS)
goto err_pvr_free_compute_fence;
result = pvr_queues_create(device, pCreateInfo);
if (result != VK_SUCCESS)
goto err_pvr_finish_compute_idfwdf;
pvr_device_init_default_sampler_state(device);
if (pCreateInfo->pEnabledFeatures)
@ -1384,6 +1674,9 @@ VkResult pvr_CreateDevice(VkPhysicalDevice physicalDevice,
return VK_SUCCESS;
err_pvr_finish_compute_idfwdf:
pvr_device_finish_compute_idfwdf_state(device);
err_pvr_free_compute_fence:
pvr_bo_free(device, device->pds_compute_fence_program.pvr_bo);
@ -1418,6 +1711,7 @@ void pvr_DestroyDevice(VkDevice _device,
PVR_FROM_HANDLE(pvr_device, device, _device);
pvr_queues_destroy(device);
pvr_device_finish_compute_idfwdf_state(device);
pvr_bo_free(device, device->pds_compute_fence_program.pvr_bo);
pvr_bo_free(device, device->nop_program.pds.pvr_bo);
pvr_bo_free(device, device->nop_program.usc);

View File

@ -55,6 +55,8 @@ static const struct pvr_format pvr_format_table[] = {
FORMAT(R32_UINT, U32, U32),
/* VK_FORMAT_R32G32B32A32_UINT = 107. */
FORMAT(R32G32B32A32_UINT, U32U32U32U32, U32U32U32U32),
/* VK_FORMAT_R32G32B32A32_SFLOAT = 109. */
FORMAT(R32G32B32A32_SFLOAT, F32F32F32F32, F32F32F32F32),
/* VK_FORMAT_D32_SFLOAT = 126. */
FORMAT(D32_SFLOAT, F32, F32),
};

View File

@ -332,3 +332,20 @@ void pvr_hard_code_graphics_get_build_info(
unreachable("Unsupported stage.");
}
}
void pvr_hard_code_get_idfwdf_program(
const struct pvr_device_info *const dev_info,
const struct rogue_shader_binary **const program_out,
uint32_t *usc_shareds_out,
uint32_t *usc_temps_out)
{
static const struct rogue_shader_binary shader = {
.size = 8U,
.data = { 0, 0, 0, 0, 0, 0, 0, 0 }
};
mesa_loge("No hard coded idfwdf program. Returning empty program.");
*program_out = &shader;
*usc_shareds_out = 12U;
*usc_temps_out = 4U;
}

View File

@ -119,4 +119,10 @@ void pvr_hard_code_graphics_get_build_info(
struct rogue_build_data *const build_data,
struct pvr_explicit_constant_usage *const explicit_const_usage);
void pvr_hard_code_get_idfwdf_program(
const struct pvr_device_info *const dev_info,
const struct rogue_shader_binary **const program_out,
uint32_t *usc_shareds_out,
uint32_t *usc_temps_out);
#endif /* PVR_HARDCODE_SHADERS_H */

View File

@ -279,6 +279,20 @@ struct pvr_device {
struct pvr_bo *usc;
} nop_program;
/* Issue Data Fence, Wait for Data Fence state. */
struct {
uint32_t usc_shareds;
struct pvr_bo *usc;
/* Buffer in which the IDF/WDF program performs store ops. */
struct pvr_bo *store_bo;
/* Contains the initialization values for the shared registers. */
struct pvr_bo *shareds_bo;
struct pvr_pds_upload pds;
struct pvr_pds_upload sw_compute_barrier_pds;
} idfwdf_state;
VkPhysicalDeviceFeatures features;
};

View File

@ -25,6 +25,7 @@
#include <vulkan/vulkan.h>
#include "hwdef/rogue_hw_defs.h"
#include "pipe/p_defines.h"
#include "pvr_csb.h"
#include "pvr_device_info.h"
#include "pvr_formats.h"