pvr: Add compute update shared kernel in vkCmdDispatch().

Signed-off-by: Karmjit Mahil <Karmjit.Mahil@imgtec.com>
Reviewed-by: Rajnesh Kanwal <rajnesh.kanwal@imgtec.com>
Reviewed-by: Frank Binns <frank.binns@imgtec.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16040>
This commit is contained in:
Karmjit Mahil 2022-02-14 14:13:19 +00:00 committed by Marge Bot
parent 7ccf9494b6
commit 5e9e3fe7f7
3 changed files with 75 additions and 16 deletions

View File

@ -67,7 +67,7 @@ struct pvr_compute_kernel_info {
uint32_t usc_unified_size;
uint32_t pds_temp_size;
uint32_t pds_data_size;
bool usc_target_any;
enum PVRX(CDMCTRL_USC_TARGET) usc_target;
bool is_fence;
uint32_t pds_data_offset;
uint32_t pds_code_offset;
@ -1134,10 +1134,11 @@ static void pvr_sub_cmd_compute_job_init(struct pvr_device *device,
#define PIXEL_ALLOCATION_SIZE_MAX_IN_BLOCKS \
(1024 / PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE))
static uint32_t pvr_compute_slot_size(const struct pvr_device_info *dev_info,
uint32_t coeff_regs_count,
bool use_barrier,
const uint32_t local_size[static 3U])
static uint32_t
pvr_compute_flat_slot_size(const struct pvr_device_info *dev_info,
uint32_t coeff_regs_count,
bool use_barrier,
uint32_t total_workitems)
{
uint32_t max_workgroups_per_task = ROGUE_CDM_MAX_PACKED_WORKGROUPS_PER_TASK;
uint32_t max_avail_coeff_regs =
@ -1145,7 +1146,6 @@ static uint32_t pvr_compute_slot_size(const struct pvr_device_info *dev_info,
uint32_t localstore_chunks_count =
DIV_ROUND_UP(coeff_regs_count << 2,
PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE));
uint32_t total_workitems = local_size[0U] * local_size[1U] * local_size[2U];
/* Ensure that we cannot have more workgroups in a slot than the available
* number of coefficients allow us to have.
@ -1242,12 +1242,7 @@ pvr_compute_generate_control_stream(struct pvr_csb *csb,
kernel0.usc_unified_size = info->usc_unified_size;
kernel0.pds_temp_size = info->pds_temp_size;
kernel0.pds_data_size = info->pds_data_size;
if (info->usc_target_any)
kernel0.usc_target = PVRX(CDMCTRL_USC_TARGET_ANY);
else
kernel0.usc_target = PVRX(CDMCTRL_USC_TARGET_ALL);
kernel0.usc_target = info->usc_target;
kernel0.fence = info->is_fence;
}
@ -1330,7 +1325,7 @@ static void pvr_compute_generate_fence(struct pvr_cmd_buffer *cmd_buffer,
.pds_data_size =
DIV_ROUND_UP(program->data_size << 2,
PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE)),
.usc_target_any = true,
.usc_target = PVRX(CDMCTRL_USC_TARGET_ANY),
.is_fence = true,
.pds_data_offset = program->data_offset,
.sd_type = PVRX(CDMCTRL_SD_TYPE_PDS),
@ -1344,8 +1339,7 @@ static void pvr_compute_generate_fence(struct pvr_cmd_buffer *cmd_buffer,
/* Here we calculate the slot size. This can depend on the use of barriers,
* local memory, BRN's or other factors.
*/
info.max_instances =
pvr_compute_slot_size(dev_info, 0U, false, info.local_size);
info.max_instances = pvr_compute_flat_slot_size(dev_info, 0U, false, 1U);
pvr_compute_generate_control_stream(csb, &info);
}
@ -2825,6 +2819,67 @@ static VkResult pvr_setup_descriptor_mappings(
#undef PVR_WRITE
static void pvr_compute_update_shared(struct pvr_cmd_buffer *cmd_buffer)
{
const struct pvr_device_info *dev_info =
&cmd_buffer->device->pdevice->dev_info;
struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
struct pvr_csb *csb = &state->current_sub_cmd->compute.control_stream;
const struct pvr_compute_pipeline *pipeline = state->compute_pipeline;
const uint32_t const_shared_reg_count =
pipeline->state.shader.const_shared_reg_count;
struct pvr_compute_kernel_info info;
/* No shared regs, no need to use an allocation kernel. */
if (!const_shared_reg_count)
return;
info = (struct pvr_compute_kernel_info){
.indirect_buffer_addr.addr = 0ULL,
.sd_type = PVRX(CDMCTRL_SD_TYPE_NONE),
.usc_target = PVRX(CDMCTRL_USC_TARGET_ALL),
.usc_common_shared = true,
.usc_common_size =
DIV_ROUND_UP(const_shared_reg_count,
PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE)),
.local_size = { 1, 1, 1 },
.global_size = { 1, 1, 1 },
};
/* Sometimes we don't have a secondary program if there were no constants to
* write, but we still need to run a PDS program to accomplish the
* allocation of the local/common store shared registers so we repurpose the
* deallocation PDS program.
*/
if (pipeline->state.uniform.pds_info.code_size_in_dwords) {
uint32_t pds_data_size_in_dwords =
pipeline->state.uniform.pds_info.data_size_in_dwords;
info.pds_data_offset = state->pds_compute_uniform_data_offset;
info.pds_data_size =
DIV_ROUND_UP(pds_data_size_in_dwords << 2U,
PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE));
/* Check that we have upload the code section. */
assert(pipeline->state.uniform.pds_code.code_size);
info.pds_code_offset = pipeline->state.uniform.pds_code.code_offset;
} else {
/* FIXME: There should be a deallocation pds program already uploaded
* that we use at this point.
*/
assert(!"Unimplemented");
}
/* We don't need to pad the workgroup size. */
info.max_instances =
pvr_compute_flat_slot_size(dev_info, const_shared_reg_count, false, 1U);
pvr_compute_generate_control_stream(csb, &info);
}
void pvr_CmdDispatch(VkCommandBuffer commandBuffer,
uint32_t groupCountX,
uint32_t groupCountY,
@ -2896,7 +2951,8 @@ void pvr_CmdDispatch(VkCommandBuffer commandBuffer,
return;
}
/* FIXME: Create shared update kernel and emit control stream. */
pvr_compute_update_shared(cmd_buffer);
/* FIXME: Create update kernel and emit control stream. */
}

View File

@ -961,6 +961,7 @@ static VkResult pvr_compute_pipeline_compile(
compute_pipeline->state.shader.uses_atomic_ops = false;
compute_pipeline->state.shader.uses_barrier = false;
compute_pipeline->state.shader.uses_num_workgroups = false;
compute_pipeline->state.shader.const_shared_reg_count = 4;
result = pvr_gpu_upload_usc(device,
pvr_usc_compute_shader,

View File

@ -1034,6 +1034,8 @@ struct pvr_compute_pipeline {
bool uses_barrier;
/* E.g. GLSL shader uses gl_NumWorkGroups. */
bool uses_num_workgroups;
uint32_t const_shared_reg_count;
} shader;
struct {