radv: detect command buffers that do no work and drop them (v2)

If a command buffer is just full of flushes, we already flush things
on command buffer submission, so there is no point submitting it.

This will reduce some CPU overhead on dota2, which submits a fair
few command streams that don't end up drawing anything.

v2: reorganise loop to count first then malloc,
rename some vars (Bas)

Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Signed-off-by: Dave Airlie <airlied@redhat.com>
Dave Airlie 2017-02-10 00:20:44 +00:00
parent d49d275c41
commit 8b47b97215
5 changed files with 27 additions and 8 deletions
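
As a rough standalone C sketch of the count-then-compact pattern the radv_QueueSubmit
hunks below apply (the struct, field and variable names here are invented for the
example; only the shape of the two-pass loop mirrors the actual change): first count
the command buffers that still do work, then allocate exactly that many CS pointers
and fill the array while skipping the no-op ones.

/* Hypothetical stand-ins for the driver's types, for illustration only. */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct fake_cmd_buffer {
	bool no_draws; /* stays true until a draw/dispatch/DMA is recorded */
	int cs;        /* stand-in for the winsys command stream handle */
};

int main(void)
{
	struct fake_cmd_buffer bufs[] = {
		{ .no_draws = true,  .cs = 0 }, /* only flushes: gets dropped */
		{ .no_draws = false, .cs = 1 },
		{ .no_draws = false, .cs = 2 },
	};
	const int count = sizeof(bufs) / sizeof(bufs[0]);

	/* Pass 1: count the buffers that actually do work. */
	int draw_count = 0;
	for (int j = 0; j < count; j++)
		if (!bufs[j].no_draws)
			draw_count++;

	/* Nothing to submit; the real code still submits an empty CS when
	 * semaphores need to be waited on or signalled. */
	if (!draw_count)
		return 0;

	/* Pass 2: allocate exactly draw_count entries and compact into them. */
	int *cs_array = malloc(sizeof(int) * draw_count);
	assert(cs_array);
	int idx = 0;
	for (int j = 0; j < count; j++) {
		if (bufs[j].no_draws)
			continue;
		cs_array[idx++] = bufs[j].cs;
	}

	for (int j = 0; j < idx; j++)
		printf("submitting cs %d\n", cs_array[j]);

	free(cs_array);
	return 0;
}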


@@ -1277,6 +1277,7 @@ radv_cmd_buffer_flush_state(struct radv_cmd_buffer *cmd_buffer)
 	MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws,
 							   cmd_buffer->cs, 4096);
 
+	cmd_buffer->no_draws = false;
 	if ((cmd_buffer->state.vertex_descriptors_dirty || cmd_buffer->state.vb_dirty) &&
 	    cmd_buffer->state.pipeline->num_vertex_attribs) {
 		unsigned vb_offset;
@@ -1592,6 +1593,7 @@ static void radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
 	cmd_buffer->record_fail = false;
 	cmd_buffer->ring_offsets_idx = -1;
+	cmd_buffer->no_draws = true;
 }
 
 VkResult radv_ResetCommandBuffer(
@@ -2423,6 +2425,7 @@ void radv_CmdDrawIndexedIndirectCountAMD(
 static void
 radv_flush_compute_state(struct radv_cmd_buffer *cmd_buffer)
 {
+	cmd_buffer->no_draws = false;
 	radv_emit_compute_pipeline(cmd_buffer);
 	radv_flush_descriptors(cmd_buffer, cmd_buffer->state.compute_pipeline,
 			       VK_SHADER_STAGE_COMPUTE_BIT);


@@ -1452,8 +1452,18 @@ VkResult radv_QueueSubmit(
 		struct radeon_winsys_cs **cs_array;
 		bool can_patch = true;
 		uint32_t advance;
+		int draw_cmd_buffers_count = 0;
 
-		if (!pSubmits[i].commandBufferCount) {
+		for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j++) {
+			RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer,
+					 pSubmits[i].pCommandBuffers[j]);
+			assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
+			if (cmd_buffer->no_draws == true)
+				continue;
+			draw_cmd_buffers_count++;
+		}
+
+		if (!draw_cmd_buffers_count) {
 			if (pSubmits[i].waitSemaphoreCount || pSubmits[i].signalSemaphoreCount) {
 				ret = queue->device->ws->cs_submit(ctx, queue->queue_idx,
 						&queue->device->empty_cs[queue->queue_family_index],
@@ -1472,24 +1482,27 @@ VkResult radv_QueueSubmit(
 			continue;
 		}
 
-		cs_array = malloc(sizeof(struct radeon_winsys_cs *) *
-				  pSubmits[i].commandBufferCount);
+		cs_array = malloc(sizeof(struct radeon_winsys_cs *) * draw_cmd_buffers_count);
 
+		int draw_cmd_buffer_idx = 0;
 		for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j++) {
 			RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer,
					 pSubmits[i].pCommandBuffers[j]);
 			assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
+			if (cmd_buffer->no_draws == true)
+				continue;
 
-			cs_array[j] = cmd_buffer->cs;
+			cs_array[draw_cmd_buffer_idx] = cmd_buffer->cs;
+			draw_cmd_buffer_idx++;
 			if ((cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT))
 				can_patch = false;
 		}
 
-		for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j += advance) {
+		for (uint32_t j = 0; j < draw_cmd_buffers_count; j += advance) {
 			advance = MIN2(max_cs_submission,
-				       pSubmits[i].commandBufferCount - j);
+				       draw_cmd_buffers_count - j);
 			bool b = j == 0;
-			bool e = j + advance == pSubmits[i].commandBufferCount;
+			bool e = j + advance == draw_cmd_buffers_count;
 
 			if (queue->device->trace_bo)
 				*queue->device->trace_id_ptr = 0;


@@ -523,6 +523,7 @@ void radv_CmdUpdateBuffer(
 	assert(!(dataSize & 3));
 	assert(!(va & 3));
 
+	cmd_buffer->no_draws = false;
 	if (dataSize < 4096) {
 		cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, dst_buffer->bo, 8);


@@ -750,6 +750,8 @@ struct radv_cmd_buffer {
 	uint32_t gsvs_ring_size_needed;
 	int ring_offsets_idx; /* just used for verification */
+
+	bool no_draws;
 };
 
 struct radv_image;


@@ -828,7 +828,7 @@ static void si_emit_cp_dma_clear_buffer(struct radv_cmd_buffer *cmd_buffer,
 static void si_cp_dma_prepare(struct radv_cmd_buffer *cmd_buffer, uint64_t byte_count,
 			      uint64_t remaining_size, unsigned *flags)
 {
+	cmd_buffer->no_draws = false;
 	/* Flush the caches for the first copy only.
 	 * Also wait for the previous CP DMA operations.
 	 */