radv: refactor draw dispatch

this breaks up the monolithic draw path used for all draw calls into
pre/post functions that handle the general setup and a couple of helper
functions that handle the different draw modes more directly, in a way
that's both more readable and, potentially, more optimizable

Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8788>
Mike Blumenkrantz, 2021-01-27 15:19:49 -05:00, committed by Marge Bot
parent d0ef28d85e
commit 75a2476dfb
1 changed file with 177 additions and 158 deletions

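For orientation before reading the diff: the new shape is a shared prologue that can early-out (radv_before_draw), one small per-mode packet emitter (radv_emit_direct_draw_packets, radv_emit_draw_packets_indexed, radv_emit_indirect_draw_packets), and a shared epilogue (radv_after_draw). The standalone model below illustrates only that control flow; draw_info, before_draw, emit_direct_draw, after_draw, and cmd_draw are simplified stand-ins, not the driver's real types or functions.

#include <stdbool.h>
#include <stdio.h>

struct draw_info {
   unsigned count;
   unsigned instance_count;
   bool indirect;
};

/* Shared prologue, done once per draw regardless of mode. */
static bool before_draw(const struct draw_info *info)
{
   /* mirror the early-outs radv_before_draw performs */
   if (info->instance_count == 0)
      return false;
   if (info->count == 0 && !info->indirect)
      return false;
   /* ...emit graphics state, vertex-offset/first-instance SGPRs,
    * and NUM_INSTANCES once, shared by every draw mode... */
   return true;
}

/* Mode-specific packet emission; stands in for
 * radv_emit_direct_draw_packets(). */
static void emit_direct_draw(const struct draw_info *info)
{
   printf("direct draw: %u vertices\n", info->count);
}

/* Shared epilogue. */
static void after_draw(void)
{
   /* ...start shader prefetches, apply the VGT streamout
    * workaround, flush as needed... */
}

/* Each entry point now composes the three stages instead of
 * funnelling through one monolithic radv_draw(). */
static void cmd_draw(unsigned vertex_count, unsigned instance_count)
{
   struct draw_info info = {
      .count = vertex_count,
      .instance_count = instance_count,
      .indirect = false,
   };
   if (!before_draw(&info))
      return;
   emit_direct_draw(&info);
   after_draw();
}

int main(void)
{
   cmd_draw(3, 1); /* emitted */
   cmd_draw(0, 1); /* skipped by before_draw() */
   return 0;
}

Keeping the per-mode emitters this small is also what lets the diff mark them ALWAYS_INLINE, which the added comments justify by performance on the drawoverhead microbenchmark.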
@@ -3069,11 +3069,6 @@ struct radv_draw_info {
*/
uint32_t count;
/**
* Index of the first vertex.
*/
int32_t vertex_offset;
/**
* First instance id.
*/
@@ -5262,12 +5257,11 @@ static void radv_emit_view_index(struct radv_cmd_buffer *cmd_buffer, unsigned in
static void
radv_cs_emit_draw_packet(struct radv_cmd_buffer *cmd_buffer,
uint32_t vertex_count,
bool use_opaque)
uint32_t use_opaque)
{
radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, cmd_buffer->state.predicating));
radeon_emit(cmd_buffer->cs, vertex_count);
radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX |
S_0287F0_USE_OPAQUE(use_opaque));
radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX | use_opaque);
}
/**
@@ -5291,7 +5285,8 @@ radv_cs_emit_draw_indexed_packet(struct radv_cmd_buffer *cmd_buffer,
radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_DMA);
}
static void
/* MUST inline this function to avoid massive perf loss in drawoverhead */
ALWAYS_INLINE static void
radv_cs_emit_indirect_draw_packet(struct radv_cmd_buffer *cmd_buffer,
bool indexed,
uint32_t draw_count,
@@ -5299,8 +5294,7 @@ radv_cs_emit_indirect_draw_packet(struct radv_cmd_buffer *cmd_buffer,
uint32_t stride)
{
struct radeon_cmdbuf *cs = cmd_buffer->cs;
unsigned di_src_sel = indexed ? V_0287F0_DI_SRC_SEL_DMA
: V_0287F0_DI_SRC_SEL_AUTO_INDEX;
const unsigned di_src_sel = indexed ? V_0287F0_DI_SRC_SEL_DMA : V_0287F0_DI_SRC_SEL_AUTO_INDEX;
bool draw_id_enable = radv_get_shader(cmd_buffer->state.pipeline, MESA_SHADER_VERTEX)->info.vs.needs_draw_id;
uint32_t base_reg = cmd_buffer->state.pipeline->graphics.vtx_base_sgpr;
bool predicating = cmd_buffer->state.predicating;
@@ -5336,124 +5330,107 @@ radv_cs_emit_indirect_draw_packet(struct radv_cmd_buffer *cmd_buffer,
}
}
static void
radv_emit_draw_packets(struct radv_cmd_buffer *cmd_buffer,
const struct radv_draw_info *info)
ALWAYS_INLINE static void
radv_emit_draw_packets_indexed(struct radv_cmd_buffer *cmd_buffer,
const struct radv_draw_info *info,
uint32_t count,
uint32_t first_index)
{
struct radv_cmd_state *state = &cmd_buffer->state;
struct radeon_winsys *ws = cmd_buffer->device->ws;
struct radeon_cmdbuf *cs = cmd_buffer->cs;
const struct radv_cmd_state *state = &cmd_buffer->state;
const int index_size = radv_get_vgt_index_size(state->index_type);
uint64_t index_va;
radv_describe_draw(cmd_buffer);
uint32_t remaining_indexes = cmd_buffer->state.max_index_count;
remaining_indexes = MAX2(remaining_indexes, info->first_index) - info->first_index;
if (info->indirect) {
uint64_t va = radv_buffer_get_va(info->indirect->bo);
uint64_t count_va = 0;
/* Skip draw calls with 0-sized index buffers if the GPU can't handle them */
if (!remaining_indexes &&
cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug)
return;
va += info->indirect->offset + info->indirect_offset;
index_va = state->index_va;
index_va += first_index * index_size;
radv_cs_add_buffer(ws, cs, info->indirect->bo);
radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0));
radeon_emit(cs, 1);
radeon_emit(cs, va);
radeon_emit(cs, va >> 32);
if (info->count_buffer) {
count_va = radv_buffer_get_va(info->count_buffer->bo);
count_va += info->count_buffer->offset +
info->count_buffer_offset;
radv_cs_add_buffer(ws, cs, info->count_buffer->bo);
}
if (!state->subpass->view_mask) {
radv_cs_emit_indirect_draw_packet(cmd_buffer,
info->indexed,
info->count,
count_va,
info->stride);
} else {
u_foreach_bit(i, state->subpass->view_mask) {
radv_emit_view_index(cmd_buffer, i);
radv_cs_emit_indirect_draw_packet(cmd_buffer,
info->indexed,
info->count,
count_va,
info->stride);
}
}
if (!state->subpass->view_mask) {
radv_cs_emit_draw_indexed_packet(cmd_buffer,
index_va,
remaining_indexes,
count);
} else {
assert(state->pipeline->graphics.vtx_base_sgpr);
u_foreach_bit(i, state->subpass->view_mask) {
radv_emit_view_index(cmd_buffer, i);
if (info->vertex_offset != state->last_vertex_offset ||
info->first_instance != state->last_first_instance) {
radeon_set_sh_reg_seq(cs, state->pipeline->graphics.vtx_base_sgpr,
state->pipeline->graphics.vtx_emit_num);
radeon_emit(cs, info->vertex_offset);
radeon_emit(cs, info->first_instance);
if (state->pipeline->graphics.vtx_emit_num == 3)
radeon_emit(cs, 0);
state->last_first_instance = info->first_instance;
state->last_vertex_offset = info->vertex_offset;
}
if (state->last_num_instances != info->instance_count) {
radeon_emit(cs, PKT3(PKT3_NUM_INSTANCES, 0, false));
radeon_emit(cs, info->instance_count);
state->last_num_instances = info->instance_count;
}
if (info->indexed) {
int index_size = radv_get_vgt_index_size(state->index_type);
uint64_t index_va;
uint32_t remaining_indexes = cmd_buffer->state.max_index_count;
remaining_indexes = MAX2(remaining_indexes, info->first_index) - info->first_index;
/* Skip draw calls with 0-sized index buffers if the GPU can't handle them */
if (!remaining_indexes &&
cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug)
return;
index_va = state->index_va;
index_va += info->first_index * index_size;
if (!state->subpass->view_mask) {
radv_cs_emit_draw_indexed_packet(cmd_buffer,
index_va,
remaining_indexes,
info->count);
} else {
u_foreach_bit(i, state->subpass->view_mask) {
radv_emit_view_index(cmd_buffer, i);
radv_cs_emit_draw_indexed_packet(cmd_buffer,
index_va,
remaining_indexes,
info->count);
}
}
} else {
if (!state->subpass->view_mask) {
radv_cs_emit_draw_packet(cmd_buffer,
info->count,
!!info->strmout_buffer);
} else {
u_foreach_bit(i, state->subpass->view_mask) {
radv_emit_view_index(cmd_buffer, i);
radv_cs_emit_draw_packet(cmd_buffer,
info->count,
!!info->strmout_buffer);
}
}
radv_cs_emit_draw_indexed_packet(cmd_buffer,
index_va,
remaining_indexes,
count);
}
}
}
ALWAYS_INLINE static void
radv_emit_direct_draw_packets(struct radv_cmd_buffer *cmd_buffer,
const struct radv_draw_info *info,
uint32_t count,
uint32_t use_opaque)
{
const struct radv_cmd_state *state = &cmd_buffer->state;
if (!state->subpass->view_mask) {
radv_cs_emit_draw_packet(cmd_buffer,
count,
use_opaque);
} else {
u_foreach_bit(i, state->subpass->view_mask) {
radv_emit_view_index(cmd_buffer, i);
radv_cs_emit_draw_packet(cmd_buffer,
count,
use_opaque);
}
}
}
static void
radv_emit_indirect_draw_packets(struct radv_cmd_buffer *cmd_buffer,
const struct radv_draw_info *info)
{
const struct radv_cmd_state *state = &cmd_buffer->state;
struct radeon_winsys *ws = cmd_buffer->device->ws;
struct radeon_cmdbuf *cs = cmd_buffer->cs;
const uint64_t va = radv_buffer_get_va(info->indirect->bo) + info->indirect->offset + info->indirect_offset;
const uint64_t count_va = info->count_buffer ? radv_buffer_get_va(info->count_buffer->bo) + info->count_buffer->offset +
info->count_buffer_offset : 0;
radv_cs_add_buffer(ws, cs, info->indirect->bo);
radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0));
radeon_emit(cs, 1);
radeon_emit(cs, va);
radeon_emit(cs, va >> 32);
if (info->count_buffer) {
radv_cs_add_buffer(ws, cs, info->count_buffer->bo);
}
if (!state->subpass->view_mask) {
radv_cs_emit_indirect_draw_packet(cmd_buffer,
info->indexed,
info->count,
count_va,
info->stride);
} else {
u_foreach_bit(i, state->subpass->view_mask) {
radv_emit_view_index(cmd_buffer, i);
radv_cs_emit_indirect_draw_packet(cmd_buffer,
info->indexed,
info->count,
count_va,
info->stride);
}
}
}
/*
* Vega and raven have a bug which triggers if there are multiple context
* register contexts active at the same time with different scissor values.
@@ -5548,19 +5525,19 @@ radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer,
radv_emit_scissor(cmd_buffer);
}
static void
radv_draw(struct radv_cmd_buffer *cmd_buffer,
const struct radv_draw_info *info)
/* MUST inline this function to avoid massive perf loss in drawoverhead */
ALWAYS_INLINE static bool
radv_before_draw(struct radv_cmd_buffer *cmd_buffer,
const struct radv_draw_info *info,
uint32_t vertex_offset)
{
struct radeon_info *rad_info =
&cmd_buffer->device->physical_device->rad_info;
bool has_prefetch =
const bool has_prefetch =
cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7;
bool pipeline_is_dirty =
const bool pipeline_is_dirty =
(cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) &&
cmd_buffer->state.pipeline != cmd_buffer->state.emitted_pipeline;
ASSERTED unsigned cdw_max =
ASSERTED const unsigned cdw_max =
radeon_check_space(cmd_buffer->device->ws,
cmd_buffer->cs, 4096);
@@ -5570,11 +5547,11 @@ radv_draw(struct radv_cmd_buffer *cmd_buffer,
* direct draws.
*/
if (unlikely(!info->instance_count))
return;
return false;
/* Handle count == 0. */
if (unlikely(!info->count && !info->strmout_buffer))
return;
return false;
}
/* Need to apply this workaround early as it can set flush flags. */
@@ -5600,18 +5577,6 @@ radv_draw(struct radv_cmd_buffer *cmd_buffer,
/* <-- CUs are idle here --> */
radv_upload_graphics_shader_descriptors(cmd_buffer, pipeline_is_dirty);
radv_emit_draw_packets(cmd_buffer, info);
/* <-- CUs are busy here --> */
/* Start prefetches after the draw has been started. Both will
* run in parallel, but starting the draw first is more
* important.
*/
if (has_prefetch && cmd_buffer->state.prefetch_L2_mask) {
radv_emit_prefetch_L2(cmd_buffer,
cmd_buffer->state.pipeline, false);
}
} else {
/* If we don't wait for idle, start prefetches first, then set
* states, and draw at the end.
@@ -5629,15 +5594,50 @@ radv_draw(struct radv_cmd_buffer *cmd_buffer,
radv_upload_graphics_shader_descriptors(cmd_buffer, pipeline_is_dirty);
radv_emit_all_graphics_states(cmd_buffer, info);
radv_emit_draw_packets(cmd_buffer, info);
}
/* Prefetch the remaining shaders after the draw has been
* started.
*/
if (has_prefetch && cmd_buffer->state.prefetch_L2_mask) {
radv_emit_prefetch_L2(cmd_buffer,
cmd_buffer->state.pipeline, false);
radv_describe_draw(cmd_buffer);
if (likely(!info->indirect)) {
struct radv_cmd_state *state = &cmd_buffer->state;
struct radeon_cmdbuf *cs = cmd_buffer->cs;
assert(state->pipeline->graphics.vtx_base_sgpr);
if (state->last_num_instances != info->instance_count) {
radeon_emit(cs, PKT3(PKT3_NUM_INSTANCES, 0, false));
radeon_emit(cs, info->instance_count);
state->last_num_instances = info->instance_count;
}
if (vertex_offset != state->last_vertex_offset ||
info->first_instance != state->last_first_instance) {
radeon_set_sh_reg_seq(cs, state->pipeline->graphics.vtx_base_sgpr,
state->pipeline->graphics.vtx_emit_num);
radeon_emit(cs, vertex_offset);
radeon_emit(cs, info->first_instance);
if (state->pipeline->graphics.vtx_emit_num == 3)
radeon_emit(cs, 0);
state->last_first_instance = info->first_instance;
state->last_vertex_offset = vertex_offset;
}
}
assert(cmd_buffer->cs->cdw <= cdw_max);
return true;
}
static void
radv_after_draw(struct radv_cmd_buffer *cmd_buffer)
{
const struct radeon_info *rad_info =
&cmd_buffer->device->physical_device->rad_info;
bool has_prefetch =
cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7;
/* Start prefetches after the draw has been started. Both will
* run in parallel, but starting the draw first is more
* important.
*/
if (has_prefetch && cmd_buffer->state.prefetch_L2_mask) {
radv_emit_prefetch_L2(cmd_buffer,
cmd_buffer->state.pipeline, false);
}
/* Workaround for a VGT hang when streamout is enabled.
@@ -5650,7 +5650,6 @@ radv_draw(struct radv_cmd_buffer *cmd_buffer,
cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_STREAMOUT_SYNC;
}
assert(cmd_buffer->cs->cdw <= cdw_max);
radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_PS_PARTIAL_FLUSH);
}
@@ -5667,12 +5666,15 @@ void radv_CmdDraw(
info.count = vertexCount;
info.instance_count = instanceCount;
info.first_instance = firstInstance;
info.vertex_offset = firstVertex;
info.strmout_buffer = NULL;
info.indirect = NULL;
info.indexed = false;
radv_draw(cmd_buffer, &info);
if (!radv_before_draw(cmd_buffer, &info, firstVertex))
return;
radv_emit_direct_draw_packets(cmd_buffer, &info,
vertexCount, 0);
radv_after_draw(cmd_buffer);
}
void radv_CmdDrawIndexed(
@@ -5690,12 +5692,15 @@ void radv_CmdDrawIndexed(
info.count = indexCount;
info.instance_count = instanceCount;
info.first_index = firstIndex;
info.vertex_offset = vertexOffset;
info.first_instance = firstInstance;
info.strmout_buffer = NULL;
info.indirect = NULL;
radv_draw(cmd_buffer, &info);
if (!radv_before_draw(cmd_buffer, &info, vertexOffset))
return;
radv_emit_draw_packets_indexed(cmd_buffer, &info,
indexCount, firstIndex);
radv_after_draw(cmd_buffer);
}
void radv_CmdDrawIndirect(
@@ -5717,7 +5722,10 @@ void radv_CmdDrawIndirect(
info.count_buffer = NULL;
info.indexed = false;
radv_draw(cmd_buffer, &info);
if (!radv_before_draw(cmd_buffer, &info, 0))
return;
radv_emit_indirect_draw_packets(cmd_buffer, &info);
radv_after_draw(cmd_buffer);
}
void radv_CmdDrawIndexedIndirect(
@@ -5739,7 +5747,10 @@ void radv_CmdDrawIndexedIndirect(
info.count_buffer = NULL;
info.strmout_buffer = NULL;
radv_draw(cmd_buffer, &info);
if (!radv_before_draw(cmd_buffer, &info, 0))
return;
radv_emit_indirect_draw_packets(cmd_buffer, &info);
radv_after_draw(cmd_buffer);
}
void radv_CmdDrawIndirectCount(
@@ -5765,7 +5776,10 @@ void radv_CmdDrawIndirectCount(
info.strmout_buffer = NULL;
info.indexed = false;
radv_draw(cmd_buffer, &info);
if (!radv_before_draw(cmd_buffer, &info, 0))
return;
radv_emit_indirect_draw_packets(cmd_buffer, &info);
radv_after_draw(cmd_buffer);
}
void radv_CmdDrawIndexedIndirectCount(
@@ -5791,7 +5805,10 @@ void radv_CmdDrawIndexedIndirectCount(
info.stride = stride;
info.strmout_buffer = NULL;
radv_draw(cmd_buffer, &info);
if (!radv_before_draw(cmd_buffer, &info, 0))
return;
radv_emit_indirect_draw_packets(cmd_buffer, &info);
radv_after_draw(cmd_buffer);
}
struct radv_dispatch_info {
@@ -7230,7 +7247,6 @@ void radv_CmdDrawIndirectByteCountEXT(
struct radv_draw_info info;
info.count = 0;
info.vertex_offset = 0;
info.instance_count = instanceCount;
info.first_instance = firstInstance;
info.strmout_buffer = counterBuffer;
@@ -7239,7 +7255,10 @@ void radv_CmdDrawIndirectByteCountEXT(
info.indexed = false;
info.indirect = NULL;
radv_draw(cmd_buffer, &info);
if (!radv_before_draw(cmd_buffer, &info, 0))
return;
radv_emit_direct_draw_packets(cmd_buffer, &info, 0, S_0287F0_USE_OPAQUE(1));
radv_after_draw(cmd_buffer);
}
/* VK_AMD_buffer_marker */