turnip: use SUBDRAW_SIZE and constant-sized tess BOs

This fixes the problem of large indirect draws, and at the same time avoids
allocating overly large buffers for tessellation.

Reworked by @anholt to use a separate tess factor BO so we can skip the
WFIs to set the TESSFACTOR_ADDR.

Signed-off-by: Jonathan Marek <jonathan@marek.ca>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6089>
This commit is contained in:
Jonathan Marek 2020-07-27 10:06:46 -04:00 committed by Marge Bot
parent 3748b8afce
commit fd11d99254
3 changed files with 77 additions and 124 deletions

View File

@ -64,6 +64,23 @@ tu6_emit_event_write(struct tu_cmd_buffer *cmd,
}
}
/* Programs PC_TESSFACTOR_ADDR on the command buffer's top-level CS, at most
 * once per command buffer.
 *
 * Writing this register needs a WFI when outstanding draws are still using
 * it.  None is emitted here: tu6_init_hardware() has already WFIed before we
 * started, and since the address is only emitted once per cmdbuf, no earlier
 * draw can be using the tessfactor address yet.
 */
static void
tu6_lazy_emit_tessfactor_addr(struct tu_cmd_buffer *cmd)
{
   if (!cmd->state.tessfactor_addr_set) {
      /* Callers are expected to invoke this on primary command buffers only. */
      assert(cmd->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);

      tu_cs_emit_regs(&cmd->cs,
                      A6XX_PC_TESSFACTOR_ADDR(.qword = cmd->device->tess_bo.iova));

      /* Remember the write so that every later call is a no-op. */
      cmd->state.tessfactor_addr_set = true;
   }
}
static void
tu6_emit_flushes(struct tu_cmd_buffer *cmd_buffer,
struct tu_cs *cs,
@ -2215,6 +2232,14 @@ tu_CmdBindPipeline(VkCommandBuffer commandBuffer,
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DYNAMIC + i, pipeline->dynamic_state[i]);
}
if (cmd->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY &&
(pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT)) {
/* Set up the tess factor address if this is the first tess pipeline bound
* to the primary cmdbuf.
*/
tu6_lazy_emit_tessfactor_addr(cmd);
}
if (cmd->state.line_mode != pipeline->line_mode) {
cmd->state.line_mode = pipeline->line_mode;
@ -2983,8 +3008,13 @@ tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
break;
}
if (secondary->state.has_tess)
/* Set up the tess factor address if this is the first time a tess
* pipeline has been executed on this primary cmdbuf.
*/
if (secondary->state.has_tess) {
tu6_lazy_emit_tessfactor_addr(cmd);
cmd->state.has_tess = true;
}
if (secondary->state.has_subpass_predication)
cmd->state.has_subpass_predication = true;
if (secondary->state.disable_gmem)
@ -3477,103 +3507,17 @@ tu6_emit_consts_geom(struct tu_cmd_buffer *cmd,
return tu_cs_end_draw_state(&cmd->sub_cs, &cs);
}
/* Returns the number of bytes of tess param storage needed by a draw.
 *
 * pipeline:   supplies the patch primitive type (ia.primtype) and the
 *             per-patch byte stride (tess.param_stride).
 * draw_count: draw count of the draw; 0 means "unknown" and is replaced by a
 *             fixed guess of 2048.
 *
 * The tess param BO holds pipeline->tess.param_stride bytes per patch, which
 * includes both the per-vertex outputs and the per-patch outputs;
 * build_primitive_map in ir3 calculates this stride.
 */
static uint64_t
get_tess_param_bo_size(const struct tu_pipeline *pipeline,
                       uint32_t draw_count)
{
   /* TODO: For indirect draws, we can't compute the BO size ahead of time.
    * Still not sure what to do here, so just allocate a reasonably large
    * BO and hope for the best for now. */
   if (!draw_count)
      draw_count = 2048;

   /* NOTE(review): assumes primtype is a patch type with at least one vertex
    * per patch; a value of DI_PT_PATCHES0 would divide by zero -- confirm. */
   uint32_t verts_per_patch = pipeline->ia.primtype - DI_PT_PATCHES0;
   uint32_t num_patches = draw_count / verts_per_patch;

   /* Widen before multiplying: both operands are 32-bit, so the product
    * would otherwise wrap at 32 bits before being converted to the 64-bit
    * return type. */
   return (uint64_t) num_patches * pipeline->tess.param_stride;
}
/* Returns the number of bytes of tess factor storage needed by a draw.
 *
 * pipeline:   supplies the patch primitive type (ia.primtype) and the patch
 *             type passed to ir3_tess_factor_stride().
 * draw_count: draw count of the draw; 0 means "unknown" and is replaced by a
 *             fixed guess of 2048.
 *
 * Each distinct patch gets its own tess factor output, so the result is the
 * per-patch factor stride times the number of patches.
 */
static uint64_t
get_tess_factor_bo_size(const struct tu_pipeline *pipeline,
                        uint32_t draw_count)
{
   /* TODO: For indirect draws, we can't compute the BO size ahead of time.
    * Still not sure what to do here, so just allocate a reasonably large
    * BO and hope for the best for now. */
   if (!draw_count)
      draw_count = 2048;

   /* NOTE(review): assumes primtype is a patch type with at least one vertex
    * per patch; a value of DI_PT_PATCHES0 would divide by zero -- confirm. */
   uint32_t verts_per_patch = pipeline->ia.primtype - DI_PT_PATCHES0;
   uint32_t num_patches = draw_count / verts_per_patch;
   uint32_t factor_stride = ir3_tess_factor_stride(pipeline->tess.patch_type);

   /* Widen before multiplying: both operands are 32-bit, so the product
    * would otherwise wrap at 32 bits before being converted to the 64-bit
    * return type. */
   return (uint64_t) factor_stride * num_patches;
}
static VkResult
tu6_emit_tess_consts(struct tu_cmd_buffer *cmd,
uint32_t draw_count,
const struct tu_pipeline *pipeline,
struct tu_draw_state *state,
uint64_t *factor_iova)
tu6_setup_tess(struct tu_cmd_buffer *cmd,
const struct tu_pipeline *pipeline,
uint32_t *subdraw_size)
{
struct tu_cs cs;
VkResult result = tu_cs_begin_sub_stream(&cmd->sub_cs, 16, &cs);
if (result != VK_SUCCESS)
return result;
/* maximum number of patches that can fit in tess factor/param buffers */
*subdraw_size = MIN2(TU_TESS_FACTOR_SIZE / ir3_tess_factor_stride(pipeline->tess.patch_type),
TU_TESS_PARAM_SIZE / pipeline->tess.param_stride);
/* convert from # of patches to draw count */
*subdraw_size *= (pipeline->ia.primtype - DI_PT_PATCHES0);
const struct tu_program_descriptor_linkage *hs_link =
&pipeline->program.link[MESA_SHADER_TESS_CTRL];
bool hs_uses_bo = pipeline->tess.hs_bo_regid < hs_link->constlen;
const struct tu_program_descriptor_linkage *ds_link =
&pipeline->program.link[MESA_SHADER_TESS_EVAL];
bool ds_uses_bo = pipeline->tess.ds_bo_regid < ds_link->constlen;
uint64_t tess_factor_size = get_tess_factor_bo_size(pipeline, draw_count);
uint64_t tess_param_size = get_tess_param_bo_size(pipeline, draw_count);
uint64_t tess_bo_size = tess_factor_size + tess_param_size;
if ((hs_uses_bo || ds_uses_bo) && tess_bo_size > 0) {
struct tu_bo *tess_bo;
result = tu_get_scratch_bo(cmd->device, tess_bo_size, &tess_bo);
if (result != VK_SUCCESS)
return result;
uint64_t tess_factor_iova = tess_bo->iova;
uint64_t tess_param_iova = tess_factor_iova + tess_factor_size;
if (hs_uses_bo) {
tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_GEOM, 3 + 4);
tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(pipeline->tess.hs_bo_regid) |
CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
CP_LOAD_STATE6_0_STATE_BLOCK(SB6_HS_SHADER) |
CP_LOAD_STATE6_0_NUM_UNIT(1));
tu_cs_emit(&cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
tu_cs_emit(&cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
tu_cs_emit_qw(&cs, tess_param_iova);
tu_cs_emit_qw(&cs, tess_factor_iova);
}
if (ds_uses_bo) {
tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_GEOM, 3 + 4);
tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(pipeline->tess.ds_bo_regid) |
CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
CP_LOAD_STATE6_0_STATE_BLOCK(SB6_DS_SHADER) |
CP_LOAD_STATE6_0_NUM_UNIT(1));
tu_cs_emit(&cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
tu_cs_emit(&cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
tu_cs_emit_qw(&cs, tess_param_iova);
tu_cs_emit_qw(&cs, tess_factor_iova);
}
*factor_iova = tess_factor_iova;
}
*state = tu_cs_end_draw_state(&cmd->sub_cs, &cs);
return VK_SUCCESS;
}
@ -3928,25 +3872,16 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_FRAGMENT);
}
struct tu_draw_state tess_consts = {};
if (has_tess) {
uint64_t tess_factor_iova = 0;
uint32_t subdraw_size;
cmd->state.has_tess = true;
result = tu6_emit_tess_consts(cmd, draw_count, pipeline, &tess_consts, &tess_factor_iova);
result = tu6_setup_tess(cmd, pipeline, &subdraw_size);
if (result != VK_SUCCESS)
return result;
/* this sequence matches what the blob does before every tess draw
* PC_TESSFACTOR_ADDR_LO is a non-context register and needs a wfi
* before writing to it
*/
tu_cs_emit_wfi(cs);
tu_cs_emit_regs(cs, A6XX_PC_TESSFACTOR_ADDR(.qword = tess_factor_iova));
tu_cs_emit_pkt7(cs, CP_SET_SUBDRAW_SIZE, 1);
tu_cs_emit(cs, draw_count);
tu_cs_emit(cs, subdraw_size);
}
/* for the first draw in a renderpass, re-emit all the draw states
@ -3965,7 +3900,6 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_CONFIG, pipeline->program.config_state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM, pipeline->program.state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_BINNING, pipeline->program.binning_state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_TESS, tess_consts);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VI, pipeline->vi.state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VI_BINNING, pipeline->vi.binning_state);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_RAST, pipeline->rast_state);
@ -3991,7 +3925,6 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
*/
bool emit_binding_stride = false;
uint32_t draw_state_count =
has_tess +
((cmd->state.dirty & TU_CMD_DIRTY_SHADER_CONSTS) ? 2 : 0) +
((cmd->state.dirty & TU_CMD_DIRTY_DESC_SETS_LOAD) ? 1 : 0) +
((cmd->state.dirty & TU_CMD_DIRTY_VERTEX_BUFFERS) ? 1 : 0) +
@ -4007,10 +3940,6 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
if (draw_state_count > 0)
tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * draw_state_count);
/* We may need to re-emit tess consts if the current draw call is
* sufficiently larger than the last draw call. */
if (has_tess)
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_TESS, tess_consts);
if (cmd->state.dirty & TU_CMD_DIRTY_SHADER_CONSTS) {
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_SHADER_GEOM_CONST, cmd->state.shader_const[0]);
tu_cs_emit_draw_state(cs, TU_DRAW_STATE_FS_CONST, cmd->state.shader_const[1]);

View File

@ -1560,6 +1560,8 @@ tu6_emit_geom_tess_consts(struct tu_cs *cs,
const struct ir3_shader_variant *gs,
uint32_t cps_per_patch)
{
struct tu_device *dev = cs->device;
uint32_t num_vertices =
hs ? cps_per_patch : gs->shader->nir->info.gs.vertices_in;
@ -1575,29 +1577,49 @@ tu6_emit_geom_tess_consts(struct tu_cs *cs,
if (hs) {
assert(ds->type != MESA_SHADER_NONE);
uint32_t hs_params[4] = {
/* Create the shared tess factor BO the first time tess is used on the device. */
mtx_lock(&dev->mutex);
if (!dev->tess_bo.size)
tu_bo_init_new(dev, &dev->tess_bo, TU_TESS_BO_SIZE, TU_BO_ALLOC_NO_FLAGS);
mtx_unlock(&dev->mutex);
uint64_t tess_factor_iova = dev->tess_bo.iova;
uint64_t tess_param_iova = tess_factor_iova + TU_TESS_FACTOR_SIZE;
uint32_t hs_params[8] = {
vs->output_size * num_vertices * 4, /* hs primitive stride */
vs->output_size * 4, /* hs vertex stride */
hs->output_size,
cps_per_patch,
tess_param_iova,
tess_param_iova >> 32,
tess_factor_iova,
tess_factor_iova >> 32,
};
uint32_t hs_base = hs->const_state->offsets.primitive_param;
uint32_t hs_param_dwords = MIN2((hs->constlen - hs_base) * 4, ARRAY_SIZE(hs_params));
tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, hs_base, SB6_HS_SHADER, 0,
ARRAY_SIZE(hs_params), hs_params);
hs_param_dwords, hs_params);
if (gs)
num_vertices = gs->shader->nir->info.gs.vertices_in;
uint32_t ds_params[4] = {
uint32_t ds_params[8] = {
ds->output_size * num_vertices * 4, /* ds primitive stride */
ds->output_size * 4, /* ds vertex stride */
hs->output_size, /* hs vertex stride (dwords) */
hs->shader->nir->info.tess.tcs_vertices_out
hs->shader->nir->info.tess.tcs_vertices_out,
tess_param_iova,
tess_param_iova >> 32,
tess_factor_iova,
tess_factor_iova >> 32,
};
uint32_t ds_base = ds->const_state->offsets.primitive_param;
uint32_t ds_param_dwords = MIN2((ds->constlen - ds_base) * 4, ARRAY_SIZE(ds_params));
tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, ds_base, SB6_DS_SHADER, 0,
ARRAY_SIZE(ds_params), ds_params);
ds_param_dwords, ds_params);
}
if (gs) {
@ -2716,10 +2738,7 @@ tu_pipeline_builder_parse_tessellation(struct tu_pipeline_builder *builder,
pipeline->tess.upper_left_domain_origin = !domain_info ||
domain_info->domainOrigin == VK_TESSELLATION_DOMAIN_ORIGIN_UPPER_LEFT;
const struct ir3_shader_variant *hs = builder->variants[MESA_SHADER_TESS_CTRL];
const struct ir3_shader_variant *ds = builder->variants[MESA_SHADER_TESS_EVAL];
pipeline->tess.param_stride = hs->output_size * 4;
pipeline->tess.hs_bo_regid = hs->const_state->offsets.primitive_param + 1;
pipeline->tess.ds_bo_regid = ds->const_state->offsets.primitive_param + 1;
}
static void

View File

@ -399,6 +399,13 @@ struct tu_device
struct tu_bo global_bo;
/* the blob seems to always use 8K factor and 128K param sizes, copy them */
#define TU_TESS_FACTOR_SIZE (8 * 1024)
#define TU_TESS_PARAM_SIZE (128 * 1024)
#define TU_TESS_BO_SIZE (TU_TESS_FACTOR_SIZE + TU_TESS_PARAM_SIZE)
/* Lazily allocated, protected by the device mutex. */
struct tu_bo tess_bo;
struct ir3_shader_variant *global_shaders[GLOBAL_SH_COUNT];
uint64_t global_shader_va[GLOBAL_SH_COUNT];
@ -536,7 +543,6 @@ enum tu_draw_state_group_id
TU_DRAW_STATE_PROGRAM_CONFIG,
TU_DRAW_STATE_PROGRAM,
TU_DRAW_STATE_PROGRAM_BINNING,
TU_DRAW_STATE_TESS,
TU_DRAW_STATE_VB,
TU_DRAW_STATE_VI,
TU_DRAW_STATE_VI_BINNING,
@ -1025,6 +1031,7 @@ struct tu_cmd_state
bool xfb_used;
bool has_tess;
bool tessfactor_addr_set;
bool has_subpass_predication;
bool predication_active;
bool disable_gmem;
@ -1253,8 +1260,6 @@ struct tu_pipeline
{
uint32_t patch_type;
uint32_t param_stride;
uint32_t hs_bo_regid;
uint32_t ds_bo_regid;
bool upper_left_domain_origin;
} tess;