turnip: vsc improvements

* Remove scratch_bo from cmdbuffer, use a device-global bo instead, which
  also includes border color (and eventually shaders for 3D blit path)
* Use CP_SET_BIN_DATA5_OFFSET to allow setting VSC buffer addresses only
  once at the start of the cmdstream
* Use scratch bo mechanism for a resizable VSC buffer
* Use feedback from "vsc_draw_overflow" and "vsc_prim_overflow" values to
  increase the size of the VSC buffer when beginning to record a new cmdbuffer

Signed-off-by: Jonathan Marek <jonathan@marek.ca>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5570>
This commit is contained in:
Jonathan Marek 2020-06-18 18:08:58 -04:00 committed by Marge Bot
parent 4ac851ea25
commit 0e7b7c3087
3 changed files with 109 additions and 125 deletions

View File

@ -130,7 +130,7 @@ tu6_emit_event_write(struct tu_cmd_buffer *cmd,
tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, need_seqno ? 4 : 1);
tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(event));
if (need_seqno) {
tu_cs_emit_qw(cs, cmd->scratch_bo.iova);
tu_cs_emit_qw(cs, global_iova(cmd, seqno_dummy));
tu_cs_emit(cs, 0);
}
}
@ -598,12 +598,12 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
tu_cs_emit(cs, 0x0);
tu_cs_emit_pkt7(cs, CP_SET_BIN_DATA5, 7);
tu_cs_emit_pkt7(cs, CP_SET_BIN_DATA5_OFFSET, 4);
tu_cs_emit(cs, fb->pipe_sizes[pipe] |
CP_SET_BIN_DATA5_0_VSC_N(slot));
tu_cs_emit_qw(cs, cmd->vsc_draw_strm.iova + pipe * cmd->vsc_draw_strm_pitch);
tu_cs_emit_qw(cs, cmd->vsc_draw_strm.iova + pipe * 4 + 32 * cmd->vsc_draw_strm_pitch);
tu_cs_emit_qw(cs, cmd->vsc_prim_strm.iova + pipe * cmd->vsc_prim_strm_pitch);
tu_cs_emit(cs, pipe * cmd->vsc_draw_strm_pitch);
tu_cs_emit(cs, pipe * 4);
tu_cs_emit(cs, pipe * cmd->vsc_prim_strm_pitch);
tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
tu_cs_emit(cs, 0x0);
@ -714,7 +714,8 @@ tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
static void
tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
const struct tu_physical_device *phys_dev = cmd->device->physical_device;
struct tu_device *dev = cmd->device;
const struct tu_physical_device *phys_dev = dev->physical_device;
tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
@ -827,9 +828,52 @@ tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
A6XX_RB_LRZ_CNTL(0));
tu_cs_emit_regs(cs,
A6XX_SP_TP_BORDER_COLOR_BASE_ADDR(.bo = &cmd->device->border_color));
A6XX_SP_TP_BORDER_COLOR_BASE_ADDR(.bo = &dev->global_bo,
.bo_offset = gb_offset(border_color)));
tu_cs_emit_regs(cs,
A6XX_SP_PS_TP_BORDER_COLOR_BASE_ADDR(.bo = &cmd->device->border_color));
A6XX_SP_PS_TP_BORDER_COLOR_BASE_ADDR(.bo = &dev->global_bo,
.bo_offset = gb_offset(border_color)));
/* VSC buffers:
* use vsc pitches from the largest values used so far with this device
* if there hasn't been overflow, there will already be a scratch bo
* allocated for these sizes
*
* if overflow is detected, the stream size is increased by 2x
*/
mtx_lock(&dev->vsc_pitch_mtx);
struct tu6_global *global = dev->global_bo.map;
uint32_t vsc_draw_overflow = global->vsc_draw_overflow;
uint32_t vsc_prim_overflow = global->vsc_prim_overflow;
if (vsc_draw_overflow >= dev->vsc_draw_strm_pitch)
dev->vsc_draw_strm_pitch = (dev->vsc_draw_strm_pitch - VSC_PAD) * 2 + VSC_PAD;
if (vsc_prim_overflow >= dev->vsc_prim_strm_pitch)
dev->vsc_prim_strm_pitch = (dev->vsc_prim_strm_pitch - VSC_PAD) * 2 + VSC_PAD;
cmd->vsc_prim_strm_pitch = dev->vsc_prim_strm_pitch;
cmd->vsc_draw_strm_pitch = dev->vsc_draw_strm_pitch;
mtx_unlock(&dev->vsc_pitch_mtx);
struct tu_bo *vsc_bo;
uint32_t size0 = cmd->vsc_prim_strm_pitch * MAX_VSC_PIPES +
cmd->vsc_draw_strm_pitch * MAX_VSC_PIPES;
tu_get_scratch_bo(dev, size0 + MAX_VSC_PIPES * 4, &vsc_bo);
tu_cs_emit_regs(cs,
A6XX_VSC_DRAW_STRM_SIZE_ADDRESS(.bo = vsc_bo, .bo_offset = size0));
tu_cs_emit_regs(cs,
A6XX_VSC_PRIM_STRM_ADDRESS(.bo = vsc_bo));
tu_cs_emit_regs(cs,
A6XX_VSC_DRAW_STRM_ADDRESS(.bo = vsc_bo,
.bo_offset = cmd->vsc_prim_strm_pitch * MAX_VSC_PIPES));
tu_bo_list_add(&cmd->bo_list, vsc_bo, MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
tu_cs_sanity_check(cs);
}
@ -841,9 +885,7 @@ update_vsc_pipe(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
tu_cs_emit_regs(cs,
A6XX_VSC_BIN_SIZE(.width = fb->tile0.width,
.height = fb->tile0.height),
A6XX_VSC_DRAW_STRM_SIZE_ADDRESS(.bo = &cmd->vsc_draw_strm,
.bo_offset = 32 * cmd->vsc_draw_strm_pitch));
.height = fb->tile0.height));
tu_cs_emit_regs(cs,
A6XX_VSC_BIN_COUNT(.nx = fb->tile_count.width,
@ -853,14 +895,12 @@ update_vsc_pipe(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
tu_cs_emit_array(cs, fb->pipe_config, 32);
tu_cs_emit_regs(cs,
A6XX_VSC_PRIM_STRM_ADDRESS(.bo = &cmd->vsc_prim_strm),
A6XX_VSC_PRIM_STRM_PITCH(cmd->vsc_prim_strm_pitch),
A6XX_VSC_PRIM_STRM_LIMIT(cmd->vsc_prim_strm_pitch - 64));
A6XX_VSC_PRIM_STRM_LIMIT(cmd->vsc_prim_strm_pitch - VSC_PAD));
tu_cs_emit_regs(cs,
A6XX_VSC_DRAW_STRM_ADDRESS(.bo = &cmd->vsc_draw_strm),
A6XX_VSC_DRAW_STRM_PITCH(cmd->vsc_draw_strm_pitch),
A6XX_VSC_DRAW_STRM_LIMIT(cmd->vsc_draw_strm_pitch - 64));
A6XX_VSC_DRAW_STRM_LIMIT(cmd->vsc_draw_strm_pitch - VSC_PAD));
}
static void
@ -870,32 +910,26 @@ emit_vsc_overflow_test(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
const uint32_t used_pipe_count =
fb->pipe_count.width * fb->pipe_count.height;
/* Clear vsc_scratch: */
tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
tu_cs_emit_qw(cs, cmd->scratch_bo.iova + ctrl_offset(vsc_scratch));
tu_cs_emit(cs, 0x0);
/* Check for overflow, write vsc_scratch if detected: */
for (int i = 0; i < used_pipe_count; i++) {
tu_cs_emit_pkt7(cs, CP_COND_WRITE5, 8);
tu_cs_emit(cs, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) |
CP_COND_WRITE5_0_WRITE_MEMORY);
tu_cs_emit(cs, CP_COND_WRITE5_1_POLL_ADDR_LO(REG_A6XX_VSC_DRAW_STRM_SIZE_REG(i)));
tu_cs_emit(cs, CP_COND_WRITE5_2_POLL_ADDR_HI(0));
tu_cs_emit(cs, CP_COND_WRITE5_3_REF(cmd->vsc_draw_strm_pitch - 64));
tu_cs_emit(cs, CP_COND_WRITE5_3_REF(cmd->vsc_draw_strm_pitch - VSC_PAD));
tu_cs_emit(cs, CP_COND_WRITE5_4_MASK(~0));
tu_cs_emit_qw(cs, cmd->scratch_bo.iova + ctrl_offset(vsc_scratch));
tu_cs_emit(cs, CP_COND_WRITE5_7_WRITE_DATA(1 + cmd->vsc_draw_strm_pitch));
tu_cs_emit_qw(cs, global_iova(cmd, vsc_draw_overflow));
tu_cs_emit(cs, CP_COND_WRITE5_7_WRITE_DATA(cmd->vsc_draw_strm_pitch));
tu_cs_emit_pkt7(cs, CP_COND_WRITE5, 8);
tu_cs_emit(cs, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) |
CP_COND_WRITE5_0_WRITE_MEMORY);
tu_cs_emit(cs, CP_COND_WRITE5_1_POLL_ADDR_LO(REG_A6XX_VSC_PRIM_STRM_SIZE_REG(i)));
tu_cs_emit(cs, CP_COND_WRITE5_2_POLL_ADDR_HI(0));
tu_cs_emit(cs, CP_COND_WRITE5_3_REF(cmd->vsc_prim_strm_pitch - 64));
tu_cs_emit(cs, CP_COND_WRITE5_3_REF(cmd->vsc_prim_strm_pitch - VSC_PAD));
tu_cs_emit(cs, CP_COND_WRITE5_4_MASK(~0));
tu_cs_emit_qw(cs, cmd->scratch_bo.iova + ctrl_offset(vsc_scratch));
tu_cs_emit(cs, CP_COND_WRITE5_7_WRITE_DATA(3 + cmd->vsc_prim_strm_pitch));
tu_cs_emit_qw(cs, global_iova(cmd, vsc_prim_overflow));
tu_cs_emit(cs, CP_COND_WRITE5_7_WRITE_DATA(cmd->vsc_prim_strm_pitch));
}
tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
@ -1241,9 +1275,6 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd)
{
const struct tu_framebuffer *fb = cmd->state.framebuffer;
if (use_hw_binning(cmd))
cmd->use_vsc_data = true;
tu6_tile_render_begin(cmd, &cmd->cs);
uint32_t pipe = 0;
@ -1334,28 +1365,12 @@ tu_create_cmd_buffer(struct tu_device *device,
list_inithead(&cmd_buffer->upload.list);
VkResult result = tu_bo_init_new(device, &cmd_buffer->scratch_bo, 0x1000);
if (result != VK_SUCCESS)
goto fail_scratch_bo;
/* TODO: resize on overflow */
cmd_buffer->vsc_draw_strm_pitch = device->vsc_draw_strm_pitch;
cmd_buffer->vsc_prim_strm_pitch = device->vsc_prim_strm_pitch;
cmd_buffer->vsc_draw_strm = device->vsc_draw_strm;
cmd_buffer->vsc_prim_strm = device->vsc_prim_strm;
return VK_SUCCESS;
fail_scratch_bo:
list_del(&cmd_buffer->pool_link);
return result;
}
static void
tu_cmd_buffer_destroy(struct tu_cmd_buffer *cmd_buffer)
{
tu_bo_finish(cmd_buffer->device, &cmd_buffer->scratch_bo);
list_del(&cmd_buffer->pool_link);
tu_cs_finish(&cmd_buffer->cs);
@ -1839,7 +1854,7 @@ void tu_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer,
for (uint32_t i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
/* note: FLUSH_BASE is always the same, so it could go in init_hw()? */
tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_FLUSH_BASE(i), 2);
tu_cs_emit_qw(cs, cmd->scratch_bo.iova + ctrl_offset(flush_base[i]));
tu_cs_emit_qw(cs, global_iova(cmd, flush_base[i]));
tu6_emit_event_write(cmd, cs, FLUSH_SO_0 + i);
}
@ -1861,7 +1876,7 @@ void tu_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer,
0x40000 | /* ??? */
CP_MEM_TO_REG_0_UNK31 |
CP_MEM_TO_REG_0_CNT(1));
tu_cs_emit_qw(cs, cmd->scratch_bo.iova + ctrl_offset(flush_base[idx]));
tu_cs_emit_qw(cs, global_iova(cmd, flush_base[idx]));
if (offset) {
tu_cs_emit_pkt7(cs, CP_REG_RMW, 3);
@ -1933,18 +1948,8 @@ tu_EndCommandBuffer(VkCommandBuffer commandBuffer)
tu_emit_cache_flush(cmd_buffer, &cmd_buffer->cs);
}
tu_bo_list_add(&cmd_buffer->bo_list, &cmd_buffer->scratch_bo,
MSM_SUBMIT_BO_WRITE);
if (cmd_buffer->use_vsc_data) {
tu_bo_list_add(&cmd_buffer->bo_list, &cmd_buffer->vsc_draw_strm,
MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
tu_bo_list_add(&cmd_buffer->bo_list, &cmd_buffer->vsc_prim_strm,
MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
}
tu_bo_list_add(&cmd_buffer->bo_list, &cmd_buffer->device->border_color,
MSM_SUBMIT_BO_READ);
tu_bo_list_add(&cmd_buffer->bo_list, &cmd_buffer->device->global_bo,
MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
for (uint32_t i = 0; i < cmd_buffer->draw_cs.bo_count; i++) {
tu_bo_list_add(&cmd_buffer->bo_list, cmd_buffer->draw_cs.bos[i],

View File

@ -1182,7 +1182,6 @@ struct PACKED bcolor_entry {
},
};
VkResult
tu_CreateDevice(VkPhysicalDevice physicalDevice,
const VkDeviceCreateInfo *pCreateInfo,
@ -1265,30 +1264,20 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
if (!device->compiler)
goto fail_queues;
#define VSC_DRAW_STRM_SIZE(pitch) ((pitch) * 32 + 0x100) /* extra size to store VSC_SIZE */
#define VSC_PRIM_STRM_SIZE(pitch) ((pitch) * 32)
/* initial sizes, these will increase if there is overflow */
device->vsc_draw_strm_pitch = 0x1000 + VSC_PAD;
device->vsc_prim_strm_pitch = 0x4000 + VSC_PAD;
device->vsc_draw_strm_pitch = 0x440 * 4;
device->vsc_prim_strm_pitch = 0x1040 * 4;
result = tu_bo_init_new(device, &device->vsc_draw_strm, VSC_DRAW_STRM_SIZE(device->vsc_draw_strm_pitch));
STATIC_ASSERT(sizeof(border_color) == sizeof(((struct tu6_global*) 0)->border_color));
result = tu_bo_init_new(device, &device->global_bo, sizeof(struct tu6_global));
if (result != VK_SUCCESS)
goto fail_vsc_data;
goto fail_global_bo;
result = tu_bo_init_new(device, &device->vsc_prim_strm, VSC_PRIM_STRM_SIZE(device->vsc_prim_strm_pitch));
result = tu_bo_map(device, &device->global_bo);
if (result != VK_SUCCESS)
goto fail_vsc_data2;
goto fail_global_bo_map;
STATIC_ASSERT(sizeof(struct bcolor_entry) == 128);
result = tu_bo_init_new(device, &device->border_color, sizeof(border_color));
if (result != VK_SUCCESS)
goto fail_border_color;
result = tu_bo_map(device, &device->border_color);
if (result != VK_SUCCESS)
goto fail_border_color_map;
memcpy(device->border_color.map, border_color, sizeof(border_color));
memcpy(device->global_bo.map + gb_offset(border_color), border_color, sizeof(border_color));
VkPipelineCacheCreateInfo ci;
ci.sType = VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO;
@ -1307,20 +1296,16 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
for (unsigned i = 0; i < ARRAY_SIZE(device->scratch_bos); i++)
mtx_init(&device->scratch_bos[i].construct_mtx, mtx_plain);
mtx_init(&device->vsc_pitch_mtx, mtx_plain);
*pDevice = tu_device_to_handle(device);
return VK_SUCCESS;
fail_pipeline_cache:
fail_border_color_map:
tu_bo_finish(device, &device->border_color);
fail_global_bo_map:
tu_bo_finish(device, &device->global_bo);
fail_border_color:
tu_bo_finish(device, &device->vsc_prim_strm);
fail_vsc_data2:
tu_bo_finish(device, &device->vsc_draw_strm);
fail_vsc_data:
fail_global_bo:
ralloc_free(device->compiler);
fail_queues:
@ -1343,9 +1328,6 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
if (!device)
return;
tu_bo_finish(device, &device->vsc_draw_strm);
tu_bo_finish(device, &device->vsc_prim_strm);
for (unsigned i = 0; i < TU_MAX_QUEUE_FAMILIES; i++) {
for (unsigned q = 0; q < device->queue_count[i]; q++)
tu_queue_finish(&device->queues[i][q]);

View File

@ -339,6 +339,31 @@ struct tu_bo
void *map;
};
/* This struct defines the layout of the global_bo.
 *
 * The global_bo is a single per-device buffer, created in tu_CreateDevice()
 * with sizeof(struct tu6_global) bytes and CPU-mapped there. Command streams
 * address individual members through gb_offset()/global_iova(), so member
 * names, order and padding are all part of the layout and must not be
 * changed casually.
 */
struct tu6_global
{
/* 6 bcolor_entry entries, one for each VK_BORDER_COLOR.
 * Filled once at device creation by a memcpy of the border_color table and
 * programmed as the SP_TP / SP_PS_TP border color base address in
 * tu6_init_hw().
 */
uint8_t border_color[128 * 6];
uint32_t seqno_dummy; /* dummy seqno for CP_EVENT_WRITE */
uint32_t _pad0;
/* Overflow feedback for the VSC draw stream: emit_vsc_overflow_test() emits
 * a CP_COND_WRITE5 (WRITE_GE) per used pipe that stores the command buffer's
 * draw stream pitch here when VSC_DRAW_STRM_SIZE_REG(i) reaches
 * pitch - VSC_PAD. tu6_init_hw() reads it back on the CPU (hence volatile)
 * under vsc_pitch_mtx and grows the device pitch if it is >= the current one.
 */
volatile uint32_t vsc_draw_overflow;
uint32_t _pad1;
/* Same mechanism as vsc_draw_overflow, for the VSC prim stream. */
volatile uint32_t vsc_prim_overflow;
uint32_t _pad2[3];
/* scratch space for VPC_SO[i].FLUSH_BASE_LO/HI, start on 32 byte boundary.
 * With the members above this array begins 800 bytes into the struct
 * (768 + 32), and each entry is 8 dwords = 32 bytes.
 */
struct {
uint32_t offset;
uint32_t pad[7];
} flush_base[4];
};
/* Byte offset of `member` within struct tu6_global, i.e. within the global_bo. */
#define gb_offset(member) offsetof(struct tu6_global, member)
/* Address (iova) of `member` inside the global_bo of the device that owns
 * `cmd`; `cmd` is a struct tu_cmd_buffer pointer and is evaluated once.
 */
#define global_iova(cmd, member) ((cmd)->device->global_bo.iova + gb_offset(member))
/* extra space in vsc draw/prim streams:
 * the per-pipe pitches are initialized to <stream size> + VSC_PAD, the
 * VSC_*_STRM_LIMIT registers and the overflow test both use pitch - VSC_PAD,
 * and the pad is excluded when a pitch is doubled after an overflow.
 */
#define VSC_PAD 0x40
struct tu_device
{
VK_LOADER_DATA _loader_data;
@ -358,11 +383,6 @@ struct tu_device
/* Backup in-memory cache to be used if the app doesn't provide one */
struct tu_pipeline_cache *mem_cache;
struct tu_bo vsc_draw_strm;
struct tu_bo vsc_prim_strm;
uint32_t vsc_draw_strm_pitch;
uint32_t vsc_prim_strm_pitch;
#define MIN_SCRATCH_BO_SIZE_LOG2 12 /* A page */
/* Currently the kernel driver uses a 32-bit GPU address space, but it
@ -374,9 +394,13 @@ struct tu_device
bool initialized;
} scratch_bos[48 - MIN_SCRATCH_BO_SIZE_LOG2];
struct tu_bo border_color;
struct tu_bo global_bo;
struct tu_device_extension_table enabled_extensions;
uint32_t vsc_draw_strm_pitch;
uint32_t vsc_prim_strm_pitch;
mtx_t vsc_pitch_mtx;
};
VkResult _tu_device_set_lost(struct tu_device *device,
@ -883,28 +907,6 @@ tu_bo_list_add(struct tu_bo_list *list,
VkResult
tu_bo_list_merge(struct tu_bo_list *list, const struct tu_bo_list *other);
/* This struct defines the layout of the scratch_bo.
 *
 * The scratch_bo is the 0x1000-byte buffer allocated per command buffer in
 * tu_create_cmd_buffer(); command streams address members as
 * scratch_bo.iova + ctrl_offset(member).
 */
struct tu6_control
{
/* First member (offset 0): tu6_emit_event_write() passes the bare
 * scratch_bo.iova as the seqno address.
 */
uint32_t seqno_dummy; /* dummy seqno for CP_EVENT_WRITE */
uint32_t _pad0;
volatile uint32_t vsc_overflow;
uint32_t _pad1;
/* flag set from cmdstream when VSC overflow detected:
 * emit_vsc_overflow_test() first clears it with CP_MEM_WRITE, then
 * conditionally writes 1 + draw stream pitch or 3 + prim stream pitch.
 */
uint32_t vsc_scratch;
uint32_t _pad2;
uint32_t _pad3;
uint32_t _pad4;
/* scratch space for VPC_SO[i].FLUSH_BASE_LO/HI, start on 32 byte boundary. */
struct {
uint32_t offset;
uint32_t pad[7];
} flush_base[4];
};
/* Byte offset of `member` within struct tu6_control (and so within scratch_bo). */
#define ctrl_offset(member) offsetof(struct tu6_control, member)
struct tu_cmd_buffer
{
VK_LOADER_DATA _loader_data;
@ -939,15 +941,10 @@ struct tu_cmd_buffer
struct tu_cs draw_epilogue_cs;
struct tu_cs sub_cs;
struct tu_bo scratch_bo;
bool has_tess;
struct tu_bo vsc_draw_strm;
struct tu_bo vsc_prim_strm;
uint32_t vsc_draw_strm_pitch;
uint32_t vsc_prim_strm_pitch;
bool use_vsc_data;
};
/* Temporary struct for tracking a register state to be written, used by