freedreno/a6xx: hwbinning

Signed-off-by: Rob Clark <robdclark@gmail.com>
This commit is contained in:
Rob Clark 2018-09-11 15:59:22 -04:00
parent 8ff349e564
commit ae78489d3e
8 changed files with 159 additions and 105 deletions

View File

@ -49,7 +49,8 @@ fd6_context_destroy(struct pipe_context *pctx)
fd_bo_del(fd6_ctx->vs_pvt_mem);
fd_bo_del(fd6_ctx->fs_pvt_mem);
fd_bo_del(fd6_ctx->vsc_size_mem);
fd_bo_del(fd6_ctx->vsc_data);
fd_bo_del(fd6_ctx->vsc_data2);
fd_bo_del(fd6_ctx->blit_mem);
fd_context_cleanup_common_vbos(&fd6_ctx->base);
@ -104,7 +105,12 @@ fd6_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags)
fd6_ctx->fs_pvt_mem = fd_bo_new(screen->dev, 0x2000,
DRM_FREEDRENO_GEM_TYPE_KMEM);
fd6_ctx->vsc_size_mem = fd_bo_new(screen->dev, 0x1000,
fd6_ctx->vsc_data = fd_bo_new(screen->dev,
(A6XX_VSC_DATA_PITCH * 32) + 0x100,
DRM_FREEDRENO_GEM_TYPE_KMEM);
fd6_ctx->vsc_data2 = fd_bo_new(screen->dev,
A6XX_VSC_DATA2_PITCH * 32,
DRM_FREEDRENO_GEM_TYPE_KMEM);
fd6_ctx->blit_mem = fd_bo_new(screen->dev, 0x1000,

View File

@ -50,10 +50,20 @@ struct fd6_context {
struct fd_bo *vs_pvt_mem, *fs_pvt_mem;
/* This only needs to be 4 * num_of_pipes bytes (ie. 32 bytes). We
* could combine it with another allocation.
/* Two buffers related to hw binning / visibility stream (VSC).
* Compared to previous generations
* (1) we cannot specify individual buffers per VSC, instead
* just a pitch and base address
* (2) there is a second smaller buffer, for something.. we
* also stash VSC_BIN_SIZE at end of 2nd buffer.
*/
struct fd_bo *vsc_size_mem;
struct fd_bo *vsc_data, *vsc_data2;
// TODO annoyingly large sizes to prevent hangs with larger amounts
// of geometry, like aquarium with max # of fish. Need to figure
// out how to calculate the required size.
#define A6XX_VSC_DATA_PITCH 0x4400
#define A6XX_VSC_DATA2_PITCH 0x10400
/* TODO not sure what this is for.. probably similar to
* CACHE_FLUSH_TS on kernel side, where value gets written

View File

@ -248,7 +248,8 @@ fd6_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info,
/* figure out whether we need to disable LRZ write for binning
* pass using draw pass's fp:
*/
emit.no_lrz_write = fp->writes_pos || fp->has_kill;
// TODO disable until lrz is wired up:
emit.no_lrz_write = true; // fp->writes_pos || fp->has_kill;
emit.key.binning_pass = false;
emit.dirty = dirty;

View File

@ -214,7 +214,12 @@ emit_zs(struct fd_ringbuffer *ring, struct pipe_surface *zsbuf,
/*
 * Decide whether the hardware binning pass is used for this batch.
 *
 * NOTE(review): as the text stands, the leading `return false;` makes
 * everything after it unreachable.  This looks like residue of a diff whose
 * +/- markers were lost (old body: `return false;`, new body: the lines
 * that follow) -- confirm against the real file.
 */
static bool
use_hw_binning(struct fd_batch *batch)
{
return false;
struct fd_gmem_stateobj *gmem = &batch->ctx->gmem;
// TODO figure out hw limits for binning
/* requires the global fd_binning_enabled switch, more than two bins in
 * total, and at least one draw recorded in the batch:
 */
return fd_binning_enabled && ((gmem->nbins_x * gmem->nbins_y) > 2) &&
(batch->num_draws > 0);
}
static void
@ -244,28 +249,44 @@ patch_gmem_bases(struct fd_batch *batch)
util_dynarray_resize(&batch->gmem_patches, 0);
}
/*
 * Emit a write of RB_RENDER_CNTL into the batch's gmem ringbuffer.
 *
 * The UNK4 bit is always set; the BINNING bit is added only when
 * `binning` is true.
 */
static void
update_render_cntl(struct fd_batch *batch, bool binning)
{
struct fd_ringbuffer *ring = batch->gmem;
uint32_t cntl = 0;
cntl |= A6XX_RB_RENDER_CNTL_UNK4;
if (binning)
cntl |= A6XX_RB_RENDER_CNTL_BINNING;
/* The register is written through a 3-dword CP_REG_WRITE packet
 * (0x2, register offset, value) instead of an OUT_PKT4 register write.
 * NOTE(review): the meaning of the leading 0x2 dword is not visible
 * here -- confirm against the packet definition.
 */
OUT_PKT7(ring, CP_REG_WRITE, 3);
OUT_RING(ring, 0x2);
OUT_RING(ring, REG_A6XX_RB_RENDER_CNTL);
OUT_RING(ring, cntl);
}
static void
update_vsc_pipe(struct fd_batch *batch)
{
struct fd_context *ctx = batch->ctx;
struct fd6_context *fd6_ctx = fd6_context(ctx);
struct fd_gmem_stateobj *gmem = &batch->ctx->gmem;
struct fd_gmem_stateobj *gmem = &ctx->gmem;
struct fd_ringbuffer *ring = batch->gmem;
unsigned n = gmem->nbins_x * gmem->nbins_y;
int i;
OUT_PKT4(ring, REG_A6XX_VSC_BIN_SIZE, 3);
OUT_RING(ring, A6XX_VSC_BIN_SIZE_WIDTH(gmem->bin_w) |
A6XX_VSC_BIN_SIZE_HEIGHT(gmem->bin_h));
OUT_RELOCW(ring, fd6_ctx->vsc_size_mem, 0, 0, 0); /* VSC_SIZE_ADDRESS_LO/HI */
OUT_RELOCW(ring, fd6_ctx->vsc_data,
n * A6XX_VSC_DATA_PITCH, 0, 0); /* VSC_SIZE_ADDRESS_LO/HI */
#if 0
OUT_PKT4(ring, REG_A6XX_UNKNOWN_0BC5, 2);
OUT_RING(ring, 0x00000000); /* UNKNOWN_0BC5 */
OUT_RING(ring, 0x00000000); /* UNKNOWN_0BC6 */
#endif
OUT_PKT4(ring, REG_A6XX_VSC_BIN_COUNT, 1);
OUT_RING(ring, A6XX_VSC_BIN_COUNT_NX(gmem->nbins_x) |
A6XX_VSC_BIN_COUNT_NY(gmem->nbins_y));
OUT_PKT4(ring, REG_A6XX_VSC_PIPE_CONFIG_REG(0), 16);
for (i = 0; i < 16; i++) {
OUT_PKT4(ring, REG_A6XX_VSC_PIPE_CONFIG_REG(0), 32);
for (i = 0; i < 32; i++) {
struct fd_vsc_pipe *pipe = &ctx->vsc_pipe[i];
OUT_RING(ring, A6XX_VSC_PIPE_CONFIG_REG_X(pipe->x) |
A6XX_VSC_PIPE_CONFIG_REG_Y(pipe->y) |
@ -273,25 +294,15 @@ update_vsc_pipe(struct fd_batch *batch)
A6XX_VSC_PIPE_CONFIG_REG_H(pipe->h));
}
#if 0
OUT_PKT4(ring, REG_A6XX_VSC_PIPE_DATA_ADDRESS_LO(0), 32);
for (i = 0; i < 16; i++) {
struct fd_vsc_pipe *pipe = &ctx->vsc_pipe[i];
if (!pipe->bo) {
pipe->bo = fd_bo_new(ctx->dev, 0x20000,
DRM_FREEDRENO_GEM_TYPE_KMEM);
}
OUT_RELOCW(ring, pipe->bo, 0, 0, 0); /* VSC_PIPE_DATA_ADDRESS[i].LO/HI */
}
#endif
OUT_PKT4(ring, REG_A6XX_VSC_PIPE_DATA2_ADDRESS_LO, 4);
OUT_RELOCW(ring, fd6_ctx->vsc_data2, 0, 0, 0);
OUT_RING(ring, A6XX_VSC_DATA2_PITCH);
OUT_RING(ring, fd_bo_size(fd6_ctx->vsc_data2));
#if 0
OUT_PKT4(ring, REG_A6XX_VSC_PIPE_DATA_LENGTH_REG(0), 16);
for (i = 0; i < 16; i++) {
struct fd_vsc_pipe *pipe = &ctx->vsc_pipe[i];
OUT_RING(ring, fd_bo_size(pipe->bo) - 32); /* VSC_PIPE_DATA_LENGTH[i] */
}
#endif
OUT_PKT4(ring, REG_A6XX_VSC_PIPE_DATA_ADDRESS_LO, 4);
OUT_RELOCW(ring, fd6_ctx->vsc_data, 0, 0, 0);
OUT_RING(ring, A6XX_VSC_DATA_PITCH);
OUT_RING(ring, fd_bo_size(fd6_ctx->vsc_data));
}
static void
@ -310,6 +321,23 @@ set_scissor(struct fd_ringbuffer *ring, uint32_t x1, uint32_t y1, uint32_t x2, u
A6XX_GRAS_RESOLVE_CNTL_2_Y(y2));
}
/*
 * Program the bin dimensions into the three bin-control registers:
 * GRAS_BIN_CONTROL, RB_BIN_CONTROL and RB_BIN_CONTROL2.
 *
 * ring: ringbuffer the register writes are emitted into
 * w, h: bin width and height, packed with the per-register BINW/BINH macros
 * flag: extra bits OR'd verbatim into GRAS_BIN_CONTROL and RB_BIN_CONTROL
 *       (not applied to RB_BIN_CONTROL2)
 */
static void
set_bin_size(struct fd_ringbuffer *ring, uint32_t w, uint32_t h, uint32_t flag)
{
OUT_PKT4(ring, REG_A6XX_GRAS_BIN_CONTROL, 1);
OUT_RING(ring, A6XX_GRAS_BIN_CONTROL_BINW(w) |
A6XX_GRAS_BIN_CONTROL_BINH(h) | flag);
OUT_PKT4(ring, REG_A6XX_RB_BIN_CONTROL, 1);
OUT_RING(ring, A6XX_RB_BIN_CONTROL_BINW(w) |
A6XX_RB_BIN_CONTROL_BINH(h) | flag);
/* no flag for RB_BIN_CONTROL2... */
OUT_PKT4(ring, REG_A6XX_RB_BIN_CONTROL2, 1);
OUT_RING(ring, A6XX_RB_BIN_CONTROL2_BINW(w) |
A6XX_RB_BIN_CONTROL2_BINH(h));
}
static void
emit_binning_pass(struct fd_batch *batch)
{
@ -322,25 +350,31 @@ emit_binning_pass(struct fd_batch *batch)
uint32_t x2 = gmem->minx + gmem->width - 1;
uint32_t y2 = gmem->miny + gmem->height - 1;
set_scissor(ring, x1, y1, x2, y2);
emit_marker6(ring, 7);
OUT_PKT7(ring, CP_SET_MARKER, 1);
OUT_RING(ring, A2XX_CP_SET_MARKER_0_MODE(RM6_BINNING) | 0x10); /* | 0x10 ? */
OUT_RING(ring, A2XX_CP_SET_MARKER_0_MODE(RM6_BINNING));
emit_marker6(ring, 7);
#if 0
OUT_PKT4(ring, REG_A6XX_RB_CNTL, 1);
OUT_RING(ring, A6XX_RB_CNTL_WIDTH(gmem->bin_w) |
A6XX_RB_CNTL_HEIGHT(gmem->bin_h));
#endif
OUT_PKT7(ring, CP_SET_VISIBILITY_OVERRIDE, 1);
OUT_RING(ring, 0x1);
set_scissor(ring, x1, y1, x2, y2);
OUT_PKT7(ring, CP_SET_MODE, 1);
OUT_RING(ring, 0x1);
OUT_WFI5(ring);
OUT_PKT4(ring, REG_A6XX_VFD_MODE_CNTL, 1);
OUT_RING(ring, A6XX_VFD_MODE_CNTL_BINNING_PASS);
update_vsc_pipe(batch);
#if 0
OUT_PKT4(ring, REG_A6XX_VPC_MODE_CNTL, 1);
OUT_RING(ring, A6XX_VPC_MODE_CNTL_BINNING_PASS);
#endif
OUT_PKT4(ring, REG_A6XX_PC_UNKNOWN_9805, 1);
OUT_RING(ring, 0x1);
OUT_PKT4(ring, REG_A6XX_SP_UNKNOWN_A0F8, 1);
OUT_RING(ring, 0x1);
OUT_PKT7(ring, CP_EVENT_WRITE, 1);
OUT_RING(ring, UNK_2C);
@ -349,11 +383,22 @@ emit_binning_pass(struct fd_batch *batch)
OUT_RING(ring, A6XX_RB_WINDOW_OFFSET_X(0) |
A6XX_RB_WINDOW_OFFSET_Y(0));
OUT_PKT4(ring, REG_A6XX_SP_TP_WINDOW_OFFSET, 1);
OUT_RING(ring, A6XX_SP_TP_WINDOW_OFFSET_X(0) |
A6XX_SP_TP_WINDOW_OFFSET_Y(0));
/* emit IB to binning drawcmds: */
ctx->emit_ib(ring, batch->binning);
fd_reset_wfi(batch);
OUT_PKT7(ring, CP_SET_DRAW_STATE, 3);
OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(0) |
CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
CP_SET_DRAW_STATE__0_GROUP_ID(0));
OUT_RING(ring, CP_SET_DRAW_STATE__1_ADDR_LO(0));
OUT_RING(ring, CP_SET_DRAW_STATE__2_ADDR_HI(0));
OUT_PKT7(ring, CP_EVENT_WRITE, 1);
OUT_RING(ring, UNK_2D);
@ -362,14 +407,7 @@ emit_binning_pass(struct fd_batch *batch)
OUT_RELOCW(ring, fd6_context(ctx)->blit_mem, 0, 0, 0); /* ADDR_LO/HI */
OUT_RING(ring, 0x00000000);
// TODO CP_COND_WRITE's for all the vsc buffers (check for overflow??)
fd_wfi(batch, ring);
#if 0
OUT_PKT4(ring, REG_A6XX_VPC_MODE_CNTL, 1);
OUT_RING(ring, 0x0);
#endif
}
static void
@ -392,23 +430,6 @@ disable_msaa(struct fd_ringbuffer *ring)
A6XX_RB_DEST_MSAA_CNTL_MSAA_DISABLE);
}
/*
 * Program the bin dimensions into GRAS_BIN_CONTROL, RB_BIN_CONTROL and
 * RB_BIN_CONTROL2; `flag` is OR'd into the first two registers only.
 *
 * NOTE(review): a definition with the same name and the same code appears
 * earlier in this text; this copy looks like the pre-move version shown by
 * a diff -- confirm which one exists in the real file.
 */
static void
set_bin_size(struct fd_ringbuffer *ring, uint32_t w, uint32_t h, uint32_t flag)
{
OUT_PKT4(ring, REG_A6XX_GRAS_BIN_CONTROL, 1);
OUT_RING(ring, A6XX_GRAS_BIN_CONTROL_BINW(w) |
A6XX_GRAS_BIN_CONTROL_BINH(h) | flag);
OUT_PKT4(ring, REG_A6XX_RB_BIN_CONTROL, 1);
OUT_RING(ring, A6XX_RB_BIN_CONTROL_BINW(w) |
A6XX_RB_BIN_CONTROL_BINH(h) | flag);
/* no flag for X3_BIN_SIZE... */
OUT_PKT4(ring, REG_A6XX_RB_BIN_CONTROL2, 1);
OUT_RING(ring, A6XX_RB_BIN_CONTROL2_BINW(w) |
A6XX_RB_BIN_CONTROL2_BINH(h));
}
/* before first tile */
static void
fd6_emit_tile_init(struct fd_batch *batch)
@ -428,46 +449,40 @@ fd6_emit_tile_init(struct fd_batch *batch)
OUT_PKT7(ring, CP_EVENT_WRITE, 1);
OUT_RING(ring, 0x31); /* vertex cache invalidate? */
#if 0
OUT_PKT4(ring, REG_A6XX_GRAS_CL_CNTL, 1);
OUT_RING(ring, 0x00000080); /* GRAS_CL_CNTL */
#endif
OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
OUT_RING(ring, 0x0);
#if 0
OUT_PKT4(ring, REG_A6XX_PC_POWER_CNTL, 1);
OUT_RING(ring, 0x00000003); /* PC_POWER_CNTL */
#endif
#if 0
OUT_PKT4(ring, REG_A6XX_VFD_POWER_CNTL, 1);
OUT_RING(ring, 0x00000003); /* VFD_POWER_CNTL */
#endif
/* 0x10000000 for BYPASS.. 0x7c13c080 for GMEM: */
fd_wfi(batch, ring);
OUT_PKT4(ring, REG_A6XX_RB_CCU_CNTL, 1);
OUT_RING(ring, 0x7c400004); /* RB_CCU_CNTL */
DBG("emit_mrt");
emit_zs(ring, pfb->zsbuf, &ctx->gmem);
emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, &ctx->gmem);
patch_gmem_bases(batch);
set_bin_size(ring, gmem->bin_w, gmem->bin_h, 0x6000000);
disable_msaa(ring);
if (use_hw_binning(batch)) {
set_bin_size(ring, gmem->bin_w, gmem->bin_h,
A6XX_RB_BIN_CONTROL_BINNING_PASS | 0x6000000);
update_render_cntl(batch, true);
emit_binning_pass(batch);
fd6_emit_lrz_flush(ring);
patch_draws(batch, USE_VISIBILITY);
set_bin_size(ring, gmem->bin_w, gmem->bin_h,
A6XX_RB_BIN_CONTROL_USE_VIZ | 0x6000000);
OUT_PKT4(ring, REG_A6XX_VFD_MODE_CNTL, 1);
OUT_RING(ring, 0x0);
} else {
set_bin_size(ring, gmem->bin_w, gmem->bin_h, 0x6000000);
patch_draws(batch, IGNORE_VISIBILITY);
}
update_render_cntl(batch, false);
}
static void
@ -498,9 +513,12 @@ fd6_emit_tile_prep(struct fd_batch *batch, struct fd_tile *tile)
struct fd6_context *fd6_ctx = fd6_context(ctx);
struct fd_ringbuffer *ring = batch->gmem;
OUT_PKT7(ring, CP_SET_MARKER, 1);
OUT_RING(ring, A2XX_CP_SET_MARKER_0_MODE(0x7));
emit_marker6(ring, 7);
OUT_PKT7(ring, CP_SET_MARKER, 1);
OUT_RING(ring, A2XX_CP_SET_MARKER_0_MODE(RM6_GMEM) | 0x10); /* | 0x10 ? */
OUT_RING(ring, A2XX_CP_SET_MARKER_0_MODE(RM6_GMEM) | 0x10);
emit_marker6(ring, 7);
uint32_t x1 = tile->xoff;
@ -516,26 +534,34 @@ fd6_emit_tile_prep(struct fd_batch *batch, struct fd_tile *tile)
OUT_RING(ring, A6XX_VPC_SO_OVERRIDE_SO_DISABLE);
if (use_hw_binning(batch)) {
struct fd_gmem_stateobj *gmem = &ctx->gmem;
struct fd_vsc_pipe *pipe = &ctx->vsc_pipe[tile->p];
unsigned n = gmem->nbins_x * gmem->nbins_y;
OUT_PKT7(ring, CP_WAIT_FOR_ME, 0);
OUT_PKT7(ring, CP_SET_VISIBILITY_OVERRIDE, 1);
OUT_RING(ring, 0x0);
OUT_PKT7(ring, CP_SET_BIN_DATA5, 5);
OUT_PKT7(ring, CP_SET_MODE, 1);
OUT_RING(ring, 0x0);
OUT_PKT7(ring, CP_SET_BIN_DATA5, 7);
OUT_RING(ring, CP_SET_BIN_DATA5_0_VSC_SIZE(pipe->w * pipe->h) |
CP_SET_BIN_DATA5_0_VSC_N(tile->n));
OUT_RELOC(ring, pipe->bo, 0, 0, 0); /* VSC_PIPE[p].DATA_ADDRESS */
OUT_RELOC(ring, fd6_ctx->vsc_size_mem, /* VSC_SIZE_ADDRESS + (p * 4) */
(tile->p * 4), 0, 0);
OUT_RELOC(ring, fd6_ctx->vsc_data, /* VSC_PIPE[p].DATA_ADDRESS */
(tile->p * A6XX_VSC_DATA_PITCH), 0, 0);
OUT_RELOC(ring, fd6_ctx->vsc_data, /* VSC_SIZE_ADDRESS + (p * 4) */
(tile->p * 4) + (n * A6XX_VSC_DATA_PITCH), 0, 0);
OUT_RELOC(ring, fd6_ctx->vsc_data2,
(tile->p * A6XX_VSC_DATA2_PITCH), 0, 0);
} else {
OUT_PKT7(ring, CP_SET_VISIBILITY_OVERRIDE, 1);
OUT_RING(ring, 0x1);
}
OUT_PKT7(ring, CP_SET_MODE, 1);
OUT_RING(ring, 0x0);
OUT_PKT7(ring, CP_SET_MODE, 1);
OUT_RING(ring, 0x0);
}
}
static void
@ -719,12 +745,17 @@ fd6_emit_tile_gmem2mem(struct fd_batch *batch, struct fd_tile *tile)
struct pipe_framebuffer_state *pfb = &batch->framebuffer;
struct fd_ringbuffer *ring = batch->gmem;
if (use_hw_binning(batch)) {
OUT_PKT7(ring, CP_SET_MARKER, 1);
OUT_RING(ring, A2XX_CP_SET_MARKER_0_MODE(0x5) | 0x10);
}
OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
OUT_RING(ring, 0x0);
emit_marker6(ring, 7);
OUT_PKT7(ring, CP_SET_MARKER, 1);
OUT_RING(ring, A2XX_CP_SET_MARKER_0_MODE(RM6_RESOLVE)); /* | 0x10 ? */
OUT_RING(ring, A2XX_CP_SET_MARKER_0_MODE(RM6_RESOLVE) | 0x10);
emit_marker6(ring, 7);
set_blit_scissor(batch);

View File

@ -258,7 +258,7 @@ struct fd_context {
* means we'd always have to recalc tiles every batch)
*/
struct fd_gmem_stateobj gmem;
struct fd_vsc_pipe vsc_pipe[16];
struct fd_vsc_pipe vsc_pipe[32];
struct fd_tile tile[512];
/* which state objects need to be re-emit'd: */

View File

@ -107,17 +107,18 @@ static void
calculate_tiles(struct fd_batch *batch)
{
struct fd_context *ctx = batch->ctx;
struct fd_screen *screen = ctx->screen;
struct fd_gmem_stateobj *gmem = &ctx->gmem;
struct pipe_scissor_state *scissor = &batch->max_scissor;
struct pipe_framebuffer_state *pfb = &batch->framebuffer;
const uint32_t gmem_alignw = ctx->screen->gmem_alignw;
const uint32_t gmem_alignh = ctx->screen->gmem_alignh;
const unsigned npipes = ctx->screen->num_vsc_pipes;
const uint32_t gmem_size = ctx->screen->gmemsize_bytes;
const uint32_t gmem_alignw = screen->gmem_alignw;
const uint32_t gmem_alignh = screen->gmem_alignh;
const unsigned npipes = screen->num_vsc_pipes;
const uint32_t gmem_size = screen->gmemsize_bytes;
uint32_t minx, miny, width, height;
uint32_t nbins_x = 1, nbins_y = 1;
uint32_t bin_w, bin_h;
uint32_t max_width = bin_width(ctx->screen);
uint32_t max_width = bin_width(screen);
uint8_t cbuf_cpp[MAX_RENDER_TARGETS] = {0}, zsbuf_cpp[2] = {0};
uint32_t i, j, t, xoff, yoff;
uint32_t tpp_x, tpp_y;
@ -216,10 +217,10 @@ calculate_tiles(struct fd_batch *batch)
#define div_round_up(v, a) (((v) + (a) - 1) / (a))
/* figure out number of tiles per pipe: */
tpp_x = tpp_y = 1;
while (div_round_up(nbins_y, tpp_y) > 8)
while (div_round_up(nbins_y, tpp_y) > screen->num_vsc_pipes)
tpp_y += 2;
while ((div_round_up(nbins_y, tpp_y) *
div_round_up(nbins_x, tpp_x)) > 8)
div_round_up(nbins_x, tpp_x)) > screen->num_vsc_pipes)
tpp_x += 1;
gmem->maxpw = tpp_x;

View File

@ -35,6 +35,7 @@
/* per-pipe configuration for hw binning: */
/* One visibility-stream (VSC) pipe: its optional private buffer plus the
 * rectangle of bins it covers.
 */
struct fd_vsc_pipe {
// TODO a3xx/a4xx/a5xx could probably move to single bo for vsc stream, like a6xx does
/* per-pipe buffer, emitted as VSC_PIPE[p].DATA_ADDRESS by the code paths
 * that still use one buffer per pipe:
 */
struct fd_bo *bo;
/* position (x, y) and size (w, h) packed into the pipe's config register;
 * NOTE(review): units appear to be bins, not pixels -- confirm against
 * calculate_tiles().
 */
uint8_t x, y, w, h; /* VSC_PIPE[p].CONFIG */
};

View File

@ -822,7 +822,11 @@ fd_screen_create(struct fd_device *dev)
goto fail;
}
if (screen->gpu_id >= 500) {
if (screen->gpu_id >= 600) {
screen->gmem_alignw = 32;
screen->gmem_alignh = 32;
screen->num_vsc_pipes = 32;
} else if (screen->gpu_id >= 500) {
screen->gmem_alignw = 64;
screen->gmem_alignh = 32;
screen->num_vsc_pipes = 16;