From 6fa8a6d60f0bbc112870dedbc7e9958f07a85d09 Mon Sep 17 00:00:00 2001
From: Rob Clark
Date: Tue, 30 Apr 2019 10:07:02 -0700
Subject: [PATCH] freedreno/a6xx: KHR_blend_equation_advanced support

Wire up support to sample from the fb (and force GMEM rendering when we
have fb reads).  The existing GLSL IR lowering for
blend_equation_advanced does the rest.

Signed-off-by: Rob Clark
Reviewed-by: Kristian H. Kristensen
---
 src/gallium/drivers/freedreno/a6xx/fd6_emit.c | 71 +++++++++++++++++--
 src/gallium/drivers/freedreno/a6xx/fd6_gmem.c | 13 ++++
 .../drivers/freedreno/a6xx/fd6_program.c      |  1 -
 .../drivers/freedreno/freedreno_batch.c       |  2 +
 .../drivers/freedreno/freedreno_batch.h       |  4 ++
 .../drivers/freedreno/freedreno_screen.c      |  9 +++
 .../drivers/freedreno/freedreno_screen.h      |  1 +
 7 files changed, 96 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_emit.c b/src/gallium/drivers/freedreno/a6xx/fd6_emit.c
index efb4d5fd84b..c17bd49be4d 100644
--- a/src/gallium/drivers/freedreno/a6xx/fd6_emit.c
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_emit.c
@@ -340,11 +340,49 @@ emit_border_color(struct fd_context *ctx, struct fd_ringbuffer *ring)
 	u_upload_unmap(fd6_ctx->border_color_uploader);
 }
 
+static void
+fd6_emit_fb_tex(struct fd_ringbuffer *state, struct fd_context *ctx)
+{
+	struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer;
+	struct pipe_surface *psurf = pfb->cbufs[0];
+	struct fd_resource *rsc = fd_resource(psurf->texture);
+
+	uint32_t texconst0 = fd6_tex_const_0(psurf->texture, psurf->u.tex.level,
+			psurf->format, PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y,
+			PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W);
+
+	/* always TILE6_2 mode in GMEM.. which also means no swap: */
+	texconst0 &= ~(A6XX_TEX_CONST_0_SWAP__MASK | A6XX_TEX_CONST_0_TILE_MODE__MASK);
+	texconst0 |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);
+
+	OUT_RING(state, texconst0);
+	OUT_RING(state, A6XX_TEX_CONST_1_WIDTH(pfb->width) |
+			A6XX_TEX_CONST_1_HEIGHT(pfb->height));
+	OUT_RINGP(state, A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
+			A6XX_TEX_CONST_2_FETCHSIZE(TFETCH6_2_BYTE),
+			&ctx->batch->fb_read_patches);
+	OUT_RING(state, A6XX_TEX_CONST_3_ARRAY_PITCH(rsc->layer_size));
+
+	OUT_RING(state, A6XX_TEX_CONST_4_BASE_LO(ctx->screen->gmem_base));
+	OUT_RING(state, A6XX_TEX_CONST_5_BASE_HI(ctx->screen->gmem_base >> 32) |
+			A6XX_TEX_CONST_5_DEPTH(1));
+	OUT_RING(state, 0);   /* texconst6 */
+	OUT_RING(state, 0);   /* texconst7 */
+	OUT_RING(state, 0);   /* texconst8 */
+	OUT_RING(state, 0);   /* texconst9 */
+	OUT_RING(state, 0);   /* texconst10 */
+	OUT_RING(state, 0);   /* texconst11 */
+	OUT_RING(state, 0);
+	OUT_RING(state, 0);
+	OUT_RING(state, 0);
+	OUT_RING(state, 0);
+}
+
 bool
 fd6_emit_textures(struct fd_pipe *pipe, struct fd_ringbuffer *ring,
 		enum pipe_shader_type type, struct fd_texture_stateobj *tex,
 		unsigned bcolor_offset,
-		/* can be NULL if no image/SSBO state to merge in: */
+		/* can be NULL if no image/SSBO/fb state to merge in: */
 		const struct ir3_shader_variant *v, struct fd_context *ctx)
 {
 	bool needs_border = false;
@@ -412,6 +450,9 @@ fd6_emit_textures(struct fd_pipe *pipe, struct fd_ringbuffer *ring,
 	if (v) {
 		num_merged_textures += v->image_mapping.num_tex;
 
+		if (v->fb_read)
+			num_merged_textures++;
+
 		/* There could be more bound textures than what the shader uses.
 		 * Which isn't known at shader compile time.  So in the case we
 		 * are merging tex state, only emit the textures that the shader
@@ -479,6 +520,10 @@ fd6_emit_textures(struct fd_pipe *pipe, struct fd_ringbuffer *ring,
 				fd6_emit_image_tex(state, &img->si[idx]);
 			}
 		}
+
+		if (v->fb_read) {
+			fd6_emit_fb_tex(state, ctx);
+		}
 	}
 
 	/* emit texture state: */
@@ -528,10 +573,20 @@ fd6_emit_combined_textures(struct fd_ringbuffer *ring, struct fd6_emit *emit,
 
 	debug_assert(state_id[type]);
 
-	if (!v->image_mapping.num_tex) {
+	if (!v->image_mapping.num_tex && !v->fb_read) {
 		/* in the fast-path, when we don't have to mix in any image/SSBO
 		 * related texture state, we can just lookup the stateobj and
 		 * re-emit that:
+		 *
+		 * Also, framebuffer-read is a slow-path because an extra
+		 * texture needs to be inserted.
+		 *
+		 * TODO we can probably simplify things if we also treated
+		 * border_color as a slow-path.. this way the tex state key
+		 * wouldn't depend on bcolor_offset.. but fb_read might rather
+		 * be *somehow* a fast-path if we eventually used it for PLS.
+		 * I suppose there would be no harm in just *always* inserting
+		 * an fb_read texture?
 		 */
 		if ((ctx->dirty_shader[type] & FD_DIRTY_SHADER_TEX) &&
 				ctx->tex[type].num_textures > 0) {
@@ -546,9 +601,10 @@ fd6_emit_combined_textures(struct fd_ringbuffer *ring, struct fd6_emit *emit,
 		/* In the slow-path, create a one-shot texture state object
 		 * if either TEX|PROG|SSBO|IMAGE state is dirty:
 		 */
-		if (ctx->dirty_shader[type] &
+		if ((ctx->dirty_shader[type] &
 				(FD_DIRTY_SHADER_TEX | FD_DIRTY_SHADER_PROG |
-				 FD_DIRTY_SHADER_IMAGE | FD_DIRTY_SHADER_SSBO)) {
+				 FD_DIRTY_SHADER_IMAGE | FD_DIRTY_SHADER_SSBO)) ||
+				v->fb_read) {
 			struct fd_texture_stateobj *tex = &ctx->tex[type];
 			struct fd_ringbuffer *stateobj =
 				fd_submit_new_ringbuffer(ctx->batch->submit,
@@ -738,6 +794,13 @@ fd6_emit_state(struct fd_ringbuffer *ring, struct fd6_emit *emit)
 
 	emit_marker6(ring, 5);
 
+	/* NOTE: we track fb_read differently than _BLEND_ENABLED since
+	 * we might at some point decide to do sysmem in some cases when
+	 * blend is enabled:
+	 */
+	if (fp->fb_read)
+		ctx->batch->gmem_reason |= FD_GMEM_FB_READ;
+
 	if (emit->dirty & (FD_DIRTY_VTXBUF | FD_DIRTY_VTXSTATE)) {
 		struct fd_ringbuffer *state;
 
diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_gmem.c b/src/gallium/drivers/freedreno/a6xx/fd6_gmem.c
index 1c1ca65598c..efdd52d4fc5 100644
--- a/src/gallium/drivers/freedreno/a6xx/fd6_gmem.c
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_gmem.c
@@ -264,6 +264,18 @@ use_hw_binning(struct fd_batch *batch)
 			(batch->num_draws > 0);
 }
 
+static void
+patch_fb_read(struct fd_batch *batch)
+{
+	struct fd_gmem_stateobj *gmem = &batch->ctx->gmem;
+
+	for (unsigned i = 0; i < fd_patch_num_elements(&batch->fb_read_patches); i++) {
+		struct fd_cs_patch *patch = fd_patch_element(&batch->fb_read_patches, i);
+		*patch->cs = patch->val | A6XX_TEX_CONST_2_PITCH(gmem->bin_w * gmem->cbuf_cpp[0]);
+	}
+	util_dynarray_resize(&batch->fb_read_patches, 0);
+}
+
 static void
 patch_draws(struct fd_batch *batch, enum pc_di_vis_cull_mode vismode)
 {
@@ -518,6 +530,7 @@ fd6_emit_tile_init(struct fd_batch *batch)
 	emit_zs(ring, pfb->zsbuf, &ctx->gmem);
 	emit_mrt(ring, pfb, &ctx->gmem);
 	emit_msaa(ring, pfb->samples);
+	patch_fb_read(batch);
 
 	if (use_hw_binning(batch)) {
 		set_bin_size(ring, gmem->bin_w, gmem->bin_h,
diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_program.c b/src/gallium/drivers/freedreno/a6xx/fd6_program.c
index 3f8cdb3cc38..9888e51f86e 100644
--- a/src/gallium/drivers/freedreno/a6xx/fd6_program.c
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_program.c
@@ -596,7 +596,6 @@ setup_stateobj(struct fd_ringbuffer *ring, struct fd6_program_state *state,
 
 	OUT_RING(ring,
 			CONDREG(smask_in_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEMASK) |
-			COND(sample_shading, A6XX_RB_RENDER_CONTROL1_UNK4 | A6XX_RB_RENDER_CONTROL1_UNK5) |
 			CONDREG(samp_id_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEID) |
 			CONDREG(ij_size_regid, A6XX_RB_RENDER_CONTROL1_SIZE) |
 			COND(s[FS].v->frag_face, A6XX_RB_RENDER_CONTROL1_FACENESS));
diff --git a/src/gallium/drivers/freedreno/freedreno_batch.c b/src/gallium/drivers/freedreno/freedreno_batch.c
index 84dfa898c51..14b4e38568c 100644
--- a/src/gallium/drivers/freedreno/freedreno_batch.c
+++ b/src/gallium/drivers/freedreno/freedreno_batch.c
@@ -89,6 +89,7 @@ batch_init(struct fd_batch *batch)
 	fd_reset_wfi(batch);
 
 	util_dynarray_init(&batch->draw_patches, NULL);
+	util_dynarray_init(&batch->fb_read_patches, NULL);
 
 	if (is_a2xx(ctx->screen)) {
 		util_dynarray_init(&batch->shader_patches, NULL);
@@ -168,6 +169,7 @@ batch_fini(struct fd_batch *batch)
 		fd_submit_del(batch->submit);
 
 	util_dynarray_fini(&batch->draw_patches);
+	util_dynarray_fini(&batch->fb_read_patches);
 
 	if (is_a2xx(batch->ctx->screen)) {
 		util_dynarray_fini(&batch->shader_patches);
diff --git a/src/gallium/drivers/freedreno/freedreno_batch.h b/src/gallium/drivers/freedreno/freedreno_batch.h
index e771ad6c0bd..d38bd3f8b22 100644
--- a/src/gallium/drivers/freedreno/freedreno_batch.h
+++ b/src/gallium/drivers/freedreno/freedreno_batch.h
@@ -122,6 +122,7 @@ struct fd_batch {
 		FD_GMEM_BLEND_ENABLED        = 0x10,
 		FD_GMEM_LOGICOP_ENABLED      = 0x20,
+		FD_GMEM_FB_READ              = 0x40,
 	} gmem_reason;
 	unsigned num_draws;      /* number of draws in current batch */
 	unsigned num_vertices;   /* number of vertices in current batch */
 
@@ -137,6 +138,9 @@ struct fd_batch {
 	 */
 	struct util_dynarray draw_patches;
 
+	/* texture state that needs patching for fb_read: */
+	struct util_dynarray fb_read_patches;
+
 	/* Keep track of writes to RB_RENDER_CONTROL which need to be patched
 	 * once we know whether or not to use GMEM, and GMEM tile pitch.
 	 *
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index a437afb0d4e..889737ae30c 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -307,6 +307,11 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
 		return 0;
 
+	case PIPE_CAP_TGSI_FS_FBFETCH:
+		if (fd_device_version(screen->dev) >= FD_VERSION_GMEM_BASE &&
+				is_a6xx(screen))
+			return 1;
+		return 0;
 	case PIPE_CAP_SAMPLE_SHADING:
 		if (is_a6xx(screen)) return 1;
 		return 0;
@@ -784,6 +789,10 @@ fd_screen_create(struct fd_device *dev, struct renderonly *ro)
 	}
 	screen->gmemsize_bytes = val;
 
+	if (fd_device_version(dev) >= FD_VERSION_GMEM_BASE) {
+		fd_pipe_get_param(screen->pipe, FD_GMEM_BASE, &screen->gmem_base);
+	}
+
 	if (fd_pipe_get_param(screen->pipe, FD_DEVICE_ID, &val)) {
 		DBG("could not get device-id");
 		goto fail;
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.h b/src/gallium/drivers/freedreno/freedreno_screen.h
index 35685be1d22..12f3f849b5d 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.h
+++ b/src/gallium/drivers/freedreno/freedreno_screen.h
@@ -59,6 +59,7 @@ struct fd_screen {
 
 	struct slab_parent_pool transfer_pool;
 
+	uint64_t gmem_base;
 	uint32_t gmemsize_bytes;
 	uint32_t device_id;
 	uint32_t gpu_id;         /* 220, 305, etc */
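
Aside (illustration only, not part of the patch): fb_read_patches reuses the same
deferred-patch scheme as draw_patches.  OUT_RINGP() emits TEX_CONST_2 with only
the bits known at draw time and records where that dword lives, and
patch_fb_read() ORs in A6XX_TEX_CONST_2_PITCH(gmem->bin_w * gmem->cbuf_cpp[0])
once the GMEM bin layout is decided in fd6_emit_tile_init().  Below is a minimal
standalone C sketch of that idea; the names, the fixed-size patch array, and the
bitfield placement are made up for the example and are not the driver's helpers
(the real code uses util_dynarray with fd_patch_num_elements()/fd_patch_element()):

/* Standalone sketch of the deferred-patch idea (illustrative names only). */
#include <stdint.h>
#include <stdio.h>

struct cs_patch {
	uint32_t *cs;    /* location of the dword in the command stream */
	uint32_t  val;   /* bits that were already known when it was emitted */
};

#define MAX_PATCHES 16
static struct cs_patch patches[MAX_PATCHES];
static unsigned num_patches;

/* emit a dword whose final value depends on state only known at flush time */
static void
emit_patchable(uint32_t *cs, uint32_t known_bits)
{
	*cs = known_bits;
	patches[num_patches++] = (struct cs_patch) { .cs = cs, .val = known_bits };
}

/* once the bin width is known, resolve every recorded location */
static void
resolve_patches(unsigned bin_w, unsigned cpp)
{
	/* stand-in for A6XX_TEX_CONST_2_PITCH(); the shift is arbitrary here */
	uint32_t pitch_bits = (bin_w * cpp) << 7;

	for (unsigned i = 0; i < num_patches; i++)
		*patches[i].cs = patches[i].val | pitch_bits;
	num_patches = 0;
}

int
main(void)
{
	uint32_t ring[4] = { 0 };

	/* record time: type/fetchsize bits known, pitch not yet */
	emit_patchable(&ring[2], 0x0000000a);
	/* tile-init time: gmem->bin_w and gmem->cbuf_cpp[0] are now decided */
	resolve_patches(32, 4);

	printf("texconst2 = 0x%08x\n", ring[2]);
	return 0;
}

The driver-side differences are only that the patch list lives on the batch
(initialized in batch_init(), freed in batch_fini(), and reset by patch_fb_read()
after resolving) and that the pitch is packed with the real
A6XX_TEX_CONST_2_PITCH() macro.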