From b4e94af37d454e42ba18c807b5d8db89746dd96d Mon Sep 17 00:00:00 2001
From: Rob Clark
Date: Wed, 10 Oct 2018 15:58:57 -0400
Subject: [PATCH] freedreno/a6xx: use program cache

Use the in-memory cache to construct shader program state and re-use it
on subsequent draws, to lower driver overhead.

Signed-off-by: Rob Clark
---
Review notes (this section is not part of the commit message):
 - fd6_program_emit() slow path: the per-component loop now has its own
   counter.  It used to assign to the enclosing loop's 'i', clobbering the
   index used to walk state->fs_inputs[].
 - NOTE(review): line breaks and indentation of this patch were restored
   from a whitespace-collapsed copy; if context lines do not match the
   tree, apply with --ignore-whitespace.  The post-image blob id on the
   fd6_program.c "index" line was not regenerated.

 .../drivers/freedreno/a6xx/fd6_context.c      |   2 +
 .../drivers/freedreno/a6xx/fd6_context.h      |  18 +-
 src/gallium/drivers/freedreno/a6xx/fd6_draw.c |  63 +++---
 src/gallium/drivers/freedreno/a6xx/fd6_emit.c |  24 ++-
 src/gallium/drivers/freedreno/a6xx/fd6_emit.h |  46 ++---
 .../drivers/freedreno/a6xx/fd6_program.c      | 199 ++++++++++++------
 .../drivers/freedreno/a6xx/fd6_program.h      |  35 +++-
 7 files changed, 252 insertions(+), 135 deletions(-)

diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_context.c b/src/gallium/drivers/freedreno/a6xx/fd6_context.c
index ab10ccb113b..33f769619f3 100644
--- a/src/gallium/drivers/freedreno/a6xx/fd6_context.c
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_context.c
@@ -56,6 +56,8 @@ fd6_context_destroy(struct pipe_context *pctx)
 
 	fd_context_cleanup_common_vbos(&fd6_ctx->base);
 
+	ir3_cache_destroy(fd6_ctx->shader_cache);
+
 	fd6_texture_fini(pctx);
 
 	free(fd6_ctx);
diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_context.h b/src/gallium/drivers/freedreno/a6xx/fd6_context.h
index 85245c8a65f..43a1b1837c4 100644
--- a/src/gallium/drivers/freedreno/a6xx/fd6_context.h
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_context.h
@@ -38,13 +38,6 @@
 
 #include "a6xx.xml.h"
 
-struct fd6_streamout_state {
-	uint32_t ncomp[PIPE_MAX_SO_BUFFERS];
-	uint32_t prog[256/2];
-	uint32_t prog_count;
-	uint32_t vpc_so_buf_cntl;
-};
-
 struct fd6_context {
 	struct fd_context base;
 
@@ -101,10 +94,13 @@ struct fd6_context {
 	/* number of active samples-passed queries: */
 	int samples_passed_queries;
 
-	/* cached state about current emitted shader program (3d): */
-	/*{*/
-	struct fd6_streamout_state tf;
-	/*}*/
+	/* maps per-shader-stage state plus variant key to hw
+	 * program stateobj:
+	 */
+	struct ir3_cache *shader_cache;
+
+	/* cached stateobjs to avoid hashtable lookup when not dirty: */
+	const struct fd6_program_state *prog;
 
 	uint16_t tex_seqno;
 	struct hash_table *tex_cache;
diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_draw.c b/src/gallium/drivers/freedreno/a6xx/fd6_draw.c
index be13f5f6098..0061a6d094a 100644
--- a/src/gallium/drivers/freedreno/a6xx/fd6_draw.c
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_draw.c
@@ -199,41 +199,52 @@ fd6_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info,
 {
 	struct fd6_context *fd6_ctx = fd6_context(ctx);
 	struct fd6_emit emit = {
-		.debug = &ctx->debug,
+		.ctx = ctx,
 		.vtx = &ctx->vtx,
-		.prog = &ctx->prog,
 		.info = info,
 		.key = {
-			.color_two_side = ctx->rasterizer->light_twoside,
-			.vclamp_color = ctx->rasterizer->clamp_vertex_color,
-			.fclamp_color = ctx->rasterizer->clamp_fragment_color,
-			.rasterflat = ctx->rasterizer->flatshade,
-			.half_precision = ctx->in_blit &&
-					fd_half_precision(&ctx->batch->framebuffer),
-			.ucp_enables = ctx->rasterizer->clip_plane_enable,
-			.has_per_samp = (fd6_ctx->fsaturate || fd6_ctx->vsaturate ||
-					fd6_ctx->fastc_srgb || fd6_ctx->vastc_srgb),
-			.vsaturate_s = fd6_ctx->vsaturate_s,
-			.vsaturate_t = fd6_ctx->vsaturate_t,
-			.vsaturate_r = fd6_ctx->vsaturate_r,
-			.fsaturate_s = fd6_ctx->fsaturate_s,
-			.fsaturate_t = fd6_ctx->fsaturate_t,
-			.fsaturate_r = fd6_ctx->fsaturate_r,
-			.vastc_srgb = fd6_ctx->vastc_srgb,
-			.fastc_srgb = fd6_ctx->fastc_srgb,
-			.vsamples = ctx->tex[PIPE_SHADER_VERTEX].samples,
-			.fsamples = ctx->tex[PIPE_SHADER_FRAGMENT].samples,
+			.vs = ctx->prog.vp,
+			.fs = ctx->prog.fp,
+			.key = {
+				.color_two_side = ctx->rasterizer->light_twoside,
+				.vclamp_color = ctx->rasterizer->clamp_vertex_color,
+				.fclamp_color = ctx->rasterizer->clamp_fragment_color,
+				.rasterflat = ctx->rasterizer->flatshade,
+				.ucp_enables = ctx->rasterizer->clip_plane_enable,
+				.has_per_samp = (fd6_ctx->fsaturate || fd6_ctx->vsaturate ||
+						fd6_ctx->fastc_srgb || fd6_ctx->vastc_srgb),
+				.vsaturate_s = fd6_ctx->vsaturate_s,
+				.vsaturate_t = fd6_ctx->vsaturate_t,
+				.vsaturate_r = fd6_ctx->vsaturate_r,
+				.fsaturate_s = fd6_ctx->fsaturate_s,
+				.fsaturate_t = fd6_ctx->fsaturate_t,
+				.fsaturate_r = fd6_ctx->fsaturate_r,
+				.vastc_srgb = fd6_ctx->vastc_srgb,
+				.fastc_srgb = fd6_ctx->fastc_srgb,
+				.vsamples = ctx->tex[PIPE_SHADER_VERTEX].samples,
+				.fsamples = ctx->tex[PIPE_SHADER_FRAGMENT].samples,
+			}
 		},
 		.rasterflat = ctx->rasterizer->flatshade,
 		.sprite_coord_enable = ctx->rasterizer->sprite_coord_enable,
 		.sprite_coord_mode = ctx->rasterizer->sprite_coord_mode,
 	};
 
-	fixup_shader_state(ctx, &emit.key);
+	fixup_shader_state(ctx, &emit.key.key);
 
 	unsigned dirty = ctx->dirty;
-	const struct ir3_shader_variant *vp = fd6_emit_get_vp(&emit);
-	const struct ir3_shader_variant *fp = fd6_emit_get_fp(&emit);
+
+	if (!(dirty & FD_DIRTY_PROG)) {
+		emit.prog = fd6_ctx->prog;
+	} else {
+		fd6_ctx->prog = fd6_emit_get_prog(&emit);
+	}
+
+	emit.vs = fd6_emit_get_prog(&emit)->vs;
+	emit.fs = fd6_emit_get_prog(&emit)->fs;
+
+	const struct ir3_shader_variant *vp = emit.vs;
+	const struct ir3_shader_variant *fp = emit.fs;
 
 	/* do regular pass first, since that is more likely to fail compiling: */
 
@@ -256,8 +267,8 @@ fd6_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info,
 	/* and now binning pass: */
 	emit.binning_pass = true;
 	emit.dirty = dirty & ~(FD_DIRTY_BLEND);
-	emit.vp = NULL;   /* we changed key so need to refetch vp */
-	emit.fp = NULL;
+	emit.vs = fd6_emit_get_prog(&emit)->bs;
+
 	draw_impl(ctx, ctx->batch->binning, &emit, index_offset);
 
 	if (emit.streamout_mask) {
diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_emit.c b/src/gallium/drivers/freedreno/a6xx/fd6_emit.c
index 706386af9cb..d2c9878e500 100644
--- a/src/gallium/drivers/freedreno/a6xx/fd6_emit.c
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_emit.c
@@ -533,7 +533,7 @@ fd6_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd6_emit *emit)
 {
 	int32_t i, j;
 	const struct fd_vertex_state *vtx = emit->vtx;
-	const struct ir3_shader_variant *vp = fd6_emit_get_vp(emit);
+	const struct ir3_shader_variant *vp = emit->vs;
 
 	for (i = 0, j = 0; i <= vp->inputs_count; i++) {
 		if (vp->inputs[i].sysval)
@@ -588,8 +588,9 @@ fd6_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 		struct fd6_emit *emit)
 {
 	struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer;
-	const struct ir3_shader_variant *vp = fd6_emit_get_vp(emit);
-	const struct ir3_shader_variant *fp = fd6_emit_get_fp(emit);
+	const struct fd6_program_state *prog = fd6_emit_get_prog(emit);
+	const struct ir3_shader_variant *vp = emit->vs;
+	const struct ir3_shader_variant *fp = emit->fs;
 	const enum fd_dirty_3d_state dirty = emit->dirty;
 	bool needs_border = false;
 
@@ -719,8 +720,19 @@ fd6_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 		OUT_RING(ring, A6XX_GRAS_CL_VPORT_ZSCALE_0(ctx->viewport.scale[2]));
 	}
 
-	if (dirty & FD_DIRTY_PROG)
-		fd6_program_emit(ctx, ring, emit);
+	if (dirty & FD_DIRTY_PROG) {
+		struct fd_ringbuffer *stateobj = emit->binning_pass ?
+			prog->binning_stateobj : prog->stateobj;
+
+		fd6_emit_add_group(emit, stateobj, FD6_GROUP_PROG, 0x7);
+
+		/* emit remaining non-stateobj program state, ie. what depends
+		 * on other emit state, so cannot be pre-baked.  This could
+		 * be moved to a separate stateobj which is dynamically
+		 * created.
+		 */
+		fd6_program_emit(ring, emit);
+	}
 
 	if (dirty & FD_DIRTY_RASTERIZER) {
 		struct fd6_rasterizer_stateobj *rasterizer =
@@ -854,7 +866,7 @@ fd6_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 	}
 
 	if (emit->streamout_mask) {
-		struct fd6_streamout_state *tf = &fd6_context(ctx)->tf;
+		const struct fd6_streamout_state *tf = &prog->tf;
 
 		OUT_PKT7(ring, CP_CONTEXT_REG_BUNCH, 12 + (2 * tf->prog_count));
 		OUT_RING(ring, REG_A6XX_VPC_SO_BUF_CNTL);
diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_emit.h b/src/gallium/drivers/freedreno/a6xx/fd6_emit.h
index c05b0a45c96..02c41f03ce7 100644
--- a/src/gallium/drivers/freedreno/a6xx/fd6_emit.h
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_emit.h
@@ -43,6 +43,7 @@ struct fd_ringbuffer;
  * need to be emit'd.
  */
 enum fd6_state_id {
+	FD6_GROUP_PROG,
 	FD6_GROUP_VS_CONST,
 	FD6_GROUP_FS_CONST,
 	FD6_GROUP_VS_TEX,
@@ -57,12 +58,11 @@ struct fd6_state_group {
 
 /* grouped together emit-state for prog/vertex/state emit: */
 struct fd6_emit {
-	struct pipe_debug_callback *debug;
+	struct fd_context *ctx;
 	const struct fd_vertex_state *vtx;
-	const struct fd_program_stateobj *prog;
 	const struct pipe_draw_info *info;
 	bool binning_pass;
-	struct ir3_shader_key key;
+	struct ir3_cache_key key;
 	enum fd_dirty_3d_state dirty;
 
 	uint32_t sprite_coord_enable;  /* bitmask */
@@ -76,9 +76,11 @@ struct fd6_emit {
 	 */
 	bool no_lrz_write;
 
-	/* cached to avoid repeated lookups of same variants: */
-	const struct ir3_shader_variant *vp, *fp;
-	/* TODO: other shader stages.. */
+	/* cached to avoid repeated lookups: */
+	const struct fd6_program_state *prog;
+
+	struct ir3_shader_variant *vs;
+	struct ir3_shader_variant *fs;
 
 	unsigned streamout_mask;
 
@@ -86,32 +88,16 @@ struct fd6_emit {
 	unsigned num_groups;
 };
 
-static inline const struct ir3_shader_variant *
-fd6_emit_get_vp(struct fd6_emit *emit)
+static inline const struct fd6_program_state *
+fd6_emit_get_prog(struct fd6_emit *emit)
 {
-	if (!emit->vp) {
-		struct ir3_shader *shader = emit->prog->vp;
-		emit->vp = ir3_shader_variant(shader, emit->key,
-				emit->binning_pass, emit->debug);
+	if (!emit->prog) {
+		struct fd6_context *fd6_ctx = fd6_context(emit->ctx);
+		struct ir3_program_state *s =
+				ir3_cache_lookup(fd6_ctx->shader_cache, &emit->key, &emit->ctx->debug);
+		emit->prog = fd6_program_state(s);
 	}
-	return emit->vp;
-}
-
-static inline const struct ir3_shader_variant *
-fd6_emit_get_fp(struct fd6_emit *emit)
-{
-	if (!emit->fp) {
-		if (emit->binning_pass) {
-			/* use dummy stateobj to simplify binning vs non-binning: */
-			static const struct ir3_shader_variant binning_fp = {};
-			emit->fp = &binning_fp;
-		} else {
-			struct ir3_shader *shader = emit->prog->fp;
-			emit->fp = ir3_shader_variant(shader, emit->key,
-					false, emit->debug);
-		}
-	}
-	return emit->fp;
+	return emit->prog;
 }
 
 static inline void
diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_program.c b/src/gallium/drivers/freedreno/a6xx/fd6_program.c
index 33f5962ad13..9fff9d9f7b0 100644
--- a/src/gallium/drivers/freedreno/a6xx/fd6_program.c
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_program.c
@@ -59,6 +59,8 @@ static void
 fd6_fp_state_delete(struct pipe_context *pctx, void *hwcso)
 {
 	struct ir3_shader *so = hwcso;
+	struct fd_context *ctx = fd_context(pctx);
+	ir3_cache_invalidate(fd6_context(ctx)->shader_cache, hwcso);
 	ir3_shader_destroy(so);
 }
 
@@ -73,6 +75,8 @@ static void
 fd6_vp_state_delete(struct pipe_context *pctx, void *hwcso)
 {
 	struct ir3_shader *so = hwcso;
+	struct fd_context *ctx = fd_context(pctx);
+	ir3_cache_invalidate(fd6_context(ctx)->shader_cache, hwcso);
 	ir3_shader_destroy(so);
 }
 
@@ -178,11 +182,11 @@ link_stream_out(struct ir3_shader_linkage *l, const struct ir3_shader_variant *v
 }
 
 static void
-setup_stream_out(struct fd_context *ctx, const struct ir3_shader_variant *v,
+setup_stream_out(struct fd6_program_state *state, const struct ir3_shader_variant *v,
 		struct ir3_shader_linkage *l)
 {
 	const struct pipe_stream_output_info *strmout = &v->shader->stream_output;
-	struct fd6_streamout_state *tf = &fd6_context(ctx)->tf;
+	struct fd6_streamout_state *tf = &state->tf;
 
 	memset(tf, 0, sizeof(*tf));
 
@@ -251,12 +255,19 @@ enum {
 };
 
 static void
-setup_stages(struct fd6_emit *emit, struct stage *s)
+setup_stages(struct fd6_program_state *state, struct stage *s, bool binning_pass)
 {
 	unsigned i;
 
-	s[VS].v = fd6_emit_get_vp(emit);
-	s[FS].v = fd6_emit_get_fp(emit);
+	if (binning_pass) {
+		static const struct ir3_shader_variant dummy_fs = {0};
+
+		s[VS].v = state->bs;
+		s[FS].v = &dummy_fs;
+	} else {
+		s[VS].v = state->vs;
+		s[FS].v = state->fs;
+	}
 
 	s[HS].v = s[DS].v = s[GS].v = NULL;  /* for now */
 
@@ -287,9 +298,9 @@ setup_stages(struct fd6_emit *emit, struct stage *s)
 	s[HS].instroff = s[DS].instroff = s[GS].instroff = s[FS].instroff;
 }
 
-void
-fd6_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
-		struct fd6_emit *emit)
+static void
+setup_stateobj(struct fd_ringbuffer *ring,
+		struct fd6_program_state *state, bool binning_pass)
 {
 	struct stage s[MAX_STAGES];
 	uint32_t pos_regid, psize_regid, color_regid[8];
@@ -299,7 +310,7 @@ fd6_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
 	uint8_t psize_loc = ~0;
 	int i, j;
 
-	setup_stages(emit, s);
+	setup_stages(state, s, binning_pass);
 
 	fssz = FOUR_QUADS;
 
@@ -392,8 +403,7 @@ fd6_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
 	struct ir3_shader_linkage l = {0};
 	ir3_link_shaders(&l, s[VS].v, s[FS].v);
 
-	if ((s[VS].v->shader->stream_output.num_outputs > 0) &&
-			!emit->binning_pass)
+	if ((s[VS].v->shader->stream_output.num_outputs > 0) && !binning_pass)
 		link_stream_out(&l, s[VS].v);
 
 	BITSET_DECLARE(varbs, 128) = {0};
@@ -418,9 +428,8 @@ fd6_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
 		ir3_link_add(&l, psize_regid, 0x1, l.max_loc);
 	}
 
-	if ((s[VS].v->shader->stream_output.num_outputs > 0) &&
-			!emit->binning_pass) {
-		setup_stream_out(ctx, s[VS].v, &l);
+	if ((s[VS].v->shader->stream_output.num_outputs > 0) && !binning_pass) {
+		setup_stream_out(state, s[VS].v, &l);
 	}
 
 	for (i = 0, j = 0; (i < 16) && (j < l.cnt); i++) {
@@ -478,7 +487,7 @@ fd6_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
 	OUT_RING(ring, A6XX_PC_PRIMITIVE_CNTL_1_STRIDE_IN_VPC(l.max_loc) |
 			COND(psize_regid != regid(63,0), 0x100));
 
-	if (emit->binning_pass) {
+	if (binning_pass) {
 		OUT_PKT4(ring, REG_A6XX_SP_FS_OBJ_START_LO, 2);
 		OUT_RING(ring, 0x00000000);    /* SP_FS_OBJ_START_LO */
 		OUT_RING(ring, 0x00000000);    /* SP_FS_OBJ_START_HI */
@@ -549,8 +558,10 @@ fd6_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
 
 	OUT_PKT4(ring, REG_A6XX_SP_FS_OUTPUT_REG(0), 8);
 	for (i = 0; i < 8; i++) {
+		// TODO we could have a mix of half and full precision outputs,
+		// we really need to figure out half-precision from IR3_REG_HALF
 		OUT_RING(ring, A6XX_SP_FS_OUTPUT_REG_REGID(color_regid[i]) |
-				COND(emit->key.half_precision,
+				COND(false,
 					A6XX_SP_FS_OUTPUT_REG_HALF_PRECISION));
 	}
 
@@ -559,27 +570,7 @@ fd6_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
 			A6XX_VPC_PACK_PSIZELOC(psize_loc) |
 			A6XX_VPC_PACK_STRIDE_IN_VPC(l.max_loc));
 
-	if (!emit->binning_pass) {
-		uint32_t vinterp[8], vpsrepl[8];
-
-		memset(vinterp, 0, sizeof(vinterp));
-		memset(vpsrepl, 0, sizeof(vpsrepl));
-
-		/* looks like we need to do int varyings in the frag
-		 * shader on a5xx (no flatshad reg?  or a420.0 bug?):
-		 *
-		 *    (sy)(ss)nop
-		 *    (sy)ldlv.u32 r0.x,l[r0.x], 1
-		 *    ldlv.u32 r0.y,l[r0.x+1], 1
-		 *    (ss)bary.f (ei)r63.x, 0, r0.x
-		 *    (ss)(rpt1)cov.s32f16 hr0.x, (r)r0.x
-		 *    (rpt5)nop
-		 *    sam (f16)(xyzw)hr0.x, hr0.x, s#0, t#0
-		 *
-		 * Possibly on later a5xx variants we'll be able to use
-		 * something like the code below instead of workaround
-		 * in the shader:
-		 */
+	if (!binning_pass) {
 		/* figure out VARYING_INTERP / VARYING_PS_REPL register values: */
 		for (j = -1; (j = ir3_next_varying(s[FS].v, j)) < (int)s[FS].v->inputs_count; ) {
 			/* NOTE: varyings are packed, so if compmask is 0xb
@@ -590,20 +581,83 @@ fd6_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
 
 			uint32_t inloc = s[FS].v->inputs[j].inloc;
 
-			if ((s[FS].v->inputs[j].interpolate == INTERP_MODE_FLAT) ||
-					(s[FS].v->inputs[j].rasterflat && emit->rasterflat)) {
+			if (s[FS].v->inputs[j].interpolate == INTERP_MODE_FLAT) {
+				uint32_t loc = inloc;
+
+				for (i = 0; i < 4; i++) {
+					if (compmask & (1 << i)) {
+						state->vinterp[loc / 16] |= 1 << ((loc % 16) * 2);
+						loc++;
+					}
+				}
+			}
+		}
+	}
+
+	if (!binning_pass)
+		if (s[FS].instrlen)
+			fd6_emit_shader(ring, s[FS].v);
+
+	OUT_PKT4(ring, REG_A6XX_VFD_CONTROL_1, 6);
+	OUT_RING(ring, A6XX_VFD_CONTROL_1_REGID4VTX(vertex_regid) |
+			A6XX_VFD_CONTROL_1_REGID4INST(instance_regid) |
+			0xfcfc0000);
+	OUT_RING(ring, 0x0000fcfc);   /* VFD_CONTROL_2 */
+	OUT_RING(ring, 0xfcfcfcfc);   /* VFD_CONTROL_3 */
+	OUT_RING(ring, 0x000000fc);   /* VFD_CONTROL_4 */
+	OUT_RING(ring, 0x0000fcfc);   /* VFD_CONTROL_5 */
+	OUT_RING(ring, 0x00000000);   /* VFD_CONTROL_6 */
+}
+
+/* emits the program state which is not part of the stateobj because of
+ * dependency on other gl state (rasterflat or sprite-coord-replacement)
+ */
+void
+fd6_program_emit(struct fd_ringbuffer *ring, struct fd6_emit *emit)
+{
+	const struct fd6_program_state *state = fd6_emit_get_prog(emit);
+
+	if (!unlikely(emit->rasterflat || emit->sprite_coord_enable)) {
+		/* fastpath: */
+		OUT_PKT4(ring, REG_A6XX_VPC_VARYING_INTERP_MODE(0), 8);
+		for (int i = 0; i < 8; i++)
+			OUT_RING(ring, state->vinterp[i]);   /* VPC_VARYING_INTERP[i].MODE */
+
+		OUT_PKT4(ring, REG_A6XX_VPC_VARYING_PS_REPL_MODE(0), 8);
+		for (int i = 0; i < 8; i++)
+			OUT_RING(ring, 0x00000000);   /* VPC_VARYING_PS_REPL[i] */
+	} else {
+		/* slow-path: */
+		struct ir3_shader_variant *fs = state->fs;
+		uint32_t vinterp[8], vpsrepl[8];
+
+		memset(vinterp, 0, sizeof(vinterp));
+		memset(vpsrepl, 0, sizeof(vpsrepl));
+
+		for (int i = 0; i < state->fs_inputs_count; i++) {
+			int j = state->fs_inputs[i];
+
+			/* NOTE: varyings are packed, so if compmask is 0xb
+			 * then first, third, and fourth component occupy
+			 * three consecutive varying slots:
+			 */
+			unsigned compmask = fs->inputs[j].compmask;
+
+			uint32_t inloc = fs->inputs[j].inloc;
+
+			if ((fs->inputs[j].interpolate == INTERP_MODE_FLAT) ||
+					(fs->inputs[j].rasterflat && emit->rasterflat)) {
 				uint32_t loc = inloc;
 
-				for (i = 0; i < 4; i++) {
-					if (compmask & (1 << i)) {
+				for (int c = 0; c < 4; c++) {
+					if (compmask & (1 << c)) {
 						vinterp[loc / 16] |= 1 << ((loc % 16) * 2);
-						//flatshade[loc / 32] |= 1 << (loc % 32);
 						loc++;
 					}
 				}
 			}
 
-			gl_varying_slot slot = s[FS].v->inputs[j].slot;
+			gl_varying_slot slot = fs->inputs[j].slot;
 
 			/* since we don't enable PIPE_CAP_TGSI_TEXCOORD: */
 			if (slot >= VARYING_SLOT_VAR0) {
@@ -642,32 +696,57 @@ fd6_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
 		}
 
 		OUT_PKT4(ring, REG_A6XX_VPC_VARYING_INTERP_MODE(0), 8);
-		for (i = 0; i < 8; i++)
+		for (int i = 0; i < 8; i++)
 			OUT_RING(ring, vinterp[i]);     /* VPC_VARYING_INTERP[i].MODE */
 
 		OUT_PKT4(ring, REG_A6XX_VPC_VARYING_PS_REPL_MODE(0), 8);
-		for (i = 0; i < 8; i++)
-			OUT_RING(ring, vpsrepl[i]);     /* VPC_VARYING_PS_REPL[i] */
+		for (int i = 0; i < 8; i++)
+			OUT_RING(ring, vpsrepl[i]);   /* VPC_VARYING_PS_REPL[i] */
 	}
-
-	if (!emit->binning_pass)
-		if (s[FS].instrlen)
-			fd6_emit_shader(ring, s[FS].v);
-
-	OUT_PKT4(ring, REG_A6XX_VFD_CONTROL_1, 6);
-	OUT_RING(ring, A6XX_VFD_CONTROL_1_REGID4VTX(vertex_regid) |
-			A6XX_VFD_CONTROL_1_REGID4INST(instance_regid) |
-			0xfcfc0000);
-	OUT_RING(ring, 0x0000fcfc);   /* VFD_CONTROL_2 */
-	OUT_RING(ring, 0xfcfcfcfc);   /* VFD_CONTROL_3 */
-	OUT_RING(ring, 0x000000fc);   /* VFD_CONTROL_4 */
-	OUT_RING(ring, 0x0000fcfc);   /* VFD_CONTROL_5 */
-	OUT_RING(ring, 0x00000000);   /* VFD_CONTROL_6 */
 }
 
+static struct ir3_program_state *
+fd6_program_create(void *data, struct ir3_shader_variant *bs,
+		struct ir3_shader_variant *vs,
+		struct ir3_shader_variant *fs,
+		const struct ir3_shader_key *key)
+{
+	struct fd_context *ctx = data;
+	struct fd6_program_state *state = CALLOC_STRUCT(fd6_program_state);
+
+	state->bs = bs;
+	state->vs = vs;
+	state->fs = fs;
+	state->binning_stateobj = fd_ringbuffer_new_object(ctx->pipe, 0x1000);
+	state->stateobj = fd_ringbuffer_new_object(ctx->pipe, 0x1000);
+
+	setup_stateobj(state->binning_stateobj, state, true);
+	setup_stateobj(state->stateobj, state, false);
+
+	return &state->base;
+}
+
+static void
+fd6_program_destroy(void *data, struct ir3_program_state *state)
+{
+	struct fd6_program_state *so = fd6_program_state(state);
+	fd_ringbuffer_del(so->stateobj);
+	fd_ringbuffer_del(so->binning_stateobj);
+	free(so);
+}
+
+static const struct ir3_cache_funcs cache_funcs = {
+	.create_state = fd6_program_create,
+	.destroy_state = fd6_program_destroy,
+};
+
 void
 fd6_prog_init(struct pipe_context *pctx)
 {
+	struct fd_context *ctx = fd_context(pctx);
+
+	fd6_context(ctx)->shader_cache = ir3_cache_create(&cache_funcs, ctx);
+
 	pctx->create_fs_state = fd6_fp_state_create;
 	pctx->delete_fs_state = fd6_fp_state_delete;
 
diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_program.h b/src/gallium/drivers/freedreno/a6xx/fd6_program.h
index 4e7afe78dc7..83c4688a243 100644
--- a/src/gallium/drivers/freedreno/a6xx/fd6_program.h
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_program.h
@@ -31,13 +31,44 @@
 #include "pipe/p_context.h"
 #include "freedreno_context.h"
 #include "ir3_shader.h"
+#include "ir3_cache.h"
+
+struct fd6_streamout_state {
+	uint32_t ncomp[PIPE_MAX_SO_BUFFERS];
+	uint32_t prog[256/2];
+	uint32_t prog_count;
+	uint32_t vpc_so_buf_cntl;
+};
 
 struct fd6_emit;
 
+struct fd6_program_state {
+	struct ir3_program_state base;
+	struct ir3_shader_variant *bs;     /* binning pass vs */
+	struct ir3_shader_variant *vs;
+	struct ir3_shader_variant *fs;
+	struct fd_ringbuffer *binning_stateobj;
+	struct fd_ringbuffer *stateobj;
+
+	/* cached state about current emitted shader program (3d): */
+	struct fd6_streamout_state tf;
+
+	/* index and # of varyings: */
+	uint8_t fs_inputs[16];
+	uint8_t fs_inputs_count;
+
+	uint32_t vinterp[8];
+};
+
+static inline struct fd6_program_state *
+fd6_program_state(struct ir3_program_state *state)
+{
+	return (struct fd6_program_state *)state;
+}
+
 void fd6_emit_shader(struct fd_ringbuffer *ring, const struct ir3_shader_variant *so);
 
-void fd6_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
-		struct fd6_emit *emit);
+void fd6_program_emit(struct fd_ringbuffer *ring, struct fd6_emit *emit);
 
 void fd6_prog_init(struct pipe_context *pctx);
 